def main():
    '''
    Command line interface for pulling a WAF
    '''
    argp = ArgumentParser(description=main.__doc__)
    argp.add_argument('-s', '--src',
                      help='Source WAF or Database Connection String')
    argp.add_argument('-d', '--dest', help='Destination Folder')
    argp.add_argument('-v', '--verbose', action='store_true',
                      help='Enables verbose logging')
    argp.add_argument('-f', '--force-clean', action='store_true',
                      help='Removes stale contents of the folder')
    opts = argp.parse_args()

    if opts.verbose:
        enable_logging()

    get_logger().info("Starting")

    if opts.src and opts.dest:
        # HTTP(S) sources are treated as WAFs; anything else is assumed to
        # be a MongoDB connection string
        if opts.src.startswith('http'):
            download_waf(opts.src, opts.dest)
        else:
            download_from_db(opts.src, opts.dest)

    if opts.force_clean and opts.dest:
        force_clean(opts.dest)
def send_notifications(db, harvest):
    '''
    Send an email to each user belonging to the organization of the harvest
    notifying them that the harvest failed.

    :param db: Mongo DB Client
    :param dict harvest: A dictionary returned from the mongo collection for
                         harvests.
    '''
    users = db.users.find({"profile.organization": harvest['organization']})
    mail = Mail()
    emails = []
    for user in list(users):
        user_emails = user['emails']
        # Only the user's primary (first) address is notified
        if user_emails and user_emails[0]['address']:
            emails.append(user_emails[0]['address'])

    recipients = [email for email in emails if throttle_email(email)]
    # If there are no recipients, obviously don't send an email
    if not recipients:
        return

    # The body is identical for every recipient; build it once
    body = ("We were unable to harvest from the harvest source {url}. "
            "Please verify that the source URL is correct and contains "
            "valid XML Documents. \n\n"
            "Thanks!\nIOOS Catalog Harvester".format(url=harvest['url']))

    for recipient in recipients:
        get_logger().info("Sending a notification to %s", recipient)
        # BUG FIX: the original passed the full recipient list to every
        # Message inside this loop, so each user received len(recipients)
        # copies of the email. Address each message only to the recipient
        # currently being iterated.
        msg = Message("Failed to correctly harvest",
                      sender=MAIL_DEFAULT_SENDER or "*****@*****.**",
                      recipients=[recipient])
        msg.body = body
        mail.send(msg)
def download_from_db(conn_string, dest):
    '''
    Download several WAFs using collections from MongoDB as a source

    :param str conn_string: MongoDB connection string
    :param str dest: Write directory destination
    '''
    # The database name is the path component of the connection string,
    # e.g. mongodb://host:port/dbname -> dbname; fall back to 'default'
    tokens = conn_string.split('/')
    if len(tokens) > 3:
        db_name = tokens[3]
    else:
        db_name = 'default'
    db = MongoClient(conn_string)[db_name]
    for harvest in list(db.Harvests.find({"publish": True})):
        try:
            download_harvest(db, harvest, dest)
        except KeyboardInterrupt:
            # exit on SIGINT
            raise
        except Exception:
            # FIX: narrowed from a bare except so SystemExit and other
            # BaseExceptions are not swallowed; log the failure and keep
            # harvesting the remaining sources
            get_logger().exception("Failed to harvest")
            get_logger().error(harvest)
def download_from_db(conn_string, dest):
    '''
    Download several WAFs using collections from MongoDB as a source

    :param str conn_string: MongoDB connection string
    :param str dest: Write directory destination
    '''
    # The database name is the path component of the connection string,
    # e.g. mongodb://host:port/dbname -> dbname; fall back to 'default'
    tokens = conn_string.split('/')
    if len(tokens) > 3:
        db_name = tokens[3]
    else:
        db_name = 'default'
    db = MongoClient(conn_string)[db_name]
    for harvest in list(db.Harvests.find({"publish": True})):
        try:
            src = harvest['url']
            provider_str = harvest['organization']
            path = os.path.join(dest, provider_str)
            download_waf(src, path)
            # Record when this source was last successfully harvested
            db.Harvests.update(
                {"_id": harvest['_id']},
                {"$set": {
                    "last_harvest_dt": datetime.utcnow()
                }})
        except KeyboardInterrupt:
            # exit on SIGINT
            raise
        except Exception:
            # FIX: narrowed from a bare except so SystemExit and other
            # BaseExceptions are not swallowed; log and move on to the next
            # harvest (the trailing `continue` was redundant and removed)
            get_logger().exception("Failed to harvest")
            get_logger().error(harvest)
def get_harvest_info(db, harvest):
    '''
    Returns a CKAN Harvest object from the CKAN API for Harvests
    (harvest_source_show)

    :param db: Mongo DB Client
    :param dict harvest: A dictionary returned from the mongo collection for
                         harvests.
    :raises ValueError: if the organization or its ckan_harvest_url is
                        missing/unparseable
    :raises IOError: if CKAN responds with a non-200 status
    '''
    organization = db.Organizations.find_one({"name": harvest['organization']})
    if organization is None:
        raise ValueError("Harvest object does not contain a valid organization: %s" % harvest['organization'])
    if 'ckan_harvest_url' not in organization:
        raise ValueError("Organization does not contain a ckan_harvest_url field")
    ckan_harvest_url = organization['ckan_harvest_url']
    # The harvest id is whatever follows '/harvest/' in the organization URL
    regx = r'(.*)(/harvest/)(.*)'
    matches = re.match(regx, ckan_harvest_url)
    if not matches:
        raise ValueError("The ckan_harvest_url can not be parsed into its constituent parts containing a valid harvest_id")
    # FIX: re.Match.groups() never returns None and always contains one entry
    # per capture group, so the original `groups is None or len(groups) < 3`
    # check was dead code; read the third group directly.
    ckan_harvest_id = matches.group(3)
    ckan_harvest_url = posixpath.join(CKAN_API, 'action/harvest_source_show')
    response = requests.get(ckan_harvest_url,
                            params={"id": ckan_harvest_id},
                            allow_redirects=True,
                            timeout=10)
    if response.status_code != 200:
        get_logger().error("CKAN ERROR: HTTP %s", str(response.status_code))
        get_logger().error(response.content)
        raise IOError("Failed to connect to CKAN: HTTP {}".format(response.status_code))
    ckan_harvest = response.json()['result']
    return ckan_harvest
def delete_harvest_job(harvest_id):
    '''
    Schedules the deletion of a harvest

    :param str harvest_id: harvest_id
    :returns: JSON string ``{"result": true}``
    '''
    get_logger().info("Deleting harvest")
    # NOTE(review): `db` is not a parameter here — it is presumably a
    # module-level Mongo client (other functions in this file take it as an
    # argument). Confirm it is in scope when this endpoint is invoked.
    harvest = db.Harvests.find_one({"_id": harvest_id})
    # Delegates the actual removal of records/attempts to the harvest API
    harvest_api.delete_harvest(db, harvest)
    return json.dumps({"result": True})
def main():
    '''
    Command line interface for pulling a WAF
    '''
    argp = ArgumentParser(description=main.__doc__)
    argp.add_argument('-t', '--type', choices=['waf', 'csw'], default='waf',
                      help='Data type ("waf" or "csw", defaults to "waf")')
    argp.add_argument('-s', '--src', required=True,
                      help='Source WAF or Database Connection String')
    argp.add_argument('-d', '--dest', required=True,
                      help='Destination Folder')
    argp.add_argument('-v', '--verbose', action='store_true',
                      help='Enables verbose logging')
    argp.add_argument('-f', '--force-clean', action='store_true',
                      help='Removes stale contents of the folder')
    opts = argp.parse_args()

    if opts.verbose:
        setup_logging()

    get_logger().info("Starting")

    if opts.src and opts.dest:
        # Non-HTTP sources are MongoDB connection strings; HTTP sources are
        # dispatched by the requested harvest type (restricted by `choices`)
        if not opts.src.startswith('http'):
            download_from_db(opts.src, opts.dest)
        elif opts.type == 'csw':
            download_csw(opts.src, opts.dest)
        else:
            download_waf(opts.src, opts.dest)

    if opts.force_clean and opts.dest:
        get_logger().info("Removing stale datasets")
        try:
            # STALE_EXPIRATION_DAYS controls the retention window; default 3
            max_days = int(os.getenv('STALE_EXPIRATION_DAYS', 3))
        except ValueError:
            # unparseable environment value: fall back to the default
            max_days = 3
        force_clean(opts.dest, max_days)
def download_harvest(db, harvest, dest):
    '''
    Downloads a harvest from the mongo db and updates the harvest with the
    latest harvest date.

    :param db: Mongo DB Client
    :param dict harvest: A dictionary returned from the mongo collection for
                         harvests.
    :param str dest: Write directory destination
    '''
    src = harvest['url']
    get_logger().info('harvesting: %s', src)
    # Mark the harvest as in-progress before starting
    db.Harvests.update({"_id": harvest['_id']}, {
        "$set": {
            "last_harvest_dt": "harvesting",
            "last_harvest_status": None
        }
    })
    try:
        provider_str = harvest['organization']
        path = os.path.join(dest, provider_str)
        if harvest['harvest_type'] == 'WAF':
            records, errors = download_waf(db, harvest, src, path)
        elif harvest['harvest_type'] == 'ERDDAP-WAF':
            records, errors = download_erddap_waf(db, harvest, src, path)
        elif harvest['harvest_type'] == 'CSW':
            records, errors = download_csw(db, harvest, src, path)
        else:
            raise TypeError(
                'harvest_type "{}" is not supported; use WAF or CSW'.format(
                    harvest['harvest_type']))
        db.Harvests.update({"_id": harvest['_id']}, {
            "$set": {
                "last_harvest_dt": datetime.utcnow(),
                "last_record_count": records,
                "last_good_count": (records - errors),
                "last_bad_count": errors,
                "last_harvest_status": "ok"
            }
        })
        trigger_ckan_harvest(db, harvest)
    except KeyboardInterrupt:
        # BUG FIX: the original bare except swallowed SIGINT, marking the
        # harvest "fail" and continuing; let the interrupt propagate
        raise
    except Exception:
        send_notifications(db, harvest)
        get_logger().exception("Failed to successfully harvest %s",
                               harvest['url'])
        db.Harvests.update({"_id": harvest['_id']}, {
            "$set": {
                "last_harvest_dt": datetime.utcnow(),
                "last_harvest_status": "fail"
            }
        })
def trigger_ckan_harvest(db, harvest):
    '''
    Initiates a CKAN Harvest

    :param db: Mongo DB Client
    :param dict harvest: A dictionary returned from the mongo collection for
                         harvests.
    '''
    try:
        ckan_harvest = get_harvest_info(db, harvest)
        ckan_harvest_id = ckan_harvest['id']
        create_harvest_job(ckan_harvest_id)
    except Exception:
        # FIX: narrowed from a bare except; a failed CKAN trigger is
        # best-effort and must not abort the harvest that just completed,
        # but KeyboardInterrupt/SystemExit should propagate
        get_logger().exception("Failed to initiate CKAN Harvest")
def download_harvest(db, harvest, dest):
    '''
    Downloads a harvest from the mongo db and updates the harvest with the
    latest harvest date.

    :param db: Mongo DB Client
    :param dict harvest: A dictionary returned from the mongo collection for
                         harvests.
    :param str dest: Write directory destination
    '''
    src = harvest['url']
    get_logger().info('harvesting: %s', src)
    # Mark the harvest as in-progress before starting
    db.Harvests.update({"_id": harvest['_id']}, {
        "$set": {
            "last_harvest_dt": "harvesting",
            "last_harvest_status": None
        }
    })
    try:
        provider_str = harvest['organization']
        path = os.path.join(dest, provider_str)
        if harvest['harvest_type'] == 'WAF':
            records, errors = download_waf(db, harvest, src, path)
        elif harvest['harvest_type'] == 'ERDDAP-WAF':
            records, errors = download_erddap_waf(db, harvest, src, path)
        elif harvest['harvest_type'] == 'CSW':
            records, errors = download_csw(db, harvest, src, path)
        else:
            raise TypeError(
                'harvest_type "{}" is not supported; use WAF or CSW'.format(
                    harvest['harvest_type']))
        db.Harvests.update({"_id": harvest['_id']}, {
            "$set": {
                "last_harvest_dt": datetime.utcnow(),
                "last_record_count": records,
                "last_good_count": (records - errors),
                "last_bad_count": errors,
                "last_harvest_status": "ok"
            }
        })
        trigger_ckan_harvest(db, harvest)
    except KeyboardInterrupt:
        # BUG FIX: the original bare except swallowed SIGINT, marking the
        # harvest "fail" and continuing; let the interrupt propagate
        raise
    except Exception:
        send_notifications(db, harvest)
        get_logger().exception("Failed to successfully harvest %s",
                               harvest['url'])
        db.Harvests.update({"_id": harvest['_id']}, {
            "$set": {
                "last_harvest_dt": datetime.utcnow(),
                "last_harvest_status": "fail"
            }
        })
def purge_old_records(new_records, old_records):
    '''
    Deletes any records in old_records that aren't in new_records

    :param list new_records: List of records
    :param list old_records: List of records
    '''
    get_logger().info("Purging old records from WAF")
    # Use a set for O(1) membership tests instead of scanning a list for
    # every old record
    new_files = {r['location'] for r in new_records if 'location' in r}
    removal = [r for r in old_records
               if 'location' in r and r['location'] not in new_files]
    for record in removal:
        # every record in `removal` has a 'location' by construction, so the
        # original re-check was dead code and has been dropped
        if os.path.exists(record['location']):
            get_logger().info("Removing %s", record['location'])
            os.remove(record['location'])
def force_clean(path):
    '''
    Deletes any files in path that end in .xml and are older than 1 day

    :param str path: Path to a folder to clean
    '''
    # Files modified before this instant are considered stale
    cutoff = time.time() - 24 * 3600
    for entry in os.listdir(path):
        if not entry.endswith('.xml'):
            continue
        full_path = os.path.join(path, entry)
        if os.stat(full_path).st_mtime < cutoff:
            get_logger().info("Removing %s", full_path)
            os.remove(full_path)
def get_harvest_info(db, harvest):
    '''
    Returns a CKAN Harvest object from the CKAN API for Harvests
    (harvest_source_show)

    :param db: Mongo DB Client
    :param dict harvest: A dictionary returned from the mongo collection for
                         harvests.
    :raises ValueError: if the organization or its ckan_harvest_url is
                        missing/unparseable
    :raises IOError: if CKAN responds with a non-200 status
    '''
    organization = db.Organizations.find_one({"name": harvest['organization']})
    if organization is None:
        raise ValueError(
            "Harvest object does not contain a valid organization: %s" %
            harvest['organization'])
    if 'ckan_harvest_url' not in organization:
        raise ValueError(
            "Organization does not contain a ckan_harvest_url field")
    ckan_harvest_url = organization['ckan_harvest_url']
    # The harvest id is whatever follows '/harvest/' in the organization URL
    regx = r'(.*)(/harvest/)(.*)'
    matches = re.match(regx, ckan_harvest_url)
    if not matches:
        raise ValueError(
            "The ckan_harvest_url can not be parsed into its constituent parts containing a valid harvest_id"
        )
    # FIX: re.Match.groups() never returns None and always contains one entry
    # per capture group, so the original `groups is None or len(groups) < 3`
    # check was dead code; read the third group directly.
    ckan_harvest_id = matches.group(3)
    ckan_harvest_url = posixpath.join(CKAN_API, 'action/harvest_source_show')
    response = requests.get(ckan_harvest_url,
                            params={"id": ckan_harvest_id},
                            allow_redirects=True,
                            timeout=10)
    if response.status_code != 200:
        get_logger().error("CKAN ERROR: HTTP %s", str(response.status_code))
        get_logger().error(response.content)
        raise IOError("Failed to connect to CKAN: HTTP {}".format(
            response.status_code))
    ckan_harvest = response.json()['result']
    return ckan_harvest
def purge_old_records(new_records, old_records):
    '''
    Deletes any records in old_records that aren't in new_records

    :param list new_records: List of records
    :param list old_records: List of records
    '''
    get_logger().info("Purging old records from WAF")
    # Use a set for O(1) membership tests instead of scanning a list for
    # every old record
    new_files = {r['location'] for r in new_records if 'location' in r}
    removal = [
        r for r in old_records
        if 'location' in r and r['location'] not in new_files
    ]
    for record in removal:
        # every record in `removal` has a 'location' by construction, so the
        # original re-check was dead code and has been dropped
        if os.path.exists(record['location']):
            get_logger().info("Removing %s", record['location'])
            os.remove(record['location'])
def download_erddap_waf(db, harvest, src, dest):
    '''
    Downloads a WAF's from ERDDAP to a destination

    :param db: Mongo DB Client
    :param dict harvest: A dictionary returned from the mongo collection for
                         harvests.
    :param url src: URL to the WAF
    :param str dest: Folder to download to
    :returns: tuple of (records downloaded, records with errors)
    '''
    if not os.path.exists(dest):
        os.makedirs(dest)
    waf_parser = ERDDAPWAFParser(src)
    # Snapshot the existing records so stale files can be purged at the end,
    # then drop them; fresh records are re-inserted as each doc is parsed
    old_records = list(db.Records.find({"harvest_id": harvest['_id']}))
    db.Records.remove({"harvest_id": harvest['_id']})
    new_records = []
    count = 0
    errors = 0
    for link in waf_parser.parse():
        get_logger().info("Downloading %s", link)
        try:
            doc_name = link.split('/')[-1]
            local_filename = os.path.join(dest, doc_name)
            # CKAN only looks for XML documents for the harvester
            if not local_filename.endswith('.xml'):
                local_filename += '.xml'
            download_file(link, local_filename)
            rec = parse_records(db, harvest, link, local_filename)
            new_records.append(rec)
            # a document that downloaded but failed validation is counted
            # in both `count` and `errors`
            if len(rec['validation_errors']):
                errors += 1
            count += 1
        except KeyboardInterrupt:
            # allow SIGINT to abort the whole harvest
            raise
        except Exception:
            errors += 1
            get_logger().exception("Failed to download")
            continue
    # remove on-disk files belonging to records that no longer exist upstream
    purge_old_records(new_records, old_records)
    return count, errors
def create_harvest_job(ckan_harvest_id):
    '''
    Creates a new harvest job on CKAN

    :param ckan_harvest_id: CKAN id of the harvest source to run
    :raises IOError: if CKAN responds with a non-200 status
    '''
    ckan_harvest_url = posixpath.join(CKAN_API, 'action/harvest_job_create')
    payload = json.dumps({"source_id": ckan_harvest_id})
    # FIX: added a timeout, consistent with the requests.get in
    # get_harvest_info, so a hung CKAN instance cannot block the harvester
    # indefinitely (requests has no default timeout)
    response = requests.post(ckan_harvest_url,
                             headers={
                                 'Content-Type': 'application/json;charset=utf-8',
                                 'Authorization': CKAN_API_KEY
                             },
                             data=payload,
                             timeout=10)
    if response.status_code != 200:
        get_logger().error("CKAN ERROR: HTTP %s", str(response.status_code))
        get_logger().error(response.content)
        raise IOError("Failed to connect to CKAN: HTTP {}".format(response.status_code))
    return response.json()
def force_clean(path, max_days=3):
    '''
    Deletes any files in path that end in .xml and are older than the
    specified number of days

    :param str path: Path to a folder to clean
    :param int max_days: Maximum number of days to keep an old record before
                         removing it.
    '''
    # Files modified before this instant are considered stale
    cutoff = time.time() - max_days * 24 * 3600
    for root, _dirs, files in os.walk(path):
        for name in files:
            if not name.endswith('.xml'):
                continue
            target = os.path.join(root, name)
            if os.stat(target).st_mtime < cutoff:
                get_logger().info("Removing %s", target)
                os.remove(target)
def create_harvest_job(ckan_harvest_id):
    '''
    Creates a new harvest job on CKAN

    :param ckan_harvest_id: CKAN id of the harvest source to run
    :raises IOError: if CKAN responds with a non-200 status
    '''
    ckan_harvest_url = posixpath.join(CKAN_API, 'action/harvest_job_create')
    payload = json.dumps({"source_id": ckan_harvest_id})
    # FIX: added a timeout, consistent with the requests.get in
    # get_harvest_info, so a hung CKAN instance cannot block the harvester
    # indefinitely (requests has no default timeout)
    response = requests.post(ckan_harvest_url,
                             headers={
                                 'Content-Type': 'application/json;charset=utf-8',
                                 'Authorization': CKAN_API_KEY
                             },
                             data=payload,
                             timeout=10)
    if response.status_code != 200:
        get_logger().error("CKAN ERROR: HTTP %s", str(response.status_code))
        get_logger().error(response.content)
        raise IOError("Failed to connect to CKAN: HTTP {}".format(
            response.status_code))
    return response.json()
def download_waf(src, dest):
    '''
    Downloads a WAF's contents to a destination

    :param url src: URL to the WAF
    :param str dest: Folder to download to
    '''
    if not os.path.exists(dest):
        os.makedirs(dest)

    waf_parser = WAFParser(src)
    for link in waf_parser.parse():
        get_logger().info("Downloading %s", link)
        try:
            doc_name = link.split('/')[-1]
            local_filename = os.path.join(dest, doc_name)
            download_file(link, local_filename)
        except KeyboardInterrupt:
            # allow SIGINT to abort the download loop
            raise
        except Exception:
            # FIX: narrowed from a bare except so SystemExit and other
            # BaseExceptions are not silently swallowed; log and move on
            get_logger().exception("Failed to download")
            continue
def delete_harvest(db, harvest):
    '''
    Deletes a harvest, all associated attempts and records

    :param db: MongoDB Client
    :param dict harvest: A dictionary returned from the mongo collection for
                         harvests.
    '''
    try:
        # Remove each record's file on disk before dropping the documents
        records = list(db.Records.find({"harvest_id": harvest['_id']}))
        for record in records:
            # ROBUSTNESS FIX: tolerate records missing a 'location' field;
            # previously a KeyError here aborted the entire cleanup, leaving
            # the Records/Attempts/Harvests documents behind
            location = record.get('location')
            if location and os.path.exists(location):
                get_logger().info("Removing %s", location)
                os.remove(location)
        db.Records.remove({"harvest_id": harvest['_id']})
        db.Attempts.remove({"parent_harvest": harvest['_id']})
        db.Harvests.remove({"_id": harvest['_id']})
    except Exception:
        # FIX: narrowed from a bare except; deletion stays best-effort but
        # no longer swallows KeyboardInterrupt/SystemExit
        get_logger().exception("Could not successfully delete harvest")
def process_doc(doc, record_url, location, harvest_obj, link, db):
    """
    Processes a document, validating the document and modifying any point
    geometry, and then inserts a record object into the database.

    :param str doc: A string which is parseable XML representing the record
                    contents
    :param str record_url: A URL to the record in the Central WAF
    :param str location: File path to the XML document on local filesystem.
    :param dict harvest_obj: A dictionary representing a harvest to be run
    :param str link: URL to the original document's URL
    :param db: MongoDB Database Object
    :returns: the inserted record dict, with '_id' set to the new id
    """
    try:
        rec = validate(doc)
        rec['record_url'] = record_url
        # After the validation has been performed, patch the geometry
        try:
            patch_geometry(location)
        except Exception:
            get_logger().exception("Failed to patch geometry for %s",
                                   record_url)
            rec["validation_errors"] = [{
                "line_number": "?",
                "error": "Invalid Geometry. See gmd:EX_GeographicBoundingBox"
            }]
            rec['record_url'] = None
        rec['url'] = link
        rec['update_time'] = datetime.now()
        rec['harvest_id'] = harvest_obj['_id']
        rec['location'] = location
        # hash the xml contents
    except etree.XMLSyntaxError as e:
        err_msg = "Record for '{}' had malformed XML, skipping".format(link)
        rec = {
            "title": record_url,
            "description": "",
            "services": [],
            "hash_val": None,
            "metadata_data": None,
            "harvest_id": harvest_obj['_id'],
            "location": location,
            "validation_errors": [{
                "line_number": "?",
                # BUG FIX: Exception.message does not exist in Python 3 and
                # raised AttributeError inside this handler; use str(e)
                "error": "XML Syntax Error: %s" % (str(e) or "Malformed XML")
            }]
        }
        get_logger().error(err_msg)
    except Exception:
        # FIX: narrowed from a bare except; unexpected failures are logged
        # and re-raised to the caller
        get_logger().exception("Failed to create record: %s", record_url)
        raise
    # upsert the record based on whether the url is already existing
    insert_result = db.Records.insert(rec)
    rec['_id'] = str(insert_result)
    return rec
def parse_csw_record(db, harvest, csw_url, dest, name, raw_rec):
    '''
    Parses and writes ISO metadata record

    :param db: MongoDB Database Object
    :param dict harvest: A dictionary returned from the mongo collection for
                         harvests.
    :param str csw_url: URL of the CSW endpoint the record came from
    :param str dest: Directory to write the record into
    :param str name: Identifier of the record
    :param raw_rec: CSW record object exposing its raw XML via ``.xml``
    :returns: True when the record was processed without validation errors
    '''
    # replace slashes with underscore so writing to file does not
    # cause missing file
    name_sanitize = name.replace('/', '_')
    file_loc = os.path.join(dest, name_sanitize + '.xml')
    get_logger().info("Writing to file %s", file_loc)
    with open(file_loc, 'wb') as f:
        f.write(raw_rec.xml)
    try:
        parts = file_loc.split('/')
        organization = parts[-2]
        filename = parts[-1]
        waf_url = os.environ.get('WAF_URL_ROOT', 'http://registry.ioos.us/')
        record_url = os.path.join(waf_url, organization, filename)
        # Get the HTTP GET Request for the record
        csw_get_record_by_id = get_csw_url(csw_url, name)
        rec = process_doc(raw_rec.xml, record_url, file_loc, harvest,
                          csw_get_record_by_id, db)
        if len(rec['validation_errors']):
            return False
    except etree.XMLSyntaxError as e:
        err_msg = "Record for '{}' had malformed XML, skipping".format(name)
        rec = {
            "title": "",
            "description": "",
            "services": [],
            "hash_val": None,
            "validation_errors": [{
                "line_number": "?",
                # BUG FIX: Exception.message does not exist in Python 3 and
                # raised AttributeError inside this handler; use str(e)
                "error": "XML Syntax Error: %s" % str(e)
            }]
        }
        get_logger().error(err_msg)
        return False
    except Exception:
        # FIX: narrowed from a bare except; unexpected failures are logged
        # and re-raised to the caller
        get_logger().exception("Failed to create record: %s", name)
        raise
    return True
def download_waf(db, harvest, src, dest):
    '''
    Downloads a WAF's contents to a destination

    :param db: Mongo DB Client
    :param dict harvest: A dictionary returned from the mongo collection for
                         harvests.
    :param url src: URL to the WAF
    :param str dest: Folder to download to
    '''
    if not os.path.exists(dest):
        os.makedirs(dest)

    parser = WAFParser(src)
    # Snapshot existing records so stale files can be purged afterwards,
    # then clear them; fresh records are re-inserted per document
    old_records = list(db.Records.find({"harvest_id": harvest['_id']}))
    db.Records.remove({"harvest_id": harvest['_id']})

    fresh_records = []
    total = 0
    failed = 0
    for link in parser.parse():
        get_logger().info("Downloading %s", link)
        try:
            # Local filename is the SHA-1 of the link to avoid collisions
            digest = sha1(link.encode('utf-8')).hexdigest()
            target = os.path.join(dest, digest + '.xml')
            get_logger().info("Saving to %s", target)
            download_file(link, target)
            record = parse_records(db, harvest, link, target)
            fresh_records.append(record)
            if len(record['validation_errors']):
                failed += 1
            total += 1
        except KeyboardInterrupt:
            raise
        except Exception:
            failed += 1
            get_logger().exception("Failed to download")
            continue

    purge_old_records(fresh_records, old_records)
    return total, failed