Example #1
0
def main():
    '''
    Command line interface for pulling a WAF
    '''
    # The docstring above is also shown as the --help description
    # (description=main.__doc__), so its text is part of the CLI output.
    cli = ArgumentParser(description=main.__doc__)

    # (flags, keyword arguments) for every supported command line option
    option_table = (
        (('-s', '--src'),
         {'help': 'Source WAF or Database Connection String'}),
        (('-d', '--dest'),
         {'help': 'Destination Folder'}),
        (('-v', '--verbose'),
         {'action': 'store_true', 'help': 'Enables verbose logging'}),
        (('-f', '--force-clean'),
         {'action': 'store_true',
          'help': 'Removes stale contents of the folder'}),
    )
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)

    opts = cli.parse_args()
    source, target = opts.src, opts.dest

    if opts.verbose:
        enable_logging()

    get_logger().info("Starting")
    if source and target:
        # Anything starting with "http" is a WAF URL; everything else is
        # handed over as a database connection string.
        fetch = download_waf if source.startswith('http') else download_from_db
        fetch(source, target)

    # Cleaning only needs a destination; it runs even without a source.
    if opts.force_clean and target:
        force_clean(target)
Example #2
0
def send_notifications(db, harvest):
    '''
    Send an email to all users belonging to the organization of the harvest
    notifying them that the harvest failed.

    Only the first address listed for each user is considered, and an address
    is only used when ``throttle_email`` accepts it. Users without a usable
    address are skipped. Nothing is sent when no recipient remains.

    :param db: Mongo DB Client
    :param dict harvest: A dictionary returned from the mongo collection for
                         harvests. Its ``organization`` and ``url`` keys are
                         read.
    '''
    users = db.users.find({"profile.organization": harvest['organization']})
    emails = []
    for user in users:
        # A user document without an "emails" list (or whose first entry has
        # no "address") must not abort the notification for everybody else.
        user_emails = user.get('emails') or []
        address = user_emails[0].get('address') if user_emails else None
        if address:
            emails.append(address)

    recipients = [email for email in emails if throttle_email(email)]
    # If there are no recipients, obviously don't send an email
    if not recipients:
        return
    for recipient in recipients:
        get_logger().info("Sending a notification to %s", recipient)

    # The mail objects are only built once there is something to send
    mail = Mail()
    msg = Message("Failed to correctly harvest",
                  sender=MAIL_DEFAULT_SENDER or "*****@*****.**",
                  recipients=recipients)
    msg.body = ("We were unable to harvest from the harvest source {url}. "
                "Please verify that the source URL is correct and contains "
                "valid XML Documents. \n\n"
                "Thanks!\nIOOS Catalog Harvester".format(url=harvest['url']))
    mail.send(msg)
Example #3
0
def download_from_db(conn_string, dest):
    '''
    Download several WAFs using collections from MongoDB as a source

    Every harvest document flagged ``publish: True`` is passed to
    ``download_harvest``. A failing harvest is logged and does not stop the
    remaining harvests.

    :param str conn_string: MongoDB connection string. The database name is
                            read from its path component
                            (``mongodb://host:port/<db_name>``); ``default``
                            is used when no name is present.
    :param str dest: Write directory destination
    '''

    tokens = conn_string.split('/')
    # Strip a trailing "?option=value" part so it does not end up in the
    # database name.
    db_name = tokens[3].split('?', 1)[0] if len(tokens) > 3 else ''
    if not db_name:
        db_name = 'default'

    db = MongoClient(conn_string)[db_name]
    # The result set is read completely before the first download starts.
    for harvest in list(db.Harvests.find({"publish": True})):
        try:
            download_harvest(db, harvest, dest)
        except Exception:
            # KeyboardInterrupt and SystemExit are not Exception subclasses,
            # so SIGINT / interpreter exit still stop the loop.
            get_logger().exception("Failed to harvest")
            get_logger().error(harvest)
Example #4
0
def download_from_db(conn_string, dest):
    '''
    Download several WAFs using collections from MongoDB as a source

    Every harvest document flagged ``publish: True`` is downloaded into a
    sub-folder of ``dest`` named after the harvest's organization, and its
    ``last_harvest_dt`` is set afterwards. A failing harvest is logged and
    does not stop the remaining harvests.

    :param str conn_string: MongoDB connection string. The database name is
                            read from its path component
                            (``mongodb://host:port/<db_name>``); ``default``
                            is used when no name is present.
    :param str dest: Write directory destination
    '''

    tokens = conn_string.split('/')
    # Strip a trailing "?option=value" part so it does not end up in the
    # database name.
    db_name = tokens[3].split('?', 1)[0] if len(tokens) > 3 else ''
    if not db_name:
        db_name = 'default'

    db = MongoClient(conn_string)[db_name]
    # The result set is read completely before the first download starts.
    for harvest in list(db.Harvests.find({"publish": True})):
        try:
            src = harvest['url']
            provider_str = harvest['organization']
            path = os.path.join(dest, provider_str)
            download_waf(src, path)
            db.Harvests.update(
                {"_id": harvest['_id']},
                {"$set": {
                    "last_harvest_dt": datetime.utcnow()
                }})
        except Exception:
            # KeyboardInterrupt and SystemExit are not Exception subclasses,
            # so SIGINT / interpreter exit still stop the loop.
            get_logger().exception("Failed to harvest")
            get_logger().error(harvest)
Example #5
0
def get_harvest_info(db, harvest):
    '''
    Returns a CKAN Harvest object from the CKAN API for Harvests (harvest_source_show)

    The harvest's organization is looked up in ``db.Organizations``; the CKAN
    harvest id is whatever follows ``/harvest/`` in that organization's
    ``ckan_harvest_url``.

    :param db: Mongo DB Client
    :param dict harvest: A dictionary returned from the mongo collection for
                         harvests.
    :return: The ``result`` member of the CKAN JSON response
    :raises ValueError: If the organization is unknown, has no
                        ``ckan_harvest_url``, or that URL contains no harvest
                        id
    :raises IOError: If CKAN answers with anything other than HTTP 200
    '''

    organization = db.Organizations.find_one({"name": harvest['organization']})
    if organization is None:
        raise ValueError("Harvest object does not contain a valid organization: %s" % harvest['organization'])
    if 'ckan_harvest_url' not in organization:
        raise ValueError("Organization does not contain a ckan_harvest_url field")

    unparseable = "The ckan_harvest_url can not be parsed into its constituent parts containing a valid harvest_id"
    matches = re.match(r'(.*)(/harvest/)(.*)', organization['ckan_harvest_url'])
    if not matches:
        raise ValueError(unparseable)

    # A successful match always carries all three groups, but the id part may
    # still be empty (URL ending in "/harvest/").
    ckan_harvest_id = matches.group(3)
    if not ckan_harvest_id:
        raise ValueError(unparseable)

    ckan_harvest_url = posixpath.join(CKAN_API, 'action/harvest_source_show')

    response = requests.get(ckan_harvest_url, params={"id": ckan_harvest_id}, allow_redirects=True, timeout=10)
    if response.status_code != 200:
        get_logger().error("CKAN ERROR: HTTP %s", str(response.status_code))
        get_logger().error(response.content)
        raise IOError("Failed to connect to CKAN: HTTP {}".format(response.status_code))

    return response.json()['result']
Example #6
0
def send_notifications(db, harvest):
    '''
    Send an email to all users belonging to the organization of the harvest
    notifying them that the harvest failed.

    Only the first address listed for each user is considered, and an address
    is only used when ``throttle_email`` accepts it. Users without a usable
    address are skipped. Nothing is sent when no recipient remains.

    :param db: Mongo DB Client
    :param dict harvest: A dictionary returned from the mongo collection for
                         harvests. Its ``organization`` and ``url`` keys are
                         read.
    '''
    users = db.users.find({"profile.organization": harvest['organization']})
    emails = []
    for user in users:
        # A user document without an "emails" list (or whose first entry has
        # no "address") must not abort the notification for everybody else.
        user_emails = user.get('emails') or []
        address = user_emails[0].get('address') if user_emails else None
        if address:
            emails.append(address)

    recipients = [email for email in emails if throttle_email(email)]
    # If there are no recipients, obviously don't send an email
    if not recipients:
        return
    for recipient in recipients:
        get_logger().info("Sending a notification to %s", recipient)

    # The mail objects are only built once there is something to send
    mail = Mail()
    msg = Message("Failed to correctly harvest",
                  sender=MAIL_DEFAULT_SENDER or "*****@*****.**",
                  recipients=recipients)
    msg.body = ("We were unable to harvest from the harvest source {url}. "
                "Please verify that the source URL is correct and contains "
                "valid XML Documents. \n\n"
                "Thanks!\nIOOS Catalog Harvester".format(url=harvest['url']))
    mail.send(msg)
Example #7
0
def main():
    '''
    Command line interface for pulling a WAF
    '''
    # The docstring above is also shown as the --help description
    # (description=main.__doc__), so its text is part of the CLI output.
    cli = ArgumentParser(description=main.__doc__)

    # (flags, keyword arguments) for every supported command line option
    option_table = (
        (('-s', '--src'),
         {'help': 'Source WAF or Database Connection String'}),
        (('-d', '--dest'),
         {'help': 'Destination Folder'}),
        (('-v', '--verbose'),
         {'action': 'store_true', 'help': 'Enables verbose logging'}),
        (('-f', '--force-clean'),
         {'action': 'store_true',
          'help': 'Removes stale contents of the folder'}),
    )
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)

    opts = cli.parse_args()
    source, target = opts.src, opts.dest

    if opts.verbose:
        enable_logging()

    get_logger().info("Starting")
    if source and target:
        # Anything starting with "http" is a WAF URL; everything else is
        # handed over as a database connection string.
        fetch = download_waf if source.startswith('http') else download_from_db
        fetch(source, target)

    # Cleaning only needs a destination; it runs even without a source.
    if opts.force_clean and target:
        force_clean(target)
Example #8
0
def download_from_db(conn_string, dest):
    '''
    Download several WAFs using collections from MongoDB as a source

    Every harvest document flagged ``publish: True`` is passed to
    ``download_harvest``. A failing harvest is logged and does not stop the
    remaining harvests.

    :param str conn_string: MongoDB connection string. The database name is
                            read from its path component
                            (``mongodb://host:port/<db_name>``); ``default``
                            is used when no name is present.
    :param str dest: Write directory destination
    '''

    tokens = conn_string.split('/')
    # Strip a trailing "?option=value" part so it does not end up in the
    # database name.
    db_name = tokens[3].split('?', 1)[0] if len(tokens) > 3 else ''
    if not db_name:
        db_name = 'default'

    db = MongoClient(conn_string)[db_name]
    # The result set is read completely before the first download starts.
    for harvest in list(db.Harvests.find({"publish": True})):
        try:
            download_harvest(db, harvest, dest)
        except Exception:
            # KeyboardInterrupt and SystemExit are not Exception subclasses,
            # so SIGINT / interpreter exit still stop the loop.
            get_logger().exception("Failed to harvest")
            get_logger().error(harvest)
Example #9
0
def delete_harvest_job(harvest_id):
    '''
    Schedules the deletion of a harvest

    The harvest document is looked up by ``_id`` and handed to
    ``harvest_api.delete_harvest`` together with the module level ``db``.

    :param str harvest_id: harvest_id
    :return: JSON string ``{"result": true}``
    '''
    log = get_logger()
    log.info("Deleting harvest")

    lookup = {"_id": harvest_id}
    target = db.Harvests.find_one(lookup)
    harvest_api.delete_harvest(db, target)

    outcome = {"result": True}
    return json.dumps(outcome)
Example #10
0
def delete_harvest_job(harvest_id):
    '''
    Schedules the deletion of a harvest

    The harvest document is looked up by ``_id`` and handed to
    ``harvest_api.delete_harvest`` together with the module level ``db``.

    :param str harvest_id: harvest_id
    :return: JSON string ``{"result": true}``
    '''
    log = get_logger()
    log.info("Deleting harvest")

    lookup = {"_id": harvest_id}
    target = db.Harvests.find_one(lookup)
    harvest_api.delete_harvest(db, target)

    outcome = {"result": True}
    return json.dumps(outcome)
Example #11
0
def main():
    '''
    Command line interface for pulling a WAF
    '''
    # The docstring above is also shown as the --help description
    # (description=main.__doc__), so its text is part of the CLI output.
    cli = ArgumentParser(description=main.__doc__)

    # (flags, keyword arguments) for every supported command line option
    option_table = (
        (('-t', '--type'),
         {'choices': ['waf', 'csw'],
          'default': 'waf',
          'help': 'Data type ("waf" or "csw", defaults to "waf")'}),
        (('-s', '--src'),
         {'required': True,
          'help': 'Source WAF or Database Connection String'}),
        (('-d', '--dest'),
         {'required': True, 'help': 'Destination Folder'}),
        (('-v', '--verbose'),
         {'action': 'store_true', 'help': 'Enables verbose logging'}),
        (('-f', '--force-clean'),
         {'action': 'store_true',
          'help': 'Removes stale contents of the folder'}),
    )
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)
    opts = cli.parse_args()

    if opts.verbose:
        setup_logging()

    get_logger().info("Starting")
    if opts.src and opts.dest:
        if not opts.src.startswith('http'):
            # Not a URL: treat the source as a database connection string
            download_from_db(opts.src, opts.dest)
        elif opts.type == 'waf':
            download_waf(opts.src, opts.dest)
        elif opts.type == 'csw':
            download_csw(opts.src, opts.dest)

    if opts.force_clean and opts.dest:
        get_logger().info("Removing stale datasets")
        # STALE_EXPIRATION_DAYS overrides the default of 3 days; a value that
        # cannot be parsed as an integer falls back to 3 as well.
        try:
            max_days = int(os.getenv('STALE_EXPIRATION_DAYS', 3))
        except ValueError:
            max_days = 3
        force_clean(opts.dest, max_days)
Example #12
0
def download_harvest(db, harvest, dest):
    '''
    Downloads a harvest from the mongo db and updates the harvest with the
    latest harvest date.

    The harvest document is first flagged as "harvesting". The download
    routine matching ``harvest['harvest_type']`` (WAF, ERDDAP-WAF or CSW) is
    then run. On success the record counts are stored and a CKAN harvest is
    triggered. On failure the error is logged, the harvest is marked "fail"
    and the organization's users are notified.

    :param db: Mongo DB Client
    :param dict harvest: A dictionary returned from the mongo collection for
                         harvests.
    :param str dest: Root folder; documents go to a sub-folder named after
                     the harvest's organization.
    '''
    src = harvest['url']
    get_logger().info('harvesting: %s', src)
    db.Harvests.update({"_id": harvest['_id']}, {
        "$set": {
            "last_harvest_dt": "harvesting",
            "last_harvest_status": None
        }
    })
    try:
        provider_str = harvest['organization']
        path = os.path.join(dest, provider_str)
        harvest_type = harvest['harvest_type']
        if harvest_type == 'WAF':
            records, errors = download_waf(db, harvest, src, path)
        elif harvest_type == 'ERDDAP-WAF':
            records, errors = download_erddap_waf(db, harvest, src, path)
        elif harvest_type == 'CSW':
            records, errors = download_csw(db, harvest, src, path)
        else:
            raise TypeError(
                'harvest_type "{}" is not supported; '
                'use WAF, ERDDAP-WAF or CSW'.format(harvest_type))
        db.Harvests.update({"_id": harvest['_id']}, {
            "$set": {
                "last_harvest_dt": datetime.utcnow(),
                "last_record_count": records,
                "last_good_count": (records - errors),
                "last_bad_count": errors,
                "last_harvest_status": "ok"
            }
        })
        trigger_ckan_harvest(db, harvest)
    except Exception:
        # KeyboardInterrupt / SystemExit are deliberately not caught here so
        # that an interrupt reaches the caller instead of being recorded as a
        # failed harvest.
        get_logger().exception("Failed to successfully harvest %s",
                               harvest['url'])
        # Record the failure before notifying anybody: a mail problem must
        # not leave the harvest stuck in the "harvesting" state.
        db.Harvests.update({"_id": harvest['_id']}, {
            "$set": {
                "last_harvest_dt": datetime.utcnow(),
                "last_harvest_status": "fail"
            }
        })
        try:
            send_notifications(db, harvest)
        except Exception:
            get_logger().exception(
                "Failed to send failure notifications for %s", harvest['url'])
Example #13
0
def trigger_ckan_harvest(db, harvest):
    '''
    Initiates a CKAN Harvest

    This is best effort: any error while looking up the CKAN harvest or
    creating the harvest job is logged and not propagated.

    :param db: Mongo DB Client
    :param dict harvest: A dictionary returned from the mongo collection for
                         harvests.
    '''
    try:
        ckan_harvest = get_harvest_info(db, harvest)
        create_harvest_job(ckan_harvest['id'])
    except Exception:
        # Only ordinary errors are suppressed; KeyboardInterrupt and
        # SystemExit still propagate.
        get_logger().exception("Failed to initiate CKAN Harvest")
Example #14
0
def trigger_ckan_harvest(db, harvest):
    '''
    Initiates a CKAN Harvest

    This is best effort: any error while looking up the CKAN harvest or
    creating the harvest job is logged and not propagated.

    :param db: Mongo DB Client
    :param dict harvest: A dictionary returned from the mongo collection for
                         harvests.
    '''
    try:
        ckan_harvest = get_harvest_info(db, harvest)
        create_harvest_job(ckan_harvest['id'])
    except Exception:
        # Only ordinary errors are suppressed; KeyboardInterrupt and
        # SystemExit still propagate.
        get_logger().exception("Failed to initiate CKAN Harvest")
Example #15
0
def download_harvest(db, harvest, dest):
    '''
    Downloads a harvest from the mongo db and updates the harvest with the
    latest harvest date.

    The harvest document is first flagged as "harvesting". The download
    routine matching ``harvest['harvest_type']`` (WAF, ERDDAP-WAF or CSW) is
    then run. On success the record counts are stored and a CKAN harvest is
    triggered. On failure the error is logged, the harvest is marked "fail"
    and the organization's users are notified.

    :param db: Mongo DB Client
    :param dict harvest: A dictionary returned from the mongo collection for
                         harvests.
    :param str dest: Root folder; documents go to a sub-folder named after
                     the harvest's organization.
    '''
    src = harvest['url']
    get_logger().info('harvesting: %s', src)
    db.Harvests.update({"_id": harvest['_id']}, {
        "$set": {
            "last_harvest_dt": "harvesting",
            "last_harvest_status": None
        }
    })
    try:
        provider_str = harvest['organization']
        path = os.path.join(dest, provider_str)
        harvest_type = harvest['harvest_type']
        if harvest_type == 'WAF':
            records, errors = download_waf(db, harvest, src, path)
        elif harvest_type == 'ERDDAP-WAF':
            records, errors = download_erddap_waf(db, harvest, src, path)
        elif harvest_type == 'CSW':
            records, errors = download_csw(db, harvest, src, path)
        else:
            raise TypeError(
                'harvest_type "{}" is not supported; '
                'use WAF, ERDDAP-WAF or CSW'.format(harvest_type))
        db.Harvests.update({"_id": harvest['_id']}, {
            "$set": {
                "last_harvest_dt": datetime.utcnow(),
                "last_record_count": records,
                "last_good_count": (records - errors),
                "last_bad_count": errors,
                "last_harvest_status": "ok"
            }
        })
        trigger_ckan_harvest(db, harvest)
    except Exception:
        # KeyboardInterrupt / SystemExit are deliberately not caught here so
        # that an interrupt reaches the caller instead of being recorded as a
        # failed harvest.
        get_logger().exception("Failed to successfully harvest %s",
                               harvest['url'])
        # Record the failure before notifying anybody: a mail problem must
        # not leave the harvest stuck in the "harvesting" state.
        db.Harvests.update({"_id": harvest['_id']}, {
            "$set": {
                "last_harvest_dt": datetime.utcnow(),
                "last_harvest_status": "fail"
            }
        })
        try:
            send_notifications(db, harvest)
        except Exception:
            get_logger().exception(
                "Failed to send failure notifications for %s", harvest['url'])
Example #16
0
def purge_old_records(new_records, old_records):
    '''
    Deletes any records in old_records that aren't in new_records

    Records are compared by their ``location`` (a file path). The file of
    every old record whose location is not among the new records is removed
    from disk if it exists. Records without a location are ignored.

    :param list new_records: List of records
    :param list old_records: List of records
    '''
    get_logger().info("Purging old records from WAF")
    # A set keeps the membership test constant-time per old record
    current_files = {r['location'] for r in new_records if r.get('location')}
    for record in old_records:
        location = record.get('location')
        # Nothing to delete for records without a path or still in use
        if not location or location in current_files:
            continue
        if os.path.exists(location):
            get_logger().info("Removing %s", location)
            os.remove(location)
Example #17
0
def force_clean(path, max_days=1):
    '''
    Deletes any files in path that end in .xml and are older than max_days
    days (1 day by default)

    Only regular files directly inside ``path`` are considered; sub-folders
    are neither descended into nor removed.

    :param str path: Path to a folder to clean
    :param max_days: Maximum age in days a file may reach before it is
                     removed. Defaults to 1.
    '''
    now = time.time()
    max_age = 24 * 3600 * max_days
    for filename in os.listdir(path):
        if not filename.endswith('.xml'):
            continue

        filepath = os.path.join(path, filename)
        # A directory that merely has an .xml name can't be os.remove()d
        if not os.path.isfile(filepath):
            continue

        mtime = os.stat(filepath).st_mtime
        if (now - mtime) > max_age:
            get_logger().info("Removing %s", filepath)
            os.remove(filepath)
Example #18
0
def get_harvest_info(db, harvest):
    '''
    Returns a CKAN Harvest object from the CKAN API for Harvests (harvest_source_show)

    The harvest's organization is looked up in ``db.Organizations``; the CKAN
    harvest id is whatever follows ``/harvest/`` in that organization's
    ``ckan_harvest_url``.

    :param db: Mongo DB Client
    :param dict harvest: A dictionary returned from the mongo collection for
                         harvests.
    :return: The ``result`` member of the CKAN JSON response
    :raises ValueError: If the organization is unknown, has no
                        ``ckan_harvest_url``, or that URL contains no harvest
                        id
    :raises IOError: If CKAN answers with anything other than HTTP 200
    '''

    organization = db.Organizations.find_one({"name": harvest['organization']})
    if organization is None:
        raise ValueError(
            "Harvest object does not contain a valid organization: %s" %
            harvest['organization'])
    if 'ckan_harvest_url' not in organization:
        raise ValueError(
            "Organization does not contain a ckan_harvest_url field")

    unparseable = "The ckan_harvest_url can not be parsed into its constituent parts containing a valid harvest_id"
    matches = re.match(r'(.*)(/harvest/)(.*)',
                       organization['ckan_harvest_url'])
    if not matches:
        raise ValueError(unparseable)

    # A successful match always carries all three groups, but the id part may
    # still be empty (URL ending in "/harvest/").
    ckan_harvest_id = matches.group(3)
    if not ckan_harvest_id:
        raise ValueError(unparseable)

    ckan_harvest_url = posixpath.join(CKAN_API, 'action/harvest_source_show')

    response = requests.get(ckan_harvest_url,
                            params={"id": ckan_harvest_id},
                            allow_redirects=True,
                            timeout=10)
    if response.status_code != 200:
        get_logger().error("CKAN ERROR: HTTP %s", str(response.status_code))
        get_logger().error(response.content)
        raise IOError("Failed to connect to CKAN: HTTP {}".format(
            response.status_code))

    return response.json()['result']
Example #19
0
def force_clean(path, max_days=1):
    '''
    Deletes any files in path that end in .xml and are older than max_days
    days (1 day by default)

    Only regular files directly inside ``path`` are considered; sub-folders
    are neither descended into nor removed.

    :param str path: Path to a folder to clean
    :param max_days: Maximum age in days a file may reach before it is
                     removed. Defaults to 1.
    '''
    now = time.time()
    max_age = 24 * 3600 * max_days
    for filename in os.listdir(path):
        if not filename.endswith('.xml'):
            continue

        filepath = os.path.join(path, filename)
        # A directory that merely has an .xml name can't be os.remove()d
        if not os.path.isfile(filepath):
            continue

        mtime = os.stat(filepath).st_mtime
        if (now - mtime) > max_age:
            get_logger().info("Removing %s", filepath)
            os.remove(filepath)
Example #20
0
def purge_old_records(new_records, old_records):
    '''
    Deletes any records in old_records that aren't in new_records

    Records are compared by their ``location`` (a file path). The file of
    every old record whose location is not among the new records is removed
    from disk if it exists. Records without a location are ignored.

    :param list new_records: List of records
    :param list old_records: List of records
    '''
    get_logger().info("Purging old records from WAF")
    # A set keeps the membership test constant-time per old record
    current_files = {r['location'] for r in new_records if r.get('location')}
    for record in old_records:
        location = record.get('location')
        # Nothing to delete for records without a path or still in use
        if not location or location in current_files:
            continue
        if os.path.exists(location):
            get_logger().info("Removing %s", location)
            os.remove(location)
Example #21
0
def main():
    '''
    Command line interface for pulling a WAF
    '''
    # The docstring above is also shown as the --help description
    # (description=main.__doc__), so its text is part of the CLI output.
    cli = ArgumentParser(description=main.__doc__)

    # (flags, keyword arguments) for every supported command line option
    option_table = (
        (('-t', '--type'),
         {'choices': ['waf', 'csw'],
          'default': 'waf',
          'help': 'Data type ("waf" or "csw", defaults to "waf")'}),
        (('-s', '--src'),
         {'required': True,
          'help': 'Source WAF or Database Connection String'}),
        (('-d', '--dest'),
         {'required': True, 'help': 'Destination Folder'}),
        (('-v', '--verbose'),
         {'action': 'store_true', 'help': 'Enables verbose logging'}),
        (('-f', '--force-clean'),
         {'action': 'store_true',
          'help': 'Removes stale contents of the folder'}),
    )
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)
    opts = cli.parse_args()

    if opts.verbose:
        setup_logging()

    get_logger().info("Starting")
    if opts.src and opts.dest:
        if not opts.src.startswith('http'):
            # Not a URL: treat the source as a database connection string
            download_from_db(opts.src, opts.dest)
        elif opts.type == 'waf':
            download_waf(opts.src, opts.dest)
        elif opts.type == 'csw':
            download_csw(opts.src, opts.dest)

    if opts.force_clean and opts.dest:
        get_logger().info("Removing stale datasets")
        # STALE_EXPIRATION_DAYS overrides the default of 3 days; a value that
        # cannot be parsed as an integer falls back to 3 as well.
        try:
            max_days = int(os.getenv('STALE_EXPIRATION_DAYS', 3))
        except ValueError:
            max_days = 3
        force_clean(opts.dest, max_days)
Example #22
0
def download_erddap_waf(db, harvest, src, dest):
    '''
    Fetches every document of an ERDDAP WAF into a local folder

    The harvest's existing records are read and removed from ``db.Records``
    up front. Each link produced by the parser is then downloaded and passed
    to ``parse_records``. Finally ``purge_old_records`` is called with the
    new and the previous records.

    :param db: Mongo DB Client
    :param dict harvest: A dictionary returned from the mongo collection for
                         harvests.
    :param url src: URL to the WAF
    :param str dest: Folder to download to (created when missing)
    :return: Tuple ``(count, errors)``: the number of documents processed
             without an exception, and the number of documents that failed
             or carry validation errors
    '''
    if not os.path.exists(dest):
        os.makedirs(dest)

    parser = ERDDAPWAFParser(src)
    harvest_filter = {"harvest_id": harvest['_id']}
    previous = list(db.Records.find(harvest_filter))
    db.Records.remove({"harvest_id": harvest['_id']})

    fresh = []
    processed = 0
    failures = 0
    for link in parser.parse():
        get_logger().info("Downloading %s", link)
        try:
            target = os.path.join(dest, link.rsplit('/', 1)[-1])
            # CKAN only looks for XML documents for the harvester
            if not target.endswith('.xml'):
                target = target + '.xml'
            download_file(link, target)
            record = parse_records(db, harvest, link, target)
            fresh.append(record)
            if len(record['validation_errors']) > 0:
                failures += 1
            processed += 1
        except Exception:
            # KeyboardInterrupt is not an Exception subclass, so an interrupt
            # still aborts the whole download.
            failures += 1
            get_logger().exception("Failed to download")
    purge_old_records(fresh, previous)
    return processed, failures
Example #23
0
def download_erddap_waf(db, harvest, src, dest):
    '''
    Fetches every document of an ERDDAP WAF into a local folder

    The harvest's existing records are read and removed from ``db.Records``
    up front. Each link produced by the parser is then downloaded and passed
    to ``parse_records``. Finally ``purge_old_records`` is called with the
    new and the previous records.

    :param db: Mongo DB Client
    :param dict harvest: A dictionary returned from the mongo collection for
                         harvests.
    :param url src: URL to the WAF
    :param str dest: Folder to download to (created when missing)
    :return: Tuple ``(count, errors)``: the number of documents processed
             without an exception, and the number of documents that failed
             or carry validation errors
    '''
    if not os.path.exists(dest):
        os.makedirs(dest)

    parser = ERDDAPWAFParser(src)
    harvest_filter = {"harvest_id": harvest['_id']}
    previous = list(db.Records.find(harvest_filter))
    db.Records.remove({"harvest_id": harvest['_id']})

    fresh = []
    processed = 0
    failures = 0
    for link in parser.parse():
        get_logger().info("Downloading %s", link)
        try:
            target = os.path.join(dest, link.rsplit('/', 1)[-1])
            # CKAN only looks for XML documents for the harvester
            if not target.endswith('.xml'):
                target = target + '.xml'
            download_file(link, target)
            record = parse_records(db, harvest, link, target)
            fresh.append(record)
            if len(record['validation_errors']) > 0:
                failures += 1
            processed += 1
        except Exception:
            # KeyboardInterrupt is not an Exception subclass, so an interrupt
            # still aborts the whole download.
            failures += 1
            get_logger().exception("Failed to download")
    purge_old_records(fresh, previous)
    return processed, failures
Example #24
0
def create_harvest_job(ckan_harvest_id):
    '''
    Creates a new harvest job on CKAN

    Posts ``{"source_id": ckan_harvest_id}`` to the ``harvest_job_create``
    action of the CKAN API, authorized with ``CKAN_API_KEY``.

    :param ckan_harvest_id: Id of the CKAN harvest source to run
    :return: The decoded JSON body of the CKAN response
    :raises IOError: If CKAN answers with anything other than HTTP 200
    '''
    ckan_harvest_url = posixpath.join(CKAN_API, 'action/harvest_job_create')
    payload = json.dumps({"source_id": ckan_harvest_id})

    # Without a timeout an unresponsive CKAN would block the harvester
    # forever; 10 seconds matches the harvest_source_show lookup.
    response = requests.post(ckan_harvest_url,
                             headers={
                                 'Content-Type': 'application/json;charset=utf-8',
                                 'Authorization': CKAN_API_KEY
                             },
                             data=payload,
                             timeout=10)
    if response.status_code != 200:
        get_logger().error("CKAN ERROR: HTTP %s", str(response.status_code))
        get_logger().error(response.content)
        raise IOError("Failed to connect to CKAN: HTTP {}".format(response.status_code))
    return response.json()
Example #25
0
def force_clean(path, max_days=3):
    '''
    Removes stale .xml files below a folder

    The folder is walked recursively; every file whose name ends in .xml and
    whose modification time lies more than ``max_days`` days in the past is
    deleted.

    :param str path: Path to a folder to clean
    :param int max_days: Maximum number of days to keep an old record before
                         removing it.
    '''
    started = time.time()
    max_age_seconds = 24 * 3600 * max_days
    for folder, _subfolders, names in os.walk(path):
        for name in names:
            if name.endswith('.xml'):
                candidate = os.path.join(folder, name)
                age = started - os.stat(candidate).st_mtime
                if age > max_age_seconds:
                    get_logger().info("Removing %s", candidate)
                    os.remove(candidate)
Example #26
0
def force_clean(path, max_days=3):
    '''
    Removes stale .xml files below a folder

    The folder is walked recursively; every file whose name ends in .xml and
    whose modification time lies more than ``max_days`` days in the past is
    deleted.

    :param str path: Path to a folder to clean
    :param int max_days: Maximum number of days to keep an old record before
                         removing it.
    '''
    started = time.time()
    max_age_seconds = 24 * 3600 * max_days
    for folder, _subfolders, names in os.walk(path):
        for name in names:
            if name.endswith('.xml'):
                candidate = os.path.join(folder, name)
                age = started - os.stat(candidate).st_mtime
                if age > max_age_seconds:
                    get_logger().info("Removing %s", candidate)
                    os.remove(candidate)
Example #27
0
def create_harvest_job(ckan_harvest_id):
    '''
    Creates a new harvest job on CKAN

    Posts ``{"source_id": ckan_harvest_id}`` to the ``harvest_job_create``
    action of the CKAN API, authorized with ``CKAN_API_KEY``.

    :param ckan_harvest_id: Id of the CKAN harvest source to run
    :return: The decoded JSON body of the CKAN response
    :raises IOError: If CKAN answers with anything other than HTTP 200
    '''
    ckan_harvest_url = posixpath.join(CKAN_API, 'action/harvest_job_create')
    payload = json.dumps({"source_id": ckan_harvest_id})

    # Without a timeout an unresponsive CKAN would block the harvester
    # forever; 10 seconds matches the harvest_source_show lookup.
    response = requests.post(ckan_harvest_url,
                             headers={
                                 'Content-Type':
                                 'application/json;charset=utf-8',
                                 'Authorization': CKAN_API_KEY
                             },
                             data=payload,
                             timeout=10)
    if response.status_code != 200:
        get_logger().error("CKAN ERROR: HTTP %s", str(response.status_code))
        get_logger().error(response.content)
        raise IOError("Failed to connect to CKAN: HTTP {}".format(
            response.status_code))
    return response.json()
Example #28
0
def download_waf(src, dest):
    '''
    Downloads a WAF's contents to a destination

    Every link produced by the WAF parser is saved in ``dest`` under the last
    path segment of the link. A failing download is logged and the remaining
    links are still processed.

    :param url src: URL to the WAF
    :param str dest: Folder to download to (created when missing)
    '''
    if not os.path.exists(dest):
        os.makedirs(dest)

    waf_parser = WAFParser(src)

    for link in waf_parser.parse():
        get_logger().info("Downloading %s", link)
        try:
            doc_name = link.split('/')[-1]
            local_filename = os.path.join(dest, doc_name)
            download_file(link, local_filename)
        except Exception:
            # KeyboardInterrupt and SystemExit are not Exception subclasses,
            # so they still abort the whole download.
            get_logger().exception("Failed to download")
Example #29
0
def download_waf(src, dest):
    '''
    Downloads a WAF's contents to a destination

    Every link produced by the WAF parser is saved in ``dest`` under the last
    path segment of the link. A failing download is logged and the remaining
    links are still processed.

    :param url src: URL to the WAF
    :param str dest: Folder to download to (created when missing)
    '''
    if not os.path.exists(dest):
        os.makedirs(dest)

    waf_parser = WAFParser(src)

    for link in waf_parser.parse():
        get_logger().info("Downloading %s", link)
        try:
            doc_name = link.split('/')[-1]
            local_filename = os.path.join(dest, doc_name)
            download_file(link, local_filename)
        except Exception:
            # KeyboardInterrupt and SystemExit are not Exception subclasses,
            # so they still abort the whole download.
            get_logger().exception("Failed to download")
Example #30
0
def delete_harvest(db, harvest):
    '''
    Deletes a harvest, all associated attempts and records

    The files referenced by the harvest's records are removed from disk
    first, then the record, attempt and harvest documents are removed from
    the database. Any error is logged and not propagated.

    :param db: MongoDB Client
    :param dict harvest: A dictionary returned from the mongo collection for
                         harvests.
    '''

    try:
        # Remove the downloaded files that belong to the harvest's records
        records = list(db.Records.find({"harvest_id": harvest['_id']}))
        for record in records:
            # A record without a stored file path has nothing on disk to
            # clean up and must not abort the deletion of the harvest.
            location = record.get('location')
            if location and os.path.exists(location):
                get_logger().info("Removing %s", location)
                os.remove(location)

        db.Records.remove({"harvest_id": harvest['_id']})

        db.Attempts.remove({"parent_harvest": harvest['_id']})
        db.Harvests.remove({"_id": harvest['_id']})

    except Exception:
        get_logger().exception("Could not successfully delete harvest")
Example #31
0
def delete_harvest(db, harvest):
    '''
    Deletes a harvest, all associated attempts and records

    The files referenced by the harvest's records are removed from disk
    first, then the record, attempt and harvest documents are removed from
    the database. Any error is logged and not propagated.

    :param db: MongoDB Client
    :param dict harvest: A dictionary returned from the mongo collection for
                         harvests.
    '''

    try:
        # Remove the downloaded files that belong to the harvest's records
        records = list(db.Records.find({"harvest_id": harvest['_id']}))
        for record in records:
            # A record without a stored file path has nothing on disk to
            # clean up and must not abort the deletion of the harvest.
            location = record.get('location')
            if location and os.path.exists(location):
                get_logger().info("Removing %s", location)
                os.remove(location)

        db.Records.remove({"harvest_id": harvest['_id']})

        db.Attempts.remove({"parent_harvest": harvest['_id']})
        db.Harvests.remove({"_id": harvest['_id']})

    except Exception:
        get_logger().exception("Could not successfully delete harvest")
Example #32
0
def process_doc(doc, record_url, location, harvest_obj, link, db):
    """
    Processes a document, validating the document and modifying any point
    geometry, and then inserts a record object into the database.

    If patching the geometry fails, the record is still stored, but its
    ``validation_errors`` are replaced by a single "Invalid Geometry" entry
    and its ``record_url`` is cleared. If the document is malformed XML, a
    placeholder record carrying the syntax error is stored instead.

    :param str doc: A string which is parseable XML representing the record
                    contents
    :param str record_url: A URL to the record in the Central WAF
    :param str location: File path to the XML document on local filesystem.
    :param dict harvest_obj: A dictionary representing a harvest to be run
    :param str link: URL to the original document's URL
    :param db: MongoDB Database Object
    :return: The record that was inserted, with ``_id`` set to the string
             form of the insert result
    :rtype: dict
    :raises Exception: Any error other than ``etree.XMLSyntaxError`` raised
                       while building the record is logged and re-raised
    """
    try:
        rec = validate(doc)
        rec['record_url'] = record_url
        # After the validation has been performed, patch the geometry
        try:
            patch_geometry(location)
        except Exception:
            # Only ordinary errors mark the geometry as invalid; an
            # interrupt or interpreter exit must not be recorded as one.
            get_logger().exception("Failed to patch geometry for %s",
                                   record_url)
            rec["validation_errors"] = [{
                "line_number": "?",
                "error": "Invalid Geometry. See gmd:EX_GeographicBoundingBox"
            }]
            rec['record_url'] = None
        rec['url'] = link
        rec['update_time'] = datetime.now()
        rec['harvest_id'] = harvest_obj['_id']
        rec['location'] = location
    except etree.XMLSyntaxError as e:
        err_msg = "Record for '{}' had malformed XML, skipping".format(link)
        # ``message`` only exists on Python 2 exceptions; fall back to the
        # SyntaxError-style ``msg`` so this handler cannot itself fail with
        # an AttributeError.
        detail = (getattr(e, 'message', None) or getattr(e, 'msg', None)
                  or "Malformed XML")
        rec = {
            "title": record_url,
            "description": "",
            "services": [],
            "hash_val": None,
            "metadata_data": None,
            "harvest_id": harvest_obj['_id'],
            "location": location,
            "validation_errors": [{
                "line_number": "?",
                "error": "XML Syntax Error: %s" % detail
            }]
        }
        get_logger().error(err_msg)
    except Exception:
        get_logger().exception("Failed to create record: %s", record_url)
        raise
    # Always insert a new document; the caller is responsible for removing
    # any earlier records of the same harvest.
    insert_result = db.Records.insert(rec)
    rec['_id'] = str(insert_result)
    return rec
Example #33
0
def parse_csw_record(db, harvest, csw_url, dest, name, raw_rec):
    '''
    Parses and writes ISO metadata record

    The raw XML is written to ``<dest>/<name>.xml`` (slashes in ``name``
    replaced by underscores) and then handed to ``process_doc`` together
    with the record's URL, which is built from the ``WAF_URL_ROOT``
    environment variable, the name of the folder holding the file and the
    file name.

    :param db: MongoDB Database Object, passed on to ``process_doc``
    :param dict harvest: The harvest the record belongs to, passed on to
                         ``process_doc``
    :param str csw_url: URL of the CSW endpoint, used to build the request
                        URL for this record
    :param str dest: Folder the XML file is written to
    :param str name: Identifier of the record
    :param raw_rec: Record object whose ``xml`` attribute holds the document
    :return: ``True`` if the record was processed without validation errors,
             ``False`` if it has validation errors or malformed XML
    :rtype: bool
    :raises Exception: Any other error is logged and re-raised
    '''
    # replace slashes with underscore so writing to file does not
    # cause missing file
    name_sanitize = name.replace('/', '_')
    file_loc = os.path.join(dest, name_sanitize + '.xml')
    get_logger().info("Writing to file %s", file_loc)
    with open(file_loc, 'wb') as f:
        f.write(raw_rec.xml)
    try:
        parts = file_loc.split('/')
        organization = parts[-2]
        filename = parts[-1]
        waf_url = os.environ.get('WAF_URL_ROOT', 'http://registry.ioos.us/')
        record_url = os.path.join(waf_url, organization, filename)

        # Get the HTTP GET Request for the record
        csw_get_record_by_id = get_csw_url(csw_url, name)

        rec = process_doc(raw_rec.xml, record_url, file_loc, harvest,
                          csw_get_record_by_id, db)
        if len(rec['validation_errors']):
            return False
    except etree.XMLSyntaxError:
        # Nothing is stored on this path, so only the skip is logged. The
        # placeholder dict that used to be built here was never used and
        # read ``e.message``, which does not exist on Python 3.
        get_logger().error(
            "Record for '{}' had malformed XML, skipping".format(name))
        return False
    except Exception:
        get_logger().exception("Failed to create record: %s", name)
        raise
    return True
Example #34
0
def download_waf(db, harvest, src, dest):
    '''
    Mirrors every document linked from a WAF into a local folder

    Each link is saved in ``dest`` as ``<sha1 of the link>.xml`` and handed
    to ``parse_records``. The records already stored for the harvest are
    read and removed from the database up front, then passed together with
    the newly parsed ones to ``purge_old_records`` once the crawl is over.

    :param db: Mongo DB Client
    :param dict harvest: A dictionary returned from the mongo collection for
                         harvests.
    :param url src: URL to the WAF
    :param str dest: Folder to download to
    :return: ``(count, errors)`` where ``count`` is the number of links
             processed without raising and ``errors`` is the number of links
             that raised or produced validation errors
    '''
    if not os.path.exists(dest):
        os.makedirs(dest)

    parser = WAFParser(src)
    harvest_id = harvest['_id']
    previous = list(db.Records.find({"harvest_id": harvest_id}))
    db.Records.remove({"harvest_id": harvest_id})
    fetched = []

    processed = failed = 0
    for link in parser.parse():
        get_logger().info("Downloading %s", link)
        try:
            digest = sha1(link.encode('utf-8')).hexdigest()
            target = os.path.join(dest, digest + '.xml')
            get_logger().info("Saving to %s", target)

            download_file(link, target)
            record = parse_records(db, harvest, link, target)
            fetched.append(record)

            if len(record['validation_errors']) > 0:
                failed += 1
            processed += 1
        except Exception:
            # KeyboardInterrupt is not an Exception, so it still propagates
            failed += 1
            get_logger().exception("Failed to download")

    purge_old_records(fetched, previous)
    return processed, failed
Example #35
0
def download_waf(db, harvest, src, dest):
    '''
    Mirrors every document linked from a WAF into a local folder

    Each link is saved in ``dest`` as ``<sha1 of the link>.xml`` and handed
    to ``parse_records``. The records already stored for the harvest are
    read and removed from the database up front, then passed together with
    the newly parsed ones to ``purge_old_records`` once the crawl is over.

    :param db: Mongo DB Client
    :param dict harvest: A dictionary returned from the mongo collection for
                         harvests.
    :param url src: URL to the WAF
    :param str dest: Folder to download to
    :return: ``(count, errors)`` where ``count`` is the number of links
             processed without raising and ``errors`` is the number of links
             that raised or produced validation errors
    '''
    if not os.path.exists(dest):
        os.makedirs(dest)

    parser = WAFParser(src)
    harvest_id = harvest['_id']
    previous = list(db.Records.find({"harvest_id": harvest_id}))
    db.Records.remove({"harvest_id": harvest_id})
    fetched = []

    processed = failed = 0
    for link in parser.parse():
        get_logger().info("Downloading %s", link)
        try:
            digest = sha1(link.encode('utf-8')).hexdigest()
            target = os.path.join(dest, digest + '.xml')
            get_logger().info("Saving to %s", target)

            download_file(link, target)
            record = parse_records(db, harvest, link, target)
            fetched.append(record)

            if len(record['validation_errors']) > 0:
                failed += 1
            processed += 1
        except Exception:
            # KeyboardInterrupt is not an Exception, so it still propagates
            failed += 1
            get_logger().exception("Failed to download")

    purge_old_records(fetched, previous)
    return processed, failed
Example #36
0
def parse_csw_record(db, harvest, csw_url, dest, name, raw_rec):
    '''
    Parses and writes ISO metadata record

    The raw XML is written to ``<dest>/<name>.xml`` (slashes in ``name``
    replaced by underscores) and then handed to ``process_doc`` together
    with the record's URL, which is built from the ``WAF_URL_ROOT``
    environment variable, the name of the folder holding the file and the
    file name.

    :param db: MongoDB Database Object, passed on to ``process_doc``
    :param dict harvest: The harvest the record belongs to, passed on to
                         ``process_doc``
    :param str csw_url: URL of the CSW endpoint, used to build the request
                        URL for this record
    :param str dest: Folder the XML file is written to
    :param str name: Identifier of the record
    :param raw_rec: Record object whose ``xml`` attribute holds the document
    :return: ``True`` if the record was processed without validation errors,
             ``False`` if it has validation errors or malformed XML
    :rtype: bool
    :raises Exception: Any other error is logged and re-raised
    '''
    # replace slashes with underscore so writing to file does not
    # cause missing file
    name_sanitize = name.replace('/', '_')
    file_loc = os.path.join(dest, name_sanitize + '.xml')
    get_logger().info("Writing to file %s", file_loc)
    with open(file_loc, 'wb') as f:
        f.write(raw_rec.xml)
    try:
        parts = file_loc.split('/')
        organization = parts[-2]
        filename = parts[-1]
        waf_url = os.environ.get('WAF_URL_ROOT', 'http://registry.ioos.us/')
        record_url = os.path.join(waf_url, organization, filename)

        # Get the HTTP GET Request for the record
        csw_get_record_by_id = get_csw_url(csw_url, name)

        rec = process_doc(raw_rec.xml, record_url, file_loc, harvest,
                          csw_get_record_by_id, db)
        if len(rec['validation_errors']):
            return False
    except etree.XMLSyntaxError:
        # Nothing is stored on this path, so only the skip is logged. The
        # placeholder dict that used to be built here was never used and
        # read ``e.message``, which does not exist on Python 3.
        get_logger().error(
            "Record for '{}' had malformed XML, skipping".format(name))
        return False
    except Exception:
        get_logger().exception("Failed to create record: %s", name)
        raise
    return True
Example #37
0
def process_doc(doc, record_url, location, harvest_obj, link, db):
    """
    Processes a document, validating the document and modifying any point
    geometry, and then inserts a record object into the database.

    If patching the geometry fails, the record is still stored, but its
    ``validation_errors`` are replaced by a single "Invalid Geometry" entry
    and its ``record_url`` is cleared. If the document is malformed XML, a
    placeholder record carrying the syntax error is stored instead.

    :param str doc: A string which is parseable XML representing the record
                    contents
    :param str record_url: A URL to the record in the Central WAF
    :param str location: File path to the XML document on local filesystem.
    :param dict harvest_obj: A dictionary representing a harvest to be run
    :param str link: URL to the original document's URL
    :param db: MongoDB Database Object
    :return: The record that was inserted, with ``_id`` set to the string
             form of the insert result
    :rtype: dict
    :raises Exception: Any error other than ``etree.XMLSyntaxError`` raised
                       while building the record is logged and re-raised
    """
    try:
        rec = validate(doc)
        rec['record_url'] = record_url
        # After the validation has been performed, patch the geometry
        try:
            patch_geometry(location)
        except Exception:
            # Only ordinary errors mark the geometry as invalid; an
            # interrupt or interpreter exit must not be recorded as one.
            get_logger().exception("Failed to patch geometry for %s",
                                   record_url)
            rec["validation_errors"] = [{
                "line_number": "?",
                "error": "Invalid Geometry. See gmd:EX_GeographicBoundingBox"
            }]
            rec['record_url'] = None
        rec['url'] = link
        rec['update_time'] = datetime.now()
        rec['harvest_id'] = harvest_obj['_id']
        rec['location'] = location
    except etree.XMLSyntaxError as e:
        err_msg = "Record for '{}' had malformed XML, skipping".format(link)
        # ``message`` only exists on Python 2 exceptions; fall back to the
        # SyntaxError-style ``msg`` so this handler cannot itself fail with
        # an AttributeError.
        detail = (getattr(e, 'message', None) or getattr(e, 'msg', None)
                  or "Malformed XML")
        rec = {
            "title": record_url,
            "description": "",
            "services": [],
            "hash_val": None,
            "metadata_data": None,
            "harvest_id": harvest_obj['_id'],
            "location": location,
            "validation_errors": [{
                "line_number": "?",
                "error": "XML Syntax Error: %s" % detail
            }]
        }
        get_logger().error(err_msg)
    except Exception:
        get_logger().exception("Failed to create record: %s", record_url)
        raise
    # Always insert a new document; the caller is responsible for removing
    # any earlier records of the same harvest.
    insert_result = db.Records.insert(rec)
    rec['_id'] = str(insert_result)
    return rec