Example #1
0
def get_ia_db(configfile=None):
    """Return the shared archive.org database connection, if one is configured.

    Metadata API is slow, so talk to the archive.org database directly if it
    is specified in the global configuration or if a configfile is provided.

    :param configfile: optional config file path; when given it is loaded
        with ``load_config`` before the ``ia_db`` settings are looked up
    :return: the module-level ``web.database`` connection (created on first
        use and reused afterwards), or None when no ``ia_db`` is configured
    """
    global _ia_db

    if configfile:
        from openlibrary.config import load_config
        load_config(configfile)

    if not config.get("ia_db"):
        return None

    if not _ia_db:
        settings = config.ia_db
        host = settings['host']
        db = settings['db']
        user = settings['user']
        # The 'pw_file' setting is run through os.popen and its output is
        # used as the password. Close the pipe explicitly rather than
        # leaving it open until garbage collection.
        with os.popen(settings['pw_file']) as pipe:
            pw = pipe.read().strip()
        _ia_db = web.database(dbn="postgres",
                              host=host,
                              db=db,
                              user=user,
                              pw=pw)
    return _ia_db
Example #2
0
def main(ol_config: str):
    """Load the config, find or create the current 'bwb' batch and import into it.

    :param ol_config: config file path handed to ``load_config``
    """
    load_config(ol_config)

    # Partner data is offset ~15 days from start of month, so the batch name
    # is built from the year and month of the date 15 days ago.
    reference_day = datetime.date.today() - timedelta(days=15)
    batch_name = "%s-%04d%02d" % ('bwb', reference_day.year, reference_day.month)

    # Reuse the batch with that name when it exists, otherwise create it
    batch = Batch.find(batch_name)
    if not batch:
        batch = Batch.new(batch_name)

    # The first command line argument is passed on to batch_import
    batch_import(sys.argv[1], batch)
def main():
    """Load /olsystem/etc/openlibrary.yml, then import into the current 'bwb' batch."""
    config_path = os.path.join(os.sep, 'olsystem', 'etc', 'openlibrary.yml')
    load_config(os.path.abspath(config_path))

    # Partner data is offset ~15 days from start of month, so the batch name
    # is built from the year and month of the date 15 days ago.
    reference_day = datetime.date.today() - timedelta(days=15)
    batch_name = "%s-%04d%02d" % ('bwb', reference_day.year, reference_day.month)

    # Reuse the batch with that name when it exists, otherwise create it
    batch = Batch.find(batch_name)
    if not batch:
        batch = Batch.new(batch_name)

    # The first command line argument is passed on to batch_import
    batch_import(sys.argv[1], batch)
Example #4
0
def connect_to_couch(config_file):
    """Connects to the couch database named in the config's admin section.

    :param config_file: path of the YAML config file; it is passed to
        ``load_config`` and also parsed here to read ``admin.counts_db``
    :return: a ``couchdb.Database`` built from the ``counts_db`` value
    """
    load_config(config_file)
    infogami._setup()

    # A context manager closes the file even when parsing raises.
    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary objects and is rejected by recent PyYAML releases; consider
    # yaml.safe_load if the config only uses plain YAML types.
    with open(config_file) as f:
        settings = yaml.load(f)
    admin_db = settings["admin"]["counts_db"]
    return couchdb.Database(admin_db)
Example #5
0
def connect_to_couch(config_file):
    """Connects to the couch database named in the config's admin section.

    :param config_file: path of the YAML config file; it is passed to
        ``load_config`` and also parsed here to read ``admin.counts_db``
    :return: a ``couchdb.Database`` built from the ``counts_db`` value
    """
    load_config(config_file)
    infogami._setup()

    # A context manager closes the file even when parsing raises.
    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary objects and is rejected by recent PyYAML releases; consider
    # yaml.safe_load if the config only uses plain YAML types.
    with open(config_file) as f:
        settings = yaml.load(f)
    admin_db = settings["admin"]["counts_db"]
    return couchdb.Database(admin_db)
Example #6
0
def main():
    """Command line entry point: search the OL database and update solr's search index.

    Stores the parsed command line options in the module-level ``options``,
    loads the configuration when no runtime config is present yet, then
    either starts the daemon or scans days.
    """
    global options
    options = parse_options()

    if not config.runtime_config:
        config_path = options.config
        config.load(config_path)
        config.load_config(config_path)

    # Explicit comparison with True is kept on purpose: only a value equal
    # to True selects daemon mode.
    run_as_daemon = options.daemon == True  # noqa: E712
    if run_as_daemon:
        start_daemon()
        return
    scan_days()
def main():
    """Command line entry point: search the OL database and update solr's search index.

    Stores the parsed command line options in the module-level ``options``,
    loads the configuration when no runtime config is present yet, then
    either starts the daemon or scans days.
    """
    global options
    options = parse_options()

    if not config.runtime_config:
        config_path = options.config
        config.load(config_path)
        config.load_config(config_path)

    # Explicit comparison with True is kept on purpose: only a value equal
    # to True selects daemon mode.
    run_as_daemon = options.daemon == True  # noqa: E712
    if run_as_daemon:
        start_daemon()
        return
    scan_days()
Example #8
0
def import_job(
    ol_config: str,
    dry_run=False,
) -> None:
    """
    Queue feed entries updated since the last recorded import.

    Stops early when the feed's last-modified time cannot be determined or
    equals the stored one. In a dry run the records are only printed and the
    stored timestamp is left untouched.

    :param ol_config: Path to openlibrary.yml file
    :param dry_run: If true, only print out records to import
    """
    load_config(ol_config)

    # HEAD request gives the feed's last-modified time
    last_modified = find_last_updated()
    if not last_modified:
        print(
            f'HEAD request to {FEED_URL} failed. Not attempting GET request.')
        return
    print(f'Last-Modified date: {last_modified}')

    # Nothing to do when the feed has not changed since the previous run
    updated_on = get_last_updated_time()
    if last_modified == updated_on:
        print(f'No new updates since {updated_on}. Processing completed.')
        return
    print(f'Last import job: {updated_on or "No date found"}')

    feed = get_feed()

    # Cut-off derived from the stored timestamp
    modified_since = convert_date_string(updated_on)

    print(
        f'Importing all entries that have been updated since {modified_since}.'
    )
    modified_entries = filter_modified_since(feed.entries, modified_since)
    print(f'{len(modified_entries)} import objects created.')

    if dry_run:
        # Dry run: show what would be imported and leave everything else alone
        for record in modified_entries:
            print(json.dumps(record))
        return

    create_batch(modified_entries)
    print(
        f'{len(modified_entries)} entries added to the batch import job.')

    # Store timestamp for header
    with open(LAST_UPDATED_TIME, 'w+') as f:
        f.write(last_modified)
        print(f'Last updated timestamp written to: {LAST_UPDATED_TIME}')
Example #9
0
def main():
    """Command line entry point: load the config, then dispatch on the sub-command.

    An optional ``--config <path>`` pair is removed from ``sys.argv``; without
    it the config defaults to ``openlibrary/conf/openlibrary.yml`` two
    directories above this file. ``sys.argv[1]`` is the command; remaining
    tokens starting with ``--`` become boolean flags and all others become
    positional arguments. Unknown commands are logged as an error.
    """
    if "--config" in sys.argv:
        pos = sys.argv.index("--config")
        configfile = sys.argv[pos + 1]
        # Drop the option and its value so only the command and its
        # arguments remain
        del sys.argv[pos:pos + 2]
    else:
        import os

        script_dir = os.path.dirname(__file__)
        default_path = os.path.join(
            script_dir,
            os.pardir,
            os.pardir,
            'openlibrary',
            'conf',
            'openlibrary.yml',
        )
        configfile = os.path.abspath(default_path)

    load_config(configfile)

    # Imported only after load_config has run, as in the original ordering
    from infogami import config

    cmd = sys.argv[1]
    args = []
    flags = {
        'servername': config.get('servername', 'https://openlibrary.org')
    }
    for token in sys.argv[2:]:
        if token.startswith('--'):
            flags[token[2:]] = True
        else:
            args.append(token)

    if cmd == "import-retro":
        # Exactly two positional arguments are read as integer start/stop
        if args and len(args) == 2:
            start, stop = [int(value) for value in args]
        else:
            start, stop = None, None
        return retroactive_import(start=start,
                                  stop=stop,
                                  servername=flags['servername'])
    if cmd == "import-ocaids":
        return import_ocaids(*args, **flags)
    if cmd == "add-items":
        return add_items(*args)
    if cmd == "add-new-scans":
        return add_new_scans(args)
    if cmd == "import-batch":
        return import_batch(args, **flags)
    if cmd == "import-all":
        return import_all(args, **flags)
    if cmd == "import-item":
        return import_item(args, **flags)

    logger.error("Unknown command: %s", cmd)
Example #10
0
def main():
    """Run the solr updater loop.

    Parses the command line into the module-level ``args``, loads the
    config, resumes reading the infobase log from the offset kept in the
    state file and then loops forever: read records, update the parsed keys,
    save the new offset and commit to solr unless COMMIT is off.
    """
    global args
    log_format = "%(asctime)-15s %(levelname)s %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format)

    logger.info("BEGIN new-solr-updater")

    args = parse_arguments()
    process_args(args)

    # set OL URL when running on a dev-instance
    if args.ol_url:
        update_work.set_query_host(
            web.lstrips(args.ol_url, "http://").strip("/"))

    logger.info(str(args))
    logger.info("loading config from %s", args.config)
    load_config(args.config)

    state_file = args.state_file
    offset = read_state_file(state_file)

    logfile = InfobaseLog(config.get('infobase_server'),
                          exclude=args.exclude_edits_containing)
    logfile.seek(offset)

    solr = Solr()

    while True:
        keys = parse_log(logfile.read_records())
        count = update_keys(keys)

        # Persist the log position whenever it has moved
        if logfile.tell() != offset:
            offset = logfile.tell()
            logger.info("saving offset %s", offset)
            with open(state_file, "w") as f:
                f.write(offset)

        if not COMMIT:
            logger.info("not doing solr commit as commit is off")
        else:
            solr.commit(ndocs=count)

        # don't sleep after committing some records.
        # While the commit was on, some more edits might have happened.
        if count == 0:
            logger.debug("No more log records available, sleeping...")
            time.sleep(5)
def main():
    """Run the solr updater loop.

    Parses the command line into the module-level ``args``, loads the
    config, resumes reading the infobase log from the offset kept in the
    state file and then loops forever: read records, update the parsed keys,
    save the new offset and commit to solr unless COMMIT is off.
    """
    global args
    log_format = "%(asctime)-15s %(levelname)s %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format)

    logger.info("BEGIN new-solr-updater")

    args = parse_arguments()
    process_args(args)

    # set OL URL when running on a dev-instance
    if args.ol_url:
        update_work.set_query_host(
            web.lstrips(args.ol_url, "http://").strip("/"))

    print(str(args))
    logger.info("loading config from %s", args.config)
    load_config(args.config)

    state_file = args.state_file
    offset = read_state_file(state_file)

    logfile = InfobaseLog(config.get('infobase_server'))
    logfile.seek(offset)

    solr = Solr()

    while True:
        keys = parse_log(logfile.read_records())
        count = update_keys(keys)

        # Persist the log position whenever it has moved
        if logfile.tell() != offset:
            offset = logfile.tell()
            logger.info("saving offset %s", offset)
            with open(state_file, "w") as f:
                f.write(offset)

        if not COMMIT:
            logger.info("not doing solr commit as commit is off")
        else:
            solr.commit(ndocs=count)

        # don't sleep after committing some records.
        # While the commit was on, some more edits might have happened.
        if count == 0:
            logger.debug("No more log records available, sleeping...")
            time.sleep(5)
Example #12
0
def main(config, start, end):
    """
    Get the unique visitors per day between the 2 dates (inclusive) and store them
    in the infogami database. An IndexError raised for a day is printed and
    processing continues with the next day.

    :param config: configuration passed to ``load_config``
    :param datetime start: first day to process
    :param datetime end: last day to process (inclusive)
    :return: None
    """
    load_config(config)  # loads config for psql db under the hood
    infogami._setup()

    current = start
    while current <= end:
        try:
            count = count_unique_ips_for_day(current)
            store_data(dict(visitors=count), current)
        except IndexError as e:
            # Exceptions have no ``message`` attribute on Python 3, so
            # reading it here would raise AttributeError and abort the loop;
            # print the exception itself instead.
            print(e)
        current += timedelta(days=1)
Example #13
0
def main(config, start, end):
    """
    Get the unique visitors per day between the 2 dates (inclusive) and store them
    in the infogami database. An IndexError raised for a day is printed and
    processing continues with the next day.

    :param config: configuration passed to ``load_config``
    :param datetime start: first day to process
    :param datetime end: last day to process (inclusive)
    :return: None
    """
    load_config(config)  # loads config for psql db under the hood
    infogami._setup()

    current = start
    while current <= end:
        try:
            count = count_unique_ips_for_day(current)
            store_data(dict(visitors=count), current)
        except IndexError as e:
            # Exceptions have no ``message`` attribute on Python 3, so
            # reading it here would raise AttributeError and abort the loop;
            # print the exception itself instead.
            print(e)
        current += timedelta(days=1)
Example #14
0
def main():
    """Command line entry point: load the config and run the requested command.

    An optional ``--config <path>`` pair is removed from ``sys.argv``
    (default config file: ``openlibrary.yml``). ``sys.argv[1]`` names the
    command and the remaining arguments are handed to it as a list.
    Unrecognised commands do nothing and return None.
    """
    configfile = "openlibrary.yml"
    if "--config" in sys.argv:
        pos = sys.argv.index("--config")
        configfile = sys.argv[pos + 1]
        # Drop the option and its value so only the command and its
        # arguments remain
        del sys.argv[pos:pos + 2]
    load_config(configfile)

    cmd, args = sys.argv[1], sys.argv[2:]
    if cmd == "add-items":
        return add_items(args)
    if cmd == "add-new-scans":
        return add_new_scans(args)
    if cmd == "import-batch":
        return import_batch(args)
    if cmd == "import-all":
        return import_all(args)
Example #15
0
def get_ia_db(configfile=None):
    """Return the shared archive.org database connection, if one is configured.

    Metadata API is slow, so talk to the archive.org database directly if it
    is specified in the global configuration or if a configfile is provided.

    :param configfile: optional config file path; when given it is loaded
        with ``load_config`` before the ``ia_db`` settings are looked up
    :return: the module-level ``web.database`` connection (created on first
        use and reused afterwards), or None when no ``ia_db`` is configured
    """
    global _ia_db

    if configfile:
        from openlibrary.config import load_config
        load_config(configfile)

    if not config.get("ia_db"):
        return None

    if not _ia_db:
        settings = config.ia_db
        host = settings['host']
        db = settings['db']
        user = settings['user']
        # The 'pw_file' setting is run through os.popen and its output is
        # used as the password. Close the pipe explicitly rather than
        # leaving it open until garbage collection.
        with os.popen(settings['pw_file']) as pipe:
            pw = pipe.read().strip()
        _ia_db = web.database(dbn="postgres", host=host, db=db, user=user, pw=pw)
    return _ia_db
Example #16
0
async def main(
    ol_config: str,
    debugger=False,
    state_file='solr-update.state',
    exclude_edits_containing: str = None,
    ol_url='http://openlibrary.org/',
    solr_url: str = None,
    solr_next=False,
    socket_timeout=10,
    load_ia_scans=False,
    commit=True,
    initial_state: str = None,
):
    """
    Follow the infobase log forever and push the changed keys to solr.

    :param ol_config: Path of the config file passed to load_config
    :param debugger: Wait for a debugger to attach before beginning
    :param state_file: File the current log offset is read from and saved to
    :param exclude_edits_containing: Don't index matching edits
    :param ol_url: OL URL; its host part is passed to update_work.set_query_host
    :param solr_url: If wanting to override what's in the config file
    :param solr_next: Whether to assume new schema/etc are used
    :param socket_timeout: Default socket timeout in seconds
    :param load_ia_scans: Passed through to parse_log
    :param commit: Whether to commit to solr after each round of updates
    :param initial_state: State to use if state file doesn't exist. Defaults to today.
    """
    # Local import so this block does not rely on a module-level asyncio import
    import asyncio

    FORMAT = "%(asctime)-15s %(levelname)s %(message)s"
    logging.basicConfig(level=logging.INFO, format=FORMAT)
    logger.info("BEGIN new-solr-updater")

    if debugger:
        import debugpy

        logger.info("Enabling debugger attachment (attach if it hangs here)")
        debugpy.listen(address=('0.0.0.0', 3000))
        logger.info("Waiting for debugger to attach...")
        debugpy.wait_for_client()
        logger.info("Debugger attached to port 3000")

    # Sometimes archive.org requests blocks forever.
    # Setting a timeout will make the request fail instead of waiting forever.
    socket.setdefaulttimeout(socket_timeout)

    # set OL URL when running on a dev-instance
    if ol_url:
        host = web.lstrips(ol_url, "http://").strip("/")
        update_work.set_query_host(host)

    if solr_url:
        update_work.set_solr_base_url(solr_url)

    update_work.set_solr_next(solr_next)

    logger.info("loading config from %s", ol_config)
    load_config(ol_config)

    offset = read_state_file(state_file, initial_state)

    logfile = InfobaseLog(config.get('infobase_server'),
                          exclude=exclude_edits_containing)
    logfile.seek(offset)

    solr = Solr()

    while True:
        records = logfile.read_records()
        keys = parse_log(records, load_ia_scans)
        count = await update_keys(keys)

        # Persist the log position whenever it has moved
        if logfile.tell() != offset:
            offset = logfile.tell()
            logger.info("saving offset %s", offset)
            with open(state_file, "w") as f:
                f.write(offset)

        if commit:
            solr.commit(ndocs=count)
        else:
            logger.info("not doing solr commit as commit is off")

        # don't sleep after committing some records.
        # While the commit was on, some more edits might have happened.
        if count == 0:
            logger.debug("No more log records available, sleeping...")
            # time.sleep would block the event loop for the whole pause;
            # asyncio.sleep suspends only this coroutine.
            await asyncio.sleep(5)
Example #17
0
def load_config(path):
    """Load the configuration found at ``path`` and return the runtime config.

    :param path: config file path passed to ``config.load`` and
        ``config.load_config``
    :return: ``config.runtime_config`` after both loads have run
    """
    logger.info("loading config from %s", path)
    # A single pre-formatted argument prints the same text whether ``print``
    # is the Python 2 statement or the Python 3 function; the bare statement
    # form is a syntax error on Python 3.
    print("***load_config %s" % (path,))
    config.load(path)
    config.load_config(path)
    return config.runtime_config
def load_config(path):
    """Load the configuration found at ``path`` and return the runtime config.

    :param path: config file path passed to ``config.load`` and
        ``config.load_config``
    :return: ``config.runtime_config`` after both loads have run
    """
    logger.info("loading config from %s", path)
    # A single pre-formatted argument prints the same text whether ``print``
    # is the Python 2 statement or the Python 3 function; the bare statement
    # form is a syntax error on Python 3.
    print("***load_config %s" % (path,))
    config.load(path)
    config.load_config(path)
    return config.runtime_config
Example #19
0
    func = {
        "cdump": generate_cdump,
        "dump": generate_dump,
        "idump": generate_idump,
        "sort": sort_dump,
        "split": split_dump,
        "index": make_index,
        "sitemaps": generate_sitemaps,
        "htmlindex": generate_html_index,
    }.get(cmd)
    if func:
        func(*args, **kwargs)
    elif cmd == "solrdump":
        from openlibrary.data import solr  # noqa: E402 avoid circular import
        solr.generate_dump(*args, **kwargs)
    else:
        logger.error(f"Unknown command: {cmd}")
        print("Unknown command:", cmd, file=sys.stderr)


if __name__ == "__main__":
    # Config loading and Sentry setup happen only when OL_CONFIG is set in
    # the environment; main() runs either way.
    ol_config = os.environ.get("OL_CONFIG")
    if ol_config:
        logger.info(f"loading config from {ol_config}")
        load_config(ol_config)
        sentry_settings = getattr(config, 'sentry_cron_jobs', {})
        sentry = Sentry(sentry_settings)
        if sentry.enabled:
            sentry.init()

    # First command line argument is the command, the rest are its arguments
    command = sys.argv[1]
    command_args = sys.argv[2:]
    main(command, command_args)