async def main(
    ol_config: str,
    debugger=False,
    state_file='solr-update.state',
    exclude_edits_containing: str = None,
    ol_url='http://openlibrary.org/',
    solr_url: str = None,
    solr_next=False,
    socket_timeout=10,
    load_ia_scans=False,
    commit=True,
    initial_state: str = None,
):
    """
    Tail the infobase edit log forever and push the touched documents to Solr.

    :param ol_config: Path to the Open Library config file to load
    :param debugger: Wait for a debugger to attach before beginning
    :param state_file: File where the current log offset is persisted between runs
    :param exclude_edits_containing: Don't index matching edits
    :param ol_url: Open Library endpoint; overrides the query host on dev instances
    :param solr_url: If wanting to override what's in the config file
    :param solr_next: Whether to assume new schema/etc are used
    :param socket_timeout: Default socket timeout (seconds) for all outgoing requests
    :param load_ia_scans: Whether to also index IA scan records from the log
    :param commit: Whether to issue a Solr commit after each batch
    :param initial_state: State to use if state file doesn't exist. Defaults to today.
    """
    # Local import: the file's import header is outside this block's view.
    import asyncio

    FORMAT = "%(asctime)-15s %(levelname)s %(message)s"
    logging.basicConfig(level=logging.INFO, format=FORMAT)
    logger.info("BEGIN new-solr-updater")

    if debugger:
        import debugpy

        logger.info("Enabling debugger attachment (attach if it hangs here)")
        debugpy.listen(address=('0.0.0.0', 3000))
        logger.info("Waiting for debugger to attach...")
        debugpy.wait_for_client()
        logger.info("Debugger attached to port 3000")

    # Sometimes archive.org requests blocks forever.
    # Setting a timeout will make the request fail instead of waiting forever.
    socket.setdefaulttimeout(socket_timeout)

    # set OL URL when running on a dev-instance
    if ol_url:
        host = web.lstrips(ol_url, "http://").strip("/")
        update_work.set_query_host(host)

    if solr_url:
        update_work.set_solr_base_url(solr_url)

    update_work.set_solr_next(solr_next)

    logger.info("loading config from %s", ol_config)
    load_config(ol_config)

    offset = read_state_file(state_file, initial_state)
    logfile = InfobaseLog(
        config.get('infobase_server'), exclude=exclude_edits_containing
    )
    logfile.seek(offset)

    solr = Solr()
    while True:
        records = logfile.read_records()
        keys = parse_log(records, load_ia_scans)
        count = await update_keys(keys)

        # Persist the new offset only when we actually advanced, so a crash
        # replays at most one batch rather than losing progress.
        if logfile.tell() != offset:
            offset = logfile.tell()
            logger.info("saving offset %s", offset)
            with open(state_file, "w") as f:
                f.write(offset)

        if commit:
            solr.commit(ndocs=count)
        else:
            logger.info("not doing solr commit as commit is off")

        # don't sleep after committing some records.
        # While the commit was on, some more edits might have happened.
        if count == 0:
            logger.debug("No more log records available, sleeping...")
            # BUGFIX: time.sleep() would block the event loop inside this
            # coroutine; asyncio.sleep() yields control instead.
            await asyncio.sleep(5)
async def main(
    cmd: Literal['index', 'fetch-end'],
    job: Literal['works', 'orphans', 'authors'],
    postgres="postgres.ini",
    ol="http://ol/",
    ol_config="../../conf/openlibrary.yml",
    solr: str = None,
    skip_solr_id_check=True,
    start_at: str = None,
    offset=0,
    limit=1,
    last_modified: str = None,
    progress: str = None,
    log_file: str = None,
    log_level=logging.INFO,
    dry_run=False,
) -> None:
    """
    :param cmd: Whether to do the index or just fetch end of the chunk
    :param job: Type to index. Orphans gets orphaned editions.
    :param postgres: Path to postgres config file
    :param ol: Open Library endpoint
    :param ol_config: Path to Open Library config file
    :param solr: Overwrite solr base url from ol_config
    :param skip_solr_id_check: Skip checking Solr for already-indexed ids
    :param start_at: key (type-prefixed) to start from as opposed to offset; WAY more
        efficient since offset has to walk through all `offset` rows.
    :param offset: Use `start_at` if possible.
    :param limit: Maximum number of rows to index
    :param last_modified: Limit results to those modifier >= this date
    :param progress: Where/if to save progress indicator to
    :param log_file: Redirect logs to file instead of stdout
    :param log_level: Logging verbosity
    :param dry_run: Run the index, but don't submit the updates to Solr
    """
    logging.basicConfig(
        filename=log_file,
        level=log_level,
        format="%(asctime)s [%(levelname)s] %(message)s",
    )

    if solr:
        update_work.set_solr_base_url(solr)

    # One row of the tab-separated progress file written by PLog below.
    PLogEntry = namedtuple(
        'PLogEntry',
        [
            'seen',
            'total',
            'percent',
            'elapsed',
            'q_1',
            'q_auth',
            'q_ia',
            'cached',
            'ia_cache',
            'next',
        ],
    )

    class PLog:
        """Incremental progress logger; appends formatted rows to a file."""

        def __init__(self, filename):
            """
            :param str or None filename: Progress file path; None disables writing.
            """
            self.filename = filename
            self.last_entry = None

        def log(self, entry):
            """
            Record *entry* as the latest state and append it to the progress file.

            :param PLogEntry entry:
            """
            self.last_entry = entry
            if self.filename:
                # BUGFIX: write to self.filename (the path this instance was
                # constructed with) instead of the closed-over `progress`.
                with open(self.filename, 'a') as f:
                    f.write(
                        '\t'.join(
                            self.fmt(k, val) for k, val in entry._asdict().items()
                        )
                    )
                    f.write('\n')

        def update(
            self,
            seen=None,
            total=None,
            percent=None,
            elapsed=None,
            q_1=None,
            q_auth=None,
            cached=None,
            q_ia=None,
            ia_cache=None,
            next=None,
        ):
            """
            Log a new entry derived from the last one, overriding only the
            fields passed as non-None. (`next` intentionally mirrors the
            PLogEntry field name, shadowing the builtin.)

            :param str or int or None seen:
            :param str or int or None total:
            :param str or float or None percent:
            :param str or float or None elapsed:
            :param str or float or None q_1:
            :param str or float or None q_auth:
            :param str or int or None cached:
            :param str or float or None q_ia:
            :param str or int or None ia_cache:
            :param str or None next:
            :return: None
            """
            args = locals()
            entry = self.last_entry._replace(
                **{f: args[f] for f in PLogEntry._fields if args[f] is not None}
            )
            self.log(entry)

        def fmt(self, k, val):
            """
            Render a single field for the progress file.

            :param str k:
            :param Any val:
            :rtype: str
            """
            if val is None:
                return '?'
            if isinstance(val, str):
                return val
            if k == 'percent':
                return '%.2f%%' % (100 * val)
            if k in ['elapsed', 'q_1', 'q_auth', 'q_ia']:
                return '%.2fs' % val
            if isinstance(val, float):
                return '%.2f' % val
            if k == 'next':
                return val.split('/')[-1]
            return str(val)

    plog = PLog(progress)

    # load the contents of the config?
    with LocalPostgresDataProvider(postgres) as db:
        # Check to see where we should be starting from
        if cmd == 'fetch-end':
            # Note the argument order: skip `limit` rows and fetch 1, so this
            # returns the key just past the end of this chunk.
            next_start_query = build_job_query(job, start_at, limit, last_modified, 1)
            next_start_results = db.query_all(next_start_query)
            if next_start_results:
                print(next_start_results[0][0])
            return

        logger.info(
            json.dumps(
                {
                    'scope': 'solr_builder::main',
                    'event': 'Indexing started',
                    'start_at': start_at,
                }
            )
        )
        load_configs(ol, ol_config, db)

        q = build_job_query(job, start_at, offset, last_modified, limit)

        if progress:
            # Clear the file
            with open(progress, 'w') as f:
                f.write('')
            with open(progress, 'a') as f:
                f.write('Calculating total... ')

        start = time.time()
        q_count = """SELECT COUNT(*) FROM(%s) AS foo""" % q
        count = db.query_all(q_count)[0][0]
        end = time.time()

        if progress:
            with open(progress, 'a') as f:
                f.write('%d (%.2fs)\n' % (count, end - start))
                f.write('\t'.join(PLogEntry._fields) + '\n')

        plog.log(
            PLogEntry(0, count, '0.00%', 0, '?', '?', '?', '?', '?', start_at or '?')
        )
        plog.update(q_1=0, q_auth=0, q_ia=0)

        start = time.time()
        seen = 0
        for batch in db.query_batched(q, size=1000, cache_json=True):
            keys = [x[0] for x in batch]
            plog.update(next=keys[0], cached=len(db.cache), ia_cache=0)

            # Use a fresh provider per batch so its caches stay bounded.
            with LocalPostgresDataProvider(postgres) as db2:
                key_range = [keys[0], keys[-1]]

                if job == "works":
                    # cache editions
                    editions_time, _ = simple_timeit(
                        lambda: db2.cache_work_editions(*key_range)
                    )
                    plog.update(
                        q_1=plog.last_entry.q_1 + editions_time,
                        cached=len(db.cache) + len(db2.cache),
                    )

                    # cache editions' ocaid metadata
                    ocaids_time, _ = await simple_timeit_async(
                        db2.cache_cached_editions_ia_metadata()
                    )
                    plog.update(
                        q_ia=plog.last_entry.q_ia + ocaids_time,
                        ia_cache=len(db2.ia_cache),
                    )

                    # cache authors
                    authors_time, _ = simple_timeit(
                        lambda: db2.cache_work_authors(*key_range)
                    )
                    plog.update(
                        q_auth=plog.last_entry.q_auth + authors_time,
                        cached=len(db.cache) + len(db2.cache),
                    )
                elif job == "orphans":
                    # cache editions' ocaid metadata
                    ocaids_time, _ = await simple_timeit_async(
                        db2.cache_cached_editions_ia_metadata()
                    )
                    plog.update(
                        q_ia=plog.last_entry.q_ia + ocaids_time,
                        ia_cache=len(db2.ia_cache),
                    )

                    # cache authors
                    authors_time, _ = simple_timeit(
                        lambda: db2.cache_edition_authors(*key_range)
                    )
                    plog.update(
                        q_auth=plog.last_entry.q_auth + authors_time,
                        cached=len(db.cache) + len(db2.cache),
                    )
                elif job == "authors":
                    # Nothing to cache; update_work.py queries solr directly for each
                    # other, and provides no way to cache.
                    pass

                # Store in main cache
                db.cache.update(db2.cache)
                db.ia_cache.update(db2.ia_cache)
                db.cached_work_editions_ranges += db2.cached_work_editions_ranges

            # BUGFIX: update_keys is a coroutine (awaited elsewhere in this
            # file); without `await` it would never actually run.
            await update_keys(
                keys,
                commit=False,
                commit_way_later=True,
                skip_id_check=skip_solr_id_check,
                update='quiet' if dry_run else 'update',
            )

            seen += len(keys)
            plog.update(
                elapsed=time.time() - start,
                seen=seen,
                percent=seen / count,
                cached=len(db.cache),
                ia_cache=len(db.ia_cache),
            )

            db.clear_cache()