def update_keys(keys):
    keys = (k for k in keys
            if k.count("/") == 2 and k.split("/")[1] in ["books", "authors", "works"])

    count = 0
    for chunk in web.group(keys, 100):
        chunk = list(chunk)
        count += len(chunk)
        update_work.update_keys(chunk, commit=False)

    if count:
        logger.info("updated %d documents", count)

    return count
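
# A minimal, self-contained sketch (hypothetical keys) of the filter applied in
# update_keys() above: only keys of the form /books/..., /authors/... or
# /works/... with exactly two slashes survive; everything else is silently
# dropped before the 100-key chunks are handed to update_work.update_keys().
_sample_keys = [
    '/works/OL45804W',    # kept
    '/authors/OL18319A',  # kept
    '/books/OL7353617M',  # kept
    '/people/example',    # dropped: not a books/authors/works key
    '/works/OL1W/extra',  # dropped: more than two path segments
]
assert [
    k for k in _sample_keys
    if k.count("/") == 2 and k.split("/")[1] in ["books", "authors", "works"]
] == ['/works/OL45804W', '/authors/OL18319A', '/books/OL7353617M']
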

def update_solr(changeset):
    """Updates solr on edit."""
    from openlibrary.solr import update_work

    keys = set()
    docs = changeset['docs'] + changeset['old_docs']
    docs = [doc for doc in docs if doc]  # doc can be None if it is newly created.
    for doc in docs:
        if doc['type']['key'] == '/type/edition':
            keys.update(w['key'] for w in doc.get('works', []))
        elif doc['type']['key'] == '/type/work':
            keys.add(doc['key'])
            keys.update(a['author']['key']
                        for a in doc.get('authors', []) if 'author' in a)
        elif doc['type']['key'] == '/type/author':
            keys.add(doc['key'])

    update_work.update_keys(list(keys))
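
# Hypothetical changeset shape, as a sketch of how update_solr() fans an edit
# out to Solr keys (the key values below are made up): editing an edition
# queues its works, editing a work queues the work and its authors, editing an
# author queues the author itself.
_example_changeset = {
    'docs': [
        {
            'key': '/books/OL1M',
            'type': {'key': '/type/edition'},
            'works': [{'key': '/works/OL1W'}],
        },
    ],
    'old_docs': [None],  # None when the document was newly created
}
# update_solr(_example_changeset) would end up calling
# update_work.update_keys(['/works/OL1W'])
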

def main(job, postgres="postgres.ini", ol="http://ol/",
         ol_config="../../conf/openlibrary.yml",
         start_at=None, offset=0, limit=1, last_modified=None,
         progress=None, log_file=None, log_level=logging.WARN):
    """
    :param str job: job to complete. One of 'works', 'orphans', 'authors'
    :param str postgres: path to postgres config file
    :param str ol: openlibrary endpoint
    :param str ol_config: path to openlibrary config file
    :param str or None start_at: key (type-prefixed) to start from as opposed to
        offset; WAY more efficient since offset has to walk through all `offset` rows.
    :param int offset: Use `start_at` if possible.
    :param int limit:
    :param str or None last_modified: Limit results to those modified >= this date
    :param str or None progress: Where/if to save progress indicator to
    :param str or None log_file: Redirect logs to file instead of stdout
    :param int log_level:
    :return: None
    """
    logging.basicConfig(filename=log_file, level=log_level,
                        format="%(asctime)s [%(levelname)s] %(message)s")

    PLogEntry = namedtuple('PLogEntry', [
        'seen', 'total', 'percent', 'elapsed', 'q_1', 'q_auth', 'q_ia',
        'cached', 'ia_cache', 'next'])

    class PLog:
        def __init__(self, filename):
            """
            :param str or None filename:
            """
            self.filename = filename
            self.last_entry = None

        def log(self, entry):
            """
            :param PLogEntry entry:
            """
            self.last_entry = entry
            if self.filename:
                with open(progress, 'a') as f:
                    f.write('\t'.join(
                        self.fmt(k, val)
                        for k, val in entry._asdict().iteritems()))
                    f.write('\n')

        def update(self, seen=None, total=None, percent=None, elapsed=None,
                   q_1=None, q_auth=None, cached=None, q_ia=None,
                   ia_cache=None, next=None):
            """
            :param str or int or None seen:
            :param str or int or None total:
            :param str or float or None percent:
            :param str or float or None elapsed:
            :param str or float or None q_1:
            :param str or float or None q_auth:
            :param str or int or None cached:
            :param str or float or None q_ia:
            :param str or int or None ia_cache:
            :param str or None next:
            :return: None
            """
            args = locals()
            entry = self.last_entry._replace(**{
                f: args[f] for f in PLogEntry._fields if args[f] is not None})
            self.log(entry)

        def fmt(self, k, val):
            """
            :param str k:
            :param Any val:
            :rtype: str
            """
            if val is None:
                return '?'
            if isinstance(val, str):
                return val
            if k == 'percent':
                return '%.2f%%' % (100 * val)
            if k in ['elapsed', 'q_1', 'q_auth', 'q_ia']:
                return '%.2fs' % val
            if isinstance(val, float):
                return '%.2f' % val
            if k == 'next':
                return val.split('/')[-1]
            return str(val)

    plog = PLog(progress)

    # load the contents of the config?
    with LocalPostgresDataProvider(postgres) as db:
        load_configs(ol, ol_config, db)

        q = build_job_query(job, start_at, offset, last_modified, limit)

        count = None
        if progress:
            with open(progress, 'w', buffering=0) as f:
                f.write('Calculating total... ')
                q_count = """SELECT COUNT(*) FROM(%s) AS foo""" % q
                start = time.time()
                count = db.query_all(q_count)[0][0]
                end = time.time()
                f.write('%d (%.2fs)\n' % (count, end - start))
                f.write('\t'.join(PLogEntry._fields) + '\n')

        plog.log(
            PLogEntry(0, count, '0.00%', 0, '?', '?', '?', '?', '?',
                      start_at or '?'))

        start = time.time()
        seen = 0
        for batch in db.query_batched(q, size=5000, cache_json=True):
            keys = [x[0] for x in batch]
            plog.update(next=keys[0], cached=len(db.cache), ia_cache=0,
                        q_1='?', q_auth='?', q_ia='?')

            with LocalPostgresDataProvider(postgres) as db2:
                key_range = [keys[0], keys[-1]]

                if job == "works":
                    # cache editions
                    editions_time, _ = simple_timeit(
                        lambda: db2.cache_work_editions(*key_range))
                    plog.update(q_1=editions_time,
                                cached=len(db.cache) + len(db2.cache))

                    # cache editions' ocaid metadata
                    ocaids_time, _ = simple_timeit(
                        lambda: db2.cache_cached_editions_ia_metadata())
                    plog.update(q_ia=ocaids_time, ia_cache=len(db2.ia_cache))

                    # cache authors
                    authors_time, _ = simple_timeit(
                        lambda: db2.cache_work_authors(*key_range))
                    plog.update(q_auth=authors_time,
                                cached=len(db.cache) + len(db2.cache))
                elif job == "orphans":
                    # cache editions' ocaid metadata
                    ocaids_time, _ = simple_timeit(
                        lambda: db2.cache_cached_editions_ia_metadata())
                    plog.update(q_ia=ocaids_time, ia_cache=len(db2.ia_cache))

                    # cache authors
                    authors_time, _ = simple_timeit(
                        lambda: db2.cache_work_authors(*key_range))
                    plog.update(q_auth=authors_time,
                                cached=len(db.cache) + len(db2.cache))
                elif job == "authors":
                    # Nothing to cache; update_work.py queries solr directly for
                    # each author, and provides no way to cache.
                    pass

                # Store in main cache
                db.cache.update(db2.cache)
                db.ia_cache.update(db2.ia_cache)

            update_keys(keys, commit=False, commit_way_later=True)

            seen += len(keys)
            plog.update(elapsed=time.time() - start, seen=seen,
                        percent=seen / count, cached=len(db.cache),
                        ia_cache=len(db.ia_cache))

            db.clear_cache()

async def main(
    cmd: Literal['index', 'fetch-end'],
    job: Literal['works', 'orphans', 'authors'],
    postgres="postgres.ini",
    ol="http://ol/",
    ol_config="../../conf/openlibrary.yml",
    solr: str = None,
    skip_solr_id_check=True,
    start_at: str = None,
    offset=0,
    limit=1,
    last_modified: str = None,
    progress: str = None,
    log_file: str = None,
    log_level=logging.INFO,
    dry_run=False,
) -> None:
    """
    :param cmd: Whether to do the index or just fetch end of the chunk
    :param job: Type to index. Orphans gets orphaned editions.
    :param postgres: Path to postgres config file
    :param ol: Open Library endpoint
    :param ol_config: Path to Open Library config file
    :param solr: Overwrite solr base url from ol_config
    :param start_at: key (type-prefixed) to start from as opposed to offset;
        WAY more efficient since offset has to walk through all `offset` rows.
    :param offset: Use `start_at` if possible.
    :param last_modified: Limit results to those modified >= this date
    :param progress: Where/if to save progress indicator to
    :param log_file: Redirect logs to file instead of stdout
    """
    logging.basicConfig(
        filename=log_file,
        level=log_level,
        format="%(asctime)s [%(levelname)s] %(message)s",
    )

    if solr:
        update_work.set_solr_base_url(solr)

    PLogEntry = namedtuple(
        'PLogEntry',
        [
            'seen',
            'total',
            'percent',
            'elapsed',
            'q_1',
            'q_auth',
            'q_ia',
            'cached',
            'ia_cache',
            'next',
        ],
    )

    class PLog:
        def __init__(self, filename):
            """
            :param str or None filename:
            """
            self.filename = filename
            self.last_entry = None

        def log(self, entry):
            """
            :param PLogEntry entry:
            """
            self.last_entry = entry
            if self.filename:
                with open(progress, 'a') as f:
                    f.write('\t'.join(
                        self.fmt(k, val)
                        for k, val in entry._asdict().items()))
                    f.write('\n')

        def update(
            self,
            seen=None,
            total=None,
            percent=None,
            elapsed=None,
            q_1=None,
            q_auth=None,
            cached=None,
            q_ia=None,
            ia_cache=None,
            next=None,
        ):
            """
            :param str or int or None seen:
            :param str or int or None total:
            :param str or float or None percent:
            :param str or float or None elapsed:
            :param str or float or None q_1:
            :param str or float or None q_auth:
            :param str or int or None cached:
            :param str or float or None q_ia:
            :param str or int or None ia_cache:
            :param str or None next:
            :return: None
            """
            args = locals()
            entry = self.last_entry._replace(**{
                f: args[f] for f in PLogEntry._fields if args[f] is not None})
            self.log(entry)

        def fmt(self, k, val):
            """
            :param str k:
            :param Any val:
            :rtype: str
            """
            if val is None:
                return '?'
            if isinstance(val, str):
                return val
            if k == 'percent':
                return '%.2f%%' % (100 * val)
            if k in ['elapsed', 'q_1', 'q_auth', 'q_ia']:
                return '%.2fs' % val
            if isinstance(val, float):
                return '%.2f' % val
            if k == 'next':
                return val.split('/')[-1]
            return str(val)

    plog = PLog(progress)

    # load the contents of the config?
    with LocalPostgresDataProvider(postgres) as db:
        # Check to see where we should be starting from
        if cmd == 'fetch-end':
            next_start_query = build_job_query(job, start_at, limit, last_modified, 1)
            next_start_results = db.query_all(next_start_query)
            if next_start_results:
                print(next_start_results[0][0])
            return

        logger.info(
            json.dumps({
                'scope': 'solr_builder::main',
                'event': 'Indexing started',
                'start_at': start_at,
            }))

        load_configs(ol, ol_config, db)

        q = build_job_query(job, start_at, offset, last_modified, limit)

        if progress:
            # Clear the file
            with open(progress, 'w') as f:
                f.write('')
            with open(progress, 'a') as f:
                f.write('Calculating total... ')

        start = time.time()
        q_count = """SELECT COUNT(*) FROM(%s) AS foo""" % q
        count = db.query_all(q_count)[0][0]
        end = time.time()

        if progress:
            with open(progress, 'a') as f:
                f.write('%d (%.2fs)\n' % (count, end - start))
                f.write('\t'.join(PLogEntry._fields) + '\n')

        plog.log(
            PLogEntry(0, count, '0.00%', 0, '?', '?', '?', '?', '?',
                      start_at or '?'))
        plog.update(q_1=0, q_auth=0, q_ia=0)

        start = time.time()
        seen = 0
        for batch in db.query_batched(q, size=1000, cache_json=True):
            keys = [x[0] for x in batch]
            plog.update(next=keys[0], cached=len(db.cache), ia_cache=0)

            with LocalPostgresDataProvider(postgres) as db2:
                key_range = [keys[0], keys[-1]]

                if job == "works":
                    # cache editions
                    editions_time, _ = simple_timeit(
                        lambda: db2.cache_work_editions(*key_range))
                    plog.update(
                        q_1=plog.last_entry.q_1 + editions_time,
                        cached=len(db.cache) + len(db2.cache),
                    )

                    # cache editions' ocaid metadata
                    ocaids_time, _ = await simple_timeit_async(
                        db2.cache_cached_editions_ia_metadata())
                    plog.update(
                        q_ia=plog.last_entry.q_ia + ocaids_time,
                        ia_cache=len(db2.ia_cache),
                    )

                    # cache authors
                    authors_time, _ = simple_timeit(
                        lambda: db2.cache_work_authors(*key_range))
                    plog.update(
                        q_auth=plog.last_entry.q_auth + authors_time,
                        cached=len(db.cache) + len(db2.cache),
                    )
                elif job == "orphans":
                    # cache editions' ocaid metadata
                    ocaids_time, _ = await simple_timeit_async(
                        db2.cache_cached_editions_ia_metadata())
                    plog.update(
                        q_ia=plog.last_entry.q_ia + ocaids_time,
                        ia_cache=len(db2.ia_cache),
                    )

                    # cache authors
                    authors_time, _ = simple_timeit(
                        lambda: db2.cache_edition_authors(*key_range))
                    plog.update(
                        q_auth=plog.last_entry.q_auth + authors_time,
                        cached=len(db.cache) + len(db2.cache),
                    )
                elif job == "authors":
                    # Nothing to cache; update_work.py queries solr directly for
                    # each author, and provides no way to cache.
                    pass

                # Store in main cache
                db.cache.update(db2.cache)
                db.ia_cache.update(db2.ia_cache)
                db.cached_work_editions_ranges += db2.cached_work_editions_ranges

            update_keys(
                keys,
                commit=False,
                commit_way_later=True,
                skip_id_check=skip_solr_id_check,
                update='quiet' if dry_run else 'update',
            )

            seen += len(keys)
            plog.update(
                elapsed=time.time() - start,
                seen=seen,
                percent=seen / count,
                cached=len(db.cache),
                ia_cache=len(db.ia_cache),
            )

            db.clear_cache()
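
# A minimal sketch of driving the async builder; argument values are
# illustrative (the solr URL in particular is an assumption) and a reachable
# postgres dump, Open Library config, and Solr are required, so the call is
# left commented out. cmd='fetch-end' only prints the key at which the next
# chunk would start (limit rows past start_at); cmd='index' does the indexing.
#
# import asyncio
# asyncio.run(main(
#     'index',
#     'works',
#     postgres='postgres.ini',
#     ol_config='../../conf/openlibrary.yml',
#     solr='http://solr:8983/solr/openlibrary',
#     start_at='/works/OL1W',
#     limit=1000,
#     progress='progress.tsv',
#     dry_run=True,
# ))
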