Example #1
def update_keys(keys):
    # Only document keys of the form /books/..., /authors/..., or /works/... get indexed.
    keys = (
        k for k in keys
        if k.count("/") == 2 and k.split("/")[1] in ["books", "authors", "works"]
    )

    count = 0
    for chunk in web.group(keys, 100):
        chunk = list(chunk)
        count += len(chunk)
        update_work.update_keys(chunk, commit=False)

    if count:
        logger.info("updated %d documents", count)

    return count
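A rough usage sketch for Example #1; the keys are hypothetical, and it assumes the surrounding module's `web` and `update_work` imports. Only keys of the /books, /authors, or /works types survive the filter:
n = update_keys([
    "/works/OL45883W",
    "/authors/OL18319A",
    "/books/OL7353617M",
    "/people/george/lists/OL1L",  # skipped: not a /books, /authors, or /works key
])
# n == 3; the three surviving keys are passed to update_work.update_keys in one chunk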
Example #2
def update_solr(changeset):
    """Updates solr on edit.
    """
    from openlibrary.solr import update_work
    
    keys = set()
    docs = changeset['docs'] + changeset['old_docs']
    docs = [doc for doc in docs if doc]  # old_docs has None entries for newly created documents
    for doc in docs:
        if doc['type']['key'] == '/type/edition':
            keys.update(w['key'] for w in doc.get('works', []))
        elif doc['type']['key'] == '/type/work':
            keys.add(doc['key'])
            keys.update(a['author']['key'] for a in doc.get('authors', []) if 'author' in a)
        elif doc['type']['key'] == '/type/author':
            keys.add(doc['key'])
            
    update_work.update_keys(list(keys))
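For orientation, Example #2 expects an Infogami-style changeset with `docs` and `old_docs` lists. A minimal, hypothetical changeset that exercises the edition branch (real changesets carry more fields):
changeset = {
    'docs': [{
        'key': '/books/OL1M',                 # hypothetical edition key
        'type': {'key': '/type/edition'},
        'works': [{'key': '/works/OL1W'}],
    }],
    'old_docs': [None],                       # None: the edition was newly created
}
update_solr(changeset)  # queues /works/OL1W for reindexing via update_work.update_keys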
Example #3
def main(job,
         postgres="postgres.ini",
         ol="http://ol/",
         ol_config="../../conf/openlibrary.yml",
         start_at=None,
         offset=0,
         limit=1,
         last_modified=None,
         progress=None,
         log_file=None,
         log_level=logging.WARN):
    """
    :param str job: job to complete. One of 'works', 'orphans', 'authors'
    :param str postgres: path to postgres config file
    :param str ol: openlibrary endpoint
    :param str ol_config: path to openlibrary config file
    :param str or None start_at: key (type-prefixed) to start from as opposed to
        offset; WAY more efficient since offset has to walk through all `offset` rows.
    :param int offset: Use `start_at` if possible.
    :param int limit: Maximum number of rows to index
    :param str or None last_modified: Limit results to those modified >= this date
    :param str or None progress: Where/if to save progress indicator to
    :param str or None log_file: Redirect logs to file instead of stdout
    :param int log_level:
    :return: None
    """

    logging.basicConfig(filename=log_file,
                        level=log_level,
                        format="%(asctime)s [%(levelname)s] %(message)s")

    PLogEntry = namedtuple('PLogEntry', [
        'seen', 'total', 'percent', 'elapsed', 'q_1', 'q_auth', 'q_ia',
        'cached', 'ia_cache', 'next'
    ])

    class PLog:
        def __init__(self, filename):
            """
            :param str or None filename:
            """
            self.filename = filename
            self.last_entry = None

        def log(self, entry):
            """
            :param PLogEntry entry:
            """
            self.last_entry = entry
            if self.filename:
                with open(self.filename, 'a') as f:
                    f.write('\t'.join(
                        self.fmt(k, val)
                        for k, val in entry._asdict().items()))
                    f.write('\n')

        def update(self,
                   seen=None,
                   total=None,
                   percent=None,
                   elapsed=None,
                   q_1=None,
                   q_auth=None,
                   cached=None,
                   q_ia=None,
                   ia_cache=None,
                   next=None):
            """
            :param str or int or None seen:
            :param str or int or None total:
            :param str or float or None percent:
            :param str or float or None elapsed:
            :param str or float or None q_1:
            :param str or float or None q_auth:
            :param str or int or None cached:
            :param str or float or None q_ia:
            :param str or int or None ia_cache:
            :param str or None next:
            :return: None
            """
            args = locals()
            entry = self.last_entry._replace(
                **{f: args[f] for f in PLogEntry._fields if args[f] is not None})
            self.log(entry)

        def fmt(self, k, val):
            """
            :param str k:
            :param Any val:
            :rtype: str
            """
            if val is None:
                return '?'
            if isinstance(val, str):
                return val
            if k == 'percent':
                return '%.2f%%' % (100 * val)
            if k in ['elapsed', 'q_1', 'q_auth', 'q_ia']:
                return '%.2fs' % val
            if isinstance(val, float):
                return '%.2f' % val
            if k == 'next':
                return val.split('/')[-1]
            return str(val)

    plog = PLog(progress)

    # load the contents of the config?
    with LocalPostgresDataProvider(postgres) as db:
        load_configs(ol, ol_config, db)

        q = build_job_query(job, start_at, offset, last_modified, limit)

        count = None
        if progress:
            with open(progress, 'w', buffering=1) as f:  # line-buffered so progress is flushed promptly
                f.write('Calculating total... ')
                q_count = """SELECT COUNT(*) FROM(%s) AS foo""" % q
                start = time.time()
                count = db.query_all(q_count)[0][0]
                end = time.time()
                f.write('%d (%.2fs)\n' % (count, end - start))
                f.write('\t'.join(PLogEntry._fields) + '\n')

        plog.log(
            PLogEntry(0, count, '0.00%', 0, '?', '?', '?', '?', '?',
                      start_at or '?'))

        start = time.time()
        seen = 0
        for batch in db.query_batched(q, size=5000, cache_json=True):
            keys = [x[0] for x in batch]
            plog.update(next=keys[0],
                        cached=len(db.cache),
                        ia_cache=0,
                        q_1='?',
                        q_auth='?',
                        q_ia='?')

            with LocalPostgresDataProvider(postgres) as db2:
                key_range = [keys[0], keys[-1]]

                if job == "works":
                    # cache editions
                    editions_time, _ = simple_timeit(
                        lambda: db2.cache_work_editions(*key_range))
                    plog.update(q_1=editions_time,
                                cached=len(db.cache) + len(db2.cache))

                    # cache editions' ocaid metadata
                    ocaids_time, _ = simple_timeit(
                        lambda: db2.cache_cached_editions_ia_metadata())
                    plog.update(q_ia=ocaids_time, ia_cache=len(db2.ia_cache))

                    # cache authors
                    authors_time, _ = simple_timeit(
                        lambda: db2.cache_work_authors(*key_range))
                    plog.update(q_auth=authors_time,
                                cached=len(db.cache) + len(db2.cache))
                elif job == "orphans":
                    # cache editions' ocaid metadata
                    ocaids_time, _ = simple_timeit(
                        lambda: db2.cache_cached_editions_ia_metadata())
                    plog.update(q_ia=ocaids_time, ia_cache=len(db2.ia_cache))

                    # cache authors
                    authors_time, _ = simple_timeit(
                        lambda: db2.cache_work_authors(*key_range))
                    plog.update(q_auth=authors_time,
                                cached=len(db.cache) + len(db2.cache))
                elif job == "authors":
                    # Nothing to cache; update_work.py queries solr directly for each
                    # author, and provides no way to cache.
                    pass

                # Store in main cache
                db.cache.update(db2.cache)
                db.ia_cache.update(db2.ia_cache)

            update_keys(keys, commit=False, commit_way_later=True)

            seen += len(keys)
            plog.update(elapsed=time.time() - start,
                        seen=seen,
                        percent=seen / count if count else None,  # count is only computed when a progress file is given
                        cached=len(db.cache),
                        ia_cache=len(db.ia_cache))

            db.clear_cache()
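`simple_timeit` is not defined in these excerpts; from how Example #3 unpacks its result, it appears to return an `(elapsed_seconds, result)` pair for a zero-argument callable. A sketch under that assumption:
import time

def simple_timeit(fn):
    # Assumed helper: time a zero-argument callable and return (elapsed, result),
    # matching how the examples unpack `editions_time, _ = simple_timeit(...)`.
    start = time.time()
    result = fn()
    return time.time() - start, result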
Example #4
async def main(
    cmd: Literal['index', 'fetch-end'],
    job: Literal['works', 'orphans', 'authors'],
    postgres="postgres.ini",
    ol="http://ol/",
    ol_config="../../conf/openlibrary.yml",
    solr: str = None,
    skip_solr_id_check=True,
    start_at: str = None,
    offset=0,
    limit=1,
    last_modified: str = None,
    progress: str = None,
    log_file: str = None,
    log_level=logging.INFO,
    dry_run=False,
) -> None:
    """
    :param cmd: Whether to do the index or just fetch end of the chunk
    :param job: Type to index. Orphans gets orphaned editions.
    :param postgres: Path to postgres config file
    :param ol: Open Library endpoint
    :param ol_config: Path to Open Library config file
    :param solr: Overwrite solr base url from ol_config
    :param start_at: key (type-prefixed) to start from as opposed to offset; WAY more
        efficient since offset has to walk through all `offset` rows.
    :param offset: Use `start_at` if possible.
    :param last_modified: Limit results to those modified >= this date
    :param progress: Where/if to save progress indicator to
    :param log_file: Redirect logs to file instead of stdout
    """

    logging.basicConfig(
        filename=log_file,
        level=log_level,
        format="%(asctime)s [%(levelname)s] %(message)s",
    )

    if solr:
        update_work.set_solr_base_url(solr)

    PLogEntry = namedtuple(
        'PLogEntry',
        [
            'seen',
            'total',
            'percent',
            'elapsed',
            'q_1',
            'q_auth',
            'q_ia',
            'cached',
            'ia_cache',
            'next',
        ],
    )

    class PLog:
        def __init__(self, filename):
            """
            :param str or None filename:
            """
            self.filename = filename
            self.last_entry = None

        def log(self, entry):
            """
            :param PLogEntry entry:
            """
            self.last_entry = entry
            if self.filename:
                with open(self.filename, 'a') as f:
                    f.write('\t'.join(
                        self.fmt(k, val)
                        for k, val in entry._asdict().items()))
                    f.write('\n')

        def update(
            self,
            seen=None,
            total=None,
            percent=None,
            elapsed=None,
            q_1=None,
            q_auth=None,
            cached=None,
            q_ia=None,
            ia_cache=None,
            next=None,
        ):
            """
            :param str or int or None seen:
            :param str or int or None total:
            :param str or float or None percent:
            :param str or float or None elapsed:
            :param str or float or None q_1:
            :param str or float or None q_auth:
            :param str or int or None cached:
            :param str or float or None q_ia:
            :param str or int or None ia_cache:
            :param str or None next:
            :return: None
            """
            args = locals()
            entry = self.last_entry._replace(
                **{f: args[f] for f in PLogEntry._fields if args[f] is not None})
            self.log(entry)

        def fmt(self, k, val):
            """
            :param str k:
            :param Any val:
            :rtype: str
            """
            if val is None:
                return '?'
            if isinstance(val, str):
                return val
            if k == 'percent':
                return '%.2f%%' % (100 * val)
            if k in ['elapsed', 'q_1', 'q_auth', 'q_ia']:
                return '%.2fs' % val
            if isinstance(val, float):
                return '%.2f' % val
            if k == 'next':
                return val.split('/')[-1]
            return str(val)

    plog = PLog(progress)

    # load the contents of the config?
    with LocalPostgresDataProvider(postgres) as db:
        # Check to see where we should be starting from
        if cmd == 'fetch-end':
            next_start_query = build_job_query(job, start_at, limit,
                                               last_modified, 1)
            next_start_results = db.query_all(next_start_query)
            if next_start_results:
                print(next_start_results[0][0])
            return

        logger.info(
            json.dumps({
                'scope': 'solr_builder::main',
                'event': 'Indexing started',
                'start_at': start_at,
            }))
        load_configs(ol, ol_config, db)
        q = build_job_query(job, start_at, offset, last_modified, limit)

        if progress:
            # Clear the file
            with open(progress, 'w') as f:
                f.write('')
            with open(progress, 'a') as f:
                f.write('Calculating total... ')

        start = time.time()
        q_count = """SELECT COUNT(*) FROM(%s) AS foo""" % q
        count = db.query_all(q_count)[0][0]
        end = time.time()

        if progress:
            with open(progress, 'a') as f:
                f.write('%d (%.2fs)\n' % (count, end - start))
                f.write('\t'.join(PLogEntry._fields) + '\n')

        plog.log(
            PLogEntry(0, count, '0.00%', 0, '?', '?', '?', '?', '?',
                      start_at or '?'))
        plog.update(q_1=0, q_auth=0, q_ia=0)

        start = time.time()
        seen = 0
        for batch in db.query_batched(q, size=1000, cache_json=True):
            keys = [x[0] for x in batch]
            plog.update(next=keys[0], cached=len(db.cache), ia_cache=0)

            with LocalPostgresDataProvider(postgres) as db2:
                key_range = [keys[0], keys[-1]]

                if job == "works":
                    # cache editions
                    editions_time, _ = simple_timeit(
                        lambda: db2.cache_work_editions(*key_range))
                    plog.update(
                        q_1=plog.last_entry.q_1 + editions_time,
                        cached=len(db.cache) + len(db2.cache),
                    )

                    # cache editions' ocaid metadata
                    ocaids_time, _ = await simple_timeit_async(
                        db2.cache_cached_editions_ia_metadata())
                    plog.update(
                        q_ia=plog.last_entry.q_ia + ocaids_time,
                        ia_cache=len(db2.ia_cache),
                    )

                    # cache authors
                    authors_time, _ = simple_timeit(
                        lambda: db2.cache_work_authors(*key_range))
                    plog.update(
                        q_auth=plog.last_entry.q_auth + authors_time,
                        cached=len(db.cache) + len(db2.cache),
                    )
                elif job == "orphans":
                    # cache editions' ocaid metadata
                    ocaids_time, _ = await simple_timeit_async(
                        db2.cache_cached_editions_ia_metadata())
                    plog.update(
                        q_ia=plog.last_entry.q_ia + ocaids_time,
                        ia_cache=len(db2.ia_cache),
                    )

                    # cache authors
                    authors_time, _ = simple_timeit(
                        lambda: db2.cache_edition_authors(*key_range))
                    plog.update(
                        q_auth=plog.last_entry.q_auth + authors_time,
                        cached=len(db.cache) + len(db2.cache),
                    )
                elif job == "authors":
                    # Nothing to cache; update_work.py queries solr directly for each
                    # author, and provides no way to cache.
                    pass

                # Store in main cache
                db.cache.update(db2.cache)
                db.ia_cache.update(db2.ia_cache)
                db.cached_work_editions_ranges += db2.cached_work_editions_ranges

            update_keys(
                keys,
                commit=False,
                commit_way_later=True,
                skip_id_check=skip_solr_id_check,
                update='quiet' if dry_run else 'update',
            )

            seen += len(keys)
            plog.update(
                elapsed=time.time() - start,
                seen=seen,
                percent=seen / count,
                cached=len(db.cache),
                ia_cache=len(db.ia_cache),
            )

            db.clear_cache()
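Example #4's `main` is a coroutine, so a caller would drive it with an event loop. A hypothetical invocation; the paths, solr URL, start key, and limit below are placeholders:
import asyncio

asyncio.run(main(
    cmd='index',
    job='works',
    postgres='postgres.ini',
    ol_config='../../conf/openlibrary.yml',
    solr='http://solr:8983/solr/openlibrary',  # placeholder solr base url
    start_at='/works/OL1W',                    # placeholder start key
    limit=1000,
    progress='progress.txt',
))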