Ejemplo n.º 1
0
def _task_submit_elaborate_specific_parameter(key, value, opts, args):
    """Elaborate one command-line parameter for the bibtask submission.

    Given the string ``key``, check its meaning, possibly using ``value``,
    and store the parsed result in the bibtask options.  Return True if
    the key was recognised and handled, False otherwise, e.g.::

        if key in ('-n', '--number'):
            bibtask.task_set_option('number', value)
            return True
        return False
    """
    # Recid option
    if key in ("-i", "--recid"):
        try:
            value = int(value)
        except ValueError:
            bibtask.write_message("The value specified for --recid must be a "
                                  "valid integer, not '%s'." % value,
                                  stream=sys.stderr,
                                  verbose=0)
            # BUGFIX: previously execution fell through here with the
            # unconverted string value; reject the parameter instead.
            return False
        if not _recid_exists(value):
            bibtask.write_message("ERROR: '%s' is not a valid record ID." %
                                  value,
                                  stream=sys.stderr,
                                  verbose=0)
            return False
        # Accumulate record IDs across repeated -i occurrences.
        recids = bibtask.task_get_option('recids')
        if recids is None:
            recids = []
        recids.append(value)
        bibtask.task_set_option('recids', recids)

    # Collection option
    elif key in ("-c", "--collection"):
        if not _collection_exists(value):
            bibtask.write_message("ERROR: '%s' is not a valid collection." %
                                  value,
                                  stream=sys.stderr,
                                  verbose=0)
            return False
        # Accumulate collections across repeated -c occurrences.
        collections = bibtask.task_get_option("collections")
        collections = collections or []
        collections.append(value)
        bibtask.task_set_option("collections", collections)

    # Taxonomy option
    elif key in ("-k", "--taxonomy"):
        if not _ontology_exists(value):
            bibtask.write_message("ERROR: '%s' is not a valid taxonomy name." %
                                  value,
                                  stream=sys.stderr,
                                  verbose=0)
            return False
        bibtask.task_set_option("taxonomy", value)
    elif key in ("-f", "--force"):
        bibtask.task_set_option("force", True)
    else:
        # Unknown key: let the generic bibtask machinery handle it.
        return False

    return True
def _task_submit_check_options():
    """
    Required by bibtask. Checks the options.

    Exactly one of the main mode options (-a, -n, -U, -G, -R,
    --clean-cache, --update-cache) must be selected; -d/-m/-p only make
    sense together with -G or -R.  Returns True when the combination of
    options is acceptable, False otherwise.
    """
    lastname = bibtask.task_get_option('lastname')
    process_all = bibtask.task_get_option('process_all')
    prepare_grid = bibtask.task_get_option('prepare_grid')
    load_grid = bibtask.task_get_option('load_grid_results')
    data_dir = bibtask.task_get_option('data_dir')
    prefix = bibtask.task_get_option('prefix')
    max_records = bibtask.task_get_option('max_records')
    update = bibtask.task_get_option('update')
    clean_cache = bibtask.task_get_option('clean_cache')
    update_cache = bibtask.task_get_option('update_cache')

    # The option layer can hand back the literal string "None".
    if (lastname == "None," or lastname == "None"):
        lastname = False

    # BUGFIX: the previous XOR chain (bool(a) ^ bool(b) ^ ...) is True
    # whenever an *odd* number of modes is set, so e.g. three modes at
    # once slipped through the mutual-exclusion check.  Count explicitly.
    active_modes = sum(map(bool, (lastname, process_all, update,
                                  prepare_grid, load_grid, clean_cache,
                                  update_cache)))

    if active_modes == 0:
        bibtask.write_message(
            "ERROR: One of the options -a, -n, -U, -G, -R, "
            "--clean-cache, --update-cache is"
            " required!",
            stream=sys.stdout,
            verbose=0)
        return False
    elif active_modes > 1:
        bibtask.write_message(
            "ERROR: Options -a -n -U -R -G --clean-cache "
            "--update-cache are mutually"
            " exclusive!",
            stream=sys.stdout,
            verbose=0)
        return False
    elif ((not prepare_grid and (data_dir or prefix or max_records))
          and (not load_grid and (data_dir))):
        # NOTE(review): this only fires when data_dir is set without -G/-R;
        # -m or -p alone still pass -- confirm that is intended.
        bibtask.write_message(
            "ERROR: The options -d, -m and -p require -G or "
            "-R to run!",
            stream=sys.stdout,
            verbose=0)
        return False
    elif load_grid and not bool(data_dir):
        bibtask.write_message(
            "ERROR: The option -R requires the option -d "
            "to run!",
            stream=sys.stdout,
            verbose=0)
        return False

    return True
Ejemplo n.º 3
0
def _task_submit_elaborate_specific_parameter(key, value, opts, args):
    """Elaborate one command-line parameter for the bibtask submission.

    Given the string ``key``, check its meaning, possibly using ``value``,
    and store the parsed result in the bibtask options.  Return True if
    the key was recognised and handled, False otherwise, e.g.::

        if key in ('-n', '--number'):
            bibtask.task_set_option('number', value)
            return True
        return False
    """
    # Recid option
    if key in ("-i", "--recid"):
        try:
            value = int(value)
        except ValueError:
            bibtask.write_message("The value specified for --recid must be a "
                "valid integer, not '%s'." % value, stream=sys.stderr,
                verbose=0)
            # BUGFIX: previously execution fell through here with the
            # unconverted string value; reject the parameter instead.
            return False
        if not _recid_exists(value):
            bibtask.write_message("ERROR: '%s' is not a valid record ID." % value,
                stream=sys.stderr, verbose=0)
            return False
        # Accumulate record IDs across repeated -i occurrences.
        recids = bibtask.task_get_option('recids')
        if recids is None:
            recids = []
        recids.append(value)
        bibtask.task_set_option('recids', recids)

    # Collection option
    elif key in ("-c", "--collection"):
        if not _collection_exists(value):
            bibtask.write_message("ERROR: '%s' is not a valid collection." % value,
                stream=sys.stderr, verbose=0)
            return False
        # Accumulate collections across repeated -c occurrences.
        collections = bibtask.task_get_option("collections")
        collections = collections or []
        collections.append(value)
        bibtask.task_set_option("collections", collections)

    # Taxonomy option
    elif key in ("-k", "--taxonomy"):
        if not _ontology_exists(value):
            bibtask.write_message("ERROR: '%s' is not a valid taxonomy name." % value,
                stream=sys.stderr, verbose=0)
            return False
        bibtask.task_set_option("taxonomy", value)
    elif key in ("-f", "--force"):
        bibtask.task_set_option("force", True)
    else:
        # Unknown key: let the generic bibtask machinery handle it.
        return False

    return True
Ejemplo n.º 4
0
def _task_submit_check_options():
    """Required by bibtask. Checks the options."""
    rec_ids = bibtask.task_get_option('recids')
    colls = bibtask.task_get_option('collections')
    onto = bibtask.task_get_option('taxonomy')

    # Restricting the run to records or collections only makes sense
    # once a taxonomy has been chosen as well.
    scoped = rec_ids is not None or colls is not None
    if scoped and onto is None:
        bibtask.write_message("ERROR: When specifying a record ID or a collection, "
            "you have to precise which\ntaxonomy to use.", stream=sys.stderr,
            verbose=0)
        return False

    return True
Ejemplo n.º 5
0
def _task_submit_check_options():
    """Required by bibtask. Checks the options."""
    opts = dict(
        recids=bibtask.task_get_option('recids'),
        collections=bibtask.task_get_option('collections'),
        taxonomy=bibtask.task_get_option('taxonomy'),
    )

    # A record/collection restriction is meaningless without a taxonomy.
    restricted = (opts['recids'] is not None
                  or opts['collections'] is not None)
    if restricted and opts['taxonomy'] is None:
        bibtask.write_message(
            "ERROR: When specifying a record ID or a collection, "
            "you have to precise which\ntaxonomy to use.",
            stream=sys.stderr,
            verbose=0)
        return False

    return True
def _task_run_core():
    """Run the requested task in the bibsched environment.

    Recomputes the person cache, either for every person with papers or
    only for the expired entries, optionally via the multiprocessing
    implementation.  Always returns 1.
    """
    wants_all = bibtask.task_get_option('all_pids', False)
    use_mp = bibtask.task_get_option('mp', False)

    if wants_all:
        pids = list(get_existing_personids(with_papers_only=True))
    else:
        pids = get_expired_person_ids()

    # Pick the worker once, then run it over the selected person IDs.
    worker = compute_cache_mp if use_mp else compute_cache
    worker(pids)

    return 1
def _task_run_core():
    """Run the requested task in the bibsched environment.

    Refreshes the person cache for either all persons with papers or
    only the expired ones; always returns 1.
    """
    every_pid = bibtask.task_get_option('all_pids', False)
    multiproc = bibtask.task_get_option('mp', False)

    person_ids = (list(get_existing_personids(with_papers_only=True))
                  if every_pid else get_expired_person_ids())

    # Dispatch to the multiprocessing implementation when requested.
    (compute_cache_mp if multiproc else compute_cache)(person_ids)

    return 1
def _task_submit_check_options():
    """
    Required by bibtask. Checks the options.

    Exactly one of the main mode options (-a, -n, -U, -G, -R,
    --clean-cache, --update-cache) must be selected; -d/-m/-p only make
    sense together with -G or -R.  Returns True when the combination of
    options is acceptable, False otherwise.
    """
    lastname = bibtask.task_get_option('lastname')
    process_all = bibtask.task_get_option('process_all')
    prepare_grid = bibtask.task_get_option('prepare_grid')
    load_grid = bibtask.task_get_option('load_grid_results')
    data_dir = bibtask.task_get_option('data_dir')
    prefix = bibtask.task_get_option('prefix')
    max_records = bibtask.task_get_option('max_records')
    update = bibtask.task_get_option('update')
    clean_cache = bibtask.task_get_option('clean_cache')
    update_cache = bibtask.task_get_option('update_cache')

    # The option layer can hand back the literal string "None".
    if (lastname == "None," or lastname == "None"):
        lastname = False

    # BUGFIX: the previous XOR chain (bool(a) ^ bool(b) ^ ...) is True
    # whenever an *odd* number of modes is set, so e.g. three modes at
    # once slipped through the mutual-exclusion check.  Count explicitly.
    active_modes = sum(map(bool, (lastname, process_all, update,
                                  prepare_grid, load_grid, clean_cache,
                                  update_cache)))

    if active_modes == 0:
        bibtask.write_message("ERROR: One of the options -a, -n, -U, -G, -R, "
                              "--clean-cache, --update-cache is"
                              " required!", stream=sys.stdout, verbose=0)
        return False
    elif active_modes > 1:
        bibtask.write_message("ERROR: Options -a -n -U -R -G --clean-cache "
                              "--update-cache are mutually"
                              " exclusive!", stream=sys.stdout, verbose=0)
        return False
    elif ((not prepare_grid and (data_dir or prefix or max_records)) and
          (not load_grid and (data_dir))):
        # NOTE(review): this only fires when data_dir is set without -G/-R;
        # -m or -p alone still pass -- confirm that is intended.
        bibtask.write_message("ERROR: The options -d, -m and -p require -G or "
                              "-R to run!", stream=sys.stdout, verbose=0)
        return False
    elif load_grid and not bool(data_dir):
        bibtask.write_message("ERROR: The option -R requires the option -d "
                              "to run!", stream=sys.stdout, verbose=0)
        return False

    return True
Ejemplo n.º 9
0
def _task_submit_check_options():
    """
    Required by bibtask. Checks the options.
    """
    record_ids = bibtask.task_get_option('record_ids')
    all_records = bibtask.task_get_option('all_records')
    repair_pid = bibtask.task_get_option('repair_pid')
    fast_update_personid = bibtask.task_get_option('fast_update_personid')
    personid_gc = bibtask.task_get_option('personid_gc')

    # --record-ids and --all-records restrict the scope of the run; at
    # most one of them may be present.
    scope_count = bool(record_ids) + bool(all_records)
    if scope_count > 1:
        bibtask.write_message("ERROR: conflicting options: --record-ids and "
                              "--all-records cannot be specified at the same "
                              "time.", stream=sys.stdout, verbose=0)
        return False

    # Every supplied record ID must be numeric.
    if record_ids:
        bad_ids = [iden for iden in record_ids if not iden.isdigit()]
        if bad_ids:
            bibtask.write_message("ERROR: Record_ids expects numbers. "
                                  "Provided: %s." % bad_ids[0])
            return False

    # Exactly one of the three commands must be selected.
    command_count = (bool(repair_pid) + bool(fast_update_personid)
                     + bool(personid_gc))
    if command_count == 0:
        bibtask.write_message("ERROR: One of the options --fast-update-personid, "
                              "--personid-gc, --repair-personid is required!"
                              , stream=sys.stdout, verbose=0)
        return False
    elif command_count > 1:
        bibtask.write_message("ERROR: Options --fast-update-personid, "
                              "--personid-gc, --repair-personid "
                              "are mutually exclusive!", stream=sys.stdout, verbose=0)
        return False

    # Repair mode ignores any scope restriction, so reject one if given.
    if repair_pid and scope_count:
        bibtask.write_message("ERROR: --repair_pid does not require any parameters!"
                              , stream=sys.stdout, verbose=0)
        return False

    return True
Ejemplo n.º 10
0
def _task_run_core():
    """
    Runs the requested task in the bibsched environment.

    Depending on the submitted options this repairs the person tables,
    runs a fast personid update over the selected papers, and/or runs the
    personid garbage collection.  Always returns 1.
    """

    repair_pid = bibtask.task_get_option('repair_pid')
    fast_update_personid = bibtask.task_get_option('fast_update_personid')
    personid_gc = bibtask.task_get_option('personid_gc')
    record_ids = bibtask.task_get_option('record_ids')
    all_records = bibtask.task_get_option('all_records')

    # The helpers below expect a list of single-element lists (one
    # per record); None means "no explicit record restriction".
    if record_ids:
        record_ids_nested = [[p] for p in record_ids]
    else:
        record_ids_nested = None

    if repair_pid:
        # Full repair sequence: refresh the name caches, drop automatic
        # assignments, recompute the person IDs and clean the tables.
        # The sleep calls let bibsched pause the task between steps.
        bibtask.task_update_progress('Updating names cache...')
        _run_update_authornames_tables_from_paper()
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        bibtask.task_update_progress('Removing person entities not touched by '
                                     'humans...')
        personid_remove_automatically_assigned_papers()
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        bibtask.task_update_progress('Updating person entities...')
        update_personID_from_algorithm()
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        bibtask.task_update_progress('Cleaning person tables...')
        _run_update_personID_table_from_paper()
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        bibtask.task_update_progress('All repairs done.')

    if fast_update_personid:
        bibtask.task_update_progress('Updating personid...')
        _run_personid_fast_assign_papers(record_ids_nested, all_records)
        bibtask.task_update_progress('PersonID update finished!')

    if personid_gc:
        bibtask.task_update_progress('Updating personid (GC)...')
        _run_personid_gc(record_ids_nested, all_records)
        bibtask.task_update_progress('PersonID update finished (GC)!')
    return 1
Ejemplo n.º 11
0
def _task_run_core():
    """
    Runs the requested task in the bibsched environment.
    """
    if bibtask.task_get_option('update_personid'):
        rec_ids = bibtask.task_get_option('record_ids')
        if rec_ids:
            rec_ids = map(int, rec_ids)
        every_record = bibtask.task_get_option('all_records')

        # Incremental personid update ("rabbit").
        bibtask.task_update_progress('Updating personid...')
        run_rabbit(rec_ids, every_record)
        bibtask.task_update_progress('PersonID update finished!')

    if bibtask.task_get_option("disambiguate"):
        # Full disambiguation run ("tortoise"), optionally from scratch.
        from_scratch = bool(bibtask.task_get_option("from_scratch"))
        bibtask.task_update_progress('Performing full disambiguation...')
        run_tortoise(from_scratch)
        bibtask.task_update_progress('Full disambiguation finished!')

    if bibtask.task_get_option("merge"):
        bibtask.task_update_progress('Merging results...')
        run_merge()
        bibtask.task_update_progress('Merging finished!')

    return 1
Ejemplo n.º 12
0
def _task_run_core():
    """
    Runs the requested task in the bibsched environment.
    """
    opt = bibtask.task_get_option
    progress = bibtask.task_update_progress

    if opt('update_personid'):
        ids = opt('record_ids')
        if ids:
            ids = map(int, ids)
        everything = opt('all_records')

        progress('Updating personid...')
        run_rabbit(ids, everything)
        progress('PersonID update finished!')

    if opt("disambiguate"):
        progress('Performing full disambiguation...')
        run_tortoise(bool(opt("from_scratch")))
        progress('Full disambiguation finished!')

    if opt("merge"):
        progress('Merging results...')
        run_merge()
        progress('Merging finished!')

    return 1
Ejemplo n.º 13
0
def _get_recids_foreach_ontology(recids=None, collections=None, taxonomy=None):
    """Return a list of dicts pairing each ontology with the records
    to analyse.

    Each entry has the keys 'ontology', 'collection' and 'recIDs'.
    Explicit ``recids`` take precedence, then explicit ``collections``
    (both paired with ``taxonomy``); otherwise the collection/ontology
    pairs configured in collection_clsMETHOD are used, restricted to
    records modified since the method's last run (unless the run is
    forced or the method never ran).
    """
    rec_onts = []

    # User specified record IDs.
    if recids:
        rec_onts.append({
            'ontology': taxonomy,
            'collection': None,
            'recIDs': recids,
        })
        return rec_onts

    # User specified collections.
    if collections:
        for collection in collections:
            records = get_collection_reclist(collection)
            if records:
                rec_onts.append({
                    'ontology': taxonomy,
                    'collection': collection,
                    'recIDs': records
                })
        return rec_onts

    # Use rules found in collection_clsMETHOD.
    result = run_sql(
        "SELECT clsMETHOD.name, clsMETHOD.last_updated, "
        "collection.name FROM clsMETHOD JOIN collection_clsMETHOD ON "
        "clsMETHOD.id=id_clsMETHOD JOIN collection ON "
        "id_collection=collection.id")

    for ontology, date_last_run, collection in result:
        records = get_collection_reclist(collection)
        if records:
            if not date_last_run:
                # Never analysed before: consider every record modified.
                bibtask.write_message(
                    "INFO: Collection %s has not been previously "
                    "analyzed." % collection,
                    stream=sys.stderr,
                    verbose=3)
                modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
            elif bibtask.task_get_option('force'):
                # --force: re-analyse everything regardless of dates.
                bibtask.write_message(
                    "INFO: Analysis is forced for collection %s." % collection,
                    stream=sys.stderr,
                    verbose=3)
                modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
            else:
                # Only records touched since the last run of this method.
                modified_records = intbitset(
                    run_sql(
                        "SELECT id FROM bibrec "
                        "WHERE modification_date >= %s", (date_last_run, )))

            # Intersect the collection's records with the modified set.
            records &= modified_records
            if records:
                rec_onts.append({
                    'ontology': ontology,
                    'collection': collection,
                    'recIDs': records
                })
            else:
                bibtask.write_message(
                    "WARNING: All records from collection '%s' have "
                    "already been analyzed for keywords with ontology '%s' "
                    "on %s." % (collection, ontology, date_last_run),
                    stream=sys.stderr,
                    verbose=2)
        else:
            bibtask.write_message(
                "ERROR: Collection '%s' doesn't contain any record. "
                "Cannot analyse keywords." % (collection, ),
                stream=sys.stderr,
                verbose=0)

    return rec_onts
Ejemplo n.º 14
0
def _task_run_core():
    """
    Runs the requested task in the bibsched environment.

    Dispatches on the submitted options: single-last-name processing,
    full disambiguation, grid-job preparation/loading, cache update,
    authorid-universe update, cache cleaning, table repair and fast
    personid assignment.  Returns 1 on completion, 0 on early abort.
    """

    lastname = bibtask.task_get_option("lastname")
    process_all = bibtask.task_get_option("process_all")
    prepare_grid = bibtask.task_get_option("prepare_grid")
    load_grid = bibtask.task_get_option("load_grid_results")
    data_dir = bibtask.task_get_option("data_dir")
    prefix = bibtask.task_get_option("prefix")
    max_records_option = bibtask.task_get_option("max_records")
    update = bibtask.task_get_option("update")
    clean_cache = bibtask.task_get_option("clean_cache")
    update_cache = bibtask.task_get_option("update_cache")
    record_ids = bibtask.task_get_option("record_ids")
    record_ids_nested = None
    all_records = bibtask.task_get_option("all_records")
    repair_pid = bibtask.task_get_option("repair_pid")
    fast_update_personid = bibtask.task_get_option("fast_update_personid")

    # Downstream helpers expect lists of single-element lists.
    if record_ids:
        record_ids_nested = [[p] for p in record_ids]

    if fast_update_personid:
        fast_update_personid = [[p] for p in fast_update_personid]
    #    automated_daemon_mode_p = True

    if lastname:
        bibtask.write_message("Processing last name %s" % (lastname), stream=sys.stdout, verbose=0)

    if process_all:
        # Full run over all author names; not available in standalone mode.
        if bconfig.STANDALONE:
            bibtask.write_message("Processing not possible in standalone!", stream=sys.stdout, verbose=0)
            return 0

        bibtask.write_message("Processing all names...", stream=sys.stdout, verbose=0)

        lengths = get_len_authornames_bibrefs()

        if not check_and_create_aid_tables():
            bibtask.write_message("Failed to create database tables!", stream=sys.stdout, verbose=0)
            return 0

        # Lazily populate the lookup tables on first run.
        if lengths["names"] < 1:
            bibtask.write_message("Populating Authornames table. It's Empty.", stream=sys.stdout, verbose=0)
            bibtask.task_update_progress("Populating Authornames table.")
            populate_authornames()
            insert_user_log(
                "daemon",
                "-1",
                "UATFP",
                "bibsched",
                "status",
                comment="bibauthorid_daemon, " "update_authornames_tables_from_paper",
            )

        if lengths["bibrefs"] < 1:
            bibtask.write_message("Populating Bibrefs lookup. It's Empty.", stream=sys.stdout, verbose=0)
            bibtask.task_update_progress("Populating Bibrefs lookup table.")
            populate_authornames_bibrefs_from_authornames()

        bibtask.task_update_progress("Processing all authors.")
        start_full_disambiguation(
            last_names="all", process_orphans=True, db_exists=False, populate_doclist=True, write_to_db=True
        )
        update_personID_from_algorithm()
        insert_user_log(
            "daemon", "-1", "update_aid", "bibsched", "status", comment="bibauthorid_daemon, update_authorid_universe"
        )

    if prepare_grid:
        # Export the data files a grid job will work on; -d/-p/-m
        # override the defaults below.
        bibtask.write_message("Preparing Grid Job", stream=sys.stdout, verbose=0)
        data_dir_name = "grid_data"
        workdir_prefix = "job"
        max_records = 4000

        if data_dir:
            data_dir_name = data_dir

        if prefix:
            workdir_prefix = prefix

        if max_records_option:
            max_records = max_records_option

        _prepare_data_files_from_db(data_dir_name, workdir_prefix, max_records)

    if load_grid:
        # Import the results a grid job produced back into the database.
        bibtask.write_message(
            "Reading Grid Job results and will write" " them to the database.", stream=sys.stdout, verbose=0
        )

        _write_data_files_to_db(data_dir)

    if update or update_cache:
        bibtask.write_message("update-cache: Processing recently updated" " papers", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress("update-cache: Processing recently" " updated papers")
        _run_update_authornames_tables_from_paper(record_ids_nested, all_records)
        bibtask.write_message("update-cache: Finished processing papers", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress("update-cache: DONE")

    if update:
        bibtask.write_message("updating authorid universe", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress("updating authorid universe")
        _update_authorid_universe(record_ids, all_records)
        bibtask.write_message("done updating authorid universe", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress("done updating authorid universe")

    if clean_cache:
        bibtask.write_message("clean-cache: Processing recently updated" " papers", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress("clean-cache: Processing recently updated" " papers for names")
        _run_authornames_tables_gc()
        bibtask.write_message("update-cache: Finished cleaning authornames " "tables", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress("clean-cache: Processing recently updated" " papers for persons")
        _run_update_personID_table_from_paper(record_ids_nested, all_records)
        bibtask.write_message("update-cache: Finished cleaning PersonID" " table", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress("clean-cache: DONE")

    if repair_pid:
        # Full repair sequence; the sleep calls let bibsched pause the
        # task between the individual steps.
        bibtask.task_update_progress("Updating names cache...")
        _run_update_authornames_tables_from_paper()
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        bibtask.task_update_progress("Removing person entities not touched by " "humans...")
        personid_remove_automatically_assigned_papers()
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        bibtask.task_update_progress("Updating person entities...")
        update_personID_from_algorithm()
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        bibtask.task_update_progress("Cleaning person tables...")
        _run_update_personID_table_from_paper()
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        bibtask.task_update_progress("All repairs done.")

    if fast_update_personid:
        bibtask.task_update_progress("Updating personid...")
        _run_personid_fast_assign_papers(fast_update_personid)
        bibtask.task_update_progress("Update finished...")
        # TODO: remember to pass the papers list!
    return 1
def _task_run_core():
    """
    Runs the requested task in the bibsched environment.

    Dispatches on the submitted options: single-last-name processing,
    full disambiguation, grid-job preparation/loading, cache update,
    authorid-universe update and cache cleaning.  Returns 1 on
    completion, 0 on early abort.
    """

    lastname = bibtask.task_get_option('lastname')
    process_all = bibtask.task_get_option('process_all')
    prepare_grid = bibtask.task_get_option('prepare_grid')
    load_grid = bibtask.task_get_option('load_grid_results')
    data_dir = bibtask.task_get_option('data_dir')
    prefix = bibtask.task_get_option('prefix')
    max_records_option = bibtask.task_get_option('max_records')
    update = bibtask.task_get_option('update')
    clean_cache = bibtask.task_get_option('clean_cache')
    update_cache = bibtask.task_get_option('update_cache')

#    automated_daemon_mode_p = True

    if lastname:
        bibtask.write_message("Processing last name %s" % (lastname),
                              stream=sys.stdout, verbose=0)

    if process_all:
        # Full run over all author names; not available in standalone mode.
        if bconfig.STANDALONE:
            bibtask.write_message("Processing not possible in standalone!",
                                  stream=sys.stdout, verbose=0)
            return 0

        bibtask.write_message("Processing all names...",
                              stream=sys.stdout, verbose=0)

        lengths = get_len_authornames_bibrefs()

        if not check_and_create_aid_tables():
            bibtask.write_message("Failed to create database tables!",
                                  stream=sys.stdout, verbose=0)
            return 0

        # Lazily populate the lookup tables on first run.
        if lengths['names'] < 1:
            bibtask.write_message("Populating Authornames table. It's Empty.",
                                  stream=sys.stdout, verbose=0)
            bibtask.task_update_progress('Populating Authornames table.')
            populate_authornames()
            insert_user_log('daemon', '-1', 'UATFP', 'bibsched', 'status',
                            comment='bibauthorid_daemon, '
                            'update_authornames_tables_from_paper')


        if lengths['bibrefs'] < 1:
            bibtask.write_message("Populating Bibrefs lookup. It's Empty.",
                                  stream=sys.stdout, verbose=0)
            bibtask.task_update_progress('Populating Bibrefs lookup table.')
            populate_authornames_bibrefs_from_authornames()

        bibtask.task_update_progress('Processing all authors.')
        start_full_disambiguation(last_names="all",
                                 process_orphans=True,
                                 db_exists=False,
                                 populate_doclist=True,
                                 write_to_db=True)
        update_personID_from_algorithm()
        insert_user_log('daemon', '-1', 'update_aid', 'bibsched', 'status',
                    comment='bibauthorid_daemon, update_authorid_universe')

    if prepare_grid:
        # Export the data files a grid job will work on; -d/-p/-m
        # override the defaults below.
        bibtask.write_message("Preparing Grid Job",
                              stream=sys.stdout, verbose=0)
        data_dir_name = "grid_data"
        workdir_prefix = "job"
        max_records = 4000

        if data_dir:
            data_dir_name = data_dir

        if prefix:
            workdir_prefix = prefix

        if max_records_option:
            max_records = max_records_option

        _prepare_data_files_from_db(data_dir_name, workdir_prefix, max_records)

    if load_grid:
        # Import the results a grid job produced back into the database.
        bibtask.write_message("Reading Grid Job results and will write"
                              " them to the database.",
                              stream=sys.stdout, verbose=0)

        _write_data_files_to_db(data_dir)

    if update or update_cache:
        bibtask.write_message("update-cache: Processing recently updated"
                              " papers", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress('update-cache: Processing recently'
                                     ' updated papers')
        _run_update_authornames_tables_from_paper()
        bibtask.write_message("update-cache: Finished processing papers",
                              stream=sys.stdout, verbose=0)
        bibtask.task_update_progress('update-cache: DONE')

    if update:
        bibtask.write_message("updating authorid universe",
                              stream=sys.stdout, verbose=0)
        bibtask.task_update_progress('updating authorid universe')
        _update_authorid_universe()
        bibtask.write_message("done updating authorid universe",
                              stream=sys.stdout, verbose=0)
        bibtask.task_update_progress('done updating authorid universe')

    if clean_cache:
        bibtask.write_message("clean-cache: Processing recently updated"
                              " papers", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress('clean-cache: Processing recently updated'
                                     ' papers for names')
        _run_authornames_tables_gc()
        bibtask.write_message("update-cache: Finished cleaning authornames "
                              "tables", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress('clean-cache: Processing recently updated'
                                     ' papers for persons')
        _run_update_personID_table_from_paper()
        bibtask.write_message("update-cache: Finished cleaning PersonID"
                              " table", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress('clean-cache: DONE')

    return 1
Ejemplo n.º 16
0
def _task_submit_check_options():
    """
    Required by bibtask. Checks the options.

    Ensures exactly one of --update-personid / --disambiguate / --merge
    was given and that only the modifier options allowed for that command
    are present.  Returns True on a valid combination, otherwise writes a
    diagnostic and returns False.
    """
    update_personid = bibtask.task_get_option("update_personid")
    disambiguate = bibtask.task_get_option("disambiguate")
    merge = bibtask.task_get_option("merge")

    record_ids = bibtask.task_get_option("record_ids")
    all_records = bibtask.task_get_option("all_records")
    from_scratch = bibtask.task_get_option("from_scratch")

    # Exactly one of the three commands must be selected.
    selected = sum(map(bool, (update_personid, disambiguate, merge)))

    if selected == 0:
        bibtask.write_message("ERROR: At least one command should be specified!"
                              , stream=sys.stdout, verbose=0)
        return False

    if selected > 1:
        bibtask.write_message("ERROR: The options --update-personid, --disambiguate "
                              "and --merge are mutually exclusive."
                              , stream=sys.stdout, verbose=0)
        return False

    assert selected == 1

    if update_personid:
        # --update-personid tolerates --record-ids / --all-records only.
        if from_scratch:
            bibtask.write_message("ERROR: The only options which can be specified "
                                  "with --update-personid are --record-ids and "
                                  "--all-records"
                                  , stream=sys.stdout, verbose=0)
            return False

        if record_ids and all_records:
            bibtask.write_message("ERROR: conflicting options: --record-ids and "
                                  "--all-records are mutually exclusive."
                                  , stream=sys.stdout, verbose=0)
            return False

        # Record ids arrive as strings; each one must be purely numeric.
        for iden in record_ids or ():
            if not iden.isdigit():
                bibtask.write_message("ERROR: Record_ids expects numbers. "
                                      "Provided: %s." % iden)
                return False

    if disambiguate and (record_ids or all_records):
        bibtask.write_message("ERROR: The only option which can be specified "
                              "with --disambiguate is from-scratch"
                              , stream=sys.stdout, verbose=0)
        return False

    if merge and (record_ids or all_records or from_scratch):
        bibtask.write_message("ERROR: There are no options which can be "
                              "specified along with --merge"
                              , stream=sys.stdout, verbose=0)
        return False

    return True
Ejemplo n.º 17
0
def _task_submit_check_options():
    """
    Required by bibtask. Checks the options.

    Validates the command-line option combination: --record-ids and
    --all-records may not be combined, exactly one of the mutually
    exclusive task commands must be selected, and the grid helper options
    (-d, -m, -p) are only meaningful together with -G / -R.  Returns True
    when the options are consistent, False otherwise (after writing a
    diagnostic to stdout).
    """
    lastname = bibtask.task_get_option("lastname")
    process_all = bibtask.task_get_option("process_all")
    prepare_grid = bibtask.task_get_option("prepare_grid")
    load_grid = bibtask.task_get_option("load_grid_results")
    data_dir = bibtask.task_get_option("data_dir")
    prefix = bibtask.task_get_option("prefix")
    max_records = bibtask.task_get_option("max_records")
    update = bibtask.task_get_option("update")
    clean_cache = bibtask.task_get_option("clean_cache")
    update_cache = bibtask.task_get_option("update_cache")
    record_ids = bibtask.task_get_option("record_ids")
    all_records = bibtask.task_get_option("all_records")
    repair_pid = bibtask.task_get_option("repair_pid")
    fast_update_personid = bibtask.task_get_option("fast_update_personid")

    if record_ids and all_records:
        bibtask.write_message(
            "ERROR: conflicting options: --record-ids and " "--all-records cannot be specified at the same " "time.",
            stream=sys.stdout,
            verbose=0,
        )
        return False

    # "None" / "None," are artefacts of empty option parsing; normalize
    # them to "no last name given".
    if lastname == "None," or lastname == "None":
        lastname = False

    if (
        not lastname
        and not process_all
        and not update
        and not prepare_grid
        and not load_grid
        and not clean_cache
        and not update_cache
        and not fast_update_personid
    ):
        bibtask.write_message(
            "ERROR: One of the options -a, -n, -U, -G, -R, "
            "--clean-cache, --update-cache, --fast-update-personid is"
            " required!",
            stream=sys.stdout,
            verbose=0,
        )
        return False

    # BUG FIX: the mutual-exclusion test previously chained ^ (XOR) over
    # the command flags.  XOR of an odd number of truthy flags is True,
    # so e.g. three commands given together slipped through validation.
    # Count the selected commands instead and demand exactly one.
    num_commands = sum(
        map(
            bool,
            (
                lastname,
                process_all,
                update,
                prepare_grid,
                load_grid,
                clean_cache,
                update_cache,
                repair_pid,
                fast_update_personid,
            ),
        )
    )
    if num_commands != 1:
        bibtask.write_message(
            "ERROR: Options -a -n -U -R -G --clean-cache "
            "--update-cache --repair-personid --fast-update-personid "
            "are mutually"
            " exclusive!",
            stream=sys.stdout,
            verbose=0,
        )
        return False
    elif (not prepare_grid and (data_dir or prefix or max_records)) and (not load_grid and (data_dir)):
        # NOTE(review): this only fires when data_dir is set; a lone
        # -m/-p without -d still slips through -- confirm intended.
        bibtask.write_message(
            "ERROR: The options -d, -m and -p require -G or " "-R to run!", stream=sys.stdout, verbose=0
        )
        return False
    elif load_grid and not bool(data_dir):
        # -R reads grid results from disk, so the directory is mandatory.
        bibtask.write_message("ERROR: The option -R requires the option -d " "to run!", stream=sys.stdout, verbose=0)
        return False

    return True
def _task_run_core():
    """
    Runs the requested task in the bibsched environment.

    Reads the (already validated) task options and executes the matching
    phases in order: single last-name announcement, full disambiguation
    (-a), grid-job preparation (-G), grid-result loading (-R), cache
    update (-U / --update-cache) and cache cleaning (--clean-cache).
    Always returns 1 (success) unless a precondition fails early.
    """

    lastname = bibtask.task_get_option('lastname')
    process_all = bibtask.task_get_option('process_all')
    prepare_grid = bibtask.task_get_option('prepare_grid')
    load_grid = bibtask.task_get_option('load_grid_results')
    data_dir = bibtask.task_get_option('data_dir')
    prefix = bibtask.task_get_option('prefix')
    max_records_option = bibtask.task_get_option('max_records')
    update = bibtask.task_get_option('update')
    clean_cache = bibtask.task_get_option('clean_cache')
    update_cache = bibtask.task_get_option('update_cache')

    #    automated_daemon_mode_p = True

    if lastname:
        # NOTE(review): this branch only announces the last name; the
        # actual per-name processing is not visible here -- confirm where
        # it happens.
        bibtask.write_message("Processing last name %s" % (lastname),
                              stream=sys.stdout,
                              verbose=0)

    if process_all:
        # Full disambiguation needs the database; impossible standalone.
        if bconfig.STANDALONE:
            bibtask.write_message("Processing not possible in standalone!",
                                  stream=sys.stdout,
                                  verbose=0)
            return 0

        bibtask.write_message("Processing all names...",
                              stream=sys.stdout,
                              verbose=0)

        lengths = get_len_authornames_bibrefs()

        if not check_and_create_aid_tables():
            bibtask.write_message("Failed to create database tables!",
                                  stream=sys.stdout,
                                  verbose=0)
            return 0

        # Lazily populate the authornames lookup table on first run.
        if lengths['names'] < 1:
            bibtask.write_message("Populating Authornames table. It's Empty.",
                                  stream=sys.stdout,
                                  verbose=0)
            bibtask.task_update_progress('Populating Authornames table.')
            populate_authornames()
            insert_user_log('daemon',
                            '-1',
                            'UATFP',
                            'bibsched',
                            'status',
                            comment='bibauthorid_daemon, '
                            'update_authornames_tables_from_paper')

        # Same lazy initialization for the bibrefs lookup table.
        if lengths['bibrefs'] < 1:
            bibtask.write_message("Populating Bibrefs lookup. It's Empty.",
                                  stream=sys.stdout,
                                  verbose=0)
            bibtask.task_update_progress('Populating Bibrefs lookup table.')
            populate_authornames_bibrefs_from_authornames()

        # Run the disambiguation over every last name and persist results.
        bibtask.task_update_progress('Processing all authors.')
        start_full_disambiguation(last_names="all",
                                  process_orphans=True,
                                  db_exists=False,
                                  populate_doclist=True,
                                  write_to_db=True)
        update_personID_from_algorithm()
        insert_user_log('daemon',
                        '-1',
                        'update_aid',
                        'bibsched',
                        'status',
                        comment='bibauthorid_daemon, update_authorid_universe')

    if prepare_grid:
        bibtask.write_message("Preparing Grid Job",
                              stream=sys.stdout,
                              verbose=0)
        # Defaults, overridden below by -d / -p / -m when supplied.
        data_dir_name = "grid_data"
        workdir_prefix = "job"
        max_records = 4000

        if data_dir:
            data_dir_name = data_dir

        if prefix:
            workdir_prefix = prefix

        if max_records_option:
            max_records = max_records_option

        _prepare_data_files_from_db(data_dir_name, workdir_prefix, max_records)

    if load_grid:
        # -R: option validation guarantees data_dir is set by this point.
        bibtask.write_message(
            "Reading Grid Job results and will write"
            " them to the database.",
            stream=sys.stdout,
            verbose=0)

        _write_data_files_to_db(data_dir)

    # -U implies refreshing the caches before updating the universe.
    if update or update_cache:
        bibtask.write_message(
            "update-cache: Processing recently updated"
            " papers",
            stream=sys.stdout,
            verbose=0)
        bibtask.task_update_progress('update-cache: Processing recently'
                                     ' updated papers')
        _run_update_authornames_tables_from_paper()
        bibtask.write_message("update-cache: Finished processing papers",
                              stream=sys.stdout,
                              verbose=0)
        bibtask.task_update_progress('update-cache: DONE')

    if update:
        bibtask.write_message("updating authorid universe",
                              stream=sys.stdout,
                              verbose=0)
        bibtask.task_update_progress('updating authorid universe')
        _update_authorid_universe()
        bibtask.write_message("done updating authorid universe",
                              stream=sys.stdout,
                              verbose=0)
        bibtask.task_update_progress('done updating authorid universe')

    if clean_cache:
        # Garbage-collect the authornames tables, then the PersonID table.
        bibtask.write_message(
            "clean-cache: Processing recently updated"
            " papers",
            stream=sys.stdout,
            verbose=0)
        bibtask.task_update_progress('clean-cache: Processing recently updated'
                                     ' papers for names')
        _run_authornames_tables_gc()
        bibtask.write_message(
            "update-cache: Finished cleaning authornames "
            "tables",
            stream=sys.stdout,
            verbose=0)
        bibtask.task_update_progress('clean-cache: Processing recently updated'
                                     ' papers for persons')
        _run_update_personID_table_from_paper()
        bibtask.write_message(
            "update-cache: Finished cleaning PersonID"
            " table",
            stream=sys.stdout,
            verbose=0)
        bibtask.task_update_progress('clean-cache: DONE')

    return 1
Ejemplo n.º 19
0
def _task_run_core():
    """Runs analyse_documents for each ontology, collection, record ids
    set.

    Determines which (ontology, collection, recIDs) triples to process,
    extracts keywords for each, streams the resulting MARCXML into a
    temporary file and uploads it when CFG_DB_SAVE_KW is set.  When
    running as the automated daemon (no explicit recids/collections) the
    clsMETHOD last-run timestamp is updated afterwards.  Returns 1.
    """

    automated_daemon_mode_p = True
    recids = bibtask.task_get_option('recids')
    collections = bibtask.task_get_option('collections')
    taxonomy = bibtask.task_get_option('taxonomy')

    if recids or collections:
        # We want to run some records/collection only, so we are not
        # in the automated daemon mode; this will be useful later.
        automated_daemon_mode_p = False

    # Check if the user specified which documents to extract keywords from.
    if recids:
        onto_recids = _get_recids_foreach_ontology(recids=recids,
                                                   taxonomy=taxonomy)
    elif collections:
        onto_recids = _get_recids_foreach_ontology(collections=collections,
                                                   taxonomy=taxonomy)
    else:
        onto_recids = _get_recids_foreach_ontology()

    if not onto_recids:
        # Nothing to do.
        if automated_daemon_mode_p:
            _update_date_of_last_run(
                bibtask.task_get_task_param('task_starting_time'))
        return 1

    # We will write to a temporary file as we go, because we might be
    # processing big collections with many docs.
    _rid = time.strftime("%Y%m%d%H%M%S", time.localtime())
    abs_path = bibclassify_engine.get_tmp_file(_rid)

    # Count the total number of records in order to update the progression.
    global _RECIDS_NUMBER
    for onto_rec in onto_recids:
        _RECIDS_NUMBER += len(onto_rec['recIDs'])

    rec_added = False

    # BUG FIX: the file handle was previously opened and closed manually,
    # so any exception raised while analysing documents leaked the handle.
    # A 'with' block guarantees it is closed on every exit path.
    with open(abs_path, 'w') as fo:
        fo.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        fo.write('<collection xmlns="http://www.loc.gov/MARC21/slim">\n')

        for onto_rec in onto_recids:
            bibtask.task_sleep_now_if_required(can_stop_too=False)

            if onto_rec['collection'] is not None:
                bibtask.write_message(
                    'INFO: Applying taxonomy %s to collection %s (%s '
                    'records)' % (onto_rec['ontology'], onto_rec['collection'],
                                  len(onto_rec['recIDs'])),
                    stream=sys.stderr,
                    verbose=3)
            else:
                bibtask.write_message(
                    'INFO: Applying taxonomy %s to recIDs %s. ' %
                    (onto_rec['ontology'], ', '.join(
                        [str(recid) for recid in onto_rec['recIDs']])),
                    stream=sys.stderr,
                    verbose=3)
            if onto_rec['recIDs']:
                xml = _analyze_documents(onto_rec['recIDs'], onto_rec['ontology'],
                                         onto_rec['collection'])
                # Anything longer than a trivial stub is real MARCXML output.
                if len(xml) > 5:
                    fo.write(xml)
                    rec_added = True

        fo.write('</collection>\n')

    # Apply the changes.
    if rec_added:
        if bconfig.CFG_DB_SAVE_KW:
            bibclassify_webinterface.upload_keywords(abs_path)
        else:
            bibtask.write_message(
                "INFO: CFG_DB_SAVE_KW is false, we don't save results",
                stream=sys.stderr,
                verbose=0)
    else:
        # Nothing produced: warn and drop the (header-only) temp file.
        bibtask.write_message("WARNING: No keywords found, recids: %s" %
                              onto_recids,
                              stream=sys.stderr,
                              verbose=0)
        os.remove(abs_path)

    # Update the date of last run in the clsMETHOD table, but only if
    # we were running in an automated mode.
    if automated_daemon_mode_p:
        _update_date_of_last_run(
            bibtask.task_get_task_param('task_starting_time'))
    return 1
Ejemplo n.º 20
0
def _task_run_core():
    """Runs analyse_documents for each ontology, collection, record ids
    set.

    Builds the list of (ontology, collection, recIDs) triples to process,
    extracts keywords for each, streams the resulting MARCXML into a
    temporary file and uploads it when CFG_DB_SAVE_KW is set.  In the
    automated daemon mode (no explicit recids/collections) the last-run
    timestamp is updated afterwards.  Returns 1.
    """

    automated_daemon_mode_p = True
    recids = bibtask.task_get_option('recids')
    collections = bibtask.task_get_option('collections')
    taxonomy = bibtask.task_get_option('taxonomy')

    if recids or collections:
        # We want to run some records/collection only, so we are not
        # in the automated daemon mode; this will be useful later.
        automated_daemon_mode_p = False

    # Check if the user specified which documents to extract keywords from.
    if recids:
        onto_recids = _get_recids_foreach_ontology(recids=recids,
            taxonomy=taxonomy)
    elif collections:
        onto_recids = _get_recids_foreach_ontology(collections=collections,
            taxonomy=taxonomy)
    else:
        onto_recids = _get_recids_foreach_ontology()

    if not onto_recids:
        # Nothing to do.
        if automated_daemon_mode_p:
            _update_date_of_last_run(bibtask.task_get_task_param('task_starting_time'))
        return 1

    # We will write to a temporary file as we go, because we might be processing
    # big collections with many docs
    _rid = time.strftime("%Y%m%d%H%M%S", time.localtime())
    abs_path = bibclassify_engine.get_tmp_file(_rid)
    # NOTE(review): fo is closed manually below; an exception raised while
    # analysing documents would leak the handle -- consider 'with open(...)'.
    fo = open(abs_path, 'w')


    fo.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    fo.write('<collection xmlns="http://www.loc.gov/MARC21/slim">\n')

    # Count the total number of records in order to update the progression.
    global _RECIDS_NUMBER
    for onto_rec in onto_recids:
        _RECIDS_NUMBER += len(onto_rec['recIDs'])

    rec_added = False

    for onto_rec in onto_recids:
        # Let bibsched pause the task between collections if requested.
        bibtask.task_sleep_now_if_required(can_stop_too=False)

        if onto_rec['collection'] is not None:
            bibtask.write_message('INFO: Applying taxonomy %s to collection %s (%s '
                'records)' % (onto_rec['ontology'], onto_rec['collection'],
                len(onto_rec['recIDs'])), stream=sys.stderr, verbose=3)
        else:
            bibtask.write_message('INFO: Applying taxonomy %s to recIDs %s. ' %
                (onto_rec['ontology'],
                ', '.join([str(recid) for recid in onto_rec['recIDs']])),
                stream=sys.stderr, verbose=3)
        if onto_rec['recIDs']:
            xml = _analyze_documents(onto_rec['recIDs'],
                onto_rec['ontology'], onto_rec['collection'])
            # Anything longer than a trivial stub is real MARCXML output.
            if len(xml) > 5:
                fo.write(xml)
                rec_added = True

    fo.write('</collection>\n')
    fo.close()

    # Apply the changes.
    if rec_added:
        if bconfig.CFG_DB_SAVE_KW:
            bibclassify_webinterface.upload_keywords(abs_path)
        else:
            bibtask.write_message("INFO: CFG_DB_SAVE_KW is false, we don't save results",
                                  stream=sys.stderr, verbose=0)
    else:
        # Nothing produced: warn and drop the (header-only) temp file.
        bibtask.write_message("WARNING: No keywords found, recids: %s" % onto_recids,
                                  stream=sys.stderr, verbose=0)
        os.remove(abs_path)

    # Update the date of last run in the clsMETHOD table, but only if
    # we were running in an automated mode.
    if automated_daemon_mode_p:
        _update_date_of_last_run(bibtask.task_get_task_param('task_starting_time'))
    return 1
Ejemplo n.º 21
0
def _task_submit_check_options():
    """
    Required by bibtask. Checks the options.

    Checks that one, and only one, of the three commands
    (--update-personid, --disambiguate, --merge) was requested and that
    the modifier options supplied are compatible with that command.
    """
    update_personid = bibtask.task_get_option("update_personid")
    disambiguate = bibtask.task_get_option("disambiguate")
    merge = bibtask.task_get_option("merge")

    record_ids = bibtask.task_get_option("record_ids")
    all_records = bibtask.task_get_option("all_records")
    from_scratch = bibtask.task_get_option("from_scratch")

    def _reject(msg):
        # Every rejection path shares the same stream and verbosity.
        bibtask.write_message(msg, stream=sys.stdout, verbose=0)
        return False

    active = [cmd for cmd in (update_personid, disambiguate, merge) if cmd]

    if not active:
        return _reject("ERROR: At least one command should be specified!")

    if len(active) > 1:
        return _reject("ERROR: The options --update-personid, --disambiguate "
                       "and --merge are mutually exclusive.")

    if update_personid:
        if from_scratch:
            return _reject("ERROR: The only options which can be specified "
                           "with --update-personid are --record-ids and "
                           "--all-records")

        if record_ids and all_records:
            return _reject("ERROR: conflicting options: --record-ids and "
                           "--all-records are mutually exclusive.")

        if record_ids:
            # Record ids arrive as strings and must be purely numeric.
            for iden in record_ids:
                if not iden.isdigit():
                    bibtask.write_message("ERROR: Record_ids expects numbers. "
                                          "Provided: %s." % iden)
                    return False

    if disambiguate and (record_ids or all_records):
        return _reject("ERROR: The only option which can be specified "
                       "with --disambiguate is from-scratch")

    if merge and (record_ids or all_records or from_scratch):
        return _reject("ERROR: There are no options which can be "
                       "specified along with --merge")

    return True
Ejemplo n.º 22
0
def _get_recids_foreach_ontology(recids=None, collections=None, taxonomy=None):
    """Returns an array containing hash objects containing the
    collection, its corresponding ontology and the records belonging to
    the given collection.

    With explicit ``recids`` or ``collections`` the mapping is built
    directly from the arguments using ``taxonomy``; otherwise the
    clsMETHOD / collection_clsMETHOD rules in the database decide which
    (ontology, collection) pairs are due, restricted to records modified
    since that ontology's last run (unless --force is set or the
    ontology was never run).
    """
    mappings = []

    # Caller supplied explicit record IDs: a single entry, no collection.
    if recids:
        mappings.append({
            'ontology': taxonomy,
            'collection': None,
            'recIDs': recids,
        })
        return mappings

    # Caller supplied collections: one entry per non-empty collection.
    if collections:
        for coll in collections:
            coll_records = get_collection_reclist(coll)
            if coll_records:
                mappings.append({
                    'ontology': taxonomy,
                    'collection': coll,
                    'recIDs': coll_records
                })
        return mappings

    # Use rules found in collection_clsMETHOD.
    rules = run_sql("SELECT clsMETHOD.name, clsMETHOD.last_updated, "
                    "collection.name FROM clsMETHOD JOIN collection_clsMETHOD ON "
                    "clsMETHOD.id=id_clsMETHOD JOIN collection ON "
                    "id_collection=collection.id")

    for ontology, date_last_run, collection in rules:
        records = get_collection_reclist(collection)
        if not records:
            bibtask.write_message("ERROR: Collection '%s' doesn't contain any record. "
                                  "Cannot analyse keywords." % (collection,),
                                  stream=sys.stderr, verbose=0)
            continue

        # Decide which records count as "modified" for this ontology.
        if not date_last_run:
            bibtask.write_message("INFO: Collection %s has not been previously "
                                  "analyzed." % collection, stream=sys.stderr, verbose=3)
            modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
        elif bibtask.task_get_option('force'):
            bibtask.write_message("INFO: Analysis is forced for collection %s." %
                                  collection, stream=sys.stderr, verbose=3)
            modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
        else:
            modified_records = intbitset(run_sql("SELECT id FROM bibrec "
                                                 "WHERE modification_date >= %s",
                                                 (date_last_run, )))

        records &= modified_records
        if records:
            mappings.append({
                'ontology': ontology,
                'collection': collection,
                'recIDs': records
            })
        else:
            bibtask.write_message("WARNING: All records from collection '%s' have "
                                  "already been analyzed for keywords with ontology '%s' "
                                  "on %s." % (collection, ontology, date_last_run),
                                  stream=sys.stderr, verbose=2)

    return mappings