def run_rabbit(paperslist, all_records=False):
    if not paperslist and all_records:
        rabbit_with_log(None, True, "bibauthorid_daemon, update_personid on all papers")
    elif not paperslist:
        last_log = get_user_log(userinfo="daemon", action="PID_UPDATE", only_most_recent=True)

        if len(last_log) >= 1:
            # select only the most recent papers
            recently_modified = get_recently_modified_record_ids(date=last_log[0][2])
            if not recently_modified:
                bibtask.write_message(
                    "update_personID_table_from_paper: " "All person entities up to date.", stream=sys.stdout, verbose=0
                )
            else:
                bibtask.write_message(
                    "update_personID_table_from_paper: Running on: " + str(recently_modified),
                    stream=sys.stdout,
                    verbose=0,
                )
                rabbit_with_log(
                    recently_modified,
                    True,
                    "bibauthorid_daemon, run_personid_fast_assign_papers on "
                    + str([paperslist, all_records, recently_modified]),
                )
        else:
            rabbit_with_log(None, True, "bibauthorid_daemon, update_personid on all papers")
    else:
        rabbit_with_log(
            paperslist, True, "bibauthorid_daemon, personid_fast_assign_papers on " + str(paperslist), partial=True
        )
def _task_submit_elaborate_specific_parameter(key, value, opts, args):
    """
    Given the string key it checks it's meaning, eventually using the
    value. Usually, it fills some key in the options dict.
    It must return True if it has elaborated the key, False, if it doesn't
    know that key.
    """
    if key in ("-n", "--lastname"):
        if value == "None," or value == "None":
            bibtask.write_message("The value specified for --lastname must "
                "be a valid name. Not '%s'." % value, stream=sys.stdout,
                verbose=0)
            return False

        bibtask.task_set_option('lastname', value)

    elif key in ("-a", "--process-all"):
        bibtask.task_set_option("process_all", True)

    elif key in ("-U", "--update-universe"):
        bibtask.task_set_option("update", True)

    elif key in ("-G", "--prepare-grid"):
        bibtask.task_set_option("prepare_grid", True)

    elif key in ("-R", "--load-grid-results"):
        bibtask.task_set_option("load_grid_results", True)

    elif key in ("-d", "--data-dir"):
        bibtask.task_set_option("data_dir", value)

    elif key in ("-p", "--prefix"):
        bibtask.task_set_option("prefix", value)

    elif key in ("-m", "--max-records"):
        bibtask.task_set_option("max_records", value)

    elif key in ("--update-cache",):
        bibtask.task_set_option("update_cache", True)

    elif key in ("--clean-cache",):
        bibtask.task_set_option("clean_cache", True)

    elif key in ("--record-ids", '-r'):
        if value.count("="):
            value = value[1:]

        value = value.split(",")
        bibtask.task_set_option("record_ids", value)

    elif key in ("--all-records"):
        bibtask.task_set_option("all_records", True)

    elif key in ("--repair-personid"):
        bibtask.task_set_option("repair_pid", True)

    else:
        return False

    return True
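# Hedged sketch (not part of bibtask): how a task runner is assumed to use the
# callback above. Every parsed switch is offered to the callback, and a False
# return means "this option is not mine". The helper and names below are
# illustrative only.
def _dispatch_options(parsed_options, elaborate):
    """Return the list of switches the callback did not recognise."""
    unknown = []
    for key, value in parsed_options:
        if not elaborate(key, value, parsed_options, []):
            unknown.append(key)
    return unknown
# e.g. _dispatch_options([("-a", ""), ("--bogus", "")],
#                        _task_submit_elaborate_specific_parameter)
# sets the 'process_all' option and reports "--bogus" as unknown.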
def compute_cache(pids):
    bibtask.write_message("WebAuthorProfile: %s persons to go" % len(pids),
                          stream=sys.stdout, verbose=0)
    for i, p in enumerate(pids):
        # Use the enumeration index rather than pids.index(p), which is O(n)
        # per iteration and wrong if a personid appears twice in the list.
        message = ("WebAuthorProfile: doing %s out of %s (personid: %s)"
                   % (i + 1, len(pids), p))
        bibtask.write_message(message)
        bibtask.task_update_progress(message)
        _compute_cache_for_person(p)
        bibtask.task_sleep_now_if_required(can_stop_too=True)
def compute_cache(pids):
    bibtask.write_message("WebAuthorProfile: %s persons to go" % len(pids),
                          stream=sys.stdout, verbose=0)
    for i, p in enumerate(pids):
        message = "WebAuthorProfile: doing %s out of %s" % (i + 1, len(pids))
        bibtask.write_message(message)
        bibtask.task_update_progress(message)
        _compute_cache_for_person(p)
        bibtask.task_sleep_now_if_required(can_stop_too=True)
def compute_cache_mp(pids):
    from multiprocessing import Pool
    p = Pool()
    bibtask.write_message("WebAuthorProfileMP: %s persons to go" % len(pids),
                          stream=sys.stdout, verbose=0)
    # Process the person ids in bunches of 100 so each Pool.map call stays
    # bounded and the task can yield to bibsched between bunches.
    sl = 100
    ss = [pids[i: i + sl] for i in range(0, len(pids), sl)]
    for i, bunch in enumerate(ss):
        message = "WebAuthorProfileMP: doing bunch %s out of %s" % (i + 1, len(ss))
        bibtask.write_message(message)
        bibtask.task_update_progress(message)
        p.map(_compute_cache_for_person, bunch)
        bibtask.task_sleep_now_if_required(can_stop_too=True)
    p.close()
    p.join()
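# Self-contained illustration of the bunching used above: the person ids are
# cut into fixed-size slices so that each Pool.map call stays bounded. Plain
# Python, no Invenio dependencies; the default of 100 mirrors `sl` above.
def chunk(items, size=100):
    """Yield consecutive slices of `items`, each at most `size` long."""
    for start in range(0, len(items), size):
        yield items[start:start + size]
# list(chunk(list(range(7)), 3)) -> [[0, 1, 2], [3, 4, 5], [6]]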
def compute_cache_mp(pids):
    from multiprocessing import Pool
    p = Pool()
    bibtask.write_message("WebAuthorProfileMP: %s persons to go" % len(pids),
                          stream=stdout, verbose=0)
    sl = 100
    ss = [pids[i: i + sl] for i in range(0, len(pids), sl)]
    for i, bunch in enumerate(ss):
        bibtask.write_message("WebAuthorProfileMP: doing bunch %s out of %s" % (str(i + 1), len(ss)))
        bibtask.task_update_progress("WebAuthorProfileMP: doing bunch %s out of %s" % (str(i + 1), len(ss)))
        p.map(_compute_cache_for_person, bunch)
        bibtask.task_sleep_now_if_required(can_stop_too=True)
def _task_submit_check_options():
    """
    Required by bibtask. Checks the options.
    """
    lastname = bibtask.task_get_option('lastname')
    process_all = bibtask.task_get_option('process_all')
    prepare_grid = bibtask.task_get_option('prepare_grid')
    load_grid = bibtask.task_get_option('load_grid_results')
    data_dir = bibtask.task_get_option('data_dir')
    prefix = bibtask.task_get_option('prefix')
    max_records = bibtask.task_get_option('max_records')
    update = bibtask.task_get_option('update')
    clean_cache = bibtask.task_get_option('clean_cache')
    update_cache = bibtask.task_get_option('update_cache')
    record_ids = bibtask.task_get_option('record_ids')
    all_records = bibtask.task_get_option('all_records')
    repair_pid = bibtask.task_get_option('repair_pid')

    if (record_ids and all_records):
        bibtask.write_message("ERROR: conflicting options: --record-ids and "
                              "--all-records cannot be specified at the same "
                              "time.", stream=sys.stdout, verbose=0)
        return False

    if (lastname == "None," or lastname == "None"):
        lastname = False

    if (not lastname and not process_all and not update
        and not prepare_grid and not load_grid and not clean_cache
        and not update_cache):
        bibtask.write_message("ERROR: One of the options -a, -n, -U, -G, -R, "
                              "--clean-cache, --update-cache is"
                              " required!", stream=sys.stdout, verbose=0)
        return False
    elif not (bool(lastname) ^ bool(process_all) ^ bool(update)
              ^ bool(prepare_grid) ^ bool(load_grid) ^ bool(clean_cache)
              ^ bool(update_cache) ^ bool(repair_pid)):
        bibtask.write_message("ERROR: Options -a -n -U -R -G --clean-cache "
                              "--update-cache --repair-personid are mutually"
                              " exclusive!", stream=sys.stdout, verbose=0)
        return False
    elif ((not prepare_grid and (data_dir or prefix or max_records)) and
          (not load_grid and (data_dir))):
        bibtask.write_message("ERROR: The options -d, -m and -p require -G or "
                              "-R to run!", stream=sys.stdout, verbose=0)
        return False
    elif load_grid and not bool(data_dir):
        bibtask.write_message("ERROR: The option -R requires the option -d "
                              "to run!", stream=sys.stdout, verbose=0)
        return False

    return True
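# Note: the chained ^ above is True whenever an odd number of flags is set, so
# three simultaneous options would still pass the check. A hedged alternative
# (not the original bibauthorid code) that enforces "exactly one" explicitly:
def _exactly_one_selected(*flags):
    """Return True if exactly one of the given flags is truthy."""
    return sum(1 for flag in flags if flag) == 1
# e.g. _exactly_one_selected(lastname, process_all, update, prepare_grid,
#                            load_grid, clean_cache, update_cache, repair_pid)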
def _run_update_authornames_tables_from_paper(record_ids=None, all_records=False):
    '''
    Runs the update on the papers which have been modified since the last run

    @note: This should be run as often as possible to keep authornames and
           authornames_bibrefs cache tables up to date.
    '''
    if not all_records and not record_ids:
        last_log = get_user_log(userinfo='daemon', action='UATFP', only_most_recent=True)
        if len(last_log) >= 1:
            #select only the most recent papers
            recently_modified, min_date = get_papers_recently_modified(date=last_log[0][2])
            insert_user_log('daemon', '-1', 'UATFP', 'bibsched', 'status', comment='bibauthorid_daemon, update_authornames_tables_from_paper', timestamp=min_date[0][0])

            if not recently_modified:
                bibtask.write_message("update_authornames_tables_from_paper: "
                                      "All names up to date.",
                                      stream=sys.stdout, verbose=0)
            else:
                bibtask.write_message("update_authornames_tables_from_paper: Running on %s papers " % str(len(recently_modified)), stream=sys.stdout, verbose=0)
                update_authornames_tables_from_paper(recently_modified)
        else:
            #this is the first time the utility is run, run on all the papers?
            #Probably better to write the log on the first authornames population
            #@todo: authornames population writes the log
            recently_modified, min_date = get_papers_recently_modified()
            insert_user_log('daemon', '-1', 'UATFP', 'bibsched', 'status', comment='bibauthorid_daemon, update_authornames_tables_from_paper', timestamp=min_date[0][0])
            bibtask.write_message("update_authornames_tables_from_paper: Running on %s papers " % str(len(recently_modified)), stream=sys.stdout, verbose=0)
            update_authornames_tables_from_paper(recently_modified)
    else:
        bibtask.write_message("update_authornames_tables_from_paper: Running "
                              "on all papers ",
                              stream=sys.stdout, verbose=0)
        update_authornames_tables_from_paper(record_ids)
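# The daemons above and below share one incremental-update pattern: remember
# the timestamp of the last logged run and only process entries modified since
# then. A hedged, Invenio-free sketch of that pattern (all names illustrative):
def incremental_run(get_last_run_date, get_modified_since, process, run_on_all):
    """Process only the entries modified since the last logged run."""
    last_run = get_last_run_date()      # e.g. last_log[0][2] above
    if last_run is None:
        run_on_all()                    # no checkpoint yet: first ever run
        return
    modified = get_modified_since(last_run)
    if modified:
        process(modified)               # otherwise everything is up to date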
def _task_submit_elaborate_specific_parameter(key, value, opts, args):
    """Given the string key it checks it's meaning, eventually using the
    value. Usually it fills some key in the options dict.
    It must return True if it has elaborated the key, False, if it doesn't
    know that key.
    eg:
    if key in ('-n', '--number'):
        bibtask.task_get_option(\1) = value
        return True
    return False
    """
    # Recid option
    if key in ("-i", "--recid"):
        try:
            value = int(value)
        except ValueError:
            bibtask.write_message("The value specified for --recid must be a "
                                  "valid integer, not '%s'." % value,
                                  stream=sys.stderr,
                                  verbose=0)
            return False
        if not _recid_exists(value):
            bibtask.write_message("ERROR: '%s' is not a valid record ID." %
                                  value,
                                  stream=sys.stderr,
                                  verbose=0)
            return False
        recids = bibtask.task_get_option('recids')
        if recids is None:
            recids = []
        recids.append(value)
        bibtask.task_set_option('recids', recids)

    # Collection option
    elif key in ("-c", "--collection"):
        if not _collection_exists(value):
            bibtask.write_message("ERROR: '%s' is not a valid collection." %
                                  value,
                                  stream=sys.stderr,
                                  verbose=0)
            return False
        collections = bibtask.task_get_option("collections")
        collections = collections or []
        collections.append(value)
        bibtask.task_set_option("collections", collections)

    # Taxonomy option
    elif key in ("-k", "--taxonomy"):
        if not _ontology_exists(value):
            bibtask.write_message("ERROR: '%s' is not a valid taxonomy name." %
                                  value,
                                  stream=sys.stderr,
                                  verbose=0)
            return False
        bibtask.task_set_option("taxonomy", value)
    elif key in ("-f", "--force"):
        bibtask.task_set_option("force", True)
    else:
        return False

    return True
def _task_submit_elaborate_specific_parameter(key, value, opts, args):
    """
    Given the string key it checks it's meaning, eventually using the
    value. Usually, it fills some key in the options dict.
    It must return True if it has elaborated the key, False, if it doesn't
    know that key.
    """
    if key in ("-n", "--lastname"):
        if value == "None," or value == "None":
            bibtask.write_message("The value specified for --lastname must "
                                  "be a valid name. Not '%s'." % value,
                                  stream=sys.stdout,
                                  verbose=0)
            return False

        bibtask.task_set_option('lastname', value)

    elif key in ("-a", "--process-all"):
        bibtask.task_set_option("process_all", True)

    elif key in ("-U", "--update-universe"):
        bibtask.task_set_option("update", True)

    elif key in ("-G", "--prepare-grid"):
        bibtask.task_set_option("prepare_grid", True)

    elif key in ("-R", "--load-grid-results"):
        bibtask.task_set_option("load_grid_results", True)

    elif key in ("-d", "--data-dir"):
        bibtask.task_set_option("data_dir", value)

    elif key in ("-p", "--prefix"):
        bibtask.task_set_option("prefix", value)

    elif key in ("-m", "--max-records"):
        bibtask.task_set_option("max_records", value)

    elif key in ("--update-cache", ):
        bibtask.task_set_option("update_cache", True)

    elif key in ("--clean-cache", ):
        bibtask.task_set_option("clean_cache", True)

    else:
        return False

    return True
def _task_submit_check_options():
    """Required by bibtask. Checks the options."""
    recids = bibtask.task_get_option('recids')
    collections = bibtask.task_get_option('collections')
    taxonomy = bibtask.task_get_option('taxonomy')

    # If a recid or a collection is specified, check that the taxonomy
    # is also specified.
    if (recids is not None or collections is not None) and \
        taxonomy is None:
        bibtask.write_message("ERROR: When specifying a record ID or a collection, "
            "you have to precise which\ntaxonomy to use.", stream=sys.stderr,
            verbose=0)
        return False

    return True
def _run_update_authornames_tables_from_paper():
    '''
    Runs the update on the papers which have been modified since the last run

    @note: This should be run as often as possible to keep authornames and
           authornames_bibrefs cache tables up to date.
    '''
    last_log = get_user_log(userinfo='daemon',
                            action='UATFP',
                            only_most_recent=True)
    if len(last_log) >= 1:
        #select only the most recent papers
        recently_modified, min_date = get_papers_recently_modified(
            date=last_log[0][2])
        insert_user_log(
            'daemon',
            '-1',
            'UATFP',
            'bibsched',
            'status',
            comment='bibauthorid_daemon, update_authornames_tables_from_paper',
            timestamp=min_date[0][0])
        bibtask.write_message(
            "update_authornames_tables_from_paper: Running on: " +
            str(recently_modified),
            stream=sys.stdout,
            verbose=0)
        update_authornames_tables_from_paper(recently_modified)
    else:
        #this is the first time the utility is run, run on all the papers?
        #Probably better to write the log on the first authornames population
        #@todo: authornames population writes the log
        recently_modified, min_date = get_papers_recently_modified()
        insert_user_log(
            'daemon',
            '-1',
            'UATFP',
            'bibsched',
            'status',
            comment='bibauthorid_daemon, update_authornames_tables_from_paper',
            timestamp=min_date[0][0])
        bibtask.write_message(
            "update_authornames_tables_from_paper: Running on: " +
            str(recently_modified),
            stream=sys.stdout,
            verbose=0)
        update_authornames_tables_from_paper(recently_modified)
def _run_update_personID_table_from_paper():
    '''
    Runs the update on the papers which have been modified since the last run
    This is removing no-longer existing papers from the personid table.

    @note: Update recommended monthly.
    @warning: quite resource intensive.
    '''
    last_log = get_user_log(userinfo='daemon',
                            action='UPITFP',
                            only_most_recent=True)
    if len(last_log) >= 1:
        #select only the most recent papers
        recently_modified, min_date = get_papers_recently_modified(
            date=last_log[0][2])
        insert_user_log(
            'daemon',
            '-1',
            'UPITFP',
            'bibsched',
            'status',
            comment='bibauthorid_daemon, update_personID_table_from_paper',
            timestamp=min_date[0][0])
        bibtask.write_message(
            "update_personID_table_from_paper: Running on: " +
            str(recently_modified),
            stream=sys.stdout,
            verbose=0)
        update_personID_table_from_paper(recently_modified)
    else:
        # Should not process all papers, hence authornames population writes
        # the appropriate log. In case the log is missing, process everything.
        recently_modified, min_date = get_papers_recently_modified()
        insert_user_log(
            'daemon',
            '-1',
            'UPITFP',
            'bibsched',
            'status',
            comment='bibauthorid_daemon, update_personID_table_from_paper',
            timestamp=min_date[0][0])
        bibtask.write_message(
            "update_personID_table_from_paper: Running on: " +
            str(recently_modified),
            stream=sys.stdout,
            verbose=0)
        update_personID_table_from_paper(recently_modified)
def _task_submit_check_options():
    """Required by bibtask. Checks the options."""
    recids = bibtask.task_get_option('recids')
    collections = bibtask.task_get_option('collections')
    taxonomy = bibtask.task_get_option('taxonomy')

    # If a recid or a collection is specified, check that the taxonomy
    # is also specified.
    if (recids is not None or collections is not None) and \
        taxonomy is None:
        bibtask.write_message(
            "ERROR: When specifying a record ID or a collection, "
            "you have to precise which\ntaxonomy to use.",
            stream=sys.stderr,
            verbose=0)
        return False

    return True
def _task_submit_check_options():
    """
    Required by bibtask. Checks the options.
    """
    lastname = bibtask.task_get_option('lastname')
    process_all = bibtask.task_get_option('process_all')
    prepare_grid = bibtask.task_get_option('prepare_grid')
    load_grid = bibtask.task_get_option('load_grid_results')
    data_dir = bibtask.task_get_option('data_dir')
    prefix = bibtask.task_get_option('prefix')
    max_records = bibtask.task_get_option('max_records')
    update = bibtask.task_get_option('update')
    clean_cache = bibtask.task_get_option('clean_cache')
    update_cache = bibtask.task_get_option('update_cache')

    if (lastname == "None," or lastname == "None"):
        lastname = False

    if (not lastname and not process_all and not update and not prepare_grid
            and not load_grid and not clean_cache and not update_cache):
        bibtask.write_message(
            "ERROR: One of the options -a, -n, -U, -G, -R, "
            "--clean-cache, --update-cache is"
            " required!",
            stream=sys.stdout,
            verbose=0)
        return False
    elif not (bool(lastname) ^ bool(process_all) ^ bool(update)
              ^ bool(prepare_grid) ^ bool(load_grid) ^ bool(clean_cache)
              ^ bool(update_cache)):
        bibtask.write_message(
            "ERROR: Options -a -n -U -R -G --clean-cache "
            "--update-cache are mutually"
            " exclusive!",
            stream=sys.stdout,
            verbose=0)
        return False
    elif ((not prepare_grid and (data_dir or prefix or max_records))
          and (not load_grid and (data_dir))):
        bibtask.write_message(
            "ERROR: The options -d, -m and -p require -G or "
            "-R to run!",
            stream=sys.stdout,
            verbose=0)
        return False
    elif load_grid and not bool(data_dir):
        bibtask.write_message(
            "ERROR: The option -R requires the option -d "
            "to run!",
            stream=sys.stdout,
            verbose=0)
        return False

    return True
def _task_submit_elaborate_specific_parameter(key, value, opts, args):
    """Given the string key it checks it's meaning, eventually using the
    value. Usually it fills some key in the options dict.
    It must return True if it has elaborated the key, False, if it doesn't
    know that key.
    eg:
    if key in ('-n', '--number'):
        bibtask.task_get_option(\1) = value
        return True
    return False
    """
    # Recid option
    if key in ("-i", "--recid"):
        try:
            value = int(value)
        except ValueError:
            bibtask.write_message("The value specified for --recid must be a "
                "valid integer, not '%s'." % value, stream=sys.stderr,
                verbose=0)
            return False
        if not _recid_exists(value):
            bibtask.write_message("ERROR: '%s' is not a valid record ID." % value,
                stream=sys.stderr, verbose=0)
            return False
        recids = bibtask.task_get_option('recids')
        if recids is None:
            recids = []
        recids.append(value)
        bibtask.task_set_option('recids', recids)

    # Collection option
    elif key in ("-c", "--collection"):
        if not _collection_exists(value):
            bibtask.write_message("ERROR: '%s' is not a valid collection." % value,
                stream=sys.stderr, verbose=0)
            return False
        collections = bibtask.task_get_option("collections")
        collections = collections or []
        collections.append(value)
        bibtask.task_set_option("collections", collections)

    # Taxonomy option
    elif key in ("-k", "--taxonomy"):
        if not _ontology_exists(value):
            bibtask.write_message("ERROR: '%s' is not a valid taxonomy name." % value,
                stream=sys.stderr, verbose=0)
            return False
        bibtask.task_set_option("taxonomy", value)
    elif key in ("-f", "--force"):
        bibtask.task_set_option("force", True)
    else:
        return False

    return True
def _task_submit_check_options():
    """
    Required by bibtask. Checks the options.
    """
    record_ids = bibtask.task_get_option('record_ids')
    all_records = bibtask.task_get_option('all_records')
    repair_pid = bibtask.task_get_option('repair_pid')
    fast_update_personid = bibtask.task_get_option('fast_update_personid')
    personid_gc = bibtask.task_get_option('personid_gc')

    params = bool(record_ids) + bool(all_records)
    if params > 1:
        bibtask.write_message("ERROR: conflicting options: --record-ids and "
                              "--all-records cannot be specified at the same "
                              "time.", stream=sys.stdout, verbose=0)
        return False

    if record_ids:
        for iden in record_ids:
            if not iden.isdigit():
                bibtask.write_message("ERROR: Record_ids expects numbers. "
                                      "Provided: %s." % iden)
                return False

    opts = bool(repair_pid) + bool(fast_update_personid) + bool(personid_gc)
    if opts == 0:
        bibtask.write_message("ERROR: One of the options --fast-update-personid, "
                              "--personid-gc, --repair-personid is required!"
                              , stream=sys.stdout, verbose=0)
        return False
    elif opts > 1:
        bibtask.write_message("ERROR: Options --fast-update-personid, "
                              "--personid-gc, --repair-personid "
                              "are mutually exclusive!", stream=sys.stdout, verbose=0)
        return False

    if repair_pid and params:
        bibtask.write_message("ERROR: --repair_pid does not require any parameters!"
                              , stream=sys.stdout, verbose=0)
        return False

    return True
def _write_to_files(work_dir, job_lnames):
    '''
    Wrapper function around the internal write process.
    Triggers the write-back of the mem cache to the files.

    @param work_dir: where shall the files be stored?
    @type work_dir: string
    @param job_lnames: list of names
    @type job_lnames: list
    '''
    bibtask.task_update_progress('Writing to files in %s' % (work_dir))
    bibtask.write_message("Writing cluster with %s entries to "
                          "files in %s"
                          % (len(dat.RELEVANT_RECORDS), work_dir,),
                            stream=sys.stdout, verbose=0)

    if not os.path.exists(work_dir):
        os.mkdir(work_dir)

    write_mem_cache_to_files(work_dir, job_lnames)
    dat.reset_mem_cache(True)
def run_rabbit(paperslist, all_records=False):
    if not paperslist and all_records:
        rabbit_with_log(None, True, 'bibauthorid_daemon, update_personid on all papers')
    elif not paperslist:
        last_log = get_user_log(userinfo='daemon', action='PID_UPDATE', only_most_recent=True)

        if len(last_log) >= 1:
            #select only the most recent papers
            recently_modified = get_recently_modified_record_ids(date=last_log[0][2])
            if not recently_modified:
                bibtask.write_message("update_personID_table_from_paper: "
                                      "All person entities up to date.",
                                      stream=sys.stdout, verbose=0)
            else:
                bibtask.write_message("update_personID_table_from_paper: Running on: " +
                                      str(recently_modified), stream=sys.stdout, verbose=0)
                rabbit_with_log(recently_modified, True, 'bibauthorid_daemon, run_personid_fast_assign_papers on '
                                                 + str([paperslist, all_records, recently_modified]))
        else:
            rabbit_with_log(None, True, 'bibauthorid_daemon, update_personid on all papers')
    else:
        rabbit_with_log(paperslist, True, 'bibauthorid_daemon, personid_fast_assign_papers on ' + str(paperslist), partial=True)
def _run_update_personID_table_from_paper(record_ids=None, all_records=False):
    """
    Runs the update on the papers which have been modified since the last run
    This is removing no-longer existing papers from the personid table.

    @note: Update recommended monthly.
    @warning: quite resource intensive.
    """
    if not record_ids and not all_records:
        last_log = get_user_log(userinfo="daemon", action="UPITFP", only_most_recent=True)
        if len(last_log) >= 1:
            # select only the most recent papers
            recently_modified, min_date = get_papers_recently_modified(date=last_log[0][2])
            insert_user_log(
                "daemon",
                "-1",
                "UPITFP",
                "bibsched",
                "status",
                comment="bibauthorid_daemon, update_personID_table_from_paper",
                timestamp=min_date[0][0],
            )

            if not recently_modified:
                bibtask.write_message(
                    "update_personID_table_from_paper: " "All person entities up to date.", stream=sys.stdout, verbose=0
                )
            else:
                bibtask.write_message(
                    "update_personID_table_from_paper: Running on: " + str(recently_modified),
                    stream=sys.stdout,
                    verbose=0,
                )
                update_personID_table_from_paper(recently_modified)
        else:
            # Should not process all papers, hence authornames population writes
            # the appropriate log. In case the log is missing, process everything.
            recently_modified, min_date = get_papers_recently_modified()
            insert_user_log(
                "daemon",
                "-1",
                "UPITFP",
                "bibsched",
                "status",
                comment="bibauthorid_daemon, update_personID_table_from_paper",
                timestamp=min_date[0][0],
            )
            bibtask.write_message(
                "update_personID_table_from_paper: Running on: " + str(recently_modified), stream=sys.stdout, verbose=0
            )
            update_personID_table_from_paper(recently_modified)
        # @todo: develop a method that removes the respective VAs from the database
        # as well since no reference will be there for them any longer. VAs can be
        # found by searching for the authornames ID in the VA table. The
        # method has to kill RA data based on the VA (cf. del_ra_data_by_vaid in
        # ra utils as a reference), all VA2RA links, all VA data, all VAs and
        # finally all doclist refs that point to the respective bibrefs.
    else:
        update_personID_table_from_paper(record_ids)
def _run_update_personID_table_from_paper():
    '''
    Runs the update on the papers which have been modified since the last run
    This is removing no-longer existing papers from the personid table.

    @note: Update recommended monthly.
    @warning: quite resource intensive.
    '''
    last_log = get_user_log(userinfo='daemon', action='UPITFP', only_most_recent=True)
    if len(last_log) >= 1:
        #select only the most recent papers
        recently_modified, min_date = get_papers_recently_modified(date=last_log[0][2])
        insert_user_log('daemon', '-1', 'UPITFP', 'bibsched', 'status', comment='bibauthorid_daemon, update_personID_table_from_paper', timestamp=min_date[0][0])
        bibtask.write_message("update_personID_table_from_paper: Running on: " + str(recently_modified), stream=sys.stdout, verbose=0)
        update_personID_table_from_paper(recently_modified)
    else:
        # Should not process all papers, hence authornames population writes
        # the appropriate log. In case the log is missing, process everything.
        recently_modified, min_date = get_papers_recently_modified()
        insert_user_log('daemon', '-1', 'UPITFP', 'bibsched', 'status', comment='bibauthorid_daemon, update_personID_table_from_paper', timestamp=min_date[0][0])
        bibtask.write_message("update_personID_table_from_paper: Running on: " + str(recently_modified), stream=sys.stdout, verbose=0)
        update_personID_table_from_paper(recently_modified)
def _run_personid_gc(paperslist, all_records=False):
#    insert_user_log('daemon', '-1', 'PGC', 'bibsched', 'status',
#                    comment='bibauthorid_daemon, personid_gc (update_personid_from_papers) on '
#                    + str(paperslist))
    if not paperslist and all_records:
        #update_authornames_tables_from_paper()
        insert_user_log('daemon', '-1', 'PGC', 'bibsched', 'status',
            comment='bibauthorid_daemon, personid_gc (update_personid_from_papers) on all papers')
        update_personID_table_from_paper()
    elif not paperslist:
        last_log = get_user_log(userinfo='daemon', action='PGC', only_most_recent=True)
        if len(last_log) >= 1:
            #select only the most recent papers
            recently_modified, min_date = get_papers_recently_modified(date=last_log[0][2])
            insert_user_log('daemon', '-1', 'PGC', 'bibsched', 'status',
                            comment='bibauthorid_daemon, update_personid_from_papers on '
                            + str([paperslist, all_records, recently_modified]),
                            timestamp=min_date[0][0])

            if not recently_modified:
                bibtask.write_message("update_personID_table_from_paper: "
                                      "All person entities up to date.",
                                      stream=sys.stdout, verbose=0)
            else:
                bibtask.write_message("update_personID_table_from_paper: Running on: " +
                                      str(recently_modified), stream=sys.stdout, verbose=0)
                personid_fast_assign_papers(recently_modified)
        else:
            insert_user_log('daemon', '-1', 'PGC', 'bibsched', 'status',
            comment='bibauthorid_daemon, personid_gc (update_personid_from_papers) on all papers')
            update_personID_table_from_paper()
    else:
        insert_user_log('daemon', '-1', 'PGC', 'bibsched', 'status',
                comment='bibauthorid_daemon, personid_gc (update_personid_from_papers) on '
                + str(paperslist))
        update_authornames_tables_from_paper(paperslist)
        update_personID_table_from_paper(paperslist)
def _write_to_files(work_dir, job_lnames):
    '''
    Wrapper function around the internal write process.
    Triggers the write-back of the mem cache to the files.

    @param work_dir: where shall the files be stored?
    @type work_dir: string
    @param job_lnames: list of names
    @type job_lnames: list
    '''
    bibtask.task_update_progress('Writing to files in %s' % (work_dir))
    bibtask.write_message("Writing cluster with %s entries to "
                          "files in %s" % (
                              len(dat.RELEVANT_RECORDS),
                              work_dir,
                          ),
                          stream=sys.stdout,
                          verbose=0)

    if not os.path.exists(work_dir):
        os.mkdir(work_dir)

    write_mem_cache_to_files(work_dir, job_lnames)
    dat.reset_mem_cache(True)
def _write_data_files_to_db(data_dir_name):
    '''
    Reads all the files of a specified directory and writes the content
    to the memory cache and from there to the database.

    @param data_dir_name: Directory where to look for the files
    @type data_dir_name: string
    '''

    if data_dir_name.endswith("/"):
        data_dir_name = data_dir_name[0:-1]

    if not data_dir_name:
        bibtask.write_message("Data directory not specified. Task failed.",
                              stream=sys.stdout, verbose=0)
        return False

    if not osp.isdir(data_dir_name):
        bibtask.write_message("Specified Data directory is not a directory. "
                              "Task failed.",
                              stream=sys.stdout, verbose=0)
        return False

    job_dirs = os.listdir(data_dir_name)

    total = len(job_dirs)
    status = 0

    for job_dir in job_dirs:
        status += 1
        job_dir = "%s/%s" % (data_dir_name, job_dir)

        if not osp.isdir(job_dir):
            bibtask.write_message("This is not a directory and therefore "
                                  "skipped: %s." % job_dir,
                              stream=sys.stdout, verbose=0)
            continue

        results_dir = "%s/results/" % (job_dir,)

        if not osp.isdir(results_dir):
            bibtask.write_message("No result set found in %s"
                                  % (results_dir,), stream=sys.stdout,
                                  verbose=0)
            continue

        log_name = osp.abspath(job_dir).split("/")
        logfile = "%s/%s.log" % (job_dir, log_name[-1])
        logfile_lastline = ""

        if not osp.isfile(logfile):
            bibtask.write_message("No log file found in %s" % (job_dir,),
                                  stream=sys.stdout, verbose=0)
            continue

        try:
            logfile_lastline = tail(logfile)
        except IOError:
            logfile_lastline = ""

        if logfile_lastline.count("Finish! The computation finished in") < 1:
            bibtask.write_message("Log file indicates broken results for %s"
                                  % (job_dir,), stream=sys.stdout, verbose=0)
            continue

        correct_files = set(['realauthors.dat',
                             'ids.dat',
                             'virtual_author_clusters.dat',
                             'virtual_authors.dat',
                             'doclist.dat',
                             'virtual_author_data.dat',
                             'authornames.dat',
                             'virtual_author_cluster_cache.dat',
                             'realauthor_data.dat',
                             'ra_va_cache.dat']
                            )
        result_files = os.listdir(results_dir)

        if not correct_files.issubset(set(result_files)):
            bibtask.write_message("Reults folder does not hold the "
                                  "correct files: %s" % (results_dir,),
                                  stream=sys.stdout, verbose=0)
            continue

        bibtask.task_update_progress('Loading job %s of %s: %s'
                                     % (status, total, log_name[-1]))

        if (populate_structs_from_files(results_dir, results=True) and
            write_mem_cache_to_tables(sanity_checks=True)):
            bibtask.write_message("All Done.",
                                  stream=sys.stdout, verbose=0)
        else:
            bibtask.write_message("Could not write data to the tables from %s"
                                  % (results_dir,),
                                  stream=sys.stdout, verbose=0)
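# The loader above relies on a `tail` helper that is not shown in this excerpt.
# A minimal sketch of what it is assumed to do (return the last non-empty line
# of a text file); the real Invenio helper may well differ:
def tail(path):
    """Return the last non-empty line of the file at `path`, or '' if none."""
    last = ""
    with open(path) as logfile:
        for line in logfile:
            if line.strip():
                last = line.strip()
    return last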
def _task_run_core():
    """
    Runs the requested task in the bibsched environment.
    """

    lastname = bibtask.task_get_option('lastname')
    process_all = bibtask.task_get_option('process_all')
    prepare_grid = bibtask.task_get_option('prepare_grid')
    load_grid = bibtask.task_get_option('load_grid_results')
    data_dir = bibtask.task_get_option('data_dir')
    prefix = bibtask.task_get_option('prefix')
    max_records_option = bibtask.task_get_option('max_records')
    update = bibtask.task_get_option('update')
    clean_cache = bibtask.task_get_option('clean_cache')
    update_cache = bibtask.task_get_option('update_cache')

#    automated_daemon_mode_p = True

    if lastname:
        bibtask.write_message("Processing last name %s" % (lastname),
                              stream=sys.stdout, verbose=0)

    if process_all:
        if bconfig.STANDALONE:
            bibtask.write_message("Processing not possible in standalone!",
                                  stream=sys.stdout, verbose=0)
            return 0

        bibtask.write_message("Processing all names...",
                              stream=sys.stdout, verbose=0)

        lengths = get_len_authornames_bibrefs()

        if not check_and_create_aid_tables():
            bibtask.write_message("Failed to create database tables!",
                                  stream=sys.stdout, verbose=0)
            return 0

        if lengths['names'] < 1:
            bibtask.write_message("Populating Authornames table. It's Empty.",
                                  stream=sys.stdout, verbose=0)
            bibtask.task_update_progress('Populating Authornames table.')
            populate_authornames()
            insert_user_log('daemon', '-1', 'UATFP', 'bibsched', 'status',
                            comment='bibauthorid_daemon, '
                            'update_authornames_tables_from_paper')


        if lengths['bibrefs'] < 1:
            bibtask.write_message("Populating Bibrefs lookup. It's Empty.",
                                  stream=sys.stdout, verbose=0)
            bibtask.task_update_progress('Populating Bibrefs lookup table.')
            populate_authornames_bibrefs_from_authornames()

        bibtask.task_update_progress('Processing all authors.')
        start_full_disambiguation(last_names="all",
                                 process_orphans=True,
                                 db_exists=False,
                                 populate_doclist=True,
                                 write_to_db=True)
        update_personID_from_algorithm()
        insert_user_log('daemon', '-1', 'update_aid', 'bibsched', 'status',
                    comment='bibauthorid_daemon, update_authorid_universe')

    if prepare_grid:
        bibtask.write_message("Preparing Grid Job",
                              stream=sys.stdout, verbose=0)
        data_dir_name = "grid_data"
        workdir_prefix = "job"
        max_records = 4000

        if data_dir:
            data_dir_name = data_dir

        if prefix:
            workdir_prefix = prefix

        if max_records_option:
            max_records = max_records_option

        _prepare_data_files_from_db(data_dir_name, workdir_prefix, max_records)

    if load_grid:
        bibtask.write_message("Reading Grid Job results and will write"
                              " them to the database.",
                              stream=sys.stdout, verbose=0)

        _write_data_files_to_db(data_dir)

    if update or update_cache:
        bibtask.write_message("update-cache: Processing recently updated"
                              " papers", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress('update-cache: Processing recently'
                                     ' updated papers')
        _run_update_authornames_tables_from_paper()
        bibtask.write_message("update-cache: Finished processing papers",
                              stream=sys.stdout, verbose=0)
        bibtask.task_update_progress('update-cache: DONE')

    if update:
        bibtask.write_message("updating authorid universe",
                              stream=sys.stdout, verbose=0)
        bibtask.task_update_progress('updating authorid universe')
        _update_authorid_universe()
        bibtask.write_message("done updating authorid universe",
                              stream=sys.stdout, verbose=0)
        bibtask.task_update_progress('done updating authorid universe')

    if clean_cache:
        bibtask.write_message("clean-cache: Processing recently updated"
                              " papers", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress('clean-cache: Processing recently updated'
                                     ' papers for names')
        _run_authornames_tables_gc()
        bibtask.write_message("update-cache: Finished cleaning authornames "
                              "tables", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress('clean-cache: Processing recently updated'
                                     ' papers for persons')
        _run_update_personID_table_from_paper()
        bibtask.write_message("update-cache: Finished cleaning PersonID"
                              " table", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress('clean-cache: DONE')

    return 1
def _task_run_core():
    """
    Runs the requested task in the bibsched environment.
    """

    lastname = bibtask.task_get_option("lastname")
    process_all = bibtask.task_get_option("process_all")
    prepare_grid = bibtask.task_get_option("prepare_grid")
    load_grid = bibtask.task_get_option("load_grid_results")
    data_dir = bibtask.task_get_option("data_dir")
    prefix = bibtask.task_get_option("prefix")
    max_records_option = bibtask.task_get_option("max_records")
    update = bibtask.task_get_option("update")
    clean_cache = bibtask.task_get_option("clean_cache")
    update_cache = bibtask.task_get_option("update_cache")
    record_ids = bibtask.task_get_option("record_ids")
    record_ids_nested = None
    all_records = bibtask.task_get_option("all_records")
    repair_pid = bibtask.task_get_option("repair_pid")
    fast_update_personid = bibtask.task_get_option("fast_update_personid")

    if record_ids:
        record_ids_nested = [[p] for p in record_ids]

    if fast_update_personid:
        fast_update_personid = [[p] for p in fast_update_personid]
    #    automated_daemon_mode_p = True

    if lastname:
        bibtask.write_message("Processing last name %s" % (lastname), stream=sys.stdout, verbose=0)

    if process_all:
        if bconfig.STANDALONE:
            bibtask.write_message("Processing not possible in standalone!", stream=sys.stdout, verbose=0)
            return 0

        bibtask.write_message("Processing all names...", stream=sys.stdout, verbose=0)

        lengths = get_len_authornames_bibrefs()

        if not check_and_create_aid_tables():
            bibtask.write_message("Failed to create database tables!", stream=sys.stdout, verbose=0)
            return 0

        if lengths["names"] < 1:
            bibtask.write_message("Populating Authornames table. It's Empty.", stream=sys.stdout, verbose=0)
            bibtask.task_update_progress("Populating Authornames table.")
            populate_authornames()
            insert_user_log(
                "daemon",
                "-1",
                "UATFP",
                "bibsched",
                "status",
                comment="bibauthorid_daemon, " "update_authornames_tables_from_paper",
            )

        if lengths["bibrefs"] < 1:
            bibtask.write_message("Populating Bibrefs lookup. It's Empty.", stream=sys.stdout, verbose=0)
            bibtask.task_update_progress("Populating Bibrefs lookup table.")
            populate_authornames_bibrefs_from_authornames()

        bibtask.task_update_progress("Processing all authors.")
        start_full_disambiguation(
            last_names="all", process_orphans=True, db_exists=False, populate_doclist=True, write_to_db=True
        )
        update_personID_from_algorithm()
        insert_user_log(
            "daemon", "-1", "update_aid", "bibsched", "status", comment="bibauthorid_daemon, update_authorid_universe"
        )

    if prepare_grid:
        bibtask.write_message("Preparing Grid Job", stream=sys.stdout, verbose=0)
        data_dir_name = "grid_data"
        workdir_prefix = "job"
        max_records = 4000

        if data_dir:
            data_dir_name = data_dir

        if prefix:
            workdir_prefix = prefix

        if max_records_option:
            max_records = max_records_option

        _prepare_data_files_from_db(data_dir_name, workdir_prefix, max_records)

    if load_grid:
        bibtask.write_message(
            "Reading Grid Job results and will write" " them to the database.", stream=sys.stdout, verbose=0
        )

        _write_data_files_to_db(data_dir)

    if update or update_cache:
        bibtask.write_message("update-cache: Processing recently updated" " papers", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress("update-cache: Processing recently" " updated papers")
        _run_update_authornames_tables_from_paper(record_ids_nested, all_records)
        bibtask.write_message("update-cache: Finished processing papers", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress("update-cache: DONE")

    if update:
        bibtask.write_message("updating authorid universe", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress("updating authorid universe")
        _update_authorid_universe(record_ids, all_records)
        bibtask.write_message("done updating authorid universe", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress("done updating authorid universe")

    if clean_cache:
        bibtask.write_message("clean-cache: Processing recently updated" " papers", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress("clean-cache: Processing recently updated" " papers for names")
        _run_authornames_tables_gc()
        bibtask.write_message("update-cache: Finished cleaning authornames " "tables", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress("clean-cache: Processing recently updated" " papers for persons")
        _run_update_personID_table_from_paper(record_ids_nested, all_records)
        bibtask.write_message("update-cache: Finished cleaning PersonID" " table", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress("clean-cache: DONE")

    if repair_pid:
        bibtask.task_update_progress("Updating names cache...")
        _run_update_authornames_tables_from_paper()
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        bibtask.task_update_progress("Removing person entities not touched by " "humans...")
        personid_remove_automatically_assigned_papers()
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        bibtask.task_update_progress("Updating person entities...")
        update_personID_from_algorithm()
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        bibtask.task_update_progress("Cleaning person tables...")
        _run_update_personID_table_from_paper()
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        bibtask.task_update_progress("All repairs done.")

    if fast_update_personid:
        bibtask.task_update_progress("Updating personid...")
        _run_personid_fast_assign_papers(fast_update_personid)
        bibtask.task_update_progress("Update finished...")
        # TODO: remember to pass the papers list!
    return 1
def _task_submit_check_options():
    """
    Required by bibtask. Checks the options.
    """
    update_personid = bibtask.task_get_option("update_personid")
    disambiguate = bibtask.task_get_option("disambiguate")
    merge = bibtask.task_get_option("merge")

    record_ids = bibtask.task_get_option("record_ids")
    all_records = bibtask.task_get_option("all_records")
    from_scratch = bibtask.task_get_option("from_scratch")

    commands = bool(update_personid) + bool(disambiguate) + bool(merge)

    if commands == 0:
        bibtask.write_message("ERROR: At least one command should be specified!"
                              , stream=sys.stdout, verbose=0)
        return False

    if commands > 1:
        bibtask.write_message("ERROR: The options --update-personid, --disambiguate "
                              "and --merge are mutually exclusive."
                              , stream=sys.stdout, verbose=0)
        return False

    assert commands == 1

    if update_personid:
        if any((from_scratch,)):
            bibtask.write_message("ERROR: The only options which can be specified "
                                  "with --update-personid are --record-ids and "
                                  "--all-records"
                                  , stream=sys.stdout, verbose=0)
            return False

        options = bool(record_ids) + bool(all_records)
        if options > 1:
            bibtask.write_message("ERROR: conflicting options: --record-ids and "
                                  "--all-records are mutually exclusive."
                                  , stream=sys.stdout, verbose=0)
            return False

        if record_ids:
            for iden in record_ids:
                if not iden.isdigit():
                    bibtask.write_message("ERROR: Record_ids expects numbers. "
                                          "Provided: %s." % iden)
                    return False

    if disambiguate:
        if any((record_ids, all_records)):
            bibtask.write_message("ERROR: The only option which can be specified "
                                  "with --disambiguate is from-scratch"
                                  , stream=sys.stdout, verbose=0)
            return False

    if merge:
        if any((record_ids, all_records, from_scratch)):
            bibtask.write_message("ERROR: There are no options which can be "
                                  "specified along with --merge"
                                  , stream=sys.stdout, verbose=0)
            return False

    return True
def _run_update_personID_table_from_paper(record_ids=None, all_records=False):
    '''
    Runs the update on the papers which have been modified since the last run
    This is removing no-longer existing papers from the personid table.

    @note: Update recommended monthly.
    @warning: quite resource intensive.
    '''
    if not record_ids and not all_records:
        last_log = get_user_log(userinfo='daemon',
                                action='UPITFP',
                                only_most_recent=True)
        if len(last_log) >= 1:
            #select only the most recent papers
            recently_modified, min_date = get_papers_recently_modified(
                date=last_log[0][2])
            insert_user_log(
                'daemon',
                '-1',
                'UPITFP',
                'bibsched',
                'status',
                comment='bibauthorid_daemon, update_personID_table_from_paper',
                timestamp=min_date[0][0])

            if not recently_modified:
                bibtask.write_message(
                    "update_personID_table_from_paper: "
                    "All person entities up to date.",
                    stream=sys.stdout,
                    verbose=0)
            else:
                bibtask.write_message(
                    "update_personID_table_from_paper: Running on: " +
                    str(recently_modified),
                    stream=sys.stdout,
                    verbose=0)
                update_personID_table_from_paper(recently_modified)
        else:
            # Should not process all papers, hence authornames population writes
            # the appropriate log. In case the log is missing, process everything.
            recently_modified, min_date = get_papers_recently_modified()
            insert_user_log(
                'daemon',
                '-1',
                'UPITFP',
                'bibsched',
                'status',
                comment='bibauthorid_daemon, update_personID_table_from_paper',
                timestamp=min_date[0][0])
            bibtask.write_message(
                "update_personID_table_from_paper: Running on: " +
                str(recently_modified),
                stream=sys.stdout,
                verbose=0)
            update_personID_table_from_paper(recently_modified)
        # @todo: develop a method that removes the respective VAs from the database
        # as well since no reference will be there for them any longer. VAs can be
        # found by searching for the authornames ID in the VA table. The
        # method has to kill RA data based on the VA (cf. del_ra_data_by_vaid in
        # ra utils as a reference), all VA2RA links, all VA data, all VAs and
        # finally all doclist refs that point to the respective bibrefs.
    else:
        update_personID_table_from_paper(record_ids)
def _task_run_core():
    """Runs analyse_documents for each ontology, collection, record ids
    set."""

    automated_daemon_mode_p = True
    recids = bibtask.task_get_option('recids')
    collections = bibtask.task_get_option('collections')
    taxonomy = bibtask.task_get_option('taxonomy')

    if recids or collections:
        # We want to run some records/collection only, so we are not
        # in the automated daemon mode; this will be useful later.
        automated_daemon_mode_p = False

    # Check if the user specified which documents to extract keywords from.
    if recids:
        onto_recids = _get_recids_foreach_ontology(recids=recids,
            taxonomy=taxonomy)
    elif collections:
        onto_recids = _get_recids_foreach_ontology(collections=collections,
            taxonomy=taxonomy)
    else:
        onto_recids = _get_recids_foreach_ontology()

    if not onto_recids:
        # Nothing to do.
        if automated_daemon_mode_p:
            _update_date_of_last_run(bibtask.task_get_task_param('task_starting_time'))
        return 1

    # We will write to a temporary file as we go, because we might be processing
    # big collections with many docs
    _rid = time.strftime("%Y%m%d%H%M%S", time.localtime())
    abs_path = bibclassify_engine.get_tmp_file(_rid)
    fo = open(abs_path, 'w')


    fo.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    fo.write('<collection xmlns="http://www.loc.gov/MARC21/slim">\n')

    # Count the total number of records in order to update the progression.
    global _RECIDS_NUMBER
    for onto_rec in onto_recids:
        _RECIDS_NUMBER += len(onto_rec['recIDs'])

    rec_added = False

    for onto_rec in onto_recids:
        bibtask.task_sleep_now_if_required(can_stop_too=False)

        if onto_rec['collection'] is not None:
            bibtask.write_message('INFO: Applying taxonomy %s to collection %s (%s '
                'records)' % (onto_rec['ontology'], onto_rec['collection'],
                len(onto_rec['recIDs'])), stream=sys.stderr, verbose=3)
        else:
            bibtask.write_message('INFO: Applying taxonomy %s to recIDs %s. ' %
                (onto_rec['ontology'],
                ', '.join([str(recid) for recid in onto_rec['recIDs']])),
                stream=sys.stderr, verbose=3)
        if onto_rec['recIDs']:
            xml = _analyze_documents(onto_rec['recIDs'],
                onto_rec['ontology'], onto_rec['collection'])
            if len(xml) > 5:
                fo.write(xml)
                rec_added = True

    fo.write('</collection>\n')
    fo.close()

    # Apply the changes.
    if rec_added:
        if bconfig.CFG_DB_SAVE_KW:
            bibclassify_webinterface.upload_keywords(abs_path)
        else:
            bibtask.write_message("INFO: CFG_DB_SAVE_KW is false, we don't save results",
                                  stream=sys.stderr, verbose=0)
    else:
        bibtask.write_message("WARNING: No keywords found, recids: %s" % onto_recids,
                                  stream=sys.stderr, verbose=0)
        os.remove(abs_path)

    # Update the date of last run in the clsMETHOD table, but only if
    # we were running in an automated mode.
    if automated_daemon_mode_p:
        _update_date_of_last_run(bibtask.task_get_task_param('task_starting_time'))
    return 1
def _get_recids_foreach_ontology(recids=None, collections=None, taxonomy=None):
    """Returns an array containing hash objects containing the
    collection, its corresponding ontology and the records belonging to
    the given collection."""
    rec_onts = []

    # User specified record IDs.
    if recids:
        rec_onts.append({
            'ontology': taxonomy,
            'collection': None,
            'recIDs': recids,
        })
        return rec_onts

    # User specified collections.
    if collections:
        for collection in collections:
            records = get_collection_reclist(collection)
            if records:
                rec_onts.append({
                    'ontology': taxonomy,
                    'collection': collection,
                    'recIDs': records
                })
        return rec_onts

    # Use rules found in collection_clsMETHOD.
    result = run_sql(
        "SELECT clsMETHOD.name, clsMETHOD.last_updated, "
        "collection.name FROM clsMETHOD JOIN collection_clsMETHOD ON "
        "clsMETHOD.id=id_clsMETHOD JOIN collection ON "
        "id_collection=collection.id")

    for ontology, date_last_run, collection in result:
        records = get_collection_reclist(collection)
        if records:
            if not date_last_run:
                bibtask.write_message(
                    "INFO: Collection %s has not been previously "
                    "analyzed." % collection,
                    stream=sys.stderr,
                    verbose=3)
                modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
            elif bibtask.task_get_option('force'):
                bibtask.write_message(
                    "INFO: Analysis is forced for collection %s." % collection,
                    stream=sys.stderr,
                    verbose=3)
                modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
            else:
                modified_records = intbitset(
                    run_sql(
                        "SELECT id FROM bibrec "
                        "WHERE modification_date >= %s", (date_last_run, )))

            records &= modified_records
            if records:
                rec_onts.append({
                    'ontology': ontology,
                    'collection': collection,
                    'recIDs': records
                })
            else:
                bibtask.write_message(
                    "WARNING: All records from collection '%s' have "
                    "already been analyzed for keywords with ontology '%s' "
                    "on %s." % (collection, ontology, date_last_run),
                    stream=sys.stderr,
                    verbose=2)
        else:
            bibtask.write_message(
                "ERROR: Collection '%s' doesn't contain any record. "
                "Cannot analyse keywords." % (collection, ),
                stream=sys.stderr,
                verbose=0)

    return rec_onts
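# Illustrative sketch (values are placeholders only): shape of the value
# returned by _get_recids_foreach_ontology(), a list of dicts with exactly
# the three keys consumed by the callers above.  In practice 'recIDs' is an
# intbitset (or the user-supplied recid list) and 'collection' is None when
# explicit record IDs were given.
_EXAMPLE_REC_ONTS = [
    {'ontology': 'HEP',          # taxonomy name passed to _analyze_documents
     'collection': 'Articles',   # collection the records came from, or None
     'recIDs': [1, 2, 3]},       # records still to be analyzed
]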
def _prepare_data_files_from_db(data_dir_name="grid_data",
                                workdir_prefix="job",
                                max_records=4000):
    '''
    Prepares grid jobs. Is a task running in bibsched.
    Meaning:
        1. Find all last names in the database
        2. For each last name:
            - find all documents regarding this last name (ignore first names)
            - if number of documents loaded into memory exceeds max_records,
              write the memory cache into files (cf. Files section).
              Each write back procedure will happen into a newly created
              directory. The prefix for the respective job directory may
              be specified as well as the name of the data directory where
              these job directories will be created.
    Files:
        - authornames.dat
        - virtual_authors.dat
        - virtual_author_data.dat
        - virtual_author_clusters.dat
        - virtual_author_cluster_cache.dat
        - realauthors.dat
        - realauthor_data.dat
        - doclist.dat
        - records.dat
        - ids.dat
        - ra_va_cache.dat

    @param data_dir_name: the name of the directory that will hold all the
        sub directories for the jobs.
    @type data_dir_name: string
    @param workdir_prefix: prefix for the job sub directories.
    @type workdir_prefix: string
    @param max_records: maximum number of records after which the memory
        cache is to be flushed to files.
    @type max_records: int
    '''
    try:
        max_records = int(max_records)
    except ValueError:
        max_records = 4000

    bibtask.write_message("Loading last names", stream=sys.stdout, verbose=0)
    bibtask.write_message("Limiting files to %s records" % (max_records, ),
                          stream=sys.stdout,
                          verbose=0)
    bibtask.task_update_progress('Loading last names...')

    last_names = find_all_last_names()
    last_name_queue = Queue.Queue()

    for last_name in sorted(last_names):
        last_name_queue.put(last_name)

    total = len(last_names)
    status = 1
    bibtask.write_message("Done. Loaded %s last names." % (total),
                          stream=sys.stdout,
                          verbose=0)
    job_id = 0
    data_dir = ""

    if data_dir_name.startswith("/"):
        data_dir = data_dir_name
    else:
        data_dir = "%s/%s/" % (bconfig.FILE_PATH, data_dir_name)

    if not data_dir.endswith("/"):
        data_dir = "%s/" % (data_dir, )

    job_lnames = []

    while True:
        if last_name_queue.empty():
            bibtask.write_message("Done with all names.",
                                  stream=sys.stdout,
                                  verbose=0)
            break

        lname_list = last_name_queue.get()
        lname = None

        if lname_list:
            lname = lname_list[0]
            del (lname_list[0])
        else:
            bconfig.LOGGER.warning("Got an empty Queue element. "
                                   "Queue seems corrupted.")
            continue

        job_lnames.append(lname)
        bibtask.task_update_progress('Preparing job %d of %d: %s.' %
                                     (status, total, lname))
        bibtask.write_message(
            ("Processing: %s (%d/%d).") % (lname, status, total),
            stream=sys.stdout,
            verbose=0)

        populate_doclist_for_author_surname(lname)

        post_remove_names = set()

        for name in [
                row['name'] for row in dat.AUTHOR_NAMES if not row['processed']
        ]:
            potential_removal = "%s," % (name.split(',')[0], )

            if potential_removal != "%s" % (lname, ):
                post_remove_names.add(potential_removal)

        if len(post_remove_names) > 1:
            removed = 0
            removed_names = []

            for post_remove_name in post_remove_names:
                if post_remove_name in lname_list:
                    lname_list.remove(post_remove_name)
                    removed_names.append(post_remove_name)
                    removed += 1

            bibtask.write_message(
                ("-> Removed %s entries from the " + "computation list: %s") %
                (removed, removed_names),
                stream=sys.stdout,
                verbose=0)
            total -= removed

        if lname_list:
            last_name_queue.put(lname_list)

        if len(dat.RELEVANT_RECORDS) >= max_records:
            if not os.path.exists(data_dir):
                os.mkdir(data_dir)

            work_dir = "%s%s%s" % (data_dir, workdir_prefix, job_id)

            _write_to_files(work_dir, job_lnames)
            job_lnames = []
            job_id += 1

        status += 1

    if dat.RELEVANT_RECORDS:
        if not os.path.exists(data_dir):
            os.mkdir(data_dir)

        work_dir = "%s%s%s" % (data_dir, workdir_prefix, job_id)

        _write_to_files(work_dir, job_lnames)

    return True
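# Illustrative sketch (argument values are examples only): direct invocation
# of the grid-preparation step defined above.  With a relative data_dir_name
# the job directories end up under <bconfig.FILE_PATH>/grid_data/ as job0,
# job1, ..., each holding the .dat files listed in the docstring.
def _example_prepare_grid_jobs():
    return _prepare_data_files_from_db(data_dir_name="grid_data",
                                       workdir_prefix="job",
                                       max_records=2000)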
def _update_authorid_universe():
    '''
    Updates all data related to the authorid algorithm.

    Sequence of operations:
        - Get all recently updated papers and remember time in the log
        - Get all authors on all papers
        - Extract collection of last names
        - For each last name:
            - Populate mem cache with cluster data
            - Delete updated records and their virtual authors from mem cache
            - Create virtual authors for new and updated records
            - Start matching algorithm
        - Update tables with results of the computation
        - Start personid update procedure
    '''
    def create_vas_from_specific_doclist(bibrec_ids):
        '''
        Processes the document list and creates a new minimal virtual author
        for each author in each record specified in the given list.

        @param bibrec_ids: Record IDs to concern in this update
        @type bibrec_ids: list of int
        '''
        num_docs = len(
            [row for row in dat.DOC_LIST if row['bibrecid'] in bibrec_ids])

        bconfig.LOGGER.log(
            25, "Creating minimal virtual authors for "
            "all loaded docs (%s)" % (num_docs))

        for docs in [
                row for row in dat.DOC_LIST if row['bibrecid'] in bibrec_ids
        ]:
            for author_id in docs['authornameids']:
                author_name = [
                    an['name'] for an in dat.AUTHOR_NAMES
                    if an['id'] == author_id
                ]
                refrecs = [
                    ref[1] for ref in docs['authornameid_bibrefrec']
                    if ref[0] == author_id
                ]
                refrec = -1

                if len(refrecs) > 1:
                    print "SCREEEEEEWWWWWWED!!! Several bibrefs on one paper?!"
                    refrec = refrecs[0]
                elif refrecs:
                    refrec = refrecs[0]

                if refrec and author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [], refrec)
                elif author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [])

    dat.reset_mem_cache(True)
    last_log = get_user_log(userinfo='daemon',
                            action='update_aid',
                            only_most_recent=True)
    updated_records = []

    if last_log:
        #select only the most recent papers
        recently_modified, last_update_time = get_papers_recently_modified(
            date=last_log[0][2])
        insert_user_log('daemon',
                        '-1',
                        'update_aid',
                        'bibsched',
                        'status',
                        comment='bibauthorid_daemon, update_authorid_universe',
                        timestamp=last_update_time[0][0])
        bibtask.write_message("Update authorid will operate on %s records." %
                              (len(recently_modified)),
                              stream=sys.stdout,
                              verbose=0)

        if not recently_modified:
            bibtask.write_message("Update authorid: Nothing to do",
                                  stream=sys.stdout,
                                  verbose=0)
            return

        for rec in recently_modified:
            updated_records.append(rec[0])
            dat.update_log("rec_updates", rec[0])

    else:
        bibtask.write_message("Update authorid: Nothing to do",
                              stream=sys.stdout,
                              verbose=0)
        return

    authors = []
    author_last_names = set()

    bibtask.task_update_progress('Reading authors from updated records')
    bibtask.write_message("Reading authors from updated records",
                          stream=sys.stdout,
                          verbose=0)
    updated_ras = set()

    # get all authors from all updated records
    for rec in updated_records:
        rec_authors = get_field_values_on_condition(rec, ['100', '700'],
                                                    "a",
                                                    source="API")

        for rec_author in rec_authors:
            if not rec_author:
                bconfig.LOGGER.error("Invalid empty author string, which "
                                     "will be skipped on record %s" % (rec))
                continue

            author_in_list = [
                row for row in authors if row['db_name'] == rec_author
            ]

            if author_in_list:
                for upd in [
                        row for row in authors if row['db_name'] == rec_author
                ]:
                    upd['records'].append(rec)
            else:
                last_name = split_name_parts(rec_author)[0]
                author_last_names.add(last_name)
                authors.append({
                    'db_name': rec_author,
                    'records': [rec],
                    'last_name': last_name
                })

    for status, author_last_name in enumerate(author_last_names):
        current_authors = [
            row for row in authors if row['last_name'] == author_last_name
        ]
        total_lnames = len(author_last_names)
        total_authors = len(current_authors)
        bibtask.task_update_progress(
            'Processing %s of %s cluster: "%s" '
            '(%s authors)' %
            (status + 1, total_lnames, author_last_name, total_authors))
        bibtask.write_message(
            'Processing %s of %s cluster: "%s" '
            '(%s authors)' %
            (status + 1, total_lnames, author_last_name, total_authors),
            stream=sys.stdout,
            verbose=0)
        dat.reset_mem_cache(True)
        init_authornames(author_last_name)
        load_mem_cache_from_tables()
        bconfig.LOGGER.log(
            25, "-- Relevant data successfully read into memory"
            " to start processing")

        for current_author in current_authors:
            load_records_to_mem_cache(current_author['records'])
            authornamesid = [
                row['id'] for row in dat.AUTHOR_NAMES
                if row['db_name'] == current_author['db_name']
            ]

            if not authornamesid:
                bconfig.LOGGER.error(
                    "The author '%s' rec '%s' is not in authornames "
                    "and will be skipped. You might want "
                    "to run authornames update before?" %
                    (current_author['db_name'], rec))
                continue
            else:
                try:
                    authornamesid = int(authornamesid[0])
                except (IndexError, TypeError, ValueError):
                    bconfig.LOGGER.error("Invalid authornames ID!")
                    continue

            if not current_author['records']:
                bconfig.LOGGER.error("The author '%s' is not associated to any"
                                     " document and will be skipped." %
                                     (current_author['db_name']))
                continue

            for rec in current_author['records']:
                # remove VAs already existing for the record
                va_ids = get_va_ids_by_recid_lname(rec,
                                                   current_author["last_name"])

                if va_ids:
                    for va_id in va_ids:
                        ra_list = get_realauthors_by_virtuala_id(va_id)

                        for ra_id in ra_list:
                            remove_va_from_ra(ra_id, va_id)
                            del_ra_data_by_vaid(ra_id, va_id)

                        va_anames_id = get_virtualauthor_records(
                            va_id, "orig_authorname_id")

                        for an_list in [
                                row['authornameids'] for row in dat.DOC_LIST
                                if row['bibrecid'] == rec
                        ]:
                            try:
                                an_list.remove(va_anames_id)
                            except (ValueError):
                                # This name's ID is not in the list; nothing to remove.
                                pass

                        delete_virtual_author(va_id)

                # create new VAs for the record.
                update_doclist(rec, authornamesid)
                dat.update_log("rec_updates", rec)

            create_vas_from_specific_doclist(current_author['records'])

        bconfig.LOGGER.log(25, "-- Relevant data pre-processed successfully.")
        start_computation(process_doclist=False,
                          process_orphans=True,
                          print_stats=True)
        bconfig.LOGGER.log(
            25, "-- Computation finished. Will write back to "
            "the database now.")
        update_db_result = update_tables_from_mem_cache(return_ra_updates=True)

        if not update_db_result[0]:
            bconfig.LOGGER.log(25, "Writing to persistence layer failed.")
        else:
            if update_db_result[1]:
                for updated_ra in update_db_result[1]:
                    if updated_ra:
                        updated_ras.add(updated_ra[0])

            bconfig.LOGGER.log(25, "Done updating authorid universe.")

    personid_ra_format = []

    for ra_id in updated_ras:
        personid_ra_format.append((ra_id, ))

    bconfig.LOGGER.log(
        25, "Will now run personid update to make the "
        "changes visible also on the front end and to "
        "create person IDs for %s newly created and changed "
        "authors." % len(updated_ras))
    bibtask.task_update_progress('Updating persistent Person IDs')
    update_personID_from_algorithm(personid_ra_format)
    bconfig.LOGGER.log(
        25, "Done updating everything. Thanks for flying "
        "with bibauthorid!")
def _write_data_files_to_db(data_dir_name):
    '''
    Reads all the files of a specified directory and writes the content
    to the memory cache and from there to the database.

    @param data_dir_name: Directory where to look for the files
    @type data_dir_name: string
    '''

    if data_dir_name.endswith("/"):
        data_dir_name = data_dir_name[0:-1]

    if not data_dir_name:
        bibtask.write_message("Data directory not specified. Task failed.",
                              stream=sys.stdout,
                              verbose=0)
        return False

    if not osp.isdir(data_dir_name):
        bibtask.write_message(
            "Specified Data directory is not a directory. "
            "Task failed.",
            stream=sys.stdout,
            verbose=0)
        return False

    job_dirs = os.listdir(data_dir_name)

    total = len(job_dirs)
    status = 0

    for job_dir in job_dirs:
        status += 1
        job_dir = "%s/%s" % (data_dir_name, job_dir)

        if not osp.isdir(job_dir):
            bibtask.write_message("This is not a directory and therefore "
                                  "skipped: %s." % job_dir,
                                  stream=sys.stdout,
                                  verbose=0)
            continue

        results_dir = "%s/results/" % (job_dir, )

        if not osp.isdir(results_dir):
            bibtask.write_message("No result set found in %s" %
                                  (results_dir, ),
                                  stream=sys.stdout,
                                  verbose=0)
            continue

        log_name = osp.abspath(job_dir).split("/")
        logfile = "%s/%s.log" % (job_dir, log_name[-1])
        logfile_lastline = ""

        if not osp.isfile(logfile):
            bibtask.write_message("No log file found in %s" % (job_dir, ),
                                  stream=sys.stdout,
                                  verbose=0)
            continue

        try:
            logfile_lastline = tail(logfile)
        except IOError:
            logfile_lastline = ""

        if logfile_lastline.count("Finish! The computation finished in") < 1:
            bibtask.write_message("Log file indicates broken results for %s" %
                                  (job_dir, ),
                                  stream=sys.stdout,
                                  verbose=0)
            continue

        correct_files = set([
            'realauthors.dat', 'ids.dat', 'virtual_author_clusters.dat',
            'virtual_authors.dat', 'doclist.dat', 'virtual_author_data.dat',
            'authornames.dat', 'virtual_author_cluster_cache.dat',
            'realauthor_data.dat', 'ra_va_cache.dat'
        ])
        result_files = os.listdir(results_dir)

        if not correct_files.issubset(set(result_files)):
            bibtask.write_message("Reults folder does not hold the "
                                  "correct files: %s" % (results_dir, ),
                                  stream=sys.stdout,
                                  verbose=0)
            continue

        bibtask.task_update_progress('Loading job %s of %s: %s' %
                                     (status, total, log_name[-1]))

        if (populate_structs_from_files(results_dir, results=True)
                and write_mem_cache_to_tables(sanity_checks=True)):
            bibtask.write_message("All Done.", stream=sys.stdout, verbose=0)
        else:
            bibtask.write_message(
                "Could not write data to the tables from %s" % (results_dir, ),
                stream=sys.stdout,
                verbose=0)
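# Illustrative sketch ('grid_data' and 'job0' are example names): the
# directory layout expected by _write_data_files_to_db(), reconstructed from
# the checks above.  The .dat file names and the log-file sentinel are the
# ones the function actually verifies before loading a result set.
_EXAMPLE_GRID_RESULT_LAYOUT = {
    'grid_data/job0/job0.log':
        'last line must contain "Finish! The computation finished in"',
    'grid_data/job0/results/': [
        'authornames.dat', 'ids.dat', 'doclist.dat', 'realauthors.dat',
        'realauthor_data.dat', 'virtual_authors.dat', 'virtual_author_data.dat',
        'virtual_author_clusters.dat', 'virtual_author_cluster_cache.dat',
        'ra_va_cache.dat',
    ],
}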
def _get_recids_foreach_ontology(recids=None, collections=None, taxonomy=None):
    """Returns an array containing hash objects containing the
    collection, its corresponding ontology and the records belonging to
    the given collection."""
    rec_onts = []

    # User specified record IDs.
    if recids:
        rec_onts.append({
            'ontology': taxonomy,
            'collection': None,
            'recIDs': recids,
        })
        return rec_onts

    # User specified collections.
    if collections:
        for collection in collections:
            records = get_collection_reclist(collection)
            if records:
                rec_onts.append({
                    'ontology': taxonomy,
                    'collection': collection,
                    'recIDs': records
                })
        return rec_onts

    # Use rules found in collection_clsMETHOD.
    result = run_sql("SELECT clsMETHOD.name, clsMETHOD.last_updated, "
        "collection.name FROM clsMETHOD JOIN collection_clsMETHOD ON "
        "clsMETHOD.id=id_clsMETHOD JOIN collection ON "
        "id_collection=collection.id")

    for ontology, date_last_run, collection in result:
        records = get_collection_reclist(collection)
        if records:
            if not date_last_run:
                bibtask.write_message("INFO: Collection %s has not been previously "
                    "analyzed." % collection, stream=sys.stderr, verbose=3)
                modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
            elif bibtask.task_get_option('force'):
                bibtask.write_message("INFO: Analysis is forced for collection %s." %
                    collection, stream=sys.stderr, verbose=3)
                modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
            else:
                modified_records = intbitset(run_sql("SELECT id FROM bibrec "
                    "WHERE modification_date >= %s", (date_last_run, )))

            records &= modified_records
            if records:
                rec_onts.append({
                    'ontology': ontology,
                    'collection': collection,
                    'recIDs': records
                })
            else:
                bibtask.write_message("WARNING: All records from collection '%s' have "
                    "already been analyzed for keywords with ontology '%s' "
                    "on %s." % (collection, ontology, date_last_run),
                    stream=sys.stderr, verbose=2)
        else:
            bibtask.write_message("ERROR: Collection '%s' doesn't contain any record. "
                "Cannot analyse keywords." % (collection,),
                stream=sys.stderr, verbose=0)

    return rec_onts
def _task_run_core():
    """
    Runs the requested task in the bibsched environment.
    """

    lastname = bibtask.task_get_option('lastname')
    process_all = bibtask.task_get_option('process_all')
    prepare_grid = bibtask.task_get_option('prepare_grid')
    load_grid = bibtask.task_get_option('load_grid_results')
    data_dir = bibtask.task_get_option('data_dir')
    prefix = bibtask.task_get_option('prefix')
    max_records_option = bibtask.task_get_option('max_records')
    update = bibtask.task_get_option('update')
    clean_cache = bibtask.task_get_option('clean_cache')
    update_cache = bibtask.task_get_option('update_cache')

    #    automated_daemon_mode_p = True

    if lastname:
        bibtask.write_message("Processing last name %s" % (lastname),
                              stream=sys.stdout,
                              verbose=0)

    if process_all:
        if bconfig.STANDALONE:
            bibtask.write_message("Processing not possible in standalone!",
                                  stream=sys.stdout,
                                  verbose=0)
            return 0

        bibtask.write_message("Processing all names...",
                              stream=sys.stdout,
                              verbose=0)

        lengths = get_len_authornames_bibrefs()

        if not check_and_create_aid_tables():
            bibtask.write_message("Failed to create database tables!",
                                  stream=sys.stdout,
                                  verbose=0)
            return 0

        if lengths['names'] < 1:
            bibtask.write_message("Populating Authornames table. It's Empty.",
                                  stream=sys.stdout,
                                  verbose=0)
            bibtask.task_update_progress('Populating Authornames table.')
            populate_authornames()
            insert_user_log('daemon',
                            '-1',
                            'UATFP',
                            'bibsched',
                            'status',
                            comment='bibauthorid_daemon, '
                            'update_authornames_tables_from_paper')

        if lengths['bibrefs'] < 1:
            bibtask.write_message("Populating Bibrefs lookup. It's Empty.",
                                  stream=sys.stdout,
                                  verbose=0)
            bibtask.task_update_progress('Populating Bibrefs lookup table.')
            populate_authornames_bibrefs_from_authornames()

        bibtask.task_update_progress('Processing all authors.')
        start_full_disambiguation(last_names="all",
                                  process_orphans=True,
                                  db_exists=False,
                                  populate_doclist=True,
                                  write_to_db=True)
        update_personID_from_algorithm()
        insert_user_log('daemon',
                        '-1',
                        'update_aid',
                        'bibsched',
                        'status',
                        comment='bibauthorid_daemon, update_authorid_universe')

    if prepare_grid:
        bibtask.write_message("Preparing Grid Job",
                              stream=sys.stdout,
                              verbose=0)
        data_dir_name = "grid_data"
        workdir_prefix = "job"
        max_records = 4000

        if data_dir:
            data_dir_name = data_dir

        if prefix:
            workdir_prefix = prefix

        if max_records_option:
            max_records = max_records_option

        _prepare_data_files_from_db(data_dir_name, workdir_prefix, max_records)

    if load_grid:
        bibtask.write_message(
            "Reading Grid Job results and will write"
            " them to the database.",
            stream=sys.stdout,
            verbose=0)

        _write_data_files_to_db(data_dir)

    if update or update_cache:
        bibtask.write_message(
            "update-cache: Processing recently updated"
            " papers",
            stream=sys.stdout,
            verbose=0)
        bibtask.task_update_progress('update-cache: Processing recently'
                                     ' updated papers')
        _run_update_authornames_tables_from_paper()
        bibtask.write_message("update-cache: Finished processing papers",
                              stream=sys.stdout,
                              verbose=0)
        bibtask.task_update_progress('update-cache: DONE')

    if update:
        bibtask.write_message("updating authorid universe",
                              stream=sys.stdout,
                              verbose=0)
        bibtask.task_update_progress('updating authorid universe')
        _update_authorid_universe()
        bibtask.write_message("done updating authorid universe",
                              stream=sys.stdout,
                              verbose=0)
        bibtask.task_update_progress('done updating authorid universe')

    if clean_cache:
        bibtask.write_message(
            "clean-cache: Processing recently updated"
            " papers",
            stream=sys.stdout,
            verbose=0)
        bibtask.task_update_progress('clean-cache: Processing recently updated'
                                     ' papers for names')
        _run_authornames_tables_gc()
        bibtask.write_message(
            "update-cache: Finished cleaning authornames "
            "tables",
            stream=sys.stdout,
            verbose=0)
        bibtask.task_update_progress('clean-cache: Processing recently updated'
                                     ' papers for persons')
        _run_update_personID_table_from_paper()
        bibtask.write_message(
            "update-cache: Finished cleaning PersonID"
            " table",
            stream=sys.stdout,
            verbose=0)
        bibtask.task_update_progress('clean-cache: DONE')

    return 1
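# Illustrative sketch (descriptions only, not executable task logic): which
# bibtask option triggers which branch of _task_run_core() above.  The keys
# are the option names read at the top of the function.
_EXAMPLE_TASK_OPTION_ACTIONS = {
    'process_all':       'populate tables and run full disambiguation',
    'prepare_grid':      '_prepare_data_files_from_db(data_dir, prefix, max_records)',
    'load_grid_results': '_write_data_files_to_db(data_dir)',
    'update':            'refresh authornames cache, then _update_authorid_universe()',
    'update_cache':      '_run_update_authornames_tables_from_paper()',
    'clean_cache':       'authornames GC and _run_update_personID_table_from_paper()',
}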
def _prepare_data_files_from_db(data_dir_name="grid_data",
                                workdir_prefix="job",
                                max_records=4000):
    '''
    Prepares grid jobs. Is a task running in bibsched.
    Meaning:
        1. Find all last names in the database
        2. For each last name:
            - find all documents regarding this last name (ignore first names)
            - if number of documents loaded into memory exceeds max_records,
              write the memory cache into files (cf. Files section).
              Each write back procedure will happen into a newly created
              directory. The prefix for the respective job directory may
              be specified as well as the name of the data directory where
              these job directories will be created.
    Files:
        - authornames.dat
        - virtual_authors.dat
        - virtual_author_data.dat
        - virtual_author_clusters.dat
        - virtual_author_cluster_cache.dat
        - realauthors.dat
        - realauthor_data.dat
        - doclist.dat
        - records.dat
        - ids.dat
        - ra_va_cache.dat

    @param data_dir_name: the name of the directory that will hold all the
        sub directories for the jobs.
    @type data_dir_name: string
    @param workdir_prefix: prefix for the job sub directories.
    @type workdir_prefix: string
    @param max_records: maximum number of records after which the memory
        cache is to be flushed to files.
    @type max_records: int
    '''
    try:
        max_records = int(max_records)
    except ValueError:
        max_records = 4000

    bibtask.write_message("Loading last names", stream=sys.stdout, verbose=0)
    bibtask.write_message("Limiting files to %s records" % (max_records,),
                          stream=sys.stdout, verbose=0)
    bibtask.task_update_progress('Loading last names...')

    last_names = find_all_last_names()
    last_name_queue = Queue.Queue()

    for last_name in sorted(last_names):
        last_name_queue.put(last_name)

    total = len(last_names)
    status = 1
    bibtask.write_message("Done. Loaded %s last names."
                          % (total), stream=sys.stdout, verbose=0)
    job_id = 0
    data_dir = ""

    if data_dir_name.startswith("/"):
        data_dir = data_dir_name
    else:
        data_dir = "%s/%s/" % (bconfig.FILE_PATH, data_dir_name)

    if not data_dir.endswith("/"):
        data_dir = "%s/" % (data_dir,)

    job_lnames = []

    while True:
        if last_name_queue.empty():
            bibtask.write_message("Done with all names.",
                                    stream=sys.stdout, verbose=0)
            break

        lname_list = last_name_queue.get()
        lname = None

        if lname_list:
            lname = lname_list[0]
            del(lname_list[0])
        else:
            bconfig.LOGGER.warning("Got an empty Queue element. "
                                   "Queue seems corrupted.")
            continue

        job_lnames.append(lname)
        bibtask.task_update_progress('Preparing job %d of %d: %s.'
                                     % (status, total, lname))
        bibtask.write_message(("Processing: %s (%d/%d).")
                                    % (lname, status, total),
                                    stream=sys.stdout, verbose=0)

        populate_doclist_for_author_surname(lname)

        post_remove_names = set()

        for name in [row['name'] for row in dat.AUTHOR_NAMES
                     if not row['processed']]:
            potential_removal = "%s," % (name.split(',')[0],)

            if potential_removal != "%s" % (lname,):
                post_remove_names.add(potential_removal)

        if len(post_remove_names) > 1:
            removed = 0
            removed_names = []

            for post_remove_name in post_remove_names:
                if post_remove_name in lname_list:
                    lname_list.remove(post_remove_name)
                    removed_names.append(post_remove_name)
                    removed += 1

            bibtask.write_message(("-> Removed %s entries from the "
                                    + "computation list: %s")
                                    % (removed, removed_names),
                                    stream=sys.stdout, verbose=0)
            total -= removed

        if lname_list:
            last_name_queue.put(lname_list)

        if len(dat.RELEVANT_RECORDS) >= max_records:
            if not os.path.exists(data_dir):
                os.mkdir(data_dir)

            work_dir = "%s%s%s" % (data_dir, workdir_prefix, job_id)

            _write_to_files(work_dir, job_lnames)
            job_lnames = []
            job_id += 1

        status += 1

    if dat.RELEVANT_RECORDS:
        if not os.path.exists(data_dir):
            os.mkdir(data_dir)

        work_dir = "%s%s%s" % (data_dir, workdir_prefix, job_id)

        _write_to_files(work_dir, job_lnames)

    return True
def _analyze_documents(
        records,
        taxonomy_name,
        collection,
        output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER):
    """For each collection, parse the documents attached to the records
    in collection with the corresponding taxonomy_name.
    @var records: list of recids to process
    @var taxonomy_name: str, name of the taxonomy, e.g. HEP
    @var collection: str, collection name
    @keyword output_limit: int, max number of keywords to extract [3]
    @return: str, marcxml output format of results
    """
    global _INDEX

    if not records:
        # No records could be found.
        bibtask.write_message(
            "WARNING: No records were found in collection %s." % collection,
            stream=sys.stderr,
            verbose=2)
        return False

    # Process records:
    output = []
    for record in records:
        bibdocfiles = BibRecDocs(record).list_latest_files()  # TODO: why doesn't this call list_all_files()?
        keywords = {}
        akws = {}
        acro = {}
        single_keywords = composite_keywords = author_keywords = acronyms = None

        for doc in bibdocfiles:
            # Get the keywords for all PDF documents contained in the record.
            if bibclassify_text_extractor.is_pdf(doc.get_full_path()):
                bibtask.write_message(
                    'INFO: Generating keywords for record %d.' % record,
                    stream=sys.stderr,
                    verbose=3)
                fulltext = doc.get_path()

                single_keywords, composite_keywords, author_keywords, acronyms = \
                    bibclassify_engine.get_keywords_from_local_file(fulltext,
                    taxonomy_name, with_author_keywords=True, output_mode="raw",
                    output_limit=output_limit, match_mode='partial')
            else:
                bibtask.write_message(
                    'WARNING: BibClassify does not know how to process '
                    'doc: %s (type: %s) -- ignoring it.' %
                    (doc.fullpath, doc.doctype),
                    stream=sys.stderr,
                    verbose=3)

            if single_keywords or composite_keywords:
                cleaned_single = bibclassify_engine.clean_before_output(
                    single_keywords)
                cleaned_composite = bibclassify_engine.clean_before_output(
                    composite_keywords)
                # merge the groups into one
                keywords.update(cleaned_single)
                keywords.update(cleaned_composite)
            acro.update(acronyms)
            akws.update(author_keywords)

        if len(keywords):
            output.append('<record>')
            output.append('<controlfield tag="001">%s</controlfield>' % record)
            output.append(
                bibclassify_engine._output_marc(
                    keywords.items(), (),
                    akws,
                    acro,
                    spires=bconfig.CFG_SPIRES_FORMAT))
            output.append('</record>')
        else:
            bibtask.write_message('WARNING: No keywords found for record %d.' %
                                  record,
                                  stream=sys.stderr,
                                  verbose=0)

        _INDEX += 1

        bibtask.task_update_progress('Done %d out of %d.' %
                                     (_INDEX, _RECIDS_NUMBER))
        bibtask.task_sleep_now_if_required(can_stop_too=False)

    return '\n'.join(output)
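# Illustrative sketch (shape only, record ID made up): _analyze_documents()
# returns a MARCXML fragment with one <record> element per record for which
# keywords were found.  The keyword datafields themselves are produced by
# bibclassify_engine._output_marc() and are not reproduced here.
_EXAMPLE_ANALYZE_OUTPUT = (
    '<record>\n'
    '<controlfield tag="001">12345</controlfield>\n'
    '<!-- keyword datafields emitted by _output_marc() -->\n'
    '</record>'
)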
def _update_authorid_universe():
    '''
    Updates all data related to the authorid algorithm.

    Sequence of operations:
        - Get all recently updated papers and remember time in the log
        - Get all authors on all papers
        - Extract collection of last names
        - For each last name:
            - Populate mem cache with cluster data
            - Delete updated records and their virtual authors from mem cache
            - Create virtual authors for new and updated records
            - Start matching algorithm
        - Update tables with results of the computation
        - Start personid update procedure
    '''

    def create_vas_from_specific_doclist(bibrec_ids):
        '''
        Processes the document list and creates a new minimal virtual author
        for each author in each record specified in the given list.

        @param bibrec_ids: Record IDs to concern in this update
        @type bibrec_ids: list of int
        '''
        num_docs = len([row for row in dat.DOC_LIST
                     if row['bibrecid'] in bibrec_ids])

        bconfig.LOGGER.log(25, "Creating minimal virtual authors for "
                                "all loaded docs (%s)"
                                % (num_docs))

        for docs in [row for row in dat.DOC_LIST
                     if row['bibrecid'] in bibrec_ids]:
            for author_id in docs['authornameids']:
                author_name = [an['name'] for an in dat.AUTHOR_NAMES
                               if an['id'] == author_id]
                refrecs = [ref[1] for ref in docs['authornameid_bibrefrec']
                           if ref[0] == author_id]
                refrec = -1

                if len(refrecs) > 1:
                    print "SCREEEEEEWWWWWWED!!! Several bibrefs on one paper?!"
                    refrec = refrecs[0]
                elif refrecs:
                    refrec = refrecs[0]

                if refrec and author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [], refrec)
                elif author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [])

    dat.reset_mem_cache(True)
    last_log = get_user_log(userinfo='daemon',
                            action='update_aid',
                            only_most_recent=True)
    updated_records = []

    if last_log:
        #select only the most recent papers
        recently_modified, last_update_time = get_papers_recently_modified(
                                                        date=last_log[0][2])
        insert_user_log('daemon', '-1', 'update_aid', 'bibsched', 'status',
                    comment='bibauthorid_daemon, update_authorid_universe',
                    timestamp=last_update_time[0][0])
        bibtask.write_message("Update authorid will operate on %s records."
                              % (len(recently_modified)), stream=sys.stdout,
                              verbose=0)

        if not recently_modified:
            bibtask.write_message("Update authorid: Nothing to do",
                                  stream=sys.stdout, verbose=0)
            return

        for rec in recently_modified:
            updated_records.append(rec[0])
            dat.update_log("rec_updates", rec[0])

    else:
        bibtask.write_message("Update authorid: Nothing to do",
                              stream=sys.stdout, verbose=0)
        return

    authors = []
    author_last_names = set()

    bibtask.task_update_progress('Reading authors from updated records')
    bibtask.write_message("Reading authors from updated records",
                                stream=sys.stdout, verbose=0)
    updated_ras = set()

    # get all authors from all updated records
    for rec in updated_records:
        rec_authors = get_field_values_on_condition(rec, ['100', '700'], "a",
                                                    source="API")

        for rec_author in rec_authors:
            if not rec_author:
                bconfig.LOGGER.error("Invalid empty author string, which "
                                     "will be skipped on record %s"
                                     % (rec))
                continue

            author_in_list = [row for row in authors
                              if row['db_name'] == rec_author]

            if author_in_list:
                for upd in [row for row in authors
                            if row['db_name'] == rec_author]:
                    upd['records'].append(rec)
            else:
                last_name = split_name_parts(rec_author)[0]
                author_last_names.add(last_name)
                authors.append({'db_name': rec_author,
                                'records': [rec],
                                'last_name': last_name})

    for status, author_last_name in enumerate(author_last_names):
        current_authors = [row for row in authors
                           if row['last_name'] == author_last_name]
        total_lnames = len(author_last_names)
        total_authors = len(current_authors)
        bibtask.task_update_progress('Processing %s of %s cluster: "%s" '
                                     '(%s authors)'
                                     % (status + 1, total_lnames,
                                        author_last_name, total_authors))
        bibtask.write_message('Processing %s of %s cluster: "%s" '
                              '(%s authors)'
                              % (status + 1, total_lnames, author_last_name,
                                 total_authors), stream=sys.stdout, verbose=0)
        dat.reset_mem_cache(True)
        init_authornames(author_last_name)
        load_mem_cache_from_tables()
        bconfig.LOGGER.log(25, "-- Relevant data successfully read into memory"
                               " to start processing")

        for current_author in current_authors:
            load_records_to_mem_cache(current_author['records'])
            authornamesid = [row['id'] for row in dat.AUTHOR_NAMES
                             if row['db_name'] == current_author['db_name']]

            if not authornamesid:
                bconfig.LOGGER.error("The author '%s' rec '%s' is not in authornames "
                                     "and will be skipped. You might want "
                                     "to run authornames update before?"
                                     % (current_author['db_name'], rec))
                continue
            else:
                try:
                    authornamesid = int(authornamesid[0])
                except (IndexError, TypeError, ValueError):
                    bconfig.LOGGER.error("Invalid authornames ID!")
                    continue

            if not current_author['records']:
                bconfig.LOGGER.error("The author '%s' is not associated to any"
                                     " document and will be skipped."
                                     % (current_author['db_name']))
                continue

            for rec in current_author['records']:
                # remove VAs already existing for the record
                va_ids = get_va_ids_by_recid_lname(rec,
                                                   current_author["last_name"])

                if va_ids:
                    for va_id in va_ids:
                        ra_list = get_realauthors_by_virtuala_id(va_id)

                        for ra_id in ra_list:
                            remove_va_from_ra(ra_id, va_id)
                            del_ra_data_by_vaid(ra_id, va_id)

                        va_anames_id = get_virtualauthor_records(va_id,
                                                        "orig_authorname_id")

                        for an_list in [row['authornameids'] for row in
                                    dat.DOC_LIST if row['bibrecid'] == rec]:
                            try:
                                an_list.remove(va_anames_id)
                            except (ValueError):
                                # This name's ID is not in the list; nothing to remove.
                                pass

                        delete_virtual_author(va_id)

                # create new VAs for the record.
                update_doclist(rec, authornamesid)
                dat.update_log("rec_updates", rec)

            create_vas_from_specific_doclist(current_author['records'])

        bconfig.LOGGER.log(25, "-- Relevant data pre-processed successfully.")
        start_computation(process_doclist=False,
                          process_orphans=True,
                          print_stats=True)
        bconfig.LOGGER.log(25, "-- Computation finished. Will write back to "
                               "the database now.")
        update_db_result = update_tables_from_mem_cache(return_ra_updates=True)

        if not update_db_result[0]:
            bconfig.LOGGER.log(25, "Writing to persistence layer failed.")
        else:
            if update_db_result[1]:
                for updated_ra in update_db_result[1]:
                    if updated_ra:
                        updated_ras.add(updated_ra[0])

            bconfig.LOGGER.log(25, "Done updating authorid universe.")

    personid_ra_format = []

    for ra_id in updated_ras:
        personid_ra_format.append((ra_id,))

    bconfig.LOGGER.log(25, "Will now run personid update to make the "
                       "changes visible also on the front end and to "
                       "create person IDs for %s newly created and changed "
                       "authors." % len(updated_ras))
    bibtask.task_update_progress('Updating persistent Person IDs')
    update_personID_from_algorithm(personid_ra_format)
    bconfig.LOGGER.log(25, "Done updating everything. Thanks for flying "
                       "with bibauthorid!")
def _task_run_core():
    """Runs analyse_documents for each ontology, collection, record ids
    set."""

    automated_daemon_mode_p = True
    recids = bibtask.task_get_option('recids')
    collections = bibtask.task_get_option('collections')
    taxonomy = bibtask.task_get_option('taxonomy')

    if recids or collections:
        # We want to run some records/collection only, so we are not
        # in the automated daemon mode; this will be useful later.
        automated_daemon_mode_p = False

    # Check if the user specified which documents to extract keywords from.
    if recids:
        onto_recids = _get_recids_foreach_ontology(recids=recids,
                                                   taxonomy=taxonomy)
    elif collections:
        onto_recids = _get_recids_foreach_ontology(collections=collections,
                                                   taxonomy=taxonomy)
    else:
        onto_recids = _get_recids_foreach_ontology()

    if not onto_recids:
        # Nothing to do.
        if automated_daemon_mode_p:
            _update_date_of_last_run(
                bibtask.task_get_task_param('task_starting_time'))
        return 1

    # We will write to a temporary file as we go, because we might be processing
    # big collections with many docs
    _rid = time.strftime("%Y%m%d%H%M%S", time.localtime())
    abs_path = bibclassify_engine.get_tmp_file(_rid)
    fo = open(abs_path, 'w')

    fo.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    fo.write('<collection xmlns="http://www.loc.gov/MARC21/slim">\n')

    # Count the total number of records in order to update the progression.
    global _RECIDS_NUMBER
    for onto_rec in onto_recids:
        _RECIDS_NUMBER += len(onto_rec['recIDs'])

    rec_added = False

    for onto_rec in onto_recids:
        bibtask.task_sleep_now_if_required(can_stop_too=False)

        if onto_rec['collection'] is not None:
            bibtask.write_message(
                'INFO: Applying taxonomy %s to collection %s (%s '
                'records)' % (onto_rec['ontology'], onto_rec['collection'],
                              len(onto_rec['recIDs'])),
                stream=sys.stderr,
                verbose=3)
        else:
            bibtask.write_message(
                'INFO: Applying taxonomy %s to recIDs %s. ' %
                (onto_rec['ontology'], ', '.join(
                    [str(recid) for recid in onto_rec['recIDs']])),
                stream=sys.stderr,
                verbose=3)
        if onto_rec['recIDs']:
            xml = _analyze_documents(onto_rec['recIDs'], onto_rec['ontology'],
                                     onto_rec['collection'])
            if len(xml) > 5:
                fo.write(xml)
                rec_added = True

    fo.write('</collection>\n')
    fo.close()

    # Apply the changes.
    if rec_added:
        if bconfig.CFG_DB_SAVE_KW:
            bibclassify_webinterface.upload_keywords(abs_path)
        else:
            bibtask.write_message(
                "INFO: CFG_DB_SAVE_KW is false, we don't save results",
                stream=sys.stderr,
                verbose=0)
    else:
        bibtask.write_message("WARNING: No keywords found, recids: %s" %
                              onto_recids,
                              stream=sys.stderr,
                              verbose=0)
        os.remove(abs_path)

    # Update the date of last run in the clsMETHOD table, but only if
    # we were running in an automated mode.
    if automated_daemon_mode_p:
        _update_date_of_last_run(
            bibtask.task_get_task_param('task_starting_time'))
    return 1
def _analyze_documents(records, taxonomy_name, collection,
                       output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER):
    """For each collection, parse the documents attached to the records
    in collection with the corresponding taxonomy_name.
    @var records: list of recids to process
    @var taxonomy_name: str, name of the taxonomy, e.g. HEP
    @var collection: str, collection name
    @keyword output_limit: int, max number of keywords to extract [3]
    @return: str, marcxml output format of results
    """
    global _INDEX

    if not records:
        # No records could be found.
        bibtask.write_message("WARNING: No records were found in collection %s." %
            collection, stream=sys.stderr, verbose=2)
        return False

    # Process records:
    output = []
    for record in records:
        bibdocfiles = BibRecDocs(record).list_latest_files()  # TODO: why doesn't this call list_all_files()?
        keywords = {}
        akws = {}
        acro = {}
        single_keywords = composite_keywords = author_keywords = acronyms = None


        for doc in bibdocfiles:
            # Get the keywords for all PDF documents contained in the record.
            if bibclassify_text_extractor.is_pdf(doc.get_full_path()):
                bibtask.write_message('INFO: Generating keywords for record %d.' %
                    record, stream=sys.stderr, verbose=3)
                fulltext = doc.get_path()

                single_keywords, composite_keywords, author_keywords, acronyms = \
                    bibclassify_engine.get_keywords_from_local_file(fulltext,
                    taxonomy_name, with_author_keywords=True, output_mode="raw",
                    output_limit=output_limit, match_mode='partial')
            else:
                bibtask.write_message('WARNING: BibClassify does not know how to process '
                    'doc: %s (type: %s) -- ignoring it.' %
                    (doc.fullpath, doc.doctype), stream=sys.stderr, verbose=3)

            if single_keywords or composite_keywords:
                cleaned_single = bibclassify_engine.clean_before_output(single_keywords)
                cleaned_composite = bibclassify_engine.clean_before_output(composite_keywords)
                # Merge the single and composite keyword groups into one dict.
                keywords.update(cleaned_single)
                keywords.update(cleaned_composite)
            # Guard against None: acronyms/author_keywords stay None for non-PDF files.
            if acronyms:
                acro.update(acronyms)
            if author_keywords:
                akws.update(author_keywords)

        if keywords:
            output.append('<record>')
            output.append('<controlfield tag="001">%s</controlfield>' % record)
            output.append(bibclassify_engine._output_marc(keywords.items(), (), akws, acro,
                                                      spires=bconfig.CFG_SPIRES_FORMAT))
            output.append('</record>')
        else:
            bibtask.write_message('WARNING: No keywords found for record %d.' %
                    record, stream=sys.stderr, verbose=0)

        _INDEX += 1

        bibtask.task_update_progress('Done %d out of %d.' % (_INDEX, _RECIDS_NUMBER))
        bibtask.task_sleep_now_if_required(can_stop_too=False)

    return '\n'.join(output)
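# Illustrative usage sketch (not part of the original module): mirrors how the daemon
# loop above wraps the MARCXML returned by _analyze_documents into a complete
# collection file. The recids, taxonomy, collection name and path are example values.
def _example_write_keyword_collection(recids, taxonomy_name, collection, path):
    """Write the keyword MARCXML for recids to path, as the daemon loop does."""
    xml = _analyze_documents(recids, taxonomy_name, collection)
    if len(xml) > 5:
        fo = open(path, 'w')
        fo.write('<collection>\n')
        fo.write(xml)
        fo.write('</collection>\n')
        fo.close()
# Example call (hypothetical values):
#     _example_write_keyword_collection([1, 2, 3], 'HEP', 'Preprints',
#                                       '/tmp/bibclassify_keywords.xml')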
def _task_submit_check_options():
    """
    Required by bibtask. Checks the options.
    """
    update_personid = bibtask.task_get_option("update_personid")
    disambiguate = bibtask.task_get_option("disambiguate")
    merge = bibtask.task_get_option("merge")

    record_ids = bibtask.task_get_option("record_ids")
    all_records = bibtask.task_get_option("all_records")
    from_scratch = bibtask.task_get_option("from_scratch")

    commands = bool(update_personid) + bool(disambiguate) + bool(merge)

    if commands == 0:
        bibtask.write_message("ERROR: At least one command should be specified!"
                              , stream=sys.stdout, verbose=0)
        return False

    if commands > 1:
        bibtask.write_message("ERROR: The options --update-personid, --disambiguate "
                              "and --merge are mutually exclusive."
                              , stream=sys.stdout, verbose=0)
        return False

    assert commands == 1

    if update_personid:
        if any((from_scratch,)):
            bibtask.write_message("ERROR: The only options which can be specified "
                                  "with --update-personid are --record-ids and "
                                  "--all-records"
                                  , stream=sys.stdout, verbose=0)
            return False

        options = bool(record_ids) + bool(all_records)
        if options > 1:
            bibtask.write_message("ERROR: conflicting options: --record-ids and "
                                  "--all-records are mutually exclusive."
                                  , stream=sys.stdout, verbose=0)
            return False

        if record_ids:
            for iden in record_ids:
                if not iden.isdigit():
                    bibtask.write_message("ERROR: Record_ids expects numbers. "
                                          "Provided: %s." % iden)
                    return False

    if disambiguate:
        if any((record_ids, all_records)):
            bibtask.write_message("ERROR: The only option which can be specified "
                                  "with --disambiguate is from-scratch"
                                  , stream=sys.stdout, verbose=0)
            return False

    if merge:
        if any((record_ids, all_records, from_scratch)):
            bibtask.write_message("ERROR: There are no options which can be "
                                  "specified along with --merge"
                                  , stream=sys.stdout, verbose=0)
            return False

    return True
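# Illustrative note (not part of the original module): the mutual-exclusion checks
# above count how many commands/options were requested by summing their boolean
# values, since bool(None) == 0 and bool(True) == 1.
def _example_count_enabled(*options):
    """Return how many of the given option values are set (truthy)."""
    return sum(bool(opt) for opt in options)
# For example:
#     _example_count_enabled(None, True, False) == 1   # exactly one command given
#     _example_count_enabled(True, True, None) == 2    # conflicting commands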