def compute_cache(pids):
    bibtask.write_message("WebAuthorProfile: %s persons to go" % len(pids),
                          stream=sys.stdout, verbose=0)
    for i, p in enumerate(pids):
        # Use the enumerate index rather than pids.index(p): index() rescans
        # the list on every iteration and misreports duplicate personids.
        bibtask.write_message("WebAuthorProfile: doing %s out of %s (personid: %s)" % (i + 1, len(pids), p))
        bibtask.task_update_progress("WebAuthorProfile: doing %s out of %s (personid: %s)" % (i + 1, len(pids), p))
        _compute_cache_for_person(p)
        bibtask.task_sleep_now_if_required(can_stop_too=True)
def compute_cache_mp(pids):
    from multiprocessing import Pool
    p = Pool()
    bibtask.write_message("WebAuthorProfileMP: %s persons to go" % len(pids),
                          stream=sys.stdout, verbose=0)
    # Work in bunches of 100 personids so bibsched can checkpoint (and, if
    # requested, stop) the task between Pool.map() calls.
    sl = 100
    ss = [pids[i: i + sl] for i in range(0, len(pids), sl)]
    for i, bunch in enumerate(ss):
        bibtask.write_message("WebAuthorProfileMP: doing bunch %s out of %s" % (i + 1, len(ss)))
        bibtask.task_update_progress("WebAuthorProfileMP: doing bunch %s out of %s" % (i + 1, len(ss)))
        p.map(_compute_cache_for_person, bunch)
        bibtask.task_sleep_now_if_required(can_stop_too=True)
    p.close()
    p.join()
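
# A standalone sketch of the batching pattern used by compute_cache_mp above:
# split the work list into fixed-size bunches and feed each bunch to a
# multiprocessing Pool, leaving room for a scheduler checkpoint between
# bunches. `square` is an illustrative stand-in for _compute_cache_for_person.
from multiprocessing import Pool

def square(x):
    return x * x

def process_in_bunches(items, bunch_size=100):
    pool = Pool()
    bunches = [items[i:i + bunch_size] for i in range(0, len(items), bunch_size)]
    results = []
    for bunch in bunches:
        results.extend(pool.map(square, bunch))
        # a bibtask.task_sleep_now_if_required() checkpoint would go here
    pool.close()
    pool.join()
    return results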
Example #5
def _task_run_core():
    """
    Runs the requested task in the bibsched environment.
    """

    repair_pid = bibtask.task_get_option('repair_pid')
    fast_update_personid = bibtask.task_get_option('fast_update_personid')
    personid_gc = bibtask.task_get_option('personid_gc')
    record_ids = bibtask.task_get_option('record_ids')
    all_records = bibtask.task_get_option('all_records')

    if record_ids:
        record_ids_nested = [[p] for p in record_ids]
    else:
        record_ids_nested = None

    if repair_pid:
        bibtask.task_update_progress('Updating names cache...')
        _run_update_authornames_tables_from_paper()
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        bibtask.task_update_progress('Removing person entities not touched by '
                                     'humans...')
        personid_remove_automatically_assigned_papers()
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        bibtask.task_update_progress('Updating person entities...')
        update_personID_from_algorithm()
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        bibtask.task_update_progress('Cleaning person tables...')
        _run_update_personID_table_from_paper()
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        bibtask.task_update_progress('All repairs done.')

    if fast_update_personid:
        bibtask.task_update_progress('Updating personid...')
        _run_personid_fast_assign_papers(record_ids_nested, all_records)
        bibtask.task_update_progress('PersonID update finished!')

    if personid_gc:
        bibtask.task_update_progress('Updating personid (GC)...')
        _run_personid_gc(record_ids_nested, all_records)
        bibtask.task_update_progress('PersonID update finished (GC)!')
    return 1
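
# The repair_pid branch above is a fixed pipeline: report progress, run a
# step, then give bibsched a checkpoint. A minimal generic sketch of that
# pattern (run_checkpointed_pipeline is hypothetical, not part of bibtask):
def run_checkpointed_pipeline(steps):
    for label, step in steps:
        bibtask.task_update_progress(label)
        step()
        bibtask.task_sleep_now_if_required(can_stop_too=False)

# e.g. run_checkpointed_pipeline([
#     ('Updating names cache...', _run_update_authornames_tables_from_paper),
#     ('Updating person entities...', update_personID_from_algorithm),
# ])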
def _task_run_core():
    """
    Runs the requested task in the bibsched environment.
    """

    lastname = bibtask.task_get_option("lastname")
    process_all = bibtask.task_get_option("process_all")
    prepare_grid = bibtask.task_get_option("prepare_grid")
    load_grid = bibtask.task_get_option("load_grid_results")
    data_dir = bibtask.task_get_option("data_dir")
    prefix = bibtask.task_get_option("prefix")
    max_records_option = bibtask.task_get_option("max_records")
    update = bibtask.task_get_option("update")
    clean_cache = bibtask.task_get_option("clean_cache")
    update_cache = bibtask.task_get_option("update_cache")
    record_ids = bibtask.task_get_option("record_ids")
    record_ids_nested = None
    all_records = bibtask.task_get_option("all_records")
    repair_pid = bibtask.task_get_option("repair_pid")
    fast_update_personid = bibtask.task_get_option("fast_update_personid")

    if record_ids:
        record_ids_nested = [[p] for p in record_ids]

    if fast_update_personid:
        fast_update_personid = [[p] for p in fast_update_personid]
    #    automated_daemon_mode_p = True

    if lastname:
        bibtask.write_message("Processing last name %s" % (lastname), stream=sys.stdout, verbose=0)

    if process_all:
        if bconfig.STANDALONE:
            bibtask.write_message("Processing not possible in standalone!", stream=sys.stdout, verbose=0)
            return 0

        bibtask.write_message("Processing all names...", stream=sys.stdout, verbose=0)

        lengths = get_len_authornames_bibrefs()

        if not check_and_create_aid_tables():
            bibtask.write_message("Failed to create database tables!", stream=sys.stdout, verbose=0)
            return 0

        if lengths["names"] < 1:
            bibtask.write_message("Populating Authornames table. It's Empty.", stream=sys.stdout, verbose=0)
            bibtask.task_update_progress("Populating Authornames table.")
            populate_authornames()
            insert_user_log(
                "daemon",
                "-1",
                "UATFP",
                "bibsched",
                "status",
                comment="bibauthorid_daemon, " "update_authornames_tables_from_paper",
            )

        if lengths["bibrefs"] < 1:
            bibtask.write_message("Populating Bibrefs lookup. It's Empty.", stream=sys.stdout, verbose=0)
            bibtask.task_update_progress("Populating Bibrefs lookup table.")
            populate_authornames_bibrefs_from_authornames()

        bibtask.task_update_progress("Processing all authors.")
        start_full_disambiguation(
            last_names="all", process_orphans=True, db_exists=False, populate_doclist=True, write_to_db=True
        )
        update_personID_from_algorithm()
        insert_user_log(
            "daemon", "-1", "update_aid", "bibsched", "status", comment="bibauthorid_daemon, update_authorid_universe"
        )

    if prepare_grid:
        bibtask.write_message("Preparing Grid Job", stream=sys.stdout, verbose=0)
        data_dir_name = "grid_data"
        workdir_prefix = "job"
        max_records = 4000

        if data_dir:
            data_dir_name = data_dir

        if prefix:
            workdir_prefix = prefix

        if max_records_option:
            max_records = max_records_option

        _prepare_data_files_from_db(data_dir_name, workdir_prefix, max_records)

    if load_grid:
        bibtask.write_message(
            "Reading Grid Job results and will write" " them to the database.", stream=sys.stdout, verbose=0
        )

        _write_data_files_to_db(data_dir)

    if update or update_cache:
        bibtask.write_message("update-cache: Processing recently updated" " papers", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress("update-cache: Processing recently" " updated papers")
        _run_update_authornames_tables_from_paper(record_ids_nested, all_records)
        bibtask.write_message("update-cache: Finished processing papers", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress("update-cache: DONE")

    if update:
        bibtask.write_message("updating authorid universe", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress("updating authorid universe")
        _update_authorid_universe(record_ids, all_records)
        bibtask.write_message("done updating authorid universe", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress("done updating authorid universe")

    if clean_cache:
        bibtask.write_message("clean-cache: Processing recently updated" " papers", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress("clean-cache: Processing recently updated" " papers for names")
        _run_authornames_tables_gc()
        bibtask.write_message("update-cache: Finished cleaning authornames " "tables", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress("clean-cache: Processing recently updated" " papers for persons")
        _run_update_personID_table_from_paper(record_ids_nested, all_records)
        bibtask.write_message("update-cache: Finished cleaning PersonID" " table", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress("clean-cache: DONE")

    if repair_pid:
        bibtask.task_update_progress("Updating names cache...")
        _run_update_authornames_tables_from_paper()
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        bibtask.task_update_progress("Removing person entities not touched by " "humans...")
        personid_remove_automatically_assigned_papers()
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        bibtask.task_update_progress("Updating person entities...")
        update_personID_from_algorithm()
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        bibtask.task_update_progress("Cleaning person tables...")
        _run_update_personID_table_from_paper()
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        bibtask.task_update_progress("All repairs done.")

    if fast_update_personid:
        bibtask.task_update_progress("Updating personid...")
        _run_personid_fast_assign_papers(fast_update_personid)
        bibtask.task_update_progress("Update finished...")
        # TODO: remember to pass the papers list!
    return 1
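
# The daemon above wraps plain record IDs into one-element lists
# (record_ids_nested = [[p] for p in record_ids]), apparently matching the
# row shape run_sql returns (see the commented-out query in
# personid_fast_assign_papers below). A tiny illustration of the round trip:
record_ids = [11, 42, 77]
record_ids_nested = [[p] for p in record_ids]       # [[11], [42], [77]]
flat_again = [row[0] for row in record_ids_nested]  # [11, 42, 77]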
def personid_fast_assign_papers(paperslist=None, use_threading_not_multiprocessing=True):
    '''
    Assign papers to the most compatible person.
    Compares only the name to find the right person to assign to. If nobody seems compatible,
    create a new person.
    '''

    class Worker(Thread):
        def __init__(self, i, p_q, atul, personid_new_id_lock, checker):
            Thread.__init__(self)
            self.i = i
            self.checker = checker
            self.p_q = p_q
            self.atul = atul
            self.personid_new_id_lock = personid_new_id_lock

        def run(self):
            while True:
                if self.checker.should_stop():
                    break
                try:
                    bibrec = self.p_q.get_nowait()
                except Empty:
                    break
                close_connection()

                pfap_assign_paper_iteration(self.i, bibrec, self.atul, self.personid_new_id_lock)

    def _pfap_assign_paper(i, p_q, atul, personid_new_id_lock, checker):
        while True:
            # check bibsched
            if checker.should_stop():
                break

            try:
                bibrec = p_q.get_nowait()
            except Empty:
                break

            pfap_assign_paper_iteration(i, bibrec, atul, personid_new_id_lock)


    _pfap_printmsg('starter', 'Started')
    if not paperslist:
        #paperslist = run_sql('select id from bibrec where 1')
        paperslist = [[x] for x in perform_request_search(p="")]

    paperslist = [k[0] for k in paperslist]

    _pfap_printmsg('starter', 'Starting on %s papers ' % len(paperslist))

    if use_threading_not_multiprocessing:
        authornames_table_update_lock = Lock()
        personid_new_id_lock = Lock()
        papers_q = Queue()
    else:
        authornames_table_update_lock = multiprocessing.Lock()
        personid_new_id_lock = multiprocessing.Lock()
        papers_q = multiprocessing.Queue()

    for p in paperslist:
        papers_q.put(p)

    process_list = []
    c = 0
    if not use_threading_not_multiprocessing:
        while not papers_q.empty():
            checker = status_checker()
            # Keep at most CFG_BIBAUTHORID_MAX_PROCESSES workers alive.
            while len(process_list) < bconfig.CFG_BIBAUTHORID_MAX_PROCESSES:
                p = multiprocessing.Process(target=_pfap_assign_paper, args=(c, papers_q,
                                                                    authornames_table_update_lock,
                                                                    personid_new_id_lock, checker))
                c += 1
                process_list.append(p)
                p.start()

            for p in tuple(process_list):
                if not p.is_alive():
                    p.join()
                    process_list.remove(p)

            task_sleep_now_if_required(can_stop_too=False)
    else:
        max_processes = bconfig.CFG_BIBAUTHORID_PERSONID_SQL_MAX_THREADS
        checker = status_checker()
        workers = []
        while not papers_q.empty():
            i = 0
            while len(workers) < max_processes:
                w = Worker(i, papers_q, authornames_table_update_lock,
                           personid_new_id_lock, checker)
                i += 1
                w.start()
                workers.append(w)
            for w in tuple(workers):
                if not w.is_alive():
                    w.join()
                    workers.remove(w)

            task_sleep_now_if_required(can_stop_too=False)
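
# A self-contained sketch of the worker pattern above: threads drain a shared
# queue with get_nowait() and exit on Empty, so the pool winds down on its own
# once the queue runs dry. This mirrors the Worker class without the bibsched
# and database plumbing.
from threading import Thread
from Queue import Queue, Empty  # 'queue' on Python 3

def _drain(q, handle):
    while True:
        try:
            item = q.get_nowait()
        except Empty:
            break
        handle(item)

def run_workers(items, handle, num_workers=4):
    q = Queue()
    for item in items:
        q.put(item)
    workers = [Thread(target=_drain, args=(q, handle)) for _ in range(num_workers)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()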
Example #8
def _task_run_core():
    """Runs analyse_documents for each ontology, collection, record ids
    set."""

    automated_daemon_mode_p = True
    recids = bibtask.task_get_option('recids')
    collections = bibtask.task_get_option('collections')
    taxonomy = bibtask.task_get_option('taxonomy')

    if recids or collections:
        # We want to run some records/collection only, so we are not
        # in the automated daemon mode; this will be useful later.
        automated_daemon_mode_p = False

    # Check if the user specified which documents to extract keywords from.
    if recids:
        onto_recids = _get_recids_foreach_ontology(recids=recids,
            taxonomy=taxonomy)
    elif collections:
        onto_recids = _get_recids_foreach_ontology(collections=collections,
            taxonomy=taxonomy)
    else:
        onto_recids = _get_recids_foreach_ontology()

    if not onto_recids:
        # Nothing to do.
        if automated_daemon_mode_p:
            _update_date_of_last_run(bibtask.task_get_task_param('task_starting_time'))
        return 1

    # We will write to a temporary file as we go, because we might be processing
    # big collections with many docs
    _rid = time.strftime("%Y%m%d%H%M%S", time.localtime())
    abs_path = bibclassify_engine.get_tmp_file(_rid)
    fo = open(abs_path, 'w')


    fo.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    fo.write('<collection xmlns="http://www.loc.gov/MARC21/slim">\n')

    # Count the total number of records in order to update the progression.
    global _RECIDS_NUMBER
    for onto_rec in onto_recids:
        _RECIDS_NUMBER += len(onto_rec['recIDs'])

    rec_added = False

    for onto_rec in onto_recids:
        bibtask.task_sleep_now_if_required(can_stop_too=False)

        if onto_rec['collection'] is not None:
            bibtask.write_message('INFO: Applying taxonomy %s to collection %s (%s '
                'records)' % (onto_rec['ontology'], onto_rec['collection'],
                len(onto_rec['recIDs'])), stream=sys.stderr, verbose=3)
        else:
            bibtask.write_message('INFO: Applying taxonomy %s to recIDs %s. ' %
                (onto_rec['ontology'],
                ', '.join([str(recid) for recid in onto_rec['recIDs']])),
                stream=sys.stderr, verbose=3)
        if onto_rec['recIDs']:
            xml = _analyze_documents(onto_rec['recIDs'],
                onto_rec['ontology'], onto_rec['collection'])
            if len(xml) > 5:
                fo.write(xml)
                rec_added = True

    fo.write('</collection>\n')
    fo.close()

    # Apply the changes.
    if rec_added:
        if bconfig.CFG_DB_SAVE_KW:
            bibclassify_webinterface.upload_keywords(abs_path)
        else:
            bibtask.write_message("INFO: CFG_DB_SAVE_KW is false, we don't save results",
                                  stream=sys.stderr, verbose=0)
    else:
        bibtask.write_message("WARNING: No keywords found, recids: %s" % onto_recids,
                                  stream=sys.stderr, verbose=0)
        os.remove(abs_path)

    # Update the date of last run in the clsMETHOD table, but only if
    # we were running in an automated mode.
    if automated_daemon_mode_p:
        _update_date_of_last_run(bibtask.task_get_task_param('task_starting_time'))
    return 1
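
# A minimal sketch of the streaming pattern above, assuming each chunk is an
# already-serialized MARCXML string: write the collection header, append
# chunks as they are produced, then close the collection element, so large
# collections never need to fit in memory at once.
def write_marcxml_collection(path, record_chunks):
    fo = open(path, 'w')
    fo.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    fo.write('<collection xmlns="http://www.loc.gov/MARC21/slim">\n')
    for chunk in record_chunks:
        fo.write(chunk)
    fo.write('</collection>\n')
    fo.close()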
Example #9
def _analyze_documents(records, taxonomy_name, collection,
                       output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER):
    """For each collection, parse the documents attached to the records
    in collection with the corresponding taxonomy_name.
    @param records: list of recids to process
    @param taxonomy_name: str, name of the taxonomy, e.g. HEP
    @param collection: str, collection name
    @keyword output_limit: int, max number of keywords to extract [3]
    @return: str, marcxml output format of results
    """
    global _INDEX

    if not records:
        # No records could be found.
        bibtask.write_message("WARNING: No records were found in collection %s." %
            collection, stream=sys.stderr, verbose=2)
        return False

    # Process records:
    output = []
    for record in records:
        bibdocfiles = BibRecDocs(record).list_latest_files()  # TODO: why doesn't this call list_all_files()?
        keywords = {}
        akws = {}
        acro = {}
        single_keywords = composite_keywords = author_keywords = acronyms = None


        for doc in bibdocfiles:
            # Get the keywords for all PDF documents contained in the record.
            if bibclassify_text_extractor.is_pdf(doc.get_full_path()):
                bibtask.write_message('INFO: Generating keywords for record %d.' %
                    record, stream=sys.stderr, verbose=3)
                fulltext = doc.get_full_path()

                single_keywords, composite_keywords, author_keywords, acronyms = \
                    bibclassify_engine.get_keywords_from_local_file(fulltext,
                    taxonomy_name, with_author_keywords=True, output_mode="raw",
                    output_limit=output_limit, match_mode='partial')
            else:
                bibtask.write_message('WARNING: BibClassify does not know how to process '
                                      'doc: %s (type: %s) -- ignoring it.' %
                                      (doc.fullpath, doc.doctype),
                                      stream=sys.stderr, verbose=3)

            if single_keywords or composite_keywords:
                cleaned_single = bibclassify_engine.clean_before_output(single_keywords)
                cleaned_composite = bibclassify_engine.clean_before_output(composite_keywords)
                # merge the groups into one
                keywords.update(cleaned_single)
                keywords.update(cleaned_composite)
            # Guard against None: both stay unset for non-PDF documents, and
            # dict.update(None) raises a TypeError.
            if acronyms:
                acro.update(acronyms)
            if author_keywords:
                akws.update(author_keywords)

        if len(keywords):
            output.append('<record>')
            output.append('<controlfield tag="001">%s</controlfield>' % record)
            output.append(bibclassify_engine._output_marc(keywords.items(), (), akws, acro,
                                                      spires=bconfig.CFG_SPIRES_FORMAT))
            output.append('</record>')
        else:
            bibtask.write_message('WARNING: No keywords found for record %d.' %
                    record, stream=sys.stderr, verbose=0)

        _INDEX += 1

        bibtask.task_update_progress('Done %d out of %d.' % (_INDEX, _RECIDS_NUMBER))
        bibtask.task_sleep_now_if_required(can_stop_too=False)

    return '\n'.join(output)
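
# A minimal sketch of the merge step above, assuming any keyword group can be
# None (as happens for non-PDF documents): merge only the groups that are
# present, so dict.update() never receives None.
def merge_keyword_groups(*groups):
    merged = {}
    for group in groups:
        if group:
            merged.update(group)
    return merged

# merge_keyword_groups({'qcd': 2}, None, {'lattice qcd': 1})
# -> {'qcd': 2, 'lattice qcd': 1}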
Example #12
def _task_run_core():
    """
    Runs the requested task in the bibsched environment.
    """

    lastname = bibtask.task_get_option('lastname')
    process_all = bibtask.task_get_option('process_all')
    prepare_grid = bibtask.task_get_option('prepare_grid')
    load_grid = bibtask.task_get_option('load_grid_results')
    data_dir = bibtask.task_get_option('data_dir')
    prefix = bibtask.task_get_option('prefix')
    max_records_option = bibtask.task_get_option('max_records')
    update = bibtask.task_get_option('update')
    clean_cache = bibtask.task_get_option('clean_cache')
    update_cache = bibtask.task_get_option('update_cache')
    record_ids = bibtask.task_get_option('record_ids')
    record_ids_nested = None
    all_records = bibtask.task_get_option('all_records')
    repair_pid = bibtask.task_get_option('repair_pid')

    if record_ids:
        record_ids_nested = [[p] for p in record_ids]
#    automated_daemon_mode_p = True

    if lastname:
        bibtask.write_message("Processing last name %s" % (lastname),
                              stream=sys.stdout, verbose=0)

    if process_all:
        if bconfig.STANDALONE:
            bibtask.write_message("Processing not possible in standalone!",
                                  stream=sys.stdout, verbose=0)
            return 0

        bibtask.write_message("Processing all names...",
                              stream=sys.stdout, verbose=0)

        lengths = get_len_authornames_bibrefs()

        if not check_and_create_aid_tables():
            bibtask.write_message("Failed to create database tables!",
                                  stream=sys.stdout, verbose=0)
            return 0

        if lengths['names'] < 1:
            bibtask.write_message("Populating Authornames table. It's Empty.",
                                  stream=sys.stdout, verbose=0)
            bibtask.task_update_progress('Populating Authornames table.')
            populate_authornames()
            insert_user_log('daemon', '-1', 'UATFP', 'bibsched', 'status',
                            comment='bibauthorid_daemon, '
                            'update_authornames_tables_from_paper')


        if lengths['bibrefs'] < 1:
            bibtask.write_message("Populating Bibrefs lookup. It's Empty.",
                                  stream=sys.stdout, verbose=0)
            bibtask.task_update_progress('Populating Bibrefs lookup table.')
            populate_authornames_bibrefs_from_authornames()

        bibtask.task_update_progress('Processing all authors.')
        start_full_disambiguation(last_names="all",
                                 process_orphans=True,
                                 db_exists=False,
                                 populate_doclist=True,
                                 write_to_db=True)
        update_personID_from_algorithm()
        insert_user_log('daemon', '-1', 'update_aid', 'bibsched', 'status',
                    comment='bibauthorid_daemon, update_authorid_universe')

    if prepare_grid:
        bibtask.write_message("Preparing Grid Job",
                              stream=sys.stdout, verbose=0)
        data_dir_name = "grid_data"
        workdir_prefix = "job"
        max_records = 4000

        if data_dir:
            data_dir_name = data_dir

        if prefix:
            workdir_prefix = prefix

        if max_records_option:
            max_records = max_records_option

        _prepare_data_files_from_db(data_dir_name, workdir_prefix, max_records)

    if load_grid:
        bibtask.write_message("Reading Grid Job results and will write"
                              " them to the database.",
                              stream=sys.stdout, verbose=0)

        _write_data_files_to_db(data_dir)

    if update or update_cache:
        bibtask.write_message("update-cache: Processing recently updated"
                              " papers", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress('update-cache: Processing recently'
                                     ' updated papers')
        _run_update_authornames_tables_from_paper(record_ids_nested, all_records)
        bibtask.write_message("update-cache: Finished processing papers",
                              stream=sys.stdout, verbose=0)
        bibtask.task_update_progress('update-cache: DONE')

    if update:
        bibtask.write_message("updating authorid universe",
                              stream=sys.stdout, verbose=0)
        bibtask.task_update_progress('updating authorid universe')
        _update_authorid_universe(record_ids, all_records)
        bibtask.write_message("done updating authorid universe",
                              stream=sys.stdout, verbose=0)
        bibtask.task_update_progress('done updating authorid universe')

    if clean_cache:
        bibtask.write_message("clean-cache: Processing recently updated"
                              " papers", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress('clean-cache: Processing recently updated'
                                     ' papers for names')
        _run_authornames_tables_gc()
        bibtask.write_message("update-cache: Finished cleaning authornames "
                              "tables", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress('clean-cache: Processing recently updated'
                                     ' papers for persons')
        _run_update_personID_table_from_paper(record_ids_nested, all_records)
        bibtask.write_message("update-cache: Finished cleaning PersonID"
                              " table", stream=sys.stdout, verbose=0)
        bibtask.task_update_progress('clean-cache: DONE')

    if repair_pid:
        bibtask.task_update_progress('Updating names cache...')
        _run_update_authornames_tables_from_paper()
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        bibtask.task_update_progress('Removing person entities not touched by '
                                     'humans...')
        personid_remove_automatically_assigned_papers()
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        bibtask.task_update_progress('Updating person entities...')
        update_personID_from_algorithm()
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        bibtask.task_update_progress('Cleaning person tables...')
        _run_update_personID_table_from_paper()
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        bibtask.task_update_progress('All repairs done.')

    return 1
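
# A tiny sketch of the option-fallback pattern in the prepare_grid branch
# above: each CLI option overrides its built-in default only when the option
# was actually supplied (resolve_grid_options is hypothetical).
def resolve_grid_options(data_dir=None, prefix=None, max_records=None):
    return (data_dir or "grid_data",
            prefix or "job",
            max_records or 4000)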
Example #14
def _prepare_data_files_from_db(data_dir_name="grid_data",
                                workdir_prefix="job",
                                max_records=4000):
    '''
    Prepares grid jobs. Is a task running in bibsched.
    Meaning:
        1. Find all last names in the database
        2. For each last name:
            - find all documents regarding this last name (ignore first names)
            - if number of documents loaded into memory exceeds max_records,
              write the memory cache into files (cf. Files section).
              Each write back procedure will happen into a newly created
              directory. The prefix for the respective job directory may
              be specified as well as the name of the data directory where
              these job directories will be created.
    Files:
        - authornames.dat
        - virtual_authors.dat
        - virtual_author_data.dat
        - virtual_author_clusters.dat
        - virtual_author_cluster_cache.dat
        - realauthors.dat
        - realauthor_data.dat
        - doclist.dat
        - records.dat
        - ids.dat
        - ra_va_cache.dat

    @param data_dir_name: the name of the directory that will hold all the
        sub directories for the jobs.
    @type data_dir_name: string
    @param workdir_prefix: prefix for the job sub directories.
    @type workdir_prefix: string
    @param max_records: maximum number of records after which the memory
        cache is to be flushed to files.
    @type max_records: int
    '''
    try:
        max_records = int(max_records)
    except ValueError:
        max_records = 4000

    bibtask.write_message("Loading last names", stream=sys.stdout, verbose=0)
    bibtask.write_message("Limiting files to %s records" % (max_records, ),
                          stream=sys.stdout,
                          verbose=0)
    bibtask.task_update_progress('Loading last names...')

    last_names = find_all_last_names()
    last_name_queue = Queue.Queue()

    for last_name in sorted(last_names):
        last_name_queue.put(last_name)

    total = len(last_names)
    status = 1
    bibtask.write_message("Done. Loaded %s last names." % (total),
                          stream=sys.stdout,
                          verbose=0)
    job_id = 0
    data_dir = ""

    if data_dir_name.startswith("/"):
        data_dir = data_dir_name
    else:
        data_dir = "%s/%s/" % (bconfig.FILE_PATH, data_dir_name)

    if not data_dir.endswith("/"):
        data_dir = "%s/" % (data_dir, )

    job_lnames = []

    while True:
        if last_name_queue.empty():
            bibtask.write_message("Done with all names.",
                                  stream=sys.stdout,
                                  verbose=0)
            break

        bibtask.task_sleep_now_if_required(can_stop_too=False)
        lname_list = last_name_queue.get()
        lname = None

        if lname_list:
            lname = lname_list[0]
            del lname_list[0]
        else:
            bconfig.LOGGER.warning("Got an empty Queue element. "
                                   "Queue seems corrupted.")
            continue

        job_lnames.append(lname)
        bibtask.task_update_progress('Preparing job %d of %d: %s.' %
                                     (status, total, lname))
        bibtask.write_message(
            ("Processing: %s (%d/%d).") % (lname, status, total),
            stream=sys.stdout,
            verbose=0)

        bibtask.task_sleep_now_if_required(can_stop_too=False)
        populate_doclist_for_author_surname(lname)
        post_remove_names = set()

        for name in [
                row['name'] for row in dat.AUTHOR_NAMES if not row['processed']
        ]:
            potential_removal = "%s," % (name.split(',')[0], )

            if potential_removal != "%s" % (lname, ):
                post_remove_names.add(potential_removal)

        if len(post_remove_names) > 1:
            removed = 0
            removed_names = []

            for post_remove_name in post_remove_names:
                if post_remove_name in lname_list:
                    lname_list.remove(post_remove_name)
                    removed_names.append(post_remove_name)
                    removed += 1

            bibtask.write_message(
                ("-> Removed %s entries from the " + "computation list: %s") %
                (removed, removed_names),
                stream=sys.stdout,
                verbose=0)
            total -= removed

        if lname_list:
            last_name_queue.put(lname_list)

        if len(dat.RELEVANT_RECORDS) >= max_records:
            if not os.path.exists(data_dir):
                os.mkdir(data_dir)

            work_dir = "%s%s%s" % (data_dir, workdir_prefix, job_id)

            _write_to_files(work_dir, job_lnames)
            bibtask.task_sleep_now_if_required(can_stop_too=True)
            job_lnames = []
            job_id += 1

        status += 1

    if dat.RELEVANT_RECORDS:
        if not os.path.exists(data_dir):
            os.mkdir(data_dir)

        work_dir = "%s%s%s" % (data_dir, workdir_prefix, job_id)

        _write_to_files(work_dir, job_lnames)
        bibtask.task_sleep_now_if_required(can_stop_too=True)

    return True
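
# A condensed, hypothetical sketch of the flush policy implemented above:
# accumulate records in memory and write them out to a fresh job directory
# whenever the cache exceeds max_records. write_job stands in for
# _write_to_files.
import os

def write_job(work_dir, records):  # hypothetical stand-in for _write_to_files
    print("would write %d records to %s" % (len(records), work_dir))

def flush_in_batches(records, data_dir, workdir_prefix="job", max_records=4000):
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)
    cache = []
    job_id = 0
    for rec in records:
        cache.append(rec)
        if len(cache) >= max_records:
            write_job("%s%s%s" % (data_dir, workdir_prefix, job_id), cache)
            cache = []
            job_id += 1
    if cache:
        write_job("%s%s%s" % (data_dir, workdir_prefix, job_id), cache)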
    deleted_recs = dbinter.get_deleted_papers()
    deleted_recs = frozenset(x[0] for x in deleted_recs)
    if bconfig.TABLES_UTILS_DEBUG:
        print "%d total deleted papers" % (len(deleted_recs),)

    if personid:
        personid_q = dbinter.list_2_SQL_str(personid, lambda x: str(x[0]))
    else:
        personid_q = None

    counter = 0
    rows_limit = 10000000
    end_loop = False
    while not end_loop:
        task_sleep_now_if_required(True)
        papers_data = dbinter.collect_personid_papers(person=personid_q, limit=(counter, rows_limit))

        if bconfig.TABLES_UTILS_DEBUG:
            print "query with limit %d %d" % (counter, rows_limit)

        if len(papers_data) == rows_limit:
            counter += rows_limit
        else:
            end_loop = True

        papers_data = tuple((extract_bibrec(p[3]), p) for p in papers_data)
        to_remove = set()
        jobs = dict()
        for p in papers_data:
            if int(p[0]) in deleted_recs:
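
# A generic sketch of the paging loop above: fetch rows in fixed-size windows
# until a short page signals the last one. fetch_page(offset, limit) stands
# in for dbinter.collect_personid_papers with its (counter, rows_limit)
# limit tuple.
def iter_in_pages(fetch_page, rows_limit=10000000):
    offset = 0
    while True:
        page = fetch_page(offset, rows_limit)
        for row in page:
            yield row
        if len(page) < rows_limit:
            break
        offset += rows_limit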
Example #16
def _update_authorid_universe(record_ids=None, all_records=False):
    '''
    Updates all data related to the authorid algorithm.

    Sequence of operations:
        - Get all recently updated papers and remember time in the log
        - Get all authors on all papers
        - Extract collection of last names
        - For each last name:
            - Populate mem cache with cluster data
            - Delete updated records and their virtual authors from mem cache
            - Create virtual authors for new and updated records
            - Start matching algorithm
        - Update tables with results of the computation
        - Start personid update procedure
    '''

    def create_vas_from_specific_doclist(bibrec_ids):
        '''
        Processes the document list and creates a new minimal virtual author
        for each author in each record specified in the given list.

        @param bibrec_ids: Record IDs to concern in this update
        @type bibrec_ids: list of int
        '''
        num_docs = len([row for row in dat.DOC_LIST
                     if row['bibrecid'] in bibrec_ids])

        bconfig.LOGGER.log(25, "Creating minimal virtual authors for "
                                "all loaded docs (%s)"
                                % (num_docs))

        for docs in [row for row in dat.DOC_LIST
                     if row['bibrecid'] in bibrec_ids]:
            for author_id in docs['authornameids']:
                author_name = [an['name'] for an in dat.AUTHOR_NAMES
                               if an['id'] == author_id]
                refrecs = [ref[1] for ref in docs['authornameid_bibrefrec']
                           if ref[0] == author_id]
                refrec = -1

                if refrecs:
                    refrec = refrecs[0]

                if refrec and author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [], refrec)
                elif author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [])

    dat.reset_mem_cache(True)
    last_log = None
    updated_records = []

    if not record_ids and not all_records:
        last_log = get_user_log(userinfo='daemon',
                                action='update_aid',
                                only_most_recent=True)
        if last_log:
            #select only the most recent papers
            recently_modified, last_update_time = get_papers_recently_modified(
                                                        date=last_log[0][2])
            insert_user_log('daemon', '-1', 'update_aid', 'bibsched', 'status',
                        comment='bibauthorid_daemon, update_authorid_universe',
                        timestamp=last_update_time[0][0])
            bibtask.write_message("Update authorid will operate on %s records."
                                  % (len(recently_modified)), stream=sys.stdout,
                                  verbose=0)
    
            if not recently_modified:
                bibtask.write_message("Update authorid: Nothing to do",
                                      stream=sys.stdout, verbose=0)
                return
    
            for rec in recently_modified:
                updated_records.append(rec[0])
                dat.update_log("rec_updates", rec[0])
    
        else:
            bibtask.write_message("Update authorid: Nothing to do",
                                  stream=sys.stdout, verbose=0)
            return

    elif record_ids and not all_records:
        updated_records = record_ids

    elif not record_ids and all_records:
        bibtask.write_message("Update is going to empty all aid tables...",
                              stream=sys.stdout, verbose=0)
        empty_aid_tables()
        bibtask.write_message("Update authorid will operate on all! records.",
                              stream=sys.stdout, verbose=0)
        bibtask.task_update_progress('Update is operating on all! records.')
        start_full_disambiguation(process_orphans=True,
                                  db_exists=False,
                                  populate_doclist=True,
                                  write_to_db=True)
        bibtask.task_update_progress('Update is done.')
        return

    bibtask.task_sleep_now_if_required(can_stop_too=True)
    authors = []
    author_last_names = set()

    bibtask.task_update_progress('Reading authors from updated records')
    bibtask.write_message("Reading authors from updated records",
                                stream=sys.stdout, verbose=0)
    updated_ras = set()

    # get all authors from all updated records
    for rec in updated_records:
        rec_authors = get_field_values_on_condition(rec, ['100', '700'], "a",
                                                    source="API")

        for rec_author in rec_authors:
            if not rec_author:
                bconfig.LOGGER.error("Invalid empty author string, which "
                                     "will be skipped on record %s"
                                     % (rec))
                continue

            author_in_list = [row for row in authors
                              if row['db_name'] == rec_author]

            if author_in_list:
                for upd in [row for row in authors
                            if row['db_name'] == rec_author]:
                    upd['records'].append(rec)
            else:
                last_name = split_name_parts(rec_author)[0]
                author_last_names.add(last_name)
                authors.append({'db_name': rec_author,
                                'records': [rec],
                                'last_name': last_name})

    for status, author_last_name in enumerate(author_last_names):
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        current_authors = [row for row in authors
                           if row['last_name'] == author_last_name]
        total_lnames = len(author_last_names)
        total_authors = len(current_authors)
        bibtask.task_update_progress('Processing %s of %s cluster: "%s" '
                                     '(%s authors)'
                                     % (status + 1, total_lnames,
                                        author_last_name, total_authors))
        bibtask.write_message('Processing %s of %s cluster: "%s" '
                              '(%s authors)'
                              % (status + 1, total_lnames, author_last_name,
                                 total_authors), stream=sys.stdout, verbose=0)
        dat.reset_mem_cache(True)
        init_authornames(author_last_name)
        load_mem_cache_from_tables()
        bconfig.LOGGER.log(25, "-- Relevant data successfully read into memory"
                               " to start processing")

        for current_author in current_authors:
            load_records_to_mem_cache(current_author['records'])
            authornamesid = [row['id'] for row in dat.AUTHOR_NAMES
                             if row['db_name'] == current_author['db_name']]

            if not authornamesid:
                bconfig.LOGGER.error("The author '%s' rec '%s' is not in authornames "
                                     "and will be skipped. You might want "
                                     "to run authornames update before?"
                                     % (current_author['db_name'], rec))
                continue
            else:
                try:
                    authornamesid = int(authornamesid[0])
                except (IndexError, TypeError, ValueError):
                    bconfig.LOGGER.error("Invalid authornames ID!")
                    continue

            if not current_author['records']:
                bconfig.LOGGER.error("The author '%s' is not associated to any"
                                     " document and will be skipped."
                                     % (current_author['db_name']))
                continue

            for rec in current_author['records']:
                # remove VAs already existing for the record
                va_ids = get_va_ids_by_recid_lname(rec,
                                                   current_author["last_name"])

                if va_ids:
                    for va_id in va_ids:
                        ra_list = get_realauthors_by_virtuala_id(va_id)

                        for ra_id in ra_list:
                            remove_va_from_ra(ra_id, va_id)
                            del_ra_data_by_vaid(ra_id, va_id)

                        va_anames_id = get_virtualauthor_records(
                            va_id, "orig_authorname_id")

                        for an_list in [row['authornameids'] for row in
                                        dat.DOC_LIST if row['bibrecid'] == rec]:
                            try:
                                an_list.remove(va_anames_id)
                            except ValueError:
                                # This names id is not in the list; ignore it.
                                pass

                        delete_virtual_author(va_id)

                # create new VAs for the record.
                update_doclist(rec, authornamesid)
                dat.update_log("rec_updates", rec)

            create_vas_from_specific_doclist(current_author['records'])

        bconfig.LOGGER.log(25, "-- Relevant data pre-processed successfully.")
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        start_computation(process_doclist=False,
                          process_orphans=True,
                          print_stats=True)
        bconfig.LOGGER.log(25, "-- Computation finished. Will write back to "
                               "the database now.")
        update_db_result = update_tables_from_mem_cache(return_ra_updates=True)
        bibtask.task_sleep_now_if_required(can_stop_too=True)

        if not update_db_result[0]:
            bconfig.LOGGER.log(25, "Writing to persistence layer failed.")
        else:
            if update_db_result[1]:
                for updated_ra in update_db_result[1]:
                    if updated_ra:
                        updated_ras.add(updated_ra[0])

            bconfig.LOGGER.log(25, "Done updating authorid universe.")

    personid_ra_format = []

    for ra_id in updated_ras:
        personid_ra_format.append((ra_id,))

    bconfig.LOGGER.log(25, "Will now run personid update to make the "
                       "changes visible also on the front end and to "
                       "create person IDs for %s newly created and changed "
                       "authors." % len(updated_ras))
    bibtask.task_update_progress('Updating persistent Person IDs')
    bibtask.task_sleep_now_if_required(can_stop_too=False)
    update_personID_from_algorithm(personid_ra_format)
    bconfig.LOGGER.log(25, "Done updating everything. Thanks for flying "
                       "with bibauthorid!")
def _prepare_data_files_from_db(data_dir_name="grid_data",
                                workdir_prefix="job",
                                max_records=4000):
    '''
    Prepares grid jobs. Is a task running in bibsched.
    Meaning:
        1. Find all last names in the database
        2. For each last name:
            - find all documents regarding this last name (ignore first names)
            - if number of documents loaded into memory exceeds max_records,
              write the memory cache into files (cf. Files section).
              Each write back procedure will happen into a newly created
              directory. The prefix for the respective job directory may
              be specified as well as the name of the data directory where
              these job directories will be created.
    Files:
        - authornames.dat
        - virtual_authors.dat
        - virtual_author_data.dat
        - virtual_author_clusters.dat
        - virtual_author_cluster_cache.dat
        - realauthors.dat
        - realauthor_data.dat
        - doclist.dat
        - records.dat
        - ids.dat
        - ra_va_cache.dat

    @param data_dir_name: the name of the directory that will hold all the
        sub directories for the jobs.
    @type data_dir_name: string
    @param workdir_prefix: prefix for the job sub directories.
    @type workdir_prefix: string
    @param max_records: maximum number of records after which the memory
        cache is to be flushed to files.
    @type max_records: int
    '''
    try:
        max_records = int(max_records)
    except ValueError:
        max_records = 4000

    bibtask.write_message("Loading last names", stream=sys.stdout, verbose=0)
    bibtask.write_message("Limiting files to %s records" % (max_records,),
                          stream=sys.stdout, verbose=0)
    bibtask.task_update_progress('Loading last names...')

    last_names = find_all_last_names()
    last_name_queue = Queue.Queue()

    for last_name in sorted(last_names):
        last_name_queue.put(last_name)

    total = len(last_names)
    status = 1
    bibtask.write_message("Done. Loaded %s last names."
                          % (total), stream=sys.stdout, verbose=0)
    job_id = 0
    data_dir = ""

    if data_dir_name.startswith("/"):
        data_dir = data_dir_name
    else:
        data_dir = "%s/%s/" % (bconfig.FILE_PATH, data_dir_name)

    if not data_dir.endswith("/"):
        data_dir = "%s/" % (data_dir,)

    job_lnames = []

    while True:
        if last_name_queue.empty():
            bibtask.write_message("Done with all names.",
                                    stream=sys.stdout, verbose=0)
            break

        bibtask.task_sleep_now_if_required(can_stop_too=False)
        lname_list = last_name_queue.get()
        lname = None

        if lname_list:
            lname = lname_list[0]
            del lname_list[0]
        else:
            bconfig.LOGGER.warning("Got an empty Queue element. "
                                   "Queue seems corrupted.")
            continue

        job_lnames.append(lname)
        bibtask.task_update_progress('Preparing job %d of %d: %s.'
                                     % (status, total, lname))
        bibtask.write_message(("Processing: %s (%d/%d).")
                                    % (lname, status, total),
                                    stream=sys.stdout, verbose=0)

        bibtask.task_sleep_now_if_required(can_stop_too=False)
        populate_doclist_for_author_surname(lname)
        post_remove_names = set()

        for name in [row['name'] for row in dat.AUTHOR_NAMES
                     if not row['processed']]:
            potential_removal = "%s," % (name.split(',')[0],)

            if potential_removal != lname:
                post_remove_names.add(potential_removal)

        if len(post_remove_names) > 1:
            removed = 0
            removed_names = []

            for post_remove_name in post_remove_names:
                if post_remove_name in lname_list:
                    lname_list.remove(post_remove_name)
                    removed_names.append(post_remove_name)
                    removed += 1

            bibtask.write_message(("-> Removed %s entries from the "
                                    + "computation list: %s")
                                    % (removed, removed_names),
                                    stream=sys.stdout, verbose=0)
            total -= removed

        if lname_list:
            last_name_queue.put(lname_list)

        if len(dat.RELEVANT_RECORDS) >= max_records:
            if not os.path.exists(data_dir):
                os.mkdir(data_dir)

            work_dir = "%s%s%s" % (data_dir, workdir_prefix, job_id)

            _write_to_files(work_dir, job_lnames)
            bibtask.task_sleep_now_if_required(can_stop_too=True)
            job_lnames = []
            job_id += 1

        status += 1

    if dat.RELEVANT_RECORDS:
        if not os.path.exists(data_dir):
            os.mkdir(data_dir)

        work_dir = "%s%s%s" % (data_dir, workdir_prefix, job_id)

        _write_to_files(work_dir, job_lnames)
        bibtask.task_sleep_now_if_required(can_stop_too=True)

    return True
    deleted_recs = dbinter.get_deleted_papers()
    deleted_recs = frozenset(x[0] for x in deleted_recs)
    if bconfig.TABLES_UTILS_DEBUG:
        print "%d total deleted papers" % (len(deleted_recs),)

    if personid:
        personid_q = dbinter.list_2_SQL_str(personid, lambda x: str(x[0]))
    else:
        personid_q = None

    counter = 0
    rows_limit = 10000000
    end_loop = False
    while not end_loop:
        task_sleep_now_if_required(can_stop_too=False)
        papers_data = dbinter.collect_personid_papers(person=personid_q,
                                                      limit=(counter, rows_limit,))

        if bconfig.TABLES_UTILS_DEBUG:
            print "query with limit %d %d" % (counter, rows_limit)

        if len(papers_data) == rows_limit:
            counter += rows_limit
        else:
            end_loop = True

        papers_data = tuple((extract_bibrec(p[3]), p) for p in papers_data)
        to_remove = set()
        jobs = dict()
        for p in papers_data:
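
# --- Illustration (not part of the original module) ---
# The while-loop above is plain offset-window pagination: fetch up to
# rows_limit rows starting at counter, and stop once a window comes back
# short. The same control flow against an in-memory list (fetch_window is
# a stand-in for dbinter.collect_personid_papers):
def _sketch_windowed_fetch():
    def fetch_window(data, offset, limit):
        return data[offset:offset + limit]

    data = range(25)   # pretend result set
    rows_limit = 10
    counter = 0
    end_loop = False

    while not end_loop:
        window = fetch_window(data, counter, rows_limit)
        print "got %d rows at offset %d" % (len(window), counter)
        if len(window) == rows_limit:
            counter += rows_limit   # full window: there may be more rows
        else:
            end_loop = True         # short window: this was the last page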
Example #20
def _analyze_documents(
        records,
        taxonomy_name,
        collection,
        output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER):
    """For each collection, parse the documents attached to the records
    in collection with the corresponding taxonomy_name.
    @var records: list of recids to process
    @var taxonomy_name: str, name of the taxonomy, e.g. HEP
    @var collection: str, collection name
    @keyword output_limit: int, max number of keywords to extract [3]
    @return: str, marcxml output format of results
    """
    global _INDEX

    if not records:
        # No records could be found.
        bibtask.write_message(
            "WARNING: No records were found in collection %s." % collection,
            stream=sys.stderr,
            verbose=2)
        return False

    # Process records:
    output = []
    for record in records:
        bibdocfiles = BibRecDocs(record).list_latest_files()
        # TODO: why doesn't this call list_all_files()?
        keywords = {}
        akws = {}
        acro = {}
        single_keywords = composite_keywords = author_keywords = acronyms = None

        for doc in bibdocfiles:
            # Get the keywords for all PDF documents contained in the record.
            if bibclassify_text_extractor.is_pdf(doc.get_full_path()):
                bibtask.write_message(
                    'INFO: Generating keywords for record %d.' % record,
                    stream=sys.stderr,
                    verbose=3)
                fulltext = doc.get_full_path()

                single_keywords, composite_keywords, author_keywords, acronyms = \
                    bibclassify_engine.get_keywords_from_local_file(
                        fulltext, taxonomy_name, with_author_keywords=True,
                        output_mode="raw", output_limit=output_limit,
                        match_mode='partial')
            else:
                bibtask.write_message(
                    'WARNING: BibClassify does not know how to process '
                    'doc: %s (type: %s) -- ignoring it.' %
                    (doc.fullpath, doc.doctype),
                    stream=sys.stderr,
                    verbose=3)

            if single_keywords or composite_keywords:
                cleaned_single = bibclassify_engine.clean_before_output(
                    single_keywords)
                cleaned_composite = bibclassify_engine.clean_before_output(
                    composite_keywords)
                # merge the two keyword groups into one
                keywords.update(cleaned_single)
                keywords.update(cleaned_composite)
            # author_keywords/acronyms stay None for non-PDF documents,
            # and dict.update(None) would raise TypeError
            if acronyms:
                acro.update(acronyms)
            if author_keywords:
                akws.update(author_keywords)

        if keywords:
            output.append('<record>')
            output.append('<controlfield tag="001">%s</controlfield>' % record)
            output.append(
                bibclassify_engine._output_marc(
                    keywords.items(), (),
                    akws,
                    acro,
                    spires=bconfig.CFG_SPIRES_FORMAT))
            output.append('</record>')
        else:
            bibtask.write_message('WARNING: No keywords found for record %d.' %
                                  record,
                                  stream=sys.stderr,
                                  verbose=0)

        _INDEX += 1

        bibtask.task_update_progress('Done %d out of %d.' %
                                     (_INDEX, _RECIDS_NUMBER))
        bibtask.task_sleep_now_if_required(can_stop_too=False)

    return '\n'.join(output)
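
# --- Illustration (not part of the original module) ---
# Hypothetical invocation of _analyze_documents; the record IDs, taxonomy
# and collection name are invented (in the real task, records come from
# the collection being processed):
def _sketch_analyze_documents_usage():
    # Extract up to 3 keywords per record for three hypothetical record IDs;
    # the return value is one MARCXML string (or False if no records).
    marcxml = _analyze_documents([1, 2, 3],
                                 taxonomy_name='HEP',
                                 collection='Preprints',
                                 output_limit=3)
    if marcxml:
        print marcxml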
Example #21
def _update_authorid_universe(record_ids=None, all_records=False):
    '''
    Updates all data related to the authorid algorithm.

    Sequence of operations:
        - Get all recently updated papers and remember time in the log
        - Get all authors on all papers
        - Extract collection of last names
        - For each last name:
            - Populate mem cache with cluster data
            - Delete updated records and their virtual authors from mem cache
            - Create virtual authors for new and updated records
            - Start matching algorithm
        - Update tables with results of the computation
        - Start personid update procedure
    '''
    def create_vas_from_specific_doclist(bibrec_ids):
        '''
        Processes the document list and creates a new minimal virtual author
        for each author in each record specified in the given list.

        @param bibrec_ids: Record IDs to concern in this update
        @type bibrec_ids: list of int
        '''
        num_docs = len(
            [row for row in dat.DOC_LIST if row['bibrecid'] in bibrec_ids])

        bconfig.LOGGER.log(
            25, "Creating minimal virtual authors for "
            "all loaded docs (%s)" % (num_docs))

        for docs in [
                row for row in dat.DOC_LIST if row['bibrecid'] in bibrec_ids
        ]:
            for author_id in docs['authornameids']:
                author_name = [
                    an['name'] for an in dat.AUTHOR_NAMES
                    if an['id'] == author_id
                ]
                refrecs = [
                    ref[1] for ref in docs['authornameid_bibrefrec']
                    if ref[0] == author_id
                ]
                refrec = -1

                if refrecs:
                    refrec = refrecs[0]

                if refrec and author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [], refrec)
                elif author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [])

    dat.reset_mem_cache(True)
    last_log = None
    updated_records = []

    if not record_ids and not all_records:
        last_log = get_user_log(userinfo='daemon',
                                action='update_aid',
                                only_most_recent=True)
        if last_log:
            # select only the most recent papers
            recently_modified, last_update_time = get_papers_recently_modified(
                date=last_log[0][2])
            insert_user_log(
                'daemon',
                '-1',
                'update_aid',
                'bibsched',
                'status',
                comment='bibauthorid_daemon, update_authorid_universe',
                timestamp=last_update_time[0][0])
            bibtask.write_message(
                "Update authorid will operate on %s records." %
                (len(recently_modified)),
                stream=sys.stdout,
                verbose=0)

            if not recently_modified:
                bibtask.write_message("Update authorid: Nothing to do",
                                      stream=sys.stdout,
                                      verbose=0)
                return

            for rec in recently_modified:
                updated_records.append(rec[0])
                dat.update_log("rec_updates", rec[0])

        else:
            bibtask.write_message("Update authorid: Nothing to do",
                                  stream=sys.stdout,
                                  verbose=0)
            return

    elif record_ids and not all_records:
        updated_records = record_ids

    elif not record_ids and all_records:
        bibtask.write_message("Update is going to empty all aid tables...",
                              stream=sys.stdout,
                              verbose=0)
        empty_aid_tables()
        bibtask.write_message("Update authorid will operate on all! records.",
                              stream=sys.stdout,
                              verbose=0)
        bibtask.task_update_progress('Update is operating on all! records.')
        start_full_disambiguation(process_orphans=True,
                                  db_exists=False,
                                  populate_doclist=True,
                                  write_to_db=True)
        bibtask.task_update_progress('Update is done.')
        return

    bibtask.task_sleep_now_if_required(can_stop_too=True)
    authors = []
    author_last_names = set()

    bibtask.task_update_progress('Reading authors from updated records')
    bibtask.write_message("Reading authors from updated records",
                          stream=sys.stdout,
                          verbose=0)
    updated_ras = set()

    # get all authors from all updated records
    for rec in updated_records:
        rec_authors = get_field_values_on_condition(rec, ['100', '700'],
                                                    "a",
                                                    source="API")

        for rec_author in rec_authors:
            if not rec_author:
                bconfig.LOGGER.error("Invalid empty author string, which "
                                     "will be skipped on record %s" % (rec))
                continue

            author_in_list = [
                row for row in authors if row['db_name'] == rec_author
            ]

            if author_in_list:
                for upd in author_in_list:
                    upd['records'].append(rec)
                    upd['records'].append(rec)
            else:
                last_name = split_name_parts(rec_author)[0]
                author_last_names.add(last_name)
                authors.append({
                    'db_name': rec_author,
                    'records': [rec],
                    'last_name': last_name
                })

    for status, author_last_name in enumerate(author_last_names):
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        current_authors = [
            row for row in authors if row['last_name'] == author_last_name
        ]
        total_lnames = len(author_last_names)
        total_authors = len(current_authors)
        bibtask.task_update_progress(
            'Processing %s of %s cluster: "%s" '
            '(%s authors)' %
            (status + 1, total_lnames, author_last_name, total_authors))
        bibtask.write_message(
            'Processing %s of %s cluster: "%s" '
            '(%s authors)' %
            (status + 1, total_lnames, author_last_name, total_authors),
            stream=sys.stdout,
            verbose=0)
        dat.reset_mem_cache(True)
        init_authornames(author_last_name)
        load_mem_cache_from_tables()
        bconfig.LOGGER.log(
            25, "-- Relevant data successfully read into memory"
            " to start processing")

        for current_author in current_authors:
            load_records_to_mem_cache(current_author['records'])
            authornamesid = [
                row['id'] for row in dat.AUTHOR_NAMES
                if row['db_name'] == current_author['db_name']
            ]

            if not authornamesid:
                bconfig.LOGGER.error(
                    "The author '%s' (rec '%s') is not in authornames "
                    "and will be skipped. You might want "
                    "to run the authornames update first." %
                    (current_author['db_name'], rec))
                continue
            else:
                try:
                    authornamesid = int(authornamesid[0])
                except (IndexError, TypeError, ValueError):
                    bconfig.LOGGER.error("Invalid authornames ID!")
                    continue

            if not current_author['records']:
                bconfig.LOGGER.error("The author '%s' is not associated to any"
                                     " document and will be skipped." %
                                     (current_author['db_name']))
                continue

            for rec in current_author['records']:
                # remove VAs already existing for the record
                va_ids = get_va_ids_by_recid_lname(rec,
                                                   current_author["last_name"])

                if va_ids:
                    for va_id in va_ids:
                        ra_list = get_realauthors_by_virtuala_id(va_id)

                        for ra_id in ra_list:
                            remove_va_from_ra(ra_id, va_id)
                            del_ra_data_by_vaid(ra_id, va_id)

                        va_anames_id = get_virtualauthor_records(
                            va_id, "orig_authorname_id")

                        for an_list in [
                                row['authornameids'] for row in dat.DOC_LIST
                                if row['bibrecid'] == rec
                        ]:
                            try:
                                an_list.remove(va_anames_id)
                            except ValueError:
                                # This names id is not in the list; ignore it.
                                pass

                        delete_virtual_author(va_id)

                # create new VAs for the record.
                update_doclist(rec, authornamesid)
                dat.update_log("rec_updates", rec)

            create_vas_from_specific_doclist(current_author['records'])

        bconfig.LOGGER.log(25, "-- Relevant data pre-processed successfully.")
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        start_computation(process_doclist=False,
                          process_orphans=True,
                          print_stats=True)
        bconfig.LOGGER.log(
            25, "-- Computation finished. Will write back to "
            "the database now.")
        update_db_result = update_tables_from_mem_cache(return_ra_updates=True)
        bibtask.task_sleep_now_if_required(can_stop_too=True)

        if not update_db_result[0]:
            bconfig.LOGGER.log(25, "Writing to persistence layer failed.")
        else:
            if update_db_result[1]:
                for updated_ra in update_db_result[1]:
                    if updated_ra:
                        updated_ras.add(updated_ra[0])

            bconfig.LOGGER.log(25, "Done updating authorid universe.")

    personid_ra_format = []

    for ra_id in updated_ras:
        personid_ra_format.append((ra_id, ))

    bconfig.LOGGER.log(
        25, "Will now run personid update to make the "
        "changes visible also on the front end and to "
        "create person IDs for %s newly created and changed "
        "authors." % len(updated_ras))
    bibtask.task_update_progress('Updating persistent Person IDs')
    bibtask.task_sleep_now_if_required(can_stop_too=False)
    update_personID_from_algorithm(personid_ra_format)
    bconfig.LOGGER.log(
        25, "Done updating everything. Thanks for flying "
        "with bibauthorid!")
Example #22
def rabbit(bibrecs, check_invalid_papers=False):
    '''
    @param bibrecs: an iterable full of bibrecs
    @type bibrecs: an iterable of ints
    @return: none
    '''
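    # Assumed module-level imports for this Python 2 snippet:
    #   from itertools import chain, cycle, izip, imap
    #   from operator import itemgetter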

    compare_names = cached_sym(lambda x: x)(comp_names)
    # fast assign threshold
    threshold = 0.80

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_bibrecs()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    for idx, rec in enumerate(bibrecs):
        task_sleep_now_if_required(True)
        update_status(float(idx) / len(bibrecs),
                      "%d/%d current: %d" % (idx, len(bibrecs), rec))
        if rec in deleted:
            delete_paper_from_personid(rec)
            continue

        markrefs = frozenset(chain(izip(cycle([100]), imap(itemgetter(0), get_authors_from_paper(rec))),
                                   izip(cycle([700]), imap(itemgetter(0), get_coauthors_from_paper(rec)))))

        personid_rows = [map(int, row[:3]) + [row[4]] for row in get_signatures_from_rec(rec)]
        personidrefs_names = dict(((row[1], row[2]), row[3]) for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = dict((new, create_normalized_name(split_name_parts(get_name_by_bibrecref(new))))
                                    for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[compare_names(new_signatures_names[new], personidrefs_names[old])
                  for old in old_signatures] for new in new_signatures]

        # [(new_signatures, old_signatures)]
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix) if score > threshold]
        for new, old in best_match:
            modify_signature(old, rec, new, new_signatures_names[new])

        remove_sigs(tuple(list(old) + [rec]) for old in old_signatures)

        not_matched = frozenset(new_signatures) - frozenset(map(itemgetter(0), best_match))

        if not_matched:
            used_pids = set(r[0] for r in personid_rows)

        for sig in not_matched:
            name = new_signatures_names[sig]
            matched_pids = find_pids_by_exact_name(name)
            matched_pids = [p for p in matched_pids if int(p[0]) not in used_pids]

            if not matched_pids:
                new_pid = new_person_from_signature(list(sig) + [rec], name)
                used_pids.add(new_pid)
                updated_pids.add(new_pid)

            else:
                add_signature(list(sig) + [rec], name, matched_pids[0][0])
                used_pids.add(matched_pids[0][0])
                updated_pids.add(matched_pids[0][0])

    update_status_final()

    if updated_pids: # an empty set would update all canonical_names, so guard it
        update_personID_canonical_names(updated_pids)
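
# --- Illustration (not part of the original module) ---
# The heart of rabbit() is the thresholded assignment between new and old
# signatures. A toy re-creation of that step using a greedy pass in place
# of maximized_mapping (the names and the similarity metric are invented;
# the real code uses a proper name comparator with threshold 0.80):
def _sketch_signature_matching():
    new_sigs = ['Ellis, J.', 'Smith, A.']
    old_sigs = ['Ellis, John', 'Jones, B.']
    threshold = 0.5   # lowered to suit the toy metric below

    def toy_compare(a, b):
        # stand-in similarity: Jaccard overlap of the character sets
        return len(set(a) & set(b)) / float(len(set(a) | set(b)))

    # matrix |new_sigs| x |old_sigs| of pairwise similarity scores
    matrix = [[toy_compare(n, o) for o in old_sigs] for n in new_sigs]

    best_match = []
    used_old = set()
    for i, row in enumerate(matrix):
        for j, score in sorted(enumerate(row), key=lambda t: t[1],
                               reverse=True):
            if score > threshold and j not in used_old:
                best_match.append((new_sigs[i], old_sigs[j]))
                used_old.add(j)
                break

    # matched pairs keep their person; unmatched new sigs get new person IDs
    print best_match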