def calculate_name_score(query_string, nameids):
    '''
    Scores the names associated with the given indexable name ids against the
    query string, based on the surname.

    @param query_string: name to look for, e.g. 'Ellis, J.'
    @type query_string: str
    @param nameids: identifiers of indexable names whose authors are scored
    @type nameids: list

    @return: (name, score, personids) triples for names scoring above 0.5
    @rtype: list
    '''
    name_personids_list = get_authors_data_from_indexable_name_ids(nameids)
    query_last_name = split_name_parts(query_string)[0]
    query_last_name_len = len(query_last_name)
    name_score_list = list()

    for name, personids in name_personids_list:
        current_last_name = split_name_parts(name)[0]
        current_last_name_len = len(current_last_name)
        if query_last_name_len == current_last_name_len:
            dist = distance(query_last_name, current_last_name)
            limit = min([query_last_name_len, current_last_name_len])
            name_score = sum([1/float(2**(i+1)) for i in range(limit) if query_last_name[i] == current_last_name[i]])/(dist + 1)
            if name_score > 0.5:
                name_score_list.append((name, name_score, deserialize(personids)))

    return name_score_list
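
# A minimal, self-contained sketch of the score used above: matching prefix
# characters are weighted 1/2**(i+1) and the sum is damped by the edit
# distance.  plain_levenshtein is a hypothetical stand-in for the `distance`
# helper the real module imports.
def plain_levenshtein(a, b):
    # classic dynamic-programming edit distance
    previous = range(len(b) + 1)
    for i, ca in enumerate(a):
        current = [i + 1]
        for j, cb in enumerate(b):
            current.append(min(previous[j + 1] + 1,        # deletion
                               current[j] + 1,             # insertion
                               previous[j] + (ca != cb)))  # substitution
        previous = current
    return previous[-1]

def toy_name_score(query_last_name, candidate_last_name):
    dist = plain_levenshtein(query_last_name, candidate_last_name)
    limit = min(len(query_last_name), len(candidate_last_name))
    prefix_weight = sum(1 / float(2 ** (i + 1)) for i in range(limit)
                        if query_last_name[i] == candidate_last_name[i])
    return prefix_weight / (dist + 1)

print(toy_name_score('ellis', 'ellis'))  # ~0.97, identical surnames
print(toy_name_score('ellis', 'elvis'))  # ~0.42, one substitution drops it below the 0.5 cut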
def crossref_normalize_name(record):
    """
    Changes the format of author's name (often with initials) to the proper,
    unified one, using bibauthor_name_utils tools
    @return: changed record
    """
    # pattern for removing the spaces between two initials
    pattern_initials = r'([A-Z]\.)\s([A-Z]\.)'
    # first, change the main author
    for field in record_get_field_instances(record, '100'):
        main_author = field[0][0][1]
        new_author = create_normalized_name(split_name_parts(main_author))
        # remove spaces between initials
        # two iterations are required
        for _ in range(2):
            new_author = re.sub(pattern_initials, r'\g<1>\g<2>', new_author)
        position = field[4]
        record_modify_subfield(rec=record, tag='100', subfield_code='a',
                               value=new_author, subfield_position=0,
                               field_position_global=position)

    # then, change additional authors
    for field in record_get_field_instances(record, '700'):
        author = field[0][0][1]
        new_author = create_normalized_name(split_name_parts(author))
        for _ in range(2):
            new_author = re.sub(pattern_initials, r'\g<1>\g<2>', new_author)
        position = field[4]
        record_modify_subfield(rec=record, tag='700', subfield_code='a',
                               value=new_author, subfield_position=0,
                               field_position_global=position)
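
# Small demo of why two re.sub passes are needed above: each match consumes the
# second initial, so with three or more initials every other space survives the
# first pass and only disappears on the second one.
import re

pattern_initials = r'([A-Z]\.)\s([A-Z]\.)'
name = 'Smith, J. R. K.'
first_pass = re.sub(pattern_initials, r'\g<1>\g<2>', name)
second_pass = re.sub(pattern_initials, r'\g<1>\g<2>', first_pass)
print(first_pass)   # 'Smith, J.R. K.' -- the space before K. is still there
print(second_pass)  # 'Smith, J.R.K.'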
def convert_personid():
    from invenio.dbquery import run_sql # oh come on, the whole function will be removed soon
    from itertools import repeat
    chunk = 1000

    old_personid = run_sql("SELECT `personid`, `tag`, `data`, `flag`, `lcul` FROM `aidPERSONID`")

    def flush_papers(args):
        run_sql("INSERT INTO `aidPERSONIDPAPERS` "
                "(`personid`, "
                " `bibref_table`, "
                " `bibref_value`, "
                " `bibrec`, "
                " `name`, "
                " `flag`, "
                " `lcul`) "
                "VALUES " + " , ".join(repeat("(%s, %s, %s, %s, %s, %s, %s)", len(args) / 7))
                , tuple(args))

    def flush_data(args):
        run_sql("INSERT INTO `aidPERSONIDDATA` "
                "(`personid`, "
                " `tag`, "
                " `data`, "
                " `opt1`, "
                " `opt2`) "
                "VALUES " + " , ".join(repeat("(%s, %s, %s, %s, %s)", len(args) / 5))
               , tuple(args))

    paper_args = []
    data_args = []
    for row in old_personid:
        if row[1] == 'paper':
            bibref, rec = row[2].split(',')
            tab, ref = bibref.split(':')
            try:
                name = get_name_by_bibrecref((int(tab), int(ref), int(rec)))
            except:
                continue
            name = split_name_parts(name)
            name = create_normalized_name(name)
            paper_args += [row[0], tab, ref, rec, name, row[3], row[4]]
            if len(paper_args) > chunk:
                flush_papers(paper_args)
                paper_args = []

        elif row[1] == 'gathered_name':
            continue
        else:
            data_args += list(row)
            if len(data_args) > chunk:
                flush_data(data_args)
                data_args = []

    if paper_args:
        flush_papers(paper_args)

    if data_args:
        flush_data(data_args)
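
# Sketch of how the bulk INSERT statements above are assembled: a flat list of
# values becomes one VALUES clause with as many placeholder groups as there are
# rows.  Only the SQL string is built here; run_sql and the actual tables belong
# to the real module.  Note that `len(args) / 7` above relies on Python 2
# integer division.
from itertools import repeat

def build_bulk_insert(table, columns, flat_args):
    rows = len(flat_args) // len(columns)
    placeholders = " , ".join(repeat("(" + ", ".join(["%s"] * len(columns)) + ")", rows))
    return ("INSERT INTO `%s` (%s) VALUES " % (table, ", ".join("`%s`" % c for c in columns))
            + placeholders)

print(build_bulk_insert('aidPERSONIDDATA',
                        ['personid', 'tag', 'data', 'opt1', 'opt2'],
                        [1, 'uid', '42', None, None]))
# INSERT INTO `aidPERSONIDDATA` (`personid`, `tag`, `data`, `opt1`, `opt2`) VALUES (%s, %s, %s, %s, %s)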
def _split_and_index(el):
    name, pids = el
    asciified_name = translate_to_ascii(name)[0]
    split_name = split_name_parts(indexable_name_re.sub(' ', asciified_name))
    indexable_name = create_indexable_name(split_name)
    surname = split_name[0] + ','
    indexable_surname = create_indexable_name([surname, [], [], []])
    return (name, pids, indexable_name, indexable_surname)
def create_bibauthorid_indexer():
    '''
    Constructs the disk-based indexer. It consists of the dense index (which maps a name
    to the set of personids who hold that name) and the inverted lists (which map a qgram
    to the set of name ids that share that qgram).
    '''
    name_pids_dict = get_confirmed_name_to_authors_mapping()
    if not name_pids_dict:
        return

    indexable_name_pids_dict = dict()

    for name in name_pids_dict.keys():
        asciified_name = translate_to_ascii(name)[0]
        indexable_name = create_indexable_name(asciified_name)
        if indexable_name:
            try:
                asciified_name, pids = indexable_name_pids_dict[indexable_name]
                updated_pids = pids | name_pids_dict[name]
                indexable_name_pids_dict[indexable_name] = (asciified_name, updated_pids)
            except KeyError:
                indexable_name_pids_dict[indexable_name] = (asciified_name, name_pids_dict[name])

        surname = split_name_parts(name)[0]
        asciified_surname = translate_to_ascii(surname)[0]
        indexable_surname = create_indexable_name(asciified_surname)
        if indexable_surname:
            try:
                asciified_surname, pids = indexable_name_pids_dict[indexable_surname]
                updated_pids = pids | name_pids_dict[name]
                indexable_name_pids_dict[indexable_surname] = (asciified_surname, updated_pids)
            except KeyError:
                indexable_name_pids_dict[indexable_surname] = (asciified_surname, name_pids_dict[name])

    indexable_names_list = indexable_name_pids_dict.keys()

    # If an exception/error occurs in any of the threads it is not detectable
    # so inter-thread communication is necessary to make it visible.
    q = Queue()
    threads = list()
    threads.append(Thread(target=create_dense_index, args=(indexable_name_pids_dict, indexable_names_list, q)))
    threads.append(Thread(target=create_inverted_lists, args=(indexable_names_list, q)))

    for t in threads:
        t.start()

    for t in threads:
        all_ok, error = q.get(block=True)
        if not all_ok:
            raise error
        q.task_done()

    for t in threads:
        t.join()
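
# Minimal sketch of the error-propagation pattern used above: each worker puts
# an (all_ok, error) pair on a shared queue so the main thread can surface
# failures that would otherwise die silently inside the thread.  The worker
# below is a hypothetical stand-in for create_dense_index/create_inverted_lists,
# and Queue is assumed to be the standard library queue (Queue in Python 2).
from Queue import Queue
from threading import Thread

def worker_that_may_fail(fail, q):
    try:
        if fail:
            raise ValueError("something went wrong in the worker")
        q.put((True, None))
    except Exception as error:
        q.put((False, error))

q = Queue()
threads = [Thread(target=worker_that_may_fail, args=(False, q)),
           Thread(target=worker_that_may_fail, args=(True, q))]
for t in threads:
    t.start()
for t in threads:
    all_ok, error = q.get(block=True)
    if not all_ok:
        print("worker reported: %s" % error)  # the indexer re-raises instead
    q.task_done()
for t in threads:
    t.join()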
def find_personids_by_name(query_string):
    query_string_surname = split_name_parts(query_string)[0]
    
    name_score_list = set(find_personids_by_name1(query_string) + find_personids_by_name1(query_string_surname))
    name_ranking_list = sorted(name_score_list, key=itemgetter(1), reverse=True)
    
    pid_score_list = calculate_pid_score(name_ranking_list)
    pids_ranking_list = sorted(pid_score_list, key=itemgetter(2), reverse=True)

    ranked_pid_name_list = [pid for pid, name, final_score in pids_ranking_list]

    return ranked_pid_name_list    
def fallback_find_personids_by_name_string(target):
    '''
    Search engine to find persons matching the given string.
    The matching is done on the surname first, and on the other names if present.
    A list of pids and found names, ordered by compatibility, is returned.

    @param target: name string, 'surname, names I.'
    @type target: string
    @return: pid list of lists
    [pid, [[name string, occur count, compatibility]]]
    '''
    splitted_name = split_name_parts(target)
    family = splitted_name[0]

    levels = (  # target + '%', #this introduces a weird problem: different results for mele, salvatore and salvatore mele
        family + ',%', family[:-2] + '%', '%' + family + ',%',
        '%' + family[1:-1] + '%')

    if len(family) <= 4:
        levels = [levels[0], levels[2]]

    for lev in levels:
        names = dbinter.get_authors_by_name_regexp(lev)
        if names:
            print "%s" % lev
            break

    is_canonical = False
    if not names:
        names = dbinter.get_authors_by_canonical_name_regexp(target)
        is_canonical = True

    names = groupby(sorted(names))
    names = [(key[0], key[1], len(list(data)),
              soft_compare_names(target, key[1])) for key, data in names]
    names = groupby(names, itemgetter(0))
    names = [(key,
              sorted([(d[1], d[2], d[3])
                      for d in data if (d[3] > 0.5 or is_canonical)],
                     key=itemgetter(2),
                     reverse=True)) for key, data in names]
    names = [name for name in names if name[1]]
    names = sorted(names,
                   key=lambda x: (x[1][0][2], x[1][0][0], x[1][0][1]),
                   reverse=True)

    return names
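
# Quick illustration of the itertools.groupby behaviour the function above
# relies on: groupby only merges *consecutive* equal keys, which is why the
# rows are sorted before the first groupby call.
from itertools import groupby
from operator import itemgetter

rows = [(7, 'Ellis, J.'), (3, 'Mele, S.'), (7, 'Ellis, John')]
unsorted_groups = [(pid, len(list(g))) for pid, g in groupby(rows, itemgetter(0))]
sorted_groups = [(pid, len(list(g))) for pid, g in groupby(sorted(rows), itemgetter(0))]
print(unsorted_groups)  # [(7, 1), (3, 1), (7, 1)] -- pid 7 is split in two
print(sorted_groups)    # [(3, 1), (7, 2)]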
def cache_name_variants_of_authors(author_to_name_and_occurrence_mapping):
    args = list()
    for author, names_and_occurrence in author_to_name_and_occurrence_mapping.iteritems(
    ):
        indexable_names_and_occurrence = dict()
        for name, occurrences in names_and_occurrence.iteritems():
            asciified_name = translate_to_ascii(name)[0]
            indexable_name = create_indexable_name(
                split_name_parts(indexable_name_re.sub(' ', asciified_name)))
            try:
                indexable_names_and_occurrence[indexable_name] += occurrences
            except KeyError:
                indexable_names_and_occurrence[indexable_name] = occurrences

        args += [author, serialize(indexable_names_and_occurrence), 1]

    populate_table('aidDENSEINDEX', ['id', 'personids', 'flag'],
                   args,
                   empty_table_first=False)

import re

from invenio.dbquery import run_sql
from invenio.bibauthorid_name_utils import split_name_parts

# claimnames is expected to hold (personid, name) rows, e.g. the result of a
# prior query against aidPERSONID.
pdata = {}
ndata = {}
artifact_removal = re.compile("[^a-zA-Z0-9]")

for i in claimnames:
    pid = i[0]
    n = i[1]
    if not pid in pdata:
        pdata[pid] = {}
        pdata[pid]['name'] = []
        pdata[pid]['lname'] = set()
        pdata[pid]['olnames'] = set()
        pdata[pid]['vclaims'] = run_sql("select count(id) from aidPERSONID where flag = 2 and tag='paper' and personid = %s", (pid,))[0][0]
    pdata[pid]['name'].append(n)
    clname = artifact_removal.sub("", split_name_parts(n)[0].lower())
    pdata[pid]['lname'].add(clname)
    pdata[pid]['olnames'].add(split_name_parts(n)[0])


for p in pdata:
    if len(pdata[p]['lname']) > 1:
        print "multiple names in", pdata[p]['lname']
    l = list(pdata[p]['lname'])[0]
    if l in ndata:
        ndata[l]['pc'] += 1
        ndata[l]['vp'] += pdata[p]['vclaims']
    else:
        ndata[l] = {}
        ndata[l]['pc'] = 1
        ndata[l]['vp'] = pdata[p]['vclaims']
 def test_create_normalized_name(self):
     for tn in self.tc.keys():
         self.assertEqual(create_normalized_name(split_name_parts(tn)), self.tc[tn])
from invenio.dbquery import run_sql
import invenio.bibauthorid_name_utils as nu

names = run_sql("select name from aidAUTHORNAMES")
fnames = set([i[0].split(',')[0] for i in names])
splitnames = []

for i in names:
    splitnames.append(nu.split_name_parts(i[0]))

multinames = 0
cntnames = 0
cntinitonly = 0
multiinitials = 0
fnlt5 = 0          
fnonly = 0

for i in splitnames:
    if len(i[0]) < 5:
        fnlt5 += 1
    if i[2] and len(i[2]) > 1:
        multinames += 1
    if i[2]:
        cntnames += 1
    else:
        if i[1] and len(i[1]) > 1:
            multiinitials += 1
        if i[1]:
            cntinitonly += 1
        else:
            fnonly += 1
def rabbit(bibrecs, check_invalid_papers=False, personids_to_update_extids=None):
    '''
    @param bibrecs: an iterable full of bibrecs
    @type bibrecs: an iterable of ints
    @return: none
    '''
    if bconfig.RABBIT_USE_CACHED_PID:
        PID_NAMES_CACHE = get_name_string_to_pid_dictionary()

        def find_pids_by_exact_names_cache(name):
            try:
                return zip(PID_NAMES_CACHE[name])
            except KeyError:
                return []

        def add_signature_using_names_cache(sig, name, pid):
            try:
                PID_NAMES_CACHE[name].add(pid)
            except KeyError:
                PID_NAMES_CACHE[name] = set([pid])
            _add_signature(sig, name, pid)

        def new_person_from_signature_using_names_cache(sig, name):
            pid = get_new_personid()
            add_signature_using_names_cache(sig, name, pid)
            return pid

        add_signature = add_signature_using_names_cache
        new_person_from_signature = new_person_from_signature_using_names_cache
        find_pids_by_exact_name = find_pids_by_exact_names_cache
    else:
        add_signature = _add_signature
        new_person_from_signature = _new_person_from_signature
        find_pids_by_exact_name = _find_pids_by_exact_name

    compare_names = cached_sym(lambda x: x)(comp_names)
    # fast assign threshold
    threshold = 0.80

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_bibrecs()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    if (bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS and
        len(bibrecs) > bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS_THRESHOLD):
        populate_partial_marc_caches()
        SWAPPED_GET_GROUPED_RECORDS = True
    else:
        SWAPPED_GET_GROUPED_RECORDS = False

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    for idx, rec in enumerate(bibrecs):
        task_sleep_now_if_required(True)
        update_status(float(idx) / len(bibrecs), "%d/%d current: %d" % (idx, len(bibrecs), rec))
        if rec in deleted:
            delete_paper_from_personid(rec)
            continue

        markrefs = frozenset(chain(izip(cycle([100]), imap(itemgetter(0), get_authors_from_paper(rec))),
                                   izip(cycle([700]), imap(itemgetter(0), get_coauthors_from_paper(rec)))))

        personid_rows = [map(int, row[:3]) + [row[4]] for row in get_signatures_from_rec(rec)]
        personidrefs_names = dict(((row[1], row[2]), row[3]) for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = dict((new, create_normalized_name(split_name_parts(get_name_by_bibrecref(new))))
                                    for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[compare_names(new_signatures_names[new], personidrefs_names[old])
                  for old in old_signatures] for new in new_signatures]

        # [(new_signatures, old_signatures)]
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix) if score > threshold]
        for new, old in best_match:
            modify_signature(old, rec, new, new_signatures_names[new])

        remove_sigs(tuple(list(old) + [rec]) for old in old_signatures)

        not_matched = frozenset(new_signatures) - frozenset(map(itemgetter(0), best_match))

        if not_matched:
            used_pids = set(r[0] for r in personid_rows)

        for sig in not_matched:
            name = new_signatures_names[sig]
            matched_pids = []
            if USE_EXT_IDS:
                if USE_INSPIREID:
                    inspire_id = get_inspire_id(sig + (rec,))
                    if inspire_id:
                        matched_pids = list(get_person_with_extid(inspire_id[0]))
                if matched_pids:
                    add_signature(list(sig) + [rec], name, matched_pids[0][0])
                    updated_pids.add(matched_pids[0][0])
                    continue

            matched_pids = find_pids_by_exact_name(name)
            matched_pids = [p for p in matched_pids if int(p[0]) not in used_pids]

            if not matched_pids:
                new_pid = new_person_from_signature(list(sig) + [rec], name)
                used_pids.add(new_pid)
                updated_pids.add(new_pid)

            else:
                add_signature(list(sig) + [rec], name, matched_pids[0][0])
                used_pids.add(matched_pids[0][0])
                updated_pids.add(matched_pids[0][0])

    update_status_final()

    if personids_to_update_extids:
        updated_pids |= personids_to_update_extids
    if updated_pids: # an empty set will update all canonical_names
        update_personID_canonical_names(updated_pids)
        update_personID_external_ids(updated_pids, limit_to_claimed_papers=bconfig.LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS)

    if SWAPPED_GET_GROUPED_RECORDS:
        destroy_partial_marc_caches()
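
# Hedged sketch of what rabbit() expects from maximized_mapping: triples of
# (new_index, old_index, score) describing a one-to-one assignment between new
# and old signatures.  The greedy matcher below is only an illustrative
# stand-in, not the bibauthorid implementation, which solves the assignment
# problem properly.
def greedy_mapping(matrix):
    pairs = sorted(((score, new, old)
                    for new, row in enumerate(matrix)
                    for old, score in enumerate(row)), reverse=True)
    used_new, used_old, mapping = set(), set(), []
    for score, new, old in pairs:
        if new not in used_new and old not in used_old:
            used_new.add(new)
            used_old.add(old)
            mapping.append((new, old, score))
    return mapping

matrix = [[0.9, 0.2],   # name similarity of new signature 0 vs old signatures 0 and 1
          [0.85, 0.8]]
print(greedy_mapping(matrix))  # [(0, 0, 0.9), (1, 1, 0.8)]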
from invenio.dbquery import run_sql
from invenio.bibauthorid_name_utils import split_name_parts
from invenio.bibauthorid_dbinterface import get_all_authors
from invenio.bibauthorid_dbinterface import get_all_bibrecs
from invenio.bibauthorid_general_utils import update_status

print("Getting records...")
#records = [p[0] for p in run_sql("select id from bibrec")]
records = get_all_bibrecs() 
lastnames = []

for index, bibrec in enumerate(records):
    if index % 1000 == 0:
        percent = float(index) / len(records)
        update_status(percent, "%s of all %s records done." % (index, len(records)))

    for author in get_all_authors(bibrec):
        lastnames.append(split_name_parts(author)[0])

fp = open("/tmp/lastnames.txt", "w")
fp.write("\n".join(lastnames))
fp.close()
import re
import sys

from numpy import average, linalg, zeros
from invenio.dbquery import run_sql

# zeros, linalg and average are taken from numpy here; msg() and the
# PRINT_STATS flag are assumed to be defined elsewhere in this benchmarking
# script.
def worker(i, gpid_queue, qdata_queue):

    if gpid_queue.empty():
        return

    gpid = gpid_queue.get()

    gpdata = {}
    rpdata = {}
    
    if not PRINT_STATS:
        sys.stdout.write(".")
        sys.stdout.flush()
        
    artifact_removal = re.compile("[^a-zA-Z0-9]")
    gpdata[gpid] = {}
    pdata = gpdata[gpid]
    gpids_scores = {}
    gperson_papers = []
    gperson_papers_set = set()
    total_papers = set()

    gpapers = run_sql("select data from aidGOLD where tag='paper' and personid=%s", (gpid,))
    msg('|-- %s: Starting w/ %s gold papers' % (i, len(gpapers)))


    for gp in gpapers:
        total_papers.add(gp[0])
        gperson_papers.append(gp[0])
        gperson_papers_set.add(gp[0])

    rpids = run_sql("select distinct o.personid from aidRESULTS o, (select i.data as idata from aidGOLD i "
                    "where tag='paper' and i.personid=%s) as dummy where dummy.idata = o.data", (gpid,))
    
    if not rpids:
        msg("%s: Nothing to do. No person entities in result set" % i)
        return

    msg('  |-- %s: Collecting data for rpids' % i)

    for rpid in rpids:
        rpp = run_sql("select data from aidRESULTS where tag='paper' and personid=%s", (rpid[0],))
        rpdata[rpid[0]] = {}
        rpdata[rpid[0]]['papers'] = []

        for rp in rpp:
            total_papers.add(rp[0])
            rpdata[rpid[0]]['papers'].append(rp[0])
    
    # construct reference vectors for total papers and gold person papers for the C measure 
    total_papers_vector = list(total_papers)
    gp_papers_vector = zeros(len(total_papers_vector))

    for p in gperson_papers:
        gp_papers_vector[total_papers_vector.index(p)] = 1

    msg('  |-- %s: Performing QA data for %s rpids' % (i,len(rpids)))

    # assess quality for each result person for this gold person
    for rpid in rpdata:
        rpd = rpdata[rpid]
        rpaperset = set(rpd['papers'])
        
        # F1 measure...
        true_positives = rpaperset.intersection(gperson_papers_set)
        false_positives = set()
        true_negatives = set()
        false_negatives = set()
        
        for rpp in rpaperset:
            if not rpp in gperson_papers_set:
                false_positives.add(rpp)
        
        for gpp in gperson_papers_set:
            if not gpp in rpaperset:
                false_negatives.add(gpp)
        
        tp = float(len(true_positives))
        fp = float(len(false_positives))
        fn = float(len(false_negatives))
        
        precision = tp / max(tp + fp, 1.0)
        recall = tp / max(tp + fn, 1.0)
    
        f1 = 2.0 * (float(precision * recall) / max(float(precision + recall), 1.0))

        # C measure...
        c = 0.0
        if len(total_papers) > 0 and len(rpaperset) > 0:
            k = zeros(len(total_papers_vector))
    
            for p in rpd['papers']:
                k[total_papers_vector.index(p)] = 1
            
            c = linalg.norm(gp_papers_vector - k) #* (tp / float(len(total_papers_vector)))

        #/ float(len(total_papers_vector))
        
        # store results for this person:
        pdata[rpid] = {}

        pdata[rpid]['true_positives'] = tp
        pdata[rpid]['false_positives'] = fp
        pdata[rpid]['false_negatives'] = fn
        pdata[rpid]['true_positives_set'] = true_positives
        pdata[rpid]['false_positives_set'] = false_positives
        pdata[rpid]['false_negatives_set'] = false_negatives

        pdata[rpid]['f1'] = f1
        pdata[rpid]['c'] = c
        
        # print the f1 score parts
        msg('    |-- GOLDPID %s || True Positives for ResultPID %s: %s' % (gpid, rpid, str(true_positives))) 
        msg('    |-- GOLDPID %s || False Positives for ResultPID %s: %s' % (gpid, rpid, str(false_positives))) 
        msg('    |-- GOLDPID %s || False Negatives for ResultPID %s: %s' % (gpid, rpid, str(false_negatives))) 

    pdata['gpid_f1'] = 0.0
    pdata['gpid_c'] = 0.0
    gpid_f1 = []
    gpid_c = []
 
    for rpid in rpdata:
        gpid_f1.append(pdata[rpid]['f1'])
        gpid_c.append((pdata[rpid]['c'] * pdata[rpid]['true_positives'])/ float(len(gp_papers_vector)))
    
    pdata['gpid_f1'] = average(gpid_f1)
    pdata['gpid_c'] = average(gpid_c)
    pdata['lastnames'] = set()

    all_refs = set([br.split(',')[0] for br in gperson_papers])
    all_last_names = set()
    all_names = set()

    msg('  |-- %s: Finding last name for gPid' % i)

    for ref in all_refs:
        try:
            all_names.add(run_sql("select o.name from aidAUTHORNAMES o, "
                              "(select i.name_id as nid from aidAUTHORNAMESBIBREFS i "
                              "where bibref=%s) as dummy where o.id = dummy.nid", (ref,))[0][0])
        except IndexError:
            msg("Ignoring bibref (no name found): %s" % (ref))

    for name in all_names:
        cln = artifact_removal.sub("", split_name_parts(name)[0].lower())
        pdata['lastnames'].add(cln)

    pdata['lastname'] = list(pdata['lastnames'])[0]

    qdata_queue.put(gpdata)

    msg('  |-- %s: Done with golden pid %s.' % (i,gpid))

    return
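
# Self-contained recap of the per-person quality measures computed above:
# precision, recall and F1 from plain set operations on paper identifiers, with
# the same max(..., 1.0) guards against empty sets.
def paper_f1(gold_papers, result_papers):
    tp = float(len(result_papers & gold_papers))
    fp = float(len(result_papers - gold_papers))
    fn = float(len(gold_papers - result_papers))
    precision = tp / max(tp + fp, 1.0)
    recall = tp / max(tp + fn, 1.0)
    return 2.0 * (precision * recall) / max(precision + recall, 1.0)

gold = set(['100:1,11', '100:2,12', '700:3,13'])
result = set(['100:1,11', '100:2,12', '700:9,99'])
print(paper_f1(gold, result))  # 2 tp, 1 fp, 1 fn -> F1 = 2/3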
def rabbit(bibrecs, check_invalid_papers=False, personids_to_update_extids=None, verbose=False):
    '''
    @param bibrecs: an iterable full of bibrecs
    @type bibrecs: an iterable of ints
    @return: none
    '''
    logfile = open('/tmp/RABBITLOG-%s' % str(now()).replace(" ", "_"), 'w')
    logfile.write("RABBIT %s running on %s \n" % (str(now()), str(bibrecs)))

    def logwrite(msg, is_error):
        verb = 9
        if is_error or verbose:
            verb = 1
        write_message(msg, verbose=verb)

    if bconfig.RABBIT_USE_CACHED_PID:
        PID_NAMES_CACHE = get_name_to_authors_mapping()

        def find_pids_by_exact_names_cache(name):
            try:
                return zip(PID_NAMES_CACHE[name])
            except KeyError:
                return []

        def add_signature_using_names_cache(sig, name, pid):
            try:
                PID_NAMES_CACHE[name].add(pid)
            except KeyError:
                PID_NAMES_CACHE[name] = set([pid])
            _add_signature(sig, name, pid)

        def new_person_from_signature_using_names_cache(sig, name):
            pid = get_free_author_id()
            add_signature_using_names_cache(sig, name, pid)
            return pid

        add_signature = add_signature_using_names_cache
        new_person_from_signature = new_person_from_signature_using_names_cache
        find_pids_by_exact_name = find_pids_by_exact_names_cache
    else:
        add_signature = _add_signature
        new_person_from_signature = _new_person_from_signature
        find_pids_by_exact_name = _find_pids_by_exact_name

    compare_names = cached_sym(lambda x: x)(comp_names)
    # fast assign threshold
    threshold = 0.80

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_papers()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    if (bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS and
        len(bibrecs) > bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS_THRESHOLD):
        populate_partial_marc_caches()
        SWAPPED_GET_GROUPED_RECORDS = True
    else:
        SWAPPED_GET_GROUPED_RECORDS = False

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    for idx, rec in enumerate(bibrecs):

        logwrite("\nConsidering %s" % str(rec), False)

        if idx%200 == 0:
            task_sleep_now_if_required(True)

            update_status(float(idx) / len(bibrecs), "%d/%d current: %d" % (idx, len(bibrecs), rec))
            task_update_progress("%d/%d current: %d" % (idx, len(bibrecs), rec))

        if rec in deleted:
            logwrite(" - Record was deleted, removing from pid and continuing with next record", True)
            remove_papers([rec])
            continue


        markrefs = frozenset(chain(izip(cycle([100]), imap(itemgetter(0), get_author_refs_of_paper(rec))),
                                   izip(cycle([700]), imap(itemgetter(0), get_coauthor_refs_of_paper(rec)))))

        personid_rows = [map(int, row[:3]) + [row[4]] for row in get_signatures_of_paper(rec)]
        personidrefs_names = dict(((row[1], row[2]), row[3]) for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = dict((new, create_normalized_name(split_name_parts(get_name_by_bibref(new))))
                                    for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[compare_names(new_signatures_names[new], personidrefs_names[old])
                  for old in old_signatures] for new in new_signatures]

        logwrite(" - Old signatures: %s" % str(old_signatures), bool(old_signatures))
        logwrite(" - New signatures: %s" % str(new_signatures), bool(new_signatures))
        logwrite(" - Matrix: %s" % str(matrix), bool(matrix))

        # [(new_signatures, old_signatures)]
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix) if score > threshold]

        logwrite(" - Best match: %s " % str(best_match), bool(best_match))

        for new, old in best_match:
            logwrite(" - - Moving signature: %s on %s to %s as %s" % (old, rec, new, new_signatures_names[new]), True)
            modify_signature(old, rec, new, new_signatures_names[new])

        remove_signatures(tuple(list(old) + [rec]) for old in old_signatures)

        not_matched = frozenset(new_signatures) - frozenset(map(itemgetter(0), best_match))

        pids_having_rec = set([int(row[0]) for row in get_signatures_of_paper(rec)])
        logwrite(" - Not matched: %s" % str(not_matched), bool(not_matched))

        if not_matched:
            used_pids = set(r[0] for r in personid_rows)

        for sig in not_matched:
            name = new_signatures_names[sig]
            matched_pids = list()
            if USE_EXT_IDS:
                if USE_INSPIREID:
                    inspire_id = get_inspire_id_of_signature(sig + (rec,))
                    if inspire_id:
                        matched_pids = list(get_author_by_external_id(inspire_id[0]))
                        if matched_pids and int(matched_pids[0][0]) in pids_having_rec:
                            matched_pids = list()
                if matched_pids:
                    add_signature(list(sig) + [rec], name, matched_pids[0][0])
                    updated_pids.add(matched_pids[0][0])
                    pids_having_rec.add(matched_pids[0][0])
                    continue

            matched_pids = find_pids_by_exact_name(name)
            matched_pids = [p for p in matched_pids if int(p[0]) not in used_pids]

            if not matched_pids or int(matched_pids[0][0]) in pids_having_rec:
                new_pid = new_person_from_signature(list(sig) + [rec], name)
                used_pids.add(new_pid)
                updated_pids.add(new_pid)

            else:
                add_signature(list(sig) + [rec], name, matched_pids[0][0])
                used_pids.add(matched_pids[0][0])
                updated_pids.add(matched_pids[0][0])
                pids_having_rec.add(matched_pids[0][0])

        logwrite('Finished with %s' % str(rec), False)

    update_status_final()

    if personids_to_update_extids:
        updated_pids |= personids_to_update_extids
    if updated_pids: # an empty set will update all canonical_names
        update_canonical_names_of_authors(updated_pids)
        update_external_ids_of_authors(updated_pids, limit_to_claimed_papers=bconfig.LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS)

    if SWAPPED_GET_GROUPED_RECORDS:
        destroy_partial_marc_caches()

    remove_empty_authors()
def main():
    """
    Reads the import file and verifies the MD5 hash.
    For each line in the import file:
        find new record from bibcode, find new ref from name on record
        find old row in personid tables
        copy row with new authorref (tab:bibref,rec) to temp table
    overwrite personid tables w/ temp table
    """
    ## create temporary tables...
    print "Creating temporary tables..."
    create_temp_pid_sql_table()
    create_temp_piddata_sql_table()
    create_temp_user_input_log_sql_table()
    
    ## fill temp tables w/ static values...
    print "Filling temporary tables with static, unchanged content"
    copy_unaltered_piddata_rows_to_temp()
    copy_unaltered_user_input_log_table_rows_to_temp()
    ## compile regexp for line break removal
    nlr = re.compile('[\n\r]+')

    #verify file integrity
    print ("Verifying file integrity of %s with"
           " MD5 checksum from %s" % (IMPORT_FILE_NAME, IMPORT_MD5_FILE_NAME))
    fp = open(IMPORT_FILE_NAME, "rb")
    fmd5 = md5_for_file(fp)
    fp.close()

    fp = open(IMPORT_MD5_FILE_NAME, "r")
    vmd5 = fp.read()
    fp.close()
    
    if not fmd5 == vmd5:
        print "WARNING: Detected a disturbance in the file. Will exit here."
        return

    total_lines = file_len()
    fp = open(IMPORT_FILE_NAME, "r")
    print "Processing file %s..." % IMPORT_FILE_NAME

    for index, line in enumerate(fp.readlines()):
#        if index == 100:
#            break
        if index % 5000 == 0:
            percent = float(index) / float(total_lines)
            update_status(percent, "%s of %s lines processed in %s" % (index, total_lines, IMPORT_FILE_NAME))

        new_ref = None
        tab1, old_ref, old_rec, tab2, enname, bibcode = line.split("    ")
        
        assert tab1 == tab2

        if tab1 == "table":
            continue

        name = base64.b64decode(enname)
#        name = nq.sub("", name)
        bibcode = nlr.sub("", bibcode)
        new_rec = get_bibrec_from_bibcode(bibcode)

        for ref in get_authorrefs_and_names_from_bibrec(new_rec):
#            refname = create_normalized_name(split_name_parts(ref[2]))
            refname = ref[2]

            if refname == name and str(ref[0]) == tab1:
                #MySQL equivalent: col_name COLLATE utf8_bin = 'Case SenSitive name'
                new_ref = ref[1]
        
        if not new_ref:
            print "WARN: Authorref not found for name %s on new record %s?!" % (name, new_rec)
            continue

        # get personid, flag, lcul and last_updated from old aidPERSONIDPAPERS
        old_data = find_old_pidtable_row(tab1, old_ref, old_rec)

        if old_data:
            ## prepare data in temporary tables...
            pid, flag, lcul, lupdate = old_data
            old_authorref = "%s:%s,%s" % (tab1, old_ref, old_rec)
            new_authorref = "%s:%s,%s" % (tab1, new_ref, new_rec)
            ## Transform the name into a more consistent form
            inname = create_normalized_name(split_name_parts(name))
            ## Insert transformed data into temp tables...
            insert_into_temp_table(pid, tab1, new_ref, new_rec, inname, flag, lcul, lupdate)
            update_temp_piddata_table(old_authorref, new_authorref)
            update_temp_user_input_log_table(old_authorref, new_authorref)
        else:
            print "WARN: %s does not exist in db!" % ([tab1, old_ref, old_rec])

        # The following assertions hold only when the import is applied to the
        # same data set; they run only when RUN_IN_TEST_MODE is set, for testing/debugging.
        try:
            if RUN_IN_TEST_MODE:
                assert str(old_rec) == str(new_rec)
                assert str(old_ref) == str(new_ref)
                pass
        except AssertionError, e:
            print "ERROR: ", e
            print "%s:%s,%s vs. %s:%s,%s on %s:%s" % (tab1, old_ref, old_rec, tab1, new_ref, new_rec, bibcode, name)
 def test_split_name_parts(self):
     for tn in self.names_split_name_parts.keys():
         self.assertEqual(split_name_parts(tn),
                          self.names_split_name_parts[tn])
def find_personids_by_name(query_string, trust_is_operating=False):
    '''
    It returns all the authors that match the query string, sorted by compatibility.

    WARNING: this is just querying the search engine, for a proper person search query one
    should use person_search_engine_query in bibauthorid_dbinterface

    @param query_string: the query string
    @type query_string: str

    @return: author identifiers
    @rtype: list [int,]
    '''
    if not trust_is_operating:
        search_engine_is_oper = search_engine_is_operating()
        if not search_engine_is_oper:
            return None

    asciified_qstring = translate_to_ascii(query_string)[0]
    indexable_qstring = create_indexable_name(
        split_name_parts(indexable_name_re.sub(' ', asciified_qstring)))

    surname = split_name_parts(query_string)[0] + ','
    asciified_qstring_sur = translate_to_ascii(surname)[0]
    indexable_qstring_sur = create_indexable_name(
        split_name_parts(indexable_name_re.sub(' ', asciified_qstring_sur)))

    qstring_first_names = indexable_qstring.split(
        ' ')[len(indexable_qstring_sur.split(' ')):]

    string_ids = solve_T_occurence_problem(
        indexable_qstring) | solve_T_occurence_problem(indexable_qstring_sur)
    if not string_ids:
        return list()

    strings_to_ids_mapping = get_indexed_strings(string_ids)

    passing_string_ids, surname_score_cache = remove_false_positives(
        indexable_qstring_sur, strings_to_ids_mapping)

    if not passing_string_ids:
        return list()

    author_groups = get_author_groups_from_string_ids(passing_string_ids)

    authors = set()
    for author_group in author_groups:
        authors |= set(deserialize(author_group[0]))

    author_to_names_mapping = get_name_variants_for_authors(authors)

    surname_score_clusters = create_surname_score_clusters(
        indexable_qstring_sur, author_to_names_mapping, surname_score_cache,
        strings_to_ids_mapping)

    sorted_authors = sort_authors(indexable_qstring, qstring_first_names,
                                  surname_score_clusters,
                                  author_to_names_mapping,
                                  strings_to_ids_mapping)

    return sorted_authors
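
# How the first-name tokens are peeled off above: everything after the number
# of surname tokens in the indexable form of the query.  The two indexable
# strings here are hypothetical examples of that form.
indexable_qstring = 'van der berg j p'
indexable_qstring_sur = 'van der berg'
qstring_first_names = indexable_qstring.split(' ')[len(indexable_qstring_sur.split(' ')):]
print(qstring_first_names)  # ['j', 'p']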
def arxiv_login(req, picked_profile=None):
    '''
    Log in through arXiv. If the user is already associated with a personid, returns that
    personid. If the user has no pid, try to guess which personid to associate based on the
    surname and the papers from arXiv. If no compatible person is found, creates a new person.
    At the end of the process a ticket is opened for the user, claiming the papers from arXiv.
    NOTE: the user will find the open ticket and will have to go through the final review
    before it is committed.

    @param req: Apache request object
    @type req: Apache request object

    @return: Returns the pid resulting in the process
    @rtype: int
    '''
    def session_bareinit(req):
        session = get_session(req)
        try:
            pinfo = session["personinfo"]
            if 'ticket' not in pinfo:
                pinfo["ticket"] = []
        except KeyError:
            pinfo = dict()
            session['personinfo'] = pinfo
            pinfo["ticket"] = []
        session.dirty = True

    session_bareinit(req)
    session = get_session(req)

    pinfo = session['personinfo']
    ticket = session['personinfo']['ticket']

    uinfo = collect_user_info(req)
    pinfo['external_first_entry'] = False

    try:
        name = uinfo['external_firstname']
    except KeyError:
        name = ''
    try:
        surname = uinfo['external_familyname']
    except KeyError:
        surname = ''

    if surname:
        session['personinfo']['arxiv_name'] = nameapi.create_normalized_name(
                                          nameapi.split_name_parts(surname + ', ' + name))
    else:
        session['personinfo']['arxiv_name'] = ''

    session.dirty = True

    try:
        arxiv_p_ids = uinfo['external_arxivids'].split(';')
    except KeyError:
        arxiv_p_ids = []

    #'external_arxivids': 'hep-th/0112017;hep-th/0112020',
    #'external_familyname': 'Weiler',
    #'external_firstname': 'Henning',

    try:
        found_bibrecs = set(reduce(add, [perform_request_search(p='037:' + str(arx), of='id', rg=0)
                                         for arx in arxiv_p_ids]))
    except (IndexError, TypeError):
        found_bibrecs = set()

    #found_bibrecs = [567700, 567744]

    uid = getUid(req)
    pid, pid_found = dbapi.get_personid_from_uid([[uid]])

    if pid_found:
        pid = pid[0]
    else:
        if picked_profile is None:
            top5_list = dbapi.find_top5_personid_for_new_arXiv_user(found_bibrecs,
                nameapi.create_normalized_name(nameapi.split_name_parts(surname + ', ' + name)))
            return ("top5_list", top5_list)
        else:
            pid = dbapi.check_personids_availability(picked_profile, uid)

    pid_bibrecs = set([i[0] for i in dbapi.get_all_personids_recs(pid, claimed_only=True)])
    missing_bibrecs = found_bibrecs - pid_bibrecs
    #present_bibrecs = found_bibrecs.intersection(pid_bibrecs)

    #assert len(found_bibrecs) == len(missing_bibrecs) + len(present_bibrecs)

    tempticket = []
    #now we have to open the tickets...
    #person_papers contains the papers which are already assigned to the person and came from arXiv,
    #they can be claimed regardless

    for bibrec in missing_bibrecs:
        tempticket.append({'pid':pid, 'bibref':str(bibrec), 'action':'confirm'})

    #check if ticket targets (bibref for pid) are already in ticket
    for t in list(tempticket):
        for e in list(ticket):
            if e['pid'] == t['pid'] and e['bibref'] == t['bibref']:
                ticket.remove(e)
        ticket.append(t)

    session.dirty = True

    if picked_profile is not None and picked_profile != pid and picked_profile != -1:
        return ("chosen pid not available", pid)
    elif picked_profile is not None and picked_profile == pid and picked_profile != -1:
        return ("pid assigned by user", pid)
        return ("pid assigned by user", pid)
    else:
        return ("pid", pid)
def rabbit(bibrecs,
           check_invalid_papers=False,
           personids_to_update_extids=None,
           verbose=False):
    '''
    @param bibrecs: an iterable full of bibrecs
    @type bibrecs: an iterable of ints
    @return: none
    '''
    logfile = open('/tmp/RABBITLOG-%s' % str(now()).replace(" ", "_"), 'w')
    logfile.write("RABBIT %s running on %s \n" % (str(now()), str(bibrecs)))

    def logwrite(msg, is_error):
        verb = 9
        if is_error or verbose:
            verb = 1
        write_message(msg, verbose=verb)

    if bconfig.RABBIT_USE_CACHED_PID:
        PID_NAMES_CACHE = get_name_to_authors_mapping()

        def find_pids_by_exact_names_cache(name):
            try:
                return zip(PID_NAMES_CACHE[name])
            except KeyError:
                return []

        def add_signature_using_names_cache(sig, name, pid):
            try:
                PID_NAMES_CACHE[name].add(pid)
            except KeyError:
                PID_NAMES_CACHE[name] = set([pid])
            _add_signature(sig, name, pid)

        def new_person_from_signature_using_names_cache(sig, name):
            pid = get_free_author_id()
            add_signature_using_names_cache(sig, name, pid)
            return pid

        add_signature = add_signature_using_names_cache
        new_person_from_signature = new_person_from_signature_using_names_cache
        find_pids_by_exact_name = find_pids_by_exact_names_cache
    else:
        add_signature = _add_signature
        new_person_from_signature = _new_person_from_signature
        find_pids_by_exact_name = _find_pids_by_exact_name

    compare_names = cached_sym(lambda x: x)(comp_names)
    # fast assign threshold
    threshold = 0.80

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_papers()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    if (bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS and len(bibrecs) >
            bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS_THRESHOLD):
        populate_partial_marc_caches()
        SWAPPED_GET_GROUPED_RECORDS = True
    else:
        SWAPPED_GET_GROUPED_RECORDS = False

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    for idx, rec in enumerate(bibrecs):

        logwrite("\nConsidering %s" % str(rec), False)

        if idx % 200 == 0:
            task_sleep_now_if_required(True)

            update_status(
                float(idx) / len(bibrecs),
                "%d/%d current: %d" % (idx, len(bibrecs), rec))
            task_update_progress("%d/%d current: %d" %
                                 (idx, len(bibrecs), rec))

        if rec in deleted:
            logwrite(
                " - Record was deleted, removing from pid and continuing with next record",
                True)
            remove_papers([rec])
            continue

        markrefs = frozenset(
            chain(
                izip(cycle([100]),
                     imap(itemgetter(0), get_author_refs_of_paper(rec))),
                izip(cycle([700]),
                     imap(itemgetter(0), get_coauthor_refs_of_paper(rec)))))
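        # markrefs: author bibrefs currently present in the MARC record, paired
        # with their field tag (100 = first author, 700 = co-authors).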

        personid_rows = [
            map(int, row[:3]) + [row[4]]
            for row in get_signatures_of_paper(rec)
        ]
        personidrefs_names = dict(
            ((row[1], row[2]), row[3]) for row in personid_rows)
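        # personidrefs_names maps each stored (tag, bibref) signature to the name
        # it was recorded under; diffing its keys against markrefs yields the
        # signatures that are new in MARC and those that have disappeared.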

        personidrefs = frozenset(personidrefs_names.keys())
        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = dict(
            (new,
             create_normalized_name(split_name_parts(get_name_by_bibref(new))))
            for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[
            compare_names(new_signatures_names[new], personidrefs_names[old])
            for old in old_signatures
        ] for new in new_signatures]
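        # For illustration (hypothetical scores): with two new signatures and one
        # old signature the matrix could be [[0.95], [0.10]]. maximized_mapping
        # returns (new_index, old_index, score) triples for the assignment that
        # maximizes the total score; only pairs above `threshold` become matches.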

        logwrite(" - Old signatures: %s" % str(old_signatures),
                 bool(old_signatures))
        logwrite(" - New signatures: %s" % str(new_signatures),
                 bool(new_signatures))
        logwrite(" - Matrix: %s" % str(matrix), bool(matrix))

        # [(new_signatures, old_signatures)]
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix)
                      if score > threshold]

        logwrite(" - Best match: %s " % str(best_match), bool(best_match))

        for new, old in best_match:
            logwrite(
                " - - Moving signature: %s on %s to %s as %s" %
                (old, rec, new, new_signatures_names[new]), True)
            modify_signature(old, rec, new, new_signatures_names[new])

        remove_signatures(tuple(list(old) + [rec]) for old in old_signatures)

        not_matched = frozenset(new_signatures) - frozenset(
            map(itemgetter(0), best_match))

        pids_having_rec = set(
            [int(row[0]) for row in get_signatures_of_paper(rec)])
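        # pids_having_rec: authors that already have a signature on this record;
        # consulted below so that two different signatures of the same paper are
        # never attached to the same author.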
        logwrite(" - Not matched: %s" % str(not_matched), bool(not_matched))

        if not_matched:
            used_pids = set(r[0] for r in personid_rows)

        for sig in not_matched:
            name = new_signatures_names[sig]
            matched_pids = list()
            if USE_EXT_IDS:
                if USE_INSPIREID:
                    inspire_id = get_inspire_id_of_signature(sig + (rec, ))
                    if inspire_id:
                        matched_pids = list(
                            get_author_by_external_id(inspire_id[0]))
                        if matched_pids and int(
                                matched_pids[0][0]) in pids_having_rec:
                            matched_pids = list()
                if matched_pids:
                    add_signature(list(sig) + [rec], name, matched_pids[0][0])
                    updated_pids.add(matched_pids[0][0])
                    pids_having_rec.add(matched_pids[0][0])
                    continue

            matched_pids = find_pids_by_exact_name(name)
            matched_pids = [
                p for p in matched_pids if int(p[0]) not in used_pids
            ]

            if not matched_pids or int(matched_pids[0][0]) in pids_having_rec:
                new_pid = new_person_from_signature(list(sig) + [rec], name)
                used_pids.add(new_pid)
                updated_pids.add(new_pid)

            else:
                add_signature(list(sig) + [rec], name, matched_pids[0][0])
                used_pids.add(matched_pids[0][0])
                updated_pids.add(matched_pids[0][0])
                pids_having_rec.add(matched_pids[0][0])

        logwrite('Finished with %s' % str(rec), False)

    update_status_final()

    if personids_to_update_extids:
        updated_pids |= personids_to_update_extids
    if updated_pids:  # an empty set will update all canonical_names
        update_canonical_names_of_authors(updated_pids)
        update_external_ids_of_authors(
            updated_pids,
            limit_to_claimed_papers=bconfig.LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS)

    if SWAPPED_GET_GROUPED_RECORDS:
        destroy_partial_marc_caches()

    remove_empty_authors()
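
# A minimal usage sketch (hypothetical record ids; assumes the same imports as
# the example above). `rabbit` is the function defined directly above.
if __name__ == '__main__':
    # re-run the author-paper matching for two records; canonical names and
    # external ids of every author touched by the run are refreshed at the end
    rabbit([567700, 567744], check_invalid_papers=False, verbose=True)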
Beispiel #30
def arxiv_login(req, picked_profile=None):
    '''
    Log in through arXiv. If the user is already associated with a personid,
    returns that personid. If the user has no pid, tries to guess which personid
    to associate based on the surname and the papers coming from arXiv. If no
    compatible person is found, creates a new person.
    At the end of the process a ticket is opened for the user, claiming the
    papers coming from arXiv.
    !!! The user will find the open ticket, which will require them to go
    through the final review before it gets committed.

    @param req: Apache request object
    @type req: Apache request object
    @param picked_profile: personid explicitly picked by the user, if any
    @type picked_profile: int

    @return: a (status, value) tuple; status is one of "pid",
        "pid assigned by user" or "chosen pid not available" (with a pid as
        value), or "top5_list" (with a list of candidate profiles as value)
    @rtype: tuple
    '''
    def session_bareinit(req):
        session = get_session(req)
        try:
            pinfo = session["personinfo"]
            if 'ticket' not in pinfo:
                pinfo["ticket"] = []
        except KeyError:
            pinfo = dict()
            session['personinfo'] = pinfo
            pinfo["ticket"] = []
        session.dirty = True

    session_bareinit(req)
    session = get_session(req)

    pinfo = session['personinfo']
    ticket = session['personinfo']['ticket']

    uinfo = collect_user_info(req)
    pinfo['external_first_entry'] = False

    try:
        name = uinfo['external_firstname']
    except KeyError:
        name = ''
    try:
        surname = uinfo['external_familyname']
    except KeyError:
        surname = ''

    if surname:
        session['personinfo']['arxiv_name'] = nameapi.create_normalized_name(
                                          nameapi.split_name_parts(surname + ', ' + name))
    else:
        session['personinfo']['arxiv_name'] = ''

    session.dirty = True

    try:
        arxiv_p_ids = uinfo['external_arxivids'].split(';')
    except KeyError:
        arxiv_p_ids = []

    #'external_arxivids': 'hep-th/0112017;hep-th/0112020',
    #'external_familyname': 'Weiler',
    #'external_firstname': 'Henning',

    try:
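        # look up each arXiv identifier in the report-number field (037) and take
        # the union of all matching record ids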
        found_bibrecs = set(reduce(add, [perform_request_search(p='037:' + str(arx), of='id', rg=0)
                                         for arx in arxiv_p_ids]))
    except (IndexError, TypeError):
        found_bibrecs = set()

    #found_bibrecs = [567700, 567744]

    uid = getUid(req)
    pid, pid_found = dbapi.get_personid_from_uid([[uid]])

    if pid_found:
        pid = pid[0]
    else:
        if picked_profile is None:
            top5_list = dbapi.find_top5_personid_for_new_arXiv_user(found_bibrecs,
                nameapi.create_normalized_name(nameapi.split_name_parts(surname + ', ' + name)))
            return ("top5_list", top5_list)
        else:
            pid = dbapi.check_personids_availability(picked_profile, uid)

    pid_bibrecs = set([i[0] for i in dbapi.get_all_personids_recs(pid, claimed_only=True)])
    missing_bibrecs = found_bibrecs - pid_bibrecs
    #present_bibrecs = found_bibrecs.intersection(pid_bibrecs)

    #assert len(found_bibrecs) == len(missing_bibrecs) + len(present_bibrecs)

    tempticket = []
    # now we have to open the tickets...
    # person_papers contains the papers which are already assigned to the person
    # and came from arXiv; they can be claimed regardless

    for bibrec in missing_bibrecs:
        tempticket.append({'pid': pid, 'bibref': str(bibrec), 'action': 'confirm'})

    # check if the ticket targets (bibref for pid) are already in the ticket
    for t in list(tempticket):
        for e in list(ticket):
            if e['pid'] == t['pid'] and e['bibref'] == t['bibref']:
                ticket.remove(e)
        ticket.append(t)

    session.dirty = True

    if picked_profile is not None and picked_profile != pid and picked_profile != -1:
        return ("chosen pid not available", pid)
    elif picked_profile is not None and picked_profile == pid and picked_profile != -1:
        return ("pid assigned by user", pid)
    else:
        return ("pid", pid)
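
# A hedged sketch of how a caller might dispatch on the (status, value) tuples
# returned by arxiv_login above; handle_arxiv_login_result is a hypothetical
# helper, not part of the original module.
def handle_arxiv_login_result(result):
    status, value = result
    if status == "pid":
        return value            # pid already associated with the logged-in user
    elif status == "pid assigned by user":
        return value            # the profile explicitly picked by the user
    elif status == "chosen pid not available":
        return value            # fallback pid proposed by the system
    elif status == "top5_list":
        return None             # let the user choose among the candidate profiles
    raise ValueError("unexpected arxiv_login status: %r" % (status,))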