def convert_personid():
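    '''
    One-off migration: split the legacy aidPERSONID table into
    aidPERSONIDPAPERS (paper attributions) and aidPERSONIDDATA
    (all remaining per-person data).
    '''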
    from dbquery import run_sql # local import: this one-off migration function is slated for removal
    from itertools import repeat
    chunk = 1000 # flush buffered rows once a flat argument list grows past this many elements

    old_personid = run_sql("SELECT `personid`, `tag`, `data`, `flag`, `lcul` FROM `aidPERSONID`")

    def flush_papers(args):
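        # args is a flat list of column values, seven per aidPERSONIDPAPERS row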
        run_sql("INSERT INTO `aidPERSONIDPAPERS` "
                "(`personid`, "
                " `bibref_table`, "
                " `bibref_value`, "
                " `bibrec`, "
                " `name`, "
                " `flag`, "
                " `lcul`) "
                "VALUES " + ", ".join(repeat("(%s, %s, %s, %s, %s, %s, %s)", len(args) // 7)),
                tuple(args))

    def flush_data(args):
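        # args is a flat list of column values, five per aidPERSONIDDATA row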
        run_sql("INSERT INTO `aidPERSONIDDATA` "
                "(`personid`, "
                " `tag`, "
                " `data`, "
                " `opt1`, "
                " `opt2`) "
                "VALUES " + ", ".join(repeat("(%s, %s, %s, %s, %s)", len(args) // 5)),
                tuple(args))

    paper_args = []
    data_args = []
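    # Route each legacy row: 'paper' entries are rewritten as aidPERSONIDPAPERS
    # rows with a normalized name, 'gathered_name' entries are dropped, and
    # everything else is copied to aidPERSONIDDATA unchanged.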
    for row in old_personid:
        if row[1] == 'paper':
            bibref, rec = row[2].split(',')
            tab, ref = bibref.split(':')
            try:
                name = get_name_by_bibrecref((int(tab), int(ref), int(rec)))
            except Exception:
                # the bibref/bibrec pair no longer resolves to a name;
                # skip this row instead of aborting the whole migration
                continue
            name = split_name_parts(name)
            name = create_normalized_name(name)
            paper_args += [row[0], tab, ref, rec, name, row[3], row[4]]
            if len(paper_args) > chunk:
                flush_papers(paper_args)
                paper_args = []

        elif row[1] == 'gathered_name':
            continue
        else:
            data_args += list(row)
            if len(data_args) > chunk:
                flush_data(data_args)
                data_args = []

    if paper_args:
        flush_papers(paper_args)

    if data_args:
        flush_data(data_args)
def find_personIDs_by_name_string(target):
    '''
    Search engine to find persons matching the given string.
    The matching is done on the surname first, and on the other names
    if present. An ordered list (by compatibility) of pids and found
    names is returned.

    @param target: name string, 'surname, names I.'
    @type target: string
    @return: list of pids with their matching names:
        [(pid, [(name string, occurrence count, compatibility), ...]), ...]
    '''
    splitted_name = split_name_parts(target)
    family = splitted_name[0]
    target_cleaned = create_normalized_name(splitted_name)

    # Progressively fuzzier SQL LIKE patterns; the first level that
    # returns any match wins.
    levels = (#target + '%', # disabled: gives different results for 'mele, salvatore' and 'salvatore mele'
              family + ',%',
              family[:-2] + '%',
              '%' + family + ',%',
              '%' + family[1:-1] + '%')

    # for short surnames the truncated patterns are too permissive, so keep
    # only the exact-prefix and substring levels
    if len(family) <= 4:
        levels = [levels[0], levels[2]]

    for lev in levels:
        names = dbinter.get_all_personids_by_name(lev)
        if names:
            break

    # fall back to canonical-name lookup when no real name matched
    is_canonical = False
    if not names:
        names = dbinter.get_personids_by_canonical_name(target)
        is_canonical = True

    # deduplicate (pid, name) pairs, counting occurrences and scoring each
    # name against the target
    names = groupby(sorted(names))
    names = [(key[0], key[1], len(list(data)), soft_compare_names(target, key[1]))
             for key, data in names]
    # regroup by pid, keeping only names compatible enough with the target
    # (canonical-name matches are kept unconditionally)
    names = groupby(names, itemgetter(0))
    names = [(key, sorted([(d[1], d[2], d[3]) for d in data if (d[3] > 0.5 or is_canonical)],
                          key=itemgetter(2), reverse=True)) for key, data in names]
    names = [name for name in names if name[1]]
    # order pids by their best name's compatibility, then name, then count
    names = sorted(names, key=lambda x: (x[1][0][2], x[1][0][0], x[1][0][1]), reverse=True)

    return names
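
# Illustrative only (hypothetical pids and scores), showing the return
# shape documented above:
#   find_personIDs_by_name_string('Ellis, J.')
#   -> [(8, [('Ellis, J.', 12, 1.0), ('Ellis, John', 3, 0.87)])]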
def create_lastname_list_from_personid():
    '''
    Generate a dictionary mapping each (lowercased, alphanumeric-only)
    last name to the list of personids that carry it.
    '''
    # ((personid, full name), ... )
    all_names = get_all_names_from_personid()

    # ((personid, last_name) ... )
    artifact_removal = re.compile("[^a-zA-Z0-9]") # strip everything but letters and digits
    all_names = tuple((row[0], artifact_removal.sub("", split_name_parts(row[1].decode('utf-8'))[0]).lower())
                      for row in all_names)

    # { last_name : [personid, ...], ... }
    ret = {}
    for pid, lastname in all_names:
        ret.setdefault(lastname, []).append(pid)

    return ret
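
# Illustrative only (hypothetical data): if personids 1 and 2 share the
# last name 'Ellis' and personid 3 has 'Smith', the result would be
#   {'ellis': [1, 2], 'smith': [3]}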
def rabbit(bibrecs, check_invalid_papers=False):
    '''
    Match the current author signatures of the given papers against the
    stored ones and update the personid tables accordingly.

    @param bibrecs: bibrecs to process; must be a sequence (len() is used)
    @type bibrecs: sequence of ints
    @param check_invalid_papers: if True, also purge personid entries
        referring to papers that are no longer valid
    @type check_invalid_papers: boolean
    @return: None
    '''

    # memoized, symmetric wrapper around the name comparison function
    compare_names = cached_sym(lambda x: x)(comp_names)
    # minimum similarity for a signature to be reassigned automatically
    threshold = 0.80

    # with no explicit bibrecs, or when checking invalid papers, we need the
    # full set of valid bibrecs
    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_bibrecs()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    for idx, rec in enumerate(bibrecs):
        task_sleep_now_if_required(True)
        update_status(float(idx) / len(bibrecs), "%d/%d current: %d" % (idx, len(bibrecs), rec))
        if rec in deleted:
            delete_paper_from_personid(rec)
            continue

        # signatures currently on the paper: MARC 100 (first author) and
        # 700 (additional authors) bibrefs
        markrefs = frozenset(chain(izip(cycle([100]), imap(itemgetter(0), get_authors_from_paper(rec))),
                                   izip(cycle([700]), imap(itemgetter(0), get_coauthors_from_paper(rec)))))

        # signatures already stored for this paper, keyed
        # (bibref_table, bibref_value) -> name
        personid_rows = [map(int, row[:3]) + [row[4]] for row in get_signatures_from_rec(rec)]
        personidrefs_names = dict(((row[1], row[2]), row[3]) for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        new_signatures = list(markrefs - personidrefs)   # on the paper, not yet stored
        old_signatures = list(personidrefs - markrefs)   # stored, but no longer on the paper

        new_signatures_names = dict((new, create_normalized_name(split_name_parts(get_name_by_bibrecref(new))))
                                    for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[compare_names(new_signatures_names[new], personidrefs_names[old])
                  for old in old_signatures] for new in new_signatures]

        # [(new_signatures, old_signatures)]
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix) if score > threshold]
        for new, old in best_match:
            modify_signature(old, rec, new, new_signatures_names[new])

        # remove stored signatures that no longer appear on the paper
        remove_sigs(tuple(list(old) + [rec]) for old in old_signatures)

        not_matched = frozenset(new_signatures) - frozenset(map(itemgetter(0), best_match))

        if not_matched:
            # pids already on this paper must not receive a second signature
            used_pids = set(r[0] for r in personid_rows)

        for sig in not_matched:
            name = new_signatures_names[sig]
            matched_pids = find_pids_by_exact_name(name)
            matched_pids = [p for p in matched_pids if int(p[0]) not in used_pids]

            if not matched_pids:
                # no existing person carries this exact name: create a new one
                new_pid = new_person_from_signature(list(sig) + [rec], name)
                used_pids.add(new_pid)
                updated_pids.add(new_pid)

            else:
                # attach the signature to the first free exact-name match
                add_signature(list(sig) + [rec], name, matched_pids[0][0])
                used_pids.add(matched_pids[0][0])
                updated_pids.add(matched_pids[0][0])

    update_status_final()

    if updated_pids: # an empty set will update all canonical_names
        update_personID_canonical_names(updated_pids)
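
# Illustrative only (hypothetical record ids):
#   rabbit([742516, 742517])               # disambiguate just these two papers
#   rabbit([], check_invalid_papers=True)  # process every valid paper and
#                                          # purge entries for invalid ones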