def calculate_name_score(query_string, nameids):
    '''
    Scores the names behind the given indexable name identifiers against the
    query string. Only last names of the same length as the query's are
    compared: matching characters are weighted by position, the sum is damped
    by the edit distance, and only names scoring above 0.5 are kept.

    @param query_string: the query string
    @type query_string: str
    @param nameids: indexable name identifiers of the candidate names
    @type nameids: intbitset/list [int,]

    @return: the matching names with their scores and author identifiers
    @rtype: list [(str, float, list),]
    '''
    name_personids_list = get_authors_data_from_indexable_name_ids(nameids)
    query_last_name = split_name_parts(query_string)[0]
    query_last_name_len = len(query_last_name)
    name_score_list = list()

    for name, personids in name_personids_list:
        current_last_name = split_name_parts(name)[0]
        current_last_name_len = len(current_last_name)
        # Only last names of the same length as the query's are considered.
        if query_last_name_len == current_last_name_len:
            dist = distance(query_last_name, current_last_name)
            limit = min(query_last_name_len, current_last_name_len)
            # Matching characters are weighted 1/2, 1/4, 1/8, ... by position;
            # the sum is then damped by the edit distance.
            name_score = sum([
                1 / float(2**(i + 1)) for i in range(limit)
                if query_last_name[i] == current_last_name[i]
            ]) / (dist + 1)
            if name_score > 0.5:
                name_score_list.append(
                    (name, name_score, deserialize(personids)))

    return name_score_list
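
# --- Illustrative sketch, not part of the module ----------------------------
# A worked example of the score above: characters that match at the same
# position contribute geometrically decreasing weights (1/2, 1/4, ...), and
# the sum is damped by dividing by (edit distance + 1). The module's
# `distance` helper is assumed to be a Levenshtein edit distance; the small
# pure-Python version below merely stands in for it, and both helper names
# are made up for this example.
def _levenshtein_sketch(a, b):
    '''Plain dynamic-programming edit distance, for illustration only.'''
    previous = list(range(len(b) + 1))
    for i, char_a in enumerate(a, 1):
        current = [i]
        for j, char_b in enumerate(b, 1):
            current.append(min(previous[j] + 1,                        # deletion
                               current[j - 1] + 1,                     # insertion
                               previous[j - 1] + (char_a != char_b)))  # substitution
        previous = current
    return previous[-1]


def _last_name_score_sketch(query_last_name, current_last_name):
    '''Reproduce the score computed in the loop above for a single pair.'''
    limit = min(len(query_last_name), len(current_last_name))
    prefix_weight = sum(1 / float(2**(i + 1)) for i in range(limit)
                        if query_last_name[i] == current_last_name[i])
    return prefix_weight / (_levenshtein_sketch(query_last_name,
                                                current_last_name) + 1)

# e.g. _last_name_score_sketch('ellis', 'ellis') -> 0.96875   (kept: > 0.5)
#      _last_name_score_sketch('ellis', 'ellys') -> 0.453125  (dropped: <= 0.5)
# -----------------------------------------------------------------------------
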
def solve_T_occurence_problem(query_string):
    '''
    It solves a 'T-occurrence problem', which is defined as follows: find the
    string ids that appear at least T times in the inverted lists which
    correspond to each of the query string qgrams. T represents the number of
    qgrams that the query string and the strings in the result dataset must
    share. If the result dataset is bigger than a threshold, it tries to limit
    it further.

    @param query_string: the query string
    @type query_string: str

    @return: strings that share T (or more) common qgrams with the query string
    @rtype: intbitset [int,]
    '''
    qgrams = set(get_qgrams_from_string(query_string, QGRAM_LEN))
    if not qgrams:
        return intbitset()

    inverted_lists = get_inverted_lists(qgrams)
    if not inverted_lists:
        return intbitset()

    # T is the number of qgrams the results must share with the query string;
    # start from the first inverted list and intersect it with the next T-1.
    inverted_lists = sorted(inverted_lists, key=itemgetter(1), reverse=True)
    T = int(MATCHING_QGRAMS_PERCENTAGE * len(inverted_lists))
    string_ids = intbitset(deserialize(inverted_lists[0][0]))

    for i in range(1, T):
        inverted_list = intbitset(deserialize(inverted_lists[i][0]))
        string_ids &= inverted_list

    # Intersect with the remaining lists only while the result set stays
    # within the cardinality thresholds.
    for i in range(T, len(inverted_lists)):
        if len(string_ids) < MAX_T_OCCURANCE_RESULT_LIST_CARDINALITY:
            break
        inverted_list = intbitset(deserialize(inverted_lists[i][0]))
        string_ids_temp = string_ids & inverted_list
        if len(string_ids_temp) > MIN_T_OCCURANCE_RESULT_LIST_CARDINALITY:
            string_ids = string_ids_temp
        else:
            break

    return string_ids
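
# --- Illustrative sketch, not part of the module ----------------------------
# A minimal in-memory rendering of the same T-occurrence strategy, using plain
# Python sets instead of intbitset and a toy inverted index in place of
# get_inverted_lists/deserialize. All helper names and the default q-gram
# length, percentage and cardinality bounds are made up for this example; the
# module sorts its inverted lists by their second column, which the sketch
# assumes to be a cardinality and approximates by sorting the sets by size.
def _qgrams_sketch(s, q=3):
    '''Sliding-window q-grams of a string (no padding, for simplicity).'''
    return set(s[i:i + q] for i in range(len(s) - q + 1))


def _build_inverted_index_sketch(strings):
    '''Map each q-gram to the set of string ids that contain it.'''
    index = dict()
    for string_id, s in enumerate(strings):
        for qgram in _qgrams_sketch(s):
            index.setdefault(qgram, set()).add(string_id)
    return index


def _t_occurrence_sketch(query, index, percentage=0.5, max_card=3, min_card=1):
    '''Intersect the T largest inverted lists, then keep shrinking the result
    with the remaining lists only while it stays above the toy thresholds.'''
    lists = [index[qgram] for qgram in _qgrams_sketch(query) if qgram in index]
    if not lists:
        return set()
    lists.sort(key=len, reverse=True)
    T = int(percentage * len(lists))
    result = set(lists[0])
    for inverted_list in lists[1:T]:
        result &= inverted_list
    for inverted_list in lists[T:]:
        if len(result) < max_card:
            break
        candidate = result & inverted_list
        if len(candidate) > min_card:
            result = candidate
        else:
            break
    return result

# e.g. with index = _build_inverted_index_sketch(
#          ['ellis john', 'ellis jane', 'ellison mark', 'ellsworth kim']):
# _t_occurrence_sketch('ellis', index) returns the ids 0, 1 and 2;
# 'ellsworth kim' shares only the 'ell' q-gram and is pruned by the second loop.
# -----------------------------------------------------------------------------
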
def solve_T_occurence_problem(query_string):
    '''
    It solves a 'T-occurrence problem', which is defined as follows: find the
    string ids that appear at least T times in the inverted lists of the query
    string qgrams. If the result dataset is bigger than a threshold, it tries
    to limit it further.

    @param query_string: the query string
    @type query_string: str

    @return: string ids that share at least T qgrams with the query string,
        or None if the query produces no qgrams or no inverted lists
    @rtype: intbitset [int,] or None
    '''
    query_string_qgrams = get_qgrams_from_string(query_string, QGRAM_LEN)
    query_string_qgrams_set = set(query_string_qgrams)
    if not query_string_qgrams_set:
        return None

    inverted_lists = get_inverted_lists(query_string_qgrams_set)
    if not inverted_lists:
        return None

    inverted_lists = sorted(inverted_lists, key=itemgetter(1), reverse=True)
    T = int(MATCHING_QGRAMS_PERCENTAGE * len(inverted_lists))
    nameids = intbitset(deserialize(inverted_lists[0][0]))

    for i in range(1, T):
        inverted_list = intbitset(deserialize(inverted_lists[i][0]))
        nameids &= inverted_list

    for i in range(T, len(inverted_lists)):
        if len(nameids) < MAX_T_OCCURANCE_RESULT_LIST_CARDINALITY:
            break
        inverted_list = intbitset(deserialize(inverted_lists[i][0]))
        nameids_temp = inverted_list & nameids
        if len(nameids_temp) > MIN_T_OCCURANCE_RESULT_LIST_CARDINALITY:
            nameids = nameids_temp
        else:
            break

    return nameids
def find_personids_by_name(query_string, trust_is_operating=False):
    '''
    It returns all the authors that match the query string, sorted by
    compatibility.

    WARNING: this only queries the search engine; for a proper person search
    one should use person_search_engine_query in bibauthorid_dbinterface.

    @param query_string: the query string
    @type query_string: str
    @param trust_is_operating: trust that the search engine is operating and
        skip the explicit check
    @type trust_is_operating: bool

    @return: author identifiers, or None if the search engine is not operating
    @rtype: list [int,]
    '''
    if not trust_is_operating:
        search_engine_is_oper = search_engine_is_operating()
        if not search_engine_is_oper:
            return None

    asciified_qstring = translate_to_ascii(query_string)[0]
    indexable_qstring = create_indexable_name(
        split_name_parts(indexable_name_re.sub(' ', asciified_qstring)))

    # Build an indexable form of the surname alone as well.
    surname = split_name_parts(query_string)[0] + ','
    asciified_qstring_sur = translate_to_ascii(surname)[0]
    indexable_qstring_sur = create_indexable_name(
        split_name_parts(indexable_name_re.sub(' ', asciified_qstring_sur)))

    # Whatever follows the surname tokens in the indexable query string is
    # treated as the first names.
    qstring_first_names = indexable_qstring.split(
        ' ')[len(indexable_qstring_sur.split(' ')):]

    # Candidate string ids: the union of the T-occurrence results for the
    # full query and for the surname-only variant.
    string_ids = solve_T_occurence_problem(
        indexable_qstring) | solve_T_occurence_problem(indexable_qstring_sur)
    if not string_ids:
        return list()

    strings_to_ids_mapping = get_indexed_strings(string_ids)

    passing_string_ids, surname_score_cache = remove_false_positives(
        indexable_qstring_sur, strings_to_ids_mapping)

    if not passing_string_ids:
        return list()

    author_groups = get_author_groups_from_string_ids(passing_string_ids)

    # Collect every author that appears in the matching author groups.
    authors = set()
    for author_group in author_groups:
        authors |= set(deserialize(author_group[0]))

    author_to_names_mapping = get_name_variants_for_authors(authors)

    surname_score_clusters = create_surname_score_clusters(
        indexable_qstring_sur, author_to_names_mapping, surname_score_cache,
        strings_to_ids_mapping)

    sorted_authors = sort_authors(indexable_qstring, qstring_first_names,
                                  surname_score_clusters,
                                  author_to_names_mapping,
                                  strings_to_ids_mapping)

    return sorted_authors
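
# --- Illustrative sketch, not part of the module ----------------------------
# How the first names are extracted above: the indexable surname and the full
# indexable query string are assumed to share their leading tokens, so
# dropping as many tokens as the surname occupies leaves the first names.
# The helper name and the example strings are hypothetical; the real
# indexable strings come from translate_to_ascii, split_name_parts and
# create_indexable_name.
def _first_names_from_indexable_sketch(indexable_qstring, indexable_qstring_sur):
    '''Same token arithmetic as in find_personids_by_name above.'''
    return indexable_qstring.split(' ')[len(indexable_qstring_sur.split(' ')):]

# e.g. _first_names_from_indexable_sketch('ellis john richard', 'ellis')
#      -> ['john', 'richard']
# -----------------------------------------------------------------------------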