Example #1
def _split_and_index(el):
    name, pids = el
    asciified_name = translate_to_ascii(name)[0]
    split_name = split_name_parts(indexable_name_re.sub(' ', asciified_name))
    indexable_name = create_indexable_name(split_name)
    surname = split_name[0] + ','
    indexable_surname = create_indexable_name([surname, [], [], []])
    return (name, pids, indexable_name, indexable_surname)
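
The snippet above relies on Invenio helpers (translate_to_ascii, split_name_parts, create_indexable_name, indexable_name_re) defined elsewhere in the module. A minimal, self-contained sketch of the same normalisation idea, using hypothetical helpers rather than the Invenio API, could look like this:

import re

# Hypothetical stand-ins for the Invenio normalisation pipeline (sketch only).
_non_letters = re.compile(r"[^a-zA-Z ]")

def normalize(text):
    # Keep letters only, lowercase them and collapse whitespace.
    return ' '.join(_non_letters.sub(' ', text).lower().split())

def make_indexable(name):
    # Expect 'Surname, First names' and return 'surname first names'.
    surname, _, first_names = name.partition(',')
    tokens = [normalize(surname)] + normalize(first_names).split()
    return ' '.join(t for t in tokens if t)

print(make_indexable('Ellis, John R.'))   # -> 'ellis john r'
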
def create_bibauthorid_indexer():
    '''
    Constructs the disk-based indexer. It consists of the dense index (which
    maps a name to the set of personids that hold that name) and the inverted
    lists (which map a q-gram to the set of name ids that share that q-gram).
    '''
    name_pids_dict = get_confirmed_name_to_authors_mapping()
    if not name_pids_dict:
        return

    indexable_name_pids_dict = dict()

    for name in name_pids_dict.keys():
        asciified_name = translate_to_ascii(name)[0]
        indexable_name = create_indexable_name(asciified_name)
        if indexable_name:
            try:
                asciified_name, pids = indexable_name_pids_dict[indexable_name]
                updated_pids = pids | name_pids_dict[name]
                indexable_name_pids_dict[indexable_name] = (asciified_name, updated_pids)
            except KeyError:
                indexable_name_pids_dict[indexable_name] = (asciified_name, name_pids_dict[name])

        surname = split_name_parts(name)[0]
        asciified_surname = translate_to_ascii(surname)[0]
        indexable_surname = create_indexable_name(asciified_surname)
        if indexable_surname:
            try:
                asciified_surname, pids = indexable_name_pids_dict[indexable_surname]
                updated_pids = pids | name_pids_dict[name]
                indexable_name_pids_dict[indexable_surname] = (asciified_surname, updated_pids)
            except KeyError:
                indexable_name_pids_dict[indexable_surname] = (asciified_surname, name_pids_dict[name])

    indexable_names_list = indexable_name_pids_dict.keys()

    # An exception raised inside a worker thread is not visible to the main
    # thread, so the threads report success/failure through the queue.
    q = Queue()
    threads = list()
    threads.append(Thread(target=create_dense_index, args=(indexable_name_pids_dict, indexable_names_list, q)))
    threads.append(Thread(target=create_inverted_lists, args=(indexable_names_list, q)))

    for t in threads:
        t.start()

    for t in threads:
        all_ok, error = q.get(block=True)
        if not all_ok:
            raise error
        q.task_done()

    for t in threads:
        t.join()
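
A self-contained sketch of the inverted-lists side of this indexer (mapping each q-gram to the set of name ids that contain it); this only illustrates the idea and is not the create_inverted_lists implementation:

def qgrams(name, q=3):
    # Pad the name so that prefixes and suffixes also produce q-grams.
    padded = '$' * (q - 1) + name + '$' * (q - 1)
    return set(padded[i:i + q] for i in range(len(padded) - q + 1))

def build_inverted_lists(indexable_names, q=3):
    # Map every q-gram to the set of positions (name ids) that contain it.
    inverted = dict()
    for name_id, name in enumerate(indexable_names):
        for gram in qgrams(name, q):
            inverted.setdefault(gram, set()).add(name_id)
    return inverted

inverted_lists = build_inverted_lists(['ellis john', 'ellison jane'])
print(sorted(inverted_lists['ell']))   # -> [0, 1]
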
def find_personids_by_name1(query_string):
    '''
    Finds a collection of personids that own a signature similar to the given
    query string. It works by solving a 'T-occurrence problem' and then
    applying filters to the candidate answers in order to remove false
    positives. Finally it sorts the result set by the score each candidate
    obtained.

    @param query_string: the query string
    @type query_string: str

    @return: personids which own a signature similar to the query string
    @rtype: list
    '''
    search_engine_is_functioning = search_engine_is_operating()
    if not search_engine_is_functioning:
        return list()

    asciified_query_string = translate_to_ascii(query_string)[0]
    indexable_query_string = create_indexable_name(asciified_query_string)
    if not indexable_query_string:
        return list()

    #query_string_surname = split_name_parts(query_string)[0]
    #asciified_query_string_surname = translate_to_ascii(query_string_surname)[0]
    #indexable_query_string_surname = create_indexable_name(asciified_query_string_surname)

    #if not indexable_query_string and not indexable_query_string_surname:
    #    return list()

    # Candidate name ids whose q-grams sufficiently overlap the query's.
    s1 = solve_T_occurence_problem(indexable_query_string)
    if not s1:
        s1 = intbitset()

    #s2 = solve_T_occurence_problem(indexable_query_string_surname)
    #if not s2:
    #    s2 = intbitset()

    #nameids = s1 | s2
    nameids = s1
    if not nameids:
        return list()

    name_score_list = calculate_name_score(asciified_query_string, nameids)

    return name_score_list
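
A self-contained sketch of the 'T-occurrence problem' mentioned in the docstring: given the inverted lists, find the ids of all names that share at least T q-grams with the query. The real solve_T_occurence_problem works on intbitsets over the on-disk index, so this only illustrates the idea:

from collections import Counter

def qgrams(name, q=3):
    padded = '$' * (q - 1) + name + '$' * (q - 1)
    return set(padded[i:i + q] for i in range(len(padded) - q + 1))

def solve_t_occurrence(query, inverted_lists, threshold, q=3):
    # Count, for every candidate name id, how many of the query's q-grams
    # it shares, and keep the ids that reach the threshold T.
    counts = Counter()
    for gram in qgrams(query, q):
        for name_id in inverted_lists.get(gram, ()):
            counts[name_id] += 1
    return set(nid for nid, hits in counts.items() if hits >= threshold)

inverted_lists = {'ell': {0, 1}, 'lli': {0}, 'lis': {0}}
print(solve_t_occurrence('ellis', inverted_lists, threshold=2))   # -> {0}
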
Example #5
def cache_name_variants_of_authors(author_to_name_and_occurrence_mapping):
    args = list()
    for author, names_and_occurrence in author_to_name_and_occurrence_mapping.iteritems():
        indexable_names_and_occurrence = dict()
        for name, occurrences in names_and_occurrence.iteritems():
            asciified_name = translate_to_ascii(name)[0]
            indexable_name = create_indexable_name(
                split_name_parts(indexable_name_re.sub(' ', asciified_name)))
            try:
                indexable_names_and_occurrence[indexable_name] += occurrences
            except KeyError:
                indexable_names_and_occurrence[indexable_name] = occurrences

        args += [author, serialize(indexable_names_and_occurrence), 1]

    populate_table('aidDENSEINDEX', ['id', 'personids', 'flag'],
                   args,
                   empty_table_first=False)
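
A self-contained sketch of the aggregation step performed above: for each author, collapse the occurrence counts of all name variants that normalise to the same indexable form. Here 'normalise' is a hypothetical stand-in for the translate_to_ascii / split_name_parts / create_indexable_name pipeline:

from collections import Counter

def aggregate_variants(names_and_occurrence, normalise):
    # Sum the occurrences of every name variant under its indexable form.
    counts = Counter()
    for name, occurrences in names_and_occurrence.items():
        counts[normalise(name)] += occurrences
    return dict(counts)

variants = {'Ellis, J.': 3, 'Ellis, John': 2}
print(aggregate_variants(variants, normalise=lambda n: n.split(',')[0].lower()))
# -> {'ellis': 5}
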
Example #6
def find_personids_by_name(query_string, trust_is_operating=False):
    '''
    Returns all the authors that match the query string, sorted by compatibility.

    WARNING: this only queries the search engine; for a proper person search
    one should use person_search_engine_query in bibauthorid_dbinterface.

    @param query_string: the query string
    @type query_string: str

    @return: author identifiers
    @rtype: list [int,]
    '''
    if not trust_is_operating:
        search_engine_is_oper = search_engine_is_operating()
        if not search_engine_is_oper:
            return None

    asciified_qstring = translate_to_ascii(query_string)[0]
    indexable_qstring = create_indexable_name(
        split_name_parts(indexable_name_re.sub(' ', asciified_qstring)))

    surname = split_name_parts(query_string)[0] + ','
    asciified_qstring_sur = translate_to_ascii(surname)[0]
    indexable_qstring_sur = create_indexable_name(
        split_name_parts(indexable_name_re.sub(' ', asciified_qstring_sur)))

    qstring_first_names = indexable_qstring.split(
        ' ')[len(indexable_qstring_sur.split(' ')):]

    string_ids = solve_T_occurence_problem(
        indexable_qstring) | solve_T_occurence_problem(indexable_qstring_sur)
    if not string_ids:
        return list()

    strings_to_ids_mapping = get_indexed_strings(string_ids)

    passing_string_ids, surname_score_cache = remove_false_positives(
        indexable_qstring_sur, strings_to_ids_mapping)

    if not passing_string_ids:
        return list()

    author_groups = get_author_groups_from_string_ids(passing_string_ids)

    authors = set()
    for author_group in author_groups:
        authors |= set(deserialize(author_group[0]))

    author_to_names_mapping = get_name_variants_for_authors(authors)

    surname_score_clusters = create_surname_score_clusters(
        indexable_qstring_sur, author_to_names_mapping, surname_score_cache,
        strings_to_ids_mapping)

    sorted_authors = sort_authors(indexable_qstring, qstring_first_names,
                                  surname_score_clusters,
                                  author_to_names_mapping,
                                  strings_to_ids_mapping)

    return sorted_authors
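
One plausible way to rank candidate names by compatibility with the query is the Jaccard similarity of their q-gram sets; the actual scoring done by remove_false_positives, create_surname_score_clusters and sort_authors is more elaborate, so this is only a minimal sketch of the idea:

def qgrams(name, q=3):
    padded = '$' * (q - 1) + name + '$' * (q - 1)
    return set(padded[i:i + q] for i in range(len(padded) - q + 1))

def rank_by_similarity(query, candidate_names, q=3):
    # Sort candidates by the Jaccard similarity of their q-gram sets
    # against the query's q-gram set (highest similarity first).
    query_grams = qgrams(query, q)
    def score(name):
        grams = qgrams(name, q)
        return len(query_grams & grams) / float(len(query_grams | grams))
    return sorted(candidate_names, key=score, reverse=True)

print(rank_by_similarity('ellis j', ['ellison jane', 'ellis john']))
# -> ['ellis john', 'ellison jane']
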
Example #7
def create_bibauthorid_indexer():
    '''
    Constructs the disk-based indexer. It consists of the dense index (which
    maps a name to the set of personids that hold that name) and the inverted
    lists (which map a q-gram to the set of name ids that share that q-gram).
    '''
    name_pids_dict = get_confirmed_name_to_authors_mapping()
    if not name_pids_dict:
        return

    indexable_name_pids_dict = dict()

    for name in name_pids_dict.keys():
        asciified_name = translate_to_ascii(name)[0]
        indexable_name = create_indexable_name(asciified_name)
        if indexable_name:
            try:
                asciified_name, pids = indexable_name_pids_dict[indexable_name]
                updated_pids = pids | name_pids_dict[name]
                indexable_name_pids_dict[indexable_name] = (asciified_name,
                                                            updated_pids)
            except KeyError:
                indexable_name_pids_dict[indexable_name] = (
                    asciified_name, name_pids_dict[name])

        surname = split_name_parts(name)[0]
        asciified_surname = translate_to_ascii(surname)[0]
        indexable_surname = create_indexable_name(asciified_surname)
        if indexable_surname:
            try:
                asciified_surname, pids = indexable_name_pids_dict[
                    indexable_surname]
                updated_pids = pids | name_pids_dict[name]
                indexable_name_pids_dict[indexable_surname] = (
                    asciified_surname, updated_pids)
            except KeyError:
                indexable_name_pids_dict[indexable_surname] = (
                    asciified_surname, name_pids_dict[name])

    indexable_names_list = indexable_name_pids_dict.keys()

    # An exception raised inside a worker thread is not visible to the main
    # thread, so the threads report success/failure through the queue.
    q = Queue()
    threads = list()
    threads.append(
        Thread(target=create_dense_index,
               args=(indexable_name_pids_dict, indexable_names_list, q)))
    threads.append(
        Thread(target=create_inverted_lists, args=(indexable_names_list, q)))

    for t in threads:
        t.start()

    for t in threads:
        all_ok, error = q.get(block=True)
        if not all_ok:
            raise error
        q.task_done()

    for t in threads:
        t.join()
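
A self-contained sketch of the error-propagation pattern used above, where each worker reports (success, error) through a Queue so the main thread can re-raise exceptions that would otherwise die silently inside a Thread; the worker function and its payload are hypothetical:

from Queue import Queue        # 'from queue import Queue' on Python 3
from threading import Thread

def worker(payload, q):
    # Report the outcome instead of letting the exception vanish with the thread.
    try:
        if payload < 0:
            raise ValueError('negative payload: %d' % payload)
        q.put((True, None))
    except Exception as error:
        q.put((False, error))

q = Queue()
threads = [Thread(target=worker, args=(n, q)) for n in (1, 2)]

for t in threads:
    t.start()

for t in threads:
    all_ok, error = q.get(block=True)
    if not all_ok:
        raise error
    q.task_done()

for t in threads:
    t.join()
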