def calculate_name_score(query_string, nameids):
    '''
    Score the names behind the given indexable name ids against the
    query string, keeping only close matches.

    Only candidate names whose last name has exactly the same length as
    the query's last name are considered.  The score rewards matching
    characters near the start of the last name (position i contributes
    1/2**(i+1)) and divides by (edit distance + 1) as a penalty.

    @param query_string: name to search for
    @type query_string: str
    @param nameids: indexable name identifiers to score
    @type nameids: list
    @return: (name, score, author identifiers) triples with score > 0.5
    @rtype: list [(str, float, list),]
    '''
    name_personids_list = get_authors_data_from_indexable_name_ids(nameids)
    query_last_name = split_name_parts(query_string)[0]
    query_last_name_len = len(query_last_name)
    name_score_list = list()
    for name, personids in name_personids_list:
        current_last_name = split_name_parts(name)[0]
        current_last_name_len = len(current_last_name)
        # only last names of identical length are candidates
        if query_last_name_len == current_last_name_len:
            dist = distance(query_last_name, current_last_name)
            limit = min(query_last_name_len, current_last_name_len)
            # earlier matching characters weigh more: 1/2, 1/4, 1/8, ...
            name_score = sum(1 / float(2 ** (i + 1))
                             for i in range(limit)
                             if query_last_name[i] == current_last_name[i]
                             ) / (dist + 1)
            if name_score > 0.5:
                name_score_list.append(
                    (name, name_score, deserialize(personids)))
    return name_score_list
def calculate_name_score(query_string, nameids):
    '''
    Compute similarity scores between the query string and the names
    referenced by the given indexable name ids.

    A candidate name contributes a score only when its last name has
    the same length as the query's last name; matching characters near
    the beginning of the surname are weighted highest and the score is
    attenuated by the edit distance.

    @param query_string: name to match against the index
    @type query_string: str
    @param nameids: indexable name identifiers
    @type nameids: list
    @return: (name, score, author ids) triples with score above 0.5
    @rtype: list [(str, float, list),]
    '''
    target_surname = split_name_parts(query_string)[0]
    target_len = len(target_surname)
    scored = []
    for candidate_name, personids in \
            get_authors_data_from_indexable_name_ids(nameids):
        candidate_surname = split_name_parts(candidate_name)[0]
        candidate_len = len(candidate_surname)
        # skip candidates whose surname length differs from the query's
        if abs(target_len - candidate_len) != 0:
            continue
        edit_dist = distance(target_surname, candidate_surname)
        span = min([target_len, candidate_len])
        # positional bonus: 1/2 for position 0, 1/4 for position 1, ...
        positional = 0
        for pos in range(span):
            if target_surname[pos] == candidate_surname[pos]:
                positional += 1 / float(2 ** (pos + 1))
        score = positional / (edit_dist + 1)
        if score > 0.5:
            scored.append((candidate_name, score, deserialize(personids)))
    return scored
def solve_T_occurence_problem(query_string):
    '''
    Solve a 'T-occurence problem': find the string ids that appear in at
    least T of the inverted lists corresponding to the query string's
    qgrams.  T represents the number of qgrams that the query string and
    the strings in the result dataset must share.  If the result dataset
    is bigger than a threshold, additional inverted lists are
    intersected to try to limit it further.

    @param query_string: the query string
    @type query_string: str
    @return: strings that share T (or more) common qgrams with the
        query string
    @rtype: intbitset intbitset(int,)
    '''
    grams = set(get_qgrams_from_string(query_string, QGRAM_LEN))
    if not grams:
        return intbitset()

    lists = get_inverted_lists(grams)
    if not lists:
        return intbitset()

    # process the inverted lists in decreasing order of their second
    # field (presumably their cardinality -- TODO confirm)
    lists = sorted(lists, key=itemgetter(1), reverse=True)
    threshold = int(MATCHING_QGRAMS_PERCENTAGE * len(lists))

    # mandatory part: intersect the first `threshold` lists
    candidates = intbitset(deserialize(lists[0][0]))
    for idx in range(1, threshold):
        candidates &= intbitset(deserialize(lists[idx][0]))

    # optional part: keep narrowing while the result is too large,
    # but never shrink it below the minimum cardinality
    for idx in range(threshold, len(lists)):
        if len(candidates) < MAX_T_OCCURANCE_RESULT_LIST_CARDINALITY:
            break
        narrowed = candidates & intbitset(deserialize(lists[idx][0]))
        if len(narrowed) > MIN_T_OCCURANCE_RESULT_LIST_CARDINALITY:
            candidates = narrowed
        else:
            break
    return candidates
def solve_T_occurence_problem(query_string):
    '''
    It solves a 'T-occurence problem' which is defined as follows: find
    the string ids that appear at least T times on the inverted lists
    of the query string qgrams.  If the result dataset is bigger than a
    threshold it tries to limit it further.

    @param query_string: the query string
    @type query_string: str
    @return: ids of strings sharing T (or more) qgrams with the query
    @rtype: intbitset
    '''
    query_string_qgrams = get_qgrams_from_string(query_string, QGRAM_LEN)
    query_string_qgrams_set = set(query_string_qgrams)
    # Return an empty (falsy) intbitset rather than None so callers can
    # apply set operators (e.g. the '|' in find_personids_by_name)
    # directly on the result without a None check.
    if not query_string_qgrams_set:
        return intbitset()
    inverted_lists = get_inverted_lists(query_string_qgrams_set)
    if not inverted_lists:
        return intbitset()
    # process lists in decreasing order of their second field
    # (presumably their cardinality -- TODO confirm)
    inverted_lists = sorted(inverted_lists, key=itemgetter(1), reverse=True)
    T = int(MATCHING_QGRAMS_PERCENTAGE * len(inverted_lists))
    # mandatory intersection of the first T inverted lists
    nameids = intbitset(deserialize(inverted_lists[0][0]))
    for i in range(1, T):
        inverted_list = intbitset(deserialize(inverted_lists[i][0]))
        nameids &= inverted_list
    # optionally intersect further lists to shrink an oversized result,
    # stopping before the candidate set becomes too small
    for i in range(T, len(inverted_lists)):
        if len(nameids) < MAX_T_OCCURANCE_RESULT_LIST_CARDINALITY:
            break
        inverted_list = intbitset(deserialize(inverted_lists[i][0]))
        nameids_temp = inverted_list & nameids
        if len(nameids_temp) > MIN_T_OCCURANCE_RESULT_LIST_CARDINALITY:
            nameids = nameids_temp
        else:
            break
    return nameids
def find_personids_by_name(query_string, trust_is_operating=False):
    '''
    It returns all the authors that match the query string, sorted by
    compatibility.

    WARNING: this is just querying the search engine, for a proper
    person search query one should use person_search_engine_query in
    bibauthorid_dbinterface.

    @param query_string: the query string
    @type query_string: str
    @param trust_is_operating: skip the search-engine availability check
    @type trust_is_operating: bool
    @return: author identifiers (None when the engine is not operating)
    @rtype: list [int,]
    '''
    if not trust_is_operating and not search_engine_is_operating():
        return None

    # Normalize the full query into its indexable form.
    ascii_full = translate_to_ascii(query_string)[0]
    indexable_full = create_indexable_name(
        split_name_parts(indexable_name_re.sub(' ', ascii_full)))

    # Same normalization for the surname alone; the trailing comma
    # marks it as a last name for split_name_parts.
    ascii_surname = translate_to_ascii(
        split_name_parts(query_string)[0] + ',')[0]
    indexable_surname = create_indexable_name(
        split_name_parts(indexable_name_re.sub(' ', ascii_surname)))

    # Tokens following the surname tokens are the first names.
    surname_token_count = len(indexable_surname.split(' '))
    first_name_tokens = indexable_full.split(' ')[surname_token_count:]

    # Candidate strings matching either the full name or the surname.
    candidate_ids = (solve_T_occurence_problem(indexable_full) |
                     solve_T_occurence_problem(indexable_surname))
    if not candidate_ids:
        return list()

    indexed_strings = get_indexed_strings(candidate_ids)
    filtered_ids, surname_scores = remove_false_positives(
        indexable_surname, indexed_strings)
    if not filtered_ids:
        return list()

    # Union of all author ids behind the surviving strings.
    matched_authors = set()
    for group in get_author_groups_from_string_ids(filtered_ids):
        matched_authors |= set(deserialize(group[0]))

    names_by_author = get_name_variants_for_authors(matched_authors)
    score_clusters = create_surname_score_clusters(
        indexable_surname, names_by_author, surname_scores,
        indexed_strings)
    return sort_authors(indexable_full, first_name_tokens, score_clusters,
                        names_by_author, indexed_strings)