def computeDifferentionFromSolutionsMatrix(submits, use_visited: bool):
    """
    For each submit, compute its difference from all solutions, so the output
    matrix is N_SUBMITS x N_SOLUTIONS.

    :param submits: list of submits; the solutions are the submits with no flowers left
    :param use_visited: see global variable USE_VISITED
    :return: difference matrix
    """
    solutions = [submit for submit in submits if submit.flowers_left == 0]
    print("{} solutions".format(len(solutions)))
    matrix = np.zeros((len(submits), len(solutions)))
    for i in range(len(submits)):
        print(i)
        for s in range(len(solutions)):
            if use_visited:
                # compare the visited-cells encodings
                str1 = submits[i].visited_to_unicode(100)
                str2 = solutions[s].visited_to_unicode(100)
            else:
                # compare the canonized function encodings
                str1 = submits[i].functions_to_unicode(use_canonized=True)
                str2 = solutions[s].functions_to_unicode(use_canonized=True)
            dist = StringMatcher.distance(str1, str2)
            matrix[i, s] = dist
    return matrix
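# A toy sketch of the same pattern: an N_SUBMITS x N_SOLUTIONS matrix of
# Levenshtein distances between two lists of already-encoded strings.
# `submits_encoded` and `solutions_encoded` are illustrative stand-ins for the
# visited/function encodings used above.
import numpy as np
import Levenshtein

submits_encoded = ["abcd", "abce", "xyz"]
solutions_encoded = ["abcd", "abcf"]
matrix = np.zeros((len(submits_encoded), len(solutions_encoded)))
for i, sub in enumerate(submits_encoded):
    for s, sol in enumerate(solutions_encoded):
        matrix[i, s] = Levenshtein.distance(sub, sol)
print(matrix)
# [[0. 1.]
#  [1. 1.]
#  [4. 4.]]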
def statistic_similarity(self, paper, min_similarity):
    """Split the paper text into n-grams (unigrams, bigrams, trigrams) and check
    the Levenshtein similarity of each n-gram against the topics in the ontology.

    Args:
        paper (string): the paper to analyse; at this stage it is a string.
        min_similarity (float): minimum Levenshtein similarity between an n-gram
            and a topic within the CSO for the topic to be retained.

    Returns:
        found_topics (dictionary): the found topics with their similarity and the n-gram analysed.
    """
    # analysing grams; `ls` is expected to be the Levenshtein StringMatcher module
    # (e.g. `from Levenshtein import StringMatcher as ls`)
    found_topics = {}

    unigrams = ngrams(word_tokenize(paper, preserve_line=True), 1)
    for grams in unigrams:
        gram = " ".join(grams)
        # only compare against topics sharing the same 4-character prefix
        topics = [key for key, _ in self.cso['topics'].items() if key.startswith(gram[:4])]
        for topic in topics:
            m = ls.StringMatcher(None, topic, gram).ratio()
            if m >= min_similarity:
                if topic in found_topics:
                    found_topics[topic].append({'matched': gram, 'similarity': m})
                else:
                    found_topics[topic] = [{'matched': gram, 'similarity': m}]

    bigrams = ngrams(word_tokenize(paper, preserve_line=True), 2)
    for grams in bigrams:
        gram = " ".join(grams)
        topics = [key for key, _ in self.cso['topics'].items() if key.startswith(gram[:4])]
        for topic in topics:
            m = ls.StringMatcher(None, topic, gram).ratio()
            if m >= min_similarity:
                if topic in found_topics:
                    found_topics[topic].append({'matched': gram, 'similarity': m})
                else:
                    found_topics[topic] = [{'matched': gram, 'similarity': m}]

    trigrams = ngrams(word_tokenize(paper, preserve_line=True), 3)
    for grams in trigrams:
        gram = " ".join(grams)
        topics = [key for key, _ in self.cso['topics'].items() if key.startswith(gram[:4])]
        for topic in topics:
            m = ls.StringMatcher(None, topic, gram).ratio()
            if m >= min_similarity:
                if topic in found_topics:
                    found_topics[topic].append({'matched': gram, 'similarity': m})
                else:
                    found_topics[topic] = [{'matched': gram, 'similarity': m}]

    return found_topics
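# A minimal sketch of the n-gram generation used above, assuming NLTK is
# installed; `ngrams` and `word_tokenize` are the NLTK helpers
# (`from nltk import ngrams, word_tokenize`).
from nltk import ngrams, word_tokenize

text = "deep learning for natural language processing"
tokens = word_tokenize(text, preserve_line=True)
for n in (1, 2, 3):
    print([" ".join(g) for g in ngrams(tokens, n)])
# n=2 yields ['deep learning', 'learning for', 'for natural', 'natural language', 'language processing']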
def validate_word(ideal_word: str, recognized_word: str, levenshtein_percent: float) -> bool:
    # Normalised Levenshtein similarity in [0, 1]: 1.0 means identical strings,
    # 0.0 means every character differs. Despite the parameter name, the
    # threshold is compared as a fraction in [0, 1], not a percentage.
    normalized_levenshtein = 1 - (
        levenshtein.distance(ideal_word, recognized_word)
        / max(len(ideal_word), len(recognized_word)))
    return normalized_levenshtein >= levenshtein_percent
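# Hypothetical usage sketch, assuming `levenshtein` is the python-Levenshtein
# package imported as `import Levenshtein as levenshtein`.
import Levenshtein as levenshtein

print(validate_word("hello", "helo", 0.75))   # distance 1, similarity 0.8 -> True
print(validate_word("hello", "world", 0.75))  # distance 4, similarity 0.2 -> False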
def check(word, dictionary):
    # reject empty input and tokens starting with a digit
    if word is None or word == '' or str(word)[0].isnumeric():
        return []
    # if str(word)[0].isupper():
    #     return [' ']
    fl = str(word).lower()[0]
    # restrict the dictionary to words sharing the same first letter,
    # then keep those within Levenshtein distance 1 of the input
    sub_dict = list(filter(lambda x: x.startswith(fl), dictionary))
    similar = list(
        filter(lambda x: StringMatcher.distance(str(word).lower(), x) < 2, sub_dict))
    # similar = list(filter(lambda x: word.lower() == x, sub_dict))
    return similar
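# Hypothetical usage sketch, assuming `StringMatcher` is the StringMatcher module
# from the python-Levenshtein package (`from Levenshtein import StringMatcher`),
# which re-exports `distance`.
dictionary = ["apple", "apply", "ample", "banana"]
print(check("aple", dictionary))    # ['apple', 'ample'] -- within edit distance 1
print(check("banana", dictionary))  # ['banana'] -- exact match, distance 0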
def __refine_found_words(self, similar_words):
    """
    Args:
        similar_words (list): (word, similarity) pairs returned by the word-embedding model.

    Returns:
        identified_topics (list): all found topics with their string and embedding similarities.
    """
    identified_topics = list()
    for word, sim in similar_words:
        topics = self.cso.find_closest_matches(word)
        for topic in topics:
            # topic comes from the CSO; "wet" is the word from the embedding model
            str_sim = ls.StringMatcher(None, topic, word).ratio()
            if str_sim >= self.min_similarity:
                identified_topics.append({"topic": topic, "sim_t": str_sim, "wet": word, "sim_w": sim})
    return identified_topics
def find_unit(
    *,
    ulist,
    in_unit,
    verbose: bool,
    debug: bool,
):
    # pick the unit in `ulist` with the smallest Levenshtein distance to `in_unit`;
    # `eprint` is assumed to be a print-to-stderr helper defined elsewhere
    distance = -1
    for unit in ulist:
        dist = StringMatcher.distance(in_unit, unit)
        if distance < 0:
            distance = dist
            winning_unit = unit
        elif dist < distance:
            distance = dist
            winning_unit = unit
    eprint("Warning: converting {0} to {1}".format(in_unit, winning_unit))
    return winning_unit
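# A minimal sketch of the same nearest-unit selection using `min` with a key
# function; `units` is an illustrative list and `Levenshtein` is the
# python-Levenshtein package.
import Levenshtein

units = ["meter", "metre", "mile", "millimeter"]
closest = min(units, key=lambda u: Levenshtein.distance("meters", u))
print(closest)  # "meter" (distance 1)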
def getDifflibOrPyLev(
    seq2=None,
    junk=None,
    forceDifflib=False,
):
    '''
    Returns either a difflib.SequenceMatcher or a python-Levenshtein
    StringMatcher.StringMatcher object, depending on what is installed.

    If forceDifflib is True, use difflib even if python-Levenshtein is installed.
    '''
    if forceDifflib is True:
        smObject = difflib.SequenceMatcher(junk, '', seq2)
    else:
        try:
            from Levenshtein import StringMatcher as pyLevenshtein
            smObject = pyLevenshtein.StringMatcher(junk, '', seq2)
        except ImportError:
            smObject = difflib.SequenceMatcher(junk, '', seq2)
    return smObject
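# Hypothetical usage sketch: both matcher types follow the SequenceMatcher API,
# so the caller can set the first sequence afterwards and query the ratio.
import difflib  # needed by getDifflibOrPyLev when python-Levenshtein is absent

sm = getDifflibOrPyLev(seq2='bananas')
sm.set_seq1('banana')
print(round(sm.ratio(), 3))  # ~0.923 for 'banana' vs 'bananas'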
def find_closest_string_distance(
    *,
    string_dict,
    in_string,
    verbose: bool,
    debug: bool,
):
    # map each Levenshtein distance to the path keys whose strings are at that
    # distance; `ic` (icecream) and `eprint` are assumed to be debugging helpers
    # imported at module level
    distances_to_paths = defaultdict(list)
    distance = -1
    if verbose:
        ic(len(string_dict))
    for path_key, string in string_dict.items():
        dist = StringMatcher.distance(in_string, string)
        if verbose:
            ic(dist, path_key)
        distances_to_paths[dist].append(path_key)
        if distance < 0:
            distance = dist
            winning_key = path_key
        elif dist < distance:
            distance = dist
            winning_key = path_key

    if verbose:
        for path_distance in distances_to_paths.keys():
            ic(path_distance)
            for path in distances_to_paths[path_distance]:
                ic(path)
            print("\n", file=sys.stderr)

    eprint('\n', in_string)
    ic(winning_key)
    eprint('\n', string_dict[winning_key])
    ic(distance, winning_key)

    # report the ten smallest distances and their paths
    winning_distances = sorted(distances_to_paths.keys())[:10]
    for distance in winning_distances:
        ic(distance, distances_to_paths[distance])

    return winning_key
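# A minimal sketch of the same closest-match idea with plain dict/sorted calls;
# `paths` is a hypothetical mapping and `Levenshtein.distance` comes from the
# python-Levenshtein package.
import Levenshtein

paths = {"/tmp/alpha.txt": "alpha", "/tmp/beta.txt": "beta", "/tmp/gamma.txt": "gamma"}
query = "alpah"
ranked = sorted(paths, key=lambda k: Levenshtein.distance(query, paths[k]))
print(ranked[0])   # "/tmp/alpha.txt", the closest match
print(ranked[:2])  # the two best candidates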
def compute_feature(cls, HL: Headline) -> np.ndarray:
    # replaced word & replacement word
    words = [HL.sentence[HL.word_index], HL.edit]
    # transcribe each token to ARPAbet phones
    phones = [" ".join(cls.g2p(w.lower())) for w in words]
    # for i, w in enumerate(words):
    #     try:
    #         s = " "
    #         words[i] = s.join(cls.g2p(w))
    #     except KeyError:
    #         # print erroneous key
    #         print(w)
    #         # track and print errors
    #         cls.counter += 1
    #         print(cls.counter)
    # calculate the Levenshtein distance between the two pronunciations
    levenshtein_dist = StringMatcher.distance(*phones)
    # scale using the max difference in "word length"
    scale_factor = max([len(w) for w in phones])
    scaled_dist = levenshtein_dist / scale_factor
    return np.array([scaled_dist])
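# A minimal sketch of the scaled phoneme-distance feature, assuming the phone
# sequences are already available as space-separated ARPAbet strings (so no
# grapheme-to-phoneme model is needed here).
import numpy as np
import Levenshtein

phones = ["K AE1 T", "B AE1 T"]  # "cat" vs "bat"
dist = Levenshtein.distance(*phones)
scaled = dist / max(len(p) for p in phones)
print(np.array([scaled]))  # character-level distance 1 over length 7 -> [0.14285714]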
def phase_compare(docu, file, n):
    dom = docu.lower()
    drs = filter_source(file, n)
    for i, e in enumerate(drs):
        try:
            lis = []
            word = e['Word']
            phase_ = e['Pahse']  # note: the key is spelled 'Pahse' in the source data
            index = dom.find(str(phase_))
            # print(e['Pahse'])
            # print(index)
            # print('\n')
            if index != -1:
                patten = re.compile(r'\d*')
                result = re.search(patten, word)
                index_begin = word.index(str(result.group()))
                index_middle = word.find('(')
                index_end = word.find(')')
                index_last = word.find('[')
                if len(result.group()) > 3:
                    if ')' in word:
                        tag1 = word[index_begin + 4:index_middle]
                        tag2 = word[index_middle + 1:index_end]
                        lis.append(tag1.strip())
                        lis.append(tag2.strip())
                    else:
                        tag3 = word[index_begin + 4:index_last]
                        lis.append(tag3.strip())
                else:
                    if ')' in word:
                        tag1 = word[index_begin + 4:index_middle]
                        tag2 = word[index_middle + 1:index_end]
                        lis.append(tag1.strip())
                        lis.append(tag2.strip())
                    else:
                        tag3 = word[index_begin + 4:index_last]
                        lis.append(tag3.strip())
                # deduplicate
                aim_word = set(lis)
                aim_word_ = list(aim_word)
                # compare: keep the candidate tag closest to the phrase
                if len(aim_word_) == 2:
                    s1 = aim_word_[0]
                    s2 = aim_word_[1]
                    len1 = StringMatcher.ratio(s1, phase_)
                    len2 = StringMatcher.ratio(s2, phase_)
                    aim_word_.clear()
                    if len1 > len2:
                        aim_word_.append(s1)
                    else:
                        aim_word_.append(s2)
                if len(aim_word_) == 1:
                    s3 = aim_word_[0]
                    aim_word_.clear()
                    aim_word_.append(s3)
                index1 = phase_.find(aim_word_[0].strip())
                if index1 != -1:
                    global index2
                    index2 = index + index1
                    yield {
                        'index': index2,
                        'word': aim_word_[0],
                        'len': len(aim_word_[0])
                    }
                    # mask the matched span so it is not matched again
                    X = 'a' * len(aim_word_[0])
                    word = dom[index2:index2 + len(aim_word_[0])]
                    dom = dom.replace(word, X, 1)
        except KeyError:
            continue
def search(name_string, bin_to_id, id_to_name, gender=None, birthdate=None, similarity_threshold=60):
    # `NamePart`, `NameAlias`, `normalizer`, `dmeta` (double metaphone) and
    # `levenshtein_distance` are assumed to be imported at module level.
    # TODO should distinguish between first name (less reliable match) and other names.
    #      consider storing in each bin a namepart object linking to its name linking
    #      to its subject, that has name.isFirstName: bool
    # TODO consider searching per name alias instead of per candidate (list of aliases),
    #      requires a different data structure for lookups

    # 1. calculate the phonetic bins of the input name
    name_parts = [NamePart(name_string)]
    name_parts = normalizer.normalize_name_alias(NameAlias(name_parts, None))
    bins = set()
    for name_part in name_parts:
        # dmeta sometimes outputs an empty 'None' bin, filter it out
        name_part_bins = [b for b in dmeta(name_part) if b]
        for bin in name_part_bins:
            bins.add((bin, name_part))

    # 2. find candidates with one or more matching bins
    candidates = set()
    name_parts_matched = set()
    bad_candidates = []  # candidates found to be bad matches for the query
    for (bin, name_part) in bins:
        if bin in bin_to_id:
            candidates_in_bin = bin_to_id[bin]
            for c in candidates_in_bin:
                (candidate_id, candidate_name_part) = c
                if candidate_id in bad_candidates:
                    # we already know this candidate is a bad match
                    continue
                (names, birthdates) = id_to_name[candidate_id]
                # filter out None value for gender, i.e. unknown
                registered_genders = [g for g in [x.gender for x in names] if g]
                if gender and len(registered_genders) == 1 and gender not in registered_genders:
                    # mark the candidate as bad, so that we don't have to consider it again for this search query
                    bad_candidates.append(candidate_id)
                    continue  # skip to next candidate
                if birthdate and birthdates:  # exact birthdates are known
                    if birthdate not in birthdates:
                        # mark the candidate as bad, so that we don't have to consider it again for this search query
                        bad_candidates.append(candidate_id)
                        continue  # skip to next candidate
                    # TODO also check birthdate ranges, or birthyear list only
                    # TODO could optionally check birth country
                if levenshtein_distance.ratio(name_part, candidate_name_part) >= 0.6:
                    # do not add really bad matches
                    candidates.add(candidate_id)
                    name_parts_matched.add(name_part)

    # 3. calculate phonetic string similarity
    name_parts_missed = name_parts - name_parts_matched
    matching_character_count = sum(map(len, name_parts_matched))
    missing_character_count = sum(map(len, name_parts_missed))
    phonetic_similarity_ratio = 100 * matching_character_count / (
        matching_character_count + missing_character_count)
    if phonetic_similarity_ratio < 25:
        # performance: early exit for really bad matches
        return []  # return no matches

    # 4. look up candidate names, filter out matches that are really bad,
    #    sort the remaining matches by similarity ratio
    normalized_query_name = " ".join(name_parts)
    # TODO word counts can be precomputed for better performance
    # make sure to split only on whitespace
    input_word_count = 1 if normalized_query_name.find(" ") < 0 else len(normalized_query_name.split())
    short_name_length_limit = 12
    is_short_input_name = len(normalized_query_name) <= short_name_length_limit
    shortness = max(0, short_name_length_limit - len(normalized_query_name))
    filtered_candidates = []
    for candidate_id in candidates:
        list_subject = id_to_name[candidate_id]
        (list_subject_aliases, birthdays) = list_subject
        for candidate_name in list_subject_aliases:
            # TODO precompute this for better performance
            normalized_candidate_name = " ".join(normalizer.normalize_name_alias(candidate_name))
            string_similarity = fuzz.token_sort_ratio(normalized_candidate_name, normalized_query_name)
            exact_match = string_similarity == 100
            similarity_score = string_similarity - 5
            if not exact_match:
                # 1. apply boosts:
                # boost phonetically similar matches (up to approx 6 points at 90% threshold)
                boost_from_phonetic_similarity = similarity_threshold / 100.0 * phonetic_similarity_ratio / 16
                similarity_score += boost_from_phonetic_similarity

                # 2. apply penalties:
                if is_short_input_name:
                    # TODO hackish, look for a better solution
                    # short matches must be extra good; reduces false positives
                    debuff = 2 * (similarity_threshold / 100.0) * shortness
                    similarity_score -= debuff
                # TODO word counts can be precomputed for better performance
                candidate_word_count = 1 if normalized_candidate_name.find(" ") < 0 else len(normalized_candidate_name.split())
                missing_words = abs(candidate_word_count - input_word_count)
                if missing_words:
                    missing_words_score = missing_words * 5 * similarity_threshold / 100.0
                    # set a ceiling for the penalty
                    missing_words_penalty = min(20, missing_words_score)
                    # 0 if missing 0 words, -4 if missing 2 words, etc.
                    similarity_score -= missing_words_penalty

                # 3. normalize score after applying boosts and penalties;
                # present all non-exact matches as no more than 99.9
                similarity_score = max(0, min(similarity_score, 99.9))
            if similarity_score >= similarity_threshold:
                element = (candidate_id, similarity_score, candidate_name)
                filtered_candidates.append(element)

    filtered_candidates.sort(key=lambda tup: tup[1], reverse=True)  # sort by ratio, descending

    # only report one match against each list-subject, the best matching alias
    unique_candidates = []
    seen_candidates = set()
    for c in filtered_candidates:
        (candidate_id, similarity_score, candidate_name) = c
        if candidate_id not in seen_candidates:
            unique_candidates.append(c)
            seen_candidates.add(candidate_id)
    return unique_candidates
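# A minimal sketch of the base similarity used in step 4, assuming `fuzz` is the
# fuzzywuzzy module (`from fuzzywuzzy import fuzz`); token_sort_ratio is
# order-insensitive, which is why "doe john" still scores 100 against "john doe".
from fuzzywuzzy import fuzz

print(fuzz.token_sort_ratio("john doe", "doe john"))  # 100 -> treated as an exact match
print(fuzz.token_sort_ratio("john doe", "jon doe"))   # high, but below 100 -> boosts/penalties apply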
def statistic_similarity(self, paper, min_similarity):
    """Split the paper text into n-grams (unigrams, bigrams, trigrams) and check
    the Levenshtein similarity of each n-gram against the topics in the ontology.

    Args:
        paper (string): the paper to analyse; at this stage it is a string.
        min_similarity (float): minimum Levenshtein similarity between an n-gram
            and a topic within the CSO for the topic to be retained.

    Returns:
        found_topics (dictionary): the found topics with their similarity and the n-gram analysed.
    """
    # analysing grams
    found_topics = defaultdict(list)
    matches = set()
    tokens = word_tokenize(paper, preserve_line=True)

    # TODO: this is constant; factor out
    # bucket the CSO topics by their first four characters so that each n-gram
    # is only compared against topics sharing its prefix
    topic_stems = defaultdict(list)
    for k in self.cso['topics'].keys():
        topic_stems[k[:4]].append(k)

    # longest n-grams first, so a token span already matched by a longer n-gram
    # is not re-matched by a shorter one
    for n in range(3, 0, -1):
        for i, grams in enumerate(ngrams(tokens, n)):
            if i in matches:
                continue
            gram = " ".join(grams)
            topic_block = topic_stems.get(gram[:4], [])
            for topic in topic_block:
                m = ls.StringMatcher(None, topic, gram).ratio()
                if m >= min_similarity:
                    topic = self.get_primary_label(topic, self.cso['primary_labels'])
                    found_topics[topic].append({'matched': gram, 'similarity': m})
                    matches.add(i)

    return found_topics
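# A toy sketch of the prefix-bucketing optimisation above: group topic labels by
# their first four characters, then compare each n-gram only against its bucket.
# `topics` and the 0.9 threshold are illustrative values; `Levenshtein.ratio`
# comes from the python-Levenshtein package.
from collections import defaultdict
import Levenshtein

topics = ["machine learning", "machine translation", "data mining", "databases"]
topic_stems = defaultdict(list)
for t in topics:
    topic_stems[t[:4]].append(t)

gram = "machine learnin"  # e.g. a bigram with a truncated token
for topic in topic_stems.get(gram[:4], []):
    r = Levenshtein.ratio(topic, gram)
    if r >= 0.9:
        print(topic, round(r, 3))  # machine learning 0.968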
def statistic_similarity(self, paper, min_similarity):
    """Split the paper text into n-grams (unigrams, bigrams, trigrams) and check
    the Levenshtein similarity of each n-gram against the topics in the ontology.

    Args:
        paper (string): the paper to analyse; at this stage it is a string.
        min_similarity (float): minimum Levenshtein similarity between an n-gram
            and a topic within the CSO for the topic to be retained.

    Returns:
        found_topics (dictionary): the found topics with their similarity and the n-gram analysed.
    """
    # analysing grams
    found_topics = {}

    # trigrams first; remember which positions they matched so that overlapping
    # bigrams and unigrams are skipped below
    idx = 0
    trigrams = ngrams(word_tokenize(paper, preserve_line=True), 3)
    matched_trigrams = []
    for grams in trigrams:
        idx += 1
        gram = " ".join(grams)
        topics = [key for key, _ in self.cso['topics'].items() if key.startswith(gram[:4])]
        for topic in topics:
            m = ls.StringMatcher(None, topic, gram).ratio()
            if m >= min_similarity:
                topic = self.get_primary_label(topic, self.cso['primary_labels'])
                if topic in found_topics:
                    found_topics[topic].append({'matched': gram, 'similarity': m})
                else:
                    found_topics[topic] = [{'matched': gram, 'similarity': m}]
                matched_trigrams.append(idx)

    idx = 0
    bigrams = ngrams(word_tokenize(paper, preserve_line=True), 2)
    matched_bigrams = []
    for grams in bigrams:
        idx += 1
        if (idx not in matched_trigrams) and ((idx - 1) not in matched_trigrams):
            gram = " ".join(grams)
            topics = [key for key, _ in self.cso['topics'].items() if key.startswith(gram[:4])]
            for topic in topics:
                m = ls.StringMatcher(None, topic, gram).ratio()
                if m >= min_similarity:
                    topic = self.get_primary_label(topic, self.cso['primary_labels'])
                    if topic in found_topics:
                        found_topics[topic].append({'matched': gram, 'similarity': m})
                    else:
                        found_topics[topic] = [{'matched': gram, 'similarity': m}]
                    matched_bigrams.append(idx)

    idx = 0
    unigrams = ngrams(word_tokenize(paper, preserve_line=True), 1)
    for grams in unigrams:
        idx += 1
        if (idx not in matched_trigrams) and ((idx - 1) not in matched_trigrams) and (
                idx not in matched_bigrams) and ((idx - 1) not in matched_bigrams):
            gram = " ".join(grams)
            topics = [key for key, _ in self.cso['topics'].items() if key.startswith(gram[:4])]
            for topic in topics:
                m = ls.StringMatcher(None, topic, gram).ratio()
                if m >= min_similarity:
                    topic = self.get_primary_label(topic, self.cso['primary_labels'])
                    if topic in found_topics:
                        found_topics[topic].append({'matched': gram, 'similarity': m})
                    else:
                        found_topics[topic] = [{'matched': gram, 'similarity': m}]
    return found_topics