import re

import Levenshtein


def strip_bogus_lines(arg_lines):
    """Drop lines that look like page furniture: blanks, page numbers, and
    near-matches of the boilerplate header/footer strings."""
    re_blank = re.compile(r'^\s*$')  # original was r'^\s$', which misses multi-character blank lines
    re_page_left = re.compile(r'^\s*Page [0-9]+')
    re_page_right = re.compile(r'Page [0-9]+\s*$')
    berk_string = 'BERKELEY TRAINING ASSOCIATES © 2009\n'
    mft_string = 'MFT PRACTICE EXAMINATIONS'
    lines = []
    for line in arg_lines:
        bogosity = 0.0
        if re_blank.search(line):
            bogosity += 1.0
        if re_page_left.search(line):
            bogosity += 0.5
        if re_page_right.search(line):
            bogosity += 0.5
        # Fuzzy-match the boilerplate strings against either end of the line.
        l = Levenshtein.distance(line[-len(mft_string):], mft_string)
        if l < 5:
            bogosity += (5 - l) / 5.0
        l = Levenshtein.distance(line[:len(mft_string)], mft_string)
        if l < 5:
            bogosity += (5 - l) / 5.0
        l = Levenshtein.distance(line, berk_string)
        if l < 5:
            bogosity += (5 - l) / 5.0
        if bogosity < 0.25:
            lines.append(line)
    return lines
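# A minimal usage sketch (an assumption, not part of the original source): feed
# strip_bogus_lines a few hypothetical OCR lines and keep only real content.
sample = [
    'A therapist should first assess the risk of harm.\n',
    '   \n',                        # blank line: bogosity 1.0, dropped
    'Page 12\n',                    # page number: bogosity 1.0, dropped
    'MFT PRACTICE EXAMINATIONS\n',  # boilerplate header: dropped
]
print(strip_bogus_lines(sample))   # only the first line survives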
import itertools
import re

import Levenshtein


def write_lex_stats(b, num, syll=None):
    """Use the Levenshtein package to calculate edit distance and count up
    minimal pairs (mps), neighbors, homophones, etc. Writes a CSV row to the
    module-level file handle `f`."""
    total = 0.
    mps = 0
    neighbors = 0
    homophones = 0
    lev_total = 0
    for item in itertools.combinations(b, 2):
        # Note: both branches are currently identical because the per-syllable
        # filter is commented out.
        if syll != None:
            # if len(item[0].split("-")) == syll or len(item[1].split("-")) == syll:
            lev = Levenshtein.distance(re.sub("-", "", item[0]), re.sub("-", "", item[1]))
            if lev == 0:
                homophones += 1
            elif lev == 1:
                neighbors += 1
                if len(re.sub("-", "", item[0])) == len(re.sub("-", "", item[1])):
                    mps += 1
            total += 1
            lev_total += lev
        else:
            lev = Levenshtein.distance(re.sub("-", "", item[0]), re.sub("-", "", item[1]))
            if lev == 0:
                homophones += 1
            elif lev == 1:
                neighbors += 1
                if len(re.sub("-", "", item[0])) == len(re.sub("-", "", item[1])):
                    mps += 1
            total += 1
            lev_total += lev
    print str(num)
    f.write(",".join([str(x) for x in [num, homophones, mps, neighbors, lev_total / total, len(b)]]) + "\n")
    return
def search(self, keywords, cutoff=0.3):
    """Search through all songs in self.songs.

    Determines all songs being matched by the supplied keywords. Returns a
    list of tuples of the form (song, matchratio), where matchratio goes from
    <cutoff> to 1.0; 1.0 being a perfect match. The result is sorted by that
    value, highest match ratios first."""
    num_keywords = len(keywords)
    results = []
    for song in self.songs.values():
        # search in title and gametitle
        haystack1 = set(song.title.lower().split())
        haystack2 = set(song.game.title.lower().split())
        ratio = 0
        for keyword in keywords:
            keyword = keyword.lower()
            # determine best keyword match
            subratio1 = max(Levenshtein.ratio(keyword, word) for word in haystack1)
            subratio2 = max(Levenshtein.ratio(keyword, word) for word in haystack2)
            subratio = max(subratio1, subratio2 * 0.8)
            if subratio > 0.7:  # assume low ratios are no match
                ratio += subratio
        ratio /= num_keywords
        if ratio > cutoff:  # random cutoff value
            results.append((song, ratio))
    return sorted(results, key=lambda s: s[1], reverse=True)
def predictionRatio(df, metric="Levenshtein"):
    # Generate all possible combinations for string matching
    soc_media_1, soc_media_2 = df.columns
    # Convert everything to lower case
    df[soc_media_1] = df[soc_media_1].str.lower()
    df[soc_media_2] = df[soc_media_2].str.lower()
    df_known = DataFrame([df[soc_media_1].tolist()] * df.shape[0], index=df.index, columns=df.index)
    df_search = DataFrame([df[soc_media_2].tolist()] * df.shape[0], index=df.index, columns=df.index)
    df_known_list = df_known.applymap(lambda x: list([x]))
    df_search_list = df_search.applymap(lambda x: list([x]))
    df_search_list = df_known_list + df_search_list.T
    # Find the indices of columns for each row based on metric
    # For Levenshtein get the min., for JaroWinkler get the max.
    if metric == 'Levenshtein':
        search_res = df_search_list.applymap(lambda x: Levenshtein.distance(x[0], x[1]))
        indices = search_res.idxmin(axis=1)
    else:
        search_res = df_search_list.applymap(lambda x: Levenshtein.jaro_winkler(x[0], x[1]))
        indices = search_res.idxmax(axis=1)
    # Get the matches for social media account (.ix is legacy pandas indexing)
    match = df[soc_media_2].ix[indices]
    df_t = DataFrame()
    df_t['actual'] = df[soc_media_2].reset_index(drop=True)
    df_t['match'] = match.reset_index(drop=True)
    # Find the ratio of correct matches
    match_count = (df_t.actual == df_t.match).value_counts()
    ratio = float(match_count[True]) / (match_count[True] + match_count[False])
    return ratio
def compare_list(self, company_name, table_name):
    # Placeholder data; real rows are strings pulled from the two systems.
    iac_hz_list = [[0, 1, 2, 3, 4, 5, 6]]
    iac_guanwang_list = [[1, 1, 2, 3, 4, 5, 6]]
    hz_list = [x.replace("\n", "").replace("\r", "") for x in iac_hz_list]
    guanwang_list = [x.replace("\n", "").replace("\r", "") for x in iac_guanwang_list]
    if iac_hz_list is None:
        print "haizhi system lack %s's %s." % (company_name, table_name)
        return
    if iac_guanwang_list is None:
        print "guanwang lack %s's %s." % (company_name, table_name)
        return
    if len(iac_hz_list) == len(iac_guanwang_list):
        # original format strings used "d%" instead of "%d"
        print "the length of list %s is same : %d" % (table_name, len(iac_hz_list))
    else:
        print "the length of list %s is different : %d,%d" % (table_name, len(iac_hz_list), len(iac_guanwang_list))
    # number of identical elements
    num = 0
    for row in hz_list:
        if row in guanwang_list:
            # guanwang_list.remove(row)
            # hz_list.remove(row)
            num += 1
            continue
        else:
            for row2 in guanwang_list:
                Levenshtein.distance(row, row2)
def check_clusters(seq_list, all_clusters, cutoff, unique):
    print('Checking for unique input sequences')
    seqs = {}
    for seq in seq_list:
        seqs[seq[0]] = seqs.get(seq[0], 0) + 1  # original keyed the get() by seq, not seq[0]
    # Sequences in seq_list are unique
    if unique:
        for k, v in seqs.items():
            if v != 1:
                print('Error: sequence %s appears in seq_list more than once.' % k)
    # There is a one-to-one correspondence between sequences in seq_list and sequences in all_clusters
    print('Checking for one-to-one correspondence between input and output sequences.')
    for cluster, max_len, min_len in all_clusters:
        for seq, id in cluster:
            if seq not in seqs:
                print('Error: sequence %s is in all_clusters but not in seq_list.' % seq)
            else:
                seqs[seq] += 1
    for k, v in seqs.items():
        if v < 2:
            print('Error: sequence %s appears in seq_list but not in all_clusters.' % k)
        elif unique and v > 2:
            print('Error: sequence %s appears in all_clusters more than once.' % k)
    # The cluster forms a connected network with each sequence in a cluster
    # having a nearest neighbour within the cutoff distance
    print('Checking cluster membership.')
    t0 = time.time()
    i = 0
    for cluster, max_len, min_len in all_clusters:
        # push each cluster through get_clusters and check it results in a single cluster
        # one could argue that this isn't strictly an independent check, but the underlying algorithm is in scipy
        # this does check that merging across chunks has happened correctly
        if len(cluster) > 1:
            res = get_clusters(cluster, cutoff)
            if len(res) != 1:
                # original passed two arguments to three format specifiers
                print('Error: cluster with sequence %s (id %s) is partitioned into %d clusters by further application of get_cluster.'
                      % (cluster[0][0], cluster[0][1], len(res)))
        i += 1
        t1 = time.time()
        if t1 - t0 > 10:
            print('Checking cluster %d\n' % i)
            t0 = time.time()
    # No clusters are mergeable
    print('Checking that clusters are distinct.')
    for i in range(len(all_clusters)):
        c1 = all_clusters[i][0]
        for c2, max_len, min_len in all_clusters[i + 1:]:
            for s1, i1 in c1:
                for s2, i2 in c2:
                    cut = int(cutoff * min(len(s1), len(s2)))
                    if hamming:  # `hamming` is a module-level flag in the original
                        if len(s1) == len(s2) and ld.hamming(s1, s2) <= cut:
                            print('Error: sequences %s (id %s) and %s (id %s) are in different clusters but are within the cutoff distance.' % (s1, i1, s2, i2))
                    else:
                        if ld.distance(s1, s2, cut) <= cut:
                            print('Error: sequences %s (id %s) and %s (id %s) are in different clusters but are within the cutoff distance.' % (s1, i1, s2, i2))
from re import split

import Levenshtein


def byLevenshtein(key, result_yield):
    lang = "zha"
    try:
        str(key).encode('iso-8859-1')
    except UnicodeEncodeError:
        lang = "zh"
    result_list2d = []
    if lang == "zha":
        for i in result_yield:
            result_list2d.append([Levenshtein.distance(key, i[0]), i])
    else:
        for i in result_yield:
            for j in i[1]:
                list_tmp = split("[\[\]\(\)\ \;\,\。\,\.]", j)
                list_distance = []
                if len(list_tmp) == 2:  # 2 means this entry contains only 1 word
                    distance = Levenshtein.distance(key, list_tmp[1])
                    if distance == 0:
                        list_distance.append(-1)  # -1 means the best matched one
                    else:
                        list_distance.append(distance)
                    continue
                for tmp in list_tmp:
                    if key in tmp:
                        list_distance.append(Levenshtein.distance(key, tmp))
                result_list2d.append([min(list_distance), i])
    # The method above is not so accurate, but it might work better than the previous one
    result_list2d.sort()
    for i in result_list2d:
        yield i[1]
def similarTerms(self, target):
    the_same = []
    counter = 0
    with codecs.open(termfile, 'rb', encoding='utf-8') as tf:
        list_of_t = tf.readlines()
    for item in list_of_t:
        item = item.strip('\n')
        if item != target:
            if self.if_compoTerm(target):
                List_target = self.splitTerms(target)
                for t in List_target:
                    if item.find(t) != -1:
                        if item not in the_same:
                            dist = Levenshtein.distance(item, target)
                            print("the dist:", dist)
                            if item != target:
                                the_same.append(item)
                    # note: equality test on a float ratio; ">= 0.8" was likely intended
                    if Levenshtein.ratio(t, item) == 0.8:
                        if item not in the_same:
                            if re.fullmatch(item, target):
                                the_same.append(item)
    # print("the ratio is ", the_ratio)
    # print("is", the_same)
    return the_same
import re

import Levenshtein as levenshtein

re_type = type(re.compile(''))  # assumed: compiled-regex type used for pattern elements


def splitted_word_distance(pattern, text):
    # remove trailing chars...
    words = text.split()
    lp = len(pattern)
    lw = len(words)
    if lp < lw:
        d = len(''.join(words[lp:]))
        n = lp
    else:
        if re_type in map(lambda e: type(e), pattern[lw:]):
            return float("inf")
        d = len(''.join(pattern[lw:]))
        n = lw
    d0 = [d]
    for i in range(n):
        word = words[i]
        p = pattern[i]
        if type(p) == str:
            d += levenshtein.distance(p, word)
            d0.append(levenshtein.distance(p, word))
        elif type(p) == re_type:
            if not p.fullmatch(word):
                return float("inf")
        else:
            # original reported type(pattern) here, but the offending element is p
            raise Exception("Pattern has wrong type %s" % (str(type(p))))
    return d
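# Small illustration (assumes the imports above): string pattern elements add
# edit distance, while regex elements must fullmatch or the result is infinite.
pattern = ["helo", re.compile(r"w.rld")]
print(splitted_word_distance(pattern, "hello world"))   # 1 (one edit fixes "helo")
print(splitted_word_distance(pattern, "hello planet"))  # inf (regex fails to match)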
# Assumed imports for this Python 2 snippet (it uses `print` statements and itertools.ifilter):
from itertools import ifilter
from operator import itemgetter

import Levenshtein
import networkx as nx


def identify_anchor_kmer_in_reference_graph(reference_graph, kmer_to_anchor, leftmost=None, rightmost=None, path_length=None):
    """
    :type reference_graph: nx.DiGraph
    """
    toposort = {v: k for k, v in enumerate(nx.topological_sort(reference_graph))}
    # print "Rightmost is ", rightmost, toposort[rightmost]
    nodes_to_consider = reference_graph.nodes()
    if rightmost:
        idx = toposort[rightmost]
        nodes_to_consider = ifilter(lambda x: toposort[x] <= idx, nodes_to_consider)
        # print "Max is ", idx
    if leftmost:
        idx = toposort[leftmost]
        nodes_to_consider = ifilter(lambda x: toposort[x] >= idx, nodes_to_consider)
        # print "Min is ", idx
    nodes_to_consider = list(nodes_to_consider)
    node_dists = [(node, Levenshtein.distance(node, kmer_to_anchor), Levenshtein.editops(node, kmer_to_anchor))
                  for node in nodes_to_consider]
    # print "Will search anchor in ", list(node_dists)
    min_dist = min(node_dists, key=itemgetter(1))[1]
    node_dists = [x for x in node_dists if x[1] == min_dist]
    print "Min possible dist is", min_dist
    if rightmost:
        score_func = lambda x: (x[1] - min_dist) + abs(toposort[x[0]] - (toposort[rightmost] - path_length))
    elif leftmost:
        score_func = lambda x: (x[1] - min_dist) + abs(toposort[x[0]] - (toposort[leftmost] + path_length))
    dist_sorted = sorted(node_dists, key=score_func)
    # identify the rightmost node with minimal distance
    return dist_sorted[0][0]
def get_closest_match(self, cells, matching_threshold, suppress_non_answer_cells=False):
    """Returns a list of cells that most closely match the question prompt.

    If no match is better than the matching_threshold, the empty list will be
    returned."""
    return_value = []
    distances = [Levenshtein.distance(self.start_md, u''.join(cell['source'])) for cell in cells]
    if min(distances) > matching_threshold:
        return return_value
    best_match = argmin(distances)
    if self.stop_md == u"next_cell":
        end_offset = 2
    elif len(self.stop_md) == 0:
        end_offset = len(cells) - best_match
    else:
        distances = [Levenshtein.distance(self.stop_md, u''.join(cell['source'])) for cell in cells[best_match:]]
        if min(distances) > matching_threshold:
            return return_value
        end_offset = argmin(distances)
    if len(self.question_heading) != 0 and not suppress_non_answer_cells:
        return_value.append(NotebookExtractor.markdown_heading_cell(self.question_heading, 2))
    if not suppress_non_answer_cells:
        return_value.append(cells[best_match])
    return_value.extend(cells[best_match + 1:best_match + end_offset])
    return return_value
def _match_user_agent(cls, user_agent):
    device = cls.objects.filter(user_agent=user_agent).order_by("-actual_device_root")[:1]
    if len(device):
        return device[0]
    else:
        if settings.UA_PREFIX_MATCHING:
            # Try more flexible matching, 1 third of the UA string
            ds_user_agent = user_agent[:len(user_agent) // 3]
            devices = cls.objects.filter(user_agent__startswith=ds_user_agent)
            devices = devices.order_by("-actual_device_root")[:settings.UA_PREFIX_MATCHING_LIMIT]
            if len(devices):
                user_agent = force_unicode(user_agent)
                best = reduce(
                    lambda x, y: Levenshtein.distance(user_agent, x.user_agent)
                    < Levenshtein.distance(user_agent, y.user_agent) and x or y,
                    devices,
                )
                if Levenshtein.distance(user_agent, best.user_agent) <= settings.UA_PREFIX_MATCHING_MAX_DISTANCE:
                    return best
        if settings.UA_GENERIC_FALLBACK:
            # Try to match with generic properties
            # :TODO:
            raise NotImplementedError  # original `raise NotImplemented` is not a valid exception
    raise NoMatch, "Can't find a match in currently installed WURFL table for user_agent `%s`" % user_agent
def findMatchScore(searchName, foundName):
    if type(searchName) is unicode:
        searchName = unicodedata.normalize('NFKD', searchName).encode('ascii', 'ignore')
    if type(foundName) is unicode:
        foundName = unicodedata.normalize('NFKD', foundName).encode('ascii', 'ignore')
    bigR = 0
    inputWords = searchName.replace(':', ' ').split(' ')
    foundWords = foundName.replace(':', ' ').split(' ')
    inputWords = removeSkipWords(inputWords)
    foundWords = removeSkipWords(foundWords)
    for iWord in inputWords:
        maxRatio = 0
        for fWord in foundWords:
            r = Levenshtein.ratio(iWord.lower().replace("/'s", ''), fWord.lower().replace("/'s", ''))
            if r > maxRatio:
                maxRatio = r
        bigR += maxRatio
    bigR2 = 0
    # if the input has MORE words than the solution (rare)
    for fWord in foundWords:
        maxRatio = 0
        for iWord in inputWords:
            r = Levenshtein.ratio(iWord.lower().replace("/'s", ''), fWord.lower().replace("/'s", ''))
            if r > maxRatio:
                maxRatio = r
        bigR2 += maxRatio
    bigR /= len(inputWords)
    bigR2 /= len(foundWords)
    return max(bigR, bigR2)
def find_by_distance():
    # Closure over `word`, `wordIdx`, and `self` from the enclosing scope.
    word_nl = getNormalWord(word).lower()
    min_dist = 10
    min_dist_word_idx = wordIdx
    import Levenshtein
    for i in range(0, self.mTotalEntries):
        word_ = getNormalWord(self.getWord(i)).lower()
        dist = Levenshtein.distance(word_nl, word_)
        if dist == 1:
            return i
        if dist < min_dist:
            min_dist = dist
            min_dist_word_idx = i
    for i in range(0, self.derived_dict.mTotalEntries):
        word_ = getNormalWord(self.derived_dict.getWord(i)).lower()
        dist = Levenshtein.distance(word_nl, word_)
        if dist == 1:
            words = self.derived_dict.getExplanations(word_)
            return self.getWordIdxInternal(words[0])
        if dist < min_dist:
            min_dist = dist
            words = self.derived_dict.getExplanations(word_)
            min_dist_word_idx = self.getWordIdxInternal(words[0])
    return min_dist_word_idx
def read_type(left_read, right_read, left_enzsite, right_enzsite, left_bc, right_bc):
    """Determine if bisulfite read is watson or crick"""
    lr_enz_left = left_read[1][len(left_bc):len(left_bc) + 5]
    rr_enz_right = right_read[1][len(right_bc):len(right_bc) + 5]
    if left_enzsite == 'TACAA' and right_enzsite == 'TGCAG':
        return 'crick'
    elif right_enzsite == 'TACAA' and left_enzsite == 'TGCAG':
        return 'watson'
    elif right_enzsite == 'TGCAG' and left_enzsite == 'TGCAG':
        return 'gbs'
    else:
        # enzyme sites have not been established correctly, establish read
        # type based on closest matching enz site and CG count.
        watson_count = left_read[1].count('G') + right_read[1].count('C') + 0.001
        crick_count = left_read[1].count('C') + right_read[1].count('G') + 0.001
        left_distance = Levenshtein.distance(lr_enz_left, left_enzsite)
        right_distance = Levenshtein.distance(rr_enz_right, right_enzsite)
        if left_distance < right_distance:
            # left enz_site should be leading since it has fewer mismatches.
            if left_enzsite == 'TACAA' and crick_count / float(watson_count) > 2:
                return 'crick'
            else:
                return 'nodet'
        else:
            if left_enzsite == 'TGCAG' and watson_count / float(crick_count) > 2:
                return 'watson'
            else:
                return 'nodet'
def identify_similar_teams(cls):
    matches = []
    final = []
    teams = Team.objects.all()
    for team_primary in teams:
        p_school = team_primary.team_code[:-3]
        for team_sec in teams:
            s_school = team_sec.team_code[:-3]
            if p_school != s_school:
                if Levenshtein.ratio(team_primary.team_code, team_sec.team_code) > 0.50:
                    matches.append((team_primary, team_sec))
    for t1, t2 in matches:
        if Levenshtein.ratio(t1.team_name, t2.team_name) > .60:
            data = {}
            data["team1"] = TeamDataFetch.process_teams(TeamSerializer(Team.objects.get(id=t1.id)).data)
            data["team2"] = TeamDataFetch.process_teams(TeamSerializer(Team.objects.get(id=t2.id)).data)
            data["score"] = Levenshtein.ratio(t1.team_name, t2.team_name) + Levenshtein.ratio(t1.team_code, t2.team_code)
            final.append(data)
    processed_data = []
    for obj in final:
        if not processed_data:
            processed_data.append(obj)
        elif (obj not in processed_data) and (not SimilarTeams.double_count_present(obj, processed_data)):
            processed_data.append(obj)
    return processed_data
def _match_user_agent(cls, user_agent):
    device = cls.objects.filter(user_agent=user_agent).order_by('-actual_device_root')[:1]
    if len(device):
        return device[0]
    else:
        if settings.UA_PREFIX_MATCHING:
            #~ Try more flexible matching, from 1/3rd to 1/10th of the original UA string
            #~ We break out as soon as we get a match (or matches, in which case we use Levenshtein
            #~ distance to determine which one we want to use) or if the shortened UA string is less
            #~ than 5 characters long
            devices = []  # original initialized this to None, which breaks len() if the loop exits immediately
            for factor in range(3, 10):
                if len(user_agent) / factor <= 5:
                    break
                devices = cls._match_partial_user_agent(user_agent, factor)
                if len(devices):
                    break
            if len(devices):
                user_agent = force_unicode(user_agent)
                best = reduce(
                    lambda x, y: Levenshtein.distance(user_agent, x.user_agent)
                    < Levenshtein.distance(user_agent, y.user_agent) and x or y,
                    devices,
                )
                if Levenshtein.distance(user_agent, best.user_agent) <= settings.UA_PREFIX_MATCHING_MAX_DISTANCE:
                    return best
        if settings.UA_GENERIC_FALLBACK:
            # Try to match with generic properties
            # :TODO:
            raise NotImplementedError, 'Generic properties matching is not implemented'
    raise NoMatch, "Can't find a match in currently installed WURFL table for user_agent `%s`" % user_agent
def prettyprint(self):
    print "Timestamp: " + self.data["timeseed"]
    print "Expected Data: " + self.data["expected_data"]
    print "PSK31 Data: " + self.data["psk_data"]
    print "PSK31 Jaro Dist: " + str(Levenshtein.jaro(self.data["expected_data"], self.data["psk_data"]))
    print "DOMEX8 Data: " + self.data["domex_data"]
    print "DOMEX Jaro Dist: " + str(Levenshtein.jaro(self.data["expected_data"], self.data["domex_data"]))
def getScoredMatches(self, word, possibilities, num, score, context=None):
    """Take a word, compare it to a list of possibilities, return max. num
    matches > score)."""
    self._checkPermission(context)
    if not USE_LEVENSHTEIN:
        # No levenshtein module around. Fall back to difflib
        return difflib.get_close_matches(word, possibilities, num, score)
    # Levenshtein is around, so let's use it.
    res = []
    # Search for all similar terms in possibilities
    if isinstance(word, str):
        oword = unicode(word, 'utf-8')
    else:
        oword = word.encode('utf-8')
    for item in possibilities:
        if isinstance(item, type(word)):
            lscore = Levenshtein.ratio(word, item)
        elif isinstance(item, type(oword)):
            lscore = Levenshtein.ratio(oword, item)
        else:
            raise ValueError, "%s is not a normal, or unicode string" % item
        if lscore > score:
            res.append((item, lscore))
    # Sort by score (high scores on top of list)
    res.sort(lambda x, y: -cmp(x[1], y[1]))
    # Return first n terms without scores
    return [item[0] for item in res[:num]]
import math

import Levenshtein


def GetSuggestionsFromCorpus(options, corpus, numSuggestions=5, MAXDIST=4, MINRATIO=0.4):
    # if len(corpus) == 0:
    #     return GetSuggestions(options)
    suggs = []
    for word in corpus:
        count = corpus[word]
        mindist = 999
        maxratio = 0
        for option in options:
            dist = Levenshtein.distance(word, option)  # edit distance
            ratio = Levenshtein.ratio(word, option)    # similarity ratio
            if mindist > dist:
                mindist = dist
            if maxratio < ratio:
                maxratio = ratio
        if mindist > MAXDIST or maxratio < MINRATIO:
            continue
        if mindist == 0:  # do not want same ones
            continue
        score = 1.0 / mindist * math.log(int(count) + 1.0)
        suggs.append((score, word, mindist, count, maxratio))
    suggs.sort(reverse=True)
    return suggs[:numSuggestions]
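# Worked example of the scoring rule above (hypothetical corpus and options):
# for corpus {"colour": 3} and options ["color"], mindist = 1 and
# score = (1.0 / 1) * log(3 + 1) ~= 1.386, so frequent near-misses rank first.
print(GetSuggestionsFromCorpus(["color"], {"colour": 3}))
# -> [(1.386..., 'colour', 1, 3, 0.909...)]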
def SuggestCand(words):
    ret = []
    total_edits = 0
    total_err_ratio = 0.0
    iscorrect = True
    for word in words:
        if en_dict.check(word) or corpus_dict.check(word):
            ret.append(word)
        else:
            iscorrect = False
            sugg_list = corpus_dict.suggest(word)
            if len(sugg_list) == 0:
                return None, 0, 0.
            # sorted_suggs = sorted(sugg_list, key=lambda sw: Levenshtein.ratio(word, sw), reverse=True)
            best_sugg = max(sugg_list, key=lambda sw: Levenshtein.ratio(word, sw))
            for w in best_sugg.split(' '):
                ret.append(w)
            total_edits += Levenshtein.distance(word, best_sugg)
            total_err_ratio += 1.0 - Levenshtein.ratio(word, best_sugg)
    if iscorrect:
        return None, 0, 0.
    if total_err_ratio > RATIO_THRESHOLD:
        return None, 0, 0.
    return ret, total_edits, total_err_ratio
def get_all_scores_before_after_no_change_avg(specificity, length_of_no_change, graph=False):
    avgs = []
    for new, old in find_patterns_of_no_change(length_of_no_change, specificity).iteritems():
        inew = new[0]
        vnew = new[1]
        iold = old[0]
        vold = old[1]
        all_indices = [get_future_mappings(vnew - length_of_no_change, x, length_of_no_change * 2)
                       for x, y in enumerate(current_paratexts[vnew])]
        for k, indices in enumerate(all_indices):
            if indices:
                before = 0
                after = 0
                for offset, paraindex in enumerate(indices):
                    if offset < len(indices) - 1:
                        if offset < length_of_no_change:
                            before += Levenshtein.ratio(
                                current_paratexts[vnew - length_of_no_change + offset][paraindex],
                                current_paratexts[vnew - length_of_no_change + offset + 1][indices[offset + 1]])
                        else:
                            after += Levenshtein.ratio(
                                current_paratexts[vnew - length_of_no_change + offset][paraindex],
                                current_paratexts[vnew - length_of_no_change + offset + 1][indices[offset + 1]])
                before = before / length_of_no_change
                after = after / length_of_no_change
                avgs.append(before - after)
    if graph:
        plt.hist(avgs, bins=21)
        plt.show()
    return avgs
def get_neighbor_scores_before_after_no_change_avg(specificity, length_of_no_change, graph=False):
    avgs = []
    for new, old in find_patterns_of_no_change(length_of_no_change, specificity).iteritems():
        inew = new[0]
        vnew = new[1]
        iold = old[0]
        vold = old[1]
        back1 = get_backward_mapping(vnew, inew + 1, length_of_no_change)
        back2 = get_backward_mapping(vnew, inew - 1, length_of_no_change)
        all_indices = [get_future_mappings(vnew - length_of_no_change, back1, length_of_no_change * 2),
                       get_future_mappings(vnew - length_of_no_change, back2, length_of_no_change * 2)]
        for indices in all_indices:
            if indices:
                before = 0
                after = 0
                for offset, paraindex in enumerate(indices):
                    if offset < len(indices) - 1:
                        if offset < length_of_no_change:
                            before += Levenshtein.ratio(
                                current_paratexts[vnew - length_of_no_change + offset][paraindex],
                                current_paratexts[vnew - length_of_no_change + offset + 1][indices[offset + 1]])
                        else:
                            after += Levenshtein.ratio(
                                current_paratexts[vnew - length_of_no_change + offset][paraindex],
                                current_paratexts[vnew - length_of_no_change + offset + 1][indices[offset + 1]])
                before = before / length_of_no_change
                after = after / length_of_no_change
                avgs.append(before - after)
    return avgs
def decompose_multiple_alterations(reference_path, alternative_path, kmer_length):
    reference_sequence = ALT.kmerpathToSeq(reference_path, kmer_length)
    multi_alternative_sequence = ALT.kmerpathToSeq(alternative_path, kmer_length)
    edit_ops = Levenshtein.editops(reference_sequence, multi_alternative_sequence)
    if len(edit_ops) > 2:
        logger.info("Multiple alt when considering ref %s vs alt %s", reference_sequence, multi_alternative_sequence)
        logger.info("Globally apply %s", edit_ops)
    start, end = 0, 0
    while start < len(edit_ops):
        # editops entries are (op, spos, dpos) tuples; the original compared the
        # whole tuple to 'replace', which is never true
        if edit_ops[start][0] == 'replace':
            atomic_sequence = Levenshtein.apply_edit([edit_ops[start]], reference_sequence, multi_alternative_sequence)
            # print atomic_sequence
            atomic_path = ALT.kmerize(atomic_sequence, kmer_length)
            start += 1
        else:
            start_e = edit_ops[start]
            end = start + 1
            while (end < len(edit_ops) and edit_ops[end][0] == start_e[0]
                   and (start_e[1] == edit_ops[end][1] or start_e[2] == edit_ops[end][2])):
                end += 1
            edit_op_to_apply = edit_ops[start:end]
            start = end
            logger.info("Will apply %s", edit_op_to_apply)
            atomic_sequence = Levenshtein.apply_edit(edit_op_to_apply, reference_sequence, multi_alternative_sequence)
            atomic_path = ALT.kmerize(atomic_sequence, kmer_length)
        # record each atomic alteration
        logger.info("Adding atomic alteration for ref %s vs alt %s", reference_sequence, atomic_sequence)
        yield atomic_sequence, atomic_path
def janalysis(read):
    hold_j = j_key.findall(read)
    if hold_j:
        if len(hold_j) > 1:
            counts['multiple_j_matches'] += 1
            return
        j_match = j_seqs.index(hold_j[0][0])  # Assigns J
        temp_start_j = hold_j[0][1] - jump_to_start_j[j_match]  # Finds where the start of a full J would be
        j_seq_end = hold_j[0][1] + len(hold_j[0][0])
        start_j_j_dels = get_j_deletions(read, j_match, temp_start_j, j_regions)
        if start_j_j_dels:  # If the number of deletions has been found
            return j_match, start_j_j_dels[0], start_j_j_dels[1], j_seq_end
    else:
        hold_j1 = half1_j_key.findall(read)
        if hold_j1:
            for i in range(len(hold_j1)):
                indices = [y for y, x in enumerate(half1_j_seqs) if x == hold_j1[i][0]]
                for k in indices:
                    if len(j_seqs[k]) == len(read[hold_j1[i][1]:hold_j1[i][1] + len(j_seqs[half1_j_seqs.index(hold_j1[i][0])])]):
                        if lev.hamming(j_seqs[k], read[hold_j1[i][1]:hold_j1[i][1] + len(j_seqs[k])]) <= 1:
                            counts['jerr2'] += 1
                            j_match = k
                            temp_start_j = hold_j1[i][1] - jump_to_start_j[j_match]  # Finds where the start of a full J would be
                            j_seq_end = hold_j1[i][1] + len(hold_j1[i][0]) + j_half_split
                            start_j_j_dels = get_j_deletions(read, j_match, temp_start_j, j_regions)
                            if start_j_j_dels:
                                return j_match, start_j_j_dels[0], start_j_j_dels[1], j_seq_end
            counts['foundj1notj2'] += 1
            return
        else:
            hold_j2 = half2_j_key.findall(read)
            if hold_j2:
                for i in range(len(hold_j2)):
                    indices = [y for y, x in enumerate(half2_j_seqs) if x == hold_j2[i][0]]
                    for k in indices:
                        if len(j_seqs[k]) == len(read[hold_j2[i][1] - j_half_split:hold_j2[i][1] - j_half_split + len(j_seqs[half2_j_seqs.index(hold_j2[i][0])])]):
                            if lev.hamming(j_seqs[k], read[hold_j2[i][1] - j_half_split:hold_j2[i][1] + len(j_seqs[k]) - j_half_split]) <= 1:
                                counts['jerr1'] += 1
                                j_match = k
                                temp_start_j = hold_j2[i][1] - jump_to_start_j[j_match] - j_half_split  # Finds where the start of a full J would be
                                j_seq_end = hold_j2[i][1] + len(hold_j2[i][0])
                                start_j_j_dels = get_j_deletions(read, j_match, temp_start_j, j_regions)
                                if start_j_j_dels:
                                    return j_match, start_j_j_dels[0], start_j_j_dels[1], j_seq_end
                counts['foundv2notv1'] += 1  # note: counter key looks copied from the V-gene version of this function
                return
            else:
                counts['no_j_assigned'] += 1
                return
def vanalysis(read):
    hold_v = v_key.findall(read)
    if hold_v:
        if len(hold_v) > 1:
            counts['multiple_v_matches'] += 1
            return
        v_match = v_seqs.index(hold_v[0][0])  # Assigns V
        temp_end_v = hold_v[0][1] + jump_to_end_v[v_match] - 1  # Finds where the end of a full V would be
        v_seq_start = hold_v[0][1]
        end_v_v_dels = get_v_deletions(read, v_match, temp_end_v, v_regions)
        if end_v_v_dels:  # If the number of deletions has been found
            return v_match, end_v_v_dels[0], end_v_v_dels[1], v_seq_start
    else:
        hold_v1 = half1_v_key.findall(read)
        if hold_v1:
            for i in range(len(hold_v1)):
                indices = [y for y, x in enumerate(half1_v_seqs) if x == hold_v1[i][0]]
                for k in indices:
                    if len(v_seqs[k]) == len(read[hold_v1[i][1]:hold_v1[i][1] + len(v_seqs[half1_v_seqs.index(hold_v1[i][0])])]):
                        if lev.hamming(v_seqs[k], read[hold_v1[i][1]:hold_v1[i][1] + len(v_seqs[k])]) <= 1:
                            counts['verr2'] += 1
                            v_match = k
                            temp_end_v = hold_v1[i][1] + jump_to_end_v[v_match] - 1  # Finds where the end of a full V would be
                            end_v_v_dels = get_v_deletions(read, v_match, temp_end_v, v_regions)
                            if end_v_v_dels:
                                v_seq_start = hold_v1[i][1]
                                return v_match, end_v_v_dels[0], end_v_v_dels[1], v_seq_start
            counts['foundv1notv2'] += 1
            return
        else:
            hold_v2 = half2_v_key.findall(read)
            if hold_v2:
                for i in range(len(hold_v2)):
                    indices = [y for y, x in enumerate(half2_v_seqs) if x == hold_v2[i][0]]
                    for k in indices:
                        if len(v_seqs[k]) == len(read[hold_v2[i][1] - v_half_split:hold_v2[i][1] - v_half_split + len(v_seqs[half2_v_seqs.index(hold_v2[i][0])])]):
                            if lev.hamming(v_seqs[k], read[hold_v2[i][1] - v_half_split:hold_v2[i][1] + len(v_seqs[k]) - v_half_split]) <= 1:
                                counts['verr1'] += 1
                                v_match = k
                                temp_end_v = hold_v2[i][1] + jump_to_end_v[v_match] - v_half_split - 1  # Finds where the end of a full V would be
                                end_v_v_dels = get_v_deletions(read, v_match, temp_end_v, v_regions)
                                if end_v_v_dels:
                                    v_seq_start = hold_v2[i][1] - v_half_split
                                    return v_match, end_v_v_dels[0], end_v_v_dels[1], v_seq_start
                counts['foundv2notv1'] += 1
                return
            else:
                counts['no_vtags_found'] += 1
                return
def check_sure(name1, name2):
    ratio = Levenshtein.ratio(name1, name2)
    jaro = Levenshtein.jaro(name1, name2)
    jaro_winkler = Levenshtein.jaro_winkler(name1, name2)
    if ratio >= 0.9 and jaro >= 0.95 and jaro_winkler >= 0.95:
        return True
    else:
        return False
def check_cons(name1, name2):
    ratio = Levenshtein.ratio(name1, name2)
    jaro = Levenshtein.jaro(name1, name2)
    jaro_winkler = Levenshtein.jaro_winkler(name1, name2)
    if ratio > .6 or jaro > .7 or jaro_winkler > .7:
        return True
    else:
        return False
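# A sketch of how the two predicates above could be combined (an assumption
# about intent: check_sure as a strict filter, check_cons as a loose one).
def classify_pair(name1, name2):
    if check_sure(name1, name2):
        return 'match'      # all three measures agree strongly
    if check_cons(name1, name2):
        return 'candidate'  # at least one measure is moderately high
    return 'non-match'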
def find_future_of_linked_paras(sig_change_threshold, min_past, change_occured_threshold, link_threshold, n_into_future_min, n_into_future_max):
    # find sig changes linked paras to validate in future
    linked_paras_with_sig_change = find_para_linked_to_sig_change(sig_change_threshold, min_past, change_occured_threshold, link_threshold)
    total_prev_links = 0
    found_link = 0
    looked_at = 0
    # construct new dictionary to validate
    validation_dictionary = copy.deepcopy(linked_paras_with_sig_change)
    for v in linked_paras_with_sig_change:
        for i, link in enumerate(linked_paras_with_sig_change[v]):
            total_prev_links += 1
            t = None
            get_fm = get_future_mappings_indefinite(v, link[0], n_into_future_min)
            get_fm2 = get_future_mappings_indefinite(v, link[1], n_into_future_min)
            if not (get_fm and get_fm2):
                validation_dictionary[v][i] = (linked_paras_with_sig_change[v][i], False)
            else:
                looked_at += 1
                min_len = min(len(get_fm), len(get_fm2), n_into_future_max + 1)
                get_fm = get_fm[:min_len]
                get_fm2 = get_fm2[:min_len]
                t = (get_fm[-1], get_fm2[-1])
                linked = False
                # calculate if still linked (2 dimensional??)
                change_scores1 = []
                change_scores2 = []
                # version is v; get_fmX is indices
                for t, para_index in enumerate(get_fm):
                    if t < len(get_fm) - 1:
                        change_scores1.append(Levenshtein.ratio(current_paratexts[v + t][para_index],
                                                                current_paratexts[v + t + 1][get_fm[t + 1]]))
                for t, para_index in enumerate(get_fm2):
                    if t < len(get_fm2) - 1:
                        change_scores2.append(Levenshtein.ratio(current_paratexts[v + t][para_index],
                                                                current_paratexts[v + t + 1][get_fm2[t + 1]]))
                num_first_changes = 0
                for change in change_scores1:
                    if change < change_occured_threshold:
                        num_first_changes += 1
                num_second_changes = 0
                for t, change in enumerate(change_scores2):
                    if change < change_occured_threshold and change_scores1[t] < change_occured_threshold:
                        num_second_changes += 1
                avg = 0
                if num_first_changes == 0 and num_second_changes == 0:
                    avg = 1.0
                elif num_first_changes > 0:
                    avg = float(num_second_changes) / num_first_changes
                # only do from current version to this n versions away
                if avg > link_threshold:
                    validation_dictionary[v][i] = (linked_paras_with_sig_change[v][i], avg)
                    found_link += 1
                else:
                    validation_dictionary[v][i] = (linked_paras_with_sig_change[v][i], False, avg)
    return validation_dictionary, (found_link, looked_at, total_prev_links)
def karsilastir(a, b):
    if a == u'None' and b == u'None':
        return '24'
    elif a == u'None':
        return '22'
    elif b == u'None':
        return '21'
    elif Levenshtein.ratio(a, b) < minratio:
        if len(a) > len(b):
            return '25'
        elif len(b) > len(a):
            return '26'
        else:
            return '23'
    else:
        return str(Levenshtein.ratio(a, b))
import Levenshtein as lev


def are_barcodes_equivalent(bc1, bc2, threshold):
    if lev.distance(bc1, bc2) <= threshold:
        return 1
    else:
        return 0
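# Usage sketch: with threshold=1, barcodes one edit apart are treated as the
# same barcode, a common error-correction heuristic for sequencing barcodes.
assert are_barcodes_equivalent("ACGT", "ACGA", 1) == 1  # distance 1 <= 1
assert are_barcodes_equivalent("ACGT", "TGCA", 1) == 0  # distance 4 > 1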
        if child.tag == 'abstract' and child.attrib == {"lang": "tr"}:
            ozet4 = unicode(child.text)
    for keywords in root1.findall('keywords'):
        if keywords.attrib == {"lang": "en"}:
            for keyword in keywords.findall('keyword'):
                ahmet = ET.SubElement(keywords_en, 'keyword')
                anahtar = unicode(keyword.text)
                liste1 = ['20']
                for keys in root2.findall('keywords'):
                    if keys.attrib == {"lang": "en"}:
                        if liste1 == ['20']:
                            liste1 = []
                        for key in keys.findall('keyword'):
                            anahtar2 = unicode(key.text)
                            liste1.append(str(Levenshtein.ratio(anahtar, anahtar2)))
                # note: max() over stringified floats compares lexicographically
                ahmet.text = max(liste1)
        elif keywords.attrib == {"lang": "tr"}:
            for keyword in keywords.findall('keyword'):
                ahmet = ET.SubElement(keywords_tr, 'keyword')
                anahtar = unicode(keyword.text)
                liste2 = ['20']
                for keys in root2.findall('keywords'):
                    if keys.attrib == {"lang": "tr"}:
                        if liste2 == ['20']:
                            liste2 = []
                        for key in keys.findall('keyword'):
                            anahtar2 = unicode(key.text)
                            liste2.append(str(Levenshtein.ratio(anahtar, anahtar2)))
                ahmet.text = max(liste2)
def cred_management_action():
    supplied = request.args.get('term')
    action = request.args.get('action')
    section = request.args.get('section')
    extensive = request.args.get('extensive')
    extensive = True if extensive == "true" else False

    if extensive:
        # collectDico
        AllUsernameInRedis = r_serv_cred.hgetall(REDIS_KEY_ALL_CRED_SET).keys()
    uniq_num_set = set()
    if action == "seek":
        possibilities = mixUserName(supplied, extensive)
        for poss in possibilities:
            num = r_serv_cred.hget(REDIS_KEY_ALL_CRED_SET, poss)
            if num is not None:
                uniq_num_set.add(num)
            for num in r_serv_cred.smembers(poss):
                uniq_num_set.add(num)
        # Extensive /!\
        if extensive:
            iter_num = 0
            tot_iter = len(AllUsernameInRedis) * len(possibilities)
            for tempUsername in AllUsernameInRedis:
                for poss in possibilities:
                    # FIXME print progress
                    if (iter_num % int(tot_iter / 20) == 0):
                        # print("searching: {}% done".format(int(iter_num/tot_iter*100)), sep=' ', end='\r', flush=True)
                        print("searching: {}% done".format(float(iter_num) / float(tot_iter) * 100))
                    iter_num += 1
                    if poss in tempUsername:
                        num = r_serv_cred.hget(REDIS_KEY_ALL_CRED_SET, tempUsername)
                        if num is not None:
                            uniq_num_set.add(num)
                        for num in r_serv_cred.smembers(tempUsername):
                            uniq_num_set.add(num)

    data = {'usr': [], 'path': [], 'numPaste': [], 'simil': []}
    for Unum in uniq_num_set:
        levenRatio = 2.0
        username = r_serv_cred.hget(REDIS_KEY_ALL_CRED_SET_REV, Unum)
        # Calculate Levenshtein distance, ignore negative ratio
        supp_splitted = supplied.split()
        supp_mixed = supplied.replace(' ', '')
        supp_splitted.append(supp_mixed)
        for indiv_supplied in supp_splitted:
            levenRatio = float(Levenshtein.ratio(indiv_supplied, username))
        levenRatioStr = "{:.1%}".format(levenRatio)
        data['usr'].append(username)
        allPathNum = list(r_serv_cred.smembers(REDIS_KEY_MAP_CRED_TO_PATH + '_' + Unum))
        data['path'].append(allPathNum)
        data['numPaste'].append(len(allPathNum))
        data['simil'].append(levenRatioStr)

    to_return = {}
    to_return["section"] = section
    to_return["action"] = action
    to_return["term"] = supplied
    to_return["data"] = data
    return jsonify(to_return)
def are_similar(name1, name2):
    name1, name2 = (asciipunct(s.strip().lower()) for s in (name1, name2))
    ratio = Levenshtein.jaro_winkler(name1, name2)
    return ratio >= 0.8 or name1 in name2 or name2 in name1
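# Hedged sanity checks, assuming asciipunct() folds accents/punctuation to ASCII:
#   are_similar("Beyonce", "beyonce")      -> True  (jaro_winkler == 1.0)
#   are_similar("The Beatles", "beatles")  -> True  (substring containment)
# jaro_winkler rewards shared prefixes, so near-identical names clear 0.8 easily.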
    company_type WANT
    date_of_creation MAYBE
    description
    description_identifier
    kind
    links
    matches
    snippet
    title (name of company) WANT
    '''
    api_ext = a[['company_number', 'title', 'address_snippet', 'company_type', 'company_status']].loc[0]
    api_ext = api_ext.append(pd.Series(company, index=['subcontractor_name']))
    api_ext = api_ext.str.lower()
    api_ext = api_ext.map(lambda x: x.replace('ltd', '').replace('limited', ''))
    api_ext = api_ext.str.strip()
    api_ext = api_ext.append(pd.Series(Levenshtein.distance(str(company), str(api_ext.loc['title'])), index=['similarity_distance']))
    api_ext = api_ext.append(pd.Series(Levenshtein.ratio(str(company), str(api_ext.loc['title'])), index=['similarity_ratio']))
    api_ext = api_ext.append(pd.Series(Levenshtein.jaro(str(company), str(api_ext.loc['title'])), index=['similarity_jaro']))
    company_data.append(api_ext)
    '''JSONDecodeError: Expecting value'''
except Exception as e:
    print(e, company)
    missed_subs.append(company)
time.sleep(0.5)

'''a lot of companies look like they should be in companies house. why aren't they
fetching results? that is the issue. Companies house has a request limit of 600 per
5 minutes. timing repeat operations every 0.5 seconds means I'm completing 600
requests per 5 minutes. Binning a list of missed subcontractors showed that
companies were being extracted in each instance, the problem was the request limit.
Also noted on API forums that some clients simply aren't on the companies house
api, so they may be on the CH system but they're not visible under the API.'''
df_cd = pd.DataFrame(company_data)
def main():
    USEANSWER = True
    random.seed(0)
    # np.random.seed(0)
    questions, tables, table_idx = load_data()
    random.shuffle(questions)
    table_name = list(table_idx)
    # test = ['regents-02', 'regents-03', 'regents-08', 'regents-13', 'regents-17', 'regents-19', 'regents-22',
    #         'regents-25&26', 'regents-42', 'monarch-44', 'monarch-47', 'monarch-50', 'monarch-53', 'monarch-57',
    #         'monarch-62', 'monarch-64']
    # train = [t for t in table_idx if t not in test]
    # train_questions = [q for q in questions if q[7] in train]
    # test_questions = [q for q in questions if q[7] in test]
    punc_table = str.maketrans({key: None for key in string.punctuation})
    trainx, trainy = [], []
    cap_corpus = {}
    header_corpus = {}
    cell_corpus = {}
    for t in table_name:
        table = tables[t]
        cap_corpus[t] = nltk.word_tokenize(table_idx[t].translate(punc_table).lower())
        header = list(table)
        temp = []
        for h in header:
            if not h.startswith('Unnamed'):
                temp += nltk.word_tokenize(h.translate(punc_table).lower())
        header_corpus[t] = temp
        cells = table.applymap(str).values
        body = ""
        for row in cells:
            body += " ".join(row) + " "
        cell_corpus[t] = nltk.word_tokenize(body.translate(punc_table).lower())
    vocab = []
    for t in table_name:
        vocab += cap_corpus[t]
        vocab += header_corpus[t]
        vocab += cell_corpus[t]
    vocab = set(vocab)
    if USEANSWER:
        queries = [nltk.word_tokenize(q[0].translate(punc_table).lower() + " " +
                                      " ".join(q[2:6]).translate(punc_table).lower())
                   for q in questions]
    else:
        queries = [nltk.word_tokenize(q[0].translate(punc_table).lower()) for q in questions]

    # Compute bm25 scores and idf scores
    cap_bm25 = QueryProcessor(queries, cap_corpus)
    cap_results = cap_bm25.run()
    cap_idf = cap_bm25.idf()
    header_bm25 = QueryProcessor(queries, header_corpus)
    header_results = header_bm25.run()
    header_idf = header_bm25.idf()
    cell_bm25 = QueryProcessor(queries, cell_corpus)
    cell_results = cell_bm25.run()
    cell_idf = cell_bm25.idf()

    def extract_features(i, q_tok, tab):
        """Feature vector for (query i, table tab). The original repeated this
        identical block verbatim for positives, negatives, and test; it is
        factored out here without changing behavior."""
        table = tables[tab]
        cap = cap_corpus[tab]
        header = header_corpus[tab]
        body = cell_corpus[tab]
        capc = collections.Counter(cap)
        headerc = collections.Counter(header)
        bodyc = collections.Counter(body)
        x = list()
        # Query length
        x.append(len(q_tok))
        # Sum of idf scores
        x.append(sum(cap_idf[i]))
        x.append(sum(header_idf[i]))
        x.append(sum(cell_idf[i]))
        # (max/mean idf variants were present but commented out in the original)
        # Num of columns
        x.append(len(list(table)))
        # LCS normalized by length of query
        que = " ".join(q_tok)
        cap = " ".join(cap)
        header = " ".join(header)
        body = " ".join(body)
        x.append(SequenceMatcher(None, que, cap).find_longest_match(0, len(que), 0, len(cap)).size / len(que))
        x.append(SequenceMatcher(None, que, header).find_longest_match(0, len(que), 0, len(header)).size / len(que))
        x.append(SequenceMatcher(None, que, body).find_longest_match(0, len(que), 0, len(body)).size / len(que))
        # Term frequency
        cap_tf = [capc[tok] / sum(capc.values()) for tok in q_tok]
        header_tf = [headerc[tok] / sum(headerc.values()) for tok in q_tok]
        body_tf = [bodyc[tok] / sum(bodyc.values()) for tok in q_tok]
        # Sum of term frequency (max/mean variants commented out in the original)
        x.append(sum(cap_tf))
        x.append(sum(header_tf))
        x.append(sum(body_tf))
        # BM25 scores
        x.append(cap_results[i][tab])
        x.append(header_results[i][tab])
        x.append(cell_results[i][tab])
        # Fix typo: best Levenshtein ratio against each corpus for OOV tokens
        cap_typo = []
        header_typo = []
        cell_typo = []
        for tok in q_tok:
            if tok not in vocab:
                cap_typo.append(max(Levenshtein.ratio(tok, cc) for cc in capc))
                header_typo.append(max(Levenshtein.ratio(tok, cc) for cc in headerc))
                cell_typo.append(max(Levenshtein.ratio(tok, cc) for cc in bodyc))
        if not cap_typo:
            # x += [0, 0, 0, 0, 0, 0, 0, 0, 0]
            x += [0, 0, 0]
        else:
            x.append(sum(cap_typo))
            x.append(sum(header_typo))
            x.append(sum(cell_typo))
        return x

    for i in range(8200):
        q = questions[i]
        q_tok = queries[i]
        tab = q[7]
        trainx.append(extract_features(i, q_tok, tab))
        trainy.append(1)
        # Negative samples
        neg_samp = random.sample(table_name, 2)
        while neg_samp[0] == tab or neg_samp[1] == tab:
            neg_samp = random.sample(table_name, 2)
        for neg_tab in neg_samp:
            trainx.append(extract_features(i, q_tok, neg_tab))
            trainy.append(0)

    inplen = len(trainx[0])
    trainx = np.array(trainx)
    trainy = np.array(trainy)
    # print(trainx[:3])
    # print(trainy[:3])

    Xtest = []
    Ytest = []
    for i in range(8200, len(questions)):
        q = questions[i]
        q_tok = queries[i]
        testx = [extract_features(i, q_tok, tab) for tab in table_name]
        testx = np.array(testx)
        Xtest.append(testx)
        Ytest.append(q[7])

    # Test
    for _ in range(5):
        # lm = RandomForestRegressor(n_estimators=1000)
        # lm.fit(trainx, trainy)
        model = Sequential()
        model.add(Dense(32, input_shape=(inplen,), activation='tanh'))
        model.add(Dense(1, activation='sigmoid'))
        model.summary()
        model.compile(optimizer='adadelta', loss='binary_crossentropy')
        model.fit(trainx, trainy, batch_size=100, epochs=40)
        ap1 = 0
        ap2 = 0
        ap3 = 0
        saveresult = []
        for i in range(len(Xtest)):
            q = questions[8200 + i]
            pre = model.predict(Xtest[i]).reshape(1, -1)
            predictions = pre[0].argsort()[::-1]
            # pre = lm.predict(Xtest[i])
            # predictions = pre.argsort()[::-1]
            saveresult.append(pre[0])
            if table_name[predictions[0]] == Ytest[i]:
                ap1 += 1
                ap2 += 1
                ap3 += 1
            elif table_name[predictions[1]] == Ytest[i]:
                ap2 += 0.5
                ap3 += 0.5
            elif table_name[predictions[2]] == Ytest[i]:
                ap3 += 1 / 3
            # else:
            #     print(q[0])
            #     print(q[2:6])
            #     print("Pre:", table_name[predictions[0]], ", gold:", q[7])
        # np.save('ft', np.array(saveresult))
        print(ap1 / len(Xtest))
        print(ap2 / len(Xtest))
        print(ap3 / len(Xtest))
        version1 = row1.version
        for index2, row2 in sampled_sequence_df.iterrows():
            version2 = row2.version
            if version1 != version2:
                country_code1 = row1["country_code"]
                country_code2 = row2["country_code"]
                region_name1 = row1["region_name"]
                region_name2 = row2["region_name"]
                covid_nucleic_acid_sequence1 = row1["sequence"]
                covid_nucleic_acid_sequence2 = row2["sequence"]
                lev_dist = Levenshtein.distance(covid_nucleic_acid_sequence1, covid_nucleic_acid_sequence2)
                # normalize by the longer sequence, then flip distance into a similarity
                divider = len(covid_nucleic_acid_sequence1) \
                    if len(covid_nucleic_acid_sequence1) > len(covid_nucleic_acid_sequence2) \
                    else len(covid_nucleic_acid_sequence2)
                lev_dist = lev_dist / divider
                lev_dist = 1 - lev_dist
                print(str(index) + " " + str(index2) + " " + str(lev_dist))
                s = pd.Series([version1, country_code1, region_name1,
                               version2, country_code2, region_name2, lev_dist],
                              index=dist_df.columns)
                dist_df = pd.DataFrame([s])
                if i == 0:
def parser_list(self):
    result = Result([], True)
    self.browser.save_page_screenshot('./images/n.png')
    page_sources = self.get_all_html()
    response_status = self.get_response_status()
    is_all_parser = False
    if self.task.get('parser_class'):
        parser = self.__parsers.get(self.task.get('parser_class'))
        result = self._result_pipeline(parser.parser_main, page_sources=page_sources, task=self.task, logger=self.logger)
        result.parser_class = parser.parser_class
        if len(result.get_result()) < 3:
            is_all_parser = True
            self.task['count'] = 0
    else:
        is_all_parser = True
        self.task['count'] = 0
    if is_all_parser:
        for parser in self.__parsers.values():
            if not result.is_next:
                break
            result = self._result_pipeline(parser.parser_main, page_sources=page_sources, task=self.task, logger=self.logger)
            result.parser_class = parser.parser_class
            self.task['parser_class'] = parser.parser_class
            self.task['count'] = len(result.get_result())
    self.task['status'] = str(response_status)
    # Compute the similarity of the extracted results via the Levenshtein ratio:
    # (total length of both texts - edit-like distance) / total length of both texts
    val = self.clear("".join([res.get('title') for res in result.get_result()]))
    if self.first_result_val:
        first_similarity = Levenshtein.ratio(self.first_result_val, val)
        if first_similarity > 0.92:
            self.logger.info(
                f'Current results are similar to the first page; stopping at page {self.page_num}, start_url: {self.task.get("start_url")}')
            return False
    else:
        self.first_result_val = val
    if self.last_result_val:
        last_similarity = Levenshtein.ratio(self.last_result_val, val)
        if last_similarity > 0.92:
            self.logger.info(
                f'Current results are similar to the previous page; stopping at page {self.page_num}, start_url: {self.task.get("start_url")}')
            return False
    self.last_result_val = val
    if not val:
        self.logger.info(
            f'No results extracted; stopping at page {self.page_num}, start_url: {self.task.get("start_url")}')
        return False
    turn_page_conf = self.turn_page(page_sources)
    result.cookies = self.browser.browser.get_cookies()
    result.browser = self.browser
    for result_process in self.__results.values():
        result_process.result_main(result=result, task=self.task, logger=self.logger,
                                   response_status=response_status, turn_page_conf=turn_page_conf)
    dates = [x.get('send_date') for x in result.get_result()]
    min_date = self.get_min_date(dates)
    if min_date and datetime.datetime.now() > min_date > self.min_date and self.task.get('end_send_date'):
        if min_date < datetime.datetime.utcfromtimestamp(self.task.get('end_send_date')):
            self.logger.info(
                f'Earliest message date is older than the maximum from the last crawl; stopping, start_url: {self.task.get("start_url")}')
            return False
    if self.page_num >= TURN_PAGE_COUNT:
        self.logger.info(
            f'Reached the configured page limit; stopping at page {self.page_num}, start_url: {self.task.get("start_url")}')
        return False
    # Turn the page and check whether it succeeded
    if self.__turn_page_action(turn_page_conf):
        self.logger.info(f'Page turn succeeded; now on page {self.page_num}')
        return True
    else:
        return False
def _levenshteinDistance(self, str1, str2):
    return Levenshtein.distance(str1, str2)
".lettr (language encoded and typed text with \"raspa\") file with all the information about the processed files (.lett file is also valid)", dest="lettr", required=True) options = oparser.parse_args() if options.ridx == None: reader = sys.stdin else: reader = open(options.ridx, "r") index = {} documents = {} readLETT(options.lettr, documents) for i in reader: fields = i.strip().split("\t") #The document must have at least one candidate if len(fields) > 1: len_s = len(documents[int(fields[0])]) sys.stdout.write(str(fields[0])) for j in range(1, len(fields)): candidate = fields[j] candidateid = int(fields[j].split(":")[0]) len_t = len(documents[candidateid]) dist = Levenshtein.distance(documents[int(fields[0])], documents[candidateid]) port = 1 - (dist / float(max(len_s, len_t))) candidate += ":" + str(port) sys.stdout.write("\t" + candidate) sys.stdout.write("\n")
def evaluation(submit: models.Submit):
    syntax_error_msg = ""
    type_ = None
    schema = submit.Question.Schema
    question = submit.Question
    answers = models.Answer.query.filter_by(idQuestion=question.id)
    student = submit.Student
    recover_schema(schema)
    path = os.path.join(config.save_db_path, student.id)
    if not os.path.exists(path):
        os.makedirs(path)
    path = os.path.join(path, schema.name)
    shutil.copyfile(schema.path, path)
    correct_sql, count_spelling_err, answer, correct = correct_spelling(
        submit.answer, list(answers), schema)
    submit.correct = correct_sql
    submit.spelling = count_spelling_err
    submit.idAnswer = answer.id
    submit.Answer = answer
    conn = sqlite3.connect(path)
    cur = conn.cursor()
    try:
        cur.execute(correct_sql)
        values = cur.fetchall()
        result = {'data': values, 'len': len(values)}
        submit.result = json.dumps(result)
        result = json.loads(submit.result)
        if question.result is None:
            abort(500)
        else:
            origin = json.loads(question.result)
            if origin == result:
                submit.score = question.score - count_spelling_err
                submit.info = ' '.join(map(str, correct)) + '\n' + syntax_error_msg
                type_ = type_submit.all_right if count_spelling_err == 0 else type_submit.error_spelling
            else:
                type_ = type_submit.error_result
    except Exception as e:
        syntax_error_msg = str(e)
        submit.result = str(e)
        type_ = type_submit.error_syntax
    finally:
        cur.close()
        conn.close()
    submit.segmentJson = json.dumps({'compare': []})
    if type_ != type_submit.error_spelling and type_ != type_submit.all_right:
        submit.info = ' '.join(map(str, correct)) + '\n' + syntax_error_msg
        if type_ == type_submit.error_result:
            submit.score = question.score - count_spelling_err
            stu_segments = Segment(submit.correct)
            segments = models.Segmentation.query.filter_by(
                idAnswer=submit.Answer.id).order_by(models.Segmentation.rank)
            segments = [s for s in segments]
            submit.segmentJson = {'compare': []}
            idx_student_segment = 0
            idx_segment = 0
            while idx_segment < len(segments):
                compare = {'right_segment': segments[idx_segment].data}
                if idx_student_segment < len(stu_segments.segment_str) and \
                        Segment.filter_segment_punctuation(segments[idx_segment].data) == \
                        Segment.filter_segment_punctuation(stu_segments.segment_str[idx_student_segment]):
                    compare['student_segment'] = stu_segments.segment_str[idx_student_segment]
                    compare['deduction'] = 0
                    idx_student_segment += 1
                else:
                    tmp_idx = idx_student_segment
                    max_score = 0
                    max_idx = tmp_idx
                    while tmp_idx < len(stu_segments.segment_str):
                        score = Levenshtein.ratio(
                            Segment.filter_segment_punctuation(segments[idx_segment].data),
                            Segment.filter_segment_punctuation(stu_segments.segment_str[tmp_idx]))
                        if score > max_score:
                            max_idx = tmp_idx
                            max_score = score
                        tmp_idx += 1
                    if max_score < 0.6:
                        compare['student_segment'] = ''
                        compare['deduction'] = segments[idx_segment].score
                        submit.score -= segments[idx_segment].score
                    else:
                        compare['student_segment'] = stu_segments.segment_str[max_idx]
                        while idx_student_segment < max_idx:
                            submit.segmentJson['compare'].append({
                                'student_segment': stu_segments.segment_str[idx_student_segment],
                                'right_segment': '',
                                'deduction': 2
                            })
                            submit.score -= 2
                            idx_student_segment += 1
                        idx_student_segment = max_idx + 1
                        if max_score == 1:
                            compare['deduction'] = 0
                        else:
                            compare['deduction'] = segments[idx_segment].score
                            submit.score -= segments[idx_segment].score
                idx_segment += 1
                submit.segmentJson['compare'].append(compare)
            while idx_student_segment < len(stu_segments.segment_str):
                submit.segmentJson['compare'].append({
                    'student_segment': stu_segments.segment_str[idx_student_segment],
                    'right_segment': '',
                    'deduction': 2
                })
                submit.score -= 2
                idx_student_segment += 1
        elif type_ == type_submit.error_syntax:
            submit.score = 0
    submit.score = 0 if submit.score < 0 else submit.score
    submit.segmentJson = json.dumps(submit.segmentJson)
    submit.type = type_.value
    os.remove(path)
def correct_spelling(stem, answers, schema):
    keywords_schema = schema.keywords.split(' ')
    keywords_schema.append('*')
    format_sql = sqlparse.format(stem, keyword_case='upper')
    correct = [0] * len(format_sql)
    format_sql += '\0'
    # segment_sql = re.split('[. \t\n]', format_sql)
    correct_sql = ''
    start_word_idx = 0
    count_spelling_err = 0
    keywords = [
        keywords_schema,
        list(sqlparse.keywords.KEYWORDS.keys()),
        list(sqlparse.keywords.KEYWORDS_COMMON.keys())
    ]
    for i in range(0, len(format_sql)):
        if format_sql[i] in (' ', '.', '\0', '=', '<', '>', '!', ',', ')', '(') or format_sql[i].isdigit():
            word = format_sql[start_word_idx:i]
            if word.strip() != '' and word not in keywords[0] and word.upper() not in keywords[1] and word.upper() not in keywords[2]:
                count_spelling_err += 1
                max_word = ''
                max_value = 0
                done = False
                for idx in range(0, len(keywords)):
                    if idx > 1:
                        word = word.upper()
                    for j in range(0, len(keywords[idx])):
                        ratio = Levenshtein.ratio(word, keywords[idx][j])
                        if ratio > replace_threshold:
                            done = True
                            max_word = keywords[idx][j]
                            max_value = ratio
                            break
                        elif ratio > max_value:
                            max_word = keywords[idx][j]
                            max_value = ratio
                    if done:
                        break
                correct_sql += max_word
                correct = list(
                    map(lambda idx, x: round(max_value, 3) if start_word_idx <= idx < i else x,
                        range(0, len(correct)), correct))
            else:
                correct_sql += word
            if format_sql[i] != '\0':
                correct_sql += format_sql[i]
            start_word_idx = i + 1
        else:
            correct[i] = 1
    max_answer = answers[0]
    max_value = Levenshtein.ratio(answers[0].sql, correct_sql)
    for i in range(1, len(answers)):
        ratio = Levenshtein.ratio(answers[i].sql, correct_sql)
        if ratio > max_value:
            max_answer = answers[i]
            max_value = ratio
    return correct_sql, count_spelling_err, max_answer, correct
def ambiguous_df_look_up(self, tag, recipe_ingredient, recipe_key):
    temp_nutritional_df = self.nutrition_init.NDB_NO_lookup(
        tag, filter_list=['Measure', 'Weight(g)'])
    filtered_ambiguous_df = self.ambiguous_df[
        self.ambiguous_df['NDB_NO'] == "\"{}\"".format(tag.strip('"'))]
    filtered_ambiguous_df = filtered_ambiguous_df.reset_index()
    recipe_ingredient_unit_dict = self.extact_unit_from_recipe(recipe_ingredient)
    amount_recipe, unit_recip = self.extact_number_from_recipe(
        recipe_ingredient, recipe_ingredient_unit_dict)
    unit_recip = recipe_key  # the caller's key overrides the extracted unit
    temp_recipe_ingredient = str(
        recipe_ingredient.replace(str(amount_recipe), "").lstrip(" "))
    # When several rows share the NDB number, keep the row whose ingredient
    # text is most similar to the recipe line.
    matching_levenshtein_ratios = []
    if len(filtered_ambiguous_df) > 1:
        for itr in range(len(filtered_ambiguous_df)):
            candidate = filtered_ambiguous_df.loc[itr, 'Ingredient']
            candidate = re.sub(r"([/.0-9]*)", "", candidate).lstrip(" ")
            matching_levenshtein_ratios.append(
                Levenshtein.ratio(candidate, temp_recipe_ingredient))
        levenshtein_index = matching_levenshtein_ratios.index(
            max(matching_levenshtein_ratios))
    elif len(filtered_ambiguous_df) == 1:
        levenshtein_index = 0
    else:
        print("\t\tERROR", recipe_ingredient, tag, unit_recip,
              self.food_unit_standard_dictionary[unit_recip]['type'])
        levenshtein_index = 0
    # Convert the recipe amount to grams, then to a fraction of the USDA entry.
    # (.get_values() was removed from pandas; .values is the supported spelling.)
    unit_type = self.food_unit_standard_dictionary[unit_recip]['type']
    if unit_type == 'weight':
        temp_recipe_g = float(amount_recipe) * float(self.weight_unit_df['gram'])
        conversion_factor = float(temp_recipe_g) / float(
            temp_nutritional_df['Weight(g)'].values[0])
    elif unit_type == 'volume':
        temp_recipe_g = (float(amount_recipe) * float(
            self.volume_unit_df['cup'].values[0])) / (
                float(filtered_ambiguous_df.loc[levenshtein_index, 'cups']) *
                float(self.volume_unit_df[unit_recip].values[0]))
        conversion_factor = (float(temp_recipe_g) * float(
            filtered_ambiguous_df.loc[levenshtein_index, 'grams'])) / float(
                temp_nutritional_df['Weight(g)'].values[0])
    elif unit_type == 'unit':
        temp_recipe_g = float(amount_recipe) * float(
            filtered_ambiguous_df.loc[levenshtein_index, 'grams'])
        conversion_factor = float(temp_recipe_g) / float(
            temp_nutritional_df['Weight(g)'].values[0])
    return conversion_factor
# res = {'relevance': [], 'pmid': [], 'title': [], 'url': [], 'year': [], 'cost': []}
res = {'relevance': [], 'pmid': [], 'title': [], 'cit_count': [], 'year': []}
# Use Levenshtein distance to estimate each article's relevance to the
# search phrase. This could be replaced by something smarter.
it = 0
num_pmids = len(pmids)
for pmid in pmids:
    art = get_article(pmid)
    cits = get_citedby(pmid)
    it += 1
    latest_iteration.text(f"Getting PMID {pmid} (article {it}/{num_pmids})")
    # Integer percentage; the original int(it / num_pmids) truncated to 0
    # until the very last article.
    bar.progress(int(100 * it / num_pmids))
    res['relevance'].append(Levenshtein.distance(search, art.title))
    res['pmid'].append(pmid)
    res['title'].append(art.title)
    # Lower the citation count by 2 to compensate for self-mentions in the
    # eutils XML return.
    res['cit_count'].append(len(cits) - 2)
    res['year'].append(art.year)
    # res['cost'].append(0)
    # res['url'].append(art.url)
if res:
    latest_iteration.text("Done! See results below.")
# Already sorted on relevance.
df = pd.DataFrame(res)
# st.dataframe(df)
st.dataframe(df.style.highlight_max())
def _distance(str1, str2):
    """Levenshtein distance normalised by the longer string, as a similarity in [0, 1]."""
    distance = Levenshtein.distance(str1, str2)
    return 1 - float(distance) / max(len(str1), len(str2))
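# A quick usage check of the normalised similarity above: 'kitten' vs
# 'sitting' has edit distance 3 and longer length 7, so 1 - 3/7 ≈ 0.571.
import Levenshtein

assert Levenshtein.distance('kitten', 'sitting') == 3
print(_distance('kitten', 'sitting'))  # ≈ 0.571
print(_distance('abc', 'abc'))         # 1.0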
def phoneme_error_rate(p_seq1, p_seq2):
    """Map each distinct phoneme to a single character so the sequences can be
    compared with string edit distance, normalised by the reference length."""
    p_vocab = set(p_seq1 + p_seq2)
    p2c = dict(zip(p_vocab, range(len(p_vocab))))
    c_seq1 = [chr(p2c[p]) for p in p_seq1]
    c_seq2 = [chr(p2c[p]) for p in p_seq2]
    return Levenshtein.distance(''.join(c_seq1), ''.join(c_seq2)) / len(c_seq2)
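# A small usage sketch with ARPAbet-style phoneme lists (made-up example data):
hyp = ['HH', 'AH', 'L', 'OW']        # hypothesised pronunciation
ref = ['HH', 'EH', 'L', 'OW']        # reference pronunciation
print(phoneme_error_rate(hyp, ref))  # 1 substitution / 4 phonemes = 0.25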
import Levenshtein

# Two identical Chinese customer-service messages, so the distance is 0.
a1 = '尊敬的客户,您好!建议您可以选择考虑我司康爱卫士老年防癌疾病有关产品。您可以登陆 http://baoxian.cntaiping.com/(太平网上商城)或拨打 400-868-8888(太平电话销售);选择您需要的产品购买。祝您太平幸福!'
a2 = '尊敬的客户,您好!建议您可以选择考虑我司康爱卫士老年防癌疾病有关产品。您可以登陆 http://baoxian.cntaiping.com/(太平网上商城)或拨打 400-868-8888(太平电话销售);选择您需要的产品购买。祝您太平幸福!'
print(Levenshtein.distance(a1, a2))  # 0
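# For contrast: the distance counts Unicode code points, so a one-character
# edit in a CJK string costs exactly 1 (illustrative strings):
print(Levenshtein.distance('太平幸福', '太平幸运'))  # 1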
def _lev_tok_similarity(self, source_tok, target_tok):
    if len(source_tok) == 0 or len(target_tok) == 0:
        return 0
    return 1 - (py_lev.distance(source_tok, target_tok) /
                max(len(source_tok), len(target_tok)))
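# The same normalisation outside the class, assuming py_lev is this module's
# alias for the Levenshtein package:
import Levenshtein as py_lev
print(1 - py_lev.distance('color', 'colour') /
      max(len('color'), len('colour')))  # 1 - 1/6 ≈ 0.833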
def spell_correctness2(self, ask, dictionary):
    final_candidate_str = []
    final_candidate_ratio = []
    for word in ask.split(' '):
        # Collect dictionary entries containing the word, with their ratios.
        candidate_str = []
        candidate_ratio = []
        for entry in dictionary:
            if word in entry:
                candidate_str.append(str(entry))
                candidate_ratio.append(Levenshtein.ratio(word, str(entry)))
        # Keep only entries where the word appears as a whole token.
        temp_candidate_str = candidate_str
        temp_candidate_ratio = candidate_ratio
        candidate_str = []
        candidate_ratio = []
        for i, item in enumerate(temp_candidate_str):
            if (word + " ") in item or (" " + word) in item:
                candidate_str.append(item)
                candidate_ratio.append(temp_candidate_ratio[i])
        if len(candidate_str) == 1:
            final_candidate_str.append(candidate_str[0])
            final_candidate_ratio.append(candidate_ratio[0])
        elif len(candidate_str) > 1:
            # Several whole-token matches: keep the one with the best ratio.
            j = candidate_ratio.index(max(candidate_ratio))
            final_candidate_str.append(candidate_str[j])
            final_candidate_ratio.append(candidate_ratio[j])
        else:
            if len(word) > 3 and len(temp_candidate_str) > 0:
                # No whole-token match: fall back to the best substring match.
                j = temp_candidate_ratio.index(max(temp_candidate_ratio))
                final_candidate_str.append(temp_candidate_str[j])
                final_candidate_ratio.append(temp_candidate_ratio[j])
            else:
                final_candidate_str.append(word)
                final_candidate_ratio.append(0)
    # Join the corrected tokens, dropping immediate repeats.
    s = ''
    prev = ''
    for item in final_candidate_str:
        if prev != item:
            s = s + item + ' '
        prev = item
    return s
def levenshtein_distance(str1, str2):
    """Edit distance as a percentage of the longer string (0 = identical)."""
    dist = Levenshtein.distance(str1, str2)
    longer_str = str1 if len(str1) > len(str2) else str2
    return float(dist) * 100 / len(longer_str)
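# Usage sketch: 'kitten' vs 'sitting' is 3 edits over 7 characters.
print(levenshtein_distance('kitten', 'sitting'))  # ≈ 42.86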
def spell_correctness(self, ask):
    # Unigram pass: best pool match per word.
    fuzzy_str = {}
    fuzzy_ratio = {}
    new_ask = {}
    i = 0
    for word in ask.split(' '):
        ratio = 0
        for pool in self.pool_xtrans:
            if pool.find(word) != -1 and len(word) > 3:
                x = 0.71  # substring hit: treat as just above the 0.7 threshold
            else:
                x = Levenshtein.ratio(word, pool)
            if ratio < x:
                ratio = x
                fuzzy_str[i] = pool
                fuzzy_ratio[i] = x
        new_ask[i] = word
        if fuzzy_ratio[i] > 0.7:
            new_ask[i] = fuzzy_str[i]
        i = i + 1
    # Bigram pass: try two-word phrases against the same pool.
    bigram = ngrams(ask.split(), 2)
    fuzzy_str2 = {}
    fuzzy_ratio2 = {}
    new_ask2 = {}
    i = 0
    for grams in bigram:
        token = ' '.join(grams)
        ratio = 0
        for pool in self.pool_xtrans:
            x = Levenshtein.ratio(token, pool)
            if ratio < x:
                ratio = x
                fuzzy_str2[i] = pool
                fuzzy_ratio2[i] = x
        new_ask2[i] = token
        if fuzzy_ratio2[i] > 0.7:
            new_ask2[i] = fuzzy_str2[i]
        i = i + 1
    # Merge: prefer whichever of the unigram/bigram corrections scored higher.
    j = 0
    i = 0
    ask_correction = {}
    ask_array = ask.split(' ')
    words_count = len(ask_array)
    while i < words_count:
        if i < (words_count - 1):
            if fuzzy_ratio[i] <= 0.7 and fuzzy_ratio2[i] <= 0.7:
                ask_correction[j] = ask_array[i]
            elif fuzzy_ratio[i] <= 0.7 and fuzzy_ratio2[i] > 0.7:
                ask_correction[j] = fuzzy_str2[i]
                i = i + 1  # the bigram consumed the next word too
            elif fuzzy_ratio[i] > 0.7 and fuzzy_ratio2[i] <= 0.7:
                ask_correction[j] = fuzzy_str[i]
            elif fuzzy_ratio[i] > 0.7 and fuzzy_ratio2[i] > 0.7:
                ask_correction[j] = fuzzy_str[i]
                if fuzzy_ratio2[i] > fuzzy_ratio[i]:
                    ask_correction[j] = fuzzy_str2[i]
        else:
            if fuzzy_ratio[i] <= 0.7:
                ask_correction[j] = ask_array[i]
            else:
                ask_correction[j] = fuzzy_str[i]
        i = i + 1
        j = j + 1
    s = ''
    for key, value in ask_correction.items():  # iteritems() is Python 2 only
        s = s + value + ' '
    return s
def analysisQuestion(question, phrase):
    if question[-1] == '?':
        question = question[:-1]
    print(phrase)
    candidateResource = []
    candidateProperty = []
    # The last phrase is taken as the resource candidate.
    p = phrase[-1]
    if p in enResourceDic:
        candidateResource.append(ResourceData(p, 1, 1, 0))
    if p in zhResourceDic:
        candidateResource.append(ResourceData(p, 1, 1, 1))
    # (French-language lookups are disabled in this version.)
    resource = []
    ph = phrase[:-1]
    for w in ph:
        resource.append(w.replace(' ', ''))
    # English -> Chinese translations widen the search space.
    translist = transList(set(ph), 'en', 'cht')
    parse = list(set(resource + translist + ph))
    print(parse)
    # Deduplicate while keeping order.
    dis = []
    for p in parse:
        if p not in dis:
            dis.append(p)
    parse = dis
    samePhrase = []
    for p in parse:
        if p == '':
            continue
        num = 0
        same = []
        # Match the phrase against the English and Chinese property lists.
        for pro in enPropertyList:
            con = Levenshtein.ratio(pro, p)
            if con > ThresHold:
                new = ResourceData(pro, con, 1, 0)
                candidateProperty.append(new)
                same.append(new)
                num += 1
        for pro in zhPropertyList:
            con = Levenshtein.ratio(pro, p)
            if con > zhThresHold:
                new = ResourceData(pro, con, 1, 1)
                candidateProperty.append(new)
                same.append(new)
                num += 1
        if num > 1:
            samePhrase.append(same)
    # Deduplicate the candidate lists.
    cr = []
    for c in candidateResource:
        if c not in cr:
            cr.append(c)
    cp = []
    for c in candidateProperty:
        if c not in cp:
            cp.append(c)
    candidate = [cr, cp]
    sameP = []
    for s in samePhrase:
        sset = []
        for c in s:
            if c not in sset:
                sset.append(c)
        sameP.append(sset)
    return sameP, candidate
def calculate_compile_distance(shopname):
    # data is a module-level list of shop names to compare against.
    return [Levenshtein.distance(shopname, meb) for meb in data]
def calculate_levenshtein_distance(self, s1, s2):
    # Despite the name, this returns Levenshtein.ratio, a similarity in
    # [0, 1], not the raw edit distance.
    return Levenshtein.ratio(s1, s2)
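# The distinction matters when choosing a cutoff. Both calls are the real
# Levenshtein package API; the strings are illustrative:
import Levenshtein
print(Levenshtein.distance('flaw', 'lawn'))  # 2 (one deletion, one insertion)
print(Levenshtein.ratio('flaw', 'lawn'))     # 0.75 similarity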
def main(dirname):
    df = pd.read_excel(
        '/Users/nakamurasatoru/git/d_genji/kouigenjimonogatari.github.io/src/data/metadata.xlsx',
        header=None, index_col=None, engine="openpyxl")
    configs = {}
    for i in range(len(df.index)):
        uri = df.iloc[i, 0]
        if not pd.isnull(uri):
            row_num = df.iloc[i, 2]
            if int(row_num) == 1:
                title = df.iloc[i, 3]
                vol = df.iloc[i, 6]
                page = df.iloc[i, 1]
                if vol not in configs:
                    configs[vol] = {"data": {}}
                configs[vol]["data"][title] = page
    for vol in configs:
        config = configs[vol]
        koui = config["data"]
        VOL = str(vol).zfill(2)
        print(VOL)
        path = '../../docs/iiif/' + dirname + '/' + VOL + '.json'
        if not os.path.exists(path):
            continue
        with open(path) as f:
            df = json.load(f)
        members = df["selections"][0]["members"]

        # ---------- matching ----------
        line_map = {}
        indexedObj = {}
        for line in koui:
            line_map[line] = []
            for i in range(len(members)):
                label = ""
                # previous line
                if i - 1 >= 0:
                    label += members[i - 1]["label"] + "/"
                # current line
                member = members[i]
                label += member["label"]
                # (appending the next line as well is disabled in this version)
                score = Levenshtein.distance(line, label.replace("/", ""))
                score = score / max(len(line), len(label.replace("/", "")))  # normalise
                obj = {
                    "label": label,
                    "main": member["label"],
                    "score": score,
                    "member_id": member["@id"],
                    "index": i
                }
                line_map[line].append(obj)
                indexedObj[i] = obj

        # ---------- aggregation ----------
        prev_index = 0
        # For each line of the collated text:
        for line in line_map:
            print(str(koui[line]) + "\t" + line)
            obj = line_map[line]
            # Sort candidates by ascending (best-first) normalised distance.
            score_sorted = sorted(obj, key=lambda x: x["score"])
            flg = True
            for i in range(len(score_sorted)):
                data = score_sorted[i]
                index = data["index"]
                if flg:
                    prev_index = index + 1
                    index = data["index"]
                    if index > 0:
                        data = indexedObj[index - 1]
                    table = '''
                    <table class="table">
                      <tr>
                        <th>項目</th>
                        <th>値</th>
                      </tr>
                      <tr>
                        <td>大成番号</td>
                        <td>''' + str(koui[line]) + '''</td>
                      </tr>
                      <tr>
                        <td>校異源氏テキスト</td>
                        <td>''' + line + '''</td>
                      </tr>
                      <tr>
                        <td>KuroNet翻刻</td>
                        <td>''' + data["main"] + '''</td>
                      </tr>
                      <tr>
                        <td>KuroNet翻刻(前後を含む3行)</td>
                        <td>''' + data["label"] + '''</td>
                      </tr>
                    </table>
                    '''
                    # Build an ID for the marker annotation.
                    member_id = data["member_id"]
                    sss = member_id.split("#xywh=")
                    canvas_id = sss[0]
                    xywh = sss[1].split(",")
                    d = 5
                    y = int(int(xywh[1]) * d / (d + 1))
                    if y == 0:
                        y = 800
                    w = 1
                    x = int(xywh[0]) + int(int(xywh[2]) / 2)
                    member_id = canvas_id + "#xywh=" + str(x) + "," + str(y) + \
                        "," + str(w) + ",1"
                    members.append({
                        "@id": member_id,
                        "@type": "sc:Canvas",
                        "description": "",
                        "label": "[" + str(len(members) + 1) + "]",
                        "metadata": [{
                            "label": "p",
                            "value": koui[line]
                        }, {
                            "label": "校異源氏テキスト",
                            "value": line
                        }, {
                            "label": "KuroNet翻刻",
                            "value": data["main"]
                        }, {
                            "label": "KuroNet翻刻(前行を含む)",
                            "value": data["label"]
                        }, {
                            "label": "Annotation",
                            "value": [{
                                "@id": member_id,
                                "@type": "oa:Annotation",
                                "motivation": "sc:painting",
                                "resource": {
                                    "@type": "cnt:ContentAsText",
                                    "chars": table,
                                    "format": "text/html",
                                    "marker": {
                                        "border-color": "red",
                                        "@type": "dctypes:Image",
                                        "@id": "https://nakamura196.github.io/genji_curation/icon/red.png#xy=16,16"
                                    }
                                },
                                "on": member_id
                            }]
                        }]
                    })
                    flg = False
            print("----------------")
        curation = {
            "@context": [
                "http://iiif.io/api/presentation/2/context.json",
                "http://codh.rois.ac.jp/iiif/curation/1/context.json"
            ],
            "@id": df["@id"],
            "@type": "cr:Curation",
            "label": "Character List",
            "selections": [{
                "@id": df["@id"] + "/range1",
                "@type": "sc:Range",
                "label": "Characters",
                "members": members,
                "within": df["selections"][0]["within"]
            }]
        }
        path = path.replace("_kuronet/", "_kuronet_taisei_all/")
        dirpath = os.path.dirname(path)
        os.makedirs(dirpath, exist_ok=True)
        with open(path, 'w') as f2:
            json.dump(curation, f2, ensure_ascii=False, indent=4,
                      sort_keys=True, separators=(',', ': '))
import re
import socket
import Levenshtein as lev

# edits1 and NWORDS come from the surrounding Norvig-style spell corrector.
def known_edits2(word):
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS)

data = open('0643/SHEFFIELDDAT.643').read()
corrections = [
    re.sub(r'\s+', ' ', item).split(' ') for item in data.split('\n')
][:-1]
num = 0
correct = 0
for correction in corrections:
    if lev.distance(correction[0], correction[1]) > 2:
        print('Distance greater than 2', correction)
        continue
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.connect(('127.0.0.1', 11000))
    s.sendall((correction[1] + '<EOF>').encode())
    res = s.recv(8000).decode()
    num += 1
    if res == 'Not Found':
        res = ''
    corrected1 = set(res.split())
    corrected2 = known_edits2(correction[1].lower())
    difference = corrected1 - corrected2
    if len(difference) > 0:
x_all['omsk_dist'] = omsk_dist
x_all['khabarovsk_dist'] = khabarovsk_dist
x_all['klyuchi_dist'] = klyuchi_dist
x_all['norilsk_dist'] = norilsk_dist
a.print_elapsed(start)

## Levenshtein features
print(a.c.BOLD + 'Finding levenshtein features ...' + a.c.END)
start = time.time()
print('Finding Levenshtein distance between titles ... ', end='', flush=True)
vals = []
data = df[['title_1', 'title_2']].values.tolist()
for x in data:
    vals.append(Levenshtein.distance(str(x[0]), str(x[1])))
x_all['title_lev'] = vals
a.print_elapsed(start)

start = time.time()
print('Finding total title length ... ', end='', flush=True)
vals = []
data = df[['title_1', 'title_2']].values.tolist()
for x in data:
    vals.append(len(str(x[0])) + len(str(x[1])))
title_tot_len = vals
a.print_elapsed(start)

start = time.time()
print('Finding normalised Levenshtein distance ...', end='', flush=True)
x_all['title_lev_norm'] = x_all['title_lev'] / title_tot_len
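# The same pairwise feature without the explicit accumulator loop; a minimal
# sketch with a throwaway DataFrame (column names and data are illustrative):
import pandas as pd
import Levenshtein

pairs = pd.DataFrame({'title_1': ['iphone 6s', 'galaxy s5'],
                      'title_2': ['iphone 6', 'galaxy s6']})
pairs['title_lev'] = [
    Levenshtein.distance(str(t1), str(t2))
    for t1, t2 in zip(pairs['title_1'], pairs['title_2'])
]
print(pairs)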
def fit(self, dataset, knowledge_base):
    train_set = []
    train_labels = []
    for mention in dataset.mentions:
        context_1 = [x.lower().strip() for x in mention.contexts[0]]
        context_2 = [x.lower().strip() for x in mention.contexts[1]]
        # Bag-of-words embedding of both contexts.
        word_emb = np.zeros(300, )
        for word in mention.contexts[0]:
            word_emb += self.word2embed.get(word, np.zeros(300, ))
        for word in mention.contexts[1]:
            word_emb += self.word2embed.get(word, np.zeros(300, ))
        for candidate in mention.candidates:
            # Prior probability plus a string-similarity feature: the edit
            # distance is turned into a (0, 1] similarity via exp(-distance).
            feat = [
                candidate.prob,
                math.exp(-Levenshtein.distance(candidate.name, mention.surface))
            ]
            words = candidate.name.lower().split()
            context_sim_1 = sum(1 if x.strip() in context_1 else 0 for x in words)
            context_sim_2 = sum(1 if x.strip() in context_2 else 0 for x in words)
            feat.append(context_sim_1 / len(words))
            feat.append(context_sim_2 / len(words))
            ent = '_'.join(candidate.name.split(' '))
            # Note: predict() falls back to a zero vector via .get(); here a
            # missing entity raises KeyError.
            ent_emb = self.ent2embed[ent]
            feat.extend(ent_emb)
            feat.extend(word_emb)
            feat.append(cosine_sim_vectors(word_emb, ent_emb))
            # Overlap between the contexts and the candidate's KB document.
            max_sim_1 = 0
            max_sim_2 = 0
            if candidate.id in knowledge_base.documents:
                for section in knowledge_base.documents[candidate.id].sections:
                    for sentence in section:
                        s = sentence.lower()
                        tmp_sim_1 = sum(1 if x.strip() in s else 0 for x in context_1)
                        tmp_sim_2 = sum(1 if x.strip() in s else 0 for x in context_2)
                        max_sim_1 = max(max_sim_1, tmp_sim_1)
                        max_sim_2 = max(max_sim_2, tmp_sim_2)
            feat.append(max_sim_1 / len(context_1) if len(context_1) > 0 else 0)
            feat.append(max_sim_2 / len(context_2) if len(context_2) > 0 else 0)
            train_set.append(feat)
            train_labels.append(1 if mention.gt.id == candidate.id else 0)
    train_set = np.array(train_set)
    train_labels = np.array(train_labels)
    self.net.fit(train_set, train_labels, epochs=10, batch_size=100, verbose=False)
def predict(self, dataset, knowledge_base):
    pred_cids = []
    for mention in dataset.mentions:
        context_1 = [x.lower().strip() for x in mention.contexts[0]]
        context_2 = [x.lower().strip() for x in mention.contexts[1]]
        word_emb = np.zeros(300, )
        for word in mention.contexts[0]:
            word_emb += self.word2embed.get(word, np.zeros(300, ))
        for word in mention.contexts[1]:
            word_emb += self.word2embed.get(word, np.zeros(300, ))
        dev_set = []
        for candidate in mention.candidates:
            # Same feature layout as fit(): prior + exp(-edit distance).
            feat = [
                candidate.prob,
                math.exp(-Levenshtein.distance(candidate.name, mention.surface))
            ]
            words = candidate.name.lower().split()
            context_sim_1 = sum(1 if x.strip() in context_1 else 0 for x in words)
            context_sim_2 = sum(1 if x.strip() in context_2 else 0 for x in words)
            feat.append(context_sim_1 / len(words))
            feat.append(context_sim_2 / len(words))
            ent = '_'.join(candidate.name.split(' '))
            ent_emb = self.ent2embed.get(ent, np.zeros(300, ))
            feat.extend(ent_emb)
            feat.extend(word_emb)
            feat.append(cosine_sim_vectors(word_emb, ent_emb))
            max_sim_1 = 0
            max_sim_2 = 0
            if candidate.id in knowledge_base.documents:
                for section in knowledge_base.documents[candidate.id].sections:
                    for sentence in section:
                        s = sentence.lower()
                        tmp_sim_1 = sum(1 if x.strip() in s else 0 for x in context_1)
                        tmp_sim_2 = sum(1 if x.strip() in s else 0 for x in context_2)
                        max_sim_1 = max(max_sim_1, tmp_sim_1)
                        max_sim_2 = max(max_sim_2, tmp_sim_2)
            feat.append(max_sim_1 / len(context_1) if len(context_1) > 0 else 0)
            feat.append(max_sim_2 / len(context_2) if len(context_2) > 0 else 0)
            dev_set.append(feat)
        dev_set = np.array(dev_set)
        if mention.candidates:
            pred = self.net.predict_proba(dev_set)
            pred_cids.append(mention.candidates[np.argmax(pred)].id)
        else:
            pred_cids.append('NIL')
    return pred_cids
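# The exp(-distance) transform used in both fit() and predict() maps an
# unbounded edit distance to a (0, 1] similarity. A standalone check with
# made-up strings:
import math
import Levenshtein

for s1, s2 in [('Paris', 'Paris'), ('Paris', 'Pari'), ('Paris', 'London')]:
    d = Levenshtein.distance(s1, s2)
    print(s1, s2, d, round(math.exp(-d), 4))
# identical -> 1.0; one edit -> ≈ 0.3679; unrelated strings decay toward 0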
    b = (norm(ans[i] - A[i]) + norm(ans[i] - B[i]) +
         norm(ans[i] - C[i]) + norm(ans[i] - D[i])) / 3.0
    cos_ans.append(a)
    euc_dis.append(b)

cos_ans = list(np.array(cos_ans, dtype='float64'))
euc_dis = list(np.array(euc_dis, dtype='float64'))
data["cos_sim"] = cos_ans
data["euc_dis"] = euc_dis
del cos_ans, euc_dis, a, b
#-----------------------------------------------------------------------------#
#-----------------------------------------------------------------------------#
# Levenshtein distance between the answer column and each choice column.
import Levenshtein as lv

LdistA = [lv.distance(data.iloc[i, 1], data.iloc[i, 2]) for i in range(data.shape[0])]
LdistB = [lv.distance(data.iloc[i, 1], data.iloc[i, 3]) for i in range(data.shape[0])]
LdistC = [lv.distance(data.iloc[i, 1], data.iloc[i, 4]) for i in range(data.shape[0])]
LdistD = [lv.distance(data.iloc[i, 1], data.iloc[i, 5]) for i in range(data.shape[0])]
data['Ldist'] = np.sum(
    np.transpose(np.array([LdistA, LdistB, LdistC, LdistD])), axis=1) / 4
del LdistA, LdistB, LdistC, LdistD
#-----------------------------------------------------------------------------#
#-----------------------------------------------------------------------------#
# Compare the POS tags of the answer and each choice.
tags_diff = []  # 0 = false (tags are the same)
for i in range(data.shape[0]):
    # Create full sentences by substituting each choice into the blank.
    ans_s = data.iloc[i, 0].replace('_____', data.iloc[i, 1])
    A_s = data.iloc[i, 0].replace('_____', data.iloc[i, 2])