def changes(text):
    """Map each stop-word-delimited phrase of ``text`` to its closest station name.

    The lower-cased text is split on whitespace, and every run of consecutive
    non-stop words forms one phrase. Each phrase is compared, by Hamming
    distance, against every entry of ``rc.class3`` (lower-cased, with stop
    words removed), and the closest entry is kept.

    :param text: (str) free-form input text
    :return: (list) one normalised ``rc.class3`` entry per phrase, in the order
        the phrases appear; ``''`` for a phrase when ``rc.class3`` is empty
    """
    stop_words = set(get_stop_words('russian'))

    # Split the text into phrases: runs of consecutive non-stop words.
    phrases = []
    current = []
    for token in text.lower().split():
        if token not in stop_words:
            current.append(token)
        elif current:
            # Words are joined with a space so a phrase has the same shape as
            # the space-joined candidates it is compared against below.
            phrases.append(' '.join(current))
            current = []
    # Keep the trailing phrase only when it is non-empty, mirroring the guard
    # used for phrases that are closed by a stop word.
    if current:
        phrases.append(' '.join(current))

    # Normalise the candidates once rather than once per phrase.
    candidates = [
        ' '.join(w for w in name.lower().split() if w not in stop_words)
        for name in rc.class3
    ]

    # min() returns the first candidate with the smallest distance, which is
    # the same tie-break as a strict "<" scan.
    return [
        min(
            candidates,
            key=lambda cand, phrase=phrase: textdistance.hamming(cand, phrase),
            default='',
        )
        for phrase in phrases
    ]
def execute(stemmed_text, lemmatized_text):
    """Replace every stem in ``stemmed_text`` by its closest lemma.

    Both texts are tokenized with ``word_tokenize``. For each stem, the lemma
    with the smallest Hamming distance is chosen; the first such lemma wins on
    ties.

    :param stemmed_text: (str) text whose tokens are stems
    :param lemmatized_text: (str) text whose tokens are the candidate lemmas
    :return: (str) the chosen lemmas joined by single spaces, one per stem
    :raises Exception: wraps any error raised while processing; the original
        error is attached as the cause
    """
    try:
        lemmas = word_tokenize(lemmatized_text)
        stems = word_tokenize(stemmed_text)
        matched = []
        # iterate over all stems
        for stem in stems:
            # Only lemmas with a distance strictly below 10 qualify; when none
            # does, an empty string is emitted for this stem.
            best_distance = 10
            best_lemma = ''
            for lemma in lemmas:
                distance = textdistance.hamming(stem, lemma)
                # keep the closest lemma seen so far
                if distance < best_distance:
                    best_distance = distance
                    best_lemma = lemma
            matched.append(best_lemma)
        return ' '.join(matched)
    except Exception as error:
        # Chain explicitly so the original traceback is reported as the cause.
        raise Exception(
            '[text_distance] - error {0}'.format(error)) from error
def checkImageHashes():
    """Delete files whose hash is close to the hash of the next file.

    Reads the module-level ``hashes`` mapping (presumably file name -> image
    hash string; confirm against the code that fills it) and the module-level
    ``directory``. Entries are visited in sorted order and each one is
    compared with its successor. When the Hamming distance between the two
    hashes is at most 30, the current file is logged to ``duplicates.txt`` and
    removed from ``directory``; the successor is kept.
    """
    # (key, value) pairs of the hashes dictionary, in sorted order
    items = sorted(hashes.items())

    # walk over each entry together with the entry that follows it
    for cur, nxt in zip(items, items[1:]):
        cur_hash = cur[1]  # 2nd item of the tuple: the stored hash
        nxt_hash = nxt[1]  # 2nd item of the tuple: the stored hash

        # variation between the two hashes, as a Hamming distance
        answer = textdistance.hamming(cur_hash, nxt_hash)

        # A distance of 30 or less marks the current file as a duplicate.
        # NOTE(review): an older comment described the hashes as 32-character
        # strings; if so, 30 is a very permissive threshold - confirm.
        if 0 <= answer <= 30:
            # log the file that is about to be deleted; the with-block closes
            # the log file on exit
            with open("duplicates.txt", "a") as f:
                f.write("File: " + str(cur[0]) + ", Deviation: " +
                        str(answer) + "\n")

            # os.remove deletes the file permanently (no recycle bin)
            toDelete = os.path.join(directory, cur[0])
            os.remove(toDelete)
def suggest_symbols(game_id: int, user_id: int, text: str, buy_or_sell: str):
    """Suggest symbols whose ticker or name starts with ``text``.

    For ``"buy"`` every row of the ``symbols`` table is eligible. For
    ``"sell"`` only symbols present in the user's active balances for the game
    are eligible.

    :param game_id: game whose balances are consulted for ``"sell"``
    :param user_id: user whose balances are consulted for ``"sell"``
    :param text: prefix typed by the user; matched upper-cased
    :param buy_or_sell: either ``"buy"`` or ``"sell"``
    :return: list of ``{"symbol", "label", "dist"}`` dicts sorted by ascending
        Hamming distance between the upper-cased text and the symbol
    :raises ValueError: if ``buy_or_sell`` is neither ``"buy"`` nor ``"sell"``
    """
    # Reject unknown modes up front; otherwise no query runs and the result
    # variable would be unbound further down.
    if buy_or_sell not in ("buy", "sell"):
        raise ValueError(
            f"buy_or_sell must be 'buy' or 'sell', got {buy_or_sell!r}")

    query_text = text.upper()
    to_match = f"{query_text}%"

    if buy_or_sell == "buy":
        symbol_suggestions = query_to_dict(
            """
            SELECT * FROM symbols
            WHERE symbol LIKE %s OR name LIKE %s;""", to_match, to_match)
    else:
        balances = get_active_balances(game_id, user_id)
        symbols = list(balances["symbol"].unique())
        # Nothing held means nothing to sell; this also avoids building the
        # invalid SQL fragment "IN ()".
        if not symbols:
            return []
        placeholders = ",".join(["%s"] * len(symbols))
        params_list = [to_match] * 2 + symbols
        symbol_suggestions = query_to_dict(
            f"""
            SELECT * FROM symbols
            WHERE (symbol LIKE %s OR name LIKE %s) AND symbol IN ({placeholders});""",
            params_list)

    # Compare against the upper-cased text, the same form used in the query,
    # so the distance is not inflated by a pure case difference.
    suggestions = [{
        "symbol": entry["symbol"],
        "label": f"{entry['symbol']} ({entry['name']})",
        "dist": hamming(query_text, entry["symbol"]),
    } for entry in symbol_suggestions]

    # sort suggestions by hamming distance between text and ticker entry
    return sorted(suggestions, key=lambda item: item["dist"])
def check_dist(filename, result):
    """
    Compare the tokens of a query file against the tokens of each result file.

    :param filename: (str) the path of the query file
    :param result: (list) documents to compare with the query file; each entry
        is tokenized the same way as ``filename``
    :return: four lists with one value per entry of ``result``, in this order:
        jaccard distances (on token sets), edit distances, hamming values and
        cosine values (the last three on the token sequences).
        NOTE(review): the cosine value is whatever ``textdistance.cosine(a, b)``
        returns when called directly - confirm whether that is a distance or
        a similarity.
    """
    _, query_tokens = utils.tokenize_file(filename)
    query_set = set(query_tokens)

    jaccard_dist, edit_dist, hamming_dist, cosine_dist = [], [], [], []
    for doc_path in result:
        _, doc_tokens = utils.tokenize_file(doc_path)
        doc_set = set(doc_tokens)

        # set-based measure first, then the sequence-based ones
        jaccard_value = nltk.jaccard_distance(query_set, doc_set)
        edit_value = nltk.edit_distance(query_tokens, doc_tokens)
        hamming_value = textdistance.hamming(query_tokens, doc_tokens)
        cosine_value = textdistance.cosine(query_tokens, doc_tokens)

        cosine_dist.append(cosine_value)
        jaccard_dist.append(jaccard_value)
        edit_dist.append(edit_value)
        hamming_dist.append(hamming_value)

    return jaccard_dist, edit_dist, hamming_dist, cosine_dist
def hamming_distance(x, y):
    """Calculate the hamming distance (number of bits different) between the
    two integers given.

    Two integers are compared bit by bit. Any other pair of arguments (for
    example two strings) is passed to ``textdistance.hamming`` unchanged, so
    callers that pass sequences keep working.

    :param x: first integer (or sequence)
    :param y: second integer (or sequence)
    :return: number of differing bits for integers, otherwise the result of
        ``textdistance.hamming(x, y)``

    >>> [hamming_distance(x, 15) for x in [0, 8, 10, 12, 14, 15]]
    [4, 3, 2, 2, 1, 0]
    """
    if isinstance(x, int) and isinstance(y, int):
        # XOR leaves a 1 exactly where the two numbers differ; count them.
        return bin(x ^ y).count("1")
    return textdistance.hamming(x, y)
def helpful_substation_lookup(self, substation: str):
    """Return a substation and its node data, or fail with a helpful hint.

    :param substation: name of the node to look up in ``self.graph``
    :return: tuple ``(substation, data)`` where ``data`` is what
        ``self.graph.nodes(data=True)[substation]`` yields
    :raises KeyError: if the name is not a node of the graph; the message
        lists node names within a Hamming distance of 1, when there are any
    """
    # Happy path first: the name is a known node.
    if substation in self.graph.nodes:
        return substation, self.graph.nodes(data=True)[substation]

    # Otherwise collect near misses to make the error message actionable.
    suggestions = []
    for candidate in list(self.graph.nodes):
        if textdistance.hamming(substation, candidate) <= 1:
            suggestions.append(candidate)

    hint = ""
    if len(suggestions) > 0:
        hint = " Did you mean: {}?".format(suggestions)

    raise KeyError("Did not find substation '{}'.{}".format(substation, hint))
def define_station(text):
    """Return the entry of ``rc.class3`` closest to ``text``.

    Both the input and every candidate are lower-cased and stripped of
    stop words before being compared by Hamming distance.

    :param text: (str) free-form station description
    :return: (str) the normalised candidate with the smallest distance (the
        first one on ties), or ``''`` when ``rc.class3`` is empty
    """
    stop_words = get_stop_words('russian')

    def _normalise(phrase):
        # lower-case, drop stop words, re-join with single spaces
        return ' '.join(
            [w for w in phrase.lower().split() if w not in stop_words])

    query = _normalise(text)
    normalised_candidates = (_normalise(name) for name in rc.class3)

    # min() keeps the first of several equally close candidates.
    return min(
        normalised_candidates,
        key=lambda name: textdistance.hamming(name, query),
        default='',
    )
def compute_differences(f1_n, f2_n, numerical=False):
    """Summarise the line-by-line differences between two text files.

    :param f1_n: path of the first file
    :param f2_n: path of the second file
    :param numerical: when true, differing lines that do not start with ``>``
        are parsed as whitespace-separated floats and compared by the
        absolute difference of their NaN-ignoring means, instead of by
        Hamming distance
    :return: ``{"lines": <line count difference>, "n_line": <lines in file 1>}``
        when the files have different numbers of lines; otherwise
        ``{"letters": <summed Hamming distance>, "n_line": <lines in file 1>,
        "val": <list of mean differences>}``
    """
    with open(f1_n, "r") as fh1, open(f2_n, "r") as fh2:
        lines_1 = fh1.readlines()
        lines_2 = fh2.readlines()

    n_line = len(lines_1)
    if n_line != len(lines_2):
        return {"lines": n_line - len(lines_2), "n_line": n_line}

    letters = 0
    val = []
    for first, second in zip(lines_1, lines_2):
        if first == second:
            continue

        if numerical and not first.startswith(">"):
            # numeric payload line: compare the means of the two rows
            p1 = np.array([float(tok) for tok in first.strip().split()])
            p2 = np.array([float(tok) for tok in second.strip().split()])
            mean_1 = np.nanmean(p1)
            mean_2 = np.nanmean(p2)
            print(f"percent b string1 {mean_1:.2f} percent b string 2 {mean_2:.2f} , size {len(p1)}")
            val.append(np.abs(mean_1 - mean_2))
        else:
            # plain text mode, or a ">" line in numerical mode
            letters += textdistance.hamming(first, second)

    return {"letters": letters, "n_line": n_line, "val": val}
def compare(self, str1, str2):
    """Compare two strings by Hamming distance and fill in ``self.result``.

    Sets ``distance``, ``nos`` (length of the longer string), ``threshold``
    and ``similarity`` (a percentage derived from distance and ``nos``) on
    ``self.result``. The distance computation is bracketed by
    ``self.start_time()`` / ``self.end_time()``.

    :param str1: first string
    :param str2: second string
    :return: ``self.result`` with the fields above updated
    """
    if self.debug:
        self.log("hamming comparison")

    self.start_time()
    self.result.distance = hamming(str1, str2)
    self.end_time()

    self.result.nos = max(len(str1), len(str2))
    # NOTE(review): presumably the similarity percentage treated as a match
    # by the consumer of the result - confirm.
    self.result.threshold = 90

    if self.result.nos == 0:
        # Both inputs are empty: they are identical, and the formula below
        # would divide by zero.
        self.result.similarity = 100.0
    else:
        self.result.similarity = (100.0 / float(self.result.nos)) * (
            self.result.nos - self.result.distance)
    return self.result
def load_lat_long_from_csv(df):
    """
    Look up a latitude/longitude pair for every row of ``df``.

    dataset: https://github.com/datosgobar/georef-ar-api/blob/master/config/georef.example.cfg

    Each row's ``Localidad`` (upper-cased) is matched against the ``nombre``
    column of the records returned by ``load_localidades()``:

    * exactly one match: its ``lat``/``lon`` are used;
    * several matches: the one whose ``provincia`` has the smallest Hamming
      distance to the row's ``Provincia`` is used;
    * no match: ``lat`` and ``lon`` are ``None``.

    :param df: DataFrame with ``Localidad`` and ``Provincia`` columns
    :return: DataFrame with ``lat`` and ``lon`` columns, indexed like ``df``
    """
    registros = load_localidades()
    localidades = df.Localidad.str.upper()
    provincias = df.Provincia

    # Collect one single-row frame per input row and concatenate once at the
    # end: DataFrame.append was removed in pandas 2.0 and copied the whole
    # frame on every call.
    frames = []
    for localidad, provincia in zip(localidades, provincias):
        aux = registros[registros.nombre == localidad]
        n_matches = aux.shape[0]
        if n_matches == 1:
            frames.append(aux[["lat", "lon"]])
        elif n_matches > 1:
            # argmin() is positional, hence iloc below
            closest = aux.provincia.apply(
                lambda prov, target=provincia: td.hamming(prov, target)
            ).argmin()
            frames.append(aux.iloc[[closest]].loc[:, ["lat", "lon"]])
        else:
            frames.append(
                pd.DataFrame.from_records([{
                    "lat": None,
                    "lon": None
                }]))

    if frames:
        data = pd.concat(frames)
    else:
        # no input rows: keep the expected columns on an empty frame
        data = pd.DataFrame(columns=["lat", "lon"])
    return data.set_index(df.index)
def sequenceDistance(dfEnsp, ref_dic, newcolresult, hamming, hammingNorm,
                     levenshtein, levenshteinNorm):
    """Compare each row's sequence with its reference and record distances.

    For every row, the stripped string form of ``proSequence`` is compared
    with the string form of ``ref_dic[str(stableID_key)]``. Five columns are
    written to ``dfEnsp``:

    * ``newcolresult``: ``'True'`` when the sequences are equal, else
      ``'False'``;
    * ``hamming`` / ``hammingNorm``: Hamming distance and its normalized form;
    * ``levenshtein`` / ``levenshteinNorm``: Levenshtein distance and its
      normalized form.

    The four distance columns hold the string ``'identical'`` for rows whose
    sequences are equal.

    :param dfEnsp: DataFrame with ``proSequence`` and ``stableID_key`` columns;
        modified in place
    :param ref_dic: mapping from stable ID (as ``str``) to reference sequence
    :param newcolresult: name of the equality column
    :param hamming: name of the Hamming distance column
    :param hammingNorm: name of the normalized Hamming distance column
    :param levenshtein: name of the Levenshtein distance column
    :param levenshteinNorm: name of the normalized Levenshtein distance column
    :return: ``dfEnsp`` with the five columns set
    :raises KeyError: if a stable ID is missing from ``ref_dic``
    """
    res = []
    ham = []
    hamnorm = []
    lev = []
    levnorm = []

    # Walk both columns in lockstep (positional pairing), so a duplicated
    # index label cannot pull several IDs for one row.
    for raw_seq, raw_id in zip(dfEnsp['proSequence'], dfEnsp['stableID_key']):
        seq = str(raw_seq).strip()
        # Convert the reference to str as well, so both sides of the
        # comparison and of the distance calls are plain strings.
        ref = str(ref_dic[str(raw_id)])

        if ref == seq:
            # identical to the reference
            res.append('True')
            ham.append('identical')
            hamnorm.append('identical')
            lev.append('identical')
            levnorm.append('identical')
        else:
            # not identical to the reference
            res.append('False')
            # hamming compares position by position
            ham.append(textdistance.hamming(ref, seq))
            hamnorm.append(textdistance.hamming.normalized_distance(ref, seq))
            # levenshtein is edit based: one insertion counts as one edit
            lev.append(textdistance.levenshtein(ref, seq))
            levnorm.append(
                textdistance.levenshtein.normalized_distance(ref, seq))

    dfEnsp.loc[:, newcolresult] = res
    dfEnsp.loc[:, hamming] = ham
    dfEnsp.loc[:, hammingNorm] = hamnorm
    dfEnsp.loc[:, levenshtein] = lev
    dfEnsp.loc[:, levenshteinNorm] = levnorm
    return dfEnsp
def TextMatch(str1, str2):
    """Return True if there is a close match between the two strings.

    The checks run from cheapest to most tolerant and stop at the first one
    that succeeds: exact equality, equality ignoring case (``lower``),
    equality under aggressive case folding (``casefold``), and finally a
    Hamming distance below 2 between the strings as given.
    """
    return (
        str1 == str2
        or str1.lower() == str2.lower()
        or str1.casefold() == str2.casefold()
        or hamming(str1, str2) < 2
    )
def valid_word_permutations(
    word_map: WordMap, word: Optional[str]
) -> List[PermutationOption]:
    """List non-word letter permutations of ``word``, farthest from the answers first.

    Every permutation of the letters of ``word`` that ``word_map.is_word``
    rejects becomes a ``PermutationOption`` carrying its smallest Hamming
    distance to any of ``word_map.correct_answers(word)``. Collection stops
    once more than 1000 options have been gathered.

    :param word_map: provides ``correct_answers`` and ``is_word``
    :param word: word to permute; ``None`` gives an empty result
    :return: options sorted by descending ``minimum_distance``; empty when
        ``word`` is ``None`` or has no correct answers
    """
    if word is None:
        return []

    answers = word_map.correct_answers(word)
    if not len(answers):
        return []

    collected = []
    for letters in permutations(word):
        candidate = "".join(letters)
        if word_map.is_word(candidate):
            continue

        distances = [
            textdistance.hamming(candidate, answer) for answer in answers
        ]
        collected.append(
            PermutationOption(word=candidate, minimum_distance=min(distances)))

        # cap the amount of work for long words
        if len(collected) > 1000:
            break

    # largest distance first; equal distances keep their generation order
    collected.sort(key=lambda option: option.minimum_distance, reverse=True)
    return collected
def simple_example(): str1, str2 = 'test', 'text' qval = 2 #-------------------- # Edit-based. if True: print("textdistance.hamming({}, {}) = {}.".format( str1, str2, textdistance.hamming(str1, str2))) print("textdistance.hamming.distance({}, {}) = {}.".format( str1, str2, textdistance.hamming.distance(str1, str2))) print("textdistance.hamming.similarity({}, {}) = {}.".format( str1, str2, textdistance.hamming.similarity(str1, str2))) print("textdistance.hamming.normalized_distance({}, {}) = {}.".format( str1, str2, textdistance.hamming.normalized_distance(str1, str2))) print( "textdistance.hamming.normalized_similarity({}, {}) = {}.".format( str1, str2, textdistance.hamming.normalized_similarity(str1, str2))) print( "textdistance.Hamming(qval={}, test_func=None, truncate=False, external=True).distance({}, {}) = {}." .format( qval, str1, str2, textdistance.Hamming(qval=qval, test_func=None, truncate=False, external=True).distance(str1, str2))) print("textdistance.mlipns({}, {}) = {}.".format( str1, str2, textdistance.mlipns(str1, str2))) print("textdistance.mlipns.distance({}, {}) = {}.".format( str1, str2, textdistance.mlipns.distance(str1, str2))) print("textdistance.mlipns.similarity({}, {}) = {}.".format( str1, str2, textdistance.mlipns.similarity(str1, str2))) print("textdistance.mlipns.normalized_distance({}, {}) = {}.".format( str1, str2, textdistance.mlipns.normalized_distance(str1, str2))) print("textdistance.mlipns.normalized_similarity({}, {}) = {}.".format( str1, str2, textdistance.mlipns.normalized_similarity(str1, str2))) print( "textdistance.MLIPNS(threshold=0.25, maxmismatches=2, qval={}, external=True).distance({}, {}) = {}." 
.format( qval, str1, str2, textdistance.MLIPNS(threshold=0.25, maxmismatches=2, qval=qval, external=True).distance(str1, str2))) print("textdistance.levenshtein({}, {}) = {}.".format( str1, str2, textdistance.levenshtein(str1, str2))) print("textdistance.levenshtein.distance({}, {}) = {}.".format( str1, str2, textdistance.levenshtein.distance(str1, str2))) print("textdistance.levenshtein.similarity({}, {}) = {}.".format( str1, str2, textdistance.levenshtein.similarity(str1, str2))) print("textdistance.levenshtein.normalized_distance({}, {}) = {}.". format(str1, str2, textdistance.levenshtein.normalized_distance(str1, str2))) print("textdistance.levenshtein.normalized_similarity({}, {}) = {}.". format( str1, str2, textdistance.levenshtein.normalized_similarity(str1, str2))) print( "textdistance.Levenshtein(qval={}, test_func=None, external=True).distance({}, {}) = {}." .format( qval, str1, str2, textdistance.Levenshtein(qval=qval, test_func=None, external=True).distance(str1, str2))) print("textdistance.damerau_levenshtein({}, {}) = {}.".format( str1, str2, textdistance.damerau_levenshtein(str1, str2))) print("textdistance.damerau_levenshtein.distance({}, {}) = {}.".format( str1, str2, textdistance.damerau_levenshtein.distance(str1, str2))) print( "textdistance.damerau_levenshtein.similarity({}, {}) = {}.".format( str1, str2, textdistance.damerau_levenshtein.similarity(str1, str2))) print( "textdistance.damerau_levenshtein.normalized_distance({}, {}) = {}." .format( str1, str2, textdistance.damerau_levenshtein.normalized_distance( str1, str2))) print( "textdistance.damerau_levenshtein.normalized_similarity({}, {}) = {}." .format( str1, str2, textdistance.damerau_levenshtein.normalized_similarity( str1, str2))) print( "textdistance.DamerauLevenshtein(qval={}, test_func=None, external=True).distance({}, {}) = {}." 
.format( qval, str1, str2, textdistance.DamerauLevenshtein(qval=qval, test_func=None, external=True).distance( str1, str2))) print("textdistance.jaro({}, {}) = {}.".format( str1, str2, textdistance.jaro(str1, str2))) print("textdistance.jaro.distance({}, {}) = {}.".format( str1, str2, textdistance.jaro.distance(str1, str2))) print("textdistance.jaro.similarity({}, {}) = {}.".format( str1, str2, textdistance.jaro.similarity(str1, str2))) print("textdistance.jaro.normalized_distance({}, {}) = {}.".format( str1, str2, textdistance.jaro.normalized_distance(str1, str2))) print("textdistance.jaro.normalized_similarity({}, {}) = {}.".format( str1, str2, textdistance.jaro.normalized_similarity(str1, str2))) print( "textdistance.Jaro(long_tolerance=False, qval={}, external=True).distance({}, {}) = {}." .format( qval, str1, str2, textdistance.Jaro(long_tolerance=False, qval=qval, external=True).distance(str1, str2))) print("textdistance.jaro_winkler({}, {}) = {}.".format( str1, str2, textdistance.jaro_winkler(str1, str2))) print("textdistance.jaro_winkler.distance({}, {}) = {}.".format( str1, str2, textdistance.jaro_winkler.distance(str1, str2))) print("textdistance.jaro_winkler.similarity({}, {}) = {}.".format( str1, str2, textdistance.jaro_winkler.similarity(str1, str2))) print("textdistance.jaro_winkler.normalized_distance({}, {}) = {}.". format(str1, str2, textdistance.jaro_winkler.normalized_distance(str1, str2))) print("textdistance.jaro_winkler.normalized_similarity({}, {}) = {}.". format( str1, str2, textdistance.jaro_winkler.normalized_similarity(str1, str2))) print( "textdistance.JaroWinkler(long_tolerance=False, winklerize=True, qval={}, external=True).distance({}, {}) = {}." 
.format( qval, str1, str2, textdistance.JaroWinkler(long_tolerance=False, winklerize=True, qval=qval, external=True).distance(str1, str2))) print("textdistance.strcmp95({}, {}) = {}.".format( str1, str2, textdistance.strcmp95(str1, str2))) print("textdistance.strcmp95.distance({}, {}) = {}.".format( str1, str2, textdistance.strcmp95.distance(str1, str2))) print("textdistance.strcmp95.similarity({}, {}) = {}.".format( str1, str2, textdistance.strcmp95.similarity(str1, str2))) print("textdistance.strcmp95.normalized_distance({}, {}) = {}.".format( str1, str2, textdistance.strcmp95.normalized_distance(str1, str2))) print( "textdistance.strcmp95.normalized_similarity({}, {}) = {}.".format( str1, str2, textdistance.strcmp95.normalized_similarity(str1, str2))) print( "textdistance.StrCmp95(long_strings=False, external=True).distance({}, {}) = {}." .format( str1, str2, textdistance.StrCmp95(long_strings=False, external=True).distance(str1, str2))) print("textdistance.needleman_wunsch({}, {}) = {}.".format( str1, str2, textdistance.needleman_wunsch(str1, str2))) print("textdistance.needleman_wunsch.distance({}, {}) = {}.".format( str1, str2, textdistance.needleman_wunsch.distance(str1, str2))) print("textdistance.needleman_wunsch.similarity({}, {}) = {}.".format( str1, str2, textdistance.needleman_wunsch.similarity(str1, str2))) print( "textdistance.needleman_wunsch.normalized_distance({}, {}) = {}.". format( str1, str2, textdistance.needleman_wunsch.normalized_distance(str1, str2))) print( "textdistance.needleman_wunsch.normalized_similarity({}, {}) = {}." .format( str1, str2, textdistance.needleman_wunsch.normalized_similarity( str1, str2))) print( "textdistance.NeedlemanWunsch(gap_cost=1.0, sim_func=None, qval={}, external=True).distance({}, {}) = {}." 
.format( qval, str1, str2, textdistance.NeedlemanWunsch(gap_cost=1.0, sim_func=None, qval=qval, external=True).distance( str1, str2))) print("textdistance.gotoh({}, {}) = {}.".format( str1, str2, textdistance.gotoh(str1, str2))) print("textdistance.gotoh.distance({}, {}) = {}.".format( str1, str2, textdistance.gotoh.distance(str1, str2))) print("textdistance.gotoh.similarity({}, {}) = {}.".format( str1, str2, textdistance.gotoh.similarity(str1, str2))) print("textdistance.gotoh.normalized_distance({}, {}) = {}.".format( str1, str2, textdistance.gotoh.normalized_distance(str1, str2))) print("textdistance.gotoh.normalized_similarity({}, {}) = {}.".format( str1, str2, textdistance.gotoh.normalized_similarity(str1, str2))) print( "textdistance.Gotoh(gap_open=1, gap_ext=0.4, sim_func=None, qval={}, external=True).distance({}, {}) = {}." .format( qval, str1, str2, textdistance.Gotoh(gap_open=1, gap_ext=0.4, sim_func=None, qval=qval, external=True).distance(str1, str2))) print("textdistance.smith_waterman({}, {}) = {}.".format( str1, str2, textdistance.smith_waterman(str1, str2))) print("textdistance.smith_waterman.distance({}, {}) = {}.".format( str1, str2, textdistance.smith_waterman.distance(str1, str2))) print("textdistance.smith_waterman.similarity({}, {}) = {}.".format( str1, str2, textdistance.smith_waterman.similarity(str1, str2))) print("textdistance.smith_waterman.normalized_distance({}, {}) = {}.". format( str1, str2, textdistance.smith_waterman.normalized_distance(str1, str2))) print( "textdistance.smith_waterman.normalized_similarity({}, {}) = {}.". format( str1, str2, textdistance.smith_waterman.normalized_similarity(str1, str2))) print( "textdistance.SmithWaterman(gap_cost=1.0, sim_func=None, qval={}, external=True).distance({}, {}) = {}." .format( qval, str1, str2, textdistance.SmithWaterman(gap_cost=1.0, sim_func=None, qval=qval, external=True).distance(str1, str2))) #-------------------- # Token-based. 
if False: print("textdistance.jaccard({}, {}) = {}.".format( str1, str2, textdistance.jaccard(str1, str2))) print("textdistance.jaccard.distance({}, {}) = {}.".format( str1, str2, textdistance.jaccard.distance(str1, str2))) print("textdistance.jaccard.similarity({}, {}) = {}.".format( str1, str2, textdistance.jaccard.similarity(str1, str2))) print("textdistance.jaccard.normalized_distance({}, {}) = {}.".format( str1, str2, textdistance.jaccard.normalized_distance(str1, str2))) print( "textdistance.jaccard.normalized_similarity({}, {}) = {}.".format( str1, str2, textdistance.jaccard.normalized_similarity(str1, str2))) print( "textdistance.Jaccard(qval={}, as_set=False, external=True).distance({}, {}) = {}." .format( qval, str1, str2, textdistance.Jaccard(qval=qval, as_set=False, external=True).distance(str1, str2))) print("textdistance.sorensen({}, {}) = {}.".format( str1, str2, textdistance.sorensen(str1, str2))) print("textdistance.sorensen.distance({}, {}) = {}.".format( str1, str2, textdistance.sorensen.distance(str1, str2))) print("textdistance.sorensen.similarity({}, {}) = {}.".format( str1, str2, textdistance.sorensen.similarity(str1, str2))) print("textdistance.sorensen.normalized_distance({}, {}) = {}.".format( str1, str2, textdistance.sorensen.normalized_distance(str1, str2))) print( "textdistance.sorensen.normalized_similarity({}, {}) = {}.".format( str1, str2, textdistance.sorensen.normalized_similarity(str1, str2))) print( "textdistance.Sorensen(qval={}, as_set=False, external=True).distance({}, {}) = {}." 
.format( qval, str1, str2, textdistance.Sorensen(qval=qval, as_set=False, external=True).distance(str1, str2))) print("textdistance.sorensen_dice({}, {}) = {}.".format( str1, str2, textdistance.sorensen_dice(str1, str2))) print("textdistance.sorensen_dice.distance({}, {}) = {}.".format( str1, str2, textdistance.sorensen_dice.distance(str1, str2))) print("textdistance.sorensen_dice.similarity({}, {}) = {}.".format( str1, str2, textdistance.sorensen_dice.similarity(str1, str2))) print("textdistance.sorensen_dice.normalized_distance({}, {}) = {}.". format( str1, str2, textdistance.sorensen_dice.normalized_distance(str1, str2))) print("textdistance.sorensen_dice.normalized_similarity({}, {}) = {}.". format( str1, str2, textdistance.sorensen_dice.normalized_similarity(str1, str2))) #print("textdistance.SorensenDice().distance({}, {}) = {}.".format(str1, str2, textdistance.SorensenDice().distance(str1, str2))) print("textdistance.tversky({}, {}) = {}.".format( str1, str2, textdistance.tversky(str1, str2))) print("textdistance.tversky.distance({}, {}) = {}.".format( str1, str2, textdistance.tversky.distance(str1, str2))) print("textdistance.tversky.similarity({}, {}) = {}.".format( str1, str2, textdistance.tversky.similarity(str1, str2))) print("textdistance.tversky.normalized_distance({}, {}) = {}.".format( str1, str2, textdistance.tversky.normalized_distance(str1, str2))) print( "textdistance.tversky.normalized_similarity({}, {}) = {}.".format( str1, str2, textdistance.tversky.normalized_similarity(str1, str2))) print( "textdistance.Tversky(qval={}, ks=None, bias=None, as_set=False, external=True).distance({}, {}) = {}." 
.format( qval, str1, str2, textdistance.Tversky(qval=qval, ks=None, bias=None, as_set=False, external=True).distance(str1, str2))) print("textdistance.overlap({}, {}) = {}.".format( str1, str2, textdistance.overlap(str1, str2))) print("textdistance.overlap.distance({}, {}) = {}.".format( str1, str2, textdistance.overlap.distance(str1, str2))) print("textdistance.overlap.similarity({}, {}) = {}.".format( str1, str2, textdistance.overlap.similarity(str1, str2))) print("textdistance.overlap.normalized_distance({}, {}) = {}.".format( str1, str2, textdistance.overlap.normalized_distance(str1, str2))) print( "textdistance.overlap.normalized_similarity({}, {}) = {}.".format( str1, str2, textdistance.overlap.normalized_similarity(str1, str2))) print( "textdistance.Overlap(qval={}, as_set=False, external=True).distance({}, {}) = {}." .format( qval, str1, str2, textdistance.Overlap(qval=qval, as_set=False, external=True).distance(str1, str2))) # This is identical to the Jaccard similarity coefficient and the Tversky index for alpha=1 and beta=1. print("textdistance.tanimoto({}, {}) = {}.".format( str1, str2, textdistance.tanimoto(str1, str2))) print("textdistance.tanimoto.distance({}, {}) = {}.".format( str1, str2, textdistance.tanimoto.distance(str1, str2))) print("textdistance.tanimoto.similarity({}, {}) = {}.".format( str1, str2, textdistance.tanimoto.similarity(str1, str2))) print("textdistance.tanimoto.normalized_distance({}, {}) = {}.".format( str1, str2, textdistance.tanimoto.normalized_distance(str1, str2))) print( "textdistance.tanimoto.normalized_similarity({}, {}) = {}.".format( str1, str2, textdistance.tanimoto.normalized_similarity(str1, str2))) print( "textdistance.Tanimoto(qval={}, as_set=False, external=True).distance({}, {}) = {}." 
.format( qval, str1, str2, textdistance.Tanimoto(qval=qval, as_set=False, external=True).distance(str1, str2))) print("textdistance.cosine({}, {}) = {}.".format( str1, str2, textdistance.cosine(str1, str2))) print("textdistance.cosine.distance({}, {}) = {}.".format( str1, str2, textdistance.cosine.distance(str1, str2))) print("textdistance.cosine.similarity({}, {}) = {}.".format( str1, str2, textdistance.cosine.similarity(str1, str2))) print("textdistance.cosine.normalized_distance({}, {}) = {}.".format( str1, str2, textdistance.cosine.normalized_distance(str1, str2))) print("textdistance.cosine.normalized_similarity({}, {}) = {}.".format( str1, str2, textdistance.cosine.normalized_similarity(str1, str2))) print( "textdistance.Cosine(qval={}, as_set=False, external=True).distance({}, {}) = {}." .format( qval, str1, str2, textdistance.Cosine(qval=qval, as_set=False, external=True).distance(str1, str2))) print("textdistance.monge_elkan({}, {}) = {}.".format( str1, str2, textdistance.monge_elkan(str1, str2))) print("textdistance.monge_elkan.distance({}, {}) = {}.".format( str1, str2, textdistance.monge_elkan.distance(str1, str2))) print("textdistance.monge_elkan.similarity({}, {}) = {}.".format( str1, str2, textdistance.monge_elkan.similarity(str1, str2))) print("textdistance.monge_elkan.normalized_distance({}, {}) = {}.". format(str1, str2, textdistance.monge_elkan.normalized_distance(str1, str2))) print("textdistance.monge_elkan.normalized_similarity({}, {}) = {}.". format( str1, str2, textdistance.monge_elkan.normalized_similarity(str1, str2))) print( "textdistance.MongeElkan(algorithm=textdistance.DamerauLevenshtein(), symmetric=False, qval={}, external=True).distance({}, {}) = {}." 
.format( qval, str1, str2, textdistance.MongeElkan( algorithm=textdistance.DamerauLevenshtein(), symmetric=False, qval=qval, external=True).distance(str1, str2))) print("textdistance.bag({}, {}) = {}.".format( str1, str2, textdistance.bag(str1, str2))) print("textdistance.bag.distance({}, {}) = {}.".format( str1, str2, textdistance.bag.distance(str1, str2))) print("textdistance.bag.similarity({}, {}) = {}.".format( str1, str2, textdistance.bag.similarity(str1, str2))) print("textdistance.bag.normalized_distance({}, {}) = {}.".format( str1, str2, textdistance.bag.normalized_distance(str1, str2))) print("textdistance.bag.normalized_similarity({}, {}) = {}.".format( str1, str2, textdistance.bag.normalized_similarity(str1, str2))) print("textdistance.Bag(qval={}).distance({}, {}) = {}.".format( qval, str1, str2, textdistance.Bag(qval=qval).distance(str1, str2))) #-------------------- # Sequence-based. if False: print("textdistance.lcsseq({}, {}) = {}.".format( str1, str2, textdistance.lcsseq(str1, str2))) print("textdistance.lcsseq.distance({}, {}) = {}.".format( str1, str2, textdistance.lcsseq.distance(str1, str2))) print("textdistance.lcsseq.similarity({}, {}) = {}.".format( str1, str2, textdistance.lcsseq.similarity(str1, str2))) print("textdistance.lcsseq.normalized_distance({}, {}) = {}.".format( str1, str2, textdistance.lcsseq.normalized_distance(str1, str2))) print("textdistance.lcsseq.normalized_similarity({}, {}) = {}.".format( str1, str2, textdistance.lcsseq.normalized_similarity(str1, str2))) #print("textdistance.LCSSeq(qval={}, test_func=None, external=True).distance({}, {}) = {}.".format(qval, str1, str2, textdistance.LCSSeq(qval=qval, test_func=None, external=True).distance(str1, str2))) print("textdistance.LCSSeq().distance({}, {}) = {}.".format( str1, str2, textdistance.LCSSeq().distance(str1, str2))) print("textdistance.lcsstr({}, {}) = {}.".format( str1, str2, textdistance.lcsstr(str1, str2))) print("textdistance.lcsstr.distance({}, {}) = {}.".format( 
str1, str2, textdistance.lcsstr.distance(str1, str2))) print("textdistance.lcsstr.similarity({}, {}) = {}.".format( str1, str2, textdistance.lcsstr.similarity(str1, str2))) print("textdistance.lcsstr.normalized_distance({}, {}) = {}.".format( str1, str2, textdistance.lcsstr.normalized_distance(str1, str2))) print("textdistance.lcsstr.normalized_similarity({}, {}) = {}.".format( str1, str2, textdistance.lcsstr.normalized_similarity(str1, str2))) print("textdistance.LCSStr(qval={}).distance({}, {}) = {}.".format( qval, str1, str2, textdistance.LCSStr(qval=qval).distance(str1, str2))) print("textdistance.ratcliff_obershelp({}, {}) = {}.".format( str1, str2, textdistance.ratcliff_obershelp(str1, str2))) print("textdistance.ratcliff_obershelp.distance({}, {}) = {}.".format( str1, str2, textdistance.ratcliff_obershelp.distance(str1, str2))) print( "textdistance.ratcliff_obershelp.similarity({}, {}) = {}.".format( str1, str2, textdistance.ratcliff_obershelp.similarity(str1, str2))) print( "textdistance.ratcliff_obershelp.normalized_distance({}, {}) = {}." .format( str1, str2, textdistance.ratcliff_obershelp.normalized_distance( str1, str2))) print( "textdistance.ratcliff_obershelp.normalized_similarity({}, {}) = {}." .format( str1, str2, textdistance.ratcliff_obershelp.normalized_similarity( str1, str2))) print("textdistance.RatcliffObershelp().distance({}, {}) = {}.".format( str1, str2, textdistance.RatcliffObershelp().distance(str1, str2))) #-------------------- # Compression-based. 
if False: print("textdistance.arith_ncd({}, {}) = {}.".format( str1, str2, textdistance.arith_ncd(str1, str2))) print("textdistance.arith_ncd.distance({}, {}) = {}.".format( str1, str2, textdistance.arith_ncd.distance(str1, str2))) print("textdistance.arith_ncd.similarity({}, {}) = {}.".format( str1, str2, textdistance.arith_ncd.similarity(str1, str2))) print( "textdistance.arith_ncd.normalized_distance({}, {}) = {}.".format( str1, str2, textdistance.arith_ncd.normalized_distance(str1, str2))) print("textdistance.arith_ncd.normalized_similarity({}, {}) = {}.". format(str1, str2, textdistance.arith_ncd.normalized_similarity(str1, str2))) #print("textdistance.ArithNCD(base=2, terminator=None, qval={}).distance({}, {}) = {}.".format(qval, str1, str2, textdistance.ArithNCD(base=2, terminator=None, qval=qval).distance(str1, str2))) print("textdistance.ArithNCD().distance({}, {}) = {}.".format( str1, str2, textdistance.ArithNCD().distance(str1, str2))) print("textdistance.rle_ncd({}, {}) = {}.".format( str1, str2, textdistance.rle_ncd(str1, str2))) print("textdistance.rle_ncd.distance({}, {}) = {}.".format( str1, str2, textdistance.rle_ncd.distance(str1, str2))) print("textdistance.rle_ncd.similarity({}, {}) = {}.".format( str1, str2, textdistance.rle_ncd.similarity(str1, str2))) print("textdistance.rle_ncd.normalized_distance({}, {}) = {}.".format( str1, str2, textdistance.rle_ncd.normalized_distance(str1, str2))) print( "textdistance.rle_ncd.normalized_similarity({}, {}) = {}.".format( str1, str2, textdistance.rle_ncd.normalized_similarity(str1, str2))) print("textdistance.RLENCD().distance({}, {}) = {}.".format( str1, str2, textdistance.RLENCD().distance(str1, str2))) print("textdistance.bwtrle_ncd({}, {}) = {}.".format( str1, str2, textdistance.bwtrle_ncd(str1, str2))) print("textdistance.bwtrle_ncd.distance({}, {}) = {}.".format( str1, str2, textdistance.bwtrle_ncd.distance(str1, str2))) print("textdistance.bwtrle_ncd.similarity({}, {}) = {}.".format( str1, str2, 
textdistance.bwtrle_ncd.similarity(str1, str2))) print( "textdistance.bwtrle_ncd.normalized_distance({}, {}) = {}.".format( str1, str2, textdistance.bwtrle_ncd.normalized_distance(str1, str2))) print("textdistance.bwtrle_ncd.normalized_similarity({}, {}) = {}.". format(str1, str2, textdistance.bwtrle_ncd.normalized_similarity(str1, str2))) print("textdistance.BWTRLENCD(terminator='\0').distance({}, {}) = {}.". format( str1, str2, textdistance.BWTRLENCD(terminator='\0').distance(str1, str2))) print("textdistance.sqrt_ncd({}, {}) = {}.".format( str1, str2, textdistance.sqrt_ncd(str1, str2))) print("textdistance.sqrt_ncd.distance({}, {}) = {}.".format( str1, str2, textdistance.sqrt_ncd.distance(str1, str2))) print("textdistance.sqrt_ncd.similarity({}, {}) = {}.".format( str1, str2, textdistance.sqrt_ncd.similarity(str1, str2))) print("textdistance.sqrt_ncd.normalized_distance({}, {}) = {}.".format( str1, str2, textdistance.sqrt_ncd.normalized_distance(str1, str2))) print( "textdistance.sqrt_ncd.normalized_similarity({}, {}) = {}.".format( str1, str2, textdistance.sqrt_ncd.normalized_similarity(str1, str2))) print("textdistance.SqrtNCD(qval={}).distance({}, {}) = {}.".format( qval, str1, str2, textdistance.SqrtNCD(qval=qval).distance(str1, str2))) print("textdistance.entropy_ncd({}, {}) = {}.".format( str1, str2, textdistance.entropy_ncd(str1, str2))) print("textdistance.entropy_ncd.distance({}, {}) = {}.".format( str1, str2, textdistance.entropy_ncd.distance(str1, str2))) print("textdistance.entropy_ncd.similarity({}, {}) = {}.".format( str1, str2, textdistance.entropy_ncd.similarity(str1, str2))) print("textdistance.entropy_ncd.normalized_distance({}, {}) = {}.". format(str1, str2, textdistance.entropy_ncd.normalized_distance(str1, str2))) print("textdistance.entropy_ncd.normalized_similarity({}, {}) = {}.". format( str1, str2, textdistance.entropy_ncd.normalized_similarity(str1, str2))) print( "textdistance.EntropyNCD(qval={}, coef=1, base=2).distance({}, {}) = {}." 
.format( qval, str1, str2, textdistance.EntropyNCD(qval=qval, coef=1, base=2).distance(str1, str2))) print("textdistance.bz2_ncd({}, {}) = {}.".format( str1, str2, textdistance.bz2_ncd(str1, str2))) print("textdistance.bz2_ncd.distance({}, {}) = {}.".format( str1, str2, textdistance.bz2_ncd.distance(str1, str2))) print("textdistance.bz2_ncd.similarity({}, {}) = {}.".format( str1, str2, textdistance.bz2_ncd.similarity(str1, str2))) print("textdistance.bz2_ncd.normalized_distance({}, {}) = {}.".format( str1, str2, textdistance.bz2_ncd.normalized_distance(str1, str2))) print( "textdistance.bz2_ncd.normalized_similarity({}, {}) = {}.".format( str1, str2, textdistance.bz2_ncd.normalized_similarity(str1, str2))) print("textdistance.BZ2NCD().distance({}, {}) = {}.".format( str1, str2, textdistance.BZ2NCD().distance(str1, str2))) print("textdistance.lzma_ncd({}, {}) = {}.".format( str1, str2, textdistance.lzma_ncd(str1, str2))) print("textdistance.lzma_ncd.distance({}, {}) = {}.".format( str1, str2, textdistance.lzma_ncd.distance(str1, str2))) print("textdistance.lzma_ncd.similarity({}, {}) = {}.".format( str1, str2, textdistance.lzma_ncd.similarity(str1, str2))) print("textdistance.lzma_ncd.normalized_distance({}, {}) = {}.".format( str1, str2, textdistance.lzma_ncd.normalized_distance(str1, str2))) print( "textdistance.lzma_ncd.normalized_similarity({}, {}) = {}.".format( str1, str2, textdistance.lzma_ncd.normalized_similarity(str1, str2))) print("textdistance.LZMANCD().distance({}, {}) = {}.".format( str1, str2, textdistance.LZMANCD().distance(str1, str2))) print("textdistance.zlib_ncd({}, {}) = {}.".format( str1, str2, textdistance.zlib_ncd(str1, str2))) print("textdistance.zlib_ncd.distance({}, {}) = {}.".format( str1, str2, textdistance.zlib_ncd.distance(str1, str2))) print("textdistance.zlib_ncd.similarity({}, {}) = {}.".format( str1, str2, textdistance.zlib_ncd.similarity(str1, str2))) print("textdistance.zlib_ncd.normalized_distance({}, {}) = {}.".format( str1, 
str2, textdistance.zlib_ncd.normalized_distance(str1, str2))) print( "textdistance.zlib_ncd.normalized_similarity({}, {}) = {}.".format( str1, str2, textdistance.zlib_ncd.normalized_similarity(str1, str2))) print("textdistance.ZLIBNCD().distance({}, {}) = {}.".format( str1, str2, textdistance.ZLIBNCD().distance(str1, str2))) #-------------------- # Phonetic. if False: print("textdistance.mra({}, {}) = {}.".format( str1, str2, textdistance.mra(str1, str2))) print("textdistance.mra.distance({}, {}) = {}.".format( str1, str2, textdistance.mra.distance(str1, str2))) print("textdistance.mra.similarity({}, {}) = {}.".format( str1, str2, textdistance.mra.similarity(str1, str2))) print("textdistance.mra.normalized_distance({}, {}) = {}.".format( str1, str2, textdistance.mra.normalized_distance(str1, str2))) print("textdistance.mra.normalized_similarity({}, {}) = {}.".format( str1, str2, textdistance.mra.normalized_similarity(str1, str2))) print("textdistance.MRA().distance({}, {}) = {}.".format( str1, str2, textdistance.MRA().distance(str1, str2))) print("textdistance.editex({}, {}) = {}.".format( str1, str2, textdistance.editex(str1, str2))) print("textdistance.editex.distance({}, {}) = {}.".format( str1, str2, textdistance.editex.distance(str1, str2))) print("textdistance.editex.similarity({}, {}) = {}.".format( str1, str2, textdistance.editex.similarity(str1, str2))) print("textdistance.editex.normalized_distance({}, {}) = {}.".format( str1, str2, textdistance.editex.normalized_distance(str1, str2))) print("textdistance.editex.normalized_similarity({}, {}) = {}.".format( str1, str2, textdistance.editex.normalized_similarity(str1, str2))) print( "textdistance.Editex(local=False, match_cost=0, group_cost=1, mismatch_cost=2, groups=None, ungrouped=None, external=True).distance({}, {}) = {}." 
.format( str1, str2, textdistance.Editex(local=False, match_cost=0, group_cost=1, mismatch_cost=2, groups=None, ungrouped=None, external=True).distance(str1, str2))) #-------------------- # Simple. if False: print("textdistance.prefix({}, {}) = {}.".format( str1, str2, textdistance.prefix(str1, str2))) print("textdistance.prefix.distance({}, {}) = {}.".format( str1, str2, textdistance.prefix.distance(str1, str2))) print("textdistance.prefix.similarity({}, {}) = {}.".format( str1, str2, textdistance.prefix.similarity(str1, str2))) print("textdistance.prefix.normalized_distance({}, {}) = {}.".format( str1, str2, textdistance.prefix.normalized_distance(str1, str2))) print("textdistance.prefix.normalized_similarity({}, {}) = {}.".format( str1, str2, textdistance.prefix.normalized_similarity(str1, str2))) print( "textdistance.Prefix(qval={}, sim_test=None).distance({}, {}) = {}." .format( qval, str1, str2, textdistance.Prefix(qval=qval, sim_test=None).distance(str1, str2))) print("textdistance.postfix({}, {}) = {}.".format( str1, str2, textdistance.postfix(str1, str2))) print("textdistance.postfix.distance({}, {}) = {}.".format( str1, str2, textdistance.postfix.distance(str1, str2))) print("textdistance.postfix.similarity({}, {}) = {}.".format( str1, str2, textdistance.postfix.similarity(str1, str2))) print("textdistance.postfix.normalized_distance({}, {}) = {}.".format( str1, str2, textdistance.postfix.normalized_distance(str1, str2))) print( "textdistance.postfix.normalized_similarity({}, {}) = {}.".format( str1, str2, textdistance.postfix.normalized_similarity(str1, str2))) #print("textdistance.Postfix(qval={}, sim_test=None).distance({}, {}) = {}.".format(qval, str1, str2, textdistance.Postfix(qval=qval, sim_test=None).distance(str1, str2))) print("textdistance.Postfix().distance({}, {}) = {}.".format( str1, str2, textdistance.Postfix().distance(str1, str2))) print("textdistance.length({}, {}) = {}.".format( str1, str2, textdistance.length(str1, str2))) 
print("textdistance.length.distance({}, {}) = {}.".format( str1, str2, textdistance.length.distance(str1, str2))) print("textdistance.length.similarity({}, {}) = {}.".format( str1, str2, textdistance.length.similarity(str1, str2))) print("textdistance.length.normalized_distance({}, {}) = {}.".format( str1, str2, textdistance.length.normalized_distance(str1, str2))) print("textdistance.length.normalized_similarity({}, {}) = {}.".format( str1, str2, textdistance.length.normalized_similarity(str1, str2))) print("textdistance.Length().distance({}, {}) = {}.".format( str1, str2, textdistance.Length().distance(str1, str2))) print("textdistance.identity({}, {}) = {}.".format( str1, str2, textdistance.identity(str1, str2))) print("textdistance.identity.distance({}, {}) = {}.".format( str1, str2, textdistance.identity.distance(str1, str2))) print("textdistance.identity.similarity({}, {}) = {}.".format( str1, str2, textdistance.identity.similarity(str1, str2))) print("textdistance.identity.normalized_distance({}, {}) = {}.".format( str1, str2, textdistance.identity.normalized_distance(str1, str2))) print( "textdistance.identity.normalized_similarity({}, {}) = {}.".format( str1, str2, textdistance.identity.normalized_similarity(str1, str2))) print("textdistance.Identity().distance({}, {}) = {}.".format( str1, str2, textdistance.Identity().distance(str1, str2))) print("textdistance.matrix({}, {}) = {}.".format( str1, str2, textdistance.matrix(str1, str2))) print("textdistance.matrix.distance({}, {}) = {}.".format( str1, str2, textdistance.matrix.distance(str1, str2))) print("textdistance.matrix.similarity({}, {}) = {}.".format( str1, str2, textdistance.matrix.similarity(str1, str2))) print("textdistance.matrix.normalized_distance({}, {}) = {}.".format( str1, str2, textdistance.matrix.normalized_distance(str1, str2))) print("textdistance.matrix.normalized_similarity({}, {}) = {}.".format( str1, str2, textdistance.matrix.normalized_similarity(str1, str2))) print( 
"textdistance.Matrix(mat=None, mismatch_cost=0, match_cost=1, symmetric=True, external=True).distance({}, {}) = {}." .format( str1, str2, textdistance.Matrix(mat=None, mismatch_cost=0, match_cost=1, symmetric=True, external=True).distance(str1, str2)))
import pathlib
import textdistance
from fuzzywuzzy import fuzz

if __name__ == "__main__":
    # Compare two sample strings with fuzzywuzzy, then try every attribute
    # exported by textdistance and print those that return a number when
    # called with the two strings.
    string_a = 'test'
    string_b = 'text'

    fuzz_result = fuzz.ratio(string_a, string_b)
    print("Fuzzy Result: ", fuzz_result)

    for function_name in dir(textdistance):
        try:
            function = getattr(textdistance, function_name)
            result = function(string_a, string_b)
        except Exception:
            # Many module attributes are not callable with two strings
            # (submodules, constants, classes with other signatures); they
            # are skipped on purpose.  Catching Exception instead of using
            # a bare ``except`` keeps Ctrl-C and SystemExit working.
            continue
        if isinstance(result, (int, float)):
            print("{}\t{}".format(function_name, result))
# Public ``method`` name -> name of the textdistance attribute implementing it.
_SEQ_DISTANCE_METHODS = {
    "hamming": "hamming",
    "levenshtein": "levenshtein",
    "damerau_lev": "damerau_levenshtein",
    "j-winkler": "jaro_winkler",
    "smith-waterman": "smith_waterman",
    "jaccard": "jaccard",
    "sorensen-dice": "sorensen_dice",
    "tversky": "tversky",
    "tanimoto": "tanimoto",
    "cosine": "cosine",
    "ratcliff": "ratcliff_obershelp",
    "bwt": "bwtrle_ncd",
}


def Seq_StringDistance(str_seq, str_ref, method="hamming"):
    """Score every string in ``str_seq`` against ``str_ref``.

    Args:
        str_seq: iterable of strings to compare.
        str_ref: reference string each element is compared with.
        method: one of the keys of ``_SEQ_DISTANCE_METHODS`` ("hamming",
            "levenshtein", "damerau_lev", "j-winkler", "smith-waterman",
            "jaccard", "sorensen-dice", "tversky", "tanimoto", "cosine",
            "ratcliff", "bwt").

    Returns:
        A list holding, for each element of ``str_seq`` in order, the value
        returned by calling the selected textdistance algorithm with
        ``(element, str_ref)``.  ``None`` when ``method`` is not a known
        name (unchanged from the previous behaviour).
    """
    # The method name is compared by value.  The previous ``method is "..."``
    # identity checks only worked for interned string objects, so names
    # built at runtime (read from a file, concatenated, ...) fell through
    # every branch and returned None.
    metric_name = _SEQ_DISTANCE_METHODS.get(method)
    if metric_name is None:
        return None
    return [
        getattr(textdistance, metric_name)(str_seq_i, str_ref)
        for str_seq_i in str_seq
    ]
# ## Example Method # # Hamming # # https://github.com/life4/textdistance # In[2]: import textdistance # In[3]: textdistance.hamming('test', 'text') # In[4]: textdistance.hamming.distance('test', 'text') # In[5]: textdistance.hamming.similarity('test', 'text') # In[6]:
def getEntityLocation(location_string):
    """Find census state / district / sub-district names in a location string.

    The census table ``root + "/Location/census_hindi_sd.csv"`` is read on
    every call and split into state, district and sub-district frames.  The
    input is split on commas (duplicates dropped, order kept) and an entity
    is matched against the ``name_hi`` column in three stages:

    1. Direct match: a name, surrounded by spaces, occurs in the entity.
       Districts are searched only inside already matched states, and
       sub-districts only inside already matched districts/states.
    2. Back-propagation: the parent district/state of every match is added.
    3. Approximate match (only when no state and/or district was found):
       word n-grams of the entity are compared with each candidate name
       using the Hamming distance divided by the n-gram length.

    Args:
        location_string: comma separated location text.  The literal
            ``'n'`` is treated as "cannot be processed".

    Returns:
        A tuple ``(score, S, D, SD, PT)`` where ``S``, ``D`` and ``SD`` are
        lists of matched state, district and sub-district names and ``PT``
        is always the empty string.  ``score`` is ``-1`` for the ``'n'``
        input, ``0`` when no approximate match had to be appended, and
        otherwise the normalised distance of the appended approximate match.

    NOTE(review): ``root`` and ``find_between`` are not defined in this
    block; they are presumably module-level names - confirm.
    NOTE(review): every path through the loop body returns, so only the
    first entity of ``location_entity`` is ever processed - confirm that
    this is intended.
    """
    census_hi_file = root + "/Location/census_hindi_sd.csv"
    cdf = pandas.read_csv(census_hi_file)
    cdf['name_hi'] = cdf['name_hi'].str.strip()

    ############ SEGREGATING ALL THE STATE, DISTRICTS, SUB-DISTRICTs, PANCHAYATS/TOWNS, M. CORP. INTO DIFFERENT DATAFRAMES################
    # A row belongs to a level when all finer-grained codes are 0 and all
    # codes up to that level are non-zero.
    states = cdf[['state_code', 'name_en', 'name_hi'
                  ]][(cdf.district_code == 0) & (cdf.subdistrict_code == 0) &
                     (cdf.panchayat_town_code == 0) & (cdf.state_code != 0)]
    state = states.set_index('state_code')
    districts = cdf[[
        'district_code', 'name_en', 'name_hi'
    ]][(cdf.subdistrict_code == 0) & (cdf.panchayat_town_code == 0) &
       (cdf.state_code != 0) & (cdf.district_code != 0)]
    district = districts.set_index('district_code')
    sub_districts = cdf[[
        'subdistrict_code', 'name_en', 'name_hi'
    ]][(cdf.panchayat_town_code == 0) & (cdf.state_code != 0) &
       (cdf.district_code != 0) & (cdf.subdistrict_code != 0)]
    # sub_district and panchayats_towns are built but not read again below.
    sub_district = sub_districts.set_index('subdistrict_code')
    panchayats_towns = cdf[[
        'panchayat_town_code', 'name_en', 'name_hi'
    ]][(cdf.state_code != 0) & (cdf.district_code != 0) &
       (cdf.subdistrict_code != 0) & (cdf.panchayat_town_code != 0)]

    ################################################# MAIN LOOP STARTS ##################################################################
    flag_perfectmatch = False  #flag to track the perfect match or not
    S = []  # matched state names
    D = []  # matched district names
    SD = []  # matched sub-district names
    PT = ""  # never filled in by this function
    Loc_hd = ""
    list_output = []  # appended to below but never returned
    locations = location_string
    if locations == 'n':  #location contains NaN and can't be processed
        # print("can't understand")
        return (-1, S, D, SD, PT)
    location_entity = list(dict.fromkeys(
        locations.split(',')))  #separating the entities
    # print("Entities : ",location_entity)
    for location in location_entity:  #### for a single entity in a loop
        #### State Direct Match Code
        # Only the first state found is kept; its text is removed from the
        # entity so it cannot match again at a lower level.
        for loc in list(states["name_hi"]):
            if " " + loc + " " in location:
                S.append(loc)
                location = location.replace(loc, '')
                alphastate = 0
                flag_perfectmatch = True
                break

        #### District Direct Match Code
        if len(S) != 0:
            # Restrict the district candidates to the matched state.  When S
            # holds several states, the candidates of the last one win.
            for s in S:
                indexnum = cdf[cdf['name_hi'] == s].index.values.astype(int)[0]
                statenum = cdf["state_code"][indexnum]
                inState = find_between(cdf, 'state_code', statenum,
                                       statenum + 1)
                # The boolean mask is built from the full cdf, not inState.
                districtsinState = inState[[
                    'district_code', 'name_en', 'name_hi'
                ]][(cdf.subdistrict_code == 0) &
                   (cdf.panchayat_town_code == 0) & (cdf.state_code != 0) &
                   (cdf.district_code != 0)]
                possibledistricts = districtsinState
                # print(possibledistricts)
        else:
            possibledistricts = districts
        for loc in list(possibledistricts["name_hi"]):
            if " " + loc + " " in location:
                D.append(loc)
                location = location.replace(loc, '')
                alphadistrict = 0
                flag_perfectmatch = True
                break

        #### Subdistrict Direct Match Code
        # Candidates come from the matched district if there is one, else
        # from the matched state, else from the whole table.
        if len(D) != 0:
            for d in D:
                indexnum = cdf[cdf['name_hi'] == d].index.values.astype(int)[0]
                districtnum = cdf["district_code"][indexnum]
                inDistrict = find_between(cdf, 'district_code', districtnum,
                                          districtnum + 1)
                subdistrictsinstate = inDistrict[[
                    'subdistrict_code', 'name_en', 'name_hi'
                ]][(cdf.panchayat_town_code == 0) & (cdf.state_code != 0) &
                   (cdf.district_code != 0) & (cdf.subdistrict_code != 0)]
                possiblesubdistricts = subdistrictsinstate
        elif len(S) != 0:
            for s in S:
                indexnum = cdf[cdf['name_hi'] == s].index.values.astype(int)[0]
                statenum = cdf["state_code"][indexnum]
                inState = find_between(cdf, 'state_code', statenum,
                                       statenum + 1)
                subdistrictsinstate = inState[[
                    'subdistrict_code', 'name_en', 'name_hi'
                ]][(cdf.panchayat_town_code == 0) & (cdf.state_code != 0) &
                   (cdf.district_code != 0) & (cdf.subdistrict_code != 0)]
                possiblesubdistricts = subdistrictsinstate
        else:
            possiblesubdistricts = sub_districts
        for loc in list(possiblesubdistricts["name_hi"]):
            if " " + loc + " " in location:
                SD.append(loc)
                location = location.replace(loc, '')
                alphasubdistrict = 0
                flag_perfectmatch = True
                break

        #### Backpropagate States, Districts
        # Fill in the parent district of a matched sub-district, and the
        # parent state of a matched district, when those were not matched
        # directly.
        if len(D) == 0 and len(SD) != 0:
            for sd in SD:
                l = (
                    possiblesubdistricts[possiblesubdistricts["name_hi"] == sd]
                ).index.tolist()  #Index of all matched rows
                for ll in l:  #for each index look up the corresponding District
                    # print("District Code: ",cdf.at[ll,'district_code'],", District: ",district.at[cdf.at[ll,'district_code'],'name_hi'])
                    D.append(district.at[cdf.at[ll, 'district_code'],
                                         'name_hi'])
        if len(S) == 0 and len(D) != 0:
            for d in D:
                l = (possibledistricts[possibledistricts["name_hi"] == d]
                     ).index.tolist()  #Index of all matched rows
                for ll in l:  #for each index look up the corresponding State
                    # print("State Code: ",cdf.at[ll,'state_code'],", State: ",state.at[cdf.at[ll,'state_code'],'name_hi'])
                    S.append(state.at[cdf.at[ll, 'state_code'], 'name_hi'])

        #### Approximate Matching
        # For each candidate name, slide a window with the same number of
        # words over the entity and keep the candidate with the smallest
        # length-normalised Hamming distance.  10 is the "nothing found"
        # starting value.
        if len(S) == 0:
            min_d_state = 10
            min_s_state = ""
            for loc in list(states["name_hi"]):
                lenloc = len(loc.split())
                tokenised_instance = location.split()
                ngrams = list(
                    zip(*[tokenised_instance[i:] for i in range(lenloc)]))
                ngrams = [' '.join(ngram) for ngram in ngrams]
                for ng in ngrams:
                    d = textdistance.hamming(ng, loc) / len(
                        ng)  #Hamming textdistance algo
                    if (d < min_d_state):
                        min_s_state = loc
                        min_d_state = d
        if len(D) == 0:
            min_d_district = 10
            min_s_district = ""
            for loc in list(possibledistricts["name_hi"]):
                lenloc = len(loc.split())
                tokenised_instance = location.split()
                ngrams = list(
                    zip(*[tokenised_instance[i:] for i in range(lenloc)]))
                ngrams = [' '.join(ngram) for ngram in ngrams]
                for ng in ngrams:
                    d = textdistance.hamming(ng, loc) / len(
                        ng)  #Hamming textdistance algo
                    if (d < min_d_district):
                        min_s_district = loc
                        min_d_district = d

        # alpha stays 10 unless an approximate match is appended below.
        alpha = 10
        if len(S) == 0 and len(D) == 0:
            flag_perfectmatch = False
            # Keep whichever of the two approximate candidates is closer;
            # on a tie the state wins.
            if min_s_state != "" and min_s_district != "":
                if min_d_district < min_d_state:
                    D.append(min_s_district)
                    alpha = min_d_district
                else:
                    S.append(min_s_state)
                    alpha = min_d_state
            elif min_s_district != "":
                D.append(min_s_district)
                alpha = min_d_district
            elif min_s_state != "":
                S.append(min_s_state)
                alpha = min_d_state
        elif len(D) == 0:
            # A state is known but no district: take the approximate district.
            if min_s_district != "":
                D.append(min_s_district)
                alpha = min_d_district

        #### Backpropagate States, Districts
        # Same parent look-up as above, repeated for approximate matches.
        if len(D) == 0 and len(SD) != 0:
            for sd in SD:
                l = (
                    possiblesubdistricts[possiblesubdistricts["name_hi"] == sd]
                ).index.tolist()  #Index of all matched rows
                for ll in l:  #for each index look up the corresponding District
                    # print("District Code: ",cdf.at[ll,'district_code'],", District: ",district.at[cdf.at[ll,'district_code'],'name_hi'])
                    D.append(district.at[cdf.at[ll, 'district_code'],
                                         'name_hi'])
        if len(S) == 0 and len(D) != 0:
            for d in D:
                l = (possibledistricts[possibledistricts["name_hi"] == d]
                     ).index.tolist()  #Index of all matched rows
                for ll in l:  #for each index look up the corresponding State
                    # print("State Code: ",cdf.at[ll,'state_code'],", State: ",state.at[cdf.at[ll,'state_code'],'name_hi'])
                    S.append(state.at[cdf.at[ll, 'state_code'], 'name_hi'])

        list_output.append(locations)
        # print(S, D, SD, PT)
        if alpha == 10:
            # No approximate match was appended: report a score of 0.
            list_output.append("Yes")
            return (0, S, D, SD, PT)
        else:
            list_output.append("No")
            return (alpha, S, D, SD, PT)
    return
# Record distance, employee count, URL and rank for every company search
# result, skipping showcase pages, personal profiles and truncated URLs.
rank = 1
for i, item in enumerate(items):
    formattedUrl = item.get("formattedUrl")
    if any(marker in formattedUrl for marker in ("/showcase/", "/in/", "/.../")):
        continue

    snippet = item.get("snippet")
    title = item.get("title")  # formatted "company name | LinkedIn"
    li_firm = title.split('|')[0].rstrip()
    li_dict[li_firm]["jaro"] = textdistance.hamming(firm_clnd, li_firm)

    # Drop thousands separators / full stops before looking for the
    # employee count pattern.
    snippet_clnd = re.sub("[,.]", "", snippet)
    emps = re.findall(p, snippet_clnd)
    li_dict[li_firm]["emps"] = int(emps[0]) if emps else 0
    print(f"{li_firm}: {li_dict[li_firm]['emps']}")

    li_dict[li_firm]["formattedUrl"] = formattedUrl
    li_dict[li_firm]["rank"] = rank
    rank += 1

# Firm names ordered from most to fewest employees.
srtd_keys = sorted(li_dict, key=lambda firm: li_dict[firm]['emps'],
                   reverse=True)
def modify_file_content(file_content, file_path):
    """Rewrite an exported wiki page line by line.

    The text is passed through ``HTMLEntitiesToUnicode`` and then each line
    is either dropped, replaced by fixed front-matter lines, or rewritten:
    image links become ``figure_caption.html`` includes, wiki links become
    ``post_url`` links to the closest matching entry of ``files``,
    ``[n] <url>`` references become ``n. [url](url)``, remaining ``[n]``
    markers become ``<sup>n</sup>`` and HTML tags are stripped.

    Args:
        file_content: full text of the page.
        file_path: path of the page; only used to pick the category that is
            written next to the ``date:`` line.

    Returns:
        list of output lines.  Lines produced at the end of the loop body
        end with ``'\\n'``.

    NOTE(review): ``HTMLEntitiesToUnicode``, ``files``, ``np``, ``re`` and
    ``textdistance`` are not defined in this block; presumably module-level
    names - confirm.
    """
    # Declared but not read or written anywhere in this function.
    global count
    file_content = HTMLEntitiesToUnicode(file_content)
    new_file_lines = []
    for line in file_content.split('\n'):
        new_line = line
        if 'title: Main Page' in line:
            new_file_lines.append('title: Wiki Main Page\n')
            continue
        # Lines mentioning permalink or #drawio are dropped entirely.
        if 'permalink' in line or '#drawio' in line:
            continue
        if '[Category:Projects]' in line:
            # Keep only what follows the first ')' (plus one more character).
            new_line = new_line[new_line.index(')')+2:]
        if '[:Category:Projects]' in line:
            new_file_lines.append("- [Category: Projects]({{ site.baseurl }}/wiki/categories/wikiprojects)\n")
            continue
        if 'date: ' in line:
            # The date line itself is not copied; the layout/category
            # front matter below is emitted in its place.
            print(f'Appending layout line')
            new_file_lines.append('layout: wiki_post\n')
            new_file_lines.append('base: Wiki\n')
            new_file_lines.append('base_url: /wiki\n')
            if 'tensorboard' in file_path:
                new_file_lines.append('categories:\n - wikitools\n')
            elif any([p.lower().replace(' ','_') in file_path for p in ['cifar10_classifier', 'examples', 'iris_classifier', 'Male or Female classifier', 'MNIST classifier', 'Word Embeddings']]):
                new_file_lines.append('categories:\n - wikiprojects\n')
            else:
                new_file_lines.append('categories:\n - wikimisc\n')
            continue

        # Links with empty text, "[](target)": use the target as the text.
        # NOTE(review): the replaced line is appended without a trailing
        # newline and new_line itself is left unchanged; the ``continue``
        # only affects this inner loop, so the line is processed and
        # appended again below - confirm this is intended.
        empty_link_regex = r'(\[\]\(([^\)]*)\))'
        empty_link_matches = re.findall(empty_link_regex, new_line)
        for empty_link_match in empty_link_matches:
            new_link = f'[{empty_link_match[1]}]({empty_link_match[1]})'
            new_file_lines.append(new_line.replace(empty_link_match[0], new_link))
            print(f'New link found, replacing with {new_link}')
            continue

        # Markdown links "[text](target ...)".  Tuple layout from findall:
        # [0] whole match (including the preceding non-word character),
        # [2] link text, [3] optional "[n]" inside the text, [4] target up
        # to the first space or '#'.
        img_src_regex = r'((\W|^)\[([^\[\]]*)(\[\d*\])?\]\(([^\ \[\]#]*)[^\)\[\]]*\))'
        img_src_regex_matches = re.findall(img_src_regex, new_line, flags=re.IGNORECASE|re.MULTILINE)
        for img_src_regex_match in img_src_regex_matches:
            # don't mess with normal links
            if 'http' in img_src_regex_match[2]:
                continue
            src = str(img_src_regex_match[4])
            src = src.replace('/File:', '/assets/img/wiki/')
            # make sure we only do this to images
            if src.split('.')[-1] in ['jpeg','jpg','png','gif','svg']:
                caption = str(img_src_regex_match[2])
                caption = caption.replace('thumb|','')
                # new_line = new_line.replace(img_src_regex_match.group(0), f"![{caption}]({src})")
                new_image_s = '{%% include figure_caption.html url="%s" description="%s" %%}' % (src, caption.replace('|', ','))
                new_line = new_line.replace(img_src_regex_match[0], new_image_s)
                print(f'Found image tag with src {img_src_regex_match[2]}, replacing with {new_line}')
            elif 'wikilink' in str(img_src_regex_match[0]).lower() and 'file:' not in str(img_src_regex_match[0]).lower():
                # Wiki-internal link: find the known files whose path
                # contains the lower-cased target and link to the one with
                # the smallest Hamming distance to it.
                title = img_src_regex_match[4]
                title = title.replace('/','').lower()
                found_files = []
                for f in files:
                    if title in f:
                        found_files.append(f)
                if len(found_files) > 0:
                    distances = np.zeros(len(found_files))
                    for i in range(len(distances)):
                        distances[i] = textdistance.hamming(title, found_files[i])
                    closest_file = found_files[np.argmin(distances)]
                    post_link = "{% post_url /wiki/" + closest_file.split('/')[-1].replace('.md','') + "%}"
                    new_link_s = f'[{img_src_regex_match[2]}]({post_link})'
                    if 'file:' not in new_link_s:
                        new_line = new_line.replace(img_src_regex_match[0], new_link_s)
                        print(f'Found link tag with src {img_src_regex_match[3]}, replacing "{img_src_regex_match[0]}" -> {new_link_s}')
                else:
                    print(f'welpy {title} not found in {files} files')

        # References of the form "[n] <url>" become "n. [url](url)".
        reference_regex = r'(\[(\d*)\]\W?<([^>]*)>)'
        reference_regex_matches = re.findall(reference_regex, new_line)
        for reference_regex_match in reference_regex_matches:
            reference = reference_regex_match[1] + f'. [{reference_regex_match[2]}]({reference_regex_match[2]})'
            new_line = new_line.replace(reference_regex_match[0], reference)
            print(f'Replaced reference {reference_regex_match[0]} with {reference}')

        # Any remaining "[n]" marker becomes a superscript number.
        reference_number_regex = r'(\[(\d*)\])'
        reference_number_matches = re.findall(reference_number_regex, new_line)
        for reference_number_match in reference_number_matches:
            # scroll_to_bottom = f'<a href="javascript: document.body.scrollIntoView(false);">{reference_number_match[1]}</a>'
            reference_nr = f'<sup>{reference_number_match[1]}</sup>'
            new_line = new_line.replace(reference_number_match[0], reference_nr)

        # latex_regex = r'(\$[^\$]*\$)'
        # latex_regex_matches = re.findall(latex_regex, new_line)
        # for latex_regex_match in latex_regex_matches:
        #     new_latex = '$' + latex_regex_match[0].replace("\\","\\") + '$'
        #     new_line = new_line.replace(latex_regex_match[0], new_latex)

        # remove all html tags except those in github links
        if '<http' not in new_line:
            new_line = re.sub(r'<\/?[^>]*>', '', new_line)
        new_line += '\n'
        new_file_lines.append(new_line)
    return new_file_lines
def best_match(m):
    """Return the index of the entry of ``m`` that scores lowest.

    Each entry's lower-cased ``['Data']['commonName']`` is scored against
    the lower-cased ``f['Facility Name']`` with ``hamming``; ``f`` and
    ``hamming`` come from the enclosing scope.  On ties the first entry
    wins.  An empty ``m`` raises ``ValueError``.
    """
    scores = []
    for candidate in m:
        facility_name = f['Facility Name'].lower()
        common_name = candidate['Data']['commonName'].lower()
        scores.append(hamming(facility_name, common_name))
    return min(range(len(scores)), key=scores.__getitem__)
def fitness(self, word):
    """Return ``textdistance.hamming`` of ``word`` and ``self.constants.solution``."""
    target = self.constants.solution
    return textdistance.hamming(word, target)
def main(argv):
    """Merge the sarpanch village list with the Maharashtra results file.

    Reads the PUNE and SOLAPUR rows of ``results_MAHARASHTRA_2020.csv``,
    grouped by district and block, then matches every row of
    ``sarpanch.csv`` to the panchayat of the same block whose name has the
    smallest Hamming distance.  When the best distance is above 10 the four
    closest candidates are printed and the user picks one through
    ``read_user_input()`` (1-4 selects a candidate, 5 selects "no match").
    The merged rows are written to ``merge_sarpanch.csv`` with
    ``CsvWriter.write``.

    Returns ``None``; returns early when either input file is missing.
    Raises ``Exception`` for an unknown block code.  ``argv`` is not used.
    """
    file_path = f'C:/Data_PoloFr/scrap-python-indian-gov/results_wide/results_MAHARASHTRA_2020.csv'
    if not os.path.isfile(file_path):
        return

    all_villages = {
        'PUNE': {},
        'SOLAPUR': {}
    }
    # newline='' is required by the csv module so that line breaks inside
    # quoted fields are parsed correctly.
    with open(file_path, 'r', newline='') as original:
        lines = csv.reader(original, delimiter=',')
        next(lines, None)  # skip the header row
        for line in lines:
            district = line[3].strip().upper()
            if district not in all_villages:
                continue
            block_name = line[5].strip().upper()
            panchayat_name = line[7].strip().upper()
            panchayat_id = line[9].strip().upper()
            all_villages[district].setdefault(block_name, []).append({
                'name': panchayat_name,
                'id': panchayat_id,
                'line': line
            })

    file_path = f'C:/Data_PoloFr/scrap-python-indian-gov/villages/sarpanch.csv'
    if not os.path.isfile(file_path):
        return

    # Numeric codes used in sarpanch.csv.
    block_names = {
        '1': 'MADHA', '2': 'AKKALKOT', '3': 'SOUTH SOLAPUR',
        '4': 'PANDHARPUR', '5': 'MOHOL', '6': 'BHOR', '7': 'BARAMATI',
        '8': 'DAUND', '9': 'MULSHI', '10': 'KHED',
    }
    # District code -> (district name, blocks that belong to it).
    district_blocks = {
        '1': ('SOLAPUR', ('MADHA', 'AKKALKOT', 'SOUTH SOLAPUR',
                          'PANDHARPUR', 'MOHOL')),
        '2': ('PUNE', ('BHOR', 'BARAMATI', 'DAUND', 'MULSHI', 'KHED')),
    }

    result_lines = []
    with open(file_path, 'r', newline='') as original:
        lines = csv.reader(original, delimiter=',')
        next(lines, None)  # skip the header row
        for line in lines:
            village_id = line[0]
            # Strip the various spellings of "gram panchayat" and a few
            # hand-fixed names so that only the village name is compared.
            village_name = line[5].upper().replace('GRAMPANCHAYAT', '').replace('GRAMPANCHAYT', '')\
                .replace(', AKKALKOT', '').replace('(BHOINJE)', '').replace('SAPATNE(BHO)', 'SAPATNE (BHOSE)')\
                .replace('GRAMPANCHYAT', '').replace('GRAMPANACHAYAT', '').replace('GRAM PANCHAYT', '')\
                .replace('GRAMAPANCHAYAT', '').strip()

            block_name = block_names.get(line[7])
            if block_name is None:
                raise Exception(f'No block_name found for {line}')

            district_entry = district_blocks.get(line[6])
            if district_entry is None:
                print(f'No district found for {line}')
                continue
            district_name, valid_blocks = district_entry
            if block_name not in valid_blocks:
                print(f'District and block mistmatch for {line}')
                continue

            cmp_results = []
            for village in all_villages[district_name][block_name]:
                cmp_results.append({
                    'score': textdistance.hamming(village_name, village['name']),
                    'match': village['name'],
                    'id': village['id'],
                    'line': village['line']
                })
            cmp_results.sort(key=lambda v: v['score'])
            print(f'{district_name} - {block_name} - {village_name} vs {cmp_results[0]["match"]} = {cmp_results[0]["score"]}')

            matched_line = cmp_results[0]['line']
            if cmp_results[0]['score'] > 10:
                # Poor best match: let the user choose among the closest four.
                candidates = cmp_results[0:4]
                for cmp_result in candidates:
                    print('{:>2} {}'.format(cmp_result['score'], cmp_result['match']))
                print()
                selected_row = read_user_input() - 1
                # Only accept indexes of candidates that were displayed.
                # Previously a choice of 0 or less indexed from the end of
                # the full list and a choice beyond a short candidate list
                # raised IndexError; both now keep the best match.
                if 0 <= selected_row < len(candidates):
                    matched_line = candidates[selected_row]['line']
                elif selected_row == 4:
                    matched_line = []

            new_line = [village_id, village_name] + matched_line
            result_lines.append(new_line)

    new_file_path = f'C:/Data_PoloFr/scrap-python-indian-gov/villages/merge_sarpanch.csv'
    CsvWriter.write(new_file_path, result_lines)
def main(argv):
    """Merge the Haryana village list with the Haryana results file.

    Reads ``results_HARYANA_2020.csv`` grouped by district and block, then
    matches every row of ``Haryana_new_incomplete.csv`` to the panchayat of
    the same block whose name has the smallest Hamming distance.  When the
    best distance is above 10 the four closest candidates are printed and
    the user picks one through ``read_user_input()`` (1-4 selects a
    candidate, 5 selects "no match"); the answer is remembered per
    district/block/village so the same question is asked only once.  The
    merged rows are written to ``merge_sarpanch_haryana.csv`` with
    ``CsvWriter.write``.

    Returns ``None``; returns early when either input file is missing.
    Raises ``Exception`` when a row names a district or block that is not
    present in the results file.  ``argv`` is not used.
    """
    file_path = f'C:/Data_PoloFr/scrap-python-indian-gov/results_wide/results_HARYANA_2020.csv'
    if not os.path.isfile(file_path):
        return

    all_villages = {}
    chosen_matches = {}
    # newline='' is required by the csv module so that line breaks inside
    # quoted fields are parsed correctly.
    with open(file_path, 'r', newline='') as original:
        lines = csv.reader(original, delimiter=',')
        next(lines, None)  # skip the header row
        for line in lines:
            district = line[3].strip().upper()
            block_name = line[5].strip().upper().replace(' (PART)', '')
            panchayat_name = line[7].strip().upper()
            panchayat_id = line[9].strip().upper()
            all_villages.setdefault(district, {}).setdefault(block_name, []).append({
                'name': panchayat_name,
                'id': panchayat_id,
                'line': line
            })

    file_path = f'C:/Data_PoloFr/scrap-python-indian-gov/villages/Haryana_new_incomplete.csv'
    if not os.path.isfile(file_path):
        return

    result_lines = []
    with open(file_path, 'r', newline='') as original:
        lines = csv.reader(original, delimiter=',')
        next(lines, None)  # skip the header row
        for line in lines:
            village_id = line[0]
            village_name = line[1].upper().strip()
            district_name = line[12].upper().strip()
            # Normalise the block spelling to the one used in the results
            # file before it is used as a lookup key.
            block_name = line[13].replace(' 1', '-I').replace(' 2', '-II').replace('Bhattu', 'Bhattu Kalan')\
                .replace('Ballabhgarh', 'Ballabgarh').replace('Nissing', 'Nissing At Chirao')\
                .replace('Meham', 'Maham').replace('Lakhan', 'Lakhan Majra') \
                .replace('GHARAUNDA (PART)', 'GHARAUNDA')\
                .replace('Block Saha', 'Saha').replace('Block Naraingarh', 'Naraingarh')\
                .replace('Block Shahzadpur', 'Shahzadpur').replace('Block Barara', 'Barara')\
                .upper().strip().replace('BLOCK ', f'{district_name}-')

            if all_villages.get(district_name) is None:
                raise Exception(f'Invalid district {district_name} for {line}')
            if all_villages[district_name].get(block_name) is None:
                raise Exception(f'Invalid {block_name} for {line}')

            cmp_results = []
            for village in all_villages[district_name][block_name]:
                cmp_results.append({
                    'score': textdistance.hamming(village_name, village['name']),
                    'match': village['name'],
                    'id': village['id'],
                    'line': village['line']
                })
            cmp_results.sort(key=lambda v: v['score'])
            print(
                f'{district_name} - {block_name} - {village_name} vs {cmp_results[0]["match"]} = {cmp_results[0]["score"]}'
            )

            matched_line = cmp_results[0]['line']
            if cmp_results[0]['score'] > 10:
                # Poor best match: reuse an earlier answer for the same
                # village, or ask the user to choose among the closest four.
                candidates = cmp_results[0:4]
                match_key = f'{district_name} - {block_name} - {village_name}'
                selected_row = chosen_matches.get(match_key)
                if selected_row is None:
                    for cmp_result in candidates:
                        print('{:>2} {}'.format(cmp_result['score'], cmp_result['match']))
                    print()
                    selected_row = read_user_input() - 1
                    chosen_matches[match_key] = selected_row
                # Only accept indexes of candidates that were displayed.
                # Previously a choice of 0 or less indexed from the end of
                # the full list and a choice beyond a short candidate list
                # raised IndexError; both now keep the best match.
                if 0 <= selected_row < len(candidates):
                    matched_line = candidates[selected_row]['line']
                elif selected_row == 4:
                    matched_line = []

            new_line = [village_id, village_name] + matched_line
            result_lines.append(new_line)

    new_file_path = f'C:/Data_PoloFr/scrap-python-indian-gov/villages/merge_sarpanch_haryana.csv'
    CsvWriter.write(new_file_path, result_lines)
def run(experiment):
    """Prepare directories, data and models for an experiment, then train.

    Steps, in order:
      1. (Re)create ``checkpoints/<name>`` and ``tensorboard/train/<name>``,
         deleting any previous content.
      2. Drop an empty marker file named after the current git commit hash
         into the log directory.
      3. Load the data through ``data_preprocessing.get_data`` and shuffle the
         train and validation pairs with a fixed seed (1234).
      4. Pick the reward function and the graph batch size.
      5. Build the training and evaluation ``rnn_model.RNN`` instances and
         hand over to ``train.reinforce_test`` or ``train.train``.

    Args:
        experiment: object exposing ``name``, ``hyperparams`` (with a ``get``
            method), ``data`` (subscriptable), ``tokenization``,
            ``train_method``, ``start_checkpoint`` and ``max_hours``.

    Raises:
        ValueError: if ``experiment.train_method`` is not 'MLE', 'reinforce'
            or 'reinforce_test', or if it is 'reinforce' and the 'reward'
            hyperparameter is not 'levenshtein', 'jaro-winkler' or 'hamming'.
    """
    save_path = "checkpoints/" + experiment.name
    log_path = "tensorboard/train/" + experiment.name

    # create or clean directory
    for path in [save_path, log_path]:
        if os.path.exists(path):
            shutil.rmtree(path)
        os.makedirs(path)
    save_path += "/dev"

    # log git commit hash (empty marker file; the hash is in the file name)
    repo = git.Repo(search_parent_directories=True)
    sha = repo.head.object.hexsha
    with open(log_path + "/git_commit_" + sha, 'w'):
        pass

    hyperparams = experiment.hyperparams
    epochs = hyperparams.get('epochs')
    input_batch_size = hyperparams.get('input_batch_size')
    rnn_size = hyperparams.get('rnn_size')
    num_layers = hyperparams.get('num_layers')
    encoding_embedding_size = hyperparams.get('encoding_embedding_size')
    decoding_embedding_size = hyperparams.get('decoding_embedding_size')
    learning_rate = hyperparams.get('learning_rate')
    keep_probability = hyperparams.get('keep_probability')
    num_samples = hyperparams.get('num_samples')
    reward = hyperparams.get("reward")

    ### prepare data ###
    (
        (train_source_int_text, train_target_int_text),
        (valid_source_int_text, valid_target_int_text),
        (source_vocab_to_int, target_vocab_to_int),
        (source_int_to_vocab, target_int_to_vocab),
    ) = data_preprocessing.get_data(
        experiment.data["dataset"],
        experiment.data["folder"],
        experiment.data["train_source_file"],
        experiment.data["train_target_file"],
        experiment.data["dev_source_file"],
        experiment.data["dev_target_file"],
        experiment.tokenization,
    )

    max_source_sentence_length = max(len(sentence) for sentence in train_source_int_text)

    # shuffle (one seeded generator, train first, then validation, so the
    # resulting order is reproducible between runs)
    rnd = random.Random(1234)
    train_combined = list(zip(train_source_int_text, train_target_int_text))
    rnd.shuffle(train_combined)
    train_source, train_target = zip(*train_combined)

    valid_combined = list(zip(valid_source_int_text, valid_target_int_text))
    rnd.shuffle(valid_combined)
    valid_source, valid_target = zip(*valid_combined)

    # set reward function; distances are negated so that a higher reward
    # always means a closer match
    if reward == "levenshtein":
        reward_func = lambda ref_hyp: - textdistance.levenshtein(ref_hyp[0], ref_hyp[1])
    elif reward == "jaro-winkler":
        reward_func = lambda ref_hyp: textdistance.JaroWinkler()(ref_hyp[0], ref_hyp[1])
    elif reward == "hamming":
        reward_func = lambda ref_hyp: - textdistance.hamming(ref_hyp[0], ref_hyp[1])
    else:
        # Keep the name bound: without this, a missing or unknown reward made
        # the train.train call below fail with UnboundLocalError.
        reward_func = None

    if experiment.train_method == 'MLE':
        graph_batch_size = input_batch_size
    elif experiment.train_method == 'reinforce' or experiment.train_method == 'reinforce_test':
        graph_batch_size = num_samples
    else:
        raise ValueError(
            "unknown train_method {!r}; expected 'MLE', 'reinforce' or "
            "'reinforce_test'".format(experiment.train_method))

    if experiment.train_method == 'reinforce' and reward_func is None:
        raise ValueError(
            "unknown reward {!r}; expected 'levenshtein', 'jaro-winkler' or "
            "'hamming'".format(reward))

    ### prepare model ###
    tf.reset_default_graph()  # maybe need?
    with tf.variable_scope(tf.get_variable_scope(), reuse=False):
        model = rnn_model.RNN(graph_batch_size, max_source_sentence_length,
                              source_vocab_to_int, target_vocab_to_int,
                              encoding_embedding_size, decoding_embedding_size,
                              rnn_size, num_layers)

    eval_batch_size = 128
    # reuse=True: the evaluation model is built inside the same variable
    # scope as the training model
    with tf.variable_scope(tf.get_variable_scope(), reuse=True):
        eval_model = rnn_model.RNN(eval_batch_size, max_source_sentence_length,
                                   source_vocab_to_int, target_vocab_to_int,
                                   encoding_embedding_size, decoding_embedding_size,
                                   rnn_size, num_layers, False)

    early_stopping = True

    ### train model ###
    if experiment.train_method == 'reinforce_test':
        train.reinforce_test(model, experiment.start_checkpoint, source_vocab_to_int,
                             learning_rate, keep_probability, graph_batch_size,
                             target_int_to_vocab, source_int_to_vocab,
                             valid_source, valid_target)
    else:
        train.train(experiment.name, experiment.train_method, model, epochs,
                    input_batch_size, train_source, train_target,
                    valid_source, valid_target, learning_rate, keep_probability,
                    save_path, experiment.start_checkpoint,
                    target_int_to_vocab, source_int_to_vocab, source_vocab_to_int,
                    log_path, graph_batch_size, experiment.max_hours,
                    eval_model, eval_batch_size, reward_func, early_stopping)
# Announce which two texts are being compared, then tag both of them.
print(item + ' --vs-- ' + contentTwo[count])
posOne = pos_tagging(item)
posTwo = pos_tagging(contentTwo[count])

# Walk the two tagging results in lockstep; zip stops at the shorter one.
for x, y in zip(posOne, posTwo):
    # Keep only the second element of every pair (presumably the POS tag --
    # verify against pos_tagging), stringified.
    sentOne = [str(pair[1]) for pair in x]
    sentTwo = [str(pair[1]) for pair in y]

    # A first entry consisting of a lone 'NN' is not compared.
    if sentOne == ['NN']:
        continue

    # Every metric below is computed on the printed (string) form of the two
    # lists, not on the lists themselves.
    shownOne = str(sentOne)
    shownTwo = str(sentTwo)
    print(shownOne, shownTwo)

    print("hamming: ")
    ham = textdistance.hamming(shownOne, shownTwo)
    totalHamming += ham
    print(ham)

    print("cosine: ")
    cos = textdistance.cosine(shownOne, shownTwo)
    totalCosine += cos
    print(cos)

    print("gotoh: ")
    got = textdistance.gotoh(shownOne, shownTwo)
    totalGotoh += got
    print(got)

    print("levenshtein: ")
    lev = textdistance.levenshtein(shownOne, shownTwo)
    totalLev += lev
    print(lev)

    print('\n')