import numpy as np
from nltk import edit_distance


def match_using_edit_distance(word_1, word_2):
    """
    Calculate the Levenshtein distance between two words; if the distance is
    at most a threshold, the pair is considered a match. The Levenshtein
    distance is the number of edits required to change one word into the
    other. A nonzero threshold is used only when the combined length of the
    two words is greater than 6; shorter pairs must match exactly.

    :param word_1: (str) A word of any size.
    :param word_2: (str) A word of any size.
    :return: A Boolean, True if the pair is considered a match, False otherwise.
    """
    len_word_1 = len(word_1)
    len_word_2 = len(word_2)
    if len_word_1 + len_word_2 > 6:
        threshold = np.floor((len_word_1 + len_word_2) * 0.2)  # Malleable threshold for larger words
        if threshold < 2:
            threshold = 2
        return edit_distance(word_1, word_2) <= threshold
    else:
        return edit_distance(word_1, word_2) == 0
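# Usage sketch for match_using_edit_distance (illustrative, not from the
# original source): a long pair within the malleable threshold matches,
# while a short pair must be identical.
print(match_using_edit_distance('levenshtein', 'levenstein'))  # True: distance 1 <= floor(21 * 0.2)
print(match_using_edit_distance('cat', 'car'))                 # False: combined length 6, exact match required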
def get_similar_names(team_name, all_team_names):
    similar_team_names = []
    # Go through the mapping
    for name in NAME_MAPPING:
        if name.startswith(team_name):
            similar_team_names.append(NAME_MAPPING[name])
        elif team_name[0] == name[0] and nltk.edit_distance(name, team_name) <= 2:
            # Match if the first letters are the same AND no more than 2
            # substitutions, insertions, or deletions turn one string into the other
            similar_team_names.append(NAME_MAPPING[name])
    # Go through all_team_names
    for name in all_team_names:
        if name.startswith(team_name):
            similar_team_names.append(name)
        elif team_name[0] == name[0] and nltk.edit_distance(name, team_name) <= 2:
            # Same criteria as above; elif avoids appending the same name twice
            similar_team_names.append(name)
    return similar_team_names
def _prune_suggestions_using_editdist_dm(self, word, suggested_corrections):
    suggested_corrections = map(lambda suggestion: suggestion.lower(), suggested_corrections)
    suggested_corrections = filter(lambda suggestion: suggestion[0] == word[0], suggested_corrections)

    # Keep only suggestions within the plain edit-distance threshold
    _suggestions = []
    for suggested_word in suggested_corrections:
        e_distance = nltk.edit_distance(word, suggested_word)
        if e_distance <= SpellCorrector.MAX_EDIT_DISTANCE_THRESHOLD:
            _suggestions.append((suggested_word, e_distance))
    suggested_corrections = _suggestions

    # Keep only suggestions whose Double Metaphone codes are also close
    _suggestions = []
    word_dms = doublemetaphone(word)  # Double Metaphone codes of word
    for suggested_word, e_distance in suggested_corrections:
        suggested_word_dms = doublemetaphone(suggested_word)
        dme_distance = 1000
        for dm in word_dms:
            for sw_dm in suggested_word_dms:
                dme_distance = min(dme_distance, nltk.edit_distance(dm, sw_dm))
        if dme_distance <= SpellCorrector.DOUBLE_METAPHONE_MAX_EDIT_DISTANCE_THRESHOLD:
            _suggestions.append((suggested_word, (e_distance, dme_distance)))
    suggested_corrections = _suggestions

    # Rerank the survivors on usage frequency
    _suggestions = []
    for suggested_word, (e_distance, dme_distance) in suggested_corrections:
        freq = english_words.get(suggested_word, len(english_words) + 1)
        _suggestions.append((suggested_word, freq))
    suggested_corrections = sorted(_suggestions, key=lambda x: x[1])
    suggested_corrections = [suggested_word for suggested_word, _ in suggested_corrections]
    return suggested_corrections
def closeEnough(self, strToCmp, errors):
    names = self._cats.loc[:, 'enname'].to_list()
    names = [str(x).lower() for x in names]
    # edit distance against everything in the tsv
    dss = list(map(lambda x: nl.edit_distance(x, strToCmp), names))
    closest = [i for i, x in enumerate(dss) if x == min(dss)]
    # edit distance against the custom-name dictionary
    distancedict = defaultdict(list)
    for i in self._customnames:
        distancedict[nl.edit_distance(strToCmp, i.lower())].append(self._customnames[i])
    customnames = []
    try:
        customnames = min(distancedict.items())
    except ValueError:  # empty custom names
        customnames.append(errors + 1)
    if min(dss) > errors and customnames[0] > errors:
        # both were too bad
        return None
    if min(dss) < customnames[0]:
        # normal names were better
        return [closest, min(dss)]  # all of the closest indices and the distance of the closest
    else:
        # custom names were better
        return [customnames[1], customnames[0]]  # the best matches of all custom names
def similarityToSystemHeader(headerName):
    # Scan system_libraries with a stride of 2, tracking the closest entry
    mindistance = [0, nltk.edit_distance(system_libraries[0], headerName[1])]
    for i in range(2, len(system_libraries), 2):
        tmp = nltk.edit_distance(system_libraries[i], headerName[1])
        if mindistance[1] > tmp:
            mindistance = [i, tmp]
    return system_libraries[mindistance[0]]
def typing_error(self, theta1, theta2):
    '''
    Check self.sentence for typing errors.
    :param theta1: (float) base threshold for flagging an error
    :param theta2: (float) relaxed threshold used when the context is highly ambiguous
    :return error_token: (set) tokens flagged as likely typos
    '''
    error_token = set()
    sep = ':-:'
    min_edit = 4
    tokens = nltk.word_tokenize(self.sentence)
    # Drop tokens that contain digits
    refined_tokens = []
    for token in tokens:
        if any(char.isdigit() for char in token):
            continue
        else:
            refined_tokens.append(token)
    for trigram in ngrams(refined_tokens, 3):
        given_phrase = sep.join(trigram)
        pre_tok, target_tok, next_tok = trigram[0], trigram[1], trigram[2]
        total_freq = 0
        try:
            for predict_tok in tri_vocab[pre_tok + sep + next_tok]:
                predict_phrase = pre_tok + sep + predict_tok + sep + next_tok
                total_freq = total_freq + tri_frequency[predict_phrase]
            ambiguity = len(tri_vocab[pre_tok + sep + next_tok])
            theta = theta1
            for predict_tok in tri_vocab[pre_tok + sep + next_tok]:
                predict_phrase = pre_tok + sep + predict_tok + sep + next_tok
                if predict_phrase != given_phrase:
                    prob = round(tri_frequency[predict_phrase] / float(total_freq), 5)
                    if nltk.edit_distance(target_tok, predict_tok) < len(target_tok):
                        if nltk.edit_distance(target_tok, predict_tok) < min_edit:
                            if total_freq < 5:
                                error_token.add(target_tok)
                            else:
                                if ambiguity >= 5:
                                    theta = theta2
                                if prob >= theta:
                                    error_token.add(target_tok)
        except KeyError:
            # Context bigram not in the trigram vocabulary; skip
            pass
    return error_token
def func(word):
    # Return the entry of X whose first column is closest to `word` by edit distance
    best_distance = float('inf')
    output = ""
    for i in range(X.shape[0]):
        distance = nltk.edit_distance(X[i][0], word)
        if distance < best_distance:
            best_distance = distance
            output = X[i][0]
    return output
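# Usage sketch (illustrative, not from the original source): `X` is assumed
# to be a 2-D array whose first column holds candidate words.
import numpy as np
import nltk

X = np.array([['apple'], ['banana'], ['cherry']])
print(func('bananna'))  # 'banana' (edit distance 1)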
def outputResults(text, code):
    with open('test/text{}.txt'.format(sys.argv[1]), 'r') as f:
        data = f.read()
    print('DATA:\n' + data)
    print('Difference between tesseract and real text is:')
    print(nltk.edit_distance(text, data))
    print('Difference between optimization and real text is:')
    print(nltk.edit_distance(code, data))
    # zip stops at the shorter string, so unequal lengths cannot raise IndexError
    for code_char, data_char in zip(code, data):
        if code_char != data_char:
            print(code_char + '\t\t\t' + data_char)
def Similarity(var, fields=('Passport Card no', 'Nationality', 'Surname',
                            'Given Names', 'Sex', 'Date of Birth',
                            'Place of Birth')):
    best_distance = len(var) / 3
    fieldMin = ' '
    for field in fields:
        if nltk.edit_distance(field, var) <= best_distance or var in field:
            best_distance = nltk.edit_distance(field, var)
            fieldMin = field
    return fieldMin
def similarity(var, fields=('Passport Card no', 'Nationality', 'Surname',
                            'Given Names', 'Sex', 'Date of Birth',
                            'Place of Birth')):
    # Takes a string and returns the most similar field name, if any
    best_distance = len(var) / 3
    field_min = ' '
    for field in fields:
        if nltk.edit_distance(field, var) <= best_distance or var in field:
            best_distance = nltk.edit_distance(field, var)
            field_min = field
    return field_min
def findNearestWords(self, wordlist, actualWord, tag):
    distance = []
    for term in wordlist:
        if term[0][0] not in string.punctuation:
            wordTowordDistance = nltk.edit_distance(actualWord, term[0].lower())
            lemma = self.lemmatizer.lemmatize(term[0].lower(), self.get_wordnet_pos(tag))
            lemmaTowordDistance = nltk.edit_distance(actualWord, lemma)
            if wordTowordDistance <= self.maxDistance:
                distance.append([term[0].lower(), wordTowordDistance])
            elif lemmaTowordDistance <= self.maxDistance:
                distance.append([lemma, lemmaTowordDistance])
    distance.sort(key=lambda x: x[1])
    return distance
def getDiseases(inputText):
    # Check each word of the text against the disease names in the dataset
    Data = pd.read_csv('Disease-symptoms-medication-dataset.csv')
    allDiseases = list(Data['Disease'])
    answer = []
    wordnet_lemmatizer = WordNetLemmatizer()
    for word in inputText:
        word_new = wordnet_lemmatizer.lemmatize(word)
        for diseases in allDiseases:
            # Parenthesize the `or` so the duplicate check applies to both exact matches
            if (word == diseases or word_new == diseases) and diseases not in answer:
                answer.append(diseases)
            if (nltk.edit_distance(word, diseases) <= 2
                    or nltk.edit_distance(word_new, diseases) <= 2) and diseases not in answer:
                answer.append(diseases)
    return answer
def get_min_lingual_distance(w, lst, pos=None):
    if not isinstance(lst, list):
        lst = [lst]
    if len(lst) <= 0:
        return w
    best = (lst[0], get_distance(w, lst[0], pos=pos)[0])
    for i in range(1, len(lst)):
        dst = get_distance(w, lst[i], pos=pos)
        if dst[0] < best[1]:
            best = (lst[i], dst[0])
        elif dst[0] == best[1]:
            # If the distance is the same (most often 0 and 0), the candidate
            # with the smaller normalized edit distance is the better option
            if (nltk.edit_distance(best[0], w) / max(len(best[0]), 1)
                    > nltk.edit_distance(lst[i], w) / max(len(lst[i]), 1)):
                best = (lst[i], dst[0])
    return best
def nametoenemies(self, stringtosearch, errors):
    # TODO refine failing to get data for whatever reason
    results = None
    conn = None
    try:
        conn = sqlite3.connect('stages.db')
        cursor = conn.cursor()
        query = '''select * from searchunitstages'''  # TODO this is going to change later
        stagenames = cursor.execute(query).fetchall()
        stagenames = [x[0].lower() for x in stagenames]
        dss = list(map(lambda x: nl.edit_distance(x, stringtosearch), stagenames))
        if min(dss) > errors:
            results = -1
            raise Exception('String could not match anything.')
        nearestmatch = [i for i, x in enumerate(dss) if x == min(dss)]
        if len(nearestmatch) > 1:
            results = -2
            raise Exception('Could not discriminate.')
        else:
            nearestmatch = [stagenames[nearestmatch[0]]]
            cursor.execute(
                'SELECT * from enemylines, stage where stage.stage_id=enemylines.stage_appearance and '
                'LOWER(name)=?', nearestmatch)
            results = cursor.fetchall()
    except Exception:
        print('something went wrong')
    finally:
        # Guard against connect() itself failing, which would leave conn unset
        if conn is not None:
            conn.close()
    return results
def query_list_of_words(target_word, list_of_words, edit_distance=1):
    """
    Checks whether a target word is within editing distance of any one in a set of keywords.

    Inputs: - target_word: A string containing the word we want to search in a list.
            - list_of_words: A python list of words.
            - edit_distance: For larger words, we also check for similar words based on edit_distance.

    Outputs: - new_list_of_words: This is the input list of words minus any found keywords.
             - found_list_of_words: This is the list of words that are within edit distance of the target word.
    """
    # Initialize lists
    new_list_of_words = list()
    found_list_of_words = list()
    append_left_keyword = new_list_of_words.append
    append_found_keyword = found_list_of_words.append

    # Iterate over the list of words
    for word in list_of_words:
        if len(word) > 6:
            effective_edit_distance = edit_distance
        else:
            effective_edit_distance = 0  # No edit distance for small words.
        if abs(len(word) - len(target_word)) <= effective_edit_distance:
            if nltk.edit_distance(word, target_word) <= effective_edit_distance:
                append_found_keyword(word)
            else:
                append_left_keyword(word)
        else:
            append_left_keyword(word)
    return new_list_of_words, found_list_of_words
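# Usage sketch for query_list_of_words (illustrative): long words may match
# within the edit-distance budget, short words only match exactly.
import nltk

remaining, found = query_list_of_words('amazing', ['amasing', 'amazing', 'cat'])
print(found)      # ['amasing', 'amazing']
print(remaining)  # ['cat']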
def getSuggestion():
    sugg_final = []
    wordReq = request.args['word']
    suggest = []
    suggestFreq = []
    lis = []
    lisFreq = []
    key_list = []
    # Collect candidate hash keys for the requested word's language
    if find_language(wordReq) == 'tamil':
        for x in range(-2, 3):
            inp_key = getAccKey(wordReq, x, checkLettersTamil, 'tamil')
            key_list.append(inp_key)
        for key in key_list:
            if key in hashtable_ta:
                thisKeyList = [e['word'] for e in hashtable_ta[key]]
                thisKeyListFreq = [f['freq'] for f in hashtable_ta[key]]
                for word in thisKeyList:
                    lis.append(word)
                    lisFreq.append(float(thisKeyListFreq[thisKeyList.index(word)]))
    elif find_language(wordReq) == 'sinhala':
        for x in range(-2, 3):
            inp_key = getAccKey(wordReq, x, checkLettersTamil, 'sinhala')
            key_list.append(inp_key)
        for key in key_list:
            if key in hashtable_si:
                thisKeyList = [e['word'] for e in hashtable_si[key]]
                thisKeyListFreq = [f['freq'] for f in hashtable_si[key]]
                for word in thisKeyList:
                    lis.append(word)
                    lisFreq.append(int(thisKeyListFreq[thisKeyList.index(word)]))
    else:
        pass
    # Keep candidates within edit distance 2
    for word in lis:
        ed = nltk.edit_distance(wordReq, word)
        if ed <= 2:
            suggest.append(word)
            suggestFreq.append(lisFreq[lis.index(word)])
    # Score candidates by character-bigram Jaccard overlap, weighted by frequency
    ng_list = list(ngrams(wordReq, 2))
    sugg_dic = []
    for word in suggest:
        l = list(ngrams(word, 2))
        union = set(l + ng_list)
        val = len(list(set(ng_list) & set(l)))
        value = val / len(union)
        sugg_dic.append((word, value, value * (suggestFreq[suggest.index(word)])))
    sugg_dic.sort(key=takeSecond)
    sorted_x = sugg_dic[-10:]
    sorted_x.sort(key=takeThird, reverse=True)
    for sugg in sorted_x:
        sugg_final.append(sugg[0])
    return jsonify({'suggestion': sugg_final})
def test_replace(self):
    s = 'replace a character?'
    res = self.mistakes.replace(s)
    print(f'replace: `{s}` -> `{res}`')
    self.assertEqual(len(s), len(res))
    self.assertLessEqual(edit_distance(s, res), 1)
    self.assertEqual(self.mistakes.replace(''), '')
def test_delete(self):
    s = 'abcdefgh'
    position = 3
    res = self.mistakes.delete(s, start=position, end=position)
    print(f'delete: `{s}` -> `{res}`')
    self.assertEqual(res, s[:position] + s[position + 1:])
    self.assertEqual(edit_distance(s, res), 1)

    res = self.mistakes.delete(s)
    print(f'delete: `{s}` -> `{res}`')
    self.assertEqual(len(s) - 1, len(res))
    self.assertEqual(edit_distance(s, res), 1)
    self.assertEqual(self.mistakes.delete(''), '')
def similarity(string1, string2):
    len1 = float(len(string1))
    len2 = float(len(string2))
    lensum = len1 + len2
    levdist = float(nltk.edit_distance(string1, string2))
    similarityMetric = (lensum - levdist) / lensum
    return similarityMetric
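# Usage sketch (illustrative): the ratio is 1.0 for identical strings and
# falls toward 0 as the edit distance grows relative to the combined length.
# (Note: two empty strings would divide by zero here.)
import nltk

print(similarity('kitten', 'kitten'))   # 1.0
print(similarity('kitten', 'sitting'))  # (13 - 3) / 13 ~= 0.769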
def answer_eleven(entries=['cormulent', 'incendenece', 'validrate']):
    # For each entry, recommend the correct spelling with the smallest edit
    # distance among candidates that share the first letter
    recommendations = []
    for e in entries:
        distances = [(nltk.edit_distance(e, a), a) for a in correct_spellings
                     if a[0] == e[0] and len(a) > 2]
        recommendations.append(sorted(distances)[0][1])
    return recommendations
def word_jaccard(self, w1, w2):
    cw1 = [char.lower() for char in w1]
    cw2 = [char.lower() for char in w2]
    # Edit distance is never negative, so abs() is not needed
    c = max(len(cw1), len(cw2)) - nltk.edit_distance(w1, w2)
    return float(c) / (len(cw1) + len(cw2) - c)
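# Usage sketch (illustrative): treating max(len) - edit_distance as the size
# of the "intersection" yields a Jaccard-style ratio in [0, 1]. The method is
# written with `self`, so a standalone call just passes None for it.
import nltk

print(word_jaccard(None, 'night', 'nacht'))  # (5 - 2) / (5 + 5 - 3) ~= 0.429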
import sys

from nltk import edit_distance
from pandas import read_csv


def main():
    test_file = sys.argv[1]
    output_file = sys.argv[2]
    THRESHOLD = 20.7782

    # Testing data
    df_test = read_csv(test_file)
    df_test['title1_zh'] = df_test['title1_zh'].fillna('')
    df_test['title2_zh'] = df_test['title2_zh'].fillna('')
    X_1_test = df_test['title1_zh'].tolist()
    X_2_test = df_test['title2_zh'].tolist()
    id_test = df_test['id'].tolist()

    # Predict
    y_pred = []
    for i in range(len(X_1_test)):
        dist = edit_distance(X_1_test[i], X_2_test[i])
        if dist > THRESHOLD:
            y_pred.append('unrelated')
        else:
            y_pred.append('agreed')

    # Export
    with open(output_file, 'w') as file:
        file.write('Id,Category\n')
        for i in range(len(y_pred)):
            file.write(f'{id_test[i]},{y_pred[i]}\n')
def get_common_letters(values):
    '''
    Find the pair of words with edit distance one and return their common letters.
    :param values: list of words
    :return: string containing the shared characters, or None if no such pair exists
    '''
    distance = None
    id1, id2 = None, None
    # calculate all pairwise distances
    for i, w_id1 in enumerate(values):
        for w_id2 in values[i + 1:]:
            # the precondition is that exactly one pair has a distance of one,
            # so cut the computational time and stop as soon as that pair is found
            distance = nltk.edit_distance(w_id1, w_id2)
            if distance == 1:
                id1, id2 = w_id1, w_id2
                break
        if distance == 1:
            break
    if id1 is None:
        return None
    # compare the two strings and drop the differing character
    id1_list = list(id1)
    for i, (id_letter1, id_letter2) in enumerate(zip(id1_list, list(id2))):
        if not id_letter1 == id_letter2:
            id1_list[i] = ''
    return ''.join(id1_list)
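# Usage sketch (illustrative): the classic "box IDs" case, where exactly one
# pair of words differs by a single character.
import nltk

print(get_common_letters(['abcde', 'fghij', 'klmno', 'fguij']))  # 'fgij'
print(get_common_letters(['abc', 'xyz']))                        # None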
def location(self):
    """
    Gets the location from the news story.
    Inputs include the parts of speech tagged words.
    Output is the phrase containing the location of mishap.
    """
    ktm_location = LocationInformation().all_ktm_locations()
    bkt_location = LocationInformation().all_bkt_locations()
    ltp_location = LocationInformation().all_ltp_locations()
    outside_location = LocationInformation().all_locations()
    all_locations = ktm_location + outside_location + bkt_location + ltp_location
    locations = self.location_extractor()
    print(locations)
    return_location = []
    max_ratio = 0
    for glocation in locations:
        for location in all_locations:
            dist = nltk.edit_distance(glocation, location)
            ratio = (1 - (dist / len(glocation))) * 100
            # Track the best-matching known location; accept it only above 70% similarity
            if ratio >= max_ratio:
                max_ratio = ratio
                if max_ratio >= 70:
                    return_location = location
    return return_location
def bigram_edit_distance(code1, code2):
    bigram1 = transform_to_ngram(code1.split(), n=2)
    bigram2 = transform_to_ngram(code2.split(), n=2)
    dist = nltk.edit_distance(bigram1, bigram2)
    if dist == 0:
        print(bigram1, ' -----> ', bigram2)
    return dist
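# Sketch of the helper this function assumes (transform_to_ngram is not shown
# in the source): one plausible definition joins each token n-gram into a
# single string, so nltk.edit_distance compares sequences of n-grams.
def transform_to_ngram(tokens, n=2):
    return [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

# e.g. bigram_edit_distance('a b c', 'a b d') compares ['a b', 'b c'] with ['a b', 'b d']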
def combined_caption_score(res):
    """Takes the per-caption scores and returns a collective score for the image.

    For each image it accumulates the total length of all captions (L) and the
    total number of unique words (W), then scores the image as the sum of
    pairwise edit distances between its five captions.

    Arguments:
        res -- mapping of image -> per-caption stats, where each entry holds
               (caption, (num_uniq_words, total_len))
    """
    final_caption_score = {}
    for img, per_caption_stats in res.items():
        W, L, i = 0, 0, 0
        captions = []  # all the captions for a given image
        for c, (caption, (num_uniq_words, total_len)) in per_caption_stats.items():
            W += num_uniq_words
            L += total_len
            captions.append(caption)
            i += 1
        assert i == 5
        edit_dist = 0
        for i in range(5):
            for j in range(i + 1, 5):
                edit_dist += nltk.edit_distance(captions[i], captions[j])
        final_caption_score[img] = edit_dist
    return final_caption_score
def calculate(self):
    keywords1 = self.split(self.key1)
    keywords2 = self.split(self.key2)
    similarity_constant = 4
    minimum = 1 / math.e
    maximum = 1
    score_list = []
    for i in keywords1:
        for j in keywords2:
            x = len(i)
            y = len(j)
            distance = nltk.edit_distance(i, j)
            if distance == 0:
                # Guard (assumption): identical keywords would divide by zero
                # in the formula below, so treat them as a perfect match
                score_list.append(maximum)
                continue
            if ((x + y) / similarity_constant) >= distance:
                raw_score = 2 / (math.e ** (x / distance) + math.e ** (y / distance))
                scaled_score = (raw_score - minimum) / (maximum - minimum)
                score_list.append(scaled_score)
    if score_list:
        return max(score_list)
    else:
        return None
def guess_word(prediction, id_to_labels):
    dictionary = [line.strip() for line in open("dictionary.txt", 'r')]
    similar = []
    with open('similar.txt') as f:
        for line in f:
            temp = [x.strip() for x in line.split(',')]
            similar.append(temp)
    # Collapse consecutive predictions that belong to the same confusable group
    new_prediction = []
    i = 0
    while i < len(prediction):
        if i < len(prediction) - 1:
            for line in similar:
                if id_to_labels[prediction[i]] in line:
                    if id_to_labels[prediction[i + 1]] in line:
                        new_prediction.append(prediction[i])
                        prediction.remove(prediction[i + 1])
                        break
                    else:
                        new_prediction.append(prediction[i])
        i += 1
    output = ''
    for p in prediction:
        output += id_to_labels[p]
    output = output.lower()
    print(output)
    # Suggest dictionary words of the same length within edit distance 1
    possibilities = []
    for words in dictionary:
        if len(words) == len(output):
            if nltk.edit_distance(words, output) == 1:
                possibilities.append(words)
    print(possibilities)
def canFind(text, item_data):
    word_list = re.split(' ', text)
    item_list = re.split(' ', item_data)
    wl_cnt = len(word_list)
    il_cnt = len(item_list)
    res_list = []
    # Slide a window of len(item_list) words over the text; every word in the
    # window must be within edit distance 1 of the corresponding item word
    for i in range(wl_cnt - il_cnt + 1):
        isOK = True
        for j in range(il_cnt):
            if nltk.edit_distance(word_list[i + j].lower(), item_list[j].lower()) > 1:
                isOK = False
                break
        if isOK:
            res = word_list[i]
            for j in range(il_cnt - 1):
                res = res + " " + word_list[i + 1 + j]
            res_list.append(res)
    return res_list
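# Usage sketch (illustrative): fuzzy phrase search tolerating one edit per word.
import re
import nltk

print(canFind('the quick brwn fox', 'brown fox'))  # ['brwn fox']
print(canFind('the quick brwn fox', 'lazy dog'))   # []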
def find_nearest(query, datalist):
    # Uses nltk to find the closest match in the list datalist
    # (parameter renamed from `input` to avoid shadowing the builtin)
    indices = []
    for string in datalist:
        indices.append(nltk.edit_distance(query, string))
    closest_index = indices.index(min(indices))
    closest = datalist[closest_index]
    return closest  # Returns closest word from the list
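# Usage sketch (illustrative):
import nltk

print(find_nearest('appel', ['apple', 'orange', 'banana']))  # 'apple'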
def incomplete_pred(words, n):
    all_succeeding = bgs_freq[(words[n - 2])].most_common()
    preds = []
    number = 0
    # First try completions that extend the typed prefix
    for pred in all_succeeding:
        if pred[0].startswith(words[n - 1]):
            appendwithcheck(preds, pred)
            number += 1
            if number == 3:
                return preds
    # Fall back to candidates ranked by edit distance to the typed word
    if len(preds) < 3:
        med = []
        for pred in all_succeeding:
            med.append((pred[0], nltk.edit_distance(pred[0], words[n - 1], transpositions=True)))
        med.sort(key=lambda x: x[1])
        index = 0
        while len(preds) < 3:
            if index < len(med):
                if med[index][1] > 0:
                    appendwithcheck(preds, med[index])
                index += 1
            if index >= len(preds):
                return preds
    return preds
def error_rate(string, grammar):
    """
    Calculate the word error rate of a grammar against a string.
    :param string: the observed string
    :param grammar: the reference produced by the grammar
    :return: the edit distance between the two
    """
    # TODO implement this properly; plain edit distance is a placeholder
    return nltk.edit_distance(string, grammar)
def get_teamname(self, teamname):
    try:
        return kp_to_kag[teamname]
    except KeyError:
        try:
            return bd_to_kag[teamname]
        except KeyError:
            # Fall back to the closest Kaggle team name by edit distance
            name, val = '', 100
            for team in self.teams_kaggle.name:
                newval = edit_distance(team, teamname)
                if newval < val:
                    name = team
                    val = newval
            return name
def correct_spelling(self, query):
    """Correct spelling.

    Corrects all spelling errors in `query` with Google's algorithm.

    Args:
        query: Query to correct.
    """
    try:
        wait = ui.WebDriverWait(self._driver, self._timeout)
        self._driver.get(self._url)
        wait.until(
            lambda driver: driver.find_elements_by_xpath(
                "/html/body/center/form/table/tbody/"
                "tr/td[2]/span[1]/span/input"
            )
        )
        logger.debug("Request done. Back on page: {}".format(self._driver.current_url))
        # Set waiting handler for AJAX request.
        wait = ui.WebDriverWait(self._driver, self._timeout)
        input_element = self._driver.find_element_by_name("q")
        # Input query into search box.
        input_element.send_keys(query)
        input_element.submit()
        logger.info("Submitting query: {}".format(query))
        wait.until(lambda driver: driver.find_elements_by_xpath("//*[@id='resultStats']"))
        logger.debug("Response loaded. Now on page: {}".format(self._driver.current_url))
    except Exception as t:
        logger.error(t)
        self.__reset_driver()
        return None
    # Get suggestion field.
    field = self._driver.find_elements_by_xpath("//*[@id='_FQd']/div/a")
    if 0 < len(field):
        suggested_text = str(field[0].text)
        logger.debug("Did you mean encountered. Suggested query: {}".format(suggested_text))
    else:
        suggested_text = query
        logger.debug("No suggestion.")
    # Google messed things up.
    if self._max_edit_dist < nltk.edit_distance(suggested_text, query):
        logger.warn(
            "Suggested text beyond edit distance threshold "
            "of {}. Returning original query.".format(self._max_edit_dist)
        )
        suggested_text = query
    logger.info("Checker returns: {}".format(suggested_text))
    return suggested_text
def levenshtein(first, second, transpositions=False):
    """
    Return a similarity ratio of two pieces of text. 0 means the strings are
    not similar at all, 1.0 means they're identical.

    This is the Levenshtein ratio:

        (lensum - ldist) / lensum

    where lensum is the sum of the length of the two strings and ldist is the
    Levenshtein distance (edit distance).

    See https://groups.google.com/forum/#!topic/nltk-users/u94RFDWbGyw
    """
    lensum = len(first) + len(second)
    ldist = nltk.edit_distance(first, second, transpositions=transpositions)
    if lensum == 0:
        return 0
    return (lensum - ldist) / lensum
def simple_word_query(target_word, list_of_words, edit_distance=1):
    found_list_of_words = list()
    append_found_keyword = found_list_of_words.append
    for word in list_of_words:
        if len(word) > 6:
            effective_edit_distance = edit_distance
        else:
            effective_edit_distance = 0  # No edit distance for small words.
        if abs(len(word) - len(target_word)) <= effective_edit_distance:
            if nltk.edit_distance(word, target_word) <= effective_edit_distance:
                append_found_keyword(word)
    return found_list_of_words
def include_spell_mistake(word, similar_word, score):
    """
    Check if a similar word passes the rules for being a spelling mistake of `word`.

    Rules:
    1. Similarity score must be greater than a threshold.
    2. Length of the word with the spelling error must be greater than 3.
    3. The spelling mistake must occur at least N times in the corpus.
    4. It must not be a correct English word.
    5. The first characters of the correct and wrong spellings must match.
    6. The edit distance must be at most 1 for words of length <= 4, at most 2 otherwise.
    """
    edit_distance_threshold = 1 if len(word) <= 4 else 2
    return (score > fasttext_min_similarity
            and len(similar_word) > 3
            and vocab[similar_word] >= spell_mistake_min_frequency
            and not enchant_us.check(similar_word)
            and word[0] == similar_word[0]
            and nltk.edit_distance(word, similar_word) <= edit_distance_threshold)
def similarity(self, other):
    """
    Return a similarity ratio of two quotes. 0 means the strings are not
    similar at all, 1.0 means they're identical.

    This is the Levenshtein ratio:

        (lensum - ldist) / lensum

    where lensum is the sum of the length of the two strings and ldist is the
    Levenshtein distance (edit distance).

    See https://groups.google.com/forum/#!topic/nltk-users/u94RFDWbGyw
    """
    lensum = len(self.quote) + len(other.quote)
    ldist = nltk.edit_distance(self.quote, other.quote)
    if lensum == 0:
        return 0
    return (lensum - ldist) / lensum
def suggest(self, word):
    if word in self._valid:
        return word
    if word in self._invalid:
        return None
    if word in self._suggested:
        return self._suggested[word]
    if self._spelling.check(word):
        self._valid.add(word)
        return word
    if self._maxdist > 0:
        suggestions = self._spelling.suggest(word)
        if suggestions and nltk.edit_distance(word, suggestions[0]) <= self._maxdist:
            self._suggested[word] = suggestions[0]
            return suggestions[0]
        else:
            self._invalid.add(word)
            return None
    else:
        self._invalid.add(word)
        return None
def compare_tokenset(l, r):
    # Identical token sets match immediately
    if l == r:
        return True
    # Token lists of different lengths cannot align token-by-token
    if len(l) != len(r):
        return False
    for i in range(len(l)):
        # Single-letter tokens (initials) match on their first letter
        if (len(l[i]) == 1 or len(r[i]) == 1) and (l[i][0] == r[i][0]):
            continue
        # Longer tokens (length > 4) match if within edit distance 2
        if (len(l[i]) > 1 and len(r[i]) > 1) and (nltk.edit_distance(l[i], r[i]) <= 2) and len(l[i]) > 4:
            continue
        return False
    return True
import urllib

import nltk
import numpy

# User-variables begin
textQuery = 'querySequence.txt'
# User-variables end

# Get queries in a list
queryList = []
fd = open(textQuery, 'r')
for line in fd:
    queryList.append(urllib.unquote(line.strip()))

# What is the edit distance between queries when in order?
orderedDistance = []
for i in range(len(queryList)):
    try:
        orderedDistance.append(nltk.edit_distance(queryList[i], queryList[i + 1]))
    except IndexError:
        continue
print 'Ordered average distance', numpy.average(orderedDistance), 'std deviation', numpy.std(orderedDistance)

# Re-run simulation multiple times
averageRandDist = []
for i in range(100):
    # Set the seed to different values
    numpy.random.seed(i)
    # What is the edit distance between queries taken at random?
    numpy.random.shuffle(queryList)
    shuffledDistance = []
    for i in range(len(queryList)):
        try:
            shuffledDistance.append(nltk.edit_distance(queryList[i], queryList[i + 1]))
        except IndexError:
            continue
#!/usr/bin/python
import os

import nltk

# Compare Levenshtein distance and rank changes
# Date: 27 February 2012

previousQuery = ''
previousUserID = ''
previousRank = ''
fd = os.popen("awk -F '\\t' '{if ($2 ~ / / && length($4)) print $1\"\\t\"$2\"\\t\"$4}' user-ct-test-collection-01.txt | sort | uniq")
for line in fd:
    queryDist = 0
    lineWords = line.split('\t')[1:-1]
    if (len(previousQuery) and (line.split('\t')[0] == previousUserID)):
        queryDist = nltk.edit_distance(' '.join(lineWords), ' '.join(previousQuery))
    else:
        previousUserID = line.split('\t')[0]
    previousQuery = lineWords
    # Has the query change resulted in a clicked page rank change?
    if (len(previousRank) and (previousRank != line.split('\t')[-1])):
        print line.split('\t')[0], queryDist, 1
    elif (len(previousRank) and (previousRank == line.split('\t')[-1])):
        print line.split('\t')[0], queryDist, 0
    previousRank = line.split('\t')[-1]
# coding:utf8
from libjade import *
from nltk import edit_distance
from string import ascii_letters

if __name__ == '__main__':
    ss = fread('test.txt').split()
    s = ss[0]
    time_init()
    for i in xrange(1, len(ss)):
        t = ss[i]
        # print edit_distance(s, t)
        edit_distance(s, t)
    print time_gap('finished')

    # Generator for test.txt:
    # l = list(ascii_letters)
    # ss = []
    # for i in xrange(100000):
    #     shuffle(l)
    #     s = ''.join(l)
    #     ss.append(s)
    # fwrite('\n'.join(ss), 'test.txt')
#!/usr/bin/python
import nltk, numpy

# Calc Levenshtein distances
# Date: 24 February 2012
# Author: Evgeniy

previousQuery = ""
fd = open("userID_query", "r")
for line in fd:
    LevDistance = []
    queryDist = 0
    # Exclude the userID
    lineWords = line.split()[1:]
    # Distances between consecutive words within the query
    for i in range(len(lineWords)):
        try:
            LevDistance.append(nltk.edit_distance(lineWords[i], lineWords[i + 1]))
        except IndexError:
            continue
    # Distance to the previous query
    if len(previousQuery):
        queryDist = nltk.edit_distance(" ".join(line.split()[1:]), previousQuery)
    else:
        queryDist = 0
    previousQuery = " ".join(line.split()[1:])
    # userID, mean, std, min, max, queryDist
    print line.split()[0], numpy.mean(LevDistance), numpy.std(LevDistance), numpy.min(LevDistance), numpy.max(LevDistance), queryDist
def preprocess_query(var, doc_idx, metadata, cleanmetadata):
    '''
    Check for statistical queries (df, freq, tf, title, author, bib, doc,
    similar). Strip these operator terms off the search query and pass the
    main query to classify_query. Essentially the main function of the project.
    '''
    final_out = {}
    if 'df ' in var:
        var = var.replace('df ', '')
        final_out = classify_query(var, doc_idx)
        print "\nDocument Frequency of " + var + " : " + str(len(final_out.keys()))
    elif 'freq ' in var:
        var = var.replace('freq ', '')
        final_out = classify_query(var, doc_idx)
        print "\nFrequency of " + var + " : " + str(sum(final_out.values()))
    elif 'tf ' in var:
        var = var.replace('tf ', '')
        doc_num = re.findall(r'\d+\s', var)
        var = re.sub(r'\d+\s', "", var)
        doc_num = int(doc_num[0])
        final_out = classify_query(var, doc_idx)
        print "\nTerm Frequency of " + var + " : " + str(final_out[doc_num])
    elif 'title ' in var:
        var = var.replace('title ', '')
        doc_num = re.findall(r'\d+', var)
        print doc_num
        var = re.sub(r'\d+', "", var)
        print "\nDocument Title: " + metadata[doc_num[0]][0]
    elif 'author ' in var:
        var = var.replace('author ', '')
        doc_num = re.findall(r'\d+', var)
        var = re.sub(r'\d+', "", var)
        print "\nDocument Author: " + metadata[doc_num[0]][1]
    elif 'bib ' in var:
        var = var.replace('bib ', '')
        doc_num = re.findall(r'\d+', var)
        var = re.sub(r'\d+', "", var)
        print "\nDocument Biblio: " + metadata[doc_num[0]][2]
    elif 'doc ' in var:
        var = var.replace('doc ', '')
        doc_num = re.findall(r'\d+', var)
        var = re.sub(r'\d+', "", var)
        print "\nDocument Text: " + metadata[doc_num[0]][3]
    elif 'similar ' in var:
        var = var.replace('similar ', '')
        similar_words = {}
        for i in metadata.values():
            for k in i[3].split():
                if k.strip(punctuation):
                    k = k.strip(punctuation)
                    similarity = nltk.edit_distance(k, var)
                    if similarity < 3:
                        similar_words[k] = similarity
        print "\nWords similar to " + var + ": "
        sort_scores = sorted(similar_words.iteritems(), key=operator.itemgetter(1))
        unique_similar = set()
        for (i, j) in sort_scores:
            unique_similar.add(i)
        print unique_similar
        print len(unique_similar)
    else:
        final_out = classify_query(var, doc_idx)
        if final_out != {}:
            print_format(var, final_out, index_data, metadata, cleanmetadata)
    return final_out
def is_similar_to(str1, str2):
    avg_len = (len(str1) + len(str2)) / 2
    return nltk.edit_distance(str1, str2) <= avg_len / 5
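# Usage sketch (illustrative): strings match when the edit distance is at
# most one fifth of their average length.
import nltk

print(is_similar_to('international', 'internatianal'))  # True: distance 1 <= 13 / 5
print(is_similar_to('cat', 'dog'))                      # False: distance 3 > 0.6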
    # Tokenize the text line
    tokens = nltk.word_tokenize(line.strip())
    # Ignore empty line
    if (not len(tokens)):
        continue
    # Remove stopwords from tokens
    tokens = [w for w in tokens if w.lower() not in stopwords]
    # Remove any non-letters from tokens
    tokens = [re.sub(r'\W+', '', n) for n in tokens]
    # Remove empty and short non-words from tokens
    tokensRefined = []
    for t in tokens:
        if (len(t.strip()) <= 3):
            continue
        else:
            tokensRefined.append(t)
    # Ignore empty tokensRefined
    if (not len(tokensRefined)):
        continue
    # Stem the words
    stems = [porter.stem(t) for t in tokensRefined]
    # Print token, edit distance to its stem, and position in text
    for i in range(len(tokensRefined)):
        globalTextPosition += 1
        editDistance = nltk.edit_distance(tokensRefined[i], stems[i])
        print tokensRefined[i], editDistance, globalTextPosition
import edit_distance
import edit_distance2
import nltk
from random import shuffle
from string import ascii_letters

from libjade import time_init, time_gap  # timing helpers, as in the companion benchmark script

if __name__ == '__main__':
    l = list(ascii_letters[:10])
    s = ''
    for i in xrange(200):
        shuffle(l)
        s += ''.join(l)
    t = ''
    for i in xrange(100):
        shuffle(l)
        t += ''.join(l)
    n = 20
    print len(s), len(t)
    time_init()
    for i in xrange(n):
        b = edit_distance.edit_distance(s, t)
    print time_gap('edit_distance.edit_distance')
    for i in xrange(n):
        a = edit_distance2.edit_distance(s, t)
    print time_gap('edit_distance2.edit_distance')
    for i in xrange(n):
        a = nltk.edit_distance(s, t)
    print time_gap('nltk.edit_distance')
import csv
import os
import random
import re
import string

import MySQLdb
import nltk
import jaro
from unidecode import unidecode


def get(folder, host, user, password, database, incremental_ind):
    def id_generator(size=25, chars=string.ascii_lowercase + string.digits):
        return ''.join(random.choice(chars) for _ in range(size))

    punctuation = "( + ) [ ? : ! . ; ] * # % ` ' / _ = -".split()
    punctuation.append('"')

    ### SETUP MAJOR VARS
    fdmain = folder + "/location_disambiguation/"  # need to figure out what this is
    # separate first (0) and incremental (1) disambiguations
    incremental = incremental_ind

    # Step 1
    mydb = MySQLdb.connect(host, user, password, database)
    cursor = mydb.cursor()
    if incremental == 0:
        increm = ''
    else:
        increm = ' AND (location_id is NULL or location_id = "")'
    print "Step 1..."
    cursor.execute('select distinct country_transformed from rawlocation where country_transformed is not NULL and country_transformed != "" and country_transformed!="s" and country_transformed!="B." and country_transformed!="omitted" ' + increm)
    countries = [item[0] for item in cursor.fetchall() if item[0] is not None]
    print countries
    os.makedirs(fdmain)
    os.makedirs(fdmain + 'uspto_disamb/')
    os.makedirs(fdmain + 'uspto_disamb_counts/')
    os.makedirs(fdmain + 'uspto_disamb_v2/')
    os.makedirs(fdmain + 'uspto_disamb_loc_latlong/')
    os.makedirs(fdmain + 'uspto_disamb_only_loc/')
    for c in countries:
        print c
        datum = {}
        output = open(fdmain + 'uspto_disamb/' + c + '.tsv', 'wb')
        output2 = open(fdmain + 'uspto_disamb_counts/' + c + '.tsv', 'wb')
        outp = csv.writer(output, delimiter='\t')
        outp2 = csv.writer(output2, delimiter='\t')
        cursor.execute("select city,state,country_transformed,count(city) from rawlocation where country_transformed = '" + c + "'" + increm + " group by city,state order by count(city) desc")
        outp2.writerows(cursor.fetchall())
        cursor.execute('select distinct state from rawlocation where country_transformed = "' + c + '"' + increm)
        states = [f[0] for f in cursor.fetchall()]
        for s in states:
            if str(s) == 'None' or str(s) == 'NULL':
                cursor.execute('select id,city from rawlocation where country_transformed = "' + c + '" and (state is NULL or state="NULL")' + increm)
                s = ''
            else:
                s = re.sub('[\n\t\f\r]+', '', s.strip())
                cursor.execute('select id,city from rawlocation where country_transformed = "' + c + '" and state ="' + s + '"' + increm)
            locs = [list(f) for f in cursor.fetchall()]
            for l in locs:
                ll = []
                for l1 in l:
                    if l1:
                        ll.append(re.sub('[\n\t\r\f]+', '', l1.strip()))
                    else:
                        ll.append('')
                outp.writerow(ll + [s, c])
        output.close()
        output2.close()

    print "Step 2..."
    # Step 2
    fd = fdmain + 'uspto_disamb_counts/'
    diri = os.listdir(fd)
    mastdata = {}
    mastdatum = {}
    # This is separate from the for-loop below because otherwise places that
    # are in the wrong file break it
    for d in diri:
        mastdata[d.replace('.tsv', '')] = {}
        mastdatum[d.replace('.tsv', '')] = {}
    for d in diri:
        input = open(fd + d, 'rb')
        inp = csv.reader(input, delimiter='\t')
        try:
            head = inp.next()
            top = int(head[-1])
        except:
            pass
        num = 1
        for i in inp:
            num += 1
        inp = csv.reader(file(fd + d), delimiter='\t')
        for e, i in enumerate(inp):
            if e <= int(num / 3) and int(i[-1]) > int(top / 5):
                city = unidecode(i[0])
                for p in punctuation:
                    city = city.replace(p, '')
                city = re.sub('[0-9]+', '', city)
                city = re.sub('^\s+', '', city)
                city = re.sub('\s+$', '', city)
                city = city.replace(' ', '')
                state = i[1]
                state = re.sub('^\s+', '', state)
                state = re.sub('\s+$', '', state)
                country = i[2]
                key = id_generator(size=12)
                try:
                    gg = mastdata[country][city.lower() + '_' + state.lower()]
                except:
                    mastdata[country][city.lower() + '_' + state.lower()] = [key, i[0].strip(), i[1].strip(), i[2], int(i[3])]
                    mastdatum[country][city.lower()] = [key, i[0], i[1].strip(), i[2].strip(), int(i[3])]
        input.close()

    print "Step 3..."
    # Step 3
    fd = fdmain + 'uspto_disamb/'
    diri = os.listdir(fd)
    for d in diri:
        output = open(fdmain + 'uspto_disamb_v2/' + d, 'wb')
        input = open(fd + d, 'rb')
        outp = csv.writer(output, delimiter='\t')
        inp = csv.reader(input, delimiter='\t')
        data = mastdata[d.replace('.tsv', '')]
        datum = mastdatum[d.replace('.tsv', '')]
        secdata = {}
        secdatum = {}
        for i in inp:
            city = unidecode(i[1])
            state = i[2]
            country = i[3]
            for p in punctuation:
                city = city.replace(p, '')
            city = re.sub('[0-9]+', '', city)
            city = re.sub('^\s+', '', city)
            city = re.sub('\s+$', '', city)
            origcity = city
            city = city.replace(' ', '')
            try:
                gg = data[city.lower() + '_' + state.lower()]
                outp.writerow(i + gg)
            except:
                try:
                    cit = city.lower().split(",")[0]
                    gg = data[cit.lower() + '_' + state.lower()]
                    outp.writerow(i + gg)
                except:
                    try:
                        cit = city.lower().split("/")
                        for cc in cit:
                            gg = data[cc.lower() + '_' + state.lower()]
                            outp.writerow(i + gg)
                            break
                    except:
                        try:
                            cit = city.lower().split("-")
                            for cc in cit:
                                gg = data[cc.lower() + '_' + state.lower()]
                                outp.writerow(i + gg)
                                break
                        except:
                            try:
                                cit = city.lower().split("&")[0]
                                gg = data[cit.lower() + '_' + state.lower()]
                                outp.writerow(i + gg)
                            except:
                                try:
                                    gg = datum[city.lower()]
                                    outp.writerow(i + gg)
                                except:
                                    try:
                                        howdy = 0
                                        for k, v in data.items():
                                            dist = jaro.jaro_winkler_metric((city.lower() + '_' + state.lower()).decode('utf-8', 'ignore'), k.decode('utf-8', 'ignore'))
                                            edit = nltk.edit_distance(city.lower() + '_' + state.lower(), k)
                                            if (re.search(k.split("_")[0], city.lower()) and k.split("_")[0] != '') or dist >= 0.95 or (edit == 2 and len(city.lower()) > 5):
                                                outp.writerow(i + v)
                                                howdy = 1
                                                break
                                        gg = datum[city]
                                    except:
                                        if howdy == 0:
                                            cit = [cc for cc in origcity.lower().split(" ") if len(cc) > 4]
                                            howdy2 = 0
                                            for cc in cit:
                                                try:
                                                    gg = datum[cc]
                                                    outp.writerow(i + gg)
                                                    howdy2 = 1
                                                    break
                                                except:
                                                    pass
                                            if howdy2 == 0:
                                                try:
                                                    gg = secdata[city.lower() + '_' + state.lower()]
                                                    outp.writerow(i + gg)
                                                except:
                                                    try:
                                                        cit = city.lower().split(",")[0]
                                                        gg = secdata[cit.lower() + '_' + state.lower()]
                                                        outp.writerow(i + gg)
                                                    except:
                                                        try:
                                                            cit = city.lower().split("&")[0]
                                                            gg = secdata[cit.lower() + '_' + state.lower()]
                                                            outp.writerow(i + gg)
                                                        except:
                                                            try:
                                                                gg = secdatum[city.lower()]
                                                                outp.writerow(i + gg)
                                                            except:
                                                                try:
                                                                    howdy = 0
                                                                    gg = datum[city]
                                                                except:
                                                                    if howdy == 0:
                                                                        cit = [cc for cc in origcity.lower().split(" ") if len(cc) > 4]
                                                                        howdy2 = 0
                                                                        for cc in cit:
                                                                            try:
                                                                                gg = secdatum[cc]
                                                                                outp.writerow(i + gg)
                                                                                howdy2 = 1
                                                                                break
                                                                            except:
                                                                                pass
                                                                        if howdy2 == 0:
                                                                            key = id_generator(size=12)
                                                                            secdata[city.lower() + '_' + state.lower()] = [key, i[1], i[2], i[3]]
                                                                            secdatum[city.lower()] = [key, i[1], i[2], i[3]]
                                                                            outp.writerow(i + [key, i[1], i[2], i[3]])
        input.close()
        output.close()

    print "Step 4..."
    # Step 4
    fd = fdmain + 'uspto_disamb_v2/'
    fd3 = fdmain + 'uspto_disamb_only_loc/'
    diri = os.listdir(fd)
    for d in diri:
        input = open(fd + d, 'rb')
        output = open(fd3 + d, 'wb')
        inp = csv.reader(input, delimiter='\t')
        outp2 = csv.writer(output, delimiter='\t')
        data = {}
        final = {}
        disamb = {}
        for i in inp:
            try:
                gg = data[' '.join(i[5:])]
                final[i[0]] = i[:4] + [gg] + i[5:]
            except:
                try:
                    data[' '.join(i[5:])] = i[4]
                    final[i[0]] = i
                    disamb[i[4]] = i[4:]
                except:
                    print d, i
        input.close()
        for k, v in disamb.items():
            if len(v) == 5:
                v = v[:-1]
            outp2.writerow(v)
        output.close()
    print "Done Step 1 - 4"
#!/usr/bin/python
import os, nltk

porter = nltk.PorterStemmer()
stopwords = nltk.corpus.stopwords.words('english')
fd = os.popen("awk -F '\t' '{if ($2 ~ / /) print $2}' user-ct-test-collection-01.txt | sort | uniq")
for line in fd:
    # Stem the queries
    tokens = nltk.word_tokenize(line)
    tokens = [w for w in tokens if w.lower() not in stopwords]
    indexWords = [porter.stem(t) for t in tokens]
    # How many chars are extra in the queries
    charSaving = nltk.edit_distance(' '.join(indexWords), line)
    print len(line), charSaving
                i = i + 1
                t_pos_num = t_pos_num + 1
                if i == len(phrase):
                    tf = tf + 1
    print tf
    exit(0)
elif split_words[0] == "similar":
    term = split_words[1]
    sim_dic = {}
    length = len(term)
    for i in range(1, len(uncategorized_wordlist)):
        for word in uncategorized_wordlist[i]:
            if word not in sim_dic:
                sim = nltk.edit_distance(term, word)
                if sim != 0 and sim < length / 2:
                    print word
                    sim_dic[word] = sim
    exit(0)

phrase_list, not_list, not_phrase_list, plain_list = query_extraction(query)
def build_dataset(data_set, home_dir, dir, unique_spam, spam_top_50):
    """Build dataset

    Each data set entry has the fields:
     1.) IP address from the Received field in the header
     2.) Matching degree of domain names between Message-ID and Received/From
     3.) Subject
     4.) Name from the From field
     5.) Content type
     6.) Attachments: none, text, or non-text
     7.) Number of URLs present
     8.) URL ratio
     9.) SPAM word ratio
    10.) SPAM degree as given by the equation in the paper
    11.) Classification label: Spam or Ham
    """
    file_list = os.listdir(home_dir + dir)
    count = len(file_list)
    for file_name in file_list:
        data_set[file_name] = DataMember()
        # Ignore files that start with . and directories/links
        if file_name[0] == '.' or os.path.isdir(home_dir + dir + '/' + file_name) or \
           os.path.islink(home_dir + dir + '/' + file_name):
            continue
        file = open(home_dir + dir + '/' + file_name)
        mail = email.message_from_file(file)
        file.close()
        # Extract information from the header
        for key in mail.keys():
            # 1.) IP address from the Received field in the header
            # Take the IP address of the last Received from field unless it is 127.0.0.1
            if key == 'Received':
                address = re.search('(\d{1,3}\.){3}\d{1,3}', mail[key]).group()
                if address != '127.0.0.1':
                    data_set[file_name].ip_address_str += address + ' '
            # 3.) Subject
            if key == 'Subject':
                data_set[file_name].subject_str = repr(mail[key])[1:-1]
            # 4.) Name from the From field
            if key == 'From':
                data_set[file_name].from_name_str = repr(mail[key])[1:-1]
        # 2.) Matching degree of domain names between Message-ID and From field
        if mail['From'] != None:
            from_domain = re.search('@[\[\]\w+\.]+', mail['From'])
        else:
            from_domain = None
        if str(from_domain) != 'None':
            from_domain = from_domain.group()[1:]
        else:
            # Non-ascii domain name, pull out the hex encoding
            from_domain = repr(mail['From']).replace('\\x', '')
            if from_domain.find('@') == -1:
                from_domain = ' '
            else:
                from_domain = re.search('@[\[\]\w+\.]+', from_domain).group()[1:]
        message_domain = re.search('@[\[\]\w+\.]+', mail['Message-ID'])
        if str(message_domain) != 'None':
            message_domain = message_domain.group()[1:]
        else:
            # Non-ascii domain name, pull out the hex encoding
            message_domain = repr(mail['Message-ID']).replace('\\x', '')
            message_domain = repr(mail['Message-ID']).replace('%', '')
            if message_domain.find('@') == -1:
                message_domain = ' '
            else:
                message_domain = re.search('@[\[\]\w+\.]+', message_domain).group()[1:]
        distance = nltk.edit_distance(from_domain, message_domain)
        domain_len = max(len(from_domain), len(message_domain), 1) * 1.0
        data_set[file_name].degree_domains_match = 1.0 - distance / domain_len
        # Get the length of the message and the text
        length = (get_message_len(mail) * 1.0)
        body = get_message_body(mail)
        # 5.) Content type
        data_set[file_name].type_HTML = get_type_content(mail)
        # 6.) Attachments: none, text, or non-text
        data_set[file_name].attachments = get_type_attachments(mail)
        # 7.) Number of URLs present
        urls = re.findall(
            'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            body)
        data_set[file_name].num_urls = len(urls)
        # 8.) URL ratio (% of message body that is URLs)
        data_set[file_name].percent_urls = len(''.join(urls)) / length
        # 9.) SPAM word ratio and
        # 10.) SPAM degree as given by the equation in the paper
        spam_count = 0
        w1 = 50 / 51.0
        w2 = 1 / 51.0
        freq_spam = 0.0
        s1 = 0.0
        s2 = 0
        body = nltk.clean_html(body)
        words = nltk.word_tokenize(body)
        word_count = max(1, len(words))  # Don't allow divide by zero
        for word in nltk.word_tokenize(body):
            if word in unique_spam:
                # Must be SPAM
                s2 = 1
                spam_count += 1
            elif word in spam_top_50:
                freq_spam += 1.0
                spam_count += 1
        s1 = freq_spam / word_count
        data_set[file_name].percent_spam = spam_count / length
        data_set[file_name].degree_spam = w1 * s1 + w2 * s2
        # 11.) Classification label: Spam or Ham
        if file_name.startswith('ham'):
            data_set[file_name].spam = 1
        else:
            data_set[file_name].spam = 2
        # Fields that need to be md5 encoded: IP address, Subject, and From
        ip_address_md5 = hashlib.md5()
        ip_address_md5.update(data_set[file_name].ip_address_str)
        data_set[file_name].ip_address = int(ip_address_md5.hexdigest(), 16)
        subject_md5 = hashlib.md5()
        subject_md5.update(data_set[file_name].subject_str)
        data_set[file_name].subject = int(subject_md5.hexdigest(), 16)
        from_name_md5 = hashlib.md5()
        from_name_md5.update(data_set[file_name].from_name_str)
        data_set[file_name].from_name = int(from_name_md5.hexdigest(), 16)
    return data_set
print 'Average query length:', len(queryWords) / float(len(queryList))
# Get unique query words
uniqueQueryWords = list(set(queryWords))
print 'Unique queries:', len(list(set(queryList)))
print 'Unique words in queries:', len(uniqueQueryWords)
# What is the percentage of repetitions?
print 'Percent of query reuse:', 100 - len(list(set(queryList))) / float(len(queryList)) * 100
print 'Percent of word reuse:', 100 - len(uniqueQueryWords) / float(len(queryWords)) * 100

# Extracting sessions of queries
orderedDistance = []
for i in range(len(queryList)):
    try:
        orderedDistance.append(nltk.edit_distance(queryList[i], queryList[i + 1]))
    except IndexError:
        continue

# The average distance serves for separating 'query sessions'
sessionSearched = defaultdict(list)
sessionID = 1
averageQueryDistance = numpy.average(orderedDistance)
for i in range(len(queryList)):
    try:
        # Queries below the average distance belong to the same session.
        # NOTE: each session contains only unique queries, so duplicate input queries are lost
        if (nltk.edit_distance(queryList[i], queryList[i + 1]) <= averageQueryDistance):
            # Do not duplicate adding the same query
            try:
                sessionSearched[sessionID].index(queryList[i])
def correct(self, word):
    candidates = self.known([word]) or self.known(self.edits1(word)) or self.knownEdits2(word) or [word]
    sugg = list(candidates)
    sugg.sort(key=lambda s: nltk.edit_distance(word, s))
    return sugg[:min(len(sugg), 10)]