def levenshtein_distance2(words: Iterator[str], vocabulary: Dict[str, int]):
    """Corrects the words based on their distance to every vocabulary word

    Each vocabulary entry is scored against the misspelled word with
    ``distance``. The (up to) five lowest-scoring entries are written through
    ``output`` as one tab-separated line per word, the misspelled word first.
    Entries with the same distance are ordered by descending frequency.

    NOTE(review): the function name says Levenshtein while the previous
    docstring said JaroWinkler; the metric actually used is whatever
    ``distance`` resolves to in this module - confirm.

    Args:
        words (Iterator[str]): Iterator over the misspelled words
        vocabulary (Dict[str, int]) : dictionary holding words and their
            frequency
    """
    max_suggestions = 5
    # The candidate list is the same for every word, so build it only once.
    candidates = list(vocabulary)

    for word in words:
        scores = {
            candidate: distance(word, candidate) for candidate in candidates
        }
        # A single sort on (distance, -frequency) replaces the argsort plus
        # manual swap pass. Slicing afterwards works for vocabularies with
        # fewer than five entries (the fixed range(5) loops raised IndexError)
        # and applies the frequency tie-break before the cut-off rather than
        # only among five arbitrarily chosen candidates.
        ranked = sorted(
            candidates,
            key=lambda candidate: (scores[candidate], -vocabulary[candidate]),
        )
        suggestions = ranked[:max_suggestions]

        output("{misspelled}\t{corrections}".format(
            misspelled=word,
            corrections="\t".join(suggestions)
        ))  # may cause IO bottleneck
def levenshtein_distance(words: Iterator[str], vocabulary: str):
    """Corrects the words based on Levenshtein distances

    For every misspelled word the candidates proposed by ``SpellChecker`` are
    ranked by ``word_probability`` (most probable first) and the best five are
    written through ``output`` as one tab-separated line, the misspelled word
    first.

    Args:
        words (Iterator[str]): Iterator over the misspelled words
        vocabulary (str) : Path to the json file holding the vocabulary
    """
    max_suggestions = 5

    # Create instance of spellchecker
    # NOTE(review): the checker is built with its default arguments, so any
    # dictionary it loads by default stays active next to the custom one -
    # confirm that this is intended.
    spell = SpellChecker()
    # Load our custom made dictionary
    spell.word_frequency.load_dictionary(vocabulary)

    for word in words:
        # Recent pyspellchecker releases return None instead of a set when no
        # candidate exists; fall back to an empty tuple so the word is still
        # logged (with no corrections) instead of sorted() raising TypeError.
        candidates = spell.candidates(word) or ()
        suggestions = sorted(
            candidates, key=spell.word_probability, reverse=True)
        output("{misspelled}\t{corrections}".format(
            misspelled=word,
            corrections="\t".join(suggestions[:max_suggestions])
        ))  # may cause IO bottleneck
def salient_bigrams(phrases: Phrases):
    """Finds the most salient bigrams

    Walks the slices yielded by ``read_corpus()``. Each slice is added to
    ``phrases``; then every slice up to and including the current one is
    passed to ``phrases.export_phrases`` and the distinct (phrase, score)
    pairs are collected. For each slice the ten highest-scoring pairs and a
    summary line are written through ``output``.

    Args:
        phrases (Phrases): Phrases class set up for bigram search
    """
    for current_slice in read_corpus():
        phrases.add_vocab(read_slice(current_slice))

        # evaluate all previous corpus slices
        found = set()
        total_bigrams_encountered = 0
        for previous_slice in read_corpus():
            for phrase, score in phrases.export_phrases(
                    read_slice(previous_slice)):
                found.add((phrase, score))
                total_bigrams_encountered += 1
            # stop once the current slice itself has been evaluated
            if previous_slice == current_slice:
                break

        # best score first
        ranked = sorted(found, key=lambda element: element[1], reverse=True)

        # no bigrams found?
        if not ranked:
            output(current_slice, "")

        # log the top ten bigrams
        for phrase, score in ranked[:10]:
            output(current_slice,
                   "{phrase}, {score}".format(phrase=phrase, score=score))

        # log the total counts
        # The median/max/min entries are whole (phrase, score) pairs, or 0
        # when nothing was found. The empty-case guard belongs to the ``min``
        # value only: it used to wrap the entire formatted string, so an empty
        # result logged the bare integer 0 instead of the summary.
        summary = """ Total bigrams: {total} Unique bigrams: {unique} Median score:{median} Max score:{max} Min score:{min} """.format(
            total=total_bigrams_encountered,
            unique=len(ranked),
            median=ranked[len(ranked) // 2] if ranked else 0,
            max=ranked[0] if ranked else 0,
            min=ranked[-1] if ranked else 0,
        )
        output(current_slice, summary)

        # will log a time if command line args were enabled
        Timer.try_to_time()
def salient_trigrams(phrases: Phrases):
    """Finds the most salient trigrams

    Walks the slices yielded by ``read_corpus()``. Each slice is added to the
    bigram model ``phrases``; its sentences are then rewritten with a
    ``Phraser`` built from that model and fed to a second ``Phrases``
    instance. Every slice up to and including the current one is passed to
    ``export_phrases`` of that second model, and the distinct (phrase, score)
    pairs whose phrase contains exactly two ``_`` are collected. For each
    slice the ten highest-scoring pairs and a summary line are written
    through ``output``.

    Args:
        phrases (Phrases): Phrases class set up for bigram search
    """
    trigram = Phrases()
    for current_slice in read_corpus():
        # prepare the bigram
        # The slice is added exactly once. The earlier loop re-added the
        # current slice once per preceding slice, which multiplied its counts
        # by its position in the corpus.
        phrases.add_vocab(read_slice(current_slice))

        # transform sentences into possible bigrams
        bigram_phraser = Phraser(phrases)

        def bigrammed(slice_name: str):
            """Yield the sentences of a slice after bigram merging."""
            for sent in read_slice(slice_name):
                yield bigram_phraser[sent]

        trigram.add_vocab(bigrammed(current_slice))

        # evaluate all previous corpus slices
        found = set()
        total_trigrams_encountered = 0
        for previous_slice in read_corpus():
            for phrase, score in trigram.export_phrases(
                    bigrammed(previous_slice)):
                if phrase.count(b'_') == 2:
                    found.add((phrase, score))
                    total_trigrams_encountered += 1
                # ``phrase`` is bytes here (the count above takes a bytes
                # argument), so the membership test must use bytes as well;
                # testing with the str '_' raised TypeError.
                elif b'_' in phrase:
                    print(phrase)
            # stop once the current slice itself has been evaluated
            if previous_slice == current_slice:
                break

        # best score first
        ranked = sorted(found, key=lambda element: element[1], reverse=True)

        # no trigrams found?
        if not ranked:
            output(current_slice, "")

        # log the top ten trigrams
        for phrase, score in ranked[:10]:
            output(current_slice,
                   "{phrase}, {score}".format(phrase=phrase, score=score))

        # log the total counts
        # The median/max/min entries are whole (phrase, score) pairs, or 0
        # when nothing was found. The label used to read "Mean score" although
        # the value is the middle element of the ranked list.
        summary = """ Total trigrams: {total} Unique trigrams: {unique} Median score:{median} Max score:{max} Min score:{min} """.format(
            total=total_trigrams_encountered,
            unique=len(ranked),
            median=ranked[len(ranked) // 2] if ranked else 0,
            max=ranked[0] if ranked else 0,
            min=ranked[-1] if ranked else 0,
        )
        output(current_slice, summary)

        # will log a time if command line args were enabled
        Timer.try_to_time()