Example #1
from typing import Dict, Iterator

import numpy as np

# `distance` (the Jaro-Winkler distance function named in the docstring)
# and `output` (a logging helper) are assumed to be defined elsewhere.

def levenshtein_distance2(words: Iterator[str], vocabulary: Dict[str, int]):
    """Corrects the words based on Jaro-Winkler distances

    Args:
        words (Iterator[str]): Iterator over the misspelled words
        vocabulary (Dict[str, int]): dictionary holding words and their frequency
    """

    # materialise the vocabulary once instead of on every word
    vocab_list = list(vocabulary)

    for word in words:
        suggestions = []
        distances = [distance(word, vocab) for vocab in vocab_list]
        # indices of the five closest vocabulary entries
        idx = np.array(distances).argsort()[:5]
        
        # among equally distant candidates, prefer the more frequent word
        for i in range(5):
            for j in range(i + 1, 5):
                if distances[idx[i]] == distances[idx[j]]:
                    if vocabulary.get(vocab_list[idx[i]]) < vocabulary.get(vocab_list[idx[j]]):
                        idx[i], idx[j] = idx[j], idx[i]

        for i in idx:
            suggestions.append(vocab_list[i])

        output("{misspelled}\t{corrections}".format(
            misspelled=word,
            corrections="\t".join(suggestions)
        ))  # may cause IO bottleneck
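
A minimal way to exercise this function, assuming `distance` is bound to a string-distance function such as jellyfish.levenshtein_distance and `output` simply prints; neither helper is part of the example itself, so these bindings are hypothetical:

import jellyfish

# Hypothetical bindings for the helpers the function relies on.
distance = jellyfish.levenshtein_distance
output = print

# Toy vocabulary mapping words to corpus frequencies.
vocabulary = {"hello": 120, "help": 95, "hell": 40,
              "held": 12, "helm": 7, "world": 200}

# Prints each misspelled word followed by its five best suggestions,
# tab-separated.
levenshtein_distance2(iter(["helo", "wrold"]), vocabulary)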
Example #2
from typing import Iterator

from spellchecker import SpellChecker  # pyspellchecker

def levenshtein_distance(words: Iterator[str], vocabulary: str):
    """Corrects the words based on Levenshtein distances

    Args:
        words (Iterator[str]): Iterator over the misspelled words
        vocabulary (str): Path to the JSON file holding the vocabulary
    """

    # Create an instance of the spell checker
    spell = SpellChecker()

    # Load our custom-made dictionary
    spell.word_frequency.load_dictionary(vocabulary)

    for word in words:
        # rank candidate corrections by their probability in the dictionary
        suggestions = sorted(spell.candidates(word),
                             key=spell.word_probability, reverse=True)

        output("{misspelled}\t{corrections}".format(
            misspelled=word,
            corrections="\t".join(suggestions[:5])
        ))  # may cause IO bottleneck
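
A usage sketch, assuming pyspellchecker is installed; the file name and contents are illustrative, since vocab.json just needs to map words to frequencies in the JSON format load_dictionary reads:

import json

# Write a toy frequency dictionary for pyspellchecker to load.
with open("vocab.json", "w", encoding="utf-8") as f:
    json.dump({"hello": 120, "help": 95, "world": 200}, f)

output = print  # assumed logging helper, as above
levenshtein_distance(iter(["helo"]), "vocab.json")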
Example #3
from gensim.models.phrases import Phrases

# `read_corpus`, `read_slice`, `output`, and `Timer` are helpers defined
# elsewhere in the module.

def salient_bigrams(phrases: Phrases):
    """Finds the most salient bigrams

    Args:
        phrases (Phrases): gensim Phrases model set up for bigram search
    """
    for slice in read_corpus():
        phrases.add_vocab(read_slice(slice))

        # evaluate all previous corpus slices
        found = set()
        total_bigrams_encountered = 0
        for previous_slice in read_corpus():
            for phrase, score in phrases.export_phrases(
                    read_slice(previous_slice)):
                found.add((phrase, score))
                total_bigrams_encountered += 1
            if previous_slice == slice:
                break

        found = sorted(found, key=lambda element: element[1], reverse=True)

        # no bigrams found?
        if len(found) == 0:
            output(slice, "")

        # log the top ten bigrams
        for phrase, score in found[:10]:
            output(slice, "{phrase}, {score}".format(phrase=phrase,
                                                     score=score))

        # log the total counts
        output(
            slice, """
Total bigrams: {total}
Unique bigrams: {unique}
Median score: {median}
Max score: {max}
Min score: {min}
""".format(total=total_bigrams_encountered,
           unique=len(found),
           median=found[len(found) // 2][1] if len(found) != 0 else 0,
           max=found[0][1] if len(found) != 0 else 0,
           min=found[-1][1] if len(found) != 0 else 0))

        # will log a time if command line args were enabled
        Timer.try_to_time()
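
This example and the next rely on read_corpus, read_slice, and output (plus a Timer helper), none of which are shown. One plausible shape for the first three, assuming a file-per-slice corpus layout; the directory name and tokenisation are guesses:

from pathlib import Path
from typing import Iterator, List

CORPUS_DIR = Path("corpus")  # hypothetical location

def read_corpus() -> Iterator[str]:
    # Yield slice identifiers in a stable order.
    for path in sorted(CORPUS_DIR.glob("*.txt")):
        yield path.stem

def read_slice(slice_name: str) -> Iterator[List[str]]:
    # Yield one tokenised sentence per line, the shape
    # gensim's Phrases.add_vocab expects.
    with open(CORPUS_DIR / (slice_name + ".txt"), encoding="utf-8") as f:
        for line in f:
            yield line.split()

def output(slice_name: str, message: str) -> None:
    print(slice_name, message, sep="\t")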
Example #4
from gensim.models.phrases import Phraser, Phrases

def salient_trigrams(phrases: Phrases):
    """Finds the most salient trigrams

    Args:
        phrases (Phrases): gensim Phrases model set up for bigram search
    """
    trigram = Phrases()

    for slice in read_corpus():
        # grow the bigram model with the current slice, mirroring
        # salient_bigrams above
        phrases.add_vocab(read_slice(slice))

        # transform sentences into possible bigrams
        bigram_phraser = Phraser(phrases)

        def bigrammed(slice: str):
            for sent in read_slice(slice):
                yield bigram_phraser[sent]

        trigram.add_vocab(bigrammed(slice))

        # evaluate all previous corpus slices
        found = set()
        total_trigrams_encountered = 0
        for previous_slice in read_corpus():
            for phrase, score in trigram.export_phrases(
                    bigrammed(previous_slice)):
                if phrase.count(b'_') == 2:
                    found.add((phrase, score))
                    total_trigrams_encountered += 1
                elif b'_' in phrase:
                    # unexpected partial n-grams; print them for inspection
                    print(phrase)
            if previous_slice == slice:
                break

        found = sorted(found, key=lambda element: element[1], reverse=True)

        # no trigrams found?
        if len(found) == 0:
            output(slice, "")

        # log the top ten trigrams
        for phrase, score in found[:10]:
            output(slice, "{phrase}, {score}".format(phrase=phrase,
                                                     score=score))

        # log the total counts
        output(
            slice, """
Total trigrams: {total}
Unique trigrams: {unique}
Median score: {median}
Max score: {max}
Min score: {min}
""".format(total=total_trigrams_encountered,
           unique=len(found),
           median=found[len(found) // 2][1] if len(found) != 0 else 0,
           max=found[0][1] if len(found) != 0 else 0,
           min=found[-1][1] if len(found) != 0 else 0))

        # will log a time if command line args were enabled
        Timer.try_to_time()
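
Putting the two together might look like the following. This assumes the gensim 3.x Phrases API (where add_vocab and export_phrases behave as used above); the min_count and threshold values are illustrative:

from gensim.models.phrases import Phrases

# One model per n-gram level; salient_trigrams builds its own trigram
# model internally on top of the bigram model it is given.
salient_bigrams(Phrases(min_count=5, threshold=10.0))
salient_trigrams(Phrases(min_count=5, threshold=10.0))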