import pkg_resources
from itertools import islice

from symspellpy import SymSpell


def initializeSymspell():
    print("inside initializeSymspell()")
    symspell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    print("symspell created")
    resourceNames = [
        "symspellpy", "frequency_dictionary_en_82_765.txt",
        "frequency_bigramdictionary_en_243_342.txt"
    ]
    dictionaryPath = pkg_resources.resource_filename(resourceNames[0],
                                                     resourceNames[1])
    bigramPath = pkg_resources.resource_filename(resourceNames[0],
                                                 resourceNames[2])
    print("dictionaryPath created")
    symspell.load_dictionary(dictionaryPath, 0, 1)
    symspell.create_dictionary_entry(key='ap', count=500000000)
    print(list(islice(symspell.words.items(), 5)))
    print("symspell.load_ditionary() done")
    symspell.load_bigram_dictionary(bigramPath, 0, 1)
    print(list(islice(symspell.bigrams.items(), 5)))
    print("symspell.load_bigram_ditionary() done")

    # Create vocab
    vocab = set(symspell.words.keys())

    return symspell, vocab
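A minimal usage sketch for the helper above (assuming symspellpy is installed and its bundled English dictionaries are available via pkg_resources; the sample sentence is only illustrative):

symspell, vocab = initializeSymspell()
suggestions = symspell.lookup_compound("memebers of the comittee", max_edit_distance=2)
if suggestions:
    print(suggestions[0].term)  # e.g. "members of the committee"
print("ap" in vocab)            # True, because 'ap' was added as a custom entry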
Example #2
def load_spell_checker():
    """Return spell checker"""
    if not os.path.exists("data/unigrams.txt"):
        sents = [normalize_text(" ".join(x)).split() for x in floresta.sents()]
        sents += [normalize_text(" ".join(x)).split() for x in machado.sents()]
        sents += [
            normalize_text(" ".join(x)).split() for x in mac_morpho.sents()
        ]

        unigrams = [item for sublist in sents for item in sublist]
        unigrams = nltk.probability.FreqDist(unigrams)

        file = open("data/unigrams.txt", "w")
        for k, v in unigrams.items():
            file.write(f"{k} {v}\n")
        file.close()

        bigrams = []

        for sent in sents:
            bigrams += list(nltk.bigrams(sent))

        bigrams = nltk.probability.FreqDist(bigrams)

        file = open("data/bigrams.txt", "w")
        for k, v in bigrams.items():
            file.write(f"{' '.join(k)} {v}\n")
        file.close()

    result = SymSpell()

    result.load_dictionary("data/unigrams.txt", 0, 1)
    result.load_bigram_dictionary("data/bigrams.txt", 0, 2)

    return result
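A brief usage sketch, assuming the NLTK corpora used above (floresta, machado, mac_morpho) are downloaded, a data/ directory exists, and normalize_text is defined elsewhere in the project; the misspelled Portuguese phrase is only illustrative:

checker = load_spell_checker()
suggestions = checker.lookup_compound("eu gosto de lêr livross", max_edit_distance=2)
if suggestions:
    print(suggestions[0].term)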
def symspell_checker(text):
    from symspellpy.symspellpy import SymSpell
    spell = SymSpell()
    spell.load_dictionary(r"frequency_dictionary_en_82_765.txt", 0, 1)
    spell.load_bigram_dictionary(r"frequency_bigramdictionary_en_243_342.txt", 0, 2)
    result = spell.lookup_compound(text, max_edit_distance=2)
    if result:
        return result[0].term
    return text
Example #4
    def __new__(cls):
        if cls._instance is None:
            # SymSpell configuration
            max_edit_distance_dictionary = 3
            prefix_length = 4
            spellchecker = SymSpell(max_edit_distance_dictionary, prefix_length)
            dictionary_path = pkg_resources.resource_filename(
                "symspellpy", "frequency_dictionary_en_82_765.txt")
            bigram_path = pkg_resources.resource_filename(
                "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
            spellchecker.load_dictionary(dictionary_path, term_index=0, count_index=1)
            spellchecker.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)
            cls._instance = spellchecker
        return cls._instance
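A compact sketch of how this singleton __new__ is typically used; the wrapper class name and attribute below are hypothetical, since the example only shows the method (it assumes SymSpell is imported as above):

class SpellCheckerSingleton:          # hypothetical class name for illustration
    _instance = None

    def __new__(cls):                 # shortened; the real body loads both dictionaries as above
        if cls._instance is None:
            cls._instance = SymSpell(3, 4)
        return cls._instance

checker_a = SpellCheckerSingleton()
checker_b = SpellCheckerSingleton()
assert checker_a is checker_b         # the SymSpell object is created only once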
Example #5
def spell_correction(texte):
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = "../ressources/fr-100k.txt"
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
    if not sym_spell.load_dictionary(
            dictionary_path, term_index=0, count_index=1):
        print("Dictionary file not found")
        return
    if not sym_spell.load_bigram_dictionary(
            bigram_path, term_index=0, count_index=2):
        print("Bigram dictionary file not found")
        return
    input_term = texte
    # max edit distance per lookup (per single word, not per whole input string)
    max_edit_distance_lookup = 2
    suggestions = sym_spell.lookup_compound(input_term,
                                            max_edit_distance_lookup)
    # display suggestion term, edit distance, and term frequency
    for suggestion in suggestions:
        print("{}, {}, {}".format(suggestion.term, suggestion.distance,
                                  suggestion.count))
    if suggestions:
        return suggestions[0].term
    else:
        print("error with : ", texte)
        return texte
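A short usage sketch, assuming the French unigram dictionary exists at the relative path used above (the function still loads the English bigram dictionary, as in the original); the misspelled input is only illustrative:

corrected = spell_correction("bonjuor tout le mond")
print(corrected)  # e.g. "bonjour tout le monde"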
Example #6
def init():
    '''Initialize symspellpy, loading the frequency word models
    (dictionary and bigram dictionary).
    '''
    global sym_spell
    max_edit_distance_dictionary = 2
    prefix_length = 7

    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    #sym_spell.load_dictionary(os.path.dirname(os.path.abspath(__file__)) + "/frequency_words_models/pt_frequency_50k.txt", term_index=0, count_index=1)
    sym_spell.load_dictionary(os.path.dirname(os.path.abspath(__file__)) +
                              "/frequency_words_models/fw_pt.txt",
                              term_index=0,
                              count_index=1)
    sym_spell.load_bigram_dictionary(
        os.path.dirname(os.path.abspath(__file__)) +
        "/frequency_words_models/fw_bi_pt.txt",
        term_index=0,
        count_index=2)
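A minimal usage sketch, assuming the Portuguese frequency files referenced above sit next to this module and that os and SymSpell are imported; the input phrase is only illustrative:

init()
suggestions = sym_spell.lookup_compound("istou com fomee", max_edit_distance=2)
if suggestions:
    print(suggestions[0].term)  # e.g. "estou com fome"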
Example #7
def extract_misspellings(s):
    global sym_spell
    if sym_spell is None:
        # Initialize SymSpell checker
        # maximum edit distance per dictionary precalculation
        max_edit_distance_dictionary = 2
        prefix_length = 7
        # create object
        sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

        # load dictionary
        dictionary_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_dictionary_en_82_765.txt")
        bigram_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
        # term_index is the column of the term and count_index is the
        # column of the term frequency
        if not sym_spell.load_dictionary(
                dictionary_path, term_index=0, count_index=1):
            print("Dictionary file not found")

        if not sym_spell.load_bigram_dictionary(
                bigram_path, term_index=0, count_index=2):
            print("Bigram dictionary file not found")

    max_edit_distance_lookup = 2
    suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL

    # Start correcting word by word
    article_text = s.split()
    misspelled = 0
    for word in article_text:
        word = word.strip()
        suggestions = sym_spell.lookup(word, suggestion_verbosity,
                                       max_edit_distance_lookup)
        # Correct the text
        if len(suggestions) == 0:
            continue
        sug = suggestions[0]
        if sug.term != word:
            s = re.sub("\s+" + word + "\s+", " " + sug.term + " ", s)
            misspelled = misspelled + 1
    # guard against empty input to avoid division by zero
    mpw = misspelled / len(article_text) if article_text else 0.0

    return mpw, s
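A usage sketch for the function above, assuming re, pkg_resources, SymSpell, and Verbosity are imported and that the module defines sym_spell = None so the dictionaries are loaded lazily on the first call; the text is only illustrative:

sym_spell = None  # module-level cache assumed by extract_misspellings()
ratio, corrected = extract_misspellings("Ths is a smple sentense")
print(f"misspelled-per-word ratio: {ratio:.2f}")
print(corrected)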
Example #8
def postprocessing(text):
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    # load dictionary
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
    if not sym_spell.load_dictionary(dictionary_path, term_index=0,
                                     count_index=1):
        print("Dictionary file not found")
        return
    if not sym_spell.load_bigram_dictionary(bigram_path, term_index=0,
                                            count_index=2):
        print("Bigram dictionary file not found")
        return

    result = sym_spell.word_segmentation(text.lower())
    return result.corrected_string
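A quick sketch of what word_segmentation does here, assuming the bundled English dictionaries load correctly: it splits run-together lowercase text into the most probable word sequence.

print(postprocessing("thequickbrownfoxjumpsoverthelazydog"))
# expected, roughly: "the quick brown fox jumps over the lazy dog"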
Example #9
def process(input_string):
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
    if not sym_spell.load_dictionary(
            dictionary_path, term_index=0, count_index=1):
        print("Dictionary file not found")
        return
    if not sym_spell.load_bigram_dictionary(
            bigram_path, term_index=0, count_index=2):
        print("Bigram dictionary file not found")
        return

    max_edit_distance_lookup = 2
    suggestion_verbosity = Verbosity.CLOSEST
    suggestions = sym_spell.lookup(input_string, suggestion_verbosity,
                                   max_edit_distance_lookup)
    return [(sug.term, sug.distance, sug.count) for sug in suggestions]
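A usage sketch, assuming SymSpell and Verbosity are imported and the bundled dictionaries are available; the misspelling is only illustrative:

for term, distance, count in process("memebers"):
    print(term, distance, count)  # e.g. ('members', 1, <corpus frequency>)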
Example #10
class Spellchecker:
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 7

    def __init__(self):
        # create object
        self.sym_spell = SymSpell(self.max_edit_distance_dictionary,
                                  self.prefix_length)
        # load dictionary
        dictionary_path = "frequency_dictionary_en_82_765.txt"
        bigram_path = "frequency_bigramdictionary_en_243_342.txt"
        # term_index is the column of the term and count_index is the
        # column of the term frequency
        if not self.sym_spell.load_dictionary(
                dictionary_path, term_index=0, count_index=1):
            print("Dictionary file not found")
            return
        if not self.sym_spell.load_bigram_dictionary(
                bigram_path, term_index=0, count_index=2):
            print("Bigram dictionary file not found")
            return

    def get_correction(self, input_term):
        max_edit_distance_lookup = 2  # must be less than or equal to max_edit_distance_dictionary
        suggestion_verbosity = Verbosity.ALL  # TOP, CLOSEST, or ALL
        suggestions = self.sym_spell.lookup(input_term, suggestion_verbosity,
                                            max_edit_distance_lookup)
        # return the top 3 suggestion terms (or fewer if not enough are found)
        corrected_term = [suggestion.term for suggestion in suggestions]
        return corrected_term[:3]
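A usage sketch for the class above, assuming both frequency files are present in the working directory (the constructor uses bare relative paths):

checker = Spellchecker()
print(checker.get_correction("recieve"))  # up to three suggestion terms, e.g. starting with "receive"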
Example #11
for h in range(len(sntnc)):
    i1.append(' '.join(sntnc[h]))
x_train = i1

with open('x_train.pkl', 'wb') as f:
    pickle.dump(x_train, f)

spellchk = SymSpell(max_dictionary_edit_distance=3, prefix_length=5)
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
bigram_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_bigramdictionary_en_243_342.txt")

spellchk.load_dictionary(dictionary_path, term_index=0, count_index=1)
spellchk.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

normal = []
for sent in tqdm(df['texts']):
    x = str(sent).split()
    for i in range(len(x)):
        w = x[i]
        if not w.isdigit() and w.lower() not in spellchk.words:
            sug = spellchk.lookup(w, Verbosity.TOP, 2)
            if len(sug) > 0:
                corr = sug[0].term
                rep = corr
            else:
                rep = re.sub(r'([\w])\1+', r'\1', str(w))
            w = rep
            x[i] = w
Example #12
df_pos = pd.DataFrame(pd.unique(df_pos[0]).T, columns=['tweet'])
df_pos['sentiment'] = 1
print(df_pos.shape)

df = pd.concat([df_neg, df_pos])

# Load the dictionaries used for spelling correction and instantiate the SymSpell object
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
bigram_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_bigramdictionary_en_243_342.txt")

sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

# Add some expressions to the dictionary
dic_yes = 159595214
list_add_dic = ['lol', 'haha', 'tv', 'xoxo', 'lmao', 'omg', 'url', 'jk', 'rt']

for word in list_add_dic:
    # note: assigning to sym_spell.words only affects membership checks;
    # create_dictionary_entry(word, dic_yes) would also index the word for suggestions
    sym_spell.words[word] = dic_yes

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Small list of stop words to remove from tweets
stop_list = ['user', 'url', 'a', 'an', 'the', 'and', 'of', 'at', 'by']

Example #13
# maximum edit distance per dictionary precalculation
max_edit_distance_dictionary = 0
prefix_length = 7
# create object
sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
# load dictionary
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
bigram_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
# term_index is the column of the term and count_index is the
# column of the term frequency
if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
    print("Dictionary file not found")

if not sym_spell.load_bigram_dictionary(
        bigram_path, term_index=0, count_index=2):
    print("Bigram dictionary file not found")

## ---------------------------------


def clean_and_lemmatize(tweet):
    '''
    Clean and lemmatize a given tweet.
    
    Arg:
        tweet (str): The tweet to clean and lemmatize

    Returns:
        str: The cleaned and lemmatized tweet
    '''
class preprocessing:
    # ======================================================================================================================
    # Remove Contractions (pre-processing)
    # ======================================================================================================================

    def get_contractions(self):
        contraction_dict = {
            "ain't": "is not",
            "aren't": "are not",
            "can't": "cannot",
            "'cause": "because",
            "could've": "could have",
            "couldn't": "could not",
            "didn't": "did not",
            "doesn't": "does not",
            "don't": "do not",
            "hadn't": "had not",
            "hasn't": "has not",
            "haven't": "have not",
            "he'd": "he would",
            "he'll": "he will",
            "he's": "he is",
            "how'd": "how did",
            "how'd'y": "how do you",
            "how'll": "how will",
            "how's": "how is",
            "I'd": "I would",
            "I'd've": "I would have",
            "I'll": "I will",
            "I'll've": "I will have",
            "I'm": "I am",
            "I've": "I have",
            "i'd": "i would",
            "i'd've": "i would have",
            "i'll": "i will",
            "i'll've": "i will have",
            "i'm": "i am",
            "i've": "i have",
            "isn't": "is not",
            "it'd": "it would",
            "it'd've": "it would have",
            "it'll": "it will",
            "it'll've": "it will have",
            "it's": "it is",
            "let's": "let us",
            "ma'am": "madam",
            "mayn't": "may not",
            "might've": "might have",
            "mightn't": "might not",
            "mightn't've": "might not have",
            "must've": "must have",
            "mustn't": "must not",
            "mustn't've": "must not have",
            "needn't": "need not",
            "needn't've": "need not have",
            "o'clock": "of the clock",
            "oughtn't": "ought not",
            "oughtn't've": "ought not have",
            "shan't": "shall not",
            "sha'n't": "shall not",
            "shan't've": "shall not have",
            "she'd": "she would",
            "she'd've": "she would have",
            "she'll": "she will",
            "she'll've": "she will have",
            "she's": "she is",
            "should've": "should have",
            "shouldn't": "should not",
            "shouldn't've": "should not have",
            "so've": "so have",
            "so's": "so as",
            "this's": "this is",
            "that'd": "that would",
            "that'd've": "that would have",
            "that's": "that is",
            "there'd": "there would",
            "there'd've": "there would have",
            "there's": "there is",
            "here's": "here is",
            "they'd": "they would",
            "they'd've": "they would have",
            "they'll": "they will",
            "they'll've": "they will have",
            "they're": "they are",
            "they've": "they have",
            "to've": "to have",
            "wasn't": "was not",
            "we'd": "we would",
            "we'd've": "we would have",
            "we'll": "we will",
            "we'll've": "we will have",
            "we're": "we are",
            "we've": "we have",
            "weren't": "were not",
            "what'll": "what will",
            "what'll've": "what will have",
            "what're": "what are",
            "what's": "what is",
            "what've": "what have",
            "when's": "when is",
            "when've": "when have",
            "where'd": "where did",
            "where's": "where is",
            "where've": "where have",
            "who'll": "who will",
            "who'll've": "who will have",
            "who's": "who is",
            "who've": "who have",
            "why's": "why is",
            "why've": "why have",
            "will've": "will have",
            "won't": "will not",
            "won't've": "will not have",
            "would've": "would have",
            "wouldn't": "would not",
            "wouldn't've": "would not have",
            "y'all": "you all",
            "y'all'd": "you all would",
            "y'all'd've": "you all would have",
            "y'all're": "you all are",
            "y'all've": "you all have",
            "you'd": "you would",
            "you'd've": "you would have",
            "you'll": "you will",
            "you'll've": "you will have",
            "you're": "you are",
            "you've": "you have",
            "nor": "not",
            "nt": "not"
        }

        contraction_re = re.compile('(%s)' % '|'.join(contraction_dict.keys()))
        return contraction_dict, contraction_re

    def replace_contractions(self, text):
        contractions, contractions_re = self.get_contractions()

        def replace(match):
            return contractions[match.group(0)]

        return contractions_re.sub(replace, text)

    whitelist = ["not",
                 'nor']  # Keep the words "n't" and "not", 'nor' and "nt"
    stopwords_verbs = [
        'say', 'get', 'go', 'know', 'may', 'need', 'make', 'see', 'want',
        'come', 'take', 'use', 'would', 'can'
    ]
    stopwords_other = [
        'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
        'nine', 'ten', 'may', 'also', 'across', 'among', 'beside', 'yet',
        'within', 'mr', 'bbc', 'image', 'getty', 'de', 'en', 'caption',
        'copyright', 'something'
    ]
    # further filter stopwords
    more_stopwords = [
        'tag', 'wait', 'set', 'put', 'add', 'post', 'give', 'way', 'check',
        'think', 'www', 'must', 'look', 'call', 'minute', 'com', 'thing',
        'much', 'happen', 'quaranotine', 'day', 'time', 'week', 'amp', 'find',
        'BTu'
    ]
    stop_words = set(
        list(stopwords.words('english')) + ['"', '|'] + stopwords_verbs +
        stopwords_other + more_stopwords)

    # Happy Emoticons
    emoticons_happy = {
        ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)',
        ':}', ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D',
        '=D', '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P',
        ':P', 'X-P', 'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)',
        '>;)', '>:-)', '<3'
    }

    # Sad Emoticons
    emoticons_sad = {
        ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
        ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\',
        ':-c', ':c', ':{', '>:\\', ';('
    }

    # Emoji patterns
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE)

    # Combine sad and happy emoticons
    emoticons = emoticons_happy.union(emoticons_sad)

    def strip_links(self, text):
        all_links_regex = re.compile(r'http\S+|www\.\S+', re.DOTALL)
        text = re.sub(all_links_regex, '', text)
        '''
        link_regex = re.compile('((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', re.DOTALL)
        links = re.findall(link_regex, text)
        for link in links:
            text = text.replace(link[0], ', ')
        '''
        return text

    def remove_punctuation(self, text):
        text = re.sub(r'@\S+', '', text)  # Delete Usernames
        #text = re.sub(r'#quarantine', '', text)  # Replace hashtag quarantine with space, as it was used for data scraping
        text = re.sub(r'#', '', text)  # Delete the hashtag sign

        # remove punctuation from each word (Replace hashtags with space, keeping hashtag context)
        for separator in string.punctuation:
            if separator not in ["'"]:
                text = text.replace(separator, '')

        return text

    # convert POS tag to wordnet tag in order to use in lemmatizer
    def get_wordnet_pos(self, treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return ''

    # function for lemmatizing
    def lemmatizing(self, tokenized_text):
        lemmatizer = WordNetLemmatizer()
        lemma_text = []

        # annotate words with Part-of-Speech tags, format: ((word1, pos_tag), (word2, pos_tag), ...)
        word_pos_tag = pos_tag(tokenized_text)
        #print("word_pos_tag", word_pos_tag)

        for word_tag in word_pos_tag:  # word_tag[0]: word, word_tag[1]: tag
            # Lemmatizing each word with its POS tag, in each sentence
            if self.get_wordnet_pos(
                    word_tag[1]
            ) != '':  # if the POS tagger annotated the given word, lemmatize the word using its POS tag
                if self.only_verbs_nouns:  # if the only_verbs_nouns is True, get only verbs and nouns
                    if self.get_wordnet_pos(
                            word_tag[1]) in [wordnet.NOUN, wordnet.VERB]:
                        lemma = lemmatizer.lemmatize(
                            word_tag[0], self.get_wordnet_pos(word_tag[1]))
                    else:  # if word non noun or verb, then return empty string
                        lemma = ''
                else:  # if only_verbs_nouns is disabled (False), keep all words
                    lemma = lemmatizer.lemmatize(
                        word_tag[0], self.get_wordnet_pos(word_tag[1]))
            else:  # if the post tagger did NOT annotate the given word, lemmatize the word WITHOUT POS tag
                lemma = lemmatizer.lemmatize(word_tag[0])
            lemma_text.append(lemma)
        return lemma_text

    # function for stemming
    def stemming(self, tokenized_text):
        # stemmer = PorterStemmer()
        stemmer = SnowballStemmer("english")
        stemmed_text = []
        for word in tokenized_text:
            stem = stemmer.stem(word)
            stemmed_text.append(stem)
        return stemmed_text

    # function to keep only alphabetic characters
    def only_alpha(self, tokenized_text):
        text_alpha = []
        for word in tokenized_text:
            word_alpha = re.sub('[^a-z A-Z]+', ' ', word)
            text_alpha.append(word_alpha)
        return text_alpha

    # configure options (e.g., whether to use the spell corrector) when the class object is created
    def __init__(self,
                 convert_lower=True,
                 use_spell_corrector=False,
                 only_verbs_nouns=False):
        """
        :param convert_lower: whether to convert to lower case or not
        :param use_spell_corrector: boolean to select whether to use spell corrector or not
        :param only_verbs_nouns: whether to filter words to keep only verbs and nouns
        """

        # set boolean to select whether to use spell corrector or not
        self.use_spell_corrector = use_spell_corrector

        # set boolean to select whether to convert text to lower case
        self.convert_lower = convert_lower

        # whether to filter words to keep only verbs and nouns
        self.only_verbs_nouns = only_verbs_nouns

        if self.use_spell_corrector:
            # maximum edit distance per dictionary precalculation
            # count_threshold: the least amount of word frequency to confirm that a word is an actual word
            self.sym_spell = SymSpell(max_dictionary_edit_distance=2,
                                      count_threshold=10,
                                      prefix_length=7)

            # load dictionary
            dictionary_path = pkg_resources.resource_filename(
                "symspellpy", "frequency_dictionary_en_82_765.txt")
            bigram_path = pkg_resources.resource_filename(
                "symspellpy", "frequency_bigramdictionary_en_243_342.txt")

            # term_index is the column of the term and count_index is the column of the term frequency
            if not self.sym_spell.load_dictionary(
                    dictionary_path, term_index=0, count_index=1):
                print("Dictionary file not found")
            if not self.sym_spell.load_bigram_dictionary(
                    bigram_path, term_index=0, count_index=2):
                print("Bigram dictionary file not found")

            # paths for custom dictionaries
            custom_unigram_dict_path = '../dataset/sym_spell-dictionaries/unigram_twitter_posts_dict.csv'
            custom_bigram_dict_path = '../dataset/sym_spell-dictionaries/bigram_twitter_posts_dict.csv'

            # add custom dictionaries (uni-gram + bi-gram)
            if not self.sym_spell.load_dictionary(
                    custom_unigram_dict_path, term_index=0, count_index=1):
                print("Custom uni-gram dictionary file not found")
            if not self.sym_spell.load_bigram_dictionary(
                    custom_bigram_dict_path, term_index=0, count_index=2):
                print("Custom bi-gram dictionary file not found")

            # add words from the post we scraped from Twitter/Instagram
            #for word, frequency in corpus_freq:
            #self.sym_spell.create_dictionary_entry(word, frequency)

            #self.sym_spell._distance_algorithm = DistanceAlgorithm.LEVENSHTEIN

    # spell check phrases and correct them
    def spell_corrector(self, post_text):
        # lookup suggestions for multi-word input strings (supports compound splitting & merging)
        # max edit distance per lookup (per single word, not per whole input string)
        # max_edit_distance_lookup <= max_edit_distance_dictionary
        # ignore_non_words : determine whether numbers and acronyms are left alone during the spell checking process
        #        suggestions = self.sym_spell.lookup_compound(post_text, max_edit_distance=2, ignore_non_words=True, transfer_casing=True)  # keep original casing

        # Verbosity: TOP, CLOSEST, ALL
        corrected_posts = []
        for post in post_text:
            suggestions = self.sym_spell.lookup(post,
                                                Verbosity.CLOSEST,
                                                max_edit_distance=2,
                                                include_unknown=True,
                                                transfer_casing=True)
            corrected_posts.append(suggestions[0].term)

        # print(post_text)
        # print(corrected_posts)
        # print(suggestions[0].term)

        # return the most probable (first) recommendation for each token
        return corrected_posts  # suggestions[0].term

    # Method to clean tweets and instagram posts
    def clean_text(self, text):
        # remove entities and links
        text = self.remove_punctuation(self.strip_links(text))

        # convert text to lower case
        if self.convert_lower:
            text = text.lower()

        # remove emails
        text = re.sub(r'\S*@\S*\s?', '', text)

        # remove rt and via in case of tweet data
        text = re.sub(r"\b( rt|RT)\b", "", text)
        text = re.sub(r"\b( via|VIA)\b", "", text)
        text = re.sub(r"\b( it|IT)\b", "", text)
        text = re.sub(r"\b( btu|BTu)\b", "", text)
        text = re.sub(r"\b( bt |BT )\b", "", text)

        # remove repost in case of instagram data
        text = re.sub(r"\b( repost|REPOST)\b", "", text)

        # format contractions without apostrophe in order to use for contraction replacement
        text = re.sub(r"\b( s| 's)\b", " is ", text)
        text = re.sub(r"\b( ve| 've)\b", " have ", text)
        text = re.sub(r"\b( nt| 'nt| 't)\b", " not ", text)
        text = re.sub(r"\b( re| 're)\b", " are ", text)
        text = re.sub(r"\b( d| 'd)\b", " would ", text)
        text = re.sub(r"\b( ll| 'll)\b", " will ", text)
        text = re.sub(r"\b( m| 'm)\b", " am", text)

        # replace consecutive non-ASCII characters with a space
        text = re.sub(r'[^\x00-\x7F]+', ' ', text)

        # remove emojis from text
        text = self.emoji_pattern.sub(r'', text)

        # substitute contractions with full words
        text = self.replace_contractions(text)

        # tokenize text
        tokenized_text = word_tokenize(text)

        # remove all non-alphabetic characters
        tokenized_text = self.only_alpha(tokenized_text)

        #print("tokenized_text", tokenized_text)

        # correct the spelling of the text - need full sentences (not tokens)
        if self.use_spell_corrector:
            tokenized_text = self.spell_corrector(tokenized_text)

        # lemmatize / stem words
        tokenized_text = self.lemmatizing(tokenized_text)
        # text = stemming(tokenized_text)

        filtered_text = []
        # looping through conditions
        for word in tokenized_text:
            word = word.strip()
            # check tokens against stop words, emoticons and punctuations
            # biggest english word: Pneumonoultramicroscopicsilicovolcanoconiosis (45 letters)
            if (word not in self.stop_words and word not in self.emoticons
                    and word not in string.punctuation and not word.isspace()
                    and len(word) > 2
                    and len(word) < 46) or word in self.whitelist:
                # print("word", word)
                filtered_text.append(word)

        #print("filtered_text 2", filtered_text)

        return filtered_text
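A usage sketch for the preprocessing class, assuming the NLTK resources it relies on (punkt, stopwords, wordnet, averaged_perceptron_tagger) are downloaded and the imports used throughout (re, string, the nltk helpers, WordNetLemmatizer, symspellpy) are in scope; the custom Twitter/Instagram dictionaries are only needed when use_spell_corrector=True:

pre = preprocessing(convert_lower=True, use_spell_corrector=False)
tokens = pre.clean_text("Check out https://example.com!! I'm sooo haaappy :) #quarantine2020")
print(tokens)  # cleaned, lemmatized tokens with stop words and emoticons removed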
Example #15
class SymDeletingTypoCorrecter(Module):
    def __init__(self, max_edit_dist: int = 2, prefix_length: int = 10):
        self.symspell = SymSpell(max_dictionary_edit_distance=max_edit_dist,
                                 prefix_length=prefix_length)
        self.max_edit_dist = max_edit_dist

    def train(self,
              corpus_path: str,
              save_path: str,
              unigram_dict_prefix: str,
              bigram_dict_prefix: str = None,
              **kwargs):
        self.symspell.create_dictionary(corpus_path)
        # 1) Unigram dict
        worddict = ''
        for key, count in self.symspell.words.items():
            worddict += '{} {}\n'.format(''.join(flat_hangeul(key)), count)

        unigram_save_path = os.path.join(save_path,
                                         unigram_dict_prefix + '.txt')
        with open(unigram_save_path, 'w', encoding='utf-8') as file:
            file.write(worddict)
        print("Total {} Unigrams are saved!".format(
            len(self.symspell.words.items())))

        if bigram_dict_prefix:
            # 2) Bigram dict
            with open(corpus_path, 'r', encoding='utf-8') as file:
                corpus = file.readlines()
            corpus = [s.strip() for s in corpus]

            bi_count = self.count_bigrams(corpus, min_count=5)

            bi_dict = ''
            for key, count in bi_count.items():
                s1, s2 = key.split(' ')
                bi_dict += '{} {} {}\n'.format(''.join(flat_hangeul(s1)),
                                               ''.join(flat_hangeul(s2)),
                                               count)

            bigram_save_path = os.path.join(save_path,
                                            bigram_dict_prefix + '.txt')
            with open(bigram_save_path, 'w', encoding='utf-8') as biFile:
                biFile.write(bi_dict)
            print("Total {} bigrams are saved!".format(len(bi_count)))

    def load_model(self,
                   unigram_dict_path: str,
                   bigram_dict_path: str = None,
                   **kwargs):
        try:
            here = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
            default_path = os.path.join(here, "resources",
                                        'default_uni_dict.txt')

            self.symspell.load_dictionary(default_path,
                                          term_index=0,
                                          count_index=1)
            self.symspell.load_dictionary(unigram_dict_path,
                                          term_index=0,
                                          count_index=1)
        except ValueError:
            raise ValueError("Specified unigram dictionary path does not exist")

        if bigram_dict_path:
            try:
                self.symspell.load_bigram_dictionary(bigram_dict_path,
                                                     term_index=0,
                                                     count_index=2)
            except ValueError:
                raise ValueError("Specified bigram dictionary path not exist")

    def infer(self, word: Text, **kwargs):
        suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
        suggestions = self.symspell.lookup(''.join(flat_hangeul(word)),
                                           suggestion_verbosity,
                                           self.max_edit_dist)
        if suggestions:
            word = list(suggestions[0].term)
            return merge_flatted_hangeul(word)
        return word

    @staticmethod
    def count_bigrams(corpus: list, min_count: int):
        bigrams = []
        for t in tqdm(corpus):
            if t.__class__ != str:
                continue
            else:
                text = t.split(' ')
                _bigrams = zip(*[text[i:] for i in range(2)])
                bigrams += [' '.join(s) for s in list(_bigrams)]

        count = Counter(bigrams)
        new_dict = {}
        for key, value in count.items():
            if value >= min_count:
                new_dict[key] = value
        return new_dict
class WordSimilarity:
    def __init__(self, spell):
        max_edit_distance_dictionary = 2
        prefix_length = 7
        self.sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
        dictionary_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_dictionary_en_82_765.txt")
        bigram_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
        if not self.sym_spell.load_dictionary(
                dictionary_path, term_index=0, count_index=1):
            print("Dictionary file not found")
            return
        if not self.sym_spell.load_bigram_dictionary(
                bigram_path, term_index=0, count_index=2):
            print("Bigram dictionary file not found")
            return

        self.nlp = spacy.load(
            "shop_recognizer/semantic_detector/models/en_core_web_lg")
        self.spell = spell

    def checkSemanticSimilarity(self, labels, words):
        result = {}
        texts = self.removeNoise2(words)
        for label in labels:
            tmp = ""
            doc1 = self.nlp(label)
            for text in texts:
                tmp += text + " "
            doc2 = self.nlp(tmp)
            score = doc2.similarity(doc1)
            result[label] = int(score * 100)
        prob = self.softmax(labels, result)
        counter = 0
        for cls in labels:
            if len(words):
                result[cls] = float(prob[counter])
                counter = counter + 1
            else:
                result[cls] = 0
        return result

    def checkSemanticSimilarity2(self, labels, words):
        result = {}
        texts = self.removeNoise2(words)
        for label in labels:
            tmp = 0
            doc1 = self.nlp(label)
            for text in texts:
                doc2 = self.nlp(text)
                similarity = doc2.similarity(doc1)
                if similarity > tmp:
                    tmp = similarity
            result[label] = int(tmp * 100)
        prob = self.softmax(labels, result)
        counter = 0
        for cls in labels:
            if len(words):
                result[cls] = float(prob[counter])
                counter = counter + 1
            else:
                result[cls] = 0
        return result

    def removeNoise(self, words):
        result = []
        for word in words:
            if len(word) > 2 and (word.isdigit() is False):
                if (word in self.nlp.Defaults.stop_words):
                    continue
                else:
                    newWord = self.spell.correction(word)
                    if self.nlp.vocab.has_vector(newWord):
                        result.append(newWord)
        return result

    def removeNoise2(self, words):
        result = []
        for word in words:
            if len(word) > 2 and (word.isdigit() is False):
                newWord = self.correct(word)
                newWord = newWord.replace(" ", "")
                result.append(newWord)
        return result

    def correct(self, word):
        input_term = word
        max_edit_distance_lookup = 2
        suggestions = self.sym_spell.lookup_compound(input_term,
                                                     max_edit_distance_lookup)
        return suggestions[0].term

    def softmax(self, classes, scores):
        values = [scores[cls] for cls in classes]
        ex = np.exp(values)
        return ex / np.sum(ex)