Example #1
def spell_correcter(tokenized_tweets):
    from ekphrasis.classes.spellcorrect import SpellCorrector

    spell_corrector = SpellCorrector(corpus="english")

    # spell-correct every whitespace-separated token of every tweet
    return tokenized_tweets.apply(
        lambda tweet:
        [spell_corrector.correct(word) for word in tweet.split(" ")])
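
A minimal usage sketch for the function above, assuming tokenized_tweets is a pandas Series of space-separated tweet strings (the variable names and sample tweets are made up):

import pandas as pd

tweets = pd.Series(["thaks for the folow", "gr8 day"])
corrected = spell_correcter(tweets)
# each element is now a list of spell-corrected tokens
print(corrected.tolist())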
Example #2
def test_spell_correct():
    from ekphrasis.classes.spellcorrect import SpellCorrector
    sp = SpellCorrector(corpus="english")
    print(sp.correct("Thaaaanks"))
Example #3
    CASS_fn: the file name of a pickle that stores the output Correction_All_Sentences_Scores
output:
    Correction_All_Sentences_Scores: a list (indices 0..3) of lists of
        (original_sentence, original_score, revised_sentence, revised_toxic_score, correct_word,
        new_word_list, correction_word_list, correction_score, corrected_sentence), where
        correction_word_list is a list of (wrong_word, suggested_word) pairs.
note:
    The input sentences are pre-processed, so that punctuation is either absent or
        separated from the words.
'''

from ekphrasis.classes.spellcorrect import SpellCorrector

ASSF_fn = 'input/All_Sentences_Scores_Filtered.pickle'
CASS_fn = "output/Correction_All_Sentences_Scores.pickle"

sp = SpellCorrector(corpus="english")
# wrap the corrector so it can be passed around as a plain word -> word function
ekphrasis_word_correction_func = sp.correct

Correction_All_Sentences_Scores = eval_spelling_correction_perspective(
    ASSF_fn, CASS_fn, word_correction_func=ekphrasis_word_correction_func)
'''
2018.5.20
Plot correction effects
'''

CASS_fn = "output/Correction_All_Sentences_Scores.pickle"
plot_correction_effects(CASS_fn)
'''
Calculate
1. accuracy
2. score distribution
'''
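
The docstring above implies word-level correction over pre-tokenized sentences, so word_correction_func can be applied token by token. A minimal sketch of that step, where correct_sentence is a hypothetical helper (not the project's eval_spelling_correction_perspective):

def correct_sentence(sentence, word_correction_func):
    # the sentences are pre-processed, so punctuation is already split off
    words = sentence.split()
    suggestions = [word_correction_func(w) for w in words]
    # (wrong_word, suggested_word) pairs, as in correction_word_list above
    correction_word_list = [(w, s) for w, s in zip(words, suggestions) if w != s]
    corrected_sentence = ' '.join(suggestions)
    return corrected_sentence, correction_word_list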
Example #4
import time

import numpy as np
import enchant
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
from ekphrasis.classes.segmenter import Segmenter
from ekphrasis.classes.spellcorrect import SpellCorrector


def preprocess_corpus(corpus, stemming=False,
                      all_smilies=False, pos_smilies=False, neg_smilies=False, other_smilies=False,
                      hugs_and_kisses=False, hearts=False,
                      hashtag=False, hashtag_mention=False,
                      numbers=False, number_mention=False,
                      exclamation=False,  #NB: this one is not tested yet; consider just removing it
                      set_to_not=False,
                      segmentation_hash=False,
                      spelling=False,
                      elongation=False,
                      remove_signs=False
                      ):
    """ Function used to apply preprocessing
    Input:
        corpus: a corpus on the format as the output in creat_corpus. Default False. 
        all_smilies: if true, same effect as if pos_smilies, neg_smilies, and other_smilies were true.Default False.
        pos_smilies: if true, positive smilies such as : ), : (, ; ), ( ;, :p, ;p, : p, are replaced by "possmiley.Default False.
        neg_smilies: if true, negative smilies such as : (, ) : are replaced by "negsmiely".Default False.
        other_smilies: if true, smilies such as ^_^ are replaced by a describing word.Default False. 
        hugs_and_kisses: if true, words such as xxx xoxo etc are replaced by "kisses" or "hug" and "kisses". Default False.
        hearts: if true, "<3" are replaced by "heart".Default False.
        hashtags: if true, hashtags are removed from the beginning of words, so #apple becomes apple.Default False. 
        hashtag_mention: if true, and if hashtag is true, the word "hashatag" is added at the end of a tweet that used to contain
            one or more words beginning with a hashtag. Default False.
        numbers: if true, words that are purely numbers are removed.Default False.
        number_mention: if true, and if number is true, the word "thereisanumber" is added at the end of a tweet that used 
            to contain one or more words that were purely numbers. Default False.
        exclamation: if true, the word "exclamation" is added at the end of a tweet that contain one or more "!".Default False. 
        set_to_not: if true, all words ending with "n't" is replaced by not.Default False. 
        segmentation_hash: if true, words starting with # that do not appear in the english dictionary is split into segments, 
            eg '#iammoving' becomes 'i am moving'. Default False.
        spelling: if true, all words that are not a part of the english dictionary is set to the most likely word,
            within two alterations. Default False.
        elongation: if true, the length of all sequences of letters in words that are not a part of the English dictionary 
            is set to max 2. Before words that are altered because of this, the word 'elongation' appears. Default False.
        remove_signs: if true, signs such as ",", ".", ":", ";", "-", are removed. Default False.
    
    Output:
        new_corpus: a new corpus, on same format as the input corpus. 
    """
   
    start = time.time()
    
    #initialising the new corpus:
    new_corpus=[]

    #Want to split the tweets using this tokenizer:
    tknzr = TweetTokenizer(reduce_len=True)
    
    
    
    if stemming:
        ps = PorterStemmer()
    
    if segmentation_hash or spelling or elongation:
        #English dictionary used to decide which words need special treatment
        d = enchant.Dict("en_US")
    
    if segmentation_hash: 
        #seg = Segmenter(corpus="english")
        seg = Segmenter(corpus="twitter")

    if spelling: 
        sp = SpellCorrector(corpus="english")
        
    
    elapsed = time.time()
    print("Time in min before starting first for loop:", (elapsed - start) / 60 )
    
    #Want to go though each line (tweet) in the corpus
    for k, line in enumerate(corpus):
        
        
        if hashtag_mention:
            there_is_hashtag=False
        if number_mention:
            there_is_number=False
        if exclamation:
            there_is_exclamation=False
            
        #Splitting the tweet using the chosen tokenizer. 
        words=tknzr.tokenize(line)
        #Initializing for cleaned_tweet:
        cleaned_tweet=[]
        
        for i, word in enumerate(words):
            #Indicating that the word has not been treated yet
            word_not_treated=True
            end_=len(words)-1
            #Positive smilies (e.g. ": )", "( :", ":d", ";p", "xd") are replaced by "smile"
            if ((pos_smilies or all_smilies) and word_not_treated):
                if (i>0 and (word=='d' and (words[i-1]==':' or words[i-1]==';'))) or word == ':d' or word == ';d':
                    cleaned_tweet.append('smile')
                    word_not_treated=False
                elif (i>0 and (word=='p' and (words[i-1]==':' or words[i-1]==';'))) or word == ':p' or word == ';p' :
                    cleaned_tweet.append('smile')
                    word_not_treated=False
                elif i>0 and word=='d' and (words[i-1]==':' or words[i-1]==';' or words[i-1]=='x'):
                    cleaned_tweet.append('smile')
                    word_not_treated=False
                elif i>0 and words[i-1]=='(' and (word==':' or word==';'):
                    cleaned_tweet.append('smile')
                    word_not_treated=False
                elif i>0 and word==')' and (words[i-1]==':' or words[i-1]==';'):
                    cleaned_tweet.append('smile')
                    word_not_treated=False

            #Negative smilies (e.g. ") :", ": (") are replaced by "sad"
            if ((neg_smilies or all_smilies) and word_not_treated):
                if i>0 and words[i-1]==')' and (word==':' or word==';'):
                    cleaned_tweet.append('sad')
                    word_not_treated=False
                elif i>0 and word=='(' and (words[i-1]==':' or words[i-1]==';'):
                    cleaned_tweet.append('sad')
                    word_not_treated=False
            
            #Other emoticons (^_^, : o, : /, : *) are replaced by a describing word
            if ((other_smilies or all_smilies) and word_not_treated):
                if i>0  and i<end_ and word=='_' and words[i-1]=='^' and words[i+1]=='^':
                    cleaned_tweet.append('eyesmiley')
                    word_not_treated=False
                elif i>0 and word=='o' and words[i-1]==':':
                    cleaned_tweet.append('openmouthface')
                    word_not_treated=False
                elif i>0 and word=='/' and words[i-1]==':':
                    cleaned_tweet.append('slashsmiely')
                    word_not_treated=False
                elif i>0 and word=='*' and (words[i-1]==':' or words[i-1]==';'):
                    cleaned_tweet.append('kiss')
                    word_not_treated=False
                
            if ((hugs_and_kisses and word_not_treated)):
                    #want to find hearts, hugs, kisses, etc: 
                if (word == "xoxo" or word == "xo" or word == "xoxoxo" or word == "xxoo"):
                    cleaned_tweet.append('hug')
                    cleaned_tweet.append('kiss')
                    word_not_treated=False
                elif (word=='xx' or word=='xxx'or word=='xxxx'):
                    cleaned_tweet.append('kiss')
                    word_not_treated=False
            
            if ((hearts and word_not_treated)):
                if word == "<3":
                    cleaned_tweet.append('heart')
                    word_not_treated=False
            
            if (hashtag and word_not_treated):
                if word[0]=='#':
                    there_is_hashtag=True
                    if (len(word)>1 and segmentation_hash and not d.check(word[1:])):
                        cleaned_tweet.append(seg.segment(word[1:]))
                    else:
                        cleaned_tweet.append(word[1:])
                    word_not_treated=False
            
            if (numbers and word_not_treated):
                if word.isdigit():
                    there_is_number=True
                    word_not_treated=False
                    
            if (exclamation and word_not_treated):
                if word=='!':
                    there_is_exclamation=True
                    cleaned_tweet.append(word)
                    word_not_treated=False
            
            if (set_to_not and word_not_treated):
                if word[-3:]=='n\'t':
                    cleaned_tweet.append('not')
                    word_not_treated=False
           
            
         
            if (word_not_treated):
                #Keep the word unless remove_signs is set and the word is a plain sign
                if (not remove_signs) or word not in ('^', ',', '.', ':', '-', '´', ';', ')', '(', '*'):
                  
                    if (not word[0].isdigit()) and elongation and not d.check(word) and len(word)>2:
                        #Cap every run of identical letters at length 2
                        new=[]
                        new.append(word[0])
                        for j,letter in enumerate(word):
                            if j>0 and j<len(word)-1:
                                if not(letter==word[j-1]==word[j+1]):
                                    new.append(letter)
                        new.append(word[-1])
                        new_word=''.join(new)
                        if new_word!=word:
                            cleaned_tweet.append('elongation')
                            word=new_word

                    if spelling and not d.check(word) and len(word)>2:
                        word=sp.correct(word)
                    if stemming:
                        word=ps.stem(word)

                    
                    cleaned_tweet.append(word)

           
                
        
        if (hashtag_mention and there_is_hashtag) :
            cleaned_tweet.append('hashtag')
        if (number_mention and there_is_number) :
            cleaned_tweet.append('number')
        if (exclamation and there_is_exclamation):
            cleaned_tweet.append('exclamation')
            
            
        new_words = ' '.join(cleaned_tweet)
        new_words = new_words.encode('utf-8')
        new_corpus.append(new_words)
        
        if np.mod(k,25000)==1:
            elapsed = time.time()
            print("Time in min after", k, " tweets:", (elapsed - start) / 60)

        
    elapsed = time.time()
    print("Time in min total:", (elapsed - start) / 60 )
    return new_corpus       
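
A minimal usage sketch for preprocess_corpus, assuming the corpus is an iterable of raw tweet strings (the sample tweets are made up). Note that each cleaned tweet comes back UTF-8 encoded, as in the function above:

tweets = ["I'm sooooo happy : ) #iammoving xoxo",
          "can't stand this !"]
cleaned = preprocess_corpus(tweets, all_smilies=True, hugs_and_kisses=True,
                            hashtag=True, hashtag_mention=True,
                            set_to_not=True, exclamation=True)
print(cleaned)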
Example #5
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.classes.segmenter import Segmenter
from ekphrasis.classes.spellcorrect import SpellCorrector
from ekphrasis.dicts.emoticons import emoticons

text_processor = TextPreProcessor(
    # the opening arguments were truncated in this snippet; restored here from the
    # standard ekphrasis TextPreProcessor setup
    annotate={"hashtag", "allcaps", "elongated", "repeated",
              "emphasis", "censored"},
    fix_html=True,  # fix HTML tokens
    segmenter="twitter",
    corrector="twitter",
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=True,  # spell correction for elongated words
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    dicts=[emoticons])

seg_tw = Segmenter(corpus="twitter")
sp = SpellCorrector(corpus="twitter")
f1 = open('tokenized_tweets_golbeck.txt', 'w')
c = 1
for line in data:  # `data` holds the raw tab-separated tweet lines (read in the truncated part of the snippet)
    a = line.strip().split('\t')
    if len(a) >= 3:
        b = a[2]  # tweet text
        c = a[1]  # label, appended after the processed text when writing out
        b = b.split()
        for i in range(len(b)):
            if b[i].startswith('http'):
                b[i] = '<url>'
        b = ' '.join(b)
        a = text_processor.pre_process_doc(b)
        for i in range(len(a)):
            if a[i].isalpha():
                a[i] = seg_tw.segment(sp.correct(a[i]))
        a = ' '.join(a)
        f1.write(a + ' ' + c + '\n')
f1.close()
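
For reference, the same pipeline applied to a single string; the exact tokens depend on the ekphrasis statistics installed locally, so no output is shown:

sentence = "I CAN'T believe thaaat #goodnews http://t.co/xyz :)"
tokens = text_processor.pre_process_doc(sentence)
print(' '.join(tokens))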