def spell_correcter(tokenized_tweets):
    from ekphrasis.classes.spellcorrect import SpellCorrector
    spell_corrector = SpellCorrector(corpus="english")
    return tokenized_tweets.apply(
        lambda tweet: [spell_corrector.correct(word) for word in tweet.split(" ")])
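# Illustrative sketch (not part of the original snippet): calling the helper above on a
# small pandas Series of whitespace-joined tweets, which is what the .apply() call implies.
# The example tweets are hypothetical; ekphrasis downloads its "english" statistics on
# first use.
import pandas as pd

tweets = pd.Series(["thaaanks for the folow", "see you tomorow"])
corrected = spell_correcter(tweets)
print(corrected.tolist())  # each tweet becomes a list of spell-corrected tokens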
import csv

import nltk
from ekphrasis.classes.spellcorrect import SpellCorrector
from nltk.corpus import words

# CHANGE PATH FOR SERVER
local = "/home/ivan/Documents/git_repos/Sentiment-Analysis-on-Twitter/data/slang.csv"
djurdja = '/home/ikrizanic/pycharm/zavrsni/data/slang.csv'

with open(local, mode='r') as infile:
    reader = csv.reader(infile, delimiter=';')
    slang_dict = dict(reader)

sp = SpellCorrector(corpus="english")
nltk.download("words")
words = set(words.words())
punctuations = '''!()-[]{};:'",<>./?@#$%^&*_~'''


def replace_slang(raw, tokenized):
    tokens = []
    for token in tokenized:
        if token not in words:
            for key, value in slang_dict.items():
                if str(key).lower() == str(token).lower():
                    token = value.split(" ")
            if type(token) is list:
                tokens.extend(token)
            else:
                tokens.append(token)
        else:
            tokens.append(token)
    return tokens  # the original snippet ends without a return; returning the collected tokens is assumed
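# Illustrative sketch (not part of the original snippet): how replace_slang might be
# called. The tokens and the slang.csv contents are hypothetical, and the hard-coded
# path above must exist for the module to load; tokens found in slang_dict are expanded
# (e.g. "idk" -> "i do not know"), dictionary words pass through unchanged.
raw = "idk what happened"
tokenized = ["idk", "what", "happened"]
print(replace_slang(raw, tokenized))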
def test_spell_correct():
    from ekphrasis.classes.spellcorrect import SpellCorrector
    sp = SpellCorrector(corpus="english")
    print(sp.correct("Thaaaanks"))
        'url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'date', 'number',
    ],
    annotate={
        'hashtag', 'allcaps', 'elongated', 'repeated',
        'emphasis', 'censored',
    },
    fix_html=True,
    segmenter='twitter',
    corrector='twitter',
    unpack_hashtags=True,
    unpack_contractions=True,
    spell_correct_elong=False,
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    dicts=[emoticons],
)

sp = SpellCorrector(corpus='english')
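# Illustrative sketch (not part of the original snippet): the last line above builds a
# SpellCorrector backed by the "english" word statistics, while other snippets in this
# section use the "twitter" statistics. A quick comparison of the two, assuming the
# corpus statistics are available (ekphrasis fetches them on first use):
from ekphrasis.classes.spellcorrect import SpellCorrector

sp_english = SpellCorrector(corpus="english")
sp_twitter = SpellCorrector(corpus="twitter")
for word in ["folow", "thx", "grl"]:
    print(word, "->", sp_english.correct(word), "|", sp_twitter.correct(word))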
class TextPreProcessor:
    def __init__(self, **kwargs):
        """
        Kwargs:
            omit (list): choose which tokens you want to omit from the text.
                possible values: ['email', 'percent', 'money', 'phone', 'user',
                    'time', 'url', 'date', 'hashtag']
                Important Notes:
                    1 - put url at front, if you plan to use it.
                        Messes with the regexes!
                    2 - if you use hashtag then unpack_hashtags will
                        automatically be set to False

            normalize (list): choose which tokens you want to normalize
                in the text.
                possible values: ['email', 'percent', 'money', 'phone', 'user',
                    'time', 'url', 'date', 'hashtag']
                for example: [email protected] will be transformed to <email>
                Important Notes:
                    1 - put url at front, if you plan to use it.
                        Messes with the regexes!
                    2 - if you use hashtag then unpack_hashtags will
                        automatically be set to False

            unpack_contractions (bool): replace *English* contractions in
                ``text`` str with their unshortened forms
                for example: can't -> can not, wouldn't -> would not, and so on...

            unpack_hashtags (bool): split a hashtag into its constituent words.
                for example: #ilikedogs -> i like dogs

            annotate (list): add special tags to special tokens.
                possible values: ['hashtag', 'allcaps', 'elongated', 'repeated']
                for example: [email protected] -> [email protected] <email>

            tokenizer (callable): callable function that accepts a string and
                returns a list of strings. if no tokenizer is provided then
                the text will be tokenized on whitespace

            segmenter (str): define the statistics of what corpus you would
                like to use [english, twitter]

            corrector (str): define the statistics of what corpus you would
                like to use [english, twitter]

            all_caps_tag (str): how to wrap the capitalized words
                values [single, wrap, every]
                Note: applicable only when `allcaps` is included in annotate[]
                    - single: add a tag after the last capitalized word
                    - wrap: wrap all words with opening and closing tags
                    - every: add a tag after each word

            spell_correct_elong (bool): choose if you want to perform spell
                correction after the normalization of elongated words.
                * significantly affects performance (speed)

            spell_correction (bool): choose if you want to perform spell
                correction on the text
                * significantly affects performance (speed)

            fix_text (bool): choose if you want to fix bad unicode terms and
                html entities.
        """
        self.omit = kwargs.get("omit", {})
        self.backoff = kwargs.get("normalize", {})
        self.include_tags = kwargs.get("annotate", {})
        self.unpack_contractions = kwargs.get("unpack_contractions", False)
        self.tokenizer = kwargs.get("tokenizer", None)
        self.dicts = kwargs.get("dicts", None)
        self.spell_correction = kwargs.get("spell_correction", False)
        self.spell_correct_elong = kwargs.get("spell_correct_elong", False)
        self.fix_text = kwargs.get("fix_bad_unicode", False)
        self.unpack_hashtags = kwargs.get("unpack_hashtags", False)
        self.segmenter_corpus = kwargs.get("segmenter", "english")
        self.corrector_corpus = kwargs.get("corrector", "english")
        self.all_caps_tag = kwargs.get("all_caps_tag", "wrap")
        self.mode = kwargs.get("mode", "normal")

        if self.unpack_hashtags:
            self.segmenter = Segmenter(corpus=self.segmenter_corpus)
        if self.mode != "fast":
            self.spell_corrector = SpellCorrector(corpus=self.corrector_corpus)

        self.regexes = ExManager().get_compiled()
        if 'hashtag' in self.omit or 'hashtag' in self.backoff:
            print("You can't omit/backoff and unpack hashtags!\n "
                  "unpack_hashtags will be set to False")
            self.unpack_hashtags = False

    def __copy__(self):
        return self

    def __deepcopy__(self, memo):
        return self

    @staticmethod
    def add_special_tag(m, tag, mode="single"):
        if isinstance(m, str):
            text = m
        else:
            text = m.group()

        if mode == "single":
            return " {} <{}> ".format(text, tag)
        elif mode == "wrap":
            return " ".join([" <{}> {} </{}> ".format(tag, text, tag)]) + " "
        elif mode == "every":
            tokens = text.split()
            processed = " ".join([" {} <{}> ".format(t, tag) for t in tokens])
            return " " + processed + " "

    @lru_cache(maxsize=4096)
    def handle_hashtag_match(self, m):
        """
        Break a string into its constituent words (using the Viterbi algorithm)
        """
        text = m.group()[1:]

        # todo: simplify routine
        if text.islower():
            expanded = self.segmenter.segment(text)
            expanded = " ".join(expanded.split("-"))
            expanded = " ".join(expanded.split("_"))
            # print(m.group(), " - ", expanded)
            # with open("analysis/segmenter_" +
            #           self.segmenter_corpus + ".txt", "a") as f:
            #     f.write(m.group() + "\t" + expanded + "\n")
        else:
            # split words following CamelCase convention
            expanded = self.regexes["camel_split"].sub(r' \1', text)
            expanded = expanded.replace("-", "")
            expanded = expanded.replace("_", "")
            # print(m.group(), " - ", expanded)

        if "hashtag" in self.include_tags:
            expanded = self.add_special_tag(expanded, "hashtag", mode="wrap")

        return expanded

    def handle_elongated_match(self, m):
        text = m.group()

        # normalize to at most 2 repeating chars
        text = self.regexes["normalize_elong"].sub(r'\1\1', text)

        normalized = self.spell_corrector.normalize_elongated(text)
        if normalized:
            text = normalized

        # try to spell correct the word
        if self.spell_correct_elong:
            text = self.spell_corrector.correct_word(text, assume_wrong=True,
                                                     fast=True)
            # with open("analysis/spell_corrector_" +
            #           self.corrector_corpus + ".txt", "a") as f:
            #     f.write(m.group() + " - " + text + "\n")
            # print(m.group(), "-", text)

        if "elongated" in self.include_tags:
            text = self.add_special_tag(text, "elongated")

        return text

    @lru_cache(maxsize=4096)
    def handle_repeated_puncts(self, m):
        """
        Return the sorted set so that random combinations of punctuation marks
        are mapped to the same token.
        "!??!?!!", "?!!!!?!", "!!?", "!?!?" --> "?!"
        "!...", "...?!" --> ".!"

        :param m:
        :return:
        """
        text = m.group()
        text = "".join(sorted(set(text), reverse=True))

        if "repeated" in self.include_tags:
            text = self.add_special_tag(text, "repeated")

        return text

    @lru_cache(maxsize=4096)
    def handle_generic_match(self, m, tag, mode="every"):
        """
        Args:
            m ():
            tag ():
            mode ():

        Returns:
        """
        text = m.group()
        text = self.add_special_tag(text, tag, mode=mode)

        return text

    @lru_cache(maxsize=4096)
    def handle_emphasis_match(self, m):
        """
        :param m:
        :return:
        """
        text = m.group().replace("*", "")
        if "emphasis" in self.include_tags:
            text = self.add_special_tag(text, "emphasis")

        return text

    @staticmethod
    def dict_replace(wordlist, _dict):
        return [_dict[w] if w in _dict else w for w in wordlist]

    @staticmethod
    def remove_hashtag_allcaps(wordlist):
        in_hashtag = False
        _words = []
        for word in wordlist:
            if word == "<hashtag>":
                in_hashtag = True
            elif word == "</hashtag>":
                in_hashtag = False
            elif word in {"<allcaps>", "</allcaps>"} and in_hashtag:
                continue
            _words.append(word)

        return _words

    @lru_cache(maxsize=4096)
    def handle_general_word_segment_and_spelling(self, m):
        """
        :param m:
        :return:
        """
        text = m.group()
        text = self.segmenter.segment(text)
        return text

    def pre_process_doc(self, doc):

        doc = re.sub(r' +', ' ', doc)  # remove repeating spaces

        # ###########################
        # # fix bad unicode
        # ###########################
        # if self.fix_bad_unicode:
        #     doc = textacy.preprocess.fix_bad_unicode(doc)
        #
        # ###########################
        # # fix html leftovers
        # ###########################
        # doc = html.unescape(doc)

        ###########################
        # fix text
        ###########################
        if self.fix_text:
            doc = ftfy.fix_text(doc)

        ###########################
        # BACKOFF & OMIT
        ###########################
        for item in self.backoff:
            # better add an extra space after the match.
            # Just to be safe. extra spaces will be normalized later anyway
            doc = self.regexes[item].sub(
                lambda m: " " + "<" + item + ">" + " ", doc)
        for item in self.omit:
            doc = doc.replace("<" + item + ">", '')

        ###########################
        # segment other words, not hashtags
        ###########################
        # doc = self.regexes['not_hashtag'].sub(
        #     lambda w: self.handle_general_word_segment_and_spelling(w), doc)
        # for word in doc.split(" "):
        #     if(not word.startswith('#')):
        #         word = self.segmenter.segment(word)
        #         new_doc.append(word)
        # doc = " ".join(new_doc)

        ###########################
        # unpack hashtags
        ###########################
        if self.unpack_hashtags:
            doc = self.regexes["hashtag"].sub(
                lambda w: self.handle_hashtag_match(w), doc)

        ###########################
        # handle special cases
        ###########################
        if self.mode != "fast":
            if "allcaps" in self.include_tags:
                doc = self.regexes["allcaps"].sub(
                    lambda w: self.handle_generic_match(
                        w, "allcaps", mode=self.all_caps_tag), doc)
            if "elongated" in self.include_tags:
                doc = self.regexes["elongated"].sub(
                    lambda w: self.handle_elongated_match(w), doc)
            if "repeated" in self.include_tags:
                doc = self.regexes["repeat_puncts"].sub(
                    lambda w: self.handle_repeated_puncts(w), doc)
            if "emphasis" in self.include_tags:
                doc = self.regexes["emphasis"].sub(
                    lambda w: self.handle_emphasis_match(w), doc)
            if "censored" in self.include_tags:
                doc = self.regexes["censored"].sub(
                    lambda w: self.handle_generic_match(w, "censored"), doc)

        ###########################
        # unpack contractions: i'm -> i am, can't -> can not...
        ###########################
        # remove textacy dependency
        if self.unpack_contractions:
            doc = unpack_contractions(doc)

        # omit allcaps if inside hashtags
        doc = re.sub(r' +', ' ', doc)  # remove repeating spaces
        # doc = re.sub(r'<hashtag><allcaps>', '<hashtag>', doc)  # remove repeating spaces
        # doc = doc.replace('<hashtag> <allcaps>', '<hashtag>')
        # doc = doc.replace('</allcaps> </hashtag>', '</hashtag>')

        ###########################
        # Tokenize
        ###########################
        doc = self.remove_hashtag_allcaps(doc.split())
        doc = " ".join(doc)  # normalize whitespace
        if self.tokenizer:
            doc = self.tokenizer(doc)

            # Replace tokens with special dictionaries (slang, emoticons ...)
            # todo: add spell check before!
            if self.dicts:
                for d in self.dicts:
                    doc = self.dict_replace(doc, d)

        return doc

    def pre_process_docs(self, docs, lazy=True):
        from tqdm import tqdm
        for d in tqdm(docs, desc="PreProcessing..."):
            yield self.pre_process_doc(d)
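# Illustrative sketch (not part of the original class): a minimal end-to-end run of the
# TextPreProcessor above, assuming ekphrasis and its corpus statistics are available.
# The configuration is a demonstration choice, not the one used by any of the projects
# excerpted in this section.
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

demo_processor = TextPreProcessor(
    normalize=['url', 'user', 'number'],
    annotate={'hashtag', 'allcaps', 'elongated', 'repeated'},
    unpack_hashtags=True,
    unpack_contractions=True,
    segmenter='twitter',
    corrector='twitter',
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    dicts=[emoticons],
)

docs = [
    "CANT WAIT for the new season!!! :D http://example.com",
    "@user this is sooooo coool #MachineLearning",
]
# pre_process_docs is a lazy generator wrapped in a tqdm progress bar
for tokens in demo_processor.pre_process_docs(docs):
    print(tokens)  # normalized tokens with annotation tags such as <elongated>, <hashtag>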
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.classes.spellcorrect import SpellCorrector
from ekphrasis.classes.preprocessor import TextPreProcessor

sp = SpellCorrector(corpus="twitter")

EMOTICONS_TOKEN = {
    ':*': '<kiss>',
    ':-*': '<kiss>',
    ':x': '<kiss>',
    ':-)': '<happy>',
    ':-))': '<happy>',
    ':-)))': '<happy>',
    ':-))))': '<happy>',
    ':-)))))': '<happy>',
    ':-))))))': '<happy>',
    ':)': '<happy>',
    ':))': '<happy>',
    ':)))': '<happy>',
    ':))))': '<happy>',
    ':)))))': '<happy>',
    ':))))))': '<happy>',
    ':)))))))': '<happy>',
    ':o)': '<happy>',
    ':]': '<happy>',
    ':3': '<happy>',
    ':c)': '<happy>',
    ':>': '<happy>',
    '=]': '<happy>',
    '8)': '<happy>',
    '=)': '<happy>',
import os
import pickle

from data_util.my_stopwords import *
from data_util.extract_key import extract_PF

from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
# from ekphrasis.classes.segmenter import Segmenter
from ekphrasis.classes.segmenter import Segmenter

# segmenter using the word statistics from english Wikipedia
seg_eng = Segmenter(corpus="twitter")  # english or twitter

from ekphrasis.classes.spellcorrect import SpellCorrector
sp = SpellCorrector(corpus="english")  # english or twitter

alphbet_stopword = ['', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'o',
                    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '#']

# word-segmentation / stopword dictionary
from nltk.corpus import stopwords as nltk_stopwords
nltk_stopwords = set(nltk_stopwords.words("english"))

stpwords_list3 = [f.replace("\n", "") for f in
                  open("data_util/stopwords.txt", "r", encoding="utf-8").readlines()]
stpwords_list3.remove("not")

stopwords = list(html_escape_table + stpwords_list2) + \
            list(list(nltk_stopwords) + list(stpwords_list1) + list(stpwords_list3))
stopwords = stopwords + ["."] + alphbet_stopword
# stopwords = list(html_escape_table)  # + list(stpwords_list1) + list(stpwords_list3)
print("Word-segmentation dictionary loaded")

# Total Opinion
opinion_lexicon = {}
import numpy as np
from twokenize import *
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.dicts.emoticons import emoticons
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.classes.spellcorrect import SpellCorrector
import re
from langdetect import detect
from tqdm import tqdm
import nltk
from cleantext import clean
import spacy
from spacy.lang.en import English

social_tokenizer = SocialTokenizer(lowercase=False).tokenize
spell_corrector = SpellCorrector(corpus="english")


def extract_url(row, min_len_url=10):
    if len(row['rt_urls_list']) > min_len_url:
        tweet_url = row['rt_urls_list'].split(',')[1].split('\'')[-2]
    else:
        tweet_url = 'None'
    return tweet_url


class SentClean:
    prep_default = {'spell': False,
                    'remove_sequences': False,
                    'lowercase': False,
                    'punctuations': [],
def preprocess_corpus(corpus, stemming=False, all_smilies=False, pos_smilies=False,
                      neg_smilies=False, other_smilies=False, hugs_and_kisses=False,
                      hearts=False, hashtag=False, hashtag_mention=False, numbers=False,
                      number_mention=False,
                      exclamation=False,  # NOTE: not tested yet, possibly remove it
                      set_to_not=False, segmentation_hash=False, spelling=False,
                      elongation=False, remove_signs=False):
    """
    Function used to apply preprocessing.

    Input:
        corpus: a corpus in the same format as the output of create_corpus.
        stemming: if true, words are stemmed with a Porter stemmer. Default False.
        all_smilies: if true, same effect as if pos_smilies, neg_smilies, and
            other_smilies were true. Default False.
        pos_smilies: if true, positive smilies such as : ), ; ), ( ;, :p, ;p, : p
            are replaced by "smile". Default False.
        neg_smilies: if true, negative smilies such as : (, ) : are replaced by
            "sad". Default False.
        other_smilies: if true, smilies such as ^_^ are replaced by a describing
            word. Default False.
        hugs_and_kisses: if true, words such as xxx, xoxo etc. are replaced by
            "kiss" or "hug" and "kiss". Default False.
        hearts: if true, "<3" is replaced by "heart". Default False.
        hashtag: if true, hashtags are removed from the beginning of words, so
            #apple becomes apple. Default False.
        hashtag_mention: if true, and if hashtag is true, the word "hashtag" is
            added at the end of a tweet that used to contain one or more words
            beginning with a hashtag. Default False.
        numbers: if true, words that are purely numbers are removed. Default False.
        number_mention: if true, and if numbers is true, the word "number" is
            added at the end of a tweet that used to contain one or more words
            that were purely numbers. Default False.
        exclamation: if true, the word "exclamation" is added at the end of a
            tweet that contains one or more "!". Default False.
        set_to_not: if true, all words ending with "n't" are replaced by "not".
            Default False.
        segmentation_hash: if true, words starting with # that do not appear in
            the English dictionary are split into segments, e.g. '#iammoving'
            becomes 'i am moving'. Default False.
        spelling: if true, all words that are not part of the English dictionary
            are set to the most likely word within two alterations. Default False.
        elongation: if true, the length of all letter sequences in words that are
            not part of the English dictionary is set to at most 2. The word
            'elongation' is inserted before words altered this way. Default False.
        remove_signs: if true, signs such as ",", ".", ":", ";", "-" are removed.
            Default False.

    Output:
        new_corpus: a new corpus, in the same format as the input corpus.
    """
    start = time.time()

    # initialising the new corpus:
    new_corpus = []

    # Want to split the tweets using this tokenizer:
    tknzr = TweetTokenizer(reduce_len=True)

    if stemming:
        ps = PorterStemmer()
    if segmentation_hash or spelling or elongation:
        d = enchant.Dict("en_US")
    if segmentation_hash:
        # seg = Segmenter(corpus="english")
        seg = Segmenter(corpus="twitter")
    if spelling:
        sp = SpellCorrector(corpus="english")

    elapsed = time.time()
    print("Time in min before starting first for loop:", (elapsed - start) / 60)

    # Want to go through each line (tweet) in the corpus
    for k, line in enumerate(corpus):
        if hashtag_mention:
            there_is_hashtag = False
        if number_mention:
            there_is_number = False
        if exclamation:
            there_is_exclamation = False

        # Splitting the tweet using the chosen tokenizer.
        words = tknzr.tokenize(line)

        # Initializing for cleaned_tweet:
        cleaned_tweet = []

        for i, word in enumerate(words):
            # Indicating that the word has not been treated yet
            word_not_treated = True
            end_ = len(words) - 1

            if (pos_smilies or all_smilies) and word_not_treated:
                if (i > 0 and (word == 'd' and (words[i-1] == ':' or words[i-1] == ';'))) or word == ':d' or word == ';d':
                    cleaned_tweet.append('smile')
                    word_not_treated = False
                elif (i > 0 and (word == 'p' and (words[i-1] == ':' or words[i-1] == ';'))) or word == ':p' or word == ';p':
                    cleaned_tweet.append('smile')
                    word_not_treated = False
                elif i > 0 and word == 'd' and (words[i-1] == ':' or words[i-1] == ';' or words[i-1] == 'x'):
                    cleaned_tweet.append('smile')
                    word_not_treated = False
                elif i > 0 and words[i-1] == '(' and (word == ':' or word == ';'):
                    cleaned_tweet.append('smile')
                    word_not_treated = False
                elif i > 0 and word == ')' and (words[i-1] == ':' or words[i-1] == ';'):
                    cleaned_tweet.append('smile')
                    word_not_treated = False

            if (neg_smilies or all_smilies) and word_not_treated:
                if i > 0 and words[i-1] == ')' and (word == ':' or word == ';'):
                    cleaned_tweet.append('sad')
                    word_not_treated = False
                elif i > 0 and word == '(' and (words[i-1] == ':' or words[i-1] == ';'):
                    cleaned_tweet.append('sad')
                    word_not_treated = False

            if (other_smilies or all_smilies) and word_not_treated:
                if i > 0 and i < end_ and word == '_' and words[i-1] == '^' and words[i+1] == '^':
                    cleaned_tweet.append('eyesmiley')
                    word_not_treated = False
                elif i > 0 and word == 'o' and words[i-1] == ':':
                    cleaned_tweet.append('openmouthface')
                    word_not_treated = False
                elif i > 0 and word == '/' and words[i-1] == ':':
                    cleaned_tweet.append('slashsmiely')
                    word_not_treated = False
                elif i > 0 and word == '*' and (words[i-1] == ':' or words[i-1] == ';'):
                    cleaned_tweet.append('kiss')
                    word_not_treated = False

            if hugs_and_kisses and word_not_treated:
                # want to find hearts, hugs, kisses, etc:
                if word == "xoxo" or word == "xo" or word == "xoxoxo" or word == "xxoo":
                    cleaned_tweet.append('hug')
                    cleaned_tweet.append('kiss')
                    word_not_treated = False
                elif word == 'xx' or word == 'xxx' or word == 'xxxx':
                    cleaned_tweet.append('kiss')
                    word_not_treated = False

            if hearts and word_not_treated:
                if word == "<3":
                    cleaned_tweet.append('heart')
                    word_not_treated = False

            if hashtag and word_not_treated:
                if word[0] == '#':
                    there_is_hashtag = True
                    if len(word) > 1 and segmentation_hash and not d.check(word[1:]):
                        cleaned_tweet.append(seg.segment(word[1:]))
                    else:
                        cleaned_tweet.append(word[1:])
                    word_not_treated = False

            if numbers and word_not_treated:
                if word.isdigit():
                    there_is_number = True
                    word_not_treated = False

            if exclamation and word_not_treated:
                if word == '!':
                    there_is_exclamation = True
                    cleaned_tweet.append(word)
                    word_not_treated = False

            if set_to_not and word_not_treated:
                if word[-3:] == 'n\'t':
                    cleaned_tweet.append('not')
                    word_not_treated = False

            if word_not_treated:
                if (not remove_signs) or (remove_signs and (
                        word != '^' and word != ',' and word != '.'
                        and word != ':' and word != '-' and word != '´' and word != ';'
                        and word != ')' and word != '(' and word != '*')):
                    if (not word[0].isdigit()) and elongation and not d.check(word) and len(word) > 2:
                        new = []
                        new.append(word[0])
                        for i, letter in enumerate(word):
                            if i > 0 and i < len(word) - 1:
                                if not (letter == word[i-1] == word[i+1]):
                                    new.append(letter)
                        new.append(word[-1])
                        new_word = ''.join(new)
                        if new_word != word:
                            cleaned_tweet.append('elongation')
                            word = new_word
                    if spelling and not d.check(word) and len(word) > 2:
                        word = sp.correct(word)
                    if stemming:
                        word = ps.stem(word)
                    cleaned_tweet.append(word)

        if hashtag_mention and there_is_hashtag:
            cleaned_tweet.append('hashtag')
        if number_mention and there_is_number:
            cleaned_tweet.append('number')
        if exclamation and there_is_exclamation:
            cleaned_tweet.append('exclamation')

        new_words = ' '.join(cleaned_tweet)
        new_words = new_words.encode('utf-8')
        new_corpus.append(new_words)

        if np.mod(k, 25000) == 1:
            elapsed = time.time()
            print("Time in min after", k, "tweets:", (elapsed - start) / 60)

    elapsed = time.time()
    print("Time in min total:", (elapsed - start) / 60)

    return new_corpus
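# Illustrative sketch (not part of the original function): calling preprocess_corpus on a
# tiny in-memory corpus. The example tweets are hypothetical, and the module-level imports
# the function relies on (time, numpy as np, nltk's TweetTokenizer) are assumed to be in
# place. Only the smiley/hashtag/exclamation flags are enabled, so the heavier
# enchant/ekphrasis paths stay disabled.
toy_corpus = ["I love this : )", "#winter is coming !"]
cleaned = preprocess_corpus(toy_corpus,
                            all_smilies=True,
                            hashtag=True,
                            hashtag_mention=True,
                            exclamation=True)
print(cleaned)  # roughly: [b'I love this : smile', b'winter is coming ! hashtag exclamation']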
      - if the most toxic word is an auxiliary verb, then discard the sentence.
    CASS_fn: the file name of a pickle that stores the output
        Correction_All_Sentences_Scores
    output:
    Correction_All_Sentences_Scores: a list [0..3] of lists of
        (original_sentence, original_score, revised_sentence, revised_toxic_score,
         correct_word, new_word_list, correction_word_list, correction_score,
         corrected_sentence),
        where correction_word_list is a list of (wrong_word, suggested_word).
    note: The input sentences are pre-processed, such that punctuation marks are
        either non-existent or separated from words.
'''
ASSF_fn = 'input/All_Sentences_Scores_Filtered.pickle'
CASS_fn = "output/Correction_All_Sentences_Scores.pickle"
sp = SpellCorrector(corpus="english")
ekphrasis_word_correction_func = lambda w: sp.correct(w)
Correction_All_Sentences_Scores = eval_spelling_correction_perspective(
    ASSF_fn, CASS_fn, word_correction_func=ekphrasis_word_correction_func)

'''
2018.5.20
Plot correction effects
'''
CASS_fn = "output/Correction_All_Sentences_Scores.pickle"
plot_correction_effects(CASS_fn)

'''
Calculate
1. accuracy
2. score distribution
        'date', 'number'
    ],
    annotate={
        "hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'
    },
    fix_html=True,              # fix HTML tokens
    segmenter="twitter",
    corrector="twitter",
    unpack_hashtags=True,       # perform word segmentation on hashtags
    unpack_contractions=True,   # Unpack contractions (can't -> can not)
    spell_correct_elong=True,   # spell correction for elongated words
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    dicts=[emoticons])

seg_tw = Segmenter(corpus="twitter")
sp = SpellCorrector(corpus="twitter")

f1 = open('tokenized_tweets_golbeck.txt', 'w')
c = 1
for line in data:
    a = line.strip().split('\t')
    if len(a) >= 3:
        b = a[2]
        c = a[1]
        b = b.split()
        for i in range(len(b)):
            if b[i].startswith('http'):
                b[i] = '<url>'
        b = ' '.join(b)
        a = text_processor.pre_process_doc(b)
        for i in range(len(a)):
            if a[i].isalpha():
def create_models(headlines):
    headline = headlines['headline']
    label = headlines['label']
    headlines.loc[headlines['label'] == -1, 'label'] = 0
    arr_Accu = []

    # Random-state selection starts here ****************************
    # for i in range(1, 20):
    #     headline_train, headline_test, label_train, label_test = train_test_split(headline, label, test_size=0.01, random_state=i)
    #
    #     # vect = CountVectorizer(max_features=100000, binary=True)
    #     vect = TfidfVectorizer(max_features=100000, strip_accents='unicode', analyzer='word', stop_words='english', token_pattern=r'\w{1,}', ngram_range=(1, 3))
    #     headline_train_vector = vect.fit_transform(headline_train)
    #     headline_test_vector = vect.transform(headline_test)
    #
    #     # Note: dataset balancing was attempted, but the accuracy in the tests below did not improve
    #     # balancing = SMOTE()
    #     # headline_train_balanced, label_train_balanced = balancing.fit_sample(headline_train_vector, label_train)
    #     # oversampled_headlines, counts = np.unique(label_train_balanced, return_counts=True)
    #     # print(list(zip(oversampled_headlines, counts)))
    #     print("pre-Dummy")
    #     dummy = DummyClassifier()
    #     print("post-Dummy")
    #     dummy.fit(headline_train_vector, label_train)
    #     prediction = dummy.predict(headline_test_vector)
    #     accuracy = metrics.accuracy_score(label_test, prediction)
    #     print("Dummy Classifier: ")
    #     print(accuracy)
    #     arr_Accu.append(accuracy)
    # print(max(arr_Accu))
    # max_random_state = arr_Accu.index(max(arr_Accu)) + 1
    # print(max_random_state)
    # for j in range(1, 20):
    #     print("Random State : ", j, " Accuracy : ", arr_Accu[j-1])
    # Random-state selection ends here ********************************

    # Trial with k-fold to find the value of K that maximizes accuracy
    # Note: the accuracy here is worse than before
    # arr_Accu = []
    # for i in range(3, 15):
    #     vect = CountVectorizer(stop_words='english', analyzer="word", min_df=2, max_df=0.8)
    #     headline_train_vector = vect.fit_transform(headline)
    #
    #     dummy = DummyClassifier()
    #     accuracy = cross_val_score(dummy, headline_train_vector, label, cv=i, scoring='accuracy')
    #
    #     arr_Accu.append(np.mean(accuracy))
    #
    # print(arr_Accu)
    # for j in range(3, 15):
    #     print("K-Fold : ", j, " Accuracy : ", arr_Accu[j - 3])

    # Model building starts here, using the best random state
    # print("random state chosen: ")
    # print(max_random_state)
    # headline_train, headline_test, label_train, label_test = train_test_split(headline, label, test_size=0.20, random_state=max_random_state)

    x = headlines['headline']
    y = headlines['label']
    print("Headlines", x.shape)
    print("Labels", y.shape)

    neg = sum(headlines.label == 0)
    pos = sum(headlines.label == 1)
    print("Neg", neg)
    print("Pos", pos)
    diff = abs(pos - neg)
    print("Class difference: ", diff)

    df_filter = headlines[headlines.label == 0]
    run_stats = pd.DataFrame()
    print(headlines.head())

    from ekphrasis.classes.spellcorrect import SpellCorrector

    # Experimental pre-processing of the tweets
    @lru_cache(maxsize=50000)
    def tokenization(text):
        text = re.split('\W+', text)
        return text

    headlines['headline'] = headlines['headline'].apply(
        lambda x: tokenization(x.lower()))
    print(headlines.head())

    stopword = nltk.corpus.stopwords.words('english')

    # @lru_cache(maxsize=50000)
    def remove_stopwords(text):
        return [word for word in text if word not in stopword]

    headlines['headline'] = headlines['headline'].apply(remove_stopwords)
    print(headlines.head())

    # stemmer = nltk.PorterStemmer()
    # def stemming(text):
    #     text = [stemmer.stem(word) for word in text]
    #     return text
    #
    # headlines['headline'] = headlines['headline'].apply(lambda x: stemming(x))
    # print(headlines.head(10))

    # Spell correction; it may not be used because it requires excessive RAM and time ***
    sp = SpellCorrector(corpus="english")

    def spell_corrector(text):
        print("**Text before correction: ", text)
        text = [sp.correct(word) for word in text]
        print(">>Text after correction:", text)
        return text

    # print("Spelling Correction")
    # headlines['headline'] = headlines['headline'].apply(lambda x: spell_corrector(x))
    # headlines['headline'] = headlines['headline'].apply(spell_corrector)

    lm = nltk.WordNetLemmatizer()

    def lemmatizer(text):
        return [lm.lemmatize(word) for word in text]

    print("Lemmatizer")
    headlines['headline'] = headlines['headline'].apply(lemmatizer)
    print(headlines.head(10))

    headlines['headline'] = headlines['headline'].str.join(" ")
    print(headlines.head())

    headline_train, headline_test, label_train, label_test = train_test_split(
        headline, label, test_size=.02)
    x_validation, x_test, y_validation, y_test = train_test_split(
        headline_test, label_test, test_size=.5)
    print(headline_train.shape)
    print(headline_test.shape)

    # vect = TfidfVectorizer(max_features=100000, strip_accents='unicode', analyzer='word', stop_words='english', token_pattern=r'\w{1,}', ngram_range=(1, 3))
    vect = TfidfVectorizer(ngram_range=(1, 3))

    # Grid search for the best result, VERY TIME-CONSUMING
    # lr = LogisticRegression()
    # text_clf = Pipeline([
    #     ('vect', CountVectorizer()),
    #     ('tfidf', TfidfTransformer()),
    #     ('clf', LogisticRegression())])
    # params = {
    #     'clf__penalty': ['l1', 'l2'],  # l1 is Lasso, l2 is Ridge
    #     'clf__solver': ['liblinear'],
    #     'clf__C': np.linspace(0.00002, 1, 10)
    # }
    # lr_gs = GridSearchCV(text_clf, params, cv=5, iid=False).fit(headline_train[:200000], label_train[:200000])
    # print("Best Params", lr_gs.best_params_)
    # print("Best Score", lr_gs.best_score_)
    # Grid search ends here

    log_regression = LogisticRegression(C=1.0, class_weight="balanced",
                                        solver="liblinear", multi_class="ovr",
                                        verbose=100, random_state=42)
    linear_SVC = LinearSVC(C=0.1, verbose=100, random_state=42)
    passive_aggressive = PassiveAggressiveClassifier()
    multinomial_bayes = MultinomialNB(alpha=10)
    complementNB = ComplementNB()
    ridge_clas = RidgeClassifier(solver='lsqr', random_state=42)
    naive_bayes = BernoulliNB()
    random_forest = RandomForestClassifier(max_depth=30, n_estimators=4000,
                                           verbose=100, n_jobs=2)
    svm = SVC(gamma=0.5, C=100, kernel="linear", verbose=100)

    # Here I try GridSearch for better parameter tuning
    parameters = {
        'classifier__alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0],
        'classifier__max_iter': [1000],
        'classifier__solver': ['lsqr'],
        'classifier__random_state': [42]
    }

    # GRID SEARCH STARTS HERE
    # pipe = Pipeline([
    #     ('vectorizer', TfidfVectorizer(max_features=100000)),
    #     ('classifier', ridge_clas)
    # ])
    # grid = GridSearchCV(pipe, n_jobs=2, cv=5, verbose=3, param_grid=parameters)
    #
    # start_time = time.time()
    # grid.fit(headline_train, label_train)
    # end_time = time.time()
    # print('Total fit time: {}'.format(end_time - start_time))
    #
    # prediction = grid.predict(label_test)
    # print("Prediction Finished")
    # res = pd.DataFrame({'Prediction ': prediction})
    # print(res)
    # GRID SEARCH ENDS HERE

    # algorithms = [log_regression, complementNB, linear_SVC, passive_aggressive, multinomial_bayes, naive_bayes, ridge_clas]
    # algo_names = ["Logistic Regression", "Complement Naive Bayes", "Linear SVC", "Passive Aggressive", "Multinomial Bayes", "Naive Bayes", "Ridge Classifier"]
    # algo_name_pair = zip(algorithms, algo_names)
    algorithms = [ridge_clas]
    algo_names = ["Ridge Classifier"]
    algo_name_pair = zip(algorithms, algo_names)

    results = dict()
    for algo, name in algo_name_pair:
        ug_pipeline = Pipeline([('vectorizer', vect),
                                ('classifier', algo)])
        print("Classifier : ", algo)
        results[name] = train_test_and_evaluate(ug_pipeline, headline_train,
                                                label_train, x_validation,
                                                y_validation)

    dframe = pd.DataFrame.from_dict(results, orient="index").reset_index()
    dframe.columns = ["classifier", "prediction"]
    dframe.sort_values(by=["prediction"], ascending=False)
    print(results)

    sns.barplot(x='classifier', y='prediction', data=dframe)
    plt.title("TFidf Vectorizer, n-gram=3")
    fig = plt.gcf()
    fig.set_size_inches(20, 10)
    plt.show()

    # the training headlines are fit_transformed for fitting
    # the test headlines are transformed for testing

    # Multinomial Bayes
    # mbayes = MultinomialNB()
    # start_time = time.time()
    # mbayes.fit(headline_train_vector, label_train)
    # runtime = time.time() - start_time
    #
    # print(mbayes.score(headline_train_vector, label_train))
    #
    # # actual testing with the test set we split off
    # prediction = mbayes.predict(headline_test_vector)
    #
    # print(prediction)
    # accuracy = metrics.accuracy_score(label_test, prediction)
    # print('MBayes Accuracy : ', accuracy)
    # run_stats = run_stats.append({'Classifier': 'Multinomial Naive Bayes', 'Accuracy': accuracy, 'Runtime': runtime}, ignore_index=True)
    # results["bayes_accuracy"] = prediction

    # start_time = time.time()
    # log_regression = LogisticRegression()
    # log_regression.fit(headline_train_vector, label_train)
    # prediction = log_regression.predict(headline_test_vector)
    # accuracy = metrics.accuracy_score(label_test, prediction)
    # runtime = time.time() - start_time
    # print('LogisticRegression Accuracy : ', accuracy)
    # print('Runtime : ', runtime)
    # results["Logistic_regression"] = accuracy
    # Last run: 0.77838

    # decision_tree = DecisionTreeClassifier(criterion='entropy')
    # decision_tree.fit(headline_train_vector, label_train)
    # prediction = decision_tree.predict(headline_test_vector)
    # accuracy = metrics.accuracy_score(label_test, prediction)
    # print('DecisionTree Accuracy : ', accuracy)
    #
    #
    # random_forest = RandomForestClassifier(criterion='entropy')
    # random_forest.fit(headline_train_vector, label_train)
    # prediction = random_forest.predict(headline_test_vector)
    # accuracy = metrics.accuracy_score(label_test, prediction)
    # print('RandomForestClassifier Accuracy : ', accuracy)
    # Last run: it did NOT finish, it took too long so I stopped it
    #
    # adaboost = AdaBoostClassifier()
    # adaboost.fit(headline_train_vector, label_train)
    # prediction = adaboost.predict(headline_test_vector)
    # accuracy = metrics.accuracy_score(label_test, prediction)
    # print('Adaboost Accuracy : ', accuracy)
    # Last accuracy: 0.66687
    #
    # bernoulli_bayes = BernoulliNB()
    # start_time = time.time()
    # bernoulli_bayes.fit(headline_train_vector, label_train)
    # runtime = time.time() - start_time
    # prediction = bernoulli_bayes.predict(headline_test_vector)
    # accuracy = metrics.accuracy_score(label_test, prediction)
    # print('BernoulliNB Accuracy : ', accuracy)
    # run_stats = run_stats.append({'Classifier': 'Bernoulli', 'Accuracy': accuracy, 'Runtime': runtime}, ignore_index=True)

    # linear_SVC = LinearSVC()
    # start_time = time.time()
    # linear_SVC.fit(headline_train_vector, label_train)
    # runtime = time.time() - start_time
    # prediction = linear_SVC.predict(headline_test_vector)
    # accuracy = metrics.accuracy_score(label_test, prediction)
    # print('Linear_SVC Accuracy : ', accuracy)
    # print("Runtime : ", runtime)
    # run_stats = run_stats.append({'Classifier': 'Linear SVC', 'Accuracy': accuracy, 'Runtime': runtime}, ignore_index=True)
    # Last accuracy: 0.7761956

    # passive_aggressive = PassiveAggressiveClassifier()
    # passive_aggressive.fit(headline_train_vector, label_train)
    # prediction = passive_aggressive.predict(headline_test_vector)
    # accuracy = metrics.accuracy_score(label_test, prediction)
    # print('PassiveAggressiveClassifier Accuracy : ', accuracy)

    pprint(run_stats)
    return results
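# Illustrative sketch (not part of the original script): the DataFrame schema that
# create_models expects -- a 'headline' text column and a 'label' column in {-1, 1}
# (remapped to {0, 1} inside the function). The rows below are hypothetical; a realistic
# run needs enough rows for the 98/2 train/test split and the 50/50 validation split,
# plus the module-level imports and the train_test_and_evaluate helper the script uses.
import pandas as pd

demo_headlines = pd.DataFrame({
    'headline': ["stocks rally after earnings beat",
                 "company misses targets, shares slide"],
    'label': [1, -1],
})
# results = create_models(demo_headlines)  # call shown schematically; use a full dataset in practice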
class TextPreProcessor:
    """
    Kwargs:
        normalize (list)
            possible values: ['url', 'email', 'percent', 'money', 'phone',
                'user', 'time', 'date']

        annotate (list)
            possible values: ['hashtag', 'allcaps', 'elongated', 'repeated',
                'emphasis', 'censored']

        unpack_hashtags (bool)

        unpack_contractions (bool)

        segmenter (str): define the statistics of what corpus you would
            like to use [english, twitter]

        corrector (str): define the statistics of what corpus you would
            like to use [english, twitter]

        tokenizer (callable): callable function that accepts a string and
            returns a list of strings. if no tokenizer is provided then
            the text will be tokenized on whitespace

        simplify_emoticons (bool)

        dictionaries (list)
    """

    def __init__(self, **kwargs):
        self.tokens_to_normalize = kwargs.get("normalize", [])
        self.annotate = kwargs.get("annotate", [])
        self.unpack_hashtags = kwargs.get("unpack_hashtags", False)
        self.unpack_contractions = kwargs.get("unpack_contractions", False)
        self.segmenter_corpus = kwargs.get("segmenter", "english")
        self.corrector_corpus = kwargs.get("corrector", "english")
        self.segmenter = Segmenter(corpus=self.segmenter_corpus)
        self.spell_corrector = SpellCorrector(corpus=self.corrector_corpus)
        self.tokenizer = kwargs.get("tokenizer", None)
        self.simplify_emoticons = kwargs.get("simplify_emoticons", False)
        self.dictionaries = kwargs.get("dictionaries", [])
        self.stats = {}
        self.preprocessed_texts = -1

    def pre_process(self, text: str, with_stats=False):
        self._increment_counter()
        text = self._remove_repeating_spaces(text)
        text = self._normalize(text)
        text = self._unpack_hashtags(text)
        text = self._annotate(text)
        text = self._unpack_contractions(text)
        text = self._remove_repeating_spaces(text)
        tokens = self._tokenize(text)
        tokens = self._simplify_emoticons(tokens)
        tokens = self._replace_using_dictionaries(tokens)
        if with_stats:
            return tokens, self._pre_processed_text_stats()
        else:
            return tokens

    def _pre_processed_text_stats(self):
        return self.stats[self.preprocessed_texts]

    def _increment_counter(self):
        self.preprocessed_texts += 1
        self.stats[self.preprocessed_texts] = {}

    def _normalize(self, text):
        for item in self.tokens_to_normalize:
            text = self._change_using_regexp(item, lambda m: f' <{item}> ',
                                             text, 'normalize')
        return text

    def _unpack_hashtags(self, text):
        if self.unpack_hashtags:
            return self._change_using_regexp("hashtag",
                                             lambda w: self._handle_hashtag_match(w),
                                             text, "unpack")
        return text

    def _annotate(self, text):
        text = self._annotate_allcaps(text)
        text = self._annotate_elongated(text)
        text = self._annotate_repeated(text)
        text = self._annotate_emphasis(text)
        text = self._annotate_censored(text)
        return text

    def _annotate_allcaps(self, text):
        if "allcaps" in self.annotate:
            return self._change_using_regexp("allcaps",
                                             lambda w: self._handle_generic_match(w, "allcaps", mode='wrap'),
                                             text, "annotate")
        return text

    def _annotate_elongated(self, text):
        if "elongated" in self.annotate:
            return self._change_using_regexp("elongated",
                                             lambda w: self._handle_elongated_match(w),
                                             text, "annotate")
        return text

    def _annotate_repeated(self, text):
        if "repeated" in self.annotate:
            return self._change_using_regexp("repeat_puncts",
                                             lambda w: self._handle_repeated_puncts(w),
                                             text, "annotate")
        return text

    def _annotate_emphasis(self, text):
        if "emphasis" in self.annotate:
            return self._change_using_regexp("emphasis",
                                             lambda w: self._handle_emphasis_match(w),
                                             text, "annotate")
        return text

    def _annotate_censored(self, text):
        if "censored" in self.annotate:
            return self._change_using_regexp("censored",
                                             lambda w: self._handle_generic_match(w, "censored"),
                                             text, "annotate")
        return text

    def _change_using_regexp(self, regexp_name, func, text, stats_name_prefix):
        changing_result = regexes[regexp_name].subn(func, text)
        self._update_stats(f'{stats_name_prefix}_{regexp_name}', changing_result[1])
        return changing_result[0]

    def _unpack_contractions(self, text):
        if self.unpack_contractions:
            text = self._unpack_selected_contrations(
                r"(\b)([Aa]re|[Cc]ould|[Dd]id|[Dd]oes|[Dd]o|[Hh]ad|[Hh]as|[Hh]ave|"
                r"[Ii]s|[Mm]ight|[Mm]ust|[Ss]hould|[Ww]ere|[Ww]ould)n'?t",
                r"\1\2 not", text)
            text = self._unpack_selected_contrations(
                r"(\b)([Hh]e|[Ii]|[Ss]he|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'ll",
                r"\1\2 will", text)
            text = self._unpack_selected_contrations(
                r"(\b)([Tt]hey|[Ww]hat|[Ww]ho|[Yy]ou)ll",
                r"\1\2 will", text)
            text = self._unpack_selected_contrations(
                r"(\b)([Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'re",
                r"\1\2 are", text)
            text = self._unpack_selected_contrations(
                r"(\b)([Tt]hey|[Ww]hat|[Yy]ou)re",
                r"\1\2 are", text)
            text = self._unpack_selected_contrations(
                r"(\b)([Hh]e|[Ss]he)'s",
                r"\1\2 is", text)
            text = self._unpack_selected_contrations(
                r"(\b)([Ii]|[Ss]hould|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Ww]ould|[Yy]ou)"
                r"'?ve",
                r"\1\2 have", text)
            text = self._unpack_selected_contrations(r"(\b)([Cc]a)n't", r"\1\2n not", text)
            text = self._unpack_selected_contrations(r"(\b)([Ii])'m", r"\1\2 am", text)
            text = self._unpack_selected_contrations(r"(\b)([Ll]et)'?s", r"\1\2 us", text)
            text = self._unpack_selected_contrations(r"(\b)([Ww])on'?t", r"\1\2ill not", text)
            text = self._unpack_selected_contrations(r"(\b)([Ss])han'?t", r"\1\2hall not", text)
            text = self._unpack_selected_contrations(r"(\b)([Yy])(?:'all|a'll)", r"\1\2ou all", text)
        return text

    def _unpack_selected_contrations(self, regexp, replacement, text):
        unpacking_result = re.subn(regexp, replacement, text)
        self._update_stats("unpack_contrations", unpacking_result[1])
        return unpacking_result[0]

    def _tokenize(self, text):
        if self.tokenizer:
            return self.tokenizer(text)
        else:
            return text.split(' ')

    def _simplify_emoticons(self, tokens):
        if self.simplify_emoticons:
            result = []
            for token in tokens:
                if token in emoticons:
                    new_emoticon = emoticons[token]
                    if new_emoticon != token:
                        self._update_stats('emoticon_simplification', 1)
                    result.append(new_emoticon)
                else:
                    result.append(token)
            return result
        else:
            return tokens

    def _replace_using_dictionaries(self, tokens):
        if len(self.dictionaries) > 0:
            for dictionary in self.dictionaries:
                for idx, token in enumerate(tokens):
                    if token in dictionary:
                        value = dictionary[token]
                        if '<entity>' not in value:
                            tokens[idx] = value
                            self._update_stats('dictionary_replacement', 1)
            return ' '.join(tokens).split(' ')
        else:
            return tokens

    @lru_cache(maxsize=65536)
    def _handle_hashtag_match(self, m):
        text = m.group()[1:]
        if text.islower():
            expanded = self.segmenter.segment(text)
            expanded = " ".join(expanded.split("-"))
            expanded = " ".join(expanded.split("_"))
        else:
            expanded = regexes["camel_split"].sub(r' \1', text)
            expanded = expanded.replace("-", "")
            expanded = expanded.replace("_", "")

        if "hashtag" in self.annotate:
            expanded = self._add_special_tag(expanded, "hashtag", mode="wrap")

        return expanded

    @lru_cache(maxsize=65536)
    def _handle_generic_match(self, m, tag, mode="every"):
        text = m.group()
        if tag == 'allcaps':
            # workaround for allcaps contractions like YOU'RE  TODO: refactor
            text = text.lower()
        text = self._add_special_tag(text, tag, mode=mode)
        return text

    def _handle_elongated_match(self, m):
        text = m.group()
        text = regexes["normalize_elong"].sub(r'\1\1', text)
        normalized = self.spell_corrector.normalize_elongated(text)
        if normalized:
            text = normalized
        text = self._add_special_tag(text, "elongated")
        return text

    @lru_cache(maxsize=65536)
    def _handle_repeated_puncts(self, m):
        text = m.group()
        text = "".join(sorted(set(text), reverse=True))
        text = self._add_special_tag(text, "repeated")
        return text

    @lru_cache(maxsize=65536)
    def _handle_emphasis_match(self, m):
        text = m.group().replace("*", "")
        text = self._add_special_tag(text, "emphasis")
        return text

    def _update_stats(self, key, value):
        if value > 0:
            stats_for_text = self.stats[self.preprocessed_texts]
            if key not in stats_for_text:
                stats_for_text[key] = 0
            stats_for_text[key] += value

    @staticmethod
    def _remove_repeating_spaces(text):
        return re.sub(r' +', ' ', text).strip()

    @staticmethod
    def _add_special_tag(m, tag, mode="single"):
        if isinstance(m, str):
            text = m
        else:
            text = m.group()

        if mode == "single":
            return " {} <{}> ".format(text, tag)
        elif mode == "wrap":
            return " ".join([" <{}> {} </{}> ".format(tag, text, tag)]) + " "
        elif mode == "every":
            tokens = text.split()
            processed = " ".join([" {} <{}> ".format(t, tag) for t in tokens])
            return " " + processed + " "
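# Illustrative sketch (not part of the original class): driving the customised
# TextPreProcessor above. The class relies on module-level regexes/emoticons and the
# ekphrasis imports of its own project, so this configuration is an assumption for
# demonstration only.
custom_processor = TextPreProcessor(
    normalize=['url', 'user'],
    annotate=['hashtag', 'allcaps', 'elongated'],
    unpack_hashtags=True,
    unpack_contractions=True,
    segmenter='twitter',
    corrector='twitter',
    simplify_emoticons=True,
)
tokens, stats = custom_processor.pre_process("SOOO happy about #MachineLearning :)))",
                                             with_stats=True)
print(tokens)  # normalized tokens with annotation tags
print(stats)   # per-text counters, e.g. keys like 'annotate_allcaps' or 'unpack_hashtag'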
def spell_correct(text):
    sp = SpellCorrector(corpus="twitter").correct(text)
    return sp
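# Illustrative sketch (not part of the original snippet): spell_correct builds a new
# SpellCorrector (and loads the "twitter" statistics) on every call, so for batches it is
# cheaper to construct the corrector once, as most of the other snippets in this section
# do. spell_correct_many below is a hypothetical helper added only for this example.
from ekphrasis.classes.spellcorrect import SpellCorrector

sp = SpellCorrector(corpus="twitter")


def spell_correct_many(words):
    # reuse one corrector instance across many tokens
    return [sp.correct(w) for w in words]


print(spell_correct("tomorow"))
print(spell_correct_many(["tomorow", "folow", "grl"]))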