Example #1
import re

from ekphrasis.classes.segmenter import Segmenter
# Assumption: `sid` is an NLTK VADER analyzer, inferred from the use of
# polarity_scores() and the 'compound' key below.
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()


def hashtag_sentiment(tweet):
    hash_tag = re.findall(r"#([a-zA-Z0-9]{1,25})", tweet)
    seg = Segmenter()
    hashtag_polarity = []
    for hashtag in hash_tag:
        tokens = seg.segment(hashtag)
        ss = sid.polarity_scores(tokens)
        if 'not' not in tokens.split(' '):
            hashtag_polarity.append(ss['compound'])
        else:
            hashtag_polarity.append(-ss['compound'])
    sentiment = 0
    if len(hashtag_polarity) > 0:
        sentiment = round(
            float(sum(hashtag_polarity) / float(len(hashtag_polarity))), 2)
    return sentiment
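# Usage sketch (an illustration, not from the original snippet; requires
# nltk.download('vader_lexicon') and the ekphrasis statistics on first run):
# hashtag_sentiment("Great game today #winning #nothappy")
# returns the mean VADER compound score of the segmented hashtags, rounded
# to 2 decimals, or 0 if the tweet contains no hashtags.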
Example #2
def hashtag_sentiment(tweet):
    hash_tag = re.findall(r"#([a-zA-Z0-9]{1,25})", tweet)
    hashtag_polarity = []
    seg = Segmenter(corpus="twitter")
    for hashtag in hash_tag:
        tokens = seg.segment(hashtag)
        # The polarity_scores method of a SentimentIntensityAnalyzer returns
        # a sentiment dictionary with pos, neg, neu, and compound scores.
        ss = sid.polarity_scores(tokens)
        if 'not' not in tokens.split(' '):
            hashtag_polarity.append(ss['compound'])
        else:
            hashtag_polarity.append(- ss['compound'])
    sentiment = 0
    if len(hashtag_polarity) > 0:
        sentiment = round(float(sum(hashtag_polarity) / float(len(hashtag_polarity))), 2)
    return sentiment
Example #3
def handle_tweets(df_tweets):
    # `tokenizer` (used below) is assumed to be defined elsewhere in the
    # original project, e.g. an NLTK TweetTokenizer().tokenize-style callable.
    seg_eng = Segmenter(corpus="english")
    texts = list(df_tweets["text"])
    #f = open(data_path + "abs_tweets.txt", "w")
    hashtags = []
    clean_tweets = []
    for t in texts:
        pattern = r'#\w+'  # matches a hashtag anywhere in the tweet
        remove = re.compile(pattern)
        removed_t = remove.sub(r'', t)
        matches = re.findall(pattern, t)
        hashes = [seg_eng.segment(i.lstrip('#').lower()) for i in matches]
        tweet = tokenizer(removed_t)
        clean_tweets.append(tweet)
        hashtags.append(hashes)
    #   f.write(tweet)
    #  f.write("\n")
    #f.close()
    return clean_tweets, hashtags
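# Usage sketch (hypothetical DataFrame; assumes pandas and the `tokenizer`
# noted above):
# import pandas as pd
# df = pd.DataFrame({"text": ["Nice day #ilikedogs", "jam time #gamedev"]})
# clean, tags = handle_tweets(df)  # tags e.g. [['i like dogs'], ['game dev']]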
Example #4
# `seg_eng` (used below) and the file lists listOfFILEcomments, listOfFILEposts,
# and listOfFILEtags are assumed to be defined earlier in the original script,
# e.g. seg_eng = Segmenter(corpus="english").
fhc = open('finalallcomments.txt', 'a+')
fhp = open('finalallposts.txt', 'a+')
fht = open('finalalltags.txt', 'a+')
for commentFILE, postFILE, tagFILE in zip(listOfFILEcomments, listOfFILEposts,
                                          listOfFILEtags):
    commentGenerator = open(commentFILE, 'r')
    postGenerator = open(postFILE, 'r')
    tagGenerator = open(tagFILE, 'r')
    for comment, post, tag in zip(commentGenerator, postGenerator,
                                  tagGenerator):
        if comment.strip() and post.strip() and tag.strip():
            fhc.write(comment)
            fhp.write(post)
            fht.write(';'.join(
                [seg_eng.segment(w) for w in tag.split(';') if w]))
    commentGenerator.close()
    postGenerator.close()
    tagGenerator.close()

fhc = open('finalallcomments.txt', 'r')
fhp = open('finalallposts.txt', 'r')
fht = open('finalalltags.txt', 'r')

# `random` is assumed to be imported upstream; the random key lets the aligned
# (comment, post, tag) triples be shuffled together later.
data = [(random.random(), line1, line2, line3)
        for line1, line2, line3 in zip(fhc, fhp, fht)]

fhc.close()
fhp.close()
fht.close()
Example #5
    def segmentation(self):
        # Method excerpted from a larger class: both segmenters are built,
        # but only the Twitter one is applied (seg_eg is unused as written).
        from ekphrasis.classes.segmenter import Segmenter
        seg_eg = Segmenter(corpus="english")
        seg_tw = Segmenter(corpus="twitter")
        self.text = [seg_tw.segment(sent) for sent in self.text]
        return self.text
Example #6
# Assumed imports for this snippet: re, ftfy, functools.lru_cache, plus
# Segmenter, SpellCorrector, ExManager and unpack_contractions from ekphrasis.
class TextPreProcessor:
    def __init__(self, **kwargs):
        """
        Kwargs:
            omit (list): choose which tokens you want to omit from the text.
                possible values: ['email', 'percent', 'money', 'phone', 'user',
                    'time', 'url', 'date', 'hashtag']
                Important Notes:
                            1 - put url at front, if you plan to use it.
                                Messes with the regexes!
                            2 - if you use hashtag then unpack_hashtags will
                                automatically be set to False

            normalize (list): choose which tokens you want to normalize
                in the text.
                possible values: ['email', 'percent', 'money', 'phone', 'user',
                    'time', 'url', 'date', 'hashtag']
                for example: user@example.com will be transformed to <email>
                Important Notes:
                            1 - put url at front, if you plan to use it.
                                Messes with the regexes!
                            2 - if you use hashtag then unpack_hashtags will
                                automatically be set to False

            unpack_contractions (bool): replace *English* contractions in
                ``text`` with their unshortened forms,
                for example: can't -> can not, wouldn't -> would not, and so on.

            unpack_hashtags (bool): split a hashtag into its constituent words.
                for example: #ilikedogs -> i like dogs

            annotate (list): add special tags to special tokens.
                possible values: ['hashtag', 'allcaps', 'elongated', 'repeated']
                for example: user@example.com -> user@example.com <email>

            tokenizer (callable): callable function that accepts a string and
                returns a list of strings. If no tokenizer is provided,
                the text will be tokenized on whitespace.

            segmenter (str): select the corpus [english, twitter] whose word
                statistics will be used for word segmentation.

            corrector (str): select the corpus [english, twitter] whose word
                statistics will be used for spell correction.

            all_caps_tag (str): how to wrap the capitalized words
                values [single, wrap, every]
                Note: applicable only when `allcaps` is included in annotate[]
                    - single: add a tag after the last capitalized word
                    - wrap: wrap all words with opening and closing tags
                    - every: add a tag after each word

            spell_correct_elong (bool): choose if you want to perform
                spell correction after the normalization of elongated words.
                * significantly affects performance (speed)

            spell_correction (bool): choose if you want to perform
                spell correction to the text
                * significantly affects performance (speed)

            fix_text (bool): choose if you want to fix bad unicode terms and
                html entities.
        """
        self.omit = kwargs.get("omit", {})
        self.backoff = kwargs.get("normalize", {})
        self.include_tags = kwargs.get("annotate", {})
        self.unpack_contractions = kwargs.get("unpack_contractions", False)
        self.tokenizer = kwargs.get("tokenizer", None)
        self.dicts = kwargs.get("dicts", None)
        self.spell_correction = kwargs.get("spell_correction", False)
        self.spell_correct_elong = kwargs.get("spell_correct_elong", False)
        self.fix_text = kwargs.get("fix_bad_unicode", False)
        self.unpack_hashtags = kwargs.get("unpack_hashtags", False)
        self.segmenter_corpus = kwargs.get("segmenter", "english")
        self.corrector_corpus = kwargs.get("corrector", "english")
        self.all_caps_tag = kwargs.get("all_caps_tag", "wrap")
        self.mode = kwargs.get("mode", "normal")

        if self.unpack_hashtags:
            self.segmenter = Segmenter(corpus=self.segmenter_corpus)
        if self.mode != "fast":
            self.spell_corrector = SpellCorrector(corpus=self.corrector_corpus)

        self.regexes = ExManager().get_compiled()
        if 'hashtag' in self.omit or 'hashtag' in self.backoff:
            print("You can't omit/backoff and unpack hashtags!\n "
                  "unpack_hashtags will be set to False")
            self.unpack_hashtags = False

    def __copy__(self):
        return self

    def __deepcopy__(self, memo):
        return self

    @staticmethod
    def add_special_tag(m, tag, mode="single"):

        if isinstance(m, str):
            text = m
        else:
            text = m.group()

        if mode == "single":
            return " {} <{}> ".format(text, tag)
        elif mode == "wrap":
            return " ".join([" <{}> {} </{}> ".format(tag, text, tag)]) + " "
        elif mode == "every":
            tokens = text.split()
            processed = " ".join([" {} <{}> ".format(t, tag) for t in tokens])
            return " " + processed + " "

    @lru_cache(maxsize=4096)
    def handle_hashtag_match(self, m):
        """
        Break a string to its constituent words (using Viterbi algorithm)
        """
        text = m.group()[1:]

        # todo:simplify routine
        if text.islower():
            expanded = self.segmenter.segment(text)
            expanded = " ".join(expanded.split("-"))
            expanded = " ".join(expanded.split("_"))
            # print(m.group(), " - ", expanded)
            # with open("analysis/segmenter_" +
            # self.segmenter_corpus + ".txt", "a") as f:
            #     f.write(m.group() + "\t" + expanded + "\n")

        else:
            # split words following CamelCase convention
            expanded = self.regexes["camel_split"].sub(r' \1', text)
            expanded = expanded.replace("-", "")
            expanded = expanded.replace("_", "")
            # print(m.group(), " - ", expanded)

        if "hashtag" in self.include_tags:
            expanded = self.add_special_tag(expanded, "hashtag", mode="wrap")

        return expanded

    def handle_elongated_match(self, m):
        text = m.group()

        # normalize to at most 2 repeating chars
        text = self.regexes["normalize_elong"].sub(r'\1\1', text)

        normalized = self.spell_corrector.normalize_elongated(text)
        if normalized:
            text = normalized

        # try to spell correct the word
        if self.spell_correct_elong:
            text = self.spell_corrector.correct_word(text,
                                                     assume_wrong=True,
                                                     fast=True)
            # with open("analysis/spell_corrector_" +
            # self.corrector_corpus + ".txt", "a") as f:
            #     f.write(m.group() + " - " + text + "\n")

            # print(m.group(), "-", text)
        if "elongated" in self.include_tags:
            text = self.add_special_tag(text, "elongated")

        return text

    @lru_cache(maxsize=4096)
    def handle_repeated_puncts(self, m):
        """
        return the sorted set so mathes random combinations of puncts
        will be mapped to the same token
        "!??!?!!", "?!!!!?!", "!!?", "!?!?" --> "?!"
        "!...", "...?!" --> ".!"
        :param m:
        :return:
        """
        text = m.group()
        text = "".join(sorted(set(text), reverse=True))

        if "repeated" in self.include_tags:
            text = self.add_special_tag(text, "repeated")

        return text

    @lru_cache(maxsize=4096)
    def handle_generic_match(self, m, tag, mode="every"):
        """

        Args:
            m ():
            tag ():
            mode ():

        Returns:

        """
        text = m.group()
        text = self.add_special_tag(text, tag, mode=mode)

        return text

    @lru_cache(maxsize=4096)
    def handle_emphasis_match(self, m):
        """
        :param m:
        :return:
        """
        text = m.group().replace("*", "")
        if "emphasis" in self.include_tags:
            text = self.add_special_tag(text, "emphasis")

        return text

    @staticmethod
    def dict_replace(wordlist, _dict):
        return [_dict[w] if w in _dict else w for w in wordlist]

    @staticmethod
    def remove_hashtag_allcaps(wordlist):
        in_hashtag = False
        _words = []
        for word in wordlist:

            if word == "<hashtag>":
                in_hashtag = True
            elif word == "</hashtag>":
                in_hashtag = False
            elif word in {"<allcaps>", "</allcaps>"} and in_hashtag:
                continue

            _words.append(word)

        return _words
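    # e.g. remove_hashtag_allcaps("<hashtag> <allcaps> nba </allcaps> </hashtag>".split())
    # returns ['<hashtag>', 'nba', '</hashtag>']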

    @lru_cache(maxsize=4096)
    def handle_general_word_segment_and_spelling(self, m):
        """
        :param m:
        :return:
        """
        text = m.group()
        text = self.segmenter.segment(text)

        return text

    def pre_process_doc(self, doc):

        doc = re.sub(r' +', ' ', doc)  # remove repeating spaces

        # ###########################
        # # fix bad unicode
        # ###########################
        # if self.fix_bad_unicode:
        #     doc = textacy.preprocess.fix_bad_unicode(doc)
        #
        # ###########################
        # # fix html leftovers
        # ###########################
        # doc = html.unescape(doc)

        ###########################
        # fix text
        ###########################
        if self.fix_text:
            doc = ftfy.fix_text(doc)

        ###########################
        # BACKOFF & OMIT
        ###########################
        for item in self.backoff:
            # better add an extra space after the match.
            # Just to be safe. extra spaces will be normalized later anyway
            doc = self.regexes[item].sub(
                lambda m: " " + "<" + item + ">" + " ", doc)
        for item in self.omit:
            doc = doc.replace("<" + item + ">", '')

        ###########################
        # segment other words not hashtags
        ###########################

        # doc = self.regexes['not_hashtag'].sub(
        # lambda w: self.handle_general_word_segment_and_spelling(w), doc)

        # for word in doc.split(" "):
        # if(not word.startswith('#')):
        # word = self.segmenter.segment(word)
        # new_doc.append(word)
        # doc = " ".join(new_doc)

        ###########################
        # unpack hashtags
        ###########################

        if self.unpack_hashtags:
            doc = self.regexes["hashtag"].sub(
                lambda w: self.handle_hashtag_match(w), doc)

        ###########################
        # handle special cases
        ###########################
        if self.mode != "fast":
            if "allcaps" in self.include_tags:
                doc = self.regexes["allcaps"].sub(
                    lambda w: self.handle_generic_match(
                        w, "allcaps", mode=self.all_caps_tag), doc)

            if "elongated" in self.include_tags:
                doc = self.regexes["elongated"].sub(
                    lambda w: self.handle_elongated_match(w), doc)

            if "repeated" in self.include_tags:
                doc = self.regexes["repeat_puncts"].sub(
                    lambda w: self.handle_repeated_puncts(w), doc)

            if "emphasis" in self.include_tags:
                doc = self.regexes["emphasis"].sub(
                    lambda w: self.handle_emphasis_match(w), doc)

            if "censored" in self.include_tags:
                doc = self.regexes["censored"].sub(
                    lambda w: self.handle_generic_match(w, "censored"), doc)

        ###########################
        # unpack contractions: i'm -> i am, can't -> can not...
        ###########################

        # remove textacy dependency
        if self.unpack_contractions:
            doc = unpack_contractions(doc)

        # omit allcaps if inside hashtags
        doc = re.sub(r' +', ' ', doc)  # remove repeating spaces
        # doc = re.sub(r'<hashtag><allcaps>', '<hashtag>', doc)  # remove repeating spaces
        # doc = doc.replace('<hashtag> <allcaps>', '<hashtag>')
        # doc = doc.replace('</allcaps> </hashtag>', '</hashtag>')

        ###########################
        # Tokenize
        ###########################
        doc = self.remove_hashtag_allcaps(doc.split())
        doc = " ".join(doc)  # normalize whitespace
        if self.tokenizer:
            doc = self.tokenizer(doc)

            # Replace tokens with special dictionaries (slang,emoticons ...)
            # todo: add spell check before!
            if self.dicts:
                for d in self.dicts:
                    doc = self.dict_replace(doc, d)

        return doc

    def pre_process_docs(self, docs, lazy=True):
        # note: `lazy` is accepted but unused; this generator always yields lazily
        from tqdm import tqdm
        for d in tqdm(docs, desc="PreProcessing..."):
            yield self.pre_process_doc(d)
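# Usage sketch (hedged; mirrors the ekphrasis README, assuming
# from ekphrasis.classes.tokenizer import SocialTokenizer):
# text_processor = TextPreProcessor(
#     normalize=['url', 'email', 'user'],
#     annotate={"hashtag", "allcaps", "elongated"},
#     unpack_hashtags=True,
#     unpack_contractions=True,
#     segmenter="twitter",
#     corrector="twitter",
#     tokenizer=SocialTokenizer(lowercase=True).tokenize)
# tokens = text_processor.pre_process_doc("CANT WAIT for the new season! #TheWalkingDead")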
Example #7
# Assumed imports: re, TextPreProcessor/Segmenter from ekphrasis, TweetTokenizer
# from nltk.tokenize, and emoticons from ekphrasis.dicts.emoticons.
def clean_tweets(df):
    # define the text preprocessor
    text_processor = TextPreProcessor(
        # terms that will be normalized
        normalize=['url', 'email', 'money', 'phone', 'time', 'date'],
        # terms that will be annotated
        annotate={
            "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
            'censored'
        },
        fix_html=True,  # fix HTML tokens

        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter="twitter",

        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector="twitter",
        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # Unpack contractions (can't -> can not)
        spell_correct_elong=False,  # spell correction for elongated words

        # select a tokenizer. You can use SocialTokenizer, or pass your own
        # the tokenizer, should take as input a string and return a list of tokens
        #tokenizer=SocialTokenizer(lowercase=True).tokenize,
        tokenizer=TweetTokenizer().tokenize,

        # list of dictionaries, for replacing tokens extracted from the text,
        # with other expressions. You can pass more than one dictionary.
        dicts=[emoticons])
    seg = Segmenter(corpus="twitter")

    tweet_text = df.tweet_text.to_list()

    clean_tweets = []
    for tweet in tweet_text:

        # manually tag usernames
        # ex: @DoctorChristian -> <user> doctor christian </user>
        match = re.findall(r'@\w+', tweet)

        try:
            for at in match:
                user_seg = seg.segment(at[1:])
                tweet = tweet.replace(at, '<user> ' + user_seg + ' </user>')
        except Exception:
            pass

        # manually tag all caps so that the unpack_contractions function works
        match = re.findall(r"(?<![#@$])\b([A-Z][A-Z ,.']*[A-Z])\b", tweet)

        try:
            for all_caps in match:
                tweet = tweet.replace(
                    all_caps, '<allcaps> ' + all_caps.lower() + ' </allcaps>')
        except Exception:
            pass

        # manually tag percentages
        match = re.findall(r"(\d+\.?\d?%)", tweet)

        try:
            for percent in match:
                tweet = tweet.replace(
                    percent,
                    '<percent> ' + percent[0:len(percent) - 1] + ' </percent>')
        except Exception:
            pass

        # deal with contractions that the tool misses
        tweet = re.sub(
            r"(\b)([Ww]hat|[Ii]t|[Hh]e|[Ss]he|[Tt]hat|[Tt]here|[Hh]ow|[Ww]ho|[Hh]ere|[Ww]here|[Ww]hen)'s",
            r"\1\2 is", tweet)
        tweet = re.sub(r"(\b)([Aa]in)'t", r"is not", tweet)
        tweet = re.sub(r"(\b)([Ww]asn)'t", r"was not", tweet)
        tweet = re.sub(r"(\b)([Hh]e|[Ss]he|[Ii]|[Yy]ou|[Tt]hey|[Ww]e)'d",
                       r"\1\2 would", tweet)
        tweet = re.sub(r"(\b)([Ii]t|[Tt]hat|[Tt]his)'ll", r"\1\2 will", tweet)
        tweet = re.sub(r"(\b)([Cc])'mon", r"come on", tweet)

        # process the rest of the tweet with the nltk tweet tokenizer
        tweet = " ".join(text_processor.pre_process_doc(tweet)).lower()

        clean_tweets.append(tweet)

    # replace the raw tweet text with the cleaned tweets
    df['tweet_text'] = clean_tweets

    return df
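# Usage sketch (hypothetical column values; assumes pandas as pd):
# df = pd.DataFrame({"tweet_text": ["@DoctorChristian I CANT believe it is 50.5% !!!"]})
# df = clean_tweets(df)  # tweet_text now holds the normalized, tagged text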
Example #8
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionary.
    dicts=[emoticons])

seg = Segmenter(corpus="twitter")

clean_tweets = []
for tweet in data:

    # manually tag usernames
    # ex: @DoctorChristian -> <user> doctor christian </user>
    match = re.findall(r'@\w+', tweet)

    try:
        for at in match:
            user_seg = seg.segment(at[1:])
            tweet = tweet.replace(at, '<user> ' + user_seg + ' </user>')
    except Exception:
        pass

    # manually tag all caps so that the unpack_contractions function works
    match = re.findall(r"(?<![#@$])\b([A-Z][A-Z ,.']*[A-Z])\b", tweet)

    try:
        for all_caps in match:
            tweet = tweet.replace(
                all_caps, '<allcaps> ' + all_caps.lower() + ' </allcaps>')
    except Exception:
        pass

    # deal with contractions that the tool misses
Example #9
from ekphrasis.classes.segmenter import Segmenter

# segmenter using the word statistics from English Wikipedia
seg_eng = Segmenter(corpus="english")

# segmenter using the word statistics from Twitter
seg_tw = Segmenter(corpus="twitter")

# segmenter using the word statistics from Twitter (2018 snapshot)
seg_tw_2018 = Segmenter(corpus="twitter_2018")

words = [
    "exponentialbackoff", "gamedev", "retrogaming", "thewatercooler",
    "panpsychism"
]
for w in words:
    print(w)
    print("(eng):", seg_eng.segment(w))
    print("(tw):", seg_tw.segment(w))
    print("(tw):", seg_tw_2018.segment(w))
    print()
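# Per the ekphrasis README, the output has this shape (exact splits depend on
# the installed corpus statistics):
#
#   exponentialbackoff
#   (eng): exponential backoff
#   (tw): exponential back off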
Example #10
def preprocess_corpus(corpus, stemming=False,
                      all_smilies=False, pos_smilies=False, neg_smilies=False, other_smilies=False,
                      hugs_and_kisses=False, hearts=False,
                      hashtag=False, hashtag_mention=False,
                      numbers=False, number_mention=False,
                      exclamation=False,  # NOTE: this one is not tested yet; consider just removing it
                      set_to_not=False,
                      segmentation_hash=False,
                      spelling=False,
                      elongation=False,
                      remove_signs=False
                      ):
    """ Function used to apply preprocessing
    Input:
        corpus: a corpus on the format as the output in creat_corpus. Default False. 
        all_smilies: if true, same effect as if pos_smilies, neg_smilies, and other_smilies were true.Default False.
        pos_smilies: if true, positive smilies such as : ), : (, ; ), ( ;, :p, ;p, : p, are replaced by "possmiley.Default False.
        neg_smilies: if true, negative smilies such as : (, ) : are replaced by "negsmiely".Default False.
        other_smilies: if true, smilies such as ^_^ are replaced by a describing word.Default False. 
        hugs_and_kisses: if true, words such as xxx xoxo etc are replaced by "kisses" or "hug" and "kisses". Default False.
        hearts: if true, "<3" are replaced by "heart".Default False.
        hashtags: if true, hashtags are removed from the beginning of words, so #apple becomes apple.Default False. 
        hashtag_mention: if true, and if hashtag is true, the word "hashatag" is added at the end of a tweet that used to contain
            one or more words beginning with a hashtag. Default False.
        numbers: if true, words that are purely numbers are removed.Default False.
        number_mention: if true, and if number is true, the word "thereisanumber" is added at the end of a tweet that used 
            to contain one or more words that were purely numbers. Default False.
        exclamation: if true, the word "exclamation" is added at the end of a tweet that contain one or more "!".Default False. 
        set_to_not: if true, all words ending with "n't" is replaced by not.Default False. 
        segmentation_hash: if true, words starting with # that do not appear in the english dictionary is split into segments, 
            eg '#iammoving' becomes 'i am moving'. Default False.
        spelling: if true, all words that are not a part of the english dictionary is set to the most likely word,
            within two alterations. Default False.
        elongation: if true, the length of all sequences of letters in words that are not a part of the English dictionary 
            is set to max 2. Before words that are altered because of this, the word 'elongation' appears. Default False.
        remove_signs: if true, signs such as ",", ".", ":", ";", "-", are removed. Default False.
    
    Output:
        new_corpus: a new corpus, on same format as the input corpus. 
    """
   
    # Assumed imports: time, numpy as np, enchant, nltk's TweetTokenizer and
    # PorterStemmer, and ekphrasis' Segmenter and SpellCorrector.
    start = time.time()
    
    #initialising the new corpus:
    new_corpus=[]

    #Want to split the tweets using this tokenizer:
    tknzr = TweetTokenizer(reduce_len=True)
    
    
    
    if stemming:
        ps = PorterStemmer()
    
    if segmentation_hash or spelling or elongation:
        d = enchant.Dict("en_US")
    
    if segmentation_hash: 
        #seg = Segmenter(corpus="english")
        seg = Segmenter(corpus="twitter")

    if spelling: 
        sp = SpellCorrector(corpus="english")
        
    
    elapsed = time.time()
    print("Time in min before starting first for loop:", (elapsed - start) / 60 )
    
    #Want to go though each line (tweet) in the corpus
    for k, line in enumerate(corpus):
        
        
        if hashtag_mention:
            there_is_hashtag=False
        if number_mention:
            there_is_number=False
        if exclamation:
            there_is_exclamation=False
            
        #Splitting the tweet using the chosen tokenizer. 
        words=tknzr.tokenize(line)
        #Initializing for cleaned_tweet:
        cleaned_tweet=[]
        
        for i, word in enumerate(words):
            #Indicating that the word has not been treated yet
            word_not_treated=True
            end_=len(words)-1
            if ((pos_smilies or all_smilies) and word_not_treated):
                if (i>0 and (word=='d' and (words[i-1]==':' or words[i-1]==';'))) or word == ':d' or word == ';d':
                    cleaned_tweet.append('smile')
                    word_not_treated=False
                elif (i>0 and (word=='p' and (words[i-1]==':' or words[i-1]==';'))) or word == ':p' or word == ';p' :
                    cleaned_tweet.append('smile')
                    word_not_treated=False
                elif i>0 and word=='d' and (words[i-1]==':' or words[i-1]==';' or words[i-1]=='x'):
                    cleaned_tweet.append('smile')
                    word_not_treated=False
                elif i>0 and words[i-1]=='(' and (word==':' or word==';'):
                    cleaned_tweet.append('smile')
                    word_not_treated=False
                elif i>0 and word==')' and (words[i-1]==':' or words[i-1]==';'):
                    cleaned_tweet.append('smile')
                    word_not_treated=False

            if ((neg_smilies or all_smilies) and word_not_treated):
                if i>0 and words[i-1]==')' and (word==':' or word==';'):
                    cleaned_tweet.append('sad')
                    word_not_treated=False
                elif i>0 and word=='(' and (words[i-1]==':' or words[i-1]==';'):
                    cleaned_tweet.append('sad')
                    word_not_treated=False
            
            if ((other_smilies or all_smilies) and word_not_treated):
                if i>0  and i<end_ and word=='_' and words[i-1]=='^' and words[i+1]=='^':
                    cleaned_tweet.append('eyesmiley')
                    word_not_treated=False
                elif i>0 and word=='o' and words[i-1]==':':
                    cleaned_tweet.append('openmouthface')
                    word_not_treated=False
                elif i>0 and word=='/' and words[i-1]==':':
                    cleaned_tweet.append('slashsmiely')
                    word_not_treated=False
                elif i>0 and word=='*' and (words[i-1]==':' or words[i-1]==';'):
                    cleaned_tweet.append('kiss')
                    word_not_treated=False
                
            if ((hugs_and_kisses and word_not_treated)):
                    #want to find hearts, hugs, kisses, etc: 
                if (word == "xoxo" or word == "xo" or word == "xoxoxo" or word == "xxoo"):
                    cleaned_tweet.append('hug')
                    cleaned_tweet.append('kiss')
                    word_not_treated=False
                elif (word=='xx' or word=='xxx'or word=='xxxx'):
                    cleaned_tweet.append('kiss')
                    word_not_treated=False
            
            if ((hearts and word_not_treated)):
                if word == "<3":
                    cleaned_tweet.append('heart')
                    word_not_treated=False
            
            if (hashtag and word_not_treated):
                if word[0]=='#':
                    there_is_hashtag=True
                    if (len(word)>1 and segmentation_hash and not d.check(word[1:])):
                        cleaned_tweet.append(seg.segment(word[1:]))
                    else:
                        cleaned_tweet.append(word[1:])
                    word_not_treated=False
            
            if (numbers and word_not_treated):
                if word.isdigit():
                    there_is_number=True
                    word_not_treated=False
                    
            if (exclamation and word_not_treated):
                if word=='!':
                    there_is_exclamation=True
                    cleaned_tweet.append(word)
                    word_not_treated=False
            
            if (set_to_not and word_not_treated):
                if word[-3:]=='n\'t':
                    cleaned_tweet.append('not')
                    word_not_treated=False
           
            
         
            if (word_not_treated):
                if (not remove_signs) or (remove_signs and ( (word!= '^' and word!=',' and word!='.' and word!=':' 
                                                              and word!='-' and word!='´' and word!=';'and word!=')' 
                                                              and word!='(' and word!='*'))):
                  
                    if ((not word[0].isdigit()) and elongation and not d.check(word) and len(word)>2):
                        new=[]
                        new.append(word[0])
                        for i,letter in enumerate(word):
                            if i>0 and i<len(word)-1: 
                                if not( letter==word[i-1]==word[i+1]):
                                    new.append(letter)
                        new.append(word[-1])
                        new_word=''.join(new)
                        if new_word!= word:
                            cleaned_tweet.append('elongation')
                            word=new_word

                    if spelling and not d.check(word)and len(word)>2: 
                        word=sp.correct(word)
                    if stemming:
                        word=ps.stem(word)

                    
                    cleaned_tweet.append(word)

           
                
        
        if (hashtag_mention and there_is_hashtag) :
            cleaned_tweet.append('hashtag')
        if (number_mention and there_is_number) :
            cleaned_tweet.append('number')
        if (exclamation and there_is_exclamation):
            cleaned_tweet.append('exclamation')
            
            
        new_words = ' '.join(cleaned_tweet)
        new_words = new_words.encode('utf-8')
        new_corpus.append(new_words)
        
        if np.mod(k, 25000) == 1:
            elapsed = time.time()
            print("Time in min after", k, "tweets:", (elapsed - start) / 60)

        
    elapsed = time.time()
    print("Time in min total:", (elapsed - start) / 60 )
    return new_corpus       
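# Usage sketch (hypothetical mini-corpus; the flags are illustrative):
# corpus = ["I'm soooo happyyy #iammoving <3 !!!", "bad day :("]
# cleaned = preprocess_corpus(corpus, hashtag=True, segmentation_hash=True,
#                             hearts=True, set_to_not=True, exclamation=True,
#                             hashtag_mention=True)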
Example #11
# ekphrasis is meant for sentiment analysis; here it is used specifically for hashtag segmentation


# Method to clean tweets: remove special characters, hashtags and URLs
def clean_tweet(tweet):
    tweet = re.sub(r"pic\.\S+", "", tweet)
    return ' '.join(
        re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ",
               tweet).split())


# Query the 20 most recent tweets
tweets = query_tweets_from_user("realDonaldTrump", 20)

# The corpus refers to the statistics that will be used to segment the hashtags; here they come from Twitter
seg_tw = Segmenter(corpus="twitter")
hashtagArray = []

# Print the cleaned tweets and collect their hashtags
for tweet in tweets:
    print(clean_tweet(tweet.text))
    tweetHashtag = re.findall(r"#(\w+)", tweet.text)
    if len(tweetHashtag) != 0:
        hashtagArray.extend(tweetHashtag)
    print("\n")

print("Hashtags Segmention:\n")

for hashtag in hashtagArray:  #
    print("(tw):", seg_tw.segment(hashtag))
Example #12
    },
    fix_html=True,  # fix HTML tokens
    segmenter="twitter",
    corrector="twitter",
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=True,  # spell correction for elongated words
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    dicts=[emoticons])

seg_tw = Segmenter(corpus="twitter")
sp = SpellCorrector(corpus="twitter")
f1 = open('tokenized_tweets_golbeck.txt', 'w')
# `data` (an iterable of tab-separated lines) and `text_processor` are assumed
# to be defined earlier in the original script
for line in data:
    a = line.strip().split('\t')
    if len(a) >= 3:
        b = a[2]
        c = a[1]
        b = b.split()
        for i in range(len(b)):
            if b[i].startswith('http'):
                b[i] = '<url>'
        b = ' '.join(b)
        a = text_processor.pre_process_doc(b)
        for i in range(len(a)):
            if a[i].isalpha():
                a[i] = seg_tw.segment(sp.correct(a[i]))
        a = ' '.join(a)
        f1.write(a + ' ' + c + '\n')
Example #13
    dicts=[emoticons, slangdict])

segmenter = Segmenter(corpus="twitter")
count = 0
all_texts = []
user_dict = defaultdict(lambda: None)

for file_name in sorted(os.listdir(tweet_path)):
    if file_name.endswith('.json'):
        print('processing ' + file_name)
        with open(tweet_path + file_name, 'r') as tweet_batch:
            tweets = json.load(tweet_batch)
            for tweet in tweets:
                # extract and pre-clean the raw tweet text (`preprocess` is
                # assumed to be defined upstream)
                text = preprocess(tweet['content']['text'])
                tokens = text_processor.pre_process_doc(text)
                tokens = [segmenter.segment(t) for t in tokens]
                text = " ".join(tokens)
                text = process_tags(text).strip()
                username = str(tweet['username'])
                if text:
                    if user_dict[username]:
                        user_dict[username] = list(
                            set(user_dict[username]) | set(tweet['college']))
                    else:
                        user_dict[username] = tweet['college']

                    data.loc[count, 'username'] = str(tweet['username'])
                    data.loc[count, 'id'] = str(tweet['id_str'])
                    data.loc[count, 'conversation'] = str(
                        tweet['content']['conversation'])
                    data.loc[count, 'text'] = text
Example #14
# Assumed imports: re, functools.lru_cache, Segmenter and SpellCorrector from
# ekphrasis, plus project-level `regexes` and `emoticons` used below.
class TextPreProcessor:
    """
    Kwargs:
        normalize (list)
            possible values: ['url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'date']

        annotate (list)
            possible values: ['hashtag', 'allcaps', 'elongated', 'repeated', 'emphasis', 'censored']

        unpack_hashtags (bool)

        unpack_contractions (bool)

        segmenter (str): select the corpus [english, twitter] whose word
            statistics will be used for word segmentation.

        corrector (str): select the corpus [english, twitter] whose word
            statistics will be used for spell correction.

        tokenizer (callable): callable function that accepts a string and
            returns a list of strings. If no tokenizer is provided,
            the text will be tokenized on whitespace.

        simplify_emoticons (bool)

        dictionaries (list)
    """

    def __init__(self, **kwargs):
        self.tokens_to_normalize = kwargs.get("normalize", [])
        self.annotate = kwargs.get("annotate", [])
        self.unpack_hashtags = kwargs.get("unpack_hashtags", False)
        self.unpack_contractions = kwargs.get("unpack_contractions", False)
        self.segmenter_corpus = kwargs.get("segmenter", "english")
        self.corrector_corpus = kwargs.get("corrector", "english")
        self.segmenter = Segmenter(corpus=self.segmenter_corpus)
        self.spell_corrector = SpellCorrector(corpus=self.corrector_corpus)
        self.tokenizer = kwargs.get("tokenizer", None)
        self.simplify_emoticons = kwargs.get("simplify_emoticons", False)
        self.dictionaries = kwargs.get("dictionaries", [])
        self.stats = {}
        self.preprocessed_texts = -1

    def pre_process(self, text: str, with_stats=False):
        self._increment_counter()

        text = self._remove_repeating_spaces(text)
        text = self._normalize(text)
        text = self._unpack_hashtags(text)
        text = self._annotate(text)
        text = self._unpack_contractions(text)
        text = self._remove_repeating_spaces(text)

        tokens = self._tokenize(text)
        tokens = self._simplify_emoticons(tokens)
        tokens = self._replace_using_dictionaries(tokens)

        if with_stats:
            return tokens, self._pre_processed_text_stats()
        else:
            return tokens

    def _pre_processed_text_stats(self):
        return self.stats[self.preprocessed_texts]

    def _increment_counter(self):
        self.preprocessed_texts += 1
        self.stats[self.preprocessed_texts] = {}

    def _normalize(self, text):
        for item in self.tokens_to_normalize:
            text = self._change_using_regexp(item, lambda m: f' <{item}> ', text, 'normalize')
        return text

    def _unpack_hashtags(self, text):
        if self.unpack_hashtags:
            return self._change_using_regexp("hashtag", lambda w: self._handle_hashtag_match(w), text, "unpack")
        return text

    def _annotate(self, text):
        text = self._annotate_allcaps(text)
        text = self._annotate_elongated(text)
        text = self._annotate_repeated(text)
        text = self._annotate_emphasis(text)
        text = self._annotate_censored(text)
        return text

    def _annotate_allcaps(self, text):
        if "allcaps" in self.annotate:
            return self._change_using_regexp("allcaps", lambda w: self._handle_generic_match(w, "allcaps", mode='wrap'),
                                             text, "annotate")
        return text

    def _annotate_elongated(self, text):
        if "elongated" in self.annotate:
            return self._change_using_regexp("elongated", lambda w: self._handle_elongated_match(w), text, "annotate")
        return text

    def _annotate_repeated(self, text):
        if "repeated" in self.annotate:
            return self._change_using_regexp("repeat_puncts", lambda w: self._handle_repeated_puncts(w), text,
                                             "annotate")
        return text

    def _annotate_emphasis(self, text):
        if "emphasis" in self.annotate:
            return self._change_using_regexp("emphasis", lambda w: self._handle_emphasis_match(w), text, "annotate")
        return text

    def _annotate_censored(self, text):
        if "censored" in self.annotate:
            return self._change_using_regexp("censored", lambda w: self._handle_generic_match(w, "censored"), text,
                                             "annotate")
        return text

    def _change_using_regexp(self, regexp_name, func, text, stats_name_prefix):
        changing_result = regexes[regexp_name].subn(func, text)
        self._update_stats(f'{stats_name_prefix}_{regexp_name}', changing_result[1])
        return changing_result[0]

    def _unpack_contractions(self, text):
        if self.unpack_contractions:
            text = self._unpack_selected_contrations(r"(\b)([Aa]re|[Cc]ould|[Dd]id|[Dd]oes|[Dd]o|[Hh]ad|[Hh]as|[Hh]ave|"
                                                     r"[Ii]s|[Mm]ight|[Mm]ust|[Ss]hould|[Ww]ere|[Ww]ould)n'?t",
                                                     r"\1\2 not", text)

            text = self._unpack_selected_contrations(r"(\b)([Hh]e|[Ii]|[Ss]he|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'ll",
                                                     r"\1\2 will", text)
            text = self._unpack_selected_contrations(r"(\b)([Tt]hey|[Ww]hat|[Ww]ho|[Yy]ou)ll", r"\1\2 will", text)

            text = self._unpack_selected_contrations(r"(\b)([Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'re", r"\1\2 are", text)
            text = self._unpack_selected_contrations(r"(\b)([Tt]hey|[Ww]hat|[Yy]ou)re", r"\1\2 are", text)

            text = self._unpack_selected_contrations(r"(\b)([Hh]e|[Ss]he)'s", r"\1\2 is", text)

            text = self._unpack_selected_contrations(
                r"(\b)([Ii]|[Ss]hould|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Ww]ould|[Yy]ou)"
                r"'?ve", r"\1\2 have", text)

            text = self._unpack_selected_contrations(r"(\b)([Cc]a)n't", r"\1\2n not", text)
            text = self._unpack_selected_contrations(r"(\b)([Ii])'m", r"\1\2 am", text)
            text = self._unpack_selected_contrations(r"(\b)([Ll]et)'?s", r"\1\2 us", text)
            text = self._unpack_selected_contrations(r"(\b)([Ww])on'?t", r"\1\2ill not", text)
            text = self._unpack_selected_contrations(r"(\b)([Ss])han'?t", r"\1\2hall not", text)
            text = self._unpack_selected_contrations(r"(\b)([Yy])(?:'all|a'll)", r"\1\2ou all", text)

        return text

    def _unpack_selected_contrations(self, regexp, replacement, text):
        unpacking_result = re.subn(regexp, replacement, text)
        self._update_stats("unpack_contrations", unpacking_result[1])
        return unpacking_result[0]

    def _tokenize(self, text):
        if self.tokenizer:
            return self.tokenizer(text)
        else:
            return text.split(' ')

    def _simplify_emoticons(self, tokens):
        if self.simplify_emoticons:
            result = []
            for token in tokens:
                if token in emoticons:
                    new_emoticon = emoticons[token]
                    if new_emoticon != token:
                        self._update_stats('emoticon_simplification', 1)
                    result.append(new_emoticon)
                else:
                    result.append(token)
            return result
        else:
            return tokens

    def _replace_using_dictionaries(self, tokens):
        if len(self.dictionaries) > 0:
            for dictionary in self.dictionaries:
                for idx, token in enumerate(tokens):
                    if token in dictionary:
                        value = dictionary[token]
                        if '<entity>' not in value:
                            tokens[idx] = value
                            self._update_stats('dictionary_replacement', 1)
            return ' '.join(tokens).split(' ')
        else:
            return tokens
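    # e.g. with dictionaries=[{':)': 'happy'}], the tokens ['hi', ':)'] become
    # ['hi', 'happy']; values containing '<entity>' are left untouched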

    @lru_cache(maxsize=65536)
    def _handle_hashtag_match(self, m):
        text = m.group()[1:]

        if text.islower():
            expanded = self.segmenter.segment(text)
            expanded = " ".join(expanded.split("-"))
            expanded = " ".join(expanded.split("_"))
        else:
            expanded = regexes["camel_split"].sub(r' \1', text)
            expanded = expanded.replace("-", "")
            expanded = expanded.replace("_", "")

        if "hashtag" in self.annotate:
            expanded = self._add_special_tag(expanded, "hashtag", mode="wrap")

        return expanded

    @lru_cache(maxsize=65536)
    def _handle_generic_match(self, m, tag, mode="every"):
        text = m.group()
        if tag == 'allcaps':  # workaround for allcaps contractions like YOU'RE; TODO: refactor
            text = text.lower()

        text = self._add_special_tag(text, tag, mode=mode)

        return text

    def _handle_elongated_match(self, m):
        text = m.group()

        text = regexes["normalize_elong"].sub(r'\1\1', text)

        normalized = self.spell_corrector.normalize_elongated(text)
        if normalized:
            text = normalized

        text = self._add_special_tag(text, "elongated")

        return text

    @lru_cache(maxsize=65536)
    def _handle_repeated_puncts(self, m):
        text = m.group()
        text = "".join(sorted(set(text), reverse=True))
        text = self._add_special_tag(text, "repeated")

        return text

    @lru_cache(maxsize=65536)
    def _handle_emphasis_match(self, m):
        text = m.group().replace("*", "")
        text = self._add_special_tag(text, "emphasis")

        return text

    def _update_stats(self, key, value):
        if value > 0:
            stats_for_text = self.stats[self.preprocessed_texts]

            if key not in stats_for_text:
                stats_for_text[key] = 0
            stats_for_text[key] += value

    @staticmethod
    def _remove_repeating_spaces(text):
        return re.sub(r' +', ' ', text).strip()

    @staticmethod
    def _add_special_tag(m, tag, mode="single"):

        if isinstance(m, str):
            text = m
        else:
            text = m.group()

        if mode == "single":
            return " {} <{}> ".format(text, tag)
        elif mode == "wrap":
            return " ".join([" <{}> {} </{}> ".format(tag, text, tag)]) + " "
        elif mode == "every":
            tokens = text.split()
            processed = " ".join([" {} <{}> ".format(t, tag)
                                  for t in tokens])
            return " " + processed + " "
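# Usage sketch (hedged; assumes the `regexes` and `emoticons` globals noted above):
# tp = TextPreProcessor(normalize=['url', 'user'],
#                       annotate=['allcaps', 'elongated', 'repeated'],
#                       unpack_hashtags=True, unpack_contractions=True,
#                       segmenter="twitter", simplify_emoticons=True)
# tokens, stats = tp.pre_process("CAN'T WAIT for #Sentiment140 :)))", with_stats=True)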
Example #15
def tokenize_hashtags(hashtags):
    seg_eng = Segmenter(corpus="english")
    # segment each hashtag and join the results into one space-separated string
    segmented = ' '.join(seg_eng.segment(h) for h in hashtags)
    return segmented
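# Usage sketch: tokenize_hashtags(["gamedev", "retrogaming"])
# would yield roughly "game dev retro gaming", depending on corpus statistics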