def extractHashtags(dataset):
    seg_tw = Segmenter(corpus="twitter")
    stop_words = set(stopwords.words('english'))
    dataset['hashtags'] = dataset['text'].apply(lambda x: re.findall(
        r"#(\w+)", x)).apply(lambda x: splitUpTweets(x, seg_tw))

    # # Remove stop words in segmented tweet
    # for i in range(len(dataset['hashtags'])):
    #     if dataset['hashtags'][i] is not None:
    #         dataset['hashtags'][i] = list(filter(lambda a: ((a not in stop_words) & (a != "_")), dataset['hashtags'][i]))
    return dataset
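
A minimal usage sketch for extractHashtags, assuming pandas and the NLTK stopwords corpus are available; splitUpTweets is defined elsewhere in the original module, so a hypothetical stand-in that segments each extracted hashtag is used here.

import re
import pandas as pd
from nltk.corpus import stopwords
from ekphrasis.classes.segmenter import Segmenter

def splitUpTweets(tags, segmenter):
    # hypothetical stand-in for the original helper:
    # segment each hashtag string into space-separated words
    return [segmenter.segment(t.lower()) for t in tags]

df = pd.DataFrame({"text": ["Server kept retrying #exponentialbackoff #gamedev"]})
df = extractHashtags(df)
print(df["hashtags"].iloc[0])  # e.g. ['exponential back off', 'game dev']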
Example #2
 def __init__(self, **kwargs):
     self.tokens_to_normalize = kwargs.get("normalize", [])
     self.annotate = kwargs.get("annotate", [])
     self.unpack_hashtags = kwargs.get("unpack_hashtags", False)
     self.unpack_contractions = kwargs.get("unpack_contractions", False)
     self.segmenter_corpus = kwargs.get("segmenter", "english")
     self.corrector_corpus = kwargs.get("corrector", "english")
     self.segmenter = Segmenter(corpus=self.segmenter_corpus)
     self.spell_corrector = SpellCorrector(corpus=self.corrector_corpus)
     self.tokenizer = kwargs.get("tokenizer", None)
     self.simplify_emoticons = kwargs.get("simplify_emoticons", False)
     self.dictionaries = kwargs.get("dictionaries", [])
     self.stats = {}
     self.preprocessed_texts = -1
Example #3
def hashtag_sentiment(tweet):
    hash_tag = (re.findall("#([a-zA-Z0-9]{1,25})", tweet))
    seg = Segmenter()
    hashtag_polarity = []
    for hashtag in hash_tag:
        tokens = seg.segment(hashtag)
        ss = sid.polarity_scores(tokens)
        if 'not' not in tokens.split(' '):
            hashtag_polarity.append(ss['compound'])
        else:
            hashtag_polarity.append(-ss['compound'])
    sentiment = 0
    if len(hashtag_polarity) > 0:
        sentiment = round(
            float(sum(hashtag_polarity) / float(len(hashtag_polarity))), 2)
    return sentiment
Example #4
def hashtag_sentiment(tweet):
    hash_tag = (re.findall("#([a-zA-Z0-9]{1,25})", tweet))
    hashtag_polarity = []
    seg = Segmenter(corpus="twitter") 
    for hashtag in hash_tag:
        tokens = seg.segment(hashtag)
        ss = sid.polarity_scores(tokens)  # the polarity_scores method of SentimentIntensityAnalyzer
                                          # returns a sentiment dictionary containing
                                          # pos, neg, neu, and compound scores.
        if 'not' not in tokens.split(' '):
            hashtag_polarity.append(ss['compound'])
        else:
            hashtag_polarity.append(- ss['compound'])
    sentiment = 0
    if len(hashtag_polarity) > 0:
        sentiment = round(float(sum(hashtag_polarity) / float(len(hashtag_polarity))), 2)
    return sentiment
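
Both hashtag_sentiment variants above rely on a module-level sid object; a minimal sketch assuming it is NLTK's VADER SentimentIntensityAnalyzer.

import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer  # requires nltk.download('vader_lexicon')
from ekphrasis.classes.segmenter import Segmenter

sid = SentimentIntensityAnalyzer()
# prints the mean compound polarity of the segmented hashtags, rounded to two decimals
print(hashtag_sentiment("What a launch today #bestdayever #nothappy"))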
Example #5
def handle_tweets(df_tweets):
    seg_eng = Segmenter(corpus="english")
    texts = list(df_tweets["text"])
    #f = open(data_path + "abs_tweets.txt", "w")
    hashtags = []
    clean_tweets = []
    for t in texts:
        pattern = r'#\w+|#\w+$'
        remove = re.compile(pattern)
        removed_t = remove.sub(r'', t)
        matches = re.findall(pattern, t)
        hashes = [seg_eng.segment(i.lstrip('#').lower()) for i in matches]
        tweet = tokenizer(removed_t)
        clean_tweets.append(tweet)
        hashtags.append(hashes)
    #   f.write(tweet)
    #  f.write("\n")
    #f.close()
    return clean_tweets, hashtags
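
A minimal usage sketch for handle_tweets; the tokenizer it calls is defined elsewhere in the original module, so a simple lowercasing stand-in is assumed here.

import re
import pandas as pd
from ekphrasis.classes.segmenter import Segmenter

def tokenizer(text):
    # hypothetical stand-in for the module's real tokenizer
    return " ".join(text.lower().split())

df_tweets = pd.DataFrame({"text": ["Flooding downtown again #naturaldisaster", "All clear now"]})
clean, tags = handle_tweets(df_tweets)
print(tags)  # e.g. [['natural disaster'], []]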
Example #6
warnings.filterwarnings("ignore")

sys.stdout = open("./output/disaster_output.txt", "w")

plt.style.use('ggplot')

nlp = spacy.load('en_core_web_sm')
deselect_stop_words = ['no', 'not']  # do not treat "no" and "not" as stop words
for w in deselect_stop_words:
    nlp.vocab[w].is_stop = False

lemmatizer = WordNetLemmatizer()
stop_words = safe_get_stop_words('en')
hashtag_regex = re.compile(r"\#\b[\w\-\_]+\b")
twitter_segmenter = Segmenter(corpus="twitter_2018")
camelcase_regex = re.compile(
    r'((?<=[a-z])[A-Z]|(?<!^)[A-Z](?=[a-z])|[0-9]+|(?<=[0-9\-\_])[A-Za-z]|[\-\_])'
)


# DATA PRE-PROCESSING FUNCTIONS
def unescape_tweet(tweet):
    """Unescape HTML entities and escaped characters found in the text."""
    return html.unescape(tweet)


def strip_html_tags(text):
    """remove html tags from text"""
    soup = BeautifulSoup(text, 'lxml')
    stripped_text = soup.get_text(separator=" ")
    return stripped_text
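
A quick check of the two helpers above (html and BeautifulSoup are imported further up in the original module):

print(unescape_tweet("AT&amp;T &gt; the rest"))    # -> AT&T > the rest
print(strip_html_tags("<p>Stay <b>safe</b></p>"))  # -> Stay safe (modulo extra whitespace)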
Example #7
import random
import re
from ekphrasis.classes.segmenter import Segmenter
listOfFILEcomments = []
listOfFILEposts = []
listOfFILEtags = []
# for i in range(0,17):
# 	listOfFILEcomments.append('allcomments' + str(i) + '.txt')
# 	listOfFILEposts.append('allposts' + str(i) + '.txt')
# 	listOfFILEtags.append('alltags' + str(i) + '.txt')

seg_eng = Segmenter(corpus="english")

listOfFILEcomments = ['allcomments.txt']
listOfFILEposts = ['allposts.txt']
listOfFILEtags = ['alltags.txt']

fhc = open('finalallcomments.txt', 'a+')
fhp = open('finalallposts.txt', 'a+')
fht = open('finalalltags.txt', 'a+')
for commentFILE, postFILE, tagFILE in zip(listOfFILEcomments, listOfFILEposts,
                                          listOfFILEtags):
    commentGenerator = open(commentFILE, 'r')
    postGenerator = open(postFILE, 'r')
    tagGenerator = open(tagFILE, 'r')
    for comment, post, tag in zip(commentGenerator, postGenerator,
                                  tagGenerator):
        if comment.strip() and post.strip() and tag.strip():
            fhc.write(comment)
            fhp.write(post)
            fht.write(';'.join(
Example #8
 def segmentation(self):
     from ekphrasis.classes.segmenter import Segmenter
     seg_eg = Segmenter(corpus="english")
     seg_tw = Segmenter(corpus="twitter")
     self.text = [seg_tw.segment(sent) for sent in self.text]
     return self.text
Example #9
    def __init__(self, **kwargs):
        """
        Kwargs:
            omit (list): choose which tokens you want to omit from the text.
                possible values: ['email', 'percent', 'money', 'phone', 'user',
                    'time', 'url', 'date', 'hashtag']
                Important Notes:
                            1 - put url at front, if you plan to use it.
                                Messes with the regexes!
                            2 - if you use hashtag then unpack_hashtags will
                                automatically be set to False

            normalize (list): choose which tokens you want to normalize
                in the text.
                possible values: ['email', 'percent', 'money', 'phone', 'user',
                    'time', 'url', 'date', 'hashtag']
                for example, an email address will be transformed to <email>
                Important Notes:
                            1 - put url at front, if you plan to use it.
                                Messes with the regexes!
                            2 - if you use hashtag then unpack_hashtags will
                                automatically be set to False

            unpack_contractions (bool): Replace *English* contractions in
                ``text`` str with their unshortened forms
                for example: can't -> can not, wouldn't -> would not, and so on...

            unpack_hashtags (bool): split a hashtag into its constituent words.
                for example: #ilikedogs -> i like dogs

            annotate (list): add special tags to special tokens.
                possible values: ['hashtag', 'allcaps', 'elongated', 'repeated']
                for example, an email address -> the same address followed by <email>

            tokenizer (callable): a callable that accepts a string and
                returns a list of strings. If no tokenizer is provided,
                the text will be tokenized on whitespace.

            segmenter (str): select the corpus [english, twitter] whose word
                statistics will be used for word segmentation

            corrector (str): select the corpus [english, twitter] whose word
                statistics will be used for spell correction

            all_caps_tag (str): how to wrap the capitalized words
                values [single, wrap, every]
                Note: applicable only when `allcaps` is included in annotate[]
                    - single: add a tag after the last capitalized word
                    - wrap: wrap all words with opening and closing tags
                    - every: add a tag after each word

            spell_correct_elong (bool): choose if you want to perform
                spell correction after the normalization of elongated words.
                * significantly affects performance (speed)

            spell_correction (bool): choose if you want to perform
                spell correction to the text
                * significantly affects performance (speed)

            fix_text (bool): choose if you want to fix bad unicode terms and
                html entities.
        """
        self.omit = kwargs.get("omit", {})
        self.backoff = kwargs.get("normalize", {})
        self.include_tags = kwargs.get("annotate", {})
        self.unpack_contractions = kwargs.get("unpack_contractions", False)
        self.tokenizer = kwargs.get("tokenizer", None)
        self.dicts = kwargs.get("dicts", None)
        self.spell_correction = kwargs.get("spell_correction", False)
        self.spell_correct_elong = kwargs.get("spell_correct_elong", False)
        self.fix_text = kwargs.get("fix_bad_unicode", False)
        self.unpack_hashtags = kwargs.get("unpack_hashtags", False)
        self.segmenter_corpus = kwargs.get("segmenter", "english")
        self.corrector_corpus = kwargs.get("corrector", "english")
        self.all_caps_tag = kwargs.get("all_caps_tag", "wrap")
        self.mode = kwargs.get("mode", "normal")

        if self.unpack_hashtags:
            self.segmenter = Segmenter(corpus=self.segmenter_corpus)
        if self.mode != "fast":
            self.spell_corrector = SpellCorrector(corpus=self.corrector_corpus)

        self.regexes = ExManager().get_compiled()
        if 'hashtag' in self.omit or 'hashtag' in self.backoff:
            print("You can't omit/backoff and unpack hashtags!\n "
                  "unpack_hashtags will be set to False")
            self.unpack_hashtags = False
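
The kwargs documented above mirror ekphrasis' TextPreProcessor; a minimal construction sketch using that class directly, under the assumption that the class shown here behaves the same way.

from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer

text_processor = TextPreProcessor(
    normalize=['url', 'email', 'user'],
    annotate={'hashtag', 'allcaps', 'elongated'},
    unpack_hashtags=True,
    unpack_contractions=True,
    segmenter="twitter",
    corrector="twitter",
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
)
print(text_processor.pre_process_doc("CANT WAIT for #thewatercooler tomorrowww :D"))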
Example #10
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
# import networkx as nx
import os
import pickle
from data_util.my_stopwords import *
from data_util.extract_key import extract_PF

from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
# from ekphrasis.classes.segmenter import Segmenter

from ekphrasis.classes.segmenter import Segmenter
# segmenter using the word statistics from Twitter
seg_eng = Segmenter(corpus="twitter") # english or twitter

from ekphrasis.classes.spellcorrect import SpellCorrector
sp = SpellCorrector(corpus="english") # english or twitter

alphbet_stopword = ['','b','c','d','e','f','g','h','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','#']

# tokenization / stopword dictionaries
from nltk.corpus import stopwords as nltk_stopwords
nltk_stopwords = set(nltk_stopwords.words("english"))
stpwords_list3 = [f.replace("\n","") for f in open("data_util/stopwords.txt","r",encoding = "utf-8").readlines()]
stpwords_list3.remove("not")
stopwords = list(html_escape_table + stpwords_list2) + list(list(nltk_stopwords) + list(stpwords_list1) + list(stpwords_list3))
stopwords = stopwords + ["."] + alphbet_stopword
# stopwords = list(html_escape_table)  #+ list(stpwords_list1) + list(stpwords_list3)
print("Tokenization / stopword dictionaries loaded")
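
A quick sanity check of the two ekphrasis helpers defined above:

print(seg_eng.segment("smallandinsignificant"))  # e.g. "small and insignificant"
print(sp.correct("korrect"))                     # e.g. "correct"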
Example #11
Dependency: Preinstalled Dataset for ekphrasis
"""

import sys
import re
import numpy as np
from enum import Enum
from sklearn import metrics
import tensorflow as tf
from tensorflow.contrib import rnn
from ekphrasis.classes.segmenter import Segmenter
import warnings
warnings.simplefilter("ignore")
# Twitter Hashtag Parser
tw = Segmenter(corpus="twitter")


# Configuration class for training model.
class Configuration:
    num_epochs = 500
    size_batch = 256
    max_time_steps = 40
    LSTM_CT = 4
    LSTM_SZ = 200
    ratio_dropout = 0.95
    embedding_size = 100
    rate_learning = 0.01


class PredictionPhase(Enum):
Example #12
def clean_tweets(df):
    # define the text preprocessor
    text_processor = TextPreProcessor(
        # terms that will be normalized
        normalize=['url', 'email', 'money', 'phone', 'time', 'date'],
        # terms that will be annotated
        annotate={
            "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
            'censored'
        },
        fix_html=True,  # fix HTML tokens

        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter="twitter",

        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector="twitter",
        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # Unpack contractions (can't -> can not)
        spell_correct_elong=False,  # spell correction for elongated words

        # select a tokenizer. You can use SocialTokenizer, or pass your own
        # the tokenizer, should take as input a string and return a list of tokens
        #tokenizer=SocialTokenizer(lowercase=True).tokenize,
        tokenizer=TweetTokenizer().tokenize,

        # list of dictionaries, for replacing tokens extracted from the text,
        # with other expressions. You can pass more than one dictionaries.
        dicts=[emoticons])
    seg = Segmenter(corpus="twitter")

    tweet_text = df.tweet_text.to_list()

    clean_tweets = []
    for tweet in tweet_text:

        # manually tag usernames
        # ex: @DoctorChristian -> <user> doctor christian </user>
        match = re.findall(r'@\w+', tweet)

        try:
            for at in match:
                user_seg = seg.segment(at[1:])
                tweet = tweet.replace(at, '<user> ' + user_seg + ' </user>')
        except:
            pass

        # manually tag all caps so that the unpack_contractions function works
        match = re.findall(r"(?<![#@$])\b([A-Z][A-Z ,.']*[A-Z])\b", tweet)

        try:
            for all_caps in match:
                tweet = tweet.replace(
                    all_caps, '<allcaps> ' + all_caps.lower() + ' </allcaps>')
        except:
            pass

        # manually tag percentages
        match = re.findall(r"(\d+\.?\d?%)", tweet)

        try:
            for percent in match:
                tweet = tweet.replace(
                    percent,
                    '<percent> ' + percent[0:len(percent) - 1] + ' </percent>')
        except:
            pass

        # deal with contractions that the tool misses
        tweet = re.sub(
            r"(\b)([Ww]hat|[Ii]t|[Hh]e|[Ss]he|[Tt]hat|[Tt]here|[Hh]ow|[Ww]ho|[Hh]ere|[Ww]here|[Ww]hen)'s",
            r"\1\2 is", tweet)
        tweet = re.sub(r"(\b)([Aa]in)'t", r"is not", tweet)
        tweet = re.sub(r"(\b)([Ww]asn)'t", r"was not", tweet)
        tweet = re.sub(r"(\b)([Hh]e|[Ss]he|[Ii]|[Yy]ou|[Tt]hey|[Ww]e)'d",
                       r"\1\2 would", tweet)
        tweet = re.sub(r"(\b)([Ii]t|[Tt]hat|[Tt]his)'ll", r"\1\2 will", tweet)
        tweet = re.sub(r"(\b)([Cc])'mon", r"come on", tweet)

        # process the rest of the tweet with the nltk tweet tokenizer
        tweet = " ".join(text_processor.pre_process_doc(tweet)).lower()

        clean_tweets.append(tweet)

    # store the cleaned tweets back in the dataframe
    df['tweet_text'] = clean_tweets

    return df
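
A minimal usage sketch for clean_tweets, assuming the imports used above (pandas, re, nltk's TweetTokenizer, and ekphrasis' TextPreProcessor, Segmenter, and emoticons dict) are available at module level.

import pandas as pd

df = pd.DataFrame({"tweet_text": ["@DoctorChristian I CANT believe it's 100% true #sarcasm"]})
df = clean_tweets(df)
print(df["tweet_text"].iloc[0])  # the normalized, annotated, lower-cased tweet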
Example #13
from ekphrasis.classes.segmenter import Segmenter

# segmenter using the word statistics from english Wikipedia
seg_eng = Segmenter(corpus="english")

# segmenter using the word statistics from Twitter
seg_tw = Segmenter(corpus="twitter")

# segmenter using the word statistics from Twitter (2018 corpus)
seg_tw_2018 = Segmenter(corpus="twitter_2018")

words = [
    "exponentialbackoff", "gamedev", "retrogaming", "thewatercooler",
    "panpsychism"
]
for w in words:
    print(w)
    print("(eng):", seg_eng.segment(w))
    print("(tw):", seg_tw.segment(w))
    print("(tw 2018):", seg_tw_2018.segment(w))
    print()
Example #14
def preprocess_corpus(corpus,stemming=False,
                      all_smilies=False, pos_smilies=False, neg_smilies=False, other_smilies=False,
                      hugs_and_kisses=False,hearts=False,
                      hashtag=False, hashtag_mention=False, 
                      numbers=False, number_mention=False, 
                      exclamation=False,  ## NOTE: this one is not tested yet; consider just removing it
                      set_to_not=False, 
                      segmentation_hash= False, 
                      spelling=False,
                      elongation=False, 
                      remove_signs=False
                      ):
    """ Function used to apply preprocessing
    Input:
        corpus: a corpus in the format produced by creat_corpus.
        stemming: if true, every word is stemmed with a Porter stemmer. Default False.
        all_smilies: if true, has the same effect as setting pos_smilies, neg_smilies, and other_smilies to true. Default False.
        pos_smilies: if true, positive smilies such as : ), ( :, ; ), ( ;, :p, ;p, : p are replaced by "smile". Default False.
        neg_smilies: if true, negative smilies such as : (, ) : are replaced by "sad". Default False.
        other_smilies: if true, smilies such as ^_^ are replaced by a describing word. Default False.
        hugs_and_kisses: if true, words such as xxx, xoxo, etc. are replaced by "kiss" or "hug" and "kiss". Default False.
        hearts: if true, "<3" is replaced by "heart". Default False.
        hashtag: if true, the hash sign is removed from the beginning of words, so #apple becomes apple. Default False.
        hashtag_mention: if true, and if hashtag is true, the word "hashtag" is added at the end of every tweet that
            contained one or more words beginning with a hash sign. Default False.
        numbers: if true, words that are purely numbers are removed. Default False.
        number_mention: if true, and if numbers is true, the word "number" is added at the end of every tweet that
            contained one or more words that were purely numbers. Default False.
        exclamation: if true, the word "exclamation" is added at the end of every tweet that contains one or more "!". Default False.
        set_to_not: if true, all words ending with "n't" are replaced by "not". Default False.
        segmentation_hash: if true, hashtag words that do not appear in the English dictionary are split into segments,
            e.g. '#iammoving' becomes 'i am moving'. Default False.
        spelling: if true, every word that is not part of the English dictionary is replaced by the most likely
            correction within two edits. Default False.
        elongation: if true, sequences of repeated letters in words that are not part of the English dictionary are
            capped at length 2, and the word 'elongation' is inserted before each altered word. Default False.
        remove_signs: if true, signs such as ",", ".", ":", ";", "-" are removed. Default False.
    
    Output:
        new_corpus: a new corpus, in the same format as the input corpus.
    """
   
    start = time.time()
    
    #initialising the new corpus:
    new_corpus=[]

    #Want to split the tweets using this tokenizer:
    tknzr = TweetTokenizer(reduce_len=True)
    
    
    
    if stemming:
        ps = PorterStemmer()
    
    if segmentation_hash or spelling or elongation:
        d = enchant.Dict("en_US")
    
    if segmentation_hash: 
        #seg = Segmenter(corpus="english")
        seg = Segmenter(corpus="twitter")

    if spelling: 
        sp = SpellCorrector(corpus="english")
        
    
    elapsed = time.time()
    print("Time in min before starting first for loop:", (elapsed - start) / 60 )
    
    #Want to go through each line (tweet) in the corpus
    for k, line in enumerate(corpus):
        
        
        if hashtag_mention:
            there_is_hashtag=False
        if number_mention:
            there_is_number=False
        if exclamation:
            there_is_exclamation=False
            
        #Splitting the tweet using the chosen tokenizer. 
        words=tknzr.tokenize(line)
        #Initializing for cleaned_tweet:
        cleaned_tweet=[]
        
        for i, word in enumerate(words):
            #Indicating that the word has not been treated yet
            word_not_treated=True
            end_=len(words)-1
            if ((pos_smilies or all_smilies) and word_not_treated):
                if (i>0 and (word=='d' and (words[i-1]==':' or words[i-1]==';'))) or word == ':d' or word == ';d':
                    cleaned_tweet.append('smile')
                    word_not_treated=False
                elif (i>0 and (word=='p' and (words[i-1]==':' or words[i-1]==';'))) or word == ':p' or word == ';p' :
                    cleaned_tweet.append('smile')
                    word_not_treated=False
                elif i>0 and word=='d' and (words[i-1]==':' or words[i-1]==';' or words[i-1]=='x'):
                    cleaned_tweet.append('smile')
                    word_not_treated=False
                elif i>0 and words[i-1]=='(' and (word==':' or word==';'):
                    cleaned_tweet.append('smile')
                    word_not_treated=False
                elif i>0 and word==')' and (words[i-1]==':' or words[i-1]==';'):
                    cleaned_tweet.append('smile')
                    word_not_treated=False

            if ((neg_smilies or all_smilies) and word_not_treated):
                if i>0 and words[i-1]==')' and (word==':' or word==';'):
                    cleaned_tweet.append('sad')
                    word_not_treated=False
                elif i>0 and word=='(' and (words[i-1]==':' or words[i-1]==';'):
                    cleaned_tweet.append('sad')
                    word_not_treated=False
            
            if ((other_smilies or all_smilies) and word_not_treated):
                if i>0  and i<end_ and word=='_' and words[i-1]=='^' and words[i+1]=='^':
                    cleaned_tweet.append('eyesmiley')
                    word_not_treated=False
                elif i>0 and word=='o' and words[i-1]==':':
                    cleaned_tweet.append('openmouthface')
                    word_not_treated=False
                elif i>0 and word=='/' and words[i-1]==':':
                    cleaned_tweet.append('slashsmiely')
                    word_not_treated=False
                elif i>0 and word=='*' and (words[i-1]==':' or words[i-1]==';'):
                    cleaned_tweet.append('kiss')
                    word_not_treated=False
                
            if ((hugs_and_kisses and word_not_treated)):
                    #want to find hearts, hugs, kisses, etc: 
                if (word == "xoxo" or word == "xo" or word == "xoxoxo" or word == "xxoo"):
                    cleaned_tweet.append('hug')
                    cleaned_tweet.append('kiss')
                    word_not_treated=False
                elif (word=='xx' or word=='xxx'or word=='xxxx'):
                    cleaned_tweet.append('kiss')
                    word_not_treated=False
            
            if ((hearts and word_not_treated)):
                if word == "<3":
                    cleaned_tweet.append('heart')
                    word_not_treated=False
            
            if (hashtag and word_not_treated):
                if word[0]=='#':
                    there_is_hashtag=True
                    if (len(word)>1 and segmentation_hash and not d.check(word[1:])):
                        cleaned_tweet.append(seg.segment(word[1:]))
                    else:
                        cleaned_tweet.append(word[1:])
                    word_not_treated=False
            
            if (numbers and word_not_treated):
                if word.isdigit():
                    there_is_number=True
                    word_not_treated=False
                    
            if (exclamation and word_not_treated):
                if word=='!':
                    there_is_exclamation=True
                    cleaned_tweet.append(word)
                    word_not_treated=False
            
            if (set_to_not and word_not_treated):
                if word[-3:]=='n\'t':
                    cleaned_tweet.append('not')
                    word_not_treated=False
           
            
         
            if (word_not_treated):
                if (not remove_signs) or (remove_signs and ( (word!= '^' and word!=',' and word!='.' and word!=':' 
                                                              and word!='-' and word!='´' and word!=';'and word!=')' 
                                                              and word!='(' and word!='*'))):
                  
                    if ((not word[0].isdigit()) and elongation and not d.check(word) and len(word)>2):
                        new=[]
                        new.append(word[0])
                        for i,letter in enumerate(word):
                            if i>0 and i<len(word)-1: 
                                if not( letter==word[i-1]==word[i+1]):
                                    new.append(letter)
                        new.append(word[-1])
                        new_word=''.join(new)
                        if new_word!= word:
                            cleaned_tweet.append('elongation')
                            word=new_word

                    if spelling and not d.check(word)and len(word)>2: 
                        word=sp.correct(word)
                    if stemming:
                        word=ps.stem(word)

                    
                    cleaned_tweet.append(word)

           
                
        
        if (hashtag_mention and there_is_hashtag) :
            cleaned_tweet.append('hashtag')
        if (number_mention and there_is_number) :
            cleaned_tweet.append('number')
        if (exclamation and there_is_exclamation):
            cleaned_tweet.append('exclamation')
            
            
        new_words = ' '.join(cleaned_tweet)
        new_words = new_words.encode('utf-8')
        new_corpus.append(new_words)
        
        if np.mod(k,25000)==1:
                elapsed = time.time()
                print("Time in min after", k, " tweets:", (elapsed - start) / 60 )

        
    elapsed = time.time()
    print("Time in min total:", (elapsed - start) / 60 )
    return new_corpus       
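
A minimal usage sketch for preprocess_corpus, assuming the corpus is simply an iterable of raw tweet strings (the format produced by the original creat_corpus helper) and that the optional dependencies (nltk, pyenchant, ekphrasis, numpy) are imported at module level.

corpus = ["I caaan't believe this!!! #iammoving : )",
          "xoxo <3 see you soon"]
cleaned = preprocess_corpus(corpus,
                            all_smilies=True, hugs_and_kisses=True, hearts=True,
                            hashtag=True, hashtag_mention=True,
                            segmentation_hash=True, set_to_not=True,
                            exclamation=True, remove_signs=True)
print(cleaned)  # a list of UTF-8 encoded, cleaned tweets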
Example #15
def tokenize_hashtags(hashtags):
    seg_eng = Segmenter(corpus="english")
    hash = ' '.join(seg_eng.segment(h) for h in hashtags)
    return hash
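
A quick example of tokenize_hashtags on a list of raw hashtag strings:

print(tokenize_hashtags(["gamedev", "retrogaming"]))  # e.g. "game dev retro gaming"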