def tokenizer_features(features_tweets):
    import sys
    sys.path.append('src/ark-twokenize-py')
    from twokenize import tokenizeRawTweetText

    for row, tweet in features_tweets:
        row['features']['tokenizer'] = {
            'tokens': tokenizeRawTweetText(tweet.text.lower()),
            'tokens_without_entities': tokenizeRawTweetText(tweet.text_without_entities.lower()),
        }
        yield row, tweet
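# Hedged usage sketch for tokenizer_features above. The feature-row dict and the
# Tweet namedtuple (with .text and .text_without_entities) are assumptions made
# up for illustration, and the ark-twokenize-py checkout is assumed to exist at
# the path the function appends to sys.path.
from collections import namedtuple

Tweet = namedtuple('Tweet', ['text', 'text_without_entities'])

pairs = [({'features': {}}, Tweet('Loving this! #nlp http://t.co/x', 'Loving this!'))]
for row, tweet in tokenizer_features(pairs):
    print(row['features']['tokenizer']['tokens'])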
def perprocessing(tdic):
    new_dic = {}
    POS_feature = []
    for line in tdic:
        id = line
        gt = tdic[line][0]
        raw = ' '.join(twokenize.tokenizeRawTweetText(tdic[line][1]))
        text = twokenize.normalizeTextForTagger(raw)
        text_tk = twokenize.tokenize(text)
        print(text_tk)
        telist = []
        for word in text_tk:
            word = word.lower()
            # ps = nltk.stem.PorterStemmer()
            # word = ps.stem(word)
            telist.append(word)
        afterlemma = lemma(telist)
        telist = afterlemma[0]
        POS_feature.append(afterlemma[1])
        newtext = ' '.join(telist)
        newtext = textPreprocessor01.replaceall(newtext)  # now preprocess: map URLs to URLINK, emoticons to SADFACE, etc.
        print(newtext)
        new_dic[id] = gt, newtext
    return new_dic, np.array(POS_feature)
def get_twokens_rt_fave(filename):
    with open(filename) as fi:
        for line in fi:
            t = json.loads(line)
            lang = t["twitter_lang"]
            if lang == "en":
                try:
                    print "english"
                    text = t['body'].decode('utf8').lower()
                    text = URL_RE.sub(" ", text)
                    text = USERNAME_RE.sub(" ", text)
                    # words = re.findall(' (\w{3,})', text)
                    toks = twokenize.tokenizeRawTweetText(text)
                    print "toks", toks
                    if toks:
                        isReply = 1 if "inReplyTo" in t.keys() else 0
                        print(toks, t['retweetCount'], t['favoritesCount'], isReply)
                        yield (toks, t['retweetCount'], t['favoritesCount'], isReply)
                    else:
                        print "NO TOKS"
                except:
                    continue
def get_clean_tokens(tweet):
    tokens = []
    ctweet = _remove_tags(clean_tweet(tweet))
    # Replace this with TweetNLP tokenizer
    # tokens = nltk.tokenize.WhitespaceTokenizer().tokenize(tweet.lower())
    tokens = twk.tokenizeRawTweetText(ctweet)
    return tokens
def next(self):
    tweet = self.tw_stream.next()
    if tweet is stream.End_Of_Stream:
        return stream.End_Of_Stream
    if tweet is None:
        return None
    t = tweet.timestamp
    uid = tweet.uid
    txt = tweet.str
    # lower case
    txt = txt.lower()
    # tokenize
    try:
        tokens = twokenize.tokenizeRawTweetText(txt)
    except:
        return None
    # filter
    tokens = filter(
        lambda x: (not stop_words.contains(x)) and (not _PUN_PATTERN.match(x))
        and (len(x) <= 32) and (len(x) > 1), tokens)
    return stream.PreprocessedTweetItem(t, uid, tokens)
def download():
    global twitter, arguments, tid_list
    with open(arguments.inputfile + "." + arguments.outputtype, "w") as fw:
        tid_number = len(tid_list)
        # integer division so that range() receives an int
        max_round = tid_number // MAX_LOOKUP_NUMBER + 1
        for i in range(max_round):
            tids = tid_list[i * MAX_LOOKUP_NUMBER:(i + 1) * MAX_LOOKUP_NUMBER]
            time.sleep(SLEEP_TIME)
            jobjs = twitter.lookup_status(id=tids)
            for jobj in jobjs:
                if arguments.outputtype == "json":
                    fw.write(json.dumps(jobj))
                elif arguments.outputtype == "IdTweet":
                    tweet = jobj["text"]
                    tid = jobj["id_str"]
                    fw.write(json.dumps({"id_str": tid, "text": tweet}))
                else:
                    tweet = jobj["text"]
                    tokens = twokenize.tokenizeRawTweetText(tweet)
                    tid = jobj["id_str"]
                    fw.write(json.dumps({"id_str": tid, "text": " ".join(tokens)}))
                fw.write("\n")
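# Hedged sketch of the batching arithmetic used in download(): tid_list is cut
# into chunks of MAX_LOOKUP_NUMBER ids (the Twitter statuses/lookup endpoint
# accepts at most 100 ids per call). The values below are made up for
# illustration and are not from the original project.
MAX_LOOKUP_NUMBER = 100
tid_list = [str(i) for i in range(250)]
max_round = len(tid_list) // MAX_LOOKUP_NUMBER + 1  # 3 rounds for 250 ids
for i in range(max_round):
    chunk = tid_list[i * MAX_LOOKUP_NUMBER:(i + 1) * MAX_LOOKUP_NUMBER]
    print(len(chunk))  # prints 100, 100, 50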
def tokenize_tweet(tweet):
    tokens = []
    ctweet = clean_tweet(tweet)
    # Replace this with TweetNLP tokenizer
    # tokens = nltk.tokenize.WhitespaceTokenizer().tokenize(tweet.lower())
    tokens = twk.tokenizeRawTweetText(ctweet)
    return tokens
def extract(self, corpus, *args, **kwargs):
    """
    Extract all the named entities from the corpus.
    The output is a list of lists.
    Each outer list represents a document.
    Each inner list is the candidates in that document.

    :param corpus: The corpus of documents where to extract candidate participants.
    :type corpus: list
    :return: A list of candidates separated by the document in which they were found.
    :rtype: list of list of str
    """
    candidates = []
    for document in corpus:
        document_entities = []
        tokens = tokenizeRawTweetText(document.text)
        entities = TwitterNEREntityExtractor.ner.get_entities(tokens)
        candidates.append([
            " ".join(tokens[start:end]).lower()
            for (start, end, type) in entities
        ])
    return candidates
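# Hedged sketch of how the extract() output above could be consumed. It assumes
# TwitterNEREntityExtractor can be constructed with no arguments and that its
# class-level `ner` model is already loaded; the Document stand-in with a .text
# attribute is hypothetical and only used for illustration.
class Document:
    def __init__(self, text):
        self.text = text

docs = [Document("Barack Obama visited Berlin today")]
extractor = TwitterNEREntityExtractor()
for doc_candidates in extractor.extract(docs):
    print(doc_candidates)  # e.g. ['barack obama', 'berlin'] if the model tags those spans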
def perprocessing(tdic):
    new_dic = {}
    ps = nltk.stem.PorterStemmer()  # build the stemmer once rather than per token
    for line in tdic:
        id = line
        gt = tdic[line][0]
        raw = ' '.join(twokenize.tokenizeRawTweetText(tdic[line][1]))
        text = twokenize.normalizeTextForTagger(raw)
        text_tk = twokenize.tokenize(text)
        telist = []
        for word in text_tk:
            word = word.lower()
            word = ps.stem(word)
            # word = nltk.stem.SnowballStemmer(word)
            telist.append(word)
        # newtext = ' '.join(text_tk)
        newtext = ' '.join(telist)
        newtext = textPreprocessor01.replaceall(newtext)
        new_dic[id] = gt, newtext
    return new_dic
def process(tweet=None, configuration=None):
    if tweet is None or configuration is None:
        return (False, [], [], [])
    else:
        # TwitterNER extraction
        tokens = tokenizeRawTweetText(tweet)
        ner_processed = configuration.get_entities(tokens)
        data = {'PERSON': [], 'ORGANIZATION': [], 'LOCATION': []}
        # Run extraction parser
        for ner_token in ner_processed:
            (from_index, to_index, ner_key) = ner_token
            entity = " ".join(tokens[from_index:to_index])
            data[ner_key].append(entity)
        return (True, data['PERSON'], data['ORGANIZATION'], data['LOCATION'])
def cleanTweet(tweet):
    tokens = twokenize.tokenizeRawTweetText(tweet)
    cleanedTweet = ""
    for token in tokens:
        # skip URL tokens; append the rest with a trailing space
        if (not token.startswith('http')) and (not token.startswith('www')):
            token += ' '
            cleanedTweet += token
    return cleanedTweet
def transform(self, documents):
    user_mentions = re.compile(ur"@\w\w+:?")
    text_number = re.compile(ur"\b\w*\d+\w*\b")
    url = re.compile(ur"((www\.[^\s]+)|(https?:\/\/[^\s]+))")
    mentioning = [len(user_mentions.findall(doc.content)) for doc in documents]
    exclamation = [doc.content.count("!") for doc in documents]
    question = [doc.content.count("?") for doc in documents]
    hashtag = [doc.content.count("#") for doc in documents]
    n_words = [len(tokenizeRawTweetText(doc.content)) for doc in documents]
    n_chars = [len(doc.content) for doc in documents]
    avetweetlength = [np.mean([len(tweet) for tweet in nltk.sent_tokenize(doc.content)])
                      for doc in documents]
    avgwordlenght = [np.mean([len(word) for word in tokenizeRawTweetText(doc.content)])
                     for doc in documents]
    allcaps = [np.sum([word.isupper() for word in tokenizeRawTweetText(doc.content)])
               for doc in documents]
    numtexttoken = [len(text_number.findall(doc.content)) for doc in documents]
    url_count = [len(url.findall(doc.content)) for doc in documents]
    X = np.array([
        n_words,
        n_chars,
        numtexttoken,
        allcaps,
        exclamation,
        question,
        hashtag,
        mentioning,
        url_count,
        avgwordlenght,
        avetweetlength,
    ]).T
    if not hasattr(self, "scalar"):
        self.scalar = preprocessing.StandardScaler().fit(X)
    return self.scalar.transform(X)
def tokenize(sentence):
    """
    Uses twokenize to tokenize tweets.

    :param sentence: String of the tweet
    :return: list containing the tokens
    """
    toks = twokenize.tokenizeRawTweetText(sentence.lower())
    return toks
def tokenize_tweets(tweets):
    import sys
    sys.path.append('.')
    import twokenize
    decoded = [x.replace("\\n", "\n") for x in tweets]
    ttweets = [twokenize.tokenizeRawTweetText(x) for x in decoded]
    uncased = []
    for tokens in ttweets:
        uncased.append([x.lower() for x in tokens])
    return uncased
def ner_tweet(tweet):
    global ner
    l = []
    print(tweet)
    tokens = tokenizeRawTweetText(tweet)
    entities = ner.get_entities(tokens)
    print(entities)
    for x in entities:
        l.append(str(" ".join(tokens[int(x[0]):int(x[1])]).encode('utf-8')))
    return l
def get_twokens(filename):
    """CMU Twokenizer."""
    tweets = []
    with open(filename) as fi:
        for line in fi:
            t = json.loads(line)
            text = t['body'].encode('utf8')
            tokens = twokenize.tokenizeRawTweetText(text)
            tweets.append(tokens)
    return tweets
def relative_score1(keywords, txt):
    tokens = twokenize.tokenizeRawTweetText(txt)
    score = 0.0
    for token in tokens:
        if token in keywords:
            score += 1
    score /= 3
    return score
def __next__(self):
    _tweet = next(self.stream)
    if _tweet is stream.End_Of_Stream:
        return stream.End_Of_Stream
    _t = _tweet.timestamp
    _tid = _tweet.tid
    _text = _tweet.tweet
    _tokens = tokenize.tokenizeRawTweetText(_text)
    _tokens = list(set(filter(
        lambda x: x != "" and x not in stopwords and x not in tokenize.e_punc
        and not x.startswith('http'),
        [re.sub(r'[^A-Za-z0-9\':/.&$|@%\\]', '', tokenize.deRepeatWords(i.lower()))
         for i in _tokens])))
    return stream.PreprocessedTweetItem(_t, _tid, _tokens)
def relative_score2(keywords, txt):
    tokens = twokenize.tokenizeRawTweetText(txt)
    words = set()
    for token in tokens:
        if token in keywords:
            words.add(token)
    score = len(words)
    score /= 2
    return score
def next(self):
    tweet = self.tw_stream.next()
    if tweet is stream.End_Of_Stream:
        return stream.End_Of_Stream
    if tweet is None:
        return None
    if tweet.is_retweet():
        user = tweet.who_is_retweeted()
        if user:
            if user in _FILTERED_USERS:
                return None
    t = tweet.timestamp
    uid = tweet.uid
    txt = tweet.str
    # clean txt
    try:
        txt = self.wb_cleaner.clean_wb(txt)
    except:
        return None
    # remove all urls
    urls = re.findall(_HTTP_PATTERN, txt)
    for url in urls:
        txt = txt.replace(url, ' ')
    # lower case
    txt = txt.lower()
    # tokenize
    try:
        tokens = twokenize.tokenizeRawTweetText(txt)
    except:
        return None
    # filter
    tokens = filter(
        lambda x: (not stop_words.contains(x)) and (not _PUN_PATTERN.match(x))
        and (len(x) <= 32), tokens)
    # space filter
    tokens = filter(lambda x: not _SPACE_PATTERN.match(x), tokens)
    # to ascii
    # tokens = map(lambda x: x.encode('ascii','ignore'), tokens)
    # tokens = filter(lambda x: len(x) > 0, tokens)
    ret = stream.PreprocessedTweetItem(t, uid, tokens, tweet)
    active_term_maintainer.add(ret)
    return ret
def tokenize(line):
    text = line.txt
    if text is not None:
        text = text.lower()
        text = re.sub(URL_PATTERN, 'URL', text)
        text = re.sub(USER_PATTERN, 'USER', text)
        text = re.sub(PUNCT_PATTERN, ' ', text)
        tokens = twokenize.tokenizeRawTweetText(text)
        line.tokens = tokens
        return tokens
    else:
        return None
def message_to_words(message, word_del):
    # replace URLs with the placeholder token "URL"
    message_clean = re.sub(
        "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
        "URL", message)
    word_tokens = tokenizeRawTweetText(message_clean)  # tokenize
    word_tokens = replace_at(word_tokens)  # replace @ using a predefined char
    word_tokens = [s for s in word_tokens if s not in word_del]  # remove a few specific tokens such as "[" and "]"
    # word_list1 = [w.lower() for w in word_tokens if not w.lower() in nltk.corpus.stopwords.words('english') and not w in string.punctuation]  # lower-casing and stop-word removal
    word_list1 = [w.lower() for w in word_tokens if not w in string.punctuation]
    word_list2 = [x for x in word_list1
                  if not (x[0].isdigit() or x[0] == "-" and x[1:].isdigit())]  # remove numeric tokens
    # word_stemmed = [porter.stem(t) for t in word_list2]  # stemming to get normalized words
    return word_list2
def ark_twokenize(text: str) -> List[str]:
    '''
    A Twitter tokeniser from `CMU Ark <https://github.com/brendano/ark-tweet-nlp>`_

    This is a wrapper of `this <https://github.com/Sentimentron/ark-twokenize-py>`_

    :param text: A string to be tokenised.
    :returns: A list of tokens where each token is a String.
    '''
    if isinstance(text, str):
        return twokenize.tokenizeRawTweetText(text)
    raise ValueError(f'The parameter must be of type str not {type(text)}')
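# Hedged usage sketch for ark_twokenize above; the example tweet is made up and
# the exact token boundaries depend on the ark-twokenize-py module, so the
# commented output is only indicative.
print(ark_twokenize("@user loving it!! :-) #nlp http://t.co/abc"))
# something like: ['@user', 'loving', 'it', '!!', ':-)', '#nlp', 'http://t.co/abc']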
def preprocess(documents, custom=[]):
    # Remove urls: http? bug fixed (SV)
    documents = [re.sub(r"(?:\@|https?\://)\S+", "", doc) for doc in documents]
    # remove duplicate tweets
    documents = set(documents)
    # Remove documents shorter than 100 characters (some tweets contain only URLs)
    # documents = [doc for doc in documents if len(doc) > 100]
    # Tokenize
    documents = [set(tokenizeRawTweetText(doc.lower())) for doc in documents]
    # Remove stop words (including all one- and two-character tokens)
    unigrams = [w for doc in documents for w in doc if len(w) == 1]
    bigrams = [w for doc in documents for w in doc if len(w) == 2]
    stoplist = set(nltk.corpus.stopwords.words("english") + STOPLIST_TW
                   + unigrams + bigrams + custom)
    # ... and strip the leading '#'
    documents = [[token.lstrip('#') for token in doc
                  if token.lstrip('#') not in stoplist] for doc in documents]
    # remove punctuation tokens
    documents = [[token for token in doc if not re.match(punctSeq, token)]
                 for doc in documents]
    # Remove words that only occur once
    token_frequency = defaultdict(int)
    lmtzr = WordNetLemmatizer()
    documents = [[lmtzr.lemmatize(token) for token in doc] for doc in documents]
    # count all tokens
    for doc in documents:
        for token in doc:
            token_frequency[token] += 1
    # keep words that occur more than once
    documents = [[token for token in doc if token_frequency[token] > 1]
                 for doc in documents]
    return documents
def twokenize(text, no_duplicates=True, stem=False):
    text = text.lower()
    # remove mentions and URLs
    clean_text = re.sub(r"(?:\@|https?\://)\S+", "", text)
    # remove non-ASCII chars
    clean_text = re.sub(r'[^\x00-\x7F]+', ' ', clean_text)
    # map punctuation to space (Python 2 string.maketrans)
    translator = string.maketrans(string.punctuation,
                                  ' ' * len(string.punctuation))
    clean_text = str(clean_text).translate(translator)
    # Tokenize
    tokens = tokenizeRawTweetText(clean_text)
    if stem:
        stemmer = SnowballStemmer("english")
        tokens = [stemmer.stem(token) for token in tokens]
    if no_duplicates:
        tokens = f7(tokens)  # order-preserving de-duplication
    clean_text = ' '.join(tokens)
    return clean_text
def ark_twokenize(text):
    '''
    A Twitter tokeniser from `CMU Ark <https://github.com/brendano/ark-tweet-nlp>`_
    returns a list of tokens.

    This is just a wrapper of `this <https://github.com/Sentimentron/ark-twokenize-py>`_

    :param text: A string to be tokenised.
    :type text: String
    :returns: A list of tokens where each token is a String.
    :rtype: list
    '''
    if isinstance(text, str):
        return twokenize.tokenizeRawTweetText(text)
    raise ValueError('The parameter must be of type str not {}'.format(type(text)))
def perprocessing(tdic):
    new_dic = {}
    for line in tdic:
        id = line
        gt = tdic[line][0]
        raw = ' '.join(twokenize.tokenizeRawTweetText(tdic[line][1]))
        text = twokenize.normalizeTextForTagger(raw)
        text_tk = twokenize.tokenize(text)
        newtext = ' '.join(text_tk)
        newtext = textPreprocessor01.replaceall(newtext)
        new_dic[id] = gt, newtext
    return new_dic
def perprocessing(tdic):
    new_dic = {}
    ps = nltk.stem.PorterStemmer()  # build the stemmer once rather than per token
    for line in tdic:
        id = line
        gt = tdic[line][0]
        raw = ' '.join(twokenize.tokenizeRawTweetText(tdic[line][1]))
        text = twokenize.normalizeTextForTagger(raw)
        text_tk = twokenize.tokenize(text)
        telist = []
        for word in text_tk:
            word = ps.stem(word.lower())
            telist.append(word)
        newtext = ' '.join(telist)
        newtext = textPreprocessor02.replaceall(newtext)
        new_dic[id] = gt, newtext
    return new_dic
def get_positives_and_negatives(corpus, lexicon):
    num_words = {}
    num_positive_words = {}
    num_negative_words = {}
    for user, l in corpus.items():
        for tweet in l:
            for token in twokenize.tokenizeRawTweetText(tweet):
                num_words[user] = num_words[user] if user in num_words else 0
                num_words[user] = num_words[user] + 1
                if token in lexicon['positive']:
                    num_positive_words[user] = num_positive_words[user] if user in num_positive_words else 0
                    num_positive_words[user] = num_positive_words[user] + 1
                elif token in lexicon['negative']:
                    num_negative_words[user] = num_negative_words[user] if user in num_negative_words else 0
                    num_negative_words[user] = num_negative_words[user] + 1
    return num_words, num_positive_words, num_negative_words
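# Hedged usage sketch for get_positives_and_negatives above; the corpus and
# lexicon are made-up toy data, not from the original project, and the counts
# shown assume a straightforward whitespace-like tokenization of these examples.
corpus = {'alice': ['I love this great day', 'awful traffic again']}
lexicon = {'positive': {'love', 'great'}, 'negative': {'awful'}}
totals, positives, negatives = get_positives_and_negatives(corpus, lexicon)
print(totals, positives, negatives)
# roughly: {'alice': 8} {'alice': 2} {'alice': 1}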
def crawler(file):
    # Dictionary initialization
    tweets = {}
    emoticons = {}
    hashtags = {}
    capitals = {}
    longs = {}
    sentiment = {}
    # Read the file and store it in the tweets dictionary
    with open(file) as tsvfile:
        tsvreader = csv.reader(tsvfile, delimiter="\t")
        # diction = enchant.request_dict("en_US")
        for line in tsvreader:
            tweet_id = str(line[1]) + '$' + str(line[2]) + '$' + str(line[3])
            splitted_text = str(line[5]).split(" ")
            start = int(line[2])
            end = int(line[3])
            isolated_phrase = isolatePhrase(splitted_text, start, end)
            tokenized_text = twokenize.tokenizeRawTweetText(isolated_phrase)
            tokenized_text = replaceUnicode(tokenized_text)
            capitals[tweet_id] = storeCapitals(tokenized_text)
            emoticons[tweet_id] = storeEmoticons(tokenized_text)
            hashtags[tweet_id] = storeHashtags(tokenized_text)
            # chain each replacement on the running new_text
            new_text = replaceCapitals(tokenized_text)
            new_text = replaceURLs(new_text)
            new_text = replaceUserMentions(new_text)
            new_text = replaceEmoticons(new_text)
            new_text = replaceHashtags(new_text)
            longs[tweet_id] = storeLongWords(new_text)
            sentiment[tweet_id] = str(line[4])
            tweets[tweet_id] = new_text
    return (tweets, emoticons, hashtags, capitals, longs, sentiment)
def twokenize(text, no_duplicates=True, stem=False):
    text = text.lower()
    # remove mentions and URLs
    clean_text = re.sub(r"(?:\@|https?\://)\S+", "", text)
    # remove non-ASCII chars
    clean_text = re.sub(r'[^\x00-\x7F]+', ' ', clean_text)
    # map punctuation to space (Python 2 string.maketrans)
    translator = string.maketrans(string.punctuation,
                                  ' ' * len(string.punctuation))
    clean_text = str(clean_text).translate(translator)
    # Tokenize
    tokens = tokenizeRawTweetText(clean_text)
    if stem:
        stemmer = SnowballStemmer("english")
        tokens = [stemmer.stem(token) for token in tokens]
    if no_duplicates:
        tokens = f7(tokens)  # order-preserving de-duplication
    clean_text = ' '.join(tokens)
    return clean_text
def makeCorpus(rawTweets, rawLabels, saveLocal=False, corpusFormat='mm'):
    # Tokenize
    # Building the stoplist
    # stoplist = nltk.corpus.stopwords.words('english')
    word_stoplist = "1,2,3,4,5,6,7,8,9,0,http,https,today,great,day,amp,u,w,a,s,able,about,above,according,accordingly,across,actually,after,afterwards,again,against,ain,t,all,allow,allows,almost,alone,along,already,also,although,always,am,among,amongst,an,and,another,any,anybody,anyhow,anyone,anything,anyway,anyways,anywhere,apart,appear,appreciate,appropriate,are,aren,t,around,as,aside,ask,asking,associated,at,available,away,awfully,be,became,because,become,becomes,becoming,been,before,beforehand,behind,being,believe,below,beside,besides,best,better,between,beyond,both,brief,but,by,c,mon,c,s,came,can,can,t,cannot,cant,cause,causes,certain,certainly,changes,clearly,co,com,come,comes,concerning,consequently,consider,considering,contain,containing,contains,corresponding,could,couldn,t,course,currently,definitely,described,despite,did,didn,t,different,do,does,doesn,t,doing,don,t,done,down,downwards,during,each,edu,eg,eight,either,else,elsewhere,enough,entirely,especially,et,etc,even,ever,every,everybody,everyone,everything,everywhere,ex,exactly,example,except,far,few,fifth,first,five,followed,following,follows,for,former,formerly,forth,four,from,further,furthermore,get,gets,getting,given,gives,go,goes,going,gone,got,gotten,greetings,had,hadn,t,happens,hardly,has,hasn,t,have,haven,t,having,he,he,s,hello,help,hence,her,here,here,s,hereafter,hereby,herein,hereupon,hers,herself,hi,him,himself,his,hither,hopefully,how,howbeit,however,i,d,i,ll,i,m,i,ve,ie,if,ignored,immediate,in,inasmuch,inc,indeed,indicate,indicated,indicates,inner,insofar,instead,into,inward,is,isn,t,it,it,d,it,ll,it,s,its,itself,just,keep,keeps,kept,know,knows,known,last,lately,later,latter,latterly,least,less,lest,let,let,s,like,liked,likely,little,look,looking,looks,ltd,mainly,many,may,maybe,me,mean,meanwhile,merely,might,more,moreover,most,mostly,much,must,my,myself,name,namely,nd,near,nearly,necessary,need,needs,neither,never,nevertheless,new,next,nine,no,nobody,non,none,noone,nor,normally,not,nothing,novel,now,nowhere,obviously,of,off,often,oh,ok,okay,old,on,once,one,ones,only,onto,or,other,others,otherwise,ought,our,ours,ourselves,out,outside,over,overall,own,particular,particularly,per,perhaps,placed,please,plus,possible,presumably,probably,provides,que,quite,qv,rather,rd,re,really,reasonably,regarding,regardless,regards,relatively,respectively,right,said,same,saw,say,saying,says,second,secondly,see,seeing,seem,seemed,seeming,seems,seen,self,selves,sensible,sent,serious,seriously,seven,several,shall,she,should,shouldn,t,since,six,so,some,somebody,somehow,someone,something,sometime,sometimes,somewhat,somewhere,soon,sorry,specified,specify,specifying,still,sub,such,sup,sure,t,s,take,taken,tell,tends,th,than,thank,thanks,thanx,that,that,s,thats,the,their,theirs,them,themselves,then,thence,there,there,s,thereafter,thereby,therefore,therein,theres,thereupon,these,they,they,d,they,ll,they,re,they,ve,think,third,this,thorough,thoroughly,those,though,three,through,throughout,thru,thus,to,together,too,took,toward,towards,tried,tries,truly,try,trying,twice,two,un,under,unfortunately,unless,unlikely,until,unto,up,upon,us,use,used,useful,uses,using,usually,value,various,very,via,viz,vs,want,wants,was,wasn,t,way,we,we,d,we,ll,we,re,we,ve,welcome,well,went,were,weren,t,what,what,s,whatever,when,whence,whenever,where,where,s,whereafter,whereas,whereby,wherein,whereupon,wherever,whether,which,while,whither,who,who,s,whoever,whole,whom,whose,why,will,willing,wish,with,within,without,won,t,wonder,would,would,wouldn,t,yes,yet,you,you,d,you,ll,you,re,you,ve,your,yours,yourself,yourselves,zero".split(',')
    punctuation = ['.', '?', '!', ':', ';', '...', '"', "'", '/', ',', '&', '|']
    stoplist = word_stoplist + punctuation
    # stoplist = ['for','a','of','the','and','to',',',';','.','"','in']
    tweets = []
    train = open("text.txt")
    train_tweets = train.readlines()
    for train_tweet in train_tweets:
        tokenizetweet = tokenizeRawTweetText(train_tweet)
        tweet = [word for word in tokenizetweet if word not in stoplist]
        # [tweet.remove(stopword) for stopword in stoplist if tweet.count(stopword) > 0]
        tweets.append(tweet)
    # Build gensim dictionary
    # Will want to consider options for doing this in streaming form
    dictionary = corpora.Dictionary(tweets)
    if saveLocal:
        dictionary.save('congress.dict')
    # Build gensim corpus
    # Need to consider serialization options
    corpus = [dictionary.doc2bow(tweet) for tweet in tweets]
    if saveLocal:
        if corpusFormat == 'mm':
            corpora.MmCorpus.serialize('corpus.mm', corpus)
        elif corpusFormat == 'blei':
            corpora.BleiCorpus.serialize('corpus.lda-c', corpus)
        elif corpusFormat == 'low':
            corpora.LowCorpus.serialize('corpus.low', corpus)
    return dictionary, corpus
def clean_dataframe(streamer):
    df = emotes.streamer_df(streamer)
    # Tokenize and clean data: remove hyperlinks and non-ASCII characters, drop
    # rows with fewer than 5 tokens, then lowercase all tokens.
    df['body'] = df['body'].apply(lambda x: twokenize.tokenizeRawTweetText(x))
    df['body'] = df['body'].apply(lambda x: [re.sub(r"http\S+", "", i) for i in x])
    df['body'] = df['body'].apply(lambda x: [re.sub(r'[^\x00-\x7F]', '', i) for i in x])
    indexNames = df[df['body'].str.len() <= 4].index
    df.drop(indexNames, inplace=True)
    df['body'] = df['body'].apply(lambda x: [i.lower() for i in x])
    # Grab global and streamer emotes and then lowercase them.
    X = emotes.global_streamer_emotes(streamer)
    X = [i.lower() for i in X]
    # Keep only rows that contain exactly one distinct emote (drop rows with
    # none, or with two or more).
    df['unique emotes'] = df['body'].apply(set)
    df['unique emotes'] = df['unique emotes'].apply(lambda x: x.intersection(set(X)))
    df['number of unique emotes'] = df['unique emotes'].apply(len)
    indexNames1 = df[df['number of unique emotes'] != 1].index
    df.drop(indexNames1, inplace=True)
    # Drop unnecessary (for now) columns.
    df = df.drop([
        'channel_id', 'commenter_id', 'commenter_type', 'created_at', 'fragments',
        'offset', 'updated_at', 'video_id', 'number of unique emotes'
    ], axis=1)
    # Turn the unique-emotes column into a column of strings instead of sets.
    df['unique emotes'] = df['unique emotes'].apply(lambda x: list(x)[0])
    # Remove identical messages (spam).
    df['body_str'] = df['body'].apply(lambda x: ' '.join(x))
    df = df.drop_duplicates(subset=['body_str'])
    df = df.drop(columns=['body_str'])
    return df
def tokenize_str(string):
    stopwords = corpus.stopwords.words('english')
    stopwords.extend([u"---", u"...", u"n't"])
    lemmatizer = WordNetLemmatizer()
    tokens = tokenizeRawTweetText(string)
    tokens = [token.lower() for token in tokens if len(token) > 2]
    tokens = [token for token in tokens if token[1:] not in hashtags]
    tokens = [token for token in tokens if token not in urls]
    tokens = [token for token in tokens if token[1:] not in user_mentions]
    tokens = [token for token in tokens if token not in stopwords]
    tokens = [token for token in tokens if not token.isdigit()]
    stemmed_tokens = []
    for w in tokens:
        # collapse characters repeated more than twice (e.g. "soooo" -> "soo")
        w = re.sub(r'(.)\1+', r'\1\1', w)
        stemmed_tokens.append(lemmatizer.lemmatize(w))
    return stemmed_tokens
def tweet_tokenizer(text: str) -> List[str]:
    '''
    A Twitter tokenizer from `CMU Ark <https://github.com/brendano/ark-tweet-nlp>`_

    This is a wrapper of `this <https://github.com/Sentimentron/ark-twokenize-py>`_.
    Furthermore, this is an adapted version as it also splits hashtags,
    e.g. `#anything` becomes [`#`, `anything`]. This follows the tokenization of
    `Yu et al. 2018 <https://www.aclweb.org/anthology/D18-1137>`_.

    :param text: A string to be tokenized.
    :returns: A list of tokens where each token is a String.
    :raises AssertionError: If the tokenized text is not character preserving.
    :raises ValueError: If the given text is not a String
    '''
    hashtag_pattern = re.compile('^#.+')
    if isinstance(text, str):
        tokenized_text = twokenize.tokenizeRawTweetText(text)
        hashtag_tokenized_text = []
        for token in tokenized_text:
            if hashtag_pattern.search(token):
                hashtag = token[0]
                hashtag_tokenized_text.append(hashtag)
                other_token_text = token[1:].strip()
                if other_token_text:
                    hashtag_tokenized_text.append(other_token_text)
            else:
                hashtag_tokenized_text.append(token)
        assert_err = ('The tokenizer has not been character preserving. Original'
                      f' text: {text}\nHashtag tokenized tokens '
                      f'{hashtag_tokenized_text}')
        assert is_character_preserving(text, hashtag_tokenized_text), assert_err
        return hashtag_tokenized_text
    raise ValueError(f'The parameter must be of type str not {type(text)}')
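# Hedged usage sketch for tweet_tokenizer above, showing the hashtag splitting.
# The example tweet is made up, the exact token boundaries depend on the
# underlying twokenize module, and is_character_preserving is assumed to be
# importable from the same module that defines tweet_tokenizer.
print(tweet_tokenizer('Great game #GoTeam'))
# expected to contain the split hashtag, e.g. ['Great', 'game', '#', 'GoTeam']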
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    # string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    # string = re.sub(r"\'s", " \'s", string)
    # string = re.sub(r"\'ve", " \'ve", string)
    # string = re.sub(r"n\'t", " n\'t", string)
    # string = re.sub(r"\'re", " \'re", string)
    # string = re.sub(r"\'d", " \'d", string)
    # string = re.sub(r"\'ll", " \'ll", string)
    # string = re.sub(r",", " , ", string)
    # string = re.sub(r"!", " ! ", string)
    # string = re.sub(r"\(", " \( ", string)
    # string = re.sub(r"\)", " \) ", string)
    # string = re.sub(r"\?", " \? ", string)
    # string = re.sub(r"\s{2,}", " ", string)
    string = twokenize.tokenizeRawTweetText(string)
    string = " ".join(str(x) for x in string)
    return string.strip().lower()
#--Command line parsing
op = OptionParser()
op.add_option('--i', dest='source', type='str',
              help='Filename of unprocessed text.')
op.add_option('--o', dest='destination', type='str',
              help='Filename for processed text.')
op.add_option('--stopwords', dest='stopwords_filename', type='str',
              help='Source of stopwords, default is local file called stopwords')
op.print_help()

opts, args = op.parse_args()
if len(args) > 0:
    op.error('This script only takes arguments preceded by command line options.')
    sys.exit(1)

custom_stopwords = set(stopwords.words('english') if not opts.stopwords_filename
                       else open('stopwords', 'rb').read().splitlines()
                       + stopwords.words('english'))

text = open(opts.source, 'rb').read().splitlines()
bar = Bar('Cleansing %s' % opts.source, max=len(text))
for i, item in enumerate(text):
    text[i] = [word for word in Tokenizer.tokenizeRawTweetText(item)
               if word not in custom_stopwords and not word.isdigit()]
    bar.next()
bar.finish()

# Remove words that only appear once
all_tokens = sum(text, [])
tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
text = [[word for word in item if word not in tokens_once] for item in text]

with open(opts.destination, 'wb') as f:
    for item in text:
        print>>f, ' '.join(item)
import re

import twokenize as Tokenizer

test = "Happy Bday @ImTheFrancescaC You're 10 years old! I wish you have amazing day LYSM pretty #birthday #of #a #princess pic.twitter.com/1s4DiF7TOn"
test = Tokenizer.tokenizeRawTweetText(test)
print test
def to_words(self, in_text):
    return tokenizeRawTweetText(in_text)
files = listdir('../data/us-to-russia/')
russia_tally = 0
putin_tally = 0
russia_list = []
putin_list = []
done_dict = defaultdict(bool)

for batch in files:
    with open('../data/us-to-russia/' + batch) as f:
        tweets = json.load(f)
        for t in tweets:
            # langid.classify returns a (language, score) tuple, so compare its first element
            if done_dict[t[u'id']] or langid.classify(t[u'text'])[0] != 'en' or len(t[u'text']) == 0:
                continue
            else:
                text = t[u'text']
                tokens = tokenizeRawTweetText(text)
                if 'russia' in tokens:
                    print 'found russia'
                    russia_list.append(('0', text))  # += tokens
                    russia_tally += 1
                if 'putin' in tokens:
                    print 'found putin'
                    putin_list.append(('0', text))  # += tokens
                    putin_tally += 1
                done_dict[t[u'id']] = True

russia_labeled = th.predict(russia_list)
putin_labeled = th.predict(putin_list)
print compute_polarity(Counter(russia_labeled))
print compute_polarity(Counter(putin_labeled))
def tokenize(text):
    return tokenizeRawTweetText(text)