def test_tokenize(self):
    tweet = "Packathon was a really #nice :) challenging 👌. @packathonorg http://packathon.org"
    p.set_options(p.OPT.URL, p.OPT.HASHTAG, p.OPT.MENTION, p.OPT.EMOJI, p.OPT.SMILEY)
    tokenized_tweet = p.tokenize(tweet)
    self.assertEqual(
        tokenized_tweet,
        "Packathon was a really $HASHTAG$ $SMILEY$ challenging $EMOJI$. $MENTION$ $URL$"
    )
def processTweet(tweet):
    # Process the tweet: drop non-ASCII characters, strip twitter entities, then tokenize.
    tweet_ascii = tweet.encode('ascii', 'ignore').decode('ascii')
    new_tweet = p.clean(tweet_ascii)
    tokenized = p.tokenize(new_tweet)
    # Append the tokenized tweet to the output file.
    with open('CNG_test_main.txt', 'a') as createFile:
        createFile.write(tokenized + '\n')
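# For context: a minimal sketch of how the two library calls used above behave
# with their default options. The sample string is the one from the demo script
# further down; clean() drops twitter entities, while tokenize() replaces them
# with $HASHTAG$ / $EMOJI$ / $URL$ placeholders.
import preprocessor as p

sample = "Preprocessor is #awesome 👍 https://github.com/s/preprocessor"
print(p.clean(sample))
print(p.tokenize(sample))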
def preprocessData(fn, save_fn, corpus_file):
    print("Preprocessing {}...".format(fn))
    with open(os.path.join(param.dump_folder, fn), "rb") as handle:
        sent_list, label_list = pickle.load(handle)
    print("Tokenization...")
    sent_list = [p.tokenize(sent) for sent in sent_list]
    sent_list = [clean_str(sent).split() for sent in sent_list]
    with open(os.path.join(param.dump_folder, save_fn), "wb") as handle:
        pickle.dump((sent_list, label_list), handle)
    # write to corpus with comments
    if fn.endswith("comm.data"):
        corpus_file.write("\n".join([" ".join(sent) for sent in sent_list]))
        corpus_file.write("\n")
    print("Done preprocessing, save to {}".format(save_fn))
def text_preprocessor(doc):
    # separate hyperlinks from adjacent text, e.g. goodbyepic.twitter.com -> goodbye pic.twitter.com
    doc = re.sub(r'(\w*)(https?|pic\.)', r'\1 \2', doc)
    # uniformize twitter-specific tokens
    doc = preprocessor.tokenize(doc)
    # extract text from *, e.g. *nope* -> nope
    doc = re.sub(r'\*(.*?)\*', r'\1', doc)
    # replace & symbol
    doc = re.sub(r'&', r' and ', doc)
    # lower-casing
    doc = doc.lower()
    # uniformize some corpus-specific errors
    doc = re.sub(r'xan ', r'xanax ', doc)
    doc = re.sub(r'rogain', r'rogaine ', doc)
    doc = re.sub(r'adderal|aderall', r'adderall', doc)
    # normalize runs of three or more repeated characters to a single one
    doc = re.sub(r'(\w)\1\1+', r'\1', doc)
    # remove reddit symbol /r/
    doc = re.sub(r'/r/', r'', doc)
    # remove text between {}
    doc = re.sub(r'\{(.*?)\}', r'', doc)
    # uniformize emojis and numbers
    preprocessor.set_options(preprocessor.OPT.EMOJI, preprocessor.OPT.NUMBER)
    # split the NUMBER and EMOJI placeholders from adjacent text
    doc = preprocessor.tokenize(doc)
    doc = re.sub(r'(\w*)(EMOJI|NUMBER)', r'\1 \2', doc)
    doc = re.sub(r'(EMOJI|NUMBER)(\w*)', r'\1 \2', doc)
    # remove non-alphabetic characters
    doc = re.sub('[^A-Za-z ]+', '', doc)
    # keep only words between 2 and 17 characters long
    doc = ' '.join([item for item in doc.split() if 1 < len(item) < 18])
    # lower-casing
    doc = doc.lower()
    # remove multiple sequential occurrences of the same token
    doc = re.sub(r'(\w+) \1 \1+', r'\1', doc)
    return doc
def raw_tweet_prep_test(raw_tweet, stopwords, html_re, space_replace_re, repeating_re, single_char_re):
    tweet_tokenized = html_re.sub(' ', raw_tweet)
    tweet_tokenized = p.tokenize(tweet_tokenized.lower().replace('\n', ' '))
    tweet_tokenized = space_replace_re.sub(' ', tweet_tokenized)
    tweet_tokenized = repeating_re.sub(r"\1", tweet_tokenized)
    # raw_tweet = ' '.join(raw_tweet)
    # tweet_tokenized = single_char_re.sub(' ', tweet_tokenized)
    tweet_tokenized = tweet_tokenized.strip().split()
    words = [w for w in tweet_tokenized if w not in stopwords]
    if len(words) > 1:
        return words
    else:
        raise Exception("Input tweet too short")
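# The helper above expects precompiled regexes as arguments; those patterns are
# defined by the caller and are not shown here, so the ones below are only
# assumptions for illustration (single_char_re is passed but unused above).
import re
import preprocessor as p

html_re = re.compile(r'&\w+;')              # HTML entities such as &amp;
space_replace_re = re.compile(r'[^\w$ ]+')  # characters to collapse into spaces
repeating_re = re.compile(r'(\w)\1{2,}')    # runs of 3+ repeated characters
single_char_re = re.compile(r'\b\w\b')      # single-character tokens

stop_words = {'the', 'a', 'is', 'this', 'and', 'that'}  # tiny stand-in stop list
print(raw_tweet_prep_test("Loooove this &amp; that!!! http://t.co/x",
                          stop_words, html_re, space_replace_re,
                          repeating_re, single_char_re))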
def main(event: func.EventHubEvent) -> str:
    text = ""
    try:
        tweet = json.loads(event.get_body().decode('utf-8'))
        text = tweet[0]["text"]
        logging.info('Python EventHub trigger processed a tweet: %s', text)
    except KeyError:
        logging.error('Error parsing tweet.')
    else:
        # Tokenize the tweet and output it.
        tokenized = p.tokenize(text)
        logging.info('Tweet tokenized into: %s', tokenized)
        return tokenized
def get_pred(xyz):
    # Replace URLs, mentions, emojis, etc. with placeholder tokens.
    tokens = p.tokenize(xyz)
    # Map the tokenized tweet to the integer ids expected by the model.
    arr = np.array(list(vocab_processor.transform([tokens])))
    tmp = loaded_model.predict_proba(arr)
    acc = max(tmp[0]) * 100
    print(arr)
    res = np.argmax(loaded_model.predict(arr), 1)
    print(res)
    if res[0] == 1:
        val = "Sexist or Racist Post"
    else:
        val = "Neutral Post"
    return {'result': val, 'accuracy': acc}
def segment_posts_2(self, posts, personality):
    post_split = posts.split("|||")
    final_post_list = []
    for post in post_split:
        # Preprocess the tweets
        post = p.tokenize(post)
        post = self.post_process(post)
        # append to the final string
        final_post_list.append(post)
    final_post_str = " ".join(final_post_list)
    # Append to the row list
    one_personality_one_big_post = {
        'type': personality,
        'post': final_post_str
    }
    self.row_list.append(one_personality_one_big_post)
def preprocess_text(message):
    # Replace URLs, pictures, numbers, etc. with placeholder tokens.
    message = p.tokenize(message)
    # Keep only word characters, '$' (the placeholder delimiter) and spaces.
    pattern = re.compile(r"[^\w$ ]")
    message = pattern.sub('', message)
    message = re.sub('[0-9]+', '', message)
    # Make sure placeholder tokens are followed by a space.
    message = message.replace('$PIC$', '$PIC$ ')
    message = message.replace('$NUMBER$', '$NUMBER$ ')
    message = message.replace('$URL$', '$URL$ ')
    # Drop anything after a trailing picture/URL placeholder.
    if '$PIC$' in message or '$URL$' in message:
        message = message.rsplit('$', 1)[0] + "$"
    # Split camelCase words.
    message = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', message)
    print(message)
    return message
def parseFile(filename, outfile):
    with open(filename) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter='\n')
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                print(f'Column names are {", ".join(row)}')
                line_count += 1
            else:
                print(line_count)
                if len(row) != 0:
                    print(row)
                    # Removing mentions, URLs, emojis and tokenizing
                    clean_row = p.clean(row[0])
                    clean_row = deEmojify(clean_row)
                    tokens = p.tokenize(clean_row)
                    final = " {}".format(tokens)
                    letters_only_text = re.sub(r"[^a-zA-Z\'\#]", " ", final).lower()
                    word_array = letters_only_text.split()
                    word_array = [term for term in word_array if term not in stop_words]
                    # Recreating string
                    cleaned = " ".join(word_array)
                    print(cleaned)
                    if len(cleaned) > 0:
                        preprocessed.append(cleaned)
                line_count += 1
    print(f'Processed {line_count} lines.')
    print(len(preprocessed))
    # Writing preprocessed, emoji-free tweets to csv file.
    with open(outfile, mode='w') as csv_file:
        for i in range(len(preprocessed)):
            csv_file.write(preprocessed[i])
            csv_file.write('\n')
        csv_file.write('\n')
def preprocess(s, lowercase=False):
    import string
    table = str.maketrans('', '', string.punctuation)
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED)
    s = p.tokenize(s)
    tokens_raw = s.replace("\n", "").strip("'[]\' '").split()
    # Added lemmatization (1/1/18). Potentially should remove to compare results
    # tokens = gensim.utils.lemmatize(s, stopwords=STOPWORDS, min_length=1)
    # Without lemmatization, remove punctuation
    if lowercase:
        tokens = [token.strip().lower().translate(table) for token in tokens_raw]
    else:
        tokens = [token.strip().translate(table) for token in tokens_raw]
    return tokens
def getText(text, pattern, subs, stopwords):
    if text is not None and len(text) > 0:
        text = text.replace(u'\n', u' ').replace('(', ' (').strip().lower()
        tlines = preprocessor.parse_field2(text)
        lines = []
        for t in tlines:
            t = preprocessor.join_tokens(
                [tok for tok in preprocessor.tokenize(t)])
            t = pattern.sub(lambda m: subs[re.escape(m.group(0))], t)
            t = preprocessor.join_tokens([
                preprocessor.strip_leading_and_trailing_nums(tok)
                for tok in preprocessor.tokenize2(t)
                if tok not in stopwords and tok not in string.punctuation
                and len(tok) > 1 and not preprocessor.containsThreeConsecTokens(tok)
            ])
            lines.append(t)
        t = preprocessor.join_tokens(lines)
        return t if len(t) > 1 else None
    return None
def raw_tweet_prep_stem_test(raw_tweet, stopwords, stemmer, html_re, space_replace_re, repeating_re, single_char_re):
    # remove some more things ('s, 'm, 't, HTML symbols, other non-English chars, and repeated expressions)
    tweet_tokenized = html_re.sub(' ', raw_tweet)
    print(tweet_tokenized)
    # tokenize and replace URLs with 'URL', numbers with 'NUMBER' and emojis with 'EMOJI'
    tweet_tokenized = p.tokenize(tweet_tokenized.lower().replace('\n', ' '))
    print(tweet_tokenized)
    tweet_tokenized = space_replace_re.sub(' ', tweet_tokenized)
    print(tweet_tokenized)
    tweet_tokenized = repeating_re.sub(r"\1", tweet_tokenized)
    print(tweet_tokenized)
    # tweet_tokenized = single_char_re.sub(' ', tweet_tokenized)
    tweet_tokenized = tweet_tokenized.strip().split()
    print(tweet_tokenized)
    words = [stemmer.stem(w) for w in tweet_tokenized if w not in stopwords]
    if len(words) > 1:
        return words
    else:
        raise Exception("Input tweet too short")
def _preprocess_tweet(tweet, stop_words, stemmer):
    tweet_ = _preprocess_tags(tweet)
    tweet_ = p.tokenize(tweet_)  # emoji, smiley, number
    tweet_ = tokenizer_.tokenize(tweet_)

    def https2url(token):
        if "https" in token:
            return "url"
        else:
            return token

    tweet_ = list(map(https2url, tweet_))
    # removes all non-word tokens:
    tweet_ = list(filter(lambda token: token.isalpha() and len(token) > 1, tweet_))
    # removes all the stop words:
    tweet_ = list(filter(lambda token: token not in stop_words, tweet_))
    # stemming:
    tweet_ = [stemmer.stem(token) for token in tweet_]
    tweet_ = " ".join(tweet_)
    return tweet_
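# Hypothetical wiring of the collaborators the helper above assumes
# (_preprocess_tags, the module-level tokenizer_, and the option set implied by
# the "emoji, smiley, number" comment); none of this is the original setup code.
import preprocessor as p
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

p.set_options(p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.NUMBER)

def _preprocess_tags(tweet):
    # Stand-in; the real tag handling is defined elsewhere.
    return tweet

tokenizer_ = TweetTokenizer()
stop_words = {'the', 'a', 'is', 'and', 'to'}
stemmer = PorterStemmer()

print(_preprocess_tweet("Loving the 2 new #features 😀 https://t.co/x", stop_words, stemmer))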
def clean_data(line):
    ## Remove @, reduce length, handle strip
    tokenizer = TweetTokenizer(strip_handles=False, reduce_len=True)
    line = ' '.join(tokenizer.tokenize(line))
    ## Remove URLs and reserved words; keep smileys, emojis and mentions
    # pre.set_options(pre.OPT.URL, pre.OPT.EMOJI, pre.OPT.MENTION, pre.OPT.RESERVED)
    pre.set_options(pre.OPT.URL, pre.OPT.RESERVED)
    line = pre.tokenize(line)
    ## Remove non-ASCII characters
    line = ''.join([i if ord(i) < 128 else '' for i in line])
    # if not line:
    #     line = 'RT'
    line = line + ' <end>'
    """
    line = line.replace(r'https?://\S+', r'')  # remove url
    if line.startswith('RT @'):
        line = line.replace(r'RT ', r'')  # remove RT (retweet)
    """
    return line
def tokenize_short_text(self, raw_tweet_text):
    tweet_text = raw_tweet_text
    # tweet_text = tweet_text.strip()
    # tweet_text = unidecode.unidecode(tweet_text)
    if self.args.use_lowercase:
        tweet_text = tweet_text.lower()
    if self.tokenizer > 0:
        if self.tokenizer == 1:
            utterance_tokens = word_tokenize(tweet_text)
        elif self.tokenizer == 2:
            utterance_tokens = wordpunct_tokenize(tweet_text)
        elif self.tokenizer == 3:
            utterance_tokens = self.tweet_tokenizer.tokenize(tweet_text)
        elif self.tokenizer == 4:
            tweet_text = clean(tweet_text)
            tweet_text = self.remove_accented_chars(tweet_text)
            utterance_tokens = self.tweetokenizer.tokenize(tweet_text)
            utterance_tokens = self.remove_duplicated_sequential_words(utterance_tokens)
            utterance_tokens = self.remove_stopwords(utterance_tokens)
        elif self.tokenizer == 5:
            tweet_text = tokenize(' '.join(self.tweet_tokenizer.tokenize(tweet_text)))
            return tweet_text
        elif self.tokenizer == 6:
            tweet_text = clean(' '.join(self.tweet_tokenizer.tokenize(tweet_text)))
            return tweet_text
        if self.stem:
            utterance_tokens = [list(map(self.stemmer.stem, sub)) for sub in utterance_tokens]
        if self.lemmatize:
            utterance_tokens = [[self.lemmatizer.lemmatize(tok, pos='v') for tok in sub]
                                for sub in utterance_tokens]
        tweet_text = " ".join(utterance_tokens)
    return tweet_text
def preprocess_word_based(tweets, vocab_model):
    p.set_options(p.OPT.EMOJI, p.OPT.URL, p.OPT.MENTION)  # , p.OPT.HASHTAG)
    batch = []
    pattern = re.compile(r'[^0-9a-z\s]+', re.UNICODE)
    for tweet in tweets:
        # Should I remove hashtags completely, or just remove the symbol?
        clean = p.tokenize(tweet)
        clean = split_hashtags(clean)
        clean = clean.lower()
        clean = pattern.sub(' ', clean)
        words = clean.split()
        res = []
        for word in words:
            if word != '':
                try:
                    vec = vocab_model.word_vec(word)
                except KeyError:
                    vec = unknown_vector
                finally:
                    res.append(vec)
        batch.append(res)
    return batch
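# A hypothetical wiring of the globals the function above relies on: a
# gensim-style vocab_model exposing word_vec(), an unknown_vector fallback, and
# a stand-in split_hashtags. The path, dimensions and helpers are assumptions.
import numpy as np
from gensim.models import KeyedVectors

vocab_model = KeyedVectors.load_word2vec_format("embeddings.bin", binary=True)
unknown_vector = np.zeros(vocab_model.vector_size, dtype=np.float32)

def split_hashtags(text):
    # Stand-in: just drop the '#'; the original helper may segment hashtag words.
    return text.replace('#', ' ')

batch = preprocess_word_based(["Loving the #weather today 😀 @friend"], vocab_model)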
def predict_personality_from_post(model, post, wb):
    # Preprocess the tweets
    post = p.tokenize(post)
    # print(post)
    # Grab the embeddings and averages
    embeddings_from_post = np.array(wb.compute_embeddings([post], wb.embedding_index))
    embedding_avgs = wb.compute_average(embeddings_from_post)
    print('[+] Embedding length from test text: ' + str(len(embeddings_from_post)))
    print(wb.compute_average(embeddings_from_post).shape)
    print('[+] Prediction: ')
    # Compute the prediction
    prediction = model.predict(embedding_avgs)
    print(prediction)
    # Get personality type
    index_personality = prediction[0] - 1
    personality_type = personality_types[index_personality]
    return personality_type
def preprocess(text):
    # remove the hashtag symbol and hyphens, lower-case the text
    text = text.replace("#", '')
    text = text.replace("-", ' ')
    text = text.lower()
    # tokenize URLs, then clean out the emojis
    tp.set_options(tp.OPT.URL)
    text = tp.tokenize(text)
    tp.set_options(tp.OPT.EMOJI)
    text = tp.clean(text)
    # strip out all punctuation and split the text into tokens
    text = "".join([char for char in text if char not in string.punctuation])
    text = nltk.word_tokenize(text)
    # initialize a WordNet lemmatizer from NLTK and apply it to all words in the list
    lemmatizer = WordNetLemmatizer()
    lem_text = [lemmatizer.lemmatize(word) for word in text]
    final_text = " ".join([word for word in lem_text])
    return final_text
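# Imports and one-time NLTK resource downloads that the helper above assumes;
# the sample call is illustrative only.
import string
import nltk
from nltk.stem import WordNetLemmatizer
import preprocessor as tp

nltk.download('punkt')    # needed by nltk.word_tokenize
nltk.download('wordnet')  # needed by WordNetLemmatizer

print(preprocess("Check out #this thread 😀 https://example.com"))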
def plot_attention_graph(model, x, Tx, Ty, human_vocab, layer=7):
    # Process input
    tokens = np.array([tokenize(x, human_vocab, Tx)])
    tokens_oh = oh_2d(tokens, len(human_vocab))

    # Monitor model layer
    layer = model.layers[layer]
    layer_over_time = K.function(model.inputs, [layer.get_output_at(t) for t in range(Ty)])
    layer_output = layer_over_time([tokens_oh])
    layer_output = [row.flatten().tolist() for row in layer_output]

    # Get model output
    prediction = get_prediction(model, tokens_oh)[1]

    # Graph the data
    fig = plt.figure()
    fig.set_figwidth(20)
    fig.set_figheight(1.8)
    ax = fig.add_subplot(111)

    plt.title('Attention Values per Timestep')
    plt.rc('figure')
    cax = plt.imshow(layer_output, vmin=0, vmax=1)
    fig.colorbar(cax)

    plt.xlabel('Input')
    ax.set_xticks(range(Tx))
    ax.set_xticklabels(x)

    plt.ylabel('Output')
    ax.set_yticks(range(Ty))
    ax.set_yticklabels(prediction)

    plt.show()
def preprocessing2(text):
    # Drop non-ASCII characters, then replace URLs, mentions and hashtags
    # with placeholder tokens and lower-case the result.
    text = text.encode('ascii', 'ignore').decode('ascii')
    proc.set_options(proc.OPT.URL, proc.OPT.MENTION, proc.OPT.HASHTAG)
    clean_ver = proc.tokenize(text).lower()
    return str(clean_ver)
PATH = './Dataset'

# Fetch your test tweets and labels:
"""
Both files are stored in .pkl format.
1) x_test : list containing all tweets of users
2) y_test : contains binary class values as 1: Hate | 0: Counter
"""
x_test = pickle.load(open(os.path.join(PATH, 'x_test.pkl'), 'rb'))
y_test = pickle.load(open(os.path.join(PATH, 'y_test.pkl'), 'rb'))

# >>>> Preprocessing:
prep_tweets = []
for tweet in tqdm(x_test):
    prep_tweets.append(prep.tokenize(tweet))

# ****************************************************************************
# TF-IDF Vectorizers:
# pretrained vocabulary from 6 million tweets on word level
word_vectorizer = TfidfVectorizer(vocabulary=pickle.load(open("word_vocab.pkl", "rb")))
# pretrained vocabulary from 6 million tweets on char level
char_vectorizer = TfidfVectorizer(vocabulary=pickle.load(open("char_vocab.pkl", "rb")))

char_features = char_vectorizer.transform(prep_tweets)
word_features = word_vectorizer.transform(prep_tweets)

# *******************************************************************************
# >>>> Lexical Features:
'''
!pip install empath
from empath import Empath
Run it on x_test and store it in ./Models/ in .pkl format
'''
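# A minimal sketch of the lexical-feature step described in the comment block
# above: Empath's analyze() returns a dict of category scores per tweet. The
# output filename under ./Models/ is an assumption.
from empath import Empath

lexicon = Empath()
lexical_features = [lexicon.analyze(tweet, normalize=True) for tweet in x_test]
pickle.dump(lexical_features, open('./Models/lexical_features.pkl', 'wb'))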
import preprocessor as p
import re

str_to_clean = 'Preprocessor is #awesome 👍 https://github.com/s/preprocessor'
clean_str = p.tokenize(str_to_clean)
# print([word for word in clean_str.split() if word.startswith('$') and word.endswith('$')])

# Rewrite $TOKEN$ placeholders as <TOKEN>.
new_str = []
for word in clean_str.split():
    if word.startswith('$') and word.endswith('$'):
        word = '<' + word[1:len(word) - 1] + '>'
    new_str.append(word)
new_str = " ".join(new_str)

# m = re.sub(r'\$(URL|EMOJI)\$', r'<\1>', clean_str)
m = re.findall(r'\$\w+\$', clean_str.rstrip())
print(m)
# print(clean_str)
# print(new_str)
def cleanQuery(query):
    tokens = tokenize(query)
    query = []
    for i in range(len(tokens)):
        token = tokens[i]
        print('token : ', token)
        if not bool(re.search(r'^\d+$', token)):
            alpha_num = bool(re.search(r'(^\d+|\d+$)', token))
            if token in misspelled_synonym:
                print('i am here 1')
                token = misspelled_synonym[token]
                query = query + tokenize(token)
                continue
            elif i == 0 and len(tokens) == 1 and dictionary[token] < 50 and alpha_num:
                print('i am here 2')
                token = correct_alpha_num(token)
            elif i == 0 and len(tokens) == 1 and fw_dictionary[token] < 50:
                print('i am here 3')
                token = correct(token)
            elif i == 0 and len(tokens) > 1 and dictionary[tokens[1]] > 50 and alpha_num:
                print('i am here 4')
                token = correct_alpha_num(token, nex=tokens[1])
            elif i == 0 and len(tokens) > 1 and dictionary[token] < 450 and dictionary[tokens[1]] < 20:
                print('i am here 5')
                token = correct(token)
            elif i == 0 and len(tokens) > 1 and dictionary[tokens[1]] > 50 and cPBigram(word=token, nex=tokens[1]) == 0:
                print('i am here 6')
                token = correct(token, nex=tokens[1])
            elif i > 0 and dictionary[query[-1]] >= 1 and cPBigram(word=token, prev=query[-1]) == 0 and alpha_num:
                print('i am here 7')
                token = correct_alpha_num(token, prev=query[-1])
            elif i > 0 and dictionary[query[-1]] >= 20 and cPBigram(token, query[-1]) == 0:
                print('i am here 8' + query[-1])
                token = correct(token, prev=query[-1])
            elif i > 0 and dictionary[token] <= 5 and cPBigram(token, query[-1]) == 0:
                print('i am here 9')
                token = correct(token)
            else:
                print('i am here 10')
                query.append(token)
                continue
            for tkn in token:
                if dictionary[tkn]:
                    query.append(tkn)
                elif len(tkn) <= 15:
                    sep_tokens = ws.segment(tkn)
                    if Pwords2(sep_tokens) >= 1.5e-06:
                        query = query + sep_tokens
                    else:
                        query.append(tkn)
                else:
                    query.append(tkn)
        else:
            query.append(token)
    clean_query = []
    for token in merge_tokens(query):
        clean_query = clean_query + tokenize(misspelled_synonym.get(token, token))
    return (' '.join(clean_query), Pwords2(clean_query))
def text_cleaner(text):
    # Set the options to tokenize: URL -> $URL$
    pp.set_options(pp.OPT.URL)
    toRtn = pp.tokenize(text)
    # Drop the URL placeholders, effectively removing URLs from the text.
    return toRtn.replace("$URL$", "")
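# A near-equivalent sketch using the library's clean() instead of
# tokenize() + replace: with only the URL option active, clean() drops URLs
# directly (whitespace handling may differ slightly from the version above).
def text_cleaner_via_clean(text):
    pp.set_options(pp.OPT.URL)
    return pp.clean(text)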
def preprocess_text(text: str, opts, nlpengine=None, lang='en',
                    special_tags=["<pad>", "<eos>"], use_tw_preprocessor=True):
    if use_tw_preprocessor:
        ## ! There is a bug in the original package for twitter preprocessing:
        # sometimes the regexp for link preprocessing freezes,
        # so we preprocess links separately.
        text = re.sub(
            r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?",
            "$URL$", text.strip())
        twitter_preprocessor.set_options('mentions')
        text = twitter_preprocessor.tokenize(text)
        # processed_chunk = twitter_preprocessor.clean(text)

    if nlpengine is None:
        global nlp
        if nlp is None:
            nlp = spacy.load(lang)
            nlp.add_pipe(nlp.create_pipe('sentencizer'))
            for x in ['URL', 'MENTION', 'HASHTAG', 'RESERVED', 'EMOJI', 'SMILEY', 'NUMBER']:
                nlp.tokenizer.add_special_case(f'${x}$', [{ORTH: f'${x}$'}])
        nlpengine = nlp

    BLvec = []
    POSvec = []
    DEPvec = []
    NERvec = []

    processed_chunk = ""
    doc = nlpengine(text)
    doclen = 0
    for sentence in doc.sents:
        for w in sentence:
            # Some phrases are automatically tokenized by spaCy,
            # i.e. New York; in that case we want New_York in our dictionary
            word = "_".join(w.text.split())
            if word.isspace() or word == "":
                continue
            if opts.remove_stop_words and word.lower() in stopWords:
                continue
            if opts.remove_puncuation and word in punctuation:
                continue
            # spaCy lemmatizes I/He/She/It into the artificial
            # -PRON- lemma, which is unwanted
            if opts.lemmatize_words:
                output = w.lemma_ if w.lemma_ != '-PRON-' else w.lower_
            else:
                output = word
            if opts.to_lowercase:
                output = output.lower()
            if opts.replace_nums and output.replace('.', '', 1).isdigit():
                output = opts.num_replacement
            output = output.replace("n't", "not")
            doclen += 1
            processed_chunk += "%s " % (output)

            # Sometimes, when the word contains punctuation and we split it manually,
            # the output can contain multiple tokens.
            # In such a case, just copy the features...; it happens rarely.
            if opts.returnbiglettervector:
                BLvec.append(int(w.text[0].isupper()))
            if opts.returnposvector:
                POSvec.append(POS_dict.get(w.pos, POS_dict['UNK']))
            if opts.returnDEPvector:
                try:
                    DEPvec.append(validDEPS.index(w.dep_.lower()))
                except ValueError:
                    DEPvec.append(validDEPS.index('UNK'))
            if opts.returnNERvector:
                try:
                    NERvec.append(validNER.index(w.ent_type_))
                except ValueError:
                    NERvec.append(validNER.index('UNK'))

        if opts.add_eos:
            doclen += 1
            processed_chunk += opts.eos + "\n"
            if opts.returnbiglettervector:
                BLvec.append(0)
            if opts.returnposvector:
                POSvec.append(POS_dict['EOS'])
            if opts.returnDEPvector:
                DEPvec.append(0)
            if opts.returnNERvector:
                NERvec.append(0)
        else:
            processed_chunk += "\n"

    processed_chunk = processed_chunk.strip()
    assert len(processed_chunk.split()) == len(BLvec) == len(POSvec) == len(DEPvec) == len(NERvec)
    return processed_chunk, BLvec, POSvec, DEPvec, NERvec
def tokenize(text, lower=True):
    cleaned_text = tweet_preprocessor.tokenize(text)
    return cleaned_text.lower() if lower else cleaned_text
def extract_stats(clean_tweet, tweet_id, human_or_bot):
    found = human_relationships_identifier(clean_tweet)
    # print(clean_tweet, found)
    pos_list = []
    pos_counter = collections.Counter()
    spacy_stats = nlp(clean_tweet)
    for token in spacy_stats:
        pos_counter[token.pos_] += 1
        pos_list.append(token.pos_)
    entity_list = []
    entity_label_list = []
    for ent in spacy_stats.ents:
        entity_label_list.append(ent.label_)
        entity_list.append(ent.text)
    char_count = len(spacy_stats.text)
    unique_words = []
    processed_tweet = p.tokenize(clean_tweet)
    curve = ttr_curve(pos_list)
    processed_word_count = 0
    spell_error_count = 0
    for word in processed_tweet.split():
        # clean_word = re.sub(r'[^A-Za-z]', "", word)
        # if dictionary.check(clean_word) is False and not word.startswith('$'):
        #     print(word)
        #     spell_error_count += 1
        processed_word_count += 1
        if word not in unique_words:
            unique_words.append(word)
    if processed_word_count > 0:
        ttr = float(len(unique_words)) / float(processed_word_count)
    else:
        ttr = 0
    i = 1
    arr = []
    for pos in pos_list:
        arr.append(i)
        i += 1
    if len(arr):
        ttr_slope = linregress(arr, curve)[0]
    else:
        ttr_slope = 0
    tweet_dict = {
        'index': tweet_id,
        'raw': clean_tweet,
        'preprocessed tweet': processed_tweet,
        'char count': char_count,
        'pos': pos_list,
        'pronouns': pos_counter['PRON'],
        'nouns': pos_counter['NOUN'],
        'verbs': pos_counter['VERB'],
        'adverbs': pos_counter['ADV'],
        'adjectives': pos_counter['ADJ'],
        'symbols': pos_counter['SYM'],
        'punctuation': pos_counter['PUNCT'],
        'proper nouns': pos_counter['PROPN'],
        'entity label': entity_label_list,
        'word count': processed_word_count,
        'unique word count': len(unique_words),
        'TTR': ttr,
        'entity raw text': entity_list,
        'mentions': clean_tweet.count('@'),
        'hashtags': clean_tweet.count('#'),
        'urls': clean_tweet.count('$URL$'),
        'class': human_or_bot,
        'relationship words': found,
        'count(rel words)': len(found),
        'ttr curve': curve,
        'ttr slope': ttr_slope
    }
    return tweet_dict
def tokenize(sent):
    tokenized = p.tokenize(p.clean(sent))
    print(tokenized)
    return tokenized
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("src/model.h5")
print("Loaded model from disk")

#################################### Pre-process Tweets ################################
index_vocabolario = {idx: w for w, idx in vocabolario_index_twitter.items()}

id_text_pad_list = []
for i in range(len(data)):
    try:
        # Prefer the full text of the retweeted status when available.
        id_text_pad_list += [
            (data[i]['id'],
             replace_word_index_twitter((substitute_label(
                 (normalize(p.tokenize(data[i]['retweeted_status']['text']))).split())).split()))
        ]
    except KeyError:
        id_text_pad_list += [
            (data[i]['id'],
             replace_word_index_twitter((substitute_label(
                 (normalize(p.tokenize(data[i]['text']))).split())).split()))
        ]

array_pad = np.array([j for i, j in id_text_pad_list])
padding = sequence.pad_sequences(array_pad, maxlen=40, padding='post')
prediction = loaded_model.predict_classes(padding)
pred = ['negative' if i == 0 else 'positive' for i in prediction]

dict_id_sentiment = {}
for i, j in enumerate(array_pad):
    dict_id_sentiment[id_text_pad_list[i][0]] = pred[i]
import preprocessor
import mlcs

if __name__ == '__main__':
    with open('testcases/colors.json') as f:
        text = f.read()
    tokens = [token for i, token in preprocessor.tokenize(text)]
    mlcs.printResults(tokens)
    # time complexity: 110595198
    # space complexity: 487550
def test_tokenize(self):
    tweet = 'Packathon was a really #nice :) challenging 👌. @packathonorg http://packathon.org'
    tokenized_tweet = p.tokenize(tweet)
    self.assertEqual(
        tokenized_tweet,
        'Packathon was a really $HASHTAG$ $SMILEY$ challenging $EMOJI$. $MENTION$ $URL$'
    )