def load_data(path, word2id, cases_per_file=None):
    """Read the CSV files in `path` and return (token id lists, labels)."""
    tweet_ids = []
    files = listdir(path)
    files = [path + '/' + file for file in files]
    features = pd.DataFrame()
    for file in files:
        if cases_per_file is None:
            file_df = pd.read_csv(file, header=0, usecols=['text', 'label'])
        else:
            file_df = pd.read_csv(file, header=0, usecols=['text', 'label'],
                                  nrows=cases_per_file)
        features = pd.concat([features, file_df])
    labels = features.pop('label')
    features['text'] = features['text'].astype(str)
    for _, row in features.iterrows():
        string = pt.tokenize(row.values[0])
        split = string.split()
        # unknown words map to UNKNOWN_INDEX
        tweet = [word2id.get(word, UNKNOWN_INDEX) for word in split]
        tweet_ids.append(tweet)
    labels = np.asarray(labels, dtype=np.int32)
    return tweet_ids, labels
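load_data assumes a word2id mapping and an UNKNOWN_INDEX constant defined elsewhere in the module. A minimal sketch of how such a mapping might be built; the helper name and the pad/unknown index values are assumptions, not taken from the original code:

PAD_INDEX = 0       # assumed convention: 0 reserved for padding
UNKNOWN_INDEX = 1   # assumed convention: 1 reserved for out-of-vocabulary words

def build_word2id(vocab_words):
    """Map each vocabulary word to an integer id, reserving 0 and 1 for pad/unknown."""
    word2id = {}
    next_id = 2
    for word in vocab_words:
        if word not in word2id:
            word2id[word] = next_id
            next_id += 1
    return word2id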
def tokenize(self, txt):
    '''Preprocess text and tokenize it'''
    txt = preprocess_twitter.tokenize(txt)
    txt = self.unescape_html(txt)
    txt = self.split_punctuations(txt)
    txt = self.split_emojis(txt)
    txt = self.replace_emojis(txt)
    words = self.tokenizer.tokenize(txt)
    return words
def _text_preprocessor(self, text):
    text = preprocess_twitter.tokenize(text)
    text = casual.reduce_lengthening(text)
    text = cleanString(setupRegexes('twitterProAna'), text)
    text = ' '.join([span for notentity, span in
                     tweetPreprocessor(text, ("urls", "users", "lists"))
                     if notentity])
    text = text.replace('\t', '')
    text = text.replace('< ', '<').replace(' >', '>')
    text = text.replace('):', '<sadface>').replace('(:', '<smile>')
    text = text.replace(" 't", "t")  # .replace("#", "")
    return ' '.join(text.split())
def tweet_preprocess(tweet):
    text = twitter_pr.tokenize(tweet)
    split = text.split()
    tweet = [word2id.get(word, import_data.UNKNOWN_INDEX) for word in split]
    tweet_len = len(tweet)
    # sequence.pad_sequences expects a batch (list of sequences), so wrap in a list
    tweet = sequence.pad_sequences([tweet], maxlen=TWEET_LENGTH,
                                   truncating='post', padding='post',
                                   value=import_data.PAD_INDEX)
    # drop the extra batch dimension we added above
    return tweet[0], tweet_len
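A runnable sketch of just the padding step used above, assuming the Keras sequence utilities and a TWEET_LENGTH of 50 (the maxlen used by run_model further below); the ids are made up for illustration:

from tensorflow.keras.preprocessing import sequence

TWEET_LENGTH = 50
PAD_INDEX = 0

ids = [14, 7, 93]  # token ids for a short, made-up tweet
padded = sequence.pad_sequences([ids], maxlen=TWEET_LENGTH,
                                truncating='post', padding='post',
                                value=PAD_INDEX)
print(padded.shape)  # (1, 50); padded[0] drops the batch dimension again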
def proc_file(in_filename, out_filename, glove_vocab):
    infile = open(in_filename)
    reader = csv.reader(infile, quotechar='"')
    outfile = open(out_filename, 'w')
    writer = csv.writer(outfile, quotechar='"')
    tumblr_vocab = set()
    tokenizer = TweetTokenizer()
    for i, line in enumerate(reader):
        if len(line) != 10:
            continue
        caption = tokenizer.tokenize(tokenize(line[8]))
        num_in = len([x for x in caption if x in glove_vocab])
        for word in caption:
            tumblr_vocab.add(word)
        # keep the row only if more than half of its tokens are in the GloVe vocabulary
        if num_in * 2 > len(caption):
            writer.writerow(line)
    infile.close()
    outfile.close()
    print("Tumblr vocab size:", len(tumblr_vocab))
    print("Overlap:", len(glove_vocab & tumblr_vocab))
def tokenise_tweet(text):
    text = preprocess_twitter.tokenize(text)
    text = preprocess_tweet(text)
    return ' '.join(text.split())
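For reference, a quick way to see what the Stanford-style preprocess_twitter.tokenize step feeds into the wrappers above; the exact special tokens (<user>, <url>, <hashtag>, <smile>, ...) depend on the version of the script in use, and the sample tweet is made up:

import preprocess_twitter

raw = "@alice check https://t.co/xyz #wow :)"
print(preprocess_twitter.tokenize(raw))
# expected to contain markers such as <user>, <url>, <hashtag> and <smile>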
with open("num_of_tweets.txt", 'w') as f: f.write(str(num_dem_tweets)) f.write('\n') f.write(str(num_rep_tweets)) f.close() else: with open("num_of_tweets.txt", 'w') as f: f.write(str(num_dem_tweets)) f.write('\n') f.write(str(num_rep_tweets)) f.close() # if num_rep_tweets - old_num_rep_tweets > 10000000: date = datetime.now().strftime('%Y_%m_%d') cur2 = conn.cursor('repcur') cur2.execute('''SELECT content FROM tweetstest WHERE party = False''') with open('rep_tweets_{}.txt'.format(date), 'w') as f: for record in cur2: f.write(pre.tokenize(record[0]) + '\n') f.close() # if num_dem_tweets - int(old_num_dem_tweets) > 10000000: date = datetime.now().strftime('%Y_%m_%d') cur3 = conn.cursor('demcur') cur3.execute('''SELECT content FROM tweetstest WHERE party = True''') with open('dem_tweets_{}.txt'.format(date), 'w') as f: for record in cur3: f.write(pre.tokenize(record[0] + '\n')) f.close()
model = KeyedVectors.load_word2vec_format(
    "./glove.twitter.27B/word2vec200d.txt", binary=False)

###### TF-IDF #####
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfVectorizer = TfidfVectorizer(encoding='latin-1',
                                  vocabulary=model.wv.vocab.keys(),
                                  lowercase=True)
tfidf = tfidfVectorizer.fit_transform(data)
#####################

# Creating a representation for the whole tweet using GloVe word vectors
import preprocess_twitter as stanfordPreprocessing

for i, tweet in enumerate(data):
    tweet = stanfordPreprocessing.tokenize(tweet).split()
    # Without TF-IDF
    # features.append(buildTwitterVector(tweet, model, size=200))
    # With TF-IDF - do not remove punctuation since GloVe was trained with it
    features.append(
        buildTwitterVectorTFIDF(tweet, model, tfidfVectorizer,
                                tfidf.getrow(i).toarray(), size=200))

result = cross_validate(LogisticRegression(penalty='l2'),
                        X=features, y=sentiment, cv=5,
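buildTwitterVectorTFIDF above is project-local and not shown; a hedged sketch of what such a tf-idf-weighted tweet vector could look like, assuming a gensim KeyedVectors model and the tf-idf row produced above (the function and argument names here are illustrative, not the original implementation):

import numpy as np

def tfidf_weighted_tweet_vector(tokens, keyed_vectors, vectorizer, tfidf_row, size=200):
    """Average the word vectors of a tweet, weighting each word by its tf-idf score."""
    vec = np.zeros(size)
    total_weight = 0.0
    vocab = vectorizer.vocabulary_          # term -> column index in the tf-idf matrix
    for word in tokens:
        if word in keyed_vectors and word in vocab:
            weight = tfidf_row[0, vocab[word]]
            vec += weight * keyed_vectors[word]
            total_weight += weight
    return vec / total_weight if total_weight > 0 else vec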
import re
from scipy.stats import itemfreq
import pandas as pd
import fasttext as ft
import numpy as np
import sys
from nltk.corpus import stopwords  # needed for stopwords.words('english') below

# utils.load_ds and tokenize are project-local helpers (not shown here)

ds_name = sys.argv[1]
ds_filename = './data/{}_cleaned.txt'.format(ds_name)
model_filename = './model/{}_cleaned.bin'.format(ds_name)

stopset = set(stopwords.words('english'))
loaded_ds = [(x[0], x[1]) for x in utils.load_ds(ds_filename, True)]
labels = [x[0] for x in loaded_ds]
tweets = [tokenize(x[1]) for x in loaded_ds]
# drop English stopwords
tweets = [
    ' '.join([w for w in tw.split() if w not in stopset]) for tw in tweets
]
# drop special tokens such as <user> and <url> produced by the tokenizer
tweets = [
    ' '.join([w for w in tw.split() if not w.startswith('<')]) for tw in tweets
]
# keep only lowercase letters and whitespace
r = re.compile(r'[^a-z\s]')
tweets = [r.sub('', tw).strip() for tw in tweets]

with open('./output/{}_w2v_tweets.txt'.format(ds_name), 'w') as f:
    for i in range(len(tweets)):
        f.write('{}{}\n'.format(labels[i], tweets[i]))
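The fasttext import and model_filename above suggest the cleaned tweets are later used to train embeddings; a hedged sketch using the modern fasttext API (this training call is an assumption, not part of the original excerpt):

# assumption: train unsupervised skip-gram embeddings on the cleaned tweets written above
model = ft.train_unsupervised('./output/{}_w2v_tweets.txt'.format(ds_name),
                              model='skipgram')
model.save_model(model_filename)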
def run_model(q, stop, predict_fn, word2id, red):
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
    logging.info("Loaded... listening on channel")
    while not stop.is_set():
        try:
            hashtags = 0
            urls = 0
            ids = []
            message = q.get_nowait()
            content = message['content'].decode("utf-8")
            channel = message['channel'].decode("utf-8")
            split = pt.tokenize(content)
            split = split.split()
            for word in split:
                index = word2id.get(word, i_data.UNKNOWN_INDEX)
                if index == i_data.HASHTAG_INDEX:
                    hashtags += 1
                elif index == i_data.URL_INDEX:
                    urls += 1
                ids.append(index)
            length = len(ids)
            # extra dimension is for feeding through sequence.pad_sequences
            ids = [ids]
            ids = sequence.pad_sequences(ids, maxlen=50, truncating='post',
                                         padding='post', value=i_data.PAD_INDEX)
            # remove the extra dimension
            ids = ids[0]
            # favorite_count, num_hashtags, num_urls, reply_count
            attributes = [0, hashtags, urls, 0]
            # the model expects the dict below
            inputs = {'text': ids, 'len': length, 'attributes': attributes}
            predictions = predict_fn(inputs)
            logging.info(
                "output class: %d \n user probability: %f \n bot probability %f",
                predictions['pred_output_classes'][0],
                predictions['probabilities'][0][0],
                predictions['probabilities'][0][1])
            # Predictions: the probabilities array holds the confidence for each class;
            # class at index 0 is a human, class at index 1 is a bot
            resp = {}
            # int()/float() convert numpy values to native Python types for JSON
            resp['class'] = int(predictions['pred_output_classes'][0])
            resp['percentage0'] = float(predictions['probabilities'][0][0])
            resp['percentage1'] = float(predictions['probabilities'][0][1])
            channel = channel.split("-")
            channel = CHANNEL_REPLY + channel[1]
            red.publish(channel, json.dumps(resp))
        # when there is nothing in the queue, continue to loop
        except queue.Empty:
            continue