def __iter__(self):
    # Stream tokenized tweets one at a time so the corpus never has to fit in memory.
    for (fname, pos) in self.files:
        for line in gzip.open(fname, 'rb'):
            tweet = line.split('\t')[pos]
            tweet = preprocess_tweet(tweet)
            tweet = self.tknzr.tokenize(tweet.decode('utf-8'))
            # Drop any token that still contains whitespace after tokenization.
            yield filter(lambda word: ' ' not in word, tweet)
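# --- Usage sketch (not part of the original code) ----------------------------
# The iterator above has the shape gensim expects from a streaming corpus: a
# restartable iterable that yields one token list per tweet. A minimal wrapper
# could feed it straight into Word2Vec. The class name, file list and
# hyperparameters below are illustrative assumptions, not values from the repo
# (preprocess_tweet and the whitespace filter are omitted for brevity).
import gzip
from nltk.tokenize import TweetTokenizer
from gensim.models import Word2Vec

class TweetCorpus(object):  # hypothetical wrapper around the __iter__ above
    def __init__(self, files):
        self.files = files  # list of (gzip_fname, tweet_column) pairs
        self.tknzr = TweetTokenizer(reduce_len=True)

    def __iter__(self):
        for (fname, pos) in self.files:
            for line in gzip.open(fname, 'rb'):
                yield self.tknzr.tokenize(line.split('\t')[pos].decode('utf-8'))

corpus = TweetCorpus([('semeval/smiley_tweets_small.gz', 0)])
model = Word2Vec(corpus, size=100, window=5, min_count=5, workers=4)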
def load_data(fname):
    tid, topics, tweets, sentiments = [], [], [], []
    tknzr = TweetTokenizer(reduce_len=True)
    n_not_available = 0
    with open(fname) as f:
        for line in f:
            splits = line.split('\t')
            tweet = splits[3]
            sentiment = convertSentiment(splits[2])
            if tweet != "Not Available\n":
                tid.append(splits[0])
                topic = pts.preprocess_tweet(splits[1])
                topic_tok = tknzr.tokenize(topic.decode('utf-8'))
                topics.append(topic_tok)
                tweet = pts.preprocess_tweet(tweet)
                tweet_tok = tknzr.tokenize(tweet.decode('utf-8'))
                tweets.append(tweet_tok)
                sentiments.append(int(sentiment))
            else:
                n_not_available += 1
    print "Number of not available tweets:", n_not_available
    return tid, topics, tweets, sentiments
def load_data(fname, pos):
    tid, tweets, sentiments = [], [], []
    tknzr = TweetTokenizer(reduce_len=True)
    with open(fname) as f:
        for line in f:
            splits = line.split('\t')
            tweet = splits[pos + 1]
            sentiment = convertSentiment(splits[pos])
            tid.append(splits[0])
            tweet = pts.preprocess_tweet(tweet)
            tweet_tok = tknzr.tokenize(tweet.decode('utf-8'))
            tweets.append(tweet_tok)
            sentiments.append(int(sentiment))
    return tid, tweets, sentiments
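# --- Hedged sketch (assumption) -----------------------------------------------
# Both loaders above depend on a convertSentiment helper that is not shown in
# this section. SemEval task files label tweets as positive/neutral/negative,
# so a minimal stand-in could map those strings to integer classes; the exact
# indices used by the original code are unknown and assumed here.
def convertSentiment(sentiment):
    mapping = {'positive': 2, 'neutral': 1, 'negative': 0}
    return mapping.get(sentiment.strip(), 1)  # default to neutral when unseen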
def main():
    input_fname = 'small'
    if len(sys.argv) > 1:
        input_fname = sys.argv[1]
    print input_fname

    input_file = 'semeval/smiley_tweets_{}.gz'.format(input_fname)
    output_file = 'semeval/smiley_tweets_{}_balanced.gz'.format(input_fname)
    read_emo('emoscores')

    counter = 0
    pos_counter = 0
    neg_counter = 0
    pos_queue = deque()
    neg_queue = deque()

    f_out = gzip.open(output_file, 'w')
    with gzip.open(input_file, 'r') as f:
        for tweet in f:
            tweet, sentiment = convert_sentiment(tweet, trim=False)
            tweet = preprocess_tweet(tweet)
            if sentiment == 0:
                pos_queue.append(tweet)
                pos_counter += 1
            if sentiment == 1:
                neg_queue.append(tweet)
                neg_counter += 1
            counter += 1
            # Emit tweets in positive/negative pairs so the output stays balanced.
            while len(neg_queue) > 0 and len(pos_queue) > 0:
                pos_tweet = pos_queue.popleft()
                neg_tweet = neg_queue.popleft()
                f_out.write(pos_tweet)
                f_out.write(neg_tweet)
            if (counter % 100000) == 0:
                print "Elements processed:", counter
                print "Pos tweets:", pos_counter
                print "Neg tweets:", neg_counter
    f_out.close()
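# --- Hedged sketch (assumption) -----------------------------------------------
# convert_sentiment and read_emo are not shown in this section. Judging from
# read_emo('emoscores') and the 0/1 labels consumed above, convert_sentiment
# presumably derives a distant label from the emoticons a tweet contains and
# can strip them; the table name, file layout and label convention below are
# all assumptions.
emo_scores = {}  # filled by read_emo: emoticon -> 0 (positive) or 1 (negative)

def read_emo(fname):
    with open(fname) as f:
        for line in f:
            emoticon, score = line.rstrip('\n').split('\t')
            emo_scores[emoticon] = int(score)

def convert_sentiment(tweet, trim=True):
    sentiment = -1  # -1: no emoticon found, so no distant label for this tweet
    for emoticon, score in emo_scores.items():
        if emoticon in tweet:
            sentiment = score
            if trim:
                tweet = tweet.replace(emoticon, '')
    return tweet, sentiment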
def main():
    outdir = "semeval_parsed_200M"
    print outdir
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # supervised data
    train = "semeval/task-B-train-plus-dev.tsv"
    test = "semeval/task-B-test2014-twitter.tsv"
    dev = "semeval/twitter-test-gold-B.downloaded.tsv"
    test15 = "semeval/task-B-test2015-twitter.tsv"
    train16 = "semeval/task-A-train-2016.tsv"
    dev2016 = "semeval/task-A-dev-2016.tsv"
    devtest2016 = "semeval/task-A-devtest-2016.tsv"
    test2016 = "semeval/SemEval2016-task4-test.subtask-A.txt"

    # unsupervised data
    smiley_tweets_200M = 'semeval/smiley_tweets_200M.gz'

    alphabet = Alphabet(start_feature_id=0)
    alphabet.add('UNKNOWN_WORD_IDX')
    dummy_word_idx = alphabet.fid
    tknzr = TweetTokenizer(reduce_len=True)

    fnames = [
        (train, 3),
        (dev, 3),
        (test, 3),
        (test15, 3),
        (train16, 2),
        (dev2016, 2),
        (devtest2016, 2),
        (test2016, 2)
    ]
    fnames_gz = [smiley_tweets_200M]

    counter = 0
    for (fname, pos) in fnames:
        with open(fname, 'r') as f:
            for line in f:
                tweet = line.split('\t')[pos]
                tweet, _ = convert_sentiment(tweet)
                tweet = tknzr.tokenize(preprocess_tweet(tweet))
                for token in tweet:
                    alphabet.add(token)
    print len(alphabet)

    for fname in fnames_gz:
        with gzip.open(fname, 'r') as f:
            for tweet in f:
                tweet, _ = convert_sentiment(tweet)
                tweet = tknzr.tokenize(preprocess_tweet(tweet))
                for token in tweet:
                    alphabet.add(token)
                counter += 1
                if (counter % 1000000) == 0:
                    print 'Processed tweets:', counter
    print len(alphabet)

    print 'Alphabet before purge:', len(alphabet)
    alphabet.purge_dict(min_freq=10)
    print 'Alphabet after purge:', len(alphabet)
    cPickle.dump(alphabet, open(os.path.join(outdir, 'vocab.pickle'), 'wb'))
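# --- Hedged sketch (assumption) -----------------------------------------------
# The Alphabet vocabulary class ships with the repo and is not shown in this
# section. A minimal stand-in that supports the calls above (add, fid, len,
# purge_dict, pickling) could look like this; the frequency-tracking details
# are assumed.
class Alphabet(dict):
    def __init__(self, start_feature_id=1):
        self.fid = start_feature_id  # next free feature id

    def add(self, item):
        idx, freq = self.get(item, (None, 0))
        if idx is None:  # first occurrence: assign a fresh id
            idx = self.fid
            self.fid += 1
        self[item] = (idx, freq + 1)
        return idx

    def purge_dict(self, min_freq=10):
        # Drop rare tokens; ids of the survivors are kept as-is.
        for token in list(self.keys()):
            if self[token][1] < min_freq:
                del self[token]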