forked from jderiu/deep-hashtagprediction
parse_tweets.py

import numpy as np
from nltk.tokenize import TweetTokenizer
import gzip
from alphabet import Alphabet
import re
import cPickle
import os
from collections import OrderedDict
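
# This script reads gzipped tweet files (one tweet per line), normalizes each
# tweet, keeps only tweets that contain exactly one hashtag, grows the word and
# hashtag vocabularies (Alphabet), and writes the index-encoded tweets and
# hashtag labels to .npy files in large batches.
# Note: the code targets Python 2 (print statements, cPickle, str.decode).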

def preprocess_tweet(tweet):
    # lowercase, normalize urls and user mentions, and collect hashtags
    tweet = tweet.lower()
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))', '<url>', tweet)
    tweet = re.sub(r'@[^\s]+', '<user>', tweet)
    p = re.compile(r'(#[^\s]+)')
    hashtags = p.findall(tweet)
    try:
        # strip non-ascii characters; leave the tweet unchanged if decoding fails
        tweet = tweet.decode('unicode_escape').encode('ascii', 'ignore')
    except:
        pass
    return tweet, hashtags
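
# Example (illustrative): preprocess_tweet("Check this out http://t.co/abc #nlp @bob")
# returns ("check this out <url> #nlp <user>", ["#nlp"]).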

def convert2indices(data, alphabet, dummy_word_idx, max_sent_length=140):
    # map each tokenized tweet to a fixed-length vector of vocabulary indices
    data_idx = []
    max_len = 0
    unknown_words = 0
    for sentence in data:
        # pad every example with the dummy word index up to max_sent_length
        ex = np.ones(max_sent_length) * dummy_word_idx
        max_len = max(len(sentence), max_len)
        if len(sentence) > max_sent_length:
            print "Sentence length:", len(sentence)
            print sentence
        # truncate over-long sentences so we never index past the padded vector
        for i, token in enumerate(sentence[:max_sent_length]):
            idx = alphabet.get(token, UNKNOWN_WORD_IDX)
            ex[i] = idx
            if idx == UNKNOWN_WORD_IDX:
                unknown_words += 1
        data_idx.append(ex)
    data_idx = np.array(data_idx).astype('int32')
    print "Max length in this batch:", max_len
    print "Number of unknown words:", unknown_words
    return data_idx
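
# Example (illustrative): for a batch of N tokenized tweets, convert2indices
# returns an int32 array of shape (N, 140), padded with the dummy word index
# and truncated at max_sent_length.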

def store_file(f_in, f_out, alphabet_words, alphabet_hashtags, dummy_word_idx, hashtag_fname=None):
    # stores the tweets in batches so everything fits in memory
    tknzr = TweetTokenizer(reduce_len=True)
    counter = 0
    batch_counter = 0
    output = open(f_out, 'wb')
    output_hashtag = open(hashtag_fname, 'wb')
    batch_size = 500000
    tweet_batch = []
    hashtag_batch = []
    with gzip.open(f_in, 'r') as f:
        for tweet in f:
            tweet, hashtags = preprocess_tweet(tweet)
            # keep only tweets with exactly one hashtag; that hashtag is the label
            if len(hashtags) == 1:
                ht = hashtags[0]
                alphabet_hashtags.add(ht)
                ht_idx = alphabet_hashtags.get(ht, UNKNOWN_HASHTAG_IDX)
                # remove the hashtag from the tweet text before tokenizing
                tweet = tweet.replace(ht, '')
                tweet_tok = tknzr.tokenize(tweet.decode('utf-8'))
                tweet_batch.append(tweet_tok)
                hashtag_batch.append(ht_idx)
                batch_counter += 1
                for token in tweet_tok:
                    alphabet_words.add(token)
                if batch_counter >= batch_size:
                    # flush the current batch: each np.save call appends one
                    # array to the already-open output file
                    tweet_idx = convert2indices(tweet_batch, alphabet_words, dummy_word_idx)
                    np.save(output, tweet_idx)
                    np.save(output_hashtag, hashtag_batch)
                    print 'Saved tweets:', tweet_idx.shape
                    tweet_batch = []
                    hashtag_batch = []
                    batch_counter = 0
            counter += 1
            if (counter % 1000000) == 0:
                print "Elements processed:", counter
    # flush the final (partial) batch
    tweet_idx = convert2indices(tweet_batch, alphabet_words, dummy_word_idx)
    np.save(output, tweet_idx)
    np.save(output_hashtag, hashtag_batch)
    print len(alphabet_hashtags)
    print len(alphabet_words)
    print 'Saved tweets:', tweet_idx.shape
    return counter

# index 0 is reserved for unknown words/hashtags, index 1 for the padding word
UNKNOWN_WORD_IDX = 0
UNKNOWN_HASHTAG_IDX = 0
DUMMY_WORD_IDX = 1

def main():
    data_dir = 'tweets/hashtag_top100_smileys_tweets_{}.gz'
    output_dir_tweets = 'parsed_tweets/hashtag_top100_smiley_tweets_{}.tweets.npy'
    output_dir_hashtags = 'parsed_tweets/hashtag_top100_smiley_tweets_{}.hashtags.npy'
    outdir = 'parsed_tweets'

    # reserve index 0 for unknown words and index 1 for the padding (dummy) word
    alphabet_words = Alphabet(start_feature_id=0)
    alphabet_words.add('UNKNOWN_WORD_IDX')
    alphabet_words.add('DUMMY_WORD_IDX')
    dummy_word_idx = DUMMY_WORD_IDX

    alphabet_hashtags = Alphabet(start_feature_id=0)
    alphabet_hashtags.add('UNKNOWN_HASHTAG_IDX')

    # parse the train and test splits with a shared vocabulary
    inp = 'train'
    store_file(data_dir.format(inp), output_dir_tweets.format(inp), alphabet_words, alphabet_hashtags, dummy_word_idx, output_dir_hashtags.format(inp))
    inp = 'test'
    store_file(data_dir.format(inp), output_dir_tweets.format(inp), alphabet_words, alphabet_hashtags, dummy_word_idx, output_dir_hashtags.format(inp))

    # persist the vocabularies for later use
    cPickle.dump(alphabet_words, open(os.path.join(outdir, 'vocab_words.pickle'), 'w'))
    cPickle.dump(alphabet_hashtags, open(os.path.join(outdir, 'vocab_hashtags.pickle'), 'w'))
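
# Reading the output back: store_file calls np.save repeatedly on the same open
# file handle, so each .tweets.npy / .hashtags.npy file holds several arrays,
# one per batch. The helper below is an illustrative sketch (not part of the
# original script) showing how a consumer could iterate over those batches.
def load_batches(fname):
    # yields one saved array per batch until the file is exhausted
    with open(fname, 'rb') as f:
        while True:
            try:
                yield np.load(f)
            except (IOError, ValueError):
                break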

if __name__ == '__main__':
    main()