# Writes, for each topic, a serialized file of tweets paired with their
# token-index lists. Assumes readToks(), TOPICS, Tweets (a protobuf-style
# message class), and tokenize() are defined elsewhere in this module.
def writeToksToFile():
    tokens, tweets_on_topic, tweets = readToks()
    for topic in TOPICS:
        tokenized_tweets = Tweets()
        for index in tweets_on_topic[topic]:
            tweet = tweets[index]
            tokenized = tokenized_tweets.tweets.add()
            tokenized.tweet = tweet['text']
            for token in tokenize(tweet['text']):
                try:
                    # Map each token to its vocabulary index; renamed from
                    # `index` to avoid shadowing the tweet-index loop variable.
                    token_index = tokens.index(token)
                    tokenized.tokens.append(token_index)
                except ValueError:
                    # -1 marks out-of-vocabulary tokens.
                    tokenized.tokens.append(-1)
            print(tokenized.tokens)
        # Serialize all tokenized tweets for this topic to a binary file.
        with open(topic + '.tweets', 'wb') as f:
            f.write(tokenized_tweets.SerializeToString())
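# A minimal sketch of a faster lookup, assuming `tokens` is the same vocabulary
# list as above: tokens.index() is a linear scan per token, so precomputing a
# token -> index dict once makes each lookup O(1). `token_to_index` and
# `indices_for` are hypothetical names, not part of the original code.
token_to_index = {token: i for i, token in enumerate(tokens)}

def indices_for(text):
    # Returns the vocabulary index of each token, -1 for out-of-vocabulary tokens.
    return [token_to_index.get(token, -1) for token in tokenize(text)]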
from collections import defaultdict

# Index tweets by topic: a tweet belongs to a topic if any of that topic's
# keywords occurs in its lowercased text.
tweets_on_topic = defaultdict(list)
for topic in topics:
    for index, tweet in enumerate(tweets):
        for keyword in keywords[topic]:
            if keyword in tweet['text'].lower():
                tweets_on_topic[topic].append(index)
                break  # One matching keyword is enough; avoid duplicate indices.

for topic in topics:
    tokenized_tweets = Tweets()
    for index in tweets_on_topic[topic]:
        tweet = tweets[index]
        tokenized = tokenized_tweets.tweets.add()
        tokenized.tweet = tweet['text']
        for token in tokenize(tweet['text']):
            try:
                # Renamed from `index` to avoid shadowing the loop variable.
                token_index = tokens.index(token)
                tokenized.tokens.append(token_index)
            except ValueError:
                # -1 marks out-of-vocabulary tokens.
                tokenized.tokens.append(-1)
    with open(topic + '.tweets', 'wb') as f:
        f.write(tokenized_tweets.SerializeToString())
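# A minimal sketch of reading one topic's file back, assuming Tweets is a
# generated protobuf message class (it exposes SerializeToString above, so
# ParseFromString is the standard counterpart for deserialization).
# `readToksFromFile` is a hypothetical helper, not part of the original code.
def readToksFromFile(topic):
    tokenized_tweets = Tweets()
    with open(topic + '.tweets', 'rb') as f:
        tokenized_tweets.ParseFromString(f.read())
    return tokenized_tweets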