def writeToksToFile():
    # Serialise the tokenised tweets of each topic into a per-topic protobuf file.
    tokens, tweets_on_topic, tweets = readToks()
    for topic in TOPICS:
        tokenized_tweets = Tweets()
        for index in tweets_on_topic[topic]:
            tweet = tweets[index]
            tokenized = tokenized_tweets.tweets.add()
            tokenized.tweet = tweet['text']
            for token in tokenize(tweet['text']):
                # store the vocabulary index of each token, or -1 if it is unknown
                try:
                    tokenized.tokens.append(tokens.index(token))
                except ValueError:
                    tokenized.tokens.append(-1)
            print(tokenized.tokens)
        with open(topic + '.tweets', "wb") as f:
            f.write(tokenized_tweets.SerializeToString())
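# A minimal readback sketch for the per-topic files written by writeToksToFile().
# The helper name readToksFromFile is hypothetical (not part of the original
# module); it only assumes Tweets is the protobuf message used above, with a
# repeated `tweets` field whose entries carry `tweet` and `tokens`.
def readToksFromFile(topic):
    tokenized_tweets = Tweets()  # same protobuf message as above (assumption)
    with open(topic + '.tweets', "rb") as f:
        tokenized_tweets.ParseFromString(f.read())
    # e.g. tokenized_tweets.tweets[0].tweet holds the raw text and
    # tokenized_tweets.tweets[0].tokens the vocabulary indices (-1 for unknown)
    return tokenized_tweets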
def convertTweetsOfficialToVec(numtoks, tokens, tweets, filtering=False, phrasemodelpath="phrase.model"):
    # Convert raw tweet strings into bag-of-words vectors over the `numtoks` most
    # frequent tokens. The most frequent tokens have a low index, so the cutoff
    # keeps them (original vocabulary size: 93988).
    tokens_sub = tokens[:numtoks]
    tokenized_tweets = Tweets()
    vects = []
    norm_tweets = []
    if filtering:
        bigram = Phrases.load(phrasemodelpath)  # load the saved gensim phrase (bigram) model
    for tweet in tweets:
        vect = np.zeros(numtoks)
        norm_tweet = []
        tokenized = tokenized_tweets.tweets.add()
        tokenized.tweet = tweet
        if not filtering:
            tokenised_tweet = tokenize(tokenized.tweet)
        else:
            # lowercase, drop stopwords, then apply the phrase model
            filtered = filterStopwords(tokenize(tokenized.tweet.lower()))
            tokenised_tweet = bigram[filtered]
        for token in tokenised_tweet:
            try:
                index = tokens_sub.index(token)
            except ValueError:
                index = -1
            if index > -1:
                vect[index] = 1
                norm_tweet.append(token)
            else:
                norm_tweet.append('NULL')
        #print(norm_tweet)
        norm_tweets.append(norm_tweet)
        vects.append(vect)
    return vects, norm_tweets
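# Hypothetical usage sketch for convertTweetsOfficialToVec(). It assumes readToks()
# returns the frequency-sorted token list used above and that the official test
# tweets are available as a plain list of strings; the 50000 cutoff and the
# example tweets are illustrative only.
def _demoConvertOfficial():
    tokens, tweets_on_topic, tweets = readToks()
    official_tweets = ["Example tweet one", "Example tweet two"]
    vects, norm_tweets = convertTweetsOfficialToVec(50000, tokens, official_tweets)
    # one vector and one normalised token list per input tweet
    print(len(vects), len(norm_tweets))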
# Module-level code; `tokens`, `tweets`, `topics` and `keywords` are assumed to be
# defined earlier in the file.
# Debugging leftovers (these halted execution here and made the code below unreachable):
#print(len(tokens))
#sys.exit()

# Collect, for every topic, the indices of the tweets that mention one of its keywords.
tweets_on_topic = defaultdict(list)
for topic in topics:
    for index, tweet in enumerate(tweets):
        for keyword in keywords[topic]:
            if keyword in tweet['text'].lower():
                tweets_on_topic[topic].append(index)
                break

# Tokenise the tweets of each topic and serialise them to a per-topic protobuf file.
for topic in topics:
    tokenized_tweets = Tweets()
    for index in tweets_on_topic[topic]:
        tweet = tweets[index]
        tokenized = tokenized_tweets.tweets.add()
        tokenized.tweet = tweet['text']
        for token in tokenize(tweet['text']):
            # store the vocabulary index of each token, or -1 if it is unknown
            try:
                tokenized.tokens.append(tokens.index(token))
            except ValueError:
                tokenized.tokens.append(-1)
    with open(topic + '.tweets', "wb") as f:
        f.write(tokenized_tweets.SerializeToString())
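# Small self-contained sketch of the keyword matching step above, for illustration
# only. The topic name and keyword list are made up; the real `keywords` dict is
# assumed to map each topic to a list of lowercase keyword strings.
def _demoKeywordMatching():
    sample_keywords = {'climate': ['climate', 'global warming']}
    sample_tweets = [{'text': 'Climate change is real'}, {'text': 'Nice weather today'}]
    matched = defaultdict(list)
    for sample_topic in sample_keywords:
        for i, sample_tweet in enumerate(sample_tweets):
            for keyword in sample_keywords[sample_topic]:
                if keyword in sample_tweet['text'].lower():
                    matched[sample_topic].append(i)
                    break
    print(dict(matched))  # expected output: {'climate': [0]}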
def convertTweetsToVec(topic="all", numtoks='all', phrasemodel=False, phrasemodelpath="phrase.model"):
    # Convert the tweets read by readToks() into boolean bag-of-words vectors over
    # the `numtoks` most frequent tokens. The most frequent tokens have a low index,
    # so the cutoff keeps them (original vocabulary size: 93988).
    print("Reading tokens")
    tokens, tweets_on_topic, tweets = readToks(phrasemodel)
    if phrasemodel:
        bigram = Phrases.load(phrasemodelpath)  # load the saved gensim phrase (bigram) model
    if numtoks != "all":
        tokens_sub = tokens[:numtoks]
    else:
        tokens_sub = tokens
        numtoks = len(tokens)  # was tokens.__sizeof__(), which returns a byte count, not the number of tokens
    tokenized_tweets = Tweets()
    vects = []
    norm_tweets = []
    print("Converting JSON tweets")
    if topic == 'all':
        #for topic in TOPICS:
        for tweet in tweets:
            vect = np.zeros(numtoks, dtype=bool)
            norm_tweet = []
            tokenized = tokenized_tweets.tweets.add()
            tokenized.tweet = tweet['text']
            if not phrasemodel:
                tokenised_tweet = tokenize(tweet['text'])
            else:
                # lowercase, drop stopwords, then apply the phrase model
                filtered = filterStopwords(tokenize(tweet['text'].lower()))
                tokenised_tweet = bigram[filtered]
            for token in tokenised_tweet:
                try:
                    index = tokens_sub.index(token)
                except ValueError:
                    index = -1
                if index > -1:
                    vect[index] = 1
                    norm_tweet.append(token)
                else:
                    norm_tweet.append('NULL')
            #print(norm_tweet)
            norm_tweets.append(norm_tweet)
            vects.append(vect)
    else:  # discouraged, needs to be updated
        for index in tweets_on_topic[topic]:
            tweet = tweets[index]
            vect = np.zeros(numtoks)
            norm_tweet = []
            tokenized = tokenized_tweets.tweets.add()
            tokenized.tweet = tweet['text']
            for token in tokenize(tweet['text']):
                try:
                    tok_index = tokens_sub.index(token)
                except ValueError:
                    tok_index = -1
                if tok_index > -1:
                    vect[tok_index] = 1
                    norm_tweet.append(token)
                else:
                    norm_tweet.append('NULL')
            print(norm_tweet)
            norm_tweets.append(norm_tweet)
            vects.append(vect)
    print("Finished converting JSON tweets")
    return tokens_sub, vects, norm_tweets
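# Hypothetical usage sketch: stack the boolean bag-of-words vectors returned by
# convertTweetsToVec() into a single numpy matrix for a downstream classifier.
# The 50000-token cutoff is illustrative.
def _demoConvertTweetsToVec():
    tokens_sub, vects, norm_tweets = convertTweetsToVec(topic="all", numtoks=50000)
    matrix = np.asarray(vects)  # shape: (number of tweets, 50000), dtype bool
    print(matrix.shape, len(norm_tweets))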