import sys
import time

import utils


def process_song(method, label):
    # Timing harness: run `method` on the eval/training sets and report runtime.
    print("process_song starting %s" % label)
    TRAINING_SET = utils.read_tweets(sys.argv[1])
    EVAL_SET = utils.read_tweets(sys.argv[2])
    start_time = time.time()
    method(EVAL_SET, TRAINING_SET)
    end_time = time.time()
    print("done with %s after %.3f seconds" % (label, end_time - start_time))
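# utils.read_tweets appears throughout these snippets (with varying
# signatures) but its implementation is never shown. A minimal sketch,
# assuming one JSON-encoded tweet per line -- the storage format is an
# assumption, not confirmed by the source:
import json

def read_tweets_sketch(filename):
    """Hypothetical stand-in for utils.read_tweets: a JSON-lines reader."""
    tweets = []
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if line:
                tweets.append(json.loads(line))
    return tweets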
def load_train(self):
    # Load and prepare training tweets
    print("Loading training data")
    neg_tweets = u.read_tweets(self.trainneg)
    pos_tweets = u.read_tweets(self.trainpos)
    train_tweets = self.prepare_data(neg_tweets + pos_tweets, True)
    labels = np.array(len(neg_tweets) * [self.neg] + len(pos_tweets) * [self.pos])
    return train_tweets, labels
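# prepare_data is not shown in either load_train variant. A minimal sketch of
# what such a preprocessing step often looks like for tweets -- lowercasing,
# whitespace tokenization, and URL stripping are assumptions; the real
# pipeline may differ. The boolean flag (train vs. test mode) is accepted but
# unused in this sketch:
def prepare_data_sketch(raw_tweets, train=True):
    prepared = []
    for tweet in raw_tweets:
        tokens = tweet.lower().split()
        # drop bare URLs, a common tweet-cleaning step
        tokens = [t for t in tokens if not t.startswith('http')]
        prepared.append(tokens)
    return prepared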
def main():
    tweets = utils.read_tweets()
    filtered = filter_classes(tweets)
    analyzer = SentimentAnalyzer()
    train_group, eval_group = split_train_eval(filtered)
    train_group.update(eval_group)
    # for classy in train_group:
    #     print(classy + "\t" + str(len(train_group[classy])))
    analyzer.train_on_filtered(filtered)
    tweets = utils.read_tweets()
    analyzeByState(analyzer, tweets)
def max_tweet_id(filename):
    # Return the highest tweet id stored in tweets/<filename>, or 0 if the
    # file does not exist yet.
    fname = 'tweets/%s' % filename
    if os.path.isfile(fname):
        tweets = utils.read_tweets(fname)
        return max(tweet['id'] for tweet in tweets)
    else:
        return 0
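# One plausible use of max_tweet_id is incremental scraping: pass the largest
# id already on disk as since_id so the API only returns newer tweets. A
# sketch assuming tweepy 3.x and an already-authenticated `api` object; the
# query and filename pattern are illustrative, not from the source:
def scrape_newer(api, keyword):
    since = max_tweet_id('tweets.%s.json' % keyword)
    # tweepy treats since_id=None as "no lower bound"
    return api.search(q=keyword, since_id=since or None)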
def load_train(self):
    # Load and prepare training tweets.
    # Two data sources require some tricks to comply with the interface.
    print("Loading training data")
    neg_tweets = u.read_tweets(self.trainneg)
    pos_tweets = u.read_tweets(self.trainpos)
    print("Reading external data")
    extraw = u.process_extData(self.extDataPath)
    extDat_size = len(extraw)
    self.exttweets = [l[0] for l in extraw]
    self.extlabels = np.asarray([l[1] for l in extraw])
    train_tweets = self.prepare_data(self.exttweets + neg_tweets + pos_tweets, True)
    # Save processed external tweets and return processed train tweets + labels
    self.exttweets = train_tweets[:extDat_size]
    labels = np.array(len(neg_tweets) * [self.neg] + len(pos_tweets) * [self.pos])
    return train_tweets[extDat_size:], labels
def tocsv(lang_detection, include_current, data_path=RAW_TWEET_DIR, out_path=CSV_TWEET_DIR):
    """Convert raw JSON tweet files to CSV."""
    # note: lang_detection is accepted but unused in the code shown here
    lst = []
    raw_files = glob.glob(data_path + "/gn_tweet_*.txt")
    raw_files.sort(key=os.path.getmtime)
    csv_files = [
        name[:-4].replace(out_path + "/", "")
        for name in glob.glob(out_path + "/gn_tweet_*utc.csv")
    ]
    print(csv_files)
    # drop the current (still-running) scrape unless asked to include it
    raw_files = raw_files if include_current else raw_files[:-1]
    try:
        print('Start process...')
        for filename in raw_files:
            json_vs_csv = filename.split("/")
            name = json_vs_csv[-1].split(".")[0]  # file name without extension
            if name not in csv_files:  # the CSV does not exist yet
                # convert to csv
                print(name)
                if read_tweets(filename):
                    lst.append(name)
            else:
                print(name, "has already been processed")
    except Exception as e:
        print('Process aborted', e)
    finally:
        print('...End process')
    return lst
import utils
import lstm_tr
import numpy as np

if __name__ == '__main__':
    tweets, words, chars, hashtags = utils.read_tweets("2018-E-c-En-train.txt")
    word_index, char_index, hashtag_index = utils.make_vectors_train(
        tweets, words, chars, hashtags)
    utils.save_indices(word_index, char_index, hashtag_index, "indices")
    model = lstm_tr.train(tweets, words, chars, hashtags, 100)
    # Earlier experiments with other LSTM backends, kept for reference:
    # for i in range(30):
    #     model = lstm_dy.Twitter(len(words)+1, len(chars)+1, 300, 50, 100, 10, 2, 50, 11)
    #     model.train(tweets)
    #     model.save(str(i))
    # model = lstm_kr.Twitter(len(words)+1, len(chars)+1, 50, 50, 300, 300, 100, 100, 100, 11)
    # X_train = np.array([t.cont_vec for t in tweets])
    # y_train = np.array([t.emotions for t in tweets])
    # model.train(X_train, y_train, model)
# evaluate the model on the training set
print('predicting on training set...')
train_pred = forest.predict(train_data)
train_score = roc_auc_score(train_labels, train_pred)
print('train score = %.6f' % train_score)

# evaluate the model on the held-out validation set
print('predicting on validation set...')
valid_pred = forest.predict(valid_data)
valid_score = roc_auc_score(valid_labels, valid_pred)
print('validation score = %.6f' % valid_score)

if __name__ == '__main__':
    root = getcwd()
    datafile = join(root, 'data', 'tweets_clean.csv')
    tweetsfile = join(root, 'data', 'tweets_clean.pickle')
    # where to save the trained model and words-to-feature encoder
    modelfile = join(root, 'data', 'model.pickle')
    vectorfile = join(root, 'data', 'vectorizer.pickle')
    clean_tweets, clean_tweets_sentiments = read_tweets(datafile, tweetsfile)
    clean_tweets = np.array(clean_tweets)
    clean_tweets_sentiments = np.array(clean_tweets_sentiments)
    # because we cannot train on all the tweets, select a random subset here
    num_tweets = 5000
    random_indices = np.random.choice(clean_tweets.shape[0], size=num_tweets,
                                      replace=False)
    learn_sentiment_from_tweets(clean_tweets[random_indices],
                                clean_tweets_sentiments[random_indices],
                                modelfile, vectorfile, retrain=True)
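# The evaluation above assumes `forest`, `train_data`, and the validation
# split already exist; they are presumably built inside
# learn_sentiment_from_tweets, which is not shown. A minimal sketch of a
# plausible scikit-learn setup -- the vectorizer choice and hyperparameters
# are illustrative guesses, not the author's actual configuration:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

def fit_forest_sketch(tweets, sentiments):
    vectorizer = CountVectorizer(max_features=5000)
    features = vectorizer.fit_transform(tweets)
    train_data, valid_data, train_labels, valid_labels = train_test_split(
        features, sentiments, test_size=0.2, random_state=0)
    forest = RandomForestClassifier(n_estimators=100)
    forest.fit(train_data, train_labels)
    return forest, vectorizer, (train_data, train_labels), (valid_data, valid_labels)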
# Loop over the words in the extraction corpus.
# TODO: determine how to include things like retweet count.
for term in self.class1.getTerms():
    # build the 2x2 chi-squared contingency table
    n11 = float(self.class1.getTermCount(term))
    n10 = float(self.class2.getTermCount(term))
    n01 = float(self.class1.getDocCount() - n11)
    n00 = float(self.class2.getDocCount() - n10)
    # perform the chi-squared calculation and store the score in the dictionary
    total = n11 + n10 + n01 + n00
    top = ((n11 * n00) - (n10 * n01)) ** 2
    bottom = (n11 + n01) * (n11 + n10) * (n10 + n00) * (n01 + n00)
    chi = (total * top) / bottom
    scores[term] = chi
# note, for format:
# for (v, k) in scores:
#     print(str(k) + " : " + str(v))
return scores

if __name__ == "__main__":
    cfs = ChiFeatureSelector(utils.read_tweets(sys.argv[1]),
                             utils.read_tweets(sys.argv[2]))
    print('Features written to features.%d.json' % os.getpid())
    with open('features.%d.json' % os.getpid(), 'w') as output:
        print(ujson.dumps(cfs.getScores()), file=output)
    print('Sorted features written to features.sort.%d.json' % os.getpid())
    with open('features.sort.%d.json' % os.getpid(), 'w') as output:
        print(ujson.dumps(sorted(cfs.getScores().items(),
                                 key=operator.itemgetter(1),
                                 reverse=True)),
              file=output)
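# The hand-rolled statistic above is the standard 2x2 chi-squared test
# without Yates' correction. A quick sanity check against scipy, using
# made-up counts. (Also note the manual formula divides by zero whenever a
# marginal total is zero, e.g. for a term that appears in every document.)
from scipy.stats import chi2_contingency

n11, n10, n01, n00 = 30.0, 10.0, 70.0, 190.0
total = n11 + n10 + n01 + n00
manual = (total * ((n11 * n00) - (n10 * n01)) ** 2) / (
    (n11 + n01) * (n11 + n10) * (n10 + n00) * (n01 + n00))
scipy_chi2, _, _, _ = chi2_contingency([[n11, n10], [n01, n00]], correction=False)
assert abs(manual - scipy_chi2) < 1e-6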
if __name__=="__main__": try: scrape() cfs = ChiFeatureSelector('trending.%d.json'%os.getpid(), 'nontrending.%d.ujson'%os.getpid()) except: classify = classifier.HashtagClassifier() classify.condProb = utils.read_conf('classifierTrained.json') classify.prior = utils.read_conf('classifier_prior.json') while True: keyword = re.sub("""[\s/:*"<>?|\\.;'\[\]]+""", '', inputs()) if not keyword: print 'Please enter a valid phrase' continue try: scrapeTrends.search_tweet(keyword) except tweepy.TweepError: print 'Please enter a valid phrase' continue try: tweets = utils.read_tweets('tweets/tweets.%(name)s.json'%{'name':keyword}) except: print 'could not classify keyword' continue #try: print classify.classify(Tweets(tweets)) #except: # print ''
    return dict(
        tweets=tweets,
        author=settings["author"],
        agree_to_honor_code=settings["agree_to_honor_code"],
        count=len(tweets),
        time=end_time - start_time,
    )

@bottle.route("/")
def index():
    return bottle.static_file("index.html", root="static")

@bottle.route("/favicon.ico")
def favicon():
    return bottle.static_file("favicon.ico", root="static")

@bottle.route("/static/<filename:path>")
def server_static(filename):
    return bottle.static_file(filename, root="static")

if __name__ == "__main__":
    db = utils.connect_db("msl", True)
    _searcher = tweetsearch.TweetSearch(db)
    _searcher.index_tweets(utils.read_tweets())
    bottle.run(host=settings["http_host"], port=settings["http_port"], reloader=True)
import utils
import torch
import numpy as np
import lstm_dy

if __name__ == '__main__':
    tweets_test, w, c = utils.read_tweets("2018-E-c-En-dev.txt")
    model = torch.load("26.model")
    # model = lstm_dy.Twitter.load("29")
    word_index, char_index, hashtag_index = utils.load_indices("indices")
    utils.make_vectors_test(tweets_test, word_index, char_index, hashtag_index)
    score = 0
    print("ID Tweet anger anticipation disgust fear joy love optimism pessimism sadness surprise trust")
    for t in tweets_test:
        prediction = [1 if i > 0 else 0 for i in model(t).data.numpy()]
        # prediction = model.predict(t)
        t.emotions = prediction
        print(t)
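# The `i > 0` cut-off above suggests model(t) returns raw logits: a logit
# above 0 is the same as a sigmoid probability above 0.5. An equivalent,
# more explicit version -- that the outputs are logits is an assumption,
# not stated in the source:
import torch

def predict_labels(model, tweet, threshold=0.5):
    probs = torch.sigmoid(model(tweet)).data.numpy()
    return [1 if p > threshold else 0 for p in probs]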
def main():
    tweets = utils.read_tweets()
    getRetweetCounts(tweets)
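# getRetweetCounts is not defined in this snippet. A plausible sketch that
# tallies retweet counts per tweet id; the field names follow the classic
# Twitter API JSON layout, which is an assumption:
def getRetweetCounts_sketch(tweets):
    counts = {}
    for tweet in tweets:
        counts[tweet['id']] = tweet.get('retweet_count', 0)
    return counts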
def main():
    tweets = utils.read_tweets()
    h = HITS()
    h.hubs_and_authorities(tweets)
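# The HITS implementation itself is not shown above. A minimal sketch of the
# power iteration over a generic adjacency dict {node: [nodes it points to]};
# how the graph is built from tweets (e.g. an edge from retweeter to original
# author) is an assumption, not taken from the source:
def hits_sketch(graph, iterations=50):
    hubs = {n: 1.0 for n in graph}
    auths = {}
    for _ in range(iterations):
        # authority update: sum of hub scores of nodes linking in
        auths = {}
        for n, targets in graph.items():
            for t in targets:
                auths[t] = auths.get(t, 0.0) + hubs[n]
        # hub update: sum of authority scores of nodes linked to
        for n, targets in graph.items():
            hubs[n] = sum(auths.get(t, 0.0) for t in targets)
        # L2-normalize both score vectors so they stay bounded
        for d in (hubs, auths):
            norm = sum(v * v for v in d.values()) ** 0.5 or 1.0
            for k in d:
                d[k] /= norm
    return hubs, auths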
        score1 = int(score1)
        score2 = int(score2)
        prev_score1, prev_score2 = previous_score
        if score1 - prev_score1 + score2 - prev_score2 == 1:
            scorer = country1 if score1 > prev_score1 else country2
            previous_score = score1, score2
            yield country1, score1, country2, score2, scorer
            last_goal = current_time
        time.sleep(delta)
        current_time += datetime.timedelta(seconds=delta)
        current_counter = counter

if __name__ == '__main__':
    tweet_filename = 'data/France_Roumanie_2016-06-10_21h_en.filtered.json'
    data = read_tweets(tweet_filename)
    for country1, score1, country2, score2, scorer in follow_euro_2016(
            None, None, data):
        scored_against = country1 if scorer == country2 else country2
        print('{scorer} just scored against {against}! - '
              'new score: {country1} {score1}-{score2} {country2}'.format(
                  scorer=scorer, against=scored_against,
                  country1=country1, country2=country2,
                  score1=score1, score2=score2))
        sys.stdout.flush()
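# The detection condition above fires only when the combined score increased
# by exactly one since the previous tick, so a two-goal jump inside a single
# tick would be missed. A tiny worked check (the scores are made up):
prev = (1, 0)
cur = (1, 1)
assert (cur[0] - prev[0]) + (cur[1] - prev[1]) == 1  # one new goal -> detected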
def main():
    filtered = filterFeatures(open('features.json')).keys()
    posTweets = tweets.Tweets(utils.read_tweets('tweets/tweets.Trend.json'))
    negTweets = tweets.Tweets(utils.read_tweets('tweets/tweets.nonTrend.json'))
    for term in filtered:
        pospercent = posTweets.counts[term] / posTweets.docCount if term in posTweets.counts else 0
        # note: the original divided by posTweets.docCount here as well, which
        # looks like a copy-paste slip; each rate should use its own corpus size
        negpercent = negTweets.counts[term] / negTweets.docCount if term in negTweets.counts else 0
        if approx_Equal(pospercent, negpercent):
            continue
        if pospercent > negpercent:
            TRENDING_WORDS.add(term)
        else:
            NONTRENDING_WORDS.add(term)

    print('trending dict written to trend_words.json')
    with open('trend_words.json', 'w') as output:
        # ujson cannot serialize sets, so convert to a list first
        print(ujson.dumps(list(TRENDING_WORDS)), file=output)
    print('nontrending dict written to nontrend_words.json')
    with open('nontrend_words.json', 'w') as output:
        print(ujson.dumps(list(NONTRENDING_WORDS)), file=output)
    # print('trending:\n', TRENDING_WORDS, '\n\n')
    # print('nontrending:', NONTRENDING_WORDS)

    print('Begin training')
    analyzer = HashtagClassifier()
    analyzer.train_on_filtered({'pos': posTweets, 'neg': negTweets})
    print('Trained classifier written to classifierTrained.json')
    with open('classifierTrained.json', 'w') as output:
        print(ujson.dumps(analyzer.condProb), file=output)
    print(ujson.dumps(analyzer.prior))

    confusion = {'positive': 0., 'negative': 0., 'falsepos': 0., 'falseneg': 0.}
    iterations = 10
    filtered = filter_classes(tweets)
    termFreq = {}
    print('starting training')
    for i in range(iterations):
        train_group, eval_group = split_train_eval(filtered)
        analyzer.train_on_filtered(train_group)
        analyzer.classify_filtered(eval_group)
        for key, val in analyzer.lastConfusion.items():
            confusion[key] += float(val) / iterations
        for key, val in analyzer.condProb.items():
            if key in termFreq:
                termFreq[key]['positive'] += val['positive']
                termFreq[key]['negative'] += val['negative']
            else:
                termFreq[key] = val

    # print the averaged confusion matrix
    print('After %i iterations:' % iterations)
    print('\tPosExp\tNegExp')
    print('PosAct\t', confusion['positive'], '\t', confusion['falsepos'])
    print('NegAct\t', confusion['falseneg'], '\t', confusion['negative'])

    posSorted = sorted(termFreq.items(), key=lambda x: x[1]['positive'], reverse=True)
    negSorted = sorted(termFreq.items(), key=lambda x: x[1]['negative'], reverse=True)
    print('Positive correlation words: ')
    for val in posSorted[:25]:
        print(val[0], val[1]['positive'])
    print('\nNegative correlation words: ')
    for val in negSorted[:25]:
        print(val[0], val[1]['negative'])
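# approx_Equal is called in main() above but never defined in this snippet.
# A plausible implementation is a plain absolute-tolerance comparison (the
# tolerance value is a guess):
def approx_Equal(x, y, tolerance=0.001):
    return abs(x - y) <= tolerance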