# NOTE(review): this section was recovered from a whitespace-mangled source
# line. The two `else:` branches below had lost their matching `if` headers;
# the rank-0 / worker split is reconstructed from `comm.scatter(..., root=0)`
# and the original "tweetlist must be defined" comment -- TODO confirm both
# conditions against version control.
if rank == 0:
    if len(sys.argv) > 2:
        # TODO(review): how `keyword` feeds into loading is not visible here.
        keyword = sys.argv[2]
    else:
        csvFile = 'trainingandtestdata/testdata.csv'
    tweetlist = loadTwitterCSV(csvFile)
    # Partition the tweets into `size` chunks, one per MPI rank.
    tweetlist = chunked(tweetlist, size)
else:
    tweetlist = None  # tweetlist must be defined on every rank for scatter

# Hand each rank its own chunk of tweets to classify.
tweetlist_chunk = comm.scatter(tweetlist, root=0)

# Aggregate a per-day sentiment score: +1 for a 'pos' classification,
# -1 for anything else, keyed by YYYYMMDD.
sentiments = {}
for tweet in tweetlist_chunk:
    sentiment = classifier.classify_tweet(tweet['text'])
    # Strip the timezone token before parsing -- the strptime format below
    # has no timezone field.
    tweetDate = tweet['date'].replace('PDT ', '')
    tweetDate = tweetDate.replace('UTC ', '')
    tweetDate = tweetDate.replace('GMT ', '')
    date = datetime.strptime(tweetDate, '%a %b %d %H:%M:%S %Y')
    dayDate = date.strftime('%Y%m%d')  # day-bucket key, e.g. '20090621'
    sentimentValue = 1 if sentiment == 'pos' else -1
    sentiments[dayDate] = sentiments.get(dayDate, 0) + sentimentValue
tweetlist = loadTwitterCSV('trainingandtestdata/testdata.csv') labeld_features = label_feats_from_tweets(tweetlist) #training_set, test_set = split_label_feats(labeld_features) tweetlist = loadTwitterCSV('trainingandtestdata/training.1600000.processed.noemoticon.csv') training_set = label_feats_from_tweets(tweetlist) training_set, garbage = split_label_feats(training_set, 1.0) test_set, garbage = split_label_feats(labeld_features, 1.0) print "training set length: %i test set length: %i" % (len(training_set), len(test_set)) #print prettifyFeatureSet(test_set) if args.algo == 'nb': classifier = NBSentimentClassifier().train(training_set) print "training NaiveBayes classifier..." else: classifier = MaxEntSentimentClassifier().train(training_set) print "training MaxEnt classifier..." print "calculating accuracy..." print 'accuracy:', classifier.test_accuracy(test_set) #classifier.show_most_informative_features(30) classifier.save_model() # load a serialized trained classifier #classifier = NBSentimentClassifier().load_model() #classifier = MaxEntSentimentClassifier().load_model() classifier.classify_tweet("Python rocks!!!", True)