Example #1
def process_song(method, label):
    print "process_song starting %s" % label
    TRAINING_SET = utils.read_tweets(sys.argv[1])
    EVAL_SET = utils.read_tweets(sys.argv[2])
    start_time = time.time()
    method(EVAL_SET, TRAINING_SET)
    end_time = time.time()
    print "done with %s after %.3f seconds" % (label, end_time - start_time)
Example #2
    def load_train(self):
        # Load and prepare training tweets
        print("Loading training data")
        neg_tweets = u.read_tweets(self.trainneg)
        pos_tweets = u.read_tweets(self.trainpos)

        train_tweets = self.prepare_data(neg_tweets + pos_tweets, True)
        labels = np.array(len(neg_tweets) * [self.neg] + len(pos_tweets) * [self.pos])
        return train_tweets, labels
Example #3
def main():
    tweets = utils.read_tweets()
    filtered = filter_classes(tweets)
    analyzer = SentimentAnalyzer()
    train_group, eval_group = split_train_eval(filtered)  
    train_group.update(eval_group)
    #for classy in train_group:
        #print classy + "\t" + str(len(train_group[classy]))
    analyzer.train_on_filtered(filtered)
    tweets = utils.read_tweets()
    analyzeByState(analyzer,tweets)
Example #4
def max_tweet_id(filename):
    fname = 'tweets/%s'%filename
    if os.path.isfile(fname):
        tweets = utils.read_tweets(fname)
        return max([tweet['id'] for tweet in tweets])
    else:
        return 0
Example #5
    def load_train(self):
        # Load and prepare training tweets
        # Two data sources require some tricks to comply with the interface
        print("Loading training data")
        neg_tweets = u.read_tweets(self.trainneg)
        pos_tweets = u.read_tweets(self.trainpos)

        print("Reading external data")
        extraw = u.process_extData(self.extDataPath)
        extDat_size = len(extraw)
        self.exttweets = [l[0] for l in extraw]
        self.extlabels = np.asarray([l[1] for l in extraw])

        train_tweets = self.prepare_data(self.exttweets + neg_tweets + pos_tweets, True)
        # Save processed external tweets and return processed train tweets + labels
        self.exttweets = train_tweets[:extDat_size]

        labels = np.array(len(neg_tweets) * [self.neg] + len(pos_tweets) * [self.pos])
        return train_tweets[extDat_size:], labels
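The "trick" noted in the comments above is to push both corpora through a single prepare_data call and then split the result by position, so the external tweets receive exactly the same preprocessing as the labelled ones. A minimal sketch of the same prepend-then-slice pattern (preprocess here is a hypothetical stand-in for prepare_data):

def preprocess(texts):
    # hypothetical stand-in for prepare_data: any per-tweet transformation
    return [t.lower() for t in texts]

ext = ["External tweet A", "External tweet B"]
train = ["Negative tweet", "Positive tweet"]

combined = preprocess(ext + train)       # one pass, identical preprocessing for both sources
ext_processed = combined[:len(ext)]      # first len(ext) entries are the external data
train_processed = combined[len(ext):]    # the remainder stays aligned with the original train order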
Example #6
def tocsv(lang_detection,
          include_current,
          data_path=RAW_TWEET_DIR,
          out_path=CSV_TWEET_DIR):
    """
  convert json to csv
  """
    lst = []
    raw_files = glob.glob(data_path + "/gn_tweet_*.txt")
    raw_files.sort(key=os.path.getmtime)
    #
    csv_files = [
        name[:-4].replace(out_path + "/", "")
        for name in glob.glob(out_path + "/gn_tweet_*utc.csv")
    ]
    print(csv_files)
    # include current scrape
    raw_files = raw_files if include_current else raw_files[:-1]
    try:
        print('Start process...')
        for filename in raw_files:
            # get the file name without path or extension
            name = filename.split("/")[-1].split(".")[0]
            if name not in csv_files:  # no CSV exists for this file yet
                # convert to csv
                print(name)
                if read_tweets(filename):
                    lst.append(name)
            else:
                print(name, "has already been processed")
    except Exception as e:
        print('Process aborted', e)
    finally:
        print('...End process')
        return lst
Example #7
import utils
import lstm_tr
import numpy as np

if __name__ == '__main__':
    tweets, words, chars, hashtags = utils.read_tweets("2018-E-c-En-train.txt")
    word_index, char_index, hashtag_index = utils.make_vectors_train(
        tweets, words, chars, hashtags)
    utils.save_indices(word_index, char_index, hashtag_index, "indices")

    model = lstm_tr.train(tweets, words, chars, hashtags, 100)

    # for i in range(30):
    #     model = lstm_dy.Twitter(len(words)+1, len(chars)+1, 300, 50, 100, 10, 2, 50, 11)
    #     model.train(tweets)
    #     model.save(str(i))

    # model = lstm_kr.Twitter(len(words)+1, len(chars)+1, 50, 50, 300, 300, 100, 100, 100, 11)

    # X_train = np.array([t.cont_vec for t in tweets])
    # y_train = np.array([t.emotions for t in tweets])

    # model.train(X_train, y_train, model)
Example #8
    # evaluate the model on the training set
    print('predicting on training set...')
    train_pred = forest.predict(train_data)
    train_score = roc_auc_score(train_labels, train_pred)
    print('train score = %.6f' % (train_score))

    # evaluate the model on the held-out validation set
    print('predicting on validation set...')
    valid_pred = forest.predict(valid_data)
    valid_score = roc_auc_score(valid_labels, valid_pred)
    print('validation score = %.6f' % (valid_score))


if __name__ == '__main__':
    root = getcwd()
    datafile = join(root, 'data', 'tweets_clean.csv')
    tweetsfile = join(root, 'data', 'tweets_clean.pickle')

    # where to save the trained model and words-to-feature encoder
    modelfile = join(root, 'data', 'model.pickle')
    vectorfile = join(root, 'data', 'vectorizer.pickle')
    clean_tweets, clean_tweets_sentiments = read_tweets(datafile, tweetsfile)

    clean_tweets = np.array(clean_tweets)
    clean_tweets_sentiments = np.array(clean_tweets_sentiments)

    # because we cannot train on all the tweets, select a random subset here
    num_tweets = 5000
    random_indices = np.random.choice(clean_tweets.shape[0], size=num_tweets, replace=False)
    learn_sentiment_from_tweets(clean_tweets[random_indices], clean_tweets_sentiments[random_indices], modelfile, vectorfile, retrain=True)
Example #9
        # loop over the words in the extraction corpus \todo determine how to include things like retweet count
        for term in self.class1.getTerms():
            # build the chi-squared table
            n11 = float(self.class1.getTermCount(term))
            n10 = float(self.class2.getTermCount(term))
            n01 = float(self.class1.getDocCount() - n11)
            n00 = float(self.class2.getDocCount() - n10)

            # perform the chi-squared calculation and store
            # the score in the dictionary
            total = n11 + n10 + n01 + n00
            top = ((n11 * n00) - (n10 * n01)) ** 2
            bottom = (n11 + n01) * (n11 + n10) * (n10 + n00) * (n01 + n00)
            chi = (total * top) / bottom
            scores[term] = chi
            
        #note for format
        #for (v, k) in scores:
        #    print str(k) + " : " + str(v)
        return scores

if __name__=="__main__":
    cfs=ChiFeatureSelector(utils.read_tweets(sys.argv[1]), utils.read_tweets(sys.argv[2]))
    print 'Features written to features.%d.json'%os.getpid()
    output = open('features.%d.json'%os.getpid(),'w')
    print>>output, ujson.dumps(cfs.getScores())
    print 'Sorted Features written to features.sort.%d.json'%os.getpid()
    output = open('features.sort.%d.json'%os.getpid(),'w')
    print>>output, ujson.dumps( sorted(cfs.getScores().iteritems(), key=operator.itemgetter(1), reverse=True))
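For reference, the score computed in the loop above is the standard chi-squared statistic for a 2x2 term/class contingency table: n11 and n10 are the class1 and class2 documents containing the term, n01 and n00 the corresponding counts without it. A small standalone sketch with made-up counts, just to make the arithmetic concrete:

def chi_squared(n11, n10, n01, n00):
    # chi-squared statistic for a 2x2 contingency table, as in the loop above
    total = n11 + n10 + n01 + n00
    top = ((n11 * n00) - (n10 * n01)) ** 2
    bottom = (n11 + n01) * (n11 + n10) * (n10 + n00) * (n01 + n00)
    return (total * top) / bottom if bottom else 0.0

# illustrative counts: a term in 6 of 10 class1 docs and 1 of 10 class2 docs
print(chi_squared(6.0, 1.0, 4.0, 9.0))  # ~5.49; larger values mean stronger association with one class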
Example #10
if __name__=="__main__":
    try:
        scrape()
        cfs = ChiFeatureSelector('trending.%d.json'%os.getpid(), 'nontrending.%d.ujson'%os.getpid())     
    except:    
        classify = classifier.HashtagClassifier()
        classify.condProb = utils.read_conf('classifierTrained.json')
        classify.prior = utils.read_conf('classifier_prior.json') 
    while True:
        keyword = re.sub("""[\s/:*"<>?|\\.;'\[\]]+""", '', inputs())
        if not keyword:
            print 'Please enter a valid phrase'
            continue
        try:
            scrapeTrends.search_tweet(keyword)
        except tweepy.TweepError:
            print 'Please enter a valid phrase'
            continue
        
        try:
            tweets = utils.read_tweets('tweets/tweets.%(name)s.json'%{'name':keyword})
        except:
            print 'could not classify keyword'
            continue
        #try:
        print classify.classify(Tweets(tweets))
        #except:
        #    print ''        
    

Example #11
    return dict(
        tweets=tweets,
        author=settings["author"],
        agree_to_honor_code=settings["agree_to_honor_code"],
        count=len(tweets),
        time=end_time - start_time,
    )


@bottle.route("/")
def index():
    return bottle.static_file("index.html", root="static")


@bottle.route("/favicon.ico")
def favicon():
    return bottle.static_file("favicon.ico", root="static")


@bottle.route("/static/<filename:path>")
def server_static(filename):
    return bottle.static_file(filename, root="static")


if __name__ == "__main__":
    db = utils.connect_db("msl", True)
    _searcher = tweetsearch.TweetSearch(db)
    _searcher.index_tweets(utils.read_tweets())
    bottle.run(host=settings["http_host"], port=settings["http_port"], reloader=True)
Example #12
import utils
import torch
import numpy as np
import lstm_dy

if __name__ == '__main__':
    tweets_test, w, c = utils.read_tweets("2018-E-c-En-dev.txt")

    model = torch.load("26.model")
    # model = lstm_dy.Twitter.load("29")

    word_index, char_index, hashtag_index = utils.load_indices("indices")
    utils.make_vectors_test(tweets_test, word_index, char_index, hashtag_index)
    score = 0
    print(
        "ID  Tweet   anger   anticipation    disgust fear    joy love    optimism    pessimism   sadness surprise    trust"
    )
    for t in tweets_test:
        prediction = [1 if i > 0 else 0 for i in model(t).data.numpy()]
        # prediction = model.predict(t)

        t.emotions = prediction
        print(t)
Example #13
def main():
    tweets = utils.read_tweets()
    getRetweetCounts(tweets)
Example #14
def main():
    tweets = utils.read_tweets()
    h = HITS()
    h.hubs_and_authorities(tweets)
Example #15
        tweets=tweets,
        author=settings['author'],
        agree_to_honor_code=settings['agree_to_honor_code'],
        count=len(tweets),
        time=end_time - start_time,
    )


@bottle.route('/')
def index():
    return bottle.static_file('index.html', root='static')


@bottle.route('/favicon.ico')
def favicon():
    return bottle.static_file('favicon.ico', root='static')


@bottle.route('/static/<filename:path>')
def server_static(filename):
    return bottle.static_file(filename, root='static')


if __name__ == "__main__":
    db = utils.connect_db('msl', True)
    _searcher = tweetsearch.TweetSearch(db)
    _searcher.index_tweets(utils.read_tweets())
    bottle.run(host=settings['http_host'],
               port=settings['http_port'],
               reloader=True)
Example #16
    # evaluate the model on the held-out validation set
    print('predicting on validation set...')
    valid_pred = forest.predict(valid_data)
    valid_score = roc_auc_score(valid_labels, valid_pred)
    print('validation score = %.6f' % (valid_score))


if __name__ == '__main__':
    root = getcwd()
    datafile = join(root, 'data', 'tweets_clean.csv')
    tweetsfile = join(root, 'data', 'tweets_clean.pickle')

    # where to save the trained model and words-to-feature encoder
    modelfile = join(root, 'data', 'model.pickle')
    vectorfile = join(root, 'data', 'vectorizer.pickle')
    clean_tweets, clean_tweets_sentiments = read_tweets(datafile, tweetsfile)

    clean_tweets = np.array(clean_tweets)
    clean_tweets_sentiments = np.array(clean_tweets_sentiments)

    # because we cannot train on all the tweets, select a random subset here
    num_tweets = 5000
    random_indices = np.random.choice(clean_tweets.shape[0],
                                      size=num_tweets,
                                      replace=False)
    learn_sentiment_from_tweets(clean_tweets[random_indices],
                                clean_tweets_sentiments[random_indices],
                                modelfile,
                                vectorfile,
                                retrain=True)
Example #17
                score1 = int(score1)
                score2 = int(score2)

                prev_score1, prev_score2 = previous_score
                if score1 - prev_score1 + score2 - prev_score2 == 1:
                    scorer = country1 if score1 > prev_score1 else country2

                    previous_score = score1, score2
                    yield country1, score1, country2, score2, scorer
                    last_goal = current_time

        time.sleep(delta)
        current_time += datetime.timedelta(seconds=delta)
        current_counter = counter


if __name__ == '__main__':
    tweet_filename = 'data/France_Roumanie_2016-06-10_21h_en.filtered.json'
    data = read_tweets(tweet_filename)
    for country1, score1, country2, score2, scorer in follow_euro_2016(
            None, None, data):
        scored_against = country1 if scorer == country2 else country2
        print(
            '{scorer} just scored against {against}! - new score: {country1} {score1}-{score2} {country2}'
            .format(scorer=scorer,
                    against=scored_against,
                    country1=country1,
                    country2=country2,
                    score1=score1,
                    score2=score2))
        sys.stdout.flush()
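The condition in the generator above fires only when the combined score has advanced by exactly one goal between two readings, and the scorer is whichever side's total moved. A tiny illustration with made-up scores (the country names come from the filename above):

prev_score1, prev_score2 = 1, 0   # last reading: France 1-0 Roumanie
score1, score2 = 2, 0             # new reading

if score1 - prev_score1 + score2 - prev_score2 == 1:
    scorer = 'France' if score1 > prev_score1 else 'Roumanie'
    print(scorer)  # France: exactly one new goal, on country1's side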
Example #18
def main():
    filtered = filterFeatures(open('features.json')).keys()
    posTweets = tweets.Tweets(utils.read_tweets('tweets/tweets.Trend.json'))
    negTweets = tweets.Tweets(utils.read_tweets('tweets/tweets.nonTrend.json'))
    
    for term in filtered:
        pospercent = float(posTweets.counts[term])/posTweets.docCount if term in posTweets.counts else 0
        negpercent = float(negTweets.counts[term])/negTweets.docCount if term in negTweets.counts else 0
        if approx_Equal(pospercent, negpercent):
            continue
        if pospercent > negpercent:
            TRENDING_WORDS.add(term)
        else:
            NONTRENDING_WORDS.add(term)
    print 'trending dict written to trend_words.json'
    output = open('trend_words.json','w')
    print>>output, ujson.dumps(TRENDING_WORDS)

    print 'nontrending dict written to nontrend_words.json'
    output = open('nontrend_words.json','w')
    print>>output, ujson.dumps(NONTRENDING_WORDS)
##    print 'trending:\n',TRENDING_WORDS,'\n\n'
##    print 'nontrending:',NONTRENDING_WORDS
    print 'Begin training'
    analyzer = HashtagClassifier()
    analyzer.train_on_filtered({'pos':posTweets, 'neg':negTweets})

    print 'Trained classifier written to classifierTrained.json'
    output = open('classifierTrained.json','w')
    print>>output, ujson.dumps(analyzer.condProb)
    print ujson.dumps(analyzer.prior)
    
    
    confusion = {'positive':0., 'negative':0., 'falsepos':0., 'falseneg':0.}
    iterations = 10
    filtered = filter_classes(tweets)
    termFreq = {}
    print 'starting training'
    for i in range(iterations):
        train_group, eval_group = split_train_eval(filtered)
        analyzer.train_on_filtered(train_group)

        analyzer.classify_filtered(eval_group)
        for key,val in analyzer.lastConfusion.items():
            confusion[key]+=float(val)/iterations
        for key,val in analyzer.condProb.iteritems():
            if key in termFreq:
                termFreq[key]['positive']+=val['positive']
                termFreq[key]['negative']+=val['negative']
            else:
                termFreq[key]=val

    #print confuse
    print 'After %i iterations:'%iterations
    print '\tPosExp\tNegExp'
    print 'PosAct\t',confusion['positive'],'\t',confusion['falsepos']
    print 'NegAct\t',confusion['falseneg'],'\t',confusion['negative']
    posSorted = sorted(termFreq.iteritems(), key = lambda x:x[1]['positive'])
    posSorted.reverse()
    negSorted = sorted(termFreq.iteritems(), key = lambda x:x[1]['negative'])
    negSorted.reverse()
    print 'Positive correlation words: '
    for val in posSorted[:25]:
        print val[0],val[1]['positive']
    print '\nNegative correlation words: '
    for val in negSorted[:25]:
        print val[0],val[1]['negative']