def main():
    """Stream tweets matching one trend until the user interrupts.

    Command line: ``<script> TREND [KEYSET]``

    * ``TREND``  -- the single term handed to the stream filter's ``track``
      list; must not be empty or whitespace-only.
    * ``KEYSET`` -- optional integer forwarded to ``globalobjs.init``
      (defaults to 1).

    Exits with status 1 (after printing a message) when ``TREND`` is missing
    or blank, or when ``KEYSET`` is not an integer.

    Every ``print`` below uses the single-argument parenthesised form, which
    emits the same text under the Python 2 print statement and the Python 3
    print function.
    """
    args = sys.argv[1:]
    if not args:
        print("Tweet collections USAGE: missing 'trend' argument")
        sys.exit(1)

    trend = args[0]
    # A whitespace-only trend is just as unusable as an empty one.
    if not trend.strip():
        print("Value for argument 'trend' is either blank or d.n.e.")
        sys.exit(1)

    # Validate the optional keyset *before* acquiring any resources so a bad
    # value produces a usage message instead of a ValueError traceback.
    keyset = 1
    if len(args) > 1:
        try:
            keyset = int(args[1])
        except ValueError:
            print("Value for argument 'keyset' must be an integer, got %r"
                  % (args[1],))
            sys.exit(1)

    print("Tweet collection for trend  %s ..." % (trend,))

    globalobjs.init(keyset)
    # Everything after init() lives inside the try so that destroy() runs
    # even when authentication or stream construction fails, not only when
    # the filter loop itself is interrupted.
    try:
        auth = tweepy.OAuthHandler(globalobjs.consumer_key,
                                   globalobjs.consumer_secret)
        auth.set_access_token(globalobjs.access_token,
                              globalobjs.access_token_secret)

        listener = MongoStreamListener(trend, globalobjs.db,
                                       globalobjs.getLogFile(trend))
        sapi = tweepy.streaming.Stream(auth, listener)
        sapi.filter(track=[trend])
    except (KeyboardInterrupt, SystemExit):
        print("User stopped with Ctrl+C")
    finally:
        print("ENTER FINALLY")
        globalobjs.destroy()
def fetchTweets(limit=40):
    """Print raw and processed text for the first ``limit`` stored tweets.

    Iterates the documents returned by ``crazydump.find()``; for each one it
    prints a running counter and the UTF-8 encoded ``text`` field, then the
    result of ``streamfilters.processTweetText`` on that text.

    :param limit: maximum number of documents to print (default 40, the
        value that used to be hard-coded). Values <= 0 print nothing.

    If the query itself fails, a message with the exception type is printed
    and the function returns without iterating.
    """
    try:
        cursor = crazydump.find()
    except Exception:
        # sys.exc_info() (not sys.exc.info()) yields (type, value, traceback).
        print("Could not fetch tweet %s" % (sys.exc_info()[0],))
        # There is nothing to iterate; continuing would hit an unbound name.
        return

    globalobjs.init()

    if limit <= 0:
        return

    for counter, tweetDoc in enumerate(cursor):
        raw_text = tweetDoc['text']
        # Same bytes as the Python 2 statement
        # `print '\n', counter, '\n', <encoded text>`: no separator after a
        # newline, one space after the counter.
        print("\n%d \n%s" % (counter, raw_text.encode('utf-8', 'ignore')))
        print(streamfilters.processTweetText(raw_text))
        # Stop right after the last wanted document so the cursor is not
        # advanced further than necessary.
        if counter + 1 >= limit:
            break
# from gensim import corpora, models, similarities # import tweetcorpus # import operator # import itertools import sys import numpy import logging from datetime import datetime from scipy import stats import ldamodel from resources import globalobjs logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) globalobjs.init() # TODO: # Handle all variations in lda parameters to be taken from command line args # numtopics, numupdates, doc_chunk, corpus_passes # offset, corpus length # filters from command line def comparison(trend1, trend2, metric = "JS", filters1 = None, filters2 = None): numtopics = globalobjs.num_topics_lda numupdates = globalobjs.update_freq doc_chunk = globalobjs.lda_chunk_size corpus_passes = globalobjs.passes_corpus start = datetime(2014, 4, 26) # filters1 = {"timestamp": {"$gt": start}} filters1 = None