def getWordWeights(data, daysPast, directory, timeStamp):
    """Build a word-frequency "word cloud" JSON file from recent tweets.

    Keeps only tweets whose 'created_at' lies within `daysPast` days of the
    newest tweet in `data`, counts word occurrences for the whole window and
    for each category, and writes a jQCloud-style (non-strict JSON) document
    to '<directory>wordcloud.json'.

    Args:
        data: dict mapping tweet id -> tweet dict; each tweet dict carries
            'created_at', 'text', and one of 'nlpCat'/'nltkCat'/'tweetType'.
        daysPast: size of the trailing time window, in days.
        directory: output directory (expected to end with '/').
        timeStamp: unused; kept for interface compatibility with callers.

    Returns:
        Path of the file written (directory + 'wordcloud.json').
    """
    dates = [entry['created_at'] for entry in data.values()]
    rightBound = max(dates)
    leftBound = rightBound - datetime.timedelta(days=daysPast)
    data = [
        entry for entry in data.values()
        if leftBound < entry['created_at'] < rightBound
    ]
    # Pick the best available category column. Fixed: previously this
    # indexed data[0] unconditionally and raised IndexError whenever the
    # time window contained no tweets.
    if data and 'nlpCat' in data[0]:
        CatCol = 'nlpCat'
    elif data and 'nltkCat' in data[0]:
        CatCol = 'nltkCat'
    else:
        CatCol = 'tweetType'
    wordWeights = dict()
    wordList = dict()
    wordCloud = []
    # Tokenized text for the aggregate bucket and one bucket per category.
    wordList['all'] = [TweetMatch.prepTweet(entry['text']) for entry in data]
    cats = set([entry[CatCol] for entry in data])
    for cat in cats:
        wordList[cat] = [
            TweetMatch.prepTweet(entry['text'])
            for entry in data if entry[CatCol] == cat
        ]
    cats.add('all')
    for cat in cats:
        # Drop block-listed words and strip possessives ("cat's" -> "cat").
        wordList[cat] = [[
            word.split("'")[0] for word in entry if word not in blockKeys
        ] for entry in wordList[cat]]
    for cat in cats:
        # Count occurrences per category; dict.get avoids the double lookup
        # of the old "if word not in keys()" pattern.
        wordWeights[cat] = dict()
        for tweet in wordList[cat]:
            for word in tweet:
                wordWeights[cat][word] = wordWeights[cat].get(word, 0) + 1
    for cat in cats:
        listed = [
            '{text: "%s", weight: %s}' % (str(key), wordWeights[cat][key])
            for key in wordWeights[cat].keys()
        ]
        wordCloud.append('{%s: [%s]}' % (cat, ', '.join(listed)))
    jsonOut = '{wordcloud: [%s]}' % ', '.join(wordCloud)
    outName = "wordcloud.json"
    print("Writing wordcloud to '" + outName + "'")
    # Context manager guarantees the handle is closed even on write errors
    # (the original leaked the handle if write() raised).
    with open(directory + outName, "w") as outFile:
        outFile.write(jsonOut)
    return directory + outName
def getWordWeights(data,daysPast,directory, timeStamp): dates = [entry['created_at'] for entry in data.values()] rightBound = max(dates) leftBound = rightBound - datetime.timedelta(days = daysPast) data = [entry for entry in data.values() if leftBound < entry['created_at'] < rightBound] if 'nlpCat' in data[0].keys(): CatCol = 'nlpCat' elif 'nltkCat' in data[0].keys(): CatCol = 'nltkCat' else: CatCol = 'tweetType' wordWeights = dict() wordList = dict() wordCloud = [] wordList['all'] = [TweetMatch.prepTweet(entry['text']) for entry in data] cats = set([entry[CatCol] for entry in data]) for cat in cats: wordList[cat] = [TweetMatch.prepTweet(entry['text']) for entry in data if entry[CatCol] == cat] cats.add('all') for cat in cats: wordList[cat] = [[word.split("'")[0] for word in entry if word not in blockKeys] for entry in wordList[cat]] for cat in cats: wordWeights[cat] = dict() for tweet in wordList[cat]: for word in tweet: if word not in wordWeights[cat].keys(): wordWeights[cat][word] = 1 else: wordWeights[cat][word] += 1 for cat in cats: listed = [] for key in wordWeights[cat].keys(): listed.append('{text: "%s", weight: %s}' % (str(key),wordWeights[cat][key])) wordCloud.append('{%s: [%s]}' % (cat,', '.join(listed))) jsonOut = '{wordcloud: [%s]}' % ', '.join(wordCloud) outName = "wordcloud.json" print "Writing wordcloud to '"+outName + "'" outFile = open(directory+outName, "w") outFile.write(jsonOut) outFile.close() return directory+outName
def on_status(self, status):
    # Tweepy stream callback: classify one incoming status, print a
    # colour-coded progress line, and accumulate it into the per-type JSON
    # buffers until StopCount tweets have been kept.
    # NOTE(review): indentation reconstructed from a whitespace-mangled
    # source; the nesting of the storage section below the geo gate is a
    # best guess — confirm against the original repository.
    try:
        # Flush buffers on day rollover or once the stop quota is reached.
        if self.startDay != localTime(datetime.datetime.today(),self.cfg).strftime("%A") or self.tweetCount >= self.cfg['StopCount']:
            giListener.saveTweets(self)
        text = status.text.replace('\n',' ')
        # Word-list classification: "accepted" / "excluded" / "partial" /
        # "retweet" / anything else counts as irrelevant.
        tweetType = checkTweet(self.conditions, self.qualifiers, self.exclusions, text, self.cfg)
        geoType = isInBox(self.cfg, self.geoCache, status)
        # Integer percentage of the quota consumed (Python 2 int division).
        percentFilled = (self.tweetCount*100)/self.cfg['StopCount']
        loginInfo = "\033[94m%s:%s%%\033[0m" % (self.name,percentFilled)
        tweetLocalTime = outTime(localTime(status,self.cfg))
        # Only keep tweets inside the bounding box, unless configured to
        # also keep unlocated ones.
        if geoType['inBox'] or self.cfg['KeepUnlocated']:
            if tweetType == "accepted":
                # Bold console line for accepted tweets.
                print loginInfo, "\033[1m%s\t%s\t%s\t%s\033[0m" % (text, status.author.screen_name, tweetLocalTime['full'], status.source,)
                self.tweetCount += self.cfg['KeepAccepted']
                self.acceptedCount += 1
                self.jsonAccepted.append(status.json)
            elif tweetType == "excluded":
                # Red console line for excluded tweets.
                print loginInfo, "\033[91m%s\t%s\t%s\t%s\033[0m" % (text, status.author.screen_name, tweetLocalTime['full'], status.source,)
                self.tweetCount += self.cfg['KeepExcluded']
                self.excludedCount += 1
                self.jsonExcluded.append(status.json)
            elif tweetType == "partial":
                print loginInfo, "%s\t%s\t%s\t%s" % (text, status.author.screen_name, tweetLocalTime['full'], status.source,)
                self.tweetCount += self.cfg['KeepPartial']
                self.partialCount += 1
                self.jsonPartial.append(status.json)
            elif tweetType == "retweet":
                # Retweets are deliberately dropped (no counters touched).
                None
            else:
                self.irrelevantCount += 1
            # Optionally archive the raw status regardless of category.
            if tweetType != "retweet" and self.cfg['KeepRaw'] == True:
                self.jsonRaw.append(status.json)
            # Per-tweet metadata index keyed by stringified status id.
            self.tweetTypes[str(status.id)] = {'tweetType':tweetType, 'geoType':geoType['text'], 'lat':geoType['lat'], 'lon':geoType['lon'], 'place':geoType['place'], 'fineLocation':geoType['trueLoc'], 'day':tweetLocalTime['day'], 'time':tweetLocalTime['time'], 'date':tweetLocalTime['date']}
            if self.cfg['OnlyKeepNLTK']:
                # NLTK category is added only when classification is enabled.
                self.tweetTypes[str(status.id)]['nltkCat'] = TweetMatch.classifySingle(status.text,self.NLTK)
    except Exception, e:
        # Broad catch keeps the stream alive on any per-tweet failure.
        print "Encountered exception:", e
        pass
def run(self):
    # Main collection loop for the search-API seeker (Python 2 / tweepy,
    # NLTK-classifier variant). Walks every query — optionally across a grid
    # of stacked geo-points — collecting tweets newer than the last seen id,
    # with random rotation across multiple logins and a blunt
    # sleep-and-re-auth recovery path for any failure.
    # NOTE(review): reconstructed from a whitespace-mangled source; the
    # indentation below is a careful best guess. The function also appears
    # to continue beyond this chunk (nothing here consumes `collected` or
    # `newDay`); diff against the original repository before trusting it.
    newDay = False
    print "\nPreparing to run..."
    print "Geographic Selection:", self.geo, '\n\n'
    while True:
        collected = []       # statuses accepted during this pass
        foundIDs = set()     # de-duplication across queries/cells
        allFound = 0         # everything the API returned, pre-filter
        if self.multiAPI:
            print "Multi-api mode in use"
            # Per-login consecutive-failure count and offline cool-down ticks.
            failCount = dict([key,0] for key in self.api.keys())
            APIoffline = dict([key, 0] for key in self.api.keys())
        if self.cfg['UseStacking']:
            counted = 0; increment = 10
            timeNow = time.time()
            elapsed = timeNow - self.stackLast
            self.stackLast = timeNow
            # Delay that spreads the stacked queries across the rate window.
            stackDelay = getDelay(self, elapsed)
            print "Running %s geoStack queries at 1 query every %s seconds" % (self.stackQueries,stackDelay)
        queryCount = -1
        for query in self.queries:
            if self.cfg['UseStacking']:
                geoCount = 0; queryCount += 1
                # One search per (query, geo cell), each with its own since_id.
                for geoPoint in self.stackPoints:
                    loggedIn = True
                    ranSearch = False
                    while not loggedIn or not ranSearch:
                        try:
                            if self.multiAPI:
                                numOffline = sum((1 for key in APIoffline if APIoffline[key] != 0))
                                # Tick down cool-downs, then pick a random healthy login.
                                APIoffline = {key:max(value-1,0) for key,value in APIoffline.iteritems()}
                                chooseable = [key for key,value in APIoffline.iteritems() if value == 0]
                                #print "DEBOO CHOOSABLE", chooseable, "NUMOFFLINE", numOffline, "APIoffline", APIoffline
                                if len(chooseable) > 0:
                                    chosen = choice(chooseable)
                                    cellCollected = self.api[chosen]['api'].search(q = query, since_id = self.stackLastTweet[queryCount][geoCount], geocode = geoString(geoPoint), result_type="recent", count = 100)
                                    failCount[chosen] = 0
                                    time.sleep(uniform(0,.05))
                                # NOTE(review): if every login is cooling down,
                                # cellCollected is stale/undefined here; the code
                                # relies on the except path to recover.
                            else:
                                cellCollected = self.api.search(q = query, since_id = self.stackLastTweet[queryCount][geoCount], geocode = geoString(geoPoint), result_type="recent", count = 100)
                            allFound += len(cellCollected)
                            if self.useNLTK:
                                # Keep only statuses whose NLTK class is wanted,
                                # optionally sampling a fraction of the discards.
                                if self.cfg['KeepDiscardsNLTK']:
                                    cellCollected = [status for status in cellCollected if status.id not in foundIDs and (TweetMatch.classifySingle(status.text,self.NLTK) in self.cfg['OnlyKeepNLTK'] or uniform(0,1)<self.cfg['DiscardSampleNLTK'])]
                                else:
                                    cellCollected = [status for status in cellCollected if TweetMatch.classifySingle(status.text,self.NLTK) in self.cfg['OnlyKeepNLTK'] and status.id not in foundIDs]
                            for cell in cellCollected:
                                foundIDs.add(cell.id)
                                #print cell.text,"CLASS:", TweetMatch.classifySingle(cell.text,self.NLTK)
                            if len(cellCollected)>0:
                                collected += cellCollected
                                # Search results arrive newest-first; remember the
                                # high-water id for this (query, cell) pair.
                                self.stackLastTweet[queryCount][geoCount] = int(collected[0].id)
                            geoCount += 1; counted += 1
                            ranSearch = True
                            if counted == self.rateLimit/2:
                                stackDelay = getDelay(self, 0)
                            if counted%increment == 0:
                                print "Running search %s out of %s with %s hits found and %s ignored" % (counted, self.stackQueries, len(collected), allFound - len(collected))
                            if self.multiAPI:
                                # Scale sleep by the fraction of logins still healthy.
                                time.sleep(stackDelay*(float(len(self.api))/max(len(chooseable),1))*0.9)
                            else:
                                time.sleep(stackDelay)
                        except Exception,e:
                            # Any failure is treated as a login problem: rotate
                            # and/or re-auth until a search can run again.
                            loggedIn = False
                            while not loggedIn:
                                if not self.multiAPI:
                                    print "Login error, will sleep 120 before reconnection, error code:",e
                                    time.sleep(120 + randint(-3,3))
                                else:
                                    failCount[chosen] += 1
                                try:
                                    if self.multiAPI:
                                        if len(chooseable) == 0:
                                            print "All logins down, will sleep 2 minutes before reconnection, error code:",e
                                            # Revive the login closest to the end of its cool-down.
                                            chosen = [key for key, value in APIoffline.iteritems() if value == min(APIoffline.values())][0]
                                            time.sleep(120)
                                            failCount[chosen] = 0
                                            APIoffline[chosen] = 0
                                        if failCount[chosen] <= 2:
                                            self.api[chosen]['api'] = getAuth(self.cfg['_login_'][chosen])['api']
                                        else:
                                            # Three consecutive failures: bench this
                                            # login for 100 cool-down ticks.
                                            APIoffline[chosen] = 100
                                    else:
                                        self.api = getAuth(self.cfg['_login_'])['api']
                                    print "Login successfull"
                                    loggedIn = True
                                except Exception,e:
                                    print "Login unsuccessfull\n",e
            else:
                # Non-stacked mode: one geo-wide search per query.
                loggedIn = True
                ranSearch = False
                while not loggedIn or not ranSearch:
                    try:
                        #Issue of stream pagination currently unresolved
                        #https://github.com/tweepy/tweepy/pull/296#commitcomment-3404913
                        #Method 1: Unlimited backstream, may have overlap or rate limiting issues
                        """for tweet in tweepy.Cursor(self.api.search,q=query, geocode= self.geo, since_id= str(0), result_type="recent").items(): print tweet.text collected.append(tweet) for item in collected: print item.text, item.coordinates, item.geo"""
                        #Method 2: Since id stream, may miss if keyword set yields over 100 new results
                        if self.multiAPI:
                            numOffline = sum((1 for key in APIoffline if APIoffline[key] != 0))
                            APIoffline = {key:max(value-1,0) for key,value in APIoffline.iteritems()}
                            chooseable = [key for key,value in APIoffline.iteritems() if value == 0]
                            if len(chooseable) > 0:
                                chosen = choice(chooseable)
                                cellCollected = self.api[chosen]['api'].search(q = query, since_id = self.lastTweet, geocode = self.geo, result_type="recent", count = 100)
                                failCount[chosen] = 0
                                time.sleep(uniform(0,.05))
                        else:
                            cellCollected = self.api.search(q = query, since_id = self.lastTweet, geocode = self.geo, result_type="recent", count = 100)
                        if self.useNLTK:
                            if self.cfg['KeepDiscardsNLTK']:
                                cellCollected = [status for status in cellCollected if status.id not in foundIDs and (TweetMatch.classifySingle(status.text,self.NLTK) in self.cfg['OnlyKeepNLTK'] or uniform(0,1)<self.cfg['DiscardSampleNLTK'])]
                            else:
                                cellCollected = [status for status in cellCollected if TweetMatch.classifySingle(status.text,self.NLTK) in self.cfg['OnlyKeepNLTK'] and status.id not in foundIDs]
                        for cell in cellCollected:
                            foundIDs.add(cell.id)
                        collected += cellCollected
                        ranSearch = True
                    except Exception,e:
                        # Same recovery dance as the stacked branch.
                        loggedIn = False
                        while not loggedIn:
                            if not self.multiAPI:
                                print "Login error, will sleep 120 before reconnection, error code:",e
                                time.sleep(120 + randint(-3,3))
                            else:
                                failCount[chosen] += 1
                            try:
                                if self.multiAPI:
                                    if len(chooseable) == 0:
                                        print "All logins down, will sleep 2 minutes before reconnection, error code:",e
                                        chosen = [key for key, value in APIoffline.iteritems() if value == min(APIoffline.values())][0]
                                        time.sleep(120)
                                        failCount[chosen] = 0
                                        APIoffline[chosen] = 0
                                    if failCount[chosen] <= 2:
                                        self.api[chosen]['api'] = getAuth(self.cfg['_login_'][chosen])['api']
                                    else:
                                        APIoffline[chosen] = 100
                                else:
                                    self.api = getAuth(self.cfg['_login_'])['api']
                                print "Login successfull"
                                loggedIn = True
                            except Exception,e:
                                print "Login unsuccessfull\n",e
def __init__(self, conditions, qualifiers, exclusions, api, cfg, name, testSpace, geoCache):
    # Seeker constructor: stores word lists / API handles / config, loads the
    # optional NLTK classifier, prepares the output directory and checkbits
    # file, and pre-computes the geo-stacking grid when stacking is enabled.
    # NOTE(review): indentation reconstructed from a whitespace-mangled
    # source; nesting of the NLTK block is a best guess.
    self.delay = 30
    self.qualifiers = qualifiers
    self.conditions = conditions
    self.api = api
    self.name = name
    self.exclusions = exclusions
    self.cfg = cfg
    self.searchDelay = 600       # seconds between search passes (default)
    self.rateLimit = 180         # searches per rate window
    self.rateIncrement = 900     # rate window length, seconds
    self.geoCache = geoCache
    self.useNLTK = False
    self.NLTK = 'null'           # 'null' sentinel: classifier not loaded
    if cfg['OnlyKeepNLTK'] != False:
        # Lazy import so the NLTK stack is only needed when enabled.
        global TweetMatch
        import TweetMatch
        self.useNLTK = True
        # Normalize OnlyKeepNLTK into a list of category name strings
        # (accepts an underscore-joined string, a list, or a scalar).
        temp = cfg['OnlyKeepNLTK']
        if type(temp) is str: self.cfg['OnlyKeepNLTK'] = temp.split('_')
        if type(temp) is list: self.cfg['OnlyKeepNLTK'] = temp
        if type(self.cfg['OnlyKeepNLTK']) is not list: self.cfg['OnlyKeepNLTK'] = [str(temp)]
        self.cfg['OnlyKeepNLTK'] = [str(key) for key in self.cfg['OnlyKeepNLTK']]
        if '-f' not in cfg['args']:
            # '-f' appears to skip loading the classifier file — TODO confirm.
            self.NLTK = TweetMatch.getClassifier(cfg['NLTKFile'])
    giSeeker.flushTweets(self)
    giSeeker.makeQueries(self)
    geoTemp = getGeo(cfg)
    self.pathOut = self.cfg['OutDir']+'search/'
    if not os.path.exists(self.pathOut):
        os.makedirs(self.pathOut)
    # Checkbits file signals day-completion state to other processes.
    fileOut = openWhenReady(self.pathOut + 'checkbits','w')
    fileOut.write('DayFinished = False')
    fileOut.close()
    giSeeker.getLastID(self)
    if geoTemp == "STACK":
        cfg['UseStacking'] = True
        self.geo = "STACK"
    else:
        self.geo = geoString(getGeo(cfg))
    # Dict of logins => multi-API rotation; single object => one login.
    if type(api) is dict:
        print "Using multiple API login method"
        self.multiAPI = True
    else:
        print "Using single API login method"
        self.multiAPI = False
    if cfg['UseGDI']:
        self.searchDelay = cfg['GDI']['Frequency']
    if cfg['UseStacking']:
        # Grid of geo points covering the bounding box, plus a per-(query,
        # cell) matrix of last-seen tweet ids seeded with the global last id.
        temp = fillBox(cfg,self)
        self.stackPoints = temp['list']
        self.stackRadius = temp['radius']
        self.stackQueries = len(self.queries) * len(self.stackPoints)
        self.stackLastTweet = [[self.lastTweet for _ in xrange(len(self.stackPoints))] for __ in xrange(len(self.queries))]
        self.stackLast = time.time()
    self.testSpace = testSpace
    self.runDay = datetime.datetime.now().strftime("%A %d")
    self.lastWrite = 'null'
    self.startDay = 'null'
    print "\nInitiated seeker '%s' with %s conditions, %s qualifiers, and %s exclusions" % (name, len(conditions), len(qualifiers), len(exclusions))
            # --- fragment: tail of a status-processing function; the
            # enclosing def and the preceding if/elif chain are outside this
            # chunk. The leading "None" is the body of a prior
            # "elif tweetType == 'retweet':" branch (no-op). Indentation is
            # reconstructed and must be re-checked against the original.
            None
        else:
            self.irrelevantCount += 1
        # Archive the raw status when configured and when both the geo and
        # word strictness gates allow it (gates defined upstream).
        if tweetType != "retweet" and self.cfg['KeepRaw'] == True and geoStrictKeep and wordStrictKeep:
            self.jsonRaw.append(status.json)
        # Per-tweet metadata index keyed by stringified status id.
        self.tweetTypes[str(status.id)] = {'tweetType':tweetType, 'geoType':geoType['text'], 'lat':geoType['lat'], 'lon':geoType['lon'], 'place':geoType['place'], 'fineLocation':geoType['trueLoc'], 'day':tweetLocalTime['day'], 'time':tweetLocalTime['time'], 'date':tweetLocalTime['date']}
        if self.cfg['OnlyKeepNLTK']:
            self.tweetTypes[str(status.id)]['nltkCat'] = TweetMatch.classifySingle(status.text,self.NLTK)
        # Day rollover: flush buffers and restart the day markers.
        if newDay:
            giSeeker.saveTweets(self)
            newDay = False
            giSeeker.closeDay(self)
            try:
                self.startDay = localTime(status.created_at,self.cfg).strftime("%A %d")
                self.startTime = localTime(status.created_at,self.cfg).strftime(timeArgs)
            except:
                self.startDay = 'null'
                self.startTime = 'null'
        # Fragment ends mid-statement; the body of this "if" is outside this
        # chunk.
        if hasResults:
def main():
    # Entry point (v1, NLP variant): parse CLI flags, load config either from
    # a Google-Doc URL ("GDI") or a local config file, optionally reformat
    # previously collected tweets, authenticate the chosen login(s), and hand
    # off to getTweets().
    # NOTE(review): indentation reconstructed from a whitespace-mangled
    # source; branch nesting is a best guess.
    usingGDoc = False
    NLPClassifier = 'null'
    keepKeys = 'null'
    extra = dict()
    skipReformat = '-s' in sys.argv       # -s: skip re-classifying old data
    quickReformat = '-r' in sys.argv and not skipReformat   # -r: reformat then quit
    try:
        userLogin = sys.argv[2]
        print "Login '%s' passed explicitly" % (userLogin)
    except:
        # NOTE(review): later code compares userLogin == 'null' to decide
        # whether to show the interactive login chooser; this '******'
        # default (possibly a redaction artifact) means the chooser can
        # never trigger — confirm against the original repository.
        userLogin = '******'
    try:
        temp = sys.argv[1]
        if temp.startswith('http'):
            # An http(s) argument selects the remote Google-Doc loader.
            usingGDoc = True
            gDocURL = temp
            print "Preparing GDI Remote Access Loader"
        else:
            print "\nTaking user parameters"
            directory = '/'.join(temp.split('/')[:-1])
            configFile = temp.split('/')[-1]
            if directory == '':
                directory = os.getcwd() + '/'
    except:
        print "Taking default parameters"
        directory = os.getcwd() + '/'
        configFile = 'config'
    if usingGDoc:
        directory = os.getcwd() + '/'
        temp = giSpyGDILoad(gDocURL,directory)
        cfg = temp['config']
        lists = temp['lists']
        if type(temp['login']) is list:
            login = getLogins(directory,temp['login'])
            cfg['MultiLogin'] = True
        else:
            login = getLogins(directory,[temp['login']])[temp['login']]
        cfg['Directory'] = directory
        geoCache = dict()
        updateGeoPickle(geoCache,directory+'caches/'+pickleName)
        if cfg['OnlyKeepNLP'] != False:
            # Normalize OnlyKeepNLP into a list of category name strings.
            temp = cfg['OnlyKeepNLP']
            if type(temp) is str: cfg['OnlyKeepNLP'] = temp.split('_')
            if type(temp) is list: cfg['OnlyKeepNLP'] = temp
            if type(cfg['OnlyKeepNLP']) is not list: cfg['OnlyKeepNLP'] = [str(temp)]
            cfg['OnlyKeepNLP'] = [str(key) for key in cfg['OnlyKeepNLP']]
            NLPClassifier = TweetMatch.getClassifier(cfg['NLPFile'],cfg)
        if not skipReformat:
            reformatOld(directory,lists,cfg,geoCache,NLPClassifier)
            updateGeoPickle(geoCache,directory+'caches/'+pickleName)
            if quickReformat: quit()
    else:
        print "Loading parameters from config file '%s' in directory '%s'" % (configFile, directory)
        cfg = getConfig(directory+configFile)
        cfg['Directory'] = directory
        cfg['ConfigFile'] = configFile
        logins = getLogins(directory, cfg['Logins'])
        lists = updateWordBanks(directory, cfg)
        geoCache = dict()
        updateGeoPickle(geoCache,directory+'caches/'+pickleName)
        if not skipReformat:
            reformatOld(directory,lists,cfg,geoCache,NLPClassifier)
            if quickReformat: quit()
            updateGeoPickle(geoCache,directory+'caches/'+pickleName)
    print "\nPlease choose login number:"
    if userLogin == 'null':
        # Interactive login chooser (only when no login passed on the CLI).
        listed = sorted(logins.keys()); i = 0
        for key in listed:
            print "\t%s - %s - %s" % (i,key,logins[key]['description'])
            i += 1
        while True:
            try:
                selection = int(raw_input('\n:'))
                userLogin = listed[selection]
                break
            except:
                None
        login = logins[userLogin]
    if cfg['MultiLogin']:
        # Authenticate every login in the ring, spaced out to be polite.
        for key in login.keys():
            temp = getAuth(login[key])
            login[key]['auth'] = temp['auth']
            login[key]['api'] = temp['api']
            time.sleep(3)
    else:
        temp = getAuth(login)
        login['auth'] = temp['auth']
        login['api'] = temp['api']
        login['name'] = userLogin
    cfg['userLogin'] = userLogin
    cfg['_login_'] = login
    cfg['Directory'] = directory
    cfg['args'] = sys.argv
    getTweets(login,cfg,lists['conditions'],lists['qualifiers'],lists['exclusions'],geoCache,NLPClassifier)
            # --- fragment: tail of a status-processing function (NLP
            # variant); the enclosing def and the preceding if/elif chain are
            # outside this chunk. The leading "None" is the body of a prior
            # "elif tweetType == 'retweet':" branch. Indentation is
            # reconstructed and must be re-checked.
            None
        else:
            self.irrelevantCount += 1
        # Raw archive gated by config plus geo/word strictness flags
        # (defined upstream, outside this chunk).
        if tweetType != "retweet" and self.cfg['KeepRaw'] == True and geoStrictKeep and wordStrictKeep:
            self.jsonRaw.append(status.json)
        self.tweetTypes[str(status.id)] = {'tweetType':tweetType, 'geoType':geoType['text'], 'lat':geoType['lat'], 'lon':geoType['lon'], 'place':geoType['place'], 'fineLocation':geoType['trueLoc'], 'day':tweetLocalTime['day'], 'time':tweetLocalTime['time'], 'date':tweetLocalTime['date']}
        if self.cfg['OnlyKeepNLP']:
            self.tweetTypes[str(status.id)]['NLPCat'] = TweetMatch.classifySingle(status.text,self.NLP,self.cfg['NLPnGrams'])
        # Day rollover: only flush once any pending report send is done.
        if newDay and self.sendStatus == 'ready':
            giSeeker.saveTweets(self)
            newDay = False
            giSeeker.closeDay(self)
            try:
                self.startDay = localTime(status.created_at,self.cfg).strftime("%A %d")
                self.startTime = localTime(status.created_at,self.cfg).strftime(timeArgs)
            except:
                self.startDay = 'null'
                self.startTime = 'null'
        # Fragment ends mid-statement; the body of this "if" is outside this
        # chunk.
        if hasResults:
def main():
    # Entry point (v2): adds one-time-dump (-o) and quick-send (-e, with an
    # optional historic timestamp) modes on top of the v1 flow, and uses the
    # updateGeoPickle return value instead of in-place mutation.
    # NOTE(review): indentation reconstructed from a whitespace-mangled
    # source; branch nesting is a best guess.
    usingGDoc = False
    NLPClassifier = 'null'
    keepKeys = 'null'
    extra = dict()
    manualTime = 'null'
    skipReformat = '-s' in sys.argv
    quickReformat = '-r' in sys.argv and not skipReformat
    oneTimeDump = '-o' in sys.argv and not skipReformat
    quickSend = '-e' in sys.argv and not skipReformat
    if quickSend:
        # '-e [timestamp]': send a one-time report, optionally for a
        # historic point in time parsed by dateutil.
        quickPos = sys.argv.index('-e')
        if quickPos != len(sys.argv) - 1:
            tArg = sys.argv[quickPos + 1]
            try:
                manualTime = parser.parse(tArg)
                print "Sending historic one time report for time %s" % manualTime
            except:
                pass
    # Reformatting only runs when one of -r/-o/-e was given (or is forced
    # off by -s).
    skipReformat = not (quickReformat or oneTimeDump or quickSend) or skipReformat
    try:
        userLogin = sys.argv[2]
        print "Login '%s' passed explicitly" % (userLogin)
    except:
        # NOTE(review): the chooser below tests for 'null', so this
        # '******' default (possibly a redaction artifact) disables the
        # interactive login menu — confirm against the original repository.
        userLogin = '******'
    try:
        temp = sys.argv[1]
        if temp.startswith('http'):
            usingGDoc = True
            gDocURL = temp
            configFile = 'null'
            print "Preparing GDI Remote Access Loader"
        else:
            print "\nTaking user parameters"
            directory = '/'.join(temp.split('/')[:-1])
            configFile = temp.split('/')[-1]
            if directory == '':
                directory = os.getcwd() + '/'
    except:
        print "Taking default parameters"
        directory = os.getcwd() + '/'
        configFile = 'config'
    if configFile == '-f':
        # '-f' in the config-file slot falls back to the default name.
        configFile = 'config.txt'
    if usingGDoc:
        directory = os.getcwd() + '/'
        temp = giSpyGDILoad(gDocURL, directory)
        cfg = temp['config']
        lists = temp['lists']
        if type(temp['login']) is list:
            login = getLogins(directory, temp['login'])
            cfg['MultiLogin'] = True
        else:
            login = getLogins(directory, [temp['login']])[temp['login']]
        cfg['Directory'] = directory
        # NOTE(review): 'KeryRing' here vs 'KeyRing' in the else branch —
        # almost certainly a typo; check which key downstream code reads.
        cfg['KeryRing'] = getOtherAPIs(directory)
        #geoCache = dict()
        geoCache = updateGeoPickle({}, getPickleName(cfg), cfg)
        if cfg['OnlyKeepNLP'] != False:
            # Normalize OnlyKeepNLP into a list of category name strings.
            temp = cfg['OnlyKeepNLP']
            if type(temp) is str: cfg['OnlyKeepNLP'] = temp.split('_')
            if type(temp) is list: cfg['OnlyKeepNLP'] = temp
            if type(cfg['OnlyKeepNLP']) is not list: cfg['OnlyKeepNLP'] = [str(temp)]
            cfg['OnlyKeepNLP'] = [str(key) for key in cfg['OnlyKeepNLP']]
            NLPClassifier = TweetMatch.getClassifier(cfg['NLPFile'], cfg)
        cfg['OneTimeDump'] = oneTimeDump
        cfg['QuickSend'] = quickSend
        if oneTimeDump:
            cfg['DaysBack'] = 'all'
        if not skipReformat:
            reformatOld(directory, lists, cfg, geoCache, NLPClassifier, manualTime=manualTime)
            geoCache = updateGeoPickle(geoCache, getPickleName(cfg), cfg)
            if quickReformat or oneTimeDump or quickSend:
                sys.exit()
    else:
        print "Loading parameters from config file '%s' in directory '%s'" % ( configFile, directory)
        cfg = getConfig(directory + configFile)
        cfg['Directory'] = directory
        cfg['ConfigFile'] = configFile
        logins = getLogins(directory, cfg['Logins'])
        lists = updateWordBanks(directory, cfg)
        #geoCache = dict()
        geoCache = updateGeoPickle({}, directory + 'caches/' + pickleName)
        cfg['KeyRing'] = getOtherAPIs(directory)
        if not skipReformat:
            reformatOld(directory, lists, cfg, geoCache, NLPClassifier, manualTime=manualTime)
            if quickReformat:
                sys.exit()
            geoCache = updateGeoPickle(geoCache, directory + 'caches/' + pickleName)
    print "\nPlease choose login number:"
    if userLogin == 'null':
        # Interactive login chooser.
        listed = sorted(logins.keys())
        i = 0
        for key in listed:
            print "\t%s - %s - %s" % (i, key, logins[key]['description'])
            i += 1
        while True:
            try:
                selection = int(raw_input('\n:'))
                userLogin = listed[selection]
                break
            except:
                None
        login = logins[userLogin]
    if cfg['MultiLogin']:
        for key in login.keys():
            temp = getAuth(login[key])
            login[key]['auth'] = temp['auth']
            login[key]['api'] = temp['api']
            time.sleep(3)
    else:
        temp = getAuth(login)
        login['auth'] = temp['auth']
        login['api'] = temp['api']
        login['name'] = userLogin
    cfg['userLogin'] = userLogin
    cfg['_login_'] = login
    cfg['Directory'] = directory
    cfg['args'] = sys.argv
    getTweets(login, cfg, lists['conditions'], lists['qualifiers'], lists['exclusions'], geoCache, NLPClassifier)
def getReformatted(directory, lists, cfg, pickleMgmt, fileList, core, out_q, keepTypes, NLPClassifier):
    # Multiprocessing worker: re-classify previously saved raw-tweet JSON
    # files against the current word lists / NLP classifier, optionally save
    # per-file filtered copies, and push the combined result onto out_q keyed
    # by core number.
    # NOTE(review): indentation reconstructed from a whitespace-mangled
    # source; the dedent point after the per-tweet loop is a best guess.
    count = 0
    collectedContent = []
    collectedTypes = {}
    # Local snapshot of the shared geo cache (pickleMgmt is a Manager dict).
    geoPickle = dict(pickleMgmt.items())
    useNLP = NLPClassifier != 'null' and NLPClassifier != False
    for fileName in fileList:
        # NOTE(review): inFile is never closed; consider a with-block.
        inFile = open(directory+fileName)
        content = json.load(inFile)
        filteredContent = []
        print "Core", core, "reclassifying", fileName, "by updated lists"
        if lists != "null":
            jsonToDictFix(content)
            # Optionally restrict to the trailing DaysBack window (naive UTC).
            if cfg['DaysBack'] != 'all' and type(cfg['DaysBack']) is int:
                leftBound = datetime.datetime.utcnow() - datetime.timedelta(days = cfg['DaysBack'])
                content = [item for item in content if parser.parse(item['created_at']).replace(tzinfo=None) > leftBound]
            for tweet in content:
                count += 1
                if count%250 == 0:
                    print "\tCore",core,count,"tweets sorted"
                tweet['text'] = tweet['text'].replace('\n',' ')
                tweetType = checkTweet(lists['conditions'],lists['qualifiers'],lists['exclusions'], tweet['text'], cfg)
                if tweetType in keepTypes:
                    geoType = isInBox(cfg,geoPickle,tweet)
                    if geoType['inBox'] or cfg['KeepUnlocated']:
                        timeData = outTime(localTime(tweet,cfg))
                        # Metadata record keyed by stringified tweet id.
                        collectedTypes[str(tweet['id'])] = {'tweetType':tweetType, 'geoType':geoType['text'], 'lat':geoType['lat'], 'lon':geoType['lon'], 'fineLocation':geoType['trueLoc'], 'place':geoType['place'], 'day':timeData['day'], 'time':timeData['time'], 'date':timeData['date']}
                        if useNLP:
                            collectedTypes[str(tweet['id'])]['NLPCat'] = str(TweetMatch.classifySingle(tweet['text'],NLPClassifier,cfg['NLPnGrams']))
                        filteredContent.append(tweet)
        collectedContent += filteredContent
        try:
            filteredContent = cleanJson(filteredContent,cfg,collectedTypes)
        except:
            # Leftover debug print from a past failure investigation.
            print "DEBOOO123", cfg['OnlyKeepNLP'],count,len(collectedContent),len(filteredContent), len(collectedTypes)
        outName = fileName.replace('Raw','FilteredTweets')
        if cfg['MakeFilteredJson']:
            print "\tSaving file as", outName
            with open(directory+'studies/'+outName, 'w') as outFile:
                json.dump(filteredContent,outFile)
                # Redundant: the with-block already closes the handle.
                outFile.close()
    collectedContent = cleanJson(collectedContent,cfg,collectedTypes)
    # NOTE(review): rebinding the parameter has no effect on the caller's
    # Manager dict — likely dead code or a latent bug.
    pickleMgmt = Manager().dict(geoPickle)
    #out_q.put({'content'+str(core):collectedContent,'types'+str(core):collectedTypes})
    print "Core", core, "tasks complete!"
    out_q.put({'content'+str(core):collectedContent})
def main():
    # Entry point (v3 — functionally the same flow as the v2 main above,
    # differing only in formatting): CLI flags, GDI-vs-local config load,
    # optional reformat pass, login auth, then getTweets().
    # NOTE(review): indentation reconstructed from a whitespace-mangled
    # source; branch nesting is a best guess.
    usingGDoc = False
    NLPClassifier = 'null'
    keepKeys = 'null'
    extra = dict()
    manualTime = 'null'
    skipReformat = '-s' in sys.argv
    quickReformat = '-r' in sys.argv and not skipReformat
    oneTimeDump = '-o' in sys.argv and not skipReformat
    quickSend = '-e' in sys.argv and not skipReformat
    if quickSend:
        # '-e [timestamp]': one-time report, optionally for a historic time.
        quickPos = sys.argv.index('-e')
        if quickPos != len(sys.argv) - 1:
            tArg = sys.argv[quickPos+1]
            try:
                manualTime = parser.parse(tArg)
                print "Sending historic one time report for time %s" % manualTime
            except:
                pass
    # Reformatting only runs when one of -r/-o/-e was requested.
    skipReformat = not(quickReformat or oneTimeDump or quickSend) or skipReformat
    try:
        userLogin = sys.argv[2]
        print "Login '%s' passed explicitly" % (userLogin)
    except:
        # NOTE(review): the chooser below compares against 'null'; this
        # '******' default (possibly redaction) disables the login menu.
        userLogin = '******'
    try:
        temp = sys.argv[1]
        if temp.startswith('http'):
            usingGDoc = True
            gDocURL = temp
            configFile = 'null'
            print "Preparing GDI Remote Access Loader"
        else:
            print "\nTaking user parameters"
            directory = '/'.join(temp.split('/')[:-1])
            configFile = temp.split('/')[-1]
            if directory == '':
                directory = os.getcwd() + '/'
    except:
        print "Taking default parameters"
        directory = os.getcwd() + '/'
        configFile = 'config'
    if configFile == '-f':
        configFile = 'config.txt'
    if usingGDoc:
        directory = os.getcwd() + '/'
        temp = giSpyGDILoad(gDocURL,directory)
        cfg = temp['config']
        lists = temp['lists']
        if type(temp['login']) is list:
            login = getLogins(directory,temp['login'])
            cfg['MultiLogin'] = True
        else:
            login = getLogins(directory,[temp['login']])[temp['login']]
        cfg['Directory'] = directory
        # NOTE(review): 'KeryRing' vs 'KeyRing' in the else branch — likely typo.
        cfg['KeryRing'] = getOtherAPIs(directory)
        #geoCache = dict()
        geoCache = updateGeoPickle({},getPickleName(cfg),cfg)
        if cfg['OnlyKeepNLP'] != False:
            # Normalize OnlyKeepNLP into a list of category name strings.
            temp = cfg['OnlyKeepNLP']
            if type(temp) is str: cfg['OnlyKeepNLP'] = temp.split('_')
            if type(temp) is list: cfg['OnlyKeepNLP'] = temp
            if type(cfg['OnlyKeepNLP']) is not list: cfg['OnlyKeepNLP'] = [str(temp)]
            cfg['OnlyKeepNLP'] = [str(key) for key in cfg['OnlyKeepNLP']]
            NLPClassifier = TweetMatch.getClassifier(cfg['NLPFile'],cfg)
        cfg['OneTimeDump'] = oneTimeDump
        cfg['QuickSend'] = quickSend
        if oneTimeDump:
            cfg['DaysBack'] = 'all'
        if not skipReformat:
            reformatOld(directory,lists,cfg,geoCache,NLPClassifier,manualTime=manualTime)
            geoCache = updateGeoPickle(geoCache,getPickleName(cfg),cfg)
            if quickReformat or oneTimeDump or quickSend:
                sys.exit()
    else:
        print "Loading parameters from config file '%s' in directory '%s'" % (configFile, directory)
        cfg = getConfig(directory+configFile)
        cfg['Directory'] = directory
        cfg['ConfigFile'] = configFile
        logins = getLogins(directory, cfg['Logins'])
        lists = updateWordBanks(directory, cfg)
        #geoCache = dict()
        geoCache = updateGeoPickle({},directory+'caches/'+pickleName)
        cfg['KeyRing'] = getOtherAPIs(directory)
        if not skipReformat:
            reformatOld(directory,lists,cfg,geoCache,NLPClassifier,manualTime=manualTime)
            if quickReformat:
                sys.exit()
            geoCache = updateGeoPickle(geoCache,directory+'caches/'+pickleName)
    print "\nPlease choose login number:"
    if userLogin == 'null':
        # Interactive login chooser.
        listed = sorted(logins.keys()); i = 0
        for key in listed:
            print "\t%s - %s - %s" % (i,key,logins[key]['description'])
            i += 1
        while True:
            try:
                selection = int(raw_input('\n:'))
                userLogin = listed[selection]
                break
            except:
                None
        login = logins[userLogin]
    if cfg['MultiLogin']:
        for key in login.keys():
            temp = getAuth(login[key])
            login[key]['auth'] = temp['auth']
            login[key]['api'] = temp['api']
            time.sleep(3)
    else:
        temp = getAuth(login)
        login['auth'] = temp['auth']
        login['api'] = temp['api']
        login['name'] = userLogin
    cfg['userLogin'] = userLogin
    cfg['_login_'] = login
    cfg['Directory'] = directory
    cfg['args'] = sys.argv
    getTweets(login,cfg,lists['conditions'],lists['qualifiers'],lists['exclusions'],geoCache,NLPClassifier)
def run(self):
    # Main collection loop (v2: NLP classifier, extended tweet mode,
    # RegionSearch option, setLastRan heartbeat). Same skeleton as the
    # NLTK-era run(): iterate queries (optionally over stacked geo points),
    # pull tweets newer than the last seen id, rotate across logins, and
    # recover from any failure by sleeping and re-authenticating.
    # NOTE(review): reconstructed from a whitespace-mangled source; the
    # function appears to continue past this chunk (nothing here consumes
    # `collected` or `newDay`).
    newDay = False
    print "\nPreparing to run..."
    print "Geographic Selection:", self.geo, '\n\n'
    while True:
        collected = []       # statuses accepted this pass
        foundIDs = set()     # de-duplication across queries/cells
        allFound = 0         # everything returned, pre-filter
        if self.multiAPI:
            print "Multi-api mode in use"
            # Per-login consecutive-failure count and offline cool-down ticks.
            failCount = dict([key,0] for key in self.api.keys())
            APIoffline = dict([key, 0] for key in self.api.keys())
        if self.cfg['UseStacking']:
            counted = 0; increment = 10
            timeNow = time.time()
            elapsed = timeNow - self.stackLast
            self.stackLast = timeNow
            stackDelay = getDelay(self, elapsed)
            print "Running %s geoStack queries at 1 query every %s seconds" % (self.stackQueries,stackDelay)
        queryCount = -1
        for query in self.queries:
            #if queryCount % 50 == 0:
            # setLastRan(self)
            if self.cfg['UseStacking']:
                geoCount = 0; queryCount += 1
                for geoPoint in self.stackPoints:
                    loggedIn = True
                    ranSearch = False
                    while not loggedIn or not ranSearch:
                        # Heartbeat so watchdogs can see the seeker is alive.
                        if geoCount % 20 == 0:
                            setLastRan(self)
                        try:
                            if self.multiAPI:
                                numOffline = sum((1 for key in APIoffline if APIoffline[key] != 0))
                                # Tick down cool-downs, pick a random healthy login.
                                APIoffline = {key:max(value-1,0) for key,value in APIoffline.iteritems()}
                                chooseable = [key for key,value in APIoffline.iteritems() if value == 0]
                                if len(chooseable) > 0:
                                    chosen = choice(chooseable)
                                    cellCollected = self.api[chosen]['api'].search(q = query, since_id = self.stackLastTweet[queryCount][geoCount], geocode = geoString(geoPoint), result_type="recent", count = 100, tweet_mode='extended')
                                    failCount[chosen] = 0
                                    time.sleep(uniform(0,.05))
                            else:
                                cellCollected = self.api.search(q = query, since_id = self.stackLastTweet[queryCount][geoCount], geocode = geoString(geoPoint), result_type="recent", count = 100, tweet_mode='extended')
                            allFound += len(cellCollected)
                            if self.useNLP:
                                # Keep only wanted NLP classes (full_text because
                                # of tweet_mode='extended'), optionally sampling
                                # a fraction of the discards.
                                if self.cfg['KeepDiscardsNLP']:
                                    cellCollected = [status for status in cellCollected if status.id not in foundIDs and (TweetMatch.classifySingle(status.full_text,self.NLP,self.cfg['NLPnGrams']) in self.cfg['OnlyKeepNLP'] or uniform(0,1)<self.cfg['DiscardSampleNLP'])]
                                else:
                                    cellCollected = [status for status in cellCollected if TweetMatch.classifySingle(status.full_text,self.NLP,self.cfg['NLPnGrams']) in self.cfg['OnlyKeepNLP'] and status.id not in foundIDs]
                            for cell in cellCollected:
                                foundIDs.add(cell.id)
                            if len(cellCollected)>0:
                                collected += cellCollected
                                # Newest-first results: remember the high-water id
                                # for this (query, cell) pair.
                                self.stackLastTweet[queryCount][geoCount] = int(collected[0].id)
                            geoCount += 1; counted += 1
                            ranSearch = True
                            if counted == self.rateLimit/2:
                                stackDelay = getDelay(self, 0)
                            if counted%increment == 0:
                                print "Running search %s out of %s with %s hits found and %s ignored" % (counted, self.stackQueries, len(collected), allFound - len(collected))
                            if self.multiAPI:
                                # Scale sleep by the fraction of logins still healthy.
                                time.sleep(stackDelay*(float(len(self.api))/max(len(chooseable),1))*0.9)
                            else:
                                time.sleep(stackDelay)
                        except Exception,e:
                            # Treat any failure as a login problem and re-auth.
                            loggedIn = False
                            while not loggedIn:
                                if not self.multiAPI:
                                    print "Experiment", self.cfg['FileName'],"login error, will sleep 120 before reconnection, error code:",e
                                    time.sleep(120 + randint(-3,3))
                                else:
                                    failCount[chosen] += 1
                                try:
                                    if self.multiAPI:
                                        if len(chooseable) == 0:
                                            print "All logins down, will sleep 2 minutes before reconnection, error code:",e
                                            # Revive the login closest to the end
                                            # of its cool-down.
                                            chosen = [key for key, value in APIoffline.iteritems() if value == min(APIoffline.values())][0]
                                            time.sleep(120)
                                            failCount[chosen] = 0
                                            APIoffline[chosen] = 0
                                        if failCount[chosen] <= 2:
                                            self.api[chosen]['api'] = getAuth(self.cfg['_login_'][chosen])['api']
                                        else:
                                            # Three strikes: bench this login.
                                            APIoffline[chosen] = 100
                                    else:
                                        self.api = getAuth(self.cfg['_login_'])['api']
                                    print "Login successfull"
                                    loggedIn = True
                                except Exception,e:
                                    print "Login unsuccessfull\n",e
            else:
                # Non-stacked mode: one search per query, either worldwide
                # (RegionSearch) or within the configured geocode.
                loggedIn = True
                ranSearch = False
                while not loggedIn or not ranSearch:
                    try:
                        #Issue of stream pagination currently unresolved
                        #https://github.com/tweepy/tweepy/pull/296#commitcomment-3404913
                        if self.multiAPI:
                            numOffline = sum((1 for key in APIoffline if APIoffline[key] != 0))
                            APIoffline = {key:max(value-1,0) for key,value in APIoffline.iteritems()}
                            chooseable = [key for key,value in APIoffline.iteritems() if value == 0]
                            if len(chooseable) > 0:
                                chosen = choice(chooseable)
                                if self.cfg['RegionSearch']:
                                    cellCollected = self.api[chosen]['api'].search(q = query, since_id = self.lastTweet, result_type="recent", count = 100, tweet_mode='extended')
                                else:
                                    cellCollected = self.api[chosen]['api'].search(q = query, since_id = self.lastTweet, geocode = self.geo, result_type="recent", count = 100, tweet_mode='extended')
                                failCount[chosen] = 0
                                time.sleep(uniform(0,.05))
                        else:
                            if self.cfg['RegionSearch']:
                                cellCollected = self.api.search(q = query, since_id = self.lastTweet, result_type="recent", count = 100, tweet_mode='extended')
                            else:
                                cellCollected = self.api.search(q = query, since_id = self.lastTweet, geocode = self.geo, result_type="recent", count = 100, tweet_mode='extended')
                        if self.useNLP:
                            if self.cfg['KeepDiscardsNLP']:
                                cellCollected = [status for status in cellCollected if status.id not in foundIDs and (TweetMatch.classifySingle(status.full_text,self.NLP,self.cfg['NLPnGrams']) in self.cfg['OnlyKeepNLP'] or uniform(0,1)<self.cfg['DiscardSampleNLP'])]
                            else:
                                cellCollected = [status for status in cellCollected if TweetMatch.classifySingle(status.full_text,self.NLP,self.cfg['NLPnGrams']) in self.cfg['OnlyKeepNLP'] and status.id not in foundIDs]
                        for cell in cellCollected:
                            foundIDs.add(cell.id)
                        collected += cellCollected
                        ranSearch = True
                    except Exception,e:
                        # Same recovery dance as the stacked branch.
                        loggedIn = False
                        while not loggedIn:
                            if not self.multiAPI:
                                print "Experiment", self.cfg['FileName'],"login error, will sleep 120 before reconnection, error code:",e
                                time.sleep(120 + randint(-3,3))
                            else:
                                failCount[chosen] += 1
                            try:
                                if self.multiAPI:
                                    if len(chooseable) == 0:
                                        print "All logins down, will sleep 2 minutes before reconnection, error code:",e
                                        chosen = [key for key, value in APIoffline.iteritems() if value == min(APIoffline.values())][0]
                                        time.sleep(120)
                                        failCount[chosen] = 0
                                        APIoffline[chosen] = 0
                                    if failCount[chosen] <= 2:
                                        self.api[chosen]['api'] = getAuth(self.cfg['_login_'][chosen])['api']
                                    else:
                                        APIoffline[chosen] = 100
                                else:
                                    self.api = getAuth(self.cfg['_login_'])['api']
                                print "Login successfull"
                                loggedIn = True
                            except Exception,e:
                                print "Login unsuccessfull\n",e
            # --- fragment: tail of a status-processing function (extended
            # tweet / NLP variant — note status.full_text); the enclosing def
            # and the preceding if/elif chain are outside this chunk. The
            # leading "None" is the body of a prior
            # "elif tweetType == 'retweet':" branch. Indentation is
            # reconstructed and must be re-checked.
            None
        else:
            self.irrelevantCount += 1
        # Raw archive gated by config plus geo/word strictness flags
        # (defined upstream, outside this chunk).
        if tweetType != "retweet" and self.cfg['KeepRaw'] == True and geoStrictKeep and wordStrictKeep:
            self.jsonRaw.append(status.json)
        self.tweetTypes[str(status.id)] = {'tweetType':tweetType, 'geoType':geoType['text'], 'lat':geoType['lat'], 'lon':geoType['lon'], 'place':geoType['place'], 'fineLocation':geoType['trueLoc'], 'day':tweetLocalTime['day'], 'time':tweetLocalTime['time'], 'date':tweetLocalTime['date']}
        if self.cfg['OnlyKeepNLP']:
            self.tweetTypes[str(status.id)]['NLPCat'] = TweetMatch.classifySingle(status.full_text,self.NLP,self.cfg['NLPnGrams'])
        # Day rollover: only flush once any pending report send is done.
        if newDay and self.sendStatus == 'ready':
            giSeeker.saveTweets(self)
            newDay = False
            giSeeker.closeDay(self)
            try:
                self.startDay = localTime(status.created_at,self.cfg).strftime("%A %d")
                self.startTime = localTime(status.created_at,self.cfg).strftime(timeArgs)
            except:
                self.startDay = 'null'
                self.startTime = 'null'
        # Fragment ends mid-statement; the body of this "if" is outside this
        # chunk.
        if hasResults: