def on_status(self, status): try: if self.startDay != localTime(datetime.datetime.today(),self.cfg).strftime("%A") or self.tweetCount >= self.cfg['StopCount']: giListener.saveTweets(self) text = status.text.replace('\n',' ') tweetType = checkTweet(self.conditions, self.qualifiers, self.exclusions, text, self.cfg) geoType = isInBox(self.cfg, self.geoCache, status) percentFilled = (self.tweetCount*100)/self.cfg['StopCount'] loginInfo = "\033[94m%s:%s%%\033[0m" % (self.name,percentFilled) tweetLocalTime = outTime(localTime(status,self.cfg)) if geoType['inBox'] or self.cfg['KeepUnlocated']: if tweetType == "accepted": print loginInfo, "\033[1m%s\t%s\t%s\t%s\033[0m" % (text, status.author.screen_name, tweetLocalTime['full'], status.source,) self.tweetCount += self.cfg['KeepAccepted'] self.acceptedCount += 1 self.jsonAccepted.append(status.json) elif tweetType == "excluded": print loginInfo, "\033[91m%s\t%s\t%s\t%s\033[0m" % (text, status.author.screen_name, tweetLocalTime['full'], status.source,) self.tweetCount += self.cfg['KeepExcluded'] self.excludedCount += 1 self.jsonExcluded.append(status.json) elif tweetType == "partial": print loginInfo, "%s\t%s\t%s\t%s" % (text, status.author.screen_name, tweetLocalTime['full'], status.source,) self.tweetCount += self.cfg['KeepPartial'] self.partialCount += 1 self.jsonPartial.append(status.json) elif tweetType == "retweet": None else: self.irrelevantCount += 1 if tweetType != "retweet" and self.cfg['KeepRaw'] == True: self.jsonRaw.append(status.json) self.tweetTypes[str(status.id)] = {'tweetType':tweetType, 'geoType':geoType['text'], 'lat':geoType['lat'], 'lon':geoType['lon'], 'place':geoType['place'], 'fineLocation':geoType['trueLoc'], 'day':tweetLocalTime['day'], 'time':tweetLocalTime['time'], 'date':tweetLocalTime['date']} if self.cfg['OnlyKeepNLTK']: self.tweetTypes[str(status.id)]['nltkCat'] = TweetMatch.classifySingle(status.text,self.NLTK) except Exception, e: print "Encountered exception:", e pass
def run(self): newDay = False print "\nPreparing to run..." print "Geographic Selection:", self.geo, '\n\n' while True: collected = [] foundIDs = set() allFound = 0 if self.multiAPI: print "Multi-api mode in use" failCount = dict([key,0] for key in self.api.keys()) APIoffline = dict([key, 0] for key in self.api.keys()) if self.cfg['UseStacking']: counted = 0; increment = 10 timeNow = time.time() elapsed = timeNow - self.stackLast self.stackLast = timeNow stackDelay = getDelay(self, elapsed) print "Running %s geoStack queries at 1 query every %s seconds" % (self.stackQueries,stackDelay) queryCount = -1 for query in self.queries: if self.cfg['UseStacking']: geoCount = 0; queryCount += 1 for geoPoint in self.stackPoints: loggedIn = True ranSearch = False while not loggedIn or not ranSearch: try: if self.multiAPI: numOffline = sum((1 for key in APIoffline if APIoffline[key] != 0)) APIoffline = {key:max(value-1,0) for key,value in APIoffline.iteritems()} chooseable = [key for key,value in APIoffline.iteritems() if value == 0] #print "DEBOO CHOOSABLE", chooseable, "NUMOFFLINE", numOffline, "APIoffline", APIoffline if len(chooseable) > 0: chosen = choice(chooseable) cellCollected = self.api[chosen]['api'].search(q = query, since_id = self.stackLastTweet[queryCount][geoCount], geocode = geoString(geoPoint), result_type="recent", count = 100) failCount[chosen] = 0 time.sleep(uniform(0,.05)) else: cellCollected = self.api.search(q = query, since_id = self.stackLastTweet[queryCount][geoCount], geocode = geoString(geoPoint), result_type="recent", count = 100) allFound += len(cellCollected) if self.useNLTK: if self.cfg['KeepDiscardsNLTK']: cellCollected = [status for status in cellCollected if status.id not in foundIDs and (TweetMatch.classifySingle(status.text,self.NLTK) in self.cfg['OnlyKeepNLTK'] or uniform(0,1)<self.cfg['DiscardSampleNLTK'])] else: cellCollected = [status for status in cellCollected if TweetMatch.classifySingle(status.text,self.NLTK) in self.cfg['OnlyKeepNLTK'] and status.id not in foundIDs] for cell in cellCollected: foundIDs.add(cell.id) #print cell.text,"CLASS:", TweetMatch.classifySingle(cell.text,self.NLTK) if len(cellCollected)>0: collected += cellCollected self.stackLastTweet[queryCount][geoCount] = int(collected[0].id) geoCount += 1; counted += 1 ranSearch = True if counted == self.rateLimit/2: stackDelay = getDelay(self, 0) if counted%increment == 0: print "Running search %s out of %s with %s hits found and %s ignored" % (counted, self.stackQueries, len(collected), allFound - len(collected)) if self.multiAPI: time.sleep(stackDelay*(float(len(self.api))/max(len(chooseable),1))*0.9) else: time.sleep(stackDelay) except Exception,e: loggedIn = False while not loggedIn: if not self.multiAPI: print "Login error, will sleep 120 before reconnection, error code:",e time.sleep(120 + randint(-3,3)) else: failCount[chosen] += 1 try: if self.multiAPI: if len(chooseable) == 0: print "All logins down, will sleep 2 minutes before reconnection, error code:",e chosen = [key for key, value in APIoffline.iteritems() if value == min(APIoffline.values())][0] time.sleep(120) failCount[chosen] = 0 APIoffline[chosen] = 0 if failCount[chosen] <= 2: self.api[chosen]['api'] = getAuth(self.cfg['_login_'][chosen])['api'] else: APIoffline[chosen] = 100 else: self.api = getAuth(self.cfg['_login_'])['api'] print "Login successfull" loggedIn = True except Exception,e: print "Login unsuccessfull\n",e else: loggedIn = True ranSearch = False while not loggedIn or not ranSearch: try: #Issue of stream pagination currently unresolved #https://github.com/tweepy/tweepy/pull/296#commitcomment-3404913 #Method 1: Unlimited backstream, may have overlap or rate limiting issues """for tweet in tweepy.Cursor(self.api.search,q=query, geocode= self.geo, since_id= str(0), result_type="recent").items(): print tweet.text collected.append(tweet) for item in collected: print item.text, item.coordinates, item.geo""" #Method 2: Since id stream, may miss if keyword set yields over 100 new results if self.multiAPI: numOffline = sum((1 for key in APIoffline if APIoffline[key] != 0)) APIoffline = {key:max(value-1,0) for key,value in APIoffline.iteritems()} chooseable = [key for key,value in APIoffline.iteritems() if value == 0] if len(chooseable) > 0: chosen = choice(chooseable) cellCollected = self.api[chosen]['api'].search(q = query, since_id = self.lastTweet, geocode = self.geo, result_type="recent", count = 100) failCount[chosen] = 0 time.sleep(uniform(0,.05)) else: cellCollected = self.api.search(q = query, since_id = self.lastTweet, geocode = self.geo, result_type="recent", count = 100) if self.useNLTK: if self.cfg['KeepDiscardsNLTK']: cellCollected = [status for status in cellCollected if status.id not in foundIDs and (TweetMatch.classifySingle(status.text,self.NLTK) in self.cfg['OnlyKeepNLTK'] or uniform(0,1)<self.cfg['DiscardSampleNLTK'])] else: cellCollected = [status for status in cellCollected if TweetMatch.classifySingle(status.text,self.NLTK) in self.cfg['OnlyKeepNLTK'] and status.id not in foundIDs] for cell in cellCollected: foundIDs.add(cell.id) collected += cellCollected ranSearch = True except Exception,e: loggedIn = False while not loggedIn: if not self.multiAPI: print "Login error, will sleep 120 before reconnection, error code:",e time.sleep(120 + randint(-3,3)) else: failCount[chosen] += 1 try: if self.multiAPI: if len(chooseable) == 0: print "All logins down, will sleep 2 minutes before reconnection, error code:",e chosen = [key for key, value in APIoffline.iteritems() if value == min(APIoffline.values())][0] time.sleep(120) failCount[chosen] = 0 APIoffline[chosen] = 0 if failCount[chosen] <= 2: self.api[chosen]['api'] = getAuth(self.cfg['_login_'][chosen])['api'] else: APIoffline[chosen] = 100 else: self.api = getAuth(self.cfg['_login_'])['api'] print "Login successfull" loggedIn = True except Exception,e: print "Login unsuccessfull\n",e
None else: self.irrelevantCount += 1 if tweetType != "retweet" and self.cfg['KeepRaw'] == True and geoStrictKeep and wordStrictKeep: self.jsonRaw.append(status.json) self.tweetTypes[str(status.id)] = {'tweetType':tweetType, 'geoType':geoType['text'], 'lat':geoType['lat'], 'lon':geoType['lon'], 'place':geoType['place'], 'fineLocation':geoType['trueLoc'], 'day':tweetLocalTime['day'], 'time':tweetLocalTime['time'], 'date':tweetLocalTime['date']} if self.cfg['OnlyKeepNLTK']: self.tweetTypes[str(status.id)]['nltkCat'] = TweetMatch.classifySingle(status.text,self.NLTK) if newDay: giSeeker.saveTweets(self) newDay = False giSeeker.closeDay(self) try: self.startDay = localTime(status.created_at,self.cfg).strftime("%A %d") self.startTime = localTime(status.created_at,self.cfg).strftime(timeArgs) except: self.startDay = 'null' self.startTime = 'null' if hasResults:
None else: self.irrelevantCount += 1 if tweetType != "retweet" and self.cfg['KeepRaw'] == True and geoStrictKeep and wordStrictKeep: self.jsonRaw.append(status.json) self.tweetTypes[str(status.id)] = {'tweetType':tweetType, 'geoType':geoType['text'], 'lat':geoType['lat'], 'lon':geoType['lon'], 'place':geoType['place'], 'fineLocation':geoType['trueLoc'], 'day':tweetLocalTime['day'], 'time':tweetLocalTime['time'], 'date':tweetLocalTime['date']} if self.cfg['OnlyKeepNLP']: self.tweetTypes[str(status.id)]['NLPCat'] = TweetMatch.classifySingle(status.text,self.NLP,self.cfg['NLPnGrams']) if newDay and self.sendStatus == 'ready': giSeeker.saveTweets(self) newDay = False giSeeker.closeDay(self) try: self.startDay = localTime(status.created_at,self.cfg).strftime("%A %d") self.startTime = localTime(status.created_at,self.cfg).strftime(timeArgs) except: self.startDay = 'null' self.startTime = 'null' if hasResults:
def getReformatted(directory, lists, cfg, pickleMgmt, fileList, core, out_q, keepTypes, NLPClassifier): count = 0 collectedContent = [] collectedTypes = {} geoPickle = dict(pickleMgmt.items()) useNLP = NLPClassifier != 'null' and NLPClassifier != False for fileName in fileList: inFile = open(directory+fileName) content = json.load(inFile) filteredContent = [] print "Core", core, "reclassifying", fileName, "by updated lists" if lists != "null": jsonToDictFix(content) if cfg['DaysBack'] != 'all' and type(cfg['DaysBack']) is int: leftBound = datetime.datetime.utcnow() - datetime.timedelta(days = cfg['DaysBack']) content = [item for item in content if parser.parse(item['created_at']).replace(tzinfo=None) > leftBound] for tweet in content: count += 1 if count%250 == 0: print "\tCore",core,count,"tweets sorted" tweet['text'] = tweet['text'].replace('\n',' ') tweetType = checkTweet(lists['conditions'],lists['qualifiers'],lists['exclusions'], tweet['text'], cfg) if tweetType in keepTypes: geoType = isInBox(cfg,geoPickle,tweet) if geoType['inBox'] or cfg['KeepUnlocated']: timeData = outTime(localTime(tweet,cfg)) collectedTypes[str(tweet['id'])] = {'tweetType':tweetType, 'geoType':geoType['text'], 'lat':geoType['lat'], 'lon':geoType['lon'], 'fineLocation':geoType['trueLoc'], 'place':geoType['place'], 'day':timeData['day'], 'time':timeData['time'], 'date':timeData['date']} if useNLP: collectedTypes[str(tweet['id'])]['NLPCat'] = str(TweetMatch.classifySingle(tweet['text'],NLPClassifier,cfg['NLPnGrams'])) filteredContent.append(tweet) collectedContent += filteredContent try: filteredContent = cleanJson(filteredContent,cfg,collectedTypes) except: print "DEBOOO123", cfg['OnlyKeepNLP'],count,len(collectedContent),len(filteredContent), len(collectedTypes) outName = fileName.replace('Raw','FilteredTweets') if cfg['MakeFilteredJson']: print "\tSaving file as", outName with open(directory+'studies/'+outName, 'w') as outFile: json.dump(filteredContent,outFile) outFile.close() collectedContent = cleanJson(collectedContent,cfg,collectedTypes) pickleMgmt = Manager().dict(geoPickle) #out_q.put({'content'+str(core):collectedContent,'types'+str(core):collectedTypes}) print "Core", core, "tasks complete!" out_q.put({'content'+str(core):collectedContent})
def run(self): newDay = False print "\nPreparing to run..." print "Geographic Selection:", self.geo, '\n\n' while True: collected = [] foundIDs = set() allFound = 0 if self.multiAPI: print "Multi-api mode in use" failCount = dict([key,0] for key in self.api.keys()) APIoffline = dict([key, 0] for key in self.api.keys()) if self.cfg['UseStacking']: counted = 0; increment = 10 timeNow = time.time() elapsed = timeNow - self.stackLast self.stackLast = timeNow stackDelay = getDelay(self, elapsed) print "Running %s geoStack queries at 1 query every %s seconds" % (self.stackQueries,stackDelay) queryCount = -1 for query in self.queries: #if queryCount % 50 == 0: # setLastRan(self) if self.cfg['UseStacking']: geoCount = 0; queryCount += 1 for geoPoint in self.stackPoints: loggedIn = True ranSearch = False while not loggedIn or not ranSearch: if geoCount % 20 == 0: setLastRan(self) try: if self.multiAPI: numOffline = sum((1 for key in APIoffline if APIoffline[key] != 0)) APIoffline = {key:max(value-1,0) for key,value in APIoffline.iteritems()} chooseable = [key for key,value in APIoffline.iteritems() if value == 0] if len(chooseable) > 0: chosen = choice(chooseable) cellCollected = self.api[chosen]['api'].search(q = query, since_id = self.stackLastTweet[queryCount][geoCount], geocode = geoString(geoPoint), result_type="recent", count = 100, tweet_mode='extended') failCount[chosen] = 0 time.sleep(uniform(0,.05)) else: cellCollected = self.api.search(q = query, since_id = self.stackLastTweet[queryCount][geoCount], geocode = geoString(geoPoint), result_type="recent", count = 100, tweet_mode='extended') allFound += len(cellCollected) if self.useNLP: if self.cfg['KeepDiscardsNLP']: cellCollected = [status for status in cellCollected if status.id not in foundIDs and (TweetMatch.classifySingle(status.full_text,self.NLP,self.cfg['NLPnGrams']) in self.cfg['OnlyKeepNLP'] or uniform(0,1)<self.cfg['DiscardSampleNLP'])] else: cellCollected = [status for status in cellCollected if TweetMatch.classifySingle(status.full_text,self.NLP,self.cfg['NLPnGrams']) in self.cfg['OnlyKeepNLP'] and status.id not in foundIDs] for cell in cellCollected: foundIDs.add(cell.id) if len(cellCollected)>0: collected += cellCollected self.stackLastTweet[queryCount][geoCount] = int(collected[0].id) geoCount += 1; counted += 1 ranSearch = True if counted == self.rateLimit/2: stackDelay = getDelay(self, 0) if counted%increment == 0: print "Running search %s out of %s with %s hits found and %s ignored" % (counted, self.stackQueries, len(collected), allFound - len(collected)) if self.multiAPI: time.sleep(stackDelay*(float(len(self.api))/max(len(chooseable),1))*0.9) else: time.sleep(stackDelay) except Exception,e: loggedIn = False while not loggedIn: if not self.multiAPI: print "Experiment", self.cfg['FileName'],"login error, will sleep 120 before reconnection, error code:",e time.sleep(120 + randint(-3,3)) else: failCount[chosen] += 1 try: if self.multiAPI: if len(chooseable) == 0: print "All logins down, will sleep 2 minutes before reconnection, error code:",e chosen = [key for key, value in APIoffline.iteritems() if value == min(APIoffline.values())][0] time.sleep(120) failCount[chosen] = 0 APIoffline[chosen] = 0 if failCount[chosen] <= 2: self.api[chosen]['api'] = getAuth(self.cfg['_login_'][chosen])['api'] else: APIoffline[chosen] = 100 else: self.api = getAuth(self.cfg['_login_'])['api'] print "Login successfull" loggedIn = True except Exception,e: print "Login unsuccessfull\n",e else: loggedIn = True ranSearch = False while not loggedIn or not ranSearch: try: #Issue of stream pagination currently unresolved #https://github.com/tweepy/tweepy/pull/296#commitcomment-3404913 if self.multiAPI: numOffline = sum((1 for key in APIoffline if APIoffline[key] != 0)) APIoffline = {key:max(value-1,0) for key,value in APIoffline.iteritems()} chooseable = [key for key,value in APIoffline.iteritems() if value == 0] if len(chooseable) > 0: chosen = choice(chooseable) if self.cfg['RegionSearch']: cellCollected = self.api[chosen]['api'].search(q = query, since_id = self.lastTweet, result_type="recent", count = 100, tweet_mode='extended') else: cellCollected = self.api[chosen]['api'].search(q = query, since_id = self.lastTweet, geocode = self.geo, result_type="recent", count = 100, tweet_mode='extended') failCount[chosen] = 0 time.sleep(uniform(0,.05)) else: if self.cfg['RegionSearch']: cellCollected = self.api.search(q = query, since_id = self.lastTweet, result_type="recent", count = 100, tweet_mode='extended') else: cellCollected = self.api.search(q = query, since_id = self.lastTweet, geocode = self.geo, result_type="recent", count = 100, tweet_mode='extended') if self.useNLP: if self.cfg['KeepDiscardsNLP']: cellCollected = [status for status in cellCollected if status.id not in foundIDs and (TweetMatch.classifySingle(status.full_text,self.NLP,self.cfg['NLPnGrams']) in self.cfg['OnlyKeepNLP'] or uniform(0,1)<self.cfg['DiscardSampleNLP'])] else: cellCollected = [status for status in cellCollected if TweetMatch.classifySingle(status.full_text,self.NLP,self.cfg['NLPnGrams']) in self.cfg['OnlyKeepNLP'] and status.id not in foundIDs] for cell in cellCollected: foundIDs.add(cell.id) collected += cellCollected ranSearch = True except Exception,e: loggedIn = False while not loggedIn: if not self.multiAPI: print "Experiment", self.cfg['FileName'],"login error, will sleep 120 before reconnection, error code:",e time.sleep(120 + randint(-3,3)) else: failCount[chosen] += 1 try: if self.multiAPI: if len(chooseable) == 0: print "All logins down, will sleep 2 minutes before reconnection, error code:",e chosen = [key for key, value in APIoffline.iteritems() if value == min(APIoffline.values())][0] time.sleep(120) failCount[chosen] = 0 APIoffline[chosen] = 0 if failCount[chosen] <= 2: self.api[chosen]['api'] = getAuth(self.cfg['_login_'][chosen])['api'] else: APIoffline[chosen] = 100 else: self.api = getAuth(self.cfg['_login_'])['api'] print "Login successfull" loggedIn = True except Exception,e: print "Login unsuccessfull\n",e
None else: self.irrelevantCount += 1 if tweetType != "retweet" and self.cfg['KeepRaw'] == True and geoStrictKeep and wordStrictKeep: self.jsonRaw.append(status.json) self.tweetTypes[str(status.id)] = {'tweetType':tweetType, 'geoType':geoType['text'], 'lat':geoType['lat'], 'lon':geoType['lon'], 'place':geoType['place'], 'fineLocation':geoType['trueLoc'], 'day':tweetLocalTime['day'], 'time':tweetLocalTime['time'], 'date':tweetLocalTime['date']} if self.cfg['OnlyKeepNLP']: self.tweetTypes[str(status.id)]['NLPCat'] = TweetMatch.classifySingle(status.full_text,self.NLP,self.cfg['NLPnGrams']) if newDay and self.sendStatus == 'ready': giSeeker.saveTweets(self) newDay = False giSeeker.closeDay(self) try: self.startDay = localTime(status.created_at,self.cfg).strftime("%A %d") self.startTime = localTime(status.created_at,self.cfg).strftime(timeArgs) except: self.startDay = 'null' self.startTime = 'null' if hasResults: