コード例 #1
0
def getWordWeights(data, daysPast, directory, timeStamp):
    dates = [entry['created_at'] for entry in data.values()]
    rightBound = max(dates)
    leftBound = rightBound - datetime.timedelta(days=daysPast)
    data = [
        entry for entry in data.values()
        if leftBound < entry['created_at'] < rightBound
    ]

    if 'nlpCat' in data[0].keys():
        CatCol = 'nlpCat'
    elif 'nltkCat' in data[0].keys():
        CatCol = 'nltkCat'
    else:
        CatCol = 'tweetType'

    wordWeights = dict()
    wordList = dict()
    wordCloud = []
    wordList['all'] = [TweetMatch.prepTweet(entry['text']) for entry in data]

    cats = set([entry[CatCol] for entry in data])
    for cat in cats:
        wordList[cat] = [
            TweetMatch.prepTweet(entry['text']) for entry in data
            if entry[CatCol] == cat
        ]

    cats.add('all')
    for cat in cats:
        wordList[cat] = [[
            word.split("'")[0] for word in entry if word not in blockKeys
        ] for entry in wordList[cat]]

    for cat in cats:
        wordWeights[cat] = dict()
        for tweet in wordList[cat]:
            for word in tweet:
                if word not in wordWeights[cat].keys():
                    wordWeights[cat][word] = 1
                else:
                    wordWeights[cat][word] += 1

    for cat in cats:
        listed = []
        for key in wordWeights[cat].keys():
            listed.append('{text: "%s", weight: %s}' %
                          (str(key), wordWeights[cat][key]))
        wordCloud.append('{%s: [%s]}' % (cat, ', '.join(listed)))

    jsonOut = '{wordcloud: [%s]}' % ', '.join(wordCloud)
    outName = "wordcloud.json"
    print "Writing wordcloud to '" + outName + "'"

    outFile = open(directory + outName, "w")
    outFile.write(jsonOut)
    outFile.close()
    return directory + outName
コード例 #2
0
def getWordWeights(data,daysPast,directory, timeStamp):
    dates = [entry['created_at'] for entry in data.values()]
    rightBound = max(dates)
    leftBound = rightBound - datetime.timedelta(days = daysPast)
    data = [entry for entry in data.values() if leftBound < entry['created_at'] < rightBound]
    
    if  'nlpCat' in data[0].keys():
        CatCol = 'nlpCat'
    elif 'nltkCat' in data[0].keys():
        CatCol = 'nltkCat'
    else:
        CatCol = 'tweetType'
        
    wordWeights = dict()
    wordList = dict()
    wordCloud = []
    wordList['all'] = [TweetMatch.prepTweet(entry['text']) for entry in data] 

    cats = set([entry[CatCol] for entry in data])
    for cat in cats:
        wordList[cat] = [TweetMatch.prepTweet(entry['text']) for entry in data if entry[CatCol] == cat] 

    cats.add('all')
    for cat in cats:
        wordList[cat] = [[word.split("'")[0] for word in entry if word not in blockKeys] for entry in wordList[cat]]
    
    
    for cat in cats:
        wordWeights[cat] = dict()  
        for tweet in wordList[cat]:
            for word in tweet:
                if word not in wordWeights[cat].keys():
                    wordWeights[cat][word] = 1
                else:
                    wordWeights[cat][word] += 1
                    
    for cat in cats:
        listed = []
        for key in wordWeights[cat].keys():
            listed.append('{text: "%s", weight: %s}' % (str(key),wordWeights[cat][key]))
        wordCloud.append('{%s: [%s]}' % (cat,', '.join(listed)))
    
    jsonOut = '{wordcloud: [%s]}' % ', '.join(wordCloud)
    outName = "wordcloud.json"
    print "Writing wordcloud to '"+outName + "'"
    
    outFile = open(directory+outName, "w")
    outFile.write(jsonOut)
    outFile.close()
    return directory+outName
コード例 #3
0
    def on_status(self, status):
        """Tweepy stream callback: process one incoming tweet.

        Saves/rolls the collection buffers when the local day changes or
        the tweet quota is hit, classifies the tweet text, geolocates it,
        echoes it to the console colour-coded by classification, and
        records the raw JSON plus derived metadata for later saving.
        Any exception is printed and swallowed so the stream stays alive.
        """
        try:
            # Flush buffers on day rollover or once StopCount is reached.
            if self.startDay != localTime(datetime.datetime.today(),self.cfg).strftime("%A") or self.tweetCount >= self.cfg['StopCount']:
                giListener.saveTweets(self)
            text = status.text.replace('\n',' ')
            tweetType = checkTweet(self.conditions, self.qualifiers, self.exclusions, text, self.cfg)
            
            geoType =  isInBox(self.cfg, self.geoCache, status)

            # Console prefix "name:NN%" rendered in blue via ANSI escapes.
            percentFilled = (self.tweetCount*100)/self.cfg['StopCount']
            loginInfo = "\033[94m%s:%s%%\033[0m" % (self.name,percentFilled)
            tweetLocalTime = outTime(localTime(status,self.cfg))
            if geoType['inBox'] or self.cfg['KeepUnlocated']:
                if tweetType == "accepted":
                    # Bold: accepted tweet.
                    print loginInfo, "\033[1m%s\t%s\t%s\t%s\033[0m" % (text, 
                                status.author.screen_name, 
                                tweetLocalTime['full'], 
                                status.source,)
                    # Keep* config weights control how much each tweet type
                    # advances the StopCount quota.
                    self.tweetCount += self.cfg['KeepAccepted']
                    self.acceptedCount += 1
                    self.jsonAccepted.append(status.json)
                elif tweetType == "excluded":
                    # Red: tweet matched an exclusion.
                    print loginInfo, "\033[91m%s\t%s\t%s\t%s\033[0m" % (text, 
                                status.author.screen_name, 
                                tweetLocalTime['full'], 
                                status.source,)
                    self.tweetCount += self.cfg['KeepExcluded']
                    self.excludedCount += 1
                    self.jsonExcluded.append(status.json)
                elif tweetType == "partial":
                    # Plain (no colour): partial match.
                    print loginInfo, "%s\t%s\t%s\t%s" % (text, 
                                status.author.screen_name, 
                                tweetLocalTime['full'], 
                                status.source,)
                    self.tweetCount += self.cfg['KeepPartial']
                    self.partialCount += 1
                    self.jsonPartial.append(status.json)
                elif tweetType == "retweet":
                    # Retweets are deliberately ignored.
                    None
                else:
                    self.irrelevantCount += 1
            # Raw archive keeps every non-retweet when KeepRaw is enabled.
            if tweetType != "retweet" and self.cfg['KeepRaw'] == True:
                self.jsonRaw.append(status.json)
                self.tweetTypes[str(status.id)] = {'tweetType':tweetType,
                        'geoType':geoType['text'],
                        'lat':geoType['lat'],
                        'lon':geoType['lon'],
                        'place':geoType['place'],
                        'fineLocation':geoType['trueLoc'],
                        'day':tweetLocalTime['day'],
                        'time':tweetLocalTime['time'],
                        'date':tweetLocalTime['date']} 
                if self.cfg['OnlyKeepNLTK']:
                    # NOTE(review): assumes TweetMatch was imported elsewhere
                    # when OnlyKeepNLTK is truthy -- confirm against __init__.
                    self.tweetTypes[str(status.id)]['nltkCat'] = TweetMatch.classifySingle(status.text,self.NLTK)            
                
        except Exception, e:
            # Best-effort stream handler: report and keep listening.
            print "Encountered exception:", e
            pass
コード例 #4
0
    def run(self):
        """Main search loop: poll the Twitter REST search API forever.

        Each pass runs every configured query, either once per geoStack
        grid point (UseStacking) or once against the single bounding
        region.  Results are de-duplicated via since_id bookkeeping and
        optionally filtered through the NLTK classifier.  Login failures
        are retried indefinitely; in multi-API mode a failing credential
        is benched (APIoffline) and another login is chosen.

        (Indentation normalised from a tab/space-mangled original; all
        code tokens unchanged.)
        """
        newDay = False
        print "\nPreparing to run..."
        print "Geographic Selection:", self.geo, '\n\n'
        while True:
            collected = []
            foundIDs = set()
            allFound = 0

            if self.multiAPI:
                print "Multi-api mode in use"
                # Per-login consecutive-failure counts and "benched" timers;
                # a non-zero APIoffline value keeps that login out of rotation.
                failCount = dict([key,0] for key in self.api.keys())
                APIoffline = dict([key, 0] for key in self.api.keys())

            if self.cfg['UseStacking']:
                counted = 0; increment = 10
                timeNow = time.time()
                elapsed = timeNow - self.stackLast
                self.stackLast = timeNow
                stackDelay = getDelay(self, elapsed)
                print "Running %s geoStack queries at 1 query every %s seconds" % (self.stackQueries,stackDelay)

            queryCount = -1
            for query in self.queries:
                if self.cfg['UseStacking']:
                    geoCount = 0; queryCount += 1

                    for geoPoint in self.stackPoints:
                        loggedIn = True
                        ranSearch = False
                        while not loggedIn or not ranSearch:
                            try:
                                if self.multiAPI:
                                    numOffline = sum((1 for key in APIoffline if APIoffline[key] != 0))
                                    # Tick each benched login one step closer to rejoining.
                                    APIoffline = {key:max(value-1,0) for key,value in APIoffline.iteritems()}
                                    chooseable = [key for key,value in APIoffline.iteritems() if value == 0]
                                    #print "DEBOO CHOOSABLE", chooseable, "NUMOFFLINE", numOffline, "APIoffline", APIoffline
                                    # NOTE(review): if chooseable is empty, 'chosen' keeps its
                                    # previous value (NameError on the very first pass); the
                                    # except path below absorbs the fallout -- confirm intended.
                                    if len(chooseable) > 0:
                                        chosen = choice(chooseable)
                                    cellCollected = self.api[chosen]['api'].search(q = query, 
                                                        since_id = self.stackLastTweet[queryCount][geoCount],  
                                                        geocode = geoString(geoPoint),
                                                        result_type="recent",
                                                        count = 100)
                                    failCount[chosen] = 0
                                    time.sleep(uniform(0,.05))

                                else:    
                                    cellCollected = self.api.search(q = query, 
                                                            since_id = self.stackLastTweet[queryCount][geoCount],  
                                                            geocode = geoString(geoPoint),
                                                            result_type="recent",
                                                            count = 100)
                                
                                allFound += len(cellCollected)
                                # Optional NLTK category filter; KeepDiscardsNLTK also
                                # keeps a random sample of the discarded tweets.
                                if self.useNLTK:
                                    if self.cfg['KeepDiscardsNLTK']:
                                        cellCollected = [status for status in cellCollected if status.id not in foundIDs and (TweetMatch.classifySingle(status.text,self.NLTK) in self.cfg['OnlyKeepNLTK'] or uniform(0,1)<self.cfg['DiscardSampleNLTK'])]
                                    else:
                                        cellCollected = [status for status in cellCollected if TweetMatch.classifySingle(status.text,self.NLTK) in self.cfg['OnlyKeepNLTK'] and status.id not in foundIDs]

                                for cell in cellCollected:
                                    foundIDs.add(cell.id)
                                    #print cell.text,"CLASS:", TweetMatch.classifySingle(cell.text,self.NLTK) 

                                if len(cellCollected)>0:
                                    collected += cellCollected
                                    # Newest tweet id becomes this cell's since_id.
                                    self.stackLastTweet[queryCount][geoCount] = int(collected[0].id)

                                geoCount += 1; counted += 1

                                ranSearch = True
                                if counted == self.rateLimit/2:
                                    stackDelay = getDelay(self, 0)
                                if counted%increment == 0:
                                    print "Running search %s out of %s with %s hits found and %s ignored" % (counted, self.stackQueries, len(collected), allFound - len(collected))
                                if self.multiAPI:
                                    # Spread the delay across however many logins are usable.
                                    time.sleep(stackDelay*(float(len(self.api))/max(len(chooseable),1))*0.9)
                                else:
                                    time.sleep(stackDelay)
                            except Exception,e:
                                # Treat any search failure as a login problem and
                                # re-authenticate until it succeeds.
                                loggedIn = False
                                while not loggedIn:
                                    if not self.multiAPI:
                                        print "Login error, will sleep 120 before reconnection, error code:",e
                                        time.sleep(120 + randint(-3,3))
                                    else:
                                        failCount[chosen] += 1
                                    try:
                                        if self.multiAPI:
                                            if len(chooseable) == 0:
                                                print "All logins down, will sleep 2 minutes before reconnection, error code:",e
                                                # Revive the login closest to coming back online.
                                                chosen = [key for key, value in APIoffline.iteritems() if value == min(APIoffline.values())][0]
                                                time.sleep(120)
                                                failCount[chosen] = 0
                                                APIoffline[chosen] = 0
                                            if failCount[chosen] <= 2:
                                                self.api[chosen]['api'] = getAuth(self.cfg['_login_'][chosen])['api']
                                            else:
                                                # Bench this login for 100 selection rounds.
                                                APIoffline[chosen] = 100
                                        else:
                                            self.api = getAuth(self.cfg['_login_'])['api']
                                        print "Login successfull"
                                        loggedIn =  True
                                    except Exception,e:
                                        print "Login unsuccessfull\n",e

                else:
                    # Single-region (non-stacked) search path.
                    loggedIn = True
                    ranSearch = False
                    while not loggedIn or not ranSearch:
                        try:
                            #Issue of stream pagination currently unresolved
                            #https://github.com/tweepy/tweepy/pull/296#commitcomment-3404913
                            
                            #Method 1: Unlimited backstream, may have overlap or rate limiting issues
                            """for tweet in tweepy.Cursor(self.api.search,q=query,
                                geocode= self.geo,
                                since_id= str(0),
                                result_type="recent").items():
                                
                                print tweet.text
                                collected.append(tweet)
                                
                            for item in collected:
                                print item.text, item.coordinates, item.geo"""
            
                            #Method 2: Since id stream, may miss if keyword set yields over 100 new results
    
                            if self.multiAPI:
                                numOffline = sum((1 for key in APIoffline if APIoffline[key] != 0))
                                APIoffline = {key:max(value-1,0) for key,value in APIoffline.iteritems()}
                                chooseable = [key for key,value in APIoffline.iteritems() if value == 0]

                                if len(chooseable) > 0:
                                    chosen = choice(chooseable)
                                cellCollected = self.api[chosen]['api'].search(q = query, 
                                        since_id = self.lastTweet,  
                                        geocode = self.geo,
                                        result_type="recent",
                                        count = 100)
                                failCount[chosen] = 0
                                time.sleep(uniform(0,.05))

                            else:    
                                cellCollected = self.api.search(q = query, 
                                                        since_id = self.lastTweet,  
                                                        geocode = self.geo,
                                                        result_type="recent",
                                                        count = 100)
                                                    
                            # Optional NLTK category filter (same as stacked path).
                            if self.useNLTK:
                                if self.cfg['KeepDiscardsNLTK']:
                                    cellCollected = [status for status in cellCollected if status.id not in foundIDs and (TweetMatch.classifySingle(status.text,self.NLTK) in self.cfg['OnlyKeepNLTK'] or uniform(0,1)<self.cfg['DiscardSampleNLTK'])]
                                else:
                                    cellCollected = [status for status in cellCollected if TweetMatch.classifySingle(status.text,self.NLTK) in self.cfg['OnlyKeepNLTK'] and status.id not in foundIDs]                        

                            for cell in cellCollected:
                                foundIDs.add(cell.id)

                            collected += cellCollected    

                            ranSearch = True
                        except Exception,e:
                            loggedIn = False
                            while not loggedIn:
                                    if not self.multiAPI:
                                        print "Login error, will sleep 120 before reconnection, error code:",e
                                        time.sleep(120 + randint(-3,3))
                                    else:
                                        failCount[chosen] += 1
                                    try:
                                        if self.multiAPI:
                                            if len(chooseable) == 0:
                                                print "All logins down, will sleep 2 minutes before reconnection, error code:",e
                                                chosen = [key for key, value in APIoffline.iteritems() if value == min(APIoffline.values())][0]
                                                time.sleep(120)
                                                failCount[chosen] = 0
                                                APIoffline[chosen] = 0
                                            if failCount[chosen] <= 2:
                                                self.api[chosen]['api'] = getAuth(self.cfg['_login_'][chosen])['api']
                                            else:
                                                APIoffline[chosen] = 100
                                        else:
                                            self.api = getAuth(self.cfg['_login_'])['api']
                                        print "Login successfull"
                                        loggedIn =  True
                                    except Exception,e:
                                        print "Login unsuccessfull\n",e
コード例 #5
0
    def __init__(self, conditions, qualifiers, exclusions, api, cfg, name, testSpace, geoCache):
        """Set up a giSeeker: term lists, API handle(s), geo and NLTK config.

        Args:
            conditions, qualifiers, exclusions: term lists driving checkTweet.
            api: one authenticated API object, or a dict of them
                 (a dict triggers multi-API mode).
            cfg: mutable configuration dict; several keys are normalised
                 in place here.
            name: label used in console output.
            testSpace: stored for later use.
            geoCache: shared geolocation cache dict.
        """
        self.delay = 30
        self.qualifiers = qualifiers
        self.conditions = conditions
        self.api = api
        self.name = name
        self.exclusions = exclusions
        self.cfg = cfg
        self.searchDelay = 600
        self.rateLimit = 180
        self.rateIncrement = 900
        self.geoCache = geoCache
        self.useNLTK = False
        self.NLTK = 'null'
        
        if cfg['OnlyKeepNLTK'] != False:
            # Imported lazily (and bound at module scope) because the NLTK
            # machinery is only needed when classification is enabled.
            global TweetMatch
            import TweetMatch
            self.useNLTK = True
            # Normalise OnlyKeepNLTK to a list of strings: accepts an
            # underscore-delimited string, a list, or a scalar.
            temp = cfg['OnlyKeepNLTK']
            if type(temp) is str:
                self.cfg['OnlyKeepNLTK'] = temp.split('_')
            if type(temp) is list:
                self.cfg['OnlyKeepNLTK'] = temp
            if type(self.cfg['OnlyKeepNLTK']) is not list:
                self.cfg['OnlyKeepNLTK'] = [str(temp)]
            self.cfg['OnlyKeepNLTK'] = [str(key) for key in self.cfg['OnlyKeepNLTK']]
            
            # '-f' skips loading the trained classifier -- TODO confirm intent.
            if '-f' not in cfg['args']:
                self.NLTK = TweetMatch.getClassifier(cfg['NLTKFile'])
        
        giSeeker.flushTweets(self)
        giSeeker.makeQueries(self)
        
        geoTemp = getGeo(cfg)
        
        # Ensure the search output directory exists and reset the
        # day-finished checkbit.
        self.pathOut = self.cfg['OutDir']+'search/'
        if not os.path.exists(self.pathOut):
            os.makedirs(self.pathOut) 
        fileOut = openWhenReady(self.pathOut + 'checkbits','w')
        fileOut.write('DayFinished = False')
        fileOut.close()
            
        giSeeker.getLastID(self)
            
        if geoTemp == "STACK":
            # Grid ("stacked") geo search instead of a single bounding region.
            cfg['UseStacking'] = True
            self.geo = "STACK"
        else:
            self.geo = geoString(getGeo(cfg))
            
        if type(api) is dict:
            print "Using multiple API login method"
            self.multiAPI = True
        else:
            print "Using single API login method"
            self.multiAPI = False
            
        if cfg['UseGDI']:
            self.searchDelay = cfg['GDI']['Frequency']
        if cfg['UseStacking']:
            # Build the grid of query points plus a per-(query, point)
            # since_id matrix initialised from the last seen tweet id.
            temp = fillBox(cfg,self)
            self.stackPoints = temp['list']
            self.stackRadius = temp['radius']
            self.stackQueries = len(self.queries) * len(self.stackPoints)
            self.stackLastTweet = [[self.lastTweet for _ in xrange(len(self.stackPoints))] for __ in xrange(len(self.queries))]
            self.stackLast = time.time()
                
        self.testSpace = testSpace
        self.runDay = datetime.datetime.now().strftime("%A %d")
        self.lastWrite = 'null'
        self.startDay = 'null'
        

        print "\nInitiated seeker '%s' with %s conditions, %s qualifiers, and %s exclusions" % (name, len(conditions), len(qualifiers), len(exclusions))
コード例 #6
0
                 None
             else:
                 self.irrelevantCount += 1
         if tweetType != "retweet" and self.cfg['KeepRaw'] == True and geoStrictKeep and wordStrictKeep:
             self.jsonRaw.append(status.json)
             self.tweetTypes[str(status.id)] = {'tweetType':tweetType,
                 'geoType':geoType['text'],
                 'lat':geoType['lat'],
                 'lon':geoType['lon'],
                 'place':geoType['place'],
                 'fineLocation':geoType['trueLoc'],
                 'day':tweetLocalTime['day'],
                 'time':tweetLocalTime['time'],
                 'date':tweetLocalTime['date']}
             if self.cfg['OnlyKeepNLTK']:
                 self.tweetTypes[str(status.id)]['nltkCat'] = TweetMatch.classifySingle(status.text,self.NLTK)
             
     
     if newDay:
         giSeeker.saveTweets(self)
         newDay = False
         giSeeker.closeDay(self)
         try:
             self.startDay = localTime(status.created_at,self.cfg).strftime("%A %d")
             self.startTime = localTime(status.created_at,self.cfg).strftime(timeArgs)
         except:
             self.startDay = 'null'
             self.startTime = 'null'
 
     
     if hasResults:
コード例 #7
0
def main():
    """Entry point: load config (local file or Google-Docs GDI URL),
    optionally reformat stored tweets, authenticate, start collection.

    Command line:
        argv[1]: config-file path, or an http(s) URL for GDI remote config.
        argv[2]: login name to use (skips the interactive prompt).
        -s: skip reformatting of previously collected data.
        -r: reformat only, then quit (ignored when -s is given).

    (Indentation normalised from a tab/space-mangled original; all code
    tokens unchanged.)
    """
    usingGDoc = False
    NLPClassifier = 'null'
    keepKeys = 'null'
    extra = dict()
    
    skipReformat = '-s' in sys.argv
    quickReformat = '-r' in sys.argv and not skipReformat
    
    try: 
        userLogin = sys.argv[2]
        print "Login '%s' passed explicitly" % (userLogin)
    except:
        userLogin = '******'
    try:
        temp = sys.argv[1]
        if temp.startswith('http'):
            usingGDoc = True
            gDocURL = temp
            print "Preparing GDI Remote Access Loader"
        else:
            print "\nTaking user parameters"
            directory = '/'.join(temp.split('/')[:-1])
            configFile = temp.split('/')[-1]
            if directory == '':
                directory = os.getcwd() + '/'
    except:
        print "Taking default parameters"
        directory = os.getcwd() + '/'
        configFile = 'config'
        
    if usingGDoc:
        # Remote (Google-Docs driven) configuration path.
        directory = os.getcwd() + '/'
        temp = giSpyGDILoad(gDocURL,directory)
        cfg = temp['config']
        lists = temp['lists']
        if type(temp['login']) is list:
            login = getLogins(directory,temp['login'])
            cfg['MultiLogin'] = True
        else:
            login = getLogins(directory,[temp['login']])[temp['login']]
        cfg['Directory'] = directory
        geoCache = dict()
        updateGeoPickle(geoCache,directory+'caches/'+pickleName)
        
        if cfg['OnlyKeepNLP'] != False:
            # Normalise OnlyKeepNLP to a list of strings (accepts an
            # underscore-delimited string, a list, or a scalar).
            temp = cfg['OnlyKeepNLP']
            if type(temp) is str:
                cfg['OnlyKeepNLP'] = temp.split('_')
            if type(temp) is list:
                cfg['OnlyKeepNLP'] = temp
            if type(cfg['OnlyKeepNLP']) is not list:
                cfg['OnlyKeepNLP'] = [str(temp)]
            cfg['OnlyKeepNLP'] = [str(key) for key in cfg['OnlyKeepNLP']]
            NLPClassifier = TweetMatch.getClassifier(cfg['NLPFile'],cfg)
        
        if not skipReformat:
            reformatOld(directory,lists,cfg,geoCache,NLPClassifier)
            updateGeoPickle(geoCache,directory+'caches/'+pickleName)
            if quickReformat:
                quit()        	
        
    else: 
        # Local config-file driven setup.
        print "Loading parameters from config file '%s' in directory '%s'" % (configFile, directory)
        cfg = getConfig(directory+configFile)
        cfg['Directory'] = directory
        cfg['ConfigFile'] = configFile
        logins = getLogins(directory, cfg['Logins'])
        lists = updateWordBanks(directory, cfg)
        geoCache = dict()
        updateGeoPickle(geoCache,directory+'caches/'+pickleName)
        if not skipReformat:        
            reformatOld(directory,lists,cfg,geoCache,NLPClassifier) 
            if quickReformat:
                quit()        
            updateGeoPickle(geoCache,directory+'caches/'+pickleName)
        
        
        print "\nPlease choose login number:"
        # Interactive selection runs only when userLogin is 'null'.
        if userLogin == 'null':
            listed = sorted(logins.keys()); i = 0
            for key in listed:
                print "\t%s - %s - %s" % (i,key,logins[key]['description'])
                i += 1
            while True:
                try:
                    selection = int(raw_input('\n:'))
                    userLogin = listed[selection]
                    break
                except:
                    None
 
        login = logins[userLogin]
    
    
    if cfg['MultiLogin']:
        # Authenticate every login in the ring (throttled between calls).
        for key in login.keys():
            temp = getAuth(login[key])
            login[key]['auth'] = temp['auth']
            login[key]['api'] = temp['api']
            time.sleep(3)
    else:
        temp = getAuth(login)
        login['auth'] = temp['auth']
        login['api'] = temp['api']
        login['name'] = userLogin
    
    cfg['userLogin'] = userLogin    
    cfg['_login_'] = login
    cfg['Directory'] = directory
    cfg['args'] = sys.argv  
 
    getTweets(login,cfg,lists['conditions'],lists['qualifiers'],lists['exclusions'],geoCache,NLPClassifier)
コード例 #8
0
                 None
             else:
                 self.irrelevantCount += 1
         if tweetType != "retweet" and self.cfg['KeepRaw'] == True and geoStrictKeep and wordStrictKeep:
             self.jsonRaw.append(status.json)
             self.tweetTypes[str(status.id)] = {'tweetType':tweetType,
                 'geoType':geoType['text'],
                 'lat':geoType['lat'],
                 'lon':geoType['lon'],
                 'place':geoType['place'],
                 'fineLocation':geoType['trueLoc'],
                 'day':tweetLocalTime['day'],
                 'time':tweetLocalTime['time'],
                 'date':tweetLocalTime['date']}
             if self.cfg['OnlyKeepNLP']:
                 self.tweetTypes[str(status.id)]['NLPCat'] = TweetMatch.classifySingle(status.text,self.NLP,self.cfg['NLPnGrams'])
             
     
     if newDay and self.sendStatus == 'ready':
         giSeeker.saveTweets(self)
         newDay = False
         giSeeker.closeDay(self)
         try:
             self.startDay = localTime(status.created_at,self.cfg).strftime("%A %d")
             self.startTime = localTime(status.created_at,self.cfg).strftime(timeArgs)
         except:
             self.startDay = 'null'
             self.startTime = 'null'
 
     
     if hasResults:
コード例 #9
0
def main():
    usingGDoc = False
    NLPClassifier = 'null'
    keepKeys = 'null'
    extra = dict()
    manualTime = 'null'

    skipReformat = '-s' in sys.argv
    quickReformat = '-r' in sys.argv and not skipReformat
    oneTimeDump = '-o' in sys.argv and not skipReformat
    quickSend = '-e' in sys.argv and not skipReformat
    if quickSend:
        quickPos = sys.argv.index('-e')
        if quickPos != len(sys.argv) - 1:
            tArg = sys.argv[quickPos + 1]
            try:
                manualTime = parser.parse(tArg)
                print "Sending historic one time report for time %s" % manualTime
            except:
                pass

    skipReformat = not (quickReformat or oneTimeDump
                        or quickSend) or skipReformat

    try:
        userLogin = sys.argv[2]
        print "Login '%s' passed explicitly" % (userLogin)
    except:
        userLogin = '******'
    try:
        temp = sys.argv[1]
        if temp.startswith('http'):
            usingGDoc = True
            gDocURL = temp
            configFile = 'null'
            print "Preparing GDI Remote Access Loader"
        else:
            print "\nTaking user parameters"
            directory = '/'.join(temp.split('/')[:-1])
            configFile = temp.split('/')[-1]
            if directory == '':
                directory = os.getcwd() + '/'
    except:
        print "Taking default parameters"
        directory = os.getcwd() + '/'
        configFile = 'config'

    if configFile == '-f':
        configFile = 'config.txt'

    if usingGDoc:
        directory = os.getcwd() + '/'
        temp = giSpyGDILoad(gDocURL, directory)
        cfg = temp['config']
        lists = temp['lists']
        if type(temp['login']) is list:
            login = getLogins(directory, temp['login'])
            cfg['MultiLogin'] = True
        else:
            login = getLogins(directory, [temp['login']])[temp['login']]
        cfg['Directory'] = directory
        cfg['KeryRing'] = getOtherAPIs(directory)
        #geoCache = dict()
        geoCache = updateGeoPickle({}, getPickleName(cfg), cfg)

        if cfg['OnlyKeepNLP'] != False:
            temp = cfg['OnlyKeepNLP']
            if type(temp) is str:
                cfg['OnlyKeepNLP'] = temp.split('_')
            if type(temp) is list:
                cfg['OnlyKeepNLP'] = temp
            if type(cfg['OnlyKeepNLP']) is not list:
                cfg['OnlyKeepNLP'] = [str(temp)]
            cfg['OnlyKeepNLP'] = [str(key) for key in cfg['OnlyKeepNLP']]
            NLPClassifier = TweetMatch.getClassifier(cfg['NLPFile'], cfg)

        cfg['OneTimeDump'] = oneTimeDump
        cfg['QuickSend'] = quickSend
        if oneTimeDump:
            cfg['DaysBack'] = 'all'

        if not skipReformat:
            reformatOld(directory,
                        lists,
                        cfg,
                        geoCache,
                        NLPClassifier,
                        manualTime=manualTime)
            geoCache = updateGeoPickle(geoCache, getPickleName(cfg), cfg)
        if quickReformat or oneTimeDump or quickSend:
            sys.exit()

    else:
        print "Loading parameters from config file '%s' in directory '%s'" % (
            configFile, directory)
        cfg = getConfig(directory + configFile)
        cfg['Directory'] = directory
        cfg['ConfigFile'] = configFile
        logins = getLogins(directory, cfg['Logins'])
        lists = updateWordBanks(directory, cfg)
        #geoCache = dict()
        geoCache = updateGeoPickle({}, directory + 'caches/' + pickleName)

    cfg['KeyRing'] = getOtherAPIs(directory)
    if not skipReformat:
        reformatOld(directory,
                    lists,
                    cfg,
                    geoCache,
                    NLPClassifier,
                    manualTime=manualTime)
        if quickReformat:
            sys.exit()
        geoCache = updateGeoPickle(geoCache,
                                   directory + 'caches/' + pickleName)

        print "\nPlease choose login number:"
        if userLogin == 'null':
            listed = sorted(logins.keys())
            i = 0
            for key in listed:
                print "\t%s - %s - %s" % (i, key, logins[key]['description'])
                i += 1
            while True:
                try:
                    selection = int(raw_input('\n:'))
                    userLogin = listed[selection]
                    break
                except:
                    None

        login = logins[userLogin]

    if cfg['MultiLogin']:
        for key in login.keys():
            temp = getAuth(login[key])
            login[key]['auth'] = temp['auth']
            login[key]['api'] = temp['api']
            time.sleep(3)
    else:
        temp = getAuth(login)
        login['auth'] = temp['auth']
        login['api'] = temp['api']
        login['name'] = userLogin

    cfg['userLogin'] = userLogin
    cfg['_login_'] = login
    cfg['Directory'] = directory
    cfg['args'] = sys.argv

    getTweets(login, cfg, lists['conditions'], lists['qualifiers'],
              lists['exclusions'], geoCache, NLPClassifier)
コード例 #10
0
ファイル: GISpy.py プロジェクト: bacanapps/ChatterGrabber
def getReformatted(directory, lists, cfg, pickleMgmt, fileList, core, out_q, keepTypes, NLPClassifier):
    count = 0
    collectedContent = []
    collectedTypes = {}
    geoPickle = dict(pickleMgmt.items())
    
    useNLP = NLPClassifier != 'null' and NLPClassifier != False
    
    for fileName in fileList:
            inFile = open(directory+fileName)
            content = json.load(inFile)
            filteredContent = []
            
            print "Core", core, "reclassifying", fileName, "by updated lists"
            
            if lists != "null":
                jsonToDictFix(content)
            
            if  cfg['DaysBack'] != 'all' and type(cfg['DaysBack']) is int:
                leftBound = datetime.datetime.utcnow() - datetime.timedelta(days = cfg['DaysBack'])
                content = [item for item in content if parser.parse(item['created_at']).replace(tzinfo=None) > leftBound]
            
            for tweet in content:
                count += 1
                if count%250 == 0:
                    print "\tCore",core,count,"tweets sorted"
                tweet['text'] = tweet['text'].replace('\n',' ')
                tweetType = checkTweet(lists['conditions'],lists['qualifiers'],lists['exclusions'], tweet['text'], cfg)
                if tweetType in keepTypes:
                    geoType = isInBox(cfg,geoPickle,tweet)
                    if geoType['inBox'] or cfg['KeepUnlocated']:
                        timeData = outTime(localTime(tweet,cfg))
                        collectedTypes[str(tweet['id'])] = {'tweetType':tweetType,
                            'geoType':geoType['text'],
                            'lat':geoType['lat'],
                            'lon':geoType['lon'],
                            'fineLocation':geoType['trueLoc'],
                            'place':geoType['place'],
                            'day':timeData['day'],
                            'time':timeData['time'],
                            'date':timeData['date']}
                        if useNLP:
                            collectedTypes[str(tweet['id'])]['NLPCat'] = str(TweetMatch.classifySingle(tweet['text'],NLPClassifier,cfg['NLPnGrams']))
                        
                    filteredContent.append(tweet)
            
            collectedContent += filteredContent  
           
            try:
                filteredContent = cleanJson(filteredContent,cfg,collectedTypes)
            except:
                print "DEBOOO123", cfg['OnlyKeepNLP'],count,len(collectedContent),len(filteredContent), len(collectedTypes) 
            
            outName = fileName.replace('Raw','FilteredTweets')

            if cfg['MakeFilteredJson']:
                print "\tSaving file as", outName
                with open(directory+'studies/'+outName, 'w') as outFile:
                    json.dump(filteredContent,outFile)
                outFile.close()
            
    collectedContent = cleanJson(collectedContent,cfg,collectedTypes)
    pickleMgmt = Manager().dict(geoPickle)        
    #out_q.put({'content'+str(core):collectedContent,'types'+str(core):collectedTypes})
    print "Core", core, "tasks complete!"
    out_q.put({'content'+str(core):collectedContent})        
コード例 #11
0
def main():
    usingGDoc = False
    NLPClassifier = 'null'
    keepKeys = 'null'
    extra = dict()
    manualTime = 'null'
    
    skipReformat = '-s' in sys.argv
    quickReformat = '-r' in sys.argv and not skipReformat
    oneTimeDump = '-o' in sys.argv and not skipReformat
    quickSend = '-e' in sys.argv and not skipReformat
    if quickSend:
        quickPos = sys.argv.index('-e')
        if quickPos != len(sys.argv) - 1:
            tArg = sys.argv[quickPos+1]
            try:
                manualTime = parser.parse(tArg)
                print "Sending historic one time report for time %s" % manualTime
            except:
                pass
    
    
    skipReformat = not(quickReformat or oneTimeDump or quickSend) or skipReformat
    
    try: 
        userLogin = sys.argv[2]
        print "Login '%s' passed explicitly" % (userLogin)
    except:
        userLogin = '******'
    try:
        temp = sys.argv[1]
        if temp.startswith('http'):
            usingGDoc = True
            gDocURL = temp
	    configFile = 'null'
            print "Preparing GDI Remote Access Loader"
        else:
            print "\nTaking user parameters"
            directory = '/'.join(temp.split('/')[:-1])
            configFile = temp.split('/')[-1]
            if directory == '':
                directory = os.getcwd() + '/'
    except:
        print "Taking default parameters"
        directory = os.getcwd() + '/'
        configFile = 'config'
    
    if configFile == '-f':
		configFile = 'config.txt'

    if usingGDoc:
        directory = os.getcwd() + '/'
        temp = giSpyGDILoad(gDocURL,directory)
        cfg = temp['config']
        lists = temp['lists']
        if type(temp['login']) is list:
            login = getLogins(directory,temp['login'])
	    cfg['MultiLogin'] = True
        else:
            login = getLogins(directory,[temp['login']])[temp['login']]
        cfg['Directory'] = directory
        cfg['KeryRing'] = getOtherAPIs(directory)
        #geoCache = dict()
        geoCache = updateGeoPickle({},getPickleName(cfg),cfg)
        
        if cfg['OnlyKeepNLP'] != False:
            temp = cfg['OnlyKeepNLP']
            if type(temp) is str:
                cfg['OnlyKeepNLP'] = temp.split('_')
            if type(temp) is list:
                cfg['OnlyKeepNLP'] = temp
	    if type(cfg['OnlyKeepNLP']) is not list:
		cfg['OnlyKeepNLP'] = [str(temp)]
            cfg['OnlyKeepNLP'] = [str(key) for key in cfg['OnlyKeepNLP']]
            NLPClassifier = TweetMatch.getClassifier(cfg['NLPFile'],cfg)
        
        cfg['OneTimeDump'] = oneTimeDump
        cfg['QuickSend'] = quickSend
        if oneTimeDump:
                cfg['DaysBack'] = 'all'
        
        if not skipReformat:
	    reformatOld(directory,lists,cfg,geoCache,NLPClassifier,manualTime=manualTime)
	    geoCache = updateGeoPickle(geoCache,getPickleName(cfg),cfg)
        if quickReformat or oneTimeDump or quickSend:
            sys.exit()        	
        
    else: 
        print "Loading parameters from config file '%s' in directory '%s'" % (configFile, directory)
        cfg = getConfig(directory+configFile)
        cfg['Directory'] = directory
        cfg['ConfigFile'] = configFile
        logins = getLogins(directory, cfg['Logins'])
        lists = updateWordBanks(directory, cfg)
        #geoCache = dict()
        geoCache = updateGeoPickle({},directory+'caches/'+pickleName)

    cfg['KeyRing'] = getOtherAPIs(directory)
    if not skipReformat:
        reformatOld(directory,lists,cfg,geoCache,NLPClassifier,manualTime=manualTime)
        if quickReformat:
			sys.exit()        
        geoCache = updateGeoPickle(geoCache,directory+'caches/'+pickleName)
        
        
        print "\nPlease choose login number:"
        if userLogin == 'null':
            listed = sorted(logins.keys()); i = 0
            for key in listed:
                print "\t%s - %s - %s" % (i,key,logins[key]['description'])
                i += 1
            while True:
                try:
                    selection = int(raw_input('\n:'))
                    userLogin = listed[selection]
                    break
                except:
                    None
 
        login = logins[userLogin]
    
    
    if cfg['MultiLogin']:
        for key in login.keys():
            temp = getAuth(login[key])
            login[key]['auth'] = temp['auth']
            login[key]['api'] = temp['api']
            time.sleep(3)
    else:
        temp = getAuth(login)
        login['auth'] = temp['auth']
        login['api'] = temp['api']
        login['name'] = userLogin
    
    cfg['userLogin'] = userLogin    
    cfg['_login_'] = login
    cfg['Directory'] = directory
    cfg['args'] = sys.argv  
 
    getTweets(login,cfg,lists['conditions'],lists['qualifiers'],lists['exclusions'],geoCache,NLPClassifier)
コード例 #12
0
    def run(self):
        newDay = False
        print "\nPreparing to run..."
        print "Geographic Selection:", self.geo, '\n\n'
        while True:
            collected = []
            foundIDs = set()
            allFound = 0
 
            if self.multiAPI:
		print "Multi-api mode in use"
		failCount = dict([key,0] for key in self.api.keys())
            	APIoffline = dict([key, 0] for key in self.api.keys())

            if self.cfg['UseStacking']:
                    counted = 0; increment = 10
                    timeNow = time.time()
                    elapsed = timeNow - self.stackLast
                    self.stackLast = timeNow
                    stackDelay = getDelay(self, elapsed)
                    print "Running %s geoStack queries at 1 query every %s seconds" % (self.stackQueries,stackDelay)
            
            queryCount = -1

            for query in self.queries:
                #if queryCount % 50 == 0:
                #    setLastRan(self)
                
                if self.cfg['UseStacking']:
                    geoCount = 0; queryCount += 1
                    
                    for geoPoint in self.stackPoints:
                        loggedIn = True
                        ranSearch = False
                        
                        while not loggedIn or not ranSearch:  
                            if geoCount % 20 == 0:
                                setLastRan(self)
                                
                            try:
                                if self.multiAPI:
                                    numOffline = sum((1 for key in APIoffline if APIoffline[key] != 0))
                                    APIoffline = {key:max(value-1,0) for key,value in APIoffline.iteritems()}
				    chooseable = [key for key,value in APIoffline.iteritems() if value == 0]
	
				    if len(chooseable) > 0:
                                        chosen = choice(chooseable)
                                        
				    cellCollected = self.api[chosen]['api'].search(q = query, 
                                                        since_id = self.stackLastTweet[queryCount][geoCount],  
                                                        geocode = geoString(geoPoint),
                                                        result_type="recent",
                                                        count = 100,
                                                        tweet_mode='extended')
                                    failCount[chosen] = 0
				    time.sleep(uniform(0,.05))
                                    
                                else:    
                                    cellCollected = self.api.search(q = query, 
                                                            since_id = self.stackLastTweet[queryCount][geoCount],  
                                                            geocode = geoString(geoPoint),
                                                            result_type="recent",
                                                            count = 100,
                                                            tweet_mode='extended')
                                
                                allFound += len(cellCollected)
				if self.useNLP:
				    if self.cfg['KeepDiscardsNLP']:
				        cellCollected = [status for status in cellCollected if status.id not in foundIDs and (TweetMatch.classifySingle(status.full_text,self.NLP,self.cfg['NLPnGrams']) in self.cfg['OnlyKeepNLP'] or uniform(0,1)<self.cfg['DiscardSampleNLP'])]
				    else:
                                        cellCollected = [status for status in cellCollected if TweetMatch.classifySingle(status.full_text,self.NLP,self.cfg['NLPnGrams']) in self.cfg['OnlyKeepNLP'] and status.id not in foundIDs]
                                
                                for cell in cellCollected:
                                    foundIDs.add(cell.id)
                                
                                if len(cellCollected)>0:
                                    collected += cellCollected
                                    self.stackLastTweet[queryCount][geoCount] = int(collected[0].id)
                                    
                                geoCount += 1; counted += 1
                                    
                                ranSearch = True
                                if counted == self.rateLimit/2:
                                    stackDelay = getDelay(self, 0)
                                if counted%increment == 0:
                                    print "Running search %s out of %s with %s hits found and %s ignored" % (counted, self.stackQueries, len(collected), allFound - len(collected))
                                if self.multiAPI:
                                	time.sleep(stackDelay*(float(len(self.api))/max(len(chooseable),1))*0.9)
                                else:
                                	time.sleep(stackDelay)
                            except Exception,e:
                                loggedIn = False
                                while not loggedIn:
                                    if not self.multiAPI:
					print "Experiment", self.cfg['FileName'],"login error, will sleep 120 before reconnection, error code:",e
                                        time.sleep(120 + randint(-3,3))
				    else:
					failCount[chosen] += 1
				    try:
                                        if self.multiAPI:
						if len(chooseable) == 0:
							print "All logins down, will sleep 2 minutes before reconnection, error code:",e
							chosen = [key for key, value in APIoffline.iteritems() if value == min(APIoffline.values())][0]
							time.sleep(120)
							failCount[chosen] = 0
							APIoffline[chosen] = 0
                                        	if failCount[chosen] <= 2:
                                            		self.api[chosen]['api'] = getAuth(self.cfg['_login_'][chosen])['api']
                                            	else:
                                            		APIoffline[chosen] = 100
                                        else:
                                            self.api = getAuth(self.cfg['_login_'])['api']
                                        print "Login successfull"
                                        loggedIn =  True
                                    except Exception,e:
                                        print "Login unsuccessfull\n",e
                                
                else:
                    loggedIn = True
                    ranSearch = False
                    while not loggedIn or not ranSearch:
                        try:
                            #Issue of stream pagination currently unresolved
                            #https://github.com/tweepy/tweepy/pull/296#commitcomment-3404913
                            
                            if self.multiAPI:
                                numOffline = sum((1 for key in APIoffline if APIoffline[key] != 0))
                                APIoffline = {key:max(value-1,0) for key,value in APIoffline.iteritems()}
				chooseable = [key for key,value in APIoffline.iteritems() if value == 0]

				if len(chooseable) > 0:
				    chosen = choice(chooseable)
				    if self.cfg['RegionSearch']:
					cellCollected = self.api[chosen]['api'].search(q = query, 
                                            since_id = self.lastTweet,
                                            result_type="recent",
                                            count = 100,
                                            tweet_mode='extended')
				    else:
				        cellCollected = self.api[chosen]['api'].search(q = query, 
                                            since_id = self.lastTweet,  
                                            geocode = self.geo,
                                            result_type="recent",
                                            count = 100,
                                            tweet_mode='extended')
                                failCount[chosen] = 0
				time.sleep(uniform(0,.05))
                                    
                            else:
				if self.cfg['RegionSearch']:
				    cellCollected = self.api.search(q = query, 
                                                        since_id = self.lastTweet,
                                                        result_type="recent",
                                                        count = 100,
                                                        tweet_mode='extended')
				else:
                                    cellCollected = self.api.search(q = query, 
                                                        since_id = self.lastTweet,  
                                                        geocode = self.geo,
                                                        result_type="recent",
                                                        count = 100,
                                                        tweet_mode='extended')
                                                    
                            if self.useNLP:
				    if self.cfg['KeepDiscardsNLP']:
				        cellCollected = [status for status in cellCollected if status.id not in foundIDs and (TweetMatch.classifySingle(status.full_text,self.NLP,self.cfg['NLPnGrams']) in self.cfg['OnlyKeepNLP'] or uniform(0,1)<self.cfg['DiscardSampleNLP'])]
				    else:
                                        cellCollected = [status for status in cellCollected if TweetMatch.classifySingle(status.full_text,self.NLP,self.cfg['NLPnGrams']) in self.cfg['OnlyKeepNLP'] and status.id not in foundIDs]                        
                                
                            for cell in cellCollected:
                                foundIDs.add(cell.id)
                                
                            collected += cellCollected    
                                
                            ranSearch = True
                        except Exception,e:
                            loggedIn = False
                            while not loggedIn:
                                    if not self.multiAPI:
					print "Experiment", self.cfg['FileName'],"login error, will sleep 120 before reconnection, error code:",e
                                        time.sleep(120 + randint(-3,3))
				    else:
					failCount[chosen] += 1
				    try:
                                        if self.multiAPI:
						if len(chooseable) == 0:
							print "All logins down, will sleep 2 minutes before reconnection, error code:",e
							chosen = [key for key, value in APIoffline.iteritems() if value == min(APIoffline.values())][0]
							time.sleep(120)
							failCount[chosen] = 0
							APIoffline[chosen] = 0
                                        	if failCount[chosen] <= 2:
                                            		self.api[chosen]['api'] = getAuth(self.cfg['_login_'][chosen])['api']
                                            	else:
                                            		APIoffline[chosen] = 100
                                        else:
                                            self.api = getAuth(self.cfg['_login_'])['api']
                                        print "Login successfull"
                                        loggedIn =  True
                                    except Exception,e:
                                        print "Login unsuccessfull\n",e
コード例 #13
0
                 None
             else:
                 self.irrelevantCount += 1
         if tweetType != "retweet" and self.cfg['KeepRaw'] == True and geoStrictKeep and wordStrictKeep:
             self.jsonRaw.append(status.json)
             self.tweetTypes[str(status.id)] = {'tweetType':tweetType,
                 'geoType':geoType['text'],
                 'lat':geoType['lat'],
                 'lon':geoType['lon'],
                 'place':geoType['place'],
                 'fineLocation':geoType['trueLoc'],
                 'day':tweetLocalTime['day'],
                 'time':tweetLocalTime['time'],
                 'date':tweetLocalTime['date']}
             if self.cfg['OnlyKeepNLP']:
                 self.tweetTypes[str(status.id)]['NLPCat'] = TweetMatch.classifySingle(status.full_text,self.NLP,self.cfg['NLPnGrams'])
             
     
     if newDay and self.sendStatus == 'ready':
         giSeeker.saveTweets(self)
         newDay = False
         giSeeker.closeDay(self)
         try:
             self.startDay = localTime(status.created_at,self.cfg).strftime("%A %d")
             self.startTime = localTime(status.created_at,self.cfg).strftime(timeArgs)
         except:
             self.startDay = 'null'
             self.startTime = 'null'
 
     
     if hasResults: