def liveFeedsByLocation(self, api=None, locationArea="New York City, NY"):
    """Stream live tweets filtered by a geographic area and print emo-tags.

    Loops forever (until an unrecoverable stream error is raised): opens a
    'statuses/filter' streaming request for the resolved bounding box,
    cleans each tweet's text, translates it to English, and prints the
    emotion-tagger result.

    :param api: Twitter app object; defaults to self.getAppObject().
    :param locationArea: human-readable area resolved via
        GeoLocationModule.getGeoArea.
    """
    if api is None:
        api = self.getAppObject()
    queryParam = {}
    queryParam['locations'] = GeoLocationModule.getGeoArea(area=locationArea)
    queryParam['rpp'] = 100
    while True:
        try:
            # Let's take 40 seconds pause; twitter rate limit is 100 API
            # calls per hour in total per account; source:
            # https://blog.twitter.com/2008/what-does-rate-limit-exceeded-mean-updated
            time.sleep(int(3600 / 100) + 4)
            iterator = api.request('statuses/filter', queryParam).get_iterator()
            for item in iterator:
                if 'text' in item:
                    # FIX: encode/strip once -- the original re-encoded
                    # item['text'] a second time just for this print.
                    rawTextClean2 = item[u'text'].encode('utf-8').strip()
                    print('\n\n\n' + item[u'lang'] + ":\t" + rawTextClean2)
                    rawTextClean3 = rawTextClean2.replace("#", " ")  # remove hashtags
                    rawTextClean4 = re.sub(r'https?:\/\/.*[\r\n]*', '',
                                           rawTextClean3,
                                           flags=re.MULTILINE)  # remove urls
                    rawEnText = TranslationModule.getEnglish(rawTextClean4)
                    fineEnText = rawEnText.replace(",", " ").replace(";", " ")
                    print(self.getEmoTaggerObject().consolodateResult(fineEnText))
                elif 'disconnect' in item:
                    event = item['disconnect']
                    if event['code'] in [2, 5, 6, 7]:
                        # something needs to be fixed before re-connecting
                        raise Exception(event['reason'])
                    else:
                        # temporary interruption, re-try request
                        break
        except TwitterRequestError as e:
            if e.status_code < 500:
                # something needs to be fixed before re-connecting
                raise
            else:
                # temporary interruption, re-try request
                pass
        except TwitterConnectionError:
            # temporary interruption, re-try request
            pass
def liveFeedsByLocation(self, api=None, locationArea="New York City, NY"):
    """Continuously stream geo-filtered live tweets and print emotion tags.

    Each pass sleeps to respect the rate limit, opens a 'statuses/filter'
    stream for the resolved area, cleans and translates every tweet, and
    prints the emo-tagger output. Runs until an unrecoverable error raises.

    :param api: app object for the Twitter API (default self.getAppObject()).
    :param locationArea: area name passed to GeoLocationModule.getGeoArea.
    """
    if api is None:
        api = self.getAppObject()
    params = {
        'locations': GeoLocationModule.getGeoArea(area=locationArea),
        'rpp': 100,
    }
    while True:
        try:
            # Let's take 40 seconds pause; twitter rate limit is 100 API calls per hour in total per account; source: https://blog.twitter.com/2008/what-does-rate-limit-exceeded-mean-updated
            time.sleep(int(3600 / 100) + 4)
            stream = api.request('statuses/filter', params).get_iterator()
            for item in stream:
                if 'text' in item:
                    print('\n\n\n' + item[u'lang'] + ":\t"
                          + item['text'].encode('utf-8').strip())
                    cleaned = item[u'text'].encode('utf-8')
                    cleaned = cleaned.strip()
                    cleaned = cleaned.replace("#", " ")  # remove hashtags
                    # remove urls
                    cleaned = re.sub(r'https?:\/\/.*[\r\n]*', '', cleaned,
                                     flags=re.MULTILINE)
                    english = TranslationModule.getEnglish(cleaned)
                    english = english.replace(",", " ").replace(";", " ")
                    print(self.getEmoTaggerObject().consolodateResult(english))
                elif 'disconnect' in item:
                    event = item['disconnect']
                    # codes 2/5/6/7: something needs fixing before re-connect
                    if event['code'] in [2, 5, 6, 7]:
                        raise Exception(event['reason'])
                    # otherwise a temporary interruption -- re-try request
                    break
        except TwitterRequestError as e:
            if e.status_code < 500:
                # something needs to be fixed before re-connecting
                raise
            else:
                # temporary interruption, re-try request
                pass
        except TwitterConnectionError:
            # temporary interruption, re-try request
            pass
def getFeedsByText(self, api=None, f1=None, isLive=True, annotation=None,
                   queryText=u'a', textLang=None, isTrain=False,
                   locationArea=None):
    """Collect tweets matching queryText (live stream or search), emo-tag
    each one, and either append training rows to f1 or run the classifiers.

    :param api: Twitter app object; defaults to self.getAppObject().
    :param f1: open, writable file for training rows (used when isTrain).
    :param isLive: True -> 'statuses/filter' stream, False -> 'search/tweets'.
    :param annotation: emotion label for recorded tweets; assumed non-None
        whenever tweets are actually recorded (annotation.lower() is
        printed) -- TODO confirm with callers.
    :param queryText: tracked/searched text; a leading '#' may be stripped
        on retries when no unique tweets are found.
    :param textLang: optional language code appended as ' lang:<code>'.
    :param isTrain: write CSV rows to f1 when True, else run NBC/SVC.
    :param locationArea: optional area; combined with queryText=None this
        delegates entirely to liveFeedsByLocation().
    Returns None; stops after MAX_TWEET recorded tweets, MAX_TRIES
    disconnects, or 10 iterator runs.
    """
    if api is None:
        api = self.getAppObject()
    iteratorRunCount = 0
    # FIX: set instead of list -- O(1) duplicate-id membership test
    isDuplicateList = set()
    tweetsRecorded = 0
    reTryCount = 0
    MAX_TWEET = 20
    MAX_TRIES = 10
    queryParam = {}
    while True:
        try:
            # TODO: think of better ways to handle this
            # hack, this limits the number of tweets you want to retrieve
            if iteratorRunCount >= 10:
                print("\n ASSUMPTION: there are no tweets as of now. Let's go back! \n\n")
                print(u"\n\n\n")
                return

            # Let's take 60 seconds pause before each API call;
            # 2017 twitter rate limit is 15 API calls per 15 mins in total per account;
            time.sleep(60)

            if textLang is not None:
                queryText = queryText + ' lang:' + textLang
            queryParam['rpp'] = 100

            if (isLive == True) or (queryText is None):
                if queryText is not None:
                    queryParam['track'] = queryText
                if (locationArea is None) and (queryText is not None):
                    # live tweets without location filter
                    iterator = api.request('statuses/filter',
                                           queryParam).get_iterator()
                elif (locationArea is not None) and (queryText is not None):
                    # live tweets with location filter
                    queryParam['locations'] = GeoLocationModule.getGeoArea(
                        area=locationArea)
                    iterator = api.request('statuses/filter',
                                           queryParam).get_iterator()
                elif (locationArea is not None) and (queryText is None):
                    self.liveFeedsByLocation(api=api, locationArea=locationArea)
                    # FIX: liveFeedsByLocation consumes the feed itself and
                    # 'iterator' is never bound on this branch -- falling
                    # through used to raise NameError. Return instead.
                    return
                else:
                    print("ERROR: locationArea and queryText cannot be None together")
                    exit(-1)
            else:  # isLive == False
                queryParam['q'] = queryText
                if locationArea is None:
                    # search tweets without location filter
                    iterator = api.request('search/tweets',
                                           queryParam).get_iterator()
                else:
                    # search tweets with location filter
                    queryParam['locations'] = GeoLocationModule.getGeoArea(
                        area=locationArea)
                    iterator = api.request('search/tweets',
                                           queryParam).get_iterator()

            iteratorRunCount += 1
            for item in iterator:
                if (('text' in item) and (item[u'id'] not in isDuplicateList)
                        and (item[u'retweeted'] == False)):
                    rawTextClean1 = item[u'text'].encode('utf-8')
                    rawTextClean2 = rawTextClean1.strip()
                    rawTextClean3 = rawTextClean2.replace("#", " ")  # remove hashtags
                    rawTextClean4 = re.sub(r'https?:\/\/.*[\r\n]*', '',
                                           rawTextClean3,
                                           flags=re.MULTILINE)  # remove urls
                    # take tweets with sufficient text
                    if (25 < len(rawTextClean4)) and (len(item[u'text']) < 140):
                        isDuplicateList.add(item[u'id'])
                        tweetsRecorded += 1
                        rawEnText = TranslationModule.getEnglish(rawTextClean4)
                        fineEnText = rawEnText.replace(",", " ").replace(";", " ")
                        print(str(tweetsRecorded) + ":\t" + item[u'lang'] +
                              ",\t\t" + annotation.lower() + "\t\t:" +
                              queryText + "\t\t:" + str(len(fineEnText)))
                        print(fineEnText)
                        emoVector = self.getEmoTaggerObject().consolodateResult(fineEnText)
                        listRes = []
                        keyRes = sorted(emoVector)
                        for key in keyRes:
                            listRes.append(emoVector[key])
                        print(listRes, keyRes)
                        # round-trip through str() so only numeric entries survive
                        listStr1 = str(listRes).replace(",", " ")
                        listStr2 = listStr1[1:-1]
                        listStr3 = listStr2.split()
                        listVector = [float(i) for i in listStr3
                                      if Utility.RepresentsNum(i)]
                        emoLabel = annotation
                        if len(listVector) != 0:
                            assert (len(listVector) == 8)  # emo-vector length should be 8;
                            if True:  # Training Only
                                emoTypesCount = 0
                                for i in range(0, 8, 1):
                                    if listVector[i] > 0.0:
                                        emoTypesCount += 1
                                if emoTypesCount == 0:
                                    emoLabel = "neutral"
                                    print(">> No Emotion \n\n\n")
                                    continue
                                elif emoTypesCount >= 5:
                                    emoLabel = "mixed"
                                    print(">> Mixed Emotion \n\n\n")
                                    continue
                                else:
                                    emoLabel = annotation
                        if isTrain == True:
                            f1.write(unicode(item[u'id_str']) + "," +
                                     unicode(item[u'created_at']) + "," +
                                     unicode(item[u'lang']) + "," +
                                     unicode(emoLabel).lower() + "," +
                                     unicode(fineEnText).replace("\n", " ").replace("\r", " ") +
                                     "," + "\n")
                            f1.flush()
                            os.fsync(f1.fileno())
                        else:
                            Supervised.getPrediction(npVector=numpy.array([listRes]),
                                                     model='NBC')
                            Supervised.getPrediction(npVector=numpy.array([listRes]),
                                                     model='SVC')
                        if (tweetsRecorded >= MAX_TWEET) or (reTryCount >= MAX_TRIES):
                            print("\n ReTry Count: " + str(reTryCount) + "\n\n")
                            print(u"\n\n\n")
                            return
                        print(u"\n\n\n")
                elif 'disconnect' in item:
                    event = item['disconnect']
                    reTryCount += 1
                    if event['code'] in [2, 5, 6, 7]:
                        # something may or may NOT need to be fixed before re-connecting
                        raise Exception(event['reason'])
                    else:
                        # temporary interruption, re-try request
                        break
                elif (iteratorRunCount > 0) and (tweetsRecorded < MAX_TWEET):
                    # Condition when no more unique tweets are found, go back
                    # TODO: think of better ways to handle this
                    if queryText[0] == '#':
                        return  # temporary return
                    queryText = queryText[1:]
                    break
                else:
                    print("\n No more tweets as of now \n\n")
                    print(u"\n\n\n")
                    return
        except TwitterRequestError as e:
            if e.status_code < 500:
                # FIX: print() call instead of the lone Python 2 print
                # statement (rest of the function already uses print()).
                print("\n\n" + "MJAGLAN EXCEPTION:\n" + str(e) + "\n\n")
            else:
                # temporary interruption, re-try request
                pass
        except TwitterConnectionError:
            # temporary interruption, re-try request
            pass
def liveFeedsByLocation(self, api=None, locationArea="New York City, NY"):
    """Stream geo-filtered live tweets forever and print (emotion, score)
    pairs for each tweet.

    :param api: Twitter app object; defaults to self.getAppObject().
    :param locationArea: area name resolved via GeoLocationModule.getGeoArea.
    """
    if api is None:
        api = self.getAppObject()
    queryParam = {}
    queryParam['locations'] = GeoLocationModule.getGeoArea(area=locationArea)
    queryParam['rpp'] = 100
    while True:
        try:
            # Let's take 60 seconds pause before next API call;
            # 2017 twitter rate limit is 15 API calls per 15 mins in total per account;
            time.sleep(60)
            iterator = api.request('statuses/filter', queryParam).get_iterator()
            for item in iterator:
                if 'text' in item:
                    rawTextClean1 = item[u'text'].encode('utf-8')
                    rawTextClean2 = rawTextClean1.strip()
                    rawTextClean3 = rawTextClean2.replace("#", " ")  # remove hashtags
                    rawTextClean4 = re.sub(r'https?:\/\/.*[\r\n]*', '',
                                           rawTextClean3,
                                           flags=re.MULTILINE)  # remove urls
                    rawEnText = TranslationModule.getEnglish(rawTextClean4)
                    fineEnText = rawEnText.replace(",", " ").replace(";", " ")
                    emoVector = self.getEmoTaggerObject().consolodateResult(fineEnText)
                    listRes = []
                    keyRes = sorted(emoVector)
                    for key in keyRes:
                        listRes.append(emoVector[key])
                    ##### CONSOLE
                    # print '{}: {}'.format(item[u'lang'], rawTextClean2)
                    print(zip(keyRes, listRes))
                    print('\n\n\n')
                elif 'disconnect' in item:
                    event = item['disconnect']
                    if event['code'] in [2, 5, 6, 7]:
                        # something needs to be fixed before re-connecting
                        raise Exception(event['reason'])
                    else:
                        # temporary interruption, re-try request
                        break
        except TwitterRequestError as e:
            if e.status_code < 500:
                # FIX: an unrecoverable request error (auth, bad request)
                # used to be printed and then retried forever; re-raise
                # instead, matching the other liveFeedsByLocation versions
                # and the comment's intent.
                print("\n\nSomething needs to be fixed before re-connecting:\n"
                      + str(e) + "\n\n")
                raise
            else:
                # temporary interruption, re-try request
                pass
        except TwitterConnectionError:
            # temporary interruption, re-try request
            pass
def getFeedsByText(self, api=None, f1=None, isLive=True, annotation=None,
                   queryText=u'a', textLang=None, isTrain=False,
                   locationArea=None):
    """Collect tweets matching queryText (live stream or search), emo-tag
    each one, and either append training rows to f1 or run the classifiers.

    :param api: Twitter app object; defaults to self.getAppObject().
    :param f1: open, writable file for training rows (used when isTrain).
    :param isLive: True -> 'statuses/filter' stream, False -> 'search/tweets'.
    :param annotation: emotion label for recorded tweets; assumed non-None
        whenever tweets are actually recorded -- TODO confirm with callers.
    :param queryText: tracked/searched text; a leading '#' may be stripped
        on retries when no unique tweets are found.
    :param textLang: optional language code appended as ' lang:<code>'.
    :param isTrain: write CSV rows to f1 when True, else run NBC/SVC.
    :param locationArea: optional area; combined with queryText=None this
        delegates entirely to liveFeedsByLocation().
    Returns None; stops after MAX_TWEET recorded tweets, MAX_TRIES
    disconnects, or 10 iterator runs.
    """
    if api is None:
        api = self.getAppObject()
    iteratorRunCount = 0
    # FIX: set instead of list -- O(1) duplicate-id membership test
    isDuplicateList = set()
    tweetsRecorded = 0
    reTryCount = 0
    MAX_TWEET = 20
    MAX_TRIES = 10
    queryParam = {}
    while True:
        try:
            # TODO: think of better ways to handle this
            # hack, this limits the number of tweets you want to retrieve
            if iteratorRunCount >= 10:
                print("\n ASSUMPTION: there are no tweets as of now. Let's go back! \n\n")
                print(u"\n\n\n")
                return

            # Let's take 40 seconds pause; twitter rate limit is 100 API
            # calls per hour in total per account; source:
            # https://blog.twitter.com/2008/what-does-rate-limit-exceeded-mean-updated
            time.sleep(int(3600 / 100) + 4)

            if textLang is not None:
                queryText = queryText + ' lang:' + textLang
            queryParam['rpp'] = 100

            if (isLive == True) or (queryText is None):
                if queryText is not None:
                    queryParam['track'] = queryText
                if (locationArea is None) and (queryText is not None):
                    # live tweets without location filter
                    iterator = api.request('statuses/filter',
                                           queryParam).get_iterator()
                elif (locationArea is not None) and (queryText is not None):
                    # live tweets with location filter
                    queryParam['locations'] = GeoLocationModule.getGeoArea(
                        area=locationArea)
                    iterator = api.request('statuses/filter',
                                           queryParam).get_iterator()
                elif (locationArea is not None) and (queryText is None):
                    self.liveFeedsByLocation(api=api, locationArea=locationArea)
                    # FIX: liveFeedsByLocation consumes the feed itself and
                    # 'iterator' is never bound on this branch -- falling
                    # through used to raise NameError. Return instead.
                    return
                else:
                    print("ERROR: locationArea and queryText cannot be None together")
                    exit(-1)
            else:  # isLive == False
                queryParam['q'] = queryText
                if locationArea is None:
                    # search tweets without location filter
                    iterator = api.request('search/tweets',
                                           queryParam).get_iterator()
                else:
                    # search tweets with location filter
                    queryParam['locations'] = GeoLocationModule.getGeoArea(
                        area=locationArea)
                    iterator = api.request('search/tweets',
                                           queryParam).get_iterator()

            iteratorRunCount += 1
            for item in iterator:
                if (('text' in item) and (item[u'id'] not in isDuplicateList)
                        and (item[u'retweeted'] == False)):
                    rawTextClean1 = item[u'text'].encode('utf-8')
                    rawTextClean2 = rawTextClean1.strip()
                    rawTextClean3 = rawTextClean2.replace("#", " ")  # remove hashtags
                    rawTextClean4 = re.sub(r'https?:\/\/.*[\r\n]*', '',
                                           rawTextClean3,
                                           flags=re.MULTILINE)  # remove urls
                    # take tweets with sufficient text
                    if (25 < len(rawTextClean4)) and (len(item[u'text']) < 140):
                        isDuplicateList.add(item[u'id'])
                        tweetsRecorded += 1
                        rawEnText = TranslationModule.getEnglish(rawTextClean4)
                        fineEnText = rawEnText.replace(",", " ").replace(";", " ")
                        print(str(tweetsRecorded) + ":\t" + item[u'lang'] +
                              ",\t\t" + annotation.lower() + "\t\t:" +
                              queryText + "\t\t:" + str(len(fineEnText)) +
                              "\n\t:" + fineEnText)
                        emoVector = self.getEmoTaggerObject().consolodateResult(fineEnText)
                        listRes = []
                        keyRes = sorted(emoVector)
                        for key in keyRes:
                            listRes.append(emoVector[key])
                        print(listRes, keyRes)
                        # round-trip through str() so only numeric entries survive
                        listStr1 = str(listRes).replace(",", " ")
                        listStr2 = listStr1[1:-1]
                        listStr3 = listStr2.split()
                        listVector = [float(i) for i in listStr3
                                      if Utility.RepresentsNum(i)]
                        emoLabel = annotation
                        if len(listVector) != 0:
                            assert (len(listVector) == 8)  # emo-vector length should be 8;
                            if True:  # Training Only
                                emoTypesCount = 0
                                for i in range(0, 8, 1):
                                    if listVector[i] > 0.0:
                                        emoTypesCount += 1
                                if emoTypesCount == 0:
                                    emoLabel = "neutral"
                                    print(">> No Emotion \n\n\n")
                                    continue
                                elif emoTypesCount >= 5:
                                    emoLabel = "mixed"
                                    print(">> Mixed Emotion \n\n\n")
                                    continue
                                else:
                                    emoLabel = annotation
                        if isTrain == True:
                            f1.write(unicode(item[u'id_str']) + "," +
                                     unicode(item[u'created_at']) + "," +
                                     unicode(item[u'lang']) + "," +
                                     unicode(emoLabel).lower() + "," +
                                     unicode(fineEnText).replace("\n", " ").replace("\r", " ") +
                                     "," + "\n")
                            f1.flush()
                            os.fsync(f1.fileno())
                        else:
                            Supervised.getPrediction(npVector=numpy.array([listRes]),
                                                     model='NBC')
                            Supervised.getPrediction(npVector=numpy.array([listRes]),
                                                     model='SVC')
                        if (tweetsRecorded >= MAX_TWEET) or (reTryCount >= MAX_TRIES):
                            print("\n ReTry Count: " + str(reTryCount) + "\n\n")
                            print(u"\n\n\n")
                            return
                        print(u"\n\n\n")
                elif 'disconnect' in item:
                    event = item['disconnect']
                    reTryCount += 1
                    if event['code'] in [2, 5, 6, 7]:
                        # something may or may NOT need to be fixed before re-connecting
                        raise Exception(event['reason'])
                    else:
                        # temporary interruption, re-try request
                        break
                elif (iteratorRunCount > 0) and (tweetsRecorded < MAX_TWEET):
                    # Condition when no more unique tweets are found, go back
                    # TODO: think of better ways to handle this
                    if queryText[0] == '#':
                        return  # temporary return
                    queryText = queryText[1:]
                    break
                else:
                    print("\n No more tweets as of now \n\n")
                    print(u"\n\n\n")
                    return
        except TwitterRequestError as e:
            if e.status_code < 500:
                # FIX: print() call instead of the lone Python 2 print
                # statement (rest of the function already uses print()).
                print("\n\n" + "MJAGLAN EXCEPTION:\n" + str(e) + "\n\n")
            else:
                # temporary interruption, re-try request
                pass
        except TwitterConnectionError:
            # temporary interruption, re-try request
            pass
def liveFeedsByLocation(self, api=None, locationArea="New York City, NY",
                        filePath=None):
    """Stream geo-filtered live tweets forever, accumulating per-language
    tweet counts and summed emo-vectors, and dump the accumulated stats to
    filePath whenever the stream errors out.

    :param api: Twitter app object; defaults to self.getAppObject().
    :param locationArea: area name resolved via GeoLocationModule.getGeoArea.
    :param filePath: output file, opened in append mode; must be given and
        must not be the training-data file.
    """
    if api is None:
        api = self.getAppObject()
    # FIX: an explicit error beats the TypeError that open(None) used to raise
    if filePath is None:
        print("WARNING: No output file path given!")
        exit(-1)
    if filePath == self.globalCSVDataStorePath:
        print("WARNING: Attempt to write on Train Data File!")
        exit(-1)
    else:
        f2 = open(filePath, 'a+')

    langCount = {}  # language code -> number of tweets seen
    langEmo = {}    # language code -> element-wise sum of emo-vectors
    queryParam = {}
    queryParam['locations'] = GeoLocationModule.getGeoArea(area=locationArea)
    queryParam['rpp'] = 100
    recordCount = 0

    def dumpStats():
        # Append the accumulated per-language stats to f2 and force to disk.
        f2.write(u'LOCATION' + "," + u"LANGUAGE" + "," + u"LANGUAGE_COUNT" +
                 "," + u'EMO VECTOR' + "," + "\n")
        for aKey in langCount:
            f2.write(unicode(locationArea) + "," + unicode(aKey) + "," +
                     unicode(langCount[aKey]) + "," + unicode(langEmo[aKey]) +
                     "," + "\n")
        f2.write("\n\n\n")
        f2.flush()
        os.fsync(f2.fileno())

    while True:
        try:
            # Let's take 60 seconds pause before next API call;
            # 2017 twitter rate limit is 15 API calls per 15 mins in total per account;
            time.sleep(60)
            iterator = api.request('statuses/filter', queryParam).get_iterator()
            for item in iterator:
                if 'text' in item:
                    if item[u'lang'] in langCount:
                        langCount[item[u'lang']] += 1
                    else:
                        langCount[item[u'lang']] = 1
                    rawTextClean1 = item[u'text'].encode('utf-8')
                    rawTextClean2 = rawTextClean1.strip()
                    rawTextClean3 = rawTextClean2.replace("#", " ")  # remove hashtags
                    rawTextClean4 = re.sub(r'https?:\/\/.*[\r\n]*', '',
                                           rawTextClean3,
                                           flags=re.MULTILINE)  # remove urls
                    rawEnText = TranslationModule.getEnglish(rawTextClean4)
                    fineEnText = rawEnText.replace(",", " ").replace(";", " ")
                    emoVector = self.getEmoTaggerObject().consolodateResult(fineEnText)
                    listRes = []
                    keyRes = sorted(emoVector)
                    for key in keyRes:
                        listRes.append(emoVector[key])
                    ##### CONSOLE
                    # print '{}: {}'.format(item[u'lang'], rawTextClean2)
                    print(zip(keyRes, listRes))
                    print('\n\n\n')
                    listStr1 = str(listRes).replace(",", " ")
                    listStr2 = listStr1[1:-1]
                    listStr3 = listStr2.split()
                    listVector = [float(i) for i in listStr3
                                  if Utility.RepresentsNum(i)]
                    if len(listVector) == 0:
                        # FIX: guard the empty-dict case -- the original
                        # indexed langEmo.keys()[0] and crashed with
                        # IndexError when the very first tweet produced an
                        # empty vector. 8 matches the emo-vector length
                        # asserted elsewhere in this file.
                        if langEmo:
                            listVector = [0.0] * len(langEmo[list(langEmo.keys())[0]])
                        else:
                            listVector = [0.0] * 8
                    if item[u'lang'] in langEmo:
                        langEmo[item[u'lang']] = [
                            sum(x) for x in zip(langEmo[item[u'lang']], listVector)]
                    else:
                        langEmo[item[u'lang']] = listVector
                    assert len(langCount.keys()) == len(langEmo.keys())
                    ################################## ASSERT
                    # (periodic dump every 500 records -- disabled)
                    # recordCount += 1
                    # if recordCount > 500:
                    #     print("record count: " + str(recordCount))
                    #     recordCount = 0
                    #     dump stats to f2 here
                elif 'disconnect' in item:
                    event = item['disconnect']
                    if event['code'] in [2, 5, 6, 7]:
                        # something needs to be fixed before re-connecting
                        raise Exception(event['reason'])
                    else:
                        # temporary interruption, re-try request
                        break
        except TwitterRequestError as e:
            print("record count: " + str(recordCount))
            dumpStats()
            if e.status_code < 500:
                # something needs to be fixed before re-connecting
                print("\n\nSomething needs to be fixed before re-connecting:\n"
                      + str(e) + "\n\n")
            else:
                # temporary interruption, re-try request
                pass
        except TwitterConnectionError:
            # print("record count: " + str(recordCount))
            dumpStats()
            # temporary interruption, re-try request
            pass