import json
import socket
from copy import copy
from datetime import datetime
from httplib import HTTPConnection
from math import log
from time import sleep

import eventlet
## (httplib only yields to other green threads if eventlet's monkey
## patching is enabled, e.g. via eventlet.monkey_patch())

## ProxyList, APIParseException, and APICursorException are defined
## elsewhere in this project; their imports are omitted here.


class TwitterCaller:

    ## constructor method
    def __init__(self, proxyList):
        ## ProxyList object to retrieve new proxy addresses from
        self.proxyList = ProxyList(proxyList, 100)
        ## number of attempted content lookups
        self.lookupAttempts = 0
        self.maxpage = 9999
        ## dictionary of month name to month number conversions
        self.months = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4,
                       'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8,
                       'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}

    ## Splits into multiple threads, each getting the
    ## screen name associated with an id in the given list.
    ## Returns the names found, as well as ids that could
    ## not be looked up.
    def getNameMulti(self, pageids):
        foundNames = {}
        badIds = []
        nids = len(pageids)
        pool = eventlet.GreenPool()
        ngotten = 0
        for nametag in pool.imap(self.getNameMultiHelper, pageids):
            ngotten += 1
            name, tag = nametag
            if tag:
                id, username, popularity, importance, birthtime = name
                foundNames[id] = (username, popularity, importance,
                                  birthtime)
            else:
                badIds.append(name)
            ## stop early once 80% of the ids have resolved; the
            ## remaining green threads are abandoned with the pool
            if ngotten > nids * .8:
                return foundNames, badIds
        return foundNames, badIds

    ## Tries to look up a user's screen name given their id,
    ## returning the name data and True if successful,
    ## and the original id and False if not.
    def getNameMultiHelper(self, pageid):
        try:
            iddata = self.getName(pageid)
            return (iddata, True)
        except:
            return (pageid, False)

    ## Given a twitter page id as a string, retrieves and parses the
    ## user lookup .json file, returning the id, screen name, follower
    ## count, log-scaled following/follower ratio, and creation date.
    def getName(self, pageid):
        lookup = 'http://api.twitter.com/1/users/lookup.json?user_id=' \
                 + pageid
        content = self.getContent(lookup)
        try:
            name = ((content.split('\"screen_name\":\"'))[1]
                    .split('\"'))[0]
            nfollowing = ((content.split('\"friends_count\":'))[1]
                          .split(','))[0]
            nfollowing = int(nfollowing)
            nfollowers = ((content.split('\"followers_count\":'))[1]
                          .split(','))[0]
            nfollowers = int(nfollowers)
            birthtime = (((content.split('\"created_at\":\"'))[1])
                         .split('\",\"'))[0]
            ## e.g. 'Wed Aug 27 13:08:45 +0000 2008' -> '2008:8:27'
            birthtime = birthtime.split()
            birthtime = birthtime[5] + ':' + \
                str(self.months[birthtime[1]]) + ':' + birthtime[2]
        ## Can't find the user data in the file
        except:
            raise APIParseException("Name Parse Error!")
        return pageid, name, nfollowers, \
            log(float(nfollowing) / float(nfollowers), 10.), birthtime
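    ## A minimal sketch, not part of the original class: the same
    ## lookup parsed with the stdlib json module instead of string
    ## splitting. The name getNameJson is hypothetical; the field
    ## names follow the Twitter v1 users/lookup payload.
    def getNameJson(self, pageid):
        lookup = 'http://api.twitter.com/1/users/lookup.json?user_id=' \
                 + pageid
        content = self.getContent(lookup)
        try:
            ## lookup.json returns a list of user objects
            user = json.loads(content)[0]
            name = user['screen_name']
            nfollowing = int(user['friends_count'])
            nfollowers = int(user['followers_count'])
            ## e.g. 'Wed Aug 27 13:08:45 +0000 2008' -> '2008:8:27'
            birth = user['created_at'].split()
            birthtime = birth[5] + ':' + str(self.months[birth[1]]) \
                + ':' + birth[2]
        except (ValueError, LookupError):
            raise APIParseException("Name Parse Error!")
        return pageid, name, nfollowers, \
            log(float(nfollowing) / float(nfollowers), 10.), birthtime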
    ## Splits into multiple threads to get the followers associated
    ## with each id in the given list.
    ## Found followers are returned as a list of lists, and ids
    ## that could not be looked up are returned in a list.
    def getFollowersMulti(self, pageids, maxpage=9999):
        self.maxpage = maxpage
        foundFollowers = []
        badIds = []
        pool = eventlet.GreenPool()
        ngotten = 0
        nids = len(pageids)
        for idtag in pool.imap(self.getFollowersMultiHelper, pageids):
            ids, tag = idtag
            if tag:
                foundFollowers.append(ids)
                ngotten += 1
            else:
                badIds.append(ids)
            ## stop early once half of the ids have resolved
            if ngotten > nids * .5:
                return foundFollowers, badIds
        return foundFollowers, badIds

    ## Tries to look up the followers associated with a given page,
    ## returning the list of followers and True if successful,
    ## and the original id and False if not.
    def getFollowersMultiHelper(self, pageid):
        try:
            ids = self.getFollowers(pageid)
            return (ids, True)
        except:
            return (pageid, False)

    ## Given a twitter page id as a string, retrieves and parses the
    ## follower .json file, returning as a list of strings the ids of
    ## the page's followers.
    def getFollowers(self, pageid, verbose=False):
        ## looks up the first page, retrying from scratch on failure
        try:
            nextcur, newids = self.getCursorFollowers(pageid, '-1')
        except APICursorException:
            return self.getFollowers(pageid, verbose)
        page = 0
        ids = newids
        if self.maxpage == 0:
            return ids
        if verbose:
            print len(ids), 'followers'
        ## looks up subsequent pages if necessary, capped at
        ## self.maxpage additional pages
        while not nextcur == '0' and page < self.maxpage:
            page += 1
            gotPage = False
            while not gotPage:
                try:
                    oldcur = nextcur
                    oldids = newids
                    nextcur, newids = self.getCursorFollowers(pageid,
                                                              nextcur)
                    gotPage = True
                ## didn't work, try again with the same cursor
                except APICursorException:
                    nextcur = oldcur
                    newids = oldids
                    gotPage = False
            ids.extend(newids)
            if verbose:
                print len(ids), 'followers'
        return ids

    ## Helper function for getFollowers.
    ## Retrieves a single page of a user's followers.
    def getCursorFollowers(self, pageid, cursor):
        lookup = 'http://api.twitter.com/1/followers/ids.json?cursor=' \
                 + cursor + '&user_id=' + pageid
        content = self.getContent(lookup)
        try:
            ids = (((content.split('\"ids\":['))[1]).split(']'))[0]
            ids = ids.split(',')
        ## Can't find followers to parse
        except:
            raise APICursorException("No followers!")
        try:
            nextcur = (((content.split('\"next_cursor_str\":\"'))[1])
                       .split('\"'))[0]
        ## Can't find the cursor for the next page
        except:
            raise APICursorException("cursor error")
        return nextcur, ids
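    ## A minimal sketch, not part of the original class: the cursored
    ## page fetch above rewritten with the json module. The name
    ## getCursorFollowersJson is hypothetical; the keys follow the
    ## Twitter v1 followers/ids payload.
    def getCursorFollowersJson(self, pageid, cursor):
        lookup = 'http://api.twitter.com/1/followers/ids.json?cursor=' \
                 + cursor + '&user_id=' + pageid
        try:
            data = json.loads(self.getContent(lookup))
            ids = [str(i) for i in data['ids']]
            nextcur = data['next_cursor_str']
        except (ValueError, LookupError):
            raise APICursorException("cursor error")
        return nextcur, ids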
    ## Splits into multiple threads to look up the tweets
    ## from each user in the given list.
    ## Returns found tweets as a dictionary keyed by id, a list of ids
    ## that could not be looked up, and a list of ids never attempted.
    def getTweetsMulti(self, userids):
        foundTweets = {}
        badIds = []
        nids = len(userids)
        pool = eventlet.GreenPool()
        ngotten = 0
        notFound = copy(userids)
        for tweettag in pool.imap(self.getTweetsMultiHelper, userids):
            ngotten += 1
            tweets, tag, id = tweettag
            if tag:
                foundTweets[id] = tweets
            else:
                badIds.append(id)
            notFound.remove(id)
            ## stop early once 80% of the ids have resolved; ids still
            ## in notFound were never attempted
            if ngotten > nids * .8:
                return foundTweets, badIds, notFound
        return foundTweets, badIds, notFound

    ## Tries to get the tweets for a given user, returning the tweets
    ## and True if successful, and the original id and False if not,
    ## along with the id itself.
    def getTweetsMultiHelper(self, id):
        try:
            tweets = self.getTweets(id)
            return (tweets, True, id)
        except APIParseException:
            return (id, False, id)

    ## Given a twitter user id as a string, retrieves and parses a
    ## .json file of the user's most recent tweets, returning them as
    ## a list of tuples, with the datetime of the tweet as the first
    ## element and the tweet string as the second.
    def getTweets(self, userid):
        tweets = []
        lookup = 'http://api.twitter.com/1/statuses/user_timeline' \
                 '.json?count=200&user_id=' + userid
        content = self.getContent(lookup)
        try:
            tweetArr = content.split('},{')
            tweetArr[-1] = (tweetArr[-1].split('}]'))[0]
            for entry in tweetArr:
                tweet = ((entry.split('\"text\":\"')[1])
                         .split('\",\"'))[0]
                tweettime = (((entry.split('\"created_at\":\"'))[1])
                             .split('\",\"'))[0]
                ## e.g. 'Wed Aug 27 13:08:45 +0000 2008'
                tweettime = tweettime.split()[1:]
                tweet_t = tweettime[2].split(':')
                tweettime = datetime(int(tweettime[4]),
                                     self.months[tweettime[0]],
                                     int(tweettime[1]),
                                     int(tweet_t[0]),
                                     int(tweet_t[1]),
                                     int(tweet_t[2]))
                tweets.append((tweettime, tweet))
        except:
            raise APIParseException("No Tweets!")
        return tweets

    ## Retrieves a given url through one of the TwitterCaller's
    ## proxies, returning the read page as a string.
    def getContent(self, lookup):
        self.lookupAttempts += 1
        ## no proxy available yet, wait and retry
        try:
            proxy = self.proxyList.getProxy()
        except:
            sleep(1)
            return self.getContent(lookup)
        try:
            connect = HTTPConnection(proxy, timeout=6)
            connect.request("GET", lookup)
            content = connect.getresponse().read()
        ## lookup timed out, report the proxy and try a new one
        except socket.timeout:
            if self.lookupAttempts > 20:
                return ''
            self.proxyList.reportBadProxy(proxy)
            content = self.getContent(lookup)
        ## any other failure: report the proxy and give up
        except Exception:
            self.proxyList.reportBadProxy(proxy)
            return ''
        self.lookupAttempts = 0
        return content
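## Example usage: a sketch, not part of the original module. The proxy
## source file name and the sample user id below are made-up
## placeholders.
if __name__ == '__main__':
    caller = TwitterCaller('proxies.txt')  ## hypothetical proxy source
    names, bad = caller.getNameMulti(['12345'])  ## placeholder id
    for uid, (username, nfollowers, importance, birth) in names.items():
        print uid, username, nfollowers, importance, birth
    followers, badIds = caller.getFollowersMulti(list(names), maxpage=1)
    print len(followers), 'follower lists retrieved,', \
        len(badIds), 'lookups failed'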