class TweetDumper(object):
    """Retrieve a user's tweets page by page and persist them via Collector.

    If you need to update a preexisting database just run with page=0 and
    interrupt the script as soon as you see the Skipping warning.
    """

    ARGS = ('user', 'page')
    DESC = "Retrieve tweets of <user> starting from <page>"
    # Name of the Collector method used to persist each downloaded batch.
    METHOD = 'save_tweet'
    URL = "http://api.twitter.com/1/statuses/user_timeline.json?" \
          "&count=200&page={:d}"

    def __init__(self):
        self.url = self.URL
        self.collector = Collector()
        self.invoker = Requester('proxylist')

    def dump(self, politician, page=1):
        """Download every page of tweets for *politician*, starting at *page*.

        politician -- Twitter user id (all digits) or screen name (string).
        page -- 1-based page number to start from; may be passed as a string.

        The finally clause commits whatever was collected even when the
        loop is interrupted (e.g. by Ctrl-C or a network error).
        """
        try:
            page = int(page)
            url = self.url
            # A purely numeric identifier is treated as a user id,
            # anything else as a screen name.
            if politician.isdigit():
                url += '&user_id={:s}'
            else:
                url += '&screen_name={:s}'
            while True:
                print("Retrieving tweets at page {:d}".format(page))
                response, content = self.invoker.request(
                    url.format(page, politician)
                )
                collection = json.loads(content)
                # An empty page means we have walked past the last tweet.
                if not collection:
                    break
                # Call the configured persistence method directly instead of
                # going through meth.__call__.
                getattr(self.collector, self.METHOD)(collection)
                page += 1
        finally:
            print("Committing changes to the database")
            self.collector.save()
            self.invoker.save('proxylist')
def __init__(self, userlist):
    """Load pending user ids from *userlist* and open the gzip dump file.

    userlist -- path to a text file holding one numeric user id per line;
                the same path is rewritten as ids are consumed so that an
                interrupted run can resume where it left off.
    """
    self.filename = userlist
    # Context manager so the id file is closed promptly instead of leaking
    # the handle returned by a bare open().
    with open(userlist) as handle:
        self.userlist = [int(line.strip()) for line in handle]
    # Stored as float so progress percentages use true division.
    self.total = float(len(self.userlist))
    self.current = 0
    # Append mode: repeated runs keep extending the same compressed dump.
    self.dumpfile = GzipFile(userlist + ".json.gz", "a")
    self.invoker = Requester()
class TweetDumper(object):
    """Dump tweets and retweets for every user listed in a file to a gzip file.

    The remaining user ids are checkpointed back to the input file after each
    user, so an interrupted run can be resumed by re-running with the same
    file.
    """

    # First URL: regular timeline (200 per page); second: retweets (100 per page).
    URLS = ("http://api.twitter.com/1/statuses/user_timeline.json?" \
            "&count=200&page={:d}",
            "http://api.twitter.com/1/statuses/retweeted_by_user.json" \
            "?count=100&page={:d}")

    def __init__(self, userlist):
        """Load pending user ids from *userlist* and open the gzip dump file.

        userlist -- path to a text file with one numeric user id per line.
        """
        self.filename = userlist
        # Context manager so the id file handle is not leaked.
        with open(userlist) as handle:
            self.userlist = [int(line.strip()) for line in handle]
        # Float so the progress percentage uses true division.
        self.total = float(len(self.userlist))
        self.current = 0
        # Append mode: repeated runs keep extending the same compressed dump.
        self.dumpfile = GzipFile(userlist + ".json.gz", "a")
        self.invoker = Requester()

    def run(self):
        """Process every pending user; always checkpoint and close on exit."""
        try:
            while self.userlist:
                self.current += 1
                self.dump(self.userlist[0], 1)
                # Remove the id only after it was fully dumped.
                self.userlist.pop(0)
                self.save_progress()
        finally:
            self.save_progress()
            self.close_dump()

    def close_dump(self):
        """Flush and close the gzip dump file."""
        self.dumpfile.close()

    def save_progress(self):
        """Rewrite the user-list file with the ids not yet processed."""
        with open(self.filename, "w") as f:
            for i in self.userlist:
                f.write(str(i) + "\n")

    def dump(self, politician, page=1):
        """Download all timeline and retweet pages for a single user.

        politician -- Twitter user id (all digits) or screen name.
        page -- 1-based page number to start each of the two URL walks from.
        """
        page = int(page)
        oldpage = page
        politician = str(politician)
        msgs = (
            'Retrieving tweets at page {:d} for user {:s}',
            'Retrieving retweets at page {:d} for user {:s}'
        )
        print("Percentage: %.2f" % (self.current / self.total))
        for url, msg in zip(self.URLS, msgs):
            # A purely numeric identifier is treated as a user id,
            # anything else as a screen name.
            if politician.isdigit():
                url += '&user_id={:s}'
            else:
                url += '&screen_name={:s}'
            while True:
                print(msg.format(page, politician))
                response, content = self.invoker.request(
                    url.format(page, politician)
                )
                # 401: protected account -- skip this feed entirely.
                if response['status'] == '401':
                    break
                try:
                    collection = json.loads(content)
                except ValueError:
                    # Narrowed from a bare except; a bad payload retries the
                    # same page. NOTE(review): a permanently broken response
                    # would loop forever here -- confirm whether a retry cap
                    # is wanted.
                    print("Error decoding json", response['status'])
                    continue
                # An empty page means we walked past the last entry.
                if not collection:
                    break
                self.dumpfile.write(content + "\n")
                page += 1
            # Restart the page counter for the second (retweets) URL.
            page = oldpage
def __init__(self):
    """Wire up the working URL, the tweet collector and the proxy-backed requester."""
    self.collector = Collector()
    self.invoker = Requester('proxylist')
    self.url = self.URL