def on_data(self, data):
    """Handle one raw payload from the Twitter stream.

    Parses the JSON body; if it is a real tweet, extracts term counts
    and queues them for sending. Non-tweet messages (deletes, limit
    notices, etc. have no 'text' key) are logged and skipped.

    Args:
        data: Raw JSON string for a single stream message.

    Returns:
        True, which tells the stream listener to keep the connection open.
    """
    status = json.loads(data)
    if "text" not in status:
        # Single-arg print() call form is valid on both Python 2 and 3;
        # the original Python-2-only print statement is a SyntaxError on 3.
        print("Text is missing from the tweet body.")
        print(status)
    else:
        terms_dict = extract.process_status(status['text'])
        # Twitter timestamps are fixed-offset UTC (+0000); collapse to
        # minute resolution (YYYYMMDDHHMM).
        created_at = datetime.strptime(
            status['created_at'],
            '%a %b %d %H:%M:%S +0000 %Y').strftime('%Y%m%d%H%M')
        # NOTE(review): datetime.now() is naive local time while created_at
        # is UTC -- the two disagree unless the host clock runs UTC;
        # consider datetime.utcnow() here. Left unchanged pending confirm.
        now = datetime.now().strftime('%Y%m%d%H%M')
        self.miner.queue_for_sending(status['id'], terms_dict, created_at, now)
    return True
def on_data(self, data):
    """Process a raw stream message: parse, extract terms, and enqueue.

    Messages without a 'text' key are not tweets (e.g. delete or limit
    notices); those are printed and ignored.

    Args:
        data: Raw JSON string for one message from the stream.

    Returns:
        True so the stream stays connected.
    """
    status = json.loads(data)
    if "text" not in status:
        # print() with a single argument behaves identically on Python 2
        # and 3; the original bare print statement breaks under Python 3.
        print("Text is missing from the tweet body.")
        print(status)
    else:
        terms_dict = extract.process_status(status['text'])
        # created_at arrives as a fixed UTC offset string; keep only
        # minute precision for bucketing.
        created_at = datetime.strptime(
            status['created_at'],
            '%a %b %d %H:%M:%S +0000 %Y').strftime('%Y%m%d%H%M')
        # NOTE(review): naive local now() vs UTC created_at -- these only
        # line up on a UTC host; datetime.utcnow() is likely intended.
        now = datetime.now().strftime('%Y%m%d%H%M')
        self.miner.queue_for_sending(status['id'], terms_dict, created_at, now)
    return True
def download_timelines(self, api):
    """Download recent timelines for every user in this category and queue terms.

    For each comma-separated user name, pulls up to 16 pages of statuses,
    extracts term counts per status, and queues them for sending. Setting
    self.is_downloading to False from another thread cancels the run.

    Args:
        api: Authenticated tweepy API client.
    """
    self.is_downloading = True
    for user_name in self.category.users.split(','):
        self.log("Downloading timeline for {}".format(user_name))
        user = api.get_user(user_name)
        # Materialize all pages up front so the progress log can show the
        # true total page count.
        # NOTE(review): Twitter caps user_timeline at 200 statuses per
        # page; count=800 is silently clamped -- confirm intent.
        page_list = []
        for page in tweepy.Cursor(api.user_timeline, user_id=user.id,
                                  count=800, include_rts=True).pages(16):
            page_list.append(page)
        for idx, page in enumerate(page_list):
            # idx + 1 so progress reads "1/N".."N/N" rather than the
            # off-by-one "0/N".."N-1/N" of the raw 0-based index.
            self.log("user:{} page:{}/{} statuses:{}".format(
                user_name, idx + 1, len(page_list), len(page)))
            for status in page:
                terms_dict = extract.process_status(status.text)
                created_at = status.created_at.strftime('%Y%m%d%H%M')
                now = datetime.now().strftime('%Y%m%d%H%M')
                self.queue_for_sending(status.id, terms_dict,
                                       created_at, now)
                # Check the cancel flag per status for prompt shutdown.
                if not self.is_downloading:
                    return  # stop downloading
def download_timelines(self, api):
    """Fetch and queue term counts from the timelines of this category's users.

    Iterates the comma-separated user list, downloads up to 16 pages per
    user, and enqueues extracted terms per status. Clearing
    self.is_downloading aborts the download early.

    Args:
        api: Authenticated tweepy API client.
    """
    self.is_downloading = True
    for user_name in self.category.users.split(','):
        self.log("Downloading timeline for {}".format(user_name))
        user = api.get_user(user_name)
        # Collect every page first so the log line below can report the
        # total number of pages.
        # NOTE(review): the API limits user_timeline to 200 per page, so
        # count=800 is clamped server-side -- verify this is intended.
        page_list = list(
            tweepy.Cursor(api.user_timeline, user_id=user.id,
                          count=800, include_rts=True).pages(16))
        total = len(page_list)
        # start=1 gives human-friendly "1/N".."N/N" progress instead of
        # the off-by-one 0-based display.
        for page_no, page in enumerate(page_list, start=1):
            self.log("user:{} page:{}/{} statuses:{}".format(
                user_name, page_no, total, len(page)))
            for status in page:
                terms_dict = extract.process_status(status.text)
                created_at = status.created_at.strftime('%Y%m%d%H%M')
                now = datetime.now().strftime('%Y%m%d%H%M')
                self.queue_for_sending(status.id, terms_dict,
                                       created_at, now)
                # Per-status cancellation check for prompt shutdown.
                if not self.is_downloading:
                    return  # stop downloading