def process_batch(self, status_ids):
    """Look up a batch of status ids, flatten each result into a row, and
    persist the rows plus any expanded URLs.

    With the Twitter API's map_=True behavior, unavailable statuses still come
    back, but carry only an "id" field; those are stored with null text values.
    """
    status_rows = []
    url_rows = []
    successes = 0

    for status in self.lookup_statuses(status_ids):
        sid = status.id  # every status has an id, even failed lookups

        # start from a null-text row; failed lookups are saved as-is
        row = {
            "status_id": sid,
            "user_id": None,
            "full_text": None,
            "created_at": None,
            "lookup_at": generate_timestamp(),
        }

        # a lookup succeeded iff the payload has more than just "id"
        if list(status._json.keys()) != ["id"]:
            successes += 1
            row["user_id"] = status.user.id
            row["full_text"] = parse_full_text(status)  # update the full text if possible
            row["created_at"] = generate_timestamp(status.created_at)
            url_rows.extend(
                {"status_id": sid, "expanded_url": entity["expanded_url"]}
                for entity in status.entities["urls"]
            )

        status_rows.append(row)

    print(generate_timestamp(), f"| SAVING BATCH OF {len(status_ids)}",
          "| STATUSES:", successes, "| URLS:", len(url_rows))
    self.save_statuses(status_rows)
    self.save_urls(url_rows)
def perform(self):
    """Stream texts and score them in batches of self.batch_size.

    Rows are buffered into a rolling batch so the full result set is never
    held in memory; each full batch is scored and the buffer reset.
    """
    self.mgr.load_model_state()
    print("----------------")
    print(f"FETCHING TEXTS...")
    print(f"SCORING TEXTS IN BATCHES...")
    batch = []
    counter = 0
    for row in self.fetch_texts():
        batch.append(row)
        if len(batch) >= self.batch_size:
            counter += len(batch)
            print(" ", generate_timestamp(), "|", fmt_n(counter))
            self.process_batch(batch)
            batch = []
    # flush the final (potentially incomplete) batch.
    # FIX: was `if any(batch)`, which tests the truthiness of each element
    # (O(n), and wrong if a row is a falsy object); a list's own truthiness
    # is the correct and idiomatic emptiness check.
    if batch:
        counter += len(batch)
        print(" ", generate_timestamp(), "|", fmt_n(counter))
        self.process_batch(batch)
def user_with_friends(row):
    """Fetch the friend list for one user row and package it, with lookup
    timestamps, as a flat dict."""
    started = generate_timestamp()
    names = sorted(get_friends(row.screen_name))
    finished = generate_timestamp()
    print(f"{finished} | {current_thread().name} | {row.user_id} | FRIENDS: {len(names)}")

    record = {"user_id": row.user_id, "screen_name": row.screen_name}
    record["friend_count"] = len(names)
    record["friend_names"] = names
    record["start_at"] = started
    record["end_at"] = finished
    return record
def perform(self):
    """Materialize all texts, split them into fixed-size batches, and score
    each batch in turn, logging a running row count."""
    print("----------------")
    print(f"FETCHING TEXTS...")
    rows = list(self.fetch_texts())

    print(f"ASSEMBLING BATCHES...")
    batches = list(split_into_batches(rows, batch_size=self.batch_size))

    print(f"SCORING TEXTS IN BATCHES...")
    processed = 0
    for number, chunk in enumerate(batches, start=1):
        processed += len(chunk)
        print(" ", generate_timestamp(), f"BATCH {number}", f"| {fmt_n(processed)}")
        self.process_batch(chunk)
def parse_timeline_status(status):
    """
    Param status (tweepy.models.Status)

    Flattens a nested status structure into a single row of non-normalized
    status and user attributes.

    NOTE: the order of the returned columns matters when inserting records
    to BQ, based on the schema definition.
    """
    # retweet attributes are only present on retweets; default to None
    retweet = getattr(status, "retweeted_status", None)
    if retweet:
        retweeted_status_id = retweet.id_str
        retweeted_user_id = retweet.user.id
        retweeted_user_screen_name = retweet.user.screen_name
    else:
        retweeted_status_id = None
        retweeted_user_id = None
        retweeted_user_screen_name = None

    return {
        "user_id": status.user.id_str,
        "status_id": status.id_str,
        "status_text": parse_string(parse_full_text(status)),
        "created_at": generate_timestamp(status.created_at),
        "geo": status.geo,
        "is_quote": status.is_quote_status,
        "truncated": status.truncated,
        "reply_status_id": status.in_reply_to_status_id_str,
        "reply_user_id": status.in_reply_to_user_id_str,
        "retweeted_status_id": retweeted_status_id,
        "retweeted_user_id": retweeted_user_id,
        "retweeted_user_screen_name": retweeted_user_screen_name,
        "lookup_at": generate_timestamp(),
    }
def perform_better(self):
    """Stream texts and score them in batches of self.batch_size.

    Rows are buffered into a rolling batch; each full batch is scored and
    the buffer reset.
    """
    print("----------------")
    print(f"FETCHING TEXTS...")
    print(f"SCORING TEXTS IN BATCHES...")
    batch = []
    counter = 0
    for row in self.fetch_texts():
        batch.append(row)
        if len(batch) >= self.batch_size:
            counter += len(batch)
            print(" ", generate_timestamp(), "|", fmt_n(counter))
            self.process_batch(batch)
            batch = []
    # FIX: the original returned without flushing the trailing partial batch,
    # silently dropping up to batch_size - 1 rows (the sibling perform()
    # handles this case); process whatever remains.
    if batch:
        counter += len(batch)
        print(" ", generate_timestamp(), "|", fmt_n(counter))
        self.process_batch(batch)
# NOTE(review): this fragment is incomplete as seen here — both `try` blocks
# open without their matching `except`/`finally`, which must lie beyond this
# view. Code tokens are unchanged; only comments added.
try:
    #
    # GET FRIENDS FOR EACH USER
    #
    # For each user id, build a lookup record (with start/end timestamps and
    # error fields, initialized to None), then collect that user's friends.
    for index, user_id in enumerate(user_ids):
        print("---------------------")
        print("USER ID:", index, user_id)
        lookup = {
            "user_id": user_id,
            "friend_count": None,   # filled in after a successful fetch
            "error_type": None,     # presumably set by the (unseen) except clause — verify downstream
            "error_message": None,
            "start_at": generate_timestamp(),
            "end_at": None,
        }
        friends = []
        try:
            # one row per friend; screen names are uppercased for storage
            for friend in job.fetch_friends(user_id):
                friends.append({
                    "user_id": user_id,
                    "friend_id": friend.id,  # friend.id_str
                    "friend_name": friend.screen_name.upper(),
                    "lookup_at": generate_timestamp(),
                })
            lookup["friend_count"] = len(friends)
def process_batch_async(self, batch):
    """Worker-thread entry point: log which thread received the batch and
    how large it is, then delegate to process_batch."""
    worker = current_thread().name
    print("PROCESSING BATCH OF TEXTS...", generate_timestamp(), " | ", len(batch), " | ", worker)
    self.process_batch(batch)
def test_generate_timestamp():
    """generate_timestamp returns a 'YYYY-MM-DD HH:MM:SS' string, with or
    without an explicit datetime argument."""
    # both call forms produce strings
    assert isinstance(generate_timestamp(), str)
    assert isinstance(generate_timestamp(datetime.now()), str)
    # formatting check against a fixed date
    halloween = datetime(2021, 10, 31)
    assert generate_timestamp(halloween) == '2021-10-31 00:00:00'