Beispiel #1
0
    def process_batch(self, status_ids):
        """
        Looks up a batch of statuses, then saves one record per returned status and one record per url entity.

        Param status_ids : passed through to self.lookup_statuses()

        A status whose lookup came back empty is still saved, as a placeholder record with null values.
        """
        status_records = []
        url_records = []
        found_count = 0

        for status in self.lookup_statuses(status_ids):
            # when passing param map_=True to Twitter API, unavailable statuses are still present, but only have an id field
            status_id = status.id  # all statuses will have an id

            # start from a "failed lookup" placeholder, and fill it in below if the status is available
            record = {
                "status_id": status_id,
                "user_id": None,
                "full_text": None,
                "created_at": None,
                "lookup_at": generate_timestamp()
            }

            if list(status._json.keys()) == ["id"]:
                # empty status: keep the placeholder as-is
                status_records.append(record)
                continue

            found_count += 1
            record["user_id"] = status.user.id
            record["full_text"] = parse_full_text(status)
            record["created_at"] = generate_timestamp(status.created_at)
            url_records.extend([
                {"status_id": status_id, "expanded_url": url["expanded_url"]}
                for url in status.entities["urls"]
            ])
            status_records.append(record)

        print(generate_timestamp(), f"| SAVING BATCH OF {len(status_ids)}",
              "| STATUSES:", found_count, "| URLS:", len(url_records))
        self.save_statuses(status_records)
        self.save_urls(url_records)
    def perform(self):
        """
        Loads the model state, then streams rows from self.fetch_texts() and hands them to self.process_batch() in batches.

        Rows are accumulated until the batch reaches self.batch_size items, at which point a progress line
        (timestamp and running row count) is printed and the batch is processed.
        Any rows left over after the stream is exhausted are processed as a final, smaller batch.
        """
        self.mgr.load_model_state()

        print("----------------")
        print("FETCHING TEXTS...")
        print("SCORING TEXTS IN BATCHES...")

        batch = []
        counter = 0
        for row in self.fetch_texts():
            batch.append(row)

            if len(batch) >= self.batch_size:
                counter += len(batch)
                print("  ", generate_timestamp(), "|", fmt_n(counter))

                self.process_batch(batch)
                batch = []

        # process final (potentially incomplete) batch
        # test for emptiness rather than any(batch): a leftover batch made up only of falsy rows must not be skipped
        if batch:
            counter += len(batch)
            print("  ", generate_timestamp(), "|", fmt_n(counter))

            self.process_batch(batch)
def user_with_friends(row):
    """
    Fetches the friends of a single user and returns a summary record (dict).

    Param row : an object with "user_id" and "screen_name" attributes

    Prints a progress line (end timestamp, current thread name, user id, friend count) once the friends are fetched.
    The returned friend names are sorted, and the record includes timestamps taken before and after the fetch.
    """
    start_at = generate_timestamp()

    friend_names = list(get_friends(row.screen_name))
    friend_names.sort()

    end_at = generate_timestamp()
    print(f"{end_at} | {current_thread().name} | {row.user_id} | FRIENDS: {len(friend_names)}")

    result = {}
    result["user_id"] = row.user_id
    result["screen_name"] = row.screen_name
    result["friend_count"] = len(friend_names)
    result["friend_names"] = friend_names
    result["start_at"] = start_at
    result["end_at"] = end_at
    return result
Beispiel #4
0
    def perform(self):
        """
        Fetches all rows up front, splits them into batches, and hands each batch to self.process_batch().

        Prints a progress line (timestamp, 1-based batch number, running row count) before each batch is processed.
        """
        print("----------------")

        print("FETCHING TEXTS...")
        texts = list(self.fetch_texts())

        print("ASSEMBLING BATCHES...")
        batches = list(split_into_batches(texts, batch_size=self.batch_size))

        print("SCORING TEXTS IN BATCHES...")
        counter = 0
        for index, batch in enumerate(batches):
            counter += len(batch)
            logged_at = generate_timestamp()
            print("  ", logged_at, f"BATCH {index+1}", f"| {fmt_n(counter)}")
            self.process_batch(batch)
def parse_timeline_status(status):
    """
    Flattens a nested status structure into a single row (dict) of non-normalized status and user attributes.

    Param status (tweepy.models.Status)

    The retweet-related values are None unless the status has a truthy "retweeted_status" attribute.
    """
    retweet = getattr(status, "retweeted_status", None)
    if retweet:
        retweet_attrs = (retweet.id_str, retweet.user.id, retweet.user.screen_name)
    else:
        retweet_attrs = (None, None, None)
    retweeted_status_id, retweeted_user_id, retweeted_user_screen_name = retweet_attrs

    # NOTE: the order of these columns matters, when inserting records to BQ, based on the schema definition
    return {
        "user_id": status.user.id_str,
        "status_id": status.id_str,
        "status_text": parse_string(parse_full_text(status)),
        "created_at": generate_timestamp(status.created_at),

        "geo": status.geo,
        "is_quote": status.is_quote_status,
        "truncated": status.truncated,

        "reply_status_id": status.in_reply_to_status_id_str,
        "reply_user_id": status.in_reply_to_user_id_str,
        "retweeted_status_id": retweeted_status_id,
        "retweeted_user_id": retweeted_user_id,
        "retweeted_user_screen_name": retweeted_user_screen_name,

        "lookup_at": generate_timestamp()
    }
Beispiel #6
0
    def perform_better(self):
        """
        Streams rows from self.fetch_texts() and hands them to self.process_batch() in batches.

        Rows are accumulated until the batch reaches self.batch_size items, at which point a progress line
        (timestamp and running row count) is printed and the batch is processed.
        Any rows left over after the stream is exhausted are processed as a final, smaller batch.
        """
        print("----------------")
        print("FETCHING TEXTS...")
        print("SCORING TEXTS IN BATCHES...")

        batch = []
        counter = 0
        for row in self.fetch_texts():
            batch.append(row)

            if len(batch) >= self.batch_size:
                counter += len(batch)
                print("  ", generate_timestamp(), "|", fmt_n(counter))
                self.process_batch(batch)
                batch = []

        # process final (potentially incomplete) batch, otherwise the trailing rows would be silently dropped
        if batch:
            counter += len(batch)
            print("  ", generate_timestamp(), "|", fmt_n(counter))
            self.process_batch(batch)
    try:

        #
        # GET FRIENDS FOR EACH USER
        #

        for index, user_id in enumerate(user_ids):
            print("---------------------")
            print("USER ID:", index, user_id)

            lookup = {
                "user_id": user_id,
                "friend_count": None,
                "error_type": None,
                "error_message": None,
                "start_at": generate_timestamp(),
                "end_at": None
            }
            friends = []

            try:

                for friend in job.fetch_friends(user_id):
                    friends.append({
                        "user_id": user_id,
                        "friend_id": friend.id,  # friend.id_str
                        "friend_name": friend.screen_name.upper(),
                        "lookup_at": generate_timestamp(),
                    })

                lookup["friend_count"] = len(friends)
 def process_batch_async(self, batch):
     """Prints a progress line (timestamp, batch size, current thread name), then hands the batch to self.process_batch()."""
     progress = ("PROCESSING BATCH OF TEXTS...", generate_timestamp(), " | ", len(batch), " | ", current_thread().name)
     print(*progress)
     self.process_batch(batch)
def test_generate_timestamp():
    """Checks that generate_timestamp() returns a string, with or without a datetime, in the expected format."""
    # without an argument
    default_stamp = generate_timestamp()
    assert isinstance(default_stamp, str)

    # with an explicit datetime
    explicit_stamp = generate_timestamp(datetime.now())
    assert isinstance(explicit_stamp, str)

    # a known datetime pins down the exact output format
    known_dt = datetime(2021, 10, 31)
    assert generate_timestamp(known_dt) == '2021-10-31 00:00:00'