Example #1
    def perform(self):
        self.mgr.load_model_state()

        print("----------------")
        print(f"FETCHING TEXTS...")
        print(f"SCORING TEXTS IN BATCHES...")

        batch = []
        counter = 0
        for row in self.fetch_texts():
            batch.append(row)

            if len(batch) >= self.batch_size:
                counter += len(batch)
                print("  ", generate_timestamp(), "|", fmt_n(counter))

                self.process_batch(batch)
                batch = []

        # process final (potentially incomplete) batch
        if batch:
            counter += len(batch)
            print("  ", generate_timestamp(), "|", fmt_n(counter))

            self.process_batch(batch)
            batch = []
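
The examples in this listing repeatedly call small logging helpers (fmt_n, logstamp, generate_timestamp) whose definitions are not shown. A rough, hypothetical sketch of what they presumably do, for readability only (not the project's actual implementation):

    from datetime import datetime

    def fmt_n(large_number):
        # presumably formats a number with thousands separators, e.g. 1234567 -> "1,234,567"
        return f"{large_number:,}"

    def logstamp():
        # presumably returns a human-readable timestamp for log lines
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")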
Example #2
    def perform(self):
        self.write_metadata_to_file()
        self.upload_metadata()

        self.start()
        self.graph = DiGraph()
        self.running_results = []

        users = list(self.bq_service.fetch_random_users(limit=self.users_limit, topic=self.topic,
                                                        start_at=self.convo_start_at, end_at=self.convo_end_at))
        print("FETCHED", len(users), "USERS")
        screen_names = sorted([row["user_screen_name"] for row in users])

        for row in self.bq_service.fetch_specific_user_friends(screen_names=screen_names):
            self.counter += 1

            if not self.dry_run:
                self.graph.add_edges_from([(row["screen_name"], friend) for friend in row["friend_names"]])

            if self.counter % self.batch_size == 0:
                rr = {"ts": logstamp(), "counter": self.counter, "nodes": len(self.graph.nodes), "edges": len(self.graph.edges)}
                print(rr["ts"], "|", fmt_n(rr["counter"]), "|", fmt_n(rr["nodes"]), "|", fmt_n(rr["edges"]))
                self.running_results.append(rr)

        self.end()
        self.report()
        self.write_results_to_file()
        self.upload_results()
        self.write_graph_to_file()
        self.upload_graph()
Example #3
    def perform(self):
        self.start()
        self.write_metadata_to_file()
        self.upload_metadata()

        print(logstamp(), "CONSTRUCTING GRAPH OBJECT...")
        self.graph = DiGraph()
        self.running_results = []
        self.cursor.execute(self.sql)
        while True:
            batch = self.cursor.fetchmany(size=self.batch_size)
            if not batch: break
            self.counter += len(batch)

            if not self.dry_run:
                for row in batch:
                    self.graph.add_edges_from([(row["screen_name"], friend) for friend in row["friend_names"]])

            rr = {"ts": logstamp(), "counter": self.counter, "nodes": len(self.graph.nodes), "edges": len(self.graph.edges)}
            print(rr["ts"], "|", fmt_n(rr["counter"]), "|", fmt_n(rr["nodes"]), "|", fmt_n(rr["edges"]))
            self.running_results.append(rr)

        self.cursor.close()
        self.connection.close()
        print(logstamp(), "GRAPH CONSTRUCTED!")
        self.report()

        self.write_results_to_file()
        self.upload_results()

        self.write_graph_to_file()
        self.upload_graph()

        self.end()
Example #4
 def running_results(self):
     rr = {"ts": logstamp(),
         "counter": self.counter,
         "nodes": self.graph.number_of_nodes(),
         "edges": self.graph.number_of_edges()
     }
     print(rr["ts"], "|", fmt_n(rr["counter"]), "|", fmt_n(rr["nodes"]), "|", fmt_n(rr["edges"]))
     return rr
Example #5
    def similarity_graph_report(self):
        if not self.similarity_graph:
            self.similarity_graph = self.load_similarity_graph()

        print("-------------------")
        print("SIMILARITY GRAPH", type(self.similarity_graph))
        print("  NODES:", fmt_n(self.similarity_graph.number_of_nodes()))
        print("  EDGES:", fmt_n(self.similarity_graph.number_of_edges()))
        print("-------------------")
Example #6
    def report(self):
        if not self.graph:
            self.graph = self.load_graph()

        print("-------------------")
        print(type(self.graph))
        print("  NODES:", fmt_n(self.node_count))
        print("  EDGES:", fmt_n(self.edge_count))
        print("-------------------")
Example #7
 def report(self, graph):
     """
     Params: graph (DiGraph)
     """
     print("-------------------")
     print(type(graph))
     print("  NODES:", fmt_n(graph.number_of_nodes()))
     print("  EDGES:", fmt_n(graph.number_of_edges()))
     print("-------------------")
Example #8
    def perform(self):
        """
        Given:
            bot_ids (list) a unique list of bot ids, which should all be included as nodes in the bot retweet graph.
                The retweet graph will also contain retweeted users. So that's why we need a separate list.
                The bot ids will be used as nodes in the similarity graph.

            bot_retweet_graph (networkx.DiGraph) a retweet graph generated from the bot list

        Returns: a similarity graph (networkx.Graph), where the similarity is based on the Jaccard index.
            For each pair of bots we calculate the Jaccard index based on the sets of people they retweet.
            If two bots retweet exactly the same users, their Jaccard index is one.
            If they don't retweet anyone in common, their Jaccard index is zero.
        """

        grapher.retweet_graph_report()

        bot_ids = [
            row.user_id
            for row in self.bq_service.fetch_bot_ids(bot_min=self.bot_min)
        ]
        print("FETCHED", fmt_n(len(bot_ids)), "BOT IDS")

        node_pairs = []
        for i, bot_id in enumerate(bot_ids):
            for other_bot_id in bot_ids[i + 1:]:
                if self.retweet_graph.has_node(
                        other_bot_id) and self.retweet_graph.has_node(bot_id):
                    node_pairs.append((bot_id, other_bot_id))
        # could maybe just take the combinations between all nodes in the bot graph
        # because we can assume they were assembled using the same bot ids as the ones here
        # but the point is to be methodologically sound and it doesn't take that long
        print("NODE PAIRS:", fmt_n(len(node_pairs)))

        results = jaccard_coefficient(self.retweet_graph.to_undirected(),
                                      node_pairs)
        #> returns an iterator of 3-tuples in the form (u, v, p)
        #> where (u, v) is a pair of nodes and p is their Jaccard coefficient.
        print("JACCARD COEFFICIENTS BETWEEN EACH NODE PAIR - COMPLETE!"
              )  #, fmt_n(len(list(results))))

        print("CONSTRUCTING SIMILARITY GRAPH...")
        self.similarity_graph = Graph()
        edge_count = 0
        #positive_results = [r for r in results if r[2] > 0] # this takes a while, maybe let's just stick with the original iterator approach
        for bot_id, other_bot_id, similarity_score in results:
            if similarity_score > 0:
                self.similarity_graph.add_edge(bot_id,
                                               other_bot_id,
                                               weight=similarity_score)
                edge_count += 1

            self.counter += 1
            if self.counter % self.batch_size == 0:
                print(logstamp(), "|", fmt_n(self.counter), "|",
                      fmt_n(edge_count), "EDGES")
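
The docstring above describes the Jaccard logic: identical retweet sets give a coefficient of one, disjoint sets give zero. A minimal, self-contained sketch of that behavior using networkx's jaccard_coefficient (the node names below are hypothetical, not from the dataset):

    from networkx import DiGraph, jaccard_coefficient

    rt_graph = DiGraph()
    rt_graph.add_edges_from([
        ("bot_a", "celebrity"), ("bot_a", "politician"),  # bot_a retweets two users
        ("bot_b", "celebrity"), ("bot_b", "politician"),  # bot_b retweets the same two users
        ("bot_c", "journalist"),                          # bot_c retweets nobody in common with the others
    ])

    pairs = [("bot_a", "bot_b"), ("bot_a", "bot_c")]
    for u, v, p in jaccard_coefficient(rt_graph.to_undirected(), pairs):
        print(u, v, p)
    #> bot_a bot_b 1.0 (identical retweet sets)
    #> bot_a bot_c 0.0 (no retweeted users in common)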
Example #9
    def __init__(self, model_name=MODEL_NAME, limit=LIMIT, batch_size=BATCH_SIZE, bq_service=None):
        self.model_name = model_name.lower().replace(";", "")  # using this model name in queries, so be super safe about SQL injection, although it's not a concern right now
        self.limit = limit
        self.batch_size = batch_size
        self.bq_service = bq_service or BigQueryService()

        print("----------------")
        print("TOXICITY SCORER...")
        print("  MODEL:", self.model_name.upper())
        print("  LIMIT:", fmt_n(self.limit))
        print("  BATCH SIZE:", fmt_n(self.batch_size))
Example #10
 def compile_energy_graph(self):
     print("COMPILING ENERGY GRAPH...")
     self.energy_graph, self.bot_ids, self.user_data = compute_energy_graph(
         self.rt_graph, self.prior_probabilities, self.link_energies,
         self.out_degrees, self.in_degrees)
     #self.human_names = list(set(self.rt_graph.nodes()) - set(self.bot_ids))
     print("-----------------")
     print("ENERGY GRAPH:", type(self.energy_graph))
     print("NODE COUNT:", fmt_n(self.energy_graph.number_of_nodes()))
     print(
         f"BOT COUNT: {fmt_n(len(self.bot_ids))} ({fmt_pct(len(self.bot_ids) / self.energy_graph.number_of_nodes())})"
     )
     print("USER DATA:", fmt_n(len(self.user_data.keys())))
Example #11
    def alpha(self):
        """Params for the link_energy function"""
        in_degrees_list = [v for _, v in self.in_degrees]
        out_degrees_list = [v for _, v in self.out_degrees]
        print("MAX IN:", fmt_n(max(in_degrees_list)))  #> 76,617
        print("MAX OUT:", fmt_n(max(out_degrees_list)))  #> 5,608

        alpha_in = np.quantile(in_degrees_list, self.alpha_percentile)
        alpha_out = np.quantile(out_degrees_list, self.alpha_percentile)
        print("ALPHA IN:", fmt_n(alpha_in))  #> 2,252
        print("ALPHA OUT:", fmt_n(alpha_out))  #> 1,339

        return [self.mu, alpha_out, alpha_in]
Example #12
    def memory_report(self):
        if not self.graph:
            self.graph = self.load_graph()

        #memory_load = memory_usage(self.read_graph_from_file, interval=.2, timeout=1)
        file_size = os.path.getsize(self.local_graph_filepath) # in bytes
        print("-------------------")
        print(type(self.graph))
        print("  NODES:", fmt_n(self.node_count))
        print("  EDGES:", fmt_n(self.edge_count))
        print("  FILE SIZE:", fmt_n(file_size))
        print("-------------------")

        return {"nodes": self.node_count, "edges": self.edge_count, "file_size": file_size}
Example #13
    def perform(self):
        self.graph = DiGraph()
        self.running_results = []

        for row in self.bq_service.fetch_user_friends_in_batches():
            self.counter += 1

            if not self.dry_run:
                self.graph.add_edges_from([(row["screen_name"], friend) for friend in row["friend_names"]])

            if self.counter % self.batch_size == 0:
                rr = {"ts": logstamp(), "counter": self.counter, "nodes": len(self.graph.nodes), "edges": len(self.graph.edges)}
                print(rr["ts"], "|", fmt_n(rr["counter"]), "|", fmt_n(rr["nodes"]), "|", fmt_n(rr["edges"]))
                self.running_results.append(rr)
Example #14
    def download_user_friends(self):
        self.start_at = time.perf_counter()
        self.batch = []
        self.counter = 0

        if self.pg_destructive and UserFriend.__table__.exists():
            print("DROPPING THE USER FRIENDS TABLE!")
            UserFriend.__table__.drop(self.pg_engine)
            self.pg_session.commit()

        if not UserFriend.__table__.exists():
            print("CREATING THE USER FRIENDS TABLE!")
            UserFriend.__table__.create(self.pg_engine)
            self.pg_session.commit()

        print(logstamp(), "DATA FLOWING...")
        for row in self.bq_service.fetch_user_friends_in_batches(
                limit=self.users_limit):
            self.batch.append({
                "user_id": row["user_id"],
                "screen_name": row["screen_name"],
                "friend_count": row["friend_count"],
                "friend_names": row["friend_names"]
            })
            self.counter += 1

            if len(self.batch) >= self.batch_size:
                print(logstamp(), fmt_n(self.counter), "SAVING BATCH...")
                self.pg_session.bulk_insert_mappings(UserFriend, self.batch)
                self.pg_session.commit()
                self.batch = []

        print("ETL COMPLETE!")
        self.end_at = time.perf_counter()
        self.pg_session.close()
Example #15
    def download_retweeter_details(self):
        self.start_job()
        self.destructively_migrate(RetweeterDetail)

        print(logstamp(), "DATA FLOWING...")
        for row in self.bq_service.fetch_retweeter_details_in_batches(
                limit=self.users_limit):
            item = {
                "user_id": row['user_id'],
                "verified": row["verified"],
                "created_at": row["created_at"],
                "screen_name_count": row["screen_name_count"],
                "name_count": row["name_count"],
                "retweet_count": row["retweet_count"],
            }
            self.batch.append(item)
            self.counter += 1

            # temporarily testing individual inserts...
            #record = RetweeterDetail(**item)
            #self.pg_session.add(record)
            #self.pg_session.commit()

            if len(self.batch) >= self.batch_size:
                print(logstamp(), fmt_n(self.counter), "SAVING BATCH...")
                self.pg_session.bulk_insert_mappings(RetweeterDetail,
                                                     self.batch)
                self.pg_session.commit()
                self.batch = []

        self.end_job()
Example #16
 def bot_probabilities_df(self):
     df = DataFrame(list(self.bot_probabilities.items()),
                    columns=["user_id", "bot_probability"])
     df.index.name = "row_id"
     df.index = df.index + 1
     print("--------------------------")
     print("CLASSIFICATION COMPLETE!")
     print(df.head())
     print("... < 50% (NOT BOTS):",
           fmt_n(len(df[df["bot_probability"] < 0.5])))
     print("... = 50% (NOT BOTS):",
           fmt_n(len(df[df["bot_probability"] == 0.5])))
     print("... > 50% (MAYBE BOTS):",
           fmt_n(len(df[df["bot_probability"] > 0.5])))
     print("... > 90% (LIKELY BOTS):",
           fmt_n(len(df[df["bot_probability"] > 0.9])))
     return df
Example #17
    def perform(self):
        self.start()
        self.write_metadata_to_file()
        self.upload_metadata()

        self.edges = []
        self.running_results = []
        for row in self.bq_service.fetch_user_friends_in_batches(
                limit=self.users_limit):
            self.counter += 1

            if not self.dry_run:
                self.edges += [(row["screen_name"], friend)
                               for friend in row["friend_names"]]

            if self.counter % self.batch_size == 0:
                rr = {
                    "ts": logstamp(),
                    "counter": self.counter,
                    "edges": len(self.edges)
                }
                print(rr["ts"], "|", fmt_n(rr["counter"]), "|",
                      fmt_n(rr["edges"]))
                self.running_results.append(rr)

        self.write_results_to_file()
        self.upload_results()

        self.write_edges_to_file()
        self.upload_edges()

        print(logstamp(), "CONSTRUCTING GRAPH OBJECT...")
        self.graph = DiGraph(self.edges)
        print(logstamp(), "GRAPH CONSTRUCTED!")
        self.report()

        del self.running_results  # remove in hopes of freeing up some memory
        del self.edges  # remove in hopes of freeing up some memory

        self.write_graph_to_file()
        #del self.graph # remove in hopes of freeing up some memory
        self.upload_graph()

        self.end()
Example #18
    def download_retweeter_details(self):
        self.start_at = time.perf_counter()
        self.batch = []
        self.counter = 0

        if self.pg_destructive and RetweeterDetail.__table__.exists():
            print("DROPPING THE RETWEETER DETAILS TABLE!")
            RetweeterDetail.__table__.drop(self.pg_engine)
            self.pg_session.commit()

        if not RetweeterDetail.__table__.exists():
            print("CREATING THE RETWEETER DETAILS TABLE!")
            RetweeterDetail.__table__.create(self.pg_engine)
            self.pg_session.commit()

        print(logstamp(), "DATA FLOWING LIKE WATER...")
        for row in self.bq_service.fetch_retweeter_details_in_batches(
                limit=self.users_limit):
            item = {
                "user_id": row['user_id'],
                "verified": row["verified"],
                "created_at": row["created_at"],
                "screen_name_count": row["screen_name_count"],
                "name_count": row["name_count"],
                "retweet_count": row["retweet_count"],
                # # todo: these topics are specific to the impeachment dataset, so will need to generalize if/when working with another topic (leave for future concern)
                # "ig_report":           row["ig_report"],
                # "ig_hearing":          row["ig_hearing"],
                # "senate_hearing":      row["senate_hearing"],
                # "not_above_the_law":   row["not_above_the_law"],
                # "impeach_and_convict": row["impeach_and_convict"],
                # "impeach_and_remove":  row["impeach_and_remove"],
                # "facts_matter":        row["facts_matter"],
                # "sham_trial":          row["sham_trial"],
                # "maga":                row["maga"],
                # "acquitted_forever":   row["acquitted_forever"],
                # "country_over_party":  row["country_over_party"],
            }
            self.batch.append(item)
            self.counter += 1

            # temporarily testing individual inserts...
            #record = RetweeterDetail(**item)
            #self.pg_session.add(record)
            #self.pg_session.commit()

            if len(self.batch) >= self.batch_size:
                print(logstamp(), fmt_n(self.counter), "SAVING BATCH...")
                self.pg_session.bulk_insert_mappings(RetweeterDetail,
                                                     self.batch)
                self.pg_session.commit()
                self.batch = []

        print("ETL COMPLETE!")
        self.end_at = time.perf_counter()
        self.pg_session.close()
Example #19
def load_data(csv_filepath):
    if os.path.isfile(csv_filepath) and not DESTRUCTIVE:
        print("LOADING CSV...")
        df = read_csv(csv_filepath)
    else:
        print("DOWNLOADING CSV...")
        df = download_data()
        df.to_csv(csv_filepath, index=False)
    print(fmt_n(len(df)))
    print(df.head())
    return df
Example #20
def classify_bot_probabilities(rt_graph, weight_attr="rt_count"):
    """
    Given a retweet graph, computes bot probabilities, in a single function!

    Params:

        rt_graph (networkx.DiGraph) representing a retweet graph, with weights stored in the weight_attr param

        weight_attr (str) the edge data attribute where the weights are stored.
            In the rt graph, this represents the number of times user A has retweeted user B.
    """

    in_degrees = dict(
        rt_graph.in_degree(weight=weight_attr))  # users receiving retweets
    out_degrees = dict(
        rt_graph.out_degree(weight=weight_attr))  # users doing the retweeting
    print("IN-DEGREES:", fmt_n(len(in_degrees)))
    print("OUT-DEGREES:", fmt_n(len(out_degrees)))

    links = parse_bidirectional_links(rt_graph)
    energies = [(link[0], link[1],
                 compute_link_energy(link[0], link[1], link[4], in_degrees,
                                     out_degrees)) for link in links]
    print("ENERGIES:", fmt_n(len(energies)))
    positive_energies = [e for e in energies if sum(e[2]) > 0]
    print("POSITIVE ENERGIES:", fmt_n(len(positive_energies)))

    prior_probabilities = dict.fromkeys(list(rt_graph.nodes), 0.5)
    energy_graph, pl, user_data = compile_energy_graph(rt_graph,
                                                       prior_probabilities,
                                                       positive_energies,
                                                       out_degrees, in_degrees)
    print("ENERGIES GRAPHED...")  # this is the step that takes the longest

    bot_probabilities = dict.fromkeys(list(
        user_data.keys()), 0)  # start with defaults of 0 for each user
    for user in pl:
        user_data[user]["clustering"] = 1
        bot_probabilities[user] = 1

    return bot_probabilities, user_data
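
A hypothetical usage sketch for the function above, given a tiny hand-built retweet graph (the user names are made up; edge weights follow the rt_count convention described in the docstring, and classify_bot_probabilities is assumed to be importable from the project):

    from networkx import DiGraph

    rt_graph = DiGraph()
    rt_graph.add_edge("user_a", "user_b", rt_count=5)  # user_a retweeted user_b five times
    rt_graph.add_edge("user_b", "user_a", rt_count=1)  # reverse edge, making this a bidirectional link
    rt_graph.add_edge("user_c", "user_b", rt_count=2)

    bot_probabilities, user_data = classify_bot_probabilities(rt_graph, weight_attr="rt_count")
    for user_id, probability in sorted(bot_probabilities.items()):
        print(user_id, probability)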
Example #21
    def __init__(self,
                 limit=LIMIT,
                 batch_size=BATCH_SIZE,
                 bq_service=None,
                 model_manager=None):
        self.limit = limit
        self.batch_size = batch_size
        self.bq_service = bq_service or BigQueryService()
        self.mgr = model_manager or ModelManager()

        print("----------------")
        print("TOXICITY SCORER...")
        print("  MODEL CHECKPOINT:", self.mgr.checkpoint_name.upper(),
              self.mgr.checkpoint_url)
        print("  SCORES TABLE NAME:", self.scores_table_name)
        print("  LIMIT:", fmt_n(self.limit))
        print("  BATCH SIZE:", fmt_n(self.batch_size))

        self.predict = self.mgr.predict_scores  # method alias

        seek_confirmation()
Example #22
    def perform(self):
        self.edges = []
        self.running_results = []
        self.start()

        self.cursor.execute(self.sql)
        while True:
            batch = self.cursor.fetchmany(size=self.batch_size)
            if not batch: break
            self.counter += len(batch)

            if not self.dry_run:
                for row in batch:
                    self.edges += [(row["screen_name"], friend)
                                   for friend in row["friend_names"]]

            rr = {
                "ts": logstamp(),
                "counter": self.counter,
                "edges": len(self.edges)
            }
            print(rr["ts"], "|", fmt_n(rr["counter"]), "|", fmt_n(rr["edges"]))
            self.running_results.append(rr)

        self.write_results_to_file()
        self.upload_results()

        self.write_edges_to_file()
        self.upload_edges()

        print(logstamp(), "CONSTRUCTING GRAPH OBJECT...")
        self.graph = DiGraph(self.edges)
        print(logstamp(), "GRAPH CONSTRUCTED!")
        del self.edges  # try to free up some memory maybe, before writing to file
        self.report()

        self.write_graph_to_file()
        self.upload_graph()

        self.end()
Example #23
    def perform(self):
        self.graph = DiGraph()

        print("FETCHING BOT FOLLOWERS...")

        for row in self.bq_service.fetch_bot_follower_lists(
                bot_min=self.bot_min):
            bot_id = row["bot_id"]
            self.graph.add_edges_from([(follower_id, bot_id)
                                       for follower_id in row["follower_ids"]])

            self.counter += 1
            if self.counter % self.batch_size == 0:
                print("  ", logstamp(), "| BOTS:", fmt_n(self.counter))
Example #24
    def load_retweets(self):
        """
        Loads or downloads bot community tweets to/from CSV.
        """
        if os.path.isfile(self.retweets_filepath):
            print("READING BOT COMMUNITY RETWEETS FROM CSV...")
            self.retweets_df = read_csv(self.retweets_filepath)  # DtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False
            #if ROWS_LIMIT:
            #    self.retweets_df = read_csv(local_csv_filepath, nrows=int(ROWS_LIMIT))
            #else:
            #    self.retweets_df = read_csv(local_csv_filepath)
        else:
            print("DOWNLOADING BOT COMMUNITY RETWEETS...")
            counter = 0
            records = []
            for row in self.bq_service.download_n_bot_community_retweets_in_batches(
                    self.n_clusters):
                records.append({
                    "community_id": row.community_id,
                    "user_id": row.user_id,
                    "user_screen_name_count": row.user_screen_name_count,
                    "user_screen_names": row.user_screen_names,
                    "user_created_at": dt_to_s(row.user_created_at),
                    "retweeted_user_id": row.retweeted_user_id,
                    "retweeted_user_screen_name": row.retweeted_user_screen_name,
                    "status_id": row.status_id,
                    "status_text": row.status_text,
                    "status_created_at": dt_to_s(row.status_created_at)
                })
                counter += 1
                if counter % BATCH_SIZE == 0: print(logstamp(), fmt_n(counter))

            self.retweets_df = DataFrame(records)
            self.retweets_df.index.name = "row_id"
            self.retweets_df.index += 1
            print("WRITING TO FILE...")
            self.retweets_df.to_csv(self.retweets_filepath)
Example #25
    def vectorize(self):
        print("--------------------------")
        print("VECTORIZING...")

        self.tv = TfidfVectorizer()
        self.tv.fit(self.x_train)
        print("FEATURES / TOKENS:", fmt_n(len(self.tv.get_feature_names())))

        self.matrix_train = self.tv.transform(self.x_train)
        print("FEATURE MATRIX (TRAIN):", type(self.matrix_train),
              self.matrix_train.shape)

        self.matrix_test = self.tv.transform(self.x_test)
        print("FEATURE MATRIX (TEST):", type(self.matrix_test),
              self.matrix_test.shape)
Example #26
    def perform(self):
        self.write_metadata_to_file()
        self.upload_metadata()

        self.start()
        self.graph = DiGraph()
        self.running_results = []

        for row in self.bq_service.fetch_retweet_counts_in_batches(topic=self.topic, start_at=self.convo_start_at, end_at=self.convo_end_at):
            # see: https://networkx.github.io/documentation/stable/reference/classes/generated/networkx.DiGraph.add_edge.html#networkx.DiGraph.add_edge
            self.graph.add_edge(row["user_screen_name"], row["retweet_user_screen_name"], rt_count=row["retweet_count"])

            self.counter += 1
            if self.counter % self.batch_size == 0:
                rr = {"ts": logstamp(), "counter": self.counter, "nodes": len(self.graph.nodes), "edges": len(self.graph.edges)}
                print(rr["ts"], "|", fmt_n(rr["counter"]), "|", fmt_n(rr["nodes"]), "|", fmt_n(rr["edges"]))
                self.running_results.append(rr)

        self.end()
        self.report()
        self.write_results_to_file()
        self.upload_results()
        self.write_graph_to_file()
        self.upload_graph()
Example #27
    def perform(self):
        self.save_metadata()

        self.start()
        self.results = []
        self.graph = DiGraph()

        for row in self.bq_service.fetch_retweet_counts_in_batches(
                start_at=dt_to_s(self.tweets_start_at),
                end_at=dt_to_s(self.tweets_end_at)):

            self.graph.add_edge(
                row["user_screen_name"],  # todo: user_id
                row["retweet_user_screen_name"],  # todo: retweet_user_id
                weight=row["retweet_count"])

            self.counter += 1
            if self.counter % self.batch_size == 0:
                rr = {
                    "ts": logstamp(),
                    "counter": self.counter,
                    "nodes": self.graph.number_of_nodes(),
                    "edges": self.graph.number_of_edges()
                }
                print(rr["ts"], "|", fmt_n(rr["counter"]), "|",
                      fmt_n(rr["nodes"]), "|", fmt_n(rr["edges"]))
                self.results.append(rr)

                # gives us an approximate users limit, since this check is only reached once per batch (perhaps more performant when there are millions of rows)
                if self.users_limit and self.counter >= self.users_limit:
                    break

        self.end()
        self.report()
        self.save_results()
        self.save_graph()
Example #28
def get_tweets():
    bq_service = BigQueryService()
    print("LIMIT:", LIMIT)
    job = Job()

    tweets = []
    job.start()
    for row in bq_service.fetch_labeled_tweets_in_batches(limit=LIMIT):
        tweets.append(dict(row))
        job.counter += 1
        if job.counter % BATCH_SIZE == 0:
            job.progress_report()
    job.end()
    print("FETCHED TWEETS:", fmt_n(len(tweets)))
    return DataFrame(tweets)
Example #29
    def perform_better(self):
        print("----------------")
        print(f"FETCHING TEXTS...")
        print(f"SCORING TEXTS IN BATCHES...")

        batch = []
        counter = 0
        for row in self.fetch_texts():
            batch.append(row)

            if len(batch) >= self.batch_size:
                counter += len(batch)
                print("  ", generate_timestamp(), "|", fmt_n(counter))
                self.process_batch(batch)
                batch = []
Example #30
    def download_user_details(self):
        self.start_job()
        self.destructively_migrate(UserDetail)

        print(logstamp(), "DATA FLOWING...")
        for row in self.bq_service.fetch_user_details_in_batches(
                limit=self.users_limit):
            item = {
                "user_id": row['user_id'],
                "screen_name": clean_string(row['screen_name']),
                "name": clean_string(row['name']),
                "description": clean_string(row['description']),
                "location": clean_string(row['location']),
                "verified": row['verified'],
                "created_at":
                row['created_at'],  #.strftime("%Y-%m-%d %H:%M:%S"),
                "screen_name_count": row['screen_name_count'],
                "name_count": row['name_count'],
                "description_count": row['description_count'],
                "location_count": row['location_count'],
                "verified_count": row['verified_count'],
                "created_count": row['created_at_count'],
                "screen_names": [clean_string(s) for s in row['screen_names']],
                "names": [clean_string(s) for s in row['names']],
                "descriptions": [clean_string(s) for s in row['descriptions']],
                "locations": [clean_string(s) for s in row['locations']],
                "verifieds": row['verifieds'],
                "created_ats":
                row['created_ats'],  #[dt.strftime("%Y-%m-%d %H:%M:%S") for dt in row['_created_ats']]
                "friend_count": row["friend_count"],
                "status_count": row["status_count"],
                "retweet_count": row["retweet_count"],
            }
            self.batch.append(item)
            self.counter += 1

            # temporarily testing individual inserts...
            #record = UserDetail(**item)
            #self.pg_session.add(record)
            #self.pg_session.commit()

            if len(self.batch) >= self.batch_size:
                print(logstamp(), fmt_n(self.counter), "SAVING BATCH...")
                self.pg_session.bulk_insert_mappings(UserDetail, self.batch)
                self.pg_session.commit()
                self.batch = []

        self.end_job()