Code Example #1
File: scrape.py  Project: qfjp/microblog_scraper
def stream_user_tweets():

    users = compile_users_n_others()

    api = authenticate_twitter()
    stream_listener = StreamListener(num_to_grab=-1, pickle=False)
    stream = tweepy.Stream(auth=api.auth, listener=stream_listener)
    for i, representative in enumerate(users):
        print(f"Streaming from user group {i} (representative user {representative})")
        user_set = users[representative]
        user_set.add(representative)

        def get_tweets():
            # tweepy expects an iterable of user-ID strings in `follow`.
            stream.filter(follow=list(user_set), stall_warnings=True)

        # Timebox the blocking filter() call: run it in a child process,
        # wait up to 150 seconds, then kill it and reap the child.
        p = multiprocessing.Process(target=get_tweets)
        p.start()
        p.join(150)
        p.terminate()
        p.join()
        # Checkpoint the scraped data every 50 user groups.
        if (i + 1) % 50 == 0:
            if not json_it(TWEET_DICT, TWEETS_FNAME):
                sys.stderr.write("ERROR: Failed to write checkpoint, abort!\n")
                sys.exit(FILE_NOT_FOUND_EXIT_CODE)
            if not json_it(USER_DICT, USER_DICT_FNAME):
                sys.stderr.write("ERROR: Failed to write checkpoint, abort!\n")
                sys.exit(FILE_NOT_FOUND_EXIT_CODE)
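
The loop above timeboxes each blocking stream.filter() call by running it in a child process and killing it after 150 seconds. A minimal, self-contained sketch of that pattern (blocking_worker is a hypothetical stand-in for the streaming call):

import multiprocessing
import time


def blocking_worker():
    # Stands in for stream.filter(): a call that never returns on its own.
    while True:
        time.sleep(1)


if __name__ == "__main__":
    p = multiprocessing.Process(target=blocking_worker)
    p.start()
    p.join(5)      # wait at most 5 seconds
    p.terminate()  # then force the child to stop
    p.join()       # and reap it before moving on
    print("worker stopped after the timeout")

Note that scrape.py passes a closure as the process target, which works under the fork start method but not under spawn, where targets must be picklable.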
Code Example #2
File: analyze.py  Project: qfjp/microblog_scraper
def main():
    graph = build_graph(pickle=False, from_scratch=True)
    small_graph = trim_graph(graph, pickle=False, from_scratch=True)
    small_graph.name = "Twitter User Graph"
    print(f"full graph: {len(graph)} nodes")
    print(f"trim graph: {len(small_graph)} nodes")
    print("Generating JSON")
    json_it(small_graph, USER_GRAPH_FNAME, nx.node_link_data)
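
json_it is handed nx.node_link_data as a transform, so the trimmed graph is written out as plain JSON. A quick round-trip through the networkx helpers this relies on:

import json

import networkx as nx

g = nx.path_graph(3)
data = nx.node_link_data(g)  # graph -> JSON-serializable dict
g2 = nx.node_link_graph(json.loads(json.dumps(data)))  # and back
assert set(g2.edges) == set(g.edges)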
Code Example #3
File: analyze.py  Project: qfjp/microblog_scraper
def trim_graph(graph, reduce_sample=True, pickle=True, from_scratch=True):
    if not graph and not from_scratch:
        graph = reload_json(USER_GRAPH_FNAME, transform=nx.node_link_graph)
        return graph

    # Restore the saved RNG state so repeated runs draw the same sample.
    rng_state = reload_object(RNG_FNAME, random.getstate)
    random.setstate(rng_state)
    print("Trimming graph...")
    significant_id_set = set()

    # Look for degree outliers in both directions (in- and out-degree).
    for direct in (Direct.IN, Direct.OUT):
        sample = []
        ids = []
        for user_id in graph:
            ids.append(user_id)
            num_neighb = direct.deg_view(graph)[user_id]
            sample.append(num_neighb)
        sample_mean = mean(sample)
        # Note: statistics.stdev is the *sample* standard deviation, despite
        # this variable's name (pstdev would give the population figure).
        pop_stdev = stdev(sample)
        for i, degree in enumerate(sample):
            # Keep users whose degree is a STDEV_MOD-sigma outlier.
            if abs(degree - sample_mean) > STDEV_MOD * pop_stdev:
                user_id = ids[i]
                significant_id_set.add((user_id, degree))

    by_asc_degree = sorted(significant_id_set, key=lambda x: x[1])
    significant_ids = [i[0] for i in by_asc_degree]

    to_subgraph = set()
    for user_id in significant_ids:
        try:
            others = set(graph.neighbors(user_id))
        except (KeyError, nx.NetworkXError):
            # networkx raises NetworkXError for a missing node; skip it.
            continue
        if reduce_sample and len(others) != 0:
            # random.sample rejects sets on Python 3.11+; sorting first also
            # makes the draw deterministic under the restored RNG state.
            others = random.sample(sorted(others), int(len(others) * OTHERS_MOD))

        if len(others) == 0:
            continue

        to_subgraph.add(user_id)
        for other in others:
            to_subgraph.add(other)

    # Re-save the state that was loaded at entry, keeping runs reproducible.
    pickle_it(rng_state, RNG_FNAME)

    user_graph = graph.subgraph(to_subgraph)

    if pickle:
        json_it(user_graph, USER_GRAPH_FNAME, nx.node_link_data)

    return user_graph
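
trim_graph keeps a user when its in- or out-degree lies more than STDEV_MOD standard deviations from the mean. A self-contained sketch of that selection step (the STDEV_MOD value here is an assumption; the project defines its own constant):

import networkx as nx
from statistics import mean, stdev

STDEV_MOD = 2  # assumed threshold

g = nx.gnp_random_graph(100, 0.05, seed=0, directed=True)

significant = set()
for deg_view in (g.in_degree, g.out_degree):
    degrees = dict(deg_view())
    m, s = mean(degrees.values()), stdev(degrees.values())
    significant |= {n for n, d in degrees.items() if abs(d - m) > STDEV_MOD * s}

print(f"kept {len(significant)} of {len(g)} candidate nodes")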
Code Example #4
File: app.py  Project: qfjp/microblog_scraper
def construct_graph_data():
    global GRAPH_DATA
    GRAPH_DATA = reload_json("graph_data", lambda: None)

    # Serve the cached copy if one exists on disk.
    if GRAPH_DATA:
        return

    GRAPH_DATA = {}
    GRAPH_DATA["raw_tweets"] = run_tweets()
    # Compute one (sizes, colors) pair per data source.
    for d_source in DataSource:
        sizes, colors = run_data(d_source)
        GRAPH_DATA[str(d_source)] = (sizes, colors)
    x_range, y_range = get_square_bounds()
    GRAPH_DATA["range"] = (x_range, y_range)

    json_it(GRAPH_DATA, "graph_data")
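
construct_graph_data is a cache-or-compute wrapper around the project's reload_json/json_it helpers. A generic sketch of the same pattern using only the standard library (the path and the build step are placeholders):

import json
import os

CACHE_PATH = "graph_data.json"  # hypothetical cache file


def load_or_build(build):
    # Return the cached payload if present; otherwise compute and persist it.
    if os.path.exists(CACHE_PATH):
        with open(CACHE_PATH) as f:
            return json.load(f)
    data = build()
    with open(CACHE_PATH, "w") as f:
        json.dump(data, f)
    return data


data = load_or_build(lambda: {"range": [[0, 1], [0, 1]]})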
Code Example #5
File: scrape.py  Project: qfjp/microblog_scraper
def expand_user_list(user_id, api_obj, count_key):
    """
    Given a user, a Twitter api object, and a dictionary key:
    Scrape twitter for that users friends/followers (depending on the key)
    and add this information back into the user dictionary.
    """
    now = dt.datetime.now()
    print(
        "(%s) (id=%s) Expanding %s"
        % (now.strftime("%a, %b %d %I:%M %p"), user_id, count_key)
    )
    # api_obj is a bound paging endpoint, e.g. api.friends_ids or
    # api.followers_ids, so one cursor serves both expansion directions.
    user_cursor = tweepy.Cursor(api_obj, user_id=user_id)
    pages = []
    try:
        pages = list(user_cursor.pages())
    except tweepy.error.TweepError:
        # Protected accounts raise here; give up on this user.
        print("This user has protected tweets, skipping")
        return
    # Accumulate every ID on each page under the friends/followers key.
    for user_id_page in pages:
        USER_DICT[user_id][count_key] += list(user_id_page)
    if not json_it(USER_DICT, USER_DICT_FNAME):
        sys.stderr.write(f"failed to pickle after processing user {user_id}\n")
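
For context, here is how the cursor paging above is typically driven with tweepy 3.x (credentials are placeholders, and api.followers_ids is one example of a bound method that could be passed in as api_obj):

import tweepy

auth = tweepy.OAuthHandler("CONSUMER_KEY", "CONSUMER_SECRET")
auth.set_access_token("ACCESS_TOKEN", "ACCESS_SECRET")
api = tweepy.API(auth, wait_on_rate_limit=True)

follower_ids = []
try:
    # Each page is a list of numeric user IDs.
    for page in tweepy.Cursor(api.followers_ids, user_id=12345).pages():
        follower_ids.extend(page)
except tweepy.error.TweepError:
    print("protected account, skipping")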
Code Example #6
File: scrape.py  Project: qfjp/microblog_scraper
    def on_status(self, tweet):
        user_id = tweet.user.id_str
        tweet_json = tweet._json

        if user_id not in TWEET_DICT:
            self.new_users += 1

        # Append the tweet, creating the user's tweet list on first sight.
        init_len = 0
        try:
            init_len = len(TWEET_DICT[user_id])
            TWEET_DICT[user_id].append(tweet_json)
        except KeyError:
            TWEET_DICT[user_id] = [tweet_json]

        if user_id not in USER_DICT:
            USER_DICT[user_id] = tweet.user._json
            USER_DICT[user_id]["followers"] = []
            USER_DICT[user_id]["friends"] = []

        # update tweet num
        after_len = len(TWEET_DICT[user_id])
        if init_len < after_len:
            self.new_tweets += 1

        # Stop once the requested number of new tweets has been collected,
        # persisting both dictionaries first when pickling is enabled.
        if self.num_to_grab > 0 and self.new_tweets >= self.num_to_grab:
            if self.pickle and not json_it(TWEET_DICT, TWEETS_FNAME):
                sys.stderr.write("ERROR: Failed final pickling, abort!\n")
                sys.exit(FILE_NOT_FOUND_EXIT_CODE)
            if self.pickle and not json_it(USER_DICT, USER_DICT_FNAME):
                sys.stderr.write("ERROR: Failed final pickling, abort!\n")
                sys.exit(FILE_NOT_FOUND_EXIT_CODE)
            self.reset_state()
            # Returning False tells tweepy to disconnect the stream.
            return False

        # Periodic progress report.
        if self.new_tweets % 100 == 0:
            print(f"currently scraped {self.new_tweets} new tweets")
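
In tweepy 3.x, returning False from on_status makes the stream disconnect, which is how the handler above stops once num_to_grab new tweets have arrived. A minimal sketch of a listener built around the same stop condition (the original's dictionaries and pickling are omitted):

import tweepy


class CountingListener(tweepy.StreamListener):
    def __init__(self, num_to_grab=100):
        super().__init__()
        self.num_to_grab = num_to_grab
        self.new_tweets = 0

    def on_status(self, tweet):
        self.new_tweets += 1
        if self.new_tweets >= self.num_to_grab:
            return False  # returning False disconnects the stream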
Code Example #7
File: scrape.py  Project: qfjp/microblog_scraper


def main():
    api = authenticate_twitter()

    # Optionally seed the dataset from a fresh keyword-filtered stream
    # before expanding neighbors and streaming per-user tweets.
    if GRAB_NEW:
        stream_listener = StreamListener()
        stream = tweepy.Stream(auth=api.auth, listener=stream_listener)
        stream.filter(track=KEYWORDS, stall_warnings=True)
    expand_neighbors(api)
    stream_user_tweets()


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("Received SIGINT, writing objects to JSON and exiting")
        json_it(TWEET_DICT, TWEETS_FNAME)
        json_it(USER_DICT, USER_DICT_FNAME)
        sys.exit(1)