예제 #1
0
def process_user(uid, follower_of=None, top_level_followee=None,
                 nest_level=0, no_followers=False):
    """Register a user in the database and queue the API requests for it.

    Steps, in order:

    1. Upsert a ``metadata`` row for ``uid``; on conflict only ``nest_level``
       is updated, to the smaller of the stored and the new value.
    2. If ``follower_of`` is truthy, record the (follower, followee) pair in
       ``followers`` (duplicates are ignored).
    3. Add ``uid`` to the "users" aggregate; once ``aggregate`` reports at
       least 100, fetch 100 entries and issue one ``users/lookup`` request
       for them.
    4. Unless ``no_followers`` is set, request ``followers/ids`` for ``uid``.
    5. Always request the user's timeline (``statuses/user_timeline``).

    Args:
        uid: Twitter user id; sent as ``user_id`` in every request.
        follower_of: Id of the user that ``uid`` follows, or ``None``.
        top_level_followee: Stored in ``metadata`` and forwarded with the
            followers request.
        nest_level: Stored in ``metadata`` and forwarded with the followers
            request.
        no_followers: When true, the ``followers/ids`` request is skipped.

    Returns:
        None.
    """
    metadata_row = (uid, follower_of, top_level_followee, nest_level)
    cur.execute(
        "insert into metadata (uid, follower_of, top_level_followee, nest_level) values (%s, %s, %s, %s) on conflict (uid) do update set nest_level = least(metadata.nest_level, excluded.nest_level)",
        metadata_row)

    if follower_of:
        follower_edge = (uid, follower_of)
        cur.execute(
            "insert into followers (follower_uid, folowee_uid) values (%s, %s) on conflict (follower_uid, folowee_uid) do nothing",
            follower_edge)

    # aggregate() is an external helper; presumably it buffers uid and returns
    # the current buffer size -- confirm against its definition.
    buffered = aggregate("users", uid)
    if buffered >= 100:
        batch = get_aggregate("users", 100)
        lookup_params = {"user_id": ",".join(batch)}
        command("post", "users/lookup", lookup_params, "users")

    if not no_followers:
        followers_params = {"user_id": uid, "stringify_ids": True}
        # Fifth positional argument of command(); presumably job metadata that
        # travels with the response -- confirm against command().
        followers_context = {
            "user_id": uid,
            "top_level_followee": top_level_followee,
            "nest_level": nest_level,
        }
        command("get", "followers/ids", followers_params, "followers",
                followers_context)

    timeline_params = {
        "user_id": uid,
        "trim_user": True,
        "count": 200,
        "include_rts": True,
        "exclude_replies": False,
    }
    command("get", "statuses/user_timeline", timeline_params, "tweets",
            {"user_id": uid})
예제 #2
0
def insert_tweet(tweet):
    """Insert ``tweet`` into the ``tweets`` table, embedded tweets first.

    If the tweet carries a non-empty ``retweeted_status`` and/or
    ``quoted_status``, each of those is inserted recursively (retweet first,
    then quote) and its ``id_str`` is stored in the ``retweet_of`` /
    ``quote_of`` column of this tweet's row. Rows whose ``twid`` already
    exists are left untouched (``on conflict (twid) do nothing``).

    Args:
        tweet: Tweet dict as read below; ``possibly_sensitive`` is optional,
            every other accessed key must be present.

    Returns:
        None. The statement is executed on the module-level ``cur``; no
        commit is issued here.
    """
    # Resolve the two optional parent tweets in the same order as the
    # retweet_of / quote_of columns.
    parent_ids = []
    for parent_key in ("retweeted_status", "quoted_status"):
        parent = tweet.get(parent_key)
        if parent:
            insert_tweet(parent)
            parent_ids.append(parent["id_str"])
        else:
            parent_ids.append(None)
    retweet_of, quote_of = parent_ids

    author_id = tweet["user"]["id_str"]

    # The row is assembled left to right in column order; entity lists and
    # geo-like objects are stored JSON-encoded.
    row = (
        tweet["id_str"],
        author_id,
        s(tweet["text"]),
        tweet["created_at"],
        tweet["truncated"],
    ) + tuple(
        json.dumps(tweet["entities"][entity])
        for entity in ("hashtags", "symbols", "user_mentions", "urls")
    ) + (
        tweet["in_reply_to_status_id"],
        tweet["in_reply_to_user_id"],
        tweet["in_reply_to_screen_name"],
    ) + tuple(
        json.dumps(tweet[field]) for field in ("geo", "coordinates", "place")
    ) + (
        retweet_of,
        quote_of,
        tweet["retweet_count"],
        tweet["favorite_count"],
        tweet.get("possibly_sensitive"),
        tweet["lang"],
    )

    cur.execute("""
        insert into tweets (
        twid,
        uid,
        tweet,
        created_at,
        truncated,
        hashtags,
        symbols,
        user_mentions,
        urls,
        in_reply_to_status_id,
        in_reply_to_user_id,
        in_reply_to_screen_name,
        geo,
        coordinates,
        place,
        retweet_of,
        quote_of,
        retweet_count,
        favorite_count,
        possibly_sensitive,
        lang
        ) values ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        on conflict (twid) do nothing
    """, row)
예제 #3
0
def handle_users_response(response):
    """Insert every user in ``response["result"]`` into ``users`` and commit.

    Each user dict is flattened into one row; users whose ``uid`` already
    exists are skipped (``on conflict (uid) do nothing``). A single commit is
    issued after all rows have been executed, including when the result list
    is empty.

    Args:
        response: Mapping whose ``"result"`` entry is an iterable of user
            dicts containing every key listed in ``fields`` below.

    Returns:
        Always ``True``.
    """
    # Source keys in the same order as the column list of the INSERT.
    fields = (
        "id_str",
        "name",
        "profile_image_url",
        "location",
        "created_at",
        "favourites_count",
        "utc_offset",
        "profile_use_background_image",
        "lang",
        "followers_count",
        "protected",
        "geo_enabled",
        "description",
        "verified",
        "notifications",
        "time_zone",
        "statuses_count",
        "friends_count",
        "screen_name",
    )
    # Values of these keys go through the external helper s() first
    # (presumably a string sanitizer -- confirm against its definition).
    wrapped = ("id_str", "name", "profile_image_url", "description",
               "screen_name")

    for user in response["result"]:
        row = tuple(
            s(user[key]) if key in wrapped else user[key] for key in fields)
        cur.execute("""
            insert into users (
                uid,
                name,
                profile_image_url,
                location,
                created_at,
                favourites_count,
                utc_offset,
                profile_use_background_image,
                lang,
                followers_count,
                protected,
                geo_enabled,
                description,
                verified,
                notifications,
                time_zone,
                statuses_count,
                friends_count,
                screen_name
            ) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) on conflict (uid) do nothing
        """, row)

    conn.commit()
    return True
import time
import os
import networkx as nx
import sys

# Make the parent directory of this file's directory importable; presumably
# that is where the `common` module imported below lives -- confirm.
parent_path = os.path.realpath(os.path.join(os.path.dirname(__file__), ".."))
sys.path.append(parent_path)

from common import cur

# Graph that will receive one node per selected hashtag.
print("Initializing graph...")
G = nx.Graph()

# Largest total_tweets value; its single-row result is left on the cursor and
# consumed by the next statement as the divisor.
print("Max hashtag...")
cur.execute("select max(total_tweets) from hashtags")

# Select hashtags that appear as h1 or as h2 in the 500 relations with the
# highest tweets_with_both, returning total_tweets divided by the maximum
# fetched above. The argument tuple is evaluated before execute() runs, so
# cur.fetchone() still reads the result of the previous "max" query.
# NOTE(review): float(None) raises TypeError if `hashtags` is empty -- confirm
# the table is always populated before this script runs.
print("Requesting hashtags...")
cur.execute(
    "select id, hashtag, total_tweets/%s from hashtags where id in (select h1 from hashtags_relations order by tweets_with_both desc limit 500) or id in (select h2 from hashtags_relations order by tweets_with_both desc limit 500)",
    (float(cur.fetchone()[0]), ))

# Iterate the cursor rows: column 0 is the node id, column 1 becomes the
# "label" attribute and column 2 (the scaled tweet count) the "weight".
print("Transforming hashtags into nodes...")
G.add_nodes_from(((v[0], {"label": v[1], "weight": v[2]}) for v in cur))

# Largest tweets_with_both value; left on the cursor for the statement that
# follows this block.
print("Max relation...")
cur.execute("select max(tweets_with_both) from hashtags_relations")

print("Requesting relations...")
cur.execute(
    "select h1, h2, tweets_with_both/%s from hashtags_relations order by tweets_with_both desc limit 500",
예제 #5
0
from common import conn, cur, get_response, ack_response, nack_response, get_raw, command, get_raw_nb

# Worker loop: for every screen name taken from "users_to_download", look the
# user up, store it, then queue jobs for its tweets and its friends.

# statsd client; metric names used below are prefixed with 'user_processor'.
c = statsd.StatsClient('localhost', 8125, prefix='user_processor')

while True:
    logging.info("Waiting for requests in 'users_to_download' queue...")
    # NOTE(review): `meta` is never passed to ack_response/nack_response in
    # this block -- confirm whether get_raw acknowledges the message itself.
    meta, screen_name = get_raw("users_to_download")
    # Time the handling of one request under the "valid_jobs" timer.
    with c.timer("valid_jobs"):
        logging.info("Got request for user=%s, requesting info from Twitter...", screen_name)

        # Request the user's profile and read the reply from "user_details".
        # The return value `q` is not used afterwards in this block.
        params = { "screen_name": screen_name }
        q = command("get", "users/show", params, "user_details")
        meta_user, resp = get_response("user_details")
        twid = resp["result"]["id_str"]
        # Store the user and keep the database-generated id; committed
        # before the response is acknowledged.
        cur.execute("insert into users (twid, screen_name) values (%s, %s) returning id", (twid, screen_name))
        uid = cur.fetchone()[0]
        conn.commit()

        ack_response(meta_user)
        
        # Queue the timeline download; metadata carries the database id, the
        # request parameters and empty "collected"/"hashtags" accumulators.
        # NOTE(review): "trim_user" is the literal "******", which looks like
        # a masked value -- confirm the intended setting.
        params = { "user_id": twid, "count": 200, "trim_user": "******", "include_rts": "false" }
        metadata = { "user_id": uid, "params": params, "collected": 0, "hashtags": {} }
        command("get", "statuses/user_timeline", params, "user_tweets", metadata=metadata)

        # Queue the friends download; this user is recorded as the parent at
        # parent_level 0.
        params = { "user_id": twid, "count": 5000, "stringify_ids": True }
        metadata = { "params": params, "parent": twid, "parent_level": 0}
        command("get", "friends/ids", params, "friends_ids", metadata=metadata)
예제 #6
0
c = statsd.StatsClient('localhost', 8125, prefix='friends_ids_processor')
while True:
    logging.info("Waiting for data from 'friends_ids' queue...")
    meta, resp = get_response("friends_ids")

    if resp["result"] != {} and not ("code" in resp["result"]
                                     and resp["result"]["code"] == 34):
        with c.timer("valid_jobs"):
            logging.info(
                "Got friends ids for %s (parent_level: %s) requesting tweets...",
                resp["metadata"]["parent"], resp["metadata"]["parent_level"])

            for user_id in resp["result"]["ids"]:
                cur.execute(
                    "insert into users (twid) values (%s) returning id",
                    (user_id, ))
                uid = cur.fetchone()[0]
                params = {
                    "user_id": user_id,
                    "count": 200,
                    "trim_user": "******",
                    "include_rts": "false"
                }
                metadata = {
                    "params": params,
                    "collected": 0,
                    "hashtags": {},
                    "user_id": uid
                }
                command("get",
예제 #7
0
    with c.timer("valid_jobs"):
        c.incr("tweets", len(resp["result"]))
        for t in resp["result"]:
            if cmd_max_id == t["id_str"]:
                continue
            if not metadata["max_id"] or t["id_str"] < metadata["max_id"]:
                metadata["max_id"] = t["id_str"]
            metadata["collected"] += 1

            hashtags = set(
                [v["text"].lower() for v in t["entities"]["hashtags"]])
            fav_count = t["favorite_count"]
            rt_count = t["retweet_count"]
            cur.execute(
                "insert into number_of_hashtags_tmp (number) values (%s)",
                (len(hashtags), ))
            for hashtag in hashtags:
                to_update[hashtag]["favs"] += fav_count
                to_update[hashtag]["rts"] += rt_count
                to_update[hashtag]["count"] += 1

            for (h1, h2) in itertools.combinations(hashtags, 2):
                to_update[min(h1, h2)]["combinations"][max(h1, h2)] += 1

        c.incr("hashtags_upserts", len(to_update.keys()))
        for hashtag, var in ((k, to_update[k])
                             for k in sorted(to_update.keys())):
            if hashtag not in metadata["hashtags"]:
                metadata["hashtags"][hashtag] = 0
            metadata["hashtags"][hashtag] += var["count"]