def dump_retweets_job(news: NewsItem, config: Config,
                      twython_connector: TwythonConnector):
    """Fetch retweets for every tweet of a news item, append their features
    to the item's dataframe, and save the result as CSV.

    Args:
        news: News item; its ``tweet_data`` dataframe is extended and its
            ``dir`` attribute gives the output CSV path.
        config: Collection configuration (unused here; kept for the job API).
        twython_connector: Pool of Twython connections to draw from.
    """
    data = news.tweet_data
    out_path = news.dir  # renamed: ``dir`` shadowed the builtin
    for tweet, count in zip(data.tweet_id, data.retweet_count):
        if count == 0:
            continue
        # Initialize before the try so a failed API call cannot leave these
        # unbound (the original raised NameError in that case).
        retweets = []
        connection = None
        try:
            connection = twython_connector.get_twython_connection(
                "get_retweet")
            retweets = connection.get_retweets(id=tweet,
                                               count=100,
                                               cursor=-1)
        except TwythonRateLimitError:
            logging.exception(
                "Twython API rate limit exception - tweet id : {}".format(
                    tweet))
        except Exception:
            logging.exception(
                "Exception in getting retweets for tweet id %d using connection %s"
                % (tweet, connection))
        for retweet in retweets:
            # NOTE(review): ``DataFrame.append`` was removed in pandas 2.0;
            # presumably data['fake'][0] is the news-level label — TODO confirm.
            data = data.append(extract_retweet_features(
                retweet, tweet, data['fake'][0]),
                               ignore_index=True)
    print('Saving ' + out_path)
    data.to_csv(out_path, index=False)
Example #2
0
def dump_retweets_job(tweet: Tweet, config: Config,
                      twython_connector: TwythonConnector):
    """Fetch up to 100 retweets of *tweet* and dump them as JSON to
    ``<dump_location>/<news_source>/<label>/<news_id>/retweets/<tweet_id>.json``.

    Best-effort: API errors are logged and an empty ``retweets`` list is
    still written so downstream code finds a file for every tweet.
    """
    retweets = []
    connection = None
    try:
        connection = twython_connector.get_twython_connection("get_retweet")
        retweets = connection.get_retweets(id=tweet.tweet_id,
                                           count=100,
                                           cursor=-1)

    except TwythonRateLimitError:
        logging.exception(
            "Twython API rate limit exception - tweet id : {}".format(
                tweet.tweet_id))

    except Exception:
        logging.exception(
            "Exception in getting retweets for tweet id %d using connection %s"
            % (tweet.tweet_id, connection))

    retweet_obj = {"retweets": retweets}

    dump_dir = "{}/{}/{}/{}".format(config.dump_location, tweet.news_source,
                                    tweet.label, tweet.news_id)
    retweet_dir = "{}/retweets".format(dump_dir)
    create_dir(dump_dir)
    create_dir(retweet_dir)
    # ``with`` closes the output file (the original leaked the handle).
    with open("{}/{}.json".format(retweet_dir, tweet.tweet_id), "w") as out_file:
        json.dump(retweet_obj, out_file)
def dump_tweet_information(tweet_chunk: list, config: Config,
                           twython_connector: TwythonConnector):
    """Collect info and dump info of tweet chunk containing at most 100 tweets.

    Hydrates the whole chunk with one ``lookup_status`` call (``map=True``
    keeps missing ids as None entries) and writes each found tweet to
    ``<dump_location>/<news_source>/<label>/<news_id>/tweets/<tweet_id>.json``.
    Errors are logged; always returns None.
    """
    tweet_list = [tweet.tweet_id for tweet in tweet_chunk]

    try:
        tweet_objects_map = twython_connector.get_twython_connection(
            Constants.GET_TWEET).lookup_status(id=tweet_list,
                                               include_entities=True,
                                               map=True)['id']
        for tweet in tweet_chunk:
            tweet_object = tweet_objects_map[str(tweet.tweet_id)]
            if tweet_object:  # None when the tweet was deleted/protected
                dump_dir = "{}/{}/{}/{}".format(config.dump_location,
                                                tweet.news_source, tweet.label,
                                                tweet.news_id)
                tweet_dir = "{}/tweets".format(dump_dir)
                create_dir(dump_dir)
                create_dir(tweet_dir)

                # ``with`` closes the file (the original leaked the handle).
                with open("{}/{}.json".format(tweet_dir, tweet.tweet_id),
                          "w") as out_file:
                    json.dump(tweet_object, out_file)

    except TwythonRateLimitError:
        print("Twython API rate limit exception")
        logging.exception("Twython API rate limit exception")

    except Exception as ex:
        logging.exception("exception in collecting tweet objects")
        print("exception in collecting tweet objects:", str(ex))
    return None
Example #4
0
def dump_retweets_job(
    tweet: Tweet, config: Config, twython_connector: TwythonConnector
):
    """Fetch retweets for *tweet* (when the fetch predicate allows it) and
    dump them as JSON under the tweet's news dump directory.

    A (possibly empty) ``retweets`` list is written even when fetching is
    skipped or fails, so every tweet ends up with a retweet file.
    """
    retweets = []
    connection = None

    dump_dir = get_dump_dir(config, tweet)

    if _should_fetch_retweets(tweet, dump_dir):
        try:
            connection = twython_connector.get_twython_connection(Constants.GET_RETWEET)
            retweets = connection.get_retweets(id=tweet.tweet_id, count=100, cursor=-1)

        except TwythonRateLimitError:
            logging.exception(
                "Twython API rate limit exception - tweet id : {}".format(
                    tweet.tweet_id
                )
            )

        except Exception:
            logging.exception(
                "Exception in getting retweets for tweet id %d using connection %s"
                % (tweet.tweet_id, connection)
            )

    retweet_obj = {"retweets": retweets}

    retweet_dir = "{}/retweets".format(dump_dir)
    create_dir(dump_dir)
    create_dir(retweet_dir)
    # ``with`` closes the output file (the original leaked the handle).
    with open("{}/{}.json".format(retweet_dir, tweet.tweet_id), "w") as out_file:
        json.dump(retweet_obj, out_file)
Example #5
0
def dump_tweet_information(tweet: Tweet, config: Config,
                           twython_connector: TwythonConnector):
    """Fetch a single tweet object and dump it as JSON to
    ``<dump_location>/<news_source>/<label>/<news_id>/tweets/<tweet_id>.json``.

    Best-effort: exceptions are logged and swallowed; always returns None.
    """
    try:
        tweet_object = twython_connector.get_twython_connection(
            Constants.GET_TWEET).show_status(id=tweet.tweet_id)

        if tweet_object:
            dump_dir = "{}/{}/{}/{}".format(config.dump_location,
                                            tweet.news_source, tweet.label,
                                            tweet.news_id)
            tweet_dir = "{}/tweets".format(dump_dir)
            create_dir(dump_dir)
            create_dir(tweet_dir)

            # ``with`` closes the output file (the original leaked the handle).
            with open("{}/{}.json".format(tweet_dir, tweet.tweet_id),
                      "w") as out_file:
                json.dump(tweet_object, out_file)

    except TwythonRateLimitError:
        logging.exception("Twython API rate limit exception")

    except Exception:  # the bound ``ex`` was unused
        logging.exception("exception in collecting tweet objects")

    return None
Example #6
0
    def __init__(self, data_dir, data_collection_dir, tweet_keys_file, num_process):
        """Record collection paths/settings and build the shared Twython connector."""
        self.dataset_dir, self.dump_location = data_dir, data_collection_dir
        self.tweet_keys_file, self.num_process = tweet_keys_file, num_process
        # One connector instance is shared across the whole collection run.
        self.twython_connector = TwythonConnector("localhost:5000", tweet_keys_file)
Example #7
0
def dump_retweets_job(tweet: Tweet, config: Config, twython_connector: TwythonConnector):
    """Fetch retweets for *tweet* at its hop index and dump them as JSON to
    ``<dump_location>/<news_source>/<label>/<news_id>/retweets_<hop>/<tweet_id>.json``.

    Skips tweets whose retweet file already exists (idempotent re-runs).
    """
    hop_index = tweet.hop_index

    news_dir = f"{config.dump_location}/{tweet.news_source}/{tweet.label}/{tweet.news_id}"
    retweet_dir = f"{news_dir}/retweets_{hop_index}"
    retweet_path = f"{retweet_dir}/{tweet.tweet_id}.json"

    # Idempotency guard: do not re-download already-collected retweets.
    if os.path.exists(retweet_path):
        print("[PASSED] news:{}, hop index: {}".format(tweet.news_id, hop_index))
        return
    else:
        print("[NEW] news:{}, hop index: {}".format(tweet.news_id, hop_index))

    retweets = []
    connection = None
    try:
        connection = twython_connector.get_twython_connection("get_retweet")
        retweets = connection.get_retweets(id=tweet.tweet_id, count=100, cursor=-1)

    except TwythonRateLimitError:
        logging.exception(f"Twython API rate limit exception - tweet id : {tweet.tweet_id}")

    except Exception:
        logging.exception(
            "Exception in getting retweets for tweet id %d using connection %s" % (tweet.tweet_id, connection))

    retweet_obj = {"retweets": retweets}

    create_dir(news_dir)
    create_dir(retweet_dir)
    # ``with`` closes the output file (the original leaked the handle).
    with open(retweet_path, "w") as out_file:
        json.dump(retweet_obj, out_file)
def dump_user_following(user_id, save_location, twython_connector: TwythonConnector):
    """Fetch the ids a user follows and dump them to ``<save_location>/<user_id>.json``.

    Skips users whose file already exists; errors are logged and swallowed.
    """
    # Fetch and save user information if the file is not already present
    if not Path("{}/{}.json".format(save_location, user_id)).is_file():
        try:
            user_following = fetch_user_friends_ids(
                user_id, twython_connector.get_twython_connection(GET_FRIENDS_ID))

            user_following_info = {USER_ID: user_id, FOLLOWING: user_following}
            # ``with`` closes the output file (the original leaked the handle).
            with open("{}/{}.json".format(save_location, user_id), "w") as out_file:
                json.dump(user_following_info, out_file)

        # Narrowed from a bare ``except`` so SystemExit/KeyboardInterrupt propagate.
        except Exception:
            logging.exception("Exception in getting follower_ids for user : {}".format(user_id))
def dump_user_profile_job(user_id, save_location, twython_connector: TwythonConnector):
    """Fetch a user's profile and dump it to ``<save_location>/<user_id>.json``.

    Best-effort: on a rate-limit error the exception is logged and nothing
    is written. Skips users whose file already exists.
    """
    profile_info = None

    # Fetch and save user information if the file is not already present
    if not Path("{}/{}.json".format(save_location, user_id)).is_file():
        try:
            profile_info = twython_connector.get_twython_connection(GET_USER).show_user(user_id=user_id)

        except TwythonRateLimitError:  # the bound ``ex`` was unused
            logging.exception("Twython API rate limit exception")

        finally:
            # Dump whatever was fetched, even if an unexpected error propagates.
            if profile_info:
                # ``with`` closes the output file (the original leaked the handle).
                with open("{}/{}.json".format(save_location, user_id), "w") as out_file:
                    json.dump(profile_info, out_file)
def dump_retweets_job(tweet: Tweet, config: Config,
                      twython_connector: TwythonConnector):
    """Fetch up to 100 retweets of *tweet* and dump them as JSON to
    ``<dump_location>/<news_source>/<label>/<news_id>/retweets/<tweet_id>.json``.

    Skips tweets whose retweet file already exists; a (possibly empty)
    ``retweets`` list is written even when the API call fails.
    """
    dump_dir = "{}/{}/{}/{}".format(config.dump_location, tweet.news_source,
                                    tweet.label, tweet.news_id)
    retweet_dir = "{}/retweets".format(dump_dir)
    retweet_path = "{}/{}.json".format(retweet_dir, tweet.tweet_id)

    # Idempotency guard: do not re-download already-collected retweets.
    if os.path.exists(retweet_path):
        print("[PASSED] source:{}, label:{}, news:{}, retweet: tweet{}".format(
            tweet.news_source, tweet.label, tweet.news_id, tweet.tweet_id))
        return
    else:
        print("[NEW] source:{}, label:{}, news:{}, retweet: tweet{}".format(
            tweet.news_source, tweet.label, tweet.news_id, tweet.tweet_id))

    retweets = []
    connection = None
    try:
        connection = twython_connector.get_twython_connection("get_retweet")
        retweets = connection.get_retweets(id=tweet.tweet_id,
                                           count=100,
                                           cursor=-1)

    except TwythonRateLimitError:
        logging.exception(
            "Twython API rate limit exception - tweet id : {}".format(
                tweet.tweet_id))

    except Exception:
        logging.exception(
            "Exception in getting retweets for tweet id %d using connection %s"
            % (tweet.tweet_id, connection))

    retweet_obj = {"retweets": retweets}

    create_dir(dump_dir)
    create_dir(retweet_dir)
    # ``with`` closes the output file (the original leaked the handle).
    with open(retweet_path, "w") as out_file:
        json.dump(retweet_obj, out_file)
def dump_user_recent_tweets_job(user_id, save_location, twython_connector: TwythonConnector):
    """Fetch up to 200 recent timeline tweets for a user and dump them to
    ``<save_location>/<story>/<user_id>.json``.

    ``user_id`` appears to be a 3-tuple (user id, max tweet id, story id) —
    inferred from the indexing below; confirm against the caller.
    Skips (user, story) pairs whose file already exists.
    """
    # List, not None: ``len(profile_info)`` in ``finally`` must not crash
    # after a rate-limit error (the original raised TypeError there).
    profile_info = []

    # Fetch and save the timeline if the file is not already present
    if not Path("{}/{}/{}.json".format(save_location, user_id[2], user_id[0])).is_file():
        create_dir("{}/{}".format(save_location, user_id[2]))
        try:
            profile_info = twython_connector.get_twython_connection(GET_USER_TWEETS).get_user_timeline(
                user_id=user_id[0],
                count=200,
                exclude_replies=False,
                include_rts=True,  # fixed typo: ``incude_rts`` was silently ignored by the API
                max_id=user_id[1])
        except TwythonRateLimitError:  # the bound ``ex`` was unused
            logging.exception("Twython API rate limit exception")

        finally:
            if len(profile_info) > 0:
                logging.info("found {} tweets in timeline for user {}".format(len(profile_info), user_id[0]))
                # ``with`` closes the output file (the original leaked the handle).
                with open("{}/{}/{}.json".format(save_location, user_id[2], user_id[0]), "w") as out_file:
                    json.dump(profile_info, out_file)
            else:
                logging.warning("couldn't retrieve the timeline of user {}".format(user_id[0]))
    else:
        logging.info("file for users and story already exists")
Example #12
0
def dump_tweet_information(tweet_chunk: list, config: Config, twython_connector: TwythonConnector):
    """Collect info and dump info of tweet chunk containing at most 100 tweets.

    Already-downloaded tweets are skipped (their embedded user profile is
    still extracted if missing); the remainder is hydrated with a single
    ``lookup_status`` call and each found tweet plus its author profile is
    written to disk. Errors are logged; always returns None.
    """
    user_profiles_folder = f"{config.dump_location}/user_profiles"

    # Skip downloading tweets which have already been downloaded.
    filtered_tweet_chunk = []
    for tweet in tweet_chunk:
        dump_dir = "{}/{}/{}/{}".format(config.dump_location, tweet.news_source, tweet.label, tweet.news_id)
        tweet_dir = "{}/tweets".format(dump_dir)
        tweet_path = f"{tweet_dir}/{tweet.tweet_id}.json"

        if os.path.exists(tweet_path):
            print(f"[PASSED] source:{tweet.news_source}, label:{tweet.label}, news:{tweet.news_id}")

            # Save the user profile stored in the cached tweet, if missing.
            with open(tweet_path, "r") as tweet_file:
                tweet_dict = json.loads(tweet_file.read())
            _save_user_profile(user_profiles_folder, tweet_dict['user'])
        else:
            print(f"[NEW] source:{tweet.news_source}, label:{tweet.label}, news:{tweet.news_id}")
            filtered_tweet_chunk.append(tweet)

    # (The original recomputed unused per-tweet paths here — dead code removed.)
    tweet_id_list = [tweet.tweet_id for tweet in filtered_tweet_chunk]

    try:
        tweet_objects_map = twython_connector.get_twython_connection(Constants.GET_TWEET).lookup_status(
            id=tweet_id_list,
            include_entities=True,
            map=True)['id']
        for tweet in filtered_tweet_chunk:
            tweet_object = tweet_objects_map[str(tweet.tweet_id)]
            if tweet_object:  # None when the tweet was deleted/protected
                dump_dir = "{}/{}/{}/{}".format(config.dump_location, tweet.news_source, tweet.label, tweet.news_id)
                tweet_dir = "{}/tweets".format(dump_dir)
                tweet_path = f"{tweet_dir}/{tweet.tweet_id}.json"
                create_dir(dump_dir)
                create_dir(tweet_dir)
                # ``with`` closes the file (the original leaked the handle).
                with open(tweet_path, "w") as out_file:
                    json.dump(tweet_object, out_file)

                # Save the user profile stored in the fetched tweet.
                _save_user_profile(user_profiles_folder, tweet_object['user'])

    except TwythonRateLimitError:
        logging.exception("Twython API rate limit exception")

    except Exception:  # the bound ``ex`` was unused
        logging.exception("exception in collecting tweet objects")

    return None


def _save_user_profile(user_profiles_folder, user_dict):
    """Write *user_dict* to ``<user_profiles_folder>/<id>.json`` if absent."""
    # Robustness fix: the original assumed the folder already existed.
    create_dir(user_profiles_folder)
    user_id = user_dict['id']
    user_profile_path = f"{user_profiles_folder}/{user_id}.json"
    if not os.path.exists(user_profile_path):
        print(f"[NEW] User profile: {user_id}")
        with open(user_profile_path, "w") as user_profile_file:
            user_profile_file.write(json.dumps(user_dict))
Example #13
0
from os import path
import json

from util.TwythonConnector import TwythonConnector

# Target tweet whose retweets we want to fetch.
TWEET_ID = 1309376185340538881

# read config file (``with`` closes the handle; the original leaked it)
config_path = path.abspath(path.join(path.dirname(__file__), '../config.json'))
with open(config_path) as config_file:
    json_object = json.load(config_file)
tweet_keys_file = json_object["tweet_keys_file"]

# get twython connector
twython_connector = TwythonConnector("localhost:5000", tweet_keys_file)
connection = twython_connector.get_twython_connection("get_retweet")

# request target
retweets = connection.get_retweets(id=TWEET_ID, count=100, cursor=-1)

print(json.dumps(retweets))
#%%
# Crawl driver (notebook-style cells): builds a TwythonConnector and collects
# follower/following profiles for the crawled user network.
import os
import json
import time
import random
import argparse
from collect_tweets import TweetCollector
from util.TwythonConnector import TwythonConnector

parser = argparse.ArgumentParser(description='crawl')
parser.add_argument("--save_dir", type=str, default="../dataset")
# parse_known_args tolerates extra CLI flags (e.g. when run inside Jupyter).
args = parser.parse_known_args()[0]
tweet_keys_file = "./resources/tweet_keys_file.txt"
# NOTE(review): other snippets construct TwythonConnector("localhost:5000", keys);
# this single-argument call assumes a different constructor signature — confirm.
connector = TwythonConnector(tweet_keys_file)

# %%
from news import News
from collect_users import UserCollector

# Collect follower and following profiles under <save_dir>/user_network.
user_root = os.path.join(args.save_dir, "user_network")
user_collector = UserCollector(user_root, connector)
user_collector.collect_user_follower_profiles()
user_collector.collect_user_following_profiles()
# %%
Example #15
0
from os import path
import json

from util.TwythonConnector import TwythonConnector
from util import Constants

# Tweets to hydrate in one batch lookup (the API accepts at most 100 ids).
TWEET_ID_LIST = [1309376185340538881]

# read config file (``with`` closes the handle; the original leaked it)
config_path = path.abspath(path.join(path.dirname(__file__), '../config.json'))
with open(config_path) as config_file:
    json_object = json.load(config_file)
tweet_keys_file = json_object["tweet_keys_file"]

# get twython connector
twython_connector = TwythonConnector("localhost:5000", tweet_keys_file)
connection = twython_connector.get_twython_connection(Constants.GET_TWEET)

# request target: with map=True the result nests tweets under the 'id' key,
# keeping an entry (possibly None) for every requested id
tweet_objects_map = connection.lookup_status(id=TWEET_ID_LIST,
                                             include_entities=True,
                                             map=True)['id']

print(json.dumps(tweet_objects_map))