Example #1
def print_user_archive():
    """
    Fetch all available tweets for one user and print them, line by line
    """
    archive_generator = rest.fetch_user_archive("lessig")
    for page in archive_generator:
        for tweet in page:
            print_tweet(tweet)
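
The print_tweet helper is not defined in these examples; a minimal sketch of what it might look like, assuming each tweet is a dict in the standard Twitter REST API format with "user" and "text" fields (the same fields the later main() examples print directly):

def print_tweet(tweet):
    """Hypothetical stand-in: print one tweet as '<screen_name>: <text>'."""
    print(u"{0}: {1}".format(tweet["user"]["screen_name"], tweet["text"]))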
Example #2
def save_user_archive_to_database():
    """
    Fetch all available tweets for one user and save them to the database.
    """
    archive_generator = rest.fetch_user_archive("lessig")
    for page in archive_generator:
        for tweet in page:
            database.create_tweet_from_dict(tweet)
    logging.warning(u"Wrote tweets from @lessig to database")
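
database.create_tweet_from_dict is project-specific and not shown here. A rough, purely illustrative stand-in, assuming a local SQLite table that stores each tweet's raw JSON keyed by its id:

import json
import sqlite3

conn = sqlite3.connect("tweets.db")
conn.execute("CREATE TABLE IF NOT EXISTS tweets (id INTEGER PRIMARY KEY, raw TEXT)")

def create_tweet_from_dict(tweet):
    """Hypothetical stand-in for database.create_tweet_from_dict: upsert the raw tweet JSON by tweet id."""
    conn.execute("INSERT OR REPLACE INTO tweets (id, raw) VALUES (?, ?)",
                 (tweet["id"], json.dumps(tweet)))
    conn.commit()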
Example #3
def save_user_archive_to_file():
    """
    Fetch all available tweets for one user and save them to a text file, one tweet per line.
    (This is approximately the format that GNIP uses)
    """
    with open("lessig-tweets.json", "w") as f:
        archive_generator = rest.fetch_user_archive("lessig")
        for page in archive_generator:
            for tweet in page:
                f.write(json.dumps(tweet) + "\n")
    logging.warning(u"Wrote tweets from @lessig to file lessig-tweets.json")
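
Since the output file holds one JSON-encoded tweet per line, it can be read back by decoding each non-empty line independently; a small sketch:

import json

def load_tweets(path="lessig-tweets.json"):
    """Yield tweet dicts from a file that stores one JSON tweet per line."""
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)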
Example #4
def main():
    archive = rest.fetch_user_archive(sys.argv[1])
    processed_word_list = []

    for page in archive:
        for tweet in page:
            # print_tweet(tweet)
            print(u"{0}: {1}".format(tweet["user"]["screen_name"],
                                     tweet["text"]))
            print("Length of the tweets: ", len(tweet["text"]))
            processed_word_list = filter_stop_words(processed_word_list,
                                                    tweet["text"])
            plot_tweets_by_wordlist(processed_word_list)
        break
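
filter_stop_words and plot_tweets_by_wordlist are helpers from the example's own project and are not shown. A minimal sketch of the word-filtering step, assuming a small hard-coded stop-word set (the real helper may rely on NLTK or a larger list):

STOP_WORDS = {"the", "a", "an", "and", "or", "to", "of", "in", "is", "it"}

def filter_stop_words(processed_word_list, text):
    """Hypothetical stand-in: append the non-stop-words of `text` to the running word list and return it."""
    for word in text.lower().split():
        if word not in STOP_WORDS:
            processed_word_list.append(word)
    return processed_word_list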
Example #5
def main():
    archive = rest.fetch_user_archive(sys.argv[1])
    processed_word_list = []

    for page in archive:
        for tweet in page:
            # print_tweet(tweet)
            cur_text = tweet["text"]
            # print(u"{0}: {1}".format(tweet["user"]["screen_name"], cur_text))
            # print("Length of the tweets: ", len(cur_text))
            cur_text = cleanse_sentence(cur_text)
            processed_word_list = filter_stop_words(processed_word_list,
                                                    cur_text)
            plot_tweets_by_wordlist(processed_word_list, True)
        break
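
cleanse_sentence is another project helper that is not shown; the name suggests the tweet text is normalised before stop-word filtering. A purely illustrative sketch (the real helper may do more or less than this):

import re

def cleanse_sentence(text):
    """Hypothetical stand-in: strip URLs, @mentions and extra whitespace from the tweet text."""
    text = re.sub(r"http\S+", " ", text)  # remove URLs
    text = re.sub(r"@\w+", " ", text)     # remove @mentions
    return re.sub(r"\s+", " ", text).strip()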
def save_user_archive_list(unfiltered_user_path, months_back, lang, capital):
    """
    Fetch tweets (at most 200x200) for a list of users given in a CSV, filter the users that still exist according to
    their activity and the selection criteria (user["interface"] / user["time_zone"] / tweet["language"] / tweet["place"]),
    and then save them to a CSV, 'filtered_users.csv', plus some other analysis CSVs.
    :param unfiltered_user_path: (str) path of the file that contains the list of unique users, "unique_users.csv"
    :param months_back: (int) a user that has not tweeted within the last months_back months is considered inactive
    :param lang: (str) language of the nation under analysis, e.g. "it"
    :param capital: (str) capital of the nation under analysis, e.g. "rome"
    :return: 1) 'filtered_users.csv' with the ids of the users that satisfy the activity and selection criteria
             2) 'filtered_time_users.csv' with the ids of the selected users and their retrieval date
             3) 'unique_active_users_time.csv' with the number of users saved per retrieval date
             4) 'nationality_users.csv' with the selection parameters:
                 n_tweets, user_interface, user_time_zone, tweet_language, tweet_place
             5) 'perc_overlap.csv' / 'abs_overlap.csv' with the impact of each selection criterion
             6) 'frequency_users.csv' with the tweeting frequency of the users
    """

    with open('filtered_users.csv', "w") as f, open('filtered_time_users.csv', "w") as ft, \
            open('unique_active_users_time.csv', "w") as t, open('nationality_users.csv', "w") as n, \
            open('abs_overlap.csv', "w") as a, open('perc_overlap.csv', "w") as p, \
            open('frequency_users.csv', "w") as freq:

        dict_retrival_date = {}
        unique_active_users = []
        frequency = []
        perc_users = {}
        u_interface = [
            'user_interface', 0, 0, 0, 0
        ]  # user_interface, &user_time_zone, &tweet_language, &tweet_place
        u_time_zone = [
            'user_time_zone', 0, 0, 0, 0
        ]  # &user_interface, user_time_zone, &tweet_language, &tweet_place
        u_language = [
            'tweet_language', 0, 0, 0, 0
        ]  # &user_interface, &user_time_zone, tweet_language, &tweet_place
        u_place = [
            'tweet_place', 0, 0, 0, 0
        ]  # &user_interface, &user_time_zone, &tweet_language, tweet_place
        n.write(
            "user_id, n_tweets, user_interface, user_time_zone, tweet_language, tweet_place\n"
        )
        freq.write("days interval, users' count\n")

        for user, retrival_date in unfiltered_user_path:

            perc_users[user] = [0, 0, 0, 0, 0]
            archive_first_page = rest.fetch_user_archive(
                user, 1, 200)  # get the first page of the archive
            n_tweets_user = 0

            # filter
            for tweets in archive_first_page:

                # if the profile still exists
                if "errors" not in archive_first_page:

                    date_first_tweet = parser.parse(
                        tweets[0]['created_at']).strftime("%Y-%m-%d")
                    date_first_tweet_str = datetime.datetime.strptime(
                        date_first_tweet, "%Y-%m-%d").date()
                    month_ago = (datetime.datetime.today() -
                                 relativedelta(months=months_back)).date()

                    for tweet in tweets:
                        # if the user is still active
                        if date_first_tweet_str >= month_ago:
                            n_tweets_user += 1

                            language = tweet["lang"]
                            if tweet["place"] is not None:
                                place = tweet["place"]["country_code"].lower()
                            else:
                                place = None
                            if tweet["user"]["lang"] is not None:
                                interface = tweet["user"]["lang"].lower()
                            else:
                                interface = None
                            if tweet["user"]["time_zone"] is not None:
                                time_zone = tweet["user"]["time_zone"].lower()
                            else:
                                time_zone = None

                            # analyse all their tweets
                            if interface == lang:
                                perc_users[user][0] += 1
                            if time_zone == capital:
                                perc_users[user][1] += 1
                            if language == lang:
                                perc_users[user][2] += 1
                            if place == lang:
                                perc_users[user][3] += 1

                            # if the national criteria are still valid
                            if interface == lang or time_zone == capital or language == lang or place == lang:

                                # selection phase: unique users
                                if user not in unique_active_users:
                                    f.write("{0}\n".format(user))
                                    ft.write("{0},{1}\n".format(
                                        user, retrival_date))

                                    if retrival_date in dict_retrival_date:
                                        dict_retrival_date[retrival_date] += 1
                                    else:
                                        dict_retrival_date[retrival_date] = 1

                                    if interface == lang:
                                        u_interface[1] += 1
                                        if time_zone == capital:
                                            u_interface[2] += 1
                                        if language == lang:
                                            u_interface[3] += 1
                                        if place == lang:
                                            u_interface[4] += 1
                                    if time_zone == capital:
                                        u_time_zone[2] += 1
                                        if interface == lang:
                                            u_time_zone[1] += 1
                                        if language == lang:
                                            u_time_zone[3] += 1
                                        if place == lang:
                                            u_time_zone[4] += 1
                                    if language == lang:
                                        u_language[3] += 1
                                        if interface == lang:
                                            u_language[1] += 1
                                        if time_zone == capital:
                                            u_language[2] += 1
                                        if place == lang:
                                            u_language[4] += 1
                                    if place == lang:
                                        u_place[4] += 1
                                        if interface == lang:
                                            u_place[1] += 1
                                        if time_zone == capital:
                                            u_place[2] += 1
                                        if language == lang:
                                            u_place[3] += 1

                                    unique_active_users.append(user)
                            else:
                                break
                    # the nationality/consistency counts use all of the tweets, but only selected users are written out
                    if user in unique_active_users:
                        date_first_tweet = parser.parse(
                            tweets[0]['created_at'])
                        date_last_tweet = parser.parse(
                            tweets[-1]['created_at'])  # last (oldest) tweet of the page
                        time_interval = date_first_tweet - date_last_tweet  # in days
                        frequency.append(time_interval.days)

                        perc_users[user] = [
                            l / n_tweets_user for l in perc_users[user]
                        ]
                        perc_users[user][4] = n_tweets_user
                        n.write("{0},{1},{2},{3},{4},{5}\n".format(
                            user, perc_users[user][4], perc_users[user][0],
                            perc_users[user][1], perc_users[user][2],
                            perc_users[user][3]))

        print(dict_retrival_date)

        u_tot = 0
        for key in dict_retrival_date:
            u_tot += dict_retrival_date[key]
            t.write("{0},{1}\n".format(key, dict_retrival_date[key]))

        for item in Counter(frequency).items():
            freq.write("{0},{1}\n".format(item[0], item[1]))

        # summary: overlap tables
        a.write(
            "{0}, user_interface, user_time_zone, tweet_language, tweet_place\n"
            .format(u_tot))
        a.write("{0},{1},{2},{3},{4}\n".format(u_interface[0], u_interface[1],
                                               u_interface[2], u_interface[3],
                                               u_interface[4]))
        a.write("{0},{1},{2},{3},{4}\n".format(u_time_zone[0], u_time_zone[1],
                                               u_time_zone[2], u_time_zone[3],
                                               u_time_zone[4]))
        a.write("{0},{1},{2},{3},{4}\n".format(u_language[0], u_language[1],
                                               u_language[2], u_language[3],
                                               u_language[4]))
        a.write("{0},{1},{2},{3},{4}\n".format(u_place[0], u_place[1],
                                               u_place[2], u_place[3],
                                               u_place[4]))
        p.write(
            " , user_interface, user_time_zone, tweet_language, tweet_place\n")
        if u_interface[1] != 0:
            p.write("{0},{1},{2},{3},{4}\n".format(
                u_interface[0], u_interface[1] / u_tot,
                u_interface[2] / u_interface[1],
                u_interface[3] / u_interface[1],
                u_interface[4] / u_interface[1]))
        else:
            p.write("{0},{1},{2},{3},{4}\n".format(u_interface[0],
                                                   u_interface[1] / u_tot,
                                                   None, None, None))
        if u_time_zone[2] != 0:
            p.write("{0},{1},{2},{3},{4}\n".format(
                u_time_zone[0], u_time_zone[1] / u_time_zone[2],
                u_time_zone[2] / u_tot, u_time_zone[3] / u_time_zone[2],
                u_time_zone[4] / u_time_zone[2]))
        else:
            p.write("{0},{1},{2},{3},{4}\n".format(u_time_zone[0], None,
                                                   u_time_zone[2] / u_tot,
                                                   None, None))
        if u_language[3] != 0:
            p.write("{0},{1},{2},{3},{4}\n".format(
                u_language[0], u_language[1] / u_language[3],
                u_language[2] / u_language[3], u_language[3] / u_tot,
                u_language[4] / u_language[3]))
        else:
            p.write("{0},{1},{2},{3},{4}\n".format(u_language[0], None, None,
                                                   u_language[3] / u_tot,
                                                   None))
        if u_place[4] != 0:
            p.write("{0},{1},{2},{3},{4}\n".format(u_place[0],
                                                   u_place[1] / u_place[4],
                                                   u_place[2] / u_place[4],
                                                   u_place[3] / u_place[4],
                                                   u_place[4] / u_tot))
        else:
            p.write("{0},{1},{2},{3},{4}\n".format(u_place[0], None, None,
                                                   None, u_place[4] / u_tot))
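
Although the docstring describes unfiltered_user_path as a file path, the function iterates over it as (user, retrieval_date) pairs, so the caller presumably loads the CSV first. A hedged usage sketch, assuming "unique_users.csv" has two columns, user id and retrieval date:

import csv

with open("unique_users.csv") as f:
    user_rows = [(row[0], row[1]) for row in csv.reader(f)]

save_user_archive_list(user_rows, months_back=1, lang="it", capital="rome")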
def download_archive_json(filtered_user_path, archive_name, min_tweet_date,
                          max_tweet_date, disk_out, last_user):
    # download_archive.download_archive_json('unique_frequent_users.csv', 'archive_frequent_users_2018', "2018-01-01",
    # "2018-12-31", '', None)
    """
    Function to just download the archive as JSON.
    :param filtered_user_path: (str) path of the CSV file with the list of filtered, selected users
    :param archive_name: (str) name of the compressed file with the archive
    :param min_tweet_date: (str) lower limit for saving tweets (to be changed each month to speed up the code)
    :param max_tweet_date: (str) upper limit for saving tweets
    :param disk_out: (str) path of the output location where the JSON is saved
    :param last_user: (int) fallback if there is no JSON file: user id from which to start retrieving data
    :return: JSON file with the tweets between min_tweet_date and max_tweet_date
    """

    no_retrieved_users = [
        row[0]
        for row in list_no_retrieved_users(filtered_user_path, archive_name +
                                           '.json', disk_out, last_user)
    ]

    processed_users, too_frequent_users = 0, 0
    user_archive_list = None
    min_tweet_date = parser.parse(min_tweet_date)
    max_tweet_date = parser.parse(max_tweet_date)
    tot_users = len(no_retrieved_users)

    # open the too-frequent-users log and the gzipped JSON archive together so both are closed on exit
    with open(os.path.join(disk_out, "too_frequent_users.csv"), "a") as tf, \
            gzip.open(os.path.join(disk_out, archive_name + '.json.gz'),
                      'at', encoding='utf-8') as f_json:
        for user_str in no_retrieved_users:

            processed_users += 1
            user = int(user_str)
            print('User: ' + user_str)

            # fetch the user's archive as a list of pages (most recent first)
            user_archive_list = list(rest.fetch_user_archive(user))
            if "errors" not in user_archive_list:  # if the profile still exists
                if len(user_archive_list) > 0 and len(user_archive_list[-1]) > 0:
                    date_last_tweet = parser.parse(
                        user_archive_list[-1][-1]['created_at']).replace(tzinfo=None)
                    if len(user_archive_list) >= 16:  # too frequent users are written in a file
                        tf.write("{0},{1}\n".format(user_str, date_last_tweet))
                        too_frequent_users += 1

                for page in user_archive_list:  # for each page, from the most recent to the oldest
                    n_tweets_pag = len(page)
                    if n_tweets_pag > 0 and "error" not in page:  # if no errors in pages
                        date_first_tweet = parser.parse(
                            page[0]['created_at']).replace(tzinfo=None)
                        date_last_tweet = parser.parse(
                            page[n_tweets_pag -
                                 1]['created_at']).replace(tzinfo=None)
                        # if the page has tweets in the desired time interval (no time zone)
                        # pages with some tweets in the interval
                        if date_first_tweet >= min_tweet_date and date_last_tweet <= max_tweet_date:
                            # if all the tweets are in the interval
                            if date_last_tweet >= min_tweet_date and date_first_tweet <= max_tweet_date:
                                # no extra checks on the tweets and save tweets of the page in json
                                for tweet in page:
                                    # save tweets in a json file
                                    json_str = '{}\n'.format(json.dumps(tweet))
                                    f_json.write(json_str)
                            else:
                                for tweet in page:  # check 1 by one and save tweets of the page in json
                                    date_tweet = parser.parse(
                                        tweet['created_at']).replace(
                                            tzinfo=None)
                                    if min_tweet_date <= date_tweet <= max_tweet_date:
                                        # save tweets in a json file
                                        json_str = '{}\n'.format(
                                            json.dumps(tweet))
                                        f_json.write(json_str)

            # timer to see what I am doing
            if processed_users % 10 == 0:
                print(
                    "---------------------------------------------------------------------------------"
                )
                print("user:"******"percentage of processed users:" +
                      str(processed_users / tot_users))
                print("processed users:" + str(processed_users))
                print("too frequent users:" + str(too_frequent_users))
def download_new_user(unfiltered_user_path, archive_name, min_tweet_date,
                      max_tweet_date, months_back, lang, capital,
                      threshold_lang, disk_out, last_user, chunk_users):
    # download_archive.download_new_user('unique_users.csv', 'archive_01_08_2018', "2018-01-01", "2018-12-31", 1, "it",
    # "rome", 2, '', None, 0, 50)
    """
    For each user, check whether the account still exists, whether the selection criteria are still satisfied on the
    first page of the archive (last 200 tweets), and whether the user has been active in the last month; if all three
    checks pass, the user is marked as selected and all of its tweets in the time interval are saved to a JSON file.
    :param unfiltered_user_path: (str) path of the csv file with the list of unfiltered selected users
    :param archive_name: (str) name of the compressed file with the archive
    :param min_tweet_date: (str) lower limit to save the tweets (to be changed each month to speed up the code)
    :param max_tweet_date: (str) upper limit to save the tweets
    :param months_back: (int) number of months to go backwards; a user with no activity in that window is considered inactive (e.g. 1)
    :param lang: (str) selection parameter ("it")
    :param capital: (str) selection parameter ("rome")
    :param threshold_lang: (int) selection parameter: minimum level of consistency required for the language.
    :param disk_out: (str) path of the disk_out to save json.gz
    :param last_user: (int) fallback if there is no CSV file: user id (as int) from which to start retrieving data
    :param chunk_users: (int) number of users of the chunk files
    :return: json and csv.gz files with the tweets and the tables of the selected users between min_tweet_date and
    max_tweet_date
    """

    unfiltered_users_retrieval_date = list_no_retrieved_users(
        unfiltered_user_path, archive_name + '.json', disk_out, last_user)
    unfiltered_users = [row[0] for row in unfiltered_users_retrieval_date]

    processed_users = 0
    unique_active_users = 0
    min_tweet_date = parser.parse(min_tweet_date)
    max_tweet_date = parser.parse(max_tweet_date)
    n_unfiltered_users = len(unfiltered_users)
    f_json = open(os.path.join(disk_out, archive_name + '.json'),
                  'a',
                  encoding='utf-8')
    filename = archive_name + '_user.csv'
    data_folder = os.path.join(disk_out, archive_name + '_' + "data")
    user_csv_file = open_file(data_folder, filename, 'w', 'csv')
    user_csv = csv.writer(user_csv_file, delimiter=',')

    for user_str in unfiltered_users_retrieval_date:

        selected_criteria = False  # user not yet selected for criteria
        saved_user = False  # user saved in csv
        processed_users += 1
        user = int(user_str[0])
        retrieval_date = user_str[1]
        print('User: ' + str(user))

        # fetch the archive as a list of pages (most recent first); the first page holds the latest 200 tweets
        archive = list(rest.fetch_user_archive(user))
        if len(archive) > 0 and "errors" not in archive:  # if the profile still exists
            archive_first_page = archive[0]
            n_tweets_first_page = len(archive_first_page)
            if n_tweets_first_page > 0:
                tweet_1 = archive_first_page[0]
                date_first_tweet_date = parser.parse(tweet_1['created_at'])  # as date
                date_first_tweet_str = date_first_tweet_date.strftime(
                    "%Y-%m-%d")
                date_first_tweet = datetime.datetime.strptime(
                    date_first_tweet_str, "%Y-%m-%d").date()  # as str
                month_ago = (
                    datetime.datetime.today() -
                    relativedelta(months=months_back)).date()  # as str
                date_last_tweet = parser.parse(
                    archive_first_page[n_tweets_first_page - 1]['created_at'])

                if date_first_tweet >= month_ago:

                    time_interval = date_first_tweet_date - date_last_tweet  # in days
                    if time_interval.total_seconds() != 0:
                        frequency_user = n_tweets_first_page / time_interval.total_seconds()

                        # check first page
                        first_full_page = False
                        for page in archive:  # for each page, from the most recent to the oldest
                            n_tweets_pag = len(page)
                            if n_tweets_pag > 0 and "error" not in page:  # if no errors in pages
                                if first_full_page is False:
                                    # consistency
                                    nationality_user = [
                                        0, 0, 0, 0
                                    ]  # percentage of the satisfied criteria in time

                                    #  characteristics of each tweet
                                    for tweet in archive_first_page:

                                        language = tweet["lang"]

                                        if tweet["place"] is not None:
                                            place = tweet["place"][
                                                "country_code"].lower()
                                        else:
                                            place = None

                                        if tweet["user"]["lang"] is not None:
                                            interface = tweet["user"][
                                                "lang"].lower()
                                        else:
                                            interface = None

                                        if tweet["user"][
                                                "time_zone"] is not None:
                                            time_zone = tweet["user"][
                                                "time_zone"].lower()
                                        else:
                                            time_zone = None

                                        #  if the national criteria are still valid
                                        if interface == lang or time_zone == capital or language == lang or \
                                                place == lang:

                                            if interface == lang:
                                                nationality_user[0] += 1
                                            if time_zone == capital:
                                                nationality_user[1] += 1
                                            if language == lang:
                                                nationality_user[2] += 1
                                            if place == lang:
                                                nationality_user[3] += 1

                                    if nationality_user[0] > 0 or nationality_user[1] > 0 or \
                                            nationality_user[2] > threshold_lang or nationality_user[3] > 0:
                                        selected_criteria = True  # user selected: criteria satisfied
                                    # save the array in str
                                    nationality_user = str([
                                        l / n_tweets_first_page
                                        for l in nationality_user
                                    ])
                                    retrieval_date = datetime.datetime.strptime(
                                        retrieval_date, "%Y-%m-%d")
                                    first_full_page = True

                                # save user in csv, if criteria satisfied
                                if selected_criteria is True:
                                    #  save user in the csv
                                    if saved_user is False:
                                        unique_active_users += 1
                                        user_l = user_from_tweet(
                                            tweet_1, retrieval_date,
                                            frequency_user, nationality_user)
                                        user_csv.writerow(user_l)
                                        saved_user = True

                                    # save also its tweets
                                    # if the page has tweets in the desired time interval (no time zone)
                                    date_first_tweet = parser.parse(
                                        page[0]['created_at']).replace(
                                            tzinfo=None)
                                    date_last_tweet = parser.parse(
                                        page[n_tweets_pag -
                                             1]['created_at']).replace(
                                                 tzinfo=None)
                                    # pages with some tweets in the interval
                                    if date_first_tweet >= min_tweet_date and date_last_tweet <= max_tweet_date:
                                        # if all the tweets are in the interval
                                        if date_last_tweet >= min_tweet_date and date_first_tweet <= max_tweet_date:
                                            # no extra checks on the tweets and save tweets of the page in json
                                            for tweet in page:

                                                # save tweets in a json file
                                                write_tweet_in_json(
                                                    tweet, f_json)
                                        else:
                                            for tweet in page:  # check 1 by one and save tweets of the page in json
                                                date_tweet = parser.parse(
                                                    tweet['created_at']
                                                ).replace(tzinfo=None)
                                                if min_tweet_date <= date_tweet <= max_tweet_date:

                                                    # save tweets in a json file and
                                                    write_tweet_in_json(
                                                        tweet, f_json)

        # timer to see what I am doing
        if processed_users % 10 == 0:
            print(
                "---------------------------------------------------------------------------------"
            )
            print("user:"******"percentage of processed users:" +
                  str(processed_users / n_unfiltered_users))
            print("processed users:" + str(processed_users))

        # make a chunked backup copy every chunk_users users
        if chunk_users > 0 and processed_users % chunk_users == 0:
            user_csv_file.close()  # close the file (a csv.writer has no close())
            os.rename(os.path.join(data_folder, filename),
                      os.path.join(data_folder, 'backup_' + filename))  # do a backup copy

            # reopen the file and start writing to it again
            user_csv_file = open_file(data_folder, filename, 'w', 'csv')
            user_csv = csv.writer(user_csv_file, delimiter=',')

    user_csv_file.close()
    f_json.close()
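
write_tweet_in_json is referenced but not shown; a minimal sketch consistent with how download_archive_json writes tweets (one JSON object per line to an already-open text file handle):

import json

def write_tweet_in_json(tweet, f_json):
    """Hypothetical stand-in: write one tweet dict to the open file as a single JSON line."""
    f_json.write('{}\n'.format(json.dumps(tweet)))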