Example #1
import datetime

# FolderIO, JSONParser, and TweetHelper are project-specific helper classes
# assumed to be importable; they are not shown in this example.


def main():
    folderIO = FolderIO()
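    # get_files is assumed to take (directory, recurse_subfolders, file_extension)
    # and to return file objects exposing a .name attribute, as used below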
    files = folderIO.get_files("D:/DLSU/Masters/MS Thesis/data-2016/02/", False, ".json")

    print("Found {} files.".format(len(files)))

    file_stats = open('results_stats.txt', 'a', encoding='utf-8')
    file_summary = open('results_summary.txt', 'a', encoding='utf-8')
    file_full = open('results_full.txt', 'a', encoding='utf-8')
    file_frequency = open('results_frequency.txt', 'a', encoding='utf-8')

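    # Track the longest reply thread seen across all input files;
    # max_node is only used by the commented-out tree-construction code below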
    max_count = 0
    max_tweet_id = None
    max_node = None

    json_parser = JSONParser()
    for file in files:

        print("\nProcessing {}".format(file))

        # Append date-time to the result files
        file_stats.write('\n{}-{}\n'.format(datetime.datetime.now(), file.name))
        file_summary.write('\n{}-{}\n'.format(datetime.datetime.now(), file.name))
        file_full.write('\n{}-{}\n'.format(datetime.datetime.now(), file.name))
        file_frequency.write('\n{}-{}\n'.format(datetime.datetime.now(), file.name))

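        # Per-file state: a histogram of reply-thread lengths and a set of
        # already-processed tweet IDs so duplicate tweets are skipped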
        thread_length_freq = {}
        processed_tweet_ids = set()
        tweet_helper = TweetHelper()
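        # tweet_helper.api is assumed to expose the underlying Twitter API client;
        # it is not used directly in the rest of this example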
        api = tweet_helper.api
        lines_processed = 0
        tweets_processed = 0

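        # Stream the file one tweet at a time; parse_file_into_json_generator is
        # assumed to yield one parsed JSON object per tweet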
        for tweet_json in json_parser.parse_file_into_json_generator(file):

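            # retrieve_tweet is assumed to fetch the full tweet object by ID,
            # returning None when the tweet is unavailable (e.g. deleted)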
            curr_tweet = tweet_helper.retrieve_tweet(tweet_json["id"])
            lines_processed += 1

            if curr_tweet is not None and curr_tweet.id not in processed_tweet_ids:

                processed_tweet_ids.add(curr_tweet.id)

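                # list_reply_ancestors is assumed to follow the reply chain and
                # return this tweet together with all of its ancestors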
                curr_reply_thread = tweet_helper.list_reply_ancestors(curr_tweet)
                curr_reply_thread_count = len(curr_reply_thread)

                thread_length_freq[curr_reply_thread_count] = thread_length_freq.get(curr_reply_thread_count, 0) + 1

                if curr_reply_thread_count > max_count:
                    max_count = curr_reply_thread_count
                    max_tweet_id = curr_tweet.id
                    file_summary.write("{}:\n{}\n\n".format(
                        max_count,
                        "\n".join(str(reply.id) for reply in curr_reply_thread)))
                    file_summary.flush()

                # Dump full threads of length 3 or more: each tweet is written
                # as "@user: text" followed by its tweet ID
                if curr_reply_thread_count >= 3:
                    thread_text = "\n".join(
                        "@{}: {}\n{}\n".format(reply.user.screen_name, reply.text, reply.id)
                        for reply in curr_reply_thread)
                    file_full.write("{}:\n{}\n\n".format(curr_reply_thread_count, thread_text))
                    file_full.flush()

                tweets_processed += 1

                # Unused code for constructing reply tree
                # curr_reply_thread_tree = tweet_helper.construct_reply_thread(curr_tweet)
                # curr_reply_thread_count = count_nodes(curr_reply_thread_tree)

                # print("{} with {} nodes\n".format(curr_reply_thread_tree.data.id, curr_reply_thread_count))
                # print("{}\n".format(curr_reply_thread_tree.__str__()))

                # if curr_reply_thread_count > max_count:
                #     max_count = curr_reply_thread_count
                #     max_node = curr_reply_thread_tree
                #     max_tweet_id = max_node.data.id

                    # file_summary.write("{} with {} nodes\n".format(max_tweet_id, max_count))
                    # file_full.write("{}\n".format(max_node.__str__()))

                    # print("{} with {} nodes\n".format(max_tweet_id, max_count))
                    # print("{}\n".format(max_node.__str__()))

            # Print a progress update every 10 lines
            if lines_processed % 10 == 0:
                print("Processed {} lines so far ({} tweets kept)".format(lines_processed, tweets_processed))

        file_stats.write('{} lines with {} successfully processed tweets\n'.format(lines_processed, tweets_processed))
        file_stats.flush()
        # Write the reply-thread length frequency counts to results_frequency
        for count, frequency in sorted(thread_length_freq.items()):
            file_frequency.write('{} - {}\n'.format(count, frequency))
        file_frequency.flush()

    # Close the result files once all input files have been processed
    file_stats.close()
    file_summary.close()
    file_full.close()
    file_frequency.close()


if __name__ == '__main__':
    main()