def __build_tree_r(self, node, reply_structure):
    """Recursively attach reply tweets beneath `node`, mirroring `reply_structure`."""
    for child_tweet_id, child_reply_structure in reply_structure.items():
        # The reply tweet itself may be missing from the collected data.
        child_tweet = self.reply_tweets.get(child_tweet_id)
        # Attach the PHEME annotation for this tweet, if one is available.
        if ThreadTree.tweet_annotations is not None:
            annotation = ThreadTree.tweet_annotations.get(child_tweet_id)
        else:
            annotation = None
        child = Tweet(child_tweet_id, child_tweet, node, self.root, annotation)
        node.children.append(child)
        # Cache the Tweet object on disk so later runs can reload it.
        if not isfile(self.tweet_folder + child_tweet_id):
            save_object(child, self.tweet_folder + child_tweet_id)
        self.__build_tree_r(child, child_reply_structure)
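For reference, the recursion above assumes reply_structure is shaped like a thread's structure.json in the PHEME data: a nested dict mapping each tweet ID to the structure of its replies, with an empty dict for leaves. A hypothetical example (IDs made up):

reply_structure = {
    "552783667052167168": {},        # leaf reply, no children
    "552784600502915072": {
        "552785249964597248": {},    # a reply to a reply
    },
}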
Example #2
    return thread


if __name__ == "__main__":

    folder_paths = ["data/class_labels", "data/datasets", "data/features/labels", "data/features/tf_idf",
                    "data/features/top", "data/features/word2vec", "data/test_tweets", "data/tf_idf_documents",
                    "data/thread_lists", "data/threads", "data/tweets", "data/vocabularies",
                    "data/word2vec_documents", "models/", "scores/ablation", "submissions/", "plots/"]

    for folder_path in folder_paths:
        if not exists(folder_path):
            makedirs(folder_path)

    twitter_dataset_folder = "../rumoureval-2019-training-data/twitter-english/"
    if not isfile('data/twitter_stories'):
        twitter_stories_folders = glob.glob(twitter_dataset_folder + "*")
        # basename is cross-platform; splitting on "\\" only works on Windows
        # (requires `from os.path import basename`).
        twitter_stories = [
            basename(story_folder) for story_folder in twitter_stories_folders
        ]
        save_object(twitter_stories, 'data/twitter_stories')
    else:
        twitter_stories = load_object('data/twitter_stories')

    print('Stories: ' + str(twitter_stories))

    if not isfile('data/annotations'):
        print('Reading tweet annotations...')
        # Read the annotation file line by line; the context manager closes it.
        with open("../pheme-rumour-scheme-dataset/annotations/en-scheme-annotations.json",
                  "r") as annotations_file:
            annotations_data = [line.rstrip('\n') for line in annotations_file]
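The snippet breaks off here. A plausible continuation, assuming each non-comment line of en-scheme-annotations.json is a JSON object with a "tweetid" field (an assumption about the PHEME file format, not code from the source):

import json

tweet_annotations = {}
for line in annotations_data:
    if line.startswith('#'):
        continue  # skip comment lines, if the file contains any
    annotation = json.loads(line)
    tweet_annotations[annotation['tweetid']] = annotation
save_object(tweet_annotations, 'data/annotations')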
def build_tree(self, reply_structure):
    """Build the thread tree, starting from the source tweet at the top of `reply_structure`."""
    source_tweet_id, source_reply_structure = next(iter(reply_structure.items()))
    # Cache the root (source) tweet on disk, as is done for the replies.
    if not isfile(self.tweet_folder + source_tweet_id):
        save_object(self.root, self.tweet_folder + source_tweet_id)
    self.__build_tree_r(self.root, source_reply_structure)
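A minimal usage sketch for build_tree, assuming reply_structure comes from a thread's structure.json; thread_folder and thread_tree are hypothetical names:

import json

# Load the nested reply structure and build the tree from the source tweet.
with open(thread_folder + "/structure.json", "r") as structure_file:
    reply_structure = json.load(structure_file)
thread_tree.build_tree(reply_structure)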
Example #4
    print('Stories in test set: ' + str(twitter_stories))

    print('Processing tweet threads...')
    graph_features = {}
    for twitter_story in twitter_stories:
        thread_folders, thread_ids = get_threads_from_story(twitter_story)
        print('Number of threads about ' + twitter_story + ': ' +
              str(len(thread_ids)))
        for thread_folder, thread_id in zip(thread_folders, thread_ids):
            thread_tree = read_tweet_json(thread_folder, thread_id)
            thread_nx_graph = thread_tree.get_nx_graph()
            graph_features.update(
                get_graph_features(thread_nx_graph, thread_tree))
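get_graph_features is not shown in this excerpt; below is a hypothetical sketch of the kind of structural features it could derive from the networkx graph (the function name, feature choices, and the thread_tree.root.id attribute are all assumptions):

import networkx as nx

def get_graph_features_sketch(nx_graph, thread_tree):
    # Distance from the source tweet to every reachable reply.
    depths = nx.shortest_path_length(nx_graph, source=thread_tree.root.id)
    return {
        thread_tree.root.id: {
            'node_count': nx_graph.number_of_nodes(),
            'edge_count': nx_graph.number_of_edges(),
            'max_depth': max(depths.values()),
        }
    }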

    tweet_files = glob.glob('data/test_tweets/*')
    if not isfile('data/datasets/test_dataset_dictionary'):
        tf_idf_document_sets = [{} for _ in range(8)]

        language_style_features_tweets = {}
        language_style_features_user_descriptions = {}
        sentiment_features = {}
        extra_features = {}

        for i, tweet_file in enumerate(tweet_files):
            print('Processing tweet #' + str(i + 1) + '...')
            tweet = load_object(tweet_file)
            tweet_id = tweet.id

            tweet_words = tweet.get_words_from_tweet_text()
            tweet_ngrams = get_ngrams_from_words(tweet_words)
            for n, ngrams in enumerate(tweet_ngrams):
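The snippet is cut off mid-loop. For context, a minimal sketch of what get_ngrams_from_words presumably returns, one list per n-gram order (an assumed shape, not the author's implementation):

def get_ngrams_from_words(words, max_n=3):
    # ngrams[0] holds unigrams, ngrams[1] bigrams, and so on.
    return [
        [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]
        for n in range(1, max_n + 1)
    ]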
Example #5
    ]
    # Drop one feature group at a time so each model can be scored without it.
    for columns in (language_style_columns, tf_idf_word2vec_columns,
                    sentiment_columns, graph_columns, extra_columns):
        datasets.append(dataset.drop(columns, axis=1))
    return datasets
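The excerpt does not show how datasets is initialized, but the zip with ablation_labels in the main block below suggests one human-readable label per dataset, in append order. A hypothetical pairing (labels made up):

ablation_labels = ['all features', 'no language style', 'no tf-idf/word2vec',
                   'no sentiment', 'no graph', 'no extra']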


if __name__ == "__main__":
    if not isfile('scores/ablation/task_a_comment_scores.tsv'):
        print("Performing ablation experiment for comment model...")
        comment_dataset = pd.read_csv(
            'data/datasets/task_a_comment_dataset.tsv',
            sep='\t',
            index_col=False,
            header=0,
            encoding='utf-8')
        comment_class_labels = load_object(
            'data/class_labels/task_a_comment_class_labels')

        comment_ablation_scores = {}
        ablation_datasets = get_ablation_datasets(comment_dataset)
        del comment_dataset

        for ablation_label, ablation_dataset in zip(ablation_labels,