def social_context_dataset_statistics():
    """
    Utility method to compute corpus statistics for a social context dataset
    (PHEME-style directory structure) under a given path.

    :return: None
    """
    # social_context_data_dir = "C:\\Data\\NLP-corpus\\aug_rnr\\twitter1516"
    import os
    from data_loader import load_abs_path

    social_context_data_dir = os.path.join(
        os.path.dirname(__file__), '..', "data", "social_context",
        "aug-rnr-annotated-threads-retweets")
    social_context_data_dir = load_abs_path(social_context_data_dir)
    print("check social context corpus [%s] ... " % social_context_data_dir)

    events_dataset_dirs = []
    for root, dirs, files in os.walk(social_context_data_dir):
        # print("root: ", root)
        # print("dirs: ", dirs)
        # print("files size: ", len(files))
        # only the top-level directories (one per event) are needed
        events_dataset_dirs = dirs
        break

    print("total [%s] events dataset" % len(events_dataset_dirs))
    print(events_dataset_dirs)

    # check every individual event corpus
    for event_dataset_dir in events_dataset_dirs:
        labelled_event_dataset_statistics(social_context_data_dir,
                                          event_dataset_dir)
        print(" ========================================== ")

    print("complete.")
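# labelled_event_dataset_statistics() is defined elsewhere in this module. For
# reference, a minimal sketch of what it might look like is given below; the
# per-label subdirectory names ("rumours", "non-rumours") follow the usual
# PHEME layout and are assumptions, not confirmed by this file.
def _labelled_event_dataset_statistics_sketch(social_context_data_dir,
                                              event_dataset_dir):
    import os
    event_dir = os.path.join(social_context_data_dir, event_dataset_dir)
    for label in ("rumours", "non-rumours"):
        label_dir = os.path.join(event_dir, label)
        if not os.path.isdir(label_dir):
            continue
        # each source tweet thread has its own subdirectory named by tweet id
        total_threads = len(os.listdir(label_dir))
        print("event [%s], label [%s]: %s threads" %
              (event_dataset_dir, label, total_threads))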
def test_context_sequence_encoding(self):
    elmo_credbank_model_path = load_abs_path(
        os.path.join(
            os.path.dirname(__file__), '..', "resource", "embedding",
            "elmo_model",
            "elmo_credbank_2x4096_512_2048cnn_2xhighway_weights_10052019.hdf5"
        ))
    elmo_embedder = ElmoTokenEmbedder(
        options_file=
        "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json",
        weight_file=elmo_credbank_model_path,
        do_layer_norm=False,
        dropout=0.5)
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    EXPECTED_CONTEXT_INPUT_SIZE = 60
    rumor_classifier = RumorTweetsClassifer(
        word_embeddings,
        None,
        None,
        None,
        classifier_feedforward=None,
        cxt_content_encoder=None,
        cxt_metadata_encoder=None,
        social_context_self_attention_encoder=None,
        cuda_device=-1)

    tweet_id = "500327120770301952"
    single_source_tweet_tensor_1 = self.tweet_context_encoding_by_tweet_id(
        rumor_classifier, tweet_id)
    print(type(single_source_tweet_tensor_1))
    print(single_source_tweet_tensor_1.shape)
    assert isinstance(single_source_tweet_tensor_1, torch.Tensor)
    assert single_source_tweet_tensor_1.shape == (
        97, EXPECTED_CONTEXT_INPUT_SIZE
    ), "expected shape is [97, %s]" % EXPECTED_CONTEXT_INPUT_SIZE

    tweet_id = "552806117328568321"  # with three replies
    single_source_tweet_tensor_2 = self.tweet_context_encoding_by_tweet_id(
        rumor_classifier, tweet_id)
    print(type(single_source_tweet_tensor_2))
    print(single_source_tweet_tensor_2.shape)
    assert isinstance(single_source_tweet_tensor_2, torch.Tensor)
    assert single_source_tweet_tensor_2.shape == (
        94, EXPECTED_CONTEXT_INPUT_SIZE
    ), "expected shape is [94, %s]" % EXPECTED_CONTEXT_INPUT_SIZE

    tweet_id = "552806117328568321"  # with three replies
    print("social context encoding without numerical features.")
    single_source_tweet_tensor_2 = self.tweet_context_encoding_by_tweet_id(
        rumor_classifier, tweet_id, disable_nf=True)
    print(type(single_source_tweet_tensor_2))
    print(single_source_tweet_tensor_2.shape)
    assert isinstance(single_source_tweet_tensor_2, torch.Tensor)
    assert single_source_tweet_tensor_2.shape == (
        94, EXPECTED_CONTEXT_INPUT_SIZE
    ), "expected shape is [94, %s]" % EXPECTED_CONTEXT_INPUT_SIZE
def test_elmo_with_attention():
    import os
    from data_loader import load_abs_path
    from embedding_layer import word_embedding_elmo

    elmo_credbank_model_path = load_abs_path(
        os.path.join(
            os.path.dirname(__file__), '..', "resource", "embedding",
            "elmo_model",
            "elmo_credbank_2x4096_512_2048cnn_2xhighway_weights_10052019.hdf5")
    )
    elmo_options_file_path = load_abs_path(
        os.path.join(os.path.dirname(__file__), '..', "resource", "embedding",
                     "elmo_model",
                     "elmo_2x4096_512_2048cnn_2xhighway_options.json"))

    sentence3 = [
        "9/11", "sandy", "hook", "movie", "shooting", "boston", "bomb",
        "threats", "from", "n.", "korea", "and", "several", "other",
        "tragedies", "were", "all", "under", "the", "age", "of", "18"
    ]
    fine_tuned_elmo = ElmoEmbedder(options_file=elmo_options_file_path,
                                   weight_file=elmo_credbank_model_path)
    avg_all_layer_sent_embedding = word_embedding_elmo(sentence3,
                                                       fine_tuned_elmo)
    # print(avg_all_layer_sent_embedding)
    print("content avg ELMo embedding shape : ",
          avg_all_layer_sent_embedding.shape)
    assert avg_all_layer_sent_embedding.shape == (22, 1024)

    attention_layer = HierarchicalAttentionNet(1024, step_dim=22)
    maxlen = 200
    attention_weights = attention_layer.forward(
        torch.as_tensor(avg_all_layer_sent_embedding), maxlen)
    print("context attention weights shape: ", attention_weights.shape)
    assert attention_weights.shape == torch.Size([1, 1024])
    print(attention_weights)
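# word_embedding_elmo() is imported from embedding_layer. Based on the
# (22, 1024) shape asserted above and the identical logic used inline in
# test_elmo_output_with_self_attention() below, a minimal sketch of the layer
# averaging it presumably performs is shown here; this is an assumption about
# the helper, not its actual implementation. ElmoEmbedder.embed_sentence()
# returns one (seq_len, 1024) matrix per ELMo layer, i.e. a (3, seq_len, 1024)
# array.
import numpy as np

def word_embedding_elmo_sketch(tokens, elmo_embedder):
    # (num_layers=3, seq_len, 1024) -> (seq_len, 1024) by averaging the layers
    layer_vectors = elmo_embedder.embed_sentence(tokens)
    return np.mean(layer_vectors, axis=0, dtype='float32')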
def test_context_feature_encoder(self):
    elmo_credbank_model_path = load_abs_path(
        os.path.join(
            os.path.dirname(__file__), '..', "resource", "embedding",
            "elmo_model",
            "elmo_credbank_2x4096_512_2048cnn_2xhighway_weights_10052019.hdf5"
        ))
    # test context feature encoding with small sample data
    # to make sure that source tweet contexts are sorted in temporal order
    elmo_embedder = ElmoTokenEmbedder(
        options_file=
        "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json",
        weight_file=elmo_credbank_model_path,
        do_layer_norm=False,
        dropout=0.5)
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
    rumor_classifier = RumorTweetsClassifer(word_embeddings, None, None, None,
                                            None)
    propagation_embeddings_tensor = rumor_classifier.batch_compute_context_feature_encoding(
        ['500294803402137600', '500327120770301952'])
    print("propagation_embeddings_tensor: ", propagation_embeddings_tensor)
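# The temporal ordering checked above presumably relies on sorting context
# tweets by their "created_at" timestamp before encoding. A minimal sketch of
# such a sort is given below; the field name and Twitter's timestamp format
# are assumptions for illustration, not taken from this file.
from datetime import datetime

def sort_context_tweets_by_time(context_tweets):
    # Twitter's created_at format, e.g. "Thu Aug 14 12:00:00 +0000 2014"
    fmt = "%a %b %d %H:%M:%S %z %Y"
    return sorted(context_tweets,
                  key=lambda tweet: datetime.strptime(tweet["created_at"], fmt))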
"1) AttentionWithContext (default). However, we got [%s]" % attention_option) print( "training RumourDNN model on development dataset [%s] and [%s] with gpu [%s]" % (train_set_path, heldout_set_path, no_gpu)) import allennlp_rumor_classifier import data_loader from allennlp_rumor_classifier import config_gpu_use config_gpu_use(no_gpu) allennlp_rumor_classifier.elmo_credbank_model_path = load_abs_path( os.path.join( os.path.dirname(__file__), '..', "resource", "embedding", "elmo_model", "elmo_credbank_2x4096_512_2048cnn_2xhighway_weights_10052019.hdf5") ) data_loader.social_context_data_dir = os.path.join( os.path.dirname(__file__), '..', "data", "social_context", "aug-rnr-annotated-threads-retweets") print("Fine-tuned ELMo model is set to [%s]" % allennlp_rumor_classifier.elmo_credbank_model_path) print("social context corpus for all events directory is set to [%s]" % data_loader.social_context_data_dir) # Reasonable minibatch sizes are usually: 32, 64, 128, 256, 512, 1024 (powers of 2 are a common convention) # Usually, you can choose a batch size that is as large as your GPU memory allows # (matrix-multiplication and the size of fully-connected layers are usually the bottleneck)
def statistics_rumour_dnn_dataset(file_name):
    """
    Perform statistics of social context for a given training data set file.

    :param file_name: path to the training set CSV file
    :return: None
    """
    print("statistics of [%s]" % file_name)
    df_file = load_matrix_from_csv(file_name, 0, 1, header=0)
    # for dataset_row in df_file[:]:
    #     print("tweet id: [%s]" % dataset_row[0])
    all_tweet_ids = [dataset_row[0] for dataset_row in df_file[:]]
    print("all_tweet_ids size: ", len(all_tweet_ids))

    import os
    import statistics
    from data_loader import load_tweets_context_dataset_dir
    from data_loader import load_abs_path

    social_context_data_dir = os.path.join(
        os.path.dirname(__file__), '..', "data", "social_context",
        "aug-rnr-annotated-threads-retweets")
    social_context_data_dir = load_abs_path(social_context_data_dir)
    context_tweets_dataset_dir_dict = load_tweets_context_dataset_dir(
        social_context_data_dir)

    all_replies_list = []
    all_retweets_list = []
    for tweet_id in all_tweet_ids:
        total_replies, total_retweets = count_social_context(
            str(tweet_id), context_tweets_dataset_dir_dict)
        all_replies_list.append(total_replies)
        all_retweets_list.append(total_retweets)

    print("total_replies_list: ", all_replies_list)
    print("total_retweets_list: ", all_retweets_list)

    total_reactions = sum(all_replies_list)
    min_reactions = 0 if len(all_replies_list) == 0 else min(all_replies_list)
    max_reactions = 0 if len(all_replies_list) == 0 else max(all_replies_list)
    avg_reactions = 0 if len(all_replies_list) == 0 else round(
        total_reactions / len(all_replies_list), 1)
    std_reactions = 0 if len(all_replies_list) == 0 else statistics.stdev(
        all_replies_list)
    # The median has a big advantage over the mean: it is not skewed as much
    # by extremely large or small values.
    # see also https://www.geeksforgeeks.org/python-statistics-median/
    median_reactions = 0 if len(all_replies_list) == 0 else statistics.median(
        all_replies_list)

    total_retweets = sum(all_retweets_list)
    min_retweets = 0 if len(all_retweets_list) == 0 else min(all_retweets_list)
    max_retweets = 0 if len(all_retweets_list) == 0 else max(all_retweets_list)
    avg_retweets = 0 if len(all_retweets_list) == 0 else round(
        total_retweets / len(all_retweets_list), 1)
    std_retweets = 0 if len(all_retweets_list) == 0 else statistics.stdev(
        all_retweets_list)
    median_retweets = 0 if len(all_retweets_list) == 0 else statistics.median(
        all_retweets_list)

    print(
        "total reactions: [%s], min reactions: [%s], max reactions: [%s], "
        "avg reactions: [%s], std reactions: [%s], median reactions: [%s]" %
        (total_reactions, min_reactions, max_reactions, avg_reactions,
         std_reactions, median_reactions))
    print(
        "total retweets: [%s], min retweets: [%s], max retweets: [%s], "
        "avg retweets: [%s], std retweets: [%s], median retweets: [%s]" %
        (total_retweets, min_retweets, max_retweets, avg_retweets,
         std_retweets, median_retweets))
    print("total tweets without reactions: [%s]" % all_replies_list.count(0))
    print("total tweets without retweets: [%s]" % all_retweets_list.count(0))

    results = dict()
    results["total_reactions"] = total_reactions
    results["min_reactions"] = min_reactions
    results["max_reactions"] = max_reactions
    results["avg_reactions"] = avg_reactions
    results["std_reactions"] = std_reactions
    results["median_reactions"] = median_reactions
    results["total_retweets"] = total_retweets
    results["min_retweets"] = min_retweets
    results["max_retweets"] = max_retweets
    results["avg_retweets"] = avg_retweets
    results["std_retweets"] = std_retweets
    results["median_retweets"] = median_retweets
    print("statistics: ")
    print(results)
def test_elmo_output_with_self_attention():
    import os
    from data_loader import load_abs_path
    import numpy as np

    elmo_credbank_model_path = load_abs_path(
        os.path.join(
            os.path.dirname(__file__), '..', "resource", "embedding",
            "elmo_model",
            "elmo_credbank_2x4096_512_2048cnn_2xhighway_weights_10052019.hdf5")
    )
    elmo_options_file_path = load_abs_path(
        os.path.join(os.path.dirname(__file__), '..', "resource", "embedding",
                     "elmo_model",
                     "elmo_2x4096_512_2048cnn_2xhighway_options.json"))

    sentence4 = [
        "i", "really", "enjoy", "Ashley", "and", "Ami", "salon", "she", "do",
        "a", "great", "job", "be", "friendly", "and", "professional", "I",
        "usually", "get", "my", "hair", "do", "when", "i", "go", "to", "to",
        "MI", "because", "of", "the", "quality", "of", "the", "highlight",
        "and", "the", "price", "be", "very", "affordable", "the", "highlight",
        "fantastic", "thank", "Ashley", "i", "highly", "recommend", "you",
        "and", "ill", "be", "back"
    ]
    fine_tuned_elmo = ElmoEmbedder(options_file=elmo_options_file_path,
                                   weight_file=elmo_credbank_model_path)
    sentence_vectors = fine_tuned_elmo.embed_sentence(sentence4)
    # average the three ELMo layers: (3, seq_len, 1024) -> (seq_len, 1024)
    avg_all_layer_sent_embedding = np.mean(sentence_vectors,
                                           axis=0,
                                           dtype='float32')

    print("test with self-attentive model: ")
    # stack into a batch of one; note the two permute(1, 0, 2) calls cancel
    # each other out, leaving the (batch_size, num_seq, 2*hidden_size) layout
    # that the self-attention encoder expects
    self_attention_elmo_input = torch.stack(
        [torch.as_tensor(avg_all_layer_sent_embedding)]).permute(1, 0, 2)
    # ELMo output.size() = (batch_size, num_seq, 2*hidden_size)
    self_attention_elmo_input = self_attention_elmo_input.permute(1, 0, 2)
    print(
        "self_attention_elmo_input shape (batch_size, num_seq, 2*hidden_size) : ",
        self_attention_elmo_input.shape)

    self_attention_model = StructuredSelfAttention(1024)
    # print(self_attention_elmo_input)
    concatenated_context_embeddings, attn_weight_matrix = self_attention_model.forward(
        self_attention_elmo_input, if_concat=True)
    print(
        "self attention weights (annotation A) of ELMo embedding shape (batch_size, r, num_seq): ",
        attn_weight_matrix.shape)
    print("attn_weight_matrix: ", attn_weight_matrix)
    # assert attn_weight_matrix.shape == torch.Size([22, 30, 1])
    # fc_input_tensor = hidden_matrix.view(-1, hidden_matrix.size()[1] * hidden_matrix.size()[2])
    print(
        "concatenated hidden_matrix (before feeding into FC and softmax) shape:",
        concatenated_context_embeddings.shape)

    avg_context_embeddings, attn_weight_matrix = self_attention_model.forward(
        self_attention_elmo_input, if_concat=False)
    print(
        "averaged hidden_matrix (before feeding into FC and softmax) shape:",
        avg_context_embeddings.shape)
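# StructuredSelfAttention follows the self-attentive sentence embedding of
# Lin et al. (2017): A = softmax(W_s2 * tanh(W_s1 * H^T)), M = A * H. The
# sketch below illustrates that formulation; the d_a and r sizes and the exact
# module interface are assumptions, with only the (batch, r, num_seq)
# attention shape printed above taken from this file.
import torch
import torch.nn as nn
import torch.nn.functional as F

class StructuredSelfAttentionSketch(nn.Module):
    def __init__(self, hidden_size, d_a=350, r=30):
        super().__init__()
        self.w_s1 = nn.Linear(hidden_size, d_a, bias=False)
        self.w_s2 = nn.Linear(d_a, r, bias=False)

    def forward(self, h, if_concat=True):
        # h: (batch_size, num_seq, hidden_size)
        # attention over sequence positions, one distribution per hop r
        a = F.softmax(self.w_s2(torch.tanh(self.w_s1(h))), dim=1)
        a = a.transpose(1, 2)   # (batch_size, r, num_seq)
        m = torch.bmm(a, h)     # (batch_size, r, hidden_size)
        if if_concat:
            # concatenate the r attention hops into one flat embedding
            return m.reshape(m.size(0), -1), a
        # or average the r hops into a (batch_size, hidden_size) embedding
        return m.mean(dim=1), a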