def clean_data(ds_name: str, rare_count: int, cfg: PreProcessingConfigs):
    corpus_path = cfg.corpus_dir + ds_name + cfg.data_set_extension
    ds_corpus_cleaned = cfg.corpus_cleaned_dir + ds_name + cfg.data_set_extension

    # Checkers
    check_data_set(data_set_name=ds_name, all_data_set_names=cfg.data_sets)
    check_paths(corpus_path)
    create_dir(dir_path=cfg.corpus_cleaned_dir, overwrite=False)

    docs_of_words = [clean_str(line.strip().decode('latin1')).split() for line in open(corpus_path, 'rb')]
    word_counts = extract_word_counts(docs_of_words=docs_of_words)
    stop_words = retrieve_stop_words(language='english')
    if ds_name != 'mr':  # If data-set is 'mr', don't remove stop and rare words, TODO: find why
        docs_of_words = remove_stop_words(docs_of_words, stop_words=stop_words)
        docs_of_words = remove_rare_words(docs_of_words, word_counts=word_counts, rare_count=rare_count)
    docs_of_words = glue_lines(lines_of_words=docs_of_words, glue_str=' ', with_strip=True)
    write_iterable_to_file(an_iterable=docs_of_words, file_path=ds_corpus_cleaned, file_mode='w')

    print("[INFO] Cleaned-Corpus Dir='{}'".format(cfg.corpus_cleaned_dir))
    print("[INFO] Rare-Count=<{}>".format(rare_count))
    print("[INFO] ========= CLEANED DATA: Removed rare & stop-words. =========")
def build_adjacency(ds_name: str, cfg: PreProcessingConfigs):
    """Build Adjacency Matrix of Doc-Word Heterogeneous Graph"""
    # Input files
    ds_corpus = cfg.corpus_shuffled_dir + ds_name + ".txt"
    ds_corpus_vocabulary = cfg.corpus_shuffled_vocab_dir + ds_name + '.vocab'
    ds_corpus_train_idx = cfg.corpus_shuffled_split_index_dir + ds_name + '.train'
    ds_corpus_test_idx = cfg.corpus_shuffled_split_index_dir + ds_name + '.test'

    # Checkers
    check_data_set(data_set_name=ds_name, all_data_set_names=cfg.data_sets)
    check_paths(ds_corpus, ds_corpus_vocabulary, ds_corpus_train_idx, ds_corpus_test_idx)
    create_dir(dir_path=cfg.corpus_shuffled_adjacency_dir, overwrite=False)

    docs_of_words = [line.split() for line in open(file=ds_corpus)]
    vocab = open(ds_corpus_vocabulary).read().splitlines()  # Extract Vocabulary.
    word_to_id = {word: i for i, word in enumerate(vocab)}  # Word to its id.
    train_size = len(open(ds_corpus_train_idx).readlines())  # Real train-size, not adjusted.
    test_size = len(open(ds_corpus_test_idx).readlines())  # Real test-size.

    windows_of_words = extract_windows(docs_of_words=docs_of_words, window_size=20)

    # Extract word-word weights
    rows, cols, weights = extract_pmi_word_weights(windows_of_words, word_to_id, vocab, train_size)
    # As an alternative, use cosine similarity of word vectors as weights:
    # ds_corpus_word_vectors = cfg.CORPUS_WORD_VECTORS_DIR + ds_name + '.word_vectors'
    # rows, cols, weights = extract_cosine_similarity_word_weights(vocab, train_size, ds_corpus_word_vectors)

    # Extract word-doc weights
    rows, cols, weights = extract_tf_idf_doc_word_weights(rows, cols, weights, vocab, train_size,
                                                          docs_of_words, word_to_id)

    adjacency_len = train_size + len(vocab) + test_size
    adjacency_matrix = csr_matrix((weights, (rows, cols)), shape=(adjacency_len, adjacency_len))

    # Dump Adjacency Matrix
    with open(cfg.corpus_shuffled_adjacency_dir + "/ind.{}.adj".format(ds_name), 'wb') as f:
        pickle.dump(adjacency_matrix, f)

    print("[INFO] Adjacency Dir='{}'".format(cfg.corpus_shuffled_adjacency_dir))
    print("[INFO] ========= EXTRACTED ADJACENCY MATRIX: Heterogeneous doc-word adjacency matrix. =========")
def shuffle_data(ds_name: str, cfg: PreProcessingConfigs):
    ds_corpus = cfg.corpus_cleaned_dir + ds_name + cfg.data_set_extension
    ds_corpus_meta = cfg.corpus_meta_dir + ds_name + '.meta'

    ds_corpus_shuffled = cfg.corpus_shuffled_dir + ds_name + cfg.data_set_extension
    ds_corpus_shuffled_train_idx = cfg.corpus_shuffled_split_index_dir + ds_name + '.train'
    ds_corpus_shuffled_test_idx = cfg.corpus_shuffled_split_index_dir + ds_name + '.test'
    ds_corpus_shuffled_meta = cfg.corpus_shuffled_meta_dir + ds_name + '.meta'

    # Checkers
    check_data_set(data_set_name=ds_name, all_data_set_names=cfg.data_sets)
    check_paths(ds_corpus_meta, ds_corpus)

    # Create dirs if not exist
    create_dir(cfg.corpus_shuffled_dir, overwrite=False)
    create_dir(cfg.corpus_shuffled_meta_dir, overwrite=False)
    create_dir(cfg.corpus_shuffled_split_index_dir, overwrite=False)

    all_doc_meta_list, train_doc_meta_list, test_doc_meta_list = load_corpus_meta(corpus_meta_path=ds_corpus_meta)
    cleaned_doc_lines = [line.strip() for line in open(ds_corpus, 'r')]

    # Shuffle train ids and write to file
    train_doc_meta_ids = [all_doc_meta_list.index(train_doc_meta) for train_doc_meta in train_doc_meta_list]
    random.shuffle(train_doc_meta_ids)
    write_iterable_to_file(an_iterable=train_doc_meta_ids, file_path=ds_corpus_shuffled_train_idx, file_mode='w')

    # Shuffle test ids and write to file
    test_doc_meta_ids = [all_doc_meta_list.index(test_doc_meta) for test_doc_meta in test_doc_meta_list]
    random.shuffle(test_doc_meta_ids)
    write_iterable_to_file(an_iterable=test_doc_meta_ids, file_path=ds_corpus_shuffled_test_idx, file_mode='w')

    all_doc_meta_ids = train_doc_meta_ids + test_doc_meta_ids

    # Write shuffled meta to file
    shuffled_doc_meta_list = [all_doc_meta_list[all_doc_meta_id] for all_doc_meta_id in all_doc_meta_ids]
    write_iterable_to_file(an_iterable=shuffled_doc_meta_list, file_path=ds_corpus_shuffled_meta, file_mode='w')

    # Write shuffled document files to file
    shuffled_doc_lines = [cleaned_doc_lines[all_doc_meta_id] for all_doc_meta_id in all_doc_meta_ids]
    write_iterable_to_file(an_iterable=shuffled_doc_lines, file_path=ds_corpus_shuffled, file_mode='w')

    print("[INFO] Shuffled-Corpus Dir='{}'".format(cfg.corpus_shuffled_dir))
    print("[INFO] ========= SHUFFLED DATA: Corpus documents shuffled. =========")
def prepare_words(ds_name: str, cfg: PreProcessingConfigs):
    ds_corpus = cfg.corpus_shuffled_dir + ds_name + cfg.data_set_extension

    # Checkers
    check_data_set(data_set_name=ds_name, all_data_set_names=cfg.data_sets)
    check_paths(ds_corpus)

    # Create output directories
    create_dir(dir_path=cfg.corpus_shuffled_vocab_dir, overwrite=False)
    create_dir(dir_path=cfg.corpus_shuffled_word_vectors_dir, overwrite=False)

    ds_corpus_vocabulary = cfg.corpus_shuffled_vocab_dir + ds_name + '.vocab'
    ds_corpus_word_vectors = cfg.corpus_shuffled_word_vectors_dir + ds_name + '.word_vectors'

    # ###################################################

    # Build vocabulary
    docs_of_words_generator = (line.split() for line in open(ds_corpus))
    vocabulary = extract_vocabulary(docs_of_words=docs_of_words_generator)
    write_iterable_to_file(an_iterable=vocabulary, file_path=ds_corpus_vocabulary, file_mode='w')

    # Extract word definitions
    word_definitions = extract_word_definitions(vocabulary=vocabulary)
    # write_iterable_to_file(word_definitions, file_path='/<>' + ds, file_mode='w+')

    # Extract & Dump word vectors
    word_vectors = extract_tf_idf_word_vectors(word_definitions=word_definitions, max_features=1000)
    word_to_word_vectors_dict = OrderedDict((word, vec.tolist()) for word, vec in zip(vocabulary, word_vectors))
    pickle.dump(obj=word_to_word_vectors_dict, file=open(ds_corpus_word_vectors, mode='wb'))

    print("[INFO] Vocabulary Dir='{}'".format(cfg.corpus_shuffled_vocab_dir))
    print("[INFO] Word-Vector Dir='{}'".format(cfg.corpus_shuffled_word_vectors_dir))
    print("[INFO] ========= PREPARED WORDS: Vocabulary & word-vectors extracted. =========")
def build_node_features(ds_name: str, validation_ratio: float, use_predefined_word_vectors: bool,
                        cfg: PreProcessingConfigs):
    # Input files for building node features
    ds_corpus = cfg.corpus_shuffled_dir + ds_name + '.txt'
    ds_corpus_meta = cfg.corpus_shuffled_meta_dir + ds_name + '.meta'
    ds_corpus_vocabulary = cfg.corpus_shuffled_vocab_dir + ds_name + '.vocab'
    ds_corpus_train_idx = cfg.corpus_shuffled_split_index_dir + ds_name + '.train'
    ds_corpus_test_idx = cfg.corpus_shuffled_split_index_dir + ds_name + '.test'

    # Output directory of node features
    dir_corpus_node_features = cfg.corpus_shuffled_node_features_dir + "/" + ds_name

    # Checkers
    check_data_set(data_set_name=ds_name, all_data_set_names=cfg.data_sets)
    check_paths(ds_corpus, ds_corpus_meta, ds_corpus_vocabulary)
    check_paths(ds_corpus_train_idx, ds_corpus_test_idx)

    # Create output directory of node features
    create_dir(dir_path=dir_corpus_node_features, overwrite=False)

    # Adjust train size, for different training rates, for example: use 90% of training set
    real_train_size = len(open(ds_corpus_train_idx).readlines())
    adjusted_train_size = ceil(real_train_size * (1.0 - validation_ratio))
    test_size = len(open(ds_corpus_test_idx).readlines())

    # Extract word_vectors and word_embedding_dimension
    if use_predefined_word_vectors:
        ds_corpus_word_vectors = cfg.corpus_shuffled_word_vectors_dir + ds_name + '.word_vectors'
        # ds_corpus_word_vectors = 'glove.6B.300d.txt'  # Alternatively, you can use GloVe word-embeddings
        word_vectors, word_emb_dim = load_word_to_word_vectors(path=ds_corpus_word_vectors)
    else:
        word_vectors, word_emb_dim = OrderedDict(), 300  # todo: parametrize

    vocabulary = open(ds_corpus_vocabulary).read().splitlines()  # Extract Vocabulary
    doc_meta_list = open(file=ds_corpus_meta, mode='r').read().splitlines()  # Extract Meta List
    doc_labels = extract_doc_labels(ds_corpus_meta_file=ds_corpus_meta)  # Extract Document Labels

    docs_of_words = [line.split() for line in open(file=ds_corpus)]  # Extract Documents of Words
    # for i, words in enumerate(docs_of_words):
    #     if words == []:
    #         if doc_meta_list[i].split('\t')[-1] == 'ham':
    #             words.extend(['MEETING', 'TOMORROW'])
    #         else:
    #             words.extend(['WIN', 'LOTTERY'])

    # Extract mean document word vectors and one-hot labels of train-set
    x = compute_x(docs_of_words, adjusted_train_size, word_emb_dim, w_vectors=word_vectors)
    y = compute_y(doc_meta_list, train_size=adjusted_train_size, doc_labels=doc_labels)

    # Extract mean document word vectors and one-hot labels of test-set
    tx = compute_tx(docs_of_words, test_size, real_train_size, word_emb_dim, w_vectors=word_vectors)
    ty = compute_ty(doc_meta_list, test_size=test_size, real_train_size=real_train_size, doc_labels=doc_labels)

    # Extract doc_features + word_features
    allx = compute_allx(docs_of_words, real_train_size, vocabulary, word_vectors, emb_dim=word_emb_dim)
    ally = compute_ally(doc_meta_list, real_train_size, doc_labels, vocab_size=len(vocabulary))

    # Dump node-feature matrices to files
    node_feature_matrices = {"x": x, "y": y, "tx": tx, "ty": ty, "allx": allx, "ally": ally}
    dump_node_features(directory=dir_corpus_node_features, ds=ds_name, node_features_dict=node_feature_matrices)

    print("[INFO] x.shape= {},\t y.shape= {}".format(x.shape, y.shape))
    print("[INFO] tx.shape= {},\t ty.shape= {}".format(tx.shape, ty.shape))
    print("[INFO] allx.shape={},\t ally.shape={}".format(allx.shape, ally.shape))
    print("[INFO] ========= EXTRACTED NODE FEATURES: x, y, tx, ty, allx, ally. =========")
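# ---------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original source): one way the
# pre-processing steps above could be chained for a single data set. The helper
# name `run_preprocessing_pipeline` and the argument values shown here
# (rare_count=5, validation_ratio=0.1, use_predefined_word_vectors=False) are
# illustrative assumptions; a populated PreProcessingConfigs instance is
# expected to be supplied by the caller.
def run_preprocessing_pipeline(ds_name: str, cfg: PreProcessingConfigs):
    clean_data(ds_name=ds_name, rare_count=5, cfg=cfg)      # 1. Clean raw corpus (stop & rare words)
    shuffle_data(ds_name=ds_name, cfg=cfg)                  # 2. Shuffle docs, write train/test split indices
    prepare_words(ds_name=ds_name, cfg=cfg)                 # 3. Vocabulary & word vectors
    build_adjacency(ds_name=ds_name, cfg=cfg)               # 4. Heterogeneous doc-word adjacency matrix
    build_node_features(ds_name=ds_name, validation_ratio=0.1,
                        use_predefined_word_vectors=False, cfg=cfg)  # 5. x, y, tx, ty, allx, ally
    # A training run (train_model below) would follow, driven by a separate TrainingConfigs.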
def train_model(ds_name: str, is_featureless: bool, cfg: TrainingConfigs):
    configure_cuda()
    check_data_set(data_set_name=ds_name, all_data_set_names=cfg.data_sets)
    set_seeds(seed=2019)

    # Load corpus & unpack values
    corpus_values = load_corpus(ds_name, cfg.corpus_split_index_dir, cfg.corpus_node_features_dir,
                                cfg.corpus_adjacency_dir)
    adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, train_size, test_size = corpus_values

    if is_featureless:
        features = sp.identity(features.shape[0])

    features, support, num_supports, model_func = prepare_matrices(features, adj, cfg.model,
                                                                   cfg.chebyshev_max_degree)

    # Define placeholders
    t_features = torch.from_numpy(features)
    t_y_train = torch.from_numpy(y_train)
    t_y_val = torch.from_numpy(y_val)
    t_y_test = torch.from_numpy(y_test)
    t_train_mask = torch.from_numpy(train_mask.astype(np.float32))
    tm_train_mask = torch.transpose(torch.unsqueeze(t_train_mask, 0), 1, 0).repeat(1, y_train.shape[1])

    t_support = []
    for i in range(len(support)):
        # noinspection PyArgumentList
        t_support.append(torch.Tensor(support[i]))

    # if torch.cuda.is_available():
    #     model_func = model_func.cuda()
    #     t_features = t_features.cuda()
    #     t_y_train = t_y_train.cuda()
    #     t_y_val = t_y_val.cuda()
    #     t_y_test = t_y_test.cuda()
    #     t_train_mask = t_train_mask.cuda()
    #     tm_train_mask = tm_train_mask.cuda()
    #     for i in range(len(support)):
    #         t_support = [t.cuda() for t in t_support if True]

    model = model_func(input_dim=features.shape[0], support=t_support, num_classes=y_train.shape[1])

    # Loss and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=cfg.learning_rate)

    val_losses = []

    # Train model
    for epoch in range(cfg.epochs):
        epoch_start_time = time.time()

        # Forward pass
        logits = model(t_features)
        loss = criterion(logits * tm_train_mask, torch.max(t_y_train, 1)[1])
        acc = ((torch.max(logits, 1)[1] == torch.max(t_y_train, 1)[1]).float()
               * t_train_mask).sum().item() / t_train_mask.sum().item()

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Validation
        val_loss, val_acc, pred, labels, duration = evaluate_model(model, criterion, t_features, t_y_val, val_mask)
        val_losses.append(val_loss)

        print_log("Epoch:{:.0f}, train_loss={:.5f}, train_acc={:.5f}, val_loss={:.5f}, val_acc={:.5f}, time={:.5f}"
                  .format(epoch + 1, loss, acc, val_loss, val_acc, time.time() - epoch_start_time))

        if epoch > cfg.early_stopping and val_losses[-1] > np.mean(val_losses[-(cfg.early_stopping + 1):-1]):
            print_log("Early stopping...")
            break

    print_log("Optimization Finished!")

    # Testing
    test_loss, test_acc, pred, labels, test_duration = evaluate_model(model, criterion, t_features, t_y_test,
                                                                      test_mask)
    print_log("Test set results: \n\t loss= {:.5f}, accuracy= {:.5f}, time= {:.5f}"
              .format(test_loss, test_acc, test_duration))

    test_pred = []
    test_labels = []
    for i in range(len(test_mask)):
        if test_mask[i]:
            test_pred.append(pred[i])
            test_labels.append(np.argmax(labels[i]))

    print_log("Test Precision, Recall and F1-Score...")
    print_log(metrics.classification_report(test_labels, test_pred, digits=4))
    print_log("Macro average Test Precision, Recall and F1-Score...")
    print_log(metrics.precision_recall_fscore_support(test_labels, test_pred, average='macro'))
    print_log("Micro average Test Precision, Recall and F1-Score...")
    print_log(metrics.precision_recall_fscore_support(test_labels, test_pred, average='micro'))

    # Doc and word embeddings
    tmp = model.layer1.embedding.numpy()
    word_embeddings = tmp[train_size:adj.shape[0] - test_size]
    train_doc_embeddings = tmp[:train_size]  # include val docs
    test_doc_embeddings = tmp[adj.shape[0] - test_size:]

    print_log('Embeddings:')
    print_log('\rWord_embeddings:' + str(len(word_embeddings)))
    print_log('\rTrain_doc_embeddings:' + str(len(train_doc_embeddings)))
    print_log('\rTest_doc_embeddings:' + str(len(test_doc_embeddings)))
    print_log('\rWord_embeddings:')
    print(word_embeddings)

    # Create word vectors and write them to file  # todo: commented-out
    """
    with open(cfg.corpus_vocab_dir + ds_name + '.vocab', 'r') as f:
        words = f.readlines()

    vocab_size = len(words)
    word_vectors = []
    for i in range(vocab_size):
        word = words[i].strip()
        word_vector = word_embeddings[i]
        word_vector_str = ' '.join([str(x) for x in word_vector])
        word_vectors.append(word + ' ' + word_vector_str)

    word_embeddings_str = '\n'.join(word_vectors)
    with open('./data/' + ds_name + '_word_vectors.txt', 'w') as f:
        f.write(word_embeddings_str)
    """

    # Create doc vectors and write them to file  # todo: commented-out
    """