def clean_data(ds_name: str, rare_count: int, cfg: PreProcessingConfigs):
    corpus_path = cfg.corpus_dir + ds_name + cfg.data_set_extension
    ds_corpus_cleaned = cfg.corpus_cleaned_dir + ds_name + cfg.data_set_extension

    # Checkers
    check_data_set(data_set_name=ds_name, all_data_set_names=cfg.data_sets)
    check_paths(corpus_path)
    create_dir(dir_path=cfg.corpus_cleaned_dir, overwrite=False)

    docs_of_words = [clean_str(line.strip().decode('latin1')).split() for line in open(corpus_path, 'rb')]
    word_counts = extract_word_counts(docs_of_words=docs_of_words)
    stop_words = retrieve_stop_words(language='english')
    if ds_name != 'mr':  # If data-set is 'mr', don't remove stop and rare words, TODO: find why
        docs_of_words = remove_stop_words(docs_of_words, stop_words=stop_words)
        docs_of_words = remove_rare_words(docs_of_words, word_counts=word_counts, rare_count=rare_count)
    docs_of_words = glue_lines(lines_of_words=docs_of_words, glue_str=' ', with_strip=True)
    write_iterable_to_file(an_iterable=docs_of_words, file_path=ds_corpus_cleaned, file_mode='w')

    print("[INFO] Cleaned-Corpus Dir='{}'".format(cfg.corpus_cleaned_dir))
    print("[INFO] Rare-Count=<{}>".format(rare_count))
    print("[INFO] ========= CLEANED DATA: Removed rare & stop-words. =========")
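
# Illustration only: a minimal sketch of what remove_rare_words is assumed to do above, given
# that word_counts maps each token to its corpus frequency. This is an assumption about the
# helper's behavior, not the project's actual implementation; the name _remove_rare_words_sketch
# and the ">= rare_count" threshold are hypothetical.
def _remove_rare_words_sketch(docs_of_words, word_counts, rare_count):
    """Keep only tokens that occur at least rare_count times in the corpus."""
    return [[word for word in doc if word_counts[word] >= rare_count] for doc in docs_of_words]
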
def build_adjacency(ds_name: str, cfg: PreProcessingConfigs):
    """Build Adjacency Matrix of Doc-Word Heterogeneous Graph"""
    # Input files
    ds_corpus = cfg.corpus_shuffled_dir + ds_name + ".txt"
    ds_corpus_vocabulary = cfg.corpus_shuffled_vocab_dir + ds_name + '.vocab'
    ds_corpus_train_idx = cfg.corpus_shuffled_split_index_dir + ds_name + '.train'
    ds_corpus_test_idx = cfg.corpus_shuffled_split_index_dir + ds_name + '.test'

    # Checkers
    check_data_set(data_set_name=ds_name, all_data_set_names=cfg.data_sets)
    check_paths(ds_corpus, ds_corpus_vocabulary, ds_corpus_train_idx, ds_corpus_test_idx)
    create_dir(dir_path=cfg.corpus_shuffled_adjacency_dir, overwrite=False)

    docs_of_words = [line.split() for line in open(file=ds_corpus)]
    vocab = open(ds_corpus_vocabulary).read().splitlines()  # Extract vocabulary.
    word_to_id = {word: i for i, word in enumerate(vocab)}  # Word to its id.
    train_size = len(open(ds_corpus_train_idx).readlines())  # Real train-size, not adjusted.
    test_size = len(open(ds_corpus_test_idx).readlines())  # Real test-size.

    windows_of_words = extract_windows(docs_of_words=docs_of_words, window_size=20)

    # Extract word-word weights
    rows, cols, weights = extract_pmi_word_weights(windows_of_words, word_to_id, vocab, train_size)
    # As an alternative, use cosine similarity of word vectors as weights:
    # ds_corpus_word_vectors = cfg.corpus_shuffled_word_vectors_dir + ds_name + '.word_vectors'
    # rows, cols, weights = extract_cosine_similarity_word_weights(vocab, train_size, ds_corpus_word_vectors)

    # Extract word-doc weights
    rows, cols, weights = extract_tf_idf_doc_word_weights(rows, cols, weights, vocab, train_size, docs_of_words, word_to_id)

    adjacency_len = train_size + len(vocab) + test_size
    adjacency_matrix = csr_matrix((weights, (rows, cols)), shape=(adjacency_len, adjacency_len))

    # Dump adjacency matrix
    with open(cfg.corpus_shuffled_adjacency_dir + "/ind.{}.adj".format(ds_name), 'wb') as f:
        pickle.dump(adjacency_matrix, f)

    print("[INFO] Adjacency Dir='{}'".format(cfg.corpus_shuffled_adjacency_dir))
    print("[INFO] ========= EXTRACTED ADJACENCY MATRIX: Heterogeneous doc-word adjacency matrix. =========")
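
# For reference, a minimal sketch of the PMI weighting that extract_pmi_word_weights is assumed to
# apply to word pairs co-occurring inside the sliding windows (a standard PMI-over-windows
# formulation; the actual helper may differ in details). Consistent with
# adjacency_len = train_size + len(vocab) + test_size above, word w would map to graph node
# train_size + word_to_id[w]. All names in this sketch are illustrative.
def _pmi_sketch(windows_of_words):
    from collections import Counter
    from itertools import combinations
    from math import log
    num_windows = len(windows_of_words)
    word_window_counts = Counter()   # number of windows containing word i
    pair_window_counts = Counter()   # number of windows containing the pair (i, j)
    for window in windows_of_words:
        unique_words = set(window)
        word_window_counts.update(unique_words)
        pair_window_counts.update(combinations(sorted(unique_words), 2))
    pmi_scores = {}
    for (w_i, w_j), pair_count in pair_window_counts.items():
        # PMI(i, j) = log( p(i, j) / (p(i) * p(j)) ), with probabilities estimated over windows
        score = log((pair_count * num_windows) / (word_window_counts[w_i] * word_window_counts[w_j]))
        if score > 0.0:  # keep only positively associated pairs as edges
            pmi_scores[(w_i, w_j)] = score
    return pmi_scores
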
def prepare_words(ds_name: str, cfg: PreProcessingConfigs):
    ds_corpus = cfg.corpus_shuffled_dir + ds_name + cfg.data_set_extension

    # Checkers
    check_data_set(data_set_name=ds_name, all_data_set_names=cfg.data_sets)
    check_paths(ds_corpus)

    # Create output directories
    create_dir(dir_path=cfg.corpus_shuffled_vocab_dir, overwrite=False)
    create_dir(dir_path=cfg.corpus_shuffled_word_vectors_dir, overwrite=False)

    ds_corpus_vocabulary = cfg.corpus_shuffled_vocab_dir + ds_name + '.vocab'
    ds_corpus_word_vectors = cfg.corpus_shuffled_word_vectors_dir + ds_name + '.word_vectors'

    # Build vocabulary
    docs_of_words_generator = (line.split() for line in open(ds_corpus))
    vocabulary = extract_vocabulary(docs_of_words=docs_of_words_generator)
    write_iterable_to_file(an_iterable=vocabulary, file_path=ds_corpus_vocabulary, file_mode='w')

    # Extract word definitions
    word_definitions = extract_word_definitions(vocabulary=vocabulary)
    # write_iterable_to_file(word_definitions, file_path='/<>' + ds, file_mode='w+')

    # Extract & dump word vectors
    word_vectors = extract_tf_idf_word_vectors(word_definitions=word_definitions, max_features=1000)
    word_to_word_vectors_dict = OrderedDict((word, vec.tolist()) for word, vec in zip(vocabulary, word_vectors))
    pickle.dump(obj=word_to_word_vectors_dict, file=open(ds_corpus_word_vectors, mode='wb'))

    print("[INFO] Vocabulary Dir='{}'".format(cfg.corpus_shuffled_vocab_dir))
    print("[INFO] Word-Vector Dir='{}'".format(cfg.corpus_shuffled_word_vectors_dir))
    print("[INFO] ========= PREPARED WORDS: Vocabulary & word-vectors extracted. =========")
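
# A minimal sketch (assumption, not the repo's actual helper) of how extract_tf_idf_word_vectors
# could map each word's definition string to a dense TF-IDF vector with at most 1000 features,
# e.g. via scikit-learn's TfidfVectorizer. The returned rows are NumPy arrays, so they support
# the .tolist() call used when building word_to_word_vectors_dict above. Names are illustrative.
def _tf_idf_word_vectors_sketch(word_definitions, max_features=1000):
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(max_features=max_features)
    tf_idf_matrix = vectorizer.fit_transform(word_definitions)  # one row per word definition
    return [row for row in tf_idf_matrix.toarray()]             # dense vectors, aligned with vocabulary order
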
def shuffle_data(ds_name: str, cfg: PreProcessingConfigs):
    ds_corpus = cfg.corpus_cleaned_dir + ds_name + cfg.data_set_extension
    ds_corpus_meta = cfg.corpus_meta_dir + ds_name + '.meta'
    ds_corpus_shuffled = cfg.corpus_shuffled_dir + ds_name + cfg.data_set_extension
    ds_corpus_shuffled_train_idx = cfg.corpus_shuffled_split_index_dir + ds_name + '.train'
    ds_corpus_shuffled_test_idx = cfg.corpus_shuffled_split_index_dir + ds_name + '.test'
    ds_corpus_shuffled_meta = cfg.corpus_shuffled_meta_dir + ds_name + '.meta'

    # Checkers
    check_data_set(data_set_name=ds_name, all_data_set_names=cfg.data_sets)
    check_paths(ds_corpus_meta, ds_corpus)

    # Create dirs if they do not exist
    create_dir(cfg.corpus_shuffled_dir, overwrite=False)
    create_dir(cfg.corpus_shuffled_meta_dir, overwrite=False)
    create_dir(cfg.corpus_shuffled_split_index_dir, overwrite=False)

    all_doc_meta_list, train_doc_meta_list, test_doc_meta_list = load_corpus_meta(corpus_meta_path=ds_corpus_meta)
    cleaned_doc_lines = [line.strip() for line in open(ds_corpus, 'r')]

    # Shuffle train ids and write to file
    train_doc_meta_ids = [all_doc_meta_list.index(train_doc_meta) for train_doc_meta in train_doc_meta_list]
    random.shuffle(train_doc_meta_ids)
    write_iterable_to_file(an_iterable=train_doc_meta_ids, file_path=ds_corpus_shuffled_train_idx, file_mode='w')

    # Shuffle test ids and write to file
    test_doc_meta_ids = [all_doc_meta_list.index(test_doc_meta) for test_doc_meta in test_doc_meta_list]
    random.shuffle(test_doc_meta_ids)
    write_iterable_to_file(an_iterable=test_doc_meta_ids, file_path=ds_corpus_shuffled_test_idx, file_mode='w')

    all_doc_meta_ids = train_doc_meta_ids + test_doc_meta_ids

    # Write shuffled meta to file
    shuffled_doc_meta_list = [all_doc_meta_list[all_doc_meta_id] for all_doc_meta_id in all_doc_meta_ids]
    write_iterable_to_file(an_iterable=shuffled_doc_meta_list, file_path=ds_corpus_shuffled_meta, file_mode='w')

    # Write shuffled documents to file
    shuffled_doc_lines = [cleaned_doc_lines[all_doc_meta_id] for all_doc_meta_id in all_doc_meta_ids]
    write_iterable_to_file(an_iterable=shuffled_doc_lines, file_path=ds_corpus_shuffled, file_mode='w')

    print("[INFO] Shuffled-Corpus Dir='{}'".format(cfg.corpus_shuffled_dir))
    print("[INFO] ========= SHUFFLED DATA: Corpus documents shuffled. =========")
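
# Design note: the .index() lookups above are O(n) per document, i.e. O(n^2) over the corpus.
# Below is a minimal alternative sketch (a swapped-in optimization, not the original code),
# assuming meta lines are unique, that precomputes a line -> position map once. Names are
# illustrative.
def _doc_meta_ids_sketch(all_doc_meta_list, subset_doc_meta_list):
    meta_to_id = {doc_meta: i for i, doc_meta in enumerate(all_doc_meta_list)}
    return [meta_to_id[doc_meta] for doc_meta in subset_doc_meta_list]
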
def build_node_features(ds_name: str, validation_ratio: float, use_predefined_word_vectors: bool, cfg: PreProcessingConfigs):
    # Input files for building node features
    ds_corpus = cfg.corpus_shuffled_dir + ds_name + '.txt'
    ds_corpus_meta = cfg.corpus_shuffled_meta_dir + ds_name + '.meta'
    ds_corpus_vocabulary = cfg.corpus_shuffled_vocab_dir + ds_name + '.vocab'
    ds_corpus_train_idx = cfg.corpus_shuffled_split_index_dir + ds_name + '.train'
    ds_corpus_test_idx = cfg.corpus_shuffled_split_index_dir + ds_name + '.test'

    # Output directory of node features
    dir_corpus_node_features = cfg.corpus_shuffled_node_features_dir + "/" + ds_name

    # Checkers
    check_data_set(data_set_name=ds_name, all_data_set_names=cfg.data_sets)
    check_paths(ds_corpus, ds_corpus_meta, ds_corpus_vocabulary)
    check_paths(ds_corpus_train_idx, ds_corpus_test_idx)

    # Create output directory of node features
    create_dir(dir_path=dir_corpus_node_features, overwrite=False)

    # Adjust train size for different training rates, e.g. use 90% of the training set
    real_train_size = len(open(ds_corpus_train_idx).readlines())
    adjusted_train_size = ceil(real_train_size * (1.0 - validation_ratio))
    test_size = len(open(ds_corpus_test_idx).readlines())

    # Extract word_vectors and word_embedding_dimension
    if use_predefined_word_vectors:
        ds_corpus_word_vectors = cfg.corpus_shuffled_word_vectors_dir + ds_name + '.word_vectors'
        # ds_corpus_word_vectors = 'glove.6B.300d.txt'  # Alternatively, use pre-trained GloVe word-embeddings
        word_vectors, word_emb_dim = load_word_to_word_vectors(path=ds_corpus_word_vectors)
    else:
        word_vectors, word_emb_dim = OrderedDict(), 300  # TODO: parametrize emb_dim

    vocabulary = open(ds_corpus_vocabulary).read().splitlines()  # Extract vocabulary
    doc_meta_list = open(file=ds_corpus_meta, mode='r').read().splitlines()  # Extract meta list
    doc_labels = extract_doc_labels(ds_corpus_meta_file=ds_corpus_meta)  # Extract document labels
    docs_of_words = [line.split() for line in open(file=ds_corpus)]  # Extract documents of words

    # for i, words in enumerate(docs_of_words):
    #     if words == []:
    #         if doc_meta_list[i].split('\t')[-1] == 'ham':
    #             words.extend(['MEETING', 'TOMORROW'])
    #         else:
    #             words.extend(['WIN', 'LOTTERY'])

    # Extract mean document word vectors and one-hot labels of train-set
    x = compute_x(docs_of_words, adjusted_train_size, word_emb_dim, w_vectors=word_vectors)
    y = compute_y(doc_meta_list, train_size=adjusted_train_size, doc_labels=doc_labels)

    # Extract mean document word vectors and one-hot labels of test-set
    tx = compute_tx(docs_of_words, test_size, real_train_size, word_emb_dim, w_vectors=word_vectors)
    ty = compute_ty(doc_meta_list, test_size=test_size, real_train_size=real_train_size, doc_labels=doc_labels)

    # Extract doc_features + word_features
    allx = compute_allx(docs_of_words, real_train_size, vocabulary, word_vectors, emb_dim=word_emb_dim)
    ally = compute_ally(doc_meta_list, real_train_size, doc_labels, vocab_size=len(vocabulary))

    # Dump node feature matrices to files
    node_feature_matrices = {"x": x, "y": y, "tx": tx, "ty": ty, "allx": allx, "ally": ally}
    dump_node_features(directory=dir_corpus_node_features, ds=ds_name, node_features_dict=node_feature_matrices)

    print("[INFO] x.shape= {},\t y.shape= {}".format(x.shape, y.shape))
    print("[INFO] tx.shape= {},\t ty.shape= {}".format(tx.shape, ty.shape))
    print("[INFO] allx.shape={},\t ally.shape={}".format(allx.shape, ally.shape))
    print("[INFO] ========= EXTRACTED NODE FEATURES: x, y, tx, ty, allx, ally. =========")
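
# A hedged end-to-end usage sketch: the call order below follows the data flow between the
# functions in this module (clean -> shuffle -> prepare words -> adjacency -> node features).
# The way PreProcessingConfigs is constructed, the data-set name, and the rare_count /
# validation_ratio values are illustrative assumptions, not prescribed defaults.
if __name__ == '__main__':
    config = PreProcessingConfigs()  # hypothetical: build or load your own config instance
    data_set = 'mr'                  # illustrative data-set name; must be listed in config.data_sets
    clean_data(ds_name=data_set, rare_count=5, cfg=config)
    shuffle_data(ds_name=data_set, cfg=config)
    prepare_words(ds_name=data_set, cfg=config)
    build_adjacency(ds_name=data_set, cfg=config)
    build_node_features(ds_name=data_set, validation_ratio=0.10,
                        use_predefined_word_vectors=False, cfg=config)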