def clean_data(ds_name: str, rare_count: int, cfg: PreProcessingConfigs):
    corpus_path = cfg.corpus_dir + ds_name + cfg.data_set_extension
    ds_corpus_cleaned = cfg.corpus_cleaned_dir + ds_name + cfg.data_set_extension

    # Checkers
    check_data_set(data_set_name=ds_name, all_data_set_names=cfg.data_sets)
    check_paths(corpus_path)

    create_dir(dir_path=cfg.corpus_cleaned_dir, overwrite=False)
    with open(corpus_path, 'rb') as corpus_file:
        docs_of_words = [
            clean_str(line.strip().decode('latin1')).split()
            for line in corpus_file
        ]
    word_counts = extract_word_counts(docs_of_words=docs_of_words)
    stop_words = retrieve_stop_words(language='english')
    if ds_name != 'mr':  # 'mr' documents are short single-sentence reviews; removing stop & rare words would leave them nearly empty, so it is skipped
        docs_of_words = remove_stop_words(docs_of_words, stop_words=stop_words)
        docs_of_words = remove_rare_words(docs_of_words,
                                          word_counts=word_counts,
                                          rare_count=rare_count)
    docs_of_words = glue_lines(lines_of_words=docs_of_words,
                               glue_str=' ',
                               with_strip=True)

    write_iterable_to_file(an_iterable=docs_of_words,
                           file_path=ds_corpus_cleaned,
                           file_mode='w')
    print("[INFO] Cleaned-Corpus Dir='{}'".format(cfg.corpus_cleaned_dir))
    print("[INFO] Rare-Count=<{}>".format(rare_count))
    print("[INFO] ========= CLEANED DATA: Removed rare & stop-words. =========")
def build_adjacency(ds_name: str, cfg: PreProcessingConfigs):
    """Build Adjacency Matrix of Doc-Word Heterogeneous Graph"""

    # input files
    ds_corpus = cfg.corpus_shuffled_dir + ds_name + ".txt"
    ds_corpus_vocabulary = cfg.corpus_shuffled_vocab_dir + ds_name + '.vocab'
    ds_corpus_train_idx = cfg.corpus_shuffled_split_index_dir + ds_name + '.train'
    ds_corpus_test_idx = cfg.corpus_shuffled_split_index_dir + ds_name + '.test'

    # checkers
    check_data_set(data_set_name=ds_name, all_data_set_names=cfg.data_sets)
    check_paths(ds_corpus, ds_corpus_vocabulary, ds_corpus_train_idx,
                ds_corpus_test_idx)

    create_dir(dir_path=cfg.corpus_shuffled_adjacency_dir, overwrite=False)

    docs_of_words = [line.split() for line in open(file=ds_corpus)]
    vocab = open(ds_corpus_vocabulary).read().splitlines()  # Extract vocabulary.
    word_to_id = {word: i for i, word in enumerate(vocab)}  # Word to its id.
    train_size = len(open(ds_corpus_train_idx).readlines())  # Real train-size, not adjusted.
    test_size = len(open(ds_corpus_test_idx).readlines())  # Real test-size.

    windows_of_words = extract_windows(docs_of_words=docs_of_words,
                                       window_size=20)

    # Extract word-word weights
    rows, cols, weights = extract_pmi_word_weights(windows_of_words,
                                                   word_to_id, vocab,
                                                   train_size)
    # As an alternative, use cosine similarity of word vectors as weights:
    #   ds_corpus_word_vectors = cfg.corpus_shuffled_word_vectors_dir + ds_name + '.word_vectors'
    #   rows, cols, weights = extract_cosine_similarity_word_weights(vocab, train_size, ds_corpus_word_vectors)

    # Extract word-doc weights
    rows, cols, weights = extract_tf_idf_doc_word_weights(
        rows, cols, weights, vocab, train_size, docs_of_words, word_to_id)

    adjacency_len = train_size + len(vocab) + test_size
    adjacency_matrix = csr_matrix((weights, (rows, cols)),
                                  shape=(adjacency_len, adjacency_len))

    # Dump Adjacency Matrix
    with open(cfg.corpus_shuffled_adjacency_dir + "/ind.{}.adj".format(ds_name), 'wb') as f:
        pickle.dump(adjacency_matrix, f)

    print("[INFO] Adjacency Dir='{}'".format(cfg.corpus_shuffled_adjacency_dir))
    print("[INFO] ========= EXTRACTED ADJACENCY MATRIX: Heterogeneous doc-word adjacency matrix. =========")
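The adjacency shape computed above follows the usual TextGCN node layout: training documents first, then one node per vocabulary word, then test documents. The toy snippet below is not part of the original code and uses made-up sizes purely to illustrate how row/column indices map to nodes.

# Illustration of the node index layout (sizes are made up; the real ones come
# from build_adjacency's train_size, len(vocab) and test_size):
train_size, vocab_size, test_size = 3, 5, 2
adjacency_len = train_size + vocab_size + test_size             # 10 nodes in total
train_doc_nodes = range(0, train_size)                          # rows 0..2: training documents
word_nodes = range(train_size, train_size + vocab_size)         # rows 3..7: vocabulary words
test_doc_nodes = range(train_size + vocab_size, adjacency_len)  # rows 8..9: test documents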
def prepare_words(ds_name: str, cfg: PreProcessingConfigs):
    ds_corpus = cfg.corpus_shuffled_dir + ds_name + cfg.data_set_extension

    # Checkers
    check_data_set(data_set_name=ds_name, all_data_set_names=cfg.data_sets)
    check_paths(ds_corpus)

    # Create output directories
    create_dir(dir_path=cfg.corpus_shuffled_vocab_dir, overwrite=False)
    create_dir(dir_path=cfg.corpus_shuffled_word_vectors_dir, overwrite=False)

    ds_corpus_vocabulary = cfg.corpus_shuffled_vocab_dir + ds_name + '.vocab'
    ds_corpus_word_vectors = cfg.corpus_shuffled_word_vectors_dir + ds_name + '.word_vectors'
    # ###################################################

    # Build vocabulary
    docs_of_words_generator = (line.split() for line in open(ds_corpus))
    vocabulary = extract_vocabulary(docs_of_words=docs_of_words_generator)
    write_iterable_to_file(an_iterable=vocabulary,
                           file_path=ds_corpus_vocabulary,
                           file_mode='w')

    # Extract word definitions
    word_definitions = extract_word_definitions(vocabulary=vocabulary)
    # write_iterable_to_file(word_definitions, file_path='/<>' + ds, file_mode='w+')

    # Extract & Dump word vectors
    word_vectors = extract_tf_idf_word_vectors(
        word_definitions=word_definitions, max_features=1000)
    word_to_word_vectors_dict = OrderedDict(
        (word, vec.tolist()) for word, vec in zip(vocabulary, word_vectors))
    with open(ds_corpus_word_vectors, mode='wb') as word_vectors_file:
        pickle.dump(obj=word_to_word_vectors_dict, file=word_vectors_file)

    print("[INFO] Vocabulary Dir='{}'".format(cfg.corpus_shuffled_vocab_dir))
    print("[INFO] Word-Vector Dir='{}'".format(
        cfg.corpus_shuffled_word_vectors_dir))
    print(
        "[INFO] ========= PREPARED WORDS: Vocabulary & word-vectors extracted. ========="
    )
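The word-vector file written above is a pickled OrderedDict mapping each vocabulary word to a plain list of floats (at most max_features=1000 entries long). A minimal read-back sketch, assuming cfg and ds_name are the same objects passed to prepare_words:

import pickle

# Hypothetical check of the dumped word vectors.
ds_corpus_word_vectors = cfg.corpus_shuffled_word_vectors_dir + ds_name + '.word_vectors'
with open(ds_corpus_word_vectors, mode='rb') as f:
    word_to_vec = pickle.load(f)
some_word, some_vec = next(iter(word_to_vec.items()))
print(some_word, len(some_vec))  # at most max_features (1000) dimensions per word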
def shuffle_data(ds_name: str, cfg: PreProcessingConfigs):
    ds_corpus = cfg.corpus_cleaned_dir + ds_name + cfg.data_set_extension
    ds_corpus_meta = cfg.corpus_meta_dir + ds_name + '.meta'

    ds_corpus_shuffled = cfg.corpus_shuffled_dir + ds_name + cfg.data_set_extension
    ds_corpus_shuffled_train_idx = cfg.corpus_shuffled_split_index_dir + ds_name + '.train'
    ds_corpus_shuffled_test_idx = cfg.corpus_shuffled_split_index_dir + ds_name + '.test'
    ds_corpus_shuffled_meta = cfg.corpus_shuffled_meta_dir + ds_name + '.meta'

    # Checkers
    check_data_set(data_set_name=ds_name, all_data_set_names=cfg.data_sets)
    check_paths(ds_corpus_meta, ds_corpus)

    # Create dirs if not exist
    create_dir(cfg.corpus_shuffled_dir, overwrite=False)
    create_dir(cfg.corpus_shuffled_meta_dir, overwrite=False)
    create_dir(cfg.corpus_shuffled_split_index_dir, overwrite=False)

    all_doc_meta_list, train_doc_meta_list, test_doc_meta_list = load_corpus_meta(corpus_meta_path=ds_corpus_meta)
    cleaned_doc_lines = [line.strip() for line in open(ds_corpus, 'r')]

    # Shuffle train ids and write to file
    train_doc_meta_ids = [all_doc_meta_list.index(train_doc_meta) for train_doc_meta in train_doc_meta_list]
    random.shuffle(train_doc_meta_ids)
    write_iterable_to_file(an_iterable=train_doc_meta_ids, file_path=ds_corpus_shuffled_train_idx, file_mode='w')

    # Shuffle test ids and write to file
    test_doc_meta_ids = [all_doc_meta_list.index(test_doc_meta) for test_doc_meta in test_doc_meta_list]
    random.shuffle(test_doc_meta_ids)
    write_iterable_to_file(an_iterable=test_doc_meta_ids, file_path=ds_corpus_shuffled_test_idx, file_mode='w')

    all_doc_meta_ids = train_doc_meta_ids + test_doc_meta_ids
    # Write shuffled meta to file
    shuffled_doc_meta_list = [all_doc_meta_list[all_doc_meta_id] for all_doc_meta_id in all_doc_meta_ids]
    write_iterable_to_file(an_iterable=shuffled_doc_meta_list, file_path=ds_corpus_shuffled_meta, file_mode='w')

    # Write shuffled document files to file
    shuffled_doc_lines = [cleaned_doc_lines[all_doc_meta_id] for all_doc_meta_id in all_doc_meta_ids]
    write_iterable_to_file(an_iterable=shuffled_doc_lines, file_path=ds_corpus_shuffled, file_mode='w')

    print("[INFO] Shuffled-Corpus Dir='{}'".format(cfg.corpus_shuffled_dir))
    print("[INFO] ========= SHUFFLED DATA: Corpus documents shuffled. =========")
def build_node_features(ds_name: str, validation_ratio: float,
                        use_predefined_word_vectors: bool,
                        cfg: PreProcessingConfigs):
    # input files for building node features
    ds_corpus = cfg.corpus_shuffled_dir + ds_name + '.txt'
    ds_corpus_meta = cfg.corpus_shuffled_meta_dir + ds_name + '.meta'
    ds_corpus_vocabulary = cfg.corpus_shuffled_vocab_dir + ds_name + '.vocab'
    ds_corpus_train_idx = cfg.corpus_shuffled_split_index_dir + ds_name + '.train'
    ds_corpus_test_idx = cfg.corpus_shuffled_split_index_dir + ds_name + '.test'

    # output directory of node features
    dir_corpus_node_features = cfg.corpus_shuffled_node_features_dir + "/" + ds_name

    # checkers
    check_data_set(data_set_name=ds_name, all_data_set_names=cfg.data_sets)
    check_paths(ds_corpus, ds_corpus_meta, ds_corpus_vocabulary)
    check_paths(ds_corpus_train_idx, ds_corpus_test_idx)

    # Create output directory of node features
    create_dir(dir_path=dir_corpus_node_features, overwrite=False)

    # Adjust the train size: hold out `validation_ratio` of the training set (e.g. 0.1 keeps 90% for training)
    real_train_size = len(open(ds_corpus_train_idx).readlines())
    adjusted_train_size = ceil(real_train_size * (1.0 - validation_ratio))
    test_size = len(open(ds_corpus_test_idx).readlines())

    # Extract word_vectors and word_embedding_dimension
    if use_predefined_word_vectors:
        ds_corpus_word_vectors = 'glove.6B.300d.txt'  # Pre-trained GloVe word embeddings
        # Alternatively, use the word vectors produced by prepare_words:
        #   ds_corpus_word_vectors = cfg.corpus_shuffled_word_vectors_dir + ds_name + '.word_vectors'
        word_vectors, word_emb_dim = load_word_to_word_vectors(
            path=ds_corpus_word_vectors)
    else:
        word_vectors, word_emb_dim = OrderedDict(), 300  # todo: parametrize

    vocabulary = open(ds_corpus_vocabulary).read().splitlines()  # Extract vocabulary
    doc_meta_list = open(file=ds_corpus_meta, mode='r').read().splitlines()  # Extract meta list
    doc_labels = extract_doc_labels(ds_corpus_meta_file=ds_corpus_meta)  # Extract document labels

    docs_of_words = [line.split() for line in open(file=ds_corpus)]  # Extract documents of words
    # for i,words in enumerate(docs_of_words):
    #   if words == []:
    #     if doc_meta_list[i].split('\t')[-1] == 'ham':
    #       words.extend(['MEETING','TOMORROW'])
    #     else:
    #       words.extend(['WIN','LOTTERY'])

    # Extract mean document word vectors and one hot labels of train-set
    x = compute_x(docs_of_words,
                  adjusted_train_size,
                  word_emb_dim,
                  w_vectors=word_vectors)
    y = compute_y(doc_meta_list,
                  train_size=adjusted_train_size,
                  doc_labels=doc_labels)

    # Extract mean document word vectors and one hot labels of test-set
    tx = compute_tx(docs_of_words,
                    test_size,
                    real_train_size,
                    word_emb_dim,
                    w_vectors=word_vectors)
    ty = compute_ty(doc_meta_list,
                    test_size=test_size,
                    real_train_size=real_train_size,
                    doc_labels=doc_labels)

    # Extract doc_features + word_features
    allx = compute_allx(docs_of_words,
                        real_train_size,
                        vocabulary,
                        word_vectors,
                        emb_dim=word_emb_dim)
    ally = compute_ally(doc_meta_list,
                        real_train_size,
                        doc_labels,
                        vocab_size=len(vocabulary))

    # Dump node features matrices to files
    node_feature_matrices = {
        "x": x,
        "y": y,
        "tx": tx,
        "ty": ty,
        "allx": allx,
        "ally": ally
    }
    dump_node_features(directory=dir_corpus_node_features,
                       ds=ds_name,
                       node_features_dict=node_feature_matrices)

    print("[INFO] x.shape=   {},\t y.shape=   {}".format(x.shape, y.shape))
    print("[INFO] tx.shape=  {},\t ty.shape=  {}".format(tx.shape, ty.shape))
    print("[INFO] allx.shape={},\t ally.shape={}".format(
        allx.shape, ally.shape))
    print(
        "[INFO] ========= EXTRACTED NODE FEATURES: x, y, tx, ty, allx, ally. ========="
    )
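Judging from the input and output paths above (cleaned corpus, then shuffled corpus and split indices, then vocabulary and word vectors, then adjacency matrix, then node features), the steps appear intended to run in the order sketched below. This driver is an assumption, not part of the original code; the rare_count and validation_ratio values are illustrative.

def preprocess(ds_name: str, cfg: PreProcessingConfigs):
    """Hypothetical end-to-end driver wiring the steps together in dependency order."""
    clean_data(ds_name, rare_count=5, cfg=cfg)       # corpus -> cleaned corpus
    shuffle_data(ds_name, cfg=cfg)                   # cleaned corpus + meta -> shuffled corpus, split indices
    prepare_words(ds_name, cfg=cfg)                  # shuffled corpus -> vocabulary, word vectors
    build_adjacency(ds_name, cfg=cfg)                # shuffled corpus + vocab -> doc-word adjacency matrix
    build_node_features(ds_name, validation_ratio=0.1,
                        use_predefined_word_vectors=False, cfg=cfg)  # -> x, y, tx, ty, allx, ally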