Code Example #1
def clean_data(ds_name: str, rare_count: int, cfg: PreProcessingConfigs):
    corpus_path = cfg.corpus_dir + ds_name + cfg.data_set_extension
    ds_corpus_cleaned = cfg.corpus_cleaned_dir + ds_name + cfg.data_set_extension

    # Checkers
    check_data_set(data_set_name=ds_name, all_data_set_names=cfg.data_sets)
    check_paths(corpus_path)

    create_dir(dir_path=cfg.corpus_cleaned_dir, overwrite=False)
    docs_of_words = [
        clean_str(line.strip().decode('latin1')).split()
        for line in open(corpus_path, 'rb')
    ]
    word_counts = extract_word_counts(docs_of_words=docs_of_words)
    stop_words = retrieve_stop_words(language='english')
    if ds_name != 'mr':  # If the data-set is 'mr', keep stop-words and rare words. TODO: find out why.
        docs_of_words = remove_stop_words(docs_of_words, stop_words=stop_words)
        docs_of_words = remove_rare_words(docs_of_words,
                                          word_counts=word_counts,
                                          rare_count=rare_count)
    docs_of_words = glue_lines(lines_of_words=docs_of_words,
                               glue_str=' ',
                               with_strip=True)

    write_iterable_to_file(an_iterable=docs_of_words,
                           file_path=ds_corpus_cleaned,
                           file_mode='w')
    print("[INFO] Cleaned-Corpus Dir='{}'".format(cfg.corpus_cleaned_dir))
    print("[INFO] Rare-Count=<{}>".format(rare_count))
    print(
        "[INFO] ========= CLEANED DATA: Removed rare & stop-words. =========")
Code Example #2
def build_adjacency(ds_name: str, cfg: PreProcessingConfigs):
    """Build Adjacency Matrix of Doc-Word Heterogeneous Graph"""

    # input files
    ds_corpus = cfg.corpus_shuffled_dir + ds_name + ".txt"
    ds_corpus_vocabulary = cfg.corpus_shuffled_vocab_dir + ds_name + '.vocab'
    ds_corpus_train_idx = cfg.corpus_shuffled_split_index_dir + ds_name + '.train'
    ds_corpus_test_idx = cfg.corpus_shuffled_split_index_dir + ds_name + '.test'

    # checkers
    check_data_set(data_set_name=ds_name, all_data_set_names=cfg.data_sets)
    check_paths(ds_corpus, ds_corpus_vocabulary, ds_corpus_train_idx,
                ds_corpus_test_idx)

    create_dir(dir_path=cfg.corpus_shuffled_adjacency_dir, overwrite=False)

    docs_of_words = [line.split() for line in open(file=ds_corpus)]
    vocab = open(ds_corpus_vocabulary).read().splitlines()    # Extract vocabulary.
    word_to_id = {word: i for i, word in enumerate(vocab)}    # Map each word to its id.
    train_size = len(open(ds_corpus_train_idx).readlines())   # Real train-size, not adjusted.
    test_size = len(open(ds_corpus_test_idx).readlines())     # Real test-size.

    windows_of_words = extract_windows(docs_of_words=docs_of_words,
                                       window_size=20)

    # Extract word-word weights
    rows, cols, weights = extract_pmi_word_weights(windows_of_words,
                                                   word_to_id, vocab,
                                                   train_size)
    # As an alternative, use cosine similarity of word vectors as weights:
    #   ds_corpus_word_vectors = cfg.CORPUS_WORD_VECTORS_DIR + ds_name + '.word_vectors'
    #   rows, cols, weights = extract_cosine_similarity_word_weights(vocab, train_size, ds_corpus_word_vectors)

    # Extract word-doc weights
    rows, cols, weights = extract_tf_idf_doc_word_weights(
        rows, cols, weights, vocab, train_size, docs_of_words, word_to_id)

    adjacency_len = train_size + len(vocab) + test_size
    adjacency_matrix = csr_matrix((weights, (rows, cols)),
                                  shape=(adjacency_len, adjacency_len))

    # Dump Adjacency Matrix
    with open(cfg.corpus_shuffled_adjacency_dir + "/ind.{}.adj".format(ds_name), 'wb') as f:
        pickle.dump(adjacency_matrix, f)

    print("[INFO] Adjacency Dir='{}'".format(
        cfg.corpus_shuffled_adjacency_dir))
    print(
        "[INFO] ========= EXTRACTED ADJACENCY MATRIX: Heterogenous doc-word adjacency matrix. ========="
    )
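The resulting matrix is square over all graph nodes, with documents and words sharing one index space: indices 0 .. train_size-1 are training documents, the next len(vocab) indices are words, and the last test_size indices are test documents. A toy sketch of assembling such a matrix with SciPy (hand-picked indices and weights, not the PMI/TF-IDF values computed above):

from scipy.sparse import csr_matrix

train_size, vocab_size, test_size = 2, 3, 1
n = train_size + vocab_size + test_size   # node order: train docs | words | test docs

# (row, col, weight) triples, e.g. doc->word TF-IDF edges and word->word PMI edges.
rows    = [0, 1, 5, 2, 3]
cols    = [2, 4, 2, 3, 2]
weights = [0.7, 0.9, 0.5, 1.2, 1.2]

adjacency = csr_matrix((weights, (rows, cols)), shape=(n, n))
print(adjacency.toarray())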
Code Example #3
def shuffle_data(ds_name: str, cfg: PreProcessingConfigs):
    ds_corpus = cfg.corpus_cleaned_dir + ds_name + cfg.data_set_extension
    ds_corpus_meta = cfg.corpus_meta_dir + ds_name + '.meta'

    ds_corpus_shuffled = cfg.corpus_shuffled_dir + ds_name + cfg.data_set_extension
    ds_corpus_shuffled_train_idx = cfg.corpus_shuffled_split_index_dir + ds_name + '.train'
    ds_corpus_shuffled_test_idx = cfg.corpus_shuffled_split_index_dir + ds_name + '.test'
    ds_corpus_shuffled_meta = cfg.corpus_shuffled_meta_dir + ds_name + '.meta'

    # Checkers
    check_data_set(data_set_name=ds_name, all_data_set_names=cfg.data_sets)
    check_paths(ds_corpus_meta, ds_corpus)

    # Create dirs if not exist
    create_dir(cfg.corpus_shuffled_dir, overwrite=False)
    create_dir(cfg.corpus_shuffled_meta_dir, overwrite=False)
    create_dir(cfg.corpus_shuffled_split_index_dir, overwrite=False)

    all_doc_meta_list, train_doc_meta_list, test_doc_meta_list = load_corpus_meta(corpus_meta_path=ds_corpus_meta)
    cleaned_doc_lines = [line.strip() for line in open(ds_corpus, 'r')]

    # Shuffle train ids and write to file
    train_doc_meta_ids = [all_doc_meta_list.index(train_doc_meta) for train_doc_meta in train_doc_meta_list]
    random.shuffle(train_doc_meta_ids)
    write_iterable_to_file(an_iterable=train_doc_meta_ids, file_path=ds_corpus_shuffled_train_idx, file_mode='w')

    # Shuffle test ids and write to file
    test_doc_meta_ids = [all_doc_meta_list.index(test_doc_meta) for test_doc_meta in test_doc_meta_list]
    random.shuffle(test_doc_meta_ids)
    write_iterable_to_file(an_iterable=test_doc_meta_ids, file_path=ds_corpus_shuffled_test_idx, file_mode='w')

    all_doc_meta_ids = train_doc_meta_ids + test_doc_meta_ids
    # Write shuffled meta to file
    shuffled_doc_meta_list = [all_doc_meta_list[all_doc_meta_id] for all_doc_meta_id in all_doc_meta_ids]
    write_iterable_to_file(an_iterable=shuffled_doc_meta_list, file_path=ds_corpus_shuffled_meta, file_mode='w')

    # Write shuffled document files to file
    shuffled_doc_lines = [cleaned_doc_lines[all_doc_meta_id] for all_doc_meta_id in all_doc_meta_ids]
    write_iterable_to_file(an_iterable=shuffled_doc_lines, file_path=ds_corpus_shuffled, file_mode='w')

    print("[INFO] Shuffled-Corpus Dir='{}'".format(cfg.corpus_shuffled_dir))
    print("[INFO] ========= SHUFFLED DATA: Corpus documents shuffled. =========")
Code Example #4
def prepare_words(ds_name: str, cfg: PreProcessingConfigs):
    ds_corpus = cfg.corpus_shuffled_dir + ds_name + cfg.data_set_extension

    # Checkers
    check_data_set(data_set_name=ds_name, all_data_set_names=cfg.data_sets)
    check_paths(ds_corpus)

    # Create output directories
    create_dir(dir_path=cfg.corpus_shuffled_vocab_dir, overwrite=False)
    create_dir(dir_path=cfg.corpus_shuffled_word_vectors_dir, overwrite=False)

    ds_corpus_vocabulary = cfg.corpus_shuffled_vocab_dir + ds_name + '.vocab'
    ds_corpus_word_vectors = cfg.corpus_shuffled_word_vectors_dir + ds_name + '.word_vectors'
    # ###################################################

    # Build vocabulary
    docs_of_words_generator = (line.split() for line in open(ds_corpus))
    vocabulary = extract_vocabulary(docs_of_words=docs_of_words_generator)
    write_iterable_to_file(an_iterable=vocabulary,
                           file_path=ds_corpus_vocabulary,
                           file_mode='w')

    # Extract word definitions
    word_definitions = extract_word_definitions(vocabulary=vocabulary)
    # write_iterable_to_file(word_definitions, file_path='/<>' + ds, file_mode='w+')

    # Extract & Dump word vectors
    word_vectors = extract_tf_idf_word_vectors(
        word_definitions=word_definitions, max_features=1000)
    word_to_word_vectors_dict = OrderedDict(
        (word, vec.tolist()) for word, vec in zip(vocabulary, word_vectors))
    pickle.dump(obj=word_to_word_vectors_dict,
                file=open(ds_corpus_word_vectors, mode='wb'))

    print("[INFO] Vocabulary Dir='{}'".format(cfg.corpus_shuffled_vocab_dir))
    print("[INFO] Word-Vector Dir='{}'".format(
        cfg.corpus_shuffled_word_vectors_dir))
    print(
        "[INFO] ========= PREPARED WORDS: Vocabulary & word-vectors extracted. ========="
    )
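extract_word_definitions and extract_tf_idf_word_vectors are not shown here; presumably each vocabulary word is mapped to a definition string which is then TF-IDF vectorized. A hedged sketch of that idea with scikit-learn, using made-up definitions (the real helpers may behave differently):

from collections import OrderedDict
from sklearn.feature_extraction.text import TfidfVectorizer

vocabulary = ["bank", "river", "loan"]
word_definitions = ["a financial institution that accepts deposits",
                    "a large natural stream of water",
                    "a sum of money lent at interest"]   # stand-ins for extract_word_definitions

vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = vectorizer.fit_transform(word_definitions).toarray()

word_to_word_vectors = OrderedDict(
    (word, vec.tolist()) for word, vec in zip(vocabulary, tfidf_matrix))
print(len(word_to_word_vectors["bank"]))   # vector length = number of retained TF-IDF features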
Code Example #5
def build_node_features(ds_name: str, validation_ratio: float,
                        use_predefined_word_vectors: bool,
                        cfg: PreProcessingConfigs):
    # input files for building node features
    ds_corpus = cfg.corpus_shuffled_dir + ds_name + '.txt'
    ds_corpus_meta = cfg.corpus_shuffled_meta_dir + ds_name + '.meta'
    ds_corpus_vocabulary = cfg.corpus_shuffled_vocab_dir + ds_name + '.vocab'
    ds_corpus_train_idx = cfg.corpus_shuffled_split_index_dir + ds_name + '.train'
    ds_corpus_test_idx = cfg.corpus_shuffled_split_index_dir + ds_name + '.test'

    # output directory of node features
    dir_corpus_node_features = cfg.corpus_shuffled_node_features_dir + "/" + ds_name

    # checkers
    check_data_set(data_set_name=ds_name, all_data_set_names=cfg.data_sets)
    check_paths(ds_corpus, ds_corpus_meta, ds_corpus_vocabulary)
    check_paths(ds_corpus_train_idx, ds_corpus_test_idx)

    # Create output directory of node features
    create_dir(dir_path=dir_corpus_node_features, overwrite=False)

    # Adjust train size for different training rates, e.g. use 90% of the training set and hold out the rest for validation.
    real_train_size = len(open(ds_corpus_train_idx).readlines())
    adjusted_train_size = ceil(real_train_size * (1.0 - validation_ratio))
    test_size = len(open(ds_corpus_test_idx).readlines())

    # Extract word_vectors and word_embedding_dimension
    if use_predefined_word_vectors:
        ds_corpus_word_vectors = cfg.corpus_shuffled_word_vectors_dir + ds_name + '.word_vectors'
        # ds_corpus_word_vectors = 'glove.6B.300d.txt'  # Alternatively, use pre-trained GloVe embeddings.
        word_vectors, word_emb_dim = load_word_to_word_vectors(path=ds_corpus_word_vectors)
    else:
        word_vectors, word_emb_dim = OrderedDict(), 300  # todo: parametrize

    vocabulary = open(ds_corpus_vocabulary).read().splitlines()              # Extract vocabulary.
    doc_meta_list = open(file=ds_corpus_meta, mode='r').read().splitlines()  # Extract meta list.
    doc_labels = extract_doc_labels(ds_corpus_meta_file=ds_corpus_meta)      # Extract document labels.

    docs_of_words = [line.split() for line in open(file=ds_corpus)]          # Extract documents of words.
    # for i,words in enumerate(docs_of_words):
    #   if words == []:
    #     if doc_meta_list[i].split('\t')[-1] == 'ham':
    #       words.extend(['MEETING','TOMORROW'])
    #     else:
    #       words.extend(['WIN','LOTTERY'])

    # Extract mean document word vectors and one hot labels of train-set
    x = compute_x(docs_of_words,
                  adjusted_train_size,
                  word_emb_dim,
                  w_vectors=word_vectors)
    y = compute_y(doc_meta_list,
                  train_size=adjusted_train_size,
                  doc_labels=doc_labels)

    # Extract mean document word vectors and one hot labels of test-set
    tx = compute_tx(docs_of_words,
                    test_size,
                    real_train_size,
                    word_emb_dim,
                    w_vectors=word_vectors)
    ty = compute_ty(doc_meta_list,
                    test_size=test_size,
                    real_train_size=real_train_size,
                    doc_labels=doc_labels)

    # Extract doc_features + word_features
    allx = compute_allx(docs_of_words,
                        real_train_size,
                        vocabulary,
                        word_vectors,
                        emb_dim=word_emb_dim)
    ally = compute_ally(doc_meta_list,
                        real_train_size,
                        doc_labels,
                        vocab_size=len(vocabulary))

    # Dump node features matrices to files
    node_feature_matrices = {
        "x": x,
        "y": y,
        "tx": tx,
        "ty": ty,
        "allx": allx,
        "ally": ally
    }
    dump_node_features(directory=dir_corpus_node_features,
                       ds=ds_name,
                       node_features_dict=node_feature_matrices)

    print("[INFO] x.shape=   {},\t y.shape=   {}".format(x.shape, y.shape))
    print("[INFO] tx.shape=  {},\t ty.shape=  {}".format(tx.shape, ty.shape))
    print("[INFO] allx.shape={},\t ally.shape={}".format(
        allx.shape, ally.shape))
    print(
        "[INFO] ========= EXTRACTED NODE FEATURES: x, y, tx, ty, allx, ally. ========="
    )
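Taken together, examples #1 through #5 form one preprocessing pipeline. A hypothetical driver that runs them in order might look like the sketch below; the PreProcessingConfigs constructor, the data-set name 'R8', rare_count=5 and validation_ratio=0.1 are all placeholder assumptions:

# Hypothetical driver script; how PreProcessingConfigs is constructed depends on the
# repository's configuration module, so treat the first two lines as placeholders.
cfg = PreProcessingConfigs()   # assumed to populate corpus_dir, corpus_cleaned_dir, etc.
ds_name = 'R8'                 # must appear in cfg.data_sets

clean_data(ds_name, rare_count=5, cfg=cfg)                 # example #1
shuffle_data(ds_name, cfg=cfg)                             # example #3
prepare_words(ds_name, cfg=cfg)                            # example #4
build_adjacency(ds_name, cfg=cfg)                          # example #2
build_node_features(ds_name, validation_ratio=0.1,
                    use_predefined_word_vectors=False, cfg=cfg)  # example #5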
Code Example #6
def train_model(ds_name: str, is_featureless: bool, cfg: TrainingConfigs):
    configure_cuda()
    check_data_set(data_set_name=ds_name, all_data_set_names=cfg.data_sets)
    set_seeds(seed=2019)

    # Load corpus & unpack values
    corpus_values = load_corpus(ds_name, cfg.corpus_split_index_dir,
                                cfg.corpus_node_features_dir,
                                cfg.corpus_adjacency_dir)
    adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, train_size, test_size = corpus_values

    if is_featureless:
        features = sp.identity(features.shape[0])

    features, support, num_supports, model_func = prepare_matrices(
        features, adj, cfg.model, cfg.chebyshev_max_degree)

    # Define placeholders
    t_features = torch.from_numpy(features)
    t_y_train = torch.from_numpy(y_train)
    t_y_val = torch.from_numpy(y_val)
    t_y_test = torch.from_numpy(y_test)
    t_train_mask = torch.from_numpy(train_mask.astype(np.float32))
    tm_train_mask = torch.transpose(torch.unsqueeze(t_train_mask, 0), 1,
                                    0).repeat(1, y_train.shape[1])

    t_support = []
    for i in range(len(support)):
        # noinspection PyArgumentList
        t_support.append(torch.Tensor(support[i]))

    # if torch.cuda.is_available():
    #     model_func = model_func.cuda()
    #     t_features = t_features.cuda()
    #     t_y_train = t_y_train.cuda()
    #     t_y_val = t_y_val.cuda()
    #     t_y_test = t_y_test.cuda()
    #     t_train_mask = t_train_mask.cuda()
    #     tm_train_mask = tm_train_mask.cuda()
    #     for i in range(len(support)):
    #         t_support = [t.cuda() for t in t_support if True]

    model = model_func(input_dim=features.shape[0],
                       support=t_support,
                       num_classes=y_train.shape[1])

    # Loss and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=cfg.learning_rate)

    val_losses = []

    # Train model
    for epoch in range(cfg.epochs):
        epoch_start_time = time.time()

        # Forward pass
        logits = model(t_features)
        loss = criterion(logits * tm_train_mask, torch.max(t_y_train, 1)[1])
        acc = (
            (torch.max(logits, 1)[1] == torch.max(t_y_train, 1)[1]).float() *
            t_train_mask).sum().item() / t_train_mask.sum().item()

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Validation
        val_loss, val_acc, pred, labels, duration = evaluate_model(
            model, criterion, t_features, t_y_val, val_mask)
        val_losses.append(val_loss)

        print_log(
            "Epoch:{:.0f}, train_loss={:.5f}, train_acc={:.5f}, val_loss={:.5f}, val_acc={:.5f}, time={:.5f}"
            .format(epoch + 1, loss, acc, val_loss, val_acc,
                    time.time() - epoch_start_time))

        if epoch > cfg.early_stopping and val_losses[-1] > np.mean(
                val_losses[-(cfg.early_stopping + 1):-1]):
            print_log("Early stopping...")
            break

    print_log("Optimization Finished!")

    # Testing
    test_loss, test_acc, pred, labels, test_duration = evaluate_model(
        model, criterion, t_features, t_y_test, test_mask)
    print_log(
        "Test set results: \n\t loss= {:.5f}, accuracy= {:.5f}, time= {:.5f}".
        format(test_loss, test_acc, test_duration))

    test_pred = []
    test_labels = []
    for i in range(len(test_mask)):
        if test_mask[i]:
            test_pred.append(pred[i])
            test_labels.append(np.argmax(labels[i]))

    print_log("Test Precision, Recall and F1-Score...")
    print_log(metrics.classification_report(test_labels, test_pred, digits=4))
    print_log("Macro average Test Precision, Recall and F1-Score...")
    print_log(
        metrics.precision_recall_fscore_support(test_labels,
                                                test_pred,
                                                average='macro'))
    print_log("Micro average Test Precision, Recall and F1-Score...")
    print_log(
        metrics.precision_recall_fscore_support(test_labels,
                                                test_pred,
                                                average='micro'))

    # doc and word embeddings
    tmp = model.layer1.embedding.numpy()
    word_embeddings = tmp[train_size:adj.shape[0] - test_size]
    train_doc_embeddings = tmp[:train_size]  # include val docs
    test_doc_embeddings = tmp[adj.shape[0] - test_size:]

    print_log('Embeddings:')
    print_log('Word_embeddings: ' + str(len(word_embeddings)))
    print_log('Train_doc_embeddings: ' + str(len(train_doc_embeddings)))
    print_log('Test_doc_embeddings: ' + str(len(test_doc_embeddings)))
    print_log('Word_embeddings:')
    print(word_embeddings)

    # Create word vectors and write them to file  # todo: commented-out
    """
    with open(cfg.corpus_vocab_dir + ds_name + '.vocab', 'r') as f:
        words = f.readlines()
        
    vocab_size = len(words)
    word_vectors = []
    for i in range(vocab_size):
        word = words[i].strip()
        word_vector = word_embeddings[i]
        word_vector_str = ' '.join([str(x) for x in word_vector])
        word_vectors.append(word + ' ' + word_vector_str)

    word_embeddings_str = '\n'.join(word_vectors) 
    with open('./data/' + ds_name + '_word_vectors.txt', 'w') as f:
        f.write(word_embeddings_str)
    """

    # Create doc vectors and write them to file  # todo: commented-out
    """