Code Example #1
import numpy as np
from tensorflow.contrib import learn  # VocabularyProcessor lives here in TF 1.x
from data import load_data_and_labels  # assumed project helper module


def preprocess(data_dirs, document_length_limit, is_line_as_word,
               dev_sample_percentage):
    x_text, y = load_data_and_labels(data_dirs, document_length_limit,
                                     is_line_as_word)
    # Build the vocabulary, capping document length at the configured limit
    max_document_length = max(len(text.split(" ")) for text in x_text)
    print("max_document_length (raw): {}".format(max_document_length))
    max_document_length = min(document_length_limit, max_document_length)
    print("max_document_length (clamped): {}".format(max_document_length))
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_text)))
    print(x)

    # Random
    np.random.seed(100)
    shuffle_indices = np.random.permutation(np.arange(len(x_text)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/test set
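    # negative index: e.g. 10% of 1,000 samples gives -100, so the last
    # 100 shuffled rows become the dev set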
    dev_sample_index = -1 * int(dev_sample_percentage * len(y))
    x_train, x_dev = (x_shuffled[:dev_sample_index],
                      x_shuffled[dev_sample_index:])
    y_train, y_dev = (y_shuffled[:dev_sample_index],
                      y_shuffled[dev_sample_index:])

    del x, y, x_shuffled, y_shuffled

    print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    return x_train, y_train, vocab_processor, x_dev, y_dev
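
A minimal usage sketch for preprocess (the directory list, length limit, and split fraction below are illustrative values, not from the original project):

x_train, y_train, vocab_processor, x_dev, y_dev = preprocess(
    data_dirs=["data/train"],    # hypothetical corpus location
    document_length_limit=200,   # truncate documents to 200 tokens
    is_line_as_word=False,
    dev_sample_percentage=0.1)   # hold out 10% as the dev set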
Code Example #2
import numpy as np
from tensorflow.contrib import learn  # VocabularyProcessor lives here in TF 1.x
from data import load_data_and_labels  # assumed project helper module


def getVocabulary(data_dirs, document_length_limit, is_line_as_word,
                  dev_sample_percentage):
    x_text, y = load_data_and_labels(data_dirs, document_length_limit,
                                     is_line_as_word)

    # Vocabulary
    max_document_length = max(len(text.split(" ")) for text in x_text)
    print("max_document_length (raw): {}".format(max_document_length))
    max_document_length = min(document_length_limit, max_document_length)
    print("max_document_length (clamped): {}".format(max_document_length))
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        max_document_length)
    # each document flattened into a fixed-length vector of word ids
    x = np.array(list(vocab_processor.fit_transform(x_text)))
    print(x)
    print(y)
    print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))

    return x, y, vocab_processor
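
For reference, VocabularyProcessor pads or truncates every document to max_document_length and maps each token to an integer id, with 0 reserved for padding. A toy illustration of this behavior (the exact ids depend on first-occurrence order):

vp = learn.preprocessing.VocabularyProcessor(max_document_length=4)
docs = ["the cat sat", "the dog"]
print(np.array(list(vp.fit_transform(docs))))
# e.g. [[1 2 3 0]
#       [1 4 0 0]]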
Code Example #3
import random
import numpy as np
import torch

from torch import optim
import torch.nn as nn
from sklearn.metrics import f1_score
import data
import model_gru
import torch.nn.functional as F
from Arg import args

torch.manual_seed(1)  # call the function; assigning to it would overwrite it
random.seed(1)
# Load everything the project needs from the data module; see data for details
train_sentence, valid_sentence, test_sentence, word_dict, word2ix, ix2word, word_matrix = data.load_data_and_labels(
)

args = args()  # instantiate the hyperparameter container (shadows the imported name)


# Define the training procedure; each call runs one pass over all training samples
def trainIters(SiaNetwork,
               train_sentence,
               criterion1,
               batch_size=2000,
               learning_rate=0.005):
    SiaNetwork.train()
    SiaNetwork_optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                             SiaNetwork.parameters()),
                                      lr=learning_rate)
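
The snippet above ends right after the optimizer is built. A typical continuation for one siamese training pass might look like the sketch below; get_batches, the (sent_a, sent_b, label) unpacking, and the loss signature are assumptions for illustration, not the original code:

    # hypothetical loop body, assuming criterion1 is a pairwise loss
    for sent_a, sent_b, label in get_batches(train_sentence, batch_size):
        SiaNetwork_optimizer.zero_grad()
        out_a = SiaNetwork(sent_a)
        out_b = SiaNetwork(sent_b)
        loss = criterion1(out_a, out_b, label)
        loss.backward()
        SiaNetwork_optimizer.step()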
Code Example #4
import numpy as np
import tensorflow as tf
from tensorflow.contrib import learn  # VocabularyProcessor lives here in TF 1.x

import data


def compare_similar_words(n, word, vocab, embeddings_arr):
    """
    n: Number of most similar words to compare.
    word: Word to be compared.
    vocab: The vocabulary dictionary used to extract the index of word.
    embeddings_arr: Array of word embeddings to be compared.
    """
    idx = vocab.get(word)
    for i in range(len(embeddings_arr)):
        print("%d:" % i)
        # relies on the module-level `vocabulary` list built below
        print_most_similar(n, idx, embeddings_arr[i], vocabulary)


# Load data
print("Loading data...")
x_text, y = data.load_data_and_labels("data/rt-polarity.pos", "data/rt-polarity.neg")

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# Extract vocabulary from vocab_processor
vocab_dict = vocab_processor.vocabulary_._mapping
sorted_vocab = sorted(vocab_dict.items(), key=lambda x: x[1])
vocabulary = list(list(zip(*sorted_vocab))[0])  # list of words in vocabulary

# Restore model v1.1
sess = tf.Session()
saver = tf.train.import_meta_graph("runs/v1/1507798871/checkpoints/model-7100.meta")
saver.restore(sess, tf.train.latest_checkpoint("runs/v1/1507798871/checkpoints/."))
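
print_most_similar is not defined in this snippet. As an assumption about what it does, a minimal cosine-similarity version could look like:

def print_most_similar(n, idx, embeddings, vocabulary):
    # hypothetical helper: rank words by cosine similarity to word `idx`
    target = embeddings[idx]
    norms = np.linalg.norm(embeddings, axis=1) * np.linalg.norm(target)
    sims = embeddings @ target / np.maximum(norms, 1e-8)
    for j in np.argsort(-sims)[1:n + 1]:  # index 0 is the word itself
        print("  {} ({:.3f})".format(vocabulary[j], sims[j]))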
Code Example #5
File: baseline.py  Project: cemeiq/RottenTomatoesCNN
import numpy as np

import data

# Parameters
# ==================================================

# Data params
test_sample_percentage = 0.1  # fraction of the data held out for testing
positive_data_file = "data/rt-polarity.pos"
negative_data_file = "data/rt-polarity.neg"

# Data Preparation
# ==================================================

# Load data
x, y = data.load_data_and_labels(positive_data_file, negative_data_file)

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
test_sample_index = -1 * int(test_sample_percentage * float(len(y)))
x_train, x_test = (x_shuffled[:test_sample_index],
                   x_shuffled[test_sample_index:])
y_train, y_test = (y_shuffled[:test_sample_index],
                   y_shuffled[test_sample_index:])

# Transform targets from arrays to labels
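
The snippet is truncated after this comment. Assuming y is one-hot encoded (as load_data_and_labels returns in the companion examples), the transformation would plausibly be:

# hypothetical: collapse one-hot rows like [0, 1] into integer labels
y_train = np.argmax(y_train, axis=1)
y_test = np.argmax(y_test, axis=1)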