Example #1
def load_data(w2v_model=None):
    print("laoding data")
    x_text, y = data_helpers.load_data_and_labels(train_data_file)
    max_document_length = max([len(x.split(" ")) for x in x_text])
    if w2v_model is None:
        vocab_processor = learn.preprocessing.VocabularyProcessor(
            max_document_length)
        x = np.array(list(vocab_processor.fit_transform(x_text)))
        vocab_size = len(vocab_processor.vocabulary_)
    else:
        x = data_helpers.get_text_idx(x_text, w2v_model.vocab_hash,
                                      max_document_length)
        vocab_size = len(w2v_model.vocab_hash)
        print('use w2v .bin')

    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    dev_sample_index = -1 * int(dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[
        dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[
        dev_sample_index:]

    return x_train, x_dev, y_train, y_dev, vocab_size
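A minimal usage sketch for the two branches above, assuming the pretrained binary is loaded with the danielfrg word2vec package (whose models expose the vocab_hash dict this code indexes) and that train_data_file and dev_sample_percentage are set at module level:

import word2vec  # assumption: any object with a vocab_hash dict would also work

# Branch 1: build the vocabulary from the training text itself
x_train, x_dev, y_train, y_dev, vocab_size = load_data(None)

# Branch 2: reuse the vocabulary of a pretrained word2vec binary
w2v = word2vec.load('vectors.bin')  # hypothetical path; exposes w2v.vocab_hash
x_train, x_dev, y_train, y_dev, vocab_size = load_data(w2v)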
Example #2
def load_data(w2v_model):
    """Loads starter word-vectors and train/dev/test data."""
    # Load the starter word vectors
    print("Loading data...")
    x_text, y = load_data_and_labels(FLAGS.train_data_file)

    max_document_length = max([len(x.split(" ")) for x in x_text])  # maximum document length
    print('len(x) = ', len(x_text), ' ', len(y))
    print(' max_document_length = ', max_document_length)
    if w2v_model is None:
        vocab_processor = learn.preprocessing.VocabularyProcessor(
            max_document_length)
        x = np.array(list(vocab_processor.fit_transform(x_text)))
        vocab_size = len(vocab_processor.vocabulary_)

        # out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", str(int(time.time()))))
        vocab_processor.save("vocab.txt")
        print('save vocab.txt')
    else:
        x = get_text_idx(x_text, w2v_model.vocab_hash, max_document_length)
        vocab_size = len(w2v_model.vocab_hash)
        print('use w2v .bin')

    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[
        dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[
        dev_sample_index:]

    return x_train, x_dev, y_train, y_dev, vocab_size
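Because this variant persists the vocabulary with vocab_processor.save("vocab.txt"), inference code can map new text into the same id space. A sketch using the same tf.contrib.learn import as above; the sample sentence is made up:

from tensorflow.contrib import learn
import numpy as np

# Restore the vocabulary written during training
vocab_processor = learn.preprocessing.VocabularyProcessor.restore("vocab.txt")

# transform() yields one padded id array per input text
x_new = np.array(list(vocab_processor.transform(["this movie was great"])))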
Example #3
def load_data(w2v_model, max_document_length=1290):
    print("Loading data...")
    x_text, y_test = data_helpers.load_data_and_labels(FLAGS.valid_data_file)
    y_test = np.argmax(y_test, axis=1)

    if max_document_length == 0:
        max_document_length = max([len(x.split(" ")) for x in x_text])
    print('max_document_length = ', max_document_length)

    x = data_helpers.get_text_idx(x_text, w2v_model.vocab_hash,
                                  max_document_length)

    return x, y_test
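data_helpers.get_text_idx is never shown in these examples. A plausible minimal sketch, assuming it maps each token to its vocab_hash index, truncates to max_document_length, and uses 0 for both padding and unknown words (as the comments in the other variants suggest):

import numpy as np

def get_text_idx(texts, vocab_hash, max_document_length):
    # one padded row of word ids per text; 0 doubles as PAD and UNK
    x = np.zeros((len(texts), max_document_length), dtype=np.int64)
    for i, text in enumerate(texts):
        for j, word in enumerate(text.split(" ")[:max_document_length]):
            x[i, j] = vocab_hash.get(word, 0)
    return x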
Example #4
def load_data(w2v_model):
    """Loads starter word-vectors and train/dev/test data."""
    # Load the starter word vectors
    print("Loading data...")
    x_text, y = data_helpers.load_data_and_labels(FLAGS.train_data_file)

    max_document_length = max([len(x.split(" ")) for x in x_text])
    print('len(x) = ', len(x_text), ' ', len(y))
    print(' max_document_length = ', max_document_length)

    x = []
    vocab_size = 0
    if w2v_model is None:
        vocab_processor = learn.preprocessing.VocabularyProcessor(
            max_document_length)
        x = np.array(list(vocab_processor.fit_transform(x_text)))
        vocab_size = len(vocab_processor.vocabulary_)

        # out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", str(int(time.time()))))
        vocab_processor.save("vocab.txt")
        print('save vocab.txt')
    else:
        x = data_helpers.get_text_idx(x_text, w2v_model.vocab_hash,
                                      max_document_length)
        vocab_size = len(w2v_model.vocab_hash)
        print('use w2v .bin')

    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    # fancy indexing: an array of row indices selects whole rows, so
    # x[shuffle_indices] reorders the samples (x[[1, 2]] stacks rows 1 and 2,
    # while x[1, 2] and x[(1, 2)] pick the single element at row 1, column 2)
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[
        dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[
        dev_sample_index:]

    return x_train, x_dev, y_train, y_dev, vocab_size
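The negative dev_sample_index keeps the last dev_sample_percentage of the shuffled samples for validation. A tiny worked example with a hypothetical 10% split:

import numpy as np

y = np.arange(10)                                 # stand-in for 10 samples
dev_sample_index = -1 * int(0.1 * float(len(y)))  # == -1
y_train, y_dev = y[:dev_sample_index], y[dev_sample_index:]
print(len(y_train), len(y_dev))                   # 9 1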
Example #5
def load_data(w2v_model):
    """Loads starter word-vectors and train/dev/test data."""

    # Load the starter word vectors
    print("Loading data...")
    x_text, y = data_helpers.load_data_and_labels(FLAGS.train_data_file)

    max_document_length = max([len(x.split(" ")) for x in x_text])
    print('len(x) = ', len(x_text), ' ', len(y))
    print(' max_document_length = ', max_document_length)

    x = []
    vocab_size = 0
    if w2v_model is None:
        # learn.preprocessing.VocabularyProcessor(max_document_length) builds a
        # vocabulary over the tokenized texts and maps each word to its index;
        # unknown words, and positions past a text's length, are filled with 0.
        # max_document_length is the maximum document length.
        vocab_processor = learn.preprocessing.VocabularyProcessor(
            max_document_length)

        # Learn a vocabulary from x_text and return a matrix of word ids
        x = np.array(list(vocab_processor.fit_transform(x_text)))
        vocab_size = len(vocab_processor.vocabulary_)

        # out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", str(int(time.time()))))
        vocab_processor.save("vocab.txt")
        print('save vocab.txt')
    else:
        x = data_helpers.get_text_idx(x_text, w2v_model.vocab_hash,
                                      max_document_length)
        vocab_size = len(w2v_model.vocab_hash)
        print('use w2v .bin')

    # Shuffle the sample order, then split into train and dev sets
    np.random.seed(10)  # fix the seed so the shuffle is reproducible
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[
        dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[
        dev_sample_index:]

    return x_train, x_dev, y_train, y_dev, vocab_size  # train/dev splits plus vocabulary size
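VocabularyProcessor lives in tf.contrib.learn, which was deprecated and removed in TensorFlow 2.x. A rough equivalent with tf.keras preprocessing (a substitute, not what this code uses; x_text and max_document_length as in the example above):

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(oov_token="<UNK>")
tokenizer.fit_on_texts(x_text)                    # build the word -> id map
sequences = tokenizer.texts_to_sequences(x_text)  # lists of word ids
x = pad_sequences(sequences, maxlen=max_document_length, padding="post")
vocab_size = len(tokenizer.word_index) + 1        # +1 for the padding id 0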
Example #6
def load_data(w2v_model, max_document_length=20):
    """Loads starter word-vectors and train/dev/test data."""
    # Load the starter word vectors
    print("Loading data...")
    x_text, y_test = data_helpers.load_data_and_labels(FLAGS.valid_data_file)
    y_test = np.argmax(y_test, axis=1)

    if max_document_length == 0:
        max_document_length = max([len(x.split(" ")) for x in x_text])

    print('max_document_length = ', max_document_length)

    x = data_helpers.get_text_idx(x_text, w2v_model.vocab_hash,
                                  max_document_length)

    return x, y_test
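Since this loader collapses the one-hot labels with np.argmax, evaluation reduces to comparing predicted class indices against y_test. A sketch with a stand-in prediction step (model inference is hypothetical here):

import numpy as np

x, y_test = load_data(w2v_model)        # w2v_model loaded as in Example #1
predictions = np.zeros_like(y_test)     # stand-in for the model's predictions
accuracy = np.mean(predictions == y_test)
print('accuracy =', accuracy)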
Example #7
def load_data(w2v_model):
    print("Loading data...")
    x_text, y = data_helpers.load_data_and_labels(FLAGS.train_data_file)
    max_document_length = max([len(x.split(" ")) for x in x_text])
    print('len(x) = ', len(x_text), ' ', len(y))
    print(' max_document_length = ', max_document_length)

    x = []
    vocab_size = 0
    if w2v_model is None:
        # random initialization: build the vocabulary from the training text
        vocab_processor = learn.preprocessing.VocabularyProcessor(
            max_document_length)
        x = np.array(list(vocab_processor.fit_transform(x_text)))
        vocab_size = len(vocab_processor.vocabulary_)
        vocab_processor.save("vocab.dat")
        print('save vocab.dat')
    else:
        # use the pretrained offline w2v vocabulary
        x = data_helpers.get_text_idx(x_text, w2v_model.vocab_hash,
                                      max_document_length)
        vocab_size = len(w2v_model.vocab_hash)
        print('use w2v .bin')

    # shuffle
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[
        dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[
        dev_sample_index:]

    return x_train, x_dev, y_train, y_dev, vocab_size
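The returned vocab_size typically sizes the embedding matrix of the TextCNN these loaders feed. A minimal TF 1.x sketch, matching the tf.contrib.learn era of this code; embedding_dim is a hypothetical hyperparameter and the other names come from the loader above:

import tensorflow as tf

embedding_dim = 128  # hypothetical
input_x = tf.placeholder(tf.int32, [None, max_document_length], name="input_x")
W = tf.Variable(tf.random_uniform([vocab_size, embedding_dim], -1.0, 1.0), name="W")
embedded = tf.nn.embedding_lookup(W, input_x)  # shape: [batch, seq_len, embedding_dim]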