Example 1
def load_data(w2v_model):
    """Loads starter word-vectors and train/dev/test data."""
    # Load the starter word vectors
    print("Loading data...")
    x_text, y = load_data_and_labels(FLAGS.train_data_file)

    max_document_length = max([len(x.split(" ")) for x in x_text])  # length of the longest text, in tokens
    print('len(x) = ', len(x_text), ' ', len(y))
    print(' max_document_length = ', max_document_length)
    if (w2v_model is None):
        vocab_processor = learn.preprocessing.VocabularyProcessor(
            max_document_length)
        x = np.array(list(vocab_processor.fit_transform(x_text)))
        vocab_size = len(vocab_processor.vocabulary_)

        # out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", str(int(time.time()))))
        vocab_processor.save("vocab.txt")
        print('save vocab.txt')
    else:
        x = get_text_idx(x_text, w2v_model.vocab_hash, max_document_length)
        vocab_size = len(w2v_model.vocab_hash)
        print('use w2v .bin')

    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[
        dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[
        dev_sample_index:]

    return x_train, x_dev, y_train, y_dev, vocab_size
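Every example below leans on a get_text_idx helper that is never shown. From the way it is called (and from the padded index rows printed in Example 7's debug comments), it maps each space-separated token to its index in vocab_hash and pads or truncates every text to max_document_length. A minimal sketch under those assumptions, with unknown words and padding both mapped to index 0:

import numpy as np

def get_text_idx(text_list, vocab_hash, max_document_length):
    """Sketch of the unshown helper: texts -> fixed-width index matrix."""
    text_array = np.zeros((len(text_list), max_document_length), dtype=np.int64)
    for i, text in enumerate(text_list):
        words = text.split(" ")[:max_document_length]  # truncate long texts
        for j, word in enumerate(words):
            text_array[i, j] = vocab_hash.get(word, 0)  # 0 for unknown words
    return text_array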
Example 2
def load_data(w2v_model=None):
    print("laoding data")
    x_text, y = data_helpers.load_data_and_labels(train_data_file)
    max_document_length = max([len(x.split(" ")) for x in x_text])
    if w2v_model is None:
        vocab_processor = learn.preprocessing.VocabularyProcessor(
            max_document_length)
        x = np.array(list(vocab_processor.fit_transform(x_text)))
        vocab_size = len(vocab_processor.vocabulary_)
    else:
        x = data_helpers.get_text_idx(x_text, w2v_model.vocab_hash,
                                      max_document_length)
        vocab_size = len(w2v_model.vocab_hash)
        print('use w2v .bin')

    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    dev_sample_index = -1 * int(dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[
        dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[
        dev_sample_index:]

    return x_train, x_dev, y_train, y_dev, vocab_size
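load_data_and_labels is not shown either; the callers only assume it returns a list of space-separated texts together with a one-hot label matrix. A hypothetical reader under the assumption of a tab-separated file with the label id first and the tokenized text second; the real file format in these projects may differ:

import numpy as np

def load_data_and_labels(train_data_file, num_classes=2):
    """Hypothetical reader: one 'label<TAB>tokenized text' example per line."""
    x_text, labels = [], []
    with open(train_data_file, encoding='utf-8') as f:
        for line in f:
            label, text = line.rstrip('\n').split('\t', 1)
            labels.append(int(label))
            x_text.append(text)
    y = np.zeros((len(labels), num_classes), dtype=np.float32)
    y[np.arange(len(labels)), labels] = 1.0  # one-hot encode the labels
    return x_text, y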
Example 3
    def deal_data(self, text, max_document_length=10):
        
        words = jieba.cut(text)
        x_text = [' '.join(words)]
        x = data_helpers.get_text_idx(x_text, self.w2v_wr.model.vocab_hash, max_document_length)

        return x
Example 4
    def deal_data(self, text, max_document_length=10):

        words = jieba.cut(text)  # segment the text with jieba
        x_text = [' '.join(words)]  # join the segmented words with spaces
        x = data_helpers.get_text_idx(x_text, self.w2v_wr.model.vocab_hash, max_document_length)
        # uses the get_text_idx function from data_input_helper.py
        return x
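Examples 3 and 4 are the inference-time counterpart: a raw Chinese sentence is segmented with jieba, re-joined with spaces (the same format the training texts use), and turned into one padded index row. A self-contained sketch of that preprocessing, with a toy vocab_hash standing in for the real word2vec vocabulary:

import jieba
import numpy as np

# Toy stand-in for self.w2v_wr.model.vocab_hash (word -> index).
vocab_hash = {"测试": 1, "文本": 2}
max_document_length = 10

words = jieba.cut("这是一条测试文本")      # jieba word segmentation
x_text = ' '.join(words)                   # space-join, matching the training format
row = np.zeros(max_document_length, dtype=np.int64)
for j, word in enumerate(x_text.split(" ")[:max_document_length]):
    row[j] = vocab_hash.get(word, 0)       # unknown words map to 0
print(row)                                 # padded index row, ready for prediction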
Example 5
def load_data(embedding_model, dataset_pickle_path):
    """Loads starter word-vectors and train/dev/test data.
    Input:
        embedding_model: WordEmbeddingModel对象,这里并不特定指定使用某个词向量模型
        dataset_pickle_path: 数据集路径,可以是针对某个subject的数据集
    Output:
        x_train, y_train, x_dev, y_dev: 训练集样本以及标签,验证集样本以及标签
        vocab_size: 词典大小
    """
    import pickle
    # Load the starter word vectors
    print("Loading data...")
    #x_text, y = data_helpers.load_data_and_labels(FLAGS.train_data_file)

    # Currently only the subject label information is read in
    #content_id_list, x_text, y, _ = pickle.load(open("../pickles/preprocessed-train-dataset.pickle"))
    content_id_list, x_text, y = pickle.load(open(dataset_pickle_path, 'rb'))

    # maximum number of words in any sentence
    max_document_length = max([len(x.split(" ")) for x in x_text])
    print('len(x) = ', len(x_text), ' ', len(y))
    print(' max_document_length = ', max_document_length)

    x = []
    vocab_size = 0
    if embedding_model is None:
        vocab_processor = learn.preprocessing.VocabularyProcessor(
            max_document_length)
        x = np.array(list(vocab_processor.fit_transform(x_text)))
        vocab_size = len(vocab_processor.vocabulary_)

        # out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", str(int(time.time()))))
        vocab_processor.save("vocab.txt")
        print('save vocab.txt')
    else:
        # embedding_model.vocab_hash is effectively a dict (word -> index)
        print('Using word embeddings!')
        x = data_helpers.get_text_idx(x_text, embedding_model.vocab_hash,
                                      max_document_length)
        vocab_size = len(embedding_model.vocab_hash)

    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[
        dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[
        dev_sample_index:]
    return x_train, x_dev, y_train, y_dev, vocab_size
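Example 5 replaces the plain-text reader with a pre-processed pickle. From the unpacking on the load call, the pickle is assumed to hold a 3-tuple of parallel sequences: content ids, space-separated texts, and one-hot label rows. A hypothetical snippet that writes a pickle in that layout, useful for trying the function out:

import pickle
import numpy as np

# Hypothetical dataset in the assumed (content_id_list, x_text, y) layout.
content_id_list = [101, 102]
x_text = ["这 是 正面 评论", "这 是 负面 评论"]       # already tokenized, space-separated
y = np.array([[1, 0], [0, 1]], dtype=np.float32)      # one-hot labels

with open("toy-train-dataset.pickle", "wb") as f:
    pickle.dump((content_id_list, x_text, y), f)
# load_data(embedding_model, "toy-train-dataset.pickle") would then shuffle
# these rows and split off the dev portion.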
Example 6
def load_data(w2v_model, max_document_length=1290):
    print("Loading data...")
    x_text, y_test = data_helpers.load_data_and_labels(FLAGS.valid_data_file)
    y_test = np.argmax(y_test, axis=1)

    if (max_document_length == 0):
        max_document_length = max([len(x.split(" ")) for x in x_text])
    print('max_document_length = ', max_document_length)

    x = data_helpers.get_text_idx(x_text, w2v_model.vocab_hash,
                                  max_document_length)

    return x, y_test
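Example 6 (and the near-identical Example 9 further down) is the evaluation-time loader: max_document_length is fixed to whatever length the network was trained with, and np.argmax collapses the one-hot labels back to class ids so they can be compared directly with predicted classes:

import numpy as np

y_onehot = np.array([[0, 1], [1, 0], [0, 1]], dtype=np.float32)
y_test = np.argmax(y_onehot, axis=1)   # array([1, 0, 1]): class ids for accuracy checks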
Example 7
def load_data(w2v_model):
    """Loads starter word-vectors and train/dev/test data."""
    # Load the starter word vectors
    print("Loading data...")
    x_text, y = data_helpers.load_data_and_labels(FLAGS.train_data_file)
    # for x in x_text:
    #     l = len(x.split(" "))
    #     break

    max_document_length = max([len(x.split(" ")) for x in x_text])
    print('len(x) = ', len(x_text), ' ', len(y))
    print(' max_document_length = ', max_document_length)

    x = []
    vocab_size = 0
    if (w2v_model is None):
        vocab_processor = learn.preprocessing.VocabularyProcessor(
            max_document_length)
        x = np.array(list(vocab_processor.fit_transform(x_text)))
        vocab_size = len(vocab_processor.vocabulary_)

        # out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", str(int(time.time()))))
        vocab_processor.save("vocab.txt")
        print('save vocab.txt')
    else:
        x = data_helpers.get_text_idx(x_text, w2v_model.vocab_hash,
                                      max_document_length)
        vocab_size = len(w2v_model.vocab_hash)
        print('use w2v .bin')

    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    # print(type(shuffle_indices))  # <class 'numpy.ndarray'>
    # print(type(x))  # <class 'numpy.ndarray'>
    # print(x[1])  # [7942  181  949 ...    0    0    0]
    # print(x[2])  # [7942  174    5 ...    0    0    0]
    # print(x[1, 2])  # 949
    # print(x[[1, 2]])  # [[7942  181  949 ...    0    0    0],[7942  174    5 ...    0    0    0]]
    # print(x[(1, 2)])  # 949
    x_shuffled = x[shuffle_indices]
    # print(x_shuffled)
    # exit()
    y_shuffled = y[shuffle_indices]

    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[
        dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[
        dev_sample_index:]

    return x_train, x_dev, y_train, y_dev, vocab_size
Example 8
def load_data(w2v_model):
    """Loads starter word-vectors and train/dev/test data."""

    # Load the starter word vectors
    print("Loading data...")
    x_text, y = data_helpers.load_data_and_labels(FLAGS.train_data_file)

    max_document_length = max([len(x.split(" ")) for x in x_text])
    print('len(x) = ', len(x_text), ' ', len(y))
    print(' max_document_length = ', max_document_length)

    x = []
    vocab_size = 0
    if (w2v_model is None):
        # learn.preprocessing.VocabularyProcessor(max_document_length)
        # builds a vocabulary from the already-tokenized texts and maps each word
        # to its index in it; positions past the end of a text, or words that are
        # not in the vocabulary, are filled with 0
        # max_document_length: maximum document length
        vocab_processor = learn.preprocessing.VocabularyProcessor(
            max_document_length)

        # Learn a vocabulary from x_text and return a matrix of word ids
        x = np.array(list(vocab_processor.fit_transform(x_text)))
        vocab_size = len(vocab_processor.vocabulary_)

        # out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", str(int(time.time()))))
        vocab_processor.save("vocab.txt")
        print('save vocab.txt')
    else:
        x = data_helpers.get_text_idx(x_text, w2v_model.vocab_hash,
                                      max_document_length)
        vocab_size = len(w2v_model.vocab_hash)
        print('use w2v .bin')

    # Index handling:
    # obtain the training and dev sets
    np.random.seed(10)  # set a random seed
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[
        dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[
        dev_sample_index:]

    return x_train, x_dev, y_train, y_dev, vocab_size  # return the training and dev sets, plus the vocabulary size
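The counterpart of vocab_processor.save("vocab.txt") is restoring the same mapping at evaluation time, so new text is indexed exactly as the training data was. A minimal sketch using the same tf.contrib.learn API these examples import (deprecated in later TensorFlow 1.x releases and removed in 2.x):

import numpy as np
from tensorflow.contrib import learn

# Restore the vocabulary saved by load_data when no word2vec model was given.
vocab_processor = learn.preprocessing.VocabularyProcessor.restore("vocab.txt")

# transform() (unlike fit_transform) only looks words up; unseen words map to 0
# and each row is padded/truncated to the processor's max_document_length.
x_eval = np.array(list(vocab_processor.transform(["some tokenized text"])))
print(x_eval.shape)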
Example 9
def load_data(w2v_model, max_document_length=20):
    """Loads starter word-vectors and train/dev/test data."""
    # Load the starter word vectors
    print("Loading data...")
    x_text, y_test = data_helpers.load_data_and_labels(FLAGS.valid_data_file)
    y_test = np.argmax(y_test, axis=1)

    if max_document_length == 0:
        max_document_length = max([len(x.split(" ")) for x in x_text])

    print('max_document_length = ', max_document_length)

    x = data_helpers.get_text_idx(x_text, w2v_model.vocab_hash,
                                  max_document_length)

    return x, y_test
Example 10
def load_data(w2v_model):
    print("Loading data...")
    x_text, y = data_helpers.load_data_and_labels(FLAGS.train_data_file)
    max_document_length = max([len(x.split(" ")) for x in x_text])
    print('len(x) = ', len(x_text), ' ', len(y))
    print(' max_document_length = ', max_document_length)

    x = []
    vocab_size = 0
    if (w2v_model is None):
        # random initialization
        vocab_processor = learn.preprocessing.VocabularyProcessor(
            max_document_length)
        x = np.array(list(vocab_processor.fit_transform(x_text)))
        vocab_size = len(vocab_processor.vocabulary_)
        vocab_processor.save("vocab.dat")
        print('save vocab.dat')
    else:
        # load the offline word2vec model
        x = data_helpers.get_text_idx(x_text, w2v_model.vocab_hash,
                                      max_document_length)
        vocab_size = len(w2v_model.vocab_hash)
        print('use w2v .bin')

    # shuffle
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[
        dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[
        dev_sample_index:]

    return x_train, x_dev, y_train, y_dev, vocab_size
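In every variant the dev set is carved off the end of the shuffled data with a negative index: with 1000 labelled examples and dev_sample_percentage = 0.1, dev_sample_index is -100, so the last 100 shuffled rows become the dev set. The split in isolation:

import numpy as np

dev_sample_percentage = 0.1     # same role as the FLAGS value above
y = np.arange(1000)             # stand-in for 1000 shuffled labels

dev_sample_index = -1 * int(dev_sample_percentage * float(len(y)))
train, dev = y[:dev_sample_index], y[dev_sample_index:]
print(dev_sample_index, len(train), len(dev))   # -100 900 100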