def load_data(w2v_model=None):
    """Load training texts/labels and split them into train/dev sets.

    Args:
        w2v_model: optional pre-trained word2vec model exposing
            ``vocab_hash``; when None, a vocabulary is learned from the
            training text instead.

    Returns:
        Tuple ``(x_train, x_dev, y_train, y_dev, vocab_size)`` where the
        ``x`` arrays hold word-index rows padded to the longest document.
    """
    print("loading data")  # fixed typo: was "laoding data"
    x_text, y = data_helpers.load_data_and_labels(train_data_file)
    # Every example is padded/truncated to the longest whitespace-tokenized
    # text in the corpus.
    max_document_length = max(len(x.split(" ")) for x in x_text)
    if w2v_model is None:  # idiomatic None check (was `== None`)
        # Learn a vocabulary from the corpus and map words to integer ids.
        vocab_processor = learn.preprocessing.VocabularyProcessor(
            max_document_length)
        x = np.array(list(vocab_processor.fit_transform(x_text)))
        vocab_size = len(vocab_processor.vocabulary_)
    else:
        # Reuse the pre-trained word2vec vocabulary for the id mapping.
        x = data_helpers.get_text_idx(x_text, w2v_model.vocab_hash,
                                      max_document_length)
        vocab_size = len(w2v_model.vocab_hash)
        print('use w2v .bin')

    # Deterministic shuffle (fixed seed) so train/dev splits are
    # reproducible across runs.
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]
    # Negative index: the last dev_sample_percentage of examples become dev.
    dev_sample_index = -1 * int(dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[
        dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[
        dev_sample_index:]
    return x_train, x_dev, y_train, y_dev, vocab_size
def load_data(w2v_model):
    """Loads starter word-vectors and train/dev/test data."""
    # Load the starter word vectors.
    print("Loading data...")
    x_text, y = load_data_and_labels(FLAGS.train_data_file)
    # Longest text in the corpus, measured in whitespace-separated tokens.
    max_document_length = max(len(sample.split(" ")) for sample in x_text)
    print('len(x) = ', len(x_text), ' ', len(y))
    print(' max_document_length = ', max_document_length)
    if w2v_model is None:
        # No pre-trained vectors: learn a vocabulary from the corpus,
        # turning each text into a fixed-length row of word ids.
        processor = learn.preprocessing.VocabularyProcessor(
            max_document_length)
        x = np.array(list(processor.fit_transform(x_text)))
        vocab_size = len(processor.vocabulary_)
        # Persist the learned vocabulary for later inference runs.
        processor.save("vocab.txt")
        print('save vocab.txt')
    else:
        # Map words through the pre-trained word2vec vocabulary.
        x = get_text_idx(x_text, w2v_model.vocab_hash, max_document_length)
        vocab_size = len(w2v_model.vocab_hash)
        print('use w2v .bin')

    # Deterministic shuffle before carving out the dev split.
    np.random.seed(10)
    order = np.random.permutation(np.arange(len(y)))
    x_shuffled, y_shuffled = x[order], y[order]
    # Last dev_sample_percentage of the shuffled data becomes the dev set.
    split_at = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:split_at], x_shuffled[split_at:]
    y_train, y_dev = y_shuffled[:split_at], y_shuffled[split_at:]
    return x_train, x_dev, y_train, y_dev, vocab_size
def load_data(w2v_model, max_document_length=1290):
    """Load validation texts and labels as word-index matrices.

    Returns ``(x, y_test)`` where ``x`` holds word ids from the w2v
    vocabulary and ``y_test`` holds integer class ids.
    """
    print("Loading data...")
    x_text, y_test = data_helpers.load_data_and_labels(FLAGS.valid_data_file)
    # Collapse one-hot label rows into integer class ids.
    y_test = np.argmax(y_test, axis=1)
    if max_document_length == 0:
        # Caller asked us to infer the padding length from the corpus.
        max_document_length = max(len(text.split(" ")) for text in x_text)
    print('max_document_length = ', max_document_length)
    x = data_helpers.get_text_idx(x_text, w2v_model.vocab_hash,
                                  max_document_length)
    return x, y_test
def load_data(w2v_model):
    """Loads starter word-vectors and train/dev/test data.

    Args:
        w2v_model: optional pre-trained word2vec model exposing
            ``vocab_hash``; when None, a vocabulary is learned from the
            training text and saved to ``vocab.txt``.

    Returns:
        Tuple ``(x_train, x_dev, y_train, y_dev, vocab_size)``.
    """
    # Load the starter word vectors.
    print("Loading data...")
    x_text, y = data_helpers.load_data_and_labels(FLAGS.train_data_file)
    # Pad every example to the longest whitespace-tokenized text.
    max_document_length = max(len(x.split(" ")) for x in x_text)
    print('len(x) = ', len(x_text), ' ', len(y))
    print(' max_document_length = ', max_document_length)
    # Both branches assign x and vocab_size, so no pre-initialization is
    # needed (removed the redundant `x = []; vocab_size = 0`).
    if w2v_model is None:
        # Learn a vocabulary from the corpus and map words to ids.
        vocab_processor = learn.preprocessing.VocabularyProcessor(
            max_document_length)
        x = np.array(list(vocab_processor.fit_transform(x_text)))
        vocab_size = len(vocab_processor.vocabulary_)
        # Persist the learned vocabulary for later inference runs.
        vocab_processor.save("vocab.txt")
        print('save vocab.txt')
    else:
        # Reuse the pre-trained word2vec vocabulary for the id mapping.
        x = data_helpers.get_text_idx(x_text, w2v_model.vocab_hash,
                                      max_document_length)
        vocab_size = len(w2v_model.vocab_hash)
        print('use w2v .bin')

    # Deterministic shuffle (fixed seed) for reproducible splits.
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]
    # Negative index: the last dev_sample_percentage of examples become dev.
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[
        dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[
        dev_sample_index:]
    return x_train, x_dev, y_train, y_dev, vocab_size
def load_data(w2v_model):
    """Loads starter word-vectors and train/dev/test data."""
    # Load the starter word vectors.
    print("Loading data...")
    x_text, y = data_helpers.load_data_and_labels(FLAGS.train_data_file)
    # Length (in tokens) of the longest document; everything is padded to it.
    max_document_length = max(len(doc.split(" ")) for doc in x_text)
    print('len(x) = ', len(x_text), ' ', len(y))
    print(' max_document_length = ', max_document_length)
    x = []
    vocab_size = 0
    if w2v_model is None:
        # VocabularyProcessor builds a dictionary over the tokenized texts
        # and maps each word to its index; words past max_document_length
        # are dropped and short texts are zero-padded.
        processor = learn.preprocessing.VocabularyProcessor(
            max_document_length)
        # Learn the vocabulary from x_text and return an id matrix.
        x = np.array(list(processor.fit_transform(x_text)))
        vocab_size = len(processor.vocabulary_)
        # Save the vocabulary so inference can reuse the same mapping.
        processor.save("vocab.txt")
        print('save vocab.txt')
    else:
        # Pre-trained word2vec: look word indices up in its vocabulary.
        x = data_helpers.get_text_idx(x_text, w2v_model.vocab_hash,
                                      max_document_length)
        vocab_size = len(w2v_model.vocab_hash)
        print('use w2v .bin')

    # Build the train and dev sets: fix the RNG seed so the shuffle (and
    # therefore the split) is reproducible.
    np.random.seed(10)
    order = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[order]
    y_shuffled = y[order]
    # The trailing dev_sample_percentage of examples form the dev set.
    split_at = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:split_at], x_shuffled[split_at:]
    y_train, y_dev = y_shuffled[:split_at], y_shuffled[split_at:]
    # Return the splits plus the vocabulary size.
    return x_train, x_dev, y_train, y_dev, vocab_size
def load_data(w2v_model, max_document_length=20):
    """Loads starter word-vectors and train/dev/test data."""
    # Load the starter word vectors.
    print("Loading data...")
    x_text, y_test = data_helpers.load_data_and_labels(FLAGS.valid_data_file)
    # One-hot labels -> integer class ids.
    y_test = np.argmax(y_test, axis=1)
    if max_document_length == 0:
        # Infer the padding length from the longest tokenized text.
        max_document_length = max(len(doc.split(" ")) for doc in x_text)
    print('max_document_length = ', max_document_length)
    # Map each text to a row of word ids from the w2v vocabulary.
    x = data_helpers.get_text_idx(x_text, w2v_model.vocab_hash,
                                  max_document_length)
    return x, y_test
def load_data(w2v_model):
    """Load training data and split it into shuffled train/dev sets."""
    print("Loading data...")
    x_text, y = data_helpers.load_data_and_labels(FLAGS.train_data_file)
    # All examples are padded to the longest tokenized text.
    max_document_length = max(len(doc.split(" ")) for doc in x_text)
    print('len(x) = ', len(x_text), ' ', len(y))
    print(' max_document_length = ', max_document_length)
    x = []
    vocab_size = 0
    if w2v_model is None:
        # Random initialization path: learn the vocabulary from the corpus.
        processor = learn.preprocessing.VocabularyProcessor(
            max_document_length)
        x = np.array(list(processor.fit_transform(x_text)))
        vocab_size = len(processor.vocabulary_)
        processor.save("vocab.dat")
        print('save vocab.dat')
    else:
        # Offline word2vec path: reuse its vocabulary for the id mapping.
        x = data_helpers.get_text_idx(x_text, w2v_model.vocab_hash,
                                      max_document_length)
        vocab_size = len(w2v_model.vocab_hash)
        print('use w2v .bin')

    # Shuffle deterministically, then split off the dev tail.
    np.random.seed(10)
    order = np.random.permutation(np.arange(len(y)))
    x_shuffled, y_shuffled = x[order], y[order]
    split_at = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:split_at], x_shuffled[split_at:]
    y_train, y_dev = y_shuffled[:split_at], y_shuffled[split_at:]
    return x_train, x_dev, y_train, y_dev, vocab_size