def wd_train_get_batch(title_len=30, batch_size=128):
    """Create and store word-level training and validation batches.

    Loads the word-level title and content arrays from ``../data``, maps
    ``pad_X30`` over the titles and ``wd_pad_cut_docs`` over the contents in
    a 6-process pool, concatenates both column-wise, shuffles the samples
    with a fixed seed, holds out the first 100000 shuffled samples for
    validation and passes each split to ``train_batch``.

    Args:
        title_len: Not used by this function; kept so existing callers
            keep working.
        batch_size: Forwarded unchanged to ``train_batch``.
    """
    print('loading word train_title and train_content, this should cost minutes, please wait.')
    train_title = np.load('../data/wd_train_title.npy')
    train_content = np.load('../data/wd_train_content.npy')
    p = Pool(6)
    try:
        # Pool.map already returns a list, so no extra list() copy is needed.
        X_title = np.asarray(p.map(pad_X30, train_title))
        X_content = np.asarray(p.map(wd_pad_cut_docs, train_content))
    finally:
        # Shut the worker processes down even when one of the maps raises.
        p.close()
        p.join()
    # Flatten every document to one row of 30*10 values
    # (presumably 10 sentences x 30 words -- confirm against wd_pad_cut_docs).
    # reshape() is used instead of assigning to .shape, which NumPy discourages.
    X_content = X_content.reshape([-1, 30 * 10])
    X = np.hstack([X_title, X_content])
    y = np.load('../data/y_tr.npy')

    # Split off the validation set: seeded shuffle, then the first
    # valid_num shuffled samples form the validation split.
    sample_num = X.shape[0]
    np.random.seed(13)
    valid_num = 100000
    new_index = np.random.permutation(sample_num)
    X = X[new_index]
    y = y[new_index]
    X_valid = X[:valid_num]
    y_valid = y[:valid_num]
    X_train = X[valid_num:]
    y_train = y[valid_num:]
    print('X_train.shape=', X_train.shape, 'y_train.shape=', y_train.shape)
    print('X_valid.shape=', X_valid.shape, 'y_valid.shape=', y_valid.shape)

    # Batch the validation set.
    print('creating batch data.')
    sample_num = len(X_valid)
    print('valid_sample_num=%d' % sample_num)
    train_batch(X_valid, y_valid, wd_valid_path, batch_size)

    # Batch the training set.
    sample_num = len(X_train)
    print('train_sample_num=%d' % sample_num)
    train_batch(X_train, y_train, wd_train_path, batch_size)
def ch_train_get_batch(title_len=52, content_len=300, batch_size=128):
    """Create and store char-level training and validation batches.

    Loads the char-level title and content arrays from ``../data``, maps
    ``pad_X52`` over the titles and ``pad_X300`` over the contents in a
    default-sized process pool, concatenates both column-wise, shuffles the
    samples with a fixed seed, holds out the first 100000 shuffled samples
    for validation and passes each split to ``train_batch``.

    Args:
        title_len: Not used by this function; kept so existing callers
            keep working.
        content_len: Not used by this function; kept for the same reason.
        batch_size: Forwarded unchanged to ``train_batch``.
    """
    print('loading char train_title and train_content.')
    train_title = np.load('../data/ch_train_title.npy')
    train_content = np.load('../data/ch_train_content.npy')
    p = Pool()
    try:
        X_title = np.asarray(p.map(pad_X52, train_title))
        X_content = np.asarray(p.map(pad_X300, train_content))
    finally:
        # Shut the worker processes down even when one of the maps raises.
        p.close()
        p.join()
    X = np.hstack([X_title, X_content])
    y = np.load('../data/y_tr.npy')

    # Split off the validation set: seeded shuffle, then the first
    # valid_num shuffled samples form the validation split.
    sample_num = X.shape[0]
    np.random.seed(13)
    valid_num = 100000
    new_index = np.random.permutation(sample_num)
    X = X[new_index]
    y = y[new_index]
    X_valid = X[:valid_num]
    y_valid = y[:valid_num]
    X_train = X[valid_num:]
    y_train = y[valid_num:]
    print('X_train.shape=', X_train.shape, 'y_train.shape=', y_train.shape)
    print('X_valid.shape=', X_valid.shape, 'y_valid.shape=', y_valid.shape)

    # Batch the validation set.
    print('creating batch data.')
    sample_num = len(X_valid)
    print('valid_sample_num=%d' % sample_num)
    train_batch(X_valid, y_valid, ch_valid_path, batch_size)

    # Batch the training set.
    sample_num = len(X_train)
    print('train_sample_num=%d' % sample_num)
    train_batch(X_train, y_train, ch_train_path, batch_size)
def wd_train_get_batch(title_len=30, batch_size=128):
    """Create and store word-level training and validation batches.

    Loads the word-level title and content arrays from ``../data``, maps
    ``pad_X30`` over the titles and ``wd_pad_cut_docs`` over the contents in
    a 6-process pool, concatenates both column-wise, shuffles the samples
    with a fixed seed, holds out the first 100000 shuffled samples for
    validation and passes each split to ``train_batch``.

    Args:
        title_len: Not used by this function; kept so existing callers
            keep working.
        batch_size: Forwarded unchanged to ``train_batch``.
    """
    print('loading word train_title and train_content, this should cost minutes, please wait.')
    train_title = np.load('../data/wd_train_title.npy')
    train_content = np.load('../data/wd_train_content.npy')
    p = Pool(6)
    try:
        X_title = np.asarray(p.map(pad_X30, train_title))
        X_content = np.asarray(p.map(wd_pad_cut_docs, train_content))
    finally:
        # Shut the worker processes down even when one of the maps raises.
        p.close()
        p.join()
    # Flatten every document to one row of 30*10 values
    # (presumably 10 sentences x 30 words -- confirm against wd_pad_cut_docs).
    # reshape() is used instead of assigning to .shape, which NumPy discourages.
    X_content = X_content.reshape([-1, 30 * 10])
    X = np.hstack([X_title, X_content])
    y = np.load('../data/y_tr.npy')

    # Split off the validation set: seeded shuffle, then the first
    # valid_num shuffled samples form the validation split.
    sample_num = X.shape[0]
    np.random.seed(13)
    valid_num = 100000
    new_index = np.random.permutation(sample_num)
    X = X[new_index]
    y = y[new_index]
    X_valid = X[:valid_num]
    y_valid = y[:valid_num]
    X_train = X[valid_num:]
    y_train = y[valid_num:]
    print('X_train.shape=', X_train.shape, 'y_train.shape=', y_train.shape)
    print('X_valid.shape=', X_valid.shape, 'y_valid.shape=', y_valid.shape)

    # Batch the validation set.
    print('creating batch data.')
    sample_num = len(X_valid)
    print('valid_sample_num=%d' % sample_num)
    train_batch(X_valid, y_valid, wd_valid_path, batch_size)

    # Batch the training set.
    sample_num = len(X_train)
    print('train_sample_num=%d' % sample_num)
    train_batch(X_train, y_train, wd_train_path, batch_size)
def ch_train_get_batch(title_len=52, content_len=300, batch_size=128):
    """Create and store char-level batches, releasing memory between steps.

    Loads the char-level title and content arrays from ``../data``, maps
    ``pad_X52`` over the titles and ``pad_X300`` over the contents in a
    default-sized process pool, concatenates both column-wise, shuffles the
    samples with a fixed seed, holds out the first 100000 shuffled samples
    for validation and passes each split to ``train_batch``. Intermediate
    arrays are deleted and ``gc.collect()`` is called after each stage.

    Args:
        title_len: Not used by this function; kept so existing callers
            keep working.
        content_len: Not used by this function; kept for the same reason.
        batch_size: Forwarded unchanged to ``train_batch``.
    """
    print('loading char train_title and train_content.')
    train_title = np.load('../data/ch_train_title.npy')
    train_content = np.load('../data/ch_train_content.npy')
    print('data loaded, start to pad_X52,X300')
    p = Pool()
    try:
        X_title = np.asarray(p.map(pad_X52, train_title))
        X_content = np.asarray(p.map(pad_X300, train_content))
    finally:
        # Shut the worker processes down even when one of the maps raises.
        p.close()
        p.join()
    print('Pool finished!')
    X = np.hstack([X_title, X_content])
    # The separate title/content arrays are no longer needed once stacked.
    del X_title, X_content
    gc.collect()
    print('del X_title, X_content')
    y = np.load('../data/y_tr.npy')
    print('y label loaded ...')

    # Split off the validation set: seeded shuffle, then the first
    # valid_num shuffled samples form the validation split.
    sample_num = X.shape[0]
    np.random.seed(13)
    valid_num = 100000
    new_index = np.random.permutation(sample_num)
    # NOTE: a MemoryError has been observed on the next line. Indexing with
    # an index array builds a full shuffled copy while the unshuffled X is
    # still alive, so roughly twice the size of X is needed at this point.
    X = X[new_index]
    y = y[new_index]
    X_valid = X[:valid_num]
    y_valid = y[:valid_num]
    X_train = X[valid_num:]
    y_train = y[valid_num:]
    print('X_train.shape=', X_train.shape, 'y_train.shape=', y_train.shape)
    print('X_valid.shape=', X_valid.shape, 'y_valid.shape=', y_valid.shape)

    # Batch the validation set.
    print('creating batch data.')
    sample_num = len(X_valid)
    print('valid_sample_num=%d' % sample_num)
    train_batch(X_valid, y_valid, ch_valid_path, batch_size)
    print('release space, deleting X_valid, y_valid')
    del X_valid, y_valid
    gc.collect()

    # Batch the training set.
    sample_num = len(X_train)
    print('train_sample_num=%d' % sample_num)
    train_batch(X_train, y_train, ch_train_path, batch_size)
    print('release space, deleting X_train, y_train')
    del X_train, y_train
    gc.collect()
def wd_train_get_batch(title_len=30, content_len=150, batch_size=128):
    """Create and store word-level training and validation batches.

    Loads the word-level title and content arrays from ``../data``, maps
    ``pad_X30`` over the titles and ``pad_X150`` over the contents in a
    default-sized process pool and concatenates both column-wise. Labels
    come from ``get_labels()`` and are saved to ``../data/y_tr.npy`` before
    shuffling. The samples are shuffled with a fixed seed, the first 10000
    shuffled samples are held out for validation, and each split is passed
    to ``train_batch``.

    Args:
        title_len: Not used by this function; kept so existing callers
            keep working.
        content_len: Not used by this function; kept for the same reason.
        batch_size: Forwarded unchanged to ``train_batch``.
    """
    print('loading word train title and content...')
    train_title = np.load('../data/wd_train_title.npy')
    train_content = np.load('../data/wd_train_content.npy')
    p = Pool()
    try:
        title = np.asarray(p.map(pad_X30, train_title))
        content = np.asarray(p.map(pad_X150, train_content))
    finally:
        # Shut the worker processes down even when one of the maps raises.
        p.close()
        p.join()
    X = np.hstack([title, content])
    print('getting labels, this should cost several minutes, please wait...')
    y = get_labels()
    print('y.shape=', y.shape)
    # Saved in the original (unshuffled) order.
    np.save('../data/y_tr.npy', y)

    # Split off the validation set: seeded shuffle, then the first
    # valid_num shuffled samples form the validation split.
    sample_num = X.shape[0]
    np.random.seed(13)
    valid_num = 10000
    new_index = np.random.permutation(sample_num)
    X = X[new_index]
    y = y[new_index]
    X_valid = X[:valid_num]
    # Labels must be sliced from y; slicing X here would feed the features
    # to train_batch as if they were labels.
    y_valid = y[:valid_num]
    X_train = X[valid_num:]
    y_train = y[valid_num:]
    print('X_train.shape=', X_train.shape, 'y_train.shape=', y_train.shape)
    print('X_valid.shape=', X_valid.shape, 'y_valid.shape=', y_valid.shape)
    print('creating batch data.')

    # Batch the validation set.
    sample_num = len(X_valid)
    print('valid_sample_num=%d' % sample_num)
    train_batch(X_valid, y_valid, wd_valid_path, batch_size)
    print('release space, deleting X_valid, y_valid')
    del X_valid, y_valid
    gc.collect()

    # Batch the training set.
    sample_num = len(X_train)
    print('train_sample_num=%d' % sample_num)
    train_batch(X_train, y_train, wd_train_path, batch_size)
    print('release space, deleting X_train, y_train')
    del X_train, y_train
    gc.collect()
def ch_train_get_batch(title_len=30, content_len=150, batch_size=128):
    """Create and store char-level training and validation batches.

    Loads the char-level title and content arrays from ``../data``, maps
    ``pad_X30`` over the titles and ``pad_X150`` over the contents in a
    default-sized process pool and concatenates both column-wise. Labels
    come from ``get_lables()`` and are saved to ``../data/y_tr.npy`` before
    shuffling. The samples are shuffled with a fixed seed, the first 10000
    shuffled samples are held out for validation, and each split is passed
    to ``train_batch``.

    Args:
        title_len: Not used by this function; kept so existing callers
            keep working.
        content_len: Not used by this function; kept for the same reason.
        batch_size: Forwarded unchanged to ``train_batch``.
    """
    print('loading char train_title and train_content.')
    train_title = np.load('../data/ch_train_title.npy')
    train_content = np.load('../data/ch_train_content.npy')
    p = Pool()
    try:
        # Pad and truncate (presumably what the pad_X* helpers do -- they
        # are defined elsewhere).
        X_title = np.asarray(p.map(pad_X30, train_title))
        X_content = np.asarray(p.map(pad_X150, train_content))
    finally:
        # Shut the worker processes down even when one of the maps raises.
        p.close()
        p.join()
    X = np.hstack([X_title, X_content])
    print('getting labels, this should cost minutes, please wait.')
    # NOTE(review): spelled `get_lables` here but `get_labels` in the
    # word-level variant -- confirm which name is actually defined.
    y = get_lables()
    print('y.shape=', y.shape)
    # Saved in the original (unshuffled) order; a later run could load
    # '../data/y_tr.npy' instead of recomputing the labels.
    np.save('../data/y_tr.npy', y)

    # Split off the validation set: seeded shuffle, then the first
    # valid_num shuffled samples form the validation split.
    sample_num = X.shape[0]
    print(sample_num)
    np.random.seed(13)
    valid_num = 10000
    new_index = np.random.permutation(sample_num)
    X = X[new_index]
    y = y[new_index]
    X_valid = X[:valid_num]
    y_valid = y[:valid_num]
    X_train = X[valid_num:]
    y_train = y[valid_num:]
    print('X_train.shape=', X_train.shape, 'y_train.shape=', y_train.shape)
    print('X_valid.shape=', X_valid.shape, 'y_valid.shape=', y_valid.shape)

    # Batch the validation set.
    print('creating batch data.')
    sample_num = len(X_valid)
    print('valid_sample_num=%d' % sample_num)
    train_batch(X_valid, y_valid, ch_valid_path, batch_size)

    # Batch the training set.
    sample_num = len(X_train)
    print('train_sample_num=%d' % sample_num)
    train_batch(X_train, y_train, ch_train_path, batch_size)
def wd_train_get_batch(title_len=30, content_len=200, batch_size=128):
    """Create and store word-level training batches (no validation split).

    Loads the word-level titles, contents and labels from ``../data_new``,
    maps ``pad_X30`` over the titles and ``pad_X200`` over the contents in a
    default-sized process pool, concatenates both column-wise, shuffles
    samples and labels with the same random permutation and passes the
    whole set to ``train_batch``. The permutation is not seeded inside this
    function, so the order depends on the global NumPy random state.

    Args:
        title_len: Not used by this function; kept so existing callers
            keep working.
        content_len: Not used by this function; kept for the same reason.
        batch_size: Forwarded unchanged to ``train_batch``.
    """
    print('loading word train_title and train_content.')
    train_title = np.load('../data_new/wd_train_title.npy')
    train_content = np.load('../data_new/wd_train_content.npy')
    y = np.load('../data_new/y_tr.npy')
    print('y.shape=', y.shape)
    # Show the first raw sample of each array as a sanity check.
    print(" raw titles:", train_title[0], 'title.shape:', train_title.shape)
    print(" raw contents:", train_content[0], 'contents.shape:', train_content.shape)
    print("y:", y[0], 'y.shape:', y.shape)

    # Pad and truncate (presumably what the pad_X* helpers do -- they are
    # defined elsewhere).
    p = Pool()
    try:
        X_title = np.asarray(p.map(pad_X30, train_title))
        X_content = np.asarray(p.map(pad_X200, train_content))
    finally:
        # Shut the worker processes down even when one of the maps raises.
        p.close()
        p.join()
    # NOTE(review): the labels below say "padding 20" / "padding 100" while
    # the helpers used are pad_X30 / pad_X200 -- confirm and fix the text.
    print("padding 20 X_title:", X_title[0], "shape:", X_title.shape)
    print("padding 100 X_contents:", X_content[0], "shape:", X_content.shape)

    # Concatenate title and content features column-wise.
    X = np.hstack([X_title, X_content])
    sample_num = len(X)
    print('sample_num=%d' % sample_num)
    # Apply one permutation to both arrays so samples stay aligned with labels.
    new_index = np.random.permutation(sample_num)
    X = X[new_index]
    y = y[new_index]
    print("X_train.shape:", X.shape, 'y_train.shape=', y.shape)
    print('creating batch data.')
    # Make the batches.
    train_batch(X, y, wd_train_path, batch_size)
def jieba_train_get_batch(batch_size=config.BATCH_SIZE):
    """Create batches for every label type of the jieba train and valid sets.

    For the training data under ``jieba_train_path`` and then the validation
    data under ``jieba_valid_path``, the feature array is loaded once and
    paired in turn with the law, accu, time and timelog label arrays. Each
    pair is passed to ``train_batch`` with its own sub-directory of
    ``<data path>batch/``. Label arrays are released before the next one is
    loaded.

    Args:
        batch_size: Forwarded unchanged to ``train_batch``.
    """
    # (label file, message printed with the label count, output sub-directory)
    train_jobs = (
        ('train_law_label.npy', 'train_sample_num_law=%d', 'law/'),
        ('train_accu_label.npy', 'train_sample_num_accu=%d', 'accu/'),
        ('train_time_label.npy', 'train_sample_num_time=%d', 'time/'),
        ('train_time_labellog.npy', 'train_sample_num_timelog=%d', 'timelog/'),
    )
    valid_jobs = (
        ('valid_law_label.npy', 'valid_sample_num_law=%d', 'law/'),
        ('valid_accu_label.npy', 'valid_sample_num_accu=%d', 'accu/'),
        ('valid_time_label.npy', 'valid_sample_num_time=%d', 'time/'),
        ('valid_time_labellog.npy', 'valid_sample_num_timelog=%d', 'timelog/'),
    )

    # Batch the training set, one label type at a time.
    out_root = jieba_train_path + 'batch/'
    print('loading word train_title and train_content.')
    features = np.load(jieba_train_path + 'train_data.npy')
    print('train_sample_num=%d' % len(features))
    for label_file, count_msg, sub_dir in train_jobs:
        labels = np.load(jieba_train_path + label_file)
        print(count_msg % len(labels))
        train_batch(features, labels, out_root + sub_dir, batch_size)
        # Drop the labels before the next array is loaded.
        del labels
    del features

    # Batch the validation set the same way.
    out_root = jieba_valid_path + 'batch/'
    features = np.load(jieba_valid_path + 'valid_data.npy')
    print('valid_sample_num=%d' % len(features))
    for label_file, count_msg, sub_dir in valid_jobs:
        labels = np.load(jieba_valid_path + label_file)
        print(count_msg % len(labels))
        train_batch(features, labels, out_root + sub_dir, batch_size)
        del labels
    del features