def main():
    args = parse_arg()
    with open(args.config_path, "r") as f:
        param = json.load(f)
    # Assumption: prepare_model_and_data also returns the normalized
    # embedding matrix; the original call dropped it, which left
    # `embedding_matrix` undefined below.
    model, embedding_matrix, train_processed, valid_processed = \
        prepare_model_and_data(param)
    hist_callback = mz.data_generator.callbacks.Histogram(
        embedding_matrix, bin_size=30, hist_mode='LCH')
    tprint("defining generator")
    train_generator = mz.DataGenerator(train_processed,
                                       batch_size=param['batch_size'],
                                       shuffle=True,
                                       callbacks=[hist_callback])
    valid_x, valid_y = valid_processed.unpack()
    evaluate = mz.callbacks.EvaluateAllMetrics(model, x=valid_x, y=valid_y,
                                               batch_size=len(valid_x))
    early_stop = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=1,
                                                  verbose=1, mode='min')
    tprint("fitting")
    callbacks = [evaluate]
    if param['early_stop']:
        callbacks.append(early_stop)
    history = model.fit_generator(train_generator, epochs=100,
                                  callbacks=callbacks, workers=5,
                                  use_multiprocessing=False)
def build(self, data_pack, **kwargs) -> DataGenerator:
    """Build a DataGenerator.

    :param data_pack: DataPack to build upon.
    :param kwargs: Additional keyword arguments to override the
        keyword arguments passed in `__init__`.
    """
    return mz.DataGenerator(data_pack, **{**self._kwargs, **kwargs})
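# A minimal usage sketch for the `build` method above, assuming it belongs
# to MatchZoo's DataGeneratorBuilder (which stashes its __init__ kwargs in
# `self._kwargs`). Because of the `{**self._kwargs, **kwargs}` merge order,
# per-call kwargs override the stored defaults:
builder = mz.DataGeneratorBuilder(batch_size=64, shuffle=True)
data_pack = mz.datasets.toy.load_data()
gen_default = builder.build(data_pack)              # batch_size=64
gen_small = builder.build(data_pack, batch_size=8)  # override -> 8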
def train(topic_number, embedding, model_type='drmm'):
    task = mz.tasks.Ranking()
    train_raw = train_data(topic_number)
    if model_type == 'dense':
        train_processed, model = dense_preprocess(train_raw, task)
        if model.params.completed():
            model.build()
            model.compile()
        x, y = train_processed.unpack()
        model.fit(x, y, batch_size=32, epochs=5)
        if not os.path.exists(os.path.join(MODEL_DUMP, MODEL_TYPE)):
            os.makedirs(os.path.join(MODEL_DUMP, MODEL_TYPE))
        model.save(os.path.join(MODEL_DUMP, MODEL_TYPE, str(topic_number)))
    if model_type == 'drmm':
        # glove_embedding = mz.datasets.embeddings.load_glove_embedding(dimension=300)
        train_processed, preprocessor, model = drmm_preprocess(
            train_raw, task, embed_out_dim=embedding.output_dim)
        if model.params.completed():
            model.build()
            model.compile()
        embedding_matrix = embedding.build_matrix(
            preprocessor.context['vocab_unit'].state['term_index'])
        # Normalize the word embeddings for fast histogram generation.
        l2_norm = np.sqrt((embedding_matrix * embedding_matrix).sum(axis=1))
        embedding_matrix = embedding_matrix / l2_norm[:, np.newaxis]
        model.load_embedding_matrix(embedding_matrix)
        hist_callback = mz.data_generator.callbacks.Histogram(
            embedding_matrix, bin_size=30, hist_mode='LCH')
        train_generator = mz.DataGenerator(train_processed, mode='point',
                                           num_dup=5, num_neg=10,
                                           batch_size=20,
                                           callbacks=[hist_callback])
        history = model.fit_generator(train_generator, epochs=30, workers=30,
                                      use_multiprocessing=True)
        if not os.path.exists(os.path.join(MODEL_DUMP, MODEL_TYPE)):
            os.makedirs(os.path.join(MODEL_DUMP, MODEL_TYPE))
        model.save(os.path.join(MODEL_DUMP, MODEL_TYPE, str(topic_number)))
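# Why the rows are L2-normalized above: once every embedding has unit norm,
# a plain dot product equals cosine similarity, which is what the Histogram
# callback bins over [-1, 1]. A self-contained check:
import numpy as np
emb = np.random.rand(5, 300)
emb = emb / np.sqrt((emb * emb).sum(axis=1))[:, np.newaxis]
cos = emb @ emb.T                          # dot products of unit vectors
assert np.allclose(np.diag(cos), 1.0)      # self-similarity is exactly 1
assert (np.abs(cos) <= 1.0 + 1e-9).all()   # every entry is a valid cosine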
def get_model_and_data(topic_number, d_pack_test, model_type, embedding):
    if model_type == 'dense':
        # Load the model.
        model = mz.load_model(os.path.join(MODEL_DUMP, MODEL_TYPE,
                                           str(topic_number)))
        # Prepare the preprocessor.
        train_raw = train_data(topic_number)
        preprocessor = mz.preprocessors.BasicPreprocessor()
        preprocessor.fit(train_raw)
        # Transform the document data.
        test_processed = preprocessor.transform(d_pack_test)
        test_x, test_y = test_processed.unpack()
    if model_type == 'drmm':
        # Load the model.
        model = mz.load_model(os.path.join(MODEL_DUMP, MODEL_TYPE,
                                           str(topic_number)))
        task = mz.tasks.Ranking()
        train_raw = train_data(topic_number)
        preprocessor = mz.preprocessors.BasicPreprocessor(
            fixed_length_left=10, fixed_length_right=100,
            remove_stop_words=False)
        preprocessor.fit(train_raw)
        test_processed = preprocessor.transform(d_pack_test)
        embedding_matrix = embedding.build_matrix(
            preprocessor.context['vocab_unit'].state['term_index'])
        # Normalize the word embeddings for fast histogram generation.
        l2_norm = np.sqrt((embedding_matrix * embedding_matrix).sum(axis=1))
        embedding_matrix = embedding_matrix / l2_norm[:, np.newaxis]
        model.load_embedding_matrix(embedding_matrix)
        hist_callback = mz.data_generator.callbacks.Histogram(
            embedding_matrix, bin_size=30, hist_mode='LCH')
        test_generator = mz.DataGenerator(data_pack=test_processed,
                                          mode='point',
                                          callbacks=[hist_callback])
        test_x, test_y = test_generator[:]
    return model, test_x
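# Note on the `test_generator[:]` pattern above: slicing a DataGenerator
# runs its callbacks over every batch, so `test_x` includes the matching
# histogram input DRMM expects; a plain `test_processed.unpack()` would
# yield only the raw text inputs. (Hedged: the exact extra-input key name
# varies across MatchZoo versions.)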
def test_resample():
    model = mz.models.Naive()
    prpr = model.get_default_preprocessor()
    data_raw = mz.datasets.toy.load_data()
    data = prpr.fit_transform(data_raw)
    model.params.update(prpr.context)
    model.params['task'] = mz.tasks.Ranking()
    model.build()
    model.compile()

    data_gen = mz.DataGenerator(
        data_pack=data,
        mode='pair',
        resample=True,
        batch_size=4
    )

    class CheckResample(keras.callbacks.Callback):
        def __init__(self, data_gen):
            super().__init__()
            self._data_gen = data_gen
            self._orig_indices = None
            self._flags = []

        def on_epoch_end(self, epoch, logs=None):
            curr_indices = self._data_gen.batch_indices
            if not self._orig_indices:
                self._orig_indices = copy.deepcopy(curr_indices)
            else:
                self._flags.append(self._orig_indices != curr_indices)
                self._orig_indices = curr_indices

    check_resample = CheckResample(data_gen)
    model.fit_generator(data_gen, epochs=5, callbacks=[check_resample])
    assert check_resample._flags
    assert all(check_resample._flags)
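# What `resample=True` does, without a training loop: a sketch reusing the
# processed `data` pack from the test above. It relies only on the
# `batch_indices` attribute the test checks and the standard
# keras.utils.Sequence epoch hook; resampling is random, so indices can
# occasionally coincide.
gen = mz.DataGenerator(data_pack=data, mode='pair', resample=True,
                       batch_size=4)
before = copy.deepcopy(gen.batch_indices)
gen.on_epoch_end()                   # triggers a fresh pair resample
print(gen.batch_indices != before)   # usually True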
def test_generator(request, test_data_processed):
    return mz.DataGenerator(test_data_processed)
    mz.metrics.NormalizedDiscountedCumulativeGain(k=3),
    mz.metrics.MeanAveragePrecision()
]

# Initialize the model and fine-tune the hyper-parameters.
model = mz.models.DSSM()
model.params['input_shapes'] = preprocessor.context['input_shapes']
model.params['task'] = ranking_task
model.guess_and_fill_missing_params()
model.build()
model.compile()

# Generate pair-wise training data on the fly; evaluate model performance
# on validation data with a customized callback.
# WARNING: PairDataGenerator will be deprecated in MatchZoo v2.2.
# Use `DataGenerator` with callbacks instead.
# train_generator = mz.PairDataGenerator(train_processed, num_dup=1,
#                                        num_neg=4, batch_size=64,
#                                        shuffle=True)
# `mode='pair'` is required for num_dup/num_neg to take effect; the
# original call omitted it, leaving the generator in point mode.
train_generator = mz.DataGenerator(train_processed, mode='pair', num_dup=1,
                                   num_neg=4, batch_size=64, shuffle=True)
valid_x, valid_y = valid_processed.unpack()
evaluate = mz.callbacks.EvaluateAllMetrics(model, x=valid_x, y=valid_y,
                                           batch_size=len(valid_x))
history = model.fit_generator(train_generator, epochs=20,
                              callbacks=[evaluate], workers=5,
                              use_multiprocessing=False)
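# How pair mode shapes a batch (a hedged summary of MatchZoo's behavior,
# not verbatim library documentation): each positive document is duplicated
# `num_dup` times, and every copy is grouped with `num_neg` sampled
# negatives, so rows come out in blocks of (1 + num_neg). A rank-hinge or
# rank-cross-entropy loss built with the same num_neg then scores each
# positive against the negatives in its block.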
def main2():
    args = parse_arg()
    with open(args.config_path, "r") as f:
        param = json.load(f)
    tprint("Loading data")
    preprocessor, train_processed, valid_processed = drmm_processed()
    print(train_processed)
    tprint("Defining task")
    classification_task = mz.tasks.classification.Classification()
    classification_task.metrics = ['accuracy']
    output_dim = 300
    tprint('output_dim : {}'.format(output_dim))
    # Initialize the model, fine-tune the hyper-parameters.
    tprint("building model")
    model = get_drmm_model(preprocessor, classification_task, output_dim)
    # for key, v in param.items():
    #     if key in model.params:
    #         model.params[key] = v
    # model.guess_and_fill_missing_params(verbose=1)
    step_per_epoch = 423 * 128
    num_max_steps = 100 * step_per_epoch
    # if 'lr_decay' in param and param['lr_decay']:
    #     lr = tf.keras.optimizers.schedules.ExponentialDecay(
    #         initial_learning_rate=param['lr'],
    #         decay_steps=num_max_steps / 20,
    #         decay_rate=0.9)
    # else:
    #     lr = param['lr']
    # model.params['optimizer'] = tf.keras.optimizers.Adam(learning_rate=lr)
    model.build()
    model.compile()
    tprint("processing embedding")
    term_index = preprocessor.context['vocab_unit'].state['term_index']
    embedding_matrix = prepare_embedding(output_dim, term_index)
    model.load_embedding_matrix(embedding_matrix)
    hist_callback = mz.data_generator.callbacks.Histogram(
        embedding_matrix, bin_size=30, hist_mode='LCH')
    tprint("defining generator")
    train_generator = mz.DataGenerator(train_processed,
                                       batch_size=param['batch_size'],
                                       shuffle=True,
                                       callbacks=[hist_callback])
    valid_x, valid_y = valid_processed.unpack()
    evaluate = mz.callbacks.EvaluateAllMetrics(model, x=valid_x, y=valid_y,
                                               batch_size=len(valid_x))
    early_stop = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=1,
                                                  verbose=1, mode='min')
    tprint("fitting")
    callbacks = [evaluate]
    # if param['early_stop']:
    #     callbacks.append(early_stop)
    history = model.fit_generator(train_generator, epochs=100,
                                  callbacks=callbacks, workers=30,
                                  use_multiprocessing=True)
def drmm_api(qpool, logdir, dataset_path, train_id, parameter):
    keras.backend.clear_session()
    # Load the data and create the preprocessor object.
    train_pack = load_train_data(train_id, parameter['existing_dataset'],
                                 parameter['task'])
    predict_pack = load_test_data(train_id, parameter['existing_dataset'],
                                  parameter['task'])
    preprocessor = mz.preprocessors.BasicPreprocessor(
        fixed_length_left=10, fixed_length_right=100,
        remove_stop_words=False)
    # Redirect stderr to the log file.
    logdir.set_preprocess_id(train_id)
    err_old = sys.stderr
    sys.stderr = logdir
    # Write the preprocessor.fit output to the log, then restore stderr
    # and save the preprocessor.
    train_pack_processed = preprocessor.fit_transform(train_pack)
    sys.stderr = err_old
    preprocessor.save(ROOT_PATH + 'matchzoo_temp_files/preprocessors/'
                      + train_id + '.drmm_preprocessor')
    predict_pack_processed = preprocessor.transform(predict_pack)
    with open(ROOT_PATH + 'matchzoo_temp_files/logger/' + train_id
              + '.preprocess_log', 'a') as f:
        f.write('Preprocess finished!')
    ranking_task = mz.tasks.Ranking(loss=mz.losses.RankHingeLoss())
    ranking_task.metrics = [
        mz.metrics.NormalizedDiscountedCumulativeGain(k=3),
        mz.metrics.NormalizedDiscountedCumulativeGain(k=5),
        mz.metrics.MeanAveragePrecision()
    ]
    model = mz.models.DRMMTKS()
    bin_size = 30
    # model.params['input_shapes'] = [[10, ], [10, bin_size, ]]
    model.params.update(preprocessor.context)
    model.params['task'] = ranking_task
    model.params['mask_value'] = parameter['mask_value']
    model.params['embedding_input_dim'] = preprocessor.context['vocab_size']
    model.params['embedding_output_dim'] = parameter['embedding_output_dim']
    model.params['mlp_num_layers'] = parameter['mlp_num_layers']
    model.params['mlp_num_units'] = parameter['mlp_num_units']
    model.params['mlp_num_fan_out'] = parameter['mlp_num_fan_out']
    model.params['top_k'] = parameter['top_k']
    model.params['mlp_activation_func'] = 'relu'
    model.params['optimizer'] = 'adadelta'
    model.guess_and_fill_missing_params()
    model.build()
    model.compile()
    model.backend.summary()
    glove_embedding = mz.datasets.embeddings.load_glove_embedding(
        dimension=300)
    embedding_matrix = glove_embedding.build_matrix(
        preprocessor.context['vocab_unit'].state['term_index'])
    l2_norm = np.sqrt((embedding_matrix * embedding_matrix).sum(axis=1))
    embedding_matrix = embedding_matrix / l2_norm[:, np.newaxis]
    model.load_embedding_matrix(embedding_matrix)
    # pred_generator = mz.HistogramDataGenerator(
    #     data_pack=predict_pack_processed,
    #     embedding_matrix=embedding_matrix,
    #     bin_size=bin_size, hist_mode='LCH')
    # pred_x, pred_y = pred_generator[:]
    pred_x, pred_y = predict_pack_processed.unpack()
    evaluate = mz.callbacks.EvaluateAllMetrics(model, x=pred_x, y=pred_y,
                                               once_every=1,
                                               batch_size=len(pred_y))
    train_generator = mz.DataGenerator(train_pack_processed, mode='pair',
                                       num_dup=2, num_neg=1, batch_size=20)
    # Redirect stdout to the log.
    qpool.set_trainid(train_id)
    old = sys.stdout
    sys.stdout = qpool
    model.fit_generator(train_generator, epochs=parameter['epochs'],
                        callbacks=[evaluate], workers=5,
                        use_multiprocessing=False)
    sys.stdout = old
    model.save(ROOT_PATH + 'matchzoo_temp_files/models/' + train_id
               + '.drmm_model')
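# A tidier variant of the manual stdout swap above, using the standard
# library; it restores sys.stdout even if fit_generator raises. A sketch,
# assuming `qpool` is file-like (implements write/flush):
from contextlib import redirect_stdout

qpool.set_trainid(train_id)
with redirect_stdout(qpool):
    model.fit_generator(train_generator, epochs=parameter['epochs'],
                        callbacks=[evaluate], workers=5,
                        use_multiprocessing=False)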
embedding_matrix = glove_embedding.build_matrix(
    preprocessor.context['vocab_unit'].state['term_index'])
# Normalize the word embeddings for fast histogram generation.
l2_norm = np.sqrt((embedding_matrix * embedding_matrix).sum(axis=1))
embedding_matrix = embedding_matrix / l2_norm[:, np.newaxis]
model.load_embedding_matrix(embedding_matrix)
print("embedding matrix loaded")
hist_callback = mz.data_generator.callbacks.Histogram(
    embedding_matrix, bin_size=30, hist_mode='LCH')
pred_generator = mz.DataGenerator(dev_pack_processed, mode='point',
                                  callbacks=[hist_callback])
print("pred generator")
pred_x, pred_y = pred_generator[:]
evaluate = mz.callbacks.EvaluateAllMetrics(
    model, x=pred_x, y=pred_y, once_every=2, batch_size=len(pred_y),
    model_save_path='./pretrained_models/drmm_pretrained_model_fold'
                    + fold + '/')
train_generator = mz.DataGenerator(train_pack_processed,
model.load_embedding_matrix(embeddingMatrix)
print('Unpacking test data')
xTest, yTest = dataTestProc.unpack()
evaluate = mz.callbacks.EvaluateAllMetrics(model, x=xTest, y=yTest,
                                           batch_size=len(xTest))
print('Generating training data!')
# This needs to use the processed data!
trainGenerator = mz.DataGenerator(dataTrainProc, mode='pair', num_dup=5,
                                  num_neg=1, batch_size=BATCH_SIZE)
print('num batches:', len(trainGenerator))
history = model.fit_generator(trainGenerator, epochs=epochQty,
                              callbacks=[evaluate], workers=WORKERS_QTY,
                              use_multiprocessing=USE_MULTI_PROC)
model.save(modelFile)
print(model.evaluate(xTest, yTest, batch_size=128))
# except:
#     # tb is traceback
#     type, value, tb = sys.exc_info()
model.backend.summary()
glove_embedding = mz.datasets.embeddings.load_glove_embedding(dimension=300)
embedding_matrix = glove_embedding.build_matrix(
    preprocessor.context['vocab_unit'].state['term_index'])
# Normalize the word embeddings for fast histogram generation.
l2_norm = np.sqrt((embedding_matrix * embedding_matrix).sum(axis=1))
embedding_matrix = embedding_matrix / l2_norm[:, np.newaxis]
model.load_embedding_matrix(embedding_matrix)
hist_callback = mz.data_generator.callbacks.Histogram(
    embedding_matrix, bin_size=30, hist_mode='LCH')
test_generator = mz.DataGenerator(test_processed, mode='point',
                                  callbacks=[hist_callback])
test_x, test_y = test_generator[:]
eval_generator = mz.DataGenerator(validation_processed, mode='point',
                                  callbacks=[hist_callback])
val_x, val_y = eval_generator[:]
evaluate = mz.callbacks.EvaluateAllMetrics(model, x=val_x, y=val_y,
                                           batch_size=len(val_y))
callback_earlystopping = keras.callbacks.EarlyStopping(monitor='val_loss',
                                                       mode='min', verbose=0,
model.backend.summary()
embedding_matrix = embedding.build_matrix(
    preprocessor.context['vocab_unit'].state['term_index'])
model.load_embedding_matrix(embedding_matrix)
test_x, test_y = test_pack_processed.unpack()
evaluate = mz.callbacks.EvaluateAllMetrics(model, x=test_x, y=test_y,
                                           batch_size=len(test_x))
train_generator = mz.DataGenerator(train_pack_processed, mode='pair',
                                   num_dup=2, num_neg=1, batch_size=4)
# X, y = train_pack_processed.unpack()
# print('num batches:', len(train_generator))
# Note: Keras EarlyStopping/ModelCheckpoint monitor a *named* logged scalar
# (e.g. 'val_loss'), not a MatchZoo metric object, so the commented-out
# callbacks below would not work as written.
# early_stop = EarlyStopping(
#     monitor=mz.metrics.NormalizedDiscountedCumulativeGain(k=1),
#     mode="max", patience=1)
# check_point = ModelCheckpoint(
#     model_path + "best_model.h5",
#     monitor=mz.metrics.NormalizedDiscountedCumulativeGain(k=1),
#     verbose=1, save_best_only=True, mode="min")
history = model.fit_generator(train_generator, epochs=10,
                              callbacks=[evaluate], workers=1)
# model.fit(x=X, y=y, batch_size=32, epochs=100,
#           callbacks=[evaluate, early_stop])
def tutorial_drmm():
    # data = get_processed_data()
    # data = knrm_processed()
    print("Loading data")
    data = get_processed_data_from_cache()
    preprocessor, train_processed, valid_processed = data
    # save_to_pickle(data, "matchzoo_prac1")
    print("Defining task")
    ranking_task = mz.tasks.Ranking(
        loss=mz.losses.RankCrossEntropyLoss(num_neg=4))
    ranking_task.metrics = [
        mz.metrics.NormalizedDiscountedCumulativeGain(k=3),
        mz.metrics.MeanAveragePrecision()
    ]
    glove_embedding = mz.datasets.embeddings.load_glove_embedding(
        dimension=300)
    print('output_dim', glove_embedding.output_dim)
    # Initialize the model, fine-tune the hyper-parameters.
    print("building model")
    bin_size = 30
    model = mz.models.DRMM()
    model.params.update(preprocessor.context)
    model.params['input_shapes'] = [[10, ], [10, bin_size, ]]
    model.params['task'] = ranking_task
    model.params['mask_value'] = 0
    model.params['embedding_output_dim'] = glove_embedding.output_dim
    model.params['mlp_num_layers'] = 1
    model.params['mlp_num_units'] = 10
    model.params['mlp_num_fan_out'] = 1
    model.params['mlp_activation_func'] = 'tanh'
    model.params['optimizer'] = 'adadelta'
    model.build()
    model.compile()
    model.backend.summary()
    embedding_matrix = glove_embedding.build_matrix(
        preprocessor.context['vocab_unit'].state['term_index'])
    print(embedding_matrix.shape)
    # Normalize the word embeddings for fast histogram generation.
    l2_norm = np.sqrt((embedding_matrix * embedding_matrix).sum(axis=1))
    print(l2_norm.shape)
    embedding_matrix = embedding_matrix / l2_norm[:, np.newaxis]
    print(embedding_matrix.shape)
    model.load_embedding_matrix(embedding_matrix)
    hist_callback = mz.data_generator.callbacks.Histogram(
        embedding_matrix, bin_size=bin_size, hist_mode='LCH')
    pred_generator = mz.DataGenerator(valid_processed, mode='point',
                                      callbacks=[hist_callback])
    pred_x, pred_y = pred_generator[:]
    print("defining generator")
    train_generator = mz.DataGenerator(train_processed, mode='pair',
                                       num_dup=5, num_neg=10, batch_size=20,
                                       callbacks=[hist_callback])
    # Evaluate on the generator-unpacked data (which carries the histogram
    # features); the original passed `valid_processed.unpack()`, which
    # lacks the histogram input DRMM needs and left pred_x/pred_y unused.
    evaluate = mz.callbacks.EvaluateAllMetrics(model, x=pred_x, y=pred_y,
                                               batch_size=len(pred_x))
    print("fitting")
    history = model.fit_generator(train_generator, epochs=20,
                                  callbacks=[evaluate], workers=5,
                                  use_multiprocessing=False)
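# Rough shape of what the Histogram callback feeds DRMM above: for each of
# the 10 query terms, cosine similarities against document terms are binned
# into `bin_size` buckets over [-1, 1]; 'LCH' takes the log of the counts.
# A sketch of one term's histogram (counts start at 1 so the log is finite;
# this mirrors common DRMM implementations, not necessarily MatchZoo's
# exact code):
import numpy as np
sims = np.array([0.9, 0.1, -0.3, 1.0])                 # cosine similarities
bin_ids = ((sims + 1.0) / 2.0 * (30 - 1)).astype(int)  # map [-1, 1] -> bins
hist = np.ones(30)
np.add.at(hist, bin_ids, 1.0)                          # count histogram
lch = np.log(hist)                                     # 'LCH' variant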
def data_gen():
    return mz.DataGenerator(mz.datasets.toy.load_data())
if __name__ == '__main__':
    nltk.download('punkt')
    # Paths to the train/test sets in the format described above.
    train_data_path = 'my_train_data.csv'
    test_data_path = 'my_test_data.csv'
    train_data = load_data(train_data_path)
    test_data = load_data(test_data_path)
    # The dev set takes 10% of the training data.
    train_dev_split = int(len(train_data) * 0.9)
    train = train_data[:train_dev_split]
    dev = train_data[train_dev_split:]
    # The preprocessor essentially maps characters to ids, so Chinese text
    # needs no word segmentation.
    train_pack_processed = preprocessor.fit_transform(train)
    dev_pack_processed = preprocessor.transform(dev)
    test_pack_processed = preprocessor.transform(test_data)
    # Training data generator.
    train_data_generator = mz.DataGenerator(train_pack_processed,
                                            batch_size=32, shuffle=True)
    test_x, test_y = test_pack_processed.unpack()
    dev_x, dev_y = dev_pack_processed.unpack()
    model = build()
    batch_size = 32
    evaluate = mz.callbacks.EvaluateAllMetrics(model, x=dev_x, y=dev_y,
                                               batch_size=batch_size)
    model.fit_generator(train_data_generator, epochs=5, callbacks=[evaluate],
                        workers=5, use_multiprocessing=False)
    y_pred = model.predict(test_x)
    left_id = test_x['id_left']
    right_id = test_x['id_right']
    # The original compared len(left_id) to itself; compare left vs. right.
    assert len(left_id) == len(right_id)
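    # A hedged sketch of grouping the predictions above into per-query
    # rankings (assumes pandas is available and one relevance score per
    # row; column names just mirror the arrays unpacked above):
    import pandas as pd
    df = pd.DataFrame({'id_left': left_id, 'id_right': right_id,
                       'score': y_pred.squeeze()})
    ranked = df.sort_values(['id_left', 'score'], ascending=[True, False])
    print(ranked.groupby('id_left').head(3))  # top-3 documents per query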
def train_generator(request, train_data_processed):
    return mz.DataGenerator(train_data_processed)
model.params['dropout_rate'] = 0.33
print(model.params)
model.build()
model.compile()
model.backend.summary()
glove_embedding = mz.datasets.embeddings.load_glove_embedding(dimension=300)
embedding_matrix = glove_embedding.build_matrix(
    preprocessor.context['vocab_unit'].state['term_index'])
model.load_embedding_matrix(embedding_matrix)
dpool_callback = mz.data_generator.callbacks.DynamicPooling(
    fixed_length_left=20, fixed_length_right=20)
val_generator = mz.DataGenerator(validation_processed,
                                 callbacks=[dpool_callback])
val_x, val_y = val_generator[:]
test_generator = mz.DataGenerator(test_processed,
                                  callbacks=[dpool_callback])
test_x, test_y = test_generator[:]
evaluate = mz.callbacks.EvaluateAllMetrics(model, x=val_x, y=val_y,
                                           batch_size=len(val_y))
callback_earlystopping = keras.callbacks.EarlyStopping(monitor='val_loss',
                                                       mode='min', verbose=0,
                                                       patience=100,
                                                       min_delta=0.001)
mcp_save = keras.callbacks.ModelCheckpoint('best_one_matchpyramid',
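# The DynamicPooling callback above injects the dynamic-pooling index input
# that MatchPyramid consumes alongside the two texts (hedged: the exact
# input key name, e.g. 'dpool_index', varies by MatchZoo version), which is
# why the val/test data are unpacked through a DataGenerator here rather
# than with DataPack.unpack().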
model.params.update(preprocessor.context)
model.params['task'] = ranking_task
model.params['filters'] = 64
model.params['conv_activation_func'] = 'relu'
model.params['optimizer'] = 'adam'
model.guess_and_fill_missing_params(verbose=0)
model.build()
model.compile()
pred_x, pred_y = valid_processed.unpack()
evaluate = mz.callbacks.EvaluateAllMetrics(model, x=pred_x, y=pred_y,
                                           batch_size=len(pred_y))
# The first prompt feeds `batch_size`, so ask for the batch size rather
# than a number of batches; distinct names also avoid reusing `x` for the
# batch size, the epoch count, and the loop variable.
batch_size = int(input('Enter batch size: '))
data_generator = mz.DataGenerator(train_processed, batch_size=batch_size)
epochs = int(input('Enter number of epochs: '))
history = model.fit_generator(data_generator, epochs=epochs,
                              callbacks=[evaluate],
                              use_multiprocessing=True,
                              workers=max(1, multiprocessing.cpu_count() - 1))
pred_x, pred_y = test_processed.unpack()
result = model.predict(pred_x)
final_res = [row[1] for row in result]
model.params['mlp_num_fan_out'] = 100
model.params['mlp_activation_func'] = 'relu'
model.params['dropout_rate'] = 0.5
model.params['optimizer'] = 'adam'
model.guess_and_fill_missing_params()
model.build()
model.compile()
print("hyper-parameters: ")
hparams = dict(model.params, **hparams)
pprint(hparams)
print("start to train...")
# Train & save the model.
train_generator = mz.DataGenerator(train_pack_processed,
                                   batch_size=hparams['batch_size'],
                                   shuffle=True)
model.fit_generator(train_generator, epochs=hparams['n_epochs'],
                    use_multiprocessing=False, workers=1, verbose=0)
model.save(models_dir)
# Evaluate on the validation dataset (the original comment said "test",
# but the code unpacks the validation pack).
valid_x, valid_y = valid_pack_processed.unpack()
preds = model.predict(valid_x)
y_trues, y_preds = [], []
for pred, y in zip(preds, valid_y):
    y_true = y[1]
    y_trues.append(int(y_true))