def arci_api(qpool, logdir, dataset_path, train_id, parameter):
    """Train an ArcI ranking model and persist its preprocessor and weights.

    Args:
        qpool: file-like sink; receives training stdout (Keras progress).
        logdir: file-like sink; receives preprocessing stderr output.
        dataset_path: unused here; kept for interface parity with the other
            ``*_api`` entry points.
        train_id: identifier used to name log/preprocessor/model artifacts.
        parameter: dict of hyper-parameters and dataset-selection flags.
    """
    keras.backend.clear_session()

    # Load the data and create the preprocessor.
    train_pack = load_train_data(train_id, parameter['existing_dataset'],
                                 parameter['task'])
    predict_pack = load_test_data(train_id, parameter['existing_dataset'],
                                  parameter['task'])
    preprocessor = mz.preprocessors.BasicPreprocessor(
        fixed_length_left=10, fixed_length_right=100, remove_stop_words=False)

    # Redirect stderr to the log object so preprocessing progress is captured.
    # try/finally guarantees the redirection is undone even when
    # fit_transform raises (the original leaked the redirection on error).
    logdir.set_preprocess_id(train_id)
    err_old = sys.stderr
    sys.stderr = logdir
    try:
        train_pack_processed = preprocessor.fit_transform(train_pack)
    finally:
        sys.stderr = err_old
    preprocessor.save(ROOT_PATH + 'matchzoo_temp_files/preprocessors/' +
                      train_id + '.arci_preprocessor')
    predict_pack_processed = preprocessor.transform(predict_pack)
    with open(ROOT_PATH + 'matchzoo_temp_files/logger/' + train_id +
              '.preprocess_log', 'a') as f:
        f.write('Preprocess finished!')

    ranking_task = mz.tasks.Ranking(loss=mz.losses.RankHingeLoss())
    ranking_task.metrics = [
        mz.metrics.NormalizedDiscountedCumulativeGain(k=3),
        mz.metrics.NormalizedDiscountedCumulativeGain(k=5),
        mz.metrics.MeanAveragePrecision()
    ]

    model = mz.models.ArcI()
    model.params['input_shapes'] = preprocessor.context['input_shapes']
    model.params['task'] = ranking_task
    model.params['embedding_input_dim'] = preprocessor.context['vocab_size']
    model.params['embedding_output_dim'] = parameter['embedding_output_dim']
    model.params['num_blocks'] = parameter['num_blocks']
    # ArcI takes per-block lists; a single scalar is wrapped in a list here.
    model.params['left_filters'] = [parameter['left_filters']]
    model.params['left_kernel_sizes'] = [parameter['left_kernel_sizes']]
    model.params['left_pool_sizes'] = [parameter['left_pool_sizes']]
    model.params['right_filters'] = [parameter['right_filters']]
    model.params['right_kernel_sizes'] = [parameter['right_kernel_sizes']]
    model.params['right_pool_sizes'] = [parameter['right_pool_sizes']]
    model.params['conv_activation_func'] = 'relu'
    model.params['mlp_num_layers'] = parameter['mlp_num_layers']
    model.params['mlp_num_units'] = parameter['mlp_num_units']
    model.params['mlp_num_fan_out'] = parameter['mlp_num_fan_out']
    model.params['mlp_activation_func'] = 'relu'
    # NOTE(review): 0.9 is an unusually aggressive dropout rate — confirm
    # this value is intentional.
    model.params['dropout_rate'] = 0.9
    model.params['optimizer'] = 'adadelta'
    model.guess_and_fill_missing_params()
    model.build()
    model.compile()
    model.backend.summary()

    glove_embedding = mz.datasets.embeddings.load_glove_embedding(
        dimension=300)
    embedding_matrix = glove_embedding.build_matrix(
        preprocessor.context['vocab_unit'].state['term_index'])
    model.load_embedding_matrix(embedding_matrix)

    pred_x, pred_y = predict_pack_processed[:].unpack()
    # One batch covering the whole evaluation set.
    evaluate = mz.callbacks.EvaluateAllMetrics(model, x=pred_x, y=pred_y,
                                               batch_size=len(pred_y))
    train_generator = mz.PairDataGenerator(train_pack_processed, num_dup=2,
                                           num_neg=1, batch_size=20)

    # Redirect stdout into the queue pool so training progress can be
    # streamed; restored even if fit_generator raises.
    qpool.set_trainid(train_id)
    old = sys.stdout
    sys.stdout = qpool
    try:
        model.fit_generator(train_generator, epochs=parameter['epochs'],
                            callbacks=[evaluate], workers=5,
                            use_multiprocessing=False)
    finally:
        sys.stdout = old
    model.save(ROOT_PATH + 'matchzoo_temp_files/models/' + train_id +
               '.arci_model')
def train(train_id='test_file'):
    """Train a DSSM model on a 1000-sample slice of WikiQA and save it.

    Args:
        train_id: identifier used to name the saved preprocessor and model.
    """
    train_pack = mz.datasets.wiki_qa.load_data(stage='train')[:1000]
    dev_pack = mz.datasets.wiki_qa.load_data(stage='dev')[:1000]
    # NOTE(review): predict_pack is loaded but never used below — dead data
    # load kept for behavioural parity; consider removing.
    predict_pack = mz.datasets.wiki_qa.load_data(
        stage='test').drop_label()[:1000]

    preprocessor = mz.preprocessors.DSSMPreprocessor()
    preprocessor.fit(train_pack)
    preprocessor.save(ROOT_PATH + 'matchzoo_temp_files/preprocessors/' +
                      train_id + '.dssm_preprocessor')
    train_pack_processed = preprocessor.transform(train_pack)
    dev_pack_processed = preprocessor.transform(dev_pack)

    train_generator = mz.PairDataGenerator(train_pack_processed, num_dup=5,
                                           num_neg=1, batch_size=32)
    ranking_task = mz.tasks.Ranking(
        loss=mz.losses.RankHingeLoss(num_neg=1, margin=1.0))
    ranking_task.metrics = [
        'mae', 'map', 'precision',
        mz.metrics.Precision(k=3),
        mz.metrics.DiscountedCumulativeGain(k=1),
        mz.metrics.DiscountedCumulativeGain(k=3),
        mz.metrics.DiscountedCumulativeGain(k=5),
        mz.metrics.NormalizedDiscountedCumulativeGain(k=1),
        mz.metrics.NormalizedDiscountedCumulativeGain(k=3),
        mz.metrics.NormalizedDiscountedCumulativeGain(k=5)
    ]

    model = mz.models.DSSMModel()
    model.params['task'] = ranking_task
    model.params['input_shapes'] = preprocessor.context['input_shapes']
    model.guess_and_fill_missing_params()
    model.build()
    model.compile()

    dev_x, dev_y = dev_pack_processed.unpack()
    # NOTE(review): sibling functions use mz.callbacks.EvaluateAllMetrics;
    # confirm model.EvaluateOnCall exists on this MatchZoo version.
    evaluate = model.EvaluateOnCall(model, x=dev_x, y=dev_y, valid_steps=2,
                                    batch_size=32)

    # Point-wise fit followed by a pair-wise generator pass.
    model.fit(*train_pack_processed.unpack(), epochs=10, batch_size=32,
              callbacks=[evaluate])
    model.fit_generator(train_generator, epochs=5, callbacks=[evaluate],
                        workers=4, use_multiprocessing=True)
    # Save only after ALL training has finished — the original saved before
    # the fit_generator pass, so those 5 epochs were missing from the
    # persisted artifact.
    model.save(ROOT_PATH + 'matchzoo_temp_files/models/' + train_id +
               '.dssm_model')
def tutorial():
    """Build, embed, and train a KNRM ranking model on cached data.

    Returns:
        The Keras ``History`` object produced by ``fit_generator``.
    """
    # data = get_processed_data()
    # data = knrm_processed()
    print("Loading data")
    data = get_processed_data_from_cache()
    preprocessor, train_processed, valid_processed = data
    # save_to_pickle(data, "matchzoo_prac1")

    print("Defining task")
    ranking_task = mz.tasks.Ranking(loss=mz.losses.RankCrossEntropyLoss(
        num_neg=4))
    ranking_task.metrics = [
        mz.metrics.NormalizedDiscountedCumulativeGain(k=3),
        mz.metrics.MeanAveragePrecision()
    ]

    glove_embedding = mz.datasets.embeddings.load_glove_embedding(
        dimension=300)
    print('output_dim', glove_embedding.output_dim)

    # Initialize the model, fine-tune the hyper-parameters.
    print("building model")
    model = mz.models.KNRM()
    model.params.update(preprocessor.context)
    model.params['task'] = ranking_task
    model.params['embedding_output_dim'] = glove_embedding.output_dim
    model.params['embedding_trainable'] = True
    model.params['kernel_num'] = 21
    model.params['sigma'] = 0.1
    model.params['exact_sigma'] = 0.001
    model.params['optimizer'] = 'adadelta'
    model.build()
    model.compile()

    embedding_matrix = glove_embedding.build_matrix(
        preprocessor.context['vocab_unit'].state['term_index'])
    print(embedding_matrix.shape)
    # Normalize the word embedding for fast histogram generating.
    l2_norm = np.sqrt((embedding_matrix * embedding_matrix).sum(axis=1))
    print(l2_norm.shape)
    embedding_matrix = embedding_matrix / l2_norm[:, np.newaxis]
    print(embedding_matrix.shape)
    model.load_embedding_matrix(embedding_matrix)

    print("defining generator")
    train_generator = mz.PairDataGenerator(train_processed, num_dup=1,
                                           num_neg=4, batch_size=64,
                                           shuffle=True)
    valid_x, valid_y = valid_processed.unpack()
    # BUGFIX: batch over the number of samples (len(valid_y)); the original
    # used len(valid_x), which is the number of input KEYS in the unpacked
    # mapping, not the sample count — every sibling block sizes the batch
    # from the label array.
    evaluate = mz.callbacks.EvaluateAllMetrics(model, x=valid_x, y=valid_y,
                                               batch_size=len(valid_y))

    print("fitting")
    history = model.fit_generator(train_generator, epochs=20,
                                  callbacks=[evaluate], workers=5,
                                  use_multiprocessing=False)
    # Return the history instead of discarding it (callers ignoring the
    # return value are unaffected).
    return history
def cdssm_api(qpool, logdir, dataset_path, train_id, parameter):
    """Train a CDSSM ranking model and persist its preprocessor and weights.

    Args:
        qpool: file-like sink; receives training stdout (Keras progress).
        logdir: file-like sink; receives preprocessing stderr output.
        dataset_path: unused here; kept for interface parity with the other
            ``*_api`` entry points.
        train_id: identifier used to name log/preprocessor/model artifacts.
        parameter: dict of hyper-parameters and dataset-selection flags.
    """
    keras.backend.clear_session()

    # Load the data and create the preprocessor.
    train_pack = load_train_data(train_id, parameter['existing_dataset'],
                                 parameter['task'])
    predict_pack = load_test_data(train_id, parameter['existing_dataset'],
                                  parameter['task'])
    preprocessor = mz.preprocessors.CDSSMPreprocessor()

    # Redirect stderr to the log object so preprocessing progress is captured.
    # try/finally guarantees the redirection is undone even when
    # fit_transform raises (the original leaked the redirection on error).
    logdir.set_preprocess_id(train_id)
    err_old = sys.stderr
    sys.stderr = logdir
    try:
        train_pack_processed = preprocessor.fit_transform(train_pack)
    finally:
        sys.stderr = err_old
    preprocessor.save(ROOT_PATH + 'matchzoo_temp_files/preprocessors/' +
                      train_id + '.cdssm_preprocessor')
    predict_pack_processed = preprocessor.transform(predict_pack)
    with open(ROOT_PATH + 'matchzoo_temp_files/logger/' + train_id +
              '.preprocess_log', 'a') as f:
        f.write('Preprocess finished!')

    ranking_task = mz.tasks.Ranking(
        loss=mz.losses.RankCrossEntropyLoss(num_neg=4))
    ranking_task.metrics = [
        mz.metrics.NormalizedDiscountedCumulativeGain(k=3),
        mz.metrics.NormalizedDiscountedCumulativeGain(k=5),
        mz.metrics.MeanAveragePrecision()
    ]

    model = mz.models.CDSSM()
    model.params['input_shapes'] = preprocessor.context['input_shapes']
    model.params['task'] = ranking_task
    model.params['filters'] = parameter['filters']
    model.params['kernel_size'] = parameter['kernel_size']
    model.params['strides'] = parameter['strides']
    model.params['padding'] = parameter['padding']
    model.params['conv_activation_func'] = parameter['conv_activation_func']
    model.params['w_initializer'] = parameter['w_initializer']
    model.params['b_initializer'] = parameter['b_initializer']
    model.params['mlp_num_layers'] = parameter['mlp_num_layers']
    model.params['mlp_num_units'] = parameter['mlp_num_units']
    model.params['mlp_num_fan_out'] = parameter['mlp_num_fan_out']
    model.params['mlp_activation_func'] = parameter['mlp_activation_func']
    # NOTE(review): 0.8 is an unusually aggressive dropout rate — confirm
    # this value is intentional.
    model.params['dropout_rate'] = 0.8
    model.params['optimizer'] = 'adadelta'
    # Fill remaining params once (the original called this twice in a row).
    model.guess_and_fill_missing_params()
    model.build()
    model.compile()
    model.backend.summary()

    pred_x, pred_y = predict_pack_processed[:].unpack()
    evaluate = mz.callbacks.EvaluateAllMetrics(model, x=pred_x, y=pred_y,
                                               batch_size=len(pred_x))
    train_generator = mz.PairDataGenerator(train_pack_processed, num_dup=1,
                                           num_neg=4, batch_size=64,
                                           shuffle=True)

    # Redirect stdout into the queue pool so training progress can be
    # streamed; restored even if fit_generator raises.
    qpool.set_trainid(train_id)
    old = sys.stdout
    sys.stdout = qpool
    try:
        model.fit_generator(train_generator, epochs=parameter['epochs'],
                            callbacks=[evaluate], workers=5,
                            use_multiprocessing=False)
    finally:
        sys.stdout = old
    model.save(ROOT_PATH + 'matchzoo_temp_files/models/' + train_id +
               '.cdssm_model')
# Train the (previously built) model with early stopping, checkpointing and
# full-metric evaluation, and report wall-clock training time.
# NOTE(review): `model`, `val_x`, `val_y` and `train_processed` are defined
# outside this fragment — presumably a processed validation split and a
# compiled MatchZoo model; confirm against the surrounding code.

# Stop when val_loss has not improved by at least 0.001 for 100 epochs.
callback_earlystopping = keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=100, min_delta=0.001)
# Keep only the best-val_loss weights on disk under 'best_one_cdssm'.
mcp_save = keras.callbacks.ModelCheckpoint('best_one_cdssm', save_best_only=True, monitor='val_loss', mode='min')
# Evaluate every ranking metric on the whole validation set in one batch.
evaluate = mz.callbacks.EvaluateAllMetrics(model, x=val_x, y=val_y, batch_size=len(val_y))
# Pair-wise generator: 2 duplicates per positive, 2 negatives per pair.
data_generator = mz.PairDataGenerator(train_processed, num_dup=2, num_neg=2, batch_size=128)

start_time = time.time()
# epochs=1000 is an upper bound; early stopping normally ends training sooner.
model.fit_generator(data_generator, epochs=1000, validation_data=(val_x, val_y), callbacks=[evaluate, callback_earlystopping, mcp_save], verbose=2)
print(
    "===================================================== Training time ====================================================="
)
print("--- %s seconds ---" % (time.time() - start_time))
print(
    "========================================================================================================================="
)
# Configure the MLP head of the (previously created) model, build/compile it,
# then train with a pair-wise generator while evaluating on the validation
# pack. `model`, `valid_pack_processed` and `train_pack_processed` come from
# earlier in the file (notebook-style script — note the `# In[8]:` marker).
model.params['mlp_num_layers'] = 3
model.params['mlp_num_units'] = 300
model.params['mlp_num_fan_out'] = 128
model.params['mlp_activation_func'] = 'relu'
model.guess_and_fill_missing_params()
model.build()
model.compile()
model.backend.summary()

# In[8]:

# Unpack the full validation pack; evaluate it as a single batch.
pred_x, pred_y = valid_pack_processed[:].unpack()
# print(pred_x,pred_y)
evaluate = mz.callbacks.EvaluateAllMetrics(model, x=pred_x, y=pred_y, batch_size=len(pred_x))
train_generator = mz.PairDataGenerator(train_pack_processed, num_dup=1, num_neg=4, batch_size=32, shuffle=True)
# NOTE(review): bare expression — a notebook cell leftover that displayed the
# generator length; it has no effect in a script.
len(train_generator)
history = model.fit_generator(train_generator, epochs=100, callbacks=[evaluate], workers=5, use_multiprocessing=False)
# Finalize, train, evaluate and persist the (previously configured) model.
# `model`, `embedding`, `preprocessor`, the processed packs, the score lists
# (`mrrscores`, `mapscores`, `ndcgscores`) and the save paths are all defined
# outside this fragment — presumably a cross-validation loop; confirm against
# the surrounding code.
model.guess_and_fill_missing_params()
model.build()
model.compile()
model.backend.summary()

# Load pretrained word vectors keyed by the preprocessor's term index.
matrix = embedding.build_matrix(
    preprocessor.context['vocab_unit'].state['term_index'])
model.load_embedding_matrix(matrix)

pred_x, pred_y = predict_pack_processed[:].unpack()
# Evaluate the whole prediction set in a single batch.
evaluate = mz.callbacks.EvaluateAllMetrics(model, x=pred_x, y=pred_y, batch_size=len(pred_y))
train_generator = mz.PairDataGenerator(train_pack_processed, num_dup=2, num_neg=1, batch_size=20)
history = model.fit_generator(train_generator, epochs=30, callbacks=[evaluate], workers=30, use_multiprocessing=True)

# evaluate the model
scores = model.evaluate(pred_x, pred_y, batch_size=len(pred_y))
# Accumulate percentage scores for this fold/run into the outer lists.
mrrscores.append(scores[mz.engine.parse_metric('mrr')] * 100)
mapscores.append(scores[mz.metrics.MeanAveragePrecision()] * 100)
ndcgscores.append(
    scores[mz.metrics.NormalizedDiscountedCumulativeGain(k=1)] * 100)
model.save(model_path)
preprocessor.save(pre_path)
# NOTE(review): the statement below is truncated at this chunk boundary —
# its right-hand operand continues outside the visible source.
print("\n>>> Resultat mrr: %.2f%% (+/- %.2f%%)" %