def doc_emb(args, config):
    print("{} begins {}".format(args.mode, "-" * 10))
    model, query_model, doc_model = create_model(**config)
    load_model(doc_model, args.model_path, "doc.h5")
    test_files = fs_list(args.test_dir)[0]
    dataset = dataset_reader(test_files, shuffle=False, batch_size=128, repeat_num=1)
    dataset_iter = iter(dataset)
    try:
        while True:
            features, label = next(dataset_iter)
            batch = [features['doc0_Input']]
            result = doc_model.predict_on_batch(batch)
            # L2-normalize the embeddings so dot products equal cosine similarity
            norm = np.linalg.norm(result, axis=1, keepdims=True)
            result = result / norm
            for res in result:
                print("\t".join(["{}".format(r) for r in res]))
    except (StopIteration, tf.errors.OutOfRangeError):
        # end of dataset
        # traceback.print_exc()
        pass
    print("{} ends {}".format(args.mode, "-" * 10))
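
# Illustrative sketch, not part of the original pipeline: doc_emb (and
# sub_model_test below) inline the same row-wise L2 normalization so that a
# plain dot product between a query embedding and a doc embedding equals their
# cosine similarity. The helper name is hypothetical; it relies on the
# module-level numpy import (np) that the functions above already assume.
def l2_normalize_rows_sketch(mat):
    """Row-wise L2 normalization, mirroring the normalization in doc_emb."""
    norm = np.linalg.norm(mat, axis=1, keepdims=True)
    return mat / norm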
def predict(args, config):
    print("{} begins {}".format(args.mode, "-" * 10))
    model, query_model, doc_model = create_model(**config)
    load_model(model, args.model_path, "main.h5")
    test_files = fs_list(args.test_dir)[0]
    dataset = dataset_reader(test_files, shuffle=False, batch_size=128, repeat_num=1)
    dataset_iter = iter(dataset)
    try:
        while True:
            features, label = next(dataset_iter)
            # assemble the multi-input batch: query first, then each candidate doc
            batch = [features['query_Input']]
            for i in range(config.get("doc_num", 5)):
                batch.append(features['doc{}_Input'.format(i)])
            result = model.predict_on_batch(batch)
            for res in result:
                print("\t".join(["{}".format(r) for r in res]))
    except (StopIteration, tf.errors.OutOfRangeError):
        # end of dataset
        # traceback.print_exc()
        pass
    print("{} ends {}".format(args.mode, "-" * 10))
def doc_emb_raw(args, config):
    print("{} begins {}".format(args.mode, "-" * 10))
    model, query_model, doc_model = create_model(**config)
    load_model(doc_model, args.model_path, "doc.h5")
    test_files = fs_list(args.test_dir)[0]
    for filename in test_files:
        with open(filename) as fp:
            for line in fp:
                features, label = parser_raw_line(line)
                # print(json.dumps(features))
                doc_input = features["doc0_Input"]
                emb = doc_model.predict_on_batch([[doc_input]])
                # TODO normalize the result
                print("\t".join([str(x) for x in emb[0].tolist()]))
    print("{} ends {}".format(args.mode, "-" * 10))
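
# Illustrative sketch (assumption): doc_emb_raw scores one line at a time,
# which keeps the code simple but gives up batch throughput. A batched variant
# could accumulate parsed inputs and call predict_on_batch per chunk; the
# function name and the default batch size of 128 below are hypothetical.
def doc_emb_raw_batched_sketch(doc_model, lines, batch_size=128):
    """Yield embedding batches for an iterable of raw lines."""
    buf = []
    for line in lines:
        features, _ = parser_raw_line(line)
        buf.append(features["doc0_Input"])
        if len(buf) == batch_size:
            yield doc_model.predict_on_batch([buf])
            buf = []
    if buf:
        # flush the final partial batch
        yield doc_model.predict_on_batch([buf])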
def sub_model_test(args, config):
    model, query_model, doc_model = create_model(**config)
    load_model(model, args.model_path, "main.h5")
    load_model(query_model, args.model_path, "query.h5")
    load_model(doc_model, args.model_path, "doc.h5")
    test_files = fs_list(args.test_dir)[0]
    dataset = dataset_reader(test_files, shuffle=False, batch_size=128, repeat_num=1)
    dataset_iter = iter(dataset)
    try:
        while True:
            print("*" * 10)
            features, label = next(dataset_iter)
            batch = [features['query_Input']]
            for i in range(config.get("doc_num", 5)):
                batch.append(features['doc{}_Input'.format(i)])
            # scores produced by the full model
            score_result = model.predict_on_batch(batch)
            for res in score_result:
                print("\t".join(["{}".format(r) for r in res]))
            # recompute the scores from the query and doc sub-models
            query_emb = query_model.predict_on_batch(batch[0])
            # L2-normalize the query embeddings
            query_emb_n = np.linalg.norm(query_emb, axis=1, keepdims=True)
            query_emb = query_emb / query_emb_n
            batch_size = query_emb.shape[0]
            distance_arr = np.empty(shape=[0, batch_size], dtype=np.float64)
            for b in batch[1:]:
                doc_emb = doc_model.predict_on_batch(b)
                # L2-normalize the doc embeddings
                doc_emb_n = np.linalg.norm(doc_emb, axis=1, keepdims=True)
                doc_emb = doc_emb / doc_emb_n
                # cosine similarity of each query with its paired doc
                distance_mat = query_emb.dot(doc_emb.transpose((1, 0)))
                distances = np.diagonal(distance_mat)
                distances = np.expand_dims(distances, axis=0)
                distance_arr = np.concatenate((distance_arr, distances), axis=0)
            print(softmax(distance_arr.T))
            print("*" * 10)
    except (StopIteration, tf.errors.OutOfRangeError):
        # end of dataset
        pass
    except Exception:
        traceback.print_exc()
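
# Illustrative sketch: sub_model_test assumes a row-wise `softmax` helper that
# is defined elsewhere in the repo. The version below shows the assumed
# behavior (one probability distribution per row of distance_arr.T); the name
# is suffixed with _sketch to avoid shadowing the real helper.
def softmax_rowwise_sketch(x):
    """Numerically stable softmax over the last axis."""
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exp = np.exp(shifted)
    return exp / np.sum(exp, axis=-1, keepdims=True)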
def train(args, config):
    print("{} begins {}".format(args.mode, "-" * 10))
    if not args.pretrain:
        clear_model_dir(args)
    print("model config is {}".format(json.dumps(config)))

    # define and compile the model
    # TODO distributed training
    mirrored_strategy = tf.distribute.MirroredStrategy(devices=["/gpu:0", "/gpu:1"])
    # with mirrored_strategy.scope():
    if True:
        model, query_model, doc_model = create_model(**config)
        optimizer = tf.keras.optimizers.Adam(learning_rate=0.02, beta_1=0.9,
                                             beta_2=0.999, decay=0.999,
                                             amsgrad=False)
        metrics = create_metrics(**config)
        loss = create_loss(**config)
        model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    # train the model
    train_files = fs_list(args.train_dir)[0]
    dataset = dataset_reader(train_files, shuffle=True, batch_size=128, repeat_num=200)

    # training callbacks: checkpointing and TensorBoard logging
    cp_callback = tf.keras.callbacks.ModelCheckpoint(args.checkpoint_path,
                                                     save_weights_only=True,
                                                     verbose=1)
    tensorboard_cbk = tf.keras.callbacks.TensorBoard(log_dir=args.tensorboard_log)
    callbacks = [cp_callback, tensorboard_cbk]

    epochs = 2
    steps_per_epoch = 256
    history = model.fit(dataset, epochs=epochs, steps_per_epoch=steps_per_epoch,
                        callbacks=callbacks)

    save_model(model, query_model, doc_model, args.model_path)
    print("{} ends {}".format(args.mode, "-" * 10))
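
# Illustrative sketch (assumption): each mode function takes (args, config) and
# reads args.mode, so they are presumably dispatched from an entry point along
# these lines. The mapping keys and the function name are hypothetical; the
# real CLI wiring is not shown in this file.
def dispatch_mode_sketch(args, config):
    handlers = {
        "train": train,
        "predict": predict,
        "doc_emb": doc_emb,
        "doc_emb_raw": doc_emb_raw,
        "sub_model_test": sub_model_test,
    }
    handlers[args.mode](args, config)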