Example #1
def doc_emb(args, config):
    print("{} begins {}".format(args.mode, "-" * 10))
    model, query_model, doc_model = create_model(**config)
    load_model(doc_model, args.model_path, "doc.h5")
    test_files = fs_list(args.test_dir)[0]
    dataset = dataset_reader(test_files,
                             shuffle=False,
                             batch_size=128,
                             repeat_num=1)
    dataset_iter = iter(dataset)
    try:
        while True:
            features, label = next(dataset_iter)
            batch = [features['doc0_Input']]
            result = doc_model.predict_on_batch(batch)
            # make sure to normalize the embedding
            norm = np.linalg.norm(result, axis=1, keepdims=True)
            result = result / norm
            for res in result:
                print("\t".join(["{}".format(r) for r in res]))
    except (StopIteration, tf.errors.OutOfRangeError):
        # iterator exhausted: the dataset is read exactly once (repeat_num=1)
        pass
    print("{} ends   {}".format(args.mode, "-" * 10))
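Note that create_model, load_model, fs_list and dataset_reader are project helpers this page does not show. As a hypothetical sketch only, a load_model(model, model_path, filename) consistent with how it is called in these examples could simply restore Keras weights from a file under model_path:

import os

def load_model(model, model_path, filename):
    # hypothetical helper: restore previously saved weights for this sub-model
    model.load_weights(os.path.join(model_path, filename))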
Example #2
def predict(args, config):
    print("{} begins {}".format(args.mode, "-" * 10))
    model, query_model, doc_model = create_model(**config)
    load_model(model, args.model_path, "main.h5")
    test_files = fs_list(args.test_dir)[0]
    dataset = dataset_reader(test_files,
                             shuffle=False,
                             batch_size=128,
                             repeat_num=1)
    dataset_iter = iter(dataset)
    try:
        while True:
            features, label = next(dataset_iter)
            batch = [features['query_Input']]
            for i in range(config.get("doc_num", 5)):
                batch.append(features['doc{}_Input'.format(i)])
            result = model.predict_on_batch(batch)
            for res in result:
                print("\t".join(["{}".format(r) for r in res]))
    except (StopIteration, tf.errors.OutOfRangeError):
        # iterator exhausted: the dataset is read exactly once (repeat_num=1)
        pass

    print("{} ends   {}".format(args.mode, "-" * 10))
Example #3
def doc_emb_raw(args, config):
    print("{} begins {}".format(args.mode, "-" * 10))
    model, query_model, doc_model = create_model(**config)
    load_model(doc_model, args.model_path, "doc.h5")
    test_files = fs_list(args.test_dir)[0]
    for filename in test_files:
        with open(filename) as fp:
            for line in fp:
                features, label = parser_raw_line(line)
                #print(json.dumps(features))
                doc_input = features["doc0_Input"]
                emb = doc_model.predict_on_batch([[doc_input]])
                # normalize the embedding, matching doc_emb above
                emb = emb / np.linalg.norm(emb, axis=1, keepdims=True)
                print("\t".join([str(x) for x in emb[0].tolist()]))
    print("{} ends   {}".format(args.mode, "-" * 10))
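The row-wise L2 normalization used in these embedding examples can be sanity-checked with plain numpy: after dividing by the per-row norm, every row is a unit vector.

import numpy as np

emb = np.array([[3.0, 4.0], [1.0, 0.0]])
unit = emb / np.linalg.norm(emb, axis=1, keepdims=True)
print(np.linalg.norm(unit, axis=1))  # -> [1. 1.]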
Example #4
def sub_model_test(args, config):
    model, query_model, doc_model = create_model(**config)
    load_model(model, args.model_path, "main.h5")
    load_model(query_model, args.model_path, "query.h5")
    load_model(doc_model, args.model_path, "doc.h5")
    test_files = fs_list(args.test_dir)[0]
    dataset = dataset_reader(test_files,
                             shuffle=False,
                             batch_size=128,
                             repeat_num=1)
    dataset_iter = iter(dataset)
    try:
        while True:
            print("*" * 10)
            features, label = next(dataset_iter)
            batch = [features['query_Input']]
            for i in range(config.get("doc_num", 5)):
                batch.append(features['doc{}_Input'.format(i)])
            score_result = model.predict_on_batch(batch)
            for res in score_result:
                print("\t".join(["{}".format(r) for r in res]))
            query_emb = query_model.predict_on_batch(batch[0])
            query_emb_n = np.linalg.norm(query_emb, axis=1, keepdims=True)
            # make sure to normalize the embedding
            query_emb = query_emb / query_emb_n
            batch_size = query_emb.shape[0]
            distance_arr = np.empty(shape=[0, batch_size], dtype=np.float64)
            for b in batch[1:]:
                doc_emb = doc_model.predict_on_batch(b)
                doc_emb_n = np.linalg.norm(doc_emb, axis=1, keepdims=True)
                # make sure to normalize the embedding
                doc_emb = doc_emb / doc_emb_n
                # the cosine similarity of each query with its paired doc is
                # the diagonal of the (batch x batch) dot-product matrix
                distance_mat = query_emb.dot(doc_emb.T)
                distances = np.expand_dims(np.diagonal(distance_mat), axis=0)
                distance_arr = np.concatenate((distance_arr, distances),
                                              axis=0)
            print(softmax(distance_arr.T))
            print("*" * 10)
    except (StopIteration, tf.errors.OutOfRangeError):
        # iterator exhausted
        pass
    except Exception:
        traceback.print_exc()
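sub_model_test calls a softmax helper that is not a numpy builtin and is not shown on this page. A numerically stable row-wise version matching how it is applied to the (batch, doc_num) score matrix might look as follows; this is a sketch, not necessarily the project's own implementation:

import numpy as np

def softmax(x, axis=-1):
    # subtract the per-row max before exponentiating for numerical stability
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exp = np.exp(shifted)
    return exp / np.sum(exp, axis=axis, keepdims=True)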
Example #5
def train(args, config):
    print("{} begins {}".format(args.mode, "-" * 10))
    if not args.pretrain:
        clear_model_dir(args)
    print("model config is {}".format(json.dumps(config)))
    # define and compile model
    # TODO distributed training
    # NOTE: the MirroredStrategy is created but its scope is commented out,
    # so training currently runs without distribution; "if True:" just keeps
    # the block's indentation so the scope can be re-enabled easily
    mirrored_strategy = tf.distribute.MirroredStrategy(
        devices=["/gpu:0", "/gpu:1"])
    # with mirrored_strategy.scope():
    if True:
        model, query_model, doc_model = create_model(**config)
        optimizer = tf.keras.optimizers.Adam(learning_rate=0.02,
                                             beta_1=0.9,
                                             beta_2=0.999,
                                             epsilon=1e-07,
                                             decay=0.999,
                                             amsgrad=False)
        metrics = create_metrics(**config)
        loss = create_loss(**config)
        model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
    # train model
    train_files = fs_list(args.train_dir)[0]
    dataset = dataset_reader(train_files,
                             shuffle=True,
                             batch_size=128,
                             repeat_num=200)
    # define train callbacks
    cp_callback = tf.keras.callbacks.ModelCheckpoint(args.checkpoint_path,
                                                     save_weights_only=True,
                                                     verbose=1)
    tensorboard_cbk = tf.keras.callbacks.TensorBoard(
        log_dir=args.tensorboard_log)
    callbacks = [cp_callback, tensorboard_cbk]
    epochs = 2
    steps_per_epoch = 256
    history = model.fit(dataset,
                        epochs=epochs,
                        steps_per_epoch=steps_per_epoch,
                        callbacks=callbacks)
    save_model(model, query_model, doc_model, args.model_path)
    print("{} ends   {}".format(args.mode, "-" * 10))
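save_model is likewise not shown on this page. Judging from the filenames the other examples load ("main.h5", "query.h5", "doc.h5"), a minimal sketch could persist each sub-model's weights under model_path:

import os

def save_model(model, query_model, doc_model, model_path):
    # hypothetical counterpart to load_model above: one weight file per sub-model
    model.save_weights(os.path.join(model_path, "main.h5"))
    query_model.save_weights(os.path.join(model_path, "query.h5"))
    doc_model.save_weights(os.path.join(model_path, "doc.h5"))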