def test_save_load(base_preprocessor):
    """Round-trip a preprocessor through save/load and reject overwrites."""
    target = '.tmpdir'
    base_preprocessor.save(target)
    # A freshly saved preprocessor must load back as a truthy object.
    assert mz.load_preprocessor(target)
    # Saving into an already-populated directory must fail loudly.
    with pytest.raises(FileExistsError):
        base_preprocessor.save(target)
    shutil.rmtree(target)
def predict(train_id='test_file'):
    """Score one hard-coded query/document pair with a saved DSSM model.

    Loads the preprocessor and model previously saved under ``train_id``,
    transforms the pair, predicts, and returns the score.

    :param train_id: identifier used when the preprocessor/model were saved.
    :return: dict with a single ``'score'`` float.
    """
    q = 'how did apollo creed die'
    d = "Urban legend states that Apollo Creed's name is a wordplay on the Apostles' Creed , a statement of belief used in Christian churches."
    df = pd.DataFrame(data={'text_left': [q], 'text_right': [d], 'label': [0]})
    preprocessor = mz.load_preprocessor(ROOT_PATH +
                                        'matchzoo_temp_files/preprocessors/' +
                                        train_id + '.dssm_preprocessor')
    predict_pack = mz.pack(df)
    predict_pack_processed = preprocessor.transform(predict_pack)
    model = mz.load_model(ROOT_PATH + 'matchzoo_temp_files/models/' +
                          train_id + '.dssm_model')
    predict_score = float(
        model.predict(predict_pack_processed[:10].unpack()[0])[0][0])
    ret_dict = {'score': predict_score}
    print(ret_dict)
    # BUG FIX: the result was previously computed and printed but never
    # returned, making the function useless to callers.
    return ret_dict
# Example #3
def DSSM(model):
    """Flask endpoint: score a text pair with a saved MatchZoo model.

    Reads ``text1``, ``text2`` and ``train_id`` from the JSON request body,
    loads the matching preprocessor and model from disk, and returns the
    predicted relevance score.

    :param model: model-type name from the route (e.g. ``'dssm'``), used to
        build the preprocessor/model file suffixes.
    :return: JSON response ``{'score': <float>}``.
    """
    request_data = json.loads(request.data.decode('utf-8'))
    q = request_data['text1']
    d = request_data['text2']
    train_id = request_data['train_id']
    df = pd.DataFrame(data={'text_left': [q], 'text_right': [d], 'label': [0]})
    preprocessor_suffix = '.' + model + '_preprocessor'
    preprocessor = mz.load_preprocessor(ROOT_PATH +
                                        'matchzoo_temp_files/preprocessors/' +
                                        train_id + preprocessor_suffix)
    predict_pack = mz.pack(df)
    predict_pack_processed = preprocessor.transform(predict_pack)
    # Drop any stale Keras graph before loading a fresh model.
    keras.backend.clear_session()

    # BUG FIX: the original code rebound the `model` parameter (a string)
    # to the loaded Keras model, losing the model name from that point on.
    # Use a distinct local name instead. (A 26-line dead triple-quoted
    # string of commented-out DRMM handling was removed here.)
    model_suffix = '.' + model + '_model'
    loaded_model = mz.load_model(ROOT_PATH + 'matchzoo_temp_files/models/' +
                                 train_id + model_suffix)
    predict_score = float(
        loaded_model.predict(predict_pack_processed[:10].unpack()[0])[0][0])
    ret_dict = {'score': predict_score}
    return jsonify(ret_dict)
# Example #4
    def _load_basic_preprocessor(self, extra_terms):
        """Load a cached preprocessor from disk or build a fresh default one.

        :param extra_terms: extra vocabulary terms attached to a newly built
            preprocessor (not used when a cached one is loaded).
        :return: ``(preprocessor, save_path)``; ``save_path`` is ``None``
            when a cached preprocessor was loaded (nothing to re-save).
        """
        is_bert = self.model_class == mz.models.Bert
        kind = "bert" if is_bert else "basic"
        cache_path = os.path.join(self.preprocessor_path,
                                  "%s.%s" % (kind, self.dataset.dataset))
        # BERT preprocessors are never loaded from cache.
        if not is_bert and os.path.exists(cache_path):
            preprocessor = mz.load_preprocessor(cache_path)
            print("Load Preprocessor from %s" % cache_path)
            # Cached preprocessor is already fitted; make fit() a no-op.
            preprocessor.fit = lambda *args, **argv: None
            return preprocessor, None
        print("Init Preprocessor")
        preprocessor = self.model_class.get_default_preprocessor(
            truncated_length_left=20,
            truncated_length_right=492 if is_bert else 1024,
            truncated_mode="post")
        preprocessor.multiprocessing = 0 if self.dataset.debug_mode else 1
        preprocessor.extra_terms = extra_terms
        return preprocessor, cache_path
# Directory holding saved ESIM checkpoints for this data split.
# NOTE(review): `split` is defined outside this fragment — confirm it is set.
model_path = f"./model/traversal_path_esim-{split}"

# Binary classification task scored with accuracy.
task = mz.tasks.Classification(num_classes=2)
task.metrics = ['acc']
print("`classification_task` initialized with metrics", task.metrics)
# Most recently modified file in the model directory = latest checkpoint.
best_model = sorted(os.listdir(model_path),
                    key=lambda fn: os.path.getmtime(model_path + '/' + fn))[-1]

# Raw CFQ test split; `data_root` comes from outside this fragment.
test_raw = mz.datasets.cfq.load_data(stage='test',
                                     task=task,
                                     data_root=data_root,
                                     suffix="mask_predict_classification.csv")

print('data loaded as `train_pack_raw` `dev_pack_raw` `test_pack_raw`')
# print(model_path, )
# The preprocessor was saved alongside the model; no re-fitting needed.
preprocessor = mz.load_preprocessor(model_path)
# preprocessor.fit(train_raw)
# train_processed = preprocessor.transform(train_raw)
test_processed = preprocessor.transform(test_raw)

# print(test_processed.frame())

# Pointwise evaluation dataset; shuffle disabled so predictions keep
# the original row order.
testset = mz.dataloader.Dataset(data_pack=test_processed,
                                mode='point',
                                batch_size=1024,
                                shuffle=False)

padding_callback = mz.models.ESIM.get_default_padding_callback()

testloader = mz.dataloader.DataLoader(dataset=testset,
                                      stage='test',
# Example #6
def test_save_load(base_preprocessor):
    """Saving then loading a preprocessor yields a truthy object."""
    tmp_dir = '.tmpdir'
    base_preprocessor.save(tmp_dir)
    loaded = mz.load_preprocessor(tmp_dir)
    assert loaded
    shutil.rmtree(tmp_dir)
# Example #7
 def prepare_test(self, test_pack):
     """Transform a raw test pack with the preprocessor saved at
     ``self.preprocessordir`` and return the processed pack."""
     preprocessor = mz.load_preprocessor(self.preprocessordir)
     test_pack_processed = preprocessor.transform(test_pack)
     return test_pack_processed
    print("Give fold number")
    sys.exit(0)
elif len(sys.argv) < 3:
    print("Give path folder")
    sys.exit(0)

# CLI args: argv[1] = fold number, argv[2] = folder holding fold artifacts
# (validated by the truncated if/elif chain just above this fragment).
path = sys.argv[2]

print("loading embedding ...")
glove_embedding = mz.datasets.embeddings.load_glove_embedding(dimension=300)
print("embedding loaded as `glove_embedding`")

fold = sys.argv[1]

print("Loading fold:  ", fold)
# NOTE(review): plain string concatenation — `path` presumably ends with a
# separator; verify against how the preprocessor was saved.
preprocessor = mz.load_preprocessor(path + "robust_preprocessor_fold_" + fold)

print("preprocessor context:   ", preprocessor.context)

# Pairwise ranking with one negative sample per positive, evaluated at k=20.
ranking_task = mz.tasks.Ranking(loss=mz.losses.RankHingeLoss(num_neg=1))
ranking_task.metrics = [
    mz.metrics.NormalizedDiscountedCumulativeGain(k=20),
    mz.metrics.MeanAveragePrecision(),
    mz.metrics.Precision(k=20)
]

print("ranking task ok")

# Histogram bin size for DRMM's matching histograms.
bin_size = 30
model = mz.models.DRMM()
model.params.update(preprocessor.context)
# NOTE(review): `task` is undefined in this fragment (only `ranking_task`
# exists above) — these prints look like they belong to a different snippet.
print("task is", task)
print("`task` initialized with metrics", task.metrics)

# Either fit a new ESIM preprocessor on the full raw data and cache it,
# or reload a previously cached one. `fit_preprocessor` and `all_data_raw`
# are defined outside this fragment.
if fit_preprocessor:

    preprocessor = mz.models.ESIM.get_default_preprocessor(
        truncated_mode='pre',
        truncated_length_left=64,
        truncated_length_right=256,
        filter_mode='df',
        filter_low_freq=2)  # drop terms appearing in fewer than 2 documents

    preprocessor = preprocessor.fit(all_data_raw)
    preprocessor.save("preprocessor.prep")
else:
    preprocessor = mz.load_preprocessor("preprocessor.prep")

# Candidate dictionary shared by the train/test recall joins.
candidate_dic = pd.read_feather('data/candidate_dic.ftr')

# Enrich recalled train pairs: attach descriptions on the left id and
# candidate info on the right id, then drop exact duplicate rows.
train_recall = pd.read_feather('data/train_recall.ftr')
train_description = pd.read_feather('data/train_description.ftr')
train_recall = (train_recall
                .merge(train_description, how='left', on='id_left')
                .merge(candidate_dic, how='left', on='id_right')
                .drop_duplicates()
                .reset_index(drop=True))
# Description table is no longer needed; free the memory eagerly.
del train_description
gc.collect()

test_recall = pd.read_feather('data/test_recall.ftr')