Example #1
def test_duet(train_data_processed, task, train_generator,
              valid_data_processed, test_data_processed, preprocessor):
    """Test DUET model."""
    # Create a duet model
    duet = mz.models.DUET()
    input_shapes = preprocessor.context['input_shapes']
    embed_dimension = preprocessor.context['vocab_size'] + 1
    duet.params['input_shapes'] = input_shapes
    duet.params['task'] = task
    duet.params['embedding_input_dim'] = embed_dimension
    duet.params['embedding_output_dim'] = 10
    duet.params['lm_filters'] = 32
    duet.params['lm_hidden_sizes'] = [16]
    duet.params['dm_filters'] = 32
    duet.params['dm_kernel_size'] = 3
    duet.params['dm_hidden_sizes'] = [16]
    duet.params['dropout_rate'] = 0.5
    duet.params['activation_func'] = 'relu'
    duet.guess_and_fill_missing_params()
    duet.build()
    duet.compile()

    x_valid, y_valid = valid_data_processed.unpack()
    valid_eval = mz.callbacks.EvaluateAllMetrics(duet, x_valid, y_valid)
    duet.fit_generator(train_generator, epochs=1, callbacks=[valid_eval])
    duet.save('.tmpdir')

    try:
        duet = mz.load_model('.tmpdir')
        x, y = test_data_processed.unpack()
        results = duet.evaluate(x, y)
        assert len(results) > 0
    finally:
        shutil.rmtree('.tmpdir')
Example #2
def test_drmmtks(train_data_processed, task, train_generator,
                 valid_data_processed, test_data_processed, preprocessor):
    """Test DRMMTKS model."""
    # Create a drmmtks model
    drmmtks_model = mz.models.DRMMTKS()
    input_shapes = preprocessor.context['input_shapes']
    embed_dimension = preprocessor.context['vocab_size'] + 1
    drmmtks_model.params['input_shapes'] = input_shapes
    drmmtks_model.params['task'] = task
    drmmtks_model.params['top_k'] = 10
    drmmtks_model.params['embedding_input_dim'] = embed_dimension
    drmmtks_model.params['embedding_output_dim'] = 10
    drmmtks_model.params['mlp_num_layers'] = 1
    drmmtks_model.params['mlp_num_units'] = 5
    drmmtks_model.params['mlp_num_fan_out'] = 1
    drmmtks_model.params['mlp_activation_func'] = 'relu'
    drmmtks_model.guess_and_fill_missing_params()
    drmmtks_model.build()
    drmmtks_model.compile()

    x_valid, y_valid = valid_data_processed.unpack()
    valid_eval = mz.callbacks.EvaluateAllMetrics(drmmtks_model, x_valid,
                                                 y_valid)
    drmmtks_model.fit_generator(train_generator,
                                epochs=1,
                                callbacks=[valid_eval])
    drmmtks_model.save('.tmpdir')

    try:
        drmmtks_model = mz.load_model('.tmpdir')
        x, y = test_data_processed.unpack()
        results = drmmtks_model.evaluate(x, y)
        assert len(results) > 0
    finally:
        shutil.rmtree('.tmpdir')
Example #3
def test_mvlstm(train_data_processed, task, train_generator,
                valid_data_processed, test_data_processed, preprocessor):
    """Test MVLSTM model."""
    # Create a mvlstm model
    mvlstm = mz.models.MVLSTM()
    input_shapes = preprocessor.context['input_shapes']
    embed_dimension = preprocessor.context['vocab_size'] + 1
    mvlstm.params['input_shapes'] = input_shapes
    mvlstm.params['task'] = task
    mvlstm.params['embedding_input_dim'] = embed_dimension
    mvlstm.params['embedding_output_dim'] = 10
    mvlstm.params['lstm_units'] = 10
    mvlstm.params['top_k'] = 10
    mvlstm.params['mlp_num_layers'] = 1
    mvlstm.params['mlp_num_units'] = 5
    mvlstm.params['mlp_num_fan_out'] = 1
    mvlstm.params['mlp_activation_func'] = 'relu'
    mvlstm.params['dropout_rate'] = 0.5
    mvlstm.guess_and_fill_missing_params()
    mvlstm.build()
    mvlstm.compile()

    x_valid, y_valid = valid_data_processed.unpack()
    valid_eval = mz.callbacks.EvaluateAllMetrics(mvlstm, x_valid, y_valid)
    mvlstm.fit_generator(train_generator, epochs=1, callbacks=[valid_eval])
    mvlstm.save('.tmpdir')

    try:
        mvlstm = mz.load_model('.tmpdir')
        x, y = test_data_processed.unpack()
        results = mvlstm.evaluate(x, y)
        assert len(results) > 0
    finally:
        shutil.rmtree('.tmpdir')
Example #4
    def __init__(self, modelDir, dtProcDir, debugPrint):
        super().__init__(exclusive=True)

        with open(dtProcDir, 'rb') as f:
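            # Deserialize the preprocessor that was fitted and pickled at training time.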
            self.prep = pickle.load(f)

        self.model = mz.load_model(modelDir)
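        # Print a summary of the restored model's underlying Keras network.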
        self.model.backend.summary()
        self.debugPrint = debugPrint
Example #5
    def predict(self, test_pack_processed):
        model = mz.load_model(self.modeldir)
        test_generator = mz.DPoolDataGenerator(test_pack_processed,
                                               fixed_length_left=20,
                                               fixed_length_right=20,
                                               batch_size=20)
        pred_x, pred_y = test_generator[:]
        predict_value = model.predict(pred_x,
                                      batch_size=len(pred_y))  # score all pairs in one batch
        return predict_value
Example #6
def get_model_and_data(topic_number, d_pack_test, model_type, embedding):

    if model_type == 'dense':
        # load model
        model = mz.load_model(os.path.join(MODEL_DUMP, MODEL_TYPE, str(topic_number)))

        # prepare preprocessor
        train_raw = train_data(topic_number)
        preprocessor = mz.preprocessors.BasicPreprocessor()
        preprocessor.fit(train_raw)

        # transform document data
        test_processed = preprocessor.transform(d_pack_test)
        test_x, test_y = test_processed.unpack()

    elif model_type == 'drmm':
        # load model
        model = mz.load_model(os.path.join(MODEL_DUMP, MODEL_TYPE, str(topic_number)))
        task = mz.tasks.Ranking()
        train_raw = train_data(topic_number)
        preprocessor = mz.preprocessors.BasicPreprocessor(fixed_length_left=10,
                                                          fixed_length_right=100,
                                                          remove_stop_words=False)
        preprocessor.fit(train_raw)

        test_processed = preprocessor.transform(d_pack_test)
        embedding_matrix = embedding.build_matrix(preprocessor.context['vocab_unit'].state['term_index'])
        # normalize the word embedding for fast histogram generating.
        l2_norm = np.sqrt((embedding_matrix * embedding_matrix).sum(axis=1))
        embedding_matrix = embedding_matrix / l2_norm[:, np.newaxis]
        model.load_embedding_matrix(embedding_matrix)
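        # Generate matching histograms on the fly; DRMM consumes histogram features rather than raw term embeddings.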
        hist_callback = mz.data_generator.callbacks.Histogram(embedding_matrix,
                                                              bin_size=30,
                                                              hist_mode='LCH')
        test_generator = mz.DataGenerator(data_pack=test_processed, mode='point',
                                          callbacks=[hist_callback])
        test_x, test_y = test_generator[:]

    return model, test_x
Example #7
def test_save_load_model(model):
    tmpdir = '.matchzoo_test_save_load_tmpdir'

    if Path(tmpdir).exists():
        shutil.rmtree(tmpdir)

    try:
        model.save(tmpdir)
        assert mz.load_model(tmpdir)
        with pytest.raises(FileExistsError):
            model.save(tmpdir)
    finally:
        if Path(tmpdir).exists():
            shutil.rmtree(tmpdir)
Example #8
def predict(train_id='test_file'):
    q = 'how did apollo creed die'
    d = "Urban legend states that Apollo Creed's name is a wordplay on the Apostles' Creed , a statement of belief used in Christian churches."
    df = pd.DataFrame(data={'text_left': [q], 'text_right': [d], 'label': [0]})
    preprocessor = mz.load_preprocessor(ROOT_PATH +
                                        'matchzoo_temp_files/preprocessors/' +
                                        train_id + '.dssm_preprocessor')
    predict_pack = mz.pack(df)
    predict_pack_processed = preprocessor.transform(predict_pack)
    model = mz.load_model(ROOT_PATH + 'matchzoo_temp_files/models/' +
                          train_id + '.dssm_model')
    predict_score = float(
        model.predict(predict_pack_processed[:10].unpack()[0])[0][0])
    ret_dict = {'score': predict_score}
    print(ret_dict)
Example #9
def DSSM(model):
    request_data = json.loads(request.data.decode('utf-8'))
    q = request_data['text1']
    d = request_data['text2']
    train_id = request_data['train_id']
    df = pd.DataFrame(data={'text_left': [q], 'text_right': [d], 'label': [0]})
    preprocessor_suffix = '.' + model + '_preprocessor'
    preprocessor = mz.load_preprocessor(ROOT_PATH +
                                        'matchzoo_temp_files/preprocessors/' +
                                        train_id + preprocessor_suffix)
    predict_pack = mz.pack(df)
    predict_pack_processed = preprocessor.transform(predict_pack)
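    # Reset the Keras session so the saved model is loaded into a fresh graph.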
    keras.backend.clear_session()

    model_suffix = '.' + model + '_model'
    model = mz.load_model(ROOT_PATH + 'matchzoo_temp_files/models/' +
                          train_id + model_suffix)
    predict_score = float(
        model.predict(predict_pack_processed[:10].unpack()[0])[0][0])
    ret_dict = {'score': predict_score}
    '''
    if model != 'drmm':
        model_suffix = '.' + model + '_model'
        model = mz.load_model(ROOT_PATH + 'matchzoo_temp_files/models/' + train_id + model_suffix)
        predict_score = float(model.predict(predict_pack_processed[:10].unpack()[0])[0][0])
        ret_dict = {
            'score': predict_score
        }
    else:
        glove_embedding = mz.datasets.embeddings.load_glove_embedding(dimension=300)
        embedding_matrix = glove_embedding.build_matrix(preprocessor.context['vocab_unit'].state['term_index'])
        l2_norm = np.sqrt((embedding_matrix * embedding_matrix).sum(axis=1))
        embedding_matrix = embedding_matrix / l2_norm[:, np.newaxis]
        pred_generator = mz.HistogramDataGenerator(data_pack=predict_pack_processed,
                                                   embedding_matrix=embedding_matrix,
                                                   bin_size=30,
                                                   hist_mode='LCH')
        test_x, test_y = pred_generator[:]
        keras.backend.clear_session()
        model_suffix = '.' + model + '_model'
        model = mz.load_model(ROOT_PATH + 'matchzoo_temp_files/models/' + train_id + model_suffix)
        predict_score = float(model.predict(test_x)[0][0])
        ret_dict = {
            'score': predict_score
        }
    '''
    return jsonify(ret_dict)
Example #10
def test_dssm(train_data_processed, task, train_generator, test_generator,
              dssm_preprocessor):
    """Test DSSM model."""
    # Create a dssm model
    dssm_model = mz.models.DSSM()
    input_shapes = dssm_preprocessor.context['input_shapes']
    dssm_model.params['input_shapes'] = input_shapes
    dssm_model.params['task'] = task
    dssm_model.guess_and_fill_missing_params()
    dssm_model.build()
    dssm_model.compile()
    dssm_model.fit_generator(train_generator)
    dssm_model.save('.tmpdir')

    X, y = test_generator[0]
    try:
        dssm_model = mz.load_model('.tmpdir')
        predictions = dssm_model.predict(X)
        assert len(predictions) > 0
        assert type(predictions[0][0]) == np.float32
    finally:
        shutil.rmtree('.tmpdir')
Example #11
def test_cdssm(task, train_generator, test_generator, cdssm_preprocessor):
    """Test CDSSM model."""
    # Create a cdssm model
    cdssm_model = mz.models.CDSSM()
    assert isinstance(cdssm_model.get_default_preprocessor(),
                      mz.preprocessors.CDSSMPreprocessor)
    input_shapes = cdssm_preprocessor.context['input_shapes']
    cdssm_model.params['input_shapes'] = input_shapes
    cdssm_model.params['task'] = task
    cdssm_model.guess_and_fill_missing_params()
    cdssm_model.build()
    cdssm_model.compile()
    cdssm_model.fit_generator(train_generator)
    cdssm_model.save('.tmpdir')

    X, y = test_generator[0]
    try:
        cdssm_model = mz.load_model('.tmpdir')
        predictions = cdssm_model.predict(X)
        assert len(predictions) > 0
        assert type(predictions[0][0]) == np.float32
    finally:
        shutil.rmtree('.tmpdir')
Example #12
import matchzoo as mz
import pandas as pd
print(mz.__version__)
data_pack = mz.pack(pd.read_csv('match-zoo-corpus-top-1w.csv'))
print(data_pack[-10:])

data_pack.relation['label'] = data_pack.relation['label'].astype('float32')
frame = data_pack.frame

task = mz.tasks.Ranking()
train_raw = data_pack  # mz.datasets.toy.load_data(stage='train', task=task)
test_raw = data_pack  # mz.datasets.toy.load_data(stage='test', task=task)

model = mz.load_model('step-2-mz-model')

preprocessor = mz.preprocessors.BasicPreprocessor()
preprocessor.fit(train_raw, verbose=0)  # initialize the preprocessor's inner state
# train_processed = preprocessor.transform(train_raw, verbose=5)
test_processed = preprocessor.transform(test_raw, verbose=0)

# x, y = train_processed.unpack()
test_x, test_y = test_processed.unpack()

results = model.predict(test_x)
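# results holds one predicted relevance score per (text_left, text_right) pair.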

print(type(results))
print(len(results))
print(results)
for idx, item in enumerate(results[:20]):
    print('*' * 100)
    print(idx)
Example #13
    mz.preprocessors.units.punc_removal.PuncRemoval(),
]

model_class = mz.models.MVLSTM
model, preprocessor, data_generator_builder, embedding_matrix = mz.auto.prepare(
    task=ranking_task,
    model_class=model_class,
    preprocessor=preprocessor_class,
    data_pack=train_raw)

train_processed = preprocessor.fit_transform(train_raw, verbose=1)
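# Search the model's hyperparameter space over num_runs trials; note that the training set doubles as the evaluation set here.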
tuner = mz.auto.Tuner(params=model.params,
                      train_data=train_processed,
                      test_data=train_processed,
                      num_runs=10)
results = tuner.tune()
print(results['best'])
params = results['best']['sample']
print(params)
model.params['input_shapes'] = preprocessor.context['input_shapes']
model.params['mlp_num_fan_out'] = params['mlp_num_fan_out']
model.params['mlp_num_layers'] = params['mlp_num_layers']
model.params['mlp_num_units'] = params['mlp_num_units']
model.params['top_k'] = params['top_k']
model.build()
model.compile()
model.save('my-model')
loaded_model = mz.load_model('./my-model')
print("after==================================")
print(loaded_model.params)  # show the model's tunable parameters
Example #14
print(
    f'Collection: {colName}  model file: {modelFile} data transform file: {dataTranFile}'
)
print(f'Test file: {dataFileTest}')

# Note dtype! don't let Pandas guess column data types!
dataTestPacked = pack(readWhiteSpacedMatchZooData(dataFileTest))

with open(dataTranFile, 'rb') as f:
    prep = pickle.load(f)

import pdb, sys

#try:
if True:
    dataTestProc = prep.transform(dataTestPacked)

    model = mz.load_model(modelFile)
    model.backend.summary()
    xTest, yTest = dataTestProc.unpack()
    model.params['task'].metrics = [
        mz.metrics.NormalizedDiscountedCumulativeGain(k=20)
    ]
    print(model.evaluate(xTest, yTest, batch_size=128))

#except:
# tb is traceback
#type, value, tb = sys.exc_info()
#pdb.post_mortem(tb)
Example #15
    def __init__(self, model_path, batch_size=8):

        preprocess_path = model_path + "preprocessor.dill"
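        # model_path is assumed to end with a path separator; the .dill file stores the fitted preprocessor.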
        self.model = mz.load_model(model_path + "model")
        self.preprocessor = dill.load(open(preprocess_path, "rb"))
        self.batch_size = batch_size
Example #16
        'text_left': X_left,
        'text_right': X_right,
        'id_left': X_left_id,
        'id_right': X_right_id,
        'label': y
    })
    return mz.pack(df)


root_dir = r"D:\data\biendata\ccks2019_el\ccks_train_data\{}"

print('data loading ...')
# train_pack_raw = load_data('train', 100000)
# dev_pack_raw = load_data('validate', 200)
test_pack_raw = load_data('test', 200)

model_path = r"D:/data/biendata/ccks2019_el/entityrank/m0/model/"
preprocess_path = model_path + "preprocessor.dill"
model = load_model(model_path)
preprocessor = dill.load(open(preprocess_path, "rb"))

# train_pack_processed = preprocessor.fit_transform(train_pack_raw)
# dev_pack_processed = preprocessor.transform(dev_pack_raw)
test_pack_processed = preprocessor.transform(test_pack_raw)

test_x, test_y = test_pack_processed.unpack()

pre = model.predict(test_x, batch_size=128)
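# pre contains the predicted relevance score for each processed test pair.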

pass