Esempio n. 1
0
def test_serialize():
    """Test that models can correctly be saved to and loaded from gzipped json

    The IR test model is loaded, dumped to a scratch file, reloaded, dumped
    again to the same file and reloaded once more. All three classifiers
    must agree on predictions, stats and feature importance scores.

    The scratch file is removed in a ``finally`` clause so that a failing
    assertion does not leave a stray file behind in SCRATCH_PATH.
    """
    texts = data['texts']
    classifier1 = load_model(os.path.join(TEST_MODEL_PATH, 'IR',
                                          'IR_model.gz'))
    temp_filename = os.path.join(SCRATCH_PATH, uuid.uuid4().hex)
    try:
        classifier1.dump_model(temp_filename)

        classifier2 = load_model(temp_filename)
        classifier2.dump_model(temp_filename)

        classifier3 = load_model(temp_filename)

        preds1, preds2, preds3 = (classifier1.predict_proba(texts),
                                  classifier2.predict_proba(texts),
                                  classifier3.predict_proba(texts))
        # Check that generated predictions are the same
        assert np.array_equal(preds1, preds2)
        assert np.array_equal(preds2, preds3)
        # Check that model stats are the same
        assert classifier1.stats == classifier2.stats == classifier3.stats
        # Check that the calculated feature importance scores are the same
        assert classifier1.feature_importances() == \
            classifier2.feature_importances() == \
            classifier3.feature_importances()
    finally:
        # The first dump may itself have failed before creating the file,
        # so only remove it when it is actually there.
        if os.path.exists(temp_filename):
            os.remove(temp_filename)
Esempio n. 2
0
def test_disambiguate():
    """Check AdeftDisambiguator results on the three example texts

    Builds a disambiguator from the IR test model together with its
    grounding dict and names files, then checks the output of
    ``disambiguate`` for a text with a unique defining pattern, a text with
    conflicting defining patterns and a text without a defining pattern.
    """
    ir_dir = os.path.join(TEST_MODEL_PATH, 'IR')
    test_model = load_model(os.path.join(ir_dir, 'IR_model.gz'))

    grounding_dict_path = os.path.join(ir_dir, 'IR_grounding_dict.json')
    with open(grounding_dict_path) as handle:
        grounding_dict = json.load(handle)

    names_path = os.path.join(ir_dir, 'IR_names.json')
    with open(names_path) as handle:
        names = json.load(handle)

    disambiguator = AdeftDisambiguator(test_model, grounding_dict, names)

    # A unique defining pattern pins the grounding with full confidence
    unique_result = disambiguator.disambiguate(example1)
    assert unique_result[0] == 'HGNC:6091'
    assert unique_result[1] == 'INSR'
    unique_probs = unique_result[2]
    assert unique_probs['HGNC:6091'] == 1.0
    assert unique_probs['MESH:D011839'] == 0.0

    # Conflicting defining patterns: exactly these two groundings receive
    # nonzero probability
    conflict_result = disambiguator.disambiguate(example2)
    nonzero = set()
    for grounding, probability in conflict_result[2].items():
        if probability > 0.0:
            nonzero.add(grounding)
    assert nonzero == {'HGNC:6091', 'MESH:D007333'}

    # No defining pattern: only the returned grounding and name are checked
    fallback_result = disambiguator.disambiguate(example3)
    assert fallback_result[0] == 'HGNC:6091'
    assert fallback_result[1] == 'INSR'
Esempio n. 3
0
def test_serialize():
    """Test that models can correctly be saved to and loaded from gzipped json

    The IR test model is loaded, dumped to a scratch file and reloaded. The
    reloaded classifier is given extra ``other_metadata``, dumped again to
    the same file and reloaded once more. Predictions, stats, feature
    importances, timestamp, training set digest, feature standard
    deviations, version and params must match across all three classifiers,
    and the added metadata must match between the second and third.

    The scratch file is removed in a ``finally`` clause so that a failing
    assertion does not leave a stray file behind in SCRATCH_PATH.
    """
    texts = data['texts']
    classifier1 = load_model(os.path.join(TEST_MODEL_PATH, 'IR',
                                          'IR_model.gz'))
    temp_filename = os.path.join(SCRATCH_PATH, uuid.uuid4().hex)
    try:
        classifier1.dump_model(temp_filename)

        classifier2 = load_model(temp_filename)
        classifier2.other_metadata = {'test': 'This is a test.'}
        classifier2.dump_model(temp_filename)

        classifier3 = load_model(temp_filename)

        preds1, preds2, preds3 = (classifier1.predict_proba(texts),
                                  classifier2.predict_proba(texts),
                                  classifier3.predict_proba(texts))
        # Check that generated predictions are the same
        assert np.array_equal(preds1, preds2)
        assert np.array_equal(preds2, preds3)
        # Check that model stats are the same
        assert classifier1.stats == classifier2.stats == classifier3.stats
        # Check that the calculated feature importance scores are the same
        assert classifier1.feature_importances() == \
            classifier2.feature_importances() == \
            classifier3.feature_importances()
        # Check timestamps are unchanged
        assert classifier1.timestamp == classifier2.timestamp == \
            classifier3.timestamp
        # Check hash of training set is unchanged
        assert classifier1.training_set_digest == \
            classifier2.training_set_digest == \
            classifier3.training_set_digest
        # Check standard deviations of feature values are unchanged
        assert np.array_equal(classifier1._std, classifier2._std)
        assert np.array_equal(classifier2._std, classifier3._std)
        # Check classifier versions are unchanged
        assert classifier1.version == classifier2.version == \
            classifier3.version
        # Check that model params are unchanged
        assert classifier1.params == classifier2.params == classifier3.params
        # Check that the metadata added before the second dump is preserved
        assert classifier2.other_metadata == classifier3.other_metadata
    finally:
        # The first dump may itself have failed before creating the file,
        # so only remove it when it is actually there.
        if os.path.exists(temp_filename):
            os.remove(temp_filename)
Esempio n. 4
0
def load_disambiguator_directly(path):
    """Returns disambiguator located at path

    Parameters
    ----------
    path : str
        Path to a disambiguation model. Must be a path to a directory
        <model_name> containing the files <model_name>_model.gz,
        <model_name>_grounding_dict.json and <model_name>_names.json

    Returns
    -------
    :py:class:`adeft.disambiguate.AdeftDisambiguator`
        A disambiguation model loaded from folder specified by path
    """
    # abspath normalizes the path first, so a trailing separator does not
    # make basename return an empty model name.
    model_name = os.path.basename(os.path.abspath(path))
    model = load_model(os.path.join(path, model_name + '_model.gz'))
    # JSON is read as UTF-8 explicitly rather than with the platform's
    # default encoding, which differs between systems.
    grounding_dict_path = os.path.join(path,
                                       model_name + '_grounding_dict.json')
    with open(grounding_dict_path, encoding='utf-8') as f:
        grounding_dict = json.load(f)
    names_path = os.path.join(path, model_name + '_names.json')
    with open(names_path, encoding='utf-8') as f:
        names = json.load(f)
    return AdeftDisambiguator(model, grounding_dict, names)
Esempio n. 5
0
        if os.path.isdir(model_path) and \
           model_name in set(available_shortforms.values()):
            names_path = os.path.join(model_path, f'{model_name}_names.json')
            with open(names_path, 'r') as f:
                names = strip_dictionary(json.load(f))
            with open(names_path, 'w') as f:
                json.dump(names, f)
            gdict_path = os.path.join(model_path,
                                      f'{model_name}_grounding_dict.json')
            with open(gdict_path, 'r') as f:
                grounding_dict = json.load(f)
            grounding_dict = {shortform: strip_dictionary(grounding_map)
                              for shortform, grounding_map in
                              grounding_dict.items()}
            model_file = os.path.join(model_path, f'{model_name}_model.gz')
            model = load_model(model_file)
            model.pos_labels = [label.strip() for label in model.pos_labels]

            for i, label in (
                    enumerate(model.estimator.named_steps['logit'].classes_)):
                model.estimator.named_steps['logit'].classes_[i] = \
                    label.strip()

            model.dump_model(model_file)

            with open(gdict_path, 'w') as f:
                json.dump(grounding_dict, f)
            for shortform in grounding_dict:
                grounding_path = os.path.join(groundings_path, shortform)
                names_path = os.path.join(grounding_path,
                                          f'{shortform}_names.json')