def conf(tmpdir):
    # create an empty conf file so parse_config_file falls back to the defaults in conf/confrc
    tmpfile = tmpdir.join('blank')
    with open(str(tmpfile), 'w'):
        pass  # touch the file
    res, _ = parse_config_file(str(tmpfile), confrc='conf/confrc', quit_on_error=False)

    res['feature_extraction'].update({
        'class': 'eval.pipeline.bov.ThesaurusVectorizer',
        'min_df': 1,
        'k': 10,  # use all thesaurus entries
        'train_token_handler': 'eval.pipeline.feature_handlers.BaseFeatureHandler',
        'decode_token_handler': 'eval.pipeline.feature_handlers.BaseFeatureHandler',
        'random_neighbour_thesaurus': False,
        'train_time_opts': dict(extract_unigram_features=['J', 'N', 'V'],
                                extract_phrase_features=[]),
        'decode_time_opts': dict(extract_unigram_features=['J', 'N', 'V'],
                                 extract_phrase_features=[])
    })

    res['feature_selection'].update({
        'run': True,
        'method': 'eval.pipeline.feature_selectors.VectorBackedSelectKBest',
        'scoring_function': 'sklearn.feature_selection.chi2',
        'must_be_in_thesaurus': False,
        'k': 'all',
    })

    res['vector_sources']['is_thesaurus'] = True
    return res
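
The helper above reads like a pytest fixture body: it takes pytest's tmpdir, patches the default configuration and returns it. A minimal usage sketch, assuming the function is registered with @pytest.fixture so tests can request it by name; the test name below is hypothetical and the asserted keys simply mirror the overrides applied above.

# Hypothetical usage sketch: assumes `conf` above is decorated with @pytest.fixture
# so pytest injects the patched configuration by parameter name.
def test_default_overrides(conf):
    assert conf['feature_extraction']['min_df'] == 1
    assert conf['feature_selection']['k'] == 'all'
    assert conf['vector_sources']['is_thesaurus'] is True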
def test_jsonify_XML_corpus():
    conf_file = 'tests/resources/conf/exp0/exp0.conf'
    conf, _ = parse_config_file(conf_file)
    train_set = conf['training_data']
    json_train_set = train_set + '.gz'
    tk = get_tokenizer_settings_from_conf(conf)

    # parse the XML directly
    x_tr, y_tr, _, _ = get_tokenized_data(train_set, tk)

    jsonify_single_labelled_corpus('unit_tests', train_set, conf_file)
    x_tr1, y_tr1, _, _ = get_tokenized_data(json_train_set, tk)

    # converting to json merges the train and test sets (when a test set exists),
    # so they would have to be merged here too before comparing
    for a, b in zip(x_tr, x_tr1):
        assert len(a[0]) == len(b) == 3
        assert set(str(f) for f in a[0].nodes()) == set(b)
    np.testing.assert_array_equal(y_tr, y_tr1)
    os.unlink(json_train_set)
Example #3
def get_tokenizer_settings_from_conf_file(conf_file):
    conf, _ = parse_config_file(conf_file)
    return get_tokenizer_settings_from_conf(conf)
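
A short usage sketch of this wrapper, reusing the example conf path and the calls already shown in test_jsonify_XML_corpus above; nothing here goes beyond what those snippets use.

# Usage sketch, reusing the example conf file from the test above
conf, _ = parse_config_file('tests/resources/conf/exp0/exp0.conf')
settings = get_tokenizer_settings_from_conf_file('tests/resources/conf/exp0/exp0.conf')
x_tr, y_tr, _, _ = get_tokenized_data(conf['training_data'], settings)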
Example #4
def is_valid_file(arg):
    # argparse type-checker; `parser` is the global ArgumentParser defined below
    if not os.path.exists(arg):
        parser.error("The conf file %s does not exist!" % arg)
    else:
        return arg


if __name__ == '__main__':
    # parse command-line arguments (conf file only)
    parser = argparse.ArgumentParser(description='Evaluate vector via document classification')
    parser.add_argument('conf_file',
                        help='Conf file that defines the experiment',
                        type=is_valid_file)

    args = parser.parse_args()
    conf, configspec_file = parse_config_file(args.conf_file)
    mkdirs_if_not_exists(conf['output_dir'])

    # set up logging to file
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s\t%(module)s.%(funcName)s (line %(lineno)d)\t%(levelname)s : %(message)s",
                        datefmt='%m-%d %H:%M',
                        filename=os.path.join(conf['output_dir'], 'log.txt'),
                        filemode='w')
    # define a Handler which writes INFO messages or higher to sys.stderr
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    # use the same format on the console as in the log file
    formatter = logging.Formatter("%(asctime)s\t%(module)s.%(funcName)s (line %(lineno)d)\t%(levelname)s : %(message)s")
    # tell the handler to use this format and attach it to the root logger
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)
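
basicConfig installs a file handler on the root logger, and the extra StreamHandler attached to the same root logger writes to stderr, so a single logging call reaches both destinations; a one-line illustration using the already-parsed args:

# A record emitted anywhere in the process now goes to both log.txt and the console
logging.info('running experiment defined in %s', args.conf_file)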
def conf():
    # `conf_file` is expected to be a module-level path to the experiment conf file
    config, _ = parse_config_file(conf_file)
    mkdirs_if_not_exists(config['output_dir'])
    return config
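
A hedged sketch of the context this last fixture seems to assume: conf_file defined at module level and the function registered as a pytest fixture. The path, decorator and test below are assumptions for illustration only.

# Hypothetical surrounding module (the path is an assumption; the fixture above
# is assumed to carry a @pytest.fixture decorator not shown in the snippet)
import os

conf_file = 'tests/resources/conf/exp0/exp0.conf'  # module-level constant read by conf()

def test_output_dir_is_created(conf):
    # conf() calls mkdirs_if_not_exists, so the output directory should exist
    assert os.path.isdir(conf['output_dir'])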