def test_jsonify_XML_corpus():
    conf_file = 'tests/resources/conf/exp0/exp0.conf'
    conf, _ = parse_config_file(conf_file)
    train_set = conf['training_data']
    json_train_set = train_set + '.gz'
    tk = get_tokenizer_settings_from_conf(conf)

    # parse the XML directly
    x_tr, y_tr, _, _ = get_tokenized_data(train_set, tk)

    jsonify_single_labelled_corpus('unit_tests', train_set, conf_file)
    x_tr1, y_tr1, _, _ = get_tokenized_data(json_train_set, tk)

    # because converting to JSON merges the train and test sets (if a test set exists),
    # we need to merge them in this test as well.
    for a, b in zip(x_tr, x_tr1):
        assert len(a[0]) == len(b) == 3
        assert set(str(f) for f in a[0].nodes()) == set(b)
    np.testing.assert_array_equal(y_tr, y_tr1)
    os.unlink(json_train_set)
def jsonify_single_labelled_corpus(corpus_name, corpus_path,
                                   conf_file=None,
                                   tokenizer_conf=None,
                                   unigram_features=set('JNV'),
                                   phrase_features=set(['AN', 'NN', 'VO', 'SVO']),
                                   write_feature_set=False):
    """
    Tokenizes an entire XML/CoNLL corpus (sentence segmented and dependency parsed), incl test and train chunk,
    and writes its content to a single JSON gzip-ed file,
    one document per line. Each line is a JSON array, the first value of which is the label of
    the document, and the rest are JSON representation of a list of lists, containing all document
    features of interest, e.g. nouns, adj, NPs, VPs, wtc.
    The resultant document can be loaded with a GzippedJsonTokenizer.

    :param corpus_path: path to the corpus
    """

    def _write_corpus_to_json(x_tr, y_tr):
        extr = FeatureExtractor(extract_unigram_features=unigram_features,
                                extract_phrase_features=phrase_features)
        documents = []
        for doc in x_tr:
            documents.append([str(f) for f in extr.extract_features_from_tree_list(doc)])

        for document, label in zip(documents, y_tr):
            outfile.write(json.dumps([label, document]).encode('utf-8'))
            outfile.write(b'\n')

        return set(feat for doc in documents for feat in doc)

    # load the dataset from XML/JSON/CoNLL
    if conf_file:
        conf = get_tokenizer_settings_from_conf_file(conf_file)
    elif tokenizer_conf:
        conf = tokenizer_conf
    else:
        raise ValueError('Must provide a dict or a file containing tokenizer config')
    x_tr, y_tr, x_test, y_test = get_tokenized_data(corpus_path, conf)

    with gzip.open('%s.gz' % corpus_path, 'wb') as outfile:
        feats = _write_corpus_to_json(x_tr, y_tr)
        logging.info('Writing %s to gzip json', corpus_path)
        if x_test:
            feats |= _write_corpus_to_json(x_test, y_test)

    if write_feature_set:
        _write_features_of_single_corpus_to_file(feats, corpus_name)
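# A minimal sketch (not part of the original module) of how the gzipped JSON corpus written
# above can be read back without a GzippedJsonTokenizer: each line is a JSON array whose
# first element is the label and whose second element is the list of feature strings.
# The function name and the example path are hypothetical.
import gzip
import json

def read_jsonified_corpus(path):
    documents, labels = [], []
    with gzip.open(path, 'rb') as infile:
        for line in infile:
            label, features = json.loads(line.decode('utf-8'))
            labels.append(label)
            documents.append(features)
    return documents, labels

# documents, labels = read_jsonified_corpus('my_corpus.xml.gz')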
def test_distributional_with_vector_clusters(conf, tmpdir):
    # generate random vectors for the appropriate features and cluster them first
    x_tr, _, _, _ = get_tokenized_data(conf['training_data'], conf['tokenizer'])
    feats = FeatureExtractor().extract_features_from_tree_list([foo[0] for foo in x_tr])
    vectors = np.random.random((len(feats), 10))
    v = DenseVectors(pd.DataFrame(vectors, index=feats))
    tmpfile = str(tmpdir.join('tmp_random_vectors'))
    v.to_tsv(tmpfile, dense_hd5=True)

    tmpclusters = str(tmpdir.join('tmp_random_clusters'))
    cluster_vectors(tmpfile, tmpclusters, n_clusters=5, n_jobs=1)

    conf['vector_sources']['neighbours_file'] = []
    conf['vectorizer']['class'] = 'eval.pipeline.multivectors.KmeansVectorizer'
    conf['vector_sources']['clusters_file'] = tmpclusters
    # the features of the document are cluster ids, not phrases, so there is
    # no point in checking if they are in the thesaurus (see the clustering sketch after this test)
    conf['feature_selection']['must_be_in_thesaurus'] = False

    for debug_level in [0, 1, 2]:
        conf['debug_level'] = debug_level
        run_experiment(conf)
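# A hedged sketch of the idea behind the KmeansVectorizer that the test above configures:
# feature vectors are clustered with k-means and each feature is then represented by its
# cluster id rather than by the phrase itself. This uses plain scikit-learn and assumed
# names (feats, vectors); it is not the project's cluster_vectors implementation.
from sklearn.cluster import KMeans

def assign_cluster_ids(feats, vectors, n_clusters=5):
    km = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
    cluster_ids = km.fit_predict(vectors)
    # map each feature string to the id of the cluster it falls into
    return dict(zip(feats, cluster_ids))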
def run_experiment(conf):
    start_time = datetime.now()
    mkdirs_if_not_exists(conf['output_dir'])
    test_path = ''
    tr_data = conf['training_data']
    if conf['test_data']:
        test_path = conf['test_data']

    # LOADING RAW TEXT
    x_tr, y_tr, x_test, y_test = get_tokenized_data(tr_data,
                                                    get_tokenizer_settings_from_conf(conf),
                                                    test_data=test_path)

    # CREATE CROSSVALIDATION ITERATOR
    cv_iterator, y_vals = _build_crossvalidation_iterator(conf['crossvalidation'],
                                                          y_tr, y_test)
    if x_test is not None:
        # concatenate all data; the CV iterator will make sure x_test is only used for testing
        # (see the PredefinedSplit sketch after this function)
        x_vals = list(x_tr)
        x_vals.extend(list(x_test))
    else:
        x_vals = x_tr

    all_scores = []
    params = []
    for i, (train_idx, test_idx) in enumerate(cv_iterator):
        params.append((conf, i, multiple_scores, test_idx, train_idx, x_vals, y_vals))
        if conf['crossvalidation']['break_after_first']:
            # only do one train/test split to save time
            logging.warning('Only using the first CV fold')
            break

    scores_over_cv = [_cv_loop(*foo) for foo in params]
    all_scores.extend([score for one_set_of_scores in scores_over_cv for score in one_set_of_scores])
    _store_scores(all_scores, conf['output_dir'], conf['name'])
    total_time = (datetime.now() - start_time).total_seconds() / 60
    logging.info('MINUTES TAKEN %.2f', total_time)
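# A hedged sketch of how a fixed train/test split can be expressed as a single "CV fold" over
# the concatenated data, which is what the comment in run_experiment about x_vals relies on.
# This uses scikit-learn's PredefinedSplit with assumed sizes; it is not the project's
# _build_crossvalidation_iterator.
import numpy as np
from sklearn.model_selection import PredefinedSplit

def single_fold_iterator(n_train, n_test):
    # -1 marks samples that are always in the training set, 0 marks the held-out test fold
    test_fold = np.concatenate([np.full(n_train, -1, dtype=int), np.zeros(n_test, dtype=int)])
    return PredefinedSplit(test_fold).split()

# for train_idx, test_idx in single_fold_iterator(100, 20):
#     pass  # train_idx covers the first 100 items, test_idx the last 20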
def _vectorize_data(data_paths, config, dummy=False):
    if dummy:
        config['vector_sources']['dummy_thesaurus'] = True
        config['vector_sources']['neighbours_file'] = []
    else:
        config['vector_sources']['neighbours_file'] = [tsv_file]

    config['vector_sources']['neighbour_strategy'] = 'linear'
    config['name'] = 'test_main'
    config['debug_level'] = 2
    config['output_dir'] = '.'
    pipeline, fit_params = evaluate._build_pipeline(config, 12345)

    x_tr, y_tr, x_test, y_test = get_tokenized_data(data_paths[0], tokenizer_opts, test_data=data_paths[1])

    x1 = pipeline.fit_transform(x_tr, y_tr, **fit_params)
    if 'fs' in pipeline.named_steps:
        pipeline.named_steps['vect'].vocabulary_ = pipeline.named_steps['fs'].vocabulary_

    voc = pipeline.named_steps['fs'].vocabulary_
    x2 = pipeline.transform(x_test)

    return x1, x2, voc
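# A generic, hedged illustration of the pattern _vectorize_data exercises: fit a vectorizing
# pipeline on the training documents only, then reuse the fitted vocabulary to transform the
# test documents. CountVectorizer stands in for the project's pipeline; all names here are
# assumptions.
from sklearn.feature_extraction.text import CountVectorizer

def vectorize_train_test(train_docs, test_docs):
    vect = CountVectorizer()
    x1 = vect.fit_transform(train_docs)   # learns vocabulary_ from the training data
    x2 = vect.transform(test_docs)        # reuses the same vocabulary for the test set
    return x1, x2, vect.vocabulary_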