def test_jsonify_XML_corpus():
    """Check that a jsonified corpus tokenizes to the same data as its XML source.

    The training set named in the exp0 test configuration is tokenized
    directly, then converted with ``jsonify_single_labelled_corpus`` and
    tokenized again from the resulting ``<training_data>.gz`` file. Features
    and labels from both passes must agree.
    """
    conf_file = 'tests/resources/conf/exp0/exp0.conf'
    conf, _ = parse_config_file(conf_file)
    train_set = conf['training_data']
    # the test expects the converted corpus to appear next to the original
    json_train_set = train_set + '.gz'
    tk = get_tokenizer_settings_from_conf(conf)

    # parse the XML directly
    x_tr, y_tr, _, _ = get_tokenized_data(train_set, tk)

    try:
        jsonify_single_labelled_corpus('unit_tests', train_set, conf_file)
        x_tr1, y_tr1, _, _ = get_tokenized_data(json_train_set, tk)

        # because the process of converting to json merges the train and test
        # set, if a test set exists, we need to merge them too in this test.
        for a, b in zip(x_tr, x_tr1):
            # a[0] exposes .nodes() (presumably a graph of features -- confirm);
            # b is the flat collection of feature strings read back from json
            assert len(a[0]) == len(b) == 3
            assert set(str(f) for f in a[0].nodes()) == set(b)

        np.testing.assert_array_equal(y_tr, y_tr1)
    finally:
        # always remove the generated file, even when an assertion fails, so a
        # broken run does not leave artefacts behind for subsequent tests
        if os.path.exists(json_train_set):
            os.unlink(json_train_set)
def run_experiment(conf):
    """Run one experiment described by ``conf`` and store its scores.

    Tokenizes the training (and optional test) data, builds the
    cross-validation iterator, evaluates every selected fold with
    ``_cv_loop`` and hands the flattened scores to ``_store_scores``.

    :param conf: configuration mapping. Keys read here: ``'output_dir'``,
        ``'training_data'``, ``'test_data'``, ``'name'`` and
        ``'crossvalidation'`` (which must contain ``'break_after_first'``).
    """
    start_time = datetime.now()
    mkdirs_if_not_exists(conf['output_dir'])

    tr_data = conf['training_data']
    # an empty path is passed on when no test set is configured
    test_path = conf['test_data'] if conf['test_data'] else ''

    # LOADING RAW TEXT
    x_tr, y_tr, x_test, y_test = get_tokenized_data(
        tr_data,
        get_tokenizer_settings_from_conf(conf),
        test_data=test_path)

    # CREATE CROSSVALIDATION ITERATOR
    cv_iterator, y_vals = _build_crossvalidation_iterator(
        conf['crossvalidation'], y_tr, y_test)
    if x_test is not None:
        # concatenate all data, the CV iterator will make sure x_test is used
        # for testing
        x_vals = list(x_tr)
        x_vals.extend(list(x_test))
    else:
        x_vals = x_tr

    params = []
    for i, (train_idx, test_idx) in enumerate(cv_iterator):
        params.append((conf, i, multiple_scores, test_idx, train_idx,
                       x_vals, y_vals))
        if conf['crossvalidation']['break_after_first']:
            # only do one train/test split to save time; the warning is
            # emitted only when the remaining folds really are skipped
            logging.warning('Only using the first CV fold')
            logging.info('Exiting after first fold')
            break

    scores_over_cv = [_cv_loop(*fold_params) for fold_params in params]
    # each fold yields several scores; flatten them into one list
    all_scores = [score
                  for one_set_of_scores in scores_over_cv
                  for score in one_set_of_scores]
    _store_scores(all_scores, conf['output_dir'], conf['name'])

    # total_seconds() keeps the days component and yields a float, so the
    # reported duration is correct for long runs and is not truncated
    total_time = (datetime.now() - start_time).total_seconds() / 60.0
    logging.info('MINUTES TAKEN %.2f', total_time)