from sklearn.linear_model import LogisticRegression

import rel_ext
import utils


def test_dataset_featurize_no_vectorize(corpus, kb):
    dataset = rel_ext.Dataset(corpus, kb)
    kbts_by_rel, _ = dataset.build_dataset(sampling_rate=0.1)

    # With vectorize=False, a featurizer maps a KB triple directly to a
    # feature vector, so no dict-to-matrix vectorization step is applied.
    def featurizer(kbt, corpus):
        return utils.randvec(10)

    dataset.featurize(kbts_by_rel, [featurizer], vectorize=False)
def test_find_new_relation_instances(corpus, kb, featurizer, vectorize):
    dataset = rel_ext.Dataset(corpus, kb)
    rel_ext.find_new_relation_instances(
        dataset,
        [featurizer],
        train_split='train',
        test_split='dev',
        model_factory=lambda: LogisticRegression(solver='liblinear'),
        k=10,
        vectorize=vectorize,
        verbose=False)
def test_experiment(featurizer, vectorize, corpus, kb):
    dataset = rel_ext.Dataset(corpus, kb)
    # Tiny splits keep this end-to-end test fast.
    splits = dataset.build_splits(
        split_names=['tiny_train', 'tiny_dev', 'rest'],
        split_fracs=[0.05, 0.05, 0.90],
        seed=1)
    results = rel_ext.experiment(
        splits,
        train_split='tiny_train',
        test_split='tiny_dev',
        featurizers=[featurizer],
        vectorize=vectorize,
        verbose=False)
def test_dataset_featurize_vectorize(corpus, kb):
    dataset = rel_ext.Dataset(corpus, kb)
    kbts_by_rel, _ = dataset.build_dataset(sampling_rate=0.1)
    # The default vectorize=True path expects featurizers that return
    # feature dicts, which are then vectorized into a feature matrix.
    featurizers = [lambda kbt, corpus, feature_counter: {"bias": 1}]
    dataset.featurize(kbts_by_rel, featurizers)
def test_dataset_build_splits(corpus, kb):
    dataset = rel_ext.Dataset(corpus, kb)
    dat = dataset.build_splits(seed=1)
def test_dataset_build_dataset(corpus, kb):
    dataset = rel_ext.Dataset(corpus, kb)
    dat = dataset.build_dataset(include_positive=True, sampling_rate=0.1)
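# The tests above rely on pytest fixtures (`corpus`, `kb`, `featurizer`,
# `vectorize`) defined elsewhere in the test module. A minimal sketch of
# what such fixtures could look like, assuming the same data files loaded
# in the notebook cells below; the concrete featurizer and vectorize
# values here are illustrative, not the module's actual parametrization:

import os

import pytest


@pytest.fixture
def corpus():
    return rel_ext.Corpus(
        os.path.join('data', 'rel_ext_data', 'corpus.tsv.gz'))


@pytest.fixture
def kb():
    return rel_ext.KB(
        os.path.join('data', 'rel_ext_data', 'kb.tsv.gz'))


@pytest.fixture
def featurizer():
    # Dict-returning featurizer, compatible with vectorize=True.
    return lambda kbt, corpus, feature_counter: {"bias": 1}


@pytest.fixture
def vectorize():
    return True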
# In[3]:

rel_ext_data_home = os.path.join('data', 'rel_ext_data')


# In[4]:

corpus = rel_ext.Corpus(os.path.join(rel_ext_data_home, 'corpus.tsv.gz'))


# In[5]:

kb = rel_ext.KB(os.path.join(rel_ext_data_home, 'kb.tsv.gz'))


# In[6]:

dataset = rel_ext.Dataset(corpus, kb)


# You are not wedded to this set-up for splits. The bake-off will be conducted on a previously unseen test-set, so all of the data in `dataset` is fair game:

# In[7]:

splits = dataset.build_splits(
    split_names=['tiny', 'train', 'dev'],
    split_fracs=[0.01, 0.79, 0.20],
    seed=1)


# In[8]:

splits


# ## Baselines
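# As a concrete starting point, here is a minimal sketch of a bag-of-words
# baseline featurizer with the `(kbt, corpus, feature_counter)` signature used
# in the tests above. The corpus accessor `get_examples_for_entities`, the
# triple fields `sbj`/`obj`, and the example field `middle` are assumptions
# about the `rel_ext` API, not guaranteed names:

# In[ ]:

def simple_bag_of_words_featurizer(kbt, corpus, feature_counter):
    # Count words appearing between the two entities, checking both
    # entity orders, and accumulate them in `feature_counter`.
    for ex in corpus.get_examples_for_entities(kbt.sbj, kbt.obj):
        for word in ex.middle.split(' '):
            feature_counter[word] += 1
    for ex in corpus.get_examples_for_entities(kbt.obj, kbt.sbj):
        for word in ex.middle.split(' '):
            feature_counter[word] += 1
    return feature_counter


# A featurizer like this would be passed as `featurizers=[simple_bag_of_words_featurizer]` to `rel_ext.experiment`, as in `test_experiment` above.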