Example 1
def test_features(feature_extraction, split_ta):
    """Fit a feature-extraction model on the demo data and sanity-check it."""
    if feature_extraction in REQUIRES_EXTRA_DEPS and not ADVANCED_DEPS[
            "tensorflow"]:
        pytest.skip()

    demo_dir = os.path.join("tests", "demo_data")
    embedding_fp = os.path.join(demo_dir, "generic.vec")
    data_fp = os.path.join(demo_dir, "generic.csv")

    as_data = ASReviewData.from_file(data_fp)

    # Embedding-based extractors additionally need the embedding file.
    model_kwargs = {"split_ta": split_ta}
    if feature_extraction.startswith("embedding-"):
        model_kwargs["embedding_fp"] = embedding_fp
    model = get_feature_model(feature_extraction, **model_kwargs)

    feature_matrix = model.fit_transform(as_data.texts,
                                         titles=as_data.title,
                                         abstracts=as_data.abstract)

    # One feature row per record, at least one feature column.
    assert feature_matrix.shape[0] == len(as_data.title)
    assert feature_matrix.shape[1] > 0
    assert isinstance(model.param, dict)
    assert model.name == feature_extraction
Example 2
 def __init__(self, data_fp):
     """Read the dataset at *data_fp* and expose its columns as attributes."""
     as_data = ASReviewData.from_file(data_fp)
     self.as_data = as_data
     self.title = as_data.title
     self.abstract = as_data.abstract
     self.labels = as_data.labels
     self.keywords = as_data.keywords
     # An unlabeled dataset yields labels of None; substitute one
     # missing-label sentinel per record instead.
     if self.labels is None:
         self.labels = np.full(len(as_data), LABEL_NA)
Example 3
 def get_cached_as_data(self, data_name):
     """Return the ASReviewData object for *data_name*, loading it lazily.

     The parsed dataset is memoised in ``self._cache`` so each data file
     is read from disk at most once.
     """
     try:
         # Fast path: dataset was loaded on an earlier call.
         return self._cache[data_name]["as_data"]
     except KeyError:
         pass
     # Cache miss: resolve the file path under self.data_dir and parse it.
     data_fp = data_fp_from_name(self.data_dir, data_name)
     as_data = ASReviewData.from_file(data_fp)
     # NOTE(review): this store assumes self._cache[data_name] already
     # exists (e.g. _cache is a defaultdict of dicts) -- with a plain
     # dict a fresh data_name would raise KeyError here; confirm the
     # cache's construction.
     self._cache[data_name]["as_data"] = as_data
     return as_data
Example 4
def test_record_id():
    """A record_id column becomes the dataframe index; labels are an ndarray."""
    as_data = ASReviewData.from_file(
        Path("tests", "demo_data", "record_id.csv"))

    # Labels must be exposed as a numpy array.
    assert isinstance(as_data.labels, np.ndarray)

    # The dataframe index must take its name from the record_id column.
    assert as_data.df.index.name == "record_id"
Example 5
def test_nan_values_csv():
    """Missing CSV titles and abstracts are normalised to empty strings."""
    as_data = ASReviewData.from_file(
        Path("tests", "demo_data", "missing_values.csv"))

    # Records 1 and 3 have no title in the file.
    for row in (1, 3):
        assert as_data.record(row, by_index=True).title == ""

    # Records 0 and 2 have no abstract in the file.
    for row in (0, 2):
        assert as_data.record(row, by_index=True).abstract == ""
Example 6
def test_reader(test_file, n_lines, labels, ignore_col):
    """Read *test_file* and check its row count, labels and column lengths."""
    as_data = ASReviewData.from_file(Path("tests", "demo_data", test_file))
    assert len(as_data) == n_lines

    # Only check the columns this file is expected to provide.
    cols = [col for col in ('title', 'abstract', 'authors', 'keywords')
            if col not in ignore_col]
    if labels is not None:
        cols.append('included')
        assert np.array_equal(as_data.labels, labels)

    for col in cols:
        assert len(as_data.get(col)) == n_lines
Example 7
def test_query(query_strategy, n_features=50, n_sample=100,
               n_instances_list=(0, 1, 5, 50), n_train_idx=(0, 1, 5, 50)):
    """Exercise *query_strategy* over a grid of batch and training sizes.

    A random forest classifier is fitted on random balanced data; the
    query model is then asked for ``n_instances`` records for every
    combination in ``n_instances_list`` x ``n_train_idx`` and each
    result is validated with ``check_integrity``.

    Fixed defects: removed leftover debug ``print`` calls and
    commented-out code, and replaced the mutable (list) default
    arguments with tuples.
    """
    classifier = get_model("rf")
    if query_strategy == "cluster":
        # The cluster strategy needs real texts: repeat the demo corpus
        # until it is at least n_features long, then truncate.
        data_fp = os.path.join("tests", "demo_data", "generic.csv")
        texts = ASReviewData.from_file(data_fp).texts
        while len(texts) < n_features:
            texts = np.append(texts, texts)
        texts = texts[:n_features]
        query_model = get_query_model(
            query_strategy, texts=texts, update_interval=None,
            cluster_size=int(n_sample/3))
        assert isinstance(query_model.param, dict)
    else:
        query_model = get_query_model(query_strategy)

    # Random features with a balanced, shuffled 0/1 label vector.
    X = np.random.rand(n_sample, n_features)
    y = np.concatenate((np.zeros(n_sample//2), np.ones(n_sample//2)), axis=0)
    order = np.random.permutation(n_sample)
    X = X[order]
    y = y[order]
    # e.g. "max_random" mixes the "max" and "random" sources.
    sources = query_strategy.split('_')

    classifier.fit(X, y)

    assert isinstance(query_model.param, dict)
    assert query_model.name == query_strategy

    for n_instances in n_instances_list:
        for n_train in n_train_idx:
            shared = {"query_src": {}, "current_queries": {}}
            train_idx = np.random.choice(
                np.arange(n_sample), n_train, replace=False)
            pool_idx = np.delete(np.arange(n_sample), train_idx)
            query_idx, X_query = query_model.query(X, classifier, pool_idx,
                                                   n_instances, shared)
            check_integrity(query_idx, X_query, X, pool_idx, shared,
                            n_instances, sources)
Example 8
    def get_cached_priors(self, data_name, i_run):
        """Return the prior record indices for run *i_run* of *data_name*.

        Priors are ``self.n_included`` randomly chosen relevant records
        concatenated with ``self.n_excluded`` randomly chosen irrelevant
        ones.  Results are memoised per (data_name, i_run), and the
        parsed dataset is cached alongside so the file is read at most
        once.
        """
        try:
            # Fast path: priors for this run were computed before.
            return self._cache[data_name]["priors"][i_run]
        except KeyError:
            pass

        try:
            as_data = self._cache[data_name]["as_data"]
        except KeyError:
            # Dataset not cached yet: read and parse it from disk.
            data_fp = data_fp_from_name(self.data_dir, data_name)
            as_data = ASReviewData.from_file(data_fp)
            self._cache[data_name]["as_data"] = as_data

        # Seeds the *global* numpy RNG with the run index so each run's
        # prior selection is reproducible (side effect on np.random).
        np.random.seed(i_run)
        ones = np.where(as_data.labels == 1)[0]
        zeros = np.where(as_data.labels == 0)[0]
        included = np.random.choice(ones, self.n_included, replace=False)
        excluded = np.random.choice(zeros, self.n_excluded, replace=False)
        # NOTE(review): this store assumes self._cache[data_name]["priors"]
        # already exists (e.g. nested defaultdicts); with plain dicts a
        # fresh data_name would raise KeyError here -- confirm the
        # cache's construction.
        self._cache[data_name]["priors"][i_run] = np.append(included, excluded)

        return self._cache[data_name]["priors"][i_run]
Example 9
def test_nan_values_ris():
    """Missing RIS text fields become "", missing metadata fields become None."""
    as_data = ASReviewData.from_file(
        Path("tests", "demo_data", "missing_values.ris"))

    # Records 1 and 3 have no title in the file.
    assert as_data.record(1, by_index=True).title == ""
    assert as_data.record(3, by_index=True).title == ""

    # Records 0 and 2 miss abstract, authors, keywords and doi.
    for row in (0, 2):
        record = as_data.record(row, by_index=True)
        assert record.abstract == ""
        assert record.authors is None
        assert record.keywords is None
        assert record.doi is None
Example 10
def test_fuzzy_finder(keywords, paper_id):
    """fuzzy_find should rank *paper_id* first for the given *keywords*."""
    as_data = ASReviewData.from_file(Path("tests", "demo_data", "embase.csv"))

    best_match = as_data.fuzzy_find(keywords)[0]
    assert best_match == paper_id
Example 11
def test_csv_write_data():
    """Round-trip: write the demo dataset back to CSV with updated labels."""
    source_fp = Path("tests", "demo_data", "generic_labels.csv")
    target_fp = Path("tests", "out_data", "generic_out.csv")
    ASReviewData.from_file(source_fp).to_csv(
        target_fp, labels=[[0, 0], [2, 1], [3, 1]])
Example 12
def test_bad_record_id():
    """A file with bad record ids must still yield a unique dataframe index."""
    as_data = ASReviewData.from_file(
        Path("tests", "demo_data", "generic_bad_record_id.csv"))

    index_values = as_data.df.index.values
    assert len(np.unique(index_values)) == len(as_data)