def test_features(feature_extraction, split_ta):
    """Fit a feature-extraction model on the demo data and check its output.

    Verifies the feature matrix has one row per record and a non-empty
    feature dimension, and that the model exposes its name and parameters.
    """
    # Extractors needing optional dependencies are skipped without tensorflow.
    if feature_extraction in REQUIRES_EXTRA_DEPS and not ADVANCED_DEPS["tensorflow"]:
        pytest.skip()

    embedding_fp = os.path.join("tests", "demo_data", "generic.vec")
    data_fp = os.path.join("tests", "demo_data", "generic.csv")
    as_data = ASReviewData.from_file(data_fp)
    texts = as_data.texts

    # Embedding-based extractors additionally need the embedding file.
    extra_kwargs = {}
    if feature_extraction.startswith("embedding-"):
        extra_kwargs["embedding_fp"] = embedding_fp
    model = get_feature_model(feature_extraction, split_ta=split_ta,
                              **extra_kwargs)

    X = model.fit_transform(texts, titles=as_data.title,
                            abstracts=as_data.abstract)

    # One row per record, at least one feature column.
    assert X.shape[0] == len(as_data.title)
    assert X.shape[1] > 0
    assert isinstance(model.param, dict)
    assert model.name == feature_extraction
def __init__(self, data_fp):
    """Load a dataset from *data_fp* and expose its main columns.

    If the file carries no labels, a label vector of LABEL_NA values
    with one entry per record is substituted.
    """
    as_data = ASReviewData.from_file(data_fp)
    self.as_data = as_data
    self.title = as_data.title
    self.abstract = as_data.abstract
    self.keywords = as_data.keywords
    labels = as_data.labels
    # Unlabeled datasets get an all-NA label vector of matching length.
    self.labels = np.full(len(as_data), LABEL_NA) if labels is None else labels
def get_cached_as_data(self, data_name):
    """Return the ASReviewData for *data_name*, loading it on a cache miss.

    The loaded dataset is stored back into the cache so subsequent calls
    are free.
    """
    try:
        return self._cache[data_name]["as_data"]
    except KeyError:
        # Cache miss: resolve the file path, load, and remember the result.
        data_fp = data_fp_from_name(self.data_dir, data_name)
        as_data = ASReviewData.from_file(data_fp)
        self._cache[data_name]["as_data"] = as_data
        return as_data
def test_record_id():
    """A CSV with a record_id column keeps it as the dataframe index."""
    data_fp = Path("tests", "demo_data", "record_id.csv")
    as_data = ASReviewData.from_file(data_fp)

    # Labels come back as a numpy array.
    assert isinstance(as_data.labels, np.ndarray)
    # The record_id column becomes the name of the dataframe index.
    assert as_data.df.index.name == "record_id"
def test_nan_values_csv():
    """Missing titles and abstracts in a CSV are read as empty strings."""
    fp = Path("tests", "demo_data", "missing_values.csv")
    as_data = ASReviewData.from_file(fp)

    # Records 1 and 3 have no title; records 0 and 2 have no abstract.
    for idx in (1, 3):
        assert as_data.record(idx, by_index=True).title == ""
    for idx in (0, 2):
        assert as_data.record(idx, by_index=True).abstract == ""
def test_reader(test_file, n_lines, labels, ignore_col):
    """Read a demo file and check row count, labels, and column lengths."""
    fp = Path("tests", "demo_data", test_file)
    as_data = ASReviewData.from_file(fp)
    assert len(as_data) == n_lines

    # Columns expected to be present, minus any the format is known to drop.
    cols = [c for c in ('title', 'abstract', 'authors', 'keywords')
            if c not in ignore_col]
    if labels is not None:
        cols.append('included')
        assert np.array_equal(as_data.labels, labels)

    # Every surviving column must have one value per record.
    for col in cols:
        assert len(as_data.get(col)) == n_lines
def test_query(query_strategy, n_features=50, n_sample=100,
               n_instances_list=(0, 1, 5, 50), n_train_idx=(0, 1, 5, 50)):
    """Exercise a query strategy over varying pool and query sizes.

    Builds a random balanced dataset, fits a random-forest classifier,
    and checks that every (n_instances, n_train) combination yields a
    query that passes ``check_integrity``.

    Fixes over the previous version: removed leftover debug ``print``
    calls and commented-out dead code, and replaced mutable list default
    arguments with tuples.
    """
    classifier = get_model("rf")
    if query_strategy == "cluster":
        # The cluster strategy needs real texts; repeat the demo texts
        # until there are at least n_features of them.
        data_fp = os.path.join("tests", "demo_data", "generic.csv")
        texts = ASReviewData.from_file(data_fp).texts
        while len(texts) < n_features:
            texts = np.append(texts, texts)
        texts = texts[:n_features]
        query_model = get_query_model(
            query_strategy, texts=texts, update_interval=None,
            cluster_size=int(n_sample / 3))
        assert isinstance(query_model.param, dict)
    else:
        query_model = get_query_model(query_strategy)

    # Balanced random data, shuffled so the two classes are interleaved.
    X = np.random.rand(n_sample, n_features)
    y = np.concatenate((np.zeros(n_sample // 2), np.ones(n_sample // 2)),
                       axis=0)
    order = np.random.permutation(n_sample)
    X = X[order]
    y = y[order]
    sources = query_strategy.split('_')
    classifier.fit(X, y)

    assert isinstance(query_model.param, dict)
    assert query_model.name == query_strategy
    for n_instances in n_instances_list:
        for n_train in n_train_idx:
            shared = {"query_src": {}, "current_queries": {}}
            train_idx = np.random.choice(
                np.arange(n_sample), n_train, replace=False)
            pool_idx = np.delete(np.arange(n_sample), train_idx)
            query_idx, X_query = query_model.query(
                X, classifier, pool_idx, n_instances, shared)
            check_integrity(query_idx, X_query, X, pool_idx, shared,
                            n_instances, sources)
def get_cached_priors(self, data_name, i_run):
    """Return prior record indices for run *i_run* of dataset *data_name*.

    Samples ``n_included`` positive and ``n_excluded`` negative record
    indices with a run-specific seed; both the loaded dataset and the
    sampled priors are cached for reuse.
    """
    try:
        return self._cache[data_name]["priors"][i_run]
    except KeyError:
        pass

    # Reuse the cached dataset if present; otherwise load it from disk.
    try:
        as_data = self._cache[data_name]["as_data"]
    except KeyError:
        data_fp = data_fp_from_name(self.data_dir, data_name)
        as_data = ASReviewData.from_file(data_fp)
        self._cache[data_name]["as_data"] = as_data

    # Seed per run so each run draws a reproducible prior selection.
    np.random.seed(i_run)
    pos_idx = np.where(as_data.labels == 1)[0]
    neg_idx = np.where(as_data.labels == 0)[0]
    included = np.random.choice(pos_idx, self.n_included, replace=False)
    excluded = np.random.choice(neg_idx, self.n_excluded, replace=False)
    priors = np.append(included, excluded)
    self._cache[data_name]["priors"][i_run] = priors
    return priors
def test_nan_values_ris():
    """Missing RIS fields become "" for text and None for the rest."""
    fp = Path("tests", "demo_data", "missing_values.ris")
    as_data = ASReviewData.from_file(fp)

    # Records 1 and 3 are missing their title; text fields fall back to "".
    assert as_data.record(1, by_index=True).title == ""
    assert as_data.record(3, by_index=True).title == ""

    # Records 0 and 2 are missing abstract, authors, keywords and doi.
    for idx in (0, 2):
        record = as_data.record(idx, by_index=True)
        assert record.abstract == ""
        # Non-text fields stay None when absent from the file.
        assert record.authors is None
        assert record.keywords is None
        assert record.doi is None
def test_fuzzy_finder(keywords, paper_id):
    """Fuzzy search on the demo data ranks the expected paper first."""
    demo_fp = Path("tests", "demo_data", "embase.csv")
    as_data = ASReviewData.from_file(demo_fp)

    best_match = as_data.fuzzy_find(keywords)[0]
    assert best_match == paper_id
def test_csv_write_data():
    """Round-trip a labeled dataset to CSV with explicit label overrides."""
    fp_in = Path("tests", "demo_data", "generic_labels.csv")
    fp_out = Path("tests", "out_data", "generic_out.csv")

    asr_data = ASReviewData.from_file(fp_in)
    # NOTE(review): labels look like (record_id, label) pairs — confirm
    # against the to_csv signature.
    asr_data.to_csv(fp_out, labels=[[0, 0], [2, 1], [3, 1]])
def test_bad_record_id():
    """A file with a bad record_id column still yields a unique index."""
    data_fp = Path("tests", "demo_data", "generic_bad_record_id.csv")
    as_data = ASReviewData.from_file(data_fp)

    # Every record must end up with a distinct index value.
    unique_ids = np.unique(as_data.df.index.values)
    assert len(unique_ids) == len(as_data)