def record(self, i, by_index=True):
    """Create a record from an index.

    Arguments
    ---------
    i: int, iterable
        Index of the record, or list of indices.
    by_index: bool
        If True, take the i-th value as used internally by the review.
        If False, take the record with record_id==i.

    Returns
    -------
    PaperRecord:
        The corresponding record if i was an integer, or a list of
        records if i was an iterable.
    """
    # Normalize to a list of indices, remembering whether the caller
    # passed a single index so we can unwrap the result at the end.
    single = not is_iterable(i)
    indices = [i] if single else i

    if by_index:
        # Positional lookup: record_id comes from the index values.
        records = [
            PaperRecord(**self.df.iloc[j],
                        record_id=self.df.index.values[j])
            for j in indices
        ]
    else:
        # Label lookup: the given value IS the record_id.
        records = [
            PaperRecord(**self.df.loc[j, :], record_id=j)
            for j in indices
        ]

    return records[0] if single else records
def get_dataset_metadata(exclude=None, include=None):
    """Collect metadata dicts for all datasets, optionally filtered by group.

    Parameters
    ----------
    exclude: str, iterable
        Group id(s) to leave out of the result, or None for no exclusion.
    include: str, iterable
        Group id(s) to keep in the result, or None to keep all.

    Returns
    -------
    list:
        A list of lists of metadata dicts; versioned datasets contribute
        one inner list per dataset with one dict per version.
    """
    all_datasets = DatasetManager().list(latest_only=False)

    if exclude is not None:
        excluded = exclude if is_iterable(exclude) else [exclude]
        for gid in excluded:
            all_datasets.pop(gid, None)

    if include is not None:
        included = include if is_iterable(include) else [include]
        # Iterate over a snapshot of the keys so popping is safe.
        for gid in list(all_datasets):
            if gid not in included:
                all_datasets.pop(gid, None)

    result_datasets = []
    for gid, datasets in all_datasets.items():
        for ds in datasets:
            if isinstance(ds, BaseVersionedDataSet):
                # One entry per version, grouped under the same dataset.
                versions = []
                for vds in ds.datasets:
                    vds.dataset_id = f"{gid}:{vds.dataset_id}"
                    versions.append(vds.to_dict())
                result_datasets.append(versions)
            else:
                ds.dataset_id = f"{gid}:{ds.dataset_id}"
                result_datasets.append([ds.to_dict()])

    return result_datasets
def get_dataset_metadata(exclude=None, include=None):
    """Collect metadata dicts for all datasets, optionally filtered by group.

    Parameters
    ----------
    exclude: str, iterable
        Group id(s) to leave out of the result, or None for no exclusion.
    include: str, iterable
        Group id(s) to keep in the result, or None to keep all.

    Returns
    -------
    list:
        Metadata for each dataset; versioned datasets contribute a list
        of per-version dicts.
    """
    manager = DatasetManager()
    groups = manager.groups.copy()

    if exclude is not None:
        # make iterable if not the case
        if not is_iterable(exclude):
            exclude = [exclude]
        # Safe: we iterate `exclude`, not the list being mutated.
        for group_id in exclude:
            try:
                groups.remove(group_id)
            except ValueError:
                pass

    if include is not None:
        # make iterable if not the case
        if not is_iterable(include):
            include = [include]
        # BUG FIX: the original removed elements from `groups` while
        # iterating over it, which skips the element following each
        # removal. Rebuild the list instead.
        groups = [group_id for group_id in groups if group_id in include]

    # get datasets
    all_datasets = manager.list(group_name=groups, latest_only=False,
                                raise_on_error=True)

    result_datasets = []
    for group_id, data_list in all_datasets.items():
        for dataset in data_list:
            if isinstance(dataset, BaseVersionedDataSet):
                cur_data = []
                for vdata in dataset.datasets:
                    vdata.dataset_id = f"{group_id}:{vdata.dataset_id}"
                    cur_data.append(vdata.to_dict())
                result_datasets.append(cur_data)
            else:
                dataset.dataset_id = f"{group_id}:{dataset.dataset_id}"
                # NOTE(review): this appends a bare dict while the
                # versioned branch appends a list of dicts — the sibling
                # implementation wraps it as [dataset.to_dict()]; confirm
                # which shape callers expect before changing.
                result_datasets.append(dataset.to_dict())

    return result_datasets
def find(self, dataset_name):
    """Locate a dataset by file path, 'group:name' id, or bare name.

    Returns the matching dataset, a list of datasets when given a
    non-string iterable, or None when nothing matches. Raises
    ValueError when a bare name matches in more than one group.
    """
    # Non-string iterables: resolve each element independently.
    if is_iterable(dataset_name):
        return [self.find(name) for name in dataset_name]

    # A path to an existing file becomes a dataset directly.
    if Path(dataset_name).is_file():
        return BaseDataSet(dataset_name)

    dataset_name = str(dataset_name)

    # Explicit "group:name" form: delegate to that group if it exists.
    parts = dataset_name.split(":")
    if len(parts) == 2:
        group, name = parts
        if group in self.all_datasets:
            return self.all_datasets[group].find(name)

    # Otherwise search every group for the name.
    matches = {}
    for group, dataset in self.all_datasets.items():
        found = dataset.find(dataset_name)
        if found is not None:
            matches[group] = found

    if len(matches) > 1:
        raise ValueError(
            f"Multiple datasets found: {list(matches)}."
            "Use DATAGROUP:DATASET format to specify which one"
            " you want.")
    if len(matches) == 1:
        return next(iter(matches.values()))
    return None
def get_dataset(dataset_id):
    """Resolve a dataset id (optionally 'group:id') into a dataset.

    Parameters
    ----------
    dataset_id: str, iterable
        Dataset identifier, 'DATAGROUP:DATASET' identifier, or a
        non-string iterable of such identifiers.

    Returns
    -------
    The matching dataset (a list of them for an iterable input), or a
    BaseDataSet built from the id when no registered dataset matches.

    Raises
    ------
    ValueError
        If the id matches datasets in more than one group.
    """
    if is_iterable(dataset_id):
        return [get_dataset(data) for data in dataset_id]

    all_datasets = get_available_datasets()

    data_group = None
    try:
        split_dataset_id = dataset_id.split(":")
        if len(split_dataset_id) == 2:
            data_group = split_dataset_id[0]
            dataset_id = split_dataset_id[1]
    except TypeError:
        # dataset_id is not a string; treat it as a plain identifier.
        pass

    # BUG FIX: the original keyed matches by dataset_id, which is the
    # same for every group, so a dataset present in several groups was
    # silently resolved to the last group instead of raising. Key the
    # matches by group so ambiguity is actually detected.
    my_datasets = {}
    for group, cur_datasets in all_datasets.items():
        if data_group is not None and group != data_group:
            continue
        if dataset_id in cur_datasets:
            my_datasets[group] = cur_datasets[dataset_id]

    if len(my_datasets) == 1:
        return next(iter(my_datasets.values()))
    if len(my_datasets) > 1:
        raise ValueError(f"Multiple datasets found: {list(my_datasets)}."
                         "Use DATAGROUP:DATASET format to specify which one"
                         " you want.")

    # Fall back to interpreting the id as a direct dataset source.
    return BaseDataSet(dataset_id)
def list(self, group_name=None, latest_only=True):
    """List the available datasets.

    Parameters
    ----------
    group_name: str, iterable
        List only datasets in the group(s) with that name. Lists all
        groups if group_name is None.
    latest_only: bool
        Only include the latest version of the dataset.

    Returns
    -------
    dict:
        Dictionary with group names as keys and lists of datasets as
        values.
    """
    # Normalize the selection to a list of group names.
    if group_name is None:
        selected = list(self.all_datasets)
    elif is_iterable(group_name):
        selected = group_name
    else:
        selected = [group_name]

    result = {}
    for name in selected:
        result[name] = self.all_datasets[name].list(latest_only=latest_only)
    return result
def find(self, dataset_name):
    """Find a dataset.

    Parameters
    ----------
    dataset_name: str, iterable
        Look for this term in aliases within any dataset. A group can
        be specified by setting dataset_name to 'group_id:dataset_id'.
        This can be helpful if the dataset_id is not unique.
        The dataset_name can also be a non-string iterable, in which
        case a list will be returned with all terms.
        Dataset_ids should not contain semicolons (:).

    Returns
    -------
    BaseDataSet, VersionedDataSet:
        If the dataset with that name is found, return it
        (or a list there of).

    Raises
    ------
    ValueError
        If the name matches datasets in more than one group.
    FileNotFoundError
        If no file or dataset with that name exists.
    """
    # BUG FIX (docs): the docstring claimed "Return None if the dataset
    # could not be found", but the code raises FileNotFoundError; the
    # docstring now matches the actual behavior.

    # If dataset_name is a non-string iterable, return a list.
    if is_iterable(dataset_name):
        return [self.find(x) for x in dataset_name]

    # If dataset_name is a valid path, create a dataset from it.
    if Path(dataset_name).is_file():
        return BaseDataSet(dataset_name)

    dataset_name = str(dataset_name)

    # Split into group/dataset if possible.
    split_dataset_id = dataset_name.split(":")
    if len(split_dataset_id) == 2:
        data_group = split_dataset_id[0]
        split_dataset_name = split_dataset_id[1]
        if data_group in self.all_datasets:
            return self.all_datasets[data_group].find(split_dataset_name)

    # Look through all available/installed groups for the name.
    all_results = {}
    for group_name, dataset in self.all_datasets.items():
        result = dataset.find(dataset_name)
        if result is not None:
            all_results[group_name] = result

    # If we have multiple results, throw an error.
    if len(all_results) > 1:
        raise ValueError(
            f"Multiple datasets found: {list(all_results)}."
            "Use DATAGROUP:DATASET format to specify which one"
            " you want.")

    if len(all_results) == 1:
        return list(all_results.values())[0]

    # Could not find the dataset anywhere.
    raise FileNotFoundError(
        f"File or dataset does not exist: '{dataset_name}'")
def list(self, group_name=None, latest_only=True):
    """Return the available datasets per group.

    Parameters
    ----------
    group_name: str, iterable
        Restrict the listing to the group(s) with that name; all
        groups when None.
    latest_only: bool
        Only include the latest version of each dataset.

    Returns
    -------
    dict:
        Mapping of group name to the group's dataset listing.
    """
    if group_name is None:
        names = list(self.all_datasets)
    else:
        names = group_name if is_iterable(group_name) else [group_name]

    return {
        name: self.all_datasets[name].list(latest_only=latest_only)
        for name in names
    }
def list(self, group_name=None, latest_only=True, raise_on_error=False):
    """List the available datasets.

    Parameters
    ----------
    group_name: str, iterable
        List only datasets in the group(s) with that name. Lists all
        groups if group_name is None.
    latest_only: bool
        Only include the latest version of the dataset.
    raise_on_error: bool
        Raise error when entry point can't be loaded.

    Returns
    -------
    dict:
        Dictionary with group names as keys and lists of datasets as
        values. Groups whose entry point fails to load are omitted
        when raise_on_error is False.
    """
    if group_name is None:
        group_names = self.groups
    elif not is_iterable(group_name):
        group_names = [group_name]
    else:
        group_names = group_name

    dataset_groups = get_entry_points('asreview.datasets')

    dataset_list = {}
    for group in group_names:
        try:
            dataset_list[group] = \
                dataset_groups[group].load()().list(latest_only=latest_only)
        except Exception:
            # Entry points are third-party plugins, so a broad catch is
            # deliberate here: a broken plugin shouldn't break listing.
            if raise_on_error:
                # BUG FIX (idiom): bare `raise` re-raises the active
                # exception with its original traceback intact, unlike
                # the original `raise err`.
                raise

    return dataset_list