예제 #1
0
    def record(self, i, by_index=True):
        """Build one or more PaperRecord objects from rows of ``self.df``.

        Arguments
        ---------
        i: int, iterable
            A single row key, or an iterable of row keys.
        by_index: bool
            If True, every key is a position: the row is read with
            ``self.df.iloc`` and the record id is taken from the frame's
            index at that position.
            If False, every key is a label: the row is read with
            ``self.df.loc`` and the key itself is used as the record id.

        Returns
        -------
        PaperRecord:
            A single record if i is not iterable, otherwise a list of
            records in the order of the given keys.
        """
        many = is_iterable(i)
        keys = i if many else [i]

        records = []
        for key in keys:
            if by_index:
                row = self.df.iloc[key]
                record_id = self.df.index.values[key]
            else:
                row = self.df.loc[key, :]
                record_id = key
            # The row's fields are passed as keyword arguments.
            records.append(PaperRecord(**row, record_id=record_id))

        return records if many else records[0]
예제 #2
0
def get_dataset_metadata(exclude=None, include=None):
    """Collect metadata dictionaries for all (optionally filtered) datasets.

    Parameters
    ----------
    exclude: str, iterable
        Group id(s) to drop from the listing. Unknown ids are ignored.
    include: str, iterable
        If given, only groups whose id is contained in it are kept.

    Returns
    -------
    list:
        One inner list per dataset. A BaseVersionedDataSet yields one dict
        per entry in its ``datasets`` attribute; any other dataset yields a
        one-element list. As a side effect, each ``dataset_id`` is rewritten
        to ``group_id:dataset_id`` before ``to_dict()`` is called.
    """
    datasets_by_group = DatasetManager().list(latest_only=False)

    if exclude is not None:
        excluded = exclude if is_iterable(exclude) else [exclude]
        for group_id in excluded:
            datasets_by_group.pop(group_id, None)

    if include is not None:
        included = include if is_iterable(include) else [include]
        # Iterate over a snapshot of the keys so popping is safe.
        for group_id in tuple(datasets_by_group):
            if group_id in included:
                continue
            datasets_by_group.pop(group_id, None)

    metadata = []
    for group_id, group_datasets in datasets_by_group.items():
        for dataset in group_datasets:
            if not isinstance(dataset, BaseVersionedDataSet):
                dataset.dataset_id = f"{group_id}:{dataset.dataset_id}"
                metadata.append([dataset.to_dict()])
                continue

            versions = []
            for version in dataset.datasets:
                version.dataset_id = f"{group_id}:{version.dataset_id}"
                versions.append(version.to_dict())
            metadata.append(versions)

    return metadata
예제 #3
0
def get_dataset_metadata(exclude=None, include=None):
    """Collect metadata dictionaries for the datasets of the selected groups.

    Parameters
    ----------
    exclude: str, iterable
        Group id(s) to leave out. Ids that are not in the manager's groups
        are ignored.
    include: str, iterable
        If given, only groups whose id is contained in it are kept.

    Returns
    -------
    list:
        One entry per dataset: a list of dicts (one per entry in
        ``datasets``) for a BaseVersionedDataSet, and a single dict for any
        other dataset. As a side effect, each ``dataset_id`` is rewritten to
        ``group_id:dataset_id`` before ``to_dict()`` is called.
    """
    manager = DatasetManager()
    groups = manager.groups.copy()

    if exclude is not None:

        # make iterable if not the case
        if not is_iterable(exclude):
            exclude = [exclude]

        # drop excluded groups; ids that are not present are ignored
        for group_id in exclude:
            try:
                groups.remove(group_id)
            except ValueError:
                pass

    if include is not None:

        # make iterable if not the case
        if not is_iterable(include):
            include = [include]

        # Build a new list instead of calling remove() while iterating:
        # removing during iteration skips the element that follows each
        # removed one, which would leave unwanted groups in the result.
        groups = [group_id for group_id in groups if group_id in include]

    # get datasets
    all_datasets = manager.list(group_name=groups,
                                latest_only=False,
                                raise_on_error=True)

    result_datasets = []
    for group_id, data_list in all_datasets.items():
        for dataset in data_list:
            if isinstance(dataset, BaseVersionedDataSet):
                cur_data = []
                for vdata in dataset.datasets:
                    vdata.dataset_id = f"{group_id}:{vdata.dataset_id}"
                    cur_data.append(vdata.to_dict())
                result_datasets.append(cur_data)
            else:
                dataset.dataset_id = f"{group_id}:{dataset.dataset_id}"
                # NOTE(review): this appends a bare dict while the versioned
                # branch appends a list of dicts -- confirm callers expect
                # the mixed shape.
                result_datasets.append(dataset.to_dict())

    return result_datasets
예제 #4
0
    def find(self, dataset_name):
        """Resolve a dataset name to a dataset object.

        Parameters
        ----------
        dataset_name: str, iterable
            Name to look for. An iterable (as judged by ``is_iterable``) is
            resolved element by element. A path to an existing file is
            wrapped in a BaseDataSet. A name of the form ``group:dataset``
            is looked up in that group only, provided the group exists.

        Returns
        -------
        The dataset found, a list of results for iterable input, or None
        if no group knows the name.

        Raises
        ------
        ValueError
            If more than one group has a dataset with that name.
        """
        if is_iterable(dataset_name):
            return [self.find(name) for name in dataset_name]

        if Path(dataset_name).is_file():
            return BaseDataSet(dataset_name)

        dataset_name = str(dataset_name)

        # "group:dataset" form: search only inside the named group.
        parts = dataset_name.split(":")
        if len(parts) == 2 and parts[0] in self.all_datasets:
            return self.all_datasets[parts[0]].find(parts[1])

        # Otherwise ask every group for the full name.
        all_results = {}
        for group_name, group in self.all_datasets.items():
            found = group.find(dataset_name)
            if found is not None:
                all_results[group_name] = found

        if not all_results:
            return None

        if len(all_results) == 1:
            (only_match,) = all_results.values()
            return only_match

        raise ValueError(
            f"Multiple datasets found: {list(all_results)}."
            "Use DATAGROUP:DATASET format to specify which one"
            " you want.")
예제 #5
0
def get_dataset(dataset_id):
    """Look up a dataset by id, optionally given as ``DATAGROUP:DATASET``.

    Parameters
    ----------
    dataset_id: str, iterable
        Id of the dataset. An iterable (as judged by ``is_iterable``) is
        resolved element by element. If the id splits on ":" into exactly
        two parts, the first part restricts the search to that group.

    Returns
    -------
    The matching dataset from the available groups, a list of results for
    iterable input, or ``BaseDataSet(dataset_id)`` when no group contains
    the id.

    Raises
    ------
    ValueError
        If more than one group contains the id.
    """
    if is_iterable(dataset_id):
        return [get_dataset(data) for data in dataset_id]

    all_datasets = get_available_datasets()

    data_group = None
    try:
        split_dataset_id = dataset_id.split(":")
    except (AttributeError, TypeError):
        # Not a splittable string: treat the whole value as the dataset id.
        pass
    else:
        if len(split_dataset_id) == 2:
            data_group, dataset_id = split_dataset_id

    # Key the matches by group. Keying by dataset_id made hits from
    # different groups overwrite each other, so ambiguity was never seen.
    my_datasets = {}
    for group, cur_datasets in all_datasets.items():
        if data_group is not None and group != data_group:
            continue
        if dataset_id in cur_datasets:
            my_datasets[group] = cur_datasets[dataset_id]

    if len(my_datasets) > 1:
        raise ValueError(f"Multiple datasets found: {list(my_datasets)}."
                         "Use DATAGROUP:DATASET format to specify which one"
                         " you want.")
    if my_datasets:
        return next(iter(my_datasets.values()))

    # No group knows this id; hand it to BaseDataSet directly.
    return BaseDataSet(dataset_id)
예제 #6
0
    def list(self, group_name=None, latest_only=True):
        """Return the datasets of the requested group(s).

        Parameters
        ----------
        group_name: str, iterable
            Name of a single group, or an iterable of group names. When
            None, every group in ``self.all_datasets`` is listed.
        latest_only: bool
            Forwarded to each group's own ``list`` method.

        Returns
        -------
        dict:
            Maps each requested group name to the result of that group's
            ``list(latest_only=latest_only)`` call.
        """
        if group_name is None:
            selected = list(self.all_datasets)
        elif is_iterable(group_name):
            selected = group_name
        else:
            selected = [group_name]

        listing = {}
        for name in selected:
            listing[name] = self.all_datasets[name].list(
                latest_only=latest_only)
        return listing
예제 #7
0
    def find(self, dataset_name):
        """Find a dataset.

        Parameters
        ----------
        dataset_name: str, iterable
            Look for this term within any dataset group. A group can
            be specified by setting dataset_name to 'group_id:dataset_id'.
            This can be helpful if the dataset_id is not unique.
            The dataset_name can also be a non-string iterable, in which case
            a list will be returned with the result for each term.
            A path to an existing file is wrapped in a BaseDataSet.
            Dataset_ids should not contain colons (:).

        Returns
        -------
        BaseDataSet, VersionedDataSet:
            The dataset with that name (or a list thereof).

        Raises
        ------
        ValueError
            If more than one group has a dataset with that name.
        FileNotFoundError
            If the name is neither an existing file nor found in any group.
        """
        # If dataset_name is a non-string iterable, return a list.
        if is_iterable(dataset_name):
            return [self.find(x) for x in dataset_name]

        # If dataset_name is a valid path, create a dataset from it.
        if Path(dataset_name).is_file():
            return BaseDataSet(dataset_name)

        dataset_name = str(dataset_name)

        # Split into group/dataset if possible. If the group is unknown,
        # fall through and search every group for the full name.
        split_dataset_id = dataset_name.split(":")
        if len(split_dataset_id) == 2:
            data_group, split_dataset_name = split_dataset_id
            if data_group in self.all_datasets:
                return self.all_datasets[data_group].find(split_dataset_name)

        # Look through all available/installed groups for the name.
        all_results = {}
        for group_name, dataset in self.all_datasets.items():
            result = dataset.find(dataset_name)
            if result is not None:
                all_results[group_name] = result

        # If we have multiple results, throw an error.
        if len(all_results) > 1:
            raise ValueError(
                f"Multiple datasets found: {list(all_results)}."
                "Use DATAGROUP:DATASET format to specify which one"
                " you want.")

        if len(all_results) == 1:
            return next(iter(all_results.values()))

        # Could not find the dataset anywhere: raise instead of returning
        # None.
        raise FileNotFoundError(
            f"File or dataset does not exist: '{dataset_name}'")
예제 #8
0
    def list(self, group_name=None, latest_only=True):
        """Return a dict mapping group names to that group's listing.

        ``group_name`` may be None (all groups in ``self.all_datasets``),
        a single name, or an iterable of names. ``latest_only`` is
        forwarded to each group's own ``list`` method.
        """
        if group_name is None:
            wanted = list(self.all_datasets)
        else:
            wanted = group_name if is_iterable(group_name) else [group_name]

        return {
            name: self.all_datasets[name].list(latest_only=latest_only)
            for name in wanted
        }
예제 #9
0
    def list(self, group_name=None, latest_only=True, raise_on_error=False):
        """Return the datasets of the requested group(s).

        Parameters
        ----------
        group_name: str, iterable
            Name of a single group, or an iterable of group names. When
            None, the names in ``self.groups`` are used.
        latest_only: bool
            Forwarded to each group's own ``list`` method.
        raise_on_error: bool
            If True, re-raise any exception that occurs while looking up,
            loading, instantiating or listing a group. If False, such a
            group is left out of the result.

        Returns
        -------
        dict:
            Maps each successfully processed group name to the result of
            that group's ``list(latest_only=latest_only)`` call.
        """
        if group_name is None:
            selected = self.groups
        elif is_iterable(group_name):
            selected = group_name
        else:
            selected = [group_name]

        entry_points = get_entry_points('asreview.datasets')

        listing = {}
        for name in selected:
            try:
                # Load the entry point, instantiate it, then list it.
                group_factory = entry_points[name].load()
                listing[name] = group_factory().list(latest_only=latest_only)
            except Exception as err:
                # Best effort: skip this group unless the caller asked
                # for errors to be raised.
                if raise_on_error:
                    raise err

        return listing