def create_generic_benchmark_from_paths(
    train_lists_of_files: Sequence[Sequence[FileAndLabel]],
    test_lists_of_files: Union[
        Sequence[FileAndLabel], Sequence[Sequence[FileAndLabel]]
    ],
    *,
    other_streams_lists_of_files: Dict[
        str, Sequence[Sequence[FileAndLabel]]
    ] = None,
    task_labels: Sequence[int],
    complete_test_set_only: bool = False,
    train_transform=None,
    train_target_transform=None,
    eval_transform=None,
    eval_target_transform=None,
    other_streams_transforms: Dict[str, Tuple[Any, Any]] = None,
    dataset_type: AvalancheDatasetType = AvalancheDatasetType.UNDEFINED
) -> GenericCLScenario:
    """
    Creates a benchmark instance given a sequence of lists of files. A separate
    dataset will be created for each list. Each of those datasets
    will be considered a separate experience.

    This is very similar to :func:`create_generic_benchmark_from_filelists`,
    with the main difference being that
    :func:`create_generic_benchmark_from_filelists` accepts, for each
    experience, a file list formatted in Caffe-style. On the contrary, this
    accepts a list of tuples where each tuple contains two elements: the full
    path to the pattern and its label. Optionally, the tuple may contain a third
    element describing the bounding box of the element to crop. This last
    bounding box may be useful when trying to extract the part of the image
    depicting the desired element.

    Apart from that, the same limitations of
    :func:`create_generic_benchmark_from_filelists` regarding task labels apply.

    The label of each pattern doesn't have to be an int. Also, a dataset type
    can be defined.

    :param train_lists_of_files: A list of lists. Each list describes the paths
        and labels of patterns to include in that training experience, as
        tuples. Each tuple must contain two elements: the full path to the
        pattern and its class label. Optionally, the tuple may contain a
        third element describing the bounding box to use for cropping (top,
        left, height, width).
    :param test_lists_of_files: A list of lists. Each list describes the paths
        and labels of patterns to include in that test experience, as tuples.
        Each tuple must contain two elements: the full path to the pattern
        and its class label. Optionally, the tuple may contain a third element
        describing the bounding box to use for cropping (top, left, height,
        width).
    :param other_streams_lists_of_files: A dictionary describing the content of
        custom streams. Keys must be valid stream names (letters and numbers,
        not starting with a number) while the value follow the same structure
        of `train_lists_of_files` and `test_lists_of_files` parameters. If this
        dictionary contains the definition for "train" or "test" streams then
        those definition will  override the `train_lists_of_files` and
        `test_lists_of_files` parameters.
    :param task_labels: A list of task labels. Must contain at least a value
        for each experience. Each value describes the task label that will be
        applied to all patterns of a certain experience. For more info on that,
        see the function description.
    :param complete_test_set_only: If True, only the complete test set will
        be returned by the benchmark. This means that the ``test_list_of_files``
        parameter must define a single experience (the complete test set).
        Defaults to False.
    :param train_transform: The transformation to apply to the training data,
        e.g. a random crop, a normalization or a concatenation of different
        transformations (see torchvision.transform documentation for a
        comprehensive list of possible transformations). Defaults to None.
    :param train_target_transform: The transformation to apply to training
        patterns targets. Defaults to None.
    :param eval_transform: The transformation to apply to the test data,
        e.g. a random crop, a normalization or a concatenation of different
        transformations (see torchvision.transform documentation for a
        comprehensive list of possible transformations). Defaults to None.
    :param eval_target_transform: The transformation to apply to test
        patterns targets. Defaults to None.
    :param other_streams_transforms: Transformations to apply to custom
        streams. If no transformations are defined for a custom stream,
        then "train" transformations will be used. This parameter must be a
        dictionary mapping stream names to transformations. The transformations
        must be a two elements tuple where the first element defines the
        X transformation while the second element is the Y transformation.
        Those elements can be None. If this dictionary contains the
        transformations for "train" or "test" streams then those transformations
        will override the `train_transform`, `train_target_transform`,
        `eval_transform` and `eval_target_transform` parameters.
    :param dataset_type: The type of the dataset. Defaults to UNDEFINED.

    :returns: A :class:`GenericCLScenario` instance.
    """

    input_streams = dict(train=train_lists_of_files, test=test_lists_of_files)

    if other_streams_lists_of_files is not None:
        input_streams = {**input_streams, **other_streams_lists_of_files}

    stream_definitions = dict()

    for stream_name, lists_of_files in input_streams.items():
        stream_datasets = []
        for exp_id, list_of_files in enumerate(lists_of_files):
            common_root, exp_paths_list = common_paths_root(list_of_files)
            paths_dataset = PathsDataset(common_root, exp_paths_list)
            stream_datasets.append(
                AvalancheDataset(paths_dataset, task_labels=task_labels[exp_id])
            )

        stream_definitions[stream_name] = stream_datasets

    return create_multi_dataset_generic_benchmark(
        [],
        [],
        other_streams_datasets=stream_definitions,
        train_transform=train_transform,
        train_target_transform=train_target_transform,
        eval_transform=eval_transform,
        eval_target_transform=eval_target_transform,
        complete_test_set_only=complete_test_set_only,
        other_streams_transforms=other_streams_transforms,
        dataset_type=dataset_type,
    )
Ejemplo n.º 2
0
def CTrL(
    stream_name: str,
    save_to_disk: bool = False,
    path: Path = default_dataset_location(""),
    seed: int = None,
    n_tasks: int = None,
):
    """
    Gives access to the Continual Transfer Learning benchmark streams
    introduced in https://arxiv.org/abs/2012.12631.
    :param stream_name: Name of the test stream to generate. Must be one of
    `s_plus`, `s_minus`, `s_in`, `s_out` and `s_pl`.
    :param save_to_disk:  Whether to save each stream on the disk or load
    everything in memory. Setting it to `True` will save memory but takes more
    time on the first generation using the corresponding seed.
    :param path: The path under which the generated stream will be saved if
    save_to_disk is True.
    :param seed: The seed to use to generate the streams. If no seed is given,
    a random one will be used to make sure that the generated stream can
    be reproduced.
    :param n_tasks: The number of tasks to generate. This parameter is only
    relevant for the `s_long` stream, as all other streams have a fixed number
    of tasks.
    :return: A scenario containing 3 streams: train, val and test.
    """
    seed = seed or random.randint(0, sys.maxsize)
    if stream_name != "s_long" and n_tasks is not None:
        raise ValueError("The n_tasks parameter can only be used with the "
                         f'"s_long" stream, asked {n_tasks} for {stream_name}')
    elif stream_name == "s_long" and n_tasks is None:
        n_tasks = 100

    stream = ctrl.get_stream(stream_name, seed)

    if save_to_disk:
        folder = path / "ctrl" / stream_name / f"seed_{seed}"

    # Train, val and test experiences
    exps = [[], [], []]
    for t_id, t in enumerate(tqdm(stream, desc=f"Loading {stream_name}"), ):
        trans = transforms.Normalize(t.statistics["mean"], t.statistics["std"])
        for split, split_name, exp in zip(t.datasets, t.split_names, exps):
            samples, labels = split.tensors
            task_labels = [t.id] * samples.size(0)
            if save_to_disk:
                exp_folder = folder / f"exp_{t_id}" / split_name
                exp_folder.mkdir(parents=True, exist_ok=True)
                files = []
                for i, (sample, label) in enumerate(zip(samples, labels)):
                    sample_path = exp_folder / f"sample_{i}.png"
                    if not sample_path.exists():
                        F.to_pil_image(sample).save(sample_path)
                    files.append((sample_path, label.item()))

                common_root, exp_paths_list = common_paths_root(files)
                paths_dataset = PathsDataset(common_root, exp_paths_list)
                dataset = AvalancheDataset(
                    paths_dataset,
                    task_labels=task_labels,
                    transform=transforms.Compose(
                        [transforms.ToTensor(), trans]),
                )
            else:
                dataset = AvalancheTensorDataset(
                    samples,
                    labels.squeeze(1),
                    task_labels=task_labels,
                    transform=trans,
                )
            exp.append(dataset)
        if stream_name == "s_long" and t_id == n_tasks - 1:
            break

    return dataset_benchmark(
        train_datasets=exps[0],
        test_datasets=exps[2],
        other_streams_datasets=dict(val=exps[1]),
    )