Example #1
0
def get_dataloader_factory(dataloader, source="kipoi"):

    # pull the dataloader & get the dataloader directory
    source = kipoi.config.get_source(source)
    yaml_path = source.pull_dataloader(dataloader)
    dataloader_dir = os.path.dirname(yaml_path)

    # --------------------------------------------
    # Setup dataloader description
    with cd(dataloader_dir):  # move to the dataloader directory temporarily
        dl = DataLoaderDescription.load(os.path.basename(yaml_path))
        file_path, obj_name = tuple(dl.defined_as.split("::"))
        CustomDataLoader = getattr(load_module(file_path), obj_name)

    # check that dl.type is correct
    if dl.type not in AVAILABLE_DATALOADERS:
        raise ValueError("dataloader type: {0} is not in supported dataloaders:{1}".
                         format(dl.type, list(AVAILABLE_DATALOADERS.keys())))
    # check that the extractor arguments match yaml arguments
    if not getargs(CustomDataLoader) == set(dl.args.keys()):
        raise ValueError("DataLoader arguments: \n{0}\n don't match ".format(set(getargs(CustomDataLoader))) +
                         "the specification in the dataloader.yaml file:\n{0}".
                         format(set(dl.args.keys())))
    # check that CustomDataLoader indeed interits from the right DataLoader
    if dl.type in DATALOADERS_AS_FUNCTIONS:
        # transform the functions into objects
        assert isinstance(CustomDataLoader, types.FunctionType)
        CustomDataLoader = AVAILABLE_DATALOADERS[dl.type].from_fn(CustomDataLoader)
    else:
        if not issubclass(CustomDataLoader, AVAILABLE_DATALOADERS[dl.type]):
            raise ValueError("DataLoader does't inherit from the specified dataloader: {0}".
                             format(AVAILABLE_DATALOADERS[dl.type].__name__))
    logger.info('successfully loaded the dataloader from {}'.
                format(os.path.normpath(os.path.join(dataloader_dir, dl.defined_as))))
    # Inherit the attributes from dl
    # TODO - make this more automatic / DRY
    # write a method to load those things?
    CustomDataLoader.type = dl.type
    CustomDataLoader.defined_as = dl.defined_as
    CustomDataLoader.args = dl.args
    CustomDataLoader.info = dl.info
    CustomDataLoader.output_schema = dl.output_schema
    CustomDataLoader.dependencies = dl.dependencies
    CustomDataLoader.postprocessing = dl.postprocessing
    # keep it hidden?
    CustomDataLoader._yaml_path = yaml_path
    CustomDataLoader.source = source
    # TODO - rename?
    CustomDataLoader.source_dir = dataloader_dir

    # Add init_example method
    CustomDataLoader.example_kwargs = example_kwargs(CustomDataLoader.args)

    def init_example(cls):
        return cls(**cls.example_kwargs)
    CustomDataLoader.init_example = classmethod(init_example)
    CustomDataLoader.print_args = classmethod(print_dl_kwargs)

    return CustomDataLoader
Example #2
0
def test_parse_correct_info(info_str, tmpdir):
    info_str = inp_targ + info_str  # add the input: targets headers
    # loading works
    info = CLS.from_config(from_yaml(info_str))
    info.path = str(tmpdir)

    outfiles = example_kwargs(info.args, str(tmpdir))
    assert os.path.exists(outfiles['intervals_file'])
    assert isinstance(info.get_example_kwargs(), dict)
    assert isinstance(example_kwargs(info.args), dict)

    assert isinstance(info.args["intervals_file"].example, RemoteFile)
    assert isinstance(info.args["fasta_file"].example, str)

    # cfg works
    cfg = info.get_config()
    info2 = CLS.from_config(cfg)
    assert str(info) == str(info2)
Example #3
0
def test_parse_correct_info(info_str):
    info_str = inp_targ + info_str  # add the input: targets headers
    # loading works
    info = CLS.from_config(from_yaml(info_str))

    assert isinstance(example_kwargs(info.args), dict)
    # cfg works
    cfg = info.get_config()
    info2 = CLS.from_config(cfg)
    assert str(info) == str(info2)
Example #4
0
    def example_kwargs(cls):
        if cls.args is None:
            raise ValueError("Class description `args` is missing. "
                             "Use `_add_description_factory` to annotate the class")
        if cls.source_dir is None:
            logger.info("Using current directory for source_dir")
            cls.source_dir = os.getcwd()

        # Add init_example method.
        # example_kwargs also downloads files to {dataloader_dir}/dataloader_files
        return example_kwargs(cls.args, os.path.join(cls.source_dir, "downloaded/example_files"))
Example #5
0
    def download_example(cls, output_dir, absolute_path=False, dry_run=False):
        """Download the example files to the desired directory

        # Arguments
          output_dir: output directory where to store the file
          absolute_path: if True, return absolute paths to the
            output directories
          dry_run: if True, return only the file paths without
            actually downloading the files

        # Returns
          dictionary of keyword arguments for the dataloader
        """
        return example_kwargs(cls.args, output_dir, absolute_path=absolute_path, dry_run=dry_run)
Example #6
0
def get_dataloader_factory(dataloader, source="kipoi"):
    """Loads the dataloader

    # Arguments
        dataloader (str): dataloader name
        source (str): source name

    # Returns
    - Instance of class inheriting from `kipoi.data.BaseDataLoader` (like `kipoi.data.Dataset`)
           decorated with additional attributes.

    # Methods
    - __batch_iter(batch_size, num_workers, **kwargs)__
         - Arguments
             - **batch_size**: batch size
             - **num_workers**: Number of workers to use in parallel.
             - ****kwargs**: Other kwargs specific to each dataloader
         - Yields
             - `dict` with `"inputs"`, `"targets"` and `"metadata"`
    - __batch_train_iter(cycle=True, **kwargs)__
         - Arguments
             - **cycle**: if True, cycle indefinitely
             - ****kwargs**: Kwargs passed to `batch_iter()` like `batch_size`
         - Yields
             - tuple of ("inputs", "targets") from the usual dict returned by `batch_iter()`
    - __batch_predict_iter(**kwargs)__
         - Arguments
             - ****kwargs**: Kwargs passed to `batch_iter()` like `batch_size`
         - Yields
             - "inputs" field from the usual dict returned by `batch_iter()`
    - __load_all(**kwargs)__ - load the whole dataset into memory
         - Arguments
             - ****kwargs**: Kwargs passed to `batch_iter()` like `batch_size`
         - Returns
             - `dict` with `"inputs"`, `"targets"` and `"metadata"`
    - **init_example()** - instantiate the dataloader with example kwargs
    - **print_args()** - print information about the required arguments

    # Appended attributes
    - **type** (str): dataloader type (class name)
    - **defined_as** (str): path and dataloader name
    - **args** (list of kipoi.specs.DataLoaderArgument): datalaoder argument description
    - **info** (kipoi.specs.Info): general information about the dataloader
    - **schema** (kipoi.specs.DataloaderSchema): information about the input/output
            data modalities
    - **dependencies** (kipoi.specs.Dependencies): class specifying the dependencies.
          (implements `install` method for running the installation)
    - **name** (str): model name
    - **source** (str): model source
    - **source_dir** (str): local path to model source storage
    - **postprocessing** (dict): dictionary of loaded plugin specifications
    - **example_kwargs** (dict): kwargs for running the provided example
    """

    # pull the dataloader & get the dataloader directory
    source = kipoi.config.get_source(source)
    yaml_path = source.pull_dataloader(dataloader)
    dataloader_dir = os.path.dirname(yaml_path)

    # --------------------------------------------
    # Setup dataloader description
    with cd(dataloader_dir):  # move to the dataloader directory temporarily
        dl = DataLoaderDescription.load(os.path.basename(yaml_path))
        file_path, obj_name = tuple(dl.defined_as.split("::"))
        CustomDataLoader = getattr(load_module(file_path), obj_name)

    # check that dl.type is correct
    if dl.type not in AVAILABLE_DATALOADERS:
        raise ValueError("dataloader type: {0} is not in supported dataloaders:{1}".
                         format(dl.type, list(AVAILABLE_DATALOADERS.keys())))
    # check that the extractor arguments match yaml arguments
    if not getargs(CustomDataLoader) == set(dl.args.keys()):
        raise ValueError("DataLoader arguments: \n{0}\n don't match ".format(set(getargs(CustomDataLoader))) +
                         "the specification in the dataloader.yaml file:\n{0}".
                         format(set(dl.args.keys())))
    # check that CustomDataLoader indeed interits from the right DataLoader
    if dl.type in DATALOADERS_AS_FUNCTIONS:
        # transform the functions into objects
        assert isinstance(CustomDataLoader, types.FunctionType)
        CustomDataLoader = AVAILABLE_DATALOADERS[dl.type].from_fn(CustomDataLoader)
    else:
        if not issubclass(CustomDataLoader, AVAILABLE_DATALOADERS[dl.type]):
            raise ValueError("DataLoader does't inherit from the specified dataloader: {0}".
                             format(AVAILABLE_DATALOADERS[dl.type].__name__))
    logger.info('successfully loaded the dataloader from {}'.
                format(os.path.normpath(os.path.join(dataloader_dir, dl.defined_as))))
    # Inherit the attributes from dl
    # TODO - make this more automatic / DRY
    # write a method to load those things?
    CustomDataLoader.type = dl.type
    CustomDataLoader.defined_as = dl.defined_as
    CustomDataLoader.args = dl.args
    CustomDataLoader.info = dl.info
    CustomDataLoader.output_schema = dl.output_schema
    CustomDataLoader.dependencies = dl.dependencies
    CustomDataLoader.postprocessing = dl.postprocessing
    # keep it hidden?
    CustomDataLoader._yaml_path = yaml_path
    CustomDataLoader.source = source
    # TODO - rename?
    CustomDataLoader.source_dir = dataloader_dir

    # Add init_example method
    CustomDataLoader.example_kwargs = example_kwargs(CustomDataLoader.args)

    def init_example(cls):
        return cls(**cls.example_kwargs)
    CustomDataLoader.init_example = classmethod(init_example)
    CustomDataLoader.print_args = classmethod(print_dl_kwargs)

    return CustomDataLoader