Example #1
def cli_info(command, raw_args):
    """CLI interface to predict
    """
    assert command == "info"
    parser = argparse.ArgumentParser('kipoi {}'.format(command),
                                     description="Prints dataloader" +
                                                 " keyword arguments.")
    add_model(parser)
    add_dataloader(parser, with_args=False)
    args = parser.parse_args(raw_args)

    # --------------------------------------------
    # load model & dataloader
    model = kipoi.get_model(args.model, args.source)

    if args.dataloader is not None:
        dl_info = "dataloader '{0}' from source '{1}'".format(str(args.dataloader), str(args.dataloader_source))
        Dl = kipoi.get_dataloader_factory(args.dataloader, args.dataloader_source)
    else:
        dl_info = "default dataloader for model '{0}' from source '{1}'".format(str(model.name), str(args.source))
        Dl = model.default_dataloader

    print("-" * 80)
    print("Displaying keyword arguments for {0}".format(dl_info))
    print(Dl.print_args())
    print("-" * 80)
Example #2
def cli_test(command, raw_args):
    """Runs test on the model
    """
    assert command == "test"
    # setup the arg-parsing
    parser = argparse.ArgumentParser('kipoi {}'.format(command),
                                     description='script to test model zoo submissions')
    add_model(parser, source="dir")
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Batch size to use in prediction')
    parser.add_argument("-i", "--install_req", action='store_true',
                        help="Install required packages from requirements.txt")
    args = parser.parse_args(raw_args)
    # --------------------------------------------
    if args.install_req:
        kipoi.pipeline.install_model_requirements(args.model,
                                                  args.source,
                                                  and_dataloaders=True)
    mh = kipoi.get_model(args.model, args.source)

    if not mh._sufficient_deps(mh.dependencies):
        # model requirements should be installed
        logger.warn("Required package '{0}' for model type: {1} is not listed in the dependencies".
                    format(mh.MODEL_PACKAGE, mh.type))

    # Load the test files from model source
    # with cd(mh.source_dir):
    mh.pipeline.predict_example(batch_size=args.batch_size)
    # if not match:
    #     # logger.error("Expected targets don't match model predictions")
    #     raise Exception("Expected targets don't match model predictions")

    logger.info('Successfully ran test_predict')
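
# Usage sketch (assumptions: "path/to/model_dir" contains a model.yaml and
# add_model(parser, source="dir") registers that path as a positional argument,
# which this snippet does not show):
cli_test("test", ["path/to/model_dir", "--batch_size", "8"])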
Example #3
def cli_test(command, raw_args):
    """Runs test on the model
    """
    assert command == "test"
    # setup the arg-parsing
    parser = argparse.ArgumentParser(
        'kipoi {}'.format(command),
        description='script to test model zoo submissions. Example usage:\n'
        '`kipoi test model/directory`, where `model/directory` is the '
        'path to a directory containing a model.yaml file.')
    add_model(parser, source="dir")
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='Batch size to use in prediction')
    args = parser.parse_args(raw_args)
    # --------------------------------------------
    mh = kipoi.get_model(args.model, args.source)

    if not mh._sufficient_deps(mh.dependencies):
        # model requirements should be installed
        logger.warn(
            "Required package '{0}' for model type: {1} is not listed in the dependencies"
            .format(mh.MODEL_PACKAGE, mh.type))

    # Load the test files from model source
    # with cd(mh.source_dir):
    mh.pipeline.predict_example(batch_size=args.batch_size)
    # if not match:
    #     # logger.error("Expected targets don't match model predictions")
    #     raise Exception("Expected targets don't match model predictions")

    logger.info('Successfully ran test_predict')
Example #4
def cli_info(command, raw_args):
    """CLI interface to predict
    """
    assert command == "info"
    parser = argparse.ArgumentParser('kipoi {}'.format(command),
                                     description="Prints dataloader" +
                                                 " keyword arguments.")
    parser.add_argument("-i", "--install_req", action='store_true',
                        help="Install required packages from requirements.txt")
    add_model(parser)
    add_dataloader(parser, with_args=False)
    args = parser.parse_args(raw_args)

    # --------------------------------------------
    # install args
    if args.install_req:
        kipoi.pipeline.install_model_requirements(args.model,
                                                  args.source,
                                                  and_dataloaders=True)
    # load model & dataloader
    model = kipoi.get_model(args.model, args.source)

    if args.dataloader is not None:
        dl_info = "dataloader '{0}' from source '{1}'".format(str(args.dataloader), str(args.dataloader_source))
        Dl = kipoi.get_dataloader_factory(args.dataloader, args.dataloader_source)
    else:
        dl_info = "default dataloader for model '{0}' from source '{1}'".format(str(model.name), str(args.source))
        Dl = model.default_dataloader

    print("-" * 80)
    print("Displaying keyword arguments for {0}".format(dl_info))
    print(kipoi.print_dl_kwargs(Dl))
    print("-" * 80)
Example #5
def cli_get_example(command, raw_args):
    """Downloads the example files to the desired directory
    """
    assert command == "get-example"
    # setup the arg-parsing
    parser = argparse.ArgumentParser('kipoi {}'.format(command),
                                     description='Get example files')
    add_model(parser, source="kipoi")
    parser.add_argument("-o", "--output", default="example", required=False,
                        help="Output directory where to store the examples. Default: 'example'")
    args = parser.parse_args(raw_args)
    # --------------------------------------------
    md = kipoi.get_model_descr(args.model, args.source)
    src = kipoi.get_source(args.source)

    # load the default dataloader
    if isinstance(md.default_dataloader, kipoi.specs.DataLoaderImport):
        with cd(src.get_model_dir(args.model)):
            dl_descr = md.default_dataloader.get()
    else:
        # load from directory
        # attach the default dataloader already to the model
        dl_descr = kipoi.get_dataloader_descr(os.path.join(args.model, md.default_dataloader),
                                              source=args.source)

    kwargs = dl_descr.download_example(output_dir=args.output, dry_run=False)

    logger.info("Example files downloaded to: {}".format(args.output))
    logger.info("use the following dataloader kwargs:")
    print(json.dumps(kwargs))
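
# Usage sketch (assumption: add_model(parser, source="kipoi") exposes the model name
# as a positional argument; "DeepSEA/predict" is only a placeholder):
cli_get_example("get-example", ["DeepSEA/predict", "-o", "example_files"])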
Example #6
def cli_info(command, raw_args):
    """CLI interface to predict
    """
    assert command == "info"
    parser = argparse.ArgumentParser('kipoi {}'.format(command),
                                     description="Prints dataloader" +
                                     " keyword arguments.")
    add_model(parser)
    add_dataloader(parser, with_args=False)
    args = parser.parse_args(raw_args)

    # --------------------------------------------
    # load model & dataloader
    md = kipoi.get_model_descr(args.model, args.source)
    src = kipoi.get_source(args.source)

    # load the default dataloader
    try:
        if isinstance(md.default_dataloader, kipoi.specs.DataLoaderImport):
            with cd(src.get_model_dir(args.model)):
                dl_descr = md.default_dataloader.get()
        else:
            # load from directory
            # attach the default dataloader already to the model
            dl_descr = kipoi.get_dataloader_descr(os.path.join(
                args.model, md.default_dataloader),
                                                  source=args.source)
    # if kipoiseq is not installed you get an ImportError
    except ImportError:
        dl_descr = None

    print("-" * 80)
    print("'{0}' from source '{1}'".format(str(args.model), str(args.source)))
    print("")
    print("Model information")
    print("-----------")
    print(md.info.get_config_as_yaml())
    if dl_descr:
        print("Dataloader arguments")
        print("--------------------")
        dl_descr.print_args()
    print("--------------------\n")
    print("Run `kipoi get-example {} -o example` to download example files.\n".
          format(args.model))
Example #7
def cli_get_example(command, raw_args):
    """Downloads the example files to the desired directory
    """
    assert command == "get-example"
    # setup the arg-parsing
    parser = argparse.ArgumentParser('kipoi {}'.format(command),
                                     description='Get example files')
    add_model(parser, source="kipoi")
    parser.add_argument("-o", "--output", default="example", required=False,
                        help="Output directory where to store the examples. Default: 'example'")
    args = parser.parse_args(raw_args)
    # --------------------------------------------
    mh = kipoi.get_model(args.model, args.source)

    kwargs = mh.default_dataloader.download_example(output_dir=args.output, dry_run=False)

    logger.info("Example files downloaded to: {}".format(args.output))
    logger.info("use the following dataloader kwargs:")
    print(json.dumps(kwargs))
Example #8
def cli_score_variants(command, raw_args):
    """CLI interface to predict
    """
    AVAILABLE_FORMATS = ["tsv", "hdf5", "h5"]
    import pybedtools
    assert command == "score_variants"
    parser = argparse.ArgumentParser(
        'kipoi postproc {}'.format(command),
        description='Predict effect of SNVs using ISM.')
    add_model(parser)
    add_dataloader(parser, with_args=True)
    parser.add_argument('-v', '--vcf_path', help='Input VCF.')
    # TODO - rename path to fpath
    parser.add_argument('-a',
                        '--out_vcf_fpath',
                        help='Output annotated VCF file path.',
                        default=None)
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='Batch size to use in prediction')
    parser.add_argument(
        "-n",
        "--num_workers",
        type=int,
        default=0,
        help="Number of parallel workers for loading the dataset")
    parser.add_argument("-i",
                        "--install_req",
                        action='store_true',
                        help="Install required packages from requirements.txt")
    parser.add_argument(
        '-r',
        '--restriction_bed',
        default=None,
        help="Regions for prediction can only be subsets of this bed file")
    parser.add_argument(
        '-o',
        '--output',
        required=False,
        help=
        "Additional output file. File format is inferred from the file path ending"
        + ". Available file formats are: {0}".format(
            ",".join(AVAILABLE_FORMATS)))
    parser.add_argument(
        '-s',
        "--scoring",
        default="diff",
        nargs="+",
        help=
        "Scoring method to be used. Only scoring methods selected in the model yaml file are"
        "available except for `diff` which is always available. Select scoring function by the"
        "`name` tag defined in the model yaml file.")
    parser.add_argument(
        '-k',
        "--scoring_kwargs",
        default="",
        nargs="+",
        help=
        "JSON definition of the kwargs for the scoring functions selected in --scoring. The "
        "definiton can either be in JSON in the command line or the path of a .json file. The "
        "individual JSONs are expected to be supplied in the same order as the labels defined in "
        "--scoring. If the defaults or no arguments should be used define '{}' for that respective "
        "scoring method.")

    args = parser.parse_args(raw_args)

    # extract args for kipoi.variant_effects.predict_snvs
    vcf_path = args.vcf_path
    out_vcf_fpath = args.out_vcf_fpath
    dataloader_arguments = parse_json_file_str(args.dataloader_args)

    # infer the file format of the additional output file (if specified)
    if args.output is not None:
        args.file_format = args.output.split(".")[-1]
        if args.file_format not in AVAILABLE_FORMATS:
            logger.error("File ending: {0} for file {1} not from {2}".format(
                args.file_format, args.output, AVAILABLE_FORMATS))
            sys.exit(1)

        if args.file_format in ["hdf5", "h5"]:
            # deepdish is only needed for hdf5 output
            import deepdish

    # Check that all the folders exist
    file_exists(args.vcf_path, logger)
    if args.out_vcf_fpath is not None:
        dir_exists(os.path.dirname(args.out_vcf_fpath), logger)
    if args.output is not None:
        dir_exists(os.path.dirname(args.output), logger)
    # --------------------------------------------
    # install args
    if args.install_req:
        kipoi.pipeline.install_model_requirements(args.model,
                                                  args.source,
                                                  and_dataloaders=True)
    # load model & dataloader
    model = kipoi.get_model(args.model, args.source)

    if args.dataloader is not None:
        Dl = kipoi.get_dataloader_factory(args.dataloader,
                                          args.dataloader_source)
    else:
        Dl = model.default_dataloader

    if not os.path.exists(vcf_path):
        raise Exception("VCF file does not exist: %s" % vcf_path)

    if not isinstance(args.scoring, list):
        args.scoring = [args.scoring]

    dts = _get_scoring_fns(model, args.scoring, args.scoring_kwargs)

    # Load effect prediction related model info
    model_info = kipoi.postprocessing.variant_effects.ModelInfoExtractor(
        model, Dl)

    # Select the appropriate region generator
    if args.restriction_bed is not None:
        # Select the restricted SNV-centered region generator
        pbd = pybedtools.BedTool(args.restriction_bed)
        vcf_to_region = kipoi.postprocessing.variant_effects.SnvPosRestrictedRg(
            model_info, pbd)
        logger.info(
            'Restriction bed file defined. Only variants within the defined regions '
            'will be tested.')
    elif model_info.requires_region_definition:
        # Select the SNV-centered region generator
        vcf_to_region = kipoi.postprocessing.variant_effects.SnvCenteredRg(
            model_info)
        logger.info('Using variant-centered sequence generation.')
    else:
        # No regions can be defined for the given model, VCF overlap will be inferred, hence tabixed VCF is necessary
        vcf_to_region = None
        # Make sure that the vcf is tabixed
        vcf_path = kipoi.postprocessing.variant_effects.ensure_tabixed_vcf(
            vcf_path)
        logger.info(
            'Dataloader does not accept definition of a regions bed-file. Only VCF variants that lie '
            'within the produced regions can be predicted.')

    if model_info.use_seq_only_rc:
        logger.info(
            'Model SUPPORTS simple reverse complementation of input DNA sequences.'
        )
    else:
        logger.info(
            'Model DOES NOT support simple reverse complementation of input DNA sequences.'
        )

    # Get a vcf output writer if needed
    if out_vcf_fpath is not None:
        logger.info('Annotated VCF will be written to %s.' %
                    str(out_vcf_fpath))
        vcf_writer = kipoi.postprocessing.variant_effects.VcfWriter(
            model, vcf_path, out_vcf_fpath)
    else:
        vcf_writer = None

    keep_predictions = args.output is not None

    res = kipoi.postprocessing.variant_effects.predict_snvs(
        model,
        Dl,
        vcf_path,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        dataloader_args=dataloader_arguments,
        vcf_to_region=vcf_to_region,
        evaluation_function_kwargs={"diff_types": dts},
        sync_pred_writer=vcf_writer,
        return_predictions=keep_predictions)

    # tabular files
    if args.output is not None:
        if args.file_format in ["tsv"]:
            for i, k in enumerate(res):
                # Remove an old file if it is still there...
                if i == 0:
                    try:
                        os.unlink(args.output)
                    except Exception:
                        pass
                with open(args.output, "w") as ofh:
                    ofh.write("KPVEP_%s\n" % k.upper())
                    res[k].to_csv(args.output, sep="\t", mode="a")

        if args.file_format in ["hdf5", "h5"]:
            deepdish.io.save(args.output, res)

    logger.info('Successfully predicted samples')
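
# Usage sketch (hypothetical file names; the positional model argument and the
# --dataloader_args flag are assumed to come from add_model()/add_dataloader(),
# which are not shown in this snippet):
cli_score_variants("score_variants",
                   ["MyModel", "-v", "variants.vcf", "-a", "annotated.vcf",
                    "-o", "scores.tsv", "-s", "diff"])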
Example #9
def cli_predict(command, raw_args):
    """CLI interface to predict
    """
    assert command == "predict"
    parser = argparse.ArgumentParser('kipoi {}'.format(command),
                                     description='Run the model prediction.')
    add_model(parser)
    add_dataloader(parser, with_args=True)
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Batch size to use in prediction')
    parser.add_argument("-n", "--num_workers", type=int, default=0,
                        help="Number of parallel workers for loading the dataset")
    parser.add_argument("-i", "--install_req", action='store_true',
                        help="Install required packages from requirements.txt")
    parser.add_argument("-k", "--keep_inputs", action='store_true',
                        help="Keep the inputs in the output file. ")
    parser.add_argument("-l", "--layer",
                        help="Which output layer to use to make the predictions. If specified," +
                        "`model.predict_activation_on_batch` will be invoked instead of `model.predict_on_batch`")
    parser.add_argument('-o', '--output', required=True, nargs="+",
                        help="Output files. File format is inferred from the file path ending. Available file formats are: " +
                        ", ".join(["." + k for k in writers.FILE_SUFFIX_MAP]))
    args = parser.parse_args(raw_args)

    dataloader_kwargs = parse_json_file_str(args.dataloader_args)

    # setup the files
    if not isinstance(args.output, list):
        args.output = [args.output]
    for o in args.output:
        ending = o.split('.')[-1]
        if ending not in writers.FILE_SUFFIX_MAP:
            logger.error("File ending: {0} for file {1} not from {2}".
                         format(ending, o, writers.FILE_SUFFIX_MAP))
            sys.exit(1)
        dir_exists(os.path.dirname(o), logger)
    # --------------------------------------------
    # install args
    if args.install_req:
        kipoi.pipeline.install_model_requirements(args.model,
                                                  args.source,
                                                  and_dataloaders=True)
    # load model & dataloader
    model = kipoi.get_model(args.model, args.source)

    if args.dataloader is not None:
        Dl = kipoi.get_dataloader_factory(args.dataloader, args.dataloader_source)
    else:
        Dl = model.default_dataloader

    dataloader_kwargs = kipoi.pipeline.validate_kwargs(Dl, dataloader_kwargs)
    dl = Dl(**dataloader_kwargs)

    # setup batching
    it = dl.batch_iter(batch_size=args.batch_size,
                       num_workers=args.num_workers)

    # Setup the writers
    use_writers = []
    for output in args.output:
        ending = output.split('.')[-1]
        W = writers.FILE_SUFFIX_MAP[ending]
        logger.info("Using {0} for file {1}".format(W.__name__, output))
        if ending == "tsv":
            assert W == writers.TsvBatchWriter
            use_writers.append(writers.TsvBatchWriter(file_path=output, nested_sep="/"))
        elif ending == "bed":
            assert W == writers.BedBatchWriter
            use_writers.append(writers.BedBatchWriter(file_path=output,
                                                      dataloader_schema=dl.output_schema.metadata,
                                                      header=True))
        elif ending in ["hdf5", "h5"]:
            assert W == writers.HDF5BatchWriter
            use_writers.append(writers.HDF5BatchWriter(file_path=output))
        else:
            logger.error("Unknown file format: {0}".format(ending))
            sys.exit(1)

    # Loop through the data, make predictions, save the output
    for i, batch in enumerate(tqdm(it)):
        # validate the data schema in the first iteration
        if i == 0 and not Dl.output_schema.compatible_with_batch(batch):
            logger.warn("First batch of data is not compatible with the dataloader schema.")

        # make the prediction
        if args.layer is None:
            pred_batch = model.predict_on_batch(batch['inputs'])
        else:
            pred_batch = model.predict_activation_on_batch(batch['inputs'], layer=args.layer)

        # write out the predictions, metadata (, inputs, targets)
        output_batch = prepare_batch(batch, pred_batch, keep_inputs=args.keep_inputs)
        for writer in use_writers:
            writer.batch_write(output_batch)

    for writer in use_writers:
        writer.close()
    logger.info('Done! Predictions stored in {0}'.format(",".join(args.output)))
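
# Usage sketch (hypothetical model name and dataloader kwargs; the positional model
# argument and the --dataloader_args flag are assumed to come from
# add_model()/add_dataloader(with_args=True), which are defined elsewhere):
cli_predict("predict",
            ["MyModel",
             "--dataloader_args", '{"fasta_file": "hg38.fa", "intervals_file": "regions.bed"}',
             "--batch_size", "64",
             "-o", "preds.tsv", "preds.h5"])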
Example #10
def cli_test(command, raw_args):
    """Runs test on the model
    """
    assert command == "test"
    # setup the arg-parsing
    parser = argparse.ArgumentParser('kipoi {}'.format(command),
                                     description='script to test model zoo submissions. Example usage:\n'
                                     '`kipoi test model/directory`, where `model/directory` is the '
                                     'path to a directory containing a model.yaml file.')
    add_model(parser, source="dir")
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Batch size to use in prediction')
    parser.add_argument("-o", "--output", default=None, required=False,
                        help="Output hdf5 file")
    parser.add_argument("-s", "--skip-expect", action='store_true',
                        help="Skip validating the expected predictions if test.expect field is specified under model.yaml")
    parser.add_argument("-e", "--expect", default=None,
                        help="File path to the hdf5 file of predictions produced by kipoi test -o file.h5 "
                        "or kipoi predict -o file.h5 --keep_inputs. Overrides test.expect in model.yaml")
    args = parser.parse_args(raw_args)
    # --------------------------------------------
    mh = kipoi.get_model(args.model, args.source)

    if not mh._sufficient_deps(mh.dependencies):
        # model requirements should be installed
        logger.warning("Required package '{0}' for model type: {1} is not listed in the dependencies".
                    format(mh.MODEL_PACKAGE, mh.type))

    # Load the test files from model source
    mh.pipeline.predict_example(batch_size=args.batch_size, output_file=args.output)

    if (mh.test.expect is not None or args.expect is not None) \
            and not args.skip_expect and args.output is None:
        if args.expect is not None:
            # `expect` specified from the CLI
            expect = args.expect
        else:
            # `expect` taken from model.yaml
            if isinstance(mh.test.expect, kipoi.specs.RemoteFile):
                # download the file
                output_dir = kipoi.get_source(args.source).get_model_download_dir(args.model)
                makedir_exist_ok(output_dir)
                mh.test.expect = mh.test.expect.get_file(os.path.join(output_dir, 'test.expect.h5'))
            expect = mh.test.expect
        logger.info('Testing if the predictions match the expected ones in the file: {}'.format(expect))
        logger.info('Desired precision (number of matching decimal places): {}'.format(mh.test.precision_decimal))

        # iteratively load the expected file
        expected = kipoi.readers.HDF5Reader(expect)
        expected.open()
        it = expected.batch_iter(batch_size=args.batch_size)
        for i, batch in enumerate(tqdm(it, total=len(expected) // args.batch_size)):
            if i == 0 and ('inputs' not in batch or 'preds' not in batch):
                raise ValueError("test.expect file requires 'inputs' and 'preds' "
                                 "to be specified. Available keys: {}".format(list(expected)))
            pred_batch = mh.predict_on_batch(batch['inputs'])
            # compare to the predictions
            # import ipdb
            # ipdb.set_trace()
            try:
                compare_numpy_dict(pred_batch, batch['preds'], exact=False, decimal=mh.test.precision_decimal)
            except Exception as e:
                logger.error("Model predictions don't match the expected predictions."
                             "expected: {}\nobserved: {}. Exception: {}".format(batch['preds'], pred_batch, e))
                expected.close()
                sys.exit(1)
        expected.close()
        logger.info('All predictions match')
    logger.info('Successfully ran test_predict')
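
# Usage sketch for the expected-predictions round trip (paths are placeholders):
# the first call snapshots the example predictions to HDF5, the second call re-runs
# the example batch and compares it against that snapshot via -e/--expect.
cli_test("test", ["path/to/model_dir", "-o", "expected.h5"])
cli_test("test", ["path/to/model_dir", "-e", "expected.h5"])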
Example #11
def cli_predict(command, raw_args):
    """CLI interface to predict
    """
    assert command == "predict"
    parser = argparse.ArgumentParser('kipoi {}'.format(command),
                                     description='Run the model prediction.')
    add_model(parser)
    add_dataloader(parser, with_args=True)
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Batch size to use in prediction')
    parser.add_argument("-n", "--num_workers", type=int, default=0,
                        help="Number of parallel workers for loading the dataset")
    parser.add_argument("-k", "--keep_inputs", action='store_true',
                        help="Keep the inputs in the output file. ")
    parser.add_argument("-l", "--layer",
                        help="Which output layer to use to make the predictions. If specified," +
                        "`model.predict_activation_on_batch` will be invoked instead of `model.predict_on_batch`")
    parser.add_argument("--singularity", action='store_true',
                        help="Run `kipoi predict` in the appropriate singularity container. "
                        "Containters will get downloaded to ~/.kipoi/envs/ or to "
                        "$SINGULARITY_CACHEDIR if set")
    parser.add_argument('-o', '--output', required=True, nargs="+",
                        help="Output files. File format is inferred from the file path ending. Available file formats are: " +
                        ", ".join(["." + k for k in writers.FILE_SUFFIX_MAP]))
    args = parser.parse_args(raw_args)

    dataloader_kwargs = parse_json_file_str_or_arglist(args.dataloader_args, parser)

    # setup the files
    if not isinstance(args.output, list):
        args.output = [args.output]
    for o in args.output:
        ending = o.split('.')[-1]
        if ending not in writers.FILE_SUFFIX_MAP:
            logger.error("File ending: {0} for file {1} not from {2}".
                         format(ending, o, writers.FILE_SUFFIX_MAP))
            sys.exit(1)
        dir_exists(os.path.dirname(o), logger)

    # singularity_command
    if args.singularity:
        from kipoi.cli.singularity import singularity_command
        logger.info("Running kipoi predict in the singularity container")
        # Drop the singularity flag
        raw_args = [x for x in raw_args if x != '--singularity']
        singularity_command(['kipoi', command] + raw_args,
                            args.model,
                            dataloader_kwargs,
                            output_files=args.output,
                            source=args.source,
                            dry_run=False)
        return None
    # --------------------------------------------
    # load model & dataloader
    model = kipoi.get_model(args.model, args.source)

    if args.dataloader is not None:
        Dl = kipoi.get_dataloader_factory(args.dataloader, args.dataloader_source)
    else:
        Dl = model.default_dataloader

    dataloader_kwargs = kipoi.pipeline.validate_kwargs(Dl, dataloader_kwargs)
    dl = Dl(**dataloader_kwargs)

    # setup batching
    it = dl.batch_iter(batch_size=args.batch_size,
                       num_workers=args.num_workers)

    # Setup the writers
    use_writers = []
    for output in args.output:
        writer = writers.get_writer(output, metadata_schema=dl.get_output_schema().metadata)
        if writer is None:
            logger.error("Unknown file format: {0}".format(ending))
            sys.exit()
        else:
            use_writers.append(writer)
    output_writers = writers.MultipleBatchWriter(use_writers)

    # Loop through the data, make predictions, save the output
    for i, batch in enumerate(tqdm(it)):
        # validate the data schema in the first iteration
        if i == 0 and not Dl.get_output_schema().compatible_with_batch(batch):
            logger.warning("First batch of data is not compatible with the dataloader schema.")

        # make the prediction
        if args.layer is None:
            pred_batch = model.predict_on_batch(batch['inputs'])
        else:
            pred_batch = model.predict_activation_on_batch(batch['inputs'], layer=args.layer)

        # write out the predictions, metadata (, inputs, targets)
        output_batch = prepare_batch(batch, pred_batch, keep_inputs=args.keep_inputs)
        output_writers.batch_write(output_batch)

    output_writers.close()
    logger.info('Done! Predictions stored in {0}'.format(",".join(args.output)))
Example #12
def create_tf_session(visiblegpus, per_process_gpu_memory_fraction=0.45):
    import os
    import tensorflow as tf
    import keras.backend as K
    os.environ['CUDA_VISIBLE_DEVICES'] = str(visiblegpus)
    session_config = tf.ConfigProto()
    session_config.gpu_options.per_process_gpu_memory_fraction = per_process_gpu_memory_fraction
    session = tf.Session(config=session_config)
    K.set_session(session)
    return session
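
# Usage sketch for create_tf_session (assumes the TF 1.x / standalone-Keras stack
# implied by tf.ConfigProto and tf.Session above; GPU id 0 and the 0.3 memory
# fraction are arbitrary example values):
#
#     sess = create_tf_session(visiblegpus=0, per_process_gpu_memory_fraction=0.3)
#     # ... build / fine-tune the transferred Keras model inside this session ...
#     sess.close()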


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Transfer-learn a Keras model from Kipoi')
    add_model(parser)
    add_dataloader(parser, with_args=False)
    parser.add_argument('--dl_kwargs_train',
                        help="training data-loader kwargs")
    parser.add_argument('--dl_kwargs_eval',
                        help="Evaluation data-loader kwargs")
    parser.add_argument('-t',
                        '--tasks',
                        type=int,
                        help='Number of transferred tasks')
    parser.add_argument('-o', '--output', help='Output file directory')
    parser.add_argument('--transfer_to',
                        help='Layer to which to transfer the model')
    parser.add_argument(
        '--freeze_to',
        default=None,
Example #13
def cli_create_mutation_map(command, raw_args):
    """CLI interface to calculate mutation map data 
    """
    assert command == "create_mutation_map"
    parser = argparse.ArgumentParser(
        'kipoi postproc {}'.format(command),
        description='Calculate mutation map data.')
    add_model(parser)
    add_dataloader(parser, with_args=True)
    parser.add_argument(
        '-r',
        '--regions_file',
        help='Region definition as VCF or bed file. Not a required input.')
    # TODO - rename path to fpath
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='Batch size to use in prediction')
    parser.add_argument(
        "-n",
        "--num_workers",
        type=int,
        default=0,
        help="Number of parallel workers for loading the dataset")
    parser.add_argument("-i",
                        "--install_req",
                        action='store_true',
                        help="Install required packages from requirements.txt")
    parser.add_argument(
        '-o',
        '--output',
        required=True,
        help="Output HDF5 file. To be used as input for plotting.")
    parser.add_argument(
        '-s',
        "--scores",
        default="diff",
        nargs="+",
        help=
        "Scoring method to be used. Only scoring methods selected in the model yaml file are"
        "available except for `diff` which is always available. Select scoring function by the"
        "`name` tag defined in the model yaml file.")
    parser.add_argument(
        '-k',
        "--score_kwargs",
        default="",
        nargs="+",
        help=
        "JSON definition of the kwargs for the scoring functions selected in --scores. The "
        "definiton can either be in JSON in the command line or the path of a .json file. The "
        "individual JSONs are expected to be supplied in the same order as the labels defined in "
        "--scores. If the defaults or no arguments should be used define '{}' for that respective "
        "scoring method.")
    parser.add_argument(
        '-l',
        "--seq_length",
        type=int,
        default=None,
        help=
        "Optional parameter: Model input sequence length - necessary if the model does not have a "
        "pre-defined input sequence length.")

    args = parser.parse_args(raw_args)

    # extract args for kipoi.variant_effects.predict_snvs

    dataloader_arguments = parse_json_file_str(args.dataloader_args)

    if args.output is None:
        raise Exception("Output file `--output` has to be set!")

    # --------------------------------------------
    # install args
    if args.install_req:
        kipoi.pipeline.install_model_requirements(args.model,
                                                  args.source,
                                                  and_dataloaders=True)
    # load model & dataloader
    model = kipoi.get_model(args.model, args.source)

    regions_file = os.path.realpath(args.regions_file)
    output = os.path.realpath(args.output)
    with cd(model.source_dir):
        if not os.path.exists(regions_file):
            raise Exception("Regions inputs file does not exist: %s" %
                            args.regions_file)

        # Check that all the folders exist
        file_exists(regions_file, logger)
        dir_exists(os.path.dirname(output), logger)

        if args.dataloader is not None:
            Dl = kipoi.get_dataloader_factory(args.dataloader,
                                              args.dataloader_source)
        else:
            Dl = model.default_dataloader

    if not isinstance(args.scores, list):
        args.scores = [args.scores]

    dts = get_scoring_fns(model, args.scores, args.score_kwargs)

    # Load effect prediction related model info
    model_info = kipoi.postprocessing.variant_effects.ModelInfoExtractor(
        model, Dl)
    manual_seq_len = args.seq_length

    # Select the appropriate region generator and vcf or bed file input
    args.file_format = regions_file.split(".")[-1]
    bed_region_file = None
    vcf_region_file = None
    bed_to_region = None
    vcf_to_region = None
    if args.file_format == "vcf" or regions_file.endswith("vcf.gz"):
        vcf_region_file = regions_file
        if model_info.requires_region_definition:
            # Select the SNV-centered region generator
            vcf_to_region = kipoi.postprocessing.variant_effects.SnvCenteredRg(
                model_info, seq_length=manual_seq_len)
            logger.info('Using variant-centered sequence generation.')
    elif args.file_format == "bed":
        if model_info.requires_region_definition:
            # Select the SNV-centered region generator
            bed_to_region = kipoi.postprocessing.variant_effects.BedOverlappingRg(
                model_info, seq_length=manual_seq_len)
            logger.info('Using bed-file based sequence generation.')
        bed_region_file = regions_file
    else:
        raise Exception("Regions file has to be a .vcf, .vcf.gz or .bed file, got: {0}".format(regions_file))

    if model_info.use_seq_only_rc:
        logger.info(
            'Model SUPPORTS simple reverse complementation of input DNA sequences.'
        )
    else:
        logger.info(
            'Model DOES NOT support simple reverse complementation of input DNA sequences.'
        )

    from kipoi.postprocessing.variant_effects.mutation_map import _generate_mutation_map
    mdmm = _generate_mutation_map(
        model,
        Dl,
        vcf_fpath=vcf_region_file,
        bed_fpath=bed_region_file,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        dataloader_args=dataloader_arguments,
        vcf_to_region=vcf_to_region,
        bed_to_region=bed_to_region,
        evaluation_function_kwargs={'diff_types': dts},
    )
    mdmm.save_to_file(output)

    logger.info('Successfully generated mutation map data')
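
# Usage sketch (hypothetical file names; the positional model argument and the
# --dataloader_args flag are assumed to be registered by add_model()/add_dataloader()):
cli_create_mutation_map("create_mutation_map",
                        ["MyModel", "-r", "regions.vcf",
                         "-o", "mutation_map.hdf5", "-s", "diff"])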
Example #14
def cli_grad_to_file(command, raw_args):
    """ CLI to save seq inputs of grad*input to a bigwig file
    """
    assert command == "gr_inp_to_file"
    parser = argparse.ArgumentParser('kipoi postproc {}'.format(command),
                                     description='Save grad*input in a file.')
    add_model(parser)
    add_dataloader(parser, with_args=True)
    # TODO - rename path to fpath
    parser.add_argument('-f',
                        '--input_file',
                        required=False,
                        help="Input HDF5 file produced from `grad`")
    parser.add_argument('-o',
                        '--output',
                        required=False,
                        help="Output bigwig for bedgraph file")
    parser.add_argument(
        '--sample',
        required=False,
        type=int,
        default=None,
        help=
        "Input line for which the BigWig file should be generated. If not defined all"
        "samples will be written.")
    parser.add_argument(
        '--model_input',
        required=False,
        default=None,
        help=
        "Model input name to be used for plotting. As defined in model.yaml. Can be omitted if"
        "model only has one input.")
    args = parser.parse_args(raw_args)

    # Check that all the folders exist
    dir_exists(os.path.dirname(args.output), logger)
    # --------------------------------------------
    # plotting / writer imports
    import matplotlib.pyplot
    matplotlib.pyplot.switch_backend('agg')
    import matplotlib.pylab as plt
    from kipoi.postprocessing.variant_effects.mutation_map import MutationMapPlotter
    from kipoi.postprocessing.gradient_vis.vis import GradPlotter
    from kipoi.writers import BedGraphWriter

    logger.info('Loading gradient results file and model info...')

    gp = GradPlotter.from_hdf5(args.input_file,
                               model=args.model,
                               source=args.source)

    if args.sample is not None:
        samples = [args.sample]
    else:
        samples = list(range(gp.get_num_samples(args.model_input)))

    if args.output.endswith(".bed") or args.output.endswith(".bedgraph"):
        of_obj = BedGraphWriter(args.output)
    else:
        raise Exception("Output file format not supported!")

    logger.info('Writing...')

    for sample in samples:
        gp.write(sample, model_input=args.model_input, writer_obj=of_obj)

    logger.info('Saving...')

    of_obj.close()

    logger.info('Successfully wrote grad*input to file.')
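
# Usage sketch (placeholders throughout): the HDF5 file produced by the `grad`
# command (see the cli_grad example below) is converted into a bedgraph track here.
# The model-input name "seq" is an assumption; it must match a name from model.yaml.
cli_grad_to_file("gr_inp_to_file",
                 ["MyModel", "-f", "grads.h5",
                  "-o", "grads.bedgraph", "--model_input", "seq"])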
Example #15
def cli_grad(command, raw_args):
    """CLI interface to predict
    """
    from .main import prepare_batch
    from kipoi.model import GradientMixin
    assert command == "grad"
    from tqdm import tqdm
    parser = argparse.ArgumentParser(
        'kipoi {}'.format(command),
        description='Save gradients and inputs to a hdf5 file.')
    add_model(parser)
    add_dataloader(parser, with_args=True)
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='Batch size to use in prediction')
    parser.add_argument(
        "-n",
        "--num_workers",
        type=int,
        default=0,
        help="Number of parallel workers for loading the dataset")
    parser.add_argument("-i",
                        "--install_req",
                        action='store_true',
                        help="Install required packages from requirements.txt")
    parser.add_argument(
        "-l",
        "--layer",
        default=None,
        help="Which output layer to use to make the predictions. If specified,"
        +
        "`model.predict_activation_on_batch` will be invoked instead of `model.predict_on_batch`",
        required=False)
    parser.add_argument(
        "--final_layer",
        help=
        "Alternatively to `--layer` this flag can be used to indicate that the last layer should "
        "be used.",
        action='store_true')
    parser.add_argument(
        "--pre_nonlinearity",
        help=
        "Flag indicating that it should checked whether the selected output is post activation "
        "function. If a non-linear activation function is used attempt to use its input. This "
        "feature is not available for all models.",
        action='store_true')
    parser.add_argument(
        "-f",
        "--filter_idx",
        help=
        "Filter index that should be inspected with gradients. If not set all filters will "
        + "be used.",
        default=None)
    parser.add_argument(
        "-a",
        "--avg_func",
        help=
        "Averaging function to be applied across selected filters (`--filter_idx`) in "
        + "layer `--layer`.",
        choices=GradientMixin.allowed_functions,
        default="sum")
    parser.add_argument(
        '--selected_fwd_node',
        help="If the selected layer has multiple inbound connections in "
        "the graph then those can be selected here with an integer "
        "index. Not necessarily supported by all models.",
        default=None,
        type=int)
    parser.add_argument(
        '-o',
        '--output',
        required=True,
        nargs="+",
        help=
        "Output files. File format is inferred from the file path ending. Available file formats are: "
        + ", ".join(["." + k for k in writers.FILE_SUFFIX_MAP]))
    args = parser.parse_args(raw_args)

    dataloader_kwargs = parse_json_file_str(args.dataloader_args)

    # setup the files
    if not isinstance(args.output, list):
        args.output = [args.output]
    for o in args.output:
        ending = o.split('.')[-1]
        if ending not in writers.FILE_SUFFIX_MAP:
            logger.error("File ending: {0} for file {1} not from {2}".format(
                ending, o, writers.FILE_SUFFIX_MAP))
            sys.exit(1)
        dir_exists(os.path.dirname(o), logger)
    # --------------------------------------------
    # install args
    if args.install_req:
        kipoi.pipeline.install_model_requirements(args.model,
                                                  args.source,
                                                  and_dataloaders=True)

    layer = args.layer
    if layer is None and not args.final_layer:
        raise Exception(
            "A layer has to be selected explicitely using `--layer` or implicitely by using the"
            "`--final_layer` flag.")

    # Not a good idea
    # if layer is not None and isint(layer):
    #    logger.warn("Interpreting `--layer` value as integer layer index!")
    #    layer = int(args.layer)

    # load model & dataloader
    model = kipoi.get_model(args.model, args.source)

    if not isinstance(model, GradientMixin):
        raise Exception("Model does not support gradient calculation.")

    if args.dataloader is not None:
        Dl = kipoi.get_dataloader_factory(args.dataloader,
                                          args.dataloader_source)
    else:
        Dl = model.default_dataloader

    dataloader_kwargs = kipoi.pipeline.validate_kwargs(Dl, dataloader_kwargs)
    dl = Dl(**dataloader_kwargs)

    filter_idx_parsed = None
    if args.filter_idx is not None:
        filter_idx_parsed = parse_filter_slice(args.filter_idx)

    # setup batching
    it = dl.batch_iter(batch_size=args.batch_size,
                       num_workers=args.num_workers)

    # Setup the writers
    use_writers = []
    for output in args.output:
        ending = output.split('.')[-1]
        W = writers.FILE_SUFFIX_MAP[ending]
        logger.info("Using {0} for file {1}".format(W.__name__, output))
        if ending == "tsv":
            assert W == writers.TsvBatchWriter
            use_writers.append(
                writers.TsvBatchWriter(file_path=output, nested_sep="/"))
        elif ending == "bed":
            raise Exception("Please use tsv or hdf5 output format.")
        elif ending in ["hdf5", "h5"]:
            assert W == writers.HDF5BatchWriter
            use_writers.append(writers.HDF5BatchWriter(file_path=output))
        else:
            logger.error("Unknown file format: {0}".format(ending))
            sys.exit(1)

    # Loop through the data, make predictions, save the output
    for i, batch in enumerate(tqdm(it)):
        # validate the data schema in the first iteration
        if i == 0 and not Dl.output_schema.compatible_with_batch(batch):
            logger.warn(
                "First batch of data is not compatible with the dataloader schema."
            )

        # make the prediction
        pred_batch = model.input_grad(batch['inputs'],
                                      filter_idx=filter_idx_parsed,
                                      avg_func=args.avg_func,
                                      layer=layer,
                                      final_layer=args.final_layer,
                                      selected_fwd_node=args.selected_fwd_node,
                                      pre_nonlinearity=args.pre_nonlinearity)

        # write out the predictions, metadata (, inputs, targets)
        # always keep the inputs so that input*grad can be generated!
        # output_batch = prepare_batch(batch, pred_batch, keep_inputs=True)
        output_batch = batch
        output_batch["grads"] = pred_batch
        for writer in use_writers:
            writer.batch_write(output_batch)

    for writer in use_writers:
        writer.close()
    logger.info('Done! Gradients stored in {0}'.format(",".join(args.output)))
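
# Usage sketch (placeholders; the positional model argument and the --dataloader_args
# flag are assumed to come from add_model()/add_dataloader()). --final_layer is used
# here instead of naming a layer explicitly with -l/--layer:
cli_grad("grad",
         ["MyModel",
          "--dataloader_args", '{"fasta_file": "hg38.fa"}',
          "--final_layer",
          "-o", "grads.h5"])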
Example #16
def cli_ism(command, raw_args):
    # TODO: find a way to define the model output selection
    """CLI interface to predict
    """
    # from .main import prepare_batch
    assert command == "ism"
    from tqdm import tqdm
    from .ism import Mutation

    parser = argparse.ArgumentParser('kipoi interpret {}'.format(command),
                                     description='Calculate ISM scores.')
    add_model(parser)
    add_dataloader(parser, with_args=True)
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='Batch size to use in prediction')
    parser.add_argument(
        "-n",
        "--num_workers",
        type=int,
        default=0,
        help="Number of parallel workers for loading the dataset")
    parser.add_argument("--model_input",
                        help="Name of the model input that should be scored.",
                        required=True)
    parser.add_argument(
        '-s',
        "--scores",
        default="diff",
        nargs="+",
        help=
        "Scoring method to be used. Only scoring methods selected in the model yaml file are"
        "available except for `diff` which is always available. Select scoring function by the"
        "`name` tag defined in the model yaml file.")
    parser.add_argument(
        '-k',
        "--score_kwargs",
        default=None,
        nargs="+",
        help=
        "JSON definition of the kwargs for the scoring functions selected in --scores. The "
        "definiton can either be in JSON in the command line or the path of a .json file. The "
        "individual JSONs are expected to be supplied in the same order as the labels defined in "
        "--scores. If the defaults or no arguments should be used define '{}' for that respective "
        "scoring method.")
    parser.add_argument(
        "-c",
        "--category_axis",
        help="Using the selected model input with `--model_input`: Which "
        "dimension of that array contains the one-hot encoded categories?"
        " e.g. for a one-hot encoded DNA-sequence"
        "array with input shape (1000, 4) for a single sample, "
        "`category_dim` is 1, for (4, 1000) `category_dim`"
        "is 0.",
        default=1,
        type=int,
        required=False)
    parser.add_argument(
        "-f",
        "--output_sel_fn",
        help="Define an output selection function in order to return effects"
        "on the output of the function. example definitoin: "
        "`--output_sel_fn my_file.py::my_sel_fn`",
        default=None,
        required=False)
    parser.add_argument(
        '-o',
        '--output',
        required=True,
        nargs="+",
        help="Output files. File format is inferred from the file path ending. "
        "Available file formats are: " +
        ", ".join(["." + k for k in writers.FILE_SUFFIX_MAP]))
    args = parser.parse_args(raw_args)

    dataloader_kwargs = parse_json_file_str(args.dataloader_args)

    # setup the files
    if not isinstance(args.output, list):
        args.output = [args.output]
    for o in args.output:
        ending = o.split('.')[-1]
        if ending not in writers.FILE_SUFFIX_MAP:
            logger.error("File ending: {0} for file {1} not from {2}".format(
                ending, o, writers.FILE_SUFFIX_MAP))
            sys.exit(1)
        dir_exists(os.path.dirname(o), logger)
    # --------------------------------------------
    if not isinstance(args.scores, list):
        args.scores = [args.scores]

    # load model & dataloader
    model = kipoi.get_model(args.model, args.source)

    if args.dataloader is not None:
        Dl = kipoi.get_dataloader_factory(args.dataloader,
                                          args.dataloader_source)
    else:
        Dl = model.default_dataloader

    dataloader_kwargs = kipoi.pipeline.validate_kwargs(Dl, dataloader_kwargs)
    dl = Dl(**dataloader_kwargs)

    # setup batching
    it = dl.batch_iter(batch_size=args.batch_size,
                       num_workers=args.num_workers)

    # Setup the writers
    use_writers = []
    for output in args.output:
        ending = output.split('.')[-1]
        W = writers.FILE_SUFFIX_MAP[ending]
        logger.info("Using {0} for file {1}".format(W.__name__, output))
        if ending == "tsv":
            assert W == writers.TsvBatchWriter
            use_writers.append(
                writers.TsvBatchWriter(file_path=output, nested_sep="/"))
        elif ending == "bed":
            raise Exception("Please use tsv or hdf5 output format.")
        elif ending in ["hdf5", "h5"]:
            assert W == writers.HDF5BatchWriter
            use_writers.append(writers.HDF5BatchWriter(file_path=output))
        else:
            logger.error("Unknown file format: {0}".format(ending))
            sys.exit(1)

    output_sel_fn = None
    if args.output_sel_fn is not None:
        file_path, obj_name = tuple(args.output_sel_fn.split("::"))
        output_sel_fn = getattr(load_module(file_path), obj_name)

    m = Mutation(model,
                 args.model_input,
                 scores=args.scores,
                 score_kwargs=args.score_kwargs,
                 batch_size=args.batch_size,
                 output_sel_fn=output_sel_fn,
                 category_axis=args.category_axis,
                 test_ref_ref=True)

    out_batches = {}

    # Loop through the data, make predictions, save the output..
    # TODO: batch writer fails because it tries to concatenate on highest dimension rather than the lowest!
    for i, batch in enumerate(tqdm(it)):
        # validate the data schema in the first iteration
        if i == 0 and not Dl.output_schema.compatible_with_batch(batch):
            logger.warn(
                "First batch of data is not compatible with the dataloader schema."
            )

        # calculate scores without reference for the moment.
        pred_batch = m.score(batch['inputs'])

        # with the current writers it's not possible to store the scores and the model inputs in the same file
        output_batch = {}
        output_batch["scores"] = pred_batch

        for k in output_batch:
            if k not in out_batches:
                out_batches[k] = []
            out_batches[k].append(output_batch[k])

    # concatenate batches:
    full_output = {
        k: np.concatenate([np.array(el) for el in v])
        for k, v in out_batches.items()
    }
    logger.info('Full output shape: {0}'.format(
        str(full_output["scores"].shape)))

    for writer in use_writers:
        writer.batch_write(full_output)

    for writer in use_writers:
        writer.close()
    logger.info('Done! ISM stored in {0}'.format(",".join(args.output)))
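
# Usage sketch (placeholders; --model_input must name one of the model inputs defined
# in model.yaml, and the positional model argument is assumed to come from add_model()):
cli_ism("ism",
        ["MyModel",
         "--model_input", "seq",
         "-s", "diff",
         "-c", "1",
         "-o", "ism_scores.h5"])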
Example #17
def cli_feature_importance(command, raw_args):
    """CLI interface to predict
    """
    # from .main import prepare_batch
    assert command == "feature_importance"
    parser = argparse.ArgumentParser(
        'kipoi {}'.format(command),
        description='Save gradients and inputs to a hdf5 file.')
    add_model(parser)
    add_dataloader(parser, with_args=True)
    parser.add_argument("--imp_score",
                        help="Importance score name",
                        choices=available_importance_scores())
    parser.add_argument("--imp_score_kwargs", help="Importance score kwargs")
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='Batch size to use in prediction')
    parser.add_argument(
        "-n",
        "--num_workers",
        type=int,
        default=0,
        help="Number of parallel workers for loading the dataset")
    parser.add_argument(
        "-i",
        "--install_req",
        action='store_true',
        help="Install required packages from requirements.txt")
    # TODO - handle the reference-based importance scores...

    # io
    parser.add_argument(
        '-o',
        '--output',
        required=True,
        nargs="+",
        help=
        "Output files. File format is inferred from the file path ending. Available file formats are: "
        + ", ".join(["." + k for k in writers.FILE_SUFFIX_MAP]))
    args = parser.parse_args(raw_args)

    dataloader_kwargs = parse_json_file_str(args.dataloader_args)
    imp_score_kwargs = parse_json_file_str(args.imp_score_kwargs)

    # setup the files
    if not isinstance(args.output, list):
        args.output = [args.output]
    for o in args.output:
        ending = o.split('.')[-1]
        if ending not in writers.FILE_SUFFIX_MAP:
            logger.error("File ending: {0} for file {1} not from {2}".format(
                ending, o, writers.FILE_SUFFIX_MAP))
            sys.exit(1)
        dir_exists(os.path.dirname(o), logger)
    # --------------------------------------------
    # install args
    if args.install_req:
        kipoi.pipeline.install_model_requirements(args.model,
                                                  args.source,
                                                  and_dataloaders=True)

    # load model & dataloader
    model = kipoi.get_model(args.model,
                            args.source,
                            with_dataloader=args.dataloader is None)

    if args.dataloader is not None:
        Dl = kipoi.get_dataloader_factory(args.dataloader,
                                          args.dataloader_source)
    else:
        Dl = model.default_dataloader

    dataloader_kwargs = kipoi.pipeline.validate_kwargs(Dl, dataloader_kwargs)
    dl = Dl(**dataloader_kwargs)

    # get_importance_score
    ImpScore = get_importance_score(args.imp_score)
    if not ImpScore.is_compatible(model):
        raise ValueError("model not compatible with score: {0}".format(
            args.imp_score))
    impscore = ImpScore(model, **imp_score_kwargs)

    # setup batching
    it = dl.batch_iter(batch_size=args.batch_size,
                       num_workers=args.num_workers)

    # Setup the writers
    use_writers = []
    for output in args.output:
        ending = output.split('.')[-1]
        W = writers.FILE_SUFFIX_MAP[ending]
        logger.info("Using {0} for file {1}".format(W.__name__, output))
        if ending == "tsv":
            assert W == writers.TsvBatchWriter
            use_writers.append(
                writers.TsvBatchWriter(file_path=output, nested_sep="/"))
        elif ending == "bed":
            raise Exception("Please use tsv or hdf5 output format.")
        elif ending in ["hdf5", "h5"]:
            assert W == writers.HDF5BatchWriter
            use_writers.append(writers.HDF5BatchWriter(file_path=output))
        else:
            logger.error("Unknown file format: {0}".format(ending))
            sys.exit(1)

    # Loop through the data, make predictions, save the output
    for i, batch in enumerate(tqdm(it)):
        # validate the data schema in the first iteration
        if i == 0 and not Dl.output_schema.compatible_with_batch(batch):
            logger.warn(
                "First batch of data is not compatible with the dataloader schema."
            )

        # make the prediction
        # TODO - handle the reference-based importance scores...
        importance_scores = impscore.score(batch['inputs'])

        # write out the predictions, metadata (, inputs, targets)
        # always keep the inputs so that input*grad can be generated!
        # output_batch = prepare_batch(batch, pred_batch, keep_inputs=True)
        output_batch = batch
        output_batch["importance_scores"] = importance_scores
        for writer in use_writers:
            writer.batch_write(output_batch)

    for writer in use_writers:
        writer.close()
    logger.info('Done! Importance scores stored in {0}'.format(",".join(
        args.output)))
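Because cli_feature_importance parses the raw argument list itself, the command can also be driven from Python instead of the shell. The snippet below is only a minimal sketch: the model name, importance score, dataloader arguments and output path are hypothetical placeholders, and it assumes the function above is importable in its package context.

# Minimal sketch (hypothetical values): drive cli_feature_importance from Python.
raw_args = [
    "MyOrg/MyModel",                      # hypothetical model name, not a real Kipoi model
    "--source", "kipoi",
    "--imp_score", "grad*input",          # assumed to be listed by available_importance_scores()
    "--imp_score_kwargs", "{}",
    "--dataloader_args", '{"fasta_file": "input.fa"}',  # hypothetical dataloader kwargs
    "--batch_size", "16",
    "-o", "importance_scores.h5",         # handled by writers.HDF5BatchWriter
]
cli_feature_importance("feature_importance", raw_args)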
Example #18
def cli_deeplift(command, raw_args):
    """CLI interface to predict
    """
    # TODO: find a way to define the "reference" for a scored sequence.
    # from .main import prepare_batch
    assert command == "deeplift"
    from tqdm import tqdm
    from .referencebased import DeepLift
    from .referencebased import get_mxts_modes
    parser = argparse.ArgumentParser('kipoi interpret {}'.format(command),
                                     description='Calculate DeepLIFT scores.')
    add_model(parser)
    add_dataloader(parser, with_args=True)
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='Batch size to use in prediction')
    parser.add_argument(
        "-n",
        "--num_workers",
        type=int,
        default=0,
        help="Number of parallel workers for loading the dataset")
    parser.add_argument(
        "-l",
        "--layer",
        type=int,
        default=None,
        help="With respect to which layer the scores should be calculated.",
        required=True)
    parser.add_argument(
        "--pre_nonlinearity",
        help=
        "Flag indicating that it should checked whether the selected output is post activation "
        "function. If a non-linear activation function is used attempt to use its input. This "
        "feature is not available for all models.",
        action='store_true')
    parser.add_argument(
        "-f",
        "--filter_idx",
        help="Filter index that should be inspected with gradients",
        default=None,
        required=True,
        type=int)
    parser.add_argument("-m",
                        "--mxts_mode",
                        help="Deeplift score, allowed values are: %s" %
                        str(list(get_mxts_modes().keys())),
                        default='rescale_conv_revealcancel_fc')
    parser.add_argument(
        '-o',
        '--output',
        required=True,
        nargs="+",
        help=
        "Output files. File format is inferred from the file path ending. Available file formats are: "
        + ", ".join(["." + k for k in writers.FILE_SUFFIX_MAP]))
    args = parser.parse_args(raw_args)

    dataloader_kwargs = parse_json_file_str(args.dataloader_args)

    # setup the files
    if not isinstance(args.output, list):
        args.output = [args.output]
    for o in args.output:
        ending = o.split('.')[-1]
        if ending not in writers.FILE_SUFFIX_MAP:
            logger.error("File ending: {0} for file {1} not from {2}".format(
                ending, o, writers.FILE_SUFFIX_MAP))
            sys.exit(1)
        dir_exists(os.path.dirname(o), logger)
    # --------------------------------------------

    layer = args.layer
    # `--layer` is a required argument (see above), so `layer` is always set;
    # the parser defines no `--final_layer` flag, hence no fallback is needed.

    # Not a good idea
    # if layer is not None and isint(layer):
    #    logger.warn("Interpreting `--layer` value as integer layer index!")
    #    layer = int(args.layer)

    # load model & dataloader
    model = kipoi.get_model(args.model, args.source)

    if args.dataloader is not None:
        Dl = kipoi.get_dataloader_factory(args.dataloader,
                                          args.dataloader_source)
    else:
        Dl = model.default_dataloader

    dataloader_kwargs = kipoi.pipeline.validate_kwargs(Dl, dataloader_kwargs)
    dl = Dl(**dataloader_kwargs)

    # setup batching
    it = dl.batch_iter(batch_size=args.batch_size,
                       num_workers=args.num_workers)

    # Setup the writers
    use_writers = []
    for output in args.output:
        ending = output.split('.')[-1]
        W = writers.FILE_SUFFIX_MAP[ending]
        logger.info("Using {0} for file {1}".format(W.__name__, output))
        if ending == "tsv":
            assert W == writers.TsvBatchWriter
            use_writers.append(
                writers.TsvBatchWriter(file_path=output, nested_sep="/"))
        elif ending == "bed":
            raise Exception("Please use tsv or hdf5 output format.")
        elif ending in ["hdf5", "h5"]:
            assert W == writers.HDF5BatchWriter
            use_writers.append(writers.HDF5BatchWriter(file_path=output))
        else:
            logger.error("Unknown file format: {0}".format(ending))
            sys.exit(1)

    d = DeepLift(model,
                 output_layer=args.layer,
                 task_idx=args.filter_idx,
                 preact=args.pre_nonlinearity,
                 mxts_mode=args.mxts_mode,
                 batch_size=args.batch_size)

    # Loop through the data, make predictions, save the output
    for i, batch in enumerate(tqdm(it)):
        # validate the data schema in the first iteration
        if i == 0 and not Dl.output_schema.compatible_with_batch(batch):
            logger.warn(
                "First batch of data is not compatible with the dataloader schema."
            )

        # calculate scores without reference for the moment.
        pred_batch = d.score(batch['inputs'], None)

        # write out the predictions, metadata (, inputs, targets)
        # always keep the inputs so that input*grad can be generated!
        # output_batch = prepare_batch(batch, pred_batch, keep_inputs=True)
        output_batch = batch
        output_batch["scores"] = pred_batch
        for writer in use_writers:
            writer.batch_write(output_batch)

    for writer in use_writers:
        writer.close()
    logger.info('Done! DeepLIFT scores stored in {0}'.format(",".join(args.output)))
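cli_deeplift follows the same pattern, so a programmatic call looks much the same. Again a minimal sketch under stated assumptions: the model name, layer and filter indices, dataloader arguments and output path are hypothetical placeholders that depend entirely on the model being interpreted.

# Minimal sketch (hypothetical values): drive cli_deeplift from Python.
raw_args = [
    "MyOrg/MyModel",                      # hypothetical model name, not a real Kipoi model
    "--source", "kipoi",
    "--layer", "12",                      # hypothetical layer index for this model
    "--filter_idx", "0",                  # score the first output task/filter
    "--mxts_mode", "rescale_conv_revealcancel_fc",       # the default shown above
    "--dataloader_args", '{"fasta_file": "input.fa"}',   # hypothetical dataloader kwargs
    "--batch_size", "16",
    "-o", "deeplift_scores.h5",           # handled by writers.HDF5BatchWriter
]
cli_deeplift("deeplift", raw_args)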