Esempio n. 1
0
def test_var_eff_pred_varseq(tmpdir):
    """Run SNV effect prediction for DeepSEA/variantEffects using ``SeqIntervalDl``.

    The model is loaded from the "kipoi" source, variants from
    ``tests/data/variants.vcf`` are scored with a synchronous VCF writer, and
    the test passes when the annotated output VCF exists in ``tmpdir``.
    """
    model_name = "DeepSEA/variantEffects"
    if INSTALL_REQ:
        install_model_requirements(model_name, "kipoi", and_dataloaders=True)

    model = kipoi.get_model(model_name, source="kipoi")
    # SeqIntervalDl is used as the dataloader (preprocessor) in this test.
    Dataloader = SeqIntervalDl

    raw_dl_args = {"intervals_file": "example_files/intervals.bed",
                   "fasta_file": "example_files/hg38_chr22.fa",
                   "required_seq_len": 1000,
                   "alphabet_axis": 1,
                   "dummy_axis": 2,
                   "label_dtype": str}
    # String values are file names and get prefixed with the model's source
    # directory; every other value (ints, the `str` type) is passed through.
    dataloader_arguments = {}
    for arg_name, arg_value in raw_dl_args.items():
        if isinstance(arg_value, str):
            arg_value = model.source_dir + "/" + arg_value
        dataloader_arguments[arg_name] = arg_value

    out_vcf_fpath = str(tmpdir.mkdir("variants_generated").join("out.vcf"))

    # From here on the path returned by `ensure_tabixed_vcf` is used.
    vcf_path = kipoi_veff.ensure_tabixed_vcf("tests/data/variants.vcf")
    model_info = kipoi_veff.ModelInfoExtractor(model, Dataloader)
    writer = kipoi_veff.VcfWriter(model,
                                  vcf_path,
                                  out_vcf_fpath,
                                  standardise_var_id=True)
    vcf_to_region = kipoi_veff.SnvCenteredRg(model_info)

    sp.predict_snvs(model,
                    Dataloader,
                    vcf_path,
                    dataloader_args=dataloader_arguments,
                    batch_size=32,
                    vcf_to_region=vcf_to_region,
                    sync_pred_writer=writer)
    writer.close()

    assert os.path.exists(out_vcf_fpath)
Esempio n. 2
0
    def add_scores(self, snp_vcf_path='../data/snp_vcfs', out_dir=None):
        """Score every VCF found in ``self.snp_vcf_path`` with ``self.model``.

        For each file in that directory the part of the file name before the
        first ``.`` is taken as the chromosome label. The variants are scored
        against ``../data/fasta_files/chr<chrom>.fa`` and the annotated VCF
        is written to ``<out_dir>/<chrom>.vcf``.

        Args:
            snp_vcf_path: Currently not read by this method; the input
                directory is taken from ``self.snp_vcf_path``.
                NOTE(review): confirm whether this argument was meant to
                override the instance attribute.
            out_dir: Output directory. Defaults to
                ``'../data/model_scores/' + self.model_name`` and is created
                when it does not exist.
        """
        if out_dir is None:
            out_dir = '../data/model_scores/' + self.model_name
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        # These only depend on the model, so build them once rather than
        # once per input file.
        Dataloader = self.model.default_dataloader
        model_info = kipoi_veff.ModelInfoExtractor(self.model, Dataloader)
        # vcf_to_region will generate a variant-centered regions when presented a VCF record.
        vcf_to_region = kipoi_veff.SnvCenteredRg(model_info)

        for file_name in os.listdir(self.snp_vcf_path):
            chrom = file_name.split('.')[0]
            vcf_path = self.snp_vcf_path + '/' + file_name
            out_vcf_fpath = out_dir + '/' + chrom + '.vcf'
            print(vcf_path)
            print(out_vcf_fpath)

            dataloader_arguments = {
                "fasta_file": '../data/fasta_files/chr' + chrom + '.fa'
            }

            writer = VcfWriter(self.model, vcf_path, out_vcf_fpath)
            try:
                sp.predict_snvs(
                    self.model,
                    Dataloader,
                    vcf_path,
                    batch_size=32,
                    dataloader_args=dataloader_arguments,
                    vcf_to_region=vcf_to_region,
                    #evaluation_function_kwargs={'diff_types': {'diff': Diff("mean"), 'deepsea_effect': DeepSEA_effect("mean")}},
                    sync_pred_writer=writer)
            finally:
                # Close the writer even when prediction fails so the output
                # file handle is released before the next file is processed.
                writer.close()
Esempio n. 3
0
def test_mutation_map():
    """Generate a mutation map for the rbp test model and compare it to the stored reference.

    The map is built for ``example_files/first_variant.vcf``, saved to a
    temporary HDF5 file inside the model directory, read back and compared
    against ``example_files/first_variant_mm.hdf5``. Plotting is then
    exercised with the non-interactive ``agg`` backend. Skipped on Python 2.
    """
    if sys.version_info[0] == 2:
        pytest.skip("rbp example not supported on python 2 ")

    # Take the rbp model
    model_dir = "tests/models/rbp/"
    if INSTALL_REQ:
        install_model_requirements(model_dir, "dir", and_dataloaders=True)

    model = kipoi.get_model(model_dir, source="dir")
    # The preprocessor
    Dataloader = kipoi.get_dataloader_factory(model_dir, source="dir")

    dataloader_arguments = {
        "fasta_file": "example_files/hg38_chr22.fa",
        "preproc_transformer": "dataloader_files/encodeSplines.pkl",
        "gtf_file": "example_files/gencode_v25_chr22.gtf.pkl.gz",
    }
    # All dataloader arguments are file names relative to the model directory.
    dataloader_arguments = {
        k: model_dir + v
        for k, v in dataloader_arguments.items()
    }

    # Run the actual predictions
    vcf_path = model_dir + "example_files/first_variant.vcf"

    model_info = kipoi_veff.ModelInfoExtractor(model, Dataloader)
    vcf_to_region = kipoi_veff.SnvCenteredRg(model_info)
    mdmm = mm._generate_mutation_map(
        model,
        Dataloader,
        vcf_path,
        dataloader_args=dataloader_arguments,
        evaluation_function=analyse_model_preds,
        batch_size=32,
        vcf_to_region=vcf_to_region,
        evaluation_function_kwargs={'diff_types': {
            'diff': Diff("mean")
        }})

    generated_fpath = "example_files/first_variant_mm_totest.hdf5"
    with cd(model.source_dir):
        mdmm.save_to_file(generated_fpath)
        try:
            from kipoi_veff.utils.generic import read_hdf5
            reference = read_hdf5("example_files/first_variant_mm.hdf5")
            # Read back the file that was just written; loading the reference
            # a second time would make the comparison always succeed.
            obs = read_hdf5(generated_fpath)
            compare_rec(reference[0], obs[0])
            # `import matplotlib` alone does not guarantee that the `pyplot`
            # submodule is loaded, so import it explicitly.
            import matplotlib.pyplot
            matplotlib.pyplot.switch_backend('agg')
            mdmm.plot_mutmap(0, "seq", "diff", "rbp_prb")
        finally:
            # Remove the temporary file even when an assertion above fails.
            os.unlink(generated_fpath)
Esempio n. 4
0
def _get_vcf_to_region(model_info, restriction_bed, seq_length):
    """Pick the region generator used to turn VCF records into model input regions.

    Args:
        model_info: Object exposing ``requires_region_definition``; it is
            also handed to the selected region generator.
        restriction_bed: Path of a bed file restricting which variants are
            tested, or ``None`` for no restriction.
        seq_length: Sequence length passed to ``SnvCenteredRg`` in the
            variant-centered case.

    Returns:
        A ``kipoi_veff.SnvPosRestrictedRg`` when ``restriction_bed`` is given,
        a ``kipoi_veff.SnvCenteredRg`` when the model requires a region
        definition, otherwise ``None``.
    """
    # Select the appropriate region generator
    if restriction_bed is not None:
        # pybedtools is only needed for the restricted case, so import it here.
        import pybedtools
        # Select the restricted SNV-centered region generator
        pbd = pybedtools.BedTool(restriction_bed)
        vcf_to_region = kipoi_veff.SnvPosRestrictedRg(model_info, pbd)
        logger.info(
            'Restriction bed file defined. Only variants in defined regions will be tested. '
            'Only defined regions will be tested.')
    elif model_info.requires_region_definition:
        # Select the SNV-centered region generator
        vcf_to_region = kipoi_veff.SnvCenteredRg(model_info,
                                                 seq_length=seq_length)
        logger.info('Using variant-centered sequence generation.')
    else:
        # No regions can be defined for the given model, VCF overlap will be inferred, hence tabixed VCF is necessary
        vcf_to_region = None
        logger.info(
            'Dataloader does not accept definition of a regions bed-file. Only VCF-variants that lie within '
            'produced regions can be predicted')
    return vcf_to_region
Esempio n. 5
0
# Example script: score the variants in a VCF with a DeepBind model and read
# the annotated VCF back in.
model_name = "DeepBind/Homo_sapiens/TF/D00299.003_SELEX_ATF7"
# get the model
model = kipoi.get_model(model_name)
# get the dataloader factory
Dataloader = model.default_dataloader

vcf_path = "../data/test.vcf"
# The output vcf path, based on the input file name
out_vcf_fpath = vcf_path[:-4] + "%s.vcf" % model_name.replace("/", "_")
# The writer object that will output the annotated VCF
writer = VcfWriter(model, vcf_path, out_vcf_fpath)

# Information extraction from dataloader and model
model_info = kipoi_veff.ModelInfoExtractor(model, Dataloader)
# vcf_to_region will generate a variant-centered regions when presented a VCF record.
vcf_to_region = kipoi_veff.SnvCenteredRg(model_info)

dataloader_arguments = {"fasta_file": "../data/fasta_files/chr1.fa"}

try:
    sp.predict_snvs(
        model,
        Dataloader,
        vcf_path,
        batch_size=32,
        dataloader_args=dataloader_arguments,
        vcf_to_region=vcf_to_region,
        #evaluation_function_kwargs={'diff_types': {'diff': Diff("mean"), 'deepsea_effect': DeepSEA_effect("mean")}},
        sync_pred_writer=writer)
finally:
    # Close the writer before the output file is read back below, so the
    # parser does not read a file that is still open for writing.
    writer.close()

vcf_reader = KipoiVCFParser(out_vcf_fpath)
entries = list(vcf_reader)
#print(pd.DataFrame(entries).head().iloc[:,:7])
Esempio n. 6
0
def cli_create_mutation_map(command, raw_args):
    """CLI interface to calculate mutation map data.

    Parses ``raw_args``, loads the requested model and dataloader, selects a
    region generator based on the extension of the regions file (``.vcf`` /
    ``.vcf.gz`` or ``.bed``), generates the mutation map and saves it to the
    file given by ``--output``.

    Args:
        command: Must be ``"create_mutation_map"``.
        raw_args: Argument list handed to the argument parser.

    Returns:
        None.

    Raises:
        Exception: If no regions file was given, the regions file does not
            exist, or its extension is neither vcf, vcf.gz nor bed.
    """
    assert command == "create_mutation_map"
    parser = argparse.ArgumentParser(
        'kipoi veff {}'.format(command),
        description='Predict effect of SNVs using ISM.')
    add_model(parser)
    add_dataloader(parser, with_args=True)
    parser.add_argument(
        '-r',
        '--regions_file',
        help='Region definition as VCF or bed file. Not a required input.')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='Batch size to use in prediction')
    parser.add_argument(
        "-n",
        "--num_workers",
        type=int,
        default=0,
        help="Number of parallel workers for loading the dataset")
    parser.add_argument("-i",
                        "--install_req",
                        action='store_true',
                        help="Install required packages from requirements.txt")
    parser.add_argument(
        '-o',
        '--output',
        required=True,
        help="Output HDF5 file. To be used as input for plotting.")
    parser.add_argument(
        '-s',
        "--scores",
        default="diff",
        nargs="+",
        help=
        "Scoring method to be used. Only scoring methods selected in the model yaml file are "
        "available except for `diff` which is always available. Select scoring function by the "
        "`name` tag defined in the model yaml file.")
    parser.add_argument(
        '-k',
        "--score_kwargs",
        default="",
        nargs="+",
        help=
        "JSON definition of the kwargs for the scoring functions selected in --scores. The "
        "definiton can either be in JSON in the command line or the path of a .json file. The "
        "individual JSONs are expected to be supplied in the same order as the labels defined in "
        "--scores. If the defaults or no arguments should be used define '{}' for that respective "
        "scoring method.")
    parser.add_argument(
        '-l',
        "--seq_length",
        type=int,
        default=None,
        help=
        "Optional parameter: Model input sequence length - necessary if the model does not have a "
        "pre-defined input sequence length.")

    parser.add_argument(
        "--singularity",
        action='store_true',
        help="Run `kipoi predict` in the appropriate singularity container. "
        "Containters will get downloaded to ~/.kipoi/envs/ or to "
        "$SINGULARITY_CACHEDIR if set")

    args = parser.parse_args(raw_args)

    # extract args for kipoi.variant_effects.predict_snvs
    print("DL ARGS", args.dataloader_args)
    dataloader_arguments = parse_json_file_str_or_arglist(args.dataloader_args)
    #dataloader_arguments = parse_json_file_str(args.dataloader_args)

    if args.output is None:
        raise Exception("Output file `--output` has to be set!")

    if args.singularity:
        from kipoi.cli.singularity import singularity_command
        logger.info("Running kipoi veff in the singularity container")
        # Drop the singularity flag
        raw_args = [x for x in raw_args if x != '--singularity']
        singularity_command(['kipoi', 'veff', command] + raw_args,
                            args.model,
                            dataloader_arguments,
                            output_files=args.output,
                            source=args.source,
                            dry_run=False)
        return None

    # The parser does not mark --regions_file as required, but everything
    # below needs it. Fail with a clear message instead of letting
    # os.path.realpath(None) raise an opaque TypeError.
    if args.regions_file is None:
        raise Exception("Regions file `--regions_file` has to be set!")

    # --------------------------------------------
    # install args
    if args.install_req:
        kipoi.pipeline.install_model_requirements(args.model,
                                                  args.source,
                                                  and_dataloaders=True)
    # load model & dataloader
    model = kipoi.get_model(args.model, args.source)

    # Resolve the paths before changing into the model directory so that
    # relative paths keep referring to the caller's working directory.
    regions_file = os.path.realpath(args.regions_file)
    output = os.path.realpath(args.output)
    with cd(model.source_dir):
        if not os.path.exists(regions_file):
            raise Exception("Regions inputs file does not exist: %s" %
                            args.regions_file)

        # Check that all the folders exist
        file_exists(regions_file, logger)
        dir_exists(os.path.dirname(output), logger)

        if args.dataloader is not None:
            Dl = kipoi.get_dataloader_factory(args.dataloader,
                                              args.dataloader_source)
        else:
            Dl = model.default_dataloader

    # A single default score arrives as a plain string; normalise to a list.
    if not isinstance(args.scores, list):
        args.scores = [args.scores]

    # TODO - why is this function not a method of the model class?
    dts = get_scoring_fns(model, args.scores, args.score_kwargs)

    # Load effect prediction related model info
    model_info = kipoi_veff.ModelInfoExtractor(model, Dl)
    manual_seq_len = args.seq_length

    # Select the appropriate region generator and vcf or bed file input
    args.file_format = regions_file.split(".")[-1]
    bed_region_file = None
    vcf_region_file = None
    bed_to_region = None
    vcf_to_region = None
    if args.file_format == "vcf" or regions_file.endswith("vcf.gz"):
        vcf_region_file = regions_file
        if model_info.requires_region_definition:
            # Select the SNV-centered region generator
            vcf_to_region = kipoi_veff.SnvCenteredRg(model_info,
                                                     seq_length=manual_seq_len)
            logger.info('Using variant-centered sequence generation.')
    elif args.file_format == "bed":
        if model_info.requires_region_definition:
            # Select the bed-overlapping region generator
            bed_to_region = kipoi_veff.BedOverlappingRg(
                model_info, seq_length=manual_seq_len)
            logger.info('Using bed-file based sequence generation.')
        bed_region_file = regions_file
    else:
        raise Exception(
            "Unsupported regions file format: %s. "
            "Expected a .vcf, .vcf.gz or .bed file." % args.regions_file)

    if model_info.use_seq_only_rc:
        logger.info(
            'Model SUPPORTS simple reverse complementation of input DNA sequences.'
        )
    else:
        logger.info(
            'Model DOES NOT support simple reverse complementation of input DNA sequences.'
        )

    from kipoi_veff.mutation_map import _generate_mutation_map
    mdmm = _generate_mutation_map(
        model,
        Dl,
        vcf_fpath=vcf_region_file,
        bed_fpath=bed_region_file,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        dataloader_args=dataloader_arguments,
        vcf_to_region=vcf_to_region,
        bed_to_region=bed_to_region,
        evaluation_function_kwargs={'diff_types': dts},
    )
    mdmm.save_to_file(output)

    logger.info('Successfully generated mutation map data')