Example #1
0
def test__overlap_vcf_region():
    """_overlap_vcf_region returns the VCF records overlapping query regions.

    Exercises both region representations (plain dict and GenomicRanges),
    first with a single region spanning all variants, then with multiple
    regions (so some records are reported more than once), each with and
    without indel exclusion.
    """
    vcf_path = kipoi.postprocessing.variant_effects.ensure_tabixed_vcf(
        "examples/rbp/example_files/variants.vcf")
    vcf_obj = cyvcf2.VCF(vcf_path)
    # Materialize all records once for reference; iterating consumes the
    # handle, so re-open it for the actual queries below.
    all_records = [rec for rec in vcf_obj]
    vcf_obj.close()
    vcf_obj = cyvcf2.VCF(vcf_path)
    #
    # A single region covering every variant in the example VCF.
    regions_dict = {
        "chr": ["chr22"],
        "start": [21541589],
        "end": [36702137],
        "id": [0]
    }
    regions_gr = GenomicRanges(regions_dict["chr"], regions_dict["start"],
                               regions_dict["end"], regions_dict["id"])
    # Both region representations must produce identical results.
    for regions in [regions_dict, regions_gr]:
        found_vars, overlapping_region = sp._overlap_vcf_region(
            vcf_obj, regions, exclude_indels=False)
        assert all([
            str(el1) == str(el2) for el1, el2 in zip(all_records, found_vars)
        ])
        assert len(overlapping_region) == len(found_vars)
        # Every hit maps back to region id 0 — the only query region.
        assert all([el == 0 for el in overlapping_region])

    # Three regions: the full span plus two narrow windows that re-hit
    # individual variants, so some records are expected twice.
    regions_dict = {
        "chr": ["chr22", "chr22", "chr22"],
        "start": [21541589, 21541589, 30630220],
        "end": [36702137, 21541590, 30630222],
        "id": [0, 1, 2]
    }
    regions_gr = GenomicRanges(regions_dict["chr"], regions_dict["start"],
                               regions_dict["end"], regions_dict["id"])
    #
    # Expected hits: all records (region 0) + first record (region 1)
    # + fourth record (region 2); SNV-only variant filters out indels.
    plus_indel_results = all_records + all_records[:1] + all_records[3:4]
    snv_results = [el for el in plus_indel_results if not el.is_indel]
    #
    # Region index expected for each returned record, with/without indels.
    ref_lines_indel = [0] * len(all_records) + [1] + [2]
    snv_ref_lines = [
        el for el, el1 in zip(ref_lines_indel, plus_indel_results)
        if not el1.is_indel
    ]
    #
    for regions in [regions_dict, regions_gr]:
        for exclude_indels, ref_res, ref_lines in zip(
            [False, True], [plus_indel_results, snv_results],
            [ref_lines_indel, snv_ref_lines]):
            found_vars, overlapping_region = sp._overlap_vcf_region(
                vcf_obj, regions, exclude_indels)
            assert all([
                str(el1) == str(el2) for el1, el2 in zip(ref_res, found_vars)
                if not el1.is_indel
            ])
            assert overlapping_region == ref_lines
Example #2
0
    def __getitem__(self, idx):
        """Assemble one sample (inputs, optional targets, metadata) for *idx*."""
        sample = {}
        if self.MISO_AS:
            # Gene-centric mode: sequence is extracted around the gene.
            gene = self.genes[idx]
            inputs, ranges = self.get_seq(gene)
            sample['inputs'] = inputs
            # Targets looked up by gene name when labels exist; NaN otherwise.
            sample['targets'] = (self.Y.get_target(gene.geneName)
                                 if self.Y is not None else np.nan)
            sample['metadata'] = {
                'geneName': gene.geneName,
                'chrom': gene.chrom,
                'strand': gene.strand,
                'start': gene.start,
                'stop': gene.stop,
                'extracted_regions': ranges,
            }
        else:
            # Splice-site mode: one sequence per splice site, no targets.
            spliceSite = self.spliceSites[idx]
            sample['inputs'] = spliceSite.get_seq(self.fasta)
            sample['metadata'] = {
                'geneID': spliceSite.geneID,
                'transcriptID': spliceSite.transcriptID,
                'biotype': spliceSite.biotype,
                'order': spliceSite.order,
                'ranges': GenomicRanges(
                    spliceSite.chrom,
                    spliceSite.grange[0] - 1,  # use 0-base indexing
                    spliceSite.grange[1],
                    spliceSite.geneID,
                    spliceSite.strand),
            }
        return sample
Example #3
0
    def __getitem__(self, idx):
        """Return inputs/targets/metadata for the TSV interval at *idx*."""
        # Create the extractor lazily so the dataset stays picklable.
        if self.fasta_extractor is None:
            self.fasta_extractor = FastaExtractor(self.fasta_file)

        interval, labels = self.tsv[idx]

        # Optionally resize the interval to the configured fixed length.
        if self.auto_resize_len:
            interval = resize_interval(interval, self.auto_resize_len)

        # Extract the encoded sequence for the single interval.
        seq = np.squeeze(self.fasta_extractor([interval]))

        strand = interval.strand if interval.strand is not None else "*"
        ranges = GenomicRanges(chr=interval.chrom,
                               start=interval.start,
                               end=interval.stop,
                               id=str(idx),
                               strand=strand)
        return {
            "inputs": {"seq": seq},
            "targets": labels,
            "metadata": {"ranges": ranges,
                         "interval_from_task": ''},
        }
Example #4
0
    def __getitem__(self, idx):
        """Return one sample (inputs + metadata) for the branch at *idx*.

        The metadata carries coordinates that can be written to bed files.
        """
        branch = self.branches[idx]

        # Model input: the branch sequence fed to the bidirectional layer.
        inputs = {'bidirectional_1_input': branch.seq}

        # Metadata describing where the sequence came from.
        metadata = {
            'geneID': branch.geneID,
            'transcriptID': branch.transcriptID,
            'chrom': branch.chrom,
            'strand': branch.strand,
            'start': branch.grange[0] - 1,  # use 0-base indexing
            'stop': branch.grange[1],
            'biotype': branch.biotype,
            'ranges': GenomicRanges(
                branch.chrom,
                branch.grange[0] - 1,  # use 0-base indexing
                branch.grange[1],
                branch.geneID + "_" + branch.transcriptID,
                branch.strand),
        }

        return {'inputs': inputs, 'metadata': metadata}
Example #5
0
    def __getitem__(self, idx):
        """Return inputs/targets/metadata for the BED entry at *idx*."""
        # Lazy initialisation keeps the dataset picklable across workers.
        if self.fasta_extractors is None:
            self.fasta_extractors = FastaStringExtractor(
                self.fasta_file,
                use_strand=False,  # self.use_strand,
                force_upper=self.force_upper)

        interval, labels = self.bed[idx]

        # Center-anchored resize to the configured fixed length, if any.
        if self.auto_resize_len:
            interval = resize_interval(interval,
                                       self.auto_resize_len,
                                       anchor='center')

        # QUESTION: @kromme - why to we need max_seq_len?
        # if self.max_seq_len is not None:
        #     assert interval.stop - interval.start <= self.max_seq_len

        # Extract the sequence string (transformed downstream if needed).
        seq = self.fasta_extractors.extract(interval)

        ranges = GenomicRanges(interval.chrom, interval.start,
                               interval.stop, str(idx))
        return {"inputs": np.array(seq),
                "targets": labels,
                "metadata": {"ranges": ranges}}
Example #6
0
    def __getitem__(self, idx):
        """Build one sample (inputs + metadata) for the record at *idx*."""
        # Open the FASTA lazily on first access.
        if self.fasta is None:
            self.fasta = FastaFile(self.fasta_file)

        if self.MISO_AS:
            # Gene-centric mode.
            gene = self.genes[idx]
            return {
                'inputs': self.get_seq(gene),
                'metadata': {
                    'geneName': gene.geneName,
                    'chrom': gene.chrom,
                    'strand': gene.strand,
                    'start': gene.start,
                    'stop': gene.stop,
                },
            }

        # Splice-site mode.
        spliceSite = self.spliceSites[idx]
        return {
            'inputs': spliceSite.get_seq(self.fasta),
            'metadata': {
                'geneID': spliceSite.geneID,
                'transcriptID': spliceSite.transcriptID,
                'biotype': spliceSite.biotype,
                'order': spliceSite.order,
                'ranges': GenomicRanges(
                    spliceSite.chrom,
                    spliceSite.grange[0] - 1,  # use 0-base indexing
                    spliceSite.grange[1],
                    spliceSite.geneID,
                    spliceSite.strand),
            },
        }
Example #7
0
def dl_batch():
    """Return a dummy dataloader batch of three records on chr1."""
    n = 3
    ranges = GenomicRanges(chr=np.array(["chr1"] * n),
                           start=np.arange(n) + 1,
                           end=np.arange(n) + 5,
                           id=np.arange(n).astype(str),
                           strand=np.array(["*"] * n))
    return {"inputs": np.arange(n),
            "metadata": {"ranges": ranges,
                         "gene_id": np.arange(n).astype(str)}}
Example #8
0
 def __getitem__(self, idx):
     """Return the transformed sequence and metadata for GTF row *idx*.

     Builds an interval anchored at the row, extracts the FASTA sequence,
     applies the configured transform, and attaches the requested interval
     attributes plus a GenomicRanges record.
     """
     row = self._gtf_anchor.iloc[idx]
     # Interval anchored on the row with the configured up/downstream flanks.
     interval = self._create_anchored_interval(
         row,
         num_upstream=self._num_upstream,
         num_downstream=self._num_downstream)
     sequence = self._fa.extract(interval)
     sequence = self._transform(sequence)
     # Missing attributes default to '' so batching stays uniform.
     metadata_dict = {k: row.get(k, '') for k in self._interval_attrs}
     metadata_dict["ranges"] = GenomicRanges(interval.chrom, interval.start,
                                             interval.stop, str(idx))
     return {"inputs": np.array(sequence), "metadata": metadata_dict}
Example #9
0
    def __next__(self):
        """Yield the next exon sample from the generator."""
        ss = next(self.exonGenerator)

        seq = ss.get_seq(self.fasta).upper()
        if self.split_seq:
            # Keep only the donor-site portion of the split sequence.
            seq = self.split(seq, ss.overhang)['donor'][0]

        ranges = GenomicRanges(ss.chrom, ss.Exon_Start,
                               ss.Exon_End,
                               ss.transcript_id, ss.strand)
        return {'inputs': {'ss': seq},
                'metadata': {'ranges': ranges}}
Example #10
0
    def __getitem__(self, idx):
        """Return inputs/targets/metadata for the TSV row at *idx*."""
        # Create the extractor on first use (keeps the object picklable).
        if self.fasta_extractor is None:
            self.fasta_extractor = FastaExtractor(self.fasta_file)

        interval, labels = self.tsv[idx]

        # The model expects exactly 1000bp-wide intervals.
        assert interval.stop - interval.start == 1000

        # Extract the encoded sequence for the single interval.
        seq = np.squeeze(self.fasta_extractor([interval]))

        ranges = GenomicRanges(interval.chrom, interval.start,
                               interval.stop, str(idx))
        return {"inputs": {"data/genome_data_dir": seq},
                "targets": labels,
                "metadata": {"ranges": ranges}}
Example #11
0
    def __getitem__(self, idx):
        """Return sequence inputs, summed bigwig counts, and range metadata."""
        # First access: build the FASTA and bigwig extractors lazily.
        if self.fasta_extractor is None:
            self.fasta_extractor = FastaExtractor(self.fasta_file)
            self.bigwig_extractors = {
                a: [BigwigExtractor(f) for f in self.bigwigs[a]]
                for a in self.bigwigs
            }

        interval, labels = self.tsv[idx]
        interval = resize_interval(interval, 1000)
        # Intervals need to be 1000bp wide.
        assert interval.stop - interval.start == 1000

        seq = np.squeeze(self.fasta_extractor([interval]))

        # Wider window used for the count targets.
        interval_wide = resize_interval(deepcopy(interval), self.track_width)

        # Per-assay target: total signal over the wide interval, summed
        # across all bigwig files registered for that assay.
        targets = {
            a: sum([e([interval_wide])[0]
                    for e in self.bigwig_extractors[a]]).sum()
            for a in self.bigwig_extractors
        }

        return {
            "inputs": {"seq": seq},
            "targets": targets,
            "metadata": {
                "ranges": GenomicRanges(interval.chrom, interval.start,
                                        interval.stop, str(idx)),
                "ranges_wide": GenomicRanges.from_interval(interval_wide),
                "name": interval.name,
            },
        }
Example #12
0
    def __getitem__(self, idx):
        """Return one training sample: sequence (+ optional bias tracks) as
        inputs, profile cuts / counts / classes as targets, and optional
        range metadata.

        On first call, lazily builds the FASTA and coverage extractors —
        either array-backed (bcolz) or FASTA/bigwig-backed.
        """
        if self.fasta_extractor is None:
            # Use array extractors
            if self.bcolz:
                self.fasta_extractor = ArrayExtractor(self.ds.fasta_file,
                                                      in_memory=False)
                # One [pos, neg] extractor pair per requested task.
                self.bw_extractors = {
                    task: [
                        ArrayExtractor(task_spec.pos_counts, in_memory=False),
                        ArrayExtractor(task_spec.neg_counts, in_memory=False)
                    ]
                    for task, task_spec in self.ds.task_specs.items()
                    if task in self.tasks
                }
                self.bias_bw_extractors = {
                    task: [
                        ArrayExtractor(task_spec.pos_counts, in_memory=False),
                        ArrayExtractor(task_spec.neg_counts, in_memory=False)
                    ]
                    for task, task_spec in self.ds.bias_specs.items()
                    if task in self.tasks
                }
            else:
                # Use normal fasta/bigwig extractors
                assert not self.bcolz
                # first call
                self.fasta_extractor = FastaExtractor(self.ds.fasta_file,
                                                      use_strand=True)
                self.bw_extractors = {
                    task: [
                        BigwigExtractor(task_spec.pos_counts),
                        BigwigExtractor(task_spec.neg_counts)
                    ]
                    for task, task_spec in self.ds.task_specs.items()
                    if task in self.tasks
                }
                # NOTE(review): unlike the bcolz branch above, this dict is
                # NOT filtered with `if task in self.tasks` — confirm whether
                # that asymmetry is intentional.
                self.bias_bw_extractors = {
                    task: [
                        BigwigExtractor(task_spec.pos_counts),
                        BigwigExtractor(task_spec.neg_counts)
                    ]
                    for task, task_spec in self.ds.bias_specs.items()
                }

        # Setup the intervals
        interval = Interval(
            self.dfm.iat[idx, 0],  # chrom
            self.dfm.iat[idx, 1],  # start
            self.dfm.iat[idx, 2])  # end

        # Transform the input interval (for say augmentation...)
        if self.interval_transformer is not None:
            interval = self.interval_transformer(interval)

        # Two views of the same locus: target (peak) width vs. input (seq) width.
        target_interval = resize_interval(deepcopy(interval), self.peak_width)
        seq_interval = resize_interval(deepcopy(interval), self.seq_width)

        # This only kicks in when we specify the taskname from dataspec
        # to the 3rd column. E.g. it doesn't apply when using intervals_file
        interval_from_task = self.dfm.iat[
            idx, 3] if self.intervals_file is None else ''

        # extract seq + tracks
        sequence = self.fasta_extractor([seq_interval])[0]

        if not self.only_classes:
            # Per-task profile targets; key layout depends on taskname_first.
            if self.taskname_first:
                cuts = {
                    f"{task}/profile":
                    run_extractors(self.bw_extractors[task], [target_interval],
                                   ignore_strand=spec.ignore_strand)[0]
                    for task, spec in self.ds.task_specs.items()
                    if task in self.tasks
                }
            else:
                cuts = {
                    f"profile/{task}":
                    run_extractors(self.bw_extractors[task], [target_interval],
                                   ignore_strand=spec.ignore_strand)[0]
                    for task, spec in self.ds.task_specs.items()
                    if task in self.tasks
                }

            # Add counts
            if self.target_transformer is not None:
                cuts = self.target_transformer.transform(cuts)

            # Add bias tracks
            if len(self.ds.bias_specs) > 0:

                biases = {
                    bias_task:
                    run_extractors(self.bias_bw_extractors[bias_task],
                                   [target_interval],
                                   ignore_strand=spec.ignore_strand)[0]
                    for bias_task, spec in self.ds.bias_specs.items()
                }

                # Concatenate each task's configured bias tracks along the
                # last (channel) axis.
                task_biases = {
                    f"bias/{task}/profile": np.concatenate(
                        [biases[bt] for bt in self.task_bias_tracks[task]],
                        axis=-1)
                    for task in self.tasks
                }

                if self.target_transformer is not None:
                    # log(1 + total) count bias per task.
                    for task in self.tasks:
                        task_biases[f'bias/{task}/counts'] = np.log(
                            1 + task_biases[f'bias/{task}/profile'].sum(0))
                    # total_count_bias = np.concatenate([np.log(1 + x[k].sum(0))
                    #                                    for k, x in biases.items()], axis=-1)
                    # task_biases['bias/total_counts'] = total_count_bias

                if self.profile_bias_pool_size is not None:
                    # Smooth the bias profiles with one moving average per
                    # requested pool size, concatenated channel-wise.
                    for task in self.tasks:
                        task_biases[f'bias/{task}/profile'] = np.concatenate(
                            [
                                moving_average(
                                    task_biases[f'bias/{task}/profile'],
                                    n=pool_size) for pool_size in to_list(
                                        self.profile_bias_pool_size)
                            ],
                            axis=-1)

                # Bias tracks are fed as additional model inputs.
                sequence = {"seq": sequence, **task_biases}
        else:
            cuts = dict()

        if self.include_classes:
            if self.taskname_first:
                # Get the classes from the tsv file
                classes = {
                    f"{task}/class": self.dfm.iat[idx, i + 3]
                    for i, task in enumerate(self.dfm_tasks)
                    if task in self.tasks
                }
            else:
                classes = {
                    f"class/{task}": self.dfm.iat[idx, i + 3]
                    for i, task in enumerate(self.dfm_tasks)
                    if task in self.tasks
                }
            cuts = {**cuts, **classes}

        out = {"inputs": sequence, "targets": cuts}

        if self.include_metadata:
            out['metadata'] = {
                "range":
                GenomicRanges(
                    chr=target_interval.chrom,
                    start=target_interval.start,
                    end=target_interval.stop,
                    id=idx,
                    strand=(target_interval.strand
                            if target_interval.strand is not None else "*"),
                ),
                "interval_from_task":
                interval_from_task
            }
        return out
Example #13
0
def test_MutationMapDataMerger():
    """MutationMapDataMerger merges per-variant predictions into per-region
    mutation maps with the expected keys, shapes, and metadata.

    Builds regions from the example VCF, fakes all-'A' reference sequences
    and zero predictions for every possible SNV, then checks the merged
    output structure.
    """
    if sys.version_info[0] == 2:
        pytest.skip("Skip")
    model_dir = "examples/rbp/"
    vcf_sub_path = "example_files/variants.vcf"
    vcf_path = model_dir + vcf_sub_path
    vcf_path = kipoi.postprocessing.variant_effects.ensure_tabixed_vcf(
        vcf_path)
    seq_len = 10
    model_info_extractor = DummyModelInfo(seq_len)
    model_info_extractor.seq_length = seq_len
    # Generate one seq_len-wide region centered on each SNV in the VCF.
    region_generator = kipoi.postprocessing.variant_effects.utils.generic.SnvCenteredRg(
        model_info_extractor)
    vcf_fh = cyvcf2.VCF(vcf_path)
    regions = Dummy_internval()
    _write_regions_from_vcf(
        vcf_fh,
        kipoi.postprocessing.variant_effects.utils.generic.default_vcf_id_gen,
        regions.append_interval, region_generator)
    #
    vcf_fh.close()
    annotated_regions = pd.DataFrame(regions.storage)
    num_seqs = annotated_regions.shape[0]
    query_process_lines = list(range(num_seqs))
    vcf_fh = cyvcf2.VCF(vcf_path)
    # Keep only the VCF records for which a region was actually generated.
    query_vcf_records = [
        rec for rec in vcf_fh if kipoi.postprocessing.variant_effects.utils.
        generic.default_vcf_id_gen(rec) in annotated_regions["id"].tolist()
    ]
    #
    # Region metadata in the dataloader's GenomicRanges format (0-based start).
    gr_meta = {
        "ranges":
        GenomicRanges(annotated_regions["chrom"].values,
                      annotated_regions["start"].values - 1,
                      annotated_regions["end"].values,
                      annotated_regions["id"].values, ["*"] * num_seqs)
    }
    #
    # Fake reference sequences: all 'A', one per region.
    rseq = [
        "A" * (annotated_regions["end"].values[i] -
               annotated_regions["start"].values[i] + 1)
        for i in range(annotated_regions.shape[0])
    ]
    ref_seqs = {"ranges": rseq}
    #
    seq_to_meta = {"seq": "ranges"}
    # "query_vcf_records", "query_process_lines"
    # Enumerate every possible SNV per position ("chr:pos:ref:alt" ids) and
    # remember which region (process line) each prediction row belongs to.
    pred_proto_idxs = []
    process_line = []
    for i in range(annotated_regions.shape[0]):
        for pos, ref in zip(
                range(annotated_regions["start"].values[i],
                      annotated_regions["end"].values[i] + 1), rseq[i]):
            for alt in ["A", "C", "G", "T"]:
                if ref == alt:
                    continue
                ID = ":".join(["chr22", str(pos), ref.upper(), alt])
                pred_proto_idxs.append(ID)
                process_line += [i]

    model_outputs = ["out1", "out2"]
    # All-zero predictions: 3 alts per position, 2 model outputs.
    pred_proto = pd.DataFrame(np.zeros((num_seqs * seq_len * 3, 2)),
                              columns=["out1", "out2"],
                              index=pred_proto_idxs)
    #
    predictions = {"DIFF": pred_proto, "PRED2": pred_proto}
    #
    pred_set = {
        "query_process_lines": query_process_lines,
        "query_vcf_records": query_vcf_records,
        "process_line": process_line
    }
    mmdm = mm.MutationMapDataMerger(seq_to_meta)
    mmdm.append(predictions, pred_set, ref_seqs, gr_meta)
    merged_data = mmdm.get_merged_data()
    # One merged entry per region; structure is
    # {meta_key: {score: {model_output: mutation-map dict}}}.
    assert len(merged_data) == num_seqs
    for i, md in enumerate(merged_data):
        for k in md:
            assert k in list(seq_to_meta.keys())
            for scr in md[k]:
                assert scr in list(predictions.keys())
                for model_output in md[k][scr]:
                    mm_obj = md[k][scr][model_output]
                    assert model_output in model_outputs
                    exp_entries = [
                        'ovlp_var', 'mutation_map', 'ref_seq',
                        'metadata_region'
                    ]
                    assert all([k in exp_entries for k in mm_obj])
                    assert len(mm_obj) == len(exp_entries)
                    # This only works when als ref/ref mutations are taken into account
                    # retval = np.reshape(pred_proto[model_output].loc[np.array(process_line)==i], (seq_len, 4)).T
                    # assert np.all(mm_obj['mutation_map'] == retval)
                    assert mm_obj['ref_seq'] == rseq[i]
                    # SNV-centered regions place the variant at the center.
                    assert mm_obj['ovlp_var']['varpos_rel'][
                        0] == seq_len // 2 - 1
                    assert all([
                        mm_obj['metadata_region'][k] == gr_meta["ranges"][k][i]
                        for k in mm_obj['metadata_region']
                    ])
Example #14
0
def test__generate_seq_sets_mutmap_iter():
    """_generate_seq_sets_mutmap_iter yields ref/alt (+rc) sequence batches.

    For dict- and GenomicRanges-style metadata, with and without VCF region
    search, checks that every yielded batch contains the four mutated
    sequence sets (ref/alt and their reverse complements), that non-sequence
    inputs are passed through unchanged, and that reverse-complement sets
    mirror the forward sets. Also covers the bed-file and no-region inputs.
    """
    from pybedtools import BedTool
    model_dir = "examples/rbp/"
    vcf_sub_path = "example_files/variants.vcf"
    vcf_path = model_dir + vcf_sub_path
    vcf_path = kipoi.postprocessing.variant_effects.ensure_tabixed_vcf(
        vcf_path)
    # for any given input type: list, dict and np.array return 4 identical sets, except for mutated bases on one position
    seq_len = 101
    model_info_extractor = DummyModelInfo(seq_len)
    for num_seqs in [1, 5]:
        # One-hot all-'A' sequences plus a constant non-sequence input.
        empty_seq_input = np.zeros((num_seqs, seq_len, 4))
        empty_seq_input[:, :, 0] = 1  # All As
        empty_other_input = np.zeros((num_seqs, seq_len, 4)) - 10
        #
        relv_seq_keys = ["seq"]
        #
        vcf_fh = cyvcf2.VCF(vcf_path)
        regions = Dummy_internval()
        #
        model_info_extractor.seq_length = seq_len
        # Generate one seq_len-wide region centered on each SNV.
        region_generator = kipoi.postprocessing.variant_effects.utils.generic.SnvCenteredRg(
            model_info_extractor)
        _write_regions_from_vcf(
            vcf_fh, kipoi.postprocessing.variant_effects.utils.generic.
            default_vcf_id_gen, regions.append_interval, region_generator)
        #
        vcf_fh.close()
        annotated_regions = pd.DataFrame(regions.storage).iloc[:num_seqs, :]
        #
        # Metadata in GenomicRanges form (0-based start)...
        gr_meta = {
            "ranges":
            GenomicRanges(annotated_regions["chrom"].values,
                          annotated_regions["start"].values - 1,
                          annotated_regions["end"].values,
                          annotated_regions["id"].values)
        }
        #
        # ...and the equivalent plain-dict form.
        dict_meta = {
            "ranges": {
                "chr": annotated_regions["chrom"].values,
                "start": annotated_regions["start"].values - 1,
                "end": annotated_regions["end"].values,
                "id": annotated_regions["id"].values
            }
        }
        #
        # All-'A' reference sequences matching the regions.
        ref_seqs = {
            "ranges": [
                "A" * (annotated_regions["end"].values[i] -
                       annotated_regions["start"].values[i])
                for i in range(annotated_regions.shape[0])
            ]
        }
        #
        meta_data_options = [gr_meta, dict_meta]
        #
        seq_to_mut = {
            "seq":
            kipoi.postprocessing.variant_effects.utils.generic.
            OneHotSequenceMutator()
        }
        seq_to_meta = {"seq": "ranges"}
        n_qseq = annotated_regions.shape[0]
        for batch_size in [4, 8]:
            for meta_data in meta_data_options:
                for vcf_search_regions in [False, True]:
                    # Test the dict case:
                    # Minimal fake dataloader schema: one sequence input tied
                    # to the "ranges" metadata plus one unrelated input.
                    dataloader = dummy_container()
                    dataloader.output_schema = dummy_container()
                    seq_container = dummy_container()
                    seq_container.associated_metadata = ["ranges"]
                    dataloader.output_schema.inputs = {
                        "seq": seq_container,
                        "other_input": None
                    }
                    inputs = {
                        "seq":
                        copy.deepcopy(empty_seq_input[:n_qseq, ...]),
                        "other_input":
                        copy.deepcopy(empty_other_input[:n_qseq, ...])
                    }
                    # Untouched copy used to verify pass-through behaviour.
                    inputs_2nd_copy = copy.deepcopy(inputs)
                    #
                    model_input = {"inputs": inputs, "metadata": meta_data}
                    vcf_fh = cyvcf2.VCF(vcf_path, "r")
                    # relv_seq_keys, dataloader, model_input, vcf_fh, vcf_id_generator_fn, array_trafo=None
                    sample_counter = sp.SampleCounter()
                    eval_kwargs_iter = mm._generate_seq_sets_mutmap_iter(
                        dataloader.output_schema,
                        model_input,
                        seq_to_mut=seq_to_mut,
                        seq_to_meta=seq_to_meta,
                        sample_counter=sample_counter,
                        ref_sequences=ref_seqs,
                        vcf_fh=vcf_fh,
                        vcf_id_generator_fn=kipoi.postprocessing.
                        variant_effects.utils.generic.default_vcf_id_gen,
                        vcf_search_regions=vcf_search_regions,
                        generate_rc=True,
                        batch_size=batch_size)
                    for ss_batch in eval_kwargs_iter:
                        # Each variant yields 4 sets (ref/alt x fwd/rc),
                        # so a batch holds batch_size // 4 variants.
                        assert (len(ss_batch['vcf_records']) == batch_size //
                                4)
                        assert (len(ss_batch['query_vcf_records']) == num_seqs)
                        req_cols = ['alt', 'ref_rc', 'ref', 'alt_rc']
                        assert np.all(np.in1d(req_cols, list(ss_batch.keys())))
                        for k in req_cols:
                            for k2 in inputs:
                                assert (k2 in ss_batch[k])
                                if k2 not in relv_seq_keys:
                                    # Non-sequence inputs pass through
                                    # unmodified (row-selected only).
                                    assert np.all(
                                        ss_batch[k][k2] == inputs_2nd_copy[k2][
                                            ss_batch['process_line'], ...])
                                else:
                                    # Assuming modification of matrices works as desired - see its own unit test
                                    # Assuming 1-hot coding with background as 0
                                    if k.endswith("fwd"):
                                        assert np.sum(
                                            ss_batch[k][k2] !=
                                            inputs_2nd_copy[k2][
                                                ss_batch['process_line'],
                                                ...]) == 2 * n_qseq
                        #
                        # rc sets must be the fwd sets flipped along
                        # sequence and channel axes.
                        for k in ["ref", "alt"]:
                            for k2 in relv_seq_keys:
                                assert np.all(ss_batch[k][k2] == ss_batch[
                                    k + "_rc"][k2][:, ::-1, ::-1])
                    vcf_fh.close()
        # now just also test whether things work for using bed file and using neither bed nor bed file inputs
        dataloader = dummy_container()
        dataloader.output_schema = dummy_container()
        seq_container = dummy_container()
        seq_container.associated_metadata = ["ranges"]
        dataloader.output_schema.inputs = {
            "seq": seq_container,
            "other_input": None
        }
        inputs = {
            "seq": copy.deepcopy(empty_seq_input[:n_qseq, ...]),
            "other_input": copy.deepcopy(empty_other_input[:n_qseq, ...])
        }
        inputs_2nd_copy = copy.deepcopy(inputs)
        #
        model_input = {"inputs": inputs, "metadata": gr_meta}
        vcf_fh = cyvcf2.VCF(vcf_path, "r")
        # relv_seq_keys, dataloader, model_input, vcf_fh, vcf_id_generator_fn, array_trafo=None
        sample_counter = sp.SampleCounter()
        batch_size = 4
        # No VCF handle / no bed: query_vcf_records must be None.
        eval_kwargs_iter = mm._generate_seq_sets_mutmap_iter(
            dataloader.output_schema,
            model_input,
            seq_to_mut=seq_to_mut,
            seq_to_meta=seq_to_meta,
            sample_counter=sample_counter,
            ref_sequences=ref_seqs,
            generate_rc=True,
            batch_size=batch_size)
        for ss_batch in eval_kwargs_iter:
            assert (len(ss_batch['vcf_records']) == batch_size // 4)
            assert ss_batch['query_vcf_records'] is None
            req_cols = ['alt', 'ref_rc', 'ref', 'alt_rc']
            assert np.all(np.in1d(req_cols, list(ss_batch.keys())))
        # using bed input
        # A single-region tabixed bed restricting to the first region only.
        bed_obj = BedTool("chr22 %d %d" %
                          (annotated_regions["start"].values[0] - 1,
                           annotated_regions["end"].values[0]),
                          from_string=True).tabix()
        eval_kwargs_iter = mm._generate_seq_sets_mutmap_iter(
            dataloader.output_schema,
            model_input,
            seq_to_mut=seq_to_mut,
            seq_to_meta=seq_to_meta,
            bedtools_obj=bed_obj,
            sample_counter=sample_counter,
            ref_sequences=ref_seqs,
            generate_rc=True,
            batch_size=batch_size)
        for ss_batch in eval_kwargs_iter:
            assert (len(ss_batch['vcf_records']) == batch_size // 4)
            assert ss_batch['query_vcf_records'] is None
            # Only the first region (process line 0) should be hit.
            assert all([el == 0 for el in ss_batch["process_line"]])
Example #15
0
def test__generate_seq_sets():
    """Integration test for ``sp._generate_seq_sets``.

    Exercises mutated-sequence-set generation in two passes: first with
    regions centered on every SNV of the example VCF, then restricted to a
    bed file of allowed regions.  NOTE(review): relies on the example files
    under ``examples/rbp/`` being present on disk.
    """
    model_dir = "examples/rbp/"
    vcf_sub_path = "example_files/variants.vcf"

    vcf_path = model_dir + vcf_sub_path
    vcf_path = kipoi.postprocessing.variant_effects.ensure_tabixed_vcf(
        vcf_path)
    # for any given input type: list, dict and np.array return 4 identical sets, except for mutated bases on one position
    seq_len = 101
    model_info_extractor = DummyModelInfo(seq_len)
    for num_seqs in [1, 5]:
        # One-hot "all A" sequence input, plus a non-sequence dummy input
        # that must pass through _generate_seq_sets unmodified.
        empty_seq_input = np.zeros((num_seqs, seq_len, 4))
        empty_seq_input[:, :, 0] = 1  # All As
        empty_other_input = np.zeros((num_seqs, seq_len, 4)) - 10
        #
        relv_seq_keys = ["seq"]
        #
        vcf_fh = cyvcf2.VCF(vcf_path)
        regions = Dummy_internval()
        #
        # Build seq_len-wide regions centered on each variant of the VCF.
        model_info_extractor.seq_length = seq_len
        region_generator = kipoi.postprocessing.variant_effects.utils.generic.SnvCenteredRg(
            model_info_extractor)
        _write_regions_from_vcf(
            vcf_fh, kipoi.postprocessing.variant_effects.utils.generic.
            default_vcf_id_gen, regions.append_interval, region_generator)
        #
        vcf_fh.close()
        annotated_regions = pd.DataFrame(regions.storage).iloc[:num_seqs, :]
        #
        # Metadata may be given either as GenomicRanges or as a plain dict;
        # both variants are exercised below.
        gr_meta = {
            "ranges":
            GenomicRanges(annotated_regions["chrom"].values,
                          annotated_regions["start"].values - 1,
                          annotated_regions["end"].values,
                          annotated_regions["id"].values)
        }
        #
        dict_meta = {
            "ranges": {
                "chr": annotated_regions["chrom"].values,
                "start": annotated_regions["start"].values - 1,
                "end": annotated_regions["end"].values,
                "id": annotated_regions["id"].values
            }
        }
        #
        meta_data_options = [gr_meta, dict_meta]
        #
        seq_to_mut = {
            "seq":
            kipoi.postprocessing.variant_effects.utils.generic.
            OneHotSequenceMutator()
        }
        seq_to_meta = {"seq": "ranges"}
        #
        sample_counter = sp.SampleCounter()
        for meta_data in meta_data_options:
            for vcf_search_regions in [False, True]:
                # Test the dict case:
                dataloader = dummy_container()
                dataloader.output_schema = dummy_container()
                seq_container = dummy_container()
                seq_container.associated_metadata = ["ranges"]
                dataloader.output_schema.inputs = {
                    "seq": seq_container,
                    "other_input": None
                }
                inputs = {
                    "seq": copy.deepcopy(empty_seq_input),
                    "other_input": copy.deepcopy(empty_other_input)
                }
                inputs_2nd_copy = copy.deepcopy(inputs)
                #
                model_input = {"inputs": inputs, "metadata": meta_data}
                vcf_fh = cyvcf2.VCF(vcf_path, "r")
                #relv_seq_keys, dataloader, model_input, vcf_fh, vcf_id_generator_fn, array_trafo=None
                ssets = sp._generate_seq_sets(
                    dataloader.output_schema,
                    model_input,
                    vcf_fh,
                    vcf_id_generator_fn=kipoi.postprocessing.variant_effects.
                    utils.generic.default_vcf_id_gen,
                    seq_to_mut=seq_to_mut,
                    seq_to_meta=seq_to_meta,
                    sample_counter=sample_counter,
                    vcf_search_regions=vcf_search_regions)
                vcf_fh.close()
                req_cols = ['alt', 'ref_rc', 'ref', 'alt_rc']
                assert np.all(np.in1d(req_cols, list(ssets.keys())))
                # Non-sequence inputs must be untouched; sequence inputs must
                # differ from the blank input (ref base + alt base per seq).
                for k in req_cols:
                    for k2 in inputs:
                        assert (k2 in ssets[k])
                        if k2 not in relv_seq_keys:
                            assert np.all(ssets[k][k2] == inputs_2nd_copy[k2])
                        else:
                            # Assuming modification of matrices works as desired - see its own unit test
                            # Assuming 1-hot coding with background as 0
                            if k.endswith("fwd"):
                                assert np.sum(
                                    ssets[k][k2] != inputs_2nd_copy[k2]
                                ) == 2 * num_seqs
                #
                # Reverse-complement sets are the fwd sets flipped along both
                # the sequence axis and the one-hot (base) axis.
                for k in ["ref", "alt"]:
                    for k2 in relv_seq_keys:
                        assert np.all(
                            ssets[k][k2] == ssets[k +
                                                  "_rc"][k2][:, ::-1, ::-1])
        #
        #
        # ------ Now also test the bed-restricted prediction -------
        restricted_regions_fpath = "example_files/restricted_regions.bed"
        #
        pbd = pb.BedTool(model_dir + restricted_regions_fpath)
        vcf_fh = cyvcf2.VCF(vcf_path, "r")
        regions = Dummy_internval()
        region_generator = kipoi.postprocessing.variant_effects.SnvPosRestrictedRg(
            model_info_extractor, pbd)
        _write_regions_from_vcf(
            vcf_fh, kipoi.postprocessing.variant_effects.utils.generic.
            default_vcf_id_gen, regions.append_interval, region_generator)
        #sp._generate_pos_restricted_seqs(vcf_fh, sp._default_vcf_id_gen, pbd, regions.append_interval, seq_len)
        vcf_fh.close()
        annotated_regions = pd.DataFrame(regions.storage).iloc[:num_seqs, :]
        #
        gr_meta = {
            "ranges":
            GenomicRanges(annotated_regions["chrom"].values,
                          annotated_regions["start"].values - 1,
                          annotated_regions["end"].values,
                          annotated_regions["id"].values)
        }
        #
        dict_meta = {
            "ranges": {
                "chr": annotated_regions["chrom"].values,
                "start": annotated_regions["start"].values - 1,
                "end": annotated_regions["end"].values,
                "id": annotated_regions["id"].values
            }
        }
        #
        meta_data_options = [gr_meta, dict_meta]
        #
        # Only regions passing the bed restriction survive; sizes below are
        # therefore based on n_qseq rather than num_seqs.
        n_qseq = annotated_regions.shape[0]
        for meta_data in meta_data_options:
            for vcf_search_regions in [False, True]:
                # Test the dict case:
                dataloader = dummy_container()
                dataloader.output_schema = dummy_container()
                seq_container = dummy_container()
                seq_container.associated_metadata = ["ranges"]
                dataloader.output_schema.inputs = {
                    "seq": seq_container,
                    "other_input": None
                }
                inputs = {
                    "seq": copy.deepcopy(empty_seq_input[:n_qseq, ...]),
                    "other_input": copy.deepcopy(empty_other_input[:n_qseq,
                                                                   ...])
                }
                inputs_2nd_copy = copy.deepcopy(inputs)
                #
                model_input = {"inputs": inputs, "metadata": meta_data}
                vcf_fh = cyvcf2.VCF(vcf_path, "r")
                # relv_seq_keys, dataloader, model_input, vcf_fh, vcf_id_generator_fn, array_trafo=None
                sample_counter = sp.SampleCounter()
                ssets = sp._generate_seq_sets(
                    dataloader.output_schema,
                    model_input,
                    vcf_fh,
                    vcf_id_generator_fn=kipoi.postprocessing.variant_effects.
                    utils.generic.default_vcf_id_gen,
                    seq_to_mut=seq_to_mut,
                    seq_to_meta=seq_to_meta,
                    sample_counter=sample_counter,
                    vcf_search_regions=vcf_search_regions)
                vcf_fh.close()
                req_cols = ['alt', 'ref_rc', 'ref', 'alt_rc']
                assert np.all(np.in1d(req_cols, list(ssets.keys())))
                for k in req_cols:
                    for k2 in inputs:
                        assert (k2 in ssets[k])
                        if k2 not in relv_seq_keys:
                            assert np.all(ssets[k][k2] == inputs_2nd_copy[k2])
                        else:
                            # Assuming modification of matrices works as desired - see its own unit test
                            # Assuming 1-hot coding with background as 0
                            if k.endswith("fwd"):
                                assert np.sum(
                                    ssets[k][k2] != inputs_2nd_copy[k2]
                                ) == 2 * n_qseq
                #
                for k in ["ref", "alt"]:
                    for k2 in relv_seq_keys:
                        assert np.all(
                            ssets[k][k2] == ssets[k +
                                                  "_rc"][k2][:, ::-1, ::-1])
                #
                # Now also assert that the nuc change has been performed at the correct position:
                # Region: chr22 36702133    36706137
                # Variant within: chr22 36702137    rs1116  C   A   .   .   .
                mut_pos = 36702137 - 36702134  # bed file is 0-based
                assert np.all(
                    ssets["ref"]["seq"][0,
                                        mut_pos, :] == np.array([0, 1, 0, 0]))
                assert np.all(
                    ssets["alt"]["seq"][0,
                                        mut_pos, :] == np.array([1, 0, 0, 0]))
Example #16
0
    def __call__(self, dcpg_data_kwargs, class_weights=None):
        """Return a generator yielding kipoi-style batches from ``run_dcpg_data``.

        Parameters
        ----------
        dcpg_data_kwargs : dict
            Named arguments forwarded to :func:`run_dcpg_data`.  ``dna_wlen``
            and ``cpg_wlen`` are overridden with the model's own requirements
            (a warning is logged if the caller requested different values).
        class_weights : dict, optional
            dict of dict with class weights of individual outputs.

        Yields
        ------
        dict
            ``{"inputs": ..., "targets": ..., "metadata": ...}`` per batch.
            Sample weights are computed but not yielded (not supported yet).
        """
        # Collect the dataset keys this model consumes / predicts.
        names = []
        if self.use_dna:
            names.append('inputs/dna')

        if self.replicate_names:
            for name in self.replicate_names:
                names.append('inputs/cpg/%s/state' % name)
                names.append('inputs/cpg/%s/dist' % name)

        if self.output_names:
            for name in self.output_names:
                names.append('outputs/%s' % name)

        # Force the window lengths to match the model; warn if the caller
        # asked for something different.  (log.warn is deprecated ->
        # log.warning.)
        if self.dna_wlen is not None:
            if "dna_wlen" in dcpg_data_kwargs and dcpg_data_kwargs["dna_wlen"] != self.dna_wlen:
                log.warning("dna_wlen does not match requirements of the model (%d)" % self.dna_wlen)
            dcpg_data_kwargs["dna_wlen"] = self.dna_wlen

        if self.cpg_wlen is not None:
            if "cpg_wlen" in dcpg_data_kwargs and dcpg_data_kwargs["cpg_wlen"] != self.cpg_wlen:
                log.warning("cpg_wlen does not match requirements of the model (%d)" % self.cpg_wlen)
            dcpg_data_kwargs["cpg_wlen"] = self.cpg_wlen

        data_iter = run_dcpg_data(**dcpg_data_kwargs)
        id_ctr_offset = 0  # running sample offset used for unique region ids
        for data_raw in data_iter:
            for k in names:
                if k not in data_raw:
                    raise ValueError('%s does not exist! Sample mismatch between model and input data?' % k)

            inputs = dict()
            if self.use_dna:
                inputs['dna'] = self._prepro_dna(data_raw['inputs/dna'])

            if self.replicate_names:
                states = []
                dists = []
                for name in self.replicate_names:
                    tmp = 'inputs/cpg/%s/' % name
                    states.append(data_raw[tmp + 'state'])
                    dists.append(data_raw[tmp + 'dist'])
                states, dists = self._prepro_cpg(states, dists)
                if self.encode_replicates:
                    # DEPRECATED: to support loading data for legacy models
                    tmp = '/' + encode_replicate_names(self.replicate_names)
                else:
                    tmp = ''
                inputs['cpg/state%s' % tmp] = states
                inputs['cpg/dist%s' % tmp] = dists

            outputs = dict()
            weights = dict()
            if self.output_names:
                for name in self.output_names:
                    outputs[name] = data_raw['outputs/%s' % name]
                    cweights = class_weights[name] if class_weights else None
                    weights[name] = get_sample_weights(outputs[name], cweights)
                    if name.endswith('cat_var'):
                        output = outputs[name]
                        outputs[name] = to_categorical(output, 3)
                        outputs[name][output == dat.CPG_NAN] = 0

            def _ranges(wlen):
                # Center a window of length `wlen` on each position; ids are
                # made unique across batches via id_ctr_offset.
                delta_pos = wlen // 2
                chrom = data_raw["chromo"].astype(str)
                start = data_raw["pos"] - delta_pos
                end = data_raw["pos"] + delta_pos + 1
                return GenomicRanges(chrom, start, end,
                                     np.arange(chrom.shape[0]) + id_ctr_offset)

            # metadata is only generated if the respective window length is given
            meta_data = {}
            if dcpg_data_kwargs.get("dna_wlen") is not None:
                meta_data["dna_ranges"] = _ranges(dcpg_data_kwargs["dna_wlen"])
            if dcpg_data_kwargs.get("cpg_wlen") is not None:
                meta_data["cpg_ranges"] = _ranges(dcpg_data_kwargs["cpg_wlen"])

            id_ctr_offset += data_raw["chromo"].shape[0]
            # Weights are not supported at the moment
            yield {"inputs": inputs, "targets": outputs, "metadata": meta_data}
Example #17
0
    def __getitem__(self, idx):
        """Return one sample for interval ``idx``: one-hot DNA sequence (plus
        optional bias-track inputs) and per-task profile/count targets.
        """
        from pybedtools import Interval

        if self.fasta_extractor is None:
            # first call
            # Use normal fasta/bigwig extractors
            # NOTE(review): extractors are built lazily on first access,
            # presumably to keep the dataset picklable for worker processes
            # before any file handles are opened -- TODO confirm.
            self.fasta_extractor = FastaExtractor(self.ds.fasta_file, use_strand=True)

            self.bw_extractors = {task: [BigwigExtractor(track) for track in task_spec.tracks]
                                  for task, task_spec in self.ds.task_specs.items() if task in self.tasks}

            self.bias_bw_extractors = {task: [BigwigExtractor(track) for track in task_spec.tracks]
                                       for task, task_spec in self.ds.bias_specs.items()}

        # Get the genomic interval for that particular datapoint
        interval = Interval(self.dfm.iat[idx, 0],  # chrom
                            self.dfm.iat[idx, 1],  # start
                            self.dfm.iat[idx, 2])  # end

        # Transform the input interval (for say augmentation...)
        if self.interval_transformer is not None:
            interval = self.interval_transformer(interval)

        # resize the intervals to the desired widths
        # (targets use peak_width, the model sequence input uses seq_width)
        target_interval = resize_interval(deepcopy(interval), self.peak_width)
        seq_interval = resize_interval(deepcopy(interval), self.seq_width)

        # This only kicks in when we specify the taskname from dataspec
        # to the 3rd column. E.g. it doesn't apply when using intervals_file
        interval_from_task = self.dfm.iat[idx, 3] if self.intervals_file is None else ''

        # extract DNA sequence + one-hot encode it
        sequence = self.fasta_extractor([seq_interval])[0]
        inputs = {"seq": sequence}

        # exctract the profile counts from the bigwigs
        cuts = {f"{task}/profile": _run_extractors(self.bw_extractors[task],
                                                   [target_interval],
                                                   sum_tracks=spec.sum_tracks)[0]
                for task, spec in self.ds.task_specs.items() if task in self.tasks}
        if self.track_transform is not None:
            for task in self.tasks:
                cuts[f'{task}/profile'] = self.track_transform(cuts[f'{task}/profile'])

        # Add total number of counts
        for task in self.tasks:
            cuts[f'{task}/counts'] = self.total_count_transform(cuts[f'{task}/profile'].sum(0))

        if len(self.ds.bias_specs) > 0:
            # Extract the bias tracks
            biases = {bias_task: _run_extractors(self.bias_bw_extractors[bias_task],
                                                 [target_interval],
                                                 sum_tracks=spec.sum_tracks)[0]
                      for bias_task, spec in self.ds.bias_specs.items()}

            # Concatenate every bias track assigned to a task along the last
            # (channel) axis.
            task_biases = {f"bias/{task}/profile": np.concatenate([biases[bt]
                                                                   for bt in self.task_bias_tracks[task]],
                                                                  axis=-1)
                           for task in self.tasks}

            if self.track_transform is not None:
                for task in self.tasks:
                    task_biases[f'bias/{task}/profile'] = self.track_transform(task_biases[f'bias/{task}/profile'])

            # Add total number of bias counts
            for task in self.tasks:
                task_biases[f'bias/{task}/counts'] = self.total_count_transform(task_biases[f'bias/{task}/profile'].sum(0))

            inputs = {**inputs, **task_biases}

        if self.include_classes:
            # Optionally, add binary labels from the additional columns in the tsv intervals file
            classes = {f"{task}/class": self.dfm.iat[idx, i + 3]
                       for i, task in enumerate(self.dfm_tasks) if task in self.tasks}
            cuts = {**cuts, **classes}

        out = {"inputs": inputs,
               "targets": cuts}

        if self.include_metadata:
            # remember the metadata (what genomic interval was used)
            out['metadata'] = {"range": GenomicRanges(chr=target_interval.chrom,
                                                      start=target_interval.start,
                                                      end=target_interval.stop,
                                                      id=idx,
                                                      strand=(target_interval.strand
                                                              if target_interval.strand is not None
                                                              else "*"),
                                                      ),
                               "interval_from_task": interval_from_task}
        return out
Example #18
0
    def __getitem__(self, idx):
        """Return reference and mutated sequence inputs (plus metadata) for
        the Vex-seq exon/variant at row ``idx`` of ``self.exons``.
        """
        exon = self.exons.iloc[idx]
        ########
        # Specific part for Vex-seq
        # variant side
        variant = Variant(CHROM=exon.CHROM,
                          POS=exon.POS,
                          REF=exon.REF,
                          ALT=exon.ALT,
                          ID=exon.ID,
                          strand=exon.strand,
                          side=exon.side)
        ########

        def _optional(getter, default):
            # Annotation columns are optional in the exon table; fall back to
            # an empty value instead of failing.  (Was a bare ``except:``,
            # which also swallowed KeyboardInterrupt/SystemExit.)
            try:
                return getter()
            except Exception:
                return default

        attributes = {
            'transcript_id': _optional(lambda: [exon.transcript_id], [""]),
            'gene_id': _optional(lambda: [exon.gene_id], [""]),
            'exon_id': _optional(
                lambda: [exon.CHROM + ':' + exon.Exon_Start + '-' + exon.Exon_End],
                [""]),
            'biotype': _optional(lambda: exon.biotype, ""),
            'order': _optional(lambda: exon.order, ""),
        }

        exon = ExonInterval.from_exonfile(exon,
                                          attributes,
                                          overhang=self.overhang)

        out = {'inputs': {}, 'mut_inputs': {}}
        if self.split_seq:
            out['inputs']['seq'] = self.split(exon.get_seq(self.fasta))
            out['mut_inputs']['seq'] = self.split(
                exon.get_mut_seq(self.fasta, variant).upper())
        else:
            out['inputs']['seq'] = exon.get_seq(self.fasta)
            out['mut_inputs']['seq'] = exon.get_mut_seq(self.fasta,
                                                        variant).upper()
        # Intron overhang lengths are identical for ref and mutated inputs.
        out['inputs']['intronl_len'] = self.overhang[0]
        out['inputs']['intronr_len'] = self.overhang[1]
        out['mut_inputs']['intronl_len'] = self.overhang[0]
        out['mut_inputs']['intronr_len'] = self.overhang[1]

        out['metadata'] = {
            'gene_id': exon.gene_id,
            'transcript_id': exon.transcript_id,
            'biotype': attributes['biotype'],
            'order': exon.order,
            'ranges': GenomicRanges(
                exon.seqid,  # exon is now object of class ExonInterval
                exon.start - 1,  # for kipoi 0-base
                exon.end,  # actual got sequence coordinates
                exon.gene_id,
                exon.strand),
        }
        return out
@pytest.mark.parametrize("pair", BAD_ARR_SCHEMA_PAIRS)
def test_bad_array_schemas(pair):
    """Every (schema, batch) pair in BAD_ARR_SCHEMA_PAIRS must be rejected."""
    schema, batch = pair
    assert not schema.compatible_with_batch(batch)


# --------------------------------------------
# metadata structs
# (MetadataStruct schema, value) pairs that are expected to validate.
GOOD_MDATASTRUCT_PAIRS = [
    (MetadataStruct(type="str", doc=""), np.arange(10).astype(str)),
    (MetadataStruct(type="int", doc=""), np.arange(10).astype(int)),
    (MetadataStruct(type="float", doc=""), np.arange(10).astype(float)),
    (MetadataStruct(type="array", doc=""), np.arange(10).reshape((2, 5))),
    (MetadataStruct(type="GenomicRanges", doc=""),
     GenomicRanges(chr="chr1", start=10, end=20, id="1", strand="+")),
    (MetadataStruct(type="GenomicRanges", doc=""),
     dict(chr="chr1", start=10, end=20, id="1", strand="+")),
]

BAD_MDATASTRUCT_PAIRS = [
    # larger array
    (MetadataStruct(type="str", doc=""), np.arange(10).reshape((2, 5)).astype(str)),
    (MetadataStruct(type="int", doc=""), np.arange(10).reshape((2, 5)).astype(int)),
    (MetadataStruct(type="float", doc=""), np.arange(10).reshape((2, 5)).astype(float)),
    # not an array
Example #20
0
@pytest.mark.parametrize("pair", BAD_ARR_SCHEMA_PAIRS)
def test_bad_array_schemas(pair):
    """An incompatible batch must not validate against its array schema."""
    arr_schema, bad_batch = pair
    compatible = arr_schema.compatible_with_batch(bad_batch)
    assert not compatible


# --------------------------------------------
# metadata structs
# Pairs of (MetadataStruct schema, value) that must be accepted.
GOOD_MDATASTRUCT_PAIRS = [
    (MetadataStruct(type=mtype, doc=""), value)
    for mtype, value in [
        ("str", np.arange(10).astype(str)),
        ("int", np.arange(10).astype(int)),
        ("float", np.arange(10).astype(float)),
        ("array", np.arange(10).reshape((2, 5))),
        ("GenomicRanges",
         GenomicRanges(chr="chr1", start=10, end=20, id="1", strand="+")),
        ("GenomicRanges",
         dict(chr="chr1", start=10, end=20, id="1", strand="+")),
    ]
]

BAD_MDATASTRUCT_PAIRS = [
    # larger array
    (MetadataStruct(type="str", doc=""), np.arange(10).reshape(
        (2, 5)).astype(str)),
    (MetadataStruct(type="int", doc=""), np.arange(10).reshape(
        (2, 5)).astype(int)),
    (MetadataStruct(type="float", doc=""), np.arange(10).reshape(
        (2, 5)).astype(float)),
    # not an array
    (MetadataStruct(type="array", doc=""), 1),
    (MetadataStruct(type="array", doc=""), "3"),
Example #21
0
 def grange(self):
     """Return this object's coordinates as a ``GenomicRanges`` record."""
     return GenomicRanges(self.chrom, self.start, self.end,
                          self.transcript_id, self.strand)
Example #22
0
def extractor(intervals_file,
              input_data_sources,
              target_data_sources=None,
              batch_size=128):
    """BatchGenerator

    Args:
        intervals_file: tsv file
            Assumes bed-like `chrom start end id` format.
        input_data_sources: dict
            mapping from input name to genomelake directory
        target_data_sources: dict, optional
            mapping from input name to genomelake directory
        batch_size: int

    Yields:
        dict with 'inputs', optional 'targets' and 'metadata' (ranges).
    """
    bt = pybedtools.BedTool(intervals_file)
    input_data_extractors = {
        key: ArrayExtractor(data_source)
        for key, data_source in input_data_sources.items()
    }
    if target_data_sources is not None:
        target_data_extractors = {
            key: ArrayExtractor(data_source)
            for key, data_source in target_data_sources.items()
        }
    intervals_generator = batch_iter(bt, batch_size)
    for intervals_batch in intervals_generator:
        out = {}
        # get data; [..., None] adds the channel axis for conv1d.
        # (loop variable renamed from `extractor`, which shadowed this
        # function's own name)
        out['inputs'] = {
            key: ext(intervals_batch)[..., None]
            for key, ext in input_data_extractors.items()
        }
        if target_data_sources is not None:
            out['targets'] = {
                key: ext(intervals_batch)[..., None]
                for key, ext in target_data_extractors.items()
            }
        # get metadata
        chrom = []
        start = []
        end = []
        ids = []
        for interval in intervals_batch:
            chrom.append(interval.chrom)
            start.append(interval.start)
            end.append(interval.stop)
            ids.append(interval.name)

        out['metadata'] = {
            'ranges':
            GenomicRanges(chr=np.array(chrom),
                          start=np.array(start),
                          end=np.array(end),
                          # BUG FIX: was np.array(id) -- the builtin `id`
                          # function -- instead of the collected names.
                          id=np.array(ids))
        }

        yield out