def __getitem__(self, idx):
    """Return the model inputs and metadata for the interval at ``idx``.

    The interval is re-centered to ``self.SEQ_WIDTH`` when its width
    differs. The first two entries of ``"inputs"`` are the sequence,
    mappability and DNase tracks concatenated along the last axis, once as
    extracted and once reversed (the "RC" version). The third entry is
    ``self.meta_feat`` with the per-interval gencode overlap flags appended.

    Args:
      idx: position of the interval in ``self.bt``.

    Returns:
      dict with keys ``"inputs"`` (list of three arrays), ``"targets"``
      (always an empty dict) and ``"metadata"`` (``ranges`` / ``ranges_rc``).
    """
    if self.fasta_extractor is None:
        # Extractors are created lazily, on the first item access.
        self.fasta_extractor = FastaExtractor(self.fasta_file)
        self.dnase_extractor = BigwigExtractor(self.dnase_file)
        self.mappability_extractor = BigwigExtractor(self.mappability_file)

    # Get the interval and resize it around its center if necessary
    interval = self.bt[idx]
    if interval.stop - interval.start != self.SEQ_WIDTH:
        center = (interval.start + interval.stop) // 2
        interval.start = center - self.SEQ_WIDTH // 2
        interval.end = center + self.SEQ_WIDTH // 2 + self.SEQ_WIDTH % 2

    # Gencode features: one boolean flag per overlap bed
    gencode_counts = np.array(
        [bed[idx].count for _, bed in self.overlap_beds], dtype=bool)

    # Sequence; the reversed view flips both axes
    seq = np.squeeze(self.fasta_extractor([interval]), axis=0)
    seq_rc = seq[::-1, ::-1]

    # DNase. `axis=0` belongs to np.squeeze (drop the batch axis only); it
    # was previously handed to the extractor call by a misplaced parenthesis.
    # NOTE(review): assumes the extractor returns (1, width) for one interval.
    dnase = np.squeeze(self.dnase_extractor([interval]), axis=0)[:, np.newaxis]
    dnase[np.isnan(dnase)] = 0  # NA fill
    dnase_rc = dnase[::-1]

    # Mappability, handled the same way as DNase
    mappability = np.squeeze(
        self.mappability_extractor([interval]), axis=0)[:, np.newaxis]
    mappability[np.isnan(mappability)] = 0  # NA fill
    mappability_rc = mappability[::-1]

    # Channel order: sequence, mappability, DNase
    bigwig_list = [seq, mappability, dnase]
    bigwig_rc_list = [seq_rc, mappability_rc, dnase_rc]

    ranges = GenomicRanges.from_interval(interval)
    ranges_rc = GenomicRanges.from_interval(interval)
    ranges_rc.strand = "-"

    return {
        "inputs": [
            np.concatenate(bigwig_list, axis=-1),  # stack along the last axis
            np.concatenate(bigwig_rc_list, axis=-1),  # RC version
            np.append(self.meta_feat, gencode_counts),
        ],
        "targets": {},  # No Targets
        "metadata": {
            "ranges": ranges,
            "ranges_rc": ranges_rc,
        },
    }
def test__overlap_vcf_region():
    """Check ``sp._overlap_vcf_region`` for dict and GenomicRanges regions.

    First a single region is queried and the hits are compared with every
    record in the example VCF. Then three regions are queried, with and
    without indel exclusion, and both the returned variants and the region
    indices are compared with expectations derived from the full record list.
    """
    vcf_path = kipoi.postprocessing.variant_effects.ensure_tabixed_vcf(
        "examples/rbp/example_files/variants.vcf")

    # Read every record once as the reference, then re-open for querying.
    vcf_obj = cyvcf2.VCF(vcf_path)
    all_records = [rec for rec in vcf_obj]
    vcf_obj.close()
    vcf_obj = cyvcf2.VCF(vcf_path)
    try:
        # Single region: every record is expected to map to region 0.
        regions_dict = {
            "chr": ["chr22"],
            "start": [21541589],
            "end": [36702137],
            "id": [0],
        }
        regions_gr = GenomicRanges(regions_dict["chr"], regions_dict["start"],
                                   regions_dict["end"], regions_dict["id"])
        for regions in [regions_dict, regions_gr]:
            found_vars, overlapping_region = sp._overlap_vcf_region(
                vcf_obj, regions, exclude_indels=False)
            assert all(
                str(el1) == str(el2)
                for el1, el2 in zip(all_records, found_vars))
            assert len(overlapping_region) == len(found_vars)
            assert all(el == 0 for el in overlapping_region)

        # Three regions: the wide one plus two narrow ones.
        regions_dict = {
            "chr": ["chr22", "chr22", "chr22"],
            "start": [21541589, 21541589, 30630220],
            "end": [36702137, 21541590, 30630222],
            "id": [0, 1, 2],
        }
        regions_gr = GenomicRanges(regions_dict["chr"], regions_dict["start"],
                                   regions_dict["end"], regions_dict["id"])

        # Expected variants, with and without indels.
        plus_indel_results = all_records + all_records[:1] + all_records[3:4]
        snv_results = [el for el in plus_indel_results if not el.is_indel]

        # Expected region index for each expected variant.
        ref_lines_indel = [0] * len(all_records) + [1] + [2]
        snv_ref_lines = [
            el for el, el1 in zip(ref_lines_indel, plus_indel_results)
            if not el1.is_indel
        ]

        for regions in [regions_dict, regions_gr]:
            for exclude_indels, ref_res, ref_lines in zip(
                    [False, True],
                    [plus_indel_results, snv_results],
                    [ref_lines_indel, snv_ref_lines]):
                found_vars, overlapping_region = sp._overlap_vcf_region(
                    vcf_obj, regions, exclude_indels)
                assert all(
                    str(el1) == str(el2)
                    for el1, el2 in zip(ref_res, found_vars)
                    if not el1.is_indel)
                assert overlapping_region == ref_lines
    finally:
        # The re-opened handle was previously never closed.
        vcf_obj.close()
def compatible_with_batch(self, batch, verbose=True):
    """Check whether ``batch`` is compatible with this metadata specification.

    Args:
      batch: batch of metadata values. For the genomic-ranges type this is
        a GenomicRanges object or anything ``GenomicRanges.from_dict``
        accepts; otherwise a numpy array with a leading batch axis.
      verbose: print the fail reason

    Returns:
      True if the batch passes all checks, False otherwise.
    """
    def print_msg(msg):
        if not verbose:
            return
        print("MetadataStruct mismatch")
        print(msg)

    # custom classes
    if self.type == MetadataType.GENOMIC_RANGES:
        if isinstance(batch, GenomicRanges):
            return True
        # TODO - do we strictly require the GenomicRanges class?
        #      - relates to metadata.py TODO about numpy_collate
        # for now it is enough that the batch converts to GenomicRanges
        # without any errors
        try:
            GenomicRanges.from_dict(batch)
        except Exception as e:
            print_msg("expecting a GenomicRanges object or a GenomicRanges-like dict")
            print_msg("convertion error: {0}".format(e))
            return False
        return True

    # every other type has to be a np.ndarray
    if not isinstance(batch, np.ndarray):
        print_msg("Expecting a np.ndarray. Got type(batch) = {0}".format(type(batch)))
        return False

    if batch.ndim < 1:
        print_msg("The array is a scalar (expecting at least the batch dimension)")
        return False

    # shape without the batch axis
    bshape = batch.shape[1:]

    # scalars may only have shape () or (1,) per sample
    scalar_types = {MetadataType.INT, MetadataType.STR, MetadataType.FLOAT}
    if self.type in scalar_types and bshape not in ((), (1,)):
        print_msg("expecting a scalar, got an array with shape (without the batch axis): {0}".format(bshape))
        return False

    # arrays
    # - no checks
    return True
def __getitem__(self, idx):
    """Return sequence input, targets and metadata for the interval at ``idx``.

    The interval is re-centered to ``self.SEQ_WIDTH`` when its width
    differs. Targets come from row ``idx`` of ``self.targets`` when it is
    set; otherwise an empty dict is returned.

    Args:
      idx: position of the interval in ``self.bt``.

    Returns:
      dict with keys ``"inputs"``, ``"targets"`` and ``"metadata"``.
    """
    if self.fasta_extractor is None:
        # Created lazily, on the first item access.
        self.fasta_extractor = FastaExtractor(self.fasta_file)

    interval = self.bt[idx]
    if interval.stop - interval.start != self.SEQ_WIDTH:
        center = (interval.start + interval.stop) // 2
        interval.start = center - self.SEQ_WIDTH // 2
        interval.end = center + self.SEQ_WIDTH // 2 + self.SEQ_WIDTH % 2

    if self.targets is not None:
        y = self.targets.iloc[idx].values
    else:
        y = {}

    # Run the fasta extractor and drop the batch axis
    seq = np.squeeze(self.fasta_extractor([interval]), axis=0)

    # Reformat so that it matches the DeepSEA shape:
    # swap the two axes and insert a singleton axis in the middle
    seq = np.swapaxes(seq, 1, 0)[:, None, :]

    return {
        "inputs": seq,
        "targets": y,
        "metadata": {
            "ranges": GenomicRanges.from_interval(interval)
        }
    }
def __getitem__(self, idx):
    """Build the dataloader entry for the branch at position ``idx``.

    Returns a dict with the branch sequence under
    ``inputs['bidirectional_1_input']`` and its annotation under
    ``metadata``. The metadata coordinates can be written to bed files.
    """
    branch = self.branches[idx]

    # grange[0] is shifted by one to get 0-based indexing
    start = branch.grange[0] - 1
    stop = branch.grange[1]

    return {
        'inputs': {
            # input sequence
            'bidirectional_1_input': branch.seq,
        },
        'metadata': {
            'geneID': branch.geneID,
            'transcriptID': branch.transcriptID,
            'chrom': branch.chrom,
            'strand': branch.strand,
            'start': start,
            'stop': stop,
            'biotype': branch.biotype,
            'ranges': GenomicRanges(
                branch.chrom,
                start,
                stop,
                branch.geneID + "_" + branch.transcriptID,
                branch.strand),
        },
    }
def __getitem__(self, idx):
    """Return sequence input, targets and metadata for the interval at ``idx``.

    Raises:
      ValueError: if the interval is not exactly ``self.SEQ_WIDTH`` wide.
    """
    # The extractor is created lazily, on the first item access
    if self.fasta_extractor is None:
        self.fasta_extractor = FastaExtractor(self.fasta_file)

    interval = self.bt[idx]
    width = interval.stop - interval.start
    if width != self.SEQ_WIDTH:
        raise ValueError(
            "Expected the interval to be {0} wide. Recieved stop - start = {1}"
            .format(self.SEQ_WIDTH, width))

    # Row `idx` of the targets table, or an empty dict without targets
    y = self.targets.iloc[idx].values if self.targets is not None else {}

    # Extract the sequence and drop the batch axis
    seq = np.squeeze(self.fasta_extractor([interval]), axis=0)
    # Swap the two axes, then insert a singleton axis at position 1
    swapped = np.swapaxes(seq, 1, 0)
    seq = np.expand_dims(swapped, axis=1)

    metadata = {"ranges": GenomicRanges.from_interval(interval)}
    return {"inputs": seq, "targets": y, "metadata": metadata}
def __getitem__(self, idx):
    """Return inputs and metadata for the gene or splice site at ``idx``.

    With ``self.MISO_AS`` set, the entry describes ``self.genes[idx]``;
    otherwise it describes ``self.spliceSites[idx]`` and additionally
    carries a ``ranges`` metadata entry.
    """
    # The fasta handle is opened lazily, on the first item access
    if self.fasta is None:
        self.fasta = FastaFile(self.fasta_file)

    if self.MISO_AS:
        gene = self.genes[idx]
        return {
            'inputs': self.get_seq(gene),
            'metadata': {
                'geneName': gene.geneName,
                'chrom': gene.chrom,
                'strand': gene.strand,
                'start': gene.start,
                'stop': gene.stop,
            },
        }

    site = self.spliceSites[idx]
    return {
        'inputs': site.get_seq(self.fasta),
        'metadata': {
            'geneID': site.geneID,
            'transcriptID': site.transcriptID,
            'biotype': site.biotype,
            'order': site.order,
            'ranges': GenomicRanges(
                site.chrom,
                site.grange[0] - 1,  # use 0-base indexing
                site.grange[1],
                site.geneID,
                site.strand),
        },
    }
def __getitem__(self, idx):
    """Return the sequence, labels and ranges for the bed entry at ``idx``.

    When ``self.auto_resize_len`` is truthy, the interval is first resized
    to that length around its center.
    """
    extractor = self.fasta_extractors
    if extractor is None:
        # Created lazily. Strand handling is switched off here on purpose
        # (the original code notes `self.use_strand` as the alternative).
        extractor = FastaStringExtractor(
            self.fasta_file,
            use_strand=False,
            force_upper=self.force_upper)
        self.fasta_extractors = extractor

    interval, labels = self.bed[idx]

    # automatically resize the interval to the requested length
    if self.auto_resize_len:
        interval = resize_interval(interval, self.auto_resize_len,
                                   anchor='center')

    # NOTE: no max_seq_len check is applied here; the original code leaves
    # open the question of whether one is needed.

    # Run the fasta extractor
    seq = extractor.extract(interval)

    ranges = GenomicRanges(interval.chrom, interval.start, interval.stop,
                           str(idx))
    return {
        "inputs": np.array(seq),
        "targets": labels,
        "metadata": {"ranges": ranges},
    }
def __getitem__(self, idx):
    """Return sequence input, target and metadata for the interval at ``idx``.

    The target is the interval name parsed as a float (wrapped in a
    one-element array), or an empty dict when the name is None.

    Raises:
      ValueError: if the interval is not exactly ``self.SEQ_WIDTH`` wide.
    """
    # The extractor is created lazily, on the first item access
    if self.fasta_extractor is None:
        self.fasta_extractor = FastaExtractor(self.fasta_file)

    interval = self.bt[idx]
    width = interval.stop - interval.start
    if width != self.SEQ_WIDTH:
        raise ValueError(
            "Expected the interval to be {0} wide. Recieved stop - start = {1}"
            .format(self.SEQ_WIDTH, width))

    y = {} if interval.name is None else np.array([float(interval.name)])

    # Run the fasta extractor; all singleton axes are dropped
    seq = np.squeeze(self.fasta_extractor([interval]))

    return {
        "inputs": {"data/genome_data_dir": seq},
        "targets": y,
        "metadata": {"ranges": GenomicRanges.from_interval(interval)},
    }
def __getitem__(self, idx):
    """Return the upper-cased sequence string and ranges for interval ``idx``.

    The sequence is fetched from ``self.fasta`` and reverse-complemented
    with ``rc_str`` when the interval is on the "-" strand.

    Args:
      idx: position of the interval in ``self.bt``.

    Returns:
      dict with keys ``"inputs"`` (sequence string) and ``"metadata"``.

    Raises:
      ValueError: if the interval is wider than 1000bp. ValueError is a
        subclass of the generic Exception raised before, so existing
        ``except Exception`` handlers keep working.
    """
    interval = self.bt[idx]

    # Intervals can't be bigger than 1000bp
    if (interval.stop - interval.start) > 1000:
        raise ValueError("Input sequences should be at maximum 1000bp.")

    # Fetch the fasta line
    seq = self.fasta.fetch(str(interval.chrom), interval.start,
                           interval.stop).upper()

    # Reverse complement input string if requested
    if interval.strand == "-":
        seq = rc_str(seq)

    return {
        "inputs": seq,
        "metadata": {
            "ranges": GenomicRanges.from_interval(interval)
        }
    }
def __iter__(self):
    """Yield one entry per (interval, variant) pair from ``self.matcher``.

    Each entry holds the one-hot encoded reference sequence and the
    sequence with the variant applied, plus metadata describing the
    variant, the interval ranges and the attributes named in
    ``self.interval_attrs``.
    """
    interval: Interval
    variant: Variant
    for interval, variant in self.matcher:
        ref_seq = self.one_hot(self.reference_sequence.extract(interval))

        # The anchor position depends on the strand of the interval
        anchor = 135 if interval.neg_strand else 70
        alt_raw = self.variant_seq_extractor.extract(
            interval,
            [variant],
            anchor=anchor,
        )
        alt_seq = self.one_hot(alt_raw)

        metadata = {
            "variant": {
                "chrom": variant.chrom,
                "start": variant.start,
                "end": variant.end,
                "ref": variant.ref,
                "alt": variant.alt,
                "id": variant.id,
                "str": str(variant),
            },
            "ranges": GenomicRanges.from_interval(interval),
        }
        # Requested interval attributes; missing ones default to ''
        for k in self.interval_attrs:
            metadata[k] = interval.attrs.get(k, '')

        yield {
            "inputs": {
                "ref_seq": ref_seq,
                "alt_seq": alt_seq,
            },
            "metadata": metadata,
        }
def __getitem__(self, idx):
    """Return sequence and distance inputs, targets and metadata for ``idx``.

    ``"targets"`` is only present when ``self.target_dataset`` is set.

    Raises:
      ValueError: if the interval is not exactly ``self.SEQ_WIDTH`` wide.
    """
    # Both extractors are created lazily, on the first item access
    if self.seq_extractor is None:
        self.seq_extractor = FastaExtractor(self.fasta_file)
        self.dist_extractor = DistToClosestLandmarkExtractor(
            gtf_file=self.gtf, landmarks=ALL_LANDMARKS)

    interval = self.bt[idx]
    width = interval.stop - interval.start
    if width != self.SEQ_WIDTH:
        raise ValueError(
            "Expected the interval to be {0} wide. Recieved stop - start = {1}"
            .format(self.SEQ_WIDTH, width))

    # input - sequence
    inputs = {'seq': np.squeeze(self.seq_extractor([interval]), axis=0)}

    # input - distance; squeeze the batch axis of every transformed feature
    raw_dist = self.dist_extractor([interval])
    dist_dict = self.dist_transformer.transform(raw_dist)
    squeezed = {}
    for k, v in dist_dict.items():
        squeezed[k] = np.squeeze(v, axis=0)
    inputs.update(squeezed)

    out = {'inputs': inputs}

    # targets
    if self.target_dataset is not None:
        out["targets"] = np.array([self.target_dataset[idx]])

    # metadata
    out['metadata'] = {'ranges': GenomicRanges.from_interval(interval)}
    return out
def __getitem__(self, idx):
    """Return sequence input, target and metadata for the interval at ``idx``.

    The target is the interval name parsed as a float (wrapped in a
    one-element array), or an empty dict when the name is None.
    """
    interval = self.bt[idx]

    # Intervals need to be 1000bp wide
    assert interval.stop - interval.start == 1000

    # The target value is read from the interval's name field
    name = interval.name
    y = {} if name is None else np.array([float(name)])

    # Run the fasta extractor; all singleton axes are dropped
    extracted = self.fasta_extractor([interval])
    seq = np.squeeze(extracted)

    inputs = {"data/genome_data_dir": seq}
    metadata = {"ranges": GenomicRanges.from_interval(interval)}
    return {"inputs": inputs, "targets": y, "metadata": metadata}
def __getitem__(self, idx):
    """Return the entry for the gene or splice site at ``idx``.

    With ``self.MISO_AS`` set, the entry describes ``self.genes[idx]`` and
    includes a ``targets`` value (``np.nan`` when ``self.Y`` is None) and
    the extracted regions. Otherwise it describes
    ``self.spliceSites[idx]`` and has no ``targets`` key.
    """
    if not self.MISO_AS:
        site = self.spliceSites[idx]
        return {
            'inputs': site.get_seq(self.fasta),
            'metadata': {
                'geneID': site.geneID,
                'transcriptID': site.transcriptID,
                'biotype': site.biotype,
                'order': site.order,
                'ranges': GenomicRanges(
                    site.chrom,
                    site.grange[0] - 1,  # use 0-base indexing
                    site.grange[1],
                    site.geneID,
                    site.strand),
            },
        }

    gene = self.genes[idx]
    inputs, ranges = self.get_seq(gene)
    targets = self.Y.get_target(gene.geneName) if self.Y is not None else np.nan
    return {
        'inputs': inputs,
        'targets': targets,
        'metadata': {
            'geneName': gene.geneName,
            'chrom': gene.chrom,
            'strand': gene.strand,
            'start': gene.start,
            'stop': gene.stop,
            'extracted_regions': ranges,
        },
    }
def __getitem__(self, idx):
    """Return sequence input, labels and metadata for the tsv row at ``idx``.

    When ``self.auto_resize_len`` is truthy, the interval is resized to
    that length before the sequence is extracted.
    """
    # The extractor is created lazily, on the first item access
    if self.fasta_extractor is None:
        self.fasta_extractor = FastaExtractor(self.fasta_file)

    interval, labels = self.tsv[idx]

    # automatically resize the interval to the requested length
    if self.auto_resize_len:
        interval = resize_interval(interval, self.auto_resize_len)

    # Run the fasta extractor; all singleton axes are dropped
    seq = np.squeeze(self.fasta_extractor([interval]))

    # A missing strand is reported as "*"
    if interval.strand is not None:
        strand = interval.strand
    else:
        strand = "*"

    ranges = GenomicRanges(
        chr=interval.chrom,
        start=interval.start,
        end=interval.stop,
        id=str(idx),
        strand=strand,
    )
    return {
        "inputs": {"seq": seq},
        "targets": labels,
        "metadata": {
            "ranges": ranges,
            "interval_from_task": '',
        },
    }
def __getitem__(self, idx):
    """Return the model inputs and metadata for the interval at ``idx``.

    The interval is re-centered to ``self.SEQ_WIDTH`` when its width
    differs. The first two entries of ``"inputs"`` are the sequence,
    mappability and DNase tracks concatenated along the last axis, once as
    extracted and once reversed (the "RC" version). The third entry is
    ``self.meta_feat``.

    Args:
      idx: position of the interval in ``self.bt``.

    Returns:
      dict with keys ``"inputs"`` (list of three arrays), ``"targets"``
      (always an empty dict) and ``"metadata"`` (``ranges`` / ``ranges_rc``).
    """
    # Get the interval and resize it around its center if necessary
    interval = self.bt[idx]
    if interval.stop - interval.start != self.SEQ_WIDTH:
        center = (interval.start + interval.stop) // 2
        interval.start = center - self.SEQ_WIDTH // 2
        interval.end = center + self.SEQ_WIDTH // 2 + self.SEQ_WIDTH % 2

    # Sequence; the reversed view flips both axes
    seq = np.squeeze(self.fasta_extractor([interval]), axis=0)
    seq_rc = seq[::-1, ::-1]

    # DNase. `axis=0` belongs to np.squeeze (drop the batch axis only); it
    # was previously handed to the extractor call by a misplaced parenthesis.
    # NOTE(review): assumes the extractor returns (1, width) for one interval.
    dnase = np.squeeze(self.dnase_extractor([interval]), axis=0)[:, np.newaxis]
    dnase[np.isnan(dnase)] = 0  # NA fill
    dnase_rc = dnase[::-1]

    # Mappability, handled the same way as DNase
    mappability = np.squeeze(
        self.mappability_extractor([interval]), axis=0)[:, np.newaxis]
    mappability[np.isnan(mappability)] = 0  # NA fill
    mappability_rc = mappability[::-1]

    # Channel order: sequence, mappability, DNase
    bigwig_list = [seq, mappability, dnase]
    bigwig_rc_list = [seq_rc, mappability_rc, dnase_rc]

    ranges = GenomicRanges.from_interval(interval)
    ranges_rc = GenomicRanges.from_interval(interval)
    ranges_rc.strand = "-"

    return {
        "inputs": [
            np.concatenate(bigwig_list, axis=-1),  # stack along the last axis
            np.concatenate(bigwig_rc_list, axis=-1),  # RC version
            self.meta_feat,
        ],
        "targets": {},  # No Targets
        "metadata": {
            "ranges": ranges,
            "ranges_rc": ranges_rc,
        },
    }
def dl_batch():
    """Return a small example dataloader batch with three samples.

    The batch has integer inputs 0..2 and metadata made of a three-element
    GenomicRanges on "chr1" plus string gene ids.
    """
    ranges = GenomicRanges(
        chr=np.array(["chr1", "chr1", "chr1"]),
        start=np.arange(3) + 1,
        end=np.arange(3) + 5,
        id=np.arange(3).astype(str),
        strand=np.array(["*"] * 3),
    )
    metadata = {
        "ranges": ranges,
        "gene_id": np.arange(3).astype(str),
    }
    return {"inputs": np.arange(3), "metadata": metadata}
def __getitem__(self, idx):
    """Return the transformed sequence and metadata for anchor row ``idx``.

    The interval is built around the anchor row with the configured number
    of upstream and downstream positions. Metadata holds the attributes
    named in ``self._interval_attrs`` (``''`` when missing from the row)
    and the interval ranges.
    """
    row = self._gtf_anchor.iloc[idx]
    interval = self._create_anchored_interval(
        row,
        num_upstream=self._num_upstream,
        num_downstream=self._num_downstream)

    # Extract the raw sequence, then apply the configured transform
    sequence = self._transform(self._fa.extract(interval))

    metadata_dict = {}
    for k in self._interval_attrs:
        metadata_dict[k] = row.get(k, '')
    metadata_dict["ranges"] = GenomicRanges(
        interval.chrom, interval.start, interval.stop, str(idx))

    return {
        "inputs": np.array(sequence),
        "metadata": metadata_dict,
    }
def __getitem__(self, idx):
    """Return sequence input, bigwig-derived targets and metadata for ``idx``.

    The sequence is taken from the interval resized to 1000bp. For every
    key of ``self.bigwigs``, the target is the total signal, summed over
    all of that key's bigwig files, within the interval resized to
    ``self.track_width``.
    """
    # Extractors are created lazily, on the first item access
    if self.fasta_extractor is None:
        self.fasta_extractor = FastaExtractor(self.fasta_file)
        self.bigwig_extractors = {
            a: [BigwigExtractor(f) for f in self.bigwigs[a]]
            for a in self.bigwigs
        }

    interval, labels = self.tsv[idx]
    interval = resize_interval(interval, 1000)
    # Intervals need to be 1000bp wide
    assert interval.stop - interval.start == 1000

    # Run the fasta extractor; all singleton axes are dropped
    seq = np.squeeze(self.fasta_extractor([interval]))

    # Resize a copy so the 1000bp interval itself stays untouched
    interval_wide = resize_interval(deepcopy(interval), self.track_width)

    targets = {}
    for a in self.bigwig_extractors:
        tracks = [e([interval_wide])[0] for e in self.bigwig_extractors[a]]
        targets[a] = sum(tracks).sum()

    metadata = {
        "ranges": GenomicRanges(interval.chrom, interval.start,
                                interval.stop, str(idx)),
        "ranges_wide": GenomicRanges.from_interval(interval_wide),
        "name": interval.name,
    }
    return {
        "inputs": {"seq": seq},
        "targets": targets,
        "metadata": metadata,
    }
def __getitem__(self, idx):
    """Return the one-hot sequence and its reversed copy for entry ``idx``.

    The interval is resized to 500bp around its center before extraction.

    Args:
      idx: position of the entry in ``self.bt``.

    Returns:
      dict with ``"inputs"`` (list: transformed sequence and the same array
      reversed along both axes) and ``"metadata"`` (list of two ranges).
    """
    # Create the extractor once and reuse it; it was previously rebuilt on
    # every item access. getattr covers the attribute not being initialised.
    if getattr(self, "fasta_extractor", None) is None:
        self.fasta_extractor = FastaStringExtractor(self.fasta_file)

    # get the interval (the accompanying targets are not used here)
    interval, _ = self.bt[idx]

    # resize to 500bp
    interval = resize_interval(interval, 500, anchor='center')

    # extract the sequence
    seq = self.fasta_extractor.extract(interval)

    # one-hot encode the sequence
    seq_onehot = self.transform(seq)
    seq_onehot_rc = seq_onehot[::-1, ::-1]

    ranges = GenomicRanges.from_interval(interval)
    # NOTE(review): ranges_rc is identical to ranges (the strand is not
    # flipped) - confirm this is intended.
    ranges_rc = GenomicRanges.from_interval(interval)

    return {
        "inputs": [seq_onehot, seq_onehot_rc],
        "metadata": [ranges, ranges_rc]
    }
def __next__(self):
    """Return the entry for the next item of ``self.exonGenerator``.

    The upper-cased sequence is stored under ``inputs['ss']``. When
    ``self.split_seq`` is set, only the first ``'donor'`` element returned
    by ``self.split`` is kept.
    """
    ss = next(self.exonGenerator)

    seq = ss.get_seq(self.fasta).upper()
    if self.split_seq:
        parts = self.split(seq, ss.overhang)
        seq = parts['donor'][0]

    ranges = GenomicRanges(ss.chrom,
                           ss.Exon_Start,
                           ss.Exon_End,
                           ss.transcript_id,
                           ss.strand)
    return {
        'inputs': {'ss': seq},
        'metadata': {'ranges': ranges},
    }
def __getitem__(self, idx):
    """Return the sequence input and metadata for the interval at ``idx``.

    Targets are always an empty dict.

    Raises:
      ValueError: if the interval is not exactly ``self.SEQ_WIDTH`` wide.
    """
    interval = self.bt[idx]

    width = interval.stop - interval.start
    if width != self.SEQ_WIDTH:
        raise ValueError(
            "Expected the interval to be {0} wide. Recieved stop - start = {1}"
            .format(self.SEQ_WIDTH, width))

    # Extract the sequence and drop the batch axis
    extracted = self.fasta_extractor([interval])
    seq = np.squeeze(extracted, axis=0)

    metadata = {"ranges": GenomicRanges.from_interval(interval)}
    return {
        "inputs": seq,
        "targets": {},  # No Targets
        "metadata": metadata,
    }
def __getitem__(self, idx):
    """Return sequence input, targets and metadata for the interval at ``idx``.

    Targets come from row ``idx`` of ``self.targets`` when it is set;
    otherwise an empty dict is returned.
    """
    interval = self.bt[idx]

    y = self.targets.iloc[idx].values if self.targets is not None else {}

    # Extract the sequence and drop the batch axis
    extracted = self.fasta_extractor([interval])
    seq = np.squeeze(extracted, axis=0)

    metadata = {"ranges": GenomicRanges.from_interval(interval)}
    return {"inputs": seq, "targets": y, "metadata": metadata}
def __getitem__(self, idx):
    """Return the encoded upstream sequence and metadata for start codon ``idx``.

    The interval comes from ``get_upstream(feature, self.n_upstream)``.
    Metadata holds the interval ranges and the first value of the
    feature's gene_id, transcript_id and gene_biotype attributes (``""``
    when the attribute is missing).
    """
    # The extractor is created lazily, on the first item access
    if self.fasta_extractor is None:
        self.fasta_extractor = FastaStringExtractor(
            self.fasta_file, use_strand=True, force_upper=True)

    feature = self.start_codons[idx]
    interval = get_upstream(feature, self.n_upstream)

    raw_seq = self.fasta_extractor.extract(interval)
    seq_one_hot_encoded = self.input_transform(raw_seq)

    metadata = {"ranges": GenomicRanges.from_interval(interval)}
    for key in ("gene_id", "transcript_id", "gene_biotype"):
        metadata[key] = feature.attributes.get(key, [""])[0]

    return {
        "inputs": seq_one_hot_encoded,
        "metadata": metadata,
    }
def __getitem__(self, idx):
    """Return sequence input, labels and metadata for the tsv row at ``idx``.

    The interval must be exactly 1000bp wide (checked with an assert).
    """
    # The extractor is created lazily, on the first item access
    if self.fasta_extractor is None:
        self.fasta_extractor = FastaExtractor(self.fasta_file)

    interval, labels = self.tsv[idx]

    # Intervals need to be 1000bp wide
    assert interval.stop - interval.start == 1000

    # Run the fasta extractor; all singleton axes are dropped
    extracted = self.fasta_extractor([interval])
    seq = np.squeeze(extracted)

    ranges = GenomicRanges(interval.chrom, interval.start, interval.stop,
                           str(idx))
    return {
        "inputs": {"data/genome_data_dir": seq},
        "targets": labels,
        "metadata": {"ranges": ranges},
    }
def __iter__(self):
    """Yield one entry per row of ``self.regions_of_interest``.

    Each entry holds the one-hot encoded reference sequence of the row's
    interval plus metadata with the interval ranges and the row values
    for every column named in ``self.interval_attrs``.
    """
    interval: Interval
    regions = self.regions_of_interest.as_df()
    for _, row in regions.iterrows():
        interval = Interval(
            chrom=row["Chromosome"],
            start=row["Start"],
            end=row["End"],
            strand=row["Strand"],
        )
        inputs = self.one_hot(self.reference_sequence.extract(interval))

        metadata = {"ranges": GenomicRanges.from_interval(interval)}
        for k in self.interval_attrs:
            metadata[k] = row[k]

        yield {"inputs": inputs, "metadata": metadata}
def __getitem__(self, idx):
    """Return sequence input, targets and metadata for the interval at ``idx``.

    The interval must be exactly 101bp wide (checked with an assert).
    Targets come from row ``idx`` of ``self.targets`` when it is set;
    otherwise an empty dict is returned.
    """
    interval = self.bt[idx]

    # Intervals need to be 101bp wide
    assert interval.stop - interval.start == 101

    y = self.targets.iloc[idx].values if self.targets is not None else {}

    # Run the fasta extractor; all singleton axes are dropped
    extracted = self.fasta_extractor([interval])
    seq = extracted.squeeze()

    metadata = {"ranges": GenomicRanges.from_interval(interval)}
    return {"inputs": seq, "targets": y, "metadata": metadata}
def __getitem__(self, idx):
    """Return encoded sequence, targets and metadata for the interval at ``idx``.

    The fasta record is looked up by the id that
    ``self._interval_to_fasta_id`` derives from the interval, and sliced
    with the coordinates from ``self._compute_relative_coords``.
    """
    # The fasta handle is opened lazily, on the first item access
    if self.fasta_extractor is None:
        self.fasta_extractor = Fasta(self.fasta_file)

    interval = self.bt[idx]
    interval_fasta_id = self._interval_to_fasta_id(interval)

    y = self.targets.iloc[idx].values if self.targets is not None else {}

    # Slice the matching record with the relative coordinates
    start, end = self._compute_relative_coords(interval)
    record = self.fasta_extractor[interval_fasta_id]
    seq = record[start:end].seq

    encoded = encodeDNA([seq]).squeeze()
    return {
        "inputs": encoded,
        "targets": y,
        "metadata": {"ranges": GenomicRanges.from_interval(interval)},
    }
def __getitem__(self, idx):
    """Return one-hot sequence, targets and metadata for the interval at ``idx``.

    The interval must be exactly 1000bp wide (checked with an assert).
    Targets come from row ``idx`` of ``self.targets`` when it is set;
    otherwise an empty dict is returned.
    """
    # The extractor is created lazily, on the first item access
    if self.fasta_extractor is None:
        self.fasta_extractor = FastaStringExtractor(self.fasta_file)

    interval = self.bt[idx]

    # Intervals need to be 1000bp wide
    assert interval.stop - interval.start == 1000

    y = self.targets.iloc[idx].values if self.targets is not None else {}

    # Extract the sequence string and one-hot encode it
    # TODO: Remove additional dtype after kipoiseq gets a new release
    raw_seq = self.fasta_extractor.extract(interval)
    seq = one_hot_dna(raw_seq, dtype=np.float32)

    metadata = {"ranges": GenomicRanges.from_interval(interval)}
    return {"inputs": seq, "targets": y, "metadata": metadata}
def __getitem__(self, idx):
    """Return sequence input, targets and metadata for the interval at ``idx``.

    Targets come from row ``idx`` of ``self.targets`` when it is set;
    otherwise an empty dict is returned.

    Raises:
      ValueError: if the interval is not exactly ``self.SEQ_WIDTH`` wide.
    """
    interval = self.bt[idx]

    width = interval.stop - interval.start
    if width != self.SEQ_WIDTH:
        raise ValueError(
            "Expected the interval to be {0} wide. Recieved stop - start = {1}"
            .format(self.SEQ_WIDTH, width))

    y = self.targets.iloc[idx].values if self.targets is not None else {}

    # Extract the sequence and drop the batch axis
    seq = np.squeeze(self.fasta_extractor([interval]), axis=0)

    # Reformat so that it matches the Basset shape:
    # swap the two axes and append a trailing singleton axis
    swapped = np.swapaxes(seq, 1, 0)
    seq = swapped[:, :, None]

    metadata = {"ranges": GenomicRanges.from_interval(interval)}
    return {"inputs": seq, "targets": y, "metadata": metadata}