def test_tmp_normalization(tmpdir):
    """Check the 'tpm' normalizer on an array binned at resolution 50.

    The array is filled with a constant per chromosome and collapsed with
    'sum'; the check is repeated for the 'ndarray' and 'hdf5' storages.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    def loading(garray):
        # Constant signal: 10 everywhere on chr1, 1 everywhere on chr2.
        for chrom, length, value in (('chr1', 150, 10), ('chr2', 300, 1)):
            garray[GenomicInterval(chrom, 0, length), 0] = \
                np.repeat(value, length).reshape(-1, 1)
        return garray

    for store in ['ndarray', 'hdf5']:
        ga = create_genomic_array({'chr1': 150, 'chr2': 300},
                                  stranded=False,
                                  typecode='float32',
                                  storage=store,
                                  cache=True,
                                  resolution=50,
                                  loader=loading,
                                  collapser='sum',
                                  normalizer=get_normalizer('tpm'))

        chr1_expected = np.asarray([[[10 * 1000 / 50 * 1e6 / (720.)]]])
        np.testing.assert_allclose(ga[GenomicInterval('chr1', 100, 101)],
                                   chr1_expected)

        chr2_expected = np.asarray([[[1 * 1000 / 50 * 1e6 / (720.)]]])
        np.testing.assert_allclose(ga[GenomicInterval('chr2', 100, 101)],
                                   chr2_expected)
def get_bins(chrom_len, chromosomes, count_list, step_width, feature_len):
    """Bin per-position read counts into windows of ``step_width``.

    ``count_list`` has to be created with 'get_count_list'. Chromosomes
    missing from ``chrom_len`` are silently skipped.

    Returns a dict like ``{'chr1': [0, 10, 2, 0, ...], 'chr2': ...}`` where
    the first list entry is the first bin and so on. Each visited window of
    ``count_list`` is reset to 0 as a side effect.
    """
    bins = {}
    for chrom in chromosomes:
        if chrom not in chrom_len:
            continue
        carry = 0
        for left in range(0, chrom_len[chrom], step_width):
            right = min(left + step_width, chrom_len[chrom])
            total = reduce(lambda acc, val: acc + val,
                           count_list[GenomicInterval(chrom, left, right)])
            count_list[GenomicInterval(chrom, left, right)] = 0
            # Add what the previous window reported as spilling over.
            total += carry
            bins.setdefault(chrom, []).append(total)
            # NOTE(review): the window has already been reset to 0 above
            # when the overrun is computed - confirm this order is intended.
            carry = _get_overrun(chrom, left, right, step_width, count_list,
                                 feature_len)
    return bins
def test_bwga_instance_unstranded_taged(tmpdir):
    """Unstranded in-memory array created with an explicit ``datatags``."""
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    window = GenomicInterval('chr10', 100, 120, '.')
    cover = create_genomic_array({'chr10': 300},
                                 stranded=False,
                                 typecode='int8',
                                 storage='ndarray',
                                 datatags='test_bwga_instance_unstranded')

    # Reading with a plain integer index must fail.
    with pytest.raises(Exception):
        cover[1]

    # Writing with a plain integer index must fail as well.
    with pytest.raises(Exception):
        cover[1] = 1

    # A fresh array is all zeros with shape (length, strands, conditions).
    np.testing.assert_equal(cover[window].shape, (20, 1, 1))
    np.testing.assert_equal(cover[window], np.zeros((20, 1, 1)))

    cover[window, 0] = np.ones((20, 1))
    np.testing.assert_equal(cover[window], np.ones((20, 1, 1)))
    np.testing.assert_equal(cover[window].sum(), 20)

    # Nothing outside the written window was touched.
    whole_chrom = GenomicInterval('chr10', 0, 300, '.')
    np.testing.assert_equal(cover[whole_chrom].sum(), 20)
def parse_cigar_into_features(self):
    """Parse CLIPz cigar string into dict of features of the read.

    Walks ``self.cigar_list``: integer tokens advance the running offset,
    every other token is recorded as a ``ReadFeature`` in ``self.features``
    (under the token itself on the '+' strand, under its
    ``mutations_mapping`` entry on the '-' strand) and advances the offset
    by one.

    Raises:
        Exception: if ``self.strand`` is neither '+' nor '-' when a
            non-integer token is encountered.
    """
    consumed = 0
    insertions = 0

    def _locus():
        # One-base interval at the current offset, shifted back by the
        # number of 'I...' tokens seen so far.
        return GenomicInterval(self.chrom,
                               self.position + consumed - insertions,
                               self.position + consumed + 1 - insertions,
                               self.strand)

    for token in self.cigar_list:
        if self.is_string_integer(token):
            consumed += int(token)
            continue

        if token.startswith('I'):
            insertions += 1

        if self.strand == '+':
            bucket = self.features[token]
            bucket.append(
                ReadFeature(token,
                            type_=features_types[token[0]],
                            interval=_locus(),
                            beg_in_read=consumed + insertions))
        elif self.strand == "-":
            bucket = self.features[mutations_mapping[token]]
            bucket.append(
                ReadFeature(mutations_mapping[token],
                            type_=features_types[token[0]],
                            interval=_locus(),
                            beg_in_read=self.length - consumed - 1 + insertions))
        else:
            raise Exception("Strand must be + or -")
        consumed += 1
def __iter__(self) -> Iterator[Tuple[GenomicFeature, str]]:
    """Yield ``(feature, raw_line)`` for every data line of the file.

    Lines that are exactly a newline are skipped. Lines starting with '#'
    are skipped too; those starting with '##' that match ``## key value``
    are stored in ``self.metadata`` first. Byte lines are decoded.
    """
    for raw in TextFile.__iter__(self):
        line = raw.decode() if isinstance(raw, bytes) else raw
        if line == "\n":
            continue
        if line.startswith('#'):
            if line.startswith("##"):
                meta = re.match(r"##\s*(\S+)\s+(\S*)", line)
                if meta:
                    self.metadata[meta.group(1)] = meta.group(2)
            continue

        columns = line.split("\t", 8)
        (seqname, source, feature, start, end, score, strand, frame,
         attributeStr) = columns
        (attr, name) = parse_GFF_attribute_string(attributeStr, True)

        # The start column is shifted down by one; the end column is used
        # as-is when end_included is set and shifted down by one otherwise.
        end_shift = 0 if self.end_included else 1
        iv = GenomicInterval(seqname, int(start) - 1, int(end) - end_shift,
                             strand)

        record = GenomicFeature(name, feature, iv)
        # "." is kept verbatim as the placeholder for score and frame.
        record.source = source
        record.score = score if score == "." else float(score)
        record.frame = frame if frame == "." else int(frame)
        record.attr = attr
        yield (record, line)
def test_bwga_instance_unstranded(tmpdir):
    """Unstranded, uncached in-memory array: scalar write and read-back."""
    window = GenomicInterval('chr10', 100, 120, '.')
    cover = create_genomic_array({'chr10': 300},
                                 stranded=False,
                                 typecode='int8',
                                 storage='ndarray',
                                 cache=False)

    # Freshly created arrays are zero-filled.
    np.testing.assert_equal(cover[window].shape, (20, 1, 1))
    np.testing.assert_equal(cover[window], np.zeros((20, 1, 1)))

    # A scalar is written to every position of the window.
    cover[window, 0] = 1
    np.testing.assert_equal(cover[window], np.ones((20, 1, 1)))
    np.testing.assert_equal(cover[window].sum(), 20)

    # The rest of the chromosome stays untouched.
    whole_chrom = GenomicInterval('chr10', 0, 300, '.')
    np.testing.assert_equal(cover[whole_chrom].sum(), 20)
def _get_overrun(chrom, i, end, step_width, count_list, feature_len):
    """Return overrun of reads that fall in two bins.

    Sums the values of all steps of ``count_list`` inside ``[i, end)`` on
    ``chrom`` whose start plus ``feature_len`` lies beyond ``end``.

    Args:
        chrom: chromosome name of the window.
        i: window start.
        end: window end.
        step_width: unused; kept for call compatibility.
        count_list: indexable by ``GenomicInterval``; the result must offer
            ``steps()`` yielding ``(interval, value)`` pairs.
        feature_len: length added to a step start before comparing to ``end``.

    Returns:
        The summed value, or 0 when no step qualifies.
    """
    window = GenomicInterval(chrom, i, end)
    # Compare by value: the previous ``is not 0`` relied on small-int
    # identity. ``sum`` over nothing is 0, so no emptiness test is needed
    # (the old test on a ``filter`` object was always False on Python 3).
    return sum(step[1]
               for step in count_list[window].steps()
               if step[0].start + feature_len > end and step[1] != 0)
def loadBed(filename):
    """
    Parses bed file to `HTSeq.GenomicInterval` objects.

    Blank lines are skipped and the line terminator is stripped before
    splitting, so the last column never carries a trailing newline.

    :param filename: Filename.
    :type filename: str
    :returns: dict of featureName:`HTSeq.GenomicInterval` objects.
    :rtype: dict
    """
    from collections import OrderedDict
    from warnings import warn
    warn("Function is deprecated!")
    from HTSeq import GenomicInterval

    features = OrderedDict()
    # The context manager closes the handle even if a line fails to parse.
    with open(filename) as handle:
        for line in handle:
            line = line.rstrip("\r\n")
            if not line.strip():
                continue
            fields = line.split("\t")
            feature = GenomicInterval(
                fields[0],  # chrom
                int(fields[1]),  # start
                int(fields[2]),  # end
                fields[5]  # strand
            )
            # NOTE(review): the key is column 5 (index 4); in standard BED
            # the name is column 4 (index 3) - confirm against the inputs.
            features[fields[4]] = feature
    return features
def _bigwig_loader(garray, aggregate):
    """Fill ``garray`` with binned signal from the bigWig files.

    ``bigwigfiles``, ``gsize``, ``resolution``, ``dtype`` and ``nan_to_num``
    are not defined here and are taken from the surrounding scope.

    Args:
        garray: target array, written as ``garray[GenomicInterval, i]`` with
            ``i`` the position of the file in ``bigwigfiles``.
        aggregate: callable reducing the values of one bin to one number.

    Returns:
        ``garray``.
    """
    print("load from bigwig")
    for i, sample_file in enumerate(bigwigfiles):
        bwfile = pyBigWig.open(sample_file)
        # Close the handle once the file is processed, also on errors.
        try:
            for chrom in gsize:
                vals = np.zeros(
                    (get_chrom_length(gsize[chrom], resolution), ),
                    dtype=dtype)

                locus = _str_to_iv(chrom, template_extension=0)
                if len(locus) == 1:
                    locus = locus + (0, gsize[chrom])

                # when only to load parts of the genome
                for start in range(locus[1], locus[2], resolution):
                    if garray._full_genome_stored:
                        # be careful not to overshoot at the chromosome end
                        end = min(start + resolution, gsize[chrom])
                    else:
                        end = start + resolution

                    x = np.asarray(
                        bwfile.values(locus[0], int(start), int(end)))
                    if nan_to_num:
                        x = np.nan_to_num(x, copy=False)
                    vals[(start - locus[1]) // resolution] = aggregate(x)

                garray[GenomicInterval(*locus), i] = vals
        finally:
            bwfile.close()
    return garray
def read_transcripts(h5fn, stranded=True):
    """Load transcript exon coordinates from an HDF5 file.

    Reads the datasets "tid", "coordlut", "coordmap", "strand" and "chrom".

    Args:
        h5fn: path of the HDF5 file.
        stranded: passed to ``GenomicArrayOfSets``.

    Returns:
        Tuple ``(a, ts, tid2idx, tidlengths)``: the array of sets with every
        exon interval tagged by transcript id, a dict from transcript id to
        ``TranscriptCoordinates``, a dict from transcript id to its position
        in "tid", and a uint32 array with ``TranscriptCoordinates.length``
        per transcript.
    """
    # The datasets are copied into memory with [:], so the file can be
    # closed right away instead of staying open for the whole process.
    with h5py.File(h5fn, 'r') as h5file:
        tids = h5file["tid"][:]
        coordlut = h5file["coordlut"][:]
        coordmap = h5file["coordmap"][:]
        strands = h5file["strand"][:]
        chroms = h5file["chrom"][:]

    a = GenomicArrayOfSets("auto", stranded=stranded)
    ts = dict()
    tidlengths = empty(len(tids), dtype="uint32")

    for itid, tid in enumerate(tids):
        # coordlut holds (number of exons, offset into coordmap).
        nexons, coordmappos = coordlut[itid]
        strand = "+" if strands[itid] else "-"
        # NOTE(review): str() of a bytes value gives "b'...'" on Python 3 -
        # confirm the dtype of the "chrom" dataset.
        chrom = str(chroms[itid])
        txcoords = coordmap[coordmappos:(coordmappos + nexons)]
        for start, end in txcoords:
            iv = GenomicInterval(chrom, start, end, strand)
            a[iv] += tid
        tc = TranscriptCoordinates(txcoords, strand)
        ts[tid] = tc
        tidlengths[itid] = tc.length

    tid2idx = {v: i for (i, v) in enumerate(tids)}
    return a, ts, tid2idx, tidlengths
def __iter__(self):
    """Yield ``(GenomicInterval, score)`` for every non-"track" line.

    Each data line must hold exactly four tab-separated columns:
    chrom, start, end and score. The interval uses ``self.strand``.
    """
    for record in FileOrSequence.__iter__(self):
        if not record.startswith("track"):
            chrom, start, end, score = record.rstrip().split("\t")
            region = GenomicInterval(chrom, int(start), int(end), self.strand)
            yield region, float(score)
def test_bwga_instance_stranded(tmpdir):
    """Stranded in-memory array: a '+' write only fills the first strand."""
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    plus_window = GenomicInterval('chr10', 100, 120, '+')
    cover = create_genomic_array({'chr10': 300},
                                 stranded=True,
                                 typecode='int8',
                                 storage='ndarray')

    # Two strand channels, all zeros to begin with.
    np.testing.assert_equal(cover[plus_window].shape, (20, 2, 1))
    np.testing.assert_equal(cover[plus_window], np.zeros((20, 2, 1)))

    cover[plus_window, 0] = 1

    # Only strand channel 0 is expected to be set.
    expected = np.zeros((20, 2, 1))
    expected[:, :1, :] = np.ones((20, 1, 1))
    np.testing.assert_equal(cover[plus_window], expected)
    np.testing.assert_equal(cover[plus_window].sum(), 20)

    # Whole chromosome, default strand: still only those 20 counts.
    whole_chrom = GenomicInterval('chr10', 0, 300)
    np.testing.assert_equal(cover[whole_chrom].sum(), 20)
def _make_index(self, df, mapping):
    """Build a ``GenomicArrayOfSets`` mapping intervals to DataFrame labels.

    Args:
        df: DataFrame; every label in ``df.index`` yields one interval.
        mapping: dict with the column names to read under the keys 'chrom',
            'start', 'end' and, when the instance is stranded, 'strand'.

    Returns:
        The populated ``GenomicArrayOfSets``. The previous version ended with
        ``self = index``, which only rebinds the local name and dropped the
        result; returning it lets callers actually store it.
    """
    index = GenomicArrayOfSets(self._all_chroms, stranded=self._is_stranded)
    for lab in df.index:
        if self._is_stranded:
            iv = GenomicInterval(chrom=df.loc[lab, mapping['chrom']],
                                 start=df.loc[lab, mapping['start']],
                                 end=df.loc[lab, mapping['end']],
                                 strand=df.loc[lab, mapping['strand']])
        else:
            iv = GenomicInterval(chrom=df.loc[lab, mapping['chrom']],
                                 start=df.loc[lab, mapping['start']],
                                 end=df.loc[lab, mapping['end']])
        # Record the DataFrame index value as a GenomicInterval value
        index[iv] += lab
    return index
def test_bwga_instance_stranded_notcached(tmpdir):
    """Stranded, uncached in-memory array: write both strand columns."""
    plus_window = GenomicInterval('chr10', 100, 120, '+')
    cover = create_genomic_array({'chr10': 300},
                                 stranded=True,
                                 typecode='int8',
                                 storage='ndarray',
                                 cache=False)

    # Two strand channels, all zeros to begin with.
    np.testing.assert_equal(cover[plus_window].shape, (20, 2, 1))
    np.testing.assert_equal(cover[plus_window], np.zeros((20, 2, 1)))

    # Ones on strand channel 0, zeros on channel 1, written as (20, 2).
    expected = np.zeros((20, 2, 1))
    expected[:, :1, :] = 1
    cover[plus_window, 0] = expected[:, :, 0]

    np.testing.assert_equal(cover[plus_window], expected)
    np.testing.assert_equal(cover[plus_window].sum(), 20)

    # Whole chromosome, default strand: still only those 20 counts.
    whole_chrom = GenomicInterval('chr10', 0, 300)
    np.testing.assert_equal(cover[whole_chrom].sum(), 20)
def _get_overrun(chrom, i, end, step_width, count_list, feature_len):
    """Return overrun of reads that fall in two bins.

    Sums the values of all steps of ``count_list`` inside ``[i, end)`` on
    ``chrom`` whose start plus ``feature_len`` lies beyond ``end``.

    Args:
        chrom: chromosome name of the window.
        i: window start.
        end: window end.
        step_width: unused; kept for call compatibility.
        count_list: indexable by ``GenomicInterval``; the result must offer
            ``steps()`` yielding ``(interval, value)`` pairs.
        feature_len: length added to a step start before comparing to ``end``.

    Returns:
        The summed value, or 0 when no step qualifies.
    """
    steps = count_list[GenomicInterval(chrom, i, end)].steps()
    # ``!= 0`` replaces ``is not 0``: identity comparison against an int
    # literal depends on interpreter caching and emits a SyntaxWarning.
    # An empty selection sums to 0, so no special case is needed.
    return sum(x[1] for x in steps
               if x[0].start + feature_len > end and x[1] != 0)
def __getitem__(self, index):
    """Return the flanked ``GenomicInterval`` stored at position ``index``.

    The interval starts at ``self.offsets[index]`` and spans
    ``self.rel_end[index]`` positions (1 if that value is not positive),
    widened by ``self.flank`` on both sides.

    Raises:
        IndexError: if ``index`` is not an ``int``.
    """
    if not isinstance(index, int):
        raise IndexError('Index support only for "int". Given {}'.format(
            type(index)))

    begin = self.offsets[index]
    span = self.rel_end[index]
    stop = begin + (span if span > 0 else 1)
    return GenomicInterval(self.chrs[index], begin - self.flank,
                           stop + self.flank, self.strand[index])
def __init__(self, snor_id, organism, chrom, start, end, strand, sequence,
             snor_type, **kwargs):
    """Create a snoRNA record.

    Args:
        snor_id (str): a unique id for snoRNA
        organism (str): species in which snoRNA can be found
        chrom (str): chromosome, passed to GenomicInterval
        start (int): start coordinate, passed to GenomicInterval
        end (int): end coordinate, passed to GenomicInterval
        strand (str): strand, passed to GenomicInterval
        sequence (str): sequence; upper-cased and wrapped in Seq
        snor_type (str): general snoRNA type, stored as given

    Kwargs:
        snor_name (str): an official name for snoRNA
        snor_family (str): snoRNA family
        snor_precise_type (str): snoRNA precise type as opposed to general
                                 type of snor_type
        alias (str): alternative name for snoRNA
        gene_name (str): snoRNA gene name
        accession (str): accession for the snoRNA (eg. NCBI)
        modified_sites (str): string of modified sites, e.g.
                              28S:U46,18S:G52, separated by commas; handed
                              to __assign_modification_sites
        host_gene (str): host gene
        host_id (str): id for host
        organization (str): organization of the locus
        note (str): additional information about snoRNA

    Every keyword that is not supplied is stored as None. The instance is
    checked with __validate() at the end.
    """
    # required arguments
    self.snor_id = snor_id
    self.organism = organism
    self.position = GenomicInterval(chrom, start, end, strand)
    self.sequence = Seq(sequence.upper())
    self.snor_type = snor_type

    # optional annotations set before the modification sites are assigned
    for key in ("snor_name", "snor_family", "snor_precise_type", "alias",
                "gene_name", "accession"):
        setattr(self, key, kwargs.get(key))

    self.__assign_modification_sites(kwargs.get("modified_sites"))

    # optional annotations set after the modification sites are assigned
    for key in ("host_gene", "host_id", "organization", "note"):
        setattr(self, key, kwargs.get(key))

    self.__validate()
def __getitem__(self, index):
    """Return the flanked ``GenomicInterval`` stored at position ``index``.

    Coordinates come from ``self.starts`` / ``self.ends``; an interval whose
    end equals its start gets its end moved up by one. ``self.flank`` is
    subtracted from the start and added to the end.

    Raises:
        IndexError: if ``index`` is not an ``int``.
    """
    if not isinstance(index, int):
        raise IndexError('Index support only for "int". Given {}'.format(
            type(index)))

    begin = self.starts[index]
    stop = self.ends[index]
    if stop == begin:
        stop += 1
    return GenomicInterval(self.chrs[index], begin - self.flank,
                           stop + self.flank, self.strand[index])
def build_genome_array(bdgfile):
    """Accumulate the values of a four-column text file into a GenomicArray.

    Lines starting with '#' are ignored. Every other line must split on
    whitespace into chrom, start, end and value; the value is added to the
    unstranded, double-typed array over ``[start, end)``.
    """
    coverage = GenomicArray("auto", stranded=False, typecode="d")
    with open(bdgfile) as handle:
        for record in handle:
            if record.startswith("#"):
                continue
            chrom, start, end, value = record.strip().split()
            region = GenomicInterval(chrom, int(start), int(end), ".")
            coverage[region] += float(value)
    return coverage
def _seq_loader(cover, seqs, order):
    """Write the index representation of each sequence into ``cover``.

    ``dtype`` is not defined here and is taken from the surrounding scope.
    For ``order > 1`` the per-base indices are convolved with the powers of
    the alphabet size, giving one index per window of ``order`` bases.
    """
    print('Convert sequences to index array')
    for record in seqs:
        if cover._full_genome_stored:
            region = GenomicInterval(record.id, 0,
                                     len(record) - order + 1, '.')
        else:
            # The interval is parsed from the record id.
            region = GenomicInterval(
                *_str_to_iv(record.id, template_extension=0))

        indices = np.asarray(seq2ind(record), dtype=dtype)

        if order > 1:
            # for higher order motifs, this part is used
            alphabet_size = len(record.seq.alphabet.letters)
            weights = np.asarray(
                [pow(alphabet_size, power) for power in range(order)])
            indices = np.convolve(indices, weights, mode='valid')

        cover[region, 0] = indices
def __call__(self, garray):
    """Write the index representation of ``self.seqs`` into ``garray``.

    Indices are created with ``garray.typecode`` as dtype. For
    ``self.order > 1`` the per-base indices are convolved with the powers of
    the alphabet size. Each result is stored as a column vector.
    """
    order = self.order
    dtype = garray.typecode

    print('Convert sequences to index array')
    for record in self.seqs:
        if garray._full_genome_stored:
            region = GenomicInterval(record.id, 0,
                                     len(record) - order + 1, '.')
        else:
            # The interval is parsed from the record id.
            region = GenomicInterval(
                *_str_to_iv(record.id, template_extension=0))

        indices = np.asarray(seq2ind(record), dtype=dtype)

        if order > 1:
            # for higher order motifs, this part is used
            alphabet_size = len(record.seq.alphabet.letters)
            weights = np.asarray(
                [pow(alphabet_size, power) for power in range(order)])
            indices = np.convolve(indices, weights, mode='valid')

        garray[region, 0] = indices.reshape(-1, 1)
def test_logzscore_normalization(tmpdir):
    """Check the 'zscorelog' normalizer for both storage backends.

    After normalization the weighted mean must be 0 and the weighted
    standard deviation 1; two single positions are compared to fixed values.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    def loading(garray):
        # Constant signal: 10 everywhere on chr1, 100 everywhere on chr2.
        for chrom, length, value in (('chr1', 150, 10), ('chr2', 300, 100)):
            garray[GenomicInterval(chrom, 0, length), 0] = \
                np.repeat(value, length).reshape(-1, 1)
        return garray

    tolerance = dict(rtol=1e-5, atol=1e-5)

    for store in ['ndarray', 'hdf5']:
        ga = create_genomic_array({'chr1': 150, 'chr2': 300},
                                  stranded=False,
                                  typecode='float32',
                                  storage=store,
                                  cache=True,
                                  loader=loading,
                                  normalizer=get_normalizer('zscorelog'))

        np.testing.assert_allclose(ga.weighted_mean(), np.asarray([0.0]),
                                   **tolerance)
        np.testing.assert_allclose(ga.weighted_sd(), np.asarray([1.]),
                                   **tolerance)
        np.testing.assert_allclose(ga[GenomicInterval('chr1', 100, 101)],
                                   np.asarray([[[-1.412641340027806]]]),
                                   **tolerance)
        np.testing.assert_allclose(ga[GenomicInterval('chr2', 100, 101)],
                                   np.asarray([[[0.706320670013903]]]),
                                   **tolerance)
def __getitem__(self, idxs):
    """Return one-hot encoded sequence data for ``idxs``.

    ``idxs`` may be an int, a slice, an iterable of ints, a
    ``GenomicInterval`` or a 3/4-tuple that is turned into one.

    Raises:
        ValueError: for a tuple of any other length, or when a genomic
            interval is used although the whole genome is not stored.
        IndexError: if ``idxs`` ends up not being iterable.
    """

    def _finish(encoded):
        # Shared tail: apply the transformations, then move the channel
        # axis to position 1 unless channels are kept last.
        for transform in self.transformations:
            encoded = transform(encoded)
        if not self._channel_last:
            encoded = np.transpose(encoded, (0, 3, 1, 2))
        return encoded

    if isinstance(idxs, tuple):
        if len(idxs) not in (3, 4):
            raise ValueError('idxs cannot be interpreted as genomic interval.'
                             ' use (chr, start, end) or (chr, start, end, strand)')
        # interpret idxs as genomic interval
        idxs = GenomicInterval(*idxs)

    if isinstance(idxs, int):
        idxs = [idxs]
    elif isinstance(idxs, slice):
        idxs = range(idxs.start or 0,
                     idxs.stop or len(self),
                     idxs.step or 1)
    elif isinstance(idxs, GenomicInterval):
        if not self.garray._full_genome_stored:
            raise ValueError('Indexing with GenomicInterval only possible '
                             'when the whole genome (or chromosome) was loaded')

        # accept a genomic interval directly
        indices = np.zeros((1, idxs.length - self.garray.order + 1))
        indices[0] = self._getsingleitem(idxs)
        return _finish(as_onehot(indices, self.garray.order,
                                 self._alphabetsize))

    try:
        iter(idxs)
    except TypeError:
        raise IndexError('Bioseq.__getitem__: ' + 'index must be iterable')

    return _finish(as_onehot(self.iseq4idx(idxs), self.garray.order,
                             self._alphabetsize))
def _bam_loader(garray, files):
    """Count read positions from BAM files into ``garray``.

    ``gsize``, ``resolution``, ``dtype``, ``template_extension``,
    ``min_mapq``, ``pairedend``, ``aggregate`` and ``stranded`` are not
    defined here and are taken from the surrounding scope.

    For every file and every entry of ``gsize`` one position per accepted
    alignment is counted, separately for forward and reverse reads, then
    written to ``garray[GenomicInterval, i]`` with ``i`` the position of the
    file in ``files``.

    Args:
        garray: target array.
        files: iterable of BAM file names.

    Returns:
        ``garray``.
    """
    print("load from bam")
    for i, sample_file in enumerate(files):
        print('Counting from {}'.format(sample_file))
        aln_file = pysam.AlignmentFile(sample_file, 'rb')  # pylint: disable=no-member
        # Close the handle once the file is processed, also on errors.
        try:
            for chrom in gsize:

                # column 0 counts forward reads, column 1 reverse reads
                array = np.zeros(
                    (get_chrom_length(gsize[chrom], resolution), 2),
                    dtype=dtype)

                locus = _str_to_iv(chrom,
                                   template_extension=template_extension)
                if len(locus) == 1:
                    locus = (locus[0], 0, gsize[chrom])
                # locus = (chr, start, end)
                # or locus = (chr, )

                for aln in aln_file.fetch(*locus):

                    if aln.is_unmapped:
                        continue

                    if aln.mapq < min_mapq:
                        continue

                    if aln.is_read2:
                        # only consider read1 so as not to double count
                        # fragments for paired end reads
                        # read2 will also be false for single end
                        # reads.
                        continue

                    if aln.is_paired:
                        # if paired end read, consider the midpoint
                        if not (aln.is_proper_pair and
                                aln.reference_name == aln.next_reference_name):
                            # only consider paired end reads if both mates
                            # are properly mapped and they map to the
                            # same reference_name
                            continue
                        # if the next reference start >= 0,
                        # the read is considered as a paired end read
                        # in this case we consider the mid point
                        if pairedend == 'midpoint':
                            pos = min(aln.reference_start,
                                      aln.next_reference_start) + \
                                      abs(aln.template_length) // 2
                        else:
                            if aln.is_reverse:
                                # last position of the downstream read
                                pos = max(
                                    aln.reference_end,
                                    aln.next_reference_start +
                                    aln.query_length)
                            else:
                                # first position of the upstream read
                                pos = min(aln.reference_start,
                                          aln.next_reference_start)
                    else:
                        # here we consider single end reads
                        # whose 5 prime end is determined strand specifically
                        if aln.is_reverse:
                            pos = aln.reference_end
                        else:
                            pos = aln.reference_start

                    if not garray._full_genome_stored:
                        # if we get here, a region was given,
                        # otherwise, the entire chromosome is read.
                        pos -= locus[1] + template_extension

                        if pos < 0 or pos >= locus[2] - locus[1]:
                            # if the read 5 p end or mid point is outside
                            # of the region of interest, the read is discarded
                            continue

                    # compute divide by the resolution
                    pos //= resolution

                    # fill up the read strand specifically
                    if aln.is_reverse:
                        array[pos, 1] += 1
                    else:
                        array[pos, 0] += 1

                # apply the aggregation
                if aggregate is not None:
                    array = aggregate(array)

                if stranded:
                    lp = locus + ('+', )
                    garray[GenomicInterval(*lp), i] = array[:, 0]
                    lm = locus + ('-', )
                    garray[GenomicInterval(*lm), i] = array[:, 1]
                else:
                    # if unstranded, aggregate the reads from
                    # both strands
                    garray[GenomicInterval(*locus), i] = array.sum(axis=1)
        finally:
            aln_file.close()

    return garray
def __getitem__(self, idxs):
    """Return coverage data for ``idxs`` as a numpy array.

    ``idxs`` may be an int, a slice, an iterable of ints, a
    ``GenomicInterval`` or a tuple that is unpacked into one.

    For a genomic interval the result has a leading axis of length 1.
    Otherwise one entry per index is returned, each looked up through
    ``self.gindexer``. Unless ``self._channel_last`` is set, the axes are
    reordered with ``(0, 3, 1, 2)`` before returning.

    Raises:
        IndexError: if ``idxs`` ends up not being iterable.
    """
    if isinstance(idxs, tuple):
        # A tuple is taken as the constructor arguments of an interval.
        idxs = GenomicInterval(*idxs)
    if isinstance(idxs, int):
        idxs = [idxs]
    elif isinstance(idxs, slice):
        # Missing (or zero) slice fields fall back to 0, len(self) and 1.
        idxs = range(idxs.start if idxs.start else 0,
                     idxs.stop if idxs.stop else len(self),
                     idxs.step if idxs.step else 1)
    elif isinstance(idxs, GenomicInterval):
        if self.garray._full_genome_stored:
            # NOTE(review): looks like a leftover debug print - confirm.
            print(idxs)
            # accept a genomic interval directly
            #data = np.zeros((1,) + self.shape[1:])
            data = self._getsingleitem(idxs)
            data = data.reshape((1, ) + data.shape)
            for transform in self.transformations:
                data = transform(data)
        else:
            # Only parts of the genome are stored: assemble the requested
            # region from the stored intervals that fall into it.
            chrom = idxs.chrom
            start = idxs.start
            end = idxs.end
            gindexer_new = self.gindexer.filter_by_region(include=chrom,
                                                          start=start,
                                                          end=end)
            # Buffer for the region plus one extra step on either side;
            # the extra steps are cut off again below.
            data = np.zeros(
                (1, ((end - start) // self.garray.resolution) +
                 (2 * (gindexer_new.stepsize) // self.garray.resolution)) +
                self.shape[2:])
            if self.padding_value != 0:
                data.fill(self.padding_value)
            step_size = gindexer_new.stepsize
            for interval in gindexer_new:
                # NOTE(review): looks like a leftover debug print - confirm.
                print('new gindexer interval:', interval)
                tmp_data = np.array(self._getsingleitem(interval))
                tmp_data = tmp_data.reshape((1, ) + tmp_data.shape)
                if interval.strand == '-':
                    # invert the data so that is again relative
                    # to the positive strand,
                    # this avoids having to change the rel_pos computation
                    tmp_data = tmp_data[:, ::-1, ::-1, :]
                # Bin offset of this interval inside the padded buffer.
                rel_pos = (interval.start -
                           (start - step_size)) // self.garray.resolution
                data[:, rel_pos:rel_pos +
                     (step_size // self.garray.resolution), :, :] = tmp_data
            # NOTE(review): this reads the loop variable after the loop, so
            # it reflects the last interval only and is unbound if
            # gindexer_new yields nothing - confirm.
            if interval.strand == '-':
                # invert it back relative to minus strand
                data = data[:, ::-1, ::-1, :]
            # Drop the extra step at both ends of the buffer.
            data = data[:, (1 * (step_size) // self.garray.resolution):-1 *
                        (1 * (step_size) // self.garray.resolution), :, :]
            # NOTE(review): self.transformations are not applied in this
            # branch, unlike in the other code paths - confirm.
        if not self._channel_last:
            data = np.transpose(data, (0, 3, 1, 2))
        return data
    try:
        iter(idxs)
    except TypeError:
        raise IndexError('Cover.__getitem__: index must be iterable')
    # One row per requested index, pre-filled with the padding value.
    data = np.zeros((len(idxs), ) + self.shape_static[1:])
    if self.padding_value != 0:
        data.fill(self.padding_value)
    for i, idx in enumerate(idxs):
        interval = self.gindexer[idx]
        data[i, :, :, :] = self._getsingleitem(interval)
    for transform in self.transformations:
        data = transform(data)
    if not self._channel_last:
        data = np.transpose(data, (0, 3, 1, 2))
    return data
# Build a stranded integer GenomicArray over the given chromosomes and mark
# every aligned read: 1 on the '+' strand, -1 otherwise.
chromosomes = list(chromosomes)
hitMap = GenomicArray(chromosomes, stranded=True, typecode='i')
for alignment in SAM_Reader(alignFile):
    if alignment.aligned:
        genomeRegion = alignment.iv
        if genomeRegion.strand == '+':
            hitMap[genomeRegion] = 1
        else:
            hitMap[genomeRegion] = -1

# Plot both strands over the first 2,000,000 positions of the first
# chromosome.
chromo = chromosomes[0]
endPoint = 2000000
plusStrand = GenomicInterval(chromo, 0, endPoint, '+')
minusStrand = GenomicInterval(chromo, 0, endPoint, '-')
# NOTE(review): bothStrands is not used in the lines visible here.
bothStrands = GenomicInterval(chromo, 0, endPoint, '.')
pyplot.plot(list(hitMap[plusStrand]))
pyplot.plot(list(hitMap[minusStrand]))
pyplot.show()

print('\n Using HTSeq to access GFF genome features\n')

# Download the GFF annotation and open it with the HTSeq GFF reader.
remoteFileName = '/Bacteria/Escherichia_coli_536_uid58531/NC_008253.gff'
gffFile = 'examples/EcoliGenomeFeatures.gff'
downloadFile(FTP_ROOT + remoteFileName, gffFile)
fileObj = GFF_Reader(gffFile)
def loading(garray):
    """Fill ``garray`` with a constant per region and return it.

    'chr1:0-150' gets 10 at each of its 150 positions and 'chr2:0-300' gets
    1 at each of its 300 positions, both written to condition 0.
    """
    fills = (('chr1:0-150', 150, 10), ('chr2:0-300', 300, 1))
    for region_name, length, value in fills:
        garray[GenomicInterval(region_name, 0, length), 0] = \
            np.repeat(value, length).reshape(-1, 1)
    return garray