Example #1
def test_tmp_normalization(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    def loading(garray):
        garray[GenomicInterval('chr1', 0, 150),
               0] = np.repeat(10, 150).reshape(-1, 1)
        garray[GenomicInterval('chr2', 0, 300),
               0] = np.repeat(1, 300).reshape(-1, 1)
        return garray

    for store in ['ndarray', 'hdf5']:
        ga = create_genomic_array({
            'chr1': 150,
            'chr2': 300
        },
                                  stranded=False,
                                  typecode='float32',
                                  storage=store,
                                  cache=True,
                                  resolution=50,
                                  loader=loading,
                                  collapser='sum',
                                  normalizer=get_normalizer('tpm'))
        np.testing.assert_allclose(
            ga[GenomicInterval('chr1', 100, 101)],
            np.asarray([[[10 * 1000 / 50 * 1e6 / (720.)]]]))
        np.testing.assert_allclose(
            ga[GenomicInterval('chr2', 100, 101)],
            np.asarray([[[1 * 1000 / 50 * 1e6 / (720.)]]]))
Example #2
def get_bins(chrom_len, chromosomes, count_list, step_width, feature_len):
    """Creates list of bins of length <step_width> with values describing 
    the number of reads that fall into a bin.
    <count_list> has to be created with 'get_count_list' 
    It returns a dict like: {'chr1' [0,10,2,0,...], 'chr2' ...} 
    where the first list entry gives the first bin and so on."""
    result = {}
    for chrom in chromosomes:
        overrun = 0
        if chrom not in chrom_len:
            #             print("Warning: %s not found, do not consider" %chrom, file=sys.stderr)
            pass
        else:
            # print("... considering %s..."%chrom, file=sys.stderr)
            for i in range(0, chrom_len[chrom], step_width):
                end = min(i + step_width, chrom_len[chrom])
                counts = reduce(lambda x, y: x + y,
                                count_list[GenomicInterval(chrom, i, end)])
                count_list[GenomicInterval(chrom, i, end)] = 0
                counts += overrun

                if chrom in list(result.keys()):
                    result[chrom].append(counts)
                else:
                    result[chrom] = [counts]

                overrun = _get_overrun(chrom, i, end, step_width, count_list,
                                       feature_len)

    return result
Example #3
def test_bwga_instance_unstranded_taged(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    iv = GenomicInterval('chr10', 100, 120, '.')
    ga = create_genomic_array({'chr10': 300},
                              stranded=False,
                              typecode='int8',
                              storage='ndarray',
                              datatags='test_bwga_instance_unstranded')

    with pytest.raises(Exception):
        # access only via genomic interval
        ga[1]

    with pytest.raises(Exception):
        # access only via genomic interval and condition
        ga[1] = 1

    np.testing.assert_equal(ga[iv].shape, (20, 1, 1))
    np.testing.assert_equal(ga[iv], np.zeros((20, 1, 1)))

    ga[iv, 0] = np.ones((20, 1))
    np.testing.assert_equal(ga[iv], np.ones((20, 1, 1)))
    np.testing.assert_equal(ga[iv].sum(), 20)
    iv = GenomicInterval('chr10', 0, 300, '.')
    np.testing.assert_equal(ga[iv].sum(), 20)
Example #4
 def parse_cigar_into_features(self):
     """Parse CLIPz cigar string into dict of features of the read"""
     i = 0
     insertions = 0
     for feat in self.cigar_list:
         if self.is_string_integer(feat):
             i += int(feat)
         else:
             if feat.startswith('I'):
                 insertions += 1
             if self.strand == '+':
                 self.features[feat].append(
                     ReadFeature(feat,
                                 type_=features_types[feat[0]],
                                 interval=GenomicInterval(
                                     self.chrom,
                                     self.position + i - insertions,
                                     self.position + i + 1 - insertions,
                                     self.strand),
                                 beg_in_read=i + insertions))
             elif self.strand == "-":
                 self.features[mutations_mapping[feat]].append(
                     ReadFeature(
                         mutations_mapping[feat],
                         type_=features_types[feat[0]],
                         interval=GenomicInterval(
                             self.chrom, self.position + i - insertions,
                             self.position + i + 1 - insertions,
                             self.strand),
                         beg_in_read=self.length - i - 1 + insertions))
             else:
                 raise Exception("Strand must be + or -")
             i += 1
Example #5
 def __iter__(self) -> Iterator[Tuple[GenomicFeature, str]]:
     for line in TextFile.__iter__(self):
         if isinstance(line, bytes):
             line = line.decode()
         if line == "\n":
             continue
         if line.startswith('#'):
             if line.startswith("##"):
                 mo = re.compile(r"##\s*(\S+)\s+(\S*)").match(line)
                 if mo:
                     self.metadata[mo.group(1)] = mo.group(2)
             continue
         (seqname, source, feature, start, end, score, strand, frame,
          attributeStr) = line.split("\t", 8)
         (attr, name) = parse_GFF_attribute_string(attributeStr, True)
         if self.end_included:
             iv = GenomicInterval(seqname, int(start) - 1, int(end), strand)
         else:
             iv = GenomicInterval(seqname,
                                  int(start) - 1,
                                  int(end) - 1, strand)
         f = GenomicFeature(name, feature, iv)
         if score != ".":
             score = float(score)
         if frame != ".":
             frame = int(frame)
         f.source = source
         f.score = score
         f.frame = frame
         f.attr = attr
         yield (f, line)
Example #6
def test_bwga_instance_unstranded(tmpdir):
    iv = GenomicInterval('chr10', 100, 120, '.')
    ga = create_genomic_array({'chr10': 300},
                              stranded=False,
                              typecode='int8',
                              storage='ndarray',
                              cache=False)
    np.testing.assert_equal(ga[iv].shape, (20, 1, 1))
    np.testing.assert_equal(ga[iv], np.zeros((20, 1, 1)))

    ga[iv, 0] = 1
    np.testing.assert_equal(ga[iv], np.ones((20, 1, 1)))
    np.testing.assert_equal(ga[iv].sum(), 20)
    iv = GenomicInterval('chr10', 0, 300, '.')
    np.testing.assert_equal(ga[iv].sum(), 20)
Example #7
def _get_overrun(chrom, i, end, step_width, count_list, feature_len):
    """Return overrun of reads that fall in two bins"""
    help_c1 = filter(lambda x: x[0].start + feature_len > end and x[1] != 0,
                     list(count_list[GenomicInterval(chrom, i, end)].steps()))
    overrun = 0 if not help_c1 else sum(map(lambda x: x[1], help_c1))

    return overrun
Example #8
def loadBed(filename):
    """
    Parses bed file to `HTSeq.GenomicInterval` objects.

    :param filename: Filename.
    :type filename: str
    :returns: dict of featureName:`HTSeq.GenomicInterval` objects.
    :rtype: dict
    """
    from collections import OrderedDict
    from warnings import warn
    warn("Function is deprecated!")
    from HTSeq import GenomicInterval

    features = OrderedDict()
    with open(filename) as handle:
        for line in handle:
            fields = line.rstrip("\n").split("\t")
            feature = GenomicInterval(
                fields[0],  # chrom
                int(fields[1]),  # start
                int(fields[2]),  # end
                fields[5]  # strand
            )
            features[fields[4]] = feature  # store under its name
    return features
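A minimal usage sketch for loadBed as defined above; "regions.bed" is a hypothetical six-column BED file (chrom, start, end, name, score, strand):

features = loadBed("regions.bed")
for key, iv in features.items():
    # iv is an HTSeq.GenomicInterval; key is the value of the fifth BED column
    print(key, iv.chrom, iv.start, iv.end, iv.strand)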
Example #9
        def _bigwig_loader(garray, aggregate):
            print("load from bigwig")
            for i, sample_file in enumerate(bigwigfiles):
                bwfile = pyBigWig.open(sample_file)

                for chrom in gsize:

                    vals = np.zeros(
                        (get_chrom_length(gsize[chrom], resolution), ),
                        dtype=dtype)

                    locus = _str_to_iv(chrom, template_extension=0)
                    if len(locus) == 1:
                        locus = locus + (0, gsize[chrom])

                    # the locus may cover only part of the chromosome
                    for start in range(locus[1], locus[2], resolution):

                        if garray._full_genome_stored:
                            # be careful not to overshoot at the chromosome end
                            end = min(start + resolution, gsize[chrom])
                        else:
                            end = start + resolution

                        x = np.asarray(
                            bwfile.values(locus[0], int(start), int(end)))
                        if nan_to_num:
                            x = np.nan_to_num(x, copy=False)
                        vals[(start - locus[1]) // resolution] = aggregate(x)

                    garray[GenomicInterval(*locus), i] = vals
            return garray
Example #10
def read_transcripts(h5fn, stranded=True):
    h5file = h5py.File(h5fn, 'r')

    # extract data
    tids = h5file["tid"][:]
    coordlut = h5file["coordlut"][:]
    coordmap = h5file["coordmap"][:]
    strands = h5file["strand"][:]
    chroms = h5file["chrom"][:]

    a = GenomicArrayOfSets("auto", stranded=stranded)
    ts = dict()
    tidlengths = empty(len(tids), dtype="uint32")
    for itid, tid in enumerate(tids):
        nexons, coordmappos = coordlut[itid]
        is_plus = strands[itid]
        strand = "+" if is_plus else "-"
        chrom = str(chroms[itid])

        txcoords = coordmap[coordmappos:(coordmappos + nexons)]

        for start, end in txcoords:
            iv = GenomicInterval(chrom, start, end, strand)
            a[iv] += tid

        tc = TranscriptCoordinates(txcoords, strand)
        ts[tid] = tc
        tidlengths[itid] = tc.length

    tid2idx = {v: i for (i, v) in enumerate(tids)}

    return a, ts, tid2idx, tidlengths
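A hedged usage sketch for read_transcripts; "transcripts.h5" is a placeholder for an HDF5 file laid out with the datasets read above (tid, coordlut, coordmap, strand, chrom). Querying the returned GenomicArrayOfSets follows the usual HTSeq steps() idiom:

arr, transcripts, tid2idx, tidlengths = read_transcripts("transcripts.h5")
query = GenomicInterval("chr1", 5000, 6000, "+")
overlapping = set()
for iv, tid_set in arr[query].steps():
    # collect the ids of all transcripts whose exons overlap the query window
    overlapping |= tid_set
print(overlapping)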
Example #11
 def __iter__(self):
     for line in FileOrSequence.__iter__(self):
         if line.startswith("track"):
             continue
         chrom, start, end, score = line.rstrip().split("\t")
         iv = GenomicInterval(chrom, int(start), int(end), self.strand)
         yield iv, float(score)
Example #12
def test_bwga_instance_stranded(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    iv = GenomicInterval('chr10', 100, 120, '+')
    ga = create_genomic_array({'chr10': 300},
                              stranded=True,
                              typecode='int8',
                              storage='ndarray')
    np.testing.assert_equal(ga[iv].shape, (20, 2, 1))
    np.testing.assert_equal(ga[iv], np.zeros((20, 2, 1)))

    ga[iv, 0] = 1
    x = np.zeros((20, 2, 1))
    x[:, :1, :] = np.ones((20, 1, 1))
    np.testing.assert_equal(ga[iv], x)
    np.testing.assert_equal(ga[iv].sum(), 20)
    iv = GenomicInterval('chr10', 0, 300)
    np.testing.assert_equal(ga[iv].sum(), 20)
Example #13
 def _make_index(self, df, mapping):
     index = GenomicArrayOfSets(self._all_chroms,
                                stranded=self._is_stranded)
     for lab in df.index:
         if self._is_stranded:
             iv = GenomicInterval(chrom=df.loc[lab, mapping['chrom']],
                                  start=df.loc[lab, mapping['start']],
                                  end=df.loc[lab, mapping['end']],
                                  strand=df.loc[lab, mapping['strand']])
         else:
             iv = GenomicInterval(chrom=df.loc[lab, mapping['chrom']],
                                  start=df.loc[lab, mapping['start']],
                                  end=df.loc[lab, mapping['end']])
         # Record the DataFrame index value as a GenomicInterval value
         index[iv] += lab
      self.index = index
Example #14
def test_bwga_instance_stranded_notcached(tmpdir):

    iv = GenomicInterval('chr10', 100, 120, '+')
    ga = create_genomic_array({'chr10': 300},
                              stranded=True,
                              typecode='int8',
                              storage='ndarray',
                              cache=False)
    np.testing.assert_equal(ga[iv].shape, (20, 2, 1))
    np.testing.assert_equal(ga[iv], np.zeros((20, 2, 1)))

    x = np.zeros((20, 2, 1))
    x[:, :1, :] = 1
    ga[iv, 0] = x[:, :, 0]
    np.testing.assert_equal(ga[iv], x)
    np.testing.assert_equal(ga[iv].sum(), 20)
    iv = GenomicInterval('chr10', 0, 300)
    np.testing.assert_equal(ga[iv].sum(), 20)
Example #15
def _get_overrun(chrom, i, end, step_width, count_list, feature_len):
    """Return overrun of reads that fall in two bins"""
    help_c1 = [
        x for x in list(count_list[GenomicInterval(chrom, i, end)].steps())
        if x[0].start + feature_len > end and x[1] != 0
    ]
    overrun = 0 if not help_c1 else sum([x[1] for x in help_c1])

    return overrun
Example #16
    def __getitem__(self, index):
        if isinstance(index, int):
            start = self.offsets[index]
            val = self.rel_end[index]
            end = start + (val if val > 0 else 1)
            return GenomicInterval(self.chrs[index], start - self.flank,
                                   end + self.flank, self.strand[index])

        raise IndexError('Index support only for "int". Given {}'.format(
            type(index)))
Example #17
    def __init__(self, snor_id, organism, chrom, start, end, strand, sequence,
                 snor_type, **kwargs):
        """@todo: to be defined1.

        Args:
            snor_id (str): a unique id for snoRNA
            organism (str): species in which snoRNA can be found
            chrom (str): @todo
            start (int): @todo
            end (int): @todo
            strand (str): @todo
            sequence (str): @todo
            snor_type (str): @todo

        Kwargs:
            snor_name (str): an official name for snoRNA
            snor_family (str): snoRNA family
            snor_precise_type (str): precise snoRNA type, as opposed to the general snor_type
            alias (str): alternative name for snoRNA
            gene_name (str): snoRNA gene name
            accession (str): accession for the snoRNA (e.g. NCBI)
            modified_sites (str): comma-separated string of modified sites,
                                  e.g. 28S:U46,18S:G52. It will be transformed
                                  into a dictionary of the form
                                  {rna: [(position, nucleotide)]}
            host_gene (str): host gene
            host_id (str): id for host locus
            organization (str): organization of the locus
            note (str): additional information about snoRNA


        """
        #
        # args
        #
        self.snor_id = snor_id
        self.organism = organism
        self.position = GenomicInterval(chrom, start, end, strand)
        self.sequence = Seq(sequence.upper())
        self.snor_type = snor_type
        #
        # kwargs
        #
        self.snor_name = kwargs.get("snor_name", None)
        self.snor_family = kwargs.get("snor_family", None)
        self.snor_precise_type = kwargs.get("snor_precise_type", None)
        self.alias = kwargs.get("alias", None)
        self.gene_name = kwargs.get("gene_name", None)
        self.accession = kwargs.get("accession", None)
        self.__assign_modification_sites(kwargs.get("modified_sites", None))
        self.host_gene = kwargs.get("host_gene", None)
        self.host_id = kwargs.get("host_id", None)
        self.organization = kwargs.get("organization", None)
        self.note = kwargs.get("note", None)
        self.__validate()
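Based on the docstring above, a construction sketch could look as follows; the enclosing class is not shown in this example, so SnoRNA is a hypothetical name for it and all values are placeholders:

snor = SnoRNA(
    snor_id="SNORD-example",
    organism="Homo sapiens",
    chrom="chr3",
    start=100,
    end=175,
    strand="+",
    sequence="ACGTACGTACGT",           # placeholder sequence
    snor_type="CD",
    modified_sites="28S:U46,18S:G52",  # parsed into {rna: [(position, nucleotide)]}
)
print(snor.position)  # GenomicInterval spanning chr3:100-175 on the + strand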
Example #18
    def __getitem__(self, index):
        if isinstance(index, int):
            start = self.starts[index]
            end = self.ends[index]
            if end == start:
                end += 1
            return GenomicInterval(self.chrs[index], start - self.flank,
                                   end + self.flank, self.strand[index])

        raise IndexError('Index support only for "int". Given {}'.format(
            type(index)))
Example #19
def build_genome_array(bdgfile):
    #genome_array = GenomicArray(chroms,stranded=False,typecode="d")
    genome_array = GenomicArray("auto", stranded=False, typecode="d")
    with open(bdgfile) as fin:
        for line in fin:
            if line.startswith("#"):
                continue
            chrom, start, end, value = line.strip().split()
            iv = GenomicInterval(chrom, int(start), int(end), ".")
            genome_array[iv] += float(value)
    return genome_array
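A small usage sketch for build_genome_array; "signal.bedGraph" is a hypothetical four-column bedGraph file, and the query below uses HTSeq's steps() iterator over the resulting GenomicArray:

coverage = build_genome_array("signal.bedGraph")
window = GenomicInterval("chr1", 0, 1000, ".")
for iv, value in coverage[window].steps():
    # each step is a sub-interval with a constant accumulated value
    print(iv.start, iv.end, value)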
Example #20
        def _seq_loader(cover, seqs, order):
            print('Convert sequences to index array')
            for seq in seqs:
                if cover._full_genome_stored:
                    interval = GenomicInterval(seq.id, 0,
                                               len(seq) - order + 1, '.')
                else:
                    interval = GenomicInterval(
                        *_str_to_iv(seq.id, template_extension=0))

                indarray = np.asarray(seq2ind(seq), dtype=dtype)

                if order > 1:
                    # for higher order motifs, this part is used
                    filter_ = np.asarray([
                        pow(len(seq.seq.alphabet.letters), i)
                        for i in range(order)
                    ])
                    indarray = np.convolve(indarray, filter_, mode='valid')

                cover[interval, 0] = indarray
Example #21
    def __call__(self, garray):
        seqs = self.seqs
        order = self.order
        dtype = garray.typecode

        print('Convert sequences to index array')
        for seq in seqs:
            if garray._full_genome_stored:
                interval = GenomicInterval(seq.id, 0,
                                           len(seq) - order + 1, '.')
            else:
                interval = GenomicInterval(*_str_to_iv(seq.id,
                                                       template_extension=0))

            indarray = np.asarray(seq2ind(seq), dtype=dtype)

            if order > 1:
                # for higher order motifs, this part is used
                filter_ = np.asarray([pow(len(seq.seq.alphabet.letters),
                                          i) for i in range(order)])
                indarray = np.convolve(indarray, filter_, mode='valid')

            garray[interval, 0] = indarray.reshape(-1, 1)
Example #22
def test_logzscore_normalization(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    def loading(garray):
        garray[GenomicInterval('chr1', 0, 150),
               0] = np.repeat(10, 150).reshape(-1, 1)
        garray[GenomicInterval('chr2', 0, 300),
               0] = np.repeat(100, 300).reshape(-1, 1)
        return garray

    for store in ['ndarray', 'hdf5']:
        ga = create_genomic_array({
            'chr1': 150,
            'chr2': 300
        },
                                  stranded=False,
                                  typecode='float32',
                                  storage=store,
                                  cache=True,
                                  loader=loading,
                                  normalizer=get_normalizer('zscorelog'))
        np.testing.assert_allclose(ga.weighted_mean(),
                                   np.asarray([0.0]),
                                   rtol=1e-5,
                                   atol=1e-5)
        np.testing.assert_allclose(ga.weighted_sd(),
                                   np.asarray([1.]),
                                   rtol=1e-5,
                                   atol=1e-5)
        np.testing.assert_allclose(ga[GenomicInterval('chr1', 100, 101)],
                                   np.asarray([[[-1.412641340027806]]]),
                                   rtol=1e-5,
                                   atol=1e-5)
        np.testing.assert_allclose(ga[GenomicInterval('chr2', 100, 101)],
                                   np.asarray([[[0.706320670013903]]]),
                                   rtol=1e-5,
                                   atol=1e-5)
Example #23
    def __getitem__(self, idxs):
        if isinstance(idxs, tuple):
            if len(idxs) == 3 or len(idxs) == 4:
                # interpret idxs as genomic interval
                idxs = GenomicInterval(*idxs)
            else:
                raise ValueError('idxs cannot be interpreted as genomic interval.'
                                 ' use (chr, start, end) or (chr, start, end, strand)')

        if isinstance(idxs, int):
            idxs = [idxs]
        elif isinstance(idxs, slice):
            idxs = range(idxs.start if idxs.start else 0,
                         idxs.stop if idxs.stop else len(self),
                         idxs.step if idxs.step else 1)
        elif isinstance(idxs, GenomicInterval):
            if not self.garray._full_genome_stored:
                raise ValueError('Indexing with GenomicInterval only possible '
                                 'when the whole genome (or chromosome) was loaded')

            data = np.zeros((1, idxs.length  - self.garray.order + 1))
            data[0] = self._getsingleitem(idxs)
            # accept a genomic interval directly
            data = as_onehot(data,
                             self.garray.order,
                             self._alphabetsize)
            for transform in self.transformations:
                data = transform(data)
            if not self._channel_last:
                data = np.transpose(data, (0, 3, 1, 2))
            return data

        try:
            iter(idxs)
        except TypeError:
            raise IndexError('Bioseq.__getitem__: '
                             + 'index must be iterable')

        data = as_onehot(self.iseq4idx(idxs), self.garray.order,
                         self._alphabetsize)

        for transform in self.transformations:
            data = transform(data)

        if not self._channel_last:
            data = np.transpose(data, (0, 3, 1, 2))

        return data
Example #24
        def _bam_loader(garray, files):
            print("load from bam")
            for i, sample_file in enumerate(files):
                print('Counting from {}'.format(sample_file))
                aln_file = pysam.AlignmentFile(sample_file, 'rb')  # pylint: disable=no-member
                for chrom in gsize:

                    array = np.zeros(
                        (get_chrom_length(gsize[chrom], resolution), 2),
                        dtype=dtype)

                    locus = _str_to_iv(chrom,
                                       template_extension=template_extension)
                    if len(locus) == 1:
                        locus = (locus[0], 0, gsize[chrom])
                    # locus = (chr, start, end)
                    # or locus = (chr, )

                    for aln in aln_file.fetch(*locus):

                        if aln.is_unmapped:
                            continue

                        if aln.mapq < min_mapq:
                            continue

                        if aln.is_read2:
                            # only consider read1 so as not to double count
                            # fragments for paired end reads
                            # read2 will also be false for single end
                            # reads.
                            continue

                        if aln.is_paired:
                            # if paired end read, consider the midpoint
                            if not (aln.is_proper_pair and aln.reference_name
                                    == aln.next_reference_name):
                                # only consider paired end reads if both mates
                                # are properly mapped and they map to the
                                # same reference_name
                                continue
                            # if the next reference start >= 0,
                            # the read is considered as a paired end read
                            # in this case we consider the mid point
                            if pairedend == 'midpoint':
                                pos = min(aln.reference_start,
                                          aln.next_reference_start) + \
                                          abs(aln.template_length) // 2
                            else:
                                if aln.is_reverse:
                                    # last position of the downstream read
                                    pos = max(
                                        aln.reference_end,
                                        aln.next_reference_start +
                                        aln.query_length)
                                else:
                                    # first position of the upstream read
                                    pos = min(aln.reference_start,
                                              aln.next_reference_start)
                        else:
                            # here we consider single end reads
                            # whose 5 prime end is determined strand specifically
                            if aln.is_reverse:
                                pos = aln.reference_end
                            else:
                                pos = aln.reference_start

                        if not garray._full_genome_stored:
                            # if we get here, a region was given,
                            # otherwise, the entire chromosome is read.
                            pos -= locus[1] + template_extension

                            if pos < 0 or pos >= locus[2] - locus[1]:
                                # if the read 5 p end or mid point is outside
                                # of the region of interest, the read is discarded
                                continue

                        # divide by the resolution to get the bin index
                        pos //= resolution

                        # fill up the read strand specifically
                        if aln.is_reverse:
                            array[pos, 1] += 1
                        else:
                            array[pos, 0] += 1
                    # apply the aggregation
                    if aggregate is not None:
                        array = aggregate(array)

                    if stranded:
                        lp = locus + ('+', )
                        garray[GenomicInterval(*lp), i] = array[:, 0]
                        lm = locus + ('-', )
                        garray[GenomicInterval(*lm), i] = array[:, 1]
                    else:
                        # if unstranded, aggregate the reads from
                        # both strands
                        garray[GenomicInterval(*locus), i] = array.sum(axis=1)

            return garray
Example #25
    def __getitem__(self, idxs):
        if isinstance(idxs, tuple):
            idxs = GenomicInterval(*idxs)

        if isinstance(idxs, int):
            idxs = [idxs]
        elif isinstance(idxs, slice):
            idxs = range(idxs.start if idxs.start else 0,
                         idxs.stop if idxs.stop else len(self),
                         idxs.step if idxs.step else 1)
        elif isinstance(idxs, GenomicInterval):
            if self.garray._full_genome_stored:
                print(idxs)
                # accept a genomic interval directly
                #data = np.zeros((1,) + self.shape[1:])
                data = self._getsingleitem(idxs)
                data = data.reshape((1, ) + data.shape)
                for transform in self.transformations:
                    data = transform(data)

            else:
                chrom = idxs.chrom
                start = idxs.start
                end = idxs.end
                gindexer_new = self.gindexer.filter_by_region(include=chrom,
                                                              start=start,
                                                              end=end)
                data = np.zeros(
                    (1, ((end - start) // self.garray.resolution) +
                     (2 * (gindexer_new.stepsize) // self.garray.resolution)) +
                    self.shape[2:])
                if self.padding_value != 0:
                    data.fill(self.padding_value)
                step_size = gindexer_new.stepsize
                for interval in gindexer_new:
                    print('new gindexer interval:', interval)
                    tmp_data = np.array(self._getsingleitem(interval))
                    tmp_data = tmp_data.reshape((1, ) + tmp_data.shape)

                    if interval.strand == '-':
                        # invert the data so that is again relative
                        # to the positive strand,
                        # this avoids having to change the rel_pos computation
                        tmp_data = tmp_data[:, ::-1, ::-1, :]

                    rel_pos = (interval.start -
                               (start - step_size)) // self.garray.resolution

                    data[:, rel_pos:rel_pos +
                         (step_size //
                          self.garray.resolution), :, :] = tmp_data

                if interval.strand == '-':
                    # invert it back relative to minus strand
                    data = data[:, ::-1, ::-1, :]

                data = data[:, (1 * (step_size) // self.garray.resolution):-1 *
                            (1 * (step_size) // self.garray.resolution), :, :]

            if not self._channel_last:
                data = np.transpose(data, (0, 3, 1, 2))

            return data

        try:
            iter(idxs)
        except TypeError:
            raise IndexError('Cover.__getitem__: index must be iterable')

        data = np.zeros((len(idxs), ) + self.shape_static[1:])
        if self.padding_value != 0:
            data.fill(self.padding_value)

        for i, idx in enumerate(idxs):
            interval = self.gindexer[idx]

            data[i, :, :, :] = self._getsingleitem(interval)

        for transform in self.transformations:
            data = transform(data)

        if not self._channel_last:
            data = np.transpose(data, (0, 3, 1, 2))

        return data
Example #26
    chromosomes = list(chromosomes)
    hitMap = GenomicArray(chromosomes, stranded=True, typecode='i')

    for alignment in SAM_Reader(alignFile):

        if alignment.aligned:
            genomeRegion = alignment.iv

            if genomeRegion.strand == '+':
                hitMap[genomeRegion] = 1
            else:
                hitMap[genomeRegion] = -1

    chromo = chromosomes[0]
    endPoint = 2000000
    plusStrand = GenomicInterval(chromo, 0, endPoint, '+')
    minusStrand = GenomicInterval(chromo, 0, endPoint, '-')
    bothStrands = GenomicInterval(chromo, 0, endPoint, '.')

    pyplot.plot(list(hitMap[plusStrand]))
    pyplot.plot(list(hitMap[minusStrand]))
    pyplot.show()

    print('\n Using HTSeq to access GFF genome features\n')

    remoteFileName = '/Bacteria/Escherichia_coli_536_uid58531/NC_008253.gff'
    gffFile = 'examples/EcoliGenomeFeatures.gff'
    downloadFile(FTP_ROOT + remoteFileName, gffFile)

    fileObj = GFF_Reader(gffFile)
Example #27
 def loading(garray):
     garray[GenomicInterval('chr1:0-150', 0, 150),
            0] = np.repeat(10, 150).reshape(-1, 1)
     garray[GenomicInterval('chr2:0-300', 0, 300),
            0] = np.repeat(1, 300).reshape(-1, 1)
     return garray