Example #1
    def filter(self, line):
        """Process lines of `PSL`_ files input into |SegmentChain|, and group
        these by query sequence.
         
        Parameters
        ----------
        line : str
            line of `PSL`_ input
         
        Returns
        -------
        list
            list of |SegmentChain| objects sharing a query sequence 
        """
        ltmp = []
        aln = SegmentChain.from_psl(line)
        last_name = aln.attr["query_name"]
        try:
            while last_name == aln.attr["query_name"]:
                ltmp.append(aln)
                line = next(self.stream)
                aln = SegmentChain.from_psl(line)

            self.stream = itertools.chain([line], self.stream)
            return ltmp
        except StopIteration:
            # send final bundle
            return ltmp
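
The key move above is the pushback: by the time a new query name is seen, the first line of the next bundle has already been consumed, so `itertools.chain([line], self.stream)` stitches it back onto the front of the stream before returning. A minimal, plastid-free sketch of the same grouping pattern (all names here are hypothetical):

def bundles(stream, keyfunc):
    """Yield lists of consecutive items from `stream` that share a key."""
    stream = iter(stream)
    bundle = [next(stream)]
    for item in stream:
        if keyfunc(item) == keyfunc(bundle[0]):
            bundle.append(item)
        else:
            yield bundle
            bundle = [item]
    yield bundle  # final bundle, mirroring the StopIteration branch above

records = [("q1", 10), ("q1", 20), ("q2", 30)]
print(list(bundles(records, lambda rec: rec[0])))
# [[('q1', 10), ('q1', 20)], [('q2', 30)]]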
Example #2
def test_window_landmark():
    # test cases: spliced plus- and minus-strand IVCs (SegmentChains)
    flank_up = 50
    flank_down = 100
    my_segmentchains = [
        SegmentChain(GenomicSegment("chrA", 50, 350, "+"),
                     GenomicSegment("chrA", 500, 900, "+")),
        SegmentChain(GenomicSegment("chrA", 50, 350, "-"),
                     GenomicSegment("chrA", 500, 900, "-")),
    ]
    for my_segmentchain in my_segmentchains:
        for landmark in range(0, 700, 50):
            yield check_window_landmark, my_segmentchain, landmark, flank_up, flank_down
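
Yield-style tests like this follow the nose test-generator convention: the runner invokes each yielded tuple as `check_window_landmark(my_segmentchain, landmark, flank_up, flank_down)`, giving one independently reported test case per landmark. A rough sketch of how such a generator is consumed, assuming that convention:

def run_generated_tests(gen):
    for case in gen:
        check, args = case[0], case[1:]
        check(*args)  # first element is the check function, the rest are its arguments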
Example #3
    def __getitem__(self, roi, stranded=True):
        """Return list of features that overlap the region of interest (roi).
        
        Parameters
        ----------
        roi : |GenomicSegment| or |SegmentChain|
            Query feature indicating region of interest

        stranded : bool
            If `True`, retrieve only features on same strand as query feature.
            Otherwise, retrieve features on both strands
                             
                             
        Returns
        -------
        list
            Features that overlap `roi`

        Raises
        ------
        TypeError
            if `roi` is not a |GenomicSegment| or |SegmentChain|
        """
        if isinstance(roi, GenomicSegment):
            roi_seg = roi
            roi_chain = SegmentChain(roi)
        elif isinstance(roi, SegmentChain):
            roi_chain = roi
            roi_seg = roi.spanning_segment
        else:
            raise TypeError(
                "Query feature must be a GenomicSegment or SegmentChain")

        chrom = roi_seg.chrom
        feature_text = "\n".join(["\n".join(list(R.fetch(chrom,
                                                         X.start,
                                                         X.end))) \
                                                         for R in self.tabix_readers \
                                                         for X in roi_chain])

        features = (self._reader_class(cStringIO.StringIO(feature_text)))
        if stranded == True:
            features = [X for X in features if roi_chain.overlaps(X)]
        else:
            features = [
                X for X in features if roi_chain.unstranded_overlaps(X)
            ]
        return features
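
Note that square-bracket indexing can only ever pass `roi`: `reader[seg]` calls `__getitem__(seg)` with `stranded` left at its default, so the unstranded variant is reachable only by calling the method directly. Assuming `reader` is an instance of this class and `seg` is a GenomicSegment:

same_strand = reader[seg]                               # stranded=True by default
both_strands = reader.__getitem__(seg, stranded=False)  # keyword reachable only this way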
Example #4
def _quantify_tfam(orf_set, gnds):
    """Performs non-negative least squares regression to quantify all of the ORFs in a transcript family, using a simplified profile consisting of
    the same three numbers tiled across each ORF. All readlengths are treated identically. Regions around start and stop codons are masked in
    accordance with startmask and stopmask"""
    strand = orf_set['strand'].iat[0]
    chrom = orf_set['chrom'].iat[0]
    tids = orf_set['tid'].drop_duplicates().tolist()
    all_tfam_genpos = set()
    tid_genpos = {}
    tlens = {}
    for (i, tid) in enumerate(tids):
        currtrans = SegmentChain.from_bed(bedlinedict[tid])
        curr_pos_set = currtrans.get_position_set()
        tlens[tid] = len(curr_pos_set)
        tid_genpos[tid] = curr_pos_set
        all_tfam_genpos.update(curr_pos_set)
    all_tfam_genpos = np.array(sorted(all_tfam_genpos))
    if strand == '-':
        all_tfam_genpos = all_tfam_genpos[::-1]
    nnt = len(all_tfam_genpos)
    tid_indices = {tid: np.flatnonzero(np.in1d(all_tfam_genpos, list(curr_tid_genpos), assume_unique=True))
                   for (tid, curr_tid_genpos) in tid_genpos.iteritems()}
    orf_matrix = np.zeros((nnt, len(orf_set)))
    ignore_coords = []
    for (orf_num, (tid, tcoord, tstop, AAlen)) in enumerate(orf_set[['tid', 'tcoord', 'tstop', 'AAlen']].itertuples(False)):
        orf_matrix[tid_indices[tid][tcoord:tstop], orf_num] = np.tile(cdsprof, AAlen + 1)
        ignore_coords.append(tid_indices[tid][max(tcoord+startmask[0], 0):tcoord+startmask[1]])
        ignore_coords.append(tid_indices[tid][max(tstop+stopmask[0], 0):tstop+stopmask[1]])
    ignore_coords = np.unique(np.concatenate(ignore_coords))
    orf_matrix[ignore_coords, :] = 0  # mask out all positions within the mask region around starts and stops
    valid_orfs = np.array([(orf_matrix[:, i] > 0).any() and (orf_matrix.T[i, :] != orf_matrix.T[:i, :]).any(1).all() for i in xrange(len(orf_set))])
    # require at least one valid position, and if >1 ORFs are identical, only include one of them
    orf_matrix[:, ~valid_orfs] = 0  # completely ignore these ORFs
    valid_nts = (orf_matrix > 0).any(1)  # only bother checking nucleotides where there is a valid ORF
    orf_res = orf_set.copy()
    if valid_nts.any():
        orf_matrix = orf_matrix[valid_nts, :]
        valid_nt_segs = SegmentChain(*positionlist_to_segments(chrom, strand, list(all_tfam_genpos[valid_nts])))
        orf_res['nts_quantified'] = (orf_matrix > 0).sum(0)  # the number of nucleotides included in the quantification
        for colname, gnd in zip(colnames, gnds):
            orf_res[colname] = nnls(orf_matrix, valid_nt_segs.get_counts(gnd))[0]
            # gnd is a HashedReadBAMGenomeArray, but it still works with get_counts(), which will collapse all read lengths to a single array
        return orf_res
    else:
        orf_res['nts_quantified'] = 0
        for colname in colnames:
            orf_res[colname] = 0.
        return orf_res
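
The regression itself is non-negative least squares (`scipy.optimize.nnls`): each column of `orf_matrix` holds one ORF's tiled profile, and the solver finds non-negative per-ORF weights that best explain the observed counts. A toy, self-contained example with made-up numbers:

import numpy as np
from scipy.optimize import nnls

# two overlapping "ORFs" across six positions (columns = ORFs)
orf_matrix = np.array([[1., 0.],
                       [2., 0.],
                       [1., 1.],
                       [0., 2.],
                       [0., 1.],
                       [0., 1.]])
counts = np.array([3., 6., 7., 8., 4., 4.])  # hypothetical observed read counts
weights, residual = nnls(orf_matrix, counts)
print(weights)  # non-negative expression estimate per ORF, here ~[3, 4]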
Example #5
    def get_overlapping_features(self, roi, stranded=True):
        """Return list of features overlapping `roi`.
        
        Parameters
        ----------
        roi : |GenomicSegment| or |SegmentChain|
            Query feature indicating region of interest

        stranded : bool
            if `True`, retrieve only features on same strand as query feature.
            Otherwise, retrieve features on both strands
                             
                             
        Returns
        -------
        list
           Features overlapping `roi`

        Raises
        ------
        TypeError
            if `roi` is not a |GenomicSegment| or |SegmentChain|
        """
        nearby_features = self.get_nearby_features(roi, stranded=stranded)
        if isinstance(roi, GenomicSegment):
            roi = SegmentChain(roi)

        if stranded:
            fn = roi.overlaps
        else:
            fn = roi.unstranded_overlaps
        return [X for X in nearby_features if fn(X)]
Example #6
def do_count(args, alignment_parser):
    """Count the number and density covering each merged gene in an annotation made made using the `generate` subcommand).
    
    Parameters
    ----------
    args : :py:class:`argparse.Namespace`
        command-line arguments for ``count`` subprogram    
    """
    # we expect many zero-length SegmentChains, so silence those warnings for now
    warnings.filterwarnings(
        "ignore",
        ".*zero-length SegmentChain.*",
    )

    keys = ("exon", "utr5", "cds", "utr3")
    column_order = ["region"]
    gene_positions = read_pl_table(args.position_file)

    # read count files
    ga = alignment_parser.get_genome_array_from_args(args, printer=printer)
    total_counts = ga.sum()

    normconst = 1000.0 * 1e6 / total_counts

    printer.write("Dataset has %s counts in it." % total_counts)
    printer.write("Tallying genes ...")

    dtmp = {"region": []}
    for x in keys:
        for y in ("reads", "length", "rpkm"):
            label = "%s_%s" % (x, y)
            dtmp[label] = []
            column_order.append(label)

    for i, name in enumerate(gene_positions["region"]):
        dtmp["region"].append(name)
        if i % 500 == 0:
            printer.write("Processed %s genes ..." % i)

        for k in keys:
            ivc = SegmentChain.from_str(gene_positions[k][i])
            total = sum(ivc.get_counts(ga))
            length = ivc.length
            rpkm = (normconst * total / length) if length > 0 else numpy.nan
            dtmp["%s_reads" % k].append(total)
            dtmp["%s_length" % k].append(length)
            dtmp["%s_rpkm" % k].append(rpkm)

    fout = argsopener("%s.txt" % args.outbase, args, "w")
    dtmp = pd.DataFrame(dtmp)
    dtmp.to_csv(fout,
                sep="\t",
                header=True,
                index=False,
                columns=column_order,
                na_rep="nan",
                float_format="%.8f")

    fout.close()
    printer.write("Done.")
Example #7
    def test_on_SegmentChain_exclude(self):
        features = [
            SegmentChain(self.ivs[n], **self.attrs[n])
            for n in range(len(self.ivs))
        ]
        self.assertEqual(get_identical_attributes(features, exclude=["type"]),
                         self.common_attr)
Example #8
    def test_on_SegmentChain_no_exclude(self):
        features = [
            SegmentChain(self.ivs[n], **self.attrs[n])
            for n in range(len(self.ivs))
        ]
        common_plus_type = copy.deepcopy(self.common_attr)
        common_plus_type["type"] = "exon"
        self.assertEqual(get_identical_attributes(features), common_plus_type)
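
Both tests assume `get_identical_attributes` returns the key-value pairs shared by every feature, optionally skipping keys named in `exclude`. A plastid-free sketch of that contract on plain dicts:

def common_attributes(dicts, exclude=()):
    """Return the key/value pairs present and equal in every dict."""
    shared = dict(dicts[0])
    for d in dicts[1:]:
        shared = {k: v for (k, v) in shared.items() if k in d and d[k] == v}
    return {k: v for (k, v) in shared.items() if k not in exclude}

attrs = [{"gene_id": "g1", "type": "exon", "ID": "a"},
         {"gene_id": "g1", "type": "exon", "ID": "b"}]
print(common_attributes(attrs))                    # {'gene_id': 'g1', 'type': 'exon'}
print(common_attributes(attrs, exclude=["type"]))  # {'gene_id': 'g1'}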
Example #9
def _get_tid_info(tup):
    """For each transcript on this chromosome/strand, identifies every sub-sequence of the appropriate length (fpsize), converts it to an integer,
    identifies the number of reads mapping to that position, and outputs all of that information to a pandas HDF store."""
    (chrom, strand) = tup
    inbams = [pysam.Samfile(infile, 'rb') for infile in opts.bamfiles]
    gnd = BAMGenomeArray(inbams, mapping=FivePrimeMapFactory(psite))
    # map to roughly the center of each read so that identical sequences that cross different splice sites
    # (on different transcripts) still end up mapping to the same place
    gnd.add_filter('size', SizeFilterFactory(opts.minlen, opts.maxlen))

    tid_seq_info = []
    tid_summary = pd.DataFrame(
        {'chrom': chrom, 'strand': strand, 'n_psite': -1, 'n_reads': -1, 'peak_reads': -1, 'dropped': ''},
        index=pd.Index(bedlinedict[(chrom, strand)].keys(), name='tid'))
    for (tid, line) in bedlinedict[(chrom, strand)].iteritems():
        currtrans = SegmentChain.from_bed(line)
        curr_pos_list = currtrans.get_position_list()  # not in stranded order!
        if strand == '-':
            curr_pos_list = curr_pos_list[::-1]
        n_psite = len(curr_pos_list) + 1 - fpsize
        tid_summary.at[tid, 'n_psite'] = n_psite
        if n_psite > 0:
            curr_counts = np.array(currtrans.get_counts(gnd))[psite:n_psite + psite]
            sumcounts = curr_counts.sum()
            maxcounts = curr_counts.max()
            tid_summary.at[tid, 'n_reads'] = sumcounts
            tid_summary.at[tid, 'peak_reads'] = maxcounts
            if sumcounts >= opts.minreads:
                if maxcounts < sumcounts * opts.peakfrac:
                    numseq = np.array(list(currtrans.get_sequence(genome).upper().translate(str_dict)))
                    curr_seq = ''.join(numseq)
                    tid_seq_info.append(pd.DataFrame({'tid': tid,
                                                      'genpos': curr_pos_list[psite:n_psite + psite],
                                                      'seq': np.array([(int(curr_seq[i:i + fpsize], 4) if 'N' not in curr_seq[i:i + fpsize] else -1)
                                                                       for i in xrange(n_psite)], dtype=np.int64),
                                                      'reads': curr_counts}))
                else:
                    tid_summary.at[tid, 'dropped'] = 'peakfrac'
            else:
                tid_summary.at[tid, 'dropped'] = 'lowreads'
    if tid_seq_info:  # don't bother saving anything if there's nothing to save
        pd.concat(tid_seq_info, ignore_index=True).to_hdf(seq_info_hdf % (chrom, strand), 'tid_seq_info', format='t',
                                                          data_columns=True, complevel=1, complib='blosc')
    #    sp.call(['ptrepack', orig_store_name, seq_info_hdf%(chrom,strand)])  # repack for efficiency
    #    os.remove(orig_store_name)
    if opts.verbose > 1:
        with log_lock:
            logprint('%s (%s strand) complete' % (chrom, strand))

    for inbam in inbams:
        inbam.close()

    return tid_summary
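
The `int(curr_seq[i:i + fpsize], 4)` step packs each footprint-sized subsequence into one integer by reading it as a base-4 numeral; `str_dict`, defined elsewhere, presumably maps A/C/G/T to the digits 0-3. A self-contained sketch of the encoding (with Python 3's `str.maketrans` standing in for that module-level table):

import numpy as np

trans = str.maketrans("ACGT", "0123")  # stand-in for the module-level str_dict

def encode_kmers(seq, k):
    """Base-4-encode every k-mer; -1 wherever the k-mer contains an N."""
    s = seq.upper()
    digits = s.translate(trans)
    return np.array([int(digits[i:i + k], 4) if "N" not in s[i:i + k] else -1
                     for i in range(len(s) - k + 1)], dtype=np.int64)

print(encode_kmers("ACGTN", 2))  # [ 1  6 11 -1]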
Example #10
    def test_search_fields_multivalue(self):
        reader = BigBedReader(self.bb_indexed)
        found = list(
            reader.search("name", "should_have_no_match",
                          "should_also_have_no_match"))
        self.assertEqual([], found)
        found = list(reader.search("Name", "Sam-S-RE", "Sam-S-RK"))
        expected = [
            SegmentChain(GenomicSegment('2L', 106902, 107000, '+'),
                         GenomicSegment('2L', 107764, 107838, '+'),
                         GenomicSegment('2L', 108587, 108809, '+'),
                         GenomicSegment('2L', 110405, 110483, '+'),
                         GenomicSegment('2L', 110754, 110877, '+'),
                         GenomicSegment('2L', 111906, 112019, '+'),
                         GenomicSegment('2L', 112689, 113369, '+'),
                         GenomicSegment('2L', 113433, 114432, '+'),
                         Alias="'['M(2)21AB-RE', 'CG2674-RE']'",
                         ID='FBtr0089437',
                         Name='Sam-S-RE',
                         color='#000000',
                         gene_id='FBgn0005278',
                         score='0.0',
                         thickend='113542',
                         thickstart='108685',
                         type='exon'),
            SegmentChain(GenomicSegment('2L', 107760, 107838, '+'),
                         GenomicSegment('2L', 108587, 108809, '+'),
                         GenomicSegment('2L', 110405, 110483, '+'),
                         GenomicSegment('2L', 110754, 111337, '+'),
                         Alias='na',
                         ID='FBtr0308091',
                         Name='Sam-S-RK',
                         color='#000000',
                         gene_id='FBgn0005278',
                         score='0.0',
                         thickend='110900',
                         thickstart='108685',
                         type='exon'),
        ]
        self.assertEqual(expected, found)
Example #11
    def test_variable_stratified_mapping_plus(self):
        offsets = {26: 6, 27: 22, 28: 13, 29: 4, 30: 5}

        chains = {
            "fw": SegmentChain(GenomicSegment('chrII', 392959, 393180, '+'),
                               GenomicSegment('chrII', 393510, 394742, '+'),
                               GenomicSegment('chrII', 394860, 394901, '+'),
                               ID='YBR078W_mRNA'),
            "rc": SegmentChain(GenomicSegment('chrVIII', 189061, 189749, '-'),
                               GenomicSegment('chrVIII', 189850, 190017, '-'),
                               ID='YHR041C_mRNA')
        }
        expected = {
            "fw": numpy.loadtxt(resource_filename("plastid", "test/data/stratmap/strat_fw_vec.txt"),
                                delimiter="\t"),
            "rc": numpy.loadtxt(resource_filename("plastid", "test/data/stratmap/strat_rc_vec.txt"),
                                delimiter="\t"),
        }
        ga = BAMGenomeArray([resource_filename("plastid", "test/data/stratmap/strat.bam")])
        ga.set_mapping(StratifiedVariableFivePrimeMapFactory(offsets, 26, 30))
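
Here `offsets` maps read length to a P-site offset from the 5' end, and "stratified" means counts are kept in one row per read length instead of being summed. A toy sketch of that bookkeeping for a plus-strand region, with made-up reads:

import numpy as np

offsets = {26: 6, 27: 22, 28: 13, 29: 4, 30: 5}
min_len, max_len, region_len = 26, 30, 40
strat = np.zeros((max_len - min_len + 1, region_len))

reads = [(0, 26), (3, 28), (10, 30)]  # (5'-end position in region, read length)
for (start, rlen) in reads:
    strat[rlen - min_len, start + offsets[rlen]] += 1

print(strat.sum(1))  # one row per read length: [1. 0. 1. 0. 1.]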
Example #12
    def filter(self, line):
        """Parse a read alignment as |SegmentChain| from a line of `bowtie`_ output"""
        items = line.strip("\n").split("\t")
        read_name = items[0]
        strand = items[1]
        ref_seq = items[2]
        coord = int(items[3])
        attr = {
            'seq_as_aligned': items[4],
            'qualstr': items[5],
            'mismatch_str': items[7],
            'type': "alignment",
            'ID': read_name,
        }

        iv = GenomicSegment(ref_seq, coord, coord + len(attr['seq_as_aligned']), strand)
        feature = SegmentChain(iv, **attr)
        return feature
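
For reference, legacy `bowtie`_ output is tab-separated: read name, strand, reference name, 0-based leftmost offset, aligned sequence, quality string, a reserved column, and mismatch descriptors (hence `items[7]`, with `items[6]` skipped). Tracing a made-up line through the same arithmetic:

line = "read1\t+\tchrI\t100\tACGTACGT\tIIIIIIII\t0\t4:A>G"
items = line.strip("\n").split("\t")
start = int(items[3])
end = start + len(items[4])  # half-open end = start + read length
print(items[0], items[2], start, end, items[1])  # read1 chrI 100 108 +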
Example #13
def covered_by_repetitive(query_junc, minus_range, plus_range, cross_hash):
    """Determine whether one or both ends of a splice site overlap with
    a repetitive area of the genome.
    
    Parameters
    ----------
    query_junc : |SegmentChain|
         A two-exon fragment representing a query splice junction
    
    minus_range : int <= 0
        Maximum number of nucleotides splice junction could be moved 
        to the left without reducing sequence support for the junction
        see :py:func:`find_match_range`
        
    plus_range : int >= 0
        Maximum number of nucleotides splice junction could be moved 
        to the right without reducing sequence support for the junction
        see :py:func:`find_match_range`
    
    cross_hash : |GenomeHash|
        |GenomeHash| of 1-length features denoting repetitive regions of the genome
        
    
    Returns
    -------
    bool
        `True` if any of the genomic positions within `minus_range...plus_range`
        of the 5' or 3' splice sites of `query_junc` overlap a repetitive
        region of the genome as annotated by ``cross_hash``.
        Otherwise, `False`
    """
    chrom = query_junc.spanning_segment.chrom
    strand = query_junc.spanning_segment.strand
    qend = query_junc[0].end      # genomic end of the first exon: the 5' splice site
    qstart = query_junc[1].start  # genomic start of the second exon: the 3' splice site
    fiveprime_splice_area = GenomicSegment(chrom,
                                           qend + minus_range,
                                           qend + plus_range + 1,
                                           strand)
    threeprime_splice_area = GenomicSegment(chrom,
                                            qstart + minus_range,
                                            qstart + plus_range + 1,
                                            strand)
    support_region = SegmentChain(fiveprime_splice_area, threeprime_splice_area)
    return len(cross_hash.get_overlapping_features(support_region)) > 0
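
Both windows are half-open, so `site + minus_range` through `site + plus_range + 1` covers every position the junction could slide to, inclusive at both extremes. A worked example:

site, minus_range, plus_range = 100, -2, 3
window = list(range(site + minus_range, site + plus_range + 1))
print(window)  # [98, 99, 100, 101, 102, 103]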
Example #14
def roi_row_to_cds(row):
    """Helper function to extract coding portions from maximal spanning windows
    flanking CDS starts that are created by the |metagene| ``generate`` subprogram.
    
    Parameters
    ----------
    row : (int, Series)
        Row from a :class:`pandas.DataFrame` of an ROI file made by the |metagene|
        ``generate`` subprogram
        
    Returns
    -------
    |SegmentChain|
        Coding portion of maximal spanning window
    """
    chainstr, alignment_offset, zero_point = row[1][["region", "alignment_offset", "zero_point"]]
    chain = SegmentChain.from_str(chainstr)
    cds_start = zero_point - alignment_offset
    subchain = chain.get_subchain(cds_start, chain.length)
    return subchain
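
The subtraction works because `zero_point` locates the CDS start within an ideal, full-length window, while `alignment_offset` apparently records how many positions were truncated from the window's 5' end. With hypothetical numbers:

zero_point, alignment_offset = 50, 10  # CDS start at 50 in the ideal window; 10 nt trimmed
cds_start = zero_point - alignment_offset
print(cds_start)  # 40: where the CDS starts within this particular window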
Example #15
def _get_annotated_counts_by_chrom(chrom_to_do):
    """Accumulate counts from annotated CDSs into a metagene profile. Only the longest CDS in each transcript family will be included, and only if it
    meets the minimum number-of-reads requirement. Reads are normalized by gene, so every gene included contributes equally to the final metagene."""
    found_cds = pd.read_hdf(opts.orfstore, 'all_orfs', mode='r',
                            where="chrom == '%s' and orftype == 'annotated' and tstop > 0 and tcoord > %d and AAlen > %d"
                                  % (chrom_to_do, -startnt[0], min_AAlen),
                            columns=['orfname', 'tfam', 'tid', 'tcoord', 'tstop', 'AAlen']) \
        .sort_values('AAlen', ascending=False).drop_duplicates('tfam')  # use the longest annotated CDS in each transcript family
    num_cds_incl = 0  # number of CDSs included from this chromosome
    startprof = np.zeros((len(rdlens), startlen))
    cdsprof = np.zeros((len(rdlens), 3))
    stopprof = np.zeros((len(rdlens), stoplen))
    inbams = [pysam.Samfile(infile, 'rb') for infile in opts.bamfiles]
    gnd = HashedReadBAMGenomeArray(inbams, ReadKeyMapFactory(Pdict, read_length_nmis))

    for (tid, tcoord, tstop) in found_cds[['tid', 'tcoord', 'tstop']].itertuples(False):
        curr_trans = SegmentChain.from_bed(bedlinedict[tid])
        tlen = curr_trans.get_length()
        if tlen >= tstop + stopnt[1]:  # need to guarantee that the 3' UTR is sufficiently long
            curr_hashed_counts = get_hashed_counts(curr_trans, gnd)
            cdslen = tstop + stopnt[1] - tcoord - startnt[0]  # CDS length, plus the extra bases requested on either side
            curr_counts = np.zeros((len(rdlens), cdslen))
            for (i, rdlen) in enumerate(rdlens):
                for nmis in range(opts.max5mis+1):
                    curr_counts[i, :] += curr_hashed_counts[(rdlen, nmis)][tcoord+startnt[0]:tstop+stopnt[1]]
                    # curr_counts is limited to the CDS plus any extra requested nucleotides on either side
            if curr_counts.sum() >= opts.mincdsreads:
                curr_counts /= curr_counts.mean()  # normalize by mean of counts across all readlengths and positions within the CDS
                startprof += curr_counts[:, :startlen]
                cdsprof += curr_counts[:, startlen:cdslen-stoplen].reshape((len(rdlens), -1, 3)).mean(1)
                stopprof += curr_counts[:, cdslen-stoplen:cdslen]
                num_cds_incl += 1

    for inbam in inbams:
        inbam.close()

    return startprof, cdsprof, stopprof, num_cds_incl
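
The `curr_counts /= curr_counts.mean()` step is what makes this an average of profile shapes rather than magnitudes: each qualifying CDS is rescaled to mean 1 before being added, so a highly expressed gene cannot dominate the metagene. In miniature:

import numpy as np

profiles = [np.array([0., 4., 8.]),     # low-coverage gene
            np.array([10., 10., 40.])]  # high-coverage gene
metagene = np.zeros(3)
for p in profiles:
    metagene += p / p.mean()            # each gene contributes a mean-one profile
print(metagene)  # [0.5 1.5 4. ]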
Example #16
def revcomp_mask_chain(seg, k, offset=0):
    """Reverse-complement a single-interval mask, correcting for `offset`.
    
    Parameters
    ----------
    seg : |SegmentChain|
        Plus-strand mask, including `offset`

    k : int
        Length of k-mers

    offset : int, optional
        Offset from 5' end of read at which to map mask (Default: `0`)

    Returns
    -------
    |SegmentChain|
        Mask on minus strand corresponding to `seg`
    """
    # Algorithm note:
    #
    #     Let
    #         FW = plus-strand coordinate
    #         RC = minus-strand coordinate
    #
    #     Then
    #         RC = FW + k - 1 - offset
    #
    #     But we are given FW + offset, so:
    #
    #         RC + offset = (FW + offset) + k - 1 - offset
    #         RC = (FW + offset) + k - 1 - 2*offset
    span = seg.spanning_segment
    new_offset = k - 1 - 2 * offset
    ivminus = GenomicSegment(span.chrom, span.start + new_offset,
                             span.end + new_offset, "-")
    return SegmentChain(ivminus)
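
A quick numeric check of the note above: with `k = 30` and `offset = 3`, a plus-strand mask whose reported start is `FW + offset = 1000` maps to a minus-strand mask starting at `1000 + 30 - 1 - 2*3`:

k, offset = 30, 3
new_offset = k - 1 - 2 * offset  # 23
print(1000 + new_offset)         # 1023, the minus-strand mask start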
Example #17
def main(argv=sys.argv[1:]):
    """Command-line program
    
    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]` (the actual command-line arguments)
    """
    ap = AnnotationParser(input_choices=_ANNOTATION_INPUT_CHOICES)
    annotation_file_parser = ap.get_parser()
    
    bp = BaseParser()
    base_parser = bp.get_parser()
    
    parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     parents=[base_parser, annotation_file_parser])
    parser.add_argument("--export_tophat", default=False, action="store_true",
                        help="Export tophat `.juncs` file in addition to BED output")
    parser.add_argument("outbase", type=str, help="Basename for output files")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    transcripts = ap.get_transcripts_from_args(args, printer=printer, return_type=SegmentChain)

    with argsopener("%s.bed" % args.outbase, args, "w") as bed_out:
        if args.export_tophat:
            tophat_out = open("%s.juncs" % args.outbase, "w")
    
        printer.write("params: " +" ".join(argv))
        printer.write("Detecting & comparing junctions...")
        ex_pairs = {}
        
        c = 0
        u = 0
        for chain in transcripts:
            if len(chain) > 1: # if multi-exon
                chrom = chain.chrom
                strand = chain.strand
                ep = ex_pairs.setdefault((chrom, strand), [])

                for i in range(0, len(chain) - 1):

                    seg1 = chain[i]
                    seg2 = chain[i + 1]
                    if c % 1000 == 0 and c > 0:
                        printer.write("Processed %s junctions. Found %s unique..." % (c, u))
                    c += 1
                    key = (seg1.end, seg2.start)

                    if key not in ep:
                        ep.append(key)
                        u += 1
                        new_chain = SegmentChain(seg1, seg2)
                        bed_out.write(new_chain.as_bed())
                        if args.export_tophat:
                            my_junc = (chrom, seg1.end - 1, seg2.start, strand)
                            tophat_out.write("%s\t%s\t%s\t%s\n" % my_junc)
                        
                        del new_chain
                    
                    del seg1
                    del seg2
                    
            del chain
    
        printer.write("Processed %s total junctions. Found %s unique." % (c,u) )
    
        bed_out.close()
        if args.export_tophat:
            tophat_out.close()

    printer.write("Done.")
Example #18
def _identify_tfam_orfs((tfam, tids)):
    """Identify all of the possible ORFs within a family of transcripts. Relevant information such as genomic start and stop positions, amino acid
    length, and initiation codon will be collected for each ORF. Additionally, each ORF will be assigned a unique 'orfname', such that if it occurs
    on multiple transcripts, it can be recognized as the same ORF."""
    currtfam = SegmentChain.from_bed(tfambedlines[tfam])
    chrom = currtfam.chrom
    strand = currtfam.strand
    tfam_genpos = np.array(currtfam.get_position_list())
    if strand == '-':
        tfam_genpos = tfam_genpos[::-1]
    tmask = np.empty((len(tids), len(tfam_genpos)), dtype=np.bool)  # True if transcript covers that position, False if not
    tfam_orfs = []
    tidx_lookup = {}
    for tidx, tid in enumerate(tids):
        tidx_lookup[tid] = tidx
        curr_trans = Transcript.from_bed(bedlinedict[tid])
        tmask[tidx, :] = np.in1d(tfam_genpos, curr_trans.get_position_list(), assume_unique=True)
        trans_orfs = _find_all_orfs(curr_trans.get_sequence(genome).upper())
        if trans_orfs:
            (startpos, stoppos, codons) = zip(*trans_orfs)
            startpos = np.array(startpos, dtype='i4')
            stoppos = np.array(stoppos, dtype='i4')

            gcoords = np.array([curr_trans.get_genomic_coordinate(x)[1] for x in startpos], dtype='i4')

            stop_present = (stoppos > 0)
            gstops = np.zeros(len(trans_orfs), dtype='i4')
            gstops[stop_present] = \
                np.array([curr_trans.get_genomic_coordinate(x - 1)[1] for x in stoppos[stop_present]]) + (1 if strand == '+' else -1)
            # the decrementing/incrementing stuff preserves half-openness regardless of strand

            AAlens = np.zeros(len(trans_orfs), dtype='i4')
            AAlens[stop_present] = (stoppos[stop_present] - startpos[stop_present])/3 - 1
            tfam_orfs.append(pd.DataFrame.from_items([('tfam', tfam),
                                                      ('tid', tid),
                                                      ('tcoord', startpos),
                                                      ('tstop', stoppos),
                                                      ('chrom', chrom),
                                                      ('gcoord', gcoords),
                                                      ('gstop', gstops),
                                                      ('strand', strand),
                                                      ('codon', codons),
                                                      ('AAlen', AAlens),
                                                      ('orfname', '')]))
    if any(x is not None for x in tfam_orfs):
        orf_pos_dict = {}
        tfam_orfs = pd.concat(tfam_orfs, ignore_index=True)
        for ((gcoord, AAlen), gcoord_grp) in tfam_orfs.groupby(['gcoord', 'AAlen']):  # group by genomic start position and length
            if len(gcoord_grp) == 1:
                tfam_orfs.loc[gcoord_grp.index, 'orfname'] = _name_orf(tfam, gcoord, AAlen)
            else:
                orf_gcoords = np.vstack(np.flatnonzero(tmask[tidx_lookup[tid], :])[tcoord:tstop]
                                        for (tid, tcoord, tstop) in gcoord_grp[['tid', 'tcoord', 'tstop']].itertuples(False))
                if (orf_gcoords == orf_gcoords[0, :]).all():  # all of the grouped ORFs are identical, so should receive the same name
                    orfname = _name_orf(tfam, gcoord, AAlen)
                    tfam_orfs.loc[gcoord_grp.index, 'orfname'] = orfname
                    orf_pos_dict[orfname] = tfam_genpos[orf_gcoords[0, :]]
                else:
                    named_so_far = 0
                    unnamed = np.ones(len(gcoord_grp), dtype=np.bool)
                    basename = _name_orf(tfam, gcoord, AAlen)
                    while unnamed.any():
                        next_gcoords = orf_gcoords[unnamed, :][0, :]
                        identicals = (orf_gcoords == next_gcoords).all(1)
                        orfname = '%s_%d' % (basename, named_so_far)
                        tfam_orfs.loc[gcoord_grp.index[identicals], 'orfname'] = orfname
                        orf_pos_dict[orfname] = tfam_genpos[next_gcoords]
                        unnamed[identicals] = False
                        named_so_far += 1

        # Now that the ORFs have been found and named, figure out their orftype
        tfam_orfs['annot_start'] = False
        tfam_orfs['annot_stop'] = False  # start out assuming all are False; replace with True as needed
        tfam_orfs['orftype'] = 'new'
        tfam_orfs['untyped'] = tfam_orfs['tstop'] > 0
        tfam_orfs.loc[~tfam_orfs['untyped'], 'orftype'] = 'nonstop'  # no stop codon
        if tfam in tfams_with_annots:
            cds_info = []
            all_annot_pos = set()
            for (annot_fidx, (annot_tfam_lookup, annot_tid_lookup)) in enumerate(zip(annot_tfam_lookups, annot_tid_lookups)):
                if tfam in annot_tfam_lookup:
                    for (annot_tidx, annot_tid) in enumerate(annot_tfam_lookup[tfam]):
                        curr_trans = Transcript.from_bed(annot_tid_lookup[annot_tid])
                        if curr_trans.cds_start is not None and curr_trans.cds_end is not None:
                            curr_cds_pos_set = curr_trans.get_cds().get_position_set()
                            curr_len = len(curr_cds_pos_set)
                            if curr_len % 3 == 0:
                                curr_gcoord = curr_trans.get_genomic_coordinate(curr_trans.cds_start)[1]
                                curr_gstop = curr_trans.get_genomic_coordinate(curr_trans.cds_end - 1)[1] + (1 if strand == '+' else -1)
                                in_tfam = curr_cds_pos_set.issubset(tfam_genpos)
                                cds_info.append((curr_gcoord, curr_gstop, (curr_len-3)/3, in_tfam, annot_fidx, annot_tid, curr_cds_pos_set))
                                all_annot_pos.update(curr_cds_pos_set)
            if cds_info:  # empty means no annotated CDSs, or none whose length is a multiple of 3
                cds_info = pd.DataFrame(cds_info, columns=['gcoord', 'gstop', 'AAlen', 'in_tfam', 'annot_fidx', 'annot_tid', 'pos']) \
                    .groupby(['gcoord', 'gstop', 'AAlen', 'in_tfam'], as_index=False) \
                    .apply(lambda x: x if len(x) == 1 else x[[not any(pos == x['pos'].iat[j] for j in xrange(i))
                                                              for (i, pos) in enumerate(x['pos'])]]) \
                    .set_index(['annot_fidx', 'annot_tid'])
                # this operation organizes cds_info into a dataframe and effectively drops duplicates
                # pandas drop_duplicates() is incompatible with sets so have to do it this manual way
                # the combination of annot_fidx (the number of the file if more than one annotation file provided) and annot_tid should be a unique ID
                tfam_orfs['annot_start'] = tfam_orfs['gcoord'].isin(cds_info['gcoord'])
                tfam_orfs['annot_stop'] = tfam_orfs['gstop'].isin(cds_info['gstop'])

                def _get_orf_pos(orfname, tid=None, tcoord=None, tstop=None):
                    """Helper function that identifies the genomic coordinates of an ORF (in stranded order) and caches them by orfname"""
                    if orfname in orf_pos_dict:
                        return orf_pos_dict[orfname]
                    else:
                        if tid is None or tcoord is None or tstop is None:
                            (tid, tcoord, tstop) = tfam_orfs.loc[tfam_orfs['orfname'] == orfname, ['tid', 'tcoord', 'tstop']].iloc[0]
                        res = tfam_genpos[np.flatnonzero(tmask[tidx_lookup[tid], :])[tcoord:tstop]]
                        orf_pos_dict[orfname] = res
                        return res

                # ANNOTATED and XISO
                cds_info['found'] = False
                possible_annot = tfam_orfs.drop_duplicates('orfname').merge(cds_info[cds_info['in_tfam']].reset_index())
                # merges on gcoord, gstop, and len - need to reset_index to preserve annot_fidx and annot_tid
                for ((orfname, tid, tcoord, tstop), cds_grp) in possible_annot.groupby(['orfname', 'tid', 'tcoord', 'tstop']):
                    orf_pos = _get_orf_pos(orfname, tid, tcoord, tstop)
                    for (annot_fidx, annot_tid, cds_pos_set) in cds_grp[['annot_fidx', 'annot_tid', 'pos']].itertuples(False):
                        if cds_pos_set.issubset(orf_pos):
                            tfam_orfs.loc[tfam_orfs['orfname'] == orfname, ['orftype', 'untyped']] = ['annotated', False]
                            cds_info.loc[(annot_fidx, annot_tid), 'found'] = True
                            break
                    else:
                        tfam_orfs.loc[tfam_orfs['orfname'] == orfname, ['orftype', 'untyped']] = ['Xiso', False]
                        # matching start and stop but differing in between
                if tfam_orfs['untyped'].any():
                    tfam_orfs.loc[tfam_orfs['orfname'].isin(tfam_orfs[tfam_orfs['untyped']].merge(cds_info[['gcoord', 'gstop']])['orfname']),
                                  ['orftype', 'untyped']] = ['Xiso', False]
                    # matching start and stop, but must differ somewhere, otherwise would have been identified as annotated (Xiso => "exact isoform")

                # SISO
                tfam_orfs.loc[tfam_orfs['annot_start'] & tfam_orfs['annot_stop'] & tfam_orfs['untyped'], ['orftype', 'untyped']] = ['Siso', False]
                # start and stop each match at least one CDS, but not the same one (Siso => "spliced isoform")

                # CISO
                tfam_orfs.loc[tfam_orfs['annot_start'] & tfam_orfs['untyped'], ['orftype', 'untyped']] = ['Ciso', False]
                # start is annotated, but stop is not - so must be on a new transcript (Ciso => "C-terminal isoform")

                # TRUNCATION
                if tfam_orfs['untyped'].any():
                    found_matched_stop = tfam_orfs[tfam_orfs['untyped']].merge(tfam_orfs[tfam_orfs['orftype'] == 'annotated'],
                                                                               on=['tid', 'tstop'], suffixes=('', '_annot'))
                    tfam_orfs.loc[tfam_orfs['orfname'].isin(found_matched_stop.loc[found_matched_stop['tcoord'] > found_matched_stop['tcoord_annot'],
                                                                                   'orfname']), ['orftype', 'untyped']] = ['truncation', False]
                # on the same transcript with an annotated CDS, with matching stop codon, initiating downstream - must be a truncation
                # still some missing truncations, if the original CDS was not on a transcript in the present transcriptome
                if tfam_orfs['untyped'].any() and not cds_info['found'].all():
                    possible_truncs = tfam_orfs[tfam_orfs['untyped']].drop_duplicates('orfname') \
                        .merge(cds_info.loc[~cds_info['found'], ['gstop', 'pos', 'AAlen']], on='gstop', suffixes=('', '_annot'))
                    possible_truncs = possible_truncs[possible_truncs['AAlen'] < possible_truncs['AAlen_annot']]
                    for ((orfname, tid, tcoord, tstop, gcoord), cds_pos_sets) in \
                            possible_truncs.groupby(['orfname', 'tid', 'tcoord', 'tstop', 'gcoord'])['pos']:
                        orf_pos = _get_orf_pos(orfname, tid, tcoord, tstop)
                        if strand == '-':
                            if any(cds_pos_set.issuperset(orf_pos) and
                                   all(pos in orf_pos for pos in cds_pos_set if pos <= gcoord) for cds_pos_set in cds_pos_sets):
                                tfam_orfs.loc[tfam_orfs['orfname'] == orfname, ['orftype', 'untyped']] = ['truncation', False]
                        else:
                            if any(cds_pos_set.issuperset(orf_pos) and
                                   all(pos in orf_pos for pos in cds_pos_set if pos >= gcoord) for cds_pos_set in cds_pos_sets):
                                tfam_orfs.loc[tfam_orfs['orfname'] == orfname, ['orftype', 'untyped']] = ['truncation', False]
                        # matching stop codon, contained within, and all positions in the annotation past the orf start codon are included in the orf

                # EXTENSION
                if tfam_orfs['untyped'].any():
                    found_matched_stop = tfam_orfs[tfam_orfs['untyped']].merge(tfam_orfs[tfam_orfs['orftype'] == 'annotated'],
                                                                               on=['tid', 'tstop'], suffixes=('', '_annot'))
                    assert (found_matched_stop['tcoord'] < found_matched_stop['tcoord_annot']).all()  # other possibilities should be done by now
                    tfam_orfs.loc[tfam_orfs['orfname'].isin(found_matched_stop['orfname']), ['orftype', 'untyped']] = ['extension', False]
                # on the same transcript with an annotated CDS, with matching stop codon, initiating upstream - must be an extension
                # no possibility for an "unfound" extension - if the extension is in the transcriptome, the CDS it comes from must be as well
                # (except for a few edge cases e.g. annotated CDS is a CUG initiator, but not considering CUG ORFs)

                # NISO
                tfam_orfs.loc[tfam_orfs['annot_stop'] & (tfam_orfs['untyped']), ['orftype', 'untyped']] = ['Niso', False]
                # stop is annotated, but start is not, and it's not a truncation or extension - so must be an isoform (Niso => "N-terminal isoform")

                # NCISO
                if tfam_orfs['untyped'].any():
                    orf_codons = []
                    for (orfname, tid, tcoord, tstop) in \
                            tfam_orfs.loc[tfam_orfs['untyped'], ['orfname', 'tid', 'tcoord', 'tstop']].drop_duplicates('orfname').itertuples(False):
                        orf_codons.append(pd.DataFrame(_get_orf_pos(orfname, tid, tcoord, tstop).reshape((-1, 3))))
                        orf_codons[-1]['orfname'] = orfname
                    orf_codons = pd.concat(orf_codons, ignore_index=True)
                    if strand == '-':
                        annot_codons = pd.DataFrame(np.vstack([np.reshape(sorted(cds_pos_set, reverse=True), (-1, 3))
                                                               for cds_pos_set in cds_info['pos'] if len(cds_pos_set) % 3 == 0])).drop_duplicates()
                    else:
                        annot_codons = pd.DataFrame(np.vstack([np.reshape(sorted(cds_pos_set, reverse=False), (-1, 3))
                                                               for cds_pos_set in cds_info['pos'] if len(cds_pos_set) % 3 == 0])).drop_duplicates()
                    tfam_orfs.loc[tfam_orfs['orfname'].isin(orf_codons.merge(annot_codons)['orfname']), ['orftype', 'untyped']] = ['NCiso', False]
                    # ORFs that have at least one full codon overlapping (in-frame) with a CDS are isoforms (NCiso => "N- and C-terminal isoform")
                    # Note that these must already differ at N- and C- termini, otherwise they would already have been classified

                # INTERNAL
                if tfam_orfs['untyped'].any():
                    sametrans = tfam_orfs[tfam_orfs['untyped']].merge(tfam_orfs[tfam_orfs['orftype'] == 'annotated'],
                                                                      on='tid', suffixes=('', '_annot'))
                    sametrans_internal = (sametrans['tcoord'] > sametrans['tcoord_annot']) & (sametrans['tstop'] < sametrans['tstop_annot'])
                    tfam_orfs.loc[tfam_orfs['orfname'].isin(sametrans.loc[sametrans_internal, 'orfname']),
                                  ['orftype', 'untyped']] = ['internal', False]
                # ORFs completely contained within a CDS on the same transcript, and not containing any full codon overlaps, must be internal
                # Still could be other ORFs internal to a CDS on a transcript not in the current transcriptome - need to check manually

                if tfam_orfs['untyped'].any() and not cds_info['found'].all():
                    for (orfname, gcoord, gstop) in \
                            tfam_orfs.loc[tfam_orfs['untyped'], ['orfname', 'gcoord', 'gstop']].drop_duplicates('orfname').itertuples(False):
                        orf_pos = _get_orf_pos(orfname)  # should be cached by now
                        if strand == '-':
                            if any(cds_pos_set.issuperset(orf_pos) and all(pos in orf_pos for pos in cds_pos_set if gcoord >= pos > gstop)
                                   for cds_pos_set in cds_info.loc[(~cds_info['found'])
                                                                   & (cds_info['gcoord'] > gcoord)
                                                                   & (cds_info['gstop'] < gstop), 'pos']):
                                tfam_orfs.loc[tfam_orfs['orfname'] == orfname, ['orftype', 'untyped']] = ['internal', False]
                        else:
                            if any(cds_pos_set.issuperset(orf_pos) and all(pos in orf_pos for pos in cds_pos_set if gcoord <= pos < gstop)
                                   for cds_pos_set in cds_info.loc[(~cds_info['found'])
                                                                   & (cds_info['gcoord'] < gcoord)
                                                                   & (cds_info['gstop'] > gstop), 'pos']):
                                tfam_orfs.loc[tfam_orfs['orfname'] == orfname, ['orftype', 'untyped']] = ['internal', False]

                # STOP_OVERLAP
                if tfam_orfs['untyped'].any():
                    sametrans = tfam_orfs[tfam_orfs['untyped']].merge(tfam_orfs[tfam_orfs['orftype'] == 'annotated'],
                                                                      on='tid', suffixes=('', '_annot'))
                    sametrans_stopover = (sametrans['tcoord'] > sametrans['tcoord_annot']) & (sametrans['tcoord'] < sametrans['tstop_annot'])
                    tfam_orfs.loc[tfam_orfs['orfname'].isin(sametrans.loc[sametrans_stopover, 'orfname']),
                                  ['orftype', 'untyped']] = ['stop_overlap', False]
                    # starts within a CDS and not an internal - must be a stop_overlap
                    # do not need to check for unfounds - requiring that stop_overlap must be on same transcript as cds

                # START_OVERLAP
                if tfam_orfs['untyped'].any():
                    sametrans = tfam_orfs[tfam_orfs['untyped']].merge(tfam_orfs[tfam_orfs['orftype'] == 'annotated'],
                                                                      on='tid', suffixes=('', '_annot'))
                    sametrans_startover = (sametrans['tstop'] > sametrans['tcoord_annot']) & (sametrans['tstop'] < sametrans['tstop_annot'])
                    tfam_orfs.loc[tfam_orfs['orfname'].isin(sametrans.loc[sametrans_startover, 'orfname']),
                                  ['orftype', 'untyped']] = ['start_overlap', False]
                    # ends within a CDS and not an internal - must be a start_overlap
                    # do not need to check for unfounds - requiring that start_overlap must be on same transcript as cds

                # LOOF
                if tfam_orfs['untyped'].any():
                    sametrans = tfam_orfs[tfam_orfs['untyped']].merge(tfam_orfs[tfam_orfs['orftype'] == 'annotated'],
                                                                      on='tid', suffixes=('', '_annot'))
                    sametrans_loof = (sametrans['tcoord'] < sametrans['tcoord_annot']) & (sametrans['tstop'] > sametrans['tstop_annot'])
                    tfam_orfs.loc[tfam_orfs['orfname'].isin(sametrans.loc[sametrans_loof, 'orfname']), ['orftype', 'untyped']] = ['LOOF', False]
                    # starts upstream of a CDS and ends downstream of it - must be a LOOF (long out-of-frame)
                    # don't need to check for unfounds because the CDS must be on the same transcript as the ORF if the ORF completely contains it

                # UPSTREAM
                if tfam_orfs['untyped'].any():
                    sametrans = tfam_orfs[tfam_orfs['untyped']].merge(tfam_orfs[tfam_orfs['orftype'] == 'annotated'],
                                                                      on='tid', suffixes=('', '_annot'))
                    sametrans_upstream = (sametrans['tstop'] <= sametrans['tcoord_annot'])
                    tfam_orfs.loc[tfam_orfs['orfname'].isin(sametrans.loc[sametrans_upstream, 'orfname']),
                                  ['orftype', 'untyped']] = ['upstream', False]
                    # ends upstream of a CDS - must be an upstream (uORF)
                    # cannot check manually for unfounds because those are not on well-defined transcripts

                # DOWNSTREAM
                if tfam_orfs['untyped'].any():
                    sametrans = tfam_orfs[tfam_orfs['untyped']].merge(tfam_orfs[tfam_orfs['orftype'] == 'annotated'],
                                                                      on='tid', suffixes=('', '_annot'))
                    sametrans_downstream = (sametrans['tstop_annot'] <= sametrans['tcoord'])
                    tfam_orfs.loc[tfam_orfs['orfname'].isin(sametrans.loc[sametrans_downstream, 'orfname']),
                                  ['orftype', 'untyped']] = ['downstream', False]
                    # starts downstream of a CDS - must be a downstream ORF (dORF)
                    # cannot check manually for unfounds because those are not on well-defined transcripts

                # NEW_ISO and GISO
                for orfname in tfam_orfs.loc[tfam_orfs['untyped'], 'orfname'].drop_duplicates():
                    if all_annot_pos.isdisjoint(_get_orf_pos(orfname)):
                        tfam_orfs.loc[tfam_orfs['orfname'] == orfname, ['orftype', 'untyped']] = ['new_iso', False]
                        # no overlaps whatsoever with any annotated CDS, but in a tfam that has annotations: new_iso
                    else:
                        tfam_orfs.loc[tfam_orfs['orfname'] == orfname, ['orftype', 'untyped']] = ['Giso', False]
                        # overlaps out-of-frame with a CDS, and not on the same transcript with a CDS: Giso => "genomic isoform"

                assert not tfam_orfs['untyped'].any()
        return tfam_orfs.drop('untyped', axis=1)
    else:
        return None
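
`_find_all_orfs` is defined elsewhere; judging from how its output is used (`tstop > 0` marks ORFs with a stop codon, and lengths come out as `(tstop - tcoord)/3 - 1`), it appears to return (start, stop, codon) tuples in transcript coordinates, with a half-open `stop` that includes the stop codon. A minimal sketch under those assumptions, restricted to ATG starts:

import re

def find_all_orfs(seq):
    """Return (start, stop, codon) per ATG; stop == 0 if no in-frame stop."""
    results = []
    for m in re.finditer("ATG", seq):
        start, stop = m.start(), 0
        for i in range(start, len(seq) - 2, 3):
            if seq[i:i + 3] in ("TAA", "TAG", "TGA"):
                stop = i + 3  # half-open: includes the stop codon
                break
        results.append((start, stop, "ATG"))
    return results

print(find_all_orfs("ATGAAATGA"))  # [(0, 9, 'ATG'), (5, 0, 'ATG')]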
Ejemplo n.º 23
0
def _get_tid_info(tup):
    """For each transcript on this chromosome/strand, identifies every sub-sequence of the appropriate length (fpsize), converts it to an integer,
    identifies the number of reads mapping to that position, and outputs all of that information to a pandas HDF store."""
    (chrom, strand) = tup
    inbams = [pysam.Samfile(infile, 'rb') for infile in opts.bamfiles]
    gnd = BAMGenomeArray(inbams, mapping=FivePrimeMapFactory(psite))
    # map to roughly the center of each read so that identical sequences that cross different splice sites
    # (on different transcripts) still end up mapping to the same place
    gnd.add_filter('size', SizeFilterFactory(opts.minlen, opts.maxlen))

    tid_seq_info = []
    tid_summary = pd.DataFrame(
        {
            'chrom': chrom,
            'strand': strand,
            'n_psite': -1,
            'n_reads': -1,
            'peak_reads': -1,
            'dropped': ''
        },
        index=pd.Index(bedlinedict[(chrom, strand)].keys(), name='tid'))
    for (tid, line) in bedlinedict[(chrom, strand)].iteritems():
        currtrans = SegmentChain.from_bed(line)
        curr_pos_list = currtrans.get_position_list()  # not in stranded order!
        if strand == '-':
            curr_pos_list = curr_pos_list[::-1]
        n_psite = len(curr_pos_list) + 1 - fpsize
        tid_summary.at[tid, 'n_psite'] = n_psite
        if n_psite > 0:
            curr_counts = np.array(currtrans.get_counts(gnd))[psite:n_psite +
                                                              psite]
            #                if((curr_counts>0).any()):
            sumcounts = curr_counts.sum()
            maxcounts = curr_counts.max()
            tid_summary.at[tid, 'n_reads'] = sumcounts
            tid_summary.at[tid, 'peak_reads'] = maxcounts
            if sumcounts >= opts.minreads:
                if maxcounts < sumcounts * opts.peakfrac:
                    numseq = np.array(
                        list(
                            currtrans.get_sequence(genome).upper().translate(
                                str_dict)))
                    curr_seq = ''.join(numseq)
                    tid_seq_info.append(
                        pd.DataFrame({
                            'tid': tid,
                            'genpos': curr_pos_list[psite:n_psite + psite],
                            'seq': np.array(
                                [(int(curr_seq[i:i + fpsize], 4)
                                  if 'N' not in curr_seq[i:i + fpsize] else -1)
                                 for i in xrange(n_psite)],
                                dtype=np.int64),
                            'reads': curr_counts
                        }))
                else:
                    tid_summary.at[tid, 'dropped'] = 'peakfrac'
            else:
                tid_summary.at[tid, 'dropped'] = 'lowreads'
    if tid_seq_info:  # don't bother saving anything if there's nothing to save
        pd.concat(tid_seq_info,
                  ignore_index=True).to_hdf(seq_info_hdf % (chrom, strand),
                                            'tid_seq_info',
                                            format='t',
                                            data_columns=True,
                                            complevel=1,
                                            complib='blosc')
    #    sp.call(['ptrepack', orig_store_name, seq_info_hdf%(chrom,strand)])  # repack for efficiency
    #    os.remove(orig_store_name)
    if opts.verbose > 1:
        with log_lock:
            logprint('%s (%s strand) complete' % (chrom, strand))

    for inbam in inbams:
        inbam.close()

    return tid_summary
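
The int(curr_seq[i:i + fpsize], 4) trick above works because str_dict (defined elsewhere in the script) translates A/C/G/T into the digits 0-3, so every footprint-sized subsequence becomes a unique base-4 integer. A Python 3 sketch of the same encoding, with an assumed translation table standing in for str_dict:

# Assumed translation table: A,C,G,T -> '0','1','2','3' (mirrors the script's str_dict).
str_dict = str.maketrans('ACGT', '0123')

def encode_kmers(seq, fpsize):
    """Return one base-4 integer per fpsize-length window; -1 where the window contains N."""
    digits = seq.upper().translate(str_dict)
    return [int(digits[i:i + fpsize], 4) if 'N' not in seq[i:i + fpsize] else -1
            for i in range(len(seq) - fpsize + 1)]

print(encode_kmers('ACGTN', 2))  # [1, 6, 11, -1]: 'AC'->0*4+1, 'CG'->1*4+2, 'GT'->2*4+3
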
Ejemplo n.º 24
0
def do_count(roi_table,ga,norm_start,norm_end,min_counts,min_len,max_len,aggregate=False,printer=NullWriter()):
    """Calculate a :term:`metagene profile` for each read length in the dataset
    
    Parameters
    ----------
    roi_table : :class:`pandas.DataFrame`
        Table specifying regions of interest, generated
        by :py:func:`plastid.bin.metagene.do_generate`
    
    ga : |BAMGenomeArray|
        Count data
    
    norm_start : int
        Coordinate in window specifying normalization region start
    
    norm_end : int
        Coordinate in window specifying normalization region end
    
    min_counts : float
        Minimum number of counts in `window[norm_start:norm_end]`
        required for inclusion in metagene profile

    min_len : int
        Minimum read length to include
    
    max_len : int
        Maximum read length to include

    aggregate : bool, optional
        Estimate P-site from aggregate reads at each position, instead of median
        normalized read density. Potentially noisier, but helpful for lower-count
        data or read lengths with few counts. (Default: False)
                             
    printer : file-like, optional
        filehandle to write logging info to (Default: :func:`~plastid.util.io.openers.NullWriter`)
        
               
    Returns
    -------
    dict
        Dictionary of :class:`numpy.ndarray` s of raw counts at each position (column)
        for each window (row)
    
    dict
        Dictionary of :class:`numpy.ndarray` s of normalized counts at each position (column)
        for each window (row), normalized by the total number of counts in that row
        from `norm_start` to `norm_end`
    
    :class:`pandas.DataFrame`
        Metagene profile of median normalized counts at each position across
        all windows, and the number of windows included in the calculation of each
        median, stratified by read length
    """
    window_size    = roi_table["window_size"][0]
    upstream_flank = roi_table["zero_point"][0]
    
    raw_count_dict  = OrderedDict()
    norm_count_dict = OrderedDict()
    shape = (len(roi_table),window_size)
    for i in range(min_len,max_len+1):
        # mask all by default
        raw_count_dict[i] = numpy.ma.MaskedArray(numpy.tile(numpy.nan,shape),
                                                 mask=numpy.tile(True,shape),
                                                 dtype=float)
    
    for i,row in roi_table.iterrows():
        if i % 1000 == 0:
            printer.write("Counted %s ROIs ..." % (i+1))
            
        roi    = SegmentChain.from_str(row["region"])
        mask   = SegmentChain.from_str(row["masked"])
        roi.add_masks(*mask)
        valid_mask = roi.get_masked_counts(ga).mask
        
        offset = int(round((row["alignment_offset"])))
        assert offset + roi.length <= window_size
        
        count_vectors = {}
        for k in raw_count_dict:
            count_vectors[k] = []

        for seg in roi:
            reads = ga.get_reads(seg)
            read_dict = {}
            for k in raw_count_dict:
                read_dict[k] = []

            for read in filter(lambda x: len(x.positions) in read_dict,reads):
                read_dict[len(read.positions)].append(read)
            
            for k in read_dict:
                count_vector = ga.map_fn(read_dict[k],seg)[1]
                count_vectors[k].extend(count_vector)
                
        for k in raw_count_dict:
            if roi.strand == "-":
                count_vectors[k] = count_vectors[k][::-1]

            raw_count_dict[k].data[i,offset:offset+roi.length] = numpy.array(count_vectors[k])
            raw_count_dict[k].mask[i,offset:offset+roi.length] = valid_mask
    
    profile_table = { "x" : numpy.arange(-upstream_flank,window_size-upstream_flank) }
    
    printer.write("Counted %s ROIs total." % (i+1))
    for k in raw_count_dict:
        k_raw = raw_count_dict[k]
        
        denominator = numpy.nansum(k_raw[:,norm_start:norm_end],axis=1)
        norm_count_dict[k] = (k_raw.T.astype(float) / denominator).T
        
        # copy mask from raw counts, then add nans and infs
        norm_counts = numpy.ma.MaskedArray(norm_count_dict[k],
                                           mask=k_raw.mask)
        norm_counts.mask[numpy.isnan(norm_counts)] = True
        norm_counts.mask[numpy.isinf(norm_counts)] = True
        
        with warnings.catch_warnings():
            # ignore numpy mean of empty slice warning, given by numpy in Python 2.7-3.4
            warnings.filterwarnings("ignore",".*mean of empty.*",RuntimeWarning)
            try:
                if not aggregate:
                    profile = numpy.ma.median(norm_counts[denominator >= min_counts],axis=0)
                else:
                    profile = numpy.nansum(k_raw[denominator >= min_counts],axis=0)
                    
            # in numpy under Python3.5, this is an IndexError instead of a warning
            except IndexError:
                profile = numpy.zeros_like(profile_table["x"],dtype=float)
            # in new versions of numpy, this is a ValueError instead of an IndexError
            except ValueError:
                profile = numpy.zeros_like(profile_table["x"],dtype=float)

        num_genes = ((~norm_counts.mask)[denominator >= min_counts]).sum(0)

        profile_table["%s-mers" % k]            = profile
        profile_table["%s_regions_counted" % k] = num_genes
        
    profile_table = pd.DataFrame(profile_table)
    
    return raw_count_dict, norm_count_dict, profile_table
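
A hedged usage sketch for do_count: the file names are hypothetical, the ROI table is assumed to come from the metagene generate step, and the keyword values are arbitrary; only the plastid classes already used in these examples are assumed.

import pysam
import pandas as pd
from plastid import BAMGenomeArray, FivePrimeMapFactory

# Hypothetical inputs: "my_gene_rois.txt" from the metagene generate step,
# and one ribosome-profiling BAM file.
roi_table = pd.read_table("my_gene_rois.txt", sep="\t", comment="#")
ga = BAMGenomeArray([pysam.Samfile("ribo.bam", "rb")],
                    mapping=FivePrimeMapFactory())

raw, norm, profile = do_count(roi_table, ga,
                              norm_start=70, norm_end=150,
                              min_counts=10, min_len=25, max_len=35)
profile.to_csv("metagene_profile.txt", sep="\t", index=False)
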
Ejemplo n.º 25
0
known_juncs = {
                 "YNL130C"   : ["YNL130C:0-53^145-180(-)",],
                 "YPL249C-A" : ["YPL249C-A:0-53^291-334(-)",],
                 
                'YBR215W_mRNA_0'  : ['YBR215W_mRNA_0:0-108^192-2175(+)'],
                'YHL001W_mRNA_0'  : ['YHL001W_mRNA_0:0-146^544-961(+)'],
                'YIL018W_mRNA_0'  : ['YIL018W_mRNA_0:0-30^430-1280(+)'],
                'YIL133C_mRNA_0'  : ['YIL133C_mRNA_0:0-648^938-1007(-)'],
                'YIL156W_B_mRNA_0': ['YIL156W_B_mRNA_0:0-41^103-408(+)'],
                'YKL006W_mRNA_0'  : ['YKL006W_mRNA_0:0-157^555-954(+)'],
                'YMR194C_B_mRNA_0': ['YMR194C_B_mRNA_0:0-325^397-729(-)'],
                'YNL130C_mRNA_0'  : ['YNL130C_mRNA_0:0-1204^1296-1382(-)'],
                'YPL249C_A_mRNA_0': ['YPL249C_A_mRNA_0:0-415^653-697(-)'],
               }
known_juncs = { K : [SegmentChain.from_str(X) for X in V] for K,V in known_juncs.items() }
"""Annotated splice junctions"""

all_known_juncs = []
for v in known_juncs.values():
    all_known_juncs.extend(v)
    
known_juncs_as_tuples = {
                 "YNL130C"   : [("YNL130C",53,145,"-"),],
                 "YPL249C-A" : [("YPL249C-A",53,291,"-"),],
                 
                'YBR215W_mRNA_0'  : [('YBR215W_mRNA_0',108,192,'+'),],
                'YHL001W_mRNA_0'  : [('YHL001W_mRNA_0',146,544,'+'),],
                'YIL018W_mRNA_0'  : [('YIL018W_mRNA_0',30,430,'+'),],
                'YIL133C_mRNA_0'  : [('YIL133C_mRNA_0',648,938,'-'),],
                'YIL156W_B_mRNA_0': [('YIL156W_B_mRNA_0',41,103,'+'),],
                'YKL006W_mRNA_0'  : [('YKL006W_mRNA_0',157,555,'+'),],
                'YMR194C_B_mRNA_0': [('YMR194C_B_mRNA_0',325,397,'-'),],
                'YNL130C_mRNA_0'  : [('YNL130C_mRNA_0',1204,1296,'-'),],
                'YPL249C_A_mRNA_0': [('YPL249C_A_mRNA_0',415,653,'-'),],
               }
"""Annotated splice junctions as (chromosome, intron_start, intron_end, strand) tuples"""
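
The ^ in the chain strings marks splice junctions, so the tuple form is derivable mechanically: each junction is (chrom, end of one exon, start of the next exon, strand). A small sketch using only SegmentChain.from_str and segment iteration, both already used in these examples:

from plastid import SegmentChain

def junction_tuples(chain_str):
    """Recover (chrom, intron_start, intron_end, strand) junction tuples
    from a SegmentChain string like 'YNL130C:0-53^145-180(-)'."""
    chain = SegmentChain.from_str(chain_str)
    segs = list(chain)  # segments, in genomic order
    return [(chain.chrom, left.end, right.start, chain.strand)
            for (left, right) in zip(segs, segs[1:])]

print(junction_tuples("YNL130C:0-53^145-180(-)"))  # [('YNL130C', 53, 145, '-')]
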
Ejemplo n.º 26
0
        reader = BED_Reader(cStringIO.StringIO(_NARROW_PEAK_TEXT),
                            extra_columns=14)
        with warnings.catch_warnings(record=True) as warns:
            warnings.simplefilter("always")
            ltmp = list(reader)
            assert_greater_equal(len(warns), 0)


#===============================================================================
# INDEX: test data
#===============================================================================

# test dataset, constructed manually to include various edge cases
_TEST_SEGMENTCHAINS = [
    # single-interval
    SegmentChain(GenomicSegment("chrA", 100, 1100, "+"), ID="IVC1p"),
    SegmentChain(GenomicSegment("chrA", 100, 1100, "-"), ID="IVC1m"),
    # multi-interval
    SegmentChain(GenomicSegment("chrA", 100, 1100, "+"),
                 GenomicSegment("chrA", 2100, 2600, "+"),
                 ID="IVC2p"),
    SegmentChain(GenomicSegment("chrA", 100, 1100, "-"),
                 GenomicSegment("chrA", 2100, 2600, "-"),
                 ID="IVC2m"),
    # multi-interval, with score
    SegmentChain(GenomicSegment("chrA", 100, 1100, "+"),
                 GenomicSegment("chrA", 2100, 2600, "+"),
                 ID="IVC3p",
                 score=500),
    SegmentChain(GenomicSegment("chrA", 100, 1100, "-"),
                 GenomicSegment("chrA", 2100, 2600, "-"),
Ejemplo n.º 27
0
def process_partial_group(transcripts, mask_hash, printer):
    """Correct boundaries of merged genes, as described in :func:`do_generate`

    Parameters
    ----------
    transcripts : dict
        Dictionary mapping unique transcript IDs to |Transcripts|.
        This set should be complete in the sense that it should contain
        all transcripts that have any chance of mutually overlapping
        each other (e.g. all on same chromosome and strand). 

    mask_hash : |GenomeHash|
        |GenomeHash| of regions to exclude from analysis

    printer : file-like
        Filehandle to which status messages are written

    Returns
    -------
    :class:`pandas.DataFrame`
        Table of merged gene positions

    :class:`pandas.DataFrame`
        Table of adjusted transcript positions

    :class:`dict`
        Dictionary mapping raw gene names to merged gene names
    """
    gene_table = {
        "region": [],
        "transcript_ids": [],
        "exon_unmasked": [],
        "exon": [],
        "masked": [],
        "utr5": [],
        "cds": [],
        "utr3": [],
        "exon_bed": [],
        "utr5_bed": [],
        "cds_bed": [],
        "utr3_bed": [],
        "masked_bed": [],
    }

    # data table for transcripts
    transcript_table = {
        "region": [],
        "exon": [],
        "utr5": [],
        "cds": [],
        "utr3": [],
        "masked": [],
        "exon_unmasked": [],
        "transcript_ids": [],
        "exon_bed": [],
        "utr5_bed": [],
        "cds_bed": [],
        "utr3_bed": [],
        "masked_bed": [],
    }

    keycombos = list(itertools.permutations(("utr5", "cds", "utr3"), 2))

    # merge genes that share exons & write output
    printer.write("Collapsing genes that share exons ...")
    merged_genes = merge_genes(transcripts)

    # remap transcripts to merged genes
    # and vice-versa
    merged_gene_tx = {}
    tx_merged_gene = {}
    printer.write("Mapping transcripts to merged genes...")
    for txid in transcripts:
        my_tx = transcripts[txid]
        my_gene = my_tx.get_gene()
        my_merged = merged_genes[my_gene]
        tx_merged_gene[txid] = my_merged
        try:
            merged_gene_tx[my_merged].append(txid)
        except KeyError:
            merged_gene_tx[my_merged] = [txid]

    # flatten merged genes
    printer.write(
        "Flattening merged genes, masking positions, and labeling subfeatures ..."
    )
    for n, (gene_id, my_txids) in enumerate(merged_gene_tx.items()):
        if n % 1000 == 0 and n > 0:
            printer.write("    %s genes ..." % n)

        my_gene_positions = []
        chroms = []
        strands = []
        for my_txid in my_txids:
            my_segmentchain = transcripts[my_txid]
            chroms.append(my_segmentchain.chrom)
            strands.append(my_segmentchain.strand)
            my_gene_positions.extend(my_segmentchain.get_position_list())

        # actually skip inconsistent genes, rather than only warning about them
        if len(set(chroms)) > 1:
            printer.write(
                "Skipping gene %s which contains multiple chromosomes: %s"
                % (gene_id, ",".join(chroms)))
            continue

        if len(set(strands)) > 1:
            printer.write(
                "Skipping gene %s which contains multiple strands: %s" %
                (gene_id, ",".join(strands)))
            continue

        my_gene_positions = set(my_gene_positions)
        gene_ivc_raw = SegmentChain(
            *positions_to_segments(chroms[0], strands[0], my_gene_positions))
        gene_table["region"].append(gene_id)
        gene_table["transcript_ids"].append(",".join(sorted(my_txids)))
        gene_table["exon_unmasked"].append(gene_ivc_raw)

    printer.write("    %s genes total." % (n + 1))

    # mask genes
    printer.write("Masking positions and labeling subfeature positions ...")
    gene_hash = GenomeHash(gene_table["exon_unmasked"], do_copy=False)

    for n, (gene_id, gene_ivc_raw) in enumerate(
            zip(gene_table["region"], gene_table["exon_unmasked"])):
        if n % 2000 == 0:
            printer.write("    %s genes ..." % n)

        my_chrom = gene_ivc_raw.spanning_segment.chrom
        my_strand = gene_ivc_raw.spanning_segment.strand

        masked_positions = []
        nearby_genes = gene_hash[gene_ivc_raw]

        # don't mask out positions from identical gene
        gene_ivc_raw_positions = gene_ivc_raw.get_position_set()
        nearby_genes = [
            X for X in nearby_genes
            if X.get_position_set() != gene_ivc_raw_positions
        ]
        for gene in nearby_genes:
            masked_positions.extend(gene.get_position_list())

        nearby_masks = mask_hash[gene_ivc_raw]
        for mask in nearby_masks:
            masked_positions.extend(mask.get_position_list())

        masked_positions = set(masked_positions)

        gene_positions_raw = gene_ivc_raw.get_position_set()
        mask_ivc_positions = gene_positions_raw & masked_positions
        total_mask_ivc = SegmentChain(*positions_to_segments(
            my_chrom, my_strand, mask_ivc_positions),
                                      ID=gene_id)
        gene_table["masked"].append(total_mask_ivc)
        gene_table["masked_bed"].append(total_mask_ivc.as_bed())

        gene_post_mask = gene_positions_raw - masked_positions
        gene_post_mask_ivc = SegmentChain(*positions_to_segments(
            my_chrom, my_strand, gene_post_mask),
                                          ID=gene_id)
        gene_table["exon"].append(gene_post_mask_ivc)
        gene_table["exon_bed"].append(gene_post_mask_ivc.as_bed())

        masked_positions = total_mask_ivc.get_position_set()
        tmp_positions = {
            "utr5": set(),
            "cds": set(),
            "utr3": set(),
        }
        txids = sorted(merged_gene_tx[gene_id])
        chrom = gene_post_mask_ivc.chrom
        strand = gene_post_mask_ivc.strand

        # pool transcript positions
        for txid in txids:
            transcript = transcripts[txid]

            utr5pos = transcript.get_utr5().get_position_set()
            cdspos = transcript.get_cds().get_position_set()
            utr3pos = transcript.get_utr3().get_position_set()

            tmp_positions["utr5"] |= utr5pos
            tmp_positions["cds"] |= cdspos
            tmp_positions["utr3"] |= utr3pos

        # eliminate positions in which CDS & UTRs overlap from each transcript
        for txid in txids:
            transcript = transcripts[txid]
            transcript_positions = {
                "utr5": transcript.get_utr5().get_position_set(),
                "cds": transcript.get_cds().get_position_set(),
                "utr3": transcript.get_utr3().get_position_set(),
            }

            for key1, key2 in keycombos:
                transcript_positions[key1] -= tmp_positions[key2]
                transcript_positions[key1] -= masked_positions

            transcript_table["region"].append(txid)

            # all unmasked positions
            my_chain = SegmentChain(*positions_to_segments(
                chrom, strand,
                transcript.get_position_set() - masked_positions),
                                    ID=txid)
            transcript_table["exon"].append(str(my_chain))
            transcript_table["exon_bed"].append(my_chain.as_bed())

            # all uniquely-labeled unmasked positions
            for k, v in transcript_positions.items():
                my_chain = SegmentChain(*positions_to_segments(
                    chrom, strand, v),
                                        ID=txid)
                transcript_table[k].append(str(my_chain))
                transcript_table["%s_bed" % k].append(my_chain.as_bed())

            total_mask_ivc.attr["ID"] = txid
            transcript_table["masked"].append(str(total_mask_ivc))
            transcript_table["masked_bed"].append(total_mask_ivc.as_bed())
            transcript_table["exon_unmasked"].append(str(transcript))
            transcript_table["transcript_ids"].append(txid)

        tmp_positions2 = copy.deepcopy(tmp_positions)
        for k1, k2 in keycombos:
            tmp_positions[k1] -= tmp_positions2[k2]
            tmp_positions[k1] -= masked_positions

        for k in tmp_positions.keys():
            my_chain = SegmentChain(*positions_to_segments(
                chrom, strand, tmp_positions[k]),
                                    ID=gene_id)
            gene_table[k].append(str(my_chain))
            gene_table["%s_bed" % k].append(my_chain.as_bed())

    printer.write("    %s genes total." % (n + 1))

    # cast SegmentChains/Transcripts to strings to keep numpy from unpacking them
    conversion_keys = [
        "exon", "utr5", "cds", "utr3", "masked", "exon_unmasked"
    ]
    for k in conversion_keys:
        gene_table[k] = [str(X) for X in gene_table[k]]
        transcript_table[k] = [str(X) for X in transcript_table[k]]

    gene_df = pd.DataFrame(gene_table)
    gene_df.sort_values(["region"], inplace=True)

    transcript_df = pd.DataFrame(transcript_table)
    transcript_df.sort_values(["region"], inplace=True)

    return gene_df, transcript_df, merged_genes
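
The flattening step above is just a union of per-transcript position sets converted back into segments. A reduced sketch of that idea on toy chains; the import path for positions_to_segments is assumed from plastid:

from plastid import SegmentChain, GenomicSegment
from plastid.genomics.roitools import positions_to_segments

# Toy "transcripts": two overlapping chains on the same chromosome and strand.
tx1 = SegmentChain(GenomicSegment("chrA", 100, 200, "+"),
                   GenomicSegment("chrA", 300, 400, "+"))
tx2 = SegmentChain(GenomicSegment("chrA", 150, 250, "+"))

merged_positions = tx1.get_position_set() | tx2.get_position_set()
flat_gene = SegmentChain(*positions_to_segments("chrA", "+", merged_positions))
print(flat_gene)  # chrA:100-250^300-400(+)
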
Ejemplo n.º 28
0
    def test_search_fields_singlevalue(self):
        reader = BigBedReader(self.bb_indexed)
        found = list(reader.search("name", "should_have_no_match"))
        self.assertEqual([], found)

        found = list(reader.search("Name", "Sam-S-RE"))
        expected = [
            SegmentChain(GenomicSegment('2L', 106902, 107000, '+'),
                         GenomicSegment('2L', 107764, 107838, '+'),
                         GenomicSegment('2L', 108587, 108809, '+'),
                         GenomicSegment('2L', 110405, 110483, '+'),
                         GenomicSegment('2L', 110754, 110877, '+'),
                         GenomicSegment('2L', 111906, 112019, '+'),
                         GenomicSegment('2L', 112689, 113369, '+'),
                         GenomicSegment('2L', 113433, 114432, '+'),
                         Alias="'['M(2)21AB-RE', 'CG2674-RE']'",
                         ID='FBtr0089437',
                         Name='Sam-S-RE',
                         color='#000000',
                         gene_id='FBgn0005278',
                         score='0.0',
                         thickend='113542',
                         thickstart='108685',
                         type='exon'),
        ]
        self.assertEqual(expected, found)

        found = list(reader.search("gene_id", "FBgn0005278"))
        expected = [
            SegmentChain(GenomicSegment('2L', 106902, 107000, '+'),
                         GenomicSegment('2L', 107764, 107838, '+'),
                         GenomicSegment('2L', 108587, 108809, '+'),
                         GenomicSegment('2L', 110405, 110483, '+'),
                         GenomicSegment('2L', 110754, 110877, '+'),
                         GenomicSegment('2L', 111906, 112019, '+'),
                         GenomicSegment('2L', 112689, 113369, '+'),
                         GenomicSegment('2L', 113433, 114432, '+'),
                         Alias="'['M(2)21AB-RE', 'CG2674-RE']'",
                         ID='FBtr0089437',
                         Name='Sam-S-RE',
                         color='#000000',
                         gene_id='FBgn0005278',
                         score='0.0',
                         thickend='113542',
                         thickstart='108685',
                         type='exon'),
            SegmentChain(GenomicSegment('2L', 107760, 107838, '+'),
                         GenomicSegment('2L', 108587, 108809, '+'),
                         GenomicSegment('2L', 110405, 110483, '+'),
                         GenomicSegment('2L', 110754, 111337, '+'),
                         Alias='na',
                         ID='FBtr0308091',
                         Name='Sam-S-RK',
                         color='#000000',
                         gene_id='FBgn0005278',
                         score='0.0',
                         thickend='110900',
                         thickstart='108685',
                         type='exon'),
            SegmentChain(GenomicSegment('2L', 107760, 107838, '+'),
                         GenomicSegment('2L', 108587, 108809, '+'),
                         GenomicSegment('2L', 110405, 110483, '+'),
                         GenomicSegment('2L', 110754, 110877, '+'),
                         GenomicSegment('2L', 111004, 111117, '+'),
                         GenomicSegment('2L', 111906, 112019, '+'),
                         GenomicSegment('2L', 112689, 113369, '+'),
                         GenomicSegment('2L', 113433, 114210, '+'),
                         Alias="'['M(2)21AB-RB', 'CG2674-RB']'",
                         ID='FBtr0089428',
                         Name='Sam-S-RB',
                         color='#000000',
                         gene_id='FBgn0005278',
                         score='0.0',
                         thickend='112741',
                         thickstart='108685',
                         type='exon'),
            SegmentChain(GenomicSegment('2L', 107760, 107838, '+'),
                         GenomicSegment('2L', 108587, 108809, '+'),
                         GenomicSegment('2L', 110405, 110483, '+'),
                         GenomicSegment('2L', 110754, 110877, '+'),
                         GenomicSegment('2L', 111906, 112019, '+'),
                         GenomicSegment('2L', 112689, 113369, '+'),
                         GenomicSegment('2L', 113433, 114432, '+'),
                         Alias="'['M(2)21AB-RA', 'CG2674-RA']'",
                         ID='FBtr0089429',
                         Name='Sam-S-RA',
                         color='#000000',
                         gene_id='FBgn0005278',
                         score='0.0',
                         thickend='113542',
                         thickstart='108685',
                         type='exon'),
            SegmentChain(GenomicSegment('2L', 107760, 107956, '+'),
                         GenomicSegment('2L', 108587, 108809, '+'),
                         GenomicSegment('2L', 110405, 110483, '+'),
                         GenomicSegment('2L', 110754, 110877, '+'),
                         GenomicSegment('2L', 112689, 113369, '+'),
                         GenomicSegment('2L', 113433, 114432, '+'),
                         Alias='na',
                         ID='FBtr0330656',
                         Name='Sam-S-RL',
                         color='#000000',
                         gene_id='FBgn0005278',
                         score='0.0',
                         thickend='112781',
                         thickstart='108685',
                         type='exon'),
            SegmentChain(GenomicSegment('2L', 107936, 108226, '+'),
                         GenomicSegment('2L', 108587, 108809, '+'),
                         GenomicSegment('2L', 110405, 110483, '+'),
                         GenomicSegment('2L', 110754, 110877, '+'),
                         GenomicSegment('2L', 111906, 112019, '+'),
                         GenomicSegment('2L', 112689, 113369, '+'),
                         GenomicSegment('2L', 113433, 114210, '+'),
                         Alias="'['M(2)21AB-RH', 'CG2674-RH']'",
                         ID='FBtr0089432',
                         Name='Sam-S-RH',
                         color='#000000',
                         gene_id='FBgn0005278',
                         score='0.0',
                         thickend='113542',
                         thickstart='108685',
                         type='exon'),
            SegmentChain(GenomicSegment('2L', 107936, 108101, '+'),
                         GenomicSegment('2L', 108587, 108809, '+'),
                         GenomicSegment('2L', 110405, 110483, '+'),
                         GenomicSegment('2L', 110754, 110877, '+'),
                         GenomicSegment('2L', 111906, 112019, '+'),
                         GenomicSegment('2L', 112689, 113369, '+'),
                         GenomicSegment('2L', 113433, 114432, '+'),
                         Alias="'['M(2)21AB-RD', 'CG2674-RD']'",
                         ID='FBtr0089430',
                         Name='Sam-S-RD',
                         color='#000000',
                         gene_id='FBgn0005278',
                         score='0.0',
                         thickend='113542',
                         thickstart='108685',
                         type='exon'),
            SegmentChain(GenomicSegment('2L', 107936, 108101, '+'),
                         GenomicSegment('2L', 108587, 108809, '+'),
                         GenomicSegment('2L', 110405, 110483, '+'),
                         GenomicSegment('2L', 110754, 110877, '+'),
                         GenomicSegment('2L', 111004, 111117, '+'),
                         GenomicSegment('2L', 112689, 113369, '+'),
                         GenomicSegment('2L', 113433, 114432, '+'),
                         Alias="'['M(2)21AB-RC', 'CG2674-RC']'",
                         ID='FBtr0089431',
                         Name='Sam-S-RC',
                         color='#000000',
                         gene_id='FBgn0005278',
                         score='0.0',
                         thickend='113542',
                         thickstart='108685',
                         type='exon'),
            SegmentChain(GenomicSegment('2L', 108088, 108226, '+'),
                         GenomicSegment('2L', 108587, 108809, '+'),
                         GenomicSegment('2L', 110405, 110483, '+'),
                         GenomicSegment('2L', 110754, 110877, '+'),
                         GenomicSegment('2L', 111906, 112019, '+'),
                         GenomicSegment('2L', 112689, 113369, '+'),
                         GenomicSegment('2L', 113433, 114432, '+'),
                         Alias="'['M(2)21AB-RF', 'CG2674-RF']'",
                         ID='FBtr0089433',
                         Name='Sam-S-RF',
                         color='#000000',
                         gene_id='FBgn0005278',
                         score='0.0',
                         thickend='113542',
                         thickstart='108685',
                         type='exon'),
            SegmentChain(GenomicSegment('2L', 108132, 108346, '+'),
                         GenomicSegment('2L', 108587, 108809, '+'),
                         GenomicSegment('2L', 110405, 110483, '+'),
                         GenomicSegment('2L', 110754, 110877, '+'),
                         GenomicSegment('2L', 111906, 112019, '+'),
                         GenomicSegment('2L', 112689, 113369, '+'),
                         GenomicSegment('2L', 113433, 114432, '+'),
                         Alias="'['M(2)21AB-RI', 'CG2674-RI']'",
                         ID='FBtr0089434',
                         Name='Sam-S-RI',
                         color='#000000',
                         gene_id='FBgn0005278',
                         score='0.0',
                         thickend='113542',
                         thickstart='108685',
                         type='exon'),
            SegmentChain(GenomicSegment('2L', 108132, 108226, '+'),
                         GenomicSegment('2L', 108587, 108809, '+'),
                         GenomicSegment('2L', 110405, 110483, '+'),
                         GenomicSegment('2L', 110754, 110877, '+'),
                         GenomicSegment('2L', 111004, 111117, '+'),
                         GenomicSegment('2L', 112689, 113369, '+'),
                         GenomicSegment('2L', 113433, 114432, '+'),
                         Alias="'['M(2)21AB-RJ', 'CG2674-RJ']'",
                         ID='FBtr0089435',
                         Name='Sam-S-RJ',
                         color='#000000',
                         gene_id='FBgn0005278',
                         score='0.0',
                         thickend='113542',
                         thickstart='108685',
                         type='exon'),
            SegmentChain(GenomicSegment('2L', 109593, 109793, '+'),
                         GenomicSegment('2L', 110405, 110483, '+'),
                         GenomicSegment('2L', 110754, 110877, '+'),
                         GenomicSegment('2L', 111004, 111117, '+'),
                         GenomicSegment('2L', 112689, 113369, '+'),
                         GenomicSegment('2L', 113433, 114210, '+'),
                         Alias="'['M(2)21AB-RG', 'CG2674-RG']'",
                         ID='FBtr0089436',
                         Name='Sam-S-RG',
                         color='#000000',
                         gene_id='FBgn0005278',
                         score='0.0',
                         thickend='113542',
                         thickstart='109750',
                         type='exon'),
        ]
        self.assertEqual(sorted(expected), sorted(found))
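
A minimal sketch of the search API exercised by this test; the file path is hypothetical, and the import location is assumed to be plastid.readers.bigbed:

from plastid.readers.bigbed import BigBedReader

reader = BigBedReader("annotation.bb")  # hypothetical BigBed file with extra fields
for chain in reader.search("gene_id", "FBgn0005278"):
    print(chain.get_name(), str(chain.spanning_segment))
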
Ejemplo n.º 29
0
def _regress_tfam(orf_set, gnd):
    """Performs non-negative least squares regression on all of the ORFs in a transcript family, using profiles constructed via _orf_profile()
    Also calculates Wald statistics for each orf and start codon, and for each stop codon if opts.startonly is False"""
    tfam = orf_set['tfam'].iat[0]
    strand = orf_set['strand'].iat[0]
    chrom = orf_set['chrom'].iat[0]
    tids = orf_set['tid'].drop_duplicates().tolist()
    all_tfam_genpos = set()
    tid_genpos = {}
    tlens = {}
    for (i, tid) in enumerate(tids):
        currtrans = SegmentChain.from_bed(bedlinedict[tid])
        curr_pos_set = currtrans.get_position_set()
        tlens[tid] = len(curr_pos_set)
        tid_genpos[tid] = curr_pos_set
        all_tfam_genpos.update(curr_pos_set)
    tfam_segs = SegmentChain(*positionlist_to_segments(chrom, strand, list(all_tfam_genpos)))
    all_tfam_genpos = np.array(sorted(all_tfam_genpos))
    if strand == '-':
        all_tfam_genpos = all_tfam_genpos[::-1]
    nnt = len(all_tfam_genpos)
    tid_indices = {tid: np.flatnonzero(np.in1d(all_tfam_genpos, list(curr_tid_genpos), assume_unique=True))
                   for (tid, curr_tid_genpos) in tid_genpos.iteritems()}
    hashed_counts = get_hashed_counts(tfam_segs, gnd)
    counts = np.zeros((len(rdlens), nnt), dtype=np.float64)  # even though they are integer-valued, will need to do float arithmetic
    for (i, rdlen) in enumerate(rdlens):
        for nmis in range(1+opts.max5mis):
            counts[i, :] += hashed_counts[(rdlen, nmis)]
    counts = counts.ravel()

    if opts.startcount:
        # Only include ORFS for which there is at least some minimum reads within one nucleotide of the start codon
        offsetmat = np.tile(nnt*np.arange(len(rdlens)), 3)  # offsets for each cond, expecting three positions to check for each
    #    try:
        orf_set = orf_set[[(counts[(start_idxes.repeat(len(rdlens))+offsetmat)].sum() >= opts.startcount) for start_idxes in
                           [tid_indices[tid][tcoord-1:tcoord+2] for (tid, tcoord, tstop) in orf_set[['tid', 'tcoord', 'tstop']].itertuples(False)]]]
        if orf_set.empty:
            return failure_return

    orf_strength_df = orf_set.sort_values('tcoord', ascending=False).drop_duplicates('orfname').reset_index(drop=True)
    abort_set = orf_set.drop_duplicates('gcoord').copy()
    abort_set['gstop'] = abort_set['gcoord']  # should maybe be +/-3, but then need to worry about splicing - and this is an easy flag
    abort_set['tstop'] = abort_set['tcoord']+3  # stop after the first codon
    abort_set['orfname'] = abort_set['gcoord'].apply(lambda x: '%s_%d_abort' % (tfam, x))
    orf_strength_df = pd.concat((orf_strength_df, abort_set), ignore_index=True)
    if not opts.startonly:  # if marking full ORFs, include histop model
        stop_set = orf_set.drop_duplicates('gstop').copy()
        stop_set['gcoord'] = stop_set['gstop']  # this is an easy flag
        stop_set['tcoord'] = stop_set['tstop']  # should probably be -3 nt, but this is another easy flag that distinguishes from abinit
        stop_set['orfname'] = stop_set['gstop'].apply(lambda x: '%s_%d_stop' % (tfam, x))
        orf_strength_df = pd.concat((orf_strength_df, stop_set), ignore_index=True)
    orf_profs = []
    indices = []
    for (tid, tcoord, tstop) in orf_strength_df[['tid', 'tcoord', 'tstop']].itertuples(False):
        if tcoord != tstop:  # not a histop
            tlen = tlens[tid]
            if tcoord+startnt[0] < 0:
                startadj = -startnt[0]-tcoord  # number of nts to remove from the start due to short 5' UTR; guaranteed > 0
            else:
                startadj = 0
            if tstop+stopnt[1] > tlen:
                stopadj = tstop+stopnt[1]-tlen  # number of nts to remove from the end due to short 3' UTR; guaranteed > 0
            else:
                stopadj = 0
            curr_indices = tid_indices[tid][tcoord+startnt[0]+startadj:tstop+stopnt[1]-stopadj]
            orf_profs.append(_orf_profile(tstop-tcoord)[:, startadj:tstop-tcoord+stopnt[1]-startnt[0]-stopadj].ravel())
        else:  # histop
            curr_indices = tid_indices[tid][tstop-6:tstop]
            orf_profs.append(stopprof[:, -6:].ravel())
        indices.append(np.concatenate([nnt*i+curr_indices for i in xrange(len(rdlens))]))
        # need to tile the indices for each read length
        if len(indices[-1]) != len(orf_profs[-1]):
            raise AssertionError('ORF length does not match index length')
    orf_matrix = scipy.sparse.csc_matrix((np.concatenate(orf_profs),
                                          np.concatenate(indices),
                                          np.cumsum([0]+[len(curr_indices) for curr_indices in indices])),
                                         shape=(nnt*len(rdlens), len(orf_strength_df)))
    # better to make it a sparse matrix, even though nnls requires a dense matrix, because of linear algebra to come
    nonzero_orfs = np.flatnonzero(orf_matrix.T.dot(counts) > 0)
    if len(nonzero_orfs) == 0:  # no possibility of anything coming up
        return failure_return
    orf_matrix = orf_matrix[:, nonzero_orfs]
    orf_strength_df = orf_strength_df.iloc[nonzero_orfs]  # don't bother fitting ORFs with zero reads throughout their entire length
    (orf_strs, resid) = nnls(orf_matrix.toarray(), counts)
    min_str = 1e-6  # allow for machine rounding error
    usable_orfs = orf_strs > min_str
    if not usable_orfs.any():
        return failure_return
    orf_strength_df = orf_strength_df[usable_orfs]
    orf_matrix = orf_matrix[:, usable_orfs] # remove entries for zero-strength ORFs or transcripts
    orf_strs = orf_strs[usable_orfs]
    orf_strength_df['orf_strength'] = orf_strs

    covmat = resid*resid*np.linalg.inv(orf_matrix.T.dot(orf_matrix).toarray())/(nnt*len(rdlens)-len(orf_strength_df))
    # homoscedastic version (assume equal variance at all positions)

    # resids = counts-orf_matrix.dot(orf_strs)
    # simple_covmat = np.linalg.inv(orf_matrix.T.dot(orf_matrix).toarray())
    # covmat = simple_covmat.dot(orf_matrix.T.dot(scipy.sparse.dia_matrix((resids*resids, 0), (len(resids), len(resids))))
    #                            .dot(orf_matrix).dot(simple_covmat))
    # # heteroscedastic version (Eicker-Huber-White robust estimator)

    orf_strength_df['W_orf'] = orf_strength_df['orf_strength']*orf_strength_df['orf_strength']/np.diag(covmat)
    orf_strength_df.set_index('orfname', inplace=True)
    elongating_orfs = ~(orf_strength_df['gstop'] == orf_strength_df['gcoord'])
    if opts.startonly:  # count abortive initiation events towards start strength in this case
        include_starts = (orf_strength_df['tcoord'] != orf_strength_df['tstop'])
        gcoord_grps = orf_strength_df[include_starts].groupby('gcoord')
        # even if we are willing to count abinit towards start strength, we certainly shouldn't count histop
        covmat_starts = covmat[np.ix_(include_starts.values, include_starts.values)]
        orf_strs_starts = orf_strs[include_starts.values]
    else:
        gcoord_grps = orf_strength_df[elongating_orfs].groupby('gcoord')
        covmat_starts = covmat[np.ix_(elongating_orfs.values, elongating_orfs.values)]
        orf_strs_starts = orf_strs[elongating_orfs.values]
    start_strength_df = pd.DataFrame.from_items([('tfam', tfam),
                                                 ('chrom', orf_set['chrom'].iloc[0]),
                                                 ('strand', orf_set['strand'].iloc[0]),
                                                 ('codon', gcoord_grps['codon'].first()),
                                                 ('start_strength', gcoord_grps['orf_strength'].aggregate(np.sum))])
    start_strength_df['W_start'] = pd.Series({gcoord: orf_strs_starts[rownums].dot(np.linalg.inv(covmat_starts[np.ix_(rownums, rownums)]))
                                              .dot(orf_strs_starts[rownums]) for (gcoord, rownums) in gcoord_grps.indices.iteritems()})

    if not opts.startonly:
        # count histop towards the stop codon - but still exclude abinit
        include_stops = (elongating_orfs | (orf_strength_df['tcoord'] == orf_strength_df['tstop']))
        gstop_grps = orf_strength_df[include_stops].groupby('gstop')
        covmat_stops = covmat[np.ix_(include_stops.values, include_stops.values)]
        orf_strs_stops = orf_strs[include_stops.values]
        stop_strength_df = pd.DataFrame.from_items([('tfam', tfam),
                                                    ('chrom', orf_set['chrom'].iloc[0]),
                                                    ('strand', orf_set['strand'].iloc[0]),
                                                    ('stop_strength', gstop_grps['orf_strength'].aggregate(np.sum))])
        stop_strength_df['W_stop'] = pd.Series({gstop: orf_strs_stops[rownums].dot(np.linalg.inv(covmat_stops[np.ix_(rownums, rownums)]))
                                                .dot(orf_strs_stops[rownums]) for (gstop, rownums) in gstop_grps.indices.iteritems()})

        # # nohistop
        # gstop_grps = orf_strength_df[elongating_orfs].groupby('gstop')
        # covmat_stops = covmat[np.ix_(elongating_orfs.values, elongating_orfs.values)]
        # orf_strs_stops = orf_strs[elongating_orfs.values]
        # stop_strength_df['stop_strength_nohistop'] = gstop_grps['orf_strength'].aggregate(np.sum)
        # stop_strength_df['W_stop_nohistop'] = pd.Series({gstop:orf_strs_stops[rownums].dot(np.linalg.inv(covmat_stops[np.ix_(rownums,rownums)]))
        #                                                  .dot(orf_strs_stops[rownums]) for (gstop, rownums) in gstop_grps.indices.iteritems()})

        return orf_strength_df, start_strength_df, stop_strength_df
    else:
        return orf_strength_df, start_strength_df
Ejemplo n.º 30
0
def _regress_tfam(orf_set, gnd):
    """Performs non-negative least squares regression on all of the ORFs in a transcript family, using profiles constructed via _orf_profile()
    Also calculates Wald statistics for each orf and start codon, and for each stop codon if opts.startonly is False"""
    tfam = orf_set['tfam'].iat[0]
    strand = orf_set['strand'].iat[0]
    chrom = orf_set['chrom'].iat[0]
    tids = orf_set['tid'].drop_duplicates().tolist()
    all_tfam_genpos = set()
    tid_genpos = {}
    tlens = {}
    for (i, tid) in enumerate(tids):
        currtrans = SegmentChain.from_bed(bedlinedict[tid])
        curr_pos_set = currtrans.get_position_set()
        tlens[tid] = len(curr_pos_set)
        tid_genpos[tid] = curr_pos_set
        all_tfam_genpos.update(curr_pos_set)
    tfam_segs = SegmentChain(*positionlist_to_segments(chrom, strand, list(all_tfam_genpos)))
    all_tfam_genpos = np.array(sorted(all_tfam_genpos))
    if strand == '-':
        all_tfam_genpos = all_tfam_genpos[::-1]
    nnt = len(all_tfam_genpos)
    tid_indices = {tid: np.flatnonzero(np.in1d(all_tfam_genpos, list(curr_tid_genpos), assume_unique=True))
                   for (tid, curr_tid_genpos) in tid_genpos.iteritems()}
    hashed_counts = get_hashed_counts(tfam_segs, gnd)
    counts = np.zeros((len(rdlens), nnt), dtype=np.float64)  # even though they are integer-valued, will need to do float arithmetic
    for (i, rdlen) in enumerate(rdlens):
        for nmis in range(1+opts.max5mis):
            counts[i, :] += hashed_counts[(rdlen, nmis)]
    counts = counts.ravel()

    if opts.startcount:
        # Only include ORFS for which there is at least some minimum reads within one nucleotide of the start codon
        offsetmat = np.tile(nnt*np.arange(len(rdlens)), 3)  # offsets for each cond, expecting three positions to check for each
    #    try:
        orf_set = orf_set[[(counts[(start_idxes.repeat(len(rdlens))+offsetmat)].sum() >= opts.startcount) for start_idxes in
                           [tid_indices[tid][tcoord-1:tcoord+2] for (tid, tcoord, tstop) in orf_set[['tid', 'tcoord', 'tstop']].itertuples(False)]]]
        if orf_set.empty:
            return failure_return

    orf_strength_df = orf_set.sort_values('tcoord', ascending=False).drop_duplicates('orfname').reset_index(drop=True)
    abort_set = orf_set.drop_duplicates('gcoord').copy()
    abort_set['gstop'] = abort_set['gcoord']  # should maybe be +/-3, but then need to worry about splicing - and this is an easy flag
    abort_set['tstop'] = abort_set['tcoord']+3  # stop after the first codon
    abort_set['orfname'] = abort_set['gcoord'].apply(lambda x: '%s_%d_abort' % (tfam, x))
    orf_strength_df = pd.concat((orf_strength_df, abort_set), ignore_index=True)
    if not opts.startonly:  # if marking full ORFs, include histop model
        stop_set = orf_set.drop_duplicates('gstop').copy()
        stop_set['gcoord'] = stop_set['gstop']  # this is an easy flag
        stop_set['tcoord'] = stop_set['tstop']  # should probably be -3 nt, but this is another easy flag that distinguishes from abinit
        stop_set['orfname'] = stop_set['gstop'].apply(lambda x: '%s_%d_stop' % (tfam, x))
        orf_strength_df = pd.concat((orf_strength_df, stop_set), ignore_index=True)
    orf_profs = []
    indices = []
    for (tid, tcoord, tstop) in orf_strength_df[['tid', 'tcoord', 'tstop']].itertuples(False):
        if tcoord != tstop:  # not a histop
            tlen = tlens[tid]
            if tcoord+startnt[0] < 0:
                startadj = -startnt[0]-tcoord  # number of nts to remove from the start due to short 5' UTR; guaranteed > 0
            else:
                startadj = 0
            if tstop+stopnt[1] > tlen:
                stopadj = tstop+stopnt[1]-tlen  # number of nts to remove from the end due to short 3' UTR; guaranteed > 0
            else:
                stopadj = 0
            curr_indices = tid_indices[tid][tcoord+startnt[0]+startadj:tstop+stopnt[1]-stopadj]
            orf_profs.append(_orf_profile(tstop-tcoord)[:, startadj:tstop-tcoord+stopnt[1]-startnt[0]-stopadj].ravel())
        else:  # histop
            curr_indices = tid_indices[tid][tstop-6:tstop]
            orf_profs.append(stopprof[:, -6:].ravel())
        indices.append(np.concatenate([nnt*i+curr_indices for i in xrange(len(rdlens))]))
        # need to tile the indices for each read length
        if len(indices[-1]) != len(orf_profs[-1]):
            raise AssertionError('ORF length does not match index length')
    orf_matrix = scipy.sparse.csc_matrix((np.concatenate(orf_profs),
                                          np.concatenate(indices),
                                          np.cumsum([0]+[len(curr_indices) for curr_indices in indices])),
                                         shape=(nnt*len(rdlens), len(orf_strength_df)))
    # better to make it a sparse matrix, even though nnls requires a dense matrix, because of linear algebra to come
    nonzero_orfs = np.flatnonzero(orf_matrix.T.dot(counts) > 0)
    if len(nonzero_orfs) == 0:  # no possibility of anything coming up
        return failure_return
    orf_matrix = orf_matrix[:, nonzero_orfs]
    orf_strength_df = orf_strength_df.iloc[nonzero_orfs]  # don't bother fitting ORFs with zero reads throughout their entire length
    (orf_strs, resid) = nnls(orf_matrix.toarray(), counts)
    min_str = 1e-6  # allow for machine rounding error
    usable_orfs = orf_strs > min_str
    if not usable_orfs.any():
        return failure_return
    orf_strength_df = orf_strength_df[usable_orfs]
    orf_matrix = orf_matrix[:, usable_orfs]  # remove entries for zero-strength ORFs or transcripts
    orf_strs = orf_strs[usable_orfs]
    orf_strength_df['orf_strength'] = orf_strs

    covmat = resid*resid*np.linalg.inv(orf_matrix.T.dot(orf_matrix).toarray())/(nnt*len(rdlens)-len(orf_strength_df))
    # homoscedastic version (assume equal variance at all positions)

    # resids = counts-orf_matrix.dot(orf_strs)
    # simple_covmat = np.linalg.inv(orf_matrix.T.dot(orf_matrix).toarray())
    # covmat = simple_covmat.dot(orf_matrix.T.dot(scipy.sparse.dia_matrix((resids*resids, 0), (len(resids), len(resids))))
    #                            .dot(orf_matrix).dot(simple_covmat))
    # # heteroscedastic version (Eicker-Huber-White robust estimator)

    orf_strength_df['W_orf'] = orf_strength_df['orf_strength']*orf_strength_df['orf_strength']/np.diag(covmat)
    orf_strength_df.set_index('orfname', inplace=True)
    elongating_orfs = ~(orf_strength_df['gstop'] == orf_strength_df['gcoord'])
    if opts.startonly:  # count abortive initiation events towards start strength in this case
        include_starts = (orf_strength_df['tcoord'] != orf_strength_df['tstop'])
        if not include_starts.any():
            return failure_return  # no need to keep going if there weren't any useful starts
        gcoord_grps = orf_strength_df[include_starts].groupby('gcoord')
        # even if we are willing to count abinit towards start strength, we certainly shouldn't count histop
        covmat_starts = covmat[np.ix_(include_starts.values, include_starts.values)]
        orf_strs_starts = orf_strs[include_starts.values]
    else:
        if not elongating_orfs.any():
            return failure_return
        gcoord_grps = orf_strength_df[elongating_orfs].groupby('gcoord')
        covmat_starts = covmat[np.ix_(elongating_orfs.values, elongating_orfs.values)]
        orf_strs_starts = orf_strs[elongating_orfs.values]
    start_strength_df = pd.DataFrame.from_items([('tfam', tfam),
                                                 ('chrom', orf_set['chrom'].iloc[0]),
                                                 ('strand', orf_set['strand'].iloc[0]),
                                                 ('codon', gcoord_grps['codon'].first()),
                                                 ('start_strength', gcoord_grps['orf_strength'].aggregate(np.sum))])
    start_strength_df['W_start'] = pd.Series({gcoord: orf_strs_starts[rownums].dot(np.linalg.inv(covmat_starts[np.ix_(rownums, rownums)]))
                                              .dot(orf_strs_starts[rownums]) for (gcoord, rownums) in gcoord_grps.indices.iteritems()})

    if not opts.startonly:
        # count histop towards the stop codon - but still exclude abinit
        include_stops = (elongating_orfs | (orf_strength_df['tcoord'] == orf_strength_df['tstop']))
        gstop_grps = orf_strength_df[include_stops].groupby('gstop')
        covmat_stops = covmat[np.ix_(include_stops.values, include_stops.values)]
        orf_strs_stops = orf_strs[include_stops.values]
        stop_strength_df = pd.DataFrame.from_items([('tfam', tfam),
                                                    ('chrom', orf_set['chrom'].iloc[0]),
                                                    ('strand', orf_set['strand'].iloc[0]),
                                                    ('stop_strength', gstop_grps['orf_strength'].aggregate(np.sum))])
        stop_strength_df['W_stop'] = pd.Series({gstop: orf_strs_stops[rownums].dot(np.linalg.inv(covmat_stops[np.ix_(rownums, rownums)]))
                                                .dot(orf_strs_stops[rownums]) for (gstop, rownums) in gstop_grps.indices.iteritems()})

        # # nohistop
        # gstop_grps = orf_strength_df[elongating_orfs].groupby('gstop')
        # covmat_stops = covmat[np.ix_(elongating_orfs.values, elongating_orfs.values)]
        # orf_strs_stops = orf_strs[elongating_orfs.values]
        # stop_strength_df['stop_strength_nohistop'] = gstop_grps['orf_strength'].aggregate(np.sum)
        # stop_strength_df['W_stop_nohistop'] = pd.Series({gstop:orf_strs_stops[rownums].dot(np.linalg.inv(covmat_stops[np.ix_(rownums,rownums)]))
        #                                                  .dot(orf_strs_stops[rownums]) for (gstop, rownums) in gstop_grps.indices.iteritems()})

        return orf_strength_df, start_strength_df, stop_strength_df
    else:
        return orf_strength_df, start_strength_df
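
The statistics above follow ordinary least-squares theory: after nnls, the coefficient covariance is estimated as sigma^2 (X^T X)^-1 with sigma^2 = RSS / (n - p), matching the homoscedastic formula in the comment, and each Wald statistic is beta^2 divided by its estimated variance. A toy numpy/scipy sketch of that computation on synthetic data, not the script's actual profiles:

import numpy as np
from scipy.optimize import nnls

rng = np.random.default_rng(0)
X = np.abs(rng.normal(size=(200, 3)))  # design matrix: three toy "ORF profiles"
beta_true = np.array([2.0, 0.0, 5.0])
y = X.dot(beta_true) + rng.normal(scale=0.5, size=200)

beta, rnorm = nnls(X, y)                     # rnorm is the residual 2-norm
sigma2 = rnorm * rnorm / (X.shape[0] - X.shape[1])
covmat = sigma2 * np.linalg.inv(X.T.dot(X))  # homoscedastic covariance estimate
W = beta * beta / np.diag(covmat)            # Wald statistic per coefficient
print(np.round(beta, 2), np.round(W, 1))
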
Ejemplo n.º 31
0
def check_window(tx_ivc,
                 known_roi,
                 known_offset,
                 known_ref_point,
                 flank_up,
                 flank_down,
                 test_method,
                 test_name,
                 ref_delta=0):
    """Helper function to test output of window landmark functions
    
    Parameters
    ----------
    tx_ivc : |SegmentChain|
        Test Transcript from which window will be derived
    
    known_roi : |SegmentChain|
        Reference output for ROI
    
    known_offset : int
        Known offset to start of ROI
    
    known_ref_point : (str,int,str) or numpy.nan
        Known offset to landmark in ROI as ("chromosome_name",position,"strand")
    
    flank_up : int
        Flank upstream of landmark to include in ROI
    
    flank_down : int
        Flank downstream of landmark to include in ROI
    
    test_method : function
        Function to test (e.g. :py:func:`window_cds_start`, :py:func:`window_cds_stop`)
    
    test_name : str
        Name of test (for generating rich error output)
    
    ref_delta : int, optional
        Distance from reference landmark at which to center windows
    """
    err_str = ("Failed %s on %s (strand: '%s', up: %s, down: %s). " %
               (test_name, str(tx_ivc), tx_ivc.spanning_segment.strand,
                flank_up, flank_down)) + "%s unequal (%s vs %s)"
    test_roi, test_offset, test_ref_point = test_method(tx_ivc,
                                                        flank_up,
                                                        flank_down,
                                                        ref_delta=ref_delta)
    check_equality(SegmentChain.from_str(known_roi), test_roi)

    # if no landmark
    if numpy.isnan(known_offset) or (isinstance(known_ref_point, float)
                                     and numpy.isnan(known_ref_point)):
        assert_true(numpy.isnan(test_offset),
                    msg=err_str % ("offset", known_offset, test_offset))
        assert_true(numpy.isnan(test_ref_point),
                    msg=err_str %
                    ("ref_point", known_ref_point, test_ref_point))
    # if landmark
    else:
        assert_equal(known_offset,
                     test_offset,
                     msg=err_str % ("offset", known_offset, test_offset))
        assert_equal(known_ref_point,
                     test_ref_point,
                     msg=err_str %
                     ("ref_point", known_ref_point, test_ref_point))
Ejemplo n.º 32
0
        for tid in tfam_val[0] if tid in gene_name_lookup
    }
    if not geneset:
        geneset = set(
            tfam_val[0]
        )  # if no gene names available, just use the tids themselves
    genename = _choose_name(geneset)
    if genename in new_tfams:
        multi_names[genename] += 1
        genename = '%s_%d' % (genename, multi_names[genename])
    new_tfams[genename] = tfam_val
for (genename, num_appearances) in multi_names.iteritems():
    sys.stderr.write('WARNING: Gene name %s appears %d independent times\n' %
                     (genename, num_appearances))

if opts.verbose:
    logprint('Saving results')

with open(outbedname, 'w') as outbed:
    with open(outtxtname, 'w') as outtxt:
        for tfam, (tids, (chrom, strand), genpos) in new_tfams.iteritems():
            outbed.write(
                SegmentChain(*positionlist_to_segments(chrom, strand,
                                                       list(genpos)),
                             ID=tfam).as_bed())
            for tid in tids:
                outtxt.write('%s\t%s\n' % (tid, tfam))

if opts.verbose:
    logprint('Tasks complete')
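
The output loop above serializes each transcript family with SegmentChain.as_bed(), which renders a chain as a single BED12 line. A tiny sketch:

from plastid import SegmentChain, GenomicSegment

chain = SegmentChain(GenomicSegment("chrA", 100, 200, "+"),
                     GenomicSegment("chrA", 300, 400, "+"),
                     ID="my_tfam")
print(chain.as_bed(), end="")  # one BED12 line: chrA  100  400  my_tfam ...
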
Ejemplo n.º 33
0
CCCTCCTTCCGCTGGCCCCGACTGC
>chr30b:1(+)
CCTCCTTCCGCTGGCCCCGACTGCC
>chr30b:2(+)
CTCCTTCCGCTGGCCCCGACTGCCC
>chr30b:3(+)
TCCTTCCGCTGGCCCCGACTGCCCC
>chr30b:4(+)
CCTTCCGCTGGCCCCGACTGCCCCA
>chr30b:5(+)
CTTCCGCTGGCCCCGACTGCCCCAG
"""

CROSSMAP1 = [
    (
        SegmentChain(GenomicSegment("chr50a", 1, 10, "+")),
        SegmentChain(GenomicSegment("chr50a", 1 + 25 - 1, 10 + 25 - 1, "-")),
    ),
    (
        SegmentChain(GenomicSegment("chr50a", 19, 26, "+")),
        SegmentChain(GenomicSegment("chr50a", 19 + 25 - 1, 26 + 25 - 1, "-")),
    ),
    (
        SegmentChain(GenomicSegment("chr30b", 0, 6, "+")),
        SegmentChain(GenomicSegment("chr30b", 0 + 25 - 1, 6 + 25 - 1, "-")),
    )
]

CROSSMAP2 = [
    (
        SegmentChain(GenomicSegment("chr50a", 1 + 1000, 10 + 1000, "+")),
Ejemplo n.º 34
0
    ],
    "YPL249C-A": [
        "YPL249C-A:0-53^291-334(-)",
    ],
    'YBR215W_mRNA_0': ['YBR215W_mRNA_0:0-108^192-2175(+)'],
    'YHL001W_mRNA_0': ['YHL001W_mRNA_0:0-146^544-961(+)'],
    'YIL018W_mRNA_0': ['YIL018W_mRNA_0:0-30^430-1280(+)'],
    'YIL133C_mRNA_0': ['YIL133C_mRNA_0:0-648^938-1007(-)'],
    'YIL156W_B_mRNA_0': ['YIL156W_B_mRNA_0:0-41^103-408(+)'],
    'YKL006W_mRNA_0': ['YKL006W_mRNA_0:0-157^555-954(+)'],
    'YMR194C_B_mRNA_0': ['YMR194C_B_mRNA_0:0-325^397-729(-)'],
    'YNL130C_mRNA_0': ['YNL130C_mRNA_0:0-1204^1296-1382(-)'],
    'YPL249C_A_mRNA_0': ['YPL249C_A_mRNA_0:0-415^653-697(-)'],
}
known_juncs = {
    K: [SegmentChain.from_str(X) for X in V]
    for K, V in known_juncs.items()
}
"""Annotated splice junctions"""

known_juncs_tuples = {
    "YNL130C": [
        ("YNL130C", 53, 145, "-"),
    ],
    "YPL249C-A": [
        ("YPL249C-A", 53, 291, "-"),
    ],
    'YBR215W_mRNA_0': [
        ('YBR215W_mRNA_0', 108, 192, '+'),
    ],
    'YHL001W_mRNA_0': [
Example #35
def _quantify_tfam(orf_set, gnds):
    """Performs non-negative least squares regression to quantify all of the ORFs in a transcript family, using a simplified profile consisting of
    the same three numbers tiled across each ORF. All readlengths are treated identically. Regions around start and stop codons are masked in
    accordance with startmask and stopmask"""
    strand = orf_set['strand'].iat[0]
    chrom = orf_set['chrom'].iat[0]
    tids = orf_set['tid'].drop_duplicates().tolist()
    all_tfam_genpos = set()
    tid_genpos = {}
    tlens = {}
    for (i, tid) in enumerate(tids):
        currtrans = SegmentChain.from_bed(bedlinedict[tid])
        curr_pos_set = currtrans.get_position_set()
        tlens[tid] = len(curr_pos_set)
        tid_genpos[tid] = curr_pos_set
        all_tfam_genpos.update(curr_pos_set)
    all_tfam_genpos = np.array(sorted(all_tfam_genpos))
    if strand == '-':
        all_tfam_genpos = all_tfam_genpos[::-1]
    nnt = len(all_tfam_genpos)
    tid_indices = {
        tid: np.flatnonzero(
            np.in1d(all_tfam_genpos, list(curr_tid_genpos),
                    assume_unique=True))
        for (tid, curr_tid_genpos) in tid_genpos.iteritems()
    }
    orf_matrix = np.zeros((nnt, len(orf_set)))
    ignore_coords = []
    for (orf_num,
         (tid, tcoord, tstop,
          AAlen)) in enumerate(orf_set[['tid', 'tcoord', 'tstop',
                                        'AAlen']].itertuples(False)):
        orf_matrix[tid_indices[tid][tcoord:tstop],
                   orf_num] = np.tile(cdsprof, AAlen + 1)
        ignore_coords.append(tid_indices[tid][max(tcoord +
                                                  startmask[0], 0):tcoord +
                                              startmask[1]])
        ignore_coords.append(
            tid_indices[tid][max(tstop + stopmask[0], 0):tstop + stopmask[1]])
    ignore_coords = np.unique(np.concatenate(ignore_coords))
    orf_matrix[
        ignore_coords, :] = 0  # mask out all positions within the mask region around starts and stops
    valid_orfs = np.array([
        (orf_matrix[:, i] > 0).any()
        and (orf_matrix.T[i, :] != orf_matrix.T[:i, :]).any(1).all()
        for i in xrange(len(orf_set))
    ])
    # require at least one valid position, and if >1 ORFs are identical, only include one of them
    orf_matrix[:, ~valid_orfs] = 0  # zero the columns of invalid ORFs so they are excluded from the fit
    valid_nts = (orf_matrix > 0).any(
        1)  # only bother checking nucleotides where there is a valid ORF
    orf_res = orf_set.copy()
    if valid_nts.any():
        orf_matrix = orf_matrix[valid_nts, :]
        valid_nt_segs = SegmentChain(*positionlist_to_segments(
            chrom, strand, list(all_tfam_genpos[valid_nts])))
        orf_res['nts_quantified'] = (orf_matrix > 0).sum(
            0)  # the number of nucleotides included in the quantification
        for colname, gnd in zip(colnames, gnds):
            orf_res[colname] = nnls(orf_matrix,
                                    valid_nt_segs.get_counts(gnd))[0]
            # gnd is a HashedReadBAMGenomeArray, but it still works with get_counts(), which will collapse all read lengths to a single array
        return orf_res
    else:
        orf_res['nts_quantified'] = 0
        for colname in colnames:
            orf_res[colname] = 0.
        return orf_res
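At its core, the quantification above is a call to scipy's nnls, which solves
min ||Ax - b|| subject to x >= 0, with orf_matrix as A and the observed counts
as b. A self-contained toy example of that call:

import numpy as np
from scipy.optimize import nnls

A = np.array([[1., 0.],
              [1., 1.],
              [0., 1.]])      # two ORFs tiled over three positions
b = np.array([2., 5., 3.])    # observed counts at those positions
x, residual = nnls(A, b)
print(x)                      # non-negative per-ORF abundance estimates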
Example #36
    crossmap = GenomeHash(_MASKS)
    for flank_up, flank_down in _FLANKS:
        for test_name, test_group in _DO_GENERATE_MAX_WINDOW.items():
            result_group = _DO_GENERATE_MAX_WINDOW_RESULTS_MASKED[
                "%s_%s_%s" % (test_name, flank_up, flank_down)]
            yield check_maximal_window, test_name, crossmap, test_group, [
                result_group
            ], flank_up, flank_down


#===============================================================================
# INDEX: test data
#===============================================================================

_MASKS = [
    SegmentChain.from_str("2L:7985694-7985744(+)"),
    SegmentChain.from_str("3R:4519879-4519891(-)"),
    SegmentChain.from_str("4:50-50000(+)"),
]
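These masks are indexed by the GenomeHash built at the top of this example,
which supports overlap queries against SegmentChain regions. A hedged sketch
of such a query, with a made-up region:

mask_hash = GenomeHash(_MASKS)
query = SegmentChain.from_str("4:100-200(+)")
overlapping = mask_hash[query]  # masks overlapping the query region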

_TRANSCRIPTS_GFF = """##gff-version 3
3R    FlyBase    mRNA    4517211    4523544    .    -    .    ID=FBtr0081950;Name=hb-RB;Parent=FBgn0001180;Alias=FBtr0002097,FBtr0002098,CG9786-RB,hb[+]R2.8;Dbxref=FlyBase_Annotation_IDs:CG9786-RB,REFSEQ:NM_169234;score_text=Strongly Supported;score=11
3R    FlyBase    exon    4517211    4519894    .    -    .    Name=hb:2;Parent=FBtr0081950;parent_type=mRNA
3R    FlyBase    CDS    4517600    4519876    .    -    0    Name=hb-cds;Parent=FBtr0081950;parent_type=mRNA
3R    FlyBase    exon    4523048    4523544    .    -    .    Name=hb:4;Parent=FBtr0081950;parent_type=mRNA

3R    FlyBase    mRNA    4516702    4520322    .    -    .    ID=FBtr0081951;Name=hb-RA;Parent=FBgn0001180;Alias=FBtr0002096,FBtr0002097,CG9786-RA,hb[+]R3.2;Dbxref=FlyBase_Annotation_IDs:CG9786-RA,REFSEQ:NM_169233;score_text=Strongly Supported;score=11
3R    FlyBase    exon    4516702    4519894    .    -    .    Name=hb:1;Parent=FBtr0081951;parent_type=mRNA
3R    FlyBase    CDS    4517600    4519876    .    -    0    Name=hb-cds;Parent=FBtr0081951;parent_type=mRNA
3R    FlyBase    exon    4520178    4520322    .    -    .    Name=hb:3;Parent=FBtr0081951;parent_type=mRNA
Example #37
    ],
    "YPL249C-A": [
        "YPL249C-A:0-53^291-334(-)",
    ],
    'YBR215W_mRNA_0': ['YBR215W_mRNA_0:0-108^192-2175(+)'],
    'YHL001W_mRNA_0': ['YHL001W_mRNA_0:0-146^544-961(+)'],
    'YIL018W_mRNA_0': ['YIL018W_mRNA_0:0-30^430-1280(+)'],
    'YIL133C_mRNA_0': ['YIL133C_mRNA_0:0-648^938-1007(-)'],
    'YIL156W_B_mRNA_0': ['YIL156W_B_mRNA_0:0-41^103-408(+)'],
    'YKL006W_mRNA_0': ['YKL006W_mRNA_0:0-157^555-954(+)'],
    'YMR194C_B_mRNA_0': ['YMR194C_B_mRNA_0:0-325^397-729(-)'],
    'YNL130C_mRNA_0': ['YNL130C_mRNA_0:0-1204^1296-1382(-)'],
    'YPL249C_A_mRNA_0': ['YPL249C_A_mRNA_0:0-415^653-697(-)'],
}
known_juncs = {
    K: [SegmentChain.from_str(X) for X in V]
    for K, V in known_juncs.items()
}
"""Annotated splice junctions"""

all_known_juncs = []
for v in known_juncs.values():
    all_known_juncs.extend(v)

known_juncs_as_tuples = {
    "YNL130C": [
        ("YNL130C", 53, 145, "-"),
    ],
    "YPL249C-A": [
        ("YPL249C-A", 53, 291, "-"),
    ],
Example #38
def check_maximal_window(test_name, genome_hash, test_group, result_groups,
                         flank_up, flank_down):
    """
    test_name : str
        Descriptive name of test

    genome_hash : GenomeHash
        Mask hash

    test_group : list
        List of transcript IDs, referring to transcripts in the GFF text above

    result_groups : list
        list of tuples of (region_str, alignment_offset, window_length) expected
        from maximal spanning window output

    flank_up : int
        Bases to include upstream of landmark in window

    flank_down : int
        Bases to include downstream of landmark in window
    """
    # table keys:
    #    gene_id
    #    window_size
    #    roi
    #    masked
    #    alignment_offset
    #    zero_point
    err_str = ("Failed %s (up: %s, down: %s). " %
               (test_name, flank_up, flank_down)) + "%s unequal (%s vs %s)"
    tx_ivcs = (_TRANSCRIPTS[X] for X in test_group)
    roi_table = group_regions_make_windows(tx_ivcs, genome_hash, flank_up,
                                           flank_down, window_cds_start)
    roi_table.sort(columns=["region"], inplace=True)
    trows = [X[1] for X in roi_table.iterrows()]
    result_groups = sorted(result_groups, key=lambda x: x[0])
    REGION = 0

    c = 0

    for n, result_group in enumerate(result_groups):
        # if no landmark
        if numpy.isnan(result_group[1]) or numpy.isnan(result_group[2]):
            c += 1  # increment counter for input that will have no output

        # if landmark
        else:
            check_equality(SegmentChain.from_str(result_group[0]),
                           SegmentChain.from_str(trows[n - c]["region"]),
                           test_name)
            assert_equal(
                result_group[1],
                trows[n - c]["alignment_offset"],
                msg=err_str %
                ("offset", result_group[1], trows[n - c]["alignment_offset"]))
            assert_equal(
                result_group[2],
                trows[n - c]["zero_point"],
                msg=err_str %
                ("ref_point", result_group[1], trows[n - c]["zero_point"]))
            if len(result_group) == 4:
                assert_equal(result_group[3],
                             trows[n - c]["masked"],
                             msg=err_str %
                             ("mask", result_group[3], trows[n - c]["masked"]))

    assert_equal(n + 1 - c, len(roi_table))
Example #39
@classmethod
def setUpClass(cls):
    cls.ivcs = {
        "plus": [
            SegmentChain(GenomicSegment("chrA", 0, 100, "+")),
            SegmentChain(GenomicSegment("chrA", 50, 100, "+")),
            SegmentChain(GenomicSegment("chrA", 50, 51, "+"))
        ],
        "minus_k25_off0": [
            SegmentChain(GenomicSegment("chrA", 0 + 25 - 1, 100 + 25 - 1, "-")),
            SegmentChain(GenomicSegment("chrA", 50 + 25 - 1, 100 + 25 - 1, "-")),
            SegmentChain(GenomicSegment("chrA", 50 + 25 - 1, 51 + 25 - 1, "-"))
        ],
        "minus_k50_off0": [
            SegmentChain(GenomicSegment("chrA", 0 + 50 - 1, 100 + 50 - 1, "-")),
            SegmentChain(GenomicSegment("chrA", 50 + 50 - 1, 100 + 50 - 1, "-")),
            SegmentChain(GenomicSegment("chrA", 50 + 50 - 1, 51 + 50 - 1, "-"))
        ],
        "minus_k25_off10": [
            SegmentChain(
                GenomicSegment("chrA", 0 + 25 - 1 - 2 * 10,
                               100 + 25 - 1 - 2 * 10, "-")),
            SegmentChain(
                GenomicSegment("chrA", 50 + 25 - 1 - 2 * 10,
                               100 + 25 - 1 - 2 * 10, "-")),
            SegmentChain(
                GenomicSegment("chrA", 50 + 25 - 1 - 2 * 10,
                               51 + 25 - 1 - 2 * 10, "-"))
        ],
        "minus_k50_off10": [
            SegmentChain(
                GenomicSegment("chrA", 0 + 50 - 1 - 2 * 10,
                               100 + 50 - 1 - 2 * 10, "-")),
            SegmentChain(
                GenomicSegment("chrA", 50 + 50 - 1 - 2 * 10,
                               100 + 50 - 1 - 2 * 10, "-")),
            SegmentChain(
                GenomicSegment("chrA", 50 + 50 - 1 - 2 * 10,
                               51 + 50 - 1 - 2 * 10, "-"))
        ],
    }
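The dictionary keys above encode the general coordinate pattern: for read
length k and 5' offset off, a plus-strand position x is expected to mirror to
x + k - 1 - 2*off on the minus strand. The formula is an inference from the
test data, not a plastid API:

def mirror(x, k, off):
    # inferred from the "x + k - 1 - 2 * off" pattern in the fixtures above
    return x + k - 1 - 2 * off

assert mirror(50, 25, 0) == 50 + 25 - 1            # "minus_k25_off0"
assert mirror(50, 25, 10) == 50 + 25 - 1 - 2 * 10  # "minus_k25_off10"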
Example #40
known_juncs = {
                 "YNL130C"   : ["YNL130C:0-53^145-180(-)",],
                 "YPL249C-A" : ["YPL249C-A:0-53^291-334(-)",],
                 
                'YBR215W_mRNA_0'  : ['YBR215W_mRNA_0:0-108^192-2175(+)'],
                'YHL001W_mRNA_0'  : ['YHL001W_mRNA_0:0-146^544-961(+)'],
                'YIL018W_mRNA_0'  : ['YIL018W_mRNA_0:0-30^430-1280(+)'],
                'YIL133C_mRNA_0'  : ['YIL133C_mRNA_0:0-648^938-1007(-)'],
                'YIL156W_B_mRNA_0': ['YIL156W_B_mRNA_0:0-41^103-408(+)'],
                'YKL006W_mRNA_0'  : ['YKL006W_mRNA_0:0-157^555-954(+)'],
                'YMR194C_B_mRNA_0': ['YMR194C_B_mRNA_0:0-325^397-729(-)'],
                'YNL130C_mRNA_0'  : ['YNL130C_mRNA_0:0-1204^1296-1382(-)'],
                'YPL249C_A_mRNA_0': ['YPL249C_A_mRNA_0:0-415^653-697(-)'],
               }
known_juncs = { K : [SegmentChain.from_str(X) for X in V] for K,V in known_juncs.items() }
"""Annotated splice junctions"""

known_juncs_tuples = {
                 "YNL130C"   : [("YNL130C",53,145,"-"),],
                 "YPL249C-A" : [("YPL249C-A",53,291,"-"),],
                 
                'YBR215W_mRNA_0'  : [('YBR215W_mRNA_0',108,192,'+'),],
                'YHL001W_mRNA_0'  : [('YHL001W_mRNA_0',146,544,'+'),],
                'YIL018W_mRNA_0'  : [('YIL018W_mRNA_0',30,430,'+'),],
                'YIL133C_mRNA_0'  : [('YIL133C_mRNA_0',648,938,'-'),],
                'YIL156W_B_mRNA_0': [('YIL156W_B_mRNA_0',41,103,'+'),],
                'YKL006W_mRNA_0'  : [('YKL006W_mRNA_0',157,555,'+'),],
                'YMR194C_B_mRNA_0': [('YMR194C_B_mRNA_0',325,397,'-'),],
                'YNL130C_mRNA_0'  : [('YNL130C_mRNA_0',1204,1296,'-'),],
                'YPL249C_A_mRNA_0': [('YPL249C_A_mRNA_0',415,653,'-'),],