def filter(self, line):
    """Bundle consecutive `PSL`_ alignments that share a query sequence.

    Starting from `line`, alignments are pulled from ``self.stream`` until
    one with a different query name is read. That line is pushed back onto
    the front of ``self.stream`` so it can open the next bundle.

    Parameters
    ----------
    line : str
        line of `PSL`_ input

    Returns
    -------
    list
        list of |SegmentChain| objects sharing a query sequence
    """
    bundle = []
    current = SegmentChain.from_psl(line)
    query_name = current.attr["query_name"]
    try:
        while current.attr["query_name"] == query_name:
            bundle.append(current)
            line = next(self.stream)
            current = SegmentChain.from_psl(line)
    except StopIteration:
        # stream is exhausted: whatever was collected is the final bundle
        pass
    else:
        # `line` belongs to the next query; put it back for the next call
        self.stream = itertools.chain([line], self.stream)
    return bundle
def filter(self, line):
    """Group `PSL`_ alignments by query sequence.

    Reads ahead in ``self.stream`` while the query name stays the same as
    that of `line`. The first line with a different query name is returned
    to the head of ``self.stream``.

    Parameters
    ----------
    line : str
        line of `PSL`_ input

    Returns
    -------
    list
        list of |SegmentChain| objects sharing a query sequence
    """
    grouped = []
    alignment = SegmentChain.from_psl(line)
    group_key = alignment.attr["query_name"]
    try:
        while True:
            if alignment.attr["query_name"] != group_key:
                break
            grouped.append(alignment)
            line = next(self.stream)
            alignment = SegmentChain.from_psl(line)
        # only reached when a line with a new query name was read
        self.stream = itertools.chain([line], self.stream)
    except StopIteration:
        # no more input: send final bundle
        pass
    return grouped
def test_window_landmark():
    """Yield one `check_window_landmark` case per chain and landmark.

    Cases cover a spliced two-segment chain on the plus strand and the same
    chain on the minus strand, with landmarks every 50 positions in
    ``[0, 700)``.
    """
    upstream = 50
    downstream = 100
    spliced_chains = [
        SegmentChain(GenomicSegment("chrA", 50, 350, strand),
                     GenomicSegment("chrA", 500, 900, strand))
        for strand in ("+", "-")
    ]
    for chain in spliced_chains:
        for landmark in range(0, 700, 50):
            yield check_window_landmark, chain, landmark, upstream, downstream
def __getitem__(self, roi, stranded=True):
    """Return list of features that overlap the region of interest (roi).

    Every segment of the query is fetched from every reader in
    ``self.tabix_readers``. The fetched text is parsed with
    ``self._reader_class`` and the parsed features are then filtered by
    overlap with the query.

    Parameters
    ----------
    roi : |GenomicSegment| or |SegmentChain|
        Query feature indicating region of interest

    stranded : bool
        If `True`, retrieve only features on same strand as query feature.
        Otherwise, retrieve features on both strands

    Returns
    -------
    list
        Features that overlap `roi`

    Raises
    ------
    TypeError
        if `roi` is not a |GenomicSegment| or |SegmentChain|
    """
    if isinstance(roi, GenomicSegment):
        query_chain = SegmentChain(roi)
        query_span = roi
    elif isinstance(roi, SegmentChain):
        query_chain = roi
        query_span = roi.spanning_segment
    else:
        raise TypeError("Query feature must be a GenomicSegment or SegmentChain")

    chrom = query_span.chrom

    # one text block per (reader, segment) pair, readers in the outer loop
    fetched = []
    for reader in self.tabix_readers:
        for segment in query_chain:
            fetched.append("\n".join(list(reader.fetch(chrom, segment.start, segment.end))))

    parsed = self._reader_class(cStringIO.StringIO("\n".join(fetched)))

    if stranded == True:
        overlap_test = query_chain.overlaps
    else:
        overlap_test = query_chain.unstranded_overlaps
    return [feature for feature in parsed if overlap_test(feature)]
def _quantify_tfam(orf_set, gnds):
    """Performs non-negative least squares regression to quantify all of the ORFs in a transcript family, using a simplified profile consisting of
    the same three numbers tiled across each ORF. All readlengths are treated identically. Regions around start and stop codons are masked in
    accordance with startmask and stopmask.

    Parameters
    ----------
    orf_set : pandas.DataFrame
        ORFs of one transcript family; the columns 'strand', 'chrom', 'tid', 'tcoord', 'tstop' and 'AAlen' are read
    gnds : iterable
        Count sources, paired positionally with the module-level `colnames`; each is passed to ``SegmentChain.get_counts()``

    Returns
    -------
    pandas.DataFrame
        Copy of `orf_set` with an added 'nts_quantified' column and one column per entry of `colnames`.
        All added columns are zero when no nucleotide is covered by a valid ORF.

    Notes
    -----
    Relies on the module-level names `bedlinedict`, `cdsprof`, `startmask`, `stopmask` and `colnames`.
    """
    strand = orf_set['strand'].iat[0]
    chrom = orf_set['chrom'].iat[0]
    tids = orf_set['tid'].drop_duplicates().tolist()

    # Genomic positions of each transcript, and their union across the family
    all_tfam_genpos = set()
    tid_genpos = {}
    for tid in tids:
        curr_pos_set = SegmentChain.from_bed(bedlinedict[tid]).get_position_set()
        tid_genpos[tid] = curr_pos_set
        all_tfam_genpos.update(curr_pos_set)

    all_tfam_genpos = np.array(sorted(all_tfam_genpos))
    if strand == '-':
        all_tfam_genpos = all_tfam_genpos[::-1]  # put positions in stranded (5'-to-3') order
    nnt = len(all_tfam_genpos)

    # For each transcript, the row indices of its positions within all_tfam_genpos
    tid_indices = {tid: np.flatnonzero(np.in1d(all_tfam_genpos, list(curr_tid_genpos), assume_unique=True))
                   for (tid, curr_tid_genpos) in tid_genpos.iteritems()}

    # One row per family position, one column per ORF
    orf_matrix = np.zeros((nnt, len(orf_set)))
    ignore_coords = []
    for (orf_num, (tid, tcoord, tstop, AAlen)) in enumerate(orf_set[['tid', 'tcoord', 'tstop', 'AAlen']].itertuples(False)):
        orf_matrix[tid_indices[tid][tcoord:tstop], orf_num] = np.tile(cdsprof, AAlen + 1)
        ignore_coords.append(tid_indices[tid][max(tcoord+startmask[0], 0):tcoord+startmask[1]])
        ignore_coords.append(tid_indices[tid][max(tstop+stopmask[0], 0):tstop+stopmask[1]])

    ignore_coords = np.unique(np.concatenate(ignore_coords))
    orf_matrix[ignore_coords, :] = 0  # mask out all positions within the mask region around starts and stops

    # require at least one valid position, and if >1 ORFs are identical, only include one of them
    valid_orfs = np.array([(orf_matrix[:, i] > 0).any() and (orf_matrix.T[i, :] != orf_matrix.T[:i, :]).any(1).all()
                           for i in xrange(len(orf_set))])
    orf_matrix[:, ~valid_orfs] = 0  # completely ignore these positions
    valid_nts = (orf_matrix > 0).any(1)  # only bother checking nucleotides where there is a valid ORF

    orf_res = orf_set.copy()
    if valid_nts.any():
        orf_matrix = orf_matrix[valid_nts, :]
        valid_nt_segs = SegmentChain(*positionlist_to_segments(chrom, strand, list(all_tfam_genpos[valid_nts])))
        orf_res['nts_quantified'] = (orf_matrix > 0).sum(0)  # the number of nucleotides included in the quantification
        for colname, gnd in zip(colnames, gnds):
            # gnd is a HashedReadBAMGenomeArray, but it still works with get_counts(), which will collapse all read lengths to a single array
            orf_res[colname] = nnls(orf_matrix, valid_nt_segs.get_counts(gnd))[0]
    else:
        # nothing quantifiable in this family
        orf_res['nts_quantified'] = 0
        for colname in colnames:
            orf_res[colname] = 0.
    return orf_res
def get_overlapping_features(self, roi, stranded=True):
    """Return list of features overlapping `roi`.

    Candidates come from ``self.get_nearby_features`` and are kept only if
    they pass the overlap test of the query chain.

    Parameters
    ----------
    roi : |GenomicSegment| or |SegmentChain|
        Query feature indicating region of interest

    stranded : bool
        if `True`, retrieve only features on same strand as query feature.
        Otherwise, retrieve features on both strands

    Returns
    -------
    list
        Features overlapping `roi`

    Raises
    ------
    TypeError
        if `roi` is not a |GenomicSegment| or |SegmentChain|
    """
    candidates = self.get_nearby_features(roi, stranded=stranded)

    # overlap tests are defined on chains, so promote a bare segment
    query = SegmentChain(roi) if isinstance(roi, GenomicSegment) else roi
    overlap_test = query.unstranded_overlaps if stranded == False else query.overlaps

    hits = []
    for feature in candidates:
        if overlap_test(feature) == True:
            hits.append(feature)
    return hits
def do_count(args, alignment_parser):
    """Count the number and density of reads covering each merged gene in an
    annotation made using the `generate` subcommand.

    For every region listed in ``args.position_file``, the read count, length
    and RPKM of its exon, 5' UTR, CDS and 3' UTR chains are tabulated and
    written as a tab-delimited table to ``<args.outbase>.txt``.

    Parameters
    ----------
    args : :py:class:`argparse.Namespace`
        command-line arguments for ``count`` subprogram

    alignment_parser : object
        Parser whose ``get_genome_array_from_args(args, printer=...)`` method
        supplies the read counts
    """
    # we expect many zero-length segmentchains, so turn these warnings off for now
    warnings.filterwarnings("ignore", ".*zero-length SegmentChain.*")

    keys = ("exon", "utr5", "cds", "utr3")
    column_order = ["region"]
    gene_positions = read_pl_table(args.position_file)

    # read count files
    ga = alignment_parser.get_genome_array_from_args(args, printer=printer)
    total_counts = ga.sum()

    # scales (counts / length) to reads per kilobase per million counts
    normconst = 1000.0 * 1e6 / total_counts

    printer.write("Dataset has %s counts in it." % total_counts)
    printer.write("Tallying genes ...")

    dtmp = {"region": []}
    for x in keys:
        for y in ("reads", "length", "rpkm"):
            label = "%s_%s" % (x, y)
            dtmp[label] = []
            column_order.append(label)

    for i, name in enumerate(gene_positions["region"]):
        dtmp["region"].append(name)
        if i % 500 == 0:
            printer.write("Processed %s genes ..." % i)

        for k in keys:
            ivc = SegmentChain.from_str(gene_positions[k][i])
            total = sum(ivc.get_counts(ga))
            length = ivc.length
            # zero-length chains have no defined density
            rpkm = (normconst * total / length) if length > 0 else numpy.nan
            dtmp["%s_reads" % k].append(total)
            dtmp["%s_length" % k].append(length)
            dtmp["%s_rpkm" % k].append(rpkm)

    fout = argsopener("%s.txt" % args.outbase, args, "w")
    try:
        dtmp = pd.DataFrame(dtmp)
        dtmp.to_csv(fout,
                    sep="\t",
                    header=True,
                    index=False,
                    columns=column_order,
                    na_rep="nan",
                    float_format="%.8f")
    finally:
        # release the handle even if building or writing the table fails
        fout.close()

    printer.write("Done.")
def test_on_SegmentChain_exclude(self):
    """With "type" excluded, only the shared attributes are reported."""
    chains = []
    for n, segment in enumerate(self.ivs):
        chains.append(SegmentChain(segment, **self.attrs[n]))
    shared = get_identical_attributes(chains, exclude=["type"])
    self.assertEqual(shared, self.common_attr)
def test_on_SegmentChain_no_exclude(self):
    """Without exclusions, "type" is reported alongside the shared attributes."""
    chains = []
    for n, segment in enumerate(self.ivs):
        chains.append(SegmentChain(segment, **self.attrs[n]))

    # expected result: the common attributes plus a "type" of "exon"
    wanted = copy.deepcopy(self.common_attr)
    wanted["type"] = "exon"
    self.assertEqual(get_identical_attributes(chains), wanted)
def __getitem__(self, roi, stranded=True):
    """Return list of features that overlap the region of interest (roi).

    Text for each segment of the query is fetched from each reader in
    ``self.tabix_readers``, parsed by ``self._reader_class``, and filtered
    by overlap with the query.

    Parameters
    ----------
    roi : |GenomicSegment| or |SegmentChain|
        Query feature indicating region of interest

    stranded : bool
        If `True`, retrieve only features on same strand as query feature.
        Otherwise, retrieve features on both strands

    Returns
    -------
    list
        Features that overlap `roi`

    Raises
    ------
    TypeError
        if `roi` is not a |GenomicSegment| or |SegmentChain|
    """
    if not isinstance(roi, (GenomicSegment, SegmentChain)):
        raise TypeError("Query feature must be a GenomicSegment or SegmentChain")

    if isinstance(roi, GenomicSegment):
        chain, span = SegmentChain(roi), roi
    else:
        chain, span = roi, roi.spanning_segment

    chrom = span.chrom
    text_blocks = [
        "\n".join(list(reader.fetch(chrom, seg.start, seg.end)))
        for reader in self.tabix_readers
        for seg in chain
    ]
    handle = cStringIO.StringIO("\n".join(text_blocks))
    candidates = self._reader_class(handle)

    if stranded == True:
        return [found for found in candidates if chain.overlaps(found)]
    return [found for found in candidates if chain.unstranded_overlaps(found)]
def _get_tid_info(tup):
    """For each transcript on this chromosome/strand, identifies every sub-sequence of the appropriate length (fpsize), converts it to an integer,
    identifies the number of reads mapping to that position, and outputs all of that information to a pandas HDF store.

    Parameters
    ----------
    tup : tuple
        ``(chrom, strand)`` pair selecting the transcripts in ``bedlinedict[(chrom, strand)]``

    Returns
    -------
    pandas.DataFrame
        One row per transcript (indexed by 'tid') with columns 'chrom', 'strand', 'n_psite', 'n_reads', 'peak_reads' and 'dropped'.
        'n_reads' and 'peak_reads' stay at -1 for transcripts shorter than `fpsize`; 'dropped' is 'lowreads' or 'peakfrac' for
        transcripts that were excluded from the HDF output, and '' otherwise.

    Notes
    -----
    Relies on the module-level names `opts`, `psite`, `fpsize`, `bedlinedict`, `genome`, `str_dict`, `seq_info_hdf`, `log_lock` and
    `logprint`. The BAM files are closed before returning, including when an error is raised part-way through.
    """
    (chrom, strand) = tup
    inbams = [pysam.Samfile(infile, 'rb') for infile in opts.bamfiles]
    try:
        gnd = BAMGenomeArray(inbams, mapping=FivePrimeMapFactory(psite))
        # map to roughly the center of each read so that identical sequences that cross different splice sites
        # (on different transcripts) still end up mapping to the same place
        gnd.add_filter('size', SizeFilterFactory(opts.minlen, opts.maxlen))

        tid_seq_info = []
        tid_summary = pd.DataFrame(
            {'chrom': chrom, 'strand': strand, 'n_psite': -1, 'n_reads': -1, 'peak_reads': -1, 'dropped': ''},
            index=pd.Index(bedlinedict[(chrom, strand)].keys(), name='tid'))

        for (tid, line) in bedlinedict[(chrom, strand)].iteritems():
            currtrans = SegmentChain.from_bed(line)
            curr_pos_list = currtrans.get_position_list()  # not in stranded order!
            if strand == '-':
                curr_pos_list = curr_pos_list[::-1]
            n_psite = len(curr_pos_list) + 1 - fpsize  # number of fpsize-long windows on this transcript
            tid_summary.at[tid, 'n_psite'] = n_psite
            if n_psite > 0:
                curr_counts = np.array(currtrans.get_counts(gnd))[psite:n_psite + psite]
                sumcounts = curr_counts.sum()
                maxcounts = curr_counts.max()
                tid_summary.at[tid, 'n_reads'] = sumcounts
                tid_summary.at[tid, 'peak_reads'] = maxcounts
                if sumcounts >= opts.minreads:
                    if maxcounts < sumcounts * opts.peakfrac:
                        numseq = np.array(list(currtrans.get_sequence(genome).upper().translate(str_dict)))
                        curr_seq = ''.join(numseq)
                        # each window is read as a base-4 integer; windows containing 'N' are encoded as -1
                        seq_codes = np.array([(int(curr_seq[i:i + fpsize], 4) if 'N' not in curr_seq[i:i + fpsize] else -1)
                                              for i in xrange(n_psite)], dtype=np.int64)
                        tid_seq_info.append(pd.DataFrame({'tid': tid,
                                                          'genpos': curr_pos_list[psite:n_psite + psite],
                                                          'seq': seq_codes,
                                                          'reads': curr_counts}))
                    else:
                        tid_summary.at[tid, 'dropped'] = 'peakfrac'
                else:
                    tid_summary.at[tid, 'dropped'] = 'lowreads'

        if tid_seq_info:  # don't bother saving anything if there's nothing to save
            pd.concat(tid_seq_info, ignore_index=True).to_hdf(seq_info_hdf % (chrom, strand), 'tid_seq_info', format='t',
                                                              data_columns=True, complevel=1, complib='blosc')

        if opts.verbose > 1:
            with log_lock:
                logprint('%s (%s strand) complete' % (chrom, strand))

        return tid_summary
    finally:
        # always release the BAM handles, even if processing failed
        for inbam in inbams:
            inbam.close()
def test_search_fields_multivalue(self):
    """Searching one field for several values returns the matches for all of them.

    Values that match nothing give an empty result; searching "Name" for
    two transcript names gives both transcripts.
    """
    reader = BigBedReader(self.bb_indexed)

    unmatched = list(reader.search("name", "should_have_no_match", "should_also_have_no_match"))
    self.assertEqual([], unmatched)

    found = list(reader.search("Name", "Sam-S-RE", "Sam-S-RK"))

    def build_chain(exon_bounds, **attributes):
        # every expected exon lies on the plus strand of 2L
        exons = [GenomicSegment('2L', start, end, '+') for (start, end) in exon_bounds]
        return SegmentChain(*exons, **attributes)

    sam_s_re = build_chain(
        [(106902, 107000),
         (107764, 107838),
         (108587, 108809),
         (110405, 110483),
         (110754, 110877),
         (111906, 112019),
         (112689, 113369),
         (113433, 114432)],
        Alias="'['M(2)21AB-RE', 'CG2674-RE']'",
        ID='FBtr0089437',
        Name='Sam-S-RE',
        color='#000000',
        gene_id='FBgn0005278',
        score='0.0',
        thickend='113542',
        thickstart='108685',
        type='exon'
    )
    sam_s_rk = build_chain(
        [(107760, 107838),
         (108587, 108809),
         (110405, 110483),
         (110754, 111337)],
        Alias='na',
        ID='FBtr0308091',
        Name='Sam-S-RK',
        color='#000000',
        gene_id='FBgn0005278',
        score='0.0',
        thickend='110900',
        thickstart='108685',
        type='exon'
    )

    expected = [sam_s_re, sam_s_rk]
    self.assertEqual(expected, found)
def test_variable_stratified_mapping_plus(self):
    """Set up fixtures for stratified variable 5' mapping.

    Builds per-read-length offsets (26-30 nt), one plus-strand and one
    minus-strand chain, the expected vectors loaded from disk, and a
    |BAMGenomeArray| configured with the stratified mapping.

    NOTE(review): no assertion is made in the visible body; `chains` and
    `expected` are prepared but not compared here.
    """
    offsets = {26: 6, 27: 22, 28: 13, 29: 4, 30: 5}

    fw_chain = SegmentChain(GenomicSegment('chrII', 392959, 393180, '+'),
                            GenomicSegment('chrII', 393510, 394742, '+'),
                            GenomicSegment('chrII', 394860, 394901, '+'),
                            ID='YBR078W_mRNA')
    rc_chain = SegmentChain(GenomicSegment('chrVIII', 189061, 189749, '-'),
                            GenomicSegment('chrVIII', 189850, 190017, '-'),
                            ID='YHR041C_mRNA')
    chains = {"fw": fw_chain, "rc": rc_chain}

    fw_vector = numpy.loadtxt(resource_filename("plastid", "test/data/stratmap/strat_fw_vec.txt"), delimiter="\t")
    rc_vector = numpy.loadtxt(resource_filename("plastid", "test/data/stratmap/strat_rc_vec.txt"), delimiter="\t")
    expected = {"fw": fw_vector, "rc": rc_vector}

    bam_path = resource_filename("plastid", "test/data/stratmap/strat.bam")
    ga = BAMGenomeArray([bam_path])
    mapping = StratifiedVariableFivePrimeMapFactory(offsets, 26, 30)
    ga.set_mapping(mapping)
def filter(self, line):
    """Parse a read alignment as |SegmentChain| from a line of `bowtie`_ output

    Tab-separated columns 0-5 and 7 are used: read name, strand, reference
    name, start coordinate, aligned sequence, quality string and mismatch
    string. The alignment spans the length of the aligned sequence.

    Parameters
    ----------
    line : str
        line of `bowtie`_ output

    Returns
    -------
    |SegmentChain|
        Single-segment chain for the alignment, with type "alignment"
    """
    fields = line.strip("\n").split("\t")
    name = fields[0]
    strand = fields[1]
    chrom = fields[2]
    start = int(fields[3])
    aligned_seq = fields[4]
    attributes = dict(
        seq_as_aligned=aligned_seq,
        qualstr=fields[5],
        mismatch_str=fields[7],
        type="alignment",
        ID=name
    )
    segment = GenomicSegment(chrom, start, start + len(aligned_seq), strand)
    return SegmentChain(segment, **attributes)
def covered_by_repetitive(query_junc, minus_range, plus_range, cross_hash):
    """Determine whether one or both ends of a splice site overlap with
    a repetitive area of the genome.

    Parameters
    ----------
    query_junc : |SegmentChain|
        A two-exon fragment representing a query splice junction

    minus_range : int <= 0
        Maximum number of nucleotides splice junction could be moved to the
        left without reducing sequence support for the junction
        see :py:func:`find_match_range`

    plus_range : int >= 0
        Maximum number of nucleotides splice junction could be moved to the
        right without reducing sequence support for the junction
        see :py:func:`find_match_range`

    cross_hash : |GenomeHash|
        |GenomeHash| of 1-length features denoting repetitive regions of the genome

    Returns
    -------
    bool
        `True` if any of the genomic positions within `minus_range...plus_range`
        of the 5' or 3' splice sites of `query_junc` overlap a repetitive
        region of the genome as annotated by ``cross_hash``.
        Otherwise, `False`
    """
    span = query_junc.spanning_segment
    chrom, strand = span.chrom, span.strand

    # one window around the end of the first segment, one around the start of the second
    windows = []
    for site in (query_junc[0].end, query_junc[1].start):
        windows.append(GenomicSegment(chrom, site + minus_range, site + plus_range + 1, strand))

    support_region = SegmentChain(*windows)
    repeats = cross_hash.get_overlapping_features(support_region)
    return len(repeats) > 0
def roi_row_to_cds(row):
    """Helper function to extract coding portions from maximal spanning windows
    flanking CDS starts that are created by |metagene| ``generate`` subprogram.

    Parameters
    ----------
    row : (int, Series)
        Row from a :class:`pandas.DataFrame` of an ROI file made by the |metagene|
        ``generate`` subprogram

    Returns
    -------
    |SegmentChain|
        Coding portion of maximal spanning window
    """
    record = row[1]
    selected = record[["region", "alignment_offset", "zero_point"]]
    region_str, offset, zero = selected

    window = SegmentChain.from_str(region_str)
    # position of the CDS start within the window
    start_in_window = zero - offset
    return window.get_subchain(start_in_window, window.length)
def roi_row_to_cds(row):
    """Return the coding part of a maximal spanning window.

    The window comes from a row of an ROI file written by the |metagene|
    ``generate`` subprogram. Everything from the CDS start
    (``zero_point - alignment_offset``) to the end of the window is kept.

    Parameters
    ----------
    row : (int, Series)
        Row from a :class:`pandas.DataFrame` of an ROI file made by the |metagene|
        ``generate`` subprogram

    Returns
    -------
    |SegmentChain|
        Coding portion of maximal spanning window
    """
    wanted = ["region", "alignment_offset", "zero_point"]
    region, alignment_offset, zero_point = row[1][wanted]
    window = SegmentChain.from_str(region)
    return window.get_subchain(zero_point - alignment_offset, window.length)
def _get_annotated_counts_by_chrom(chrom_to_do):
    """Accumulate counts from annotated CDSs into a metagene profile. Only the longest CDS in each transcript family will be included, and only if it
    meets the minimum number-of-reads requirement. Reads are normalized by gene, so every gene included contributes equally to the final metagene.

    Parameters
    ----------
    chrom_to_do : str
        Chromosome whose annotated CDSs are read from ``opts.orfstore``

    Returns
    -------
    startprof : numpy.ndarray
        Summed normalized counts around the start, shape ``(len(rdlens), startlen)``
    cdsprof : numpy.ndarray
        Summed per-codon-position means over the CDS interior, shape ``(len(rdlens), 3)``
    stopprof : numpy.ndarray
        Summed normalized counts around the stop, shape ``(len(rdlens), stoplen)``
    num_cds_incl : int
        Number of CDSs that contributed to the profiles

    Notes
    -----
    Relies on the module-level names `opts`, `rdlens`, `startnt`, `stopnt`, `startlen`, `stoplen`, `min_AAlen`, `Pdict`, `bedlinedict`,
    `read_length_nmis` and `get_hashed_counts`. The BAM files are closed before returning, including when an error is raised part-way
    through. A CDS with no reads at all is skipped even if ``opts.mincdsreads`` is zero or negative, because it cannot be normalized.
    """
    found_cds = pd.read_hdf(opts.orfstore, 'all_orfs', mode='r',
                            where="chrom == '%s' and orftype == 'annotated' and tstop > 0 and tcoord > %d and AAlen > %d"
                                  % (chrom_to_do, -startnt[0], min_AAlen),
                            columns=['orfname', 'tfam', 'tid', 'tcoord', 'tstop', 'AAlen']) \
        .sort_values('AAlen', ascending=False).drop_duplicates('tfam')  # use the longest annotated CDS in each transcript family

    num_cds_incl = 0  # number of CDSs included from this chromosome
    startprof = np.zeros((len(rdlens), startlen))
    cdsprof = np.zeros((len(rdlens), 3))
    stopprof = np.zeros((len(rdlens), stoplen))

    inbams = [pysam.Samfile(infile, 'rb') for infile in opts.bamfiles]
    try:
        gnd = HashedReadBAMGenomeArray(inbams, ReadKeyMapFactory(Pdict, read_length_nmis))

        for (tid, tcoord, tstop) in found_cds[['tid', 'tcoord', 'tstop']].itertuples(False):
            curr_trans = SegmentChain.from_bed(bedlinedict[tid])
            tlen = curr_trans.get_length()
            if tlen >= tstop + stopnt[1]:  # need to guarantee that the 3' UTR is sufficiently long
                curr_hashed_counts = get_hashed_counts(curr_trans, gnd)
                cdslen = tstop+stopnt[1]-tcoord-startnt[0]  # cds length, plus the extra bases...
                curr_counts = np.zeros((len(rdlens), cdslen))
                for (i, rdlen) in enumerate(rdlens):
                    for nmis in range(opts.max5mis+1):
                        # curr_counts is limited to the CDS plus any extra requested nucleotides on either side
                        curr_counts[i, :] += curr_hashed_counts[(rdlen, nmis)][tcoord+startnt[0]:tstop+stopnt[1]]

                total = curr_counts.sum()
                # total > 0 keeps a zero mean from turning the profiles into NaN when opts.mincdsreads <= 0
                if total >= opts.mincdsreads and total > 0:
                    curr_counts /= curr_counts.mean()  # normalize by mean of counts across all readlengths and positions within the CDS
                    startprof += curr_counts[:, :startlen]
                    cdsprof += curr_counts[:, startlen:cdslen-stoplen].reshape((len(rdlens), -1, 3)).mean(1)
                    stopprof += curr_counts[:, cdslen-stoplen:cdslen]
                    num_cds_incl += 1
    finally:
        # always release the BAM handles, even if processing failed
        for inbam in inbams:
            inbam.close()

    return startprof, cdsprof, stopprof, num_cds_incl
def revcomp_mask_chain(seg, k, offset=0):
    """Reverse-complement a single-interval mask, correcting for `offset`.

    Parameters
    ----------
    seg : |SegmentChain|
        Plus-strand mask, including `offset`

    k : int
        Length of k-mers

    offset : int, optional
        Offset from 5' end of read at which to map mask (Default: `0`)

    Returns
    -------
    |SegmentChain|
        Mask on minus strand corresponding to `seg`
    """
    # Derivation of the shift, with FW / RC the plus- / minus-strand coordinates:
    #
    #     RC = FW + k - 1 - offset
    #
    # The input already holds FW + offset, therefore
    #
    #     RC + offset = (FW + offset) + k - 1 - offset
    #     RC          = (FW + offset) + k - 1 - 2*offset
    plus_span = seg.spanning_segment
    shift = k - 1 - 2 * offset
    minus_span = GenomicSegment(plus_span.chrom,
                                plus_span.start + shift,
                                plus_span.end + shift,
                                "-")
    return SegmentChain(minus_span)
def main(argv=sys.argv[1:]):
    """Command-line program

    Reads transcripts from an annotation and writes each distinct junction
    between adjacent segments once, as a two-segment BED record, to
    ``<outbase>.bed``. With ``--export_tophat``, the same junctions are also
    written to ``<outbase>.juncs``.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: sys.argv[1:] (actually command-line arguments)
    """
    ap = AnnotationParser(input_choices=_ANNOTATION_INPUT_CHOICES)
    annotation_file_parser = ap.get_parser()

    bp = BaseParser()
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     parents=[base_parser, annotation_file_parser])
    parser.add_argument("--export_tophat", default=False, action="store_true",
                        help="Export tophat `.juncs` file in addition to BED output")
    parser.add_argument("outbase", type=str, help="Basename for output files")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    transcripts = ap.get_transcripts_from_args(args, printer=printer, return_type=SegmentChain)

    tophat_out = None
    with argsopener("%s.bed" % args.outbase, args, "w") as bed_out:
        try:
            if args.export_tophat:
                tophat_out = open("%s.juncs" % args.outbase, "w")

            printer.write("params: " + " ".join(argv))
            printer.write("Detecting & comparing junctions...")

            # (chrom, strand) -> set of (end of upstream segment, start of
            # downstream segment) pairs already written; a set keeps the
            # duplicate check constant-time however many junctions are seen
            seen_junctions = {}

            c = 0  # junctions examined
            u = 0  # unique junctions written
            for chain in transcripts:
                if len(chain) < 2:
                    continue  # single segment: no junctions

                chrom = chain.chrom
                strand = chain.strand
                seen = seen_junctions.setdefault((chrom, strand), set())

                for i in range(len(chain) - 1):
                    seg1 = chain[i]
                    seg2 = chain[i + 1]
                    if c % 1000 == 0 and c > 0:
                        printer.write("Processed %s junctions. Found %s unique..." % (c, u))
                    c += 1

                    key = (seg1.end, seg2.start)
                    if key in seen:
                        continue
                    seen.add(key)
                    u += 1

                    bed_out.write(SegmentChain(seg1, seg2).as_bed())
                    if tophat_out is not None:
                        my_junc = (chrom, seg1.end - 1, seg2.start, strand)
                        tophat_out.write("%s\t%s\t%s\t%s\n" % my_junc)

            printer.write("Processed %s total junctions. Found %s unique." % (c, u))
        finally:
            # the BED file is closed by the `with` block; the optional
            # tophat file must be closed here, including on errors
            if tophat_out is not None:
                tophat_out.close()

    printer.write("Done.")
def _identify_tfam_orfs((tfam, tids)): """Identify all of the possible ORFs within a family of transcripts. Relevant information such as genomic start and stop positions, amino acid length, and initiation codon will be collected for each ORF. Additionally, each ORF will be assigned a unique 'orfname', such that if it occurs on multiple transcripts, it can be recognized as the same ORF.""" currtfam = SegmentChain.from_bed(tfambedlines[tfam]) chrom = currtfam.chrom strand = currtfam.strand tfam_genpos = np.array(currtfam.get_position_list()) if strand == '-': tfam_genpos = tfam_genpos[::-1] tmask = np.empty((len(tids), len(tfam_genpos)), dtype=np.bool) # True if transcript covers that position, False if not tfam_orfs = [] tidx_lookup = {} for tidx, tid in enumerate(tids): tidx_lookup[tid] = tidx curr_trans = Transcript.from_bed(bedlinedict[tid]) tmask[tidx, :] = np.in1d(tfam_genpos, curr_trans.get_position_list(), assume_unique=True) trans_orfs = _find_all_orfs(curr_trans.get_sequence(genome).upper()) if trans_orfs: (startpos, stoppos, codons) = zip(*trans_orfs) startpos = np.array(startpos, dtype='i4') stoppos = np.array(stoppos, dtype='i4') gcoords = np.array([curr_trans.get_genomic_coordinate(x)[1] for x in startpos], dtype='i4') stop_present = (stoppos > 0) gstops = np.zeros(len(trans_orfs), dtype='i4') gstops[stop_present] = \ np.array([curr_trans.get_genomic_coordinate(x - 1)[1] for x in stoppos[stop_present]]) + (1 if strand == '+' else -1) # the decrementing/incrementing stuff preserves half-openness regardless of strand AAlens = np.zeros(len(trans_orfs), dtype='i4') AAlens[stop_present] = (stoppos[stop_present] - startpos[stop_present])/3 - 1 tfam_orfs.append(pd.DataFrame.from_items([('tfam', tfam), ('tid', tid), ('tcoord', startpos), ('tstop', stoppos), ('chrom', chrom), ('gcoord', gcoords), ('gstop', gstops), ('strand', strand), ('codon', codons), ('AAlen', AAlens), ('orfname', '')])) if any(x is not None for x in tfam_orfs): orf_pos_dict = {} tfam_orfs = 
pd.concat(tfam_orfs, ignore_index=True) for ((gcoord, AAlen), gcoord_grp) in tfam_orfs.groupby(['gcoord', 'AAlen']): # group by genomic start position and length if len(gcoord_grp) == 1: tfam_orfs.loc[gcoord_grp.index, 'orfname'] = _name_orf(tfam, gcoord, AAlen) else: orf_gcoords = np.vstack(np.flatnonzero(tmask[tidx_lookup[tid], :])[tcoord:tstop] for (tid, tcoord, tstop) in gcoord_grp[['tid', 'tcoord', 'tstop']].itertuples(False)) if (orf_gcoords == orf_gcoords[0, :]).all(): # all of the grouped ORFs are identical, so should receive the same name orfname = _name_orf(tfam, gcoord, AAlen) tfam_orfs.loc[gcoord_grp.index, 'orfname'] = orfname orf_pos_dict[orfname] = tfam_genpos[orf_gcoords[0, :]] else: named_so_far = 0 unnamed = np.ones(len(gcoord_grp), dtype=np.bool) basename = _name_orf(tfam, gcoord, AAlen) while unnamed.any(): next_gcoords = orf_gcoords[unnamed, :][0, :] identicals = (orf_gcoords == next_gcoords).all(1) orfname = '%s_%d' % (basename, named_so_far) tfam_orfs.loc[gcoord_grp.index[identicals], 'orfname'] = orfname orf_pos_dict[orfname] = tfam_genpos[next_gcoords] unnamed[identicals] = False named_so_far += 1 # Now that the ORFs have been found and named, figure out their orftype tfam_orfs['annot_start'] = False tfam_orfs['annot_stop'] = False # start out assuming all are False; replace with True as needed tfam_orfs['orftype'] = 'new' tfam_orfs['untyped'] = tfam_orfs['tstop'] > 0 tfam_orfs.loc[~tfam_orfs['untyped'], 'orftype'] = 'nonstop' # no stop codon if tfam in tfams_with_annots: cds_info = [] all_annot_pos = set() for (annot_fidx, (annot_tfam_lookup, annot_tid_lookup)) in enumerate(zip(annot_tfam_lookups, annot_tid_lookups)): if tfam in annot_tfam_lookup: for (annot_tidx, annot_tid) in enumerate(annot_tfam_lookup[tfam]): curr_trans = Transcript.from_bed(annot_tid_lookup[annot_tid]) if curr_trans.cds_start is not None and curr_trans.cds_end is not None: curr_cds_pos_set = curr_trans.get_cds().get_position_set() curr_len = len(curr_cds_pos_set) if 
curr_len % 3 == 0: curr_gcoord = curr_trans.get_genomic_coordinate(curr_trans.cds_start)[1] curr_gstop = curr_trans.get_genomic_coordinate(curr_trans.cds_end - 1)[1] + (1 if strand == '+' else -1) in_tfam = curr_cds_pos_set.issubset(tfam_genpos) cds_info.append((curr_gcoord, curr_gstop, (curr_len-3)/3, in_tfam, annot_fidx, annot_tid, curr_cds_pos_set)) all_annot_pos.update(curr_cds_pos_set) if cds_info: # False means no annotated CDSs or none are multiples of 3 in length cds_info = pd.DataFrame(cds_info, columns=['gcoord', 'gstop', 'AAlen', 'in_tfam', 'annot_fidx', 'annot_tid', 'pos']) \ .groupby(['gcoord', 'gstop', 'AAlen', 'in_tfam'], as_index=False) \ .apply(lambda x: x if len(x) == 1 else x[[not any(pos == x['pos'].iat[j] for j in xrange(i)) for (i, pos) in enumerate(x['pos'])]]) \ .set_index(['annot_fidx', 'annot_tid']) # this operation organizes cds_info into a dataframe and effectively drops duplicates # pandas drop_duplicates() is incompatible with sets so have to do it this manual way # the combination of annot_fidx (the number of the file if more than one annotation file provided) and annot_tid should be a unique ID tfam_orfs['annot_start'] = tfam_orfs['gcoord'].isin(cds_info['gcoord']) tfam_orfs['annot_stop'] = tfam_orfs['gstop'].isin(cds_info['gstop']) def _get_orf_pos(orfname, tid=None, tcoord=None, tstop=None): """Helper function that identifies the genomic coordinates of an ORF (in stranded order) and caches them by orfname""" if orfname in orf_pos_dict: return orf_pos_dict[orfname] else: if tid is None or tcoord is None or tstop is None: (tid, tcoord, tstop) = tfam_orfs.loc[tfam_orfs['orfname'] == orfname, ['tid', 'tcoord', 'tstop']].iloc[0] res = tfam_genpos[np.flatnonzero(tmask[tidx_lookup[tid], :])[tcoord:tstop]] orf_pos_dict[orfname] = res return res # ANNOTATED and XISO cds_info['found'] = False possible_annot = tfam_orfs.drop_duplicates('orfname').merge(cds_info[cds_info['in_tfam']].reset_index()) # merges on gcoord, gstop, and len - need to 
reset_index to preserve annot_fidx and annot_tid for ((orfname, tid, tcoord, tstop), cds_grp) in possible_annot.groupby(['orfname', 'tid', 'tcoord', 'tstop']): orf_pos = _get_orf_pos(orfname, tid, tcoord, tstop) for (annot_fidx, annot_tid, cds_pos_set) in cds_grp[['annot_fidx', 'annot_tid', 'pos']].itertuples(False): if cds_pos_set.issubset(orf_pos): tfam_orfs.loc[tfam_orfs['orfname'] == orfname, ['orftype', 'untyped']] = ['annotated', False] cds_info.loc[(annot_fidx, annot_tid), 'found'] = True break else: tfam_orfs.loc[tfam_orfs['orfname'] == orfname, ['orftype', 'untyped']] = ['Xiso', False] # matching start and stop but differing in between if tfam_orfs['untyped'].any(): tfam_orfs.loc[tfam_orfs['orfname'].isin(tfam_orfs[tfam_orfs['untyped']].merge(cds_info[['gcoord', 'gstop']])['orfname']), ['orftype', 'untyped']] = ['Xiso', False] # matching start and stop, but must differ somewhere, otherwise would have been identified as annotated (Xiso => "exact isoform") # SISO tfam_orfs.loc[tfam_orfs['annot_start'] & tfam_orfs['annot_stop'] & tfam_orfs['untyped'], ['orftype', 'untyped']] = ['Siso', False] # start and stop each match at least one CDS, but not the same one (Siso => "spliced isoform") # CISO tfam_orfs.loc[tfam_orfs['annot_start'] & tfam_orfs['untyped'], ['orftype', 'untyped']] = ['Ciso', False] # start is annotated, but stop is not - so must be on a new transcript (Ciso => "C-terminal isoform") # TRUNCATION if tfam_orfs['untyped'].any(): found_matched_stop = tfam_orfs[tfam_orfs['untyped']].merge(tfam_orfs[tfam_orfs['orftype'] == 'annotated'], on=['tid', 'tstop'], suffixes=('', '_annot')) tfam_orfs.loc[tfam_orfs['orfname'].isin(found_matched_stop.loc[found_matched_stop['tcoord'] > found_matched_stop['tcoord_annot'], 'orfname']), ['orftype', 'untyped']] = ['truncation', False] # on the same transcript with an annotated CDS, with matching stop codon, initiating downstream - must be a truncation # still some missing truncations, if the original CDS was not on a 
transcript in the present transcriptome if tfam_orfs['untyped'].any() and not cds_info['found'].all(): possible_truncs = tfam_orfs[tfam_orfs['untyped']].drop_duplicates('orfname') \ .merge(cds_info.loc[~cds_info['found'], ['gstop', 'pos', 'AAlen']], on='gstop', suffixes=('', '_annot')) possible_truncs = possible_truncs[possible_truncs['AAlen'] < possible_truncs['AAlen_annot']] for ((orfname, tid, tcoord, tstop, gcoord), cds_pos_sets) in \ possible_truncs.groupby(['orfname', 'tid', 'tcoord', 'tstop', 'gcoord'])['pos']: orf_pos = _get_orf_pos(orfname, tid, tcoord, tstop) if strand == '-': if any(cds_pos_set.issuperset(orf_pos) and all(pos in orf_pos for pos in cds_pos_set if pos <= gcoord) for cds_pos_set in cds_pos_sets): tfam_orfs.loc[tfam_orfs['orfname'] == orfname, ['orftype', 'untyped']] = ['truncation', False] else: if any(cds_pos_set.issuperset(orf_pos) and all(pos in orf_pos for pos in cds_pos_set if pos >= gcoord) for cds_pos_set in cds_pos_sets): tfam_orfs.loc[tfam_orfs['orfname'] == orfname, ['orftype', 'untyped']] = ['truncation', False] # matching stop codon, contained within, and all positions in the annotation past the orf start codon are included in the orf # EXTENSION if tfam_orfs['untyped'].any(): found_matched_stop = tfam_orfs[tfam_orfs['untyped']].merge(tfam_orfs[tfam_orfs['orftype'] == 'annotated'], on=['tid', 'tstop'], suffixes=('', '_annot')) assert (found_matched_stop['tcoord'] < found_matched_stop['tcoord_annot']).all() # other possibilities should be done by now tfam_orfs.loc[tfam_orfs['orfname'].isin(found_matched_stop['orfname']), ['orftype', 'untyped']] = ['extension', False] # on the same transcript with an annotated CDS, with matching stop codon, initiating upstream - must be an extension # no possibility for an "unfound" extension - if the extension is in the transcriptome, the CDS it comes from must be as well # (except for a few edge cases e.g. 
annotated CDS is a CUG initiator, but not considering CUG ORFs) # NISO tfam_orfs.loc[tfam_orfs['annot_stop'] & (tfam_orfs['untyped']), ['orftype', 'untyped']] = ['Niso', False] # stop is annotated, but start is not, and it's not a truncation or extension - so must be an isoform (Niso => "N-terminal isoform") # NCISO if tfam_orfs['untyped'].any(): orf_codons = [] for (orfname, tid, tcoord, tstop) in \ tfam_orfs.loc[tfam_orfs['untyped'], ['orfname', 'tid', 'tcoord', 'tstop']].drop_duplicates('orfname').itertuples(False): orf_codons.append(pd.DataFrame(_get_orf_pos(orfname, tid, tcoord, tstop).reshape((-1, 3)))) orf_codons[-1]['orfname'] = orfname orf_codons = pd.concat(orf_codons, ignore_index=True) if strand == '-': annot_codons = pd.DataFrame(np.vstack([np.reshape(sorted(cds_pos_set, reverse=True), (-1, 3)) for cds_pos_set in cds_info['pos'] if len(cds_pos_set) % 3 == 0])).drop_duplicates() else: annot_codons = pd.DataFrame(np.vstack([np.reshape(sorted(cds_pos_set, reverse=False), (-1, 3)) for cds_pos_set in cds_info['pos'] if len(cds_pos_set) % 3 == 0])).drop_duplicates() tfam_orfs.loc[tfam_orfs['orfname'].isin(orf_codons.merge(annot_codons)['orfname']), ['orftype', 'untyped']] = ['NCiso', False] # ORFs that have at least one full codon overlapping (in-frame) with a CDS are isoforms (NCiso => "N- and C-terminal isoform") # Note that these must already differ at N- and C- termini, otherwise they would already have been classified # INTERNAL if tfam_orfs['untyped'].any(): sametrans = tfam_orfs[tfam_orfs['untyped']].merge(tfam_orfs[tfam_orfs['orftype'] == 'annotated'], on='tid', suffixes=('', '_annot')) sametrans_internal = (sametrans['tcoord'] > sametrans['tcoord_annot']) & (sametrans['tstop'] < sametrans['tstop_annot']) tfam_orfs.loc[tfam_orfs['orfname'].isin(sametrans.loc[sametrans_internal, 'orfname']), ['orftype', 'untyped']] = ['internal', False] # ORFs completely contained within a CDS on the same transcript, and not containing any full codon overlaps, must be 
internal # Still could be other ORFs internal to a CDS on a transcript not in the current transcriptome - need to check manually if tfam_orfs['untyped'].any() and not cds_info['found'].all(): for (orfname, gcoord, gstop) in \ tfam_orfs.loc[tfam_orfs['untyped'], ['orfname', 'gcoord', 'gstop']].drop_duplicates('orfname').itertuples(False): orf_pos = _get_orf_pos(orfname) # should be cached by now if strand == '-': if any(cds_pos_set.issuperset(orf_pos) and all(pos in orf_pos for pos in cds_pos_set if gcoord >= pos > gstop) for cds_pos_set in cds_info.loc[(~cds_info['found']) & (cds_info['gcoord'] > gcoord) & (cds_info['gstop'] < gstop), 'pos']): tfam_orfs.loc[tfam_orfs['orfname'] == orfname, ['orftype', 'untyped']] = ['internal', False] else: if any(cds_pos_set.issuperset(orf_pos) and all(pos in orf_pos for pos in cds_pos_set if gcoord <= pos < gstop) for cds_pos_set in cds_info.loc[(~cds_info['found']) & (cds_info['gcoord'] < gcoord) & (cds_info['gstop'] > gstop), 'pos']): tfam_orfs.loc[tfam_orfs['orfname'] == orfname, ['orftype', 'untyped']] = ['internal', False] # STOP_OVERLAP if tfam_orfs['untyped'].any(): sametrans = tfam_orfs[tfam_orfs['untyped']].merge(tfam_orfs[tfam_orfs['orftype'] == 'annotated'], on='tid', suffixes=('', '_annot')) sametrans_stopover = (sametrans['tcoord'] > sametrans['tcoord_annot']) & (sametrans['tcoord'] < sametrans['tstop_annot']) tfam_orfs.loc[tfam_orfs['orfname'].isin(sametrans.loc[sametrans_stopover, 'orfname']), ['orftype', 'untyped']] = ['stop_overlap', False] # starts within a CDS and not an internal - must be a stop_overlap # do not need to check for unfounds - requiring that stop_overlap must be on same transcript as cds # START_OVERLAP if tfam_orfs['untyped'].any(): sametrans = tfam_orfs[tfam_orfs['untyped']].merge(tfam_orfs[tfam_orfs['orftype'] == 'annotated'], on='tid', suffixes=('', '_annot')) sametrans_startover = (sametrans['tstop'] > sametrans['tcoord_annot']) & (sametrans['tstop'] < sametrans['tstop_annot']) 
tfam_orfs.loc[tfam_orfs['orfname'].isin(sametrans.loc[sametrans_startover, 'orfname']), ['orftype', 'untyped']] = ['start_overlap', False] # ends within a CDS and not an internal - must be a start_overlap # do not need to check for unfounds - requiring that start_overlap must be on same transcript as cds # LOOF if tfam_orfs['untyped'].any(): sametrans = tfam_orfs[tfam_orfs['untyped']].merge(tfam_orfs[tfam_orfs['orftype'] == 'annotated'], on='tid', suffixes=('', '_annot')) sametrans_loof = (sametrans['tcoord'] < sametrans['tcoord_annot']) & (sametrans['tstop'] > sametrans['tstop_annot']) tfam_orfs.loc[tfam_orfs['orfname'].isin(sametrans.loc[sametrans_loof, 'orfname']), ['orftype', 'untyped']] = ['LOOF', False] # starts upstream of a CDS and ends downstream of it - must be a LOOF (long out-of-frame) # don't need to check for unfounds because the CDS must be on the same transcript as the ORF if the ORF completely contains it # UPSTREAM if tfam_orfs['untyped'].any(): sametrans = tfam_orfs[tfam_orfs['untyped']].merge(tfam_orfs[tfam_orfs['orftype'] == 'annotated'], on='tid', suffixes=('', '_annot')) sametrans_upstream = (sametrans['tstop'] <= sametrans['tcoord_annot']) tfam_orfs.loc[tfam_orfs['orfname'].isin(sametrans.loc[sametrans_upstream, 'orfname']), ['orftype', 'untyped']] = ['upstream', False] # ends upstream of a CDS - must be an upstream (uORF) # cannot check manually for unfounds because those are not on well-defined transcripts # DOWNSTREAM if tfam_orfs['untyped'].any(): sametrans = tfam_orfs[tfam_orfs['untyped']].merge(tfam_orfs[tfam_orfs['orftype'] == 'annotated'], on='tid', suffixes=('', '_annot')) sametrans_downstream = (sametrans['tstop_annot'] <= sametrans['tcoord']) tfam_orfs.loc[tfam_orfs['orfname'].isin(sametrans.loc[sametrans_downstream, 'orfname']), ['orftype', 'untyped']] = ['downstream', False] # starts downstream of a CDS - must be an upstream (uORF) # cannot check manually for unfounds because those are not on well-defined transcripts # NEW_ISO 
and GISO for orfname in tfam_orfs.loc[tfam_orfs['untyped'], 'orfname'].drop_duplicates(): if all_annot_pos.isdisjoint(_get_orf_pos(orfname)): tfam_orfs.loc[tfam_orfs['orfname'] == orfname, ['orftype', 'untyped']] = ['new_iso', False] # no overlaps whatsoever with any annotated CDS, but in a tfam that has annotations: new_iso else: tfam_orfs.loc[tfam_orfs['orfname'] == orfname, ['orftype', 'untyped']] = ['Giso', False] # overlaps out-of-frame with a CDS, and not on the same transcript with a CDS: Giso => "genomic isoform" assert not tfam_orfs['untyped'].any() return tfam_orfs.drop('untyped', axis=1) else: return None
def _get_tid_info(tup):
    """For each transcript on this chromosome/strand, identifies every sub-sequence of the appropriate length (fpsize),
    converts it to an integer, identifies the number of reads mapping to that position, and outputs all of that
    information to a pandas HDF store.

    Reads the module-level names `opts`, `bedlinedict`, `psite`, `fpsize`, `genome`, `str_dict`, `seq_info_hdf`,
    `log_lock` and `logprint`.

    Parameters
    ----------
    tup : tuple
        `(chrom, strand)` pair; used as the key into `bedlinedict`

    Returns
    -------
    :class:`pandas.DataFrame`
        One row per transcript ID (index named `tid`) with columns `chrom`, `strand`, `n_psite`, `n_reads`,
        `peak_reads` and `dropped`. `n_reads` and `peak_reads` stay at -1 when the transcript is shorter than
        `fpsize`; `dropped` is `'lowreads'`, `'peakfrac'`, or empty.
    """
    (chrom, strand) = tup
    tid_lines = bedlinedict[(chrom, strand)]
    inbams = [pysam.Samfile(infile, 'rb') for infile in opts.bamfiles]
    try:
        gnd = BAMGenomeArray(inbams, mapping=FivePrimeMapFactory(psite))
        # map to roughly the center of each read so that identical sequences that cross different splice sites
        # (on different transcripts) still end up mapping to the same place
        gnd.add_filter('size', SizeFilterFactory(opts.minlen, opts.maxlen))

        tid_seq_info = []
        tid_summary = pd.DataFrame(
            {
                'chrom': chrom,
                'strand': strand,
                'n_psite': -1,
                'n_reads': -1,
                'peak_reads': -1,
                'dropped': ''
            },
            index=pd.Index(tid_lines.keys(), name='tid'))

        for (tid, line) in tid_lines.iteritems():
            currtrans = SegmentChain.from_bed(line)
            curr_pos_list = currtrans.get_position_list()  # not in stranded order!
            if strand == '-':
                curr_pos_list = curr_pos_list[::-1]
            n_psite = len(curr_pos_list) + 1 - fpsize
            tid_summary.at[tid, 'n_psite'] = n_psite
            if n_psite <= 0:
                # transcript is shorter than one footprint; nothing to count
                continue

            curr_counts = np.array(currtrans.get_counts(gnd))[psite:n_psite + psite]
            sumcounts = curr_counts.sum()
            maxcounts = curr_counts.max()
            tid_summary.at[tid, 'n_reads'] = sumcounts
            tid_summary.at[tid, 'peak_reads'] = maxcounts

            if sumcounts < opts.minreads:
                tid_summary.at[tid, 'dropped'] = 'lowreads'
            elif maxcounts >= sumcounts * opts.peakfrac:
                # a single position holds too large a share of the transcript's reads
                tid_summary.at[tid, 'dropped'] = 'peakfrac'
            else:
                # presumably str_dict maps each nucleotide to a base-4 digit (the windows are parsed with
                # int(..., 4) below) and leaves 'N' untouched -- confirm against its definition
                numseq = np.array(list(currtrans.get_sequence(genome).upper().translate(str_dict)))
                curr_seq = ''.join(numseq)
                tid_seq_info.append(
                    pd.DataFrame({
                        'tid': tid,
                        'genpos': curr_pos_list[psite:n_psite + psite],
                        # windows containing an 'N' cannot be encoded and are flagged with -1
                        'seq': np.array([(int(curr_seq[i:i + fpsize], 4) if 'N' not in curr_seq[i:i + fpsize] else -1)
                                         for i in xrange(n_psite)], dtype=np.int64),
                        'reads': curr_counts
                    }))

        if tid_seq_info:  # don't bother saving anything if there's nothing to save
            pd.concat(tid_seq_info, ignore_index=True).to_hdf(seq_info_hdf % (chrom, strand), 'tid_seq_info',
                                                              format='t', data_columns=True,
                                                              complevel=1, complib='blosc')

        if opts.verbose > 1:
            with log_lock:
                logprint('%s (%s strand) complete' % (chrom, strand))
    finally:
        # release the BAM handles even if counting or writing failed part-way
        for inbam in inbams:
            inbam.close()

    return tid_summary
def do_count(roi_table,ga,norm_start,norm_end,min_counts,min_len,max_len,aggregate=False,printer=NullWriter()):
    """Calculate a :term:`metagene profile` for each read length in the dataset

    Parameters
    ----------
    roi_table : :class:`pandas.DataFrame`
        Table specifying regions of interest, generated
        by :py:func:`plastid.bin.metagene.do_generate`

    ga : |BAMGenomeArray|
        Count data

    norm_start : int
        Coordinate in window specifying normalization region start

    norm_end : int
        Coordinate in window specifying normalization region end

    min_counts : float
        Minimum number of counts in `window[norm_start:norm_end]`
        required for inclusion in metagene profile

    min_len : int
        Minimum read length to include

    max_len : int
        Maximum read length to include

    aggregate : bool, optional
        Estimate P-site from aggregate reads at each position, instead of median
        normalized read density. Potentially noisier, but helpful for lower-count
        data or read lengths with few counts. (Default: False)

    printer : file-like, optional
        filehandle to write logging info to
        (Default: :func:`~plastid.util.io.openers.NullWriter`)

    Returns
    -------
    dict
        Dictionary of :class:`numpy.ndarray` s of raw counts at each position (column)
        for each window (row)

    dict
        Dictionary of :class:`numpy.ndarray` s of normalized counts at each position (column)
        for each window (row), normalized by the total number of counts in that row
        from `norm_start` to `norm_end`

    :class:`pandas.DataFrame`
        Metagene profile of median normalized counts at each position across
        all windows, and the number of windows included in the calculation of each
        median, stratified by read length

    Notes
    -----
    Rows of `roi_table` are addressed by position, not by index label, so the
    table does not need a default ``0..n-1`` index.
    """
    n_rois = len(roi_table)
    window_size = roi_table["window_size"].iloc[0]
    upstream_flank = roi_table["zero_point"].iloc[0]

    raw_count_dict = OrderedDict()
    norm_count_dict = OrderedDict()

    shape = (n_rois, window_size)
    for read_length in range(min_len, max_len + 1):
        # mask all by default
        raw_count_dict[read_length] = numpy.ma.MaskedArray(numpy.tile(numpy.nan, shape),
                                                           mask=numpy.tile(True, shape),
                                                           dtype=float)

    # enumerate() gives the positional row number used to index the count arrays;
    # the label yielded by iterrows() is deliberately ignored
    for i, (_, row) in enumerate(roi_table.iterrows()):
        if i % 1000 == 0:
            printer.write("Counted %s ROIs ..." % (i+1))

        roi = SegmentChain.from_str(row["region"])
        mask = SegmentChain.from_str(row["masked"])
        roi.add_masks(*mask)
        valid_mask = roi.get_masked_counts(ga).mask

        offset = int(round((row["alignment_offset"])))
        assert offset + roi.length <= window_size

        # per-read-length count vectors for this ROI, concatenated segment by segment
        count_vectors = {k: [] for k in raw_count_dict}

        for seg in roi:
            reads = ga.get_reads(seg)

            # bin reads by length, keeping only the requested lengths
            read_dict = {k: [] for k in raw_count_dict}
            for read in reads:
                read_length = len(read.positions)
                if read_length in read_dict:
                    read_dict[read_length].append(read)

            for k in read_dict:
                count_vector = ga.map_fn(read_dict[k], seg)[1]
                count_vectors[k].extend(count_vector)

        for k in raw_count_dict:
            if roi.strand == "-":
                count_vectors[k] = count_vectors[k][::-1]

            raw_count_dict[k].data[i, offset:offset+roi.length] = numpy.array(count_vectors[k])
            raw_count_dict[k].mask[i, offset:offset+roi.length] = valid_mask

    profile_table = { "x" : numpy.arange(-upstream_flank, window_size-upstream_flank) }

    printer.write("Counted %s ROIs total." % n_rois)
    for k in raw_count_dict:
        k_raw = raw_count_dict[k]

        denominator = numpy.nansum(k_raw[:, norm_start:norm_end], axis=1)
        norm_count_dict[k] = (k_raw.T.astype(float) / denominator).T

        # copy mask from raw counts, then add nans and infs
        norm_counts = numpy.ma.MaskedArray(norm_count_dict[k], mask=k_raw.mask)
        norm_counts.mask[numpy.isnan(norm_counts)] = True
        norm_counts.mask[numpy.isinf(norm_counts)] = True

        enough_counts = denominator >= min_counts

        with warnings.catch_warnings():
            # ignore numpy mean of empty slice warning, given by numpy in Python 2.7-3.4
            warnings.filterwarnings("ignore", ".*mean of empty.*", RuntimeWarning)
            try:
                if not aggregate:
                    profile = numpy.ma.median(norm_counts[enough_counts], axis=0)
                else:
                    profile = numpy.nansum(k_raw[enough_counts], axis=0)
            # in numpy under Python3.5 an empty selection raises IndexError instead
            # of warning; in newer versions of numpy it is a ValueError
            except (IndexError, ValueError):
                profile = numpy.zeros_like(profile_table["x"], dtype=float)

        num_genes = ((~norm_counts.mask)[enough_counts]).sum(0)

        profile_table["%s-mers" % k] = profile
        profile_table["%s_regions_counted" % k] = num_genes

    profile_table = pd.DataFrame(profile_table)

    return raw_count_dict, norm_count_dict, profile_table
known_juncs = { "YNL130C" : ["YNL130C:0-53^145-180(-)",], "YPL249C-A" : ["YPL249C-A:0-53^291-334(-)",], 'YBR215W_mRNA_0' : ['YBR215W_mRNA_0:0-108^192-2175(+)'], 'YHL001W_mRNA_0' : ['YHL001W_mRNA_0:0-146^544-961(+)'], 'YIL018W_mRNA_0' : ['YIL018W_mRNA_0:0-30^430-1280(+)'], 'YIL133C_mRNA_0' : ['YIL133C_mRNA_0:0-648^938-1007(-)'], 'YIL156W_B_mRNA_0': ['YIL156W_B_mRNA_0:0-41^103-408(+)'], 'YKL006W_mRNA_0' : ['YKL006W_mRNA_0:0-157^555-954(+)'], 'YMR194C_B_mRNA_0': ['YMR194C_B_mRNA_0:0-325^397-729(-)'], 'YNL130C_mRNA_0' : ['YNL130C_mRNA_0:0-1204^1296-1382(-)'], 'YPL249C_A_mRNA_0': ['YPL249C_A_mRNA_0:0-415^653-697(-)'], } known_juncs = { K : [SegmentChain.from_str(X) for X in V] for K,V in known_juncs.items() } """Annotated splice junctions""" all_known_juncs = [] for v in known_juncs.values(): all_known_juncs.extend(v) known_juncs_as_tuples = { "YNL130C" : [("YNL130C",53,145,"-"),], "YPL249C-A" : [("YPL249C-A",53,291,"-"),], 'YBR215W_mRNA_0' : [('YBR215W_mRNA_0',108,192,'+'),], 'YHL001W_mRNA_0' : [('YHL001W_mRNA_0',146,544,'+'),], 'YIL018W_mRNA_0' : [('YIL018W_mRNA_0',30,430,'+'),], 'YIL133C_mRNA_0' : [('YIL133C_mRNA_0',648,938,'-'),], 'YIL156W_B_mRNA_0': [('YIL156W_B_mRNA_0',41,103,'+'),],
reader = BED_Reader(cStringIO.StringIO(_NARROW_PEAK_TEXT), extra_columns=14) with warnings.catch_warnings(record=True) as warns: warnings.simplefilter("always") ltmp = list(reader) assert_greater_equal(len(warns), 0) #=============================================================================== # INDEX: test data #=============================================================================== # test dataset, constructed manually to include various edge cases _TEST_SEGMENTCHAINS = [ # single-interval SegmentChain(GenomicSegment("chrA", 100, 1100, "+"), ID="IVC1p"), SegmentChain(GenomicSegment("chrA", 100, 1100, "-"), ID="IVC1m"), # multi-interval SegmentChain(GenomicSegment("chrA", 100, 1100, "+"), GenomicSegment("chrA", 2100, 2600, "+"), ID="IVC2p"), SegmentChain(GenomicSegment("chrA", 100, 1100, "-"), GenomicSegment("chrA", 2100, 2600, "-"), ID="IVC2m"), # multi-interval, with score SegmentChain(GenomicSegment("chrA", 100, 1100, "+"), GenomicSegment("chrA", 2100, 2600, "+"), ID="IVC3p", score=500), SegmentChain(GenomicSegment("chrA", 100, 1100, "-"), GenomicSegment("chrA", 2100, 2600, "-"),
def process_partial_group(transcripts, mask_hash, printer):
    """Correct boundaries of merged genes, as described in :func:`do_generate`

    Parameters
    ----------
    transcripts : dict
        Dictionary mapping unique transcript IDs to |Transcripts|.
        This set should be complete in the sense that it should contain
        all transcripts that have any chance of mutually overlapping
        each other (e.g. all on same chromosome and strand).

    mask_hash : |GenomeHash|
        |GenomeHash| of regions to exclude from analysis

    printer : file-like
        Object whose `write()` method receives progress messages

    Returns
    -------
    :class:`pandas.DataFrame`
        Table of merged gene positions

    :class:`pandas.DataFrame`
        Table of adjusted transcript positions

    :class:`dict`
        Dictionary mapping raw gene names to merged gene names

    Notes
    -----
    Merged genes whose transcripts lie on more than one chromosome or strand
    are reported through `printer` and left out of both tables.
    """
    gene_table = {
        "region": [],
        "transcript_ids": [],
        "exon_unmasked": [],
        "exon": [],
        "masked": [],
        "utr5": [],
        "cds": [],
        "utr3": [],
        "exon_bed": [],
        "utr5_bed": [],
        "cds_bed": [],
        "utr3_bed": [],
        "masked_bed": [],
    }

    # data table for transcripts
    transcript_table = {
        "region": [],
        "exon": [],
        "utr5": [],
        "cds": [],
        "utr3": [],
        "masked": [],
        "exon_unmasked": [],
        "transcript_ids": [],
        "exon_bed": [],
        "utr5_bed": [],
        "cds_bed": [],
        "utr3_bed": [],
        "masked_bed": [],
    }

    # every ordered pair of distinct subfeature types; used to strip positions
    # that carry more than one label
    keycombos = list(itertools.permutations(("utr5", "cds", "utr3"), 2))

    # merge genes that share exons & write output
    printer.write("Collapsing genes that share exons ...")
    merged_genes = merge_genes(transcripts)

    # map each merged gene to the transcripts it contains
    merged_gene_tx = {}
    printer.write("Mapping transcripts to merged genes...")
    for txid in transcripts:
        my_merged = merged_genes[transcripts[txid].get_gene()]
        merged_gene_tx.setdefault(my_merged, []).append(txid)

    # flatten merged genes
    printer.write(
        "Flattening merged genes, masking positions, and labeling subfeatures ..."
    )
    for n, (gene_id, my_txids) in enumerate(merged_gene_tx.items()):
        if n % 1000 == 0 and n > 0:
            printer.write(" %s genes ..." % n)

        my_gene_positions = set()
        chroms = []
        strands = []
        for my_txid in my_txids:
            my_segmentchain = transcripts[my_txid]
            chroms.append(my_segmentchain.chrom)
            strands.append(my_segmentchain.strand)
            my_gene_positions.update(my_segmentchain.get_position_list())

        # explicit checks rather than `assert`, which disappears under `python -O`
        skip_gene = False
        if len(set(chroms)) != 1:
            printer.write(
                "Skipping gene %s which contains multiple chromosomes: %s" %
                (gene_id, ",".join(chroms)))
            skip_gene = True

        if len(set(strands)) != 1:
            printer.write(
                "Skipping gene %s which contains multiple strands: %s" %
                (gene_id, ",".join(strands)))
            skip_gene = True

        if skip_gene:
            # positions from different chromosomes/strands cannot be flattened
            # into a single chain, so honor the message and leave the gene out
            continue

        gene_ivc_raw = SegmentChain(
            *positions_to_segments(chroms[0], strands[0], my_gene_positions))
        gene_table["region"].append(gene_id)
        gene_table["transcript_ids"].append(",".join(sorted(my_txids)))
        gene_table["exon_unmasked"].append(gene_ivc_raw)

    # report from the container size so that empty input does not hit an
    # unbound loop variable
    printer.write(" %s genes total." % len(merged_gene_tx))

    # mask genes
    printer.write("Masking positions and labeling subfeature positions ...")
    gene_hash = GenomeHash(gene_table["exon_unmasked"], do_copy=False)
    for n, (gene_id, gene_ivc_raw) in enumerate(
            zip(gene_table["region"], gene_table["exon_unmasked"])):
        if n % 2000 == 0:
            printer.write(" %s genes ..." % n)

        my_chrom = gene_ivc_raw.spanning_segment.chrom
        my_strand = gene_ivc_raw.spanning_segment.strand

        gene_positions_raw = gene_ivc_raw.get_position_set()

        # positions covered by other genes; don't mask out positions from identical gene
        masked_positions = set()
        for gene in gene_hash[gene_ivc_raw]:
            if gene.get_position_set() != gene_positions_raw:
                masked_positions.update(gene.get_position_list())

        # positions covered by user-supplied masks
        for mask in mask_hash[gene_ivc_raw]:
            masked_positions.update(mask.get_position_list())

        mask_ivc_positions = gene_positions_raw & masked_positions
        total_mask_ivc = SegmentChain(*positions_to_segments(
            my_chrom, my_strand, mask_ivc_positions), ID=gene_id)
        gene_table["masked"].append(total_mask_ivc)
        gene_table["masked_bed"].append(total_mask_ivc.as_bed())

        gene_post_mask = gene_positions_raw - masked_positions
        gene_post_mask_ivc = SegmentChain(*positions_to_segments(
            my_chrom, my_strand, gene_post_mask), ID=gene_id)
        gene_table["exon"].append(gene_post_mask_ivc)
        gene_table["exon_bed"].append(gene_post_mask_ivc.as_bed())

        # from here on only the masked positions that fall inside this gene matter
        masked_positions = total_mask_ivc.get_position_set()
        tmp_positions = {
            "utr5": set(),
            "cds": set(),
            "utr3": set(),
        }
        txids = sorted(merged_gene_tx[gene_id])
        chrom = gene_post_mask_ivc.chrom
        strand = gene_post_mask_ivc.strand

        # pool transcript positions
        for txid in txids:
            transcript = transcripts[txid]
            tmp_positions["utr5"] |= transcript.get_utr5().get_position_set()
            tmp_positions["cds"] |= transcript.get_cds().get_position_set()
            tmp_positions["utr3"] |= transcript.get_utr3().get_position_set()

        # eliminate positions in which CDS & UTRs overlap from each transcript
        for txid in txids:
            transcript = transcripts[txid]
            transcript_positions = {
                "utr5": transcript.get_utr5().get_position_set(),
                "cds": transcript.get_cds().get_position_set(),
                "utr3": transcript.get_utr3().get_position_set(),
            }
            for key1, key2 in keycombos:
                transcript_positions[key1] -= tmp_positions[key2]
                transcript_positions[key1] -= masked_positions

            transcript_table["region"].append(txid)

            # all unmasked positions
            my_chain = SegmentChain(*positions_to_segments(
                chrom, strand,
                transcript.get_position_set() - masked_positions), ID=txid)
            transcript_table["exon"].append(str(my_chain))
            transcript_table["exon_bed"].append(my_chain.as_bed())

            # all uniquely-labeled unmasked positions
            for k, v in transcript_positions.items():
                my_chain = SegmentChain(*positions_to_segments(
                    chrom, strand, v), ID=txid)
                transcript_table[k].append(str(my_chain))
                transcript_table["%s_bed" % k].append(my_chain.as_bed())

            # relabel the gene-level mask with this transcript's ID before
            # exporting it; the gene-level BED line was already generated above
            total_mask_ivc.attr["ID"] = txid
            transcript_table["masked"].append(str(total_mask_ivc))
            transcript_table["masked_bed"].append(total_mask_ivc.as_bed())
            transcript_table["exon_unmasked"].append(str(transcript))
            transcript_table["transcript_ids"].append(txid)

        # subtract against a snapshot so each label is compared with the
        # un-subtracted pools of the other labels
        tmp_positions2 = copy.deepcopy(tmp_positions)
        for k1, k2 in keycombos:
            tmp_positions[k1] -= tmp_positions2[k2]
            tmp_positions[k1] -= masked_positions

        for k in tmp_positions:
            my_chain = SegmentChain(*positions_to_segments(
                chrom, strand, tmp_positions[k]), ID=gene_id)
            gene_table[k].append(str(my_chain))
            gene_table["%s_bed" % k].append(my_chain.as_bed())

    printer.write(" %s genes total." % len(gene_table["region"]))

    # cast SegmentChains/Transcripts to strings to keep numpy from unpacking them
    conversion_keys = [
        "exon", "utr5", "cds", "utr3", "masked", "exon_unmasked"
    ]
    for k in conversion_keys:
        gene_table[k] = [str(X) for X in gene_table[k]]
        transcript_table[k] = [str(X) for X in transcript_table[k]]

    gene_df = pd.DataFrame(gene_table)
    gene_df.sort_values(["region"], inplace=True)

    transcript_df = pd.DataFrame(transcript_table)
    transcript_df.sort_values(["region"], inplace=True)

    return gene_df, transcript_df, merged_genes
def test_search_fields_singlevalue(self):
    """Search the indexed BigBed file on individual extra fields.

    Three lookups are checked: a `name` value with no hits, a `Name` that
    identifies a single transcript, and a `gene_id` shared by twelve
    transcripts (compared after sorting both lists).
    """
    def build_chain(exons, **attr):
        # every expected record is on the plus strand of 2L and shares
        # the color / gene_id / score / type attributes
        segments = [GenomicSegment('2L', start, end, '+') for (start, end) in exons]
        return SegmentChain(*segments,
                            color='#000000',
                            gene_id='FBgn0005278',
                            score='0.0',
                            type='exon',
                            **attr)

    reader = BigBedReader(self.bb_indexed)

    # a value absent from the index yields nothing
    no_hits = list(reader.search("name", "should_have_no_match"))
    self.assertEqual([], no_hits)

    # this transcript is the sole hit by Name and one of the hits by gene_id
    sam_s_re = build_chain(
        [(106902, 107000), (107764, 107838), (108587, 108809), (110405, 110483),
         (110754, 110877), (111906, 112019), (112689, 113369), (113433, 114432)],
        Alias="'['M(2)21AB-RE', 'CG2674-RE']'",
        ID='FBtr0089437',
        Name='Sam-S-RE',
        thickend='113542',
        thickstart='108685')

    by_name = list(reader.search("Name", "Sam-S-RE"))
    self.assertEqual([sam_s_re], by_name)

    by_gene = list(reader.search("gene_id", "FBgn0005278"))
    gene_transcripts = [
        sam_s_re,
        build_chain(
            [(107760, 107838), (108587, 108809), (110405, 110483), (110754, 111337)],
            Alias='na',
            ID='FBtr0308091',
            Name='Sam-S-RK',
            thickend='110900',
            thickstart='108685'),
        build_chain(
            [(107760, 107838), (108587, 108809), (110405, 110483), (110754, 110877),
             (111004, 111117), (111906, 112019), (112689, 113369), (113433, 114210)],
            Alias="'['M(2)21AB-RB', 'CG2674-RB']'",
            ID='FBtr0089428',
            Name='Sam-S-RB',
            thickend='112741',
            thickstart='108685'),
        build_chain(
            [(107760, 107838), (108587, 108809), (110405, 110483), (110754, 110877),
             (111906, 112019), (112689, 113369), (113433, 114432)],
            Alias="'['M(2)21AB-RA', 'CG2674-RA']'",
            ID='FBtr0089429',
            Name='Sam-S-RA',
            thickend='113542',
            thickstart='108685'),
        build_chain(
            [(107760, 107956), (108587, 108809), (110405, 110483), (110754, 110877),
             (112689, 113369), (113433, 114432)],
            Alias='na',
            ID='FBtr0330656',
            Name='Sam-S-RL',
            thickend='112781',
            thickstart='108685'),
        build_chain(
            [(107936, 108226), (108587, 108809), (110405, 110483), (110754, 110877),
             (111906, 112019), (112689, 113369), (113433, 114210)],
            Alias="'['M(2)21AB-RH', 'CG2674-RH']'",
            ID='FBtr0089432',
            Name='Sam-S-RH',
            thickend='113542',
            thickstart='108685'),
        build_chain(
            [(107936, 108101), (108587, 108809), (110405, 110483), (110754, 110877),
             (111906, 112019), (112689, 113369), (113433, 114432)],
            Alias="'['M(2)21AB-RD', 'CG2674-RD']'",
            ID='FBtr0089430',
            Name='Sam-S-RD',
            thickend='113542',
            thickstart='108685'),
        build_chain(
            [(107936, 108101), (108587, 108809), (110405, 110483), (110754, 110877),
             (111004, 111117), (112689, 113369), (113433, 114432)],
            Alias="'['M(2)21AB-RC', 'CG2674-RC']'",
            ID='FBtr0089431',
            Name='Sam-S-RC',
            thickend='113542',
            thickstart='108685'),
        build_chain(
            [(108088, 108226), (108587, 108809), (110405, 110483), (110754, 110877),
             (111906, 112019), (112689, 113369), (113433, 114432)],
            Alias="'['M(2)21AB-RF', 'CG2674-RF']'",
            ID='FBtr0089433',
            Name='Sam-S-RF',
            thickend='113542',
            thickstart='108685'),
        build_chain(
            [(108132, 108346), (108587, 108809), (110405, 110483), (110754, 110877),
             (111906, 112019), (112689, 113369), (113433, 114432)],
            Alias="'['M(2)21AB-RI', 'CG2674-RI']'",
            ID='FBtr0089434',
            Name='Sam-S-RI',
            thickend='113542',
            thickstart='108685'),
        build_chain(
            [(108132, 108226), (108587, 108809), (110405, 110483), (110754, 110877),
             (111004, 111117), (112689, 113369), (113433, 114432)],
            Alias="'['M(2)21AB-RJ', 'CG2674-RJ']'",
            ID='FBtr0089435',
            Name='Sam-S-RJ',
            thickend='113542',
            thickstart='108685'),
        build_chain(
            [(109593, 109793), (110405, 110483), (110754, 110877), (111004, 111117),
             (112689, 113369), (113433, 114210)],
            Alias="'['M(2)21AB-RG', 'CG2674-RG']'",
            ID='FBtr0089436',
            Name='Sam-S-RG',
            thickend='113542',
            thickstart='109750'),
    ]
    # hit order is not part of the contract being tested, so compare sorted
    self.assertEqual(sorted(gene_transcripts), sorted(by_gene))
def _regress_tfam(orf_set, gnd):
    """Fit ORF strengths for one transcript family by non-negative least squares.

    Performs non-negative least squares regression on all of the ORFs in a transcript family, using
    profiles constructed via _orf_profile(). Also calculates Wald statistics for each ORF and start
    codon, and for each stop codon if opts.startonly is False.

    Besides its arguments, the function reads the module-level names ``bedlinedict``, ``rdlens``,
    ``opts``, ``startnt``, ``stopnt``, ``stopprof`` and ``failure_return``.

    Parameters
    ----------
    orf_set : pandas.DataFrame
        Candidate ORFs. The columns 'tfam', 'strand', 'chrom', 'tid', 'tcoord', 'tstop', 'gcoord',
        'gstop', 'orfname' and 'codon' are read; 'tfam', 'strand' and 'chrom' are taken from the
        first row only.
    gnd
        Read-count source, handed unchanged to get_hashed_counts().

    Returns
    -------
    tuple
        ``(orf_strength_df, start_strength_df)`` if opts.startonly is set, otherwise
        ``(orf_strength_df, start_strength_df, stop_strength_df)``.
        ``failure_return`` is returned instead whenever a filtering step leaves nothing to fit
        (no ORF passes the start-count filter, no ORF overlaps any reads, no ORF receives a
        strength above the rounding threshold, or no ORF is left to contribute to start strengths).
    """
    tfam = orf_set['tfam'].iat[0]
    strand = orf_set['strand'].iat[0]
    chrom = orf_set['chrom'].iat[0]
    tids = orf_set['tid'].drop_duplicates().tolist()

    # Collect the genomic positions covered by each transcript and by the family as a whole
    all_tfam_genpos = set()
    tid_genpos = {}
    tlens = {}
    for tid in tids:
        currtrans = SegmentChain.from_bed(bedlinedict[tid])
        curr_pos_set = currtrans.get_position_set()
        tlens[tid] = len(curr_pos_set)
        tid_genpos[tid] = curr_pos_set
        all_tfam_genpos.update(curr_pos_set)
    tfam_segs = SegmentChain(*positionlist_to_segments(chrom, strand, list(all_tfam_genpos)))
    all_tfam_genpos = np.array(sorted(all_tfam_genpos))
    if strand == '-':
        all_tfam_genpos = all_tfam_genpos[::-1]
    nnt = len(all_tfam_genpos)
    # For each transcript, the indices of its positions within the family-wide position array
    tid_indices = {tid: np.flatnonzero(np.in1d(all_tfam_genpos, list(curr_tid_genpos), assume_unique=True))
                   for (tid, curr_tid_genpos) in tid_genpos.items()}

    hashed_counts = get_hashed_counts(tfam_segs, gnd)
    counts = np.zeros((len(rdlens), nnt), dtype=np.float64)
    # even though they are integer-valued, will need to do float arithmetic
    for (i, rdlen) in enumerate(rdlens):
        for nmis in range(1+opts.max5mis):
            counts[i, :] += hashed_counts[(rdlen, nmis)]
    counts = counts.ravel()

    if opts.startcount:
        # Only include ORFS for which there is at least some minimum reads within one nucleotide of the start codon
        offsetmat = np.tile(nnt*np.arange(len(rdlens)), 3)
        # offsets for each cond, expecting three positions to check for each
        start_windows = [tid_indices[tid][tcoord-1:tcoord+2]
                         for (tid, tcoord, tstop) in orf_set[['tid', 'tcoord', 'tstop']].itertuples(False)]
        orf_set = orf_set[[(counts[(start_idxes.repeat(len(rdlens))+offsetmat)].sum() >= opts.startcount)
                           for start_idxes in start_windows]]
        if orf_set.empty:
            return failure_return

    orf_strength_df = orf_set.sort_values('tcoord', ascending=False).drop_duplicates('orfname').reset_index(drop=True)

    # Abortive-initiation pseudo-ORFs: one per distinct start position
    abort_set = orf_set.drop_duplicates('gcoord').copy()
    abort_set['gstop'] = abort_set['gcoord']
    # should maybe be +/-3, but then need to worry about splicing - and this is an easy flag
    abort_set['tstop'] = abort_set['tcoord']+3  # stop after the first codon
    abort_set['orfname'] = abort_set['gcoord'].apply(lambda x: '%s_%d_abort' % (tfam, x))
    orf_strength_df = pd.concat((orf_strength_df, abort_set), ignore_index=True)

    if not opts.startonly:  # if marking full ORFs, include histop model
        stop_set = orf_set.drop_duplicates('gstop').copy()
        stop_set['gcoord'] = stop_set['gstop']  # this is an easy flag
        stop_set['tcoord'] = stop_set['tstop']
        # should probably be -3 nt, but this is another easy flag that distinguishes from abinit
        stop_set['orfname'] = stop_set['gstop'].apply(lambda x: '%s_%d_stop' % (tfam, x))
        orf_strength_df = pd.concat((orf_strength_df, stop_set), ignore_index=True)

    # Build one profile column (values plus row indices) per ORF / pseudo-ORF
    orf_profs = []
    indices = []
    for (tid, tcoord, tstop) in orf_strength_df[['tid', 'tcoord', 'tstop']].itertuples(False):
        if tcoord != tstop:  # not a histop
            tlen = tlens[tid]
            if tcoord+startnt[0] < 0:
                startadj = -startnt[0]-tcoord
                # number of nts to remove from the start due to short 5' UTR; guaranteed > 0
            else:
                startadj = 0
            if tstop+stopnt[1] > tlen:
                stopadj = tstop+stopnt[1]-tlen
                # number of nts to remove from the end due to short 3' UTR; guaranteed > 0
            else:
                stopadj = 0
            curr_indices = tid_indices[tid][tcoord+startnt[0]+startadj:tstop+stopnt[1]-stopadj]
            orf_profs.append(_orf_profile(tstop-tcoord)[:, startadj:tstop-tcoord+stopnt[1]-startnt[0]-stopadj].ravel())
        else:  # histop
            curr_indices = tid_indices[tid][tstop-6:tstop]
            orf_profs.append(stopprof[:, -6:].ravel())
        # need to tile the indices for each read length
        indices.append(np.concatenate([nnt*i+curr_indices for i in range(len(rdlens))]))
        if len(indices[-1]) != len(orf_profs[-1]):
            raise AssertionError('ORF length does not match index length')

    # better to make it a sparse matrix, even though nnls requires a dense matrix, because of linear algebra to come
    orf_matrix = scipy.sparse.csc_matrix((np.concatenate(orf_profs),
                                          np.concatenate(indices),
                                          np.cumsum([0]+[len(curr_indices) for curr_indices in indices])),
                                         shape=(nnt*len(rdlens), len(orf_strength_df)))
    nonzero_orfs = np.flatnonzero(orf_matrix.T.dot(counts) > 0)
    if len(nonzero_orfs) == 0:  # no possibility of anything coming up
        return failure_return
    orf_matrix = orf_matrix[:, nonzero_orfs]
    orf_strength_df = orf_strength_df.iloc[nonzero_orfs]
    # don't bother fitting ORFs with zero reads throughout their entire length

    (orf_strs, resid) = nnls(orf_matrix.toarray(), counts)
    min_str = 1e-6  # allow for machine rounding error
    usable_orfs = orf_strs > min_str
    if not usable_orfs.any():
        return failure_return
    orf_strength_df = orf_strength_df[usable_orfs]
    orf_matrix = orf_matrix[:, usable_orfs]  # remove entries for zero-strength ORFs or transcripts
    orf_strs = orf_strs[usable_orfs]
    orf_strength_df['orf_strength'] = orf_strs

    covmat = resid*resid*np.linalg.inv(orf_matrix.T.dot(orf_matrix).toarray())/(nnt*len(rdlens)-len(orf_strength_df))
    # homoscedastic version (assume equal variance at all positions)

    # resids = counts-orf_matrix.dot(orf_strs)
    # simple_covmat = np.linalg.inv(orf_matrix.T.dot(orf_matrix).toarray())
    # covmat = simple_covmat.dot(orf_matrix.T.dot(scipy.sparse.dia_matrix((resids*resids, 0), (len(resids), len(resids))))
    #                            .dot(orf_matrix).dot(simple_covmat))
    # # heteroscedastic version (Eicker-Huber-White robust estimator)

    orf_strength_df['W_orf'] = orf_strength_df['orf_strength']*orf_strength_df['orf_strength']/np.diag(covmat)
    orf_strength_df.set_index('orfname', inplace=True)
    elongating_orfs = ~(orf_strength_df['gstop'] == orf_strength_df['gcoord'])

    if opts.startonly:  # count abortive initiation events towards start strength in this case
        include_starts = (orf_strength_df['tcoord'] != orf_strength_df['tstop'])
        if not include_starts.any():
            return failure_return  # no need to keep going if there weren't any useful starts
        gcoord_grps = orf_strength_df[include_starts].groupby('gcoord')
        # even if we are willing to count abinit towards start strength, we certainly shouldn't count histop
        covmat_starts = covmat[np.ix_(include_starts.values, include_starts.values)]
        orf_strs_starts = orf_strs[include_starts.values]
    else:
        if not elongating_orfs.any():
            return failure_return
        gcoord_grps = orf_strength_df[elongating_orfs].groupby('gcoord')
        covmat_starts = covmat[np.ix_(elongating_orfs.values, elongating_orfs.values)]
        orf_strs_starts = orf_strs[elongating_orfs.values]

    # NOTE(review): DataFrame.from_items was removed in pandas 1.0; this call needs an older pandas
    start_strength_df = pd.DataFrame.from_items([('tfam', tfam),
                                                 ('chrom', orf_set['chrom'].iloc[0]),
                                                 ('strand', orf_set['strand'].iloc[0]),
                                                 ('codon', gcoord_grps['codon'].first()),
                                                 ('start_strength', gcoord_grps['orf_strength'].aggregate(np.sum))])
    start_strength_df['W_start'] = pd.Series(
        {gcoord: orf_strs_starts[rownums].dot(np.linalg.inv(covmat_starts[np.ix_(rownums, rownums)]))
                 .dot(orf_strs_starts[rownums])
         for (gcoord, rownums) in gcoord_grps.indices.items()})

    if not opts.startonly:
        # count histop towards the stop codon - but still exclude abinit
        include_stops = (elongating_orfs | (orf_strength_df['tcoord'] == orf_strength_df['tstop']))
        gstop_grps = orf_strength_df[include_stops].groupby('gstop')
        covmat_stops = covmat[np.ix_(include_stops.values, include_stops.values)]
        orf_strs_stops = orf_strs[include_stops.values]
        stop_strength_df = pd.DataFrame.from_items([('tfam', tfam),
                                                    ('chrom', orf_set['chrom'].iloc[0]),
                                                    ('strand', orf_set['strand'].iloc[0]),
                                                    ('stop_strength', gstop_grps['orf_strength'].aggregate(np.sum))])
        stop_strength_df['W_stop'] = pd.Series(
            {gstop: orf_strs_stops[rownums].dot(np.linalg.inv(covmat_stops[np.ix_(rownums, rownums)]))
                    .dot(orf_strs_stops[rownums])
             for (gstop, rownums) in gstop_grps.indices.items()})

        # # nohistop
        # gstop_grps = orf_strength_df[elongating_orfs].groupby('gstop')
        # covmat_stops = covmat[np.ix_(elongating_orfs.values, elongating_orfs.values)]
        # orf_strs_stops = orf_strs[elongating_orfs.values]
        # stop_strength_df['stop_strength_nohistop'] = gstop_grps['orf_strength'].aggregate(np.sum)
        # stop_strength_df['W_stop_nohistop'] = pd.Series({gstop:orf_strs_stops[rownums].dot(np.linalg.inv(covmat_stops[np.ix_(rownums,rownums)]))
        #                                                  .dot(orf_strs_stops[rownums]) for (gstop, rownums) in gstop_grps.indices.iteritems()})

        return orf_strength_df, start_strength_df, stop_strength_df
    else:
        return orf_strength_df, start_strength_df
def _regress_tfam(orf_set, gnd):
    """Fit ORF strengths for one transcript family by non-negative least squares.

    Performs non-negative least squares regression on all of the ORFs in a transcript family, using
    profiles constructed via _orf_profile(). Also calculates Wald statistics for each ORF and start
    codon, and for each stop codon if opts.startonly is False.

    Besides its arguments, the function reads the module-level names ``bedlinedict``, ``rdlens``,
    ``opts``, ``startnt``, ``stopnt``, ``stopprof`` and ``failure_return``.

    Parameters
    ----------
    orf_set : pandas.DataFrame
        Candidate ORFs. The columns 'tfam', 'strand', 'chrom', 'tid', 'tcoord', 'tstop', 'gcoord',
        'gstop', 'orfname' and 'codon' are read; 'tfam', 'strand' and 'chrom' are taken from the
        first row only.
    gnd
        Read-count source, handed unchanged to get_hashed_counts().

    Returns
    -------
    tuple
        ``(orf_strength_df, start_strength_df)`` if opts.startonly is set, otherwise
        ``(orf_strength_df, start_strength_df, stop_strength_df)``.
        ``failure_return`` is returned instead whenever a filtering step leaves nothing to fit
        (no ORF passes the start-count filter, no ORF overlaps any reads, no ORF receives a
        strength above the rounding threshold, or no ORF is left to contribute to start strengths).
    """
    tfam = orf_set['tfam'].iat[0]
    strand = orf_set['strand'].iat[0]
    chrom = orf_set['chrom'].iat[0]
    tids = orf_set['tid'].drop_duplicates().tolist()

    # Collect the genomic positions covered by each transcript and by the family as a whole
    all_tfam_genpos = set()
    tid_genpos = {}
    tlens = {}
    for tid in tids:
        currtrans = SegmentChain.from_bed(bedlinedict[tid])
        curr_pos_set = currtrans.get_position_set()
        tlens[tid] = len(curr_pos_set)
        tid_genpos[tid] = curr_pos_set
        all_tfam_genpos.update(curr_pos_set)
    tfam_segs = SegmentChain(*positionlist_to_segments(chrom, strand, list(all_tfam_genpos)))
    all_tfam_genpos = np.array(sorted(all_tfam_genpos))
    if strand == '-':
        all_tfam_genpos = all_tfam_genpos[::-1]
    nnt = len(all_tfam_genpos)
    # For each transcript, the indices of its positions within the family-wide position array
    tid_indices = {tid: np.flatnonzero(np.in1d(all_tfam_genpos, list(curr_tid_genpos), assume_unique=True))
                   for (tid, curr_tid_genpos) in tid_genpos.items()}

    hashed_counts = get_hashed_counts(tfam_segs, gnd)
    counts = np.zeros((len(rdlens), nnt), dtype=np.float64)
    # even though they are integer-valued, will need to do float arithmetic
    for (i, rdlen) in enumerate(rdlens):
        for nmis in range(1+opts.max5mis):
            counts[i, :] += hashed_counts[(rdlen, nmis)]
    counts = counts.ravel()

    if opts.startcount:
        # Only include ORFS for which there is at least some minimum reads within one nucleotide of the start codon
        offsetmat = np.tile(nnt*np.arange(len(rdlens)), 3)
        # offsets for each cond, expecting three positions to check for each
        start_windows = [tid_indices[tid][tcoord-1:tcoord+2]
                         for (tid, tcoord, tstop) in orf_set[['tid', 'tcoord', 'tstop']].itertuples(False)]
        orf_set = orf_set[[(counts[(start_idxes.repeat(len(rdlens))+offsetmat)].sum() >= opts.startcount)
                           for start_idxes in start_windows]]
        if orf_set.empty:
            return failure_return

    orf_strength_df = orf_set.sort_values('tcoord', ascending=False).drop_duplicates('orfname').reset_index(drop=True)

    # Abortive-initiation pseudo-ORFs: one per distinct start position
    abort_set = orf_set.drop_duplicates('gcoord').copy()
    abort_set['gstop'] = abort_set['gcoord']
    # should maybe be +/-3, but then need to worry about splicing - and this is an easy flag
    abort_set['tstop'] = abort_set['tcoord']+3  # stop after the first codon
    abort_set['orfname'] = abort_set['gcoord'].apply(lambda x: '%s_%d_abort' % (tfam, x))
    orf_strength_df = pd.concat((orf_strength_df, abort_set), ignore_index=True)

    if not opts.startonly:  # if marking full ORFs, include histop model
        stop_set = orf_set.drop_duplicates('gstop').copy()
        stop_set['gcoord'] = stop_set['gstop']  # this is an easy flag
        stop_set['tcoord'] = stop_set['tstop']
        # should probably be -3 nt, but this is another easy flag that distinguishes from abinit
        stop_set['orfname'] = stop_set['gstop'].apply(lambda x: '%s_%d_stop' % (tfam, x))
        orf_strength_df = pd.concat((orf_strength_df, stop_set), ignore_index=True)

    # Build one profile column (values plus row indices) per ORF / pseudo-ORF
    orf_profs = []
    indices = []
    for (tid, tcoord, tstop) in orf_strength_df[['tid', 'tcoord', 'tstop']].itertuples(False):
        if tcoord != tstop:  # not a histop
            tlen = tlens[tid]
            if tcoord+startnt[0] < 0:
                startadj = -startnt[0]-tcoord
                # number of nts to remove from the start due to short 5' UTR; guaranteed > 0
            else:
                startadj = 0
            if tstop+stopnt[1] > tlen:
                stopadj = tstop+stopnt[1]-tlen
                # number of nts to remove from the end due to short 3' UTR; guaranteed > 0
            else:
                stopadj = 0
            curr_indices = tid_indices[tid][tcoord+startnt[0]+startadj:tstop+stopnt[1]-stopadj]
            orf_profs.append(_orf_profile(tstop-tcoord)[:, startadj:tstop-tcoord+stopnt[1]-startnt[0]-stopadj].ravel())
        else:  # histop
            curr_indices = tid_indices[tid][tstop-6:tstop]
            orf_profs.append(stopprof[:, -6:].ravel())
        # need to tile the indices for each read length
        indices.append(np.concatenate([nnt*i+curr_indices for i in range(len(rdlens))]))
        if len(indices[-1]) != len(orf_profs[-1]):
            raise AssertionError('ORF length does not match index length')

    # better to make it a sparse matrix, even though nnls requires a dense matrix, because of linear algebra to come
    orf_matrix = scipy.sparse.csc_matrix((np.concatenate(orf_profs),
                                          np.concatenate(indices),
                                          np.cumsum([0]+[len(curr_indices) for curr_indices in indices])),
                                         shape=(nnt*len(rdlens), len(orf_strength_df)))
    nonzero_orfs = np.flatnonzero(orf_matrix.T.dot(counts) > 0)
    if len(nonzero_orfs) == 0:  # no possibility of anything coming up
        return failure_return
    orf_matrix = orf_matrix[:, nonzero_orfs]
    orf_strength_df = orf_strength_df.iloc[nonzero_orfs]
    # don't bother fitting ORFs with zero reads throughout their entire length

    (orf_strs, resid) = nnls(orf_matrix.toarray(), counts)
    min_str = 1e-6  # allow for machine rounding error
    usable_orfs = orf_strs > min_str
    if not usable_orfs.any():
        return failure_return
    orf_strength_df = orf_strength_df[usable_orfs]
    orf_matrix = orf_matrix[:, usable_orfs]  # remove entries for zero-strength ORFs or transcripts
    orf_strs = orf_strs[usable_orfs]
    orf_strength_df['orf_strength'] = orf_strs

    covmat = resid*resid*np.linalg.inv(orf_matrix.T.dot(orf_matrix).toarray())/(nnt*len(rdlens)-len(orf_strength_df))
    # homoscedastic version (assume equal variance at all positions)

    # resids = counts-orf_matrix.dot(orf_strs)
    # simple_covmat = np.linalg.inv(orf_matrix.T.dot(orf_matrix).toarray())
    # covmat = simple_covmat.dot(orf_matrix.T.dot(scipy.sparse.dia_matrix((resids*resids, 0), (len(resids), len(resids))))
    #                            .dot(orf_matrix).dot(simple_covmat))
    # # heteroscedastic version (Eicker-Huber-White robust estimator)

    orf_strength_df['W_orf'] = orf_strength_df['orf_strength']*orf_strength_df['orf_strength']/np.diag(covmat)
    orf_strength_df.set_index('orfname', inplace=True)
    elongating_orfs = ~(orf_strength_df['gstop'] == orf_strength_df['gcoord'])

    if opts.startonly:  # count abortive initiation events towards start strength in this case
        include_starts = (orf_strength_df['tcoord'] != orf_strength_df['tstop'])
        if not include_starts.any():
            return failure_return  # no need to keep going if there weren't any useful starts
        gcoord_grps = orf_strength_df[include_starts].groupby('gcoord')
        # even if we are willing to count abinit towards start strength, we certainly shouldn't count histop
        covmat_starts = covmat[np.ix_(include_starts.values, include_starts.values)]
        orf_strs_starts = orf_strs[include_starts.values]
    else:
        if not elongating_orfs.any():
            return failure_return
        gcoord_grps = orf_strength_df[elongating_orfs].groupby('gcoord')
        covmat_starts = covmat[np.ix_(elongating_orfs.values, elongating_orfs.values)]
        orf_strs_starts = orf_strs[elongating_orfs.values]

    # NOTE(review): DataFrame.from_items was removed in pandas 1.0; this call needs an older pandas
    start_strength_df = pd.DataFrame.from_items([('tfam', tfam),
                                                 ('chrom', orf_set['chrom'].iloc[0]),
                                                 ('strand', orf_set['strand'].iloc[0]),
                                                 ('codon', gcoord_grps['codon'].first()),
                                                 ('start_strength', gcoord_grps['orf_strength'].aggregate(np.sum))])
    start_strength_df['W_start'] = pd.Series(
        {gcoord: orf_strs_starts[rownums].dot(np.linalg.inv(covmat_starts[np.ix_(rownums, rownums)]))
                 .dot(orf_strs_starts[rownums])
         for (gcoord, rownums) in gcoord_grps.indices.items()})

    if not opts.startonly:
        # count histop towards the stop codon - but still exclude abinit
        include_stops = (elongating_orfs | (orf_strength_df['tcoord'] == orf_strength_df['tstop']))
        gstop_grps = orf_strength_df[include_stops].groupby('gstop')
        covmat_stops = covmat[np.ix_(include_stops.values, include_stops.values)]
        orf_strs_stops = orf_strs[include_stops.values]
        stop_strength_df = pd.DataFrame.from_items([('tfam', tfam),
                                                    ('chrom', orf_set['chrom'].iloc[0]),
                                                    ('strand', orf_set['strand'].iloc[0]),
                                                    ('stop_strength', gstop_grps['orf_strength'].aggregate(np.sum))])
        stop_strength_df['W_stop'] = pd.Series(
            {gstop: orf_strs_stops[rownums].dot(np.linalg.inv(covmat_stops[np.ix_(rownums, rownums)]))
                    .dot(orf_strs_stops[rownums])
             for (gstop, rownums) in gstop_grps.indices.items()})

        # # nohistop
        # gstop_grps = orf_strength_df[elongating_orfs].groupby('gstop')
        # covmat_stops = covmat[np.ix_(elongating_orfs.values, elongating_orfs.values)]
        # orf_strs_stops = orf_strs[elongating_orfs.values]
        # stop_strength_df['stop_strength_nohistop'] = gstop_grps['orf_strength'].aggregate(np.sum)
        # stop_strength_df['W_stop_nohistop'] = pd.Series({gstop:orf_strs_stops[rownums].dot(np.linalg.inv(covmat_stops[np.ix_(rownums,rownums)]))
        #                                                  .dot(orf_strs_stops[rownums]) for (gstop, rownums) in gstop_grps.indices.iteritems()})

        return orf_strength_df, start_strength_df, stop_strength_df
    else:
        return orf_strength_df, start_strength_df
def check_window(tx_ivc, known_roi, known_offset, known_ref_point, flank_up,
                 flank_down, test_method, test_name, ref_delta=0):
    """Run a window-landmark function on one transcript and compare its output to known values.

    Parameters
    ----------
    tx_ivc : |SegmentChain|
        Test transcript from which the window will be derived

    known_roi : str
        Reference ROI, as a string accepted by :py:meth:`SegmentChain.from_str`

    known_offset : int or numpy.nan
        Expected offset to the start of the ROI (`numpy.nan` if there is no landmark)

    known_ref_point : (str,int,str) or numpy.nan
        Expected landmark position as ("chromosome_name",position,"strand")

    flank_up : int
        Flank upstream of landmark to include in ROI

    flank_down : int
        Flank downstream of landmark to include in ROI

    test_method : function
        Function under test; called as
        `test_method(tx_ivc, flank_up, flank_down, ref_delta=ref_delta)` and expected
        to return `(roi, offset, ref_point)`

    test_name : str
        Name of test (used in failure messages)

    ref_delta : int, optional
        Distance from reference landmark at which to center windows
    """
    prefix = "Failed %s on %s (strand: '%s', up: %s, down: %s). " % (
        test_name,
        str(tx_ivc),
        tx_ivc.spanning_segment.strand,
        flank_up,
        flank_down,
    )
    template = prefix + "%s unequal (%s vs %s)"

    test_roi, test_offset, test_ref_point = test_method(tx_ivc,
                                                        flank_up,
                                                        flank_down,
                                                        ref_delta=ref_delta)
    check_equality(SegmentChain.from_str(known_roi), test_roi)

    # (label, expected, observed) for each scalar output, checked in this order
    comparisons = (
        ("offset", known_offset, test_offset),
        ("ref_point", known_ref_point, test_ref_point),
    )

    # a landmark is absent when the expected offset is NaN, or the expected
    # reference point is a float NaN rather than a tuple
    landmark_missing = numpy.isnan(known_offset) or (
        isinstance(known_ref_point, float) and numpy.isnan(known_ref_point))

    if landmark_missing:
        # without a landmark, both outputs must themselves be NaN
        for label, expected, observed in comparisons:
            assert_true(numpy.isnan(observed),
                        msg=template % (label, expected, observed))
    else:
        for label, expected, observed in comparisons:
            assert_equal(expected,
                         observed,
                         msg=template % (label, expected, observed))
for tid in tfam_val[0] if tid in gene_name_lookup } if not geneset: geneset = set( tfam_val[0] ) # if no gene names available, just use the tids themselves genename = _choose_name(geneset) if genename in new_tfams: multi_names[genename] += 1 genename = '%s_%d' % (genename, multi_names[genename]) new_tfams[genename] = tfam_val for (genename, num_appearances) in multi_names.iteritems(): sys.stderr.write('WARNING: Gene name %s appears %d independent times\n' % (genename, num_appearances)) if opts.verbose: logprint('Saving results') with open(outbedname, 'w') as outbed: with open(outtxtname, 'w') as outtxt: for tfam, (tids, (chrom, strand), genpos) in new_tfams.iteritems(): outbed.write( SegmentChain(*positionlist_to_segments(chrom, strand, list(genpos)), ID=tfam).as_bed()) for tid in tids: outtxt.write('%s\t%s\n' % (tid, tfam)) if opts.verbose: logprint('Tasks complete')
CCCTCCTTCCGCTGGCCCCGACTGC >chr30b:1(+) CCTCCTTCCGCTGGCCCCGACTGCC >chr30b:2(+) CTCCTTCCGCTGGCCCCGACTGCCC >chr30b:3(+) TCCTTCCGCTGGCCCCGACTGCCCC >chr30b:4(+) CCTTCCGCTGGCCCCGACTGCCCCA >chr30b:5(+) CTTCCGCTGGCCCCGACTGCCCCAG """ CROSSMAP1 = [ ( SegmentChain(GenomicSegment("chr50a", 1, 10, "+")), SegmentChain(GenomicSegment("chr50a", 1 + 25 - 1, 10 + 25 - 1, "-")), ), ( SegmentChain(GenomicSegment("chr50a", 19, 26, "+")), SegmentChain(GenomicSegment("chr50a", 19 + 25 - 1, 26 + 25 - 1, "-")), ), ( SegmentChain(GenomicSegment("chr30b", 0, 6, "+")), SegmentChain(GenomicSegment("chr30b", 0 + 25 - 1, 6 + 25 - 1, "-")), ) ] CROSSMAP2 = [ ( SegmentChain(GenomicSegment("chr50a", 1 + 1000, 10 + 1000, "+")),
], "YPL249C-A": [ "YPL249C-A:0-53^291-334(-)", ], 'YBR215W_mRNA_0': ['YBR215W_mRNA_0:0-108^192-2175(+)'], 'YHL001W_mRNA_0': ['YHL001W_mRNA_0:0-146^544-961(+)'], 'YIL018W_mRNA_0': ['YIL018W_mRNA_0:0-30^430-1280(+)'], 'YIL133C_mRNA_0': ['YIL133C_mRNA_0:0-648^938-1007(-)'], 'YIL156W_B_mRNA_0': ['YIL156W_B_mRNA_0:0-41^103-408(+)'], 'YKL006W_mRNA_0': ['YKL006W_mRNA_0:0-157^555-954(+)'], 'YMR194C_B_mRNA_0': ['YMR194C_B_mRNA_0:0-325^397-729(-)'], 'YNL130C_mRNA_0': ['YNL130C_mRNA_0:0-1204^1296-1382(-)'], 'YPL249C_A_mRNA_0': ['YPL249C_A_mRNA_0:0-415^653-697(-)'], } known_juncs = { K: [SegmentChain.from_str(X) for X in V] for K, V in known_juncs.items() } """Annotated splice junctions""" known_juncs_tuples = { "YNL130C": [ ("YNL130C", 53, 145, "-"), ], "YPL249C-A": [ ("YPL249C-A", 53, 291, "-"), ], 'YBR215W_mRNA_0': [ ('YBR215W_mRNA_0', 108, 192, '+'), ], 'YHL001W_mRNA_0': [
def _quantify_tfam(orf_set, gnds):
    """Quantify all ORFs of one transcript family by non-negative least squares.

    Performs non-negative least squares regression to quantify all of the ORFs in a transcript
    family, using a simplified profile consisting of the same three numbers tiled across each ORF.
    All readlengths are treated identically. Regions around start and stop codons are masked in
    accordance with startmask and stopmask.

    Besides its arguments, the function reads the module-level names ``bedlinedict``, ``cdsprof``,
    ``startmask``, ``stopmask`` and ``colnames``.

    Parameters
    ----------
    orf_set : pandas.DataFrame
        ORFs to quantify. The columns 'strand', 'chrom', 'tid', 'tcoord', 'tstop' and 'AAlen' are
        read; 'strand' and 'chrom' are taken from the first row only.
    gnds : iterable
        Read-count sources, paired positionally with ``colnames``; each is passed to
        ``SegmentChain.get_counts()``.

    Returns
    -------
    pandas.DataFrame
        Copy of `orf_set` with an added 'nts_quantified' column and one column per entry of
        ``colnames``. When no nucleotide is usable, 'nts_quantified' is 0 and every quantification
        column is 0.
    """
    strand = orf_set['strand'].iat[0]
    chrom = orf_set['chrom'].iat[0]
    tids = orf_set['tid'].drop_duplicates().tolist()

    # Collect the genomic positions covered by each transcript and by the family as a whole
    all_tfam_genpos = set()
    tid_genpos = {}
    for tid in tids:
        currtrans = SegmentChain.from_bed(bedlinedict[tid])
        curr_pos_set = currtrans.get_position_set()
        tid_genpos[tid] = curr_pos_set
        all_tfam_genpos.update(curr_pos_set)
    all_tfam_genpos = np.array(sorted(all_tfam_genpos))
    if strand == '-':
        all_tfam_genpos = all_tfam_genpos[::-1]
    nnt = len(all_tfam_genpos)
    # For each transcript, the indices of its positions within the family-wide position array
    tid_indices = {
        tid: np.flatnonzero(
            np.in1d(all_tfam_genpos, list(curr_tid_genpos), assume_unique=True))
        for (tid, curr_tid_genpos) in tid_genpos.items()
    }

    orf_matrix = np.zeros((nnt, len(orf_set)))
    ignore_coords = []
    orf_columns = orf_set[['tid', 'tcoord', 'tstop', 'AAlen']].itertuples(False)
    for (orf_num, (tid, tcoord, tstop, AAlen)) in enumerate(orf_columns):
        orf_matrix[tid_indices[tid][tcoord:tstop], orf_num] = np.tile(cdsprof, AAlen + 1)
        ignore_coords.append(
            tid_indices[tid][max(tcoord + startmask[0], 0):tcoord + startmask[1]])
        ignore_coords.append(
            tid_indices[tid][max(tstop + stopmask[0], 0):tstop + stopmask[1]])
    ignore_coords = np.unique(np.concatenate(ignore_coords))
    # mask out all positions within the mask region around starts and stops
    orf_matrix[ignore_coords, :] = 0

    # require at least one valid position, and if >1 ORFs are identical, only include one of them
    valid_orfs = np.array([
        (orf_matrix[:, i] > 0).any()
        and (orf_matrix.T[i, :] != orf_matrix.T[:i, :]).any(1).all()
        for i in range(len(orf_set))
    ])
    orf_matrix[:, ~valid_orfs] = 0  # completely ignore these positions
    # only bother checking nucleotides where there is a valid ORF
    valid_nts = (orf_matrix > 0).any(1)

    orf_res = orf_set.copy()
    if valid_nts.any():
        orf_matrix = orf_matrix[valid_nts, :]
        valid_nt_segs = SegmentChain(*positionlist_to_segments(
            chrom, strand, list(all_tfam_genpos[valid_nts])))
        # the number of nucleotides included in the quantification
        orf_res['nts_quantified'] = (orf_matrix > 0).sum(0)
        for colname, gnd in zip(colnames, gnds):
            # gnd is a HashedReadBAMGenomeArray, but it still works with get_counts(),
            # which will collapse all read lengths to a single array
            orf_res[colname] = nnls(orf_matrix, valid_nt_segs.get_counts(gnd))[0]
        return orf_res
    else:
        orf_res['nts_quantified'] = 0
        for colname in colnames:
            orf_res[colname] = 0.
        return orf_res
crossmap = GenomeHash(_MASKS) for flank_up, flank_down in _FLANKS: for test_name, test_group in _DO_GENERATE_MAX_WINDOW.items(): result_group = _DO_GENERATE_MAX_WINDOW_RESULTS_MASKED[ "%s_%s_%s" % (test_name, flank_up, flank_down)] yield check_maximal_window, test_name, crossmap, test_group, [ result_group ], flank_up, flank_down #=============================================================================== # INDEX: test data #=============================================================================== _MASKS = [ SegmentChain.from_str("2L:7985694-7985744(+)"), SegmentChain.from_str("3R:4519879-4519891(-)"), SegmentChain.from_str("4:50-50000(+)"), ] _TRANSCRIPTS_GFF = """##gff-version 3 3R FlyBase mRNA 4517211 4523544 . - . ID=FBtr0081950;Name=hb-RB;Parent=FBgn0001180;Alias=FBtr0002097,FBtr0002098,CG9786-RB,hb[+]R2.8;Dbxref=FlyBase_Annotation_IDs:CG9786-RB,REFSEQ:NM_169234;score_text=Strongly Supported;score=11 3R FlyBase exon 4517211 4519894 . - . Name=hb:2;Parent=FBtr0081950;parent_type=mRNA 3R FlyBase CDS 4517600 4519876 . - 0 Name=hb-cds;Parent=FBtr0081950;parent_type=mRNA 3R FlyBase exon 4523048 4523544 . - . Name=hb:4;Parent=FBtr0081950;parent_type=mRNA 3R FlyBase mRNA 4516702 4520322 . - . ID=FBtr0081951;Name=hb-RA;Parent=FBgn0001180;Alias=FBtr0002096,FBtr0002097,CG9786-RA,hb[+]R3.2;Dbxref=FlyBase_Annotation_IDs:CG9786-RA,REFSEQ:NM_169233;score_text=Strongly Supported;score=11 3R FlyBase exon 4516702 4519894 . - . Name=hb:1;Parent=FBtr0081951;parent_type=mRNA 3R FlyBase CDS 4517600 4519876 . - 0 Name=hb-cds;Parent=FBtr0081951;parent_type=mRNA 3R FlyBase exon 4520178 4520322 . - . Name=hb:3;Parent=FBtr0081951;parent_type=mRNA
], "YPL249C-A": [ "YPL249C-A:0-53^291-334(-)", ], 'YBR215W_mRNA_0': ['YBR215W_mRNA_0:0-108^192-2175(+)'], 'YHL001W_mRNA_0': ['YHL001W_mRNA_0:0-146^544-961(+)'], 'YIL018W_mRNA_0': ['YIL018W_mRNA_0:0-30^430-1280(+)'], 'YIL133C_mRNA_0': ['YIL133C_mRNA_0:0-648^938-1007(-)'], 'YIL156W_B_mRNA_0': ['YIL156W_B_mRNA_0:0-41^103-408(+)'], 'YKL006W_mRNA_0': ['YKL006W_mRNA_0:0-157^555-954(+)'], 'YMR194C_B_mRNA_0': ['YMR194C_B_mRNA_0:0-325^397-729(-)'], 'YNL130C_mRNA_0': ['YNL130C_mRNA_0:0-1204^1296-1382(-)'], 'YPL249C_A_mRNA_0': ['YPL249C_A_mRNA_0:0-415^653-697(-)'], } known_juncs = { K: [SegmentChain.from_str(X) for X in V] for K, V in known_juncs.items() } """Annotated splice junctions""" all_known_juncs = [] for v in known_juncs.values(): all_known_juncs.extend(v) known_juncs_as_tuples = { "YNL130C": [ ("YNL130C", 53, 145, "-"), ], "YPL249C-A": [ ("YPL249C-A", 53, 291, "-"), ],
def check_maximal_window(test_name, genome_hash, test_group, result_groups,
                         flank_up, flank_down):
    """Compare maximal-spanning-window output for a group of transcripts to expected results.

    Windows are built with `group_regions_make_windows` around `window_cds_start`,
    sorted by region, and matched in order against the sorted expected results
    that have a landmark.

    Parameters
    ----------
    test_name : str
        Descriptive name of test

    genome_hash : GenomeHash
        Mask hash

    test_group : list
        List of transcript IDs, used as keys into the module-level `_TRANSCRIPTS`

    result_groups : list
        list of tuples of (region_str, alignment_offset, zero_point), optionally
        followed by a fourth element holding the expected masked value. Entries
        whose alignment_offset or zero_point is NaN are expected to produce no
        output row.

    flank_up : int
        Bases to include upstream of landmark in window

    flank_down : int
        bases to include downstream of landmark in window
    """
    # table keys:
    #    gene_id
    #    window_size
    #    roi
    #    masked
    #    alignment_offset
    #    zero_point
    err_str = ("Failed %s (up: %s, down: %s). " %
               (test_name, flank_up, flank_down)) + "%s unequal (%s vs %s)"
    tx_ivcs = (_TRANSCRIPTS[X] for X in test_group)
    roi_table = group_regions_make_windows(tx_ivcs, genome_hash, flank_up,
                                           flank_down, window_cds_start)

    # DataFrame.sort(columns=...) was removed from pandas; prefer sort_values when present
    if hasattr(roi_table, "sort_values"):
        roi_table.sort_values(by=["region"], inplace=True)
    else:
        roi_table.sort(columns=["region"], inplace=True)
    trows = [X[1] for X in roi_table.iterrows()]
    result_groups = sorted(result_groups, key=lambda x: x[0])

    # number of expected results that have a landmark, i.e. that consume an output row
    n_rows_checked = 0
    for result_group in result_groups:
        # if no landmark, the input is expected to have produced no output row
        if numpy.isnan(result_group[1]) or numpy.isnan(result_group[2]):
            continue

        trow = trows[n_rows_checked]
        check_equality(SegmentChain.from_str(result_group[0]),
                       SegmentChain.from_str(trow["region"]), test_name)
        assert_equal(result_group[1],
                     trow["alignment_offset"],
                     msg=err_str % ("offset", result_group[1],
                                    trow["alignment_offset"]))
        assert_equal(result_group[2],
                     trow["zero_point"],
                     msg=err_str % ("ref_point", result_group[2],
                                    trow["zero_point"]))
        if len(result_group) == 4:
            assert_equal(result_group[3],
                         trow["masked"],
                         msg=err_str % ("mask", result_group[3],
                                        trow["masked"]))
        n_rows_checked += 1

    # every output row must correspond to exactly one expected result with a landmark
    assert_equal(n_rows_checked, len(roi_table))
def setUpClass(cls):
    """Build the `cls.ivcs` fixture: plus-strand chains and their shifted minus-strand counterparts.

    Every entry holds three single-segment chains on "chrA" derived from the same
    (start, end) spans. Minus-strand entries shift both coordinates by
    `k - 1 - 2 * offset`.
    """
    # (start, end) of the plus-strand reference segments
    spans = ((0, 100), (50, 100), (50, 51))

    # (dictionary key, k, offset) for each minus-strand variant, in insertion order
    minus_variants = (
        ("minus_k25_off0", 25, 0),
        ("minus_k50_off0", 50, 0),
        ("minus_k25_off10", 25, 10),
        ("minus_k50_off10", 50, 10),
    )

    fixtures = {
        "plus": [
            SegmentChain(GenomicSegment("chrA", start, end, "+"))
            for start, end in spans
        ],
    }
    for key, k, offset in minus_variants:
        shift = k - 1 - 2 * offset
        fixtures[key] = [
            SegmentChain(GenomicSegment("chrA", start + shift, end + shift, "-"))
            for start, end in spans
        ]

    cls.ivcs = fixtures
known_juncs = { "YNL130C" : ["YNL130C:0-53^145-180(-)",], "YPL249C-A" : ["YPL249C-A:0-53^291-334(-)",], 'YBR215W_mRNA_0' : ['YBR215W_mRNA_0:0-108^192-2175(+)'], 'YHL001W_mRNA_0' : ['YHL001W_mRNA_0:0-146^544-961(+)'], 'YIL018W_mRNA_0' : ['YIL018W_mRNA_0:0-30^430-1280(+)'], 'YIL133C_mRNA_0' : ['YIL133C_mRNA_0:0-648^938-1007(-)'], 'YIL156W_B_mRNA_0': ['YIL156W_B_mRNA_0:0-41^103-408(+)'], 'YKL006W_mRNA_0' : ['YKL006W_mRNA_0:0-157^555-954(+)'], 'YMR194C_B_mRNA_0': ['YMR194C_B_mRNA_0:0-325^397-729(-)'], 'YNL130C_mRNA_0' : ['YNL130C_mRNA_0:0-1204^1296-1382(-)'], 'YPL249C_A_mRNA_0': ['YPL249C_A_mRNA_0:0-415^653-697(-)'], } known_juncs = { K : [SegmentChain.from_str(X) for X in V] for K,V in known_juncs.items() } """Annotated splice junctions""" known_juncs_tuples = { "YNL130C" : [("YNL130C",53,145,"-"),], "YPL249C-A" : [("YPL249C-A",53,291,"-"),], 'YBR215W_mRNA_0' : [('YBR215W_mRNA_0',108,192,'+'),], 'YHL001W_mRNA_0' : [('YHL001W_mRNA_0',146,544,'+'),], 'YIL018W_mRNA_0' : [('YIL018W_mRNA_0',30,430,'+'),], 'YIL133C_mRNA_0' : [('YIL133C_mRNA_0',648,938,'-'),], 'YIL156W_B_mRNA_0': [('YIL156W_B_mRNA_0',41,103,'+'),], 'YKL006W_mRNA_0' : [('YKL006W_mRNA_0',157,555,'+'),], 'YMR194C_B_mRNA_0': [('YMR194C_B_mRNA_0',325,397,'-'),], 'YNL130C_mRNA_0' : [('YNL130C_mRNA_0',1204,1296,'-'),], 'YPL249C_A_mRNA_0': [('YPL249C_A_mRNA_0',415,653,'-'),],