def testDisjointIterators(self):
    """Two iterators opened with ``multiple_iterators=True`` start at the same record."""
    # two iterators working on the same file
    tabix = pysam.TabixFile(self.filename)
    # Use the next() builtin: the ``.next()`` method only exists on
    # Python 2 iterators, while next() works on Python 2.6+ and Python 3.
    a = next(tabix.fetch(parser=pysam.asGTF(), multiple_iterators=True))
    b = next(tabix.fetch(parser=pysam.asGTF(), multiple_iterators=True))
    # both iterators are at top of file
    self.assertEqual(str(a), str(b))
def _strand_aware_gene_names(chrom, start, end, strand):
    """Return strand-adjusted names of 'gene' records fetched for chrom:start-end."""
    names = []
    for gtf in tabixfile.fetch(chrom, start, end, parser=pysam.asGTF()):
        if gtf.feature != 'gene':
            continue
        name = gtf.gene_name
        if gtf.strand == strand or strand == 'U':
            names.append(name)
        elif name[-4:] == '-AS1':
            # Opposite strand of an antisense gene: report the sense name.
            names.append(name[:-4])
        else:
            # Opposite strand of a sense gene: report it as antisense.
            names.append(name + '-AS1')
    return names


def getGene(chr, start, end, strand):
    """Return the ';'-joined gene names assigned to the interval.

    Gene records are looked up in the module-level ``tabixfile``. Genes found
    at both the start and the end position are preferred; if there are none,
    every gene record fetched for ``start - 1 .. end - 1`` is used instead.
    When more than one name remains, names ending in ``-AS1`` are dropped as
    long as at least one other name is left.

    Args:
        chr: Contig name passed to ``tabixfile.fetch``.
        start: Interval start; the query window is ``start - 1 .. start``.
        end: Interval end; the query window is ``end - 1 .. end``.
        strand: Strand of the query; ``'U'`` matches genes on either strand.

    Returns:
        str: Gene names joined with ``';'`` (empty string if none). Names are
        sorted so the result does not depend on set iteration order.
    """
    at_start = set(_strand_aware_gene_names(chr, start - 1, start, strand))
    at_end = set(_strand_aware_gene_names(chr, end - 1, end, strand))
    genes = at_start & at_end
    if not genes:
        genes = set(_strand_aware_gene_names(chr, start - 1, end - 1, strand))
    if len(genes) > 1:
        sense_only = [name for name in genes if name[-4:] != '-AS1']
        if sense_only:
            genes = sense_only
    return ';'.join(sorted(genes))
def testJoinedIterators(self):
    """Two iterators opened without ``multiple_iterators`` yield consecutive records."""
    # two iterators working on the same file
    tabix = pysam.TabixFile(self.filename)
    # next() builtin instead of the Python-2-only ``.next()`` method.
    a = next(tabix.fetch(parser=pysam.asGTF()))
    b = next(tabix.fetch(parser=pysam.asGTF()))
    # the first two lines differ only by the feature field
    self.assertEqual(a.feature, "UTR")
    self.assertEqual(b.feature, "exon")
    self.assertEqual(re.sub("UTR", "", str(a)),
                     re.sub("exon", "", str(b)))
def subset_of_feature_in_region(self, contig=None, start=None, end=None, types=None):
    '''Yield selected GTF attributes for every record fetched from a region.

    Args:
        contig, start, end: Forwarded to ``pysam.Tabixfile.fetch``.
        types: Either a single attribute name (``str``) or a ``list`` of
            attribute names. Any other value yields nothing.

    Yields:
        For a ``str``: the attribute value of each record that has it.
        For a ``list``: one dict per record holding the attributes found.

    A message is printed for every requested attribute a record lacks.

    Example:
        # return dict of rna and tss id
        for rna_tss in tabix.subset_of_feature_in_region(contig="chr2L", end=9839, types=["transcript_id", 'tss_id']):
            print(rna_tss)
    '''
    for gtf in pysam.Tabixfile.fetch(self.tabixfile, contig, start, end, parser=pysam.asGTF()):
        if isinstance(types, str):
            try:
                value = gtf.asDict()[types]
            except KeyError:
                # The original formatted this message with an undefined
                # name ``t``, turning a missing key into a NameError.
                print('key \'{0}\' is not found in {1}'.format(types, self.ingtf))
            else:
                yield value
        elif isinstance(types, list):
            attributes = gtf.asDict()
            tmp = dict()
            for t in types:
                try:
                    tmp[t] = attributes[t]
                except KeyError:
                    print('key \'{0}\' is not found in {1}'.format(t, self.ingtf))
            yield tmp
def gene_in_region(self, contig=None, start=None, end=None):
    """Yield the ``gene_name`` attribute of every record fetched from a region.

    Args:
        contig, start, end: Forwarded to ``pysam.Tabixfile.fetch``.

    Yields:
        The ``gene_name`` value of each record that has one. Records without
        it are skipped after printing a message.
    """
    for gtf in pysam.Tabixfile.fetch(self.tabixfile, contig, start, end, parser=pysam.asGTF()):
        try:
            name = gtf.asDict()['gene_name']
        except KeyError:
            # The original referenced an undefined ``t`` here (NameError).
            print('key \'{0}\' is not found in {1}'.format('gene_name', self.ingtf))
        else:
            yield name
def coverage(bam_paths, gtf_path, transcript_ids=None, verbose=False, agg_func=None):
    """Build a coverage frame for the exon records of a tabix-indexed GTF.

    Args:
        bam_paths: BAM paths; each becomes one column of the result.
        gtf_path: Path of the tabix-indexed GTF file.
        transcript_ids: Optional iterable; when given, only exon records whose
            ``transcript_id`` is in it are used.
        verbose: Wrap the record stream in ``tqdm`` when true.
        agg_func: Forwarded to ``_coverage_gen``.

    Returns:
        pandas.DataFrame indexed by
        ``['transcript_id', 'chr', 'start', 'end', 'strand']``.
    """
    index_names = ['transcript_id', 'chr', 'start', 'end', 'strand']

    # Setup record iterator from gtf file.
    gtf_file = pysam.Tabixfile(gtf_path, parser=pysam.asGTF())
    try:
        gtf_records = (rec for rec in gtf_file.fetch()
                       if rec.feature == 'exon')

        if transcript_ids is not None:
            transcript_ids = set(transcript_ids)
            gtf_records = (rec for rec in gtf_records
                           if rec['transcript_id'] in transcript_ids)

        if verbose:
            gtf_records = tqdm(gtf_records, leave=False)

        # Build frame. The record generators are lazy, so the file has to
        # stay open until from_records has consumed the rows.
        rows = _coverage_gen(bam_paths, gtf_records, agg_func=agg_func)
        result = pd.DataFrame.from_records(
            rows, columns=index_names + list(bam_paths))
    finally:
        # The original never closed the tabix handle.
        gtf_file.close()

    return result.set_index(index_names)
def read_gtf(lines, scaffolds, contig_prefix):
    """Parse GTF lines into a nested lookup table.

    Records for which ``filter_gtf_record`` returns a true value are skipped;
    all others are passed to ``update_gtf_table``.

    Returns:
        dict: gene_id -> transcript_id -> exon_number -> feature -> [items]
    """
    table = {}
    records = text.parse_lines(lines, pysam.asGTF())
    for record in records:
        if filter_gtf_record(record):
            continue
        update_gtf_table(table, record, scaffolds, contig_prefix)
    return table
def _TestMultipleIteratorsHelper(filename, multiple_iterators):
    """Start a GTF fetch on a tabix file, close the file, return the iterator.

    The file object only lives inside this function: it is closed before the
    iterator is handed back to the caller.
    """
    handle = pysam.TabixFile(filename)
    records = handle.fetch(
        parser=pysam.asGTF(),
        multiple_iterators=multiple_iterators,
    )
    handle.close()
    return records
def testCopy(self):
    """A shallow copy of a tuple record and of a GTF record equals its source."""
    # next() builtin instead of the Python-2-only ``.next()`` method.
    a = next(self.tabix.fetch(parser=pysam.asTuple()))
    b = copy.copy(a)
    self.assertEqual(a, b)

    a = next(self.tabix.fetch(parser=pysam.asGTF()))
    b = copy.copy(a)
    self.assertEqual(a, b)
def getgenenames(coords, gtf):
    """Map each event to the single gene shared by both of its isoforms.

    Args:
        coords: Mapping ``event -> (isoform1, isoform2)`` where each isoform
            is indexable as ``(chrom, start, end, strand)``.
        gtf: Path of the tabix-indexed GTF file.

    Returns:
        dict: ``{event: gene_name}`` for events whose two isoforms overlap
        exactly one common gene. Other events are reported on stdout and
        left out.
    """
    eventstogenes = {}  # {event : gene}
    tabixfile = pysam.Tabixfile(gtf)
    try:
        for event in coords:
            genes_per_isoform = []
            for isoform in (coords[event][0], coords[event][1]):
                chrm, start, end = isoform[0], isoform[1], isoform[2]
                # The original also passed the strand as a fourth positional
                # argument, which lands in fetch()'s ``region`` parameter
                # rather than filtering by strand, so it is no longer passed.
                # NOTE(review): pysam appears to rebuild ``region`` from
                # reference/start/end, so results should be unchanged --
                # confirm against the installed pysam version.
                genes_per_isoform.append(set(
                    entry.gene_name
                    for entry in tabixfile.fetch(chrm, start, end,
                                                 parser=pysam.asGTF())))

            # Get genes that overlap both isoforms
            isoformintersection = genes_per_isoform[0] & genes_per_isoform[1]

            if len(isoformintersection) > 1:
                print('WARNING: more than one gene found for event {0}.'.format(event))
                print('{0} {1}'.format(event, list(isoformintersection)))
            elif len(isoformintersection) == 1:
                eventstogenes[event] = next(iter(isoformintersection))
            else:
                print('No gene found for event {0}!!'.format(event))
    finally:
        # The original never closed the tabix handle.
        tabixfile.close()

    print('Found genes for {0} of {1} events.'.format(len(eventstogenes), len(coords)))
    return eventstogenes
def _TestMultipleIteratorsHelper(filename, multiple_iterators):
    '''Return a GTF iterator whose tabix file was opened and closed in here.'''
    tabix_handle = pysam.TabixFile(filename)
    gtf_parser = pysam.asGTF()
    result = tabix_handle.fetch(parser=gtf_parser,
                                multiple_iterators=multiple_iterators)
    # Close the file before returning the iterator to the caller.
    tabix_handle.close()
    return result
def __init__(self, filename, **kwargs):
    """Open a GTF file through the parent class, compressing it first if needed.

    Args:
        filename: Path of the GTF file (converted with ``str``). If it does
            not end in ``.gz``, an existing ``<path>.gz`` sibling is used;
            otherwise ``self.compress(path, create_index=True)`` supplies
            the path to open.
        **kwargs: Forwarded to the parent initializer.
    """
    file_path = str(filename)
    if not file_path.endswith('.gz'):
        if os.path.exists(file_path + '.gz'):
            file_path += '.gz'
        else:
            file_path = self.compress(file_path, create_index=True)
    # Open the resolved path. The original passed ``filename`` here, which
    # discarded the resolved path computed above.
    super().__init__(file_path, parser=pysam.asGTF(), **kwargs)
def get_part_from_gtf(annotation, reference=None, feature="CDS"):
    """Return the records of one feature type from a GTF annotation.

    Args:
        annotation: File name of a compressed and tabix-indexed GTF,
            e.g. ``"0-References/genome.gtf.gz"``.
        reference: Contig name as used in the GTF file (e.g. ``'I'``,
            ``'XII'`` or ``'chr1'``); ``None`` applies no contig restriction.
        feature: Feature type to keep, e.g. ``'CDS'`` or any other value
            found in the GTF feature column.

    Returns:
        list: Parsed GTF records whose ``feature`` equals ``feature``.
    """
    tabixfile = pysam.TabixFile(annotation, parser=pysam.asGTF())
    try:
        return [gtf for gtf in tabixfile.fetch(reference=reference)
                if gtf.feature == feature]
    finally:
        # The original left the tabix handle open.
        tabixfile.close()
def __init__(self, file_path):
    """Open a GTF file through the parent class, compressing it first if needed.

    A path that does not end in ``.gz`` is replaced by an existing
    ``<path>.gz`` sibling, or else by the result of ``self.compress(path)``.
    """
    path = str(file_path)
    if not path.endswith('.gz'):
        gz_path = path + '.gz'
        if os.path.exists(gz_path):
            path = gz_path
        else:
            path = self.compress(path)
    super().__init__(path, parser=pysam.asGTF())
def filterOut(chr, exfeat, pos):
    """Return 1 if a record fetched at ``pos`` has a feature listed in ``exfeat``, else 0.

    Uses the module-level ``fo`` flag, ``contigo`` contig collection and
    ``tabixo`` tabix handle. Feature names from the file are lower-cased
    before comparison. Returns 0 without querying when ``exfeat`` is empty,
    ``fo`` is false, or ``chr`` is not in ``contigo``.
    """
    if len(exfeat) == 0:
        return 0
    if not fo or chr not in contigo:
        return 0
    records = tabixo.fetch(reference=chr, start=pos, end=pos + 1,
                           parser=pysam.asGTF())
    present = [record.feature.lower() for record in records]
    if any(name in present for name in exfeat):
        return 1
    return 0
def _open_file(self):
    # type: (...) -> pysam.TabixFile
    """Yield the GTF at ``self._gtf_path`` as an open tabix file, then close it."""
    path = native_str(self._gtf_path)
    handle = pysam.TabixFile(path, parser=pysam.asGTF())
    try:
        # Hand the open file to the caller.
        yield handle
    finally:
        # Runs when the generator is resumed or closed.
        handle.close()
def testRead(self):
    """Each fetched GTF record matches the corresponding reference row."""
    records = self.tabix.fetch(parser=pysam.asGTF())
    for index, record in enumerate(records):
        expected = self.compare[index]
        # same number of fields, same field values, same text rendering
        self.assertEqual(len(expected), len(record))
        self.assertEqual(list(expected), list(record))
        self.assertEqual(expected, str(record).split("\t"))
        # attribute access
        self.assertTrue(record.gene_id.startswith("ENSG"))
        if record.feature != 'gene':
            self.assertTrue(record.transcript_id.startswith("ENST"))
        self.assertEqual(expected[0], record.contig)
def load_kg_gtf(gtf_file_name):
    """Load every record of a tabix-indexed GTF file as a ``GFFFeature``.

    Args:
        gtf_file_name: Path of the tabix-indexed GTF file.

    Returns:
        list: One ``GFFFeature`` per record, in file order, with attributes
        parsed by ``parse_gtf_attr``.
    """
    f = pysam.TabixFile(gtf_file_name)
    feats = []
    try:
        for row in f.fetch(parser=pysam.asGTF()):
            attr = parse_gtf_attr(row.attributes)
            # The original passed bare ``score``, ``strand`` and ``frame``,
            # which are not defined in this function; take them from the
            # record like the other columns.
            feats.append(GFFFeature(row.seqname, row.source, row.feature,
                                    int(row.start), int(row.end), row.score,
                                    row.strand, row.frame, attr))
    finally:
        # The original never closed the tabix handle.
        f.close()
    return feats
def read_transcripts(gtf_file, region, genome=None, retry=0):
    u"""
    Read transcripts from a tabix-indexed GTF file into ``region``.

    The original function checked whether the junctions correspond to any
    existing exons; that check is disabled here.

    :param gtf_file: path to a bgzipped GTF file (with tabix index), containing only ordered exons
    :param region: splice region; records are added through ``region.add_gtf``
    :param genome: path to a genome FASTA file; when given, ``region.sequence`` is filled from it
    :param retry: number of retries already made after a chromosome-name
                  mismatch between GTF and input, e.g. chr9:1-100:+ <-> 9:1-100:+
    :return: SpliceRegion
    """
    if not os.path.exists(gtf_file):
        raise FileNotFoundError("%s not found" % gtf_file)

    try:
        logger.info("Reading from %s" % gtf_file)

        if genome:
            with pysam.FastaFile(genome) as fa:
                region.sequence = fa.fetch(region.chromosome, region.start - 1, region.end + 1)

        with pysam.Tabixfile(gtf_file, 'r') as gtf_tabix:
            relevant_exons_iterator = gtf_tabix.fetch(
                region.chromosome,
                region.start - 1,
                region.end + 1,
                parser=pysam.asGTF()
            )

            for line in relevant_exons_iterator:
                try:
                    region.add_gtf(line)
                except IndexError as err:
                    logger.error(err)

    except ValueError as err:
        # ``warn`` is a deprecated alias of ``warning``.
        logger.warning(err)

        # handle the mismatch of chromosomes here
        if retry < 2:
            if not region.chromosome.startswith("chr"):
                logger.info("Guess need 'chr'")
                region.chromosome = "chr" + region.chromosome
            else:
                logger.info("Guess 'chr' is redundant")
                # Strip only the leading prefix; ``replace("chr", "")`` would
                # also remove later occurrences inside the contig name.
                region.chromosome = region.chromosome[len("chr"):]

            # NOTE(review): ``genome`` is not forwarded, so the retry never
            # re-reads ``region.sequence`` -- confirm this is intended.
            return read_transcripts(gtf_file=gtf_file, region=region, retry=retry + 1)

    return region
def testSetting(self):
    """Assign to every standard field of each fetched GTF record."""
    for record in self.tabix.fetch(parser=pysam.asGTF()):
        # text columns get a marker suffix
        record.contig += "_test"
        record.source += "_test"
        record.feature += "_test"
        # coordinates are shifted
        record.start += 10
        record.end += 10
        # remaining columns are overwritten
        record.score = 20
        record.strand = "+"
        record.frame = 0
        record.attributes = 'gene_id "0001";'
def __init__(self, file_path):
    """Open an existing GTF file through the parent class, compressing it if needed.

    Raises:
        IOError: If ``file_path`` does not exist.
    """
    path = str(file_path)
    if not os.path.exists(path):
        raise IOError('File does not exist ({})'.format(path))
    if not path.endswith('.gz'):
        gz_path = path + '.gz'
        if os.path.exists(gz_path):
            # A compressed sibling already exists: use it.
            path = gz_path
        else:
            path = self.compress(path)
    super().__init__(path, parser=pysam.asGTF())
def fetch_gtf(self, contig=None, start=None, end=None):
    '''Yield the GTF-parsed records of ``self.tabixfile`` for the given region.

    Example:
        # return dict of each GTF line
        for _ in tabix.fetch_gtf(contig="chr2L", end=9839):
            print _.asDict()
    '''
    records = pysam.Tabixfile.fetch(
        self.tabixfile, contig, start, end, parser=pysam.asGTF())
    for record in records:
        yield record
def from_gtf(
        cls,
        gtf_path,  # type: pathlib.Path
        chromosomes=None,  # type: List[str]
        record_filter=None  # type: Callable[[Any], bool]
):
    # type: (...) -> TranscriptReference
    """Builds a Reference instance from the given GTF file.

    Args:
        gtf_path: Path of the tabix-indexed GTF file.
        chromosomes: Contigs to load; defaults to all contigs of the file.
        record_filter: Optional predicate; only records for which it returns
            a true value are used.

    Returns:
        ``cls(transcript_trees, exon_trees)`` where ``transcript_trees`` maps
        each chromosome to an interval tree of its transcripts and
        ``exon_trees`` maps each transcript id to an interval tree of its
        exons.
    """

    def _transcript_id(rec):
        return rec.transcript_id

    transcript_trees = {}
    exon_trees = {}

    # Open gtf file.
    gtf = pysam.TabixFile(native_str(gtf_path), parser=pysam.asGTF())
    try:
        if chromosomes is None:
            chromosomes = gtf.contigs

        # Build the trees.
        for chrom in chromosomes:
            # Collect exons and transcripts.
            transcripts = []
            exons = []

            records = gtf.fetch(reference=chrom)
            if record_filter is not None:
                records = (rec for rec in records if record_filter(rec))

            for record in records:
                if record.feature == 'transcript':
                    transcripts.append(cls._record_to_transcript(record))
                elif record.feature == 'exon':
                    exons.append(cls._record_to_exon(record))

            # Build transcript lookup tree.
            transcript_trees[chrom] = IntervalTree.from_tuples(
                (tr.start, tr.end, tr) for tr in transcripts)

            # Build exon lookup tree. groupby only merges adjacent items,
            # hence the sort by the same key first.
            exons = sorted(exons, key=_transcript_id)
            for tr_id, grp in itertools.groupby(exons, key=_transcript_id):
                exon_trees[tr_id] = IntervalTree.from_tuples(
                    (exon.start, exon.end, exon) for exon in grp)
    finally:
        # The original never closed the tabix handle.
        gtf.close()

    return cls(transcript_trees, exon_trees)
# NOTE(review): this definition is not valid Python as written. After the
# first Python statements the body switches to brace-delimited, C++-style
# code (``for(std::size_t i = 0; ...)``, ``//`` comments) that refers to
# ``exons``/``junction`` instead of the ``exon_set``/``junc`` names bound
# above. It looks like an unfinished port -- TODO confirm the intended logic
# and translate the remainder before this module can be imported.
# Also note that ``len(exon_set)`` is applied to the result of ``filter``,
# which only works where ``filter`` returns a list (Python 2).
def overlap_annotation(junc, anno): ts_set = anno.fetch(junc.chrom, junc.start, junc.end, parser=pysam.asGTF()) exon_set = filter(lambda x: x.feature=='exon', [i for i in ts_set]) for idx,exon in enumerate(exon_set): if idx == len(exon_set) - 1: continue for(std::size_t i = 0; i < exons.size(); i++) { if(exons[i].start > junction.end) { //No need to look any further //the rest of the exons are outside the junction break; } //known junction if(exons[i].end == junction.start && exons[i + 1].start == junction.end) { junction.known_acceptor = true; junction.known_donor = true; junction.known_junction = true; known_junction = true; } else { if(!junction_start) { if(exons[i].end >= junction.start) { junction_start = true; } } if(junction_start) { if(exons[i].start > junction.start && exons[i].end < junction.end) { junction.exons_skipped.insert(exons[i].name); } if(exons[i].start > junction.start) { junction.donors_skipped.insert(exons[i].start); } if(exons[i].end < junction.end) { junction.acceptors_skipped.insert(exons[i].end); } if(exons[i].end == junction.start) { junction.known_donor = true; } //TODO - check for last exon if(exons[i].start == junction.end) { junction.known_acceptor = true; } } } }
def dfgene(self):
    """Return the gene table of ``self.gtffile``, building and caching it on first use.

    The frame has one row per 'gene' record (later records with the same
    gene id replace earlier ones) with columns Chrom, Start, End, Strand and
    GeneID. It is cached in ``self._data['dfgene']``.
    """
    if 'dfgene' in self._data:
        return self._data['dfgene']

    genes_by_id = {}
    with pysam.TabixFile(self.gtffile) as tbx:  # pylint: disable=maybe-no-member
        for record in tbx.fetch(parser=pysam.asGTF()):  # pylint: disable=maybe-no-member
            if record.feature != 'gene':
                continue
            genes_by_id[record.gene_id] = (
                record.contig, record.start, record.end,
                record.strand, record.gene_id)

    column_types = [('Chrom', 'S10'), ('Start', 'i4'), ('End', 'i4'),
                    ('Strand', 'S1'), ('GeneID', 'S30')]
    gene_array = np.array(list(genes_by_id.values()), dtype=column_types)
    self._data['dfgene'] = pd.DataFrame(gene_array)
    return self._data['dfgene']
def _parent_or_id(attributes):
    """Return the ``Parent=`` value of an attribute string, else its ``ID=`` value."""
    hit = re.search(r'Parent=(.*?)[;:\.]', attributes)
    if hit is None:
        hit = re.search(r'ID=(.*?)[;:\\.]', attributes)
    # Raises AttributeError when neither pattern matched.
    return hit.group(1)


def annotate_inv(genes, chrom, start, end, strand):
    """Summarise the annotation overlapping an interval.

    Args:
        genes: Tabix handle providing ``fetch``.
        chrom, start, end: Interval to query.
        strand: Strand of the query; records on it count as sense.

    Returns:
        ``None`` when nothing overlaps. Otherwise ``[label, parents]`` where
        ``parents`` is the '|'-joined parent set and ``label`` is
        ``'antisense'`` (only opposite-strand records found), ``'intron'``
        (no sense feature left after dropping gene/exon/protein/mRNA), the
        first entry of ``PRIORITY`` present among the remaining features, or
        those features joined by '|'.
    """
    overlapping = list(genes.fetch(chrom, start, end, parser=pysam.asGTF()))
    if not overlapping:
        return None

    sense_feats, sense_parents = set(), set()
    anti_feats, anti_parents = set(), set()
    for record in overlapping:
        if record.strand == strand:
            feat_bucket, parent_bucket = sense_feats, sense_parents
        else:
            feat_bucket, parent_bucket = anti_feats, anti_parents
        feat_bucket.add(record.feature)
        parent_bucket.add(_parent_or_id(record.attributes))

    if not sense_feats and anti_feats:
        return ['antisense', '|'.join(anti_parents)]

    remaining = sense_feats.difference({'gene', 'exon', 'protein', 'mRNA'})
    if not remaining:
        label = 'intron'
    else:
        for ftype in PRIORITY:
            if ftype in remaining:
                label = ftype
                break
        else:
            label = '|'.join(remaining)
    return [label, '|'.join(sense_parents)]
def strand_info(self, contig=None, start=None, end=None):
    """Return the strand of the first record in the region that has one.

    Args:
        contig, start, end: Forwarded to ``pysam.Tabixfile.fetch``.

    Returns:
        The first non-empty ``strand`` value found, or ``"."`` if none is.
    """
    records = pysam.Tabixfile.fetch(
        self.tabixfile, contig, start, end, parser=pysam.asGTF())
    for record in records:
        if record.strand:
            return record.strand
    return "."
def testSetting(self):
    """Values assigned to a GTF record show up in its string rendering."""
    expected_fragments = (
        "_test_contig",
        "_test_source",
        "_test_feature",
        "gene_id \"0001\"",
        "transcript_id \"0002\"",
    )
    for record in self.tabix.fetch(parser=pysam.asGTF()):
        record.contig += "_test_contig"
        record.source += "_test_source"
        record.feature += "_test_feature"
        record.start += 10
        record.end += 10
        record.score = 20
        record.strand = "+"
        record.frame = 0
        record.attributes = 'gene_id "0001";'
        record.transcript_id = "0002"
        rendered = str(record)
        for fragment in expected_fragments:
            self.assertTrue(fragment in rendered)
def main():
    """Print the transcripts of a GTF file whose ``frac`` meets a threshold.

    Command line: ``[--frac FLOAT] gtf_file``. A 'transcript' record is
    printed when its ``frac`` attribute is >= ``--frac``; an 'exon' record
    is printed when its transcript was printed. Exons must follow their
    transcript record in the file.
    """
    logging.basicConfig(level=logging.DEBUG)
    parser = argparse.ArgumentParser()
    parser.add_argument('--frac', type=float, default=0.0)
    parser.add_argument('gtf_file')
    args = parser.parse_args()

    all_t_ids = set()
    t_ids = set()
    # ``with`` closes the input file, which the original left open.
    with open(args.gtf_file) as gtf_fh:
        for f in pysam.tabix_iterator(gtf_fh, pysam.asGTF()):
            if f.feature == 'transcript':
                t_id = f.transcript_id
                all_t_ids.add(t_id)
                if float(f.frac) >= args.frac:
                    t_ids.add(t_id)
                    # print() with one argument behaves the same on
                    # Python 2 and Python 3.
                    print(str(f))
            elif f.feature == 'exon':
                t_id = f.transcript_id
                assert t_id in all_t_ids
                if t_id in t_ids:
                    print(str(f))
def _best_gene_match(gene_ids, exon_start, exon_end, compare):
    """Return (start_type, end_type, gene_name) of the best-scoring candidate gene.

    A candidate wins on a strictly higher exon score, or on an equal exon
    score with a strictly higher in-exon score. Empty strings are returned
    if no candidate ever wins.
    """
    best_start = ''
    best_end = ''
    best_name = ''
    best_exon_score = -1
    best_inexon_score = -1
    for gene_id in gene_ids:
        start_t, end_t, exon_s, inexon_s = compare(
            gene2exon_dict[gene_id], exon_start, exon_end)
        if exon_s > best_exon_score:
            best_exon_score = exon_s
        elif not (exon_s == best_exon_score and inexon_s > best_inexon_score):
            continue
        best_start = start_t
        best_end = end_t
        best_inexon_score = inexon_s
        best_name = gene2class_dict[gene_id]
    return best_start, best_end, best_name


def getCircType(tmp):
    """Classify one isoform row against the gene annotation.

    ``tmp`` is read through the keys 'chr', 'start', 'end', 'strand',
    'exon_start' and 'exon_end' (the last two are comma-separated integers).
    Gene records are fetched from the module-level ``tabixfile`` and sorted
    into same-strand genes that contain the interval ("full"), other
    same-strand genes ("part") and opposite-strand genes ("antisense").

    Returns:
        tuple: ``(start_type, end_type, circ_type, geneName)`` with
        ``circ_type`` one of 'full', 'read through', 'part', 'antisense'
        or 'intergenic'.
    """
    exon_start = [int(i) for i in tmp['exon_start'].split(',')]
    exon_end = [int(i) for i in tmp['exon_end'].split(',')]

    full_genes = []   # all in the gene with same sense
    part_genes = []   # only a part in the gene with same sense
    anti_genes = []   # antisense
    for gtf in tabixfile.fetch(tmp['chr'], tmp['start'] - 1, tmp['end'] - 1,
                               parser=pysam.asGTF()):
        if gtf.feature != 'gene':
            continue
        if gtf.strand != tmp['strand']:
            anti_genes.append(gtf.gene_id)
        elif (gtf.start <= tmp['start']) and (gtf.end >= tmp['end']):
            full_genes.append(gtf.gene_id)
        else:
            part_genes.append(gtf.gene_id)

    if len(full_genes) > 0:
        start_type, end_type, geneName = _best_gene_match(
            full_genes, exon_start, exon_end, compareExon2gene)
        return (start_type, end_type, 'full', geneName)

    if len(part_genes) > 0:
        if len(part_genes) > 1:
            min_start, max_end, geneName, geneID_minmax = getMinMax(part_genes)
            if (min_start < tmp['start']) and (max_end >= tmp['end']):
                # The partial genes together span the interval.
                first_exons = gene2exon_dict[geneID_minmax[0]]
                start_1, end_1, _score_1, _inscore_1 = comparePartExon2gene(
                    first_exons, exon_start, exon_end)
                second_exons = gene2exon_dict[geneID_minmax[1]]
                start_2, end_2, _score_2, _inscore_2 = comparePartExon2gene(
                    second_exons, exon_start, exon_end)
                return (combin2type(start_1, start_2),
                        combin2type(end_1, end_2),
                        'read through', geneName)
        start_type, end_type, geneName = _best_gene_match(
            part_genes, exon_start, exon_end, comparePartExon2gene)
        return (start_type, end_type, 'part', geneName)

    if len(anti_genes) > 0:
        geneName = ';'.join([gene2class_dict[i] + '-AS1' for i in anti_genes])
        start_type = ','.join(['a'] * len(exon_start))
        end_type = ','.join(['a'] * len(exon_end))
        return (start_type, end_type, 'antisense', geneName)

    start_type = ','.join(['intergenic'] * len(exon_start))
    end_type = ','.join(['intergenic'] * len(exon_end))
    return (start_type, end_type, 'intergenic', '')
def create_pysam_tabix(path):
    """Open ``path`` as a GTF-parsing tabix file, or return None if it is unset or missing."""
    if path is None or not os.path.exists(path):
        return None
    return pysam.Tabixfile(path, parser=pysam.asGTF())
def get_part_from_gtf(annotation, reference=None, feature="CDS"):
    """Return the records of one feature type from a tabix-indexed GTF file.

    Args:
        annotation: Path of the compressed, tabix-indexed GTF file.
        reference: Contig to restrict the query to; ``None`` applies no
            restriction.
        feature: Feature type to keep (value of the GTF feature column).

    Returns:
        list: Parsed GTF records whose ``feature`` equals ``feature``.
    """
    tabixfile = pysam.TabixFile(annotation, parser=pysam.asGTF())
    try:
        return [
            gtf for gtf in tabixfile.fetch(reference=reference)
            if gtf.feature == feature
        ]
    finally:
        # The original left the tabix handle open.
        tabixfile.close()
def readFromFile(infile):
    """Parse every GTF record of ``infile`` and return them as a list."""
    return list(pysam.tabix_iterator(infile, pysam.asGTF()))
def testGTF(self):
    """The text form of each GTF record equals the tab-joined reference row."""
    records = self.tabix.fetch(parser=pysam.asGTF())
    for index, record in enumerate(records):
        expected_line = "\t".join(self.compare[index])
        self.assertEqual(expected_line, str(record))
args.max_dist_ann, args.min_len_tail_contig, args.min_num_tail_reads, args.min_num_bridge_reads, args.min_bridge_read_tail_len, has_pas = args.has_polyadenylation_signal) print 'Loading KLEAT {}/{} ...DONE\r'.format(N,N) # Group KLEAT data sprint('Grouping kleat data ...') kleats = Kleat.groupKleat(kleats) print 'DONE' # Parse ensembl sprint('Loading ensembl annotation ...') ensembl = pysam.TabixFile(args.ensembl, parser=pysam.asGTF()) print 'DONE' # Parse aceview sprint('Loading aceview annotation ...') aceview = pysam.TabixFile(args.aceview, parser=pysam.asGTF()) print 'DONE' # Parse refseq sprint('Loading refseq annotation ...') refseq = pysam.TabixFile(args.refseq, parser=pysam.asGTF()) print 'DONE' # Parse ucsc sprint('Loading ucsc annotation ...') ucsc = pysam.TabixFile(args.ucsc, parser=pysam.asGTF())
def _junction_motifs(left_field, right_field):
    """Build the splice motifs from comma-separated left/right flank sequences."""
    left = left_field.split(',')
    right = right_field.split(',')
    # First motif: first left flank joined with the last right flank.
    motifs = [left[0] + right[-1]]
    inner_left = left[1:]
    inner_right = right[:-1]
    for idx in range(len(inner_left)):
        motifs.append(inner_left[idx] + inner_right[idx])
    return motifs


def _strand_from_motifs(motifs):
    """Return '+' or '-' for the first recognised motif, 'U' if none is."""
    for motif in motifs:
        if motif in ['AGGT', 'AGGC']:
            return '+'
        if motif in ['ACCT', 'GCCT']:
            return '-'
    return 'U'


def _strand_from_annotation(chrom, position, boundary):
    """Return the strand of the first record whose ``boundary`` equals position - 1."""
    if chrom not in tabixfile.contigs:
        return 'U'
    for gtf in tabixfile.fetch(chrom, position - 1, position,
                               parser=pysam.asGTF()):
        if getattr(gtf, boundary) == position - 1:
            return gtf.strand
    return 'U'


def getStrand(x):
    """Infer the strand of the two segments described by ``x``.

    For each segment ('first', 'second') the splice motifs built from the
    ``exon_leftSeq_*`` / ``exon_rightSeq_*`` fields are tried first. If they
    give no answer, the module-level ``tabixfile`` is searched for a record
    whose start, then end, matches the segment boundary.

    Returns:
        list: ``[strand_first, strand_second]``, each '+', '-' or 'U'
        (or whatever strand value the matching annotation record carries).
    """
    strand_first = _strand_from_motifs(
        _junction_motifs(x['exon_leftSeq_first'], x['exon_rightSeq_first']))
    strand_second = _strand_from_motifs(
        _junction_motifs(x['exon_leftSeq_second'], x['exon_rightSeq_second']))

    if strand_first == 'U':
        strand_first = _strand_from_annotation(
            x['chr_first'], x['start_first'], 'start')
    if strand_first == 'U':
        strand_first = _strand_from_annotation(
            x['chr_first'], x['end_first'], 'end')

    if strand_second == 'U':
        strand_second = _strand_from_annotation(
            x['chr_second'], x['start_second'], 'start')
    if strand_second == 'U':
        strand_second = _strand_from_annotation(
            x['chr_second'], x['end_second'], 'end')

    return ([strand_first, strand_second])
def _aggregate_gtf(gtf_file, sample_id, gtf_expr_attr, output_fh,
                   stats_fh, is_ref=False):
    """Rewrite one sample's GTF with new transcript ids and record summary stats.

    Transcript and exon records are written to ``output_fh`` with the
    transcript id replaced by ``"<sample_id>.T<n>"``; all other feature
    types are dropped. One tab-separated line with the sample id, the
    transcript count and the 0..100 percentiles of expression, transcript
    length and exon count is written to ``stats_fh``.

    Args:
        gtf_file: Path of the input GTF file.
        sample_id: Sample identifier used in new ids and attributes.
        gtf_expr_attr: Name of the attribute holding the expression value.
        output_fh: Writable file object for the rewritten records.
        stats_fh: Writable file object for the statistics line.
        is_ref: When true, expression is taken as 0.0 instead of being read.

    Raises:
        GTFError: If a transcript id occurs on more than one transcript record.
    """
    def _init_t_dict():
        return {'_id': None, 'num_exons': 0, 'length': 0}

    t_dict = collections.defaultdict(_init_t_dict)
    cur_t_id = 1
    exprs = []

    # ``with`` closes the input file, which the original left open.
    with open(gtf_file) as gtf_fh:
        for f in pysam.tabix_iterator(gtf_fh, pysam.asGTF()):
            if f.feature == 'transcript':
                t_id = f.transcript_id
                if t_id in t_dict:
                    m = 'GTF "%s" transcript_id "%s" not unique' % (gtf_file, t_id)
                    raise GTFError(m)
                t_item = t_dict[t_id]
                # rename transcript id
                new_t_id = "%s.T%d" % (sample_id, cur_t_id)
                cur_t_id += 1
                t_item['_id'] = new_t_id
                if is_ref:
                    expr = 0.0
                else:
                    expr = float(f[gtf_expr_attr])
                exprs.append(expr)
                # prepare attributes
                attrs = {GTF.Attr.TRANSCRIPT_ID: new_t_id,
                         GTF.Attr.SAMPLE_ID: sample_id,
                         GTF.Attr.REF: str(int(is_ref)),
                         GTF.Attr.EXPR: str(expr)}
                # save attributes
                f.fromDict(attrs)
                # write() replaces the Python-2-only ``print >>fh`` form.
                output_fh.write(str(f) + '\n')
            elif f.feature == 'exon':
                t_id = f.transcript_id
                # defaultdict: an exon seen before its transcript creates an
                # entry whose '_id' is still None.
                t_item = t_dict[t_id]
                # update statistics
                t_item['num_exons'] += 1
                t_item['length'] += (f.end - f.start)
                # replace transcript id
                f.fromDict({GTF.Attr.TRANSCRIPT_ID: t_item['_id']})
                output_fh.write(str(f) + '\n')

    # process statistics (values() instead of Python-2-only itervalues())
    lengths = [t_item['length'] for t_item in t_dict.values()]
    num_exons = [t_item['num_exons'] for t_item in t_dict.values()]

    # compute and write stats
    quantiles = range(0, 101)
    expr_qs = ','.join(
        str(scoreatpercentile(exprs, q)) for q in quantiles)
    length_qs = ','.join(
        str(int(round(scoreatpercentile(lengths, q)))) for q in quantiles)
    num_exon_qs = ','.join(
        str(int(round(scoreatpercentile(num_exons, q)))) for q in quantiles)
    fields = [sample_id, len(t_dict), expr_qs, length_qs, num_exon_qs]
    stats_fh.write('\t'.join(map(str, fields)) + '\n')
def helper(self, thread=0):
    """Load genes for every ``num_threads``-th contig, starting at index ``thread``."""
    assigned_contigs = self.tabix_reader.contigs[thread::self.num_threads]
    for contig in assigned_contigs:
        records = self.tabix_reader.fetch(
            contig,
            start=0,
            parser=pysam.asGTF(),
            multiple_iterators=True,
        )
        load_genes_helper(records, features=self.features)
def gtf_iterator(gtf_path):
    """Return a ``tabix.TabixIterator`` over ``gtf_path`` that parses records as GTF."""
    gtf_parser = pysam.asGTF()
    return tabix.TabixIterator(gtf_path, parser=gtf_parser)
# bed2df(bedFile, gtfFile): annotate the isoforms of a BED file against a GTF.
#
# Steps, in the order they appear in the body:
#   1. Read bedFile (tab-separated, no header) with pandas, cast columns 10
#      and 11 to str, and turn each row into an FL row via bed2FL (columns
#      chr, start, end, isoID, strand, exon_start, exon_end, len).
#   2. Open gtfFile as the module-global ``tabixfile`` and fill the
#      module-global lookup dicts (gene2exon_dict, trans2exon_dict,
#      gene2trans_dict, trans2gene_dict, gene2strand_dict, gene2status_dict,
#      gene2class_dict, trans2class_dict) from its exon / transcript / gene
#      records; gene2exon_dict entries are de-duplicated with noDup_list.
#   3. Call getCircType on every FL row and store the results in the
#      start_type, end_type and geneName columns.
#   4. Derive a BSJ type per row, map it to the detail_type / type columns,
#      and return FL.
#
# NOTE(review): the branches that only ``print('1')`` do not assign BSJ_type,
# so the following ``append`` reuses the value from an earlier row (or fails
# if there is none) -- confirm whether 'unknown' was intended.
# NOTE(review): FSJ_type_list is computed but never added to FL or returned.
# NOTE(review): the tabix handle stored in ``tabixfile`` is not closed here.
def bed2df(bedFile,gtfFile): global tabixfile,gene2strand_dict,gene2trans_dict,gene2exon_dict,trans2exon_dict,trans2gene_dict,gene2status_dict,gene2class_dict,trans2class_dict FL_bed=pd.read_csv(bedFile,sep='\t',header=None) FL_bed.iloc[:,10]=FL_bed.iloc[:,10].map(str) FL_bed.iloc[:,11]=FL_bed.iloc[:,11].map(str) FL=pd.DataFrame(FL_bed.apply(lambda x: bed2FL(x),axis=1).tolist(),columns=['chr','start','end','isoID','strand','exon_start','exon_end','len']) tabixfile=pysam.TabixFile(gtfFile) gene2strand_dict={} gene2trans_dict={} gene2exon_dict={} trans2exon_dict={} trans2gene_dict={} gene2status_dict={} gene2class_dict={} trans2class_dict={} for gtf in tabixfile.fetch(parser=pysam.asGTF()): if gtf.feature=='exon': current_geneID=gtf.gene_id current_transID=gtf.transcript_id if gene2exon_dict.__contains__(current_geneID): gene2exon_dict[current_geneID].append([gtf.start,gtf.end]) else: gene2exon_dict[current_geneID]=[[gtf.start,gtf.end]] if trans2exon_dict.__contains__(current_transID): trans2exon_dict[current_transID].append([gtf.start,gtf.end]) else: trans2exon_dict[current_transID]=[[gtf.start,gtf.end]] if gtf.feature=='transcript': current_geneID=gtf.gene_id current_transID=gtf.transcript_id if 'transcript_status' in gtf.keys(): trans2class_dict[current_transID]=gtf.transcript_status else: trans2class_dict[current_transID]='NOVEL' if gene2trans_dict.__contains__(current_geneID): gene2trans_dict[current_geneID].append(current_transID) else: gene2trans_dict[current_geneID]=[current_transID] trans2gene_dict[current_transID]=current_geneID if gtf.feature=='gene': current_geneID=gtf.gene_id gene2strand_dict[current_geneID]=gtf.strand if 'gene_status' in gtf.keys(): gene2status_dict[current_geneID]=gtf.gene_status else: gene2status_dict[current_geneID]='NOVEL' gene2class_dict[current_geneID]=gtf.gene_name for i in gene2exon_dict.keys(): gene2exon_dict[i]=noDup_list(gene2exon_dict[i]) start_type_list=[] end_type_list=[] circ_type_list=[] geneName_list=[] for i in 
range(FL.shape[0]): tmp=FL.iloc[i,:] start_type,end_type,circ_type,geneName=getCircType(tmp) start_type_list.append(start_type) end_type_list.append(end_type) circ_type_list.append(circ_type) geneName_list.append(geneName) FL['start_type']=start_type_list FL['end_type']=end_type_list FL['geneName']=geneName_list BSJ_type_list=[] for i in range(FL.shape[0]): circ_type=circ_type_list[i] start_type=start_type_list[i].split(',') end_type=end_type_list[i].split(',') if circ_type in ['intergenic','antisense','read through']: BSJ_type=circ_type elif circ_type=='part': if start_type[0] =='intergenic' and end_type[-1]=='intergenic': BSJ_type='novel UTR5;3' elif start_type[0] =='intergenic': if FL.iloc[i,:]['strand']=='+': BSJ_type='novel UTR5' else: BSJ_type='novel UTR3' else: if FL.iloc[i,:]['strand']=='+': BSJ_type='novel UTR3' else: BSJ_type='novel UTR5' else: num=len(start_type) # judge BSJ type if start_type[0]=='m': if end_type[-1]=='m': BSJ_type='m' elif end_type[-1] in ['inE','outE']: if FL.iloc[i,:]['strand']=='+': BSJ_type='AS5' else: BSJ_type='AS3' elif end_type[-1]=='intron': BSJ_type='Intron retention' else: print('1') elif start_type[0] in ['inE','outE']: if end_type[-1]=='m': if FL.iloc[i,:]['strand']=='+': BSJ_type='AS5' else: BSJ_type='AS3' elif end_type[-1] in ['inE','outE']: BSJ_type='AS5,AS3' elif end_type[-1]=='intron': BSJ_type='Intron retention' else: print('1') elif start_type[0] =='intron': BSJ_type='Intron retention' else: print('1') BSJ_type_list.append(BSJ_type) FSJ_type_list=[] for i in range(FL.shape[0]): FSJ_type=[] circ_type=circ_type_list[i] start_type=start_type_list[i].split(',') end_type=end_type_list[i].split(',') if circ_type in ['intergenic','antisense','read through']: FSJ_type=circ_type else: num=len(start_type) #judge FSJ type if num==1: FSJ_type='1' else: for j in range(num-1): donar=end_type[j] acceptor=start_type[j+1] if donar=='m': if acceptor=='m': FSJ_type.append('m') elif acceptor in ['inE','outE']: if 
FL.iloc[i,:]['strand']=='+': FSJ_type.append('AS3') else: FSJ_type.append('AS5') elif acceptor=='intron': FSJ_type.append('Intron retention') else: if FL.iloc[i,:]['strand']=='+': FSJ_type.append('novel UTR3') else: FSJ_type.append('novel UTR5') elif donar in ['inE','outE']: if acceptor=='m': if FL.iloc[i,:]['strand']=='+': FSJ_type.append('AS5') else: FSJ_type.append('AS3') elif acceptor in ['inE','outE']: FSJ_type.append('AS5,AS3') elif acceptor=='intron': FSJ_type.append('Intron retention') else: if FL.iloc[i,:]['strand']=='+': FSJ_type.append('novel UTR3') else: FSJ_type.append('novel UTR5') elif donar=='intron': if acceptor=='intergenic': if FL.iloc[i,:]['strand']=='+': FSJ_type.append('novel UTR3') else: FSJ_type.append('novel UTR5') else: FSJ_type.append('Intron retention') else: if acceptor=='intergenic': FSJ_type.append('novel UTR5;3') else: if FL.iloc[i,:]['strand']=='+': FSJ_type.append('novel UTR5') else: FSJ_type.append('novel UTR3') FSJ_type=','.join(FSJ_type) FSJ_type_list.append(FSJ_type) detail_BSJ_type=[] com_BSJ_type=[] for i in BSJ_type_list: if i=='AS3': n='N3SS' m='NSS' elif i=='AS5': n='N5SS' m='NSS' elif i=='AS5,AS3': n='N5SS,N3SS' m='NSS' elif i=='Intron retention': n='intronic' m='intronic' elif i=='antisense': n='antisense' m='antisense' elif i=='intergenic': n='intergenic' m='intergenic' elif i=='m': n='exonic' m='exonic' elif i=='novel UTR3': n='novel UTR3' m='novel UTR' elif i=='novel UTR5': n='novel UTR5' m='novel UTR' elif i=='novel UTR5;3': n='novel UTR5,UTR3' m='novel UTR' elif i=='read through': n='read through' m='read through' else: n='unknown' m='unknown' detail_BSJ_type.append(n) com_BSJ_type.append(m) FL['detail_type']=detail_BSJ_type FL['type']=com_BSJ_type return(FL)
def load_data(self, filepath):
    """Loads GFF data for ``self.interval`` into the feature lookups.

    Records fetched from ``filepath`` for the interval are sorted by feature
    type into ``self.genes`` (gene_id -> gene_name), ``self.transcripts``
    (gene_id -> [(transcript_id, interval)]), ``self.exons``
    (transcript_id -> [(exon_id, interval)]), and ``self.cds`` / ``self.utrs``
    (exon_id -> [(ID, interval)]).

    Args:
        filepath: Path of the tabix-indexed annotation file.
    """
    tabix = pysam.TabixFile(filepath)
    try:
        for row in tabix.fetch(self.interval.chrom, self.interval.start,
                               self.interval.end, parser=pysam.asGTF()):
            feature_interval = genomic_interval(row.contig, row.start, row.end,
                                                strand=row.strand)
            a = self.parse_gff_attributes(row.attributes)

            if row.feature == "gene":
                self.genes[a["gene_id"]] = a["gene_name"]
            elif row.feature == "transcript":
                self.transcripts[a["gene_id"]] = self.transcripts.get(a["gene_id"], []) + [(a["transcript_id"], feature_interval)]
            elif row.feature == "exon":
                self.exons[a["transcript_id"]] = self.exons.get(a["transcript_id"], []) + [(a["exon_id"], feature_interval)]
            elif row.feature == "CDS":
                self.cds[a["exon_id"]] = self.cds.get(a["exon_id"], []) + [(a["ID"], feature_interval)]
            elif row.feature == "UTR":
                self.utrs[a["exon_id"]] = self.utrs.get(a["exon_id"], []) + [(a["ID"], feature_interval)]
    finally:
        # Close the file even when parsing raises; the original only
        # reached close() on success.
        tabix.close()
def iterator(infile):
    """Iterate over all entries of ``infile``, parsing each one as GTF."""
    gtf_parser = pysam.asGTF()
    return pysam.tabix_iterator(infile, gtf_parser)
def exploreBAM(myinput):
    """Scan one region of a BAM file at known positions and write a site table.

    ``myinput`` is a ``'$'``-separated string: the first field is the region
    (reference) name, the second is the BAM path. The region is walked in
    windows of ``chunckval`` bases; for each window the known positions are
    fetched from the ``kfile`` tabix index and the BAM pileup is inspected only
    at those positions. Bases passing the configured filters are accumulated
    per column and, if the column survives the coverage / annotation checks,
    one tab-separated line is written to ``table_<region>_<pid>`` in
    ``outfolder`` with the fields: region, 1-based position, reference base,
    strand code, coverage, mean quality, base counts, substitutions, frequency.

    Strand code as set in this function: ``'0'`` for ``'-'``, ``'1'`` for
    ``'+'``, ``'2'`` otherwise.

    All options are read from module-level names (e.g. ``uann``, ``expos``,
    ``blatr``, ``exss``, ``rmsh``, ``exh``, ``exd``, ``conc``, ``mq``,
    ``getstrand``, ``rmnuc``, ``corrstr``, ``exms``, ``MAPQ``, ``MQUAL``,
    ``MINCOV``, ``QVAL``), and the helpers ``getd``, ``rmHomo``, ``testBlat``,
    ``normByBlat``, ``vstrand``, ``vstand``, ``comp``, ``normByStrand``,
    ``BaseCount`` and ``meanq`` are defined outside this block.

    Returns nothing; progress messages go to ``sys.stderr``.

    NOTE(review): this block uses ``dict.has_key`` and ``long`` and therefore
    only runs on Python 2.
    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source -- verify against the upstream file.
    """
    inputs = myinput.split('$')
    chr, bamfile = inputs[0], inputs[1]
    outfile = os.path.join(outfolder, 'table_%s_%s' % (chr, pid))
    #outfile2=os.path.join(outfolder,'subs_%s_%s'%(chr,pid))
    # d:  read keys loaded from the Blat ".bad" file (looked up per read below)
    # di: positions adjacent to known splice sites (excluded when exss is set)
    d, di = {}, {}
    bam = pysam.Samfile(bamfile, "rb")
    fasta = pysam.Fastafile(fastafile)
    ktabix = pysam.Tabixfile(kfile)
    lenregion = dicregions[chr]
    if uann:
        tabix = pysam.Tabixfile(annfile)
    if expos:
        extabix = pysam.Tabixfile(exfile)
    out = open(outfile, 'w')
    #if not custsub:
    #	dsubs=dict([(x+y, 0) for x in 'ACGT' for y in 'ACGT'])
    #	out2=open(outfile2,'w')
    #header='Region\tPosition\tReference\tCoverage\tMeanQuality\tBaseCount\tSubs\tFrequency\n'
    #out.write(header)
    sys.stderr.write('Started analysis on region: %s\n' % (chr))
    if blatr:
        # Per-region list of reads flagged by Blat; keyed as "<col0>_<col1>".
        badblat = os.path.join(blatfolder, 'blatseqs_%s.bad' % (chr))
        if os.path.exists(badblat):
            sys.stderr.write('Using Blat mapping for region %s\n' % (chr))
            f = open(badblat)
            for i in f:
                l = (i.strip()).split()
                d[l[0] + '_' + l[1]] = int(l[1])
            f.close()
            sys.stderr.write('Found %i reads for region %s\n' % (len(d), chr))
    if exss:
        if os.path.exists(splicefile):
            sys.stderr.write('Loading known splice sites for region %s\n' %
                             (chr))
            f = open(splicefile)
            for i in f:
                l = (i.strip()).split()
                if l[0] != chr:
                    continue
                # st: strand column, tp: 'D' / 'A' site type column,
                # cc: site coordinate. The nss positions on one side of the
                # site (side chosen by strand and type) are recorded in di.
                st, tp, cc = l[4], l[3], int(l[1])
                if st == '+' and tp == 'D':
                    for j in range(nss):
                        di[cc + (j + 1)] = 0
                if st == '+' and tp == 'A':
                    for j in range(nss):
                        di[cc - (j + 1)] = 0
                if st == '-' and tp == 'D':
                    for j in range(nss):
                        di[cc - (j + 1)] = 0
                if st == '-' and tp == 'A':
                    for j in range(nss):
                        di[cc + (j + 1)] = 0
            f.close()
            sys.stderr.write('Loaded %i positions for %s\n' % (len(di), chr))
    if chr in ktabix.contigs:
        # Walk the region in windows of chunckval bases.
        for kpos in range(0, lenregion, chunckval):
            startk, endk = kpos, (kpos + chunckval) - 1
            kres = [
                kk
                for kk in ktabix.fetch(reference=chr, start=startk, end=endk)
            ]
            if len(kres) == 0:
                continue
            # presumably maps 1-based position -> strand code; verify in getd
            kdic = getd(kres)
            #print kdic
            # else explore bam to find exact positions
            for pileupcolumn in bam.pileup(chr, startk, endk):
                if not startk <= pileupcolumn.pos <= endk:
                    continue
                # Only columns listed among the known positions are analysed.
                if not kdic.has_key(pileupcolumn.pos + 1):
                    continue
                ref = fasta.fetch(chr, pileupcolumn.pos,
                                  pileupcolumn.pos + 1).upper()
                # Per-column accumulators: bases, summed quality, strands,
                # raw quality characters, Blat flags.
                seq, qual, strand, squal, blatc = '', 0, '', '', ''
                if rmsh:
                    # NOTE(review): `h**o` parses as "h to the power o". The
                    # sibling helper name rmHomo suggests the original was a
                    # single identifier (likely `homo`) mangled by a text
                    # filter -- confirm against the upstream source.
                    if ((pileupcolumn.pos + 1) - h**o) - 1 < 0:
                        sequp = ''
                    else:
                        sequp = (fasta.fetch(
                            chr, ((pileupcolumn.pos + 1) - h**o) - 1,
                            (pileupcolumn.pos + 1) - 1)).upper()
                    seqdw = (fasta.fetch(chr, pileupcolumn.pos + 1,
                                         (pileupcolumn.pos + 1) + h**o)).upper()
                for pileupread in pileupcolumn.pileups:  # for each base of the multiple alignment
                    # Skip reads without an integer query position here.
                    if not isinstance(pileupread.query_position, (int, long)):
                        continue
                    # s: read base, q: quality char code minus QVAL
                    # (presumably the quality encoding offset), t: strand
                    # placeholder, qq: raw quality character.
                    s, q, t, qq = pileupread.alignment.seq[
                        pileupread.query_position].upper(), ord(
                            pileupread.alignment.qual[
                                pileupread.query_position]
                        ) - QVAL, '*', pileupread.alignment.qual[
                            pileupread.query_position]
                    # exclude intronic positions near splice sites
                    if exss and di.has_key(pileupcolumn.pos + 1):
                        continue
                    # multiple hit
                    if exh and pileupread.alignment.is_secondary:
                        continue
                    # duplicates
                    if exd and pileupread.alignment.is_duplicate:
                        continue
                    # if paired-end
                    if conc and pileupread.alignment.is_paired:
                        # if not concordant
                        if not pileupread.alignment.is_proper_pair:
                            continue
                        # if concordant but in the same orientation
                        # (duplicate / secondary bits are removed first so the
                        # flag can be compared against the fixed list)
                        flag = pileupread.alignment.flag
                        if pileupread.alignment.is_duplicate:
                            flag = flag - 1024
                        if pileupread.alignment.is_secondary:
                            flag = flag - 256
                        if flag in [67, 131, 115, 179]:
                            continue
                    # mapping quality
                    if mq and pileupread.alignment.mapq < MAPQ:
                        continue
                    # if the base quality is >= the minimum quality
                    if q >= MQUAL and pileupcolumn.pos in pileupread.alignment.positions:
                        #tags=dict(pileupread.alignment.tags)
                        # infer the strand for each position
                        if getstrand:
                            # use the mapping info if strand-oriented
                            if pileupread.alignment.is_read1:
                                if unchange1:
                                    if pileupread.alignment.is_reverse:
                                        t = '-'
                                    else:
                                        t = '+'
                                else:
                                    if pileupread.alignment.is_reverse:
                                        t = '+'
                                    else:
                                        t = '-'
                            elif pileupread.alignment.is_read2:
                                if unchange2:
                                    if pileupread.alignment.is_reverse:
                                        t = '-'
                                    else:
                                        t = '+'
                                else:
                                    if pileupread.alignment.is_reverse:
                                        t = '+'
                                    else:
                                        t = '-'
                            else:  # for single ends
                                if unchange1:
                                    if pileupread.alignment.is_reverse:
                                        t = '-'
                                    else:
                                        t = '+'
                                else:
                                    if pileupread.alignment.is_reverse:
                                        t = '+'
                                    else:
                                        t = '-'
                        if rmnuc:
                            #rlen=pileupread.alignment.rlen #pileupread.alignment.qlen # length of the specific read
                            #print rlen,pileupread.query_position,pileupread.alignment.qstart,pileupread.alignment.qend
                            # check whether the nucleotide must be removed at the read ends in the range x-y
                            # test the forward
                            #qp=pileupread.query_position #pileupread.query_position-pileupread.alignment.qstart
                            #print pileupread.query_position,pileupread.alignment.rlen,len(pileupread.alignment.seq)
                            #if pileupread.alignment.is_reverse:
                            #	if (rlen-qp)-1 < rmp[0]:continue
                            #	if (rlen-qp)-1 > ((rlen)-rmp[1])-1: continue
                            #else:
                            #	if qp<rmp[0]:continue
                            #	if qp>(rlen-rmp[1])-1: continue
                            rlen = pileupread.alignment.rlen  #pileupread.alignment.qlen # length of the specific read
                            qp = pileupread.query_position  #pileupread.query_position-pileupread.alignment.qstart
                            # Drop bases within rmp[0] / rmp[1] positions of
                            # the read ends; the two bounds swap sides for
                            # reverse-strand reads.
                            if pileupread.alignment.is_reverse:
                                if qp > (rlen - rmp[0]) - 1:
                                    continue
                                if qp < rmp[1]:
                                    continue
                            else:
                                if qp < rmp[0]:
                                    continue
                                if qp > (rlen - rmp[1]) - 1:
                                    continue
                        # if the parent read does not map uniquely with Blat
                        if blatr:
                            # Key is "<read name>_<mate number>" (0 when the
                            # read is neither read1 nor read2).
                            rt = 0
                            if pileupread.alignment.is_read1:
                                rt = 1
                            elif pileupread.alignment.is_read2:
                                rt = 2
                            rname = pileupread.alignment.qname + '_%i' % (rt)
                            if d.has_key(rname):
                                blatc += '0'  #continue
                            else:
                                blatc += '1'
                        # if the base differs from the reference
                        # discard if in a homopolymeric region
                        if rmsh and rmHomo(sequp, seqdw, h**o, ref):
                            continue
                        seq += s
                        qual += q
                        strand += t
                        squal += qq
                if seq.strip() != '':
                    if blatr:
                        if testBlat(blatc):
                            seq, qual, squal, strand = normByBlat(
                                seq, strand, squal, blatc)
                        else:
                            continue
                    #print pileupcolumn.pos+1,seq,squal
                    #mystrand=kdic[pileupcolumn.pos+1]
                    #print mystrand
                    # NOTE(review): the has_key() guard at the top of the
                    # column loop already skipped positions missing from
                    # kdic, so this bare except looks unreachable -- confirm.
                    try:
                        mystrand = kdic[pileupcolumn.pos + 1]
                    except:
                        mystrand = '2'
                    #print chr,pileupcolumn.pos+1,seq,strand, mystrand
                    if uann and not getstrand:
                        # Take the strand from the annotation rows covering
                        # this position.
                        if chr in tabix.contigs:
                            sres = [
                                kk.strand
                                for kk in tabix.fetch(reference=chr,
                                                      start=(pileupcolumn.pos),
                                                      end=(pileupcolumn.pos + 1),
                                                      parser=pysam.asGTF())
                            ]
                            mystrand = vstrand(sres)
                    if getstrand and not uann:
                        # Take the strand from the per-read strands collected
                        # above.
                        mystr = vstand(strand)
                        if mystr == '-':
                            mystrand = '0'
                        elif mystr == '+':
                            mystrand = '1'
                        else:
                            mystrand = '2'
                    if mystrand == '0':
                        # Minus strand: pass both bases and reference through
                        # comp().
                        seq = comp(seq)
                        ref = comp(ref)
                    #if getstrand and mystrand in ['1','0'] and not useconf: seq,qual,squal=normByStrand(seq,strand,squal,mystrand)
                    if getstrand and mystrand in ['1', '0'] and corrstr:
                        seq, qual, squal = normByStrand(
                            seq, strand, squal, mystrand)
                    if uann and mystrand in ['1', '0'] and corrstr:
                        seq, qual, squal = normByStrand(
                            seq, strand, squal, mystrand)
                    #if not getstrand and not uann and mystrand in ['1','0']: seq,qual,squal=normByStrand(seq,strand,squal,mystrand)
                    #print chr,pileupcolumn.pos+1,seq,strand,mystrand
                    cov, bcomp, subs, freq = BaseCount(seq, ref)
                    if cov < MINCOV:
                        continue
                    # Skip columns whose substitution string contains a space
                    # (presumably more than one substitution; verify in
                    # BaseCount).
                    if exms and subs.count(' ') > 0:
                        continue
                    mqua = meanq(qual, len(seq))
                    if expos:
                        # Skip positions present in the exclusion tabix file.
                        if chr in extabix.contigs:
                            exres = [
                                kk for kk in extabix.fetch(
                                    reference=chr,
                                    start=(pileupcolumn.pos),
                                    end=(pileupcolumn.pos + 1))
                            ]
                            if len(exres) > 0:
                                continue
                    line = '\t'.join([
                        chr,
                        str(pileupcolumn.pos + 1), ref, mystrand,
                        str(cov), (mqua),
                        str(bcomp), subs, freq
                    ]) + '\n'
                    out.write(line)
    bam.close()
    fasta.close()
    ktabix.close()
    out.close()
    if uann:
        tabix.close()
    if expos:
        extabix.close()
    sys.stderr.write('Job completed for region: %s\n' % (chr))
def readFromFile( infile ):
    """Read every GTF entry of ``infile`` and return them as a list."""
    return list(pysam.tabix_iterator(infile, pysam.asGTF()))