def bed_tofasta(bed, ref_fasta, min_size=50, stranded=True, include_name=False, out=sys.stdout):
    '''
    Write the reference sequence under each BED region to `out` as FASTA.

    bed          -- iterable of regions exposing .chrom, .start, .end and
                    .strand (plus .name when include_name is set)
    ref_fasta    -- path to the reference FASTA; a '<ref_fasta>.fai' index is
                    built with pysam.faidx() if it is not already on disk
    min_size     -- regions with (end - start) below this are skipped
    stranded     -- when true and the region has a strand, '-' regions are
                    reverse-complemented and the header gets a '[strand]'
                    suffix
    include_name -- prefix each header with '<region.name>|'
    out          -- object with a write() method (defaults to sys.stdout)

    Regions whose chrom is not listed in the .fai index are skipped silently.
    Returns None; all output goes to `out`.
    '''
    fai_path = '%s.fai' % ref_fasta
    if not os.path.exists(fai_path):
        pysam.faidx(ref_fasta)

    fasta = pysam.Fastafile(ref_fasta)
    try:
        # First column of the .fai index holds the reference names; used to
        # skip regions on sequences that the FASTA file does not contain.
        with open(fai_path) as f:
            refs = set(line.split('\t')[0].strip() for line in f)

        name = ''
        for region in bed:
            if include_name:
                name = '%s|' % (region.name.strip())

            if region.end - region.start < min_size or region.chrom not in refs:
                continue

            seq = fasta.fetch(region.chrom, region.start, region.end)
            if stranded and region.strand:
                if region.strand == '-':
                    seq = revcomp(seq)
                out.write('>%s%s:%d-%d[%s]\n%s\n' % (name, region.chrom, region.start, region.end, region.strand, seq))
            else:
                out.write('>%s%s:%d-%d\n%s\n' % (name, region.chrom, region.start, region.end, seq))
    finally:
        # Release the FASTA handle even if fetching or writing fails.
        fasta.close()
def bed_tofasta(bed, ref_fasta, min_size=50, stranded=True, include_name=False, out=sys.stdout):
    '''
    Write the reference sequence under each BED region to `out` as FASTA.

    bed          -- iterable of regions exposing .chrom, .start, .end and
                    .strand (plus .name when include_name is set)
    ref_fasta    -- path to the reference FASTA; a '<ref_fasta>.fai' index is
                    built with pysam.faidx() if it is not already on disk
    min_size     -- regions with (end - start) below this are skipped
    stranded     -- when true and the region has a strand, '-' regions are
                    reverse-complemented and the header gets a '[strand]'
                    suffix
    include_name -- prefix each header with '<region.name>|'
    out          -- object with a write() method (defaults to sys.stdout)

    Header layout:
        stranded:    >[name|]chrom:start-end[strand]
        unstranded:  >[name|]chrom:start-end

    Regions whose chrom is not listed in the .fai index are skipped silently.
    Returns None; all output goes to `out`.
    '''
    index_path = '%s.fai' % ref_fasta
    if not os.path.exists(index_path):
        pysam.faidx(ref_fasta)

    fasta = pysam.Fastafile(ref_fasta)
    try:
        # The first tab-separated column of each .fai line is a reference name.
        refs = set()
        with open(index_path) as f:
            for line in f:
                refs.add(line.split('\t')[0].strip())

        name = ''
        for region in bed:
            if include_name:
                name = '%s|' % (region.name.strip())

            if region.end - region.start >= min_size and region.chrom in refs:
                seq = fasta.fetch(region.chrom, region.start, region.end)
                if stranded and region.strand:
                    if region.strand == '-':
                        seq = revcomp(seq)
                    out.write('>%s%s:%d-%d[%s]\n%s\n' % (name, region.chrom, region.start, region.end, region.strand, seq))
                else:
                    # Five values, five placeholders: the header carries no
                    # strand suffix in the unstranded case.
                    out.write('>%s%s:%d-%d\n%s\n' % (name, region.chrom, region.start, region.end, seq))
    finally:
        # Release the FASTA handle even if fetching or writing fails.
        fasta.close()
def alleles(self):
    '''
    Return the '/'-separated entries of self.observed as a list.

    When self.strand is '-', every entry other than the '-' placeholder is
    passed through revcomp(); otherwise entries are returned unchanged.
    '''
    return [
        revcomp(allele) if allele != '-' and self.strand == '-' else allele
        for allele in self.observed.split('/')
    ]
def alleles(self):
    """
    Return the "/"-separated entries of self.observed as a list.

    When self.strand is "-", every entry other than the "-" placeholder is
    passed through revcomp(); otherwise entries are returned unchanged.
    """

    def oriented(allele):
        # The "-" placeholder and anything on a non-minus strand pass through.
        if allele == "-" or self.strand != "-":
            return allele
        return revcomp(allele)

    return [oriented(allele) for allele in self.observed.split("/")]
def write_fasta(read, out=sys.stdout, colorspace=False):
    '''
    Write one read to `out` as a two-line FASTA record ('>qname', sequence).

    read       -- object exposing .qname, .seq, .is_unmapped, .is_reverse and,
                  for colorspace output, .opt(tag)
    out        -- object with a write() method (defaults to sys.stdout)
    colorspace -- when true the sequence is taken from the read's 'CS' tag
                  instead of .seq

    A mapped read flagged as reverse is flipped before writing: the
    colorspace string is reversed, a base-space sequence goes through
    revcomp().
    '''
    bases = read.opt('CS') if colorspace else read.seq

    flip = not read.is_unmapped and read.is_reverse
    if flip:
        bases = bases[::-1] if colorspace else revcomp(bases)

    record = '>%s\n%s\n' % (read.qname, bases)
    out.write(record)
def write_fastq(read, out=sys.stdout, colorspace=False):
    '''
    Write one read to `out` as a four-line FASTQ record.

    read       -- object exposing .qname, .seq, .qual, .is_unmapped,
                  .is_reverse and, for colorspace output, .opt(tag)
    out        -- object with a write() method (defaults to sys.stdout)
    colorspace -- when true the sequence and qualities are taken from the
                  read's 'CS' and 'CQ' tags instead of .seq / .qual

    A mapped read flagged as reverse is flipped before writing: the
    colorspace string is reversed (a base-space sequence goes through
    revcomp()), and the quality string is reversed in both cases.
    '''
    if colorspace:
        bases, quals = read.opt('CS'), read.opt('CQ')
    else:
        bases, quals = read.seq, read.qual

    flip = not read.is_unmapped and read.is_reverse
    if flip:
        bases = bases[::-1] if colorspace else revcomp(bases)
        quals = quals[::-1]

    record = '@%s\n%s\n+\n%s\n' % (read.qname, bases, quals)
    out.write(record)
def check_tags(barcodes, seq, edit, pos, allow_revcomp=False, verbose=False):
    """
    Find the barcode that matches the 5' or 3' end of {seq}.

    For each barcode, pull out the appropriate 5' or 3' sub sequence from
    {seq} (barcode length + edit + pos characters), then run a local
    alignment of the barcode to the subseq. If a good match is found, return
    it immediately; otherwise, remember the highest-scoring alignment and
    return that as the failed best guess.

    For the alignments, the reference is the barcode, the query is the subset
    of the read that is possibly the barcode (5'/3' subseq).

    Arguments:
        barcodes      -- dict of tag -> (barcode_seq, orientation, strip);
                         orientation "5" selects the start of the read, any
                         other value selects the end. The third field is not
                         used here.
        seq           -- the read sequence
        edit, pos     -- tolerances; they widen the tested window and are
                         passed through to _tag_aln_check()
        allow_revcomp -- also try the reverse-complemented barcode against
                         the opposite end of the read
        verbose       -- print each alignment as it is tested

    Returns a multi-tuple:
        valid, (tag, aln, is_forward, reason_if_fail)

    where aln is the object returned by sw.align(), is_forward is False when
    the match came from the reverse-complement attempt, and reason_if_fail is
    "" on success. When {barcodes} is empty the result is (False, None).
    """
    best = None

    # NOTE: there is no exact-match shortcut; every barcode goes through
    # the alignment below.
    for tag in barcodes:
        barcodeseq, orientation, _strip = barcodes[tag]
        window = len(barcodeseq) + edit + pos

        if orientation == "5":
            testseq = seq[:window]
        else:
            testseq = seq[-1 * window :]

        aln = sw.align(barcodeseq, testseq)
        valid, reason = _tag_aln_check(aln, len(testseq), len(barcodeseq), orientation, edit, pos)

        if verbose:
            # Single-argument print(...) gives the same output whether print
            # is a statement (Python 2) or a function (Python 3).
            print("Testing tag: %s vs %s" % (str(barcodes[tag]), testseq))
            aln.dump()
            print("%s %s" % (valid, reason))

        if valid:
            return True, (tag, aln, True, "")

        if not best or aln.score > best[1].score:
            best = (tag, aln, True, reason)

        if allow_revcomp:
            # The reverse-complemented barcode is looked for on the other
            # end of the read, so both the window and the orientation flip.
            if orientation == "5":
                testseq = seq[-1 * window :]
            else:
                testseq = seq[:window]

            aln = sw.align(revcomp(barcodeseq), testseq)
            valid, reason = _tag_aln_check(
                aln, len(testseq), len(barcodeseq), "5" if orientation == "3" else "3", edit, pos
            )

            if verbose:
                print("Testing tag: %s [rc] vs %s" % (str(barcodes[tag]), testseq))
                aln.dump()
                print("%s %s" % (valid, reason))

            if valid:
                return True, (tag, aln, False, "")

            if not best or aln.score > best[1].score:
                best = (tag, aln, False, reason)

    if verbose:
        print("%s %s" % ("BEST: ", best))
        # best stays None when there were no barcodes to test.
        if best is not None:
            best[1].dump()

    return False, best
def check_tags(barcodes, seq, edit, pos, allow_revcomp=False, verbose=False):
    '''
    Find the barcode that matches the 5' or 3' end of {seq}.

    For each barcode, pull out the appropriate 5' or 3' sub sequence from
    {seq} (barcode length + edit + pos characters), then run a local
    alignment of the barcode to the subseq. If a good match is found, return
    it immediately; otherwise, remember the highest-scoring alignment and
    return that as the failed best guess.

    For the alignments, the reference is the barcode, the query is the subset
    of the read that is possibly the barcode (5'/3' subseq).

    Arguments:
        barcodes      -- dict of tag -> (barcode_seq, orientation, strip);
                         orientation '5' selects the start of the read, any
                         other value selects the end. The third field is not
                         used here.
        seq           -- the read sequence
        edit, pos     -- tolerances; they widen the tested window and are
                         passed through to _tag_aln_check()
        allow_revcomp -- also try the reverse-complemented barcode against
                         the opposite end of the read
        verbose       -- print each alignment as it is tested

    Returns a multi-tuple:
        valid, (tag, aln, is_forward, reason_if_fail)

    where aln is the object returned by sw.align(), is_forward is False when
    the match came from the reverse-complement attempt, and reason_if_fail is
    '' on success. When {barcodes} is empty the result is (False, None).
    '''
    best = None

    # NOTE: there is no exact-match shortcut; every barcode goes through
    # the alignment below.
    for tag in barcodes:
        barcodeseq, orientation, _strip = barcodes[tag]
        window = len(barcodeseq) + edit + pos

        if orientation == '5':
            testseq = seq[:window]
        else:
            testseq = seq[-1 * window:]

        aln = sw.align(barcodeseq, testseq)
        valid, reason = _tag_aln_check(aln, len(testseq), len(barcodeseq), orientation, edit, pos)

        if verbose:
            # Single-argument print(...) gives the same output whether print
            # is a statement (Python 2) or a function (Python 3).
            print('Testing tag: %s vs %s' % (str(barcodes[tag]), testseq))
            aln.dump()
            print('%s %s' % (valid, reason))

        if valid:
            return True, (tag, aln, True, '')

        if not best or aln.score > best[1].score:
            best = (tag, aln, True, reason)

        if allow_revcomp:
            # The reverse-complemented barcode is looked for on the other
            # end of the read, so both the window and the orientation flip.
            if orientation == '5':
                testseq = seq[-1 * window:]
            else:
                testseq = seq[:window]

            aln = sw.align(revcomp(barcodeseq), testseq)
            valid, reason = _tag_aln_check(aln, len(testseq), len(barcodeseq), '5' if orientation == '3' else '3', edit, pos)

            if verbose:
                print('Testing tag: %s [rc] vs %s' % (str(barcodes[tag]), testseq))
                aln.dump()
                print('%s %s' % (valid, reason))

            if valid:
                return True, (tag, aln, False, '')

            if not best or aln.score > best[1].score:
                best = (tag, aln, False, reason)

    if verbose:
        print('%s %s' % ('BEST: ', best))
        # best stays None when there were no barcodes to test.
        if best is not None:
            best[1].dump()

    return False, best