def testUpdate2(self):
    '''issue 135: inplace update of sequence and quality score.

    This does not work as setting the sequence will erase
    the quality scores.
    '''
    a = self.buildRead()
    a.query_sequence = a.query_sequence[5:10]
    self.assertEqual(
        pysam.qualities_to_qualitystring(a.query_qualities), None)

    a = self.buildRead()
    s = pysam.qualities_to_qualitystring(a.query_qualities)
    a.query_sequence = a.query_sequence[5:10]
    a.query_qualities = pysam.qualitystring_to_array(s[5:10])

    self.assertEqual(
        pysam.qualities_to_qualitystring(a.query_qualities),
        s[5:10])
def saveread(alignedRead, outfile1, outfile2):
    global r1
    global r2
    global dr1
    global dr2
    if alignedRead.is_read1:
        n = alignedRead.qname
        if alignedRead.is_reverse:
            s = "@" + n + '\n' + strRevComp(alignedRead.seq) + '\n' + "+" + '\n' + \
                pysam.qualities_to_qualitystring(alignedRead.query_qualities[::-1]) + '\n'
        else:
            s = "@" + n + '\n' + alignedRead.seq + '\n' + "+" + '\n' + \
                pysam.qualities_to_qualitystring(alignedRead.query_qualities) + '\n'
        if n in r2:
            outfile1.write(s.encode())
            outfile2.write(zlib.decompress(r2[n]))
            del r2[n]
        elif n in dr2:
            outfile1.write(s.encode())
            outfile2.write(zlib.decompress(dr2[n]))
            del dr2[n]
        else:
            r1[n] = zlib.compress(s.encode(), 1)
    elif alignedRead.is_read2:
        n = alignedRead.qname
        if alignedRead.is_reverse:
            s = "@" + n + '\n' + strRevComp(alignedRead.seq) + '\n' + "+" + '\n' + \
                pysam.qualities_to_qualitystring(alignedRead.query_qualities[::-1]) + '\n'
        else:
            s = "@" + n + '\n' + alignedRead.seq + '\n' + "+" + '\n' + \
                pysam.qualities_to_qualitystring(alignedRead.query_qualities) + '\n'
        if n in r1:
            outfile1.write(zlib.decompress(r1[n]))
            outfile2.write(s.encode())
            del r1[n]
        elif n in dr1:
            outfile1.write(zlib.decompress(dr1[n]))
            outfile2.write(s.encode())
            del dr1[n]
        else:
            r2[n] = zlib.compress(s.encode(), 1)
def quality_array_to_string(quality_list):
    """Convert list of phred quality values to string.

    :param quality_list: List of phred quality scores.
    :returns: Quality string.
    :rtype: str
    """
    return pysam.qualities_to_qualitystring(quality_list)
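A quick, hedged illustration of the round trip behind the helper above. The Phred values are made up for the example, and it assumes pysam's default Sanger offset of 33:

import pysam

# Phred scores 30, 31, 32, 40 encode to '?', '@', 'A', 'I' with offset 33.
phred = [30, 31, 32, 40]
qstring = pysam.qualities_to_qualitystring(phred)
assert qstring == "?@AI"
assert list(pysam.qualitystring_to_array(qstring)) == phred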
def writepair(ar1, ar2, outfile1, outfile2, n):
    if ar1.is_reverse:
        s = "@" + n + '\n' + strRevComp(ar1.seq) + '\n' + "+" + '\n' + \
            pysam.qualities_to_qualitystring(ar1.query_qualities[::-1]) + '\n'
        outfile1.write(s.encode())
        s = "@" + n + '\n' + ar2.seq + '\n' + "+" + '\n' + \
            pysam.qualities_to_qualitystring(ar2.query_qualities) + '\n'
        outfile2.write(s.encode())
    else:
        s = "@" + n + '\n' + ar1.seq + '\n' + "+" + '\n' + \
            pysam.qualities_to_qualitystring(ar1.query_qualities) + '\n'
        outfile1.write(s.encode())
        s = "@" + n + '\n' + strRevComp(ar2.seq) + '\n' + "+" + '\n' + \
            pysam.qualities_to_qualitystring(ar2.query_qualities[::-1]) + '\n'
        outfile2.write(s.encode())
def ParseReadDictionary(self, read_dictionary):
    """Parse a read dictionary and yield header, sequence, and quality as a list."""
    for i in read_dictionary:
        header = i
        seq = ''.join(read_dictionary[i]['seq'])
        qual = ps.qualities_to_qualitystring(read_dictionary[i]['qual'])
        yield [header, seq, qual]
def write_reads(in_bam, out_dir, ctg):
    with open(os.path.join(out_dir, ctg + ".fq"), 'w') as fout:
        with pysam.AlignmentFile(in_bam, 'rb') as bam:
            for line in bam.fetch(contig=ctg):
                rn = line.query_name
                seq = line.query_sequence
                ctg = line.reference_name
                qual = pysam.qualities_to_qualitystring(line.query_qualities)
                if line.mapq == 0 or line.mapq == 255:
                    continue
                fout.write("@%s\n%s\n+\n%s\n" % (rn, seq, qual))
def make_read(seq, cigar, mdtag=None, name="dummy", mapq=10, baseq=30):
    read = pysam.AlignedSegment()
    read.seq = seq
    read.cigarstring = cigar
    if mdtag:
        read.set_tag("MD", mdtag)
    read.qname = name
    read.mapq = mapq
    qualities_string = pysam.qualities_to_qualitystring([baseq] * len(seq))
    qualities_bytes = qualities_string.encode("ascii")
    read.qual = qualities_bytes
    return read
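A minimal sketch of how a factory like make_read above might be exercised. The sequence, CIGAR, and MD values are invented for illustration, and it assumes the qual setter of the installed pysam version accepts the ASCII-encoded quality string the helper passes in (as the snippet implies it does):

read = make_read("ACGTACGT", "8M", mdtag="8", name="r1")
print(read.query_name, read.cigarstring)                       # expected: r1 8M
print(pysam.qualities_to_qualitystring(read.query_qualities))  # expected: "????????" (Phred 30, 8 times)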
def testEmpty(self):
    a = pysam.AlignedSegment()
    self.assertEqual(a.query_name, None)
    self.assertEqual(a.query_sequence, None)
    self.assertEqual(pysam.qualities_to_qualitystring(a.query_qualities), None)
    self.assertEqual(a.flag, 0)
    self.assertEqual(a.reference_id, -1)
    self.assertEqual(a.mapping_quality, 0)
    self.assertEqual(a.cigartuples, None)
    self.assertEqual(a.tags, [])
    self.assertEqual(a.next_reference_id, -1)
    self.assertEqual(a.next_reference_start, -1)
    self.assertEqual(a.template_length, 0)
def testEmpty(self):
    a = pysam.AlignedSegment()
    self.assertEqual(a.query_name, None)
    self.assertEqual(a.query_sequence, None)
    self.assertEqual(pysam.qualities_to_qualitystring(a.query_qualities), None)
    self.assertEqual(a.flag, 0)
    self.assertEqual(a.reference_id, 0)
    self.assertEqual(a.mapping_quality, 0)
    self.assertEqual(a.cigartuples, None)
    self.assertEqual(a.tags, [])
    self.assertEqual(a.next_reference_id, 0)
    self.assertEqual(a.next_reference_start, 0)
    self.assertEqual(a.template_length, 0)
def make_pysam_read(seq, cigar, mdtag=None, name="dummy", mapq=10, baseq=30,
                    reference_start=0, reference_id=0):
    read = pysam.AlignedSegment()
    read.seq = seq
    read.cigarstring = cigar
    if mdtag:
        read.set_tag("MD", mdtag)
    read.qname = name
    read.mapq = mapq
    read.reference_start = reference_start
    read.reference_id = reference_id
    qualities_string = pysam.qualities_to_qualitystring([baseq] * len(seq))
    read.qual = qualities_string.encode("ascii")
    return read
def test_realign_rc(genome_source):
    read = pysam.AlignedSegment()
    read.query_sequence = genome_source.get_seq("chr1", 30, 50, "-")

    alns = genome_source.align(Alignment(read))
    assert len(alns) == 1
    assert alns[0].cigarstring == "21M"
    assert alns[0].reference_start == 30
    assert alns[0].reference_end == 51
    assert alns[0].is_reverse

    qs = "<<<<<<<:<9/,&,22;;<<<"
    read.query_qualities = pysam.qualitystring_to_array(qs)
    alns = genome_source.align(Alignment(read))

    import warnings
    with warnings.catch_warnings():
        # this is a python 2/3 incompatibility I think, where the warning
        # indicates array.tostring() is deprecated but array.tobytes()
        # only exists in py3
        warnings.simplefilter("ignore")
        assert pysam.qualities_to_qualitystring(
            alns[0].query_qualities) == qs[::-1]
def make_read_info(source_align_file, art_aligned_mapped_reads,
                   art_aligned_unmapped_reads):
    global LOGGER
    LOGGER.info("Extracting info from source SAM file (%s)..." %
                source_align_file)
    mapped_reads_info = {}
    unmapped_reads_info = {}
    with pysam.AlignmentFile(source_align_file) as f:
        for r in f:
            if r.is_secondary or r.is_supplementary:
                continue
            query_name = r.query_name
            sequence = r.query_sequence
            is_spliced = False
            if not r.is_unmapped:
                if query_name in art_aligned_mapped_reads:
                    if "N" in r.cigarstring:
                        is_spliced = True
                    mapped_reads_info[query_name] = (r.reference_id,
                                                     r.reference_start,
                                                     r.reference_end,
                                                     r.mapping_quality,
                                                     is_spliced)
            else:
                if query_name in art_aligned_unmapped_reads:
                    unmapped_reads_info[query_name] = (
                        sequence,
                        pysam.qualities_to_qualitystring(r.query_qualities))
    LOGGER.info("Completed info extraction")
    return mapped_reads_info, unmapped_reads_info
def to_unmapped(segment, unal_read1, unal_read2):
    if segment.is_read1:
        if segment.query_name in fwd_read_dict:
            seq = fwd_read_dict[segment.query_name]
            qual = qualities_to_qualitystring(segment.query_qualities)
            if segment.is_reverse:
                qual = qual[::-1]
        else:
            seq = segment.query_sequence
            qual = qualities_to_qualitystring(segment.query_qualities)
            if segment.is_reverse:
                seq = reverse_complement(seq)
                qual = qual[::-1]
        unal_read1.write("".join(
            ["@", segment.query_name, "\n", seq, '\n+\n', qual, '\n']))
    elif segment.is_read2:
        if segment.query_name in rev_read_dict:
            seq = rev_read_dict[segment.query_name]
            qual = qualities_to_qualitystring(segment.query_qualities)
            if segment.is_reverse:
                qual = qual[::-1]
        else:
            seq = segment.query_sequence
            qual = qualities_to_qualitystring(segment.query_qualities)
            if segment.is_reverse:
                seq = reverse_complement(seq)
                qual = qual[::-1]
        unal_read2.write("".join(
            ["@", segment.query_name, "\n", seq, '\n+\n', qual, '\n']))
    else:
        # single end and fully unmapped
        if segment.query_name in fwd_read_dict:
            seq = fwd_read_dict[segment.query_name]
            qual = qualities_to_qualitystring(segment.query_qualities)
            if segment.is_reverse:
                qual = qual[::-1]
        else:
            seq = segment.query_sequence
            qual = qualities_to_qualitystring(segment.query_qualities)
            if segment.is_reverse:
                seq = reverse_complement(seq)
                qual = qual[::-1]
        unal_read1.write("".join(
            ["@", segment.query_name, "\n", seq, '\n+\n', qual, '\n']))
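The branches in to_unmapped above, like saveread and writepair earlier, all apply the same rule for turning a reverse-strand alignment back into its original FASTQ record: reverse-complement the stored sequence and reverse the quality string. A condensed sketch of just that rule follows; _reverse_complement here is a stand-in for the strRevComp / reverse_complement helpers those snippets assume:

import pysam

_COMP = str.maketrans("ACGTN", "TACGN")

def _reverse_complement(seq):
    # stand-in for the strRevComp / reverse_complement helpers used above
    return seq.translate(_COMP)[::-1]

def segment_to_fastq(segment):
    seq = segment.query_sequence
    qual = pysam.qualities_to_qualitystring(segment.query_qualities)
    if segment.is_reverse:
        # restore the read to its original (sequencer) orientation
        seq = _reverse_complement(seq)
        qual = qual[::-1]
    return "@%s\n%s\n+\n%s\n" % (segment.query_name, seq, qual)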
def _init_vardict(self):
    """Init the variable dictionary (context for eval/code exec).

    Tricks:
    - init only those variables that appear as a substring
    """
    self.vardict = self.init_vardict
    alignment = self.alignment
    if 'a' in self.possible_vars:
        self.vardict['a'] = alignment
    if 'QNAME' in self.possible_vars:
        self.vardict['QNAME'] = alignment.query_name
    if 'FLAG' in self.possible_vars:
        self.vardict['FLAG'] = alignment.flag
    if 'POS' in self.possible_vars:
        self.vardict['POS'] = alignment.reference_start + 1
    if 'MAPQ' in self.possible_vars:
        self.vardict['MAPQ'] = alignment.mapping_quality
    if 'CIGAR' in self.possible_vars:
        self.vardict['CIGAR'] = alignment.cigarstring
    if 'PNEXT' in self.possible_vars:
        self.vardict['PNEXT'] = alignment.next_reference_start + 1
    if 'TLEN' in self.possible_vars:
        self.vardict['TLEN'] = alignment.template_length
    if 'SEQ' in self.possible_vars:
        self.vardict['SEQ'] = alignment.query_sequence
    if 'RNAMEi' in self.possible_vars:
        self.vardict['RNAMEi'] = alignment.reference_id
    if 'RNEXTi' in self.possible_vars:
        self.vardict['RNEXTi'] = alignment.next_reference_id

    # the specific implementation depends on the specific version of pysam;
    # we want the same behaviour
    if isinstance(alignment.qual, str):
        if 'QUAL' in self.possible_vars:
            self.vardict['QUAL'] = alignment.qual
        if 'QUALa' in self.possible_vars:
            self.vardict['QUALa'] = [ord(x) for x in alignment.qual]
        if 'QUALs' in self.possible_vars:
            self.vardict['QUALs'] = alignment.qqual
        if 'QUALsa' in self.possible_vars:
            self.vardict['QUALsa'] = [ord(x) for x in alignment.qqual]
    else:
        if 'QUAL' in self.possible_vars:
            self.vardict['QUAL'] = pysam.qualities_to_qualitystring(
                alignment.qual, offset=0)
        if 'QUALa' in self.possible_vars:
            self.vardict['QUALa'] = alignment.qual
        if 'QUALs' in self.possible_vars:
            self.vardict['QUALs'] = pysam.qualities_to_qualitystring(
                alignment.qqual, offset=0)
        if 'QUALsa' in self.possible_vars:
            self.vardict['QUALsa'] = alignment.qqual

    if 'RNAME' in self.possible_vars:
        if alignment.reference_id == -1:
            self.vardict['RNAME'] = '*'
        else:
            self.vardict['RNAME'] = self.in_sam.get_reference_name(
                alignment.reference_id)
    if 'RNEXT' in self.possible_vars:
        if alignment.next_reference_id == -1:
            self.vardict['RNEXT'] = '*'
        else:
            self.vardict['RNEXT'] = self.in_sam.get_reference_name(
                alignment.next_reference_id)
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--input-fastq-file", dest="input_fastq_file",
                      type="string",
                      help="input fastq file. [%default]")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("read-variant", "depth-vcf", "read-list",
                               "coverage-vcf", "barcode"),
                      help="method to apply [%default]")

    parser.add_option(
        "-e", "--input-bed", dest="input_bed_file", type="string",
        help="input file with intervals. Tab-delimited file of intervals "
        "in bed format to restrict analysis to. [%default]")

    parser.add_option(
        "-r", "--region-string", dest="region_string", type="string",
        help="region string. Only apply method in specified region. "
        "[%default]")

    parser.add_option("-f", "--reference-fasta-file",
                      dest="reference_fasta_file",
                      help="reference genomic sequence in fasta format. "
                      "[%default]")

    parser.add_option("--min-base-quality", dest="min_base_quality",
                      type="int",
                      help="minimum base quality for barcode analysis. "
                      "[%default]")

    parser.add_option("-s", "--stepper", dest="stepper", type="choice",
                      choices=("nofilter", "samtools", "all"))

    parser.set_defaults(
        method="read-variant",
        reference_fasta_file=None,
        input_bed_file=None,
        regex_sample_name="([^/]+).bam",
        stepper="nofilter",
        min_base_quality=13,
        region_string=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    pysam_in = pysam.AlignmentFile(args[0], "rb")

    if options.input_bed_file:
        if not os.path.exists(options.input_bed_file):
            raise OSError("input bed file {} does not exist".format(
                options.input_bed_file))
        bed_in = pysam.TabixFile(options.input_bed_file)
    else:
        bed_in = None

    if options.region_string is not None:
        itr = generate_from_region(pysam_in, options.region_string,
                                   stepper=options.stepper,
                                   min_base_quality=options.min_base_quality)
    elif bed_in is not None:
        itr = generate_from_bed(pysam_in, bed_in,
                                stepper=options.stepper,
                                min_base_quality=options.min_base_quality)
    else:
        itr = generate_from_bam(pysam_in,
                                stepper=options.stepper,
                                min_base_quality=options.min_base_quality)

    reference_fasta = pysam.FastaFile(options.reference_fasta_file)

    outf = options.stdout

    counter = E.Counter()

    if options.method == "read-variant":

        outf.write("chromosome\tposition\tref\ttypes\n")

        for pileupcolumn in itr:
            counter.positions_pileup += 1
            reference_base = reference_fasta.fetch(
                pileupcolumn.reference_name,
                pileupcolumn.reference_pos,
                pileupcolumn.reference_pos + 1)

            matches = []
            bases = set()
            for read in pileupcolumn.pileups:
                qpos = read.query_position
                if qpos is not None:
                    base = read.alignment.query_sequence[qpos]
                else:
                    base = "-"
                matches.append((base, read.alignment.query_name))
                bases.add(base)

            bases = list(bases)
            if len(bases) == 1:
                counter.position_noninformative += 1
                if bases[0] == reference_base:
                    counter.position_reference += 1
                continue

            counter.position_informative += 1

            d = {}
            for base in bases:
                d[base] = ",".join([x[1] for x in matches if x[0] == base])

            outf.write("{}\t{}\t{}\t{}\n".format(
                pileupcolumn.reference_name,
                pileupcolumn.reference_pos,
                reference_base,
                json.dumps(d)))

    elif options.method in ("depth-vcf", "coverage-vcf"):

        if options.regex_sample_name:
            sample_name = re.search(options.regex_sample_name,
                                    args[0]).groups()[0]
        else:
            sample_name = "unknown"

        outf.write("##fileformat=VCFv4.1\n")
        outf.write("##FORMAT=<ID=GT,Number=1,Type=String,"
                   "Description=\"Genotype\">\n")
        outf.write("##FORMAT=<ID=DP,Number=1,Type=Integer,"
                   "Description=\"Genotype\">\n")
        outf.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\t"
                   "FILTER\tINFO\tFORMAT\t{}\n".format(sample_name))

        is_depth = options.method == "depth-vcf"

        for idx, pileupcolumn in enumerate(itr):
            if idx % 1000 == 0:
                E.info("processed {} positions".format(idx))

            reference_base = reference_fasta.fetch(
                pileupcolumn.reference_name,
                pileupcolumn.reference_pos,
                pileupcolumn.reference_pos + 1).upper()

            # pick an arbitrary ALT base that differs from the reference
            if reference_base == 'A':
                alt_base = 'C'
            else:
                alt_base = 'A'

            if is_depth:
                n = sum([1 for x in pileupcolumn.pileups
                         if not (x.is_del or x.is_refskip)])
            else:
                n = pileupcolumn.n

            outf.write("{}\t{}\t.\t{}\t{}\t.\tPASS\t.\tGT:DP\t0/1:{}\n".format(
                pileupcolumn.reference_name,
                pileupcolumn.reference_pos,
                reference_base,
                alt_base,
                n))

    elif options.method == "read-list":

        outf.write(
            "chromosome\tposition\treference_base\tbase\tquality\tquery_name\n")

        for pileupcolumn in itr:
            reference_base = reference_fasta.fetch(
                pileupcolumn.reference_name,
                pileupcolumn.reference_pos,
                pileupcolumn.reference_pos + 1)

            matches = []
            for read in pileupcolumn.pileups:
                qpos = read.query_position
                if qpos is not None:
                    base = read.alignment.query_sequence[qpos]
                    quality = read.alignment.query_qualities[qpos]
                else:
                    base = "-"
                    quality = ""

                outf.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
                    pileupcolumn.reference_name,
                    pileupcolumn.reference_pos,
                    reference_base,
                    base,
                    quality,
                    read.alignment.query_name))

    elif options.method == "barcode":

        rows = []
        for c in itr:
            rows.append(
                (c.reference_pos, c.n,
                 "".join(c.get_query_sequences()),
                 pysam.qualities_to_qualitystring(c.get_query_qualities())))

        df = pandas.DataFrame.from_records(
            rows,
            columns=["pos", "gapped_depth", "bases", "qualities"])
        df["depth"] = df.bases.str.len()

        bases = ["A", "C", "G", "T"]
        for b in bases:
            df[b] = df.bases.str.upper().str.count(b)

        df["consensus"] = df[bases].idxmax(axis=1)
        df["consensus_counts"] = df.lookup(df.index, df.consensus)
        df["consensus_support"] = df.consensus_counts / df.depth
        df["offconsensus_counts"] = df.depth - df.consensus_counts
        df.loc[df.consensus_counts == 0, "consensus"] = "N"

        df.to_csv(outf, sep="\t", index=False)

    E.info(counter)

    # write footer and output benchmark information.
    E.stop()
def generate_barcode(self, split_fq_sam_file_List, outdir, samtools_path,
                     split_modify_fq_file_List, file_sign, maxsize,
                     barcode_sequence):
    fqsamlist = list()
    rsplit_fq_sam_file_List = open(split_fq_sam_file_List, 'r')
    for split_fq_sam_file_List_info in rsplit_fq_sam_file_List:
        fqsamlist.append(
            (re.split("\t", split_fq_sam_file_List_info.strip()))[1])
        otherfiletmplist.append(
            (re.split("\t", split_fq_sam_file_List_info.strip()))[1])
    mergedFqBam = fqsamlist[0].replace("0.fq.sam.bam", "fq.merged.bam")
    shelldir = outdir + "/shell"
    check_info(shelldir, "dir")
    shell = shelldir + "/merge_fqbam.sh"
    wshell = open(shell, 'w')
    if len(fqsamlist) > 1:
        allfqsam = " ".join(fqsamlist)
        shell_line = " ".join(
            [samtools_path, "merge -f", mergedFqBam, allfqsam]) + "\n"
    else:
        shell_line = " ".join(["ln -sf", fqsamlist[0], mergedFqBam]) + "\n"
    wshell.write(shell_line)
    sv = find_samtools_version(samtools_path, shelldir)
    sortedFqBamprefix = mergedFqBam.replace("fq.merged.bam", "fq.merged.sorted")
    sortedFqBam = sortedFqBamprefix + ".bam"
    if sv == 0:
        shell_line = " ".join([
            samtools_path, "sort -n -m 1G", mergedFqBam, sortedFqBamprefix
        ]) + "\n"
    else:
        shell_line = " ".join([
            samtools_path, "sort -n -m 1G -o", sortedFqBamprefix + ".bam",
            mergedFqBam
        ]) + "\n"
    wshell.write(shell_line)
    shell_line = " ".join([samtools_path, "index", sortedFqBam]) + "\n"
    wshell.write(shell_line)
    wshell.close()
    subprocess.call(["sh", shell])
    new_fq_prefix = sortedFqBam.replace("fq.merged.sorted.bam", "")
    otherfiletmplist.append(mergedFqBam)
    otherfiletmplist.append(sortedFqBam)
    otherfiletmplist.append(sortedFqBam + ".bai")
    wsplit_modify_fq_file_List = open(split_modify_fq_file_List, 'w')
    SplitSize = 0
    rsortedFqBam = pysam.AlignmentFile(sortedFqBam, 'rb')
    readid = "N"
    barcodeid = 0
    barcode_marker = "N"
    new_barcode_sequence = "N"
    start = 11
    s = 0
    split_modify_fq_file = fqsamlist[0].replace(
        "0.fq.sam.bam", "") + str(s) + ".BX.modified.fq.gz"
    wsplit_modify_fq_file_List.write(split_modify_fq_file + "\n")
    wsplit_modify_fq_file = gzip.open(split_modify_fq_file, 'wb')
    for FqBaminfo in rsortedFqBam:
        (real_barcode, real_readid) = re.split("_", FqBaminfo.query_name)
        if FqBaminfo.query_name != readid:
            readid = FqBaminfo.query_name
            if barcode_marker != real_barcode:
                barcode_marker = real_barcode
                (start, new_barcode_suffix) = self.get_new_barcode(
                    barcodeid, start)
                barcodeid += 1
                new_barcode_sequence = barcode_sequence + new_barcode_suffix
            real_readid = '@' + real_readid + "/1\tBC:Z:" + new_barcode_sequence
        else:
            real_readid = '@' + real_readid + "/2\tBC:Z:" + new_barcode_sequence
        SplitSize += 4
        complete_read_info = "\n".join([
            real_readid, FqBaminfo.query_sequence, "+",
            pysam.qualities_to_qualitystring(FqBaminfo.query_qualities)
        ]) + "\n"
        if SplitSize > maxsize:
            wsplit_modify_fq_file.close()
            s += 1
            SplitSize = 4
            split_modify_fq_file = fqsamlist[0].replace(
                "0.fq.sam.bam", "") + str(s) + ".BX.modified.fq.gz"
            wsplit_modify_fq_file_List.write(split_modify_fq_file + "\n")
            wsplit_modify_fq_file = gzip.open(split_modify_fq_file, 'wb')
        wsplit_modify_fq_file.write(complete_read_info.encode())
    rsortedFqBam.close()
    wsplit_modify_fq_file_List.close()
    wsplit_modify_fq_file.close()
    wfile_sign = open(file_sign, 'w')
    wfile_sign.write("done!\n")
    wfile_sign.close()
    return split_modify_fq_file_List