def make_joined_read(mate, reads, tags=None):
    """Join several read segments into one new alignment record.

    The sequences and qualities of *reads* are concatenated (in list
    order) into a single ``pysam.AlignedRead``.  The record is flagged as
    paired but not as a proper pair, and all mate information is set to
    'unmapped'.

    Parameters:
        mate: 0 marks the result as read1; any other value marks it
            as read2.
        reads: non-empty sequence of alignment records to join.  When the
            first record is mapped to the reverse strand the segments
            are re-ordered with ``sorted(reads, reverse=True)`` before
            joining.
        tags: optional iterable of ``(name, value)`` tags to attach to
            the joined read.  It is copied, so the caller's object is
            never modified.

    Returns:
        The newly built ``pysam.AlignedRead``.

    Raises:
        IndexError: if *reads* is empty.
    """
    # work on a private copy: the tags appended below must not leak into
    # a list owned by the caller
    tags = [] if tags is None else list(tags)
    # flip reverse strand reads
    # NOTE(review): this relies on the ordering pysam defines between
    # AlignedRead objects -- confirm it yields the intended segment order
    if not reads[0].is_unmapped and reads[0].is_reverse:
        reads = sorted(reads, reverse=True)
    # looked up only after the optional re-ordering above, because the
    # first element may have changed
    first = reads[0]
    # make new read
    a = pysam.AlignedRead()
    # create paired-end reads but do not mark them
    # as proper pairs and set all mate information
    # to 'unmapped'
    a.qname = first.qname
    a.seq = ''.join(r.seq for r in reads)
    a.qual = ''.join(r.qual for r in reads)
    a.is_paired = True
    a.is_proper_pair = False
    a.mate_is_unmapped = True
    a.mrnm = -1
    a.mpos = -1
    a.is_read1 = (mate == 0)
    a.is_read2 = (mate != 0)
    a.isize = 0
    a.mapq = 255
    a.is_unmapped = first.is_unmapped
    if a.is_unmapped:
        a.rname = -1
        a.pos = 0
        # add the XM tag from bowtie saying whether unmapped
        # due to multimapping or other reason
        tags.append(('XM', min(r.opt('XM') for r in reads)))
    else:
        a.is_reverse = first.is_reverse
        a.rname = first.rname
        a.pos = first.pos
        # one operation of type 0 spanning the whole joined sequence
        a.cigar = ((0, len(a.seq)), )
        # edit distance of the joined read is the sum over its segments
        tags.append(('NM', sum(r.opt('NM') for r in reads)))
        # compute mismatches to reference (MD)
        tags.append(('MD', merge_MD_tags([r.opt('MD') for r in reads])))
    a.tags = tags
    return a
def translate_read(read, chrom, strand, intervals):
    """Rewrite an alignment so that it refers to the genomic reference.

    Behaviour by case:
      * unmapped reads are returned untouched;
      * ``chrom == -1``: the alignment cannot be translated, so a new
        dummy unmapped read (tagged ``XM`` = 0) is returned instead;
      * ``chrom >= 0`` with ``intervals is None``: the read already maps
        to a genomic reference, so only its reference id is updated;
      * otherwise the read is converted in place using the genomic
        intervals computed by
        ``translate_transcriptome_to_genomic_intervals`` and, when
        *strand* equals ``STRAND_REV``, flipped to the opposite strand.

    Returns the (possibly modified) *read*, or the dummy unmapped read.
    """
    # nothing to translate for unmapped reads
    if read.is_unmapped:
        return read

    if chrom == -1:
        # throw away reads that cannot be translated by
        # creating a dummy unmapped read
        dummy = pysam.AlignedRead()
        dummy.qname = read.qname
        dummy.seq = read.seq
        dummy.is_unmapped = True
        dummy.rname = -1
        dummy.pos = -1
        dummy.mapq = 0
        dummy.mrnm = -1
        dummy.mpos = -1
        dummy.isize = 0
        dummy.qual = read.qual
        dummy.tags = [("XM", 0)]
        return dummy

    if (chrom >= 0) and (intervals is None):
        # read maps directly to a genomic reference so simply
        # alter the reference id to correctly refer to the new
        # SAM header
        read.rname = chrom
        return read

    genomic_intervals = translate_transcriptome_to_genomic_intervals(
        read, chrom, strand, intervals)
    spliced, cigar = get_cigar(genomic_intervals)
    if spliced:
        xs_value = "-" if strand else "+"
        read.tags = read.tags + [("XS", xs_value)]

    # move the read onto its genomic coordinates
    read.rname = chrom
    read.pos = genomic_intervals[0][0]
    read.cigar = cigar

    # flip reads that aligned to negative strand genes
    if strand == STRAND_REV:
        # grab the qualities before the sequence is replaced
        flipped_quals = read.qual[::-1]
        read.is_reverse = not read.is_reverse
        read.seq = DNA_reverse_complement(read.seq)
        read.qual = flipped_quals
        # the MD tag has to be reverse complemented along with the read
        read.tags = [
            (name, reverse_complement_MD_tag(val) if name == 'MD' else val)
            for name, val in read.tags
        ]
    return read
def copy_read(r):
    """Return a new ``pysam.AlignedRead`` carrying the same fields as *r*.

    The fields copied are the query name, sequence, flag, reference id,
    position, mapping quality, cigar, mate reference id, mate position,
    insert size, qualities and tags.
    """
    # assignment order is significant and kept as-is (the sequence is
    # assigned before the qualities)
    copied_fields = (
        "qname", "seq", "flag", "rname", "pos", "mapq",
        "cigar", "mrnm", "mpos", "isize", "qual", "tags",
    )
    duplicate = pysam.AlignedRead()
    for field in copied_fields:
        setattr(duplicate, field, getattr(r, field))
    return duplicate
def make_unmapped_copy(r):
    """Build an unmapped version of the read *r*.

    The new ``pysam.AlignedRead`` keeps the query name, sequence,
    qualities and read1/read2 flags of *r*.  It is flagged as unmapped and
    paired (not a proper pair) with an unmapped mate, all reference and
    mate coordinates are cleared, the cigar is empty and the only tag is
    ``XM`` = 0.
    """
    unmapped = pysam.AlignedRead()
    # (attribute, value) pairs, applied strictly in this order
    assignments = (
        ("qname", r.qname),
        ("seq", r.seq),
        ("qual", r.qual),
        ("is_unmapped", True),
        ("is_qcfail", False),
        ("is_paired", True),
        ("is_proper_pair", False),
        ("mate_is_unmapped", True),
        ("mrnm", -1),
        ("mpos", -1),
        ("is_read1", r.is_read1),
        ("is_read2", r.is_read2),
        ("isize", 0),
        ("mapq", 255),
        ("is_reverse", False),
        ("rname", -1),
        ("pos", 0),
        ("cigar", ()),
        ("tags", (('XM', 0), )),
    )
    for attribute, value in assignments:
        setattr(unmapped, attribute, value)
    return unmapped
def _write_fastq_records(parsers, qual_func, bamfh):
    """Write records from *parsers* to *bamfh*, one per parser per round."""
    try:
        # 'while parsers' (not 'while True'): with no parsers the inner
        # loop never runs, StopIteration is never raised and the loop
        # would spin forever
        while parsers:
            for mate, parser in enumerate(parsers):
                qname, seq, qual = next(parser)
                a = pysam.AlignedRead()
                a.rname = -1
                a.mrnm = -1
                # pos and mpos are not assigned (the original assignments
                # to 0 were commented out)
                a.qname = qname
                a.seq = seq
                a.qual = qual_func(qual)
                a.is_read1 = (mate == 0)
                a.is_read2 = (mate == 1)
                bamfh.write(a)
    except StopIteration:
        # the first exhausted input ends the conversion
        pass


def fastq_to_bam(fastq_files, qual_format, bam_file):
    """Convert FASTQ file(s) into a BAM file of records without reference.

    Records are taken from the inputs in round-robin order, so records of
    the first file are flagged read1 and records of the second file
    read2.  Conversion stops as soon as any input runs out of records;
    records already written in that round stay in the output.

    Parameters:
        fastq_files: sequence of FASTQ file paths.  An empty sequence
            produces a BAM file containing only the header.
        qual_format: quality encoding name, passed to
            ``get_qual_conversion_func`` to obtain the function applied
            to every quality string.
        bam_file: path of the BAM file to create.

    All input files and the BAM file are closed on exit, including when
    an error interrupts the conversion.
    """
    qual_func = get_qual_conversion_func(qual_format)
    # header carries only the HD line; no reference sequences ('SQ')
    # are declared
    header = {'HD': {'VN': '1.0', 'SO': 'unknown'}}
    fastq_handles = []
    try:
        for path in fastq_files:
            fastq_handles.append(open(path))
        parsers = [parse_fastq(fh) for fh in fastq_handles]
        bamfh = pysam.Samfile(bam_file, "wb", header=header)
        try:
            _write_fastq_records(parsers, qual_func, bamfh)
        finally:
            bamfh.close()
    finally:
        for fh in fastq_handles:
            fh.close()