def make_read(pos, aend):
    """Build a dummy all-N read spanning [pos, aend) on the reference."""
    read = AlignedRead()
    read.pos = pos
    read.seq = 'N' * (aend - pos)
    read.cigarstring = '{}M'.format(aend - pos)
    assert read.aend == aend, '{} != {}'.format(read.aend, aend)
    return read

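# Usage sketch (an assumption, not taken from the source): build a 10 bp dummy
# read starting at reference position 100; its end coordinate follows from the
# position plus the CIGAR length, which is exactly what the assert inside
# make_read checks. Assumes the legacy `from pysam import AlignedRead` alias
# (AlignedSegment) is in scope.
example_read = make_read(100, 110)
print(example_read.pos, example_read.aend, example_read.cigarstring)  # 100 110 10M
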
def alignedRead(self):
    """Returns an AlignedRead object of the subsampled read, aligned perfectly to the reference."""
    aRead = AlignedRead()
    aRead.seq = str(self.seq)
    aRead.qual = self.qscore('ascii')
    aRead.qname = self.id
    aRead.is_reverse = self.opt.is_reverse
    ## Add the start position of where the read should align to
    # aRead.positions = []
    return aRead

def get_barcode_index(self, read: pysam.AlignedRead) -> Optional[int]:
    """Returns None if the barcode is not in the whitelist, otherwise a small integer."""
    if not read.has_tag(self.tag):
        return None
    if self.use_rg:
        # require the RG tag to be available for each read
        barcode = read.get_tag(self.tag), read.get_tag("RG")
    else:
        barcode = read.get_tag(self.tag)
    return self.barcode2index.get(barcode, None)

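# Hypothetical driver for get_barcode_index (names are assumptions, not from
# the source): iterate a tagged BAM with pysam and count reads per whitelisted
# barcode. `indexer` stands for an instance of the enclosing class and
# "tagged.bam" is a placeholder path.
import collections
import pysam

barcode_counts = collections.Counter()
with pysam.AlignmentFile("tagged.bam", "rb") as bam:
    for bam_read in bam:
        idx = indexer.get_barcode_index(bam_read)  # indexer: instance of the class above
        if idx is not None:
            barcode_counts[idx] += 1
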
def testbasicQuery():
    """Test DB stuff"""
    ## testing db stuff
    database = errordb('testErrors')
    database.deleteAll()
    a = AlignedRead()
    a.seq = "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG"
    a.qual = '++))++)+*)******)))+)**+*+++)**)*+)'
    errorObj = error('A', 'T', a, 0, refPos=1000)
    errorObj = database.addError(error=errorObj)

def testErrorClassBasic():
    """Just check that everything works in the most basic case"""
    a = AlignedRead()
    a.seq = "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG"
    a.qual = '++))++)+*)******)))+)**+*+++)**)*+)'
    errorObj = error('A', 'T', a, 0, refPos=1000)
    assert_equal(errorObj.true, 'A')
    assert_equal(errorObj.emission, 'T')
    assert_equal(errorObj.before(2), 'NN')
    assert_equal(errorObj.after(2), 'GC')
    assert_equal(errorObj.isSnp, True)
    assert_equal(errorObj.isIndel, False)
    assert_equal(errorObj.qual, 10)
    assert_equal(errorObj.qscore(2), 8)

def find_errors(self, query={}, filt=None):
    errorList = []
    for document in self.find(query, filt):
        read = AlignedRead()
        # read.seq = str(document['read'])
        read.seq = ''
        # note: qname is computed here but not assigned; a placeholder name is used below
        qname = 'st=%s&id=%s' % (str(document['refPos']), document['readID'])
        read.qname = "TMPCHANGE"
        errorList.append(error(true=document['true'],
                               emission=document['emission'],
                               read=read,
                               readPos=document['readPos'],
                               readLength=document['readLength'],
                               refPos=document['refPos']))
    return errorList

def artificial_read(flag=163):
    # flag 163: paired, proper pair, mate on reverse strand, second in pair
    aln = AlignedRead()
    aln.qname = "try_this"
    aln.flag = flag
    aln.cigarstring = "16M"
    aln.seq = "A" * 16
    aln.pos = 1000000 if flag == 163 else 1000020
    aln.reference_id = 1
    return aln

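# Sketch (assumption, not from the source): flag 163 decodes to
# 0x1 | 0x2 | 0x20 | 0x80, i.e. paired, proper pair, mate on the reverse
# strand, second in pair; pysam exposes these bits as boolean properties.
paired_read = artificial_read(flag=163)
print(paired_read.is_paired, paired_read.is_proper_pair,
      paired_read.mate_is_reverse, paired_read.is_read2)  # True True True True
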
def parse_read(read: AlignedRead) -> Optional[Tuple[float, int]]:
    """
    Returns None if the read should be ignored. The read can still be ignored
    later if its barcode is not in the barcode list.
    """
    if read.get_tag("AS") <= len(read.seq) - 8:  # more than 2 edits
        return None
    if read.get_tag("NH") > 1:  # multi-mapped
        return None
    if not read.has_tag("UB"):  # does not have a molecule barcode
        return None
    if read.mapq < 20:  # should not be triggered because of NH, but just in case
        return None
    p_misaligned = 0.01  # default value
    ub = hash_string(read.get_tag("UB"))
    return p_misaligned, ub

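# Hypothetical usage of parse_read (file name is a placeholder; assumes the
# aligner wrote AS and NH tags, e.g. STAR-style output): collect
# (p_misaligned, UMI hash) pairs for reads that survive the filters.
import pysam

kept_pairs = []
with pysam.AlignmentFile("input.bam", "rb") as bam_in:
    for aligned in bam_in:
        parsed = parse_read(aligned)
        if parsed is not None:
            kept_pairs.append(parsed)
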
def mkread(seq, quals, cigar, pos=None):
    global _READS_MADE
    r = AlignedRead()
    r.qname = 'testread{}'.format(_READS_MADE)
    r.seq = seq
    quals = [22] * len(seq) if quals is None else quals
    r.qual = ''.join([chr(q + 33) for q in quals])
    r.cigarstring = cigar
    if pos:
        r.pos = pos
    _READS_MADE += 1
    return r

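# Usage sketch (assumption): mkread expects a module-level _READS_MADE counter,
# initialised here alongside the call. A 10 bp read with a 2 bp soft clip keeps
# the CIGAR consistent with the sequence length.
_READS_MADE = 0
test_read = mkread("ACGTACGTAC", None, "2S8M", pos=500)
print(test_read.qname, test_read.cigarstring, test_read.pos)  # testread0 2S8M 500
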
def _string_to_aligned_segment(line, seq_dict, log_output):
    """Converts a SAM record in string format to a pysam AlignedRead.

    Args:
        line: String of SAM record
        seq_dict: Dictionary mapping reference ID to reference ID index
        log_output: Handle for outputting log information

    Returns:
        aligned_segment: pysam AlignedRead class with values from 'line'
    """
    line = line.strip().split()
    #print(line)
    aligned_segment = AlignedRead()
    aligned_segment.query_name = line[0]
    aligned_segment.flag = int(line[1])
    if line[2] != "*":
        aligned_segment.reference_id = seq_dict[line[2]]
    aligned_segment.reference_start = int(line[3]) - 1
    aligned_segment.mapping_quality = int(line[4])
    # parse the CIGAR string into (operation, length) tuples
    cigartuples = []
    pos = ""
    for symbol in line[5]:
        if symbol.isdigit():
            pos += symbol
        elif symbol == "*":
            continue
        else:
            cigartuples.append((_CIGAR_OPERATIONS[symbol], int(pos)))
            pos = ""
    aligned_segment.cigartuples = cigartuples
    if line[6] == "=":
        aligned_segment.next_reference_id = seq_dict[line[2]]
    elif line[6] != "*":
        aligned_segment.next_reference_id = seq_dict[line[6]]
    aligned_segment.next_reference_start = int(line[7]) - 1
    aligned_segment.template_length = int(line[8])
    aligned_segment.query_sequence = line[9]
    aligned_segment.query_qualities = qualitystring_to_array(line[10])
    # optional tag fields: TAG:TYPE:VALUE
    for field in line[11:]:
        tag, tag_type, val = field.split(":", maxsplit=2)
        if tag_type == "i":
            val = int(val)
        elif tag_type == "f":
            val = float(val)
        elif tag_type == "H":
            val = bytearray.fromhex(val)
        elif tag_type == "B":
            val = [int(i) for i in val.split(",")]
        elif not (tag_type == "A" or tag_type == "Z"):
            err_msg = "Optional tag type '{}' not recognised".format(tag_type)
            log_output.write("ERROR: {}\n".format(err_msg))
            raise Exception(err_msg)
        aligned_segment.set_tag(tag, val, value_type=tag_type)
    return aligned_segment

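# _CIGAR_OPERATIONS is referenced above but not shown; a sketch consistent with
# the numeric CIGAR codes pysam expects in cigartuples (M=0, I=1, D=2, N=3,
# S=4, H=5, P=6, ==7, X=8). qualitystring_to_array is the pysam helper used
# for the quality field.
from pysam import qualitystring_to_array

_CIGAR_OPERATIONS = {
    "M": 0, "I": 1, "D": 2, "N": 3,
    "S": 4, "H": 5, "P": 6, "=": 7, "X": 8,
}
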
def main():
    parser = OptionParser(usage=usage)
    #parser.add_option("-s", action="store_true", dest="sam_input", default=False,
    #                  help="Input is in SAM format instead of BAM format")
    (options, args) = parser.parse_args()
    if len(args) != 4:
        parser.print_help()
        sys.exit(1)
    psl_filename = args[0]
    ref_filename = args[1]
    contigs_filename = args[2]
    bam_filename = args[3]
    liftover_dir = args[1]
    references, ref_chromosomes = read_fasta(ref_filename)
    refname_to_id = dict([(name, i) for i, name in enumerate(ref_chromosomes)])
    print('Read', len(ref_chromosomes), 'reference chromosomes:', ','.join(ref_chromosomes), file=sys.stderr)
    contigs, contig_names = read_fasta(contigs_filename)
    print('Read', len(contig_names), 'contigs.', file=sys.stderr)
    bam_header = {
        'HD': {'VN': '1.0'},
        'SQ': [dict([('LN', len(references[chromosome])), ('SN', chromosome)])
               for chromosome in ref_chromosomes]
    }
    outfile = Samfile(bam_filename, 'wb', header=bam_header)
    line_nr = 0
    header_read = False
    for line in (s.strip() for s in open(psl_filename)):
        line_nr += 1
        if line.startswith('------'):
            header_read = True
            continue
        if not header_read:
            continue
        fields = line.split()
        assert len(fields) == 21, 'Error reading PSL file, offending line: %d' % line_nr
        sizes = [int(x) for x in fields[18].strip(',').split(',')]
        contig_starts = [int(x) for x in fields[19].strip(',').split(',')]
        ref_starts = [int(x) for x in fields[20].strip(',').split(',')]
        assert 0 < len(sizes) == len(contig_starts) == len(ref_starts)
        strand = fields[8]
        contig_name = fields[9]
        ref_name = fields[13]
        assert strand in ['-', '+']
        assert contig_name in contigs
        assert ref_name in references
        a = AlignedRead()
        a.qname = contig_name
        if strand == '+':
            a.seq = str(contigs[contig_name])
        else:
            a.seq = str(contigs[contig_name].reverse_complement())
        a.flag = (16 if strand == '+' else 0)
        a.rname = refname_to_id[ref_name]
        a.pos = ref_starts[0]
        a.mapq = 255
        qpos = contig_starts[0]
        refpos = ref_starts[0]
        cigar = []
        # soft-clipping at the start?
        if contig_starts[0] > 0:
            cigar.append((4, contig_starts[0]))
        longest_insertion = 0
        longest_deletion = 0
        total_matches = 0
        total_insertion = 0
        total_deletion = 0
        for length, contig_start, ref_start in zip(sizes, contig_starts, ref_starts):
            assert contig_start >= qpos
            assert ref_start >= refpos
            # insertion?
            if contig_start > qpos:
                insertion_length = contig_start - qpos
                longest_insertion = max(longest_insertion, insertion_length)
                total_insertion += insertion_length
                append_to_cigar(cigar, 1, insertion_length)
                qpos = contig_start
            # deletion?
            if ref_start > refpos:
                deletion_length = ref_start - refpos
                longest_deletion = max(longest_deletion, deletion_length)
                total_deletion += deletion_length
                append_to_cigar(cigar, 2, deletion_length)
                refpos = ref_start
            # stretch of matches/mismatches
            append_to_cigar(cigar, 0, length)
            refpos += length
            qpos += length
            total_matches += length
        # soft-clipping at the end?
        if len(a.seq) > qpos:
            cigar.append((4, len(a.seq) - qpos))
        a.cigar = tuple(cigar)
        # only use contigs where the longest deletion is <= 10000 bp
        if longest_deletion > 10000:
            continue
        # require at least 200 matching positions
        if total_matches < 200:
            continue
        # require the matching positions to make up at least 75 percent of the contig
        # (without counting parts of the contig that are insertions).
        if total_matches / (len(a.seq) - total_insertion) < 0.75:
            continue
        outfile.write(a)
    outfile.close()

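# append_to_cigar is called above but not defined in this excerpt; a minimal
# sketch consistent with those calls. Merging consecutive entries with the same
# operation is an assumption about the helper's behaviour.
def append_to_cigar(cigar, op, length):
    """Append (op, length) to a cigar list, merging with the last entry if the op matches."""
    if cigar and cigar[-1][0] == op:
        cigar[-1] = (op, cigar[-1][1] + length)
    else:
        cigar.append((op, length))
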
def build_read(self, read_tags, is_forward):
    read = AlignedRead()
    read.seq = self.sequence.sequence_minus_deletions()
    read.rname = self.chrom_id
    read.pos = self.read_start
    read.mapq = self.mapping_quality
    read.cigarstring = self.sequence.cigar
    read.rnext = self.chrom_id
    read.pnext = self.read_mate_start
    read.tlen = self.insert_size
    read.qual = self.quality.ascii_quality
    read.tags = read_tags
    read.qname = self.read_id
    if self.read_flags is None:
        read.flag = FORWARD_GOOD_READ if is_forward else REVERSE_GOOD_READ
    else:
        read.flag = self.read_flags
    return read

def alignment_info_to_sam(seqrecord, aln_info, mate_id, mate_info, read_group, is_first):
    """\
    Convert the internal alignment structure into an official SAM record.

    The reason to go immediately to SAM is for compatibility with other tools,
    and the ability to write out a file that can be easily visualized. A small
    annoyance is that downstream we will have to re-infer the location of
    mismatch/ins/del elements from the cigar string, but that is a small price to pay.

    :param seqrecord: the read (a Bio.SeqRecord.SeqRecord object)
    :param aln_info: the alignment information
    :param mate_id: the mate for the read
    :param mate_info: the alignment information for the mate
    :param read_group: the read group for this alignment
    :param is_first: whether this read is the first of the pair
    :returns: a SAMRecord for the alignment
    """
    samrecord = AlignedRead()
    samrecord.qname = seqrecord.id.rsplit(':', 1)[0]
    samrecord.seq = str(seqrecord.seq).upper()
    samrecord.is_unmapped = aln_info is None
    if aln_info:
        samrecord.mapq = 255  # TODO alignment quality?
        samrecord.pos = aln_info.offset
        samrecord.tags += [("NM", aln_info.mismatches), ("RG", read_group)]
        #samrecord.cigar = [(0, len(str(seqrecord.seq)))]  # TODO allow indels at some point
        #samrecord.cigarstring = '{}M'.format(len(str(seqrecord.seq)))
        samrecord.cigarstring = aln_info.cigar
        samrecord.rname, samrecord.tid = 0, 0  # TODO deal with multiple contigs
    else:
        samrecord.tags += [("RG", read_group)]
    if mate_info:
        samrecord.mpos = mate_info.offset
        samrecord.pnext = mate_info.offset
        samrecord.rnext = 0  # TODO deal with multiple contigs
        samrecord.mate_is_reverse = mate_info.reversed
    if aln_info and mate_info:
        # proper pair: reads are pointing at each other
        if aln_info.offset < mate_info.offset:
            samrecord.is_proper_pair = mate_info.reversed and not aln_info.reversed
        else:
            samrecord.is_proper_pair = aln_info.reversed and not mate_info.reversed
        # calculate insert
        first, second = (aln_info, mate_info) if is_first else (mate_info, aln_info)
        samrecord.isize = first.offset - second.offset if first.reversed else second.offset - first.offset
    is_reverse = aln_info is not None and aln_info.reversed
    if is_reverse:
        samrecord.seq = samrecord.seq[::-1]
    is_unmapped = aln_info is None
    mate_is_unmapped = mate_info is None
    mate_is_reverse = mate_info is not None and mate_info.reversed
    is_second = not is_first
    # TODO allow unpaired reads (the 0x1 flag)
    samrecord.flag = (0x1 | 0x2
                      | 0x4 * is_unmapped
                      | 0x8 * mate_is_unmapped
                      | 0x10 * is_reverse
                      | 0x20 * mate_is_reverse
                      | 0x40 * is_first
                      | 0x80 * is_second)
    if samrecord.is_unmapped:
        samrecord.tid = -1
        samrecord.pos = -1
        samrecord.cigarstring = '*'
        samrecord.cigar = []
    else:
        if not samrecord.seq:
            # cleared by PySam (this can happen for certain cigar strings)
            samrecord.seq = str(seqrecord.seq).upper()
        try:
            if sum([x[1] for x in samrecord.cigar if x[0] != 2]) != len(samrecord.seq):
                print('ERROR AT POSITION {}'.format(samrecord.pos))
                raise ValueError('Cigar {} does not fit sequence {}'.format(
                    samrecord.cigarstring, samrecord.seq))
        except TypeError:
            print('No seq in record: {}'.format(samrecord))
            print('WTF is going on? \n{}'.format(seqrecord))
    samrecord.qual = ''.join([
        chr(q + 33) for q in seqrecord._per_letter_annotations['phred_quality']
    ])
    return samrecord

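# Worked example of the flag arithmetic above (not from the source): a mapped,
# forward, first-in-pair read whose mate is mapped on the reverse strand gets
# 0x1 | 0x2 | 0x20 | 0x40 = 99, the standard "proper pair, mate reverse,
# first in pair" SAM flag.
example_flag = (0x1 | 0x2
                | 0x4 * False    # is_unmapped
                | 0x8 * False    # mate_is_unmapped
                | 0x10 * False   # is_reverse
                | 0x20 * True    # mate_is_reverse
                | 0x40 * True    # is_first
                | 0x80 * False)  # is_second
print(example_flag)  # 99
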