def make_read(pos, aend):
    """Build an all-'N' read spanning ``pos`` up to ``aend``.

    The sequence and the single ``<length>M`` CIGAR operation both cover
    ``aend - pos`` bases.  Once the read is assembled, its own ``aend``
    attribute is asserted to equal the requested ``aend``.
    """
    span = aend - pos
    segment = AlignedRead()
    segment.pos = pos
    segment.seq = 'N' * span
    segment.cigarstring = '{}M'.format(span)
    # Sanity check: the end coordinate reported by the read must match.
    reported_end = segment.aend
    assert reported_end == aend, '{} != {}'.format(reported_end, aend)
    return segment
Exemple #2
0
 def alignedRead(self):
     """Return an AlignedRead object for this subsampled read.

     The sequence, the ASCII-encoded quality string, the read id (as the
     query name) and the reverse-strand flag from ``self.opt`` are copied
     onto the new object.  No alignment position is set.
     """
     record = AlignedRead()
     record.seq = str(self.seq)
     record.qual = self.qscore('ascii')
     record.qname = self.id
     record.is_reverse = self.opt.is_reverse
     # TODO: add the start position the read should align to
     # (previously sketched as a commented-out ``positions`` assignment).
     return record
Exemple #3
0
 def get_barcode_index(self, read: pysam.AlignedRead) -> Optional[int]:
     """Look up the index of the barcode carried by ``read``.

     Returns None when the read has no ``self.tag`` tag or when its barcode
     is not a key of ``self.barcode2index``; otherwise returns the mapped
     value.  With ``self.use_rg`` enabled the lookup key is the pair
     (barcode tag value, "RG" tag value).
     """
     tag_name = self.tag
     if not read.has_tag(tag_name):
         return None
     key = read.get_tag(tag_name)
     if self.use_rg:
         # RG must be present on every read: it is fetched without a
         # has_tag guard.
         key = (key, read.get_tag("RG"))
     return self.barcode2index.get(key)
def testbasicQuery():
    """Smoke-test adding one error object to the 'testErrors' database."""
    # Open the test database and call deleteAll() before adding anything.
    db = errordb('testErrors')
    db.deleteAll()

    read = AlignedRead()
    read.seq = "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG"
    read.qual = '++))++)+*)******)))+)**+*+++)**)*+)'

    # The value returned by addError is not inspected by this test.
    db.addError(error=error('A', 'T', read, 0, refPos=1000))
def testErrorClassBasic():
    """Check that the error class works in the most basic case."""
    read = AlignedRead()
    read.seq = "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG"
    read.qual = '++))++)+*)******)))+)**+*+++)**)*+)'
    err = error('A', 'T', read, 0, refPos=1000)

    # true / emitted bases
    assert_equal(err.true, 'A')
    assert_equal(err.emission, 'T')

    # flanking context: the error sits at read position 0, and before(2)
    # is expected to come back as 'NN'
    assert_equal(err.before(2), 'NN')
    assert_equal(err.after(2), 'GC')

    # classification
    assert_equal(err.isSnp, True)
    assert_equal(err.isIndel, False)

    # quality values
    assert_equal(err.qual, 10)
    assert_equal(err.qscore(2), 8)
Exemple #6
0
 def find_errors(self, query=None, filt=None):
     """Return a list of ``error`` objects, one per document found.

     ``query`` and ``filt`` are passed on to ``self.find``.  When ``query``
     is omitted (or None) a fresh empty dict is used, so no dict object is
     shared between calls.

     Each error is attached to a placeholder read (empty sequence, query
     name "TMPCHANGE"); the remaining fields are taken from the document's
     'true', 'emission', 'readPos', 'readLength' and 'refPos' entries.
     """
     if query is None:
         query = {}
     errorList = []
     for document in self.find(query, filt):
         read = AlignedRead()
         # The stored read sequence is deliberately not restored here.
         # read.seq = str(document['read'])
         read.seq = ''
         # Placeholder name; a 'st=<refPos>&id=<readID>' name used to be
         # built here but was never assigned to the read.
         read.qname = "TMPCHANGE"
         errorList.append(error(true=document['true'],
                                emission=document['emission'],
                                read=read, readPos=document['readPos'],
                                readLength=document['readLength'],
                                refPos=document['refPos']))
     return errorList
def artificial_read(flag=163):
    """Create a 16-base all-'A' alignment named "try_this".

    The read gets CIGAR ``16M`` and reference id 1.  Its start is 1000000
    when ``flag`` equals 163 and 1000020 for any other flag value.
    """
    n_bases = 16
    if flag == 163:
        start = 1000000
    else:
        start = 1000020

    record = AlignedRead()
    record.qname = "try_this"
    record.flag = flag
    record.cigarstring = "{}M".format(n_bases)
    record.seq = "A" * n_bases
    record.pos = start
    record.reference_id = 1
    return record
Exemple #8
0
def parse_read(read: AlignedRead) -> Optional[Tuple[float, int]]:
    """Filter a read and extract its misalignment probability and UMI hash.

    Returns None when the read should be ignored; otherwise a tuple
    ``(p_misaligned, ub)`` where ``ub`` is ``hash_string`` applied to the
    "UB" tag.  A read that passes here can still be ignored later if it is
    not in the barcode list.
    """
    rejected = (
        # alignment score too low for the read length (more than 2 edits)
        read.get_tag("AS") <= len(read.seq) - 8
        # multi-mapped
        or read.get_tag("NH") > 1
        # does not have molecule barcode
        or not read.has_tag("UB")
        # should not be triggered because of the NH check, but just in case
        or read.mapq < 20
    )
    if rejected:
        return None

    molecule = hash_string(read.get_tag("UB"))
    p_misaligned = 0.01  # default value
    return p_misaligned, molecule
Exemple #9
0
def mkread(seq, quals, cigar, pos=None):
    """Create a uniquely named test read.

    :param seq: read sequence
    :param quals: iterable of integer quality scores, one per base; None
        gives every base a score of 22
    :param cigar: CIGAR string for the read
    :param pos: start position; left at the AlignedRead default when None
    :returns: the new AlignedRead, named ``testread<N>`` where N is the
        module-level counter ``_READS_MADE`` (incremented on every call)
    """
    global _READS_MADE
    r = AlignedRead()
    r.qname = 'testread{}'.format(_READS_MADE)
    r.seq = seq
    if quals is None:
        quals = [22] * len(seq)
    # Encode the scores as an ASCII string with an offset of 33.
    r.qual = ''.join(chr(q + 33) for q in quals)
    r.cigarstring = cigar
    # Compare against None, not truthiness: 0 is a legitimate position and
    # must not be silently dropped.
    if pos is not None:
        r.pos = pos
    _READS_MADE += 1
    return r
Exemple #10
0
def _string_to_aligned_segment(line, seq_dict, log_output):
    """Converts SAM record in string format to pysam AlignedRead

    Args:
      line: String of SAM record
      seq_dict: Dictionary mapping reference ID to reference ID index
      log_output: Handle for outputting log information

    Returns:
      aligned_segment: pysam AlignedRead class with values from 'line'

    Raises:
      Exception: if an optional field has an unrecognised type code (the
        message is also written to 'log_output').
    """
    # NOTE(review): fields are split on any whitespace, so an optional
    # field whose value contains a space is broken apart -- confirm that
    # callers never pass such records.
    line = line.strip().split()
    aligned_segment = AlignedRead()
    aligned_segment.query_name = line[0]
    aligned_segment.flag = int(line[1])
    if line[2] != "*":
        aligned_segment.reference_id = seq_dict[line[2]]
        # SAM text positions are 1-based; the object attributes are 0-based
        aligned_segment.reference_start = int(line[3]) - 1
        aligned_segment.mapping_quality = int(line[4])

    # Parse the CIGAR string: digits accumulate into the operation length,
    # any other symbol closes the current operation.  "*" yields no tuples.
    cigartuples = []
    length = ""
    for symbol in line[5]:
        if symbol.isdigit():
            length += symbol
        elif symbol == "*":
            continue
        else:
            cigartuples.append((_CIGAR_OPERATIONS[symbol], int(length)))
            length = ""
    aligned_segment.cigartuples = cigartuples

    # "=" means the mate is on the same reference as this read
    if line[6] == "=":
        aligned_segment.next_reference_id = seq_dict[line[2]]
    elif line[6] != "*":
        aligned_segment.next_reference_id = seq_dict[line[6]]
    aligned_segment.next_reference_start = int(line[7]) - 1
    aligned_segment.template_length = int(line[8])
    # The sequence is assigned before the qualities
    aligned_segment.query_sequence = line[9]
    # A QUAL of "*" means that no qualities are stored; converting it would
    # produce a bogus one-element quality array instead.
    if line[10] != "*":
        aligned_segment.query_qualities = qualitystring_to_array(line[10])

    for field in line[11:]:
        tag, tag_type, val = field.split(":", maxsplit=2)
        if tag_type == "i":
            val = int(val)
        elif tag_type == "f":
            val = float(val)
        elif tag_type == "H":
            val = bytearray.fromhex(val)
        elif tag_type == "B":
            # Array values are written as "<subtype>,v1,v2,..." where the
            # subtype is a single letter; a bare "v1,v2,..." list (no
            # subtype letter) is still accepted and read as integers.
            items = val.split(",")
            subtype = items.pop(0) if items[0][:1].isalpha() else ""
            convert = float if subtype == "f" else int
            val = [convert(item) for item in items]
        elif not (tag_type == "A" or tag_type == "Z"):
            err_msg = "Optional Ttag type '{}' not recognised".format(tag_type)
            log_output.write("ERROR: {}\n".format(err_msg))
            raise Exception(err_msg)
        aligned_segment.set_tag(tag, val, value_type=tag_type)
    return aligned_segment
Exemple #11
0
def main():
    """Convert a PSL file of contig-to-reference alignments into a BAM file.

    Command line: ``<psl> <reference fasta> <contigs fasta> <output bam>``.

    Every PSL row after the ``------`` header separator is turned into one
    alignment record.  Aligned blocks are appended as CIGAR operation 0
    (matches/mismatches), gaps between blocks as operation 1 (insertion,
    contig-only bases) or 2 (deletion, reference-only bases), and unaligned
    contig ends as operation 4 (soft clip).  A record is written only if
    its longest deletion is at most 10000 bp, at least 200 positions match,
    and the matches make up at least 75 percent of the contig outside of
    insertions.

    Malformed rows trip an ``assert`` (AssertionError).
    """
    parser = OptionParser(usage=usage)
    (options, args) = parser.parse_args()
    if len(args) != 4:
        parser.print_help()
        sys.exit(1)
    psl_filename, ref_filename, contigs_filename, bam_filename = args

    references, ref_chromosomes = read_fasta(ref_filename)
    refname_to_id = {name: i for i, name in enumerate(ref_chromosomes)}
    print('Read',
          len(ref_chromosomes),
          'reference chromosomes:',
          ','.join(ref_chromosomes),
          file=sys.stderr)
    contigs, contig_names = read_fasta(contigs_filename)
    print('Read', len(contig_names), 'contigs.', file=sys.stderr)
    bam_header = {
        'HD': {'VN': '1.0'},
        'SQ': [{'LN': len(references[chromosome]), 'SN': chromosome}
               for chromosome in ref_chromosomes],
    }
    outfile = Samfile(bam_filename, 'wb', header=bam_header)
    # try/finally + with: both files are closed even if a row is rejected
    # by one of the asserts below.
    try:
        with open(psl_filename) as psl_file:
            header_read = False
            for line_nr, line in enumerate(psl_file, 1):
                line = line.strip()
                if line.startswith('------'):
                    header_read = True
                    continue
                # Skip everything before the separator, and blank lines
                # (e.g. a trailing empty line) after it.
                if not header_read or not line:
                    continue
                fields = line.split()
                assert len(fields) == 21, \
                    'Error reading PSL file, offending line: %d' % line_nr
                # The last three columns are comma-terminated lists:
                # block sizes, block starts in the contig, block starts
                # in the reference.
                sizes = [int(x) for x in fields[18].strip(',').split(',')]
                contig_starts = [int(x) for x in fields[19].strip(',').split(',')]
                ref_starts = [int(x) for x in fields[20].strip(',').split(',')]
                assert 0 < len(sizes) == len(contig_starts) == len(ref_starts)
                strand = fields[8]
                contig_name = fields[9]
                ref_name = fields[13]
                assert strand in ['-', '+']
                assert contig_name in contigs
                assert ref_name in references

                a = AlignedRead()
                a.qname = contig_name
                if strand == '+':
                    a.seq = str(contigs[contig_name])
                else:
                    a.seq = str(contigs[contig_name].reverse_complement())
                # 16 (0x10) marks a reverse-strand alignment, i.e. exactly
                # the rows whose sequence was reverse-complemented above.
                a.flag = (16 if strand == '-' else 0)
                a.rname = refname_to_id[ref_name]
                a.pos = ref_starts[0]
                a.mapq = 255
                seq_length = len(a.seq)

                qpos = contig_starts[0]
                refpos = ref_starts[0]
                cigar = []
                # soft-clipping at the start?
                if contig_starts[0] > 0:
                    cigar.append((4, contig_starts[0]))
                longest_deletion = 0
                total_matches = 0
                total_insertion = 0
                for length, contig_start, ref_start in zip(sizes, contig_starts,
                                                           ref_starts):
                    assert contig_start >= qpos
                    assert ref_start >= refpos
                    # insertion?
                    if contig_start > qpos:
                        insertion_length = contig_start - qpos
                        total_insertion += insertion_length
                        append_to_cigar(cigar, 1, insertion_length)
                        qpos = contig_start
                    # deletion?
                    if ref_start > refpos:
                        deletion_length = ref_start - refpos
                        longest_deletion = max(longest_deletion, deletion_length)
                        append_to_cigar(cigar, 2, deletion_length)
                        refpos = ref_start
                    # stretch of matches/mismatches
                    append_to_cigar(cigar, 0, length)
                    refpos += length
                    qpos += length
                    total_matches += length
                # soft-clipping at the end?
                if seq_length > qpos:
                    cigar.append((4, seq_length - qpos))
                a.cigar = tuple(cigar)

                # only use contigs where longest deletion is <= 10000 bp
                if longest_deletion > 10000:
                    continue
                # require at least 200 matching positions
                if total_matches < 200:
                    continue
                # require the matching positions to make up at least 75
                # percent of the contig (without counting parts of the
                # contig that are insertions).  Written as a product so the
                # result does not depend on "/" being true division.
                if total_matches < 0.75 * (seq_length - total_insertion):
                    continue
                outfile.write(a)
    finally:
        outfile.close()
Exemple #12
0
    def build_read(self, read_tags, is_forward):
        """Assemble an AlignedRead from this object's fields.

        ``read_tags`` is stored as the read's tags.  ``is_forward`` only
        matters when ``self.read_flags`` is None: it then selects between
        FORWARD_GOOD_READ and REVERSE_GOOD_READ as the flag.  Otherwise
        ``self.read_flags`` is used as the flag.  Read and mate are both
        assigned ``self.chrom_id`` as their reference.
        """
        # Decide the flag up front; it is applied last, after all other
        # attributes have been filled in.
        if self.read_flags is not None:
            flag = self.read_flags
        elif is_forward:
            flag = FORWARD_GOOD_READ
        else:
            flag = REVERSE_GOOD_READ

        record = AlignedRead()

        # this read
        record.seq = self.sequence.sequence_minus_deletions()
        record.rname = self.chrom_id
        record.pos = self.read_start
        record.mapq = self.mapping_quality
        record.cigarstring = self.sequence.cigar

        # its mate
        record.rnext = self.chrom_id
        record.pnext = self.read_mate_start
        record.tlen = self.insert_size

        record.qual = self.quality.ascii_quality
        record.tags = read_tags
        record.qname = self.read_id
        record.flag = flag

        return record
Exemple #13
0
def alignment_info_to_sam(seqrecord, aln_info, mate_id, mate_info, read_group,
                          is_first):
    """\
    Convert the internal alignment structure into an official SAM record. The reason to
    go immediately to SAM is for compatibility with other tools, and the ability to
    write out a file that can be easily visualized. A small annoyance is that
    downstream we will have to re-infer the location of mismatch/ins/del elements
    from the cigar string; but small price to pay.

    :param seqrecord: the read (a Bio.SeqRecord.SeqRecord object)
    :param aln_info: the alignment information, or None for an unmapped read
    :param mate_id: the mate for the read (not used by this function)
    :param mate_info: the alignment information of the mate, or None if the
        mate is unmapped
    :param read_group: the read group for this alignment
    :param is_first: true if this read is the first of the pair, false if
        it is the second

    :returns: a SAMRecord for the alignment
    :raises ValueError: if the cigar does not account for exactly the
        number of bases in the sequence

    """
    # Mapping state of the read and its mate, computed once and reused for
    # both the record attributes and the final flag value.
    is_unmapped = aln_info is None
    mate_is_unmapped = mate_info is None
    is_reverse = aln_info is not None and aln_info.reversed
    mate_is_reverse = mate_info is not None and mate_info.reversed
    is_second = not is_first

    samrecord = AlignedRead()
    samrecord.qname = seqrecord.id.rsplit(':', 1)[0]
    samrecord.seq = str(seqrecord.seq).upper()
    samrecord.is_unmapped = is_unmapped
    if aln_info:
        samrecord.mapq = 255  # TODO alignment quality?
        samrecord.pos = aln_info.offset
        samrecord.tags += [("NM", aln_info.mismatches), ("RG", read_group)]
        # TODO allow indels at some point
        samrecord.cigarstring = aln_info.cigar
        samrecord.rname, samrecord.tid = 0, 0  # TODO deal with multiple contigs
    else:
        samrecord.tags += [("RG", read_group)]
    if mate_info:
        samrecord.mpos = mate_info.offset
        samrecord.pnext = mate_info.offset
        samrecord.rnext = 0  # TODO deal with multiple contigs
        samrecord.mate_is_reverse = mate_info.reversed
    if aln_info and mate_info:
        # proper pair: reads are pointing at each other
        if aln_info.offset < mate_info.offset:
            samrecord.is_proper_pair = mate_info.reversed and not aln_info.reversed
        else:
            samrecord.is_proper_pair = aln_info.reversed and not mate_info.reversed
        # calculate insert
        first, second = (aln_info, mate_info) if is_first else (mate_info,
                                                                aln_info)
        samrecord.isize = first.offset - second.offset if first.reversed else second.offset - first.offset
    if is_reverse:
        # NOTE(review): the sequence is reversed but not complemented, and
        # the qualities assigned at the end are not reversed -- confirm
        # this matches what the aligner reports.
        samrecord.seq = samrecord.seq[::-1]
    # TODO allow unpaired reads (the 0x1 flag)
    # NOTE(review): this assignment replaces the whole flag, so 0x2 (proper
    # pair) is always set regardless of the is_proper_pair value computed
    # above -- confirm which of the two is intended.
    samrecord.flag = (0x1 | 0x2 | 0x4 * is_unmapped | 0x8 * mate_is_unmapped
                      | 0x10 * is_reverse | 0x20 * mate_is_reverse
                      | 0x40 * is_first | 0x80 * is_second)
    if samrecord.is_unmapped:
        samrecord.tid = -1
        samrecord.pos = -1
        samrecord.cigarstring = '*'
        samrecord.cigar = []
    else:
        if not samrecord.seq:
            # cleared by PySam (this can happen for certain cigar strings)
            samrecord.seq = str(seqrecord.seq).upper()
        try:
            # Every cigar operation except deletions (op 2) is counted
            # against the length of the read sequence.
            cigar_bases = sum(x[1] for x in samrecord.cigar if x[0] != 2)
            if cigar_bases != len(samrecord.seq):
                print('ERROR AT POSITION {}'.format(samrecord.pos))
                raise ValueError('Cigar {} does not fit sequence {}'.format(
                    samrecord.cigarstring, samrecord.seq))
        except TypeError:
            # Best effort: report the record and carry on.
            print('No seq in record: {}'.format(samrecord))
            print('WTF is goin on? \n{}'.format(seqrecord))
    # Encode the phred scores as an ASCII string with an offset of 33.
    samrecord.qual = ''.join([
        chr(q + 33) for q in seqrecord._per_letter_annotations['phred_quality']
    ])
    return samrecord
def main():
	"""Convert a PSL file of contig-to-reference alignments into a BAM file.

	Command line: ``<psl> <reference fasta> <contigs fasta> <output bam>``.

	Every PSL row after the ``------`` header separator is turned into one
	alignment record.  Aligned blocks are appended as CIGAR operation 0
	(matches/mismatches), gaps between blocks as operation 1 (insertion,
	contig-only bases) or 2 (deletion, reference-only bases), and unaligned
	contig ends as operation 4 (soft clip).  A record is written only if
	its longest deletion is at most 10000 bp, at least 200 positions match,
	and the matches make up at least 75 percent of the contig outside of
	insertions.

	Malformed rows trip an ``assert`` (AssertionError).
	"""
	parser = OptionParser(usage=usage)
	(options, args) = parser.parse_args()
	if len(args) != 4:
		parser.print_help()
		sys.exit(1)
	psl_filename, ref_filename, contigs_filename, bam_filename = args

	references, ref_chromosomes = read_fasta(ref_filename)
	refname_to_id = {name: i for i, name in enumerate(ref_chromosomes)}
	print('Read', len(ref_chromosomes), 'reference chromosomes:', ','.join(ref_chromosomes), file=sys.stderr)
	contigs, contig_names = read_fasta(contigs_filename)
	print('Read', len(contig_names), 'contigs.', file=sys.stderr)
	bam_header = {
		'HD': {'VN': '1.0'},
		'SQ': [{'LN': len(references[chromosome]), 'SN': chromosome} for chromosome in ref_chromosomes],
	}
	outfile = Samfile(bam_filename, 'wb', header=bam_header)
	# try/finally + with: both files are closed even if a row is rejected
	# by one of the asserts below.
	try:
		with open(psl_filename) as psl_file:
			header_read = False
			for line_nr, line in enumerate(psl_file, 1):
				line = line.strip()
				if line.startswith('------'):
					header_read = True
					continue
				# Skip everything before the separator, and blank lines
				# (e.g. a trailing empty line) after it.
				if not header_read or not line:
					continue
				fields = line.split()
				assert len(fields) == 21, 'Error reading PSL file, offending line: %d' % line_nr
				# The last three columns are comma-terminated lists:
				# block sizes, block starts in the contig, block starts
				# in the reference.
				sizes = [int(x) for x in fields[18].strip(',').split(',')]
				contig_starts = [int(x) for x in fields[19].strip(',').split(',')]
				ref_starts = [int(x) for x in fields[20].strip(',').split(',')]
				assert 0 < len(sizes) == len(contig_starts) == len(ref_starts)
				strand = fields[8]
				contig_name = fields[9]
				ref_name = fields[13]
				assert strand in ['-', '+']
				assert contig_name in contigs
				assert ref_name in references

				a = AlignedRead()
				a.qname = contig_name
				if strand == '+':
					a.seq = str(contigs[contig_name])
				else:
					a.seq = str(contigs[contig_name].reverse_complement())
				# 16 (0x10) marks a reverse-strand alignment, i.e. exactly
				# the rows whose sequence was reverse-complemented above.
				a.flag = (16 if strand == '-' else 0)
				a.rname = refname_to_id[ref_name]
				a.pos = ref_starts[0]
				a.mapq = 255
				seq_length = len(a.seq)

				qpos = contig_starts[0]
				refpos = ref_starts[0]
				cigar = []
				# soft-clipping at the start?
				if contig_starts[0] > 0:
					cigar.append((4, contig_starts[0]))
				longest_deletion = 0
				total_matches = 0
				total_insertion = 0
				for length, contig_start, ref_start in zip(sizes, contig_starts, ref_starts):
					assert contig_start >= qpos
					assert ref_start >= refpos
					# insertion?
					if contig_start > qpos:
						insertion_length = contig_start - qpos
						total_insertion += insertion_length
						append_to_cigar(cigar, 1, insertion_length)
						qpos = contig_start
					# deletion?
					if ref_start > refpos:
						deletion_length = ref_start - refpos
						longest_deletion = max(longest_deletion, deletion_length)
						append_to_cigar(cigar, 2, deletion_length)
						refpos = ref_start
					# stretch of matches/mismatches
					append_to_cigar(cigar, 0, length)
					refpos += length
					qpos += length
					total_matches += length
				# soft-clipping at the end?
				if seq_length > qpos:
					cigar.append((4, seq_length - qpos))
				a.cigar = tuple(cigar)

				# only use contigs where longest deletion is <= 10000 bp
				if longest_deletion > 10000:
					continue
				# require at least 200 matching positions
				if total_matches < 200:
					continue
				# require the matching positions to make up at least 75
				# percent of the contig (without counting parts of the
				# contig that are insertions).  Written as a product so the
				# result does not depend on "/" being true division.
				if total_matches < 0.75 * (seq_length - total_insertion):
					continue
				outfile.write(a)
	finally:
		outfile.close()