Example #1
0
 def filter_func(aln):
     # False means this read will be filtered out
     filter_bool = all(
         q >= quality_threshold for q in pysam.qualitystring_to_array(
             aln.get_tag(BAM_CONSTANTS["RAW_CELL_BC_QUALITY_TAG"])
         )) and all(q >= quality_threshold
                    for q in pysam.qualitystring_to_array(
                        aln.get_tag(BAM_CONSTANTS["UMI_QUALITY_TAG"])))
     nonlocal n_filtered
     n_filtered += not filter_bool
     return filter_bool
Example #2
0
def test_pysam():
  """Smoke-test pysam by writing a one-read BAM file and sorting it."""
  import pysam

  # Header and read values follow the "creating BAM files from scratch"
  # example in the pysam documentation:
  # https://pysam.readthedocs.io/en/latest/usage.html#creating-bam-cram-sam-files-from-scratch
  reference_entries = [{'LN': 1575, 'SN': 'chr1'},
                       {'LN': 1584, 'SN': 'chr2'}]
  header = {'HD': {'VN': '1.0'}, 'SQ': reference_entries}

  unsorted_path = "out.bam"
  sorted_path = "sorted.bam"

  with pysam.AlignmentFile(unsorted_path, "wb", header=header) as bam_out:
    read = pysam.AlignedSegment()
    read.query_name = "read_28833_29006_6945"
    read.query_sequence = "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG"
    read.flag = 99
    read.reference_id = 0
    read.reference_start = 32
    read.mapping_quality = 20
    read.cigar = ((0, 10), (2, 1), (0, 25))
    read.next_reference_id = 0
    read.next_reference_start = 199
    read.template_length = 167
    read.query_qualities = pysam.qualitystring_to_array(
        "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<")
    read.tags = (("NM", 1), ("RG", "L1"))
    bam_out.write(read)

  # The BAM must be on disk once the writer has been closed.
  assert os.path.isfile(unsorted_path)

  # Sorting goes through samtools, which fails on anything that is not a
  # valid BAM file, so a sorted output doubles as a validity check.
  pysam.sort("-o", sorted_path, unsorted_path)
  assert os.path.isfile(sorted_path)
Example #3
0
def convert_to_AlignedSegment(header, sequence, quality, barcode_sequence,
                              umi_sequence):
    """
    Build an unaligned pysam.AlignedSegment from one sequencing record.

    The barcode and UMI are stored as tags on the segment:
        Tag  Value
        "B0" barcode_sequence
        "B3" umi_sequence
    A read-group tag "RG" with the value '0' is always added as well.
    :param header: string with the header information
    :param sequence: string with the DNA/RNA sequence
    :param quality: string with the base calling quality values
    :param barcode_sequence: string with the barcode sequence
    :param umi_sequence: string with the unique molecular identifier sequence
    :return: the populated pysam.AlignedSegment
    """
    segment = pysam.AlignedSegment()

    # Only the first whitespace-delimited token of the header becomes the
    # read name, so the name never contains spaces.
    read_name = header.split()[0]
    segment.query_name = read_name
    segment.query_sequence = sequence
    segment.query_qualities = pysam.qualitystring_to_array(quality)

    # Switch on the "unmapped" bit while keeping whatever is already set.
    segment.flag = segment.flag | pysam.FUNMAP

    tag_values = (
        ('B0', barcode_sequence),
        ('B3', umi_sequence),
        ('RG', '0'),
    )
    for tag_name, tag_value in tag_values:
        segment.set_tag(tag_name, tag_value)

    return segment
Example #4
0
def unmapped_read(fq_read, sam_flag, chromium_tags, trim=None):
    """Build an unplaced pysam.AlignedSegment from a FASTQ record.

    :param fq_read: record exposing ``name``, ``sequence`` and ``quality``
    :param sam_flag: value assigned to the segment's SAM flag
    :param chromium_tags: tags assigned to the segment unchanged
    :param trim: when truthy, sequence and quality are cut to ``[0:trim]``;
        otherwise they are used in full
    :return: the populated pysam.AlignedSegment
    """
    if trim:
        bases = fq_read.sequence[0:trim]
        quality_text = fq_read.quality[0:trim]
    else:
        bases = fq_read.sequence
        quality_text = fq_read.quality

    segment = pysam.AlignedSegment()
    segment.query_name = fq_read.name
    segment.query_sequence = bases
    segment.query_qualities = pysam.qualitystring_to_array(quality_text)
    segment.flag = sam_flag

    # -1 leaves the reference and mate-reference fields unset.
    segment.reference_id = -1
    segment.reference_start = -1
    segment.next_reference_id = -1
    segment.next_reference_start = -1

    segment.tags = chromium_tags
    return segment
Example #5
0
 def generate_read(self, read_length, query_name, cb, ub):
     """Create a synthetic, fully matching read on a random chromosome.

     A chromosome index and then a start offset are drawn with
     ``np.random.randint``; the read sequence is the ``read_length`` items
     of that chromosome's stored sequence beginning at the offset.

     :param read_length: number of bases in the generated read
     :param query_name: name assigned to the read
     :param cb: value stored in the "CB" tag
     :param ub: value stored in the "UB" tag
     :return: the populated pysam.AlignedSegment
     """
     chrom_index = np.random.randint(len(self.chromosome2length))
     chrom_name, chrom_size = list(
         self.chromosome2length.items())[chrom_index]
     chrom_bases = self.chromosome2sequence[chrom_name]
     first_base = np.random.randint(0, chrom_size - read_length)
     last_base = first_base + read_length

     read = pysam.AlignedSegment()
     read.query_name = query_name
     read.query_sequence = ''.join(chrom_bases[first_base:last_base])
     # Flag value copied from the pysam example without further analysis.
     read.flag = 99
     read.reference_id = chrom_index
     read.reference_start = first_base
     read.mapping_quality = 255
     # One match operation spanning the whole read.
     read.cigar = ((0, read_length), )
     # Mate fields (next_reference_id / next_reference_start) are left unset.
     read.template_length = read_length
     read.query_qualities = pysam.qualitystring_to_array("<" * read_length)

     # A real aligner would also report the mutation count versus the
     # reference; here the alignment score is just read_length - 2.
     alignment_score = read_length - 2
     read.tags = (
         ("NM", 1),
         ("RG", "L1"),
         ("NH", 1),
         ("AS", alignment_score),
         ("CB", cb),
         ("UB", ub),
     )
     return read
Example #6
0
def print_as_BAM(linked, header, path):
    """Write each intron of every linked group as its own BAM record.

    All introns of group ``n`` share the query name ``'linked<n>'``. Each
    record points at the start of the next intron in the position-sorted
    group; the last one points back at the first and carries the negated
    template length.

    :param linked: iterable of intron groups; each intron unpacks as
        ``(chrom, start, end, strand)``
    :param header: header passed to ``pysam.AlignmentFile``
    :param path: output BAM path
    """
    with pysam.AlignmentFile(path, 'wb', header=header) as bam_out:
        for group_index, group in enumerate(linked):
            ordered = sort_by_pos(group)
            intron_count = len(ordered)
            last_index = intron_count - 1

            # Span from the first intron's start to the last intron's end;
            # a group without a partner intron gets a template length of 0.
            if intron_count > 1:
                span = ordered[-1][2] - ordered[0][1] + 1
            else:
                span = 0

            for index, (chrom, start, end, strand) in enumerate(ordered):
                length = end - start + 1
                if index < last_index:
                    mate_start = ordered[index + 1][1]
                    template_length = span
                else:
                    # Final intron: wrap around to the first one and flip
                    # the sign of the template length.
                    mate_start = ordered[0][1]
                    template_length = -span

                record = pysam.AlignedSegment()
                record.query_name = 'linked' + str(group_index)
                record.query_sequence = 'N' * length
                record.flag = 0
                record.reference_id = chrom
                record.reference_start = start
                record.mapping_quality = 60  # 60 = uniquely mapped for HISAT2
                record.cigartuples = [(0, length)]
                record.next_reference_id = chrom
                record.next_reference_start = mate_start
                record.template_length = template_length
                record.query_qualities = pysam.qualitystring_to_array(
                    '/' * length)
                record.tags = [('XN', mate_start + 1),
                               ('XI', intron_count)]
                bam_out.write(record)
Example #7
0
def compose_aln(x):
    """Build an unaligned read 2 record from a barcode matching result.

    Parameters
    ----------
    x : tuple or list
        Seven items: read name, read 1 sequence, read 1 quality,
        read 2 sequence, read 2 quality, matched cell barcode and the
        match distance (the output of 'match_cell_barcodes').

    Returns
    -------
    AlignedSegment
        Unaligned read 2 carrying the read 1 sequence, cell barcode and
        distance as the tags R1, CB and CM, plus the read group 'fba'.
    """

    (read_name, read1_seq, read1_qual,
     read2_seq, read2_qual, barcode, distance) = x

    segment = pysam.AlignedSegment()
    # Keep only the part of the name before the first space.
    segment.query_name = read_name.split(' ')[0]
    segment.flag = 0x4
    segment.template_length = len(read2_seq)

    segment.query_sequence = read2_seq
    segment.query_qualities = pysam.qualitystring_to_array(read2_qual)

    segment.tags = [
        ('RG', 'fba'),
        ('R1', read1_seq),
        ('CB', barcode),
        ('CM', distance),
    ]

    return segment
def convert_to_AlignedSegment(header, sequence, quality,
                              barcode_sequence, umi_sequence):
    """
    Turn one sequencing record into an unaligned pysam.AlignedSegment.

    Barcode and UMI travel with the read as tags:
        Tag  Value
        "B0" barcode_sequence
        "B3" umi_sequence
    The read group tag "RG" is always set to '0'.
    :param header: string with the header information
    :param sequence: string with the DNA/RNA sequence
    :param quality: string with the base calling quality values
    :param barcode_sequence: string with the barcode sequence
    :param umi_sequence: string with the unique molecular identifier sequence
    :return: the populated pysam.AlignedSegment
    """
    record = pysam.AlignedSegment()

    # The read name is the header up to its first whitespace, so it can
    # never contain spaces.
    name_token = header.split()[0]
    record.query_name = name_token
    record.query_sequence = sequence
    record.query_qualities = pysam.qualitystring_to_array(quality)

    # Add the "unmapped" bit on top of the flag bits already present.
    record.flag = record.flag | pysam.FUNMAP

    for tag, value in (('B0', barcode_sequence),
                       ('B3', umi_sequence),
                       ('RG', '0')):
        record.set_tag(tag, value)

    return record
Example #9
0
File: seq.py Project: xtmgah/wub
def quality_string_to_array(quality_string):
    """Decode a quality string via ``pysam.qualitystring_to_array``.

    :param quality_string: Quality string.
    :returns: Whatever pysam returns for the string (an array of scores).
    :rtype: array
    """
    scores = pysam.qualitystring_to_array(quality_string)
    return scores
Example #10
0
def test_bamread_get_quals(simple_bam_reads):
    """bamread_get_quals yields stored qualities, or the OQ tag on request."""
    bamread = simple_bam_reads[0]
    # Seventeen '(' characters; the test expects them to decode to 7 each.
    bamread.set_tag('OQ', '(' * 17)

    stored_quals = np.array(
        pysam.qualitystring_to_array('==99=?<*+/5:@A99:'))
    assert np.array_equal(read.bamread_get_quals(bamread), stored_quals)

    original_quals = np.array([7] * 17)
    assert np.array_equal(
        read.bamread_get_quals(bamread, use_oq=True), original_quals)
Example #11
0
def createBam(chromosome, positions, bamFile, readSize, outFile):
    """Write a BAM file of synthetic reads, one per entry in ``positions``.

    The output header declares a single reference, ``chromosome``, whose
    length is looked up from ``bamFile``. One random sequence, one quality
    string and one mapping quality are drawn up front and shared by every
    read; the strand flag and the integer tags are drawn per read.

    :param chromosome: reference name used in the output header
    :param positions: iterable of values assigned to ``reference_start``
    :param bamFile: existing BAM used only to look up the chromosome length
    :param readSize: length of the generated sequence and quality string
    :param outFile: path of the BAM file to write
    """
    chromInfo = bamReader.getChromosomeInfromationFromBAM(bamFile)
    header = {
        'HD': {
            'VN': '1.0'
        },
        'SQ': [{
            'LN': chromInfo[chromosome],
            'SN': chromosome
        }]
    }
    # Every read is a single full-length match.
    CIGAR = str(readSize) + 'M'
    DNA = ['A', 'T', 'G', 'C']
    qualityScores = [
        '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'
    ]
    flags = [0, 16]
    MAPQs = [3, 8, 23, 24, 40, 42]

    # Drawn once: all reads share the same bases, qualities and MAPQ.
    seq = ''.join(np.random.choice(DNA, readSize).tolist())
    qual = ''.join(np.random.choice(qualityScores, readSize).tolist())
    mapq = random.choice(MAPQs)

    with py.AlignmentFile(outFile, "wb", header=header) as outf:
        for counter, pos in enumerate(positions):
            a = py.AlignedSegment()
            a.query_name = "randomlyGenerated_CK_" + str(counter)
            a.query_sequence = seq
            a.flag = random.choice(flags)
            a.reference_id = 0
            a.reference_start = pos
            a.mapping_quality = mapq
            a.cigarstring = CIGAR
            a.next_reference_id = 0
            a.next_reference_start = 0
            a.template_length = 0
            a.query_qualities = py.qualitystring_to_array(qual)
            # NOTE(review): MD is written as an integer here; the SAM spec
            # defines MD as a string (Z) tag - confirm consumers accept it.
            a.tags = (("XN", random.choice(range(0, 15))),
                      ("XM", random.choice(range(0, 20))),
                      ("XO", random.choice(range(0, 3))),
                      ("XG", random.choice(range(0, 9))),
                      ("NM", random.choice(range(0, 15))), ("MD", readSize),
                      ("YT", "UU"))
            outf.write(a)
 def write_to_bam(self, f):
     """Write this read to the open alignment file ``f``.

     When ``is_split_read`` equals False a single record covering the
     whole read is written. Otherwise one record is written per entry of
     ``self.splits`` that yields a non-empty sequence slice; those records
     are named ``<prefix>_<letter>_Count=<suffix>`` with letters starting
     at 'a'.

     :param f: open pysam alignment file; ``f.references`` must contain
         ``self.contig``
     """
     if self.is_split_read == False:
         whole = pysam.AlignedSegment()
         whole.query_name = self.name
         whole.query_sequence = self.seq
         whole.flag = 0
         whole.reference_id = f.references.index(self.contig)
         whole.reference_start = self.start_pos
         whole.mapping_quality = 60
         whole.cigar = self.get_cigar()
         whole.query_qualities = pysam.qualitystring_to_array(self.qual)
         whole.tags = (("NM", self.nmtag), ("RG", "L1"))
         f.write(whole)
         return

     pieces_written = 0
     for split in self.splits:
         piece = pysam.AlignedSegment()
         # A tuple split gives explicit (start, end) reference positions;
         # anything else is a start position running to the end of the read.
         if type(split) is tuple:
             lo = split[0] - self.start_pos
             hi = split[1] - self.start_pos
             piece.reference_start = split[0]
         else:
             lo = split - self.start_pos
             hi = len(self.seq)
             piece.reference_start = split

         piece.query_sequence = self.seq[lo:hi]
         if not piece.query_sequence:
             # Empty slice: nothing to write for this split.
             continue

         name_parts = self.name.split('_Count=')
         letter = chr(pieces_written + 97)
         piece.query_name = (name_parts[0] + '_' + letter
                             + '_Count=' + name_parts[1])
         pieces_written += 1

         piece.flag = 0
         piece.reference_id = f.references.index(self.contig)
         piece.mapping_quality = 60

         # Widen the CIGAR window by one for each deletion ('2') inside it.
         deletions = self.cigarstring[lo:hi].count('2')
         cigar_window = self.cigarstring[lo:hi + deletions]
         # Run-length encode the per-base operation codes.
         piece.cigar = tuple(
             (int(op), len(list(run))) for op, run in groupby(cigar_window))

         piece.query_qualities = pysam.qualitystring_to_array(
             self.qual)[lo:hi]
         piece.tags = (("NM", self.nmtag), ("RG", "L1"))
         f.write(piece)
 def write_to_bam(self, f):
     """Write this read as a single record to the open alignment file ``f``.

     :param f: open pysam alignment file; ``f.references`` must contain
         ``self.contig``
     """
     contig_index = f.references.index(self.contig)

     record = pysam.AlignedSegment()
     record.query_name = self.name
     record.query_sequence = self.seq
     record.flag = 0
     record.reference_id = contig_index
     record.reference_start = self.start_pos
     record.mapping_quality = 60
     record.cigar = self.get_cigar()
     record.query_qualities = pysam.qualitystring_to_array(self.qual)
     record.tags = (("NM", self.nmtag), ("RG", "L1"))
     f.write(record)
Example #14
0
    def iter_sam_record(self, reference_ids, tags=None):
        """Yield one pysam.AlignedSegment per record from ``iter_raw()``.

        A record whose subject end precedes its subject start is treated as
        a reverse-strand hit: both aligned strings are reverse-complemented
        and the record is flagged 16. Unaligned query ends are added to the
        CIGAR as hard clips. Every record gets a fixed mapping quality of
        30 and a constant base quality of '+'.

        :param reference_ids: mapping from subject name (``rec.sname``) to
            reference id
        :param tags: currently unused
        :return: generator of pysam.AlignedSegment
        """
        mapq = 30
        bq = '+'

        # TODO secondary alignment flag
        for rec in self.iter_raw():
            is_reverse = int(rec.send) < int(rec.sstart)
            # NOTE(review): the left/right clip sides below look swapped
            # between the two branches relative to the usual SAM
            # orientation - confirm against aln2cigar/dna_revcomp.
            if is_reverse:
                qaln = dna_revcomp(rec.qseq)
                saln = dna_revcomp(rec.sseq)
                ref_start = int(rec.send) - 1
                lclip = int(rec.qstart) - 1
                # Was ``qend - qend`` (always 0); mirror the forward branch.
                rclip = int(rec.qlen) - int(rec.qend)
            else:
                qaln = rec.qseq
                saln = rec.sseq
                ref_start = int(rec.sstart) - 1
                lclip = int(rec.qlen) - int(rec.qend)
                rclip = int(rec.qstart) - 1
            cigar = aln2cigar(qaln, saln)
            # Gap characters are alignment padding, not query bases.
            qseq = qaln.replace('-', '')
            if lclip > 0:
                cigar.prepend((cigar.H, lclip))
            if rclip > 0:
                cigar.append((cigar.H, rclip))

            try:
                edit = int(rec.mismatches) + int(rec.gaps)
            except Exception:
                # Best effort: missing or malformed counts become -1.
                edit = -1

            a = pysam.AlignedSegment()
            a.query_name = rec.qname.encode('ascii')
            a.query_sequence = qseq.encode('ascii')
            a.reference_id = reference_ids[rec.sname]
            a.flag = (16 if is_reverse else 0)
            a.reference_start = ref_start
            a.mapping_quality = mapq
            a.cigar = cigar.values
            a.next_reference_id = -1
            a.next_reference_start = -1
            try:
                a.template_length = int(rec.slen)
            except Exception:
                # Best effort: leave the template length unset when the
                # subject length is missing or unusable.
                pass
            a.query_qualities = pysam.qualitystring_to_array(bq * len(qseq))
            a.tags = [
                ("AS", float(rec.bit_score)),  # alignment score
                ("NM", edit),  # edit distance
                ("ZE", float(rec.evalue)),  # E-value
            ]
            yield a
Example #15
0
def _string_to_aligned_segment(line, seq_dict, log_output):
    """Converts SAM record in string format to pysam AlignedRead

    Args:
      line: String of SAM record
      seq_dict: Dictionary mapping reference ID to reference ID index
      log_output: Handle for outputting log information
    Returns:
      aligned_segment: pysam AlignedRead class with values from 'line'
    Raises:
      ValueError: If an optional field has an unrecognised type code.
    """
    # NOTE(review): splitting on any whitespace also splits tag values that
    # contain spaces; SAM records are tab-delimited - confirm the inputs.
    fields = line.strip().split()
    aligned_segment = AlignedRead()
    aligned_segment.query_name = fields[0]
    aligned_segment.flag = int(fields[1])
    if fields[2] != "*":
        aligned_segment.reference_id = seq_dict[fields[2]]
        aligned_segment.reference_start = int(fields[3]) - 1
        aligned_segment.mapping_quality = int(fields[4])

    # Parse the CIGAR string: digits accumulate into a length that is
    # consumed by the operation letter following them; "*" means no CIGAR.
    cigartuples = []
    length_digits = ""
    for symbol in fields[5]:
        if symbol.isdigit():
            length_digits += symbol
        elif symbol == "*":
            continue
        else:
            cigartuples.append(
                (_CIGAR_OPERATIONS[symbol], int(length_digits)))
            length_digits = ""
    aligned_segment.cigartuples = cigartuples

    # "=" means the mate maps to the same reference as this read.
    if fields[6] == "=":
        aligned_segment.next_reference_id = seq_dict[fields[2]]
    elif fields[6] != "*":
        aligned_segment.next_reference_id = seq_dict[fields[6]]
    aligned_segment.next_reference_start = int(fields[7]) - 1
    aligned_segment.template_length = int(fields[8])
    aligned_segment.query_sequence = fields[9]
    # A QUAL of "*" means qualities are not stored; decoding it would give
    # a one-element array that does not match the sequence length.
    if fields[10] != "*":
        aligned_segment.query_qualities = qualitystring_to_array(fields[10])

    for field in fields[11::]:
        tag, tag_type, val = field.split(":", maxsplit=2)
        if tag_type == "i":
            val = int(val)
        elif tag_type == "f":
            val = float(val)
        elif tag_type == "H":
            val = bytearray.fromhex(val)
        elif tag_type == "B":
            items = val.split(",")
            # Per the SAM spec a B array starts with a subtype letter
            # (one of cCsSiIf) ahead of the numbers; drop it before parsing.
            subtype = ""
            if items and len(items[0]) == 1 and items[0] in "cCsSiIf":
                subtype = items.pop(0)
            convert = float if subtype == "f" else int
            # NOTE(review): confirm set_tag accepts value_type "B" with a
            # plain list of numbers.
            val = [convert(i) for i in items]
        elif not (tag_type == "A" or tag_type == "Z"):
            err_msg = "Optional tag type '{}' not recognised".format(tag_type)
            log_output.write("ERROR: {}\n".format(err_msg))
            raise ValueError(err_msg)
        aligned_segment.set_tag(tag, val, value_type=tag_type)
    return aligned_segment
Example #16
0
    def corrected_reads(self, **kwargs):
        """Yield a consensus-corrected copy of every read in the alignment.

        For each fetched read, every position that is not one of the
        covarying sites (after multiple-testing correction) is overwritten
        with the consensus base. When ``self.end_correction`` is set, the
        first ``end_correction`` reference positions and everything past
        ``reference_length - end_correction`` are overwritten with
        consensus as well.

        Each yielded read is a new pysam.AlignedSegment with a single
        full-length match CIGAR, constant '<' qualities and reference id 0;
        name, flag, start, mapping quality, mate fields, template length
        and tags are copied from the source read.

        :param kwargs: accepted but unused
        :return: generator of pysam.AlignedSegment
        """
        end_correction = self.end_correction
        nucleotide_counts = self.get_nucleotide_counts()
        self.full_covariation_test()
        covarying_sites = self.multiple_testing_correction()
        if end_correction:
            tail_cutoff = self.reference_length - end_correction

        for read in self.pysam_alignment.fetch():
            sequence, _ = self.read_count_data(read)
            intraread_covarying_sites = covarying_sites[
                (covarying_sites >= read.reference_start) &
                (covarying_sites < read.reference_end)
            ]
            # True marks positions to overwrite with consensus. ``np.bool``
            # was removed in NumPy 1.24, so the builtin ``bool`` is used.
            mask = np.ones(len(sequence), bool)
            mask[intraread_covarying_sites - read.reference_start] = False
            local_consensus = nucleotide_counts.consensus[
                read.reference_start: read.reference_end
            ]
            sequence[mask] = local_consensus[mask]

            if end_correction:
                # Head of the reference: force consensus on the overlap.
                if read.reference_start < end_correction:
                    query_index = end_correction - read.reference_start
                    query_correction = nucleotide_counts.consensus[
                        read.reference_start: end_correction
                    ]
                    sequence[0: query_index] = query_correction
                # Tail of the reference: same treatment for the read's end.
                if read.reference_end > tail_cutoff:
                    correction_length = read.reference_end - tail_cutoff
                    query_correction = nucleotide_counts.consensus[
                        tail_cutoff: tail_cutoff + correction_length
                    ]
                    sequence[-correction_length:] = query_correction

            corrected_read = pysam.AlignedSegment()
            corrected_read.query_name = read.query_name
            corrected_read.query_sequence = ''.join(sequence)
            corrected_read.flag = read.flag
            corrected_read.reference_id = 0
            corrected_read.reference_start = read.reference_start
            corrected_read.mapping_quality = read.mapping_quality
            corrected_read.cigar = [(0, len(sequence))]
            corrected_read.next_reference_id = read.next_reference_id
            corrected_read.next_reference_start = read.next_reference_start
            corrected_read.template_length = read.template_length
            corrected_read.query_qualities = pysam.qualitystring_to_array(
                len(sequence) * '<'
            )
            corrected_read.tags = read.tags
            yield corrected_read
    def test_get_aligned_pairs_padding(self):
        """A CIGAR with a padding operation (code 6) is not supported."""
        read = pysam.AlignedSegment()
        read.query_name = "read_12345"
        read.query_sequence = "ACGT" * 10
        read.flag = 0
        read.reference_id = 0
        read.reference_start = 20
        read.mapping_quality = 20
        read.cigartuples = ((7, 20), (6, 1), (8, 19))
        read.query_qualities = pysam.qualitystring_to_array("1234") * 10

        # Padding is not handled yet, so asking for pairs must raise.
        self.assertRaises(NotImplementedError, read.get_aligned_pairs)
Example #18
0
def test_readdata_from_bamread(simple_bam_reads):
    """ReadData.from_bamread honours OQ, read groups and reverse strand."""
    bamread = simple_bam_reads[0]
    stored_quals = np.array(
        pysam.qualitystring_to_array('==99=?<*+/5:@A99:'))

    # Plain read: stored qualities, no read group.
    data = read.ReadData.from_bamread(bamread)
    assert np.array_equal(data.qual, stored_quals)
    assert data.rg is None

    # With OQ and RG tags set, use_oq switches to the OQ qualities.
    bamread.set_tag('OQ', '(' * 17)
    bamread.set_tag('RG', 'foo')
    data = read.ReadData.from_bamread(bamread, use_oq=True)
    assert np.array_equal(data.qual, np.array([7] * 17))
    assert data.rg == 'foo'

    # The read group None registered in conftest.py holds int 0.
    assert read.ReadData.rg_to_int[None] == 0
    assert read.ReadData.rg_to_int['foo'] == 1
    assert read.ReadData.numrgs == 2

    # Reverse-strand read: qualities come back flipped.
    bamread.is_reverse = True
    data = read.ReadData.from_bamread(bamread)
    assert np.array_equal(data.qual, np.flip(stored_quals))
    assert np.array_equal(data.seq, np.array(list('CAGTATCCTTTATCTAA')))

    # Reset the class-level registries touched above.
    read.ReadData.rg_to_pu = dict()
    read.ReadData.rg_to_int = dict()
    read.ReadData.numrgs = 0
 def test_get_aligned_pairs_match_mismatch(self):
     """CIGAR codes 7 and 8 both pair query with reference positions."""
     read = pysam.AlignedSegment()
     read.query_name = "read_12345"
     read.query_sequence = "ACGT" * 10
     read.flag = 0
     read.reference_id = 0
     read.reference_start = 20
     read.mapping_quality = 20
     read.cigartuples = ((7, 20), (8, 20))
     read.query_qualities = pysam.qualitystring_to_array("1234") * 10

     # All 40 bases are aligned, starting at reference position 20.
     expected_pairs = list(zip(range(0, 40), range(20, 60)))
     self.assertEqual(read.get_aligned_pairs(), expected_pairs)
     self.assertEqual(read.get_aligned_pairs(True), expected_pairs)
Example #20
0
    def build_read(self):
        '''Return a 40-base example read with one deletion and one insertion.'''

        bases = "ACGT" * 10
        qualities = pysam.qualitystring_to_array("1234") * 10

        read = pysam.AlignedSegment()
        read.query_name = "read_12345"
        read.query_sequence = bases
        read.flag = 0
        read.reference_id = 0
        read.reference_start = 20
        read.mapping_quality = 20
        read.cigartuples = ((0, 10), (2, 1), (0, 9), (1, 1), (0, 20))
        read.next_reference_id = 0
        read.next_reference_start = 200
        read.template_length = 167
        # Qualities are assigned after the sequence so they are retained.
        read.query_qualities = qualities
        return read
Example #21
0
    def testUpdate2(self):
        '''issue 135: in-place update of sequence and quality scores.

        Assigning a new sequence erases the quality scores, so they have
        to be saved beforehand and re-assigned afterwards.
        '''
        window = slice(5, 10)

        # Replacing only the sequence leaves the read without qualities.
        read = self.buildRead()
        read.query_sequence = read.query_sequence[window]
        self.assertEqual(
            pysam.qualities_to_qualitystring(read.query_qualities), None)

        # Saving the qualities first lets them be restored for the slice.
        read = self.buildRead()
        saved = pysam.qualities_to_qualitystring(read.query_qualities)
        read.query_sequence = read.query_sequence[window]
        read.query_qualities = pysam.qualitystring_to_array(saved[window])

        self.assertEqual(
            pysam.qualities_to_qualitystring(read.query_qualities),
            saved[window])
Example #22
0
    def testUpdate2(self):
        '''issue 135: updating sequence and quality scores in place.

        Setting the sequence wipes the quality scores; they must be
        captured first and written back after the sequence changes.
        '''
        start, stop = 5, 10

        # Sequence-only update: the qualities are gone afterwards.
        segment = self.buildRead()
        segment.query_sequence = segment.query_sequence[start:stop]
        remaining = pysam.qualities_to_qualitystring(segment.query_qualities)
        self.assertEqual(remaining, None)

        # Capture, update the sequence, then restore the matching slice.
        segment = self.buildRead()
        quality_text = pysam.qualities_to_qualitystring(
            segment.query_qualities)
        segment.query_sequence = segment.query_sequence[start:stop]
        segment.query_qualities = pysam.qualitystring_to_array(
            quality_text[start:stop])

        restored = pysam.qualities_to_qualitystring(segment.query_qualities)
        self.assertEqual(restored, quality_text[start:stop])
Example #23
0
    def buildRead(self):
        '''Return a 40-base example read with one deletion and one insertion.'''

        segment = pysam.AlignedSegment()
        segment.query_name = "read_12345"
        segment.query_sequence = "ACGT" * 10
        segment.flag = 0
        segment.reference_id = 0
        segment.reference_start = 20
        segment.mapping_quality = 20
        segment.cigartuples = (
            (0, 10),
            (2, 1),
            (0, 9),
            (1, 1),
            (0, 20),
        )
        segment.next_reference_id = 0
        segment.next_reference_start = 200
        segment.template_length = 167
        quality_unit = pysam.qualitystring_to_array("1234")
        segment.query_qualities = quality_unit * 10
        # todo: create tags
        return segment
 def test_get_aligned_pairs_hard_clipping(self):
     """Hard-clip operations (code 5) contribute no aligned pairs."""
     read = pysam.AlignedSegment()
     read.query_name = "read_12345"
     read.query_sequence = "ACGT" * 10
     read.flag = 0
     read.reference_id = 0
     read.reference_start = 20
     read.mapping_quality = 20
     read.cigartuples = ((5, 2), (0, 35), (5, 3))
     read.query_qualities = pysam.qualitystring_to_array("1234") * 10

     # Hard-clipped bases carry no sequence, hence no query positions:
     # only the 35 matched bases show up, starting at query index 0.
     matched_pairs = list(zip(range(0, 35), range(20, 55)))
     self.assertEqual(read.get_aligned_pairs(), matched_pairs)
     self.assertEqual(read.get_aligned_pairs(True), matched_pairs)
Example #25
0
    def testLargeRead(self):
        '''Build and return an 800-base example read with a single match.'''

        repeats = 200

        read = pysam.AlignedSegment()
        read.query_name = "read_12345"
        read.query_sequence = "ACGT" * repeats
        read.flag = 0
        read.reference_id = 0
        read.reference_start = 20
        read.mapping_quality = 20
        # One match operation covering every base of the read.
        read.cigartuples = ((0, 4 * repeats), )
        read.next_reference_id = 0
        read.next_reference_start = 200
        read.template_length = 167
        read.query_qualities = (
            pysam.qualitystring_to_array("1234") * repeats)

        return read
Example #26
0
    def build_read(self):
        '''Return an example read that is not attached to any header.'''

        bases = "ATGC" * 10
        qualities = pysam.qualitystring_to_array("1234") * 10

        read = pysam.AlignedSegment()
        read.query_name = "read_12345"
        read.query_sequence = bases
        read.flag = 0
        # No header information: the reference id is left at -1.
        read.reference_id = -1
        read.reference_start = 20
        read.mapping_quality = 20
        read.cigartuples = ((0, 10), (2, 1), (0, 9), (1, 1), (0, 20))
        read.next_reference_id = 0
        read.next_reference_start = 200
        read.template_length = 167
        read.query_qualities = qualities
        # todo: create tags
        return read
Example #27
0
    def testLargeRead(self):
        '''Build and return an 800-base example read without a reference.'''

        repeats = 200

        segment = pysam.AlignedSegment()
        segment.query_name = "read_12345"
        segment.query_sequence = "ATGC" * repeats
        segment.flag = 0
        segment.reference_id = -1
        segment.reference_start = 20
        segment.mapping_quality = 20
        # A single match operation spans the whole read.
        segment.cigartuples = ((0, 4 * repeats), )
        segment.next_reference_id = 0
        segment.next_reference_start = 200
        segment.template_length = 167
        segment.query_qualities = (
            pysam.qualitystring_to_array("1234") * repeats)

        return segment
Example #28
0
 def _write_header_to_sam(self, header, sf_header_sam_file):
     """Write ``header`` plus one placeholder read to a SAM file.

     :param header: header passed to ``pysam.AlignmentFile``
     :param sf_header_sam_file: path of the SAM file to create
     """
     placeholder_quals = pysam.qualitystring_to_array(
         "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<")

     # Mode "w" writes text SAM; the new header goes out with the file.
     with pysam.AlignmentFile(sf_header_sam_file, "w",
                              header=header) as sam_out:
         record = pysam.AlignedSegment()
         record.query_name = "read_28833_29006_6945_tmp"
         record.query_sequence = "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG"
         record.flag = 99
         record.reference_id = 0
         record.reference_start = 32
         record.mapping_quality = 20
         record.cigar = ((0, 10), (2, 1), (0, 25))
         record.next_reference_id = 0
         record.next_reference_start = 199
         record.template_length = 167
         record.query_qualities = placeholder_quals
         sam_out.write(record)
Example #29
0
    def build_read(self):
        '''Return an example read bound to a two-reference header.'''

        reference_names = ["chr1", "chr2"]
        reference_lengths = [10000000, 10000000]
        header = pysam.AlignmentHeader.from_references(reference_names,
                                                       reference_lengths)

        read = pysam.AlignedSegment(header)
        read.query_name = "read_12345"
        read.query_sequence = "ATGC" * 10
        read.flag = 0
        read.reference_id = 0
        read.reference_start = 20
        read.mapping_quality = 20
        read.cigartuples = ((0, 10), (2, 1), (0, 9), (1, 1), (0, 20))
        read.next_reference_id = 0
        read.next_reference_start = 200
        read.template_length = 167
        quality_unit = pysam.qualitystring_to_array("1234")
        read.query_qualities = quality_unit * 10
        return read
Example #30
0
    def build_read(self):
        '''Return an example read created against a chr1/chr2 header.'''

        header = pysam.AlignmentHeader.from_references(
            ["chr1", "chr2"], [10000000, 10000000])

        bases = "ATGC" * 10
        qualities = pysam.qualitystring_to_array("1234") * 10

        segment = pysam.AlignedSegment(header)
        segment.query_name = "read_12345"
        segment.query_sequence = bases
        segment.flag = 0
        segment.reference_id = 0
        segment.reference_start = 20
        segment.mapping_quality = 20
        segment.cigartuples = (
            (0, 10),
            (2, 1),
            (0, 9),
            (1, 1),
            (0, 20),
        )
        segment.next_reference_id = 0
        segment.next_reference_start = 200
        segment.template_length = 167
        segment.query_qualities = qualities
        return segment
 def test_get_aligned_pairs_skip(self):
     """A skipped region (code 3) yields reference-only pairs."""
     read = pysam.AlignedSegment()
     read.query_name = "read_12345"
     read.query_sequence = "ACGT" * 10
     read.flag = 0
     read.reference_id = 0
     read.reference_start = 20
     read.mapping_quality = 20
     read.cigartuples = ((0, 2), (3, 100), (0, 38))
     read.query_qualities = pysam.qualitystring_to_array("1234") * 10

     # Two matched bases, a 100-base skip, then the remaining 38 bases.
     leading = [(0, 20), (1, 21)]
     skipped = [(None, ref_pos) for ref_pos in range(22, 22 + 100)]
     trailing = list(zip(range(2, 2 + 38),
                         range(20 + 2 + 100, 20 + 2 + 100 + 38)))

     self.assertEqual(read.get_aligned_pairs(),
                      leading + skipped + trailing)
     # With matches_only the reference-only pairs are left out.
     self.assertEqual(read.get_aligned_pairs(True), leading + trailing)
 def test_get_aligned_pairs_soft_clipping(self):
     """Soft-clipped bases (code 4) pair with None unless matches_only."""
     read = pysam.AlignedSegment()
     read.query_name = "read_12345"
     read.query_sequence = "ACGT" * 10
     read.flag = 0
     read.reference_id = 0
     read.reference_start = 20
     read.mapping_quality = 20
     read.cigartuples = ((4, 2), (0, 35), (4, 3))
     read.query_qualities = pysam.qualitystring_to_array("1234") * 10

     # Two clipped bases up front, 35 matches, three clipped at the end.
     clipped_head = [(0, None), (1, None)]
     matched = list(zip(range(2, 2 + 35), range(20, 20 + 35)))
     clipped_tail = [(37, None), (38, None), (39, None)]

     self.assertEqual(read.get_aligned_pairs(),
                      clipped_head + matched + clipped_tail)
     # matches_only drops the clipped positions at both ends.
     self.assertEqual(read.get_aligned_pairs(True), matched)
Example #33
0
def test_realign_rc(genome_source):
    """Realign a sequence fetched with strand "-" and check the result.

    The single alignment returned must be a reverse-strand "21M" hit
    spanning reference positions 30-51.  After qualities are attached
    to the read and it is aligned again, the qualities of the alignment
    must come back in reversed order.
    """
    import warnings

    qs = "<<<<<<<:<9/,&,22;;<<<"

    read = pysam.AlignedSegment()
    read.query_sequence = genome_source.get_seq("chr1", 30, 50, "-")

    alns = genome_source.align(Alignment(read))
    assert len(alns) == 1

    hit = alns[0]
    assert hit.cigarstring == "21M"
    assert hit.reference_start == 30
    assert hit.reference_end == 51
    assert hit.is_reverse

    # Second pass: same read, now carrying base qualities.
    read.query_qualities = pysam.qualitystring_to_array(qs)
    alns = genome_source.align(Alignment(read))

    with warnings.catch_warnings():
        # this is a python 2/3 incompatibility I think, where the warning
        # indicates array.tostring() is deprecated but array.tobytes()
        # only exists in py3
        warnings.simplefilter("ignore")
        realigned_quals = pysam.qualities_to_qualitystring(
            alns[0].query_qualities)
        assert realigned_quals == qs[::-1]
Example #34
0
def add_snv(read, pos, base):
    """Return a new segment equal to ``read`` with ``base`` placed at ``pos``.

    The base at offset ``pos - read.reference_start`` of the read sequence
    is replaced by ``base`` and its quality character is set to 'A'.  The
    MD tag is rebuilt through ``md_list``/``md_from_list`` and the NM tag
    is incremented when ``md_list`` reports the mismatch as new.  All other
    fields and tags are copied from ``read``.

    NOTE(review): the offset is plain reference arithmetic; it presumably
    assumes no insertions, deletions or clipping before ``pos`` - confirm
    against callers.
    """
    offset = pos - read.reference_start

    # Add snv to seq and set high base quality score
    qual_ = read.qual[:offset] + 'A' + read.qual[offset + 1:]
    seq_ = read.seq[:offset] + base + read.seq[offset + 1:]

    # Calculate tag MD
    md_list_, is_new = md_list(read.get_tag('MD'), offset, read)
    MD_ = md_from_list(md_list_)

    # Calculate tag NM
    NM_ = read.get_tag('NM') + (1 if is_new else 0)

    # Create modified read
    modified = ps.AlignedSegment()
    modified.query_name = read.query_name
    modified.query_sequence = seq_
    modified.flag = read.flag
    modified.reference_id = read.reference_id
    modified.reference_start = read.reference_start
    modified.mapping_quality = read.mapping_quality
    modified.cigar = read.cigar
    modified.next_reference_id = read.next_reference_id
    modified.next_reference_start = read.next_reference_start
    modified.template_length = read.template_length
    modified.query_qualities = ps.qualitystring_to_array(qual_)
    modified.tags = read.get_tags()
    modified.set_tag('MD', MD_, 'Z')
    modified.set_tag('NM', NM_, 'i')

    return modified
Example #35
0
    def make_bam_segment(
        self,
        qname=None,
        flag=0,
        rname=0,
        pos=0,
        mapq=20,
        cigar=None,
        rnext=0,
        pnext=0,
        tlen=0,
        seq=None,
        qual=None,
        tags=None,
        **kwargs,
    ):
        """
        Build and return a pysam.AlignedSegment object.

        The parameters mirror the SAM record fields (qname, flag, rname,
        pos, mapq, cigar, rnext, pnext, tlen, seq, qual) plus the optional
        ``tags`` mapping.

        Defaults are filled in where possible: a random ``read-NNNN`` name,
        a flag computed by ``self.get_flag_value(**kwargs)`` when ``flag``
        is falsy, and a generated sequence / quality array whose length is
        derived from the CIGAR. Because of that derivation the caller
        should at least provide ``cigar``.

        """
        if qname is None:
            qname = "read-{}".format(random.randrange(1000, 9999))
        if not flag:
            flag = self.get_flag_value(**kwargs)

        segment = pysam.AlignedSegment()
        segment.query_name = qname
        segment.flag = flag
        segment.reference_id = rname
        segment.reference_start = pos
        segment.mapping_quality = mapq
        segment.cigar = cigar

        # Mate information.
        segment.next_reference_id = rnext
        segment.next_reference_start = pnext
        segment.template_length = tlen

        # Only these CIGAR operations contribute to the read length.
        query_operations = (0, 1, 4, 7, 8)
        length = sum(
            size for (operation, size) in segment.cigartuples
            if operation in query_operations
        )

        if seq is None:
            seq = FastaTestCaseMixin.make_fasta_sequence(size=length,
                                                         include_n=False)
        segment.query_sequence = seq

        if qual is None:
            quality_string = FastqTestCaseMixin.make_quality_scores(
                size=length)
            qual = pysam.qualitystring_to_array(quality_string)
        segment.query_qualities = qual

        if tags is not None:
            segment.tags = tags.items()

        return segment
Example #36
0
def make_aligned_segment(data, rnd_seed=None):
    """
    Return pysam.AlignedSegment() object with the data in `data`.

    A SAM record has 11 mandatory tab-separated fields, followed by an
    optional TAGS field for additional info::

        QNAME FLAG RNAME POS MAPQ CIGAR RNEXT PNEXT TLEN SEQ QUAL TAGS

    Since only some fields are required for our purposes, the input
    parameter `data` should be a tuple with the following content:

    (qname, flag, refname, pos, mapq, cigar, tags)

        * qname - query name
        * flag - bitwise flag, detailed description in [1]
        * refname - index of the reference sequence in header
        * pos - starting position of read in reference sequence
        * mapq - mapping quality
        * cigar - CIGAR values, detailed description in [1]
        * tags - mapping with additional information

    Flag is a bitwise value. For detailed explanation, see [1], but for our
    needs, we only need values 4 (is_unmapped) and 16 (reverse_strand).

    The mate fields (RNEXT, PNEXT) and TLEN are all set to 0, since we are
    only interested in single-end reads. SEQ and QUAL are generated with
    `make_sequence` and `make_quality_scores`; their size is determined by
    the cigar value.

    Example of input parameter `data`:

        data = ('name123', 20, 0, 30, 20, [(0, 90), (2, 5), (0, 5)], {'NH': 5})

    Read more about SAM file specifications here:
    [1] https://samtools.github.io/hts-specs/SAMv1.pdf

    Parameters
    ----------
    data : tuple
        Input data for AlignedSegment.
    rnd_seed : optional
        Seed forwarded to the sequence and quality generators.

    Returns
    -------
    pysam.AlignedSegment
        AlignedSegment, filled with given data.

    """
    # pylint: disable=no-member
    segment = pysam.AlignedSegment()

    # The first six items of `data` map one-to-one onto segment attributes.
    leading_fields = (
        "query_name",
        "flag",
        "reference_id",
        "reference_start",
        "mapping_quality",
        "cigar",
    )
    for index, attribute in enumerate(leading_fields):
        setattr(segment, attribute, data[index])

    for attribute in ("next_reference_id", "next_reference_start",
                      "template_length"):
        setattr(segment, attribute, 0)

    # Only these CIGAR operations contribute to the read length.
    query_operations = (0, 1, 4, 7, 8)
    length = sum(
        size for (operation, size) in segment.cigar
        if operation in query_operations
    )

    segment.query_sequence = make_sequence(
        size=length, include_n=True, rnd_seed=rnd_seed)
    quality_string = make_quality_scores(size=length, rnd_seed=rnd_seed)
    segment.query_qualities = pysam.qualitystring_to_array(quality_string)

    segment.tags = data[6].items()

    return segment
Example #37
0
def _requeue_unmapped(unmapped_info, results):
    """Put every read of ``unmapped_info`` on ``results`` as ``(name, sequence)``."""
    for unmapped_name in unmapped_info:
        results.put((unmapped_name, unmapped_info[unmapped_name][0]))


def rescue_reads(tasks, results, parser_result):
    """
    Worker loop that tries to rescue unmapped reads on a small target genome.

    Items are taken from ``tasks`` until a ``None`` sentinel is received.
    Each item is a tuple
    ``(unmapped_info, ref_id, start, is_spliced, genome_seq)``.  The reads
    are written out next to a target genome file and aligned against it:
    with the configured aligner (after building a dedicated index) when
    ``is_spliced`` is true, with ``blastn`` otherwise.

    For every primary mapped alignment found, a 12-item tuple
    ``(query_name, flag, ref_id, new_start, mapping_quality, cigarstring,
    next_reference_id, next_reference_start, template_length,
    query_sequence, qualities, tags)`` is put on ``results``; ``new_start``
    is ``start`` plus the alignment start on the target genome.  When index
    building, alignment or blastn fails, the reads are handed back on
    ``results`` as ``(name, sequence)`` tuples instead.

    :param tasks: queue-like object providing ``get()`` and ``task_done()``
    :param results: queue-like object providing ``put()``
    :param parser_result: settings object; several of its attributes are
        overwritten here before calling the index builder and the aligner
    """
    aligner = parser_result.aligner.lower()
    output_dir = parser_result.output_dir

    while True:
        item = tasks.get()

        if item is None:
            tasks.task_done()
            break

        unmapped_info, ref_id, start, is_spliced, genome_seq = item

        rescue_tmp_dir = output_dir + "/rescue_tmp"
        random_prefix = random_string(10)
        temp_dir = "%s/%s_temp" % (rescue_tmp_dir, random_prefix)
        random_output_prefix = "%s/%s" % (rescue_tmp_dir, random_prefix)
        target_sam_file = None

        unmapped_read_file, target_genome_file, star_index_num = \
            make_unmapped_read_target_genome(unmapped_info, ref_id, genome_seq, random_output_prefix, is_spliced)

        if is_spliced:
            # Rebuilds aligner index with target genome file
            if aligner == "star":
                parser_result.builder_extra_args = "--genomeSAindexNbases {star_index_num} " \
                                                   "--outTmpDir {temp_dir}".format(star_index_num=star_index_num,
                                                                                   temp_dir=temp_dir)
            parser_result.genome_file = target_genome_file
            parser_result.output_dir = rescue_tmp_dir
            parser_result.prefix = random_prefix
            parser_result.quiet = True
            parser_result.threads = 1
            try:
                target_genome_index = build_aligner_index.build_index(
                    parser_result)
            except RuntimeError:
                _requeue_unmapped(unmapped_info, results)
                tasks.task_done()
                continue

            # Aligns unmapped read to target genome
            if aligner == "star":
                parser_result.aligner_extra_args = "--outTmpDir %s" % temp_dir
            else:
                parser_result.aligner_extra_args = None
            parser_result.input = [unmapped_read_file]
            parser_result.genome_index = target_genome_index
            if target_genome_index is not None:
                try:
                    target_sam_file = run_aligner.run_aligner(parser_result)
                except RuntimeError:
                    _requeue_unmapped(unmapped_info, results)
                    tasks.task_done()
                    continue
        else:
            target_sam_file = "%s.sam" % random_output_prefix
            # The argument list is passed as-is (no shell, no re-splitting),
            # so paths containing spaces or quotes stay intact.
            command = [
                "blastn",
                "-query", unmapped_read_file,
                "-subject", target_genome_file,
                "-task", "megablast",
                "-perc_identity", str(parser_result.blast_identity),
                "-qcov_hsp_perc", str(parser_result.blast_query_coverage),
                "-outfmt", "17 SQ SR",
                "-out", target_sam_file,
                "-parse_deflines",
            ]

            tool_process = Popen(command, stdout=PIPE, stderr=PIPE)
            _, tool_err = tool_process.communicate()

            if tool_process.returncode != 0 or "[Errno" in tool_err.decode(
                    "utf8").strip():
                _requeue_unmapped(unmapped_info, results)
                tasks.task_done()
                continue

        # target_sam_file is still None when no index was built; in that
        # case there is nothing to parse (os.path.exists(None) would raise).
        if target_sam_file is not None and os.path.exists(
                target_sam_file) and os.path.getsize(target_sam_file) != 0:
            # Checks for target genome results
            with pysam.AlignmentFile(target_sam_file) as f:
                for r in f:
                    if r.is_unmapped or r.is_secondary or r.is_supplementary:
                        continue

                    new_start = start + r.reference_start
                    cigarstring = r.cigarstring

                    # Sizes of the hard clips at either end of the CIGAR.
                    first_hard_clip = re.match(r"(\d+)H", cigarstring)
                    first_bp = int(
                        first_hard_clip.group(1)) if first_hard_clip else None
                    last_hard_clip = re.search(r"(\d+)H$", cigarstring)
                    last_bp = int(
                        last_hard_clip.group(1)) if last_hard_clip else None

                    unmapped_qual = unmapped_info[r.query_name][1]
                    if r.is_reverse:
                        unmapped_qual = unmapped_qual[::-1]
                    new_qualities = pysam.qualitystring_to_array(
                        unmapped_qual)

                    # Drop the qualities of the hard-clipped bases so the
                    # array covers the same bases as the record's sequence.
                    if first_bp is not None:
                        new_qualities = new_qualities[first_bp:]

                    if last_bp is not None:
                        last_bp = len(new_qualities) - last_bp
                        new_qualities = new_qualities[:last_bp]

                    results.put((r.query_name, r.flag, ref_id, new_start,
                                 r.mapping_quality, cigarstring,
                                 r.next_reference_id,
                                 r.next_reference_start, r.template_length,
                                 r.query_sequence, new_qualities, r.tags))

        # Removes useless files and directories
        os.remove(unmapped_read_file)
        os.remove(target_genome_file)
        if os.path.exists("%s.sam" % random_output_prefix):
            os.remove("%s.sam" % random_output_prefix)
        elif os.path.exists("%s.bam" % random_output_prefix):
            os.remove("%s.bam" % random_output_prefix)

        if is_spliced and aligner == "star":
            leftover_files = (".Aligned.out.sam", ".Log.final.out",
                              ".Log.out", ".Log.progress.out", ".SJ.out.tab")
            for suffix in leftover_files:
                leftover = random_output_prefix + suffix
                if os.path.exists(leftover):
                    os.remove(leftover)

            for suffix in ("_star", "_temp"):
                leftover = random_output_prefix + suffix
                if os.path.exists(leftover):
                    shutil.rmtree(leftover)

        tasks.task_done()
Example #38
0
# Create a new read
a = pysam.AlignedSegment()

# Assign values to each attribute
# NOTE(review): the CIGAR below describes 35 query bases while the sequence
# and quality strings hold 10 - confirm which of the two is intended.
a.query_name = "read_28833_29006_6945"
a.query_sequence = "AGCTTAGCTA"
a.flag = 99
a.reference_id = 0
a.reference_start = 32
a.mapping_quality = 20
a.cigar = ((0, 10), (2, 1), (0, 25))
a.next_reference_id = 0
a.next_reference_start = 199
a.template_length = 167
a.query_qualities = pysam.qualitystring_to_array("<<<<<<<AAA")
a.tags = (("NM", 1), ("RG", "L1"))

# Pileup over two regions of an existing BAM file.
# NOTE(review): `bam` is not defined in this snippet; it has to name the
# input BAM file before this line runs.
bamfile = pysam.AlignmentFile(bam, "rb")
for pileupcolumn in bamfile.pileup('chr22', 16958180, 16958190):
    print("\ncoverage at base %s = %s" % (pileupcolumn.pos, pileupcolumn.n))

for pileupcolumn in bamfile.pileup('chr22', 16958160, 16958170):
    print("\nBases at position %s = " % (pileupcolumn.pos))
    for pileupread in pileupcolumn.pileups:
        # query position is None if is_del or is_refskip is set.
        if not pileupread.is_del and not pileupread.is_refskip:
            print("%s" % (pileupread.alignment.query_sequence[pileupread.query_position]))
	
	
Example #39
0
# Writing SAM/BAM files - the header is described like a dictionary
header = {'HD': {'VN': '1.0'}, 'SQ': [{'LN': 1575, 'SN': 'chr1'}, {'LN': 1584, 'SN': 'chr2'}]}
with pysam.AlignmentFile("out.bam", "wb", header=header) as outf:
    a = pysam.AlignedSegment()
    a.query_name = "read_28833_29006_6945"
    a.query_sequence = "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG"
    a.flag = 99
    a.reference_id = 0
    a.reference_start = 32
    a.mapping_quality = 20
    a.cigar = ((0, 10), (2, 1), (0, 25))
    a.next_reference_id = 0
    a.next_reference_start = 199
    a.template_length = 167
    a.query_qualities = pysam.qualitystring_to_array("<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<")
    a.tags = (("NM", 1), ("RG", "L1"))
    outf.write(a)

# Using samtools commands from Python (csamtools) - allows for more investigation
pysam.sort("-o", "output.bam", "ex1.bam")
# The same operation on the command line (shell, not Python):
#   samtools sort -o output.bam ex1.bam

# Pysam works with Tabix indexed files (BED-user defined/GFF/GTF)
# Tabix is generic indexer for TAB-delimited genome position files, gives random access to compressed files

tbx = pysam.TabixFile("example.bed.gz")  # tbx object of class TabixFile

for row in tbx.fetch("chr11", 100, 200, parser=pysam.asBed()):  # pysam.asGTF() or pysam.asTuple()*
    print("name is", row.name)  # if BED or GTF parser used, fields are accessible by name
def _append_cigar_segment(cigar_tuples, operation, count):
    """Append ``(operation, count)`` to ``cigar_tuples``, skipping empty segments."""
    if count > 0:
        cigar_tuples.append((operation, count))


def call_consensus(family_bam: str,
                   new_read_name: str = None,
                   temp_sorted_filename: str = None,
                   max_depth: int = 10000,
                   calling_method: str = 'posterior') -> pysam.AlignedSegment:
    """
    Call a consensus read from a read family file.

    The family file is sorted into ``temp_sorted_filename`` and indexed,
    then piled up column by column.  Each column is passed to ``call_base``
    and contributes either a called base (match segment) or a skip; gaps
    between consecutive pileup positions are recorded as skips as well.
    The reference id and tags of the consensus are copied from the first
    read of the sorted file.

    :param family_bam: name of file containing the family reads
    :param new_read_name: name of new read (required despite the default)
    :param temp_sorted_filename: name of temporary file in which to store
        the sorted family reads (required despite the default)
    :param max_depth: max depth parameter for pileup
    :param calling_method: method for calling individual bases
    :return: consensus read
    :raises AssertionError: if ``new_read_name`` or
        ``temp_sorted_filename`` is ``None``
    """

    assert temp_sorted_filename is not None
    assert new_read_name is not None

    # sort and index the family file
    pysam.sort(family_bam, "-o", temp_sorted_filename)
    pysam.index(temp_sorted_filename)

    with pysam.AlignmentFile(temp_sorted_filename, "rb") as family_file:
        first_read = next(family_file)
        reference_id = first_read.reference_id
        tags = first_read.get_tags(with_value_type=True)

    new_read_sequence_list = []
    new_read_quality_list = []
    new_read_cigar_tuple_list = []

    # The segment currently being extended; it is flushed to the CIGAR list
    # whenever the operation changes.  It starts out as an empty match.
    cigar_last = BAM_CMATCH
    cigar_last_count = 0

    last_pileup_position = None

    first_pileup_position = None
    with pysam.AlignmentFile(temp_sorted_filename, "rb") as family_file:
        for pileup_column in family_file.pileup(stepper="nofilter",
                                                max_depth=max_depth,
                                                min_base_quality=0):
            pos = pileup_column.pos

            if first_pileup_position is None:
                first_pileup_position = pos
                last_pileup_position = pos - 1

            position_delta = pos - last_pileup_position
            if position_delta > 1:
                # We have a gap
                if cigar_last == BAM_CREF_SKIP:
                    # If we are already in a gap extend it
                    cigar_last_count += position_delta - 1
                else:
                    # If we are not in a gap close the previous segment and start a new
                    _append_cigar_segment(new_read_cigar_tuple_list,
                                          cigar_last, cigar_last_count)
                    cigar_last = BAM_CREF_SKIP
                    cigar_last_count = position_delta - 1

            called_base, called_quality, called_cigar = call_base(
                query_sequences=pileup_column.get_query_sequences(),
                query_qualities=pileup_column.get_query_qualities(),
                calling_method=calling_method)

            if called_cigar == BAM_CREF_SKIP:
                # No base could be called and we have a single skip
                if cigar_last == BAM_CREF_SKIP:
                    # If we are already in a skip extend it
                    cigar_last_count += 1
                else:
                    # otherwise close previous segment and start a skip;
                    # the initial empty match segment is not emitted
                    _append_cigar_segment(new_read_cigar_tuple_list,
                                          cigar_last, cigar_last_count)
                    cigar_last = BAM_CREF_SKIP
                    cigar_last_count = 1
            else:
                # we have a base call
                new_read_sequence_list.append(called_base)
                new_read_quality_list.append(called_quality)

                if cigar_last == BAM_CMATCH:
                    cigar_last_count += 1
                else:
                    _append_cigar_segment(new_read_cigar_tuple_list,
                                          cigar_last, cigar_last_count)
                    cigar_last = BAM_CMATCH
                    cigar_last_count = 1

            # update the last position for which we got a pileup
            last_pileup_position = pos

        # append the final cigar tuple
        _append_cigar_segment(new_read_cigar_tuple_list,
                              cigar_last, cigar_last_count)

    # Construct new AlignedSegment
    quality_string = ''.join(new_read_quality_list)
    quality_array = pysam.qualitystring_to_array(quality_string)

    new_read = pysam.AlignedSegment()
    new_read.query_name = new_read_name
    new_read.query_sequence = ''.join(new_read_sequence_list)
    new_read.flag = 0
    new_read.reference_id = reference_id
    new_read.reference_start = first_pileup_position
    new_read.mapping_quality = 255
    new_read.cigartuples = new_read_cigar_tuple_list
    new_read.query_qualities = quality_array
    new_read.tags = tags

    return new_read
Example #41
0
if __name__ == "__main__":
    # Demo: build two overlapping 20 bp reads on reference 0 and print the
    # first one.

    genome = "G" * 100
    genome_start = 3

    # First read: 20 x "A", starting at position 10.
    qseq = "A" * 20
    a = pysam.AlignedSegment()
    a.query_name = "read1"
    a.query_sequence = qseq
    a.flag = 0
    a.reference_id = 0
    a.reference_start = 10
    a.mapping_quality = 20
    a.cigarstring = "%dM" % len(qseq)
    a.query_qualities = pysam.qualitystring_to_array(len(qseq) * "<")
    a.tags = (("NM", 1), ("RG", "L1"))

    # Second read: 20 x "T", starting at position 15.
    qseq = "T" * 20
    b = pysam.AlignedSegment()
    b.query_name = "read2"
    b.query_sequence = qseq
    b.flag = 0
    b.reference_id = 0
    b.reference_start = 15
    b.mapping_quality = 20
    b.cigarstring = "%dM" % len(qseq)
    b.query_qualities = pysam.qualitystring_to_array(len(qseq) * "<")
    b.tags = (("NM", 1), ("RG", "L1"))

    print("readA: " + str(a))
def consensusMaker(groupedReadsList, cutoff, readLength, flag):
    '''Build a consensus read from a group of (sequence, quality, ...) reads.

    * One read: the read itself is returned.
    * Two reads: at every position the base with the higher (or equal, then
      the first read's) quality is taken, together with its quality char.
    * Otherwise: a simple "majority rules" vote is held at each of the
      readLength positions.  The first of T, C, G, A whose share of the
      counted reads exceeds cutoff wins; if none does, the position is
      undefined and an 'N' is placed there.  All qualities are set to 'J'.

    Disagreements between the reads are tallied in three position windows
    (up to 100, up to 200, beyond 200) plus a total; for flags 83 and 147
    the position is counted as readLength - i.  The tallies are turned into
    percentages by dividing by window constant * group size; a zero
    denominator (readLength == 200 or an empty group) yields 0.0.

    Returns (consensusRead, consensusReadQ, groupSize,
    [errRate, groupSize]) where errRate holds the four percentages.
    '''
    nucKeyDict = {0: 'T', 1: 'C', 2: 'G', 3: 'A', 4: 'N'}
    nucIndex = {'T': 0, 'C': 1, 'G': 2, 'A': 3}  # anything else counts as N
    dif = [0, 0, 0, 0]  # dif for 0-100 100-200 200-end total
    l1 = 100
    l2 = 200
    l3 = readLength - l2

    groupLen = len(groupedReadsList)

    def windowIndex(i):
        # Window of position i, mirrored for reverse-strand flags.
        m = readLength - i if flag in (83, 147) else i
        if m <= l1:
            return 0
        if m <= l2:
            return 1
        return 2

    def percentage(count, span):
        # Guard against an empty window / empty group instead of crashing.
        denominator = float(span * groupLen)
        if denominator == 0:
            return 0.0
        return 100 * float(count) / denominator

    if groupLen == 1:
        # if only one read, return itself
        consensusRead = groupedReadsList[0][0]
        consensusReadQ = groupedReadsList[0][1]
    elif groupLen == 2:
        # if two reads, return each site with higher quality score
        seq1, qual1 = groupedReadsList[0][0], groupedReadsList[0][1]
        seq2, qual2 = groupedReadsList[1][0], groupedReadsList[1][1]
        qArray1 = pysam.qualitystring_to_array(qual1)
        qArray2 = pysam.qualitystring_to_array(qual2)
        bases = []
        quals = []
        for i in range(len(qArray1)):
            if qArray1[i] >= qArray2[i]:
                bases.append(seq1[i])
                quals.append(qual1[i])
            else:
                bases.append(seq2[i])
                quals.append(qual2[i])

            if seq1[i] != seq2[i]:
                dif[windowIndex(i)] += 1
                dif[3] += 1
        consensusRead = ''.join(bases)
        consensusReadQ = ''.join(quals)
    else:
        bases = []
        for i in range(readLength):
            # Count the types of nucleotides at position i, in the order
            # T, C, G, A, N.
            counts = [0, 0, 0, 0, 0]
            total = 0
            for read in groupedReadsList:
                try:
                    nucleotide = read[0][i]
                except IndexError:
                    # A read shorter than i + 1 ends the count for this
                    # position (later reads are not looked at).
                    break
                counts[nucIndex.get(nucleotide, 4)] += 1
                total += 1

            major = 4  # 'N' unless one of T, C, G, A exceeds the cutoff
            if total:
                for j in range(4):
                    if float(counts[j]) / float(total) > cutoff:
                        major = j
                        break
            bases.append(nucKeyDict[major])

            # difference for each point in reads
            mismatches = total - counts[major]
            dif[windowIndex(i)] += mismatches
            dif[3] += mismatches
        consensusRead = ''.join(bases)
        consensusReadQ = "J" * len(consensusRead)

    errRate = [
        percentage(dif[0], l1),
        percentage(dif[1], l2),
        percentage(dif[2], l3),
        percentage(dif[3], readLength),
    ]

    return consensusRead, consensusReadQ, groupLen, [errRate, groupLen]
Example #43
0
def create_bam(sample,
               files_in1,
               files_in2,
               ref_fasta,
               probes_dict,
               output,
               has_trimmed_primers=True,
               debug=False):
    """
    Create a BAM file with reads placed at their expected locations, adjusted through pairwise alignment to the target sequences.

    This will give reasonable results as long as probes capture the exact target sequences, but will generate
    alignments with many mismatches if there are any discrepancies.

    Args:
        sample: Sample name. Used as the read group ID/SM in the BAM header
            and as the ``RG`` tag on every written read.
        files_in1: Sequence of FASTQ paths for the first read of each pair.
            Paths ending in ``.gz`` are opened with ``gzip.open``.
        files_in2: Sequence of FASTQ paths for the second read of each pair;
            must have the same length as ``files_in1``.
        ref_fasta: Path of the reference FASTA, opened through
            ``pyfaidx.Faidx`` with ``rebuild=False``.
        probes_dict: Mapping of column name to a per-probe lookup. The columns
            read here are ``chr``, ``strand`` and either ``target_start_0`` /
            ``target_end`` / ``target_length`` (primers trimmed) or
            ``probe_start_0`` / ``probe_end`` / ``capture_size`` (untrimmed).
        output: Path of the BAM file to write.
        has_trimmed_primers: If true, reads are aligned against the target
            region; otherwise against the full probe region (marked as
            untested in the original code).
        debug: If true, print intermediate values, stop after a handful of
            read pairs per file and write only the first read of each pair.

    Raises:
        AssertionError: If the input lists differ in length, the two reads of
            a pair carry different names, the fetched reference sequence has
            an unexpected length, or ``align_and_find_cigar`` asserts.
        Exception: If a probe refers to a chromosome missing from the
            reference, or has a strand other than ``+`` / ``-``.
    """
    assert len(files_in1) == len(files_in2)

    tStart = time.time()
    counters = collections.Counter()

    ref_idx = pyfaidx.Faidx(ref_fasta, rebuild=False)
    bam_header = {
        'HD': {
            'VN': '1.0'
        },
        'SQ': [{
            'LN': record.rlen,
            'SN': name
        } for name, record in ref_idx.index.items()],
        'RG': [{
            'ID': sample,
            'SM': sample
        }],
        'PG': [{
            'ID': __title__,
            'PN': __title__,
            'VN': __version__
        }],
    }

    # Reference names -> numeric reference ids, in the same order as the
    # 'SQ' header entries above.
    chr_indices = {
        chrom: index
        for index, chrom in enumerate(ref_idx.index.keys())
    }

    with pysam.AlignmentFile(output, "wb", header=bam_header) as pairedreads:
        for ixfile, (file1, file2) in enumerate(zip(files_in1, files_in2)):
            log.info('Processing %s and %s (#%d)', file1, file2, ixfile)

            counters['files'] += 1
            # Pick the opener for each file separately so that a gzipped
            # file can be paired with an uncompressed one.
            opener1 = gzip.open if file1.endswith('.gz') else open
            opener2 = gzip.open if file2.endswith('.gz') else open
            with opener1(file1, 'rt') as hdl1, opener2(file2, 'rt') as hdl2:
                # NOTE: zip() stops at the shorter input, so surplus records
                # in the longer FASTQ file are ignored.
                for read_pair in zip(
                        Bio.SeqIO.QualityIO.FastqGeneralIterator(hdl1),
                        Bio.SeqIO.QualityIO.FastqGeneralIterator(hdl2)):
                    counters['pairs_total'] += 1
                    if counters['pairs_total'] % 50000 == 0:
                        elapsed = time.time() - tStart
                        log.info(
                            "processed %d pairs - %.f sec elapsed, %.4f sec/pair, %.1f pairs/sec",
                            counters['pairs_total'],
                            elapsed,
                            elapsed / counters['pairs_total'],
                            counters['pairs_total'] / elapsed)

                    if debug and counters['pairs_total'] > 10:
                        print('DEBUG - stopping after ',
                              counters['pairs_total'])
                        break

                    # extract and parse read name (everything before the
                    # first tab of the FASTQ title line)
                    read_names_original = [
                        read_pair[0][0].split('\t')[0],
                        read_pair[1][0].split('\t')[0],
                    ]
                    assert len(read_names_original[0]) > 0
                    assert read_names_original[0] == read_names_original[1]
                    read_name, read_probe, read_umi = parse_extended_read_name(
                        read_names_original[0])

                    probe_chr = probes_dict['chr'][read_probe]
                    if probe_chr not in chr_indices:
                        raise Exception(
                            'Probe {} is associated with chromosome {}, but this entry does not exist in the reference fasta file!'
                            .format(read_probe, probe_chr))
                    probe_chr_index = chr_indices[probe_chr]

                    read_lens = [
                        len(read_pair[read_number][1])
                        for read_number in range(2)
                    ]

                    # untested: if we haven't trimmed off the primers then we need to start aligning from the primer start location!
                    if has_trimmed_primers:
                        probe_start = int(
                            probes_dict['target_start_0'][read_probe] + 1)
                        probe_end = int(probes_dict['target_end'][read_probe])
                    else:
                        probe_start = int(
                            probes_dict['probe_start_0'][read_probe] + 1)
                        probe_end = int(probes_dict['probe_end'][read_probe])

                    # Initial guess for the read starts; these are replaced
                    # below once the pairwise alignment has been computed.
                    probe_strand = probes_dict['strand'][read_probe]
                    if probe_strand == '+':
                        read_starts = [
                            probe_start, probe_end - read_lens[1] + 1
                        ]
                        read_reverse = [False, True]
                    elif probe_strand == '-':
                        read_starts = [
                            probe_end - read_lens[0] + 1, probe_start
                        ]
                        read_reverse = [True, False]
                    else:
                        raise Exception(
                            'Unexpected strand for probe {}'.format(
                                read_probe))

                    # NOTE: this SHOULD BE one-based based on documentation
                    # but actually seem to be ZERO-based -- at least the sequence we get for PRRX1-Ex1
                    # starts CGGA but should start GGA; ends TTC but should end TTCT if we just use probe_start and probe_end
                    # they are always in genomic sense
                    probe_target_sequence = str(
                        ref_idx.fetch(probe_chr, probe_start,
                                      probe_end)).upper()

                    # sanity check that we got the right sequence
                    if has_trimmed_primers:
                        assert len(probe_target_sequence) == probes_dict[
                            'target_length'][read_probe]
                    else:
                        assert len(probe_target_sequence
                                   ) == probes_dict['capture_size'][read_probe]

                    if debug:
                        print(read_name, read_probe, read_umi)
                        print(probe_chr, probe_chr_index, probe_start,
                              probe_end)
                        print(read_starts)
                        print(read_reverse)
                        print(probe_target_sequence)
                        print(read_pair)

                    try:
                        # pre-process alignments to make sure the mate starts are actually correct
                        read_cigars = []
                        read_sequences = []
                        read_tags_for_pysam = []
                        for read_number in range(2):
                            # copy over our custom tags from FASTQ file;
                            # split at most twice so that a tag value may
                            # itself contain ':' characters
                            read_tags = [
                                tag.split(':', 2) for tag in
                                read_pair[read_number][0].split('\t')[1:]
                            ]
                            read_tags_for_pysam.append(
                                [("RG", sample, "Z")] +
                                [(tag_name, int(tag_value) if tag_type ==
                                  'i' else tag_value, tag_type) for tag_name,
                                 tag_type, tag_value in read_tags])
                            if debug:
                                print(read_tags)
                                print(read_tags_for_pysam[read_number])

                            # figure out sequence: reverse reads are stored
                            # reverse-complemented, i.e. in genomic sense
                            if read_reverse[read_number]:
                                read_sequence = str(
                                    Bio.Seq.Seq(read_pair[read_number][1]).
                                    reverse_complement())
                            else:
                                read_sequence = read_pair[read_number][1]
                            read_sequences.append(read_sequence)

                            # align read to target sequence -- note both of these are in genomic sense!
                            try:
                                cigar_read_start_offset, cigartuples = align_and_find_cigar(
                                    read_sequence, probe_target_sequence)
                                read_tags_for_pysam[read_number].append(
                                    ("so", cigar_read_start_offset, 'i'))

                                # remember cigar
                                read_cigars.append(cigartuples)
                                # adjust start -- need to use zero-based coords here but probe_start is 1-based
                                read_starts[
                                    read_number] = probe_start - 1 + cigar_read_start_offset
                            except AssertionError:
                                # Repeat the alignment with debug=True
                                # (presumably to emit diagnostics) and then
                                # propagate the original failure.
                                align_and_find_cigar(
                                    read_sequence,
                                    probe_target_sequence,
                                    debug=True)
                                raise

                        if debug:
                            print(read_cigars)
                            print(read_starts)

                        for read_number in range(2):
                            mate_number = 1 - read_number

                            # create aligned segment
                            a = pysam.AlignedSegment()
                            a.mapping_quality = 255  #always best quality
                            a.query_name = read_names_original[0]
                            a.query_sequence = read_sequences[read_number]
                            # qualities have to be reversed together with
                            # the sequence for reverse-strand reads
                            a.query_qualities = pysam.qualitystring_to_array(
                                read_pair[read_number][2][::-1]
                                if read_reverse[read_number] else
                                read_pair[read_number][2])
                            a.set_tags(read_tags_for_pysam[read_number])
                            a.cigartuples = read_cigars[read_number]

                            a.reference_id = probe_chr_index
                            a.reference_start = read_starts[read_number]
                            a.next_reference_id = probe_chr_index
                            a.next_reference_start = read_starts[mate_number]
                            # NOTE: template_length is not set here.

                            a.is_paired = True
                            a.is_proper_pair = True
                            a.is_read1 = read_number == 0
                            a.is_read2 = read_number == 1
                            a.is_reverse = read_reverse[read_number]
                            a.mate_is_reverse = read_reverse[mate_number]

                            if debug:
                                print(a)
                            pairedreads.write(a)
                            if debug:
                                # in debug mode only the first read of the
                                # pair is written
                                break
                    # normally we always get an alignment, but apparently sometimes we don't?
                    except AmplimapNoAlignment:
                        # skip the whole pair; nothing has been written yet
                        # because writing only starts after both reads aligned
                        counters['no_alignment'] += 1

    log.info('%s done - %d pairs in total, %d without alignment', sample,
             counters['pairs_total'], counters['no_alignment'])

    log.info("BAM file created: %s", output)