def convert_colorspace(color_seq, char_a, char_b):
    """
    round-trip a colorspace read through base space while swapping a base.

    trailing whitespace is stripped from `color_seq`, the read is decoded
    with cs2seq, every `char_a` in the decoded sequence is replaced with
    `char_b` (e.g. C to T), and the result is re-encoded with seq2cs and
    returned.
    """
    trimmed = color_seq.rstrip()
    decoded = cs2seq(trimmed)
    substituted = decoded.replace(char_a, char_b)
    return seq2cs(substituted)
Example #2
0
def convert_colorspace(color_seq, char_a, char_b):
    """
    apply a base substitution (char_a -> char_b, e.g. C -> T) to a
    colorspace read.

    the read is right-stripped and decoded to bases via cs2seq, the
    substitution is made on the decoded sequence, and the value returned
    is that sequence re-encoded via seq2cs.
    """
    bases = cs2seq(color_seq.rstrip())
    # the substitution happens on the decoded bases, not on the colors.
    swapped = bases.replace(char_a, char_b)
    recoded = seq2cs(swapped)
    return recoded
def parse_sam(sam_iter, chr_lengths, get_records, unmapped_name,
              is_colorspace, out_sam):
    """
    generator over the alignment lines of a SAM stream.

    header lines (starting with "@") are handed to copy_header; reads that
    are unmapped or excluded are written to the file `unmapped_name`; every
    remaining alignment is rewritten in place (reference name, position,
    sequence, quality, flag) and yielded.

    arguments:
      sam_iter      -- iterable of tab-delimited SAM lines (strings).
      chr_lengths   -- indexable by reference name (without the leading
                       f/r character) giving its length; also passed on to
                       write_new_header / copy_header.
      get_records   -- callable (read_id, idx) returning a pair
                       (raw_fastq, converted_fastq); `.seq` is read from
                       both and str(raw_fastq) is what gets written to the
                       unmapped file.
      unmapped_name -- path opened for writing; receives excluded reads.
      is_colorspace -- anything int() accepts; non-zero enables the
                       colorspace adjustments.
      out_sam       -- passed to write_new_header / copy_header.

    yields 4-tuples (info, line, read_len, direction) where `info` is a
    dict with keys read_id, seqid, pos0, mapq, nmiss, read_sequence and
    raw_read, `line` is the modified list of SAM fields, and `direction`
    is the 'f' or 'r' character stripped from the reference name.

    raises IndexError if an alignment line carries no NM tag.
    """
    is_colorspace = int(is_colorspace)
    # the with-block closes the unmapped file when the generator is
    # exhausted or closed; the handle used to be left open.
    with open(unmapped_name, "w") as unmapped:
        sys.stderr.write("writing unmapped reads to %s\n" % (unmapped.name, ))
        write_new_header(out_sam, chr_lengths)

        for sline in sam_iter:
            # comment.
            if sline.startswith("@"):
                copy_header(out_sam, sline, chr_lengths)
                continue
            line = sline.split("\t")
            read_id = line[0]
            sam_flag = int(line[1])
            # reset per record so that a flag-0 read never inherits the mate
            # index of an earlier paired record.
            idx = 0
            # no reported alignments.
            # extra via -m
            if sam_flag == 4:
                if "XM:i:0" not in sline:
                    # write stuff that was excluded because of too many
                    # mappings.
                    raw_fastq, _ = get_records(read_id, 0)
                    unmapped.write(str(raw_fastq) + "\n")
                continue
            # extra found via -M
            if line[4] == '0' and sam_flag == 0:
                raw_fastq, _ = get_records(read_id, 0)
                unmapped.write(str(raw_fastq) + "\n")
                continue

            if sam_flag != 0:
                # if the pair doesn't map to same place, skip.
                if line[6] != "=":
                    continue
                # flags are (1 | 2 | 32 | 64) or (1 | 2 | 16 | 128)
                idx = 0 if (sam_flag & 128) == 0 else 1
                # bowtie prints the alignment without the pair end info.
                # add back /1 or /2 here.
                read_id = read_id + "/" + str(idx + 1)

            # the first character of the reference name is the direction
            # ('f' or 'r'); the remainder is the real reference name.
            seqid = line[2]
            direction = seqid[0]
            assert direction in 'fr'

            seqid = seqid[1:]
            line[2] = seqid

            pos0 = int(line[3]) - 1
            if is_colorspace:
                pos0 -= 2
            converted_seq = line[9]

            # we want to include the original, non converted reads
            # in the output file to view the alignment.
            raw_fastq, converted_fastq = get_records(read_id, idx)
            read_len = len(converted_seq) + 3 * is_colorspace
            raw_seq = raw_fastq.seq
            if is_colorspace:
                raw_seq = cs2seq(raw_seq)

            if direction == 'f':
                line[9] = raw_seq
            else:
                # translate the position measured on the 'r' reference into
                # a position measured from the other end of the sequence.
                pos0 = chr_lengths[seqid] - pos0 - read_len
                # adjust mate position as well.
                mpos = int(line[7])
                mpos = chr_lengths[seqid] - mpos - read_len + 2
                line[8] = str(pos0 - mpos + 1)  # insert size
                line[7] = str(mpos)

                line[3] = str(pos0 + 1)
                # since the read matched the flipped genome. we flip it here.
                line[9] = raw_seq = revcomp(raw_seq)
                # flip the quality as well.
                line[10] = line[10][::-1]
                # alignment on reverse strand.
                # NOTE(review): this adds 16 rather than OR-ing it in, so a
                # flag that already has bit 16 set carries into bit 32 --
                # confirm that is intended for paired reads.
                line[1] = str(sam_flag + 16)
                converted_seq = revcomp(converted_fastq.seq)

            if (sam_flag & 128) != 0:  # the other end of the pair.
                line[9] = raw_seq = revcomp(raw_seq)
                converted_seq = revcomp(converted_seq)

            # NM:i:2 -- take everything after the last colon so that values
            # of 10 or more are not truncated to their final digit.
            nm_tags = [x for x in line[11:] if x.startswith("NM:")]
            nmiss = int(nm_tags[0].rstrip().rsplit(":", 1)[-1])
            line[-1] = line[-1].rstrip()
            yield dict(
                read_id=read_id,
                seqid=line[2],
                pos0=pos0,
                mapq=line[4],
                nmiss=nmiss,
                read_sequence=converted_seq,
                raw_read=raw_seq,
            ), line, read_len, direction
Example #4
0
def parse_sam(sam_iter, chr_lengths, get_records, unmapped_name, is_colorspace,
              out_sam):
    """
    generator over the alignment lines of a SAM stream.

    header lines (starting with "@") are handed to copy_header; reads that
    are unmapped or excluded are written to the file `unmapped_name`; every
    remaining alignment is rewritten in place (reference name, position,
    sequence, quality, flag) and yielded.

    arguments:
      sam_iter      -- iterable of tab-delimited SAM lines (strings).
      chr_lengths   -- indexable by reference name (without the leading
                       f/r character) giving its length; also passed on to
                       write_new_header / copy_header.
      get_records   -- callable (read_id, idx) returning a pair
                       (raw_fastq, converted_fastq); `.seq` is read from
                       both and str(raw_fastq) is what gets written to the
                       unmapped file.
      unmapped_name -- path opened for writing; receives excluded reads.
      is_colorspace -- anything int() accepts; non-zero enables the
                       colorspace adjustments.
      out_sam       -- passed to write_new_header / copy_header.

    yields 4-tuples (info, line, read_len, direction) where `info` is a
    dict with keys read_id, seqid, pos0, mapq, nmiss, read_sequence and
    raw_read, `line` is the modified list of SAM fields, and `direction`
    is the 'f' or 'r' character stripped from the reference name.

    raises IndexError if an alignment line carries no NM tag.
    """
    is_colorspace = int(is_colorspace)
    # the with-block closes the unmapped file when the generator is
    # exhausted or closed; the handle used to be left open.
    with open(unmapped_name, "w") as unmapped:
        sys.stderr.write("writing unmapped reads to %s\n" % (unmapped.name, ))
        write_new_header(out_sam, chr_lengths)

        for sline in sam_iter:
            # comment.
            if sline.startswith("@"):
                copy_header(out_sam, sline, chr_lengths)
                continue
            line = sline.split("\t")
            read_id = line[0]
            sam_flag = int(line[1])
            # reset per record so that a flag-0 read never inherits the mate
            # index of an earlier paired record.
            idx = 0
            # no reported alignments.
            # extra via -m
            if sam_flag == 4:
                if "XM:i:0" not in sline:
                    # write stuff that was excluded because of too many
                    # mappings.
                    raw_fastq, _ = get_records(read_id, 0)
                    unmapped.write(str(raw_fastq) + "\n")
                continue
            # extra found via -M
            if line[4] == '0' and sam_flag == 0:
                raw_fastq, _ = get_records(read_id, 0)
                unmapped.write(str(raw_fastq) + "\n")
                continue

            if sam_flag != 0:
                # if the pair doesn't map to same place, skip.
                if line[6] != "=":
                    continue
                # flags are (1 | 2 | 32 | 64) or (1 | 2 | 16 | 128)
                idx = 0 if (sam_flag & 128) == 0 else 1
                # bowtie prints the alignment without the pair end info.
                # add back /1 or /2 here.
                read_id = read_id + "/" + str(idx + 1)

            # the first character of the reference name is the direction
            # ('f' or 'r'); the remainder is the real reference name.
            seqid = line[2]
            direction = seqid[0]
            assert direction in 'fr'

            seqid = seqid[1:]
            line[2] = seqid

            pos0 = int(line[3]) - 1
            if is_colorspace:
                pos0 -= 2
            converted_seq = line[9]

            # we want to include the original, non converted reads
            # in the output file to view the alignment.
            raw_fastq, converted_fastq = get_records(read_id, idx)
            read_len = len(converted_seq) + 3 * is_colorspace
            raw_seq = raw_fastq.seq
            if is_colorspace:
                raw_seq = cs2seq(raw_seq)

            if direction == 'f':
                line[9] = raw_seq
            else:
                # translate the position measured on the 'r' reference into
                # a position measured from the other end of the sequence.
                pos0 = chr_lengths[seqid] - pos0 - read_len
                # adjust mate position as well.
                mpos = int(line[7])
                mpos = chr_lengths[seqid] - mpos - read_len + 2
                line[8] = str(pos0 - mpos + 1)  # insert size
                line[7] = str(mpos)

                line[3] = str(pos0 + 1)
                # since the read matched the flipped genome. we flip it here.
                line[9] = raw_seq = revcomp(raw_seq)
                # flip the quality as well.
                line[10] = line[10][::-1]
                # alignment on reverse strand.
                # NOTE(review): this adds 16 rather than OR-ing it in, so a
                # flag that already has bit 16 set carries into bit 32 --
                # confirm that is intended for paired reads.
                line[1] = str(sam_flag + 16)
                converted_seq = revcomp(converted_fastq.seq)

            if (sam_flag & 128) != 0:  # the other end of the pair.
                line[9] = raw_seq = revcomp(raw_seq)
                converted_seq = revcomp(converted_seq)

            # NM:i:2 -- take everything after the last colon so that values
            # of 10 or more are not truncated to their final digit.
            nm_tags = [x for x in line[11:] if x.startswith("NM:")]
            nmiss = int(nm_tags[0].rstrip().rsplit(":", 1)[-1])
            line[-1] = line[-1].rstrip()
            yield dict(
                read_id=read_id,
                seqid=line[2],
                pos0=pos0,
                mapq=line[4],
                nmiss=nmiss,
                read_sequence=converted_seq,
                raw_read=raw_seq,
            ), line, read_len, direction