Ejemplo n.º 1
0
def assemble_seq(readid2seq, junc_seq, tmp_file_path):
    """Assemble reads with ``fml-asm`` and return the longest junction-flanking contig.

    The reads are written as FASTA to ``tmp_file_path + ".tmp3.assemble_input.fa"``,
    ``fml-asm`` is run on that file with its stdout redirected to
    ``tmp_file_path + ".tmp3.assemble_output.fq"``, and the second line of every
    four-line record of that output is locally aligned (match 2, mismatch -1)
    against ``junc_seq`` and against its reverse complement.

    Args:
        readid2seq: Mapping of read id to read sequence; written in sorted
            id order.
        junc_seq: Junction sequence used to anchor the assembled contigs.
        tmp_file_path: Prefix for the two temporary files. They are left on
            disk when the function returns.

    Returns:
        The longest candidate sequence found, or ``""`` when no alignment
        reaches the score threshold.

    Raises:
        SystemExit: With status 1 when ``fml-asm`` returns a non-zero code.
    """
    match = 2
    mismatch = -1
    # Alignments scoring below this value are ignored.
    min_align_score = 35

    scoring = swalign.NucleotideScoringMatrix(match, mismatch)
    sw = swalign.LocalAlignment(scoring)

    fasta_path = tmp_file_path + ".tmp3.assemble_input.fa"
    fastq_path = tmp_file_path + ".tmp3.assemble_output.fq"

    with open(fasta_path, 'w') as hout:
        for tid in sorted(readid2seq):
            hout.write('>' + tid + '\n')
            hout.write("%s\n" % (readid2seq[tid],))

    with open(fastq_path, 'w') as hout:
        sret = subprocess.call(["fml-asm", fasta_path], stdout=hout)

    if sret != 0:
        sys.stderr.write("fml-asm error, error code: " + str(sret) + '\n')
        # Exit with a non-zero status so the failure is visible to the caller's
        # shell; a bare sys.exit() would report success.
        sys.exit(1)

    # Loop-invariant: compute the reverse complement of the junction once.
    junc_seq_rc = my_seq.reverse_complement(junc_seq)

    longest_contig = ""
    with open(fastq_path, 'r') as hin:
        for line_num, line in enumerate(hin, 1):
            # Only the second line of each four-line record is a sequence.
            if line_num % 4 != 2:
                continue
            tseq = line.rstrip('\n')

            # Forward hit: keep what follows the aligned region.
            # (presumably r_end is the alignment end on tseq -- confirm
            # against the swalign version in use)
            aln_fwd = sw.align(tseq, junc_seq)
            if aln_fwd.score >= min_align_score:
                candidate = tseq[aln_fwd.r_end:]
                if len(candidate) > len(longest_contig):
                    longest_contig = candidate

            # Reverse-complement hit: keep what precedes the aligned region,
            # flipped back to the junction's orientation.
            aln_rev = sw.align(tseq, junc_seq_rc)
            if aln_rev.score >= min_align_score:
                candidate = my_seq.reverse_complement(tseq[:aln_rev.r_pos])
                if len(candidate) > len(longest_contig):
                    longest_contig = candidate

    # NOTE: the temporary FASTA/FASTQ files are deliberately not removed here
    # (the cleanup calls were already disabled in the original code).
    return longest_contig
Ejemplo n.º 2
0
def generate_contig(input_file, output_file, tumor_bp_file, tumor_bam,
                    reference_genome, min_contig_length):
    """Assemble a contig for each breakpoint and append it to the input rows.

    Steps:
      1. For every row of ``input_file`` (rows whose first column is ``"Chr"``
         are skipped), look up matching records in the tabix-indexed
         ``tumor_bp_file`` and remember which read ids belong to which
         breakpoint key (the first four columns joined by ``","``).
      2. Scan ``tumor_bam`` and write the sequences of those reads to
         ``output_file + ".tmp2.contig.unsorted"``.
      3. Sort that file by key with the external ``sort`` command into
         ``output_file + ".tmp2.contig.sorted"``.
      4. For each key, call ``assemble_seq`` with its reads and a 20-base
         reference window next to the breakpoint.
      5. Write every input row whose contig is at least
         ``min_contig_length`` long to ``output_file`` with the contig
         appended as an extra column.

    Args:
        input_file: Tab-separated breakpoint list; columns 1-4 are used as
            chromosome, position, direction ("+" or other) and junction
            sequence.
        output_file: Destination path; also used as prefix for temp files.
        tumor_bp_file: Tabix-indexed breakpoint file with read ids in its
            sixth column, separated by ``";"``.
        tumor_bam: BAM file the supporting reads are taken from.
        reference_genome: Passed through to ``my_seq.get_seq``.
        min_contig_length: Rows with a shorter contig are dropped.

    Raises:
        SystemExit: With status 1 when the external ``sort`` fails.
    """
    unsorted_path = output_file + ".tmp2.contig.unsorted"
    sorted_path = output_file + ".tmp2.contig.sorted"

    # --- 1. read id -> breakpoint key -------------------------------------
    readid2key = {}
    tumor_bp_db = pysam.TabixFile(tumor_bp_file)
    try:
        with open(input_file, 'r') as hin:
            for line in hin:
                F = line.rstrip('\n').split('\t')
                if F[0] == "Chr":
                    continue

                # Best effort: a failing lookup (or an unparsable position)
                # is reported and the row is skipped.
                try:
                    bp_pos = int(F[1])
                    records = tumor_bp_db.fetch(F[0], bp_pos - 1, bp_pos + 1)
                except Exception as inst:
                    sys.stderr.write("%s: %s\n" % (type(inst), inst.args))
                    continue

                key = ','.join(F[:4])
                for record_line in records:
                    record = record_line.split('\t')
                    if (record[0] == F[0]
                            and int(record[1]) + 1 == bp_pos
                            and record[3] == F[2]
                            and record[4] == F[3]):
                        for readid in record[5].split(';'):
                            # Drop the trailing "/1" or "/2" mate suffix.
                            readid2key[re.sub(r'/\d$', '', readid)] = key
    finally:
        tumor_bp_db.close()

    # --- 2. collect the sequences of the supporting reads -----------------
    bamfile = pysam.Samfile(tumor_bam, "rb")
    try:
        with open(unsorted_path, 'w') as hout:
            for read in bamfile.fetch():
                if read.qname not in readid2key:
                    continue

                flag = int(read.flag)
                # skip secondary (0x100) and supplementary (0x800) alignments
                if flag & (0x100 | 0x800):
                    continue
                # skip duplicated reads (0x400)
                if flag & 0x400:
                    continue

                # 0x40 marks the first read of a pair.
                mate_suffix = "/1" if flag & 0x40 else "/2"
                hout.write(readid2key[read.qname] + '\t' + read.qname +
                           mate_suffix + '\t' + read.query_sequence + '\n')
    finally:
        bamfile.close()

    # --- 3. bring the reads of each key together --------------------------
    with open(sorted_path, 'w') as hout:
        sret = subprocess.call(["sort", "-k1,1", unsorted_path], stdout=hout)
    if sret != 0:
        # A failed sort would leave a truncated file and silently drop
        # contigs, so stop here instead.
        sys.stderr.write("sort error, error code: " + str(sret) + '\n')
        sys.exit(1)

    # --- 4. assemble one contig per key -----------------------------------
    key2contig = {}
    current_key = ""
    current_reads = {}
    current_junc_seq = ""
    with open(sorted_path) as hin:
        for line in hin:
            F = line.rstrip('\n').split('\t')
            if F[0] != current_key:
                # Key changed: flush the reads gathered for the previous key.
                if current_reads:
                    key2contig[current_key] = assemble_seq(
                        current_reads, current_junc_seq, output_file)

                current_key = F[0]
                current_reads = {}
                FF = current_key.split(',')
                bp_pos = int(FF[1])
                if FF[2] == "+":
                    current_junc_seq = my_seq.get_seq(
                        reference_genome, FF[0], bp_pos - 20, bp_pos)
                else:
                    current_junc_seq = my_seq.reverse_complement(
                        my_seq.get_seq(reference_genome, FF[0], bp_pos,
                                       bp_pos + 20))

            current_reads[F[1]] = F[2]

        # Flush the last group.
        if current_reads:
            key2contig[current_key] = assemble_seq(
                current_reads, current_junc_seq, output_file)

    # --- 5. write rows that obtained a long enough contig -----------------
    with open(output_file, 'w') as hout:
        with open(input_file, 'r') as hin:
            for line in hin:
                F = line.rstrip('\n').split('\t')
                key = ','.join(F[:4])

                if key not in key2contig:
                    continue
                contig = key2contig[key]
                if len(contig) < min_contig_length:
                    continue

                hout.write('\t'.join(F) + '\t' + contig + '\n')
Ejemplo n.º 3
0
def parse_bp_from_bam(input_bam, output_file, key_seq_size,
                      min_major_clip_size, max_minor_clip_size):
    """Write one breakpoint record per sufficiently clipped read end.

    Every alignment in ``input_bam`` that is mapped, primary and not a
    duplicate is inspected. For each end whose first/last CIGAR operation is
    a clip (op 4 or 5) of at least ``min_major_clip_size`` bases, one
    tab-separated line is written to ``output_file``:

        chromosome, junction position - 1, junction position,
        direction ("+" for a right clip, "-" for a left clip),
        junction sequence, read id + "/1" or "/2", MAPQ, clip size,
        aligned length

    For a right clip the junction sequence is the first ``key_seq_size``
    bases of the clipped part; for a left clip it is the reverse complement
    of the last ``key_seq_size`` bases of the clipped part.

    Args:
        input_bam: Path of the BAM file to scan.
        output_file: Path of the text file to write.
        key_seq_size: Maximum length of the reported junction sequence.
        min_major_clip_size: Minimum clip length for an end to be reported.
        max_minor_clip_size: Accepted for interface compatibility; not used
            by this function.
    """
    bamfile = pysam.Samfile(input_bam, "rb")
    try:
        with open(output_file, "w") as hout:
            # maybe add the regional extraction of bam files
            for read in bamfile.fetch():
                flag = int(read.flag)

                # skip if not aligned (0x4)
                if flag & 0x4:
                    continue
                # skip secondary (0x100) and supplementary (0x800) alignments
                if flag & (0x100 | 0x800):
                    continue
                # skip duplicated reads (0x400)
                if flag & 0x400:
                    continue

                # A single operation means no clipping; an empty or missing
                # CIGAR cannot be evaluated at all.
                cigar = read.cigar
                if not cigar or len(cigar) < 2:
                    continue

                # NOTE(review): hard clips (op 5) are treated like soft clips
                # (op 4) although hard-clipped bases are normally absent from
                # read.seq -- confirm the offsets below are intended for them.
                left_clipping = cigar[0][1] if cigar[0][0] in (4, 5) else 0
                right_clipping = cigar[-1][1] if cigar[-1][0] in (4, 5) else 0

                if (left_clipping < min_major_clip_size
                        and right_clipping < min_major_clip_size):
                    continue

                chrom = bamfile.getrname(read.tid)
                pos_current = int(read.pos + 1)
                aligned_len = read.alen
                read_id = read.qname + ("/1" if flag & 0x40 else "/2")

                # when the right side is clipped...
                if right_clipping >= min_major_clip_size:
                    junc_pos = pos_current + aligned_len - 1
                    seq_start = read.rlen - right_clipping
                    juncseq = read.seq[seq_start:seq_start + key_seq_size]

                    hout.write('\t'.join([
                        chrom,
                        str(junc_pos - 1),
                        str(junc_pos), "+", juncseq,
                        read_id,
                        str(read.mapq),
                        str(right_clipping),
                        str(aligned_len),
                    ]) + '\n')

                # when the left side is clipped...
                if left_clipping >= min_major_clip_size:
                    junc_pos = pos_current
                    # Clamp at 0: a clip shorter than key_seq_size would give
                    # a negative start, which wraps around to the end of the
                    # read and produces an empty or wrong slice.
                    seq_start = max(left_clipping - key_seq_size, 0)
                    juncseq = my_seq.reverse_complement(
                        read.seq[seq_start:left_clipping])

                    hout.write('\t'.join([
                        chrom,
                        str(junc_pos - 1),
                        str(junc_pos), "-", juncseq,
                        read_id,
                        str(read.mapq),
                        str(left_clipping),
                        str(aligned_len),
                    ]) + '\n')
    finally:
        bamfile.close()
Ejemplo n.º 4
0
def filter_by_base_quality(input_file, output_file, tumor_bam, min_support_num,
                           permissible_range):
    """Keep breakpoints supported by enough clipped reads of good base quality.

    For each row of ``input_file`` (columns: chromosome, position, direction,
    junction sequence) the reads of ``tumor_bam`` around the position are
    fetched. A read supports the breakpoint when it is mapped, primary, not a
    duplicate, clipped by at least 2 bases on the relevant side, its junction
    position equals the row's position, and up to 8 clipped bases next to the
    junction occur inside the row's junction sequence (reverse-complemented
    for direction "-"). The mean base quality of those bases is recorded per
    supporting read.

    A row is written to ``output_file`` (its four columns plus the number of
    supporting reads) when:
      * the number of supporting reads is at least ``min_support_num``,
      * the median of the recorded qualities is at least 20, and
      * the second-highest recorded quality is at least 30 (so at least two
        supporting reads are required).

    Args:
        input_file: Tab-separated breakpoint list.
        output_file: Path of the filtered list to write.
        tumor_bam: Indexed BAM file to fetch reads from.
        min_support_num: Minimum number of supporting reads.
        permissible_range: Accepted for interface compatibility; not used by
            this function.
    """
    # Number of clipped bases compared against the junction sequence.
    key_len = 8

    # Open the BAM once; reopening it for every input row leaked a handle
    # per row.
    tumor_bamfile = pysam.Samfile(tumor_bam, "rb")
    try:
        with open(output_file, 'w') as hout:
            with open(input_file, 'r') as hin:
                for line in hin:
                    F = line.rstrip('\n').split('\t')

                    # Best effort: a failing fetch (or an unparsable
                    # position) is reported and the row is skipped.
                    try:
                        bp_pos = int(F[1])
                        records = tumor_bamfile.fetch(F[0],
                                                      max(bp_pos - 1, 0),
                                                      bp_pos + 1)
                    except Exception as inst:
                        sys.stderr.write("%s: %s\n" % (type(inst), inst.args))
                        continue

                    key_baseq = []
                    for read in records:
                        flag = int(read.flag)

                        # skip if not aligned (0x4)
                        if flag & 0x4:
                            continue
                        # skip secondary (0x100) / supplementary (0x800)
                        if flag & (0x100 | 0x800):
                            continue
                        # skip duplicated reads (0x400)
                        if flag & 0x400:
                            continue

                        # no clipping (or no usable CIGAR)
                        cigar = read.cigar
                        if not cigar or len(cigar) < 2:
                            continue

                        left_clipping = (cigar[0][1]
                                         if cigar[0][0] in (4, 5) else 0)
                        right_clipping = (cigar[-1][1]
                                          if cigar[-1][0] in (4, 5) else 0)

                        if F[2] == "+":
                            if right_clipping < 2:
                                continue
                            if int(read.pos + 1) + read.alen - 1 != bp_pos:
                                continue

                            seq_start = read.rlen - right_clipping
                            seq_end = min(seq_start + key_len, read.rlen)
                            juncseq = read.seq[seq_start:seq_end]

                        elif F[2] == "-":
                            if left_clipping < 2:
                                continue
                            if int(read.pos + 1) != bp_pos:
                                continue

                            # The slice must run from the lower to the higher
                            # offset; with the bounds swapped it is always
                            # empty, and an empty string is "in" every
                            # junction sequence.
                            seq_start = max(left_clipping - key_len, 0)
                            seq_end = left_clipping
                            juncseq = my_seq.reverse_complement(
                                read.seq[seq_start:seq_end])

                        else:
                            continue

                        if juncseq in F[3]:
                            key_baseq.append(float(numpy.mean(
                                read.query_qualities[seq_start:seq_end])))

                    tumor_var_read = len(key_baseq)
                    if tumor_var_read < min_support_num:
                        continue
                    # The second-highest quality is undefined for fewer than
                    # two supporting reads; such rows cannot pass the filter.
                    if tumor_var_read < 2:
                        continue
                    if numpy.median(key_baseq) < 20:
                        continue
                    if sorted(key_baseq)[-2] < 30:
                        continue

                    hout.write(F[0] + '\t' + F[1] + '\t' + F[2] + '\t' +
                               F[3] + '\t' + str(tumor_var_read) + '\n')
    finally:
        tumor_bamfile.close()
Ejemplo n.º 5
0
def parse_bp_from_bam(input_bam, output_file, key_seq_size,
                      min_major_clip_size, max_minor_clip_size):
    """Extract breakpoints from the clipped reads of a BAM file.

    input: BAM file (``input_bam``)
    output: ``output_file``, one tab-separated line per reported read end:
        juncChr, juncPos - 1, juncPos,
        Dir (right clipping: "+", left clipping: "-"),
        Juncseq, read id + ("/1" or "/2"), MAPQ, clipping size,
        alignment size

    Unmapped, secondary, supplementary and duplicate alignments are skipped,
    as are reads without clipping. An end is reported when its clip (CIGAR
    op 4 or 5) is at least ``min_major_clip_size`` long. Juncseq is at most
    ``key_seq_size`` bases of the clipped part next to the junction; for a
    left clip it is reverse-complemented.

    ``max_minor_clip_size`` is accepted for interface compatibility and is
    not used by this function.
    """
    bamfile = pysam.Samfile(input_bam, "rb")
    try:
        with open(output_file, "w") as hout:
            # maybe add the regional extraction of bam files
            for read in bamfile.fetch():
                flag = int(read.flag)

                # skip if not aligned (0x4)
                if flag & 0x4:
                    continue
                # skip secondary (0x100) and supplementary (0x800) alignments
                if flag & (0x100 | 0x800):
                    continue
                # skip duplicated reads (0x400)
                if flag & 0x400:
                    continue

                # A single operation means no clipping; an empty or missing
                # CIGAR cannot be evaluated at all.
                cigar = read.cigar
                if not cigar or len(cigar) < 2:
                    continue

                # get the clipping size on both sides
                # NOTE(review): hard clips (op 5) are treated like soft clips
                # (op 4) although hard-clipped bases are normally absent from
                # read.seq -- confirm the offsets below are intended for them.
                left_clipping = cigar[0][1] if cigar[0][0] in (4, 5) else 0
                right_clipping = cigar[-1][1] if cigar[-1][0] in (4, 5) else 0

                if (left_clipping < min_major_clip_size
                        and right_clipping < min_major_clip_size):
                    continue

                # get the alignment basic information
                chrom = bamfile.getrname(read.tid)
                pos_current = int(read.pos + 1)
                aligned_len = read.alen
                read_id = read.qname + ("/1" if flag & 0x40 else "/2")

                # when the right side is clipped...
                if right_clipping >= min_major_clip_size:
                    junc_pos = pos_current + aligned_len - 1
                    seq_start = read.rlen - right_clipping
                    juncseq = read.seq[seq_start:seq_start + key_seq_size]

                    hout.write('\t'.join([
                        chrom,
                        str(junc_pos - 1),
                        str(junc_pos), "+", juncseq,
                        read_id,
                        str(read.mapq),
                        str(right_clipping),
                        str(aligned_len),
                    ]) + '\n')

                # when the left side is clipped...
                if left_clipping >= min_major_clip_size:
                    junc_pos = pos_current
                    # Clamp at 0: a clip shorter than key_seq_size would give
                    # a negative start, which wraps around to the end of the
                    # read and produces an empty or wrong slice.
                    seq_start = max(left_clipping - key_seq_size, 0)
                    juncseq = my_seq.reverse_complement(
                        read.seq[seq_start:left_clipping])

                    hout.write('\t'.join([
                        chrom,
                        str(junc_pos - 1),
                        str(junc_pos), "-", juncseq,
                        read_id,
                        str(read.mapq),
                        str(left_clipping),
                        str(aligned_len),
                    ]) + '\n')
    finally:
        bamfile.close()