Example #1
0
def node_is_similar(seq1, seq2):
    """
    Decide whether two node sequences are similar enough to be considered the same.

    seq1, seq2 --- the two sequences to compare (the order does not matter,
                   the longer one is always used as the alignment reference)

    Returns False if either sequence is empty and True if both have length <= 2.
    Otherwise returns True only if aligning the shorter sequence against the
    longer one produces a result under the thresholds chosen from the
    module-level EXPECTED_ERR_RATE.

    Raises Exception if EXPECTED_ERR_RATE is 2 or above (not implemented).
    """
    l1 = len(seq1)
    l2 = len(seq2)
    if l1 == 0 or l2 == 0:
        return False
    # very short sequences are not worth aligning; treat them as similar
    if l1 <= 2 and l2 <= 2:
        return True
    # always make seq1 the longer one
    if l1 < l2:
        l1, l2 = l2, l1
        seq1, seq2 = seq2, seq1
    o1 = Aligner(seq1,
                 match=2,
                 mismatch=5,
                 gap_open=3,
                 gap_extend=1,
                 report_secondary=False,
                 report_cigar=False)

    if EXPECTED_ERR_RATE == 0:
        # error-free mode: the whole shorter sequence must align with a
        # perfect score (match=2 per base)
        res = o1.align(seq2, min_score=l2 * 2 * 1.0, min_len=l2 * 1.0)
    elif EXPECTED_ERR_RATE < 2:
        # relaxed mode: at least 90% of the shorter sequence must align.
        # NOTE(review): min_score is 80% of the perfect score of the LONGER
        # sequence (l1), not the shorter one -- confirm this is intended.
        res = o1.align(seq2, min_score=int(l1 * 2 * .80), min_len=int(l2 * .9))
    else:
        # call form of raise works on both Python 2 and Python 3
        raise Exception(
            "Expected error rate not implemented for {0}% and above".format(
                EXPECTED_ERR_RATE))
    # align() gives back None when no alignment satisfies the thresholds
    return res is not None
Example #2
0
def node_is_similar(seq1, seq2):
    """
    Tell whether two node sequences are close enough to count as the same.

    seq1, seq2 --- the sequences to compare, in any order

    An empty sequence is never similar to anything; two sequences of length
    <= 2 are always similar. In every other case the shorter sequence is
    aligned against the longer one and the answer is whether the aligner
    returned a result.
    """
    len_a, len_b = len(seq1), len(seq2)
    if 0 in (len_a, len_b):
        return False
    if max(len_a, len_b) <= 2:
        return True

    # the longer sequence is the alignment reference, the shorter one the query
    if len_a >= len_b:
        longer, shorter = seq1, seq2
        long_len, short_len = len_a, len_b
    else:
        longer, shorter = seq2, seq1
        long_len, short_len = len_b, len_a

    aligner = Aligner(
        longer,
        match=2,
        mismatch=5,
        gap_open=3,
        gap_extend=1,
        report_secondary=False,
        report_cigar=False,
    )
    # at least 90% of the shorter sequence must be aligned, and the score must
    # reach 80% of the longer sequence's perfect score (2 per matched base)
    score_floor = int(long_len * 2 * .80)
    length_floor = int(short_len * .9)
    hit = aligner.align(shorter, min_score=score_floor, min_len=length_floor)
    return hit is not None
Example #3
0
def filter_reads(readfile):
    """
    Keep the FASTQ reads that contain the module-level tn_seq and write what
    follows the match to filtered_filename.

    readfile --- path of the FASTQ file to read

    Each read is aligned against tn_seq using the module-level min_score and
    min_match_length. When an alignment is found and at least
    min_remaining_length bases remain after the end of the match, the
    remaining sequence and qualities are written as a FASTQ record.
    A summary of matched / total reads is printed at the end.
    """
    print("Filtering reads\n")
    ssw = Aligner(tn_seq)
    total = 0
    matched = 0
    # both files are managed by the with statement so the input handle is
    # closed as well (it used to be left open)
    with open(filtered_filename, 'w') as f, open(readfile) as reads:
        for title, seq, qual in FastqGeneralIterator(reads):
            total += 1
            res = ssw.align(seq, min_score, min_match_length)
            if res:
                # first position after the aligned part of the read
                end = res.query_end + 1
                if len(seq) - end >= min_remaining_length:
                    matched += 1
                    f.write('@%s\n%s\n+\n%s\n' % (title, seq[end:], qual[end:]))
    print("%s of %s read had the tn seq\n" % (matched, total))
Example #4
0
def validate_reconstructed_seq(seq, orig):
    """
    seq --- the sequence that is reconstructed
    orig --- the original sequence

    because the reconstructed seq can be longer, we don't care about deletions
      (deletions w.r.t could just be exon skipping or minor base errors)
    we only care that there is NOT a lot of insertions (which would indicate error in my bubble solution)

    Returns a tuple (is_valid, cigar_string). is_valid is False when any
    single insertion is longer than 5, or when orig could not be aligned to
    seq at all; in the latter case cigar_string is None.
    """
    o1 = Aligner(seq, match=2, mismatch=5, gap_open=3, gap_extend=1, report_secondary=False, report_cigar=True)
    l2 = len(orig)
    # the whole of orig must align, with at least 90% of the perfect score
    res = o1.align(orig, min_score=l2*2*.90, min_len=l2)
    if res is None:
        # align() can come back empty when the thresholds are not met (other
        # callers in this file test for None); report a failed validation
        # instead of crashing on res.cigar_string
        return False, None
    for num, op in iter_cigar_string(res.cigar_string):
        if op == 'I' and num > 5:
            return False, res.cigar_string
    return True, res.cigar_string
Example #5
0
def validate_reconstructed_seq(seq, orig):
    """
    seq --- the sequence that is reconstructed
    orig --- the original sequence

    because the reconstructed seq can be longer, we don't care about deletions
      (deletions w.r.t could just be exon skipping or minor base errors)
    we only care that there is NOT a lot of insertions (which would indicate error in my bubble solution)

    Returns (ok, cigar_string): ok is True when no insertion in the alignment
    of orig against seq is longer than 5. If the aligner yields no alignment,
    (False, None) is returned.
    """
    o1 = Aligner(seq, match=2, mismatch=5, gap_open=3, gap_extend=1, report_secondary=False, report_cigar=True)
    l2 = len(orig)
    # require all of orig to be aligned with >= 90% of the maximum score
    res = o1.align(orig, min_score=l2*2*.90, min_len=l2)
    if res is None:
        # no alignment passed the thresholds (align() is checked against None
        # elsewhere in this file), so there is no cigar string to inspect
        return False, None
    cigar = res.cigar_string
    has_long_insertion = any(
        op == 'I' and num > 5 for num, op in iter_cigar_string(cigar))
    return (not has_long_insertion), cigar
Example #6
0
def node_is_similar(seq1, seq2):
    """
    Decide whether two node sequences are similar enough to be considered the same.

    seq1, seq2 --- the two sequences to compare (order does not matter)

    Returns False if either sequence is empty, True if both have length <= 2,
    and otherwise whether the shorter sequence aligns to the longer one under
    the thresholds selected by the module-level EXPECTED_ERR_RATE.

    Raises Exception if EXPECTED_ERR_RATE is 2 or above (not implemented).
    """
    l1 = len(seq1)
    l2 = len(seq2)
    if l1 == 0 or l2 == 0:
        return False
    if l1 <= 2 and l2 <= 2:
        return True
    # always make seq1 the longer one
    if l1 < l2:
        l1, l2 = l2, l1
        seq1, seq2 = seq2, seq1
    o1 = Aligner(seq1, match=2, mismatch=5, gap_open=3, gap_extend=1, report_secondary=False, report_cigar=False)

    if EXPECTED_ERR_RATE == 0:
        # exact mode: the whole shorter sequence must align with a perfect score.
        # (this used to call `os.align`, a typo for the aligner object `o1`)
        res = o1.align(seq2, min_score=l2*2*1.0, min_len=l2*1.0)
    elif EXPECTED_ERR_RATE < 2:
        # relaxed mode: 90% of the shorter sequence must align.
        # NOTE(review): min_score uses the LONGER length l1 -- confirm intended.
        res = o1.align(seq2, min_score=int(l1*2*.80), min_len=int(l2*.9))
    else:
        # call form of raise is valid on both Python 2 and Python 3
        raise Exception("Expected error rate not implemented for {0}% and above".format(EXPECTED_ERR_RATE))
    # a None result means no alignment met the thresholds
    return res is not None
def align(opt):
    """
    Align every query sequence against a single subject sequence and write the
    results to "result.sam" in the current working directory.

    opt --- options object; the attributes read here are subject, query, qtype,
            match, mismatch, gap_open, gap_extend, min_score, min_len, reverse
            and unaligned

    Input files whose name ends in ".gz" (case-insensitive) are opened with
    gzip, anything else with the builtin open(). Aligned queries are written
    with flag 0 (or 16 when the orientation flag from find_best_align is
    false); unaligned queries are written with flag 4 only when opt.unaligned
    is set. Progress is printed at every 5% of the count given by count_seq().
    """
    print ("Inport subject sequence")
    # Import fasta subject
    if opt.subject.rpartition(".")[2].lower() == "gz":
        subject_handle = gzip.open(opt.subject, "r")
    else:
        subject_handle = open(opt.subject, "r")
    # SeqIO.read returns a complete record, so the handle is not needed afterwards
    try:
        subject = SeqIO.read(subject_handle, "fasta")
    finally:
        subject_handle.close()

    print ("Inport query sequences and count the number of sequences")
    # Import query sequences (count them first for the progress bar)
    if opt.query.rpartition(".")[2].lower() == "gz":
        nseq = count_seq(opt.query, opt.qtype, True)
        query_handle = gzip.open(opt.query, "r")
    else:
        nseq = count_seq(opt.query, opt.qtype, False)
        query_handle = open(opt.query, "r")

    # the parser reads lazily, so the handle must stay open until the loop ends
    try:
        query_gen = SeqIO.parse(query_handle, opt.qtype)

        print("{} contains {} sequences to align".format(opt.query, nseq))
        # Query counts (5%, 10%, ... 100%) at which progress is reported
        progress_steps = set(int(nseq*i/100.0) for i in range(5,101,5))

        print ("Initialize ssw aligner with the subject sequence")
        # Init an Aligner object with the reference sequence
        ssw = Aligner(
            str(subject.seq),
            match=int(opt.match),
            mismatch=int(opt.mismatch),
            gap_open=int(opt.gap_open),
            gap_extend=int(opt.gap_extend),
            report_secondary=False,
            report_cigar=True)

        # Write the header of the SAM file
        with open("result.sam", "w") as f:
            f.write("@HD\tVN:1.0\tSO:unsorted\n")
            f.write("@SQ\tSN:{}\tLN:{}\n".format(subject.id, len(subject.seq)))
            f.write("@PG\tID:Striped-Smith-Waterman\tPN:pyssw\tVN:0.1\n")
            f.write("@CO\tScore_values = match {}, mismatch {}, gap_open {}, gap_extend {}\n".format(
                opt.match,
                opt.mismatch,
                opt.gap_open,
                opt.gap_extend))
            f.write("@CO\tFilter Options = min_score {}, min_len {}\n".format(
                opt.min_score,
                opt.min_len))

            # Thresholds do not change between queries: convert them once
            min_score = float(opt.min_score)
            min_len = int(opt.min_len)

            print ("Starting alignment of queries against the subject sequence")
            start = time()
            # Align each query along the subject and write the result in the SAM file
            i = 0
            for query in query_gen:

                # Find the best alignment
                if opt.reverse:
                    al, orient = find_best_align(ssw, query, min_score, min_len)
                else:
                    al, orient = ssw.align(str(query.seq), min_score, min_len), True

                # If valid match found
                if al:
                    f.write(sam_line(
                        qname=query.id,
                        flag=0 if orient else 16,
                        rname=subject.id,
                        # ref_begin shifted by one for the SAM POS field
                        pos=al.ref_begin+1,
                        cigar=al.cigar_string,
                        seq=str(query.seq),
                        qual=SeqIO.QualityIO._get_sanger_quality_str(query) if opt.qtype == "fastq" else "*",
                        tags=["AS:i:{}".format(al.score)]))

                # If no valid match found and -u flag activated (report unaligned)
                elif opt.unaligned:
                    f.write(sam_line(
                        qname=query.id,
                        flag=4,
                        seq=str(query.seq),
                        qual=SeqIO.QualityIO._get_sanger_quality_str(query) if opt.qtype == "fastq" else "*"))
                # Else = match unreported

                # Progress bar
                i += 1
                if i in progress_steps:
                    frac = i/float(nseq)
                    t = time()-start
                    # remaining time is extrapolated from the time spent so far
                    print ("{} sequences \t{}% \tRemaining time = {}s".format(i, int(frac*100), round(t/frac-t, 2)))

            print ("\n{} Sequences processed in {}s".format(i, round(time()-start, 2)))
    finally:
        query_handle.close()
Example #8
0
def align(opt):
    """
    Align all query sequences against one subject sequence and write a SAM
    file named "result.sam" in the current working directory.

    opt --- options object; this function reads its subject, query, qtype,
            match, mismatch, gap_open, gap_extend, min_score, min_len,
            reverse and unaligned attributes

    A path ending in ".gz" (case-insensitive) is opened through gzip,
    otherwise through the builtin open(). Queries with a valid alignment get
    flag 0, or 16 when find_best_align reports a false orientation; queries
    without one are written with flag 4 only if opt.unaligned is set.
    Progress is printed at each 5% step of the count returned by count_seq().
    """
    print("Inport subject sequence")
    # Import fasta subject
    if opt.subject.rpartition(".")[2].lower() == "gz":
        subject_handle = gzip.open(opt.subject, "r")
    else:
        subject_handle = open(opt.subject, "r")
    # the single record is fully loaded by SeqIO.read; release the file at once
    try:
        subject = SeqIO.read(subject_handle, "fasta")
    finally:
        subject_handle.close()

    print("Inport query sequences and count the number of sequences")
    # Import query sequences; they are counted first to size the progress bar
    if opt.query.rpartition(".")[2].lower() == "gz":
        nseq = count_seq(opt.query, opt.qtype, True)
        query_handle = gzip.open(opt.query, "r")
    else:
        nseq = count_seq(opt.query, opt.qtype, False)
        query_handle = open(opt.query, "r")

    # SeqIO.parse is consumed lazily by the loop below, so the query file is
    # closed only once everything has been processed (or an error occurred)
    try:
        query_gen = SeqIO.parse(query_handle, opt.qtype)

        print("{} contains {} sequences to align".format(opt.query, nseq))
        # Calculate a step list for the progress bar
        nseq_list = [int(nseq * i / 100.0) for i in range(5, 101, 5)]

        print("Initialize ssw aligner with the subject sequence")
        # Init an Aligner object with the reference sequence
        ssw = Aligner(str(subject.seq),
                      match=int(opt.match),
                      mismatch=int(opt.mismatch),
                      gap_open=int(opt.gap_open),
                      gap_extend=int(opt.gap_extend),
                      report_secondary=False,
                      report_cigar=True)

        # Write the header of the SAM file
        with open("result.sam", "w") as f:
            f.write("@HD\tVN:1.0\tSO:unsorted\n")
            f.write("@SQ\tSN:{}\tLN:{}\n".format(subject.id,
                                                 len(subject.seq)))
            f.write("@PG\tID:Striped-Smith-Waterman\tPN:pyssw\tVN:0.1\n")
            f.write(
                "@CO\tScore_values = match {}, mismatch {}, gap_open {}, gap_extend {}\n"
                .format(opt.match, opt.mismatch, opt.gap_open,
                        opt.gap_extend))
            f.write("@CO\tFilter Options = min_score {}, min_len {}\n".format(
                opt.min_score, opt.min_len))

            print("Starting alignment of queries against the subject sequence")
            start = time()
            # Align each query along the subject and write the result in the SAM file
            i = 0
            for query in query_gen:
                # quality string is only available for fastq input
                if opt.qtype == "fastq":
                    qual = SeqIO.QualityIO._get_sanger_quality_str(query)
                else:
                    qual = "*"

                # Find the best alignment
                if opt.reverse:
                    al, orient = find_best_align(ssw, query,
                                                 float(opt.min_score),
                                                 int(opt.min_len))
                else:
                    al, orient = ssw.align(str(query.seq),
                                           float(opt.min_score),
                                           int(opt.min_len)), True

                # If valid match found
                if al:
                    f.write(
                        sam_line(
                            qname=query.id,
                            flag=0 if orient else 16,
                            rname=subject.id,
                            # ref_begin shifted by one for the SAM POS field
                            pos=al.ref_begin + 1,
                            cigar=al.cigar_string,
                            seq=str(query.seq),
                            qual=qual,
                            tags=["AS:i:{}".format(al.score)]))

                # If no valid match found and -u flag activated (report unaligned)
                elif opt.unaligned:
                    f.write(
                        sam_line(qname=query.id,
                                 flag=4,
                                 seq=str(query.seq),
                                 qual=qual))
                # Else = match unreported

                # Progress bar
                i += 1
                if i in nseq_list:
                    frac = i / float(nseq)
                    t = time() - start
                    # remaining time is extrapolated from the elapsed time
                    print("{} sequences \t{}% \tRemaining time = {}s".format(
                        i, int(frac * 100), round(t / frac - t, 2)))

            print("\n{} Sequences processed in {}s".format(
                i, round(time() - start, 2)))
    finally:
        query_handle.close()
Example #9
0
from ssw_wrap import Aligner


# Reference made of 200 A's: aligning a read against it locates the read's
# poly-A stretch.
polyA = Aligner(
    "A" * 200,
    match=5,
    mismatch=3,
    gap_open=4,
    gap_extend=1,
    report_secondary=False,
    report_cigar=True,
)

query_seq = "CTACGTAGCTAGCTAGCTATGCTAGCTGATGCTAGCTGTGTAAAAAAAAAAAAAAGAAAAAATTTAAAAAAAACGTGCTAGCTGTGCTATTAGCTAGTCGTGGCTAGTGTAC"

# Align the read against the poly-A reference with a minimum score and a
# minimum length of 20
result = polyA.align(query_seq, min_score=20, min_len=20)

# Bounds of the aligned region inside the query (end is inclusive)
begin = result.query_begin
end = result.query_end

# Print the aligned part of the query, indented so that it lines up with the
# full query printed on the next line
padding = " " * begin
aligned_part = query_seq[begin:end + 1]
print(padding + aligned_part + "\n" + query_seq)