Python LocalAlignment Beispiele, swalign.LocalAlignment Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: identifyOfftargetSites_SA.py Projekt: chloefishstar/Guide-seq-Modified

def alignSequences(ref_seq, query_seq, mis_allow):
    """Locate ``ref_seq`` (minus its last 6 bases) in ``query_seq`` on either strand.

    The final 6 characters of ``ref_seq`` (the PAM site, per the original
    note) are removed before aligning. The trimmed reference is aligned
    against ``query_seq`` and against ``reverseComplement(query_seq)`` with
    match=2, mismatch=-1 and gap penalties of -100.

    A strand is reported only if its match count is at least
    ``len(trimmed reference) - mis_allow`` and strictly larger than the other
    strand's match count; a tie reports nothing.

    Returns a 6-item list
    ``[site_sequence, mismatches, site_length, strand, start, end]``, where
    ``start``/``end`` are the aligned query coordinates widened by the
    reference bases (including the 6 removed ones) that fell outside the
    alignment. When no strand qualifies, six empty strings are returned.
    """
    pam_len = 6
    ref_seq = ref_seq[:-pam_len]
    ref_length = len(ref_seq) + pam_len
    matches_required = len(ref_seq) - mis_allow

    scoring = swalign.NucleotideScoringMatrix(2, -1)
    sw = swalign.LocalAlignment(scoring, gap_penalty=-100, gap_extension_penalty=-100, prefer_gap_runs=True)
    forward_alignment = sw.align(ref_seq, query_seq)
    reverse_alignment = sw.align(ref_seq, reverseComplement(query_seq))

    fwd_matches = forward_alignment.matches
    rev_matches = reverse_alignment.matches
    if fwd_matches >= matches_required and fwd_matches > rev_matches:
        best, strand = forward_alignment, "+"
    elif rev_matches >= matches_required and rev_matches > fwd_matches:
        best, strand = reverse_alignment, "-"
    else:
        return [""] * 6

    # Extend the aligned query window by the unaligned reference flanks.
    start = best.q_pos - best.r_pos
    end = best.q_end + (ref_length - best.r_end)
    return [best.query[start:end], ref_length - best.matches - pam_len, end - start, strand, start, end]

Beispiel #2

0

Datei anzeigen

Datei: ALGO_PALWS.py Projekt: uzmakhann/A-package-for-optimizing-the-DNA-fragment-assembly

 def clculatedelta(s,i,j):
     """Compare alignment scores of two pairings around positions ``i`` and ``j``.

     Four local alignments (match=1, mismatch=-3) are computed and dumped, in
     this order: ``(s[i-1], s[j])``, ``(s[i], s[j+1])``, ``(s[i-1], s[i])``
     and ``(s[j], s[j+1])``.

     Returns ``(delt_f, dc)``: ``delt_f`` is the sum of the first two scores
     minus the sum of the last two; ``dc`` adds 1 for each of the last two
     scores above 30 and subtracts 1 for each of the first two above 30.
     """
     import swalign
     aligner = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(1, -3))

     def _dumped_score(left, right):
         # Align, print the alignment, and hand back only the score.
         result = aligner.align(left, right)
         result.dump()
         return result.score

     cross_a = _dumped_score(s[i - 1], s[j])
     cross_b = _dumped_score(s[i], s[j + 1])
     adj_a = _dumped_score(s[i - 1], s[i])
     adj_b = _dumped_score(s[j], s[j + 1])

     delt_f = (cross_a + cross_b) - (adj_a + adj_b)

     dc = 0
     for score, step in ((adj_a, 1), (adj_b, 1), (cross_a, -1), (cross_b, -1)):
         if score > 30:
             dc += step

     return delt_f, dc

Beispiel #3

0

Datei anzeigen

Datei: new_automated_annotation.py Projekt: WhalleyT/STACEI

def _get_mhc_pep(none_tcr, fasta):
    """Sort the non-TCR chains of a FASTA file into peptide / MHC alpha / MHC beta.

    Chains are read with ``_read_fasta``; the chain letter is the second
    ``:``-separated field of the record name, and only letters present in
    ``none_tcr`` are kept. Each kept chain is locally aligned (match=2,
    mismatch=-1) against every entry of the module-level ``mhc_fastas`` and
    the best-scoring reference decides the category:

    * best score below 50: a peptide if the chain has at most 40 residues,
      otherwise the chain is ignored;
    * best reference named ``"B2M"``: MHC beta list, and class I is flagged;
    * reference whose part before ``*`` contains both ``D`` and ``B``: MHC beta;
    * anything else: MHC alpha.

    Returns ``(peptides, mhcas, mhcbs, mhc_class)`` with ``mhc_class`` equal
    to 1 when a B2M chain was found and 2 otherwise.
    """
    aligner = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(2, -1))

    proteins = {}
    with open(fasta) as handle:
        for name, seq in _read_fasta(handle):
            chain = name.split(":")[1]
            if chain in none_tcr:
                proteins[chain] = seq

    peptides, mhcas, mhcbs = [], [], []
    b2m_found = False

    for chain, hit_protein in proteins.items():
        refs = []
        scores = []
        for reference in mhc_fastas:
            refs.append(reference)
            scores.append(aligner.align(mhc_fastas[reference], hit_protein).score)

        max_score = max(scores)
        max_index = scores.index(max_score)

        # Weak Smith-Waterman hit: only short chains are kept, as peptides.
        if max_score < 50:
            if len(hit_protein) <= 40:
                peptides.append(chain)
            continue

        ref_name = refs[max_index]
        locus = ref_name.split("*")[0]
        if ref_name == "B2M":
            mhcbs.append(chain)
            b2m_found = True
        elif "D" in locus and "B" in locus:
            # "D" marks class II and "B" the beta chain
            mhcbs.append(chain)
        else:
            mhcas.append(chain)

    mhc_class = 1 if b2m_found else 2

    return peptides, mhcas, mhcbs, mhc_class

Beispiel #4

0

Datei anzeigen

Datei: identifyOfftargetSites.py Projekt: chrisamiller/guideseq

def alignSequences(ref_seq, query_seq):
    """Locate ``ref_seq`` in ``query_seq`` on either strand, logging to stderr.

    The reference is aligned against ``query_seq`` and against
    ``reverseComplement(query_seq)`` with match=2, mismatch=-1 and gap
    penalties of -100. The input pair and both match counts are written to
    ``sys.stderr``.

    A strand is reported only if its match count is at least
    ``len(ref_seq) - 20`` and strictly larger than the other strand's; a tie
    reports nothing.

    Returns a 6-item list
    ``[site_sequence, mismatches, site_length, strand, start, end]`` where
    ``mismatches`` is ``len(ref_seq) - matches - 1``, or six empty strings
    when no strand qualifies.
    """
    sys.stderr.write(ref_seq + "\t" + query_seq + "\n")

    ref_length = len(ref_seq)
    matches_required = ref_length - 1 - 19  # i.e. ref_length - 20

    scoring = swalign.NucleotideScoringMatrix(2, -1)
    sw = swalign.LocalAlignment(scoring, gap_penalty=-100, gap_extension_penalty=-100, prefer_gap_runs=True)
    forward_alignment = sw.align(ref_seq, query_seq)
    reverse_alignment = sw.align(ref_seq, reverseComplement(query_seq))
    sys.stderr.write("fwdmatch: " + str(forward_alignment.matches) + "\n")
    sys.stderr.write("revmatch: " + str(reverse_alignment.matches) + "\n")

    fwd_matches = forward_alignment.matches
    rev_matches = reverse_alignment.matches
    if fwd_matches >= matches_required and fwd_matches > rev_matches:
        best, strand = forward_alignment, "+"
    elif rev_matches >= matches_required and rev_matches > fwd_matches:
        best, strand = reverse_alignment, "-"
    else:
        return [""] * 6

    # Extend the aligned query window by the unaligned reference flanks.
    start = best.q_pos - best.r_pos
    end = best.q_end + (ref_length - best.r_end)
    return [best.query[start:end], ref_length - best.matches - 1, end - start, strand, start, end]

Beispiel #5

0

Datei anzeigen

def do_swalign(seq1, seq2, match=2, mismatch=-1, gap_penalty=-2, gap_extension_decay=0.5):
    """
    Locally align ``seq1`` (reference) and ``seq2`` (query) with swalign.

    ``match``/``mismatch`` build the nucleotide scoring matrix;
    ``gap_penalty`` and ``gap_extension_decay`` are forwarded to
    ``swalign.LocalAlignment``. Returns the alignment object.
    """
    aligner = swalign.LocalAlignment(
        swalign.NucleotideScoringMatrix(match, mismatch),
        gap_penalty=gap_penalty,
        gap_extension_decay=gap_extension_decay,
    )
    return aligner.align(seq1, seq2)

Beispiel #6

0

Datei anzeigen

Datei: softclip.py Projekt: drvenki/svcaller

def getAlign(ref_seq, query_seq):
    """Return the swalign local alignment of ``query_seq`` against ``ref_seq``.

    Scoring is match=1, mismatch=-5; gap settings are the library defaults.
    """
    aligner = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(1, -5))
    return aligner.align(ref_seq, query_seq)

Beispiel #7

0

Datei anzeigen

def getScoreSW(aa1, aa2, gap_penalty=-10):
    """Return the local-alignment score of ``aa1`` vs ``aa2``.

    The scoring matrix is loaded from the file ``blosum_45.txt`` on every
    call; ``gap_penalty`` is passed as the aligner's second positional
    argument.
    """
    blosum = swalign.ScoringMatrix('blosum_45.txt')
    aligner = swalign.LocalAlignment(blosum, gap_penalty)
    return aligner.align(aa1, aa2).score

Beispiel #8

0

Datei anzeigen

def smith_water_align(seq1, seq2):
    """
    Run a Smith-Waterman local alignment of the two input sequences.

    Scoring and gap settings come from the module-level constants MATCH,
    MISMATCH, GAP_PENALTY and GAP_EXTEND_PENALTY.

    :param seq1: aminoacid sequence 1 (passed as the reference)
    :param seq2: aminoacid sequence 2 (passed as the query)
    :return: the swalign alignment object (not just its score)
    """
    aligner = swalign.LocalAlignment(
        swalign.NucleotideScoringMatrix(MATCH, MISMATCH),
        GAP_PENALTY,
        GAP_EXTEND_PENALTY,
    )
    return aligner.align(seq1, seq2)

Beispiel #9

0

Datei anzeigen

Datei: alignment.py Projekt: Asichurter/APISeqFewShot

def align(s1, s2, out):
    """Return the ``identity`` of the local alignment of ``s2`` against ``s1``.

    ``s1`` is the reference and ``s2`` the query; scoring is match=2,
    mismatch=-1 with the library's default gap settings. ``out`` is accepted
    but not used.
    """
    # Only the scoring matrix is mandatory; gap penalties keep their defaults.
    aligner = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(2, -1))
    return aligner.align(s1, s2).identity

Beispiel #10

0

Datei anzeigen

def swFactory():
    """Build a local (non-global) swalign aligner with fixed settings.

    Scoring is match=2, mismatch=-1; gap open and gap extension penalties are
    both -1 with no extension decay; verbose output, global alignment and
    full-query mode are all disabled.
    """
    matrix = swalign.NucleotideScoringMatrix(2, -1)
    options = {
        'gap_extension_decay': 0.0,
        'verbose': False,
        'globalalign': False,
        'full_query': False,
    }
    # Positional arguments after the matrix: gap penalty, gap extension penalty.
    return swalign.LocalAlignment(matrix, -1, -1, **options)

Beispiel #11

0

Datei anzeigen

Datei: pop_scorss.py Projekt: uzmakhann/A-package-for-optimizing-the-DNA-fragment-assembly

def init_pop_score(chromosomes):
    """Score every chromosome by summing alignments of adjacent fragments.

    For each chromosome, each pair of neighbouring elements is locally
    aligned (match=1, mismatch=-3), the alignment is dumped, and the scores
    are summed. The list of per-chromosome sums is printed and returned.
    """
    pop_fitness = []
    import swalign
    aligner = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(1, -3))
    for fragments in chromosomes:
        pair_scores = []
        for left, right in zip(fragments, fragments[1:]):
            aln = aligner.align(left, right)
            aln.dump()
            pair_scores.append(aln.score)
        pop_fitness.append(sum(pair_scores))
    print(pop_fitness)
    return pop_fitness

Beispiel #12

0

Datei anzeigen

Datei: DSab_code.py Projekt: zoolie/DSab-origin

def sw_one(query,refseq):
    """Return the midpoints of the aligned region on the query and the reference.

    ``query`` is locally aligned to ``refseq`` (match=5, mismatch=-4,
    gap open -30, gap extension -1). Both midpoints are the respective start
    position plus half of the aligned *query* span, so the reference
    midpoint is also derived from the query length.

    Returns ``(middle_q, middle_r)`` as floats.
    """
    aligner = swalign.LocalAlignment(
        swalign.NucleotideScoringMatrix(5, -4),
        gap_penalty=-30,
        gap_extension_penalty=-1,
    )
    aln = aligner.align(refseq, query)
    half_span = 0.5 * (aln.q_end - aln.q_pos)
    return aln.q_pos + half_span, aln.r_pos + half_span

Beispiel #13

0

Datei anzeigen

def assemble_seq(readid2seq, junc_seq, tmp_file_path):
    """Assemble reads with ``fml-asm`` and return the longest contig part beyond the junction.

    The reads in ``readid2seq`` (id -> sequence) are written, sorted by id,
    to ``<tmp_file_path>.tmp3.assemble_input.fa``; ``fml-asm`` is run on that
    file with its stdout captured in ``<tmp_file_path>.tmp3.assemble_output.fq``.
    Every sequence line of the output (line 2 of each 4-line record) is
    locally aligned (match=2, mismatch=-1) to ``junc_seq`` and to its
    reverse complement:

    * forward hit with score >= 35: candidate is the contig after the
      aligned region;
    * reverse-complement hit with score >= 35: candidate is the reverse
      complement of the contig before the aligned region.

    The longest candidate seen is returned ("" if none). Both temporary
    files are left on disk.

    If ``fml-asm`` returns a non-zero status, a message is written to stderr
    and the process exits with status 1.
    """
    scoring = swalign.NucleotideScoringMatrix(2, -1)
    sw = swalign.LocalAlignment(scoring)

    input_fa = tmp_file_path + ".tmp3.assemble_input.fa"
    output_fq = tmp_file_path + ".tmp3.assemble_output.fq"

    # file.write works on both Python 2 and 3, unlike the `print >>` form,
    # and the context managers close the handles even on error.
    with open(input_fa, 'w') as hout:
        for tid in sorted(readid2seq):
            hout.write('>' + tid + '\n')
            hout.write(str(readid2seq[tid]) + '\n')

    with open(output_fq, 'w') as hout:
        sret = subprocess.call(["fml-asm", input_fa], stdout=hout)

    if sret != 0:
        sys.stderr.write("fml-asm error, error code: " + str(sret) + "\n")
        # Exit non-zero so callers and pipelines can see the failure.
        sys.exit(1)

    temp_contig = ""
    with open(output_fq, 'r') as hin:
        for line_num, line in enumerate(hin, 1):
            # Only the sequence line of each 4-line FASTQ record is used.
            if line_num % 4 != 2:
                continue
            tseq = line.rstrip('\n')

            aln_1 = sw.align(tseq, junc_seq)
            if aln_1.score >= 35:
                ttcontig = tseq[aln_1.r_end:]
                if len(ttcontig) > len(temp_contig):
                    temp_contig = ttcontig

            aln_2 = sw.align(tseq, my_seq.reverse_complement(junc_seq))
            if aln_2.score >= 35:
                ttcontig = my_seq.reverse_complement(tseq[:aln_2.r_pos])
                if len(ttcontig) > len(temp_contig):
                    temp_contig = ttcontig

    return temp_contig

Beispiel #14

0

Datei anzeigen

def RRGA_score(init_center):
    """Return the summed alignment score of neighbouring items in ``init_center``.

    The input is printed first. Each adjacent pair is locally aligned
    (match=1, mismatch=-3), the alignment is dumped, and the scores are
    added up.
    """
    print(init_center)
    import swalign
    aligner = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(1, -3))

    pair_scores = []
    for left, right in zip(init_center, init_center[1:]):
        aln = aligner.align(left, right)
        aln.dump()
        pair_scores.append(aln.score)

    return sum(pair_scores)

Beispiel #15

0

Datei anzeigen

Datei: align.py Projekt: WhalleyT/STACEI

def calculate_missing_start(old, new):
    """
    Return the position in ``old`` where the alignment with ``new`` begins.

    ``new`` is the IMGT numbered sequence; its ``.`` characters are removed
    before it is locally aligned (match=2, mismatch=-1) against ``old``.
    The returned offset is the alignment's reference start, i.e. the number
    of leading residues of ``old`` skipped (those lost in the HMM).
    """
    aligner = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(2, -1))
    ungapped = new.replace(".", "")
    return aligner.align(old, ungapped).r_pos

Beispiel #16

0

Datei anzeigen

def fastq_trim(fastq,
               linker_5=None,
               linker_3=None,
               out=sys.stdout,
               pct_identity=0.8,
               min_trim=4,
               min_len=25,
               verbose=False,
               quiet=False,
               failed_out=None):
    '''
    Trim linkers from every read of a FASTQ reader and write the survivors.

    fastq - the FASTQ reader object (provides is_colorspace and fetch())
    linker_5 - the 5' linker to remove
    linker_3 - the 3' linker to remove
    out - an output stream (eg: file, stdout)
    pct_identity - the percentage of matches that must be present in the alignment to strip away linkers
    min_trim - the distance away from the edges that the linkers must match w/in
    min_len - forwarded to seq_trim together with the other thresholds
    verbose - forwarded to seq_trim
    quiet - passed to fastq.fetch(); also suppresses the summary on stderr
    failed_out - an output for failed reads

    Reads for which seq_trim returns a false value are counted as removed
    (and written to failed_out when given); all others are written to out,
    and counted as trimmed when their quality string got shorter.
    '''

    sw = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(2, -1), -1)
    removed = 0
    trimmed = 0
    is_colorspace = fastq.is_colorspace  # preload to keep reader happy.
    for read in fastq.fetch(quiet=quiet):
        retval = seq_trim(read.name, read.seq, read.qual, linker_5, linker_3,
                          is_colorspace, sw, pct_identity, min_trim, min_len,
                          verbose)
        if not retval:
            if failed_out:
                read.write(failed_out)
            removed += 1
        else:
            n_seq, n_qual = retval

            # Compare lengths: comparing the int against the quality string
            # itself is always unequal and counted every read as trimmed.
            if len(read.qual) != len(n_qual):
                trimmed += 1

            read.clone(seq=n_seq, qual=n_qual).write(out)

    if not quiet:
        sys.stderr.write('Trimmed: %s\n' % trimmed)
        sys.stderr.write('Removed: %s (len)\n' % removed)

Beispiel #17

0

Datei anzeigen

Datei: ssc-hdfinder.py Projekt: verheytb/pbssc

def is_similar(seq1, seq2, window):
    """
    Decide whether two sequences differ only by isolated indels.

    The sequences are locally aligned (``seq1`` as reference, ``seq2`` as
    query) using the module-level ``scoring`` matrix, then gapped copies are
    rebuilt from the CIGAR and compared column by column.

    :param seq1: first DNA sequence (upper-case A/C/G/T expected)
    :param seq2: second DNA sequence (upper-case A/C/G/T expected)
    :param window: number of flanking columns around a gap that must be gap-free
    :return: True if the sequences are identical, or if every difference is
        a gap column with no other gap in its flanks on either sequence;
        False on any substitution or on gaps that are too close together.

    Exits the program if a differing column holds anything other than
    A/C/G/T or a gap.
    """
    if seq1 == seq2:
        return True
    sw = swalign.LocalAlignment(
        scoring)  # `scoring` must be defined at module level
    al = sw.align(seq1, seq2)
    seq1g = seq1
    seq2g = seq2
    assert al.q_pos == al.r_pos
    # The columns before the alignment start are the equally long unaligned
    # prefixes, so the first CIGAR operation begins at that offset.
    position = al.r_pos
    for count, op in al.cigar:
        if op == "M":
            position += count
        elif op == "I":
            # Query has extra bases: pad the reference, then step past the
            # inserted columns so later operations land in the right place.
            seq1g = seq1g[:position] + "-" * count + seq1g[position:]
            position += count
        elif op == "D":
            # Reference has extra bases: pad the query likewise.
            seq2g = seq2g[:position] + "-" * count + seq2g[position:]
            position += count
    assert len(seq1g) == len(seq2g)
    bases = ("A", "C", "T", "G")
    for x in range(len(seq1g)):
        if seq1g[x] == seq2g[x]:
            continue
        if seq1g[x] in bases and seq2g[x] in bases:  # substitution
            return False
        elif seq1g[x] == "-" or seq2g[x] == "-":  # indels
            lower = max(x - window, 0)
            upper = min(x + window, len(seq1g))
            flankseqs = [
                seq1g[lower:x], seq1g[x + 1:upper],
                seq2g[lower:x], seq2g[x + 1:upper]
            ]
            # Another gap inside either flank means the indels are too close.
            if any("-" in s for s in flankseqs):
                return False
        else:
            sys.exit("Error: DNA sequences must be in capital letters.")
    return True

Beispiel #18

0

Datei anzeigen

def last_fit_score(sequence):
    """Return ``(fitness_value, contg)`` for an ordered list of fragments.

    The input and its length are printed first. Each adjacent pair is locally
    aligned (match=1, mismatch=-3) and dumped. ``fitness_value`` is the sum
    of the pair scores; ``contg`` starts at 0 and is decreased by 1 for every
    pair scoring above 30 and increased by 1 for every other pair.
    """
    print(sequence, len(sequence))
    import swalign
    aligner = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(1, -3))

    pair_scores = []
    contg = 0
    for left, right in zip(sequence, sequence[1:]):
        aln = aligner.align(left, right)
        aln.dump()
        pair_score = aln.score
        contg += -1 if pair_score > 30 else 1
        pair_scores.append(pair_score)

    return sum(pair_scores), contg

Beispiel #19

0

Datei anzeigen

Datei: RASLseqAlign.py Projekt: gladstone-institutes/RASLseqTools

def swalign_df(ref, query):
    '''
    Align ``query`` to ``ref`` and return the result as a pandas Series.

    Alignment uses match=2, mismatch=-1, gap_extension_penalty=-2 and
    prefer_gap_runs=False. The Series is indexed by:
    ref, query, r_pos, r_end, q_pos, q_end, score, matches, mismatches, identity, cigar
    where the cigar entry is ``cigar_to_align(aligned.cigar)``.
    '''
    aligner = swalign.LocalAlignment(
        swalign.NucleotideScoringMatrix(2, -1),
        gap_extension_penalty=-2,
        prefer_gap_runs=False)
    aligned = aligner.align(ref, query)

    values = [
        ref,
        query,
        aligned.r_pos,
        aligned.r_end,
        aligned.q_pos,
        aligned.q_end,
        aligned.score,
        aligned.matches,
        aligned.mismatches,
        aligned.identity,
        cigar_to_align(aligned.cigar),
    ]
    align_series = pd.Series(values)
    align_series.index = [
        'ref', 'query', 'r_pos', 'r_end', 'q_pos', 'q_end',
        'score', 'matches', 'mismatches', 'identity', 'cigar',
    ]
    return align_series

Beispiel #20

0

Datei anzeigen

Datei: treebuilder.py Projekt: johncburns1/TreeOfLife

            mismatch = 1

    return myscore


# Start mark for timing the matrix construction below.
# NOTE(review): time.clock() was removed in Python 3.8; confirm the target
# interpreter (time.perf_counter() is the usual replacement).
t0 = time.clock()

unique_sequences = unique_sequences[
    0:MAX_SEQUENCES]  # keep only the first MAX_SEQUENCES entries for speed

# Square matrices indexed by position in unique_sequences, initialised to 0.
similarity_matrix = np.zeros((len(unique_sequences), len(unique_sequences)))

dist_matrix = np.zeros((len(unique_sequences), len(unique_sequences)))

# NOTE(review): `sw` is used as a module-like namespace on the first line and
# is then rebound to the aligner instance, so the namespace is no longer
# reachable under that name afterwards - confirm this is intended.
scoring = sw.ScoringMatrix('scoring_matrix.txt')
sw = sw.LocalAlignment(scoring)

match = 2
n = 0

# Visit every ordered pair of sequences (including each sequence with itself).
for x, seq1 in enumerate(unique_sequences):
    for y, seq2 in enumerate(unique_sequences):

        # Global alignment via `nw`; element 0 and 1 of the result are the
        # two aligned sequences passed to the scorer below.
        alignment = nw.global_align(allsequences[seq1], allsequences[seq2])

        score = float(
            nw.score_alignment(alignment[0],
                               alignment[1],
                               gap_open=-5,
                               gap_extend=-2,
                               matrix='scoring_matrix.txt'))

Beispiel #21

0

Datei anzeigen

def annot_sbinsert(infile):
    """Annotate break records with an adjusted position and write filtered tables.

    Reads the tab-separated file ``./break/<infile>`` (first line skipped as a
    header) and writes two tab-separated files next to it, both starting with
    the same header row:

    * ``./break/<stem>.ann.fil.txt``: records with at least 3 reads and a
      match ratio above 0.9;
    * ``./break/<stem>.ann.fil.strict.txt``: the same, further restricted to
      a soft-clip length of 22..30 and a recognised motif (``'+'``).

    For each record the 8 bases ending at the soft-clip length are looked up
    in ``seq_dict``; when the surrounding bases complete the expected motif,
    the position and the split point of the sequence are shifted by the
    stored offset. The sequence is also locally aligned (match=2,
    mismatch=-1) against 'GTGTATGTAAACTTCCGACTTCAACTG' and the number of
    matches divided by the soft-clip length is used as the match ratio.
    """
    # choose your own values here... 2 and -1 are common.
    match = 2
    mismatch = -1
    scoring = swalign.NucleotideScoringMatrix(match, mismatch)
    sw = swalign.LocalAlignment(scoring)
    
    file_path = pathlib.Path(infile)
    name = file_path.stem
    outfile_full = name+'.ann.fil.txt'
    outfile = name+'.ann.fil.strict.txt'
    
    # 5'- GTGTATGTAAACTTCCGACTTCAACTG ---TA
    # 8-mers taken from the end of the sequence above, mapped to how many
    # bases short of the full 'TTCAACTG' end they are (0 = ends exactly there).
    seq_dict = {'CGACTTCA': -4,'GACTTCAA': -3,'ACTTCAAC': -2,'CTTCAACT': -1,'TTCAACTG': 0}
    
    with open(f'./break/{infile}', 'r') as hin, open(f'./break/{outfile}', 'w') as h1out, open(f'./break/{outfile_full}', 'w') as h2out:
        # Skip the input header line.
        next(hin)
        header = '\t'.join(['chr','position(0-start)','ori_position','family_size','sb_direction','adj_seq','soft_clip_len','sw_match','sw_match_ratio','sb-seq|','|genome2bases'])
        h1out.write(header+'\n')
        h2out.write(header+'\n')
        
        for line in hin:

            F = line.rstrip('\n').split('\t')
            #seq ='GTGTATGTAAACTTCCGACTTCAACTGTAATTCTCTGAATGG'
            # NOTE(review): `chr` shadows the builtin of the same name inside this function.
            chr = F[0]
            position = F[1]
            read_direction = F[3]
            reads = F[4]
            seq = F[6]
            sb_length = int(F[2])
            sb_direction = F[7]
            # The 8 bases immediately before the soft-clip boundary.
            break_motif = seq[sb_length-8:sb_length]
            # j becomes 1 once the full expected motif is confirmed.
            j = 0
            #check sb motif in nearby break position.
            if break_motif in seq_dict.keys():
                sb_motif = '+'
                adj = seq_dict.get(break_motif)
                # Each branch checks that the bases after the boundary complete
                # the motif up to '...CAACTG' and takes the next 2 bases.
                if adj == -4 and seq[sb_length-8:sb_length+4] == 'CGACTTCAACTG':
                    j = 1
                    genome2base = seq[sb_length+4:sb_length+6]
                elif adj == -3 and seq[sb_length-8:sb_length+3] == 'GACTTCAACTG':
                    j = 1
                    genome2base = seq[sb_length+3:sb_length+5]
                elif adj == -2 and seq[sb_length-8:sb_length+2] == 'ACTTCAACTG':
                    j = 1
                    genome2base = seq[sb_length+2:sb_length+4]
                elif adj == -1 and seq[sb_length-8:sb_length+1] == 'CTTCAACTG':
                    j = 1
                    genome2base = seq[sb_length+1:sb_length+3]
                elif adj == 0 and seq[sb_length-8:sb_length] == 'TTCAACTG':
                    j = 1
                    genome2base = seq[sb_length:sb_length+2]
                if j == 1:
                    #adj_seq ori_sb_genome_seq = seq[0:sb_length] + '|' + seq[sb_length:]
                    # Move the '|' split point forward by -adj bases.
                    adj_seq = seq[0:sb_length+int(adj)*-1] + '|' + seq[sb_length+int(adj)*-1:]
                    # NOTE(review): if read_direction is neither '-' nor '+',
                    # adj_position is not assigned here and keeps the value from
                    # a previous record (or is undefined on the first one).
                    if read_direction == '-':
                        adj_position = int(position) + int(adj)
                    if read_direction == '+':
                        adj_position = int(position) + int(adj)*-1
                else:
                    # 8-mer matched but the following bases did not: no adjustment.
                    adj_position = int(position)
                    sb_motif = '-'
                    adj_seq = seq[0:sb_length] + '|' + seq[sb_length:]
                    genome2base = seq[sb_length:sb_length+2]
            else:
                #adj_position = original position
                adj_position = int(position)
                sb_motif = '-'
                genome2base = seq[sb_length:sb_length+2]
                adj_seq = seq[0:sb_length] + '|' + seq[sb_length:]
                    
            #swalign
            alignment = sw.align(seq, 'GTGTATGTAAACTTCCGACTTCAACTG')
            sw_match = alignment.matches
            #sw_cigar = alignment.cigar 
            #ratio sw_match score/soft-clipping length
            # NOTE(review): raises ZeroDivisionError when sb_length is 0.
            sw_ratio = round(float(sw_match/sb_length),2)
            
            #'chr','position(0-start)','ori_position','family_size','sb_direction','adj_seq','soft_clip_len','sw_match','sw_match_ratio','sb-seq|','|genome2bases','ori_sb|genome_seq'
            rec = str(chr)+'\t'+str(adj_position)+'\t'+str(position)+'\t'+str(reads)+'\t'+str(sb_direction)+'\t'+str(adj_seq)+'\t'+ \
                    str(sb_length)+'\t'+str(sw_match)+'\t'+str(sw_ratio)+'\t'+ \
                    str(sb_motif)+'\t'+str(genome2base)+'\n'         
                    # filtering: reads>=3, sb_length =22~30 and sw_match_ratio>0.9 PASS
            # Lenient table: read support and match ratio only.
            if int(reads)>=3 and float(sw_ratio)>0.9:
                h2out.write(rec) 
            # Strict table: additionally requires the length window and a confirmed motif.
            if int(reads)>=3 and float(sw_ratio)>0.9 and int(sb_length)>=22 and int(sb_length)<=30 and str(sb_motif)=='+':
                h1out.write(rec)
            else: continue

Beispiel #22

0

Datei anzeigen

Datei: swalign_score.py Projekt: uzmakhann/A-package-for-optimizing-the-DNA-fragment-assembly

def scores(sequence1,matrix1):
    """Fill ``matrix1`` with pairwise local-alignment scores and return it.

    Every unordered pair ``(i, j)`` with ``i < j`` of ``sequence1`` is
    aligned (match=1, mismatch=-3), the alignment is dumped, and its score is
    stored symmetrically at ``matrix1[i][j]`` and ``matrix1[j][i]``. Diagonal
    entries are left untouched. ``matrix1`` is modified in place.
    """
    aligner = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(1, -3))
    total = len(sequence1)
    for row in range(total):
        # Only the upper triangle is computed; the mirror cell is copied.
        for col in range(row + 1, total):
            aln = aligner.align(sequence1[row], sequence1[col])
            aln.dump()
            pair_score = aln.score
            matrix1[row][col] = pair_score
            matrix1[col][row] = pair_score
    return matrix1


#def fitness_chromose(chromes_array):
#     
#     match = 2
#     mismatch = -1
##     fitness_scor=list();
#     fitness_scor=[]
#     fit_scor_chroms=[]
##     fit_scor_chroms=list();
#     fitness_scor='';
#     fit_scor_chroms='';
#     for kk in range(len(chromes_array)):
#         for k in range(len(chromes_array[kk])-1):
#             scoring = swalign.NucleotideScoringMatrix(match,mismatch)
#             sw = swalign.LocalAlignment(scoring)
#             alignment = sw.align(chromes_array[k],chromes_array[k+1]);
#             alignment.dump();
#             var = alignment.score;
##             fitness_scor.append (var)
#             import numpy as np
#             fitness_scor=np.array([var])
#         fitness_scor=sum(fitness_scor)
##             print(fitness_scor,'varrrrrrrrr')
##             fitness_scor.append(var)
#             #dddddddddddd
#         fit_scor_chroms=np.array([ fitness_scor])
##         fit_scor_chroms.append([fitness_scor])
#         fitness_scor=[];
#     print(fit_scor_chroms,'uzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz')
#     return fit_scor_chroms
#          










##*****************************************************************###
#import numpy as np
#import copy; 
#def center(center1,chromosomes):
##center=['TTAA','CCGA','CTTA','AAAT','TTCG','GGCA','AAATC'];
#    rand_order=np.random.randint(4,size=(len(center1)+1));
##print(rand_order,'random')
#    len_pop=2
##    chromosomes=list();
#    chrom=list();
#    chrom=copy.deepcopy(center1);     # center use swaping order to generate chromosomes
##print(chrom,('init chrom'))
#    for k in range(len_pop):
#        for z in range(len(center1)):
#            print(rand_order,'random order')
#            print(type(rand_order),'typerandom order')
#            print(z,'L')
#            ind=rand_order[0,z]
#            print(ind,'ind')
#            #if(ind!=len(center1)):
#            print(type(chrom),'uzma')
#            s=chrom[ind]
#            print(chrom,'chromee')
#            print(ind,'ind')
#            print(rand_order,'order')
#            chrom[ind]=chrom[ind+1]           # swap order to generate chromosomes
#            chrom[ind+1]=s
##            else:
##              s=chrom[len(center)]
##              print(chrom,'chromee')
##              print(ind,'ind')
##              chrom[ind]=chrom[1]           # swap order to generate chromosomes
##              chrom[1]=s  
#    #print(chrom,('updat chorm'))
#        chromosomes.append(chrom);
#    #print(chromosom,('append chorm'));
#   # print(chrom,(' charm'))
#        chrom=[];
#        chrom=copy.deepcopy(center1);
##    print(center,('checking chrom'));
##    print(chrom,('new center'));
#        rand_order='';
#        rand_order=np.random.randint(5, size=(1,len(center1)+1))
#    #print(rand_order)
#    return chromosomes

Beispiel #23

0

Datei anzeigen

# Collect, for every line starting with '>', the line that follows it
# (i.e. the first sequence line after each header) into `b`.
for line in f:
    if re.match('^>', line):
        b.append(next(f))
f.close()

#MANIPULATED VARIABLE, CAN CHANGE TO YOUR PREFERENCES
match = 1
mismatch = -3
gap = -1

# Output file name embeds the input name and the scoring parameters.
fw = open(
    'Matrix_swalign_{0}_match={1}_mismatch={2}.txt'.format(
        filename, match, mismatch), 'w+')

# NOTE(review): this aligner is replaced inside the loop below before it is
# ever used.
scoring = swalign.NucleotideScoringMatrix(match, mismatch)
sw = swalign.LocalAlignment(scoring)

# Write one comma-separated row per sequence; the diagonal is written as 0.
# NOTE(review): `print >> fw, ...` is Python 2 syntax; under Python 3 these
# lines raise TypeError at runtime.
for i in range(len(b)):
    for j in range(len(b)):
        if i == j:
            print >> fw, "0,",
        else:
            scoring = swalign.NucleotideScoringMatrix(match, mismatch)
            sw = swalign.LocalAlignment(
                scoring, gap)  #CAN ADD MORE VARIABLE. REFER SWALIGN FILES
            a = sw.align(b[i], b[j])
            # NOTE(review): dump() prints the alignment and appears to return
            # None, so "None," would be written to the matrix file - confirm
            # whether a.score was intended here.
            s = a.dump()
            print >> fw, "{0},".format(s),
    print >> fw, "\n",

fw.close()

Beispiel #24

0

Datei anzeigen

__author__ = 'michael'
'''
F**K SO STUPEEED
'''

import os
import sys
from collections import defaultdict
from ast import literal_eval
import string
import swalign
import zipfile
# Shared swalign objects, built once at import time.
SCORING = swalign.NucleotideScoringMatrix()  # library-default match/mismatch scores
ALIGNER = swalign.LocalAlignment(SCORING, globalalign=True, gap_penalty=-5)
UPPERCASE = set(string.ascii_uppercase)  # the 26 ASCII capital letters
# Tuning constants; their use is not visible in this part of the file.
AGREEMENT_THRESHOLD = .99
PRIOR_WEIGHT = 3


def read_reference(ref_fn):
    """Read a reference file and return ``(genome_name, chrom_name, ref)``.

    The first line, stripped and without its first character, is the genome
    name; the second line, stripped and without its first four characters,
    is the chromosome name; all remaining lines are stripped and
    concatenated into the reference sequence.
    """
    with open(ref_fn, 'r') as handle:
        genome_line = handle.readline().strip()
        chrom_line = handle.readline().strip()
        pieces = []
        for raw in handle:
            pieces.append(raw.strip())
    return genome_line[1:], chrom_line[4:], ''.join(pieces)


def process_line(line):
    """
    :param line:
    :return:

Beispiel #25

0

Datei anzeigen

Datei: fusionFrameFinder.py Projekt: jpfeil/fusionFrameFinder

def align_filter(ref, query, mode, fusion_name=''):
    """
    Aligns query to reference CDS sequence using the Smith-Waterman algorithm. Returns None if the
    alignment is clipped at the fusion boundary.

    The alignment (match=5, mismatch=-1) is dumped to text and parsed with
    regular expressions. The result is rejected (None) when the reported
    match percentage is below 99, when a 'donor' alignment does not reach
    the end of the query, when an 'acceptor' alignment does not start at the
    beginning of the query, or when the alignment bounds cannot be parsed.

    :param str ref: In-frame reference transcript
    :param str query: Query transcript
    :param str mode: 'donor' or 'acceptor'
    :param str fusion_name: Label used in debug log messages
    :return: Alignment features (zero-based starts, total inserted and
        deleted bases), or None if the alignment is filtered out
    :rtype: namedtuple
    """
    alignment_stats = collections.namedtuple(
        'AlignStats', 'qstart, qstop, rstart, rstop, insertions, deletions')

    bounds_regex = re.compile(
        r'Query\s*:\s*(?P<qstart>\d*)\s*\w*\s*(?P<qstop>\d*)\s*[\|\s]*\s*Ref\s*:\s*(?P<rstart>\d*)\s*\w*\s*(?P<rstop>\d*)'
    )
    match_regex = re.compile(r'Matches: \d+\s\((?P<percent>\d*)')

    match = 5
    mismatch = -1
    scoring = swalign.NucleotideScoringMatrix(match, mismatch)
    sw = swalign.LocalAlignment(scoring)
    alignment = sw.align(ref, query)

    # Count inserted and deleted bases. swalign stores each CIGAR entry as a
    # (count, operation) pair, so the count comes first.
    insertions = 0
    deletions = 0
    for num, op in alignment.cigar:
        if op == 'I':
            insertions += num

        elif op == 'D':
            deletions += num

    # Next grab the alignment statistics from the textual dump
    buf = StringIO()
    alignment.dump(out=buf)
    dump = buf.getvalue()
    buf.close()

    # If it's not a near perfect match, then the quality of the assembly may not be good
    m = match_regex.search(dump)
    if m:
        percent = int(m.group('percent'))
        if percent < 99:
            logging.debug('%s: low percent matching %d' %
                          (fusion_name, percent))
            return

    # If the fusion transcript passes these filters, then grab the bounds of the alignment
    s = bounds_regex.search(dump)
    if not s:
        return

    qstart = int(s.group('qstart')) - 1  # Make zero-based
    qstop = int(s.group('qstop'))

    # If the end of the fusion transcript doesn't align, then skip this transcript
    if mode == 'donor' and qstop != len(query):
        logging.debug(
            '%s: donor alignment does not include end of sequence' %
            fusion_name)
        return

    elif mode == 'acceptor' and qstart != 0:
        logging.debug(
            '%s: acceptor alignment does not include start of sequence' %
            fusion_name)
        return

    rstart = int(s.group('rstart')) - 1  # Make zero-based
    # Read the reference end from its own group (not from 'rstart').
    rstop = int(s.group('rstop'))

    return alignment_stats(qstart, qstop, rstart, rstop, insertions,
                           deletions)

Beispiel #26

0

Datei anzeigen

Datei: data_precessor.py Projekt: ikhachatryan93/pyhton_html_table_converter

def get_smith_waterman():
    """Return a swalign aligner (match=4, mismatch=-1) with ``globalalign=True``."""
    matrix = swalign.NucleotideScoringMatrix(4, -1)
    aligner = swalign.LocalAlignment(matrix, globalalign=True)
    return aligner

Beispiel #27

0

Datei anzeigen

Datei: VDJ_usage.py Projekt: WhalleyT/STACEI

import swalign
import subprocess
import sys

from Bio.SeqIO import convert as bio_convert

# Global Parameters
# Scores awarded per matching / mismatching position.
MATCH = 2
MISMATCH = -1
# Scoring matrix and aligner built once at import time and shared module-wide.
SW_SCORE = swalign.NucleotideScoringMatrix(MATCH, MISMATCH)
SMITH_WATERMAN = swalign.LocalAlignment(SW_SCORE)


def _read_fasta(fp):
    name = None
    seq = []

    for line in fp:
        line = line.rstrip()
        if line.startswith(">"):
            if name:
                yield (name, ''.join(seq))
            name, seq = line, []
        else:
            seq.append(line)
    if name:
        yield (name, ''.join(seq))


def _missing_elements(nums):
    nums = list(map(int, nums))

Beispiel #28

0

Datei anzeigen

                not two.isalnum()):  # Whitespace matches don't count
            return 0

        elif one.lower() != two.lower() and (
                one.isdigit() or
                two.isdigit()):  # Give a severe penalty for mismatching digits
            return 10 * self._mismatch

        else:
            assert (one != two)
            return self._mismatch


# Module-level aligner that scores characters with FuzzySpeciesNameScoringMatrix.
# The two positional numbers are presumably the gap-open (-9) and
# gap-extension (-1) penalties - confirm against swalign.LocalAlignment.
# Local (not global) alignment, with full_query enabled.
sw = swalign.LocalAlignment(FuzzySpeciesNameScoringMatrix(),
                            -9,
                            -1,
                            verbose=False,
                            globalalign=False,
                            full_query=True)


def test():
    #NCBI    = "Candidatus Curtissbacteria bacterium GW20 11 GWA1 40 16"
    NCBI2 = "Curtissbacteria bacterium GW20 11 GWA1 40 16"
    Nmicros6 = "Curtissbacteria GWA1 OP11 40 13 partial"
    Nmicros7 = "Curtissbacteria GWA1 OP11 40 16 partial"
    Nmicros8 = "Curtissbacteria GWA1 OP11 40 16"
    Nmicros82 = "Curtissbacteria GWA1 OP11 40 13"

    #sw.align( Nmicros6, NCBI2 ).dump()
    #sw.align( NCBI2, Nmicros6 ).dump()
    #sw.align( Nmicros6, Nmicros6 ).dump()

Beispiel #29

0

Datei anzeigen

Datei: sw.py Projekt: ACAD-UofA/tools

import sys
import swalign

# Module-level aligner with the library's default nucleotide scores and gap
# settings; built once and reused by selfanneal().
scoring = swalign.NucleotideScoringMatrix()
sw = swalign.LocalAlignment(scoring)


def selfanneal(s):
    """Return the local-alignment score of ``s`` against its own reverse complement."""
    return sw.align(s, swalign.revcomp(s)).score


if __name__ == "__main__":
    # Print the self-annealing score of every sequence given on the command
    # line (call the function instead of printing the function object).
    for s in sys.argv[1:]:
        print(selfanneal(s))

Beispiel #30

0

Datei anzeigen

Note 2: This isn't appropriate for color-space FASTQ files with a prefix base
        included in the read sequence, since it trims an equal number of bases
        from the sequence and quality FASTQ lines.
'''

import sys
import os
import gzip

from ngsutils.support import revcomp, FASTA
from ngsutils.fastq import FASTQ

import swalign

# Module-level aligner (match=2, mismatch=-1, default gap settings), built once at import time.
sw = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(2, -1))


def fastx_barcode_split(reader,
                        outtempl,
                        barcodes,
                        edits=0,
                        pos=0,
                        allow_revcomp=False,
                        gzip_output=False,
                        stats_fname=None):
    '''
    Split FAST[QA] reads from {fname} using {barcodes} (hash) to write them to
    output files named like {templ}.
    '''