def alignSequences(ref_seq, query_seq, mis_allow):
    """Find ``ref_seq`` (without its last six bases) in ``query_seq`` on either strand.

    The trailing six characters of ``ref_seq`` are removed before aligning
    (the original note describes this as removing the PAM site).  The trimmed
    reference is locally aligned, with gap penalties of -100, against the
    query and against ``reverseComplement(query_seq)``.  A strand is accepted
    when it reaches ``len(trimmed reference) - mis_allow`` matches and has
    strictly more matches than the other strand.

    Returns a six-item list ``[sequence, mismatches, length, strand, start,
    end]``.  All six items are ``""`` when no strand qualifies, which includes
    the case where both strands have the same number of matches.
    """
    pam_len = 6
    ref_seq = ref_seq[:-pam_len]
    # Coordinates are reported relative to the untrimmed reference length.
    ref_length = len(ref_seq) + pam_len
    matches_required = len(ref_seq) - mis_allow

    aligner = swalign.LocalAlignment(
        swalign.NucleotideScoringMatrix(2, -1),
        gap_penalty=-100,
        gap_extension_penalty=-100,
        prefer_gap_runs=True,
    )
    fwd = aligner.align(ref_seq, query_seq)
    rev = aligner.align(ref_seq, reverseComplement(query_seq))

    if fwd.matches >= matches_required and fwd.matches > rev.matches:
        hit, strand = fwd, "+"
    elif rev.matches >= matches_required and rev.matches > fwd.matches:
        hit, strand = rev, "-"
    else:
        return ["", "", "", "", "", ""]

    # Pad the aligned window so it spans the whole (untrimmed) reference.
    start = hit.q_pos - hit.r_pos
    end = hit.q_end + (ref_length - hit.r_end)
    mismatches = ref_length - hit.matches - pam_len
    return [hit.query[start:end], mismatches, end - start, strand, start, end]
 def clculatedelta(s,i,j):
     """Compare alignment scores of two candidate pairings around positions i and j.

     Aligns (and dumps) four pairs of entries of ``s``: the "new" pairs
     ``(s[i-1], s[j])`` and ``(s[i], s[j+1])``, then the "old" pairs
     ``(s[i-1], s[i])`` and ``(s[j], s[j+1])``.

     Returns ``(delt_f, dc)`` where ``delt_f`` is the summed score of the new
     pairs minus the summed score of the old pairs, and ``dc`` is the number
     of old pairs scoring above 30 minus the number of new pairs scoring
     above 30.
     """
     import swalign

     aligner = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(1, -3))

     def pair_score(left, right):
         # Align one pair, print the alignment, and hand back its score.
         result = aligner.align(left, right)
         result.dump()
         return result.score

     new_first = pair_score(s[i - 1], s[j])
     new_second = pair_score(s[i], s[j + 1])
     old_first = pair_score(s[i - 1], s[i])
     old_second = pair_score(s[j], s[j + 1])

     delt_f = (new_first + new_second) - (old_first + old_second)

     dc = 0
     for old_score in (old_first, old_second):
         if old_score > 30:
             dc += 1
     for new_score in (new_first, new_second):
         if new_score > 30:
             dc -= 1

     return delt_f, dc
def _get_mhc_pep(none_tcr, fasta):
    """Sort the non-TCR chains of a FASTA file into peptides and MHC chains.

    Records are read with ``_read_fasta``; the chain letter is the second
    ``:``-separated field of the record name, and only letters contained in
    ``none_tcr`` are kept.  Each kept sequence is locally aligned against
    every entry of the module-level ``mhc_fastas`` mapping and classified by
    its best-scoring reference:

    * best score below 50: a peptide if the sequence has at most 40
      characters, otherwise the chain is dropped;
    * best reference named ``"B2M"``: added to ``mhcbs``;
    * the part of the reference name before ``*`` contains both ``D`` and
      ``B``: added to ``mhcbs``;
    * anything else: added to ``mhcas``.

    Returns ``(peptides, mhcas, mhcbs, mhc_class)`` where ``mhc_class`` is 1
    when a B2M chain was found and 2 otherwise.
    """
    aligner = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(2, -1))

    proteins = {}
    with open(fasta) as handle:
        for name, seq in _read_fasta(handle):
            chain = name.split(":")[1]
            if chain in none_tcr:
                proteins[chain] = seq

    peptides, mhcas, mhcbs = [], [], []
    b2m_found = False

    for chain, hit_protein in proteins.items():
        refs = list(mhc_fastas)
        scores = [aligner.align(mhc_fastas[ref], hit_protein).score for ref in refs]
        max_score = max(scores)

        if max_score < 50:
            # Weak hit: short chains are treated as peptides, long ones ignored.
            if len(hit_protein) <= 40:
                peptides.append(chain)
            continue

        ref_name = refs[scores.index(max_score)]
        gene = ref_name.split("*")[0]
        if ref_name == "B2M":
            mhcbs.append(chain)
            b2m_found = True
        elif "D" in gene and "B" in gene:
            mhcbs.append(chain)
        else:
            mhcas.append(chain)

    mhc_class = 1 if b2m_found else 2
    return peptides, mhcas, mhcbs, mhc_class
def alignSequences(ref_seq, query_seq):
    """Find ``ref_seq`` in ``query_seq`` on either strand, logging progress to stderr.

    The reference is locally aligned, with gap penalties of -100, against the
    query and against ``reverseComplement(query_seq)``.  A strand is accepted
    when it has at least ``len(ref_seq) - 20`` matches and strictly more
    matches than the other strand.  The input pair and both match counts are
    written to ``sys.stderr``.

    Returns a six-item list ``[sequence, mismatches, length, strand, start,
    end]`` where ``mismatches`` is ``len(ref_seq) - matches - 1``.  All six
    items are ``""`` when no strand qualifies (including a tie).
    """
    sys.stderr.write(ref_seq + "\t" + query_seq + "\n")

    ref_length = len(ref_seq)
    matches_required = ref_length - 1 - 19

    aligner = swalign.LocalAlignment(
        swalign.NucleotideScoringMatrix(2, -1),
        gap_penalty=-100,
        gap_extension_penalty=-100,
        prefer_gap_runs=True,
    )
    fwd = aligner.align(ref_seq, query_seq)
    rev = aligner.align(ref_seq, reverseComplement(query_seq))
    sys.stderr.write("fwdmatch: " + str(fwd.matches) + "\n")
    sys.stderr.write("revmatch: " + str(rev.matches) + "\n")

    if fwd.matches >= matches_required and fwd.matches > rev.matches:
        hit, strand = fwd, "+"
    elif rev.matches >= matches_required and rev.matches > fwd.matches:
        hit, strand = rev, "-"
    else:
        return ["", "", "", "", "", ""]

    # Pad the aligned window so it spans the whole reference.
    start = hit.q_pos - hit.r_pos
    end = hit.q_end + (ref_length - hit.r_end)
    return [hit.query[start:end], ref_length - hit.matches - 1, end - start, strand, start, end]
Beispiel #5
0
def do_swalign(seq1, seq2, match=2, mismatch=-1, gap_penalty=-2, gap_extension_decay=0.5):
    """
    Locally align ``seq1`` and ``seq2`` with swalign and return the alignment object.

    ``match`` / ``mismatch`` configure the nucleotide scoring matrix;
    ``gap_penalty`` and ``gap_extension_decay`` are passed to the aligner.
    """
    aligner = swalign.LocalAlignment(
        swalign.NucleotideScoringMatrix(match, mismatch),
        gap_penalty=gap_penalty,
        gap_extension_decay=gap_extension_decay,
    )
    return aligner.align(seq1, seq2)
Beispiel #6
0
def getAlign(ref_seq, query_seq):
    """Locally align ``query_seq`` to ``ref_seq`` (match +1, mismatch -5) and return the alignment."""
    matrix = swalign.NucleotideScoringMatrix(1, -5)
    # Gap penalties are left at the aligner's defaults.
    aligner = swalign.LocalAlignment(matrix)
    return aligner.align(ref_seq, query_seq)
Beispiel #7
0
def getScoreSW(aa1, aa2, gap_penalty=-10):
    """Return the local-alignment score of ``aa1`` vs ``aa2``.

    Scoring comes from the matrix file ``blosum_45.txt`` (resolved relative to
    the current working directory); ``gap_penalty`` is handed to the aligner.
    """
    blosum = swalign.ScoringMatrix('blosum_45.txt')
    aligner = swalign.LocalAlignment(blosum, gap_penalty)
    return aligner.align(aa1, aa2).score
Beispiel #8
0
def smith_water_align(seq1, seq2):
    """
    Run a Smith-Waterman local alignment of ``seq1`` against ``seq2``.

    The scoring matrix is a nucleotide matrix built from the module-level
    ``MATCH`` / ``MISMATCH`` values; gaps use ``GAP_PENALTY`` and
    ``GAP_EXTEND_PENALTY``.

    :param seq1: first sequence (reference side of the alignment)
    :param seq2: second sequence (query side of the alignment)
    :return: the alignment object produced by swalign (not just its score)
    """
    matrix = swalign.NucleotideScoringMatrix(MATCH, MISMATCH)
    aligner = swalign.LocalAlignment(matrix, GAP_PENALTY, GAP_EXTEND_PENALTY)
    return aligner.align(seq1, seq2)
Beispiel #9
0
def align(s1, s2, out):
    """Return the ``identity`` of the local alignment of ``s2`` (query) to ``s1`` (reference).

    ``out`` is accepted but not used by this function.
    """
    # Match +2 / mismatch -1, default gap penalties.
    aligner = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(2, -1))
    return aligner.align(s1, s2).identity
Beispiel #10
0
def swFactory():
    """Build a local aligner: match +2, mismatch -1, gap open/extension -1, no extension decay."""
    matrix = swalign.NucleotideScoringMatrix(2, -1)
    open_penalty = -1
    extend_penalty = -1
    aligner = swalign.LocalAlignment(
        matrix,
        open_penalty,
        extend_penalty,
        gap_extension_decay=0.0,
        verbose=False,
        globalalign=False,
        full_query=False,
    )
    return aligner
def init_pop_score(chromosomes):
    """Score each chromosome as the sum of alignment scores of its adjacent entries.

    For every chromosome, each pair of neighbouring elements is locally
    aligned (match +1, mismatch -3), the alignment is dumped, and the scores
    are summed.  The list of per-chromosome totals is printed and returned.
    """
    pop_fitness = []
    import swalign

    aligner = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(1, -3))
    for chromosome in chromosomes:
        total = 0
        for left, right in zip(chromosome, chromosome[1:]):
            alignment = aligner.align(left, right)
            alignment.dump()
            total += alignment.score
        pop_fitness.append(total)
    print(pop_fitness)
    return pop_fitness
Beispiel #12
0
def sw_one(query,refseq):
    """Align ``query`` to ``refseq`` and return the midpoints of the aligned region.

    Returns ``(middle_q, middle_r)``: the query start plus half the aligned
    query length, and the reference start plus that same half length (the
    reference end is not consulted).
    """
    aligner = swalign.LocalAlignment(
        swalign.NucleotideScoringMatrix(5, -4),
        gap_penalty=-30,
        gap_extension_penalty=-1,
    )
    hit = aligner.align(refseq, query)
    half_span = 0.5 * (hit.q_end - hit.q_pos)
    return hit.q_pos + half_span, hit.r_pos + half_span
Beispiel #13
0
def assemble_seq(readid2seq, junc_seq, tmp_file_path):
    """Assemble reads with ``fml-asm`` and return the longest contig part beyond the junction.

    The reads in ``readid2seq`` (id -> sequence) are written, sorted by id, to
    ``<tmp_file_path>.tmp3.assemble_input.fa``; ``fml-asm`` is run on that
    file with its stdout captured in ``<tmp_file_path>.tmp3.assemble_output.fq``.
    Every sequence line of the output (2nd line of each 4-line record) is
    aligned to ``junc_seq`` and to its reverse complement; for alignments
    scoring at least 35 the contig part following the junction is kept, and
    the longest such part is returned (``""`` when there is none).

    Exits the process with status 1 if ``fml-asm`` returns a non-zero code.
    Both temporary files are left on disk.
    """
    scoring = swalign.NucleotideScoringMatrix(2, -1)
    sw = swalign.LocalAlignment(scoring)  # default gap penalties

    input_fa = tmp_file_path + ".tmp3.assemble_input.fa"
    output_fq = tmp_file_path + ".tmp3.assemble_output.fq"

    # write() instead of the Python-2-only ``print >> handle`` form, and
    # ``with`` so the handles are closed even if writing fails.
    with open(input_fa, 'w') as hout:
        for tid in sorted(readid2seq):
            hout.write('>' + tid + '\n')
            hout.write(str(readid2seq[tid]) + '\n')

    with open(output_fq, 'w') as hout:
        sret = subprocess.call(["fml-asm", input_fa], stdout=hout)

    if sret != 0:
        sys.stderr.write("fml-asm error, error code: " + str(sret) + "\n")
        # A bare sys.exit() would report success (status 0) to the caller.
        sys.exit(1)

    temp_contig = ""
    with open(output_fq, 'r') as hin:
        for line_num, line in enumerate(hin, 1):
            if line_num % 4 != 2:
                continue
            tseq = line.rstrip('\n')

            # Junction on the forward strand: keep what follows the match.
            aln_1 = sw.align(tseq, junc_seq)
            if aln_1.score >= 35:
                ttcontig = tseq[aln_1.r_end:]
                if len(ttcontig) > len(temp_contig):
                    temp_contig = ttcontig

            # Junction on the reverse strand: keep what precedes the match,
            # reverse-complemented.
            aln_2 = sw.align(tseq, my_seq.reverse_complement(junc_seq))
            if aln_2.score >= 35:
                ttcontig = my_seq.reverse_complement(tseq[:aln_2.r_pos])
                if len(ttcontig) > len(temp_contig):
                    temp_contig = ttcontig

    return temp_contig
Beispiel #14
0
def RRGA_score(init_center):
    """Print ``init_center`` and return the summed alignment score of its adjacent entries.

    Each neighbouring pair is locally aligned (match +1, mismatch -3) and the
    alignment is dumped before its score is added to the total.
    """
    print(init_center)
    import swalign

    aligner = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(1, -3))
    init_cen_fitness = 0
    for k in range(1, len(init_center)):
        alignment = aligner.align(init_center[k - 1], init_center[k])
        alignment.dump()
        init_cen_fitness += alignment.score
    return init_cen_fitness
Beispiel #15
0
def calculate_missing_start(old, new):
    """
    Return the position in ``old`` where the local alignment with ``new`` begins.

    Every ``.`` is stripped from ``new`` first; the result is aligned as the
    query against ``old`` (match +2, mismatch -1) and the reference start of
    that alignment is returned.
    """
    aligner = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(2, -1))
    ungapped = new.replace(".", "")
    return aligner.align(old, ungapped).r_pos
Beispiel #16
0
def fastq_trim(fastq,
               linker_5=None,
               linker_3=None,
               out=sys.stdout,
               pct_identity=0.8,
               min_trim=4,
               min_len=25,
               verbose=False,
               quiet=False,
               failed_out=None):
    '''
    Trim linkers from every read of a FASTQ reader and write the survivors.

    fastq - the FASTQ reader object (provides is_colorspace and fetch())
    linker_5 - the 5' linker to remove
    linker_3 - the 3' linker to remove
    out - an output stream (eg: file, stdout)
    pct_identity - the percentage of matches that must be present in the alignment to strip away linkers
    min_trim - the distance away from the edges that the linkers must match w/in
    min_len - passed through to seq_trim
    verbose - passed through to seq_trim
    quiet - suppress the reader's progress output and the summary on stderr
    failed_out - an output for failed reads

    Reads for which seq_trim returns a false value are counted as removed
    (and written to failed_out when given); all others are written to out.
    '''

    sw = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(2, -1), -1)
    removed = 0
    trimmed = 0
    is_colorspace = fastq.is_colorspace  # preload to keep reader happy.
    for read in fastq.fetch(quiet=quiet):
        retval = seq_trim(read.name, read.seq, read.qual, linker_5, linker_3,
                          is_colorspace, sw, pct_identity, min_trim, min_len,
                          verbose)
        if not retval:
            if failed_out:
                read.write(failed_out)
            removed += 1
            continue

        n_seq, n_qual = retval

        # Compare lengths on both sides: comparing len(...) with the quality
        # value itself is always unequal and counted every read as trimmed.
        if len(read.qual) != len(n_qual):
            trimmed += 1

        read.clone(seq=n_seq, qual=n_qual).write(out)

    if not quiet:
        sys.stderr.write('Trimmed: %s\n' % trimmed)
        sys.stderr.write('Removed: %s (len)\n' % removed)
Beispiel #17
0
def is_similar(seq1, seq2, window):
    """
    Decide whether two DNA sequences differ only by isolated indels.

    :param seq1: first sequence (reference side of the alignment), upper-case A/C/G/T
    :param seq2: second sequence (query side of the alignment), upper-case A/C/G/T
    :param window: number of alignment columns on each side of a gap that must be gap-free
    :return: True if there are no differences or if the only differences are
        gap columns with no other gap within ``window`` columns; False on any
        substitution or on gaps that are closer together.

    Exits the process if a differing column holds a character other than
    A/C/G/T/``-``.  The aligner is built from the module-level ``scoring``.
    """
    if seq1 == seq2:
        return True
    sw = swalign.LocalAlignment(
        scoring)  # you can also choose gap penalties, etc...
    al = sw.align(seq1, seq2)
    seq1g = seq1
    seq2g = seq2
    assert al.q_pos == al.r_pos
    # The CIGAR describes the aligned region only, so the column cursor starts
    # where the alignment starts and must advance past every operation,
    # including the gaps just inserted; otherwise later gaps land too far left.
    position = al.r_pos
    for count, op in al.cigar:
        if op == "I":
            seq1g = seq1g[:position] + "-" * count + seq1g[position:]
        elif op == "D":
            seq2g = seq2g[:position] + "-" * count + seq2g[position:]
        elif op != "M":
            continue
        position += count
    assert len(seq1g) == len(seq2g)

    bases = ("A", "C", "T", "G")
    for x in range(len(seq1g)):
        if seq1g[x] == seq2g[x]:
            continue
        if seq1g[x] in bases and seq2g[x] in bases:  # substitution
            return False
        if seq1g[x] == "-" or seq2g[x] == "-":  # indels
            lower = max(x - window, 0)
            upper = min(x + window, len(seq1g))
            flankseqs = [
                seq1g[lower:x], seq1g[x + 1:upper],
                seq2g[lower:x], seq2g[x + 1:upper]
            ]
            # Any other gap inside the window means the indel is not isolated.
            if any("-" in s for s in flankseqs):
                return False
        else:
            sys.exit("Error: DNA sequences must be in capital letters.")
    return True
Beispiel #18
0
def last_fit_score(sequence):
    """Return ``(fitness_value, contg)`` for the adjacent entries of ``sequence``.

    The sequence and its length are printed first.  Each neighbouring pair is
    locally aligned (match +1, mismatch -3) and dumped; ``fitness_value`` is
    the sum of the scores, and ``contg`` is decreased by one for every score
    above 30 and increased by one for every other score.
    """
    print(sequence, len(sequence))
    import swalign

    aligner = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(1, -3))
    fitness_value = 0
    contg = 0
    for k in range(1, len(sequence)):
        alignment = aligner.align(sequence[k - 1], sequence[k])
        alignment.dump()
        pair_score = alignment.score
        contg += -1 if pair_score > 30 else 1
        fitness_value += pair_score
    return fitness_value, contg
def swalign_df(ref, query):
    '''
    Align ``query`` to ``ref`` and return the result as a labelled pandas Series.

    Labels, in order:
    ref, query, r_pos, r_end, q_pos, q_end, score, matches, mismatches, identity, cigar
    (the cigar entry is the alignment's cigar passed through ``cigar_to_align``).
    '''
    aligner = swalign.LocalAlignment(
        swalign.NucleotideScoringMatrix(2, -1),
        gap_extension_penalty=-2,
        prefer_gap_runs=False,
    )
    aligned = aligner.align(ref, query)

    values = [
        ref,
        query,
        aligned.r_pos,
        aligned.r_end,
        aligned.q_pos,
        aligned.q_end,
        aligned.score,
        aligned.matches,
        aligned.mismatches,
        aligned.identity,
        cigar_to_align(aligned.cigar),
    ]
    labels = 'ref, query, r_pos, r_end, q_pos, q_end, score, matches, mismatches, identity, cigar'.split(
        ', ')
    return pd.Series(values, index=labels)
Beispiel #20
0
            mismatch = 1

    return myscore


# Script section: pairwise global alignment of the selected sequences.
# NOTE(review): this excerpt relies on names defined outside the visible text
# (time, np, sw, nw, unique_sequences, allsequences, MAX_SEQUENCES) and appears
# to stop before the computed score is stored anywhere.

# NOTE(review): time.clock() no longer exists on Python 3.8+; confirm the
# target interpreter.
t0 = time.clock()

unique_sequences = unique_sequences[
    0:MAX_SEQUENCES]  # keep only the first MAX_SEQUENCES entries, for speed

# Square matrices with one row/column per retained sequence.
similarity_matrix = np.zeros((len(unique_sequences), len(unique_sequences)))

dist_matrix = np.zeros((len(unique_sequences), len(unique_sequences)))

scoring = sw.ScoringMatrix('scoring_matrix.txt')
# NOTE(review): this rebinds ``sw`` from whatever provided ScoringMatrix /
# LocalAlignment to the aligner instance, so ``sw.ScoringMatrix`` is no longer
# reachable afterwards.
sw = sw.LocalAlignment(scoring)

match = 2
n = 0

# Every ordered pair is visited, including a sequence with itself.
for x, seq1 in enumerate(unique_sequences):
    for y, seq2 in enumerate(unique_sequences):

        alignment = nw.global_align(allsequences[seq1], allsequences[seq2])

        score = float(
            nw.score_alignment(alignment[0],
                               alignment[1],
                               gap_open=-5,
                               gap_extend=-2,
                               matrix='scoring_matrix.txt'))
Beispiel #21
0
def annot_sbinsert(infile):
    """Annotate the records of ``./break/<infile>`` and write two filtered tables.

    The first input line is skipped.  Each remaining tab-separated line
    supplies: chromosome (0), position (1), soft-clip length (2), read
    direction (3), family size (4), sequence (6) and sb direction (7).

    If the 8 bases ending at the soft-clip boundary are a known fragment of
    the motif ``CGACTTCAACTG`` and the sequence continues with the rest of
    that motif, the record is flagged ``+``, the break is moved to the motif
    end, and the position is shifted by the same offset (sign depends on the
    read direction).  Otherwise the record is flagged ``-`` and left at the
    original boundary.  The sequence is also aligned to
    ``GTGTATGTAAACTTCCGACTTCAACTG``; the match count divided by the soft-clip
    length (rounded to 2 decimals) is reported as the match ratio.

    Outputs, both in ``./break/`` and both starting with a header line:
    ``<stem>.ann.fil.txt`` gets records with family size >= 3 and ratio > 0.9;
    ``<stem>.ann.fil.strict.txt`` additionally requires a soft-clip length of
    22..30 and the ``+`` flag.
    """
    aligner = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(2, -1))

    stem = pathlib.Path(infile).stem
    full_name = stem+'.ann.fil.txt'
    strict_name = stem+'.ann.fil.strict.txt'

    # 5'- GTGTATGTAAACTTCCGACTTCAACTG ---TA
    # Each key is an 8-base window of the motif end; its value is how far
    # (negated) that window stops before the end of the motif.
    sb_end = 'CGACTTCAACTG'
    seq_dict = {'CGACTTCA': -4,'GACTTCAA': -3,'ACTTCAAC': -2,'CTTCAACT': -1,'TTCAACTG': 0}
    header = '\t'.join(['chr','position(0-start)','ori_position','family_size','sb_direction','adj_seq','soft_clip_len','sw_match','sw_match_ratio','sb-seq|','|genome2bases'])

    with open(f'./break/{infile}', 'r') as hin, open(f'./break/{strict_name}', 'w') as h1out, open(f'./break/{full_name}', 'w') as h2out:
        next(hin)
        for sink in (h1out, h2out):
            sink.write(header+'\n')

        for line in hin:
            fields = line.rstrip('\n').split('\t')
            chrom = fields[0]
            position = fields[1]
            read_direction = fields[3]
            reads = fields[4]
            seq = fields[6]
            sb_length = int(fields[2])
            sb_direction = fields[7]

            # Does the boundary window plus the following bases complete the motif?
            adj = seq_dict.get(seq[sb_length-8:sb_length])
            cut = sb_length
            motif_ok = False
            if adj is not None:
                cut = sb_length - adj
                motif_ok = seq[sb_length-8:cut] == sb_end[4+adj:]

            if motif_ok:
                sb_motif = '+'
                # NOTE(review): for any other read direction adj_position is
                # not assigned here and keeps whatever an earlier line set.
                if read_direction == '-':
                    adj_position = int(position) + adj
                if read_direction == '+':
                    adj_position = int(position) - adj
            else:
                cut = sb_length
                sb_motif = '-'
                adj_position = int(position)

            adj_seq = seq[0:cut] + '|' + seq[cut:]
            genome2base = seq[cut:cut+2]

            alignment = aligner.align(seq, 'GTGTATGTAAACTTCCGACTTCAACTG')
            sw_match = alignment.matches
            # ratio: matched bases / soft-clipping length
            sw_ratio = round(float(sw_match/sb_length),2)

            columns = (chrom, adj_position, position, reads, sb_direction,
                       adj_seq, sb_length, sw_match, sw_ratio, sb_motif,
                       genome2base)
            rec = '\t'.join(str(value) for value in columns)+'\n'

            if int(reads)>=3 and float(sw_ratio)>0.9:
                h2out.write(rec)
                if 22 <= int(sb_length) <= 30 and str(sb_motif)=='+':
                    h1out.write(rec)
def scores(sequence1,matrix1):
    """Fill ``matrix1`` symmetrically with pairwise alignment scores and return it.

    Every unordered pair ``i < j`` of ``sequence1`` is locally aligned
    (match +1, mismatch -3), the alignment is dumped, and the score is stored
    at both ``matrix1[i][j]`` and ``matrix1[j][i]``.  The diagonal is not
    touched.
    """
    aligner = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(1, -3))
    count = len(sequence1)
    for i in range(count):
        for j in range(i + 1, count):
            alignment = aligner.align(sequence1[i], sequence1[j])
            alignment.dump()
            pair_score = alignment.score
            matrix1[i][j] = pair_score
            matrix1[j][i] = pair_score
    return matrix1


#def fitness_chromose(chromes_array):
#     
#     match = 2
#     mismatch = -1
##     fitness_scor=list();
#     fitness_scor=[]
#     fit_scor_chroms=[]
##     fit_scor_chroms=list();
#     fitness_scor='';
#     fit_scor_chroms='';
#     for kk in range(len(chromes_array)):
#         for k in range(len(chromes_array[kk])-1):
#             scoring = swalign.NucleotideScoringMatrix(match,mismatch)
#             sw = swalign.LocalAlignment(scoring)
#             alignment = sw.align(chromes_array[k],chromes_array[k+1]);
#             alignment.dump();
#             var = alignment.score;
##             fitness_scor.append (var)
#             import numpy as np
#             fitness_scor=np.array([var])
#         fitness_scor=sum(fitness_scor)
##             print(fitness_scor,'varrrrrrrrr')
##             fitness_scor.append(var)
#             #dddddddddddd
#         fit_scor_chroms=np.array([ fitness_scor])
##         fit_scor_chroms.append([fitness_scor])
#         fitness_scor=[];
#     print(fit_scor_chroms,'uzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz')
#     return fit_scor_chroms
#          










##*****************************************************************###
#import numpy as np
#import copy; 
#def center(center1,chromosomes):
##center=['TTAA','CCGA','CTTA','AAAT','TTCG','GGCA','AAATC'];
#    rand_order=np.random.randint(4,size=(len(center1)+1));
##print(rand_order,'random')
#    len_pop=2
##    chromosomes=list();
#    chrom=list();
#    chrom=copy.deepcopy(center1);     # center use swaping order to generate chromosomes
##print(chrom,('init chrom'))
#    for k in range(len_pop):
#        for z in range(len(center1)):
#            print(rand_order,'random order')
#            print(type(rand_order),'typerandom order')
#            print(z,'L')
#            ind=rand_order[0,z]
#            print(ind,'ind')
#            #if(ind!=len(center1)):
#            print(type(chrom),'uzma')
#            s=chrom[ind]
#            print(chrom,'chromee')
#            print(ind,'ind')
#            print(rand_order,'order')
#            chrom[ind]=chrom[ind+1]           # swap order to generate chromosomes
#            chrom[ind+1]=s
##            else:
##              s=chrom[len(center)]
##              print(chrom,'chromee')
##              print(ind,'ind')
##              chrom[ind]=chrom[1]           # swap order to generate chromosomes
##              chrom[1]=s  
#    #print(chrom,('updat chorm'))
#        chromosomes.append(chrom);
#    #print(chromosom,('append chorm'));
#   # print(chrom,(' charm'))
#        chrom=[];
#        chrom=copy.deepcopy(center1);
##    print(center,('checking chrom'));
##    print(chrom,('new center'));
#        rand_order='';
#        rand_order=np.random.randint(5, size=(1,len(center1)+1))
#    #print(rand_order)
#    return chromosomes
Beispiel #23
0
# Script section: write an all-against-all alignment table to a text file.
# NOTE(review): ``f``, ``b``, ``filename`` and ``re`` are defined before the
# visible part of this script.  The ``print >> fw`` statements are Python 2
# syntax.

# For every line starting with '>', collect the line that follows it.
for line in f:
    if re.match('^>', line):
        b.append(next(f))
f.close()

# Tunable parameters: change to your preferences.
match = 1
mismatch = -3
gap = -1

fw = open(
    'Matrix_swalign_{0}_match={1}_mismatch={2}.txt'.format(
        filename, match, mismatch), 'w+')

# This aligner is replaced inside the loop below before it is ever used.
scoring = swalign.NucleotideScoringMatrix(match, mismatch)
sw = swalign.LocalAlignment(scoring)

# One output row per sequence; the diagonal is written as 0.
for i in range(len(b)):
    for j in range(len(b)):
        if i == j:
            print >> fw, "0,",
        else:
            scoring = swalign.NucleotideScoringMatrix(match, mismatch)
            sw = swalign.LocalAlignment(
                scoring, gap)  # more options can be passed; refer to the swalign sources
            a = sw.align(b[i], b[j])
            # NOTE(review): the value written below is whatever dump()
            # returns; confirm that is the intended cell value (the
            # alignment's ``score`` attribute may be what is wanted).
            s = a.dump()
            print >> fw, "{0},".format(s),
    print >> fw, "\n",

fw.close()
Beispiel #24
0
__author__ = 'michael'
'''
F**K SO STUPEEED
'''

import os
import sys
from collections import defaultdict
from ast import literal_eval
import string
import swalign
import zipfile
# Nucleotide scoring matrix built with swalign's default arguments.
SCORING = swalign.NucleotideScoringMatrix()
# Shared aligner: global-alignment mode with a gap penalty of -5.
ALIGNER = swalign.LocalAlignment(SCORING, globalalign=True, gap_penalty=-5)
# The 26 ASCII upper-case letters.
UPPERCASE = set(string.ascii_uppercase)
# NOTE(review): these two tuning constants are not used in the visible part
# of the file; their exact meaning should be confirmed at the use sites.
AGREEMENT_THRESHOLD = .99
PRIOR_WEIGHT = 3


def read_reference(ref_fn):
    """Read a reference file and return ``(genome_name, chrom_name, ref)``.

    The first line, stripped, minus its first character is the genome name;
    the second line, stripped, minus its first four characters is the
    chromosome name; all remaining lines are stripped and concatenated into
    the reference sequence.
    """
    with open(ref_fn, 'r') as handle:
        genome_header = handle.readline()
        chrom_header = handle.readline()
        pieces = []
        for row in handle:
            pieces.append(row.strip())
    genome_name = genome_header.strip()[1:]
    chrom_name = chrom_header.strip()[4:]
    return genome_name, chrom_name, ''.join(pieces)


def process_line(line):
    """
    :param line:
    :return:
def align_filter(ref, query, mode, fusion_name=''):
    """
    Aligns query to reference CDS sequence using the Smith-Waterman algorithm. Returns None if the
    alignment is clipped at the fusion boundary.

    The alignment is also rejected (None) when the match percentage reported in the alignment
    dump is below 99, or when the dump does not contain the expected Query/Ref bounds.

    :param str ref: In-frame reference transcript
    :param str query: Query transcript
    :param str mode: 'donor' or 'acceptor'
    :param str fusion_name: Label used in debug log messages
    :return: Alignment features (zero-based starts; counts of inserted and deleted bases)
    :rtype: namedtuple
    """
    alignment_stats = collections.namedtuple(
        'AlignStats', 'qstart, qstop, rstart, rstop, insertions, deletions')

    bounds_regex = re.compile(
        r'Query\s*:\s*(?P<qstart>\d*)\s*\w*\s*(?P<qstop>\d*)\s*[\|\s]*\s*Ref\s*:\s*(?P<rstart>\d*)\s*\w*\s*(?P<rstop>\d*)'
    )
    match_regex = re.compile(r'Matches: \d+\s\((?P<percent>\d*)')

    match = 5
    mismatch = -1
    scoring = swalign.NucleotideScoringMatrix(match, mismatch)
    sw = swalign.LocalAlignment(scoring)
    alignment = sw.align(ref, query)

    # Count inserted and deleted bases.  CIGAR entries are (count, operation)
    # pairs; unpacking them the other way round never matches 'I' or 'D'.
    insertions = 0
    deletions = 0
    for num, op in alignment.cigar:
        if op == 'I':
            insertions += num

        elif op == 'D':
            deletions += num

    # Next grab the alignment statistics from the textual dump
    buf = StringIO()
    alignment.dump(out=buf)
    dump = buf.getvalue()
    buf.close()

    # If it's not a near perfect match, then the quality of the assembly may not be good
    m = match_regex.search(dump)
    if m:
        percent = int(m.group('percent'))
        if percent < 99:
            logging.debug('%s: low percent matching %d' %
                          (fusion_name, percent))
            return None

    # If the fusion transcript passes these filters, then grab the bounds of the alignment
    s = bounds_regex.search(dump)
    if not s:
        return None

    qstart = int(s.group('qstart')) - 1  # Make zero-based
    qstop = int(s.group('qstop'))

    # If the end of the fusion transcript doesn't align, then skip this transcript
    if mode == 'donor' and qstop != len(query):
        logging.debug(
            '%s: donor alignment does not include end of sequence' %
            fusion_name)
        return None

    if mode == 'acceptor' and qstart != 0:
        logging.debug(
            '%s: acceptor alignment does not include start of sequence' %
            fusion_name)
        return None

    rstart = int(s.group('rstart')) - 1  # Make zero-based
    # Read the reference end from its own group, not from 'rstart' again.
    rstop = int(s.group('rstop'))

    return alignment_stats(qstart, qstop, rstart, rstop, insertions,
                           deletions)
def get_smith_waterman():
    """Return an aligner in global-alignment mode scoring match +4 / mismatch -1."""
    matrix = swalign.NucleotideScoringMatrix(4, -1)
    aligner = swalign.LocalAlignment(matrix, globalalign=True)
    return aligner
Beispiel #27
0
import swalign
import subprocess
import sys

from Bio.SeqIO import convert as bio_convert

# Global Parameters
# Match / mismatch values fed to the nucleotide scoring matrix.
MATCH = 2
MISMATCH = -1
SW_SCORE = swalign.NucleotideScoringMatrix(MATCH, MISMATCH)
# Shared local aligner; gap penalties are left at swalign's defaults.
SMITH_WATERMAN = swalign.LocalAlignment(SW_SCORE)


def _read_fasta(fp):
    name = None
    seq = []

    for line in fp:
        line = line.rstrip()
        if line.startswith(">"):
            if name:
                yield (name, ''.join(seq))
            name, seq = line, []
        else:
            seq.append(line)
    if name:
        yield (name, ''.join(seq))


def _missing_elements(nums):
    nums = list(map(int, nums))
Beispiel #28
0
                not two.isalnum()):  # Whitespace matches don't count
            return 0

        elif one.lower() != two.lower() and (
                one.isdigit() or
                two.isdigit()):  # Give a severe penalty for mismatching digits
            return 10 * self._mismatch

        else:
            assert (one != two)
            return self._mismatch


# Module-level aligner that scores characters with FuzzySpeciesNameScoringMatrix.
# The two positional numbers follow the scoring matrix; presumably they are
# the gap and gap-extension penalties -- confirm against the swalign signature.
# Local (not global) alignment, with full_query enabled.
sw = swalign.LocalAlignment(FuzzySpeciesNameScoringMatrix(),
                            -9,
                            -1,
                            verbose=False,
                            globalalign=False,
                            full_query=True)


def test():
    # Sample species-name strings for trying out the aligner by hand.
    # NOTE(review): in the visible part nothing is aligned or asserted -- the
    # example calls below are commented out and the function may continue
    # beyond this excerpt.
    #NCBI    = "Candidatus Curtissbacteria bacterium GW20 11 GWA1 40 16"
    NCBI2 = "Curtissbacteria bacterium GW20 11 GWA1 40 16"
    Nmicros6 = "Curtissbacteria GWA1 OP11 40 13 partial"
    Nmicros7 = "Curtissbacteria GWA1 OP11 40 16 partial"
    Nmicros8 = "Curtissbacteria GWA1 OP11 40 16"
    Nmicros82 = "Curtissbacteria GWA1 OP11 40 13"

    #sw.align( Nmicros6, NCBI2 ).dump()
    #sw.align( NCBI2, Nmicros6 ).dump()
    #sw.align( Nmicros6, Nmicros6 ).dump()
Beispiel #29
0
import sys
import swalign

# Shared aligner used by selfanneal(); scoring matrix and gap penalties are
# swalign's defaults.
scoring = swalign.NucleotideScoringMatrix()
sw = swalign.LocalAlignment(scoring)


def selfanneal(s):
    """Return the local-alignment score of ``s`` against its own reverse complement."""
    reverse_strand = swalign.revcomp(s)
    return sw.align(s, reverse_strand).score


if __name__ == "__main__":
    # Print the self-annealing score of every sequence given on the command
    # line (call the function; printing the bare name only shows its repr).
    for s in sys.argv[1:]:
        print(selfanneal(s))
Beispiel #30
0
Note 2: This isn't appropriate for color-space FASTQ files with a prefix base
        included in the read sequence, since it trims an equal number of bases
        from the sequence and quality FASTQ lines.
'''

import sys
import os
import gzip

from ngsutils.support import revcomp, FASTA
from ngsutils.fastq import FASTQ

import swalign

# Module-level local aligner: nucleotide scoring built with (2, -1), default gap penalties.
sw = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(2, -1))


def fastx_barcode_split(reader,
                        outtempl,
                        barcodes,
                        edits=0,
                        pos=0,
                        allow_revcomp=False,
                        gzip_output=False,
                        stats_fname=None):
    '''
    Split FAST[QA] reads from {fname} using {barcodes} (hash) to write them to
    output files named like {templ}.
    '''