def alignSequences(ref_seq, query_seq, mis_allow):
    """Find the reference (minus its last six bases) in the query on either strand.

    The last six characters of ``ref_seq`` are dropped before aligning (the
    original docstring describes this as removing the PAM site). The trimmed
    reference is aligned against ``query_seq`` and against its reverse
    complement with gap penalties of -100.

    A strand is accepted only when its match count is at least
    ``len(trimmed reference) - mis_allow`` and strictly greater than the
    other strand's match count; a tie yields no hit.

    Returns a six-element list
    ``[query slice, mismatches, length, strand, start, end]`` for a hit, or a
    list of six empty strings otherwise.
    """
    trimmed_ref = ref_seq[:-6]
    ref_length = len(trimmed_ref) + 6
    matches_required = len(trimmed_ref) - mis_allow

    scoring = swalign.NucleotideScoringMatrix(2, -1)
    sw = swalign.LocalAlignment(scoring,
                                gap_penalty=-100,
                                gap_extension_penalty=-100,
                                prefer_gap_runs=True)

    forward = sw.align(trimmed_ref, query_seq)
    reverse = sw.align(trimmed_ref, reverseComplement(query_seq))

    # Pick the strand with strictly more matches; equal counts are ambiguous.
    if forward.matches > reverse.matches:
        best, strand = forward, "+"
    elif reverse.matches > forward.matches:
        best, strand = reverse, "-"
    else:
        best, strand = None, None

    if best is None or best.matches < matches_required:
        return [""] * 6

    # Extend the aligned query window so it spans the full reference length.
    start = best.q_pos - best.r_pos
    end = best.q_end + (ref_length - best.r_end)
    mismatches = ref_length - best.matches - 6
    return [best.query[start:end], mismatches, end - start, strand, start, end]
def clculatedelta(s, i, j):
    """Return ``(delt_f, dc)`` comparing two alternative pairings around i and j.

    Four local alignments (match 1, mismatch -3) are computed and dumped, in
    this order: ``s[i-1]`` vs ``s[j]``, ``s[i]`` vs ``s[j+1]``,
    ``s[i-1]`` vs ``s[i]``, ``s[j]`` vs ``s[j+1]``.

    ``delt_f`` is the sum of the first two scores minus the sum of the last
    two. ``dc`` starts at 0, gains 1 for each of the last two scores above 30
    and loses 1 for each of the first two scores above 30.
    """
    import swalign

    aligner = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(1, -3))

    def dumped_score(left, right):
        # Align, print the alignment (side effect kept), and hand back the score.
        aln = aligner.align(left, right)
        aln.dump()
        return aln.score

    f1 = dumped_score(s[i - 1], s[j])
    f2 = dumped_score(s[i], s[j + 1])
    f3 = dumped_score(s[i - 1], s[i])
    f4 = dumped_score(s[j], s[j + 1])

    delt_f = (f1 + f2) - (f3 + f4)

    dc = 0
    for score, step in ((f3, 1), (f4, 1), (f1, -1), (f2, -1)):
        if score > 30:
            dc += step

    return delt_f, dc
def _get_mhc_pep(none_tcr, fasta):
    """Sort the selected chains of a FASTA file into peptides, MHC alpha and MHC beta.

    :param none_tcr: container of chain letters to consider; a record is kept
        when the second ``:``-separated field of its header is in it.
    :param fasta: path of the FASTA file to read.
    :return: ``(peptides, mhcas, mhcbs, mhc_class)`` where the first three are
        lists of chain letters and ``mhc_class`` is 1 when a chain's best
        reference is named ``"B2M"`` and 2 otherwise.

    Relies on the module-level ``mhc_fastas`` mapping (reference name ->
    sequence) and ``_read_fasta`` helper, neither of which is visible here.

    NOTE(review): the source layout was flattened; the nesting below (the
    ``else`` paired with ``max_score < 50``, and ``mhc_class`` assigned after
    the loop) is a reconstruction - confirm against the original file.
    """
    MATCH = 2
    MISMATCH = -1
    SW_SCORE = swalign.NucleotideScoringMatrix(MATCH, MISMATCH)
    SMITH_WATERMAN = swalign.LocalAlignment(SW_SCORE)
    b2m_found = False
    proteins = {}
    peptides = []
    mhcas = []
    mhcbs = []
    mhc_class = None
    # Collect the sequences of the requested chains, keyed by chain letter.
    with open(fasta) as f:
        for name, seq in _read_fasta(f):
            letter = name.split(":")[1]
            if letter in none_tcr:
                proteins[letter] = seq
    for test in proteins:
        scores = []
        refs = []
        # Score this chain against every reference sequence.
        for reference in mhc_fastas:
            refs.append(reference)
            ref_protein = mhc_fastas[reference]
            hit_protein = proteins[test]
            swout = SMITH_WATERMAN.align(ref_protein, hit_protein)
            swout = swout.score
            scores.append(swout)
        # max() raises ValueError here if mhc_fastas is empty.
        max_score = max(scores)
        max_index = scores.index(max(scores))
        protein_len = len(hit_protein)
        # Low alignment score: the chain matches no reference well.
        if max_score < 50:
            # Short unmatched chains are treated as peptides; long ones are dropped.
            if protein_len <= 40:
                peptides.append(test)
        else:
            ref_name = refs[max_index]
            if ref_name == "B2M":
                mhcbs.append(test)
                b2m_found = True
            elif "D" in ref_name.split("*")[0] and "B" in ref_name.split("*")[0]:
                # contains a D for class II and a B for beta chain
                mhcbs.append(test)
            else:
                mhcas.append(test)
    if b2m_found:
        mhc_class = 1
    else:
        mhc_class = 2
    return peptides, mhcas, mhcbs, mhc_class
def alignSequences(ref_seq, query_seq):
    """Find ``ref_seq`` in ``query_seq`` on either strand, logging to stderr.

    Both inputs and the forward/reverse match counts are written to stderr.
    The reference is aligned against the query and against its reverse
    complement with gap penalties of -100.

    A strand is accepted only when its match count is at least
    ``len(ref_seq) - 20`` and strictly greater than the other strand's match
    count; a tie yields no hit.

    Returns a six-element list
    ``[query slice, ref_length - matches - 1, length, strand, start, end]``
    for a hit, or a list of six empty strings otherwise.
    """
    sys.stderr.write(ref_seq + "\t" + query_seq + "\n")

    ref_length = len(ref_seq)
    matches_required = ref_length - 20

    scoring = swalign.NucleotideScoringMatrix(2, -1)
    sw = swalign.LocalAlignment(scoring,
                                gap_penalty=-100,
                                gap_extension_penalty=-100,
                                prefer_gap_runs=True)

    forward = sw.align(ref_seq, query_seq)
    reverse = sw.align(ref_seq, reverseComplement(query_seq))

    sys.stderr.write("fwdmatch: " + str(forward.matches) + "\n")
    sys.stderr.write("revmatch: " + str(reverse.matches) + "\n")

    # Pick the strand with strictly more matches; equal counts are ambiguous.
    if forward.matches > reverse.matches:
        best, strand = forward, "+"
    elif reverse.matches > forward.matches:
        best, strand = reverse, "-"
    else:
        best, strand = None, None

    if best is None or best.matches < matches_required:
        return [""] * 6

    # Extend the aligned query window so it spans the full reference length.
    start = best.q_pos - best.r_pos
    end = best.q_end + (ref_length - best.r_end)
    reported = ref_length - best.matches - 1
    return [best.query[start:end], reported, end - start, strand, start, end]
def do_swalign(seq1, seq2, match=2, mismatch=-1, gap_penalty=-2, gap_extension_decay=0.5):
    """Locally align ``seq2`` (query) to ``seq1`` (reference) with swalign.

    ``match`` and ``mismatch`` build the nucleotide scoring matrix;
    ``gap_penalty`` and ``gap_extension_decay`` are passed to the aligner.
    Returns the swalign alignment object.
    """
    aligner = swalign.LocalAlignment(
        swalign.NucleotideScoringMatrix(match, mismatch),
        gap_penalty=gap_penalty,
        gap_extension_decay=gap_extension_decay,
    )
    return aligner.align(seq1, seq2)
def getAlign(ref_seq, query_seq):
    """Return the swalign local alignment of ``query_seq`` to ``ref_seq``.

    Uses a nucleotide scoring matrix with match 1 and mismatch -5 and the
    aligner's default gap settings.
    """
    aligner = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(1, -5))
    return aligner.align(ref_seq, query_seq)
def getScoreSW(aa1, aa2, gap_penalty=-10):
    """Return the local-alignment score of ``aa2`` against ``aa1``.

    The scoring matrix is built from the file ``'blosum_45.txt'`` and
    ``gap_penalty`` is handed to the aligner as its second positional argument.
    """
    matrix = swalign.ScoringMatrix('blosum_45.txt')
    aligner = swalign.LocalAlignment(matrix, gap_penalty)
    return aligner.align(aa1, aa2).score
def smith_water_align(seq1, seq2):
    """
    Run a Smith-Waterman local alignment of the two input sequences.

    The scoring matrix is built from the module-level MATCH and MISMATCH
    values, and GAP_PENALTY / GAP_EXTEND_PENALTY are passed to the aligner.

    :param seq1: first sequence (passed to the aligner as the reference)
    :param seq2: second sequence (passed to the aligner as the query)
    :return: the swalign alignment object (not just its score)
    """
    aligner = swalign.LocalAlignment(
        swalign.NucleotideScoringMatrix(MATCH, MISMATCH),
        GAP_PENALTY,
        GAP_EXTEND_PENALTY,
    )
    return aligner.align(seq1, seq2)
def align(s1, s2, out):
    """Return the ``identity`` of the local alignment of ``s2`` to ``s1``.

    Uses a nucleotide scoring matrix with match 2 and mismatch -1 and the
    aligner's default gap settings. ``out`` is accepted but not used.
    """
    # The aligner needs a scoring matrix; gap penalties are left at defaults.
    aligner = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(2, -1))
    # s1 is the reference (first argument), s2 the query (second argument).
    return aligner.align(s1, s2).identity
def swFactory():
    """Build and return the preconfigured swalign local aligner.

    Scoring is match 2 / mismatch -1; gap penalty and gap extension penalty
    are both -1, gap extension decay is 0.0, and ``verbose``, ``globalalign``
    and ``full_query`` are all switched off.
    """
    matrix = swalign.NucleotideScoringMatrix(2, -1)
    return swalign.LocalAlignment(
        matrix,
        -1,  # gap_penalty
        -1,  # gap_extension_penalty
        gap_extension_decay=0.0,
        verbose=False,
        globalalign=False,
        full_query=False,
    )
def init_pop_score(chromosomes):
    """Return one fitness value per chromosome: the sum of adjacent-pair alignment scores.

    For each chromosome, every element is locally aligned (match 1,
    mismatch -3) with its successor; each alignment is dumped and its score
    accumulated. The list of per-chromosome sums is printed and returned.

    NOTE(review): the source layout was flattened; placing
    ``print(pop_fitness)`` once after the outer loop is a reconstruction -
    confirm against the original file.
    """
    pop_fitness = list();
    import swalign
    scoring = swalign.NucleotideScoringMatrix(1,-3)
    sw = swalign.LocalAlignment(scoring)
    for i in range(len(chromosomes)):
        # Scores of consecutive pairs within chromosome i.
        arrayscore=list();
        for j in range(len(chromosomes[i])-1):
            alignment = sw.align(chromosomes[i][j],chromosomes[i][j+1]);
            alignment.dump();
            var = alignment.score;
            arrayscore.append(var);
        pop_fitness.append(sum(arrayscore))
    # print(arrayscore);
    print(pop_fitness)
    return pop_fitness
def sw_one(query, refseq):
    """Return ``(middle_q, middle_r)`` for the local alignment of query to refseq.

    Scoring is match 5 / mismatch -4 with gap penalty -30 and gap extension
    penalty -1. Both midpoints are offset by half of the aligned *query*
    span (``q_end - q_pos``): ``middle_q`` from ``q_pos`` and ``middle_r``
    from ``r_pos``.
    """
    scoring = swalign.NucleotideScoringMatrix(5, -4)
    aligner = swalign.LocalAlignment(scoring,
                                     gap_penalty=-30,
                                     gap_extension_penalty=-1)
    aln = aligner.align(refseq, query)

    q_start = aln.q_pos
    q_stop = aln.q_end
    r_start = aln.r_pos

    # The same half-span (taken from the query side) is applied to both ends.
    half_span = 0.5 * (q_stop - q_start)
    return q_start + half_span, r_start + half_span
def assemble_seq(readid2seq, junc_seq, tmp_file_path):
    """Assemble reads with ``fml-asm`` and return the longest contig tail past the junction.

    The reads are written as FASTA to
    ``<tmp_file_path>.tmp3.assemble_input.fa`` (sorted by read id), assembled
    with the external ``fml-asm`` program into
    ``<tmp_file_path>.tmp3.assemble_output.fq``, and every sequence line of
    that output (line 2 of each 4-line record) is aligned against
    ``junc_seq`` and its reverse complement.

    For a forward hit (score >= 35) the candidate is the contig after the
    alignment end; for a reverse hit it is the reverse complement of the
    contig before the alignment start. The longest candidate is returned
    (empty string when there is none).

    Exits the process with status 1 if ``fml-asm`` returns non-zero. The two
    temporary files are left on disk.
    """
    scoring = swalign.NucleotideScoringMatrix(2, -1)
    sw = swalign.LocalAlignment(scoring)

    input_fa = tmp_file_path + ".tmp3.assemble_input.fa"
    output_fq = tmp_file_path + ".tmp3.assemble_output.fq"

    # write() instead of the Python 2-only ``print >> file`` form, and
    # ``with`` so the handles are closed even if something raises.
    with open(input_fa, 'w') as hout:
        for tid in sorted(readid2seq):
            hout.write('>' + tid + '\n')
            hout.write(str(readid2seq[tid]) + '\n')

    with open(output_fq, 'w') as hout:
        sret = subprocess.call(["fml-asm", input_fa], stdout=hout)

    if sret != 0:
        sys.stderr.write("fml-asm error, error code: " + str(sret) + "\n")
        # Non-zero status so callers can tell the assembly failed.
        sys.exit(1)

    temp_contig = ""
    with open(output_fq, 'r') as hin:
        for line_num, line in enumerate(hin, 1):
            # Only the sequence line of each FASTQ record is of interest.
            if line_num % 4 != 2:
                continue

            tseq = line.rstrip('\n')

            aln_1 = sw.align(tseq, junc_seq)
            if aln_1.score >= 35:
                ttcontig = tseq[aln_1.r_end:]
                if len(ttcontig) > len(temp_contig):
                    temp_contig = ttcontig

            aln_2 = sw.align(tseq, my_seq.reverse_complement(junc_seq))
            if aln_2.score >= 35:
                ttcontig = my_seq.reverse_complement(tseq[:aln_2.r_pos])
                if len(ttcontig) > len(temp_contig):
                    temp_contig = ttcontig

    return temp_contig
def RRGA_score(init_center):
    """Print ``init_center`` and return the sum of its adjacent-pair alignment scores.

    Each element is locally aligned (match 1, mismatch -3) with its
    successor; every alignment is dumped and its score added to the total.
    """
    print(init_center)
    import swalign

    aligner = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(1, -3))

    pair_scores = []
    for idx in range(1, len(init_center)):
        aln = aligner.align(init_center[idx - 1], init_center[idx])
        aln.dump()
        pair_scores.append(aln.score)

    return sum(pair_scores)
def calculate_missing_start(old, new):
    """
    Return the reference start position of ``new`` aligned within ``old``.

    Every "." is removed from ``new`` before it is locally aligned (match 2,
    mismatch -1) against ``old``; the alignment's ``r_pos`` is the offset.
    Per the original note, this is used to jump past the part of the
    sequence lost in the HMM when locating the new IMGT-numbered sequence.
    """
    aligner = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(2, -1))
    ungapped = new.replace(".", "")
    return aligner.align(old, ungapped).r_pos
def fastq_trim(fastq, linker_5=None, linker_3=None, out=sys.stdout, pct_identity=0.8, min_trim=4, min_len=25, verbose=False, quiet=False, failed_out=None):
    '''
    Trim linkers from every read of a FASTQ reader and write the survivors.

    fastq - the FASTQ reader object (provides is_colorspace and fetch())
    linker_5 - the 5' linker to remove
    linker_3 - the 3' linker to remove
    out - an output stream (eg: file, stdout)
    pct_identity - the percentage of matches that must be present in the
                   alignment to strip away linkers
    min_trim - the distance away from the edges that the linkers must match w/in
    min_len - passed through to seq_trim
    verbose - passed through to seq_trim
    quiet - suppress the reader's progress output and the summary on stderr
    failed_out - an output for failed reads

    Reads for which seq_trim returns a false value are counted as removed
    (and written to failed_out when given). A kept read is counted as
    trimmed only when its quality string actually got shorter or longer.
    '''
    sw = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(2, -1), -1)

    removed = 0
    trimmed = 0

    is_colorspace = fastq.is_colorspace  # preload to keep reader happy.
    for read in fastq.fetch(quiet=quiet):
        retval = seq_trim(read.name, read.seq, read.qual, linker_5, linker_3,
                          is_colorspace, sw, pct_identity, min_trim, min_len,
                          verbose)
        if not retval:
            if failed_out:
                read.write(failed_out)
            removed += 1
            continue

        n_seq, n_qual = retval
        # Compare lengths with lengths; comparing against the string itself
        # was always unequal and counted every kept read as trimmed.
        if len(read.qual) != len(n_qual):
            trimmed += 1
        read.clone(seq=n_seq, qual=n_qual).write(out)

    if not quiet:
        sys.stderr.write('Trimmed: %s\n' % trimmed)
        sys.stderr.write('Removed: %s (len)\n' % removed)
def is_similar(seq1, seq2, window):
    """
    Decide whether two sequences differ only by isolated indels.

    :param seq1: first sequence (used as the alignment reference)
    :param seq2: second sequence (used as the alignment query)
    :param window: number of columns on each side of an indel column that
        must be free of other gap characters
    :return: True if the sequences are identical, or if every difference in
        the gapped alignment is an indel column with no other gap within
        ``window`` columns. False on any substitution or on gaps that are
        closer together than ``window``.

    Uses the module-level ``scoring`` matrix. Asserts that the alignment
    starts at the same offset in both sequences and that the gapped
    sequences end up the same length. Exits the process when a differing
    column contains anything other than A/C/G/T or "-".
    """
    if seq1 == seq2:
        return True

    sw = swalign.LocalAlignment(scoring)
    al = sw.align(seq1, seq2)
    assert al.q_pos == al.r_pos

    seq1g = seq1
    seq2g = seq2
    position = 0
    for length, op in al.cigar:
        if op == "M":
            position += length
        elif op == "I":
            # Extra bases in seq2: pad seq1 with a gap at this column.
            seq1g = seq1g[:position] + "-" * length + seq1g[position:]
            # The gap columns are consumed too; without this a later indel
            # would be inserted `length` columns too early.
            position += length
        elif op == "D":
            # Bases missing from seq2: pad seq2 with a gap at this column.
            seq2g = seq2g[:position] + "-" * length + seq2g[position:]
            position += length
    assert len(seq1g) == len(seq2g)

    bases = ("A", "C", "T", "G")
    total = len(seq1g)
    for x, (base1, base2) in enumerate(zip(seq1g, seq2g)):
        if base1 == base2:
            continue
        if base1 in bases and base2 in bases:
            # substitution
            return False
        if base1 == "-" or base2 == "-":
            # indel: no other gap may sit within `window` columns of it
            lower = max(x - window, 0)
            upper = min(x + window, total)
            flanks = (seq1g[lower:x], seq1g[x + 1:upper],
                      seq2g[lower:x], seq2g[x + 1:upper])
            if any("-" in flank for flank in flanks):
                return False
        else:
            sys.exit("Error: DNA sequences must be in capital letters.")
    return True
def last_fit_score(sequence):
    """Print the input and return ``(fitness_value, contg)`` for it.

    Each element is locally aligned (match 1, mismatch -3) with its
    successor and every alignment is dumped. ``fitness_value`` is the sum of
    the pair scores; ``contg`` starts at 0 and goes down by one for each
    score above 30 and up by one otherwise.
    """
    print(sequence, len(sequence))
    import swalign

    aligner = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(1, -3))

    contg = 0
    pair_scores = []
    for idx in range(1, len(sequence)):
        aln = aligner.align(sequence[idx - 1], sequence[idx])
        aln.dump()
        score = aln.score
        contg += -1 if score > 30 else 1
        pair_scores.append(score)

    return sum(pair_scores), contg
def swalign_df(ref, query):
    '''
    Align query to ref with swalign and return the result as a pandas Series.

    The Series is indexed by: ref, query, r_pos, r_end, q_pos, q_end, score,
    matches, mismatches, identity, cigar. The cigar entry is the alignment's
    cigar passed through cigar_to_align.

    Scoring is match 2 / mismatch -1, with gap_extension_penalty=-2 and
    prefer_gap_runs=False.
    '''
    scoring = swalign.NucleotideScoringMatrix(2, -1)
    aligner = swalign.LocalAlignment(scoring,
                                     gap_extension_penalty=-2,
                                     prefer_gap_runs=False)
    aln = aligner.align(ref, query)

    labels = ['ref', 'query', 'r_pos', 'r_end', 'q_pos', 'q_end', 'score',
              'matches', 'mismatches', 'identity', 'cigar']
    values = [ref, query, aln.r_pos, aln.r_end, aln.q_pos, aln.q_end,
              aln.score, aln.matches, aln.mismatches, aln.identity,
              cigar_to_align(aln.cigar)]
    return pd.Series(values, index=labels)
mismatch = 1 return myscore t0 = time.clock() unique_sequences = unique_sequences[ 0:MAX_SEQUENCES] # only do the first 50 for speed.. similarity_matrix = np.zeros((len(unique_sequences), len(unique_sequences))) dist_matrix = np.zeros((len(unique_sequences), len(unique_sequences))) scoring = sw.ScoringMatrix('scoring_matrix.txt') sw = sw.LocalAlignment(scoring) match = 2 n = 0 for x, seq1 in enumerate(unique_sequences): for y, seq2 in enumerate(unique_sequences): alignment = nw.global_align(allsequences[seq1], allsequences[seq2]) score = float( nw.score_alignment(alignment[0], alignment[1], gap_open=-5, gap_extend=-2, matrix='scoring_matrix.txt'))
def annot_sbinsert(infile):
    """Annotate break-point records with SB-motif status and write two filtered tables.

    Reads the tab-separated file ``./break/<infile>`` (its first line is
    skipped) and writes, with the same header line:

    * ``./break/<stem>.ann.fil.txt``: rows with reads >= 3 and
      sw_match_ratio > 0.9;
    * ``./break/<stem>.ann.fil.strict.txt``: the same, additionally
      requiring 22 <= sb_length <= 30 and a detected motif ('+').

    Input columns used: 0 chr, 1 position, 2 sb_length, 3 read_direction,
    4 reads, 6 seq, 7 sb_direction.

    NOTE(review): the source layout was flattened; the nesting below (in
    particular which ``if`` each ``else`` belongs to) is a reconstruction -
    confirm against the original file.
    """
    #choose your own values here… 2 and -1 are common.
    match = 2
    mismatch = -1
    scoring = swalign.NucleotideScoringMatrix(match, mismatch)
    sw = swalign.LocalAlignment(scoring)
    file_path = pathlib.Path(infile)
    name = file_path.stem
    outfile_full = name+'.ann.fil.txt'
    outfile = name+'.ann.fil.strict.txt'
    #5’- GTGTATGTAAACTTCCGACTTCAACTG ---TA
    # 8-mer ending at the recorded break -> shift (in bases) of the break
    # relative to the end of the full motif ...TTCAACTG.
    seq_dict = {'CGACTTCA': -4,'GACTTCAA': -3,'ACTTCAAC': -2,'CTTCAACT': -1,'TTCAACTG': 0}
    with open(f'./break/{infile}', 'r') as hin, open(f'./break/{outfile}', 'w') as h1out, open(f'./break/{outfile_full}', 'w') as h2out:
        # Skip the input header line.
        next(hin)
        header = '\t'.join(['chr','position(0-start)','ori_position','family_size','sb_direction','adj_seq','soft_clip_len','sw_match','sw_match_ratio','sb-seq|','|genome2bases'])
        h1out.write(header+'\n')
        h2out.write(header+'\n')
        for line in hin:
            F = line.rstrip('\n').split('\t')
            #seq ='GTGTATGTAAACTTCCGACTTCAACTGTAATTCTCTGAATGG'
            chr = F[0]
            position = F[1]
            read_direction = F[3]
            reads = F[4]
            seq = F[6]
            sb_length = int(F[2])
            sb_direction = F[7]
            # The 8 bases immediately before the recorded break.
            break_motif = seq[sb_length-8:sb_length]
            # j becomes 1 once the motif end is confirmed in the bases after the break.
            j = 0
            #check sb motif in nearby break position.
            if break_motif in seq_dict.keys():
                sb_motif = '+'
                adj = seq_dict.get(break_motif)
                # Confirm the rest of the motif follows the break, and take the
                # two bases right after the motif end.
                if adj == -4 and seq[sb_length-8:sb_length+4] == 'CGACTTCAACTG':
                    j = 1
                    genome2base = seq[sb_length+4:sb_length+6]
                elif adj == -3 and seq[sb_length-8:sb_length+3] == 'GACTTCAACTG':
                    j = 1
                    genome2base = seq[sb_length+3:sb_length+5]
                elif adj == -2 and seq[sb_length-8:sb_length+2] == 'ACTTCAACTG':
                    j = 1
                    genome2base = seq[sb_length+2:sb_length+4]
                elif adj == -1 and seq[sb_length-8:sb_length+1] == 'CTTCAACTG':
                    j = 1
                    genome2base = seq[sb_length+1:sb_length+3]
                elif adj == 0 and seq[sb_length-8:sb_length] == 'TTCAACTG':
                    j = 1
                    genome2base = seq[sb_length:sb_length+2]
                if j == 1:
                    #adj_seq
                    ori_sb_genome_seq = seq[0:sb_length] + '|' + seq[sb_length:]
                    # Move the '|' marker to the end of the motif.
                    adj_seq = seq[0:sb_length+int(adj)*-1] + '|' + seq[sb_length+int(adj)*-1:]
                    # The position shift is applied in opposite directions per read strand.
                    # NOTE(review): adj_position is not assigned here when
                    # read_direction is neither '-' nor '+'.
                    if read_direction == '-':
                        adj_position = int(position) + int(adj)
                    if read_direction == '+':
                        adj_position = int(position) + int(adj)*-1
                else:
                    # Motif start seen but not confirmed: keep the original break.
                    adj_position = int(position)
                    sb_motif = '-'
                    adj_seq = seq[0:sb_length] + '|' + seq[sb_length:]
                    genome2base = seq[sb_length:sb_length+2]
            else:
                #adj_position = original position
                adj_position = int(position)
                sb_motif = '-'
                genome2base = seq[sb_length:sb_length+2]
                adj_seq = seq[0:sb_length] + '|' + seq[sb_length:]
            #swalign
            alignment = sw.align(seq, 'GTGTATGTAAACTTCCGACTTCAACTG')
            sw_match = alignment.matches
            #sw_cigar = alignment.cigar
            #ratio sw_match score/soft-clipping length
            # Raises ZeroDivisionError when sb_length is 0.
            sw_ratio = round(float(sw_match/sb_length),2)
            #'chr','position(0-start)','ori_position','family_size','sb_direction','adj_seq','soft_clip_len','sw_match','sw_match_ratio','sb-seq|','|genome2bases','ori_sb|genome_seq'
            rec = str(chr)+'\t'+str(adj_position)+'\t'+str(position)+'\t'+str(reads)+'\t'+str(sb_direction)+'\t'+str(adj_seq)+'\t'+ \
                  str(sb_length)+'\t'+str(sw_match)+'\t'+str(sw_ratio)+'\t'+ \
                  str(sb_motif)+'\t'+str(genome2base)+'\n'
            # filtering: reads>=3, sb_length =22~30 and sw_match_ratio>0.9 PASS
            if int(reads)>=3 and float(sw_ratio)>0.9:
                h2out.write(rec)
            if int(reads)>=3 and float(sw_ratio)>0.9 and int(sb_length)>=22 and int(sb_length)<=30 and str(sb_motif)=='+':
                h1out.write(rec)
            else:
                continue
def scores(sequence1, matrix1):
    """Fill ``matrix1`` symmetrically with pairwise alignment scores and return it.

    Every pair ``i < j`` of entries in ``sequence1`` is locally aligned
    (match 1, mismatch -3); the alignment is dumped and its score is stored
    at both ``matrix1[i][j]`` and ``matrix1[j][i]``. The diagonal is left
    untouched. ``matrix1`` is modified in place.
    """
    aligner = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(1, -3))

    for row in range(len(sequence1)):
        # Only the upper triangle is computed; the mirror cell gets the same value.
        for col in range(row + 1, len(sequence1)):
            aln = aligner.align(sequence1[row], sequence1[col])
            aln.dump()
            matrix1[row][col] = matrix1[col][row] = aln.score

    return matrix1
for line in f: if re.match('^>', line): b.append(next(f)) f.close() #MANIPULATED VARIABLE, CAN CHANGE TO YOUR PREFERENCES match = 1 mismatch = -3 gap = -1 fw = open( 'Matrix_swalign_{0}_match={1}_mismatch={2}.txt'.format( filename, match, mismatch), 'w+') scoring = swalign.NucleotideScoringMatrix(match, mismatch) sw = swalign.LocalAlignment(scoring) for i in range(len(b)): for j in range(len(b)): if i == j: print >> fw, "0,", else: scoring = swalign.NucleotideScoringMatrix(match, mismatch) sw = swalign.LocalAlignment( scoring, gap) #CAN ADD MORE VARIABLE. REFER SWALIGN FILES a = sw.align(b[i], b[j]) s = a.dump() print >> fw, "{0},".format(s), print >> fw, "\n", fw.close()
__author__ = 'michael' ''' F**K SO STUPEEED ''' import os import sys from collections import defaultdict from ast import literal_eval import string import swalign import zipfile SCORING = swalign.NucleotideScoringMatrix() ALIGNER = swalign.LocalAlignment(SCORING, globalalign=True, gap_penalty=-5) UPPERCASE = set(string.ascii_uppercase) AGREEMENT_THRESHOLD = .99 PRIOR_WEIGHT = 3 def read_reference(ref_fn): with open(ref_fn, 'r') as ref_file: genome_name = ref_file.readline().strip()[1:] chrom_name = ref_file.readline().strip()[4:] ref = ''.join([line.strip() for line in ref_file]) return genome_name, chrom_name, ref def process_line(line): """ :param line: :return:
def align_filter(ref, query, mode, fusion_name=''):
    """
    Aligns query to reference CDS sequence using the Smith-Waterman algorithm.

    Returns None when the alignment is rejected: the dumped alignment reports
    less than 99 percent matches, the alignment bounds cannot be parsed from
    the dump, a 'donor' alignment does not reach the end of the query, or an
    'acceptor' alignment does not begin at the start of the query.

    :param str ref: In-frame reference transcript
    :param str query: Query transcript
    :param str mode: 'donor' or 'acceptor'
    :param str fusion_name: Label used in debug log messages
    :return: Alignment features (qstart, qstop, rstart, rstop, insertions,
        deletions); start values are zero-based
    :rtype: namedtuple
    """
    alignment_stats = collections.namedtuple(
        'AlignStats', 'qstart, qstop, rstart, rstop, insertions, deletions')
    bounds_regex = re.compile(
        r'Query\s*:\s*(?P<qstart>\d*)\s*\w*\s*(?P<qstop>\d*)\s*[\|\s]*\s*Ref\s*:\s*(?P<rstart>\d*)\s*\w*\s*(?P<rstop>\d*)'
    )
    match_regex = re.compile(r'Matches: \d+\s\((?P<percent>\d*)')

    scoring = swalign.NucleotideScoringMatrix(5, -1)
    sw = swalign.LocalAlignment(scoring)
    alignment = sw.align(ref, query)

    # Count inserted/deleted bases. swalign CIGAR entries are (count, op)
    # pairs, so the length comes first and the operation letter second.
    insertions = 0
    deletions = 0
    for count, op in alignment.cigar:
        if op == 'I':
            insertions += count
        elif op == 'D':
            deletions += count

    # Grab the textual alignment report; bounds and percent are parsed from it.
    buf = StringIO()
    alignment.dump(out=buf)
    dump = buf.getvalue()
    buf.close()

    # If it's not a near perfect match, the quality of the assembly may not be good.
    m = match_regex.search(dump)
    if m:
        percent = int(m.group('percent'))
        if percent < 99:
            logging.debug('%s: low percent matching %d' % (fusion_name, percent))
            return

    # If the fusion transcript passes these filters, grab the alignment bounds.
    s = bounds_regex.search(dump)
    if not s:
        return

    qstart = int(s.group('qstart')) - 1  # Make zero-based
    qstop = int(s.group('qstop'))

    # If the relevant end of the fusion transcript doesn't align, skip it.
    if mode == 'donor' and qstop != len(query):
        logging.debug(
            '%s: donor alignment does not include end of sequence' %
            fusion_name)
        return
    elif mode == 'acceptor' and qstart != 0:
        logging.debug(
            '%s: acceptor alignment does not include start of sequence' %
            fusion_name)
        return

    rstart = int(s.group('rstart')) - 1  # Make zero-based
    # Read the reference stop from its own group (was re-reading 'rstart').
    rstop = int(s.group('rstop'))

    return alignment_stats(qstart, qstop, rstart, rstop, insertions, deletions)
def get_smith_waterman():
    """Return a swalign aligner with match 4 / mismatch -1 and ``globalalign=True``."""
    matrix = swalign.NucleotideScoringMatrix(4, -1)
    return swalign.LocalAlignment(matrix, globalalign=True)
import swalign import subprocess import sys from Bio.SeqIO import convert as bio_convert # Global Parameters MATCH = 2 MISMATCH = -1 SW_SCORE = swalign.NucleotideScoringMatrix(MATCH, MISMATCH) SMITH_WATERMAN = swalign.LocalAlignment(SW_SCORE) def _read_fasta(fp): name = None seq = [] for line in fp: line = line.rstrip() if line.startswith(">"): if name: yield (name, ''.join(seq)) name, seq = line, [] else: seq.append(line) if name: yield (name, ''.join(seq)) def _missing_elements(nums): nums = list(map(int, nums))
not two.isalnum()): # Whitespace matches don't count return 0 elif one.lower() != two.lower() and ( one.isdigit() or two.isdigit()): # Give a severe penalty for mismatching digits return 10 * self._mismatch else: assert (one != two) return self._mismatch sw = swalign.LocalAlignment(FuzzySpeciesNameScoringMatrix(), -9, -1, verbose=False, globalalign=False, full_query=True) def test(): #NCBI = "Candidatus Curtissbacteria bacterium GW20 11 GWA1 40 16" NCBI2 = "Curtissbacteria bacterium GW20 11 GWA1 40 16" Nmicros6 = "Curtissbacteria GWA1 OP11 40 13 partial" Nmicros7 = "Curtissbacteria GWA1 OP11 40 16 partial" Nmicros8 = "Curtissbacteria GWA1 OP11 40 16" Nmicros82 = "Curtissbacteria GWA1 OP11 40 13" #sw.align( Nmicros6, NCBI2 ).dump() #sw.align( NCBI2, Nmicros6 ).dump() #sw.align( Nmicros6, Nmicros6 ).dump()
import sys
import swalign

# Shared aligner built with swalign's default nucleotide scoring matrix.
scoring = swalign.NucleotideScoringMatrix()
sw = swalign.LocalAlignment(scoring)


def selfanneal(s):
    """Return the local-alignment score of ``s`` against its own reverse complement."""
    aln = sw.align(s, swalign.revcomp(s))
    return aln.score


if __name__ == "__main__":
    # Print one score per sequence given on the command line.
    for s in sys.argv[1:]:
        # Call the function; printing the bare name only showed its repr.
        print(selfanneal(s))
Note 2: This isn't appropriate for color-space FASTQ files with a prefix base included in the read sequence, since it trims an equal number of bases from the sequence and quality FASTQ lines. ''' import sys import os import gzip from ngsutils.support import revcomp, FASTA from ngsutils.fastq import FASTQ import swalign sw = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(2, -1)) def fastx_barcode_split(reader, outtempl, barcodes, edits=0, pos=0, allow_revcomp=False, gzip_output=False, stats_fname=None): ''' Split FAST[QA] reads from {fname} using {barcodes} (hash) to write them to output files named like {templ}. '''