import alignment
from Bio import SeqIO

def combine_PT(ET_file, PT_file, target_file, PT_output, overhang_tar_max):
    # Combine the PT_file (both foot and hairpin) with the good-performance
    # template file (ET_file) to get the whole_PT_file.
    tar_seq = {}
    f = open(PT_output, 'w')
    print "start"
    for record in SeqIO.parse(target_file, "fasta"):
        num = record.id.split('-')[-1]
        tar_seq[num] = record.seq
    overhang_bind_pt = {}
    for ET_record in SeqIO.parse(ET_file, "fasta"):
        for PT_record in SeqIO.parse(PT_file, "fasta"):
            # This needs to be fixed: this is just the template, not the whole overhang yet.
            overhang = str(ET_record.seq)
            num = PT_record.id.split('-')[-1]
            if tar_seq.has_key(num):
                target = tar_seq[num]
                t = alignment.needle(revcomp(overhang), target)
                # Accept only overhangs that do not bind the target too much
                # and whose alignment contains no GAGTC site.
                if check_binds(revcomp(overhang), target) < overhang_tar_max and t.find('GAGTC') == -1:
                    newseq = ET_record.seq + PT_record.seq
                    newid = PT_record.id + '-' + ET_record.id + '-' + 'GACTC' + str(newseq.upper().find('GACTC'))
                    print >>f, '>' + newid
                    print >>f, newseq
                    overhang_bind_pt[newid] = check_binds(revcomp(overhang), target)
    f.close()
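# Several snippets in this file call revcomp() without defining it. A minimal
# sketch of a DNA reverse-complement helper, assuming plain ACGT input; the
# name matches the calls above, but the body is an assumption, not the
# original implementation:
_COMPLEMENT = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G',
               'a': 't', 't': 'a', 'g': 'c', 'c': 'g'}

def revcomp(seq):
    # Complement each base, substituting 'N' for anything unrecognized,
    # then reverse the whole sequence.
    return ''.join(_COMPLEMENT.get(base, 'N') for base in reversed(str(seq)))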
def extract_simliar_sentence(label, input_text, extract_num, classifier):
    temp_query = input_text.split(" ")
    query = data_preprocessing1(temp_query)
    if label == "food":
        org_data_list = open(FOOD_QM, encoding='utf-8').read().strip().split('\n')
    elif label == "shopping":
        org_data_list = open(SHOPPING_QM, encoding='utf-8').read().strip().split('\n')
    elif label == "weather":
        org_data_list = open(WEATHER_QM, encoding='utf-8').read().strip().split('\n')
    else:
        raise ValueError("unknown label: %s" % label)

    data_list = []
    for i in range(len(org_data_list)):
        temp_data = org_data_list[i].split(" ")
        p_data = data_preprocessing1(temp_data)
        data_list.append(p_data)

    result_list = []
    for j in range(len(data_list)):
        # Align the query against each candidate once and reuse both
        # aligned outputs instead of running needle twice.
        aligned = alignment.needle(query, data_list[j])
        dst1 = data_preprocessing2(input_text.split(" "), aligned[2])
        dst2 = data_preprocessing2(org_data_list[j].split(" "), aligned[3])
        temp = add_processing(dst1, dst2)
        result_list.append(temp)

    test_list = make_feature(result_list, "")
    score_dict = dict()
    for i in range(len(test_list)):
        score = classifier.prob_classify(test_list[i]).prob('P')
        score_dict[i] = score

    # Return the extract_num highest-scoring candidate sentences.
    extract_list = []
    count = 0
    for i in sorted(score_dict, key=lambda m: score_dict[m], reverse=True):
        extract_list.append(input_text + ":" + org_data_list[i])
        count += 1
        if count >= extract_num:
            break
    return extract_list
def check_binds(seq1, seq2):
    # Score how strongly seq1 binds seq2: align them and return the length
    # of the longest whitespace-free run in the match line that
    # alignment.needle returns.
    t = alignment.needle(seq1, seq2)
    pieces = t.split()
    leng = [len(i) for i in pieces]
    # Guard against an alignment with no matching stretch at all.
    match_num = max(leng) if leng else 0
    return match_num
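# A usage sketch for check_binds, assuming this variant of alignment.needle
# returns the match line of the pairwise alignment as a string (other
# snippets below use alignment modules with different return types, so this
# call pattern does not transfer to them):
# longest = check_binds('ACGTACGTACGT', 'ACGTTCGTACGA')
# if longest < 6:
#     print "acceptable: no binding stretch of 6+ bases"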
def load_data(org_route, target_route):
    original_list = open(org_route, encoding='utf-8').read().strip().split('\n')
    data_list = open(target_route, encoding='utf-8').read().strip().split('\n')
    result_list = []
    for i in range(len(data_list)):
        pair = data_list[i].split(",")
        src1 = original_list[int(pair[0])].split(" ")
        src2 = original_list[int(pair[1]) + 250].split(" ")
        p_src1 = data_preprocessing1(src1)
        p_src2 = data_preprocessing1(src2)
        # Align the preprocessed pair once and reuse both aligned outputs
        # instead of calling needle twice.
        aligned = alignment.needle(p_src1, p_src2)
        dst1 = data_preprocessing2(src1, aligned[2])
        dst2 = data_preprocessing2(src2, aligned[3])
        temp = add_processing(dst1, dst2)
        result_list.append(temp)
    return result_list
def one2all(item):
    # Align one sequence against every later sequence in the module-level
    # namelist/seqlist, writing "name1 name2 score" rows to outfile.
    nameid = namelist.index(item)
    for i in range(nameid + 1, seqlen):
        name2 = namelist[i]
        seq2 = seqlist[i]
        seq1 = seqlist[nameid]
        strt = item + ' ' + name2 + ' ' + str(ag.needle(seq1, seq2) / 100)
        print(strt)
        q[nameid] += 1
        outfile.write(strt)
        outfile.write('\n')
        outfile.flush()
def bind_hairpin(hairpin_file, extra_base_file, template_file, feet_list,
                 lowest_ptx_long_tm, hair_tar_max, overhang_tar_max):
    # Bind a hairpin to the PT foot and PX foot.
    extra_base = {}
    template = {}
    for record in SeqIO.parse(extra_base_file, "fasta"):
        extra_base[record.id] = record.seq
    hairpin = {}
    for record in SeqIO.parse(hairpin_file, "fasta"):
        hairpin[record.id] = record.seq
        id2 = record.id + 'rev'
        hairpin[id2] = revcomp(str(record.seq))
    for record in SeqIO.parse(template_file, "fasta"):
        template[record.id] = record.seq
    primer_set_list = []
    target_seq = ''
    for e in extra_base.keys():        # PT-site extra base
        for e2 in extra_base.keys():   # PX-site extra base
            for feet in feet_list:
                if feet.direction == 'p':
                    gap = revcomp(configure.input_seq)[feet.end_pos1:feet.start_pos2]
                    target_seq = configure.input_seq[feet.start_pos1:feet.end_pos2]
                else:
                    gap = configure.input_seq[feet.end_pos1:feet.start_pos2]
                    target_seq = revcomp(configure.input_seq[feet.start_pos1:feet.end_pos2])
                if feet.start_pos2 == feet.end_pos1:
                    gap = '0'
                if whether_match(extra_base[e], extra_base[e2], gap) == 0:
                    for h in hairpin.keys():
                        # Not too much hybridization between hairpin and target.
                        if check_binds(hairpin[h], target_seq) < hair_tar_max and \
                           check_binds(revcomp(str(hairpin[h])), target_seq) < hair_tar_max:
                            pt_long = hairpin[h] + extra_base[e] + feet.get_pt
                            px_long = feet.get_px + extra_base[e] + revcomp(str(hairpin[h]))
                            if TmDeltaG.calTm(pt_long, px_long) < lowest_ptx_long_tm:
                                for t in template.keys():
                                    align_tar_template = alignment.needle(revcomp(str(template[t])), target_seq)
                                    # Not too much binding between template and
                                    # target, and no GAGTC site in the alignment.
                                    if check_binds(revcomp(str(template[t])), target_seq) < overhang_tar_max \
                                       and align_tar_template.find('GAGTC') == -1:
                                        feet_long = PTX_long(feet, e, e2, h)
                                        pset = primer_set(feet, e, h, t)
                                        primer_set_list.append(pset)
    return primer_set_list
        Y = np.array(float(line[38:46].strip()))
        Z = np.array(float(line[46:54].strip()))
        cord[resNum] = [resiType, np.array([X, Y, Z])]
    else:
        print line[17:20].strip()

Resi_map = {}
pdb_seq0 = ''
first, last = sorted(cord)[0], sorted(cord)[-1]
for i in range(first, last + 1):
    if i in cord:
        Resi_map[i] = cord[i][0]
        pdb_seq0 += cord[i][0]
    else:
        pdb_seq0 += 'X'  # 'X' for missing residues

f_align.write(ii + '\n')
temp = alignment.needle(pdb_seq0, full_seq0)
f_align.write(temp[0] + '\n')
f_align.write(temp[1] + '\n')
temp2 = ['', '']
Resi_map_new = {}
pdbCounter = 1
seqCounter = 1
for char in range(len(temp[0])):
    # Find cases where the pdb sequence (temp[0]) has a missing residue
    # ('X') while the full sequence (temp[1]) has no gap ('-').
    if temp[0][char] == 'X' and temp[1][char] != '-':
        pdbCounter += 1
    else:
        if pdbCounter in Resi_map:
            Resi_map_new[seqCounter] = Resi_map[pdbCounter]
    alternative = [
        len(i), np.sum(temp[-1]),
        np.sum(temp[-2]),
        np.sum(temp[-3])
    ]
    per = per
    temp = temp.T
    X = [temp[0], temp[1:], alternative, i, per * 1]
    #print '> %s\n%s' %(i,i)
    return X

result = []
for i in data.seq:
    # Keep only the alignment score (the last element needle returns).
    result += [alignment.needle(i, p53)[-1], ]

data['1'] = result
data = data.sort_values('1')  # sort_values returns a sorted copy
p53 = make(p53)
print len(result)

epsilon = 1e-3

def batch_normalization(x, name='batchnorm', feature_norm=False):
    #epsilon = tf.Variable(tf.constant([1e-3,]*x.shape[0]))
    if feature_norm:
        # only works for fixed batch size 1
        mean, var = tf.nn.moments(x, [0, 2, 3], keep_dims=True)
        scale = tf.Variable(tf.ones([1, x.shape[1], 1, 1]))
def align(self, seq1, seq2):
    from alignment import needle
    aligned = needle(seq1, seq2)
    return aligned[0], aligned[2]
from alignment import needle, water
# from msa import main
from fasta import getFasta

id = ["B5CZ00", "B5CZ01", "B5CZ02"]
seq = []
for i in id:
    seq.append(getFasta(i))

if len(id) == 2:
    needle(seq[0], seq[1])
    water(seq[0], seq[1])
else:
    exit()
sim_matrix = np.zeros((len(data), len(data)))

import sys
if len(sys.argv) >= 2:
    start = sys.argv[1]
else:
    start = 0
print start
start = int(start)

if False:
    # One worker computes every 7th row of the pairwise similarity matrix.
    for i in range(start, len(data), 7):
        seq1 = data.iloc[i]['seq']
        print seq1
        for j in range(i + 1, len(data)):
            seq2 = data.iloc[j]['seq']
            score = alignment.needle(seq1, seq2)[-1]
            sim_matrix[i, j] = score
            sim_matrix[j, i] = score
    np.savez_compressed('sim_matrix%s.npy.zip' % start, sim_matrix)

if True:
    # Merge the partial matrices written by the 7 workers.
    for start in range(0, 7):
        sim_matrix = sim_matrix + np.load('sim_matrix%s.npy.zip.npz' % start).items()[0][1]

def swap_rows(C, var1, var2):
    '''
    Function to swap two rows in a covariance matrix, updating the
    appropriate columns as well.
    '''
    D = C.copy()
    D[var2, :] = C[var1, :]
    D[var1, :] = C[var2, :]
def align(ust_kana, lyrics_kana):
    # Needleman-Wunsch alignment
    return alignment.needle(ust_kana, lyrics_kana)
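# A usage sketch (hypothetical inputs): aligning the kana of a UST note
# sequence against the song's lyric kana, so each note can be matched to its
# syllable even when the two transcriptions differ slightly:
# aligned = align("こんにちは", "こんにちわ")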
import sys, string
import alignment
from pyspark.sql.functions import substring
from pyspark.sql import SparkSession

# Create the Spark session.
spark = SparkSession\
    .builder\
    .appName("Needleman")\
    .getOrCreate()

# Get sequences from files (Spark transformation).
seq1 = spark.read.text(
    "hdfs://HadoopMaster:9000/user/hduser/seq1.txt").collect()
seq2 = spark.read.text(
    "hdfs://HadoopMaster:9000/user/hduser/seq2.txt").collect()

# Perform the Spark action: align the first row of each file.
for i in range(0, 1):
    alignment.needle(seq1[i].value, seq2[i].value)
    alternative = [
        len(i), np.sum(temp[-1]),
        np.sum(temp[-2]),
        np.sum(temp[-3])
    ]
    per = per
    temp = temp.T
    X = [temp[0], temp[1:], alternative, i, per * 1]
    #print '> %s\n%s' %(i,i)
    return X

result = []
for i in data.seq:
    # Align once per sequence and reuse the result instead of re-running
    # needle for every printed field.
    aln = alignment.needle(p53, i)
    result += [aln[-1], ]
    if aln[-1] >= 40:
        print aln[0]
        print aln[1],
        print aln[2], '\n'

data['1'] = result
data = data.sort_values('1')
p53 = make(p53)
die  # intentional NameError: crude breakpoint to stop execution here
print len(result)

epsilon = 1e-3

def batch_normalization(x, name='batchnorm', feature_norm=False):
file2 = open(sys.argv[1], 'r')
full1 = file1.read()
lines1 = full1.split('\n')
len1 = len(lines1)
full2 = file2.read()
lines2 = full2.split('\n')
len2 = len(lines2)
outfile = open(sys.argv[2], 'w')
# All-vs-all over the upper triangle: FASTA records alternate header and
# sequence lines, so record k sits at lines 2k and 2k+1.
for i in range((len1 - 1) / 2):
    name1_s = lines1[i * 2].split('\t')[0]
    name1 = name1_s.split('>')[1]
    seq1 = lines1[i * 2 + 1]
    for j in range(i + 1, (len2 - 1) / 2):
        name2_s = lines2[j * 2].split('\t')[0]
        name2 = name2_s.split('>')[1]
        seq2 = lines2[j * 2 + 1]
        strt = name1 + ' ' + name2 + ' ' + str(alignment.needle(seq1, seq2) / 100)
        print(strt)
        outfile.write(strt)
        outfile.write('\n')
        outfile.flush()
outfile.close()
t2 = time.time()
print("Total time:%s" % (t2 - t1))
#!/usr/bin/env python
import sys, string
import alignment

seq1 = 'GCATGCU'
seq2 = 'GATTACA'
alignment.needle(seq1, seq2)
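# The call above discards needle's return value, so the script is only
# useful if the module prints the alignment itself. A sketch that keeps the
# result instead (assuming this version of needle returns something; the
# exact return type varies between the alignment modules used in this file):
# result = alignment.needle(seq1, seq2)
# print result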
            and line[12:16].strip() in ["C1'", "C1*"]:
        if first_resi is True:
            first_resi = int(line[22:26].strip()) - 1
        resNum, resiType = (int(line[22:26].strip()) - first_resi,
                            line[17:20].strip())
        X = np.array(float(line[30:38].strip()))
        Y = np.array(float(line[38:46].strip()))
        Z = np.array(float(line[46:54].strip()))
        cord += [np.array([X, Y, Z]), ]
        line2 = line
        seqQ += line[17:20].strip()

Q = np.stack(cord)

# Remove residues not found in either structure by walking the alignment:
# a gap in one aligned string means the residue exists only in the other.
ali = alignment.needle(seqP, seqQ)
good_resiP, good_resiQ = [], []
counterP, counterQ = 0, 0
for i in range(len(ali[1])):
    if ali[0][i] == '-':
        counterQ += 1
    elif ali[1][i] == '-':
        counterP += 1
    else:
        good_resiP += [counterP, ]
        counterP += 1
        good_resiQ += [counterQ, ]
        counterQ += 1
P, Q = P[good_resiP, :], Q[good_resiQ, :]
import sys

file1 = open(sys.argv[1], 'r')
file2 = open(sys.argv[1], 'r')
full1 = file1.read()
lines1 = full1.split('\n')
len1 = len(lines1)
full2 = file2.read()
lines2 = full2.split('\n')
len2 = len(lines2)
outfile = open(sys.argv[2], 'w')
for i in range((len1 - 1) / 2):
    name1_s = lines1[i * 2].split('\t')[0]
    name1 = name1_s.split('>')[1]
    seq1 = lines1[i * 2 + 1]
    for j in range(i + 1, (len2 - 1) / 2):
        name2_s = lines2[j * 2].split('\t')[0]
        name2 = name2_s.split('>')[1]
        seq2 = lines2[j * 2 + 1]
        strt = name1 + ',' + name2 + ': ' + str(
            alignment.needle(seq1, seq2) / 100)
        print(strt)
        outfile.write(strt)
        outfile.write('\n')
        outfile.flush()
outfile.close()
# ============================================================================ #
#                                IMPORT MODULES                                #
# ============================================================================ #

Parse_Commandline()
module = _args.modulepath
module_folder = os.path.dirname(module)
if module_folder not in sys.path:
    sys.path.append(module_folder)
import alignment
from Bio import SeqIO

# ============================================================================ #
#                                 MAIN PROGRAM                                 #
# ============================================================================ #

handle1 = open(_args.input[0], "rU")
records1 = {}
for record in SeqIO.parse(handle1, "fasta"):
    records1[record.id] = record.seq

handle2 = open(_args.input[1], "rU")
records2 = {}
for record in SeqIO.parse(handle2, "fasta"):
    records2[record.id] = record.seq

for allele in records1.keys():
    print allele
    alignment.needle(records1[allele], records2[allele])
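# The loop above raises KeyError when an allele from the first FASTA file is
# missing from the second. A guarded variant (a sketch, not the original
# script's behavior):
# for allele in records1:
#     if allele in records2:
#         alignment.needle(records1[allele], records2[allele])
#     else:
#         print "no match in second file for", allele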