Example #1
0
def combine_PT(ET_file,PT_file,target_file, PT_output,overhang_tar_max): # to combine the PT_file(both foot and hairpin) with good performance template file(ET_file) to get the whole_PT_file
    """Combine every ET (extension template) record with every PT record and
    write the accepted combinations to a FASTA file.

    Parameters
    ----------
    ET_file : str          FASTA of good-performance template records.
    PT_file : str          FASTA of PT records (both foot and hairpin).
    target_file : str      FASTA of targets; record ids end in '-<num>'.
    PT_output : str        Path the combined FASTA is written to.
    overhang_tar_max : int Maximum allowed overhang/target binding score.

    A pair is kept only when the reverse complement of the ET sequence binds
    the matching target below `overhang_tar_max` and the alignment does not
    contain the 'GAGTC' site.
    """
    # Map the trailing '-<num>' suffix of each target id to its sequence.
    tar_seq = {}
    for record in SeqIO.parse(target_file, "fasta"):
        num = record.id.split('-')[-1]
        tar_seq[num] = record.seq

    overhang_bind_pt = {}
    with open(PT_output, 'w') as f:
        print("start")
        for ET_record in SeqIO.parse(ET_file, "fasta"):
            # NOTE(original comment): this is just the template, not the
            # whole overhang yet.
            overhang = str(ET_record.seq)
            for PT_record in SeqIO.parse(PT_file, "fasta"):
                num = PT_record.id.split('-')[-1]
                if num not in tar_seq:
                    continue
                target = tar_seq[num]
                t = alignment.needle(revcomp(overhang), target)
                # Reject pairs that bind the target too strongly or whose
                # alignment contains the GAGTC recognition site.
                if check_binds(revcomp(overhang), target) < overhang_tar_max and t.find('GAGTC') == -1:
                    newseq = ET_record.seq + PT_record.seq
                    newid = PT_record.id + '-' + ET_record.id + '-' + 'GACTC' + str(newseq.upper().find('GACTC'))
                    print('>' + newid, file=f)
                    print(newseq, file=f)
                    overhang_bind_pt[newid] = check_binds(revcomp(overhang), target)
Example #2
0
def extract_simliar_sentence(label, input_text, extract_num, classifier):
    """Return up to `extract_num` "input_text:candidate" strings for the
    corpus sentences most similar to `input_text`.

    Each corpus line is aligned against the preprocessed query with
    Needleman-Wunsch; the resulting features are scored by the classifier's
    probability of the positive ('P') class, highest first.

    Parameters:
        label       -- corpus selector: "food", "shopping" or "weather"
        input_text  -- query sentence (space-separated tokens)
        extract_num -- maximum number of results to return
        classifier  -- classifier exposing prob_classify() (NLTK-style)

    Raises:
        ValueError  -- for an unknown `label` (the original code crashed
                       later with NameError in that case).
    """
    query = data_preprocessing1(input_text.split(" "))

    corpus_files = {"food": FOOD_QM, "shopping": SHOPPING_QM, "weather": WEATHER_QM}
    if label not in corpus_files:
        raise ValueError("unknown label: %r" % (label,))
    with open(corpus_files[label], encoding='utf-8') as fh:
        org_data_list = fh.read().strip().split('\n')

    data_list = [data_preprocessing1(line.split(" ")) for line in org_data_list]

    result_list = []
    for j in range(len(data_list)):
        # Align once per pair and reuse both aligned outputs; the original
        # ran the same (expensive) alignment twice per candidate.
        aligned = alignment.needle(query, data_list[j])
        dst1 = data_preprocessing2(input_text.split(" "), aligned[2])
        dst2 = data_preprocessing2(org_data_list[j].split(" "), aligned[3])
        result_list.append(add_processing(dst1, dst2))

    test_list = make_feature(result_list, "")
    # Positive-class probability per candidate sentence.
    score_dict = {i: classifier.prob_classify(test_list[i]).prob('P')
                  for i in range(len(test_list))}

    # Emit the highest-scoring candidates first, stopping at extract_num.
    extract_list = []
    count = 0
    for i in sorted(score_dict, key=lambda m: score_dict[m], reverse=True):
        extract_list.append(input_text + ":" + org_data_list[i])
        count += 1
        if count >= extract_num:
            break

    return extract_list
Example #3
0
def check_binds(seq1,seq2):
    """Score how strongly seq1 and seq2 bind.

    The score is the length of the longest whitespace-separated chunk in the
    Needleman-Wunsch alignment string returned by alignment.needle.
    """
    aligned = alignment.needle(seq1, seq2)
    return max(len(chunk) for chunk in aligned.split())
Example #4
0
def load_data(org_route, target_route):
    """Build alignment-based feature rows for each sentence pair listed in
    `target_route`.

    Parameters:
        org_route    -- utf-8 text file, one tokenised sentence per line
        target_route -- utf-8 text file, one "i,j" index pair per line; the
                        second index is offset by 250 into the original list

    Returns:
        list with one processed feature item per pair.
    """
    with open(org_route, encoding='utf-8') as f:
        original_list = f.read().strip().split('\n')
    with open(target_route, encoding='utf-8') as f:
        data_list = f.read().strip().split('\n')

    result_list = []
    for line in data_list:
        pair = line.split(",")

        src1 = original_list[int(pair[0])].split(" ")
        # The second column indexes the block starting 250 lines in.
        src2 = original_list[int(pair[1]) + 250].split(" ")
        p_src1 = data_preprocessing1(src1)
        p_src2 = data_preprocessing1(src2)

        # Align once and reuse both outputs; the original ran the same
        # Needleman-Wunsch alignment twice per pair.
        aligned = alignment.needle(p_src1, p_src2)
        dst1 = data_preprocessing2(src1, aligned[2])
        dst2 = data_preprocessing2(src2, aligned[3])

        result_list.append(add_processing(dst1, dst2))

    return result_list
Example #5
0
def one2all(item):
    """Align `item`'s sequence against every later sequence and record the
    pairwise scores.

    For each later entry, prints and writes "name1 name2 score" (score is
    ag.needle's result divided by 100) and increments the progress counter.
    Uses module globals: namelist, seqlist, seqlen, q, outfile, ag.
    """
    nameid = namelist.index(item)
    seq1 = seqlist[nameid]  # hoisted: constant across the loop
    for i in range(nameid + 1, seqlen):
        name2 = namelist[i]
        seq2 = seqlist[i]
        strt = item + ' ' + name2 + ' ' + str(ag.needle(seq1, seq2) / 100)
        print(strt)
        # Reuse nameid instead of the original's repeated O(n)
        # namelist.index(item) lookup inside the loop.
        q[nameid] += 1
        outfile.write(strt)
        outfile.write('\n')
        outfile.flush()
Example #6
0
def bind_hairpin(hairpin_file,extra_base_file,template_file, feet_list, lowest_ptx_long_tm,hair_tar_max): #bind hairpin to pt foot and px foot
    """Enumerate hairpin/extra-base/feet/template combinations and collect
    primer sets that pass the binding and melting-temperature filters.

    NOTE(review): this example is truncated/garbled below — the inner `if`
    near the end is a syntax error (statement on the same line as the colon
    plus an indented block), `target` and `overhang_tar_max` are undefined
    here, and `primer_set` shadows the callable it is assigned from.
    """
    extra_base={}
    template={}
    # id -> sequence maps for the three FASTA inputs.
    for record in SeqIO.parse(extra_base_file, "fasta") :
        extra_base[record.id]=record.seq
    hairpin={}
    for record in SeqIO.parse(hairpin_file, "fasta") :
        hairpin[record.id]=record.seq
        # Also store each hairpin's reverse complement under '<id>rev'.
        id2=record.id+'rev'
        hairpin[id2]=revcomp(str(record.seq))
    for record in SeqIO.parse(template_file,"fasta"):
        template[record.id]=record.seq
    primer_set_list=[]
    target_seq=''
    for e in extra_base.keys(): #PT site extra base
        for e2 in extra_base.keys(): #PX site extra base
            for feet in feet_list:
                # Gap and target slices depend on the feet orientation.
                if feet.direction=='p':
                    gap=revcomp(configure.input_seq)[feet.end_pos1:feet.start_pos2]
                    target_seq=configure.input_seq[feet.start_pos1:feet.end_pos2]
                else:
                    gap=configure.input_seq[feet.end_pos1:feet.start_pos2]
                    target_seq=revcomp(configure.input_seq[feet.start_pos1:feet.end_pos2])
                if feet.start_pos2+1==feet.end_pos1+1:
                    gap='0'
                if (whether_match(extra_base[e],extra_base[e2],gap)==0):
                    for h in hairpin.keys():
                        if check_binds(hairpin[h],target_seq)<hair_tar_max and check_binds(revcomp(str(hairpin[h])),target_seq)<hair_tar_max: #not too much hybrid between hairpin and target
                            pt_long=hairpin[h]+extra_base[e]+feet.get_pt
                            px_long=feet.get_px+extra_base[e]+revcomp(str(hairpin[h]))
                            if TmDeltaG.calTm(pt_long,px_long)<lowest_ptx_long_tm: #ptx_command='melt.pl -n DNA -t 55 -N 1 -M 0 -C 0.000000005 '+pt_in+' '+px_in
                                for t in template.keys():
                                    align_tar_template=alignment.needle(revcomp(template[t]), target_seq)
                                    # NOTE(review): syntax error below (statement after ':'
                                    # followed by an indented block) — truncated example.
                                    if check_binds(revcomp(t),target) <overhang_tar_max and align_tar_template.find('GAGTC')==-1:      feet_long= PTX_long(feet,e,e2,h)
                                        primer_set= primer_set(feet, e,h,t)
                                        primer_set_list.append(primer_set)
                    # Parse atom coordinates from fixed PDB columns;
                    # cord maps residue number -> [residue type, xyz array].
                    Y = np.array(float(line[38:46].strip()))
                    Z = np.array(float(line[46:54].strip()))
                    cord[resNum] = [resiType, np.array([X, Y, Z])]
                else:
                    # Python 2 print statement (this snippet is py2-only).
                    print line[17:20].strip()
        # Rebuild the linear sequence from the parsed residues, inserting
        # 'X' placeholders for residue numbers missing from the structure.
        Resi_map = {}
        pdb_seq0 = ''
        first, last = sorted(cord)[0], sorted(cord)[-1]
        for i in range(first, last + 1):
            if i in [x for x in cord]:
                Resi_map[i] = cord[i][0]
                pdb_seq0 += cord[i][0]
            else:
                pdb_seq0 += 'X'  #'X' for missing residues
        f_align.write(ii + '\n')
        # Globally align the structure-derived sequence against the full
        # reference sequence and log both aligned strings.
        temp = alignment.needle(pdb_seq0, full_seq0)
        f_align.write(temp[0] + '\n')
        f_align.write(temp[1] + '\n')

        temp2 = ['', '']
        Resi_map_new = {}
        pdbCounter = 1
        seqCounter = 1
        for char in range(len(temp[0])):
            # find cases where pdb_seq[0] has gap (X) while
            # sequence has no gap ('-')
            # NOTE(review): `temp[0] == '-'` compares the WHOLE aligned string
            # to '-', so this branch can essentially never fire; it likely was
            # meant to test a per-character condition — confirm upstream.
            if temp[0][char] == 'X' and temp[0] == '-':
                pdbCounter += 1
            else:
                if pdbCounter in Resi_map:
                    Resi_map_new[seqCounter] = Resi_map[pdbCounter]
Example #8
0
    # Fragment: tail of a feature-building function (its def line is outside
    # this view).  Collects summary statistics of `temp` and packs them with
    # the raw sequence `i`.
    alternative = [
        len(i), np.sum(temp[-1]),
        np.sum(temp[-2]),
        np.sum(temp[-3])
    ]
    per = per  # NOTE(review): no-op self-assignment — probably leftover.
    temp = temp.T
    X = [temp[0], temp[1:], alternative, i, per * 1]
    #print '> %s\n%s' %(i,i)
    return X


# Score every sequence in `data.seq` against p53 with Needleman-Wunsch and
# keep the final element of each alignment result as the score.
result = []
for i in data.seq:
    result += [
        alignment.needle(i, p53)[-1],
    ]
data['1'] = result
# NOTE(review): sort_values returns a new frame; the result is discarded here.
data.sort_values('1')
p53 = make(p53)

# Python 2 print statement (this snippet is py2-only).
print len(result)

epsilon = 1e-3


# NOTE(review): definition truncated by the example boundary below.
def batch_normalization(x, name='batchnorm', feature_norm=False):
    #epsilon = tf.Variable(tf.constant([1e-3,]*x.shape[0]))
    if feature_norm:  #only works for fizxed batch size 1
        mean, var = tf.nn.moments(x, [0, 2, 3], keep_dims=True)
        scale = tf.Variable(tf.ones([1, x.shape[1], 1, 1]))
Example #9
0
 def align(self, seq1, seq2):
     """Globally align two sequences with Needleman-Wunsch.

     Returns elements 0 and 2 of alignment.needle's result (presumably the
     two aligned strings — per the alignment module's contract).
     """
     from alignment import needle
     result = needle(seq1, seq2)
     first, second = result[0], result[2]
     return first, second
Example #10
0
from alignment import needle, water
# from msa import main
from fasta import getFasta

# Accessions to fetch; pairwise alignment only runs for exactly two of them.
id = ["B5CZ00", "B5CZ01", "B5CZ02"]
seq = [getFasta(accession) for accession in id]

if len(id) != 2:
    exit()
# Global (needle) then local (water) alignment of the two sequences.
needle(seq[0], seq[1])
water(seq[0], seq[1])
Example #11
0
# Pairwise similarity matrix over all sequences in `data` (py2 snippet:
# bare print statements below).
sim_matrix = np.zeros((len(data),len(data)))

import sys
# Optional starting row offset from the command line (strided sharding by 7).
if len(sys.argv) >= 2:
    start = sys.argv[1]
else:
    start = 0
print start
start = int(start)
# Disabled shard-computation branch: fills every 7th row's pairs and saves
# one partial matrix per shard.
if False:
    for i in range(start,len(data),7):
        seq1 = data.iloc[i]['seq']
        print seq1
        for j in range(i+1,len(data)):
            seq2 = data.iloc[j]['seq']
            score = alignment.needle(seq1,seq2)[-1]
            sim_matrix[i,j] = score
            sim_matrix[j,i] = score
    np.savez_compressed('sim_matrix%s.npy.zip'%start,sim_matrix)
# Merge branch: sum the 7 saved shard matrices back together.
# NOTE(review): `.items()[0][1]` on an NpzFile is py2-style — on py3 this
# needs list(...) or indexing by array name.
if True:
    for start in range(0,7):
         sim_matrix =  sim_matrix + np.load('sim_matrix%s.npy.zip.npz' %start).items()[0][1]
# NOTE(review): truncated by the example boundary — the docstring promises a
# column swap and a return of D, neither of which is visible here.
def swap_rows(C, var1, var2):
        '''
        Function to swap two rows in a covariance matrix,
        updating the appropriate columns as well.
        '''
        D = C.copy()
        D[var2, :] = C[var1, :]
        D[var1, :] = C[var2, :]
Example #12
0
def align(ust_kana, lyrics_kana):
    """Needleman-Wunsch alignment of the UST kana against the lyrics kana."""
    aligned = alignment.needle(ust_kana, lyrics_kana)
    return aligned
Example #13
0
import sys, string
import alignment
# NOTE(review): `substring` is imported but never used below.
from pyspark.sql.functions import substring
from pyspark.sql import SparkSession
#create spark session
spark = SparkSession\
.builder\
.appName("Needleman")\
.getOrCreate()
#get sequences from files
#perform spark transformation
# Each collect() yields a list of Row objects; `.value` is the line text.
seq1 = spark.read.text(
    "hdfs://HadoopMaster:9000/user/hduser/seq1.txt").collect()
seq2 = spark.read.text(
    "hdfs://HadoopMaster:9000/user/hduser/seq2.txt").collect()
#perform spark action
# Only the first line of each file is aligned (range(0, 1) = one iteration).
for i in range(0, 1):
    alignment.needle(seq1[i].value, seq2[i].value)
Example #14
0
    # Fragment: tail of a feature-building function (its def line is outside
    # this view) — same shape as the earlier example in this file.
    alternative = [
        len(i), np.sum(temp[-1]),
        np.sum(temp[-2]),
        np.sum(temp[-3])
    ]
    per = per  # NOTE(review): no-op self-assignment — probably leftover.
    temp = temp.T
    X = [temp[0], temp[1:], alternative, i, per * 1]
    #print '> %s\n%s' %(i,i)
    return X


# Score every sequence against p53; additionally dump the full alignment for
# strong hits (score >= 40).  NOTE(review): the same alignment is recomputed
# up to five times per sequence — caching alignment.needle(p53, i) once
# would avoid the repeated work.  (Py2 snippet: bare print statements.)
result = []
for i in data.seq:
    result += [
        alignment.needle(p53, i)[-1],
    ]
    if alignment.needle(p53, i)[-1] >= 40:
        print alignment.needle(p53, i)[0]
        print alignment.needle(p53, i)[1],
        print alignment.needle(p53, i)[2], '\n'
data['1'] = result
data.sort_values('1')
p53 = make(p53)
# NOTE(review): `die` is an undefined name — a deliberate NameError used to
# halt the script here; everything after it never runs.
die
print len(result)

epsilon = 1e-3


# NOTE(review): definition truncated by the example boundary below.
def batch_normalization(x, name='batchnorm', feature_norm=False):
Example #15
0
# Fragment: `file1`, `time` and `t1` are defined before this view begins.
# Py2 snippet: `(len1-1)/2` relies on py2 integer division (floats would
# break range() on py3).
file2=open(sys.argv[1],'r')

full1=file1.read()
lines1=full1.split('\n')
len1=len(lines1)

full2=file2.read()
lines2=full2.split('\n')
len2=len(lines2)

outfile=open(sys.argv[2],'w')

# Records are two-line pairs: ">name<TAB>..." then the sequence line.
# Align each record against every later record and write "name1 name2 score".
for i in range((len1-1)/2):
  name1_s=lines1[i*2].split('\t')[0]
  name1=name1_s.split('>')[1]
  seq1=lines1[i*2+1]
  for j in range(i+1,(len2-1)/2):
    name2_s=lines2[j*2].split('\t')[0]
    name2=name2_s.split('>')[1]
    seq2=lines2[j*2+1]
    # needle() apparently returns a percent-style score; report it /100.
    strt=name1+' '+name2+' '+str(alignment.needle(seq1,seq2)/100)
    print(strt)
    outfile.write(strt)
    outfile.write('\n')
    outfile.flush()
outfile.close()

t2=time.time()
print("Total time:%s"%(t2-t1))  

#!/usr/bin/env python

# Minimal demo: align two short fixed sequences with Needleman-Wunsch.
import sys, string
import alignment

# Classic textbook example pair for global alignment.
seq1 = 'GCATGCU'
seq2 = 'GATTACA'



# Result is discarded; presumably needle() prints its own output.
alignment.needle(seq1, seq2)
Example #17
0
        and line[12:16].strip() in ["C1'","C1*"]:
         # Fragment: starts mid-`if` (the first condition and the loop header
         # are outside this view).  Parses C1' atoms from fixed PDB columns.
         # NOTE(review): `first_resi is True` then reassigning it to an int
         # means this offset is captured only on the first matching line.
         if first_resi is True :
             first_resi = int(line[22:26].strip()) -1
         #print line
         resNum,resiType = (int(line[22:26].strip())- first_resi,line[17:20].strip())
         X = np.array(float(line[30:38].strip()))
         Y = np.array(float(line[38:46].strip()))
         Z = np.array(float(line[46:54].strip()))
         cord += [np.array([X,Y,Z]),]
         line2 = line
         seqQ += line[17:20].strip()
 
 Q = np.stack(cord)
 
 #remove residues not found in either
 # Walk the aligned pair: a gap in either sequence drops that residue from
 # the other structure's coordinate set, so P and Q keep matched rows only.
 ali = alignment.needle(seqP,seqQ)
 good_resiP,good_resiQ = [],[]
 counterP,counterQ = 0,0
 for i in range(len(ali[1])):
     if ali[0][i] == '-':
         counterQ += 1
     elif ali[1][i] == '-':
         counterP +=1
     else:
         good_resiP += [counterP,]
         counterP +=1
         good_resiQ += [counterQ,]
         counterQ += 1
 P,Q = P[good_resiP,:],Q[good_resiQ,:]
     
 
Example #18
0
def align(ust_kana, lyrics_kana):
    # Thin wrapper: Needleman-Wunsch alignment of the two kana sequences.
    pair = (ust_kana, lyrics_kana)
    return alignment.needle(*pair)
Example #19
0
import sys

# Both handles read the same file: every record of argv[1] is compared
# against every later record of the same file.
# NOTE(review): if two different inputs were intended, the second open
# should take a separate path argument — confirm against usage.
file1 = open(sys.argv[1], 'r')
file2 = open(sys.argv[1], 'r')

full1 = file1.read()
lines1 = full1.split('\n')
len1 = len(lines1)

full2 = file2.read()
lines2 = full2.split('\n')
len2 = len(lines2)

outfile = open(sys.argv[2], 'w')

# Records are two-line pairs: ">name<TAB>..." then the sequence line.
# Use floor division so range() receives an int on Python 3 as well —
# the original's plain '/' yields a float there and range() raises
# TypeError ('//' is behavior-identical on Python 2 ints).
for i in range((len1 - 1) // 2):
    name1_s = lines1[i * 2].split('\t')[0]
    name1 = name1_s.split('>')[1]
    seq1 = lines1[i * 2 + 1]
    for j in range(i + 1, (len2 - 1) // 2):
        name2_s = lines2[j * 2].split('\t')[0]
        name2 = name2_s.split('>')[1]
        seq2 = lines2[j * 2 + 1]
        # needle() apparently returns a percent-style score; report it /100.
        strt = name1 + ',' + name2 + ': ' + str(
            alignment.needle(seq1, seq2) / 100)
        print(strt)
        outfile.write(strt)
        outfile.write('\n')
        outfile.flush()
outfile.close()
Example #20
0
# IMPORT MODULES                                                               #
# ============================================================================ #

# Py2 snippet (bare print below).  Adds the user-supplied module folder to
# sys.path so the local `alignment` module can be imported.
Parse_Commandline()
module = _args.modulepath
module_folder = os.path.dirname(module)
if module_folder not in sys.path:
    sys.path.append(module_folder)

import alignment
from Bio import SeqIO

# ============================================================================ #
# MAIN PROGRAM                                                                 #
# ============================================================================ #

# id -> sequence maps for the two input FASTA files.
# NOTE(review): mode "rU" is py2-era (removed in py3.11); files are never
# closed in this snippet.
handle1 = open(_args.input[0], "rU")
records1 = {}
for record in SeqIO.parse(handle1, "fasta"):
    records1[record.id] = record.seq

handle2 = open(_args.input[1], "rU")
records2 = {}
for record in SeqIO.parse(handle2, "fasta"):
    records2[record.id]=record.seq

# Align matching alleles pairwise.
# NOTE(review): raises KeyError if an allele from file 1 is absent in file 2.
for allele in records1.keys():
    print allele
    alignment.needle(records1[allele], records2[allele])