Esempio n. 1
0
def getSignature2(host, readPath, writePath, count):
    """Build regex signatures for one host and write them to a file.

    Requests are aligned with the Smith-Waterman algorithm to find the
    best-matching subsequence, which is then post-processed into a signature.

    Every regular file directly inside ``readPath/host`` is read as UTF-8,
    one sample per line with surrounding whitespace stripped.  A file with
    more than ``count`` samples is thinned by taking every
    ``len(sample) // count``-th line (so slightly more than ``count`` samples
    may remain).  The samples are then folded left-to-right through
    ``alignment.water`` and the final result is converted into a regular
    expression in which each non-empty fragment becomes an escaped capture
    group surrounded by ``(.*)?`` wildcards.

    Files holding fewer than two samples contribute nothing.  Distinct
    signatures are written to ``writePath/host``, one per line, shortest
    first with ties broken lexicographically so the output is reproducible.
    Nothing is written when the host directory is missing or no signature
    was produced.

    Args:
        host: Name of the per-host sub-directory and of the output file.
        readPath: Directory containing the per-host sample directories.
        writePath: Directory that receives the signature file.
        count: Target number of samples per file; must be a positive integer.

    Returns:
        None.
    """
    wildcard = '(.*)?'
    sample_dir = os.path.join(readPath, host)
    if not os.path.isdir(sample_dir):
        return

    signatures = set()
    for name in os.listdir(sample_dir):
        sample_path = os.path.join(sample_dir, name)
        # Sub-directories and other non-regular entries cannot be read as
        # sample files; skip them instead of failing inside open().
        if not os.path.isfile(sample_path):
            continue

        with open(sample_path, 'rb') as f:
            sample = [raw.decode('utf-8').strip() for raw in f]

        if len(sample) > count:
            sample = sample[::len(sample) // count]
        if len(sample) < 2:
            continue

        # Fold all samples into a single alignment result.
        # NOTE(review): assumes alignment.water returns a str in which a
        # space marks a gap between matched fragments - confirm.
        symbol = sample[0]
        for other in sample[1:]:
            symbol = alignment.water(symbol, other)

        fragments = symbol.replace(' ', '(.*)').split('(.*)')
        signature = ''.join(
            wildcard + '(' + re.escape(fragment) + ')'
            for fragment in fragments
            if fragment
        )
        signatures.add(signature + wildcard)

    if not signatures:
        return

    # Sorting on (length, text) keeps the shortest-first order while making
    # the order of equal-length signatures independent of set iteration.
    ordered = sorted(signatures, key=lambda s: (len(s), s))
    with open(os.path.join(writePath, host), 'wb') as f:
        f.writelines((s + '\n').encode('utf-8') for s in ordered)
Esempio n. 2
0
 def check_duplicates(self, tweet1, tweet2):
     """Report whether two tweets are considered duplicates.

     Both tweets are lower-cased and split on whitespace; the resulting
     token lists are passed to ``alignment.water``.  Its result is stored
     on ``self.match_len`` and compared with ``self.threshold``.

     Returns:
         True when ``self.match_len >= self.threshold``, otherwise False.
     """
     tokens_a = tweet1.lower().split()
     tokens_b = tweet2.lower().split()
     self.match_len = alignment.water(tokens_a, tokens_b)
     return bool(self.match_len >= self.threshold)
Esempio n. 3
0
#!/usr/bin/env python

import sys, string
import alignment

# Demo input: the two sequences are hard-coded. Reading the first line of
# the files named by sys.argv[1] and sys.argv[2] is currently disabled.
seq1, seq2 = 'asdef', 'nibla'

# Run the local alignment on the demo pair; the return value is not used.
alignment.water(seq1, seq2)
Esempio n. 4
0
 def pair_merge(self, loga, logb):
     """Merge two log entries into a single space-joined template string.

     The ``log_split()`` output of both entries is passed to
     ``alignment.water``, which is expected to hand back two sequences.
     Walking the first sequence, each position whose value differs from
     the value at the same index of the second is replaced by ``'<*>'``;
     matching values are kept as they are.

     Returns:
         The merged values joined with single spaces.
     """
     aligned_a, aligned_b = alignment.water(loga.log_split(), logb.log_split())
     merged = [
         '<*>' if token != aligned_b[pos] else token
         for pos, token in enumerate(aligned_a)
     ]
     return " ".join(merged)
Esempio n. 5
0
from alignment import needle, water
# from msa import main
from fasta import getFasta
# Identifiers handed to getFasta(). Named sequence_ids so that the builtin
# id() is not shadowed at module level.
sequence_ids = ["B5CZ00", "B5CZ01", "B5CZ02"]

# Fetch every sequence up front, in the order of the identifiers.
seq = [getFasta(sequence_id) for sequence_id in sequence_ids]

# The pairwise alignments below only apply to exactly two sequences. With
# the three identifiers listed above this guard fires and the script stops.
if len(sequence_ids) != 2:
    # Raise SystemExit directly: exit() is a helper injected by the site
    # module for interactive sessions and is unavailable under `python -S`.
    raise SystemExit

needle(seq[0], seq[1])
water(seq[0], seq[1])