def _signature_from_alignment(symbol):
    """Turn an aligned string into a regex signature.

    Spaces in *symbol* are treated as gaps: the string is cut at every
    space (and at every literal ``(.*)``), each non-empty fragment is
    regex-escaped and wrapped in its own capture group, and every group is
    preceded by the ``(.*)?`` wildcard. One more wildcard closes the
    signature.
    """
    wildcard = '(.*)?'
    fragments = symbol.replace(' ', '(.*)').split('(.*)')
    groups = [wildcard + '(' + re.escape(part) + ')' for part in fragments if part]
    return ''.join(groups) + wildcard


def getSignature2(host, readPath, writePath, count):
    """Generate regex signatures for *host* from its sample files.

    Uses Smith-Waterman alignment on the requests to find the best common
    subsequence, then processes that subsequence into a signature.

    Every file in ``readPath/host/`` is read as UTF-8, one sample per line.
    The lines of a file are folded together with ``alignment.water`` (which
    is expected to return the aligned string -- confirm against the
    ``alignment`` module in use) and the result is converted into a regex.
    The distinct signatures are written, one per line, to
    ``writePath/host``, shortest first.

    Args:
        host: Name of the sub-directory to read and of the file to write.
        readPath: Directory that contains one sub-directory per host.
        writePath: Directory the signature file is written into.
        count: Rough upper bound on the number of lines used per file;
            longer files are thinned by taking every k-th line. Values
            below 1 disable the thinning instead of raising
            ``ZeroDivisionError`` or producing a negative slice step.

    Returns:
        None. Nothing is written when the host directory is missing or no
        file contains at least two lines.
    """
    sample_dir = readPath + '/' + host + '/'
    if not os.path.exists(sample_dir):
        return

    signatures = set()
    for file_name in os.listdir(sample_dir):
        with open(sample_dir + file_name, 'rb') as f:
            sample = [raw.decode('utf-8').strip() for raw in f]

        # Thin out long files; the guard keeps count <= 0 from dividing by
        # zero or reversing the sample.
        if count > 0 and len(sample) > count:
            step = int(len(sample) / count)
            sample = sample[::step]

        # At least two lines are needed to align anything.
        if len(sample) < 2:
            continue

        symbol = alignment.water(sample[0], sample[1])
        for line in sample[2:]:
            symbol = alignment.water(symbol, line)
        signatures.add(_signature_from_alignment(symbol))

    if not signatures:
        return

    # Break length ties by value so the output file is identical from run
    # to run; set iteration order alone depends on string hashing.
    ordered = sorted(signatures, key=lambda s: (len(s), s))
    with open(writePath + '/' + host, 'wb') as f:
        f.write(''.join(s + '\n' for s in ordered).encode('utf-8'))
def check_duplicates(self, tweet1, tweet2):
    """Report whether two tweets align at or above ``self.threshold``.

    Both tweets are lower-cased and split on whitespace, and the two token
    lists are handed to ``alignment.water``. Whatever it returns is kept on
    ``self.match_len`` (presumably a match length or score -- confirm
    against the ``alignment`` module) and compared with ``self.threshold``.

    Returns:
        True if ``self.match_len >= self.threshold``, otherwise False.
    """
    tokens_a = tweet1.lower().split()
    tokens_b = tweet2.lower().split()
    self.match_len = alignment.water(tokens_a, tokens_b)
    return bool(self.match_len >= self.threshold)
#!/usr/bin/env python
# Runs alignment.water once on two fixed example strings.
import sys
import string

import alignment

# Command-line arguments
# Disabled: the lines below read the first line of the files named by
# sys.argv[1] and sys.argv[2] as the two sequences.
#     f1 = open(sys.argv[1], 'r')
#     seq1 = f1.readline()
#     seq1 = string.strip(seq1)
#
#     f2 = open(sys.argv[2], 'r')
#     seq2 = f2.readline()
#     seq2 = string.strip(seq2)

# Fixed inputs used in place of the command-line files.
seq1, seq2 = 'asdef', 'nibla'
alignment.water(seq1, seq2)
def pair_merge(self, loga, logb):
    """Merge two log entries into a single space-joined template string.

    The ``log_split()`` results of both entries are passed to
    ``alignment.water``, which is unpacked into two aligned sequences. The
    first sequence is walked position by position: where the entry at the
    same position of the second sequence differs, ``'<*>'`` is emitted;
    otherwise the shared value is kept.

    Returns:
        The merged entries joined with single spaces.
    """
    aligned_a, aligned_b = alignment.water(loga.log_split(), logb.log_split())
    merged = [
        '<*>' if token != aligned_b[pos] else token
        for pos, token in enumerate(aligned_a)
    ]
    return " ".join(merged)
# Fetches one sequence per identifier with getFasta and, when exactly two
# identifiers are listed, runs needle and water on the pair.
import sys

from alignment import needle, water
# from msa import main
from fasta import getFasta

# Identifiers passed to getFasta. Named seq_ids rather than id so the
# builtin id() is not shadowed for the rest of the module.
seq_ids = ["B5CZ00", "B5CZ01", "B5CZ02"]
seq = [getFasta(seq_id) for seq_id in seq_ids]

# needle and water are called only when exactly two identifiers are listed;
# any other count ends the script.
# NOTE(review): three identifiers are hard-coded above, so as written the
# alignment branch is never reached. The commented-out msa import suggests
# a multiple-alignment path was planned -- confirm.
if len(seq_ids) == 2:
    needle(seq[0], seq[1])
    water(seq[0], seq[1])
else:
    # sys.exit() instead of the site-provided exit(), which is intended for
    # the interactive shell and is absent when site is not loaded.
    sys.exit()