Beispiel #1
0
def test():
    stree = GeneralisedSuffixTree(['mississippi'])
    for shared in stree.sharedSubstrings(2):
        for seq,start,stop in shared:
            print seq, '['+str(start)+':'+str(stop)+']',
            print stree.sequences[seq][start:stop],
            print stree.sequences[seq][:start]+'|'+stree.sequences[seq][start:stop]+'|'+stree.sequences[seq][stop:]
Beispiel #2
0
def test():
    stree = GeneralisedSuffixTree(['mississippi'])
    for shared in stree.sharedSubstrings(2):
        for seq, start, stop in shared:
            print seq, '[' + str(start) + ':' + str(stop) + ']',
            print stree.sequences[seq][start:stop],
            print stree.sequences[seq][:start] + '|' + stree.sequences[seq][
                start:stop] + '|' + stree.sequences[seq][stop:]
Beispiel #3
0
def lcsm(strings):
    """Return the longest substring shared by ``strings``, or None.

    Each item yielded by ``sharedSubstrings()`` is a group of
    (sequence index, start, stop) occurrences; groups are ranked by the
    length of their first occurrence and the first longest group wins.

    Returns None when the tree reports no shared substring at all, or
    when a TypeError is raised while ranking or slicing the result.
    """
    stree = GeneralisedSuffixTree(strings)
    try:
        groups = list(stree.sharedSubstrings())
        if not groups:
            # max() raises ValueError (not TypeError) on an empty sequence,
            # so the "nothing shared" case must be handled explicitly.
            return None
        max_tuples = max(groups, key=lambda ss: ss[0][2] - ss[0][1])
        num, start, stop = max_tuples[0]
        return stree.sequences[num][start:stop]
    except TypeError:
        return None
Beispiel #4
0
def getMAXchSTR(string):
    """Return ``(length, substring)`` for the longest shared substring of ``string``.

    Scans every (sequence index, start, stop) occurrence reported by
    ``sharedSubstrings(15)`` and keeps the first one with the greatest
    ``stop - start``.  Returns ``(0, "")`` when nothing is reported.
    """
    tree = GeneralisedSuffixTree([string])
    best_len, best_text = 0, ""
    # Original note: 15 corresponds to five Chinese characters (presumably
    # three UTF-8 bytes each -- confirm the input is a UTF-8 byte string).
    for group in tree.sharedSubstrings(15):
        for idx, begin, end in group:
            width = end - begin
            if width <= best_len:
                continue
            best_len = width
            best_text = tree.sequences[idx][begin:end]
    return best_len, best_text
Beispiel #5
0
def getMAXchSTR(string):
    """Find the longest shared substring the suffix tree reports for ``string``.

    Returns a ``(length, substring)`` tuple; ``(0, "")`` if the tree
    yields no occurrence.  On ties the earliest occurrence is kept.
    """
    suffixes = GeneralisedSuffixTree([string])
    longest = 0
    text = ""
    # 15 is described by the original author as "five chinese characters".
    occurrences = (occ
                   for group in suffixes.sharedSubstrings(15)
                   for occ in group)
    for which, lo, hi in occurrences:
        if hi - lo > longest:
            longest = hi - lo
            text = suffixes.sequences[which][lo:hi]
    return longest, text
Beispiel #6
0
def genrule(str_seq):
	stree = GeneralisedSuffixTree(str_seq)
	
	for shared in stree.sharedSubstrings(50):
		print '-'*70
	#	print shared
		for seq,start,stop in shared:
			if stop-start > 20:
				print seq, '['+str(start)+':'+str(stop)+']',
				print str_seq[seq][start:stop],
				print str_seq[:start]+'|'+ str_seq[seq][start:stop]+'|'+ str_seq[seq][stop:]
	print '='*70
Beispiel #7
0
def LCS(s1, s2):
    """Return the longest substring shared by two UTF-8 encoded byte strings.

    Both inputs are decoded to unicode before the suffix tree is built.
    The result is re-encoded to UTF-8, so callers keep receiving a byte
    string as before.

    Raises ValueError (from ``max``) when the tree reports no shared
    substring, and UnicodeDecodeError when an input is not valid UTF-8.
    """
    from suffix_tree import GeneralisedSuffixTree
    decoded = [unicode(s1, 'utf-8'), unicode(s2, 'utf-8')]
    stree = GeneralisedSuffixTree(decoded)
    longs = set()
    for shared in stree.sharedSubstrings():
        for seq, start, stop in shared:
            # The tree was built from the decoded text, so its offsets are
            # presumably character positions; applying them to the raw
            # byte strings (as the original did) cuts multi-byte
            # characters apart for non-ASCII input.
            longs.add(decoded[seq][start:stop])
    # Rank by character count, then hand back bytes like the inputs.
    return max(longs, key=len).encode('utf-8')
Beispiel #8
0
def parseBench(f1, f2, f3, f4, f5):
    """Count byte sequences shared by f1 and f2, and how many recur in f3..f5.

    Each file is run through ``parse``; only the second element of its
    4-tuple result (an iterable of byte values) is used.  The byte values
    are hex-encoded and a suffix tree over the first two files yields the
    shared substrings (``sharedSubstrings(20)``).

    Returns ``(matches, matches3, matches4, matches5)``: dicts mapping a
    match length in bytes (hex length / 2) to a count.  ``matches`` counts
    every occurrence located in f1; ``matches3`` those also present in
    f3; ``matches4`` those present in f3 and f4; ``matches5`` those
    present in f3, f4 and f5.
    """
    # Parse everything first, in argument order, before any conversion.
    byte_lists = []
    for path in (f1, f2, f3, f4, f5):
        _, values, _, _ = parse(path)
        byte_lists.append(values)

    raw = [''.join(chr(v) for v in values) for values in byte_lists]
    hex1arr, hex2arr, hex3arr, hex4arr, hex5arr = [
        binascii.hexlify(blob) for blob in raw]

    matches, matches3, matches4, matches5 = {}, {}, {}, {}
    # A match only counts for a later file if it was found in all earlier
    # ones, hence the ordered cascade with an early exit.
    cascade = ((hex3arr, matches3), (hex4arr, matches4), (hex5arr, matches5))

    stree = GeneralisedSuffixTree([hex1arr, hex2arr])
    for shared in stree.sharedSubstrings(20):
        for seq, start, stop in shared:
            if seq != 0:
                continue
            leng = (stop - start) / 2  # two hex digits per byte
            matches[leng] = matches.get(leng, 0) + 1
            match = hex1arr[start:stop]
            for haystack, counter in cascade:
                if match not in haystack:
                    break
                counter[leng] = counter.get(leng, 0) + 1

    return (matches, matches3, matches4, matches5)
Beispiel #9
0
def main():
    """Print the longest substring shared by the first two lines of a file.

    The file path is taken from ``sys.argv[1]``.  Both lines are stripped
    of surrounding whitespace before the suffix tree is built.  An empty
    line is printed when no shared substring is reported.
    """
    with open(sys.argv[1], 'r') as handle:
        first = handle.readline().strip()
        second = handle.readline().strip()

    seqs = [first, second]
    tree = GeneralisedSuffixTree(seqs)

    best = ''
    for group in tree.sharedSubstrings():
        for idx, lo, hi in group:
            candidate = seqs[idx][lo:hi]
            # Strict comparison: the earliest of equally long hits is kept.
            if len(candidate) > len(best):
                best = candidate

    print(best)
Beispiel #10
0
    def __computeLCS(self, stringList):
        '''
        Return a one-element list holding the longest substring shared by
        the strings in stringList ([""] when nothing is shared).

        When __isComputable rejects the input, the list is halved, each
        half is solved recursively, and the two partial results are solved
        together.
        '''
        # Characters occurring in stringList.
        alphabet = self.__getAlphabet(stringList)

        # Original note: the alphabet may need too many characters to leave
        # enough terminal characters for each string; split the work then.
        if not self.__isComputable(stringList, alphabet):
            middle = len(stringList) / 2
            leftLCS = self.__computeLCS(stringList[0:middle])
            rightLCS = self.__computeLCS(stringList[middle:len(stringList)])
            return self.__computeLCS(leftLCS + rightLCS)

        # Work on translated strings; translationDict maps each translated
        # character back to its original.
        stringList, translationDict = self.__translateCharacters(
            stringList, alphabet)

        stree = GeneralisedSuffixTree(stringList)
        sharedSubstrings = [stree.sequences[seq][start:stop]
                            for shared in stree.sharedSubstrings()
                            for seq, start, stop in shared]
        if not sharedSubstrings:
            return [""]

        # max() keeps the first of several equally long candidates.
        lcs = max(sharedSubstrings, key=len)

        # Back translate
        for translatedChar, originalChar in translationDict.iteritems():
            lcs = lcs.replace(translatedChar, originalChar)
        return [lcs]
Beispiel #11
0
def lcsm(strings):
    """Return the longest substring shared by ``strings``.

    Groups yielded by ``sharedSubstrings()`` are compared by the length
    of their first (sequence index, start, stop) occurrence; the winning
    occurrence is sliced out of the corresponding input string.

    Raises ValueError (from ``max``) when no group is yielded.
    """
    def span(occurrences):
        # Length of the group's first occurrence.
        first = occurrences[0]
        return first[2] - first[1]

    tree = GeneralisedSuffixTree(strings)
    longest = max(tree.sharedSubstrings(), key=span)
    which, begin, end = longest[0]
    return strings[which][begin:end]
Beispiel #12
0
# Complement of each DNA base.
baseComplement = {
    'A': 'T',
    'C': 'G',
    'G': 'C',
    'T': 'A',
}


def revc(seq):
    """Return the reverse complement of ``seq``.

    The bases are read back to front and each is replaced through
    ``baseComplement``.  Raises KeyError for any symbol other than
    A, C, G or T.
    """
    complemented = []
    for base in reversed(seq):
        complemented.append(baseComplement[base])
    return "".join(complemented)


# Build a random string, which should have some short reverse complements already.
bases = ['A', 'C', 'G', 'T']
data = ''.join(choice(bases) for i in xrange(400000))
#data = "AGGGTTTCCCTGACCTTCACTGCAGGTCATGCA"
# revc  TGCATGACCTGCAGTGAAGGTCAGGGAAACCCT
#       012345678901234567890123456789012
#                 1         2         3

print "Got data"
revdata = revc(data)
print "Got reverse data"

n = len(data)
minlength = 18
tree = GeneralisedSuffixTree([data, revdata])
for shared in tree.sharedSubstrings(minlength):
    _, start, stop = shared[0]
    seq = data[start:stop]
    _, rstart, rstop = shared[1]
    rseq = data[n - rstop:n - rstart]
    print "Match: {0} at [{1}:{2}] and {3} at [{4}:{5}]".format(
        seq, start, stop, rseq, n - rstop, n - rstart)
Beispiel #13
0

def validateSubstring(strings, seq):
    """Return True when ``seq`` occurs in every string of ``strings``.

    An empty ``strings`` collection yields True.
    """
    return all(s.find(seq) != -1 for s in strings)


with open('rosalind_lcs.txt') as spec:
    data = [seq.strip() for seq in spec]

    # The generalized suffix tree doesn't work well with a large number of strings.
    # Use the first 10 to generate candidates, and then compare each candidate
    # (in decreasing length order) to the data to find a common substring.
    tree = GeneralisedSuffixTree(data[:10])
    candidates = []
    for shared in tree.sharedSubstrings(5):
        for seq, start, stop in shared:
            candidates.append(tree.sequences[seq][start:stop])
            break

    candidates.sort(cmp=None, key=lambda s: len(s), reverse=True)
    for c in candidates:
        if validateSubstring(data, c):
            print c
            print len(c)
            break
    else:
        print "No common string found!"
from random import choice

# Maps each DNA base to its complementary base.
baseComplement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}

def revc(seq):
    """Return ``seq`` reversed with every base swapped for its complement.

    Raises KeyError when ``seq`` contains a symbol missing from
    ``baseComplement``.
    """
    return "".join(baseComplement[base] for base in reversed(seq))


# Build a random string, which should have some short reverse complements already.
bases = ['A', 'C', 'G', 'T']
data = ''.join(choice(bases) for i in xrange(400000)) 
#data = "AGGGTTTCCCTGACCTTCACTGCAGGTCATGCA"
# revc  TGCATGACCTGCAGTGAAGGTCAGGGAAACCCT
#       012345678901234567890123456789012
#                 1         2         3

print "Got data"
revdata = revc(data)
print "Got reverse data"

n = len(data)
minlength = 18
tree = GeneralisedSuffixTree([data, revdata])
for shared in tree.sharedSubstrings(minlength):
    _, start, stop = shared[0]
    seq = data[start:stop]
    _, rstart, rstop = shared[1]
    rseq = data[n-rstop:n-rstart]
    print "Match: {0} at [{1}:{2}] and {3} at [{4}:{5}]".format(seq, start, stop, rseq, n-rstop, n-rstart)

Beispiel #15
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from suffix_tree import GeneralisedSuffixTree

# s1 = u'mississippi'
# s2 = u'sippissi'

s1 = u'一寸光阴一寸金';
s2 = u'寸金难买寸光阴';
stree = GeneralisedSuffixTree([s1,s2]) 

for shared in stree.sharedSubstrings(2):
    print '-'*70
    for (seq,start,stop) in shared:
       print seq, 
       print '['+str(start)+':'+str(stop)+']',
       ss = stree.sequences[seq][start:stop]
       print ss.encode('utf-8'),
       at = stree.sequences[seq][:start]+\
                    '{'+stree.sequences[seq][start:stop]+'}'+\
                    stree.sequences[seq][stop:]
       print at.encode('utf-8')
print '='*70 
import sys

from suffix_tree import GeneralisedSuffixTree

# Read the two texts to compare from the first two lines of the input file.
# The file is closed as soon as its content has been read.
with open(sys.argv[1], 'r') as source:
    Input = source.read().split("\n")

text1 = Input[0].strip()
text2 = Input[1].strip()

stree = GeneralisedSuffixTree([text1, text2])

# Track the longest shared substring, judged by the first occurrence of each
# group.  longestSS starts out empty so that the final write cannot hit a
# NameError when the tree reports nothing shared.
max_len = 0
longestSS = ''
for shared in stree.sharedSubstrings():
    seq, start, stop = shared[0]
    if (stop - start) > max_len:
        max_len = stop - start
        longestSS = stree.sequences[seq][start:stop]

# Write the result; the with-block guarantees the file is flushed and
# closed.  Note: output.txt is now created only once the result is known.
with open("output.txt", 'w') as out:
    print >> out, longestSS
Beispiel #17
0
from suffix_tree import GeneralisedSuffixTree

def validateSubstring(strings, seq):
    """Tell whether ``seq`` is a substring of each element of ``strings``.

    Returns False as soon as one element lacks ``seq``; True otherwise,
    including for an empty ``strings``.
    """
    return not any(s.find(seq) == -1 for s in strings)

    
with open('rosalind_lcs.txt') as spec:
    data = [seq.strip() for seq in spec]
    
    # The generalized suffix tree doesn't work well with a large number of strings.
    # Use the first 10 to generate candidates, and then compare each candidate
    # (in decreasing length order) to the data to find a common substring.
    tree = GeneralisedSuffixTree(data[:10])
    candidates = []
    for shared in tree.sharedSubstrings(5):
        for seq, start, stop in shared:
            candidates.append(tree.sequences[seq][start:stop])
            break
    
    candidates.sort(cmp=None, key=lambda s: len(s), reverse=True)
    for c in candidates:
        if validateSubstring(data, c):
            print c
            print len(c)
            break
    else:
        print "No common string found!"
        
Beispiel #18
0
import sys

from suffix_tree import GeneralisedSuffixTree

# The first two lines of the input file are the texts to compare; the
# with-block closes the file right after reading.
with open(sys.argv[1], 'r') as source:
    Input = source.read().split("\n")

text1 = Input[0].strip()
text2 = Input[1].strip()

stree = GeneralisedSuffixTree([text1, text2])

# Keep the longest shared substring seen so far (first occurrence of each
# group).  The empty default prevents a NameError at the final write when
# no shared substring is reported.
max_len = 0
longestSS = ''
for shared in stree.sharedSubstrings():
    seq, start, stop = shared[0]
    if (stop - start) > max_len:
        max_len = stop - start
        longestSS = stree.sequences[seq][start:stop]

# The output file is opened only once the result is known and is closed
# (and therefore flushed) deterministically.
with open("output.txt", 'w') as out:
    print >> out, longestSS

Beispiel #19
0
def parseBench(f1, f2, f3, f4, f5):
    """Tabulate byte sequences shared by f1 and f2 and their recurrence in f3..f5.

    Every file goes through ``parse``; only the second element of its
    4-tuple result (an iterable of byte values) is used.  The values are
    hex-encoded, a suffix tree over the first two files supplies the
    shared substrings (``sharedSubstrings(20)``), and occurrences located
    in f1 are counted per length in bytes:

    * row 1 -- all occurrences,
    * row 2 -- those also found in f3,
    * row 3 -- those found in f3 and f4,
    * row 4 -- those found in f3, f4 and f5.

    The table is written to ``sys.stdout`` as tab-separated rows below a
    header numbered from 10; the first row is labelled ``f1[0:-3]``
    (presumably the file name without a three-character suffix).
    Returns None.
    """
    def as_hex(values):
        # Byte values -> byte string -> hex text (two digits per byte).
        return binascii.hexlify(''.join(chr(x) for x in values))

    def bump(counter, key):
        counter[key] = counter.get(key, 0) + 1

    # Parse all inputs up front, in argument order.
    arrays = []
    for path in (f1, f2, f3, f4, f5):
        _, arr, _, _ = parse(path)
        arrays.append(arr)
    hex1arr, hex2arr, hex3arr, hex4arr, hex5arr = [as_hex(a) for a in arrays]

    matches, matches3, matches4, matches5 = {}, {}, {}, {}
    stree = GeneralisedSuffixTree([hex1arr, hex2arr])
    for shared in stree.sharedSubstrings(20):
        for seq, start, stop in shared:
            if seq != 0:
                continue
            leng = (stop - start) / 2
            bump(matches, leng)
            match = hex1arr[start:stop]
            # Each later file only counts if all earlier ones matched.
            if match not in hex3arr:
                continue
            bump(matches3, leng)
            if match not in hex4arr:
                continue
            bump(matches4, leng)
            if match in hex5arr:
                bump(matches5, leng)

    write = sys.stdout.write
    label = f1[0:-3]
    pad = " " * len(label)

    # Header: one column number per distinct length, starting at 10.
    write(pad + "\t")
    write("".join(str(col + 10) + "\t" for col in range(len(matches))))

    # First data row carries the label; the header line is ended here.
    write("\n" + label + "\t")
    write("".join(str(matches[k]) + "\t" for k in sorted(matches)))
    write("\n")

    # Remaining rows are indented to line up under the label.
    for counter in (matches3, matches4, matches5):
        write(pad + "\t")
        write("".join(str(counter[k]) + "\t" for k in sorted(counter)))
        write("\n")