Example No. 1
import sys

from Bio import SeqIO
from suffix_trees import STree


def main():
    """ Runs the main script. """

    # argv input list
    input_files = sys.argv

    flu_fasta = input_files[1]
    rhino_fasta = input_files[2]
    patient_fastq = input_files[3]

    flu_fasta_ref = list(SeqIO.parse(flu_fasta, "fasta"))
    rhino_fasta_ref = list(SeqIO.parse(rhino_fasta, "fasta"))
    # the patient reads are FASTQ, not FASTA
    patient_fastq_reads = list(SeqIO.parse(patient_fastq, "fastq"))

    # creates array of string objects of the sequences
    flu_seq = [str(flu.seq) for flu in flu_fasta_ref]
    rhi_seq = [str(rhi.seq) for rhi in rhino_fasta_ref]

    # creating the suffix tree object
    flu_suffix = STree.STree(flu_seq)
    rhi_suffix = STree.STree(rhi_seq)

    # match counters; a read only scores when its entire sequence is found
    flu_score = 0
    rhi_score = 0

    positive_strain_list = []

    # check each patient read; the full read sequence must occur in a reference
    for read in patient_fastq_reads:

        if flu_suffix.find_all(str(read.seq)):
            flu_score += 1
            for record in flu_fasta_ref:
                if str(read.seq) in record.seq:
                    positive_strain_list.append(str(record.description))

        elif rhi_suffix.find_all(str(read.seq)):
            rhi_score += 1
            for record in rhino_fasta_ref:
                if str(read.seq) in record.seq:
                    positive_strain_list.append(str(record.description))

    with open('patient-report.txt', 'w') as output:
        output.write("Sequencing results are detecting the following:\n")
        output.write('\n'.join(positive_strain_list))
        if flu_score and not rhi_score:
            output.write("\nPatient is positive for the Influenza A virus.")
        elif rhi_score and not flu_score:
            output.write(
                "\nPatient is positive for the Human Rhinovirus Strain 89")
        elif not flu_score and not rhi_score:
            output.write(
                "\nInfluenza A virus and Human Rhinovirus not detected in patient."
            )
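A minimal sketch of the core membership test the loop above performs, with made-up reference strings and a made-up read (all values here are illustrative):

from suffix_trees import STree

refs = ["ACGTACGT", "TTGACCA"]  # stand-ins for the reference sequences
tree = STree.STree(refs)        # generalized suffix tree over all references
read = "GTAC"
if tree.find_all(read):         # non-empty result: the read occurs in a reference
    print("read matches a reference")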
Example No. 2
    def common_patterns(self, sequences):

        # The scores arrive as lists of symbols, but this function needs
        # them as strings, so join each one.
        list_of_sequences = ["".join(sequence) for sequence in sequences]

        # To keep the search fast, slide the window over the smallest string
        list_of_sequences = sorted(list_of_sequences, key=len)
        # Construct the generalised suffix tree
        suffix_tree = STree.STree(list_of_sequences)
        # List of common sequences
        matches = []
        # search window size
        window_size = len(suffix_tree.lcs())
        # Lower bound of the search window
        window_lower = 0
        # Upper bound of the search window
        window_upper = window_lower + window_size
        # If window size is 0 then suffix_tree.lcs() returned ""
        if window_size == 0:
            print('no match')
        elif suffix_tree.lcs() == list_of_sequences[0]:
            matches.append(suffix_tree.lcs())
            return matches
        # If there was a match then append it
        else:
            matches.append(suffix_tree.lcs())
        # Do this until the window size is 0
        while window_size != 0:
            # Create a copy of the sequence list
            list_of_sequences_copy = list_of_sequences[:]
            # Redefine the first element of the copy to only the elements in the window
            list_of_sequences_copy[0] = list_of_sequences[0][
                window_lower:window_upper]
            # Rebuild the suffix tree
            suffix_tree = STree.STree(list_of_sequences_copy)
            # If an unseen, non-empty match of length >= 5 is found, append it
            lcs = suffix_tree.lcs()
            if lcs not in matches and lcs != "" and len(lcs) >= 5:
                matches.append(lcs)
            # shift the window one to the right
            window_lower += 1
            window_upper += 1
            # Once the upper bound of the window reaches the end of the string
            if window_upper == len(list_of_sequences[0]) - 1:
                # Decrease the window size by one
                window_size -= 1
                # Reset the window back to the start
                window_lower = 0
                window_upper = window_lower + window_size
        return matches
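The window loop above leans entirely on one primitive: lcs() of a generalized suffix tree, which returns the longest substring shared by all of the input strings. A minimal illustration with toy inputs:

from suffix_trees import STree

tree = STree.STree(["xxabcdezz", "yyabcdeww", "abcde"])
print(tree.lcs())  # "abcde"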
Example No. 3
def multiclusterConversion(labels, n=2):
    # alphabet used to re-key the cluster labels
    alphabet = string.ascii_lowercase

    # map each distinct label sequence to a letter key (0 -> a, 1 -> b, ...)
    str_labels = np.array([("".join(str(a) for a in label))
                           for label in labels.T])

    # sorted() keeps the key assignment deterministic across runs
    code = {key: alphabet[i] for i, key in enumerate(sorted(set(str_labels)))}
    str_decoded = [code[code_i] for code_i in str_labels]

    # run-length encode the label string
    decoded_join, rle_decoded, rep_decoded = runLengthEncoding(str_decoded)
    decoded_wordLst, ngrams_decoded = Ngrams(rle_decoded, n=n)
    pos_decoded = NgramsPos(rep_decoded, n=n)

    # bag of words: find the most frequent n-gram
    bow = BagofWords(ngrams_decoded)
    mre_item = max(bow, key=lambda k: bow[k])
    print(mre_item)

    # suffix tree: locate every occurrence of the most frequent n-gram
    st = STree.STree(ngrams_decoded)
    indexs = st.find_all(mre_item)

    return ngrams_decoded, pos_decoded, indexs
Example No. 4
from suffix_trees import STree


def arbol(txt, findTxt):

    st = STree.STree(txt)
    first = st.find(findTxt)
    print("The first pattern occurs at position: {}".format(first))
    occurrences = st.find_all(findTxt)
    print("All positions: {}".format(occurrences))
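Usage sketch with toy inputs (note that find_all may report positions in no particular order):

arbol("abcabxabc", "abc")
# The first pattern occurs at position: 0
# All positions: [0, 6]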
Example No. 5
import re

from suffix_trees import STree


def find_file_index_lcs(file, byte_array, lcs):
    file_st = STree.STree(file)
    start_of_lcs_in_file = file_st.find(lcs)
    index = 0
    count = 0

    # walk the byte array until the character count reaches the LCS start
    while count < start_of_lcs_in_file:
        substring = re.split("'", str(byte_array[index]))
        byte = substring[1]  # the payload between the quotes of repr(bytes)
        count += len(byte)
        index += 1

    start_index = index

    # keep walking until the end of the LCS has been covered
    while count < (start_of_lcs_in_file + len(lcs)):
        substring = re.split("'", str(byte_array[index]))
        byte = substring[1]
        count += len(byte)
        index += 1

    end_index = index

    indices = [start_index, end_index]

    return indices
Example No. 6
from suffix_trees import STree


def test_lcs():
    a = [
        "abeceda", "abecednik", "abeabecedabeabeced", "abecedaaaa",
        "aaabbbeeecceeeddaaaaabeceda"
    ]
    st = STree.STree(a)
    assert st.lcs() == "abeced", "LCS test"
Example No. 7
from suffix_trees import STree


def test_missing():
    text = "name language w en url http w namelanguage en url http"
    stree = STree.STree(text)
    assert stree.find("law") == -1
    assert stree.find("ptth") == -1
    assert stree.find(
        "name language w en url http w namelanguage en url httpp") == -1
Example No. 8
    def createHeatMapColoring(self, template1, template2, no_sec_peak):
        k = 2

        # get a list of k-mers and their frequencies
        struct_profile1 = self.getStructProfile1().getProfile()
        struct_kmer_list = [struct_profile1]

        template = [template1]

        struct_profile_obj2 = self.getStructProfile2()
        if struct_profile_obj2 is not None:
            struct_profile2 = struct_profile_obj2.getProfile()
            struct_kmer_list.append(struct_profile2)
            template.append(template2)

        norm_vector = self.getNormVector()

        result = []

        for i in range(len(struct_kmer_list)):
            current_template = template[i]
            current_profil = struct_kmer_list[i]

            # create a suffix tree to find k-mer positions in the template
            template1_s_tree = STree.STree(current_template)

            color_hm1 = {str(i): 0 for i in range(1, len(current_template) + 1)}
            color_hm1, not_matched_kmer1, color_domain_max1 = createColorVector(k, template1_s_tree, current_profil,
                                                                                color_hm1, no_sec_peak, norm_vector)

            result.append([color_hm1, color_domain_max1, not_matched_kmer1])

        return result
Example No. 9


def build_One_STree(text):
    '''
    Build a single suffix tree over all of the texts. The resulting tree is
    too large to be written out to a disk file.
    '''
    start = time.time()
    st = STree.STree(text)
    print('Build Tree Total Time: ', time.time() - start)
    return st
Example No. 10
from suffix_trees import STree


def occurrence_of_string_sequence(strings, min_len):
    st = STree.STree(strings)
    longest = st.lcs()
    if len(longest) >= min_len:
        occurrences = st.find_all(longest)
        return len(occurrences)
    else:
        return 0
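Usage sketch: the longest common substring of the two toy inputs below is "abcd", which occurs three times across them, so with min_len=4 the function returns 3:

print(occurrence_of_string_sequence(["xabcdabcd", "abcdz"], 4))  # 3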
Example No. 11


def find_longest_repeat(ps):
    """ Finds the length of the longest repeated substring in the given
    string using a suffix tree (the depth of the deepest internal node).
    """
    from suffix_trees import STree
    st = STree.STree(ps)
    deepest = max(st.root._get_leaves(),
                  key=lambda x: x.parent.depth).parent.depth
    return deepest
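Note that this returns the length of the longest repeated substring, not the substring itself, and it reaches into the tree's internals (root._get_leaves, .parent.depth), which may differ between library versions. Usage sketch:

print(find_longest_repeat("banana"))  # 3, for the repeat "ana"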
Example No. 12
 def tf(self):
     # term frequency: occurrences of each unique term / total length
     unique = [x for i, x in enumerate(self) if i == self.index(x)]
     tf_dict = dict.fromkeys(unique, 0)
     tree = STree.STree(self)
     for i in unique:
         tf_dict[i] = len(tree.find_all(i))
     for i in tf_dict:
         tf_dict[i] = tf_dict[i] / float(len(self))
     return tf_dict
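A standalone equivalent of the method above for a plain string, assuming the same single-character term granularity (a sketch, not part of the original class):

from suffix_trees import STree

def tf(text):
    tree = STree.STree(text)
    # occurrences of each unique character divided by the text length
    return {ch: len(tree.find_all(ch)) / float(len(text))
            for ch in dict.fromkeys(text)}

print(tf("abca"))  # {'a': 0.5, 'b': 0.25, 'c': 0.25}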
Example No. 13
def smallest_k(sequences):
    t0 = datetime.now()
    print(t0.time(), " experiment start")
    # .seq._data reaches into Biopython internals for the raw sequence string
    sequences = [s.seq._data for s in sequences]

    print(datetime.now() - t0, " building tree...")
    tree = STree.STree(sequences)
    print(datetime.now() - t0, " finished building tree, bfs...")
    ret = bfs_find_shortest_uncommon_substring(tree, len(sequences))
    print(datetime.now() - t0, " finished all!")
    return ret
Example No. 14
def is_strand_in_all_files(files, strand):
    indices = []
    for i in range(0, 10):
        st = STree.STree(files[i][3])
        index = st.find_all(strand)
        if len(index) > 0:
            strand_start_end = find_file_index_lcs(files[i][3], files[i][2],
                                                   strand)
            element = [i + 1, strand_start_end[0]]
            indices.append(element)
    # print(indices)
    return indices
Example No. 15
 def longest_common_subsequence_size(self, other):
     # note: STree.lcs() returns the longest common *substring*
     # (a contiguous run), despite this method's name
     uniques = list(
         set([x.value for x in self.trace_files] +
             [x.value for x in other.trace_files]))
     uniques = sorted(uniques)
     mine = "".join([
         chr(ord('a') + bisect.bisect_left(uniques, x.value))
         for x in self.trace_files
     ])
     theirs = "".join([
         chr(ord('a') + bisect.bisect_left(uniques, x.value))
         for x in other.trace_files
     ])
     st = STree.STree([mine, theirs])
     common = st.lcs()
     return len(common)
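The method encodes each trace as a string by mapping every distinct value to a letter, then compares the two strings with lcs(). The trick in isolation, as a sketch with made-up values:

import bisect
from suffix_trees import STree

def encode(values, uniques):
    return "".join(chr(ord('a') + bisect.bisect_left(uniques, v))
                   for v in values)

a, b = [3, 1, 4, 1, 5], [9, 1, 4, 1, 3]
uniques = sorted(set(a + b))
st = STree.STree([encode(a, uniques), encode(b, uniques)])
print(len(st.lcs()))  # 3 -- the shared contiguous run 1, 4, 1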
Example No. 16


def build_N_STree(text, cut_point=30):
    # number of chunks of size cut_point, rounding up
    N = len(text) // cut_point
    if len(text) % cut_point != 0:
        N += 1
    strees = []
    start = time.time()
    for i in range(N):
        if i >= N - 1:
            new_text = text[cut_point * i:]
            # print('Number of documents in the last tree:', len(new_text))
        else:
            new_text = text[cut_point * i:cut_point * i + cut_point]
        strees.append(STree.STree(new_text))
    # the chunk size is stored as the list's final element
    strees.append(cut_point)
    # print('STrees Number: ', len(strees))
    print('Build %d Trees, ' % N, 'Total Build Time: ', time.time() - start)
    return strees
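Usage sketch: the trees come back as a list with the chunk size appended as the final element, so pop it off before querying (toy documents below):

docs = ["alpha", "beta", "gamma", "delta"]
strees = build_N_STree(docs, cut_point=2)  # two trees, then the cut_point
cut_point = strees.pop()                   # 2
print(any(st.find("mm") != -1 for st in strees))  # True ("gamma")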
Example No. 17
import sys

from Bio import SeqIO
from suffix_trees import STree


def main():
    """ runs main """

    # requires the fasta/txt file is in the same directory
    fasta_text = sys.argv[1]

    fasta_list = list(SeqIO.parse(fasta_text, "fasta"))

    sequence_list = [str(record.seq) for record in fasta_list]

    st = STree.STree(sequence_list)

    with open('output-lcs.txt', 'w') as output:
        output.write(st.lcs())
Example No. 18
def __construct_suffix_trees(sample_id, proteins):
    '''
        Creates a file on disk in the data folder named sample_id.pkl.
        Iterates through all the proteins and builds a map from protein_ids
        to suffix trees of their sequences.
    '''

    tree_map = {}
    for protein in proteins:
        protein_id = str(protein.protein_id)
        protein_seq = str(protein.protein_seq)
        tree_map[protein_id] = STree.STree(protein_seq)

    file_location = os.path.join(BASE_DIR, "data", sample_id + ".pkl")
    with open(file_location, 'wb') as f:
        pickle.dump(tree_map, f)

    return tree_map
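Reading the map back is the mirror image, assuming the same os/BASE_DIR module context as the function above (a sketch; deep trees can also exceed Python's default recursion limit when pickling, so sys.setrecursionlimit may be needed on the write side):

import pickle

def __load_suffix_trees(sample_id):
    # hypothetical counterpart to __construct_suffix_trees
    file_location = os.path.join(BASE_DIR, "data", sample_id + ".pkl")
    with open(file_location, 'rb') as f:
        return pickle.load(f)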
Example No. 19
def test_vectors():
    if os.getenv("SUFFIX_TREES_TEST_VECTORS") != "1":
        pytest.skip(
            "skipping vectors test. Set SUFFIX_TREES_TEST_VECTORS=1 to run.")

    with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), INPUT),
              'r') as f:
        for line in f:
            splitted = line.split(",")
            search_string = splitted[0]
            # Build tree on the search string.
            st = STree.STree(search_string)
            # Test cases
            for case in splitted[1:-1]:
                [s, res] = case.split(":")
                assert int(res) == st.find(
                    s
                ), "Search string: {}, Test case: {}, Expected: {}".format(
                    search_string, s, int(res))
Example No. 20
    def _load(self, dic_name, debug=False):
        """reads the .txt files from the given directory and builds the ngram
        and suffix tree data structures necessary for the search, as well as the
        conversion dictionary"""
        total = ""
        tindex = 0
        total_dict = {}
        n = set()
        v, i, a = (0, 0, 0)
        r = re.compile(r"(\d\d)(\d\d)(\d\d\d)\.txt")
        for filename in sorted(os.listdir(dic_name)):
            #for filename in sorted(listdir('Shaker_Manifesto', dic_name)):
            index = 0
            if filename.endswith(".txt"):
                if debug:
                    print("Processing {}".format(filename))
                v, i, a = map(int, r.match(filename).groups())
                w = ''
                with open(os.path.join(dic_name, filename), "rb") as file:
                    #with stream(__name__, "{}/{}".format(dic_name, filename)) as file:
                    for c in file.read().decode("utf8",
                                                errors="replace").lower():
                        if c in " \t\n\r\ufffd,./?\'\";:<>[]{}\\|+=_-()*&^%$#@!~`":
                            # guard: `total` is empty before the first kept character
                            if not total or total[-1] == ' ':
                                index += 1
                                continue
                            else:
                                total += ' '
                                n.add(w)
                                w = ''
                        else:
                            total += c
                            w += c
                        total_dict[tindex] = (v, i, a, index)
                        tindex += 1
                        index += 1

        self._tree = STree.STree(total)
        self._index_dict = total_dict
        self._ngram = NGram(n)
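With the tree and the position dictionary built, a lookup can map any hit straight back to its source file. A hypothetical helper on the same class (a sketch; find_source is not in the original):

    def find_source(self, phrase):
        """Locate a phrase and map the hit back to its
        (volume, issue, article, word index) tuple."""
        hit = self._tree.find(phrase)
        return self._index_dict[hit] if hit != -1 else None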
Example No. 21
def find_longest_strand_in_two_or_more_files(files):
    # sort files from largest to smallest
    sorted_files = sorted(files, key=itemgetter(1), reverse=True)
    files_array = []
    files_byte_array = []
    files_indices = []

    for i in range(0, 10):
        files_array.append(sorted_files[i][3])
        files_byte_array.append(sorted_files[i][2])
        files_indices.append(sorted_files[i][0])

    longest_length = 0
    longest_substr = ""
    files_found = []
    offsets = []

    # print_lengths_of_arrays(files_byte_array)

    broken = False
    for i in range(0, 10):
        for j in range(i + 1, 10):
            if (longest_length < len(files_byte_array[i])
                    and longest_length < len(files_byte_array[j])):
                arry = [files_array[i], files_array[j]]
                suffix_tree = STree.STree(arry)
                lcs = suffix_tree.lcs()

                file_1_index = find_file_index_lcs(files_array[i],
                                                   files_byte_array[i], lcs)
                file_2_index = find_file_index_lcs(files_array[j],
                                                   files_byte_array[j], lcs)

                length = -1
                if (file_1_index[1] - file_1_index[0] !=
                        file_2_index[1] - file_2_index[0]):
                    print("LCS bytes are not equal!!")
                else:
                    length = file_1_index[1] - file_1_index[0]

                if length > longest_length:
                    longest_length = length
                    longest_substr = lcs
                    files_found = [files_indices[i], files_indices[j]]
                    offsets = [file_1_index[0], file_2_index[0]]
                elif length == longest_length:
                    indices = is_strand_in_all_files(files, lcs)
                    if len(indices) > len(files_found):
                        print("More files with this strand!")
                        longest_substr = lcs
                        files_found = []
                        offsets = []
                        for index in indices:
                            files_found.append(index[0])
                            offsets.append(index[1])
            elif longest_length > len(files_byte_array[i]):
                broken = True
            else:
                break
        if broken:
            break

    indices = is_strand_in_all_files(files, longest_substr)
    files_found = []
    offsets = []
    for index in indices:
        files_found.append(index[0])
        offsets.append(index[1])

    length_filenames_offsets = {
        "length": longest_length,
        "file names": files_found,
        "offsets": offsets
    }
    # print(length_filenames_offsets)
    # check_lcs_of_finds(files, files_found, offsets, longest_length)
    return length_filenames_offsets
Example No. 22
# print(s1)

f = open("BS19B031_Q1.txt", 'r')
text2 = f.read()
text2 = text2.lower()
tokenizer = RegexpTokenizer(r'\w+')
text2 = tokenizer.tokenize(text2)
s2 = ""
for i in text2:
    s2 += i
# print(s2)

mark = np.zeros(len(text2))
k = 4
temp = ""
st = STree.STree(s1)

for i in range(len(text2) - k):
    temp = ""
    for j in range(k):
        temp += text2[i + j]
    if st.find(temp) != -1:
        for j in range(k):
            mark[i + j] = 1
ans = ""
for i in range(len(text2)):
    if mark[i] == 1:
        ans += text2[i]
        ans += " "
    else:
        if ans != "":
            # likely continuation (the source snippet is truncated here):
            # emit the accumulated matched run, then reset it
            print(ans)
            ans = ""
Example No. 23
def tree_test():
    # smoke test: building a generalized suffix tree should not raise
    a = ["rpazu", "zupa"]
    tree = STree.STree(a)
Example No. 24
 def make_tree(self, ex):
     # attach a suffix tree over the lowercased text to the example record
     ex['st'] = STree.STree(ex['text'].lower())
     return ex
Example No. 25
import random
import string

from suffix_trees import STree


def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
    return ''.join(random.choice(chars) for _ in range(size))

if __name__ == '__main__':
    a = [
        "abeceda", "abecednik", "abeabecedabeabeced", "abecedaaaa",
        "aaabbbeeecceeeddaaaaabeceda"
    ]
    st = STree.STree(a)
    print(st.lcs())

    text = "name language w en url http w namelanguage en url http"
    stree = STree.STree(text)
    print(stree.find('law'))

    st = STree.STree("abcdefghab")
    print(st.find("abc"))  # 0
    print(st.find_all("ab"))  # expected [0, 8], but observed [] (library bug)
Example No. 26
from suffix_trees import STree


def get_lcs(chunk_hashes):
    chunk_hashes_input = ['$'.join(chunk_hash) for chunk_hash in chunk_hashes]
    st = STree.STree(chunk_hashes_input)
    return st.lcs()
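Usage sketch: each element of chunk_hashes is a list of per-chunk hash strings; joining on '$' makes each document a single string before the tree is built (hash values below are made up):

chunks = [["h1", "h2", "h3"], ["h0", "h2", "h3"]]
print(get_lcs(chunks))  # "$h2$h3"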
Example No. 27


def get_lcs_between2(n, st):
    # reconstructed opening (the snippet is truncated in the source):
    # track the longest pairwise LCS and which two strings produced it
    lcs = ""
    index1 = -1
    index2 = -1
    for i in range(0, n):
        for j in range(i, n):
            if i != j:
                temp = st.lcs(stringIdxs=[i, j])
                if len(temp) > len(lcs):
                    lcs = temp
                    index1 = i
                    index2 = j
    return (lcs, index1, index2)


def get_offset(n, lcs):
    offset = []
    for i in range(0, n):
        pos = ar[i].find(lcs)
        offset.append(pos)
    return offset


ar = init(10)  # read the files in
st = STree.STree(ar)  # build the suffix tree

lcs1 = st.lcs()  # LCS across all strings (for curiosity)
# print("Length of byte strands across all files ", len(lcs1), " ", lcs1)

lcs2, index1, index2 = get_lcs_between2(
    10, st)  # the actual longest common string between any two files

offsets = get_offset(10, lcs2)
print("Length of most common strand %d" % len(lcs2))
print("File sample.%d offset at %d" % (index1 + 1, offsets[index1]))
print("File sample.%d offset at %d" % (index2 + 1, offsets[index2]))
"""
requirements:

```
$ pip install git+https://github.com/Nanguage/suffix-trees
```

"""

import os
from suffix_trees import STree, visualize
from Bio.Seq import reverse_complement


if __name__ == "__main__":
    s1 = "AAAACGTCGGGATCG"
    s2 = "GGGCGTAAAGCTCT"
    T = STree.STree([s1, s2])
    Tv = visualize.VisualizeTree(T)
    dot = Tv.to_graphviz()
    dot.save("img/q3.dot")
    os.system("dot -Tpdf img/q3.dot > img/q3.pdf")

    s1_rc = reverse_complement(s1)
    s2_rc = reverse_complement(s2)
    T = STree.STree([s1_rc, s2_rc])
    Tv = visualize.VisualizeTree(T)
    dot = Tv.to_graphviz()
    dot.save("img/q3-rc.dot")
    os.system("dot -Tpdf img/q3-rc.dot > img/q3-rc.pdf")
Example No. 29
 )  #the n + 1 note is to be trained at
 if n > 1:
     #note estimation based on simple syntax analysis
     current_note = digits_to_str(
         [x[2 * j + index - 1][n - 1]])
     for p, q in enumerate(
             x[2 * j + index - 1]
     ):  #getting a list of all notes played
         if digits_to_str([
                 q
         ]) not in letters and p < n and q != 0:
             letters.append(digits_to_str([q]))
             numbers.append(q)
     freq = [0] * len(letters)
     st = STree.STree(
         digits_to_str(
             x[2 * j + index - 1]
             [0:n]))  #building the suffix tree
     for q, let in enumerate(letters):
         tmp = st.find_all(
             current_note + let
         )  #determining how frequent the current_note+other_note combination is
         if isinstance(tmp, list):
             freq[q] += len(tmp)
         else:
             freq[q] += 1
     x[2 * j +
       index - 1][n] = numbers[freq.index(
           max(freq)
       )]  #adding the most frequent as another feature
     letters = []
     numbers = []
Example No. 30
"""
requirements:

```
$ pip install git+https://github.com/Nanguage/suffix-trees
```

"""

import os
from suffix_trees import STree, visualize

if __name__ == "__main__":
    s1 = "ACGT"
    s2 = "TGCA"
    T = STree.STree([s1, s2])
    Tv = visualize.VisualizeTree(T)
    dot = Tv.to_graphviz()
    dot.save("img/q1.dot")
    os.system("dot -Tpdf img/q1.dot > img/q1.pdf")