Ejemplo n.º 1
0
class Ner():
    """Find occurrences of a set of entity words and label them with one name."""

    def __init__(self):
        """Start with no entity words, an empty label and a fresh AC-model builder."""
        # Name that replaces / labels every matched entity word
        self._ner_name = ""

        # Collection of all entity words
        self._ner_word_list = []

        # Builder of the AC model
        self._builder = AcoraBuilder()

    def set_ner_word_list(self, ner_word_list):
        """Set the collection of entity words."""
        self._ner_word_list = ner_word_list

    def set_ner_name(self, ner_name):
        """Set the name reported for every matched entity word."""
        self._ner_name = ner_name

    def build_ner(self):
        """Add every entity word to the builder and build the model."""
        for entity_word in self._ner_word_list:
            self._builder.add(entity_word)

        # The built model is only available after this method has run
        self._tree = self._builder.build()

    def hit(self, content_str):
        """Return one ``[word, position, name]`` list per match in ``content_str``."""
        return [
            [matched_word, position, self._ner_name]
            for matched_word, position in self._tree.finditer(content_str)
        ]
Ejemplo n.º 2
0
    def __init__(self, term_index):
        """Keep ``term_index`` and build a matcher over the items it yields.

        Every item produced by iterating ``term_index`` is added to an
        ``AcoraBuilder``; the built result is stored on ``self.ac``.
        """
        self.term_index = term_index

        trie_builder = AcoraBuilder()
        for term in term_index:
            trie_builder.add(term)

        self.ac = trie_builder.build()
Ejemplo n.º 3
0
def build_keyword_tries(seqs):
    """Build a keyword trie from ``seqs``.

    Each element of ``seqs`` is converted with ``str`` and added as a
    keyword; the result of ``AcoraBuilder.build()`` is returned.
    """
    trie_builder = AcoraBuilder()

    # Add all tags to the keyword trie
    for tag in seqs:
        trie_builder.add(str(tag))

    return trie_builder.build()
def build_keyword_tries(seqs):
    """Return a built keyword trie containing ``str(s)`` for every ``s`` in ``seqs``."""
    trie_builder = AcoraBuilder()

    # Add all tags to the keyword trie, stringifying each one first
    for keyword in map(str, seqs):
        trie_builder.add(keyword)

    return trie_builder.build()
Ejemplo n.º 5
0
    def __init__(self, keywords, vocab=None):
        """Build an Acora search engine for ``keywords``.

        :param keywords: iterable whose items are each added as a keyword
        :param vocab: stored unchanged on ``self.vocab``
        """
        from acora import AcoraBuilder

        self.vocab = vocab

        # Feed the current keyword set to the builder one keyword at a time
        engine_builder = AcoraBuilder()
        for keyword in keywords:
            engine_builder.add(keyword)

        # Generate the Acora search engine for the current keyword set
        self.engine = engine_builder.build()
Ejemplo n.º 6
0
    def _build(self):
        """
        Compile all configured regular expressions and index their keywords.

        Each item of ``self._regexes_or_assoc`` is either a string (the regex)
        or a tuple whose first element is the regex and whose remaining
        elements are stored in ``self._translator``.

        :return: The result of ``build()`` on the acora builder holding one
                 lower-cased keyword per regex that has a long enough hint
        :raises ValueError: On a duplicated tuple regex or an unsupported item
        """
        builder = AcoraBuilder()

        for item in self._regexes_or_assoc:

            # Pick the raw pattern; tuples carry extra associated values
            has_assoc = isinstance(item, tuple)

            if has_assoc:
                raw_regex = item[0]
            elif isinstance(item, basestring):
                raw_regex = item
            else:
                raise ValueError('Can NOT build MultiRE with provided values.')

            # Compile the encoded pattern and save it to the re_cache
            regex = raw_regex.encode(DEFAULT_ENCODING)
            self._re_cache[regex] = re.compile(regex, self._re_compile_flags)

            if has_assoc:
                if regex in self._translator:
                    raise ValueError('Duplicated regex "%s"' % regex)

                self._translator[regex] = item[1:]

            # Extract the string literals from the regular expression
            regex_keywords = esmre.shortlist(esmre.hints(regex))

            # Only the first shortlisted keyword is considered, and it must be
            # longer than hint_len; otherwise the regex gets no keyword
            if not regex_keywords or len(regex_keywords[0]) <= self._hint_len:
                self._regexes_with_no_keywords.append(regex)
                continue

            # Index the lower-cased keyword and remember which regexes it
            # belongs to
            regex_keyword = regex_keywords[0].lower()
            builder.add(regex_keyword)

            associated_regexes = self._keyword_to_re.get(regex_keyword, [])
            associated_regexes.append(regex)
            self._keyword_to_re[regex_keyword] = associated_regexes

        return builder.build()
Ejemplo n.º 7
0
def setup(vregions_file, jregions_file):
    """Load V and J gene regions and build one keyword trie per gene.

    For every V gene the last ``v_end_length`` characters are used, for every
    J gene the first ``j_start_length`` characters. Every substring of that
    fragment with length >= 4 is added to the gene's own trie.

    :param vregions_file: path of the FASTA file with the V regions
    :param jregions_file: path of the FASTA file with the J regions
    :return: tuple ``(v_keyword_tries, j_keyword_tries, v_genes, j_genes)``
             where the gene lists hold the upper-cased full sequences
    """
    v_end_length = 40  # how many nts at the end of the V region to consider
    j_start_length = 40  # how many nts at the start of the J region to consider
    min_substring_length = 4  # shortest substring added to a trie

    def load_genes(fasta_path):
        # Read every record as an upper-cased string; 'with' closes the
        # handle even when parsing raises
        with open(fasta_path, 'r') as handle:
            return [str(record.seq).upper()
                    for record in SeqIO.parse(handle, 'fasta')]

    def build_tries(fragments, label):
        # Build one trie per fragment from all of its substrings and report
        # the elapsed time
        t0 = time.time()
        tries = []
        for fragment in fragments:
            builder = AcoraBuilder()
            for n in range(min_substring_length, len(fragment) + 1):
                for start in range(len(fragment) - (n - 1)):
                    builder.add(fragment[start:start + n])
            tries.append(builder.build())
        # A single pre-formatted string prints identically on Python 2 and 3
        print('%s keyword tries built in %s seconds'
              % (label, round(time.time() - t0, 2)))
        return tries

    v_genes = load_genes(vregions_file)
    v_keyword_tries = build_tries([v[-v_end_length:] for v in v_genes], 'V')

    j_genes = load_genes(jregions_file)
    j_keyword_tries = build_tries([j[:j_start_length] for j in j_genes], 'J')

    return v_keyword_tries, j_keyword_tries, v_genes, j_genes
Ejemplo n.º 8
0
    def _build(self):
        """
        Compile all configured regular expressions and index their keywords.

        Each item of ``self._regexes_or_assoc`` is either a string (the regex)
        or a tuple whose first element is the regex and whose remaining
        elements are stored in ``self._translator``.

        :return: The result of ``build()`` on the acora builder holding one
                 lower-cased keyword per regex that has a long enough hint
        :raises ValueError: On a duplicated tuple regex or an unsupported item
        """
        builder = AcoraBuilder()

        for item in self._regexes_or_assoc:

            # Pick the raw pattern; tuples carry extra associated values
            has_assoc = isinstance(item, tuple)

            if has_assoc:
                raw_regex = item[0]
            elif isinstance(item, basestring):
                raw_regex = item
            else:
                raise ValueError('Can NOT build MultiRE with provided values.')

            # Compile the encoded pattern and save it to the re_cache
            regex = raw_regex.encode(DEFAULT_ENCODING)
            self._re_cache[regex] = re.compile(regex, self._re_compile_flags)

            if has_assoc:
                if regex in self._translator:
                    raise ValueError('Duplicated regex "%s"' % regex)

                self._translator[regex] = item[1:]

            # Extract the string literals from the regular expression
            regex_keywords = esmre.shortlist(esmre.hints(regex))

            # Only the first shortlisted keyword is considered, and it must be
            # longer than hint_len; otherwise the regex gets no keyword
            if not regex_keywords or len(regex_keywords[0]) <= self._hint_len:
                self._regexes_with_no_keywords.append(regex)
                continue

            # Index the lower-cased keyword and remember which regexes it
            # belongs to
            regex_keyword = regex_keywords[0].lower()
            builder.add(regex_keyword)

            associated_regexes = self._keyword_to_re.get(regex_keyword, [])
            associated_regexes.append(regex)
            self._keyword_to_re[regex_keyword] = associated_regexes

        return builder.build()
Ejemplo n.º 9
0
Archivo: sma.py Proyecto: yflau/dsapp
class Acora(object):
    """Keyword matcher whose keywords are read from a dictionary file."""

    def __init__(self,dic):
        """Read one keyword per line from ``dic`` and build the matcher.

        :param dic: path of the dictionary file; each line, minus its trailing
                    newline, is decoded as UTF-8 and added as a keyword
        """
        self.__builder = AcoraBuilder()
        # 'with' closes the file even if reading, decoding or adding raises
        with open(dic) as fp:
            for line in fp:
                self.__builder.add(line.rstrip("\n").decode("utf-8"))
        self.__tree = self.__builder.build()

    def findall(self,content):
        """Return the list of keywords matched in ``content``.

        Match positions reported by the underlying matcher are dropped.
        """
        return [hit_word for hit_word, _pos in self.__tree.finditer(content)]
Ejemplo n.º 10
0
def setup(vregions_file, jregions_file):
    """Load V and J gene regions and build one keyword trie per gene.

    For every V gene the last ``v_end_length`` characters are used, for every
    J gene the first ``j_start_length`` characters. Every substring of that
    fragment with length >= 4 is added to the gene's own trie.

    :param vregions_file: path of the FASTA file with the V regions
    :param jregions_file: path of the FASTA file with the J regions
    :return: tuple ``(v_keyword_tries, j_keyword_tries, v_genes, j_genes)``
             where the gene lists hold the upper-cased full sequences
    """
    v_end_length = 40  # how many nts at the end of the V region to consider
    j_start_length = 40  # how many nts at the start of the J region to consider
    min_substring_length = 4  # shortest substring added to a trie

    def load_genes(fasta_path):
        # Read every record as an upper-cased string; 'with' closes the
        # handle even when parsing raises
        with open(fasta_path, 'r') as handle:
            return [str(record.seq).upper()
                    for record in SeqIO.parse(handle, 'fasta')]

    def build_tries(fragments, label):
        # Build one trie per fragment from all of its substrings and report
        # the elapsed time
        t0 = time.time()
        tries = []
        for fragment in fragments:
            builder = AcoraBuilder()
            for n in range(min_substring_length, len(fragment) + 1):
                for start in range(len(fragment) - (n - 1)):
                    builder.add(fragment[start:start + n])
            tries.append(builder.build())
        # A single pre-formatted string prints identically on Python 2 and 3
        print('%s keyword tries built in %s seconds'
              % (label, round(time.time() - t0, 2)))
        return tries

    v_genes = load_genes(vregions_file)
    v_keyword_tries = build_tries([v[-v_end_length:] for v in v_genes], 'V')

    j_genes = load_genes(jregions_file)
    j_keyword_tries = build_tries([j[:j_start_length] for j in j_genes], 'J')

    return v_keyword_tries, j_keyword_tries, v_genes, j_genes
Ejemplo n.º 11
0
    def __init__(self,
                 use_unicode=True,
                 ignore_case=False,
                 titles=None,
                 extra_titles=None):
        """
        Build the job title searcher and store it on ``self.ac``.

        :param use_unicode: whether to use `titles` as unicode or bytestrings
        :param ignore_case: if True ignore case in all matches
        :param titles: if given, overrides default `load_titles()` values
        :param extra_titles: if given, add to titles; either a single title
                             string or an iterable of title strings
        """
        titles = titles if titles else load_titles()
        titles = (titles if use_unicode else
                  (s.encode('ascii') for s in titles))
        builder = AcoraBuilder()
        logging.info('building job title searcher')
        builder.update(titles)
        if extra_titles:
            # add() takes individual keywords, so a collection must go
            # through update(); a lone string is still added as one keyword
            if isinstance(extra_titles, (bytes, type(u''))):
                builder.add(extra_titles)
            else:
                builder.update(extra_titles)

        self.ac = builder.build(ignore_case=ignore_case)
        logging.info('building done')
Ejemplo n.º 12
0
    def _build(self):
        """
        Build the acora index from ``self._keywords_or_assoc``.

        Each item is either a string (the keyword) or a tuple whose first
        element is the keyword and whose remaining elements are stored in
        ``self._translator`` under the encoded keyword.

        :return: The result of ``build()`` on the populated acora builder
        :raises ValueError: On a duplicated tuple keyword or an unsupported item
        """
        builder = AcoraBuilder()

        for item in self._keywords_or_assoc:

            has_assoc = isinstance(item, tuple)

            if not has_assoc and not isinstance(item, basestring):
                raise ValueError('Can NOT build MultiIn with provided values.')

            raw_keyword = item[0] if has_assoc else item
            keyword = raw_keyword.encode(DEFAULT_ENCODING)

            if has_assoc:
                # Associated values may only be registered once per keyword
                if keyword in self._translator:
                    raise ValueError('Duplicated keyword "%s"' % keyword)

                self._translator[keyword] = item[1:]

            builder.add(keyword)

        return builder.build()
Ejemplo n.º 13
0
    def _build(self):
        """
        Build the acora index from ``self._keywords_or_assoc``.

        Each item is either a string (the keyword) or a tuple whose first
        element is the keyword and whose remaining elements are stored in
        ``self._translator`` under the encoded keyword.

        :return: The result of ``build()`` on the populated acora builder
        :raises ValueError: On a duplicated tuple keyword or an unsupported item
        """
        builder = AcoraBuilder()

        for item in self._keywords_or_assoc:

            has_assoc = isinstance(item, tuple)

            if not has_assoc and not isinstance(item, basestring):
                raise ValueError('Can NOT build MultiIn with provided values.')

            raw_keyword = item[0] if has_assoc else item
            keyword = raw_keyword.encode(DEFAULT_ENCODING)

            if has_assoc:
                # Associated values may only be registered once per keyword
                if keyword in self._translator:
                    raise ValueError('Duplicated keyword "%s"' % keyword)

                self._translator[keyword] = item[1:]

            builder.add(keyword)

        return builder.build()
def analysis( Sequence_Reads, with_statistics=True, with_reverse_complement_search=True):
    """Assign V and J tags to every FASTQ read and write the classifications.

    Reads the gene regions from "human_TRBV_region.fasta" and
    "human_TRBJ_region.fasta" and the tags from "tags_trbv.txt" and
    "tags_trbj.txt" (all relative paths). "DecombinatorResults.txt" is
    truncated and then receives one line "v j del_v del_j nt_insert" per
    assigned read.

    Sequence_Reads -- indexable collection of FASTQ file paths
    with_statistics -- when True, print read counts and elapsed time at the end
    with_reverse_complement_search -- when True, retry a read that was not
        assigned on its reverse complement

    The function has no return statement. It uses Python 2 print statements.
    """
    # NOTE(review): np, dec, op, coll, clock, itemgetter and attrgetter are
    # not referenced anywhere in this function.
    import numpy as np
    import decimal as dec
    import string
    import operator as op
    import collections as coll
    from Bio import SeqIO
    from acora import AcoraBuilder
    from time import time, clock
    from string import Template
    from operator import itemgetter, attrgetter
    import Levenshtein as lev

    v_half_split, j_half_split = [10,6] # Do not change - V tags are split at position 10, J at position 6, to look for half tags if no full tag is found.

    ################

    print 'Commencing analysis on a total of', len(Sequence_Reads), 'file(s)'

    ## Create .txt file to store f=(v_index,j_index,v_deletions,j_deletions,nt_insert)
    # Opening in "w" mode and closing immediately truncates any previous results
    analysis_file = open("DecombinatorResults.txt", "w")
    analysis_file.close()
    results = "DecombinatorResults.txt" # Name the .txt file to write to

    ################
    print ('Importing known V, D and J gene segments and tags...')

    handle = open("human_TRBV_region.fasta", "rU")
    v_genes = list(SeqIO.parse(handle, "fasta"))
    handle.close()

    handle = open("human_TRBJ_region.fasta", "rU")
    j_genes = list(SeqIO.parse(handle, "fasta"))
    handle.close()

    # Upper-cased V region sequences, in file order
    v_regions = []
    for j in range(0, len(v_genes)):
        v_regions.append(string.upper(v_genes[j].seq))

    # Upper-cased J region sequences, in file order
    j_regions = []
    for j in range(0, len(j_genes)):
        j_regions.append(string.upper(j_genes[j].seq))

    ##############
    ## Build keyword tries of V and J tags for fast assignment
    # NOTE(review): the two tag files opened here are never closed in this
    # function - confirm whether get_v_tags / get_j_tags close them.
    v_seqs, half1_v_seqs, half2_v_seqs, jump_to_end_v = get_v_tags(open("tags_trbv.txt", "rU"), v_half_split)
    j_seqs, half1_j_seqs, half2_j_seqs, jump_to_start_j = get_j_tags(open("tags_trbj.txt", "rU"), j_half_split)   

    v_builder = AcoraBuilder()
    for i in range(0,len(v_seqs)):
        v_builder.add(str(v_seqs[i])) # Add all V tags to keyword trie

    v_key = v_builder.build()

    j_builder = AcoraBuilder()
    for i in range(0,len(j_seqs)):
        j_builder.add(str(j_seqs[i])) # Add all J tags to keyword trie

    j_key = j_builder.build()

    ##############
    ## Build keyword tries for first and second halves of both V and J tags
    v_half1_builder = AcoraBuilder()
    for i in range(0,len(half1_v_seqs)):
        v_half1_builder.add(str(half1_v_seqs[i]))
    half1_v_key = v_half1_builder.build()

    v_half2_builder = AcoraBuilder()
    for i in range(0,len(half2_v_seqs)):
        v_half2_builder.add(str(half2_v_seqs[i]))
    half2_v_key = v_half2_builder.build()

    j_half1_builder = AcoraBuilder()
    for i in range(0,len(half1_j_seqs)):
        j_half1_builder.add(str(half1_j_seqs[i]))
    half1_j_key = j_half1_builder.build()

    j_half2_builder = AcoraBuilder()
    for i in range(0,len(half2_j_seqs)):
        j_half2_builder.add(str(half2_j_seqs[i]))
    half2_j_key = j_half2_builder.build()

    ###############
    ## Initialise variables
    assigned_count = 0 # this will just increase by one every time we correctly assign a seq read with all desired variables
    seq_count = 0 # this will simply track the number of sequences analysed in file
    t0 = time() # Begin timer

    ###############
    ## Open .txt file created at the start of analysis
    analysis_file = open(results, "a")
    stemplate = Template('$v $j $del_v $del_j $nt_insert') # Creates stemplate, a holder, for f. Each line will have the 5 variables separated by a space

    ###############
    ## Begin analysing sequences

    # NOTE(review): the inner "for i in ..." loops below rebind this loop's
    # index; it is only read on the next two statements, before any rebinding.
    for i in range(len(Sequence_Reads)):
        
        print 'Importing sequences from', Sequence_Reads[i],' and assigning V and J regions...'
        handle = open(Sequence_Reads[i], "rU")
        
        for record in SeqIO.parse(handle, "fastq"):
            
            found_seq_match = 0
            seq_count += 1
            
            # Full-tag search on the forward read
            hold_v = v_key.findall(str(record.seq))
            hold_j = j_key.findall(str(record.seq))

            if hold_v:                
                v_match = v_seqs.index(hold_v[0][0]) # Assigns V
                temp_end_v = hold_v[0][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
                if get_v_deletions( record.seq, v_match, temp_end_v, v_regions ): # If the number of deletions has been found
                    [ end_v, deletions_v] = get_v_deletions( record.seq, v_match, temp_end_v, v_regions )
            else:
                # No full V tag: look for half tags and accept a full-length
                # window within Hamming distance 1 of the full tag.
                # NOTE(review): found_v_match is only (re)set in this branch,
                # and end_v / deletions_v are not assigned here, so the output
                # branches below can see values left over from an earlier
                # record (or undefined names on the first one) - confirm.
                found_v_match = 0
                hold_v1 = half1_v_key.findall(str(record.seq))
                hold_v2 = half2_v_key.findall(str(record.seq))
                for i in range(len(hold_v1)):
                    indices = [y for y, x in enumerate(half1_v_seqs) if x == hold_v1[i][0] ]
                    for k in indices:
                        if len(v_seqs[k]) == len(str(record.seq)[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[half1_v_seqs.index(hold_v1[i][0])])]):
                            if lev.hamming( v_seqs[k], str(record.seq)[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[k])] ) <= 1:
                                v_match = k
                                temp_end_v = hold_v1[i][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
                                found_v_match += 1
                # NOTE(review): for second-half hits the compared window starts
                # at the hit position itself, with no v_half_split offset -
                # confirm this is intended.
                for i in range(len(hold_v2)):
                    indices = [y for y, x in enumerate(half2_v_seqs) if x == hold_v2[i][0] ]
                    for k in indices:
                        if len(v_seqs[k]) == len(str(record.seq)[hold_v2[i][1]:hold_v2[i][1]+len(v_seqs[half2_v_seqs.index(hold_v2[i][0])])]):
                            if lev.hamming( v_seqs[k], str(record.seq)[hold_v2[i][1]:hold_v2[i][1]+len(v_seqs[k])] ) <= 1:
                                v_match = k
                                temp_end_v = hold_v2[i][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
                                found_v_match += 1

            if hold_j:
                j_match = j_seqs.index(hold_j[0][0]) # Assigns J
                temp_start_j = hold_j[0][1] - jump_to_start_j[j_match] # Finds where the start of a full J would be
                if get_j_deletions( record.seq, j_match, temp_start_j, j_regions ):
                    [ start_j, deletions_j] = get_j_deletions( record.seq, j_match, temp_start_j, j_regions )
            else:
                # NOTE(review): same pattern as the V half-tag branch, except
                # j_match takes the first index of the half tag rather than k,
                # and the second-half case subtracts a literal 6 (presumably
                # j_half_split) - confirm.
                found_j_match = 0
                hold_j1 = half1_j_key.findall(str(record.seq))
                hold_j2 = half2_j_key.findall(str(record.seq))
                for i in range(len(hold_j1)):
                    indices = [y for y, x in enumerate(half1_j_seqs) if x == hold_j1[i][0] ]
                    for k in indices:
                        if len(j_seqs[k]) == len(str(record.seq)[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[half1_j_seqs.index(hold_j1[i][0])])]):
                            if lev.hamming( j_seqs[k], str(record.seq)[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[k])] ) <= 1:
                                j_match = half1_j_seqs.index(hold_j1[i][0])
                                temp_start_j = hold_j1[i][1] - jump_to_start_j[j_match] # Finds where the start of a full J would be
                                found_j_match += 1
                for i in range(len(hold_j2)):
                    indices = [y for y, x in enumerate(half2_j_seqs) if x == hold_j2[i][0] ]
                    for k in indices:
                        if len(j_seqs[k]) == len(str(record.seq)[hold_j2[i][1]:hold_j2[i][1]+len(j_seqs[half2_j_seqs.index(hold_j2[i][0])])]):
                            if lev.hamming( j_seqs[k], str(record.seq)[hold_j2[i][1]:hold_j2[i][1]+len(j_seqs[k])] ) <= 1:
                                j_match = half2_j_seqs.index(hold_j2[i][0])
                                temp_start_j = hold_j2[i][1] - jump_to_start_j[j_match] - 6 # Finds where the start of a full J would be
                                found_j_match += 1

            # Write a result line when both regions were assigned, either by a
            # full tag or by exactly one accepted half-tag match
            if hold_v and hold_j:
                if get_v_deletions( record.seq, v_match, temp_end_v, v_regions ) and get_j_deletions( record.seq, j_match, temp_start_j, j_regions ):
                    f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record.seq[end_v+1:start_j]))
                    print >> analysis_file, f_seq # Write to analysis_file (text file) the classification of the sequence
                    assigned_count += 1
                    found_seq_match = 1
            elif hold_v and found_j_match == 1:
                if get_v_deletions( record.seq, v_match, temp_end_v, v_regions ) and get_j_deletions( record.seq, j_match, temp_start_j, j_regions ):
                    f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record.seq[end_v+1:start_j]))
                    print >> analysis_file, f_seq
                    assigned_count += 1
                    found_seq_match = 1
            elif found_v_match == 1 and hold_j:
                if get_v_deletions( record.seq, v_match, temp_end_v, v_regions ) and get_j_deletions( record.seq, j_match, temp_start_j, j_regions ):
                    f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record.seq[end_v+1:start_j]))
                    print >> analysis_file, f_seq
                    assigned_count += 1
                    found_seq_match = 1
            elif found_v_match == 1 and found_j_match == 1:
                if get_v_deletions( record.seq, v_match, temp_end_v, v_regions ) and get_j_deletions( record.seq, j_match, temp_start_j, j_regions ):
                    f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record.seq[end_v+1:start_j]))
                    print >> analysis_file, f_seq
                    assigned_count += 1
                    found_seq_match = 1

            if found_seq_match == 0 and with_reverse_complement_search == True:
                
                #####################
                # REVERSE COMPLEMENT
                #####################

                # Same procedure as above, applied to the reverse complement
                # of the read; the review notes above apply here as well
                record_reverse = record.reverse_complement()
                hold_v = v_key.findall(str(record_reverse.seq))
                hold_j = j_key.findall(str(record_reverse.seq))

                if hold_v:                
                    v_match = v_seqs.index(hold_v[0][0]) # Assigns V
                    temp_end_v = hold_v[0][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
                    if get_v_deletions( record_reverse.seq, v_match, temp_end_v, v_regions ): # If the number of deletions has been found
                        [ end_v, deletions_v] = get_v_deletions( record_reverse.seq, v_match, temp_end_v, v_regions )
                else:
                    found_v_match = 0
                    hold_v1 = half1_v_key.findall(str(record_reverse.seq))
                    hold_v2 = half2_v_key.findall(str(record_reverse.seq))
                    for i in range(len(hold_v1)):
                        indices = [y for y, x in enumerate(half1_v_seqs) if x == hold_v1[i][0] ]
                        for k in indices:
                            if len(v_seqs[k]) == len(str(record_reverse.seq)[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[half1_v_seqs.index(hold_v1[i][0])])]):
                                if lev.hamming( v_seqs[k], str(record_reverse.seq)[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[k])] ) <= 1:
                                    v_match = k
                                    temp_end_v = hold_v1[i][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
                                    found_v_match += 1
                    for i in range(len(hold_v2)):
                        indices = [y for y, x in enumerate(half2_v_seqs) if x == hold_v2[i][0] ]
                        for k in indices:
                            if len(v_seqs[k]) == len(str(record_reverse.seq)[hold_v2[i][1]:hold_v2[i][1]+len(v_seqs[half2_v_seqs.index(hold_v2[i][0])])]):
                                if lev.hamming( v_seqs[k], str(record_reverse.seq)[hold_v2[i][1]:hold_v2[i][1]+len(v_seqs[k])] ) <= 1:
                                    v_match = k
                                    temp_end_v = hold_v2[i][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
                                    found_v_match += 1

                if hold_j:
                    j_match = j_seqs.index(hold_j[0][0]) # Assigns J
                    temp_start_j = hold_j[0][1] - jump_to_start_j[j_match] # Finds where the start of a full J would be
                    if get_j_deletions( record_reverse.seq, j_match, temp_start_j, j_regions ):
                        [ start_j, deletions_j] = get_j_deletions( record_reverse.seq, j_match, temp_start_j, j_regions )
                else:
                    found_j_match = 0
                    hold_j1 = half1_j_key.findall(str(record_reverse.seq))
                    hold_j2 = half2_j_key.findall(str(record_reverse.seq))
                    for i in range(len(hold_j1)):
                        indices = [y for y, x in enumerate(half1_j_seqs) if x == hold_j1[i][0] ]
                        for k in indices:
                            if len(j_seqs[k]) == len(str(record_reverse.seq)[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[half1_j_seqs.index(hold_j1[i][0])])]):
                                if lev.hamming( j_seqs[k], str(record_reverse.seq)[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[k])] ) <= 1:
                                    j_match = half1_j_seqs.index(hold_j1[i][0])
                                    temp_start_j = hold_j1[i][1] - jump_to_start_j[j_match] # Finds where the start of a full J would be
                                    found_j_match += 1
                    for i in range(len(hold_j2)):
                        indices = [y for y, x in enumerate(half2_j_seqs) if x == hold_j2[i][0] ]
                        for k in indices:
                            if len(j_seqs[k]) == len(str(record_reverse.seq)[hold_j2[i][1]:hold_j2[i][1]+len(j_seqs[half2_j_seqs.index(hold_j2[i][0])])]):
                                if lev.hamming( j_seqs[k], str(record_reverse.seq)[hold_j2[i][1]:hold_j2[i][1]+len(j_seqs[k])] ) <= 1:
                                    j_match = half2_j_seqs.index(hold_j2[i][0])
                                    temp_start_j = hold_j2[i][1] - jump_to_start_j[j_match] - 6 # Finds where the start of a full J would be
                                    found_j_match += 1

                if hold_v and hold_j:
                    if get_v_deletions( record_reverse.seq, v_match, temp_end_v, v_regions ) and get_j_deletions( record_reverse.seq, j_match, temp_start_j, j_regions ):
                        f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record_reverse.seq[end_v+1:start_j]))
                        print >> analysis_file, f_seq # Write to analysis_file (text file) the classification of the sequence
                        assigned_count += 1
                        found_seq_match = 1
                elif hold_v and found_j_match == 1:
                    if get_v_deletions( record_reverse.seq, v_match, temp_end_v, v_regions ) and get_j_deletions( record_reverse.seq, j_match, temp_start_j, j_regions ):
                        f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record_reverse.seq[end_v+1:start_j]))
                        print >> analysis_file, f_seq
                        assigned_count += 1
                        found_seq_match = 1
                elif found_v_match == 1 and hold_j:
                    if get_v_deletions( record_reverse.seq, v_match, temp_end_v, v_regions ) and get_j_deletions( record_reverse.seq, j_match, temp_start_j, j_regions ):
                        f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record_reverse.seq[end_v+1:start_j]))
                        print >> analysis_file, f_seq
                        assigned_count += 1
                        found_seq_match = 1
                elif found_v_match == 1 and found_j_match == 1:
                    if get_v_deletions( record_reverse.seq, v_match, temp_end_v, v_regions ) and get_j_deletions( record_reverse.seq, j_match, temp_start_j, j_regions ):
                        f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record_reverse.seq[end_v+1:start_j]))
                        print >> analysis_file, f_seq
                        assigned_count += 1
                        found_seq_match = 1
        handle.close()
    analysis_file.close()

    # Optional summary of how many reads were seen and assigned
    if with_statistics == True:
        timed = time() - t0
        print seq_count, 'sequences were analysed'
        print assigned_count, ' sequences were successfully assigned'
        print 'Time taken =', timed, 'seconds'
Ejemplo n.º 15
0
 def __init__(self, text):
     """Store ``text`` and build the keyword finder.

     The finder is built from a fixed list of keywords and kept on
     ``self.finder``.
     """
     self.text = text
     # "proprietary" was misspelled "propietary", so it could never match
     # the correctly spelled word
     keywords = ["ownership", "owner", "own", "proprietary", "tracking", "track", "store", "keep", "keeping"]
     builder = AcoraBuilder()
     builder.add(*keywords)
     self.finder = builder.build()
        for protein, seq, blank in fxn.read_fa(in_file):
            mouse_proteins[protein.split(' ')[0]] = seq

    # Then scroll through non-predicted binder files, build an AC trie of all the peptides per file
    data_dir = '../Data/NonPredictedBinders/'
    matches = coll.defaultdict(fxn.nest_counter)
    all_peptides = coll.defaultdict(list)
    for f in [x for x in os.listdir(data_dir) if x.endswith('.txt')]:
        nam = f.split('-')[0]
        search_builder = AcoraBuilder()
        peptides = []

        # Build trie
        with open(data_dir + f, 'rU') as in_file:
            for line in in_file:
                search_builder.add(line.rstrip())
                peptides.append(line.rstrip())
                all_peptides[f.split('-')[0]].append(line.rstrip())
        seq_search = search_builder.build()

        # Use to search all proteins in proteome
        for protein in mouse_proteins:
            seq_check = seq_search.findall(mouse_proteins[protein])
            if seq_check:
                for s in seq_check:
                    matches[nam][s[0]] += 1

        # Then fill in the zeroes (unmatched peptides) to get denominator
        for p in peptides:
            if p not in matches[nam]:
                matches[nam][p] = 0
Ejemplo n.º 17
0
  v_regions.append(str(v_genes[v].seq).upper())
  v_nams.append(v_genes[v].id.split("|")[1])

# Upper-cased sequence and name of every J gene, in j_genes order
j_regions = []
j_nams = []
for j in range(0, len(j_genes)):
  j_regions.append(str(j_genes[j].seq).upper())
  # The name is the second "|"-separated field of the record id. This line
  # previously read v_genes[v], i.e. one stale V record for every J name.
  j_nams.append(j_genes[j].id.split("|")[1])

## Load the V and J tags for the selected chain, then build keyword tries for fast assignment
v_seqs, half1_v_seqs, half2_v_seqs, jump_to_end_v = get_v_tags(
    open("tags_tr"+ chain.lower() + "v.txt", "rU"), v_half_split)
j_seqs, half1_j_seqs, half2_j_seqs, jump_to_start_j = get_j_tags(
    open("tags_tr"+ chain.lower() + "j.txt", "rU"), j_half_split)

# Keyword trie holding every full V tag
v_builder = AcoraBuilder()
for v_tag in v_seqs:
    v_builder.add(str(v_tag))
v_key = v_builder.build()

# Keyword trie holding every full J tag
j_builder = AcoraBuilder()
for j_tag in j_seqs:
    j_builder.add(str(j_tag))
j_key = j_builder.build()

## Keyword trie for the first halves of the V tags
v_half1_builder = AcoraBuilder()
for v_half_tag in half1_v_seqs:
    v_half1_builder.add(str(v_half_tag))
half1_v_key = v_half1_builder.build()