class Ner():
    """Dictionary-based entity tagger backed by an Aho-Corasick automaton.

    Typical use: set the word list and the entity name, call
    ``build_ner()`` once, then call ``hit()`` for every text to scan.
    """

    def __init__(self):
        """Create a tagger with no words, an empty name and a fresh builder."""
        # Builder of the Aho-Corasick model.
        self._builder = AcoraBuilder()
        # Name reported for (and used to replace) every entity word.
        self._ner_name = ""
        # Collection of all entity words.
        self._ner_word_list = []

    def set_ner_word_list(self, ner_word_list):
        """Set the collection of entity words."""
        self._ner_word_list = ner_word_list

    def set_ner_name(self, ner_name):
        """Set the name attached to every hit."""
        self._ner_name = ner_name

    def build_ner(self):
        """Add every entity word to the builder and build the search tree."""
        for word in self._ner_word_list:
            self._builder.add(word)
        self._tree = self._builder.build()

    def hit(self, content_str):
        """Return ``[word, position, ner_name]`` for each match in the text.

        ``build_ner()`` must have been called first, because the search
        tree only exists after building.
        """
        return [
            [word, position, self._ner_name]
            for word, position in self._tree.finditer(content_str)
        ]
def __init__(self, term_index):
    """Store the term index and compile an Aho-Corasick matcher over it.

    :param term_index: iterable whose items are the search terms (for a
        mapping, iteration yields its keys).
    """
    self.term_index = term_index
    ac_builder = AcoraBuilder()
    for term in term_index:
        ac_builder.add(term)
    self.ac = ac_builder.build()
def match_lines(self, s, *keywords):
    '''
    Yield every line of a text that contains at least one keyword.

    @param s The text to scan; it is searched and sliced directly, so pass
             the content itself.
    @param keywords The keywords to look for.
    @returns Generator over the matching lines, without the terminator
             character that ended them.
    '''
    # Line terminators are searched together with the keywords so that a
    # single pass over the text finds both hits and line boundaries.
    automaton = AcoraBuilder('\r', '\n', *keywords).build()
    start = 0
    line_has_hit = False
    for found, offset in automaton.finditer(s):
        if found not in '\r\n':
            line_has_hit = True
            continue
        # A terminator was reached: emit the finished line if it had a hit.
        if line_has_hit:
            yield s[start:offset]
            line_has_hit = False
        start = offset + 1
    # The last line may not be followed by a terminator.
    if line_has_hit:
        yield s[start:]
def build_keyword_tries(seqs):
    """Build an Aho-Corasick keyword trie from a collection of tags.

    Every item of ``seqs`` is converted with ``str()`` before it is added.
    Returns the built search object.
    """
    trie_builder = AcoraBuilder()
    for seq in seqs:
        # Add all tags to the keyword trie
        trie_builder.add(str(seq))
    return trie_builder.build()
def build_keyword_tries(seqs):
    """Build an Aho-Corasick keyword trie from a collection of tags.

    Every item of ``seqs`` is converted with ``str()`` before it is added.
    Returns the built search object.
    """
    trie_builder = AcoraBuilder()
    for seq in seqs:
        # Add all tags to the keyword trie
        trie_builder.add(str(seq))
    return trie_builder.build()
def compare_search(s, filename, ignore_case, *keywords):
    """Time keyword search with the enabled implementations.

    ``COMPARED_IMPLEMENTATIONS`` decides what runs: ``'pa'`` (automaton
    built with ``acora=PyAcora``), ``'ca'`` (the builder's default
    automaton) and ``'re'`` (a regular-expression alternation).

    :param s: the in-memory data to search.
    :param filename: optional file to search as well; a falsy value skips
        the file benchmarks.
    :param ignore_case: whether matching ignores case.
    :param keywords: the keywords to search for.
    :returns: the 5-tuple ``(pa_string, ca_string, pa_file, ca_file,
        re_string)`` of ``findall`` results; an entry is ``None`` when that
        search did not run or found nothing.
    """
    # All setup times start at zero so the report below works even when an
    # implementation is disabled.
    setup_pya = setup_ca = setup_re = 0
    run_pa = 'pa' in COMPARED_IMPLEMENTATIONS
    run_ca = 'ca' in COMPARED_IMPLEMENTATIONS
    run_re = 're' in COMPARED_IMPLEMENTATIONS

    builder = None
    if run_pa:
        t = time()
        builder = AcoraBuilder(keywords, ignore_case=ignore_case)
        py_acora = builder.build(acora=PyAcora)
        setup_pya = time() - t
    if run_ca:
        t = time()
        builder = AcoraBuilder(keywords, ignore_case=ignore_case)
        c_acora = builder.build()
        setup_ca = time() - t
    if run_re:
        t = time()
        # NOTE(review): keywords are joined as raw regex alternatives; they
        # are not escaped.
        if hasattr(keywords[0], 'encode'):  # unicode in Py3?
            kw_regexp = '|'.join(keywords)
        else:
            kw_regexp = '|'.encode('ASCII').join(keywords)
        if ignore_case:
            regexp = re.compile(kw_regexp, re.I)
        else:
            regexp = re.compile(kw_regexp)
        setup_re = time() - t

    # Without any Acora builder, fall back to the same text/bytes test that
    # the regex branch uses.
    if builder is not None:
        for_unicode = builder.for_unicode
    else:
        for_unicode = bool(keywords) and hasattr(keywords[0], 'encode')
    print("Case %ssensitive %s\n- setup times: PA: %.4f, CA: %.4f, RE: %.4f" % (
        'in' if ignore_case else '',
        'unicode' if for_unicode else 'bytes',
        setup_pya, setup_ca, setup_re))

    if run_pa:
        timings = timeit.Timer(partial(py_acora.findall, s)).repeat(number=REPEAT_COUNT)
        print("TIME(paS): %.3f" % min(timings))
    if run_ca:
        timings = timeit.Timer(partial(c_acora.findall, s)).repeat(number=REPEAT_COUNT)
        print("TIME(caS): %.3f" % min(timings))
    if filename:
        if run_pa:
            timings = timeit.Timer(partial(py_acora.filefindall, filename)).repeat(number=REPEAT_COUNT)
            print("TIME(paF): %.3f" % min(timings))
        if run_ca:
            timings = timeit.Timer(partial(c_acora.filefindall, filename)).repeat(number=REPEAT_COUNT)
            print("TIME(caF): %.3f" % min(timings))
    if run_re:
        timings = timeit.Timer(partial(regexp.findall, s)).repeat(number=REPEAT_COUNT)
        print("TIME(reS): %.3f" % min(timings))

    # Empty results are reported as None, exactly like searches that did
    # not run.
    return (
        (py_acora.findall(s) or None) if run_pa else None,
        (c_acora.findall(s) or None) if run_ca else None,
        (py_acora.filefindall(filename) or None) if run_pa and filename else None,
        (c_acora.filefindall(filename) or None) if run_ca and filename else None,
        (regexp.findall(s) or None) if run_re else None,
    )
def __init__(self):
    """Create a tagger with no words, an empty name and a fresh builder."""
    # Builder of the Aho-Corasick model.
    self._builder = AcoraBuilder()
    # Name used to replace an entity word.
    self._ner_name = ""
    # Collection of all entity words.
    self._ner_word_list = []
def __init__(self, keywords, vocab=None):
    """Build an Acora search engine for a keyword set.

    :param keywords: iterable of keywords; each one is added to the
        engine.
    :param vocab: stored unchanged on the instance as ``self.vocab``.
    """
    from acora import AcoraBuilder

    self.vocab = vocab
    engine_builder = AcoraBuilder()
    for keyword in keywords:
        engine_builder.add(keyword)
    # Generate the Acora search engine for the current keyword set.
    self.engine = engine_builder.build()
def _build(self):
    """Compile every regex and index one literal keyword per regex.

    Each item of ``self._regexes_or_assoc`` is either a string (the regex)
    or a tuple whose first element is the regex and whose remaining
    elements are stored in ``self._translator``. Regexes for which no
    keyword longer than ``self._hint_len`` can be extracted are recorded
    in ``self._regexes_with_no_keywords``.

    Returns the built Acora index. Raises ValueError for a duplicated
    tuple regex or for an item of an unsupported type.
    """
    index_builder = AcoraBuilder()

    for entry in self._regexes_or_assoc:
        #
        #   First compile the regular expression and save it to the
        #   re_cache.
        #
        if isinstance(entry, tuple):
            regex = entry[0].encode(DEFAULT_ENCODING)
            self._re_cache[regex] = re.compile(regex, self._re_compile_flags)

            if regex in self._translator:
                raise ValueError('Duplicated regex "%s"' % regex)

            self._translator[regex] = entry[1:]
        elif isinstance(entry, basestring):
            regex = entry.encode(DEFAULT_ENCODING)
            self._re_cache[regex] = re.compile(regex, self._re_compile_flags)
        else:
            raise ValueError('Can NOT build MultiRE with provided values.')

        #
        #   Then extract the string literals from the regular expression;
        #   only one longer than hint_len is useful for the acora index.
        #
        candidates = esmre.shortlist(esmre.hints(regex))

        # The first shortlisted keyword is the one used (the original
        # author notes it is the longest one).
        if not candidates or len(candidates[0]) <= self._hint_len:
            self._regexes_with_no_keywords.append(regex)
            continue

        # Index the lower-cased keyword and remember which regexes it
        # belongs to.
        keyword = candidates[0].lower()
        index_builder.add(keyword)
        self._keyword_to_re.setdefault(keyword, []).append(regex)

    return index_builder.build()
def filter_text(iterable, text):
    """Yield the objects whose graph holds a literal containing ``text``.

    Matching is case-insensitive: the search text and every literal are
    lower-cased before they are compared.

    :param iterable: objects exposing ``obj["_graph"]`` as an iterable of
        3-item entries; only the third item of each entry is inspected.
    :param text: the text to look for.
    """
    matcher = AcoraBuilder(text.lower()).build()

    def has_match(obj):
        # True as soon as one Literal value contains the search text.
        return any(
            isinstance(value, Literal) and matcher.findall(value.lower())
            for _, _, value in obj["_graph"]
        )

    for candidate in iterable:
        if not has_match(candidate):
            continue
        yield candidate
def __init__(self, use_unicode=True, ignore_case=False, titles=None):
    """
    Build the title matcher.

    :param use_unicode: whether to use `titles` as unicode or bytestrings
    :param ignore_case: if True ignore case in all matches
    :param titles: if given, overrides default `load_titles()` values
    """
    if not titles:
        titles = load_titles()
    if not use_unicode:
        # Bytestring mode: encode lazily while the builder consumes them.
        titles = (title.encode('ascii') for title in titles)

    matcher_builder = AcoraBuilder()
    matcher_builder.update(titles)
    self.ac = matcher_builder.build(ignore_case=ignore_case)
def _read_upper_genes(fasta_file):
    """Return the upper-cased sequence string of every record in a FASTA file."""
    # The context manager closes the handle even if parsing fails.
    with open(fasta_file, 'r') as handle:
        return [str(record.seq.upper())
                for record in SeqIO.parse(handle, 'fasta')]


def _all_substrings(seq, min_length=4):
    """List every substring of ``seq`` that has at least ``min_length`` characters."""
    return [seq[i:i + n]
            for n in range(min_length, len(seq) + 1)
            for i in range(len(seq) - (n - 1))]


def _build_tries(substring_lists):
    """Build one Aho-Corasick keyword trie per list of substrings."""
    tries = []
    for substrings in substring_lists:
        builder = AcoraBuilder()
        for substring in substrings:
            builder.add(substring)
        tries.append(builder.build())
    return tries


def setup(vregions_file, jregions_file):
    """Load the V and J genes and build one keyword trie per gene.

    For every V gene the trie holds all substrings (4 characters or
    longer) of its last 40 nucleotides; for every J gene, of its first 40
    nucleotides. The time spent building each set of tries is printed.

    :param vregions_file: path of the FASTA file with the V regions.
    :param jregions_file: path of the FASTA file with the J regions.
    :returns: ``(v_keyword_tries, j_keyword_tries, v_genes, j_genes)``
        where the gene lists hold the full upper-cased sequences.
    """
    v_end_length = 40  # how many nts at the end of the V region to consider
    j_start_length = 40  # how many nts at the start of the J region to consider

    v_genes = _read_upper_genes(vregions_file)
    all_v_substrings = [_all_substrings(v[-v_end_length:]) for v in v_genes]

    # Only the trie construction is timed, not the substring generation.
    t0 = time.time()
    v_keyword_tries = _build_tries(all_v_substrings)
    print('V keyword tries built in %s seconds' % round(time.time() - t0, 2))

    j_genes = _read_upper_genes(jregions_file)
    all_j_substrings = [_all_substrings(j[:j_start_length]) for j in j_genes]

    t0 = time.time()
    j_keyword_tries = _build_tries(all_j_substrings)
    print('J keyword tries built in %s seconds' % round(time.time() - t0, 2))

    return v_keyword_tries, j_keyword_tries, v_genes, j_genes
def _build(self):
    """Compile every regex and index one literal keyword per regex.

    Each item of ``self._regexes_or_assoc`` is either a string (the regex)
    or a tuple whose first element is the regex and whose remaining
    elements are stored in ``self._translator``. Regexes for which no
    keyword longer than ``self._hint_len`` can be extracted are recorded
    in ``self._regexes_with_no_keywords``.

    Returns the built Acora index. Raises ValueError for a duplicated
    tuple regex or for an item of an unsupported type.
    """
    index_builder = AcoraBuilder()

    for entry in self._regexes_or_assoc:
        #
        #   First compile the regular expression and save it to the
        #   re_cache.
        #
        if isinstance(entry, tuple):
            regex = entry[0].encode(DEFAULT_ENCODING)
            self._re_cache[regex] = re.compile(regex, self._re_compile_flags)

            if regex in self._translator:
                raise ValueError('Duplicated regex "%s"' % regex)

            self._translator[regex] = entry[1:]
        elif isinstance(entry, basestring):
            regex = entry.encode(DEFAULT_ENCODING)
            self._re_cache[regex] = re.compile(regex, self._re_compile_flags)
        else:
            raise ValueError('Can NOT build MultiRE with provided values.')

        #
        #   Then extract the string literals from the regular expression;
        #   only one longer than hint_len is useful for the acora index.
        #
        candidates = esmre.shortlist(esmre.hints(regex))

        # The first shortlisted keyword is the one used (the original
        # author notes it is the longest one).
        if not candidates or len(candidates[0]) <= self._hint_len:
            self._regexes_with_no_keywords.append(regex)
            continue

        # Index the lower-cased keyword and remember which regexes it
        # belongs to.
        keyword = candidates[0].lower()
        index_builder.add(keyword)
        self._keyword_to_re.setdefault(keyword, []).append(regex)

    return index_builder.build()
def __init__(self, content: List[str], ignore_case: bool):
    """
    Acora matcher factory

    :param content: a list of items to search
    :param ignore_case: True to match any case
    :return: a built matcher
    """
    # Seed the builder with a text keyword: with empty content it would
    # otherwise build a binary Acora matcher.
    seeded_builder = AcoraBuilder("!@#$%%^&*")
    if content:
        seeded_builder.update(content)
    self.matcher = seeded_builder.build(ignore_case=ignore_case)
def test_acora_python(self):
    """Search HTTP_RESPONSE repeatedly with the PyAcora automaton.

    Two matches are expected per pass over the response.
    """
    builder = AcoraBuilder()
    builder.update([error for (error,) in SQL_ERRORS])
    automaton = builder.build(acora=PyAcora)

    #
    # The original author measured around 9 seconds for this loop on
    # their workstation.
    #
    hits = sum(
        1
        for _round in xrange(self.ITERATIONS)
        for _match in automaton.finditer(HTTP_RESPONSE)
    )

    self.assertEqual(hits, self.ITERATIONS * 2)
def __init__(self, keywords: Optional[Iterable[str]] = ()):
    """Build a keyword finder, ignoring blank keywords.

    :param keywords: keywords to search for; ``None`` or an empty iterable
        means there is nothing to search. Keywords that are empty or
        contain only whitespace are dropped.

    After construction ``self._keywords`` holds the set of kept keywords
    and ``self._finder`` is the built matcher, or ``None`` when no keyword
    was kept.
    """
    if keywords is None:
        non_empty_keywords = []
    else:
        non_empty_keywords = [w for w in keywords if w.strip()]
    self._keywords = set(non_empty_keywords)

    if self._keywords:
        ac_builder = AcoraBuilder()
        # Feed the filtered list, not the raw argument: the raw iterable
        # may still contain blank entries and, if it is a one-shot
        # iterator, has already been consumed above.
        ac_builder.update(non_empty_keywords)
        self._finder = ac_builder.build()
    else:
        self._finder = None
class Acora(object):
    """Keyword matcher loaded from a dictionary file (one keyword per line)."""

    def __init__(self, dic):
        """Read the UTF-8 dictionary file ``dic`` and build the search tree.

        :param dic: path of a text file holding one keyword per line.
        """
        import io

        self.__builder = AcoraBuilder()
        # io.open decodes to text on both Python 2 and 3, and the context
        # manager closes the file even when adding a keyword fails.
        with io.open(dic, encoding="utf-8") as fp:
            for line in fp:
                word = line.rstrip("\n")
                # Blank lines carry no keyword, so they are skipped.
                if word:
                    self.__builder.add(word)
        self.__tree = self.__builder.build()

    def findall(self, content):
        """Return the matched keywords found in ``content``, in match order."""
        return [hit_word for hit_word, _pos in self.__tree.finditer(content)]
def __init__(self, dic):
    """Read the UTF-8 dictionary file ``dic`` and build the search tree.

    :param dic: path of a text file holding one keyword per line.
    """
    import io

    self.__builder = AcoraBuilder()
    # io.open decodes to text on both Python 2 and 3, and the context
    # manager closes the file even when adding a keyword fails.
    with io.open(dic, encoding="utf-8") as fp:
        for line in fp:
            word = line.rstrip("\n")
            # Blank lines carry no keyword, so they are skipped.
            if word:
                self.__builder.add(word)
    self.__tree = self.__builder.build()
def directed_graph(self):
    """Build (once) and return the entity graph of the database.

    For every ``(text, attrib)`` item of ``self.database``, each distinct
    entity matched in the text, other than the item's own title, increments
    ``graph[title][entity]``. The result is cached in
    ``self._directed_graph`` and ``self.database`` is deleted after the
    build.
    """
    if not hasattr(self, "_directed_graph"):
        print("getting directed graph ...")
        graph = defaultdict(_dd_int)
        # Zhu: in my VM, build speed is about 1.4w entity / s
        ac = AcoraBuilder(*self.database.entities).build()
        # match consumes no time, compared to build
        for text, attrib in self.database:
            title = attrib["title"]
            # Collect the first field of every match; unlike
            # zip(*matches)[0] this also works for a text with no match.
            # NOTE(review): assumes longest_match yields indexable
            # (entity, position) items - confirm against its definition.
            entities = set(
                match[0] for match in longest_match(ac.finditer(text)))
            for entity in entities:
                if entity == title:
                    continue
                graph[title][entity] += 1
        delattr(self, "database")
        self._directed_graph = graph
    return self._directed_graph
def get_key_word(data):
    """Count, per database entry, the entity keywords related to the query.

    :param data: mapping with the keys
        ``"entity_dict"`` (entity name -> iterable of alias strings),
        ``"query"`` (the text in which entities are detected) and
        ``"database"`` (iterable of texts to score).
    :returns: a list with one integer per database entry: the number of
        matches of the detected entities (names and aliases) in that
        entry. Every count is 0 when ``entity_dict`` is empty or the query
        mentions no known entity.
    """
    entity_dict = data["entity_dict"]
    if not entity_dict:
        return [0 for _ in data["database"]]

    # Map every name and alias to its canonical entity name.
    dicts = OrderedDict()
    for key in entity_dict:
        dicts[key] = key
        for alias in entity_dict[key]:
            dicts[alias] = key

    query = data["query"]
    # Pass a real list: a dict keys view is not a list/tuple on Python 3.
    key_word_searcher = AcoraBuilder(list(dicts)).build()
    print(dicts, "------detected diccts-------")
    res = key_word_searcher.findall(query)
    print(res)
    if not res:
        return [0 for _ in data["database"]]

    # Expand every detected word to its entity name plus all its aliases.
    input_entity_key = []
    for item in res:
        canonical = dicts[item[0]]
        input_entity_key.extend(entity_dict[canonical])
        input_entity_key.append(canonical)

    key_word_searcher = AcoraBuilder(list(set(input_entity_key))).build()
    return [len(key_word_searcher.findall(entry)) for entry in data["database"]]
def compare_search(s, filename, ignore_case, *keywords):
    """Time keyword search with the enabled implementations.

    ``COMPARED_IMPLEMENTATIONS`` decides what runs: ``'pa'`` (automaton
    built with ``acora=PyAcora``), ``'ca'`` (the builder's default
    automaton) and ``'re'`` (a regular-expression alternation).

    :param s: the in-memory data to search.
    :param filename: optional file to search as well; a falsy value skips
        the file benchmarks.
    :param ignore_case: whether matching ignores case.
    :param keywords: the keywords to search for.
    :returns: the 5-tuple ``(pa_string, ca_string, pa_file, ca_file,
        re_string)`` of ``findall`` results; an entry is ``None`` when that
        search did not run or found nothing.
    """
    # All setup times start at zero so the report below works even when an
    # implementation is disabled.
    setup_pya = setup_ca = setup_re = 0
    run_pa = 'pa' in COMPARED_IMPLEMENTATIONS
    run_ca = 'ca' in COMPARED_IMPLEMENTATIONS
    run_re = 're' in COMPARED_IMPLEMENTATIONS

    builder = None
    if run_pa:
        t = time()
        builder = AcoraBuilder(keywords, ignore_case=ignore_case)
        py_acora = builder.build(acora=PyAcora)
        setup_pya = time() - t
    if run_ca:
        t = time()
        builder = AcoraBuilder(keywords, ignore_case=ignore_case)
        c_acora = builder.build()
        setup_ca = time() - t
    if run_re:
        t = time()
        # NOTE(review): keywords are joined as raw regex alternatives; they
        # are not escaped.
        if hasattr(keywords[0], 'encode'):  # unicode in Py3?
            kw_regexp = '|'.join(keywords)
        else:
            kw_regexp = '|'.encode('ASCII').join(keywords)
        if ignore_case:
            regexp = re.compile(kw_regexp, re.I)
        else:
            regexp = re.compile(kw_regexp)
        setup_re = time() - t

    # Without any Acora builder, fall back to the same text/bytes test that
    # the regex branch uses.
    if builder is not None:
        for_unicode = builder.for_unicode
    else:
        for_unicode = bool(keywords) and hasattr(keywords[0], 'encode')
    print("Case %ssensitive %s\n- setup times: PA: %.4f, CA: %.4f, RE: %.4f" % (
        'in' if ignore_case else '',
        'unicode' if for_unicode else 'bytes',
        setup_pya, setup_ca, setup_re))

    if run_pa:
        timings = timeit.Timer(partial(py_acora.findall, s)).repeat(number=REPEAT_COUNT)
        print("TIME(paS): %.3f" % min(timings))
    if run_ca:
        timings = timeit.Timer(partial(c_acora.findall, s)).repeat(number=REPEAT_COUNT)
        print("TIME(caS): %.3f" % min(timings))
    if filename:
        if run_pa:
            timings = timeit.Timer(partial(py_acora.filefindall, filename)).repeat(number=REPEAT_COUNT)
            print("TIME(paF): %.3f" % min(timings))
        if run_ca:
            timings = timeit.Timer(partial(c_acora.filefindall, filename)).repeat(number=REPEAT_COUNT)
            print("TIME(caF): %.3f" % min(timings))
    if run_re:
        timings = timeit.Timer(partial(regexp.findall, s)).repeat(number=REPEAT_COUNT)
        print("TIME(reS): %.3f" % min(timings))

    # Empty results are reported as None, exactly like searches that did
    # not run.
    return (
        (py_acora.findall(s) or None) if run_pa else None,
        (c_acora.findall(s) or None) if run_ca else None,
        (py_acora.filefindall(filename) or None) if run_pa and filename else None,
        (c_acora.filefindall(filename) or None) if run_ca and filename else None,
        (regexp.findall(s) or None) if run_re else None,
    )
def _read_upper_genes(fasta_file):
    """Return the upper-cased sequence string of every record in a FASTA file."""
    # The context manager closes the handle even if parsing fails.
    with open(fasta_file, 'r') as handle:
        return [str(record.seq.upper())
                for record in SeqIO.parse(handle, 'fasta')]


def _all_substrings(seq, min_length=4):
    """List every substring of ``seq`` that has at least ``min_length`` characters."""
    return [seq[i:i + n]
            for n in range(min_length, len(seq) + 1)
            for i in range(len(seq) - (n - 1))]


def _build_tries(substring_lists):
    """Build one Aho-Corasick keyword trie per list of substrings."""
    tries = []
    for substrings in substring_lists:
        builder = AcoraBuilder()
        for substring in substrings:
            builder.add(substring)
        tries.append(builder.build())
    return tries


def setup(vregions_file, jregions_file):
    """Load the V and J genes and build one keyword trie per gene.

    For every V gene the trie holds all substrings (4 characters or
    longer) of its last 40 nucleotides; for every J gene, of its first 40
    nucleotides. The time spent building each set of tries is printed.

    :param vregions_file: path of the FASTA file with the V regions.
    :param jregions_file: path of the FASTA file with the J regions.
    :returns: ``(v_keyword_tries, j_keyword_tries, v_genes, j_genes)``
        where the gene lists hold the full upper-cased sequences.
    """
    v_end_length = 40  # how many nts at the end of the V region to consider
    j_start_length = 40  # how many nts at the start of the J region to consider

    v_genes = _read_upper_genes(vregions_file)
    all_v_substrings = [_all_substrings(v[-v_end_length:]) for v in v_genes]

    # Only the trie construction is timed, not the substring generation.
    t0 = time.time()
    v_keyword_tries = _build_tries(all_v_substrings)
    print('V keyword tries built in %s seconds' % round(time.time() - t0, 2))

    j_genes = _read_upper_genes(jregions_file)
    all_j_substrings = [_all_substrings(j[:j_start_length]) for j in j_genes]

    t0 = time.time()
    j_keyword_tries = _build_tries(all_j_substrings)
    print('J keyword tries built in %s seconds' % round(time.time() - t0, 2))

    return v_keyword_tries, j_keyword_tries, v_genes, j_genes
def _build(self):
    """Index every configured keyword in an Acora automaton.

    Each item of ``self._keywords_or_assoc`` is either a string (the
    keyword) or a tuple whose first element is the keyword and whose
    remaining elements are stored in ``self._translator``.

    Returns the built Acora index. Raises ValueError for a duplicated
    tuple keyword or for an item of an unsupported type.
    """
    index_builder = AcoraBuilder()

    for entry in self._keywords_or_assoc:
        if isinstance(entry, tuple):
            keyword = entry[0].encode(DEFAULT_ENCODING)

            if keyword in self._translator:
                raise ValueError('Duplicated keyword "%s"' % keyword)

            self._translator[keyword] = entry[1:]
        elif isinstance(entry, basestring):
            keyword = entry.encode(DEFAULT_ENCODING)
        else:
            raise ValueError('Can NOT build MultiIn with provided values.')

        # Both accepted item kinds end up adding their encoded keyword.
        index_builder.add(keyword)

    return index_builder.build()
# Upper-cased sequences and names (second "|"-separated field of the record
# id) of every V and J gene. v_regions is created before this point.
v_nams = []
for v_gene in v_genes:
    v_regions.append(str(v_gene.seq).upper())
    v_nams.append(v_gene.id.split("|")[1])

j_regions = []
j_nams = []
for j_gene in j_genes:
    j_regions.append(str(j_gene.seq).upper())
    # The J name comes from the J record itself (the previous code read it
    # from the last V gene).
    j_nams.append(j_gene.id.split("|")[1])

## Build keyword tries of V and J tags for fast assignment
# The tag files are closed as soon as their tags have been read.
with open("tags_tr" + chain.lower() + "v.txt", "rU") as v_tag_file:
    v_seqs, half1_v_seqs, half2_v_seqs, jump_to_end_v = get_v_tags(v_tag_file, v_half_split)
with open("tags_tr" + chain.lower() + "j.txt", "rU") as j_tag_file:
    j_seqs, half1_j_seqs, half2_j_seqs, jump_to_start_j = get_j_tags(j_tag_file, j_half_split)

v_builder = AcoraBuilder()
for v_tag in v_seqs:
    v_builder.add(str(v_tag))  # Add all V tags to keyword trie
v_key = v_builder.build()

j_builder = AcoraBuilder()
for j_tag in j_seqs:
    j_builder.add(str(j_tag))  # Add all J tags to keyword trie
j_key = j_builder.build()

## Build keyword tries for first and second halves of both V and J tags
v_half1_builder = AcoraBuilder()
for v_half_tag in half1_v_seqs:
    v_half1_builder.add(str(v_half_tag))
def analysis(fastqs, vfasta, jfasta, vtags, jtags, rev_comp=False, verbose=False, sep=" "):
    """Assign V and J tags to the reads of FASTQ files and print the results.

    For every read the full V and J tags are searched first; when a full
    tag is not found, the left/right half tags are searched and a
    candidate is accepted when it is within a Hamming distance of 1 of the
    full tag. Assigned reads are printed to stdout.

    NOTE(review): the block structure below was reconstructed from source
    text whose indentation was lost; confirm the nesting against the
    original file.

    :param fastqs: FASTQ inputs; each one is opened with ``nopen``.
    :param vfasta: FASTA input with the known V gene segments.
    :param jfasta: FASTA input with the known J gene segments.
    :param vtags: V tag source handed to ``get_tags``.
    :param jtags: J tag source handed to ``get_tags``.
    :param rev_comp: when true, a read that could not be assigned is
        retried as its reverse complement.
    :param verbose: when true, progress and summary lines are written to
        stderr.
    :param sep: separator used when printing a reverse-complement
        assignment.
    """
    if verbose:
        sys.stderr.write('>> Analyzing %d file(s)\n' % len(fastqs))
        sys.stderr.write(">> Importing known V, and J gene segments and tags\n")
    # get the sequences per region
    v_genes = list(SeqIO.parse(nopen(vfasta), "fasta"))
    j_genes = list(SeqIO.parse(nopen(jfasta), "fasta"))
    # XXX
    # classes to parse fasta, fastq, and method to reverse complement
    # get rid of biopython
    v_regions = [str(v_genes[i].seq.upper()) for i, v in enumerate(v_genes)]
    j_regions = [str(j_genes[i].seq.upper()) for i, v in enumerate(j_genes)]
    # Each call returns four values: the full tags, their left halves,
    # their right halves, and a per-tag offset list (v_ends / j_starts).
    v_seqs, vleft_seqs, vright_seqs, v_ends = get_tags(vtags)
    j_seqs, jleft_seqs, jright_seqs, j_starts = get_tags(jtags)
    # full sequences
    builder = AcoraBuilder(v_seqs)
    v_key = builder.build()
    builder = AcoraBuilder(j_seqs)
    j_key = builder.build()
    # half sequences
    builder = AcoraBuilder(vleft_seqs)
    vleft_key = builder.build()
    builder = AcoraBuilder(vright_seqs)
    vright_key = builder.build()
    builder = AcoraBuilder(jleft_seqs)
    jleft_key = builder.build()
    builder = AcoraBuilder(jright_seqs)
    jright_key = builder.build()
    # correctly assigned sequences
    assigned_count = 0
    # number of sequences analysed
    seq_count = 0
    # begin clock
    t0 = time()
    # XXX
    stemplate = Template('$v $j $del_v $del_j $nt_insert')
    for fastq in fastqs:
        if verbose:
            sys.stderr.write(">> Starting %s...\n" % fastq)
        # NOTE(review): the inner ``for i in range(...)`` loops below rebind
        # this loop's ``i``.
        for i, record in enumerate(SeqIO.parse(nopen(fastq), "fastq")):
            # if i == 50:
            #     sys.exit()
            found_seq_match = 0
            seq_count += 1
            hold_v = v_key.findall(str(record.seq))
            hold_j = j_key.findall(str(record.seq))
            if hold_v:
                # the index position of the found sequence among known (v_seqs)
                v_match = v_seqs.index(hold_v[0][0])
                # new variable names
                # do not like lists for this task
                # NOTE(review): match_idx, match_start_idx, vseq_end and
                # end_of_v are not read again in this function.
                match_idx = v_seqs.index(hold_v[0][0])
                match_start_idx = hold_v[0][1]
                vseq_end = v_ends[match_idx] - 1
                end_of_v = match_start_idx + vseq_end
                # Finds where the end of a full V would be
                temp_end_v = hold_v[0][1] + v_ends[v_match] - 1
                # If the number of deletions has been found
                # NOTE(review): get_v_deletions is called twice with the
                # same arguments; end_v/deletions_v keep their previous
                # values when it returns a falsy result.
                if get_v_deletions(record.seq, v_match, temp_end_v, v_regions):
                    end_v, deletions_v = get_v_deletions(record.seq, v_match, temp_end_v, v_regions)
            else:
                # No full V tag: try both half tags and accept a candidate
                # within one mismatch of the full tag.
                # NOTE(review): this branch computes temp_end_v but never
                # updates end_v/deletions_v, which are used when printing.
                found_v_match = 0
                hold_v1 = vleft_key.findall(str(record.seq))
                hold_v2 = vright_key.findall(str(record.seq))
                for i in range(len(hold_v1)):
                    indices = [y for y, x in enumerate(vleft_seqs) if x == hold_v1[i][0] ]
                    for k in indices:
                        if len(v_seqs[k]) == len(str(record.seq)[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[vleft_seqs.index(hold_v1[i][0])])]):
                            if lev.hamming( v_seqs[k], str(record.seq)[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[k])] ) <= 1:
                                v_match = k
                                # Finds where the end of a full V would be
                                temp_end_v = hold_v1[i][1] + v_ends[v_match] - 1
                                found_v_match += 1
                for i in range(len(hold_v2)):
                    indices = [y for y, x in enumerate(vright_seqs) if x == hold_v2[i][0] ]
                    for k in indices:
                        if len(v_seqs[k]) == len(str(record.seq)[hold_v2[i][1]:hold_v2[i][1]+len(v_seqs[vright_seqs.index(hold_v2[i][0])])]):
                            if lev.hamming( v_seqs[k], str(record.seq)[hold_v2[i][1]:hold_v2[i][1]+len(v_seqs[k])] ) <= 1:
                                v_match = k
                                # Finds where the end of a full V would be
                                temp_end_v = hold_v2[i][1] + v_ends[v_match] - 1
                                found_v_match += 1
            if hold_j:
                # Assigns J
                j_match = j_seqs.index(hold_j[0][0])
                # Finds where the start of a full J would be
                temp_start_j = hold_j[0][1] - j_starts[j_match]
                if get_j_deletions( record.seq, j_match, temp_start_j, j_regions ):
                    [ start_j, deletions_j] = get_j_deletions( record.seq, j_match, temp_start_j, j_regions )
            else:
                # No full J tag: same half-tag search as for V.
                # NOTE(review): unlike the V branch, j_match is looked up
                # with .index() (first occurrence) instead of using k.
                found_j_match = 0
                hold_j1 = jleft_key.findall(str(record.seq))
                hold_j2 = jright_key.findall(str(record.seq))
                for i in range(len(hold_j1)):
                    indices = [y for y, x in enumerate(jleft_seqs) if x == hold_j1[i][0] ]
                    for k in indices:
                        if len(j_seqs[k]) == len(str(record.seq)[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[jleft_seqs.index(hold_j1[i][0])])]):
                            if lev.hamming( j_seqs[k], str(record.seq)[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[k])] ) <= 1:
                                j_match = jleft_seqs.index(hold_j1[i][0])
                                # Finds where the start of a full J would be
                                temp_start_j = hold_j1[i][1] - j_starts[j_match]
                                found_j_match += 1
                for i in range(len(hold_j2)):
                    indices = [y for y, x in enumerate(jright_seqs) if x == hold_j2[i][0] ]
                    for k in indices:
                        if len(j_seqs[k]) == len(str(record.seq)[hold_j2[i][1]:hold_j2[i][1]+len(j_seqs[jright_seqs.index(hold_j2[i][0])])]):
                            if lev.hamming( j_seqs[k], str(record.seq)[hold_j2[i][1]:hold_j2[i][1]+len(j_seqs[k])] ) <= 1:
                                j_match = jright_seqs.index(hold_j2[i][0])
                                # Finds where the start of a full J would be
                                # NOTE(review): the extra "- 6" is a magic
                                # offset for right-half matches; confirm
                                # its meaning.
                                temp_start_j = hold_j2[i][1] - j_starts[j_match] - 6
                                found_j_match += 1
            # NOTE(review): found_v_match is only assigned in the else
            # branch above, so reading it here can raise NameError when
            # every read so far had a full V tag.
            if hold_v and hold_j:
                f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = record.seq[end_v+1:start_j])
                # Write to analysis_file (text file) the classification of the sequence
                print f_seq
                assigned_count += 1
                found_seq_match = 1
            elif hold_v and found_j_match == 1:
                f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record.seq[end_v+1:start_j]))
                print f_seq
                assigned_count += 1
                found_seq_match = 1
            elif found_v_match == 1 and hold_j:
                f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record.seq[end_v+1:start_j]))
                print f_seq
                assigned_count += 1
                found_seq_match = 1
            elif found_v_match == 1 and found_j_match == 1:
                f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record.seq[end_v+1:start_j]))
                print f_seq
                assigned_count += 1
                found_seq_match = 1
            #####################
            # REVERSE COMPLEMENT
            #####################
            # Same procedure on the reverse complement, only for reads that
            # were not assigned above.
            if found_seq_match == 0 and rev_comp:
                record_reverse = record.reverse_complement()
                hold_v = v_key.findall(str(record_reverse.seq))
                hold_j = j_key.findall(str(record_reverse.seq))
                if hold_v:
                    # Assigns V
                    v_match = v_seqs.index(hold_v[0][0])
                    # Finds where the end of a full V would be
                    temp_end_v = hold_v[0][1] + v_ends[v_match] - 1
                    # If the number of deletions has been found
                    if get_v_deletions( record_reverse.seq, v_match, temp_end_v, v_regions ):
                        end_v, deletions_v = get_v_deletions( record_reverse.seq, v_match, temp_end_v, v_regions )
                else:
                    found_v_match = 0
                    hold_v1 = vleft_key.findall(str(record_reverse.seq))
                    hold_v2 = vright_key.findall(str(record_reverse.seq))
                    for i in range(len(hold_v1)):
                        indices = [y for y, x in enumerate(vleft_seqs) if x == hold_v1[i][0] ]
                        for k in indices:
                            if len(v_seqs[k]) == len(str(record_reverse.seq)[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[vleft_seqs.index(hold_v1[i][0])])]):
                                if lev.hamming( v_seqs[k], str(record_reverse.seq)[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[k])] ) <= 1:
                                    v_match = k
                                    # Finds where the end of a full V would be
                                    temp_end_v = hold_v1[i][1] + v_ends[v_match] - 1
                                    found_v_match += 1
                    for i in range(len(hold_v2)):
                        indices = [y for y, x in enumerate(vright_seqs) if x == hold_v2[i][0] ]
                        for k in indices:
                            if len(v_seqs[k]) == len(str(record_reverse.seq)[hold_v2[i][1]:hold_v2[i][1]+len(v_seqs[vright_seqs.index(hold_v2[i][0])])]):
                                if lev.hamming( v_seqs[k], str(record_reverse.seq)[hold_v2[i][1]:hold_v2[i][1]+len(v_seqs[k])] ) <= 1:
                                    v_match = k
                                    # Finds where the end of a full V would be
                                    temp_end_v = hold_v2[i][1] + v_ends[v_match] - 1
                                    found_v_match += 1
                if hold_j:
                    # Assigns J
                    j_match = j_seqs.index(hold_j[0][0])
                    # Finds where the start of a full J would be
                    temp_start_j = hold_j[0][1] - j_starts[j_match]
                    if get_j_deletions( record_reverse.seq, j_match, temp_start_j, j_regions ):
                        start_j, deletions_j = get_j_deletions( record_reverse.seq, j_match, temp_start_j, j_regions )
                else:
                    found_j_match = 0
                    hold_j1 = jleft_key.findall(str(record_reverse.seq))
                    hold_j2 = jright_key.findall(str(record_reverse.seq))
                    for i in range(len(hold_j1)):
                        indices = [y for y, x in enumerate(jleft_seqs) if x == hold_j1[i][0] ]
                        for k in indices:
                            if len(j_seqs[k]) == len(str(record_reverse.seq)[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[jleft_seqs.index(hold_j1[i][0])])]):
                                if lev.hamming( j_seqs[k], str(record_reverse.seq)[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[k])] ) <= 1:
                                    j_match = jleft_seqs.index(hold_j1[i][0])
                                    # Finds where the start of a full J would be
                                    temp_start_j = hold_j1[i][1] - j_starts[j_match]
                                    found_j_match += 1
                    for i in range(len(hold_j2)):
                        indices = [y for y, x in enumerate(jright_seqs) if x == hold_j2[i][0] ]
                        for k in indices:
                            if len(j_seqs[k]) == len(str(record_reverse.seq)[hold_j2[i][1]:hold_j2[i][1]+len(j_seqs[jright_seqs.index(hold_j2[i][0])])]):
                                if lev.hamming( j_seqs[k], str(record_reverse.seq)[hold_j2[i][1]:hold_j2[i][1]+len(j_seqs[k])] ) <= 1:
                                    j_match = jright_seqs.index(hold_j2[i][0])
                                    # Finds where the start of a full J would be
                                    temp_start_j = hold_j2[i][1] - j_starts[j_match] - 6
                                    found_j_match += 1
                # NOTE(review): f_seq is computed here but the line that
                # is printed is built from ``fields`` joined with ``sep``,
                # unlike the forward case, which prints f_seq.
                if (hold_v and hold_j) or \
                        (hold_v and found_j_match == 1) or \
                        (found_v_match == 1 and hold_j) or \
                        (found_v_match == 1 and found_j_match == 1):
                    f_seq = stemplate.substitute(v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record_reverse.seq[end_v + 1:start_j]))
                    fields = (v_match, j_match, deletions_v, deletions_j, record_reverse.seq[end_v + 1:start_j])
                    assigned_count += 1
                    found_seq_match = 1
                    print sep.join(map(str, fields))
    if verbose:
        # Summary: reads seen, reads assigned, and elapsed time.
        t = time() - t0
        sys.stderr.write('%d sequences were analysed\n' % seq_count)
        sys.stderr.write('%d sequences were successfully assigned\n' % assigned_count)
        sys.stderr.write('%s seconds elapsed\n' % t)
mouse_proteome_file = [ x for x in os.listdir(fxn.base_data_dir) if '_mouse.fasta' in x ][0] mouse_proteins = coll.defaultdict() with gzip.open(fxn.base_data_dir + mouse_proteome_file, 'rU') as in_file: for protein, seq, blank in fxn.read_fa(in_file): mouse_proteins[protein.split(' ')[0]] = seq # Then scroll through non-predicted binder files, build an AC trie of all the peptides per file data_dir = '../Data/NonPredictedBinders/' matches = coll.defaultdict(fxn.nest_counter) all_peptides = coll.defaultdict(list) for f in [x for x in os.listdir(data_dir) if x.endswith('.txt')]: nam = f.split('-')[0] search_builder = AcoraBuilder() peptides = [] # Build trie with open(data_dir + f, 'rU') as in_file: for line in in_file: search_builder.add(line.rstrip()) peptides.append(line.rstrip()) all_peptides[f.split('-')[0]].append(line.rstrip()) seq_search = search_builder.build() # Use to search all proteins in proteome for protein in mouse_proteins: seq_check = seq_search.findall(mouse_proteins[protein]) if seq_check: for s in seq_check:
zy = {'00': 1, '01': 1, '02': 1, '03': 1, '10': 1, '11': 1, '20': 1, '22': 1, '30': 1, '33': 1} zy = {i: np.log(zy[i]) for i in zy.keys()} from acora import AcoraBuilder views = pd.read_csv('View.csv', delimiter='\t', encoding='utf-8')['View'] views = AcoraBuilder(*views) views = views.build() def predict(i, data): y_pred = data.loc[i, 'predict'] s = data.loc[i, 'Content'][:maxlen] nodes = [dict(zip(['0', '1', '2', '3'], k)) for k in np.log(y_pred[:len(s)])] tags_pred_1 = viterbi(nodes) for j in views.finditer(s): for k in range(j[1], j[1] + len(j[0])): nodes[k]['1'] += 100 nodes[k]['2'] += 100 nodes[k]['3'] += 100 try:
def import_tcr_info(inputargs):
    """
    import_tcr_info: Gathers the required TCR chain information for Decombining

    Reads the V and J gene FASTA files and tag files for every requested
    chain and publishes the results as module globals (via ``globals()``):
    ``v_genes``/``j_genes``, ``v_regions``/``j_regions``, ``v_seqs``/
    ``j_seqs``, ``half1_*_seqs``/``half2_*_seqs``, ``jump_to_end_v``/
    ``jump_to_start_j``, the ``*_builder`` objects and the built tries
    ``v_key``/``j_key``, ``half1_*_key``/``half2_*_key``. It also sets the
    globals ``chain``, ``v_half_split`` and ``j_half_split``, and may
    rewrite ``inputargs['tags']`` to "original".

    NOTE(review): the block structure below was reconstructed from source
    text whose indentation was lost; confirm the nesting against the
    original file.

    :param inputargs: mapping read for the keys 'tags', 'species' and
        'tagfastadir'; it is also passed to ``get_chain``.
    :returns: ``chain_order``, a list of ``[chain, gene, tag_count]``
        entries, one per chain and gene type ('v' then 'j').
    """
    # Get chain information
    global chain
    chain = get_chain(inputargs)
    #################################################
    ############# GET GENES, BUILD TRIE #############
    #################################################
    print 'Importing TCR', ", ".join(map(chainnams.__getitem__, chain)), 'gene sequences...'
    # First check that valid tag/species combinations have been used
    if inputargs['tags'] == "extended" and inputargs['species'] == "mouse":
        print "Please note that there is currently no extended tag set for mouse TCR genes.\n \ Decombinator will now switch the tag set in use from \'extended\' to \'original\'.\n \ In future, consider editing the script to change the default, or use the appropriate flags (-sp mouse -tg original)."
        inputargs['tags'] = "original"
    if inputargs['tags'] == "extended" and ('g' in chain or 'd' in chain):
        print "Please note that there is currently no extended tag set for gamma/delta TCR genes.\n \ Decombinator will now switch the tag set in use from \'extended\' to \'original\' for these chains.\n \ In future, consider editing the script to change the default, or use the appropriate flags."
        inputargs['tags'] = "original"
    # Set tag split position, and check tag set. Note that original tags use shorter length J half tags, as these tags were originally shorter.
    global v_half_split, j_half_split
    if inputargs['tags'] == "extended":
        v_half_split, j_half_split = [10, 10]
    elif inputargs['tags'] == "original":
        v_half_split, j_half_split = [10, 6]
    else:
        print "Tag set unrecognised; should be either \'extended\' or \'original\' for human, or just \'original\' for mouse. \n \ Please check tag set and species flag."
        sys.exit()
    # Check species information
    if inputargs['species'] not in ["human", "mouse"]:
        print "Species not recognised. Please select either \'human\' (default) or \'mouse\'.\n \ If mouse is required by default, consider changing the default value in the script."
        sys.exit()
    # Look for tag and V/J fasta and tag files: if these cannot be found in the working directory, source them from GitHub repositories
    # Note that fasta/tag files fit the pattern "species_tagset_gene.[fasta/tags]"
    # I.e. "[human/mouse]_[extended/original]_TR[A/B/G/D][V/J].[fasta/tags]"
    chain_order = []
    for gene in ['v', 'j']:
        # Get FASTA data
        fasta_holder = []
        for i in range(len(chain)):
            fasta_file = read_tcr_file(inputargs['species'], inputargs['tags'], chain[i], gene, "fasta", inputargs['tagfastadir'])
            fasta_holder.append(list(SeqIO.parse(fasta_file, "fasta")))
            fasta_file.close()
            # NOTE(review): the bare expression below has no effect.
            chain
        # Publish the per-chain lists merged by flatten(), plus the
        # upper-cased sequence of every gene.
        globals()[gene + "_genes"] = flatten(fasta_holder)
        globals()[gene + "_regions"] = []
        for g in range(0, len(globals()[gene + "_genes"])):
            globals()[gene + "_regions"].append( string.upper(globals()[gene + "_genes"][g].seq))
        # Get tag data
        gene_seq_holder = [] #initialise arrays
        half1_gene_seq_holder = []
        half2_gene_seq_holder = []
        jumpfunction_holder = []
        for i in range(len(chain)):
            tag_file = read_tcr_file(inputargs['species'], inputargs['tags'], chain[i], gene, "tags", inputargs['tagfastadir']) # get tag data
            # Name of the global that receives the jump values for this
            # gene type.
            if gene == 'v':
                jumpfunction = "jump_to_end_v"
            elif gene == 'j':
                jumpfunction = "jump_to_start_j"
            # Calls the module-level get_v_tags / get_j_tags with the tag
            # file and the matching half-split position; items 0-3 of the
            # result are the full tags, first halves, second halves and
            # jump values.
            tag_info_holder = globals()["get_" + gene + "_tags"]( tag_file, globals()[gene + "_half_split"])
            gene_seq_holder.append(tag_info_holder[0])
            half1_gene_seq_holder.append(tag_info_holder[1])
            half2_gene_seq_holder.append(tag_info_holder[2])
            jumpfunction_holder.append(tag_info_holder[3])
            chain_order.append([chain[i], gene, len(gene_seq_holder[i])])
            tag_file.close()
        globals()[gene + "_seqs"] = flatten(gene_seq_holder)
        globals()["half1_" + gene + "_seqs"] = flatten(half1_gene_seq_holder)
        globals()["half2_" + gene + "_seqs"] = flatten(half2_gene_seq_holder)
        # NOTE(review): jumpfunction is only bound inside the loop above,
        # so this line fails if chain is empty.
        globals()[jumpfunction] = flatten(jumpfunction_holder)
        # Build Aho-Corasick tries
        globals()[gene + "_builder"] = AcoraBuilder()
        for i in range(0, len(globals()[gene + "_seqs"])):
            globals()[gene + "_builder"].add(str( globals()[gene + "_seqs"][i])) # Add all tags of this gene type to the keyword trie
        globals()[gene + "_key"] = globals()[gene + "_builder"].build()
        # And tries for split, half-tags
        globals()[gene + "_half1_builder"] = AcoraBuilder()
        for i in range(0, len(globals()["half1_" + gene + "_seqs"])):
            globals()[gene + "_half1_builder"].add( str(globals()["half1_" + gene + "_seqs"][i]))
        globals()["half1_" + gene + "_key"] = globals()[gene + "_half1_builder"].build()
        globals()[gene + "_half2_builder"] = AcoraBuilder()
        for i in range(0, len(globals()["half2_" + gene + "_seqs"])):
            globals()[gene + "_half2_builder"].add( str(globals()["half2_" + gene + "_seqs"][i]))
        globals()["half2_" + gene + "_key"] = globals()[gene + "_half2_builder"].build()
    return chain_order
def __init__(self, text):
    """Store *text* and build the Aho-Corasick keyword finder.

    The finder is built once from a fixed keyword list and kept on
    ``self.finder``; ``self.text`` holds the text passed in.
    """
    self.text = text
    keywords = [
        "ownership", "owner", "own",
        # "propietary" is a misspelling, so the correctly spelled word was
        # never matched.  The correct spelling is added; the old one is kept
        # so any text that previously matched still does.
        "proprietary", "propietary",
        "tracking", "track",
        "store", "keep", "keeping",
    ]
    builder = AcoraBuilder()
    builder.add(*keywords)
    self.finder = builder.build()
def import_tcr_info(inputargs):
    """
    import_tcr_info: Gathers the required TCR chain information for Decombining

    Determines the TCR chain (from inputargs['chain'] or, failing that, from a
    chain name found in inputargs['fastq']), validates the tag set and species
    options, then for both the V and J genes reads the FASTA and tag files and
    builds Aho-Corasick tries for the full tags and for both half tags.

    Nothing is returned; results are published as module-level globals:
    chainnams, chain, counts, v_half_split, j_half_split, jump_to_end_v,
    jump_to_start_j and, for each gene g in 'v'/'j': g_genes, g_regions,
    g_seqs, half1_g_seqs, half2_g_seqs, g_builder, g_key, g_half1_builder,
    half1_g_key, g_half2_builder and half2_g_key.

    inputargs is indexed with the keys 'fastq', 'chain', 'tags', 'species' and
    'tagfastadir'. inputargs['tags'] is rewritten in place from "extended" to
    "original" when no extended tag set exists for the species/chain.

    Prints a message and calls sys.exit() if the chain, tag set or species is
    not recognised.
    """

    # Get chain information
    global chainnams, chain, counts
    global v_half_split, j_half_split
    counts = coll.Counter()
    chainnams = {"a": "alpha", "b": "beta", "g": "gamma", "d": "delta"}

    # Defined before either branch that prints it: previously it was only
    # assigned on the "infer from filename" path, so an unrecognised explicit
    # chain raised UnboundLocalError instead of reporting the problem.
    nochain_error = "TCR chain not recognised. \n \
    Please either include (one) chain name in the file name (i.e. alpha/beta/gamma/delta),\n \
    or use the \'-c\' flag with an explicit chain option (a/b/g/d, case-insensitive)."

    # Detect whether chain specified in filename
    fastq_name = inputargs['fastq'].lower()
    inner_filename_chains = [x for x in chainnams.values() if x in fastq_name]
    if len(inner_filename_chains) == 1:
        counts['chain_detected'] = 1

    if inputargs['chain']:
        requested = inputargs['chain'].upper()
        if requested in ['A', 'ALPHA', 'TRA', 'TCRA']:
            chain = "a"
        elif requested in ['B', 'BETA', 'TRB', 'TCRB']:
            chain = "b"
        elif requested in ['G', 'GAMMA', 'TRG', 'TCRG']:
            chain = "g"
        elif requested in ['D', 'DELTA', 'TRD', 'TCRD']:
            chain = "d"
        else:
            print(nochain_error)
            sys.exit()
    else:
        # If no chain provided, try and infer from filename
        if counts['chain_detected'] == 1:
            chain = inner_filename_chains[0][0]
        else:
            print(nochain_error)
            sys.exit()

    #################################################
    ############# GET GENES, BUILD TRIE #############
    #################################################

    print('Importing TCR', chainnams[chain], 'gene sequences...')

    # First check that valid tag/species combinations have been used
    if inputargs['tags'] == "extended" and inputargs['species'] == "mouse":
        print(
            "Please note that there is currently no extended tag set for mouse TCR genes.\n \
            Decombinator will now switch the tag set in use from \'extended\' to \'original\'.\n \
            In future, consider editing the script to change the default, or use the appropriate flags (-sp mouse -tg original)."
        )
        inputargs['tags'] = "original"

    if inputargs['tags'] == "extended" and (chain == 'g' or chain == 'd'):
        print(
            "Please note that there is currently no extended tag set for gamma/delta TCR genes.\n \
            Decombinator will now switch the tag set in use from \'extended\' to \'original\'.\n \
            In future, consider editing the script to change the default, or use the appropriate flags."
        )
        inputargs['tags'] = "original"

    # Set tag split position, and check tag set. Note that original tags use
    # shorter length J half tags, as these tags were originally shorter.
    if inputargs['tags'] == "extended":
        v_half_split, j_half_split = [10, 10]
    elif inputargs['tags'] == "original":
        v_half_split, j_half_split = [10, 6]
    else:
        print(
            "Tag set unrecognised; should be either \'extended\' or \'original\' for human, or just \'original\' for mouse. \n \
            Please check tag set and species flag.")
        sys.exit()

    # Check species information
    if inputargs['species'] not in ["human", "mouse"]:
        print(
            "Species not recognised. Please select either \'human\' (default) or \'mouse\'.\n \
            If mouse is required by default, consider changing the default value in the script."
        )
        sys.exit()

    # Look for tag and V/J fasta and tag files: if these cannot be found in the
    # working directory, source them from GitHub repositories
    # Note that fasta/tag files fit the pattern "species_tagset_gene.[fasta/tags]"
    # I.e. "[human/mouse]_[extended/original]_TR[A/B/G/D][V/J].[fasta/tags]"
    for gene in ['v', 'j']:
        # Get FASTA data
        fasta_file = read_tcr_file(inputargs['species'], inputargs['tags'],
                                   gene, "fasta", inputargs['tagfastadir'])
        genes = list(SeqIO.parse(fasta_file, "fasta"))
        globals()[gene + "_genes"] = genes
        globals()[gene + "_regions"] = [record.seq.upper() for record in genes]

        # Get tag data
        tag_file = read_tcr_file(inputargs['species'], inputargs['tags'],
                                 gene, "tags", inputargs['tagfastadir'])
        if gene == 'v':
            jumpfunction = "jump_to_end_v"
        elif gene == 'j':
            jumpfunction = "jump_to_start_j"

        tag_parser = globals()["get_" + gene + "_tags"]
        # Context manager so the tag file is closed even if parsing fails.
        with open(tag_file, "r") as tag_data:
            seqs, half1_seqs, half2_seqs, jumps = tag_parser(
                tag_data, globals()[gene + "_half_split"])

        globals()[gene + "_seqs"] = seqs
        globals()["half1_" + gene + "_seqs"] = half1_seqs
        globals()["half2_" + gene + "_seqs"] = half2_seqs
        globals()[jumpfunction] = jumps

        # Build Aho-Corasick tries: one for the full tags, then one each for
        # the split half-tags. Builder and built trie are both kept as globals.
        trie_specs = (
            (gene + "_builder", gene + "_key", seqs),
            (gene + "_half1_builder", "half1_" + gene + "_key", half1_seqs),
            (gene + "_half2_builder", "half2_" + gene + "_key", half2_seqs),
        )
        for builder_name, key_name, tags in trie_specs:
            builder = AcoraBuilder()
            for tag in tags:
                builder.add(str(tag))
            globals()[builder_name] = builder
            globals()[key_name] = builder.build()
import json import linecache import os import re import jieba import numpy as np from acora import AcoraBuilder from emotion_cla.emo_cls import classify from emotion_cla.separate import separate in_dir = 'data/tweet' out_dir = 'data/tweet_emo' builder = AcoraBuilder([line.strip() for line in open('data/emoji.txt')]) ac = builder.build() def load_labelled(): lines = set() for i in range(5): for line in open('data/content_3000/{}.txt'.format(i)): lines.add(line.strip()) return lines # have_lines = load_labelled() def random_ids(in_name, out_name, lens): '''
for key, values in output_dict.items(): # remove last ", " output_dict[key] = values[:-2] return output_dict if __name__ == "__main__": args = parsing_argument() if not args.source: raise Exception("Please input the source file") with open(args.source, 'r') as file: keywords = file.read().splitlines() # Reading the source file ac = AcoraBuilder(keywords) ac = ac.build() # build the model for searching the keywords # Reading the target files if args.target_files: with open(args.target_files, 'r') as file: target_files = file.read().splitlines() target_file = [ target_file for target_file in target_files if ".pdf" in target_file or ".html" in target_file ] else: target_files = [ os.path.join(paths, file) for paths, _, files in os.walk(args.target_folder) for file in files if '.pdf' in file or '.html' in file
def analysis(Sequence_Reads, with_statistics=True, with_reverse_complement_search=True):
    """Assign V and J genes to every read in the given FASTQ files.

    Reads the V/J gene regions from "human_TRBV_region.fasta" and
    "human_TRBJ_region.fasta" and the tags from "tags_trbv.txt" and
    "tags_trbj.txt", builds Aho-Corasick tries for the full tags and for both
    halves of every tag, then scans each read.  A gene is assigned from the
    first full-tag hit, or, when there is no full-tag hit, from a half-tag hit
    provided exactly one half-tag candidate lies within Hamming distance 1 of
    its full tag.  Every read for which both genes and both deletion counts are
    found is written to "DecombinatorResults.txt" (truncated at the start of
    the run) as one line: ``v j del_v del_j nt_insert``.

    Sequence_Reads: sequence of FASTQ file names to analyse.
    with_statistics: when equal to True, print read counts and elapsed time.
    with_reverse_complement_search: when equal to True, retry unassigned reads
        on their reverse complement.

    Relies on get_v_tags, get_j_tags, get_v_deletions and get_j_deletions from
    the enclosing module.  Returns None.
    """
    from string import Template
    from time import time

    from Bio import SeqIO
    from acora import AcoraBuilder
    import Levenshtein as lev

    # Do not change - V tags are split at position 10, J at position 6, to look
    # for half tags if no full tag is found.
    v_half_split, j_half_split = [10, 6]

    def _build_trie(tags):
        # Build one Aho-Corasick matcher over the given tag sequences.
        builder = AcoraBuilder()
        for tag in tags:
            builder.add(str(tag))
        return builder.build()

    def _load_regions(fasta_name):
        # Upper-cased sequence of every record in a FASTA file.
        # "rU" is kept from the original (Python 2 universal-newline mode).
        with open(fasta_name, "rU") as handle:
            return [gene.seq.upper() for gene in SeqIO.parse(handle, "fasta")]

    def _half_tag_hits(read, half_key, half_tags, full_tags):
        # Yield (candidate_index, first_index, position) for every half-tag hit
        # whose full tag is within Hamming distance 1 of the read at that
        # position.  first_index is the first tag sharing the hit half tag.
        # NOTE(review): the comparison window starts at the hit position for
        # second-half hits as well, exactly as before - confirm that no shift
        # by the half-split length is intended.
        for hit, pos in half_key.findall(read):
            candidates = [k for k, half in enumerate(half_tags) if half == hit]
            if not candidates:
                continue
            window = read[pos:pos + len(full_tags[candidates[0]])]
            for k in candidates:
                if len(full_tags[k]) != len(window):
                    continue
                if lev.hamming(full_tags[k], read[pos:pos + len(full_tags[k])]) <= 1:
                    yield k, candidates[0], pos

    def _locate_v(read):
        # Return (v_index, expected end of the full V) or None.
        full_hits = v_key.findall(read)
        if full_hits:
            v_match = v_seqs.index(full_hits[0][0])
            return v_match, full_hits[0][1] + jump_to_end_v[v_match] - 1
        located, n_found = None, 0
        for half_key, half_tags in ((half1_v_key, half1_v_seqs),
                                    (half2_v_key, half2_v_seqs)):
            for k, _first, pos in _half_tag_hits(read, half_key, half_tags, v_seqs):
                located = (k, pos + jump_to_end_v[k] - 1)
                n_found += 1
        # Only an unambiguous (single) half-tag match is accepted.
        return located if n_found == 1 else None

    def _locate_j(read):
        # Return (j_index, expected start of the full J) or None.
        full_hits = j_key.findall(read)
        if full_hits:
            j_match = j_seqs.index(full_hits[0][0])
            return j_match, full_hits[0][1] - jump_to_start_j[j_match]
        located, n_found = None, 0
        # NOTE(review): as before, J is assigned the FIRST tag sharing the half
        # tag rather than the candidate that passed the Hamming test (V uses
        # the candidate) - confirm this asymmetry is intended.
        for _k, first, pos in _half_tag_hits(read, half1_j_key, half1_j_seqs, j_seqs):
            located = (first, pos - jump_to_start_j[first])
            n_found += 1
        for _k, first, pos in _half_tag_hits(read, half2_j_key, half2_j_seqs, j_seqs):
            # Extra shift was a hard-coded 6, the value of j_half_split.
            located = (first, pos - jump_to_start_j[first] - j_half_split)
            n_found += 1
        return located if n_found == 1 else None

    def _classify(seq):
        # Return the formatted classification line for one read sequence, or
        # None when V, J or either deletion count cannot be determined.
        read = str(seq)
        v_hit = _locate_v(read)
        j_hit = _locate_j(read)
        if v_hit is None or j_hit is None:
            return None
        v_match, temp_end_v = v_hit
        j_match, temp_start_j = j_hit
        # Deletions are always computed for THIS read.  Previously, genes found
        # via half tags reused end/deletion values left over from an earlier
        # read (or hit an unbound local on the first such read).
        v_result = get_v_deletions(seq, v_match, temp_end_v, v_regions)
        if not v_result:
            return None
        j_result = get_j_deletions(seq, j_match, temp_start_j, j_regions)
        if not j_result:
            return None
        end_v, deletions_v = v_result
        start_j, deletions_j = j_result
        return stemplate.substitute(
            v=v_match, j=j_match, del_v=deletions_v, del_j=deletions_j,
            nt_insert=str(seq[end_v + 1:start_j]))

    print("Commencing analysis on a total of %d file(s)" % len(Sequence_Reads))

    ## Create (or empty) the .txt file that stores
    ## f=(v_index,j_index,v_deletions,j_deletions,nt_insert)
    results = "DecombinatorResults.txt"
    open(results, "w").close()

    print('Importing known V, D and J gene segments and tags...')
    v_regions = _load_regions("human_TRBV_region.fasta")
    j_regions = _load_regions("human_TRBJ_region.fasta")

    ## Build keyword tries of V and J tags for fast assignment
    with open("tags_trbv.txt", "rU") as tag_handle:
        v_seqs, half1_v_seqs, half2_v_seqs, jump_to_end_v = get_v_tags(tag_handle, v_half_split)
    with open("tags_trbj.txt", "rU") as tag_handle:
        j_seqs, half1_j_seqs, half2_j_seqs, jump_to_start_j = get_j_tags(tag_handle, j_half_split)

    v_key = _build_trie(v_seqs)
    j_key = _build_trie(j_seqs)

    ## Build keyword tries for first and second halves of both V and J tags
    half1_v_key = _build_trie(half1_v_seqs)
    half2_v_key = _build_trie(half2_v_seqs)
    half1_j_key = _build_trie(half1_j_seqs)
    half2_j_key = _build_trie(half2_j_seqs)

    ## Initialise variables
    assigned_count = 0  # reads for which every field could be assigned
    seq_count = 0  # reads analysed in total
    t0 = time()  # Begin timer

    # One output line holds the 5 variables separated by a space.
    stemplate = Template('$v $j $del_v $del_j $nt_insert')

    # Equality tests (not truthiness) are kept so that non-boolean arguments
    # behave exactly as they did before.
    search_reverse = with_reverse_complement_search == True
    report_statistics = with_statistics == True

    ## Begin analysing sequences
    with open(results, "a") as analysis_file:
        for read_file in Sequence_Reads:
            print("Importing sequences from %s  and assigning V and J regions..." % (read_file,))
            with open(read_file, "rU") as handle:
                for record in SeqIO.parse(handle, "fastq"):
                    seq_count += 1
                    f_seq = _classify(record.seq)
                    if f_seq is None and search_reverse:
                        f_seq = _classify(record.reverse_complement().seq)
                    if f_seq is not None:
                        # Write the classification of the sequence
                        analysis_file.write(f_seq + "\n")
                        assigned_count += 1

    if report_statistics:
        timed = time() - t0
        print("%d sequences were analysed" % seq_count)
        print("%d  sequences were successfully assigned" % assigned_count)
        print("Time taken = %s seconds" % timed)