def get_key_word(data):
    """Count, per database record, the hits of entities related to the query.

    ``data`` is a mapping read with these keys:

    * ``"entity_dict"`` - maps an entity key to an iterable of its aliases.
    * ``"query"``       - text searched for any key or alias.
    * ``"database"``    - iterable of texts to score.

    Every key/alias found in the query selects its entity key together with
    all aliases of that key.  Each database record is then scored with the
    number of occurrences of any selected term.

    Returns a list with one integer per database record, in order.  All
    scores are 0 when ``entity_dict`` is empty or the query contains no
    known term.
    """
    entity_dict = data["entity_dict"]
    database = data["database"]

    if not entity_dict:
        return [0 for _ in database]

    # Map every searchable term (the key itself and each alias) to its key.
    alias_to_key = OrderedDict()
    for key, aliases in entity_dict.items():
        alias_to_key[key] = key
        for alias in aliases:
            alias_to_key[alias] = key

    query = data["query"]
    # AcoraBuilder only unpacks a single *list/tuple* argument; a Python 3
    # dict view would be taken as one keyword, so materialise the terms.
    query_searcher = AcoraBuilder(list(alias_to_key)).build()
    print(alias_to_key, "------detected diccts-------")
    hits = query_searcher.findall(query)
    print(hits)

    if not hits:
        return [0 for _ in database]

    # Expand every term found in the query to its key plus all its aliases.
    related_terms = set()
    for hit in hits:
        key = alias_to_key[hit[0]]
        related_terms.add(key)
        related_terms.update(entity_dict[key])

    record_searcher = AcoraBuilder(list(related_terms)).build()
    return [len(record_searcher.findall(record)) for record in database]
def compare_search(s, filename, ignore_case, *keywords):
    """Benchmark the enabled search implementations and return their results.

    Which implementations run is decided by the module-level
    ``COMPARED_IMPLEMENTATIONS`` ('pa' = PyAcora, 'ca' = the default Acora
    build, 're' = a regular-expression alternation).

    :param s: text searched with ``findall`` by every enabled implementation
    :param filename: if truthy, also searched with ``filefindall`` by the
        Acora implementations
    :param ignore_case: build case-insensitive matchers when true
    :param keywords: the keywords to search for
    :returns: 5-tuple ``(pa_string, ca_string, pa_file, ca_file, re_string)``;
        an entry is ``None`` when that implementation is disabled or when its
        result is empty/falsy

    Setup and search timings are printed to stdout.
    """
    # Every setup time needs a default: the report line below formats all
    # three even when an implementation is disabled.
    setup_pya = setup_ca = setup_re = 0
    builder = None

    run_pa = 'pa' in COMPARED_IMPLEMENTATIONS
    run_ca = 'ca' in COMPARED_IMPLEMENTATIONS
    run_re = 're' in COMPARED_IMPLEMENTATIONS

    if run_pa:
        t = time()
        builder = AcoraBuilder(keywords, ignore_case=ignore_case)
        py_acora = builder.build(acora=PyAcora)
        setup_pya = time() - t

    if run_ca:
        t = time()
        builder = AcoraBuilder(keywords, ignore_case=ignore_case)
        c_acora = builder.build()
        setup_ca = time() - t

    if run_re:
        t = time()
        if hasattr(keywords[0], 'encode'):  # unicode in Py3?
            kw_regexp = '|'.join(keywords)
        else:
            kw_regexp = '|'.encode('ASCII').join(keywords)
        if ignore_case:
            regexp = re.compile(kw_regexp, re.I)
        else:
            regexp = re.compile(kw_regexp)
        setup_re = time() - t

    if builder is None:
        # Neither Acora variant ran; a builder is still needed (outside the
        # timed sections) to report whether the keywords are unicode or bytes.
        builder = AcoraBuilder(keywords, ignore_case=ignore_case)

    print("Case %ssensitive %s\n- setup times: PA: %.4f, CA: %.4f, RE: %.4f" % (
        'in' if ignore_case else '',
        'unicode' if builder.for_unicode else 'bytes',
        setup_pya, setup_ca, setup_re))

    if run_pa:
        timings = timeit.Timer(
            partial(py_acora.findall, s)).repeat(number=REPEAT_COUNT)
        print("TIME(paS): %.3f" % min(timings))
    if run_ca:
        timings = timeit.Timer(
            partial(c_acora.findall, s)).repeat(number=REPEAT_COUNT)
        print("TIME(caS): %.3f" % min(timings))
    if filename:
        if run_pa:
            timings = timeit.Timer(partial(
                py_acora.filefindall, filename)).repeat(number=REPEAT_COUNT)
            print("TIME(paF): %.3f" % min(timings))
        if run_ca:
            timings = timeit.Timer(partial(
                c_acora.filefindall, filename)).repeat(number=REPEAT_COUNT)
            print("TIME(caF): %.3f" % min(timings))
    if run_re:
        timings = timeit.Timer(
            partial(regexp.findall, s)).repeat(number=REPEAT_COUNT)
        print("TIME(reS): %.3f" % min(timings))

    # The `and ... or None` form is kept on purpose: it maps a disabled
    # implementation AND an empty result to None, which callers may rely on.
    return (
        run_pa and py_acora.findall(s) or None,
        run_ca and c_acora.findall(s) or None,
        run_pa and (filename and py_acora.filefindall(filename)) or None,
        run_ca and (filename and c_acora.filefindall(filename)) or None,
        run_re and regexp.findall(s) or None,
    )
def _read_upper_sequences(fasta_path):
    """Return the upper-cased sequence string of every record in a FASTA file."""
    # The context manager closes the handle even if parsing raises.
    with open(fasta_path, 'r') as handle:
        return [str(record.seq).upper()
                for record in SeqIO.parse(handle, 'fasta')]


def _all_substrings(seq, min_length):
    """Return every substring of ``seq`` that is at least ``min_length`` long."""
    return [
        seq[i:i + n]
        for n in range(min_length, len(seq) + 1)
        for i in range(len(seq) - (n - 1))
    ]


def _build_substring_tries(fragments, min_length, label):
    """Build one Acora matcher per fragment from all of its substrings."""
    substrings_per_fragment = [
        _all_substrings(fragment, min_length) for fragment in fragments
    ]

    # Only the trie construction is timed, not the substring enumeration.
    t0 = time.time()
    tries = []
    for substrings in substrings_per_fragment:
        builder = AcoraBuilder()
        for substring in substrings:
            builder.add(substring)
        tries.append(builder.build())
    print('%s keyword tries built in %s seconds'
          % (label, round(time.time() - t0, 2)))
    return tries


def setup(vregions_file, jregions_file, v_end_length=40, j_start_length=40,
          min_substring_length=4):
    """Load V and J gene sequences and build substring matchers for them.

    For every V gene the last ``v_end_length`` nucleotides are taken, for
    every J gene the first ``j_start_length`` nucleotides; one Acora matcher
    is then built per gene from all substrings of that fragment that are at
    least ``min_substring_length`` long.

    :param vregions_file: path of the FASTA file with the V regions
    :param jregions_file: path of the FASTA file with the J regions
    :param v_end_length: how many nts at the end of the V region to consider
        (must be positive; 0 would select the whole sequence)
    :param j_start_length: how many nts at the start of the J region to
        consider
    :param min_substring_length: shortest substring added to a matcher
    :returns: ``(v_keyword_tries, j_keyword_tries, v_genes, j_genes)`` where
        the ``*_genes`` lists hold the full upper-cased sequences

    The build time of each matcher set is printed to stdout.
    """
    v_genes = _read_upper_sequences(vregions_file)
    v_genes_cut = [v[-v_end_length:] for v in v_genes]
    v_keyword_tries = _build_substring_tries(
        v_genes_cut, min_substring_length, 'V')

    j_genes = _read_upper_sequences(jregions_file)
    j_genes_cut = [j[:j_start_length] for j in j_genes]
    j_keyword_tries = _build_substring_tries(
        j_genes_cut, min_substring_length, 'J')

    return v_keyword_tries, j_keyword_tries, v_genes, j_genes
def __init__(self, term_index):
    """Keep ``term_index`` and build an Acora matcher over its entries.

    Iterating ``term_index`` must yield the keywords to index; the built
    matcher is stored as ``self.ac``.
    """
    self.term_index = term_index

    matcher_builder = AcoraBuilder()
    # add() accepts any number of keywords, so unpack them in a single call.
    matcher_builder.add(*term_index)
    self.ac = matcher_builder.build()
def match_lines(self, s, *keywords):
    '''
    Yield every line of a text that contains at least one keyword.

    @param s The text to search. (The earlier documentation called this a
             file name, but the value itself is searched and sliced.)
    @param keywords The keywords to look for. The earlier documentation
             mentions two of them (a primary key and a parameter); any
             number is accepted by the signature.
    @returns Generator over the matching lines, each sliced up to (not
             including) the line terminator that ends it.
    '''
    # Line terminators are indexed too, so one pass over the matches finds
    # both the line boundaries and the keyword hits.
    searcher = AcoraBuilder('\r', '\n', *keywords).build()

    current_line_start = 0
    line_has_hit = False
    for found, offset in searcher.finditer(s):
        if found not in '\r\n':
            line_has_hit = True
            continue
        # A terminator was reached: emit the finished line if it had a hit.
        if line_has_hit:
            yield s[current_line_start:offset]
            line_has_hit = False
        current_line_start = offset + 1

    # The last line may not be terminated.
    if line_has_hit:
        yield s[current_line_start:]
def build_keyword_tries(seqs):
    """Build and return an Acora matcher over ``str()`` of every item in ``seqs``."""
    trie_builder = AcoraBuilder()
    for seq in seqs:
        # Each item is converted to a string before it is added as a keyword.
        trie_builder.add(str(seq))
    return trie_builder.build()
def __init__(self):
    """Create an empty recogniser: no entity words, no name, a fresh builder."""
    # Builder for the AC (Aho-Corasick) model
    self._builder = AcoraBuilder()
    # Name that the entity words are replaced with
    self._ner_name = ""
    # Collection of all entity words
    self._ner_word_list = []
def __init__(self, keywords, vocab=None):
    """Store ``vocab`` and build the Acora search engine for ``keywords``.

    :param keywords: iterable of keywords to index
    :param vocab: stored unchanged as ``self.vocab``
    """
    # Imported locally, as in the original, so the dependency is only needed
    # when an instance is actually created.
    from acora import AcoraBuilder

    self.vocab = vocab

    engine_builder = AcoraBuilder()
    for keyword in keywords:
        engine_builder.add(keyword)

    # Generate the Acora search engine for the current keyword set
    self.engine = engine_builder.build()
def _build(self):
    """Compile all regular expressions and build the Acora keyword index.

    Each item of ``self._regexes_or_assoc`` is either a string (the regex)
    or a tuple whose first element is the regex and whose remaining
    elements are stored in ``self._translator``.

    A regex whose hint keyword is longer than ``self._hint_len`` is
    registered in ``self._keyword_to_re`` under that lower-cased keyword;
    every other regex is appended to ``self._regexes_with_no_keywords``.

    :returns: the built Acora matcher over all registered keywords
    :raises ValueError: on a duplicated tuple regex or an unsupported item
    """
    acora_builder = AcoraBuilder()

    for entry in self._regexes_or_assoc:
        #
        # First compile the regular expression and cache it
        #
        if isinstance(entry, tuple):
            regex = entry[0].encode(DEFAULT_ENCODING)
            self._re_cache[regex] = re.compile(regex, self._re_compile_flags)

            if regex in self._translator:
                raise ValueError('Duplicated regex "%s"' % regex)

            self._translator[regex] = entry[1:]

        elif isinstance(entry, basestring):
            regex = entry.encode(DEFAULT_ENCODING)
            self._re_cache[regex] = re.compile(regex, self._re_compile_flags)

        else:
            raise ValueError('Can NOT build MultiRE with provided values.')

        #
        # Then extract the string literals from the regular expression; only
        # a keyword longer than hint_len is useful for the acora index
        #
        candidates = esmre.shortlist(esmre.hints(regex))

        # The first candidate is used (the original author notes it is the
        # longest one); it is only read when the shortlist is non-empty.
        if not candidates or len(candidates[0]) <= self._hint_len:
            self._regexes_with_no_keywords.append(regex)
            continue

        # Index the keyword and remember which regexes it stands for
        keyword = candidates[0].lower()
        acora_builder.add(keyword)
        self._keyword_to_re.setdefault(keyword, []).append(regex)

    return acora_builder.build()
def __init__(self, use_unicode=True, ignore_case=False, titles=None):
    """
    Build the Acora matcher over the titles and store it as ``self.ac``.

    :param use_unicode: if false, every title is encoded to ASCII bytes
                        before it is indexed
    :param ignore_case: if True ignore case in all matches
    :param titles: if given (and non-empty), overrides the default
                   `load_titles()` values
    """
    if not titles:
        titles = load_titles()

    if not use_unicode:
        # Lazily encoded; consumed once by the builder below.
        titles = (title.encode('ascii') for title in titles)

    matcher_builder = AcoraBuilder()
    matcher_builder.update(titles)
    self.ac = matcher_builder.build(ignore_case=ignore_case)
def __init__(self, content: List[str], ignore_case: bool):
    """
    Build an Acora matcher over ``content`` and store it as ``self.matcher``.

    :param content: a list of items to search for
    :param ignore_case: True to match any case
    """
    # The builder is seeded with a placeholder string so that it is set up
    # for text even when ``content`` is empty; without it an empty builder
    # would produce a binary Acora matcher.
    matcher_builder = AcoraBuilder("!@#$%%^&*")
    if content:
        matcher_builder.update(content)

    self.matcher = matcher_builder.build(ignore_case=ignore_case)
def test_acora_python(self):
    """The pure-Python matcher finds two SQL errors per pass over the response."""
    error_builder = AcoraBuilder()
    error_builder.update([error for (error,) in SQL_ERRORS])
    matcher = error_builder.build(acora=PyAcora)

    #
    #   Author's note: this takes around 9 seconds on their workstation.
    #
    match_count = sum(
        1
        for _ in xrange(self.ITERATIONS)
        for _ in matcher.finditer(HTTP_RESPONSE)
    )

    self.assertEqual(match_count, self.ITERATIONS * 2)
def __init__(self, keywords: Optional[Iterable[str]] = None):
    """Build a keyword finder, ignoring blank keywords.

    :param keywords: keywords to index; ``None`` or an iterable without any
        non-blank entry leaves the finder unset

    ``self._keywords`` holds the set of non-blank keywords.  ``self._finder``
    is the built Acora matcher, or ``None`` when there are no keywords.
    """
    # Materialise the filtered keywords once: ``keywords`` may be a one-shot
    # iterable, and blank entries must not reach the matcher.
    non_empty_keywords = []
    if keywords is not None:
        non_empty_keywords = [w for w in keywords if w.strip() != ""]

    self._keywords = set(non_empty_keywords)

    if self._keywords:
        ac_builder = AcoraBuilder()
        # Index the filtered list, not the raw argument.
        ac_builder.update(non_empty_keywords)
        self._finder = ac_builder.build()
    else:
        self._finder = None
def directed_graph(self):
    """Return the entity co-occurrence graph, building it on first access.

    The graph maps ``title -> entity -> count``: for every ``(text, attrib)``
    pair in ``self.database``, each distinct entity found in ``text`` (other
    than ``attrib["title"]`` itself) increments the count by one.

    The result is cached as ``self._directed_graph``; ``self.database`` is
    deleted once the graph has been built.
    """
    if not hasattr(self, "_directed_graph"):
        print("getting directed graph ...")
        graph = defaultdict(_dd_int)

        # Original author's note (Zhu): on their VM the build speed is about
        # "1.4w" entities per second; matching takes no time compared to it.
        ac = AcoraBuilder(*self.database.entities).build()

        for text, attrib in self.database:
            title = attrib["title"]
            # Take the first element of every match.  Unlike zip(*...)[0]
            # this also works when the text contains no entity at all.
            entities = set(
                match[0] for match in longest_match(ac.finditer(text))
            )
            for entity in entities:
                if entity == title:
                    continue
                graph[title][entity] += 1

        delattr(self, "database")
        self._directed_graph = graph
    return self._directed_graph
def _build(self):
    """Build the Acora matcher for all configured keywords.

    Each item of ``self._keywords_or_assoc`` is either a string (the
    keyword) or a tuple whose first element is the keyword and whose
    remaining elements are stored in ``self._translator``.

    :returns: the built Acora matcher
    :raises ValueError: on a duplicated tuple keyword or an unsupported item
    """
    keyword_builder = AcoraBuilder()

    for entry in self._keywords_or_assoc:
        is_assoc = isinstance(entry, tuple)

        if not is_assoc and not isinstance(entry, basestring):
            raise ValueError('Can NOT build MultiIn with provided values.')

        raw_keyword = entry[0] if is_assoc else entry
        keyword = raw_keyword.encode(DEFAULT_ENCODING)

        if is_assoc:
            # Only associated keywords are tracked in the translator
            if keyword in self._translator:
                raise ValueError('Duplicated keyword "%s"' % keyword)
            self._translator[keyword] = entry[1:]

        keyword_builder.add(keyword)

    return keyword_builder.build()
def import_tcr_info(inputargs):
    """
    import_tcr_info: Gathers the required TCR chain information for Decombining

    Reads the V and J FASTA and tag files for every chain returned by
    get_chain(inputargs), publishes the parsed data as module-level globals
    and builds Aho-Corasick keyword tries from the tags.

    inputargs is indexed with the keys 'tags', 'species' and 'tagfastadir'.
    inputargs['tags'] is overwritten with "original" when the extended tag
    set is requested for mouse or for gamma/delta chains.

    Globals bound here: chain, v_half_split, j_half_split and, for each gene
    type g in ('v', 'j'): g_genes, g_regions, g_seqs, half1_g_seqs,
    half2_g_seqs, g_builder, g_key, g_half1_builder, half1_g_key,
    g_half2_builder, half2_g_key, plus jump_to_end_v / jump_to_start_j.

    Returns chain_order, a list with one [chain, gene, n] entry per chain and
    gene type, where n is the length of the first element returned by the
    tag reader for that file.

    Calls sys.exit() when the tag set or the species is not recognised.
    """

    # Get chain information
    global chain
    chain = get_chain(inputargs)

    #################################################
    ############# GET GENES, BUILD TRIE #############
    #################################################

    print 'Importing TCR', ", ".join(map(chainnams.__getitem__, chain)), 'gene sequences...'

    # First check that valid tag/species combinations have been used
    if inputargs['tags'] == "extended" and inputargs['species'] == "mouse":
        print "Please note that there is currently no extended tag set for mouse TCR genes.\n \
        Decombinator will now switch the tag set in use from \'extended\' to \'original\'.\n \
        In future, consider editing the script to change the default, or use the appropriate flags (-sp mouse -tg original)."
        inputargs['tags'] = "original"

    if inputargs['tags'] == "extended" and ('g' in chain or 'd' in chain):
        print "Please note that there is currently no extended tag set for gamma/delta TCR genes.\n \
        Decombinator will now switch the tag set in use from \'extended\' to \'original\' for these chains.\n \
        In future, consider editing the script to change the default, or use the appropriate flags."
        # NOTE(review): the message says "for these chains", but the single
        # 'tags' value set here is used for every chain in the loops below.
        inputargs['tags'] = "original"

    # Set tag split position, and check tag set. Note that original tags use shorter length J half tags, as these tags were originally shorter.
    global v_half_split, j_half_split
    if inputargs['tags'] == "extended":
        v_half_split, j_half_split = [10, 10]
    elif inputargs['tags'] == "original":
        v_half_split, j_half_split = [10, 6]
    else:
        print "Tag set unrecognised; should be either \'extended\' or \'original\' for human, or just \'original\' for mouse. \n \
        Please check tag set and species flag."
        sys.exit()

    # Check species information
    if inputargs['species'] not in ["human", "mouse"]:
        print "Species not recognised. Please select either \'human\' (default) or \'mouse\'.\n \
        If mouse is required by default, consider changing the default value in the script."
        sys.exit()

    # Look for tag and V/J fasta and tag files: if these cannot be found in the working directory, source them from GitHub repositories
    # Note that fasta/tag files fit the pattern "species_tagset_gene.[fasta/tags]"
    # I.e. "[human/mouse]_[extended/original]_TR[A/B/G/D][V/J].[fasta/tags]"

    chain_order = []

    for gene in ['v', 'j']:
        # Get FASTA data: one list of parsed records per chain
        fasta_holder = []
        for i in range(len(chain)):
            fasta_file = read_tcr_file(inputargs['species'], inputargs['tags'], chain[i], gene, "fasta", inputargs['tagfastadir'])
            fasta_holder.append(list(SeqIO.parse(fasta_file, "fasta")))
            fasta_file.close()
        # Bare expression statement; it has no effect.
        chain

        # Publish the records of all chains as one flat list (<gene>_genes),
        # and their upper-cased sequences as <gene>_regions
        globals()[gene + "_genes"] = flatten(fasta_holder)

        globals()[gene + "_regions"] = []
        for g in range(0, len(globals()[gene + "_genes"])):
            globals()[gene + "_regions"].append(
                string.upper(globals()[gene + "_genes"][g].seq))

        # Get tag data
        gene_seq_holder = []  # initialise arrays
        half1_gene_seq_holder = []
        half2_gene_seq_holder = []
        jumpfunction_holder = []

        for i in range(len(chain)):
            tag_file = read_tcr_file(inputargs['species'], inputargs['tags'], chain[i], gene, "tags", inputargs['tagfastadir'])  # get tag data

            # Name of the global that will receive the jump values
            if gene == 'v':
                jumpfunction = "jump_to_end_v"
            elif gene == 'j':
                jumpfunction = "jump_to_start_j"

            # Dispatch to get_v_tags / get_j_tags, looked up by name; the
            # four elements of the result are collected per chain
            tag_info_holder = globals()["get_" + gene + "_tags"](
                tag_file, globals()[gene + "_half_split"])
            gene_seq_holder.append(tag_info_holder[0])
            half1_gene_seq_holder.append(tag_info_holder[1])
            half2_gene_seq_holder.append(tag_info_holder[2])
            jumpfunction_holder.append(tag_info_holder[3])
            chain_order.append([chain[i], gene, len(gene_seq_holder[i])])
            tag_file.close()

        # Flatten the per-chain lists into the module-level globals
        globals()[gene + "_seqs"] = flatten(gene_seq_holder)
        globals()["half1_" + gene + "_seqs"] = flatten(half1_gene_seq_holder)
        globals()["half2_" + gene + "_seqs"] = flatten(half2_gene_seq_holder)
        globals()[jumpfunction] = flatten(jumpfunction_holder)

        # Build Aho-Corasick tries
        globals()[gene + "_builder"] = AcoraBuilder()
        for i in range(0, len(globals()[gene + "_seqs"])):
            globals()[gene + "_builder"].add(str(
                globals()[gene + "_seqs"][i]))  # Add all tags of this gene type to keyword trie
        globals()[gene + "_key"] = globals()[gene + "_builder"].build()

        # And tries for split, half-tags
        globals()[gene + "_half1_builder"] = AcoraBuilder()
        for i in range(0, len(globals()["half1_" + gene + "_seqs"])):
            globals()[gene + "_half1_builder"].add(
                str(globals()["half1_" + gene + "_seqs"][i]))
        globals()["half1_" + gene + "_key"] = globals()[gene + "_half1_builder"].build()

        globals()[gene + "_half2_builder"] = AcoraBuilder()
        for i in range(0, len(globals()["half2_" + gene + "_seqs"])):
            globals()[gene + "_half2_builder"].add(
                str(globals()["half2_" + gene + "_seqs"][i]))
        globals()["half2_" + gene + "_key"] = globals()[gene + "_half2_builder"].build()

    return chain_order
zy = {'00': 1, '01': 1, '02': 1, '03': 1, '10': 1, '11': 1, '20': 1, '22': 1, '30': 1, '33': 1} zy = {i: np.log(zy[i]) for i in zy.keys()} from acora import AcoraBuilder views = pd.read_csv('View.csv', delimiter='\t', encoding='utf-8')['View'] views = AcoraBuilder(*views) views = views.build() def predict(i, data): y_pred = data.loc[i, 'predict'] s = data.loc[i, 'Content'][:maxlen] nodes = [dict(zip(['0', '1', '2', '3'], k)) for k in np.log(y_pred[:len(s)])] tags_pred_1 = viterbi(nodes) for j in views.finditer(s): for k in range(j[1], j[1] + len(j[0])): nodes[k]['1'] += 100 nodes[k]['2'] += 100 nodes[k]['3'] += 100 try:
# Collect the upper-cased sequence and the name of every V gene.  The name is
# the second "|"-separated field of the record id.
v_nams = []
for v in range(0, len(v_genes)):
    v_regions.append(str(v_genes[v].seq).upper())
    v_nams.append(v_genes[v].id.split("|")[1])

# Same for the J genes.  The name must come from the current J gene; reading
# it from v_genes[v] would repeat the last V gene's name for every J gene.
j_regions = []
j_nams = []
for j in range(0, len(j_genes)):
    j_regions.append(str(j_genes[j].seq).upper())
    j_nams.append(j_genes[j].id.split("|")[1])

## Build keyword tries of V and J tags for fast assignment
# The tag files are only needed while they are being parsed, so they are
# closed as soon as the tag readers return.
with open("tags_tr" + chain.lower() + "v.txt", "rU") as v_tag_file:
    v_seqs, half1_v_seqs, half2_v_seqs, jump_to_end_v = get_v_tags(v_tag_file, v_half_split)
with open("tags_tr" + chain.lower() + "j.txt", "rU") as j_tag_file:
    j_seqs, half1_j_seqs, half2_j_seqs, jump_to_start_j = get_j_tags(j_tag_file, j_half_split)

v_builder = AcoraBuilder()
for i in range(0, len(v_seqs)):
    v_builder.add(str(v_seqs[i]))  # Add all V tags to keyword trie

v_key = v_builder.build()

j_builder = AcoraBuilder()
for i in range(0, len(j_seqs)):
    j_builder.add(str(j_seqs[i]))  # Add all J tags to keyword trie

j_key = j_builder.build()

## Build keyword tries for first and second halves of both V and J tags
v_half1_builder = AcoraBuilder()
for i in range(0, len(half1_v_seqs)):
    v_half1_builder.add(str(half1_v_seqs[i]))
def __init__(self, text):
    """Store ``text`` and build the keyword finder as ``self.finder``.

    :param text: stored unchanged as ``self.text``
    """
    self.text = text

    keywords = [
        "ownership", "owner", "own",
        # "proprietary" is the intended word; the misspelled "propietary" is
        # kept so that anything keyed on the old keyword keeps working.
        "proprietary", "propietary",
        "tracking", "track",
        "store", "keep", "keeping",
    ]

    builder = AcoraBuilder()
    builder.add(*keywords)
    self.finder = builder.build()
def import_tcr_info(inputargs):
    """
    import_tcr_info: Gathers the required TCR chain information for Decombining

    Determines the TCR chain (from inputargs['chain'], else from a single
    chain name contained in inputargs['fastq']), validates the tag set and
    species, reads the V and J FASTA and tag files and builds Aho-Corasick
    keyword tries from the tags.

    inputargs is indexed with the keys 'fastq', 'chain', 'tags', 'species'
    and 'tagfastadir'.  inputargs['tags'] is overwritten with "original"
    when the extended tag set is requested for mouse or for gamma/delta.

    Globals bound here: chainnams, chain, counts, v_half_split, j_half_split
    and, for each gene type g in ('v', 'j'): g_genes, g_regions, g_seqs,
    half1_g_seqs, half2_g_seqs, g_builder, g_key, g_half1_builder,
    half1_g_key, g_half2_builder, half2_g_key, plus jump_to_end_v /
    jump_to_start_j.

    Calls sys.exit() when the chain, tag set or species is not recognised.
    """

    # Get chain information
    global chainnams, chain, counts
    global v_half_split, j_half_split
    counts = coll.Counter()
    chainnams = {"a": "alpha", "b": "beta", "g": "gamma", "d": "delta"}

    # Defined up front: both failure paths below report this message.
    nochain_error = "TCR chain not recognised. \n \
    Please either include (one) chain name in the file name (i.e. alpha/beta/gamma/delta),\n \
    or use the \'-c\' flag with an explicit chain option (a/b/g/d, case-insensitive)."

    # Detect whether chain specified in filename
    inner_filename_chains = [
        x for x in chainnams.values() if x in inputargs['fastq'].lower()
    ]
    if len(inner_filename_chains) == 1:
        counts['chain_detected'] = 1

    if inputargs['chain']:
        requested_chain = inputargs['chain'].upper()
        if requested_chain in ['A', 'ALPHA', 'TRA', 'TCRA']:
            chain = "a"
        elif requested_chain in ['B', 'BETA', 'TRB', 'TCRB']:
            chain = "b"
        elif requested_chain in ['G', 'GAMMA', 'TRG', 'TCRG']:
            chain = "g"
        elif requested_chain in ['D', 'DELTA', 'TRD', 'TCRD']:
            chain = "d"
        else:
            print(nochain_error)
            sys.exit()
    else:
        # If no chain provided, try and infer from filename
        if counts['chain_detected'] == 1:
            chain = inner_filename_chains[0][0]
        else:
            print(nochain_error)
            sys.exit()

    #################################################
    ############# GET GENES, BUILD TRIE #############
    #################################################

    print('Importing TCR', chainnams[chain], 'gene sequences...')

    # First check that valid tag/species combinations have been used
    if inputargs['tags'] == "extended" and inputargs['species'] == "mouse":
        print(
            "Please note that there is currently no extended tag set for mouse TCR genes.\n \
            Decombinator will now switch the tag set in use from \'extended\' to \'original\'.\n \
            In future, consider editing the script to change the default, or use the appropriate flags (-sp mouse -tg original)."
        )
        inputargs['tags'] = "original"

    if inputargs['tags'] == "extended" and (chain == 'g' or chain == 'd'):
        print(
            "Please note that there is currently no extended tag set for gamma/delta TCR genes.\n \
            Decombinator will now switch the tag set in use from \'extended\' to \'original\'.\n \
            In future, consider editing the script to change the default, or use the appropriate flags."
        )
        inputargs['tags'] = "original"

    # Set tag split position, and check tag set. Note that original tags use shorter length J half tags, as these tags were originally shorter.
    if inputargs['tags'] == "extended":
        v_half_split, j_half_split = [10, 10]
    elif inputargs['tags'] == "original":
        v_half_split, j_half_split = [10, 6]
    else:
        print(
            "Tag set unrecognised; should be either \'extended\' or \'original\' for human, or just \'original\' for mouse. \n \
            Please check tag set and species flag.")
        sys.exit()

    # Check species information
    if inputargs['species'] not in ["human", "mouse"]:
        print(
            "Species not recognised. Please select either \'human\' (default) or \'mouse\'.\n \
            If mouse is required by default, consider changing the default value in the script."
        )
        sys.exit()

    # Look for tag and V/J fasta and tag files: if these cannot be found in the working directory, source them from GitHub repositories
    # Note that fasta/tag files fit the pattern "species_tagset_gene.[fasta/tags]"
    # I.e. "[human/mouse]_[extended/original]_TR[A/B/G/D][V/J].[fasta/tags]"

    for gene in ['v', 'j']:
        # Get FASTA data
        fasta_file = read_tcr_file(inputargs['species'], inputargs['tags'],
                                   gene, "fasta", inputargs['tagfastadir'])
        globals()[gene + "_genes"] = list(SeqIO.parse(fasta_file, "fasta"))

        globals()[gene + "_regions"] = []
        for g in range(0, len(globals()[gene + "_genes"])):
            globals()[gene + "_regions"].append(
                globals()[gene + "_genes"][g].seq.upper())

        # Get tag data
        tag_file = read_tcr_file(inputargs['species'], inputargs['tags'],
                                 gene, "tags", inputargs['tagfastadir'])

        # Name of the global that receives the fourth tag-reader result
        if gene == 'v':
            jumpfunction = "jump_to_end_v"
        elif gene == 'j':
            jumpfunction = "jump_to_start_j"

        # Dispatch to get_v_tags / get_j_tags, looked up by name.  The
        # context manager closes the tag file even if the reader raises.
        with open(tag_file, "r") as tag_data:
            globals()[gene + "_seqs"], globals()["half1_" + gene + "_seqs"], globals()["half2_" + gene + "_seqs"], globals()[jumpfunction] = \
                globals()["get_" + gene + "_tags"](tag_data, globals()[gene + "_half_split"])

        # Build Aho-Corasick tries
        globals()[gene + "_builder"] = AcoraBuilder()
        for i in range(0, len(globals()[gene + "_seqs"])):
            globals()[gene + "_builder"].add(str(
                globals()[gene + "_seqs"][i]))  # Add all tags of this gene type to keyword trie
        globals()[gene + "_key"] = globals()[gene + "_builder"].build()

        # And tries for split, half-tags
        globals()[gene + "_half1_builder"] = AcoraBuilder()
        for i in range(0, len(globals()["half1_" + gene + "_seqs"])):
            globals()[gene + "_half1_builder"].add(
                str(globals()["half1_" + gene + "_seqs"][i]))
        globals()["half1_" + gene + "_key"] = globals()[gene + "_half1_builder"].build()

        globals()[gene + "_half2_builder"] = AcoraBuilder()
        for i in range(0, len(globals()["half2_" + gene + "_seqs"])):
            globals()[gene + "_half2_builder"].add(
                str(globals()["half2_" + gene + "_seqs"][i]))
        globals()["half2_" + gene + "_key"] = globals()[gene + "_half2_builder"].build()
import json import linecache import os import re import jieba import numpy as np from acora import AcoraBuilder from emotion_cla.emo_cls import classify from emotion_cla.separate import separate in_dir = 'data/tweet' out_dir = 'data/tweet_emo' builder = AcoraBuilder([line.strip() for line in open('data/emoji.txt')]) ac = builder.build() def load_labelled(): lines = set() for i in range(5): for line in open('data/content_3000/{}.txt'.format(i)): lines.add(line.strip()) return lines # have_lines = load_labelled() def random_ids(in_name, out_name, lens): '''
for key, values in output_dict.items(): # remove last ", " output_dict[key] = values[:-2] return output_dict if __name__ == "__main__": args = parsing_argument() if not args.source: raise Exception("Please input the source file") with open(args.source, 'r') as file: keywords = file.read().splitlines() # Reading the source file ac = AcoraBuilder(keywords) ac = ac.build() # build the model for searching the keywords # Reading the target files if args.target_files: with open(args.target_files, 'r') as file: target_files = file.read().splitlines() target_file = [ target_file for target_file in target_files if ".pdf" in target_file or ".html" in target_file ] else: target_files = [ os.path.join(paths, file) for paths, _, files in os.walk(args.target_folder) for file in files if '.pdf' in file or '.html' in file
mouse_proteome_file = [ x for x in os.listdir(fxn.base_data_dir) if '_mouse.fasta' in x ][0] mouse_proteins = coll.defaultdict() with gzip.open(fxn.base_data_dir + mouse_proteome_file, 'rU') as in_file: for protein, seq, blank in fxn.read_fa(in_file): mouse_proteins[protein.split(' ')[0]] = seq # Then scroll through non-predicted binder files, build an AC trie of all the peptides per file data_dir = '../Data/NonPredictedBinders/' matches = coll.defaultdict(fxn.nest_counter) all_peptides = coll.defaultdict(list) for f in [x for x in os.listdir(data_dir) if x.endswith('.txt')]: nam = f.split('-')[0] search_builder = AcoraBuilder() peptides = [] # Build trie with open(data_dir + f, 'rU') as in_file: for line in in_file: search_builder.add(line.rstrip()) peptides.append(line.rstrip()) all_peptides[f.split('-')[0]].append(line.rstrip()) seq_search = search_builder.build() # Use to search all proteins in proteome for protein in mouse_proteins: seq_check = seq_search.findall(mouse_proteins[protein]) if seq_check: for s in seq_check: