def __init__(self, partial_match=False, ignorecase=True, stopwords=None):
    '''Build an Aho-Corasick automaton from locally cached vocabulary files.

    :param partial_match: allow matching a non-complete word; single tokens of
        multi-word entries are indexed separately under '<category>-TOKEN'
    :param ignorecase: if True, entries are lowercased before indexing
    :param stopwords: stopwords to skip, defaults to a very broad list
    '''
    self.A = ahocorasick.Automaton()
    self.partial_match = partial_match
    self.ignorecase = ignorecase
    if stopwords is None:
        stopwords = DOMAIN_STOP_WORDS
    # idx is a running tag id shared across all dictionaries
    idx = 0
    '''get the dictionaries from remote files'''
    vocabpath = os.path.dirname(os.path.abspath(__file__)) + "/vocabulary/"
    for dictionary_url in vocabulary_urls:
        filename = dictionary_url.split('/')[-1]
        # filename encodes '<category>-<reference_db>' before the first '_'
        # (and before the extension), e.g. 'GENE-HGNC_something.json'
        category, reference_db = filename.split('.')[0].split(
            '_')[0].split('-')
        with open(vocabpath + filename) as f:
            dictionary = json.load(f)
        '''load the elements in the Automation if they are not too short or are stopwords'''
        for element, element_data in list(dictionary.items()):
            ids = element_data['ids']
            pref_name = element_data['pref_name']
            # skip very short entries (<= 2 chars) entirely
            if len(element) > 2:
                element_str = element
                # short entries (< 5 chars) are stopword-checked
                # case-sensitively; longer ones case-insensitively
                if ((len(element_str) < 5) and
                        (element_str not in stopwords) or
                        (len(element_str) >= 5) and
                        (element_str.lower() not in stopwords)):
                    idx += 1
                    if self.ignorecase:
                        element_match = element_str.lower()
                    else:
                        element_match = element_str
                    try:
                        self.add_tag(element_match, idx, category,
                                     reference_db, [i for i in ids],
                                     element, element_match, pref_name)
                    except TypeError as e:
                        # surface the offending key before re-raising
                        print(element_match)
                        print(type(element_match))
                        raise e
                    '''handle elements with dashes by also creating a copy without'''
                    if '-' in element_match:
                        element_match_without_dash = element_match.replace(
                            '-', '')
                        if len(element_match_without_dash) > 2:
                            self.add_tag(element_match_without_dash, idx,
                                         category, reference_db,
                                         [i for i in ids], element,
                                         element_match_without_dash,
                                         pref_name)
                    '''if supporting partial match'''
                    if self.partial_match:
                        # index each sufficiently long single token of a
                        # multi-word entry under a dedicated '-TOKEN' category
                        for longest_token in element.split():
                            if longest_token != element and \
                                    len(longest_token) > 5 and \
                                    longest_token.lower() not in stopwords:
                                self.add_tag(longest_token, idx,
                                             category + '-TOKEN',
                                             reference_db,
                                             [i for i in ids], element,
                                             longest_token, pref_name)
    self.A.make_automaton()
def setUp(self):
    """Prepare test fixtures: the sample phone-model words and an empty automaton."""
    self.words = ['GT-C3303', 'SAMSUNG-GT-C3303K/']
    self.A = ahocorasick.Automaton()
def test_save_ints(self):
    """save() on a STORE_INTS automaton must reject a superfluous serializer argument."""
    automaton = ahocorasick.Automaton(ahocorasick.STORE_INTS)
    expected_message = "expected exactly one argument"
    with self.assertRaisesRegex(ValueError, expected_message):
        automaton.save(self.path, None)
def updatePKL(path, allelesDB):
    """Rebuild the allele Aho-Corasick automaton and cache it to a pickle.

    :param path: directory into which 'pyngSTar_alleles_AC.pkl' is written
    :param allelesDB: iterable of allele key strings to index; each key is
        stored with the payload ``(index, key)``
    """
    allelesAC = ahocorasick.Automaton()
    for idx, key in enumerate(allelesDB):
        allelesAC.add_word(key, (idx, key))
    allelesAC.make_automaton()
    # Context manager guarantees the handle is closed even if dump fails
    # (the original passed an open() result directly and leaked it).
    with open(path + '/' + 'pyngSTar_alleles_AC.pkl', 'wb') as fh:
        pickle.dump((allelesDB, allelesAC), fh)
def setUp(self):
    """Prepare a STORE_LENGTH automaton and sample words (including NUL bytes)."""
    self.words = "word python aho corasick \x00\x00\x00".split()
    self.A = ahocorasick.Automaton(ahocorasick.STORE_LENGTH)
def build_aho_matcher(iterable, label):
    """Return a ready-to-search Aho-Corasick automaton.

    Every word in *iterable* is indexed with the payload ``(word, label)``.
    """
    automaton = aho.Automaton()
    for _, term in enumerate(iterable):
        automaton.add_word(term, (term, label))
    automaton.make_automaton()
    return automaton
def build_actree(self, wordlist):
    """Build and return an Aho-Corasick automaton over *wordlist*.

    Each word is stored with the payload ``(index, word)``.
    """
    automaton = ahocorasick.Automaton()
    for position, token in enumerate(wordlist):
        automaton.add_word(token, (position, token))
    automaton.make_automaton()
    return automaton
def test_constructor_wrong_store(self):
    """An out-of-range store constant must raise ValueError."""
    expected_message = "store value must be one of.*"
    with self.assertRaisesRegex(ValueError, expected_message):
        ahocorasick.Automaton(-42)
def test_constructor_wrong_key_type(self):
    """An out-of-range key_type constant must raise ValueError."""
    expected_message = "key_type must have value.*"
    with self.assertRaisesRegex(ValueError, expected_message):
        ahocorasick.Automaton(ahocorasick.STORE_ANY, -42)
@app.after_request
def after_request(response):
    """Log the elapsed wall-clock time for the finished request.

    Relies on ``g.request_start_time`` being set by a before_request hook;
    requests without it are passed through untouched.

    :param response: the outgoing Flask response (returned unchanged)
    """
    if not hasattr(g, 'request_start_time'):
        return response
    elapsed = time.time() - g.request_start_time
    req_info = str(
        g.request_start_time) + ": " + request.method + "_" + request.url
    app.logger.debug(req_info + ":" + ' time_used:' + str(elapsed))
    return response


# metrics only end

# update sensitive
sensitive_words = ahocorasick.Automaton()


def load_sensitive_word():
    """(Re)load the sensitive-word list into the module-level automaton.

    Each non-empty line of the word file becomes a keyword whose payload is
    the word itself.

    :return: True on success, False if the word file could not be read.
    """
    try:
        # Context manager closes the handle; the original called
        # open(...).readlines() and leaked the file object.
        with open(BaseConfig.WORD_PATH, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                sensitive_words.add_word(line, line)
        sensitive_words.make_automaton()
        return True
    except Exception as e:
        app.logger.error("load_sensitive_word exception:{}".format(e))
        return False
def get_automaton() -> Automaton:
    """Create and return an empty Aho-Corasick automaton.

    The import is deferred so the module remains importable when the
    optional ``ahocorasick`` dependency is not installed.
    """
    import ahocorasick
    automaton = ahocorasick.Automaton()
    return automaton
def c_values(self, terms, trace=False):
    """Score candidate terms with the C-value termhood measure.

    A term's score is its frequency, discounted by the average frequency of
    the longer terms that contain it, scaled by log(term length).

    :param terms: iterable of candidate term strings (duplicates count
        toward a term's frequency)
    :param trace: if True, print intermediate progress information
    :return: list of (candidate_term, c_value) tuples, highest score first
    """
    terms_df = pd.DataFrame(terms, columns=['term'])
    terms_df['w'] = 1
    # NOTE(review): len(terms_df['term']) is the row count, not the length of
    # each string — every row gets the same 'len' here (see sample below);
    # the per-term length is recomputed on term_stats further down.
    terms_df['len'] = len(terms_df['term'])
    """
    terms_df is
                             term  w    len
    0         feature hierarchies  1  20552
    1    rich feature hierarchies  1  20552
    2   accurate object detection  1  20552
    3       semantic segmentation  1  20552
    4              ross girshick1  1  20552
    w is always "1"
    len is number of candidate terms
    """
    term_stats = terms_df.groupby(['term'])['w'].agg([np.sum])
    term_stats['len'] = list(
        pd.Series(term_stats.index).apply(lambda x: len(x)))
    """
    term_stats is
                                                        sum  len
    term
    1 i1 r                                                1    6
    1000class imagenet benchmark                          1   28
    1000class imagenet large scale visual recogniti...    1   59
    1000class imagenet object recognition challenge       1   47
    1000class imagenet task                               1   23
    "term" is candidate term, primary key, values of "term" column are unique
    "sum" is term frequency
    "len" is length of candidate term
    """
    # term_series is list of candidate terms
    term_series = list(term_stats.index)
    # n_terms is number of candidate terms
    n_terms = len(term_series)
    # pad with spaces on both sides so substring hits always fall on word
    # boundaries — all spaces to simplify calculation
    for i in range(0, n_terms):
        term_series[i] = ' ' + str(term_series[i]) + ' '
    # replace index with the padded form
    term_stats['trm'] = term_series
    term_stats.set_index('trm', inplace=True)
    # create finite state automata over all padded terms
    A = ahocorasick.Automaton()
    for i in range(0, n_terms):
        A.add_word(term_series[i], (i, term_series[i]))
    A.make_automaton()
    # find every (shorter term, longer term containing it) pair
    is_part_of = []
    for i in range(0, n_terms):
        haystack = term_series[i]
        for end_index, (insert_order, original_value) in A.iter(haystack):
            if original_value != haystack:
                # print original_value, "insideof ", haystack
                is_part_of.append((original_value, haystack, 1))
    subterms = pd.DataFrame(
        is_part_of,
        columns=['term', 'part_of', 'w']).set_index(['term', 'part_of'])
    """
    subterms is
                                                                                 w
    term                           part_of
    imagenet benchmark             1000class imagenet benchmark                  1
    imagenet large                 1000class imagenet large scale visual re...  1
    large scale                    1000class imagenet large scale visual re...  1
    visual recognition             1000class imagenet large scale visual re...  1
    imagenet large scale visua...  1000class imagenet large scale visual re...  1
    ...                                                                        ..
    reconstruction loss            ℓ1 reconstruction loss                       1
    error                          ℓ2 error                                     1
    reconstruction                 ℓ2 pixelwise reconstruction loss             1
    pixelwise reconstruction loss  ℓ2 pixelwise reconstruction loss             1
    reconstruction loss            ℓ2 pixelwise reconstruction loss             1
    "w" is term frequency
    """
    # print("-------------")
    # print(subterms)
    if trace:
        print("terms/subterms relations discovered ...")
    c_values = []
    # term_series=['time']
    for t in term_series:
        if t in term_stats.index:
            current_term = term_stats.loc[t]
            """
            print("-------------")
            print(('t', t, 'current_term', current_term))
            ('t', ' belief network ', 'current_term', sum     1
            len    14
            Name:  belief network , dtype: int64)
            t is string
            current_term = {sum:1, len:14}
            """
            # calculate average frequency of the superterms
            c_value = 0
            if t in subterms.index:
                subterm_of = list(subterms.loc[t].index)
                """
                print(('subterm_of', subterm_of))
                ('subterm_of', [' deep belief network ',
                 ' directed sigmoid belief network ',
                 ' sigmoid belief network '])
                """
                for st in subterm_of:
                    # term_stats.loc[st]['sum'] is frequency of superterm
                    c_value -= term_stats.loc[st]['sum']
                c_value /= float(len(subterm_of))
            # add current term frequency
            c_value += current_term['sum']
            # multiply to log(term length)
            c_value = c_value * np.log(
                current_term['len']) if current_term['len'] > 0 else 0
            if trace:
                print(t, 'freq=', current_term['sum'], ' cvalue=', c_value)
            c_values.append(c_value)
            # break
    """ returns sorted list of tuples (candidate_term, Cvalue) """
    return sorted(zip([x.strip() for x in term_series], c_values),
                  key=lambda x: x[1], reverse=True)
def acPass(wordBlockSize, blockBoundaries):
    """One alignment pass: slide fixed-size word blocks of ``sttsp`` over
    ``joinedLansp`` via Aho-Corasick, expand each hit outward to matching word
    boundaries, and return the de-duplicated matching segments.

    NOTE(review): relies on enclosing-scope state (sttsp, lansp, sttspinds,
    lanspinds, joinedLansp, stt, lan) — confirm these are in scope at call
    time; this block cannot be used standalone.

    :param wordBlockSize: number of words per query block
    :param blockBoundaries: stride (in words) between successive query blocks
    :return: list of dicts with 'stt'/'lan' range+text info and 'matchText'
    """
    # one query per stride position; key i is the block index
    queries = {
        i: " ".join(sttsp[i * blockBoundaries:i * blockBoundaries +
                          wordBlockSize])
        for i in range(0, len(sttsp) // blockBoundaries)
    }  # math is hard but this gets an extra query at the end. bonus. nice.
    # remove duplicate queries (keep the first occurrance)
    seenQueries = set()
    uniqQueries = {}
    for i, q in queries.items():
        if q in seenQueries:
            continue
        seenQueries.add(q)
        uniqQueries[i] = q
    print("block len:", wordBlockSize, ", block interval:", blockBoundaries,
          "=> num queries:", len(queries), end="\n")
    print("removed duplicate queries:",
          str(len(queries) - len(uniqQueries)) + "/" + str(len(queries)))
    ac = ahocorasick.Automaton()
    for i, q in uniqQueries.items():
        # gaps in index are fine and are needed to restore the original
        # string positioning in splitInds (we removed some of the indexes)
        ac.add_word(q, (i, q))
    ac.make_automaton()

    # print( joinedLansp[ 203 : 251+1 ] )
    # print( queries[ 11 ] )
    # print( sttsp[ 11*blockBoundaries : 11*blockBoundaries + wordBlockSize ] )
    # lit = ( lansp[ len( joinedLansp[:203].split() ) : len( joinedLansp[:251].split() ) ] )
    # af = ( lanspinds[ len( joinedLansp[:203].split() ) : len( joinedLansp[:251].split() ) ] )
    # print( "in lan:", lan[af[0][0]:af[-1][1]+1] )
    # print( "in stt:", stt[sttspinds[11*blockBoundaries][0] : sttspinds[11*blockBoundaries+wordBlockSize-1][1]+1] )

    # take an aho corasick match, expand it, and return char ranges, split
    # string ranges, and original texts for both lan and stt, in addition to
    # the matchText
    def splitInds(joinedLanspStart, joinedLanspEnd, queryIdx):
        # print( "\nSPLIT INDS", joinedLanspStart, joinedLanspEnd, queryIdx, "`" + joinedLansp[joinedLanspStart:joinedLanspEnd] + "`", queries[ queryIdx ] )
        # joinedLansp[ joinedLanspStart : joinedLanspEnd + 1 ] == queries[ queryIdx ] is the matching text
        sttspindRange = (queryIdx * blockBoundaries,
                         min(queryIdx * blockBoundaries + wordBlockSize,
                             len(sttspinds)))
        # gives you original stt char indexes for each split word
        lanspindRange = (len(joinedLansp[:joinedLanspStart - 1].split(" ")),
                         min(len(joinedLansp[:joinedLanspEnd].split(" ")),
                             len(lanspinds)))
        # if joinedLanspStart:joinedLanspEnd aren't on word boundaries, the
        # queryIdx is matching in joinedLansp in the middle of a word, so the
        # match has to contract until they're on word boundaries because
        # reasons; bug where it matched [ khi, ... ] in the middle of cókhi
        # in lan
        if sttsp[sttspindRange[0]] != lansp[lanspindRange[0]]:
            sttspindRange = (sttspindRange[0] + 1, sttspindRange[1])
        # this seemed to fix it for every case that i have hit, should keep
        # the following check just in case; since joinedLanspStart and
        # joinedLanspEnd aren't used and since the expansion process happens
        # anyway, it's fine to modify in this way
        if " ".join(sttsp[slice(*sttspindRange)]) != " ".join(
                lansp[slice(*lanspindRange)]):
            print("!!! FAILED FOR SOME REASON !!!", "stt:", sttspindRange,
                  "lan:", lanspindRange)
            print("sttspindRange joined:",
                  " ".join(sttsp[slice(*sttspindRange)]))
            print("lanspindRange joined:",
                  " ".join(lansp[slice(*lanspindRange)]))
            print(
                "stt",
                list(enumerate(sttsp))[sttspindRange[0] -
                                       10:sttspindRange[1] + 10],
                "\nlan",
                list(enumerate(lansp))[lanspindRange[0] -
                                       10:lanspindRange[1] + 10])
        # print( 0, " ".join( sttsp[ sttspindRange[0] : sttspindRange[1] ] ), " ".join( lansp[ lanspindRange[0] : lanspindRange[1] ] ), sep="\n" ); # print the match

        # expand spindRanges: grow left while the preceding words agree
        while sttspindRange[0] - 1 >= 0 and lanspindRange[0] - 1 >= 0 and sttsp[
                sttspindRange[0] - 1] == lansp[lanspindRange[0] - 1]:
            sttspindRange = (sttspindRange[0] - 1, sttspindRange[1])
            lanspindRange = (lanspindRange[0] - 1, lanspindRange[1])
        # print( 1, str(sttspindRange) + " : " + " ".join( sttsp[ sttspindRange[0] : sttspindRange[1] ] ), str(lanspindRange) + " : " + " ".join( lansp[ lanspindRange[0] : lanspindRange[1] ] ), sep="\n" ); # print the match
        # grow right while the following words agree
        while sttspindRange[1] < len(sttsp) and lanspindRange[1] < len(
                lansp) and sttsp[sttspindRange[1]] == lansp[
                    lanspindRange[1]]:
            # right range index is noninclusive
            sttspindRange = (sttspindRange[0], sttspindRange[1] + 1)
            lanspindRange = (lanspindRange[0], lanspindRange[1] + 1)
        # print( 2, str(sttspindRange) + " : " + " ".join( sttsp[ sttspindRange[0] : sttspindRange[1] ] ), str(lanspindRange) + " : " + " ".join( lansp[ lanspindRange[0] : lanspindRange[1] ] ), sep="\n" ); # print the match
        # print()

        # map the word ranges back to character ranges in the original texts
        sttOrigCharRange = (sttspinds[sttspindRange[0]][0],
                            sttspinds[sttspindRange[1] - 1][1] + 1)
        lanOrigCharRange = (lanspinds[lanspindRange[0]][0],
                            lanspinds[lanspindRange[1] - 1][1] + 1)
        return { "stt": { "spindRange": sttspindRange, "origCharRange": sttOrigCharRange, "text": stt[ sttOrigCharRange[0] : sttOrigCharRange[1] ] }, \
                 "lan": { "spindRange": lanspindRange, "origCharRange": lanOrigCharRange, "text": lan[ lanOrigCharRange[0] : lanOrigCharRange[1] ] }, \
                 "matchText": queries[ queryIdx ] }

    # pprint( splitInds( 219, 267, 12 ) )
    # for joinedLanspEnd, (queryIdx, query) in A.iter( joinedLansp ):
    #     joinedLanspStart = joinedLanspEnd - len(query) + 1
    #     print(joinedLanspStart, "-", joinedLanspEnd, ",", queryIdx, ",", query, "//", str( joinedLanspStart * 100 / len( lan ) ) + "%")
    #     print( splitInds( joinedLanspStart, joinedLanspEnd, queryIdx ) )

    # this needs to check if any return matched multiple
    acRet = [(joinedLanspEnd, (queryIdx, query))
             for joinedLanspEnd, (queryIdx, query) in ac.iter(joinedLansp)]
    # print(acRet)
    acRetQIdxCounts = Counter(a[1][0] for a in acRet)
    print("removed multimatches:",
          [(q, cnt, queries[q])
           for q, cnt in acRetQIdxCounts.items() if cnt != 1])
    # strip all matches which don't occur strictly once; fully remove all
    # duplicates
    acRet = [a for a in acRet if acRetQIdxCounts[a[1][0]] == 1]
    matchingSegments = [
        splitInds(joinedLanspEnd - len(query) + 1, joinedLanspEnd + 1,
                  queryIdx)
        for joinedLanspEnd, (queryIdx, query) in acRet
    ]
    print("num matches:",
          str(len(matchingSegments)) + "/" + str(len(queries)))

    # merge overlapping / duplicated matchings (since they've each been
    # expanded in splitInds)
    duplicateStore = set()
    matchToRangeSet = lambda m: (*m["lan"]["origCharRange"], *m["lan"][
        "spindRange"], *m["stt"]["origCharRange"], *m["stt"]["spindRange"])
    matchingSegments = [
        m for m in matchingSegments
        if matchToRangeSet(m) not in duplicateStore
        and not duplicateStore.add(matchToRangeSet(m))
    ]  # thanks https://stackoverflow.com/a/4463433
    # pprint( matchingSegments[:3] )
    return matchingSegments
def __init__(self):
    """Set up keyword-triggered extractors for Chinese enforcement rulings.

    Builds an Aho-Corasick automaton whose payloads describe how to extract a
    field (claimant, respondent, judge, claim amount) via a regex anchored at
    the matched trigger word, then pre-loads cached CSV lookup tables.
    """
    # print(text)
    # self.text=text
    # self.textProcess=TextProcess()
    self.aho_policical_person = ahocorasick.Automaton()
    self.regulars = []
    # there should be three or four extraction approaches
    # this one is trigger word + regex / dependency parse
    self.aho_policical_person.add_word(
        '申请执行人',  # trigger: "claimant (applicant for enforcement)"
        [
            {
                'word': '申请执行人',
                'key': '申请执行人',
                'field': 6,
                'ekey': 'claimant',
                'rel': '申请执行人[)]?[:|:]?(.*?)[,|,][男|女|住]',
                'extrctor': 'regular'
            },
            # {'key': '申请执行人', 'field': 6, 'ekey': 'claimant', 'rel': '定中关系',
            # 'extrctor': 'pd'},
        ])
    self.aho_policical_person.add_word(
        '被执行人',  # trigger: "respondent (person subject to enforcement)"
        [
            {
                'word': '被执行人',
                'key': '被执行人',
                'field': 7,
                'ekey': 'respondent',
                'rel': '被执行人[)]?[:|:]?(.*?)[,|,][男|女|住]',
                'extrctor': 'regular'
            },
            # {'key': '被执行人', 'field': 7, 'ekey': 'respondent', 'rel': '定中关系',
            # 'extrctor': 'pd'}
        ])
    # triggers for the judge field ("judge", "presiding judge", "enforcement officer")
    self.aho_policical_person.add_word('审判员', [{
        'word': '审判员',
        'key': '审判员',
        'field': 12,
        'ekey': 'judger',
        'rel': '审判员(.*)',
        'extrctor': 'regular'
    }])
    self.aho_policical_person.add_word('审判长', [{
        'word': '审判长',
        'key': '审判员',
        'field': 12,
        'ekey': 'judger',
        'rel': '审判长(.*)',
        'extrctor': 'regular'
    }])
    self.aho_policical_person.add_word('执行员', [{
        'word': '执行员',
        'key': '审判员',
        'field': 12,
        'ekey': 'judger',
        'rel': '审判长(.*)',
        'extrctor': 'regular'
    }])
    # monetary amount: Chinese numerals or digits, optionally with 万 (10k),
    # followed by 元 (yuan)
    m = r'((?:[一二三四五六七八九十千万亿兆幺零百壹贰叁肆伍陆柒捌玖拾佰仟]+(?:点[一二三四五六七八九幺零]+){0,1})|\d+(?:[\.,,]\d+)*万?)元'
    # ' ((?:[一二三四五六七八九十千万亿兆幺零百]+(?:点[一二三四五六七八九幺零]+){0,1})|\d+(?:[\.,,]\d+)+万?)元'
    # amount preceded by a payment verb: pay / hand over / settle / repay /
    # "executed in place" (excluding the negated "not executed in place")
    m1 = r'(?:支付|给付|清偿|偿还|^(?:(?!未).)*执行到位).*?' + m
    # shared payload for all payment-verb triggers of the claim-amount field
    shean = [{
        'key': '标的',
        'field': 8,
        'ekey': 'judger',
        'word': '标的',
        'rel': m1,
        'extrctor': 'regular'
    }]
    self.aho_policical_person.add_word('支付', shean)
    self.aho_policical_person.add_word('给付', shean)
    self.aho_policical_person.add_word('清偿', shean)
    self.aho_policical_person.add_word('偿还', shean)
    self.aho_policical_person.add_word('退赔', shean)
    self.aho_policical_person.add_word('执行到位', [{
        'key': '标的',
        'field': 8,
        'ekey': 'judger',
        'word': '执行到位',
        'rel': '执行到位.*?' + m,
        'extrctor': 'regular'
    }])
    self.aho_policical_person.add_word('执行标的', [{
        'key': '标的',
        'field': 8,
        'ekey': 'judger',
        'word': '执行标的',
        'rel': '执行标的.*?' + m,
        'extrctor': 'regular'
    }])
    self.aho_policical_person.add_word('执行总标的', [{
        'key': '标的',
        'field': 8,
        'ekey': 'judger',
        'word': '执行总标的',
        'rel': '执行总标的.*?' + m,
        'extrctor': 'regular'
    }])
    # self.aho_policical_person.add_word('')
    # data structure (extracted field indexes):
    # 0 trial court, required, ok
    # 1 case type, required, ok
    # 2 cause of action, required, ok
    # 3 document type, required, ok
    # 4 case number, required, ok
    # 5 judgment date, required, ok
    # 6 claimant, required, ok
    # 7 respondents [], required, ok
    # 8 claim amount, optional
    # 9 executed amount, optional
    # 10 whether fully executed, required
    # 11 whether this enforcement round was terminated, required, ok
    # 12 judge, required
    # 13 related judgment documents, required
    # 14 sub-type, optional ----> add / revoke addition, withdraw
    #    enforcement, terminate review procedure, reject, transfer,
    #    suspend, change applicant
    self.aho_policical_person.make_automaton()
    # pre-load cached data
    try:
        self.casepd = pandas.read_csv('./resources/case_detail_list.csv',
                                      index_col=0)
    except:
        self.casepd = pandas.DataFrame()
    # self.meta_person=pandas.DataFrame(columns=['name','detail'])
    try:
        self.meta_company = pandas.read_csv('./resources/company_list.csv',
                                            index_col=0)
        self.meta_company = self.meta_company.drop_duplicates(subset=None)
    except:
        self.meta_company = pandas.DataFrame(columns=['name', 'detail'])
    self.moneysent = []
def __init__(self):
    """Start with an empty Aho-Corasick automaton."""
    automaton = ahocorasick.Automaton()
    self.ac = automaton
def __init__(self, rules=None, _ranked_tokens=global_tokens_by_ranks): """ Initialize the index with an iterable of Rule objects. """ # total number of unique known tokens self.len_tokens = 0 # largest token ID for a "junk" token. A token with a smaller id than # len_junk is considered a "junk" very common token self.len_junk = 0 # corresponding number of non-junk tokens: len_tokens = len_junk + len_good self.len_good = 0 # mapping of token string > token id self.dictionary = {} # mapping of token id -> token string as a list where the index is the # token id and the value the actual token string self.tokens_by_tid = [] # Note: all mappings of rid-> data are lists of data where the index # is the rule id. # rule objects proper self.rules_by_rid = [] # token_id sequences self.tids_by_rid = [] # mapping of rule id->(mapping of (token_id->[positions, ...]) # We track only high/good tokens there. This is a "traditional" # positional inverted index self.high_postings_by_rid = [] # mapping of rule_id -> tuple of low and high tokens ids sets/multisets # (low_tids_set, high_tids_set) self.tids_sets_by_rid = [] # (low_tids_mset, high_tids_mset) self.tids_msets_by_rid = [] # mapping of hash -> single rid : duplicated rules are not allowed self.rid_by_hash = {} # Aho-Corasick automatons for negative and small rules self.rules_automaton = ahocorasick.Automaton(ahocorasick.STORE_ANY) self.negative_automaton = ahocorasick.Automaton(ahocorasick.STORE_ANY) # disjunctive sets of rule ids: regular, negative, small, false positive self.regular_rids = set() self.negative_rids = set() self.small_rids = set() self.false_positive_rids = set() # length of the largest false_positive rule self.largest_false_positive_length = 0 # mapping of hash -> rid for false positive rule tokens hashes self.false_positive_rid_by_hash = {} # if True the index has been optimized and becomes read only: # no new rules can be added self.optimized = False if rules: if TRACE_INDEXING_PERF: start = time() 
print('LicenseIndex: building index.') # index all and optimize self._add_rules(rules, _ranked_tokens) if TRACE_INDEXING_PERF: duration = time() - start len_rules = len(self.rules_by_rid) print( 'LicenseIndex: built index with %(len_rules)d rules in %(duration)f seconds.' % locals()) self._print_index_stats()
def setUp(self):
    """Prepare an empty automaton plus present and absent word fixtures."""
    self.words = "word python aho corasick \x00\x00\x00".split()
    self.inexisting = "test foo bar dword".split()
    self.A = ahocorasick.Automaton()
def _get_entity_text(train, flag='train'):
    """Collect, per gene/variation entity, the sentences mentioning it, and
    write the results to 'data/intermediate/<flag>_{gene,variation}_text'.

    The gene and variation pipelines were copy-paste duplicates; they now
    share the helpers below (same output, same log lines).

    :param train: DataFrame with 'Gene', 'Variation' and 'Text' columns
    :param flag: dataset tag used in log banners and output file names
    """
    if flag == 'train':
        print('*' * 20 + 'neurips 3321 + stage1_test 368' + '*' * 20)
    else:
        print('*' * 20 + 'stage 2 986' + '*' * 20)
    gene_list = set(train['Gene'].tolist())
    var_list = set(train['Variation'].tolist())
    print('unique gene in ' + flag + ':', len(gene_list),
          ' unique variation in ' + flag + ':', len(var_list))
    sentences = set(
        list(chain.from_iterable(
            train['Text'].map(lambda x: sent_tokenize(x)))))
    print('number of unique sentences in all samples:', len(sentences))

    gene_dict = _entity_occurrence_text(gene_list, sentences, 'gene')
    _write_entity_text(gene_dict, 'Gene', 'gene',
                       'data/intermediate/' + flag + '_gene_text')

    var_dict = _entity_occurrence_text(var_list, sentences, 'variation')
    _write_entity_text(var_dict, 'Variation', 'variation',
                       'data/intermediate/' + flag + '_variation_text')


def _entity_occurrence_text(entities, sentences, name):
    """Index lowercased *entities* in an Aho-Corasick tree and gather, per
    entity, every sentence whose word tokens contain it.

    NOTE(review): matching sentences are concatenated with no separator,
    exactly as the original did via ``dict[word] += sent``.
    """
    print('Building ' + name + ' tree...')
    tree = ahocorasick.Automaton()
    for idx, entity in enumerate(entities):
        tree.add_word(entity.lower(), (idx, entity.lower()))
    tree.make_automaton()
    print('Assinging ' + name + ' occurrence sentences to ' + name +
          ' tree...')
    occurrences = dict()
    for sent in sentences:
        for word in word_tokenize(sent.lower()):
            if word in tree:
                if word in occurrences:
                    occurrences[word] += sent
                else:
                    occurrences[word] = sent
    return occurrences


def _write_entity_text(entity_dict, column, name, out_path):
    """Dump an entity -> concatenated-text mapping as a '|'-separated file."""
    print('Writing ' + name + ' occurrence sentences to file...')
    df = pd.DataFrame.from_dict(entity_dict, orient='index').reset_index()
    df.columns = [column, 'Text']
    df.to_csv(out_path, index=False, sep='|')
def build_ac_automation_from_strings(cls, keys: List[str]) -> Any:
    """Build a finished Aho-Corasick automaton over *keys*.

    Each key is stored with the payload ``(index, key)``.
    """
    automaton = ahocorasick.Automaton()  # pylint: disable=c-extension-no-member
    for position, word in enumerate(keys):
        automaton.add_word(word, (position, word))
    automaton.make_automaton()
    return automaton
def __init__(self, tags):
    """Index every tag, keyed by its normalised form (TagsTrie.cut_sent)."""
    automaton = ahocorasick.Automaton()
    for tag in tags:
        automaton.add_word(TagsTrie.cut_sent(tag), tag)
    automaton.make_automaton()
    self.tree = automaton
def make_pyahocorasick_automaton(patterns):
    """Return a finished automaton mapping each pattern string to itself."""
    automaton = ahocorasick.Automaton()
    for pattern in patterns:
        automaton.add_word(pattern, pattern)
    automaton.make_automaton()
    return automaton
def run(self):
    """Rebuild the automaton from scratch: populate it, then apply removals."""
    self.A = ahocorasick.Automaton()
    self.add_words()
    self.remove()
#!/usr/bin/python # -*- coding:utf-8 -*- import ahocorasick A = ahocorasick.Automaton() A.add_word('最优秀', (0, '最优秀')) A.add_word('最佳', (0, '最佳')) A.add_word('最好', (0, '最好')) # A.add_word('c', (1, 'c')) # if 'hahhahah' in A : # print("TRUE") # else: # print("TRUE") A.make_automaton() # a=0 # for item in A.iter("_hahhahahhahhahhaheeeee_"): # a=a+1 # import mysql.connector # config = { # 'host': '121.42.33.20', # 'user': '******', # 'password': '******', # 'port': 8306, # 'database': 'htmp', # 'charset': 'utf8'
# Collect candidate .tex files (made relative to target_dir) and expand the
# user-supplied exclusion globs into a flat set of paths to skip.
relative_paths = glob.iglob('%s/**/*.tex' % target_dir, recursive=True)
relative_paths = set(
    [path.replace(target_dir, '').lstrip('/') for path in relative_paths])
ignore_paths = [glob.glob(path) for path in args.exclude]
ignore_paths = set(chain.from_iterable(ignore_paths))

# running counters for the final report
stats = defaultdict(lambda: 0)

# *****************************************************************************
# * Build Aho-Corasick tree
# *****************************************************************************
print("==> Preparing Aho-Corasick concept matcher", file=sys.stderr)
matcher = ahocorasick.Automaton()
with open(args.concepts) as con_f:
    reader = csv.DictReader(con_f)
    for idx, row in enumerate(reader):
        # payload: (matched surface form, canonical concept name)
        matcher.add_word(row['match'], (row['match'], row['concept']))
        stats['index terms'] += 1
matcher.make_automaton()

# *****************************************************************************
# * Annotate index entries
# *****************************************************************************
print("==> Inserting \\index{...} entries", file=sys.stderr)
stats['annotation distribution'] = defaultdict(lambda: 0)
def setUp(self):
    """Fill a fresh automaton with a fixed word list, each mapped to 1."""
    self.A = ahocorasick.Automaton()
    for word in "word python aho corasick tree bark branch root".split():
        self.A.add_word(conv(word), 1)
def __init__(self, file_or_list=None):
    """Create a STORE_LENGTH automaton, optionally preloading words.

    :param file_or_list: optional source of words passed to add_words()
    """
    self.ac = ahocorasick.Automaton(ahocorasick.STORE_LENGTH)
    if file_or_list:
        self.add_words(file_or_list)
    self.interfere_factor = DEFAULT_INTERFERE_FACTOR
def setUp(self):
    """Create an automaton storing arbitrary payloads keyed by sequences."""
    self.A = ahocorasick.Automaton(
        ahocorasick.STORE_ANY, ahocorasick.KEY_SEQUENCE)
} # function words f_func_words = open(FUNCTION_WORDS_FILE_PATH, "r", encoding="utf-8") func_words = f_func_words.readlines() FUNCTION_WORDS_LIST = [word.rstrip() for word in func_words] # fixed expressions f_fixed_expression = open(FIXED_EXPRESSIONS_FILE_PATH, "r", encoding="utf-8") fixed_expressions = f_fixed_expression.readlines() FIXED_EXPRESSIONS_LIST = [word.rstrip() for word in fixed_expressions] # AC Automaton AC_AUTOMATON = ahocorasick.Automaton() for idx, key in enumerate(FIXED_EXPRESSIONS_LIST): AC_AUTOMATON.add_word(key, (idx, key)) AC_AUTOMATON.make_automaton() # BPE SPM = spm.SentencePieceProcessor() SPM.Load(BPE_MODEL_PATH) # special tokens SPECIAL_TOKENS = { "pad": "<pad>", "oov": "<oov>", "sos": "<sos>", "eos": "<eos>" }
def test_get_from_an_empty_automaton(self):
    """get() on an empty automaton must fall back to the supplied default."""
    empty = ahocorasick.Automaton()
    result = empty.get('foo', None)
    self.assertEqual(result, None)
def build_sid_ahoc_queries(sample_ids):
    """Build an Aho-Corasick automaton over anchored sample IDs.

    Each ID is indexed as ``"," + sid + ":"`` — the comma prefix and colon
    suffix ensure a hit cannot come from coverage counts or from inside a
    longer ID; the payload is the bare sample ID.

    :param sample_ids: iterable of sample-ID strings
    :return: a finished ahocorasick.Automaton ready for iter()/search
    """
    acs = ahocorasick.Automaton()
    # Plain loop instead of the original side-effect-only list comprehension
    # (which built and discarded a list of None values).
    for sid in sample_ids:
        acs.add_word("," + sid + ":", sid)
    acs.make_automaton()
    return acs