def actree(FRA_database, excludefile):
    """
    Create an Aho-Corasick keyword tree for multi-string search.

    Parameters
    ----------
    FRA_database: str
        database txt file of FRA.
    excludefile: str
        file of names to exclude; some words are not abbreviations of
        any protein.

    Returns
    -------
    tree
        ahocorasick KeywordTree, already finalized with make().
    lower_name_dic: dict
        map of keyword to protein information, as produced by actree_pre.
    """
    # actree_pre supplies both the keywords to index and the lookup dict.
    prepared = actree_pre(FRA_database, excludefile)
    keywords, lower_name_dic = prepared

    keyword_tree = ahocorasick.KeywordTree()
    for keyword in keywords:
        keyword_tree.add(keyword)
    keyword_tree.make()

    return keyword_tree, lower_name_dic
def ahocorasick_test():
    """Exercise the ahocorasick KeywordTree API, printing every result.

    Builds a tree holding the keywords "alpha", "alpha beta" and "gamma",
    then runs search(), search_long() and findall() over a few sample
    sentences and prints what each call returns.
    """
    tree = ahocorasick.KeywordTree()
    for keyword in ("alpha", "alpha beta", "gamma"):
        tree.add(keyword)
    tree.make()

    # The sample sentences use implicit string concatenation; a backslash
    # continuation inside the literal would leak source indentation into
    # the text being searched.
    result = tree.search("I went to alpha beta the other day to pick up some "
                         "spam")
    print(result)

    result = tree.search_long("I went to alpha beta the other day to pick up "
                              "some spam")
    print(result)

    result = tree.search("and also got some alphabet soup")
    print(result)

    result = tree.search("but no waffles")
    print(result)

    result = tree.search_long("on, gamma rays are not tasty")
    print(result)

    result = tree.findall("I went to alpha beta to pick up alphabet soup")
    for match in result:
        print(match)
def load(self):
    """Load entities from file into an Aho-Corasick tree.

    Reads the file at FLAGS.entities_path (decoded with
    FLAGS.config_encoding), one entry per line.  A line containing the
    tab-plus-space separator is split into an entity and a float weight;
    any other line is taken whole as the entity, with weight 1.0.

    Entities shorter than two characters, or consisting only of up to four
    ASCII letters/digits, are skipped.  The remaining ones are encoded with
    FLAGS.runtime_encoding, upper-cased, added to ``self.tree`` and recorded
    in ``self.weights`` (entity -> weight).  The tree is finalized with
    make() at the end.

    Raises:
        AssertionError: if FLAGS.entities_path is not set.
        ValueError: if a weight field cannot be parsed as a float.
    """
    assert FLAGS.entities_path
    # The context manager closes the handle even when reading/decoding fails.
    with codecs.open(FLAGS.entities_path, encoding=FLAGS.config_encoding,
                     mode='r') as f:
        lines = f.read().split('\n')

    blacklist_pattern = re.compile(r'^[a-zA-Z0-9]{,4}$')
    self.tree = ahocorasick.KeywordTree()
    self.weights = {}
    for line in lines:
        parts = line.strip().split('\t ')
        if len(parts) == 1:
            # No separator: the line itself (not the stripped copy) is the
            # entity, with the default weight.
            entity = line
            weight = 1.0
        else:
            entity = parts[0]
            weight = float(parts[1])
        if len(entity) < 2 or blacklist_pattern.match(entity):
            continue
        entity = entity.encode(FLAGS.runtime_encoding).upper()
        self.tree.add(entity)
        self.weights[entity] = weight
    self.tree.make()
def getLabels():
    """Return the labels of the zero state of a small sample tree.

    The tree is built from the keywords "he", "she", "his" and "hers",
    added in that order, and finalized before its zero state is queried.
    """
    sample_tree = ahocorasick.KeywordTree()
    for keyword in ("he", "she", "his", "hers"):
        sample_tree.add(keyword)
    sample_tree.make()
    root_state = sample_tree.zerostate()
    return root_state.labels()
def testEmptyConstruction(self):
    """Construct and immediately deallocate a tree with no keywords.

    This matters because the C implementation assumes keywords exist when
    a tree is deallocated, so the back end has to do extra work to avoid
    segmentation errors in the empty case.
    """
    empty_tree = ahocorasick.KeywordTree()
    del empty_tree
def build_actries(profiles_keywords):
    """Build one finalized KeywordTree from several keyword collections.

    Args:
        profiles_keywords: iterable of iterables of keywords; every keyword
            of every inner collection is added to the same tree, in order.

    Returns:
        The ahocorasick KeywordTree after make() has been called on it.
    """
    actries = ahocorasick.KeywordTree()
    # Flatten the per-profile collections lazily, preserving their order.
    all_keywords = (
        keyword
        for keywords_of_profile in profiles_keywords
        for keyword in keywords_of_profile
    )
    for keyword in all_keywords:
        actries.add(keyword)
    actries.make()
    return actries
def __init__(self, dictFile):
    """Build the keyword tree from a dictionary file.

    Each line of ``dictFile``, minus its trailing newline, is added as one
    keyword; the tree is then finalized with make().

    Any exception raised while reading the file or building the tree is
    logged at critical level and swallowed, so construction never raises.
    In that case the tree may be left incomplete and not finalized.

    Args:
        dictFile: path of the keyword file, one keyword per line.
    """
    self.__tree = ahocorasick.KeywordTree()
    try:
        # The context manager closes the file even if add() raises midway.
        with open(dictFile) as fp:
            for line in fp:
                self.__tree.add(line.rstrip("\n"))
        self.__tree.make()
    except Exception as e:
        # Deliberate best effort: report the failure and carry on.
        log.critical("Exception caught in Confilter.__init__ function. "
                     "[Exception]: %s" % e)
def build(name):
    """Build a keyword tree from the file associated with ``name``.

    The file path comes from ``name2fn(name)``.  Every line is stripped of
    leading/trailing CR and LF characters; lines that are empty afterwards
    are ignored and the rest are added as keywords.

    Args:
        name: identifier passed to name2fn() to obtain the file to read.

    Returns:
        A finalized ahocorasick KeywordTree, or None when the file holds
        no non-empty line (no tree is created in that case).
    """
    tree = None
    # The context manager closes the file, which the bare
    # open(...).readlines() form never did explicitly.
    with open(name2fn(name), 'rb') as fp:
        for line in fp:
            s = line.strip('\r\n')
            if not s:  # empty check
                continue
            if tree is None:
                # Create the tree lazily, on the first real keyword.
                tree = ahocorasick.KeywordTree()
            tree.add(s)
    if tree is not None:
        tree.make()
    return tree
def __init__(self, filename, delimiter):
    """Build the keyword tree from a delimited word file.

    The whole file is read in binary mode and split on ``delimiter``;
    every non-empty piece is added to the tree, which is then finalized
    with make().

    Args:
        filename: path of the file holding the words.
        delimiter: separator the file content is split on.
    """
    self._tree = ahocorasick.KeywordTree()
    with open(filename, 'rb') as source:
        raw = source.read()
    # NOTE: the content is used exactly as read in binary mode; no decoding
    # step is applied before splitting.
    for keyword in filter(None, raw.split(delimiter)):
        self._tree.add(keyword)
    self._tree.make()
def __init__(self, sigs, min_fx=MIN_FIXED_STRING_LENGHT):
    """Index the fixed-string signatures in an Aho-Corasick keyword tree.

    Args:
        sigs: signature collection; must provide get_fixedstrings(), whose
            items are added to the tree as keywords.  Stored as self.sigs.
        min_fx: length threshold; only fixed strings longer than this value
            are indexed, the others are skipped with a warning.

    The finalized tree is stored as ``self.tree`` and the elapsed build time
    is logged at debug level.
    """
    self.sigs = sigs
    start_time = datetime.datetime.now()
    self.tree = ahocorasick.KeywordTree()
    for fixedstring_sig in sigs.get_fixedstrings():
        # Compare the LENGTH of the string with the threshold.  Comparing
        # the string itself with an integer never filters anything on
        # Python 2 and raises TypeError on Python 3.
        # NOTE(review): the boundary is exclusive (length == min_fx is
        # ignored) while the warning says "less than" -- confirm intent.
        if len(fixedstring_sig) > min_fx:
            self.tree.add(fixedstring_sig)
        else:
            logging.warning(
                "Ignoring signature %s because fixed string representation "
                "is less than % i ",
                fixedstring_sig, min_fx)
    self.tree.make()
    end_time = datetime.datetime.now()
    logging.debug("Signature download and index build time" +
                  str(end_time - start_time))
def solve(T):
    """Read one test case from stdin and print its answer.

    Input (three lines): the integers ``K W L``, the key string, and the
    target word.  Every string obtained by picking L characters from the
    key (with repetition, K**L strings counted) is generated, and the
    number of possibly overlapping occurrences of the word in it is
    counted.  The printed value is the maximum count minus the mean count,
    with eight decimals.

    Args:
        T: zero-based index of the test case; printed as ``Case #T+1:``.

    W is parsed but not otherwise used.  The mean divides by K**L, which
    presumably assumes the key string has exactly K characters.
    """
    # Python 2 print statement: the trailing comma keeps the answer on the
    # same output line as the case label.
    print("Case #%d:" % (T + 1)),
    K, W, L = map(int, raw_input().split())
    key = raw_input()
    word = raw_input()

    # Zero-width lookahead so that overlapping occurrences are all counted.
    # Compiled once outside the loop; the word is escaped so it is always
    # matched literally.
    occurrences = re.compile('(?=%s)' % re.escape(word))

    MAX = 0
    num = K ** L
    SUM = 0
    for n in itertools.product(key, repeat=L):
        mam = "".join(n)
        ter = len(occurrences.findall(mam))
        SUM += ter
        MAX = max(MAX, ter)
    print("%.8f" % (MAX - ((SUM + 0.0) / num)))
def MultipleStringMatch(patterns, corpus):
    """Search a list of strings in a corpus string.

    Args:
      patterns: A list of strings.
      corpus: The text where to search for the strings.

    Result:
      A list of Booleans stating whether each pattern string was found in the
      corpus or not.  A pattern that appears several times in ``patterns``
      is reported consistently at every one of its positions.
    """
    result = [False] * len(patterns)
    if not patterns:
        # Nothing to search for; also avoids finalizing a tree that holds
        # no keywords.
        return result

    # Map each pattern to every index it occupies.  This replaces a linear
    # patterns.index() lookup per match, which also only ever flagged the
    # first occurrence of a duplicated pattern.
    positions = {}
    for index, word in enumerate(patterns):
        positions.setdefault(word, []).append(index)

    tree = ahocorasick.KeywordTree()
    for word in patterns:
        tree.add(word)
    tree.make()

    for i, j in tree.findall(corpus):
        match = corpus[i:j]
        for index in positions[match]:
            result[index] = True
    return result
from distutils.util import get_platform
import sys

# Make the freshly built extension importable from the distutils build
# directory.  The version comes from sys.version_info: slicing sys.version
# to three characters turns "3.10" into "3.1".
sys.path.insert(0, "build/lib.%s-%d.%d" % (get_platform(),
                                            sys.version_info[0],
                                            sys.version_info[1]))
import ahocorasick

# We just want to exercise the code and monitor its memory usage.
# The loop never terminates on its own; stop it manually.
n = 0
while True:
    sys.stdout.write("iteration %s\n" % n)
    sys.stdout.flush()
    tree = ahocorasick.KeywordTree()
    # The context manager closes the word list even if add() raises.
    with open("/usr/share/dict/words") as f:
        for word in f:
            tree.add(word.strip())
    tree.make()
    tree.search("foo bar baz")
    n += 1
def setUp(self):
    """Create a new KeywordTree with no keywords and store it as self.tree."""
    self.tree = ahocorasick.KeywordTree()
def getZerostate():
    """Return the zero state of a tree holding the single keyword "foobar".

    The tree is finalized with make() before its zero state is requested.
    """
    single_keyword_tree = ahocorasick.KeywordTree()
    single_keyword_tree.add("foobar")
    single_keyword_tree.make()
    zero_state = single_keyword_tree.zerostate()
    return zero_state