Exemple #1
0
def actree(FRA_database, excludefile):
    """
    Build an Aho-Corasick keyword tree for multi-string search.

    Parameters
    ----------
    FRA_database: str
        database txt file of FRA.
    excludefile: str
        file of names to exclude, because some words are not
        abbreviations of any protein.

    Returns
    -------
    tree
        ahocorasick KeywordTree, already finalized with ``make()``.
    lower_name_dic: dict
        map of keyword to protein information.
    """
    keywords, name_info = actree_pre(FRA_database, excludefile)

    keyword_tree = ahocorasick.KeywordTree()
    # Every keyword has to be registered before the tree is finalized.
    for keyword in keywords:
        keyword_tree.add(keyword)
    keyword_tree.make()

    return keyword_tree, name_info
Exemple #2
0
def ahocorasick_test():
    """
    Exercise the ahocorasick KeywordTree API and print every result.

    Builds a tree from the keywords "alpha", "alpha beta" and "gamma",
    then runs ``search``, ``search_long`` and ``findall`` against a few
    sample sentences, printing whatever each call returns.
    """
    tree = ahocorasick.KeywordTree()
    for keyword in ("alpha", "alpha beta", "gamma"):
        tree.add(keyword)

    tree.make()

    # Adjacent string literals are joined by the parser.  A backslash
    # continuation *inside* a literal would instead embed the next line's
    # indentation into the searched text.
    sentence = ("I went to alpha beta the other day to pick up some "
                "spam")

    result = tree.search(sentence)
    print(result)

    result = tree.search_long(sentence)
    print(result)

    result = tree.search("and also got some alphabet soup")
    print(result)

    result = tree.search("but no waffles")
    print(result)

    result = tree.search_long("on, gamma rays are not tasty")
    print(result)

    result = tree.findall("I went to alpha beta to pick up alphabet soup")
    for match in result:
        print(match)
Exemple #3
0
    def load(self):
        """Load entities from file to Aho-Corasick tree.

        Reads the file at ``FLAGS.entities_path`` (decoded with
        ``FLAGS.config_encoding``), one entity per line.  A line may carry
        a weight after the ``'\\t '`` separator; lines without one get a
        weight of 1.0.  Entities shorter than two characters, or made of at
        most four ASCII letters/digits, are skipped.  Each accepted entity
        is encoded with ``FLAGS.runtime_encoding``, upper-cased, added to
        ``self.tree`` and recorded in ``self.weights``.

        Raises:
            AssertionError: if ``FLAGS.entities_path`` is empty.
            ValueError: if a weight field cannot be parsed as a float.
        """
        assert FLAGS.entities_path
        # The context manager closes the file even when reading or decoding
        # raises, which the manual open()/close() pair did not guarantee.
        with codecs.open(FLAGS.entities_path,
                         encoding=FLAGS.config_encoding,
                         mode='r') as f:
            lines = f.read().split('\n')

        # Matches strings of at most four ASCII alphanumerics.
        blacklist_pattern = re.compile(r'^[a-zA-Z0-9]{,4}$')
        self.tree = ahocorasick.KeywordTree()
        self.weights = {}
        for line in lines:
            parts = line.strip().split('\t ')
            if len(parts) == 1:
                # NOTE(review): this keeps the unstripped line (possible
                # trailing '\r' or spaces) rather than parts[0] - confirm
                # that this is intended.
                entity = line
                weight = 1.0
            else:
                entity = parts[0]
                weight = float(parts[1])
            if len(entity) < 2 or blacklist_pattern.match(entity):
                continue
            entity = entity.encode(FLAGS.runtime_encoding).upper()
            self.tree.add(entity)
            self.weights[entity] = weight
        self.tree.make()
Exemple #4
0
def getLabels():
    """Return the labels of the zero state of a small sample tree.

    The tree is built from the keywords "he", "she", "his" and "hers".
    """
    keyword_tree = ahocorasick.KeywordTree()
    for keyword in ("he", "she", "his", "hers"):
        keyword_tree.add(keyword)
    keyword_tree.make()

    zero_state = keyword_tree.zerostate()
    return zero_state.labels()
 def testEmptyConstruction(self):
     """Construct and immediately deallocate a tree without keywords.

     This matters because the C implementation assumes keywords exist
     when it deallocates, so the back end needs extra work to avoid
     segmentation errors in the keyword-less case."""
     empty_tree = ahocorasick.KeywordTree()
     # Dropping the only reference is the actual check: it must not crash.
     del empty_tree
Exemple #6
0
def build_actries(profiles_keywords):
    """Build one finalized KeywordTree from the keywords of all profiles.

    ``profiles_keywords`` is an iterable of keyword iterables; every
    keyword of every inner iterable is added to the same tree, in order.
    """
    keyword_tree = ahocorasick.KeywordTree()

    # Flatten the two nesting levels lazily, preserving the original order.
    flattened = (
        keyword
        for keywords in profiles_keywords
        for keyword in keywords
    )
    for keyword in flattened:
        keyword_tree.add(keyword)

    keyword_tree.make()
    return keyword_tree
Exemple #7
0
 def __init__(self, dictFile):
     """Build the keyword tree from a dictionary file.

     Each line of ``dictFile`` (without its trailing newline) is added to
     the tree as one keyword, then the tree is finalized.  Any failure
     while reading the file or building the tree is logged as critical
     and otherwise swallowed, so construction itself never raises.
     """
     self.__tree = ahocorasick.KeywordTree()
     try:
         # The context manager closes the file even if add() raises.
         with open(dictFile) as fp:
             for line in fp:
                 self.__tree.add(line.rstrip("\n"))
         self.__tree.make()
     except Exception as e:
         # Adjacent literals keep the message free of source indentation,
         # which a backslash continuation inside the string would embed.
         log.critical("Exception caught in Confilter.__init__ function. "
                      "[Exception]: %s" % e)
Exemple #8
0
 def build(name):
   """Build a keyword tree from the file that ``name2fn(name)`` points to.

   Every non-empty line, with surrounding CR/LF characters stripped,
   becomes one keyword.  The tree is created lazily on the first keyword.

   Returns the finalized tree, or None when the file has no keywords.
   """
   tree = None
   # The context manager closes the file; iterating it directly avoids
   # materializing every line up front with readlines().
   with open(name2fn(name), 'rb') as fp:
     for line in fp:
       s = line.strip('\r\n')
       if s: # empty check
         if tree is None:
           tree = ahocorasick.KeywordTree()
         tree.add(s)
   if tree is not None:
     tree.make()
   return tree
Exemple #9
0
    def __init__(self, filename, delimiter):
        """Build the keyword tree from a delimiter-separated file.

        The file is read in binary mode, split on ``delimiter``, and every
        non-empty piece is added to the tree as a keyword before the tree
        is finalized.
        """
        keyword_tree = ahocorasick.KeywordTree()
        self._tree = keyword_tree

        with open(filename, 'rb') as dict_file:
            # The contents are used exactly as read: no decoding is applied.
            raw = dict_file.read()

            for piece in raw.split(delimiter):
                if not piece:
                    # Consecutive or trailing delimiters yield empty pieces.
                    continue
                keyword_tree.add(piece)

        keyword_tree.make()
Exemple #10
0
    def __init__(self, sigs, min_fx=MIN_FIXED_STRING_LENGHT):
        """Index the fixed strings of ``sigs`` in an Aho-Corasick tree.

        Args:
            sigs: signature source; must provide ``get_fixedstrings()``.
            min_fx: length threshold.  Only fixed strings strictly longer
                than this are indexed; the rest are skipped with a warning.

        The time spent building the index is logged at debug level.
        """
        self.sigs = sigs
        start_time = datetime.datetime.now()
        self.tree = ahocorasick.KeywordTree()
        for fixedstring_sig in sigs.get_fixedstrings():
            # Compare the LENGTH of the fixed string with the threshold.
            # Comparing the string itself to an int is always true on
            # Python 2 and a TypeError on Python 3, so nothing was ever
            # filtered out.
            if len(fixedstring_sig) > min_fx:
                self.tree.add(fixedstring_sig)
            else:
                logging.warning(
                    "Ignoring signature %s because fixed string representation is less than % i "
                    % (fixedstring_sig, min_fx))

        self.tree.make()
        end_time = datetime.datetime.now()
        logging.debug("Signature download and index build time" +
                      str(end_time - start_time))
Exemple #11
0
def solve(T):
    """Solve one test case read from standard input and print its answer.

    Reads three lines: the integers ``K W L``, the keyboard string and the
    target word.  Enumerates every string of length ``L`` over the
    keyboard, counts the (possibly overlapping) occurrences of the word in
    each, and prints the maximum count minus the average count, where the
    average divides by ``K ** L``.

    Args:
        T: zero-based case index; the output is labelled ``Case #<T + 1>``.

    Note: input is read with ``raw_input``, i.e. this targets Python 2.
    """
    K, W, L = map(int, raw_input().split())
    key = raw_input()
    word = raw_input()

    # A zero-width lookahead lets findall() count overlapping occurrences.
    # The word is escaped so it is always matched literally, and the
    # pattern is compiled once instead of once per candidate string.
    occurrence = re.compile('(?=%s)' % re.escape(word))

    MAX = 0
    num = K**L
    SUM = 0
    for n in itertools.product(key, repeat=L):
        ter = len(occurrence.findall("".join(n)))

        SUM += ter
        MAX = max(MAX, ter)

    # One parenthesized string prints identically on Python 2 and 3 and
    # yields the same "Case #N: value" line as the two print statements.
    print("Case #%d: %.8f" % (T + 1, MAX - ((SUM + 0.0) / num)))
Exemple #12
0
def MultipleStringMatch(patterns, corpus):
    """Search a list of strings in a corpus string.

    Args:
      patterns: A list of strings.
      corpus: The text where to search for the strings.

    Result:
      A list of Booleans stating whether each pattern string was found in
      the corpus or not.  Duplicate patterns are each reported.
    """
    result = [False] * len(patterns)
    if not patterns:
        # Nothing to search for; skip building a tree without keywords.
        return result

    # Map every pattern to all of its positions.  This replaces a linear
    # patterns.index() scan per match and marks every duplicate of a
    # pattern instead of only its first occurrence.
    positions = {}
    for index, word in enumerate(patterns):
        positions.setdefault(word, []).append(index)

    tree = ahocorasick.KeywordTree()
    for word in patterns:
        tree.add(word)
    tree.make()

    for i, j in tree.findall(corpus):
        for index in positions[corpus[i:j]]:
            result[index] = True

    return result
Exemple #13
0
from distutils.util import get_platform
import sys

# Put the locally built extension first on the import path.  The version
# comes from sys.version_info because slicing sys.version[0:3] yields "3.1"
# on Python 3.10 and later.
sys.path.insert(
    0,
    "build/lib.%s-%d.%d" % ((get_platform(),) + tuple(sys.version_info[:2])))


import ahocorasick

# We just want to exercise the code and monitor its memory usage.

n = 0
# Deliberately endless: each pass rebuilds the tree from the system word
# list so that memory growth can be observed from outside the process.
while True:
    sys.stdout.write("iteration %s\n" % n)
    sys.stdout.flush()
    tree = ahocorasick.KeywordTree()
    # The context manager closes the word list even if add() raises.
    with open("/usr/share/dict/words") as f:
        for word in f:
            tree.add(word.strip())
    tree.make()
    tree.search("foo bar baz")
    n += 1
 def setUp(self):
     """Create an empty KeywordTree and store it on ``self.tree``."""
     # NOTE(review): presumably the unittest per-test fixture hook; the
     # enclosing class is not visible here - confirm.
     self.tree = ahocorasick.KeywordTree()
def getZerostate():
    """Return the zero state of a finalized tree holding only "foobar"."""
    keyword_tree = ahocorasick.KeywordTree()
    keyword_tree.add("foobar")
    keyword_tree.make()

    zero_state = keyword_tree.zerostate()
    return zero_state