Beispiel #1
0
    def test_readme_example(self):
        """Exercise the example shown in the project's README.

        If this test case has to change, please update the README
        accordingly.
        """
        tree = KeywordTree(case_insensitive=True)
        for keyword in ('malaga', 'lacrosse', 'mallorca', 'mallorca bella',
                        'orca'):
            tree.add(keyword)
        tree.finalize()

        # search() hands back a single (keyword, position) pair.
        hit = tree.search('My favorite islands are malaga and sylt.')
        self.assertEqual(('malaga', 24), hit)

        hit = tree.search(
            'idontlikewhitespaceswhereismalacrossequestionmark')
        self.assertEqual(('lacrosse', 29), hit)

        # search_all() hands back an iterator over (keyword, position) pairs.
        hits = tree.search_all('malheur on mallorca bellacrosse')
        self.assertIsNotNone(hits)
        expected_hits = [
            ('mallorca', 11),
            ('orca', 15),
            ('mallorca bella', 11),
            ('lacrosse', 23),
        ]
        for expected in expected_hits:
            self.assertEqual(expected, next(hits))
        # Nothing may be left once the four expected hits were consumed.
        with self.assertRaises(StopIteration):
            next(hits)
Beispiel #2
0
    def test_empty_keyword(self):
        """Searching an empty text with only an empty keyword finds nothing."""
        tree = KeywordTree()
        tree.add('')
        tree.finalize()

        self.assertIsNone(tree.search(''))
Beispiel #3
0
 def test_state_to_string(self):
     """str() of the finalized tree's zero state yields a value."""
     tree = KeywordTree(case_insensitive=True)
     for name in ['peter', 'horst', 'gandalf', 'frodo']:
         tree.add(name)
     tree.finalize()
     self.assertIsNotNone(str(tree._zero_state))
Beispiel #4
0
def create_keywordtree(lst, s):
    """Search ``s`` for every keyword in ``lst``, ignoring case.

    A fresh case-insensitive KeywordTree is built from ``lst`` and the
    result of its ``search_all(s)`` call is returned unchanged (the
    original author describes it as (keyword, position) tuples).
    """
    tree = KeywordTree(case_insensitive=True)
    for keyword in lst:
        tree.add(keyword)
    tree.finalize()
    return tree.search_all(s)
Beispiel #5
0
    def test_text_end_situation_2(self):
        """A short keyword is found although the text ends inside a longer one."""
        tree = KeywordTree()
        for keyword in ('blaaaaaf', 'la'):
            tree.add(keyword)
        tree.finalize()

        self.assertEqual(('la', 1), tree.search('bla'))
 def __create_search_tree(self):
     """
     Build the search tree from the ``Ticker`` entries of ``__tickers``
     and store it, finalized, in ``__searchTree``.
     """
     tree = self.__searchTree = KeywordTree()
     for symbol in self.__tickers.Ticker:
         tree.add(symbol)
     tree.finalize()
Beispiel #7
0
    def test_domains(self):
        """A domain keyword is found inside an e-mail style text."""
        kwtree = KeywordTree()
        kwtree.add('searchenginemarketingfordummies.com')
        kwtree.add('linkpt.com')
        kwtree.add('fnbpeterstown.com')
        kwtree.finalize()

        # The searched text must actually contain the keyword: 'linkpt.com'
        # starts at index 10, right after the 10-character 'peterchen@'
        # prefix. A masked address such as '*****@*****.**' can never match.
        result = kwtree.search('peterchen@linkpt.com')
        self.assertEqual(('linkpt.com', 10), result)
Beispiel #8
0
    def test_finalize_errors(self):
        """Misusing the add/finalize/search life cycle raises ValueError."""
        # Searching a tree that was never finalized.
        tree = KeywordTree(case_insensitive=True)
        tree.add('bla')
        tree.add('blue')
        with self.assertRaises(ValueError):
            tree.search('blueb')

        # Adding a keyword after the tree was finalized.
        tree = KeywordTree(case_insensitive=True)
        tree.add('bla')
        tree.finalize()
        with self.assertRaises(ValueError):
            tree.add('blueb')

        # Finalizing a second time.
        tree = KeywordTree(case_insensitive=True)
        tree.add('bla')
        tree.finalize()
        with self.assertRaises(ValueError):
            tree.finalize()
Beispiel #9
0
    def test_simple_back_to_zero_state_example(self):
        """'ab' is reported at index 3 of 'blbabca'."""
        tree = KeywordTree()
        tree.add('ab')
        tree.add('bca')
        tree.finalize()

        self.assertEqual(('ab', 3), tree.search('blbabca'))
Beispiel #10
0
 def test_utility_calls(self):
     """repr() and str() of a finalized tree run and return non-empty text."""
     tree = KeywordTree(case_insensitive=True)
     for keyword in ('bla', 'blue'):
         tree.add(keyword)
     tree.finalize()
     # Only checks that the calls succeed and produce some output.
     self.assertGreater(len(repr(tree)), 0)
     self.assertGreater(len(str(tree)), 0)
Beispiel #11
0
    def test_search_all_issue_1_similar(self):
        """The first search_all hit in '/foo/bar' is 'bar' at index 5."""
        tree = KeywordTree(case_insensitive=True)
        for keyword in ['/bara', '/foo/barb', 'bar']:
            tree.add(keyword)
        tree.finalize()

        hits = tree.search_all('/foo/bar')

        self.assertEqual(('bar', 5), next(hits))
Beispiel #12
0
def search(patterns, content):
    """Return the matched keyword of every hit of ``patterns`` in ``content``.

    Matching is case-insensitive. The list holds the first element of each
    result produced by ``KeywordTree.search_all``, in the order produced.
    """
    tree = KeywordTree(case_insensitive=True)
    for pattern in patterns:
        tree.add(pattern)
    tree.finalize()

    return [hit[0] for hit in tree.search_all(content)]
Beispiel #13
0
    def test_case_insensitivity_mode(self):
        """With case_insensitive=True, texts match regardless of letter case."""
        tree = KeywordTree(case_insensitive=True)
        for keyword in ('bla', 'blue', 'blISs'):
            tree.add(keyword)
        tree.finalize()

        self.assertEqual(('bla', 0), tree.search('bLa'))

        # The keyword is reported in the spelling it was added with.
        self.assertEqual(('blISs', 0), tree.search('BLISS'))
Beispiel #14
0
    def test_visualizer(self):
        """Draw the README example tree to 'readme_example.png'."""
        # Needs working pygraphviz on system
        tree = KeywordTree(case_insensitive=True)
        for keyword in ('malaga', 'lacrosse', 'mallorca', 'mallorca bella',
                        'orca'):
            tree.add(keyword)
        tree.finalize()

        Visualizer().draw('readme_example.png', tree)
Beispiel #15
0
    def test_unicode(self):
        """Non-ASCII keywords are found at their character index."""
        tree = KeywordTree()
        for keyword in ('bla', 'blue', u'颜到'):
            tree.add(keyword)
        tree.finalize()

        self.assertEqual((u'颜到', 4), tree.search(u'春华变苍颜到处群魔乱'))

        self.assertIsNone(tree.search(u'三年过'))
Beispiel #16
0
def aho_corasick_search(peptides: List[str], proteins: Dict[str, str]) -> List[Tuple[str, str]]:
    """Find, for each protein, one peptide that occurs in its sequence.

    All peptides are loaded into a single case-insensitive KeywordTree and
    every protein sequence is searched once with ``KeywordTree.search``.

    :param peptides: peptide strings to look for
    :param proteins: mapping of protein key to protein sequence
    :return: one ``(protein key, matched peptide)`` pair for every protein
        in which ``search`` reported a match; proteins without a match are
        omitted
    """
    kwtree = KeywordTree(case_insensitive=True)
    for peptide in peptides:
        kwtree.add(peptide)
    kwtree.finalize()

    matches: List[Tuple[str, str]] = []
    # tqdm only wraps the iteration to display progress.
    for key, protein in tqdm(proteins.items()):
        match = kwtree.search(protein)
        # search() returns None when nothing matched; compare by identity.
        if match is not None:
            matches.append((key, match[0]))
    return matches
Beispiel #17
0
    def test_suffix_stuff(self):
        """'aaaamen' is found although the text starts like other keywords."""
        tree = KeywordTree()
        for keyword in ('blaaaaaf', 'bluez', 'aaaamen', 'uebergaaat'):
            tree.add(keyword)
        tree.finalize()

        self.assertEqual(('aaaamen', 3), tree.search('blaaaaamentada'))

        self.assertEqual(
            ('aaaamen', 17),
            tree.search('clueuebergaaameblaaaamenbluez'))
Beispiel #18
0
 def init_context(self):
     """Build a case-sensitive keyword tree for every ``IN_*`` atomic op.

     The concept name is the operation name without its first and last
     underscore-separated parts; its words come from ``concept_words``.
     The resulting mapping is stored in ``self.concept_context``.
     """
     contexts = {}
     for op_name in self.atomic_op:
         if not op_name.startswith('IN_'):
             # Nothing to build for other operations (the original code
             # notes BEFORE here).
             continue
         concept = '_'.join(op_name.split('_')[1:-1])
         tree = KeywordTree(case_insensitive=False)
         for keyword in self.concept_words[concept]:
             tree.add(keyword)
         tree.finalize()
         contexts[op_name] = tree
     self.concept_context = contexts
def ahocorasick_all_match(text, keywords):
    """Count how many distinct keywords occur in ``text`` (case-insensitive).

    :param text: text to search
    :param keywords: iterable of keyword strings
    :return: number of different keywords reported by ``search_all``
    """
    kwtree_all = KeywordTree(case_insensitive=True)
    for key in keywords:
        kwtree_all.add(key)
    kwtree_all.finalize()

    # A set de-duplicates repeated hits in constant time per hit instead of
    # scanning a growing list for every result.
    distinct_matches = {result[0] for result in kwtree_all.search_all(text)}
    return len(distinct_matches)
Beispiel #20
0
    def __init__(
        self,
        path_to_geo_entities: str = "data/openweathermap_city_list.json"
    ) -> None:
        """Set up a keyword tree used to find city names.

        :param path_to_geo_entities: filepath to a JSON file containing a list of cities
            file format: ["Ḩeşār-e Sefīd", "‘Ayn Ḩalāqīm", "Taglag", ..... , "Gerton"]
            this list was created using the source file: https://bulk.openweathermap.org/sample/city.list.json.gz
        :type path_to_geo_entities: str
        """
        self.geonames = self._load_from_json(path_to_geo_entities)
        tree = self.kwtree = KeywordTree(case_insensitive=True)
        # Each name is added with one space on either side.
        for name in self.geonames:
            tree.add(f" {name} ")
        tree.finalize()
Beispiel #21
0
 def searcher(self, filepath):
     """Load a ``word weight`` file and build a keyword tree from its words.

     Every line must consist of exactly two fields separated by a single
     space. Underscores in the word are replaced by spaces; the weight is
     kept as a string with everything from the first newline on removed.

     :param filepath: path of the file to read
     :return: tuple ``(kwtree, kwtree_word, kwtree_weight)`` - the
         finalized case-insensitive tree plus the parallel lists of words
         and weights in file order
     :raises ValueError: if a line does not split into exactly two fields
     """
     kwtree_word = []
     kwtree_weight = []
     # The context manager closes the file even when a line fails to parse.
     with open(filepath) as f:
         for line in f:
             word, weight = line.split(' ')
             kwtree_word.append(word.replace('_', ' '))
             kwtree_weight.append(weight.split('\n')[0])
     kwtree = KeywordTree(case_insensitive=True)
     for word in kwtree_word:
         kwtree.add(word)
     kwtree.finalize()
     return kwtree, kwtree_word, kwtree_weight
Beispiel #22
0
    def __init__(self, callback, bt_device_id, device_filter, packet_filter,
                 scan_parameters):
        """Construct interface object."""
        # Imported here so that the package can be used in parsing-only mode
        # (no bluez required).
        self.backend = import_module('beacontools.backend')

        threading.Thread.__init__(self)
        self.daemon = False
        self.keep_going = True
        self.callback = callback

        self.bt_device_id = bt_device_id  # number of the bt device (hciX)
        self.device_filter = device_filter  # list of beacons to monitor
        self.mode = get_mode(device_filter)
        self.packet_filter = packet_filter  # list of packet types to monitor
        self.socket = None  # bluetooth socket
        # keeps track of Eddystone Beacon <-> bt addr mapping
        self.eddystone_mappings = []
        self.scan_parameters = scan_parameters  # passed to the bt device

        # Aho-Corasick search tree for efficient prefiltering: one byte
        # pattern is registered per enabled scanner mode.
        uuid_prefix = b"\x03\x03"
        manufacturer_prefix = bytes([MANUFACTURER_SPECIFIC_DATA_TYPE])
        mode = self.mode
        tree = self.kwtree = KeywordTree()
        if mode & ScannerMode.MODE_IBEACON:
            tree.add(manufacturer_prefix + IBEACON_MANUFACTURER_ID +
                     IBEACON_PROXIMITY_TYPE)
        if mode & ScannerMode.MODE_EDDYSTONE:
            tree.add(uuid_prefix + EDDYSTONE_UUID)
        if mode & ScannerMode.MODE_ESTIMOTE:
            tree.add(uuid_prefix + ESTIMOTE_UUID)
            tree.add(manufacturer_prefix + ESTIMOTE_MANUFACTURER_ID)
        if mode & ScannerMode.MODE_CJMONITOR:
            tree.add(manufacturer_prefix + CJ_MANUFACTURER_ID)
        if mode & ScannerMode.MODE_EXPOSURE_NOTIFICATION:
            tree.add(uuid_prefix + EXPOSURE_NOTIFICATION_UUID)
        tree.finalize()
Beispiel #23
0
    def test_case_sensitivity(self):
        """By default a keyword only matches in its exact spelling."""
        tree = KeywordTree()
        for keyword in ('bla', 'blue', 'blISs'):
            tree.add(keyword)
        tree.finalize()

        # Differently cased texts must not match.
        for text in ('bLa', 'BLISS', 'bliss'):
            self.assertIsNone(tree.search(text))

        self.assertEqual(('blISs', 0), tree.search('blISs'))
Beispiel #24
0
    def test_many_keywords(self):
        """Search a text file with keywords loaded from a names file."""
        tree = KeywordTree(case_insensitive=True)
        with open('tests/data/names.txt') as names_file:
            names = [line.strip() for line in names_file]

        for name in names:
            tree.add(name)
        tree.finalize()

        with open('tests/data/textblob.txt') as text_file:
            text = text_file.read()

        expected = ('Dawn Higgins', 34153)
        self.assertEqual(expected, tree.search(text))

        # search_all must report exactly that one hit and nothing else.
        hits = tree.search_all(text)
        self.assertIsNotNone(hits)
        self.assertEqual(expected, next(hits))
        with self.assertRaises(StopIteration):
            next(hits)
Beispiel #25
0
    def test_simple(self):
        """Basic hits and misses with two keywords."""
        tree = KeywordTree()
        for keyword in ('bla', 'blue'):
            tree.add(keyword)
        tree.finalize()

        # A keyword prefix, the empty text and unrelated text give no result.
        for text in ('bl', '', 'zef'):
            self.assertIsNone(tree.search(text))

        self.assertEqual(('bla', 0), tree.search('blaaaa'))

        self.assertEqual(('blue', 10), tree.search('red green blue grey'))
Beispiel #26
0
    def test_pickling_simple(self):
        """A tree passed through dumps()/loads() can still be searched."""
        tree = KeywordTree(case_insensitive=True)
        for name in ['peter', 'horst', 'gandalf', 'frodo']:
            tree.add(name)
        tree.finalize()

        serialized = dumps(tree)
        self.assertIsNotNone(serialized)

        restored = loads(serialized)
        self.assertIsNotNone(restored)

        hits = restored.search_all(
            'Gollum did not like frodo. But gandalf did.')

        self.assertEqual(('frodo', 20), next(hits))
        self.assertEqual(('gandalf', 31), next(hits))
Beispiel #27
0
    def __init__(self, names: typing.List[str], path_parts: typing.List[str]):
        """Store the inputs and choose the lookup implementation.

        ``self.lookup`` starts as ``self.is_blacklisted_part``. When the
        optional ``ahocorapy`` package can be imported, a case-insensitive
        KeywordTree is built from ``path_parts`` and ``self.lookup`` is
        switched to ``self.is_blacklisted_part_aho``.

        :param names: stored unchanged as ``self.names``
        :param path_parts: stored as ``self.path_parts``; also the keywords
            of the tree
        """
        self.names = names
        self.path_parts = path_parts

        # The lookup algorithm
        self.lookup = self.is_blacklisted_part
        self.tree = None

        # Guard only the optional import, so that an ImportError raised
        # while building the tree is not silently swallowed.
        try:
            from ahocorapy.keywordtree import KeywordTree  # type: ignore
        except ImportError:
            # Package not available: keep the default lookup.
            return

        self.tree = KeywordTree(case_insensitive=True)
        for p in self.path_parts:
            self.tree.add(p)
        self.tree.finalize()

        self.lookup = self.is_blacklisted_part_aho
Beispiel #28
0
    def _load_kw_trees(self) -> List[KeywordTree]:
        """Load the prefix trees for the terms of the dictionary files.

        The name of each file corresponds to the number of tokens in the
        terms of that file. Every non-empty line is split on whitespace
        and added to the tree of its file as a token sequence.

        :return: List of prefix trees, one per file
        """
        # Same files as before, listed directly in processing order.
        file_names = (
            '20.txt', '14.txt', '13.txt', '12.txt', '11.txt', '10.txt',
            '9.txt', '8.txt', '7.txt', '6.txt', '5.txt', '4.txt', '3.txt',
            '2.txt', '1.txt',
        )
        terms_dir = os.path.join(DICT_EXTRACTOR_PATH, TERMS_DIR_NAME)
        trees = []
        for file_name in file_names:
            tree = KeywordTree()
            with open(os.path.join(terms_dir, file_name), 'r') as term_file:
                for line in term_file.read().split('\n'):
                    if line:
                        tree.add(line.split())
                tree.finalize()
                trees.append(tree)
        return trees
def init_ahocorapy():
    """Return a finalized KeywordTree holding every entry of ``keyword_list``."""
    tree = KeywordTree()
    for entry in keyword_list:
        tree.add(entry)
    tree.finalize()
    return tree
Beispiel #30
0

def load_words():
    """Return the stripped lines of ``../data/dict/words.txt`` as a set.

    The path is resolved relative to ``basedir``.
    """
    words_path = os.path.join(basedir, '../data/dict/words.txt')
    with open(words_path) as words_file:
        return {line.strip() for line in words_file}


# Merge the entries from both loaders into the shared keyword collection.
keywords.update(load_dict())
keywords.update(load_words())

# One case-insensitive alternation over all keywords.
# NOTE(review): keywords are joined without re.escape, so any regex
# metacharacter in a keyword is interpreted as pattern syntax - confirm
# that this is intended.
keyword_pattern = re.compile('(' + '|'.join(keywords) + ')', re.IGNORECASE)

# The same keywords as a finalized, case-insensitive Aho-Corasick tree.
keyword_tree = KeywordTree(case_insensitive=True)
for word in keywords:
    keyword_tree.add(word)
keyword_tree.finalize()


def nsfw_text(text: str):
    """Return True if ``keyword_tree`` finds a keyword in ``text``, else False."""
    return bool(keyword_tree.search(text))


# Matches a single word character that has a space directly before and
# directly after it; the character itself is captured as group 1.
anti_filter_pattern = r'(?<= )(\w)(?= )'


def anti_fiter(text: str):