def load_and_search():
    """Load the serialized AC automaton via mmap and run a sample search.

    Opens the ``ac_trie`` file, memory-maps it, and builds four AC
    instances with ``copy=False`` so all four share the same mapped
    buffer instead of each holding a private copy of the trie.  Finally
    prints every match of the module-level ``string_to_search``.
    """
    with open("ac_trie", "r+b") as bf:
        mm = mmap.mmap(bf.fileno(), 0)
        # copy=False: each instance is a view over the mmap'ed bytes,
        # which is exactly what this memory experiment measures.
        ac_first = AC.from_buff(mm, copy=False)
        ac_second = AC.from_buff(mm, copy=False)
        ac_third = AC.from_buff(mm, copy=False)
        ac_four = AC.from_buff(mm, copy=False)
        print("Matches after loading shared buffer:")
        # Renamed loop variable: the original shadowed the builtin `id`.
        for pattern_id, start, end in ac_first.match(string_to_search):
            print(pattern_id, string_to_search[start:end])
def test_buff_ac(self):
    """save() and to_buff() must emit identical bytes, and from_buff()
    must rebuild a working automaton in both copy modes."""
    automaton = AC.build([u"aİ", u"aaİ", u"aai̇", u"aai̇bİ"], True)
    automaton.save("ac.bin")
    with open("ac.bin", "rb") as handle:
        saved_bytes = bytearray(handle.read())
    self.assertEqual(len(saved_bytes), automaton.buff_size())
    serialized = bytearray(automaton.buff_size())
    automaton.to_buff(serialized)
    self.assertEqual(serialized, saved_bytes)
    for copy_flag in (True, False):
        self._check_ac_correct(AC.from_buff(serialized, copy=copy_flag))
def test_ignore_case(self):
    """Case-insensitive build: Turkish dotted/dotless-I variants fold
    together, changing both the automaton size and the match ids."""
    if sys.version_info.major < 3:
        return
    automaton = AC.build([u"aİ", u"aİİ", u"aai̇", u"aai̇bİ"], True)
    hits = [(end, pat_id) for pat_id, _, end in automaton.match(u"aai̇bİa")]
    self.assertEqual(hits, [(4, 2), (4, 0), (6, 3)])
    automaton = AC.build([u"aİ", u"aaİ", u"aai̇", u"aai̇bİ"], True)
    # Two of the four patterns case-fold to the same key.
    self.assertEqual(automaton.size, 3)
    hits = [(end, pat_id) for pat_id, _, end in automaton.match(u"aai̇bİa")]
    self.assertEqual(hits, [(4, 1), (4, 0), (6, 2)])
def test_ignore_case_sep(self):
    """With a separator set, only matches at token boundaries survive."""
    if sys.version_info.major < 3:
        return
    automaton = AC.build([u"aİ", u"aaİ", u"aai̇", u"aai̇bİ"], True)
    separators = {ord(" ")}
    hits = [(end, pat_id)
            for pat_id, _, end in automaton.match(u"aai̇bİ", separators)]
    self.assertEqual(hits, [(6, 2)])
def test_pickle_ac(self):
    """An AC automaton must survive a pickle round-trip intact."""
    automaton = AC.build([u"aİ", u"aaİ", u"aai̇", u"aai̇bİ"], True)
    with open("ac.pkl", "wb") as sink:
        pickle.dump(automaton, sink)
    with open("ac.pkl", "rb") as source:
        automaton = pickle.load(source)
    self.assertEqual(automaton.size, 3)
    hits = [(end, pat_id) for pat_id, _, end in automaton.match(u"aai̇bİa")]
    self.assertEqual(hits, [(4, 1), (4, 0), (6, 2)])
def build(self, pages: Iterable[Vertex]):
    """Index *pages* by cleaned, lower-cased title.

    Pages whose titles clean to the same key are grouped; the automaton
    id assigned to each key is then mapped to that group's page indices.
    """
    pages_by_key = {}
    for vertex in pages:
        title_key = _clean_title(vertex["title"]).lower()
        pages_by_key.setdefault(title_key, []).append(vertex.index)
    self._ac = AC.build(pages_by_key.keys(), ignore_case=True)
    for title_key, pattern_id in self._ac.items():
        self._id2pages.setdefault(pattern_id, pages_by_key[title_key])
def test_save(self):
    """buff_size() must equal the size of the file written by save()."""
    ac_file = "ac_cyac"
    with open(TestAC.words_file, "r", encoding="utf-8") as f:
        patterns = list(f)
    automaton = AC.build(patterns)
    expected_size = automaton.buff_size()
    automaton.save(ac_file)
    on_disk_size = Path(ac_file).stat().st_size
    self.assertEqual(expected_size, on_disk_size)
    os.remove(ac_file)
def get_shared_memory_find_matches(processname, shared_memory_tag, ac_size):
    """Attach to a shared-memory AC automaton and print its matches.

    Parameters:
        processname: label used in the progress output.
        shared_memory_tag: name of the SharedMemory segment to attach to.
        ac_size: size in bytes of the serialized automaton in the segment.
    """
    shm = shared_memory.SharedMemory(shared_memory_tag)
    ac_buffer = shm.buf[0:ac_size]
    ac_in_process = AC.from_buff(ac_buffer, copy=False)
    string_to_search = "asdpythonasdasdruby"
    print("Executing search in {}".format(processname))
    # `pattern_id` instead of the original `id`, which shadowed the builtin.
    for pattern_id, start, end in ac_in_process.match(string_to_search):
        print(pattern_id, string_to_search[start:end])
    # Drop the automaton first so nothing references the buffer, then
    # release the memoryview: it MUST be released before shm.close(),
    # otherwise closing raises because the buffer is still exported.
    ac_in_process = None
    ac_buffer.release()
    shm.close()
from cyac import AC

# Load the sample IOC patterns: a JSON dict of {category: [pattern, ...]}.
# Use a context manager so the file handle is closed (the original leaked it).
with open("../static_ioc_sample_30k.txt", "r") as fh:
    file_words = json.load(fh)

words_to_search = list()
trie_words = list()
total_words_to_search = 1000
total_words_added = 0
t = list()
patterns = dict()
total_initial_words = 0
total_iterations = 10

for x in range(0, total_iterations):
    print("In iteration ", x)
    for key in file_words:
        for value in file_words[key]:
            # Random suffix makes every generated pattern unique per iteration.
            value = value + str(random.randint(10000, 500000))
            # Collect the first `total_words_to_search` patterns as queries.
            if total_words_to_search != total_words_added:
                words_to_search.append(value)
                total_words_added += 1
            if x == 0:
                total_initial_words += 1
            t.append(value)

print(f"Initial words {total_initial_words}")
# len(t) == total_initial_words * total_iterations; the original printed
# an off-by-one "+ 1" here.
print(f"Total patterns on AC trie: {total_initial_words*total_iterations}")
ac = AC.build(t)
input()  # stop program to measure memory
def test_sep(self):
    """Only matches bounded by the separator (or string edges) count."""
    automaton = AC.build([u"a", u"aa", u"A", u"AA"])
    separators = {ord(" ")}
    hits = [(end, pat_id)
            for pat_id, _, end in automaton.match(u"a aaa", separators)]
    self.assertEqual(hits, [(1, 0)])
def test_init(self):
    """Basic overlapping matches on CJK input."""
    automaton = AC.build([u'我', u'我是', u'是中'])
    hits = [(end, pat_id) for pat_id, _, end in automaton.match(u"我是中国人")]
    self.assertEqual(hits, [(1, 0), (2, 1), (3, 2)])
from multiprocessing import Process
from multiprocessing import shared_memory

ac_patterns = ["python", "ruby"]
# Context manager closes the handle (the original `open(...).read()` leaked it).
with open("static_ioc_sample.json", "r") as fh:
    patterns_dict = json.loads(fh.read())
total_patterns = 0
for x in range(0, 1000):
    for key in patterns_dict:
        for value in patterns_dict[key]:
            total_patterns += 1
            # Random suffix keeps every generated pattern unique.
            ac_patterns.append(value + str(random.randint(0, 10000)))
string_to_search = "asdpythonasdasdruby"

# Export the automaton to a file.
ac = AC.build(ac_patterns)
ac.save("ac_trie")
print("Matches before saving:")
# `pattern_id` instead of `id`, which shadowed the builtin.
for pattern_id, start, end in ac.match(string_to_search):
    print(pattern_id, string_to_search[start:end])

# Drop references so the garbage collector can reclaim them, avoiding
# possible copies being inherited by child processes.
string_to_search = None
ac = None
ac_patterns = None

# Load from file with mmap and share memory: copy=False makes the AC a
# view over the mapped bytes.
with open("ac_trie", "r+b") as bf:
    mm = mmap.mmap(bf.fileno(), 0)
    ac_first = AC.from_buff(mm, copy=False)
    ac_size = ac_first.buff_size()
    print("Size of AC automaton:Total patterns: {}bytes:{}patterns".format(ac_size, total_patterns))
def init_ac(words, size):
    """Build an AC automaton from the first *size* entries of *words*."""
    return AC.build(words[:size])
import random
from cyac import AC
from memory_profiler import profile

# Seed patterns; the JSON file below supplies many more.
ac_patterns = ["python", "ruby"]
# NOTE(review): the file handle is never closed -- consider a `with` block.
patterns_dict = json.loads(open("static_ioc_sample.json", "r").read())
total_patterns = 0
for x in range(0, 1000):
    for key in patterns_dict:
        for value in patterns_dict[key]:
            total_patterns += 1
            # Random suffix makes each generated pattern unique.
            ac_patterns.append(value + str(random.randint(0, 10000)))
string_to_search = "asdpythonasdasdruby"
# Export to file
ac = AC.build(ac_patterns)
ac.save("ac_trie")
print("Matches before saving:")
for id, start, end in ac.match(string_to_search):
    print(id, string_to_search[start:end])
# Load from file with mmap and share memory
@profile
def load_and_search():
    # Memory-profiled: all four instances share the same mmap'ed buffer
    # (copy=False), so memory should not grow per instance.
    # NOTE(review): body looks truncated here -- an earlier variant of this
    # function also iterates matches after building the instances; confirm.
    with open("ac_trie", "r+b") as bf:
        mm = mmap.mmap(bf.fileno(), 0)
        ac_first = AC.from_buff(mm, copy=False)  # it shares memory
        ac_second = AC.from_buff(mm, copy=False)  # it shares memory
        ac_third = AC.from_buff(mm, copy=False)  # it shares memory
        ac_four = AC.from_buff(mm, copy=False)  # it shares memory