def loadRevmap(syll_dict):
    # Build reverse maps from syllable sequences back to words.
    singlemap = trie.StringTrie(separator='-')
    multimap = trie.StringTrie(separator='-')
    for key in syll_dict.keys():
        syllarray = syll_dict[key]
        if len(syllarray) > 1:
            # 'M AH-G ER' -> "mugger"
            multimap['-'.join(syllarray)] = key
        else:
            singlemap[syllarray[0]] = key
    return (singlemap, multimap)
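# A minimal usage sketch for loadRevmap (the syll_dict shape is assumed
# from the 'M AH-G ER' comment, and the `trie` name is assumed to be
# `import pygtrie as trie`): the '-' separator makes each syllable a
# separate trie component, so syllable-level prefix queries work.
import pygtrie as trie

syll_dict = {"mugger": ["M AH", "G ER"], "cat": ["K AE T"]}
singlemap, multimap = loadRevmap(syll_dict)
assert multimap["M AH-G ER"] == "mugger"
assert singlemap["K AE T"] == "cat"
assert multimap.has_subtrie("M AH")  # prefix match on the first syllable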
def __init__(self, gramFile, candidateFile, scoreFile):
    self._gramFile = gramFile
    self._candidateFile = candidateFile
    self._scoreFile = scoreFile
    # pre-trie
    self._pretrie = trie.StringTrie(separator=Process.SEP)
    self._pretrieFile = 'PreGramTrie'
    self._precache = {}
    # post-trie
    self._posttrie = trie.StringTrie(separator=Process.SEP)
    self._posttrieFile = 'PostGramTrie'
    self._postcache = {}
    assert os.path.exists(gramFile), "GramFile %s does not exist" % gramFile
    assert os.path.exists(candidateFile), \
        "CandidateFile %s does not exist" % candidateFile
def test_valid_inst(self):
    rop_hunter = ROPHunter(CS_ARCH_X86, CS_MODE_64, self.parallel)
    start_offset = 0x1000
    code = b"\xc7\x07\x00\x00\x00\x0f\x95\x45\xc3"
    rop_hunter.galileo(self.duplicates, self.output, start_offset, code)

    expected_trie = pygtrie.StringTrie()
    expected_trie["c3"] = "ret"
    expected_trie["c3/9545"] = "xchg eax, ebp"
    expected_trie["c3/9545/000f"] = "add byte ptr [rdi], cl"
    expected_trie["c3/9545/000f/0000"] = "add byte ptr [rax], al"
    expected_trie["c3/9545/00000f"] = "add byte ptr [rax], al"
    expected_trie["c3/9545/c7070000000f"] = "mov dword ptr [rdi], 0xf000000"
    expected_trie["c3/00000f9545"] = "add byte ptr [rax], al"
    actual_trie = rop_hunter.get_inst_trie()
    self.assertCountEqual(actual_trie.items(), expected_trie.items())

    # len(code) = 9
    # Expected instruction address: start_offset + len(code) - len(inst)
    expected_inst_addr = dict()
    expected_inst_addr["c3/9545"] = "0x1006"  # e.g. 0x1000 + 9 - 3 = 0x1006
    expected_inst_addr["c3/9545/000f"] = "0x1004"
    expected_inst_addr["c3/9545/000f/0000"] = "0x1002"
    expected_inst_addr["c3/9545/00000f"] = "0x1003"
    expected_inst_addr["c3/9545/c7070000000f"] = "0x1000"
    expected_inst_addr["c3/00000f9545"] = "0x1003"
    actual_inst_addr = rop_hunter.get_inst_addr_dict()
    self.assertDictEqual(actual_inst_addr, expected_inst_addr)
def parse_barcode_file(fp, primer=None, header=False):
    """
    Load label, barcode, primer records from a CSV file.

    Returns a map from barcode -> label.

    Any additional columns are ignored.
    """
    tr = trie.StringTrie()
    reader = csv.reader(fp)
    if header:
        # Skip header
        next(reader)
    # Skip blank rows
    records = (record for record in reader if record)

    for record in records:
        specimen, barcode = record[:2]
        if primer is not None:
            pr = primer
        else:
            pr = record[2]
        for sequence in all_unambiguous(barcode + pr):
            if sequence in tr:
                raise ValueError(
                    "Duplicate sample: {0}, {1} both have {2}".format(
                        specimen, tr[sequence], sequence))
            logging.info('%s->%s', sequence, specimen)
            tr[sequence] = specimen
    return tr
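# Hypothetical usage sketch (the CSV content and fixed primer are made
# up; all_unambiguous() comes from the surrounding module and is assumed
# to expand IUPAC ambiguity codes, yielding the sequence itself when it
# is already unambiguous):
import io

fp = io.StringIO("sample1,ACGT\nsample2,GGTA\n")
barcodes = parse_barcode_file(fp, primer="TT")
assert barcodes["ACGTTT"] == "sample1"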
def __init__(self,
             vocabulary: Iterable[Tuple[Iterable[str], str]],
             ignore_case: Optional[bool] = True):
    """Builds the prefix trie using the provided vocabulary.

    Parameters
    ----------
    vocabulary: Iterable
        Vocabulary to build the tree from. Each list of words in the
        vocabulary is associated with a type label.
    ignore_case: bool, default=True
        Perform case-insensitive matching if True.
    """
    self.ignore_case = ignore_case
    self.trie = pygtrie.StringTrie()
    for words, label in vocabulary:
        for word in words:
            if ignore_case:
                word = word.lower()
            dom_word = self.trie._separator.join(word.split())
            if dom_word in self.trie:
                if self.trie[dom_word][1] != label:
                    warnings.warn(
                        "Duplicate trie entry '{}' with a different label "
                        "'{}' found; keeping the original label '{}' and "
                        "ignoring the duplicate.".format(
                            word, label, self.trie[dom_word][1]))
                continue
            self.trie[dom_word] = (word, label)
def deref_urls(url_mapping: dict,
               url_settings: Tuple[Tuple[str, str], ...] = URL_SETTINGS
               ) -> pygtrie.StringTrie:
    dereferenced_urls = []
    for prefix, url_ref in url_settings:
        dereferenced_urls.append((prefix, url_mapping[url_ref]))
    return pygtrie.StringTrie(dereferenced_urls, separator='.')
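# A small sketch of how the dereferenced trie might be used (the
# mapping and settings below are invented): longest_prefix picks the
# most specific configured prefix of a dotted name.
import pygtrie

mapping = {"api": "https://api.example.com", "docs": "https://docs.example.com"}
settings = (("myapp.api", "api"), ("myapp", "docs"))
urls = deref_urls(mapping, url_settings=settings)
key, url = urls.longest_prefix("myapp.api.v2.users")
assert (key, url) == ("myapp.api", "https://api.example.com")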
def get_dict_trie(dict_file_name,
                  processing_word=None,
                  processing_dict_type=None,
                  trie_separator='.'):
    trie = pygtrie.StringTrie(separator=trie_separator)
    paths = []
    dict_types = set()
    UNK_word_id = processing_word(UNK) if processing_word is not None else -1
    with open(dict_file_name, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                sent, dict_type = line.split('\t')
                if processing_word is not None:
                    word_ids = [
                        processing_word(word) for word in sent.split(' ')
                    ]
                    # Skip entries containing out-of-vocabulary words.
                    if UNK_word_id in word_ids:
                        continue
                    sent = trie_separator.join(
                        [str(word_id) for word_id in word_ids])
                if processing_dict_type is not None:
                    dict_type = processing_dict_type(dict_type)
                trie[sent] = dict_type
                paths.append('{}\t{}'.format(sent, dict_type))
                dict_types.add(dict_type)
    return trie, paths, list(dict_types)
def readVectors():
    trie = pygtrie.StringTrie()
    db = shelve.open("n_shelve_10_cat")
    # Set categories
    tags = [
        "education", "music", "film", "food", "police", "health", "women",
        "children", "technology", "sport"
    ]
    tag_dict = {}
    # Find each tag's vector
    for value in tags:
        tag_dict[value] = np.asarray(db[value], dtype="float64")
    db.sync()
    # Add every word vector to the trie
    for x in db.keys():
        trie[x] = np.asarray(db[x], dtype="float64")
    return trie, tag_dict
def readdata():
    trie = pygtrie.StringTrie()
    db = shelve.open(
        "/home/saidaltindis/Desktop/PROJECTS/MULTI-LABEL CLASSIFICATION/Data/n_shelve_5_cat_reuters"
    )
    # Set categories
    tags = ['earn', 'grain', 'crude', 'trade', 'interest']
    tag_dict = {}
    # Find each tag's vector
    for value in tags:
        tag_dict[value] = np.asarray(db[value], dtype="float64")
    db.sync()
    # Add every word vector to the trie
    for x in db.keys():
        trie[x] = np.asarray(db[x], dtype="float64")
    return trie, tag_dict
def main():
    parser = argparse.ArgumentParser(
        description="Detect heavy hitters from traces")
    parser.add_argument('-f', '--format',
                        dest='file_format',
                        nargs=1,
                        choices=['nfdump', 'flow-tools'],
                        required=True,
                        help='Trace format, i.e. flow-tools or nfdump')
    parser.add_argument(
        'infile',
        nargs='?',
        default=sys.stdin,
        help='File path to read from. If no path is specified then '
             'defaults to stdin')
    args = parser.parse_args()

    trie = t.StringTrie(separator='.')
    if args.file_format[0] == "flow-tools":
        buildFlowToolsTrie(args.infile, trie)
    else:
        # nfdump input is not handled here.
        pass
def __init__(self,
             path_to_binary,
             path_to_cache=None,
             save_every_n=1000,
             terminator="0"):
    self.path = path_to_binary
    self.needs_reset = True
    self.cache = {}
    self.error_cache = pygtrie.StringTrie(separator=" ")
    self.invalid_cache = pygtrie.PrefixSet()
    self.terminator = terminator
    assert terminator not in self.get_alphabet(), \
        f"Terminator {terminator} in alphabet, please choose a different one"

    # Save cache to file every n queries
    self.save_every_n = save_every_n
    self.n_queries = 0

    if path_to_cache is None:
        print("No cache path given, not using cache")
        self.cachepath = None
    else:
        print("Cache dir:", str(Path(path_to_cache).absolute()))
        # Hash the binary to find its cache folder
        with open(self.path, 'rb') as f:
            hash = hashlib.sha256(f.read()).hexdigest()
        # Check if a cache exists for the given binary
        self.cachepath = Path(path_to_cache).joinpath(hash)
        if self.cachepath.is_dir():
            self._load_cache()
        else:
            os.mkdir(self.cachepath)
class StringTrieTestCase(TrieTestCase):
    _TRIE_CTOR = staticmethod(
        lambda *args, **kw: pygtrie.StringTrie(*args, separator='~', **kw))  # pylint: disable=unnecessary-lambda

    _SHORT_KEY = '~home~foo'
    _SHORT_KEY2 = '~home~FOO'
    _LONG_KEY = _SHORT_KEY + '~bar~baz'
    _VERY_LONG_KEY = _LONG_KEY + '~qux'
    _OTHER_KEY = '~hom'
    _SHORT_PREFIXES = ('', '~home')
    _LONG_PREFIXES = ('~home~foo~bar',)

    _PICKLED_PROTO_0 = (
        'Y2NvcHlfcmVnCl9yZWNvbnN0cnVjdG9yCnAwCihjcHlndHJpZQpTdHJpbmdUcmllCnAxCm'
        'NfX2J1aWx0aW5fXwpvYmplY3QKcDIKTnRwMwpScDQKKGRwNQpWX3NlcGFyYXRvcgpwNgpW'
        'LwpwNwpzVl9yb290CnA4CmcwCihjcHlndHJpZQpfTm9kZQpwOQpnMgpOdHAxMApScDExCi'
        'hscDEyCkwxTAphVmZvbwpwMTMKYUw0MkwKYUwtMUwKYUwxTAphVmJhcgpwMTQKYUw0MkwK'
        'YUwtMUwKYUwxTAphVmJhegpwMTUKYUw0MkwKYWJzVl9zb3J0ZWQKcDE2CkkwMApzYi4=')

    _PICKLED_PROTO_3 = (
        'gANjcHlndHJpZQpTdHJpbmdUcmllCnEAKYFxAX1xAihYCgAAAF9zZXBhcmF0b3JxA1gBAA'
        'AAL3EEWAUAAABfcm9vdHEFY3B5Z3RyaWUKX05vZGUKcQYpgXEHXXEIKEsBWAMAAABmb29x'
        'CUsqSv////9LAVgDAAAAYmFycQpLKkr/////SwFYAwAAAGJhenELSyplYlgHAAAAX3Nvcn'
        'RlZHEMiXViLg==')

    @classmethod
    def path_from_key(cls, key):
        return key.split('~')

    @classmethod
    def key_from_path(cls, path):
        return '~'.join(path)

    def test_valid_separator(self):
        t = pygtrie.StringTrie()
        t['foo/bar'] = 42
        self.assertTrue(bool(t.has_node('foo') & pygtrie.Trie.HAS_SUBTRIE))

        t = pygtrie.StringTrie(separator='.')
        t['foo.bar'] = 42
        self.assertTrue(bool(t.has_node('foo') & pygtrie.Trie.HAS_SUBTRIE))

    def test_invalid_separator(self):
        self.assertRaises(TypeError, pygtrie.StringTrie, separator=42)
        self.assertRaises(ValueError, pygtrie.StringTrie, separator='')

    def test_to_string(self):
        self._assertToString(pygtrie.StringTrie(), '(separator=/)',
                             "([], separator='/')")
        self._assertToString(self._TRIE_CTOR(), '(separator=~)',
                             "([], separator='~')")
        self._assertToString(self._TRIE_CTOR({self._SHORT_KEY: 42}),
                             '(~home~foo: 42, separator=~)',
                             "([('~home~foo', 42)], separator='~')")
        self._assertToString(
            self._TRIE_CTOR({
                self._SHORT_KEY: 42,
                self._OTHER_KEY: '42'
            }), '(~hom: 42, ~home~foo: 42, separator=~)',
            "([('~hom', '42'), ('~home~foo', 42)], separator='~')")
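# Quick illustration of the has_node() bit mask exercised by
# test_valid_separator above: the result combines the Trie.HAS_VALUE
# and Trie.HAS_SUBTRIE flags (keys here match the test fixtures).
import pygtrie

t = pygtrie.StringTrie(separator='~')
t['~home~foo'] = 42
assert t.has_node('~home') == pygtrie.Trie.HAS_SUBTRIE
assert t.has_node('~home~foo') == pygtrie.Trie.HAS_VALUE
assert not t.has_node('~nope')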
def eurosense_to_unified(eurosense: IO, unified: IO):
    """
    Do the XML conversion from the Eurosense format to the Unified format.
    Note that this only deals with XML and doesn't convert other things
    like synset ids. For the full conversion pipeline see
    eurosense2unified in `pipeline.py`.
    """
    write_header(unified, "eurosense")
    for sent_id, sent_elem in iter_sentences_eurosense(eurosense):
        unified.write('<sentence id="{}">\n'.format(sent_id))
        trie = pygtrie.StringTrie(separator=" ")
        anns = sent_elem.xpath(".//annotation")
        for ann in anns:
            trie[ann.attrib["anchor"]] = (ann.text, ann.attrib["lemma"])
        sent = sent_elem.xpath("text")[0].text
        cursor = 0
        while cursor < len(sent):
            match_anchor, match_val = trie.longest_prefix(sent[cursor:])
            if match_anchor:
                sense_key, lemma = match_val
                pos = WN_UNI_POS_MAP[sense_key[-1]]
                unified.write(
                    '<instance lemma="{}" pos="{}" key="{}">{}</instance>\n'
                    .format(lemma, pos, sense_key, match_anchor))
                cursor += len(match_anchor) + 1
            else:
                end_pos = sent.find(" ", cursor)
                if end_pos == -1:
                    break
                unified.write("<wf>{}</wf>\n".format(
                    escape(sent[cursor:end_pos])))
                cursor = end_pos + 1
        unified.write("</sentence>\n")
    unified.write("</text>\n")
    unified.write("</corpus>\n")
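# The matching loop above leans on StringTrie.longest_prefix with a
# space separator: annotations anchored on multi-word spans win over
# shorter ones. A toy illustration of that behavior (invented keys):
import pygtrie

t = pygtrie.StringTrie(separator=" ")
t["New York"] = "city"
t["New York Times"] = "newspaper"
key, value = t.longest_prefix("New York Times building")
assert (key, value) == ("New York Times", "newspaper")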
def __init__(self, vocab_path="sowpods.txt"):
    # Construct the prefix tree for all possible word/card permutations
    with open(vocab_path, "r") as text_file:
        lines = text_file.readlines()
    self.perms = trie.StringTrie(separator='/')
    for w in lines[6:]:  # skip the file's header lines
        self._make_perm(w.strip(), [], 0, self.perms)
    print(f'Using {len(self.perms)} possible word permutations\n')
def create_public_suffix_trie():
    pub_suf_trie = trie.StringTrie()
    data = fetch_public_suffix_data()
    if len(data) > 0:
        for ps in data:
            # Skip blank lines and "//" comment lines from the suffix list
            if ps != "" and not ps.startswith("//"):
                pub_suf_trie[ps] = True
    return pub_suf_trie
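# Worth noting: StringTrie's default separator is '/', so each suffix
# (e.g. "co.uk") is stored as a single atomic component here. Exact
# membership tests work, but dotted-prefix queries would need
# separator='.'. A small sketch with invented rows:
import pygtrie as trie

pub = trie.StringTrie()
for ps in ["com", "uk", "co.uk"]:
    pub[ps] = True
assert "co.uk" in pub
assert "example.co.uk" not in pub  # '.' is not the separator here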
def build_lang_dict(language):
    import pygtrie
    lang_dict = pygtrie.StringTrie()
    with open(os.path.join(LANGUAGES_PREFIX, language)) as lang_file:
        for dict_entry in lang_file:
            dict_entry = dict_entry.strip()  # drop the trailing newline
            if len(dict_entry) >= IGNORE_LENGTH_BELOW:
                # Wastes a bit of memory; could be optimized with a
                # different trie implementation.
                lang_dict[dict_entry] = dict_entry
    return lang_dict
def test_string():
    print('StringTrie Test')
    print('---------------')
    import pygtrie
    trie = pygtrie.StringTrie()
    trie.enable_sorting()
    for word in data.split():
        trie[word.lower()] = word
    print('K : ', ', '.join(trie.keys()))
    print('V : ', ', '.join(trie.values()))
def __init__(self):
    self.word_offset = pygtrie.StringTrie(separator=' ')
    print("Loading Thesaurus...")
    with open(IDX_PATH, 'r') as idx_file:
        for line in idx_file:
            if '|' in line:
                word, offset = line.split('|')
                self.word_offset[word] = int(offset)
    print("Done")
    self.cache = {}
def __init__(self):
    self.resources = pygtrie.StringTrie(separator='_')
    print("Loading DBPedia resources...")
    with open(dbpedia_resources, 'r') as resources_file:
        for line in resources_file:
            resource_name, pagerank = line.split()
            if resource_name.startswith('Category:'):
                continue
            pagerank = float(pagerank)
            lowered_name = resource_name.lower().replace('-', '_')
            self.resources[lowered_name] = (pagerank, resource_name)
    print("Done")
def build_trie_from_wordlists():
    # Create Wordlists object
    wl = Wordlists()
    # Build the trie that will contain all of the words in each wordlist
    big_trie = pygtrie.StringTrie()
    for wordlist in wl.wordlists:
        with open(wordlist) as list_file:
            words_as_list = list_file.read().splitlines()
        for word in words_as_list:
            big_trie[word] = wordlist
    return big_trie
def buildTrieSingle(file, pickle_dir):
    """Build a trie and dump it using pickle."""
    trie_file = getTrieFile(os.path.basename(file), pickle_dir)
    trie = pygtrie.StringTrie(separator=SEP)
    with open(file) as fd:
        for line in fd:
            key = line.strip()
            trie.setdefault(key, 0)
            trie[key] += 1
    # Binary mode is required for pickle's binary protocols.
    with open(trie_file, 'wb') as fd:
        pickle.dump(trie, fd, protocol=2)
def get_count_dict(string):
    start = datetime.datetime.now()
    print("start create count_dict ...".upper())
    c_dict = trie.StringTrie()
    for c in string:
        if c in c_dict:
            c_dict[c] += 1
        else:
            c_dict[c] = 1
    end = datetime.datetime.now()
    print(("create count_dict done, costs time: %s" % str(end - start)).upper())
    return c_dict
def __build_trie(self, key):
    trie = pygtrie.StringTrie(separator='.')
    for item in it.chain.from_iterable(c[key] for c in self.config):
        if isinstance(item, list):
            # Entries may be given as [prefix, value] pairs...
            prefix, value = item
        else:
            # ...or as mappings, where the value lives under the key
            # that precedes 'prefix' in insertion order.
            keys = list(item.keys())
            prefix_key = 'prefix'
            value_key = keys[keys.index(prefix_key) - 1]
            prefix = item[prefix_key]
            value = item[value_key]
        trie[prefix] = value
    return trie
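# A dotted-prefix trie like the one built above pairs naturally with
# longest_prefix lookups, where the most specific configured prefix
# wins. A standalone sketch of that pattern (config values invented,
# not necessarily how the surrounding class consumes the trie):
import pygtrie

t = pygtrie.StringTrie(separator='.')
t['logging'] = 'WARN'
t['logging.db'] = 'DEBUG'
assert t.longest_prefix('logging.db.pool').value == 'DEBUG'
assert t.longest_prefix('logging.http').value == 'WARN'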
def get_df_trie(tries):
    """
    Return a trie where each prefix's value is its document frequency
    among the given tries.
    """
    df_trie = pygtrie.StringTrie(separator=" ")
    for _, trie in progress.bar(tries, expected_size=len(tries)):
        for prefix in trie.keys():
            if prefix not in df_trie:
                df_trie[prefix] = 0
            df_trie[prefix] += 1
    for prefix in df_trie.keys():
        df_trie[prefix] = (df_trie[prefix] - 1) / len(tries)
    return df_trie
def __init__(self, rule_config_path):
    self.pre_trie = pygtrie.StringTrie(separator=u" ")
    self.post_trie = pygtrie.StringTrie(separator=u" ")
    with codecs.open(rule_config_path, 'r', 'utf-8') as rule_config_file:
        for line in rule_config_file:
            rule_definition_list = line.strip(u'\n').split(u'\t')
            rule_id = rule_definition_list[0]
            if rule_definition_list[1] and rule_definition_list[2]:
                pre_replacing = rule_definition_list[2]
                if is_entity_tag(pre_replacing):
                    self.pre_trie[rule_definition_list[1]] = \
                        entity_tag_with_id(pre_replacing, rule_id)
                else:
                    self.pre_trie[rule_definition_list[1]] = pre_replacing
            if rule_definition_list[3] and rule_definition_list[4]:
                post_to_be_replaced = rule_definition_list[3]
                if is_entity_tag(post_to_be_replaced):
                    self.post_trie[entity_tag_with_id(post_to_be_replaced, rule_id)] = \
                        rule_definition_list[4]
                else:
                    self.post_trie[rule_definition_list[3]] = rule_definition_list[4]
def __init__(self, path_to_binary):
    self.path = path_to_binary
    self.needs_reset = True

    # These are only used if an external RERS cache is not hooked up
    self.separator = " "
    self.cache = {}
    self.error_cache = pygtrie.StringTrie(separator=self.separator)
    self.invalid_cache = pygtrie.PrefixSet()

    # Set up the external process and communication
    self.proc = Popen(path_to_binary,
                      bufsize=0,
                      stdout=PIPE,
                      stdin=PIPE,
                      stderr=STDOUT)
    self.q = Queue()
    self.t = Thread(target=self._enqueue, args=(self.proc.stdout, self.q))
    self.t.daemon = True
    self.t.start()
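# Both caches rely on prefix semantics: error_cache keys are
# space-separated input traces, and PrefixSet treats any extension of
# an added sequence as a member. A small sketch of the PrefixSet side
# (the trace values are invented):
import pygtrie

invalid = pygtrie.PrefixSet()
invalid.add("1 2 3".split())  # once a trace is invalid, so is any extension
assert "1 2 3 4".split() in invalid
assert "1 2".split() not in invalid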
def __init__(self,
             options: EzLinksOptions,
             root: str,
             files: List[mkdocs.structure.pages.Page],
             logger=None):
    self.options = options
    self.root = root
    self.file_cache = {}
    self.file_trie = pygtrie.StringTrie(separator=os.sep)
    self.logger = logger
    # Drop any files outside of the root of the docs dir
    self.files = [file for file in files if root in file.abs_src_path]
    for file in self.files:
        self._store_file(file.src_path)
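# With os.sep as the separator, the trie keys mirror the docs tree, so
# everything under a directory can be fetched with one prefix query.
# A minimal sketch of that idea (paths and values invented):
import os
import pygtrie

t = pygtrie.StringTrie(separator=os.sep)
t[os.sep.join(["guides", "install.md"])] = "install"
t[os.sep.join(["guides", "usage.md"])] = "usage"
t["index.md"] = "index"
assert sorted(t.values(prefix="guides")) == ["install", "usage"]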
def print_trie(self, by="sum", sep=None, precision=3):
    sep = sep or self.sep
    import pygtrie
    t = pygtrie.StringTrie(separator=sep)
    stats = self.stats().T[[by]]
    for key, row in stats.sort_index().iterrows():
        t[key] = row[by]

    def is_child(key):
        return not t.has_subtrie(key)

    def depth_print(s, depth=0):
        print("\t" * depth + s)

    def print_trie(path_conv, path, children, value=None):
        path = path_conv(path)
        depth = path.count(sep)
        if t.has_key(path):
            value_repr = value if not precision else round(value, precision)
            path_repr = path.split("/")[-1]
            # Leaves and internal nodes currently print the same way.
            depth_print("{}: {}".format(path_repr, value_repr), depth)
        # Consume the children generator so the traversal descends.
        for child in children:
            pass
        return value

    def add_other(path_conv, path, children, value=None):
        path = path_conv(path)
        if value and not is_child(path):
            # An internal node with its own value gets an explicit
            # "other" child holding the unaccounted-for remainder.
            other_key = "/".join([path, "other"])
            if not t.has_key(other_key):
                t[other_key] = value - sum(child for child in children)
        else:
            for child in children:
                pass
        return value

    if by == "sum":
        t.traverse(add_other)
    t.traverse(print_trie)
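# pygtrie's traverse() drives the callbacks above: node_factory
# receives a path converter, the node's path, a generator of
# already-converted children, and the node's value (when it has one).
# The children generator must be consumed for the traversal to descend,
# which is what the bare `for child in children: pass` loops do. A
# self-contained sketch (toy keys, not the stats data used above):
import pygtrie

t = pygtrie.StringTrie(separator='/')
t['a/b'] = 1
t['a/c'] = 2

def total(path_conv, path, children, value=0):
    # Sum this node's value with all of its children's totals;
    # sum() consumes the generator and thereby recurses.
    return value + sum(children)

assert t.traverse(total) == 3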