Example #1
0
    def test_valid_separator(self):
        """'foo' must report a subtrie for a default trie and a '.'-separated one."""
        subtrie_flag = pygtrie.Trie.HAS_SUBTRIE

        default_trie = pygtrie.StringTrie()
        default_trie['foo/bar'] = 42
        node_state = default_trie.has_node('foo')
        self.assertTrue(bool(node_state & subtrie_flag))

        dotted_trie = pygtrie.StringTrie(separator='.')
        dotted_trie['foo.bar'] = 42
        node_state = dotted_trie.has_node('foo')
        self.assertTrue(bool(node_state & subtrie_flag))
Example #2
0
def loadRevmap(syll_dict):
  """Build reverse lookup tries from syllable sequences back to their keys.

  Args:
    syll_dict: mapping of key -> sequence of syllable strings.

  Returns:
    A ``(singlemap, multimap)`` tuple of '-'-separated StringTries.
    ``singlemap`` maps the only syllable of one-syllable entries to the key;
    ``multimap`` maps the '-'-joined syllables of longer entries to the key
    (e.g. 'M AH-G ER' -> "mugger").

  Raises:
    IndexError: if an entry has an empty syllable sequence.
  """
  singlemap = trie.StringTrie(separator='-')
  multimap = trie.StringTrie(separator='-')
  for key, syllarray in syll_dict.items():
    if len(syllarray) > 1:
      # 'M AH-G ER' -> "mugger"
      multimap['-'.join(syllarray)] = key
    else:
      singlemap[syllarray[0]] = key
  return (singlemap, multimap)
 def __init__(self, gramFile, candidateFile, scoreFile):
     """Store the input file paths and create empty pre/post gram tries.

     The gram file and candidate file must already exist; this is checked
     with assertions after the attributes have been set up.
     """
     self._gramFile = gramFile
     self._candidateFile = candidateFile
     self._scoreFile = scoreFile

     # pre_trie
     self._pretrie = trie.StringTrie(separator=Process.SEP)
     self._pretrieFile = 'PreGramTrie'
     self._precache = {}

     # post_trie
     self._posttrie = trie.StringTrie(separator=Process.SEP)
     self._posttrieFile = 'PostGramTrie'
     self._postcache = {}

     gram_exists = os.path.exists(gramFile)
     assert gram_exists, "GramFile %s not exists" % gramFile
     candidate_exists = os.path.exists(candidateFile)
     assert candidate_exists, "CandidateFile %s not exists" % candidateFile
Example #4
0
    def test_valid_inst(self):
        """Run galileo over a short x86-64 byte string and check trie and addresses."""
        start_offset = 0x1000
        code = b"\xc7\x07\x00\x00\x00\x0f\x95\x45\xc3"

        rop_hunter = ROPHunter(CS_ARCH_X86, CS_MODE_64, self.parallel)
        rop_hunter.galileo(self.duplicates, self.output, start_offset, code)

        # Trie key -> disassembled instruction expected at that key.
        expected_instructions = (
            ("c3", "ret"),
            ("c3/9545", "xchg eax, ebp"),
            ("c3/9545/000f", "add byte ptr [rdi], cl"),
            ("c3/9545/000f/0000", "add byte ptr [rax], al"),
            ("c3/9545/00000f", "add byte ptr [rax], al"),
            ("c3/9545/c7070000000f", "mov dword ptr [rdi], 0xf000000"),
            ("c3/00000f9545", "add byte ptr [rax], al"),
        )
        expected_trie = pygtrie.StringTrie()
        for trie_key, instruction in expected_instructions:
            expected_trie[trie_key] = instruction

        actual_trie = rop_hunter.get_inst_trie()
        self.assertCountEqual(actual_trie.items(), expected_trie.items())

        # len(code) = 9
        # Expected instruction address: start_offset + len(code) - len(inst)
        expected_inst_addr = {
            "c3/9545": "0x1006",  # e.g. 0x1000 + 9 - 3 = 0x1006
            "c3/9545/000f": "0x1004",
            "c3/9545/000f/0000": "0x1002",
            "c3/9545/00000f": "0x1003",
            "c3/9545/c7070000000f": "0x1000",
            "c3/00000f9545": "0x1003",
        }

        actual_inst_addr = rop_hunter.get_inst_addr_dict()
        self.assertDictEqual(actual_inst_addr, expected_inst_addr)
Example #5
0
def parse_barcode_file(fp, primer=None, header=False):
    """
    Load label, barcode, primer records from a CSV file.

    Each non-blank row supplies ``specimen, barcode[, primer]``; any
    additional columns are ignored. Every sequence produced by
    ``all_unambiguous(barcode + primer)`` is mapped to the specimen label.

    Parameters:
        fp: iterable of CSV lines (e.g. an open text file).
        primer: primer to append to every barcode; when ``None`` the primer
            is read from the third column of each row.
        header: when true, the first row is skipped.

    Returns a trie mapping sequence -> specimen label.

    Raises ValueError if two rows expand to the same sequence.
    """
    tr = trie.StringTrie()
    reader = csv.reader(fp)

    if header:
        # Skip header; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)

    for record in reader:
        # Skip blank rows
        if not record:
            continue
        specimen, barcode = record[:2]
        pr = primer if primer is not None else record[2]
        for sequence in all_unambiguous(barcode + pr):
            if sequence in tr:
                # The message must actually be formatted; passing the values
                # as extra exception arguments leaves the placeholders raw.
                raise ValueError(
                    "Duplicate sample: {0}, {1} both have {2}".format(
                        specimen, tr[sequence], sequence))
            logging.info('%s->%s', sequence, specimen)
            tr[sequence] = specimen

    return tr
Example #6
0
    def __init__(self,
                 vocabulary: Iterable[Tuple[Iterable[str], str]],
                 ignore_case: Optional[bool] = True):
        """Populate a prefix trie from labelled word lists.

        Parameters
        ----------
        vocabulary: Iterable
            Pairs of (words, label); every word in ``words`` is stored
            under that type label.
        ignore_case: bool, default=True
            Lower-case each word before storing it when True.

        The first label stored for a key wins: a later entry with the same
        key is skipped, and a warning is issued if its label differs.
        """
        self.ignore_case = ignore_case
        self.trie = pygtrie.StringTrie()
        separator = self.trie._separator
        for words, label in vocabulary:
            for raw_word in words:
                word = raw_word.lower() if ignore_case else raw_word
                # Whitespace-separated tokens become the trie path components.
                trie_key = separator.join(word.split())
                if trie_key not in self.trie:
                    self.trie[trie_key] = (word, label)
                    continue
                stored_label = self.trie[trie_key][1]
                if stored_label != label:
                    warnings.warn(
                        "duplicate pytrie entry '{}' with different label: '{}' found. Original label: {} is immutable, Ignoring duplicate."
                        .format(word, label, stored_label))
Example #7
0
def deref_urls(url_mapping: dict,
               url_settings: Tuple[Tuple[str, str],...]=URL_SETTINGS
               ) -> pygtrie.StringTrie:
    """Resolve each (prefix, url_ref) setting through ``url_mapping``.

    Returns a '.'-separated StringTrie mapping every prefix to
    ``url_mapping[url_ref]``. A reference missing from ``url_mapping``
    raises KeyError.
    """
    resolved_pairs = [(prefix, url_mapping[url_ref])
                      for prefix, url_ref in url_settings]
    return pygtrie.StringTrie(resolved_pairs, separator='.')
Example #8
0
def get_dict_trie(dict_file_name,
                  processing_word=None,
                  processing_dict_type=None,
                  trie_separator='.'):
    """Load a tab-separated dictionary file into a StringTrie.

    Each non-empty line holds ``sentence<TAB>dict_type``. When
    ``processing_word`` is given, the sentence is split on single spaces,
    each word is converted with it, and the converted ids are joined with
    ``trie_separator``; lines containing the id of ``UNK`` are skipped.
    When ``processing_dict_type`` is given it is applied to the type.

    Returns ``(trie, paths, dict_types)``: the trie of sentence -> type, the
    stored entries formatted as 'sentence<TAB>type', and the distinct types
    as a list.
    """
    trie = pygtrie.StringTrie(separator=trie_separator)
    paths = []
    dict_types = set()
    convert_words = processing_word is not None
    UNK_word_id = processing_word(UNK) if convert_words else -1
    with open(dict_file_name, encoding='utf-8') as f:
        for raw_line in f:
            entry = raw_line.strip()
            if not entry:
                continue
            sent, dict_type = entry.split('\t')
            if convert_words:
                word_ids = [processing_word(word) for word in sent.split(' ')]
                if UNK_word_id in word_ids:
                    continue
                sent = trie_separator.join(
                    [str(word_id) for word_id in word_ids])
            if processing_dict_type is not None:
                dict_type = processing_dict_type(dict_type)
            trie[sent] = dict_type
            paths.append('{}\t{}'.format(sent, dict_type))
            dict_types.add(dict_type)
    return trie, paths, list(dict_types)
Example #9
0
def readVectors():
    """Load vectors from the "n_shelve_10_cat" shelf.

    Returns:
        (trie, tag_dict): ``trie`` maps every key in the shelf to its value
        as a float64 array; ``tag_dict`` maps each of the ten category tags
        to its float64 array.

    Raises:
        KeyError: if a category tag is missing from the shelf.
    """
    trie = pygtrie.StringTrie()
    db = shelve.open("n_shelve_10_cat")
    # try/finally so the shelf is closed even when a lookup fails.
    try:
        #Set categories
        tags = [
            "education", "music", "film", "food", "police", "health", "women",
            "children", "technology", "sport"
        ]

        tag_dict = {}
        print(1)

        #Find all tags' vector
        for value in tags:
            tag_dict[value] = np.asarray(db[value], dtype="float64")
            db.sync()
        print(2)
        tm = db.keys()
        print(3)
        #Add all words vector to trie
        for x in tm:
            trie[x] = np.asarray(db[x], dtype="float64")
        print(4)
    finally:
        db.close()
    return trie, tag_dict
Example #10
0
def readdata():
    """Load vectors from the hard-coded Reuters five-category shelf.

    Returns:
        (trie, tag_dict): ``trie`` maps every key in the shelf to its value
        as a float64 array; ``tag_dict`` maps each of the five category tags
        to its float64 array.

    Raises:
        KeyError: if a category tag is missing from the shelf.
    """
    trie = pygtrie.StringTrie()
    db = shelve.open(
        "/home/saidaltindis/Desktop/PROJECTS/MULTI-LABEL CLASSIFICATION/Data/n_shelve_5_cat_reuters"
    )
    # try/finally so the shelf is closed even when a lookup fails.
    try:
        #Set categories
        tags = ['earn', 'grain', 'crude', 'trade', 'interest']

        tag_dict = {}
        print(1)

        #Find all tags' vector
        for value in tags:
            tag_dict[value] = np.asarray(db[value], dtype="float64")
            db.sync()
        print(2)
        tm = db.keys()
        print(3)
        #Add all words vector to trie
        for x in tm:
            trie[x] = np.asarray(db[x], dtype="float64")
        print(4)
    finally:
        db.close()
    return trie, tag_dict
Example #11
0
def main():
    """Parse the command line and build a '.'-separated trie from a trace.

    Only the "flow-tools" format triggers any work; for "nfdump" nothing is
    built.
    """
    parser = argparse.ArgumentParser(
        description="Detect heavy hitters from traces")
    format_options = dict(
        dest='file_format',
        nargs=1,
        default='None',
        choices=['nfdump', 'flow-tools'],
        required=True,
        help='Trace format i.e. flow-tools or nfdump',
    )
    parser.add_argument('-f', '--format', **format_options)
    parser.add_argument(
        'infile',
        nargs='?',
        default=sys.stdin,
        help='File path to read from. If no path is specified then defaults to stdin',
    )
    args = parser.parse_args()

    trie = t.StringTrie(separator='.')
    # nargs=1 makes file_format a one-element list.
    selected_format = args.file_format[0]
    if selected_format == "flow-tools":
        buildFlowToolsTrie(args.infile, trie)
Example #12
0
    def __init__(self,
                 path_to_binary,
                 path_to_cache=None,
                 save_every_n=1000,
                 terminator="0"):
        """Set up in-memory caches and, optionally, an on-disk cache directory.

        When ``path_to_cache`` is given, the cache directory is
        ``path_to_cache/<sha256 of the binary>``; it is loaded if it already
        exists and created otherwise.
        """
        self.path = path_to_binary
        self.needs_reset = True
        self.cache = {}
        self.error_cache = pygtrie.StringTrie(separator=" ")
        self.invalid_cache = pygtrie.PrefixSet()

        self.terminator = terminator
        assert terminator not in self.get_alphabet(
        ), f"Terminator {terminator} in alphabet, please choose a different one"

        # Save cache to file every n queries
        self.save_every_n = save_every_n
        self.n_queries = 0

        if path_to_cache is None:
            print("No cache path given, not using cache")
            self.cachepath = None
            return

        cache_root = Path(path_to_cache)
        print("Cache dir:", str(cache_root.absolute()))

        # Hash the binary to find its cache folder
        with open(self.path, 'rb') as binary_file:
            digest = hashlib.sha256(binary_file.read()).hexdigest()

        # Check if cache exists for the given binary
        self.cachepath = cache_root.joinpath(digest)
        if self.cachepath.is_dir():
            self._load_cache()
        else:
            os.mkdir(self.cachepath)
Example #13
0
class StringTrieTestCase(TrieTestCase):
    """TrieTestCase specialisation that exercises pygtrie.StringTrie.

    The class attributes below parameterise the inherited tests with a
    '~'-separated StringTrie and string keys.
    NOTE(review): how each attribute is consumed is defined by TrieTestCase,
    which is not visible here -- confirm against the base class.
    """

    # Factory used to build the trie under test; always passes separator='~'.
    _TRIE_CTOR = staticmethod(
        lambda *args, **kw: pygtrie.StringTrie(*args, separator='~', **kw))  # pylint: disable=unnecessary-lambda

    # Sample keys and prefixes, expressed with the '~' separator.
    _SHORT_KEY = '~home~foo'
    _SHORT_KEY2 = '~home~FOO'
    _LONG_KEY = _SHORT_KEY + '~bar~baz'
    _VERY_LONG_KEY = _LONG_KEY + '~qux'
    _OTHER_KEY = '~hom'
    _SHORT_PREFIXES = ('', '~home')
    _LONG_PREFIXES = ('~home~foo~bar', )

    # Base64 text; presumably a pickle-protocol-0 StringTrie fixture -- TODO confirm.
    _PICKLED_PROTO_0 = (
        'Y2NvcHlfcmVnCl9yZWNvbnN0cnVjdG9yCnAwCihjcHlndHJpZQpTdHJpbmdUcmllCnAxCm'
        'NfX2J1aWx0aW5fXwpvYmplY3QKcDIKTnRwMwpScDQKKGRwNQpWX3NlcGFyYXRvcgpwNgpW'
        'LwpwNwpzVl9yb290CnA4CmcwCihjcHlndHJpZQpfTm9kZQpwOQpnMgpOdHAxMApScDExCi'
        'hscDEyCkwxTAphVmZvbwpwMTMKYUw0MkwKYUwtMUwKYUwxTAphVmJhcgpwMTQKYUw0MkwK'
        'YUwtMUwKYUwxTAphVmJhegpwMTUKYUw0MkwKYWJzVl9zb3J0ZWQKcDE2CkkwMApzYi4=')

    # Base64 text; presumably a pickle-protocol-3 StringTrie fixture -- TODO confirm.
    _PICKLED_PROTO_3 = (
        'gANjcHlndHJpZQpTdHJpbmdUcmllCnEAKYFxAX1xAihYCgAAAF9zZXBhcmF0b3JxA1gBAA'
        'AAL3EEWAUAAABfcm9vdHEFY3B5Z3RyaWUKX05vZGUKcQYpgXEHXXEIKEsBWAMAAABmb29x'
        'CUsqSv////9LAVgDAAAAYmFycQpLKkr/////SwFYAwAAAGJhenELSyplYlgHAAAAX3Nvcn'
        'RlZHEMiXViLg==')

    @classmethod
    def path_from_key(cls, key):
        """Split a '~'-separated key into its path components."""
        return key.split('~')

    @classmethod
    def key_from_path(cls, path):
        """Join path components into a '~'-separated key."""
        return '~'.join(path)

    def test_valid_separator(self):
        """'foo' must report a subtrie for a default trie and a '.'-separated one."""
        t = pygtrie.StringTrie()
        t['foo/bar'] = 42
        self.assertTrue(bool(t.has_node('foo') & pygtrie.Trie.HAS_SUBTRIE))

        t = pygtrie.StringTrie(separator='.')
        t['foo.bar'] = 42
        self.assertTrue(bool(t.has_node('foo') & pygtrie.Trie.HAS_SUBTRIE))

    def test_invalid_separator(self):
        """A non-string separator raises TypeError; an empty one raises ValueError."""
        self.assertRaises(TypeError, pygtrie.StringTrie, separator=42)
        self.assertRaises(ValueError, pygtrie.StringTrie, separator='')

    def test_to_string(self):
        """Check the two expected string forms for empty and populated tries."""
        self._assertToString(pygtrie.StringTrie(), '(separator=/)',
                             "([], separator='/')")
        self._assertToString(self._TRIE_CTOR(), '(separator=~)',
                             "([], separator='~')")
        self._assertToString(self._TRIE_CTOR({self._SHORT_KEY: 42}),
                             '(~home~foo: 42, separator=~)',
                             "([('~home~foo', 42)], separator='~')")
        self._assertToString(
            self._TRIE_CTOR({
                self._SHORT_KEY: 42,
                self._OTHER_KEY: '42'
            }), '(~hom: 42, ~home~foo: 42, separator=~)',
            "([('~hom', '42'), ('~home~foo', 42)], separator='~')")
Example #14
0
def eurosense_to_unified(eurosense: IO, unified: IO):
    """
    Do the XML conversion from the Eurosense format to the Unified format. Note
    that this only deals with XML and doesn't convert other things like synset
    ids. For the full conversion pipeline see eurosense2unified in
    `pipeline.py`.

    For every sentence, the annotation anchors are loaded into a
    space-separated trie. The sentence text is then scanned left to right:
    the longest anchor starting at the cursor is written as an
    ``<instance>``, otherwise the next space-delimited token is written as a
    ``<wf>``.

    Parameters:
        eurosense: input stream handed to ``iter_sentences_eurosense``.
        unified: output stream the Unified XML is written to.
    """
    write_header(unified, "eurosense")
    for sent_id, sent_elem in iter_sentences_eurosense(eurosense):
        unified.write('<sentence id="{}">\n'.format(sent_id))
        trie = pygtrie.StringTrie(separator=" ")
        anns = sent_elem.xpath(".//annotation")
        for ann in anns:
            trie[ann.attrib["anchor"]] = (ann.text, ann.attrib["lemma"])
        sent = sent_elem.xpath("text")[0].text
        cursor = 0
        while cursor < len(sent):
            match_anchor, match_val = trie.longest_prefix(sent[cursor:])
            if match_anchor:
                sense_key, lemma = match_val
                # The last character of the sense key selects the POS tag.
                pos = WN_UNI_POS_MAP[sense_key[-1]]
                # NOTE(review): lemma and anchor are written unescaped while
                # <wf> text is escaped -- confirm whether that is intended.
                unified.write(
                    '<instance lemma="{}" pos="{}" key="{}">{}</instance>\n'.
                    format(lemma, pos, sense_key, match_anchor))
                # Skip the anchor and the space that follows it.
                cursor += len(match_anchor) + 1
            else:
                end_pos = sent.find(" ", cursor)
                if end_pos == -1:
                    # No further space: this is the final token. Emit it
                    # rather than dropping it; the cursor then moves past the
                    # end of the sentence and the loop stops.
                    end_pos = len(sent)
                unified.write("<wf>{}</wf>\n".format(
                    escape(sent[cursor:end_pos])))
                cursor = end_pos + 1
        unified.write("</sentence>\n")
    unified.write("</text>\n")
    unified.write("</corpus>\n")
Example #15
0
 def __init__(self, vocab_path="sowpods.txt"):
     """Build the prefix tree of word/card permutations from a vocabulary file.

     The first six lines of the file are skipped; every remaining line is
     stripped and passed to ``_make_perm`` together with the shared trie.
     """
     with open(vocab_path, "r") as vocab_file:
         vocab_lines = vocab_file.readlines()

     self.perms = trie.StringTrie(separator='/')
     for entry in vocab_lines[6:]:
         word = entry.strip()
         self._make_perm(word, [], 0, self.perms)

     permutation_count = len(self.perms)
     print(f'Using {permutation_count} possible word permutations\n')
Example #16
0
def create_public_suffix_trie():
    """Return a StringTrie whose keys are the fetched public suffix entries.

    Empty entries and entries starting with "//" are left out; every stored
    suffix maps to True.
    """
    pub_suf_trie = trie.StringTrie()
    data = fetch_public_suffix_data()
    if len(data) == 0:
        return pub_suf_trie
    for ps in data:
        if ps == "" or ps.startswith("//"):
            continue
        pub_suf_trie[ps] = True
    return pub_suf_trie
def build_lang_dict(language):
    """Fill a StringTrie with the entries of a language file.

    The file is opened at ``os.path.join(LANGUAGES_PREFIX, language)`` and
    read line by line; each line is stored as both key and value.

    NOTE(review): no return statement is visible in this snippet -- the
    definition may be truncated here; confirm against the full source.
    """
    import pygtrie
    lang_dict = pygtrie.StringTrie()
    with open(os.path.join(LANGUAGES_PREFIX, language)) as lang_file:
        for dict_entry in lang_file:
            # Lines are not stripped, so the length compared here includes
            # any trailing newline, and so does the stored key.
            if len(dict_entry) >= IGNORE_LENGTH_BELOW:
                lang_dict[
                    dict_entry] = dict_entry  #Wastes a bit of memory, could be optimized with different Trie implementation
Example #18
0
def test_string():
    """Print the keys and values of a sorted StringTrie built from ``data``.

    Each whitespace-separated word of the module-level ``data`` is stored
    under its lower-cased form.
    """
    import pygtrie

    print('StringTrie Test')
    print('---------------')
    word_trie = pygtrie.StringTrie()
    word_trie.enable_sorting()
    for token in data.split():
        word_trie[token.lower()] = token
    joined_keys = ', '.join(word_trie.keys())
    print('K : ', joined_keys)
    joined_values = ', '.join(word_trie.values())
    print('V : ', joined_values)
Example #19
0
 def __init__(self):
     """Load the thesaurus index into a space-separated trie of word -> offset.

     Every line of ``IDX_PATH`` containing '|' is split into a word and an
     integer offset; other lines are ignored.

     Raises:
         ValueError: if a line has more than one '|' or a non-integer offset.
     """
     self.word_offset = pygtrie.StringTrie(separator=' ')
     # Parenthesised single-argument print behaves the same on Python 2 and 3.
     print("Loading Thesaurus...")
     # The context manager closes the index file, which the bare open() in a
     # for-loop never did.
     with open(IDX_PATH, 'r') as index_file:
         for line in index_file:
             if '|' in line:
                 word, offset = line.split('|')
                 self.word_offset[word] = int(offset)
     print("Done")
     self.cache = {}
Example #20
0
 def __init__(self):
     """Load DBPedia resources into a '_'-separated trie.

     Each line of ``dbpedia_resources`` holds a resource name and a pagerank
     separated by whitespace. Names starting with 'Category:' are skipped.
     The key is the lower-cased name with '-' replaced by '_'; the value is
     ``(pagerank, resource_name)``.

     Raises:
         ValueError: if a line does not have exactly two fields or the
             pagerank is not a number.
     """
     self.resources = pygtrie.StringTrie(separator='_')
     # Parenthesised single-argument print behaves the same on Python 2 and 3.
     print("Loading DBPedia resources...")
     # The context manager closes the resource file, which the bare open() in
     # a for-loop never did.
     with open(dbpedia_resources, 'r') as resource_file:
         for line in resource_file:
             resource_name, pagerank = line.split()
             if resource_name.startswith('Category:'):
                 continue
             pagerank = float(pagerank)
             lowered_name = resource_name.lower().replace('-', '_')
             self.resources[lowered_name] = (pagerank, resource_name)
     print("Done")
Example #21
0
def build_trie_from_wordlists():
    """Return a StringTrie mapping every word to the wordlist it was read from.

    The wordlist paths come from ``Wordlists().wordlists``; a word present in
    several lists ends up mapped to the last list that contains it.
    """
    big_trie = pygtrie.StringTrie()
    for wordlist in Wordlists().wordlists:
        with open(wordlist) as list_file:
            contents = list_file.read()
            for word in contents.splitlines():
                big_trie[word] = wordlist
    return big_trie
Example #22
0
def buildTrieSingle(file, pickle_dir):
    """Build a trie of line counts for ``file`` and dump it using pickle.

    Each stripped line of ``file`` is a key; its value is the number of times
    that line occurs. The trie is pickled (protocol 2) to the path returned
    by ``getTrieFile(os.path.basename(file), pickle_dir)``.
    """
    trie_file = getTrieFile(os.path.basename(file), pickle_dir)
    trie = pygtrie.StringTrie(separator=SEP)
    with open(file) as fd:
        for line in fd:
            key = line.strip()
            trie.setdefault(key, 0)
            trie[key] += 1
    # Pickle protocol 2 is a binary format, so the file must be opened in
    # binary mode; text mode fails on Python 3 and can corrupt data elsewhere.
    with open(trie_file, 'wb') as fd:
        pickle.dump(trie, fd, protocol=2)
Example #23
0
def get_count_dict(string):
    """Count the occurrences of each character of ``string``.

    Returns a StringTrie mapping character -> count. Upper-cased progress
    messages, including the elapsed time, are printed before and after.
    """
    start = datetime.datetime.now()
    # Upper-case inside the print call: the old ``print(...).upper()`` form
    # only worked as a Python 2 print statement. This form prints the same
    # text on both Python 2 and 3.
    print("start create count_dict ...".upper())
    c_dict = trie.StringTrie()
    for c in string:
        if c in c_dict:
            c_dict[c] += 1
        else:
            c_dict[c] = 1
    end = datetime.datetime.now()
    print(("create count_dict done, costs time: %s" %
           (str(end - start))).upper())
    return c_dict
Example #24
0
 def __build_trie(self, key):
     """Collect the ``key`` entries of every config into a '.'-separated trie.

     An entry is either a ``[prefix, value]`` list or a mapping holding a
     'prefix' key; for a mapping, the value is read from the key positioned
     just before 'prefix' (wrapping around to the last key).
     """
     trie = pygtrie.StringTrie(separator='.')
     entries = it.chain.from_iterable(c[key] for c in self.config)
     for entry in entries:
         if isinstance(entry, list):
             prefix, value = entry
         else:
             field_names = list(entry.keys())
             value_field = field_names[field_names.index('prefix') - 1]
             prefix = entry['prefix']
             value = entry[value_field]
         trie[prefix] = value
     return trie
Example #25
0
def get_df_trie(tries):
    """
    Return a trie of per-prefix document frequencies across ``tries``.

    ``tries`` is a sequence of pairs whose second item is a trie. Each prefix
    is first counted once per trie that contains it; the stored value is then
    ``(count - 1) / len(tries)``.
    """
    df_trie = pygtrie.StringTrie(separator=" ")
    for _, doc_trie in progress.bar(tries, expected_size=len(tries)):
        for prefix in doc_trie.keys():
            if prefix in df_trie:
                df_trie[prefix] += 1
            else:
                df_trie[prefix] = 1
    for prefix in df_trie.keys():
        occurrences = df_trie[prefix]
        df_trie[prefix] = (occurrences - 1) / len(tries)
    return df_trie
Example #26
0
    def __init__(self, rule_config_path):
        """Load replacement rules from a tab-separated UTF-8 config file.

        Each line holds: rule id, pre-pattern, pre-replacement, post-pattern,
        post-replacement. A pre rule is stored when both pre fields are
        non-empty, and likewise for a post rule. Fields recognised by
        ``is_entity_tag`` are combined with the rule id through
        ``entity_tag_with_id`` before being stored.
        """
        self.pre_trie = pygtrie.StringTrie(separator=u" ")
        self.post_trie = pygtrie.StringTrie(separator=u" ")
        with codecs.open(rule_config_path, 'r', 'utf-8') as rule_config_file:
            for line in rule_config_file:
                fields = line.strip(u'\n').split(u'\t')
                rule_id = fields[0]

                if fields[1] and fields[2]:
                    pre_pattern, pre_replacing = fields[1], fields[2]
                    if is_entity_tag(pre_replacing):
                        pre_replacing = entity_tag_with_id(
                            pre_replacing, rule_id)
                    self.pre_trie[pre_pattern] = pre_replacing

                if fields[3] and fields[4]:
                    post_key, post_replacing = fields[3], fields[4]
                    if is_entity_tag(post_key):
                        post_key = entity_tag_with_id(post_key, rule_id)
                    self.post_trie[post_key] = post_replacing
Example #27
0
 def test_to_string(self):
     """Check the two expected string forms for empty and populated tries."""
     default_trie = pygtrie.StringTrie()
     self._assertToString(default_trie, '(separator=/)',
                          "([], separator='/')")

     empty_trie = self._TRIE_CTOR()
     self._assertToString(empty_trie, '(separator=~)',
                          "([], separator='~')")

     single_entry = self._TRIE_CTOR({self._SHORT_KEY: 42})
     self._assertToString(single_entry,
                          '(~home~foo: 42, separator=~)',
                          "([('~home~foo', 42)], separator='~')")

     two_entries = self._TRIE_CTOR({
         self._SHORT_KEY: 42,
         self._OTHER_KEY: '42'
     })
     self._assertToString(
         two_entries, '(~hom: 42, ~home~foo: 42, separator=~)',
         "([('~hom', '42'), ('~home~foo', 42)], separator='~')")
Example #28
0
    def __init__(self, path_to_binary):
        """Start the binary as a child process with a daemon output reader.

        The child's stderr is merged into its stdout; a daemon thread runs
        ``self._enqueue`` with that stream and ``self.q``.
        """
        self.path = path_to_binary
        self.needs_reset = True

        # These are only used if an external RERS cache is not hooked up
        self.separator = " "
        self.cache = {}
        self.error_cache = pygtrie.StringTrie(separator=self.separator)
        self.invalid_cache = pygtrie.PrefixSet()

        # Set up external process and communication
        self.proc = Popen(
            path_to_binary,
            bufsize=0,
            stdout=PIPE,
            stdin=PIPE,
            stderr=STDOUT,
        )
        self.q = Queue()
        reader = Thread(target=self._enqueue, args=(self.proc.stdout, self.q))
        reader.daemon = True
        self.t = reader
        self.t.start()
Example #29
0
    def __init__(self,
                 options: EzLinksOptions,
                 root: str,
                 files: List[mkdocs.structure.pages.Page],
                 logger=None):
        """Index the documentation files located under ``root``.

        Files whose ``abs_src_path`` does not contain ``root`` are dropped;
        the ``src_path`` of each remaining file is passed to
        ``_store_file``.
        """
        self.options = options
        self.root = root
        self.logger = logger
        self.file_cache = {}
        self.file_trie = pygtrie.StringTrie(separator=os.sep)

        # Drop any files outside of the root of the docs dir
        self.files = [page for page in files if root in page.abs_src_path]

        for page in self.files:
            self._store_file(page.src_path)
Example #30
0
    def print_trie(self, by="sum", sep=None, precision=3):
        """Print the ``by`` statistic as a tab-indented tree of keys.

        Parameters:
            by: name of the statistic to show, selected from ``self.stats()``.
            sep: key separator; falls back to ``self.sep`` when falsy.
            precision: digits passed to ``round``; a falsy value prints the
                raw value.

        When ``by == "sum"``, every key that has a value and a subtrie first
        gets an "other" child holding the value minus the sum of its
        children, unless that child already exists.
        """
        sep = sep or self.sep

        import pygtrie

        t = pygtrie.StringTrie(separator=sep)

        # NOTE(review): assumes self.stats() returns a pandas DataFrame with
        # one column per key -- confirm against stats().
        stats = self.stats().T[[by]]
        for key, row in stats.sort_index().iterrows():
            t[key] = row[by]

        def is_child(key):
            return not t.has_subtrie(key)

        def depth_print(s, depth=0):
            print("\t" * depth + s)

        def print_trie(path_conv, path, children, value=None):
            path = path_conv(path)
            depth = path.count(sep)

            if t.has_key(path):
                value_repr = value if not precision else round(value, precision)
                # Split on the configured separator, not a hard-coded "/",
                # so custom separators show only the last path component.
                path_repr = path.split(sep)[-1]
                depth_print("{}: {}".format(path_repr, value_repr), depth)
            # The children must be iterated for the traversal to descend.
            for child in children:
                pass
            return value

        def add_other(path_conv, path, children, value=None):
            path = path_conv(path)
            if value and not is_child(path):
                # Build the key with the configured separator so "other"
                # becomes a real child of ``path`` for any ``sep``.
                other_key = sep.join([path, "other"])
                if not t.has_key(other_key):
                    t[other_key] = value - sum(children)
            else:
                # The children must be iterated for the traversal to descend.
                for child in children:
                    pass
            return value

        if by == "sum":
            t.traverse(add_other)
        t.traverse(print_trie)