コード例 #1
0
    def test_valid_separator(self):
        # With no separator argument, storing 'foo/bar' must leave a subtrie
        # hanging off the 'foo' node.
        default_trie = pygtrie.StringTrie()
        default_trie['foo/bar'] = 42
        default_flags = default_trie.has_node('foo')
        self.assertTrue(bool(default_flags & pygtrie.Trie.HAS_SUBTRIE))

        # Same expectation when '.' is supplied as the separator.
        dotted_trie = pygtrie.StringTrie(separator='.')
        dotted_trie['foo.bar'] = 42
        dotted_flags = dotted_trie.has_node('foo')
        self.assertTrue(bool(dotted_flags & pygtrie.Trie.HAS_SUBTRIE))
コード例 #2
0
ファイル: cmudict.py プロジェクト: LanceNorskog/deep_meter
def loadRevmap(syll_dict):
  """Build reverse lookup tries from syllable sequences back to their keys.

  Each value of ``syll_dict`` is indexed as a sequence of syllable strings.
  Values with more than one syllable are stored in the multi-syllable trie
  under the syllables joined with '-'; every other value is stored in the
  single-syllable trie under its first syllable.

  Returns:
    A ``(singlemap, multimap)`` tuple of tries that use '-' as separator.
  """
  singlemap = trie.StringTrie(separator='-')
  multimap = trie.StringTrie(separator='-')
  for word in syll_dict.keys():
    syllables = syll_dict[word]
    if len(syllables) <= 1:
      singlemap[syllables[0]] = word
      continue
    # 'M AH-G ER' -> "mugger"
    multimap['-'.join(syllables)] = word
  return (singlemap, multimap)
コード例 #3
0
 def __init__(self, gramFile, candidateFile, scoreFile):
     """Store the file paths and create empty pre/post tries and caches.

     Asserts that ``gramFile`` and ``candidateFile`` exist on disk.
     """
     self._gramFile, self._candidateFile, self._scoreFile = (
         gramFile, candidateFile, scoreFile)

     # pre_trie: trie, its file name, and its cache
     self._pretrie = trie.StringTrie(separator=Process.SEP)
     self._pretrieFile = 'PreGramTrie'
     self._precache = {}

     # post_trie: trie, its file name, and its cache
     self._posttrie = trie.StringTrie(separator=Process.SEP)
     self._posttrieFile = 'PostGramTrie'
     self._postcache = {}

     # Both input files must already exist.
     assert os.path.exists(gramFile), \
         "GramFile %s not exists" % gramFile
     assert os.path.exists(candidateFile), \
         "CandidateFile %s not exists" % candidateFile
コード例 #4
0
ファイル: rop_test.py プロジェクト: SamanthaYu/ROPHunter
    def test_valid_inst(self):
        # Feed a short byte string to the hunter and compare both the
        # instruction trie and the address table against fixed expectations.
        start_offset = 0x1000
        code = b"\xc7\x07\x00\x00\x00\x0f\x95\x45\xc3"
        rop_hunter = ROPHunter(CS_ARCH_X86, CS_MODE_64, self.parallel)

        rop_hunter.galileo(self.duplicates, self.output, start_offset, code)

        expected_insts = (
            ("c3", "ret"),
            ("c3/9545", "xchg eax, ebp"),
            ("c3/9545/000f", "add byte ptr [rdi], cl"),
            ("c3/9545/000f/0000", "add byte ptr [rax], al"),
            ("c3/9545/00000f", "add byte ptr [rax], al"),
            ("c3/9545/c7070000000f", "mov dword ptr [rdi], 0xf000000"),
            ("c3/00000f9545", "add byte ptr [rax], al"),
        )
        expected_trie = pygtrie.StringTrie()
        for trie_key, mnemonic in expected_insts:
            expected_trie[trie_key] = mnemonic

        self.assertCountEqual(rop_hunter.get_inst_trie().items(),
                              expected_trie.items())

        # len(code) = 9
        # Expected instruction address: start_offset + len(code) - len(inst)
        expected_inst_addr = {
            "c3/9545": "0x1006",  # e.g. 0x1000 + 9 - 3 = 0x1006
            "c3/9545/000f": "0x1004",
            "c3/9545/000f/0000": "0x1002",
            "c3/9545/00000f": "0x1003",
            "c3/9545/c7070000000f": "0x1000",
            "c3/00000f9545": "0x1003",
        }

        self.assertDictEqual(rop_hunter.get_inst_addr_dict(),
                             expected_inst_addr)
コード例 #5
0
ファイル: quality_filter.py プロジェクト: wook2014/seqmagick
def parse_barcode_file(fp, primer=None, header=False):
    """
    Load label, barcode, primer records from a CSV file.

    Returns a map from barcode -> label

    Any additional columns are ignored

    :param fp: file-like object handed to ``csv.reader``.
    :param primer: primer appended to every barcode; when ``None`` the
        primer is read from the third column of each record.
    :param header: when true, the first row is skipped.
    :raises ValueError: if two records expand to the same sequence.
    """
    tr = trie.StringTrie()
    reader = csv.reader(fp)

    if header:
        # Skip header; the default keeps an empty file from raising
        # StopIteration here.
        next(reader, None)

    # Skip blank rows
    records = (record for record in reader if record)

    for record in records:
        specimen, barcode = record[:2]
        pr = primer if primer is not None else record[2]
        for sequence in all_unambiguous(barcode + pr):
            if sequence in tr:
                # Format the message explicitly: passing the template and
                # its values as separate exception args leaves the
                # placeholders unfilled.
                raise ValueError(
                    "Duplicate sample: {0}, {1} both have {2}".format(
                        specimen, tr[sequence], sequence))
            logging.info('%s->%s', sequence, specimen)
            tr[sequence] = specimen

    return tr
コード例 #6
0
    def __init__(self,
                 vocabulary: Iterable[Tuple[Iterable[str], str]],
                 ignore_case: Optional[bool] = True):
        """Populate the prefix trie from a labelled vocabulary.

        Parameters
        ----------
        vocabulary: Iterable
            Pairs of (words, label); every word in ``words`` is stored
            under that type label.
        ignore_case: bool, default=True
            Lower-case each word before storing it if True.

        The first label stored for a key wins; a later entry with a
        different label only triggers a warning.
        """
        self.ignore_case = ignore_case
        self.trie = pygtrie.StringTrie()
        separator = self.trie._separator
        for words, label in vocabulary:
            for word in words:
                surface = word.lower() if ignore_case else word
                # Whitespace-separated tokens become the trie path.
                trie_key = separator.join(surface.split())
                if trie_key not in self.trie:
                    self.trie[trie_key] = (surface, label)
                    continue
                original_label = self.trie[trie_key][1]
                if original_label != label:
                    warnings.warn(
                        "duplicate pytrie entry '{}' with different label: '{}' found. Original label: {} is immutable, Ignoring duplicate."
                        .format(surface, label, original_label))
コード例 #7
0
def deref_urls(url_mapping: dict,
               url_settings: Tuple[Tuple[str, str],...]=URL_SETTINGS
               ) -> pygtrie.StringTrie:
    """Resolve each (prefix, reference) setting through ``url_mapping``.

    Returns a '.'-separated ``StringTrie`` built from the pairs
    ``(prefix, url_mapping[reference])``; a reference missing from
    ``url_mapping`` raises ``KeyError``.
    """
    resolved = [(prefix, url_mapping[ref]) for prefix, ref in url_settings]
    return pygtrie.StringTrie(resolved, separator='.')
コード例 #8
0
ファイル: data_utils.py プロジェクト: bashe99/flaskSentiment
def get_dict_trie(dict_file_name,
                  processing_word=None,
                  processing_dict_type=None,
                  trie_separator='.'):
    """Read a tab-separated "sentence<TAB>type" file into a trie.

    Args:
        dict_file_name: path of the UTF-8 dictionary file.
        processing_word: optional callable applied to every space-separated
            word; the sentence key then becomes the stringified results
            joined by ``trie_separator``. Sentences containing the result
            of ``processing_word(UNK)`` are skipped.
        processing_dict_type: optional callable applied to the type column.
        trie_separator: separator used by the trie and for joined keys.

    Returns:
        ``(trie, paths, dict_types)``: the trie, a list of
        "key<TAB>type" strings in file order, and a list of the distinct
        types.
    """
    trie = pygtrie.StringTrie(separator=trie_separator)
    paths = []
    dict_types = set()
    if processing_word is not None:
        UNK_word_id = processing_word(UNK)
    else:
        UNK_word_id = -1
    with open(dict_file_name, encoding='utf-8') as f:
        for raw_line in f:
            stripped = raw_line.strip()
            if not stripped:
                continue
            sent, dict_type = stripped.split('\t')
            if processing_word is not None:
                word_ids = [processing_word(token) for token in sent.split(' ')]
                if UNK_word_id in word_ids:
                    continue
                sent = trie_separator.join(str(word_id) for word_id in word_ids)
            if processing_dict_type is not None:
                dict_type = processing_dict_type(dict_type)
            trie[sent] = dict_type
            paths.append('{}\t{}'.format(sent, dict_type))
            dict_types.add(dict_type)
    return trie, paths, list(dict_types)
コード例 #9
0
def readVectors():
    """Load every vector from the "n_shelve_10_cat" shelf into a trie.

    Returns:
        ``(trie, tag_dict)`` where ``trie`` maps every shelf key to its
        value converted to a float64 array, and ``tag_dict`` holds the
        same conversion for the ten category tags only.

    The numbered prints are kept from the original, presumably as progress
    markers.
    """
    trie = pygtrie.StringTrie()
    db = shelve.open("n_shelve_10_cat")
    try:
        # Set categories
        tags = [
            "education", "music", "film", "food", "police", "health", "women",
            "children", "technology", "sport"
        ]

        print(1)

        # Find all tags' vector
        tag_dict = {
            tag: np.asarray(db[tag], dtype="float64") for tag in tags
        }
        print(2)
        tm = db.keys()
        print(3)
        # Add all words vector to trie
        for x in tm:
            trie[x] = np.asarray(db[x], dtype="float64")
        print(4)
    finally:
        # Release the shelf's underlying file even if a lookup fails.
        db.close()
    return trie, tag_dict
コード例 #10
0
def readdata():
    """Load every vector from the Reuters five-category shelf into a trie.

    Returns:
        ``(trie, tag_dict)`` where ``trie`` maps every shelf key to its
        value converted to a float64 array, and ``tag_dict`` holds the
        same conversion for the five category tags only.

    The numbered prints are kept from the original, presumably as progress
    markers.
    """
    trie = pygtrie.StringTrie()
    db = shelve.open(
        "/home/saidaltindis/Desktop/PROJECTS/MULTI-LABEL CLASSIFICATION/Data/n_shelve_5_cat_reuters"
    )
    try:
        # Set categories
        tags = ['earn', 'grain', 'crude', 'trade', 'interest']

        print(1)

        # Find all tags' vector
        tag_dict = {
            tag: np.asarray(db[tag], dtype="float64") for tag in tags
        }
        print(2)
        tm = db.keys()
        print(3)
        # Add all words vector to trie
        for x in tm:
            trie[x] = np.asarray(db[x], dtype="float64")
        print(4)
    finally:
        # Release the shelf's underlying file even if a lookup fails.
        db.close()
    return trie, tag_dict
コード例 #11
0
ファイル: detect_heavyhitters.py プロジェクト: 5up3rc/SENSS
def main():
    """Parse the command line and build a trie from a flow-tools trace.

    Only the "flow-tools" format triggers any work in the code shown here;
    "nfdump" is accepted by the parser but nothing is done for it.
    """
    parser = argparse.ArgumentParser(
        description="Detect heavy hitters from traces")

    # nargs=1 makes args.file_format a one-element list.
    parser.add_argument(
        '-f', '--format',
        dest='file_format',
        nargs=1,
        default='None',
        choices=['nfdump', 'flow-tools'],
        required=True,
        help='Trace format i.e. flow-tools or nfdump')
    parser.add_argument(
        'infile',
        nargs='?',
        default=sys.stdin,
        help='File path to read from. If no path is specified then defaults to stdin')
    args = parser.parse_args()

    trie = t.StringTrie(separator='.')
    selected_format = args.file_format[0]
    if selected_format == "flow-tools":
        buildFlowToolsTrie(args.infile, trie)
コード例 #12
0
    def __init__(self,
                 path_to_binary,
                 path_to_cache=None,
                 save_every_n=1000,
                 terminator="0"):
        """Set up in-memory caches and, optionally, an on-disk cache folder.

        Args:
            path_to_binary: path of the binary; also hashed to name its
                cache folder.
            path_to_cache: directory holding per-binary cache folders, or
                ``None`` to run without an on-disk cache.
            save_every_n: stored as ``self.save_every_n``.
            terminator: symbol that must not be in ``self.get_alphabet()``.
        """
        self.path = path_to_binary
        self.needs_reset = True
        self.cache = {}
        self.error_cache = pygtrie.StringTrie(separator=" ")
        self.invalid_cache = pygtrie.PrefixSet()

        self.terminator = terminator
        alphabet = self.get_alphabet()
        assert terminator not in alphabet, \
            f"Terminator {terminator} in alphabet, please choose a different one"

        # Save cache to file every n queries
        self.save_every_n = save_every_n
        self.n_queries = 0

        if path_to_cache is not None:
            cache_root = Path(path_to_cache)
            print("Cache dir:", str(cache_root.absolute()))
            # The SHA-256 of the binary's contents names its cache folder
            with open(self.path, 'rb') as binary_file:
                binary_digest = hashlib.sha256(binary_file.read()).hexdigest()

            # Load an existing cache folder, otherwise create it
            self.cachepath = cache_root.joinpath(binary_digest)
            if not self.cachepath.is_dir():
                os.mkdir(self.cachepath)
            else:
                self._load_cache()
        else:
            print("No cache path given, not using cache")
            self.cachepath = None
コード例 #13
0
ファイル: test.py プロジェクト: pallabpain/pygtrie
class StringTrieTestCase(TrieTestCase):
    """TrieTestCase specialisation for ``pygtrie.StringTrie`` with '~' keys.

    The underscore-prefixed class attributes are fixtures; they are
    presumably consumed by the inherited ``TrieTestCase`` tests, which are
    not visible here.
    """

    # Constructor used for the trie under test: always passes separator='~'.
    _TRIE_CTOR = staticmethod(
        lambda *args, **kw: pygtrie.StringTrie(*args, separator='~', **kw))  # pylint: disable=unnecessary-lambda

    # Sample keys written with the '~' separator.
    _SHORT_KEY = '~home~foo'
    _SHORT_KEY2 = '~home~FOO'
    _LONG_KEY = _SHORT_KEY + '~bar~baz'
    _VERY_LONG_KEY = _LONG_KEY + '~qux'
    _OTHER_KEY = '~hom'
    _SHORT_PREFIXES = ('', '~home')
    _LONG_PREFIXES = ('~home~foo~bar', )

    # Base64 text; presumably a StringTrie pickled with protocol 0 -- confirm
    # against the base class that decodes it.
    _PICKLED_PROTO_0 = (
        'Y2NvcHlfcmVnCl9yZWNvbnN0cnVjdG9yCnAwCihjcHlndHJpZQpTdHJpbmdUcmllCnAxCm'
        'NfX2J1aWx0aW5fXwpvYmplY3QKcDIKTnRwMwpScDQKKGRwNQpWX3NlcGFyYXRvcgpwNgpW'
        'LwpwNwpzVl9yb290CnA4CmcwCihjcHlndHJpZQpfTm9kZQpwOQpnMgpOdHAxMApScDExCi'
        'hscDEyCkwxTAphVmZvbwpwMTMKYUw0MkwKYUwtMUwKYUwxTAphVmJhcgpwMTQKYUw0MkwK'
        'YUwtMUwKYUwxTAphVmJhegpwMTUKYUw0MkwKYWJzVl9zb3J0ZWQKcDE2CkkwMApzYi4=')

    # Base64 text; presumably the protocol-3 counterpart -- confirm.
    _PICKLED_PROTO_3 = (
        'gANjcHlndHJpZQpTdHJpbmdUcmllCnEAKYFxAX1xAihYCgAAAF9zZXBhcmF0b3JxA1gBAA'
        'AAL3EEWAUAAABfcm9vdHEFY3B5Z3RyaWUKX05vZGUKcQYpgXEHXXEIKEsBWAMAAABmb29x'
        'CUsqSv////9LAVgDAAAAYmFycQpLKkr/////SwFYAwAAAGJhenELSyplYlgHAAAAX3Nvcn'
        'RlZHEMiXViLg==')

    @classmethod
    def path_from_key(cls, key):
        # Split a '~'-separated key into its list of components.
        return key.split('~')

    @classmethod
    def key_from_path(cls, path):
        # Inverse of path_from_key: join components with '~'.
        return '~'.join(path)

    def test_valid_separator(self):
        # With no separator argument, 'foo/bar' must create a subtrie
        # under 'foo'.
        t = pygtrie.StringTrie()
        t['foo/bar'] = 42
        self.assertTrue(bool(t.has_node('foo') & pygtrie.Trie.HAS_SUBTRIE))

        # Same expectation with '.' given as separator.
        t = pygtrie.StringTrie(separator='.')
        t['foo.bar'] = 42
        self.assertTrue(bool(t.has_node('foo') & pygtrie.Trie.HAS_SUBTRIE))

    def test_invalid_separator(self):
        # A non-string separator must raise TypeError, an empty one
        # ValueError.
        self.assertRaises(TypeError, pygtrie.StringTrie, separator=42)
        self.assertRaises(ValueError, pygtrie.StringTrie, separator='')

    def test_to_string(self):
        # Each call passes a trie plus two expected strings to the inherited
        # _assertToString helper (presumably the str() and repr() bodies --
        # confirm against the base class).
        self._assertToString(pygtrie.StringTrie(), '(separator=/)',
                             "([], separator='/')")
        self._assertToString(self._TRIE_CTOR(), '(separator=~)',
                             "([], separator='~')")
        self._assertToString(self._TRIE_CTOR({self._SHORT_KEY: 42}),
                             '(~home~foo: 42, separator=~)',
                             "([('~home~foo', 42)], separator='~')")
        self._assertToString(
            self._TRIE_CTOR({
                self._SHORT_KEY: 42,
                self._OTHER_KEY: '42'
            }), '(~hom: 42, ~home~foo: 42, separator=~)',
            "([('~hom', '42'), ('~home~foo', 42)], separator='~')")
コード例 #14
0
ファイル: munge.py プロジェクト: frankier/STIFF
def eurosense_to_unified(eurosense: IO, unified: IO):
    """
    Do the XML conversion from the Eurosense format to the Unified format. Note
    that this only deals with XML and doesn't convert other things like synset
    ids. For the full conversion pipeline see eurosense2unified in
    `pipeline.py`.

    Each sentence is scanned left to right: where an annotation anchor is a
    prefix of the remaining text an ``<instance>`` element is written,
    otherwise the next space-delimited word is written as ``<wf>``.
    """
    write_header(unified, "eurosense")
    for sent_id, sent_elem in iter_sentences_eurosense(eurosense):
        unified.write('<sentence id="{}">\n'.format(sent_id))
        # Per-sentence trie: anchor text -> (annotation text, lemma), with
        # space-separated anchor words as the trie path.
        trie = pygtrie.StringTrie(separator=" ")
        anns = sent_elem.xpath(".//annotation")
        for ann in anns:
            trie[ann.attrib["anchor"]] = (ann.text, ann.attrib["lemma"])
        sent = sent_elem.xpath("text")[0].text
        cursor = 0
        while cursor < len(sent):
            # NOTE(review): unpacking into (key, value) assumes the
            # longest_prefix return shape of the installed pygtrie -- confirm.
            match_anchor, match_val = trie.longest_prefix(sent[cursor:])
            if match_anchor:
                sense_key, lemma = match_val
                # The last character of the annotation text selects the POS.
                pos = WN_UNI_POS_MAP[sense_key[-1]]
                # NOTE(review): lemma/key/anchor are written without escape(),
                # unlike the <wf> branch below -- confirm this is intended.
                unified.write(
                    '<instance lemma="{}" pos="{}" key="{}">{}</instance>\n'.
                    format(lemma, pos, sense_key, match_anchor))
                # Skip the anchor plus one following character (presumably
                # the separating space).
                cursor += len(match_anchor) + 1
            else:
                end_pos = sent.find(" ", cursor)
                if end_pos == -1:
                    # NOTE(review): no space left, so the remaining text is
                    # dropped without a <wf> element -- confirm sentences
                    # always end with a space.
                    break
                unified.write("<wf>{}</wf>\n".format(
                    escape(sent[cursor:end_pos])))
                cursor = end_pos + 1
        unified.write("</sentence>\n")
    # Closing tags; the matching opening tags are presumably emitted by
    # write_header.
    unified.write("</text>\n")
    unified.write("</corpus>\n")
コード例 #15
0
 def __init__(self, vocab_path="sowpods.txt"):
     """Read the vocabulary file and fill ``self.perms`` via ``_make_perm``.

     The first six lines of the file are skipped.
     """
     # Construct the Prefix Tree for all possible word/card permutations
     with open(vocab_path, "r") as vocab_file:
         vocab_lines = list(vocab_file)
     self.perms = trie.StringTrie(separator='/')
     words = (raw_line.strip() for raw_line in vocab_lines[6:])
     for word in words:
         self._make_perm(word, [], 0, self.perms)
     print(f'Using {len(self.perms)} possible word permutations\n')
コード例 #16
0
def create_public_suffix_trie():
    """Return a trie whose keys are the fetched public-suffix entries.

    Empty entries and entries starting with "//" are skipped; every stored
    key maps to ``True``.
    """
    pub_suf_trie = trie.StringTrie()
    data = fetch_public_suffix_data()
    if len(data) == 0:
        return pub_suf_trie
    for entry in data:
        if entry == "" or entry.startswith("//"):
            continue
        pub_suf_trie[entry] = True
    return pub_suf_trie
コード例 #17
0
def build_lang_dict(language):
    """Load the file ``LANGUAGES_PREFIX/<language>`` into a ``StringTrie``.

    Every line whose length is at least ``IGNORE_LENGTH_BELOW`` is stored
    with the line itself as both key and value.

    NOTE(review): lines are used exactly as read, without stripping, so keys
    presumably keep their trailing newline and the length check counts it --
    confirm this is intended.
    NOTE(review): no ``return`` is visible in this block, so as shown the
    trie is discarded; confirm whether the function continues beyond what
    is visible here.
    """
    import pygtrie
    lang_dict = pygtrie.StringTrie()
    with open(os.path.join(LANGUAGES_PREFIX, language)) as lang_file:
        for dict_entry in lang_file:
            if len(dict_entry) >= IGNORE_LENGTH_BELOW:
                lang_dict[
                    dict_entry] = dict_entry  #Wastes a bit of memory, could be optimized with different Trie implementation
コード例 #18
0
def test_string():
    """Print the keys and values of a sorted StringTrie built from ``data``.

    Each whitespace-separated word of the module-level ``data`` is stored
    under its lower-cased form.
    """
    for banner_line in ('StringTrie Test', '---------------'):
        print(banner_line)
    import pygtrie
    word_trie = pygtrie.StringTrie()
    word_trie.enable_sorting()
    for token in data.split():
        word_trie[token.lower()] = token
    keys_text = ', '.join(word_trie.keys())
    print('K : ', keys_text)
    values_text = ', '.join(word_trie.values())
    print('V : ', values_text)
コード例 #19
0
 def __init__(self):
     """Load the "word|offset" index at ``IDX_PATH`` into ``self.word_offset``.

     Lines without a '|' are ignored. A line with more than one '|' or a
     non-integer offset raises ``ValueError``.
     """
     self.word_offset = pygtrie.StringTrie(separator=' ')
     # Parenthesised single-argument prints behave the same on Python 2 and 3.
     print("Loading Thesaurus...")
     # The context manager closes the index file once it has been read.
     with open(IDX_PATH, 'r') as idx_file:
         for line in idx_file:
             if '|' in line:
                 word, offset = line.split('|')
                 self.word_offset[word] = int(offset)
     print("Done")
     self.cache = {}
コード例 #20
0
 def __init__(self):
     """Load "name pagerank" lines from ``dbpedia_resources`` into a trie.

     Names starting with 'Category:' are skipped. Each remaining name is
     lower-cased with '-' replaced by '_' and used as the key of
     ``self.resources`` (separator '_'); the value is
     ``(pagerank, original_name)``. A line that does not split into exactly
     two whitespace-separated fields raises ``ValueError``.
     """
     self.resources = pygtrie.StringTrie(separator='_')
     # Parenthesised single-argument prints behave the same on Python 2 and 3.
     print("Loading DBPedia resources...")
     # The context manager closes the resource file once it has been read.
     with open(dbpedia_resources, 'r') as resource_file:
         for line in resource_file:
             resource_name, pagerank = line.split()
             if resource_name.startswith('Category:'):
                 continue
             # split() already removed surrounding whitespace.
             pagerank = float(pagerank)
             lowered_name = resource_name.lower().replace('-', '_')
             self.resources[lowered_name] = (pagerank, resource_name)
     print("Done")
コード例 #21
0
def build_trie_from_wordlists():
    """Return a trie mapping every word to the wordlist path it came from.

    If a word occurs in several wordlists, the one read last wins.
    """
    # Create Wordlists object
    wl = Wordlists()
    # One trie holds the words of every wordlist
    big_trie = pygtrie.StringTrie()
    for wordlist in wl.wordlists:
        with open(wordlist) as list_file:
            contents = list_file.read()
        for word in contents.splitlines():
            big_trie[word] = wordlist
    return big_trie
コード例 #22
0
def buildTrieSingle(file, pickle_dir):
    """Build trie, dumps using pickle

    Counts how often each stripped line of ``file`` occurs, stores the
    counts in a ``StringTrie`` using ``SEP`` as separator, and pickles the
    trie (protocol 2) to the path returned by
    ``getTrieFile(basename(file), pickle_dir)``.
    """
    trie_file = getTrieFile(os.path.basename(file), pickle_dir)
    trie = pygtrie.StringTrie(separator=SEP)
    with open(file) as fd:
        for line in fd:
            key = line.strip()
            trie.setdefault(key, 0)
            trie[key] += 1
    # Pickle protocol 2 produces bytes, so the file must be opened in
    # binary mode; text mode fails on Python 3.
    with open(trie_file, 'wb') as fd:
        pickle.dump(trie, fd, protocol=2)
コード例 #23
0
def get_count_dict(string):
    """Count how often each element of ``string`` occurs.

    Returns a ``trie.StringTrie`` mapping each element yielded by iterating
    ``string`` to its number of occurrences. Upper-cased start and elapsed
    time messages are printed around the counting.
    """
    start = datetime.datetime.now()
    # The whole expression is parenthesised so that the upper-cased text is
    # what gets printed on both Python 2 and Python 3.
    print("start create count_dict ...".upper())
    c_dict = trie.StringTrie()
    for c in string:
        if c in c_dict:
            c_dict[c] += 1
        else:
            c_dict[c] = 1
    end = datetime.datetime.now()
    print(("create count_dict done, costs time: %s" %
           (str(end - start))).upper())
    return c_dict
コード例 #24
0
 def __build_trie(self, key):
     """Collect the ``key`` entries of every config into a '.'-separated trie.

     An entry is either a two-item list ``[prefix, value]`` or a mapping
     with a 'prefix' key; for a mapping the value is read from the key
     listed immediately before 'prefix' (wrapping to the last key when
     'prefix' comes first).
     """
     result = pygtrie.StringTrie(separator='.')
     entries = it.chain.from_iterable(section[key] for section in self.config)
     for entry in entries:
         if not isinstance(entry, list):
             field_names = list(entry.keys())
             value_field = field_names[field_names.index('prefix') - 1]
             prefix, value = entry['prefix'], entry[value_field]
         else:
             prefix, value = entry
         result[prefix] = value
     return result
コード例 #25
0
def get_df_trie(tries):
    """
    Return a trie where the prefix's value is its document frequency among the tries.

    ``tries`` is a sized collection of pairs whose second item is a trie.
    Each prefix is counted once per trie that has it as a key, and the
    stored value is ``(count - 1) / len(tries)``.
    """
    df_trie = pygtrie.StringTrie(separator=" ")
    total = len(tries)
    for _, doc_trie in progress.bar(tries, expected_size=total):
        for prefix in doc_trie.keys():
            if prefix in df_trie:
                df_trie[prefix] += 1
            else:
                df_trie[prefix] = 1
    # Turn the raw counts into frequencies.
    for prefix in df_trie.keys():
        count = df_trie[prefix]
        df_trie[prefix] = (count - 1) / total
    return df_trie
コード例 #26
0
    def __init__(self, rule_config_path):
        """Load tab-separated replacement rules into pre and post tries.

        Each line is read as ``rule_id, pre_from, pre_to, post_from,
        post_to``. A pre rule is stored when both of its columns are
        non-empty, and likewise for a post rule. Where the pre replacement
        or the post source is an entity tag, it is first combined with the
        rule id by ``entity_tag_with_id``.
        """
        self.pre_trie = pygtrie.StringTrie(separator=u" ")
        self.post_trie = pygtrie.StringTrie(separator=u" ")
        with codecs.open(rule_config_path, 'r', 'utf-8') as rule_config_file:
            for raw_line in rule_config_file:
                fields = raw_line.strip(u'\n').split(u'\t')
                rule_id = fields[0]

                if fields[1] and fields[2]:
                    pre_key, pre_value = fields[1], fields[2]
                    if is_entity_tag(pre_value):
                        pre_value = entity_tag_with_id(pre_value, rule_id)
                    self.pre_trie[pre_key] = pre_value

                if fields[3] and fields[4]:
                    post_key, post_value = fields[3], fields[4]
                    if is_entity_tag(post_key):
                        post_key = entity_tag_with_id(post_key, rule_id)
                    self.post_trie[post_key] = post_value
コード例 #27
0
ファイル: test.py プロジェクト: pallabpain/pygtrie
 def test_to_string(self):
     # Each case hands a trie and two expected strings to _assertToString.
     default_empty = pygtrie.StringTrie()
     self._assertToString(default_empty,
                          '(separator=/)',
                          "([], separator='/')")

     tilde_empty = self._TRIE_CTOR()
     self._assertToString(tilde_empty,
                          '(separator=~)',
                          "([], separator='~')")

     single_entry = self._TRIE_CTOR({self._SHORT_KEY: 42})
     self._assertToString(single_entry,
                          '(~home~foo: 42, separator=~)',
                          "([('~home~foo', 42)], separator='~')")

     two_entries = self._TRIE_CTOR({
         self._SHORT_KEY: 42,
         self._OTHER_KEY: '42'
     })
     self._assertToString(
         two_entries,
         '(~hom: 42, ~home~foo: 42, separator=~)',
         "([('~hom', '42'), ('~home~foo', 42)], separator='~')")
コード例 #28
0
    def __init__(self, path_to_binary):
        """Start the binary as a child process and a daemon reader thread.

        The child's stdout (with stderr merged into it) and ``self.q`` are
        handed to ``self._enqueue``, which runs on the thread.
        """
        self.path = path_to_binary
        self.needs_reset = True

        # These are only used if an external RERS cache is not hooked up
        self.separator = " "
        self.cache = {}
        self.error_cache = pygtrie.StringTrie(separator=self.separator)
        self.invalid_cache = pygtrie.PrefixSet()

        # Set up external process and communication
        child = Popen(path_to_binary,
                      bufsize=0,
                      stdout=PIPE,
                      stdin=PIPE,
                      stderr=STDOUT)
        self.proc = child
        self.q = Queue()
        reader = Thread(target=self._enqueue, args=(child.stdout, self.q))
        self.t = reader
        reader.daemon = True
        reader.start()
コード例 #29
0
    def __init__(self,
                 options: EzLinksOptions,
                 root: str,
                 files: List[mkdocs.structure.pages.Page],
                 logger=None):
        """Index the given files by source path.

        Only files whose ``abs_src_path`` contains ``root`` are kept in
        ``self.files``; each kept file's ``src_path`` is passed to
        ``self._store_file``.
        """
        self.options, self.root, self.logger = options, root, logger
        self.file_cache = {}
        self.file_trie = pygtrie.StringTrie(separator=os.sep)

        # Drop any files outside of the root of the docs dir
        self.files = [page for page in files if root in page.abs_src_path]

        for page in self.files:
            self._store_file(page.src_path)
コード例 #30
0
    def print_trie(self, by="sum", sep=None, precision=3):
        """Print the ``by`` statistic of every key as a tab-indented tree.

        Args:
            by: name of the statistic to print; it is selected from the
                transposed result of ``self.stats()`` (presumably a pandas
                DataFrame -- confirm).
            sep: key separator; falls back to ``self.sep`` when falsy.
            precision: digits passed to ``round``; a falsy value prints
                the raw value.

        When ``by == "sum"``, an ``<parent><sep>other`` entry is first
        added under every valued parent, holding the parent's value minus
        the sum of its children's values.
        """
        sep = sep or self.sep

        import pygtrie

        t = pygtrie.StringTrie(separator=sep)

        stats = self.stats().T[[by]]
        for key, row in stats.sort_index().iterrows():
            t[key] = row[by]

        def is_child(key):
            # True when the key has nothing stored below it.
            return not t.has_subtrie(key)

        def depth_print(s, depth=0):
            print("\t" * depth + s)

        def print_trie(path_conv, path, children, value=None):
            path = path_conv(path)
            depth = path.count(sep)

            if t.has_key(path):
                value_repr = value if not precision else round(value, precision)
                # Split on the configured separator (not a hard-coded "/")
                # so the label is the last path component for any ``sep``.
                path_repr = path.split(sep)[-1]
                depth_print("{}: {}".format(path_repr, value_repr), depth)
            # ``children`` must be iterated for the traversal to descend.
            for child in children:
                pass
            return value

        def add_other(path_conv, path, children, value=None):
            path = path_conv(path)
            if value and not is_child(path):
                # Build the key with the configured separator so "other"
                # becomes a child of ``path`` in the trie.
                other_key = sep.join([path, "other"])
                if not t.has_key(other_key):
                    t[other_key] = value - sum(children)
            else:
                for child in children:
                    pass
            return value

        if by == "sum":
            t.traverse(add_other)
        t.traverse(print_trie)