Example #1
 def __init__(self, args):
     super().__init__(args)
     self.alias2qids = marisa_trie.RecordTrie(f"<{'p'*5}").mmap(
         os.path.join(self.args.database_dir,
                      'es_material/alias2qids.marisa'))
     self.qid2typeqid = marisa_trie.RecordTrie("<p").mmap(
         os.path.join(self.args.database_dir,
                      'es_material/qid2typeqid.marisa'))
Example #2
 def makeMarisaTrie(self):
     fmt = "fH"
     fixedtrie = marisa_trie.RecordTrie(
         fmt,
         zip(map(u, self.fixedwords),
             zip(self.fixedscores, range(len(self.fixedscores)))))
     stemtrie = marisa_trie.RecordTrie(
         fmt,
         zip(map(u, self.stemwords),
             zip(self.stemscores, range(len(self.stemscores)))))
     fixedtrie.save('{0}/{1:.2f}-fixed.marisa'.format(
         self.folders[self.cindex], self.stopVal))
     stemtrie.save('{0}/{1:.2f}-stem.marisa'.format(
         self.folders[self.cindex], self.stopVal))
     return (fixedtrie, stemtrie)
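
Each record stored with the "fH" format above is a (score, index) pair; a minimal lookup sketch against such a trie (hypothetical words and scores, not the original data) might look like this:

import marisa_trie

trie = marisa_trie.RecordTrie("fH", [(u"apple", (0.5, 0)), (u"apply", (1.5, 1))])
print(trie[u"apple"])             # [(0.5, 0)] -- a list of (score, index) records
print(trie.prefixes(u"apples"))   # ['apple'] -- stored keys that prefix the query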
Example #3
def save_forgotten_sample(run_data):
    print('saving forgotten email sample')
    paths = get_paths(run_data)

    if run_data['forget_method'] == 'frequency':
        if 'trie' not in run_data:
            run_data['trie'] = marisa_trie.RecordTrie('I')
            run_data['trie'].load(paths['trie'])

    with shelve.open(paths['forgotten_sample'], 'n') as forgotten_sample:
        with shelve.open(paths['original_sample'], 'r') as original_sample:
            for i, ratio in enumerate(run_data['ratios']):
                sample_by_ratio = {}
                for (md5, tokens) in original_sample.items():
                    forgotten_email, frequency_threshold = forget_email(
                        tokens, ratio, run_data)
                    item = {
                        'md5': md5,
                        'original_email': tokens,
                        'length': len(tokens),
                        'ratio': ratio,
                        'forgotten_email': forgotten_email,
                        'frequency_threshold': frequency_threshold,
                        'bloom_filter': None,
                    }
                    sample_by_ratio[md5] = item
                forgotten_sample[str(ratio)] = sample_by_ratio

    print('saved forgotten email sample')
Example #4
def read_umls(UMLSfile, google_concepts_list):
    try:
        desc, UMLS = pickle.load(open('UMLSlite.pk'))
    except:
        f = open(UMLSfile)
        preUMLS = [(line.strip() + ' ').split(' |||| ')[:-1] for line in f]
        f.close()
        print "Read UMLS"
        UMLS = map(remake, preUMLS[1:])
        pickle.dump((preUMLS[0], UMLS), open('UMLSlite.pk', 'w'))
    print "Loaded UMLS"
    UMLSrest = filter(lambda x: len(good_type_list(x)) > 0, UMLS)
    # Prefix trees
    fmt = "<i"
    # Regular
    data = []
    lookup = {}
    for i, concept in enumerate(UMLSrest):
        i = int(concept[0][1:]) if concept[0].lower() != "cui-less" else 0
        for st in concept[4]:
            data.append((unicode(st), (i,)))
    for i, (mid, descriptions) in enumerate(google_concepts_list):
        for st in descriptions:
            data.append((unicode(st), (len(UMLSrest) + i,)))
            lookup[st] = lookup.get(st, []) + [mid]
    trie = marisa_trie.RecordTrie(fmt, data)
    print "Made trie"
    foo = map(lambda x: auxUMLS(x, lookup), UMLS)
    print "Made lookup"
    return UMLS, lookup, trie
Example #5
def main():
    args = parse_args()
    makedirs(args.out, exist_ok=True)
    wikipedia2wikidata_trie = marisa_trie.RecordTrie('i').load(
        args.wikipedia2wikidata_trie)
    print('loaded trie')
    redirections = load_redirections(args.redirections)
    anchor_trie = construct_anchor_trie(
        anchor_tags=args.anchor_tags,
        wikipedia2wikidata_trie=wikipedia2wikidata_trie,
        redirections=redirections,
        prefix=args.prefix)
    anchor_trie.save(join(args.out, 'trie.marisa'))
    ((trie_index2indices_offsets, trie_index2indices_values,
      trie_index2indices_counts),
     (trie_index2contexts_offsets, trie_index2contexts_values,
      trie_index2contexts_counts)) = construct_mapping(
          anchor_tags=args.anchor_tags,
          wikipedia2wikidata_trie=wikipedia2wikidata_trie,
          redirections=redirections,
          prefix=args.prefix,
          anchor_trie=anchor_trie)
    np.save(join(args.out, "trie_index2indices_offsets.npy"),
            trie_index2indices_offsets)
    np.save(join(args.out, "trie_index2indices_values.npy"),
            trie_index2indices_values)
    np.save(join(args.out, "trie_index2indices_counts.npy"),
            trie_index2indices_counts)

    np.save(join(args.out, "trie_index2contexts_offsets.npy"),
            trie_index2contexts_offsets)
    np.save(join(args.out, "trie_index2contexts_values.npy"),
            trie_index2contexts_values)
    np.save(join(args.out, "trie_index2contexts_counts.npy"),
            trie_index2contexts_counts)
Example #6
def setup():
    #make array of words objects with word and frequency
    f = open("words.txt")
    words = []
    freqs = []
    for line in f:
        line = line.strip()
        line = line.lower()
        if " " in line:
            line = line.split(" ")
        else:
            line = line.split("\t")
        word = line[0]
        freq = line[1]
        freqInt = int(freq)
        words.append(word)
        freqs.append(freqInt)

    f.close()

    #make trie of ranges in word array
    g = open("ranges.txt")
    prefixes = []
    ranges = []
    for line in g:
        line = line.strip()
        line = line.split("\t")
        lineu = unicode(line[0])
        prefixes.append(lineu)
        rangeind = (int(line[1]), int(line[2]))
        ranges.append(rangeind)
    fmt = "<LL"
    trie = marisa_trie.RecordTrie(fmt, zip(prefixes, ranges))
    g.close()
    return words, freqs, prefixes, trie
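
The "<LL" records above are (start, end) index pairs into the words list; a hypothetical caller could slice the word array for a given prefix like this (the prefix value is an assumption):

words, freqs, prefixes, trie = setup()
for start, end in trie.get(u"pre", []):   # each record is a (start, end) pair per "<LL"
    print(words[start:end])               # the words covered by that prefix range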
Example #7
def build_fixed_point(out, prefix):
    wiki_fixed_point_save = join(
        out, "wikidata_%s_fixed_points_values.npy" % (prefix, ))
    if not true_exists(wiki_fixed_point_save):
        print("building %s fixed point property." % (prefix, ))
        trie = marisa_trie.RecordTrie('i').load(
            join(out, WIKITILE_2_WIKIDATA_TRIE_NAME))
        num_items = count_lines(join(out, WIKIDATA_IDS_NAME))
        fixed_point_relation = {}

        category_prefix = "%s/Category:" % (prefix, )
        article_prefix = "%s/" % (prefix, )
        wikititle2wikidata_path = join(out, WIKITILE_2_WIKIDATA_TSV_NAME)
        relevant_items = trie.iteritems(category_prefix)

        for name, category_idx in relevant_items:
            article_name = article_prefix + name[len(category_prefix):]
            for fixed_point_name_alternate in fixed_point_name_alternates(
                    article_name):
                matches = trie.get(fixed_point_name_alternate, None)
                if matches is not None and len(matches) > 0:
                    fixed_point_relation[category_idx] = [matches[0][0]]
                    break
        print("Found %d fixed point relations for %s" % (
            len(fixed_point_relation),
            prefix,
        ))
        save_record_with_offset(
            join(out, "wikidata_%s_fixed_points" % (prefix, )),
            fixed_point_relation, num_items)
Example #8
    def __init__(self, lex_loc):
        print("loading SPECIALIST Lexicon...")
        self.lexicon = load_lexicon(lex_loc)
        self.eui_array = []

        print("building SPECIALIST Lexicon trie...")
        tups = []
        curr_i = 0
        for eui, lex_info in self.lexicon.items():
            self.eui_array.append(eui)

            # tups.append((lex_info["base"].decode('utf-8'), [curr_i]))
            tups.append((lex_info["base"], [curr_i]))

            if "spelling variants" in lex_info:
                for spell_var in lex_info["spelling variants"]:
                    # tups.append((spell_var.decode('utf-8'), [curr_i]))
                    tups.append((spell_var, [curr_i]))

            if "nominalization" in lex_info:
                for nom in lex_info["nominalization"]:
                    # tups.append((nom.decode('utf-8'), [curr_i]))
                    tups.append((nom, [curr_i]))

            if "inflection variants" in lex_info:
                for infl_var in lex_info["inflection variants"]:
                    # tups.append((infl_var.decode('utf-8'), [curr_i]))
                    tups.append((infl_var, [curr_i]))

            curr_i += 1

        self.trie = marisa_trie.RecordTrie("<i", tups)
Example #9
def create_trie_obj(location_list: [Location], code_blacklist: {str},
                    word_blacklist: {str}):
    """
    Creates a RecordTrie with the marisa library
    :param location_list: a list with all locations
    :param code_blacklist: a list with all codes to blacklist
    :param word_blacklist: a list with all words which should be blacklisted
    :rtype: marisa_trie.RecordTrie
    """
    code_id_type_tuples = []
    for location in location_list:
        code_id_type_tuples.extend(location.code_id_type_tuples())

    code_id_type_tuples = [
        code_tuple for code_tuple in code_id_type_tuples
        if code_tuple[0] not in code_blacklist
        and code_tuple[0] not in word_blacklist and len(code_tuple[0]) > 2
    ]

    for code in word_blacklist:
        code_id_type_tuples.append((code, ('0' * 32, -1)))

    encoded_tuples = [(code, (uid.encode(), code_type))
                      for code, (uid, code_type) in code_id_type_tuples]

    return marisa_trie.RecordTrie('<32sh', encoded_tuples)
Example #10
def popular_trie():
    """
    Generate a trie for the most popular words, like "to", "the", etc.
    Popular trie should be used if the branching factor for the long trie is large (>1000)
    :return: a popular trie, which also gets stored on the drive
    """
    try:
        grams = pd.read_pickle(resources_path + 'words.pkl')
    except IOError:
        grams = load_words()

    big_ones = dict()
    for elem in (grams.groupby(['first']).sum()).iterrows():
        count = elem[1]['freq']
        if count > 7000:
            big_ones[elem[1].name] = count
    grams = grams.loc[grams['first'].isin(big_ones)]
    grams = grams.loc[grams['freq'] > 1000]

    grams['freq'] = grams['freq'].apply(lambda x: (x, ))

    freqs = grams['freq'].values
    phrases = grams['first'] + " " + grams['second']
    fmt = "@i"
    phrases = list(map(lambda x: np.unicode(x), phrases))
    triee = marisa.RecordTrie(fmt, zip(phrases, freqs))
    with open(resources_path + 'popular_trie.pkl', 'wb') as output:
        pickle.dump(triee, output, pickle.HIGHEST_PROTOCOL)
    with open(resources_path + 'dict.pkl', 'wb') as output:
        pickle.dump(big_ones, output, pickle.HIGHEST_PROTOCOL)

    return triee
Example #11
    def build_tree_from_dictionaries(self):
        wc_lists = defaultdict(list)
        for need_lemmatization, generator, source_name in [
            (False, self.read_freq_2011, '2011'),
            (False, self.read_freq_hagen, 'hagen'),
            (True, self.read_freq_litc_win, 'litc_win'),
            (False, self.read_freq_wikipedia, 'wikipedia'),
            (True, self.read_freq_flibusta, 'flibusta'),
            (True, self.read_freq_puhlyi, 'puhlyi')
        ]:
            print('read %s' % source_name)
            wc = generator()

            wc2 = Counter()
            for w, ipm in wc.items():
                wc2[w.lower()] += ipm
            wc = wc2

            del wc2
            for w, ipm in wc.items():
                wc_lists[w].append(ipm)
        wc = dict()
        for w, cs in wc_lists.items():
            wc[w] = sum(cs) / len(cs)
        return marisa_trie.RecordTrie('<f',
                                      [(w, (ipm, )) for w, ipm in wc.items()])
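
Records in the resulting "<f" trie are single-element tuples, so a lookup has to unpack one level; a small sketch with made-up ipm values:

import marisa_trie

trie = marisa_trie.RecordTrie('<f', [(u"and", (25342.5,)), (u"the", (31873.0,))])
ipm = trie[u"and"][0][0]    # trie[key] returns a list of records, each a 1-tuple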
Example #12
def load_data(branch_limit=10000):
    """
    Load the longest version of the trie, containing most n-grams(limited by the branch_limit)
    :param branch_limit: the limit of children for each node of the trie. Default 10000
    :return: the trie, which also gets stored on the drive
    """
    try:
        grams = pd.read_pickle(resources_path + 'words.pkl')
    except IOError:
        grams = load_words()

    grams = grams.sort_values(by='freq', ascending=False)

    # Limit the number of children for each node
    grams = grams.groupby("first").head(branch_limit)

    # The transformation from int to a singular tuple is required by the trie API
    grams['freq'] = grams['freq'].apply(lambda x: (x, ))

    freqs = grams['freq'].values
    phrases = grams['first'] + " " + grams['second']
    fmt = "@i"
    phrases = list(map(lambda x: np.unicode(x), phrases))
    triee = marisa.RecordTrie(fmt, zip(phrases, freqs))

    # Store the trie
    with open(resources_path + 'trie.pkl', 'wb') as output:
        pickle.dump(triee, output, pickle.HIGHEST_PROTOCOL)
    return triee
Example #13
def one_letter():
    """
    Generate a trie that is used for a special case where only one letter is given
    to the autocomplete function. Since it's very expensive to go over all combinations
    each time, this function does it once and stores the result.
    :return: a one-letter trie, which also gets stored on the drive
    """
    try:
        grams = pd.read_pickle(resources_path + 'words.pkl')
    except IOError:
        grams = load_words()

    short_grams = grams.copy()
    # short_grams['first'] = short_grams[['first']].apply(lambda x: x[0].lower())
    short_grams['indices'] = short_grams.index

    res = short_grams.groupby("first").apply(
        lambda group: group.nlargest(50, columns='freq'))
    indices = res['indices'].values
    grams = grams.iloc[indices, :]
    grams['freq'] = grams['freq'].apply(lambda x: (x, ))

    freqs = grams['freq'].values
    phrases = grams['first'] + " " + grams['second']
    fmt = "@i"
    phrases = list(map(lambda x: np.unicode(x), phrases))
    triee = marisa.RecordTrie(fmt, zip(phrases, freqs))
    with open(resources_path + 'short_trie.pkl', 'wb') as output:
        pickle.dump(triee, output, pickle.HIGHEST_PROTOCOL)
    return triee
Example #14
    def __init__(self, model_dir):
        """ 変換モジュール

        :param model_dir:
        """
        self.words = []
        trie_keys = []
        trie_values = []
        self.loss = nn.MSELoss()

        model_path = os.path.abspath(model_dir)
        with open(model_path + "/words.csv", "r") as f:
            reader = csv.reader(f, delimiter=",")
            for i, row in enumerate(reader):
                self.words.append(
                    [row[0], int(row[2]),
                     int(row[3]),
                     int(row[4])])
                trie_keys.append(row[1])
                trie_values.append([i])

        self.trie = marisa_trie.RecordTrie("<I", zip(trie_keys, trie_values))
        self.word_info = Words()

        self.model = CostAeModel(len(self.word_info.word_type_list[0]),
                                 len(self.word_info.word_type_list[1]))
        self.model.load_state_dict(torch.load(model_dir + "/dnn.mdl"))
Example #15
    def build(dump_db, mention_db, out_file, pool_size, chunk_size):
        title_trie = marisa_trie.Trie(dump_db.titles())
        data = {}

        with tqdm(total=dump_db.page_size(), mininterval=0.5) as pbar:
            initargs = (dump_db, mention_db, title_trie)
            with closing(
                    Pool(pool_size,
                         initializer=WikiLinkDB._initialize_worker,
                         initargs=initargs)) as pool:
                for title, links in pool.imap_unordered(
                        WikiLinkDB._extract_wiki_links,
                        title_trie,
                        chunksize=chunk_size):
                    data[title] = links
                    pbar.update()

        mention_trie = marisa_trie.Trie(text for links in data.values()
                                        for text, _, _ in links)

        def item_generator():
            for title, links in data.items():
                for mention_text, link_title_id, link_prob in links:
                    yield title, (mention_trie[mention_text], link_title_id,
                                  link_prob)

        data_trie = marisa_trie.RecordTrie("<IIf", item_generator())

        joblib.dump(
            dict(title_trie=title_trie,
                 mention_trie=mention_trie,
                 data_trie=data_trie), out_file)
Example #16
    def test_iteritems(self):
        fmt, data = self.data()
        trie = marisa_trie.RecordTrie(fmt, data)
        assert trie.items() == list(trie.iteritems())

        for key, value in data:
            prefix = key[:5]
            assert trie.items(prefix) == list(trie.iteritems(prefix))
Example #17
    def __init__(self, filename, featname, format=None):
        import marisa_trie

        self.filename = filename
        self.data = marisa_trie.RecordTrie(format or GAZETTEER_FORMAT)
        self.data.load(filename)

        super(MarisaGeonamesGlobalFeature, self).__init__(self.data, featname)
Example #18
def to_marisa(df, columns=GAZETTEER_COLUMNS, format=GAZETTEER_FORMAT):
    """
    Encode ``pandas.DataFrame`` with GeoNames data
    (loaded using :func:`read_geonames` and maybe filtered in some way)
    to a ``marisa.RecordTrie``.
    """
    import marisa_trie
    return marisa_trie.RecordTrie(format, _iter_geonames_items(df, columns))
Example #19
 def _construct_trie(self, hanzi):
     pairs = []
     for hz, df in self.hanzi.items():
         py, en = df
         py = str(''.join(filter(lambda x: x in ASCII_CHARS, ' '.join(py))))
         pairs.append((hz, (py.encode('utf-8'),)))
     trie = marisa_trie.RecordTrie(str('@s'), pairs)
     return trie
Example #20
def test_dumps_loads(data):
    trie = marisa_trie.RecordTrie("<H?", data)

    buf = io.BytesIO()
    pickle.dump(trie, buf)
    buf.seek(0)

    assert trie == pickle.load(buf)
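
The `data` fixture is not shown here; any iterable of (unicode key, record) pairs matching the "<H?" format (an unsigned short plus a bool) would satisfy it, for example:

data = [(u"foo", (1, True)), (u"bar", (65535, False))]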
Example #21
    def build_from_wikipedia(
        dump_db: DumpDB,
        tokenizer,
        normalizer,
        out_file,
        max_candidate_size,
        min_mention_count,
        max_mention_length,
        pool_size,
        chunk_size,
    ):
        logger.info("Extracting all entity names...")

        title_dict = defaultdict(Counter)
        with tqdm(total=dump_db.page_size(), mininterval=0.5) as pbar:
            initargs = (dump_db, tokenizer, normalizer, max_mention_length)
            with closing(
                    Pool(pool_size,
                         initializer=EntityDB._initialize_worker,
                         initargs=initargs)) as pool:
                for ret in pool.imap_unordered(
                        EntityDB._extract_name_entity_pairs,
                        dump_db.titles(),
                        chunksize=chunk_size):
                    for (name, title) in ret:
                        title_dict[title][name] += 1
                    pbar.update()

        logger.info("Building DB...")

        mentions = frozenset([
            mention for mention_counter in title_dict.values()
            for mention in mention_counter.keys()
        ])
        title_trie = frozenset(title_dict.keys())
        mention_trie = marisa_trie.Trie(mentions)

        def item_generator():
            for (title, mention_counter) in title_dict.items():
                for (mention, mention_count
                     ) in mention_counter.most_common()[:max_candidate_size]:
                    if mention_count < min_mention_count:
                        continue
                    yield (title, (mention_trie[mention], mention_count))

        data_trie = marisa_trie.RecordTrie("<II", item_generator())

        joblib.dump(
            dict(
                title_trie=title_trie,
                mention_trie=mention_trie,
                data_trie=data_trie,
                tokenizer=tokenizer,
                normalizer=normalizer,
                max_mention_length=max_mention_length,
            ),
            out_file,
        )
Example #22
 def article2id(self):
     if self._article2id is None:
         if self.verbose:
             print('load %r' % ("article2id", ))
         self._article2id = marisa_trie.RecordTrie('i').load(
             join(self.path, "wikititle2wikidata.marisa"))
         if self.verbose:
             print("done.")
     return self._article2id
Example #23
    def test_getitem(self):
        fmt, data = self.data()
        trie = marisa_trie.RecordTrie(fmt, data)

        for key, value in data:
            assert trie[key] == [value]

        with pytest.raises(KeyError):
            trie['2135']
Example #24
    def build_from_p_e_m_file(p_e_m_file, dump_db, wiki_mention_db, tokenizer,
                              normalizer, out_file, max_mention_length):
        with open(p_e_m_file) as f:
            lines = f.readlines()

        name_dict = defaultdict(Counter)

        for line in tqdm(lines):
            (text, total_count, *data) = line.rstrip().split("\t")
            total_count = int(total_count)
            text = text.replace(SEP_CHAR, REP_CHAR)
            tokens = tuple(
                normalizer.normalize(t) for t in tokenizer.tokenize(text))
            if len(tokens) <= max_mention_length:
                for entry in data:
                    (_, prob, *title_parts) = entry.split(",")
                    title = ",".join(title_parts).replace("_", " ")
                    title = dump_db.resolve_redirect(title)
                    count = int(float(prob) * total_count)
                    name_dict[tokens][title] += count

        titles = frozenset([
            title for entity_counter in name_dict.values()
            for title in entity_counter.keys()
        ])
        title_trie = marisa_trie.Trie(titles)

        def item_generator():
            for (tokens, entity_counter) in name_dict.items():
                name = SEP_CHAR.join(tokens)
                total_link_count = sum(entity_counter.values())

                wiki_mentions = wiki_mention_db.query(tokens)
                if wiki_mentions:
                    doc_count = int(total_link_count /
                                    wiki_mentions[0].link_prob)
                else:
                    doc_count = 0

                for (title, link_count) in entity_counter.most_common():
                    yield (name, (title_trie[title], link_count,
                                  total_link_count, doc_count))

        data_trie = marisa_trie.RecordTrie("<IIII", item_generator())
        mention_trie = marisa_trie.Trie(data_trie.keys())

        joblib.dump(
            dict(
                title_trie=title_trie,
                mention_trie=mention_trie,
                data_trie=data_trie,
                tokenizer=tokenizer,
                normalizer=normalizer,
                max_mention_length=max_mention_length,
            ),
            out_file,
        )
Example #25
 def load(self, load_dir):
     self._max_value = load_json_file(
         filename=os.path.join(load_dir, "max_value.json"))
     self._stoi = marisa_trie.Trie().mmap(
         os.path.join(load_dir, f"vocabulary_trie.marisa"))
     self._itos = lambda x: self._stoi.restore_key(x)
     self._record_trie = marisa_trie.RecordTrie(
         self._get_fmt_string(self._max_value)).mmap(
             os.path.join(load_dir, f"record_trie.marisa"))
Example #26
 def load_index(self, dictionary):
     if not dictionary.is_inmemory() and os.path.isfile(
             dictionary.get_path(PydicStemmer.INDEX_FILENAME)):
         index = marisa_trie.RecordTrie(PydicStemmer.MARISA_FORMAT)
         index.load(dictionary.get_path(PydicStemmer.INDEX_FILENAME))
     else:
         index = self.build_index(dictionary)
         if not dictionary.is_inmemory():
             index.save(dictionary.get_path(PydicStemmer.INDEX_FILENAME))
     return index
Example #27
 def build_index(self, dictionary):
     return marisa_trie.RecordTrie(
         PydicStemmer.MARISA_FORMAT,
         ifilter(
             lambda t: t[0].find(' ') == -1,
             izip(
                 imap(lambda i: dictionary.id_base(i)[::-1].lower(),
                      dictionary),
                 imap(lambda i: (i.id, ), dictionary),
             )))
Example #28
 def test_prefixes(self):
     trie = marisa_trie.RecordTrie(str("<H"), [
         ('foo', [1]),
         ('bar', [2]),
         ('foobar', [3]),
     ])
     assert trie.prefixes('foo') == ['foo']
     assert trie.prefixes('foobar') == ['foo', 'foobar']
     assert trie.prefixes('bara') == ['bar']
     assert trie.prefixes('f') == []
Example #29
def load_single_item_trie(file):
    """Load a marisa trie with integer values from memmap file.

    Args:
        file: marisa input file

    Returns: marisa trie
    """
    assert exists_dir(file)
    return marisa_trie.RecordTrie("<l").mmap(file)
Example #30
def save_tries(run_data):
    print('saving tries')
    paths = get_paths(run_data)

    if os.path.exists(paths['forward_counter']):
        os.remove(paths['forward_counter'])
    if os.path.exists(paths['backward_counter']):
        os.remove(paths['backward_counter'])

    print('\tcounting ngrams')
    with shelve.open(paths['forward_counter'], 'n') as forward_counter:
        with shelve.open(paths['backward_counter'], 'n') as backward_counter:
            with shelve.open(paths['emails'], 'r') as emails:
                partial_counters = map(
                    functools.partial(count_ngrams,
                                      ngram_length=run_data['ngram_length']),
                    emails.values())
                for i, (forward_partial,
                        backward_partial) in enumerate(partial_counters):
                    for k, v in forward_partial.items():
                        if k in forward_counter:
                            forward_counter[k] += v
                        else:
                            forward_counter[k] = v
                    for k, v in backward_partial.items():
                        if k in backward_counter:
                            backward_counter[k] += v
                        else:
                            backward_counter[k] = v
                    if i % 1000 == 0:
                        print('\tprocessed {} emails'.format(i))

            print('\tbuilding tries')
            for reverse in (False, True):
                if reverse:
                    path = paths['reverse_trie']
                    counter = backward_counter
                else:
                    path = paths['trie']
                    counter = forward_counter

                if os.path.exists(path):
                    os.remove(path)

                trie = marisa_trie.RecordTrie('I',
                                              ((k, (v, ))
                                               for k, v in counter.items()),
                                              order=marisa_trie.WEIGHT_ORDER)
                trie.save(path)

            print('saved tries')
            print('unique n-grams: {}'.format(len(forward_counter)))

    os.remove(paths['forward_counter'])
    os.remove(paths['backward_counter'])
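
The `order=marisa_trie.WEIGHT_ORDER` argument is forwarded to the underlying MARISA builder; as documented there, WEIGHT_ORDER (the default) arranges sibling nodes by descending weight while LABEL_ORDER sorts them by label. A minimal build sketch with made-up counts, assuming the same (count,) record layout as above:

import marisa_trie

counts = {u"the": 5000, u"then": 40, u"theme": 3}
trie = marisa_trie.RecordTrie('I', ((k, (v,)) for k, v in counts.items()),
                              order=marisa_trie.WEIGHT_ORDER)
print(trie.items(u"the"))   # all stored keys starting with "the", with their (count,) records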