Example #1
 def __init__(self, args):
     super().__init__(args)
     self.alias2qids = marisa_trie.RecordTrie(f"<{'p'*5}").mmap(
         os.path.join(self.args.database_dir,
                      'es_material/alias2qids.marisa'))
     self.qid2typeqid = marisa_trie.RecordTrie("<p").mmap(
         os.path.join(self.args.database_dir,
                      'es_material/qid2typeqid.marisa'))
Example #2
 def makeMarisaTrie(self):
     fmt = "fH"
     fixedtrie = marisa_trie.RecordTrie(
         fmt,
         zip(map(u, self.fixedwords),
             zip(self.fixedscores, range(len(self.fixedscores)))))
     stemtrie = marisa_trie.RecordTrie(
         fmt,
         zip(map(u, self.stemwords),
             zip(self.stemscores, range(len(self.stemscores)))))
     fixedtrie.save('{0}/{1:.2f}-fixed.marisa'.format(
         self.folders[self.cindex], self.stopVal))
     stemtrie.save('{0}/{1:.2f}-stem.marisa'.format(
         self.folders[self.cindex], self.stopVal))
     return (fixedtrie, stemtrie)
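
Each record stored with the "fH" format above is a (score, index) pair; a minimal lookup sketch against such a trie (hypothetical words and scores, not the original data) might look like this:

import marisa_trie

trie = marisa_trie.RecordTrie("fH", [(u"apple", (0.5, 0)), (u"apply", (1.5, 1))])
print(trie[u"apple"])             # [(0.5, 0)] -- a list of (score, index) records
print(trie.prefixes(u"apples"))   # ['apple'] -- stored keys that prefix the query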
Example #3
def save_forgotten_sample(run_data):
    print('saving forgotten email sample')
    paths = get_paths(run_data)

    if run_data['forget_method'] == 'frequency':
        if 'trie' not in run_data:
            run_data['trie'] = marisa_trie.RecordTrie('I')
            run_data['trie'].load(paths['trie'])

    with shelve.open(paths['forgotten_sample'], 'n') as forgotten_sample:
        with shelve.open(paths['original_sample'], 'r') as original_sample:
            for i, ratio in enumerate(run_data['ratios']):
                sample_by_ratio = {}
                for (md5, tokens) in original_sample.items():
                    forgotten_email, frequency_threshold = forget_email(
                        tokens, ratio, run_data)
                    item = {
                        'md5': md5,
                        'original_email': tokens,
                        'length': len(tokens),
                        'ratio': ratio,
                        'forgotten_email': forgotten_email,
                        'frequency_threshold': frequency_threshold,
                        'bloom_filter': None,
                    }
                    sample_by_ratio[md5] = item
                forgotten_sample[str(ratio)] = sample_by_ratio

    print('saved forgotten email sample')
Example #4
def read_umls(UMLSfile, google_concepts_list):
    try:
        desc, UMLS = pickle.load(open('UMLSlite.pk'))
    except:
        f = open(UMLSfile)
        preUMLS = [(line.strip() + ' ').split(' |||| ')[:-1] for line in f]
        f.close()
        print "Read UMLS"
        UMLS = map(remake, preUMLS[1:])
        pickle.dump((preUMLS[0], UMLS), open('UMLSlite.pk', 'w'))
    print "Loaded UMLS"
    UMLSrest = filter(lambda x: len(good_type_list(x)) > 0, UMLS)
    # Prefix trees
    fmt = "<i"
    # Regular
    data = []
    lookup = {}
    for i, concept in enumerate(UMLSrest):
        i = int(concept[0][1:]) if concept[0].lower() != "cui-less" else 0
        for st in concept[4]:
            data.append((unicode(st), (i,)))
    for i, (mid, descriptions) in enumerate(google_concepts_list):
        for st in descriptions:
            data.append((unicode(st), (len(UMLSrest) + i,)))
            lookup[st] = lookup.get(st, []) + [mid]
    trie = marisa_trie.RecordTrie(fmt, data)
    print "Made trie"
    foo = map(lambda x: auxUMLS(x, lookup), UMLS)
    print "Made lookup"
    return UMLS, lookup, trie
Example #5
def main():
    args = parse_args()
    makedirs(args.out, exist_ok=True)
    wikipedia2wikidata_trie = marisa_trie.RecordTrie('i').load(
        args.wikipedia2wikidata_trie)
    print('loaded trie')
    redirections = load_redirections(args.redirections)
    anchor_trie = construct_anchor_trie(
        anchor_tags=args.anchor_tags,
        wikipedia2wikidata_trie=wikipedia2wikidata_trie,
        redirections=redirections,
        prefix=args.prefix)
    anchor_trie.save(join(args.out, 'trie.marisa'))
    ((trie_index2indices_offsets, trie_index2indices_values,
      trie_index2indices_counts),
     (trie_index2contexts_offsets, trie_index2contexts_values,
      trie_index2contexts_counts)) = construct_mapping(
          anchor_tags=args.anchor_tags,
          wikipedia2wikidata_trie=wikipedia2wikidata_trie,
          redirections=redirections,
          prefix=args.prefix,
          anchor_trie=anchor_trie)
    np.save(join(args.out, "trie_index2indices_offsets.npy"),
            trie_index2indices_offsets)
    np.save(join(args.out, "trie_index2indices_values.npy"),
            trie_index2indices_values)
    np.save(join(args.out, "trie_index2indices_counts.npy"),
            trie_index2indices_counts)

    np.save(join(args.out, "trie_index2contexts_offsets.npy"),
            trie_index2contexts_offsets)
    np.save(join(args.out, "trie_index2contexts_values.npy"),
            trie_index2contexts_values)
    np.save(join(args.out, "trie_index2contexts_counts.npy"),
            trie_index2contexts_counts)
Example #6
def setup():
    #make array of words objects with word and frequency
    f = open("words.txt")
    words = []
    freqs = []
    for line in f:
        line = line.strip()
        line = line.lower()
        if " " in line:
            line = line.split(" ")
        else:
            line = line.split("\t")
        word = line[0]
        freq = line[1]
        freqInt = int(freq)
        words.append(word)
        freqs.append(freqInt)

    f.close()

    #make trie of ranges in word array
    g = open("ranges.txt")
    prefixes = []
    ranges = []
    for line in g:
        line = line.strip()
        line = line.split("\t")
        lineu = unicode(line[0])
        prefixes.append(lineu)
        rangeind = (int(line[1]), int(line[2]))
        ranges.append(rangeind)
    fmt = "<LL"
    trie = marisa_trie.RecordTrie(fmt, zip(prefixes, ranges))
    g.close()
    return words, freqs, prefixes, trie
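
The "<LL" records above are (start, end) index pairs into the words list; a hypothetical caller could slice the word array for a given prefix like this (the prefix value is an assumption):

words, freqs, prefixes, trie = setup()
for start, end in trie.get(u"pre", []):   # each record is a (start, end) pair per "<LL"
    print(words[start:end])               # the words covered by that prefix range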
Example #7
def build_fixed_point(out, prefix):
    wiki_fixed_point_save = join(
        out, "wikidata_%s_fixed_points_values.npy" % (prefix, ))
    if not true_exists(wiki_fixed_point_save):
        print("building %s fixed point property." % (prefix, ))
        trie = marisa_trie.RecordTrie('i').load(
            join(out, WIKITILE_2_WIKIDATA_TRIE_NAME))
        num_items = count_lines(join(out, WIKIDATA_IDS_NAME))
        fixed_point_relation = {}

        category_prefix = "%s/Category:" % (prefix, )
        article_prefix = "%s/" % (prefix, )
        wikititle2wikidata_path = join(out, WIKITILE_2_WIKIDATA_TSV_NAME)
        relevant_items = trie.iteritems(category_prefix)

        for name, category_idx in relevant_items:
            article_name = article_prefix + name[len(category_prefix):]
            for fixed_point_name_alternate in fixed_point_name_alternates(
                    article_name):
                matches = trie.get(fixed_point_name_alternate, None)
                if matches is not None and len(matches) > 0:
                    fixed_point_relation[category_idx] = [matches[0][0]]
                    break
        print("Found %d fixed point relations for %s" % (
            len(fixed_point_relation),
            prefix,
        ))
        save_record_with_offset(
            join(out, "wikidata_%s_fixed_points" % (prefix, )),
            fixed_point_relation, num_items)
Example #8
    def __init__(self, lex_loc):
        print("loading SPECIALIST Lexicon...")
        self.lexicon = load_lexicon(lex_loc)
        self.eui_array = []

        print("building SPECIALIST Lexicon trie...")
        tups = []
        curr_i = 0
        for eui, lex_info in self.lexicon.items():
            self.eui_array.append(eui)

            # tups.append((lex_info["base"].decode('utf-8'), [curr_i]))
            tups.append((lex_info["base"], [curr_i]))

            if "spelling variants" in lex_info:
                for spell_var in lex_info["spelling variants"]:
                    # tups.append((spell_var.decode('utf-8'), [curr_i]))
                    tups.append((spell_var, [curr_i]))

            if "nominalization" in lex_info:
                for nom in lex_info["nominalization"]:
                    # tups.append((nom.decode('utf-8'), [curr_i]))
                    tups.append((nom, [curr_i]))

            if "inflection variants" in lex_info:
                for infl_var in lex_info["inflection variants"]:
                    # tups.append((infl_var.decode('utf-8'), [curr_i]))
                    tups.append((infl_var, [curr_i]))

            curr_i += 1

        self.trie = marisa_trie.RecordTrie("<i", tups)
Example #9
def create_trie_obj(location_list: [Location], code_blacklist: {str},
                    word_blacklist: {str}):
    """
    Creates a RecordTrie with the marisa library
    :param location_list: a list with all locations
    :param code_blacklist: a list with all codes to blacklist
    :param word_blacklist: a list with all words which should be blacklisted
    :rtype: marisa_trie.RecordTrie
    """
    code_id_type_tuples = []
    for location in location_list:
        code_id_type_tuples.extend(location.code_id_type_tuples())

    code_id_type_tuples = [
        code_tuple for code_tuple in code_id_type_tuples
        if code_tuple[0] not in code_blacklist
        and code_tuple[0] not in word_blacklist and len(code_tuple[0]) > 2
    ]

    for code in word_blacklist:
        code_id_type_tuples.append((code, ('0' * 32, -1)))

    encoded_tuples = [(code, (uid.encode(), code_type))
                      for code, (uid, code_type) in code_id_type_tuples]

    return marisa_trie.RecordTrie('<32sh', encoded_tuples)
Example #10
def popular_trie():
    """
    Generate a trie for the most popular words, like "to", "the", etc.
    Popular trie should be used if the branching factor for the long trie is large (>1000)
    :return: a popular trie, which also gets stored on the drive
    """
    try:
        grams = pd.read_pickle(resources_path + 'words.pkl')
    except IOError:
        grams = load_words()

    big_ones = dict()
    for elem in (grams.groupby(['first']).sum()).iterrows():
        count = elem[1]['freq']
        if count > 7000:
            big_ones[elem[1].name] = count
    grams = grams.loc[grams['first'].isin(big_ones)]
    grams = grams.loc[grams['freq'] > 1000]

    grams['freq'] = grams['freq'].apply(lambda x: (x, ))

    freqs = grams['freq'].values
    phrases = grams['first'] + " " + grams['second']
    fmt = "@i"
    phrases = list(map(lambda x: np.unicode(x), phrases))
    triee = marisa.RecordTrie(fmt, zip(phrases, freqs))
    with open(resources_path + 'popular_trie.pkl', 'wb') as output:
        pickle.dump(triee, output, pickle.HIGHEST_PROTOCOL)
    with open(resources_path + 'dict.pkl', 'wb') as output:
        pickle.dump(big_ones, output, pickle.HIGHEST_PROTOCOL)

    return triee
Example #11
    def build_tree_from_dictionaries(self):
        wc_lists = defaultdict(list)
        for need_lemmatization, generator, source_name in [
            (False, self.read_freq_2011, '2011'),
            (False, self.read_freq_hagen, 'hagen'),
            (True, self.read_freq_litc_win, 'litc_win'),
            (False, self.read_freq_wikipedia, 'wikipedia'),
            (True, self.read_freq_flibusta, 'flibusta'),
            (True, self.read_freq_puhlyi, 'puhlyi')
        ]:
            print('read %s' % source_name)
            wc = generator()

            wc2 = Counter()
            for w, ipm in wc.items():
                wc2[w.lower()] += ipm
            wc = wc2

            del wc2
            for w, ipm in wc.items():
                wc_lists[w].append(ipm)
        wc = dict()
        for w, cs in wc_lists.items():
            wc[w] = sum(cs) / len(cs)
        return marisa_trie.RecordTrie('<f',
                                      [(w, (ipm, )) for w, ipm in wc.items()])
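
Records in the resulting "<f" trie are single-element tuples, so a lookup has to unpack one level; a small sketch with made-up ipm values:

import marisa_trie

trie = marisa_trie.RecordTrie('<f', [(u"and", (25342.5,)), (u"the", (31873.0,))])
ipm = trie[u"and"][0][0]    # trie[key] returns a list of records, each a 1-tuple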
Example #12
def load_data(branch_limit=10000):
    """
    Load the longest version of the trie, containing most n-grams(limited by the branch_limit)
    :param branch_limit: the limit of children for each node of the trie. Default 10000
    :return: the trie, which also gets stored on the drive
    """
    try:
        grams = pd.read_pickle(resources_path + 'words.pkl')
    except IOError:
        grams = load_words()

    grams = grams.sort_values(by='freq', ascending=False)

    # Limit the number of children for each node
    grams = grams.groupby("first").head(branch_limit)

    # The transformation from int to a singular tuple is required by the trie API
    grams['freq'] = grams['freq'].apply(lambda x: (x, ))

    freqs = grams['freq'].values
    phrases = grams['first'] + " " + grams['second']
    fmt = "@i"
    phrases = list(map(lambda x: np.unicode(x), phrases))
    triee = marisa.RecordTrie(fmt, zip(phrases, freqs))

    # Store the trie
    with open(resources_path + 'trie.pkl', 'wb') as output:
        pickle.dump(triee, output, pickle.HIGHEST_PROTOCOL)
    return triee
Example #13
def one_letter():
    """
    Generate a trie that is used for a special case where only one letter is given
    to the autocomplete function. Since it's very expensive to go over all combinations
    each time, this function does it once and stores the result.
    :return: a one-letter trie, which also gets stored on the drive
    """
    try:
        grams = pd.read_pickle(resources_path + 'words.pkl')
    except IOError:
        grams = load_words()

    short_grams = grams.copy()
    # short_grams['first'] = short_grams[['first']].apply(lambda x: x[0].lower())
    short_grams['indices'] = short_grams.index

    res = short_grams.groupby("first").apply(
        lambda group: group.nlargest(50, columns='freq'))
    indices = res['indices'].values
    grams = grams.iloc[indices, :]
    grams['freq'] = grams['freq'].apply(lambda x: (x, ))

    freqs = grams['freq'].values
    phrases = grams['first'] + " " + grams['second']
    fmt = "@i"
    phrases = list(map(lambda x: np.unicode(x), phrases))
    triee = marisa.RecordTrie(fmt, zip(phrases, freqs))
    with open(resources_path + 'short_trie.pkl', 'wb') as output:
        pickle.dump(triee, output, pickle.HIGHEST_PROTOCOL)
    return triee
Example #14
    def __init__(self, model_dir):
        """ 変換モジュール

        :param model_dir:
        """
        self.words = []
        trie_keys = []
        trie_values = []
        self.loss = nn.MSELoss()

        model_path = os.path.abspath(model_dir)
        with open(model_path + "/words.csv", "r") as f:
            reader = csv.reader(f, delimiter=",")
            for i, row in enumerate(reader):
                self.words.append(
                    [row[0], int(row[2]),
                     int(row[3]),
                     int(row[4])])
                trie_keys.append(row[1])
                trie_values.append([i])

        self.trie = marisa_trie.RecordTrie("<I", zip(trie_keys, trie_values))
        self.word_info = Words()

        self.model = CostAeModel(len(self.word_info.word_type_list[0]),
                                 len(self.word_info.word_type_list[1]))
        self.model.load_state_dict(torch.load(model_dir + "/dnn.mdl"))
Example #15
    def build(dump_db, mention_db, out_file, pool_size, chunk_size):
        title_trie = marisa_trie.Trie(dump_db.titles())
        data = {}

        with tqdm(total=dump_db.page_size(), mininterval=0.5) as pbar:
            initargs = (dump_db, mention_db, title_trie)
            with closing(
                    Pool(pool_size,
                         initializer=WikiLinkDB._initialize_worker,
                         initargs=initargs)) as pool:
                for title, links in pool.imap_unordered(
                        WikiLinkDB._extract_wiki_links,
                        title_trie,
                        chunksize=chunk_size):
                    data[title] = links
                    pbar.update()

        mention_trie = marisa_trie.Trie(text for links in data.values()
                                        for text, _, _ in links)

        def item_generator():
            for title, links in data.items():
                for mention_text, link_title_id, link_prob in links:
                    yield title, (mention_trie[mention_text], link_title_id,
                                  link_prob)

        data_trie = marisa_trie.RecordTrie("<IIf", item_generator())

        joblib.dump(
            dict(title_trie=title_trie,
                 mention_trie=mention_trie,
                 data_trie=data_trie), out_file)
Example #16
    def test_iteritems(self):
        fmt, data = self.data()
        trie = marisa_trie.RecordTrie(fmt, data)
        assert trie.items() == list(trie.iteritems())

        for key, value in data:
            prefix = key[:5]
            assert trie.items(prefix) == list(trie.iteritems(prefix))
Example #17
    def __init__(self, filename, featname, format=None):
        import marisa_trie

        self.filename = filename
        self.data = marisa_trie.RecordTrie(format or GAZETTEER_FORMAT)
        self.data.load(filename)

        super(MarisaGeonamesGlobalFeature, self).__init__(self.data, featname)
Example #18
def to_marisa(df, columns=GAZETTEER_COLUMNS, format=GAZETTEER_FORMAT):
    """
    Encode ``pandas.DataFrame`` with GeoNames data
    (loaded using :func:`read_geonames` and maybe filtered in some way)
    to a ``marisa.RecordTrie``.
    """
    import marisa_trie
    return marisa_trie.RecordTrie(format, _iter_geonames_items(df, columns))
Example #19
 def _construct_trie(self, hanzi):
     pairs = []
     for hz, df in self.hanzi.items():
         py, en = df
         py = str(''.join(filter(lambda x: x in ASCII_CHARS, ' '.join(py))))
         pairs.append((hz, (py.encode('utf-8'),)))
     trie = marisa_trie.RecordTrie(str('@s'), pairs)
     return trie
Example #20
def test_dumps_loads(data):
    trie = marisa_trie.RecordTrie("<H?", data)

    buf = io.BytesIO()
    pickle.dump(trie, buf)
    buf.seek(0)

    assert trie == pickle.load(buf)
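
The `data` fixture is not shown here; any iterable of (unicode key, record) pairs matching the "<H?" format (an unsigned short plus a bool) would satisfy it, for example:

data = [(u"foo", (1, True)), (u"bar", (65535, False))]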
Example #21
    def build_from_wikipedia(
        dump_db: DumpDB,
        tokenizer,
        normalizer,
        out_file,
        max_candidate_size,
        min_mention_count,
        max_mention_length,
        pool_size,
        chunk_size,
    ):
        logger.info("Extracting all entity names...")

        title_dict = defaultdict(Counter)
        with tqdm(total=dump_db.page_size(), mininterval=0.5) as pbar:
            initargs = (dump_db, tokenizer, normalizer, max_mention_length)
            with closing(
                    Pool(pool_size,
                         initializer=EntityDB._initialize_worker,
                         initargs=initargs)) as pool:
                for ret in pool.imap_unordered(
                        EntityDB._extract_name_entity_pairs,
                        dump_db.titles(),
                        chunksize=chunk_size):
                    for (name, title) in ret:
                        title_dict[title][name] += 1
                    pbar.update()

        logger.info("Building DB...")

        mentions = frozenset([
            mention for mention_counter in title_dict.values()
            for mention in mention_counter.keys()
        ])
        title_trie = frozenset(title_dict.keys())
        mention_trie = marisa_trie.Trie(mentions)

        def item_generator():
            for (title, mention_counter) in title_dict.items():
                for (mention, mention_count
                     ) in mention_counter.most_common()[:max_candidate_size]:
                    if mention_count < min_mention_count:
                        continue
                    yield (title, (mention_trie[mention], mention_count))

        data_trie = marisa_trie.RecordTrie("<II", item_generator())

        joblib.dump(
            dict(
                title_trie=title_trie,
                mention_trie=mention_trie,
                data_trie=data_trie,
                tokenizer=tokenizer,
                normalizer=normalizer,
                max_mention_length=max_mention_length,
            ),
            out_file,
        )
Example #22
 def article2id(self):
     if self._article2id is None:
         if self.verbose:
             print('load %r' % ("article2id", ))
         self._article2id = marisa_trie.RecordTrie('i').load(
             join(self.path, "wikititle2wikidata.marisa"))
         if self.verbose:
             print("done.")
     return self._article2id
Example #23
    def test_getitem(self):
        fmt, data = self.data()
        trie = marisa_trie.RecordTrie(fmt, data)

        for key, value in data:
            assert trie[key] == [value]

        with pytest.raises(KeyError):
            trie['2135']
Example #24
    def build_from_p_e_m_file(p_e_m_file, dump_db, wiki_mention_db, tokenizer,
                              normalizer, out_file, max_mention_length):
        with open(p_e_m_file) as f:
            lines = f.readlines()

        name_dict = defaultdict(Counter)

        for line in tqdm(lines):
            (text, total_count, *data) = line.rstrip().split("\t")
            total_count = int(total_count)
            text = text.replace(SEP_CHAR, REP_CHAR)
            tokens = tuple(
                normalizer.normalize(t) for t in tokenizer.tokenize(text))
            if len(tokens) <= max_mention_length:
                for entry in data:
                    (_, prob, *title_parts) = entry.split(",")
                    title = ",".join(title_parts).replace("_", " ")
                    title = dump_db.resolve_redirect(title)
                    count = int(float(prob) * total_count)
                    name_dict[tokens][title] += count

        titles = frozenset([
            title for entity_counter in name_dict.values()
            for title in entity_counter.keys()
        ])
        title_trie = marisa_trie.Trie(titles)

        def item_generator():
            for (tokens, entity_counter) in name_dict.items():
                name = SEP_CHAR.join(tokens)
                total_link_count = sum(entity_counter.values())

                wiki_mentions = wiki_mention_db.query(tokens)
                if wiki_mentions:
                    doc_count = int(total_link_count /
                                    wiki_mentions[0].link_prob)
                else:
                    doc_count = 0

                for (title, link_count) in entity_counter.most_common():
                    yield (name, (title_trie[title], link_count,
                                  total_link_count, doc_count))

        data_trie = marisa_trie.RecordTrie("<IIII", item_generator())
        mention_trie = marisa_trie.Trie(data_trie.keys())

        joblib.dump(
            dict(
                title_trie=title_trie,
                mention_trie=mention_trie,
                data_trie=data_trie,
                tokenizer=tokenizer,
                normalizer=normalizer,
                max_mention_length=max_mention_length,
            ),
            out_file,
        )
Example #25
 def load(self, load_dir):
     self._max_value = load_json_file(
         filename=os.path.join(load_dir, "max_value.json"))
     self._stoi = marisa_trie.Trie().mmap(
         os.path.join(load_dir, f"vocabulary_trie.marisa"))
     self._itos = lambda x: self._stoi.restore_key(x)
     self._record_trie = marisa_trie.RecordTrie(
         self._get_fmt_string(self._max_value)).mmap(
             os.path.join(load_dir, f"record_trie.marisa"))
Example #26
 def load_index(self, dictionary):
     if not dictionary.is_inmemory() and os.path.isfile(
             dictionary.get_path(PydicStemmer.INDEX_FILENAME)):
         index = marisa_trie.RecordTrie(PydicStemmer.MARISA_FORMAT)
         index.load(dictionary.get_path(PydicStemmer.INDEX_FILENAME))
     else:
         index = self.build_index(dictionary)
         if not dictionary.is_inmemory():
             index.save(dictionary.get_path(PydicStemmer.INDEX_FILENAME))
     return index
Example #27
 def build_index(self, dictionary):
     return marisa_trie.RecordTrie(
         PydicStemmer.MARISA_FORMAT,
         ifilter(
             lambda t: t[0].find(' ') == -1,
             izip(
                 imap(lambda i: dictionary.id_base(i)[::-1].lower(),
                      dictionary),
                 imap(lambda i: (i.id, ), dictionary),
             )))
Example #28
 def test_prefixes(self):
     trie = marisa_trie.RecordTrie(str("<H"), [
         ('foo', [1]),
         ('bar', [2]),
         ('foobar', [3]),
     ])
     assert trie.prefixes('foo') == ['foo']
     assert trie.prefixes('foobar') == ['foo', 'foobar']
     assert trie.prefixes('bara') == ['bar']
     assert trie.prefixes('f') == []
Example #29
def load_single_item_trie(file):
    """Load a marisa trie with integer values from memmap file.

    Args:
        file: marisa input file

    Returns: marisa trie
    """
    assert exists_dir(file)
    return marisa_trie.RecordTrie("<l").mmap(file)
Example #30
def save_tries(run_data):
    print('saving tries')
    paths = get_paths(run_data)

    if os.path.exists(paths['forward_counter']):
        os.remove(paths['forward_counter'])
    if os.path.exists(paths['backward_counter']):
        os.remove(paths['backward_counter'])

    print('\tcounting ngrams')
    with shelve.open(paths['forward_counter'], 'n') as forward_counter:
        with shelve.open(paths['backward_counter'], 'n') as backward_counter:
            with shelve.open(paths['emails'], 'r') as emails:
                partial_counters = map(
                    functools.partial(count_ngrams,
                                      ngram_length=run_data['ngram_length']),
                    emails.values())
                for i, (forward_partial,
                        backward_partial) in enumerate(partial_counters):
                    for k, v in forward_partial.items():
                        if k in forward_counter:
                            forward_counter[k] += v
                        else:
                            forward_counter[k] = v
                    for k, v in backward_partial.items():
                        if k in backward_counter:
                            backward_counter[k] += v
                        else:
                            backward_counter[k] = v
                    if i % 1000 == 0:
                        print('\tprocessed {} emails'.format(i))

            print('\tbuilding tries')
            for reverse in (False, True):
                if reverse:
                    path = paths['reverse_trie']
                    counter = backward_counter
                else:
                    path = paths['trie']
                    counter = forward_counter

                if os.path.exists(path):
                    os.remove(path)

                trie = marisa_trie.RecordTrie('I',
                                              ((k, (v, ))
                                               for k, v in counter.items()),
                                              order=marisa_trie.WEIGHT_ORDER)
                trie.save(path)

            print('saved tries')
            print('unique n-grams: {}'.format(len(forward_counter)))

    os.remove(paths['forward_counter'])
    os.remove(paths['backward_counter'])
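
The `order=marisa_trie.WEIGHT_ORDER` argument is forwarded to the underlying MARISA builder; as documented there, WEIGHT_ORDER (the default) arranges sibling nodes by descending weight while LABEL_ORDER sorts them by label. A minimal build sketch with made-up counts, assuming the same (count,) record layout as above:

import marisa_trie

counts = {u"the": 5000, u"then": 40, u"theme": 3}
trie = marisa_trie.RecordTrie('I', ((k, (v,)) for k, v in counts.items()),
                              order=marisa_trie.WEIGHT_ORDER)
print(trie.items(u"the"))   # all stored keys starting with "the", with their (count,) records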