Example #1
    def __init__(self, infile, output_prefix):
        self.__infile = infile
        self.__output_prefix = output_prefix
        self.__ngram_line_regex = re.compile(NGRAM_LINE_REGEX)

        # One entry dict per n-gram order.
        self.__ngram_entries = [{} for _ in range(NGRAM)]

        # Keysets collect keys; the tries are built from them later.
        self.__vocab_keyset = marisa.Keyset()
        self.__input_keyset = marisa.Keyset()

        self.__vocab_trie = marisa.Trie()
        self.__input_trie = marisa.Trie()

        self.__min_cost = 0.0
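
Example #1 only initializes these containers; the flow they feed, which recurs throughout the examples below, is: push keys into a Keyset, build a Trie from it, then query through an Agent. A minimal standalone sketch of that flow (the key strings are made up for illustration):

import marisa

keyset = marisa.Keyset()
for word in ('apple', 'apply', 'banana'):  # illustrative keys
    keyset.push_back(word)

trie = marisa.Trie()
trie.build(keyset)

# Exact-match lookup: on success the agent carries the matching key's ID.
agent = marisa.Agent()
agent.set_query('apple')
if trie.lookup(agent):
    print('apple ->', agent.key_id())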
Example #2
    def open(self):
        if self._opened:
            raise RuntimeError('Tried to open an open Dict!')

        self._trie = marisa.Trie()
        self._trie.mmap(self._index_filename)

        # The JSON key-ID-to-offset mapping is appended after the trie
        # image, so seek past the trie before loading it.
        with open(self._index_filename, 'rb') as f:
            f.seek(self._trie.io_size())
            self._key_id_to_index = json.load(f)

        self._dict_file = open(self._dict_filename, 'rb')
        self._dict_mm = mmap.mmap(self._dict_file.fileno(),
                                  0,
                                  access=mmap.ACCESS_READ)
        self._opened = True
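
open() mmaps the trie and loads a key-ID-to-offset mapping that is appended after it (Example #5 below shows the writer side). A hedged sketch of how a lookup could then combine the two; the method name _lookup and the record-reading details are assumptions, not code from the original project:

    def _lookup(self, key):
        # Hypothetical helper: resolve a key to its trie ID, then read the
        # referenced lines out of the mmap'd dict file via the byte offsets
        # stored in _key_id_to_index.
        agent = marisa.Agent()
        agent.set_query(key)
        if not self._trie.lookup(agent):
            return []
        entries = []
        for offset in self._key_id_to_index[agent.key_id()]:
            self._dict_mm.seek(offset)
            entries.append(self._dict_mm.readline())
        return entries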
Example #3
def main():
    def output_ngram(f_out, n, context_dict, context_id_dicts):
        f_out.write('#%d\t%d\n' % (n, len(context_dict)))

        for context_key in sorted(context_dict.keys()):
            # Assign sequential context IDs in sorted key order.
            context_id_dicts[n][context_key] = len(context_id_dicts[n])
            if len(context_key) > 1:
                # ID of the (n-1)-gram entry for the shorter context.
                next_context_id = context_id_dicts[n - 1][tuple(
                    context_key[1:])]
            else:
                next_context_id = 0

            f_out.write('%d\t' % (context_id_dicts[n][context_key]) +
                        '%d\t%d\t' % (context_key[0], next_context_id) +
                        '%f\t%f\t%s\n' % context_dict[context_key])

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hi:o:t:')
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    input_filename = ''
    output_filename_prefix = ''

    for k, v in opts:
        if k == '-h':
            usage()
            sys.exit()
        elif k == '-i':
            input_filename = v
        elif k == '-o':
            output_filename_prefix = v

    if input_filename == '' or output_filename_prefix == '':
        usage()
        sys.exit(2)

    f_in = codecs.open(input_filename, 'r', 'utf-8')
    f_out_text = codecs.open(output_filename_prefix + EXT_TEXT, 'w', 'utf-8')

    cur_n = 0
    pair_id = 1
    pair_dict = {}
    context_dict = {}
    context_id_dicts = [{}]
    keyset_key = marisa.Keyset()
    keyset_pair = marisa.Keyset()
    trie_key = marisa.Trie()
    trie_pair = marisa.Trie()
    agent = marisa.Agent()
    min_score = 99.0
    max_backoff = -99.0

    for line in f_in:
        line = line.rstrip('\n')
        if not line:
            continue

        if line[0] == '\\':
            # Section headers look like "\1-grams:", "\2-grams:", ..., "\end\".
            m = re.search(r'^\\(\d+)-grams:', line)
            if cur_n and (m or re.search(r'^\\end\\', line)):
                if cur_n == 1:
                    # All unigrams are known; build and save both tries now.
                    trie_key.build(keyset_key)
                    trie_key.save(output_filename_prefix + EXT_KEY)
                    trie_pair.build(keyset_pair)
                    trie_pair.save(output_filename_prefix + EXT_PAIR)
                    for k, v in pair_dict.items():
                        context_dict[(to_id(trie_pair, agent, k), )] = v

                output_ngram(f_out_text, cur_n, context_dict, context_id_dicts)

            if m:
                cur_n = int(m.group(1))
                context_dict = {}
                context_id_dicts.append({})
                print('Processing %d-gram...' % cur_n)

            continue

        if cur_n == 0:
            continue

        fields = line.split('\t')
        if len(fields) < 2:
            continue

        if len(fields) == 2:
            fields.append('-99')

        score = float(fields[0])
        backoff = float(fields[2])
        if score < min_score and score > -99:
            min_score = score
        if backoff > max_backoff:
            max_backoff = backoff

        if cur_n == 1:
            k = fields[1]
            keyset_pair.push_back(k)
            pair = k.split(PAIR_SEPARATOR, 1)
            keyset_key.push_back(pair[0])
            pair_dict[k] = (float(fields[0]), float(fields[2]), fields[1])
        else:
            # Higher-order n-grams are keyed by their pair IDs, in reverse.
            ngram = [
                to_id(trie_pair, agent, x)
                for x in reversed(fields[1].split(' '))
            ]
            context_dict[tuple(ngram)] = (float(fields[0]), float(fields[2]),
                                          fields[1])

    f_in.close()
    f_out_text.close()
    print('Done.')
    print('min_score = %f, max_backoff = %f' % (min_score, max_backoff))
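
The converter calls a to_id helper that is not part of the snippet. Judging from how trie_pair and agent are passed around, a plausible, purely hypothetical reconstruction:

def to_id(trie, agent, key):
    # Hypothetical helper: map a key string to its marisa key ID.
    agent.set_query(key)
    trie.lookup(agent)
    return agent.key_id()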
Example #4
scount = 1  # setting 1 as minimum
    
print("\nSum of 1-gram:", scount)
if factor > 1:
    scount = int(scount/factor)
    print("Normalized sum of 1-gram:", scount)
    
if scount >= 2**31:
    print("Trouble: the 1-gram sum does not fit in an INT32. Please normalize "
          "the data manually, or automatically by raising the count threshold.")
    sys.exit(-1)

print()

# save ngrams
print('Saving in Marisa format')
trie = marisa.Trie()
trie.build(keyset)
trie.save(os.path.join(args.output, "ngrams.trie"))

print("Keys: ", trie.num_keys(), "\n")

arr = np.zeros(trie.num_keys() + 1, dtype=np.int32)
arr[0] = scount  # slot 0 holds the normalized 1-gram sum
agent = marisa.Agent()
for k in data:
    # Store each count at the slot addressed by its marisa key ID.
    agent.set_query(k)
    trie.lookup(agent)
    arr[agent.key_id() + 1] = int(data[k] / factor)

with open(os.path.join(args.output, 'ngrams.counts'), 'wb') as binwrite:
    arr.tofile(binwrite)
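
The two files written above form a pair: a key's ID in ngrams.trie indexes the int32 array in ngrams.counts, with slot 0 holding the normalized 1-gram sum. A hedged sketch of the matching reader; the file names and args.output come from the snippet above, everything else is assumed:

import os

import marisa
import numpy as np

trie = marisa.Trie()
trie.mmap(os.path.join(args.output, 'ngrams.trie'))

counts = np.fromfile(os.path.join(args.output, 'ngrams.counts'),
                     dtype=np.int32)
total = counts[0]  # slot 0: the normalized 1-gram sum

agent = marisa.Agent()
agent.set_query('some ngram')  # illustrative query
if trie.lookup(agent):
    print(counts[agent.key_id() + 1], 'out of', total)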
Example #5
def build(in_filename,
          in_file,
          logger=None,
          out_index_filename=qingfanyi.dict.INDEX_FILENAME,
          out_dict_filename=qingfanyi.dict.DICT_FILENAME):
    """
    Takes a dictionary file in CEDICT format and generates an index for use with
    qingfanyi.

    :param in_filename: Name of input CEDICT file
    :param in_file: IO object for dictionary file
    :param logger: Function accepting a string, used for logging.
    """

    if not logger:
        logger = qingfanyi.debug

    logger('Building from %s' % in_filename)

    logger('Writing dict ...')
    _ensure_index_dir()

    ks = marisa.Keyset()

    with open(out_dict_filename, 'wb') as dict_out:
        index = 0
        trie_values = {}
        lineno = 0
        for line in in_file:
            lineno += 1
            # Skip comments and blank lines.
            if line.startswith(b'#'):
                continue
            if not line.strip():
                continue
            keys = _extract_keys(line, lineno)
            for key_bytes in keys:
                key_str = str(key_bytes, 'utf-8')
                ks.push_back(key_str)
                # A key can occur on several lines; record every byte offset.
                trie_values.setdefault(key_str, []).append(index)
            dict_out.write(line)
            index += len(line)

    logger('Writing index ...')
    trie = marisa.Trie()
    trie.build(ks)

    trie.save(out_index_filename)

    # After build(), the keyset knows each key's ID; invert that into a
    # key-ID-to-byte-offset table.
    key_id_to_index = [0] * ks.num_keys()
    for i in range(ks.num_keys()):
        key_id = ks.key_id(i)
        key_str = ks.key_str(i)
        key_id_to_index[key_id] = list(set(trie_values[key_str]))

    # Append the JSON mapping after the trie image in the same file.
    with open(out_index_filename, mode='a') as f:
        json.dump(key_id_to_index, f, separators=(',', ':'))

    logger('Done!')
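
Example #2 above is the matching read path: its open() mmaps the saved trie, seeks past trie.io_size() in the same index file, and loads the appended JSON key_id_to_index mapping before mmap'ing the dict file.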