def __init__(self, infile, output_prefix):
    self.__infile = infile
    self.__output_prefix = output_prefix
    self.__ngram_line_regex = re.compile(NGRAM_LINE_REGEX)
    self.__ngram_entries = [{} for x in range(0, NGRAM)]
    self.__vocab_keyset = marisa.Keyset()
    self.__input_keyset = marisa.Keyset()
    self.__vocab_trie = marisa.Trie()
    self.__input_trie = marisa.Trie()
    self.__min_cost = 0.0
def open(self):
    if self._opened:
        raise RuntimeError('Tried to open an open Dict!')
    self._trie = marisa.Trie()
    self._trie.mmap(self._index_filename)
    # The index file stores the marisa trie followed by a JSON list mapping
    # key id -> byte offsets into the dict file; skip past the trie to read it.
    with open(self._index_filename, 'rb') as f:
        f.seek(self._trie.io_size())
        self._key_id_to_index = json.load(f)
    self._dict_file = open(self._dict_filename)
    self._dict_mm = mmap.mmap(self._dict_file.fileno(), 0,
                              access=mmap.ACCESS_READ)
    self._opened = True
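# Hedged usage sketch (not part of the original source): once open() has run, the
# mmapped trie, the JSON key_id -> offsets table, and the mmapped dict file can be
# combined to fetch the raw dictionary lines for a key. `lookup_lines`, `d` and
# `word` are illustrative names, not the project's API.
def lookup_lines(d, word):
    agent = marisa.Agent()
    agent.set_query(word)
    if not d._trie.lookup(agent):
        return []
    lines = []
    for offset in d._key_id_to_index[agent.key_id()]:
        d._dict_mm.seek(offset)  # offsets are byte positions in the dict file
        lines.append(d._dict_mm.readline())
    return lines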
def main():
    def output_ngram(f_out, n, context_dict, context_id_dicts):
        f_out.write('#%d\t%d\n' % (n, len(context_dict)))
        for context_key in sorted(context_dict.keys()):
            context_id_dicts[n][context_key] = len(context_id_dicts[n])
            first = context_key[0]
            if len(context_key) > 1:
                next_context_id = context_id_dicts[n - 1][tuple(context_key[1:])]
            else:
                next_context_id = 0
            f_out.write('%d\t' % (context_id_dicts[n][context_key]) +
                        '%d\t%d\t' % (first, next_context_id) +
                        '%f\t%f\t%s\n' % context_dict[context_key])
        return

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hi:o:t:')
    except getopt.GetoptError:
        usage()
        sys.exit(2)
    input_filename = ''
    output_filename_prefix = ''
    for k, v in opts:
        if k == '-h':
            usage()
            sys.exit()
        elif k == '-i':
            input_filename = v
        elif k == '-o':
            output_filename_prefix = v
    if input_filename == '' or output_filename_prefix == '':
        usage()
        sys.exit(2)

    f_in = codecs.open(input_filename, 'r', 'utf-8')
    f_out_text = codecs.open(output_filename_prefix + EXT_TEXT, 'w', 'utf-8')

    cur_n = 0
    pair_id = 1
    pair_dict = {}
    context_dict = {}
    context_id_dicts = [{}]
    keyset_key = marisa.Keyset()
    keyset_pair = marisa.Keyset()
    trie_key = marisa.Trie()
    trie_pair = marisa.Trie()
    agent = marisa.Agent()
    min_score = 99.0
    max_backoff = -99.0

    for line in f_in:
        line = line.rstrip('\n')
        if not line:
            continue
        if line[0] == '\\':
            m = re.search(r'^\\(\d+)-grams:', line)
            # A new "\N-grams:" header or "\end\" closes the section being read.
            if cur_n and (m or re.search(r'^\\end\\', line)):
                if cur_n == 1:
                    # The 1-gram section defines the vocabulary; build the tries
                    # now so later sections can map pairs to key ids.
                    trie_key.build(keyset_key)
                    trie_key.save(output_filename_prefix + EXT_KEY)
                    trie_pair.build(keyset_pair)
                    trie_pair.save(output_filename_prefix + EXT_PAIR)
                    for k, v in pair_dict.items():
                        context_dict[(to_id(trie_pair, agent, k),)] = v
                output_ngram(f_out_text, cur_n, context_dict, context_id_dicts)
            if m:
                cur_n = int(m.group(1))
                context_dict = {}
                context_id_dicts.append({})
                print('Processing %d-gram...' % cur_n)
            continue
        if cur_n == 0:
            continue
        fields = line.split('\t')
        if len(fields) < 2:
            continue
        if len(fields) == 2:
            # No backoff weight given; use the conventional -99 placeholder.
            fields.append('-99')
        score = float(fields[0])
        backoff = float(fields[2])
        if score < min_score and score > -99:
            min_score = score
        if backoff > max_backoff:
            max_backoff = backoff
        if cur_n == 1:
            k = fields[1].encode('utf-8')
            keyset_pair.push_back(k)
            pair = k.split(PAIR_SEPARATOR, 1)
            keyset_key.push_back(pair[0])
            pair_dict[k] = (float(fields[0]), float(fields[2]), fields[1])
        else:
            ngram = [to_id(trie_pair, agent, x.encode('utf-8'))
                     for x in reversed(fields[1].split(' '))]
            context_dict[tuple(ngram)] = (float(fields[0]), float(fields[2]),
                                          fields[1])
    f_in.close()
    f_out_text.close()
    print('Done.')
    print('min_score = %f, max_backoff = %f' % (min_score, max_backoff))
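# to_id() is referenced above but not shown in this excerpt. A minimal sketch
# consistent with how it is called (and with the lookup pattern used elsewhere in
# this code) might look like the following; this is an assumption, not the
# original helper.
def to_id(trie, agent, key):
    agent.set_query(key)
    trie.lookup(agent)
    return agent.key_id()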
scount = 1  # setting 1 as minimum
print("\nSum of 1-gram:", scount)
if factor > 1:
    scount = int(scount / factor)
    print("Normalized sum of 1-gram:", scount)
if scount > 2**31:
    print("Trouble: sum of 1-grams doesn't fit INT32. Please normalize the data "
          "manually or automatically by increasing threshold for counts")
    sys.exit(-1)
print()

# save ngrams
print('Saving in Marisa format')
trie = marisa.Trie()
trie.build(keyset)
trie.save(os.path.join(args.output, "ngrams.trie"))
print("Keys: ", trie.num_keys(), "\n")

# arr[0] holds the (normalized) 1-gram sum; arr[key_id + 1] holds each key's count.
arr = np.zeros(trie.num_keys() + 1, dtype=np.int32)
arr[0] = scount
agent = marisa.Agent()
for k in data:
    agent.set_query(k)
    trie.lookup(agent)
    arr[agent.key_id() + 1] = int(data[k] / factor)
binwrite = open(os.path.join(args.output, 'ngrams.counts'), 'wb')
arr.tofile(binwrite)
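# Hedged read-back sketch (not part of the original source): the files written
# above can be reopened to recover a single n-gram count. `count_of` and `ngram`
# are illustrative names.
def count_of(output_dir, ngram):
    trie = marisa.Trie()
    trie.mmap(os.path.join(output_dir, "ngrams.trie"))
    counts = np.fromfile(os.path.join(output_dir, "ngrams.counts"), dtype=np.int32)
    agent = marisa.Agent()
    agent.set_query(ngram)
    if not trie.lookup(agent):
        return 0
    return int(counts[agent.key_id() + 1])  # slot 0 holds the 1-gram sum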
def build(in_filename, in_file, logger=None,
          out_index_filename=qingfanyi.dict.INDEX_FILENAME,
          out_dict_filename=qingfanyi.dict.DICT_FILENAME):
    """
    Takes a dictionary file in CEDICT format and generates an index for use
    with qingfanyi.

    :param in_filename: Name of input CEDICT file
    :param in_file: IO object for dictionary file
    :param logger: Function accepting a string, used for logging.
    """
    if not logger:
        logger = qingfanyi.debug

    logger('Building from %s' % in_filename)
    logger('Writing dict ...')
    _ensure_index_dir()

    ks = marisa.Keyset()
    with open(out_dict_filename, 'wb') as dict_out:
        index = 0
        # trie_keys = []
        num_keys = 0
        trie_values = {}
        lineno = 0
        for line in in_file:
            lineno += 1
            if line.startswith(b'#'):
                continue
            if not line.strip():
                continue
            keys = _extract_keys(line, lineno)
            for key_bytes in keys:
                key_str = str(key_bytes, 'utf-8')
                ks.push_back(key_str)
                # trie_keys.append(key_str)
                trie_values.setdefault(key_str, []).append(index)
            dict_out.write(line)
            index += len(line)

    logger('Writing index ...')
    trie = marisa.Trie()
    trie.build(ks)
    trie.save(out_index_filename)

    key_id_to_index = [0] * ks.num_keys()
    for i in range(0, ks.num_keys()):
        key_id = ks.key_id(i)
        key_str = ks.key_str(i)
        key_id_to_index[key_id] = list(set(trie_values[key_str]))

    with open(out_index_filename, mode='a') as f:
        json.dump(key_id_to_index, f, separators=(',', ':'))

    logger('Done!')
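# Hedged invocation sketch (an assumption, not the project's actual entry point):
# build() iterates byte lines (note the b'#' check), so the CEDICT file should be
# opened in binary mode.
if __name__ == '__main__':
    import sys
    with open(sys.argv[1], 'rb') as cedict:
        build(sys.argv[1], cedict)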