class XlmEmbedding(TextEmbedding):
    """Text embedding that converts raw text into XLM token ids.

    Pipeline: simplify (zh mapping) -> subword tokenize (SentencePiece BPE
    model) -> strip accents -> map tokens to vocabulary ids.
    """

    def __init__(self):
        # Heavy resources (tokenizer/simplifier) are loaded lazily in
        # get_embedding_fn, not here.
        pass

    def get_embedding_fn(self, max_length=12):
        """Load tokenizer resources and return the embedding callable.

        Args:
            max_length: maximum sequence length passed to the Tokenizer.

        Returns:
            self.embedding, a callable mapping text -> list of token ids.
        """
        # Fix: the original assigned self.max_length twice (before and
        # after constructing the Tokenizer); assign it exactly once.
        self.max_length = max_length
        self.s = Simplifier('tokenizer/zh_mapping.txt')
        self.t = Tokenizer(
            'tokenizer/spiece_all_bpe/spiece.all.bpe.130000.lower.model',
            'tokenizer/lg.all.voc',
            max_length,
        )
        return self.embedding

    def embedding(self, text):
        """Convert one text string to a list of token ids."""
        simple = self.s.simplify(text)
        tokens = self.t.tokenize(simple)
        accents = run_strip_accents(tokens)
        return self.t.token_to_id(accents)

    def size(self):
        """Return the tokenizer vocabulary size (dictionary counts)."""
        return self.t.dico.counts

    @classmethod
    def get_feeder(cls):
        """Return the data-feeder class compatible with this embedding."""
        return DenseDataFeeder
# Encode every line of `infile` with the doc encoder and write one
# tab-separated row per line to `outfile`:
#   <original line> \t <fixed-point feature_0> \t ... \t <feature_k>
# Relies on names defined earlier in this file (not visible in this chunk):
# s (Simplifier), infile, outfile, doc_max_length, and the TF graph tensors
# doc_ids / doc_mask / doc_type / doc_output.
t = Tokenizer('tokenizer/spiece_all_bpe/spiece.all.bpe.130000.lower.model', 'tokenizer/lg.all.voc', doc_max_length)
count = 0
config = tf.ConfigProto()
# Grow GPU memory on demand instead of grabbing it all up front.
config.gpu_options.allow_growth = True
#f = ['a real test', 'b false test']
with open(infile, 'r', encoding='utf-8') as f, open(outfile, 'w', encoding='utf-8') as fo:
    with tf.Session(config=config) as sess:
        # NOTE(review): `time` is not read anywhere in this chunk —
        # presumably used for progress/elapsed reporting after it; confirm.
        time = datetime.datetime.now()
        for line in f:
            # Same preprocessing chain as XlmEmbedding.embedding:
            # simplify -> tokenize -> strip accents -> ids.
            simple = s.simplify(line)
            tokens = t.tokenize(simple)
            accents = run_strip_accents(tokens)
            ids = t.token_to_id(accents)
            l = len(ids)
            # Right-pad ids/mask to doc_max_length with zeros.
            # NOTE(review): assumes len(ids) <= doc_max_length (i.e. the
            # Tokenizer truncates); if it can exceed it, the paddings below
            # go negative and _ids overruns the model input — confirm.
            _ids = ids + [0] * (doc_max_length - l)
            _mask = [1] * l + [0] * (doc_max_length - l)
            # Single-segment input: all token-type ids are 0.
            _type = [0] * doc_max_length
            feed_dict = {
                doc_ids: [_ids],
                doc_mask: [_mask],
                doc_type: [_type],
            }
            # Batch of one; result[0] is this line's feature vector.
            result = sess.run(doc_output, feed_dict=feed_dict)
            # Quantize floats to 16.16-style fixed point for compact output.
            result = [str(int(round(r * 65536))) for r in result[0]]
            fo.write(line.rstrip() + '\t' + '\t'.join(result))
            fo.write('\n')
            count += 1