class XlmEmbedding(TextEmbedding):
    """Text embedding that turns raw text into tokenizer vocabulary ids.

    The simplifier and tokenizer are not created in ``__init__``; they are
    loaded by :meth:`get_embedding_fn`. :meth:`embedding` and :meth:`size`
    read ``self.s`` / ``self.t`` and therefore only work after that call.
    """

    def __init__(self):
        # NOTE(review): the base-class initializer is not invoked here --
        # confirm that TextEmbedding needs no setup of its own.
        pass

    def get_embedding_fn(self, max_length=12):
        """Load the tokenizer resources and return the embedding callable.

        Args:
            max_length: Forwarded to ``Tokenizer`` as its third argument and
                stored on the instance as ``self.max_length``.

        Returns:
            The bound method :meth:`embedding`.
        """
        self.max_length = max_length
        self.s = Simplifier('tokenizer/zh_mapping.txt')
        self.t = Tokenizer(
            'tokenizer/spiece_all_bpe/spiece.all.bpe.130000.lower.model',
            'tokenizer/lg.all.voc',
            max_length,
        )
        return self.embedding

    def embedding(self, text):
        """Run the text through the preprocessing pipeline and return its ids.

        Pipeline: simplify -> tokenize -> strip accents -> map tokens to ids.

        Args:
            text: Input passed to ``Simplifier.simplify``.

        Returns:
            Whatever ``Tokenizer.token_to_id`` returns for the processed
            tokens.
        """
        simple = self.s.simplify(text)
        tokens = self.t.tokenize(simple)
        accents = run_strip_accents(tokens)
        return self.t.token_to_id(accents)

    def size(self):
        """Return the ``counts`` attribute of the tokenizer's dictionary.

        NOTE(review): presumably the vocabulary size -- confirm against the
        Tokenizer implementation.
        """
        return self.t.dico.counts

    @classmethod
    def get_feeder(cls):
        """Return the feeder class used with this embedding."""
        return DenseDataFeeder
# Example #2
# 0
# Script fragment: reads `infile` line by line, converts each line to token
# ids, runs them through the `doc_output` tensor and writes the quantized
# output vector next to the original line in `outfile`.
# `s`, `infile`, `outfile`, `doc_max_length`, `doc_ids`, `doc_mask`,
# `doc_type` and `doc_output` are defined outside this fragment.
t = Tokenizer('tokenizer/spiece_all_bpe/spiece.all.bpe.130000.lower.model',
              'tokenizer/lg.all.voc', doc_max_length)
count = 0  # number of lines processed so far
config = tf.ConfigProto()
# Let TensorFlow grow its GPU memory allocation on demand instead of
# reserving it all up front.
config.gpu_options.allow_growth = True
#f = ['a real test', 'b false test']
with open(infile, 'r', encoding='utf-8') as f, open(outfile,
                                                    'w',
                                                    encoding='utf-8') as fo:
    with tf.Session(config=config) as sess:
        # NOTE(review): this name shadows the stdlib `time` module if it is
        # imported; the value is not used within the visible lines.
        time = datetime.datetime.now()
        for line in f:
            # Same preprocessing chain as elsewhere:
            # simplify -> tokenize -> strip accents -> ids.
            simple = s.simplify(line)
            tokens = t.tokenize(simple)
            accents = run_strip_accents(tokens)
            ids = t.token_to_id(accents)

            l = len(ids)
            # Right-pad the ids with 0 up to doc_max_length; the mask is 1
            # for real tokens and 0 for padding; type ids are all 0.
            # NOTE(review): if l > doc_max_length the multiplier is negative,
            # which yields an empty list -- no padding and no truncation
            # happens here. Presumably the Tokenizer already caps the length
            # at doc_max_length; confirm.
            _ids = ids + [0] * (doc_max_length - l)
            _mask = [1] * l + [0] * (doc_max_length - l)
            _type = [0] * doc_max_length
            # Each input is wrapped in a list, i.e. fed as a batch of one.
            feed_dict = {
                doc_ids: [_ids],
                doc_mask: [_mask],
                doc_type: [_type],
            }
            result = sess.run(doc_output, feed_dict=feed_dict)
            # Take the single batch row, scale every value by 65536 (2**16),
            # round to an integer and render it as text.
            result = [str(int(round(r * 65536))) for r in result[0]]
            # Output row: original line (trailing whitespace stripped),
            # then the tab-separated quantized values.
            fo.write(line.rstrip() + '\t' + '\t'.join(result))
            fo.write('\n')
            count += 1