Code example #1
	def encode(self, seq_list):
		"""Encode a batch of token sequences into sentence embeddings.

		Returns the ndarray produced by the session run, or None if any
		step raises (the failure is logged at debug level).
		"""
		try:
			batch_embedded, batch_lengths = Helper.get_batch(seq_list)
			inputs = {
				self.s_embedded: batch_embedded,
				self.s_lengths: batch_lengths,
			}
			return self.sess.run(self.s_embeddings, feed_dict = inputs)
		except Exception as e:
			logger.get().debug('seq_length=%s, errmsg=%s', len(seq_list), e)
	

def test():
	Helper.init()
	with codecs.open('./data/test.txt', 'r', 'utf-8') as in_f:
		corpus = [line.strip('\n') for line in in_f.readlines()]
	corpus = [['<s>'] + [word for word in NLPUtil.tokenize_via_jieba(sent)
		if word in Helper._word2vec] + ['</s>'] for sent in corpus]
	s_encoder = Encoder()	
	s_embeddings = s_encoder.encode(corpus)
	print s_embeddings.shape
	print s_embeddings.dtype
	print s_embeddings[0]


if __name__ == '__main__':
	# Script entry point: set up file logging, then run the smoke test.
	logger.start('./log/encode.log', name = __name__, level = 'DEBUG')
	test()
Code example #2 — file: reader.py, project: zxk1234/Rhetoric-Generator
    fw = codecs.open(output, 'w', 'utf-8')
    with codecs.open(input, 'r', 'utf-8') as fr:
        for line in fr:
            lines = line.strip().split('\t')
            lines[2] = str(dic[lines[2]])
            fw.write('\t'.join(lines) + '\n')
    fw.close()


def dump_word_embeddings(word2id, emb_size=300,
                         model_path='../data2/word2vec.model',
                         out_path='../data2/word2vec_new.model'):
    """Build and save an embedding matrix aligned with a vocabulary.

    Args:
        word2id: dict mapping word -> row index in the output matrix.
        emb_size: embedding dimensionality (must match the loaded model).
        model_path: path of the trained Word2Vec model to load.
        out_path: destination passed to np.save ('.npy' suffix is appended).

    Words present in the pretrained model get their vector; out-of-vocabulary
    words keep a random-normal initialization.
    """
    vocab_size = len(word2id)

    word2vec = Word2Vec.load(model_path)
    # The random-normal init doubles as the fallback for words missing from
    # the pretrained model, so no per-word else branch is needed.
    embeddings = np.random.randn(vocab_size, emb_size)
    for word, idx in word2id.items():
        if word in word2vec:
            embeddings[idx, :] = word2vec[word]
    print(embeddings.shape)
    np.save(out_path, embeddings)


if __name__ == '__main__':
    # Script entry point: start file logging, then rebuild the vocabulary.
    g_log_inst.start('../log/reader.log', __name__, 'DEBUG')
    save_vocab()
    # One-off preprocessing steps, kept disabled for reference.
    #data_process('../data/train.txt', '../data/trainset.txt')
    #data_process('../data/test.txt', '../data/testset.txt')
Code example #3
            for k, v in cls._replace_pattern_cfg.items():
                if v.match(token):
                    token = k
                    break
            if '{[' not in token:
                return token
            for item in cls._wordseg_pattern_cfg:
                token = item.sub('', token)
            return token
        except Exception as e:
            logger.get().warn('token=%s, errmsg=%s' % (token, e))
            return token


if '__main__' == __name__:
    logger.start('./log/test.log', __name__, 'DEBUG')

    # One-off UTF-8 -> GBK file conversion, kept disabled for reference.
    in_fpath = './data/question.raw'
    out_fpath = './data/question.raw.gbk'
    #NLPUtil.conv_fenc_u8_to_gbk(in_fpath, out_fpath)

    # One-off token-frequency statistics, kept disabled for reference.
    in_fpath = './data/question.seg.u8'
    out_fpath = './data/vocab.txt'
    #NLPUtil.stat_token_freq(in_fpath, out_fpath)

    # Sample messages (Chinese medical-QA text) for ad-hoc tokenizer checks.
    msgs = [
        u'携带乙肝病毒可以母乳喂养吗',
        u'做糖筛是不是又要打B超哦',
        u'这个crp偏高是怎么回事, 12mg, 12ml, 12mml, 11kg, 11kcal, 11k, 11kj',
        u'b 你好 乳头内陷要怎么母乳',
    ]
Code example #4
                save_path = saver.save(session, '%s/model.ckpt' % (ckpt_dir))
                g_log_inst.get().info('[model] save success, ckpt_path=%s' %
                                      (save_path))

        # test the accuracy
        test_perplexity, accuracy, domain_accuracy = run_epoch(
            session,
            mtest,
            test_data,
            tf.no_op(),
            debug=True,
            verbose=True,
            id2word_dict=id2word_dict,
            dsl_converter=config.converter)
        g_log_inst.get().info('Test: perplexity=%.3f, accuracy=%s' %
                              (test_perplexity, accuracy))

        # acc compute
        '''
        for idx, domain_accu in enumerate(domain_accuracy):
            g_log_inst.get().info('Domain: %s, precision: %.3f, recall: %.3f' % (
            config.converter.label2domain[idx], domain_accuracy[idx][0] / float(domain_accuracy[idx][1]),
            domain_accuracy[idx][2] / float(domain_accuracy[idx][3])))
        '''
    g_log_inst.get().info('bilstm_attention training finished')


if __name__ == '__main__':
    g_log_inst.start('../log/train.log', __name__, 'DEBUG')
    # TF1-style entry point: tf.app.run() parses command-line flags and then
    # invokes the module-level main().
    tf.app.run()