# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 22:22
from hanlp.components.tok import TransformerTokenizer
from hanlp.datasets.cws.ctb import CTB6_CWS_TRAIN, CTB6_CWS_VALID, CTB6_CWS_TEST
from tests import cdroot

# Train an ALBERT-based Chinese word segmenter on CTB6, then reload the
# saved checkpoint, run a sanity-check prediction and score it on the test set.
cdroot()

tokenizer = TransformerTokenizer()
save_dir = 'data/model/cws_bert_albert_ctb6'
tokenizer.fit(CTB6_CWS_TRAIN, CTB6_CWS_VALID, save_dir,
              transformer='albert_base_zh',
              max_seq_length=150,
              metrics='f1',
              learning_rate=5e-5,
              epochs=3)
# Reload the best checkpoint from disk before inference/evaluation.
tokenizer.load(save_dir)
samples = ['中央民族乐团离开北京前往维也纳', '商品和服务']
print(tokenizer.predict(samples))
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-21 15:39
from hanlp.components.tok import TransformerTokenizer
from hanlp.datasets.cws.sighan2005.msr import SIGHAN2005_MSR_TRAIN, SIGHAN2005_MSR_VALID, SIGHAN2005_MSR_TEST
from tests import cdroot

# Train a BERT-base word segmenter on the SIGHAN 2005 MSR corpus and
# evaluate it on the held-out test split.
cdroot()

tokenizer = TransformerTokenizer()
save_dir = 'data/model/cws_bert_base_msra'
tokenizer.fit(SIGHAN2005_MSR_TRAIN, SIGHAN2005_MSR_VALID, save_dir,
              transformer='chinese_L-12_H-768_A-12',
              metrics='f1')
# tokenizer.load(save_dir)
samples = ['中央民族乐团离开北京前往维也纳', '商品和服务']
print(tokenizer.predict(samples))
tokenizer.evaluate(SIGHAN2005_MSR_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 22:22
from hanlp.components.tok import TransformerTokenizer
from hanlp.datasets.cws.ctb import CTB6_CWS_TRAIN, CTB6_CWS_VALID, CTB6_CWS_TEST
from tests import cdroot

# Evaluate a previously trained BERT-base CTB6 segmenter: load the saved
# model, run a sanity-check prediction and score it on the test set.
cdroot()

tokenizer = TransformerTokenizer()
save_dir = 'data/model/cws_bert_base_ctb6'
# Training is disabled here; uncomment to retrain from scratch.
# (Fixed stale name: the variable in this script is `tokenizer`, not `tagger`.)
# tokenizer.fit(CTB6_CWS_TRAIN, CTB6_CWS_VALID, save_dir, transformer='bert-base-chinese',
#               metrics='f1')
tokenizer.load(save_dir)
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 22:22
from hanlp.components.tok import TransformerTokenizer
from hanlp.datasets.cws.ctb import CTB6_CWS_TRAIN, CTB6_CWS_VALID, CTB6_CWS_TEST
from tests import cdroot

# Train a CTB6 word segmenter from a locally converted ALBERT (TF2)
# checkpoint, then reload it and report test-set scores.
cdroot()

tokenizer = TransformerTokenizer()
save_dir = 'data/model/cws_bert_albert_ctb6'
tokenizer.fit(
    CTB6_CWS_TRAIN,
    CTB6_CWS_VALID,
    save_dir,
    # Absolute path to a machine-local ALBERT checkpoint — not portable.
    transformer='/home/ubuntu/hankcs/laser/data/transformer/albert_base_tf2',
    metrics='f1',
    learning_rate=5e-5,
    epochs=3)
# Reload the best checkpoint from disk before inference/evaluation.
tokenizer.load(save_dir)
samples = ['中央民族乐团离开北京前往维也纳', '商品和服务']
print(tokenizer.predict(samples))
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-21 15:39
from hanlp.components.tok import TransformerTokenizer
from hanlp.datasets.cws.ctb import CTB6_CWS_VALID, CTB6_CWS_TEST
from tests import cdroot

# Train a BERT-base segmenter on a large local corpus (data/cws/large/all.txt),
# validating against CTB6. Training monitors accuracy (cheaper on a huge
# corpus); the model is reloaded with f1 for final evaluation.
cdroot()

tokenizer = TransformerTokenizer()
save_dir = 'data/model/cws_bert_base_100million'
tokenizer.fit('data/cws/large/all.txt', CTB6_CWS_VALID, save_dir,
              transformer='bert-base-chinese',
              metrics='accuracy',
              batch_size=32)
# Switch the metric to f1 when reloading for evaluation.
tokenizer.load(save_dir, metrics='f1')
samples = ['中央民族乐团离开北京前往维也纳', '商品和服务']
print(tokenizer.predict(samples))
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-21 15:39
from hanlp.components.tok import TransformerTokenizer
from hanlp.datasets.cws.sighan2005.msr import SIGHAN2005_MSR_TRAIN, SIGHAN2005_MSR_VALID, SIGHAN2005_MSR_TEST
from tests import cdroot

# Train a BERT-base word segmenter on the SIGHAN 2005 MSR corpus and
# evaluate it on the held-out test split.
cdroot()

tokenizer = TransformerTokenizer()
save_dir = 'data/model/cws_bert_base_msra'
tokenizer.fit(SIGHAN2005_MSR_TRAIN, SIGHAN2005_MSR_VALID, save_dir,
              transformer='bert-base-chinese',
              metrics='f1')
# (Fixed stale name: the variable in this script is `tokenizer`, not `tagger`.)
# tokenizer.load(save_dir)
# Use predict() for the sanity check, consistent with the sibling scripts
# (this script originally called tokenize()).
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
tokenizer.evaluate(SIGHAN2005_MSR_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 22:22
from hanlp.components.tok import TransformerTokenizer
from hanlp.datasets.cws.ctb import CTB6_CWS_TRAIN, CTB6_CWS_VALID, CTB6_CWS_TEST
from tests import cdroot

# Quick single-epoch training run of a BERT-base segmenter on CTB6,
# followed by a sanity-check prediction and test-set evaluation.
cdroot()

tokenizer = TransformerTokenizer()
save_dir = 'data/model/cws_bert_base_ctb6'
tokenizer.fit(CTB6_CWS_TRAIN, CTB6_CWS_VALID, save_dir,
              transformer='chinese_L-12_H-768_A-12',
              epochs=1,
              metrics='f1')
# tokenizer.load(save_dir)
samples = ['中央民族乐团离开北京前往维也纳', '商品和服务']
print(tokenizer.predict(samples))
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 22:22
from hanlp.components.tok import TransformerTokenizer
from hanlp.datasets.cws.ctb import CTB6_CWS_TRAIN, CTB6_CWS_VALID, CTB6_CWS_TEST
from tests import cdroot

# Train a BERT-base word segmenter on CTB6 and evaluate it on the
# held-out test split.
cdroot()

tokenizer = TransformerTokenizer()
save_dir = 'data/model/cws_bert_base_ctb6'
tokenizer.fit(CTB6_CWS_TRAIN, CTB6_CWS_VALID, save_dir,
              transformer='bert-base-chinese',
              metrics='f1')
# tokenizer.load(save_dir)
samples = ['中央民族乐团离开北京前往维也纳', '商品和服务']
print(tokenizer.predict(samples))
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')