# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-21 15:39
import tensorflow as tf

from hanlp.components.tok import NgramConvTokenizer
from hanlp.datasets.cws.sighan2005.msr import SIGHAN2005_MSR_TRAIN, SIGHAN2005_MSR_VALID, SIGHAN2005_MSR_TEST
from hanlp.pretrained.word2vec import CONVSEG_W2V_NEWS_TENSITE_CHAR
from tests import cdroot

cdroot()
tokenizer = NgramConvTokenizer()
save_dir = 'data/model/cws/convseg-msr-nocrf-noembed'
# Train a convolutional n-gram tokenizer on the SIGHAN 2005 MSR corpus,
# initializing trainable character embeddings from the ConvSeg word2vec vectors.
tokenizer.fit(SIGHAN2005_MSR_TRAIN,
              SIGHAN2005_MSR_VALID,
              save_dir,
              word_embed={'class_name': 'HanLP>Word2VecEmbedding',
                          'config': {
                              'trainable': True,
                              'filepath': CONVSEG_W2V_NEWS_TENSITE_CHAR,
                              'expand_vocab': False,
                              'lowercase': False,
                          }},
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.001,
                                                 epsilon=1e-8,
                                                 clipnorm=5),
              epochs=100,
              window_size=0,
              metrics='f1')
# Evaluate on the held-out MSR test split (assumed intent; SIGHAN2005_MSR_TEST
# is imported above but was otherwise unused).
tokenizer.evaluate(SIGHAN2005_MSR_TEST, save_dir=save_dir, output=False)
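# A minimal usage sketch, assuming the HanLP 2.x TensorFlow API in which a
# fitted component can be restored from its save_dir via load() and then used
# for segmentation. The sample sentences and the expected output in the
# trailing comment are illustrative assumptions, not recorded results.
from hanlp.components.tok import NgramConvTokenizer

loaded = NgramConvTokenizer()
loaded.load('data/model/cws/convseg-msr-nocrf-noembed')
print(loaded.tokenize(['研究生命起源', '商品和服务']))  # e.g. [['研究', '生命', '起源'], ['商品', '和', '服务']]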
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-29 21:58
import tensorflow as tf

from hanlp.components.tok import NgramConvTokenizer
from hanlp.datasets.cws.ctb import CTB6_CWS_TRAIN, CTB6_CWS_VALID, CTB6_CWS_TEST
from hanlp.pretrained.word2vec import CONVSEG_W2V_NEWS_TENSITE_CHAR
from tests import cdroot

cdroot()
tokenizer = NgramConvTokenizer()
save_dir = 'data/model/cws/ctb6_cws'
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001,
                                     epsilon=1e-8,
                                     clipnorm=5)
# Train on CTB6 with trainable ConvSeg character embeddings and weight normalization.
tokenizer.fit(CTB6_CWS_TRAIN,
              CTB6_CWS_VALID,
              save_dir,
              word_embed={'class_name': 'HanLP>Word2VecEmbedding',
                          'config': {
                              'trainable': True,
                              'filepath': CONVSEG_W2V_NEWS_TENSITE_CHAR,
                              'expand_vocab': False,
                              'lowercase': False,
                          }},
              optimizer=optimizer,
              window_size=0,
              weight_norm=True)
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir, output=False)
# Demo sentences: "The Central National Orchestra left Beijing for Vienna",
# "goods and services" (a classic segmentation-ambiguity example).
print(tokenizer.tokenize(['中央民族乐团离开北京前往维也纳', '商品和服务']))
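# A minimal sketch, assuming HanLP 2.x's hanlp.load() also accepts a local
# save_dir containing a trained component's meta.json: reload the CTB6 model
# for downstream use without reconstructing the component by hand.
import hanlp

ctb6_tokenizer = hanlp.load('data/model/cws/ctb6_cws')
print(ctb6_tokenizer('商品和服务'))  # calling the loaded component segments a single sentence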
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-21 15:39
import tensorflow as tf

from hanlp.components.tok import NgramConvTokenizer
from hanlp.pretrained.word2vec import RADICAL_CHAR_EMBEDDING_100
from tests import cdroot

cdroot()
tokenizer = NgramConvTokenizer()
save_dir = 'data/model/cws/pku98_6m_conv_ngram'
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001,
                                     epsilon=1e-8,
                                     clipnorm=5)
# Train on six months of the PKU98 corpus with frozen radical-character
# embeddings; expand_vocab=True extends the embedding vocabulary to cover
# characters seen in the training data.
tokenizer.fit('data/cws/pku98/199801-06-seg.txt',
              'data/cws/pku98/test_pku98_name_merged.txt',
              save_dir,
              word_embed={'class_name': 'HanLP>Word2VecEmbedding',
                          'config': {
                              'trainable': False,
                              'filepath': RADICAL_CHAR_EMBEDDING_100,
                              'expand_vocab': True,
                              'lowercase': False,
                          }},
              optimizer=optimizer,
              window_size=0,
              weight_norm=True)
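# A hedged follow-up sketch, mirroring the CTB6 script above: score the
# freshly trained PKU98 model and run a quick segmentation demo. Reusing the
# name-merged file passed as the dev set for evaluation is an assumption;
# substitute a dedicated test file if one exists.
tokenizer.evaluate('data/cws/pku98/test_pku98_name_merged.txt', save_dir=save_dir, output=False)
print(tokenizer.tokenize(['中央民族乐团离开北京前往维也纳', '商品和服务']))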