def load_model(cls):
    """Load a previously trained fastText model from disk, if one exists.

    The path comes from the [train] model_path config entry plus a '.bin'
    suffix; on success the loaded model is cached on the class.
    """
    cfg = get_config()
    path = cfg.get('train', 'model_path') + '.bin'
    if not os.path.exists(path):
        # Nothing trained yet — leave the cached model untouched.
        return
    cls.__model = ft.load_model(path)
def train_model():
    """Train the classification model.

    Uses the segmented corpus provided by the dictionary helper as the
    training input and writes the model to the configured output path.
    """
    config = get_config()
    trainer = get_model()
    trainer.train(
        get_dictionary().get_corpus_path(),
        config.get('train', 'model_path'),
    )
def get_corpus_path(cls, sample=None):
    """Return the corpus path from config.

    When `sample` is truthy, return the sampled-corpus path; otherwise
    return the fully segmented corpus path.
    """
    config = get_config()
    option = 'sample_corpus_path' if sample else 'seg_corpus_path'
    return config.get('train', option)
def test_model():
    """Evaluate the model on the segmented test corpus.

    Prints precision, recall and the number of evaluated examples.
    """
    config = get_config()
    outcome = get_model().test(config.get('test', 'test_seg_corpus_path'))
    for label, value in (
        ('precision:', outcome.precision),
        ('recall:', outcome.recall),
        ('examples:', outcome.nexamples),
    ):
        print(label, value)
def __load_user_dict(cls):
    """Load the user dictionary into jieba and cache the module on the class.

    Reads a gzip-compressed word list (one word per line), builds an
    in-memory user dictionary in jieba's "word freq tag" line format and
    feeds it to jieba.load_userdict().
    """
    config = get_config()
    user_dict_path = config.get('train', 'user_dict_path')
    # Fix: open in text mode. The original binary-mode read yielded bytes,
    # so str.format() produced "b'word'" garbage entries under Python 3.
    # The with-block also guarantees the handle is closed on error.
    with gzip.open(user_dict_path, 'rt', encoding='utf-8') as gr:
        words = set(line.strip() for line in gr if line.strip())
    # jieba userdict format: "<word> <freq> <tag>". A frequency scaled by
    # word length discourages jieba from splitting long dictionary words.
    user_dict = ['{} {} n'.format(word, len(word) * 1000) for word in words]
    buff_file = StringIO('\n'.join(user_dict))
    jieba.load_userdict(buff_file)
    cls._jieba = jieba
def cut(cls, **kwargs):
    """Segment the corpus, or a single sentence, with jieba.

    Missing keyword arguments are filled in from the [train] config
    section. The user dictionary is loaded lazily on first use. When a
    'sentence' is supplied its segmentation is returned; otherwise the
    whole corpus is segmented (no return value).
    """
    config = get_config()
    defaults = {
        'corpus_path': config.get('train', 'corpus_path'),
        'seg_corpus_path': config.get('train', 'seg_corpus_path'),
        'sample_corpus_path': config.get('train', 'sample_corpus_path'),
        'vocabs_path': config.get('train', 'vocabs_path'),
        'sample': config.get('train', 'sample'),
        'sentence': '',
    }
    for key, value in defaults.items():
        kwargs.setdefault(key, value)
    if not cls._jieba:
        cls.__load_user_dict()
    if kwargs.get('sentence'):
        return cls.__cut_sentence(**kwargs)
    cls.__cut_corpus(**kwargs)
def train(cls, input_file, output, **kwargs):
    """Train a supervised fastText model and cache it on the class.

    * input_file         training file path (required)
    * output             output file path (required)
    * label_prefix       label prefix ['__label__']
    * lr                 learning rate [0.1]
    * lr_update_rate     change the rate of updates for the learning rate [100]
    * dim                size of word vectors [100]
    * ws                 size of the context window [5]
    * epoch              number of epochs [5]
    * min_count          minimal number of word occurences [1]
    * neg                number of negatives sampled [5]
    * word_ngrams        max length of word ngram [1]
    * loss               loss function {ns, hs, softmax} [softmax]
    * bucket             number of buckets [0]
    * minn               min length of char ngram [0]
    * maxn               max length of char ngram [0]
    * thread             number of threads [12]
    * t                  sampling threshold [0.0001]
    * silent             disable the log output from the C++ extension [1]
    * encoding           specify input_file encoding [utf-8]
    * pretrained_vectors pretrained word vectors (.vec file) for supervised learning []
    """
    config = get_config()
    # Any option the caller did not supply falls back to the [model]
    # section of the config file.
    for option in ('lr', 'lr_update_rate', 'dim', 'ws', 'epoch',
                   'word_ngrams', 'loss', 'bucket', 'thread', 'silent'):
        kwargs.setdefault(option, config.get('model', option))
    cls.__model = ft.supervised(input_file, output, **kwargs)
    return cls.__model