def train(config=None): """ 模型训练主入口 pkuseg.train(trainFile, testFile, savedir, train_iter = 20, init_model = None) trainFile 训练文件路径。文件格式为多行文本 testFile 测试文件路径。 savedir 训练模型的保存路径。 train_iter 训练轮数。 init_model 初始化模型,默认为None表示使用默认初始化,用户可以填自己想要初始化的模型的路径如init_model='./models/'。 """ if config is None: config = Config() if config.init_model is None: # None feature_extractor = FeatureExtractor() else: feature_extractor = FeatureExtractor.load(config.init_model) """ `build()` 函数包含以下过程 : 1.逐行读取训练文本, 2.去除换行符、分隔符 3.处理数字和英文字母 4.以字符为单位,以15种方式来抽取特征 5.定义5中标签 6.分别将特征和标签转化为id的形式 `save()` 函数保存文件到 "xxx/models/ctb8/features.pkl", 二进制格式, 字典结构如下 : data = {'unigram': xx, 'bigram': xx, 'feature_to_idx': xx, 'tag_to_idx': xx} """ feature_extractor.build(config.trainFile) feature_extractor.save() # 将文本文件转为特征文件 feature_extractor.convert_text_file_to_feature_file( config.trainFile, config.c_train, config.f_train ) # ("xxx/data/small_training.utf8", "xxx/train.conll.txt", "xxx/train.feat.txt") feature_extractor.convert_text_file_to_feature_file( config.testFile, config.c_test, config.f_test ) # ("xxx/data/small_test.utf8", "xxx/test.conll.txt", "xxx/test.feat.txt") # 将特征文件中特征转化为id feature_extractor.convert_feature_file_to_idx_file( config.f_train, config.fFeatureTrain, config.fGoldTrain ) # ("xxx/train.feat.txt", "xxx/ftrain.txt", "xxx/gtrain.txt") feature_extractor.convert_feature_file_to_idx_file( config.f_test, config.fFeatureTest, config.fGoldTest ) # ("xxx/test.feat.txt", "xxx/ftest.txt", "xxx/gtest.txt") # 设置使用的评价指标、部分训练参数 config.globalCheck() """ `config.outDir` : 'xxx/output/' `config.fTune` : 'xxx/output/tune.txt' `config.fLog` : 'xxx/output/trainLog.txt' `config.fResRaw` : 'xxx/output/rawResult.txt' """ config.swLog = open(os.path.join(config.outDir, config.fLog), "w") config.swResRaw = open(os.path.join(config.outDir, config.fResRaw), "w") config.swTune = open(os.path.join(config.outDir, config.fTune), "w") print("\nstart training...") config.swLog.write("\nstart training...\n") 
print("\nreading training & test data...") config.swLog.write("\nreading training & test data...\n") """ self.fFeatureTrain : 'ftrain.txt' self.fGoldTrain : 'gtrain.txt' self.fFeatureTest : 'ftest.txt' self.fGoldTest : 'gtest.txt' """ trainset = DataSet.load(config.fFeatureTrain, config.fGoldTrain) # ('ftrain.txt', 'gtrain.txt') testset = DataSet.load(config.fFeatureTest, config.fGoldTest) # ('ftest.txt', 'gtest.txt') # 是否扩增/缩小数据集,扩增方法是重复取数据,缩小方法是只取部分数据 trainset = trainset.resize(config.trainSizeScale) # (1) print("done! train/test data sizes: {}/{}".format(len(trainset), len(testset))) config.swLog.write("done! train/test data sizes: {}/{}\n".format( len(trainset), len(testset))) config.swLog.write("\nr: {}\n".format(config.reg)) # self.reg = 1 print("\nr: {}".format(config.reg)) if config.rawResWrite: # self.rawResWrite = True config.swResRaw.write("\n%r: {}\n".format(config.reg)) # 使用训练集,初始化训练类 trainer = Trainer(config, trainset, feature_extractor) time_list = [] # 存储 `trainer.train_epoch()` 过程的耗时 err_list = [] diff_list = [] score_list_list = [] for i in range( config.ttlIter): # self.ttlIter = 20 # of training iterations # config.glbIter += 1 time_s = time.time() err, sample_size, diff = trainer.train_epoch() time_t = time.time() - time_s time_list.append(time_t) err_list.append(err) diff_list.append(diff) score_list = trainer.test(testset, i) score_list_list.append(score_list) score = score_list[0] logstr = "iter{} diff={:.2e} train-time(sec)={:.2f} {}={:.2f}%".format( i, diff, time_t, config.metric, score) config.swLog.write(logstr + "\n") config.swLog.write( "------------------------------------------------\n") config.swLog.flush() print(logstr) res_summarize.write(config, time_list, err_list, diff_list, score_list_list) if config.save == 1: trainer.model.save() config.swLog.close() config.swResRaw.close() config.swTune.close() res_summarize.summarize(config) print("finished.")
def __init__(self, model_name="default", user_dict="default", postag=False): """初始化函数,加载模型及用户词典""" # print("loading model") # config = Config() # self.config = config self.postag = postag if model_name in ["default"]: config.modelDir = os.path.join( os.path.dirname(os.path.realpath(__file__)), "models", model_name, ) elif model_name in config.available_models: config.modelDir = os.path.join( config.pkuseg_home, model_name, ) download_model(config.model_urls[model_name], config.pkuseg_home, config.model_hash[model_name]) else: config.modelDir = model_name # config.fModel = os.path.join(config.modelDir, "model.txt") if user_dict is None: file_name = None other_names = None else: if user_dict not in config.available_models: file_name = user_dict else: file_name = None if model_name in config.models_with_dict: other_name = os.path.join( config.pkuseg_home, model_name, model_name + "_dict.pkl", ) default_name = os.path.join( os.path.dirname(os.path.realpath(__file__)), "dicts", "default.pkl", ) other_names = [other_name, default_name] else: default_name = os.path.join( os.path.dirname(os.path.realpath(__file__)), "dicts", "default.pkl", ) other_names = [default_name] self.preprocesser = Preprocesser(file_name) # self.preprocesser = Preprocesser([]) self.postprocesser = Postprocesser(None, other_names) self.feature_extractor = FeatureExtractor.load() self.model = Model.load() self.idx_to_tag = { idx: tag for tag, idx in self.feature_extractor.tag_to_idx.items() } self.n_feature = len(self.feature_extractor.feature_to_idx) self.n_tag = len(self.feature_extractor.tag_to_idx) if postag: download_model(config.model_urls["postag"], config.pkuseg_home, config.model_hash[model_name]) postag_dir = os.path.join( config.pkuseg_home, "postag", ) self.tagger = Postag(postag_dir)
def train(config=None):
    """Train a word-segmentation model end to end.

    Builds features from config.trainFile, converts train/test text to
    feature-id files, runs config.ttlIter training epochs, logs per-epoch
    diff/time/score, and writes the summary via res_summarize.

    config  a Config instance; None creates a default Config().
    """
    if config is None:
        config = Config()

    if config.init_model is None:
        feature_extractor = FeatureExtractor()
    else:
        feature_extractor = FeatureExtractor.load(config.init_model)

    # Extract features from the training text and persist them.
    feature_extractor.build(config.trainFile)
    feature_extractor.save()
    # Text -> feature files, then feature files -> id files.
    feature_extractor.convert_text_file_to_feature_file(
        config.trainFile, config.c_train, config.f_train)
    feature_extractor.convert_text_file_to_feature_file(
        config.testFile, config.c_test, config.f_test)
    feature_extractor.convert_feature_file_to_idx_file(config.f_train,
                                                       config.fFeatureTrain,
                                                       config.fGoldTrain)
    feature_extractor.convert_feature_file_to_idx_file(config.f_test,
                                                       config.fFeatureTest,
                                                       config.fGoldTest)

    config.globalCheck()

    config.swLog = open(os.path.join(config.outDir, config.fLog), "w")
    config.swResRaw = open(os.path.join(config.outDir, config.fResRaw), "w")
    config.swTune = open(os.path.join(config.outDir, config.fTune), "w")

    # BUGFIX: ensure the three output handles are closed even when an
    # exception is raised mid-training (previously leaked on error).
    try:
        print("\nstart training...")
        config.swLog.write("\nstart training...\n")

        print("\nreading training & test data...")
        config.swLog.write("\nreading training & test data...\n")
        trainset = DataSet.load(config.fFeatureTrain, config.fGoldTrain)
        testset = DataSet.load(config.fFeatureTest, config.fGoldTest)
        trainset = trainset.resize(config.trainSizeScale)
        print("done! train/test data sizes: {}/{}".format(
            len(trainset), len(testset)))
        config.swLog.write("done! train/test data sizes: {}/{}\n".format(
            len(trainset), len(testset)))
        config.swLog.write("\nr: {}\n".format(config.reg))
        print("\nr: {}".format(config.reg))
        if config.rawResWrite:
            config.swResRaw.write("\n%r: {}\n".format(config.reg))

        trainer = Trainer(config, trainset, feature_extractor)

        time_list = []
        err_list = []
        diff_list = []
        score_list_list = []
        for i in range(config.ttlIter):
            # config.glbIter += 1
            time_s = time.time()
            err, sample_size, diff = trainer.train_epoch()
            time_t = time.time() - time_s
            time_list.append(time_t)
            err_list.append(err)
            diff_list.append(diff)

            score_list = trainer.test(testset, i)
            score_list_list.append(score_list)
            score = score_list[0]

            logstr = "iter{} diff={:.2e} train-time(sec)={:.2f} {}={:.2f}%".format(
                i, diff, time_t, config.metric, score)
            config.swLog.write(logstr + "\n")
            config.swLog.write(
                "------------------------------------------------\n")
            config.swLog.flush()
            print(logstr)

        res_summarize.write(config, time_list, err_list, diff_list,
                            score_list_list)
        if config.save == 1:
            trainer.model.save()
    finally:
        config.swLog.close()
        config.swResRaw.close()
        config.swTune.close()

    res_summarize.summarize(config)

    print("finished.")
def __init__(self, model_name="default", user_dict="default"): """初始化函数,加载模型及用户词典""" # print("loading model") # config = Config() # self.config = config if model_name in ["default"]: config.modelDir = os.path.join( os.path.dirname(os.path.realpath(__file__)), "models", model_name, ) else: config.modelDir = model_name # config.fModel = os.path.join(config.modelDir, "model.txt") if user_dict == "default": # file_name = os.path.join( # os.path.dirname(os.path.realpath(__file__)), # "dicts", "default_common.txt", # ) file_name = None other_name = os.path.join( os.path.dirname(os.path.realpath(__file__)), "dicts", "default.txt", ) else: file_name = user_dict other_name = None # self.preprocesser = Preprocesser(file_name) self.preprocesser = Preprocesser([]) self.postprocesser = Postprocesser(file_name, other_name) self.feature_extractor = FeatureExtractor.load() self.model = Model.load() self.idx_to_tag = { idx: tag for tag, idx in self.feature_extractor.tag_to_idx.items() } # self.idx2tag = [None] * len(self.testFeature.tagIndexMap) # for i in self.testFeature.tagIndexMap: # self.idx2tag[self.testFeature.tagIndexMap[i]] = i # if config.nLabel == 2: # B = B_single = "B" # I_first = I = I_end = "I" # elif config.nLabel == 3: # B = B_single = "B" # I_first = I = "I" # I_end = "I_end" # elif config.nLabel == 4: # B = "B" # B_single = "B_single" # I_first = I = "I" # I_end = "I_end" # elif config.nLabel == 5: # B = "B" # B_single = "B_single" # I_first = "I_first" # I = "I" # I_end = "I_end" # self.B = B # self.B_single = B_single # self.I_first = I_first # self.I = I # self.I_end = I_end self.n_feature = len(self.feature_extractor.feature_to_idx) self.n_tag = len(self.feature_extractor.tag_to_idx)