Exemple #1
0
def train(config=None):
    """Main entry point for model training.

    Original usage note (translated)::

        pkuseg.train(trainFile, testFile, savedir, train_iter = 20, init_model = None)
            trainFile   Path of the training file (multi-line text).
            testFile    Path of the test file.
            savedir     Directory where the trained model is saved.
            train_iter  Number of training rounds.
            init_model  Model used for initialisation. The default None means
                        default initialisation; a path such as
                        init_model='./models/' may be given instead.

    Args:
        config: Configuration object. When None, a fresh ``Config()`` is
            created. The function stores the three open output files on it as
            ``swLog``, ``swResRaw`` and ``swTune`` while training runs.

    The log files are opened inside a ``with`` block so they are closed even
    when feature loading, training or testing raises.
    """
    if config is None:
        config = Config()

    # Start from a blank extractor unless an initial model was supplied.
    if config.init_model is None:
        feature_extractor = FeatureExtractor()
    else:
        feature_extractor = FeatureExtractor.load(config.init_model)

    # Original author's notes (translated) -- not verifiable from this file:
    #   `build()` performs the following steps:
    #     1. read the training text line by line,
    #     2. strip newlines and separators,
    #     3. handle digits and English letters,
    #     4. extract features per character, in 15 different ways,
    #     5. define 5 kinds of tags,
    #     6. convert both features and tags to ids.
    #   `save()` writes a binary file to "xxx/models/ctb8/features.pkl" holding
    #     data = {'unigram': xx, 'bigram': xx, 'feature_to_idx': xx, 'tag_to_idx': xx}
    feature_extractor.build(config.trainFile)
    feature_extractor.save()

    # Convert the text files into feature files.
    # e.g. ("xxx/data/small_training.utf8", "xxx/train.conll.txt", "xxx/train.feat.txt")
    feature_extractor.convert_text_file_to_feature_file(
        config.trainFile, config.c_train, config.f_train
    )
    # e.g. ("xxx/data/small_test.utf8", "xxx/test.conll.txt", "xxx/test.feat.txt")
    feature_extractor.convert_text_file_to_feature_file(
        config.testFile, config.c_test, config.f_test
    )

    # Convert the features in the feature files into ids.
    # e.g. ("xxx/train.feat.txt", "xxx/ftrain.txt", "xxx/gtrain.txt")
    feature_extractor.convert_feature_file_to_idx_file(
        config.f_train, config.fFeatureTrain, config.fGoldTrain
    )
    # e.g. ("xxx/test.feat.txt", "xxx/ftest.txt", "xxx/gtest.txt")
    feature_extractor.convert_feature_file_to_idx_file(
        config.f_test, config.fFeatureTest, config.fGoldTest
    )

    # Per the original note: sets the evaluation metric and some training
    # parameters.
    config.globalCheck()

    # Example values from the original notes:
    #   config.outDir  : 'xxx/output/'
    #   config.fTune   : 'xxx/output/tune.txt'
    #   config.fLog    : 'xxx/output/trainLog.txt'
    #   config.fResRaw : 'xxx/output/rawResult.txt'
    log_path = os.path.join(config.outDir, config.fLog)
    raw_path = os.path.join(config.outDir, config.fResRaw)
    tune_path = os.path.join(config.outDir, config.fTune)

    with open(log_path, "w") as log_file, \
            open(raw_path, "w") as raw_file, \
            open(tune_path, "w") as tune_file:
        # Other components reach the writers through the config object.
        config.swLog = log_file
        config.swResRaw = raw_file
        config.swTune = tune_file

        print("\nstart training...")
        config.swLog.write("\nstart training...\n")

        print("\nreading training & test data...")
        config.swLog.write("\nreading training & test data...\n")

        # Example file names from the original notes:
        #   fFeatureTrain: 'ftrain.txt', fGoldTrain: 'gtrain.txt',
        #   fFeatureTest: 'ftest.txt',  fGoldTest: 'gtest.txt'
        trainset = DataSet.load(config.fFeatureTrain, config.fGoldTrain)
        testset = DataSet.load(config.fFeatureTest, config.fGoldTest)

        # Optionally enlarge or shrink the training set. Per the original
        # note, enlarging repeats samples and shrinking keeps only a subset
        # (the author's example scale is 1).
        trainset = trainset.resize(config.trainSizeScale)

        print("done! train/test data sizes: {}/{}".format(len(trainset),
                                                          len(testset)))
        config.swLog.write("done! train/test data sizes: {}/{}\n".format(
            len(trainset), len(testset)))

        config.swLog.write("\nr: {}\n".format(config.reg))
        print("\nr: {}".format(config.reg))
        if config.rawResWrite:
            # The "%r" prefix is emitted literally; kept as in the original.
            config.swResRaw.write("\n%r: {}\n".format(config.reg))

        # Initialise the trainer with the training set.
        trainer = Trainer(config, trainset, feature_extractor)

        time_list = []  # wall-clock seconds spent in each train_epoch() call
        err_list = []
        diff_list = []
        score_list_list = []

        for i in range(config.ttlIter):  # number of training iterations
            time_s = time.time()
            # The second returned value (sample size) is not used here.
            err, _, diff = trainer.train_epoch()
            time_t = time.time() - time_s

            time_list.append(time_t)
            err_list.append(err)
            diff_list.append(diff)

            score_list = trainer.test(testset, i)
            score_list_list.append(score_list)
            score = score_list[0]  # only the first score goes into the log line

            logstr = "iter{}  diff={:.2e}  train-time(sec)={:.2f}  {}={:.2f}%".format(
                i, diff, time_t, config.metric, score)
            config.swLog.write(logstr + "\n")
            config.swLog.write(
                "------------------------------------------------\n")
            config.swLog.flush()
            print(logstr)

        res_summarize.write(config, time_list, err_list, diff_list,
                            score_list_list)
        if config.save == 1:
            trainer.model.save()

    # As in the original, summarise only after the output files are closed.
    res_summarize.summarize(config)

    print("finished.")
Exemple #2
0
    def __init__(self,
                 model_name="default",
                 user_dict="default",
                 postag=False):
        """Load the segmentation model and the user dictionary.

        Args:
            model_name: ``"default"`` selects the ``models/default`` directory
                next to this file. A name listed in
                ``config.available_models`` is resolved under
                ``config.pkuseg_home`` and passed to ``download_model``. Any
                other value is used verbatim as the model directory.
            user_dict: ``None`` passes no dictionary to either the
                pre-processor or the post-processor. Otherwise the value is
                handed to ``Preprocesser`` unless it is one of
                ``config.available_models``, and the post-processor receives
                the bundled ``dicts/default.pkl`` (preceded by the
                model-specific ``<model_name>_dict.pkl`` when ``model_name``
                is in ``config.models_with_dict``).
            postag: When truthy, the ``"postag"`` model is passed to
                ``download_model`` and a ``Postag`` tagger is created from
                ``<pkuseg_home>/postag``.
        """
        self.postag = postag
        package_dir = os.path.dirname(os.path.realpath(__file__))

        # Resolve the model directory on the shared config object.
        if model_name in ["default"]:
            config.modelDir = os.path.join(package_dir, "models", model_name)
        elif model_name in config.available_models:
            config.modelDir = os.path.join(
                config.pkuseg_home,
                model_name,
            )
            download_model(config.model_urls[model_name], config.pkuseg_home,
                           config.model_hash[model_name])
        else:
            config.modelDir = model_name

        # Work out which dictionary files go to the pre-/post-processor.
        if user_dict is None:
            file_name = None
            other_names = None
        else:
            if user_dict not in config.available_models:
                file_name = user_dict
            else:
                file_name = None

            default_name = os.path.join(package_dir, "dicts", "default.pkl")
            if model_name in config.models_with_dict:
                model_dict_name = os.path.join(
                    config.pkuseg_home,
                    model_name,
                    model_name + "_dict.pkl",
                )
                # The model-specific dictionary is listed before the default.
                other_names = [model_dict_name, default_name]
            else:
                other_names = [default_name]

        self.preprocesser = Preprocesser(file_name)
        self.postprocesser = Postprocesser(None, other_names)

        self.feature_extractor = FeatureExtractor.load()
        self.model = Model.load()

        # Reverse mapping: tag index -> tag.
        self.idx_to_tag = {
            idx: tag
            for tag, idx in self.feature_extractor.tag_to_idx.items()
        }

        self.n_feature = len(self.feature_extractor.feature_to_idx)
        self.n_tag = len(self.feature_extractor.tag_to_idx)

        if postag:
            # Use the postag model's own hash. The previous code looked up
            # the hash of the segmentation model (model_name) for this
            # download, which is the wrong entry and may not exist at all.
            download_model(config.model_urls["postag"], config.pkuseg_home,
                           config.model_hash["postag"])
            postag_dir = os.path.join(
                config.pkuseg_home,
                "postag",
            )
            self.tagger = Postag(postag_dir)
Exemple #3
0
def train(config=None):
    """Run the full training pipeline and write logs and result summaries.

    Steps, in order: build and save the feature extractor, convert the
    train/test text files to feature files and then to index files, load both
    data sets, train for ``config.ttlIter`` epochs (testing after each one),
    write the collected statistics, optionally save the model, and summarise.

    Args:
        config: Configuration object. When None, a fresh ``Config()`` is
            created. The open output files are stored on it as ``swLog``,
            ``swResRaw`` and ``swTune`` while training runs.

    The three output files are managed by a ``with`` block so they are closed
    even if an exception interrupts training.
    """
    if config is None:
        config = Config()

    # Start from a blank extractor unless an initial model was supplied.
    if config.init_model is None:
        feature_extractor = FeatureExtractor()
    else:
        feature_extractor = FeatureExtractor.load(config.init_model)
    feature_extractor.build(config.trainFile)
    feature_extractor.save()

    # Text files -> feature files.
    feature_extractor.convert_text_file_to_feature_file(
        config.trainFile, config.c_train, config.f_train)
    feature_extractor.convert_text_file_to_feature_file(
        config.testFile, config.c_test, config.f_test)

    # Feature files -> index files (features plus gold tags).
    feature_extractor.convert_feature_file_to_idx_file(config.f_train,
                                                       config.fFeatureTrain,
                                                       config.fGoldTrain)
    feature_extractor.convert_feature_file_to_idx_file(config.f_test,
                                                       config.fFeatureTest,
                                                       config.fGoldTest)

    config.globalCheck()

    log_path = os.path.join(config.outDir, config.fLog)
    raw_path = os.path.join(config.outDir, config.fResRaw)
    tune_path = os.path.join(config.outDir, config.fTune)

    with open(log_path, "w") as log_file, \
            open(raw_path, "w") as raw_file, \
            open(tune_path, "w") as tune_file:
        # Other components reach the writers through the config object.
        config.swLog = log_file
        config.swResRaw = raw_file
        config.swTune = tune_file

        print("\nstart training...")
        config.swLog.write("\nstart training...\n")

        print("\nreading training & test data...")
        config.swLog.write("\nreading training & test data...\n")

        trainset = DataSet.load(config.fFeatureTrain, config.fGoldTrain)
        testset = DataSet.load(config.fFeatureTest, config.fGoldTest)

        trainset = trainset.resize(config.trainSizeScale)

        print("done! train/test data sizes: {}/{}".format(len(trainset),
                                                          len(testset)))
        config.swLog.write("done! train/test data sizes: {}/{}\n".format(
            len(trainset), len(testset)))

        config.swLog.write("\nr: {}\n".format(config.reg))
        print("\nr: {}".format(config.reg))
        if config.rawResWrite:
            # The "%r" prefix is emitted literally; kept as in the original.
            config.swResRaw.write("\n%r: {}\n".format(config.reg))

        trainer = Trainer(config, trainset, feature_extractor)

        time_list = []  # wall-clock seconds spent in each train_epoch() call
        err_list = []
        diff_list = []
        score_list_list = []

        for i in range(config.ttlIter):
            time_s = time.time()
            # The second returned value (sample size) is not used here.
            err, _, diff = trainer.train_epoch()
            time_t = time.time() - time_s
            time_list.append(time_t)
            err_list.append(err)
            diff_list.append(diff)

            score_list = trainer.test(testset, i)
            score_list_list.append(score_list)
            score = score_list[0]  # only the first score goes into the log line

            logstr = "iter{}  diff={:.2e}  train-time(sec)={:.2f}  {}={:.2f}%".format(
                i, diff, time_t, config.metric, score)
            config.swLog.write(logstr + "\n")
            config.swLog.write(
                "------------------------------------------------\n")
            config.swLog.flush()
            print(logstr)

        res_summarize.write(config, time_list, err_list, diff_list,
                            score_list_list)
        if config.save == 1:
            trainer.model.save()

    # As in the original, summarise only after the output files are closed.
    res_summarize.summarize(config)

    print("finished.")
Exemple #4
0
    def __init__(self, model_name="default", user_dict="default"):
        """Load the model and the user dictionary.

        Args:
            model_name: ``"default"`` selects the ``models/default`` directory
                next to this file; any other value is used verbatim as the
                model directory (stored in ``config.modelDir``).
            user_dict: ``"default"`` hands the bundled ``dicts/default.txt``
                to the post-processor as its secondary dictionary; any other
                value is passed to the post-processor as its primary
                dictionary instead.
        """
        # Resolve the model directory on the shared config object.
        if model_name not in ["default"]:
            config.modelDir = model_name
        else:
            here = os.path.dirname(os.path.realpath(__file__))
            config.modelDir = os.path.join(here, "models", model_name)

        # Assume a caller-supplied dictionary, then switch to the bundled one
        # when the default was requested.
        file_name, other_name = user_dict, None
        if user_dict == "default":
            here = os.path.dirname(os.path.realpath(__file__))
            file_name = None
            other_name = os.path.join(here, "dicts", "default.txt")

        # The pre-processor is built with an empty list; only the
        # post-processor receives the dictionary arguments.
        self.preprocesser = Preprocesser([])
        self.postprocesser = Postprocesser(file_name, other_name)

        extractor = FeatureExtractor.load()
        self.feature_extractor = extractor
        self.model = Model.load()

        # Reverse mapping: tag index -> tag.
        tag_to_idx = extractor.tag_to_idx
        self.idx_to_tag = dict((idx, tag) for tag, idx in tag_to_idx.items())

        # NOTE: a commented-out legacy setup of B/I tag names keyed on
        # config.nLabel used to sit here; it was never executed.

        self.n_feature = len(extractor.feature_to_idx)
        self.n_tag = len(tag_to_idx)