Esempio n. 1
0
    def __init__(self, config, dataset, feature_extractor):
        self.config = config
        self.X = dataset
        self.n_feature = dataset.n_feature  # 特征总数量
        self.n_tag = dataset.n_tag  # 标签总数量==5

        if config.init_model is None:
            self.model = Model(self.n_feature, self.n_tag)  # do this
        else:
            self.model = Model.load(config.init_model)
            self.model.expand(self.n_feature, self.n_tag)

        self.optim = self._get_optimizer(dataset, self.model)

        self.feature_extractor = feature_extractor
        self.idx_to_chunk_tag = {
        }  # {0: 'B', 1: 'B_single', 2: 'I', 3: 'I', 4: 'I'}
        """
        `tag_to_idx` : 
            {'B': 0, 'B_single': 1, 'I': 2, 'I_end': 3, 'I_first': 4}
        
        `startswith()` 函数:
            >>> aaa
            'Begin'
            >>> aaa.startswith("A")
            False
            >>> aaa.startswith("B")
            True
        """
        for tag, idx in feature_extractor.tag_to_idx.items():
            if tag.startswith("I"):  # ['I', 'I_end', 'I_first']
                tag = "I"
            if tag.startswith("O"):
                tag = "O"
            self.idx_to_chunk_tag[idx] = tag
Esempio n. 2
0
    def __init__(self, config, dataset, feature_extractor):
        self.config = config
        self.X = dataset
        self.n_feature = dataset.n_feature
        self.n_tag = dataset.n_tag

        if config.init_model is None:
            self.model = Model(self.n_feature, self.n_tag)
        else:
            self.model = Model.load(config.init_model)
            self.model.expand(self.n_feature, self.n_tag)

        self.optim = self._get_optimizer(dataset, self.model)

        self.feature_extractor = feature_extractor
        self.idx_to_chunk_tag = {}
        for tag, idx in feature_extractor.tag_to_idx.items():
            if tag.startswith("I"):
                tag = "I"
            if tag.startswith("O"):
                tag = "O"
            self.idx_to_chunk_tag[idx] = tag
Esempio n. 3
0
class Trainer:
    def __init__(self, config, dataset, feature_extractor):
        self.config = config
        self.X = dataset
        self.n_feature = dataset.n_feature  # 特征总数量
        self.n_tag = dataset.n_tag  # 标签总数量==5

        if config.init_model is None:
            self.model = Model(self.n_feature, self.n_tag)  # do this
        else:
            self.model = Model.load(config.init_model)
            self.model.expand(self.n_feature, self.n_tag)

        self.optim = self._get_optimizer(dataset, self.model)

        self.feature_extractor = feature_extractor
        self.idx_to_chunk_tag = {
        }  # {0: 'B', 1: 'B_single', 2: 'I', 3: 'I', 4: 'I'}
        """
        `tag_to_idx` : 
            {'B': 0, 'B_single': 1, 'I': 2, 'I_end': 3, 'I_first': 4}
        
        `startswith()` 函数:
            >>> aaa
            'Begin'
            >>> aaa.startswith("A")
            False
            >>> aaa.startswith("B")
            True
        """
        for tag, idx in feature_extractor.tag_to_idx.items():
            if tag.startswith("I"):  # ['I', 'I_end', 'I_first']
                tag = "I"
            if tag.startswith("O"):
                tag = "O"
            self.idx_to_chunk_tag[idx] = tag

    def _get_optimizer(self, dataset, model):
        config = self.config
        if "adf" in config.modelOptimizer:  # self.modelOptimizer = "crf.adf"
            return ADF(config, dataset, model)

        raise ValueError("Invalid Optimizer")

    def train_epoch(self):
        return self.optim.optimize()

    def test(self, testset, iteration):

        outfile = os.path.join(config.outDir, config.fOutput.format(iteration))

        func_mapping = {
            "tok.acc": self._decode_tokAcc,
            "str.acc": self._decode_strAcc,
            "f1": self._decode_fscore,
        }

        with open(outfile, "w", encoding="utf8") as writer:
            score_list = func_mapping[config.evalMetric](testset, self.model,
                                                         writer)

        for example in testset:
            example.predicted_tags = None

        return score_list

    def _decode(self, testset: DataSet, model: Model):
        if config.nThread == 1:
            self._decode_single(testset, model)
        else:
            self._decode_multi_proc(testset, model)

    def _decode_single(self, testset: DataSet, model: Model):
        # n_tag = model.n_tag
        for example in testset:
            _, tags = _inf.decodeViterbi_fast(example.features, model)
            example.predicted_tags = tags

    @staticmethod
    def _decode_proc(model, in_queue, out_queue):
        while True:
            item = in_queue.get()
            if item is None:
                return
            idx, features = item
            _, tags = _inf.decodeViterbi_fast(features, model)
            out_queue.put((idx, tags))

    def _decode_multi_proc(self, testset: DataSet, model: Model):
        in_queue = Queue()
        out_queue = Queue()
        procs = []
        nthread = self.config.nThread
        for i in range(nthread):
            p = Process(target=self._decode_proc,
                        args=(model, in_queue, out_queue))
            procs.append(p)

        for idx, example in enumerate(testset):
            in_queue.put((idx, example.features))

        for proc in procs:
            in_queue.put(None)
            proc.start()

        for _ in range(len(testset)):
            idx, tags = out_queue.get()
            testset[idx].predicted_tags = tags

        for p in procs:
            p.join()

    # token accuracy
    def _decode_tokAcc(self, dataset, model, writer):
        config = self.config

        self._decode(dataset, model)
        n_tag = model.n_tag
        all_correct = [0] * n_tag
        all_pred = [0] * n_tag
        all_gold = [0] * n_tag

        for example in dataset:
            pred = example.predicted_tags
            gold = example.tags

            if writer is not None:
                writer.write(",".join(map(str, pred)))
                writer.write("\n")

            for pred_tag, gold_tag in zip(pred, gold):
                all_pred[pred_tag] += 1
                all_gold[gold_tag] += 1
                if pred_tag == gold_tag:
                    all_correct[gold_tag] += 1

        config.swLog.write(
            "% tag-type  #gold  #output  #correct-output  token-precision  token-recall  token-f-score\n"
        )
        sumGold = 0
        sumOutput = 0
        sumCorrOutput = 0

        for i, (correct, gold,
                pred) in enumerate(zip(all_correct, all_gold, all_pred)):
            sumGold += gold
            sumOutput += pred
            sumCorrOutput += correct

            if gold == 0:
                rec = 0
            else:
                rec = correct * 100.0 / gold

            if pred == 0:
                prec = 0
            else:
                prec = correct * 100.0 / pred

            config.swLog.write(
                "% {}:  {}  {}  {}  {:.2f}  {:.2f}  {:.2f}\n".format(
                    i,
                    gold,
                    pred,
                    correct,
                    prec,
                    rec,
                    (2 * prec * rec / (prec + rec)),
                ))

        if sumGold == 0:
            rec = 0
        else:
            rec = sumCorrOutput * 100.0 / sumGold
        if sumOutput == 0:
            prec = 0
        else:
            prec = sumCorrOutput * 100.0 / sumOutput

        if prec == 0 and rec == 0:
            fscore = 0
        else:
            fscore = 2 * prec * rec / (prec + rec)

        config.swLog.write(
            "% overall-tags:  {}  {}  {}  {:.2f}  {:.2f}  {:.2f}\n".format(
                sumGold, sumOutput, sumCorrOutput, prec, rec, fscore))
        config.swLog.flush()
        return [fscore]

    def _decode_strAcc(self, dataset, model, writer):

        config = self.config

        self._decode(dataset, model)

        correct = 0
        total = len(dataset)

        for example in dataset:
            pred = example.predicted_tags
            gold = example.tags

            if writer is not None:
                writer.write(",".join(map(str, pred)))
                writer.write("\n")

            for pred_tag, gold_tag in zip(pred, gold):
                if pred_tag != gold_tag:
                    break
            else:
                correct += 1

        acc = correct / total * 100.0
        config.swLog.write(
            "total-tag-strings={}  correct-tag-strings={}  string-accuracy={}%"
            .format(total, correct, acc))
        return [acc]

    def _decode_fscore(self, dataset, model, writer):
        config = self.config

        self._decode(dataset, model)

        gold_tags = []
        pred_tags = []

        for example in dataset:
            pred = example.predicted_tags
            gold = example.tags

            pred_str = ",".join(map(str, pred))
            pred_tags.append(pred_str)
            if writer is not None:
                writer.write(pred_str)
                writer.write("\n")
            gold_tags.append(",".join(map(str, gold)))

        scoreList, infoList = getFscore(gold_tags, pred_tags,
                                        self.idx_to_chunk_tag)
        config.swLog.write(
            "#gold-chunk={}  #output-chunk={}  #correct-output-chunk={}  precision={:.2f}  recall={:.2f}  f-score={:.2f}\n"
            .format(
                infoList[0],
                infoList[1],
                infoList[2],
                scoreList[1],
                scoreList[2],
                scoreList[0],
            ))
        return scoreList
Esempio n. 4
0
    def __init__(self,
                 model_name="default",
                 user_dict="default",
                 postag=False):
        """初始化函数,加载模型及用户词典"""
        # print("loading model")
        # config = Config()
        # self.config = config
        self.postag = postag
        if model_name in ["default"]:
            config.modelDir = os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                "models",
                model_name,
            )
        elif model_name in config.available_models:
            config.modelDir = os.path.join(
                config.pkuseg_home,
                model_name,
            )
            download_model(config.model_urls[model_name], config.pkuseg_home,
                           config.model_hash[model_name])
        else:
            config.modelDir = model_name
        # config.fModel = os.path.join(config.modelDir, "model.txt")
        if user_dict is None:
            file_name = None
            other_names = None
        else:
            if user_dict not in config.available_models:
                file_name = user_dict
            else:
                file_name = None
            if model_name in config.models_with_dict:
                other_name = os.path.join(
                    config.pkuseg_home,
                    model_name,
                    model_name + "_dict.pkl",
                )
                default_name = os.path.join(
                    os.path.dirname(os.path.realpath(__file__)),
                    "dicts",
                    "default.pkl",
                )
                other_names = [other_name, default_name]
            else:
                default_name = os.path.join(
                    os.path.dirname(os.path.realpath(__file__)),
                    "dicts",
                    "default.pkl",
                )
                other_names = [default_name]

        self.preprocesser = Preprocesser(file_name)
        # self.preprocesser = Preprocesser([])
        self.postprocesser = Postprocesser(None, other_names)

        self.feature_extractor = FeatureExtractor.load()
        self.model = Model.load()

        self.idx_to_tag = {
            idx: tag
            for tag, idx in self.feature_extractor.tag_to_idx.items()
        }

        self.n_feature = len(self.feature_extractor.feature_to_idx)
        self.n_tag = len(self.feature_extractor.tag_to_idx)

        if postag:
            download_model(config.model_urls["postag"], config.pkuseg_home,
                           config.model_hash[model_name])
            postag_dir = os.path.join(
                config.pkuseg_home,
                "postag",
            )
            self.tagger = Postag(postag_dir)
Esempio n. 5
0
    def __init__(self, model_name="default", user_dict="default"):
        """初始化函数,加载模型及用户词典"""
        # print("loading model")
        # config = Config()
        # self.config = config
        if model_name in ["default"]:
            config.modelDir = os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                "models",
                model_name,
            )
        else:
            config.modelDir = model_name
        # config.fModel = os.path.join(config.modelDir, "model.txt")
        if user_dict == "default":
            # file_name = os.path.join(
            #     os.path.dirname(os.path.realpath(__file__)),
            #     "dicts", "default_common.txt",
            # )
            file_name = None
            other_name = os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                "dicts",
                "default.txt",
            )
        else:
            file_name = user_dict
            other_name = None

        # self.preprocesser = Preprocesser(file_name)
        self.preprocesser = Preprocesser([])
        self.postprocesser = Postprocesser(file_name, other_name)

        self.feature_extractor = FeatureExtractor.load()
        self.model = Model.load()

        self.idx_to_tag = {
            idx: tag
            for tag, idx in self.feature_extractor.tag_to_idx.items()
        }

        # self.idx2tag = [None] * len(self.testFeature.tagIndexMap)
        # for i in self.testFeature.tagIndexMap:
        #     self.idx2tag[self.testFeature.tagIndexMap[i]] = i
        # if config.nLabel == 2:
        #     B = B_single = "B"
        #     I_first = I = I_end = "I"
        # elif config.nLabel == 3:
        #     B = B_single = "B"
        #     I_first = I = "I"
        #     I_end = "I_end"
        # elif config.nLabel == 4:
        #     B = "B"
        #     B_single = "B_single"
        #     I_first = I = "I"
        #     I_end = "I_end"
        # elif config.nLabel == 5:
        #     B = "B"
        #     B_single = "B_single"
        #     I_first = "I_first"
        #     I = "I"
        #     I_end = "I_end"
        # self.B = B
        # self.B_single = B_single
        # self.I_first = I_first
        # self.I = I
        # self.I_end = I_end

        self.n_feature = len(self.feature_extractor.feature_to_idx)
        self.n_tag = len(self.feature_extractor.tag_to_idx)