Example #1
    def _show_all_labels(self):
        # collect label, text, and feature for every sample, preferring the human label
        output = []
        contents = []
        seg_contents = []
        features = []
        labels = []
        for i in self.samples:
            label = i.human_label if i.human_label else i.machine_label
            output.append(label + self.col_sep + str(i.prob))
            seg_contents.append(i.seg_text_word)
            contents.append(i.original_text)
            labels.append(label)
            features.append(i.feature.toarray().tolist()[0])
        # split into train/validation sets
        X_train, X_val, y_train, y_val = train_test_split(
            csr_matrix(np.array(features)), labels)

        # fit
        self.model.fit(X_train, y_train)

        # save model
        dump_pkl(self.model, self.model_save_path, overwrite=True)
        # evaluate on the held-out split (project-internal `eval` helper, not the builtin)
        eval(self.model, X_val, y_val)
        # NB: `ture_labels` (sic) matches the keyword this project's `save()`
        # helper appears to define
        save(output,
             ture_labels=None,
             pred_save_path=self.pred_save_path,
             data_set=contents)
Example #2
# Imports assumed by this snippet; `data_reader`, `Feature`, `load_vocab`,
# `save`, and `logger` are project-internal helpers expected to be in scope.
from keras.models import load_model
from sklearn.metrics import classification_report, confusion_matrix


def infer_deep_model(model_type='cnn',
                     data_path='',
                     model_save_path='',
                     label_vocab_path='',
                     max_len=300,
                     batch_size=128,
                     col_sep='\t',
                     pred_save_path=None):
    # load data content
    data_set, true_labels = data_reader(data_path, col_sep)
    # init feature
    # the HAN model needs a [doc, sentence, dim] feature (rank 3);
    # the other models take a [sentence, dim] feature (rank 2)
    if model_type == 'han':
        feature_type = 'doc_vectorize'
    else:
        feature_type = 'vectorize'
    feature = Feature(data_set,
                      feature_type=feature_type,
                      is_infer=True,
                      max_len=max_len)
    # get data feature
    data_feature = feature.get_feature()

    # load model
    model = load_model(model_save_path)
    # predict; in Keras, predict_proba is equivalent to predict
    pred_label_probs = model.predict(data_feature, batch_size=batch_size)

    # label id map
    label_id = load_vocab(label_vocab_path)
    id_label = {v: k for k, v in label_id.items()}
    pred_labels = [id_label[prob.argmax()] for prob in pred_label_probs]
    pred_output = [
        id_label[prob.argmax()] + col_sep + str(prob.max())
        for prob in pred_label_probs
    ]
    logger.info("save infer label and prob result to: %s" % pred_save_path)
    save(pred_output,
         ture_labels=None,
         pred_save_path=pred_save_path,
         data_set=data_set)
    if true_labels:
        # evaluate
        assert len(pred_labels) == len(true_labels)
        for label, prob in zip(true_labels, pred_label_probs):
            logger.debug('label_true:%s\tprob_label:%s\tprob:%s' %
                         (label, id_label[prob.argmax()], prob.max()))

        print('total eval:')
        try:
            print(classification_report(true_labels, pred_labels))
            print(confusion_matrix(true_labels, pred_labels))
        except UnicodeEncodeError:
            true_labels_id = [label_id[i] for i in true_labels]
            pred_labels_id = [label_id[i] for i in pred_labels]
            print(classification_report(true_labels_id, pred_labels_id))
            print(confusion_matrix(true_labels_id, pred_labels_id))
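
A minimal invocation sketch for the snippet above; every path below is a
placeholder for an artifact produced by a prior training run, not a file
shipped with the project:

# Hypothetical paths; model and vocab files come from training.
infer_deep_model(model_type='cnn',
                 data_path='data/test.txt',
                 model_save_path='output/cnn_model.h5',
                 label_vocab_path='output/label_vocab.txt',
                 pred_save_path='output/cnn_infer_result.txt')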
Example #3
    def _train(self, labeled_sample_list, unlabeled_sample_list, batch_id):
        machine_samples_list = []
        # gather labels (prefer the human label) and features from the labeled pool
        labeled_data_label = [
            i.human_label if i.human_label else i.machine_label
            for i in labeled_sample_list
        ]
        labeled_data_feature = [
            i.feature.toarray().tolist()[0] for i in labeled_sample_list
        ]
        X_train, X_val, y_train, y_val = train_test_split(
            csr_matrix(np.array(labeled_data_feature)), labeled_data_label)
        # fit
        self.model.fit(X_train, y_train)

        # save model
        dump_pkl(self.model, self.model_save_path, overwrite=True)
        # evaluate on the held-out split
        eval(self.model, X_val, y_val)

        # predict the unlabeled data set; bail out early if it is empty
        if not unlabeled_sample_list:
            return machine_samples_list
        unlabeled_data_feature = [
            i.feature.toarray().tolist()[0] for i in unlabeled_sample_list
        ]
        pred_result = self.model.predict_proba(
            csr_matrix(np.array(unlabeled_data_feature)))

        pred_label_proba = [(self.id_label[prob.argmax()], prob.max())
                            for prob in pred_result]

        # save middle result
        pred_output = [
            self.id_label[prob.argmax()] + self.col_sep + str(prob.max())
            for prob in pred_result
        ]
        # build a per-batch output path (assumes `os` is imported at module level)
        pred_save_path = '%s_batch_%d.txt' % (
            os.path.splitext(self.pred_save_path)[0], batch_id)
        logger.debug("save infer label and prob result to: %s" %
                     pred_save_path)
        unlabeled_data_text = [i.original_text for i in unlabeled_sample_list]
        save(pred_output,
             ture_labels=None,
             pred_save_path=pred_save_path,
             data_set=unlabeled_data_text)

        assert len(unlabeled_sample_list) == len(pred_label_proba)
        for unlabeled_sample, label_prob in zip(unlabeled_sample_list,
                                                pred_label_proba):
            idx = unlabeled_sample.id
            self.samples[idx].machine_label = label_prob[0]
            self.samples[idx].prob = label_prob[1]
            machine_samples_list.append(unlabeled_sample)
        return machine_samples_list
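
For context, a hypothetical driver loop around `_train`; the confidence
threshold and the pool-update policy below are illustrative assumptions, not
taken from the snippet:

# Hypothetical self-training loop: promote confident machine-labeled samples.
labeled, unlabeled = initial_labeled, initial_unlabeled  # assumed sample pools
for batch_id in range(num_batches):  # num_batches is an assumed setting
    machine_samples = learner._train(labeled, unlabeled, batch_id)
    confident = [s for s in machine_samples
                 if learner.samples[s.id].prob > 0.9]  # assumed threshold
    labeled += confident
    unlabeled = [s for s in unlabeled if s not in confident]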
Example #4
    def save(self, save_dir=None):
        if save_dir is not None:
            self.save_dir = save_dir
        if not os.path.exists(self.save_dir):
            os.makedirs(self.save_dir)
        # `save` here is an external serialization helper, not this method
        save(self.idx2token, os.path.join(self.save_dir, 'idx2token.pkl'))
        save(self.token2idx, os.path.join(self.save_dir, 'token2idx.pkl'))
        save(self.word_freq, os.path.join(self.save_dir, 'word_freq.pkl'))
        save(self.special, os.path.join(self.save_dir, 'special_words.pkl'))
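
A usage sketch, assuming `vocab` is an instance of the class that owns this
method and the target directory is a placeholder:

# Hypothetical usage: the directory is created on demand if it does not exist.
vocab.save(save_dir='output/vocab')  # writes idx2token.pkl, token2idx.pkl, ...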
Example #5
# Imports assumed by this snippet; `data_reader`, `Feature`, `XGBLR`,
# `load_pkl`, `load_vocab`, `save`, `config`, and `logger` are
# project-internal helpers expected to be in scope.
from sklearn.metrics import classification_report, confusion_matrix


def infer_classic(model_type='xgboost_lr',
                  model_save_path='',
                  label_vocab_path='',
                  test_data_path='',
                  pred_save_path='',
                  feature_vec_path='',
                  col_sep='\t',
                  feature_type='tfidf_word'):
    # load data content
    data_set, true_labels = data_reader(test_data_path, col_sep)
    # init feature
    feature = Feature(data_set,
                      feature_type=feature_type,
                      feature_vec_path=feature_vec_path,
                      is_infer=True)
    # get data feature
    data_feature = feature.get_feature()
    # load model
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path)
    else:
        model = load_pkl(model_save_path)

    # predict
    pred_label_probs = model.predict_proba(data_feature)

    # label id map
    label_id = load_vocab(label_vocab_path)
    id_label = {v: k for k, v in label_id.items()}

    pred_labels = [id_label[prob.argmax()] for prob in pred_label_probs]
    pred_output = [
        id_label[prob.argmax()] + col_sep + str(prob.max())
        for prob in pred_label_probs
    ]
    logger.info("save infer label and prob result to: %s" % pred_save_path)
    save(pred_output,
         ture_labels=None,
         pred_save_path=pred_save_path,
         data_set=data_set)
    if 'logistic_regression' in model_save_path and config.is_debug:
        count = 0
        features = load_pkl('output/lr_features.pkl')
        for line in data_set:
            if count > 5:
                break
            count += 1
            logger.debug(line)
            words = line.split()
            for category, category_feature in features.items():
                logger.debug('*' * 43)
                logger.debug(category)
                category_score = 0
                for w in words:
                    if w in category_feature:
                        category_score += category_feature[w]
                        logger.debug("%s:%s" % (w, category_feature[w]))
                logger.debug("%s\t%f" % (category, category_score))
                logger.debug('=' * 43)
    if true_labels:
        # evaluate
        try:
            print(classification_report(true_labels, pred_labels))
            print(confusion_matrix(true_labels, pred_labels))
        except UnicodeEncodeError:
            true_labels_id = [label_id[i] for i in true_labels]
            pred_labels_id = [label_id[i] for i in pred_labels]
            print(classification_report(true_labels_id, pred_labels_id))
            print(confusion_matrix(true_labels_id, pred_labels_id))
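
As with the deep-model variant, a hypothetical call; all paths below are
placeholders for artifacts produced at training time:

# Hypothetical paths for the classic (feature-based) pipeline.
infer_classic(model_type='xgboost_lr',
              model_save_path='output/xgboost_lr_model.pkl',
              label_vocab_path='output/label_vocab.txt',
              test_data_path='data/test.txt',
              pred_save_path='output/xgboost_lr_infer_result.txt',
              feature_vec_path='output/tfidf_word_vec.pkl',
              feature_type='tfidf_word')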