def _show_all_labels(self):
    """Fit the model on every sample (human- or machine-labeled) and save predictions.

    Collects label/feature pairs from ``self.samples``, trains on a split of the
    sparse feature matrix, persists the model, evaluates on the held-out part,
    and writes the per-sample "label<sep>prob" output next to the original texts.
    """
    output, contents, seg_contents, features, labels = [], [], [], [], []
    for sample in self.samples:
        # prefer the human label; fall back to the machine-assigned one
        label = sample.human_label if sample.human_label else sample.machine_label
        output.append(label + self.col_sep + str(sample.prob))
        seg_contents.append(sample.seg_text_word)
        contents.append(sample.original_text)
        labels.append(label)
        # densify the single-row sparse feature into a plain list
        features.append(sample.feature.toarray().tolist()[0])
    # train/validation split over the sparse feature matrix
    X_train, X_val, y_train, y_val = train_test_split(
        csr_matrix(np.array(features)), labels)
    self.model.fit(X_train, y_train)
    # persist the fitted model, then report validation metrics
    dump_pkl(self.model, self.model_save_path, overwrite=True)
    eval(self.model, X_val, y_val)
    # NOTE: `ture_labels` is the (misspelled) keyword of the project `save` helper
    save(output, ture_labels=None, pred_save_path=self.pred_save_path,
         data_set=contents)
def infer_deep_model(model_type='cnn', data_path='', model_save_path='',
                     label_vocab_path='', max_len=300, batch_size=128,
                     col_sep='\t', pred_save_path=None):
    """Run inference with a trained deep model; evaluate when gold labels exist.

    :param model_type: network type; 'han' needs doc-level features
    :param data_path: input text file read by ``data_reader``
    :param model_save_path: path of the trained keras model
    :param label_vocab_path: label->id vocab file
    :param max_len: max sequence length for vectorization
    :param batch_size: prediction batch size
    :param col_sep: column separator used in input/output files
    :param pred_save_path: where "label<sep>prob" lines are written
    """
    data_set, true_labels = data_reader(data_path, col_sep)
    # han consumes a [doc, sentence, dim] tensor (rank 3); others [sentence, dim]
    feature_type = 'doc_vectorize' if model_type == 'han' else 'vectorize'
    feature = Feature(data_set, feature_type=feature_type, is_infer=True,
                      max_len=max_len)
    data_feature = feature.get_feature()
    model = load_model(model_save_path)
    # in keras, predict already yields class probabilities (same as predict_proba)
    pred_label_probs = model.predict(data_feature, batch_size=batch_size)
    # invert the label vocabulary: id -> label name
    label_id = load_vocab(label_vocab_path)
    id_label = {idx: name for name, idx in label_id.items()}
    pred_labels = [id_label[prob.argmax()] for prob in pred_label_probs]
    pred_output = [
        id_label[prob.argmax()] + col_sep + str(prob.max())
        for prob in pred_label_probs
    ]
    logger.info("save infer label and prob result to: %s" % pred_save_path)
    save(pred_output, ture_labels=None, pred_save_path=pred_save_path,
         data_set=data_set)
    if true_labels:
        # evaluate against the gold labels
        assert len(pred_labels) == len(true_labels)
        for label, prob in zip(true_labels, pred_label_probs):
            logger.debug('label_true:%s\tprob_label:%s\tprob:%s' %
                         (label, id_label[prob.argmax()], prob.max()))
        print('total eval:')
        try:
            print(classification_report(true_labels, pred_labels))
            print(confusion_matrix(true_labels, pred_labels))
        except UnicodeEncodeError:
            # console can't render the label text; report by numeric id instead
            true_ids = [label_id[name] for name in true_labels]
            pred_ids = [label_id[name] for name in pred_labels]
            print(classification_report(true_ids, pred_ids))
            print(confusion_matrix(true_ids, pred_ids))
def _train(self, labeled_sample_list, unlabeled_sample_list, batch_id):
    """Fit on the labeled pool, then machine-label the unlabeled pool.

    :param labeled_sample_list: samples with a human or machine label
    :param unlabeled_sample_list: samples to predict; may be empty
    :param batch_id: active-learning batch index, used in the output file name
    :return: list of samples that received a machine label this batch
    """
    machine_samples_list = []
    # build label/feature pairs from the labeled pool
    labeled_data_label = [
        i.human_label if i.human_label else i.machine_label
        for i in labeled_sample_list
    ]
    labeled_data_feature = [
        i.feature.toarray().tolist()[0] for i in labeled_sample_list
    ]
    X_train, X_val, y_train, y_val = train_test_split(
        csr_matrix(np.array(labeled_data_feature)), labeled_data_label)
    self.model.fit(X_train, y_train)
    # persist the fitted model, then report validation metrics
    dump_pkl(self.model, self.model_save_path, overwrite=True)
    eval(self.model, X_val, y_val)
    # fix: bail out BEFORE building features for an empty unlabeled pool
    # (original computed unlabeled_data_feature first, doing dead work)
    if not unlabeled_sample_list:
        return machine_samples_list
    # predict the unlabeled pool
    unlabeled_data_feature = [
        i.feature.toarray().tolist()[0] for i in unlabeled_sample_list
    ]
    pred_result = self.model.predict_proba(
        csr_matrix(np.array(unlabeled_data_feature)))
    pred_label_proba = [(self.id_label[prob.argmax()], prob.max())
                        for prob in pred_result]
    # save intermediate "label<sep>prob" result for this batch
    pred_output = [
        self.id_label[prob.argmax()] + self.col_sep + str(prob.max())
        for prob in pred_result
    ]
    # assumes self.pred_save_path ends with a 4-char extension (e.g. ".txt")
    pred_save_path = self.pred_save_path[:-4] + '_batch_' + str(
        batch_id) + '.txt'
    logger.debug("save infer label and prob result to: %s" % pred_save_path)
    unlabeled_data_text = [i.original_text for i in unlabeled_sample_list]
    save(pred_output, ture_labels=None, pred_save_path=pred_save_path,
         data_set=unlabeled_data_text)
    assert len(unlabeled_sample_list) == len(pred_label_proba)
    # write machine labels/probs back into the master sample table
    for unlabeled_sample, (label, prob) in zip(unlabeled_sample_list,
                                               pred_label_proba):
        idx = unlabeled_sample.id
        self.samples[idx].machine_label = label
        self.samples[idx].prob = prob
        machine_samples_list.append(unlabeled_sample)
    return machine_samples_list
def save(self, save_dir=None):
    """Persist vocabulary state (token mappings, frequencies, specials) as pickles.

    :param save_dir: optional override for ``self.save_dir``; when given it is
        remembered for subsequent calls. The directory is created if missing.
    """
    # fix: idiomatic identity test (`x is not None`, not `not x is None`)
    if save_dir is not None:
        self.save_dir = save_dir
    if not os.path.exists(self.save_dir):
        os.makedirs(self.save_dir)
    # module-level `save` helper writes each mapping to its own pickle file
    save(self.idx2token, self.save_dir + '/idx2token.pkl')
    save(self.token2idx, self.save_dir + '/token2idx.pkl')
    save(self.word_freq, self.save_dir + '/word_freq.pkl')
    save(self.special, self.save_dir + '/special_words.pkl')
def infer_classic(model_type='xgboost_lr', model_save_path='',
                  label_vocab_path='', test_data_path='', pred_save_path='',
                  feature_vec_path='', col_sep='\t',
                  feature_type='tfidf_word'):
    """Run inference with a classic model; evaluate when gold labels exist.

    :param model_type: 'xgboost_lr' loads XGBLR, anything else unpickles a model
    :param model_save_path: path of the trained model
    :param label_vocab_path: label->id vocab file
    :param test_data_path: input text file read by ``data_reader``
    :param pred_save_path: where "label<sep>prob" lines are written
    :param feature_vec_path: fitted feature vectorizer path
    :param col_sep: column separator used in input/output files
    :param feature_type: feature scheme passed to Feature
    """
    data_set, true_labels = data_reader(test_data_path, col_sep)
    feature = Feature(data_set, feature_type=feature_type,
                      feature_vec_path=feature_vec_path, is_infer=True)
    data_feature = feature.get_feature()
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path)
    else:
        model = load_pkl(model_save_path)
    pred_label_probs = model.predict_proba(data_feature)
    # invert the label vocabulary: id -> label name
    label_id = load_vocab(label_vocab_path)
    id_label = {idx: name for name, idx in label_id.items()}
    # one pass builds both the label list and the "label<sep>prob" output
    pred_labels, pred_output = [], []
    for prob in pred_label_probs:
        best = id_label[prob.argmax()]
        pred_labels.append(best)
        pred_output.append(best + col_sep + str(prob.max()))
    logger.info("save infer label and prob result to:%s" % pred_save_path)
    save(pred_output, ture_labels=None, pred_save_path=pred_save_path,
         data_set=data_set)
    if 'logistic_regression' in model_save_path and config.is_debug:
        # debug: dump per-word LR feature contributions for the first 6 lines
        features = load_pkl('output/lr_features.pkl')
        for line_idx, line in enumerate(data_set):
            if line_idx > 5:
                break
            logger.debug(line)
            words = line.split()
            for category, category_feature in features.items():
                logger.debug('*' * 43)
                logger.debug(category)
                category_score = 0
                for w in words:
                    if w in category_feature:
                        category_score += category_feature[w]
                        logger.debug("%s:%s" % (w, category_feature[w]))
                logger.debug("%s\t%f" % (category, category_score))
                logger.debug('=' * 43)
    if true_labels:
        # evaluate against the gold labels
        try:
            print(classification_report(true_labels, pred_labels))
            print(confusion_matrix(true_labels, pred_labels))
        except UnicodeEncodeError:
            # console can't render the label text; report by numeric id instead
            true_ids = [label_id[name] for name in true_labels]
            pred_ids = [label_id[name] for name in pred_labels]
            print(classification_report(true_ids, pred_ids))
            print(confusion_matrix(true_ids, pred_ids))