def cal_confidence_score(sample, model_name):
    """Tag an unlabeled sample with a CRF model and score the tagging.

    The sample's sentence is labelled character by character. Characters
    labelled 'B', 'I' or 'E' are concatenated into feature-point names; an
    'N' label (or the end of the sentence) closes the name being built.

    :param sample: object exposing ``sentence`` and ``story_id``; it is also
        handed to ``build_model_features`` to produce the model input
    :param model_name: value passed to ``Tagger.open`` to load the model
    :return: tuple ``(story_id, sentence, predicted_fps, crf_confidence)``.
        ``predicted_fps`` is the feature-point names joined by single spaces,
        or the string ``'null'`` when none were predicted.
        ``crf_confidence`` is ``model.probability`` of the predicted labels.
    """
    model = Tagger()
    model.open(model_name)

    # Features of the unlabeled sample.
    # NOTE(review): the meaning of the literal arguments 17 and False is
    # defined by build_model_features, which is not visible here.
    feature_sequence = build_model_features(sample, 17, False)
    model.set(feature_sequence)
    predicted_labels = model.tag()

    # Assumes one predicted label per character of the sentence.
    fp_list = _collect_feature_points(list(sample.sentence), predicted_labels)

    # Probability the model assigns to the label sequence it just predicted.
    crf_confidence = model.probability(predicted_labels)

    predicted_fps = ' '.join(fp_list) if fp_list else 'null'

    # The sample's own id and sentence are returned with the prediction so a
    # caller running this in several processes can still match each result to
    # its sample when results arrive out of order.
    return sample.story_id, sample.sentence, predicted_fps, crf_confidence


def _collect_feature_points(chars, labels):
    """Group the characters labelled 'B'/'I'/'E' into feature-point names."""
    names = []
    current = []
    for index, label in enumerate(labels):
        if label in ('B', 'I', 'E'):
            current.append(chars[index])
        elif label == 'N' and current:
            names.append(''.join(current))
            current = []
    # A name still open when the labels run out has no closing 'N'; keep it
    # instead of dropping the last feature point of the sentence.
    if current:
        names.append(''.join(current))
    return names
pickle.dump({ 'dataset': dataset, 'thetas': thetas }, open(FLEXCRF_TEST_DATA_FILE, 'wb')) else: dd = pickle.load(open(FLEXCRF_TEST_DATA_FILE)) dataset = dd['dataset'] thetas = dd['thetas'] # -- Start classification ------------------------------------------------ for seq in range(len(dataset)): # -- with crfsuite s_ = tagger.tag(data['X'][seq]) y_ = np.array([int(model.labels[s]) for s in s_]) prob_ = tagger.probability(s_) print "\n-- With crfsuite:" print "labels:\n", s_, "\n", y_ print "probability:\t %f" % prob_ # -- with flexcrf f_xy, y = dataset[seq] theta = thetas[seq] m_xy, f_m_xy = _compute_all_potentials(f_xy, theta) y_pred = viterbi_decoder(m_xy) # ADD CODE TO COMPUTE POSTERIOR PROBABILITY WITH FLEXCRF HERE ....