Example No. 1
def cal_confidence_score(sample, model_name):
    '''
    Tag an unlabeled sample, compute its confidence score, and return the result.
    :param sample: the sample object to tag
    :param model_name: path of the trained CRF model file
    :return: story id, sentence, predicted function point names, and CRF confidence
    '''
    model = Tagger()
    model.open(model_name)
    # unlabeled sample features
    feature_sequence = build_model_features(sample, 17, False)
    # words
    # words = sample.sen_words
    chars = list(sample.sentence)
    model.set(feature_sequence)
    predicted_labels = model.tag()

    # collect the predicted function points: B/I/E characters extend the
    # current fp, an N label closes it
    fp_list = []
    fp = ''
    for index, label in enumerate(predicted_labels):
        if label in ('B', 'I', 'E'):
            fp += chars[index]
        elif label == 'N' and len(fp) > 0:
            fp_list.append(fp)
            fp = ''
    if fp:
        # keep a function point that runs to the end of the sentence
        fp_list.append(fp)

    # probability of the whole predicted tag sequence under the CRF model
    crf_confidence = model.probability(predicted_labels)

    # language-model filtering is currently disabled, so every predicted fp is kept
    filtered_fp_list = list(fp_list)

    if len(filtered_fp_list) == 0:
        predicted_fps = 'null'
    else:
        predicted_fps = ' '.join(filtered_fp_list)

    # print(str(sample.story_id) +' '+ sample.sentence +' '+ fp +' '+ str(confidence))
    # Return the sample info as well, so that results can still be matched to
    # their samples when multiprocessing finishes out of order.
    return sample.story_id, sample.sentence, predicted_fps, crf_confidence
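
For context, a minimal sketch of how cal_confidence_score might be called. The Sample class, its fields, and the model path below are illustrative stand-ins rather than the project's real objects; the actual sample must expose whatever build_model_features expects.

# Hypothetical usage; Sample and 'crf_iter_3.model' are assumptions.
from dataclasses import dataclass

@dataclass
class Sample:
    story_id: int
    sentence: str

sample = Sample(story_id=42, sentence='用户可以导出月度报表')
story_id, sentence, fps, confidence = cal_confidence_score(sample, 'crf_iter_3.model')
print(story_id, fps, confidence)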
Example No. 2
def crf_predict(
    tagger: pycrfsuite.Tagger,
    gp_data: list,
    mode: str = 'raw',
    exclude_labels: list = ['NOL', 'NAT', 'NEE']
) -> Union[list, Tuple[list, pd.DataFrame]]:
    """Return predictions for the test data, grouped by file. 3 modes for return:
		* Return raw predictions (raw)
		* Return predictions with only valid tags (exclude_ool)
		* Return predictions (valid tags) and probabilities for each class (rt_proba)

	Predictions are returned unflattened
	
	https://python-crfsuite.readthedocs.io/en/latest/pycrfsuite.html
	"""
    if mode not in ['raw', 'exclude_ool', 'rt_proba']:
        raise ValueError(
            f"mode must be one of raw|exclude_ool|rt_proba; currently {mode}")
    if mode == 'raw':
        return [tagger.tag(xseq) for xseq in gp_data]
    labels = tagger.labels()

    res = []
    y_pred = []
    for fi, xseq in enumerate(gp_data):
        tagger.set(xseq)
        file_proba = pd.DataFrame({
            label: [tagger.marginal(label, i) for i in range(len(xseq))]
            for label in labels
        })
        y_pred.append(file_proba[[
            col for col in file_proba.columns if col not in exclude_labels
        ]].idxmax(axis=1).tolist())
        file_proba['file_id'] = fi
        res.append(file_proba)

    if mode == 'rt_proba':
        return y_pred, pd.concat(res, axis=0)
    return y_pred  # mode == 'exclude_ool'
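
A minimal usage sketch for crf_predict, assuming a trained CRF model file and feature sequences in the same item-dict format used at training time; the path, features, and labels below are illustrative.

# Hypothetical usage; 'model.crfsuite' and the feature dicts are assumptions.
import pycrfsuite

tagger = pycrfsuite.Tagger()
tagger.open('model.crfsuite')

gp_data = [
    [{'word': 'bonjour'}, {'word': 'maman'}],  # one feature sequence per file
    [{'word': 'oui'}],
]

y_pred, proba = crf_predict(tagger, gp_data, mode='rt_proba')
print(y_pred)        # nested predictions, one sublist per file
print(proba.head())  # per-token marginal probabilities plus a 'file_id' column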
Example No. 3
def evaluate_model_by_story(model_name, test_samples):
    model = Tagger()
    model.open(model_name)

    story_fps = dict()
    for sample in test_samples:
        model.set(build_model_features(sample, 17, True))
        predicted_labels = model.tag()

        chars = list(sample.sentence)
        predicted_fps = []
        fp = ''
        for index, label in enumerate(predicted_labels):
            if label in ('E', 'S'):
                fp += chars[index]
                predicted_fps.append(fp)
                fp = ''
            elif label in ('B', 'I'):
                fp += chars[index]

        actual_fps = [fp for fp in sample.fps if fp != '' and fp != 'null' and fp in sample.sentence]

        # language-model filtering is currently disabled; keep every predicted fp
        filtered_predicted_fps = predicted_fps
        # for predicted_fp in predicted_fps:
        #     lan_confidence_temp = lmmodel.score(predicted_fp, bos=True, eos=True) / len(predicted_fp)
        #     if len(re.findall('[a-zA-Z0-9+]+', predicted_fp)) > 0:
        #         lan_confidence_temp += 5
        #     if lan_confidence_temp > -2.4:
        #         filtered_predicted_fps.append(predicted_fp)

        if sample.story_id not in story_fps:
            story_fps[sample.story_id] = [set(actual_fps), set(filtered_predicted_fps)]
        else:
            story_fps[sample.story_id][0].update(actual_fps)
            story_fps[sample.story_id][1].update(filtered_predicted_fps)

    # print(len(story_fps))
    global sim_t
    sim_threshold = sim_t

    TP_precision = 0
    TP_recall = 0
    all_actual_fps = 0
    all_predicted_fps = 0
    for story_id, (actual_fps, predicted_fps) in story_fps.items():
        story_precision = 0.0
        story_recall = 0.0

        all_actual_fps += len(actual_fps)

        all_predicted_fps += len(predicted_fps)
        # for actual_fp in actual_fps:

        story = samples_dao.read_story_by_story_id(int(story_id))
        data = [story_id,
                story[0] if story is not None else '',
                story[1] if story is not None else '',
                story[2] if story is not None else '',
                story[3] if story is not None else '',
                story[4] if story is not None else '',
                actual_fps,
                predicted_fps]
        with open('../Archive/date_performance/results/IterRes_by_story_details.csv', 'a', newline='') as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(data)
        for predicted_fp in predicted_fps:
            sim = []
            for actual_fp in actual_fps:
                similarity = 1-distance.nlevenshtein(actual_fp, predicted_fp, method=1)
                # if actual_fp in predicted_fp:
                #     similarity = 1
                sim.append(similarity)
            # print(sim)

            if len(sim) == 0:
                sim = [0]
            if max(sim) >= sim_threshold:
                TP_precision += 1
                story_precision += 1

        for actual_fp in actual_fps:
            sim = []
            for predicted_fp in predicted_fps:
                similarity = 1-distance.nlevenshtein(actual_fp, predicted_fp, method=1)
                sim.append(similarity)
            # print(sim)
            if len(sim) == 0:
                sim = [0]
            if max(sim) >= sim_threshold:
                TP_recall += 1
                story_recall += 1

        # per-story details
        story_precision = 0 if len(predicted_fps) == 0 else story_precision / len(predicted_fps)
        story_recall = 0 if len(actual_fps) == 0 else story_recall / len(actual_fps)
        data = ["STORY " + story_id, story_precision, story_recall]
        with open('../Archive/date_performance/results/story_details.csv', 'a', newline='') as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(data)
    with open('../Archive/date_performance/results/story_details.csv', 'a', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(["THE END!!!"])

    # overall results
    precision = TP_precision/all_predicted_fps
    recall = TP_recall/all_actual_fps
    f1 = 2 * precision * recall / (precision + recall)

    print("By Story: Iteration: %s\n\tPrecision: %f\n\tRecall: %f\n\tF1: %f\n\n\n"
          % (model_name.split('_')[2], precision, recall, f1))

    data = ["BY STORY: Iteration " + model_name.split('_')[2], precision, recall, f1]

    with open('../Archive/date_performance/results/IterRes_by_story.csv', 'a', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(data)

    return precision, recall, f1
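
The matching above treats a predicted function point as correct when its best normalized Levenshtein similarity against the gold set reaches sim_threshold. Below is a stand-alone sketch of that check, using the `distance` package; the strings and the 0.6 threshold are illustrative.

# Illustrative similarity check; strings and the 0.6 threshold are assumptions.
import distance

actual_fps = {'导出报表', '查询订单'}
predicted_fp = '导出报告'

sims = [1 - distance.nlevenshtein(actual_fp, predicted_fp, method=1)
        for actual_fp in actual_fps]
print(sims, max(sims, default=0) >= 0.6)  # True if the best match clears the threshold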
Example No. 4
def evaluate_model(model_name, test_samples):
    '''
    Evaluate the final trained model on the test samples and write the results.
    :param model_name: path of the trained CRF model file
    :param test_samples: samples to evaluate on
    :return: average accuracy, recall, and F1 over the test samples
    '''
    model = Tagger()
    model.open(model_name)

    accuracy = 0.0
    recall = 0.0
    f1 = 0.0
    # sample_accuracy = 0.0
    iteration_test_details = []
    for sample in test_samples:
        model.set(build_model_features(sample, 17, True))
        predicted_labels = model.tag()
        true_labels = sample.char_label

        # collapse the tag scheme to a binary indicator: 0 for 'N', 1 for any fp tag
        predicted_label_index = [0 if label == 'N' else 1 for label in predicted_labels]
        true_label_index = [0 if label == 'N' else 1 for label in true_labels]

        iteration_test_details = []
        chars = list(sample.sentence)
        # sen_words = sample.sen_words
        iteration_test_details.append(sample.sentence)
        predicted_fps = ''
        actual_fps = ''
        for index, label in enumerate(predicted_labels):
            if label != 'N':
                predicted_fps += chars[index]
        if len(predicted_fps) == 0:
            predicted_fps = '-----'

        for index, label in enumerate(true_labels):
            if label != 'N':
                actual_fps += chars[index]

        iteration_test_details.append(actual_fps)
        iteration_test_details.append(predicted_fps)

        with open('../Archive/date_performance/results/Iteration_Test_Details.csv', 'a', newline='') as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(iteration_test_details)

        # print(sample.sen_words)
        # print(predicted_labels)
        # print(true_labels)

        sample_acc = metrics.accuracy_score(true_label_index, predicted_label_index)
        sample_rec = metrics.recall_score(true_label_index, predicted_label_index, average='binary', pos_label=1)
        accuracy += sample_acc
        recall += sample_rec
        # per-sample F1 from the per-sample scores (0 when both are 0),
        # rather than mixing the running totals into the formula
        f1 += 2 * sample_acc * sample_rec / (sample_acc + sample_rec) if sample_acc + sample_rec > 0 else 0
        # sample_accuracy += metrics.sequence_accuracy_score(true_labels, predicted_labels)

    print("Iteration: %s\n\tAccuracy: %f\n\tRecall: %f\n\tF1: %f\n\n\n"
          % (
              model_name.split('_')[2], accuracy / len(test_samples), recall / len(test_samples),
              f1 / len(test_samples)))

    data = ["Iteration " + model_name.split('_')[2], accuracy / len(test_samples), recall / len(test_samples),
            f1 / len(test_samples)]

    with open('../Archive/date_performance/results/IterRes.csv', 'a', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(data)

    with open('../Archive/date_performance/results/Iteration_Test_Details.csv', 'a', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(data)

    return accuracy / len(test_samples), recall / len(test_samples), f1 / len(test_samples)
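
For reference, a small sketch of the per-sentence scoring step in isolation: the BIES/N tags are collapsed to a binary "part of a function point" indicator and scored with scikit-learn. The label sequences below are made up for illustration.

# Illustrative per-sentence scoring; the label sequences are assumptions.
from sklearn import metrics

true_labels = ['B', 'I', 'E', 'N', 'N']
predicted_labels = ['B', 'E', 'N', 'N', 'N']

y_true = [0 if label == 'N' else 1 for label in true_labels]
y_pred = [0 if label == 'N' else 1 for label in predicted_labels]

print(metrics.accuracy_score(y_true, y_pred))             # 0.8
print(metrics.recall_score(y_true, y_pred, pos_label=1))  # 0.666...
print(metrics.f1_score(y_true, y_pred, pos_label=1))      # 0.8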