Ejemplo n.º 1
0
def caculate_bleu(train_data,sentence):
    """Return the training sentences most similar to ``sentence`` by BLEU score.

    Every entry of ``train_data['SEGMENT_FULL']`` is split on ``'|'`` and
    scored against ``sentence`` (also split on ``'|'``) with ``bleu``. The
    ``k`` best-scoring rows are selected; rows ranked directly after them are
    also kept for as long as their score equals the best score.

    NOTE(review): ``k`` and ``bleu`` are not defined inside this function; they
    are resolved from the enclosing/module scope -- confirm where they are set.

    Args:
        train_data: pandas-style table with 'SEGMENT_FULL' and 'LABEL'
            columns; both columns are joined as strings for logging.
        sentence: '|'-separated token string to compare against.

    Returns:
        A tuple of three lists ``(scores, sentences, labels)``, aligned by
        position and ordered from highest to lowest score.
    """
    logging.debug('=' * 10 + 'BLEU' + '=' * 10)
    logging.debug(u'句子:%s', sentence)

    # Score every training sentence against the query sentence.
    hypothesis = sentence.split('|')
    segments = train_data['SEGMENT_FULL']
    # `.values` replaces `.as_matrix()`, which was removed in pandas 1.0.
    scores = segments.apply(
        lambda segment: bleu([segment.split('|')], hypothesis)
    ).values

    # Rank row positions from highest to lowest score.
    ranking = np.argsort(scores)[::-1]
    ranked_scores = scores[ranking]

    # Keep the top k, then extend with rows that tie the best score. The scan
    # starts at rank k (not k + 1) so that the first row after the top k is
    # not silently skipped.
    indexes = ranking[:k].tolist()
    for position in range(k, len(ranking)):
        if ranked_scores[position] != ranked_scores[0]:
            break
        indexes.append(int(ranking[position]))

    most_similary_score = scores[indexes]
    most_similary_sentence = segments.iloc[indexes].values
    most_similary_label = train_data['LABEL'].iloc[indexes].values

    logging.debug(u'前k个最相似的句子为:%s', ' , '.join(most_similary_sentence))
    logging.debug('分数分别为:%s', most_similary_score)
    logging.debug(u'类别分别为:%s', ' , '.join(most_similary_label))

    return most_similary_score.tolist(), most_similary_sentence.tolist(), most_similary_label.tolist()
Ejemplo n.º 2
0
def caculate_bleu(train_data,sentence,threshold=0.6):
    """Find the label(s) whose training sentences best match ``sentence`` by BLEU.

    Rows of ``train_data`` are grouped by 'LABEL'. Within each group every
    'SEGMENT_FULL' entry is split on ``'|'`` and scored against ``sentence``
    (also split on ``'|'``) with ``bleu``; the group's best score becomes the
    score of that label.

    NOTE(review): ``bleu`` is not defined inside this function; it is resolved
    from the enclosing/module scope.

    Args:
        train_data: pandas-style table with 'SEGMENT_FULL' and 'LABEL'
            columns; labels are joined as strings for logging.
        sentence: '|'-separated token string to compare against.
        threshold: labels whose best score is strictly greater than this value
            are returned in addition to the label(s) with the top score.

    Returns:
        ``(labels, ranking)`` where ``labels`` is the list of selected labels
        and ``ranking`` is a list of ``(label, best_score)`` tuples sorted from
        highest to lowest score. Both are empty when ``train_data`` yields no
        groups.
    """
    logging.debug('=' * 10 + 'BLEU' + '=' * 10)
    hypothesis = sentence.split('|')
    similary_score_list = []
    # Group by the column name itself (not a one-element list) so that each
    # key is a scalar label rather than a 1-tuple on pandas >= 2.0.
    for label, group in train_data.groupby('LABEL'):
        logging.debug('-' * 20)
        logging.debug(u'正在计算类别(%s),个数:%d', label, len(group))
        # Pair every sentence with its score, best match first.
        bleu_score = sorted(
            ((segment, bleu([segment.split('|')], hypothesis))
             for segment in group['SEGMENT_FULL']),
            key=lambda pair: pair[1],
            reverse=True,
        )
        logging.debug('得分列表(从大到小,越大越相似)为:%s', [pair[1] for pair in bleu_score])
        logging.debug(u'对应句子(从大到小,越大越相似)为:%s', ','.join([pair[0] for pair in bleu_score]))

        logging.debug(u'最大bleu相似性(越越相似):%f', bleu_score[0][1])
        similary_score_list.append((label, bleu_score[0][1]))

    if not similary_score_list:
        # No groups to rank (empty input): nothing can be selected.
        return [], []

    similary_score_list.sort(key=lambda pair: pair[1], reverse=True)
    most_similary_score = similary_score_list[0][1]
    most_similary_label = [
        label for label, score in similary_score_list
        if score == most_similary_score or score > threshold
    ]
    logging.debug('-' * 20)
    logging.debug(u'最近似类别:%s,分数(%f)', '|'.join(most_similary_label), most_similary_score)
    return most_similary_label, similary_score_list