Beispiel #1
0
    def _preprocess(self, corpus: Corpus, log_stats=True):
        """Convert a corpus of questions into classifier inputs and labels.

        Each sample contributes one row per retained candidate: candidates
        that match the gold predicate path (``cmp(...) == 0``) are always
        kept with label 1; the remaining candidates get label 0 and are
        kept with probability 0.5 when ``corpus.name == 'train'`` and
        always kept otherwise.

        Args:
            corpus: Iterable of sample dicts exposing ``candidate_answer``
                (mapping predicate path -> feature sequence),
                ``gold_tuple``, ``gold_entitys`` and ``answer``; it must
                also provide ``name`` and, when ``log_stats`` is true,
                ``len()``.
            log_stats: When true, print the candidate recall statistics.

        Returns:
            A tuple ``(xs, ys, question2predicates, answers)`` where ``xs``
            and ``ys`` are arrays of dtype ``self._dtype`` (features are
            ``feats[2:]``, labels are ``[0]``/``[1]``),
            ``question2predicates`` holds one ``{'gold', 'candidate'}``
            dict per sample, and ``answers`` holds each sample's answer.
        """
        xs = []
        ys = []
        answers = []
        question2predicates = []
        num_right = {'all': .0, '2hop': .0}
        num_2hop = .0
        is_train = corpus.name == 'train'
        for sample in corpus:
            gold_predicates = sample['gold_tuple']
            gold_entities = sample['gold_entitys']
            answers.append(sample['answer'])

            kept_candidates = []
            question2predicates.append({
                'gold': gold_predicates,
                'candidate': kept_candidates
            })

            has_right = False
            for answer_predicates, feats in sample['candidate_answer'].items():
                if cmp(answer_predicates, gold_predicates) == 0:
                    has_right = True
                    label = 1
                elif random() < 0.5 or not is_train:
                    # random() is drawn for every negative candidate, even
                    # outside training, so the RNG stream stays unchanged.
                    label = 0
                else:
                    # Negative candidate dropped by down-sampling.
                    continue
                xs.append(feats[2:])
                ys.append([label])
                kept_candidates.append(answer_predicates)

            # Questions counted under '2hop' and reported as
            # "single-subject": short gold path and exactly one gold entity.
            is_single = len(gold_predicates) <= 3 and len(gold_entities) == 1
            if has_right:
                num_right['all'] += 1
                if is_single:
                    num_right['2hop'] += 1
            if is_single:
                num_2hop += 1

        xs = np.array(xs, dtype=self._dtype)
        ys = np.array(ys, dtype=self._dtype)
        if log_stats:
            # Guard the ratios: a corpus without single-subject questions
            # (or an empty corpus) must not raise ZeroDivisionError.
            num_samples = len(corpus)
            recall_2hop = num_right['2hop'] / num_2hop if num_2hop else 0.0
            recall_all = num_right['all'] / num_samples if num_samples else 0.0
            print('* For {}'.format(corpus.name))
            print('* Recall ratio of single-subject questions: {:.2f}'.format(
                recall_2hop))
            print('\tand the one of all questions: {:.2f}'.format(
                recall_all))
        return xs, ys, question2predicates, answers
Beispiel #2
0
def GetData(corpus):
    '''Build evaluation data (no negative sampling) for the validation set.

    Every candidate tuple of every question becomes one sample.

    Args:
        corpus: Indexable collection (``corpus[i]`` for ``i`` in
            ``range(len(corpus))``) of dicts with the keys
            ``candidate_tuples`` (mapping tuple -> features),
            ``gold_tuple``, ``gold_entitys`` and ``answer``.

    Returns:
        X: float32 numpy.array with one row ``[features[2]]`` per sample.
        Y: float32 numpy.array with one row ``[1]`` or ``[0]`` per sample;
            1 when every distinct element of the gold tuple occurs in the
            candidate tuple.
        samples: list of candidate tuples, aligned with the rows of X.
        ans: list with the ``answer`` entry of each question.
        gold_tuples: list with the gold tuple of each question.
        question2sample: dict, question index -> list of sample indexes.
    '''
    X = []
    Y = []
    samples = []
    ans = []
    gold_tuples = []
    question2sample = {}

    sample_index = 0
    true_num = 0
    hop2_num = 0
    hop2_true_num = 0
    # Index-based access is kept on purpose: the code only relies on
    # len(corpus) and corpus[i], not on iteration order.
    for i in range(len(corpus)):
        question = corpus[i]
        candidate_tuples = question['candidate_tuples']
        gold_tuple = question['gold_tuple']
        gold_entitys = question['gold_entitys']

        q_sample_indexs = []
        for t, features in candidate_tuples.items():
            # Same test as the original: the number of distinct gold
            # elements found in the candidate must equal len(gold_tuple).
            covered = len(set(gold_tuple).intersection(set(t)))
            label = 1 if len(gold_tuple) == covered else 0
            X.append([features[2]])
            Y.append([label])
            samples.append(t)
            q_sample_indexs.append(sample_index)
            sample_index += 1
        ans.append(question['answer'])
        gold_tuples.append(gold_tuple)
        question2sample[i] = q_sample_indexs

        # Check whether the gold tuple is among the candidate tuples.
        if_true = any(cmp(thistuple, gold_tuple) == 0
                      for thistuple in candidate_tuples)
        # Track recall overall and for questions with a short gold path
        # and exactly one gold entity.
        is_single = len(gold_tuple) <= 3 and len(gold_entitys) == 1
        if if_true:
            true_num += 1
            if is_single:
                hop2_true_num += 1
        if is_single:
            hop2_num += 1

    X = np.array(X, dtype='float32')
    Y = np.array(Y, dtype='float32')
    # Guard both ratios so an empty corpus, or one without single-entity
    # questions, reports 0 instead of raising ZeroDivisionError.
    hop2_recall = hop2_true_num / hop2_num if hop2_num else 0.0
    total_recall = true_num / len(corpus) if len(corpus) else 0.0
    print('单实体问题中,候选答案可召回的的比例为:%.3f' % hop2_recall)
    print('候选答案能覆盖标准查询路径的比例为:%.3f' % total_recall)
    return X, Y, samples, ans, gold_tuples, question2sample
Beispiel #3
0
 def recall_of(predicts_list, gold_list):
     """Return the ratio of questions whose gold path was predicted.

     Args:
         predicts_list: One sequence of ``(predict, prob)`` pairs per
             question.
         gold_list: The gold path of each question, aligned with
             ``predicts_list``.

     Returns:
         Number of questions with a prediction matching the gold path
         (``cmp(predict, gold) == 0``) divided by the number of questions
         whose gold path has length <= 3; ``0.0`` when there is no such
         question.
     """
     num_correct = .0
     num_single = .0
     for predicts, gold in zip(predicts_list, gold_list):
         if len(gold) <= 3:
             num_single += 1
         # NOTE(review): hits are counted for every question, not only the
         # ones counted in num_single, so the ratio can exceed 1 - confirm
         # this is intended.
         if any(cmp(predict, gold) == 0 for predict, _prob in predicts):
             num_correct += 1
     if not num_single:
         # No short gold path: avoid ZeroDivisionError.
         return 0.0
     return num_correct / num_single
Beispiel #4
0
def ComputePrecision(gold_tuples, predict_tuples, predict_props):
    '''Compute the recall of the filtered candidates as a float.

    Args:
        gold_tuples: Gold tuple of each question.
        predict_tuples: Per question, the sequence of retained candidate
            tuples; indexed in step with ``gold_tuples``.
        predict_props: Unused; kept for interface compatibility.

    Returns:
        Number of questions whose gold tuple is among the retained
        candidates (``cmp(...) == 0``) divided by the number of questions
        whose gold tuple has length <= 3; ``0.0`` when there is no such
        question.
    '''
    true_num = 0
    one_subject_num = 0
    for i, gold_tuple in enumerate(gold_tuples):
        if len(gold_tuple) <= 3:
            one_subject_num += 1
        # NOTE(review): hits are counted for every question, not only the
        # ones counted in one_subject_num - confirm this is intended.
        if any(cmp(predict_tuple, gold_tuple) == 0
               for predict_tuple in predict_tuples[i]):
            true_num += 1
    if one_subject_num == 0:
        # Nothing to normalise by: avoid ZeroDivisionError.
        return 0.0
    return true_num / one_subject_num
Beispiel #5
0
def GetData_train(corpus, neg_sample_rate=0.5, skip_range=(1283, 1683)):
    '''Build logistic-regression training data from the training candidates.

    Positive and negative candidates are heavily imbalanced, so negative
    candidates are randomly down-sampled: each one is kept with
    probability ``neg_sample_rate`` (0.5 by default).

    Args:
        corpus: Indexable collection (``corpus[i]`` for ``i`` in
            ``range(len(corpus))``) of dicts with the keys
            ``candidate_tuples`` (mapping tuple -> features),
            ``gold_tuple`` and ``gold_entitys``.
        neg_sample_rate: Probability of keeping a negative candidate.
        skip_range: ``(start, stop)`` half-open range of question indexes
            to leave out entirely, or ``None`` to use every question.

    Returns:
        X: float32 numpy.array with one row ``[features[9][0][1]]`` per
            retained candidate.
        Y: float32 numpy.array with the matching ``[1]`` / ``[0]`` labels;
            1 when every distinct element of the gold tuple occurs in the
            candidate tuple.
    '''
    X = []
    Y = []
    true_num = 0
    hop2_num = 0
    hop2_true_num = 0
    for i in range(len(corpus)):
        if skip_range is not None and skip_range[0] <= i < skip_range[1]:
            continue
        candidate_tuples = corpus[i]['candidate_tuples']  # dict
        gold_tuple = corpus[i]['gold_tuple']
        gold_entitys = corpus[i]['gold_entitys']

        for t, features in candidate_tuples.items():
            covered = len(set(gold_tuple).intersection(set(t)))
            if len(gold_tuple) == covered:
                label = 1
            elif random.random() < neg_sample_rate:
                label = 0
            else:
                # Negative candidate dropped by down-sampling.
                continue
            X.append([features[9][0][1]])
            Y.append([label])

        # Check whether the gold tuple was recalled among the candidates.
        if_true = any(cmp(thistuple, gold_tuple) == 0
                      for thistuple in candidate_tuples)
        is_single = len(gold_tuple) <= 3 and len(gold_entitys) == 1
        if if_true:
            true_num += 1
            if is_single:
                hop2_true_num += 1
        if is_single:
            hop2_num += 1

    X = np.array(X, dtype='float32')
    Y = np.array(Y, dtype='float32')
    # Guard both ratios so an empty corpus, or one without single-entity
    # questions, reports 0 instead of raising ZeroDivisionError.
    # NOTE(review): the overall ratio divides by len(corpus) although the
    # skipped questions can never count as hits - confirm this is intended.
    hop2_recall = hop2_true_num / hop2_num if hop2_num else 0.0
    total_recall = true_num / len(corpus) if len(corpus) else 0.0
    print('单实体问题中,候选答案可召回的的比例为:%.3f' % hop2_recall)
    print('候选答案能覆盖标准查询路径的比例为:%.3f' % total_recall)
    return X, Y