def _preprocess(self, corpus: Corpus, log_stats=True): xs = [] ys = [] answers = [] question2predicates = [] num_right = {'all': .0, '2hop': .0} num_2hop = .0 is_train = corpus.name == 'train' for i, sample in enumerate(corpus): candidates = sample['candidate_answer'] gold_predicates = sample['gold_tuple'] gold_entities = sample['gold_entitys'] answers.append(sample['answer']) has_right = False question2predicates.append({ 'gold': gold_predicates, 'candidate': [] }) for answer_predicates, feats in candidates.items(): if cmp(answer_predicates, gold_predicates) == 0: xs.append(feats[2:]) ys.append([1]) question2predicates[-1]['candidate'].append( answer_predicates) else: prop = random() if prop < 0.5 or not is_train: xs.append(feats[2:]) ys.append([0]) question2predicates[-1]['candidate'].append( answer_predicates) if cmp(answer_predicates, gold_predicates) == 0: has_right = True if has_right: num_right['all'] += 1 if len(gold_predicates) <= 3 and len(gold_entities) == 1: num_right['2hop'] += 1 if len(gold_predicates) <= 3 and len(gold_entities) == 1: num_2hop += 1 xs = np.array(xs, dtype=self._dtype) ys = np.array(ys, dtype=self._dtype) if log_stats: print('* For {}'.format(corpus.name)) print('* Recall ratio of single-subject questions: {:.2f}'.format( num_right['2hop'] / num_2hop)) print('\tand the one of all questions: {:.2f}'.format( num_right['all'] / len(corpus))) return xs, ys, question2predicates, answers
def GetData(corpus):
    '''Build the data used to evaluate the model on the validation set.

    Every candidate tuple of every question becomes one sample; nothing is
    sub-sampled.  A sample is labelled 1 when the set of its elements
    contains every element of the gold tuple, i.e. when
    ``len(gold_tuple) == len(set(gold_tuple) & set(candidate))``.

    Args:
        corpus: container supporting ``len()`` and integer indexing; each
            entry is indexed with the keys 'candidate_tuples' (mapping of
            candidate tuple -> features), 'gold_tuple', 'gold_entitys' and
            'answer'.

    Returns:
        X : numpy.array, float32, one row ``[features[2]]`` per sample
        Y : numpy.array, float32, one row ``[label]`` per sample
        samples : list of candidate tuples, one per sample
        ans : list with the 'answer' entry of each question
        gold_tuples : list with the gold tuple of each question
        question2sample : dict, key: question index, value: sample indexes
    '''
    X = []
    Y = []
    samples = []
    ans = []
    gold_tuples = []
    question2sample = {}

    sample_index = 0
    true_num = 0
    hop2_num = 0
    hop2_true_num = 0
    for i in range(len(corpus)):
        question = corpus[i]
        candidate_tuples = question['candidate_tuples']
        gold_tuple = question['gold_tuple']
        gold_entitys = question['gold_entitys']
        answer = question['answer']

        # Built once per question instead of once per candidate.
        gold_items = set(gold_tuple)
        q_sample_indexs = []
        for t, features in candidate_tuples.items():
            label = 1 if len(gold_tuple) == len(gold_items.intersection(t)) else 0
            X.append([features[2]])
            Y.append([label])
            samples.append(t)
            q_sample_indexs.append(sample_index)
            sample_index += 1
        ans.append(answer)
        gold_tuples.append(gold_tuple)
        question2sample[i] = q_sample_indexs

        # Is the gold tuple among the candidate tuples?
        if_true = any(
            cmp(thistuple, gold_tuple) == 0 for thistuple in candidate_tuples)

        # Track how many single-entity questions can be recalled.
        is_single = len(gold_tuple) <= 3 and len(gold_entitys) == 1
        if is_single:
            hop2_num += 1
        if if_true:
            true_num += 1
            if is_single:
                hop2_true_num += 1

    X = np.array(X, dtype='float32')
    Y = np.array(Y, dtype='float32')

    # Report 0.000 instead of raising ZeroDivisionError on empty denominators.
    hop2_recall = hop2_true_num / hop2_num if hop2_num else 0.0
    total_recall = true_num / len(corpus) if len(corpus) else 0.0
    print('单实体问题中,候选答案可召回的的比例为:%.3f' % hop2_recall)
    print('候选答案能覆盖标准查询路径的比例为:%.3f' % total_recall)
    return X, Y, samples, ans, gold_tuples, question2sample
def recall_of(predicts_list, gold_list):
    """Return the recall of the predictions over short gold paths.

    Only questions whose gold entry has length <= 3 are counted.  Such a
    question is a hit when at least one of its predictions matches the gold
    entry (``cmp(predict, gold) == 0``).

    Args:
        predicts_list: One iterable of ``(predict, prob)`` pairs per
            question; ``prob`` is not used here.
        gold_list: One gold entry per question, aligned with
            ``predicts_list``.

    Returns:
        hits / counted questions as a float, or 0.0 when no question is
        counted (previously a ZeroDivisionError).
    """
    num_correct = .0
    num_single = .0
    for predicts, gold in zip(predicts_list, gold_list):
        if len(gold) > 3:
            continue
        num_single += 1
        if any(cmp(predict, gold) == 0 for predict, _prob in predicts):
            num_correct += 1
    if not num_single:
        return 0.0
    return num_correct / num_single
def ComputePrecision(gold_tuples, predict_tuples, predict_props):
    '''Compute the recall of the filtered candidates on single-entity questions.

    Only questions whose gold tuple has length <= 3 are counted.  Such a
    question is a hit when one of its predicted tuples matches the gold
    tuple (``cmp(predict_tuple, gold_tuple) == 0``).

    Args:
        gold_tuples: one gold tuple per question.
        predict_tuples: indexable by question position; each entry is the
            sequence of predicted tuples for that question.
        predict_props: not used by this function; kept so existing callers
            keep working.

    Returns:
        float, hits / counted questions, or 0.0 when no question is counted
        (previously a ZeroDivisionError).
    '''
    true_num = 0
    one_subject_num = 0
    for i, gold_tuple in enumerate(gold_tuples):
        if len(gold_tuple) > 3:
            continue
        one_subject_num += 1
        if any(cmp(predict_tuple, gold_tuple) == 0
               for predict_tuple in predict_tuples[i]):
            true_num += 1
    if one_subject_num == 0:
        return 0.0
    return true_num / one_subject_num
def GetData_train(corpus, skip_range=(1283, 1683), negative_keep_prob=0.5):
    '''Generate logistic-regression training data from training-set candidates.

    Positives and negatives are very imbalanced, so negatives are
    sub-sampled: each one is kept with probability ``negative_keep_prob``
    (0.5 by default; the earlier docstring said 0.05, which did not match
    the code).  A candidate is positive when the set of its elements
    contains every element of the gold tuple.

    Args:
        corpus: container supporting ``len()`` and integer indexing; each
            entry is indexed with the keys 'candidate_tuples' (dict of
            candidate tuple -> features), 'gold_tuple' and 'gold_entitys'.
        skip_range: ``(start, stop)`` half-open range of question indexes
            to leave out entirely, or None to use every question.  The
            default keeps the previously hard-coded 1283..1683 range
            (presumably a held-out slice -- confirm with the caller).
        negative_keep_prob: probability of keeping a negative candidate.

    Returns:
        X : numpy.array, float32, one row ``[features[9][0][1]]`` per
            kept candidate
        Y : numpy.array, float32, one row ``[label]`` per kept candidate
    '''
    X = []
    Y = []
    true_num = 0
    hop2_num = 0
    hop2_true_num = 0
    for i in range(len(corpus)):
        if skip_range is not None and skip_range[0] <= i < skip_range[1]:
            continue
        candidate_tuples = corpus[i]['candidate_tuples']  # dict
        gold_tuple = corpus[i]['gold_tuple']
        gold_entitys = corpus[i]['gold_entitys']

        # Built once per question instead of once per candidate.
        gold_items = set(gold_tuple)
        for t, features in candidate_tuples.items():
            is_positive = len(gold_tuple) == len(gold_items.intersection(t))
            # random.random() is only drawn for negatives, as before.
            if is_positive or random.random() < negative_keep_prob:
                X.append([features[9][0][1]])
                Y.append([1 if is_positive else 0])

        # Is the gold tuple recalled by the candidates?
        if_true = any(
            cmp(thistuple, gold_tuple) == 0 for thistuple in candidate_tuples)

        is_single = len(gold_tuple) <= 3 and len(gold_entitys) == 1
        if is_single:
            hop2_num += 1
        if if_true:
            true_num += 1
            if is_single:
                hop2_true_num += 1

    X = np.array(X, dtype='float32')
    Y = np.array(Y, dtype='float32')

    # Report 0.000 instead of raising ZeroDivisionError on empty denominators.
    # NOTE: the second ratio is still over len(corpus), skipped questions
    # included, exactly as before.
    hop2_recall = hop2_true_num / hop2_num if hop2_num else 0.0
    total_recall = true_num / len(corpus) if len(corpus) else 0.0
    print('单实体问题中,候选答案可召回的的比例为:%.3f' % hop2_recall)
    print('候选答案能覆盖标准查询路径的比例为:%.3f' % total_recall)
    return X, Y