def _parse_layer(line, candidates):
    """Parse one line of whitespace-separated indices into a set.

    Returns the set of parsed integers, or None when any token is not an
    integer or is not a member of *candidates*.
    """
    try:
        picked = set(int(token) for token in line.split())
    except ValueError:
        return None
    if not picked <= candidates:
        return None
    return picked


def generate_diff_data(entities, person_mat, org_mat, loc_mat):
    """Interactively collect pairwise training samples for each entity type.

    For each of PERSON, ORGANIZATION and LOCATION the indices returned by
    ``manual_ranker`` are printed together with ``entities[type][i][0]``.
    The user then enters lines of space-separated indices; each line forms
    one "layer", and an empty line ends the input.  Every index that was
    never entered ends up in a final layer.

    For every pair (a, b) with ``a`` in an earlier layer than ``b`` two
    samples are produced: ``dist_vector(mat[a], mat[b])`` labelled 1 and
    ``dist_vector(mat[b], mat[a])`` labelled -1.

    Args:
        entities: mapping indexed as ``entities[type][i][0]`` to obtain the
            text shown for index ``i``.
        person_mat, org_mat, loc_mat: per-type containers indexed by entity
            index; rows are passed to ``dist_vector``.

    Returns:
        A list with one ``(Xs, ys)`` tuple per entity type, in the order
        PERSON, ORGANIZATION, LOCATION.
    """
    person_result, org_result, loc_result = manual_ranker(
        entities, person_mat, org_mat, loc_mat)
    data_pair = [
        ("PERSON", person_mat, person_result),
        ("ORGANIZATION", org_mat, org_result),
        ("LOCATION", loc_mat, loc_result),
    ]

    ret = []
    for entity_type, mat, result in data_pair:
        # Single-argument print(...) emits the same text under Python 2's
        # print statement and under the print function.
        print("\n\nShowing %s" % entity_type)
        for idx in result:
            print("%d: %s" % (idx, entities[entity_type][idx][0]))

        remainder = set(result)
        layers = []
        while True:
            line = raw_input("Good results:")
            if not line:
                break
            layer = _parse_layer(line, remainder)
            if layer is None:
                # A typo must not abort the session (losing earlier labels),
                # and an index that is unknown or already ranked would yield
                # contradictory or out-of-range samples -- ask again instead.
                print("Invalid input: enter space-separated indices "
                      "from the entries not yet selected.")
                continue
            layers.append(layer)
            remainder -= layer
        # Everything the user did not pick is ranked below every picked layer.
        layers.append(remainder)

        Xs, ys = [], []
        for upper in xrange(len(layers) - 1):
            for lower in xrange(upper + 1, len(layers)):
                for a in layers[upper]:
                    for b in layers[lower]:
                        # Emit both orientations so the labels are balanced.
                        Xs.append(dist_vector(mat[a], mat[b]))
                        ys.append(1)
                        Xs.append(dist_vector(mat[b], mat[a]))
                        ys.append(-1)
        ret.append((Xs, ys))
    return ret
def ml_ranker(entities, person_mat, org_mat, loc_mat):
    """Rank the entities of each type using pickled pairwise classifiers.

    Three classifiers (person, organization, location) are loaded from
    ``classifiers.dat``.  For each matrix, every pair ``i < j`` is classified
    in both orientations; a prediction of ``[1]`` for (i, j) or ``[-1]`` for
    (j, i) gives a vote to ``i``, anything else gives the vote to ``j``.

    Args:
        entities: unused by this function; kept for signature compatibility.
        person_mat, org_mat, loc_mat: per-type sequences of rows.  Each is
            passed to ``normalize`` (return value ignored -- presumably it
            works in place; confirm against its definition).

    Returns:
        A tuple ``(ret, scores)``.  ``ret`` holds, per type, the row indices
        sorted by descending vote count (ties keep ascending index order);
        ``scores`` holds, per type, the vote count of every row.
    """
    # SECURITY: pickle.load executes arbitrary code from the file; only load
    # a classifiers.dat that comes from a trusted source.
    # NOTE(review): the mode is kept as "r" to stay consistent with however
    # the file was written; binary pickle protocols would need "rb".
    with open("classifiers.dat", "r") as clf_file:
        clf_person, clf_org, clf_loc = pickle.load(clf_file)

    data_set = [
        (person_mat, clf_person),
        (org_mat, clf_org),
        (loc_mat, clf_loc),
    ]

    ret = []
    scores = []
    for data, clf in data_set:
        normalize(data)
        count = len(data)
        score = [0] * count
        for i in xrange(count - 1):
            for j in xrange(i + 1, count):
                # Forward orientation: [1] means row i outranks row j.
                if clf.predict(dist_vector(data[i], data[j])) == [1]:
                    score[i] += 1
                else:
                    score[j] += 1
                # Reverse orientation: [-1] also means row i outranks row j.
                if clf.predict(dist_vector(data[j], data[i])) == [-1]:
                    score[i] += 1
                else:
                    score[j] += 1
        # sorted() is stable even with reverse=True, so equal scores keep
        # their ascending index order, exactly as with a negated key.
        result = sorted(range(count), key=score.__getitem__, reverse=True)
        ret.append(result)
        scores.append(score)
    return ret, scores