Example #1
0
        def get_mids_by_surface():
            """Return candidate entities for the ``surface`` request argument.

            Reads ``surface`` from ``request.args``, strips surrounding
            whitespace and passes it to ``DBManager.get_candidate_entities``
            together with the constant ``0.1`` (the meaning of that argument
            is defined by ``DBManager`` -- presumably a score threshold,
            TODO confirm).

            Returns:
                A JSON string of the form ``{"candidates": "..."}`` where the
                value is one ``"<m[0]> <m[1]>"`` entry per result, joined
                with ``<br>``.
            """
            # A dict-style ``.get`` returns None for a missing key; fall back
            # to an empty string so an absent ``surface`` argument behaves
            # like an empty one instead of raising AttributeError on strip().
            surface = (request.args.get('surface') or '').strip()
            print('[get_mid_by_surface]')

            mids = DBManager.get_candidate_entities(surface, 0.1)
            print(mids)
            res = {
                'candidates': '<br>'.join('%s %s' % (m[0], m[1]) for m in mids)
            }
            return json.dumps(res)
Example #2
0
def _merge_candidates(candidates, results, max_score=None):
    """Fold lookup results into ``candidates``, keeping the best score.

    Args:
        candidates: dict mapping entity key -> best score seen so far;
            updated in place.
        results: iterable of indexable items where ``item[0]`` is the entity
            key and ``item[1]`` is its score.
        max_score: when given, only results whose score is strictly below
            this value are considered.
    """
    for result in results:
        entity, score = result[0], result[1]
        if max_score is not None and not score < max_score:
            continue
        if entity not in candidates or score > candidates[entity]:
            candidates[entity] = score


def _ngram_candidates(data, surfaces):
    """Look up candidates for every contiguous word n-gram of each surface.

    Used as a fallback when the whole-surface lookup found nothing.  A
    surface is only expanded when ``find_word`` locates it in the sentence
    rebuilt from ``data['tag_res']``; when ``data`` has a ``'pos'`` entry,
    each n-gram must also pass ``is_entity_occurrence``.

    Returns:
        A new dict mapping entity key -> best score.
    """
    candidates = dict()
    # Tokens look like "word|tag"; the first and last token are dropped.
    sentence = [w.split('|')[0] for w in data['tag_res'].split()][1:-1]
    has_pos = 'pos' in data
    all_pos = data['pos'][1:-1] if has_pos else None

    for surface in surfaces:
        words = surface.lower().split()
        if not words:
            continue
        start = find_word(sentence, words)
        if start == -1:
            continue
        total = len(words)
        # Longest n-grams first, then every offset at which that size fits.
        for size in range(total, 0, -1):
            for offset in range(total - size + 1):
                if has_pos and not is_entity_occurrence(
                        all_pos, sentence, start + offset,
                        start + offset + size):
                    continue
                ngram = ''.join(words[offset:offset + size])
                # Scores of 1.1 or more are ignored in this fallback (the
                # reason is not visible here -- TODO confirm).
                _merge_candidates(
                    candidates,
                    DBManager.get_candidate_entities(ngram, 0.1),
                    max_score=1.1)
    return candidates


def gen_unsolved_sentence(fn_in, fn_out):
    """Write the candidate entities found for each input sentence.

    Each line of ``fn_in`` is a JSON object with the keys ``'entity'`` (the
    gold entity), ``'predict'`` (tab-separated surfaces), ``'tag_res'``,
    optionally ``'pos'``, and ``'gold'`` (read only when the gold entity is
    not among the candidates).  For every line, one line is written to
    ``fn_out``: the gold entity, a tab, and the space-separated candidate
    entity keys.  Misses are reported on stdout, followed by a summary of
    how many gold entities were found and the average candidate count over
    those hits.

    Args:
        fn_in: path of the JSON-lines input file.
        fn_out: path of the output file (overwritten).
    """
    avg_candidate = 0
    num = 0
    with open(fn_in) as fin, open(fn_out, 'w') as fout:
        for line in fin:
            data = json.loads(line, encoding='utf8')
            gold_entity = data['entity']
            surfaces = data['predict'].split("\t")

            # First try each whole surface, lower-cased with spaces removed.
            candidates = dict()
            for surface in surfaces:
                key = surface.lower().replace(' ', '')
                _merge_candidates(
                    candidates, DBManager.get_candidate_entities(key, 0.1))
            if not candidates:
                # Nothing matched: fall back to n-grams of the surfaces.
                candidates = _ngram_candidates(data, surfaces)

            correct = any(e == gold_entity for e in candidates)
            if correct:
                avg_candidate += len(candidates)
                num += 1

            out_line = "%s\t%s" % (gold_entity, ' '.join(candidates))
            fout.write(out_line.encode('utf8') + '\n')
            if not correct:
                print('%s %s %s' % (surfaces, data['gold'].split('\t'),
                                    gold_entity))

    print("%s find correct topic entity" % num)
    # Guard the average: with no hits (or an empty input file) ``num`` is 0
    # and the division would raise ZeroDivisionError.
    average = avg_candidate * 1.0 / num if num else 0.0
    print("average number of candidate entities: %s" % average)