Exemple #1
0
def gen_query_graph(fn_wq_list, fn_simple_list, fn_out, use_aqqu=False):
    """Generate query patterns for WebQuestions entries and dump them as JSON lines.

    A ``Pipeline(use_aqqu)`` is built once. Every file in ``fn_simple_list``
    is only line-counted to advance the running question id; nothing else is
    done with its content. Every file in ``fn_wq_list`` is loaded as a JSON
    list of entries; for each entry the pipeline produces candidate query
    graphs and query patterns, and each query pattern is written to
    ``fn_out`` as one UTF-8 encoded JSON document per line.

    Each entry is read through the keys ``'paths'``, ``'utterance'``,
    ``'mids'`` and ``'mid1'``; each candidate query graph through ``'topic'``,
    ``'answer'`` and ``'relation'``.

    Args:
        fn_wq_list: iterable of paths to WebQuestions JSON files.
        fn_simple_list: iterable of paths to files whose lines are counted
            to offset the question id.
        fn_out: path of the output file (truncated and rewritten).
        use_aqqu: forwarded unchanged to the ``Pipeline`` constructor.

    Returns:
        None. Three summary counters are printed to stdout at the end.
    """
    pipeline = Pipeline(use_aqqu)
    complete_qids = set()
    topic_complete_qids = set()
    qids = set()
    with open(fn_out, 'w') as fout:
        qid = 0
        # Simple questions: only consume one id per line.
        # NOTE(review): presumably this keeps WebQuestions ids disjoint from
        # ids assigned to these files elsewhere -- confirm.
        for fn in fn_simple_list:
            with open(fn) as fin:
                for _ in fin:
                    qid += 1

        # WebQuestions
        for fn in fn_wq_list:
            # Close the input file deterministically instead of leaking the
            # handle returned by a bare open().
            with open(fn) as fin:
                webq = json.load(fin, encoding="utf8")
            for data in webq:
                qid += 1

                # Relations taken from the second-to-last token of the path
                # string, restricted to the two forward path kinds.
                positive_relations = set()
                for path in data['paths']:
                    if path[1] in ("forward_pass_cvt", "forward_direct"):
                        positive_relations.add(path[0].split()[-2])
                qids.add(qid)

                question, query_graphs = pipeline.gen_candidate_query_graph(
                    data['utterance'], debug=False)
                # The qid must be attached before the graphs are handed to
                # the pattern extraction below.
                for graph in query_graphs:
                    graph['qid'] = qid
                gold_answers = set(data['mids'].values())
                query_patterns = pipeline.extract_query_pattern_and_f1(
                    query_graphs, gold_answers)
                query_patterns = pipeline.add_answer_feature(
                    question, query_patterns)

                # Labelling below only feeds the statistics printed at the
                # end; it happens after the patterns have been computed.
                for graph in query_graphs:
                    topic_matches = graph['topic'] == data['mid1']
                    is_complete = (topic_matches
                                   and graph['answer'] in gold_answers
                                   and graph['relation'] in positive_relations)
                    graph['label'] = 1 if is_complete else 0
                    if topic_matches:
                        topic_complete_qids.add(qid)
                    graph['question'] = question
                    if is_complete:
                        complete_qids.add(qid)

                # Write query patterns, one JSON document per line.
                for pattern in query_patterns:
                    encoded = json.dumps(
                        pattern, ensure_ascii=False).encode('utf8')
                    fout.write(encoded + '\n')

    # Single-argument print(...) emits the same text as the print statement.
    print("total valid question %d" % len(qids))
    print("complete question %d" % len(complete_qids))
    print("%s questions can be correctly linked" % len(topic_complete_qids))