def gen_query_graph(fn_wq_list, fn_simple_list, fn_out, use_aqqu=False):
    """Generate candidate query patterns for every question and dump them.

    For each record of each JSON file in ``fn_wq_list`` the pipeline builds
    candidate query graphs from the record's ``'utterance'``, scores the
    derived query patterns against the gold answers (the values of the
    record's ``'mids'`` mapping) and writes every pattern to ``fn_out`` as
    one UTF-8 encoded JSON document per line.

    Args:
        fn_wq_list: Iterable of paths to JSON files. Each file must hold a
            sequence of records providing the keys ``'paths'``,
            ``'utterance'``, ``'mids'`` and ``'mid1'``.
        fn_simple_list: Iterable of paths to text files. Their content is
            not parsed; every line merely advances the running question id,
            so ids of the JSON records start after the total line count.
        fn_out: Path of the output file (truncated on open).
        use_aqqu: Forwarded unchanged to the ``Pipeline`` constructor.

    Returns:
        None. Three summary counters are printed to standard output.

    NOTE(review): this function is written for Python 2 -- it passes
    ``encoding`` to ``json.load`` and writes encoded byte strings to a file
    opened in text mode.
    """
    pipeline = Pipeline(use_aqqu)

    # Question ids that received at least one fully "correct" graph,
    # ids for which some graph has the gold topic, and all processed ids.
    complete_qids = set()
    topic_complete_qids = set()
    qids = set()

    positive_path_types = ("forward_pass_cvt", "forward_direct")

    with open(fn_out, 'w') as fout:
        qid = 0

        # Simple questions: only counted, so that the ids assigned to the
        # records below are offset by the number of lines in these files.
        for fn in fn_simple_list:
            with open(fn) as fin:
                for _ in fin:
                    qid += 1

        # WebQuestions-style JSON records.
        for fn in fn_wq_list:
            # Context manager so the input handle is closed deterministically
            # instead of being left to the garbage collector.
            with open(fn) as fin:
                webq = json.load(fin, encoding="utf8")

            for data in webq:
                qid += 1

                # The relation is the second-to-last whitespace-separated
                # token of the path string, kept only for the two "forward"
                # path types.
                positive_relations = set(
                    path[0].split()[-2]
                    for path in data['paths']
                    if path[1] in positive_path_types
                )
                # Records without any positive relation are deliberately
                # still processed and counted.
                qids.add(qid)

                question, query_graphs = pipeline.gen_candidate_query_graph(
                    data['utterance'], debug=False)
                for graph in query_graphs:
                    graph['qid'] = qid

                gold_answers = set(data['mids'].values())
                query_patterns = pipeline.extract_query_pattern_and_f1(
                    query_graphs, gold_answers)
                query_patterns = pipeline.add_answer_feature(
                    question, query_patterns)

                # Label the graphs; used only for the statistics printed at
                # the end (the graphs themselves are not written out here).
                for graph in query_graphs:
                    topic_linked = graph['topic'] == data['mid1']
                    if (topic_linked
                            and graph['answer'] in gold_answers
                            and graph['relation'] in positive_relations):
                        graph['label'] = 1
                    else:
                        graph['label'] = 0
                    if topic_linked:
                        topic_complete_qids.add(qid)
                    graph['question'] = question
                    if graph['label'] == 1:
                        complete_qids.add(qid)

                # Write the query patterns, one JSON document per line.
                for pattern in query_patterns:
                    fout.write(
                        json.dumps(pattern, ensure_ascii=False).encode('utf8')
                        + "\n")

    print("total valid question %d" % len(qids))
    print("complete question %d" % len(complete_qids))
    print("%s questions can be correctly linked" % len(topic_complete_qids))