def main():
    parser = ArgumentParser()
    parser.add_argument('input_graph_path', help='path to the input graph json file')
    parser.add_argument('output_graph_path', help='path to write the coref-compressed graph')
    parser.add_argument('output_log_path', help='path to write the log file')
    parser.add_argument(
        '-f', '--force', action='store_true', default=False,
        help='If specified, overwrite existing output files without warning')
    args = parser.parse_args()

    output_graph_path = util.get_output_path(args.output_graph_path,
                                             overwrite_warning=not args.force)
    output_log_path = util.get_output_path(args.output_log_path,
                                           overwrite_warning=not args.force)

    input_json_graph = JsonGraph.from_dict(
        util.read_json_file(args.input_graph_path, 'JSON graph'))

    num_old_eres = len(list(input_json_graph.each_ere()))
    assert num_old_eres == len(input_json_graph.eres)
    num_old_stmts = len(list(input_json_graph.each_statement()))
    logging.info('Found {} EREs and {} statements in the original graph'.format(
        num_old_eres, num_old_stmts))

    # Build the coref mappings, then use them to compress EREs and statements
    # into the output graph.
    mappings = build_mappings(input_json_graph)

    output_json_graph = JsonGraph()
    num_new_eres = compress_eres(input_json_graph, mappings, output_json_graph)
    num_new_stmts = compress_statements(input_json_graph, mappings, output_json_graph)
    logging.info('Finished coref-compressed graph with {} EREs and {} statements'.format(
        num_new_eres, num_new_stmts))

    logging.info('Writing compressed json graph to {}'.format(output_graph_path))
    with open(str(output_graph_path), 'w') as fout:
        json.dump(output_json_graph.as_dict(), fout, indent=1)

    # Build the compression log: skip mappings whose name contains 'key', and
    # convert the values of plural-named mappings to lists so they are JSON-serializable.
    log_json = {}
    for mapping_key, mapping in mappings.items():
        if 'key' in mapping_key:
            continue
        if mapping_key.endswith('s'):
            log_json[mapping_key] = {k: list(v) for k, v in mapping.items()}
        else:
            log_json[mapping_key] = mapping

    logging.info('Writing compression log to {}'.format(output_log_path))
    with open(str(output_log_path), 'w') as fout:
        json.dump(log_json, fout, indent=2)
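
# A hedged usage sketch of the coref-compression CLI above: it is a plain argparse
# entry point, so an invocation would look roughly like the following. The script
# filename is an assumption for illustration, not taken from the source.
#
#   python compress_coref.py input_graph.json compressed_graph.json compression_log.json
#   python compress_coref.py input_graph.json compressed_graph.json compression_log.json --force
#
# The three positional arguments are input_graph_path, output_graph_path, and
# output_log_path; -f/--force skips the overwrite warning on existing output files.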
def main():
    parser = ArgumentParser(
        description='Read in a TA2 KB and a (list of) XML-based Statement of Information Need '
                    'definition, convert the KB to JSON format, then convert each SoIN to a JSON '
                    'query by identifying and ranking entry points.')
    parser.add_argument('kb_path', help='Path to the input TA2 KB')
    parser.add_argument('graph_output_path', help='Path to write the JSON graph')
    parser.add_argument(
        '-s', '--soin_path',
        help='Path to the input SoIN file, or a directory containing multiple SoIN '
             'files; if not provided, will only transform the graph')
    parser.add_argument(
        '-q', '--query_output_dir',
        help='Directory to write the JSON queries, used when soin_path is provided')
    parser.add_argument(
        '-m', '--max_matches', type=int, default=50,
        help='The maximum number of EPs *per entry point description*')
    parser.add_argument(
        '-d', '--dup_kb', default=duplicate_kb_file,
        help='Path to the json file with duplicate KB ID mappings')
    parser.add_argument(
        '-f', '--force', action='store_true', default=False,
        help='If specified, overwrite existing output files without warning')
    args = parser.parse_args()

    kb_path = util.get_input_path(args.kb_path)
    graph_output_path = util.get_output_path(args.graph_output_path,
                                             overwrite_warning=not args.force)

    # Load the TA2 KB (Turtle format) into an AidaGraph, then convert it to a JsonGraph.
    aida_graph = AidaGraph()
    aida_graph.build_graph(str(kb_path), fmt='ttl')

    json_graph = JsonGraph()
    json_graph.build_graph(aida_graph)

    logging.info('Writing JSON graph to {} ...'.format(graph_output_path))
    with open(str(graph_output_path), 'w') as fout:
        json.dump(json_graph.as_dict(), fout, indent=1)
    logging.info('Done.')

    # If SoIN files are provided, resolve their entry points against the graph and
    # write one JSON query per SoIN file.
    if args.soin_path is not None:
        assert args.query_output_dir is not None, 'Must provide query_output_dir'

        soin_path = util.get_input_path(args.soin_path)
        query_output_dir = util.get_output_dir(args.query_output_dir,
                                               overwrite_warning=not args.force)

        soin_file_paths = util.get_file_list(soin_path, suffix='.xml', sort=True)

        dup_kb_id_mapping = None
        if args.dup_kb is not None:
            dup_kb_id_mapping = util.read_json_file(args.dup_kb, 'duplicate KB ID mapping')

        logging.info('Getting Cluster Mappings ...')
        ere_to_prototypes = get_cluster_mappings(aida_graph)

        for soin_file_path in soin_file_paths:
            query_output_path = query_output_dir / (soin_file_path.stem + '_query.json')

            logging.info('Processing SOIN {} ...'.format(soin_file_path))
            soin = SOIN.parse(str(soin_file_path), dup_kbid_mapping=dup_kb_id_mapping)

            logging.info('Resolving all entrypoints ...')
            soin.resolve(aida_graph, ere_to_prototypes, max_matches=args.max_matches)

            query_json = {'graph': kb_path.stem}
            query_json.update(soin.to_json())

            logging.info('Writing JSON query to {} ...'.format(query_output_path))
            with open(str(query_output_path), 'w') as fout:
                json.dump(query_json, fout, indent=1)
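
# A hedged usage sketch of the KB-to-JSON / SoIN query CLI above (the script filename
# is an assumption, not taken from the source):
#
#   python process_kb_and_soin.py ta2_kb.ttl graph_output.json \
#       -s soin_dir/ -q query_output_dir/ -m 50
#
# Without -s/--soin_path the script only converts the TA2 KB to a JSON graph; with it,
# every .xml SoIN file found under soin_path is resolved against the graph and written
# as <soin_stem>_query.json inside -q/--query_output_dir.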