Example 1
def main():
    """CLI entry point: compress a JSON graph by coreference and write results.

    Reads an input JSON graph, merges coreferent EREs/statements via the
    mappings produced by ``build_mappings``, then writes the compressed graph
    and a JSON log of the applied mappings.
    """
    parser = ArgumentParser()
    parser.add_argument('input_graph_path',
                        help='path to the input graph json file')
    parser.add_argument('output_graph_path',
                        help='path to write the coref-compressed graph')
    parser.add_argument('output_log_path', help='path to write the log file')
    parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        default=False,
        help='If specified, overwrite existing output files without warning')

    args = parser.parse_args()

    # Resolve output paths up front, warning on overwrite unless --force.
    warn = not args.force
    output_graph_path = util.get_output_path(args.output_graph_path,
                                             overwrite_warning=warn)
    output_log_path = util.get_output_path(args.output_log_path,
                                           overwrite_warning=warn)

    input_json_graph = JsonGraph.from_dict(
        util.read_json_file(args.input_graph_path, 'JSON graph'))

    num_old_eres = len(list(input_json_graph.each_ere()))
    # Sanity check: the ERE iterator and the ERE table must agree.
    assert num_old_eres == len(input_json_graph.eres)
    num_old_stmts = len(list(input_json_graph.each_statement()))
    logging.info(
        'Found {} EREs and {} statements in the original graph'.format(
            num_old_eres, num_old_stmts))

    mappings = build_mappings(input_json_graph)

    output_json_graph = JsonGraph()
    num_new_eres = compress_eres(input_json_graph, mappings, output_json_graph)
    num_new_stmts = compress_statements(input_json_graph, mappings,
                                        output_json_graph)

    logging.info(
        'Finished coref-compressed graph with {} EREs and {} statements'.
        format(num_new_eres, num_new_stmts))

    logging.info(
        'Writing compressed json graph to {}'.format(output_graph_path))
    with open(str(output_graph_path), 'w') as fout:
        json.dump(output_json_graph.as_dict(), fout, indent=1)

    # Build the log: drop any '*key*' mappings; for plural ('...s') mappings
    # the values are sets, which must become lists to be JSON-serializable.
    log_json = {
        name: ({member: list(group) for member, group in table.items()}
               if name.endswith('s') else table)
        for name, table in mappings.items()
        if 'key' not in name
    }

    logging.info('Writing compression log to {}'.format(output_log_path))
    with open(str(output_log_path), 'w') as fout:
        json.dump(log_json, fout, indent=2)
Example 2
def main():
    """CLI entry point: convert a TA2 KB to a JSON graph and build queries.

    Loads a TTL-format TA2 KB, converts it to a JSON graph, and writes it to
    ``graph_output_path``. If ``--soin_path`` is given, each XML Statement of
    Information Need is parsed, its entry points are resolved against the
    graph (up to ``--max_matches`` candidates per entry point description),
    and a JSON query is written per SoIN file into ``--query_output_dir``.

    Raises:
        ValueError: if ``--soin_path`` is given without ``--query_output_dir``.
    """
    parser = ArgumentParser(
        description=
        'Read in a TA2 KB and a (list of) XML-based Statement of Information Need '
        'definition, convert the KB to JSON format, then convert each SoIN to a JSON '
        'query by identifying and ranking entry points.')
    parser.add_argument('kb_path', help='Path to the input TA2 KB')
    parser.add_argument('graph_output_path',
                        help='Path to write the JSON graph')
    parser.add_argument(
        '-s',
        '--soin_path',
        help=
        'Path to the input SoIN file, or a directory containing multiple SoIN '
        'files; if not provided, will only transform the graph')
    parser.add_argument(
        '-q',
        '--query_output_dir',
        help=
        'Directory to write the JSON queries, used when soin_path is provided')
    parser.add_argument(
        '-m',
        '--max_matches',
        type=int,
        default=50,
        help='The maximum number of EPs *per entry point description*')
    parser.add_argument(
        '-d',
        '--dup_kb',
        default=duplicate_kb_file,
        help='Path to the json file with duplicate KB ID mappings')
    parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        default=False,
        help='If specified, overwrite existing output files without warning')

    args = parser.parse_args()

    kb_path = util.get_input_path(args.kb_path)
    graph_output_path = util.get_output_path(args.graph_output_path,
                                             overwrite_warning=not args.force)

    aida_graph = AidaGraph()
    aida_graph.build_graph(str(kb_path), fmt='ttl')

    json_graph = JsonGraph()
    json_graph.build_graph(aida_graph)

    logging.info('Writing JSON graph to %s ...', graph_output_path)
    with open(str(graph_output_path), 'w') as fout:
        json.dump(json_graph.as_dict(), fout, indent=1)
    logging.info('Done.')

    if args.soin_path is not None:
        # Explicit raise instead of `assert`: asserts are stripped under
        # `python -O`, which would let a None output dir slip through.
        if args.query_output_dir is None:
            raise ValueError('Must provide query_output_dir')
        soin_path = util.get_input_path(args.soin_path)
        query_output_dir = util.get_output_dir(
            args.query_output_dir, overwrite_warning=not args.force)

        soin_file_paths = util.get_file_list(soin_path,
                                             suffix='.xml',
                                             sort=True)

        dup_kb_id_mapping = None
        if args.dup_kb is not None:
            dup_kb_id_mapping = util.read_json_file(args.dup_kb,
                                                    'duplicate KB ID mapping')

        logging.info('Getting Cluster Mappings ...')
        ere_to_prototypes = get_cluster_mappings(aida_graph)

        for soin_file_path in soin_file_paths:
            query_output_path = query_output_dir / (soin_file_path.stem +
                                                    '_query.json')

            logging.info('Processing SOIN %s ...', soin_file_path)
            soin = SOIN.parse(str(soin_file_path),
                              dup_kbid_mapping=dup_kb_id_mapping)

            logging.info('Resolving all entrypoints ...')
            soin.resolve(aida_graph,
                         ere_to_prototypes,
                         max_matches=args.max_matches)

            # Record which KB the query was derived from alongside the
            # entry-point data produced by the SoIN resolution.
            query_json = {'graph': kb_path.stem}
            query_json.update(soin.to_json())

            logging.info('Writing JSON query to %s ...', query_output_path)
            with open(str(query_output_path), 'w') as fout:
                json.dump(query_json, fout, indent=1)