Example #1
def main():
    parser = ArgumentParser()
    parser.add_argument('input_graph_path',
                        help='path to the input graph json file')
    parser.add_argument('output_graph_path',
                        help='path to write the coref-compressed graph')
    parser.add_argument('output_log_path', help='path to write the log file')
    parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        default=False,
        help='If specified, overwrite existing output files without warning')

    args = parser.parse_args()

    output_graph_path = util.get_output_path(args.output_graph_path,
                                             overwrite_warning=not args.force)
    output_log_path = util.get_output_path(args.output_log_path,
                                           overwrite_warning=not args.force)

    input_json_graph = JsonGraph.from_dict(
        util.read_json_file(args.input_graph_path, 'JSON graph'))

    num_old_eres = len(list(input_json_graph.each_ere()))
    assert num_old_eres == len(input_json_graph.eres)
    num_old_stmts = len(list(input_json_graph.each_statement()))
    logging.info(
        'Found {} EREs and {} statements in the original graph'.format(
            num_old_eres, num_old_stmts))

    mappings = build_mappings(input_json_graph)

    output_json_graph = JsonGraph()

    num_new_eres = compress_eres(input_json_graph, mappings, output_json_graph)
    num_new_stmts = compress_statements(input_json_graph, mappings,
                                        output_json_graph)

    logging.info(
        'Finished coref-compressed graph with {} EREs and {} statements'.
        format(num_new_eres, num_new_stmts))

    logging.info(
        'Writing compressed json graph to {}'.format(output_graph_path))
    with open(str(output_graph_path), 'w') as fout:
        json.dump(output_json_graph.as_dict(), fout, indent=1)

    log_json = {}
    for mapping_key, mapping in mappings.items():
        if 'key' in mapping_key:
            continue
        if mapping_key.endswith('s'):
            log_json[mapping_key] = {k: list(v) for k, v in mapping.items()}
        else:
            log_json[mapping_key] = mapping

    logging.info('Writing compression log to {}'.format(output_log_path))
    with open(str(output_log_path), 'w') as fout:
        json.dump(log_json, fout, indent=2)
Example #2
def main():
    parser = ArgumentParser()
    parser.add_argument('input_path',
                        help='path to the input Excel ontology file')
    parser.add_argument('output_path',
                        help='path to write the JSON ontology file')

    args = parser.parse_args()

    input_path = util.get_input_path(args.input_path)

    df = pandas.read_excel(str(input_path), sheet_name=None)
    event_records = df['events'].to_dict('records')
    relation_records = df['relations'].to_dict('records')

    roles_ontology = defaultdict(dict)

    for ev in event_records:
        ev_type = get_type_str(ev)

        for arg_idx in range(1, 6):
            arg_key = f'arg{arg_idx} label'
            if isinstance(ev[arg_key], str):
                roles_ontology[ev_type][f'arg{arg_idx}'] = ev[arg_key]

    for rel in relation_records:
        rel_type = get_type_str(rel)

        roles_ontology[rel_type]['arg1'] = rel['arg1 label']
        roles_ontology[rel_type]['arg2'] = rel['arg2 label']

    output_path = util.get_output_path(args.output_path)
    with open(str(output_path), 'w') as fout:
        json.dump(roles_ontology, fout, indent=2)
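For reference, the loops above build a nested mapping from each event or relation type string to its labeled argument slots, and that mapping is what gets serialized to the output JSON. A hypothetical entry (the type and label names below are purely illustrative, not taken from an actual ontology file) might look like this:

roles_ontology = {
    'Conflict.Attack': {
        'arg1': 'Attacker',
        'arg2': 'Target',
        'arg3': 'Instrument',
        'arg4': 'Place',
    },
    'GeneralAffiliation.Sponsorship': {
        'arg1': 'Sponsor',
        'arg2': 'Entity',
    },
}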
Example #3
def main():
    parser = ArgumentParser()
    parser.add_argument(
        'hypotheses_path',
        help='path to the input json file for hypotheses, or a directory with '
        'a list of hypotheses files')
    parser.add_argument(
        'output_dir', help='directory to write the coref-recovered hypotheses')
    parser.add_argument('original_graph_path',
                        help='path to the original graph json file')
    parser.add_argument('compressed_graph_path',
                        help='path to the compressed graph json file')
    parser.add_argument('input_log_path',
                        help='path to log file from coref compression')
    parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        default=False,
        help='If specified, overwrite existing output files without warning')

    args = parser.parse_args()

    hypotheses_file_paths = util.get_file_list(args.hypotheses_path,
                                               suffix='.json',
                                               sort=True)

    output_dir = util.get_output_dir(args.output_dir,
                                     overwrite_warning=not args.force)

    original_graph_json = util.read_json_file(args.original_graph_path,
                                              'original JSON graph')
    compressed_graph_json = util.read_json_file(args.compressed_graph_path,
                                                'compressed JSON graph')
    input_log_json = util.read_json_file(args.input_log_path, 'coref log')

    for hypotheses_file_path in hypotheses_file_paths:
        input_hypotheses_json = util.read_json_file(hypotheses_file_path,
                                                    'hypotheses')

        # probs do not change
        output_hypotheses_json = {
            'probs': input_hypotheses_json['probs'],
            'support': []
        }

        for compressed_hypothesis in input_hypotheses_json["support"]:
            original_hypothesis = {'statements': [], 'statementWeights': []}

            # The mapping from each original statement (before coref-compression) to its weight
            original_stmt_weight_mapping = {}

            # Set of cluster membership nodes to include in the original hypothesis
            cluster_membership_set = set()

            for compressed_stmt, stmt_weight in zip(
                    compressed_hypothesis['statements'],
                    compressed_hypothesis['statementWeights']):
                # Get the statement entry from the compressed graph
                compressed_stmt_entry = compressed_graph_json['theGraph'][
                    compressed_stmt]
                # Get the cluster(s) from the subject of the compressed statement
                stmt_subj_clusters = \
                    input_log_json['prototype_to_clusters'][compressed_stmt_entry['subject']]
                # Whether this is a type statement
                is_type_stmt = (compressed_stmt_entry['predicate'] == 'type')
                # Get the cluster(s) from the object of the compressed statement if it is an edge
                # statement
                if is_type_stmt:
                    stmt_obj_clusters = None
                else:
                    stmt_obj_clusters = \
                        input_log_json['prototype_to_clusters'][compressed_stmt_entry['object']]

                for original_stmt in input_log_json['new_stmt_to_old_stmts'][
                        compressed_stmt]:
                    # Resolve the statements and weights before coref-compression
                    if original_stmt not in original_stmt_weight_mapping:
                        original_stmt_weight_mapping[
                            original_stmt] = stmt_weight
                    elif original_stmt_weight_mapping[
                            original_stmt] < stmt_weight:
                        original_stmt_weight_mapping[
                            original_stmt] = stmt_weight

                    # Get the statement entry from the original graph
                    original_stmt_entry = original_graph_json['theGraph'][
                        original_stmt]

                    # Add cluster membership between the original subject and each subject cluster
                    stmt_subj = original_stmt_entry['subject']
                    for stmt_subj_cluster in stmt_subj_clusters:
                        cluster_membership_set.add(
                            (stmt_subj, stmt_subj_cluster))

                    if is_type_stmt:
                        assert original_stmt_entry['predicate'] == 'type'
                    else:
                        assert original_stmt_entry['predicate'] != 'type'

                        # Add cluster membership between the original object and each object cluster
                        stmt_obj = original_stmt_entry['object']
                        for stmt_obj_cluster in stmt_obj_clusters:
                            cluster_membership_set.add(
                                (stmt_obj, stmt_obj_cluster))

            for original_stmt, stmt_weight in original_stmt_weight_mapping.items():
                original_hypothesis['statements'].append(original_stmt)
                original_hypothesis['statementWeights'].append(stmt_weight)

            original_hypothesis['clusterMemberships'] = list(
                cluster_membership_set)

            original_hypothesis['failedQueries'] = compressed_hypothesis[
                'failedQueries']

            original_query_stmts = set()
            for compressed_query_stmt in compressed_hypothesis[
                    'queryStatements']:
                original_query_stmts.update(
                    input_log_json['new_stmt_to_old_stmts']
                    [compressed_query_stmt])
            original_hypothesis['queryStatements'] = list(original_query_stmts)

            output_hypotheses_json['support'].append(original_hypothesis)

        if 'graph' in input_hypotheses_json:
            output_hypotheses_json['graph'] = input_hypotheses_json['graph']
        if 'queries' in input_hypotheses_json:
            output_hypotheses_json['queries'] = input_hypotheses_json[
                'queries']

        output_path = util.get_output_path(output_dir /
                                           hypotheses_file_path.name,
                                           overwrite_warning=not args.force)
        print('Writing coref-recovered hypotheses to {}'.format(output_path))
        with open(str(output_path), 'w') as fout:
            json.dump(output_hypotheses_json, fout, indent=2)
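The key subtlety in the loop above is weight resolution: several compressed statements can map back to the same original statement, and in that case the original statement keeps the maximum weight it is seen with. A minimal, self-contained sketch of that rule (the statement IDs and weights are made up):

# Each pair is (original statement ID, weight contributed by one compressed statement).
observed = [('stmt-1', 0.6), ('stmt-2', 0.4), ('stmt-1', 0.9)]

original_stmt_weight_mapping = {}
for original_stmt, stmt_weight in observed:
    # Keep the maximum weight seen for each original statement.
    if (original_stmt not in original_stmt_weight_mapping
            or original_stmt_weight_mapping[original_stmt] < stmt_weight):
        original_stmt_weight_mapping[original_stmt] = stmt_weight

print(original_stmt_weight_mapping)  # {'stmt-1': 0.9, 'stmt-2': 0.4}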
Example #4
def process_soin(graph: AidaGraph,
                 soin_file_paths: List[Path],
                 output_dir: Path,
                 ep_cap: int = 50,
                 consider_roles: bool = False,
                 dup_kb_id_mapping: Dict = None):
    logging.info("Getting Cluster Mappings ...")
    cluster_to_prototype, entity_to_cluster, entities_to_roles = get_cluster_mappings(
        graph)

    for soin_file_path in soin_file_paths:
        logging.info('Processing SOIN {} ...'.format(soin_file_path))
        logging.info('Parsing SOIN XML ...')
        soin = SOIN.process_xml(str(soin_file_path),
                                dup_kbid_mapping=dup_kb_id_mapping)
        # logging.info('Done.')

        logging.info('Gathering role information ...')
        ep_variables = set()
        for ep in soin.entrypoints:
            ep_variables.add(
                ep.variable[0])  # [0] unwraps a single-element tuple wrapper

        var_roles = {}
        for frame in soin.frames:
            for edge in frame.edge_list:
                if edge.obj in ep_variables:
                    if edge.obj in var_roles:
                        var_roles[edge.obj].add(edge.predicate)
                    else:
                        var_roles[edge.obj] = {edge.predicate}
        # logging.info('Done.')

        logging.info('Resolving all entrypoints ...')
        ep_dict, ep_weights_dict = resolve_all_entrypoints(
            graph, soin.entrypoints, cluster_to_prototype, entity_to_cluster,
            entities_to_roles, var_roles, ep_cap, consider_roles)

        # logging.info('Done.')

        write_me = {
            'graph': '',
            'soin_id': soin.id,
            'frame_id': [frame.id for frame in soin.frames],
            'entrypoints': ep_dict,
            'entrypointWeights': ep_weights_dict,
            'queries': [],
            'facets': [],
        }

        logging.info('Serializing data structures ...')
        temporal_info = soin.temporal_info_to_dict()
        for frame in soin.frames:
            frame_rep = frame.frame_to_dict(temporal_info)
            write_me['facets'].append(frame_rep)

        query_output = util.get_output_path(
            output_dir / (soin_file_path.stem + '_query.json'))
        logging.info('Writing JSON query to {} ...'.format(query_output))
        with open(str(query_output), 'w') as fout:
            json.dump(write_me, fout, indent=1)
        logging.info('Done.')
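As a side note, the var_roles block in this example follows a common "add to a set, creating it on first use" pattern. A minimal, self-contained sketch of the same pattern using dict.setdefault (the edge data below is illustrative, not real SoIN content):

# (object variable, predicate/role) pairs as they might come off frame edges.
edges = [('?attacker', 'Attacker'), ('?target', 'Target'), ('?attacker', 'Place')]
ep_variables = {'?attacker', '?target'}

var_roles = {}
for obj, predicate in edges:
    if obj in ep_variables:
        # setdefault creates the set on first use, then adds to it.
        var_roles.setdefault(obj, set()).add(predicate)

print(var_roles)  # e.g. {'?attacker': {'Attacker', 'Place'}, '?target': {'Target'}}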
Example #5
def main():
    parser = ArgumentParser(
        description=
        'Read in a TA2 KB and a (list of) XML-based Statement of Information Need '
        'definition, convert the KB to JSON format, then convert each SoIN to a JSON '
        'query by identifying and ranking entry points.')
    parser.add_argument('kb_path', help='Path to the input TA2 KB')
    parser.add_argument('graph_output_path',
                        help='Path to write the JSON graph')
    parser.add_argument(
        '-s',
        '--soin_path',
        help=
        'Path to the input SoIN file, or a directory containing multiple SoIN '
        'files; if not provided, will only transform the graph')
    parser.add_argument(
        '-q',
        '--query_output_dir',
        help=
        'Directory to write the JSON queries, used when soin_path is provided')
    parser.add_argument(
        '-m',
        '--max_matches',
        type=int,
        default=50,
        help='The maximum number of EPs *per entry point description*')
    parser.add_argument(
        '-d',
        '--dup_kb',
        default=duplicate_kb_file,
        help='Path to the json file with duplicate KB ID mappings')
    parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        default=False,
        help='If specified, overwrite existing output files without warning')

    args = parser.parse_args()

    kb_path = util.get_input_path(args.kb_path)
    graph_output_path = util.get_output_path(args.graph_output_path,
                                             overwrite_warning=not args.force)

    aida_graph = AidaGraph()
    aida_graph.build_graph(str(kb_path), fmt='ttl')

    json_graph = JsonGraph()
    json_graph.build_graph(aida_graph)

    logging.info('Writing JSON graph to {} ...'.format(graph_output_path))
    with open(str(graph_output_path), 'w') as fout:
        json.dump(json_graph.as_dict(), fout, indent=1)
    logging.info('Done.')

    if args.soin_path is not None:
        assert args.query_output_dir is not None, 'Must provide query_output_dir'
        soin_path = util.get_input_path(args.soin_path)
        query_output_dir = util.get_output_dir(
            args.query_output_dir, overwrite_warning=not args.force)

        soin_file_paths = util.get_file_list(soin_path,
                                             suffix='.xml',
                                             sort=True)

        dup_kb_id_mapping = None
        if args.dup_kb is not None:
            dup_kb_id_mapping = util.read_json_file(args.dup_kb,
                                                    'duplicate KB ID mapping')

        logging.info('Getting Cluster Mappings ...')
        ere_to_prototypes = get_cluster_mappings(aida_graph)

        for soin_file_path in soin_file_paths:
            query_output_path = query_output_dir / (soin_file_path.stem +
                                                    '_query.json')

            logging.info('Processing SOIN {} ...'.format(soin_file_path))
            soin = SOIN.parse(str(soin_file_path),
                              dup_kbid_mapping=dup_kb_id_mapping)

            logging.info('Resolving all entrypoints ...')
            soin.resolve(aida_graph,
                         ere_to_prototypes,
                         max_matches=args.max_matches)

            query_json = {'graph': kb_path.stem}
            query_json.update(soin.to_json())

            logging.info(
                'Writing JSON query to {} ...'.format(query_output_path))
            with open(str(query_output_path), 'w') as fout:
                json.dump(query_json, fout, indent=1)