def main():
    parser = ArgumentParser()
    parser.add_argument('input_graph_path',
                        help='path to the input graph json file')
    parser.add_argument('output_graph_path',
                        help='path to write the coref-compressed graph')
    parser.add_argument('output_log_path',
                        help='path to write the log file')
    parser.add_argument('-f', '--force', action='store_true', default=False,
                        help='If specified, overwrite existing output files without warning')

    args = parser.parse_args()

    output_graph_path = util.get_output_path(args.output_graph_path,
                                             overwrite_warning=not args.force)
    output_log_path = util.get_output_path(args.output_log_path,
                                           overwrite_warning=not args.force)

    input_json_graph = JsonGraph.from_dict(
        util.read_json_file(args.input_graph_path, 'JSON graph'))

    num_old_eres = len(list(input_json_graph.each_ere()))
    assert num_old_eres == len(input_json_graph.eres)
    num_old_stmts = len(list(input_json_graph.each_statement()))
    logging.info('Found {} EREs and {} statements in the original graph'.format(
        num_old_eres, num_old_stmts))

    # Build the coref mappings between the original graph and the compressed
    # graph, then construct the compressed EREs and statements from them.
    mappings = build_mappings(input_json_graph)

    output_json_graph = JsonGraph()
    num_new_eres = compress_eres(input_json_graph, mappings, output_json_graph)
    num_new_stmts = compress_statements(input_json_graph, mappings, output_json_graph)
    logging.info('Finished coref-compressed graph with {} EREs and {} statements'.format(
        num_new_eres, num_new_stmts))

    logging.info('Writing compressed json graph to {}'.format(output_graph_path))
    with open(str(output_graph_path), 'w') as fout:
        json.dump(output_json_graph.as_dict(), fout, indent=1)

    # Serialize the mappings as the compression log: mappings whose name
    # contains 'key' are skipped, and for mappings whose name ends with 's'
    # the values are converted to lists so they are JSON-serializable.
    log_json = {}
    for mapping_key, mapping in mappings.items():
        if 'key' in mapping_key:
            continue
        if mapping_key.endswith('s'):
            log_json[mapping_key] = {k: list(v) for k, v in mapping.items()}
        else:
            log_json[mapping_key] = mapping

    logging.info('Writing compression log to {}'.format(output_log_path))
    with open(str(output_log_path), 'w') as fout:
        json.dump(log_json, fout, indent=2)
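# A minimal sketch of what the compression log written above might look like,
# based on the keys that the coref-recovery script below reads back
# ('new_stmt_to_old_stmts' and 'prototype_to_clusters'); the statement,
# prototype, and cluster IDs are hypothetical placeholders.
_example_compression_log = {
    'new_stmt_to_old_stmts': {
        'compressed-stmt-1': ['original-stmt-1', 'original-stmt-2'],
    },
    'prototype_to_clusters': {
        'prototype-ere-1': ['cluster-1', 'cluster-2'],
    },
}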
def main():
    parser = ArgumentParser()
    parser.add_argument('input_path', help='path to the input Excel ontology file')
    parser.add_argument('output_path', help='path to write the JSON ontology file')

    args = parser.parse_args()

    input_path = util.get_input_path(args.input_path)

    # Read all sheets of the Excel ontology file into a dict of DataFrames.
    df = pandas.read_excel(str(input_path), sheet_name=None)

    event_records = df['events'].to_dict('records')
    relation_records = df['relations'].to_dict('records')

    roles_ontology = defaultdict(dict)

    # Events can have up to five labeled arguments; arguments without a label
    # come through as non-string (NaN) cells and are skipped.
    for ev in event_records:
        ev_type = get_type_str(ev)
        for arg_idx in range(1, 6):
            arg_key = f'arg{arg_idx} label'
            if isinstance(ev[arg_key], str):
                roles_ontology[ev_type][f'arg{arg_idx}'] = ev[arg_key]

    # Relations are assumed to have exactly two labeled arguments.
    for rel in relation_records:
        rel_type = get_type_str(rel)
        roles_ontology[rel_type]['arg1'] = rel['arg1 label']
        roles_ontology[rel_type]['arg2'] = rel['arg2 label']

    output_path = util.get_output_path(args.output_path)
    with open(str(output_path), 'w') as fout:
        json.dump(roles_ontology, fout, indent=2)
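# A minimal sketch of the roles ontology JSON written above; the type strings
# and role labels are hypothetical placeholders, not values taken from the
# actual Excel sheets.
_example_roles_ontology = {
    'EventTypeA.SubtypeB': {
        'arg1': 'Agent',
        'arg2': 'Patient',
        'arg3': 'Instrument',
    },
    'RelationTypeC.SubtypeD': {
        'arg1': 'EntityOne',
        'arg2': 'EntityTwo',
    },
}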
def main():
    parser = ArgumentParser()
    parser.add_argument(
        'hypotheses_path',
        help='path to the input json file for hypotheses, or a directory with '
             'a list of hypotheses files')
    parser.add_argument(
        'output_dir',
        help='directory to write the coref-recovered hypotheses')
    parser.add_argument('original_graph_path',
                        help='path to the original graph json file')
    parser.add_argument('compressed_graph_path',
                        help='path to the compressed graph json file')
    parser.add_argument('input_log_path',
                        help='path to log file from coref compression')
    parser.add_argument('-f', '--force', action='store_true', default=False,
                        help='If specified, overwrite existing output files without warning')

    args = parser.parse_args()

    hypotheses_file_paths = util.get_file_list(args.hypotheses_path,
                                               suffix='.json', sort=True)

    output_dir = util.get_output_dir(args.output_dir,
                                     overwrite_warning=not args.force)

    original_graph_json = util.read_json_file(args.original_graph_path,
                                              'original JSON graph')
    compressed_graph_json = util.read_json_file(args.compressed_graph_path,
                                                'compressed JSON graph')
    input_log_json = util.read_json_file(args.input_log_path, 'coref log')

    for hypotheses_file_path in hypotheses_file_paths:
        input_hypotheses_json = util.read_json_file(hypotheses_file_path,
                                                    'hypotheses')

        # probs do not change
        output_hypotheses_json = {
            'probs': input_hypotheses_json['probs'],
            'support': []
        }

        for compressed_hypothesis in input_hypotheses_json['support']:
            original_hypothesis = {'statements': [], 'statementWeights': []}

            # The mapping from each original statement (before coref-compression)
            # to its weight
            original_stmt_weight_mapping = {}

            # Set of cluster membership nodes to include in the original hypothesis
            cluster_membership_set = set()

            for compressed_stmt, stmt_weight in zip(
                    compressed_hypothesis['statements'],
                    compressed_hypothesis['statementWeights']):
                # Get the statement entry from the compressed graph
                compressed_stmt_entry = compressed_graph_json['theGraph'][compressed_stmt]

                # Get the cluster(s) from the subject of the compressed statement
                stmt_subj_clusters = \
                    input_log_json['prototype_to_clusters'][compressed_stmt_entry['subject']]

                # Whether this is a type statement
                is_type_stmt = (compressed_stmt_entry['predicate'] == 'type')

                # Get the cluster(s) from the object of the compressed statement
                # if it is an edge statement
                if is_type_stmt:
                    stmt_obj_clusters = None
                else:
                    stmt_obj_clusters = \
                        input_log_json['prototype_to_clusters'][compressed_stmt_entry['object']]

                for original_stmt in input_log_json['new_stmt_to_old_stmts'][compressed_stmt]:
                    # Resolve the statements and weights before coref-compression
                    if original_stmt not in original_stmt_weight_mapping:
                        original_stmt_weight_mapping[original_stmt] = stmt_weight
                    elif original_stmt_weight_mapping[original_stmt] < stmt_weight:
                        original_stmt_weight_mapping[original_stmt] = stmt_weight

                    # Get the statement entry from the original graph
                    original_stmt_entry = original_graph_json['theGraph'][original_stmt]

                    # Add cluster membership between the original subject and
                    # each subject cluster
                    stmt_subj = original_stmt_entry['subject']
                    for stmt_subj_cluster in stmt_subj_clusters:
                        cluster_membership_set.add((stmt_subj, stmt_subj_cluster))

                    if is_type_stmt:
                        assert original_stmt_entry['predicate'] == 'type'
                    else:
                        assert original_stmt_entry['predicate'] != 'type'
                        # Add cluster membership between the original object and
                        # each object cluster
                        stmt_obj = original_stmt_entry['object']
                        for stmt_obj_cluster in stmt_obj_clusters:
                            cluster_membership_set.add((stmt_obj, stmt_obj_cluster))

            for original_stmt, stmt_weight in original_stmt_weight_mapping.items():
                original_hypothesis['statements'].append(original_stmt)
                original_hypothesis['statementWeights'].append(stmt_weight)

            original_hypothesis['clusterMemberships'] = list(cluster_membership_set)

            original_hypothesis['failedQueries'] = compressed_hypothesis['failedQueries']

            original_query_stmts = set()
            for compressed_query_stmt in compressed_hypothesis['queryStatements']:
                original_query_stmts.update(
                    input_log_json['new_stmt_to_old_stmts'][compressed_query_stmt])
            original_hypothesis['queryStatements'] = list(original_query_stmts)

            output_hypotheses_json['support'].append(original_hypothesis)

        if 'graph' in input_hypotheses_json:
            output_hypotheses_json['graph'] = input_hypotheses_json['graph']

        if 'queries' in input_hypotheses_json:
            output_hypotheses_json['queries'] = input_hypotheses_json['queries']

        output_path = util.get_output_path(output_dir / hypotheses_file_path.name,
                                           overwrite_warning=not args.force)
        print('Writing coref-recovered hypotheses to {}'.format(output_path))
        with open(str(output_path), 'w') as fout:
            json.dump(output_hypotheses_json, fout, indent=2)
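# A minimal sketch of a coref-recovered hypotheses file as assembled above,
# put together from the keys this function reads and writes; all IDs and
# numeric values are hypothetical placeholders.
_example_recovered_hypotheses = {
    'probs': [0.95],
    'support': [{
        'statements': ['original-stmt-1', 'original-stmt-2'],
        'statementWeights': [100.0, 80.0],
        'clusterMemberships': [('ere-1', 'cluster-1')],  # tuples, serialized as JSON arrays
        'failedQueries': [],
        'queryStatements': ['original-stmt-1'],
    }],
    'graph': 'example-graph-id',  # copied through only if present in the input
    'queries': [],                # copied through only if present in the input
}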
def process_soin(graph: AidaGraph,
                 soin_file_paths: List[Path],
                 output_dir: Path,
                 ep_cap: int = 50,
                 consider_roles: bool = False,
                 dup_kb_id_mapping: Dict = None):
    logging.info('Getting Cluster Mappings ...')
    cluster_to_prototype, entity_to_cluster, entities_to_roles = get_cluster_mappings(graph)

    for soin_file_path in soin_file_paths:
        logging.info('Processing SOIN {} ...'.format(soin_file_path))

        logging.info('Parsing SOIN XML ...')
        soin = SOIN.process_xml(str(soin_file_path),
                                dup_kbid_mapping=dup_kb_id_mapping)
        # logging.info('Done.')

        logging.info('Gathering role information ...')
        ep_variables = set()
        for ep in soin.entrypoints:
            ep_variables.add(ep.variable[0])  # [0] is for senseless tuple wrapper

        var_roles = {}
        for frame in soin.frames:
            for edge in frame.edge_list:
                if edge.obj in ep_variables:
                    if edge.obj in var_roles:
                        var_roles[edge.obj].add(edge.predicate)
                    else:
                        var_roles[edge.obj] = {edge.predicate}
        # logging.info('Done.')

        logging.info('Resolving all entrypoints ...')
        ep_dict, ep_weights_dict = resolve_all_entrypoints(
            graph, soin.entrypoints, cluster_to_prototype, entity_to_cluster,
            entities_to_roles, var_roles, ep_cap, consider_roles)
        # logging.info('Done.')

        write_me = {
            'graph': '',
            'soin_id': soin.id,
            'frame_id': [frame.id for frame in soin.frames],
            'entrypoints': ep_dict,
            'entrypointWeights': ep_weights_dict,
            'queries': [],
            'facets': [],
        }

        logging.info('Serializing data structures ...')
        temporal_info = soin.temporal_info_to_dict()
        for frame in soin.frames:
            frame_rep = frame.frame_to_dict(temporal_info)
            write_me['facets'].append(frame_rep)

        query_output = util.get_output_path(
            output_dir / (soin_file_path.stem + '_query.json'))

        logging.info('Writing JSON query to {} ...'.format(query_output))
        with open(str(query_output), 'w') as fout:
            json.dump(write_me, fout, indent=1)
        logging.info('Done.')
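# A hypothetical usage sketch for process_soin, assuming an AidaGraph has
# already been built from a TA2 KB; the file paths and entry-point cap below
# are placeholders.
#
#   graph = AidaGraph()
#   graph.build_graph('ta2_kb.ttl', fmt='ttl')
#   process_soin(graph, [Path('soin_01.xml')], Path('queries/'), ep_cap=50)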
def main():
    parser = ArgumentParser(
        description='Read in a TA2 KB and a (list of) XML-based Statement of '
                    'Information Need definition, convert the KB to JSON format, '
                    'then convert each SoIN to a JSON query by identifying and '
                    'ranking entry points.')
    parser.add_argument('kb_path', help='Path to the input TA2 KB')
    parser.add_argument('graph_output_path',
                        help='Path to write the JSON graph')
    parser.add_argument(
        '-s', '--soin_path',
        help='Path to the input SoIN file, or a directory containing multiple '
             'SoIN files; if not provided, will only transform the graph')
    parser.add_argument(
        '-q', '--query_output_dir',
        help='Directory to write the JSON queries, used when soin_path is provided')
    parser.add_argument(
        '-m', '--max_matches', type=int, default=50,
        help='The maximum number of EPs *per entry point description*')
    parser.add_argument(
        '-d', '--dup_kb', default=duplicate_kb_file,
        help='Path to the json file with duplicate KB ID mappings')
    parser.add_argument(
        '-f', '--force', action='store_true', default=False,
        help='If specified, overwrite existing output files without warning')

    args = parser.parse_args()

    kb_path = util.get_input_path(args.kb_path)
    graph_output_path = util.get_output_path(args.graph_output_path,
                                             overwrite_warning=not args.force)

    aida_graph = AidaGraph()
    aida_graph.build_graph(str(kb_path), fmt='ttl')

    json_graph = JsonGraph()
    json_graph.build_graph(aida_graph)

    logging.info('Writing JSON graph to {} ...'.format(graph_output_path))
    with open(str(graph_output_path), 'w') as fout:
        json.dump(json_graph.as_dict(), fout, indent=1)
    logging.info('Done.')

    if args.soin_path is not None:
        assert args.query_output_dir is not None, 'Must provide query_output_dir'

        soin_path = util.get_input_path(args.soin_path)
        query_output_dir = util.get_output_dir(args.query_output_dir,
                                               overwrite_warning=not args.force)

        soin_file_paths = util.get_file_list(soin_path, suffix='.xml', sort=True)

        dup_kb_id_mapping = None
        if args.dup_kb is not None:
            dup_kb_id_mapping = util.read_json_file(args.dup_kb,
                                                    'duplicate KB ID mapping')

        logging.info('Getting Cluster Mappings ...')
        ere_to_prototypes = get_cluster_mappings(aida_graph)

        for soin_file_path in soin_file_paths:
            query_output_path = query_output_dir / (soin_file_path.stem + '_query.json')

            logging.info('Processing SOIN {} ...'.format(soin_file_path))
            soin = SOIN.parse(str(soin_file_path),
                              dup_kbid_mapping=dup_kb_id_mapping)

            logging.info('Resolving all entrypoints ...')
            soin.resolve(aida_graph, ere_to_prototypes,
                         max_matches=args.max_matches)

            query_json = {'graph': kb_path.stem}
            query_json.update(soin.to_json())

            logging.info('Writing JSON query to {} ...'.format(query_output_path))
            with open(str(query_output_path), 'w') as fout:
                json.dump(query_json, fout, indent=1)
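# A hypothetical command-line invocation of this script (the script name is a
# placeholder); it converts the TA2 KB to a JSON graph and writes one JSON
# query per SoIN file found under soins/:
#
#   python kb_to_json_with_queries.py ta2_kb.ttl graph.json \
#       -s soins/ -q queries/ -m 50 -f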