Example #1
def main():
    parser = ArgumentParser()
    parser.add_argument('input_path',
                        help='path to the input Excel ontology file')
    parser.add_argument('output_path',
                        help='path to write the JSON ontology file')

    args = parser.parse_args()

    input_path = util.get_input_path(args.input_path)

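    # sheet_name=None loads every worksheet into a dict keyed by sheet name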
    df = pandas.read_excel(str(input_path), sheet_name=None)
    event_records = df['events'].to_dict('records')
    relation_records = df['relations'].to_dict('records')

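    # map each event/relation type string to its argument-role labels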
    roles_ontology = defaultdict(dict)

    for ev in event_records:
        ev_type = get_type_str(ev)

        for arg_idx in range(1, 6):
            arg_key = f'arg{arg_idx} label'
            if isinstance(ev[arg_key], str):
                roles_ontology[ev_type][f'arg{arg_idx}'] = ev[arg_key]

    for rel in relation_records:
        rel_type = get_type_str(rel)

        roles_ontology[rel_type]['arg1'] = rel['arg1 label']
        roles_ontology[rel_type]['arg2'] = rel['arg2 label']

    output_path = util.get_output_path(args.output_path)
    with open(str(output_path), 'w') as fout:
        json.dump(roles_ontology, fout, indent=2)
Example #2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("working_dir", help='path to the working directory.')
    parser.add_argument(
        '--subgraph_dir',
        default='subgraph',
        help='name of the subdirectory in working_dir containing subgraphs')
    parser.add_argument(
        '--seed_dir',
        default='cluster_seeds',
        help='name of the subdirectory in working_dir containing cluster seeds'
    )
    parser.add_argument(
        '--indexed_data_dir',
        default='data_indexed',
        help='name of the subdirectory in working_dir to write indexed data')
    parser.add_argument("--indexer_path",
                        default='resources/indexer.p',
                        help='path to the indexer file.')
    parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        default=False,
        help='If specified, overwrite existing output files without warning')

    args = parser.parse_args()

    working_dir = util.get_input_path(args.working_dir)

    subgraph_dir = util.get_input_path(working_dir / args.subgraph_dir)
    seed_dir = util.get_input_path(working_dir / args.seed_dir)
    output_dir = util.get_output_dir(working_dir / args.indexed_data_dir,
                                     overwrite_warning=not args.force)

    indexer_path = str(util.get_input_path(args.indexer_path))

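    # NOTE: updating the dict returned by locals() has no effect on local
    # variables inside a function, so the next line is effectively a no-op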
    locals().update(vars(args))

    seed_subgraph_map = map_subgraph_to_seed(subgraph_dir, seed_dir)

    print('\nIndexing files ...')

    index_and_partition(seed_subgraph_map, output_dir, indexer_path)

    print(f'\nIndexing finished: indexed data in directory {output_dir}')
Example #3
File: stats.py  Project: pxch/aida-utexas
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('input_path', help='Path to a TA2 KB or a JSON graph')

    args = parser.parse_args()

    input_path = util.get_input_path(args.input_path)

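    # a .ttl input is parsed as an RDF (Turtle) TA2 KB;
    # a .json input is treated as a pre-converted JSON graph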
    if input_path.suffix == '.ttl':
        aida_graph = AidaGraph()
        aida_graph.build_graph(str(input_path), fmt='ttl')

        get_kb_stats(aida_graph)

    elif input_path.suffix == '.json':
        with open(input_path, 'r') as fin:
            json_graph = JsonGraph.from_dict(json.load(fin))

        get_json_stats(json_graph)
Example #4
def main():
    parser = ArgumentParser()
    parser.add_argument('graph_path', help='path to the graph json file')
    parser.add_argument('hypotheses_path', help='path to the hypotheses json file')
    parser.add_argument('db_dir', help='directory with copies of tdb databases')
    parser.add_argument('output_dir', help='path to output directory')
    parser.add_argument('--top', default=50, type=int,
                        help='number of top hypotheses to output')
    parser.add_argument('--dry_run', action='store_true',
                        help='if specified, only write the SPARQL queries to '
                             'files, without actually executing the queries')
    parser.add_argument('--query_just', action='store_true')
    parser.add_argument('--query_conf', action='store_true')
    parser.add_argument('-f', '--force', action='store_true', default=False,
                        help='If specified, overwrite existing output files without warning')

    args = parser.parse_args()

    json_graph = JsonGraph.from_dict(util.read_json_file(args.graph_path, 'JSON graph'))

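    # mappings between cluster nodes, their member nodes, and prototype members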
    mappings = json_graph.build_cluster_member_mappings()
    member_to_clusters = mappings['member_to_clusters']
    cluster_to_prototype = mappings['cluster_to_prototype']
    prototype_set = set(mappings['prototype_to_clusters'].keys())

    hypotheses_json = util.read_json_file(args.hypotheses_path, 'hypotheses')

    output_dir = util.get_output_dir(args.output_dir, overwrite_warning=not args.force)

    db_dir = util.get_input_path(args.db_dir)
    db_path_list = [str(path) for path in sorted(db_dir.glob('copy*'))]
    print('Using the following tdb databases to query: {}'.format(db_path_list))

    num_node_queries = len(db_path_list)

    top_count = 0
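    # iterate over hypotheses in descending order of probability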
    for result_idx, prob in sorted(
            enumerate(hypotheses_json['probs']), key=itemgetter(1), reverse=True):
        hypothesis = hypotheses_json['support'][result_idx]
        # node_query_list, stmt_query_list, just_query_list, conf_query_list = \
        sparql_query_str = \
            queries_for_aida_result(
                json_graph=json_graph,
                hypothesis=hypothesis,
                member_to_clusters=member_to_clusters,
                cluster_to_prototype=cluster_to_prototype,
                prototype_set=prototype_set,
                num_node_queries=num_node_queries,
                query_just=args.query_just,
                query_conf=args.query_conf)

        top_count += 1

        print(f'Writing queries for hypothesis #{top_count} with prob {prob}')

        sparql_query_path = output_dir / 'hypothesis-{:0>3d}-query.rq'.format(top_count)
        with open(str(sparql_query_path), 'w') as fout:
            fout.write(sparql_query_str + '\n')

        if not args.dry_run:
            query_result_path = output_dir / 'hypothesis-{:0>3d}-raw.ttl'.format(top_count)
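            # run the query with Jena's tdbquery CLI against the first TDB copy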
            query_cmd = 'echo "query {0}"; tdbquery --loc {1} --query {0} > {2}; '.format(
                sparql_query_path, db_path_list[0], query_result_path)

            print('Executing queries ...')
            process = subprocess.Popen(query_cmd, shell=True)
            process.wait()

        # sparql_helper.execute_sparql_queries(
        #     node_query_list, stmt_query_list, just_query_list, conf_query_list,
        #     db_path_list, output_dir,
        #     filename_prefix='hypothesis-{:0>3d}'.format(top_count),
        #     header_prefixes=AIF_HEADER_PREFIXES, dry_run=args.dry_run)

        if top_count >= args.top:
            break
Example #5
def main():
    parser = argparse.ArgumentParser()

    # For qualitative evaluation only
    parser.add_argument('working_dir', help='path to the working directory')
    parser.add_argument('--seed_dir', default='cluster_seeds',
                        help='name of the subdirectory in working_dir containing cluster seeds')
    parser.add_argument('--indexed_data_dir', default='data_indexed',
                        help='name of the subdirectory in working_dir containing indexed data')
    parser.add_argument('--output_dir', default='result_jsons',
                        help='name of the subdirectory in working_dir to write output hypotheses')

    parser.add_argument("--indexer_path", default="resources/indexers.p",
                        help='path to the indexer file')
    parser.add_argument("--model_path", default="resources/gcn2-cuda_best_15000_0.ckpt",
                        help='path to the pre-trained model checkpoint')

    parser.add_argument("--device", type=int, default=-1)

    parser.add_argument("--attention_type", type=str, default='concat')
    parser.add_argument("--num_layers", type=int, default=2)
    parser.add_argument("--hidden_size", type=int, default=300)
    parser.add_argument("--attention_size", type=int, default=300)
    parser.add_argument("--conv_dropout", type=float, default=.5)
    parser.add_argument("--attention_dropout", type=float, default=.3)

    parser.add_argument('-f', '--force', action='store_true', default=False,
                        help='If specified, overwrite existing output files without warning')

    # parser.add_argument("--use_highest_ranked_gold", action='store_true')
    # parser.add_argument("--force", action='store_true')
    # parser.add_argument("--init_prob_force", type=float, default=.95)
    # parser.add_argument("--force_decay", type=float, default=.9684)
    # parser.add_argument("--force_every", type=int, default=1000)
    # parser.add_argument("--batch_size", type=int, default=4)
    # parser.add_argument("--self_attention_type", type=str, default='concat')
    # parser.add_argument("--num_epochs", type=int, default=2)
    # parser.add_argument("--learning_rate", type=float, default=1e-5)
    # parser.add_argument("--save_path", type=str, default="Saved_Models")
    # parser.add_argument("--save_tag", type=str, default="")
    # parser.add_argument("--eval_tag", type=str, default="")
    # parser.add_argument("--valid_every", type=int, default=5000)
    # parser.add_argument("--print_every", type=int, default=100)

    args = parser.parse_args()

    if args.device == -1:
        device = torch.device('cpu')
    else:
        device = torch.device(f'cuda:{args.device}')

    print(f'\nConfig:{args}')

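    # fix random seeds for reproducibility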
    torch.manual_seed(0)
    np.random.seed(0)

    indexer_path = str(util.get_input_path(args.indexer_path))
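    # the pickled indexer bundle holds ERE/statement indexers, embedding
    # matrices, and word2vec vocabulary sizes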
    with open(indexer_path, 'rb') as fin:
        ere_indexer, stmt_indexer, ere_emb_mat, stmt_emb_mat, num_word2vec_ere, num_word2vec_stmt = dill.load(fin)
    indexer_info_dict = dict()
    indexer_info_dict['ere_indexer'] = ere_indexer
    indexer_info_dict['stmt_indexer'] = stmt_indexer
    indexer_info_dict['ere_emb_mat'] = ere_emb_mat
    indexer_info_dict['stmt_emb_mat'] = stmt_emb_mat
    indexer_info_dict['num_word2vec_ere'] = num_word2vec_ere
    indexer_info_dict['num_word2vec_stmt'] = num_word2vec_stmt

    model = CoherenceNetWithGCN(
        False,
        indexer_info_dict,
        args.attention_type,
        None,
        args.num_layers,
        args.hidden_size,
        args.attention_size,
        args.conv_dropout,
        args.attention_dropout)

    model_path = str(util.get_input_path(args.model_path))
    model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu'))['model'])

    model.to(device)
    model.eval()

    working_dir = util.get_input_path(args.working_dir)
    seed_dir = util.get_input_path(working_dir / args.seed_dir)
    indexed_data_dir = util.get_input_path(working_dir / args.indexed_data_dir)
    output_dir = util.get_output_dir(working_dir / args.output_dir,
                                     overwrite_warning=not args.force)

    print('\nExpanding cluster seeds ...')

    evaluate(seed_dir, indexed_data_dir, output_dir, model, device)

    print(f'\nExpanding finished: raw hypotheses in directory {output_dir}')
Example #6
def main():
    parser = ArgumentParser(
        description=
        'Read in a TA2 KB and a (list of) XML-based Statement of Information Need '
        'definition, convert the KB to JSON format, then convert each SoIN to a JSON '
        'query by identifying and ranking entry points.')
    parser.add_argument('kb_path', help='Path to the input TA2 KB')
    parser.add_argument('graph_output_path',
                        help='Path to write the JSON graph')
    parser.add_argument(
        '-s',
        '--soin_path',
        help=
        'Path to the input SoIN file, or a directory containing multiple SoIN '
        'files; if not provided, will only transform the graph')
    parser.add_argument(
        '-q',
        '--query_output_dir',
        help=
        'Directory to write the JSON queries, used when soin_path is provided')
    parser.add_argument(
        '-m',
        '--max_matches',
        type=int,
        default=50,
        help='The maximum number of EPs *per entry point description*')
    parser.add_argument(
        '-d',
        '--dup_kb',
        default=duplicate_kb_file,
        help='Path to the json file with duplicate KB ID mappings')
    parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        default=False,
        help='If specified, overwrite existing output files without warning')

    args = parser.parse_args()

    kb_path = util.get_input_path(args.kb_path)
    graph_output_path = util.get_output_path(args.graph_output_path,
                                             overwrite_warning=not args.force)

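    # parse the TA2 KB (Turtle) into an AidaGraph, then convert it to the
    # project's JSON graph representation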
    aida_graph = AidaGraph()
    aida_graph.build_graph(str(kb_path), fmt='ttl')

    json_graph = JsonGraph()
    json_graph.build_graph(aida_graph)

    logging.info('Writing JSON graph to {} ...'.format(graph_output_path))
    with open(str(graph_output_path), 'w') as fout:
        json.dump(json_graph.as_dict(), fout, indent=1)
    logging.info('Done.')

    if args.soin_path is not None:
        assert args.query_output_dir is not None, 'Must provide query_output_dir'
        soin_path = util.get_input_path(args.soin_path)
        query_output_dir = util.get_output_dir(
            args.query_output_dir, overwrite_warning=not args.force)

        soin_file_paths = util.get_file_list(soin_path,
                                             suffix='.xml',
                                             sort=True)

        dup_kb_id_mapping = None
        if args.dup_kb is not None:
            dup_kb_id_mapping = util.read_json_file(args.dup_kb,
                                                    'duplicate KB ID mapping')

        logging.info('Getting Cluster Mappings ...')
        ere_to_prototypes = get_cluster_mappings(aida_graph)

        for soin_file_path in soin_file_paths:
            query_output_path = query_output_dir / (soin_file_path.stem +
                                                    '_query.json')

            logging.info('Processing SOIN {} ...'.format(soin_file_path))
            soin = SOIN.parse(str(soin_file_path),
                              dup_kbid_mapping=dup_kb_id_mapping)

            logging.info('Resolving all entrypoints ...')
            soin.resolve(aida_graph,
                         ere_to_prototypes,
                         max_matches=args.max_matches)

            query_json = {'graph': kb_path.stem}
            query_json.update(soin.to_json())

            logging.info(
                'Writing JSON query to {} ...'.format(query_output_path))
            with open(str(query_output_path), 'w') as fout:
                json.dump(query_json, fout, indent=1)