Example #1
0
def generate_subgraph_datasets(params, splits=['train', 'valid'], saved_relation2id=None, max_label_value=None):
    """Sample positive/negative links for each split and extract their subgraphs.

    Args:
        params: Configuration object. This function reads ``file_paths``,
            ``main_dir``, ``dataset``, ``max_links``,
            ``num_neg_samples_per_link`` and ``constrained_neg_prob`` from it,
            plus ``test_file`` when ``'test'`` is among ``splits``.
        splits: Names of the splits to process. Including ``'test'`` switches
            the function into testing mode.
        saved_relation2id: Forwarded unchanged to ``process_files``.
        max_label_value: Forwarded unchanged to ``links2subgraphs``.

    Returns:
        None. Results are handed to ``links2subgraphs``; in testing mode the
        sampled negative test links are also passed to ``save_to_file``.
    """
    testing = 'test' in splits

    processed = process_files(params.file_paths, saved_relation2id)
    adj_list, triplets, _entity2id, relation2id, id2entity, id2relation = processed

    # Persist the relation vocabulary, but only outside of testing mode.
    # NOTE(review): the guard is a *directory* check on a ``.json`` path, so
    # it presumably never blocks the write -- confirm whether an existence
    # check was intended.
    relation2id_path = os.path.join(params.main_dir, f'data/{params.dataset}/relation2id.json')
    if not (os.path.isdir(relation2id_path) or testing):
        with open(relation2id_path, 'w') as handle:
            json.dump(relation2id, handle)

    # One entry per requested split, holding its raw triplets and link cap.
    graphs = {
        split_name: {'triplets': triplets[split_name], 'max_size': params.max_links}
        for split_name in splits
    }

    # Sample train and valid/test links.
    for split_name, split in graphs.items():
        logging.info(f"Sampling negative links for {split_name}")
        positives, negatives = sample_neg(
            adj_list,
            split['triplets'],
            params.num_neg_samples_per_link,
            max_size=split['max_size'],
            constrained_neg_prob=params.constrained_neg_prob,
        )
        split['pos'] = positives
        split['neg'] = negatives

    if testing:
        # Hand the sampled negative test links to save_to_file.
        out_dir = os.path.join(params.main_dir, 'data/{}/'.format(params.dataset))
        out_name = f'neg_{params.test_file}_{params.constrained_neg_prob}.txt'
        save_to_file(out_dir, out_name, graphs['test']['neg'], id2entity, id2relation)

    links2subgraphs(adj_list, graphs, params, max_label_value)
Example #2
0
def generate_subgraph_datasets(params,
                               splits=['train', 'valid', 'test'],
                               saved_relation2id=None,
                               max_label_value=None):
    """Sample positive/negative links for each split and extract their subgraphs.

    The ``'drugbank'`` dataset is loaded with ``process_files_ddi``; every
    other dataset is loaded with ``process_files_decagon``, which additionally
    yields ``triplets_mr`` / ``polarity_mr``. The per-split graph entries are
    built with the same two-way branching, so any dataset that was loaded
    through the decagon path also gets its graphs populated.

    Args:
        params: Configuration object. This function reads ``file_paths``,
            ``main_dir``, ``dataset``, ``max_links``,
            ``num_neg_samples_per_link`` and ``constrained_neg_prob`` from it,
            plus ``test_file`` when ``'test'`` is among ``splits``.
        splits: Names of the splits to process. Including ``'test'`` switches
            the function into testing mode. The default list is only read,
            never mutated.
        saved_relation2id: Forwarded unchanged to the file-processing helper.
        max_label_value: Forwarded unchanged to ``links2subgraphs``.

    Returns:
        None. Results are handed to ``links2subgraphs``; in testing mode the
        sampled negative test links are also passed to ``save_to_file``.
    """
    testing = 'test' in splits
    is_drugbank = params.dataset == 'drugbank'

    triple_file = 'data/{}/relations_2hop.txt'.format(params.dataset)
    if is_drugbank:
        (adj_list, triplets, _entity2id, relation2id, id2entity, id2relation,
         _rel) = process_files_ddi(params.file_paths, triple_file,
                                   saved_relation2id)
    else:
        (adj_list, _triplets, _entity2id, relation2id, id2entity, id2relation,
         _rel, triplets_mr, polarity_mr) = process_files_decagon(
             params.file_paths, triple_file, saved_relation2id)

    data_path = os.path.join(params.main_dir,
                             f'data/{params.dataset}/relation2id.json')
    # NOTE(review): the guard is a *directory* check on a ``.json`` path, so
    # it presumably never blocks the write, and the mapping is written only
    # when 'test' is in ``splits`` -- confirm both are intended.
    if not os.path.isdir(data_path) and testing:
        with open(data_path, 'w') as f:
            json.dump(relation2id, f)

    graphs = {}
    for split_name in splits:
        if is_drugbank:
            graphs[split_name] = {
                'triplets': triplets[split_name],
                'max_size': params.max_links,
            }
        else:
            # Mirror the loading branch above: everything that is not
            # drugbank was read via process_files_decagon. Previously only
            # 'BioSNAP' was handled here, leaving ``graphs`` empty (and
            # ``graphs['test']`` missing) for any other dataset name.
            graphs[split_name] = {
                'triplets': triplets_mr[split_name],
                'max_size': params.max_links,
                "polarity_mr": polarity_mr[split_name],
            }

    # Sample train and valid/test links.
    for split_name, split in graphs.items():
        logging.info(f"Sampling negative links for {split_name}")
        split['pos'], split['neg'] = sample_neg(
            adj_list,
            split['triplets'],
            params.num_neg_samples_per_link,
            max_size=split['max_size'],
            constrained_neg_prob=params.constrained_neg_prob)

    if testing:
        # Hand the sampled negative test links to save_to_file.
        directory = os.path.join(params.main_dir,
                                 'data/{}/'.format(params.dataset))
        save_to_file(
            directory,
            f'neg_{params.test_file}_{params.constrained_neg_prob}.txt',
            graphs['test']['neg'], id2entity, id2relation)

    links2subgraphs(adj_list, graphs, params, max_label_value)