Example #1
def dataset_to_query(model, dataset_name, dataset_mode):
    queries = None
    try:
        dataset = Dataset(dataset_name)

        query_ids = dataset.dataset_to_queries(
            dataset_mode)  # dataset_mode is one of 'train', 'test', 'valid'

        query_side = []

        # Collect (lhs entity, relation) pairs, one per query
        for lhs, rel in zip(query_ids[:, 0], query_ids[:, 1]):
            query_side.append((lhs.item(), rel.item()))

        # Right-hand-side entities to filter out when ranking each query
        check = []

        for i, j in query_side:
            check.append(dataset.to_skip['rhs'][i, j])

        queries = model.get_queries_separated(query_ids)

        if 'train' not in dataset_mode.lower():
            results = dataset.eval(model, dataset_mode, -1)
            print("\n\n{} : {}".format(dataset_mode, results))

    except RuntimeError as e:
        print(
            "Cannot convert the dataset to a query list with error: {}".format(
                str(e)))
        return None, None

    return queries, check
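A minimal usage sketch for the helper above; the model instance and the dataset/split names below are placeholders rather than values taken from the example:

# Hypothetical call: 'trained_model', 'FB237' and 'test' are placeholders.
queries, check = dataset_to_query(trained_model, 'FB237', 'test')
if queries is not None:
    print('collected {} filtered right-hand-side lists'.format(len(check)))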
Example #2
parser.add_argument('--init', default=1e-3, type=float, help="Initial scale")
parser.add_argument('--learning_rate',
                    default=1e-1,
                    type=float,
                    help="Learning rate")
parser.add_argument('--decay1',
                    default=0.9,
                    type=float,
                    help="decay rate for the first moment estimate in Adam")
parser.add_argument('--decay2',
                    default=0.999,
                    type=float,
                    help="decay rate for second moment estimate in Adam")
args = parser.parse_args()

dataset = Dataset(args.dataset)
examples = torch.from_numpy(dataset.get_train().astype('int64'))

print(dataset.get_shape())
model = {
    'CP': lambda: CP(dataset.get_shape(), args.rank, args.init),
    'ComplEx': lambda: ComplEx(dataset.get_shape(), args.rank, args.init),
}[args.model]()

regularizer = {
    'F2': F2(args.reg),
    'N3': N3(args.reg),
}[args.regularizer]

device = 'cuda'
model.to(device)
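The snippet stops before the optimizer is built; a rough sketch of how these pieces are usually wired together with KBCOptimizer (imported as in Example #11), where the Adagrad choice, batch size, epoch count and evaluation interval are illustrative assumptions:

from torch import optim
from kbc.optimizers import KBCOptimizer

# Illustrative wiring; the real script takes these values from its command-line arguments.
optimizer = KBCOptimizer(model, regularizer,
                         optim.Adagrad(model.parameters(), lr=args.learning_rate),
                         batch_size=1000)

for epoch in range(50):
    optimizer.epoch(examples)
    if (epoch + 1) % 10 == 0:
        print('valid:', dataset.eval(model, 'valid', -1))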
Example #3
                                                                                             axis=1).to_list()
    # Only keywords
    dataset_kw = dataset.copy()
    dataset_kw['text'] = dataset[['keywords']].apply(lambda x: ''.join(x), axis=1).to_list()

    input_x = dataset[['claim', 'text']]
    input_x_all = dataset_text_all[['claim', 'text']]
    input_x_kw = dataset_kw[['claim', 'text']]

    input_y = dataset[class_list].copy().values

    data_path = args[2]
    model_path = args[3]

    # CKGE Graph embeddings
    ckge_dataset = Dataset(os.path.join(data_path, "CKGE"), use_cpu=True)
    ckge_model = CP(ckge_dataset.get_shape(), 50)
    ckge_model.load_state_dict(
        torch.load(os.path.join(model_path, "CKGE.pickle"),
                   map_location=torch.device('cpu')))

    ckge_graph_vectorizer = GraphEmbeddingTransformer(ckge_dataset, ckge_model)

    # Distil RoBERTa (DR)
    flair_vectorizer_DR = FlairTransformer([
        TransformerWordEmbeddings(model="distilroberta-base",
                                  use_scalar_mix=True)
    ], batch_size=1)

    # GPT2
    flair_vectorizer_GPT2 = FlairTransformer([
Example #4
from pathlib import Path
import pkg_resources
import pickle
from kbc.datasets import Dataset
import matplotlib.pyplot as plt

#%%%
# FB237 dataset degree distribution

mydata = Dataset('FB237')

train, slice_dic = mydata.get_sorted_train()

plt.close('all')

plt.figure()
plt.hist(slice_dic[:, 3], bins=300, range=(0, 300))
plt.xlabel('Number of adjacent neighbors (Degrees) \n FB237 Dataset ')
plt.ylabel('Frequency')
plt.show()

#%%%
# WN18RR dataset degree distribution

mydata = Dataset('WN18RR')

train, slice_dic = mydata.get_sorted_train()

plt.figure()
plt.hist(slice_dic[:, 3], bins=30, range=(0, 30))
plt.xlabel('Number of adjacent neighbors (Degrees) \n WN18RR Dataset')
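To complement the histograms, the same degree column can be summarised numerically; this uses column 3 of slice_dic exactly as the plots above do:

import numpy as np

# Column 3 of slice_dic holds the number of adjacent neighbours (degree), as in the histograms above.
degrees = np.asarray(slice_dic)[:, 3]
print('mean degree: {:.1f}, median: {:.0f}, max: {:.0f}'.format(
    degrees.mean(), np.median(degrees), degrees.max()))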
Example #5
File: learn.py Project: jhb115/kbc
parser.add_argument(
    '--n_freeze', default=0, type=int,
    help='Number of training epochs for which the original embeddings are kept frozen'
)

parser.add_argument(
    '--evaluation_mode', default=0, type=int, choices=[0, 1],
    help='Whether or not to get an attention mask'
)

# Setup parser
args = parser.parse_args()

# Get Dataset
dataset = Dataset(args.dataset)
if args.model in ['CP', 'ComplEx']:
    unsorted_examples = torch.from_numpy(dataset.get_train().astype('int64'))
    examples = unsorted_examples
else:
    sorted_data, slice_dic = dataset.get_sorted_train()
    examples = torch.from_numpy(dataset.get_train().astype('int64'))

model = {
    'CP': lambda: CP(dataset.get_shape(), args.rank, args.init),
    'ComplEx': lambda: ComplEx(dataset.get_shape(), args.rank, args.init),
    'ContExt': lambda: ContExt(dataset.get_shape(), args.rank, sorted_data, slice_dic,
                               max_NB=args.max_NB, init_size=args.init, data_name=args.dataset,
                               ascending=args.ascending, dropout_1=args.dropout_1,
                               dropout_g=args.dropout_g, evaluation_mode=args.evaluation_mode),
}[args.model]()
Example #6
import sys

import torch
from kbc.datasets import Dataset

from kbc import avg_both
from kbc.models import CP

args = sys.argv[1:]

dataset = Dataset(args[0], use_cpu=True)
model = CP(dataset.get_shape(), 50)
model.load_state_dict(torch.load(args[1], map_location=torch.device('cpu')))

print(avg_both(*dataset.eval(model, "test", 50000, batch_size=100)))
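For context, avg_both averages the metrics that Dataset.eval returns for left-hand-side and right-hand-side prediction; a minimal sketch of an equivalent helper (the dictionary keys follow the kbc codebase and should be treated as assumptions here):

def avg_both_sketch(mrrs, hits):
    """Average MRR and hits@k over lhs and rhs prediction (sketch of what avg_both does)."""
    mrr = (mrrs['lhs'] + mrrs['rhs']) / 2.
    h = (hits['lhs'] + hits['rhs']) / 2.
    return {'MRR': mrr, 'hits@[1,3,10]': h}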
Example #7
                        type=float,
                        help="decay rate for second moment estimate in Adam")

    parser.add_argument('--model_save_schedule',
                        default=50,
                        type=int,
                        help="Saving the model every N iterations")

    parser.add_argument('--eval_only', action='store_true', default=False)
    parser.add_argument('--checkpoint', type=str)

    args = parser.parse_args()

    args.dataset = os.path.basename(args.path)

    dataset = Dataset(os.path.join(args.path, 'kbc_data'))
    args.data_shape = dataset.get_shape()

    if not args.eval_only:
        model = {
            'CP': lambda: CP(dataset.get_shape(), args.rank, args.init),
            'ComplEx':
            lambda: ComplEx(dataset.get_shape(), args.rank, args.init),
            'DistMult':
            lambda: DistMult(dataset.get_shape(), args.rank, args.init)
        }[args.model]()

        regularizer = {
            'N2': N2(args.reg),
            'N3': N3(args.reg),
        }[args.regularizer]
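The snippet is cut off before the evaluation-only branch; given the --eval_only and --checkpoint arguments above, that path could look roughly like this (the checkpoint being a pickled state_dict is an assumption borrowed from the other examples):

if args.eval_only and args.checkpoint:
    model = {
        'CP': lambda: CP(dataset.get_shape(), args.rank, args.init),
        'ComplEx': lambda: ComplEx(dataset.get_shape(), args.rank, args.init),
        'DistMult': lambda: DistMult(dataset.get_shape(), args.rank, args.init)
    }[args.model]()
    model.load_state_dict(torch.load(args.checkpoint, map_location=torch.device('cpu')))
    print(dataset.eval(model, 'test', -1))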
Example #8

if __name__ == "__main__":

    big_datasets = ['Bio', 'FB15K', 'WN', 'WN18RR', 'FB237', 'YAGO3-10']
    datasets = big_datasets

    parser = argparse.ArgumentParser(description="Chain Dataset Sampling")

    parser.add_argument('--dataset',
                        choices=datasets,
                        help="Dataset in {}".format(datasets))

    parser.add_argument(
        '--threshold',
        default=1e5,
        type=int,
        help="Threshold for maximum amount sampled per chain type")

    parser.add_argument('--save_path',
                        default=os.getcwd(),
                        help="Path to save the chained dataset")

    args = parser.parse_args()

    chained_dataset_sampler = ChaineDataset(Dataset(args.dataset),
                                            args.threshold)
    chained_dataset_sampler.sample_chains()

    save_chain_data(args.save_path, args.dataset, chained_dataset_sampler)
Example #9
    input_x = DataFrame()
    input_x['claim'] = claims
    input_x['text'] = texts
    input_x['keywords'] = keywords
    input_x['author'] = authors

    # Concatenate text, keywords and author into a single text field
    input_x['text'] = input_x[['text', 'keywords',
                               'author']].apply(lambda x: ''.join(x),
                                                axis=1).to_list()
    # input_x['text'] = input_x[['keywords']].apply(lambda x: ''.join(x), axis=1).to_list()

    input_y = ratings

    # Graph embeddings
    dataset = Dataset(args[1], use_cpu=True)
    model = CP(dataset.get_shape(), 50)
    model.load_state_dict(torch.load(args[2],
                                     map_location=torch.device('cpu')))

    graph_vectorizer = ClamsKGGraphEmbeddingTransformer(
        dataset, model, args[0],
        NeighbourhoodVectorConcatStrategy.CONCAT_TRIPLES)
    # graph_vectorizer = GraphEmbeddingTransformer(dataset, model)

    # Baseline RoBERTa/BERT
    flair_vectorizer_baseline_roberta = FlairTransformer([
        TransformerWordEmbeddings(model="distilroberta-base",
                                  use_scalar_mix=True)
    ])
Example #10
# Parser argument for ConvE
# dropout
parser.add_argument('--dropouts',
                    default=(0.3, 0.3, 0.3),
                    nargs=3,
                    type=float,
                    help="Dropout rates for each layer in ConvE")

# Whether to use bias for the ConvE layer
parser.add_argument('--use_bias',
                    default=True,
                    type=lambda s: str(s).lower() in ('true', '1', 'yes'),
                    help="Whether or not to use bias in the ConvE layers")

args = parser.parse_args()

dataset = Dataset(args.dataset)
examples = torch.from_numpy(
    dataset.get_train().astype('int64')).cpu()  # changed for cpu

print(dataset.get_shape())
model = {
    'CP':
    lambda: CP(dataset.get_shape(), args.rank, args.init),
    'ComplEx':
    lambda: ComplEx(dataset.get_shape(), args.rank, args.init),
    'ConvE':
    lambda: ConvE(dataset.get_shape(), args.rank, args.dropouts, args.use_bias)
}[args.model]()

regularizer = {
    'N2': N2(args.reg),
Example #11
import argparse
import os
import re
from typing import Dict

import numpy
import torch
from torch import optim
from torch.nn import DataParallel

from kbc import avg_both
from kbc.datasets import Dataset
from kbc.models import CP, ComplEx
from kbc.optimizers import KBCOptimizer
from kbc.regularizers import F2, N3

datasets = Dataset.get_dataset_shortlist()

parser = argparse.ArgumentParser(description="Relational learning contraption")

parser.add_argument("--save-model",
                    nargs=1,
                    default=[''],
                    dest="save_model",
                    help="Save final model to specified directory")

parser.add_argument(
    "--save-checkpoints",
    nargs=1,
    default=[''],
    dest="save_checkpoints",
    help="Save checkpoints for each epoch of the model to specified directory")
Example #12
                                                str(int(args.train_no)))

# check if the folder exists
if not os.path.exists(folder_path):
    raise Exception('You do not have a folder named: {}'.format(folder_path))

# folder path format 'results/ComplEx/FB15K/train1'
# Get the configuration
with open(os.path.join(folder_path, 'config.p'), 'rb') as f:
    config = pickle.load(f)

if config['save_pre_train'] == 1:
    pre_train_folder = '../pre_train/{}/{}'.format('Context_' + args.model,
                                                   args.dataset)

# Get Dataset
dataset = Dataset(config['dataset'])

dataset = Dataset(args.dataset)
if args.model in ['CP', 'ComplEx', 'ConvE']:  # For non-context model
    unsorted_examples = torch.from_numpy(dataset.get_train().astype('int64'))
    examples = unsorted_examples
else:  # Get sorted examples for context model
    sorted_data, slice_dic = dataset.get_sorted_train()
    examples = torch.from_numpy(dataset.get_train().astype('int64'))

rank, init = [int(config['rank']), float(config['init'])]

print(dataset.get_shape())

model = {
    'CP':