        if args.max_num_item is not None:
            args.max_num_item = int(args.max_num_item)

    else:
        args.max_num_person = None
        args.max_num_item = None

    device = torch.device("cuda" if args.cuda else "cpu")
    if args.cuda: torch.cuda.set_device(args.gpu_device)

    train_dataset = load_dataset(
        args.dataset,
        train=True,
        num_person=args.num_person,
        num_item=args.num_item,
        ability_dim=args.ability_dim,
        max_num_person=args.max_num_person,
        max_num_item=args.max_num_item,
    )

    if args.artificial_missing_perc > 0:
        train_dataset = artificially_mask_dataset(
            train_dataset,
            args.artificial_missing_perc,
        )

    num_person = train_dataset.num_person
    num_item = train_dataset.num_item

    train_loader = torch.utils.data.DataLoader(
Example #2
    if not os.path.isdir(args.out_dir):
        os.makedirs(args.out_dir)

    device = torch.device("cuda" if args.cuda else "cpu")
    if args.cuda: torch.cuda.set_device(args.gpu_device)

    if args.response_dist == 'bernoulli':
        dataset_name = args.dataset
    else:
        dataset_name = f'{args.dataset}_continuous'

    train_dataset = load_dataset(
        dataset_name,
        train=True,
        num_person=args.num_person,
        num_item=args.num_item,
        ability_dim=args.ability_dim,
        max_num_person=args.max_num_person,
        max_num_item=args.max_num_item,
    )
    test_dataset = load_dataset(
        dataset_name,
        train=False,
        num_person=args.num_person,
        num_item=args.num_item,
        ability_dim=args.ability_dim,
        max_num_person=args.max_num_person,
        max_num_item=args.max_num_item,
    )

    if args.artificial_missing_perc > 0:
Example #3
def load(data_dir, config, splits):
    """
    Load specific dataset.

    Args:
        data_dir (str): path to the dataset directory.
        config (dict): general dict with settings.
        splits (list): list of strings 'train'|'val'|'test'.

    Returns (dict): dictionary with keys 'train'|'val'|'test' and values
    as DataLoader objects.

    """
    dataset_path = '/tf/data/{}'.format(config['data.dataset'])

    if not os.path.exists(dataset_path):
        os.makedirs(dataset_path)

    data = load_dataset(config, datagen_flow=True, with_datasets=True)

    ret = {}

    for split in splits:
        # n_way (number of classes per episode)
        if split in ['val', 'test']:
            n_way = config['data.test_way']
        else:
            n_way = config['data.train_way']

        # n_support (number of support samples per class)
        if split in ['val', 'test']:
            n_support = config['data.test_support']
        else:
            n_support = config['data.train_support']

        # n_query (number of query samples per class)
        if split in ['val', 'test']:
            n_query = config['data.test_query']
        else:
            n_query = config['data.train_query']

        batch_size = config['data.batch_size']
        split_size = data[f"{split}_size"]

        x, y = data[f"{split}_gen"].next()

        batches = 1
        for images, labels in data[f"{split}_gen"]:
            x = np.concatenate([x, images])
            y = np.concatenate([y, labels])
            batches += 1
            if batches >= split_size / batch_size:
                # we need to break the loop by hand because
                # the generator loops indefinitely
                break

        # sort the samples by label
        i = np.argsort(y)
        y = y[i]
        x = x[i, :, :, :]

        # group the samples into one list per class
        split_data = [[] for _ in range(data["nb_classes"])]
        for index in i:
            split_data[y[index]].append(x[index])

        data_loader = DataLoader(np.array(
            [np.array(images) for images in split_data]),
                                 n_classes=data["nb_classes"],
                                 n_way=n_way,
                                 n_support=n_support,
                                 n_query=n_query,
                                 x_dim=data["image_shape"])

        ret[split] = data_loader

    return ret
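A minimal usage sketch for load (not from the original source): the config keys mirror the lookups inside the function above, while the dataset name, way/support/query counts, and batch size are placeholder values.

config = {
    'data.dataset': 'omniglot',   # placeholder name; any dataset known to load_dataset
    'data.train_way': 20, 'data.test_way': 5,
    'data.train_support': 5, 'data.test_support': 5,
    'data.train_query': 5, 'data.test_query': 15,
    'data.batch_size': 32,
}
loaders = load(data_dir='/tf/data', config=config, splits=['train', 'val'])
episode_loader = loaders['train']  # episodic sampler built from the per-class grouped data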
Example #4
import torch
import jsonlines

from tqdm import tqdm
from argparse import ArgumentParser
from pathlib import Path
from src.datasets import load_dataset

if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--checkpoint', type=Path)
    parser.add_argument('--dataset', type=str, default='squad')

    args = parser.parse_args()

    dataset = load_dataset(args.dataset, train=True)
    saved_model = torch.load(args.checkpoint / 'checkpoint.pth.tar')

    weights = saved_model['model_state_dict']

    models, items = [], []

    for i, model in tqdm(enumerate(dataset.ix_to_model)):
        models.append({
            'submission_id':
            model,
            'ability_mu':
            weights['ability_mu_lookup.weight'][i].tolist(),
            'ability_logvar':
            weights['ability_logvar_lookup.weight'][i].tolist()
        })
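    # Assumed continuation -- the original example is cut off above. The
    # jsonlines import and the unused `items` list suggest the script ends by
    # dumping the collected records; the output file name here is a guess,
    # not taken from the original source.
    with jsonlines.open(str(args.checkpoint / 'abilities.jsonl'), mode='w') as writer:
        writer.write_all(models)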
Example #5
def run_experiments(
        args: argparse.Namespace,
        neptuneai_project_id: str = 'clfmsc2020/experiments') -> None:
    """ Runs experiments """
    def _debug(text: str) -> None:
        """ Prints statemets only if args.debug or arg.verbose
            flag is set """
        if args.debug or args.verbose:
            print(f'[INFO] {text}')

    if args.useneptune:
        _debug('Neptune.AI enabled.')
        neptune.init(neptuneai_project_id)

    _debug(f'Config file path: {args.config}')
    data_yaml_params, knn_yaml_params, exp_yaml_params = \
        helpers.load_config_file(args.config)

    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    _debug(f'Device: {device}')

    # -- LEVEL 0: CACHE DATASET
    for data_params in utils.iterparams(data_yaml_params):

        x, y = datasets.load_dataset(data_params.DATASET,
                                     transform,
                                     use_umap=data_params.USE_UMAP)
        (train_x, train_y), (val_x,
                             val_y), (test_x, test_y) = utils.split_data(
                                 x,
                                 y,
                                 val_size=data_params.VAL_SIZE,
                                 test_size=data_params.TEST_SIZE)
        _debug(f'Dataset: {data_params.DATASET}\n')

        # TODO: output_shape = ...

        # -- LEVEL 1: CACHE KNN
        for knn_params in utils.iterparams(knn_yaml_params):

            knn = None
            if knn_params.K is not None and knn_params.K > 0:
                train_val_x = np.concatenate((train_x, val_x), axis=0)
                train_val_y = np.concatenate((train_y, val_y), axis=0)
                knn = FaissKNN(train_val_x,
                               train_val_y,
                               precompute=True,
                               k=knn_params.K)
                _debug(f'kNN wrapper initialized (k = {knn_params.K}).\n')

            # -- LEVEL 2: RUN EXPERIMENTS
            for exp_params in utils.iterparams(exp_yaml_params):

                # EXCEPTIONS
                if exp_params.FUNCTION_NAME == 'bce' and \
                   exp_params.OUTPUT_ACTIVATIONS == 'tanh':
                    _debug('An exception has occurred (BCE + TanH)')
                    continue

                # Criterion
                criterion = None
                hinge_target_range = False

                criterion_name = exp_params.FUNCTION_NAME
                criterion_type = helpers.LossFuncType.from_string(
                    exp_params.FUNCTION_TYPE)
                n_layers = len(exp_params.LAYERS)
                _debug(f'Criterion: {criterion_name} (type: {criterion_type})')

                if criterion_type == ftype.BASIC:
                    hinge_target_range, loss_function = helpers.get_loss_function(
                        criterion_name, criterion_type)
                    criterion = loss_function()

                elif criterion_type == ftype.ENTR_R:
                    assert knn is not None, 'kNN wrapper is not initialized!'
                    hinge_target_range, base_loss = helpers.get_loss_function(
                        criterion_name, ftype.BASIC)
                    criterion = lossfunc.EntropyRegularizedBinaryLoss(
                        base_loss(), knn)

                elif criterion_type == ftype.ENTR_W:
                    assert knn is not None, 'kNN wrapper is not initialized!'
                    hinge_target_range, base_loss = helpers.get_loss_function(
                        criterion_name, ftype.BASIC)
                    criterion = lossfunc.EntropyWeightedBinaryLoss(
                        base_loss(), knn)

                elif criterion_type == ftype.CLF:
                    assert knn is not None, 'kNN wrapper is not initialized!'
                    hinge_target_range, loss_function = helpers.get_loss_function(
                        criterion_name, ftype.CLF)
                    criterion = loss_function(
                        knn, 0.5)  # FIXME: Fixed params (alpha, beta)

                assert criterion is not None, 'Criterion was not initialized!'

                # Change target range
                target_train_y = np.copy(train_y)
                target_val_y = np.copy(val_y)
                target_test_y = np.copy(test_y)

                if hinge_target_range:
                    _debug('Negative class: 0 -> -1')
                    target_train_y[train_y == 0] = -1
                    target_val_y[val_y == 0] = -1
                    target_test_y[test_y == 0] = -1

                # Convert the subsets into DataLoaders
                train_dataloader = datasets.convert_to_dataloader(
                    train_x, target_train_y, batch_size=data_params.BATCH_SIZE)

                valid_dataloader = datasets.convert_to_dataloader(
                    val_x,
                    target_val_y,
                    batch_size=data_params.BATCH_SIZE,
                    startidx=train_x.shape[0])

                test_data_x, test_data_y = Tensor(test_x), Tensor(test_y)

                # Prepare the experiment
                all_params = {**data_params, **knn_params, **exp_params}
                _debug(f'Params: \n {all_params}')

                unified_dataset_name = datasets.simplify_dataset_name(
                    data_params.DATASET)
                experiment_name = f'{unified_dataset_name}_{exp_params.FUNCTION_NAME}_{exp_params.FUNCTION_TYPE}'
                _debug(f'Experiment name: {experiment_name}')

                # Set up the neptune.ai experiment
                experiment = None
                if args.useneptune:
                    tags = [
                        exp_params.FUNCTION_NAME, unified_dataset_name,
                        data_params.PROBLEM, exp_params.FUNCTION_TYPE,
                        exp_params.OUTPUT_ACTIVATIONS
                    ]

                    all_params['N_LAYERS'] = n_layers
                    experiment = neptune.create_experiment(
                        name=experiment_name,
                        tags=tags,
                        params=all_params,
                        upload_source_files=[
                            args.config, 'src/losses/*.py', __file__
                        ])

                # Input shape
                input_dim = x.shape[1]

                # Layers
                predefined_layers = exp_params.LAYERS.copy()
                if criterion_type == ftype.ENTR_R:
                    predefined_layers.append(2)

                # Output shape (#FIXME)
                output_dim = 1

                layers = [input_dim] + predefined_layers + [output_dim]

                model = CustomNeuralNetwork(layers,
                                            exp_params.HIDDEN_ACTIVATIONS,
                                            exp_params.OUTPUT_ACTIVATIONS)
                optimizer = Adam(model.parameters(),
                                 lr=exp_params.LEARNING_RATE)

                # Run an experiment
                _debug('Starting the training ...')
                logger = neptune if args.useneptune else None
                model, training_loss_history, validation_loss_history = \
                    trainingloop.run(experiment_name, optimizer, criterion, model,
                                     train_dataloader, exp_params.EPOCHS, valid_dataloader,
                                     test_data_x=test_data_x, test_data_y=test_data_y,
                                     eval_freq=exp_params.EVAL,
                                     early_stopping=exp_params.EARLY_STOPPING,
                                     neptune_logger=logger, knn_use_indices=True,
                                     loss_type=criterion_type)

                # Evaluate the model
                metrics = trainingloop.evaluate_binary(model, test_data_x,
                                                       test_data_y)
                _debug(f'Done! Evaluation results: \n{metrics}\n')

                if args.useneptune:
                    for metric, value in metrics.items():
                        neptune.log_metric(f'final_{metric}', value)
                    experiment.stop()
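A minimal driver sketch (assumed, not part of the original example): the flag names simply mirror the attributes that run_experiments reads from args.

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', type=str, required=True)
    parser.add_argument('--debug', action='store_true')
    parser.add_argument('--verbose', action='store_true')
    parser.add_argument('--useneptune', action='store_true')
    run_experiments(parser.parse_args())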
Example #6
        args.ability_dim,
        args.ability_merge,
        args.num_iafs,
    )
    args.out_dir = os.path.join(args.out_dir, out_file)

    if not os.path.isdir(args.out_dir):
        os.makedirs(args.out_dir)

    device = torch.device("cuda" if args.cuda else "cpu")
    if args.cuda: torch.cuda.set_device(args.gpu_device)

    train_dataset = load_dataset(
        args.dataset,
        train=True,
        num_person=args.num_person,
        num_item=args.num_item,
        ability_dim=args.ability_dim,
    )
    test_dataset = load_dataset(
        args.dataset,
        train=False,
        num_person=args.num_person,
        num_item=args.num_item,
        ability_dim=args.ability_dim,
    )

    num_person = train_dataset.num_person
    num_item = train_dataset.num_item

    train_loader = torch.utils.data.DataLoader(