def main(project_name):

    logger = Logger('_02_valid_model_{}'.format(project_name))
    logger.info('=' * 50)

    model_path = '_model/embedding_model_{}.pt'.format(project_name)
    logger.info('load model from {}'.format(model_path))
    model = torch.load(model_path)

    evaluator = Evaluator()
    evaluator.evaluate(model)
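
# Hypothetical entry point for the validation script above (the original CLI wiring
# is not shown; the argument name is an assumption):
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('project_name', help='suffix of the saved model file name')
    main(parser.parse_args().project_name)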
Example #2
    def __init__(self, recommender, hyperparameters, verbose=True, report_name='grid_search_results'):
        """
        Train a number of recommenders using UV decomposition with different parameters.

        :param AbstractRecommender recommender:
        :param dict hyperparameters: A dictionary of the hyperparameters.
        :param boolean verbose: A flag to decide printing progress.
        :param str report_name: The name of the csv file in which the analysis of the grid search will be dumped.
        """
        self.recommender = recommender
        self.hyperparameters = hyperparameters
        self._verbose = verbose
        self.evaluator = Evaluator(recommender.get_ratings())
        self.all_errors = dict()
        self.results_file_name = report_name + '.csv'
Example #3
    def setUp(self):
        """
        Setup method that is called at the beginning of each test.
        """
        self.documents, self.users = 18, 10
        documents_cnt, users_cnt = self.documents, self.users
        self.n_iterations = 15
        self.k_folds = 3
        self.hyperparameters = {'n_factors': 5, '_lambda': 0.01}
        self.options = {'n_iterations': self.n_iterations, 'k_folds': self.k_folds}
        self.initializer = ModelInitializer(self.hyperparameters.copy(), self.n_iterations)
        self.n_recommendations = 1

        def mock_get_ratings_matrix(self=None):
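            # Deterministic mock: cell (user, article) is 1 exactly when
            # (article + user) % 3 == 0, so roughly a third of the cells are rated.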
            return [[int(not bool((article + user) % 3)) for article in range(documents_cnt)]
                    for user in range(users_cnt)]

        self.ratings_matrix = numpy.array(mock_get_ratings_matrix())
        setattr(DataParser, "get_ratings_matrix", mock_get_ratings_matrix)

        self.evaluator = Evaluator(self.ratings_matrix)
        self.cf = CollaborativeFiltering(self.initializer, self.evaluator, self.hyperparameters,
                                         self.options, load_matrices=True)
        self.cf.train()
        self.cf.evaluator.k_folds = self.k_folds
        self.test_data = self.cf.test_data
        self.predictions = self.cf.get_predictions()
        self.rounded_predictions = self.cf.rounded_predictions()
Example #4
    def runTest(self):
        evaluator = Evaluator(self.ratings_matrix)
        cf = CollaborativeFiltering(self.initializer,
                                    evaluator,
                                    self.initial_config,
                                    self.options,
                                    load_matrices=True)
        grid_search = GridSearch(cf, self.hyperparameters, False)
        self.checkKeyGenerator(grid_search)
        self.checkCombinationsGenerator(grid_search)
        self.checkGridSearch(grid_search)
def main(args):
    logger = Logger(args.output_dir)
    args.logger = logger
    trainer = Trainer(args)
    evaluator = Evaluator(trainer)
    for i_epoch in range(0, args.epoch + 1):

        # train
        log_dict = {
            'i_epoch': i_epoch,
            'train_losses': [],  # per batch
            'test_bleus': []
        }  # per sample
        trainer.train_one_epoch(log_dict)

        # evaluation and logging
        logger.log('%d th epoch' % i_epoch)
        evaluator.bleu(log_dict)
        evaluator.sample_translation()
        log_dict_mean = {
            'i_epoch': log_dict['i_epoch'],
            'train_loss': np.mean(log_dict['train_losses']),
            'test_bleu': np.mean(log_dict['test_bleus'])
        }
        logger.dump(log_dict_mean)
        trainer.save_best(log_dict_mean)
        logger.log('-' * 10)
Example #6
    def runTest(self):
        m1, m2 = numpy.random.random((4, 8)), numpy.random.random((4, 8))
        self.assertTrue(abs(self.cf.evaluator.get_rmse(m1, m2) - numpy.sqrt(mean_squared_error(m1, m2))) < 1e-6)
        train, test = self.cf.evaluator.naive_split()
        self.assertEqual(numpy.count_nonzero(train) + numpy.count_nonzero(test),
                         numpy.count_nonzero(self.ratings_matrix))

        test_indices = self.cf.evaluator.get_kfold_indices()
        # k = 3
        first_fold_indices = test_indices[0::self.k_folds]
        second_fold_indices = test_indices[1::self.k_folds]
        third_fold_indices = test_indices[2::self.k_folds]
        train1, test1 = self.cf.evaluator.generate_kfold_matrix(first_fold_indices)
        train2, test2 = self.cf.evaluator.generate_kfold_matrix(second_fold_indices)
        train3, test3 = self.cf.evaluator.generate_kfold_matrix(third_fold_indices)

        total_ratings = numpy.count_nonzero(self.ratings_matrix)

        # ensure that each fold has 1/k of the total ratings
        k_inverse = 1 / self.k_folds
        self.assertTrue(abs(k_inverse - ((numpy.count_nonzero(test1)) / total_ratings)) < 1e-6)
        self.assertTrue(abs(k_inverse - ((numpy.count_nonzero(test2)) / total_ratings)) < 1e-6)
        self.assertTrue(abs(k_inverse - ((numpy.count_nonzero(test3)) / total_ratings)) < 1e-6)

        # assert that the folds don't intertwine
        self.assertTrue(numpy.all((train1 * test1) == 0))
        self.assertTrue(numpy.all((train2 * test2) == 0))
        self.assertTrue(numpy.all((train3 * test3) == 0))
        # assert that the test sets don't contain the same elements
        self.assertTrue(numpy.all((test1 * test2) == 0))
        self.assertTrue(numpy.all((test2 * test3) == 0))
        self.assertTrue(numpy.all((test1 * test3) == 0))

        evaluator = Evaluator(self.ratings_matrix)
        self.assertEqual(self.predictions.shape, self.ratings_matrix.shape)
        recall = evaluator.calculate_recall(self.ratings_matrix, self.predictions)
        # if predictions are perfect
        if recall == 1:
            for row in range(self.users):
                for col in range(self.documents):
                    self.assertEqual(self.rounded_predictions[row, col], self.ratings_matrix[row, col])

        # restore the unmodified rating matrix
        self.setUp()
        evaluator.ratings = self.ratings_matrix.copy()

        # mrr will always decrease as we set the highest prediction's index
        # to 0 in the rating matrix. top_n recommendations set to 0.
        mrr = []
        for i in range(self.users):
            evaluator.ratings[i, (numpy.argmax(self.predictions[i], axis=0))] = 0
            mrr.append(evaluator.calculate_mrr(self.n_recommendations, self.predictions,
                                               self.rounded_predictions, evaluator.ratings))
            if i > 1:
                self.assertLessEqual(mrr[i], mrr[i-1])
    def setUp(self):
        """
        Setup method that is called at the beginning of each test.
        """
        self.documents, self.users = 8, 10
        documents_cnt, users_cnt = self.documents, self.users
        self.n_iterations = 5
        self.n_factors = 5
        self.k_folds = 5
        self.hyperparameters = {'n_factors': self.n_factors}
        self.options = {'n_iterations': self.n_iterations, 'k_folds': self.k_folds}
        self.initializer = ModelInitializer(self.hyperparameters.copy(), self.n_iterations)

        def mock_process(self=None):
            pass

        def mock_get_abstracts(self=None):
            return {0: 'hell world berlin dna evolution', 1: 'freiburg is green',
                    2: 'the best dna is the dna of dinasours', 3: 'truth is absolute',
                    4: 'berlin is not that green', 5: 'truth manifests itself',
                    6: 'plato said truth is beautiful', 7: 'freiburg has dna'}

        def mock_get_ratings_matrix(self=None):
            return [[int(not bool((article + user) % 3)) for article in range(documents_cnt)]
                    for user in range(users_cnt)]

        def mock_get_word_distribution(self=None):
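            # Builds a small bag-of-words view of the mock abstracts:
            #   word_to_count: (word_id, total token count across all abstracts) pairs
            #   article_to_word: unique (doc_id, word_id) pairs
            #   article_to_word_to_count: (doc_id, word_id, str.count of the word in that abstract) triples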
            abstracts = mock_get_abstracts()
            vocab = set(itertools.chain(*list(map(lambda ab: ab.split(' '), abstracts.values()))))
            w2i = dict(zip(vocab, range(len(vocab))))
            word_to_count = [(w2i[word], sum(abstract.split(' ').count(word)
                                             for doc_id, abstract in abstracts.items())) for word in vocab]
            article_to_word = list(set([(doc_id, w2i[word])
                                        for doc_id, abstract in abstracts.items() for word in abstract.split(' ')]))
            article_to_word_to_count = list(set([(doc_id, w2i[word], abstract.count(word))
                                                 for doc_id, abstract in abstracts.items()
                                                 for word in abstract.split(' ')]))
            return word_to_count, article_to_word, article_to_word_to_count

        abstracts = mock_get_abstracts()
        word_to_count, article_to_word, article_to_word_to_count = mock_get_word_distribution()
        self.abstracts_preprocessor = AbstractsPreprocessor(abstracts, word_to_count,
                                                            article_to_word, article_to_word_to_count)
        self.ratings_matrix = numpy.array(mock_get_ratings_matrix())
        self.evaluator = Evaluator(self.ratings_matrix, self.abstracts_preprocessor)
        setattr(DataParser, "get_abstracts", mock_get_abstracts)
        setattr(DataParser, "process", mock_process)
        setattr(DataParser, "get_ratings_matrix", mock_get_ratings_matrix)
        setattr(DataParser, "get_word_distribution", mock_get_word_distribution)
Example #8
 def setUp(self):
     """
     Setting up the ratings, expected ratings and recommendations.
     The comments show where the matching recommendations are.
     A matching recommendation occurs at an index in the recommendation_indices list
     where the corresponding rating and expected rating are both positive.
     """
     # 0  1  2  3  4  5  6  7  8
     self.ratings = numpy.array([
         [1, 1, 0, 0, 1, 0, 1, 0, 0],
         #    ^
         [0, 0, 1, 1, 0, 0, 0, 1, 0],
         #       ^
         [1, 1, 0, 1, 0, 0, 1, 0, 1],
         #          ^              ^
         [1, 0, 0, 0, 1, 0, 0, 0, 0],
         # ^
         [0, 0, 0, 0, 0, 0, 0, 0, 1]
     ])
     #
     # 0  1  2  3  4  5  6  7  8
     self.expected_ratings = numpy.array([
         [0, 1, 0, 0, 0, 0, 0, 0, 0],
         #    ^
         [0, 0, 1, 0, 0, 0, 0, 0, 0],
         #       ^
         [0, 0, 0, 1, 0, 0, 0, 0, 1],
         #          ^              ^
         [1, 0, 0, 0, 0, 0, 0, 0, 0],
         # ^
         [0, 1, 0, 0, 0, 0, 0, 0, 0]
     ])
     #
     self.recommendation_indices = numpy.array([
         [1],
         # 1 matches -> 1/1
         [3, 2],
         # 3 doesn't match, 2 matches -> 1/2
         [4, 6, 3, 0, 8],
         # 4,6,0 don't match, 3, 8 match -> 1/3, 1/5
         [0],
         # 0 matches -> 1/1
         [0]
     ])
     # no matches -> 0
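      # Taken together, the first matching positions per user are 1, 2, 3, 1 and
      # none for the last user, i.e. reciprocal ranks 1, 1/2, 1/3, 1, 0.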
     self.n_users, self.n_items = self.ratings.shape
     self.evaluator = Evaluator(self.ratings)
     self.evaluator.recs_loaded = True
     self.evaluator.recommendation_indices = self.recommendation_indices
    def setUp(self):
        """
        Setup method that is called at the beginning of each test.
        """
        self.documents, self.users = 30, 4
        documents_cnt, users_cnt = self.documents, self.users
        self.n_factors = 5
        self.n_iterations = 20
        self.k_folds = 3
        self.hyperparameters = {'n_factors': self.n_factors, '_lambda': 0.01}
        self.options = {'k_folds': self.k_folds, 'n_iterations': self.n_iterations}
        self.initializer = ModelInitializer(self.hyperparameters.copy(), self.n_iterations)

        def mock_get_ratings_matrix(self=None):
            return [[int(not bool((article + user) % 3)) for article in range(documents_cnt)]
                    for user in range(users_cnt)]
        self.ratings_matrix = numpy.array(mock_get_ratings_matrix())
        self.evaluator = Evaluator(self.ratings_matrix)
        setattr(DataParser, "get_ratings_matrix", mock_get_ratings_matrix)
def train(model, project_name):

    sampler = SubSampler()
    list_train_imgs = sampler.get_train_imgs()
    dataset = LandmarkDataset('../../input_large_delf/train', list_train_imgs)
    evaluator = Evaluator()

    dir_model = '_model'
    os.makedirs(dir_model, exist_ok=True)

    # for training
    batch_size = 240
    group_size = 12
    iter_outside = 10
    iter_inside = 600

    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-3)
    scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=iter_outside * iter_inside)

    for param_group in optimizer.param_groups:
        logger.info('starting learning rate: {:.6f}'.format(param_group['lr']))

    for ep in range(1, iter_outside + 1):

        logger.info('-' * 30)
        logger.info('epoch: {:d}'.format(ep))

        for param_group in optimizer.param_groups:
            logger.info('current learning rate: {:.8f}'.format(param_group['lr']))

        model.train()
        if ep > 1:
            set_batch_norm_eval(model)

        train_loss1 = 0
        train_loss3 = 0
        count_sample = 0
        ave_good_index = 0

        pt_sampler = PyTorchSampler(sampler, batch_size, group_size, iter_inside)

        dataloader = DataLoader(dataset, batch_sampler=pt_sampler, num_workers=8)

        for batch in tqdm(dataloader):
            batch_cuda = batch.cuda()

            # forward with requires_grad=False

            with torch.no_grad():
                v_batch_no_bp = batch_cuda
                optimizer.zero_grad()
                out = model.forward(v_batch_no_bp)

                batch_indices, num_good_index = get_apn_index(out, batch_size, group_size)

            # forward with requires_grad=True

            v_batch = batch_cuda[batch_indices, ...]

            optimizer.zero_grad()
            out = model.forward(v_batch)

            out_anchor = out[:batch_size]
            hard_positive = out[batch_size:batch_size*2]
            hard_negative = out[batch_size*2:batch_size*3]

            # calc loss

            loss1 = smooth_pairwise_loss(out_anchor, hard_positive) * 0.1
            loss3 = hard_negative_triplet_loss(out_anchor, hard_positive, hard_negative)

            loss = loss3
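            # only the triplet term is backpropagated; the pairwise term (loss1)
            # is accumulated below for logging only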

            loss.backward()
            optimizer.step()
            scheduler.step()

            train_loss1 += float(loss1.data.cpu().numpy()) * batch_size
            train_loss3 += float(loss3.data.cpu().numpy()) * batch_size
            ave_good_index += num_good_index * batch_size
            count_sample += batch_size

        logger.info('train loss (pair-pos): {:.6f}'.format(train_loss1 / count_sample))
        logger.info('train loss (triplet) : {:.6f}'.format(train_loss3 / count_sample))
        logger.info('average number of far negative: {:.2f} / {:d}'.format(ave_good_index / count_sample, batch_size))

        evaluator.evaluate(model)

        if ep % 4 == 0 and ep != iter_outside:

            model_name = 'embedding_model_{}_ep{}.pt'.format(project_name, ep)
            logger.info('save model: {}'.format(model_name))
            torch.save(model, os.path.join(dir_model, model_name))

    model_name = 'embedding_model_{}.pt'.format(project_name)
    logger.info('save model: {}'.format(model_name))
    torch.save(model, os.path.join(dir_model, model_name))
def run_training(H):
    # torch.cuda.is_available = lambda : False
    # torch.backends.cudnn.enabled=False
    torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = True

    create_logger(H)

    random.seed(H.SEED)
    np.random.seed(H.SEED)
    torch.manual_seed(H.SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(H.SEED)
        torch.cuda.manual_seed_all(H.SEED)

    logger.info("Training start.")
    logger.info(repr(H))

    train_loader, valid_loader, vocab = create_data_pipelines(H)

    logger.info(train_loader.dataset)
    logger.info(valid_loader.dataset)

    m = Metric([('train_loss', np.inf), ('train_score', np.inf),
                ('valid_loss', np.inf), ('valid_score', 0), ('train_lr', 0),
                ('valid_cer', np.inf)])

    model = SpeechCNN(len(vocab),
                      input_size=256,
                      hidden_size=H.CNN_HIDDEN_SIZE,
                      dropout=H.CNN_DROPOUT,
                      initialize=torch_weight_init)
    if H.USE_CUDA:
        model.cuda()

    if H.PRELOAD_MODEL_PATH:
        path = os.path.join(H.EXPERIMENT, H.PRELOAD_MODEL_PATH)
        state = torch.load(path)
        model.load_state_dict(state)
        print("Preloaded model: {}".format(path))

    criterion = PytorchCTCLoss(vocab)

    optimizer = optim.SGD(list(
        filter(lambda p: p.requires_grad, model.parameters())),
                          lr=H.LR,
                          weight_decay=H.WEIGHT_DECAY,
                          momentum=H.MOMENTUM,
                          nesterov=H.NESTEROV)

    stopping = Stopping(model, patience=H.STOPPING_PATIENCE)

    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=[H.LR_LAMBDA])

    ctc_decoder = CTCGreedyDecoder(vocab)

    scorer = Scorer(reduction='sum')

    tlogger = TensorboardLogger(root_dir=H.EXPERIMENT,
                                experiment_dir=H.TIMESTAMP)  # PytorchLogger()

    checkpoint = Checkpoint(model,
                            optimizer,
                            stopping,
                            m,
                            root_dir=H.EXPERIMENT,
                            experiment_dir=H.TIMESTAMP,
                            restore_from=-1,
                            interval=H.CHECKPOINT_INTERVAL,
                            verbose=0)

    trainer = Trainer(model, train_loader, optimizer, scheduler, criterion,
                      ctc_decoder, scorer, H.MAX_GRAD_NORM)

    evaluator = Evaluator(model, valid_loader, criterion, ctc_decoder, scorer)

    epoch_start = 1
    if H.CHECKPOINT_RESTORE:
        epoch_start = checkpoint.restore() + 1
        train_loader.batch_sampler.shuffle(epoch_start)

    epoch = epoch_start
    try:
        epoch_itr = tlogger.set_itr(range(epoch_start, H.MAX_EPOCHS + 1))

        for epoch in epoch_itr:

            with DelayedKeyboardInterrupt():

                m.train_loss, m.train_score, m.train_lr = trainer(epoch)

                m.valid_loss, m.valid_score = evaluator()

                if checkpoint:
                    checkpoint.step(epoch)

                stopping_flag = stopping.step(epoch, m.valid_loss,
                                              m.valid_score)

                epoch_itr.log_values(m.train_loss, m.train_score, m.train_lr,
                                     m.valid_loss, m.valid_score,
                                     stopping.best_score_epoch,
                                     stopping.best_score)

                if stopping_flag:
                    logger.info(
                        "Early stopping at epoch: %d, score %f" %
                        (stopping.best_score_epoch, stopping.best_score))
                    break

                train_loader.batch_sampler.shuffle(epoch)

    except KeyboardInterrupt:
        logger.info("Training interrupted at: {}".format(epoch))
        pass

    checkpoint.create(epoch)

    model.load_state_dict(stopping.best_score_state)
    torch.save(model.state_dict(),
               os.path.join(H.EXPERIMENT, H.MODEL_NAME + '.tar'))

    logger.info(repr(tlogger))
    logger.info(repr(stopping))
    logger.info(repr(checkpoint))

    logger.info("Training end.")
Example #12
def main():
    args = parse_args()
    update_config(args.cfg_file)

    if args.gpus:
        config.GPUS = args.gpus
    else:
        config.CUDA = False
    if args.workers:
        config.WORKERS = args.workers
    print('Using config:')
    pprint.pprint(config)

    torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    if config.CUDA:
        os.environ["CUDA_VISIBLE_DEVICES"] = config.GPUS
    device = torch.device('cuda' if config.CUDA else 'cpu')

    # Redirect print to both console and log file
    sys.stdout = Logger(osp.join(config.OUTPUT_DIR, 'log-eval.txt'))

    # Create data loaders
    dataset = DataSet(config.DATASET.ROOT, config.DATASET.DATASET)

    normalizer = T.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    transformer = T.Compose([
        T.Resize(config.MODEL.IMAGE_SIZE, interpolation=3),
        T.ToTensor(),
        normalizer,
    ])

    query_loader = DataLoader(
        Preprocessor(dataset.query,
                     root=osp.join(dataset.images_dir, dataset.query_path), transform=transformer),
        batch_size=config.TEST.BATCH_SIZE, num_workers=config.WORKERS,
        shuffle=False, pin_memory=True)

    gallery_loader = DataLoader(
        Preprocessor(dataset.gallery,
                     root=osp.join(dataset.images_dir, dataset.gallery_path), transform=transformer),
        batch_size=config.TEST.BATCH_SIZE, num_workers=config.WORKERS,
        shuffle=False, pin_memory=True)

    # Create model
    model = models.create(config.MODEL.NAME)

    # Load from checkpoint
    checkpoint = load_checkpoint(config.TEST.MODEL_FILE)
    print('best model at epoch: {}'.format(checkpoint['epoch']))
    model.load_state_dict(checkpoint['state_dict'], strict=False)

    # Set model
    model = nn.DataParallel(model).to(device)
    
    print('Test with best model:')
    evaluator = Evaluator(model)
    evaluator.evaluate(query_loader, gallery_loader, dataset.query,
                       dataset.gallery, config.TEST.OUTPUT_FEATURES)
Example #13
def main():
    args = parse_args()
    update_config(args.cfg_file)

    if args.gpus:
        config.GPUS = args.gpus
    else:
        config.CUDA = False
    if args.workers:
        config.WORKERS = args.workers
    print('Using config:')
    pprint.pprint(config)

    if args.manualSeed is None:
        args.manualSeed = random.randint(1, 10000)
    random.seed(args.manualSeed)
    np.random.seed(args.manualSeed)
    torch.manual_seed(args.manualSeed)
    if config.CUDA:
        torch.cuda.manual_seed_all(args.manualSeed)

    torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    if config.CUDA:
        os.environ["CUDA_VISIBLE_DEVICES"] = config.GPUS
    device = torch.device('cuda' if config.CUDA else 'cpu')

    # Redirect print to both console and log file
    sys.stdout = Logger(osp.join(config.OUTPUT_DIR, 'log.txt'))

    # Create data loaders
    dataset = DataSet(config.DATASET.ROOT, config.DATASET.DATASET)
    normalizer = T.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])

    train_transformer = T.Compose([
        T.RandomSizedRectCrop(*config.MODEL.IMAGE_SIZE),
        T.RandomHorizontalFlip(),
        T.RandomRotation(10),
        T.ColorJitter(0.2, 0.2, 0.2),
        T.ToTensor(),
        normalizer,
        T.RandomErasing(EPSILON=config.DATASET.RE),
    ])
    test_transformer = T.Compose([
        T.Resize(config.MODEL.IMAGE_SIZE, interpolation=3),
        T.ToTensor(),
        normalizer,
    ])
    train_loader = DataLoader(UnsupervisedCamStylePreprocessor(
        dataset.train,
        root=osp.join(dataset.images_dir, dataset.train_path),
        camstyle_root=osp.join(dataset.images_dir,
                               dataset.train_camstyle_path),
        num_cam=dataset.num_cam,
        use_gan=True,
        transform=train_transformer),
                              batch_size=config.TRAIN.BATCH_SIZE,
                              num_workers=config.WORKERS,
                              shuffle=config.TRAIN.SHUFFLE,
                              pin_memory=True,
                              drop_last=False)

    query_loader = DataLoader(Preprocessor(dataset.query,
                                           root=osp.join(
                                               dataset.images_dir,
                                               dataset.query_path),
                                           transform=test_transformer),
                              batch_size=config.TEST.BATCH_SIZE,
                              num_workers=config.WORKERS,
                              shuffle=False,
                              pin_memory=True)

    gallery_loader = DataLoader(Preprocessor(dataset.gallery,
                                             root=osp.join(
                                                 dataset.images_dir,
                                                 dataset.gallery_path),
                                             transform=test_transformer),
                                batch_size=config.TEST.BATCH_SIZE,
                                num_workers=config.WORKERS,
                                shuffle=False,
                                pin_memory=True)

    # Create model
    model = models.create(config.MODEL.NAME,
                          pretrained=config.MODEL.PRETRAINED,
                          num_classes=dataset.num_train_ids)

    # Memory Network
    num_tgt = len(dataset.train)
    memory = models.create('memory', config.MODEL.FEATURES, num_tgt)

    # Load from checkpoint
    if config.TRAIN.RESUME:
        checkpoint = load_checkpoint(config.TRAIN.CHECKPOINT)
        model.load_state_dict(checkpoint['state_dict'], strict=False)
        memory.load_state_dict(checkpoint['state_dict_memory'], strict=False)
        print("=> Start epoch {} ".format(checkpoint['epoch']))

    # Set model
    model = nn.DataParallel(model).to(device)
    memory = memory.to(device)

    # Optimizer
    base_param_ids = set(map(id, model.module.base.parameters()))

    base_params_need_for_grad = filter(lambda p: p.requires_grad,
                                       model.module.base.parameters())

    new_params = [p for p in model.parameters() if id(p) not in base_param_ids]
    param_groups = [{
        'params': base_params_need_for_grad,
        'lr_mult': 0.1
    }, {
        'params': new_params,
        'lr_mult': 1.0
    }]

    optimizer = get_optimizer(config, param_groups)

    # Trainer
    trainer = Trainer(config, model, memory)

    def adjust_lr(epoch):
        step_size = config.TRAIN.LR_STEP
        lr = config.TRAIN.LR * (config.TRAIN.LR_FACTOR**(epoch // step_size))
        for g in optimizer.param_groups:
            g['lr'] = lr * g.get('lr_mult', 1)

    best_r1 = 0.0
    # Start training
    for epoch in range(config.TRAIN.BEGIN_EPOCH, config.TRAIN.END_EPOCH):
        # lr_scheduler.step()
        adjust_lr(epoch)
        trainer.train(epoch, train_loader, optimizer)

        print('Test with latest model:')
        evaluator = Evaluator(model)
        r1 = evaluator.evaluate(query_loader, gallery_loader, dataset.query,
                                dataset.gallery, config.TEST.OUTPUT_FEATURES)

        if r1 > best_r1:
            best_r1 = r1
            save_checkpoint(
                {
                    'state_dict': model.module.state_dict(),
                    'state_dict_memory': memory.state_dict(),
                    'epoch': epoch + 1,
                },
                fpath=osp.join(config.OUTPUT_DIR, 'checkpoint.pth.tar'))

        print('\n * Finished epoch {:3d} \n'.format(epoch))

    # Final test
    print('Test with best model:')
    evaluator = Evaluator(model)
    checkpoint = load_checkpoint(
        osp.join(config.OUTPUT_DIR, 'checkpoint.pth.tar'))
    print('best model at epoch: {}'.format(checkpoint['epoch']))
    model.module.load_state_dict(checkpoint['state_dict'])
    evaluator.evaluate(query_loader, gallery_loader, dataset.query,
                       dataset.gallery, config.TEST.OUTPUT_FEATURES)
Example #14
    def runTest(self):
        train, test = self.cf.evaluator.naive_split()
        self.assertEqual(
            numpy.count_nonzero(train) + numpy.count_nonzero(test),
            numpy.count_nonzero(self.ratings_matrix))

        train_indices, test_indices = self.cf.evaluator.get_kfold_indices()
        # k = 3
        first_fold_indices = train_indices[0::self.k_folds], test_indices[
            0::self.k_folds]
        second_fold_indices = train_indices[1::self.k_folds], test_indices[
            1::self.k_folds]
        third_fold_indices = train_indices[2::self.k_folds], test_indices[
            2::self.k_folds]

        train1, test1 = self.cf.evaluator.generate_kfold_matrix(
            first_fold_indices[0], first_fold_indices[1])
        train2, test2 = self.cf.evaluator.generate_kfold_matrix(
            second_fold_indices[0], second_fold_indices[1])
        train3, test3 = self.cf.evaluator.generate_kfold_matrix(
            third_fold_indices[0], third_fold_indices[1])

        total_ratings = numpy.count_nonzero(self.ratings_matrix)

        # ensure that each fold has 1/k of the total ratings
        k_inverse = (1 / self.k_folds)
        self.assertEqual(k_inverse, numpy.count_nonzero(test1) / total_ratings)

        self.assertEqual(k_inverse, numpy.count_nonzero(test2) / total_ratings)

        self.assertEqual(k_inverse, numpy.count_nonzero(test3) / total_ratings)

        # assert that the folds don't intertwine
        self.assertTrue(numpy.all((train1 * test1) == 0))
        self.assertTrue(numpy.all((train2 * test2) == 0))
        self.assertTrue(numpy.all((train3 * test3) == 0))
        # assert that the test sets don't contain the same elements
        self.assertTrue(numpy.all((test1 * test2) == 0))
        self.assertTrue(numpy.all((test2 * test3) == 0))
        self.assertTrue(numpy.all((test1 * test3) == 0))

        evaluator = Evaluator(self.ratings_matrix)
        self.assertEqual(self.predictions.shape, self.ratings_matrix.shape)
        recall = evaluator.calculate_recall(self.ratings_matrix,
                                            self.predictions)
        # if predictions are perfect
        if recall == 1:
            for row in range(self.users):
                for col in range(self.documents):
                    self.assertEqual(self.rounded_predictions[row, col],
                                     self.ratings_matrix[row, col])

        # If we modify all the top predictions for half the users,
        # recall should be 0.5 by definition
        for i in range(0, self.users, 2):
            evaluator.ratings[i, self.predictions[i].nonzero()[0]] = 0
        recall_at_x = evaluator.recall_at_x(self.n_recommendations,
                                            self.predictions,
                                            self.ratings_matrix,
                                            self.rounded_predictions)
        self.assertEqual(0.5, recall_at_x)

        self.setUp()
        evaluator.ratings[:] = self.ratings_matrix

        # removing all top hits should yield an ndcg of 0, since the number of recs is 1.
        for i in range(0, self.users):
            evaluator.ratings[i, self.predictions[i].nonzero()[0]] = 0
        ndcg = evaluator.calculate_ndcg(self.n_recommendations,
                                        self.predictions, self.ratings_matrix,
                                        self.test_data)

        self.assertEqual(0.0, ndcg)

        # restore the unmodified rating matrix
        self.setUp()
        evaluator.ratings[:] = self.ratings_matrix

        # mrr will always decrease as we set the highest prediction's index
        # to 0 in the rating matrix. top_n recommendations set to 0.
        mrr = []
        for i in range(self.users):
            mrr.append(
                evaluator.calculate_mrr(self.n_recommendations,
                                        self.predictions,
                                        self.rounded_predictions,
                                        self.test_data))
            evaluator.ratings[i,
                              (numpy.argmax(self.predictions[i], axis=0))] = 0
            if i > 1:
                self.assertLessEqual(mrr[i], mrr[i - 1])
def train(model, project_name):

    sampler = Sampler()
    loader = ImgLoader('../../input_large_delf/train')
    evaluator = Evaluator()

    dir_model = '_model'
    os.makedirs(dir_model, exist_ok=True)

    # for training
    batch_size = 240
    group_size = 12
    iter_outside = 10
    iter_inside = 500

    optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                  model.parameters()),
                           lr=1e-4)

    for param_group in optimizer.param_groups:
        logger.info('starting learning rate: {:.6f}'.format(
            param_group['lr']))

    for ep in range(1, iter_outside + 1):

        logger.info('-' * 30)
        logger.info('epoch: {:d}'.format(ep))

        model.train()
        if ep > 1:
            set_batch_norm_eval(model)

        train_loss1 = 0
        train_loss3 = 0
        count_sample = 0
        ave_good_index = 0

        for _ in tqdm(range(iter_inside)):

            batch = torch.FloatTensor(batch_size * group_size, 40,
                                      1000).zero_()
            ids = sampler.get_sample(batch_size, group_size)

            for i in range(batch_size * group_size):
                batch[i] = loader.load_image('{}.delf'.format(ids[i]))

            batch_cuda = batch.cuda()

            # forward with requires_grad=False

            v_batch_no_bp = Variable(batch_cuda, volatile=True)
            optimizer.zero_grad()
            out = model.forward(v_batch_no_bp)

            batch_indices, num_good_index = get_apn_index(
                out, batch_size, group_size)

            # forward with requires_grad=True

            v_batch = Variable(batch_cuda[batch_indices, :, :])

            optimizer.zero_grad()
            out = model.forward(v_batch)

            out_anchor = out[:batch_size]
            hard_positive = out[batch_size:batch_size * 2]
            hard_negative = out[batch_size * 2:batch_size * 3]

            # calc loss

            loss1 = smooth_pairwise_loss(out_anchor, hard_positive) * 0.1
            loss3 = hard_negative_triplet_loss(out_anchor, hard_positive,
                                               hard_negative)

            loss = loss3
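            # only the triplet term is backpropagated; the pairwise term (loss1)
            # is accumulated below for logging only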

            loss.backward()
            optimizer.step()

            train_loss1 += float(loss1.data.cpu().numpy()) * batch_size
            train_loss3 += float(loss3.data.cpu().numpy()) * batch_size
            ave_good_index += num_good_index * batch_size
            count_sample += batch_size

        logger.info('train loss (pair-pos): {:.6f}'.format(train_loss1 /
                                                           count_sample))
        logger.info('train loss (triplet) : {:.6f}'.format(train_loss3 /
                                                           count_sample))
        logger.info('average number of far negative: {:.2f} / {:d}'.format(
            ave_good_index / count_sample, batch_size))

        evaluator.evaluate(model)

        if ep % 4 == 0 and ep != iter_outside:

            model_name = 'embedding_model_{}_ep{}.pt'.format(project_name, ep)
            logger.info('save model: {}'.format(model_name))
            torch.save(model, os.path.join(dir_model, model_name))

            if ep % 8 == 0:

                for param_group in optimizer.param_groups:
                    param_group['lr'] *= 0.1
                    logger.info('change learning rate to: {:.6f}'.format(
                        param_group['lr']))

    model_name = 'embedding_model_{}.pt'.format(project_name)
    logger.info('save model: {}'.format(model_name))
    torch.save(model, os.path.join(dir_model, model_name))
def run_training(H):
    # torch.cuda.is_available = lambda : False
    # torch.backends.cudnn.enabled=False
    torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = True

    create_logger(H)

    random.seed(H.SEED)
    np.random.seed(H.SEED)
    torch.manual_seed(H.SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(H.SEED)
        torch.cuda.manual_seed_all(H.SEED)

    logger.info("Training start.")
    logger.info(repr(H))

    train_loader, valid_loader, vocab = create_data_pipelines(H)

    logger.info(train_loader.dataset)
    logger.info(valid_loader.dataset)

    m = Metric([('train_loss', np.inf), ('train_score', np.inf),
                ('valid_loss', np.inf), ('valid_score', 0), ('train_lr', 0),
                ('valid_cer', np.inf)])

    model = NeuralSpeechRecognizer(
        vocab,
        train_loader.dataset.max_seq_length,
        rnn_hidden_size=H.RNN_HIDDEN_SIZE,
        rnn_num_layers=H.RNN_NUM_LAYERS,
        rnn_dropout=H.RNN_DROPOUT,
        cnn_dropout=H.CNN_DROPOUT,
        teacher_forcing_ratio=H.TEACHER_FORCING_RATIO,
        sample_rate=H.AUDIO_SAMPLE_RATE,
        window_size=H.SPECT_WINDOW_SIZE,
        initialize=torch_weight_init)
    if H.USE_CUDA:
        model.cuda()

    logging.info(model_summary(model, line_length=100))

    if H.PRELOAD_MODEL_PATH:
        path = os.path.join(H.EXPERIMENT, H.PRELOAD_MODEL_PATH)
        state = torch.load(path)
        model.load_state_dict(state)
        logging.info("Preloaded model: {}".format(path))

    criterion = LabelSmoothingLoss(padding_idx=0,
                                   label_smoothing=H.LABEL_SMOOTHING)

    sts_decoder = STSDecoder(vocab)

    scorer = Scorer()

    optimizer = optim.Adam(list(
        filter(lambda p: p.requires_grad, model.parameters())),
                           amsgrad=False,
                           betas=(0.9, 0.999),
                           eps=1e-08,
                           lr=H.LR,
                           weight_decay=H.WEIGHT_DECAY)

    stopping = Stopping(model, patience=H.STOPPING_PATIENCE)

    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=[H.LR_LAMBDA])

    tlogger = TensorboardLogger(root_dir=H.EXPERIMENT,
                                experiment_dir=H.TIMESTAMP)  # PytorchLogger()

    checkpoint = Checkpoint(model,
                            optimizer,
                            stopping,
                            m,
                            root_dir=H.EXPERIMENT,
                            experiment_dir=H.TIMESTAMP,
                            restore_from=-1,
                            interval=H.CHECKPOINT_INTERVAL,
                            verbose=0)

    trainer = Trainer(model, train_loader, optimizer, scheduler, criterion,
                      sts_decoder, scorer, H.MAX_GRAD_NORM)

    evaluator = Evaluator(model, valid_loader, criterion, sts_decoder, scorer)

    epoch_start = 1
    if H.CHECKPOINT_RESTORE:
        epoch_start = checkpoint.restore() + 1
        train_loader.batch_sampler.shuffle(epoch_start)

    epoch = epoch_start
    try:
        epoch_itr = tlogger.set_itr(range(epoch_start, H.MAX_EPOCHS + 1))

        for epoch in epoch_itr:

            with DelayedKeyboardInterrupt():

                m.train_loss, m.train_score, m.train_lr = trainer(epoch)

                m.valid_loss, m.valid_score = evaluator()

                if checkpoint:
                    checkpoint.step(epoch)

                stopping_flag = stopping.step(epoch, m.valid_loss,
                                              m.valid_score)

                epoch_itr.log_values(m.train_loss, m.train_score, m.train_lr,
                                     m.valid_loss, m.valid_score,
                                     stopping.best_score_epoch,
                                     stopping.best_score)

                if stopping_flag:
                    logger.info(
                        "Early stopping at epoch: %d, score %f" %
                        (stopping.best_score_epoch, stopping.best_score))
                    break

                train_loader.batch_sampler.shuffle(epoch)

    except KeyboardInterrupt:
        logger.info("Training interrupted at: {}".format(epoch))
        pass

    checkpoint.create(epoch)

    model.load_state_dict(stopping.best_score_state)
    torch.save(model.state_dict(),
               os.path.join(H.EXPERIMENT, H.MODEL_NAME + '.tar'))

    logger.info(repr(tlogger))
    logger.info(repr(stopping))
    logger.info(repr(checkpoint))

    logger.info("Training end.")
Example #17
    def __init__(self,
                 initializer=None,
                 abstracts_preprocessor=None,
                 ratings=None,
                 config=None,
                 process_parser=False,
                 verbose=False,
                 load_matrices=True,
                 dump_matrices=True,
                 train_more=True,
                 random_seed=False,
                 results_file_name='top_recommendations'):
        """
        Constructor of the RecommenderSystem.

        :param ModelInitializer initializer: A model initializer.
        :param AbstractsPreprocessor abstracts_preprocessor: A preprocessor of abstracts, if None then queried.
        :param int[][] ratings: Ratings matrix; if None, matrix gets queried from the database.
        :param boolean process_parser: A flag deciding whether to process the data parser.
        :param boolean verbose: A flag deciding whether to print progress.
        :param boolean dump_matrices: A flag for saving the output matrices.
        :param boolean train_more: A flag to train the collaborative filtering further after loading matrices.
        :param boolean random_seed: A flag deciding whether to use a random seed.
        :param str results_file_name: The file name of the top recommendations results.
        """
        if process_parser:
            DataParser.process()

        if ratings is None:
            self.ratings = numpy.array(DataParser.get_ratings_matrix())
        else:
            self.ratings = ratings

        if abstracts_preprocessor is None:
            self.abstracts_preprocessor = AbstractsPreprocessor(
                DataParser.get_abstracts(),
                *DataParser.get_word_distribution())
        else:
            self.abstracts_preprocessor = abstracts_preprocessor

        # Get configurations
        self.config = RecommenderConfiguration(config)

        # Set flags
        self.results_file_name = results_file_name + '.dat'
        self._verbose = verbose
        self._dump_matrices = dump_matrices
        self._load_matrices = load_matrices
        self._train_more = train_more
        self._split_type = 'user'
        self._random_seed = random_seed

        self.set_hyperparameters(self.config.get_hyperparameters())
        self.set_options(self.config.get_options())

        self.initializer = ModelInitializer(self.hyperparameters.copy(),
                                            self.n_iter, self._verbose)

        if self.config.get_error_metric() == 'RMS':
            self.evaluator = Evaluator(self.ratings,
                                       self.abstracts_preprocessor,
                                       self._random_seed, self._verbose)
        else:
            raise NameError(
                "Not a valid error metric %s. Only option is 'RMS'" %
                self.config.get_error_metric())

        # Initialize content based.
        if self.config.get_content_based() == 'None':
            self.content_based = ContentBased(self.initializer, self.evaluator,
                                              self.hyperparameters,
                                              self.options, self._verbose,
                                              self._load_matrices,
                                              self._dump_matrices)
        elif self.config.get_content_based() == 'LDA':
            self.content_based = LDARecommender(self.initializer,
                                                self.evaluator,
                                                self.hyperparameters,
                                                self.options, self._verbose,
                                                self._load_matrices,
                                                self._dump_matrices)
        elif self.config.get_content_based() == 'LDA2Vec':
            self.content_based = LDA2VecRecommender(
                self.initializer, self.evaluator, self.hyperparameters,
                self.options, self._verbose, self._load_matrices,
                self._dump_matrices)
        else:
            raise NameError(
                "Not a valid content based %s. Options are 'None', "
                "'LDA', 'LDA2Vec'" % self.config.get_content_based())

        # Initialize collaborative filtering.
        if self.config.get_collaborative_filtering() == 'ALS':
            is_hybrid = self.config.get_recommender() == 'hybrid'
            if self.config.get_content_based() == 'None':
                raise NameError(
                    "Not valid content based 'None' with hybrid recommender")
            self.collaborative_filtering = CollaborativeFiltering(
                self.initializer, self.evaluator, self.hyperparameters,
                self.options, self._verbose, self._load_matrices,
                self._dump_matrices, self._train_more, is_hybrid)
        elif self.config.get_collaborative_filtering() == 'SDAE':
            self.collaborative_filtering = SDAERecommender(
                self.initializer, self.evaluator, self.hyperparameters,
                self.options, self._verbose, self._load_matrices,
                self._dump_matrices)
            if not self.config.get_content_based() == 'None':
                raise NameError(
                    "Not a valid content based %s with SDAE. You can only use 'None'"
                    % self.config.get_content_based())
        elif self.config.get_collaborative_filtering() == 'None':
            if not self.config.get_recommender() == 'itembased':
                raise NameError(
                    "None collaborative filtering is only valid with itembased recommender type"
                )
            elif self.config.get_content_based() == 'None':
                raise NameError(
                    "Not valid content based 'None' with item-based recommender"
                )
            self.collaborative_filtering = None
        else:
            raise NameError("Not a valid collaborative filtering %s. "
                            "Only options are 'None', 'ALS', 'SDAE'" %
                            self.config.get_collaborative_filtering())

        # Initialize recommender
        if self.config.get_recommender() == 'itembased':
            self.recommender = self.content_based
        elif self.config.get_recommender() == 'userbased':
            self.recommender = self.collaborative_filtering
        elif self.config.get_recommender() == 'hybrid':
            self.recommender = self
        else:
            raise NameError(
                "Invalid recommender type %s. "
                "Only options are 'userbased','itembased', and 'hybrid'" %
                self.config.get_recommender())
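
        # Summary of the configuration branches above:
        #   error metric            -> 'RMS' only (Evaluator)
        #   content based           -> 'None' (ContentBased), 'LDA' (LDARecommender), 'LDA2Vec' (LDA2VecRecommender)
        #   collaborative filtering -> 'ALS' (CollaborativeFiltering), 'SDAE' (SDAERecommender), or 'None' (itembased only)
        #   recommender type        -> 'itembased', 'userbased', or 'hybrid'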
Example #18
            filename=absolute_path(content_dictionary_filename))
    elif file_exists(basepaths_filename):
        print('Building content dictionary...')
        content_dictionary = ContentDictionary().build(
            basepaths_filename=absolute_path(basepaths_filename),
            dictionary_filename=absolute_path(content_dictionary_filename),
            url=args.remote_url,
            niceness=args.niceness)
    else:
        print("Error, neither %s nor %s found" %
              (content_dictionary_filename, basepaths_filename))
        sys.exit(1)

    if args.evaluate:
        print('Evaluating', args.theme_name, 'theme')
        evaluator = Evaluator(absolute_path(model_filename),
                              content_dictionary)
        evaluator.save_results()

    else:
        model_class = LdaModel(absolute_path(model_filename),
                               num_topics=args.num_topics)

        if model_class.no_pretrained_model_exists():
            print('Training model with', args.num_topics, 'topics')
            model_class.train_model(content_dictionary=content_dictionary,
                                    cores=args.cores)
        else:
            print('Loading model')

        model = model_class.load_model()
        corpus = model_class.load_corpus()
Example #19
    def __init__(self, initializer=None, abstracts_preprocessor=None, ratings=None, config=None,
                 process_parser=False, verbose=False, load_matrices=True, dump_matrices=True, train_more=True):
        """
        Constructor of the RecommenderSystem.

        :param ModelInitializer initializer: A model initializer.
        :param AbstractsPreprocessor abstracts_preprocessor: A preprocessor of abstracts, if None then queried.
        :param int[][] ratings: Ratings matrix; if None, matrix gets queried from the database.
        :param boolean process_parser: A flag deciding whether to process the data parser.
        :param boolean verbose: A flag deciding whether to print progress.
        :param boolean dump_matrices: A flag for saving the output matrices.
        :param boolean train_more: A flag to train the collaborative filtering further after loading matrices.
        """
        if process_parser:
            DataParser.process()

        if ratings is None:
            self.ratings = numpy.array(DataParser.get_ratings_matrix())
        else:
            self.ratings = ratings

        if abstracts_preprocessor is None:
            self.abstracts_preprocessor = AbstractsPreprocessor(DataParser.get_abstracts(),
                                                                *DataParser.get_word_distribution())
        else:
            self.abstracts_preprocessor = abstracts_preprocessor

        # Get configurations
        self.config = RecommenderConfiguration(config)
        self.set_hyperparameters(self.config.get_hyperparameters())
        self.set_options(self.config.get_options())

        # Set flags
        self._verbose = verbose
        self._dump_matrices = dump_matrices
        self._load_matrices = load_matrices
        self._train_more = train_more

        self.initializer = ModelInitializer(self.hyperparameters.copy(), self.n_iter, self._verbose)

        if self.config.get_error_metric() == 'RMS':
            self.evaluator = Evaluator(self.ratings, self.abstracts_preprocessor)
        else:
            raise NameError("Not a valid error metric %s. Only option is 'RMS'" % self.config.get_error_metric())

        # Initialize content based.
        if self.config.get_content_based() == 'None':
            self.content_based = ContentBased(self.initializer, self.evaluator, self.hyperparameters, self.options,
                                              self._verbose, self._load_matrices, self._dump_matrices)
        elif self.config.get_content_based() == 'LDA':
            self.content_based = LDARecommender(self.initializer, self.evaluator, self.hyperparameters, self.options,
                                                self._verbose, self._load_matrices, self._dump_matrices)
        elif self.config.get_content_based() == 'LDA2Vec':
            self.content_based = LDA2VecRecommender(self.initializer, self.evaluator, self.hyperparameters,
                                                    self.options, self._verbose,
                                                    self._load_matrices, self._dump_matrices)
        else:
            raise NameError("Not a valid content based %s. Options are 'None', "
                            "'LDA', 'LDA2Vec'" % self.config.get_content_based())

        # Initialize collaborative filtering.
        if self.config.get_collaborative_filtering() == 'ALS':
            self.collaborative_filtering = CollaborativeFiltering(self.initializer, self.evaluator,
                                                                  self.hyperparameters, self.options,
                                                                  self._verbose, self._load_matrices,
                                                                  self._dump_matrices, self._train_more)
        else:
            raise NameError("Not a valid collaborative filtering %s. "
                            "Only option is 'ALS'" % self.config.get_collaborative_filtering())

        # Initialize recommender
        if self.config.get_recommender() == 'itembased':
            self.recommender = self.content_based
        elif self.config.get_recommender() == 'userbased':
            self.recommender = self.collaborative_filtering
        else:
            raise NameError("Invalid recommender type %s. "
                            "Only options are 'userbased' and 'itembased'" % self.config.get_recommender())
Example #20
class GridSearch(object):
    """
    A class to perform grid search and find the best hyperparameters for a recommender.
    """
    def __init__(self, recommender, hyperparameters, verbose=True, report_name='grid_search_results'):
        """
        Train a number of recommenders using UV decomposition with different parameters.

        :param AbstractRecommender recommender:
        :param dict hyperparameters: A dictionary of the hyperparameters.
        :param boolean verbose: A flag to decide printing progress.
        :param str report_name: The name of the csv file in which the analysis of the grid search will be dumped.
        """
        self.recommender = recommender
        self.hyperparameters = hyperparameters
        self._verbose = verbose
        self.evaluator = Evaluator(recommender.get_ratings())
        self.all_errors = dict()
        self.results_file_name = report_name + '.csv'

    def get_all_combinations(self):
        """
        The method returns all possible combinations of the hyperparameters.

        :returns: array of dicts containing all combinations
        :rtype: list[dict]

        >>> get_all_combinations({'_lambda': [0, 0.1], 'n_factors': [20, 40]})
        [{'n_factors': 20, '_lambda': 0}, {'n_factors': 40, '_lambda': 0},
        {'n_factors': 20, '_lambda': 0.1}, {'n_factors': 40, '_lambda': 0.1}]
        """
        names = sorted(self.hyperparameters)
        return [dict(zip(names, prod)) for prod in it.product(
            *(self.hyperparameters[name] for name in names))]

    def train(self):
        """
        The method loops over all possible combinations of hyperparameters and calls
        the train and split methods on the recommender. The train and test errors are
        saved, and the hyperparameters that produced the best test error are returned.

        :returns: Pair of best hyperparameters dictionary, and list of lists of metrics' results
        :rtype: tuple(dict, float[][])
        """
        best_error = numpy.inf
        best_params = dict()
        train, test = self.recommender.evaluator.naive_split(self.recommender._split_type)
        predictions = None
        all_results = [['n_factors', '_lambda', 'rmse', 'train_recall', 'test_recall', 'recall_at_200', 'ratio',
                        'mrr @ 5', 'ndcg @ 5', 'mrr @ 10', 'ndcg @ 10']]
        for hyperparameters in self.get_all_combinations():
            if self._verbose:
                print("Running config: %s" % hyperparameters)
            self.recommender.set_hyperparameters(hyperparameters)
            current_result = [hyperparameters['n_factors'], hyperparameters['_lambda']]
            self.recommender.train()
            current_result.extend(self.recommender.get_evaluation_report())
            all_results.append(current_result)
            if predictions is None:
                predictions = self.recommender.get_predictions()
            rounded_predictions = self.recommender.rounded_predictions()
            test_recall = self.evaluator.calculate_recall(test, rounded_predictions)
            train_recall = self.evaluator.calculate_recall(self.recommender.get_ratings(), rounded_predictions)
            if self._verbose:
                print('Train error: %f, Test error: %f' % (train_recall, test_recall))
            if 1 - test_recall < best_error:
                best_params = hyperparameters
                best_error = 1 - test_recall
            current_key = self.get_key(hyperparameters)
            self.all_errors[current_key] = dict()
            self.all_errors[current_key]['train_recall'] = train_recall
            self.all_errors[current_key]['test_recall'] = test_recall
        self.dump_csv(all_results)
        if self._verbose:
            print("Best config: %s" % best_params)
        return best_params, all_results

    def get_key(self, config):
        """
        Given a config dict, the function generates a key that uniquely represents
        this config, to be used for storing all errors.

        :param dict config: given configuration.
        :returns: string representing the unique key of the configuration
        :rtype: str

        >>> get_key({'n_iter': 1, 'n_factors': 200})
        'n_factors:200,n_iter:1'
        """
        generated_key = ''
        keys_array = sorted(config)
        for key in keys_array:
            generated_key += key + ':'
            generated_key += str(config[key]) + ','
        return generated_key.strip(',')

    def dump_csv(self, all_results):
        """
        Given some results as a list of lists, the function dumps them to a csv file.

        :param str[][] all_results: all results from all runs.
        """
        base_dir = os.path.dirname(os.path.realpath(__file__))
        path = os.path.join(os.path.dirname(base_dir), 'matrices/%s' % self.results_file_name)
        with open(path, "a") as f:
            writer = csv.writer(f)
            writer.writerows(all_results)
        if self._verbose:
            print("dumped to %s" % path)

    def get_all_errors(self):
        """
        The method returns all errors calculated for every configuration.

        :returns: A dict containing every computed test error.
        :rtype: dict
        """
        return self.all_errors
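
# A minimal usage sketch (hypothetical wiring: `build_recommender` stands in for
# however the project constructs an AbstractRecommender, and the hyperparameter
# values below are illustrative only):
def run_grid_search(build_recommender):
    recommender = build_recommender()
    grid = GridSearch(recommender,
                      {'n_factors': [20, 40], '_lambda': [0.0, 0.1]},
                      verbose=True,
                      report_name='grid_search_results')
    best_params, all_results = grid.train()  # one training run per hyperparameter combination
    print('best configuration:', best_params)
    print('recorded errors per configuration:', grid.get_all_errors())
    return best_params, all_results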
Example #21
    def __init__(self,
                 use_database=True,
                 verbose=True,
                 load_matrices=True,
                 dump=True,
                 train_more=True,
                 random_seed=False,
                 config=None):
        """
        Setup the data and configuration for the recommenders.
        """
        if use_database:
            self.ratings = numpy.array(DataParser.get_ratings_matrix())
            self.documents, self.users = self.ratings.shape
            self.abstracts_preprocessor = AbstractsPreprocessor(
                DataParser.get_abstracts(),
                *DataParser.get_word_distribution())
        else:
            abstracts = {
                0: 'hell world berlin dna evolution',
                1: 'freiburg is green',
                2: 'the best dna is the dna of dinasours',
                3: 'truth is absolute',
                4: 'berlin is not that green',
                5: 'truth manifests itself',
                6: 'plato said truth is beautiful',
                7: 'freiburg has dna'
            }

            vocab = set(
                itertools.chain(
                    *list(map(lambda ab: ab.split(' '), abstracts.values()))))
            w2i = dict(zip(vocab, range(len(vocab))))
            word_to_count = [(w2i[word],
                              sum(
                                  abstract.split(' ').count(word)
                                  for doc_id, abstract in abstracts.items()))
                             for word in vocab]
            article_to_word = list(
                set([(doc_id, w2i[word])
                     for doc_id, abstract in abstracts.items()
                     for word in abstract.split(' ')]))
            article_to_word_to_count = list(
                set([(doc_id, w2i[word], abstract.count(word))
                     for doc_id, abstract in abstracts.items()
                     for word in abstract.split(' ')]))
            self.abstracts_preprocessor = AbstractsPreprocessor(
                abstracts, word_to_count, article_to_word,
                article_to_word_to_count)
            self.documents, self.users = 8, 10
            self.ratings = numpy.array([[
                int(not bool((article + user) % 3))
                for article in range(self.documents)
            ] for user in range(self.users)])

        self.verbose = verbose
        self.load_matrices = load_matrices
        self.dump = dump
        self.train_more = train_more
        self.random_seed = random_seed
        self.evaluator = Evaluator(self.ratings, self.abstracts_preprocessor,
                                   self.random_seed, self.verbose)
        self.config = RecommenderConfiguration()
        self.hyperparameters = self.config.get_hyperparameters()
        self.options = self.config.get_options()
        self.initializer = ModelInitializer(self.hyperparameters.copy(),
                                            self.options['n_iterations'],
                                            self.verbose)