def local_test_train(i, data_path):
    '''
    Generates a debugging data set on CIFAR-10.
    This is meant to run on a local CPU to check that the code works.
    Wrapped in a try/except so a failed model generation is retried.
    '''
    ##
    success_nb = 0
    while success_nb < 1:
        try:
            gene = Gene_data()
            number_parameters = 0
            while number_parameters < gene.para_min or number_parameters > gene.para_max:
                gene.create_draft_order()
                gene.create_architecture_order()
                mdl, init_algorithm_list, init_hyerparam_list = gene.create_mdl()
                number_parameters = count_nb_params(mdl)
                print(init_algorithm_list)
                print(init_hyerparam_list)
                print(number_parameters)
            ##
            trainloader, valloader, testloader = get_cifar10_for_data_point_mdl_gen()
            print(mdl)
            ##
            start = time.time()
            mdl = mdl.to(device)
            epochs = 1
            optimizer = torch.optim.Adam(mdl.parameters())
            scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[], gamma=1.0)
            criterion = nn.CrossEntropyLoss()
            error_criterion = metrics.error_criterion
            init_params = list(mdl.parameters())
            stats_collector = StatsCollector()
            iterations, train_iterations = 1, 1  # CAREFUL: it might be returning 1; for real production we want it to be greater than 1
            trainer = Trainer(trainloader,valloader, testloader, optimizer, scheduler, criterion, error_criterion, stats_collector, device)
            train_loss, train_error, val_loss, val_error, test_loss, test_error = trainer.train_and_track_stats(mdl, epochs, iterations=iterations, train_iterations=train_iterations)
            final_params = list(mdl.parameters())
            ## save data point
            how_long, hours = timeSince(start)
            print(f'hours = {hours}')
            print(f'{how_long}')
            mdl_name = f'debug_{i}'
            other_data = trainer.stats_collector.get_stats_dict({'error_criterion':error_criterion.__name__})
            batch_size_train = trainloader.batch_size
            batch_size_test = testloader.batch_size
            batch_size_val = valloader.batch_size
            save_model_info(data_path, mdl, init_params, final_params,
                train_loss, train_error, val_loss, val_error, test_loss, test_error,
                optimizer, epochs, criterion, error_criterion,
                hours, mdl_name, init_algorithm_list, init_hyerparam_list,
                batch_size_train, batch_size_val, batch_size_test,
                number_parameters,
                scheduler, other_data)
            success_nb = success_nb + 1
            print('Success')
        except Exception as e:
            print('FAIL')
            print(e)
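A minimal way to exercise this function locally might look like the sketch below; the data path is a placeholder, and Gene_data, the CIFAR-10 loaders, and the training utilities are assumed to be importable from the surrounding project.

# Hypothetical local smoke test; the path below is a placeholder, not the project's real layout.
if __name__ == '__main__':
    debug_data_path = './data/automl_dataset_debug'  # assumption: any writable directory works
    for i in range(3):  # generate a few debug data points
        local_test_train(i, debug_data_path)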
Example #2
 def generate_debug_dataset(self):
     '''
     Generates a debugging data set on CIFAR-10.
     '''
     self.path.mkdir(exist_ok=True)
     iterations, train_iterations = 1, 1  # CAREFUL: it might be returning 1; for real production we want it to be greater than 1
     mdls = get_debug_models()
     print()
     for i in range(len(mdls)):
         print(f'---> mdl_{i}')
         start = time.time()
         ## generate mdl data point
         mdl = mdls[i].to(device)
         epochs = random.randint(self.min_train_epochs, self.max_train_epochs)  # CAREFUL: it might be returning 1; for real production we want it to be greater than 1
         optimizer = torch.optim.Adam(mdl.parameters())
         scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[], gamma=1.0)
         criterion = nn.CrossEntropyLoss()
         error_criterion = metrics.error_criterion
         init_params = list(mdl.parameters())
         stats_collector = StatsCollector()
         trainer = Trainer(self.trainloader, self.valloader, self.testloader, optimizer, scheduler, criterion, error_criterion, stats_collector, device)
         train_loss, train_error, val_loss, val_error, test_loss, test_error = trainer.train_and_track_stats(mdl, epochs, iterations=iterations, train_iterations=train_iterations)
         final_params = list(mdl.parameters())
         ## save data point
         how_long, hours = timeSince(start)
         print(f'hours = {hours}')
         print(f'{how_long}')
         mdl_name = f'debug_{i}'
         other_data = trainer.stats_collector.get_stats_dict({'error_criterion':error_criterion.__name__})
         # TODO: fix later
         init_algorithm_list = 'default'
         init_hyerparam_list = torch.__version__
         number_parameters = count_nb_params(mdl)
         ##
         batch_size_train, batch_size_val, batch_size_test = self.trainloader.batch_size, self.valloader.batch_size, self.testloader.batch_size
         data_path = str(self.path)
         save_model_info(data_path, mdl, init_params, final_params,
             train_loss, train_error, val_loss, val_error, test_loss, test_error,
             optimizer, epochs, criterion, error_criterion,
             hours, mdl_name, init_algorithm_list, init_hyerparam_list,
             batch_size_train, batch_size_val, batch_size_test,
             number_parameters,
             scheduler=scheduler, other_data=other_data)
         print(f'--> mdl_{i} data point saved!\n')
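A hedged sketch of how this method might be driven; the wrapper class name and constructor arguments below are purely hypothetical, standing in for whatever class in the project holds `path`, the three data loaders, and the epoch bounds used above.

# Hypothetical driver; DebugDatasetGenerator and its constructor are illustrative assumptions.
from pathlib import Path

generator = DebugDatasetGenerator(path=Path('./data/automl_dataset_debug'),
                                  min_train_epochs=1, max_train_epochs=2)
generator.generate_debug_dataset()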
Example #3
def main(i, gene, data_path, epochs, mdl_name):
    '''
    The main training function, meant to be run on a GPU.
    Wrapped in a try/except so a failed model generation is retried.
    i: index of this run; each call trains one model and saves one data point.
    gene: a Gene_data object. Its default parameters can be changed in this function; the defaults are:
        (min_conv_n=1, max_conv_n=7, min_fc_n=1, max_fc_n=7, para_min=40000,
         min_filter=26, min_fc=32, max_filter=32, max_fc=256, max_para_times=50,
         flag=True, default_init_w_algor=False)
    data_path: the root directory where the results of training are saved.
    epochs: number of epochs to train.
    mdl_name: name under which the model is saved.
    '''
    # get model type
    success_nb = 0
    while success_nb < 1:
        try:

            number_parameters = 0
            while number_parameters < gene.para_min or number_parameters > gene.para_max:
                gene.generate_random_inputs()
                gene.create_architecture_order()
                mdl, init_algorithm_list, init_hyerparam_list = gene.create_mdl()

                number_parameters = count_nb_params(mdl)
                print(number_parameters)

            trainloader, valloader, testloader = get_cifar10_for_data_point_mdl_gen()
            ## create directory to save models
            make_and_check_dir(data_path)
            ## start creating models and its variations
            start = time.time()
            ## generate mdl data point
            mdl = mdl.to(device)
            optimizer = torch.optim.Adam(mdl.parameters())
            criterion = nn.CrossEntropyLoss()
            error_criterion = metrics.error_criterion
            scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                             milestones=[],
                                                             gamma=1.0)
            stats_collector = StatsCollector()
            trainer = Trainer(trainloader, valloader, testloader, optimizer,
                              scheduler, criterion, error_criterion,
                              stats_collector, device)
            init_params = list(mdl.parameters())
            train_loss, train_error, val_loss, val_error, test_loss, test_error = trainer.train_and_track_stats(
                mdl, epochs)
            final_params = list(mdl.parameters())
            ## save data point
            how_long, seconds, minutes, hours = report_times(start)
            print(f'hours = {hours}')
            print(how_long)
            # mdl_name = f'tower_mdl_{i}'
            other_data = trainer.stats_collector.get_stats_dict(
                {'error_criterion': error_criterion.__name__})
            batch_size_train = trainloader.batch_size
            batch_size_test = testloader.batch_size
            batch_size_val = valloader.batch_size
            save_model_info(data_path, mdl, init_params, final_params,
                            train_loss, train_error, val_loss, val_error,
                            test_loss, test_error, optimizer, epochs,
                            criterion, error_criterion, hours, mdl_name,
                            init_algorithm_list, init_hyerparam_list,
                            batch_size_train, batch_size_val, batch_size_test,
                            number_parameters, scheduler, other_data)
            success_nb = success_nb + 1
            print('Success')
        except Exception as e:
            print('FAIL')
            print(e)
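Following the defaults listed in the docstring above, a usage sketch for the GPU runs might look like this; the data path is a placeholder, the keyword names mirror the docstring defaults, and the model-name pattern follows the commented-out `tower_mdl_{i}` naming in the function.

# Hypothetical GPU run; the path and number of data points are placeholders.
gene = Gene_data(min_conv_n=1, max_conv_n=7, min_fc_n=1, max_fc_n=7, para_min=40000,
                 min_filter=26, min_fc=32, max_filter=32, max_fc=256, max_para_times=50,
                 flag=True, default_init_w_algor=False)
for i in range(10):  # train and save 10 data points
    main(i, gene, data_path='./data/set1', epochs=30, mdl_name=f'tower_mdl_{i}')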
Example #4
def main():
    '''
    Trains the ChainLSTM meta-learner on the meta-learning data set built from the
    saved model data points, then saves the training statistics.
    '''
    USE_CUDA = torch.cuda.is_available()
    device = torch.device("cuda" if USE_CUDA else "cpu")
    ## paths to automl data set
    # data_path = '~/predicting_generalization/automl/data/automl_dataset_debug'
    data_path_save = '/home/xiaot6/cs446-project-fa2019/automl/data/set1'  #where you store results
    data_path_test = '/home/xiaot6/split_set/test'
    data_path_train = '/home/xiaot6/split_set/train'
    data_path_val = '/home/xiaot6/split_set/val'
    path = Path(data_path_save).expanduser()
    ## Vocab
    vocab = Vocab()
    V_a, V_hp = len(vocab.architecture_vocab), len(vocab.hparms_vocab)
    ## create dataloader for meta learning data set
    batch_first = True
    # dataset = MetaLearningDataset(data_path, vocab)
    dataset_test = MetaLearningDataset(data_path_test, vocab)
    dataset_train = MetaLearningDataset(data_path_train, vocab)
    dataset_val = MetaLearningDataset(data_path_val, vocab)
    collate_fn = Collate_fn_onehot_general_features(device, batch_first, vocab)
    batch_size = 512
    # dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn)
    trainloader = torch.utils.data.DataLoader(dataset_train,
                                              batch_size=batch_size,
                                              collate_fn=collate_fn)
    testloader = torch.utils.data.DataLoader(dataset_test,
                                             batch_size=batch_size,
                                             collate_fn=collate_fn)
    valloader = torch.utils.data.DataLoader(dataset_val,
                                            batch_size=batch_size,
                                            collate_fn=collate_fn)
    ## instantiate Meta Learner
    # arch hps
    arch_input_size = V_a
    arch_hidden_size = 16
    arch_num_layers = 1
    # arch_hp hps
    arch_hp_input_size = V_hp
    arch_hp_hidden_size = arch_hidden_size
    arch_hp_num_layers = 1
    # opt hps
    # st()
    # input1 = dataset[0]['train_history'].view(batch_size,-1)
    # input2 = dataset[0]['val_history'].view(batch_size,-1)
    # st()
    # seq_len = len(dataset[0]['test_errors']) # since they all have the same seq_len
    seq_len = len(
        dataset_test[0]['test_errors'])  # since they all have the same seq_len
    input_dim = 4  # 4 because the history tracks loss and error for both train and val: 2*2 = 4
    opt_input_size = input_dim  # so that it processes one time step of the history at a time: [train_err, train_loss, val_loss, val_err]
    opt_hidden_size = arch_hidden_size
    opt_num_layers = 1
    # weight stats
    weight_stats_input_size = 3  # 3 because we only process the init-param stats (mu, std, L2 norm); processing the final-param stats as well would make this 6
    weight_stats_hidden_size = arch_hidden_size
    weight_stats_layers = 1
    ## train error hps
    train_err_input_size = 1
    train_err_hidden_size = arch_hidden_size
    num_layers_num_layers = 1
    # meta-learner chain lstm
    meta_learner = ChainLSTM(arch_input_size=arch_input_size,
                             arch_hidden_size=arch_hidden_size,
                             arch_num_layers=1,
                             arch_hp_input_size=arch_hp_input_size,
                             arch_hp_hidden_size=arch_hp_hidden_size,
                             arch_hp_num_layers=1,
                             weight_stats_input_size=weight_stats_input_size,
                             weight_stats_hidden_size=weight_stats_hidden_size,
                             weight_stats_layers=weight_stats_layers,
                             opt_input_size=opt_input_size,
                             opt_hidden_size=opt_hidden_size,
                             opt_num_layers=opt_num_layers,
                             train_err_input_size=train_err_input_size,
                             train_err_hidden_size=train_err_hidden_size,
                             num_layers_num_layers=num_layers_num_layers,
                             device=device)
    ##
    # trainloader, valloader, testloader = dataloader, dataloader, dataloader # TODO this is just for the sake of an example!
    optimizer = torch.optim.Adam(meta_learner.parameters())
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                     milestones=[],
                                                     gamma=1.0)
    criterion = torch.nn.MSELoss()
    # error_criterion = criterion # TODO: implement epsilon classification loss
    init_params = list(meta_learner.parameters())
    error_criterion = metrics.error_criterion
    stats_collector = StatsCollector()
    trainer = Trainer(trainloader, valloader, testloader, optimizer, scheduler,
                      criterion, error_criterion, stats_collector, device)
    ##
    final_params = list(meta_learner.parameters())
    batch_size_train = trainloader.batch_size
    batch_size_test = testloader.batch_size
    batch_size_val = valloader.batch_size
    nb_epochs = 3  #500 #50
    train_iterations = inf  # TODO: CHANGE for model to be fully trained!!! # inf
    train_loss, train_error, val_loss, val_error, test_loss, test_error = trainer.train_and_track_stats(
        meta_learner,
        nb_epochs,
        iterations=4,
        train_iterations=train_iterations)
    other_data = trainer.stats_collector.get_stats_dict(
        {'error_criterion': error_criterion.__name__})
    # other_data = trainer.stats_collector.get_stats_dict({'error_criterion':error_criterion})
    save_model_info_lstm(data_path_save, train_loss, train_error, val_loss,
                         val_error, test_loss, test_error, nb_epochs,
                         optimizer, batch_size_train, batch_size_val,
                         batch_size_test, scheduler, other_data)
    print('done')
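A minimal entry point for the meta-learner training above:

if __name__ == '__main__':
    main()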