def plot_learning_curves(name, figure_dir, learning_curves, y_label, y_limits=(0, +1), **kwargs):
    import matplotlib.pyplot as plt
    plt.figure()
    for label, sizes, scores in learning_curves:
        plot_learning_curve(sizes, scores, name=label, **kwargs)
    if name is not None:
        plt.title(name) # Could extract from figure_dir
    if y_limits is not None:
        plt.ylim(*y_limits)
    plt.xlabel('# Training Examples')
    plt.ylabel(y_label)
    plt.grid()
    plt.legend(loc='best')
    plt.tight_layout()
    if figure_dir is not None:
        mkdir(figure_dir)
        figure_path = os.path.join(figure_dir, '{}.png'.format(y_label)) # pdf
        plt.savefig(figure_path, transparent=False, bbox_inches='tight') # dpi=1000,
        print('Saved', figure_path)
    if not is_remote():
        plt.show()
    return plt
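# A minimal usage sketch (hypothetical data; plot_learning_curve, mkdir, and
# is_remote are assumed to be defined elsewhere in this repo):
#   curves = [
#       ('GP', [10, 20, 30], [0.52, 0.61, 0.70]),  # (label, training sizes, scores)
#       ('RF', [10, 20, 30], [0.48, 0.55, 0.63]),
#   ]
#   plot_learning_curves(name='pour', figure_dir=None, learning_curves=curves, y_label='F1')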
def save_experiments(experiments_dir, expid, experiments):
    if experiments_dir is None:
        return None
    mkdir(experiments_dir)
    data_path = os.path.join(experiments_dir,
                             'experiments_{}.pk{}'.format(expid, get_python_version()))
    write_pickle(data_path, experiments)
    print('Saved', data_path)
    return data_path
def save_learner(data_dir, learner):
    if data_dir is None:
        return False
    #domain = learner.func
    #data_dir = os.path.join(MODEL_DIRECTORY, domain.name)
    #name = learner.name
    name = get_label(learner.algorithm)
    mkdir(data_dir)
    learner_path = os.path.join(data_dir, '{}.pk{}'.format(name, get_python_version()))
    write_pickle(learner_path, learner)
    print('Saved', learner_path)
    return True
def save(self, data_dir):
    from learn_tools.analyze_experiment import get_label
    if data_dir is None:
        return False
    #domain = learner.func
    #data_dir = os.path.join(MODEL_DIRECTORY, domain.name)
    #name = learner.name
    name = get_label(self.algorithm)
    mkdir(data_dir)
    learner_path = os.path.join(data_dir, '{}.pk{}'.format(name, get_python_version()))
    write_pickle(learner_path, self)
    print('Saved learner:', learner_path)
    return True
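# A minimal loading counterpart (sketch only; load_learner is a hypothetical
# helper that assumes the naming convention used by save()/save_learner() above
# and the repo's read_pickle helper):
def load_learner(data_dir, algorithm):
    from learn_tools.analyze_experiment import get_label
    learner_path = os.path.join(data_dir, '{}.pk{}'.format(
        get_label(algorithm), get_python_version()))
    # read_pickle is the counterpart to write_pickle used throughout this repo
    return read_pickle(learner_path)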
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('paths', nargs='*', help='Paths to the data.')
    #parser.add_argument('-a', '--active', type=int, default=0, # None
    #                    help='The number of active samples to collect')
    parser.add_argument('-l', '--learner', default=None,
                        help='Path to the learner that should be used')
    parser.add_argument('-n', '--num_trials', type=int, default=100,
                        help='The number of samples to collect')
    parser.add_argument('-s', '--save', action='store_true',
                        help='Whether to save the learners')
    parser.add_argument('-r', '--num_rounds', type=int, default=1,
                        help='The number of rounds to collect')
    #parser.add_argument('-t', '--num_train', type=int, default=None,
    #                    help='The size of the training set')
    args = parser.parse_args()
    # TODO: be careful that paging isn't altering the data
    # TODO: don't penalize if the learner identifies that it can't make a good prediction
    # TODO: use a different set of randomized parameters for train and test
    include_none = False
    serial = is_darwin()

    #training_sizes = inclusive_range(50, 500, 25)
    #training_sizes = inclusive_range(25, 100, 5)
    training_sizes = inclusive_range(10, 50, 5)
    #training_sizes = inclusive_range(100, 1000, 100)
    #training_sizes = [20]
    #training_sizes = [1500]

    #kernels = ['RBF', 'Matern52', 'MLP']
    kernels = ['MLP']

    #hyperparameters = [None]
    #hyperparameters = [True]
    hyperparameters = [True, None] # None,

    query_type = BEST # BEST | CONFIDENT | REJECTION | ACTIVE # type of query used to evaluate the learner
    is_adaptive = False
    max_test = 50
    #alphas = np.linspace(0.0, 0.9, num=5, endpoint=True)
    alphas = [0.0, .8, .9, .99]
    #alphas = [None] # Use the default (i.e. GP parameters)
    use_vars = [True]
    binary = False
    split = UNIFORM # BALANCED
    # Omitting failed labels is okay because they will never be executed

    algorithms = []
    #algorithms += [(Algorithm(BATCH_GP, kernel=kernel, hyperparameters=hype, use_var=use_var), [num_train])
    #               for num_train, kernel, hype, use_var in product(training_sizes, kernels, hyperparameters, use_vars)]
    algorithms += [(Algorithm(STRADDLE_GP, kernel, hype, use_var), training_sizes)
                   for kernel, hype, use_var in product(kernels, hyperparameters, use_vars)]
    #algorithms += [(Algorithm(rf_model, p_explore=None, use_var=use_var), [num_train])
    #               for rf_model, num_train, use_var in product(RF_MODELS, training_sizes, use_vars)]
    #algorithms += [(Algorithm(nn_model, p_explore=None), [num_train])
    #               for nn_model, num_train in product(NN_MODELS, training_sizes)]
    #algorithms += [(Algorithm(RANDOM), None), (Algorithm(DESIGNED), None)]
    print('Algorithms:', algorithms)
    print('Split:', split)

    trials_per_round = sum(1 if train_sizes is None else
                           (train_sizes[-1] - train_sizes[0] + len(train_sizes))
                           for _, train_sizes in algorithms)
    num_experiments = args.num_rounds * trials_per_round

    date_name = datetime.datetime.now().strftime(DATE_FORMAT)
    size_str = '[{},{}]'.format(training_sizes[0], training_sizes[-1])
    #size_str = '-'.join(map(str, training_sizes))
    experiments_name = '{}_r={}_t={}_n={}'.format(
        date_name, args.num_rounds, size_str, args.num_trials) #'19-08-09_21-44-58_r=5_t=[10,150]_n=1'
    #experiments_name = 't={}'.format(args.num_rounds)
    # TODO: could include OS and username if desired

    domain = load_data(args.paths)
    print()
    print(domain)
    X, Y, W = domain.get_data(include_none=include_none)
    print('Total number of examples:', len(X))
    if binary:
        # NN can fit perfectly when binary
        # Binary seems to be outperforming w/o
        Y = threshold_scores(Y)
    max_train = len(X) - max_test
    #max_train = min(max([0] + [active_sizes[0] for _, active_sizes in algorithms
    #                           if active_sizes is not None]), len(X))

    #parameters = {
    #    'include None': include_none,
    #    'binary': binary,
    #    'split': split,
    #}

    print('Name:', experiments_name)
    print('Experiments:', num_experiments)
    print('Max train:', max_train)
    print('Include None:', include_none)
    print('Examples: n={}, d={}'.format(*X.shape))
    print('Binary:', binary)
    print('Estimated hours:', num_experiments * SEC_PER_EXPERIMENT / HOURS_TO_SECS)
    user_input('Begin?')
    # TODO: residual learning for sim to real transfer
    # TODO: can always be conservative and add sim negative examples
    # TODO: combine all data to write in one folder

    data_dir = os.path.join(DATA_DIRECTORY, domain.name) # EXPERIMENT_DIRECTORY
    experiments_dir = os.path.join(data_dir, experiments_name)
    mkdir(experiments_dir)
    start_time = time.time()
    experiments = []
    for round_idx in range(args.num_rounds):
        round_dir = os.path.join(data_dir, experiments_name, str(round_idx))
        mkdir(round_dir)
        seed = hash(time.time())
        train_test_file = os.path.join(round_dir, 'data.pk3')
        if not os.path.exists(train_test_file):
            X_train, Y_train, X_test, Y_test = split_data(X, Y, split, max_train)
            X_test, Y_test = X_test[:max_test], Y_test[:max_test]
            write_pickle(train_test_file, (X_train, Y_train, X_test, Y_test))
        else:
            X_train, Y_train, X_test, Y_test = read_pickle(train_test_file)
        print('Train examples:', X_train.shape)
        print('Test examples:', X_test.shape)
        # TODO: need to be super careful when running with multiple contexts
        for algorithm, active_sizes in algorithms:
            # active_sizes = [first #trainingdata selected from X_train, #active exploration + #trainingdata]
            print(SEPARATOR)
            print('Round: {} | {} | Seed: {} | Sizes: {}'.format(
                round_idx, algorithm, seed, active_sizes))
            # TODO: allow keyboard interrupt
            if active_sizes is None:
                learner = algorithm.name
                active_size = None
                train_confusion = None
                experiments.append(evaluate_learner(domain, seed, train_confusion, X_test, Y_test,
                                                    algorithm, learner, active_size,
                                                    args.num_trials, alphas, serial))
            else:
                # [10 20 25] take first 10 samples from X_train to train the model, 10 samples chosen actively
                # sequentially + evaluate model, 5 samples chosen actively sequentially + evaluate model
                # Could always keep around all the examples and retrain
                # TODO: segfaults when this runs in parallel
                # TODO: may be able to retrain in parallel if I set OPENBLAS_NUM_THREADS
                learner_prior_nx = 0
                '''
                if algorithm.hyperparameters:
                    if domain.skill == 'pour':
                        learner_file = '/Users/ziw/ltamp_pr2/data/pour_19-06-13_00-59-21/19-08-09_19-30-01_r=10_t=[50,400]_n=1/{}/gp_active_mlp_true_true.pk3'.format(round_idx)
                    elif domain.skill == 'scoop':
                        learner_file = '/Users/ziw/ltamp_pr2/data/scoop_19-06-10_20-16-59_top-diameter/19-08-09_19-34-56_r=10_t=[50,400]_n=1/{}/gp_active_mlp_true_true.pk3'.format(round_idx)
                    learner = read_pickle(learner_file)
                    learner_prior_nx = learner.nx
                    learner.retrain(newx=X_train[:active_sizes[0]], newy=Y_train[:active_sizes[0], None])
                else:
                '''
                learner, train_confusion = create_learner(domain, X_train, Y_train, split, algorithm,
                                                          num_train=active_sizes[0],
                                                          query_type=query_type, is_adaptive=is_adaptive)
                if algorithm.name == STRADDLE_GP:
                    X_select, Y_select = X_train[active_sizes[0]:], Y_train[active_sizes[0]:]
                for active_size in active_sizes:
                    num_active = active_size - learner.nx + learner_prior_nx # learner.nx is len(learner.xx)
                    print('\nRound: {} | {} | Seed: {} | Size: {} | Active: {}'.format(
                        round_idx, algorithm, seed, active_size, num_active))
                    if algorithm.name == STRADDLE_GP:
                        X_select, Y_select = active_learning_discrete(learner, num_active, X_select, Y_select)
                    #if args.save:
                    save_learner(round_dir, learner)
                    experiments.append(evaluate_learner(domain, seed, None, X_test, Y_test,
                                                        algorithm, learner, active_size,
                                                        args.num_trials, alphas, serial))
                    save_experiments(experiments_dir, experiments)
    print(SEPARATOR)
    if experiments:
        save_experiments(experiments_dir, experiments)
        plot_experiments(domain, experiments_name, experiments_dir, experiments,
                         include_none=False) #include_none=include_none)
        print('Experiments:', experiments_dir)
        print('Total experiments:', len(experiments))
        print('Total hours:', elapsed_time(start_time) / HOURS_TO_SECS)
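# For reference, a plausible sketch of the inclusive_range helper used above to
# build training_sizes (hypothetical reconstruction; the real helper lives
# elsewhere in this repo):
def _inclusive_range(start, stop, step=1):
    # Like range(), but includes the stop value:
    # _inclusive_range(10, 50, 5) -> [10, 15, 20, 25, 30, 35, 40, 45, 50]
    return list(range(start, stop + 1, step))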
import os
import glob

from pybullet_tools.utils import read_json, write_json
from pddlstream.utils import mkdir

if __name__ == '__main__':
    SKILL = 'scoop'
    MATERIAL = {'pour': 'red_men', 'scoop': 'chickpeas'}
    all_dirs = glob.glob('data/pr2_{}_*'.format(SKILL))
    all_trials = []
    for trial_dir in all_dirs:
        trial_file = os.path.join(trial_dir, 'trials.json')
        data = read_json(trial_file)
        for d in data:
            if (d['policy'] == 'training') and (d['material'] == MATERIAL[SKILL]) \
                    and (d['score'] is not None):
                all_trials.append(d)
    newdir = 'data/pr2_{}/'.format(SKILL)
    mkdir(newdir)
    write_json(os.path.join(newdir, 'all_trials.json'), all_trials)
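# Each retained trial record is assumed to look roughly like the following
# (sketch; only the three filtered fields are known from the code above,
# and the score value is hypothetical):
# {'policy': 'training', 'material': 'chickpeas', 'score': 0.75, ...}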
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('trainsize', default=2000, type=int, help='training set size')
    parser.add_argument('expid', default=1, type=int, help='experiment ID')
    parser.add_argument('beta_lambda', type=float, default=0.9,
                        help='lambda parameter for computing beta from best beta')
    parser.add_argument('sample_strategy_id', default=1, type=int) # 1, 2, 3
    parser.add_argument('paths', default=[os.path.join(get_data_dir('pour'), 'trials_n=10000.json')],
                        nargs='*', help='Paths to the data.')
    parser.add_argument('-u', '--use_hyper', action='store_true',
                        help='When enabled, use existing hyperparameters.')
    parser.add_argument('-o', '--use_obstacle', action='store_true',
                        help='When enabled, an obstacle is used in the scene.')
    args = parser.parse_args()
    beta_lambda = args.beta_lambda
    sample_strategy = SAMPLE_STRATEGIES[args.sample_strategy_id]
    global SEED
    SEED = args.expid
    set_seed(SEED)

    n_train_tasks = 50
    n_test_tasks = 20
    train_tasks_seeds = get_seeds(n_train_tasks)
    test_tasks_seeds = get_seeds(n_test_tasks)

    print('loading data')
    domain = load_data(args.paths)
    data = domain.create_dataset(include_none=True, binary=False)
    data.shuffle()
    X, Y, W = data.get_data()
    print('finished obtaining x y data')

    n_train = args.trainsize
    X = X[:n_train]
    Y = Y[:n_train]
    print('initializing ActiveGP with #datapoints = {}'.format(len(X)))
    hype = None
    if 'pour' in args.paths[0] and args.use_hyper:
        hype = POUR_MLP_HYPERPARAM_3000
    elif 'scoop' in args.paths[0] and args.use_hyper:
        hype = SCOOP_MLP_HYPERPARAM_3000
    learner = ActiveGP(domain, initx=X, inity=Y, hyperparameters=hype,
                       sample_time_limit=60, beta_lambda=beta_lambda)
    learner.retrain(num_restarts=10)

    exp_file = 'tasklengthscale_sampling_trainsize={}_beta_lambda={}_strategy_{}_obs_{}_expid_{}.pk3'.format(
        len(X), beta_lambda, args.sample_strategy_id, int(args.use_obstacle), args.expid)
    exp_dirname = os.path.dirname(args.paths[0])
    if args.use_hyper:
        exp_dirname = os.path.join(exp_dirname, 'default_hyper/')
        mkdir(exp_dirname)
    exp_file = os.path.join(exp_dirname, exp_file)
    print('saving results to', exp_file)

    results = []
    if sample_strategy != DIVERSELK:
        n_train_tasks = 0 # no need to train
    prev_tasklengthscale = None
    for i in range(n_train_tasks + 1):
        test_results = []
        print('task_lengthscale = {}'.format(learner.task_lengthscale))
        print('================BEGIN TESTING==============')
        if prev_tasklengthscale is not None and \
                (learner.task_lengthscale == prev_tasklengthscale).all():
            test_results = results[-1][1] # lengthscales unchanged, so reuse the last test results
        else:
            for j in range(n_test_tasks):
                if sample_strategy == DIVERSELK:
                    test_sample_strategy = DIVERSE
                else:
                    test_sample_strategy = sample_strategy
                seed = test_tasks_seeds[j]
                test_result = eval_task_with_seed(domain, seed, learner,
                                                  sample_strategy=test_sample_strategy,
                                                  obstacle=args.use_obstacle)
                test_results.append(test_result)
        prev_tasklengthscale = learner.task_lengthscale.copy()
        if i != n_train_tasks:
            seed = train_tasks_seeds[i]
            train_result = eval_task_with_seed(domain, seed, learner,
                                               sample_strategy=sample_strategy,
                                               obstacle=args.use_obstacle)
        else:
            train_result = None
        results.append((train_result, test_results))
        write_pickle(exp_file, (results, SEED, train_tasks_seeds, test_tasks_seeds))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('paths', nargs='*', help='Paths to the data.')
    #parser.add_argument('-a', '--active', type=int, default=0, # None
    #                    help='The number of active samples to collect')
    parser.add_argument('-d', '--deterministic', action='store_true',
                        help='Whether to deterministically create training splits')
    parser.add_argument('-n', '--num_trials', type=int, default=-1,
                        help='The number of samples to collect')
    parser.add_argument('-s', '--save', action='store_true',
                        help='Whether to save the learners')
    parser.add_argument('-r', '--num_rounds', type=int, default=1,
                        help='The number of rounds to collect')
    parser.add_argument('-t', '--test', action='store_true',
                        help='When enabled, runs as a test and does not save the experiment data')
    parser.add_argument('-v', '--visualize', action='store_true',
                        help='When enabled, visualizes execution.')
    args = parser.parse_args()
    # TODO: be careful that paging isn't altering the data
    # TODO: use a different set of randomized parameters for train and test
    serial = is_darwin()
    visualize = serial and args.visualize
    assert implies(visualize, serial)
    num_trials = get_max_cores(serial) if args.num_trials < 0 else args.num_trials

    ##################################################

    #train_sizes = inclusive_range(50, 200, 10) # Best
    #train_sizes = inclusive_range(50, 400, 10) # F1
    #train_sizes = inclusive_range(25, 400, 25)
    #train_sizes = inclusive_range(50, 100, 5) # Real
    #train_sizes = inclusive_range(100, 200, 5)
    #train_sizes = inclusive_range(10, 250, 5)
    #train_sizes = inclusive_range(35, 70, 5)
    #train_sizes = inclusive_range(5, 50, 5)
    #train_sizes = inclusive_range(40, 80, 5)
    #train_sizes = inclusive_range(100, 1000, 100)
    #train_sizes = [50]
    #train_sizes = [250]
    train_sizes = [1000]
    #train_sizes = [327] # train + test
    #train_sizes = inclusive_range(5, 150, 25)
    #train_sizes = [100]

    #kernels = ['RBF', 'Matern52', 'MLP']
    kernels = ['MLP']
    hyperparams = [None]
    #hyperparams = [True]
    #hyperparams = [None, True]
    query_type = BEST # BEST | CONFIDENT | REJECTION | ACTIVE # type of query used to evaluate the learner
    include_none = False
    binary = False

    # 0 => no transfer
    # 1 => mean transfer
    # 2 => kernel transfer
    # 3 => both transfer
    transfer_weights = [None]
    #transfer_weights = list(range(4))
    #transfer_weights = [0, 1]
    #transfer_weights = [3]
    #transfer_weights = np.around(np.linspace(0.0, 1.0, num=1+5, endpoint=True), decimals=3) # max 10 colors
    #transfer_weights = list(range(1, 1+3))

    #split = UNIFORM # BALANCED
    #print('Split:', split)
    #parameters = {
    #    'include None': include_none,
    #    'binary': binary,
    #    'split': split,
    #}

    # Omitting failed labels is okay because they will never be executed
    algorithms = []
    #algorithms += [(Algorithm(nn_model, label='NN'), [num])
    #               for nn_model, num in product(NN_MODELS, train_sizes)]
    #algorithms += [(Algorithm(RANDOM), None), (Algorithm(DESIGNED), None)]
    #algorithms += [(Algorithm(RF_CLASSIFIER, variance=False, transfer_weight=tw, label='RF'), [num])
    #               for num, tw in product(train_sizes, [None])] # transfer_weights
    #algorithms += [(Algorithm(RF_REGRESSOR, variance=False, transfer_weight=tw, label='RF'), [num])
    #               for num, tw in product(train_sizes, [None])] # transfer_weights
    #algorithms += [(Algorithm(BATCH_RF, variance=True, transfer_weight=tw, label='RF'), [num])
    #               for num, tw in product(train_sizes, [None])] # transfer_weights
    #algorithms += [(Algorithm(BATCH_MAXVAR_RF, variance=True, transfer_weight=tw), train_sizes)
    #               for tw in product(use_vars, [None])] # transfer_weights
    #algorithms += [(Algorithm(BATCH_STRADDLE_RF, variance=True, transfer_weight=tw), train_sizes)
    #               for tw, in product([None])] # transfer_weights

    use_vars = [True]
    # STRADDLE is better than MAXVAR when the learner has a good estimate of uncertainty
    algorithms += [(Algorithm(BATCH_GP, kernel, hype, use_var, tw, label='GP'), [num]) # label='GP-{}'.format(kernel)
                   for num, kernel, hype, use_var, tw in product(
                       train_sizes, kernels, hyperparams, use_vars, transfer_weights)]
    #algorithms += [(Algorithm(BATCH_MAXVAR_GP, kernel, hype, True, tw, label='GP-Var'), train_sizes)
    #               for kernel, hype, tw in product(kernels, hyperparams, transfer_weights)]
    #algorithms += [(Algorithm(BATCH_STRADDLE_GP, kernel, hype, True, tw, label='GP-LSE'), train_sizes)
    #               for kernel, hype, tw in product(kernels, hyperparams, transfer_weights)] # default active
    #algorithms += [(Algorithm(BATCH_STRADDLE_GP, kernel, hype, True, tw, label='GP-LSE2'), train_sizes)
    #               for kernel, hype, tw in product(kernels, hyperparams, transfer_weights)] # active control only
    #algorithms += [(Algorithm(MAXVAR_GP, kernel, hype, use_var), train_sizes)
    #               for kernel, hype, use_var in product(kernels, hyperparams, use_vars)]
    #algorithms += [(Algorithm(STRADDLE_GP, kernel, hype, use_var, tw), train_sizes)
    #               for kernel, hype, use_var, tw in product(kernels, hyperparams, use_vars, transfer_weights)]

    #batch_sizes = inclusive_range(train_sizes[0], 90, 10)
    #step_size = 10 # TODO: extract from train_sizes
    #final_size = train_sizes[-1]
    # Previously didn't have use_var=True
    #algorithms += [(Algorithm(BATCH_STRADDLE_GP, kernel, hyperparameters=batch_size, variance=True, transfer_weight=tw),
    #                inclusive_range(batch_size, final_size, step_size))
    #               for kernel, tw, batch_size in product(kernels, transfer_weights, batch_sizes)]
    #algorithms += [(Algorithm(BATCH_STRADDLE_RF, hyperparameters=batch_size, variance=True, transfer_weight=tw),
    #                inclusive_range(batch_size, final_size, step_size))
    #               for tw, batch_size in product(transfer_weights, batch_sizes)]
    print('Algorithms:', algorithms)

    ##################################################

    real_world = not args.paths
    transfer_domain = load_data(TRANSFER_DATASETS, verbose=False)
    transfer_algorithm = None
    if real_world and (transfer_weights != [None]):
        #assert transfer_weights[0] is not None
        transfer_data = transfer_domain.create_dataset(include_none=include_none, binary=binary)
        transfer_algorithm = Algorithm(BATCH_GP, kernel=kernels[0], variance=use_vars[0])

    validity_learner = None
    #validity_learner = create_validity_classifier(transfer_domain)

    ##################################################

    train_paths = args.paths
    if real_world:
        train_paths = SCOOP_TRAIN_DATASETS # TRAIN_DATASETS
    #train_paths = TRANSFER_DATASETS
    #train_paths = TRAIN_DATASETS + TRANSFER_DATASETS # Train before transfer
    #scale_paths = TRAIN_DATASETS + TEST_DATASETS
    scale_paths = None
    print(SEPARATOR)
    print('Train paths:', train_paths)
    domain = load_data(train_paths)
    print()
    print(domain)
    all_data = domain.create_dataset(include_none=include_none, binary=binary, scale_paths=scale_paths)
    #all_data.results = all_data.results[:1000]

    num_failed = 0
    #num_failed = 100
    failed_domain = transfer_domain if real_world else domain
    failed_results = randomize(result for result in failed_domain.results
                               if not result.get('success', False))[:num_failed]
    #failed_data = Dataset(domain, failed_results, **all_data.kwargs)

    test_paths = SCOOP_TEST_DATASETS # TEST_DATASETS | SCOOP_TEST_DATASETS
    #test_paths = None
    if real_world and not (set(train_paths) & set(test_paths)):
        #assert not set(train_paths) & set(test_paths)
        #max_test = 0
        test_data = load_data(test_paths).create_dataset(include_none=False, binary=binary,
                                                         scale_paths=scale_paths)
    else:
        #assert scale_paths is None # TODO: max_train will be too small otherwise
        test_paths = test_data = None
    print(SEPARATOR)
    print('Test paths:', test_paths)

    all_active_data = None
    #if real_world:
    #    all_active_data = load_data(ACTIVE_DATASETS).create_dataset(include_none=True, binary=binary, scale_paths=scale_paths)
    # TODO: could include OS and username if desired

    date_name = datetime.datetime.now().strftime(DATE_FORMAT)
    size_str = '[{},{}]'.format(train_sizes[0], train_sizes[-1])
    #size_str = '-'.join(map(str, train_sizes))
    experiments_name = '{}_r={}_t={}_n={}'.format(date_name, args.num_rounds, size_str, num_trials)
    trials_per_round = sum(1 if train_sizes is None else
                           (train_sizes[-1] - train_sizes[0] + len(train_sizes))
                           for _, train_sizes in algorithms)
    num_experiments = args.num_rounds * trials_per_round
    max_train = min(max([0] + [active_sizes[0] for _, active_sizes in algorithms
                               if active_sizes is not None]), len(all_data))
    max_test = min(len(all_data) - max_train, 1000)

    ##################################################

    ##features = ['bowl_height']
    #features = ['spoon_height']
    ##features = ['bowl_height', 'spoon_height']
    #X, Y, _ = all_data.get_data()
    ##indices = [domain.inputs.index(feature) for feature in features]
    ##X = X[:,indices]
    #X = [[result[FEATURE][name] for name in features] for result in all_data.results]
    #from sklearn.linear_model import LinearRegression
    #model = LinearRegression(fit_intercept=True, normalize=False)
    #model.fit(X, Y)
    ##print(model.get_params())
    #print(model.coef_.tolist(), model.intercept_)
    #print(model.score(X, Y))

    #data_dir = os.path.join(DATA_DIRECTORY, domain.name) # EXPERIMENT_DIRECTORY
    data_dir = os.path.abspath(os.path.join(domain.name, os.path.pardir))
    experiments_dir, data_path = None, None
    if not args.test or not serial:
        experiments_dir = os.path.join(data_dir, experiments_name)
        data_path = os.path.join(experiments_dir, 'experiments.pk{}'.format(get_python_version()))

    ##################################################

    print(SEPARATOR)
    print('Name:', experiments_name)
    print('Experiments:', num_experiments)
    print('Experiment dir:', experiments_dir)
    print('Data path:', data_path)
    print('Examples:', len(all_data))
    print('Valid:', sum(result.get('valid', True) for result in all_data.results))
    print('Success:', sum(result.get('success', False) for result in all_data.results))
    print('Scored:', sum(result.get('score', None) is not None for result in all_data.results))
    print('Max train:', max_train)
    print('Max test:', max_test)
    print('Include None:', include_none)
    print('Examples: n={}, d={}'.format(len(all_data), domain.dx))
    print('Binary:', binary)
    print('Serial:', serial)
    print('Estimated hours: {:.3f}'.format(num_experiments * SEC_PER_EXPERIMENT / HOURS_TO_SECS))
    user_input('Begin?')

    ##################################################

    experiments = []
    if experiments_dir is not None:
        mkdir(experiments_dir)
        #if os.path.exists(data_path):
        #    experiments.extend(read_pickle(data_path))
    # TODO: embed in a KeyboardInterrupt to allow early termination
    start_time = time.time()
    for round_idx in range(args.num_rounds):
        seed = round_idx if args.deterministic else hash(time.time()) # vs just time.time()?
        random.seed(seed)
        all_data.shuffle()
        if test_paths is None: # cannot use test_data
            #test_data, train_data = split_data(all_data, max_test)
            train_data = test_data = all_data # Training performance
        else:
            train_data = all_data

        transfer_learner = None
        if transfer_algorithm is not None:
            round_data, _ = transfer_data.partition(index=1000)
            transfer_learner, _ = create_learner(transfer_domain, round_data, transfer_algorithm,
                                                 verbose=True)
            transfer_learner.retrain()

        print(SEPARATOR)
        print('Round {} | Train examples: {} | Test examples: {}'.format(
            round_idx, len(train_data), len(test_data)))

        for algorithm, active_sizes in algorithms:
            # active_sizes = [first #trainingdata selected from X_train, #active exploration + #trainingdata]
            print(SEPARATOR)
            print('Round: {} | {} | Seed: {} | Sizes: {}'.format(
                round_idx, algorithm, seed, active_sizes))
            # TODO: allow keyboard interrupt
            if active_sizes is None:
                learner = algorithm.name
                active_size = train_confusion = None
                experiments.append(evaluate_learner(domain, seed, train_confusion, test_data,
                                                    algorithm, learner, active_size,
                                                    num_trials, serial, args.visualize))
                continue
            # [10 20 25] take first 10 samples from X_train to train the model, 10 samples chosen actively
            # sequentially + evaluate model, 5 samples chosen actively sequentially + evaluate model
            # Could always keep around all the examples and retrain
            # TODO: segfaults when this runs in parallel
            # TODO: may be able to retrain in parallel if I set OPENBLAS_NUM_THREADS
            num_batch = active_sizes[0]
            batch_data, active_data = train_data.partition(num_batch)
            if all_active_data is not None:
                active_data = all_active_data.clone()
            #batch_data.results.extend(failed_results)
            learner, train_confusion = create_learner(domain, batch_data, algorithm, # alphas,
                                                      query_type=query_type, verbose=True)
            learner.validity_learner = validity_learner
            if transfer_learner is not None:
                learner.sim_model = transfer_learner.model
                learner.retrain()
            for active_size in active_sizes:
                num_active = active_size - (learner.nx - len(failed_results))
                print('\nRound: {} | {} | Seed: {} | Size: {} | Active: {}'.format(
                    round_idx, algorithm, seed, active_size, num_active))
                if algorithm.name in CONTINUOUS_ACTIVE_GP:
                    active_learning(learner, num_active, visualize=visualize)
                    #active_learning(learner, num_active, discrete_feature=True, random_feature=False)
                    #active_learning_discrete(learner, active_data, num_active, random_feature=False)
                elif algorithm.name in BATCH_ACTIVE:
                    active_learning_discrete(learner, active_data, num_active)
                    #active_learning(learner, num_active, discrete_feature=True, random_feature=True)
                    #active_learning_discrete(learner, active_data, num_active, random_feature=True)
                #if round_dir is not None:
                #    save_learner(round_dir, learner)
                if args.save:
                    learner.save(data_dir)
                experiments.append(evaluate_learner(domain, seed, train_confusion, test_data,
                                                    algorithm, learner, active_size,
                                                    num_trials, serial, args.visualize))
                save_experiments(data_path, experiments)

    print(SEPARATOR)
    if experiments:
        save_experiments(data_path, experiments)
        plot_experiments(domain, experiments_name, experiments_dir, experiments, include_none=False)
        print('Experiments: {}'.format(experiments_dir))
        print('Total experiments: {}'.format(len(experiments)))
        print('Total hours: {:.3f}'.format(elapsed_time(start_time) / HOURS_TO_SECS))