import os
import pickle
import sys
from time import gmtime, strftime

from tensorflow import gfile  # TF1-style filesystem API

# FLAGS, utils, get_AL_sampler, and generate_one_curve are assumed to be
# defined/imported elsewhere in this module.


# Version 1: FLAGS-based runner without confusion/mixture sweeps.
def main(argv):
  del argv  # unused

  if not gfile.Exists(FLAGS.save_dir):
    try:
      gfile.MkDir(FLAGS.save_dir)
    except Exception:
      print(('WARNING: error creating save directory, '
             'directory most likely already created.'))

  save_dir = os.path.join(FLAGS.save_dir,
                          FLAGS.dataset + '_' + FLAGS.sampling_method)

  if FLAGS.do_save == "True":
    if not gfile.Exists(save_dir):
      try:
        gfile.MkDir(save_dir)
      except Exception:
        print(('WARNING: error creating save directory, '
               'directory most likely already created.'))
    # Set up logging.
    filename = os.path.join(
        save_dir, "log-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime()) + ".txt")
    sys.stdout = utils.Logger(filename)

  # Load the dataset.
  X, y = utils.get_mldata(FLAGS.data_dir, FLAGS.dataset)
  starting_seed = FLAGS.seed

  all_results = {}
  for seed in range(starting_seed, starting_seed + FLAGS.trials):
    # Load the sampler (the query strategy).
    sampler = get_AL_sampler(FLAGS.sampling_method)
    # Load the scoring model.
    score_model = utils.get_model(FLAGS.score_method, seed)
    # Load the selection model, unless it coincides with the scoring model.
    if (FLAGS.select_method == "None" or
        FLAGS.select_method == FLAGS.score_method):
      select_model = None
    else:
      select_model = utils.get_model(FLAGS.select_method, seed)

    results, sampler_state = generate_one_curve(
        X=X,
        y=y,
        sampler=sampler,
        score_model=score_model,
        seed=seed,
        warmstart_size=FLAGS.warmstart_size,
        batch_size=FLAGS.batch_size,
        select_model=select_model,
        max_points=FLAGS.max_dataset_size)
    key = (FLAGS.dataset, FLAGS.sampling_method, FLAGS.score_method,
           FLAGS.select_method, FLAGS.warmstart_size, FLAGS.batch_size, seed)
    # sampler_output = sampler_state.to_dict()
    # results['sampler_output'] = sampler_output
    results['sampler_output'] = None
    all_results[key] = results

  fields = [
      'dataset', 'sampling_methods', 'score_method', 'select_method',
      'warmstart size', 'batch size', 'seed'
  ]
  all_results['tuple_keys'] = fields

  if FLAGS.do_save == "True":
    filename = ("results_score_" + FLAGS.score_method +
                "_select_" + FLAGS.select_method)
    existing_files = gfile.Glob(os.path.join(save_dir, filename + "*.pkl"))
    filename = os.path.join(
        save_dir,
        filename + "_" + str(1000 + len(existing_files))[1:] + ".pkl")
    pickle.dump(all_results, gfile.GFile(filename, "w"))
    sys.stdout.flush_file()
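All three versions redirect sys.stdout to utils.Logger and later call
sys.stdout.flush_file(). That helper is not shown here; the sketch below is a
minimal assumed implementation (the real utils.Logger may differ) that tees
writes to both the console and the log file, which is all the code above needs.

import sys


class Logger(object):
  """Assumed stand-in for utils.Logger: tees stdout to a log file."""

  def __init__(self, filename):
    self.terminal = sys.stdout
    self.log = open(filename, "w")

  def write(self, message):
    # Echo to the console and persist to the log file.
    self.terminal.write(message)
    self.log.write(message)

  def flush(self):
    self.terminal.flush()

  def flush_file(self):
    # Called once at the end of a run to force the log to disk.
    self.log.flush()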
# Version 2: argparse-based runner with confusion/mixture sweeps.
def main(args):
  # Make the export folder structure. This is done here because the Logger
  # needs the filename.
  if args.do_save:
    # Make the base save directory.
    utils.make_dir(args.save_dir)
    # Make a directory inside the base save directory for the specific
    # dataset/method combination.
    save_subdir = os.path.join(args.save_dir,
                               args.dataset + "_" + args.sampling_method)
    utils.make_dir(save_subdir)

    filename = os.path.join(
        save_subdir,
        "log-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime()) + ".txt")
    sys.stdout = utils.Logger(filename)

  # The confusion and mixture arguments can each carry multiple values.
  confusions = [float(t) for t in args.confusions.split(" ")]
  mixtures = [float(t) for t in args.active_sampling_percentage.split(" ")]

  max_dataset_size = (None if args.max_dataset_size == 0
                      else args.max_dataset_size)
  starting_seed = args.seed

  # Load the dataset from file based on the data directory and dataset name.
  X, y = utils.get_mldata(args.data_dir, args.dataset)

  # Object to store the results in.
  all_results = {}

  # Percentage of labels to randomize.
  for c in confusions:
    # Mixture weights on active sampling.
    for m in mixtures:
      # Number of curves created during multiple trials.
      for seed in range(starting_seed, starting_seed + args.trials):
        # Get the sampler (also called the query strategy) by name.
        sampler = get_AL_sampler(args.sampling_method)
        # Get the scoring model.
        score_model = utils.get_model(args.score_method, seed)
        # Get the selection model, unless it coincides with the scoring model.
        if (args.select_method == "None" or
            args.select_method == args.score_method):
          select_model = None
        else:
          select_model = utils.get_model(args.select_method, seed)

        # Create the learning curve.
        results, sampler_state = generate_one_curve(
            X,
            y,
            sampler,
            score_model,
            seed,
            args.warmstart_size,
            args.batch_size,
            select_model,
            confusion=c,
            active_p=m,
            max_points=max_dataset_size,
            standardize_data=args.standardize_data,
            norm_data=args.normalize_data,
            train_horizon=args.train_horizon)
        key = (args.dataset, args.sampling_method, args.score_method,
               args.select_method, m, args.warmstart_size, args.batch_size,
               c, args.standardize_data, args.normalize_data, seed)
        sampler_output = sampler_state.to_dict()
        results["sampler_output"] = sampler_output
        all_results[key] = results

  # Field names corresponding, by position, to the entries of each key tuple.
  fields = [
      "dataset", "sampler", "score_method", "select_method",
      "active percentage", "warmstart size", "batch size", "confusion",
      "standardize", "normalize", "seed"
  ]
  all_results["tuple_keys"] = fields

  # Write the results to a file.
  if args.do_save:
    # Format the filename; the numeric suffix is a zero-padded index that
    # avoids clobbering earlier runs with the same configuration.
    filename = "results_score_{}_select_{}_norm_{}_stand_{}".format(
        args.score_method, args.select_method, args.normalize_data,
        args.standardize_data)
    existing_files = gfile.Glob(
        os.path.join(save_subdir, "{}*.pkl".format(filename)))
    filepath = os.path.join(
        save_subdir,
        "{}_{}.pkl".format(filename, str(1000 + len(existing_files))[1:]))
    # Dump the dict to a pickle file.
    pickle.dump(all_results, gfile.GFile(filepath, "w"))
    # Flush stdout.
    sys.stdout.flush_file()
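This version replaces the try/MkDir/except pattern of the other two with
utils.make_dir. That helper is also not shown; below is a hypothetical
equivalent consistent with how it is called above.

import os


def make_dir(path):
  """Hypothetical equivalent of utils.make_dir: mkdir -p semantics."""
  try:
    os.makedirs(path)
  except OSError:
    # Directory most likely already created.
    pass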
# Version 3: FLAGS-based runner with confusion/mixture sweeps.
def main(argv):
  del argv  # unused

  if not gfile.Exists(FLAGS.save_dir):
    try:
      gfile.MkDir(FLAGS.save_dir)
    except Exception:
      print(('WARNING: error creating save directory, '
             'directory most likely already created.'))

  save_dir = os.path.join(FLAGS.save_dir,
                          FLAGS.dataset + "_" + FLAGS.sampling_method)
  do_save = FLAGS.do_save == "True"

  if do_save:
    if not gfile.Exists(save_dir):
      try:
        gfile.MkDir(save_dir)
      except Exception:
        print(('WARNING: error creating save directory, '
               'directory most likely already created.'))
    # Set up logging.
    filename = os.path.join(
        save_dir, "log-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime()) + ".txt")
    sys.stdout = utils.Logger(filename)

  confusions = [float(t) for t in FLAGS.confusions.split(" ")]
  mixtures = [float(t) for t in FLAGS.active_sampling_percentage.split(" ")]
  all_results = {}
  max_dataset_size = None if FLAGS.max_dataset_size == "0" else int(
      FLAGS.max_dataset_size)
  normalize_data = FLAGS.normalize_data == "True"
  standardize_data = FLAGS.standardize_data == "True"
  X, y = utils.get_mldata(FLAGS.data_dir, FLAGS.dataset)
  starting_seed = FLAGS.seed

  for c in confusions:
    for m in mixtures:
      for seed in range(starting_seed, starting_seed + FLAGS.trials):
        sampler = get_AL_sampler(FLAGS.sampling_method)
        score_model = utils.get_model(FLAGS.score_method, seed)
        if (FLAGS.select_method == "None" or
            FLAGS.select_method == FLAGS.score_method):
          select_model = None
        else:
          select_model = utils.get_model(FLAGS.select_method, seed)
        results, sampler_state = generate_one_curve(
            X, y, sampler, score_model, seed, FLAGS.warmstart_size,
            FLAGS.batch_size, select_model, c, m, max_dataset_size,
            standardize_data, normalize_data, FLAGS.train_horizon)
        key = (FLAGS.dataset, FLAGS.sampling_method, FLAGS.score_method,
               FLAGS.select_method, m, FLAGS.warmstart_size,
               FLAGS.batch_size, c, standardize_data, normalize_data, seed)
        sampler_output = sampler_state.to_dict()
        results["sampler_output"] = sampler_output
        all_results[key] = results

  fields = [
      "dataset", "sampler", "score_method", "select_method",
      "active percentage", "warmstart size", "batch size", "confusion",
      "standardize", "normalize", "seed"
  ]
  all_results["tuple_keys"] = fields

  if do_save:
    filename = ("results_score_" + FLAGS.score_method +
                "_select_" + FLAGS.select_method +
                "_norm_" + str(normalize_data) +
                "_stand_" + str(standardize_data))
    existing_files = gfile.Glob(os.path.join(save_dir, filename + "*.pkl"))
    filename = os.path.join(
        save_dir,
        filename + "_" + str(1000 + len(existing_files))[1:] + ".pkl")
    pickle.dump(all_results, gfile.GFile(filename, "w"))
    sys.stdout.flush_file()
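For reference, a saved results file can be loaded back and each key tuple
decoded by position via the stored "tuple_keys" entry. The path and flag
values below are illustrative, not taken from an actual run.

import pickle

# Illustrative path; real filenames follow the pattern built in main above.
path = ("save_dir/dataset_margin/"
        "results_score_logistic_select_None_norm_True_stand_False_000.pkl")
with open(path, "rb") as f:
  all_results = pickle.load(f)

# Remove the field-name entry, then decode each remaining key tuple.
fields = all_results.pop("tuple_keys")
for key, results in all_results.items():
  config = dict(zip(fields, key))
  print(config["dataset"], config["seed"], sorted(results.keys()))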