def create_datarun(self, dataset_url, class_column, budget_type, budget):
    run_config = self._build_run_config(dataset_url=dataset_url,
                                        class_column=class_column,
                                        budget_type=budget_type,
                                        budget=budget)
    datarun_id = enter_data(self._sql_config, run_config)
    return {'id': datarun_id}
def get_new_worker(**kwargs):
    kwargs['methods'] = kwargs.get('methods', ['logreg', 'dt'])
    sql_conf = SQLConfig(database=DB_PATH)
    run_conf = RunConfig(**kwargs)
    run_id = enter_data(sql_conf, run_conf)
    db = Database(**vars(sql_conf))
    datarun = db.get_datarun(run_id)
    return Worker(db, datarun)
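# A minimal usage sketch for the helper above (not from the source): it returns a
# Worker already bound to the freshly created datarun, so a test can drive it
# directly. `run_classifier()` is assumed to be the Worker method that trains and
# scores a single candidate; substitute whatever entry point your Worker exposes.
worker = get_new_worker(methods=['logreg'])
worker.run_classifier()  # assumed method name, shown for illustration only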
def post_enter_data():
    """
    Deprecated. Use post_new_dataset and post_new_datarun instead.

    Receives and saves a CSV file, then calls the enter_data function on it.
    See: http://flask.pocoo.org/docs/0.12/patterns/fileuploads/
    """
    if 'file' not in request.files:
        raise ApiError('No file part', status_code=400)

    file = request.files['file']

    # if the user does not select a file, the browser also submits an empty part
    # without a filename
    if file.filename == '':
        raise ApiError('Empty file part', status_code=400)

    if file and allowed_file(file.filename):
        filename = secure_filename(file.filename)
        rel_filepath = os.path.join(current_app.config['UPLOAD_FOLDER'], filename)
        abs_filepath = os.path.abspath(rel_filepath)

        if not os.path.exists(current_app.config['UPLOAD_FOLDER']):
            os.makedirs(current_app.config['UPLOAD_FOLDER'])

        if os.path.exists(abs_filepath):
            file_name, file_extension = os.path.splitext(abs_filepath)
            path_temp = file_name + '_%d' + file_extension

            count = 2
            while os.path.exists(abs_filepath):
                abs_filepath = path_temp % count
                count += 1

                # crude guard to prevent an endless renaming loop
                if count > 100:
                    raise ValueError(
                        'The uploaded file has been renamed more than 100 times; '
                        'please rename the file and upload it again.')

            logger.warning('Filename %s already exists, renamed and saved to %s'
                           % (rel_filepath, abs_filepath))

        file.save(abs_filepath)

        run_conf = current_app.config['RUN_CONF']
        sql_conf = current_app.config['SQL_CONF']
        aws_conf = current_app.config['AWS_CONF']
        run_per_partition = current_app.config['RUN_PER_PARTITION']

        # we need to set a customized train_path without modifying the global
        # run_conf object, so we deep-copy it
        upload_run_conf = copy.deepcopy(run_conf)
        upload_run_conf.train_path = abs_filepath

        datarun_id = enter_data(sql_conf, upload_run_conf, aws_conf, run_per_partition)

        return jsonify({
            'success': True,
            'filename': os.path.split(abs_filepath)[1],
            'id': datarun_id
        })
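# A hedged client-side sketch of exercising the (deprecated) upload endpoint above.
# The host, port, and route are assumptions made for illustration -- use whatever URL
# the Flask app actually registers for post_enter_data.
import requests

with open('dataset.csv', 'rb') as f:
    resp = requests.post('http://localhost:5000/api/enter_data',  # hypothetical route
                         files={'file': f})
print(resp.json())  # expected shape: {'success': True, 'filename': ..., 'id': ...}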
def test_enter_data_all(dataset):
    sql_conf = SQLConfig(database=DB_PATH)
    db = Database(**vars(sql_conf))
    run_conf = RunConfig(dataset_id=dataset.id, methods=METHOD_HYPERPARTS.keys())

    run_id = enter_data(sql_conf, run_conf)

    with db_session(db):
        run = db.get_datarun(run_id)
        assert run.dataset.id == dataset.id
        assert len(run.hyperpartitions) == sum(METHOD_HYPERPARTS.values())
def test_enter_data_by_methods(dataset):
    sql_conf = SQLConfig(database=DB_PATH)
    db = Database(**vars(sql_conf))
    run_conf = RunConfig(dataset_id=dataset.id)

    for method, n_parts in METHOD_HYPERPARTS.items():
        run_conf.methods = [method]
        run_id = enter_data(sql_conf, run_conf)

        assert db.get_datarun(run_id)
        with db_session(db):
            run = db.get_datarun(run_id)
            assert run.dataset.id == dataset.id
            assert len(run.hyperpartitions) == n_parts
def test_run_per_partition(dataset):
    sql_conf = SQLConfig(database=DB_PATH)
    db = Database(**vars(sql_conf))
    run_conf = RunConfig(dataset_id=dataset.id, methods=['logreg'])

    run_ids = enter_data(sql_conf, run_conf, run_per_partition=True)

    with db_session(db):
        runs = []
        for run_id in run_ids:
            run = db.get_datarun(run_id)
            if run is not None:
                runs.append(run)

        assert len(runs) == METHOD_HYPERPARTS['logreg']
        assert all(len(run.hyperpartitions) == 1 for run in runs)
def btb_test(dataruns=None, datasets=None, processes=1, graph=False, **kwargs):
    """
    Run a test datarun using the chosen tuner and selector, and compare it to
    the baseline performance.

    Tuner and selector will be specified in **kwargs, along with the rest of
    the standard datarun arguments.
    """
    sql_conf, run_conf, _, _ = load_config(sql_path=SQL_CONFIG,
                                           run_path=RUN_CONFIG,
                                           **kwargs)
    db = Database(**vars(sql_conf))

    datarun_ids = dataruns or []
    datarun_ids_per_dataset = [[each] for each in dataruns] if dataruns else []
    datasets = datasets or DATASETS_MAX_FIRST

    # if necessary, generate datasets and dataruns
    if not datarun_ids:
        for ds in datasets:
            run_conf.train_path = DATA_URL + ds
            run_conf.dataset_id = None
            print('Creating 10 dataruns for', run_conf.train_path)
            run_ids = [enter_data(sql_conf, run_conf) for _ in range(10)]
            datarun_ids_per_dataset.append(run_ids)
            datarun_ids.extend(run_ids)

    # work on the dataruns until they're done
    print('Working on %d dataruns' % len(datarun_ids))
    work_parallel(db=db, datarun_ids=datarun_ids, n_procs=processes)
    print('Finished!')

    results = {}

    # compute and (optionally) graph the results for each dataset
    for rids in datarun_ids_per_dataset:
        res = report_auc_vs_baseline(db, rids, graph=graph)
        results[tuple(rids)] = {'test': res[0], 'baseline': res[1]}

    return results
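# A hedged usage sketch for btb_test: per its docstring, the tuner and selector are
# passed through **kwargs. The codes 'gp' and 'ucb1' below are assumptions chosen
# for illustration; use whichever tuner/selector names your installation recognizes.
results = btb_test(processes=4, graph=False, tuner='gp', selector='ucb1')
for run_ids, scores in results.items():
    print(run_ids, scores['test'], scores['baseline'])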
                    help='number of processes to run concurrently',
                    type=int, default=4)
parser.add_argument('--total-time',
                    help='total time for each worker to work (in seconds)',
                    type=int, default=None)
args = parser.parse_args()

sql_config, run_config, _, _ = load_config(sql_path=SQL_CONFIG,
                                           run_path=RUN_CONFIG)
db = Database(**vars(sql_config))

print('creating dataruns...')
datarun_ids = []
for ds in DATASETS:
    run_config.train_path = os.path.join(DATA_DIR, ds)
    datarun_ids.append(enter_data(sql_config=sql_config, run_config=run_config))

work_parallel(db=db, datarun_ids=datarun_ids, n_procs=args.processes,
              total_time=args.total_time)
print('workers finished.')

for rid in datarun_ids:
    print_summary(db, rid)
    You can pass yaml configuration files (--sql-config, --aws-config,
    --run-config) instead of passing individual arguments. Any arguments in the
    config files will override arguments passed on the command line.

    See the examples in the config/ folder for more information.
    """)

# Add argparse arguments for aws, sql, and datarun config
add_arguments_aws_s3(parser)
add_arguments_sql(parser)
add_arguments_datarun(parser)
add_arguments_logging(parser)

parser.add_argument('--run-per-partition', default=False, action='store_true',
                    help='if set, generate a new datarun for each hyperpartition')

args = parser.parse_args()

# the default logging config is different if initialized from the command line
if args.log_config is None:
    args.log_config = os.path.join(PROJECT_ROOT,
                                   'config/templates/log-script.yaml')

# create config objects from the config files and/or command line args
sql_conf, run_conf, aws_conf, log_conf = load_config(sql_path=args.sql_config,
                                                     run_path=args.run_config,
                                                     aws_path=args.aws_config,
                                                     log_path=args.log_config,
                                                     **vars(args))

initialize_logging(log_conf)

# create and save the dataset and datarun
enter_data(sql_conf, run_conf, aws_conf, args.run_per_partition)
parser.add_argument('--method', help='code for method to test')
parser.add_argument('--method-path', help='path to JSON config for method to test')
args = parser.parse_args()

sql_config, run_config, aws_config, _ = load_config(sql_path=SQL_CONFIG,
                                                    run_path=RUN_CONFIG)
db = Database(**vars(sql_config))

print('creating dataruns...')
datarun_ids = []
for ds in DATASETS:
    run_config.train_path = os.path.join(DATA_DIR, ds)
    if args.method:
        run_config.methods = [args.method]
    else:
        run_config.methods = METHODS

    datarun_ids.extend(
        enter_data(sql_config, run_config, aws_config, run_per_partition=True))

print('computing on dataruns', datarun_ids)
work_parallel(db=db, datarun_ids=datarun_ids, aws_config=aws_config,
              n_procs=args.processes)
print('workers finished.')

for rid in datarun_ids:
    print_hp_summary(db, rid)
    folder for more information.
    """)

# Add argparse arguments for aws, sql, and datarun config
add_arguments_aws_s3(parser)
add_arguments_sql(parser)
add_arguments_datarun(parser)
add_arguments_logging(parser)

parser.add_argument('--run-per-partition', default=False, action='store_true',
                    help='if set, generate a new datarun for each hyperpartition')

args = parser.parse_args()

# the default logging config is different if initialized from the command line
if args.log_config is None:
    args.log_config = os.path.join(PROJECT_ROOT,
                                   'config/templates/log-script.yaml')

# create config objects from the config files and/or command line args
sql_conf, run_conf, aws_conf, log_conf = load_config(sql_path=args.sql_config,
                                                     run_path=args.run_config,
                                                     aws_path=args.aws_config,
                                                     log_path=args.log_config,
                                                     **vars(args))

initialize_logging(log_conf)

# create and save the dataset and datarun
enter_data(sql_conf, run_conf, aws_conf, args.run_per_partition)
    ''')
parser.add_argument('--processes', help='number of processes to run concurrently',
                    type=int, default=1)
parser.add_argument('--method', help='code for method to test')
parser.add_argument('--method-path', help='path to JSON config for method to test')
args = parser.parse_args()

sql_config, run_config, aws_config, _ = load_config(sql_path=SQL_CONFIG,
                                                    run_path=RUN_CONFIG)
db = Database(**vars(sql_config))

print('creating dataruns...')
datarun_ids = []
for ds in DATASETS:
    run_config.train_path = join(DATA_DIR, ds)
    if args.method:
        run_config.methods = [args.method]
    else:
        run_config.methods = METHODS

    datarun_ids.extend(enter_data(sql_config, run_config, aws_config,
                                  run_per_partition=True))

print('computing on dataruns', datarun_ids)
work_parallel(db=db, datarun_ids=datarun_ids, aws_config=aws_config,
              n_procs=args.processes)
print('workers finished.')

for rid in datarun_ids:
    print_hp_summary(db, rid)
DATASETS = DATASETS_SIMPLE

parser = argparse.ArgumentParser(description='''
Run a single end-to-end test with 10 sample datasets. The script will create a
datarun for each dataset, then run a worker until the jobs are finished.
''')
parser.add_argument('--processes', help='number of processes to run concurrently',
                    type=int, default=4)

args = parser.parse_args()

sql_config, run_config, _, _ = load_config(sql_path=SQL_CONFIG, run_path=RUN_CONFIG)
db = Database(**vars(sql_config))

print('creating dataruns...')
datarun_ids = []
for ds in DATASETS:
    run_config.train_path = join(DATA_DIR, ds)
    datarun_ids.append(enter_data(sql_config=sql_config, run_config=run_config))

work_parallel(db=db, datarun_ids=datarun_ids, n_procs=args.processes)
print('workers finished.')

for rid in datarun_ids:
    print_summary(db, rid)