Example #1
File: Admin.py  Project: nginyc/atm_example
def create_datarun(self, dataset_url, class_column, budget_type, budget):
    run_config = self._build_run_config(dataset_url=dataset_url,
                                        class_column=class_column,
                                        budget_type=budget_type,
                                        budget=budget)
    id = enter_data(self._sql_config, run_config)
    return {'id': id}
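A minimal usage sketch for the method above; `admin` is assumed to be an already-constructed Admin instance, and the dataset URL, column name, and budget values are placeholders rather than values from the project:

result = admin.create_datarun(
    dataset_url='https://example.com/pollution.csv',  # hypothetical dataset location
    class_column='class',                             # placeholder target column
    budget_type='classifier',
    budget=100,
)
print(result['id'])  # id of the newly created datarun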
Example #2
def get_new_worker(**kwargs):
    kwargs['methods'] = kwargs.get('methods', ['logreg', 'dt'])
    sql_conf = SQLConfig(database=DB_PATH)
    run_conf = RunConfig(**kwargs)
    run_id = enter_data(sql_conf, run_conf)
    db = Database(**vars(sql_conf))
    datarun = db.get_datarun(run_id)
    return Worker(db, datarun)
Example #3
def get_new_worker(**kwargs):
    kwargs['methods'] = kwargs.get('methods', ['logreg', 'dt'])
    sql_conf = SQLConfig(database=DB_PATH)
    run_conf = RunConfig(**kwargs)
    run_id = enter_data(sql_conf, run_conf)
    db = Database(**vars(sql_conf))
    datarun = db.get_datarun(run_id)
    return Worker(db, datarun)
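A hedged usage sketch for the get_new_worker helper shown in Examples #2 and #3; it assumes DB_PATH points at a writable SQLite file and that the returned Worker object exposes a run_classifier method (check your ATM version):

worker = get_new_worker(methods=['logreg'])  # defaults to ['logreg', 'dt'] if methods is omitted
worker.run_classifier()                      # train and score one classifier from the datarun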
Example #4
File: api.py  Project: yuyicai/ATMSeer
def post_enter_data():
    """
    Deprecated. Use post_new_dataset and post_new_datarun
    Receives and saves a CSV file, after which it executes the enter_data function.
    See: http://flask.pocoo.org/docs/0.12/patterns/fileuploads/
    """
    if 'file' not in request.files:
        raise ApiError('No file part', status_code=400)

    file = request.files['file']

    # if the user does not select a file, the browser submits an empty part without a filename
    if file.filename == '':
        raise ApiError('Empty file part', status_code=400)
    if file and allowed_file(file.filename):
        filename = secure_filename(file.filename)
        rel_filepath = os.path.join(current_app.config['UPLOAD_FOLDER'],
                                    filename)
        abs_filepath = os.path.abspath(rel_filepath)

        if not os.path.exists(current_app.config['UPLOAD_FOLDER']):
            os.makedirs(current_app.config['UPLOAD_FOLDER'])
        if os.path.exists(abs_filepath):
            file_name, file_extension = os.path.splitext(abs_filepath)
            path_temp = file_name + '_%d' + file_extension
            count = 2
            while os.path.exists(abs_filepath):
                abs_filepath = path_temp % count
                count += 1
                # Crude guard against looping forever on name collisions
                if count > 100:
                    raise ValueError(
                        'More than 100 files with this name already exist; '
                        'please rename the file and upload again.'
                    )
            logger.warning(
                'Filename %s already exists, renamed and saved to %s' %
                (rel_filepath, abs_filepath))

        file.save(abs_filepath)

        run_conf = current_app.config['RUN_CONF']
        sql_conf = current_app.config['SQL_CONF']
        aws_conf = current_app.config['AWS_CONF']
        run_per_partition = current_app.config['RUN_PER_PARTITION']
        # we need a custom train_path without modifying the global run_conf
        # object, so we work on a deep copy of it

        upload_run_conf = copy.deepcopy(run_conf)
        upload_run_conf.train_path = abs_filepath

        datarun_id = enter_data(sql_conf, upload_run_conf, aws_conf,
                                run_per_partition)

        return jsonify({
            'success': True,
            'filename': os.path.split(abs_filepath)[1],
            'id': datarun_id
        })
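A hedged client-side sketch for exercising the upload handler above. The host, port, and route path are assumptions; substitute whatever routes the ATMSeer API actually registers:

import requests

with open('pollution_1.csv', 'rb') as f:                          # any CSV accepted by allowed_file
    resp = requests.post('http://localhost:7777/api/enter_data',  # hypothetical URL
                         files={'file': f})
print(resp.json())  # e.g. {'success': True, 'filename': ..., 'id': ...}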
Example #5
def test_enter_data_all(dataset):
    sql_conf = SQLConfig(database=DB_PATH)
    db = Database(**vars(sql_conf))
    run_conf = RunConfig(dataset_id=dataset.id,
                         methods=METHOD_HYPERPARTS.keys())

    run_id = enter_data(sql_conf, run_conf)

    with db_session(db):
        run = db.get_datarun(run_id)
        assert run.dataset.id == dataset.id
        assert len(run.hyperpartitions) == sum(METHOD_HYPERPARTS.values())
Example #6
def test_enter_data_by_methods(dataset):
    sql_conf = SQLConfig(database=DB_PATH)
    db = Database(**vars(sql_conf))
    run_conf = RunConfig(dataset_id=dataset.id)

    for method, n_parts in METHOD_HYPERPARTS.items():
        run_conf.methods = [method]
        run_id = enter_data(sql_conf, run_conf)

        assert db.get_datarun(run_id)
        with db_session(db):
            run = db.get_datarun(run_id)
            assert run.dataset.id == dataset.id
            assert len(run.hyperpartitions) == n_parts
Example #7
def test_run_per_partition(dataset):
    sql_conf = SQLConfig(database=DB_PATH)
    db = Database(**vars(sql_conf))
    run_conf = RunConfig(dataset_id=dataset.id, methods=['logreg'])

    run_ids = enter_data(sql_conf, run_conf, run_per_partition=True)

    with db_session(db):
        runs = []
        for run_id in run_ids:
            run = db.get_datarun(run_id)
            if run is not None:
                runs.append(run)

        assert len(runs) == METHOD_HYPERPARTS['logreg']
        assert all([len(run.hyperpartitions) == 1 for run in runs])
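The three test examples above rely on DB_PATH and METHOD_HYPERPARTS constants defined elsewhere in the test module. A hedged sketch of what they might look like; the hyperpartition counts are illustrative only and will differ by ATM version:

DB_PATH = '/tmp/atm.db'                     # assumed SQLite database path
METHOD_HYPERPARTS = {'logreg': 6, 'dt': 2}  # method code -> expected hyperpartition count (illustrative)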
Example #8
def btb_test(dataruns=None, datasets=None, processes=1, graph=False, **kwargs):
    """
    Run a test datarun using the chosen tuner and selector, and compare it to
    the baseline performance.

    Tuner and selector will be specified in **kwargs, along with the rest of the
    standard datarun arguments.
    """
    sql_conf, run_conf, _, _ = load_config(sql_path=SQL_CONFIG,
                                           run_path=RUN_CONFIG,
                                           **kwargs)

    db = Database(**vars(sql_conf))
    datarun_ids = dataruns or []
    datarun_ids_per_dataset = [[each] for each in dataruns] if dataruns else []
    datasets = datasets or DATASETS_MAX_FIRST

    # if necessary, generate datasets and dataruns
    if not datarun_ids:
        for ds in datasets:
            run_conf.train_path = DATA_URL + ds
            run_conf.dataset_id = None
            print('Creating 10 dataruns for', run_conf.train_path)
            run_ids = [enter_data(sql_conf, run_conf) for i in range(10)]
            datarun_ids_per_dataset.append(run_ids)
            datarun_ids.extend(run_ids)

    # work on the dataruns until they're done
    print('Working on %d dataruns' % len(datarun_ids))
    work_parallel(db=db, datarun_ids=datarun_ids, n_procs=processes)
    print('Finished!')

    results = {}

    # compute and maybe graph the results for each dataset
    for rids in datarun_ids_per_dataset:
        res = report_auc_vs_baseline(db, rids, graph=graph)
        results[tuple(rids)] = {'test': res[0], 'baseline': res[1]}

    return results
Example #9
def btb_test(dataruns=None, datasets=None, processes=1, graph=False, **kwargs):
    """
    Run a test datarun using the chosen tuner and selector, and compare it to
    the baseline performance.

    Tuner and selector will be specified in **kwargs, along with the rest of the
    standard datarun arguments.
    """
    sql_conf, run_conf, _, _ = load_config(sql_path=SQL_CONFIG,
                                           run_path=RUN_CONFIG,
                                           **kwargs)

    db = Database(**vars(sql_conf))
    datarun_ids = dataruns or []
    datarun_ids_per_dataset = [[each] for each in dataruns] if dataruns else []
    datasets = datasets or DATASETS_MAX_FIRST

    # if necessary, generate datasets and dataruns
    if not datarun_ids:
        for ds in datasets:
            run_conf.train_path = DATA_URL + ds
            run_conf.dataset_id = None
            print('Creating 10 dataruns for', run_conf.train_path)
            run_ids = [enter_data(sql_conf, run_conf) for i in range(10)]
            datarun_ids_per_dataset.append(run_ids)
            datarun_ids.extend(run_ids)

    # work on the dataruns until they're done
    print('Working on %d dataruns' % len(datarun_ids))
    work_parallel(db=db, datarun_ids=datarun_ids, n_procs=processes)
    print('Finished!')

    results = {}

    # compute and maybe graph the results for each dataset
    for rids in datarun_ids_per_dataset:
        res = report_auc_vs_baseline(db, rids, graph=graph)
        results[tuple(rids)] = {'test': res[0], 'baseline': res[1]}

    return results
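A hedged call sketch for btb_test as defined in Examples #8 and #9; the tuner and selector codes shown ('gp', 'ucb1') are examples of BTB component names and may not match the ones available in your installation:

results = btb_test(processes=2, graph=False, tuner='gp', selector='ucb1')
for run_ids, scores in results.items():
    print(run_ids, 'test:', scores['test'], 'baseline:', scores['baseline'])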
Example #10
parser.add_argument('--processes',
                    help='number of processes to run concurrently',
                    type=int,
                    default=4)
parser.add_argument('--total-time',
                    help='total time for each worker to work (in seconds)',
                    type=int,
                    default=None)

args = parser.parse_args()
sql_config, run_config, _, _ = load_config(sql_path=SQL_CONFIG,
                                           run_path=RUN_CONFIG)

db = Database(**vars(sql_config))

print('creating dataruns...')
datarun_ids = []
for ds in DATASETS:
    run_config.train_path = os.path.join(DATA_DIR, ds)
    datarun_ids.append(enter_data(sql_config=sql_config,
                                  run_config=run_config))

work_parallel(db=db,
              datarun_ids=datarun_ids,
              n_procs=args.processes,
              total_time=args.total_time)

print('workers finished.')

for rid in datarun_ids:
    print_summary(db, rid)
Example #11
You can pass yaml configuration files (--sql-config, --aws-config, --run-config)
instead of passing individual arguments. Any arguments in the config files will
override arguments passed on the command line. See the examples in the config/
folder for more information. """)
    # Add argparse arguments for aws, sql, and datarun config
    add_arguments_aws_s3(parser)
    add_arguments_sql(parser)
    add_arguments_datarun(parser)
    add_arguments_logging(parser)
    parser.add_argument('--run-per-partition', default=False, action='store_true',
                        help='if set, generate a new datarun for each hyperpartition')

    args = parser.parse_args()

    # default logging config is different if initialized from the command line
    if args.log_config is None:
        args.log_config = os.path.join(PROJECT_ROOT,
                                       'config/templates/log-script.yaml')

    # create config objects from the config files and/or command line args
    sql_conf, run_conf, aws_conf, log_conf = load_config(sql_path=args.sql_config,
                                                         run_path=args.run_config,
                                                         aws_path=args.aws_config,
                                                         log_path=args.log_config,
                                                         **vars(args))
    initialize_logging(log_conf)

    # create and save the dataset and datarun
    enter_data(sql_conf, run_conf, aws_conf, args.run_per_partition)
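For comparison with the CLI entry point above, a hedged sketch of the same flow driven directly from Python with no argparse involved; the config field names follow the other examples here and the values are placeholders:

sql_conf = SQLConfig(database='atm.db')           # assumed SQLite path
run_conf = RunConfig(methods=['logreg', 'dt'])
run_conf.train_path = 'data/pollution_1.csv'      # assumed local CSV, set as in the examples above
datarun_id = enter_data(sql_conf, run_conf)
print('created datarun', datarun_id)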
Example #12
parser.add_argument('--method', help='code for method to test')
parser.add_argument('--method-path',
                    help='path to JSON config for method to test')

args = parser.parse_args()
sql_config, run_config, aws_config, _ = load_config(sql_path=SQL_CONFIG,
                                                    run_path=RUN_CONFIG)
db = Database(**vars(sql_config))

print('creating dataruns...')
datarun_ids = []
for ds in DATASETS:
    run_config.train_path = os.path.join(DATA_DIR, ds)
    if args.method:
        run_config.methods = [args.method]
    else:
        run_config.methods = METHODS
    datarun_ids.extend(
        enter_data(sql_config, run_config, aws_config, run_per_partition=True))

print('computing on dataruns', datarun_ids)
work_parallel(db=db,
              datarun_ids=datarun_ids,
              aws_config=aws_config,
              n_procs=args.processes)

print('workers finished.')

for rid in datarun_ids:
    print_hp_summary(db, rid)
Example #13
folder for more information. """)
    # Add argparse arguments for aws, sql, and datarun config
    add_arguments_aws_s3(parser)
    add_arguments_sql(parser)
    add_arguments_datarun(parser)
    add_arguments_logging(parser)
    parser.add_argument(
        '--run-per-partition',
        default=False,
        action='store_true',
        help='if set, generate a new datarun for each hyperpartition')

    args = parser.parse_args()

    # default logging config is different if initialized from the command line
    if args.log_config is None:
        args.log_config = os.path.join(PROJECT_ROOT,
                                       'config/templates/log-script.yaml')

    # create config objects from the config files and/or command line args
    sql_conf, run_conf, aws_conf, log_conf = load_config(
        sql_path=args.sql_config,
        run_path=args.run_config,
        aws_path=args.aws_config,
        log_path=args.log_config,
        **vars(args))
    initialize_logging(log_conf)

    # create and save the dataset and datarun
    enter_data(sql_conf, run_conf, aws_conf, args.run_per_partition)
Example #14
''')
parser.add_argument('--processes', help='number of processes to run concurrently',
                    type=int, default=1)
parser.add_argument('--method', help='code for method to test')
parser.add_argument('--method-path', help='path to JSON config for method to test')

args = parser.parse_args()
sql_config, run_config, aws_config, _ = load_config(sql_path=SQL_CONFIG,
                                                    run_path=RUN_CONFIG)
db = Database(**vars(sql_config))

print('creating dataruns...')
datarun_ids = []
for ds in DATASETS:
    run_config.train_path = join(DATA_DIR, ds)
    if args.method:
        run_config.methods = [args.method]
    else:
        run_config.methods = METHODS
    datarun_ids.extend(enter_data(sql_config, run_config, aws_config,
                                  run_per_partition=True))

print('computing on dataruns', datarun_ids)
work_parallel(db=db, datarun_ids=datarun_ids, aws_config=aws_config,
              n_procs=args.processes)

print('workers finished.')

for rid in datarun_ids:
    print_hp_summary(db, rid)
Example #15
DATASETS = DATASETS_SIMPLE


parser = argparse.ArgumentParser(description='''
Run a single end-to-end test with 10 sample datasets.
The script will create a datarun for each dataset, then run a worker until the
jobs are finished.
''')
parser.add_argument('--processes', help='number of processes to run concurrently',
                    type=int, default=4)

args = parser.parse_args()
sql_config, run_config, _, _ = load_config(sql_path=SQL_CONFIG,
                                           run_path=RUN_CONFIG)

db = Database(**vars(sql_config))

print('creating dataruns...')
datarun_ids = []
for ds in DATASETS:
    run_config.train_path = join(DATA_DIR, ds)
    datarun_ids.append(enter_data(sql_config=sql_config,
                                  run_config=run_config))

work_parallel(db=db, datarun_ids=datarun_ids, n_procs=args.processes)

print('workers finished.')

for rid in datarun_ids:
    print_summary(db, rid)