Example #1
0
def get_db():
    """Return the database connection for the current request.

    The connection is created lazily on first use, cached on the Flask
    application context ``g``, and reused by any later call within the
    same request.
    """
    if 'db' in g:
        return g.db

    # First call this request: build a connection from the app's SQL config.
    conf = current_app.config['SQL_CONF']
    database = Database(conf.dialect, conf.database, conf.username,
                        conf.password, conf.host, conf.port, conf.query)
    check_db_mappers(database)
    database.session = database.get_session()
    g.db = database
    return g.db
Example #2
0
def enter_datarun(sql_config, run_config, aws_config=None, upload_data=False,
                  run_per_partition=False):
    """
    Generate a datarun, including a dataset if necessary.

    sql_config: Object with all attributes necessary to initialize a Database.
    run_config: all attributes necessary to initialize a Datarun, including
        Dataset info if the dataset has not already been created.
    aws_config: all attributes necessary to connect to an S3 bucket.
    upload_data: whether to store processed data in the cloud
    run_per_partition: if True, create a separate Datarun for every
        hyperpartition (useful for debugging); otherwise one Datarun
        covers all hyperpartitions.

    Returns: the ID of the generated datarun, or the list of datarun IDs
        if run_per_partition is True.
    """
    # connect to the database
    db = Database(sql_config.dialect, sql_config.database, sql_config.username,
                  sql_config.password, sql_config.host, sql_config.port,
                  sql_config.query)

    # if the user has provided a dataset id, use that. Otherwise, create a new
    # dataset based on the arguments we were passed.
    if run_config.dataset_id is None:
        dataset = enter_dataset(db, run_config, aws_config=aws_config,
                                upload_data=upload_data)
    else:
        dataset = db.get_dataset(run_config.dataset_id)

    # create hyperpartitions for the new datarun
    # NOTE: print() with a single %-formatted argument emits identical output
    # on Python 2 and 3, unlike the former Python-2-only print statements.
    print('')
    print('creating hyperpartitions...')
    session = db.get_session()

    method_and_parts = []
    for m in run_config.methods:
        # enumerate all combinations of categorical variables for this method
        method = Method(METHODS_MAP[m])
        method_hyperpartitions = method.get_hyperpartitions()

        for hyperpartition in method_hyperpartitions:
            method_and_parts.append((m, hyperpartition))

        print('method %s has %d hyperpartitions'
              % (m, len(method_hyperpartitions)))

    # create and save datarun to database
    print('')
    print('creating datarun...')

    # create hyperpartitions and datarun(s)
    run_ids = []
    if not run_per_partition:
        # single datarun shared by all hyperpartitions
        datarun = create_datarun(db, session, dataset, run_config)
        session.commit()

    for method, part in method_and_parts:
        # if necessary, create a new datarun for each hyperpartition.
        # This setting is useful for debugging.
        if run_per_partition:
            datarun = create_datarun(db, session, dataset, run_config)
            session.commit()
            run_ids.append(datarun.id)

        hp = db.Hyperpartition(datarun_id=datarun.id,
                               method=method,
                               tunables=part.tunables,
                               constants=part.constants,
                               categoricals=part.categoricals,
                               status=PartitionStatus.INCOMPLETE)
        session.add(hp)
        session.commit()

    # NOTE(review): if run_per_partition is True and run_config.methods is
    # empty, `datarun` is never bound and the summary below raises NameError —
    # presumably methods is always non-empty; verify against callers.
    print('')
    print('========== Summary ==========')
    print('Dataset ID: %s' % dataset.id)
    print('Training data: %s' % dataset.train_path)
    print('Test data: %s' % (dataset.test_path or '(None)'))
    if run_per_partition:
        print('Datarun IDs: %s' % ', '.join(map(str, run_ids)))
    else:
        print('Datarun ID: %s' % datarun.id)
    # with run_per_partition, these describe the last datarun created;
    # the config is shared, so selector/tuner/budget are the same for all.
    print('Hyperpartition selection strategy: %s' % datarun.selector)
    print('Parameter tuning strategy: %s' % datarun.tuner)
    print('Budget: %d (%s)' % (datarun.budget, datarun.budget_type))
    print('')

    return run_ids or datarun.id