Ejemplo n.º 1
0
def run(config_filename, verbose, replace, predictions, validate_only):
    # configure logging
    log_filename = 'logs/modeling_{}'.format(
        str(datetime.datetime.now()).replace(' ', '_').replace(':', ''))
    if verbose:
        logging_level = logging.DEBUG
    else:
        logging_level = logging.INFO
    logging.basicConfig(
        format='%(asctime)s %(process)d %(levelname)s: %(message)s',
        level=logging_level,
        handlers=[logging.FileHandler(log_filename),
                  logging.StreamHandler()])

    #    config_filename = 'experiment_config'
    features_directory = 'features'

    # load main experiment config
    with open('config/{}.yaml'.format(config_filename)) as f:
        experiment_config = yaml.load(f)

    # load feature configs and update experiment config with their contents
    all_feature_aggregations = []
    for filename in os.listdir('config/{}/'.format(features_directory)):
        with open('config/{}/{}'.format(features_directory, filename)) as f:
            feature_aggregations = yaml.load(f)
            for aggregation in feature_aggregations:
                all_feature_aggregations.append(aggregation)
    experiment_config['feature_aggregations'] = all_feature_aggregations

    with open('config/db_default_profile.json') as f:
        DB_CONFIG = json.load(f)

    db_engine = create_engine(
        f"postgresql://{DB_CONFIG['user']}:{DB_CONFIG['pass']}@{DB_CONFIG['host']}/{DB_CONFIG['db']}"
    )

    experiment = MultiCoreExperiment(
        config=experiment_config,
        db_engine=db_engine,
        project_path=PROJECT_PATH,
        replace=replace,
        n_db_processes=4,
        n_processes=40,
        save_predictions=predictions,
    )
    experiment.validate()
    if not validate_only:
        experiment.run()
Ejemplo n.º 2
0
def run_experiment(experiment_file, output_path, replace):

    start_time = datetime.datetime.now()
    logging.info(
        f"Reading the file experiment configuration from {experiment_file}")

    # Load the experiment configuration file
    s3 = s3fs.S3FileSystem()
    with s3.open(experiment_file, 'rb') as f:
        experiment_config = yaml.load(f.read())

    host = os.environ['POSTGRES_HOST']
    user = os.environ['POSTGRES_USER']
    db = os.environ['POSTGRES_DB']
    password = os.environ['POSTGRES_PASSWORD']
    port = os.environ['POSTGRES_PORT']

    db_url = f"postgresql://{user}:{password}@{host}:{port}/{db}"

    logging.info(
        f"Using the database: postgresql://{user}:XXXXX@{host}:{port}/{db}")

    try:
        n_processes = int(os.environ.get('NUMBER_OF_PROCESSES', 12))
    except ValueError:
        n_processes = 12
    try:
        n_db_processes = int(os.environ.get('NUMBER_OF_DB_PROCESSES', 6))
    except ValueError:
        n_db_processes = 6

    logging.info(f"The experiment will use {n_processes} cores in the host")

    logging.info(
        f"The output (matrices and models) of this experiment will be stored in {output_path}"
    )

    logging.info(
        f"The experiment will utilize any preexisting matrix or model: {not replace}"
    )

    logging.info(f"Creating experiment object")

    experiment = MultiCoreExperiment(
        n_processes=n_processes,
        n_db_processes=n_db_processes,
        config=experiment_config,
        db_engine=triage.create_engine(db_url),
        project_path=output_path,
        #matrix_storage_class=HDFMatrixStore,
        replace=replace,
        cleanup=True,
        cleanup_timeout=2)

    logging.info(
        f"Experiment created: all the file permissions, and db connections are OK"
    )

    logging.info(f"Validating the experiment")

    experiment.validate()

    logging.info("""
           The experiment configuration doesn't contain any obvious errors.
           Any error that occurs possibly is related to number of columns or collision in
           the column names, both due to PostgreSQL limitations.
    """)

    logging.debug(f"Experiment configuration: {experiment.config}")

    experiment_name = os.path.splitext(os.path.split(experiment_file)[1])[0]

    logging.info(f"Running the experiment: {experiment_name}")

    experiment.run()

    end_time = datetime.datetime.now()

    logging.info(
        f"Experiment {experiment_file} completed in {end_time - start_time} seconds"
    )

    logging.info("Done!")