Example #1
def split_single_telescope_data(input_path, output_basename, fmt, inkey, key, fraction, name):

    if fmt in ['hdf5', 'hdf', 'h5']:
        data = read_data(input_path, key=inkey)
    elif fmt == 'csv':
        data = read_data(input_path)

    assert len(fraction) == len(name), 'You must give a name for each fraction'

    if sum(fraction) != 1:
        warnings.warn('Fractions do not sum up to 1')

    ids = data.index.values
    n_total = len(data)

    log.info('Found a total of {} single-telescope events in the file'.format(len(data)))

    num_ids = split_indices(ids, n_total, fractions=fraction)

    for n, part_name in zip(num_ids, name):
        selected_ids = np.random.choice(ids, size=n, replace=False)
        selected_data = data.loc[selected_ids]

        if fmt in ['hdf5', 'hdf', 'h5']:
            path = output_basename + '_' + part_name + '.hdf5'
            log.info('Writing {} single-telescope events to: {}'.format(n, path))
            write_data(selected_data, path, key=key, use_h5py=True, mode='w')

        elif fmt == 'csv':
            filename = output_basename + '_' + part_name + '.csv'
            log.info('Writing {} single-telescope events to: {}'.format(n, filename))
            selected_data.to_csv(filename, index=False)

        data = data.loc[list(set(data.index.values) - set(selected_data.index.values))]
        ids = data.index.values
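
Examples #1 and #7 both rely on a split_indices helper to turn the requested fractions into absolute event counts. Its implementation is not shown on this page; a minimal sketch, assuming the last partition absorbs any rounding remainder (ids is accepted only to match the call sites), could look like this:

def split_indices(ids, n_total, fractions):
    """Turn fractions into integer counts that sum to n_total."""
    num_ids = [int(round(n_total * fraction)) for fraction in fractions]
    # let the last partition absorb the rounding leftovers
    num_ids[-1] = n_total - sum(num_ids[:-1])
    return num_ids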
Example #2
def test_write_data_h5py():
    from fact.io import write_data

    df = pd.DataFrame({
        'x': np.random.normal(size=50),
        'N': np.random.randint(0, 10, dtype='uint8')
    })

    with tempfile.NamedTemporaryFile(suffix='.hdf5') as f:
        write_data(df, f.name, use_h5py=True)
Example #3
def test_write_data_root():
    from fact.io import write_data

    df = pd.DataFrame({
        'x': np.random.normal(size=50),
        'N': np.random.randint(0, 10, dtype='uint8')
    })

    with pytest.raises(IOError):
        with tempfile.NamedTemporaryFile(suffix='.root') as f:
            write_data(df, f.name)
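
Taken together, Examples #2 and #3 suggest that write_data dispatches on the output file's extension and raises an IOError for formats it cannot write, such as .root. A hypothetical sketch of that dispatch (not the confirmed fact.io implementation):

import os

def write_data_sketch(df, file_path, key='data', **kwargs):
    # choose the writer based on the file extension
    name, extension = os.path.splitext(file_path)
    if extension in ('.hdf', '.hdf5', '.h5'):
        df.to_hdf(file_path, key=key, **kwargs)
    elif extension == '.csv':
        df.to_csv(file_path, index=False, **kwargs)
    else:
        raise IOError('cannot write file with extension {}'.format(extension))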
Example #4
def main(configuration_path, input_path, output_path, key, verbose):
    '''
    Apply cuts given in CONFIGURATION_PATH to the data in INPUT_PATH and
    write the result to OUTPUT_PATH.

    example:
    ```
    selection:
        length:
          - '<'
          - 0.06
    ```
    '''
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
    log = logging.getLogger()

    with open(configuration_path) as f:
        config = yaml.safe_load(f)

    selection = config.get('selection', {})

    array_events = read_data(input_path, key='array_events')
    telescope_events = read_data(input_path, key='telescope_events')

    mask_telescope = create_mask_h5py(input_path,
                                      selection,
                                      key='telescope_events')
    selected_telescope_events = telescope_events[mask_telescope]

    array_events['idx'] = array_events.index
    merge = pd.merge(selected_telescope_events[['run_id', 'array_event_id']],
                     array_events[['run_id', 'array_event_id', 'idx']],
                     on=['run_id', 'array_event_id'],
                     how='left')
    selected_array_events = array_events[array_events.idx.isin(merge.idx)]

    write_data(selected_telescope_events,
               output_path,
               key='telescope_events',
               use_h5py=True,
               mode='w')
    write_data(selected_array_events,
               output_path,
               key='array_events',
               use_h5py=True,
               mode='a')

    with h5py.File(input_path, mode='r') as infile, \
            h5py.File(output_path, 'r+') as outfile:
        if 'runs' in infile.keys():
            log.info('Copying runs group to output file')
            infile.copy('/runs', outfile['/'])
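
Example #4 uses create_mask_h5py to turn the YAML selection into a boolean event mask. Assuming the HDF5 file stores one dataset per column under the group named by key, a sketch that ANDs all configured cuts together might look like this (the operator table and file layout are assumptions, not the library's documented API):

import operator

import h5py
import numpy as np

OPERATORS = {
    '<': operator.lt, '<=': operator.le,
    '>': operator.gt, '>=': operator.ge,
    '==': operator.eq, '!=': operator.ne,
}

def create_mask_h5py_sketch(input_path, selection, key='telescope_events'):
    with h5py.File(input_path, 'r') as f:
        group = f[key]
        # start with all events selected, then AND in each cut
        n_events = len(group[next(iter(group))])
        mask = np.ones(n_events, dtype=bool)
        for column, (operator_name, value) in selection.items():
            mask &= OPERATORS[operator_name](group[column][:], value)
    return mask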
Example #5
def test_read_data_h5py():
    '''
    Create a h5py hdf5 file from a dataframe and read it back.
    '''
    from fact.io import write_data, read_data

    df = pd.DataFrame({
        'x': np.random.normal(size=50).astype('float32'),
        'N': np.random.randint(0, 10, dtype='uint8', size=50)
    }).sort_index(axis=1)

    with tempfile.NamedTemporaryFile(suffix='.hdf5') as f:
        write_data(df, f.name, use_h5py=True, key='lecker_daten')

        df_from_file = read_data(f.name, key='lecker_daten').sort_index(axis=1)
        assert set(df.columns) == set(df_from_file.columns)
        assert df.equals(df_from_file)
Example #6
def test_read_data_csv():
    '''
    Write a csv file from a dataframe and then read it back again.
    '''
    from fact.io import write_data, read_data

    df = pd.DataFrame({
        'x': np.random.normal(size=50).astype('float32'),
        'N': np.random.randint(0, 10, dtype='uint8', size=50)
    })

    with tempfile.NamedTemporaryFile(suffix='.csv') as f:
        write_data(df, f.name)

        dtypes = {'x': 'float32', 'N': 'uint8'}
        df_from_file = read_data(f.name, dtype=dtypes)

        assert df.equals(df_from_file)
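
The explicit dtype argument in Example #6 is what makes the round trip exact: CSV stores plain text, so without the hints pandas would read x back as float64 and N as int64 and df.equals would fail. A quick self-contained check:

import io

import pandas as pd

df = pd.DataFrame({'x': [0.5, 1.5], 'N': [3, 7]})
buffer = io.StringIO()
df.to_csv(buffer, index=False)

buffer.seek(0)
print(pd.read_csv(buffer).dtypes)   # float64 / int64 without hints

buffer.seek(0)
print(pd.read_csv(buffer, dtype={'x': 'float32', 'N': 'uint8'}).dtypes)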
Example #7
def split_telescope_data(input_path, output_basename, fraction, name):

    array_events = read_data(input_path, key='array_events')
    telescope_events = read_data(input_path, key='telescope_events')
    runs = read_data(input_path, key='runs')

    # split by runs
    ids = set(runs.run_id)
    log.debug(f'All runs: {ids}')
    n_total = len(ids)

    log.info(f'Found a total of {n_total} runs in the file')
    num_runs = split_indices(ids, n_total, fractions=fraction)

    for n, part_name in zip(num_runs, name):
        selected_run_ids = np.random.choice(list(ids), size=n, replace=False)
        selected_runs = runs[runs.run_id.isin(selected_run_ids)]
        selected_array_events = array_events[array_events.run_id.isin(selected_run_ids)]
        selected_telescope_events = telescope_events[telescope_events.run_id.isin(selected_run_ids)]

        path = output_basename + '_' + part_name + '.hdf5'
        log.info('Writing {} runs to: {}'.format(n, path))
        write_data(selected_runs, path, key='runs', use_h5py=True, mode='w')
        write_data(selected_array_events, path, key='array_events',
                   use_h5py=True, mode='a')
        write_data(selected_telescope_events, path, key='telescope_events',
                   use_h5py=True, mode='a')
        log.debug(f'selected runs {set(selected_run_ids)}')
        log.debug(f'Runs minus selected runs {ids - set(selected_run_ids)}')
        ids = ids - set(selected_run_ids)
Example #8
def apply(ctx, out_file, data, number_of_images):
    import fact.io as fio
    network = ctx.obj['network']
    model = load_model(network)

    p = '{}.index'.format(network)
    if not os.path.exists(p):
        print('No model trained yet. Do so first.')
        return

    if os.path.exists(out_file):
        click.confirm(
            'Do you want to overwrite existing file {}?'.format(out_file),
            abort=True)
        os.remove(out_file)

    if data == 'crab':
        df = image_io.apply_to_observation_data(model)

    elif data == 'gamma':
        df = image_io.apply_to_mc(model,
                                  path='./data/gamma_images.hdf5',
                                  N=number_of_images)
        shower_truth = fio.read_data('./data/gamma_images.hdf5', key='showers')
        fio.write_data(shower_truth,
                       file_path=out_file,
                       key='showers',
                       use_h5py=True)

    elif data == 'proton':
        df = image_io.apply_to_mc(model,
                                  path='./data/proton_images.hdf5',
                                  N=number_of_images)
        shower_truth = fio.read_data('./data/proton_images.hdf5',
                                     key='showers')
        fio.write_data(shower_truth,
                       file_path=out_file,
                       key='showers',
                       use_h5py=True)

    print('Writing {} events to file {}'.format(len(df), out_file))
    fio.write_data(df, out_file, key='events')
Example #9
def main(configuration_path, signal_path, predictions_path, disp_model_path, sign_model_path, key, verbose):
    '''
    Train two learners to be able to reconstruct the source position.
    One regressor for disp and one classifier for the sign of delta.

    Both pmml and pickle format are supported for the output.

    CONFIGURATION_PATH: Path to the config yaml file

    SIGNAL_PATH: Path to the signal data

    PREDICTIONS_PATH : path to the file where the mc predictions are stored.

    DISP_MODEL_PATH: Path to save the disp model to.

    SIGN_MODEL_PATH: Path to save the sign model to.
        Allowed extensions are .pkl and .pmml.
        If extension is .pmml, then both pmml and pkl file will be saved
    '''
    log = setup_logging(verbose=verbose)

    config = AICTConfig.from_yaml(configuration_path)
    model_config = config.disp

    np.random.seed(config.seed)

    disp_regressor = model_config.disp_regressor
    sign_classifier = model_config.sign_classifier

    disp_regressor.random_state = config.seed
    sign_classifier.random_state = config.seed

    log.info('Loading data')
    df = read_telescope_data(
        signal_path, config,
        model_config.columns_to_read_train,
        feature_generation_config=model_config.feature_generation,
        n_sample=model_config.n_signal
    )
    log.info('Total number of events: {}'.format(len(df)))

    log.info(
        'Using coordinate transformations for %s',
        model_config.coordinate_transformation
    )

    df = convert_units(df, model_config)
    source_x, source_y = horizontal_to_camera(df, model_config)

    log.info('Using projected disp: {}'.format(model_config.project_disp))
    df['true_disp'], df['true_sign'] = calc_true_disp(
        source_x, source_y,
        df[model_config.cog_x_column], df[model_config.cog_y_column],
        df[model_config.delta_column],
        project_disp=model_config.project_disp,
    )

    # generate features if given in config
    if model_config.feature_generation:
        feature_generation(df, model_config.feature_generation, inplace=True)

    df_train = convert_to_float32(df[config.disp.features])
    df_train.dropna(how='any', inplace=True)

    log.info('Events after nan-dropping: {} '.format(len(df_train)))

    target_disp = df['true_disp'].loc[df_train.index]
    target_sign = df['true_sign'].loc[df_train.index]

    # load optional columns if available to be able to make performance plots
    # vs true energy / size
    if config.true_energy_column is not None:
        true_energy = df.loc[df_train.index, config.true_energy_column].to_numpy()
    if config.size_column is not None:
        size = df.loc[df_train.index, config.size_column].to_numpy()

    if model_config.log_target is True:
        target_disp = np.log(target_disp)

    log.info('Starting {} fold cross validation... '.format(
        model_config.n_cross_validations
    ))
    scores_disp = []
    scores_sign = []
    cv_predictions = []

    kfold = model_selection.KFold(
        n_splits=model_config.n_cross_validations,
        shuffle=True,
        random_state=config.seed,
    )

    total = model_config.n_cross_validations
    for fold, (train, test) in enumerate(tqdm(kfold.split(df_train.values), total=total)):

        cv_x_train, cv_x_test = df_train.values[train], df_train.values[test]

        cv_disp_train, cv_disp_test = target_disp.values[train], target_disp.values[test]
        cv_sign_train, cv_sign_test = target_sign.values[train], target_sign.values[test]

        disp_regressor.fit(cv_x_train, cv_disp_train)
        cv_disp_prediction = disp_regressor.predict(cv_x_test)

        if model_config.log_target is True:
            cv_disp_test = np.exp(cv_disp_test)
            cv_disp_prediction = np.exp(cv_disp_prediction)

        sign_classifier.fit(cv_x_train, cv_sign_train)
        # scale proba for positive sign to [-1, 1], so it's a nice score for the sign
        # where values close to -1 mean high confidence for - and values close to 1 mean
        # high confidence for +
        cv_sign_score = 2 * sign_classifier.predict_proba(cv_x_test)[:, 1] - 1
        cv_sign_prediction = np.where(cv_sign_score < 0, -1.0, 1.0)

        scores_disp.append(metrics.r2_score(cv_disp_test, cv_disp_prediction))
        scores_sign.append(metrics.accuracy_score(cv_sign_test, cv_sign_prediction))

        cv_df = pd.DataFrame({
            'disp': cv_disp_test,
            'disp_prediction': cv_disp_prediction,
            'sign': cv_sign_test,
            'sign_prediction': cv_sign_prediction,
            'sign_score': cv_sign_score,
            'cv_fold': fold,
        })
        if config.true_energy_column is not None:
            cv_df[config.true_energy_column] = true_energy[test]
        if config.size_column is not None:
            cv_df[config.size_column] = size[test]
        cv_predictions.append(cv_df)

    predictions_df = pd.concat(cv_predictions, ignore_index=True)

    log.info('writing predictions from cross validation')
    write_data(predictions_df, predictions_path, mode='w')

    scores_disp = np.array(scores_disp)
    scores_sign = np.array(scores_sign)
    log.info('Cross validated R^2 scores for disp: {}'.format(scores_disp))
    log.info('Mean R^2 score from CV: {:0.4f} ± {:0.4f}'.format(
        scores_disp.mean(), scores_disp.std()
    ))

    log.info('Cross validated accuracy for the sign: {}'.format(scores_sign))
    log.info('Mean accuracy from CV: {:0.4f} ± {:0.4f}'.format(
        scores_sign.mean(), scores_sign.std()
    ))

    log.info('Building new model on complete data set...')
    # set random seed again to make sure different settings
    # for n_cross_validations don't change the final model
    np.random.seed(config.seed)
    disp_regressor.random_state = config.seed
    sign_classifier.random_state = config.seed

    disp_regressor.fit(df_train.values, target_disp.values)
    sign_classifier.fit(df_train.values, target_sign.values)

    log.info('Pickling disp model to {} ...'.format(disp_model_path))
    save_model(
        disp_regressor,
        feature_names=list(df_train.columns),
        model_path=disp_model_path,
        label_text='abs_disp',
    )
    log.info('Pickling sign model to {} ...'.format(sign_model_path))
    save_model(
        sign_classifier,
        feature_names=list(df_train.columns),
        model_path=sign_model_path,
        label_text='sign_disp',
    )
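
The sign-score transformation in Example #9 can be checked in isolation: probabilities for the positive class in [0, 1] map linearly onto [-1, 1], and the sign of the score is the predicted class.

import numpy as np

proba = np.array([0.1, 0.5, 0.9])            # P(sign = +1) from predict_proba
score = 2 * proba - 1                        # -> [-0.8, 0.0, 0.8]
prediction = np.where(score < 0, -1.0, 1.0)  # -> [-1.0, 1.0, 1.0]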
Example #10
def main(configuration_path, signal_path, predictions_path, disp_model_path,
         sign_model_path, key, verbose, column_name):
    '''
    Train two learners to be able to reconstruct the source position.
    One regressor for disp and one classifier for the sign of delta.

    Both pmml and pickle format are supported for the output.

    CONFIGURATION_PATH: Path to the config yaml file

    SIGNAL_PATH: Path to the signal data

    PREDICTIONS_PATH : path to the file where the mc predictions are stored.

    DISP_MODEL_PATH: Path to save the disp model to.

    SIGN_MODEL_PATH: Path to save the sign model to.
        Allowed extensions are .pkl and .pmml.
        If extension is .pmml, then both pmml and pkl file will be saved
    '''

    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
    log = logging.getLogger()

    config = AICTConfig.from_yaml(configuration_path)
    model_config = config.disp

    disp_regressor = model_config.disp_regressor
    sign_classifier = model_config.sign_classifier

    disp_regressor.random_state = config.seed
    sign_classifier.random_state = config.seed

    columns = model_config.columns_to_read_train
    columns.append(config.energy.target_column)
    columns.append('focal_length')

    log.info('Loading data')
    df = read_telescope_data(
        signal_path, config,
        columns,
        feature_generation_config=model_config.feature_generation,
        n_sample=model_config.n_signal
    )

    log.info('Total number of events: {}'.format(len(df)))

    source_x, source_y = horizontal_to_camera(
        az=df[model_config.source_az_column],
        alt=df[model_config.source_alt_column],
        az_pointing=df[model_config.pointing_az_column],
        alt_pointing=df[model_config.pointing_alt_column],
        focal_length=df['focal_length']
    )

    df['true_disp'], df['true_sign'] = calc_true_disp(
        source_x, source_y,
        df[model_config.cog_x_column], df[model_config.cog_y_column],
        df[model_config.delta_column],
    )

    # generate features if given in config
    if model_config.feature_generation:
        feature_generation(df, model_config.feature_generation, inplace=True)

    df_train = convert_to_float32(df)
    df_train.dropna(how='any', inplace=True)
    mc_energies = df_train[config.energy.target_column]
    df_train = df_train[config.disp.features]

    df['prediction_disp'] = np.full(len(df), np.nan)

    log.info('Events after nan-dropping: {} '.format(len(df_train)))

    target_disp = df['true_disp'].loc[df_train.index]
    target_sign = df['true_sign'].loc[df_train.index]

    log.info('Starting {} fold cross validation... '.format(
        model_config.n_cross_validations
    ))

    scores_disp = []
    scores_sign = []
    cv_predictions = []

    kfold = model_selection.KFold(
        n_splits=model_config.n_cross_validations,
        shuffle=True,
        random_state=config.seed,
    )

    total = model_config.n_cross_validations
    for fold, (train, test) in enumerate(tqdm(kfold.split(df_train.values), total=total)):

        cv_x_train, cv_x_test = df_train.values[train], df_train.values[test]

        cv_disp_train, cv_disp_test = target_disp.values[train], target_disp.values[test]
        cv_sign_train, cv_sign_test = target_sign.values[train], target_sign.values[test]

        disp_regressor.fit(cv_x_train, cv_disp_train)
        cv_disp_prediction = disp_regressor.predict(cv_x_test)

        sign_classifier.fit(cv_x_train, cv_sign_train)
        cv_sign_prediction = sign_classifier.predict(cv_x_test)
        cv_sign_proba = sign_classifier.predict_proba(cv_x_test)[:, 1]

        scores_disp.append(metrics.r2_score(cv_disp_test, cv_disp_prediction))
        scores_sign.append(metrics.accuracy_score(cv_sign_test, cv_sign_prediction))

        # `test` is positional within df_train, so map it back to df's index
        # before storing the fold's predictions
        df.loc[df_train.index[test], 'prediction_disp'] = cv_disp_prediction

        cv_predictions.append(pd.DataFrame({
            'disp': cv_disp_test,
            'disp_prediction': cv_disp_prediction,
            'sign': cv_sign_test,
            'sign_prediction': cv_sign_prediction,
            'sign_probabilities': cv_sign_proba,
            'mc_energy': mc_energies.values[test],
            'cv_fold': fold
        }))

    predictions_df = pd.concat(cv_predictions, ignore_index=True)

    log.info('writing predictions from cross validation')
    write_data(predictions_df, predictions_path, mode='w')

    scores_disp = np.array(scores_disp)
    scores_sign = np.array(scores_sign)
    log.info('Cross validated R^2 scores for disp: {}'.format(scores_disp))
    log.info('Mean R^2 score from CV: {:0.4f} ± {:0.4f}'.format(
        scores_disp.mean(), scores_disp.std()
    ))

    log.info('Cross validated accuracy for the sign: {}'.format(scores_sign))
    log.info('Mean accuracy from CV: {:0.4f} ± {:0.4f}'.format(
        scores_sign.mean(), scores_sign.std()
    ))

    log.info('Writing new data set with predictions column')

    with h5py.File(signal_path, 'r+') as f:
        append_to_h5py(
            f, df.prediction_disp,
            config.telescope_events_key,
            column_name
        )

    log.info('Building new model on complete data set...')
    # set random seed again to make sure different settings
    # for n_cross_validations don't change the final model
    np.random.seed(config.seed)
    disp_regressor.random_state = config.seed
    sign_classifier.random_state = config.seed

    disp_regressor.fit(df_train.values, target_disp.values)
    sign_classifier.fit(df_train.values, target_sign.values)

    log.info('Pickling disp model to {} ...'.format(disp_model_path))
    pickle_model(
        disp_regressor,
        feature_names=list(df_train.columns),
        model_path=disp_model_path,
        label_text='disp',
    )
    log.info('Pickling sign model to {} ...'.format(sign_model_path))
    pickle_model(
        sign_classifier,
        feature_names=list(df_train.columns),
        model_path=sign_model_path,
        label_text='disp',
    )
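
Examples #10 and #16 write the per-fold predictions back into the input file through append_to_h5py, of which only the call site is visible here. A hypothetical sketch, assuming one dataset per column inside the events group:

import h5py
import numpy as np

def append_to_h5py_sketch(f, values, group_key, column_name):
    group = f.require_group(group_key)
    if column_name in group:
        del group[column_name]  # replace an existing column of the same name
    group.create_dataset(column_name, data=np.asarray(values))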
Example #11
def main(configuration_path, signal_path, predictions_path, model_path, verbose):
    '''
    Train an energy regressor on simulated gammas.
    Both pmml and pickle format are supported for the output.

    CONFIGURATION_PATH: Path to the config yaml file

    SIGNAL_PATH: Path to the signal data

    PREDICTIONS_PATH : path to the file where the mc predictions are stored.

    MODEL_PATH: Path to save the model to.
        Allowed extensions are .pkl and .pmml.
        If extension is .pmml, then both pmml and pkl file will be saved
    '''
    logging.getLogger().setLevel(logging.DEBUG if verbose else logging.INFO)

    config = AICTConfig.from_yaml(configuration_path)
    model_config = config.energy

    df = read_telescope_data(
        signal_path, config, model_config.columns_to_read_train,
        feature_generation_config=model_config.feature_generation,
        n_sample=model_config.n_signal
    )

    log.info('Total number of events: {}'.format(len(df)))

    df_train = convert_to_float32(df[model_config.features])
    df_train.dropna(how='any', inplace=True)

    log.debug('Events after nan-dropping: {} '.format(len(df_train)))

    target = df[model_config.target_column].loc[df_train.index]
    target.name = 'true_energy'

    if model_config.log_target is True:
        target = np.log(target)

    n_cv = model_config.n_cross_validations
    regressor = model_config.model
    log.info('Starting {} fold cross validation... '.format(n_cv))
    scores = []
    cv_predictions = []

    kfold = model_selection.KFold(n_splits=n_cv, shuffle=True, random_state=config.seed)

    for fold, (train, test) in enumerate(tqdm(kfold.split(df_train.values), total=n_cv)):

        cv_x_train, cv_x_test = df_train.values[train], df_train.values[test]
        cv_y_train, cv_y_test = target.values[train], target.values[test]

        regressor.fit(cv_x_train, cv_y_train)
        cv_y_prediction = regressor.predict(cv_x_test)

        if model_config.log_target is True:
            cv_y_test = np.exp(cv_y_test)
            cv_y_prediction = np.exp(cv_y_prediction)

        scores.append(metrics.r2_score(cv_y_test, cv_y_prediction))

        cv_predictions.append(pd.DataFrame({
            'label': cv_y_test,
            'label_prediction': cv_y_prediction,
            'cv_fold': fold
        }))

    predictions_df = pd.concat(cv_predictions, ignore_index=True)

    log.info('writing predictions from cross validation')
    write_data(predictions_df, predictions_path, mode='w')

    scores = np.array(scores)
    log.info('Cross validated R^2 scores: {}'.format(scores))
    log.info('Mean R^2 score from CV: {:0.4f} ± {:0.4f}'.format(
        scores.mean(), scores.std()
    ))

    log.info('Building new model on complete data set...')
    # set random seed again to make sure different settings
    # for n_cross_validations don't change the final model
    np.random.seed(config.seed)
    regressor.random_state = config.seed

    regressor.fit(df_train.values, target.values)

    log.info('Pickling model to {} ...'.format(model_path))
    pickle_model(
        regressor,
        feature_names=list(df_train.columns),
        model_path=model_path,
        label_text='estimated_energy',
    )
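
The log_target option in Example #11 trains the regressor on log-energies and exponentiates both truth and prediction before scoring, so the R^2 is computed in linear energy. A toy round trip on hypothetical data:

import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)
energy = np.logspace(1, 3, 200)                    # hypothetical true energies
x = np.log(energy).reshape(-1, 1) + rng.normal(0, 0.1, (200, 1))

regressor = LinearRegression().fit(x, np.log(energy))  # fit in log space
predicted_energy = np.exp(regressor.predict(x))        # back to linear energy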
Example #12
def main(configuration_path, signal_path, background_path, predictions_path, model_path, verbose):
    '''
    Train a classifier on signal and background monte carlo data and write the model
    to MODEL_PATH in pmml or pickle format.

    CONFIGURATION_PATH: Path to the config yaml file

    SIGNAL_PATH: Path to the signal data

    BACKGROUND_PATH: Path to the background data

    PREDICTIONS_PATH : path to the file where the mc predictions are stored.

    MODEL_PATH: Path to save the model to. Allowed extensions are .pkl and .pmml.
        If extension is .pmml, then both pmml and pkl file will be saved
    '''

    logging.getLogger().setLevel(logging.DEBUG if verbose else logging.INFO)

    check_extension(predictions_path)
    check_extension(model_path, allowed_extensions=['.pmml', '.pkl'])

    config = AICTConfig.from_yaml(configuration_path)
    model_config = config.separator

    log.info('Loading signal data')
    df_signal = read_telescope_data(
        signal_path, config, model_config.columns_to_read_train,
        feature_generation_config=model_config.feature_generation,
        n_sample=model_config.n_signal
    )
    df_signal['label_text'] = 'signal'
    df_signal['label'] = 1

    log.info('Loading background data')
    df_background = read_telescope_data(
        background_path, config, model_config.columns_to_read_train,
        feature_generation_config=model_config.feature_generation,
        n_sample=model_config.n_background
    )
    df_background['label_text'] = 'background'
    df_background['label'] = 0

    df_full = pd.concat([df_background, df_signal], ignore_index=True)

    df_training = convert_to_float32(df_full[model_config.features])
    log.debug('Total training events: {}'.format(len(df_training)))

    df_training.dropna(how='any', inplace=True)
    log.debug('Training events after dropping nans: {}'.format(len(df_training)))

    label = df_full.loc[df_training.index, 'label']

    n_gammas = len(label[label == 1])
    n_protons = len(label[label == 0])
    log.info('Training classifier with {} background and {} signal events'.format(
        n_protons, n_gammas
    ))
    log.debug(model_config.features)

    # save prediction_path for each cv iteration
    cv_predictions = []

    # iterate over test and training sets
    X = df_training.values
    y = label.values
    n_cross_validations = model_config.n_cross_validations
    classifier = model_config.model

    log.info('Starting {} fold cross validation... '.format(n_cross_validations))

    stratified_kfold = model_selection.StratifiedKFold(
        n_splits=n_cross_validations, shuffle=True, random_state=config.seed
    )

    aucs = []
    cv_it = stratified_kfold.split(X, y)
    for fold, (train, test) in enumerate(tqdm(cv_it, total=n_cross_validations)):
        # select data
        xtrain, xtest = X[train], X[test]
        ytrain, ytest = y[train], y[test]

        # fit and predict
        classifier.fit(xtrain, ytrain)

        y_probas = classifier.predict_proba(xtest)[:, 1]
        y_prediction = classifier.predict(xtest)

        cv_predictions.append(pd.DataFrame({
            'label': ytest,
            'label_prediction': y_prediction,
            'probabilities': y_probas,
            'cv_fold': fold,
        }))
        aucs.append(metrics.roc_auc_score(ytest, y_probas))

    log.info('Mean AUC ROC : {}'.format(np.array(aucs).mean()))

    predictions_df = pd.concat(cv_predictions, ignore_index=True)
    log.info('Writing predictions from cross validation')
    write_data(predictions_df, predictions_path, mode='w')

    # set random seed again to make sure different settings
    # for n_cross_validations don't change the final model
    np.random.seed(config.seed)
    classifier.random_state = config.seed

    if model_config.calibrate_classifier:
        log.info('Training calibrated classifier')
        classifier = CalibratedClassifierCV(classifier, cv=2, method='sigmoid')
        classifier.fit(X, y)
    else:
        log.info('Training model on complete dataset')
        classifier.fit(X, y)

    log.info('Pickling model to {} ...'.format(model_path))
    pickle_model(
        classifier=classifier,
        model_path=model_path,
        label_text='label',
        feature_names=list(df_training.columns)
    )
Example #13
def main(configuration_path, signal_path, predictions_path, disp_model_path, sign_model_path, key, verbose):
    '''
    Train two learners to be able to reconstruct the source position.
    One regressor for disp and one classifier for the sign of delta.

    Both pmml and pickle format are supported for the output.

    CONFIGURATION_PATH: Path to the config yaml file

    SIGNAL_PATH: Path to the signal data

    PREDICTIONS_PATH : path to the file where the mc predictions are stored.

    DISP_MODEL_PATH: Path to save the disp model to.

    SIGN_MODEL_PATH: Path to save the sign model to.
        Allowed extensions are .pkl and .pmml.
        If extension is .pmml, then both pmml and pkl file will be saved
    '''

    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
    log = logging.getLogger()

    config = AICTConfig.from_yaml(configuration_path)
    model_config = config.disp

    np.random.seed(config.seed)

    disp_regressor = model_config.disp_regressor
    sign_classifier = model_config.sign_classifier

    disp_regressor.random_state = config.seed
    sign_classifier.random_state = config.seed

    log.info('Loading data')
    df = read_telescope_data(
        signal_path, config,
        model_config.columns_to_read_train,
        feature_generation_config=model_config.feature_generation,
        n_sample=model_config.n_signal
    )
    log.info('Total number of events: {}'.format(len(df)))

    source_x, source_y = horizontal_to_camera(
        az=df[model_config.source_az_column],
        zd=df[model_config.source_zd_column],
        az_pointing=df[model_config.pointing_az_column],
        zd_pointing=df[model_config.pointing_zd_column],
    )

    df['true_disp'], df['true_sign'] = calc_true_disp(
        source_x, source_y,
        df[model_config.cog_x_column], df[model_config.cog_y_column],
        df[model_config.delta_column],
    )

    # generate features if given in config
    if model_config.feature_generation:
        feature_generation(df, model_config.feature_generation, inplace=True)

    df_train = convert_to_float32(df[config.disp.features])
    df_train.dropna(how='any', inplace=True)

    log.info('Events after nan-dropping: {} '.format(len(df_train)))

    target_disp = df['true_disp'].loc[df_train.index]
    target_sign = df['true_sign'].loc[df_train.index]

    log.info('Starting {} fold cross validation... '.format(
        model_config.n_cross_validations
    ))
    scores_disp = []
    scores_sign = []
    cv_predictions = []

    kfold = model_selection.KFold(
        n_splits=model_config.n_cross_validations,
        shuffle=True,
        random_state=config.seed,
    )

    total = model_config.n_cross_validations
    for fold, (train, test) in enumerate(tqdm(kfold.split(df_train.values), total=total)):

        cv_x_train, cv_x_test = df_train.values[train], df_train.values[test]

        cv_disp_train, cv_disp_test = target_disp.values[train], target_disp.values[test]
        cv_sign_train, cv_sign_test = target_sign.values[train], target_sign.values[test]

        disp_regressor.fit(cv_x_train, cv_disp_train)
        cv_disp_prediction = disp_regressor.predict(cv_x_test)

        sign_classifier.fit(cv_x_train, cv_sign_train)
        cv_sign_prediction = sign_classifier.predict(cv_x_test)

        scores_disp.append(metrics.r2_score(cv_disp_test, cv_disp_prediction))
        scores_sign.append(metrics.accuracy_score(cv_sign_test, cv_sign_prediction))

        cv_predictions.append(pd.DataFrame({
            'disp': cv_disp_test,
            'disp_prediction': cv_disp_prediction,
            'sign': cv_sign_test,
            'sign_prediction': cv_sign_prediction,
            'cv_fold': fold
        }))

    predictions_df = pd.concat(cv_predictions, ignore_index=True)

    log.info('writing predictions from cross validation')
    write_data(predictions_df, predictions_path, mode='w')

    scores_disp = np.array(scores_disp)
    scores_sign = np.array(scores_sign)
    log.info('Cross validated R^2 scores for disp: {}'.format(scores_disp))
    log.info('Mean R^2 score from CV: {:0.4f} ± {:0.4f}'.format(
        scores_disp.mean(), scores_disp.std()
    ))

    log.info('Cross validated accuracy for the sign: {}'.format(scores_sign))
    log.info('Mean accuracy from CV: {:0.4f} ± {:0.4f}'.format(
        scores_sign.mean(), scores_sign.std()
    ))

    log.info('Building new model on complete data set...')
    # set random seed again to make sure different settings
    # for n_cross_validations don't change the final model
    np.random.seed(config.seed)
    disp_regressor.random_state = config.seed
    sign_classifier.random_state = config.seed

    disp_regressor.fit(df_train.values, target_disp.values)
    sign_classifier.fit(df_train.values, target_sign.values)

    log.info('Pickling disp model to {} ...'.format(disp_model_path))
    pickle_model(
        disp_regressor,
        feature_names=list(df_train.columns),
        model_path=disp_model_path,
        label_text='disp',
    )
    log.info('Pickling sign model to {} ...'.format(sign_model_path))
    pickle_model(
        sign_classifier,
        feature_names=list(df_train.columns),
        model_path=sign_model_path,
        label_text='disp',
    )
Example #14
def write_hdf(data, path, table_name, mode="w", **kwargs):
    write_data(data, path, key=table_name, use_h5py=True, mode=mode, **kwargs)
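
The wrapper in Example #14 just fixes use_h5py=True and renames key to table_name. For instance (assuming fact.io is available), writing one table and then appending a second one to the same file:

import numpy as np
import pandas as pd

df = pd.DataFrame({'x': np.random.normal(size=10)})
write_hdf(df, 'events.hdf5', table_name='events', mode='w')
write_hdf(df, 'events.hdf5', table_name='runs', mode='a')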
Example #15
def main(configuration_path, signal_path, background_path, predictions_path,
         model_path, verbose):
    '''
    Train a classifier on signal and background monte carlo data and write the model
    to MODEL_PATH in pmml or pickle format.

    CONFIGURATION_PATH: Path to the config yaml file

    SIGNAL_PATH: Path to the signal data

    BACKGROUND_PATH: Path to the background data

    PREDICTIONS_PATH : path to the file where the mc predictions are stored.

    MODEL_PATH: Path to save the model to. Allowed extensions are .pkl and .pmml.
        If extension is .pmml, then both pmml and pkl file will be saved
    '''
    log = setup_logging(verbose=verbose)

    check_extension(predictions_path)
    check_extension(model_path, allowed_extensions=['.pmml', '.pkl', '.onnx'])

    config = AICTConfig.from_yaml(configuration_path)
    model_config = config.separator
    label_text = model_config.output_name

    log.info('Loading signal data')
    df_signal = read_telescope_data(
        signal_path,
        config,
        model_config.columns_to_read_train,
        feature_generation_config=model_config.feature_generation,
        n_sample=model_config.n_signal)
    df_signal['label_text'] = 'signal'
    df_signal['label'] = 1

    log.info('Loading background data')
    df_background = read_telescope_data(
        background_path,
        config,
        model_config.columns_to_read_train,
        feature_generation_config=model_config.feature_generation,
        n_sample=model_config.n_background)
    df_background['label_text'] = 'background'
    df_background['label'] = 0

    df = pd.concat([df_background, df_signal], ignore_index=True)

    df_train = convert_to_float32(df[model_config.features])
    log.debug('Total training events: {}'.format(len(df_train)))

    df_train.dropna(how='any', inplace=True)
    log.debug('Training events after dropping nans: {}'.format(len(df_train)))

    label = df.loc[df_train.index, 'label']

    # load optional columns if available to be able to make performance plots
    # vs true energy / size
    if config.true_energy_column is not None:
        true_energy = df.loc[df_train.index,
                             config.true_energy_column].to_numpy()
    if config.size_column is not None:
        size = df.loc[df_train.index, config.size_column].to_numpy()

    n_gammas = len(label[label == 1])
    n_protons = len(label[label == 0])
    log.info(
        'Training classifier with {} background and {} signal events'.format(
            n_protons, n_gammas))
    log.debug(model_config.features)

    # save prediction_path for each cv iteration
    cv_predictions = []

    # iterate over test and training sets
    X = df_train.values
    y = label.values
    n_cross_validations = model_config.n_cross_validations
    classifier = model_config.model

    log.info(
        'Starting {} fold cross validation... '.format(n_cross_validations))

    stratified_kfold = model_selection.StratifiedKFold(
        n_splits=n_cross_validations, shuffle=True, random_state=config.seed)

    aucs = []
    cv_it = stratified_kfold.split(X, y)
    for fold, (train, test) in enumerate(tqdm(cv_it,
                                              total=n_cross_validations)):
        # select data
        xtrain, xtest = X[train], X[test]
        ytrain, ytest = y[train], y[test]

        # fit and predict
        classifier.fit(xtrain, ytrain)

        y_probas = classifier.predict_proba(xtest)[:, 1]

        cv_df = pd.DataFrame({
            'label': ytest,
            model_config.output_name: y_probas,
            'cv_fold': fold,
        })
        if config.true_energy_column is not None:
            cv_df[config.true_energy_column] = true_energy[test]
        if config.size_column is not None:
            cv_df[config.size_column] = size[test]
        cv_predictions.append(cv_df)
        aucs.append(metrics.roc_auc_score(ytest, y_probas))

    aucs = np.array(aucs)
    log.info('Cross-validation ROC-AUCs: {}'.format(aucs))
    log.info('Mean AUC ROC : {:.3f} ± {:.3f}'.format(aucs.mean(), aucs.std()))

    predictions_df = pd.concat(cv_predictions, ignore_index=True)
    log.info('Writing predictions from cross validation')
    write_data(predictions_df, predictions_path, mode='w')

    # set random seed again to make sure different settings
    # for n_cross_validations don't change the final model
    np.random.seed(config.seed)
    classifier.random_state = config.seed

    if model_config.calibrate_classifier:
        log.info('Training calibrated classifier')
        classifier = CalibratedClassifierCV(classifier, cv=2, method='sigmoid')
        classifier.fit(X, y)
    else:
        log.info('Training model on complete dataset')
        classifier.fit(X, y)

    log.info('Saving model to {} ...'.format(model_path))
    save_model(classifier,
               model_path=model_path,
               label_text=label_text,
               feature_names=list(df_train.columns))
Example #16
def main(configuration_path, signal_path, predictions_path, model_path,
         verbose, column_name):
    '''
    Train a x_max regressor.
    Both pmml and pickle format are supported for the output.

    CONFIGURATION_PATH: Path to the config yaml file

    SIGNAL_PATH: Path to the signal data

    PREDICTIONS_PATH : path to the file where the mc predictions are stored.

    MODEL_PATH: Path to save the model to.
        Allowed extensions are .pkl and .pmml.
        If extension is .pmml, then both pmml and pkl file will be saved
    '''
    logging.getLogger().setLevel(logging.DEBUG if verbose else logging.INFO)

    config = AICTConfig.from_yaml(configuration_path)
    model_config = config.x_max

    columns = model_config.columns_to_read_train
    columns.append(config.energy.target_column)

    df = read_telescope_data(
        signal_path,
        config,
        columns,
        feature_generation_config=model_config.feature_generation,
        n_sample=model_config.n_signal)

    log.info('Total number of events: {}'.format(len(df)))

    df_train = convert_to_float32(df)
    df_train.dropna(how='any', inplace=True)
    mc_energies = df_train[config.energy.target_column]
    df_train = df_train[config.x_max.features]

    df['prediction_x_max'] = np.full(len(df), np.nan)

    log.debug('Events after nan-dropping: {} '.format(len(df_train)))

    target = df[model_config.target_column].loc[df_train.index]
    target.name = 'true_x_max'

    n_cross_validations = model_config.n_cross_validations
    regressor = model_config.model
    log.info(
        'Starting {} fold cross validation... '.format(n_cross_validations))

    scores = []
    cv_predictions = []

    kfold = model_selection.KFold(n_splits=n_cross_validations,
                                  shuffle=True,
                                  random_state=config.seed)

    for fold, (train, test) in enumerate(
            tqdm(kfold.split(df_train.values), total=n_cross_validations)):

        cv_x_train, cv_x_test = df_train.values[train], df_train.values[test]
        cv_y_train, cv_y_test = target.values[train], target.values[test]

        regressor.fit(cv_x_train, cv_y_train)
        cv_y_prediction = regressor.predict(cv_x_test)

        if model_config.log_target is True:
            cv_y_test = np.exp(cv_y_test)
            cv_y_prediction = np.exp(cv_y_prediction)

        scores.append(metrics.r2_score(cv_y_test, cv_y_prediction))

        # `test` is positional within df_train, so map it back to df's index
        df.loc[df_train.index[test], 'prediction_x_max'] = cv_y_prediction

        cv_predictions.append(
            pd.DataFrame({
                'x_max': cv_y_test,
                'x_max_prediction': cv_y_prediction,
                'mc_energy': mc_energies.values[test],
                'cv_fold': fold
            }))

    predictions_df = pd.concat(cv_predictions, ignore_index=True)

    log.info('writing predictions from cross validation')
    write_data(predictions_df, predictions_path, mode='w')

    scores = np.array(scores)

    # write telescope data as a new file (equivalent to apply)
    log.info('Writing new data set with predictions column')

    with h5py.File(signal_path, 'r+') as f:
        append_to_h5py(f, df.prediction_x_max, config.telescope_events_key,
                       column_name)

    log.info('Cross validated R^2 scores: {}'.format(scores))
    log.info('Mean R^2 score from CV: {:0.4f} ± {:0.4f}'.format(
        scores.mean(), scores.std()))

    log.info('Building new model on complete data set...')
    # set random seed again to make sure different settings
    # for n_cross_validations don't change the final model
    np.random.seed(config.seed)
    regressor.random_state = config.seed

    regressor.fit(df_train.values, target.values)

    log.info('Pickling model to {} ...'.format(model_path))
    pickle_model(
        regressor,
        feature_names=list(df_train.columns),
        model_path=model_path,
        label_text=column_name,
    )
Example #17
def main(configuration_path, signal_path, background_path, predictions_path,
         model_path, verbose):
    '''
    Train a classifier on signal and background monte carlo data and write the model
    to MODEL_PATH in pmml or pickle format.

    CONFIGURATION_PATH: Path to the config yaml file

    SIGNAL_PATH: Path to the signal data

    BACKGROUND_PATH: Path to the background data

    PREDICTIONS_PATH : path to the file where the mc predictions are stored.

    MODEL_PATH: Path to save the model to. Allowed extensions are .pkl and .pmml.
        If extension is .pmml, then both pmml and pkl file will be saved
    '''

    logging.getLogger().setLevel(logging.DEBUG if verbose else logging.INFO)

    check_extension(predictions_path)
    check_extension(model_path, allowed_extensions=['.pmml', '.pkl'])

    config = AICTConfig.from_yaml(configuration_path)
    model_config = config.separator

    columns = model_config.columns_to_read_train
    columns.append(config.energy.target_column)

    log.info('Loading signal data')
    df_signal = read_telescope_data(
        signal_path,
        config,
        columns,
        feature_generation_config=model_config.feature_generation,
        n_sample=model_config.n_signal)
    df_signal['label_text'] = 'signal'
    df_signal['label'] = 1

    log.info('Loading background data')
    df_background = read_telescope_data(
        background_path,
        config,
        columns,
        feature_generation_config=model_config.feature_generation,
        n_sample=model_config.n_background)
    df_background['label_text'] = 'background'
    df_background['label'] = 0

    df_full = pd.concat([df_background, df_signal], ignore_index=True)

    df_training = df_full.copy()
    log.debug('Total training events: {}'.format(len(df_training)))

    df_training.dropna(how='any', inplace=True)
    log.debug('Training events after dropping nans: {}'.format(
        len(df_training)))

    mc_energies = convert_to_float32(df_training[config.energy.target_column])
    label = df_training['label']
    df_training = convert_to_float32(df_training[model_config.features])

    n_gammas = len(label[label == 1])
    n_protons = len(label[label == 0])
    log.info(
        'Training classifier with {} background and {} signal events'.format(
            n_protons, n_gammas))
    log.debug(model_config.features)

    # save prediction_path for each cv iteration
    cv_predictions = []

    # iterate over test and training sets
    X = df_training.values
    y = label.values
    n_cross_validations = model_config.n_cross_validations
    classifier = model_config.model

    log.info(
        'Starting {} fold cross validation... '.format(n_cross_validations))

    stratified_kfold = model_selection.StratifiedKFold(
        n_splits=n_cross_validations, shuffle=True, random_state=config.seed)

    aucs = []
    cv_it = stratified_kfold.split(X, y)
    for fold, (train, test) in enumerate(tqdm(cv_it,
                                              total=n_cross_validations)):
        # select data
        xtrain, xtest = X[train], X[test]
        ytrain, ytest = y[train], y[test]

        # fit and predict
        classifier.fit(xtrain, ytrain)

        y_probas = classifier.predict_proba(xtest)[:, 1]
        y_prediction = classifier.predict(xtest)

        cv_predictions.append(
            pd.DataFrame({
                'label': ytest,
                'label_prediction': y_prediction,
                'probabilities': y_probas,
                'cv_fold': fold,
                'mc_energy': mc_energies.values[test],
            }))
        aucs.append(metrics.roc_auc_score(ytest, y_probas))

    log.info('Mean AUC ROC : {}'.format(np.array(aucs).mean()))

    predictions_df = pd.concat(cv_predictions, ignore_index=True)
    log.info('Writing predictions from cross validation')
    write_data(predictions_df, predictions_path, mode='w')

    # set random seed again to make sure different settings
    # for n_cross_validations don't change the final model
    np.random.seed(config.seed)
    classifier.random_state = config.seed

    if model_config.calibrate_classifier:
        log.info('Training calibrated classifier')
        classifier = CalibratedClassifierCV(classifier, cv=2, method='sigmoid')
        classifier.fit(X, y)
    else:
        log.info('Training model on complete dataset')
        classifier.fit(X, y)

    log.info('Pickling model to {} ...'.format(model_path))
    pickle_model(classifier=classifier,
                 model_path=model_path,
                 label_text='label',
                 feature_names=list(df_training.columns))
Example #18
def main(configuration_path, signal_path, predictions_path, dxdy_model_path,
         key, verbose):
    """
    Train one learner to be able to reconstruct the source position.
    One regressor for multiple outputs (dx,dy).

    Both pmml and pickle format are supported for the output.

    CONFIGURATION_PATH: Path to the config yaml file

    SIGNAL_PATH: Path to the signal data

    PREDICTIONS_PATH : path to the file where the mc predictions are stored.

    DXDY_MODEL_PATH: Path to save the dxdy model to.

        Allowed extensions are .pkl and .pmml.
        If extension is .pmml, then both pmml and pkl file will be saved
    """
    log = setup_logging(verbose=verbose)

    config = AICTConfig.from_yaml(configuration_path)
    model_config = config.dxdy

    np.random.seed(config.seed)

    dxdy_regressor = model_config.dxdy_regressor
    dxdy_regressor.random_state = config.seed

    log.info("Loading data")
    df = read_telescope_data(
        signal_path,
        config,
        model_config.columns_to_read_train,
        feature_generation_config=model_config.feature_generation,
        n_sample=model_config.n_signal,
    )
    log.info("Total number of events: {}".format(len(df)))
    log.info(
        "Using coordinate transformations for %s",
        config.coordinate_transformation,
    )

    source_x, source_y = horizontal_to_camera(df, config)

    df["true_dx"] = source_x - df[config.cog_x_column]
    df["true_dy"] = source_y - df[config.cog_y_column]

    # generate features if given in config
    if model_config.feature_generation:
        feature_generation(df, model_config.feature_generation, inplace=True)

    df_train = convert_to_float32(df[model_config.features])
    df_train.dropna(how="any", inplace=True)

    log.info("Events after nan-dropping: {} ".format(len(df_train)))

    target_dxdy = df.loc[df_train.index, ["true_dx", "true_dy"]].to_numpy()

    # load optional columns if available to be able to make performance plots
    # vs true energy / size
    if config.true_energy_column is not None:
        true_energy = df.loc[df_train.index,
                             config.true_energy_column].to_numpy()
    if config.size_column is not None:
        size = df.loc[df_train.index, config.size_column].to_numpy()

    log.info("Starting {} fold cross validation... ".format(
        model_config.n_cross_validations))
    scores_dxdy = []
    cv_predictions = []

    kfold = model_selection.KFold(
        n_splits=model_config.n_cross_validations,
        shuffle=True,
        random_state=config.seed,
    )

    total = model_config.n_cross_validations
    for fold, (train, test) in enumerate(
            tqdm(kfold.split(df_train.values), total=total)):

        cv_x_train, cv_x_test = df_train.values[train], df_train.values[test]

        cv_dxdy_train = target_dxdy[train]
        cv_dxdy_test = target_dxdy[test]

        dxdy_regressor.fit(cv_x_train, cv_dxdy_train)
        cv_dxdy_prediction = dxdy_regressor.predict(cv_x_test)

        scores_dxdy.append(metrics.r2_score(cv_dxdy_test, cv_dxdy_prediction))
        cv_df = pd.DataFrame({
            "dx": cv_dxdy_test[:, 0],
            "dy": cv_dxdy_test[:, 1],
            "dx_prediction": cv_dxdy_prediction[:, 0],
            "dy_prediction": cv_dxdy_prediction[:, 1],
            "cv_fold": fold,
        })
        if config.true_energy_column is not None:
            cv_df[config.true_energy_column] = true_energy[test]
        if config.size_column is not None:
            cv_df[config.size_column] = size[test]
        cv_predictions.append(cv_df)

    predictions_df = pd.concat(cv_predictions, ignore_index=True)

    log.info("writing predictions from cross validation")
    write_data(predictions_df, predictions_path, mode="w")

    scores_dxdy = np.array(scores_dxdy)
    log.info("Cross validated R^2 scores for dxdy: {}".format(scores_dxdy))
    log.info("Mean R^2 score from CV: {:0.4f} ± {:0.4f}".format(
        scores_dxdy.mean(), scores_dxdy.std()))

    log.info("Building new model on complete data set...")
    # set random seed again to make sure different settings
    # for n_cross_validations don't change the final model
    np.random.seed(config.seed)
    dxdy_regressor.random_state = config.seed

    dxdy_regressor.fit(df_train.values, target_dxdy)

    log.info("Pickling dxdy model to {} ...".format(dxdy_model_path))
    save_model(
        dxdy_regressor,
        feature_names=list(df_train.columns),
        model_path=dxdy_model_path,
        label_text="dxdy",
    )
Example #19
def main(configuration_path, signal_path, predictions_path, model_path,
         verbose):
    """
    Train an energy regressor simulated gamma.
    Both pmml and pickle format are supported for the output.

    CONFIGURATION_PATH: Path to the config yaml file

    SIGNAL_PATH: Path to the signal data

    PREDICTIONS_PATH : path to the file where the mc predictions are stored.

    MODEL_PATH: Path to save the model to.
        Allowed extensions are .pkl and .pmml.
        If extension is .pmml, then both pmml and pkl file will be saved
    """
    log = setup_logging(verbose=verbose)

    config = AICTConfig.from_yaml(configuration_path)
    model_config = config.energy

    df = read_telescope_data(
        signal_path,
        config,
        model_config.columns_to_read_train,
        feature_generation_config=model_config.feature_generation,
        n_sample=model_config.n_signal,
    )

    log.info("Total number of events: {}".format(len(df)))

    df_train = convert_to_float32(df[model_config.features])
    df_train.dropna(how="any", inplace=True)

    log.debug("Events after nan-dropping: {} ".format(len(df_train)))

    target = df[model_config.target_column].loc[df_train.index]
    target.name = "true_energy"

    if model_config.log_target is True:
        target = np.log(target)

    n_cv = model_config.n_cross_validations
    regressor = model_config.model
    log.info("Starting {} fold cross validation... ".format(n_cv))
    scores = []
    cv_predictions = []

    kfold = model_selection.KFold(n_splits=n_cv,
                                  shuffle=True,
                                  random_state=config.seed)

    for fold, (train, test) in enumerate(
            tqdm(kfold.split(df_train.values), total=n_cv)):

        cv_x_train, cv_x_test = df_train.values[train], df_train.values[test]
        cv_y_train, cv_y_test = target.values[train], target.values[test]

        regressor.fit(cv_x_train, cv_y_train)
        cv_y_prediction = regressor.predict(cv_x_test)

        if model_config.log_target is True:
            cv_y_test = np.exp(cv_y_test)
            cv_y_prediction = np.exp(cv_y_prediction)

        scores.append(metrics.r2_score(cv_y_test, cv_y_prediction))

        cv_df = pd.DataFrame({
            model_config.target_column: cv_y_test,
            model_config.output_name: cv_y_prediction,
            "cv_fold": fold,
        })
        cv_predictions.append(cv_df)

    predictions_df = pd.concat(cv_predictions, ignore_index=True)

    log.info("writing predictions from cross validation")
    write_data(predictions_df, predictions_path, mode="w")

    scores = np.array(scores)
    log.info("Cross validated R^2 scores: {}".format(scores))
    log.info("Mean R^2 score from CV: {:0.4f} ± {:0.4f}".format(
        scores.mean(), scores.std()))

    log.info("Building new model on complete data set...")
    # set random seed again to make sure different settings
    # for n_cross_validations don't change the final model
    np.random.seed(config.seed)
    regressor.random_state = config.seed

    regressor.fit(df_train.values, target.values)

    log.info("Pickling model to {} ...".format(model_path))
    save_model(
        regressor,
        feature_names=list(df_train.columns),
        model_path=model_path,
        label_text=model_config.output_name,
    )
Example #20
def main(configuration_path, input_path1, input_path2, output_path1,
         output_path2):
    '''
    Equalize two datasets in INPUT_PATH1 and INPUT_PATH2 in the feature `intensity`.
    In case of multiple telescopes, the feature is averaged.
    Outputs hdf5 files to OUTPUT_BASENAME_NAME.FORMAT

    Important remark: No run information is stored in the output file,
    since it is no longer valid due to sampling. Therefore, these output files
    should only be used for training.
    '''

    config = AICTConfig.from_yaml(configuration_path)

    array_events1 = read_data(input_path1, key='array_events')
    array_events1['arr_id_unique'] = array_events1.index
    telescope_events1 = read_data(input_path1, key='telescope_events')
    telescope_events1['tel_id_unique'] = telescope_events1.index
    df1 = pd.merge(array_events1,
                   telescope_events1,
                   on=['run_id', 'array_event_id'],
                   how='left')

    array_events2 = read_data(input_path2, key='array_events')
    array_events2['arr_id_unique'] = array_events2.index
    telescope_events2 = read_data(input_path2, key='telescope_events')
    telescope_events2['tel_id_unique'] = telescope_events2.index
    df2 = pd.merge(array_events2,
                   telescope_events2,
                   on=['run_id', 'array_event_id'])

    print('Number of events before: ', '\nFile 1: ', len(array_events1),
          '\nFile 2: ', len(array_events2))

    if config.has_multiple_telescopes:
        feature = 'average_intensity'
    else:
        feature = 'intensity'

    minimum = np.min([np.nanmin(df1[feature]), np.nanmin(df2[feature])])
    maximum = np.max([np.nanmax(df1[feature]), np.nanmax(df2[feature])])

    minimum_log = np.log10(minimum)
    maximum_log = np.log10(maximum)

    binning = np.logspace(minimum_log, maximum_log,
                          int((maximum_log - minimum_log) / 0.05))

    arr_ids1 = np.array([])
    arr_ids2 = np.array([])

    for start, end in zip(binning[:-1], binning[1:]):

        ids1 = df1[(df1[feature] > start) & (df1[feature] < end)].arr_id_unique
        ids2 = df2[(df2[feature] > start) & (df2[feature] < end)].arr_id_unique

        if len(ids1) < len(ids2):
            arr_ids1 = np.append(arr_ids1, ids1)
            arr_ids2 = np.append(
                arr_ids2, np.random.choice(ids2, size=len(ids1),
                                           replace=False))

        else:
            arr_ids2 = np.append(arr_ids2, ids2)
            arr_ids1 = np.append(
                arr_ids1, np.random.choice(ids1, size=len(ids2),
                                           replace=False))

    print('Number of events after: ', '\nFile 1: ', len(arr_ids1),
          '\nFile 2: ', len(arr_ids2))

    tel_ids1 = df1[df1.arr_id_unique.isin(arr_ids1)].tel_id_unique
    selected_telescope_events1 = telescope_events1[
        telescope_events1.tel_id_unique.isin(tel_ids1)]
    selected_telescope_events1 = selected_telescope_events1.drop(columns=['tel_id_unique'])
    write_data(selected_telescope_events1,
               output_path1,
               key='telescope_events',
               use_h5py=True,
               mode='w')
    selected_array_events1 = array_events1[array_events1.arr_id_unique.isin(
        arr_ids1)]
    selected_array_events1 = selected_array_events1.drop(columns=['arr_id_unique'])
    write_data(selected_array_events1,
               output_path1,
               key='array_events',
               use_h5py=True,
               mode='a')

    tel_ids2 = df2[df2.arr_id_unique.isin(arr_ids2)].tel_id_unique
    selected_telescope_events2 = telescope_events2[
        telescope_events2.tel_id_unique.isin(tel_ids2)]
    selected_telescope_events2 = selected_telescope_events2.drop(columns=['tel_id_unique'])
    write_data(selected_telescope_events2,
               output_path2,
               key='telescope_events',
               use_h5py=True,
               mode='w')
    selected_array_events2 = array_events2[array_events2.arr_id_unique.isin(
        arr_ids2)]
    selected_array_events2 = selected_array_events2.drop(columns=['arr_id_unique'])
    write_data(selected_array_events2,
               output_path2,
               key='array_events',
               use_h5py=True,
               mode='a')
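
The binning in Example #20 builds logarithmic bins roughly 0.05 dex wide between the common minimum and maximum of the two files. With hypothetical intensity limits of 50 and 5e4 this spans 3 decades, i.e. about 60 bin edges:

import numpy as np

minimum_log, maximum_log = np.log10(50.0), np.log10(5e4)
binning = np.logspace(minimum_log, maximum_log,
                      int((maximum_log - minimum_log) / 0.05))
print(len(binning))  # number of ~0.05 dex bin edges, about 60 here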