def main(configuration_path, signal_path, background_path, predictions_path,
         model_path, verbose):
    '''
    Train a classifier on signal and background monte carlo data and write the model
    to MODEL_PATH in pmml or pickle format.

    CONFIGURATION_PATH: Path to the config yaml file

    SIGNAL_PATH: Path to the signal data

    BACKGROUND_PATH: Path to the background data

    PREDICTIONS_PATH : path to the file where the mc predictions are stored.

    MODEL_PATH: Path to save the model to. Allowed extensions are .pkl and .pmml.
        If extension is .pmml, then both pmml and pkl file will be saved
    '''
    log = setup_logging(verbose=verbose)

    check_extension(predictions_path)
    check_extension(model_path, allowed_extensions=['.pmml', '.pkl', '.onnx'])

    config = AICTConfig.from_yaml(configuration_path)
    model_config = config.separator
    label_text = model_config.output_name

    log.info('Loading signal data')
    df_signal = read_telescope_data(
        signal_path,
        config,
        model_config.columns_to_read_train,
        feature_generation_config=model_config.feature_generation,
        n_sample=model_config.n_signal)
    df_signal['label_text'] = 'signal'
    df_signal['label'] = 1

    log.info('Loading background data')
    df_background = read_telescope_data(
        background_path,
        config,
        model_config.columns_to_read_train,
        feature_generation_config=model_config.feature_generation,
        n_sample=model_config.n_background)
    df_background['label_text'] = 'background'
    df_background['label'] = 0

    df = pd.concat([df_background, df_signal], ignore_index=True)

    df_train = convert_to_float32(df[model_config.features])
    log.debug('Total training events: {}'.format(len(df_train)))

    df_train.dropna(how='any', inplace=True)
    log.debug('Training events after dropping nans: {}'.format(len(df_train)))

    label = df.loc[df_train.index, 'label']

    # load optional columns if available to be able to make performance plots
    # vs true energy / size
    if config.true_energy_column is not None:
        true_energy = df.loc[df_train.index,
                             config.true_energy_column].to_numpy()
    if config.size_column is not None:
        size = df.loc[df_train.index, config.size_column].to_numpy()

    n_gammas = len(label[label == 1])
    n_protons = len(label[label == 0])
    log.info(
        'Training classifier with {} background and {} signal events'.format(
            n_protons, n_gammas))
    log.debug(model_config.features)

    # save prediction_path for each cv iteration
    cv_predictions = []

    # iterate over test and training sets
    X = df_train.values
    y = label.values
    n_cross_validations = model_config.n_cross_validations
    classifier = model_config.model

    log.info(
        'Starting {} fold cross validation... '.format(n_cross_validations))

    stratified_kfold = model_selection.StratifiedKFold(
        n_splits=n_cross_validations, shuffle=True, random_state=config.seed)

    aucs = []
    cv_it = stratified_kfold.split(X, y)
    for fold, (train, test) in enumerate(tqdm(cv_it,
                                              total=n_cross_validations)):
        # select data
        xtrain, xtest = X[train], X[test]
        ytrain, ytest = y[train], y[test]

        # fit and predict
        classifier.fit(xtrain, ytrain)

        y_probas = classifier.predict_proba(xtest)[:, 1]

        cv_df = pd.DataFrame({
            'label': ytest,
            model_config.output_name: y_probas,
            'cv_fold': fold,
        })
        if config.true_energy_column is not None:
            cv_df[config.true_energy_column] = true_energy[test]
        if config.size_column is not None:
            cv_df[config.size_column] = size[test]
        cv_predictions.append(cv_df)
        aucs.append(metrics.roc_auc_score(ytest, y_probas))

    aucs = np.array(aucs)
    log.info('Cross-validation ROC-AUCs: {}'.format(aucs))
    log.info('Mean AUC ROC : {:.3f} ± {:.3f}'.format(aucs.mean(), aucs.std()))

    predictions_df = pd.concat(cv_predictions, ignore_index=True)
    log.info('Writing predictions from cross validation')
    write_data(predictions_df, predictions_path, mode='w')

    # set random seed again to make sure different settings
    # for n_cross_validations don't change the final model
    np.random.seed(config.seed)
    classifier.random_state = config.seed

    if model_config.calibrate_classifier:
        log.info('Training calibrated classifier')
        classifier = CalibratedClassifierCV(classifier, cv=2, method='sigmoid')
        classifier.fit(X, y)
    else:
        log.info('Training model on complete dataset')
        classifier.fit(X, y)

    log.info('Saving model to {} ...'.format(model_path))
    save_model(classifier,
               model_path=model_path,
               label_text=label_text,
               feature_names=list(df_train.columns))
def main(configuration_path, signal_path, background_path, predictions_path, model_path, verbose):
    '''
    Train a classifier on signal and background monte carlo data and write the model
    to MODEL_PATH in pmml or pickle format.

    CONFIGURATION_PATH: Path to the config yaml file

    SIGNAL_PATH: Path to the signal data

    BACKGROUND_PATH: Path to the background data

    PREDICTIONS_PATH : path to the file where the mc predictions are stored.

    MODEL_PATH: Path to save the model to. Allowed extensions are .pkl and .pmml.
        If extension is .pmml, then both pmml and pkl file will be saved
    '''

    logging.getLogger().setLevel(logging.DEBUG if verbose else logging.INFO)

    check_extension(predictions_path)
    check_extension(model_path, allowed_extensions=['.pmml', '.pkl'])

    config = AICTConfig.from_yaml(configuration_path)
    model_config = config.separator

    log.info('Loading signal data')
    df_signal = read_telescope_data(
        signal_path, config, model_config.columns_to_read_train,
        feature_generation_config=model_config.feature_generation,
        n_sample=model_config.n_signal
    )
    df_signal['label_text'] = 'signal'
    df_signal['label'] = 1

    log.info('Loading background data')
    df_background = read_telescope_data(
        background_path, config, model_config.columns_to_read_train,
        feature_generation_config=model_config.feature_generation,
        n_sample=model_config.n_background
    )
    df_background['label_text'] = 'background'
    df_background['label'] = 0

    df_full = pd.concat([df_background, df_signal], ignore_index=True)

    df_training = convert_to_float32(df_full[model_config.features])
    log.debug('Total training events: {}'.format(len(df_training)))

    df_training.dropna(how='any', inplace=True)
    log.debug('Training events after dropping nans: {}'.format(len(df_training)))

    label = df_full.loc[df_training.index, 'label']

    n_gammas = len(label[label == 1])
    n_protons = len(label[label == 0])
    log.info('Training classifier with {} background and {} signal events'.format(
        n_protons, n_gammas
    ))
    log.debug(model_config.features)

    # save prediction_path for each cv iteration
    cv_predictions = []

    # iterate over test and training sets
    X = df_training.values
    y = label.values
    n_cross_validations = model_config.n_cross_validations
    classifier = model_config.model

    log.info('Starting {} fold cross validation... '.format(n_cross_validations))

    stratified_kfold = model_selection.StratifiedKFold(
        n_splits=n_cross_validations, shuffle=True, random_state=config.seed
    )

    aucs = []
    cv_it = stratified_kfold.split(X, y)
    for fold, (train, test) in enumerate(tqdm(cv_it, total=n_cross_validations)):
        # select data
        xtrain, xtest = X[train], X[test]
        ytrain, ytest = y[train], y[test]

        # fit and predict
        classifier.fit(xtrain, ytrain)

        y_probas = classifier.predict_proba(xtest)[:, 1]
        y_prediction = classifier.predict(xtest)

        cv_predictions.append(pd.DataFrame({
            'label': ytest,
            'label_prediction': y_prediction,
            'probabilities': y_probas,
            'cv_fold': fold,
        }))
        aucs.append(metrics.roc_auc_score(ytest, y_probas))

    log.info('Mean AUC ROC : {}'.format(np.array(aucs).mean()))

    predictions_df = pd.concat(cv_predictions, ignore_index=True)
    log.info('Writing predictions from cross validation')
    write_data(predictions_df, predictions_path, mode='w')

    # set random seed again to make sure different settings
    # for n_cross_validations don't change the final model
    np.random.seed(config.seed)
    classifier.random_state = config.seed

    if model_config.calibrate_classifier:
        log.info('Training calibrated classifier')
        classifier = CalibratedClassifierCV(classifier, cv=2, method='sigmoid')
        classifier.fit(X, y)
    else:
        log.info('Training model on complete dataset')
        classifier.fit(X, y)

    log.info('Pickling model to {} ...'.format(model_path))
    pickle_model(
        classifier=classifier,
        model_path=model_path,
        label_text='label',
        feature_names=list(df_training.columns)
    )
def main(configuration_path, signal_path, background_path, predictions_path,
         model_path, verbose):
    '''
    Train a classifier on signal and background monte carlo data and write the model
    to MODEL_PATH in pmml or pickle format.

    CONFIGURATION_PATH: Path to the config yaml file

    SIGNAL_PATH: Path to the signal data

    BACKGROUND_PATH: Path to the background data

    PREDICTIONS_PATH : path to the file where the mc predictions are stored.

    MODEL_PATH: Path to save the model to. Allowed extensions are .pkl and .pmml.
        If extension is .pmml, then both pmml and pkl file will be saved
    '''

    logging.getLogger().setLevel(logging.DEBUG if verbose else logging.INFO)

    check_extension(predictions_path)
    check_extension(model_path, allowed_extensions=['.pmml', '.pkl'])

    config = AICTConfig.from_yaml(configuration_path)
    model_config = config.separator

    columns = model_config.columns_to_read_train
    columns.append(config.energy.target_column)

    log.info('Loading signal data')
    df_signal = read_telescope_data(
        signal_path,
        config,
        columns,
        feature_generation_config=model_config.feature_generation,
        n_sample=model_config.n_signal)
    df_signal['label_text'] = 'signal'
    df_signal['label'] = 1

    log.info('Loading background data')
    df_background = read_telescope_data(
        background_path,
        config,
        columns,
        feature_generation_config=model_config.feature_generation,
        n_sample=model_config.n_background)
    df_background['label_text'] = 'background'
    df_background['label'] = 0

    df_full = pd.concat([df_background, df_signal], ignore_index=True)

    df_training = df_full.copy()
    log.debug('Total training events: {}'.format(len(df_training)))

    df_training.dropna(how='any', inplace=True)
    log.debug('Training events after dropping nans: {}'.format(
        len(df_training)))

    mc_energies = convert_to_float32(df_training[config.energy.target_column])
    label = df_training['label']
    df_training = convert_to_float32(df_training[model_config.features])

    #label = df_full.loc[df_training.index, 'label']

    n_gammas = len(label[label == 1])
    n_protons = len(label[label == 0])
    log.info(
        'Training classifier with {} background and {} signal events'.format(
            n_protons, n_gammas))
    log.debug(model_config.features)

    # save prediction_path for each cv iteration
    cv_predictions = []

    # iterate over test and training sets
    X = df_training.values
    y = label.values
    n_cross_validations = model_config.n_cross_validations
    classifier = model_config.model

    log.info(
        'Starting {} fold cross validation... '.format(n_cross_validations))

    stratified_kfold = model_selection.StratifiedKFold(
        n_splits=n_cross_validations, shuffle=True, random_state=config.seed)

    aucs = []
    cv_it = stratified_kfold.split(X, y)
    for fold, (train, test) in enumerate(tqdm(cv_it,
                                              total=n_cross_validations)):
        # select data
        xtrain, xtest = X[train], X[test]
        ytrain, ytest = y[train], y[test]

        # fit and predict
        classifier.fit(xtrain, ytrain)

        y_probas = classifier.predict_proba(xtest)[:, 1]
        y_prediction = classifier.predict(xtest)

        cv_predictions.append(
            pd.DataFrame({
                'label': ytest,
                'label_prediction': y_prediction,
                'probabilities': y_probas,
                'cv_fold': fold,
                'mc_energy': mc_energies[test],
            }))
        aucs.append(metrics.roc_auc_score(ytest, y_probas))

    log.info('Mean AUC ROC : {}'.format(np.array(aucs).mean()))

    predictions_df = pd.concat(cv_predictions, ignore_index=True)
    log.info('Writing predictions from cross validation')
    write_data(predictions_df, predictions_path, mode='w')

    # set random seed again to make sure different settings
    # for n_cross_validations don't change the final model
    np.random.seed(config.seed)
    classifier.random_state = config.seed

    if model_config.calibrate_classifier:
        log.info('Training calibrated classifier')
        classifier = CalibratedClassifierCV(classifier, cv=2, method='sigmoid')
        classifier.fit(X, y)
    else:
        log.info('Training model on complete dataset')
        classifier.fit(X, y)

    log.info('Pickling model to {} ...'.format(model_path))
    pickle_model(classifier=classifier,
                 model_path=model_path,
                 label_text='label',
                 feature_names=list(df_training.columns))