def get_composition_pipeline(pipeline, p, use_sample_weights):
    """Load, or fit with sample weights, the composition-classification pipeline.

    Parameters
    ----------
    pipeline : str
        Base name of the composition pipeline (e.g. ``'BDT'``). It is combined
        with the module-level ``config`` and ``num_groups`` to build the full
        trained-model name, and is rebound to the pipeline object on return.
    p : object or None
        When ``None``, the pre-trained pipeline is loaded from disk.
        NOTE(review): ``p`` is not used for anything else — confirm intended
        semantics with the caller.
    use_sample_weights : object or None
        When not ``None``, it is passed as the ``model`` argument of
        ``calculate_sample_weights`` and a fresh (untrained) pipeline is
        fitted on ``df_sim_train`` using the resulting per-sample weights.

    Returns
    -------
    The loaded or freshly fitted pipeline object.

    Notes
    -----
    Relies on module-level names not visible in this chunk: ``comp``,
    ``config``, ``num_groups``, ``df_sim_train``, ``feature_list`` and
    ``calculate_sample_weights`` — TODO confirm they are defined at module
    scope before this function is called.
    """
    # Build the full model name while ``pipeline`` is still the base-name
    # string. The previous version re-formatted this *after* rebinding
    # ``pipeline`` to a loaded model object, which produced a garbage model
    # name whenever both branches below executed.
    pipeline_str = '{}_comp_{}_{}-groups'.format(pipeline, config,
                                                 num_groups)
    if p is None:
        pipeline = comp.load_trained_model(pipeline_str)
    if use_sample_weights is not None:
        model = use_sample_weights
        pipeline = comp.get_pipeline(pipeline_str)
        compositions = df_sim_train['comp_group_{}'.format(num_groups)].values
        energies = df_sim_train['reco_energy'].values
        sample_weight = calculate_sample_weights(compositions,
                                                 energies,
                                                 model=model)
        X = df_sim_train[feature_list].values
        y = df_sim_train['comp_target_{}'.format(num_groups)].values
        # Route the weights to the final 'classifier' step of the
        # scikit-learn pipeline via step-prefixed fit params.
        fit_params = {'classifier__sample_weight': sample_weight}
        pipeline.fit(X, y, **fit_params)

    return pipeline
    # NOTE(review): ~30 lines of unreachable code that followed this return
    # (data loading, energy reconstruction, argparse handling) were removed;
    # they referenced names such as ``args`` that are never defined here and
    # appear to have been spliced in from a different part of the script.
def get_classified_fractions(df_train,
                             df_test,
                             pipeline_str=None,
                             num_groups=4,
                             energy_key='MC_log_energy'):
    '''Fraction of correctly classified events per energy bin.

    For every (true composition, identified composition) pair in
    ``comp_list`` x ``comp_list``, computes the fraction of ``df_test``
    events of the true composition that were classified as the identified
    composition in each ``energybins.log_energy_bins`` bin, together with
    its statistical uncertainty, and returns them in a flat dict keyed
    ``'true_<t>_identified_<i>'`` / ``'true_<t>_identified_<i>_err'``.
    '''
    # Only these two energy columns are valid binning keys.
    if energy_key not in ('MC_log_energy', 'reco_log_energy'):
        raise ValueError(
            "Invalid energy_key ({}) entered. Must be either "
            "'MC_log_energy' or 'reco_log_energy'.".format(energy_key))

    if pipeline_str is None:
        pipeline_str = 'BDT_comp_IC86.2012_{}-groups'.format(num_groups)

    feature_list, feature_labels = comp.get_training_features()

    # Custom classifiers are built fresh and predict from the target column;
    # anything else is a trained model loaded from disk that predicts from
    # the training features.
    if 'CustomClassifier' in pipeline_str:
        pipeline = comp.get_pipeline(pipeline_str)
        test_predictions = pipeline.predict(
            df_test['comp_target_{}'.format(num_groups)])
    else:
        pipeline = comp.load_trained_model(pipeline_str)
        test_predictions = pipeline.predict(df_test[feature_list])

    pred_comp = np.array(
        comp.decode_composition_groups(test_predictions,
                                       num_groups=num_groups))

    comp_group_col = 'comp_group_{}'.format(num_groups)
    data = {}
    for true_comp, identified_comp in product(comp_list, comp_list):
        true_mask = df_test[comp_group_col] == true_comp
        identified_mask = pred_comp == identified_comp

        # Per-bin counts of events that truly are ``true_comp`` ...
        n_true, _ = np.histogram(df_test.loc[true_mask, energy_key],
                                 bins=energybins.log_energy_bins)
        n_true_err = np.sqrt(n_true)

        # ... and of those, the ones classified as ``identified_comp``.
        n_identified, _ = np.histogram(
            df_test.loc[true_mask & identified_mask, energy_key],
            bins=energybins.log_energy_bins)
        n_identified_err = np.sqrt(n_identified)

        # Ratio (with propagated Poisson errors) as a function of energy.
        frac, frac_err = comp.ratio_error(n_identified, n_identified_err,
                                          n_true, n_true_err)

        key = 'true_{}_identified_{}'.format(true_comp, identified_comp)
        data[key] = frac
        data[key + '_err'] = frac_err

    return data
                           'unfolding-df_{}-groups.hdf'.format(num_groups))
    df = pd.read_hdf(df_file)

    # Load simulation and train composition classifier
    df_sim_train, df_sim_test = comp.load_sim(config=config,
                                              energy_reco=False,
                                              log_energy_min=None,
                                              log_energy_max=None,
                                              test_size=0.5,
                                              n_jobs=10,
                                              verbose=True)

    feature_list, feature_labels = comp.get_training_features()

    print('Running energy reconstruction...')
    energy_pipeline = comp.load_trained_model('RF_energy_{}'.format(config))
    for df in [df_sim_train, df_sim_test]:
        X = df_sim_train[feature_list].values
        # Energy reconstruction
        df['reco_log_energy'] = energy_pipeline.predict(
            df[feature_list].values)
        df['reco_energy'] = 10**df['reco_log_energy']

    efficiencies, efficiencies_err = comp.get_detector_efficiencies(
        config=config,
        num_groups=num_groups,
        sigmoid='slant',
        pyunfold_format=True)

    print('Running composition classifications...')
    pipeline_str = 'xgboost_comp_{}_{}-groups'.format(config, num_groups)
        # processed=False,
        test_size=0,
        energy_reco=False,
        log_energy_min=None,
        log_energy_max=None,
        compute=False)

    # ddf = comp.load_data(config=config,
    #                      processed=False,
    #                      energy_reco=False,
    #                      log_energy_min=None,
    #                      log_energy_max=None,
    #                      compute=False)

    # Energy reconstruction model
    energy_pipeline = comp.load_trained_model(
        'linearregression_energy_{}'.format(config), return_metadata=False)

    for shift_type in ['up', 'down']:
        print('Processing VEM calibration {} shifted dataset...'.format(
            shift_type))
        s125_scaling_factor = 1.03 if shift_type == 'up' else 0.97

        # Process data:
        #     - Shift S125 value to account for VEM calibration systematic uncertainty
        #     - Energy reconstruction
        #     - Energy range cut
        ddf_systematic = (ddf.assign(
            lap_s125=s125_scaling_factor * ddf.lap_s125).assign(
                log_s125=lambda x: da.log10(x.lap_s125)).map_partitions(
                    add_reco_energy, energy_pipeline,
                    feature_list).map_partitions(apply_energy_cut,