Example No. 1
def predict_class(files, config, yaml_file, prefix, config_save, test_mode):
    if config_save is None:
        config_save = config

    d_cuts = configyaml.ConfigYaml(yaml_file)
    pt_bins = np.array(d_cuts.values['model_building']['bins_pt'])

    base_name = definitions.PROCESSING_FOLDER + config + '/ml-dataset/' + prefix + 'model_pt'

    models = [
        base_name + str(pt_bin) + '_main_mojo.zip'
        for pt_bin in range(len(pt_bins) - 1)
    ]

    for file in files:
        print('Processing file: ')
        print(file)
        dataset = pd.read_parquet(file)

        if test_mode:
            dataset = dataset.iloc[:1000]

        dataset['Probability'] = -999.
        pt_bins_df = pd.cut(dataset['Pt'], list(pt_bins), labels=False)

        predictions = dataset.groupby(pt_bins_df,
                                      as_index=False,
                                      group_keys=False).apply(
                                          add_prediction, models)
        dataset['Probability'] = predictions.astype('float32')

        file_name = file.split('/')[-1]
        dataset.to_parquet(definitions.PROCESSING_FOLDER + config_save +
                           '/filtered/' + file_name)
        print()
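A minimal call sketch for predict_class. The glob pattern, configuration name and YAML path are placeholders, and the project modules used above (definitions, etc.) are assumed to be importable:

import glob

# Hypothetical inputs: adapt the folder, config name and YAML file to the local setup.
files = glob.glob(definitions.PROCESSING_FOLDER + 'D0_HMV0/skimmed/*.parquet')
# With config_save=None the output parquet files go to PROCESSING_FOLDER + config + '/filtered/'.
predict_class(files, 'D0_HMV0', 'config/default_config.yaml', prefix='',
              config_save=None, test_mode=True)  # test_mode keeps only the first 1000 rows per file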
Example No. 2
def train_model(dataset_name, pt_bin, yaml_file, prefix):
    d_cuts = configyaml.ConfigYaml(yaml_file)

    train = dr.get_ml_dataset(dataset_name, d_cuts, pt_bin)

    params = d_cuts.values['model_building']['model_parameters']
    train_parameters = d_cuts.values['model_building']['train_parameters']

    cv_params = d_cuts.values['model_building']['cv_parameters']
    cv_params.update(train_parameters)

    features = d_cuts.values['model_building']['features']
    target = d_cuts.values['model_building']['target']

    lgb_dataset = lgb.Dataset(train[features], label=train[target])

    del train

    start = time.time()
    cv = lgb.cv(params, lgb_dataset, **cv_params)
    print('Total CV time: ' + str(time.time() - start))
    results_cv = pd.DataFrame(cv)

    cv_results_file = dr.get_location_step(dataset_name, 'ml') + 'cv_' + str(pt_bin) + '.pkl'

    try:
        os.remove(cv_results_file)
    except FileNotFoundError:
        pass

    print('Best iteration of the model: ')
    print(results_cv.iloc[-1])
    results_cv.to_pickle(cv_results_file)

    train_parameters['num_boost_round'] = len(results_cv)

    start = time.time()
    gbm = lgb.train(params, lgb_dataset, **train_parameters)
    print('Total training time: ' + str(time.time() - start))

    name_to_save = dr.get_location_step(dataset_name, 'ml') + prefix + 'model_' + str(pt_bin) + '.txt'

    try:
        os.remove(name_to_save)
    except FileNotFoundError:
        pass

    temp_file = dr.definitions.TEMP + 'temp_model.txt'

    gbm.save_model(temp_file)
    shutil.copyfile(temp_file, name_to_save)

    os.remove(temp_file)

    return gbm, name_to_save
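A minimal call sketch, assuming a dataset name and YAML configuration that exist in the local setup (both names below are placeholders); the trained model is also written to disk by the function itself:

# Hypothetical names: 'data_d0' and the YAML path are illustrative only.
gbm, model_file = train_model('data_d0', pt_bin=0, yaml_file='config/model.yaml', prefix='test_')
print('LightGBM model saved to ' + model_file)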
Example No. 3
def reduce_opt(files_to_reduce, config, yaml_file, id_job, particle, pre_filter_bkg, maximum_pt_filter):
    d_cuts = configyaml.ConfigYaml(yaml_file)
    pt_bins = np.array(d_cuts.values['reduce_data']['bins_pt'])
    cols_keep = d_cuts.values['reduce_data']['features']

    base_name = definitions.PROCESSING_FOLDER + config + '/skimmed/'

    dataset = pd.concat([pd.read_parquet(file, columns=cols_keep) for file in files_to_reduce])

    selection = ((dataset['bkg'] < pre_filter_bkg) & (dataset['Pt'] < maximum_pt_filter)) | \
                (dataset['Pt'] >= maximum_pt_filter)
    dataset = dataset.loc[selection]

    reduce_dataframe_memory(dataset)
    df_pt_bins = pd.cut(dataset['Pt'], list(pt_bins), labels=False)
    dataset.groupby(df_pt_bins).apply(
        lambda x: x.to_parquet(base_name + 'id' + str(id_job) + '_pt' + str(x.name) + '_' + particle + '.parquet'))
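A minimal call sketch for reduce_opt, with a hypothetical file list and thresholds; the 'bkg' column is assumed to hold a background-like score, so candidates below maximum_pt_filter are kept only if that score is under pre_filter_bkg:

import glob

# Hypothetical arguments: folder, config name, YAML path and thresholds are placeholders.
files = glob.glob(definitions.PROCESSING_FOLDER + 'D0_HMV0/raw/*.parquet')
reduce_opt(files, 'D0_HMV0', 'config/reduce.yaml', id_job=0, particle='dmeson',
           pre_filter_bkg=0.05, maximum_pt_filter=8.0)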
Example No. 4
def submit_train(dataset_name, yaml_config, prefix=None):
    d_cuts = configyaml.ConfigYaml(yaml_config)

    pt_bins = np.array(d_cuts.values['model_building']['bins_pt'])
    pt_bins = pd.cut(0.5 * (pt_bins[:-1] + pt_bins[1:]), bins=pt_bins)
    base_f = definitions.ROOT_DIR

    queue = d_cuts.values['model_building']['queue']

    for i in reversed(range(len(pt_bins))):
        arguments = str(i) + ' ' + str(dataset_name)
        if prefix is not None:
            arguments += ' --prefix ' + prefix

        command = get_job_command(dataset_name + '_t_pt_' + str(i), base_f + "/ml/train_lgb.py ", arguments,
                                  queue=queue)
        subprocess.run(command, shell=True)
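A minimal call sketch; one job is submitted per pt bin (highest bin first) through get_job_command, so the dataset name and YAML file (placeholders here) must match what train_lgb.py expects:

# Hypothetical names; each pt bin becomes a separate cluster job.
submit_train('data_d0', 'config/model.yaml', prefix='test_')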
Example No. 5
    def __init__(self, name_file, particle='D0'):
        """Default constructor. yaml_file should come from the class CutsYaml. The particle is set as Default to D0 """
        yaml_config = configyaml.ConfigYaml(name_file, default_file=ROOT_DIR + "/config/config_retangular.yaml")

        try:
            d_meson_cuts = yaml_config.values[particle]['cuts']
        except KeyError as key_error:
            print(key_error)
            raise ValueError("The particle " + str(particle) + " cuts were not found.") from key_error

        # Save the cuts to a DataFrame
        self.cut_df = pd.DataFrame(d_meson_cuts).apply(pd.to_numeric, errors='ignore')
        self.cut_df.set_index('PtBin', inplace=True)

        # Change names to values with no -range, min_, max_
        names = [a.split('_')[0] for a in self.cut_df.columns]
        type_col = [a.split('_')[1] for a in self.cut_df.columns]  # save the type of cut

        self.range_features = [names[i] for i in range(len(names)) if type_col[i] == "range"]
        self.min_features = [names[i] for i in range(len(names)) if type_col[i] == "min"]
        self.max_features = [names[i] for i in range(len(names)) if type_col[i] == "max"]
        self.bool_features = [names[i] for i in range(len(names)) if type_col[i] == "bool"]

        self.cut_df.columns = names
        self.cut_type = type_col

        pt_ = self.cut_df['Pt']
        # Positional access: the index holds PtBin labels, not positions.
        min_pt = [pt_.iloc[i][0] for i in range(len(pt_))]
        max_pt = [pt_.iloc[i][1] for i in range(len(pt_))]

        # Define basic selection variable types
        self.pt_bins = list(min_pt) + list([max_pt[-1]])

        # Change pt_bins to intervals
        mid_pt = (np.array(min_pt) + np.array(max_pt)) / 2.
        self.cut_df['PtBin'] = pd.cut(mid_pt, self.pt_bins)
        self.cut_df.set_index('PtBin', inplace=True)

        self.particle_mass = float(yaml_config.values[particle]['particle_mass'])
        self.particle_name = str(yaml_config.values[particle]['particle_name'])
        self.features_absolute = tuple(yaml_config.values[particle]['features_abs'])
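From the parsing above, the 'cuts' entry of the YAML file is expected to be a table whose column names encode the cut type after an underscore ('range', 'min', 'max' or 'bool'), together with a 'PtBin' column and a 'Pt' range column. A minimal illustrative structure (all values invented) that this constructor could consume:

# Hypothetical cut table with two pt bins: one 'range', one 'max' and one 'min' cut.
d_meson_cuts = {
    'PtBin': [0, 1],
    'Pt_range': [[1.0, 2.0], [2.0, 3.0]],  # per-bin [min, max], used to build self.pt_bins
    'DCA_max': [0.03, 0.05],               # 'max'-type cut
    'CosPointing_min': [0.90, 0.85],       # 'min'-type cut
}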
Example No. 6
def train_model(config, pt_bin, yaml_file, prefix):
    train_f = definitions.PROCESSING_FOLDER + config + '/ml-dataset/ml_sample_train_' + str(
        pt_bin) + '.parquet'
    train = h2o.import_file(train_f)

    d_cuts = configyaml.ConfigYaml(yaml_file)

    # Configuration of the GRID Search
    features = d_cuts.values['model_building']['features']
    target = d_cuts.values['model_building']['target']
    parameters = d_cuts.values['model_building']['model_parameters']
    train[target] = train[target] > -1

    train[target] = train[target].asfactor()

    model = H2OXGBoostEstimator(**parameters)

    model.train(features, target, training_frame=train)

    place_to_save = definitions.PROCESSING_FOLDER + config + '/ml-dataset/'
    file_list_saved = list()

    # Save Main model
    path_main = h2o.save_model(model, place_to_save, force=True)
    path_main_rename = ''.join([
        x + '/' for x in path_main.split('/')[:-1]
    ]) + prefix + 'model_pt' + str(pt_bin) + '_main'
    os.rename(path_main, path_main_rename)
    file_list_saved.append(path_main_rename)

    model_list = model.cross_validation_models()
    for i, model_cv in enumerate(model_list):
        path = h2o.save_model(model_cv, place_to_save, force=True)
        path_new = ''.join([
            x + '/' for x in path.split('/')[:-1]
        ]) + prefix + 'model_pt' + str(pt_bin) + '_cv' + str(i)
        os.rename(path, path_new)
        file_list_saved.append(path_new)

    return model, model_list, file_list_saved
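A minimal call sketch for the H2O variant, assuming an H2O cluster is running and that the model_parameters in the YAML set nfolds (otherwise cross_validation_models() returns no models); the config and YAML names are placeholders:

import h2o

h2o.init()  # start or connect to a local H2O cluster before import_file/train
model, cv_models, saved_files = train_model('D0_HMV0', pt_bin=0,
                                            yaml_file='config/model.yaml', prefix='test_')
print(saved_files)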
Example No. 7
def fit_d_meson_inv_mass(config_file_name=None, suffix='_t'):
    config = configyaml.ConfigYaml(config_file_name)
    data_sample = reader.load_pairs(config_file_name, 'selected')
    base_folder = config.values['base_folder']
    print("Fitting the Invariant Mass")
    fits = data_sample.groupby(['APtBin',
                                'TPtBin']).apply(fit_inv_mass,
                                                 suffix=suffix,
                                                 **config.values['inv_mass'])

    fits.name = 'Fits'  # groupby().apply() yields a Series here, so set its name rather than .columns
    fits.to_pickle(base_folder + '/fits_inv_mass' + suffix + '.pkl')

    print("Plotting the fits")
    for index, row in fits.iteritems():
        a_i = index[0]
        t_i = index[1]
        fig, ax = plt.subplots()

        plot_inv_mass_fit(row, ax, **config.values['correlation_qa_style'])
        fig.savefig(base_folder + '/plots/mass_pt_a' + str(a_i) + '_t' +
                    str(t_i) + '.pdf',
                    bbox_inches="tight")
Example No. 8
                        ' generation on cluster')
    parser.add_argument('-s',
                        "--skip_signal",
                        dest='skip_signal',
                        action='store_true',
                        help='Skip signal processing')
    parser.set_defaults(submit_bkg=True)
    parser.set_defaults(skip_signal=False)

    args = parser.parse_args()

    print("The following configuration will be used:")
    print('Configuration in MC (for signal): ' + args.mc_config)
    print('Configuration in data (for background): ' + args.data_config)

    d_cuts = configyaml.ConfigYaml(args.yaml_file)

    dr.check_for_folder(dr.get_location_step(args.data_config, 'ml'))

    if not args.skip_signal:
        prepare_signal(args.mc_config,
                       d_cuts.values['model_building']['bins_pt'], 'dmeson')

    from dhfcorr.utils import batch, format_list_to_bash

    runs = dr.get_run_numbers(args.data_config)

    print("Processing Background:")
    clear = subprocess.Popen('rm -f bkg_*', shell=True)
    clear.wait()
    job_id = 0
Example No. 9
        type=str,
        help='Configuration name (used to save the temporary files)')
    parser.add_argument("--yaml_config",
                        default=None,
                        help='Configuration file')
    parser.add_argument("--id", default=0, help='id to save the file')
    parser.add_argument("--particle_name",
                        default='dmeson',
                        help='particle name')

    args = parser.parse_args()
    run_list = args.run_list
    run_list = run_list.split(',')

    yaml_config = args.yaml_config
    d_cuts = configyaml.ConfigYaml(yaml_config)

    folder_to_save = reader.get_location_step('ml')
    mc_mean = pd.read_pickle(folder_to_save + '/mc_mean_sigma.pkl')

    def filter_bkg(df, mc_shape, n_sigma=4.0):
        pt_bin = df.name
        mean = mc_shape.loc[pt_bin]['mean']
        std = mc_shape.loc[pt_bin]['std']
        bkg_sidebands = df[np.abs(df['InvMass'] - mean) > n_sigma * std]
        return bkg_sidebands

    candidates_df = list()
    for run in run_list:
        bkg = reader.load(args.config_name,
                          args.particle_name,
variables_to_keep_trig = [
    'GridPID', 'EventNumber', 'ID', 'IsParticleCandidate', 'Pt', 'Eta', 'Phi',
    'InvMass', 'prediction'
]
variables_to_keep_assoc = [
    'GridPID', 'EventNumber', 'Charge', 'Pt', 'Eta', 'Phi',
    'InvMassPartnersULS', 'InvMassPartnersLS'
]
index = ['GridPID', 'EventNumber']

df = reader.load('D0_HMV0', ['dmeson', 'electron'],
                 columns=variables_to_keep_trig,
                 index=index,
                 lazy=True)

config_corr = configyaml.ConfigYaml('dhfcorr/config/optimize_bdt_cut.yaml')

pt_bins_trig = config_corr.values['correlation']['bins_trig']
pt_bins_assoc = config_corr.values['correlation']['bins_assoc']
trig_suffix = '_t'
assoc_suffix = '_a'

inv_mass_trig_list = list()

pairs = dhfcorr.correlate.make_pairs.build_pairs_from_lazy(
    df, (trig_suffix, assoc_suffix), pt_bins_trig, pt_bins_assoc,
    **config_corr.values['correlation'])

selected = pd.read_pickle('pairs_d_hfe_hm.pkl').reset_index(level=0, drop=True)

# Remove ROOT messages