def split_single_telescope_data(input_path, output_basename, fmt, inkey, key, fraction, name):
    '''
    Randomly split a single-telescope event table into disjoint parts.

    The table is read from ``input_path`` (hdf5 group ``inkey`` or a csv file,
    depending on ``fmt``).  For every entry of ``fraction`` / ``name`` a random
    subset of the remaining rows is drawn without replacement and written to
    ``<output_basename>_<name>.hdf5`` (under ``key``) or
    ``<output_basename>_<name>.csv``.

    Raises
    ------
    ValueError
        If ``fmt`` is neither one of 'hdf5', 'hdf', 'h5' nor 'csv'.
    AssertionError
        If ``fraction`` and ``name`` differ in length.
    '''
    hdf_formats = ('hdf5', 'hdf', 'h5')

    if fmt in hdf_formats:
        data = read_data(input_path, key=inkey)
    elif fmt == 'csv':
        data = read_data(input_path)
    else:
        # Previously this fell through and failed later with an unbound `data`.
        raise ValueError('Unsupported format: {!r}'.format(fmt))

    assert len(fraction) == len(name), 'You must give a name for each fraction'

    # Compare with a tolerance: e.g. 0.7 + 0.2 + 0.1 is not exactly 1 in
    # floating point and would otherwise trigger a spurious warning.
    if not np.isclose(sum(fraction), 1):
        warnings.warn('Fractions do not sum up to 1')

    ids = data.index.values
    n_total = len(data)

    log.info('Found a total of {} single-telescope events in the file'.format(len(data)))

    num_ids = split_indices(ids, n_total, fractions=fraction)

    for n, part_name in zip(num_ids, name):
        selected_ids = np.random.choice(ids, size=n, replace=False)
        selected_data = data.loc[selected_ids]

        if fmt in hdf_formats:
            path = output_basename + '_' + part_name + '.hdf5'
            log.info('Writing {} telescope-array events to: {}'.format(n, path))
            write_data(selected_data, path, key=key, use_h5py=True, mode='w')
        else:
            # fmt == 'csv' (anything else was rejected above)
            filename = output_basename + '_' + part_name + '.csv'
            log.info('Writing {} telescope-array events to: {}'.format(n, filename))
            selected_data.to_csv(filename, index=False)

        # Remove the rows that were just written so the parts stay disjoint.
        data = data.loc[list(set(data.index.values) - set(selected_data.index.values))]
        ids = data.index.values
def test_write_data_h5py():
    '''
    Write a small data frame to a temporary hdf5 file with the h5py backend.
    '''
    from fact.io import write_data

    df = pd.DataFrame({
        'x': np.random.normal(size=50),
        # size=50: one random value per row instead of a single scalar
        # that pandas would broadcast over the whole column
        'N': np.random.randint(0, 10, dtype='uint8', size=50),
    })

    with tempfile.NamedTemporaryFile(suffix='.hdf5') as f:
        write_data(df, f.name, use_h5py=True)
def test_write_data_root():
    '''
    Writing to a file with a ``.root`` suffix is expected to raise IOError.
    '''
    from fact.io import write_data

    df = pd.DataFrame({
        'x': np.random.normal(size=50),
        # size=50: one random value per row instead of a single scalar
        # that pandas would broadcast over the whole column
        'N': np.random.randint(0, 10, dtype='uint8', size=50),
    })

    with pytest.raises(IOError):
        with tempfile.NamedTemporaryFile(suffix='.root') as f:
            write_data(df, f.name)
def main(configuration_path, input_path, output_path, key, verbose):
    '''
    Apply cuts given in CONFIGURATION_PATH to the data in INPUT_PATH and
    write the result to OUTPUT_PATH.

    example:
    ```
    selection:
        length:
        - '<'
        - 0.06
    ```
    '''
    # NOTE: `key` is accepted for interface compatibility but not used here;
    # the group names 'array_events', 'telescope_events' and 'runs' are fixed.
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
    log = logging.getLogger()

    with open(configuration_path) as f:
        config = yaml.safe_load(f)

    selection = config.get('selection', {})

    array_events = read_data(input_path, key='array_events')
    telescope_events = read_data(input_path, key='telescope_events')

    mask_telescope = create_mask_h5py(input_path, selection, key='telescope_events')
    selected_telescope_events = telescope_events[mask_telescope]

    # Find the array events that still have at least one surviving telescope
    # event.  The index bookkeeping lives in a separate lookup frame so the
    # helper column 'idx' does not end up in the written array_events table.
    id_columns = ['run_id', 'array_event_id']
    lookup = array_events[id_columns].copy()
    lookup['idx'] = array_events.index

    merge = pd.merge(
        selected_telescope_events[id_columns],
        lookup,
        on=id_columns,
        how='left',
    )
    selected_array_events = array_events[array_events.index.isin(merge['idx'])]

    # mode='w' creates/truncates the output file, mode='a' adds the second table
    write_data(selected_telescope_events, output_path, key='telescope_events', use_h5py=True, mode='w')
    write_data(selected_array_events, output_path, key='array_events', use_h5py=True, mode='a')

    # The runs table is not filtered, it is copied over verbatim if present.
    with h5py.File(input_path, mode='r') as infile, h5py.File(output_path, 'r+') as outfile:
        if 'runs' in infile:
            log.info('Copying runs group to outputfile')
            infile.copy('/runs', outfile['/'])
def test_read_data_h5py():
    '''
    Create a h5py hdf5 file from a dataframe and read it back.
    '''
    from fact.io import write_data, read_data

    # Columns are sorted on both sides so the comparison does not depend on
    # the column order in the file.  `axis` is passed by keyword because
    # newer pandas versions no longer accept it positionally.
    df = pd.DataFrame({
        'x': np.random.normal(size=50).astype('float32'),
        'N': np.random.randint(0, 10, dtype='uint8', size=50)
    }).sort_index(axis=1)

    with tempfile.NamedTemporaryFile(suffix='.hdf5') as f:
        write_data(df, f.name, use_h5py=True, key='lecker_daten')

        df_from_file = read_data(f.name, key='lecker_daten').sort_index(axis=1)
        assert set(df.columns) == set(df_from_file.columns)
        assert df.equals(df_from_file)
def test_read_data_csv():
    '''
    Round-trip a dataframe through a csv file and check that it is unchanged.
    '''
    from fact.io import write_data, read_data

    expected = pd.DataFrame({
        'x': np.random.normal(size=50).astype('float32'),
        'N': np.random.randint(0, 10, dtype='uint8', size=50)
    })
    # csv carries no type information, so the dtypes are passed when reading
    column_types = {'x': 'float32', 'N': 'uint8'}

    with tempfile.NamedTemporaryFile(suffix='.csv') as tmp:
        csv_path = tmp.name
        write_data(expected, csv_path)
        restored = read_data(csv_path, dtype=column_types)
        assert expected.equals(restored)
def split_telescope_data(input_path, output_basename, fraction, name):
    '''
    Split a multi-telescope file run-wise into several output files.

    The 'array_events', 'telescope_events' and 'runs' tables are read from
    ``input_path``.  For each part a random set of the not yet used run ids is
    drawn and the matching rows of all three tables are written to
    ``<output_basename>_<name>.hdf5``.
    '''
    array_events = read_data(input_path, key='array_events')
    telescope_events = read_data(input_path, key='telescope_events')
    runs = read_data(input_path, key='runs')

    # the split is done by runs, so a run never ends up in two parts
    ids = set(runs.run_id)
    log.debug(f'All runs:{ids}')
    n_total = len(ids)

    log.info(f'Found a total of {n_total} runs in the file')
    num_runs = split_indices(ids, n_total, fractions=fraction)

    # (table key, table, file mode): the first write creates the file
    tables = (
        ('runs', runs, 'w'),
        ('array_events', array_events, 'a'),
        ('telescope_events', telescope_events, 'a'),
    )

    for n, part_name in zip(num_runs, name):
        selected_run_ids = np.random.choice(list(ids), size=n, replace=False)

        # select everything first, write afterwards
        selections = [
            (table_key, table[table.run_id.isin(selected_run_ids)], file_mode)
            for table_key, table, file_mode in tables
        ]

        path = output_basename + '_' + part_name + '.hdf5'
        log.info('Writing {} runs events to: {}'.format(n, path))
        for table_key, selected_rows, file_mode in selections:
            write_data(selected_rows, path, key=table_key, use_h5py=True, mode=file_mode)

        log.debug(f'selected runs {set(selected_run_ids)}')
        log.debug(f'Runs minus selected runs {ids - set(selected_run_ids)}')
        # only the unused runs are available for the next part
        ids = ids - set(selected_run_ids)
def apply(ctx, out_file, data, number_of_images):
    '''
    Apply the network stored in the context to a data set and write the
    resulting events to OUT_FILE.

    DATA selects the input: 'crab' uses the observation data, 'gamma' and
    'proton' use the respective simulated image files and additionally copy
    their 'showers' table into OUT_FILE.
    '''
    import fact.io as fio

    network = ctx.obj['network']
    model = load_model(network)

    # NOTE(review): `model_path` is not defined in this function; presumably
    # it is a module-level name - confirm.
    p = '{}.index'.format(model_path)
    if not os.path.exists(p):
        print('No model trained yet. Do so first.')
        return

    if os.path.exists(out_file):
        click.confirm(
            'Do you want to overwrite existing file {}?'.format(out_file),
            abort=True,
        )
        os.remove(out_file)

    # simulated data sets and the files holding their images / shower truth
    mc_image_paths = {
        'gamma': './data/gamma_images.hdf5',
        'proton': './data/proton_images.hdf5',
    }

    if data == 'crab':
        df = image_io.apply_to_observation_data(model)
    elif data in mc_image_paths:
        image_path = mc_image_paths[data]
        df = image_io.apply_to_mc(model, path=image_path, N=number_of_images)
        shower_truth = fio.read_data(image_path, key='showers')
        # keyword was misspelled as `use_hp5y` before, so the h5py backend
        # was never requested and the stray keyword was forwarded instead
        fio.write_data(shower_truth, file_path=out_file, key='showers', use_h5py=True)
    else:
        # previously an unknown value failed later with an unbound `df`
        raise ValueError('Unknown data set: {!r}'.format(data))

    print('Writing {} events to file {}'.format(len(df), out_file))
    fio.write_data(df, out_file, key='events')
def main(configuration_path, signal_path, predictions_path, disp_model_path, sign_model_path, key, verbose):
    '''
    Train two learners to be able to reconstruct the source position.
    One regressor for disp and one classifier for the sign of delta.

    Both pmml and pickle format are supported for the output.

    CONFIGURATION_PATH: Path to the config yaml file

    SIGNAL_PATH: Path to the signal data

    PREDICTIONS_PATH : path to the file where the mc predictions are stored.

    DISP_MODEL_PATH: Path to save the disp model to.

    SIGN_MODEL_PATH: Path to save the sign model to.
        Allowed extensions are .pkl and .pmml.
        If extension is .pmml, then both pmml and pkl file will be saved
    '''
    log = setup_logging(verbose=verbose)

    config = AICTConfig.from_yaml(configuration_path)
    model_config = config.disp
    seed = config.seed

    np.random.seed(seed)

    disp_regressor = model_config.disp_regressor
    sign_classifier = model_config.sign_classifier
    for estimator in (disp_regressor, sign_classifier):
        estimator.random_state = seed

    log.info('Loading data')
    df = read_telescope_data(
        signal_path,
        config,
        model_config.columns_to_read_train,
        feature_generation_config=model_config.feature_generation,
        n_sample=model_config.n_signal,
    )
    log.info('Total number of events: {}'.format(len(df)))

    log.info(
        'Using coordinate transformations for %s',
        model_config.coordinate_transformation
    )

    df = convert_units(df, model_config)
    source_x, source_y = horizontal_to_camera(df, model_config)

    log.info('Using projected disp: {}'.format(model_config.project_disp))
    true_disp, true_sign = calc_true_disp(
        source_x, source_y,
        df[model_config.cog_x_column], df[model_config.cog_y_column],
        df[model_config.delta_column],
        project_disp=model_config.project_disp,
    )
    df['true_disp'] = true_disp
    df['true_sign'] = true_sign

    # additional features are only generated when configured
    if model_config.feature_generation:
        feature_generation(df, model_config.feature_generation, inplace=True)

    df_train = convert_to_float32(df[config.disp.features])
    df_train.dropna(how='any', inplace=True)

    log.info('Events after nan-dropping: {} '.format(len(df_train)))

    # targets restricted to the rows that survived the nan-dropping
    kept_rows = df_train.index
    target_disp = df['true_disp'].loc[kept_rows]
    target_sign = df['true_sign'].loc[kept_rows]

    # optional columns, stored with the predictions to allow performance
    # plots vs true energy / size
    with_energy = config.true_energy_column is not None
    with_size = config.size_column is not None
    if with_energy:
        true_energy = df.loc[kept_rows, config.true_energy_column].to_numpy()
    if with_size:
        size = df.loc[kept_rows, config.size_column].to_numpy()

    log_target = model_config.log_target is True
    if log_target:
        target_disp = np.log(target_disp)

    n_folds = model_config.n_cross_validations
    log.info('Starting {} fold cross validation... '.format(n_folds))

    features = df_train.values
    disp_values = target_disp.values
    sign_values = target_sign.values

    scores_disp = []
    scores_sign = []
    cv_predictions = []

    kfold = model_selection.KFold(n_splits=n_folds, shuffle=True, random_state=seed)

    for fold, (train, test) in enumerate(tqdm(kfold.split(features), total=n_folds)):
        x_fit, x_eval = features[train], features[test]

        disp_regressor.fit(x_fit, disp_values[train])
        disp_truth = disp_values[test]
        disp_estimate = disp_regressor.predict(x_eval)

        if log_target:
            # undo the log transformation before scoring / storing
            disp_truth = np.exp(disp_truth)
            disp_estimate = np.exp(disp_estimate)

        sign_classifier.fit(x_fit, sign_values[train])
        sign_truth = sign_values[test]

        # scale proba for positive sign to [-1, 1], so it's a nice score for the sign
        # where values close to -1 mean high confidence for - and values close to 1 mean
        # high confidence for +
        sign_score = 2 * sign_classifier.predict_proba(x_eval)[:, 1] - 1
        sign_estimate = np.where(sign_score < 0, -1.0, 1.0)

        scores_disp.append(metrics.r2_score(disp_truth, disp_estimate))
        scores_sign.append(metrics.accuracy_score(sign_truth, sign_estimate))

        cv_df = pd.DataFrame({
            'disp': disp_truth,
            'disp_prediction': disp_estimate,
            'sign': sign_truth,
            'sign_prediction': sign_estimate,
            'sign_score': sign_score,
            'cv_fold': fold,
        })
        if with_energy:
            cv_df[config.true_energy_column] = true_energy[test]
        if with_size:
            cv_df[config.size_column] = size[test]
        cv_predictions.append(cv_df)

    predictions_df = pd.concat(cv_predictions, ignore_index=True)

    log.info('writing predictions from cross validation')
    write_data(predictions_df, predictions_path, mode='w')

    scores_disp = np.array(scores_disp)
    scores_sign = np.array(scores_sign)
    log.info('Cross validated R^2 scores for disp: {}'.format(scores_disp))
    log.info('Mean R^2 score from CV: {:0.4f} ± {:0.4f}'.format(
        scores_disp.mean(), scores_disp.std()
    ))
    log.info('Cross validated accuracy for the sign: {}'.format(scores_sign))
    log.info('Mean accuracy from CV: {:0.4f} ± {:0.4f}'.format(
        scores_sign.mean(), scores_sign.std()
    ))

    log.info('Building new model on complete data set...')
    # set random seed again to make sure different settings
    # for n_cross_validations don't change the final model
    np.random.seed(seed)
    for estimator in (disp_regressor, sign_classifier):
        estimator.random_state = seed

    disp_regressor.fit(features, disp_values)
    sign_classifier.fit(features, sign_values)

    feature_names = list(df_train.columns)

    log.info('Pickling disp model to {} ...'.format(disp_model_path))
    save_model(
        disp_regressor,
        feature_names=feature_names,
        model_path=disp_model_path,
        label_text='abs_disp',
    )

    log.info('Pickling sign model to {} ...'.format(sign_model_path))
    save_model(
        sign_classifier,
        feature_names=feature_names,
        model_path=sign_model_path,
        label_text='sign_disp',
    )
def main(configuration_path, signal_path, predictions_path, disp_model_path, sign_model_path, key, verbose, column_name):
    '''
    Train two learners to be able to reconstruct the source position.
    One regressor for disp and one classifier for the sign of delta.

    Both pmml and pickle format are supported for the output.

    CONFIGURATION_PATH: Path to the config yaml file

    SIGNAL_PATH: Path to the signal data

    PREDICTIONS_PATH : path to the file where the mc predictions are stored.

    DISP_MODEL_PATH: Path to save the disp model to.

    SIGN_MODEL_PATH: Path to save the sign model to.
        Allowed extensions are .pkl and .pmml.
        If extension is .pmml, then both pmml and pkl file will be saved
    '''
    # Besides training, the cross validated disp predictions are appended to
    # the telescope events table of SIGNAL_PATH as column `column_name`.
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
    log = logging.getLogger()

    config = AICTConfig.from_yaml(configuration_path)
    model_config = config.disp

    disp_regressor = model_config.disp_regressor
    sign_classifier = model_config.sign_classifier

    disp_regressor.random_state = config.seed
    sign_classifier.random_state = config.seed

    # Work on a copy: appending to the list returned by the config would
    # otherwise extend the config's own column list as a side effect.
    columns = list(model_config.columns_to_read_train)
    columns.append(config.energy.target_column)
    columns.append('focal_length')

    log.info('Loading data')
    df = read_telescope_data(
        signal_path, config,
        columns,
        feature_generation_config=model_config.feature_generation,
        n_sample=model_config.n_signal
    )
    log.info('Total number of events: {}'.format(len(df)))

    source_x, source_y = horizontal_to_camera(
        az=df[model_config.source_az_column],
        alt=df[model_config.source_alt_column],
        az_pointing=df[model_config.pointing_az_column],
        alt_pointing=df[model_config.pointing_alt_column],
        focal_length=df['focal_length']
    )

    df['true_disp'], df['true_sign'] = calc_true_disp(
        source_x, source_y,
        df[model_config.cog_x_column], df[model_config.cog_y_column],
        df[model_config.delta_column],
    )

    # generate features if given in config
    if model_config.feature_generation:
        feature_generation(df, model_config.feature_generation, inplace=True)

    df_train = convert_to_float32(df)
    df_train.dropna(how='any', inplace=True)

    # Plain array, aligned row by row with df_train, so that it can be
    # indexed with the positional fold indices from KFold below.
    mc_energies = df_train[config.energy.target_column].values
    df_train = df_train[config.disp.features]

    df['prediction_disp'] = np.full(len(df), np.nan)

    log.info('Events after nan-dropping: {} '.format(len(df_train)))

    target_disp = df['true_disp'].loc[df_train.index]
    target_sign = df['true_sign'].loc[df_train.index]

    n_cross_validations = model_config.n_cross_validations
    log.info('Starting {} fold cross validation... '.format(n_cross_validations))
    scores_disp = []
    scores_sign = []
    cv_predictions = []

    kfold = model_selection.KFold(
        n_splits=n_cross_validations,
        shuffle=True,
        random_state=config.seed,
    )

    folds = tqdm(kfold.split(df_train.values), total=n_cross_validations)
    for fold, (train, test) in enumerate(folds):

        cv_x_train, cv_x_test = df_train.values[train], df_train.values[test]

        cv_disp_train, cv_disp_test = target_disp.values[train], target_disp.values[test]
        cv_sign_train, cv_sign_test = target_sign.values[train], target_sign.values[test]

        disp_regressor.fit(cv_x_train, cv_disp_train)
        cv_disp_prediction = disp_regressor.predict(cv_x_test)

        sign_classifier.fit(cv_x_train, cv_sign_train)
        cv_sign_prediction = sign_classifier.predict(cv_x_test)
        cv_sign_proba = sign_classifier.predict_proba(cv_x_test)[:, 1]

        scores_disp.append(metrics.r2_score(cv_disp_test, cv_disp_prediction))
        scores_sign.append(metrics.accuracy_score(cv_sign_test, cv_sign_prediction))

        # `test` holds positions within df_train.  Translate them to index
        # labels before writing into df: the two differ as soon as rows were
        # dropped or the index is not a plain 0..n-1 range.
        df.loc[df_train.index[test], 'prediction_disp'] = cv_disp_prediction

        cv_predictions.append(pd.DataFrame({
            'disp': cv_disp_test,
            'disp_prediction': cv_disp_prediction,
            'sign': cv_sign_test,
            'sign_prediction': cv_sign_prediction,
            'sign_probabilities': cv_sign_proba,
            'mc_energy': mc_energies[test],
            'cv_fold': fold
        }))

    predictions_df = pd.concat(cv_predictions, ignore_index=True)

    log.info('writing predictions from cross validation')
    write_data(predictions_df, predictions_path, mode='w')

    scores_disp = np.array(scores_disp)
    scores_sign = np.array(scores_sign)
    log.info('Cross validated R^2 scores for disp: {}'.format(scores_disp))
    log.info('Mean R^2 score from CV: {:0.4f} ± {:0.4f}'.format(
        scores_disp.mean(), scores_disp.std()
    ))
    log.info('Cross validated accuracy for the sign: {}'.format(scores_sign))
    log.info('Mean accuracy from CV: {:0.4f} ± {:0.4f}'.format(
        scores_sign.mean(), scores_sign.std()
    ))

    log.info('Writing new data set with predictions column')
    with h5py.File(signal_path, 'r+') as f:
        append_to_h5py(
            f, df['prediction_disp'], config.telescope_events_key, column_name
        )

    log.info('Building new model on complete data set...')
    # set random seed again to make sure different settings
    # for n_cross_validations don't change the final model
    np.random.seed(config.seed)
    disp_regressor.random_state = config.seed
    sign_classifier.random_state = config.seed

    disp_regressor.fit(df_train.values, target_disp.values)
    sign_classifier.fit(df_train.values, target_sign.values)

    log.info('Pickling disp model to {} ...'.format(disp_model_path))
    pickle_model(
        disp_regressor,
        feature_names=list(df_train.columns),
        model_path=disp_model_path,
        label_text='disp',
    )
    log.info('Pickling sign model to {} ...'.format(sign_model_path))
    # NOTE(review): the sign model is stored with label_text='disp' as well;
    # left unchanged - confirm this is intended.
    pickle_model(
        sign_classifier,
        feature_names=list(df_train.columns),
        model_path=sign_model_path,
        label_text='disp',
    )
def main(configuration_path, signal_path, predictions_path, model_path, verbose):
    '''
    Train an energy regressor on simulated gammas.
    Both pmml and pickle format are supported for the output.

    CONFIGURATION_PATH: Path to the config yaml file

    SIGNAL_PATH: Path to the signal data

    PREDICTIONS_PATH : path to the file where the mc predictions are stored.

    MODEL_PATH: Path to save the model to.
        Allowed extensions are .pkl and .pmml.
        If extension is .pmml, then both pmml and pkl file will be saved
    '''
    level = logging.DEBUG if verbose else logging.INFO
    logging.getLogger().setLevel(level)

    config = AICTConfig.from_yaml(configuration_path)
    model_config = config.energy

    df = read_telescope_data(
        signal_path,
        config,
        model_config.columns_to_read_train,
        feature_generation_config=model_config.feature_generation,
        n_sample=model_config.n_signal,
    )
    log.info('Total number of events: {}'.format(len(df)))

    df_train = convert_to_float32(df[model_config.features])
    df_train.dropna(how='any', inplace=True)
    log.debug('Events after nan-dropping: {} '.format(len(df_train)))

    # target restricted to the rows that survived the nan-dropping
    target = df[model_config.target_column].loc[df_train.index]
    target.name = 'true_energy'

    log_target = model_config.log_target is True
    if log_target:
        target = np.log(target)

    n_cv = model_config.n_cross_validations
    regressor = model_config.model

    log.info('Starting {} fold cross validation... '.format(n_cv))

    features = df_train.values
    labels = target.values
    scores = []
    cv_predictions = []

    kfold = model_selection.KFold(n_splits=n_cv, shuffle=True, random_state=config.seed)

    for fold, (train, test) in enumerate(tqdm(kfold.split(features), total=n_cv)):
        regressor.fit(features[train], labels[train])

        truth = labels[test]
        estimate = regressor.predict(features[test])

        if log_target:
            # undo the log transformation before scoring / storing
            truth = np.exp(truth)
            estimate = np.exp(estimate)

        scores.append(metrics.r2_score(truth, estimate))
        cv_predictions.append(pd.DataFrame({
            'label': truth,
            'label_prediction': estimate,
            'cv_fold': fold
        }))

    predictions_df = pd.concat(cv_predictions, ignore_index=True)

    log.info('writing predictions from cross validation')
    write_data(predictions_df, predictions_path, mode='w')

    scores = np.array(scores)
    log.info('Cross validated R^2 scores: {}'.format(scores))
    log.info('Mean R^2 score from CV: {:0.4f} ± {:0.4f}'.format(
        scores.mean(), scores.std()
    ))

    log.info('Building new model on complete data set...')
    # set random seed again to make sure different settings
    # for n_cross_validations don't change the final model
    np.random.seed(config.seed)
    regressor.random_state = config.seed

    regressor.fit(features, labels)

    log.info('Pickling model to {} ...'.format(model_path))
    pickle_model(
        regressor,
        feature_names=list(df_train.columns),
        model_path=model_path,
        label_text='estimated_energy',
    )
def main(configuration_path, signal_path, background_path, predictions_path, model_path, verbose):
    '''
    Train a classifier on signal and background monte carlo data and write the model
    to MODEL_PATH in pmml or pickle format.

    CONFIGURATION_PATH: Path to the config yaml file

    SIGNAL_PATH: Path to the signal data

    BACKGROUND_PATH: Path to the background data

    PREDICTIONS_PATH : path to the file where the mc predictions are stored.

    MODEL_PATH: Path to save the model to.
        Allowed extensions are .pkl and .pmml.
        If extension is .pmml, then both pmml and pkl file will be saved
    '''
    level = logging.DEBUG if verbose else logging.INFO
    logging.getLogger().setLevel(level)

    check_extension(predictions_path)
    check_extension(model_path, allowed_extensions=['.pmml', '.pkl'])

    config = AICTConfig.from_yaml(configuration_path)
    model_config = config.separator

    def load_labelled(path, n_sample, label_text, label):
        # read one event sample and tag every row with its class
        events = read_telescope_data(
            path,
            config,
            model_config.columns_to_read_train,
            feature_generation_config=model_config.feature_generation,
            n_sample=n_sample,
        )
        events['label_text'] = label_text
        events['label'] = label
        return events

    log.info('Loading signal data')
    df_signal = load_labelled(signal_path, model_config.n_signal, 'signal', 1)

    log.info('Loading background data')
    df_background = load_labelled(background_path, model_config.n_background, 'background', 0)

    df_full = pd.concat([df_background, df_signal], ignore_index=True)

    df_training = convert_to_float32(df_full[model_config.features])
    log.debug('Total training events: {}'.format(len(df_training)))

    df_training.dropna(how='any', inplace=True)
    log.debug('Training events after dropping nans: {}'.format(len(df_training)))

    # labels restricted to the rows that survived the nan-dropping
    label = df_full.loc[df_training.index, 'label']

    n_gammas = int((label == 1).sum())
    n_protons = int((label == 0).sum())
    log.info('Training classifier with {} background and {} signal events'.format(
        n_protons, n_gammas
    ))
    log.debug(model_config.features)

    X = df_training.values
    y = label.values
    n_cross_validations = model_config.n_cross_validations
    classifier = model_config.model

    log.info('Starting {} fold cross validation... '.format(n_cross_validations))

    stratified_kfold = model_selection.StratifiedKFold(
        n_splits=n_cross_validations, shuffle=True, random_state=config.seed
    )

    # one frame of predictions and one ROC AUC per cv iteration
    cv_predictions = []
    aucs = []

    folds = tqdm(stratified_kfold.split(X, y), total=n_cross_validations)
    for fold, (train, test) in enumerate(folds):
        classifier.fit(X[train], y[train])

        x_eval, truth = X[test], y[test]
        y_probas = classifier.predict_proba(x_eval)[:, 1]
        y_prediction = classifier.predict(x_eval)

        cv_predictions.append(pd.DataFrame({
            'label': truth,
            'label_prediction': y_prediction,
            'probabilities': y_probas,
            'cv_fold': fold,
        }))
        aucs.append(metrics.roc_auc_score(truth, y_probas))

    log.info('Mean AUC ROC : {}'.format(np.array(aucs).mean()))

    predictions_df = pd.concat(cv_predictions, ignore_index=True)
    log.info('Writing predictions from cross validation')
    write_data(predictions_df, predictions_path, mode='w')

    # set random seed again to make sure different settings
    # for n_cross_validations don't change the final model
    np.random.seed(config.seed)
    classifier.random_state = config.seed

    if model_config.calibrate_classifier:
        log.info('Training calibrated classifier')
        classifier = CalibratedClassifierCV(classifier, cv=2, method='sigmoid')
    else:
        log.info('Training model on complete dataset')
    classifier.fit(X, y)

    log.info('Pickling model to {} ...'.format(model_path))
    pickle_model(
        classifier=classifier,
        model_path=model_path,
        label_text='label',
        feature_names=list(df_training.columns)
    )
def main(configuration_path, signal_path, predictions_path, disp_model_path, sign_model_path, key, verbose):
    '''
    Train two learners to be able to reconstruct the source position.
    One regressor for disp and one classifier for the sign of delta.

    Both pmml and pickle format are supported for the output.

    CONFIGURATION_PATH: Path to the config yaml file

    SIGNAL_PATH: Path to the signal data

    PREDICTIONS_PATH : path to the file where the mc predictions are stored.

    DISP_MODEL_PATH: Path to save the disp model to.

    SIGN_MODEL_PATH: Path to save the sign model to.
        Allowed extensions are .pkl and .pmml.
        If extension is .pmml, then both pmml and pkl file will be saved
    '''
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
    log = logging.getLogger()

    config = AICTConfig.from_yaml(configuration_path)
    model_config = config.disp
    seed = config.seed

    np.random.seed(seed)

    disp_regressor = model_config.disp_regressor
    sign_classifier = model_config.sign_classifier
    for estimator in (disp_regressor, sign_classifier):
        estimator.random_state = seed

    log.info('Loading data')
    df = read_telescope_data(
        signal_path,
        config,
        model_config.columns_to_read_train,
        feature_generation_config=model_config.feature_generation,
        n_sample=model_config.n_signal,
    )
    log.info('Total number of events: {}'.format(len(df)))

    source_x, source_y = horizontal_to_camera(
        az=df[model_config.source_az_column],
        zd=df[model_config.source_zd_column],
        az_pointing=df[model_config.pointing_az_column],
        zd_pointing=df[model_config.pointing_zd_column],
    )

    true_disp, true_sign = calc_true_disp(
        source_x, source_y,
        df[model_config.cog_x_column], df[model_config.cog_y_column],
        df[model_config.delta_column],
    )
    df['true_disp'] = true_disp
    df['true_sign'] = true_sign

    # additional features are only generated when configured
    if model_config.feature_generation:
        feature_generation(df, model_config.feature_generation, inplace=True)

    df_train = convert_to_float32(df[config.disp.features])
    df_train.dropna(how='any', inplace=True)

    log.info('Events after nan-dropping: {} '.format(len(df_train)))

    # targets restricted to the rows that survived the nan-dropping
    target_disp = df['true_disp'].loc[df_train.index]
    target_sign = df['true_sign'].loc[df_train.index]

    n_folds = model_config.n_cross_validations
    log.info('Starting {} fold cross validation... '.format(n_folds))

    features = df_train.values
    disp_values = target_disp.values
    sign_values = target_sign.values

    scores_disp = []
    scores_sign = []
    cv_predictions = []

    kfold = model_selection.KFold(n_splits=n_folds, shuffle=True, random_state=seed)

    for fold, (train, test) in enumerate(tqdm(kfold.split(features), total=n_folds)):
        x_fit, x_eval = features[train], features[test]
        disp_truth = disp_values[test]
        sign_truth = sign_values[test]

        disp_regressor.fit(x_fit, disp_values[train])
        disp_estimate = disp_regressor.predict(x_eval)

        sign_classifier.fit(x_fit, sign_values[train])
        sign_estimate = sign_classifier.predict(x_eval)

        scores_disp.append(metrics.r2_score(disp_truth, disp_estimate))
        scores_sign.append(metrics.accuracy_score(sign_truth, sign_estimate))

        cv_predictions.append(pd.DataFrame({
            'disp': disp_truth,
            'disp_prediction': disp_estimate,
            'sign': sign_truth,
            'sign_prediction': sign_estimate,
            'cv_fold': fold
        }))

    predictions_df = pd.concat(cv_predictions, ignore_index=True)

    log.info('writing predictions from cross validation')
    write_data(predictions_df, predictions_path, mode='w')

    scores_disp = np.array(scores_disp)
    scores_sign = np.array(scores_sign)
    log.info('Cross validated R^2 scores for disp: {}'.format(scores_disp))
    log.info('Mean R^2 score from CV: {:0.4f} ± {:0.4f}'.format(
        scores_disp.mean(), scores_disp.std()
    ))
    log.info('Cross validated accuracy for the sign: {}'.format(scores_sign))
    log.info('Mean accuracy from CV: {:0.4f} ± {:0.4f}'.format(
        scores_sign.mean(), scores_sign.std()
    ))

    log.info('Building new model on complete data set...')
    # set random seed again to make sure different settings
    # for n_cross_validations don't change the final model
    np.random.seed(seed)
    for estimator in (disp_regressor, sign_classifier):
        estimator.random_state = seed

    disp_regressor.fit(features, disp_values)
    sign_classifier.fit(features, sign_values)

    feature_names = list(df_train.columns)

    log.info('Pickling disp model to {} ...'.format(disp_model_path))
    pickle_model(
        disp_regressor,
        feature_names=feature_names,
        model_path=disp_model_path,
        label_text='disp',
    )
    log.info('Pickling sign model to {} ...'.format(sign_model_path))
    pickle_model(
        sign_classifier,
        feature_names=feature_names,
        model_path=sign_model_path,
        label_text='disp',
    )
def write_hdf(data, path, table_name, mode="w", **kwargs):
    '''
    Write ``data`` to ``path`` via ``write_data`` using the h5py backend.

    ``table_name`` is passed on as ``key`` and ``mode`` as the file mode;
    any further keyword arguments are forwarded unchanged.
    '''
    write_data(
        data,
        path,
        key=table_name,
        use_h5py=True,
        mode=mode,
        **kwargs
    )
def main(configuration_path, signal_path, background_path, predictions_path, model_path, verbose):
    '''
    Train a classifier on signal and background monte carlo data and write the model
    to MODEL_PATH in pmml or pickle format.

    CONFIGURATION_PATH: Path to the config yaml file

    SIGNAL_PATH: Path to the signal data

    BACKGROUND_PATH: Path to the background data

    PREDICTIONS_PATH : path to the file where the mc predictions are stored.

    MODEL_PATH: Path to save the model to.
        Allowed extensions are .pkl and .pmml.
        If extension is .pmml, then both pmml and pkl file will be saved
    '''
    log = setup_logging(verbose=verbose)

    check_extension(predictions_path)
    check_extension(model_path, allowed_extensions=['.pmml', '.pkl', '.onnx'])

    config = AICTConfig.from_yaml(configuration_path)
    model_config = config.separator
    label_text = model_config.output_name

    def load_labelled(path, n_sample, class_name, class_id):
        # read one event sample and tag every row with its class
        events = read_telescope_data(
            path,
            config,
            model_config.columns_to_read_train,
            feature_generation_config=model_config.feature_generation,
            n_sample=n_sample,
        )
        events['label_text'] = class_name
        events['label'] = class_id
        return events

    log.info('Loading signal data')
    df_signal = load_labelled(signal_path, model_config.n_signal, 'signal', 1)

    log.info('Loading background data')
    df_background = load_labelled(background_path, model_config.n_background, 'background', 0)

    df = pd.concat([df_background, df_signal], ignore_index=True)

    df_train = convert_to_float32(df[model_config.features])
    log.debug('Total training events: {}'.format(len(df_train)))

    df_train.dropna(how='any', inplace=True)
    log.debug('Training events after dropping nans: {}'.format(len(df_train)))

    # labels restricted to the rows that survived the nan-dropping
    kept_rows = df_train.index
    label = df.loc[kept_rows, 'label']

    # optional columns, stored with the predictions to allow performance
    # plots vs true energy / size
    with_energy = config.true_energy_column is not None
    with_size = config.size_column is not None
    if with_energy:
        true_energy = df.loc[kept_rows, config.true_energy_column].to_numpy()
    if with_size:
        size = df.loc[kept_rows, config.size_column].to_numpy()

    n_gammas = int((label == 1).sum())
    n_protons = int((label == 0).sum())
    log.info(
        'Training classifier with {} background and {} signal events'.format(
            n_protons, n_gammas))
    log.debug(model_config.features)

    X = df_train.values
    y = label.values
    n_cross_validations = model_config.n_cross_validations
    classifier = model_config.model

    log.info(
        'Starting {} fold cross validation... '.format(n_cross_validations))

    stratified_kfold = model_selection.StratifiedKFold(
        n_splits=n_cross_validations, shuffle=True, random_state=config.seed)

    # one frame of predictions and one ROC AUC per cv iteration
    cv_predictions = []
    aucs = []

    folds = tqdm(stratified_kfold.split(X, y), total=n_cross_validations)
    for fold, (train, test) in enumerate(folds):
        classifier.fit(X[train], y[train])

        truth = y[test]
        y_probas = classifier.predict_proba(X[test])[:, 1]

        cv_df = pd.DataFrame({
            'label': truth,
            model_config.output_name: y_probas,
            'cv_fold': fold,
        })
        if with_energy:
            cv_df[config.true_energy_column] = true_energy[test]
        if with_size:
            cv_df[config.size_column] = size[test]
        cv_predictions.append(cv_df)

        aucs.append(metrics.roc_auc_score(truth, y_probas))

    aucs = np.array(aucs)
    log.info('Cross-validation ROC-AUCs: {}'.format(aucs))
    log.info('Mean AUC ROC : {:.3f} ± {:.3f}'.format(aucs.mean(), aucs.std()))

    predictions_df = pd.concat(cv_predictions, ignore_index=True)
    log.info('Writing predictions from cross validation')
    write_data(predictions_df, predictions_path, mode='w')

    # set random seed again to make sure different settings
    # for n_cross_validations don't change the final model
    np.random.seed(config.seed)
    classifier.random_state = config.seed

    if model_config.calibrate_classifier:
        log.info('Training calibrated classifier')
        classifier = CalibratedClassifierCV(classifier, cv=2, method='sigmoid')
    else:
        log.info('Training model on complete dataset')
    classifier.fit(X, y)

    log.info('Saving model to {} ...'.format(model_path))
    save_model(
        classifier,
        model_path=model_path,
        label_text=label_text,
        feature_names=list(df_train.columns),
    )
def main(configuration_path, signal_path, predictions_path, model_path, verbose, column_name):
    '''
    Train a x_max regressor. Both pmml and pickle format are supported for the output.

    CONFIGURATION_PATH: Path to the config yaml file

    SIGNAL_PATH: Path to the signal data. The file is additionally opened
        in 'r+' mode to append the cross validated predictions to it.

    PREDICTIONS_PATH : path to the file where the mc predictions are stored.

    MODEL_PATH: Path to save the model to.
        Allowed extensions are .pkl and .pmml.
        If extension is .pmml, then both pmml and pkl file will be saved

    COLUMN_NAME: Name handed to append_to_h5py for the prediction column;
        it is also used as the label text of the saved model.
    '''
    logging.getLogger().setLevel(logging.DEBUG if verbose else logging.INFO)

    config = AICTConfig.from_yaml(configuration_path)
    model_config = config.x_max

    # Copy before appending so the list owned by the config is not mutated.
    columns = list(model_config.columns_to_read_train)
    columns.append(config.energy.target_column)

    df = read_telescope_data(
        signal_path,
        config,
        columns,
        feature_generation_config=model_config.feature_generation,
        n_sample=model_config.n_signal,
    )

    log.info('Total number of events: {}'.format(len(df)))

    df_train = convert_to_float32(df)
    df_train.dropna(how='any', inplace=True)

    # Plain array: the cross validation below yields *positions* into
    # df_train, which must not be used as index labels of a Series.
    mc_energies = df_train[config.energy.target_column].values
    df_train = df_train[config.x_max.features]
    df['prediction_x_max'] = np.zeros(len(df)) * np.nan

    log.debug('Events after nan-dropping: {} '.format(len(df_train)))

    target = df[model_config.target_column].loc[df_train.index]
    target.name = 'true_x_max'

    n_cross_validations = model_config.n_cross_validations
    regressor = model_config.model
    log.info(
        'Starting {} fold cross validation... '.format(n_cross_validations))

    scores = []
    cv_predictions = []

    features = df_train.values
    target_values = target.values
    # labels of the rows that survived the nan-dropping, in training order
    train_labels = df_train.index

    kfold = model_selection.KFold(
        n_splits=n_cross_validations,
        shuffle=True,
        random_state=config.seed,
    )

    cv_it = kfold.split(features)
    for fold, (train, test) in enumerate(tqdm(cv_it, total=n_cross_validations)):
        cv_y_test = target_values[test]

        regressor.fit(features[train], target_values[train])
        cv_y_prediction = regressor.predict(features[test])

        # NOTE(review): the target is never log-transformed in this function,
        # yet truth and prediction are exponentiated here when log_target is
        # set -- confirm this is intended for x_max.
        if model_config.log_target is True:
            cv_y_test = np.exp(cv_y_test)
            cv_y_prediction = np.exp(cv_y_prediction)

        scores.append(metrics.r2_score(cv_y_test, cv_y_prediction))

        # Translate fold positions to row labels of df and assign through
        # .loc, so the values end up in the right rows and in df itself
        # (not in a temporary produced by chained indexing).
        df.loc[train_labels[test], 'prediction_x_max'] = cv_y_prediction

        cv_predictions.append(
            pd.DataFrame({
                'x_max': cv_y_test,
                'x_max_prediction': cv_y_prediction,
                'mc_energy': mc_energies[test],
                'cv_fold': fold,
            }))

    predictions_df = pd.concat(cv_predictions, ignore_index=True)

    log.info('writing predictions from cross validation')
    write_data(predictions_df, predictions_path, mode='w')

    scores = np.array(scores)

    # write the predictions back into the signal file (equivalent to apply)
    log.info('Writing new data set with predictions column')
    with h5py.File(signal_path, 'r+') as f:
        append_to_h5py(f, df['prediction_x_max'], config.telescope_events_key, column_name)

    log.info('Cross validated R^2 scores: {}'.format(scores))
    log.info('Mean R^2 score from CV: {:0.4f} ± {:0.4f}'.format(
        scores.mean(), scores.std()))

    log.info('Building new model on complete data set...')
    # set random seed again to make sure different settings
    # for n_cross_validations don't change the final model
    np.random.seed(config.seed)
    regressor.random_state = config.seed

    regressor.fit(features, target_values)

    log.info('Pickling model to {} ...'.format(model_path))
    pickle_model(
        regressor,
        feature_names=list(df_train.columns),
        model_path=model_path,
        label_text=column_name,
    )
def main(configuration_path, signal_path, background_path, predictions_path, model_path, verbose):
    '''
    Train a classifier on signal and background monte carlo data and write the model
    to MODEL_PATH in pmml or pickle format.

    CONFIGURATION_PATH: Path to the config yaml file

    SIGNAL_PATH: Path to the signal data

    BACKGROUND_PATH: Path to the background data

    PREDICTIONS_PATH : path to the file where the mc predictions are stored.

    MODEL_PATH: Path to save the model to.
        Allowed extensions are .pkl and .pmml.
        If extension is .pmml, then both pmml and pkl file will be saved
    '''
    logging.getLogger().setLevel(logging.DEBUG if verbose else logging.INFO)

    check_extension(predictions_path)
    check_extension(model_path, allowed_extensions=['.pmml', '.pkl'])

    config = AICTConfig.from_yaml(configuration_path)
    model_config = config.separator

    # Copy before appending so the list owned by the config is not mutated.
    columns = list(model_config.columns_to_read_train)
    columns.append(config.energy.target_column)

    log.info('Loading signal data')
    df_signal = read_telescope_data(
        signal_path,
        config,
        columns,
        feature_generation_config=model_config.feature_generation,
        n_sample=model_config.n_signal,
    )
    df_signal['label_text'] = 'signal'
    df_signal['label'] = 1

    log.info('Loading background data')
    df_background = read_telescope_data(
        background_path,
        config,
        columns,
        feature_generation_config=model_config.feature_generation,
        n_sample=model_config.n_background,
    )
    df_background['label_text'] = 'background'
    df_background['label'] = 0

    df_full = pd.concat([df_background, df_signal], ignore_index=True)

    log.debug('Total training events: {}'.format(len(df_full)))
    # dropna returns a new frame, df_full itself stays untouched
    df_training = df_full.dropna(how='any')
    log.debug('Training events after dropping nans: {}'.format(
        len(df_training)))

    # Plain array: the cross validation below yields *positions* into the
    # training data. After dropping rows the index labels of df_training
    # have gaps, so label-based Series indexing with these positions would
    # pick wrong rows or raise a KeyError.
    mc_energies = np.asarray(
        convert_to_float32(df_training[config.energy.target_column]))

    label = df_training['label']
    df_training = convert_to_float32(df_training[model_config.features])

    n_gammas = len(label[label == 1])
    n_protons = len(label[label == 0])
    log.info(
        'Training classifier with {} background and {} signal events'.format(
            n_protons, n_gammas))
    log.debug(model_config.features)

    # one prediction frame per cv iteration
    cv_predictions = []

    X = df_training.values
    y = label.values
    n_cross_validations = model_config.n_cross_validations
    classifier = model_config.model

    log.info(
        'Starting {} fold cross validation... '.format(n_cross_validations))

    stratified_kfold = model_selection.StratifiedKFold(
        n_splits=n_cross_validations,
        shuffle=True,
        random_state=config.seed,
    )

    aucs = []
    cv_it = stratified_kfold.split(X, y)
    for fold, (train, test) in enumerate(tqdm(cv_it, total=n_cross_validations)):
        # select data
        xtrain, xtest = X[train], X[test]
        ytrain, ytest = y[train], y[test]

        # fit and predict
        classifier.fit(xtrain, ytrain)

        y_probas = classifier.predict_proba(xtest)[:, 1]
        y_prediction = classifier.predict(xtest)

        cv_predictions.append(
            pd.DataFrame({
                'label': ytest,
                'label_prediction': y_prediction,
                'probabilities': y_probas,
                'cv_fold': fold,
                'mc_energy': mc_energies[test],
            }))
        aucs.append(metrics.roc_auc_score(ytest, y_probas))

    log.info('Mean AUC ROC : {}'.format(np.array(aucs).mean()))

    predictions_df = pd.concat(cv_predictions, ignore_index=True)
    log.info('Writing predictions from cross validation')
    write_data(predictions_df, predictions_path, mode='w')

    # set random seed again to make sure different settings
    # for n_cross_validations don't change the final model
    np.random.seed(config.seed)
    classifier.random_state = config.seed

    if model_config.calibrate_classifier:
        log.info('Training calibrated classifier')
        classifier = CalibratedClassifierCV(classifier, cv=2, method='sigmoid')
        classifier.fit(X, y)
    else:
        log.info('Training model on complete dataset')
        classifier.fit(X, y)

    log.info('Pickling model to {} ...'.format(model_path))
    pickle_model(
        classifier=classifier,
        model_path=model_path,
        label_text='label',
        feature_names=list(df_training.columns),
    )
def main(configuration_path, signal_path, predictions_path, dxdy_model_path, key, verbose):
    """
    Fit a single multi-output regressor that reconstructs the source
    position as the pair (dx, dy).
    The model can be stored in pmml as well as in pickle format.

    CONFIGURATION_PATH: Path to the config yaml file

    SIGNAL_PATH: Path to the signal data

    PREDICTIONS_PATH : path to the file where the mc predictions are stored.

    DXDY_MODEL_PATH: Path to save the dxdy model to.
        Allowed extensions are .pkl and .pmml.
        If extension is .pmml, then both pmml and pkl file will be saved
    """
    log = setup_logging(verbose=verbose)

    config = AICTConfig.from_yaml(configuration_path)
    model_config = config.dxdy

    # seed numpy and the regressor before any data is read
    np.random.seed(config.seed)
    dxdy_regressor = model_config.dxdy_regressor
    dxdy_regressor.random_state = config.seed

    log.info("Loading data")
    df = read_telescope_data(
        signal_path,
        config,
        model_config.columns_to_read_train,
        feature_generation_config=model_config.feature_generation,
        n_sample=model_config.n_signal,
    )
    log.info("Total number of events: {}".format(len(df)))

    log.info(
        "Using coordinate transformations for %s",
        config.coordinate_transformation,
    )

    # regression targets: offset of the source position from the
    # centre-of-gravity columns
    source_x, source_y = horizontal_to_camera(df, config)
    df["true_dx"] = source_x - df[config.cog_x_column]
    df["true_dy"] = source_y - df[config.cog_y_column]

    # generate features if given in config
    if model_config.feature_generation:
        feature_generation(df, model_config.feature_generation, inplace=True)

    df_train = convert_to_float32(df[model_config.features])
    df_train.dropna(how="any", inplace=True)
    log.info("Events after nan-dropping: {} ".format(len(df_train)))

    kept_rows = df_train.index
    target_dxdy = df.loc[kept_rows, ["true_dx", "true_dy"]].to_numpy()

    # optional columns, only carried along to allow performance plots
    # vs true energy / size
    has_energy = config.true_energy_column is not None
    has_size = config.size_column is not None
    if has_energy:
        true_energy = df.loc[kept_rows, config.true_energy_column].to_numpy()
    if has_size:
        size = df.loc[kept_rows, config.size_column].to_numpy()

    n_folds = model_config.n_cross_validations
    log.info("Starting {} fold cross validation... ".format(n_folds))

    scores_dxdy = []
    cv_predictions = []
    features = df_train.values

    splitter = model_selection.KFold(
        n_splits=n_folds,
        shuffle=True,
        random_state=config.seed,
    )

    folds = tqdm(splitter.split(features), total=n_folds)
    for fold, (train, test) in enumerate(folds):
        truth = target_dxdy[test]

        dxdy_regressor.fit(features[train], target_dxdy[train])
        predicted = dxdy_regressor.predict(features[test])

        scores_dxdy.append(metrics.r2_score(truth, predicted))

        cv_df = pd.DataFrame({
            "dx": truth[:, 0],
            "dy": truth[:, 1],
            "dx_prediction": predicted[:, 0],
            "dy_prediction": predicted[:, 1],
            "cv_fold": fold,
        })
        if has_energy:
            cv_df[config.true_energy_column] = true_energy[test]
        if has_size:
            cv_df[config.size_column] = size[test]
        cv_predictions.append(cv_df)

    predictions_df = pd.concat(cv_predictions, ignore_index=True)

    log.info("writing predictions from cross validation")
    write_data(predictions_df, predictions_path, mode="w")

    scores_dxdy = np.array(scores_dxdy)
    log.info("Cross validated R^2 scores for dxdy: {}".format(scores_dxdy))
    log.info("Mean R^2 score from CV: {:0.4f} ± {:0.4f}".format(
        scores_dxdy.mean(), scores_dxdy.std()))

    log.info("Building new model on complete data set...")
    # re-seed so that the final model does not depend on the chosen
    # number of cross validations
    np.random.seed(config.seed)
    dxdy_regressor.random_state = config.seed
    dxdy_regressor.fit(features, target_dxdy)

    log.info("Pickling dxdy model to {} ...".format(dxdy_model_path))
    save_model(
        dxdy_regressor,
        feature_names=list(df_train.columns),
        model_path=dxdy_model_path,
        label_text="dxdy",
    )
def main(configuration_path, signal_path, predictions_path, model_path, verbose):
    """
    Fit an energy regressor on simulated gamma events.
    The model can be stored in pmml as well as in pickle format.

    CONFIGURATION_PATH: Path to the config yaml file

    SIGNAL_PATH: Path to the signal data

    PREDICTIONS_PATH : path to the file where the mc predictions are stored.

    MODEL_PATH: Path to save the model to.
        Allowed extensions are .pkl and .pmml.
        If extension is .pmml, then both pmml and pkl file will be saved
    """
    log = setup_logging(verbose=verbose)

    config = AICTConfig.from_yaml(configuration_path)
    model_config = config.energy

    df = read_telescope_data(
        signal_path,
        config,
        model_config.columns_to_read_train,
        feature_generation_config=model_config.feature_generation,
        n_sample=model_config.n_signal,
    )
    log.info("Total number of events: {}".format(len(df)))

    df_train = convert_to_float32(df[model_config.features])
    df_train.dropna(how="any", inplace=True)
    log.debug("Events after nan-dropping: {} ".format(len(df_train)))

    # target restricted to the rows that survived the nan-dropping
    target = df[model_config.target_column].loc[df_train.index]
    target.name = "true_energy"

    # the regressor is trained on log(target) when configured
    if model_config.log_target is True:
        target = np.log(target)

    n_cv = model_config.n_cross_validations
    regressor = model_config.model
    log.info("Starting {} fold cross validation... ".format(n_cv))

    scores = []
    cv_predictions = []

    features = df_train.values
    truth = target.values

    splitter = model_selection.KFold(
        n_splits=n_cv,
        shuffle=True,
        random_state=config.seed,
    )

    folds = tqdm(splitter.split(features), total=n_cv)
    for fold, (train, test) in enumerate(folds):
        regressor.fit(features[train], truth[train])

        fold_truth = truth[test]
        fold_prediction = regressor.predict(features[test])

        # undo the log transformation before scoring and storing
        if model_config.log_target is True:
            fold_truth = np.exp(fold_truth)
            fold_prediction = np.exp(fold_prediction)

        scores.append(metrics.r2_score(fold_truth, fold_prediction))

        cv_predictions.append(pd.DataFrame({
            model_config.target_column: fold_truth,
            model_config.output_name: fold_prediction,
            "cv_fold": fold,
        }))

    predictions_df = pd.concat(cv_predictions, ignore_index=True)

    log.info("writing predictions from cross validation")
    write_data(predictions_df, predictions_path, mode="w")

    scores = np.array(scores)
    log.info("Cross validated R^2 scores: {}".format(scores))
    log.info("Mean R^2 score from CV: {:0.4f} ± {:0.4f}".format(
        scores.mean(), scores.std()))

    log.info("Building new model on complete data set...")
    # re-seed so that the final model does not depend on the chosen
    # number of cross validations
    np.random.seed(config.seed)
    regressor.random_state = config.seed
    regressor.fit(features, truth)

    log.info("Pickling model to {} ...".format(model_path))
    save_model(
        regressor,
        feature_names=list(df_train.columns),
        model_path=model_path,
        label_text=model_config.output_name,
    )
def _read_merged_events(input_path, how):
    '''
    Read array and telescope events from INPUT_PATH and merge them.

    Both tables get a helper column holding their own index
    (`arr_id_unique` / `tel_id_unique`) so that rows of the merged table
    can be traced back to the tables they came from.

    Returns the tuple (array_events, telescope_events, merged).
    '''
    array_events = read_data(input_path, key='array_events')
    array_events['arr_id_unique'] = array_events.index

    telescope_events = read_data(input_path, key='telescope_events')
    telescope_events['tel_id_unique'] = telescope_events.index

    merged = pd.merge(
        array_events,
        telescope_events,
        on=['run_id', 'array_event_id'],
        how=how,
    )
    return array_events, telescope_events, merged


def _write_selected_events(array_events, telescope_events, merged, arr_ids, output_path):
    '''
    Write the array events listed in ARR_IDS and their telescope events
    to OUTPUT_PATH, without the helper id columns.
    '''
    tel_ids = merged[merged.arr_id_unique.isin(arr_ids)].tel_id_unique

    # drop() without inplace returns a new frame, so the boolean-mask
    # selection is never modified in place
    selected_telescope_events = telescope_events[
        telescope_events.tel_id_unique.isin(tel_ids)
    ].drop(columns=['tel_id_unique'])
    write_data(selected_telescope_events, output_path,
               key='telescope_events', use_h5py=True, mode='w')

    selected_array_events = array_events[
        array_events.arr_id_unique.isin(arr_ids)
    ].drop(columns=['arr_id_unique'])
    # write the *selected* array events, not the complete input table
    write_data(selected_array_events, output_path,
               key='array_events', use_h5py=True, mode='a')


def main(configuration_path, input_path1, input_path2, output_path1, output_path2):
    '''
    Equalize two datasets in INPUT_PATH1 and INPUT_PATH2 in the feature `intensity`.
    In case of multiple telescopes, the feature `average_intensity` is used instead.

    The equalized datasets are written as hdf5 files to OUTPUT_PATH1 and
    OUTPUT_PATH2.

    Important remark: No run information is stored in the output file,
    since it is no longer valid due to sampling.
    Therefore, these output files should only be used for training.
    '''
    config = AICTConfig.from_yaml(configuration_path)

    # The merge types are kept exactly as they have always been:
    # left merge for file 1, inner merge (the pandas default) for file 2.
    array_events1, telescope_events1, df1 = _read_merged_events(input_path1, how='left')
    array_events2, telescope_events2, df2 = _read_merged_events(input_path2, how='inner')

    print('Number of events before: ',
          '\nFile 1: ', len(array_events1),
          '\nFile 2: ', len(array_events2))

    if config.has_multiple_telescopes:
        feature = 'average_intensity'
    else:
        feature = 'intensity'

    values1 = df1[feature]
    values2 = df2[feature]

    # logarithmic binning over the common range, roughly 0.05 decades per step
    minimum = np.min([np.nanmin(values1), np.nanmin(values2)])
    maximum = np.max([np.nanmax(values1), np.nanmax(values2)])
    minimum_log = np.log10(minimum)
    maximum_log = np.log10(maximum)
    binning = np.logspace(minimum_log, maximum_log,
                          int((maximum_log - minimum_log) / 0.05))

    arr_ids1 = np.array([])
    arr_ids2 = np.array([])

    for start, end in zip(binning[:-1], binning[1:]):
        # Both comparisons are strict, so rows lying exactly on a bin edge
        # fall into no bin.
        ids1 = df1[(values1 > start) & (values1 < end)].arr_id_unique
        ids2 = df2[(values2 > start) & (values2 < end)].arr_id_unique

        # keep the smaller sample completely and draw equally many
        # entries from the larger one
        if len(ids1) < len(ids2):
            arr_ids1 = np.append(arr_ids1, ids1)
            arr_ids2 = np.append(
                arr_ids2,
                np.random.choice(ids2, size=len(ids1), replace=False))
        else:
            arr_ids2 = np.append(arr_ids2, ids2)
            arr_ids1 = np.append(
                arr_ids1,
                np.random.choice(ids1, size=len(ids2), replace=False))

    print('Number of events after: ',
          '\nFile 1: ', len(arr_ids1),
          '\nFile 2: ', len(arr_ids2))

    _write_selected_events(
        array_events1, telescope_events1, df1, arr_ids1, output_path1)
    _write_selected_events(
        array_events2, telescope_events2, df2, arr_ids2, output_path2)