"""Authors: KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands

Redistribution and use in source and binary forms, with or without
modification, are permitted according to the terms listed in the file LICENSE.
"""

from eskapade import ConfigObject, Chain
from eskapade import core_ops
from eskapade import process_manager
from eskapade.logger import Logger, LogLevel

logger = Logger()
logger.debug('Now parsing configuration file esk107_chain_looper')

#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk107_chain_looper'
settings['version'] = 0

#########################################################################################
# --- Analysis configuration flags.
#     E.g. use these flags to turn on or off certain chains with links.
#     By default all set to false, unless already configured in
#     configobject or vars().

# turn on/off the example
"""KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands

Redistribution and use in source and binary forms, with or without
modification, are permitted according to the terms listed in the file LICENSE.
"""

from eskapade import ConfigObject, Chain
from eskapade import analysis, root_analysis, visualization
from eskapade import process_manager
from eskapade import resources
from eskapade.logger import Logger, LogLevel

logger = Logger()
logger.debug('Now parsing configuration file esk410_testing_correlations_between_categories')

#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk410_testing_correlations_between_categories'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

input_files = [resources.fixture('mock_accounts.csv.gz')]

#########################################################################################
# --- now set up the chains and links based on configuration flags
"""KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands

Redistribution and use in source and binary forms, with or without
modification, are permitted according to the terms listed in the file LICENSE.
"""

from eskapade import ConfigObject, Chain
from eskapade import core_ops, analysis, root_analysis
from eskapade import process_manager
from eskapade import resources
from eskapade.logger import Logger, LogLevel

logger = Logger()
logger.debug('Now parsing configuration file esk405_simulation_based_on_binned_data')

#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk405_simulation_based_on_binned_data'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

# flag to run the simulation with a higher number of dimensions
settings['high_num_dims'] = False

input_files = [resources.fixture('mock_accounts.csv.gz')]
"""KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands

Redistribution and use in source and binary forms, with or without
modification, are permitted according to the terms listed in the file LICENSE.
"""

import pyspark

from eskapade import process_manager, ConfigObject, DataStore, spark_analysis, Chain
from eskapade.logger import Logger
from eskapade.spark_analysis import SparkManager

logger = Logger()
logger.debug('Now parsing configuration file esk606_convert_spark_df.')

##########################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk606_convert_spark_df'
settings['version'] = 0

##########################################################################
# --- start Spark session

spark = process_manager.service(SparkManager).create_session(eskapade_settings=settings)

##########################################################################
"""final histogram.

Authors: KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands

Redistribution and use in source and binary forms, with or without
modification, are permitted according to the terms listed in the file LICENSE.
"""

from eskapade import analysis, core_ops, process_manager, resources, visualization, ConfigObject, Chain
from eskapade.logger import Logger, LogLevel

logger = Logger()
logger.debug('Now parsing configuration file esk302_histogram_filler_plotter.')

#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk302_histogram_filler_plotter'
settings['version'] = 0

#########################################################################################

# NOTE(review): internal line breaks of this message were lost in extraction;
# reconstructed to the conventional layout — confirm against the original macro.
msg = r"""
The plots and latex files produced by link hist_summary can be found in dir:
{path}
"""
logger.info(msg, path=settings['resultsDir'] + '/' + settings['analysisName'] + '/data/v0/report/')
"""Authors: KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands

Redistribution and use in source and binary forms, with or without
modification, are permitted according to the terms listed in the file LICENSE.
"""

from eskapade import ConfigObject, resources, Chain
from eskapade import analysis, visualization
from eskapade import process_manager
from eskapade.logger import Logger, LogLevel

logger = Logger()
logger.debug('Now parsing configuration file esk306_concatenate_reports.')

#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk306_concatenate_reports'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

input_files = resources.fixture('correlated_data.sv.gz')

#########################################################################################
# --- now set up the chains and links based on configuration flags
Authors: KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands Redistribution and use in source and binary forms, with or without modification, are permitted according to the terms listed in the file LICENSE. """ from eskapade import ConfigObject, Chain from eskapade import analysis, visualization from eskapade import process_manager from eskapade.logger import Logger logger = Logger() logger.debug('Now parsing configuration file esk301_dfsummary_plotter.') ######################################################################################### # --- minimal analysis information settings = process_manager.service(ConfigObject) settings['analysisName'] = 'esk301_dfsummary_plotter' settings['version'] = 0 ######################################################################################### # --- Analysis values, settings, helper functions, configuration flags. msg = r""" The plots and latex files produced by link df_summary can be found in dir: {path}
Authors: KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands Redistribution and use in source and binary forms, with or without modification, are permitted according to the terms listed in the file LICENSE. """ from eskapade import ConfigObject, resources, Chain from eskapade import analysis, visualization from eskapade import process_manager from eskapade.logger import Logger logger = Logger() logger.debug('Now parsing configuration file esk305_correlation_summary.') ######################################################################################### # --- minimal analysis information settings = process_manager.service(ConfigObject) settings['analysisName'] = 'esk305_correlation_summary' settings['version'] = 0 ######################################################################################### # --- Analysis values, settings, helper functions, configuration flags. settings['input_path'] = resources.fixture('correlated_data.sv.gz') settings['reader'] = 'csv' settings['separator'] = ' ' settings['correlations'] = [
"""Authors: KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands

Redistribution and use in source and binary forms, with or without
modification, are permitted according to the terms listed in the file LICENSE.
"""

from eskapade import ConfigObject, Chain
from eskapade import core_ops
from eskapade import process_manager
from eskapade.logger import Logger

logger = Logger()
logger.debug('Now parsing configuration file esk104_basic_datastore_operations.')

#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk104_basic_datastore_operations'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

# some dummy information to use in this macro
f = {'hello': 'world', 'v': [3, 1, 4, 1, 5], 'n_favorite': 7}
g = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 'favorite'}
"""modification, are permitted according to the terms listed in the file LICENSE."""

import sys

import ROOT

from eskapade import process_manager, ConfigObject, Chain
from eskapade.logger import Logger
from esroofit.links import WsUtils

logger = Logger()
logger.debug('Now parsing configuration file tutorial_5.')

###############################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'tutorial_5'
settings['version'] = 0

###############################################################################
# - First create, compile and load your pdf model. We can either create it
#   on the fly or load if it has already been created.

pdf_name = 'MyPdf'
pdf_lib_base = pdf_name + '_cxx'
pdf_lib_ext = '.so'
pdf_lib_name = pdf_lib_base + pdf_lib_ext
pdf file. Redistribution and use in source and binary forms, with or without modification, are permitted according to the terms listed in the file LICENSE. """ from eskapade import ConfigObject, Chain from eskapade import analysis, visualization from eskapade import process_manager from escore.core import persistence from eskapade.logger import Logger logger = Logger() logger.debug('Now parsing configuration file esk304_df_boxplot.') ######################################################################################### # --- minimal analysis information settings = process_manager.service(ConfigObject) settings['analysisName'] = 'esk304_df_boxplot' settings['version'] = 0 ######################################################################################### # --- Analysis values, settings, helper functions, configuration flags. msg = r""" The plots and latex files produced by link df_summary can be found in dir: {path}
"""Authors: KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands

Redistribution and use in source and binary forms, with or without
modification, are permitted according to the terms listed in the file LICENSE.
"""

from eskapade import process_manager, ConfigObject, resources, spark_analysis, Chain
from eskapade.logger import Logger
from eskapade.spark_analysis import SparkManager

logger = Logger()
logger.debug('Now parsing configuration file esk602_read_csv_to_spark_df.')

##########################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk602_read_csv_to_spark_df'
settings['version'] = 0

##########################################################################
# --- start Spark session

spark = process_manager.service(SparkManager).create_session(eskapade_settings=settings)

##########################################################################
# --- CSV and data-frame settings
"""Authors: KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands

Redistribution and use in source and binary forms, with or without
modification, are permitted according to the terms listed in the file LICENSE.
"""

from eskapade import ConfigObject, resources, Chain
from eskapade import core_ops, analysis
from eskapade import process_manager
from eskapade.logger import Logger

logger = Logger()
logger.debug('Now parsing configuration file esk207_record_vectorizer')

#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk207_record_vectorizer'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

# --- Set path of data
data_path = resources.fixture('dummy.csv')

#########################################################################################
"""Authors: KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands

Redistribution and use in source and binary forms, with or without
modification, are permitted according to the terms listed in the file LICENSE.
"""

from eskapade import process_manager, ConfigObject, Chain
from eskapade.logger import Logger
from eskapadespark import SparkManager, SparkConfigurator

logger = Logger()
logger.debug('Now parsing configuration file esk601_spark_configuration.')

##########################################################################
# --- logging in Spark
#
# 1) through log4j
#    in $SPARK_HOME/conf/log4j.properties set:
#    log4j.logger.org.apache.spark.api.python.PythonGatewayServer=INFO
#
# 2) through SparkContext
#    in Python code set:
#    process_manager.service(SparkManager).get_session().sparkContext.setLogLevel('INFO')
#
# NB: get a list of loggers through logging.Logger.manager.loggerDict
#
# logging.getLogger('py4j').setLevel('INFO')
"""Authors: KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands

Redistribution and use in source and binary forms, with or without
modification, are permitted according to the terms listed in the file LICENSE.
"""

import pandas as pd

from eskapade import analysis, core_ops, process_manager, resources, ConfigObject, Chain
from eskapade.logger import Logger

logger = Logger()
logger.debug('Now parsing configuration file esk211_fork_read_data_itr')

#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk211_fork_read_data_itr'
settings['version'] = 0

# no need to set this normally, but illustrates how to throttle the number of
# concurrent processes. default is set to number of available cpu cores.
process_manager.num_cpu = 4

#########################################################################################

# when chunking through an input file, pick up only N lines in each iteration.
chunk_size = 5
"""Authors: KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands

Redistribution and use in source and binary forms, with or without
modification, are permitted according to the terms listed in the file LICENSE.
"""

from eskapade import ConfigObject, Chain
from eskapade import core_ops
from eskapade import process_manager
from eskapade.logger import Logger, LogLevel

logger = Logger()
logger.debug('Now parsing configuration file esk101_helloworld')

#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk101_helloworld'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.
#     E.g. define flags to turn on or off certain chains with links.
#     By default all set to false, unless already configured in
#     configobject or vars().
from eskapade import ConfigObject, Chain from eskapade import process_manager from eskapade import root_analysis from eskapade.core import persistence from eskapade.logger import Logger from eskapade.root_analysis import roofit_utils # make sure Eskapade RooFit library is loaded roofit_utils.load_libesroofit() import ROOT from ROOT import RooFit logger = Logger() logger.debug( 'Now parsing configuration file esk411_weibull_predictive_maintenance') ######################################################################################### # --- minimal analysis information settings = process_manager.service(ConfigObject) settings['analysisName'] = 'esk411_weibull_predictive_maintenance' settings['version'] = 0 ######################################################################################### # --- Analysis values, settings, helper functions, configuration flags. msg = r""" The plots and latex report produced by link WsUtils can be found in dir: {path}
"""Authors: KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands

Redistribution and use in source and binary forms, with or without
modification, are permitted according to the terms listed in the file LICENSE.
"""

from eskapade import ConfigObject, Chain
from eskapade import core_ops, visualization, root_analysis
from eskapade import process_manager
from eskapade.logger import Logger

logger = Logger('macro.esk404_workspace_createpdf_simulate_fit_plot')
logger.debug('Now parsing configuration file esk404_workspace_createpdf_simulate_fit_plot')

#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk404_workspace_createpdf_simulate_fit_plot'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

settings['generate_fit_plot'] = True
settings['summary'] = True

#########################################################################################
Authors: KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands Redistribution and use in source and binary forms, with or without modification, are permitted according to the terms listed in the file LICENSE. """ from eskapade import ConfigObject, Chain from eskapade import core_ops from eskapade import process_manager from eskapade.logger import Logger logger = Logger() logger.debug('Now parsing configuration file esk105_datastore_pickling.') ######################################################################################### # --- minimal analysis information settings = process_manager.service(ConfigObject) settings['analysisName'] = 'esk105_datastore_pickling' settings['version'] = 0 ######################################################################################### # --- Analysis values, settings, helper functions, configuration flags. msg = r""" The setup consists of three simple chains that add progressively more information to the datastore. In the examples the datastore gets persisted after the execution of each chain, and can be picked up again as input for the next chain.
from eskapade import ConfigObject, Chain
from eskapade import core_ops, visualization, root_analysis
from eskapade import process_manager
from eskapade.logger import Logger
from eskapade.root_analysis import roofit_utils

# make sure Eskapade RooFit library is loaded
roofit_utils.load_libesroofit()

import ROOT
from ROOT import RooFit

logger = Logger()
logger.debug('Now parsing configuration file esk408_classification_error_propagation_after_fit')

#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk408_classification_error_propagation_after_fit'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

#########################################################################################
# --- now set up the chains and links based on configuration flags

# --- generate pdf, simulate, fit, and plot
Do not forget to clean the results directory when testing. Redistribution and use in source and binary forms, with or without modification, are permitted according to the terms listed in the file LICENSE. """ from pyspark.streaming import StreamingContext from eskapade import process_manager, ConfigObject, DataStore, Chain from escore.core import persistence from eskapade.logger import Logger from eskapadespark import SparkManager, SparkStreamingWordCount, SparkStreamingWriter, SparkStreamingController logger = Logger() logger.debug('Now parsing configuration file esk610_spark_streaming.') ########################################################################## # --- minimal analysis information settings = process_manager.service(ConfigObject) settings['analysisName'] = 'esk610_spark_streaming' settings['version'] = 0 # check command line def check_var(var_name, local_vars=vars(), settings=settings, default=False): """Set setting and return it.""" var_value = default if var_name in local_vars: var_value = settings[var_name] = local_vars[var_name]
"""Redistribution and use in source and binary forms, with or without
modification, are permitted according to the terms listed in the file LICENSE.
"""

from eskapade import ConfigObject, Chain, process_manager
from eskapade import core_ops, analysis
from eskapade.logger import Logger, LogLevel
from esroofit import resources
from esroofit.links import RooDataHistFiller

logger = Logger()
logger.debug('Now parsing configuration file esk402_roodatahist_fill')

#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk402_roodatahist_fill'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

input_files = [resources.fixture('mock_accounts.csv.gz')]

#########################################################################################
# --- now set up the chains and links based on configuration flags
"""Authors: KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands

Redistribution and use in source and binary forms, with or without
modification, are permitted according to the terms listed in the file LICENSE.
"""

from eskapade import process_manager, ConfigObject, DataStore, spark_analysis, Chain
from eskapade.logger import Logger
from eskapade.spark_analysis import SparkManager

logger = Logger()
logger.debug('Now parsing configuration file esk609_map_df_groups')

##########################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk609_map_df_groups'
settings['version'] = 0

##########################################################################
# --- start Spark session

spark = process_manager.service(SparkManager).create_session(eskapade_settings=settings)

##########################################################################
"""KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands

Redistribution and use in source and binary forms, with or without
modification, are permitted according to the terms listed in the file LICENSE.
"""

from pyspark.sql import types, functions

from eskapade import process_manager, ConfigObject, Chain
from eskapade.logger import Logger
from eskapadespark import SparkManager, SparkDfReader, SparkWithColumn, resources

logger = Logger()
logger.debug('Now parsing configuration file esk607_spark_with_column')

##########################################################################
# Minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk607_spark_with_column'
settings['version'] = 0

##########################################################################
# Start Spark session

spark = process_manager.service(SparkManager).create_session(eskapade_settings=settings)

##########################################################################
"""Authors: KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands

Redistribution and use in source and binary forms, with or without
modification, are permitted according to the terms listed in the file LICENSE.
"""

from eskapade import process_manager, ConfigObject, Chain
from eskapade.logger import Logger
from eskapadespark import SparkManager, SparkDfReader, SparkExecuteQuery, resources

logger = Logger()
logger.debug('Now parsing configuration file esk604_spark_execute_query.')

##########################################################################
# Minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk604_spark_execute_query'
settings['version'] = 0

##########################################################################
# Start Spark session

spark = process_manager.service(SparkManager).create_session(eskapade_settings=settings)

##########################################################################
# CSV and dataframe settings
"""datasets in chunks.

Authors: KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands

Redistribution and use in source and binary forms, with or without
modification, are permitted according to the terms listed in the file LICENSE.
"""

from eskapade import analysis, core_ops, process_manager, resources, ConfigObject, Chain
from eskapade.logger import Logger

logger = Logger()
logger.debug('Now parsing configuration file esk209_read_big_data_itr')

#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk209_read_big_data_itr'
settings['version'] = 0

#########################################################################################

# when chunking through an input file, pick up only N lines in each iteration.
chunk_size = 5

#########################################################################################
# --- Set path of data

data_path = resources.fixture('dummy.csv')
KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands Redistribution and use in source and binary forms, with or without modification, are permitted according to the terms listed in the file LICENSE. """ import numpy as np from eskapade import ConfigObject, Chain from eskapade import data_mimic from eskapade import process_manager from eskapade.logger import Logger, LogLevel logger = Logger() logger.debug('Now parsing configuration file esk703_mimic_data') ######################################################################################### # --- minimal analysis information settings = process_manager.service(ConfigObject) settings['analysisName'] = 'esk703_mimic_data' settings['version'] = 0 np.random.seed(42) ch = Chain('DataPrep') ch.logger.log_level = LogLevel.DEBUG sim_data = data_mimic.MixedVariablesSimulation(store_key='df', n_obs=100000, p_unordered=np.array(
from ROOT import RooFit

from eskapade import ConfigObject, Chain, process_manager
from eskapade import core_ops
from eskapade.logger import Logger
from esroofit import roofit_utils
from esroofit.links import WsUtils, PrintWs

# make sure Eskapade RooFit library is loaded
roofit_utils.load_libesroofit()

logger = Logger()
logger.debug('Now parsing configuration file esk407_classification_unbiased_fit_estimate')

#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk407_classification_unbiased_fit_estimate'
settings['version'] = 0

#########################################################################################
# --- now set up the chains and links based on configuration flags

# --- generate pdf, simulate, fit, and plot
ch = Chain('WsOps')
from eskapade.root_analysis import RooFitManager, TruncExpGen, TruncExpFit from eskapade.root_analysis.roofit_models import TruncExponential MODEL_NAME = 'voucher_redeem' REDEEM_DATA_KEY = 'voucher_redeems' AGE_DATA_KEY = 'voucher_ages' MAX_AGE = 1500 # days FAST_REDEEM_RATE = -0.01 # per day SLOW_REDEEM_RATE = -0.001 # per day FAST_FRAC = 0.4 REDEEM_FRAC = 0.6 logger = Logger() logger.debug('Now parsing configuration file esk409_unredeemed_vouchers.') ############################################################################### # --- minimal analysis information settings = process_manager.service(ConfigObject) settings['analysisName'] = 'esk409_unredeemed_vouchers' settings['version'] = 0 ############################################################################### # --- create voucher redeem model # create model if it is not read from persisted services of first chain if not settings.get('beginWithChain'): rfm = process_manager.service(RooFitManager) model = rfm.model(MODEL_NAME,
Redistribution and use in source and binary forms, with or without modification, are permitted according to the terms listed in the file LICENSE. """ import tempfile from eskapade import ConfigObject, Chain from eskapade import core_ops, analysis, data_quality from eskapade import process_manager from eskapade.logger import Logger logger = Logger() logger.debug('Now parsing configuration file esk501_fix_pandas_dataframe') ######################################################################################### # --- minimal analysis information settings = process_manager.service(ConfigObject) settings['analysisName'] = 'esk501_fix_pandas_dataframe' settings['version'] = 0 ######################################################################################### # --- Analysis values, settings, helper functions, configuration flags. # dummy dataframe filled with inconsistent data types per column tmp = b"""A,B,C,D,E,F,G,H True,foo,1.0,1,1,1,a,a