# used below with f.name

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch = Chain('DataPrep')

# --- 0. pandas read_csv has multiple settings to help reading in of buggy CSVs.
#     o the option error_bad_lines=False skips lines with too few or too many values
#     o the option encoding='latin1' interprets most non-standard characters
read_data = analysis.ReadToDf(key='vrh', reader='csv', path=f.name,
                              error_bad_lines=False, encoding='latin1')
ch.add(read_data)

# --- 1. standard setting:
#     o convert all nans to np.nan (= float)
#     o convert all rows in a column to the most frequently occurring datatype in that column
fixer = data_quality.FixPandasDataFrame(name='fixer1')
fixer.read_key = 'vrh'
fixer.store_key = 'vrh_fix1'
ch.add(fixer)

# --- 2. force certain columns to a specified datatype
fixer = data_quality.FixPandasDataFrame(name='fixer2')
fixer.read_key = 'vrh'
fixer.store_key = 'vrh_fix2'
fixer.var_dtype = {'B': int, 'C': str}
ch.add(fixer)
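# For reference, a minimal pandas-only sketch of what the var_dtype setting above
# achieves. The column names 'B' and 'C' match the example; the toy dataframe is
# illustrative only, not part of the original macro.
import pandas as pd

toy = pd.DataFrame({'B': ['1', '2', '3'], 'C': [10, 20, 30]})
toy = toy.astype({'B': int, 'C': str})  # force columns to the requested datatypes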
#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

input_files = [resources.fixture('mock_accounts.csv.gz')]

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch = Chain('Data')

# --- 0. readdata keeps on opening the next file in the file list.
#     all kwargs are passed on to the pandas file reader.
read_data = analysis.ReadToDf(name='dflooper', key='accounts', reader='csv')
read_data.path = input_files
ch.add(read_data)

# --- 1. add the record factorizer to convert categorical observables into integers.
#     Here the columns eyeColor and favoriteFruit of the input dataset are factorized,
#     e.g. x = ['apple', 'tree', 'pear', 'apple', 'pear'] becomes the column:
#     x = [0, 1, 2, 0, 2]
#     By default, the mapping is stored in a dict under key: 'map_'+store_key+'_to_original'
fact = analysis.RecordFactorizer(name='rf1')
fact.columns = ['eyeColor', 'favoriteFruit']  # ['Obs_*']
fact.read_key = read_data.key
fact.inplace = True
# factorizer stores a dict with the mappings that have been applied to all observables
fact.sk_map_to_original = 'to_original'
# factorizer also stores a dict with the mappings back to the original observables
fact.sk_map_to_factorized = 'to_factorized'
ch.add(fact)
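# A minimal pandas sketch of the factorization the link performs; pd.factorize is
# used here for illustration, not necessarily what RecordFactorizer uses internally.
import pandas as pd

codes, uniques = pd.factorize(pd.Series(['apple', 'tree', 'pear', 'apple', 'pear']))
# codes   -> array([0, 1, 2, 0, 2])
# uniques -> Index(['apple', 'tree', 'pear'])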
#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

settings['high_num_dims'] = False

input_files = [resources.fixture('mock_accounts.csv.gz')]

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch = Chain('Data')

# --- 0. read input data
read_data = analysis.ReadToDf(name='dflooper', key='accounts', reader='csv')
read_data.path = input_files
ch.add(read_data)

# --- 1. add the record factorizer.
#     Here the columns isActive, eyeColor, favoriteFruit and gender of the input dataset
#     are factorized, e.g. x = ['apple', 'tree', 'pear', 'apple', 'pear'] becomes the column:
#     x = [0, 1, 2, 0, 2]
#     By default, the mapping is stored in a dict under key: 'map_'+store_key+'_to_original'
fact = analysis.RecordFactorizer(name='rf1')
fact.columns = ['isActive', 'eyeColor', 'favoriteFruit', 'gender']
fact.read_key = 'accounts'
fact.inplace = True
fact.sk_map_to_original = 'to_original'
fact.sk_map_to_factorized = 'to_factorized'
fact.logger.log_level = LogLevel.DEBUG
ch.add(fact)
    'rdd': {},
    'list': {filter_list: dict(min_index=20)},
    'pd': {filter_pd: dict(min_index=20)}
}

# create chain and data-frame-creator links
chain = Chain('Create')
for out_format in process_methods:
    # create data-frame-conversion link
    lnk = spark_analysis.SparkDfConverter(name='df_to_{}_converter'.format(out_format),
                                          read_key='df',
                                          store_key='{}_output'.format(out_format),
                                          schema_key='{}_schema'.format(out_format),
                                          output_format=out_format,
                                          preserve_col_names=False,
                                          process_methods=process_methods[out_format],
                                          process_meth_args=process_meth_args[out_format],
                                          process_meth_kwargs=process_meth_kwargs[out_format])

    # add link to chain
    chain.add(lnk)

##########################################################################

logger.debug('Done parsing configuration file esk606_convert_spark_df.')
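# The filter_list and filter_pd callables referenced above are defined earlier in the
# macro; a plausible sketch of their shape, assuming they drop entries below min_index,
# is given below purely for illustration.
def filter_list(records, min_index):
    """Keep only records whose index field is at least min_index (illustrative)."""
    return [rec for rec in records if rec['index'] >= min_index]

def filter_pd(df, min_index):
    """Keep only dataframe rows whose index column is at least min_index (illustrative)."""
    return df[df['index'] >= min_index]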
# --- example 2: readdata loops over the input files, with file chunking.

if settings['do_loop']:
    ch = Chain('Data')

    # --- a loop is set up in the chain MyChain.
    #     we iterate over (chunks of) the next file in the list until the iterator is done.
    #     then move on to the next chain (Overview)

    # --- readdata keeps on opening the next 400 lines of the open or next file in the file list.
    #     all kwargs are passed on to the pandas file reader.
    read_data = analysis.ReadToDf(name='dflooper', key='rc', reader='csv')
    read_data.chunksize = chunk_size
    read_data.path = input_files
    ch.add(read_data)

    # add conversion functions to the "Data" chain
    # here, convert column 'registered', an integer, to an actual timestamp.
    conv_funcs = [{'func': to_date, 'colin': 'registered', 'colout': 'date'}]
    transform = analysis.ApplyFuncToDf(name='Transform', read_key=read_data.key,
                                       apply_funcs=conv_funcs)
    ch.add(transform)

    # --- as an example, we fill a histogram iteratively over the file loop
    vc = analysis.ValueCounter()
    vc.read_key = 'rc'
    vc.store_key_hists = 'hist'
    vc.logger.log_level = LogLevel.DEBUG
    # columns that are picked up to do value_counting on in the input dataset
    # note: can also be 2-dim: ['isActive', 'age']
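# The to_date helper used in conv_funcs above is defined earlier in the macro; a minimal
# sketch, assuming it is applied element-wise to integer seconds since the Unix epoch:
import pandas as pd

def to_date(x):
    """Convert integer epoch seconds to a pandas timestamp (illustrative sketch)."""
    return pd.Timestamp(x, unit='s')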
f = {'hello': 'world', 'v': [3, 1, 4, 1, 5], 'n_favorite': 7}
g = {'a': 1, 'b': 2, 'c': 3}
h = [2, 7]

#########################################################################################
# --- now set up the chains and links based on configuration flags

#########
# chain 1
ch = Chain('chain1')

# the link ToDsDict adds objects to the datastore at link execution.
link = core_ops.ToDsDict(name='intods_1')
link.store_key = 'f'
link.obj = f
ch.add(link)

# print contents of datastore
link = core_ops.PrintDs()
ch.add(link)

#########
# chain 2
ch = Chain('chain2')

# the link AssertInDs checks the presence
# of certain objects in the datastore
link = core_ops.AssertInDs()
link.keySet = ['f']
ch.add(link)
SIZE = 10000
VAR_LABELS = dict(var_a='Variable A', var_b='Variable B', var_c='Variable C')
VAR_UNITS = dict(var_b='m/s')
GEN_CONF = dict(var_b=dict(mean=42., std=2.), var_c=dict(mean=42, std=2, dtype=int))

#########################################################################################
# --- now set up the chains and links based on configuration flags

data = Chain('Data')

# add data-generator link to "Data" chain
generator = analysis.BasicGenerator(name='Generate_data',
                                    key='data',
                                    columns=COLUMNS,
                                    size=SIZE,
                                    gen_config=GEN_CONF)
data.add(generator)

# add data-frame summary link to "Summary" chain
# can provide labels and units for the variables in the dataset
summary = Chain('Summary')
summarizer = visualization.DfSummary(name='Create_stats_overview',
                                     read_key=generator.key,
                                     var_labels=VAR_LABELS,
                                     var_units=VAR_UNITS)
summary.add(summarizer)

#########################################################################################

logger.debug('Done parsing configuration file esk301_dfsummary_plotter')
                var_c=dict(choice=['delta', 'epsilon', 'zeta', 'eta'], dtype=str))

#########################################################################################
# --- now set up the chains and links based on configuration flags

# create chains
data = Chain('Data')

# add data-generator link to "Data" chain
generator = analysis.BasicGenerator(name='Generate_data',
                                    key='data',
                                    columns=COLUMNS,
                                    size=SIZE,
                                    gen_config=GEN_CONF)
data.add(generator)

# add data-frame boxplot link to "BoxPlot" chain
# can provide labels and units for the variables in the dataset,
# and set the statistics to print in the output file
plot = Chain('BoxPlot')
box_plot = visualization.DfBoxplot(name='Create_stats_overview',
                                   read_key=generator.key,
                                   statistics=['count', 'mean', 'min', 'max', 'std'],
                                   var_labels=VAR_LABELS,
                                   var_units=VAR_UNITS,
                                   column='var_b',
                                   cause_columns=['var_a', 'var_c'],
                                   results_path=persistence.io_path('results_data', 'report'))
plot.add(box_plot)
]

#########################################################################################
# --- now set up the chains and links based on configuration flags

# create chains
data = Chain('Data')

# load data
reader = analysis.ReadToDf(name='reader',
                           path=settings['input_path'],
                           sep=settings['separator'],
                           key='input_data',
                           reader=settings['reader'])
data.add(reader)

# make visualizations of correlations
summary = Chain('Summary')
corr_link = visualization.CorrelationSummary(name='correlation_summary',
                                             read_key='input_data',
                                             store_key='correlations',
                                             methods=settings['correlations'])
summary.add(corr_link)

#########################################################################################

logger.debug('Done parsing configuration file esk305_correlation_summary.')
# --- now set up the chains and links based on configuration flags

# create read link
read_link = spark_analysis.SparkDfReader(name='Reader',
                                         store_key='spark_df',
                                         read_methods=['csv'])

# set CSV read arguments
read_link.read_meth_args['csv'] = (file_paths,)
read_link.read_meth_kwargs['csv'] = dict(sep=separator,
                                         header=has_header,
                                         inferSchema=infer_schema)

if columns:
    # add select function
    read_link.read_methods.append('select')
    read_link.read_meth_args['select'] = tuple(columns)

if num_partitions:
    # add repartition function
    read_link.read_methods.append('repartition')
    read_link.read_meth_args['repartition'] = (num_partitions,)

# add link to chain
read = Chain('Read')
read.add(read_link)

##########################################################################

logger.debug('Done parsing configuration file esk602_read_csv_to_spark_df.')
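# For orientation, the chained read methods configured above correspond roughly to the
# following plain PySpark calls; 'spark' is assumed to be an active SparkSession.
df = spark.read.csv(file_paths, sep=separator, header=has_header, inferSchema=infer_schema)
if columns:
    df = df.select(*columns)
if num_partitions:
    df = df.repartition(num_partitions)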
# --- 1. define a model by passing strings to the RooWorkspace factory.
#     For the workspace factory syntax, see:
#     https://root.cern.ch/doc/master/RooFactoryWSTool_8cxx_source.html#l00722
#     For RooWorkspace factory examples see:
#     https://root.cern.ch/root/html/tutorials/roofit/rf511_wsfactory_basic.C.html
#     https://root.cern.ch/root/html/tutorials/roofit/rf512_wsfactory_oper.C.html
#     https://root.cern.ch/root/html/tutorials/roofit/rf513_wsfactory_tools.C.html
#     Here we use the pdf class we just created (MyPdfV3), with observable y and parameters A and B,
#     with ranges (-10,10), (0,100) and (-10,10) respectively. The starting values of A and B are
#     10 and 2 respectively.
wsu = WsUtils(name='modeller')
wsu.factory = ['{pdf}::testpdf(y[-10,10],A[10,0,100],B[2,-10,10])'.format(pdf=pdf_name)]
ch.add(wsu)

# --- 2. simulation: 400 records of observable 'y' with pdf 'testpdf' (of type MyPdfV3).
#     the simulated data is stored in the datastore under key 'simdata'
wsu = WsUtils(name='simulater')
wsu.add_simulate(pdf='testpdf', obs='y', num=400, key='simdata')
ch.add(wsu)

# --- 3. fit: perform fit of pdf 'testpdf' to dataset 'simdata'.
#     store the fit result object in the datastore under key 'fit_result'.
#     The fit knows from the input dataset that the observable is y, and that
#     the fit parameters are A and B.
wsu = WsUtils(name='fitter')
wsu.pages_key = 'report_pages'
wsu.add_fit(pdf='testpdf', data='simdata', key='fit_result')
ch.add(wsu)
# --- Analysis values, settings, helper functions, configuration flags.

# --- Set path of data
data_path = resources.fixture('dummy.csv')

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch1 = Chain('MyChain1')

# --- read dummy dataset
read_data = analysis.ReadToDf(key='test1', sep='|', reader='csv', path=data_path)
ch1.add(read_data)

# --- print contents of the datastore
pds = core_ops.PrintDs(name='printer1')
pds.keys = ['test1']
ch1.add(pds)

# --- add the record vectorizer.
#     Here the columns x and y of the input dataset are vectorized,
#     e.g. x=1 becomes the column: x_1 = True
vectorizer = analysis.RecordVectorizer()
vectorizer.columns = ['x', 'y']
vectorizer.read_key = 'test1'
vectorizer.store_key = 'vect_test'
vectorizer.astype = int
ch1.add(vectorizer)
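# A minimal pandas sketch of the vectorization the link performs; pd.get_dummies is
# used here for illustration, not necessarily what RecordVectorizer uses internally.
import pandas as pd

toy = pd.DataFrame({'x': [1, 2, 1], 'y': [3, 3, 4]})
vect = pd.get_dummies(toy, columns=['x', 'y']).astype(int)
# resulting columns: x_1, x_2, y_3, y_4, each with 0/1 entries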
# --- METHOD 1: configuration file
spark = sm.create_session(eskapade_settings=settings)
sc = spark.sparkContext

logger.info('---> METHOD 1: configuration file')
logger.info(str(sc.getConf().getAll()))

##########################################################################
# --- METHOD 2: link

conf_link = SparkConfigurator(name='SparkConfigurator', log_level='WARN')
conf_link.spark_settings = [('spark.app.name', settings['analysisName'] + '_link'),
                            ('spark.master', 'local[42]'),
                            ('spark.driver.host', '127.0.0.1')]

config = Chain('Config')
config.add(conf_link)

logger.info('---> METHOD 2: link')
logger.info('NB: settings will be printed at time of link execution.')

##########################################################################
# --- the running spark session will be stopped automatically at the end

###########################################################################
# --- the end

logger.debug('Done parsing configuration file esk601_spark_configuration.')
settings['do_example'] = True

#########################################################################################
# --- now set up the chains and links, based on configuration flags

# --- example loops over the first chain 10 times.
if settings['do_example']:
    # --- a loop is set up in the chain MyChain.
    #     we iterate over the chain until the link RepeatChain is done.
    #     then move on to the next chain (Overview)
    ch = Chain('MyChain')

    link = core_ops.HelloWorld(name='HelloWorld')
    link.logger.log_level = LogLevel.DEBUG
    ch.add(link)

    # --- this link sends out a signal to repeat the execution of the chain.
    #     It serves as the 'continue' statement of the loop:
    #     go back to the start of the chain until the counter reaches 10.
    repeater = core_ops.RepeatChain()
    # repeat a maximum of 10 times
    repeater.maxcount = 10
    repeater.logger.log_level = LogLevel.DEBUG
    ch.add(repeater)

# --- print contents of the datastore,
#     which in this case is empty.
overview = Chain('Overview')
pds = core_ops.PrintDs(name='End')
overview.add(pds)
# --- generate pdf

ch = Chain('Model')

# --- 1. define a model
wsu = root_analysis.WsUtils(name='modeller')
factory = [
    "RooWeibull::wb1(t[0,110000000],a1[0.93,0,2],b1[2.2e-4,0,1e-3])",
    "RooWeibull::wb2(t,a2[0.61,0,2],b2[1.1e-5,0,1e-3])",
    "RooWeibull::wb3(t,a3[0.43,0,2],b3[4.7e-7,0,1e-3])",
    "RooWeibull::wb4(t,a4[0.43,0,2],b4[2.2e-7,0,1e-3])",
    "SUM::sum2pdf(N1[580000,0,2e6]*wb1,N2[895000,0,2e6]*wb2)",
    "SUM::sum3pdf(N1[580000,0,2e6]*wb1,N2[895000,0,2e6]*wb2,N3[150500,0,2e6]*wb3)",
    "SUM::sum4pdf(N1[580000,0,2e6]*wb1,N2[895000,0,2e6]*wb2,N3[150500,0,2e6]*wb3,N4[1e5,0,2e6]*wb4)"
]
wsu.factory += factory
ch.add(wsu)

if settings['generate']:
    # --- generate pdf
    ch = Chain('Generate')
    wsu = root_analysis.WsUtils(name='generator')
    wsu.add_simulate(pdf=fitpdf, obs='t', num=1625500, key='rds')
    ch.add(wsu)

# # --- example of how to import a roodataset from a root file
# if settings['read_data']:
#     ch = Chain('Data')
#     read_data = root_analysis.ReadFromRootFile()
#     read_data.path = '/opt/eskapade/data/tsv_renamed_data.root'
#     read_data.keys = ['rds']
#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk101_helloworld'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

# E.g. define flags to turn on or off certain chains with links.
# By default all are set to False, unless already configured in
# the ConfigObject or vars().
settings['do_hello'] = True
settings['n_repeat'] = 2

#########################################################################################
# --- now set up the chains and links based on configuration flags

if settings['do_hello']:
    hello = Chain(name='Hello')
    link = core_ops.HelloWorld(name='HelloWorld')
    link.logger.log_level = LogLevel.DEBUG
    link.repeat = settings['n_repeat']
    hello.add(link)

#########################################################################################

logger.debug('Done parsing configuration file esk101_helloworld')
input_files = resources.fixture('correlated_data.sv.gz')

#########################################################################################
# --- now set up the chains and links based on configuration flags

data = Chain('Data')

# --- 0. readdata keeps on opening the next file in the file list.
#     all kwargs are passed on to the pandas file reader.
read_data = analysis.ReadToDf(name='dflooper', key='accounts', reader='csv', sep=' ')
read_data.path = input_files
data.add(read_data)

# --- 1. add data-frame summary link
summarizer = visualization.DfSummary(name='Create_stats_overview',
                                     read_key=read_data.key,
                                     pages_key='report_pages')
data.add(summarizer)

# --- 2. fill 2-dim histogrammar histograms
hf = analysis.HistogrammarFiller()
hf.read_key = 'accounts'
hf.store_key = 'hist'
hf.logger.log_level = LogLevel.DEBUG
hf.columns = [['x1', 'x2'], ['x1', 'x3'], ['x1', 'x4'], ['x1', 'x5'],
              ['x2', 'x3'], ['x2', 'x4'], ['x2', 'x5'],
              ['x3', 'x4'], ['x3', 'x5'], ['x4', 'x5']]
# --- generate pdf, simulate, fit, and plot

ch = Chain('WsOps')

# 1. simulate output score of machine learning classifier
wsu = root_analysis.WsUtils(name='DataSimulator')
wsu.factory = ["RooGaussian::high_risk(score[0,1],1,0.15)",
               "RooPolynomial::low_risk(score,{-0.3,-0.3})",
               "SUM::model(frac[0.1,0.,1.]*high_risk,low_risk)"]
wsu.add_simulate(pdf='model', obs='score', num=500, key='data', into_ws=True)
wsu.add_fit(pdf='model', data='data', key='fit_result', into_ws=True)
wsu.add_plot(obs='score', data='data', pdf='model', key='simplot')
wsu.add_plot(obs='score', pdf='model',
             pdf_args=(RooFit.Components('low_risk'),
                       RooFit.LineColor(ROOT.kRed),
                       RooFit.LineStyle(ROOT.kDashed)),
             output_file='data_with_generator_model.pdf', key='simplot')
ch.add(wsu)

ch = Chain('SignalPValue')

# 2. plot signal probability
wsu = root_analysis.WsUtils(name='SignalProbability')
wsu.factory = ["expr::high_risk_pvalue('@0*@1/@2',{frac,high_risk,model})"]
wsu.add_plot(obs='score', func='high_risk_pvalue',
             func_args=(RooFit.MoveToBack(),),
             func_kwargs={'VisualizeError': 'fit_result'},
             key='ratio_plot')
wsu.add_plot(obs='score', func='high_risk_pvalue',
             output_file='high_risk_probability.pdf', key='ratio_plot')
ch.add(wsu)

# 3. calculate p-values and uncertainties thereon
ape = root_analysis.AddPropagatedErrorToRooDataSet()
input_files = [resources.fixture('correlated_data.sv.gz')]

#########################################################################################
# --- now set up the chains and links based on configuration flags

if settings['read_data']:
    ch = Chain('Data')

    # --- 0. read the input dataset
    read_data = analysis.ReadToDf(name='reader', key='correlated_data', reader='csv', sep=' ')
    read_data.path = input_files
    ch.add(read_data)

    # --- 1. convert into a roofit dataset (roodataset);
    #     build a KEYS pdf out of the dataset as well
    df2rds = root_analysis.ConvertDataFrame2RooDataSet()
    df2rds.read_key = read_data.key
    df2rds.store_key = 'rds_' + read_data.key
    df2rds.store_key_vars = 'keys_varset'
    df2rds.columns = ['x2', 'x3', 'x4'] if settings['high_num_dims'] else ['x2', 'x3']
    df2rds.store_index = False
    # build a KEYS pdf out of the roodataset, used for simulation below
    df2rds.create_keys_pdf = 'keys_Ndim'
    ch.add(df2rds)

    pds = core_ops.PrintDs(name='pds1')
#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

input_files = [resources.fixture('mock_accounts.csv.gz')]

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch = Chain('Data')

# --- 0. readdata keeps on opening the next file in the file list.
#     all kwargs are passed on to the pandas file reader.
read_data = analysis.ReadToDf(name='dflooper', key='accounts', reader='csv')
read_data.path = input_files
# read_data.itr_over_files = True
ch.add(read_data)

# --- 1. add the record factorizer to convert categorical observables into integers.
#     Here the columns isActive, eyeColor, favoriteFruit and gender of the input dataset
#     are factorized, e.g. x = ['apple', 'tree', 'pear', 'apple', 'pear'] becomes the column:
#     x = [0, 1, 2, 0, 2]
#     By default, the mapping is stored in a dict under key: 'map_'+store_key+'_to_original'
fact = analysis.RecordFactorizer(name='rf1')
fact.columns = ['isActive', 'eyeColor', 'favoriteFruit', 'gender']
fact.read_key = 'accounts'
fact.inplace = True
# factorizer stores a dict with the mappings that have been applied to all observables
fact.sk_map_to_original = 'to_original'
# factorizer also stores a dict with the mappings back to the original observables
fact.sk_map_to_factorized = 'to_factorized'
fact.logger.log_level = LogLevel.DEBUG
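# After execution the two mapping dicts can be picked up from the datastore; their shape
# is sketched below with made-up values, purely for illustration.
# ds = process_manager.service(DataStore)
# ds['to_original']   -> {'eyeColor': {0: 'blue', 1: 'brown', 2: 'green'}, ...}
# ds['to_factorized'] -> {'eyeColor': {'blue': 0, 'brown': 1, 'green': 2}, ...}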
elif stream_type == 'tcp':
    ds['dstream'] = ssc.socketTextStream('localhost', 9999)
else:
    logger.error('unsupported stream_type specified: {type}.', type=stream_type)

##########################################################################
# --- now set up the chains and links based on configuration flags

spark_streaming = Chain('SparkStreaming')

# the word count example
wordcount_link = SparkStreamingWordCount(name='SparkStreamingWordCount',
                                         read_key='dstream', store_key='wordcounts')
spark_streaming.add(wordcount_link)

# store output
writer_link = SparkStreamingWriter(name='SparkStreamingWriter',
                                   read_key=wordcount_link.store_key,
                                   output_path='file:' + persistence.io_path('results_data', '/dstream/wordcount'),
                                   suffix='txt',
                                   repartition=1)
spark_streaming.add(writer_link)

# start/stop of Spark Streaming
control_link = SparkStreamingController(name='SparkStreamingController', timeout=10)
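# For reference, the classic DStream word count that a link like SparkStreamingWordCount
# presumably wraps looks as follows; this is the standard Spark Streaming recipe, not
# necessarily the link's exact implementation.
counts = (ds['dstream']
          .flatMap(lambda line: line.split(' '))   # split lines into words
          .map(lambda word: (word, 1))             # pair each word with a count of 1
          .reduceByKey(lambda a, b: a + b))        # sum counts per word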
read = Chain('Read')

# create read link for each data file
read_link = SparkDfReader(name='ReadFile', store_key='spark_df', read_methods=['csv'])

# set CSV read arguments
read_link.read_meth_args['csv'] = (file_path,)
read_link.read_meth_kwargs['csv'] = dict(sep='|', header=True, inferSchema=True)

# add link to chain
read.add(read_link)

# create link to create new column
col_link = SparkWithColumn(name='UdfPower', read_key=read_link.store_key,
                           store_key='new_spark_df')
# power of two columns
col_link.new_col = functions.pow(functions.col('x'), functions.col('y'))
col_link.new_col_name = 'pow_xy1'

# add link to chain
add_col = Chain('AddColumn')
add_col.add(col_link)

##########################################################################
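# The SparkWithColumn link above is roughly equivalent to the following plain PySpark
# call, assuming 'df' is the dataframe stored under read_link.store_key.
from pyspark.sql import functions

new_df = df.withColumn('pow_xy1', functions.pow(functions.col('x'), functions.col('y')))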
rows = [(it, 'foo{:d}'.format(it), (it + 1) / 2.) for it in range(100)]
ds['df'] = spark.createDataFrame(rows, schema=['index', 'foo', 'bar'])

##########################################################################
# --- now set up the chains and links based on configuration flags

# create chain
chain = Chain('Map')

# create a link to convert the data frame into an RDD
conv_lnk = spark_analysis.SparkDfConverter(name='DfConverter',
                                           read_key='df',
                                           store_key='rdd',
                                           output_format='rdd',
                                           preserve_col_names=True)
chain.add(conv_lnk)

# create a link to calculate the sum of "bar" for each group of ten rows
map_lnk = spark_analysis.RddGroupMapper(name='Mapper',
                                        read_key='rdd',
                                        store_key='map_rdd',
                                        group_map=sum,
                                        input_map=lambda r: (r['index'] // 10, r['bar']),
                                        flatten_output_groups=False)
chain.add(map_lnk)

# create a link to add a column with the sum of "bar" for each group of ten rows
flmap_lnk = spark_analysis.RddGroupMapper(name='FlatMapper',
                                          read_key='rdd',
                                          store_key='flat_map_rdd',
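# A plausible sketch of what the Mapper link computes at execution time, expressed as
# plain RDD operations on the converted RDD; this illustrates the semantics of
# input_map and group_map, not the link's internals.
sums = (ds['rdd']
        .map(lambda r: (r['index'] // 10, r['bar']))  # input_map: (group key, value)
        .groupByKey()
        .mapValues(sum))                              # group_map: sum values per group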
ch = Chain('MyChain1')

# --- a loop is set up in the chain MyChain.
#     we iterate over (chunks of) the next file in the list until the iterator is done.
#     then move on to the next chain (Overview)

# --- readdata keeps on opening the next file in the file list.
#     all kwargs are passed on to the pandas file reader.
read_data = analysis.ReadToDf(name='dflooper1', key='test1', sep='|', reader='csv',
                              usecols=['x', 'y'])
read_data.path = [data_path] * 3
read_data.itr_over_files = True
ch.add(read_data)

# --- this serves as the break statement of this loop.
#     if dataset test1 is empty, which can happen for the very last dataset opened by readdata,
#     then skip the rest of this chain.
skipper = core_ops.SkipChainIfEmpty()
skipper.collection_set = ['test1']
skipper.check_at_initialize = False
skipper.check_at_execute = True
ch.add(skipper)

# --- do something useful with the test dataset here ...
#     e.g. apply selections, or collect into histograms.

# --- this serves as the continue statement of the loop: go back to the start of the chain.
#     the repeater listens to readdata to see if there are any more datasets coming; if so,
#     it continues the loop.
##########################################################################
# --- now set up the chains and links based on configuration flags

read = Chain('Read')

# create read link for each data file
for index, key in enumerate(STORE_KEYS):
    read_link = SparkDfReader(name='Reader' + str(index + 1),
                              store_key=key,
                              read_methods=['csv'])

    # set CSV read arguments
    read_link.read_meth_args['csv'] = (file_paths[index],)
    read_link.read_meth_kwargs['csv'] = dict(sep='|', header=True, inferSchema=True)

    # add link to chain
    read.add(read_link)

# create SQL-query link
sql_link = SparkExecuteQuery(name='SparkSQL', store_key='spark_df_sql')

# define SQL query to apply to one or more objects in the DataStore
sql_link.query = 'SELECT loc, sum(x) as sumx, sum(y) as sumy ' \
                 'FROM (SELECT * FROM {0:s} UNION ALL SELECT * FROM {1:s}) t ' \
                 'WHERE t.x < 5 ' \
                 'GROUP BY loc'.format(STORE_KEYS[0], STORE_KEYS[1])

# add link to chain
sql = Chain('ApplySQL')
sql.add(sql_link)

##########################################################################
                 into_ws=True)
wsu.add_simulate(pdf='low_risk', obs='score', num=1000, key='unbiased_low_risk_testdata',
                 into_ws=True)
wsu.add_simulate(pdf='model', obs='score', num=1000, key='data', into_ws=True)
wsu.add_plot(obs='score', data='data', pdf='model', key='simplot')
wsu.add_plot(obs='score', pdf='model',
             pdf_args=(RooFit.Components('low_risk'),
                       RooFit.LineColor(ROOT.kRed),
                       RooFit.LineStyle(ROOT.kDashed)),
             output_file='data_with_generator_model.pdf', key='simplot')
ch.add(wsu)

# 2a. turn data into roofit histograms
wsu = WsUtils(name='HistMaker')

def make_histograms(w):
    """Make histograms of the low- and high-risk test data."""
    # Needs to be imported here as well, otherwise this throws: name 'ROOT' is not defined.
    import ROOT  # noqa
    from esroofit.decorators.roofit import ws_put
    w.var('score').setBins(40)
    high_risk_hist = ROOT.RooDataHist('high_risk_hist', 'high_risk_hist',
                                      ROOT.RooArgSet(w.var('score')),
                                      w.data('unbiased_high_risk_testdata'))
    low_risk_hist = ROOT.RooDataHist('low_risk_hist', 'low_risk_hist',
settings['analysisName'] = 'esk703_mimic_data'
settings['version'] = 0

np.random.seed(42)

ch = Chain('DataPrep')
ch.logger.log_level = LogLevel.DEBUG

sim_data = data_mimic.MixedVariablesSimulation(store_key='df',
                                               n_obs=100000,
                                               p_unordered=np.array([[0.2, 0.2, 0.3, 0.3],
                                                                     [0.3, 0.7]]))
sim_data.logger.log_level = LogLevel.DEBUG
ch.add(sim_data)

pre_data = data_mimic.KDEPreparation(read_key='df',
                                     data_store_key='data',
                                     data_smoothed_store_key='data_smoothed',
                                     data_no_nans_store_key='data_no_nans',
                                     data_normalized_store_key='data_normalized',
                                     maps_store_key='maps',
                                     qts_store_key='qts',
                                     new_column_order_store_key='new_column_order',
                                     ids_store_key='ids',
                                     unordered_categorical_columns=['a', 'b'],
                                     string_columns=['a', 'b'],
                                     count=1,
                                     extremes_fraction=0.15,
# --- example 2: readdata loops over the input files, with file chunking.

if settings.get('do_example2', True):
    ch = Chain('MyChain2')
    ch.n_fork = 10

    # --- a loop is set up in the chain MyChain.
    #     we iterate over (chunks of) the next file in the list until the iterator is done.
    #     then move on to the next chain (Overview)

    # --- readdata keeps on opening the next 4 lines of the open or next file in the file list.
    #     all kwargs are passed on to the pandas file reader.
    read_data = analysis.ReadToDf(name='dflooper2', key='test2', sep='|', reader='csv',
                                  usecols=['x', 'y'], chunksize=chunk_size)
    read_data.path = [data_path] * 3
    ch.add(read_data)

    # --- do something useful with the test dataset here ...
    #     e.g. apply selections, or collect into histograms.

    # query_set = selections that are applied to the incoming records;
    # after the selections, only the surviving rows are kept.
    link = analysis.ApplySelectionToDf(read_key='test2', store_key='reduced_data',
                                       query_set=['x>1'])
    # Any other kwargs given to ApplySelectionToDf are passed on to the
    # pandas query() function.
    ch.add(link)

    dc = core_ops.ForkDataCollector()
    dc.keys = [{'key_ds': link.store_key, 'func': pd.concat}]
    ch.add(dc)
model.build_model()
model.var.SetTitle('Redeem age')
model.max_var.SetTitle('Age')
model.var.setUnit('days')
model.max_var.setUnit('days')

###############################################################################
# --- create chain for generating voucher redeem data

ch = Chain('Generation')
gen_link = TruncExpGen(name='Generate',
                       store_key=REDEEM_DATA_KEY,
                       max_var_data_key=AGE_DATA_KEY,
                       model_name=MODEL_NAME,
                       event_frac=REDEEM_FRAC)
ch.add(gen_link)

np.random.seed(settings['seeds']['NumPy'])
ROOT.RooRandom.randomGenerator().SetSeed(settings['seeds']['RooFit'])

###############################################################################
# --- create chain for fitting voucher redeem model to generated data

ch = Chain('Fitting')
fit_link = TruncExpFit(name='Fit',
                       read_key=gen_link.store_key,
                       max_var_data_key=gen_link.max_var_data_key,
                       model_name=gen_link.model_name)
ch.add(fit_link)

###############################################################################
ds = process_manager.service(DataStore)
ds['incoming_records'] = df

#########################################################################################
# --- Here we apply example selections to a dataframe picked up from the datastore.

data_prep = Chain('DataPrep')

# query_set = selections that are applied to incoming_records;
# after the selections, only keep the columns in select_columns ('a', 'c').
link = analysis.ApplySelectionToDf(read_key='incoming_records',
                                   store_key='outgoing_records',
                                   query_set=['a>0', 'c<b'],
                                   select_columns=['a', 'c'])
# Any other kwargs given to ApplySelectionToDf are passed on to the
# pandas query() function.
link.logger.log_level = LogLevel.DEBUG
data_prep.add(link)

link = core_ops.DsObjectDeleter()
link.deletion_keys = ['incoming_records']
data_prep.add(link)

link = core_ops.PrintDs()
link.keys = ['n_outgoing_records', 'outgoing_records']
data_prep.add(link)

#########################################################################################

logger.debug('Done parsing configuration file esk204_apply_query_to_pandas_df')
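# A minimal pandas-only sketch of what the selection link does; df is the same dataframe
# stored under 'incoming_records' above, and the result mirrors 'outgoing_records'.
outgoing = df.query('a>0').query('c<b')[['a', 'c']]
n_outgoing = len(outgoing)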