Code example #1
# used below with f.name

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch = Chain('DataPrep')

# --- 0. pandas read_csv has multiple settings to help read in buggy CSVs
#     (a plain-pandas sketch of these options follows this block):
#     o The option error_bad_lines=False skips malformed lines with too many values
#     o The option encoding='latin1' interprets most non-standard characters
read_data = analysis.ReadToDf(key='vrh',
                              reader='csv',
                              path=f.name,
                              error_bad_lines=False,
                              encoding='latin1')
ch.add(read_data)
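
# --- Hedged aside (plain pandas, not part of the original macro): the two read_csv
#     options mentioned above can be tried on a small buggy CSV directly. Note that
#     error_bad_lines was replaced by on_bad_lines='skip' in newer pandas versions;
#     the older keyword is kept here to match the macro.
import io
import pandas as pd

# two deliberate problems: a row with too many values and a non-UTF-8 byte (0xe9)
raw = b'A,B\n1,2\n3,4,5\ncaf\xe9,6\n'
demo_df = pd.read_csv(io.BytesIO(raw), error_bad_lines=False, encoding='latin1')
print(demo_df)  # the malformed row is skipped; 0xe9 is decoded as 'é'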

# --- 1. standard setting:
#     o convert all nans to np.nan (= float)
#     o convert all entries in a column to the most occurring datatype in that column
fixer = data_quality.FixPandasDataFrame(name='fixer1')
fixer.read_key = 'vrh'
fixer.store_key = 'vrh_fix1'
ch.add(fixer)

# --- 2. force certain columns to specified datatype
fixer = data_quality.FixPandasDataFrame(name='fixer2')
fixer.read_key = 'vrh'
fixer.store_key = 'vrh_fix2'
fixer.var_dtype = {'B': int, 'C': str}
ch.add(fixer)
Code example #2
#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

input_files = [resources.fixture('mock_accounts.csv.gz')]

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch = Chain('Data')

# --- 0. readdata keeps on opening the next file in the file list.
#     all kwargs are passed on to pandas file reader.
read_data = analysis.ReadToDf(name='dflooper', key='accounts', reader='csv')
read_data.path = input_files
ch.add(read_data)

# --- 1. add the record factorizer to convert categorical observables into integers
#     Here the columns eyeColor and favoriteFruit of the input dataset are factorized
#     e.g. x = ['apple', 'tree', 'pear', 'apple', 'pear'] becomes the column:
#     x = [0, 1, 2, 0, 2]
#     By default, the mapping is stored in a dict under key: 'map_'+store_key+'_to_original'
fact = analysis.RecordFactorizer(name='rf1')
fact.columns = ['eyeColor', 'favoriteFruit']  # ['Obs_*']
fact.read_key = read_data.key
fact.inplace = True
# factorizer stores a dict with the mappings that have been applied to all observables
fact.sk_map_to_original = 'to_original'
# factorizer also stores a dict with the mappings back to the original observables
fact.sk_map_to_factorized = 'to_factorized'
ch.add(fact)
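
# --- Hedged illustration (not part of the original macro): the two keys set above end up
#     in the datastore as nested per-column mapping dicts. A small stand-alone pandas
#     example of mapping a factorized column back to its original values:
import pandas as pd

to_original = {'eyeColor': {0: 'blue', 1: 'brown', 2: 'green'}}  # illustrative content only
factorized = pd.Series([0, 2, 1, 0], name='eyeColor')            # illustrative factorized column
print(factorized.map(to_original['eyeColor']).tolist())          # ['blue', 'green', 'brown', 'blue']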
Code example #3
#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

settings['high_num_dims'] = False

input_files = [resources.fixture('mock_accounts.csv.gz')]

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch = Chain('Data')

# --- 0. read input data
read_data = analysis.ReadToDf(name='dflooper', key='accounts', reader='csv')
read_data.path = input_files
ch.add(read_data)

# --- 1. add the record factorizer
#     Here the columns isActive, eyeColor, favoriteFruit and gender of the input dataset are factorized
#     e.g. x = ['apple', 'tree', 'pear', 'apple', 'pear'] becomes the column:
#     x = [0, 1, 2, 0, 2]
#     By default, the mapping is stored in a dict under key: 'map_'+store_key+'_to_original'
fact = analysis.RecordFactorizer(name='rf1')
fact.columns = ['isActive', 'eyeColor', 'favoriteFruit', 'gender']
fact.read_key = 'accounts'
fact.inplace = True
fact.sk_map_to_original = 'to_original'
fact.sk_map_to_factorized = 'to_factorized'
fact.logger.log_level = LogLevel.DEBUG
ch.add(fact)
Code example #4
    'rdd': {},
    'list': {
        filter_list: dict(min_index=20)
    },
    'pd': {
        filter_pd: dict(min_index=20)
    }
}

# create chain and data-frame-creator links
chain = Chain('Create')
for out_format in process_methods:
    # create data-frame-conversion link
    lnk = spark_analysis.SparkDfConverter(
        name='df_to_{}_converter'.format(out_format),
        read_key='df',
        store_key='{}_output'.format(out_format),
        schema_key='{}_schema'.format(out_format),
        output_format=out_format,
        preserve_col_names=False,
        process_methods=process_methods[out_format],
        process_meth_args=process_meth_args[out_format],
        process_meth_kwargs=process_meth_kwargs[out_format])

    # add link to chain
    chain.add(lnk)

##########################################################################

logger.debug('Done parsing configuration file esk606_convert_spark_df.')
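
# --- Hedged sketch: filter_list and filter_pd are referenced in the (truncated) settings
#     above as post-processing methods for the converted output, but their definitions are
#     not shown on this page. Purely illustrative versions that keep only records with a
#     sufficiently large 'index' value could look like this:
def filter_list(records, min_index):
    """Keep records whose 'index' entry is at least min_index (list-style output)."""
    return [rec for rec in records if rec['index'] >= min_index]


def filter_pd(df, min_index):
    """Keep rows whose 'index' column is at least min_index (pandas output)."""
    return df[df['index'] >= min_index]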
Code example #5
# --- example 2: readdata loops over the input files, with file chunking.

if settings['do_loop']:
    ch = Chain('Data')

    # --- a loop is set up in the chain MyChain.
    #     we iterate over (chunks of) the next file in the list until the iterator is done.
    #     then move on to the next chain (Overview)

    # --- readdata keeps on reading the next 400 lines of the currently open (or next) file in the file list.
    #     all kwargs are passed on to pandas file reader.
    read_data = analysis.ReadToDf(name='dflooper', key='rc', reader='csv')
    read_data.chunksize = chunk_size
    read_data.path = input_files
    ch.add(read_data)

    # add conversion functions to "Data" chain
    # here, convert column 'registered', an integer, to an actual timestamp.
    conv_funcs = [{'func': to_date, 'colin': 'registered', 'colout': 'date'}]
    transform = analysis.ApplyFuncToDf(name='Transform', read_key=read_data.key,
                                       apply_funcs=conv_funcs)
    ch.add(transform)

    # --- As an example, will fill histogram iteratively over the file loop
    vc = analysis.ValueCounter()
    vc.read_key = 'rc'
    vc.store_key_hists = 'hist'
    vc.logger.log_level = LogLevel.DEBUG
    # columns of the input dataset that are picked up for value counting
    # note: can also be 2-dim: ['isActive','age']
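    # --- Hedged continuation (the original snippet stops here): the value counter still
    #     needs its columns set and the link added to the chain, for example:
    vc.columns = ['isActive', 'age']  # hypothetical choice, based on the note above
    ch.add(vc)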
Code example #6
f = {'hello': 'world', 'v': [3, 1, 4, 1, 5], 'n_favorite': 7}
g = {'a': 1, 'b': 2, 'c': 3}
h = [2, 7]

#########################################################################################
# --- now set up the chains and links based on configuration flags

#########
# chain 1
ch = Chain('chain1')

# the link ToDsDict adds objects to the datastore at link execution.
link = core_ops.ToDsDict(name='intods_1')
link.store_key = 'f'
link.obj = f
ch.add(link)

# print contents of datastore
link = core_ops.PrintDs()
ch.add(link)

#########
# chain 2
ch = Chain('chain2')

# the link AssertInDs checks the presence
# of certain objects in the datastore
link = core_ops.AssertInDs()
link.keySet = ['f']
ch.add(link)
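
#########
# Hedged sketch (not part of the original snippet): the objects g and h defined above
# could be stored in the datastore in the same way, e.g. in a third chain:
ch = Chain('chain3')
for key, obj in (('g', g), ('h', h)):
    link = core_ops.ToDsDict(name='intods_' + key)
    link.store_key = key
    link.obj = obj
    ch.add(link)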
Code example #7
SIZE = 10000
VAR_LABELS = dict(var_a='Variable A', var_b='Variable B', var_c='Variable C')
VAR_UNITS = dict(var_b='m/s')
GEN_CONF = dict(var_b=dict(mean=42., std=2.),
                var_c=dict(mean=42, std=2, dtype=int))

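# --- Hedged addition: COLUMNS is used below but is defined in the truncated part of
#     this macro; a definition consistent with the labels above would be:
COLUMNS = list(VAR_LABELS.keys())  # assumed: ['var_a', 'var_b', 'var_c']
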
#########################################################################################
# --- now set up the chains and links based on configuration flags

data = Chain('Data')
# add data-generator link to "Data" chain
generator = analysis.BasicGenerator(name='Generate_data',
                                    key='data',
                                    columns=COLUMNS,
                                    size=SIZE,
                                    gen_config=GEN_CONF)
data.add(generator)

# add data-frame summary link to "Summary" chain
# can provide labels and units for the variables in the dataset
summary = Chain('Summary')
summarizer = visualization.DfSummary(name='Create_stats_overview',
                                     read_key=generator.key,
                                     var_labels=VAR_LABELS,
                                     var_units=VAR_UNITS)
summary.add(summarizer)

#########################################################################################

logger.debug('Done parsing configuration file esk301_dfsummary_plotter')
Code example #8
                var_c=dict(choice=['delta', 'epsilon', 'zeta', 'eta'],
                           dtype=str))

#########################################################################################
# --- now set up the chains and links based on configuration flags

# create chains
data = Chain('Data')

# add data-generator link to "Data" chain
generator = analysis.BasicGenerator(name='Generate_data',
                                    key='data',
                                    columns=COLUMNS,
                                    size=SIZE,
                                    gen_config=GEN_CONF)
data.add(generator)

# add data-frame summary link to "Boxplot" chain
# can provide labels and units for the variables in the dataset, and set the statistics to print in the output file
plot = Chain('BoxPlot')
box_plot = visualization.DfBoxplot(
    name='Create_stats_overview',
    read_key=generator.key,
    statistics=['count', 'mean', 'min', 'max', 'std'],
    var_labels=VAR_LABELS,
    var_units=VAR_UNITS,
    column='var_b',
    cause_columns=['var_a', 'var_c'],
    results_path=persistence.io_path('results_data', 'report'))
plot.add(box_plot)
Code example #9
]

#########################################################################################
# --- now set up the chains and links based on configuration flags

# create chains
data = Chain('Data')

# load data
reader = analysis.ReadToDf(name='reader',
                           path=settings['input_path'],
                           sep=settings['separator'],
                           key='input_data',
                           reader=settings['reader'])

data.add(reader)

# make visualizations of correlations
summary = Chain('Summary')

corr_link = visualization.CorrelationSummary(name='correlation_summary',
                                             read_key='input_data',
                                             store_key='correlations',
                                             methods=settings['correlations'])

summary.add(corr_link)

#########################################################################################

logger.debug('Done parsing configuration file esk305_correlation_summary.')
Code example #10
# --- now set up the chains and links based on configuration flags

# create read link
read_link = spark_analysis.SparkDfReader(name='Reader',
                                         store_key='spark_df',
                                         read_methods=['csv'])

# set CSV read arguments
read_link.read_meth_args['csv'] = (file_paths,)
read_link.read_meth_kwargs['csv'] = dict(sep=separator,
                                         header=has_header,
                                         inferSchema=infer_schema)

if columns:
    # add select function
    read_link.read_methods.append('select')
    read_link.read_meth_args['select'] = tuple(columns)

if num_partitions:
    # add repartition function
    read_link.read_methods.append('repartition')
    read_link.read_meth_args['repartition'] = (num_partitions,)

# add link to chain
read = Chain('Read')
read.add(read_link)

##########################################################################

logger.debug('Done parsing configuration file esk602_read_csv_to_spark_df.')
Code example #11
# --- 1. define a model by passing strings to the RooWorkspace factory
#     For the workspace factory syntax, see:
#     https://root.cern.ch/doc/master/RooFactoryWSTool_8cxx_source.html#l00722
#     For RooWorkspace factory examples, see:
#     https://root.cern.ch/root/html/tutorials/roofit/rf511_wsfactory_basic.C.html
#     https://root.cern.ch/root/html/tutorials/roofit/rf512_wsfactory_oper.C.html
#     https://root.cern.ch/root/html/tutorials/roofit/rf513_wsfactory_tools.C.html
#     Here we use the pdf class we just created (MyPdfV3), with observable y and parameters A and B,
#     with ranges (-10,10), (0,100) and (-10,10) respectively. The starting values of A and B are
#     10 and 2 respectively.
wsu = WsUtils(name='modeller')
wsu.factory = [
    '{pdf}::testpdf(y[-10,10],A[10,0,100],B[2,-10,10])'.format(pdf=pdf_name)
]
ch.add(wsu)

# --- 2. simulation: 400 records of observable 'y' with pdf 'testpdf' (of type MyPdfV3).
#        the simulated data is stored in the datastore under key 'simdata'
wsu = WsUtils(name='simulater')
wsu.add_simulate(pdf='testpdf', obs='y', num=400, key='simdata')
ch.add(wsu)

# --- 3. fit: perform fit of pdf 'testpdf' to dataset 'simdata'.
#        store the fit result object in the datastore under key 'fit_result'
#        The fit knows from the input dataset that the observable is y, and that
#        the fit parameters are A and B.
wsu = WsUtils(name='fitter')
wsu.pages_key = 'report_pages'
wsu.add_fit(pdf='testpdf', data='simdata', key='fit_result')
ch.add(wsu)
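
# --- Hedged sketch (not part of the original macro): once the macro has executed, the
#     RooFitResult stored under 'fit_result' can be inspected from the datastore, e.g.
#     in an interactive session:
from eskapade import process_manager, DataStore

ds = process_manager.service(DataStore)
fit_result = ds.get('fit_result')
if fit_result is not None:
    fit_result.Print('v')  # verbose printout of the fitted parameters A and B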
Code example #12
# --- Analysis values, settings, helper functions, configuration flags.

# --- Set path of data
data_path = resources.fixture('dummy.csv')

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch1 = Chain('MyChain1')

# --- read dummy dataset
read_data = analysis.ReadToDf(key='test1',
                              sep='|',
                              reader='csv',
                              path=data_path)
ch1.add(read_data)

# --- print contents of the datastore
pds = core_ops.PrintDs(name='printer1')
pds.keys = ['test1']
ch1.add(pds)

# --- add the record vectorizer
#     Here the columns x and y of the input dataset are vectorized
#     e.g. x=1 becomes the column: x_1 = True
vectorizer = analysis.RecordVectorizer()
vectorizer.columns = ['x', 'y']
vectorizer.read_key = 'test1'
vectorizer.store_key = 'vect_test'
vectorizer.astype = int
ch1.add(vectorizer)
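
# --- Hedged aside (plain pandas, not part of the original macro): the vectorization
#     described above is analogous to one-hot encoding with pandas.get_dummies
#     (an analogy only, not the link's actual implementation):
import pandas as pd

demo = pd.DataFrame({'x': [1, 2, 1], 'y': [3, 3, 4]})
print(pd.get_dummies(demo, columns=['x', 'y']).astype(int))
# columns become x_1, x_2, y_3, y_4 with 0/1 entries, cf. vectorizer.astype = int above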
Code example #13
# --- METHOD 1: configuration file

spark = sm.create_session(eskapade_settings=settings)
sc = spark.sparkContext

logger.info('---> METHOD 1: configuration file')
logger.info(str(sc.getConf().getAll()))

##########################################################################
# --- METHOD 2: link

conf_link = SparkConfigurator(name='SparkConfigurator', log_level='WARN')
conf_link.spark_settings = [('spark.app.name',
                             settings['analysisName'] + '_link'),
                            ('spark.master', 'local[42]'),
                            ('spark.driver.host', '127.0.0.1')]

config = Chain('Config')
config.add(conf_link)

logger.info('---> METHOD 2: link')
logger.info('NB: settings will be printed at time of link execution.')

##########################################################################
# --- the running Spark session will be stopped automatically at the end

###########################################################################
# --- the end

logger.debug('Done parsing configuration file esk601_spark_configuration.')
Code example #14
settings['do_example'] = True

#########################################################################################
# --- now set up the chains and links, based on configuration flags

# --- example loops over the first chain 10 times.

if settings['do_example']:
    # --- a loop is set up in the chain MyChain.
    #     we iterate over the chain until the link RepeatChain is done.
    #     then move on to the next chain (Overview)
    ch = Chain('MyChain')

    link = core_ops.HelloWorld(name='HelloWorld')
    link.logger.log_level = LogLevel.DEBUG
    ch.add(link)

    # --- this link sends out a signal to repeat the execution of the chain.
    #     It serves as the 'continue' statement of the loop.
    #     go back to start of the chain until counter reaches 10.
    repeater = core_ops.RepeatChain()
    # repeat max of 10 times
    repeater.maxcount = 10
    repeater.logger.log_level = LogLevel.DEBUG
    ch.add(repeater)

# --- print contents of the datastore.
#    which in this case is empty.
overview = Chain('Overview')
pds = core_ops.PrintDs(name='End')
overview.add(pds)
Code example #15
    # --- generate pdf
    ch = Chain('Model')

    # --- 1. define a model
    wsu = root_analysis.WsUtils(name='modeller')
    factory = [
        "RooWeibull::wb1(t[0,110000000],a1[0.93,0,2],b1[2.2e-4,0,1e-3])",
        "RooWeibull::wb2(t,a2[0.61,0,2],b2[1.1e-5,0,1e-3])",
        "RooWeibull::wb3(t,a3[0.43,0,2],b3[4.7e-7,0,1e-3])",
        "RooWeibull::wb4(t,a4[0.43,0,2],b4[2.2e-7,0,1e-3])",
        "SUM::sum2pdf(N1[580000,0,2e6]*wb1,N2[895000,0,2e6]*wb2)",
        "SUM::sum3pdf(N1[580000,0,2e6]*wb1,N2[895000,0,2e6]*wb2,N3[150500,0,2e6]*wb3)",
        "SUM::sum4pdf(N1[580000,0,2e6]*wb1,N2[895000,0,2e6]*wb2,N3[150500,0,2e6]*wb3,N4[1e5,0,2e6]*wb4)"
    ]
    wsu.factory += factory
    ch.add(wsu)

if settings['generate']:
    # --- generate pdf
    ch = Chain('Generate')

    wsu = root_analysis.WsUtils(name='generator')
    wsu.add_simulate(pdf=fitpdf, obs='t', num=1625500, key='rds')
    ch.add(wsu)

# # --- example of how to import a roodataset from a root file
# if settings['read_data']:
#     ch = Chain('Data')
#     read_data = root_analysis.ReadFromRootFile()
#     read_data.path = '/opt/eskapade/data/tsv_renamed_data.root'
#     read_data.keys = ['rds']
Code example #16
File: esk101_helloworld.py Project: evoloji/Eskapade
#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk101_helloworld'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

#     E.g. define flags to turn on or off certain chains with links.
#     By default all are set to False, unless already configured in the
#     ConfigObject or vars()

settings['do_hello'] = True
settings['n_repeat'] = 2

#########################################################################################
# --- now set up the chains and links based on configuration flags

if settings['do_hello']:
    hello = Chain(name='Hello')
    link = core_ops.HelloWorld(name='HelloWorld')
    link.logger.log_level = LogLevel.DEBUG
    link.repeat = settings['n_repeat']
    hello.add(link)

#########################################################################################

logger.debug('Done parsing configuration file esk101_helloworld')
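
# --- Hedged note (not shown in the snippet above): a macro like this one typically starts
#     with imports along these lines; the exact module layout depends on the Eskapade version.
from eskapade import process_manager, ConfigObject, Chain, core_ops
from eskapade.logger import Logger, LogLevel

logger = Logger()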
Code example #17
input_files = resources.fixture('correlated_data.sv.gz')

#########################################################################################
# --- now set up the chains and links based on configuration flags

data = Chain('Data')

# --- 0. readdata keeps on opening the next file in the file list.
#     all kwargs are passed on to pandas file reader.
read_data = analysis.ReadToDf(name='dflooper',
                              key='accounts',
                              reader='csv',
                              sep=' ')
read_data.path = input_files
data.add(read_data)

# --- 1. add data-frame summary link to "Summary" chain
summarizer = visualization.DfSummary(name='Create_stats_overview',
                                     read_key=read_data.key,
                                     pages_key='report_pages')
data.add(summarizer)

# --- 2. Fill 2d histogrammar histograms
hf = analysis.HistogrammarFiller()
hf.read_key = 'accounts'
hf.store_key = 'hist'
hf.logger.log_level = LogLevel.DEBUG
hf.columns = [['x1', 'x2'], ['x1', 'x3'], ['x1', 'x4'], ['x1', 'x5'],
              ['x2', 'x3'], ['x2', 'x4'], ['x2', 'x5'], ['x3', 'x4'],
              ['x3', 'x5'], ['x4', 'x5']]
Code example #18
# --- generate pdf, simulate, fit, and plot
ch = Chain('WsOps')

# 1. simulate output score of machine learning classifier
wsu = root_analysis.WsUtils(name='DataSimulator')
wsu.factory = ["RooGaussian::high_risk(score[0,1],1,0.15)",
               "RooPolynomial::low_risk(score,{-0.3,-0.3})",
               "SUM::model(frac[0.1,0.,1.]*high_risk,low_risk)"]
wsu.add_simulate(pdf='model', obs='score', num=500, key='data', into_ws=True)
wsu.add_fit(pdf='model', data='data', key='fit_result', into_ws=True)
wsu.add_plot(obs='score', data='data', pdf='model', key='simplot')
wsu.add_plot(obs='score', pdf='model',
             pdf_args=(RooFit.Components('low_risk'), RooFit.LineColor(ROOT.kRed),
                       RooFit.LineStyle(ROOT.kDashed)),
             output_file='data_with_generator_model.pdf', key='simplot')
ch.add(wsu)

ch = Chain('SignalPValue')

# 2. plot signal probability
wsu = root_analysis.WsUtils(name='SignalProbability')
wsu.factory = ["expr::high_risk_pvalue('@0*@1/@2',{frac,high_risk,model})"]
wsu.add_plot(obs='score', func='high_risk_pvalue',
             func_args=(RooFit.MoveToBack(),),
             func_kwargs={'VisualizeError': 'fit_result'},
             key='ratio_plot')
wsu.add_plot(obs='score', func='high_risk_pvalue', output_file='high_risk_probability.pdf', key='ratio_plot')
ch.add(wsu)

# 3. calculate p-values and uncertainties thereon
ape = root_analysis.AddPropagatedErrorToRooDataSet()
Code example #19
input_files = [resources.fixture('correlated_data.sv.gz')]

#########################################################################################
# --- now set up the chains and links based on configuration flags

if settings['read_data']:
    ch = Chain('Data')

    # --- 0. read the input dataset
    read_data = analysis.ReadToDf(name='reader',
                                  key='correlated_data',
                                  reader='csv',
                                  sep=' ')
    read_data.path = input_files
    ch.add(read_data)

    # --- 1. convert into a roofit dataset (roodataset)
    #        build a KEYS pdf out of the dataset as well
    df2rds = root_analysis.ConvertDataFrame2RooDataSet()
    df2rds.read_key = read_data.key
    df2rds.store_key = 'rds_' + read_data.key
    df2rds.store_key_vars = 'keys_varset'
    df2rds.columns = ['x2', 'x3', 'x4'] if settings['high_num_dims'] else ['x2', 'x3']
    df2rds.store_index = False
    # build a KEYS pdf out of the roodataset, used for simulation below
    df2rds.create_keys_pdf = 'keys_Ndim'
    ch.add(df2rds)

    pds = core_ops.PrintDs(name='pds1')
Code example #20
#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

input_files = [resources.fixture('mock_accounts.csv.gz')]

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch = Chain('Data')

# --- 0. readdata keeps on opening the next file in the file list.
#     all kwargs are passed on to pandas file reader.
read_data = analysis.ReadToDf(name='dflooper', key='accounts', reader='csv')
read_data.path = input_files
# readdata.itr_over_files = True
ch.add(read_data)

# --- 1. add the record factorizer to convert categorical observables into integers
#     Here the columns isActive, eyeColor, favoriteFruit and gender of the input dataset are factorized
#     e.g. x = ['apple', 'tree', 'pear', 'apple', 'pear'] becomes the column:
#     x = [0, 1, 2, 0, 2]
#     By default, the mapping is stored in a dict under key: 'map_'+store_key+'_to_original'
fact = analysis.RecordFactorizer(name='rf1')
fact.columns = ['isActive', 'eyeColor', 'favoriteFruit', 'gender']
fact.read_key = 'accounts'
fact.inplace = True
# factorizer stores a dict with the mappings that have been applied to all observables
fact.sk_map_to_original = 'to_original'
# factorizer also stores a dict with the mappings back to the original observables
fact.sk_map_to_factorized = 'to_factorized'
fact.logger.log_level = LogLevel.DEBUG
Code example #21
elif stream_type == 'tcp':
    ds['dstream'] = ssc.socketTextStream('localhost', 9999)
else:
    logger.error('unsupported stream_type specified: {type}.',
                 type=stream_type)

##########################################################################
# --- now set up the chains and links based on configuration flags

spark_streaming = Chain('SparkStreaming')

# the word count example
wordcount_link = SparkStreamingWordCount(name='SparkStreamingWordCount',
                                         read_key='dstream',
                                         store_key='wordcounts')
spark_streaming.add(wordcount_link)

# store output
writer_link = SparkStreamingWriter(
    name='SparkStreamingWriter',
    read_key=wordcount_link.store_key,
    output_path='file:' +
    persistence.io_path('results_data', '/dstream/wordcount'),
    suffix='txt',
    repartition=1)

spark_streaming.add(writer_link)

# start/stop of Spark Streaming
control_link = SparkStreamingController(name='SparkStreamingController',
                                        timeout=10)
Code example #22
read = Chain('Read')

# create read link for each data file
read_link = SparkDfReader(name='ReadFile',
                          store_key='spark_df',
                          read_methods=['csv'])

# set CSV read arguments
read_link.read_meth_args['csv'] = (file_path, )
read_link.read_meth_kwargs['csv'] = dict(sep='|',
                                         header=True,
                                         inferSchema=True)

# add link to chain
read.add(read_link)

# create link to create new column
col_link = SparkWithColumn(name='UdfPower',
                           read_key=read_link.store_key,
                           store_key='new_spark_df')

# Power of two columns
col_link.new_col = functions.pow(functions.col('x'), functions.col('y'))
col_link.new_col_name = 'pow_xy1'

# add link to chain
add_col = Chain('AddColumn')
add_col.add(col_link)
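
# --- Hedged aside (plain PySpark, not part of the original macro): the SparkWithColumn link
#     above adds a column built from a Spark column expression; applied directly to a
#     dataframe, the same expression would look like this:
from pyspark.sql import SparkSession, functions

spark = SparkSession.builder.master('local[1]').appName('withcolumn_sketch').getOrCreate()
sdf = spark.createDataFrame([(2.0, 3.0), (3.0, 2.0)], ['x', 'y'])
sdf.withColumn('pow_xy1', functions.pow(functions.col('x'), functions.col('y'))).show()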

##########################################################################
Code example #23
rows = [(it, 'foo{:d}'.format(it), (it + 1) / 2.) for it in range(100)]
ds['df'] = spark.createDataFrame(rows, schema=['index', 'foo', 'bar'])

##########################################################################
# --- now set up the chains and links based on configuration flags

# create chain
chain = Chain('Map')

# create a link to convert the data frame into an RDD
conv_lnk = spark_analysis.SparkDfConverter(name='DfConverter',
                                           read_key='df',
                                           store_key='rdd',
                                           output_format='rdd',
                                           preserve_col_names=True)
chain.add(conv_lnk)

# create a link to calculate the sum of "bar" for each group of ten rows
map_lnk = spark_analysis.RddGroupMapper(name='Mapper',
                                        read_key='rdd',
                                        store_key='map_rdd',
                                        group_map=sum,
                                        input_map=lambda r: (r['index'] // 10, r['bar']),
                                        flatten_output_groups=False)
chain.add(map_lnk)

# create a link to add a column with the sum of "bar" for each group of ten rows
flmap_lnk = spark_analysis.RddGroupMapper(name='FlatMapper',
                                          read_key='rdd',
                                          store_key='flat_map_rdd',
Code example #24
    ch = Chain('MyChain1')

    # --- a loop is set up in the chain MyChain.
    #     we iterate over (chunks of) the next file in the list until the iterator is done.
    #     then move on to the next chain (Overview)

    # --- readdata keeps on opening the next file in the file list.
    #     all kwargs are passed on to pandas file reader.
    read_data = analysis.ReadToDf(name='dflooper1',
                                  key='test1',
                                  sep='|',
                                  reader='csv',
                                  usecols=['x', 'y'])
    read_data.path = [data_path] * 3
    read_data.itr_over_files = True
    ch.add(read_data)

    # --- this serves as the break statement of this loop:
    #     if the dataset 'test1' is empty, which can happen for the very last dataset
    #     produced by readdata, then skip the rest of this chain.
    skipper = core_ops.SkipChainIfEmpty()
    skipper.collection_set = ['test1']
    skipper.check_at_initialize = False
    skipper.check_at_execute = True
    ch.add(skipper)

    # --- do something useful with the test dataset here ...
    #     e.g. apply selections, or collect into histograms.

    # --- this serves as the continue statement of the loop. go back to start of the chain.
    #     the repeater checks with readdata whether any more datasets are coming; if so, continue the loop.
Code example #25
##########################################################################
# Now set up the chains and links based on configuration flags

read = Chain('Read')

# create read link for each data file
for index, key in enumerate(STORE_KEYS):
    read_link = SparkDfReader(name='Reader' + str(index + 1), store_key=key, read_methods=['csv'])

    # set CSV read arguments
    read_link.read_meth_args['csv'] = (file_paths[index],)
    read_link.read_meth_kwargs['csv'] = dict(sep='|', header=True, inferSchema=True)

    # add link to chain
    read.add(read_link)

# create SQL-query link
sql_link = SparkExecuteQuery(name='SparkSQL', store_key='spark_df_sql')

# define SQL-query to apply to one or more objects in the DataStore
sql_link.query = 'SELECT loc, sum(x) as sumx, sum(y) as sumy ' \
                 'FROM (SELECT * FROM {0:s} UNION ALL SELECT * FROM {1:s}) t ' \
                 'WHERE t.x < 5 ' \
                 'GROUP BY loc'.format(STORE_KEYS[0], STORE_KEYS[1])

# add link to chain
sql = Chain('ApplySQL')
sql.add(sql_link)
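
# --- Hedged aside (plain PySpark, not part of the original macro): conceptually, the link
#     above makes the dataframes in the datastore queryable by name and runs spark.sql on
#     them. A stand-alone illustration of the same query pattern:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[1]').appName('sql_sketch').getOrCreate()
df1 = spark.createDataFrame([(1, 2.0, 'a'), (7, 3.0, 'b')], ['x', 'y', 'loc'])
df2 = spark.createDataFrame([(2, 4.0, 'a'), (9, 5.0, 'b')], ['x', 'y', 'loc'])
df1.createOrReplaceTempView('records1')
df2.createOrReplaceTempView('records2')
spark.sql('SELECT loc, sum(x) AS sumx, sum(y) AS sumy '
          'FROM (SELECT * FROM records1 UNION ALL SELECT * FROM records2) t '
          'WHERE t.x < 5 GROUP BY loc').show()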

##########################################################################
Code example #26
                 into_ws=True)
wsu.add_simulate(pdf='low_risk',
                 obs='score',
                 num=1000,
                 key='unbiased_low_risk_testdata',
                 into_ws=True)
wsu.add_simulate(pdf='model', obs='score', num=1000, key='data', into_ws=True)
wsu.add_plot(obs='score', data='data', pdf='model', key='simplot')
wsu.add_plot(obs='score',
             pdf='model',
             pdf_args=(RooFit.Components('low_risk'),
                       RooFit.LineColor(ROOT.kRed),
                       RooFit.LineStyle(ROOT.kDashed)),
             output_file='data_with_generator_model.pdf',
             key='simplot')
ch.add(wsu)

# 2a. turn data into roofit histograms
wsu = WsUtils(name='HistMaker')


def make_histograms(w):
    """Make histogram."""
    # Needs to be imported here as well, otherwise this throws: name 'ROOT' is not defined.
    import ROOT  # noqa
    from esroofit.decorators.roofit import ws_put
    w.var('score').setBins(40)
    high_risk_hist = ROOT.RooDataHist('high_risk_hist', 'high_risk_hist',
                                      ROOT.RooArgSet(w.var('score')),
                                      w.data('unbiased_high_risk_testdata'))
    low_risk_hist = ROOT.RooDataHist('low_risk_hist', 'low_risk_hist',
Code example #27
settings['analysisName'] = 'esk703_mimic_data'
settings['version'] = 0

np.random.seed(42)

ch = Chain('DataPrep')
ch.logger.log_level = LogLevel.DEBUG

sim_data = data_mimic.MixedVariablesSimulation(store_key='df',
                                               n_obs=100000,
                                               p_unordered=np.array(
                                                   [[0.2, 0.2, 0.3, 0.3],
                                                    [0.3, 0.7]]))

sim_data.logger.log_level = LogLevel.DEBUG
ch.add(sim_data)

pre_data = data_mimic.KDEPreparation(
    read_key='df',
    data_store_key='data',
    data_smoothed_store_key='data_smoothed',
    data_no_nans_store_key='data_no_nans',
    data_normalized_store_key='data_normalized',
    maps_store_key='maps',
    qts_store_key='qts',
    new_column_order_store_key='new_column_order',
    ids_store_key='ids',
    unordered_categorical_columns=['a', 'b'],
    string_columns=['a', 'b'],
    count=1,
    extremes_fraction=0.15,
Code example #28
# --- example 2: readdata loops over the input files, with file chunking.

if settings.get('do_example2', True):
    ch = Chain('MyChain2')
    ch.n_fork = 10

    # --- a loop is set up in the chain MyChain.
    #     we iterate over (chunks of) the next file in the list until the iterator is done.
    #     then move on to the next chain (Overview)

    # --- readdata keeps on reading the next 4 lines of the currently open (or next) file in the file list.
    #     all kwargs are passed on to pandas file reader.
    read_data = analysis.ReadToDf(name='dflooper2', key='test2', sep='|', reader='csv', usecols=['x', 'y'],
                                  chunksize=chunk_size)
    read_data.path = [data_path] * 3
    ch.add(read_data)

    # --- do something useful with the test dataset here ...
    #     e.g. apply selections, or collect into histograms.

    # query_set = selections that are applied to the incoming dataset ('test2')
    # after the selections, optionally keep only the columns listed in select_columns
    link = analysis.ApplySelectionToDf(read_key='test2', store_key='reduced_data', query_set=['x>1'])
    # Any other kwargs given to ApplySelectionToDf are passed on to the
    # pandas query() function.
    ch.add(link)

    dc = core_ops.ForkDataCollector()
    dc.keys = [{'key_ds': link.store_key, 'func': pd.concat}]
    ch.add(dc)
Code example #29
    model.build_model()
    model.var.SetTitle('Redeem age')
    model.max_var.SetTitle('Age')
    model.var.setUnit('days')
    model.max_var.setUnit('days')

###############################################################################
# --- create chain for generating voucher redeem data

ch = Chain('Generation')
gen_link = TruncExpGen(name='Generate',
                       store_key=REDEEM_DATA_KEY,
                       max_var_data_key=AGE_DATA_KEY,
                       model_name=MODEL_NAME,
                       event_frac=REDEEM_FRAC)
ch.add(gen_link)

np.random.seed(settings['seeds']['NumPy'])
ROOT.RooRandom.randomGenerator().SetSeed(settings['seeds']['RooFit'])

###############################################################################
# --- create chain for fitting voucher redeem model to generated data

ch = Chain('Fitting')
fit_link = TruncExpFit(name='Fit',
                       read_key=gen_link.store_key,
                       max_var_data_key=gen_link.max_var_data_key,
                       model_name=gen_link.model_name)
ch.add(fit_link)

###############################################################################
Code example #30
ds = process_manager.service(DataStore)
ds['incoming_records'] = df

#########################################################################################
# --- Here we apply example selections to a dataframe picked up from the datastore.

data_prep = Chain('DataPrep')

# query_set = selections that are applied to incoming_records
# after the selections, only keep the columns in select_columns ('a', 'c')
link = analysis.ApplySelectionToDf(read_key='incoming_records',
                                   store_key='outgoing_records',
                                   query_set=['a>0', 'c<b'],
                                   select_columns=['a', 'c'])
# Any other kwargs given to ApplySelectionToDf are passed on to the
# pandas query() function.
link.logger.log_level = LogLevel.DEBUG
data_prep.add(link)
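
# --- Hedged aside (plain pandas, not part of the original macro): the selection above
#     boils down to chained pandas query() calls followed by a column selection, roughly:
import pandas as pd

example_df = pd.DataFrame({'a': [1, -1, 2], 'b': [5, 5, 1], 'c': [2, 2, 3]})
outgoing = example_df.query('a>0').query('c<b')[['a', 'c']]
print(outgoing)  # keeps only the rows passing both selections, and only columns 'a' and 'c'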

link = core_ops.DsObjectDeleter()
link.deletion_keys = ['incoming_records']
data_prep.add(link)

link = core_ops.PrintDs()
link.keys = ['n_outgoing_records', 'outgoing_records']
data_prep.add(link)

#########################################################################################

logger.debug('Done parsing configuration file esk204_apply_query_to_pandas_df')