Example #1
#     E.g. define flags to turn on or off certain chains with links.
#     by default all are set to False, unless already configured in the
#     configobject or vars()

# turn on/off the 2 examples
settings['do_example1'] = True
settings['do_example2'] = True

#########################################################################################
# --- Set path of data
data_path = resources.fixture('dummy.csv')

#########################################################################################
# --- now set up the chains and links, based on configuration flags

# --- example 1: readdata with one input file
if settings['do_example1']:
    ch1 = Chain('MyChain1')
    read_data = analysis.ReadToDf(key='test1', sep='|', reader='csv', path=data_path)
    ch1.add(read_data)

    # --- do something useful with the test dataset here ...

# --- example 2: readdata with default settings reads all three input files simultaneously.
#                all extra keyword arguments are passed on to the pandas reader.
if settings['do_example2']:
    ch2 = Chain('MyChain2')

    # --- a loop is set up in the chain MyChain2.
    #     we iterate over (chunks of) the next file in the list until the iterator is done,
    #     then move on to the next chain (Overview)

    # --- readdata keeps on opening the next file in the file list.
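    # --- a minimal, illustrative sketch of the loop described above: ReadToDf with a list of
    #     input paths keeps opening the next file until the list is exhausted.
    #     The link name, key and the triple file list are assumptions, mirroring Example #15 below.
    read_data2 = analysis.ReadToDf(name='dflooper1', key='test2', sep='|', reader='csv')
    read_data2.path = [data_path] * 3
    ch2.add(read_data2)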
#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk410_testing_correlations_between_categories'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

input_files = [resources.fixture('mock_accounts.csv.gz')]

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch = Chain('Data')

# --- 0. readdata keeps on opening the next file in the file list.
#     all kwargs are passed on to pandas file reader.
read_data = analysis.ReadToDf(name='dflooper', key='accounts', reader='csv')
read_data.path = input_files
ch.add(read_data)

# --- 1. add the record factorizer to convert categorical observables into integers
#     Here the columns eyeColor and favoriteFruit of the input dataset are factorized
#     e.g. x = ['apple', 'tree', 'pear', 'apple', 'pear'] becomes the column:
#     x = [0, 1, 2, 0, 2]
#     By default, the mapping is stored in a dict under key: 'map_'+store_key+'_to_original'
fact = analysis.RecordFactorizer(name='rf1')
fact.columns = ['eyeColor', 'favoriteFruit']  # ['Obs_*']
fact.read_key = read_data.key
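# --- assumed completion of the factorizer setup, mirroring the esk405 example below:
#     factorize the columns in place and add the link to the chain.
fact.inplace = True
ch.add(fact)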
settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk405_simulation_based_on_binned_data'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

settings['high_num_dims'] = False

input_files = [resources.fixture('mock_accounts.csv.gz')]

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch = Chain('Data')

# --- 0. read input data
read_data = analysis.ReadToDf(name='dflooper', key='accounts', reader='csv')
read_data.path = input_files
ch.add(read_data)

# --- 1. add the record factorizer
#     Here the columns isActive, eyeColor, favoriteFruit and gender of the input dataset are factorized
#     e.g. x = ['apple', 'tree', 'pear', 'apple', 'pear'] becomes the column:
#     x = [0, 1, 2, 0, 2]
#     By default, the mapping is stored in a dict under key: 'map_'+store_key+'_to_original'
fact = analysis.RecordFactorizer(name='rf1')
fact.columns = ['isActive', 'eyeColor', 'favoriteFruit', 'gender']
fact.read_key = 'accounts'
fact.inplace = True
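# assumed completion: the configured factorizer link still has to be added to the chain
ch.add(fact)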
# define data stream
ds = process_manager.service(DataStore)

if not stream_type or stream_type == 'file':
    ds['dstream'] = ssc.textFileStream('/tmp/eskapade_stream_test/')
elif stream_type == 'tcp':
    ds['dstream'] = ssc.socketTextStream('localhost', 9999)
else:
    logger.error('unsupported stream_type specified: {type}.',
                 type=stream_type)

##########################################################################
# --- now set up the chains and links based on configuration flags

spark_streaming = Chain('SparkStreaming')

# the word count example
wordcount_link = SparkStreamingWordCount(name='SparkStreamingWordCount',
                                         read_key='dstream',
                                         store_key='wordcounts')
spark_streaming.add(wordcount_link)

# store output
writer_link = SparkStreamingWriter(
    name='SparkStreamingWriter',
    read_key=wordcount_link.store_key,
    output_path='file:' +
    persistence.io_path('results_data', '/dstream/wordcount'),
    suffix='txt',
    repartition=1)
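# assumed completion: add the writer link to the SparkStreaming chain
spark_streaming.add(writer_link)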
Example #5
def add_group_sum(rows):  # assumed header; the original function name is cut off in this snippet
    """Append the group sum of 'bar' to each row."""
    sum_bar = sum(r['bar'] for r in rows)
    return [r + (sum_bar, ) for r in rows]


##########################################################################
# --- input data

ds = process_manager.service(DataStore)
rows = [(it, 'foo{:d}'.format(it), (it + 1) / 2.) for it in range(100)]
ds['df'] = spark.createDataFrame(rows, schema=['index', 'foo', 'bar'])

##########################################################################
# --- now set up the chains and links based on configuration flags

# create chain
chain = Chain('Map')

# create a link to convert the data frame into an RDD
conv_lnk = spark_analysis.SparkDfConverter(name='DfConverter',
                                           read_key='df',
                                           store_key='rdd',
                                           output_format='rdd',
                                           preserve_col_names=True)
chain.add(conv_lnk)

# create a link to calculate the sum of "bar" for each group of ten rows
map_lnk = spark_analysis.RddGroupMapper(name='Mapper',
                                        read_key='rdd',
                                        store_key='map_rdd',
                                        group_map=sum,
                                        input_map=lambda r: (r['index'] // 10, r['bar']))
# note: the grouping lambda and the chain.add call below are assumed completions
#       (group key = index // 10, value = 'bar'), matching the "sum of 'bar' for each
#       group of ten rows" description above; the original line is cut off in this snippet.
chain.add(map_lnk)
Example #6
#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk306_concatenate_reports'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

input_files = resources.fixture('correlated_data.sv.gz')

#########################################################################################
# --- now set up the chains and links based on configuration flags

data = Chain('Data')

# --- 0. readdata keeps on opening the next file in the file list.
#     all kwargs are passed on to pandas file reader.
read_data = analysis.ReadToDf(name='dflooper',
                              key='accounts',
                              reader='csv',
                              sep=' ')
read_data.path = input_files
data.add(read_data)

# --- 1. add data-frame summary link to the "Data" chain
summarizer = visualization.DfSummary(name='Create_stats_overview',
                                     read_key=read_data.key,
                                     pages_key='report_pages')
data.add(summarizer)
Example #7
#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

settings['read_data'] = True
settings['generate'] = True
settings['make_plot'] = True
settings['high_num_dims'] = False

input_files = [resources.fixture('correlated_data.sv.gz')]

#########################################################################################
# --- now set up the chains and links based on configuration flags

if settings['read_data']:
    ch = Chain('Data')

    # --- 0. read the input dataset
    read_data = analysis.ReadToDf(name='reader',
                                  key='correlated_data',
                                  reader='csv',
                                  sep=' ')
    read_data.path = input_files
    ch.add(read_data)

    # --- 1. convert into a roofit dataset (roodataset)
    #        build a KEYS pdf out of the dataset as well
    df2rds = root_analysis.ConvertDataFrame2RooDataSet()
    df2rds.read_key = read_data.key
    df2rds.store_key = 'rds_' + read_data.key
    df2rds.store_key_vars = 'keys_varset'
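    # assumed completion: add the converter link to the chain
    # (the KEYS-pdf option mentioned above is configured on this link in the full macro, but not shown here)
    ch.add(df2rds)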
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk207_record_vectorizer'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

# --- Set path of data
data_path = resources.fixture('dummy.csv')

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch1 = Chain('MyChain1')

# --- read dummy dataset
read_data = analysis.ReadToDf(key='test1',
                              sep='|',
                              reader='csv',
                              path=data_path)
ch1.add(read_data)

# --- print contents of the datastore
pds = core_ops.PrintDs(name='printer1')
pds.keys = ['test1']
ch1.add(pds)

# --- add the record vectorizer
#     Here the columns x and y of the input dataset are vectorized
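# --- an illustrative sketch of the vectorizer setup; the attribute names are assumed to follow
#     the same read_key/columns pattern as the RecordFactorizer examples above.
vectorizer = analysis.RecordVectorizer(name='rv1')
vectorizer.columns = ['x', 'y']
vectorizer.read_key = 'test1'
ch1.add(vectorizer)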
# --- now set up the chains and links based on configuration flags

# create read link
read_link = spark_analysis.SparkDfReader(name='Reader',
                                         store_key='spark_df',
                                         read_methods=['csv'])

# set CSV read arguments
read_link.read_meth_args['csv'] = (file_paths,)
read_link.read_meth_kwargs['csv'] = dict(sep=separator,
                                         header=has_header,
                                         inferSchema=infer_schema)

if columns:
    # add select function
    read_link.read_methods.append('select')
    read_link.read_meth_args['select'] = tuple(columns)

if num_partitions:
    # add repartition function
    read_link.read_methods.append('repartition')
    read_link.read_meth_args['repartition'] = (num_partitions,)

# add link to chain
read = Chain('Read')
read.add(read_link)

##########################################################################

logger.debug('Done parsing configuration file esk602_read_csv_to_spark_df.')
nan,3,bal,3,bla,bar,c,1
,nan,NaN,NaN,nan,nan,d,2
,,,,,,,3
1,2,,,,,,,6
"""

f = tempfile.NamedTemporaryFile(mode='w', delete=False)  # text mode, so the csv string can be written directly
f.write(tmp)
f.close()
# file is not immediately deleted because we used delete=False
# used below with f.name

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch = Chain('DataPrep')

# --- 0. pandas read_csv has multiple settings to help read in buggy CSVs.
#     o The option error_bad_lines=False skips lines with too few or too many values.
#     o The option encoding='latin1' interprets most non-standard characters.
read_data = analysis.ReadToDf(key='vrh',
                              reader='csv',
                              path=f.name,
                              error_bad_lines=False,
                              encoding='latin1')
ch.add(read_data)

# --- 1. standard setting:
#     o convert all nans to np.nan (= float)
#     o convert all rows in a column to the most occurring datatype in that column
fixer = data_quality.FixPandasDataFrame(name='fixer1')
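# assumed completion, following the standard link pattern used throughout these examples:
# point the fixer at the dataframe read above and add it to the chain.
fixer.read_key = 'vrh'
ch.add(fixer)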
# --- METHOD 1: configuration file

spark = sm.create_session(eskapade_settings=settings)
sc = spark.sparkContext

logger.info('---> METHOD 1: configuration file')
logger.info(str(sc.getConf().getAll()))

##########################################################################
# --- METHOD 2: link

conf_link = SparkConfigurator(name='SparkConfigurator', log_level='WARN')
conf_link.spark_settings = [('spark.app.name',
                             settings['analysisName'] + '_link'),
                            ('spark.master', 'local[42]'),
                            ('spark.driver.host', '127.0.0.1')]

config = Chain('Config')
config.add(conf_link)

logger.info('---> METHOD 2: link')
logger.info('NB: settings will be printed at time of link execution.')

##########################################################################
# --- running spark session will be stopped automatically at end

###########################################################################
# --- the end

logger.debug('Done parsing configuration file esk601_spark_configuration.')
Example #12
LICENSE.
"""

from eskapade import process_manager, Chain, ConfigObject
from eskapade.logger import Logger, LogLevel
from eskapade.analysis.links import ReadToDf

from eskapade_viz.links import DfSummaryBokeh

logger = Logger()

logger.debug('Now parsing configuration file bokeh_macro.')

# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'bokeh_macro'
settings['version'] = 0

# --- now set up the chains and links

ch = Chain('load_data')
link = ReadToDf(path='../data/planets.csv', skiprows=79, key='data')
ch.add(link)

ch = Chain('viz')
link = DfSummaryBokeh(read_key='data')
ch.add(link)

logger.debug('Done parsing configuration file bokeh_macro.py.')
# generate a dummy dataframe and add to datastore
# to this dataset selections are applied below, during link execution.

# NB: realize that, normally, such a dataframe is read or constructed on the fly
# during link execution.

df = DataFrame(randn(100, 3), columns=list('abc'))

ds = process_manager.service(DataStore)
ds['incoming_records'] = df

#########################################################################################
# --- Here we apply example selections to a dataframe picked up from the datastore.

data_prep = Chain('DataPrep')

# query_set = selections that are applied to incoming_records
# after the selections, only keep the columns in select_columns ('a', 'c')
link = analysis.ApplySelectionToDf(read_key='incoming_records',
                                   store_key='outgoing_records',
                                   query_set=['a>0', 'c<b'],
                                   select_columns=['a', 'c'])
# Any other kwargs given to ApplySelectionToDf are passed on to the
# pandas query() function.
link.logger.log_level = LogLevel.DEBUG
data_prep.add(link)

link = core_ops.DsObjectDeleter()
link.deletion_keys = ['incoming_records']
data_prep.add(link)
Example #14
read_link.read_meth_kwargs['csv'] = dict(sep=separator,
                                         header=has_header,
                                         inferSchema=infer_schema)

if columns:
    # add select function
    read_link.read_methods.append('select')
    read_link.read_meth_args['select'] = tuple(columns)

if num_partitions:
    # add repartition function
    read_link.read_methods.append('repartition')
    read_link.read_meth_args['repartition'] = (num_partitions, )

# add link to chain
read = Chain('Read')
read.add(read_link)

output = Chain('Output')

# fill spark histograms
hf = SparkHistogrammarFiller()
hf.read_key = read_link.store_key
hf.store_key = 'hist'
hf.logger.log_level = LogLevel.DEBUG
# columns that are picked up to do value_counting on in the input dataset
# note: can also be 2-dim: ['x','y']
# in this example, the rest are one-dimensional histograms
hf.columns = ['x', 'y', 'loc', ['x', 'y'], 'date']
output.add(hf)
Example #15
#########################################################################################

# when chunking through an input file, pick up only N lines in each iteration.
chunk_size = 5

#########################################################################################
# --- Set path of data
data_path = resources.fixture('dummy.csv')

#########################################################################################
# --- now set up the chains and links, based on configuration flags

# --- example 2: readdata loops over the input files, with file chunking.

if settings.get('do_example2', True):
    ch = Chain('MyChain2')
    ch.n_fork = 10

    # --- a loop is set up in the chain MyChain2.
    #     we iterate over (chunks of) the next file in the list until the iterator is done,
    #     then move on to the next chain (Overview)

    # --- readdata keeps on reading the next chunk_size (= 5) lines of the current file, or opens the next file in the list.
    #     all kwargs are passed on to pandas file reader.
    read_data = analysis.ReadToDf(name='dflooper2', key='test2', sep='|', reader='csv', usecols=['x', 'y'],
                                  chunksize=chunk_size)
    read_data.path = [data_path] * 3
    ch.add(read_data)

    # --- do something useful with the test dataset here ...
    #     e.g. apply selections, or collect into histograms.
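    # --- an illustrative sketch of such a step: apply a simple selection to each chunk,
    #     following the ApplySelectionToDf usage shown elsewhere on this page.
    #     The store key and the query on column x are assumptions.
    select = analysis.ApplySelectionToDf(read_key='test2',
                                         store_key='test2_selected',
                                         query_set=['x > 0'])
    ch.add(select)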
Example #16
logger.info(msg, path=persistence.io_path('results_data', 'report'))

COLUMNS = ['var_a', 'var_b', 'var_c']
SIZE = 10000
VAR_LABELS = dict(var_a='Variable A', var_b='Variable B', var_c='Variable C')
VAR_UNITS = dict(var_b='m/s')
GEN_CONF = dict(var_a=dict(choice=['alpha', 'beta', 'gamma'], dtype=str),
                var_b=dict(mean=3., std=1.),
                var_c=dict(choice=['delta', 'epsilon', 'zeta', 'eta'],
                           dtype=str))

#########################################################################################
# --- now set up the chains and links based on configuration flags

# create chains
data = Chain('Data')

# add data-generator link to "Data" chain
generator = analysis.BasicGenerator(name='Generate_data',
                                    key='data',
                                    columns=COLUMNS,
                                    size=SIZE,
                                    gen_config=GEN_CONF)
data.add(generator)

# add data-frame boxplot link to the "BoxPlot" chain
# can provide labels and units for the variables in the dataset, and set the statistics to print in the output file
plot = Chain('BoxPlot')
box_plot = visualization.DfBoxplot(
    name='Create_stats_overview',
    read_key=generator.key)
# note: further DfBoxplot options (variable labels, units, statistics selection) used in the
#       original macro are not shown in this snippet.
plot.add(box_plot)
Example #17
settings['generate'] = True
# settings['read_data'] = not settings['generate']
settings['model'] = True
settings['process'] = True
settings['fit_plot'] = True
settings['summary'] = True

fitpdf = 'sum3pdf'
n_percentile_bins = 300

#########################################################################################
# --- now set up the chains and links based on configuration flags

if settings['model']:
    # --- generate pdf
    ch = Chain('Model')

    # --- 1. define a model
    wsu = root_analysis.WsUtils(name='modeller')
    factory = [
        "RooWeibull::wb1(t[0,110000000],a1[0.93,0,2],b1[2.2e-4,0,1e-3])",
        "RooWeibull::wb2(t,a2[0.61,0,2],b2[1.1e-5,0,1e-3])",
        "RooWeibull::wb3(t,a3[0.43,0,2],b3[4.7e-7,0,1e-3])",
        "RooWeibull::wb4(t,a4[0.43,0,2],b4[2.2e-7,0,1e-3])",
        "SUM::sum2pdf(N1[580000,0,2e6]*wb1,N2[895000,0,2e6]*wb2)",
        "SUM::sum3pdf(N1[580000,0,2e6]*wb1,N2[895000,0,2e6]*wb2,N3[150500,0,2e6]*wb3)",
        "SUM::sum4pdf(N1[580000,0,2e6]*wb1,N2[895000,0,2e6]*wb2,N3[150500,0,2e6]*wb3,N4[1e5,0,2e6]*wb4)"
    ]
    wsu.factory += factory
    ch.add(wsu)
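
if settings['fit_plot']:
    # --- an illustrative sketch only: simulate data from the chosen pdf and fit it back,
    #     using the WsUtils helpers shown in Example #24 below. The chain name, sample size
    #     and datastore keys are assumptions.
    ch = Chain('FitPlot')
    wsu = root_analysis.WsUtils(name='fitter')
    wsu.add_simulate(pdf=fitpdf, obs='t', num=100000, key='simdata', into_ws=True)
    wsu.add_fit(pdf=fitpdf, data='simdata', key='fit_result', into_ws=True)
    ch.add(wsu)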
Example #18
# --- now set up the chains and links based on configuration flags

# demo consists of 5 simple parts:
#
# 1. putting items in the datastore, and displaying the contents.
# 2. asserting the presence of items in the datastore, and then deleting individual items from the datastore.
# 3. deleting all items from the datastore.
# 4. deleting all but certain items from the datastore.
# 5. moving, copying, or removing objects from the datastore

#########
# chain 1:
# - putting items in the datastore.
# - displaying the contents of items in the datastore.

ch = Chain('chain1')

# the link ToDsDict adds objects to the datastore
# by default this happens at the execution of the link.
# (optionally, this can be done at initialization.)
# Here it is used as a dummy data generator.

link = core_ops.ToDsDict(name='intods_1')
link.obj = f
# copydict = true: all items in dict f are added to the datastore
link.copydict = True
ch.add(link)

# print contents of datastore
link = core_ops.PrintDs()
link.keys = ['n_favorite', 'hello']
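# assumed completion: add the printer link to chain 1
ch.add(link)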
            data_path=settings['resultsDir'] + '/' + settings['analysisName'] +
            '/data/v0/',
            conf_path=settings['resultsDir'] + '/' + settings['analysisName'] +
            '/config/v0/')

# dummy information used in this macro, added to each chain below.
f = {'hello': 'world', 'v': [3, 1, 4, 1, 5], 'n_favorite': 7}
g = {'a': 1, 'b': 2, 'c': 3}
h = [2, 7]

#########################################################################################
# --- now set up the chains and links based on configuration flags

#########
# chain 1
ch = Chain('chain1')

# the link ToDsDict adds objects to the datastore at link execution.
link = core_ops.ToDsDict(name='intods_1')
link.store_key = 'f'
link.obj = f
ch.add(link)

# print contents of datastore
link = core_ops.PrintDs()
ch.add(link)

#########
# chain 2
ch = Chain('chain2')
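
# an illustrative sketch for chain 2, mirroring chain 1 above
# (the link name and store key 'g' are assumptions)
link = core_ops.ToDsDict(name='intods_2')
link.store_key = 'g'
link.obj = g
ch.add(link)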
Example #20
#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk101_helloworld'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

#     E.g. define flags to turn on or off certain chains with links.
#     by default all are set to False, unless already configured in the
#     configobject or vars()

settings['do_hello'] = True
settings['n_repeat'] = 2

#########################################################################################
# --- now set up the chains and links based on configuration flags

if settings['do_hello']:
    hello = Chain(name='Hello')
    link = core_ops.HelloWorld(name='HelloWorld')
    link.logger.log_level = LogLevel.DEBUG
    link.repeat = settings['n_repeat']
    hello.add(link)

#########################################################################################

logger.debug('Done parsing configuration file esk101_helloworld')
Example #21
"""
logger.info(msg,
            path=settings['resultsDir'] + '/' + settings['analysisName'] +
            '/data/v0/report/')

COLUMNS = ['var_a', 'var_b', 'var_c']
SIZE = 10000
VAR_LABELS = dict(var_a='Variable A', var_b='Variable B', var_c='Variable C')
VAR_UNITS = dict(var_b='m/s')
GEN_CONF = dict(var_b=dict(mean=42., std=2.),
                var_c=dict(mean=42, std=2, dtype=int))

#########################################################################################
# --- now set up the chains and links based on configuration flags

data = Chain('Data')
# add data-generator link to "Data" chain
generator = analysis.BasicGenerator(name='Generate_data',
                                    key='data',
                                    columns=COLUMNS,
                                    size=SIZE,
                                    gen_config=GEN_CONF)
data.add(generator)

# add data-frame summary link to "Summary" chain
# can provide labels and units for the variables in the dataset
summary = Chain('Summary')
summarizer = visualization.DfSummary(name='Create_stats_overview',
                                     read_key=generator.key,
                                     var_labels=VAR_LABELS,
                                     var_units=VAR_UNITS)
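# assumed completion: add the summary link to the Summary chain
summary.add(summarizer)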
settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk404_workspace_createpdf_simulate_fit_plot'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

settings['generate_fit_plot'] = True
settings['summary'] = True

#########################################################################################
# --- now set up the chains and links based on configuration flags

if settings['generate_fit_plot']:
    # --- generate pdf, simulate, fit, and plot
    ch = Chain('WsOps')

    # --- 1. define a model by passing strings to the rooworkspace factory
    #     For the workspace factory syntax, see:
    #     https://root.cern.ch/doc/master/RooFactoryWSTool_8cxx_source.html#l00722
    #     For rooworkspace factory examples see:
    #     https://root.cern.ch/root/html/tutorials/roofit/rf511_wsfactory_basic.C.html
    #     https://root.cern.ch/root/html/tutorials/roofit/rf512_wsfactory_oper.C.html
    #     https://root.cern.ch/root/html/tutorials/roofit/rf513_wsfactory_tools.C.html
    wsu = root_analysis.WsUtils(name='modeller')
    wsu.factory = ["Gaussian::sig1(x[-10,10],mean[5,0,10],0.5)",
                   "Gaussian::sig2(x,mean,1)",
                   "Chebychev::bkg(x,{a0[0.5,0.,1],a1[-0.2,-1,1]})",
                   "SUM::sig(sig1frac[0.8,0.,1.]*sig1,sig2)",
                   "SUM::model(bkgfrac[0.5,0.,1.]*bkg,sig)"]
    ch.add(wsu)
Example #23
}
process_meth_kwargs = {
    'df': {
        set_num_parts: dict(max_num_parts=2)
    },
    'rdd': {},
    'list': {
        filter_list: dict(min_index=20)
    },
    'pd': {
        filter_pd: dict(min_index=20)
    }
}

# create chain and data-frame-creator links
chain = Chain('Create')
for out_format in process_methods:
    # create data-frame-conversion link
    lnk = spark_analysis.SparkDfConverter(
        name='df_to_{}_converter'.format(out_format),
        read_key='df',
        store_key='{}_output'.format(out_format),
        schema_key='{}_schema'.format(out_format),
        output_format=out_format,
        preserve_col_names=False,
        process_methods=process_methods[out_format],
        process_meth_args=process_meth_args[out_format],
        process_meth_kwargs=process_meth_kwargs[out_format])

    # add link to chain
    chain.add(lnk)
Example #24
#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk408_classification_error_propagation_after_fit'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

#########################################################################################
# --- now set up the chains and links based on configuration flags

# --- generate pdf, simulate, fit, and plot
ch = Chain('WsOps')

# 1. simulate output score of machine learning classifier
wsu = root_analysis.WsUtils(name='DataSimulator')
wsu.factory = ["RooGaussian::high_risk(score[0,1],1,0.15)",
               "RooPolynomial::low_risk(score,{-0.3,-0.3})",
               "SUM::model(frac[0.1,0.,1.]*high_risk,low_risk)"]
wsu.add_simulate(pdf='model', obs='score', num=500, key='data', into_ws=True)
wsu.add_fit(pdf='model', data='data', key='fit_result', into_ws=True)
wsu.add_plot(obs='score', data='data', pdf='model', key='simplot')
wsu.add_plot(obs='score', pdf='model',
             pdf_args=(RooFit.Components('low_risk'), RooFit.LineColor(ROOT.kRed),
                       RooFit.LineStyle(ROOT.kDashed)),
             output_file='data_with_generator_model.pdf', key='simplot')
ch.add(wsu)
spark = process_manager.service(SparkManager).create_session(eskapade_settings=settings)

##########################################################################
# CSV and dataframe settings

# NB: local file may not be accessible to worker node in cluster mode
file_paths = ['file:' + resources.fixture('dummy1.csv'),
              'file:' + resources.fixture('dummy2.csv')]

# define store_key for all data files to be read in
STORE_KEYS = ['spark_df1', 'spark_df2']

##########################################################################
# Now set up the chains and links based on configuration flags

read = Chain('Read')

# create read link for each data file
for index, key in enumerate(STORE_KEYS):
    read_link = SparkDfReader(name='Reader' + str(index + 1), store_key=key, read_methods=['csv'])

    # set CSV read arguments
    read_link.read_meth_args['csv'] = (file_paths[index],)
    read_link.read_meth_kwargs['csv'] = dict(sep='|', header=True, inferSchema=True)

    # add link to chain
    read.add(read_link)

# create SQL-query link
sql_link = SparkExecuteQuery(name='SparkSQL', store_key='spark_df_sql')
##########################################################################
# Start Spark session

spark = process_manager.service(SparkManager).create_session(
    eskapade_settings=settings)

##########################################################################
# CSV and dataframe settings

# NB: local file may not be accessible to worker node in cluster mode
file_path = ['file:' + resources.fixture('dummy1.csv')]

##########################################################################
# Now set up the chains and links based on configuration flags

read = Chain('Read')

# create read link for each data file
read_link = SparkDfReader(name='ReadFile',
                          store_key='spark_df',
                          read_methods=['csv'])

# set CSV read arguments
read_link.read_meth_args['csv'] = (file_path, )
read_link.read_meth_kwargs['csv'] = dict(sep='|',
                                         header=True,
                                         inferSchema=True)

# add link to chain
read.add(read_link)
import numpy as np

from eskapade import data_mimic
from eskapade import process_manager, ConfigObject, Chain
from eskapade.logger import Logger, LogLevel

logger = Logger()
logger.debug('Now parsing configuration file esk703_mimic_data')

#########################################################################################
# --- minimal analysis information
settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk703_mimic_data'
settings['version'] = 0

np.random.seed(42)

ch = Chain('DataPrep')
ch.logger.log_level = LogLevel.DEBUG

sim_data = data_mimic.MixedVariablesSimulation(store_key='df',
                                               n_obs=100000,
                                               p_unordered=np.array(
                                                   [[0.2, 0.2, 0.3, 0.3],
                                                    [0.3, 0.7]]))

sim_data.logger.log_level = LogLevel.DEBUG
ch.add(sim_data)

pre_data = data_mimic.KDEPreparation(
    read_key='df',
    data_store_key='data',
    data_smoothed_store_key='data_smoothed')
# note: further KDEPreparation keyword arguments used in the original macro are not shown here.
ch.add(pre_data)
Example #28
logger.debug(
    'Now parsing configuration file esk407_classification_unbiased_fit_estimate'
)

#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk407_classification_unbiased_fit_estimate'
settings['version'] = 0

#########################################################################################
# --- now set up the chains and links based on configuration flags

# --- generate pdf, simulate, fit, and plot
ch = Chain('WsOps')

# 1. simulate output score of machine learning classifier
wsu = WsUtils(name='DataSimulator')
wsu.factory = [
    "expr::trans('@0-@1',{score[0,1],1})",
    "RooExponential::high_risk(trans,10)",
    "RooPolynomial::low_risk(score,{-0.4,-0.4})",
    "SUM::model(low_risk_frac[0.95,0.,1.]*low_risk,high_risk)"
]
wsu.add_simulate(pdf='high_risk',
                 obs='score',
                 num=1000,
                 key='unbiased_high_risk_testdata',
                 into_ws=True)
wsu.add_simulate(pdf='low_risk',
                 obs='score',
                 # num and key below are assumed completions, mirroring the high_risk sample above
                 num=1000,
                 key='unbiased_low_risk_testdata',
                 into_ws=True)
Example #29
                      var_range=(0., MAX_AGE),
                      var=('redeem_age', 0.),
                      max_var=('age', MAX_AGE),
                      exp=[('rate_fast', FAST_REDEEM_RATE),
                           ('rate_slow', SLOW_REDEEM_RATE)],
                      fracs=[('frac_fast', FAST_FRAC)])
    model.build_model()
    model.var.SetTitle('Redeem age')
    model.max_var.SetTitle('Age')
    model.var.setUnit('days')
    model.max_var.setUnit('days')

###############################################################################
# --- create chain for generating voucher redeem data

ch = Chain('Generation')
gen_link = TruncExpGen(name='Generate',
                       store_key=REDEEM_DATA_KEY,
                       max_var_data_key=AGE_DATA_KEY,
                       model_name=MODEL_NAME,
                       event_frac=REDEEM_FRAC)
ch.add(gen_link)

np.random.seed(settings['seeds']['NumPy'])
ROOT.RooRandom.randomGenerator().SetSeed(settings['seeds']['RooFit'])

###############################################################################
# --- create chain for fitting voucher redeem model to generated data

ch = Chain('Fitting')
fit_link = TruncExpFit(name='Fit',
Example #30
def first_word(x):
    """Take first word."""
    return x.split()[0]


#########################################################################################
# --- now set up the chains and links based on configuration flags

# This chain does 'mapping'. (macro B does 'reduction'.)

# --- mapper: chain with event looper
#     this eventlooper link serves as a mapper.
#     in this example the lines are converted to lower chars, and the first word is selected.
if settings['do_map']:
    ch = Chain("Mapper")
    looper = core_ops.EventLooper(name='listener')
    looper.skip_line_beginning_with = ['#']
    looper.line_processor_set = [first_word, to_lower]
    if settings['TESTING']:
        looper.filename = f.name
    ch.add(looper)

# --- reducer: chain with event looper
#     this eventlooper link serves as a reducer
#     in this example the lines are grouped together into unique sets.
if settings['do_reduce']:
    ch = Chain("Reducer")
    looper = core_ops.EventLooper(name='grouper')
    # reducer selects all unique lines
    looper.sort = True