# select the columns to histogram
df2rdh.columns = ['longitude', 'age', 'eyeColor']
# cap the total number of bins so the n-dimensional histogram cannot blow up
df2rdh.n_max_total_bins = 1e6
# build a histogram-based pdf from the RooDataHist object; this pdf is used
# below to simulate a new dataset with the same properties as the original
df2rdh.create_hist_pdf = 'hpdf_Ndim'
# store all output in the RooFit workspace instead of the datastore
df2rdh.into_ws = True
ch.add(df2rdh)

# --- overview printers: workspace contents first, then datastore contents
ws_printer = root_analysis.PrintWs()
ch.add(ws_printer)

ds_printer = core_ops.PrintDs()
ch.add(ds_printer)

# --- 3. resimulate the data with the created hist-pdf, and plot both the
#        simulated data and the pdf
ch = Chain('WsOps')
ws_utils = root_analysis.WsUtils()
ws_utils.add_simulate(pdf='hpdf_Ndim',
                      obs='rdh_vars',
                      num=10000,
                      key='simdata')
ws_utils.add_plot(obs='age',
                  data='simdata',
                  pdf='hpdf_Ndim',
                  output_file='test.pdf',
                  pdf_kwargs={'ProjWData': ('rdh_cats', 'simdata')})
ch.add(ws_utils)
#########################################################################################
# --- Example 2 ---
    # Any other kwargs given to ApplySelectionToDf are passed on to the
    # pandas query() function.
    ch.add_link(link)

    # --- As an example, merge the reduced datasets back into a single,
    #     merged dataframe.
    concat = analysis.DfConcatenator()
    concat.readKeys = ['merged', 'reduced_data']
    concat.storeKey = 'merged'
    concat.ignore_missing_input = True  # in first iteration input 'merged' is missing.
    ch.add_link(concat)

    # --- this serves as the continue statement of the loop: go back to the
    #     start of the chain as long as the reader requests a repeat
    repeater = core_ops.RepeatChain()
    # repeat until readdata says halt.
    repeater.listenTo = 'chainRepeatRequestBy_' + readdata.name
    # repeat max of 10 times
    #repeater.maxcount = 10
    ch.add_link(repeater)

# --- print selected items from the datastore at the end of processing
proc_mgr.add_chain('Overview')
printer = core_ops.PrintDs(name='End')
# for these keys the actual contents are printed, not just the key names
printer.keys = ['n_test1', 'n_sum_test1', 'n_test2', 'n_sum_test2',
                'test2', 'n_merged']
proc_mgr.get_chain('Overview').add_link(printer)

#########################################################################################

log.debug('Done parsing configuration file esk209_read_big_data_itr')
ch = proc_mgr.add_chain('chain1')

# The link ToDsDict adds objects to the datastore.
# By default this happens at the execution of the link
# (optionally, this can be done at initialization).
# Here it is used as a dummy data generator.
to_ds = core_ops.ToDsDict(name='intods_1')
to_ds.obj = f
# copydict = True: every item in dict f is added to the datastore
to_ds.copydict = True
ch.add_link(to_ds)

# print the generated items from the datastore
printer = core_ops.PrintDs()
printer.keys = ['n_favorite', 'hello']
ch.add_link(printer)

#########
# chain 2
# - asserting the presence of items in the datastore.
# - deleting individual items from the datastore.

ch = proc_mgr.add_chain('chain2')

# AssertInDs checks that the listed objects are present in the datastore
assertion = core_ops.AssertInDs()
assertion.keySet = ['hello', 'n_favorite']
ch.add_link(assertion)
# --- Example 4 ---
data_path = persistence.io_path('data', settings.io_conf(), 'dummy.csv')

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch1 = proc_mgr.add_chain('Factorize')

# --- read the pipe-separated dummy dataset into the datastore under key 'test1'
reader = analysis.ReadToDf(key='test1',
                           sep='|',
                           reader='csv',
                           path=data_path)
ch1.add_link(reader)

# --- print contents of the datastore
printer = core_ops.PrintDs(name='printer1')
printer.keys = ['test1']
ch1.add_link(printer)

# --- add the record factorizer
#     Here the columns dummy and loc of the input dataset are factorized,
#     e.g. x = ['apple', 'tree', 'pear', 'apple', 'pear'] becomes the column:
#     x = [0, 1, 2, 0, 2]
#     By default, the mapping is stored in a dict under key: 'map_'+store_key+'_to_original'
factorizer = analysis.RecordFactorizer(name='rf1')
factorizer.columns = ['dummy', 'loc']
factorizer.read_key = 'test1'
factorizer.store_key = 'test1_fact'
factorizer.sk_map_to_original = 'to_original'
factorizer.set_log_level(logging.DEBUG)
ch1.add_link(factorizer)
#########################################################################################
# --- now set up the chains and links based on configuration flags

proc_mgr = ProcessManager()

ch = proc_mgr.add_chain('DataPrep')

# apply the conversion functions in conv_funcs to 'incoming_data' and
# store the result under 'transformed_data'
transform = analysis.ApplyFuncToDf(name='Transform',
                                   read_key='incoming_data',
                                   store_key='transformed_data',
                                   apply_funcs=conv_funcs)
# Any other kwargs given to ApplyFuncToDf are passed on to the
# pandas query() function.
transform.set_log_level(logging.DEBUG)
ch.add_link(transform)

# drop the raw input once the transformed copy exists
deleter = core_ops.DsObjectDeleter()
deleter.deletionKeys = ['incoming_data']
ch.add_link(deleter)

# show the transformed dataframe
printer = core_ops.PrintDs()
printer.keys = ['transformed_data']
ch.add_link(printer)

#########################################################################################

log.debug('Done parsing configuration file esk203_apply_func_to_pandas_df')
    # --- convert the simulated RooDataSet back into a pandas DataFrame
    ch = proc_mgr.add_chain('Conversion2')

    rds2df = root_analysis.ConvertRooDataSet2DataFrame()
    rds2df.read_key = 'simdata'
    rds2df.store_key = 'df_simdata'
    # drop the original RooDataSet after conversion
    rds2df.remove_original = True
    ch.add_link(rds2df)

# --- optional summary stage: print workspace/datastore and document the
#     simulated dataframe
if settings['summary']:
    proc_mgr.add_chain('Summary')

    # print contents of the workspace
    pws = root_analysis.PrintWs()
    # NOTE(review): 'ch' here is whatever chain was assigned most recently
    # above — presumably these links are meant for the 'Summary' chain
    # created on the previous line; confirm against the original macro.
    ch.add_link(pws)

    # print contents of datastore
    pds = core_ops.PrintDs(name='pds2')
    #pds.keys = ['accounts']
    ch.add_link(pds)

    # --- make a summary document of simulated dataframe
    summarizer = visualization.DfSummary(name='Create_stats_overview',
                                         read_key=rds2df.store_key)
    proc_mgr.get_chain('Summary').add_link(summarizer)

#########################################################################################

log.debug(
    'Done parsing configuration file esk404_workspace_createpdf_simulate_fit_plot'
)
# --- Example 7 ---
    ch.add(read_data)

    # --- 1. convert into a roofit dataset (roodataset)
    #        build a KEYS pdf out of the dataset as well
    df2rds = root_analysis.ConvertDataFrame2RooDataSet()
    df2rds.read_key = read_data.key
    df2rds.store_key = 'rds_' + read_data.key
    df2rds.store_key_vars = 'keys_varset'
    # include an extra column when running in high-dimensional mode
    df2rds.columns = ['x2', 'x3', 'x4'
                      ] if settings['high_num_dims'] else ['x2', 'x3']
    df2rds.store_index = False
    # build a KEYS pdf out of the roodataset, used for simulation below
    df2rds.create_keys_pdf = 'keys_Ndim'
    ch.add(df2rds)

    pds = core_ops.PrintDs(name='pds1')
    ch.add(pds)

if settings['generate']:
    # --- 2. simulate a new dataset with the keys pdf, and then plot this dataset
    ch = Chain('WsOps')
    wsu = root_analysis.WsUtils()
    # draw 5000 events from the KEYS pdf; into_ws=True stores the result
    # in the RooFit workspace under key 'simdata'
    wsu.add_simulate(pdf='keys_Ndim',
                     obs='keys_varset',
                     num=5000,
                     key='simdata',
                     into_ws=True)
    # plot each simulated observable to its own pdf file
    wsu.add_plot(obs='x2', data='simdata', output_file='x2_simdata.pdf')
    wsu.add_plot(obs='x3', data='simdata', output_file='x3_simdata.pdf')
    if settings['high_num_dims']:
        wsu.add_plot(obs='x4', data='simdata', output_file='x4_simdata.pdf')
# --- Example 8 ---
# seed the datastore with a couple of dummy items
ds = ProcessManager().service(DataStore)
ds['hello'] = 'world'
ds['d'] = {'a': 1, 'b': 2, 'c': 3}

#########################################################################################
# --- now set up the chains and links based on configuration flags

proc_mgr = ProcessManager()

ch = proc_mgr.add_chain('Overview')

# 1. PrintDs prints an overview of the datastore contents at the moment the
#    link executes: the list of keys plus the corresponding object types.
printer = core_ops.PrintDs(name='printer1')
# for these keys the actual contents are printed as well
printer.keys = ['hello', 'd']
ch.add_link(printer)

# 2. IPythonEmbed starts an interactive ipython session.
#    From that session, one can access the datastore and the configobject with:
#    >>> ds
#    or
#    >>> settings
#    Try to add something to the datastore in this session!
#    >>> ds['foo'] = 'bar'
if not settings['TESTING']:
    embed = core_ops.IPythonEmbed()
    ch.add_link(embed)