# key to store the results of the residuals test in the datastore
hypotest.sk_residuals_map = 'residuals'
# key to store the results of the residuals test in the datastore, in a format
# that makes further processing easier
hypotest.sk_residuals_overview = 'residuals_overview'
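
# Illustrative sketch (not part of the original macro): with the keys set above,
# a later link could pick the residuals results up from the datastore, e.g.:
#
#   from eskapade import DataStore
#   ds = proc_mgr.service(DataStore)
#   overview = ds['residuals_overview']  # overview table, easy to process further
#   residuals = ds['residuals']          # full residuals map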

# Advanced settings
# Specify what categories to ignore.
# hypotest.ignore_categories = ['None', 'Not_familiar_with', 'NoFruit']
# hypotest.var_ignore_categories = {'obs1': 'None', 'obs2': 'Not_familiar_with', 'obs1:obs2': ['None', 'pear']}
# The hypothesis tester is also applicable to continuous variables once they are
# categorised, i.e. put into bins. The number of bins can be set with the following options:
# hypotest.default_number_of_bins = 5
# hypotest.var_default_number_of_bins = {'obs1': 10, 'obs2': 5, 'obs1:obs2': [3, 3]}

hypotest.set_log_level(logging.DEBUG)
ch.add_link(hypotest)

# --- 4. print contents of the datastore
proc_mgr.add_chain('Overview')
hist_summary = visualization.DfSummary(name='HistogramSummary',
                                       read_key=hypotest.hist_dict_key,
                                       pages_key=hypotest.pages_key)
proc_mgr.get_chain('Overview').add_link(hist_summary)

#########################################################################################

log.debug(
    'Done parsing configuration file esk410_testing_correlations_between_categories'
)
Code example #2
if not stream_type or stream_type == 'file':
    ds['dstream'] = ssc.textFileStream('/tmp/eskapade_stream_test/')
elif stream_type == 'tcp':
    ds['dstream'] = ssc.socketTextStream('localhost', 9999)
else:
    log.error('unsupported stream_type specified: {}'.format(stream_type))
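
# To feed these streams when testing this macro (an assumption, not part of the
# original snippet): for the file stream, drop text files into
# /tmp/eskapade_stream_test/ while the job runs; for the tcp stream, serve lines
# on port 9999, e.g. with netcat:
#
#   nc -lk 9999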

##########################################################################
# --- now set up the chains and links based on configuration flags

proc_mgr.add_chain('SparkStreaming')

# the word count example
wordcount_link = spark_analysis.SparkStreamingWordCount(
    name='SparkStreamingWordCount', read_key='dstream', store_key='wordcounts')
proc_mgr.get_chain('SparkStreaming').add_link(wordcount_link)
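
# Under the hood this link presumably performs the classic Spark Streaming word
# count; a minimal sketch of that pattern (assumed, not the link's actual code):
#
#   counts = (ds['dstream']
#             .flatMap(lambda line: line.split(' '))
#             .map(lambda word: (word, 1))
#             .reduceByKey(lambda a, b: a + b))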

# store output
writer_link = spark_analysis.SparkStreamingWriter(
    name='SparkStreamingWriter',
    read_key=wordcount_link.store_key,
    output_path='file:' + persistence.io_dir('results_data', settings.io_conf()) + '/dstream/wordcount',
    mode='overwrite',
    suffix='txt',
    repartition=1)
proc_mgr.get_chain('SparkStreaming').add_link(writer_link)

# start/stop of Spark Streaming
# NB: the original snippet breaks off mid-call here; the call is completed below
# with just the link name, as an assumption, following the pattern of the links above
control_link = spark_analysis.SparkStreamingController(name='SparkStreamingController')
proc_mgr.get_chain('SparkStreaming').add_link(control_link)
Code example #3
# create process manager
proc_mgr = ProcessManager()

# create chains
proc_mgr.add_chain('Data')
proc_mgr.add_chain('BoxPlot')

# add data-generator link to "Data" chain

generator = analysis.BasicGenerator(name='Generate_data',
                                    key='data',
                                    columns=COLUMNS,
                                    size=SIZE,
                                    gen_config=GEN_CONF)
proc_mgr.get_chain('Data').add_link(generator)
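
# COLUMNS, SIZE and GEN_CONF are defined earlier in the macro; purely as an
# illustration (hypothetical values, not taken from the original), they could
# look like:
#
#   COLUMNS = ['var_a', 'var_b', 'var_c']
#   SIZE = 10000
#   GEN_CONF = {'var_b': {'mean': 42., 'std': 2.}}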

# add data-frame summary link to "BoxPlot" chain
# labels and units can be provided for the variables in the dataset, and the
# statistics printed in the output file can be configured
boxplot = visualization.DfBoxplot(name='Create_stats_overview',
                                  read_key=generator.key,
                                  statistics=['count', 'mean', 'min', 'max', 'std'],
                                  var_labels=VAR_LABELS,
                                  var_units=VAR_UNITS,
                                  column='var_b',
                                  cause_columns=['var_a', 'var_c'],
                                  results_path=persistence.io_path('results_data', settings.io_conf(), 'report'))
proc_mgr.get_chain('BoxPlot').add_link(boxplot)
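
# VAR_LABELS and VAR_UNITS are likewise defined earlier in the macro; a
# hypothetical example of their shape:
#
#   VAR_LABELS = {'var_a': 'Variable A', 'var_b': 'Variable B', 'var_c': 'Variable C'}
#   VAR_UNITS = {'var_b': 'kg'}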

#########################################################################################
Code example #4
fact = analysis.RecordFactorizer(name='rf1')
fact.columns = ['isActive', 'eyeColor', 'favoriteFruit', 'gender']
fact.read_key = 'accounts'
fact.inplace = True
# the factorizer stores a dict with the mappings back to the original observables
fact.sk_map_to_original = 'to_original'
# the factorizer also stores a dict with the mappings that have been applied to all observables
fact.sk_map_to_factorized = 'to_factorized'
fact.set_log_level(logging.DEBUG)
ch.add_link(fact)
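
# As an illustration (hypothetical values), the two stored mapping dicts could
# look like this for e.g. the 'eyeColor' column:
#
#   ds['to_factorized'] = {'eyeColor': {'blue': 0, 'brown': 1, 'green': 2}}
#   ds['to_original']   = {'eyeColor': {0: 'blue', 1: 'brown', 2: 'green'}}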

# --- 2. Fill a RooDataHist
df2rdh = root_analysis.RooDataHistFiller()
df2rdh.read_key = readdata.key
df2rdh.store_key = 'rdh_' + readdata.key
# the observables in this map are treated as categorical observables by RooFit (RooCategory)
df2rdh.map_to_factorized = 'to_factorized'
df2rdh.columns = ['transaction', 'latitude', 'longitude', 'age', 'eyeColor', 'favoriteFruit']
# df2rdh.into_ws = True
ch.add_link(df2rdh)
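
# Illustrative sketch (assuming readdata.key == 'accounts'): a later link could
# retrieve the filled RooDataHist from the datastore, e.g.:
#
#   rdh = proc_mgr.service(DataStore)['rdh_accounts']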

# --- print contents of the datastore
proc_mgr.add_chain('Overview')
pds = core_ops.PrintDs()
pds.keys = ['n_rdh_accounts', 'n_accounts']
proc_mgr.get_chain('Overview').add_link(pds)

#########################################################################################

log.debug('Done parsing configuration file esk402_roodatahist_fill')
Code example #5
# (in the original macro this block sits behind a configuration flag, analogous
# to settings['summary'] below; the flag line is missing from this snippet, so
# the block is dedented here to keep it valid Python)
ch = proc_mgr.add_chain('Conversion2')

rds2df = root_analysis.ConvertRooDataSet2DataFrame()
rds2df.read_key = 'simdata'
rds2df.store_key = 'df_simdata'
rds2df.remove_original = True
ch.add_link(rds2df)
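
# after this link runs, ds['df_simdata'] holds the converted pandas DataFrame;
# remove_original=True presumably drops the original 'simdata' RooDataSet from
# the datastore to save memory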

if settings['summary']:
    # capture the chain so that the links below are attached to 'Summary'
    ch = proc_mgr.add_chain('Summary')

    # print contents of the workspace
    pws = root_analysis.PrintWs()
    ch.add_link(pws)

    # print contents of datastore
    pds = core_ops.PrintDs(name='pds2')
    # pds.keys = ['accounts']
    ch.add_link(pds)

    # --- make a summary document of simulated dataframe
    summarizer = visualization.DfSummary(name='Create_stats_overview',
                                         read_key=rds2df.store_key)
    proc_mgr.get_chain('Summary').add_link(summarizer)

#########################################################################################

log.debug(
    'Done parsing configuration file esk404_workspace_createpdf_simulate_fit_plot'
)
Code example #6
#########################################################################################
# --- now set up the chains and links based on configuration flags

# create chains
proc_mgr.add_chain('Data')
proc_mgr.add_chain('Summary')

# load data
reader = analysis.ReadToDf(name='reader',
                           path=settings['input_path'],
                           sep=settings['separator'],
                           key='input_data',
                           reader=settings['reader'])

proc_mgr.get_chain('Data').add_link(reader)
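
# the settings used above are filled elsewhere in the macro (or on the command
# line); a hypothetical example of their values:
#
#   settings['input_path'] = 'data/input.csv'
#   settings['separator'] = ','
#   settings['reader'] = 'csv'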

# make visualizations
for corr in settings['correlations']:
    corr_link = visualization.CorrelationSummary(name=corr + '_summary',
                                                 read_key='input_data',
                                                 write_key=corr + '_correlations',
                                                 method=corr)
    proc_mgr.get_chain('Summary').add_link(corr_link)
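
# settings['correlations'] holds the list of correlation methods to compute; as
# a hypothetical example: settings['correlations'] = ['pearson', 'kendall', 'spearman']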

#########################################################################################

log.debug('Done parsing configuration file esk305_correlation_summary')