# during link execution.
df = DataFrame(randn(100, 3), columns=list('abc'))
ds = process_manager.service(DataStore)
ds['incoming_records'] = df

#########################################################################################
# --- Here we apply example selections to a dataframe picked up from the datastore.

data_prep = Chain('DataPrep')

# query_set = selections that are applied to 'incoming_records';
# after the selections, only the columns in select_columns ('a', 'c') are kept.
link = analysis.ApplySelectionToDf(read_key='incoming_records',
                                   store_key='outgoing_records',
                                   query_set=['a>0', 'c<b'],
                                   select_columns=['a', 'c'])
# Any other kwargs given to ApplySelectionToDf are passed on to the
# pandas query() function.
link.logger.log_level = LogLevel.DEBUG
data_prep.add(link)

# drop the original records from the datastore once the selection is stored
link = core_ops.DsObjectDeleter()
link.deletion_keys = ['incoming_records']
data_prep.add(link)

# print the selected records and their count
link = core_ops.PrintDs()
link.keys = ['n_outgoing_records', 'outgoing_records']
data_prep.add(link)

#########################################################################################
# --- this serves as the break statement from this loop:
#     if dataset 'test2' is empty — which can happen for the very last dataset
#     delivered by readdata — then skip the rest of this chain.
skipper = core_ops.SkipChainIfEmpty()
skipper.collectionSet = ['test2']
skipper.checkAtInitialize = False
skipper.checkAtExecute = True
ch.add_link(skipper)

# --- do something useful with the test dataset here ...
#     e.g. apply selections, or collect into histograms.

# querySet = selections that are applied to the incoming records
link = analysis.ApplySelectionToDf(readKey='test2',
                                   storeKey='reduced_data',
                                   querySet=['x>1'])
# Any other kwargs given to ApplySelectionToDf are passed on to the
# pandas query() function.
ch.add_link(link)

# --- As an example, merge the reduced datasets back into a single, merged dataframe.
concat = analysis.DfConcatenator()
concat.readKeys = ['merged', 'reduced_data']
concat.storeKey = 'merged'
concat.ignore_missing_input = True  # in the first iteration input 'merged' is missing
ch.add_link(concat)

# --- this serves as the continue statement of the loop: go back to the start of the chain.
repeater = core_ops.RepeatChain()
# repeat until readdata says halt.
# --- read materials file
read_data = analysis.ReadToDf(
    name='reader',
    path=input_path,
    sep=num_separator,
    decimal=num_decimal,
    key='input_data',
    usecols=readcols,
    # parse_dates=['DATE_OF_BIRTH'],
    reader=reader_type)
ch.add(read_data)

# --- filter data
# NOTE(review): this link is created but never added to a chain in the visible
# code — confirm a ch.add(link) is not missing here.
link = analysis.ApplySelectionToDf(read_key=read_data.key, query_set=filter_query)

ch = Chain('Fix')

# --- percentile binning, done *before* nans get converted into floats below,
#     such that these do not affect the percentile bins
# pb = RooFitPercentileBinning()
# pb.read_key = read_data.key
# pb.var_number_of_bins = var_number_of_bins
# pb.binning_name = 'percentile'
# ch.add(pb)

# --- fix nans if they exist in a row (set to same dtype with convert_inconsistent_nans)
fixer = data_quality.FixPandasDataFrame(name='fixer')
fixer.read_key = read_data.key
# fixer.read_key = transform.store_key
fixer.store_key = 'fix_nan'