#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk406_simulation_based_on_unbinned_data'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

settings['read_data'] = True
settings['generate'] = True
settings['make_plot'] = True
settings['high_num_dims'] = False

input_files = [resources.fixture('correlated_data.sv.gz')]

#########################################################################################
# --- now set up the chains and links based on configuration flags

if settings['read_data']:
    ch = Chain('Data')

    # --- 0. read the input dataset
    read_data = analysis.ReadToDf(name='reader', key='correlated_data', reader='csv', sep=' ')
    read_data.path = input_files
    ch.add(read_data)
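
# --- Illustrative sketch (not part of the macro above): ReadToDf with reader='csv'
#     and sep=' ' is assumed here to behave like a plain pandas.read_csv call, so the
#     equivalent stand-alone read of the gzipped, space-separated fixture would look
#     roughly like this. The path below is a placeholder for whatever
#     resources.fixture() resolves to.
import pandas as pd

def read_correlated_data(path):
    """Read a gzip-compressed, space-separated file into a pandas DataFrame."""
    # compression is inferred from the .gz suffix by pandas
    return pd.read_csv(path, sep=' ')

# example usage:
# df = read_correlated_data('correlated_data.sv.gz')
# print(df.head())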
logger.debug('Now parsing configuration file esk405_simulation_based_on_binned_data')

#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk405_simulation_based_on_binned_data'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

settings['high_num_dims'] = False

input_files = [resources.fixture('mock_accounts.csv.gz')]

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch = Chain('Data')

# --- 0. read input data
read_data = analysis.ReadToDf(name='dflooper', key='accounts', reader='csv')
read_data.path = input_files
ch.add(read_data)

# --- 1. add the record factorizer
#     Here the columns dummy and loc of the input dataset are factorized,
#     e.g. x = ['apple', 'tree', 'pear', 'apple', 'pear'] becomes the column:
#     x = [0, 1, 2, 0, 2]
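
# --- Illustrative sketch (not part of the macro above): what the factorization step
#     does to a single column, shown with plain pandas. The record factorizer link is
#     assumed to apply this per configured column and to keep the label mapping so the
#     original values can be restored later.
import pandas as pd

x = pd.Series(['apple', 'tree', 'pear', 'apple', 'pear'])
codes, labels = pd.factorize(x)
print(list(codes))   # [0, 1, 2, 0, 2]
print(list(labels))  # ['apple', 'tree', 'pear']

# restore the original column from the integer codes and the label mapping
restored = pd.Series(labels[codes])
assert restored.equals(x)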
settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk211_fork_read_data_itr'
settings['version'] = 0

# no need to set this normally, but it illustrates how to throttle the number of
# concurrent processes. the default is the number of available cpu cores.
process_manager.num_cpu = 4

#########################################################################################
# when chunking through an input file, pick up only N lines in each iteration.

chunk_size = 5

#########################################################################################
# --- Set path of data

data_path = resources.fixture('dummy.csv')

#########################################################################################
# --- now set up the chains and links, based on configuration flags

# --- example 2: readdata loops over the input files, with file chunking.
if settings.get('do_example2', True):
    ch = Chain('MyChain2')
    ch.n_fork = 10

    # --- a loop is set up in the chain MyChain2.
    #     we iterate over (chunks of) the next file in the list until the iterator is done,
    #     then move on to the next chain (Overview).

    # --- readdata keeps on opening the next chunk_size lines of the open or next file
    #     in the file list.
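
# --- Illustrative sketch (not part of the macro above): the chunked looping performed
#     by the readdata link can be mimicked with plain pandas by passing chunksize to
#     read_csv, which then returns an iterator of DataFrames instead of a single one.
#     The file name is a placeholder.
import pandas as pd

def iterate_in_chunks(path, chunk_size=5):
    """Yield successive chunks of at most chunk_size rows from a csv file."""
    for chunk in pd.read_csv(path, chunksize=chunk_size):
        yield chunk

# example usage:
# for i, df in enumerate(iterate_in_chunks('dummy.csv')):
#     print('chunk', i, 'has', len(df), 'rows')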
settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk604_spark_execute_query'
settings['version'] = 0

##########################################################################
# Start Spark session

spark = process_manager.service(SparkManager).create_session(eskapade_settings=settings)

##########################################################################
# CSV and dataframe settings

# NB: local file may not be accessible to worker node in cluster mode
file_paths = ['file:' + resources.fixture('dummy1.csv'),
              'file:' + resources.fixture('dummy2.csv')]

# define store_key for all data files to be read in
STORE_KEYS = ['spark_df1', 'spark_df2']

##########################################################################
# Now set up the chains and links based on configuration flags

read = Chain('Read')

# create read link for each data file
for index, key in enumerate(STORE_KEYS):
    read_link = spark_analysis.SparkDfReader(name='Reader' + str(index + 1),
                                             store_key=key,
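
# --- Illustrative sketch (not part of the macro above): judging by its name, the rest
#     of this macro runs an SQL query over the two dataframes read in above. A plain
#     PySpark equivalent of that idea is shown below; the session, file options and
#     query are placeholders, not the macro's actual configuration.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('esk604_sketch').getOrCreate()

df1 = spark.read.csv('dummy1.csv', sep='|', header=True, inferSchema=True)
df2 = spark.read.csv('dummy2.csv', sep='|', header=True, inferSchema=True)

# register the dataframes under the same names as the store keys above
df1.createOrReplaceTempView('spark_df1')
df2.createOrReplaceTempView('spark_df2')

# an illustrative query combining the two registered views
result = spark.sql('SELECT * FROM spark_df1 UNION ALL SELECT * FROM spark_df2')
result.show()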
logger = Logger()
logger.debug('Now parsing configuration file esk305_correlation_summary.')

#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk305_correlation_summary'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

settings['input_path'] = resources.fixture('correlated_data.sv.gz')
settings['reader'] = 'csv'
settings['separator'] = ' '
settings['correlations'] = ['pearson', 'kendall', 'spearman', 'correlation_ratio']

#########################################################################################
# --- now set up the chains and links based on configuration flags

# create chains
data = Chain('Data')

# load data
reader = analysis.ReadToDf(name='reader',
                           path=settings['input_path'],
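
# --- Illustrative sketch (not part of the macro above): the first three correlation
#     types listed in the settings can be computed directly with pandas; the
#     correlation ratio (a categorical-versus-numeric measure) has no pandas built-in
#     and is assumed to be handled by the summary link itself. The DataFrame below is
#     a random stand-in for the loaded data.
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
df = pd.DataFrame({'x1': rng.normal(size=200)})
df['x2'] = 0.7 * df['x1'] + 0.3 * rng.normal(size=200)

for method in ('pearson', 'kendall', 'spearman'):
    print(method)
    print(df.corr(method=method))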
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk602_read_csv_to_spark_df'
settings['version'] = 0

##########################################################################
# --- start Spark session

spark = process_manager.service(SparkManager).create_session(eskapade_settings=settings)

##########################################################################
# --- CSV and data-frame settings

# NB: local file may not be accessible to worker node in cluster mode
file_paths = ['file:' + resources.fixture('dummy1.csv'),
              'file:' + resources.fixture('dummy2.csv')]
separator = '|'
has_header = True
infer_schema = True
num_partitions = 5
columns = ['date', 'loc', 'x', 'y']

##########################################################################
# --- now set up the chains and links based on configuration flags

# create read link
read_link = spark_analysis.SparkDfReader(name='Reader',
                                         store_key='spark_df',
                                         read_methods=['csv'])
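
# --- Illustrative sketch (not part of the macro above): the csv settings defined above
#     map onto plain PySpark reader options roughly as shown below. The exact keyword
#     names used by the SparkDfReader link may differ, so treat this as an assumed
#     equivalent with placeholder file paths.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('esk602_sketch').getOrCreate()

df = (spark.read
      .option('sep', '|')           # separator = '|'
      .option('header', True)       # has_header = True
      .option('inferSchema', True)  # infer_schema = True
      .csv(['dummy1.csv', 'dummy2.csv'])
      .select('date', 'loc', 'x', 'y')  # columns
      .repartition(5))                  # num_partitions
print(df.rdd.getNumPartitions())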
settings['analysisName'] = 'esk210_dataframe_restoration'
settings['version'] = 0

ds = process_manager.service(DataStore)

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

# Two writers that are able to restore dataframes have been included in eskapade.
# To turn one of them off, set the corresponding flag below to False.
settings['do_numpy'] = True
settings['do_feather'] = True

#########################################################################################
# --- Set path of data

# messy_dtypes is a small file with some complex data types that are
# not guaranteed to be read back properly when using csv data
data_path = resources.fixture('messy_dtypes.csv')

# The actual fundamental data types are:
dtypes = [
    'str', 'int64', 'float32', 'float64', 'S32',
    'str', 'bool', 'str', 'uint64', 'str'
]

# Inferred from csv | True dtype
# ------------------------------
# 'object'          | 'str'
# 'int64'           | 'int64'
# 'float64'         | 'float32'
# 'float64'         | 'float64'
# 'object'          | 'S32'
# 'int64'           | 'str'
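
# --- Illustrative sketch (not part of the macro above): a minimal demonstration of why
#     a csv round-trip loses dtype information while a feather round-trip keeps it.
#     Needs pyarrow for the feather calls; the column names and file name are
#     placeholders, not the contents of messy_dtypes.csv.
import io

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'code': ['001', '002', '010'],                        # str values that look numeric
    'value': np.array([1.5, 2.5, 3.5], dtype=np.float32),
    'flag': [True, False, True],
})
print(df.dtypes)

# csv round-trip: 'code' is inferred as int64, 'value' comes back as float64
roundtrip = pd.read_csv(io.StringIO(df.to_csv(index=False)))
print(roundtrip.dtypes)

# feather round-trip: the original dtypes survive
df.to_feather('dtypes_demo.feather')
print(pd.read_feather('dtypes_demo.feather').dtypes)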