    'file:' + persistence.io_path('data', settings.io_conf(), 'dummy1.csv'),
    'file:' + persistence.io_path('data', settings.io_conf(), 'dummy2.csv')
]

# define store_key for all data files to be read in
STORE_KEYS = ['spark_df1', 'spark_df2']

##########################################################################
# Now set up the chains and links based on configuration flags

proc_mgr.add_chain('Read')

# create read link for each data file
for index, key in enumerate(STORE_KEYS):
    read_link = spark_analysis.SparkDfReader(name='Reader' + str(index + 1),
                                             store_key=key,
                                             read_methods=['csv'])

    # set CSV read arguments
    read_link.read_meth_args['csv'] = (file_paths[index],)
    read_link.read_meth_kwargs['csv'] = dict(sep='|', header=True, inferSchema=True)

    # add link to chain
    proc_mgr.get_chain('Read').add_link(read_link)

# create SQL-query link
sql_link = spark_analysis.SparkExecuteQuery(name='SparkSQL', store_key='spark_df_sql')
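# ------------------------------------------------------------------------
# Hedged sketch, for orientation only: roughly what the two CSV readers and
# the SQL link configured above amount to in plain PySpark.  The temporary
# views reuse the store keys, and the query string is a hypothetical example;
# neither is taken from the macro itself, and this block is not part of it.
#
#     from pyspark.sql import SparkSession
#
#     spark = SparkSession.builder.getOrCreate()
#
#     # read each pipe-separated CSV and register it under its store key
#     for path, key in zip(file_paths, STORE_KEYS):
#         df = spark.read.csv(path, sep='|', header=True, inferSchema=True)
#         df.createOrReplaceTempView(key)
#
#     # the SparkExecuteQuery link then runs an SQL statement on these views
#     result = spark.sql('SELECT * FROM spark_df1 UNION ALL SELECT * FROM spark_df2')
# ------------------------------------------------------------------------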
file_paths = ['file:' + persistence.io_path('data', settings.io_conf(), 'dummy.csv')]

separator = '|'
has_header = True
infer_schema = True
num_partitions = 4
columns = ['date', 'loc', 'x', 'y']

##########################################################################
# --- now set up the chains and links based on configuration flags

# create read link
read_link = spark_analysis.SparkDfReader(name='Reader',
                                         store_key='spark_df',
                                         read_methods=['csv'])

# set CSV read arguments
read_link.read_meth_args['csv'] = (file_paths,)
read_link.read_meth_kwargs['csv'] = dict(sep=separator,
                                         header=has_header,
                                         inferSchema=infer_schema)

if columns:
    # add select function
    read_link.read_methods.append('select')
    read_link.read_meth_args['select'] = tuple(columns)

if num_partitions:
    # add repartition function