'file:' + persistence.io_path('data', settings.io_conf(), 'dummy1.csv'),
    'file:' + persistence.io_path('data', settings.io_conf(), 'dummy2.csv')
]

# one store key per input file; the reader loop below pairs each key with
# the file path found at the same index
STORE_KEYS = [
    'spark_df1',
    'spark_df2',
]

##########################################################################
# Now set up the chains and links based on configuration flags

# the 'Read' chain receives one reader link per entry in STORE_KEYS
proc_mgr.add_chain('Read')

for file_number, store_key in enumerate(STORE_KEYS, start=1):
    # reader links are named 'Reader1', 'Reader2', ... in STORE_KEYS order
    read_link = spark_analysis.SparkDfReader(
        name='Reader' + str(file_number),
        store_key=store_key,
        read_methods=['csv'],
    )

    # arguments for the 'csv' read method: the file path at the matching
    # position in file_paths as the only positional argument, plus the
    # CSV options as keyword arguments (a fresh dict for every link)
    read_link.read_meth_args['csv'] = (file_paths[file_number - 1],)
    read_link.read_meth_kwargs['csv'] = {
        'sep': '|',
        'header': True,
        'inferSchema': True,
    }

    # register the configured reader with the 'Read' chain
    proc_mgr.get_chain('Read').add_link(read_link)

# SQL-query link, configured with the store key 'spark_df_sql'
# (presumably the key under which the query result is stored -- confirm
# against the SparkExecuteQuery documentation)
sql_link = spark_analysis.SparkExecuteQuery(
    name='SparkSQL',
    store_key='spark_df_sql',
)
Esempio n. 2
0
# path of the 'dummy.csv' input as returned by persistence.io_path for the
# 'data' I/O type under the current I/O configuration
dummy_csv_path = persistence.io_path('data', settings.io_conf(), 'dummy.csv')

# input locations handed to the reader, each prefixed with the 'file:' scheme
file_paths = ['file:' + dummy_csv_path]
# options forwarded to the 'csv' read method as sep / header / inferSchema
separator = '|'
has_header = True
infer_schema = True

# columns passed to the optional 'select' step; an empty list skips it
columns = ['date', 'loc', 'x', 'y']

# checked further down to decide whether a repartition step is added
num_partitions = 4

##########################################################################
# --- now set up the chains and links based on configuration flags

# reader link configured with store key 'spark_df' and the 'csv' read method
read_link = spark_analysis.SparkDfReader(
    name='Reader',
    store_key='spark_df',
    read_methods=['csv'],
)

# arguments for the 'csv' read method: the whole list of input paths is the
# single positional argument; the parsing options go in as keyword arguments
read_link.read_meth_args['csv'] = (file_paths,)
read_link.read_meth_kwargs['csv'] = {
    'sep': separator,
    'header': has_header,
    'inferSchema': infer_schema,
}

if columns:
    # optional 'select' step: the column names become the positional
    # arguments of the 'select' read method, which is appended after 'csv'
    select_args = tuple(columns)
    read_link.read_meth_args['select'] = select_args
    read_link.read_methods.append('select')

if num_partitions:
    # add repartition function