Beispiel #1
0
# --- now set up the chains and links based on configuration flags

# --- readdata with default settings reads all three input files simultaneously.
#     all extra keyword arguments are passed on to the pandas reader.
if settings['do_readdata']:
    # chain that will hold the reader link
    read = Chain('ReadData')
    # --- the reader link is given the same input path three times and moves on
    #     to the next entry of that list each time it opens a file.
    #     arguments it does not consume itself (here: sep) are handed to the
    #     pandas file reader.
    read_data = analysis.ReadToDf(
        name='reader',
        key='test',
        sep='|',
        reader='csv',
        path=[data_path, data_path, data_path],
    )
    read.add(read_data)

if settings['do_writedata']:
    # chain that will hold the writer link
    write = Chain('WriteData')
    # --- the writer link needs an output format (the 'writer' argument);
    #     when it is omitted, the format is derived from the filename extension.
    #     'key' names the datastore entry to write, 'path' is the output filename.
    #     any further kwargs are handed to the pandas file writer.
    write_data = analysis.WriteFromDf(
        name='writer',
        key='test',
        path='tmp3.csv',
        writer='csv',
    )
    write.add(write_data)

#########################################################################################

# emit a debug-level marker once this macro's configuration has been set up
logger.debug('Done parsing configuration file esk202_writedata')
# Build the link named 'fixer2', configure it through attributes, then append it to chain `ch`.
fixer = data_quality.FixPandasDataFrame(name='fixer2')
# input key and output key of this link (presumably datastore keys -- confirm)
fixer.read_key = 'vrh'
fixer.store_key = 'vrh_fix2'
# requested type per entry: 'B' -> int, 'C' -> str
# (presumably 'B' and 'C' are dataframe column names -- confirm)
fixer.var_dtype = {'B': int, 'C': str}
ch.add(fixer)

# --- 3. convert all nans to data type consistent with rest of column
# Build the link named 'fixer3', configure it through attributes, then append it to chain `ch`.
fixer = data_quality.FixPandasDataFrame(name='fixer3')
# input key and output key of this link (presumably datastore keys -- confirm)
fixer.read_key = 'vrh'
fixer.store_key = 'vrh_fix3'
# switch on the nan conversion described in the step comment above
fixer.convert_inconsistent_nans = True
# set a specific nan value ('GREPME') for one given column ('G')
fixer.var_nan = {'G': 'GREPME'}
ch.add(fixer)

# --- 4. compare results
# Build the link named 'pds2' and give it the list of keys to act on, then append it to `ch`.
# NOTE(review): presumably these are the datastore entries printed for comparison -- confirm.
pds = core_ops.PrintDs(name='pds2')
pds.keys = ['vrh', 'vrh_fix1', 'vrh_fix2', 'vrh_fix3']
ch.add(pds)

# --- 5. write out fixed dataframe - turned off in this example
# The writer link is constructed but not attached to the chain;
# uncomment the ch.add() line below to enable the csv output.
writedata = analysis.WriteFromDf(
    name='writer',
    key='vrh_fix1',
    path='tmp.csv',
    writer='csv',
)
# ch.add(writedata)

#########################################################################################

# emit a debug-level marker once this macro's configuration has been set up
logger.debug('Done parsing configuration file esk501_fix_pandas_dataframe')
# NOTE(review): no new link is constructed before these lines, so `fixer` is still the
# 'fixer3' object created and already added to `ch` above. As written, this overwrites
# that object's store_key, sets var_dtype on it, and adds the same object to `ch` again.
# It looks like the leading lines of a 'fixer2' setup (construction and read_key) are
# missing here -- confirm against the original macro.
fixer.store_key = 'vrh_fix2'
# requested type per entry: 'B' -> int, 'C' -> str
# (presumably 'B' and 'C' are dataframe column names -- confirm)
fixer.var_dtype = {'B': int, 'C': str}
ch.add(fixer)

# --- 3. convert all nans to data type consistent with rest of column
# Build the link named 'fixer3', configure it through attributes, then append it to chain `ch`.
fixer = data_quality.FixPandasDataFrame(name='fixer3')
# input key and output key of this link (presumably datastore keys -- confirm)
fixer.read_key = 'vrh'
fixer.store_key = 'vrh_fix3'
# switch on the nan conversion described in the step comment above
fixer.convert_inconsistent_nans = True
# set a specific nan value ('GREPME') for one given column ('G')
fixer.var_nan = {'G': 'GREPME'}
ch.add(fixer)

# --- 4. compare results
# Build the link named 'pds2' and give it the list of keys to act on, then append it to `ch`.
# NOTE(review): presumably these are the datastore entries printed for comparison -- confirm.
pds = core_ops.PrintDs(name='pds2')
pds.keys = ['vrh', 'vrh_fix1', 'vrh_fix2', 'vrh_fix3']
ch.add(pds)

# --- 5. write out fixed dataframe - turned off in this example
# The dataframe will be saved with the numpy writer which will
# restore the dtypes when reloading the dataframe
# No 'writer' argument is passed; only the '.npz' output path is given.
# The link is constructed but not attached to the chain; uncomment ch.add() to enable it.
writedata = analysis.WriteFromDf(name='writer', key='vrh_fix1', path='tmp.npz')
# ch.add(writedata)

#########################################################################################

# emit a debug-level marker once this macro's configuration has been set up
logger.debug('Done parsing configuration file esk501_fix_pandas_dataframe')