def execute(self):
    """Execute DfSummary.

    Creates a report page for each variable in data frame.

    * create statistics object for column
    * create overview table of column variable
    * plot histogram of column variable
    * store plot

    Columns listed in ``self.columns`` that are not present in the input
    data are logged, removed from ``self.columns`` and skipped.

    :returns: execution status code
    :rtype: StatusCode
    :raises RuntimeError: if no input data is found under ``read_key``
    :raises TypeError: if ``pages_key`` refers to an object that is not a list
    """
    ds = ProcessManager().service(DataStore)

    # fetch and check input data frame
    data = ds.get(self.read_key, None)
    if data is None:
        self.log().critical(
            'No input data "%s" found in data store for %s',
            self.read_key, str(self))
        raise RuntimeError('no input data found for {}'.format(str(self)))
    self.assert_data_type(data)

    # create report page for histogram
    if self.pages_key:
        self.pages = ds.get(self.pages_key, [])
        if not isinstance(self.pages, list):
            raise TypeError(
                'pages key "{}" does not refer to a list'.format(
                    self.pages_key))

    # determine all possible columns, used for comparison below
    all_columns = self.get_all_columns(data)
    if not self.columns:
        self.columns = all_columns

    # iterate over a copy, because missing columns are removed from
    # self.columns inside the loop
    for name in self.columns[:]:
        # check if column is in data frame
        if name not in all_columns:
            self.log().warning('Key "%s" not in input data; skipping', name)
            # list.remove() takes the value to drop, not its position
            self.columns.remove(name)
            continue
        self.log().debug('Processing "%s"', name)
        sample = self.get_sample(data, name)
        self.process_sample(name, sample)

    # add nan histogram to summary if present
    if self.nan_counts:
        nan_hist = self.nan_counts, self.columns
        self.process_nan_histogram(nan_hist, self.get_length(data))

    # storage
    if self.pages_key:
        ds[self.pages_key] = self.pages

    return StatusCode.Success
def test_esk106(self):
    """Run the esk106 command-line options macro with chain 0 disabled.

    Checks that the run succeeds, that only 'Chain1' is present, and that
    its first link carries the greeting 'Universe'.
    """
    config = ProcessManager().service(ConfigObject)
    config['logLevel'] = definitions.LOG_LEVELS['DEBUG']
    macro_path = config['esRoot'] + '/tutorials/esk106_cmdline_options.py'
    config['macro'] = macro_path

    # fake a setting from the cmd-line. picked up in the macro
    config['do_chain0'] = False

    status = execution.run_eskapade(config)

    # request the services again after the run
    proc_mgr = ProcessManager()
    config = ProcessManager().service(ConfigObject)
    # the data store service is requested, but its contents are not checked
    ds = ProcessManager().service(DataStore)

    self.assertTrue(status.isSuccess())
    self.assertEqual(1, len(proc_mgr.chains))

    only_chain = proc_mgr.chains[0]
    self.assertEqual('Chain1', only_chain.name)
    self.assertEqual(False, config.get('do_chain0', True))
    self.assertEqual(True, config.get('do_chain1', True))
    self.assertEqual('Universe', only_chain.links[0].hello)
def execute(self):
    """Execute DsObjectDeleter.

    Removes objects from the data store in four successive steps:

    * keys listed in ``deletionKeys``
    * objects that are instances of a class in ``deletionClasses``
    * everything except the keys in ``keepOnly`` (if it is non-empty)
    * everything, if ``clearAll`` is set

    Nothing is deleted when the ``TESTING`` setting is truthy.

    :returns: execution status code
    :rtype: StatusCode
    """
    settings = ProcessManager().service(ConfigObject)
    ds = ProcessManager().service(DataStore)

    # used in code testing only
    if settings.get('TESTING'):
        self.log().warning(
            'Running in TESTING mode. NOT clearing datastore for testing purposes.'
        )
        return StatusCode.Success

    # delete specific items
    for key in self.deletionKeys:
        if key in ds:
            self.log().debug('Now deleting datastore object with key "%s"',
                             key)
            del ds[key]

    # delete specific class types
    for cls in self.deletionClasses:
        # iterate over a snapshot of the keys: deleting entries while
        # iterating over the store itself is an error for dict-like objects
        for key in list(ds.keys()):
            if isinstance(ds[key], cls):
                self.log().debug(
                    'Now deleting datastore object with key "%s"', key)
                del ds[key]

    # delete all but specific items
    if len(self.keepOnly):
        for key in list(ds.keys()):
            if key not in self.keepOnly:
                self.log().debug(
                    'Now deleting datastore object with key "%s"', key)
                del ds[key]

    # delete all items in datastore
    if self.clearAll:
        for key in list(ds.keys()):
            self.log().debug('Now deleting datastore object with key "%s"',
                             key)
            del ds[key]

    return StatusCode.Success
# when chunking through an input file, pick up only N lines in each iteration. chunksize = 5 ######################################################################################### # --- Set path of data data_path = persistence.io_path('data', settings.io_conf(), 'dummy.csv') ######################################################################################### # --- now set up the chains and links, based on configuration flags proc_mgr = ProcessManager() # --- example 1: readdata loops over the input files, but no file chunking. if settings.get('do_example1', True): ch = proc_mgr.add_chain('MyChain1') # --- a loop is set up in the chain MyChain. # we iterate over (chunks of) the next file in the list until the iterator is done. # then move on to the next chain (Overview) # --- readdata keeps on opening the next file in the file list. # all kwargs are passed on to pandas file reader. readdata = analysis.ReadToDf(name='dflooper1', key='test1', sep='|', reader='csv', usecols=['x', 'y']) readdata.path = [data_path] * 3 readdata.itr_over_files = True
def execute(self):
    """Execute CorrelationSummary.

    Fetches the data frame stored under ``read_key``, computes the
    correlation matrix selected by ``method``, saves it as an annotated
    heatmap (PDF) in ``results_path`` and, if ``write_key`` is set, stores
    the correlation matrix in the data store.

    :returns: execution status code
    :rtype: StatusCode
    :raises RuntimeError: if ``read_key`` does not refer to a Pandas data frame
    """
    ds = ProcessManager().service(DataStore)

    import matplotlib.pyplot as plt
    from matplotlib import colors

    # fetch and check input data frame; the type check comes first, so that
    # a missing key gives the logged RuntimeError instead of an
    # AttributeError on None
    df = ds.get(self.read_key, None)
    if not isinstance(df, pd.DataFrame):
        self.log().critical(
            'no Pandas data frame "%s" found in data store for %s',
            self.read_key, str(self))
        raise RuntimeError('no input data found for %s' % str(self))

    # drop all-nan columns right away
    df = df.dropna(how='all', axis=1)

    # compute correlations between all numerical variables
    self.log().debug('Computing "%s" correlations of dataframe "%s"',
                     self.method, self.read_key)

    # mutual info, from sklearn
    if self.method == 'mutual_information':
        # numerical columns only
        cols = df.select_dtypes(include=[np.number]).columns

        # initialize correlation matrix
        n = len(cols)
        cors = np.zeros((n, n))
        for i, c in enumerate(cols):
            # compare each column to all of the columns
            cors[i, :] = mutual_info_regression(df[cols], df[c])

        cors = pd.DataFrame(cors, columns=cols, index=cols)

    elif self.method == 'correlation_ratio':
        # numerical columns only
        cols = df.select_dtypes(include=[np.number]).columns

        # choose bins for each column
        bins = {c: len(np.histogram(df[c])[1]) for c in cols}

        # sort rows into bins
        for c in cols:
            df[str(c) + '_bin'] = pd.cut(df[c], bins[c])

        # overall mean of the numerical columns only; the frame now also
        # holds the categorical '_bin' columns, which have no mean
        overall_mean = df[cols].mean()

        # initialize correlation matrix
        n = len(cols)
        cors = np.zeros((n, n))
        for i, x in enumerate(cols):
            xbin = str(x) + '_bin'

            # definition from Wikipedia "correlation ratio"
            y_given_x = (df.groupby(xbin))[cols]
            weighted_var_y_bar = (
                y_given_x.count() *
                (y_given_x.mean() - overall_mean)**2).sum()
            weighted_var_y = df[cols].count() * df[cols].var()
            cors[i, :] = weighted_var_y_bar / weighted_var_y

        cors = pd.DataFrame(cors, columns=cols, index=cols)

    else:
        cors = df.corr(method=self.method)
        cols = list(cors.columns)

    # set up heatmap of convenient size
    plot_size = max(len(cols) / 1.8, 2)
    fig, ax = plt.subplots(figsize=(1.5 * plot_size, plot_size))
    vmin = -1 if self.method in LINEAR_CORRS else 0
    vmax = 1
    cmap = 'RdYlGn' if self.method in LINEAR_CORRS else 'YlGn'
    norm = colors.Normalize(vmin=vmin, vmax=vmax)
    img = ax.pcolormesh(cors, cmap=cmap, edgecolor='w', linewidth=1,
                        norm=norm)

    # make plot look pretty
    ax.set_title('{0:s} correlations'.format(self.method.capitalize()))
    ax.set_yticks(np.arange(len(cols)) + 0.5)
    ax.set_xticks(np.arange(len(cols)) + 0.5)
    ax.set_yticklabels(cols, rotation='horizontal')
    ax.set_xticklabels(cols, rotation='vertical')
    fig.colorbar(img)

    # annotate with correlation values
    for i in range(len(cols)):
        for j in range(len(cols)):
            point = float(cors[cols[i]][j])
            label = 'NaN' if np.isnan(point) else '{0:.2f}'.format(point)
            # white text on the cells near the ends of the colour scale
            # and on NaN cells, black text elsewhere
            white_cond = (point < 0.7 * vmin) or (
                point >= 0.7 * vmax) or np.isnan(point)
            color = 'w' if white_cond else 'k'
            ax.annotate(label, xy=(i + 0.5, j + 0.5), color=color,
                        horizontalalignment='center',
                        verticalalignment='center')

    # save plots in file
    fname = '_'.join(
        ['correlations', self.read_key.replace(' ', ''),
         self.method]) + '.pdf'
    fpath = os.path.join(self.results_path, fname)
    self.log().debug('Saving correlation heatmap as {}'.format(fpath))
    fig.savefig(fpath, bbox_inches='tight')

    # save correlations to datastore if requested
    if self.write_key:
        ds[self.write_key] = cors

    return StatusCode.Success
def execute(self):
    """Execute CorrelationSummary.

    Fetches the data frame stored under ``read_key`` and, for every entry
    in ``methods``, computes the correlation matrix, saves it as a heatmap
    (PDF) in ``results_path`` and appends a report page with the plot and a
    small statistics table to ``self.pages``. The list of correlation
    matrices is stored under ``store_key`` and the pages under
    ``pages_key``, when those keys are set.

    :returns: execution status code
    :rtype: StatusCode
    :raises RuntimeError: if ``read_key`` does not refer to a Pandas data frame
    :raises AssertionError: if the data frame is empty, or if ``pages_key``
        refers to an object that is not a list
    """
    ds = ProcessManager().service(DataStore)

    import matplotlib.pyplot as plt
    from matplotlib import colors

    # fetch and check input data frame; the type check comes first, so that
    # a missing key gives the logged RuntimeError instead of an
    # AttributeError on None
    df = ds.get(self.read_key, None)
    if not isinstance(df, pd.DataFrame):
        self.log().critical(
            'no Pandas data frame "%s" found in data store for %s',
            self.read_key, str(self))
        raise RuntimeError('no input data found for %s' % str(self))

    # drop all-nan columns right away
    df = df.dropna(how='all', axis=1)

    n_df = len(df.index)
    assert n_df, 'Pandas data frame "%s" frame has zero length' % self.read_key

    # create report pages
    if self.pages_key:
        self.pages = ds.get(self.pages_key, [])
        assert isinstance(
            self.pages,
            list), 'Pages key %s does not refer to a list' % self.pages_key

    # below, create report pages
    # for each correlation create resulting heatmap
    cors_list = []

    for method in self.methods:
        # compute correlations between all numerical variables
        self.log().debug('Computing "%s" correlations of dataframe "%s"',
                         method, self.read_key)

        # mutual info, from sklearn
        if method == 'mutual_information':
            # numerical columns only
            cols = df.select_dtypes(include=[np.number]).columns

            # initialize correlation matrix
            n = len(cols)
            cors = np.zeros((n, n))
            for i, c in enumerate(cols):
                # compare each column to all of the columns
                cors[i, :] = mutual_info_regression(df[cols], df[c])

            cors = pd.DataFrame(cors, columns=cols, index=cols)

        elif method == 'correlation_ratio':
            # numerical columns only
            cols = df.select_dtypes(include=[np.number]).columns

            # choose bins for each column
            bins = {c: len(np.histogram(df[c])[1]) for c in cols}

            # sort rows into bins
            for c in cols:
                df[str(c) + '_bin'] = pd.cut(df[c], bins[c])

            # overall mean of the numerical columns only; the frame now
            # also holds the categorical '_bin' columns, which have no mean
            overall_mean = df[cols].mean()

            # initialize correlation matrix
            n = len(cols)
            cors = np.zeros((n, n))
            for i, x in enumerate(cols):
                # definition from Wikipedia "correlation ratio"
                xbin = str(x) + '_bin'
                y_given_x = (df.groupby(xbin))[cols]
                weighted_var_y_bar = (
                    y_given_x.count() *
                    (y_given_x.mean() - overall_mean)**2).sum()
                weighted_var_y = df[cols].count() * df[cols].var()
                cors[i, :] = weighted_var_y_bar / weighted_var_y

            cors = pd.DataFrame(cors, columns=cols, index=cols)

        else:
            cors = df.corr(method=method)
            cols = list(cors.columns)

        # replace column names with indices, as with numpy matrix, for plotting function below
        n = len(cols)
        cors.columns = range(n)

        # keep for potential later usage
        cors_list.append(cors)

        # plot settings
        title = '{0:s} correlation matrix'.format(method.capitalize())
        vmin = -1 if method in LINEAR_CORRS else 0
        vmax = 1
        color_map = 'RdYlGn' if method in LINEAR_CORRS else 'YlGn'
        fname = '_'.join(
            ['correlations', self.read_key.replace(' ', ''),
             method]) + '.pdf'
        fpath = os.path.join(self.results_path, fname)

        # create nice looking plot
        self.log().debug('Saving correlation heatmap as {}'.format(fpath))
        visualization.vis_utils.plot_correlation_matrix(
            cors, cols, cols, fpath, title, vmin, vmax, color_map)

        # statistics table for report page
        # compare the method name by value; identity comparison against a
        # string literal is not reliable
        n_unique = (n * n - n) / 2 if method != 'correlation_ratio' else n * n
        stats = [('entries', n_df),
                 ('bins', n * n),
                 ('unique', n_unique),
                 ('> 0', (cors.values.ravel() > 0).sum()),
                 ('< 0', (cors.values.ravel() < 0).sum()),
                 ('avg', np.average(cors.values.ravel())),
                 ('max', max(cors.values.ravel())),
                 ('min', min(cors.values.ravel()))] if n > 0 else []
        stats_table = tabulate.tabulate(stats, tablefmt='latex')

        # add plot and table as page to report
        self.pages.append(
            self.page_template.replace('VAR_LABEL', title).replace(
                'VAR_STATS_TABLE', stats_table).replace(
                    'VAR_HISTOGRAM_PATH', fpath))

    # save correlations to datastore if requested
    if self.store_key:
        ds[self.store_key] = cors_list
    if self.pages_key:
        ds[self.pages_key] = self.pages

    return StatusCode.Success
The two flags below control whether chains are turned on or off. (default=on) from the cmd line, control these with: -c do_chain0=False -c do_chain1=False Try it; No Hello Worlds will be printed. """ log.info(msg) ######################################################################################### # --- now set up the chains and links based on configuration flags proc_mgr = ProcessManager() if settings.get('do_chain0', True): ch = proc_mgr.add_chain('Chain0') link = core_ops.HelloWorld(name='hello0') link.hello = 'Town' ch.add_link(link) if settings.get('do_chain1', True): ch = proc_mgr.add_chain('Chain1') link = core_ops.HelloWorld(name='hello1') link.hello = 'Universe' ch.add_link(link) ######################################################################################### log.debug('Done parsing configuration file esk106_cmdline_options')