class DaskCollect(OpVertex):
    """ Calls dask.compute on a collection of dask delayed objects """
    dask_progress_bar = BaseParameter(True, "Include a diagnostic Progressbar from dask")
    num_workers = BaseParameter(4, "Number of dask workers")
    scheduler = BaseParameter('processes', "Dask scheduler option")

    def _node_color(self):
        return '#2fbc2d'

    def run(self, *ops):
        # Each input is an (op, delayed) pair, e.g. as produced by DaskParallel.
        self.ops = list()
        self.delays = list()
        for op_obj, delayed_obj in ops:
            self.ops.append(op_obj)
            self.delays.append(delayed_obj)

        compute_kwargs = dict(scheduler=self.scheduler,
                              num_workers=self.num_workers)
        # Optionally surround the compute with dask's diagnostic progress bar
        if self.dask_progress_bar:
            with ProgressBar():
                return dask.compute(*self.delays, **compute_kwargs)
        return dask.compute(*self.delays, **compute_kwargs)
class DaskParallel(OpVertex):
    """ Wraps an Op in a dask.delayed object """
    parallel_op = BaseParameter()
    op_kwargs = BaseParameter(dict())

    def _node_color(self):
        return '#2fbc2d'

    def _node_shape(self):
        # doubleoctagon marks parallelized ops in the rendered graph
        return 'doubleoctagon'

    def requires(self):
        # Accept either an Op class (instantiate with op_kwargs) or an
        # already-constructed Op instance.
        if inspect.isclass(self.parallel_op):
            self.inst_op = self.parallel_op(**self.op_kwargs)
        else:
            self.inst_op = self.parallel_op
        return self.inst_op.requires()

    def run(self, *args, **kwargs):
        # Defer execution: return the wrapped op and a dask.delayed call of
        # its run method, to be computed later (see DaskCollect).
        delayed_run = dask.delayed(self.inst_op.run)
        return (self.inst_op, delayed_run(*args, **kwargs))

    def get_name(self):
        return self.parallel_op.get_name()
class DropDuplicates(DFOp):
    """Remove duplicate rows from the input frame via DataFrame.drop_duplicates.

    Parameters mirror the pandas call: `subset` (columns to consider),
    `keep` ('first'/'last'/False), `inplace`.
    """
    subset = BaseParameter(None)
    keep = BaseParameter('first')
    inplace = BaseParameter(False)

    def run(self, df):
        # Bug fix: drop_duplicates(inplace=True) returns None, so this op
        # previously emitted None downstream when inplace was enabled.
        # Return the (mutated) frame itself in that case so downstream ops
        # always receive a DataFrame.
        result = df.drop_duplicates(subset=self.subset, keep=self.keep,
                                    inplace=self.inplace)
        return df if self.inplace else result
class RenameColumns(DFOp):
    """Rename columns of the input frame via DataFrame.rename.

    `columns` is the mapping/callable passed to pandas; `copy`, `inplace`
    and `level` are forwarded unchanged.
    """
    columns = BaseParameter(None)
    copy = BaseParameter(True)
    inplace = BaseParameter(False)
    level = BaseParameter(None)

    def run(self, df):
        # Bug fix: rename(inplace=True) returns None, so this op previously
        # emitted None downstream when inplace was enabled. Return the
        # (mutated) frame itself in that case.
        result = df.rename(columns=self.columns, copy=self.copy,
                           inplace=self.inplace, level=self.level)
        return df if self.inplace else result
class DropNa(DFOp):
    """Remove missing values with DataFrame.dropna (never in place)."""
    axis = BaseParameter(0)
    how = BaseParameter('any')
    thresh = BaseParameter(None)
    subset = BaseParameter(None)

    def run(self, df):
        dropna_kwargs = dict(axis=self.axis, how=self.how,
                             thresh=self.thresh, subset=self.subset,
                             inplace=False)
        return df.dropna(**dropna_kwargs)
class AssignColumn(DFOp):
    """Assign a single column (column/value) or several columns at once
    (assignments dict) on the input frame.

    NOTE: assignments are made on the incoming frame itself (in place),
    and that same frame is returned.
    """
    column = BaseParameter(None)
    value = BaseParameter(None)
    assignments = BaseParameter(None)

    def run(self, df):
        # A provided assignments mapping takes precedence over column/value.
        if self.assignments is None:
            df[self.column] = self.value
        else:
            for col_name, col_value in self.assignments.items():
                df[col_name] = col_value
        return df
class Read_CSV(OpVertex):
    """Load a CSV into a DataFrame via pandas.read_csv."""
    filepath_or_buffer = BaseParameter()
    sep = BaseParameter(',')
    delimiter = BaseParameter(None)
    header = BaseParameter('infer')

    def run(self, path=None):
        # An upstream op may supply the path at runtime; otherwise fall back
        # to the configured parameter.
        source = self.filepath_or_buffer if path is None else path
        return pd.read_csv(filepath_or_buffer=source, sep=self.sep,
                           delimiter=self.delimiter, header=self.header)
class HistMulti(OpVertex):
    """Overlay histograms of several named samples on a single axes."""
    title = BaseParameter('')
    figsize = BaseParameter((8, 5))
    xlabel = BaseParameter('x')
    ylabel = BaseParameter('y')
    logx = BaseParameter(False)
    logy = BaseParameter(False)
    passthrough = BaseParameter(False, "If False, returns axis of plot rather ")

    def hist_compare(self, **hist_data):
        """Plot one translucent histogram per keyword argument; the keyword
        name becomes the legend label."""
        fig, ax = plt.subplots(figsize=self.figsize)
        shared_hist_kwargs = dict(bins=20, alpha=.4, density=False)
        for sample_name, sample in hist_data.items():
            ax.hist(sample, label=sample_name, **shared_hist_kwargs)

        ax.legend(fontsize=15)
        ax.set_title(self.title, fontsize=18)
        ax.set_xlabel(self.xlabel, fontsize=16)
        ax.set_ylabel(self.ylabel, fontsize=16)
        ax.tick_params(labelsize=15)

        if self.logy:
            ax.set_yscale('log')
        if self.logx:
            ax.set_xscale('log')

        # Either forward the raw data or hand back the axes object.
        return hist_data if self.passthrough else ax
class Subplots(OpVertex):
    """Create a matplotlib figure with an nrows x ncols grid of axes and
    return the axes as a flat, row-major list."""
    nrows = BaseParameter(1)
    ncols = BaseParameter(1)
    sharex = BaseParameter(False)
    sharey = BaseParameter(False)
    subplot_kw = BaseParameter(None)
    gridspec_kw = BaseParameter(None)
    figsize = BaseParameter((12, 6))
    _never_cache = True

    def run(self):
        from matplotlib import pyplot as plt
        # squeeze=False guarantees a 2-D array of axes regardless of shape,
        # so the row-major flattening below is always valid.
        self.fig, self.axs = plt.subplots(nrows=self.nrows, ncols=self.ncols,
                                          sharex=self.sharex,
                                          sharey=self.sharey,
                                          figsize=self.figsize,
                                          subplot_kw=self.subplot_kw,
                                          gridspec_kw=self.gridspec_kw,
                                          squeeze=False)
        flat_axes = []
        for row in range(self.nrows):
            for col in range(self.ncols):
                flat_axes.append(self.axs[row][col])
        self.axs = flat_axes
        return self.axs
class Join(DFOp):
    """Fold DataFrame.join left-to-right across all input frames."""
    how = BaseParameter('left')
    lsuffix = BaseParameter('')
    rsuffix = BaseParameter('')
    sort = BaseParameter(False)

    def requires(self):
        raise NotImplementedError()

    def run(self, *args):
        assert all(isinstance(o, pd.DataFrame) for o in args)
        join_kwargs = dict(how=self.how, lsuffix=self.lsuffix,
                           rsuffix=self.rsuffix, sort=self.sort)
        result = args[0]
        for frame in args[1:]:
            result = result.join(frame, **join_kwargs)
        return result
class LambdaOp(OpVertex):
    """Apply an arbitrary callable to this op's inputs."""
    f = BaseParameter(help_msg="Function that is applied to the input")

    def _node_color(self):
        return '#d65768'

    def requires(self):
        # A LambdaOp has no dependencies of its own; it must be bound to
        # upstream ops before use.
        raise NotImplementedError("Incomplete LambdaOp - must be applied")

    def run(self, *args, **kwargs):
        func = self.f
        return func(*args, **kwargs)
class FrameBrowse(DFOp, FrameBrowseMaixin):
    """Render a browse widget for the input frame, optionally passing the
    frame through unchanged."""
    passthrough = BaseParameter(True)

    def op_nb_viz(self, op_out, viz_out=None):
        # NOTE(review): this delegates with op_out in the mixin method's
        # first-argument (self) slot — presumably the mixin method is
        # effectively static; confirm against FrameBrowseMaixin.
        return FrameBrowseMaixin.op_nb_viz(op_out, viz_out)

    def run(self, df):
        display(self.op_nb_viz(df))
        if self.passthrough:
            return df
class Concat(DFOp):
    """Concatenate all input frames with pandas.concat."""
    axis = BaseParameter(0)
    join = BaseParameter('outer')
    ignore_index = BaseParameter(False)
    keys = BaseParameter(None)
    levels = BaseParameter(None)
    verify_integrity = BaseParameter(False)
    sort = BaseParameter(None)
    copy = BaseParameter(True)

    def requires(self):
        raise NotImplementedError()

    def run(self, *args):
        concat_kwargs = dict(axis=self.axis, join=self.join,
                             ignore_index=self.ignore_index,
                             keys=self.keys, levels=self.levels,
                             verify_integrity=self.verify_integrity,
                             sort=self.sort, copy=self.copy)
        return pd.concat(args, **concat_kwargs)
class ContourPlot(OpVertex):
    """Filled-contour plot of scattered (x, y, value) samples.

    The input frame must have 'x' and 'y' columns plus the value column;
    values are interpolated onto the grid implied by the unique x/y
    positions. Pure plotting side effect: run() returns None.
    """
    sample_ixes = BaseParameter(None)        # not referenced in this class's code
    title = BaseParameter('')
    bands = BaseParameter(None)              # not referenced in this class's code
    value_column = BaseParameter(None)
    dot_color_column = BaseParameter(None)
    cbar = BaseParameter(False)
    _never_cache = True

    @staticmethod
    def contour_plot(_s_df, value_column, dot_color=None, vmin=None,
                     vmax=None, ax=None, cbar=False, title=''):
        """Draw a filled contour of value_column over the frame's x/y grid,
        with the raw sample points scattered on top. Returns the axes."""
        if ax is None:
            fig, ax = plt.subplots()

        x = _s_df['x']
        y = _s_df['y']
        z = _s_df[value_column]
        # define grid: pivot the samples into a 2-D x-by-y table whose
        # index/columns give the interpolation grid coordinates.
        _s_2d_df = _s_df.dropna().pivot(index='x', columns='y',
                                        values=value_column)
        # stacked form retained from the original code; not used below
        _s_stacked_df = _s_2d_df.stack(
            dropna=False).rename(value_column).reset_index()
        xi = _s_2d_df.index.tolist()
        yi = _s_2d_df.columns.tolist()
        # NOTE(review): this is the legacy matplotlib.mlab.griddata call
        # signature (removed in matplotlib >= 3.1) — confirm which griddata
        # is imported at the top of this file.
        zi = griddata(x, y, z, xi, yi, interp='linear')

        # Default to symmetric color limits around zero.
        vmax = abs(zi).max() if vmax is None else vmax
        vmin = -abs(zi).max() if vmin is None else vmin
        ctr = ax.contourf(xi, yi, zi, 30, cmap=plt.cm.hsv,
                          vmax=vmax, vmin=vmin)
        if cbar:
            plt.colorbar(ctr, ax=ax)

        # Overlay the raw sample locations, optionally colored by dot_color.
        ax.scatter(
            x=_s_df['x'],
            y=_s_df['y'],
            cmap='gray',
            s=5,
            c=dot_color,
        )
        ax.set_title(title)
        return ax

    def run(
        self,
        df,
        ax=None,
        value_column=None,
    ):
        """Drop NA rows and draw the contour plot (side effect only).

        A value_column passed at runtime overrides the parameter; dot colors
        come from dot_color_column when set, else from the value column.
        """
        value_column = self.value_column if value_column is None else value_column
        plt_df = df.dropna()
        self.contour_plot(
            plt_df,
            value_column=value_column,
            title=self.title,
            ax=ax,
            dot_color=plt_df[value_column]
            if self.dot_color_column is None
            else plt_df[self.dot_color_column],
            cbar=self.cbar)
class DenseAutoencoder(OpVertex):
    """Op wrapper around a dense (fully connected) Keras autoencoder builder.

    The class-level parameters mirror the keyword arguments of
    build_autoencoder below.
    """
    input_feature_dim = BaseParameter()
    output_feature_dim = BaseParameter()
    encoding_dim = BaseParameter(25)
    dropout = BaseParameter(.15)
    input_dropout = BaseParameter(0.15)
    width = BaseParameter(128)
    depth = BaseParameter(1)
    hidden_activation = BaseParameter('relu')
    output_activation = BaseParameter('relu')
    lr = BaseParameter(0.001)
    decay = BaseParameter(0.0)
    optimizer = BaseParameter(None)
    loss = BaseParameter('mean_squared_error')
    batchnorm = BaseParameter(True)
    kernel_initializer = BaseParameter('glorot_normal')
    bias_initializer = BaseParameter('zeros')

    @staticmethod
    def build_autoencoder(input_feature_dim, output_feature_dim,
                          encoding_dim=25, dropout=.15, input_dropout=0.15,
                          width=128, depth=1,
                          hidden_activation='relu', output_activation='relu',
                          lr=0.001, decay=0.0, optimizer=None,
                          loss='mean_squared_error', batchnorm=True,
                          kernel_initializer='glorot_normal',
                          bias_initializer='zeros'):
        """Build and compile a symmetric dense autoencoder.

        Returns (encoder, decoder, autoencoder) Keras Models; only the
        autoencoder is compiled (Adam(lr, decay) unless an optimizer is
        given).
        """
        input_sym = Input(shape=(input_feature_dim, ))
        enc_x = input_sym
        # Optional dropout applied directly to the raw input features.
        if input_dropout is not None and input_dropout > 0.:
            enc_x = Dropout(input_dropout)(enc_x)

        # Encoder stack: depth x (Dense -> [BatchNorm] -> activation -> dropout)
        for d in range(depth):
            enc_x = Dense(width, activation=None,
                          kernel_initializer=kernel_initializer,
                          bias_initializer=bias_initializer)(enc_x)
            if batchnorm:
                enc_x = BatchNormalization()(enc_x)
            enc_x = Activation(hidden_activation)(enc_x)
            enc_x = Dropout(dropout)(enc_x)

        # Bottleneck layer; named 'encoder' so it can be located later.
        encoded = Dense(encoding_dim, activation=None,
                        kernel_initializer=kernel_initializer,
                        bias_initializer=bias_initializer)(enc_x)
        if batchnorm:
            encoded = BatchNormalization()(encoded)
        encoded = Activation(hidden_activation, name='encoder')(encoded)

        decoder = Dropout(dropout)(encoded)
        dec_x = decoder
        # Decoder stack mirrors the encoder; layers named 'decoder_l%d'.
        for d in range(depth):
            dec_x = Dense(width, activation=None, name='decoder_l%d' % d,
                          kernel_initializer=kernel_initializer,
                          bias_initializer=bias_initializer)(dec_x)
            if batchnorm:
                dec_x = BatchNormalization()(dec_x)
            dec_x = Activation(hidden_activation)(dec_x)
            dec_x = Dropout(dropout)(dec_x)

        decoded = Dense(output_feature_dim, activation=None,
                        kernel_initializer=kernel_initializer,
                        bias_initializer=bias_initializer)(dec_x)
        decoded = Activation(output_activation)(decoded)

        autoencoder = Model(input_sym, decoded)
        encoder = Model(input_sym, encoded)

        # create a placeholder for an encoded (32-dimensional) input
        encoded_input = Input(shape=(encoding_dim, ))
        # retrieve the first decoder layer of the autoencoder model by name
        decoder_layer = [
            l for l in autoencoder.layers if l.name == 'decoder_l0'
        ][0]
        # create the decoder model.
        # NOTE(review): only the 'decoder_l0' Dense layer is wired onto
        # encoded_input; with depth > 1 the standalone decoder does not
        # include the remaining decoder layers or output head — confirm
        # this is intended.
        decoder = Model(encoded_input, decoder_layer(encoded_input))

        if optimizer is None:
            optimizer = Adam(lr=lr, decay=decay)
        autoencoder.compile(optimizer=optimizer, loss=loss)
        return encoder, decoder, autoencoder
class ApplyMap(DFOp):
    """Element-wise application of the configured function via
    DataFrame.applymap."""
    func = BaseParameter()

    def run(self, df):
        mapper = self.func
        return df.applymap(func=mapper)
class Query(DFOp):
    """Filter rows using DataFrame.query with the configured expression."""
    q = BaseParameter()

    def run(self, df):
        expression = self.q
        return df.query(expression)
class SelectColumns(DFOp):
    """Project the input frame down to the configured columns."""
    columns = BaseParameter(None)

    def run(self, df):
        selection = self.columns
        return df[selection]
class Merge(DFOp):
    """Fold DataFrame.merge left-to-right across all input frames with the
    configured pandas merge options."""
    how = BaseParameter('inner')
    on = BaseParameter(None)
    left_on = BaseParameter(None)
    right_on = BaseParameter(None)
    left_index = BaseParameter(False)
    right_index = BaseParameter(False)
    sort = BaseParameter(False)
    suffixes = BaseParameter(('_x', '_y'))
    copy = BaseParameter(True)
    indicator = BaseParameter(False)
    validate = BaseParameter(None)

    def requires(self):
        raise NotImplementedError()

    def run(self, *args):
        merge_kwargs = dict(on=self.on, how=self.how,
                            left_on=self.left_on, right_on=self.right_on,
                            left_index=self.left_index,
                            right_index=self.right_index,
                            sort=self.sort, suffixes=self.suffixes,
                            copy=self.copy, indicator=self.indicator,
                            validate=self.validate)
        merged = args[0]
        for frame in args[1:]:
            merged = merged.merge(frame, **merge_kwargs)
        return merged
class Select(OpVertex):
    """Return element i of an indexable input."""
    i = BaseParameter(0)
    _never_cache = True

    def run(self, l):
        index = self.i
        return l[index]
class InputOp(OpVertex):
    """Inject a constant object into the DAG."""
    obj = BaseParameter(help_msg="The object to wrap and return")

    def run(self):
        return self.obj
class Drop(DFOp):
    """Drop the configured labels along the given axis via DataFrame.drop."""
    labels = BaseParameter()
    axis = BaseParameter(0)

    def run(self, df):
        drop_kwargs = dict(labels=self.labels, axis=self.axis)
        return df.drop(**drop_kwargs)