def process(msg):
    logger, log_stream = set_logging('DEBUG')

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body

    att_dict = dict()
    att_dict['config'] = dict()

    ###### start of doing calculation

    # groupby list
    cols = tfp.read_list(api.config.groupby)
    att_dict['config']['groupby'] = api.config.groupby

    # aggregation mapping
    colagg = tfp.read_dict(api.config.aggregation)
    att_dict['config']['aggregation'] = api.config.aggregation

    # groupby
    df = df.groupby(cols, as_index=api.config.index).agg(colagg)

    # drop columns
    att_dict['config']['dropcols'] = api.config.drop_columns
    dropcols = tfp.read_list(api.config.drop_columns)
    if dropcols:
        df.drop(columns=dropcols, inplace=True)

    ##############################################
    # final infos to attributes and info message
    ##############################################
    att_dict['operator'] = 'groupbyDataFrame'
    att_dict['name'] = prev_att['name']
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = list(df.columns)
    att_dict['number_columns'] = df.shape[1]
    att_dict['number_rows'] = df.shape[0]

    example_rows = EXAMPLE_ROWS if att_dict['number_rows'] > EXAMPLE_ROWS else att_dict['number_rows']
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(elem)[:10].ljust(10) for elem in df.iloc[i, :].tolist()])

    # end custom process definition
    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
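
# Minimal, hypothetical sketch of the core step above (not part of the operator):
# it only illustrates the pandas call built from example config values such as a
# groupby list of ['region', 'product'] and an aggregation mapping per column.
# The sample data and column names are made up.
def _demo_groupby_semantics():
    import pandas as pd
    df = pd.DataFrame({'region': ['N', 'N', 'S'],
                       'product': ['a', 'b', 'a'],
                       'amount': [1, 2, 3],
                       'price': [10.0, 20.0, 30.0]})
    cols = ['region', 'product']                 # e.g. result of tfp.read_list on the groupby config
    colagg = {'amount': 'sum', 'price': 'mean'}  # e.g. result of tfp.read_dict on the aggregation config
    return df.groupby(cols, as_index=False).agg(colagg)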
def process(msg):
    logger, log_stream = set_logging(name='dropColumns', loglevel='DEBUG')

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    att_dict = dict()
    att_dict['config'] = dict()

    ###### start of doing calculation

    # drop columns
    att_dict['config']['drop_columns'] = api.config.drop_columns
    drop_cols = tfp.read_list(api.config.drop_columns, df.columns)
    if drop_cols:
        logger.debug("Drops columns: {}".format(str(drop_cols)))
        df = df.drop(columns=drop_cols)

    # rename columns
    att_dict['config']['rename_columns'] = api.config.rename_columns
    map_names = tfp.read_dict(api.config.rename_columns)
    if map_names:
        df.rename(columns=map_names, inplace=True)

    ###### end of doing calculation

    ##############################################
    # final infos to attributes and info message
    ##############################################
    att_dict['operator'] = 'dropColumns'  # name of operator
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['name'] = prev_att['name']
    att_dict['columns'] = list(df.columns)
    att_dict['number_columns'] = df.shape[1]
    att_dict['number_rows'] = df.shape[0]

    example_rows = EXAMPLE_ROWS if att_dict['number_rows'] > EXAMPLE_ROWS else att_dict['number_rows']
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(elem)[:10].ljust(10) for elem in df.iloc[i, :].tolist()])

    # end custom process definition
    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
def process(msg):
    logger, log_stream = set_logging('DEBUG')

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    att_dict = dict()
    att_dict['config'] = dict()

    ###### start of doing calculation

    # map_values : column1: {from_value: to_value}, column2: {from_value: to_value}
    att_dict['config']['set_value'] = api.config.map_values
    maps_map = tfp.read_dict_of_dict(api.config.map_values)
    df.replace(maps_map, inplace=True)

    # fill NaN values : column1: value, column2: value
    att_dict['config']['fill_nan_values'] = api.config.fill_nan_values
    map_dict = tfp.read_dict(api.config.fill_nan_values)
    if map_dict:
        df.fillna(map_dict, inplace=True)

    ##############################################
    # final infos to attributes and info message
    ##############################################
    if df.empty:
        raise ValueError('DataFrame is empty')

    att_dict['operator'] = 'setValue'
    att_dict['name'] = prev_att['name']
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['number_columns'] = df.shape[1]
    att_dict['number_rows'] = df.shape[0]

    example_rows = EXAMPLE_ROWS if att_dict['number_rows'] > EXAMPLE_ROWS else att_dict['number_rows']
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(elem)[:10].ljust(10) for elem in df.iloc[i, :].tolist()])

    # end custom process definition
    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
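
# Hypothetical illustration of the two mapping steps above: map_values is parsed into a
# dict of dicts (column -> {from: to}) and fill_nan_values into a flat dict (column -> value).
# Only the pandas calls are shown; the exact tfp string syntax is an assumption here.
def _demo_set_value_semantics():
    import pandas as pd
    df = pd.DataFrame({'status': ['old', 'new', None], 'qty': [1, None, 3]})
    maps_map = {'status': {'old': 'legacy'}}  # e.g. parsed from "status: {old: legacy}"
    fill_map = {'qty': 0}                     # e.g. parsed from "qty: 0"
    df = df.replace(maps_map)
    df = df.fillna(fill_map)
    return df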
def process(msg):
    logger, log_stream = set_logging('DEBUG')

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        logger.error('Message body does not contain a pandas DataFrame')
        raise TypeError('Message body does not contain a pandas DataFrame')

    att_dict = dict()
    att_dict['config'] = dict()

    df = pd.get_dummies(df, prefix_sep='_', drop_first=True)

    ##############################################
    # final infos to attributes and info message
    ##############################################
    if df.empty:
        logger.error('DataFrame is empty')
        raise ValueError('DataFrame is empty')

    att_dict['operator'] = 'selectDataFrame'
    att_dict['name'] = prev_att['name']
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['number_columns'] = df.shape[1]
    att_dict['number_rows'] = df.shape[0]

    example_rows = EXAMPLE_ROWS if att_dict['number_rows'] > EXAMPLE_ROWS else att_dict['number_rows']
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(elem)[:10].ljust(10) for elem in df.iloc[i, :].tolist()])

    # end custom process definition
    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
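
# Hypothetical sketch of the one-hot encoding step above: pd.get_dummies with drop_first=True
# turns each categorical column into indicator columns and drops the first level.
# The sample data is made up.
def _demo_get_dummies():
    import pandas as pd
    df = pd.DataFrame({'color': ['red', 'blue', 'red'], 'qty': [1, 2, 3]})
    return pd.get_dummies(df, prefix_sep='_', drop_first=True)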
def process(test_msg, base_msg):
    logger, log_stream = set_logging('DEBUG')

    # start custom process definition
    test_att = test_msg.attributes
    base_att = base_msg.attributes

    att_dict = dict()
    if test_att['name'] == base_att['name']:
        att_dict['name'] = test_att['name']
    else:
        att_dict['name'] = test_att['name'] + '-' + base_att['name']
    att_dict['config'] = dict()

    att_dict['config']['test_index'] = api.config.test_index
    testdf_index = tfp.read_value(api.config.test_index)
    if not testdf_index:
        logger.error('Index of test data is mandatory')
        raise ValueError('Index of test data is mandatory')

    att_dict['number_rows'] = str(base_msg.body.shape[0])

    # get the columns to check
    mapping = tfp.read_dict(api.config.check_columns)
    df = pd.DataFrame()

    if mapping:
        att_dict['config']['check_columns'] = str(mapping)
        att_dict['config']['limit'] = api.config.limit

        # read stream from memory
        test_df = test_msg.body

        # test if all mapping columns are in test_df
        checkcols = [elem in list(test_df.columns) for elem in list(mapping.keys())]
        if not all(checkcols):
            error_txt = 'Elements in mapping are not contained in columns of test df : ' + \
                        str(list(mapping.keys())) + '-' + str(list(test_df.columns)) + ' - ' + str(checkcols)
            logger.error(error_txt)
            raise ValueError(error_txt)

        if testdf_index not in test_df.columns:
            logger.error('Test index needs to be a column')
            raise ValueError('Test index needs to be a column')

        tcols = ['t_' + c for c in list(mapping.keys())]
        tdf = pd.DataFrame(columns=tcols)

        df = base_msg.body
        df = pd.concat([df, tdf], axis=1)

        num_cols = len(mapping)
        # run over all test df rows and score each against the base df rows
        for index, test_row in test_df.iterrows():

            # apply function
            def get_ratio(row):
                sc = 0
                for tcol, bcol in mapping.items():
                    sc = sc + fuzz.token_sort_ratio(test_row[tcol], row[bcol])
                return sc / num_cols

            df['tscore'] = df.apply(get_ratio, axis=1)

            # get best match and store the test index
            max_score = df['tscore'].max()
            if max_score >= api.config.limit:
                mask = (df['tscore'] == max_score)
                df.loc[mask, 'score'] = max_score
                df.loc[mask, 'external_id'] = test_row[testdf_index]
                for coli in mapping:
                    df.loc[mask, 't_' + coli] = test_row[coli]

        df.drop(columns=['tscore'], inplace=True)

        # remove external_id when the test column value is missing
        t_cols = ['t_' + t for t in mapping.keys()] + ['external_id', 'score']
        for bcol in mapping.values():
            mask = df[bcol].isna()
            df.loc[mask, t_cols] = np.nan

        if api.config.only_index:
            df = df[list(base_msg.body.columns) + ['external_id']]
        att_dict['config']['only_index'] = api.config.only_index

        if api.config.only_matching_rows:
            df = df.loc[~df['score'].isna()]
        att_dict['config']['only_matching_rows'] = api.config.only_matching_rows

        basedf_index = tfp.read_value(api.config.base_index)
        att_dict['config']['base_index'] = basedf_index

        if api.config.joint_id:
            if not basedf_index:
                raise ValueError("For <joint_id> a value for <base_index> is necessary")
            df.loc[~df['external_id'].isna(), 'joint_id'] = df.loc[~df['external_id'].isna(), 'external_id']
            df.loc[df['external_id'].isna(), 'joint_id'] = df.loc[df['external_id'].isna(), basedf_index]
        att_dict['config']['joint_id'] = api.config.joint_id

        if api.config.add_non_matching:
            # test if both DataFrames have the same columns
            if not all([elem in test_df.columns for elem in base_msg.body.columns]):
                raise ValueError("Adding test dataframe only possible when having same columns " +
                                 str(test_df.columns) + ' vs. ' + str(base_msg.body.columns))
            matched_ids = df['external_id'].unique()
            addto_df = test_df.loc[~test_df[testdf_index].isin(matched_ids)].copy()
            addto_df['joint_id'] = addto_df[testdf_index]
            df = pd.concat([df, addto_df], axis=0, sort=False)
        att_dict['config']['add_non_matching'] = api.config.add_non_matching

    else:
        logger.warning('No columns to check')

    ##############################################
    # final infos to attributes and info message
    ##############################################
    if df.empty:
        logger.warning('DataFrame is empty')
    else:
        att_dict['operator'] = 'fuzzyjoinDataFrames'
        att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
        att_dict['columns'] = str(list(df.columns))
        att_dict['number_columns'] = df.shape[1]
        att_dict['number_rows'] = df.shape[0]
        if 'id' in base_att.keys():
            att_dict['id'] = base_att['id'] + '; ' + att_dict['operator'] + ': ' + str(id(df))
        else:
            att_dict['id'] = att_dict['operator'] + ': ' + str(id(df))

        example_rows = EXAMPLE_ROWS if att_dict['number_rows'] > EXAMPLE_ROWS else att_dict['number_rows']
        for i in range(0, example_rows):
            att_dict['row_' + str(i)] = str([str(elem)[:10].ljust(10) for elem in df.iloc[i, :].tolist()])

    # end custom process definition
    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
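
# Hypothetical sketch of the scoring step above: the average token_sort_ratio over the
# mapped column pairs decides whether a test row matches a base row. The import assumes
# the fuzzywuzzy package provides the fuzz module used above; the data is made up.
def _demo_fuzzy_score():
    import pandas as pd
    from fuzzywuzzy import fuzz
    mapping = {'name': 'company_name'}  # test column -> base column
    test_row = pd.Series({'name': 'ACME Corp.'})
    base = pd.DataFrame({'company_name': ['Acme Corporation', 'Globex']})
    base['tscore'] = base.apply(
        lambda row: sum(fuzz.token_sort_ratio(test_row[t], row[b])
                        for t, b in mapping.items()) / len(mapping), axis=1)
    return base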
def process(msg):
    logger, log_stream = set_logging('DEBUG')

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body

    att_dict = dict()
    att_dict['config'] = dict()

    ######################### Start Calculation

    # save and reset indices
    index_names = df.index.names
    if index_names[0]:
        logger.debug("Reset index")
        df.reset_index(inplace=True)

    # prepare selection for numbers
    if api.config.selection_num and not api.config.selection_num.upper() == 'NONE':
        selection_map = tfp.read_relations(api.config.selection_num)
        for s in selection_map:
            if s[1] == '≤':
                df = df.loc[df[s[0]] <= s[2]]
            elif s[1] == '<':
                df = df.loc[df[s[0]] < s[2]]
            elif s[1] == '≥':
                df = df.loc[df[s[0]] >= s[2]]
            elif s[1] == '>':
                df = df.loc[df[s[0]] > s[2]]
            elif s[1] == '=':
                df = df.loc[df[s[0]] == s[2]]
            elif s[1] == '!':
                df = df.loc[df[s[0]] != s[2]]
            else:
                raise ValueError('Unknown relation: ' + str(s))
        att_dict['config']['selection_num'] = api.config.selection_num

    if api.config.selection_list and not api.config.selection_list.upper() == 'NONE':
        value_list_dict = tfp.read_dict_of_list(api.config.selection_list)
        for key, vl in value_list_dict.items():
            df = df.loc[df[key].isin(vl)]
        att_dict['config']['selection_list'] = api.config.selection_list

    # set index again
    if index_names[0]:
        att_dict['indices'] = index_names
        logger.debug('Set indices to: {}'.format(str(index_names)))
        df.set_index(keys=index_names, inplace=True)

    if df.empty:
        logger.error('DataFrame is empty')
        raise ValueError('DataFrame is empty')

    ######################### End Calculation

    ##############################################
    # final infos to attributes and info message
    ##############################################
    att_dict['operator'] = 'selectDataFrame'
    att_dict['name'] = prev_att['name']
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['number_columns'] = df.shape[1]
    att_dict['number_rows'] = df.shape[0]
    if 'id' in prev_att.keys():
        att_dict['id'] = prev_att['id'] + '; ' + att_dict['operator'] + ': ' + str(id(df))
    else:
        att_dict['id'] = att_dict['operator'] + ': ' + str(id(df))

    example_rows = EXAMPLE_ROWS if att_dict['number_rows'] > EXAMPLE_ROWS else att_dict['number_rows']
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(elem)[:10].ljust(10) for elem in df.iloc[i, :].tolist()])

    # end custom process definition
    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
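
# Hypothetical sketch of the numeric selection above: each parsed triple
# (column, relation, value) becomes a row filter, mirroring the if/elif chain.
# The triples would normally come from tfp.read_relations; the data is made up.
def _demo_selection_num():
    import pandas as pd
    df = pd.DataFrame({'price': [5, 15, 25], 'qty': [1, 2, 3]})
    for col, rel, val in [('price', '>', 10), ('qty', '!', 3)]:
        if rel == '>':
            df = df.loc[df[col] > val]
        elif rel == '!':
            df = df.loc[df[col] != val]
    return df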
def process(msg):
    logger, log_stream = set_logging('DEBUG')

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        logger.error('Message body does not contain a pandas DataFrame')
        raise TypeError('Message body does not contain a pandas DataFrame')

    att_dict = dict()
    att_dict['config'] = dict()

    ###### start of doing calculation

    # segment columns
    att_dict['config']['segment_cols'] = api.config.segment_cols
    segment_cols = tfp.read_list(api.config.segment_cols)

    # regression columns
    att_dict['config']['regression_cols'] = api.config.regression_cols
    regression_cols = tfp.read_list(api.config.regression_cols)
    if not regression_cols:
        logger.error('No Regression Columns - mandatory data')
        raise ValueError('No Regression Columns - mandatory data')

    # prediction column
    att_dict['config']['prediction_col'] = api.config.prediction_col
    prediction_col = tfp.read_value(api.config.prediction_col)
    if not prediction_col:
        raise ValueError('No Prediction Column - mandatory data')

    training_cols = regression_cols + [prediction_col]

    model = LinearRegression(fit_intercept=True)

    def fit(x):
        model.fit(x[regression_cols], x[prediction_col])
        return pd.Series([model.coef_, model.intercept_], index=['coef', 'intercept'])

    if segment_cols:
        coef_df = df.groupby(segment_cols)[training_cols].apply(fit).reset_index()
    else:
        model.fit(df[regression_cols], df[prediction_col])
        coef_df = pd.Series([model.coef_, model.intercept_], index=['coef', 'intercept'])

    ##############################################
    # final infos to attributes and info message
    ##############################################
    if df.empty:
        raise ValueError('DataFrame is empty')

    att_dict['operator'] = 'regressionTrainingDataFrame'
    att_dict['name'] = prev_att['name']
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['number_columns'] = df.shape[1]
    att_dict['number_rows'] = df.shape[0]

    example_rows = EXAMPLE_ROWS if att_dict['number_rows'] > EXAMPLE_ROWS else att_dict['number_rows']
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(elem)[:10].ljust(10) for elem in df.iloc[i, :].tolist()])

    # end custom process definition
    log = log_stream.getvalue()
    coef_att = {'segmentation_columns': segment_cols,
                'regression_columns': regression_cols,
                'prediction_column': prediction_col}
    msg_coef = api.Message(attributes=coef_att, body=coef_df)
    msg_data = api.Message(attributes=att_dict, body=df)
    return log, msg_coef, msg_data
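
# Hypothetical sketch of the per-segment training above: one LinearRegression per group,
# returning coefficients and intercept per segment. Column names and data are made up.
def _demo_segment_regression():
    import pandas as pd
    from sklearn.linear_model import LinearRegression

    df = pd.DataFrame({'segment': ['a', 'a', 'b', 'b'],
                       'x': [1.0, 2.0, 1.0, 3.0],
                       'y': [2.0, 4.0, 3.0, 9.0]})

    def fit(g):
        m = LinearRegression(fit_intercept=True).fit(g[['x']], g['y'])
        return pd.Series([m.coef_, m.intercept_], index=['coef', 'intercept'])

    return df.groupby('segment')[['x', 'y']].apply(fit).reset_index()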
def process(msg_coef, msg_data):
    logger, log_stream = set_logging('DEBUG')

    # start custom process definition
    prev_att = msg_data.attributes
    df = msg_data.body
    coef_df = msg_coef.body
    if not isinstance(df, pd.DataFrame):
        logger.error('Message body does not contain a pandas DataFrame')
        raise TypeError('Message body does not contain a pandas DataFrame')

    att_dict = dict()
    att_dict['config'] = dict()

    ###### start of doing calculation

    # segment columns
    segment_cols = None
    if 'segmentation_columns' in msg_coef.attributes:
        segment_cols = msg_coef.attributes['segmentation_columns']

    # regression columns
    regression_cols = msg_coef.attributes['regression_columns']

    # prediction column
    prediction_col = msg_coef.attributes['prediction_column']

    # set values of the regression columns (if not already done in the data message)
    att_dict['config']['regresssion_cols_value'] = api.config.regresssion_cols_value
    valmap = tfp.read_dict(api.config.regresssion_cols_value)
    if valmap:
        for col, val in valmap.items():
            if np.issubdtype(df[col].dtype, np.integer):
                val = int(val)
            elif np.issubdtype(df[col].dtype, np.floating):
                val = float(val)
            else:
                raise ValueError('Regression value needs to be numeric')
            df[col] = val

    # merge data and coefficient df
    if segment_cols:
        df = pd.merge(df, coef_df, how='inner', left_on=segment_cols, right_on=segment_cols)

    prefix = tfp.read_value(api.config.prediction_prefix)
    if prefix is None:
        prefix = ''
    pcol = prefix + prediction_col

    if segment_cols:
        def predict(x):
            x[pcol] = np.dot(x['coef'], x[regression_cols].values) + x['intercept']
            return x

        df = df.apply(predict, axis=1, result_type=None)
        df.drop(columns=['coef', 'intercept'], inplace=True)
    else:
        def predict(x):
            x[pcol] = np.dot(coef_df['coef'], x[regression_cols].values) + coef_df['intercept']
            return x

        df = df.apply(predict, axis=1, result_type=None)

    # cast the prediction column to the type of the original prediction variable
    if np.issubdtype(df[prediction_col].dtype, np.integer):
        logger.debug('Cast prediction column to <int>')
        df[pcol] = df[pcol].round().astype(df[prediction_col].dtype)

    if api.config.prediction_col_only:
        logger.debug('Output only prediction columns')
        if segment_cols:
            df[prediction_col] = df[pcol]
            df = df[segment_cols + [prediction_col]]
        else:
            df = df[prediction_col]
    att_dict['config']['prediction_col_only'] = api.config.prediction_col_only

    ###### end of doing calculation

    ##############################################
    # final infos to attributes and info message
    ##############################################
    if df.empty:
        raise ValueError('DataFrame is empty')

    att_dict['operator'] = 'regressionTrainingDataFrame'
    att_dict['name'] = prev_att['name']

    # end custom process definition
    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
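
# Hypothetical sketch of the per-row prediction above: the dot product of the stored
# coefficients with the regression columns plus the intercept. Names and values are made up.
def _demo_predict_row():
    import numpy as np
    import pandas as pd
    coef, intercept = np.array([2.0, 0.5]), 1.0
    row = pd.Series({'x1': 3.0, 'x2': 4.0})
    return float(np.dot(coef, row[['x1', 'x2']].values) + intercept)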
def process(msg):
    logger, log_stream = set_logging('DEBUG')

    # start custom process definition
    # test if body refers to a DataFrame type
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        logger.error('Message body does not contain a pandas DataFrame')
        raise TypeError('Message body does not contain a pandas DataFrame')

    att_dict = dict()
    att_dict['config'] = dict()

    ###### start calculation

    sample_size = api.config.sample_size
    if sample_size < 1:
        sample_size = int(sample_size * df.shape[0])
        if sample_size < 1:
            sample_size = 1
            logger.warning("Fraction of sample size too small. Set sample size to 1.")
    elif sample_size > df.shape[0]:
        logger.warning("Sample size larger than number of rows")

    logger.debug("Sample size: {}/{} ({})".format(sample_size, df.shape[0], sample_size / df.shape[0]))

    random_state = api.config.random_state
    invariant_column = tfp.read_value(api.config.invariant_column)
    if invariant_column and sample_size < df.shape[0]:
        # get the average number of records for each value of the invariant column
        sc_df = df.groupby(invariant_column)[invariant_column].count()
        sample_size_invariant = int(sample_size / sc_df.mean())
        sample_size_invariant = 1 if sample_size_invariant == 0 else sample_size_invariant  # ensure minimum
        sc_df = sc_df.sample(n=sample_size_invariant, random_state=random_state).to_frame()
        sc_df.rename(columns={invariant_column: 'sum'}, inplace=True)
        # sample the df by merging the 2 dataframes
        df = pd.merge(df, sc_df, how='inner', right_index=True, left_on=invariant_column)
        df.drop(columns=['sum'], inplace=True)
    else:
        df = df.sample(n=sample_size, random_state=random_state)

    ###### end calculation

    ##############################################
    # final infos to attributes and info message
    ##############################################
    if df.empty:
        raise ValueError('DataFrame is empty')

    att_dict['operator'] = 'selectDataFrame'
    att_dict['name'] = prev_att['name']
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['number_columns'] = df.shape[1]
    att_dict['number_rows'] = df.shape[0]

    example_rows = EXAMPLE_ROWS if att_dict['number_rows'] > EXAMPLE_ROWS else att_dict['number_rows']
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(elem)[:10].ljust(10) for elem in df.iloc[i, :].tolist()])

    # end custom process definition
    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
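
# Hypothetical sketch of the invariant sampling above: whole groups of the invariant
# column are sampled instead of individual rows, so all records of a sampled value
# stay together. Column names and data are made up.
def _demo_invariant_sample():
    import pandas as pd
    df = pd.DataFrame({'order_id': [1, 1, 2, 2, 3], 'item': list('abcde')})
    sc = df.groupby('order_id')['order_id'].count()
    picked = sc.sample(n=2, random_state=1).to_frame(name='sum')
    out = pd.merge(df, picked, how='inner', right_index=True, left_on='order_id')
    return out.drop(columns=['sum'])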
def process(left_msg, right_msg):
    logger, log_stream = set_logging('DEBUG')

    # start custom process definition
    att_dict = dict()
    att_dict['config'] = dict()

    l_att = left_msg.attributes
    r_att = right_msg.attributes

    if l_att['name'] == r_att['name']:
        att_dict['name'] = l_att['name']
    else:
        att_dict['name'] = l_att['name'] + '-' + r_att['name']

    # read stream from memory
    left_df = left_msg.body
    right_df = right_msg.body

    ###### start of doing calculation
    how = tfp.read_value(api.config.how)

    # merge according to config
    att_dict['config']['on_index'] = api.config.on_index
    if api.config.on_index:
        df = pd.merge(left_df, right_df, how=how, left_index=True, right_index=True)
    elif api.config.left_on and api.config.right_on:
        att_dict['config']['left_on'] = api.config.left_on
        att_dict['config']['right_on'] = api.config.right_on

        left_on_list = tfp.read_list(api.config.left_on)
        right_on_list = tfp.read_list(api.config.right_on)
        left_df.reset_index(inplace=True)
        right_df.reset_index(inplace=True)
        df = pd.merge(left_df, right_df, how=how, left_on=left_on_list, right_on=right_on_list)

        # removing second index - might be a more elegant solution
        if 'index_x' in df.columns:
            df.drop(columns=['index_x'], inplace=True)
    else:
        raise ValueError(
            "Config setting: Either <on_index> or both <left_on> and <right_on> have to be set in order to join the dataframes")

    att_dict['config']['new_indices'] = api.config.new_indices
    index_list = tfp.read_list(api.config.new_indices)
    if index_list:
        df.set_index(keys=index_list, inplace=True)

    att_dict['config']['drop_columns'] = api.config.drop_columns
    col_list = tfp.read_list(api.config.drop_columns)
    if col_list:
        df.drop(labels=col_list, axis=1, inplace=True)

    ##############################################
    # final infos to attributes and info message
    ##############################################
    if df.empty:
        raise ValueError('Merged DataFrame is empty')

    att_dict['operator'] = 'joinDataFrames'
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['number_columns'] = df.shape[1]
    att_dict['number_rows'] = df.shape[0]

    example_rows = EXAMPLE_ROWS if att_dict['number_rows'] > EXAMPLE_ROWS else att_dict['number_rows']
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(elem)[:10].ljust(10) for elem in df.iloc[i, :].tolist()])

    # end custom process definition
    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
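
# Hypothetical sketch of the two merge modes above: either join on the indices or on
# explicit key columns parsed from the left_on/right_on config strings. Data is made up.
def _demo_join_modes():
    import pandas as pd
    left = pd.DataFrame({'id': [1, 2], 'a': ['x', 'y']})
    right = pd.DataFrame({'id': [1, 2], 'b': ['u', 'v']})
    by_index = pd.merge(left.set_index('id'), right.set_index('id'),
                        how='inner', left_index=True, right_index=True)
    by_keys = pd.merge(left, right, how='inner', left_on=['id'], right_on=['id'])
    return by_index, by_keys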
def process(msg):
    logger, log_stream = set_logging('DEBUG')

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    att_dict = dict()
    att_dict['config'] = dict()

    ###### start of doing calculation
    att_dict['config']['reset_index'] = api.config.reset_index
    if api.config.reset_index:
        df.reset_index(inplace=True)

    # create a DataFrame with the new columns and concat it to df
    att_dict['config']['transpose_column'] = api.config.transpose_column
    trans_col = tfp.read_value(api.config.transpose_column)

    att_dict['config']['value_column'] = api.config.value_column
    val_col = tfp.read_value(api.config.value_column)

    # new columns
    tvals = list(df[trans_col].unique())
    if api.config.prefix:
        new_cols = {trans_col + '_' + str(v): v for v in tvals}
    else:
        new_cols = {str(v): v for v in tvals}
    t_df = pd.DataFrame(columns=new_cols.keys(), index=df.index)
    df = pd.concat([df, t_df], axis=1)

    # setting the corresponding column to the value of the value column
    for col, val in new_cols.items():
        df.loc[df[trans_col] == val, col] = df.loc[df[trans_col] == val, val_col]
    df.drop(columns=[trans_col, val_col], inplace=True)

    att_dict['config']['groupby'] = api.config.groupby
    gbcols = tfp.read_list(api.config.groupby, df.columns)

    # group df
    if gbcols:
        aggr_trans = api.config.aggr_trans.strip()
        aggr_default = api.config.aggr_default.strip()

        aggregation = dict()
        for col in df.columns:
            aggregation[col] = aggr_trans if col in new_cols else aggr_default
        aggregation = {c: a for c, a in aggregation.items() if c not in gbcols}

        df = df.groupby(gbcols, as_index=api.config.as_index).agg(aggregation)

    #####################
    # final infos to attributes and info message
    #####################
    att_dict['operator'] = 'transposeColumnDataFrame'  # name of operator
    att_dict['mem_usage'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['name'] = prev_att['name']
    att_dict['columns'] = list(df.columns)
    att_dict['number_columns'] = len(att_dict['columns'])
    att_dict['number_rows'] = len(df.index)
    att_dict['example_row_1'] = str(df.iloc[0, :].tolist())

    # end custom process definition
    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
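
# Hypothetical sketch of the transpose step above: values of one column become new
# columns whose cells are taken from the value column, then the frame is grouped.
# The column names and the 'sum' aggregation are made-up examples.
def _demo_transpose_column():
    import pandas as pd
    df = pd.DataFrame({'id': [1, 1, 2], 'kind': ['a', 'b', 'a'], 'val': [10, 20, 30]})
    for v in df['kind'].unique():
        df.loc[df['kind'] == v, 'kind_' + v] = df.loc[df['kind'] == v, 'val']
    df = df.drop(columns=['kind', 'val'])
    return df.groupby('id', as_index=False).agg('sum')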