def process(msg):
    att_dict = msg.attributes
    logger, log_stream = slog.set_logging('word_regex', api.config.debug_mode)
    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    df = msg.body
    if not isinstance(df, pd.DataFrame) or df.empty:
        logger.warning('Empty DataFrame, no output sent!')
        api.send(outports[0]['name'], log_stream.getvalue())
        api.send(outports[2]['name'], api.Message(attributes=att_dict, body=df))
        return 0

    df['count'] = df['count'].astype('int32')

    # word type filter
    word_types = tfp.read_list(api.config.word_types)
    if word_types:
        df = df.loc[df['type'].isin(word_types)]

    # language filter
    language_filter = tfp.read_list(api.config.language_filter)
    if language_filter:
        df = df.loc[df['language'].isin(language_filter)]

    df = df.groupby(['language', 'type', 'word'])['count'].agg('sum').reset_index()

    api.send(outports[1]['name'], api.Message(attributes=att_dict, body=df))
    api.send(outports[0]['name'], log_stream.getvalue())
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()
    att_dict['operator'] = 'groupby'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode:
        logger.setLevel('DEBUG')

    prev_att = msg.attributes
    df = msg.body

    ###### start of doing calculation

    # groupby list
    cols = tfp.read_list(api.config.groupby)
    att_dict['config']['groupby'] = api.config.groupby

    # mapping aggregation
    try:
        colagg = tfp.read_dict(api.config.aggregation)
    except IndexError:
        logger.info('Aggregation is not a map, try to parse a value instead')
        colagg = tfp.read_value(api.config.aggregation)
    att_dict['config']['aggregation'] = api.config.aggregation

    # groupby
    logger.debug('Group columns: {}'.format(cols))
    logger.debug('Aggregation: {}'.format(colagg))
    logger.debug('Index: {}'.format(api.config.index))
    df = df.groupby(cols, as_index=api.config.index).agg(colagg)

    # drop columns
    att_dict['config']['dropcols'] = api.config.drop_columns
    dropcols = tfp.read_list(api.config.drop_columns)
    if dropcols:
        df.drop(columns=dropcols, inplace=True)

    ##############################################
    #  final infos to attributes and info message
    ##############################################
    att_dict['operator'] = 'groupbyDataFrame'
    att_dict['name'] = prev_att['name']
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = list(df.columns)
    att_dict['number_columns'] = df.shape[1]
    att_dict['number_rows'] = df.shape[0]

    example_rows = EXAMPLE_ROWS if att_dict['number_rows'] > EXAMPLE_ROWS else att_dict['number_rows']
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])

    # end custom process definition
    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'groupby'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    prev_att = msg.attributes
    df = msg.body
    prev_shape = df.shape

    ###### start of doing calculation

    # groupby list
    cols = tfp.read_list(api.config.groupby)

    # mapping aggregation
    try:
        colagg = tfp.read_dict(api.config.aggregation)
    except IndexError:
        logger.info('Aggregation is not a map, try to parse a value instead')
        colagg = tfp.read_value(api.config.aggregation)

    # groupby
    logger.info('Group columns: {}'.format(cols))
    logger.info('Aggregation: {}'.format(colagg))
    logger.info('Index: {}'.format(api.config.index))
    df = df.groupby(cols, as_index=api.config.index).agg(colagg)

    # drop columns
    dropcols = tfp.read_list(api.config.drop_columns)
    if dropcols:
        logger.info('Drop columns: {}'.format(dropcols))
        df.drop(columns=dropcols, inplace=True)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(df.memory_usage(deep=True).sum() / 1024**2))

    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'], att_dict['storage.fileCount'])

    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {} - {} '.format(progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
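# For reference, a minimal standalone sketch of the groupby/aggregation step these
# operators perform, assuming an aggregation map such as {'count': 'sum'} as
# tfp.read_dict would parse it from the operator configuration. Column names and
# data below are illustrative, not taken from the operators above.
#
#     import pandas as pd
#
#     df = pd.DataFrame({'language': ['EN', 'EN', 'DE'],
#                        'word': ['cat', 'cat', 'hund'],
#                        'count': [2, 3, 5]})
#
#     colagg = {'count': 'sum'}  # e.g. parsed from config string "count: sum"
#
#     # as_index=False keeps the group keys as regular columns,
#     # mirroring api.config.index = False
#     result = df.groupby(['language', 'word'], as_index=False).agg(colagg)
#     print(result)  # one row per (language, word) with summed counts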
def process(msg):
    logger, log_stream = slog.set_logging('DEBUG')

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body

    att_dict = dict()
    att_dict['config'] = dict()

    ###### start of doing calculation

    # groupby list
    cols = tfp.read_list(api.config.groupby)
    att_dict['config']['groupby'] = api.config.groupby

    # mapping
    colagg = tfp.read_dict(api.config.aggregation)
    att_dict['config']['aggregation'] = api.config.aggregation

    # groupby
    df = df.groupby(cols, as_index=api.config.index).agg(colagg)

    # drop columns
    att_dict['config']['dropcols'] = api.config.drop_columns
    dropcols = tfp.read_list(api.config.drop_columns)
    if dropcols:
        df.drop(columns=dropcols, inplace=True)

    ##############################################
    #  final infos to attributes and info message
    ##############################################
    att_dict['operator'] = 'groupbyDataFrame'
    att_dict['name'] = prev_att['name']
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = list(df.columns)
    att_dict['number_columns'] = df.shape[1]
    att_dict['number_rows'] = df.shape[0]

    example_rows = EXAMPLE_ROWS if att_dict['number_rows'] > EXAMPLE_ROWS else att_dict['number_rows']
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])

    # end custom process definition
    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
def process(msg):
    logger, log_stream = slog.set_logging('DEBUG')
    time_monitor = tp.progress()
    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    df = msg.body

    # columns with 1 unique value
    columns = tfp.read_list(api.config.columns, df.columns)
    col1val_data = {'column': [], 'type': [], 'unique_vals': [], 'action': []}
    for col in columns:
        vals = df[col].unique()
        if len(vals) == 1:
            col1val_data['column'].append(col)
            col1val_data['type'].append(str(df[col].dtype))
            col1val_data['unique_vals'].append(vals)
            col1val_data['action'].append('drop')
            if not api.config.info_only:
                df.drop(columns=[col], inplace=True)

    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())

    return log_stream.getvalue(), api.Message(attributes={'name': 'drop_1valuecolumns', 'type': 'DataFrame'}, body=df), \
           api.Message(attributes={'name': 'transformation', 'type': 'DataFrame'}, body=pd.DataFrame(col1val_data))
def process(msg):
    logger, log_stream = slog.set_logging('DEBUG')
    time_monitor = tp.progress()
    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    df = msg.body
    columns = tfp.read_list(api.config.columns, df.columns, test_number=False)
    info_only = api.config.info_only
    threshold = api.config.threshold

    transform_data = {'column': [], 'dtype': [], 'unique_values': [], 'action': []}
    for col in df[columns].select_dtypes(include='object').columns:
        unique_vals_num = len(df[col].unique())
        frac_unique_vals = unique_vals_num / df.shape[0]
        if frac_unique_vals > threshold:
            transform_data['column'].append(col)
            transform_data['dtype'].append(df[col].dtype)
            transform_data['unique_values'].append(frac_unique_vals)
            transform_data['action'].append('drop')
            if not info_only:
                df.drop(columns=[col], inplace=True)

    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())

    return log_stream.getvalue(), api.Message(attributes={'name': 'filter_by_population', 'type': 'DataFrame'}, body=df), \
           api.Message(attributes={'name': 'transformation', 'type': 'DataFrame'}, body=pd.DataFrame(transform_data))
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()
    att_dict['operator'] = 'drop_duplicates'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode:
        logger.setLevel('DEBUG')
    time_monitor = tp.progress()
    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    df = msg.body
    before_num_rows = df.shape[0]
    drop_cols_test = tfp.read_list(api.config.columns, df.columns)
    keep = tfp.read_value(api.config.keep, test_number=False)
    df.drop_duplicates(subset=drop_cols_test, keep=keep, inplace=True)
    logger.debug('Duplicate Rows: {}'.format(before_num_rows - df.shape[0]))

    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))

    return log_stream.getvalue(), api.Message(attributes={'name': 'drop_duplicates', 'type': 'DataFrame'}, body=df)
def process(msg):
    att_dict = msg.attributes
    logger, log_stream = slog.set_logging('word_index_regex', api.config.debug_mode)
    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    # regex patterns
    regex_patterns = tfp.read_list(api.config.patterns)

    # word type
    word_types = tfp.read_list(api.config.word_types)
    if not word_types:
        logger.warning('No word types defined. Using default word type: \'PROPN\'')
        word_types = ['PROPN']

    # pandas DataFrame; select only values with the right word type
    cols = [c["name"] for c in msg.attributes['table']['columns']]
    df = pd.DataFrame(msg.body, columns=cols)
    df_p = df.loc[df['TYPE'].isin(word_types)]

    # language filter (mask must come from df_p so the indices align)
    language_filter = tfp.read_list(api.config.language_filter)
    if language_filter:
        df_p = df_p.loc[df_p['LANGUAGE'].isin(language_filter)]

    # get the unique words to test against the regex patterns
    words = df_p['WORD'].unique()
    logger.info('Number of words to test with regex pattern: {}'.format(len(words)))
    for ipat, pat in enumerate(regex_patterns):
        if pat == '':
            logger.warning('Empty pattern')
            continue
        logger.info('Execute pattern: {} ({}/{})'.format(pat, ipat, len(regex_patterns)))
        cleansing_words = [w for w in words if re.match(pat, w)]
        df = df.loc[~df['WORD'].isin(cleansing_words)]

    api.send(outports[1]['name'], api.Message(attributes=att_dict, body=df.values.tolist()))
    api.send(outports[0]['name'], log_stream.getvalue())
def process(msg):
    global last_msg
    global hash_list
    global lexicon_stem, lexicon

    logger, log_stream = slog.set_logging(operator_name, api.config.debug_mode)

    # check if setup is complete
    msg = check_for_setup(logger, msg)
    if not msg:
        api.send(outports[0]['name'], log_stream.getvalue())
        return 0

    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    att_dict = msg.attributes

    # pandas DataFrame; select only values with the right word type
    cols = [c["name"] for c in msg.attributes['table']['columns']]
    df = pd.DataFrame(msg.body, columns=cols)

    # word type
    types = tfp.read_list(api.config.types)
    if not types:
        logger.warning('No word types defined. Using default word type: \'PROPN\'')
        types = ['PROPN']

    # language filter
    languages = tfp.read_list(api.config.languages)
    if not languages:
        logger.warning('No languages defined. Using default languages: EN, FR, ES, DE')
        languages = ['EN', 'FR', 'ES', 'DE']

    # replace matching words with their lexicon entry (assign to the WORD
    # column only; assigning to the whole row would overwrite every column)
    for lang in lexicon:
        for w in lexicon[lang]:
            df.loc[(df['TYPE'].isin(types)) & (df['LANGUAGE'] == lang) &
                   (df['WORD'] == w), 'WORD'] = lexicon[lang][w]
        for w in lexicon_stem[lang]:
            df.loc[(df['TYPE'].isin(types)) & (df['LANGUAGE'] == lang) &
                   (df['WORD'] == w), 'WORD'] = lexicon_stem[lang][w]

    api.send(outports[1]['name'], api.Message(attributes=att_dict, body=df.values.tolist()))
    api.send(outports[0]['name'], log_stream.getvalue())
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'drop_highly_unique'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode:
        logger.setLevel('DEBUG')
    time_monitor = tp.progress()
    logger.debug('Start time: ' + time_monitor.get_start_time())

    df = msg.body
    columns = tfp.read_list(api.config.columns, df.columns, test_number=False)
    info_only = api.config.info_only
    threshold = api.config.threshold

    transform_data = {'column': [], 'dtype': [], 'unique_values': [], 'action': []}
    for col in df[columns].select_dtypes(include='object').columns:
        unique_vals_num = len(df[col].unique())
        frac_unique_vals = unique_vals_num / df.shape[0]
        if frac_unique_vals > threshold:
            transform_data['column'].append(col)
            transform_data['dtype'].append(df[col].dtype)
            transform_data['unique_values'].append(frac_unique_vals)
            transform_data['action'].append('drop')
            if not info_only:
                df.drop(columns=[col], inplace=True)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(df.memory_usage(deep=True).sum() / 1024**2))

    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'], att_dict['storage.fileCount'])

    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {} - {} '.format(progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df), \
           api.Message(attributes={'name': 'transformation', 'type': 'DataFrame'}, body=pd.DataFrame(transform_data))
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'dropColumns'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation
    drop_cols = tfp.read_list(api.config.drop_columns, df.columns)
    if drop_cols:
        logger.debug("Drop columns: {}".format(str(drop_cols)))
        df = df.drop(columns=drop_cols)

    map_names = tfp.read_dict(api.config.rename_columns)
    if map_names:
        df.rename(columns=map_names, inplace=True)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(df.memory_usage(deep=True).sum() / 1024**2))

    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(att_dict['storage.fileIndex'] + 1,
                                                              att_dict['storage.fileCount'])

    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {} - {} '.format(progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'drop_1valuecolumns'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    prev_shape = df.shape

    # columns with 1 unique value
    columns = tfp.read_list(api.config.columns, df.columns)
    transform_data = {'column': [], 'type': [], 'unique_vals': [], 'action': []}
    for col in columns:
        vals = df[col].unique()
        if len(vals) == 1:
            transform_data['column'].append(col)
            transform_data['type'].append(str(df[col].dtype))
            transform_data['unique_vals'].append(vals)
            transform_data['action'].append('drop')
            if not api.config.info_only:
                df.drop(columns=[col], inplace=True)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(df.memory_usage(deep=True).sum() / 1024**2))

    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'], att_dict['storage.fileCount'])

    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {} - {} '.format(progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df), \
           api.Message(attributes={'name': 'transformation', 'type': 'DataFrame'}, body=pd.DataFrame(transform_data))
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'filter_by_population'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    prev_cols = df.shape[1]

    columns = tfp.read_list(api.config.columns, df.columns, test_number=False)
    info_only = api.config.info_only
    threshold = api.config.threshold
    logger.debug('Parameter Threshold: {}  Data Modification: {}'.format(threshold, info_only))

    transform_data = {'column': [], 'dtype': [], 'unique_vals': [], 'action': []}
    for col in columns:
        population = df[col].count() / df.shape[0]
        unique_vals = df[col].unique()
        if population < threshold and not (len(unique_vals) == 1 and np.isnan(unique_vals[0])):
            transform_data['column'].append(col)
            transform_data['dtype'].append(df[col].dtype)
            transform_data['unique_vals'].append(unique_vals)
            transform_data['action'].append('drop')
            if not info_only:
                df.drop(columns=[col], inplace=True)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(df.memory_usage(deep=True).sum() / 1024**2))

    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(att_dict['storage.fileIndex'] + 1,
                                                              att_dict['storage.fileCount'])

    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {} - {} '.format(progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df), \
           api.Message(attributes={'name': 'transformation', 'type': 'DataFrame'}, body=pd.DataFrame(transform_data))
def process(msg):
    words = msg.body
    att_dict = msg.attributes
    logger, log_stream = slog.set_logging('word_regex_cleansing', api.config.debug_mode)
    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    if isinstance(words[0], list):
        words = [w[0] for w in words]

    regex_patterns = tfp.read_list(api.config.patterns)
    logger.info('Test mode: {}'.format(api.config.test_mode))
    logger.info('Number of words to cleanse: {}'.format(len(words)))

    word_type = tfp.read_value(api.config.word_type)
    if len(word_type) > 1:
        logger.warning('Only one word type can be processed. Take first one only: {}'.format(word_type[0]))

    count = 0
    for ipat, pat in enumerate(regex_patterns):
        if pat == '':
            logger.warning('Empty pattern')
            continue
        cleansing_words = [w for w in words if re.match(pat, w)]
        logger.info('Execute pattern: {} ({}/{})'.format(pat, ipat, len(regex_patterns)))
        logger.info('Number of DELETE statements: {}'.format(len(cleansing_words)))
        api.send(outports[0]['name'], log_stream.getvalue())
        log_stream.seek(0)
        log_stream.truncate()
        if not api.config.test_mode:
            for iw, w in enumerate(cleansing_words):
                if word_type:
                    sql = 'DELETE FROM WORD_INDEX WHERE WORD = \'' + w + '\' AND WORD_TYPE = \'' + word_type + '\';'
                else:
                    sql = 'DELETE FROM WORD_INDEX WHERE WORD = \'' + w + '\';'
                att_dict['message.indexBatch'] = count
                att_dict['message.lastBatch'] = False
                api.send(outports[1]['name'], api.Message(attributes=att_dict, body=sql))
                count += 1

    # final marker message signalling the last batch
    sql = 'SELECT * FROM DUMMY;'
    att_dict['message.indexBatch'] = count
    att_dict['message.lastBatch'] = True
    api.send(outports[1]['name'], api.Message(attributes=att_dict, body=sql))
    api.send(outports[0]['name'], log_stream.getvalue())
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()
    att_dict['operator'] = 'filter_by_population'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode:
        logger.setLevel('DEBUG')
    time_monitor = tp.progress()
    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    df = msg.body
    columns = tfp.read_list(api.config.columns, df.columns, test_number=False)
    info_only = api.config.info_only
    threshold = api.config.threshold

    transform_data = {'column': [], 'dtype': [], 'unique_vals': [], 'action': []}
    for col in columns:
        population = df[col].count() / df.shape[0]
        unique_vals = df[col].unique()
        if population < threshold and not (len(unique_vals) == 1 and np.isnan(unique_vals[0])):
            transform_data['column'].append(col)
            transform_data['dtype'].append(df[col].dtype)
            transform_data['unique_vals'].append(unique_vals)
            transform_data['action'].append('drop')
            if not info_only:
                df.drop(columns=[col], inplace=True)

    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))

    return log_stream.getvalue(), api.Message(attributes={'name': 'filter_by_population', 'type': 'DataFrame'}, body=df), \
           api.Message(attributes={'name': 'transformation', 'type': 'DataFrame'}, body=pd.DataFrame(transform_data))
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()
    att_dict['operator'] = 'drop_highly_unique'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode:
        logger.setLevel('DEBUG')
    time_monitor = tp.progress()
    logger.debug('Start time: ' + time_monitor.get_start_time())

    df = msg.body
    prev_shape = df.shape
    columns = tfp.read_list(api.config.columns, df.columns, test_number=False)
    info_only = api.config.info_only
    threshold = api.config.threshold

    transform_data = {'column': [], 'dtype': [], 'unique_values': [], 'action': []}
    for col in df[columns].select_dtypes(include='object').columns:
        unique_vals_num = len(df[col].unique())
        frac_unique_vals = unique_vals_num / df.shape[0]
        if frac_unique_vals > threshold:
            transform_data['column'].append(col)
            transform_data['dtype'].append(df[col].dtype)
            transform_data['unique_values'].append(frac_unique_vals)
            transform_data['action'].append('drop')
            if not info_only:
                df.drop(columns=[col], inplace=True)

    logger.info('End of Process: {}'.format(time_monitor.elapsed_time()))

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))
    logger.info('Dropped Columns: {}'.format(prev_shape[1] - df.shape[1]))

    return log_stream.getvalue(), api.Message(attributes={'name': 'drop_highly_unique', 'type': 'DataFrame'}, body=df), \
           api.Message(attributes={'name': 'transformation', 'type': 'DataFrame'}, body=pd.DataFrame(transform_data))
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()
    att_dict['operator'] = 'dropColumns'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation
    att_dict['config']['drop_columns'] = api.config.drop_columns
    drop_cols = tfp.read_list(api.config.drop_columns, df.columns)
    if drop_cols:
        logger.debug("Drop columns: {}".format(str(drop_cols)))
        df = df.drop(columns=drop_cols)

    att_dict['config']['rename_columns'] = api.config.rename_columns
    map_names = tfp.read_dict(api.config.rename_columns)
    if map_names:
        df.rename(columns=map_names, inplace=True)
    ###### end of doing calculation

    logger.info('End of Process: {}'.format(time_monitor.elapsed_time()))

    # df from body
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))

    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])
        logger.debug('Head data: {}'.format(att_dict['row_' + str(i)]))

    # end custom process definition
    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
def process(msg):
    global blacklist
    global last_msg
    global id_set

    logger, log_stream = slog.set_logging(operator_name, api.config.debug_mode)

    # check if setup is complete
    msg = check_for_setup(logger, msg)
    if not msg:
        api.send(outports[0]['name'], log_stream.getvalue())
        return 0

    logger.info("Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    att_dict = msg.attributes
    df = msg.body

    # word type
    word_types = tfp.read_list(api.config.word_types)
    if not word_types:
        word_types = list(df['type'].unique())

    # language filter
    language_filter = tfp.read_list(api.config.language_filter)
    if not language_filter:
        language_filter = list(df['language'].unique())

    df = df.loc[~(df['type'].isin(word_types) & df['language'].isin(language_filter) & df['word'].isin(blacklist))]

    # test for duplicates
    dup_s = df.duplicated(subset=['text_id', 'language', 'type', 'word']).value_counts()
    num_duplicates = dup_s[True] if True in dup_s else 0
    logger.info('Duplicates: {} / {}'.format(num_duplicates, df.shape[0]))

    logger.info('End process: {}'.format(time_monitor.elapsed_time()))
    api.send(outports[1]['name'], api.Message(attributes=att_dict, body=df))
    api.send(outports[0]['name'], log_stream.getvalue())
def process(msg):
    logger, log_stream = slog.set_logging('DEBUG')

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    att_dict = dict()
    att_dict['config'] = dict()

    ###### start of doing calculation
    att_dict['config']['drop_columns'] = api.config.drop_columns
    drop_cols = tfp.read_list(api.config.drop_columns, df.columns)
    if drop_cols:
        logger.debug("Drop columns: {}".format(str(drop_cols)))
        df = df.drop(columns=drop_cols)

    att_dict['config']['rename_columns'] = api.config.rename_columns
    map_names = tfp.read_dict(api.config.rename_columns)
    if map_names:
        df.rename(columns=map_names, inplace=True)
    ###### end of doing calculation

    ##############################################
    #  final infos to attributes and info message
    ##############################################
    # df from body
    att_dict['operator'] = 'dropColumns'  # name of operator
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['name'] = prev_att['name']
    att_dict['columns'] = list(df.columns)
    att_dict['number_columns'] = df.shape[1]
    att_dict['number_rows'] = df.shape[0]

    example_rows = EXAMPLE_ROWS if att_dict['number_rows'] > EXAMPLE_ROWS else att_dict['number_rows']
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])

    # end custom process definition
    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'drop_duplicates'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    prev_shape = df.shape

    drop_cols_test = tfp.read_list(api.config.columns, df.columns)
    keep = tfp.read_value(api.config.keep, test_number=False)
    df.drop_duplicates(subset=drop_cols_test, keep=keep, inplace=True)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(df.memory_usage(deep=True).sum() / 1024**2))

    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(att_dict['storage.fileIndex'] + 1,
                                                              att_dict['storage.fileCount'])

    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {} - {} '.format(progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()
    att_dict['operator'] = 'drop_1valuecolumns'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    prev_shape = df.shape

    # columns with 1 unique value
    columns = tfp.read_list(api.config.columns, df.columns)
    col1val_data = {'column': [], 'type': [], 'unique_vals': [], 'action': []}
    for col in columns:
        vals = df[col].unique()
        if len(vals) == 1:
            col1val_data['column'].append(col)
            col1val_data['type'].append(str(df[col].dtype))
            col1val_data['unique_vals'].append(vals)
            col1val_data['action'].append('drop')
            if not api.config.info_only:
                df.drop(columns=[col], inplace=True)

    logger.info('End of Process: {}'.format(time_monitor.elapsed_time()))

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))
    logger.info('Dropped columns: {}'.format(prev_shape[1] - df.shape[1]))

    return log_stream.getvalue(), api.Message(attributes={'name': 'drop_1valuecolumns', 'type': 'DataFrame'}, body=df), \
           api.Message(attributes={'name': 'transformation', 'type': 'DataFrame'}, body=pd.DataFrame(col1val_data))
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()
    att_dict['operator'] = 'drop_duplicates'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    prev_shape = df.shape

    drop_cols_test = tfp.read_list(api.config.columns, df.columns)
    keep = tfp.read_value(api.config.keep, test_number=False)
    df.drop_duplicates(subset=drop_cols_test, keep=keep, inplace=True)

    logger.info('End of Process: {}'.format(time_monitor.elapsed_time()))

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))
    logger.info('Dropped duplicates: {}'.format(prev_shape[0] - df.shape[0]))

    return log_stream.getvalue(), api.Message(attributes={'name': 'drop_duplicates', 'type': 'DataFrame'}, body=df)
def process(msg):
    logger, log_stream = slog.set_logging('DEBUG')
    time_monitor = tp.progress()
    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    df = msg.body
    before_num_rows = df.shape[0]
    drop_cols_test = tfp.read_list(api.config.columns, df.columns)
    keep = tfp.read_value(api.config.keep, test_number=False)
    df.drop_duplicates(subset=drop_cols_test, keep=keep, inplace=True)
    logger.debug('Duplicate Rows: {}'.format(before_num_rows - df.shape[0]))

    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())

    return log_stream.getvalue(), api.Message(attributes={'name': 'drop_duplicates', 'type': 'DataFrame'}, body=df)
def process(msg):
    logger, log_stream = slog.set_logging('DEBUG')
    time_monitor = tp.progress()
    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    df = msg.body
    columns = tfp.read_list(api.config.columns, df.columns, test_number=False)
    info_only = api.config.info_only
    threshold = api.config.threshold

    transform_data = {'column': [], 'dtype': [], 'unique_vals': [], 'action': []}
    for col in columns:
        population = df[col].count() / df.shape[0]
        unique_vals = df[col].unique()
        if population < threshold and not (len(unique_vals) == 1 and np.isnan(unique_vals[0])):
            transform_data['column'].append(col)
            transform_data['dtype'].append(df[col].dtype)
            transform_data['unique_vals'].append(unique_vals)
            transform_data['action'].append('drop')
            if not info_only:
                df.drop(columns=[col], inplace=True)

    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())

    return log_stream.getvalue(), api.Message(attributes={'name': 'filter_by_population', 'type': 'DataFrame'}, body=df), \
           api.Message(attributes={'name': 'transformation', 'type': 'DataFrame'}, body=pd.DataFrame(transform_data))
def process(db_msg):
    logger, log_stream = slog.set_logging('topic_identification', loglevel=api.config.debug_mode)
    logger.info("Process started")
    time_monitor = tp.progress()

    columns = [c['name'] for c in db_msg.attributes['table']['columns']]
    df = pd.DataFrame(db_msg.body, columns=columns)

    # language filter
    language_filter = tfp.read_list(api.config.language_filter)
    if language_filter:
        df = df.loc[df["LANGUAGE"].isin(language_filter)]
        logger.info('Languages: {}'.format(language_filter))

    # word type filter
    word_type_filter = tfp.read_value(api.config.word_type_filter)
    if word_type_filter:
        types = [c for c in word_type_filter]
        df = df.loc[df["TYPE"].isin(types)]
        logger.info('Words restricted to types: {}'.format(word_type_filter))

    # group by document and concatenate words
    gdf = df.groupby('HASH_TEXT').agg({"LANGUAGE": 'first',
                                       "WORD": [(lambda x: ' '.join(x)), 'count']})
    gdf.columns = gdf.columns.droplevel(level=0)
    gdf.rename(columns={"first": 'LANGUAGE', "count": 'NUM_WORDS', '<lambda_0>': 'WORDS'}, inplace=True)

    # create document-term matrix - no tokenization or text prep is needed
    tf_vectorizer = CountVectorizer(analyzer='word', min_df=1, lowercase=False, tokenizer=str.split)

    # tf means term frequency in a document, computed per language
    date_today = str(date.today())
    # 2D array with TOPIC, LANGUAGE, TYPE, DATE, EXPIRY_DATE, ATTRIBUTE, KEYWORD_i (number of topics)
    topic_list = list()
    for lang in language_filter:
        lang_gdf = gdf.loc[gdf['LANGUAGE'] == lang]
        logger.info("Language: {}  #articles: {}  av. words/article: {:.1f}".format(
            lang, lang_gdf.shape[0], lang_gdf['NUM_WORDS'].mean()))
        dtm_tf = tf_vectorizer.fit_transform(lang_gdf['WORDS'])  # tf dtm
        lda_tf = LatentDirichletAllocation(n_components=api.config.num_topics,
                                           learning_method='online',
                                           evaluate_every=-1,
                                           n_jobs=-1)
        lda_tf.fit(dtm_tf)
        feature_names = tf_vectorizer.get_feature_names()
        for i, topic in enumerate(lda_tf.components_):
            topic_words = [feature_names[f] for f in topic.argsort()[:-api.config.topic_words - 1:-1]]
            logger.debug('Len: {} topic_words: {}'.format(len(topic_words), topic_words))
            row = [date_today + "-" + str(i), lang, 'ALGO', date_today, None, None] + topic_words
            logger.debug('Len: {} record: {}'.format(len(row), row))
            topic_list.append(row)

    attributes = {
        "table": {
            "columns": [
                {"class": "string", "name": "TOPIC", "nullable": False, "size": 80, "type": {"hana": "NVARCHAR"}},
                {"class": "string", "name": "LANGUAGE", "nullable": False, "size": 2, "type": {"hana": "NVARCHAR"}},
                {"class": "string", "name": "TYPE", "nullable": False, "size": 10, "type": {"hana": "NVARCHAR"}},
                {"class": "string", "name": "DATE", "nullable": True, "type": {"hana": "DATE"}},
                {"class": "string", "name": "EXPIRY_DATE", "nullable": True, "type": {"hana": "DATE"}},
                {"class": "string", "name": "ATTRIBUTE", "nullable": True, "size": 25, "type": {"hana": "NVARCHAR"}}
            ],
            "name": "DIPROJECTS.WORD_INDEX",
            "version": 1
        }
    }
    for i in range(1, api.config.topic_words + 1):
        attributes['table']['columns'].append(
            {"class": "string", "name": "KEYWORD_" + str(i), "nullable": True, "size": 80,
             "type": {"hana": "NVARCHAR"}})

    msg = api.Message(attributes=attributes, body=topic_list)
    logger.debug('Process ended, topics processed {}'.format(time_monitor.elapsed_time()))
    api.send(outports[0]['name'], log_stream.getvalue())
    api.send(outports[1]['name'], msg)
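# A minimal, self-contained sketch of the document-term-matrix plus LDA step used
# above. The corpus and parameter values are illustrative; the operator reads
# num_topics and topic_words from its configuration. Note that sklearn >= 1.2
# renames get_feature_names() to get_feature_names_out().
#
#     from sklearn.feature_extraction.text import CountVectorizer
#     from sklearn.decomposition import LatentDirichletAllocation
#
#     docs = ['cat dog cat', 'dog mouse', 'stock market price', 'market price index']
#
#     # pre-tokenized input: split on whitespace only, keep case, as above
#     tf_vectorizer = CountVectorizer(analyzer='word', min_df=1, lowercase=False, tokenizer=str.split)
#     dtm_tf = tf_vectorizer.fit_transform(docs)
#
#     lda_tf = LatentDirichletAllocation(n_components=2, learning_method='online',
#                                        evaluate_every=-1, n_jobs=-1)
#     lda_tf.fit(dtm_tf)
#
#     # top 3 words per topic, highest weight first (same argsort slice as the operator)
#     feature_names = tf_vectorizer.get_feature_names()
#     for i, topic in enumerate(lda_tf.components_):
#         print(i, [feature_names[f] for f in topic.argsort()[:-3 - 1:-1]])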
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'lgbm_classifier'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation
    model = LGBMRegressor(
        n_estimators=200,
        learning_rate=0.03,
        num_leaves=32,
        colsample_bytree=0.9497036,
        subsample=0.8715623,
        max_depth=8,
        reg_alpha=0.04,
        reg_lambda=0.073,
        min_split_gain=0.0222415,
        min_child_weight=40)

    train_cols = tfp.read_list(api.config.train_cols, df.columns)
    logger.info('Train columns: {}'.format(train_cols))

    label = tfp.read_value(api.config.label_col)
    logger.info('Label column: {}'.format(label))
    if not label:
        raise ValueError('Label is mandatory')

    # cast categorical dtypes to integer codes
    for c in df[train_cols].select_dtypes(include='category').columns:
        unique_num = len(df[c].unique())
        nan_num = df[c].isna().sum()
        logger.debug('Cast to category - {}: unique {}, nan: {} of {}'.format(c, unique_num, nan_num, df.shape[0]))
        df[c] = df[c].cat.codes
        df[c] = df[c].astype('int32')

    if pd.api.types.is_categorical(df[label]):
        df[label] = df[label].astype('category')
        logger.debug('Cast label to <category>')
        df[label] = df[label].cat.codes
        df[label] = df[label].astype('int32')

    logger.debug('Train with {} features: {}'.format(len(train_cols), train_cols))
    model.fit(df[train_cols], df[label], eval_metric='auc')
    ###### end of doing calculation

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(df.memory_usage(deep=True).sum() / 1024**2))

    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(att_dict['storage.fileIndex'] + 1,
                                                              att_dict['storage.fileCount'])

    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {} - {} '.format(progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
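# The categorical handling above in isolation: a minimal sketch of the
# category-to-integer-codes encoding on a toy DataFrame (column names are
# illustrative, not from the operators above).
#
#     import pandas as pd
#
#     df = pd.DataFrame({'color': pd.Categorical(['red', 'blue', 'red']),
#                        'target': [1, 0, 1]})
#
#     # same encoding as the operator: category -> integer codes -> int32
#     for c in df.select_dtypes(include='category').columns:
#         df[c] = df[c].cat.codes
#         df[c] = df[c].astype('int32')
#
#     print(df.dtypes)  # 'color' is now int32, ready for LightGBM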
def process(left_msg, right_msg):
    att_dict = dict()
    att_dict['config'] = dict()
    att_dict['operator'] = 'join'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode:
        logger.setLevel('DEBUG')

    # start custom process definition
    l_att = left_msg.attributes
    r_att = right_msg.attributes

    if l_att['name'] == r_att['name']:
        att_dict['name'] = l_att['name']
    else:
        att_dict['name'] = l_att['name'] + '-' + r_att['name']

    # read stream from memory
    left_df = left_msg.body
    right_df = right_msg.body

    ###### start of doing calculation
    how = tfp.read_value(api.config.how)

    # merge according to config
    att_dict['config']['on_index'] = api.config.on_index
    if api.config.on_index:
        df = pd.merge(left_df, right_df, how=how, left_index=True, right_index=True)
    elif api.config.left_on and api.config.right_on:
        att_dict['config']['left_on'] = api.config.left_on
        att_dict['config']['right_on'] = api.config.right_on
        left_on_list = tfp.read_list(api.config.left_on)
        right_on_list = tfp.read_list(api.config.right_on)
        left_df.reset_index(inplace=True)
        right_df.reset_index(inplace=True)
        df = pd.merge(left_df, right_df, how=how, left_on=left_on_list, right_on=right_on_list)
        # removing second index - might be a more elegant solution
        if 'index_x' in df.columns:
            df.drop(columns=['index_x'], inplace=True)
    else:
        raise ValueError(
            "Config setting: Either <on_index> or both <left_on> and <right_on> have to be set in order to join the DataFrames")

    att_dict['config']['new_indices'] = api.config.new_indices
    index_list = tfp.read_list(api.config.new_indices)
    if index_list:
        df.set_index(keys=index_list, inplace=True)

    att_dict['config']['drop_columns'] = api.config.drop_columns
    col_list = tfp.read_list(api.config.drop_columns)
    if col_list:
        df.drop(labels=col_list, axis=1, inplace=True)

    ##############################################
    #  final infos to attributes and info message
    ##############################################
    if df.empty:
        raise ValueError('Merged DataFrame is empty')

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))

    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])
        logger.debug('Head data: {}'.format(att_dict['row_' + str(i)]))

    # end custom process definition
    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
def process(msg):
    att_dict = msg.attributes
    global result_df

    att_dict['operator'] = 'fromCSV'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    logger.info('Filename: {} index: {} count: {} endofSeq: {}'.format(
        msg.attributes["storage.filename"],
        msg.attributes["storage.fileIndex"],
        msg.attributes["storage.fileCount"],
        msg.attributes["storage.endOfSequence"]))

    if msg.body is None:
        logger.info('Process ended.')
        msg = api.Message(attributes=att_dict, body=result_df)
        log = log_stream.getvalue()
        return log, msg
    elif isinstance(msg.body, str):
        csv_io = io.StringIO(msg.body)
        logger.debug("Input format: <string>")
    elif isinstance(msg.body, bytes):
        csv_io = io.BytesIO(msg.body)
        logger.debug("Input format: <bytes>")
    elif isinstance(msg.body, io.BytesIO):
        csv_io = msg.body
        logger.debug("Input format: <io.BytesIO>")
    else:
        raise TypeError('Message body has unsupported type: ' + str(type(msg.body)))

    # nrows
    nrows = None
    if not api.config.limit_rows == 0:
        nrows = api.config.limit_rows

    # usecols
    use_cols = tfp.read_list(api.config.use_columns)
    logger.debug('Columns used: {}'.format(use_cols))

    # dtypes mapping
    typemap = tfp.read_dict(api.config.dtypes)
    logger.debug('Type cast: {}'.format(str(typemap)))

    kwargs = tfp.read_dict(text=api.config.keyword_args, map_sep='=')

    ##### Read string from buffer
    logger.debug("Read from input")
    df = pd.read_csv(csv_io, sep=api.config.separator, usecols=use_cols, dtype=typemap,
                     decimal=api.config.decimal, nrows=nrows, **kwargs)

    # data from filename
    if api.config.data_from_filename and not api.config.data_from_filename == 'None':
        col = api.config.data_from_filename.split(':')[0].strip().strip("'").strip('"')
        pat = api.config.data_from_filename.split(':')[1].strip().strip("'").strip('"')
        logger.debug('Filename: {} pattern: {}'.format(att_dict['storage.filename'], pat))
        try:
            # extract group(1) of the configured pattern from the filename
            dataff = re.match(pat, att_dict['storage.filename'])
            df[col] = dataff.group(1)
        except AttributeError:
            raise ValueError('Pattern not found - Filename: {} pattern: {}'.format(
                att_dict['storage.filename'], pat))

    # to datetime
    if api.config.todatetime and not api.config.todatetime == 'None':
        dt_fmt = tfp.read_dict(api.config.todatetime)
        logger.debug('Time conversion {} by using UTC {}'.format(api.config.todatetime, api.config.utc))
        for col, fmt in dt_fmt.items():
            df[col] = pd.to_datetime(df[col], format=fmt, utc=api.config.utc)

    ###### Downcasting
    # save memory footprint for calculating the savings of the downcast
    logger.debug('Memory used before downcast: {}'.format(df.memory_usage(deep=True).sum() / 1024**2))
    if api.config.downcast_int:
        df, dci = downcast(df, 'int', 'unsigned')
    if api.config.downcast_float:
        df, dcf = downcast(df, 'float', 'float')

    # check if index is provided and set it
    index_list = tfp.read_list(api.config.index_cols)
    if index_list:
        df.set_index(index_list, inplace=True)

    if api.config.collect:
        # stores the result in the global variable result_df
        if msg.attributes['storage.fileIndex'] == 0:
            logger.debug('Added to DataFrame: {}'.format(att_dict['storage.filename']))
            result_df = df
        else:
            try:
                result_df = pd.concat([result_df, df], axis=0, sort=False)
            except Exception as e:
                logger.error(str(e))
                result_df = df
    else:
        result_df = df

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(df.memory_usage(deep=True).sum() / 1024**2))

    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'], att_dict['storage.fileCount'])

    logger.debug('Process ended: {} - {} '.format(progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
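# The fromCSV operator calls a downcast helper that is not part of this listing.
# A plausible sketch under the assumption that it wraps pd.to_numeric and reports
# the memory saved; the signature is inferred from the call sites
# df, dci = downcast(df, 'int', 'unsigned') and df, dcf = downcast(df, 'float', 'float').
#
#     import pandas as pd
#
#     def downcast(df, col_type, downcast_to):
#         """Downcast all columns of the given kind ('int' or 'float') to the
#         smallest dtype pd.to_numeric finds for `downcast_to` ('unsigned' or
#         'float'). Returns the DataFrame and the memory saved in MB."""
#         mem_before = df.memory_usage(deep=True).sum() / 1024**2
#         kind = 'integer' if col_type == 'int' else 'floating'
#         for col in df.select_dtypes(include=[kind]).columns:
#             df[col] = pd.to_numeric(df[col], downcast=downcast_to)
#         mem_after = df.memory_usage(deep=True).sum() / 1024**2
#         return df, mem_before - mem_after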
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()
    att_dict['operator'] = 'anonymizeData'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode:
        logger.setLevel('DEBUG')
    time_monitor = tp.progress()
    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation
    model = LGBMRegressor(
        n_estimators=200,
        learning_rate=0.03,
        num_leaves=32,
        colsample_bytree=0.9497036,
        subsample=0.8715623,
        max_depth=8,
        reg_alpha=0.04,
        reg_lambda=0.073,
        min_split_gain=0.0222415,
        min_child_weight=40)

    att_dict['config']['train columns'] = api.config.train_cols
    train_cols = tfp.read_list(api.config.train_cols, df.columns)

    att_dict['config']['label'] = api.config.label
    label = tfp.read_value(api.config.label)
    if not label:
        raise ValueError('Label is mandatory')

    # cast categorical dtypes to integer codes
    for c in df[train_cols].select_dtypes(include='category').columns:
        unique_num = len(df[c].unique())
        nan_num = df[c].isna().sum()
        logger.debug('Cast to category - {}: unique {}, nan: {} of {}'.format(c, unique_num, nan_num, df.shape[0]))
        df[c] = df[c].cat.codes
        df[c] = df[c].astype('int32')

    if pd.api.types.is_categorical(df[label]):
        df[label] = df[label].astype('category')
        logger.debug('Cast label to <category>')
        df[label] = df[label].cat.codes
        df[label] = df[label].astype('int32')

    logger.debug('Train with {} features: {}'.format(len(train_cols), train_cols))
    model.fit(df[train_cols], df[label], eval_metric='auc')
    ###### end of doing calculation

    ##############################################
    #  final infos to attributes and info message
    ##############################################
    if df.empty:
        raise ValueError('DataFrame is empty')

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))

    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=model)
def process(left_msg, right_msg):
    att_dict = left_msg.attributes
    att_dict['operator'] = 'join'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    l_att = left_msg.attributes
    r_att = right_msg.attributes

    # read stream from memory
    left_df = left_msg.body
    right_df = right_msg.body

    ###### start of doing calculation
    how = tfp.read_value(api.config.how)

    # merge according to config
    if api.config.on_index:
        df = pd.merge(left_df, right_df, how=how, left_index=True, right_index=True)
    elif api.config.left_on and api.config.right_on:
        left_on_list = tfp.read_list(api.config.left_on)
        right_on_list = tfp.read_list(api.config.right_on)
        logger.info('Join DataFrames on {} - {}'.format(left_on_list, right_on_list))
        left_df.reset_index(inplace=True)
        right_df.reset_index(inplace=True)
        df = pd.merge(left_df, right_df, how=how, left_on=left_on_list, right_on=right_on_list)
        # removing second index - might be a more elegant solution
        if 'index_x' in df.columns:
            df.drop(columns=['index_x'], inplace=True)
    else:
        raise ValueError(
            "Config setting: Either <on_index> or both <left_on> and <right_on> have to be set in order to join the DataFrames")

    index_list = tfp.read_list(api.config.new_indices)
    if index_list:
        df.set_index(keys=index_list, inplace=True)
        logger.info('Set index: {}'.format(index_list))

    col_list = tfp.read_list(api.config.drop_columns)
    if col_list:
        df.drop(labels=col_list, axis=1, inplace=True)
        logger.info('Drop columns: {}'.format(col_list))

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(df.memory_usage(deep=True).sum() / 1024**2))

    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'], att_dict['storage.fileCount'])

    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {} - {} '.format(progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
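# For reference, a minimal sketch of the two merge paths used by the join
# operators above (frames and key names are illustrative):
#
#     import pandas as pd
#
#     left = pd.DataFrame({'id': [1, 2], 'a': ['x', 'y']})
#     right = pd.DataFrame({'id': [1, 2], 'b': [10, 20]})
#
#     # path 1: merge on the index (api.config.on_index)
#     df_idx = pd.merge(left.set_index('id'), right.set_index('id'),
#                       how='inner', left_index=True, right_index=True)
#
#     # path 2: merge on explicit key columns (left_on / right_on)
#     df_keys = pd.merge(left, right, how='inner', left_on=['id'], right_on=['id'])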