def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()
    att_dict['operator'] = 'drop_duplicates'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode == True:
        logger.setLevel('DEBUG')
    time_monitor = tp.progress()
    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    df = msg.body
    before_num_rows = df.shape[0]
    drop_cols_test = tfp.read_list(api.config.columns, df.columns)
    keep = tfp.read_value(api.config.keep, test_number=False)
    df.drop_duplicates(subset=drop_cols_test, keep=keep, inplace=True)
    logger.debug('Duplicate Rows: {}'.format(before_num_rows - df.shape[0]))

    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))

    return log_stream.getvalue(), api.Message(attributes={'name': 'drop_duplicates', 'type': 'DataFrame'}, body=df)
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'keyword_search'
    logger, log_stream = slog.set_logging(att_dict['operator'], loglevel=api.config.debug_mode)
    logger.info("Process started")
    time_monitor = tp.progress()

    adict = msg.body

    global word_count
    for a in adict:
        cw = Counter(a['words'])
        word_count = word_count.add(pd.Series(cw), fill_value=0)
    word_count.sort_values(ascending=False, inplace=True)

    msg, progress_str = create_msg(attributes=att_dict, body=word_count.to_dict(), collect=api.config.collect)
    if msg:
        word_count.sort_values(ascending=False, inplace=True)
        msg.body = word_count.head(api.config.num_words).to_dict()
        api.send(outports[1]['name'], msg)

    logger.debug('Process ended, articles processed {} - {} '.format(progress_str, time_monitor.elapsed_time()))
    api.send(outports[0]['name'], log_stream.getvalue())
def process(msg):
    logger, log_stream = slog.set_logging('topic dispatcher', loglevel=api.config.debug_mode)
    logger.info("Process started")
    time_monitor = tp.progress()

    table = tfp.read_value(api.config.topic_table)
    column = tfp.read_value(api.config.table_colum)

    topics = msg.body
    for t in topics:
        topic_keywords = ["'" + t[i] + "'" for i in range(6, len(t)) if not t[i] == '']
        sql = 'SELECT * FROM ' + table + ' WHERE ' + column + ' IN(' + ','.join(topic_keywords) + ')'
        att_dict = {'topic': t[0], 'keywords': topic_keywords}
        sql_msg = api.Message(attributes=att_dict, body=sql)
        api.send(outports[1]['name'], sql_msg)
        logger.debug('Send sql: {}'.format(sql))

    logger.debug('Process ended, topics processed {} - {} '.format(len(topics), time_monitor.elapsed_time()))
    api.send(outports[0]['name'], log_stream.getvalue())
def process(msg):
    logger, log_stream = slog.set_logging('DEBUG')
    time_monitor = tp.progress()
    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    df = msg.body
    columns = tfp.read_list(api.config.columns, df.columns, test_number=False)
    info_only = api.config.info_only
    threshold = api.config.threshold

    transform_data = {'column': [], 'dtype': [], 'unique_values': [], 'action': []}
    for col in df[columns].select_dtypes(np.object):
        unique_vals_num = len(df[col].unique())
        frac_unique_vals = unique_vals_num / df.shape[0]
        if frac_unique_vals > threshold:
            transform_data['column'].append(col)
            transform_data['dtype'].append(df[col].dtype)
            transform_data['unique_values'].append(frac_unique_vals)
            transform_data['action'].append('drop')
            if info_only == False:
                df.drop(columns=[col], inplace=True)

    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())

    return log_stream.getvalue(), \
           api.Message(attributes={'name': 'filter_by_population', 'type': 'DataFrame'}, body=df), \
           api.Message(attributes={'name': 'transformation', 'type': 'DataFrame'}, body=pd.DataFrame(transform_data))
def process(msg):
    logger, log_stream = slog.set_logging('word_indexing', loglevel=api.config.debug_mode)
    logger.info("Process started")
    time_monitor = tp.progress()

    articles = msg.body
    word_index = list()  # as table
    for article in articles:
        word_index.extend([[article[0], article[1], article[2], w[0], w[1]] for w in article[3]])

    att_dict = msg.attributes
    att_dict['table'] = {"columns": [
        {"class": "string", "name": "HASH_TEXT", "nullable": True, "type": {"hana": "INTEGER"}},
        {"class": "string", "name": "LANGUAGE", "nullable": True, "size": 2, "type": {"hana": "NVARCHAR"}},
        {"class": "string", "name": "TYPE", "nullable": True, "size": 1, "type": {"hana": "NVARCHAR"}},
        {"class": "string", "name": "WORD", "nullable": True, "size": 80, "type": {"hana": "NVARCHAR"}},
        {"class": "string", "name": "COUNT", "nullable": True, "type": {"hana": "INTEGER"}}],
        "name": "DIPROJECTS.WORD_INDEX", "version": 1}

    logger.debug('Process ended, articles processed {} - {} '.format(len(articles), time_monitor.elapsed_time()))
    api.send(outports[0]['name'], log_stream.getvalue())

    msg = api.Message(attributes=att_dict, body=word_index)
    api.send(outports[1]['name'], msg)
def process():
    operator_name = 'sql_word_index'
    logger, log_stream = slog.set_logging(operator_name, api.config.debug_mode)
    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    language = tfp.read_value(api.config.language)
    type_limit = tfp.read_dict(api.config.type_limit_map)
    table_name = tfp.read_value(api.config.table_name)
    text_id_col = tfp.read_value(api.config.text_id_col)

    for i, [wtype, limit] in enumerate(type_limit.items()):
        sql_s = "SELECT {tid}, \"{tn}\".LANGUAGE, \"{tn}\".TYPE, \"{tn}\".WORD, COUNT FROM \"{tn}\" INNER JOIN "\
                "(SELECT WORD, TYPE, LANGUAGE, SUM(COUNT) as CUMS FROM \"{tn}\" "\
                "WHERE LANGUAGE = \'{lang}\' AND TYPE = \'{wt}\' "\
                "GROUP BY WORD, TYPE, LANGUAGE) AS CTABLE ON "\
                "\"{tn}\".WORD = CTABLE.WORD AND \"{tn}\".TYPE = CTABLE.TYPE AND \"{tn}\".LANGUAGE = CTABLE.LANGUAGE "\
                "WHERE CUMS >= {lt}".format(tid=text_id_col, tn=table_name, lang=language, wt=wtype, lt=limit)
        lastbatch = True if len(type_limit) == i + 1 else False
        att_dict = {'operator': operator_name,
                    'parameter': {'type': wtype, 'limit': limit, 'language': language},
                    'message.batchIndex': i, 'message.batchSize': len(type_limit), 'message.lastBatch': lastbatch}
        msg = api.Message(attributes=att_dict, body=sql_s)
        api.send(outports[1]['name'], msg)

    api.send(outports[0]['name'], log_stream.getvalue())
def process(msg):
    att_dict = msg.attributes

    logger, log_stream = slog.set_logging('word_regex', api.config.debug_mode)
    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    df = msg.body
    if not isinstance(df, pd.DataFrame) or df.empty:
        logger.warning('Empty dataframe, no output sent!')
        api.send(outports[0]['name'], log_stream.getvalue())
        api.send(outports[2]['name'], api.Message(attributes=att_dict, body=df))
        return 0

    df['count'] = df['count'].astype('int32')

    # word type
    word_types = tfp.read_list(api.config.word_types)
    if word_types:
        df = df.loc[df['type'].isin(word_types)]

    # Language filter
    language_filter = tfp.read_list(api.config.language_filter)
    if language_filter:
        df = df.loc[df['language'].isin(language_filter)]

    df = df.groupby(['language', 'type', 'word'])['count'].agg('sum').reset_index()

    api.send(outports[1]['name'], api.Message(attributes=att_dict, body=df))
    api.send(outports[0]['name'], log_stream.getvalue())
def process(msg):
    logger, log_stream = slog.set_logging('DEBUG')
    time_monitor = tp.progress()
    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    df = msg.body

    # Columns with 1 unique value
    columns = tfp.read_list(api.config.columns, df.columns)
    col1val_data = {'column': [], 'type': [], 'unique_vals': [], 'action': []}
    for col in columns:
        vals = df[col].unique()
        if len(vals) == 1:
            col1val_data['column'].append(col)
            col1val_data['type'].append(str(df[col].dtype))
            col1val_data['unique_vals'].append(vals)
            col1val_data['action'].append('drop')
            if not api.config.info_only:
                df.drop(columns=[col], inplace=True)

    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())

    return log_stream.getvalue(), \
           api.Message(attributes={'name': 'drop_duplicates', 'type': 'DataFrame'}, body=df), \
           api.Message(attributes={'name': 'transformation', 'type': 'DataFrame'}, body=pd.DataFrame(col1val_data))
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()
    att_dict['operator'] = 'setValue'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation

    # map_values : column1: {from_value: to_value}, column2: {from_value: to_value}
    att_dict['config']['set_value'] = api.config.map_values
    maps_map = tfp.read_dict_of_dict(api.config.map_values)
    df.replace(maps_map, inplace=True)

    # Fill NaN value : column1: value, column2: value,
    att_dict['config']['fill_nan_values'] = api.config.fill_nan_values
    map_dict = tfp.read_dict(api.config.fill_nan_values)
    if map_dict:
        df.fillna(map_dict, inplace=True)

    ##############################################
    # final infos to attributes and info message
    ##############################################
    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.info('End of Process: {}'.format(time_monitor.elapsed_time()))
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))

    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])
        logger.debug('Head data: {}'.format(att_dict['row_' + str(i)]))

    # end custom process definition
    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'setValue'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation

    # map_values : column1: {from_value: to_value}, column2: {from_value: to_value}
    maps_map = tfp.read_dict_of_dict(api.config.map_values)
    df.replace(maps_map, inplace=True)
    logger.info('Replace values: {}'.format(maps_map))

    # Fill NaN value : column1: value, column2: value,
    map_dict = tfp.read_dict(api.config.fill_nan_values)
    if map_dict:
        df.fillna(map_dict, inplace=True)
        logger.info('Fill nan values: {}'.format(map_dict))

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(att_dict['storage.fileIndex'] + 1,
                                                              att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {} - {} '.format(progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'drop_highly_unique'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode == True:
        logger.setLevel('DEBUG')
    time_monitor = tp.progress()
    logger.debug('Start time: ' + time_monitor.get_start_time())

    df = msg.body
    columns = tfp.read_list(api.config.columns, df.columns, test_number=False)
    info_only = api.config.info_only
    threshold = api.config.threshold

    transform_data = {'column': [], 'dtype': [], 'unique_values': [], 'action': []}
    for col in df[columns].select_dtypes(np.object).columns:
        unique_vals_num = len(df[col].unique())
        frac_unique_vals = unique_vals_num / df.shape[0]
        if frac_unique_vals > threshold:
            transform_data['column'].append(col)
            transform_data['dtype'].append(df[col].dtype)
            transform_data['unique_values'].append(frac_unique_vals)
            transform_data['action'].append('drop')
            if info_only == False:
                df.drop(columns=[col], inplace=True)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'], att_dict['storage.fileCount'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {} - {} '.format(progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), \
           api.Message(attributes=att_dict, body=df), \
           api.Message(attributes={'name': 'transformation', 'type': 'DataFrame'}, body=pd.DataFrame(transform_data))
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'drop_1valuecolumns'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    prev_shape = df.shape

    # Columns with 1 unique value
    columns = tfp.read_list(api.config.columns, df.columns)
    transform_data = {'column': [], 'type': [], 'unique_vals': [], 'action': []}
    for col in columns:
        vals = df[col].unique()
        if len(vals) == 1:
            transform_data['column'].append(col)
            transform_data['type'].append(str(df[col].dtype))
            transform_data['unique_vals'].append(vals)
            transform_data['action'].append('drop')
            if not api.config.info_only:
                df.drop(columns=[col], inplace=True)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'], att_dict['storage.fileCount'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {} - {} '.format(progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), \
           api.Message(attributes=att_dict, body=df), \
           api.Message(attributes={'name': 'transformation', 'type': 'DataFrame'}, body=pd.DataFrame(transform_data))
def process(msg):
    global setup_data
    global last_msg
    global hash_text_list

    operator_name = 'sentiment analysis'
    logger, log_stream = slog.set_logging(operator_name, loglevel=api.config.debug_mode)
    logger.info("Process started")
    time_monitor = tp.progress()

    if api.config.debug_mode:
        api.send(outports[0]['name'], log_stream.getvalue())

    article_list = msg.body
    att_dict = msg.attributes
    att_dict['operator'] = operator_name

    sentiments_list = list()
    sentiments_table = list()
    media_set = set()
    for article in article_list:
        media_set.add(article['media'])
        # Ensure that text only analysed once
        if article['hash_text'] in hash_text_list:
            continue
        hash_text_list.append(article['hash_text'])
        if not language_dict[article['media']] in supported_languages:
            continue
        polarity, subjectivity = get_article_sentiment(article)
        sentiments_list.append({'HASH_TEXT': article['hash_text'], 'POLARITY': polarity, 'SUBJECTIVITY': subjectivity})
        sentiments_table.append([article['hash_text'], polarity, subjectivity])

    logger.debug('Process ended, analysed media: {} - article sentiments analysed {} - {}'.format(
        str(media_set), len(sentiments_list), time_monitor.elapsed_time()))

    table_att = {"columns": [
        {"class": "string", "name": "HASH_TEXT", "nullable": False, "type": {"hana": "INTEGER"}},
        {"class": "string", "name": "POLARITY", "nullable": True, "type": {"hana": "DOUBLE"}},
        {"class": "string", "name": "SUBJECTIVITY", "nullable": True, "type": {"hana": "DOUBLE"}}],
        "name": "DIPROJECTS.SENTIMENTS", "version": 1}

    api.send(outports[0]['name'], log_stream.getvalue())

    if len(sentiments_list):
        logger.debug("First Record: {}".format(str(sentiments_list[0])))
        api.send(outports[2]['name'], api.Message(attributes=att_dict, body=sentiments_list))

    att_dict['table'] = table_att
    if len(sentiments_table):
        logger.debug("First Record: {}".format(str(sentiments_table[0])))
        msg = api.Message(attributes=att_dict, body=sentiments_table)
        api.send(outports[1]['name'], msg)
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'groupby'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    prev_att = msg.attributes
    df = msg.body
    prev_shape = df.shape

    ###### start of doing calculation

    # groupby list
    cols = tfp.read_list(api.config.groupby)

    # mapping aggregation
    try:
        colagg = tfp.read_dict(api.config.aggregation)
    except IndexError:
        logger.info('Aggregation is not a map, try to parse a value instead')
        colagg = tfp.read_value(api.config.aggregation)

    # groupby
    logger.info('Group columns: {}'.format(cols))
    logger.info('Aggregation: {}'.format(colagg))
    logger.info('Index: {}'.format(api.config.index))
    df = df.groupby(cols, as_index=api.config.index).agg(colagg)

    # drop col
    dropcols = tfp.read_list(api.config.drop_columns)
    if dropcols:
        logger.info('Drop columns: {}'.format(dropcols))
        df.drop(columns=dropcols, inplace=True)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'], att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {} - {} '.format(progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
def process(db_msg):
    logger, log_stream = slog.set_logging('topic dispatcher', loglevel=api.config.debug_mode)
    logger.info("Process started")
    time_monitor = tp.progress()

    columns = [c['name'] for c in db_msg.attributes['table']['columns']]
    df = pd.DataFrame(db_msg.body, columns=columns)

    # groupby and concatenate words
    gdf = df.groupby(['HASH_TEXT'])['WORD'].apply(' '.join)

    # create document-term matrix
    # tf_vectorizer = CountVectorizer(analyzer='word',
    #                                 min_df=1,               # minimum reqd occurences of a word
    #                                 # stop_words='german',  # remove stop words
    #                                 lowercase=False,        # convert all words to lowercase
    #                                 # token_pattern='[a-zA-Z0-9]{1,}',  # num chars > 3
    #                                 # max_features=5000,    # max number of uniq words
    #                                 )

    # tf means term-frequency in a document
    # dtm_tf = tf_vectorizer.fit_transform(gdf)

    # for tf dtm
    # lda_tf = LatentDirichletAllocation(n_components=30, learning_method='online', evaluate_every=-1, n_jobs=-1)
    # lda_tf.fit(dtm_tf)

    # get the first 10 keywords of each topic
    # feature_names = tf_vectorizer.get_feature_names()

    # topics can be extracted from components_
    # date_today = str(date.today()) + '-'
    # for topic_ii, topic in enumerate(lda_tf.components_):
    #     topic_id = str(date.today()) + '-' + str(topic_ii)
    #     language = 'DE'
    #     topic_type = 'LDA'
    #     topic_date = str(date.today())
    #     experiy_date = None
    #     attribute = None
    #     topic_words = [feature_names[ii] for ii in topic.argsort()[:-11:-1]]
    #     row = [topic_id, language, topic_type, topic_date, experiy_date, attribute]
    #     row.extend(topic_words)
    #     topic_list.append(row)

    # topic_np = np.array(topic_list, dtype='object')
    # col_names = ['TOPIC', 'LANGUAGE', 'TYPE', 'DATE', 'EXPERIY_DATE', 'ATTRIBUTE']
    # for ii in range(1, 11):
    #     col_names.append(f'KEYWORD_{ii}')
    # self.topic_df = pd.DataFrame(topic_np, columns=col_names)

    logger.debug('Process ended, topics processed {}'.format(time_monitor.elapsed_time()))
    api.send(outports[0]['name'], log_stream.getvalue())
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'filter_by_population'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    prev_cols = df.shape[1]

    columns = tfp.read_list(api.config.columns, df.columns, test_number=False)
    info_only = api.config.info_only
    threshold = api.config.threshold
    logger.debug('Parameter Threshold: {} Data Modification: {} '.format(threshold, info_only))

    transform_data = {'column': [], 'dtype': [], 'unique_vals': [], 'action': []}
    for col in columns:
        population = df[col].count() / df.shape[0]
        unique_vals = df[col].unique()
        if population < threshold and not (len(unique_vals) == 1 and np.isnan(unique_vals[0])):
            unique_vals = df[col].unique()
            transform_data['column'].append(col)
            transform_data['dtype'].append(df[col].dtype)
            transform_data['unique_vals'].append(unique_vals)
            transform_data['action'].append('drop')
            if not info_only:
                df.drop(columns=[col], inplace=True)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(att_dict['storage.fileIndex'] + 1,
                                                              att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {} - {} '.format(progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), \
           api.Message(attributes=att_dict, body=df), \
           api.Message(attributes={'name': 'transformation', 'type': 'DataFrame'}, body=pd.DataFrame(transform_data))
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()
    att_dict['operator'] = 'filter_by_population'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode == True:
        logger.setLevel('DEBUG')
    time_monitor = tp.progress()
    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    df = msg.body
    columns = tfp.read_list(api.config.columns, df.columns, test_number=False)
    info_only = api.config.info_only
    threshold = api.config.threshold

    transform_data = {'column': [], 'dtype': [], 'unique_vals': [], 'action': []}
    for col in columns:
        population = df[col].count() / df.shape[0]
        unique_vals = df[col].unique()
        if population < threshold and not (len(unique_vals) == 1 and np.isnan(unique_vals[0])):
            unique_vals = df[col].unique()
            transform_data['column'].append(col)
            transform_data['dtype'].append(df[col].dtype)
            transform_data['unique_vals'].append(unique_vals)
            transform_data['action'].append('drop')
            if not info_only:
                df.drop(columns=[col], inplace=True)

    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))

    return log_stream.getvalue(), \
           api.Message(attributes={'name': 'filter_by_population', 'type': 'DataFrame'}, body=df), \
           api.Message(attributes={'name': 'transformation', 'type': 'DataFrame'}, body=pd.DataFrame(transform_data))
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()
    att_dict['operator'] = 'toCSV'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    df = msg.body
    if api.config.reset_index:
        logger.debug('Reset Index')
        df = df.reset_index()

    kwargs = tfp.read_dict(text=api.config.keyword_args, map_sep='=')
    if not kwargs == None:
        data_str = df.to_csv(sep=api.config.separator, index=api.config.write_index, **kwargs)
    else:
        data_str = df.to_csv(sep=api.config.separator, index=api.config.write_index)

    # end custom process definition
    logger.info('End of Process: {}'.format(time_monitor.elapsed_time()))

    # create dict of columns and types for HANA
    map_hana = {'int8': 'TINYINT', 'int16': 'SMALLINT', 'int32': 'INTEGER', 'int64': 'BIGINT',
                'float32': 'FLOAT', 'float64': 'DOUBLE', 'object': 'VARCHAR', 'datetime64': 'TIMESTAMP'}
    col_dict = {c: str(df[c].dtype) for c in df.columns}
    hana_table_dict = list()
    for c, ty in col_dict.items():
        if ty == 'object':
            size = df[c].str.len().max()
            hana_table_dict.append({'name': c, 'type': map_hana[col_dict[c]], 'size': size})
        elif 'datetime64' in ty:
            hana_table_dict.append({'name': c, 'type': 'TIMESTAMP'})
        else:
            hana_table_dict.append({'name': c, 'type': map_hana[col_dict[c]]})
    logger.info('For Hana table definition: {}'.format(hana_table_dict))

    log = log_stream.getvalue()
    return log, data_str
def process(msg):
    words = msg.body
    att_dict = msg.attributes

    logger, log_stream = slog.set_logging('word_regex_cleansing', api.config.debug_mode)
    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    if isinstance(words[0], list):
        words = [w[0] for w in words]

    regex_patterns = tfp.read_list(api.config.patterns)
    logger.info('Test mode: {}'.format(api.config.test_mode))
    logger.info('Number of words to cleanse: {}'.format(len(words)))

    word_type = tfp.read_value(api.config.word_type)
    if word_type and len(word_type) > 1:
        logger.warning('Only one word type can be processed. Take first one only: {}'.format(word_type[0]))

    count = 0
    for ipat, pat in enumerate(regex_patterns):
        if pat == '':
            logger.warning('Empty pattern')
            continue
        cleansing_words = [w for w in words if re.match(pat, w)]
        logger.info('Execute pattern: {} ({}/{})'.format(pat, ipat, len(regex_patterns)))
        logger.info('Number of DELETE statements: {}'.format(len(cleansing_words)))
        api.send(outports[0]['name'], log_stream.getvalue())
        log_stream.seek(0)
        log_stream.truncate()
        if not api.config.test_mode:
            for iw, w in enumerate(cleansing_words):
                if word_type:
                    sql = 'DELETE FROM WORD_INDEX WHERE WORD = \'' + w + '\' AND WORD_TYPE = \'' + word_type + '\';'
                else:
                    sql = 'DELETE FROM WORD_INDEX WHERE WORD = \'' + w + '\';'
                att_dict['message.indexBatch'] = count
                att_dict['message.lastBatch'] = False
                api.send(outports[1]['name'], api.Message(attributes=att_dict, body=sql))
                count += 1

    sql = 'SELECT * FROM DUMMY;'
    att_dict['message.indexBatch'] = count
    att_dict['message.lastBatch'] = True
    api.send(outports[1]['name'], api.Message(attributes=att_dict, body=sql))
    api.send(outports[0]['name'], log_stream.getvalue())
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'castColumns'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    df = msg.body
    castmap = tfp.read_dict(api.config.cast)
    if castmap:
        for col, casttype in castmap.items():
            if api.config.round:
                df[col] = df[col].round()
            df[col] = df[col].astype(casttype)
    ###### end calculation

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(att_dict['storage.fileIndex'] + 1,
                                                              att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {} - {} '.format(progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()
    att_dict['operator'] = 'drop_highly_unique'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode == True:
        logger.setLevel('DEBUG')
    time_monitor = tp.progress()
    logger.debug('Start time: ' + time_monitor.get_start_time())

    df = msg.body
    prev_shape = df.shape
    columns = tfp.read_list(api.config.columns, df.columns, test_number=False)
    info_only = api.config.info_only
    threshold = api.config.threshold

    transform_data = {'column': [], 'dtype': [], 'unique_values': [], 'action': []}
    for col in df[columns].select_dtypes(np.object).columns:
        unique_vals_num = len(df[col].unique())
        frac_unique_vals = unique_vals_num / df.shape[0]
        if frac_unique_vals > threshold:
            transform_data['column'].append(col)
            transform_data['dtype'].append(df[col].dtype)
            transform_data['unique_values'].append(frac_unique_vals)
            transform_data['action'].append('drop')
            if info_only == False:
                df.drop(columns=[col], inplace=True)

    logger.info('End of Process: {}'.format(time_monitor.elapsed_time()))
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))
    logger.info('Dropped Columns: {}'.format(prev_shape[1] - df.shape[1]))

    return log_stream.getvalue(), \
           api.Message(attributes={'name': 'filter_by_population', 'type': 'DataFrame'}, body=df), \
           api.Message(attributes={'name': 'transformation', 'type': 'DataFrame'}, body=pd.DataFrame(transform_data))
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()
    att_dict['operator'] = 'dropColumns'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation
    att_dict['config']['drop_columns'] = api.config.drop_columns
    drop_cols = tfp.read_list(api.config.drop_columns, df.columns)
    if drop_cols:
        logger.debug("Drops columns: {}".format(str(drop_cols)))
        df = df.drop(columns=drop_cols)

    att_dict['config']['rename_columns'] = api.config.rename_columns
    map_names = tfp.read_dict(api.config.rename_columns)
    if map_names:
        df.rename(columns=map_names, inplace=True)
    ###### end of doing calculation

    logger.info('End of Process: {}'.format(time_monitor.elapsed_time()))

    # df from body
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])
        logger.debug('Head data: {}'.format(att_dict['row_' + str(i)]))

    # end custom process definition
    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
def process(msg):
    att_dict = msg.attributes

    logger, log_stream = slog.set_logging('word_index_regex', api.config.debug_mode)
    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    # regex patterns
    regex_patterns = tfp.read_list(api.config.patterns)

    # word type
    word_types = tfp.read_list(api.config.word_types)
    if not word_types:
        logger.warning('Word types had to be defined. Default word type : \'PROPN\'')
        word_types = ['PROPN']

    # pandas Dataframe and select only values with right word_type
    cols = [c["name"] for c in msg.attributes['table']['columns']]
    df = pd.DataFrame(msg.body, columns=cols)
    df_p = df.loc[df['TYPE'].isin(word_types)]

    # Language filter
    language_filter = tfp.read_list(api.config.language_filter)
    if language_filter:
        df_p = df_p.loc[df['LANGUAGE'].isin(language_filter)]

    # get unique words to get words that comply with regex
    words = df_p['WORD'].unique()
    logger.info('Number of words to test with regex pattern: {}'.format(len(words)))

    for ipat, pat in enumerate(regex_patterns):
        if pat == '':
            logger.warning('Empty pattern')
            continue
        logger.info('Execute pattern: {} ({}/{})'.format(pat, ipat, len(regex_patterns)))
        cleansing_words = [w for w in words if re.match(pat, w)]
        df = df.loc[~df['WORD'].isin(cleansing_words)]

    api.send(outports[1]['name'], api.Message(attributes=att_dict, body=df.values.tolist()))
    api.send(outports[0]['name'], log_stream.getvalue())
def process(msg):
    global last_msg
    global hash_list
    global lexicon_stem, lexicon

    logger, log_stream = slog.set_logging(operator_name, api.config.debug_mode)

    # Check if setup complete
    msg = check_for_setup(logger, msg)
    if not msg:
        api.send(outports[0]['name'], log_stream.getvalue())
        return 0

    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    att_dict = msg.attributes

    # pandas Dataframe and select only values with right word_type
    cols = [c["name"] for c in msg.attributes['table']['columns']]
    df = pd.DataFrame(msg.body, columns=cols)

    # word type
    types = tfp.read_list(api.config.types)
    if not types:
        logger.warning('Word types had to be defined. Default word type : \'PROPN\'')
        types = ['PROPN']

    # Language filter
    languages = tfp.read_list(api.config.languages)
    if not languages:
        logger.warning('Languages had to be defined. Default languages : EN, FR, ES, DE')
        languages = ['EN', 'FR', 'ES', 'DE']

    for lang in lexicon:
        for w in lexicon[lang]:
            df.loc[(df['TYPE'].isin(types)) & (df['LANGUAGE'] == lang) & (df['WORD'] == w)] = lexicon[lang][w]
        for w in lexicon_stem[lang]:
            df.loc[(df['TYPE'].isin(types)) & (df['LANGUAGE'] == lang) & (df['WORD'] == w)] = lexicon_stem[lang][w]

    api.send(outports[1]['name'], api.Message(attributes=att_dict, body=df.values.tolist()))
    api.send(outports[0]['name'], log_stream.getvalue())
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'drop_duplicates'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    prev_shape = df.shape
    drop_cols_test = tfp.read_list(api.config.columns, df.columns)
    keep = tfp.read_value(api.config.keep, test_number=False)
    df.drop_duplicates(subset=drop_cols_test, keep=keep, inplace=True)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(att_dict['storage.fileIndex'] + 1,
                                                              att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {} - {} '.format(progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'dropColumns'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation
    drop_cols = tfp.read_list(api.config.drop_columns, df.columns)
    if drop_cols:
        logger.debug("Drops columns: {}".format(str(drop_cols)))
        df = df.drop(columns=drop_cols)

    map_names = tfp.read_dict(api.config.rename_columns)
    if map_names:
        df.rename(columns=map_names, inplace=True)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'], att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {} - {} '.format(progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
def process(msg):
    logger, log_stream = slog.set_logging('DEBUG')
    time_monitor = tp.progress()

    result = ''
    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    for i in range(0, msg.body):
        result += str(i) + ':' + api.config.var1 + ' - ' + api.config.var2 + ' '

    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())

    return api.Message(attributes={'name': 'concat', 'type': 'str'}, body=result), log_stream.getvalue()
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()
    att_dict['operator'] = 'drop_1valuecolumns'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    prev_shape = df.shape

    # Columns with 1 unique value
    columns = tfp.read_list(api.config.columns, df.columns)
    col1val_data = {'column': [], 'type': [], 'unique_vals': [], 'action': []}
    for col in columns:
        vals = df[col].unique()
        if len(vals) == 1:
            col1val_data['column'].append(col)
            col1val_data['type'].append(str(df[col].dtype))
            col1val_data['unique_vals'].append(vals)
            col1val_data['action'].append('drop')
            if not api.config.info_only:
                df.drop(columns=[col], inplace=True)

    logger.info('End of Process: {}'.format(time_monitor.elapsed_time()))
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))
    logger.info('Dropped columns: {}'.format(prev_shape[1] - df.shape[1]))

    return log_stream.getvalue(), \
           api.Message(attributes={'name': 'drop_duplicates', 'type': 'DataFrame'}, body=df), \
           api.Message(attributes={'name': 'transformation', 'type': 'DataFrame'}, body=pd.DataFrame(col1val_data))
def process(msg):
    logger, log_stream = slog.set_logging('metadata_articles', loglevel=api.config.debug_mode)
    logger.info("Process started")
    time_monitor = tp.progress()

    adict = msg.body
    att_dict = msg.attributes

    metadata_articles = list()
    articles_table = list()
    for index_article, article in enumerate(adict):
        metadata = {'media': article['media'], 'date': article['date'], 'language': language[article['media']],
                    'hash_text': article['hash_text'], 'url': article['url'][:255], 'rubrics': article['rubrics'],
                    'title': article['title'][:255]}
        metadata['num_characters'] = len(article['text'])
        metadata_articles.append(metadata)
        datea = datetime.strptime(article['date'], '%Y-%m-%d').replace(tzinfo=timezone.utc)
        articles_table.append([article['media'], datea, language[article['media']], article['hash_text'],
                               article['url'][:255], article['rubrics'], article['title'][:255]])

    table_att = {"columns": [
        {"class": "string", "name": "MEDIA", "nullable": True, "size": 80, "type": {"hana": "NVARCHAR"}},
        {"class": "string", "name": "DATE", "nullable": False, "type": {"hana": "DATETIME"}},
        {"class": "string", "name": "LANGUAGE", "nullable": True, "size": 2, "type": {"hana": "NVARCHAR"}},
        {"class": "string", "name": "HASH_TEXT", "nullable": True, "type": {"hana": "INTEGER"}},
        {"class": "string", "name": "URL", "nullable": True, "size": 255, "type": {"hana": "NVARCHAR"}},
        {"class": "string", "name": "RUBRICS", "nullable": True, "size": 80, "type": {"hana": "NVARCHAR"}},
        {"class": "string", "name": "TITLE", "nullable": True, "size": 255, "type": {"hana": "NVARCHAR"}}],
        "name": "DIPROJECTS.ARTICLES_METADATA2", "version": 1}

    logger.debug('Process ended, articles processed {} - {} '.format(len(adict), time_monitor.elapsed_time()))

    att_dict['content'] = 'metadata for articles'

    # JSON
    msg = api.Message(attributes=att_dict, body=metadata_articles)
    api.send(outports[2]['name'], msg)

    # TABLE
    att_dict['table'] = table_att
    msg = api.Message(attributes=att_dict, body=articles_table)
    api.send(outports[1]['name'], msg)

    api.send(outports[0]['name'], log_stream.getvalue())
def process(db_msg):
    logger, log_stream = slog.set_logging('topic_index', loglevel=api.config.debug_mode)
    logger.info("Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    att_dict = dict()
    topic = db_msg.attributes['topic']
    keywords = db_msg.attributes['keywords']
    columns = [c['name'] for c in db_msg.attributes['table']['columns']]
    columns = ['KEYWORD' if c == 'WORD' else c for c in columns]
    att_dict['topic'] = topic
    att_dict['tolerance'] = api.config.tolerance

    if api.config.tolerance < 1.0:
        min_keyword_num = len(keywords) - int(api.config.tolerance * len(keywords))
    else:
        min_keyword_num = len(keywords) - api.config.tolerance

    df = pd.DataFrame(db_msg.body, columns=columns)
    logger.debug('Input DataFrame: {} - {}'.format(df.shape[0], df.shape[1]))

    # filter out all indices of not containing keyword - not needed when getting the sql output directly
    # df = df.loc[df['KEYWORD'].isin(keywords)]
    num_keyword_articles = df.shape[0]

    g_df = df.groupby(['HASH_TEXT']).count().reset_index()
    g_df = g_df.loc[g_df['KEYWORD'] >= min_keyword_num]
    g_df['count'] = min_keyword_num
    g_df['topic'] = topic
    g_df.rename(columns={'KEYWORD': 'count', 'count': 'min_num'}, inplace=True)
    g_df = g_df[['HASH_TEXT', 'topic', 'count', 'min_num']]
    # print(g_df)

    topic_index = g_df.to_dict('records')
    topic_index_msg = api.Message(attributes=att_dict, body=topic_index)

    logger.info('Topic found in articles: \"{}\" in {}/{} ({}/{})'.format(topic, g_df.shape[0], num_keyword_articles,
                                                                          min_keyword_num, len(keywords)))
    logger.debug('Process ended, {}'.format(time_monitor.elapsed_time()))

    api.send(outports[1]['name'], topic_index_msg)
    api.send(outports[0]['name'], log_stream.getvalue())