Example #1
def process():

    operator_name = 'sql_word_index'
    logger, log_stream = slog.set_logging(operator_name, api.config.debug_mode)
    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    language = tfp.read_value(api.config.language)
    type_limit = tfp.read_dict(api.config.type_limit_map)
    table_name = tfp.read_value(api.config.table_name)
    text_id_col = tfp.read_value(api.config.text_id_col)

    for i, [wtype, limit] in enumerate(type_limit.items()):
        sql_s = "SELECT {tid}, \"{tn}\".LANGUAGE, \"{tn}\".TYPE, \"{tn}\".WORD, COUNT FROM \"{tn}\" INNER JOIN"\
                "(SELECT WORD, TYPE, LANGUAGE, SUM(COUNT) as CUMS FROM \"{tn}\" "\
                "WHERE LANGUAGE = \'{lang}\' AND TYPE = \'{wt}\' "\
                "GROUP BY WORD, TYPE, LANGUAGE) AS CTABLE ON "\
                "\"{tn}\".WORD = CTABLE.WORD AND \"{tn}\".TYPE = CTABLE.TYPE AND \"{tn}\".LANGUAGE = CTABLE.LANGUAGE "\
                "WHERE CUMS >= {lt}".format(tid = text_id_col,tn = table_name,lang=language,wt = wtype,lt=limit)

        lastbatch = (i + 1 == len(type_limit))
        att_dict = {'operator': operator_name,
                    'parameter': {'type': wtype, 'limit': limit, 'language': language},
                    'message.batchIndex': i, 'message.batchSize': len(type_limit),
                    'message.lastBatch': lastbatch}
        msg = api.Message(attributes=att_dict, body=sql_s)
        api.send(outports[1]['name'], msg)

    api.send(outports[0]['name'], log_stream.getvalue())
Example #2
def process(msg) :

    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'setValue'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    df = msg.body
    if not isinstance(df,pd.DataFrame) :
        raise TypeError('Message body does not contain a pandas DataFrame')


    ###### start of doing calculation

    # map_values : column1: {from_value: to_value}, column2: {from_value: to_value}
    att_dict['config']['set_value'] = api.config.map_values
    maps_map = tfp.read_dict_of_dict(api.config.map_values)
    df.replace(maps_map,inplace=True)

    # Fill NaN value : column1: value, column2: value,
    att_dict['config']['fill_nan_values'] = api.config.fill_nan_values
    map_dict = tfp.read_dict(api.config.fill_nan_values)
    if map_dict :
        df.fillna(map_dict,inplace=True)

    ##############################################
    #  final infos to attributes and info message
    ##############################################

    if df.empty :
        raise ValueError('DataFrame is empty')

    logger.info('End of Process: {}'.format(time_monitor.elapsed_time()))

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))
    example_rows = min(EXAMPLE_ROWS, df.shape[0])
    for i in range(example_rows):
        att_dict['row_' + str(i)] = str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])
        logger.debug('Head data: {}'.format(att_dict['row_' + str(i)]))

    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict,body=df)
    return log, msg
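
A minimal sketch of the two pandas calls this operator relies on, assuming the config strings parse into the dicts shown here (column names and values are made up):

import pandas as pd

df = pd.DataFrame({'color': ['bl', 'rd', None], 'size': [1, None, 3]})

# map_values parses to {column: {from_value: to_value}}
df.replace({'color': {'bl': 'blue', 'rd': 'red'}}, inplace=True)

# fill_nan_values parses to {column: fill_value}
df.fillna({'color': 'unknown', 'size': 0}, inplace=True)

print(df)  # color: ['blue', 'red', 'unknown'], size: [1.0, 0.0, 3.0]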
Example #3
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'groupby'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode:
        logger.setLevel('DEBUG')

    prev_att = msg.attributes
    df = msg.body

    ###### start of doing calculation

    # groupby list
    cols = tfp.read_list(api.config.groupby)
    att_dict['config']['groupby'] = api.config.groupby

    # mapping aggregation
    try:
        colagg = tfp.read_dict(api.config.aggregation)
    except IndexError:
        logger.info('Aggregation is not a map, try to parse a value instead')
        colagg = tfp.read_value(api.config.aggregation)
    att_dict['config']['aggregation'] = api.config.aggregation

    # groupby
    logger.debug('Group columns: {}'.format(cols))
    logger.debug('Aggregation: {}'.format(colagg))
    logger.debug('Index: {}'.format(api.config.index))
    df = df.groupby(cols, as_index=api.config.index).agg(colagg)

    # drop col
    att_dict['config']['dropcols'] = api.config.drop_columns
    dropcols = tfp.read_list(api.config.drop_columns)
    if dropcols:
        df.drop(columns=dropcols, inplace=True)

    ##############################################
    #  final infos to attributes and info message
    ##############################################
    att_dict['operator'] = 'groupbyDataFrame'
    att_dict['name'] = prev_att['name']
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = list(df.columns)
    att_dict['number_columns'] = df.shape[1]
    att_dict['number_rows'] = df.shape[0]

    example_rows = min(EXAMPLE_ROWS, att_dict['number_rows'])
    for i in range(example_rows):
        att_dict['row_' + str(i)] = str(
            [str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])
    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
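
The core of this operator is a pandas groupby with a per-column aggregation map; a small sketch with hypothetical data (as_index=False keeps the group keys as regular columns):

import pandas as pd

df = pd.DataFrame({'city': ['A', 'A', 'B'],
                   'sales': [10, 20, 5],
                   'items': [1, 2, 1]})

out = df.groupby(['city'], as_index=False).agg({'sales': 'sum', 'items': 'max'})
print(out)  # A: sales 30, items 2 / B: sales 5, items 1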
Example #4
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'setValue'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation

    # map_values : column1: {from_value: to_value}, column2: {from_value: to_value}
    maps_map = tfp.read_dict_of_dict(api.config.map_values)
    df.replace(maps_map, inplace=True)
    logger.info('Replace values: {}'.format(maps_map))

    # Fill NaN value : column1: value, column2: value,
    map_dict = tfp.read_dict(api.config.fill_nan_values)
    if map_dict:
        df.fillna(map_dict, inplace=True)
        logger.info('Fill nan values: {}'.format(map_dict))

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(
        df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = min(EXAMPLE_ROWS, df.shape[0])
    for i in range(example_rows):
        logger.debug('Row {}: {}'.format(
            i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(
                att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(
                att_dict['storage.fileIndex'] + 1,
                att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {}  - {}  '.format(
        progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
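
The batch marker above in isolation: storage.fileIndex is 0-based, so fileIndex + 1 == fileCount flags the last file of a sequence. A sketch with made-up attribute values:

att = {'storage.fileIndex': 1, 'storage.fileCount': 3, 'storage.endOfSequence': False}
if att['storage.fileIndex'] + 1 == att['storage.fileCount']:
    progress_str = '<BATCH ENDED><{}>'.format(att['storage.fileCount'])
else:
    progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(att['storage.fileIndex'] + 1,
                                                      att['storage.fileCount'])
print(progress_str)  # <BATCH IN-PROCESS><2/3>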
Example #5
def process(msg) :
    att_dict = msg.attributes
    att_dict['operator'] = 'groupby'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    prev_att = msg.attributes
    df = msg.body
    prev_shape = df.shape

    ###### start of doing calculation

    # groupby list
    cols = tfp.read_list(api.config.groupby)

    # mapping aggregation
    try :
        colagg = tfp.read_dict(api.config.aggregation)
    except IndexError :
        logger.info('Aggregation is not a map, try to parse a value instead')
        colagg = tfp.read_value(api.config.aggregation)

    # groupby
    logger.info('Group columns: {}'.format(cols))
    logger.info('Aggregation: {}'.format(colagg))
    logger.info('Index: {}'.format(api.config.index))
    df = df.groupby(cols, as_index=api.config.index).agg(colagg)

    # drop col
    dropcols = tfp.read_list(api.config.drop_columns)
    if dropcols :
        logger.info('Drop columns: {}'.format(dropcols))
        df.drop(columns=dropcols,inplace=True)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = min(EXAMPLE_ROWS, df.shape[0])
    for i in range(example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'], att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {}  - {}  '.format(progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
Example #6
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'toCSV'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    df = msg.body
    if api.config.reset_index:
        logger.debug('Reset Index')
        df = df.reset_index()

    kwargs = tfp.read_dict(text=api.config.keyword_args, map_sep='=')

    if kwargs is not None:
        data_str = df.to_csv(sep=api.config.separator,
                             index=api.config.write_index,
                             **kwargs)
    else:
        data_str = df.to_csv(sep=api.config.separator,
                             index=api.config.write_index)
    # end custom process definition
    logger.info('End of Process: {}'.format(time_monitor.elapsed_time()))

    # create dict of columns and types for HANA
    map_hana = {'int8': 'TINYINT', 'int16': 'SMALLINT', 'int32': 'INTEGER',
                'int64': 'BIGINT', 'float32': 'FLOAT', 'float64': 'DOUBLE',
                'object': 'VARCHAR', 'datetime64': 'TIMESTAMP'}
    col_dict = {c: str(df[c].dtype) for c in df.columns}
    hana_table_dict = list()
    for c, ty in col_dict.items():
        if ty == 'object':
            size = df[c].str.len().max()
            hana_table_dict.append({
                'name': c,
                'type': map_hana[col_dict[c]],
                'size': size
            })
        elif 'datetime64' in ty:
            hana_table_dict.append({'name': c, 'type': 'TIMESTAMP'})
        else:
            hana_table_dict.append({'name': c, 'type': map_hana[col_dict[c]]})
    logger.info('For Hana table definition: {}'.format(hana_table_dict))

    log = log_stream.getvalue()
    return log, data_str
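
The dtype-to-HANA translation above, extracted into a standalone helper for clarity (a sketch, not part of the original operator; VARCHAR sizes come from the longest string per column, as in the loop above):

import pandas as pd

MAP_HANA = {'int8': 'TINYINT', 'int16': 'SMALLINT', 'int32': 'INTEGER',
            'int64': 'BIGINT', 'float32': 'FLOAT', 'float64': 'DOUBLE',
            'object': 'VARCHAR', 'datetime64': 'TIMESTAMP'}

def hana_columns(df):
    cols = []
    for c in df.columns:
        ty = str(df[c].dtype)
        if ty == 'object':
            cols.append({'name': c, 'type': 'VARCHAR', 'size': int(df[c].str.len().max())})
        elif 'datetime64' in ty:
            cols.append({'name': c, 'type': 'TIMESTAMP'})
        else:
            cols.append({'name': c, 'type': MAP_HANA[ty]})
    return cols

df = pd.DataFrame({'id': [1, 2], 'name': ['ab', 'cde']})
print(hana_columns(df))  # [{'name': 'id', 'type': 'BIGINT'}, {'name': 'name', 'type': 'VARCHAR', 'size': 3}]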
Example #7
def process(msg):

    att_dict = msg.attributes
    att_dict['operator'] = 'castColumns'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    df = msg.body

    castmap = tfp.read_dict(api.config.cast)

    if castmap:
        for col, casttype in castmap.items():
            if api.config.round:
                df[col] = df[col].round()
            df[col] = df[col].astype(casttype)

    ###### end calculation

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(
        df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = min(EXAMPLE_ROWS, df.shape[0])
    for i in range(example_rows):
        logger.debug('Row {}: {}'.format(
            i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(
                att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(
                att_dict['storage.fileIndex'] + 1,
                att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {}  - {}  '.format(
        progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
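
A note on the round-before-cast above: astype on floats truncates toward zero, so rounding first gives conventional rounding (numpy rounds halves to even). A tiny demonstration with made-up values:

import pandas as pd

df = pd.DataFrame({'price': [1.5, 2.5, 3.7]})
print(df['price'].astype('int64').tolist())          # [1, 2, 3]  (truncated)
print(df['price'].round().astype('int64').tolist())  # [2, 2, 4]  (half-to-even on .5)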
Example #8
def process(msg) :

    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'dropColumns'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation
    att_dict['config']['drop_columns'] = api.config.drop_columns
    drop_cols = tfp.read_list(api.config.drop_columns, df.columns)
    if drop_cols:
        logger.debug("Drops columns: {}".format(str(drop_cols)))
        df = df.drop(columns=drop_cols)

    att_dict['config']['rename_columns'] = api.config.rename_columns
    map_names = tfp.read_dict(api.config.rename_columns)
    if map_names:
        df.rename(columns=map_names, inplace=True)
    ###### end of doing calculation
    logger.info('End of Process: {}'.format(time_monitor.elapsed_time()))

    # df from body
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))
    example_rows = min(EXAMPLE_ROWS, df.shape[0])
    for i in range(example_rows):
        att_dict['row_' + str(i)] = str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])
        logger.debug('Head data: {}'.format(att_dict['row_' + str(i)]))
    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict,body=df)
    return log, msg
Example #9
def process(msg):

    logger, log_stream = slog.set_logging('DEBUG')

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    att_dict = dict()
    att_dict['config'] = dict()

    ###### start of doing calculation

    # map_values : column1: {from_value: to_value}, column2: {from_value: to_value}
    att_dict['config']['set_value'] = api.config.map_values
    maps_map = tfp.read_dict_of_dict(api.config.map_values)
    df.replace(maps_map, inplace=True)

    # Fill NaN value : column1: value, column2: value,
    att_dict['config']['fill_nan_values'] = api.config.fill_nan_values
    map_dict = tfp.read_dict(api.config.fill_nan_values)
    if map_dict:
        df.fillna(map_dict, inplace=True)

    ##############################################
    #  final infos to attributes and info message
    ##############################################

    if df.empty:
        raise ValueError('DataFrame is empty')

    att_dict['operator'] = 'setValue'
    att_dict['name'] = prev_att['name']
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['number_columns'] = df.shape[1]
    att_dict['number_rows'] = df.shape[0]

    example_rows = min(EXAMPLE_ROWS, att_dict['number_rows'])
    for i in range(example_rows):
        att_dict['row_' + str(i)] = str(
            [str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])

    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
Example #10
def process(msg):

    logger, log_stream = slog.set_logging('DEBUG')

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    att_dict = dict()
    att_dict['config'] = dict()

    ###### start of doing calculation
    att_dict['config']['drop_columns'] = api.config.drop_columns
    drop_cols = tfp.read_list(api.config.drop_columns, df.columns)
    if drop_cols:
        logger.debug("Drops columns: {}".format(str(drop_cols)))
        df = df.drop(columns=drop_cols)

    att_dict['config']['rename_columns'] = api.config.rename_columns
    map_names = tfp.read_dict(api.config.rename_columns)
    if map_names:
        df.rename(columns=map_names, inplace=True)
    ###### end of doing calculation

    ##############################################
    #  final infos to attributes and info message
    ##############################################

    # df from body
    att_dict['operator'] = 'dropColumns'  # name of operator
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['name'] = prev_att['name']
    att_dict['columns'] = list(df.columns)
    att_dict['number_columns'] = df.shape[1]
    att_dict['number_rows'] = df.shape[0]

    example_rows = min(EXAMPLE_ROWS, att_dict['number_rows'])
    for i in range(example_rows):
        att_dict['row_' + str(i)] = str(
            [str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])

    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
Example #11
def process(msg):

    logger, log_stream = slog.set_logging('DEBUG')

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body

    att_dict = dict()
    att_dict['config'] = dict()

    ###### start of doing calculation

    # groupby list
    cols = tfp.read_list(api.config.groupby)
    att_dict['config']['groupby'] = api.config.groupby

    # mapping
    colagg = tfp.read_dict(api.config.aggregation)
    att_dict['config']['aggregation'] = api.config.aggregation

    # groupby
    df = df.groupby(cols, as_index=api.config.index).agg(colagg)

    # drop col
    att_dict['config']['dropcols'] = api.config.drop_columns
    dropcols = tfp.read_list(api.config.drop_columns)
    if dropcols:
        df.drop(columns=dropcols, inplace=True)

    ##############################################
    #  final infos to attributes and info message
    ##############################################
    att_dict['operator'] = 'groupbyDataFrame'
    att_dict['name'] = prev_att['name']
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = list(df.columns)
    att_dict['number_columns'] = df.shape[1]
    att_dict['number_rows'] = df.shape[0]

    example_rows = min(EXAMPLE_ROWS, att_dict['number_rows'])
    for i in range(example_rows):
        att_dict['row_' + str(i)] = str(
            [str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])
    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
Example #12
def process(msg):

    logger, log_stream = slog.set_logging('DEBUG')

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body

    att_dict = dict()
    att_dict['config'] = dict()

    castmap = tfp.read_dict(api.config.cast)

    if castmap:
        for col, casttype in castmap.items():
            if api.config.round:
                df[col] = df[col].round()
            df[col] = df[col].astype(casttype)

    ###### end calculation

    ##############################################
    #  final infos to attributes and info message
    ##############################################
    att_dict['operator'] = 'castDataFrame'
    att_dict['name'] = prev_att['name']
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['number_columns'] = df.shape[1]
    att_dict['number_rows'] = df.shape[0]
    if 'id' in prev_att.keys():
        att_dict['id'] = prev_att['id'] + '; ' + att_dict['operator'] + ': ' + str(id(df))
    else:
        att_dict['id'] = att_dict['operator'] + ': ' + str(id(df))

    example_rows = min(EXAMPLE_ROWS, att_dict['number_rows'])
    for i in range(example_rows):
        att_dict['row_' + str(i)] = str(
            [str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])

    msg = api.Message(attributes=att_dict, body=df)
    # end custom process definition
    log = log_stream.getvalue()
    return log, msg
Example #13
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'castColumns'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode:
        logger.setLevel('DEBUG')

    logger.debug("Process started")

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body

    castmap = tfp.read_dict(api.config.cast)

    if castmap:
        for col, casttype in castmap.items():
            if api.config.round:
                df[col] = df[col].round()
            df[col] = df[col].astype(casttype)

    ###### end calculation

    ##############################################
    #  final infos to attributes and info message
    ##############################################
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))
    example_rows = min(EXAMPLE_ROWS, df.shape[0])
    for i in range(example_rows):
        att_dict['row_' + str(i)] = str(
            [str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])
        logger.debug('Head data: {}'.format(att_dict['row_' + str(i)]))

    msg = api.Message(attributes=att_dict, body=df)
    # end custom process definition
    log = log_stream.getvalue()
    return log, msg
Example #14
def process(msg) :

    att_dict = msg.attributes
    att_dict['operator'] = 'dropColumns'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation
    drop_cols = tfp.read_list(api.config.drop_columns, df.columns)
    if drop_cols:
        logger.debug("Drops columns: {}".format(str(drop_cols)))
        df = df.drop(columns=drop_cols)

    map_names = tfp.read_dict(api.config.rename_columns)
    if map_names:
        df.rename(columns=map_names, inplace=True)

    # end custom process definition
    if df.empty :
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0],df.shape[1]))
    logger.debug('Memory: {} MB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = min(EXAMPLE_ROWS, df.shape[0])
    for i in range(example_rows):
        logger.debug('Row {}: {}'.format(i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict :
        if att_dict['storage.fileIndex'] + 1 != att_dict['storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'],att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {}  - {}  '.format(progress_str,time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict,body=df)
Example #15
def process(msg_coef, msg_data):

    logger, log_stream = slog.set_logging('DEBUG')

    # start custom process definition
    prev_att = msg_data.attributes
    df = msg_data.body
    coef_df = msg_coef.body
    if not isinstance(df, pd.DataFrame):
        logger.error('Message body does not contain a pandas DataFrame')
        raise TypeError('Message body does not contain a pandas DataFrame')

    att_dict = dict()
    att_dict['config'] = dict()

    ###### start of doing calculation
    # segment columns
    segment_cols = None
    if 'segmentation_columns' in msg_coef.attributes:
        segment_cols = msg_coef.attributes['segmentation_columns']

    # regression columns
    regression_cols = msg_coef.attributes['regression_columns']

    # prediction column
    prediction_col = msg_coef.attributes['prediction_column']

    # set the regression column values (if not already provided in the data message)
    att_dict['config']['regresssion_cols_value'] = api.config.regresssion_cols_value
    valmap = tfp.read_dict(api.config.regresssion_cols_value)
    if valmap:
        for col, val in valmap.items():
            if np.issubdtype(df[col].dtype, np.integer):
                val = int(val)
            elif np.issubdtype(df[col].dtype, np.floating):
                val = float(val)
            else:
                raise ValueError('Regression value needs to be numeric')
            df[col] = val

    # merge data and coef df
    if segment_cols:
        df = pd.merge(df,
                      coef_df,
                      how='inner',
                      left_on=segment_cols,
                      right_on=segment_cols)

    prefix = tfp.read_value(api.config.prediction_prefix)
    if prefix is None:
        prefix = ''
    pcol = prefix + prediction_col

    if segment_cols:

        def predict(x):
            x[pcol] = np.dot(x['coef'],
                             x[regression_cols].values) + x['intercept']
            return x

        df = df.apply(predict, axis=1, result_type=None)
        df.drop(columns=['coef', 'intercept'], inplace=True)
    else:

        def predict(x):
            x[pcol] = np.dot(coef_df['coef'],
                             x[regression_cols].values) + coef_df['intercept']
            return x

        df = df.apply(predict, axis=1, result_type=None)

    # cast type of prediction col from prediction variable
    if np.issubdtype(df[prediction_col].dtype, np.integer):
        logger.debug('Cast prediction column to <int>')
        df[pcol] = df[pcol].round().astype(df[prediction_col].dtype)

    if api.config.prediction_col_only:
        logger.debug('Output only prediction columns')
        if segment_cols:
            df[prediction_col] = df[pcol]
            df = df[segment_cols + [prediction_col]]
        else:
            df = df[prediction_col]
    att_dict['config']['prediction_col_only'] = api.config.prediction_col_only

    ###### end of doing calculation

    ##############################################
    #  final infos to attributes and info message
    ##############################################

    if df.empty:
        raise ValueError('DataFrame is empty')

    att_dict['operator'] = 'regressionTrainingDataFrame'
    att_dict['name'] = prev_att['name']

    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
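
The per-row predict function above is a dot product plus intercept; for the non-segmented case the same result can be computed vectorized. A sketch with hypothetical columns and coefficients:

import numpy as np
import pandas as pd

df = pd.DataFrame({'x1': [1.0, 2.0], 'x2': [3.0, 4.0]})
coef = np.array([0.5, 2.0])   # stands in for coef_df['coef']
intercept = 1.0               # stands in for coef_df['intercept']

df['pred'] = df[['x1', 'x2']].values @ coef + intercept
print(df['pred'].tolist())  # [7.5, 10.0]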
Example #16
def process(test_msg, base_msg):
    att_dict = base_msg.attributes
    att_dict['operator'] = 'fuzzyjoin'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    testdf_index = tfp.read_value(api.config.test_index)
    if not testdf_index:
        logger.error('Index of test data is mandatory')
        raise ValueError('Index of test data is mandatory')

    # get the columns to check
    mapping = tfp.read_dict(api.config.check_columns)
    df = pd.DataFrame()

    if mapping:
        # read stream from memory
        test_df = test_msg.body

        # test if all mapping cols in testdf
        checkcols = [
            elem in list(test_df.columns) for elem in list(mapping.keys())
        ]
        if not all(checkcols):
            error_txt = 'Elements in mapping are not contained in columns of test df : ' + \
                        str(list(mapping.keys())) + '-' + str(list(test_df.columns)) + ' - ' + str(checkcols)
            logger.error(error_txt)
            raise ValueError(error_txt)

        if testdf_index not in test_df.columns:
            logger.error('Test index needs to be a column')
            raise ValueError('Test index needs to be a column')

        tcols = ['t_' + c for c in list(mapping.keys())]
        tdf = pd.DataFrame(columns=tcols)

        df = base_msg.body
        df = pd.concat([df, tdf], axis=1)

        num_cols = len(mapping)
        # run over all left df rows to test in right_df
        for index, test_row in test_df.iterrows():
            # apply function
            def get_ratio(row):
                sc = 0
                for tcol, bcol in mapping.items():
                    sc = sc + fuzz.token_sort_ratio(test_row[tcol], row[bcol])
                return sc / num_cols

            df['tscore'] = df.apply(get_ratio, axis=1)
            # get best matching and store index in v_dict
            max_score = df['tscore'].max()
            if max_score >= api.config.limit:
                mask = (df['tscore'] == max_score)
                df.loc[mask, 'score'] = max_score
                df.loc[mask, 'external_id'] = test_row[testdf_index]
                for coli in mapping:
                    df.loc[mask, 't_' + coli] = test_row[coli]

            df.drop(columns=['tscore'], inplace=True)

        # remove external_id when test column value has none

        t_cols = ['t_' + t for t in mapping.keys()] + ['external_id', 'score']
        for bcol in mapping.values():
            mask = df[bcol].isna()
            df.loc[mask, t_cols] = np.nan

        if api.config.only_index:
            df = df[list(base_msg.body.columns) + ['external_id']]

        if api.config.only_matching_rows:
            df = df.loc[~df['score'].isna()]

        basedf_index = tfp.read_value(api.config.base_index)

        if api.config.joint_id:
            if not basedf_index:
                raise ValueError(
                    "For <joint_id> a value for <base_index> is necessary ")
            df.loc[~df['external_id'].isna(), 'joint_id'] = df.loc[~df['external_id'].isna(), 'external_id']
            df.loc[df['external_id'].isna(), 'joint_id'] = df.loc[df['external_id'].isna(), basedf_index]

        if api.config.add_non_matching:
            # test if same columns
            if not all(
                [elem in test_df.columns for elem in base_msg.body.columns]):
                raise ValueError("Adding test dataframe only possible when having same columns " + str(test_df.columns) \
                                 + ' vs. ' + str(base_msg.body.columns))
            matched_ids = df['external_id'].unique()
            addto_df = test_df.loc[~test_df[testdf_index].isin(matched_ids)].copy()
            addto_df['joint_id'] = addto_df[testdf_index]
            df = pd.concat([df, addto_df], axis=0, sort=False)
    else:
        logger.warning('No columns to check')

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(
        df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = min(EXAMPLE_ROWS, df.shape[0])
    for i in range(example_rows):
        logger.debug('Row {}: {}'.format(
            i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 != att_dict['storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'],
                                          att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {}  - {}  '.format(
        progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
Example #17
def process(test_msg, base_msg) :

    logger, log_stream = slog.set_logging('DEBUG')

    # start custom process definition
    test_att = test_msg.attributes
    base_att = base_msg.attributes

    att_dict = dict()

    if test_att['name'] == base_att['name']:
        att_dict['name'] = test_att['name']
    else:
        att_dict['name'] = test_att['name'] + '-' + base_att['name']
    att_dict['config'] = dict()

    att_dict['config']['test_index'] = api.config.test_index
    testdf_index = tfp.read_value(api.config.test_index)
    if not testdf_index:
        logger.error('Index of test data is mandatory')
        raise ValueError('Index of test data is mandatory')

    att_dict['number_rows'] = str(base_msg.body.shape[0])

    # get the columns to check

    mapping = tfp.read_dict(api.config.check_columns)
    df = pd.DataFrame()

    if mapping:

        att_dict['config']['check_columns'] = str(mapping)
        att_dict['config']['limit'] = api.config.limit

        # read stream from memory
        test_df = test_msg.body

        # test if all mapping cols in testdf
        checkcols = [elem in list(test_df.columns) for elem in list(mapping.keys())]
        if not all(checkcols):
            error_txt = 'Elements in mapping are not contained in columns of test df : ' + \
                        str(list(mapping.keys())) + '-' + str(list(test_df.columns)) + ' - ' + str(checkcols)
            logger.error(error_txt)
            raise ValueError(error_txt)

        if testdf_index not in test_df.columns:
            logger.error('Test index needs to be a column')
            raise ValueError('Test index needs to be a column')

        tcols = ['t_' + c for c in list(mapping.keys())]
        tdf = pd.DataFrame(columns=tcols)

        df = base_msg.body
        df = pd.concat([df, tdf], axis=1)

        num_cols = len(mapping)
        # run over all left df rows to test in right_df
        for index, test_row in test_df.iterrows():
            # apply function
            def get_ratio(row):
                sc = 0
                for tcol, bcol in mapping.items():
                    sc = sc + fuzz.token_sort_ratio(test_row[tcol], row[bcol])
                return sc / num_cols

            df['tscore'] = df.apply(get_ratio, axis=1)
            # get best matching and store index in v_dict
            max_score = df['tscore'].max()
            if max_score >= api.config.limit:
                mask = (df['tscore'] == max_score)
                df.loc[mask, 'score'] = max_score
                df.loc[mask, 'external_id'] = test_row[testdf_index]
                for coli in mapping:
                    df.loc[mask, 't_' + coli] = test_row[coli]

            df.drop(columns=['tscore'], inplace=True)

        # remove external_id when test column value has none

        t_cols = ['t_' + t for t in mapping.keys()] + ['external_id', 'score']
        for bcol in mapping.values():
            mask = df[bcol].isna()
            df.loc[mask, t_cols] = np.nan

        if api.config.only_index:
            df = df[list(base_msg.body.columns) + ['external_id']]
        att_dict['config']['only_index'] = api.config.only_index

        if api.config.only_matching_rows:
            df = df.loc[~df['score'].isna()]
        att_dict['config']['only_matching_rows'] = api.config.only_matching_rows

        basedf_index = tfp.read_value(api.config.base_index)
        att_dict['config']['base_index'] = basedf_index

        if api.config.joint_id:
            if not basedf_index:
                raise ValueError("For <joint_id> a value for <base_index> is necessary ")
            df.loc[~df['external_id'].isna(), 'joint_id'] = df.loc[~df['external_id'].isna(), 'external_id']
            df.loc[df['external_id'].isna(), 'joint_id'] = df.loc[df['external_id'].isna(), basedf_index]
        att_dict['config']['joint_id'] = api.config.joint_id

        if api.config.add_non_matching:
            # test if same columns
            if not all([elem in test_df.columns for elem in base_msg.body.columns]):
                raise ValueError("Adding test dataframe only possible when having same columns " + str(test_df.columns) \
                                 + ' vs. ' + str(base_msg.body.columns))
            matched_ids = df['external_id'].unique()
            addto_df = test_df.loc[~test_df[testdf_index].isin(matched_ids)].copy()
            addto_df['joint_id'] = addto_df[testdf_index]
            df = pd.concat([df, addto_df], axis=0, sort=False)
        att_dict['config']['add_non_matching'] = api.config.add_non_matching

    else:
        logger.warning('No columns to check')

    ##############################################
    #  final infos to attributes and info message
    ##############################################
    if df.empty:
        logger.warning('DataFrame is empty')
    else :
        att_dict['operator'] = 'fuzzyjoinDataFrames'
        att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
        att_dict['columns'] = str(list(df.columns))
        att_dict['number_columns'] = df.shape[1]
        att_dict['number_rows'] = df.shape[0]
        if 'id' in base_att.keys():
            att_dict['id'] = base_att['id'] + '; ' + att_dict['operator'] + ': ' + str(id(df))
        else:
            att_dict['id'] = att_dict['operator'] + ': ' + str(id(df))

        example_rows = min(EXAMPLE_ROWS, att_dict['number_rows'])
        for i in range(example_rows):
            att_dict['row_' + str(i)] = str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])
    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict,body = df)
    return log, msg
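
The matching score at the heart of both fuzzy-join examples is the average token_sort_ratio over the mapped column pairs; 100 means identical token sets, so word order does not matter. A minimal sketch using the fuzzywuzzy package with made-up rows:

from fuzzywuzzy import fuzz

mapping = {'name': 'company_name'}          # test column -> base column
test_row = {'name': 'ACME Corp'}
base_row = {'company_name': 'Corp ACME'}

score = sum(fuzz.token_sort_ratio(test_row[t], base_row[b])
            for t, b in mapping.items()) / len(mapping)
print(score)  # 100: same tokens, different order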
Example #18
def process(msg):
    logger, log_stream = slog.set_logging('DEBUG')
    logger.debug("Process started")

    # start custom process definition
    att_dict = dict()
    att_dict['config'] = dict()

    global result_df

    # json string of attributes already converted to dict
    # att_dict['prev_attributes'] = msg.attributes
    att_dict['filename'] = msg.attributes["storage.filename"]

    logger.info('Filename: {} index: {}  count: {}  endofSeq: {}'.format(msg.attributes["storage.filename"], \
                                                                         msg.attributes["storage.fileIndex"], \
                                                                         msg.attributes["storage.fileCount"], \
                                                                         msg.attributes["storage.endOfSequence"]))

    # using file name from attributes of ReadFile
    if not api.config.df_name or api.config.df_name == "DataFrame":
        att_dict['name'] = att_dict['filename'].split(".")[0]

    if isinstance(msg.body, str):
        csv_io = io.StringIO(msg.body)
        logger.debug("Input format: <string>")
    elif isinstance(msg.body, bytes):
        csv_io = io.BytesIO(msg.body)
        logger.debug("Input format: <bytes>")
    elif isinstance(msg.body, io.BytesIO):
        logger.debug("Input format: <io.Bytes>")
        csv_io = msg.body
    else:
        raise TypeError('Message body has unsupported type' +
                        str(type(msg.body)))

    # nrows
    nrows = None
    if api.config.limit_rows != 0:
        nrows = api.config.limit_rows

    # usecols
    att_dict['config']['use_columns'] = api.config.use_columns
    use_cols = tfp.read_list(api.config.use_columns)

    # dtypes mapping
    att_dict['config']['dtypes'] = api.config.dtypes
    typemap = tfp.read_dict(api.config.dtypes)

    kwargs = tfp.read_dict(text=api.config.keyword_args, map_sep='=') or {}  # fall back to {} so **kwargs below never receives None

    ##### Read string from buffer
    logger.debug("Read from input")
    df = pd.read_csv(csv_io, sep=api.config.separator, usecols=use_cols, dtype=typemap,
                     decimal=api.config.decimal, nrows=nrows, **kwargs)

    # Data from filename
    if api.config.data_from_filename and not api.config.data_from_filename == 'None':
        col = api.config.data_from_filename.split(':')[0].strip().strip(
            "'").strip('"')
        pat = api.config.data_from_filename.split(':')[1].strip().strip(
            "'").strip('"')
        logger.debug('Filename: {}  pattern: {}'.format(
            att_dict['filename'], pat))
        try:
            dataff = re.match(r'.*(\d{4}-\d+-\d+).*', att_dict['filename'])
            df[col] = dataff.group(1)
        except AttributeError:
            raise ValueError(
                'Pattern not found - Filename: {}  pattern: {}'.format(
                    att_dict['filename'], pat))

    # To Datetime
    if api.config.todatetime and not api.config.todatetime == 'None':
        coldate = api.config.todatetime.split(':')[0].strip().strip("'").strip(
            '"')
        dformat = api.config.todatetime.split(':')[1].strip().strip("'").strip(
            '"')
        df[coldate] = pd.to_datetime(df[coldate], format=dformat)

    ###### Downcasting
    # save memory footprint for calculating the savings of the downcast
    att_dict['previous_memory'] = df.memory_usage(deep=True).sum() / 1024**2
    if api.config.downcast_int:
        df, dci = downcast(df, 'int', 'unsigned')
    if api.config.downcast_float:
        df, dcf = downcast(df, 'float', 'float')

    # check if index is provided and set
    index_list = tfp.read_list(api.config.index_cols)
    att_dict['config']['index_cols'] = str(index_list)
    att_dict['index_cols'] = str(index_list)
    if index_list:
        df.set_index(index_list, inplace=True)

    # stores the result in global variable result_df
    if msg.attributes['storage.fileIndex'] == 0:
        result_df = df
    else:
        result_df = pd.concat([result_df, df], axis=0, sort=False)

    ##############################################
    #  final infos to attributes and info message
    ##############################################
    att_dict['operator'] = 'fromCSVDataFrame'
    att_dict['memory'] = result_df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = list(result_df.columns)
    att_dict['dtypes'] = {
        col: str(ty)
        for col, ty in df.dtypes.to_dict().items()
    }
    att_dict['number_columns'] = result_df.shape[1]
    att_dict['number_rows'] = result_df.shape[0]
    att_dict['id'] = str(id(result_df))

    example_rows = min(EXAMPLE_ROWS, att_dict['number_rows'])
    for i in range(example_rows):
        att_dict['row_' + str(i)] = str(
            [str(v)[:10].ljust(10) for v in result_df.iloc[i, :].tolist()])

    # end custom process definition
    msg = api.Message(attributes=att_dict, body=result_df)
    log = log_stream.getvalue()
    return log, msg
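
The downcast helper called above is not defined in these examples; a plausible sketch using pd.to_numeric that returns the frame and the MB saved (an assumption inferred from the call sites, not the original implementation):

import pandas as pd

def downcast(df, col_kind, target):
    # col_kind: 'int' or 'float' (assumed); target: a pd.to_numeric downcast option
    before = df.memory_usage(deep=True).sum() / 1024**2
    for col in df.select_dtypes(include=[col_kind]).columns:
        df[col] = pd.to_numeric(df[col], downcast=target)
    return df, before - df.memory_usage(deep=True).sum() / 1024**2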
Example #19
def process(msg) :
    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'groupby'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'], loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    prev_att = msg.attributes
    df = msg.body
    prev_shape = df.shape

    ###### start of doing calculation

    # groupby list
    cols = tfp.read_list(api.config.groupby)
    att_dict['config']['groupby'] = api.config.groupby

    # mapping aggregation
    try :
        colagg = tfp.read_dict(api.config.aggregation)
    except IndexError :
        logger.info('Aggregation is not a map, try to parse a value instead')
        colagg = tfp.read_value(api.config.aggregation)
    att_dict['config']['aggregation'] = api.config.aggregation

    # groupby
    logger.debug('Group columns: {}'.format(cols))
    logger.debug('Aggregation: {}'.format(colagg))
    logger.debug('Index: {}'.format(api.config.index))
    df = df.groupby(cols, as_index=api.config.index).agg(colagg)

    # drop col
    att_dict['config']['dropcols'] = api.config.drop_columns
    dropcols = tfp.read_list(api.config.drop_columns)
    if dropcols :
        df.drop(columns=dropcols,inplace=True)

    ##############################################
    #  final infos to attributes and info message
    ##############################################
    logger.info('End of Process: {}'.format(time_monitor.elapsed_time()))

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0],df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))
    #logger.debug('Dropped duplicates: {}'.format(prev_shape[0] - df.shape[0]))
    logger.info('Dropped rows: {}'.format(prev_shape[0]-df.shape[0]))
    logger.info('Dropped columns: {}'.format(prev_shape[1] - df.shape[1]))
    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
Example #20
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'toCSV'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    df = msg.body
    if api.config.reset_index:
        logger.debug('Reset Index')
        df = df.reset_index()

    kwargs = tfp.read_dict(text=api.config.keyword_args, map_sep='=')

    if kwargs is not None:
        data_str = df.to_csv(sep=api.config.separator,
                             index=api.config.write_index,
                             **kwargs)
    else:
        data_str = df.to_csv(sep=api.config.separator,
                             index=api.config.write_index)
    # end custom process definition

    # create dict of columns and types for HANA
    map_hana = {'int8': 'TINYINT', 'int16': 'SMALLINT', 'int32': 'INTEGER',
                'int64': 'BIGINT', 'float32': 'FLOAT', 'float64': 'DOUBLE',
                'object': 'VARCHAR', 'datetime64': 'TIMESTAMP'}
    col_dict = {c: str(df[c].dtype) for c in df.columns}
    hana_table_dict = list()
    for c, ty in col_dict.items():
        if ty == 'object':
            size = df[c].str.len().max()
            hana_table_dict.append({
                'name': c,
                'type': map_hana[col_dict[c]],
                'size': size
            })
        elif 'datetime64' in ty:
            hana_table_dict.append({'name': c, 'type': 'TIMESTAMP'})
        else:
            hana_table_dict.append({'name': c, 'type': map_hana[col_dict[c]]})
    logger.info('For Hana table definition: {}'.format(hana_table_dict))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(
                att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(
                att_dict['storage.fileIndex'] + 1,
                att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {}  - {}  '.format(
        progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    log = log_stream.getvalue()
    return log, data_str
Example #21
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'word_regex'
    logger, log_stream = slog.set_logging(att_dict['operator'], api.config.debug_mode)
    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    # Dataframe
    df = msg.body
    logger.debug('Attributes: {}'.format(str(msg.attributes)))
    if not isinstance(df, pd.DataFrame) or df.empty:
        logger.warning('Empty dataframe, no output send!')
        api.send(outports[0]['name'], log_stream.getvalue())
        api.send(outports[2]['name'], api.Message(attributes=att_dict, body=df))
        return 0

    # in case the input is from a DB
    df.rename(columns = {'TEXT_ID':'text_id','LANGUAGE':'language','TYPE':'type','WORD':'word','COUNT':'count'},inplace=True)
    logger.debug('DataFrame columns: {}'.format(df.columns))

    df['word_m'] = np.nan
    df['word_r'] = np.nan
    df['word_orig'] = df['word']

    # word type
    word_types = tfp.read_list(api.config.word_types)
    if not word_types :
        word_types = list(df['type'].unique())
    logger.debug('Word types: {}'.format(word_types))

    # Language filter
    language_filter = tfp.read_list(api.config.language_filter)
    if not language_filter :
        language_filter = list(df['language'].unique())
    logger.debug('Language filter: {}'.format(language_filter))

    mask = df['language'].isin(language_filter) & df['type'].isin(word_types)

    # regex patterns word removal
    regex_wordr = tfp.read_list(api.config.pattern_word_removal)
    remove_words = list()
    if regex_wordr :
        for ipat, pat in enumerate(regex_wordr):
            logger.info('Execute pattern: {} ({}/{})'.format(pat, ipat + 1, len(regex_wordr)))
            api.send(outports[0]['name'], log_stream.getvalue())
            log_stream.truncate()
            log_stream.seek(0)
            df.loc[mask & df['word'].str.contains(pat = pat),'word_r'] = pat

    # regex patterns for substring replacement
    regex_ssr = tfp.read_dict(api.config.pattern_substring_replace)
    if regex_ssr :
        for ipat, pat in enumerate(regex_ssr.items()):
            logger.info('Execute replace pattern: {} ({}/{})'.format(pat, ipat + 1, len(regex_ssr)))
            api.send(outports[0]['name'], log_stream.getvalue())
            log_stream.truncate()
            log_stream.seek(0)
            df.loc[mask & df['word'].str.contains(pat=pat[0]), 'word_m'] = pat[0]
            df.loc[mask,'word'] = df.loc[mask,'word'].str.replace(pat[0],pat[1],regex = True)

    # send removed or replace words to port removed
    rm_df = df.loc[df[['word_r','word_m']].any(axis=1),['word_orig','word','word_r','word_m']].drop_duplicates()
    rm_csv = rm_df.to_csv(index=False)
    attributes_removed = att_dict.copy()
    attributes_removed['port'] = outports[1]['name']
    logger.debug('CSV send to port {} with #rows: {})'.format(attributes_removed['port'],rm_df.shape[0]))
    api.send(outports[1]['name'], api.Message(attributes=attributes_removed, body=rm_csv))

    # delete rows with a non-nan value in 'word_r' and drop the word_r, word_m columns
    df.drop(df.loc[~df['word_r'].isnull()].index,axis = 0, inplace = True)
    df.drop(columns=['word_r','word_m'],inplace = True)

    # group on text_id, language, type, word
    df = df.groupby(by=['text_id','language','type','word'])['count'].sum().reset_index()
    logger.info('Dataframe shape: {} -  {}'.format(df.shape[0],df.shape[1]))

    att_dict['port'] = outports[2]['name']
    api.send(outports[2]['name'], api.Message(attributes=att_dict, body=df))

    api.send(outports[0]['name'],log_stream.getvalue())
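
The two regex passes above in miniature, with made-up patterns: removal patterns only flag matching rows in word_r (those rows are dropped afterwards), while substring patterns rewrite the word in place:

import pandas as pd

df = pd.DataFrame({'word': ['foo123', 'bar', 'baz-qux']})
df['word_r'] = None  # object column; the original uses np.nan

df.loc[df['word'].str.contains(pat=r'\d'), 'word_r'] = r'\d'  # flag words containing digits
df['word'] = df['word'].str.replace('-', ' ', regex=True)     # substring replacement

df = df.loc[df['word_r'].isnull()].drop(columns=['word_r'])
print(df['word'].tolist())  # ['bar', 'baz qux']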
Example #22
def process(msg):

    att_dict = msg.attributes

    global result_df

    att_dict['operator'] = 'fromCSV'
    if api.config.debug_mode:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    logger.info('Filename: {} index: {}  count: {}  endofSeq: {}'.format(msg.attributes["storage.filename"], \
                                                                         msg.attributes["storage.fileIndex"], \
                                                                         msg.attributes["storage.fileCount"], \
                                                                         msg.attributes["storage.endOfSequence"]))

    if msg.body is None:
        logger.info('Process ended.')
        msg = api.Message(attributes=att_dict, body=result_df)
        log = log_stream.getvalue()
        return log, msg
    elif isinstance(msg.body, str):
        csv_io = io.StringIO(msg.body)
        logger.debug("Input format: <string>")
    elif isinstance(msg.body, bytes):
        csv_io = io.BytesIO(msg.body)
        logger.debug("Input format: <bytes>")
    elif isinstance(msg.body, io.BytesIO):
        logger.debug("Input format: <io.Bytes>")
        csv_io = msg.body
    else:
        raise TypeError('Message body has unsupported type' +
                        str(type(msg.body)))

    # nrows: 0 means no row limit
    nrows = None
    if api.config.limit_rows != 0:
        nrows = api.config.limit_rows

    # usecols
    use_cols = tfp.read_list(api.config.use_columns)
    logger.debug('Columns used: {}'.format(use_cols))

    # dtypes mapping
    typemap = tfp.read_dict(api.config.dtypes)
    logger.debug('Type cast: {}'.format(str(typemap)))

    kwargs = tfp.read_dict(text=api.config.keyword_args, map_sep='=')

    ##### Read string from buffer
    logger.debug("Read from input")
    df = pd.read_csv(csv_io, sep=api.config.separator, usecols=use_cols, dtype=typemap, decimal=api.config.decimal, \
                     nrows=nrows, **kwargs)

    # Data from filename: config format "column: regex-pattern" with one capture group
    if api.config.data_from_filename and api.config.data_from_filename != 'None':
        col = api.config.data_from_filename.split(':')[0].strip().strip("'").strip('"')
        pat = api.config.data_from_filename.split(':')[1].strip().strip("'").strip('"')
        logger.debug('Filename: {}  pattern: {}'.format(att_dict['storage.filename'], pat))
        try:
            dataff = re.match(pat, att_dict['storage.filename'])
            df[col] = dataff.group(1)
        except AttributeError:
            raise ValueError(
                'Pattern not found - Filename: {}  pattern: {}'.format(
                    att_dict['storage.filename'], pat))

    # To Datetime
    if api.config.todatetime and api.config.todatetime != 'None':
        dt_fmt = tfp.read_dict(api.config.todatetime)
        logger.debug('Time conversion {} by using UTC {}'.format(
            api.config.todatetime, api.config.utc))
        for col, fmt in dt_fmt.items():
            df[col] = pd.to_datetime(df[col], format=fmt, utc=api.config.utc)

    ###### Downcasting
    # save memory footprint for calculating the savings of the downcast
    logger.debug('Memory used before downcast: {} MB'.format(
        df.memory_usage(deep=True).sum() / 1024**2))
    if api.config.downcast_int:
        df, dci = downcast(df, 'int', 'unsigned')
    if api.config.downcast_float:
        df, dcf = downcast(df, 'float', 'float')

    # check if index is provided and set
    index_list = tfp.read_list(api.config.index_cols)
    if index_list:
        df.set_index(index_list, inplace=True)

    if api.config.collect:
        # stores the result in global variable result_df
        if msg.attributes['storage.fileIndex'] == 0:
            logger.debug('Added to DataFrame: {}'.format(
                att_dict['storage.filename']))
            result_df = df
        else:
            try:
                result_df = pd.concat([result_df, df], axis=0, sort=False)
            except Exception as e:
                logger.error(str(e))
                result_df = df
    else:
        result_df = df

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(
        df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(
            i, str([str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict[
                'storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'] + 1,
                                          att_dict['storage.fileCount'])
    logger.debug('Process ended: {}  - {}  '.format(
        progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
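The downcast() helper called for downcast_int and downcast_float is not part of this snippet. The following is a plausible sketch, inferred only from its call sites (it appears to return the DataFrame plus a measure of the memory saved); this is an assumption, not the original helper:

import pandas as pd

def downcast(df, dtype_include, downcast_to):
    """Downcast all columns of the given dtype family (e.g. 'int', 'float')
    via pd.to_numeric and return the DataFrame and the MB saved."""
    before = df.memory_usage(deep=True).sum() / 1024 ** 2
    for col in df.select_dtypes(include=[dtype_include]).columns:
        # pd.to_numeric silently keeps the dtype if downcasting is not possible
        df[col] = pd.to_numeric(df[col], downcast=downcast_to)
    after = df.memory_usage(deep=True).sum() / 1024 ** 2
    return df, before - after

# usage mirroring the call sites above:
# df, dci = downcast(df, 'int', 'unsigned')
# df, dcf = downcast(df, 'float', 'float')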
Example #23
0
def process(msg):

    global ID_set

    operator_name = 'doc_prepare'
    logger, log_stream = slog.set_logging(operator_name, api.config.debug_mode)

    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    att_dict = msg.attributes
    df = msg.body

    text_column = tfp.read_value(api.config.text_column)
    if not text_column:
        text_column = 'text'

    id_column = tfp.read_value(api.config.id_column)
    if not id_column:
        id_column = 'text_id'

    default_language = 'DE'
    if api.config.media_docs:
        language_column = 'language'
        df['language'] = default_language
        df.loc[df['media'].isin(['Lefigaro', 'Lemonde']), 'language'] = 'FR'
        df.loc[df['media'].isin(['Elpais', 'Elmundo']), 'language'] = 'ES'
    else:
        language_column = tfp.read_value(api.config.language_column)
        if not language_column:
            language_column = 'language'

        r_default_language = tfp.read_value(api.config.default_language)
        if r_default_language:
            default_language = r_default_language

        if language_column not in df.columns:
            df[language_column] = default_language
        else:
            # fill only missing language values, not whole rows
            df.loc[df[language_column].isna(), language_column] = default_language

    df.rename(columns={
        text_column: 'text',
        id_column: 'text_id',
        language_column: 'language'
    },
              inplace=True)
    logger.debug('Columns: {}'.format(df.columns))

    logger.info("Default language: {}".format(default_language))

    # if text is binary, decode it
    if isinstance(df['text'].iloc[0], bytes):
        logger.info('Text is bytes. Decoded to \'utf-8\'')
        df.text = df.text.str.decode('utf-8')

    # remove duplicates within the batch and across previously processed batches
    prev_num_rows = df.shape[0]
    df.drop_duplicates(subset=['text_id'], inplace=True)
    df = df.loc[~df['text_id'].isin(ID_set)]
    post_num_rows = df.shape[0]
    logger.debug('Docs dropped because already processed: {} -> {}'.format(
        prev_num_rows, post_num_rows))
    ID_set.update(df.text_id.values.tolist())

    # replace html tags
    if api.config.remove_html_tags:
        df['text'] = df['text'].str.replace('<.*?>', '', regex=True)

    # correct common text format errors
    #repl_pattern = list()
    #repl_pattern.append([r'([\:,\.?!\)])([A-Z])',r'\1 \2'])
    #repl_pattern.append([r'(,)([a-z])', r'\1 \2'])
    #repl_pattern.append([r'(\"\.)([A-Z])', r'\1 \2'])
    #repl_pattern.append([r'(\.)(\"[A-Z])', r'\1 \2'])

    repl_pattern = tfp.read_dict(api.config.pattern_substring_replace)
    if repl_pattern:
        logger.info('Apply regex to text: {}'.format(repl_pattern))
        for pat, repl in repl_pattern.items():
            mask = df['text'].str.contains(pat)
            df.loc[mask, 'text'] = df.loc[mask, 'text'].str.replace(pat,
                                                                    repl,
                                                                    regex=True)

    api.send(outports[0]['name'], log_stream.getvalue())
    api.send(
        outports[1]['name'],
        api.Message(attributes=att_dict,
                    body=df[['text_id', 'language', 'text']]))
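The cross-batch deduplication via the module-level ID_set can be illustrated on its own. A minimal sketch, assuming a set initialized once when the operator loads; the data and the function name are invented for illustration:

import pandas as pd

ID_set = set()   # initialized once at operator load time, survives between calls

def deduplicate(df):
    df = df.drop_duplicates(subset=['text_id'])   # dedupe within the batch
    df = df.loc[~df['text_id'].isin(ID_set)]      # dedupe across earlier batches
    ID_set.update(df['text_id'].tolist())         # remember what has been seen
    return df

batch1 = pd.DataFrame({'text_id': [1, 2, 2], 'text': ['a', 'b', 'b']})
batch2 = pd.DataFrame({'text_id': [2, 3], 'text': ['b', 'c']})
print(deduplicate(batch1)['text_id'].tolist())  # [1, 2]
print(deduplicate(batch2)['text_id'].tolist())  # [3]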