Esempio n. 1
0
def process(msg):
    """Filter the rows of the DataFrame in ``msg.body`` by the configured selections.

    Two optional selections are read from ``api.config``:

    * ``selection_num``: parsed with ``tfp.read_relations`` into items whose
      element 0 is a column name, element 1 a relation symbol
      (``≤ < ≥ > = !``) and element 2 the value to compare against.
    * ``selection_list``: parsed with ``tfp.read_dict_of_list`` into a mapping
      of column name to the list of values a row may have in that column.

    A selection that is empty or equals ``'NONE'`` (case-insensitive) is
    skipped. A named index is turned into columns for filtering and restored
    afterwards. The incoming DataFrame object is not modified.

    ``api``, ``tfp``, ``set_logging`` and ``EXAMPLE_ROWS`` are expected to be
    provided by the surrounding module.

    :param msg: message with ``attributes`` (must contain ``'name'``, may
        contain ``'id'``) and ``body`` (a pandas DataFrame).
    :return: tuple ``(log, msg)`` - the captured log text and a new
        ``api.Message`` carrying the filtered DataFrame and fresh attributes.
    :raises ValueError: on an unknown relation symbol or when no rows remain.
    """
    import operator  # local import: only needed for the relation dispatch table

    logger, log_stream = set_logging('DEBUG')

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body

    att_dict = dict()
    att_dict['config'] = dict()

    ######################### Start Calculation

    # Relation symbol -> element-wise comparison applied to a column.
    relations = {
        '≤': operator.le,
        '<': operator.lt,
        '≥': operator.ge,
        '>': operator.gt,
        '=': operator.eq,
        '!': operator.ne,
    }

    # Remember the index so that index levels can be filtered like columns.
    index_names = df.index.names
    has_named_index = bool(index_names[0])
    if has_named_index:
        logger.debug("Reset index")
        # Not in place: msg.body must stay untouched, even if a later step raises.
        df = df.reset_index()

    # numeric selection
    selection_num = api.config.selection_num
    if selection_num and selection_num.upper() != 'NONE':
        for s in tfp.read_relations(selection_num):
            try:
                compare = relations[s[1]]
            except (KeyError, TypeError):
                raise ValueError('Unknown relation: ' + str(s)) from None
            df = df.loc[compare(df[s[0]], s[2])]
    att_dict['config']['selection_num'] = selection_num

    # value-list selection
    selection_list = api.config.selection_list
    if selection_list and selection_list.upper() != 'NONE':
        value_list_dict = tfp.read_dict_of_list(selection_list)
        for column, allowed_values in value_list_dict.items():
            df = df.loc[df[column].isin(allowed_values)]
    att_dict['config']['selection_list'] = selection_list

    # restore the original index
    if has_named_index:
        att_dict['indices'] = index_names
        logger.debug('Set indices to: {}'.format(str(index_names)))
        df = df.set_index(keys=list(index_names))

    if df.empty:
        logger.error('DataFrame is empty')
        raise ValueError('DataFrame is empty')
    ######################### End Calculation

    ##############################################
    #  final infos to attributes and info message
    ##############################################
    att_dict['operator'] = 'selectDataFrame'
    att_dict['name'] = prev_att['name']
    # bytes / 1024**2, i.e. the value is in MiB
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['number_columns'] = df.shape[1]
    att_dict['number_rows'] = df.shape[0]

    # Extend the id trail of previous operators, or start a new one.
    own_id = att_dict['operator'] + ': ' + str(id(df))
    if 'id' in prev_att:
        att_dict['id'] = prev_att['id'] + '; ' + own_id
    else:
        att_dict['id'] = own_id

    # Store a preview of the first rows; each cell is cut/padded to 10 chars.
    example_rows = min(EXAMPLE_ROWS, att_dict['number_rows'])
    for row_no in range(example_rows):
        att_dict['row_' + str(row_no)] = str(
            [str(cell)[:10].ljust(10) for cell in df.iloc[row_no, :].tolist()])

    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
Esempio n. 2
0
def process(msg):
    """Filter the rows of the DataFrame in ``msg.body`` by the configured selections.

    Two optional selections are read from ``api.config``:

    * ``selection_num``: parsed with ``tfp.read_relations`` into items whose
      element 0 is a column name, element 1 a relation symbol
      (``≤ < ≥ > = !``) and element 2 the value to compare against.
    * ``selection_list``: parsed with ``tfp.read_dict_of_list`` into a mapping
      of column name to the list of values a row may have in that column.

    A selection that is empty or equals ``'NONE'`` (case-insensitive) is
    skipped. A named index is turned into columns for filtering and restored
    afterwards. The incoming DataFrame object is not modified; the incoming
    attribute dict is reused and extended (``'operator'``, ``'indices'``,
    ``'process_list'``).

    ``api``, ``tfp``, ``slog``, ``tp`` and ``EXAMPLE_ROWS`` are expected to be
    provided by the surrounding module.

    :param msg: message with ``attributes`` (dict) and ``body`` (a pandas
        DataFrame).
    :return: tuple ``(log, msg)`` - the captured log text and a new
        ``api.Message`` carrying the filtered DataFrame.
    :raises ValueError: on an unknown relation symbol or when no rows remain.
    """
    import operator  # local import: only needed for the relation dispatch table

    att_dict = msg.attributes
    att_dict['operator'] = 'selectValues'
    # `== True` is kept as in the original: only a value equal to True enables DEBUG.
    loglevel = 'DEBUG' if api.config.debug_mode == True else 'INFO'
    logger, log_stream = slog.set_logging(att_dict['operator'], loglevel=loglevel)
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition

    df = msg.body

    ######################### Start Calculation

    # Relation symbol -> element-wise comparison applied to a column.
    relations = {
        '≤': operator.le,
        '<': operator.lt,
        '≥': operator.ge,
        '>': operator.gt,
        '=': operator.eq,
        '!': operator.ne,
    }

    # Remember the index so that index levels can be filtered like columns.
    index_names = df.index.names
    has_named_index = bool(index_names[0])
    if has_named_index:
        logger.debug("Reset index")
        # Not in place: msg.body must stay untouched, even if a later step raises.
        df = df.reset_index()

    # numeric selection
    selection_num = api.config.selection_num
    if selection_num and selection_num.upper() != 'NONE':
        for s in tfp.read_relations(selection_num):
            try:
                compare = relations[s[1]]
            except (KeyError, TypeError):
                raise ValueError('Unknown relation: ' + str(s)) from None
            df = df.loc[compare(df[s[0]], s[2])]
    logger.info('Numerical Selection: {}'.format(selection_num))

    # value-list selection
    selection_list = api.config.selection_list
    if selection_list and selection_list.upper() != 'NONE':
        value_list_dict = tfp.read_dict_of_list(selection_list)
        for column, allowed_values in value_list_dict.items():
            df = df.loc[df[column].isin(allowed_values)]
    logger.info('Selection list: {}'.format(selection_list))

    # restore the original index
    if has_named_index:
        att_dict['indices'] = index_names
        logger.debug('Set indices to: {}'.format(str(index_names)))
        df = df.set_index(keys=list(index_names))

    if df.empty:
        logger.error('DataFrame is empty')
        raise ValueError('DataFrame is empty')

    # end custom process definition
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    # bytes / 1024**2 is MiB, so the value is labelled MB (the label used to say kB)
    logger.debug('Memory: {} MB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))

    # Log a preview of the first rows; each cell is cut/padded to 10 chars.
    example_rows = min(EXAMPLE_ROWS, df.shape[0])
    for row_no in range(example_rows):
        cells = [str(cell)[:10].ljust(10) for cell in df.iloc[row_no, :].tolist()]
        logger.debug('Row {}: {}'.format(row_no, str(cells)))

    # Progress marker derived from the storage.* attributes when all are present.
    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(att_dict['storage.fileIndex'] + 1,
                                                              att_dict['storage.fileCount'])
    # Start the step list if no previous operator created it, instead of
    # failing with a KeyError after all the work is done.
    att_dict.setdefault('process_list', []).append(att_dict['operator'])
    logger.debug('Process ended: {}  - {}  '.format(progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
Esempio n. 3
0
def process(msg):
    """Filter the rows of the DataFrame in ``msg.body`` by the configured selections.

    Two optional selections are read from ``api.config``:

    * ``selection_num``: parsed with ``tfp.read_relations`` into items whose
      element 0 is a column name, element 1 a relation symbol
      (``≤ < ≥ > = !``) and element 2 the value to compare against.
    * ``selection_list``: parsed with ``tfp.read_dict_of_list`` into a mapping
      of column name to the list of values a row may have in that column.

    A selection that is empty or equals ``'NONE'`` (case-insensitive) is
    skipped. A named index is turned into columns for filtering and restored
    afterwards. The incoming DataFrame object is not modified. The outgoing
    attributes are built from scratch; ``msg.attributes`` is not read.

    ``api``, ``tfp``, ``slog``, ``tp`` and ``EXAMPLE_ROWS`` are expected to be
    provided by the surrounding module.

    :param msg: message whose ``body`` is a pandas DataFrame.
    :return: tuple ``(log, msg)`` - the captured log text and a new
        ``api.Message`` carrying the filtered DataFrame and its attributes.
    :raises ValueError: on an unknown relation symbol or when no rows remain.
    """
    import operator  # local import: only needed for the relation dispatch table

    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'selectValues'
    # `== True` is kept as in the original: only a value equal to True enables DEBUG.
    loglevel = 'DEBUG' if api.config.debug_mode == True else 'INFO'
    logger, log_stream = slog.set_logging(att_dict['operator'],
                                          loglevel=loglevel)
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition

    df = msg.body

    ######################### Start Calculation

    # Relation symbol -> element-wise comparison applied to a column.
    relations = {
        '≤': operator.le,
        '<': operator.lt,
        '≥': operator.ge,
        '>': operator.gt,
        '=': operator.eq,
        '!': operator.ne,
    }

    # Remember the index so that index levels can be filtered like columns.
    index_names = df.index.names
    has_named_index = bool(index_names[0])
    if has_named_index:
        logger.debug("Reset index")
        # Not in place: msg.body must stay untouched, even if a later step raises.
        df = df.reset_index()

    # numeric selection
    selection_num = api.config.selection_num
    if selection_num and selection_num.upper() != 'NONE':
        for s in tfp.read_relations(selection_num):
            try:
                compare = relations[s[1]]
            except (KeyError, TypeError):
                raise ValueError('Unknown relation: ' + str(s)) from None
            df = df.loc[compare(df[s[0]], s[2])]
    att_dict['config']['selection_num'] = selection_num

    # value-list selection
    selection_list = api.config.selection_list
    if selection_list and selection_list.upper() != 'NONE':
        value_list_dict = tfp.read_dict_of_list(selection_list)
        for column, allowed_values in value_list_dict.items():
            df = df.loc[df[column].isin(allowed_values)]
    att_dict['config']['selection_list'] = selection_list

    # restore the original index
    if has_named_index:
        att_dict['indices'] = index_names
        logger.debug('Set indices to: {}'.format(str(index_names)))
        df = df.set_index(keys=list(index_names))

    if df.empty:
        logger.error('DataFrame is empty')
        raise ValueError('DataFrame is empty')
    ######################### End Calculation

    # bytes / 1024**2, i.e. the value is in MiB
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.info('End of Process: {}'.format(time_monitor.elapsed_time()))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    # The label used to say kB although the value is in MiB.
    logger.debug('Memory: {} MB'.format(att_dict['memory']))

    # Store and log a preview of the first rows; each cell is cut/padded to 10 chars.
    example_rows = min(EXAMPLE_ROWS, df.shape[0])
    for row_no in range(example_rows):
        row_key = 'row_' + str(row_no)
        att_dict[row_key] = str(
            [str(cell)[:10].ljust(10) for cell in df.iloc[row_no, :].tolist()])
        logger.debug('Head data: {}'.format(att_dict[row_key]))

    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg