Example #1
def process(msg):
    global setup_data
    global last_msg
    global hash_list

    logger, log_stream = slog.set_logging('text cleansing', loglevel=api.config.debug_mode)
    logger.info("Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    msg = check_for_setup(logger, msg)
    if not msg:
        api.send(outports[0]['name'], log_stream.getvalue())
        return 0

    adict = msg.body
    att_dict = msg.attributes

    language_filter = tfp.read_value(api.config.language)
    article_words = dict()
    article_count = 0
    for index_article, article in enumerate(adict):

        language = language_dict[article['media']]

        # filter language
        if language_filter and not language_filter == language :
            #logger.debug('Language filtered out: {} ({})'.format(language, language_filter))
            continue

        article_count += 1
        # check if article has been processed
        if article['hash_text'] in hash_list :
            logger.debug('Article has already been processed: {} - {} - {}'.format(article['date'],article['media'],article['hash_text']))
            continue
        hash_list.append(article['hash_text'])

        text = article['text']
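        # basic cleansing: lower-case the text, strip digits and isolated single letters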
        text = re.sub(r'\d+', '', text.lower())
        text = re.sub(r'\b[a-z]\b', '', text)

        # Language settings
        if language == 'DE':
            doc = nlp_g(text)
        elif language == 'FR':
            doc = nlp_fr(text)
        elif language == 'ES':
            doc = nlp_es(text)
        else :
            logger.warning('Language not implemented')
            doc = None
            words = []

        # only when doc has been created - language exists
        if doc :
            if api.config.mode == 'P+NOUN' :
                words = [token.lemma_[:api.config.max_word_len] for token in doc if token.pos_  in ['PROPN', 'NOUN'] ]
            elif api.config.mode == 'NOUN':
                words = [token.lemma_[:api.config.max_word_len] for token in doc if token.pos_ == 'NOUN']
            elif api.config.mode == 'PROPN' :
                words = [token.lemma_[:api.config.max_word_len] for token in doc if token.pos_ == 'PROPN']
            else :
                words = [token.text[:api.config.max_word_len] for token in doc if not token.is_stop]

        # Remove blacklist words
        words = [ w for w in words if w not in setup_data]

        if api.config.counter:
            article_words[article['hash_text']] = collections.Counter(words)
        else :
            article_words[article['hash_text']] = words


    msg = api.Message(attributes=att_dict,body = article_words)
    logger.info('File processed: {} #Articles: {} '.format(att_dict["storage.filename"],len(adict)))
    api.send(outports[0]['name'], log_stream.getvalue())
    api.send(outports[1]['name'], msg)
Example #2
def process(msg):

    att_dict = msg.attributes

    global result_df

    att_dict['operator'] = 'fromCSV'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    logger.info('Filename: {} index: {}  count: {}  endofSeq: {}'.format(msg.attributes["storage.filename"], \
                                                                         msg.attributes["storage.fileIndex"], \
                                                                         msg.attributes["storage.fileCount"], \
                                                                         msg.attributes["storage.endOfSequence"]))

    if msg.body is None:
        logger.info('Process ended.')
        msg = api.Message(attributes=att_dict, body=result_df)
        log = log_stream.getvalue()
        return log, msg
    elif isinstance(msg.body, str):
        csv_io = io.StringIO(msg.body)
        logger.debug("Input format: <string>")
    elif isinstance(msg.body, bytes):
        csv_io = io.BytesIO(msg.body)
        logger.debug("Input format: <bytes>")
    elif isinstance(msg.body, io.BytesIO):
        logger.debug("Input format: <io.Bytes>")
        csv_io = msg.body
    else:
        raise TypeError('Message body has unsupported type' +
                        str(type(msg.body)))

    # nrows
    nrows = None
    if not api.config.limit_rows == 0:
        nrows = api.config.limit_rows

    # usecols
    use_cols = tfp.read_list(api.config.use_columns)
    logger.debug('Columns used: {}'.format(use_cols))

    # dtypes mapping
    typemap = tfp.read_dict(api.config.dtypes)
    logger.debug('Type cast: {}'.format(str(typemap)))

    kwargs = tfp.read_dict(text=api.config.keyword_args, map_sep='=')
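    # extra keyword arguments from the configuration are passed through to pandas.read_csv below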

    ##### Read string from buffer
    logger.debug("Read from input")
    df = pd.read_csv(csv_io, sep=api.config.separator, usecols=use_cols, dtype=typemap,
                     decimal=api.config.decimal, nrows=nrows, **kwargs)

    # Data from filename
    if api.config.data_from_filename and not api.config.data_from_filename == 'None':
        col = api.config.data_from_filename.split(':')[0].strip().strip(
            "'").strip('"')
        pat = api.config.data_from_filename.split(':')[1].strip().strip(
            "'").strip('"')
        logger.debug('Filename: {}  pattern: {}'.format(
            att_dict['filename'], pat))
        try:
            # note: the configured pattern is only logged; a fixed date pattern is applied here
            dataff = re.match(r'.*(\d{4}-\d+-\d+).*', att_dict['filename'])
            df[col] = dataff.group(1)
        except AttributeError:
            raise ValueError(
                'Pattern not found - Filename: {}  pattern: {}'.format(
                    att_dict['filename'], pat))

    # To Datetime
    if api.config.todatetime and not api.config.todatetime == 'None':
        dt_fmt = tfp.read_dict(api.config.todatetime)
        logger.debug('Time conversion {} by using UTC {}'.format(
            api.config.todatetime, api.config.utc))
        for col, fmt in dt_fmt.items():
            df[col] = pd.to_datetime(df[col], format=fmt, utc=api.config.utc)

    ###### Downcasting
    # save memory footprint for calculating the savings of the downcast
    logger.debug('Memory used before downcast: {}'.format(
        df.memory_usage(deep=True).sum() / 1024**2))
    if api.config.downcast_int:
        df, dci = downcast(df, 'int', 'unsigned')
    if api.config.downcast_float:
        df, dcf = downcast(df, 'float', 'float')

    # check if index is provided and set
    index_list = tfp.read_list(api.config.index_cols)
    if index_list:
        df.set_index(index_list, inplace=True)

    if api.config.collect:
        # stores the result in global variable result_df
        if msg.attributes['storage.fileIndex'] == 0:
            logger.debug('Added to DataFrame: {}'.format(
                att_dict['storage.filename']))
            result_df = df
        else:
            try:
                result_df = pd.concat([result_df, df], axis=0, sort=False)
            except Exception as e:
                logger.error(str(e))
                result_df = df
    else:
        result_df = df

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(
        df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(
            i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(
                att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(
                att_dict['storage.fileIndex'] + 1,
                att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {}  - {}  '.format(
        progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
Example #3
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'transposeColumn'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode == True:
        logger.setLevel('DEBUG')

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation

    att_dict['config']['reset_index'] = api.config.reset_index
    if api.config.reset_index:
        df.reset_index(inplace=True)

    # create DataFrame with numbered columns and concat it to df
    att_dict['config']['transpose_column'] = api.config.transpose_column
    trans_col = tfp.read_value(api.config.transpose_column)

    att_dict['config']['value_column'] = api.config.value_column
    val_col = tfp.read_value(api.config.value_column)

    # new columns
    tvals = list(df[trans_col].unique())
    if api.config.prefix:
        new_cols = {trans_col + '_' + str(v): v for v in tvals}
    else:
        new_cols = {str(v): v for v in tvals}
    t_df = pd.DataFrame(columns=new_cols.keys(), index=df.index)
    df = pd.concat([df, t_df], axis=1)

    # setting the corresponding column to the value of the value column
    for col, val in new_cols.items():
        df.loc[df[trans_col] == val, col] = df.loc[df[trans_col] == val,
                                                   val_col]
    df.drop(columns=[trans_col, val_col], inplace=True)

    att_dict['config']['groupby'] = api.config.groupby
    gbcols = tfp.read_list(api.config.groupby, df.columns)
    # group df
    if gbcols:
        aggr_trans = api.config.aggr_trans.strip()
        aggr_default = api.config.aggr_default.strip()

        aggregation = dict()
        for col in df.columns:
            aggregation[col] = aggr_trans if col in new_cols else aggr_default
        aggregation = {c: a for c, a in aggregation.items() if c not in gbcols}

        df = df.groupby(gbcols, as_index=api.config.as_index).agg(aggregation)

    #####################
    #  final infos to attributes and info message
    #####################

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str(
            [str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])
        logger.debug('Head data: {}'.format(att_dict['row_' + str(i)]))

    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
Example #4
def process(msg):
    logger, log_stream = slog.set_logging('DEBUG')
    time_monitor = tp.progress()

    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    df = msg.body

    columns = tfp.read_list(api.config.columns, df.columns, test_number=False)
    info_only = api.config.info_only
    equal_only = api.config.equal_only
    threshold = api.config.threshold
    upper_threshold = api.config.upper_threshold
    num_values = api.config.num_values

    transform_data = {
        'column': [],
        'dtype': [],
        'unique_values': [],
        'action': []
    }
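    # scan object-dtype columns: well-populated binary columns and sparsely populated
    # columns are mapped to 0/1 codes (unless info_only) and recorded in transform_data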
    for col in df[columns].select_dtypes(include='object'):
        unique_vals = df.loc[df[col].notnull(), col].unique()
        if (len(unique_vals) == num_values) or (len(unique_vals) <= num_values
                                                and not equal_only):
            population = df[col].count() / df.shape[0]
            if population > upper_threshold and len(unique_vals) == 2:
                transform_data['column'].append(col)
                transform_data['dtype'].append(df[col].dtype)
                v0 = 0
                v1 = 1
                if df.loc[df[col] == unique_vals[0],
                          col].count() > df.shape[0] * 0.5:
                    v0 = 1
                    v1 = 0
                # per definition first unique value 0, second unique value 1
                if v0 == 0:
                    transform_data['unique_values'].append(unique_vals)
                else:
                    transform_data['unique_values'].append(
                        [unique_vals[1], unique_vals[0]])
                transform_data['action'].append('map2')
                # print('{}: {} -> {}'.format(vals[0],df.loc[df[col]==vals[0],col].count(),v0))
                # print('{}: {} -> {}'.format(vals[1],df.loc[df[col]==vals[1],col].count(),v1))
                if not info_only:
                    df.loc[df[col] == unique_vals[0], col] = v0
                    df.loc[df[col] == unique_vals[1], col] = v1
                    df.loc[df[col].isnull(), col] = 0
                    df[col] = df[col].astype('int8')
            elif population < threshold or len(unique_vals) == 1:
                transform_data['column'].append(col)
                transform_data['dtype'].append(df[col].dtype)
                transform_data['unique_values'].append(unique_vals)
                transform_data['action'].append('map1')
                if not info_only:
                    df.loc[df[col].isin(unique_vals), col] = 1
                    df.loc[df[col].isnull(), col] = 0
                    df[col] = df[col].astype('int8')

    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())
    return log_stream.getvalue(), api.Message(attributes={'name':'filter_by_population','type':'DataFrame'},body=df),\
            api.Message(attributes={'name':'transformation','type':'DataFrame'},body=pd.DataFrame(transform_data))
Example #5
def process(msg):
    logger, log_stream = slog.set_logging('DEBUG')
    time_monitor = tp.progress()

    result = ''
    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    att_dict = dict()
    att_dict['config'] = dict()

    ###### start of doing calculation

    model = LGBMRegressor(
        n_estimators=200,
        learning_rate=0.03,
        num_leaves=32,
        colsample_bytree=0.9497036,
        subsample=0.8715623,
        max_depth=8,
        reg_alpha=0.04,
        reg_lambda=0.073,
        min_split_gain=0.0222415,
        min_child_weight=40)

    att_dict['config']['train columns'] = api.config.train_cols
    train_cols = tfp.read_list(api.config.train_cols, df.columns)

    att_dict['config']['label'] = api.config.label
    label = tfp.read_value(api.config.label)
    if not label:
        raise ValueError('Label is mandatory')

    # convert categorical columns to integer codes
    for c in df[train_cols].select_dtypes(include='category').columns:
        unique_num = len(df[c].unique())
        nan_num = df[c].isna().sum()
        logger.debug('Cast to category - {}: unique {}, nan: {} of {}'.format(c, unique_num, nan_num, df.shape[0]))
        df[c] = df[c].cat.codes
        df[c] = df[c].astype('int32')

    if pd.api.types.is_categorical(df[label]):
        df[label] = df[label].astype('category')
        logger.debug('Cast label to <category>')
        df[label] = df[label].cat.codes
        df[label] = df[label].astype('int32')

    print(df.select_dtypes(include='category').head(10))
    logger.debug('Train with {} features'.format(len(train_cols)))
    print(train_cols)
    model.fit(df[train_cols], df[label], eval_metric='auc')

    ###### end of doing calculation

    ##############################################
    #  final infos to attributes and info message
    ##############################################

    if df.empty:
        raise ValueError('DataFrame is empty')

    att_dict['operator'] = 'lgbm_classifier'
    att_dict['name'] = prev_att['name']

    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())
    return log_stream.getvalue(), api.Message(attributes=att_dict,body=model)
Example #6
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'sample'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    # test if body refers to a DataFrame type
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        logger.error('Message body does not contain a pandas DataFrame')
        raise TypeError('Message body does not contain a pandas DataFrame')

    # keep att_dict = msg.attributes from above; 'process_list' and the storage.*
    # keys are needed again at the end of the function

    ###### start  calculation

    sample_size = api.config.sample_size
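    # a value < 1 is treated as a fraction of the rows, a value >= 1 as an absolute count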
    if sample_size < 1:
        sample_size = int(sample_size * df.shape[0])
        if sample_size < 1:
            sample_size = 1
            logger.warning(
                "Fraction of sample size too small. Set sample size to 1.")
    elif sample_size > df.shape[0]:
        logger.warning("Sample size larger than number of rows")

    logger.debug("Samples_size: {}/() ({})".format(sample_size, df.shape[0],
                                                   sample_size / df.shape[0]))
    random_state = api.config.random_state

    invariant_column = tfp.read_value(api.config.invariant_column)
    if invariant_column and sample_size < df.shape[0]:
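        # sample whole groups: draw invariant values first, then keep every row
        # belonging to a drawn value via the inner merge below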
        # get the average number of records for each value of invariant
        sc_df = df.groupby(invariant_column)[invariant_column].count()
        sample_size_invariant = int(sample_size / sc_df.mean())
        sample_size_invariant = 1 if sample_size_invariant == 0 else sample_size_invariant  # ensure minimum
        sc_df = sc_df.sample(n=sample_size_invariant,
                             random_state=random_state).to_frame()
        sc_df.rename(columns={invariant_column: 'sum'}, inplace=True)
        # sample the df by merge 2 df
        df = pd.merge(df,
                      sc_df,
                      how='inner',
                      right_index=True,
                      left_on=invariant_column)
        df.drop(columns=['sum'], inplace=True)
    else:
        df = df.sample(n=sample_size, random_state=random_state)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(
        df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(
            i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict[
                'storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'],
                                          att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {}  - {}  '.format(
        progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
Example #7
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'selectValues'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition

    df = msg.body

    ######################### Start Calculation

    # save and reset indices
    index_names = df.index.names
    if index_names[0]:
        logger.debug("Reset index")
        df.reset_index(inplace=True)

    # prepare selection for numbers
    if api.config.selection_num and not api.config.selection_num.upper(
    ) == 'NONE':

        selection_map = tfp.read_relations(api.config.selection_num)
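        # each entry of selection_map is a (column, relation, value) triple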

        for s in selection_map:
            if s[1] == '≤':
                df = df.loc[df[s[0]] <= s[2]]
            elif s[1] == '<':
                df = df.loc[df[s[0]] < s[2]]
            elif s[1] == '≥':
                df = df.loc[df[s[0]] >= s[2]]
            elif s[1] == '>':
                df = df.loc[df[s[0]] > s[2]]
            elif s[1] == '=':
                df = df.loc[df[s[0]] == s[2]]
            elif s[1] == '!':
                df = df.loc[df[s[0]] != s[2]]
            else:
                raise ValueError('Unknown relation: ' + str(s))
    att_dict.setdefault('config', dict())['selection_num'] = api.config.selection_num

    if api.config.selection_list and not api.config.selection_list.upper(
    ) == 'NONE':
        value_list_dict = tfp.read_dict_of_list(api.config.selection_list)
        for key, vl in value_list_dict.items():
            df = df.loc[df[key].isin(vl)]
    att_dict['config']['selection_list'] = api.config.selection_list

    # set  index again
    if index_names[0]:
        att_dict['indices'] = index_names
        logger.debug('Set indices to: {}'.format(str(index_names)))
        df.set_index(keys=index_names, inplace=True)

    if df.empty:
        logger.error('DataFrame is empty')
        raise ValueError('DataFrame is empty')

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(
        df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(
            i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict[
                'storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'],
                                          att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {}  - {}  '.format(
        progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
Example #8
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'splitSample'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation
    if api.config.split > df.shape[0]:
        logger.warning('Split larger than whole sample')
        split = 1
    elif api.config.split > 1:
        split = api.config.split / df.shape[0]
    else:
        split = api.config.split
    logger.info('Split DataFrame: {}'.format(split))

    if api.config.to_category:
        cast_cols = df.select_dtypes(include='object').columns
        for col in cast_cols:
            unique_num = len(df[col].unique())
            nan_num = df[col].isna().sum()
            logger.debug(
                'Cast to category - {}: unique {}, nan: {} of {}'.format(
                    col, unique_num, nan_num, df.shape[0]))
            df[col] = df[col].astype('category')
        logger.info('Cast to category type: {}'.format(cast_cols))

    label = tfp.read_value(api.config.label_col)
    if label:
        label_vals = list(df[label].unique())
        tdf = list()
        for lab in label_vals:
            tdf.append(df.loc[df[label] == lab].sample(
                frac=split, random_state=api.config.seed))
        train_df = pd.concat(tdf)
        logger.info('Consider label ratio for splitting: {}'.format(label))
    else:
        train_df = df.sample(
            frac=split,
            random_state=api.config.seed)  # random state is a seed value

    test_df = df.drop(train_df.index)
    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(
        df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(
            i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(
                att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(
                att_dict['storage.fileIndex'] + 1,
                att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {}  - {}  '.format(
        progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict,
                                              body=train_df), api.Message(
                                                  attributes=att_dict,
                                                  body=test_df)
Example #9
def process(msg):
    global blacklist
    global last_msg
    global hash_list

    logger, log_stream = slog.set_logging(operator_name, api.config.debug_mode)

    # Check if setup complete
    msg = check_for_setup(logger,
                          msg,
                          mode=api.config.mode,
                          use_blacklist=api.config.use_blacklist)
    if not msg:
        api.send(outports[0]['name'], log_stream.getvalue())
        return 0

    logger.info("Main Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    adict = msg.body

    language_filter = tfp.read_value(api.config.language)
    mode = tfp.read_value(api.config.mode)

    if not mode or not any(m in mode for m in supported_word_types):
        raise Exception(
            'Mode is mandatory parameter and valid values are: {}'.format(
                supported_word_types))

    use_keywords = True if 'K' in mode else False
    use_lexicon = True if 'L' in mode else False
    use_blacklist = api.config.use_blacklist

    logger.info('Usage:  keywords: {}  lexicon: {}  blacklist: {} '.format(
        use_keywords, use_lexicon, use_blacklist))

    article_words = list()
    article_count = 0
    for index_article, article in enumerate(adict):

        language = media_languages[article['media']]

        # filter language
        if language_filter and not language_filter == language:
            # logger.debug('Language filtered out: {} ({})'.format(language, language_filter))
            continue

        article_count += 1
        # check if article has been processed
        if article['hash_text'] in hash_list:
            logger.debug(
                'Article has already been processed: {} - {} - {}'.format(
                    article['date'], article['media'], article['hash_text']))
            continue
        hash_list.append(article['hash_text'])

        doc = nlp_doc(logger, language, article['text'])

        words = dict()
        # only when doc has been created - language exists
        if doc:
            if 'P' in api.config.mode:
                words['P'] = [
                    token.lemma_[:api.config.max_word_len] for token in doc
                    if token.pos_ == 'PROPN'
                ]
            if 'N' in api.config.mode:
                words['N'] = [
                    token.lemma_[:api.config.max_word_len] for token in doc
                    if token.pos_ == 'NOUN'
                ]
            if 'X' in api.config.mode:
                words['X'] = [
                    token.text[:api.config.max_word_len] for token in doc
                    if not token.is_stop
                ]
            if use_keywords:
                #words['K'] = [token.lemma_ for kw in keywords for token in doc if re.match(kw, token.lemma_)]
                words['K'] = [
                    token.lemma_ for kw in keywords for token in doc
                    if kw == token.lemma_
                ]
            if use_lexicon and lexicon_languages[language]:
                words['L'] = [
                    lexicon_stem[language][lw] for lw in lexicon_stem[language]
                    for token in doc if re.match(lw, token.lemma_)
                ]
                # exact lemma match; note that this overwrites the stem-based match above
                words['L'] = [
                    lexicon[language][lw] for lw in lexicon[language]
                    for token in doc if lw == token.lemma_
                ]

        for m in words:
            # heuristics
            # remove preceding non-alpha characters
            words[m] = [re.sub(r"^[-'./+]", '', w) for w in words[m]]
            # remove trailing non-alpha characters
            words[m] = [re.sub(r'[-./+]$', '', w) for w in words[m]]
            # minimum word length
            words[m] = [
                w for w in words[m] if len(w) >= api.config.min_word_len
            ]
            # remove numbers and dates
            words[m] = [
                w for w in words[m]
                if not (re.findall(r'\d+[.,]\d+', w) or re.findall(r'^\d+$', w))
            ]
            # Remove blacklist words
            if use_blacklist:
                words[m] = [w for w in words[m] if w not in blacklist]
            article_words.append([
                article['hash_text'], language, m,
                collections.Counter(words[m]).most_common()
            ])

    attributes = {
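        # column metadata of the target table DIPROJECTS.WORD_INDEX,
        # attached to the outgoing message as attributes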
        "table": {
            "columns": [{
                "class": "string",
                "name": "HASH_TEXT",
                "nullable": True,
                "type": {
                    "hana": "INTEGER"
                }
            }, {
                "class": "string",
                "name": "LANGUAGE",
                "nullable": True,
                "size": 2,
                "type": {
                    "hana": "NVARCHAR"
                }
            }, {
                "class": "string",
                "name": "TYPE",
                "nullable": True,
                "size": 1,
                "type": {
                    "hana": "NVARCHAR"
                }
            }, {
                "class": "string",
                "name": "WORDS",
                "nullable": True,
                "type": {
                    "hana": "ARRAY"
                }
            }],
            "name":
            "DIPROJECTS.WORD_INDEX",
            "version":
            1
        },
        "storage.filename": msg.attributes["storage.filename"]
    }

    attributes['counter'] = 'Y' if api.config.counter else 'N'

    table_msg = api.Message(attributes=attributes, body=article_words)
    logger.info('File processed: {} #Articles: {} '.format(
        msg.attributes["storage.filename"], len(adict)))
    api.send(outports[0]['name'], log_stream.getvalue())
    api.send(outports[1]['name'], table_msg)
Example #10
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'transposeColumn'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation
    if api.config.reset_index:
        df.reset_index(inplace=True)
        logger.info('Reset index')

    # create DataFrame with numbered columns and concat it to df
    trans_col = tfp.read_value(api.config.transpose_column)
    logger.info('Transpose column: {}'.format(trans_col))

    val_col = tfp.read_value(api.config.value_column)
    logger.info('Value column: {}'.format(val_col))

    # new columns
    tvals = list(df[trans_col].unique())
    if api.config.prefix:
        new_cols = {trans_col + '_' + str(v): v for v in tvals}
    else:
        new_cols = {str(v): v for v in tvals}
    t_df = pd.DataFrame(columns=new_cols.keys(), index=df.index)
    df = pd.concat([df, t_df], axis=1)

    # setting the corresponding column to the value of the value column
    for col, val in new_cols.items():
        df.loc[df[trans_col] == val, col] = df.loc[df[trans_col] == val,
                                                   val_col]
    df.drop(columns=[trans_col, val_col], inplace=True)

    gbcols = tfp.read_list(api.config.groupby, df.columns)
    # group df
    if gbcols:
        aggr_trans = api.config.aggr_trans.strip()
        aggr_default = api.config.aggr_default.strip()

        aggregation = dict()
        for col in df.columns:
            aggregation[col] = aggr_trans if col in new_cols else aggr_default
        aggregation = {c: a for c, a in aggregation.items() if c not in gbcols}

        df = df.groupby(gbcols, as_index=api.config.as_index).agg(aggregation)
        logger.info('Groupby: {}'.format(gbcols))

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(
        df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(
            i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict[
                'storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'],
                                          att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {}  - {}  '.format(
        progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
Example #11
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'lgbm_classifier'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'],loglevel='DEBUG')
    else :
        logger, log_stream = slog.set_logging(att_dict['operator'],loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation

    model = LGBMRegressor(
        n_estimators=200,
        learning_rate=0.03,
        num_leaves=32,
        colsample_bytree=0.9497036,
        subsample=0.8715623,
        max_depth=8,
        reg_alpha=0.04,
        reg_lambda=0.073,
        min_split_gain=0.0222415,
        min_child_weight=40)

    train_cols = tfp.read_list(api.config.train_cols, df.columns)
    logger.info('Train columns: {}'.format(train_cols))

    label = tfp.read_value(api.config.label_col)
    logger.info('Label column: {}'.format(label))
    if not label:
        raise ValueError('Label is mandatory')

    # convert categorical columns to integer codes
    for c in df[train_cols].select_dtypes(include='category').columns:
        unique_num = len(df[c].unique())
        nan_num = df[c].isna().sum()
        logger.debug('Cast to category - {}: unique {}, nan: {} of {}'.format(c, unique_num, nan_num, df.shape[0]))
        df[c] = df[c].cat.codes
        df[c] = df[c].astype('int32')

    if pd.api.types.is_categorical(df[label]):
        df[label] = df[label].astype('category')
        logger.debug('Cast label to <category>')
        df[label] = df[label].cat.codes
        df[label] = df[label].astype('int32')

    print(df.select_dtypes(include='category').head(10))
    logger.debug('Train with {} features'.format(len(train_cols)))
    print(train_cols)
    model.fit(df[train_cols], df[label], eval_metric='auc')

    ###### end of doing calculation

    # end custom process definition
    if df.empty :
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0],df.shape[1]))
    logger.debug('Memory: {} MB'.format(df.memory_usage(deep=True).sum() / 1024 ** 2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(i,str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict :
        if not att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount'] :
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'],att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {}  - {}  '.format(progress_str,time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict,body=df)
Example #12
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'anonymizeData'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode == True:
        logger.setLevel('DEBUG')

    logger.debug("Process started")
    time_monitor = tp.progress()

    result = ''
    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())

    prev_att = msg.attributes
    df = msg.body

    ###### start of doing calculation

    att_dict['config']['to_nan'] = api.config.to_nan
    to_nan = tfp.read_value(api.config.to_nan, test_number=False)
    if to_nan:
        df.replace(to_nan, np.nan, inplace=True)

    att_dict['config'][
        'anonymize_to_int_cols'] = api.config.anonymize_to_int_cols
    anonymize_to_int_cols = tfp.read_list(api.config.anonymize_to_int_cols,
                                          list(df.columns))
    att_dict['config']['anonymize_cols'] = api.config.anonymize_cols
    anonymize_cols = tfp.read_list(api.config.anonymize_cols, list(df.columns))

    ## Anonymize columns
    if anonymize_cols:
        logger.debug('Anonymize Columns: {}'.format(str(anonymize_cols)))
        # ensure that ids are not anonymized in the section but exclusively in the id-section
        anonymize_cols = [
            c for c in anonymize_cols if not c in anonymize_to_int_cols
        ]

        # replacing each string value with a random string
        for c in df[anonymize_cols].select_dtypes(include='object'):
            unique_list = df[c].unique()
            n = int(math.log10(len(unique_list))) + 2
            # create random map first then check if keys have the values of the keep_list and replace the random values
            rep_map = {
                x: ''.join(random.choices(string.ascii_letters, k=n))
                for x in unique_list if isinstance(x, str)
            }
            for ktk, ktv in keep_terms.items():
                if ktk in rep_map.keys():
                    rep_map[ktk] = ktv
            df[c].replace(rep_map, inplace=True)

        # linear shift of integer
        for c in df[anonymize_cols].select_dtypes(include='int'):
            unique_i = df[c].unique()
            max_i = max(unique_i)
            min_i = min(unique_i)
            length = max_i - min_i
            rand_int1 = random.randint(0, 100)
            rand_int2 = random.randint(0, 100)
            # preserves existing/binary values 0 and 1
            if not (len(unique_i) == 2 and 0 in unique_i and 1 in unique_i):
                df[c] = ((df[c] - min_i) / length * rand_int1 +
                         rand_int2).astype('int')

        # linear shift of float
        for c in df[anonymize_cols].select_dtypes(include='float'):
            unique_f = df[c].unique()
            max_f = max(unique_f)
            min_f = min(unique_f)
            length = max_f - min_f
            rand_float1 = random.random()
            rand_float2 = random.random()
            df[c] = (
                (df[c] - min_f) / length * rand_float1 + rand_float2) / 2.0

    if anonymize_to_int_cols:
        logger.debug('Anonymize to Integer Columns: {}'.format(
            str(anonymize_to_int_cols)))
        # replacing each unique value with a random integer
        for c in df[anonymize_to_int_cols]:
            unique_list = df[c].unique()
            rand_list = list(
                np.random.choice(1000 * len(unique_list),
                                 len(unique_list),
                                 replace=False))
            # create random map first then check if keys have the values of the keep_list and replace the random values
            rep_map = dict(zip(unique_list, rand_list))
            df[c].replace(rep_map, inplace=True)

    att_dict['config']['enumerate_cols'] = api.config.enumerate_cols
    att_dict['config']['prefix_cols'] = api.config.prefix_cols
    enumerate_cols = tfp.read_list(api.config.enumerate_cols, list(df.columns))
    if enumerate_cols:
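        # rename the listed columns to neutral names: <prefix> + zero-padded running index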
        ncols = int(math.log10(len(enumerate_cols))) + 1
        prefix_cols = tfp.read_value(api.config.prefix_cols)
        if not prefix_cols:
            prefix_cols = 'Att_'
        cols_map = {
            oc: prefix_cols + str(i).zfill(ncols)
            for i, oc in enumerate(enumerate_cols)
        }
        df.rename(columns=cols_map, inplace=True)

    ###### end of doing calculation

    ##############################################
    #  final infos to attributes and info message
    ##############################################

    if df.empty:
        raise ValueError('DataFrame is empty')

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str(
            [str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])
        logger.debug('Head data: {}'.format(att_dict['row_' + str(i)]))

    logger.debug('End of Process Function')
    logger.debug('End time: ' + time_monitor.elapsed_time())

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
Example #13
def process(msg) :

    logger, log_stream = slog.set_logging('DEBUG')

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body

    att_dict = dict()
    att_dict['config'] = dict()

    ######################### Start Calculation

    # save and reset indices
    index_names = df.index.names
    if index_names[0]:
        logger.debug("Reset index")
        df.reset_index(inplace=True)

    # prepare selection for numbers
    if api.config.selection_num and not api.config.selection_num.upper() == 'NONE':

        selection_map = tfp.read_relations(api.config.selection_num)

        for s in selection_map:
            if s[1] == '≤':
                df = df.loc[df[s[0]] <= s[2]]
            elif s[1] == '<':
                df = df.loc[df[s[0]] < s[2]]
            elif s[1] == '≥':
                df = df.loc[df[s[0]] >= s[2]]
            elif s[1] == '>':
                df = df.loc[df[s[0]] > s[2]]
            elif s[1] == '=':
                df = df.loc[df[s[0]] == s[2]]
            elif s[1] == '!':
                df = df.loc[df[s[0]] != s[2]]
            else:
                raise ValueError('Unknown relation: ' + str(s))
    att_dict['config']['selection_num'] = api.config.selection_num

    if api.config.selection_list and not api.config.selection_list.upper() == 'NONE':
        value_list_dict = tfp.read_dict_of_list(api.config.selection_list)
        for key, vl in value_list_dict.items():
            df = df.loc[df[key].isin(vl)]
    att_dict['config']['selection_list'] = api.config.selection_list

    # set  index again
    if index_names[0]:
        att_dict['indices'] = index_names
        logger.debug('Set indices to: {}'.format(str(index_names)))
        df.set_index(keys=index_names, inplace=True)

    if df.empty:
        logger.error('DataFrame is empty')
        raise ValueError('DataFrame is empty')
    ######################### End Calculation

    ##############################################
    #  final infos to attributes and info message
    ##############################################
    att_dict['operator'] = 'selectDataFrame'
    att_dict['name'] = prev_att['name']
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['number_columns'] = df.shape[1]
    att_dict['number_rows'] = df.shape[0]
    if 'id' in prev_att.keys():
        att_dict['id'] = prev_att['id'] + '; ' + att_dict['operator'] + ': ' + str(id(df))
    else:
        att_dict['id'] = att_dict['operator'] + ': ' + str(id(df))

    example_rows = EXAMPLE_ROWS if att_dict['number_rows'] > EXAMPLE_ROWS else att_dict['number_rows']
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])

    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
Example #14
def process(msg1, msg2, msg3, msg4, msg5):
    adict = msg1.attributes
    msg_list = [msg1, msg2, msg3, msg4, msg5]

    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging('scrapy', loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging('scrapy', loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()
    # logger, log_stream = slog.set_logging('scrapy',loglevel=api.config.debug_mode)

    scrapy_dir = tfp.read_value(api.config.scrapy_dir)
    if not scrapy_dir:
        logger.error('Scrapy directory is a mandatory entry field')
        raise ValueError('Missing Scrapy Directory')
    logger.info('Change directory to: {}'.format(scrapy_dir))
    os.chdir(scrapy_dir)

    project_dir = tfp.read_value(api.config.project_dir)
    if not project_dir:
        logger.error('Scrapy project directory is a mandatory entry field')
        raise ValueError('Missing Scrapy Project Directory')

    project_dir = os.path.join(scrapy_dir, project_dir)

    new_file_list = []
    for msg in msg_list:
        filename = msg.attributes["storage.filename"]
        if filename == 'spider.py':
            filename = os.path.join(project_dir, 'spiders', filename)
        else:
            filename = os.path.join(project_dir, filename)
        # copy files to directories
        with open(filename, 'wb') as fout:
            logger.info('Write to filename (binary): {}'.format(filename))
            fout.write(msg.body)
        new_file_list.append(filename)

    for f in new_file_list:
        if os.path.isfile(f):
            logger.info('File successfully written: {} ({})'.format(
                f, time.ctime(os.path.getmtime(f))))
        else:
            logger.error('File does not exist: {}'.format(f))

    api.send(outports[0]['name'], log_stream.getvalue())
    log_stream.truncate(0)

    spiderlist = tfp.read_list(api.config.spider)
    num_spiders = len(spiderlist)
    num_batches = 0
    num_all_articles = 0
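    # run each configured spider as its own 'scrapy crawl' subprocess and parse
    # the scraped articles from its stdout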
    if api.config.start_cmd:
        for i, spider in enumerate(spiderlist):
            media = spider.split('_')[0]
            cmd = ['scrapy', 'crawl', spider]
            logger.info('Start scrapy: {} ({}/{})'.format(cmd, i, num_spiders))
            api.send(outports[0]['name'], log_stream.getvalue())
            log_stream.truncate(0)
            #proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd = scrapy_dir,universal_newlines=True)
            proc = subprocess.run(cmd,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE,
                                  cwd=scrapy_dir,
                                  universal_newlines=True)
            #proc = subprocess.Popen(['python','/Users/Shared/data/onlinemedia/outputgen.py'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
            #print('CWD: {}'.format(os.getcwd()))

            api.send(outports[1]['name'], proc.stderr)

            count_articles = 0
            articles_list = list()
            # run through stdout after scrape has ended and add to batch_output
            last_article = dict()
            for line in proc.stdout.splitlines():
                adict = format_check_output(line, logger)
                if adict:
                    adict['media'] = media
                    articles_list.append(adict)
                    last_article = adict
                    count_articles += 1

            # send result to outport
            if len(articles_list) == 0:
                logger.warning('No articles found: {}'.format(media))
                continue

            num_batches += 1
            attributes = {
                k: v
                for k, v in last_article.items()
                if k in ['website', 'date', 'columns']
            }
            attributes['filename'] = '{}_{}.json'.format(
                media, last_article['date'])
            attributes['batch.index'] = i
            attributes['batch.number'] = num_spiders
            if i + 1 == num_spiders:
                attributes['batch.last'] = True
            msg = api.Message(attributes=attributes, body=articles_list)
            api.send(outports[3]['name'], msg)

            if api.config.json_string_output:
                attributes['format'] = 'JSON String'
                msg = api.Message(attributes=attributes,
                                  body=json.dumps(articles_list,
                                                  ensure_ascii=False,
                                                  indent=4))
                api.send(outports[2]['name'], msg)

            logger.info('Spider completed: {} - #articles: {}'.format(
                spider, count_articles))
            num_all_articles += count_articles

    logger.info('Process ended: {}  '.format(time_monitor.elapsed_time()))
    logger.info('<SCAN ENDED><{}>'.format(num_batches))

    api.send(outports[0]['name'], log_stream.getvalue())
    return 0
Example #15
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'splitSample'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode == True:
        logger.setLevel('DEBUG')

    time_monitor = tp.progress()

    logger.debug('Start Process Function')
    logger.debug('Start time: ' + time_monitor.get_start_time())
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation
    att_dict['config']['split'] = api.config.split
    if api.config.split > df.shape[0]:
        logger.warning('Split larger than whole sample')
        split = 1
    elif api.config.split > 1:
        split = api.config.split / df.shape[0]
    else:
        split = api.config.split

    att_dict['config']['to_category'] = api.config.to_category
    if api.config.to_category:
        for col in df.select_dtypes(include='object').columns:
            unique_num = len(df[col].unique())
            nan_num = df[col].isna().sum()
            logger.debug(
                'Cast to category - {}: unique {}, nan: {} of {}'.format(col, unique_num, nan_num, df.shape[0]))
            df[col] = df[col].astype('category')

    att_dict['config']['label'] = api.config.label
    label = tfp.read_value(api.config.label)
    if label:
        label_vals = list(df[label].unique())
        tdf = list()
        for lab in label_vals:
            tdf.append(df.loc[df[label] == lab].sample(frac=split, random_state=api.config.seed))
        train_df = pd.concat(tdf)
    else:
        train_df = df.sample(frac=split, random_state=api.config.seed)  # random state is a seed value

    test_df = df.drop(train_df.index)
    ###### end of doing calculation

    ##############################################
    #  final infos to attributes and info message
    ##############################################

    if df.empty:
        raise ValueError('DataFrame is empty')

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])
        logger.debug('Head data: {}'.format(att_dict['row_' + str(i)]))

    train_msg =  api.Message(attributes=att_dict, body=train_df)
    test_msg = api.Message(attributes=att_dict, body=test_df)
    logger.debug('End time: ' + time_monitor.elapsed_time())

    return log_stream.getvalue(), train_msg, test_msg
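
# A minimal sketch (not from the original operator) of the stratified split above,
# assuming a toy DataFrame with a 'label' column and a fractional split of 0.75:
# the same fraction is sampled within every label value, the remainder is the test set.
import pandas as pd

toy = pd.DataFrame({'label': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'],
                    'x': range(8)})
train = pd.concat([toy.loc[toy['label'] == lab].sample(frac=0.75, random_state=1)
                   for lab in toy['label'].unique()])
test = toy.drop(train.index)
print(train.shape, test.shape)   # (6, 2) (2, 2)
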
def process(msg) :

    logger, log_stream = slog.set_logging('DEBUG')

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    att_dict = dict()
    att_dict['config'] = dict()

    ###### start of doing calculation

    att_dict['prev_number_columns'] = df.shape[1]
    att_dict['prev_number_rows'] = df.shape[0]

    #
    att_dict['config']['remove_duplicates_cols'] = api.config.remove_duplicates_cols
    remove_duplicates_cols = tfp.read_list(api.config.remove_duplicates_cols)
    if remove_duplicates_cols:
        df = df.groupby(remove_duplicates_cols).first().reset_index()
        logger.debug('#Dropped duplicates: {} - {} = {}'.format(att_dict['prev_number_rows'], df.shape[0], \
                                                                 att_dict['prev_number_rows'] - df.shape[0]))

    att_dict['config']['value_to_nan'] = api.config.value_to_nan
    value_to_nan = tfp.read_value(api.config.value_to_nan)
    if value_to_nan:
        obj_cols = df.select_dtypes(include='object').columns
        df[obj_cols] = df[obj_cols].replace(value_to_nan, np.nan)

    att_dict['config']['yes_no_to_boolean'] = str(api.config.yes_no_to_num)
    if api.config.yes_no_to_num:
        prev_categoricals = len(df.select_dtypes(include=np.object).columns)
        for col in df.select_dtypes(include=np.object):
            df[col] = df[col].str.upper()
            vals = [x for x in df.loc[df[col].notnull(), col].unique()]
            if len(vals) == 1 and vals[0] in ['YES', 'Y']:
                df.loc[df[col].notnull(), col] = 1
                df.loc[df[col].isnull(), col] = 0
                try:
                    df[col] = df[col].astype('int8')
                except ValueError:
                    print('Value Error: {}'.format(col))
                    print(df[col].unique())
            if len(vals) == 1 and vals[0] in ['NO', 'N']:
                df.loc[df[col].notnull(), col] = 0
                df.loc[df[col].isnull(), col] = 0
                df[col] = df[col].astype('int8')
            if len(vals) == 2 and (all(i in vals for i in ['YES', 'NO']) or all(i in vals for i in ['Y', 'N'])):
                df[col] = df[col].replace(to_replace={'NO': 0, 'N': 0, 'no': 0, 'n': 0, 'YES': 1, 'Y': 1, 'yes': 1, 'y': 1})
                df[col] = df[col].astype('int8')
        after_categoricals = len(df.select_dtypes(include=np.object).columns)
        logger.debug('<yes_no_to_boolean> impact: {} -> {}'.format(prev_categoricals, after_categoricals))

    att_dict['config']['all_constant_to_NaN'] = str(api.config.all_constant_to_NaN)
    if api.config.all_constant_to_NaN:
        num_constant_cols = 0
        for col in df.columns:
            unique_vals = df[col].unique()
            if len(unique_vals) == 1:
                df[col] = np.nan
                num_constant_cols = num_constant_cols + 1
        logger.debug('<all_constant_to_NaN> number of columns: {}'.format(num_constant_cols))

    # remove rare value rows with quantile
    att_dict['config']['rare_value_cols'] = api.config.rare_value_cols
    att_dict['config']['rare_value_quantile'] = api.config.rare_value_quantile
    att_dict['config']['rare_value_std'] = api.config.rare_value_std
    rare_value_cols = tfp.read_list(api.config.rare_value_cols, list(df.columns))
    if rare_value_cols:
        logger.debug('quantile')
        # drop rare values by quantile
        if api.config.rare_value_quantile > 0:
            if not (0 <= api.config.rare_value_quantile < 1):
                raise ValueError('Quantile value range: [0,1[, not {}'.format(api.config.rare_value_quantile))
            num_reduce_categoricals_col = 0
            for col in rare_value_cols:
                unique_num = len(df[col].unique())
                threshold = df[col].count() / unique_num * api.config.rare_value_quantile
                value_counts = df[col].value_counts()  # Specific column
                # kept_values = value_counts[value_counts > threshold].count()
                if value_counts[value_counts > threshold].count() > 1:
                    to_remove = value_counts[value_counts <= threshold].index
                    if len(to_remove) > 0:
                        logger.debug(
                            'Drop rare value by quantile: Column {}: {}/{} '.format(col, len(to_remove), unique_num))
                        df[col].replace(to_remove, np.nan, inplace=True)
                        num_reduce_categoricals_col += 1
            logger.debug('<rare_value_quantile> impact on columns: {}/{}'.format(num_reduce_categoricals_col,
                                                                                  len(rare_value_cols)))

        # drop rare values by std
        if api.config.rare_value_std > 0:
            num_reduce_categoricals_col = 0
            for col in rare_value_cols:
                unique_num = len(df[col].unique())
                value_counts = df[col].value_counts()
                mean = value_counts.mean()
                threshold = value_counts.mean() - value_counts.std() * api.config.rare_value_std
                if threshold > 1:
                    to_remove = value_counts[value_counts <= threshold].index
                    if len(to_remove) > 0:
                        logger.debug(
                            'Drop rare value by std: Column {}: {}/{} '.format(col, len(to_remove), unique_num))
                        df[col].replace(to_remove, np.nan, inplace=True)
                        num_reduce_categoricals_col += 1
            logger.debug(
                '<rare_value_std> impact on columns: {}/{}'.format(num_reduce_categoricals_col, len(rare_value_cols)))

    # for unique values less than threshold_unique set to 1. All NaN set to 0
    att_dict['config']['threshold_unique_cols'] = api.config.threshold_unique_cols
    att_dict['config']['threshold_unique'] = api.config.threshold_unique
    threshold_unique_cols = tfp.read_list(api.config.threshold_unique_cols, list(df.columns))
    if threshold_unique_cols:
        prev_obj_cols = len(df.select_dtypes("object").columns)
        for col in threshold_unique_cols:
            if df[col].dtype == np.object:
                unique_vals = list(df[col].unique())
                if len(unique_vals) <= api.config.threshold_unique:
                    # test if one of the values is nan
                    if np.nan in unique_vals:
                        df.loc[df[col].notnull(), col] = 1
                        df.loc[df[col].isnull(), col] = 0
                        df[col] = df[col].astype('int8')
        after_obj_cols = len(df.select_dtypes("object").columns)
        logger.debug(
            'Threshold unique effect on number of categorical columns: {} -> {}'.format(prev_obj_cols, after_obj_cols))

    # columns with fewer non-null values than the sparse threshold are set to NaN
    att_dict['config']['sparse_cols'] = api.config.sparse_cols
    att_dict['config']['sparse'] = api.config.sparse
    sparse_cols = tfp.read_list(api.config.sparse_cols)
    if sparse_cols:
        logger.debug('Sparse check')
        if api.config.reduce_categoricals_only:
            sparse_cols = [c for c in sparse_cols if df[c].dtype == np.object]
        sparse_threshold = api.config.sparse
        if sparse_threshold < 1:
            sparse_threshold = sparse_threshold * df.shape[0]
        for col in sparse_cols:
            if df[col].count() < sparse_threshold:
                logger.debug('Sparse threshold: Removed column {} (#values {})'.format(col, df[col].count()))
                df[col] = np.nan

    # removes columns with to many category values that could not be transposed
    att_dict['config']['max_cat_num'] = api.config.max_cat_num
    att_dict['config']['max_cat_num_cols'] = api.config.max_cat_num_cols
    max_cat_num_cols = tfp.read_list(api.config.max_cat_num_cols)
    if api.config.max_cat_num > 0 and max_cat_num_cols:
        drop_cols = list()
        for col in max_cat_num_cols:
            if df[col].dtype == np.object:
                if len(df[col].unique()) > api.config.max_cat_num:
                    drop_cols.append(col)
        df.drop(columns=drop_cols, inplace=True)

    # remove cols with only NaN
    att_dict['config']['drop_nan_columns'] = api.config.drop_nan_columns
    if api.config.drop_nan_columns:
        df.dropna(axis='columns', how='all', inplace=True)

    # remove rows with NAN except for dimension cols
    att_dict['config']['drop_nan_rows_cols'] = api.config.drop_nan_rows_cols
    drop_nan_rows_cols = tfp.read_list(api.config.drop_nan_rows_cols, df.columns)
    if drop_nan_rows_cols:
        prev_row_num = df.shape[0]
        df.dropna(subset=drop_nan_rows_cols, how='all', inplace=True)
        logger.debug('<drop_nan_rows_cols> deleted rows: {}/{}'.format(prev_row_num - df.shape[0], prev_row_num))

    # fills NaN with a given value for all object type columns
    fill_categoricals_nan = tfp.read_value(api.config.fill_categoricals_nan)
    if fill_categoricals_nan:
        cat_cols = df.select_dtypes(include='object')
        for col in cat_cols:
            df[col].fillna(value=fill_categoricals_nan, inplace=True)

    # under construction: error-prone and ugly, therefore disabled
    #if api.config.cut_obj_size > 0:
    #    cols_obj = df.select_dtypes(include='object')
    #    dict_mapping = dict()
    #    for col in cols_obj:
    #        if df[col].str.len().max() > api.config.cut_obj_size:
    #            catmap = dict(enumerate(df[col].unique()))
    #            valmap = {val: val[:api.config.cut_obj_size - 3] + '_' + str(cat) for cat, val in catmap.items()}
    #            if len(api.config.fill_categoricals_nan) > 0:
    #                if api.config.fill_categoricals_nan in valmap.keys():
    #                    valmap[api.config.fill_categoricals_nan] = api.config.fill_categoricals_nan
    #            df[col] = df[col].map(valmap)  # problem
    #        df[col].str.replace(r'[,\.:;]', '')
    #    print(dict_mapping)

    if api.config.fill_numeric_nan_zero:
        cols_num = df.select_dtypes(include=np.number)
        for col in cols_num:
            df[col] = df[col].fillna(0.0)

    print('Cols: {} -> {}   Rows: {} -> {}'.format(att_dict['prev_number_columns'], df.shape[1],
                                                   att_dict['prev_number_rows'], df.shape[0]))

    ###### end of doing calculation

    ##############################################
    #  final infos to attributes and info message
    ##############################################

    if df.empty:
        raise ValueError('DataFrame is empty')

    att_dict['operator'] = 'selectDataFrame'
    att_dict['name'] = prev_att['name']
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024 ** 2
    att_dict['columns'] = str(list(df.columns))
    att_dict['number_columns'] = df.shape[1]
    att_dict['number_rows'] = df.shape[0]

    example_rows = EXAMPLE_ROWS if att_dict['number_rows'] > EXAMPLE_ROWS else att_dict['number_rows']
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])

    # end custom process definition

    log = log_stream.getvalue()
    return log, msg
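
# A minimal sketch (toy Series, assumed data) of the rare-value-by-quantile rule above:
# values whose frequency is at or below (non-null count / #unique values) * quantile
# are replaced with NaN.
import numpy as np
import pandas as pd

s = pd.Series(['x'] * 50 + ['y'] * 45 + ['z'] * 5)
quantile = 0.5
threshold = s.count() / s.nunique() * quantile      # 100 / 3 * 0.5 = 16.67
counts = s.value_counts()
rare = list(counts[counts <= threshold].index)      # ['z']
s = s.replace(rare, np.nan)
print(s.value_counts(dropna=False))                 # x: 50, y: 45, NaN: 5
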
Exemple #17
0
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'groupby'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    prev_att = msg.attributes
    df = msg.body
    prev_shape = df.shape

    ###### start of doing calculation

    # groupby list
    cols = tfp.read_list(api.config.groupby)

    # mapping aggregation
    try:
        colagg = tfp.read_dict(api.config.aggregation)
    except IndexError:
        logger.info('Aggregation is not a map; trying to parse a single value instead')
        colagg = tfp.read_value(api.config.aggregation)

    # groupby
    logger.info('Group columns: {}'.format(cols))
    logger.info('Aggregation: {}'.format(colagg))
    logger.info('Index: {}'.format(api.config.index))
    df = df.groupby(cols, as_index=api.config.index).agg(colagg)

    # drop col
    dropcols = tfp.read_list(api.config.drop_columns)
    if dropcols:
        logger.info('Drop columns: {}'.format(dropcols))
        df.drop(columns=dropcols, inplace=True)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(
        df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(
            i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict[
                'storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'],
                                          att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {}  - {}  '.format(
        progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
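
# A minimal sketch (toy data assumed) of the groupby/aggregation pattern above:
# the parsed column-to-function mapping is handed straight to DataFrame.agg.
import pandas as pd

toy = pd.DataFrame({'city': ['A', 'A', 'B'], 'sales': [10, 20, 5], 'visits': [1, 2, 3]})
colagg = {'sales': 'sum', 'visits': 'mean'}          # what tfp.read_dict would return
grouped = toy.groupby(['city'], as_index=False).agg(colagg)
print(grouped)                                       # A: sales 30, visits 1.5 / B: 5, 3.0
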
Exemple #18
0
def process(left_msg, right_msg):

    att_dict = left_msg.attributes
    att_dict['operator'] = 'join'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    # start custom process definition

    l_att = left_msg.attributes
    r_att = right_msg.attributes

    # read stream from memory
    left_df = left_msg.body
    right_df = right_msg.body

    ###### start of doing calculation
    how = tfp.read_value(api.config.how)

    # merge according to config
    if api.config.on_index:
        df = pd.merge(left_df,
                      right_df,
                      how=how,
                      left_index=True,
                      right_index=True)
    elif api.config.left_on and api.config.right_on:
        left_on_list = tfp.read_list(api.config.left_on)
        right_on_list = tfp.read_list(api.config.right_on)
        logger.info('Join DataFrames on {} - {}'.format(
            left_on_list, right_on_list))
        left_df.reset_index(inplace=True)
        right_df.reset_index(inplace=True)

        df = pd.merge(left_df,
                      right_df,
                      how=how,
                      left_on=left_on_list,
                      right_on=right_on_list)

        # removing second index - might be a more elegant solution
        if 'index_x' in df.columns:
            df.drop(columns=['index_x'], inplace=True)
    else:
        raise ValueError(
            "Config setting: Either <on> or both <left_on> and <right_on> has to be set in order to join the dataframes"
        )

    index_list = tfp.read_list(api.config.new_indices)
    if index_list:
        df.set_index(keys=index_list, inplace=True)
        logger.info('Set index: {}'.format(index_list))

    col_list = tfp.read_list(api.config.drop_columns)
    if col_list:
        df.drop(labels=col_list, axis=1, inplace=True)
        logger.info('Drop columns: {}'.format(col_list))

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(
        df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(
            i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '>BATCH ENDED<'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if not att_dict['storage.fileIndex'] + 1 == att_dict[
                'storage.fileCount']:
            progress_str = '{}/{}'.format(att_dict['storage.fileIndex'],
                                          att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))
    logger.debug('Process ended: {}  - {}  '.format(
        progress_str, time_monitor.elapsed_time()))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=df)
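
# A minimal sketch (toy frames assumed) of the two merge branches above:
# joining on the index versus joining on explicit key columns after reset_index.
import pandas as pd

left = pd.DataFrame({'key': [1, 2], 'l': ['a', 'b']}).set_index('key')
right = pd.DataFrame({'key': [1, 2], 'r': ['x', 'y']}).set_index('key')

on_index = pd.merge(left, right, how='inner', left_index=True, right_index=True)
on_cols = pd.merge(left.reset_index(), right.reset_index(),
                   how='inner', left_on=['key'], right_on=['key'])
print(on_index)
print(on_cols)
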
Exemple #19
0
def process(msg):

    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'fromCSV'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode == True:
        logger.setLevel('DEBUG')

    logger.debug("Process started")

    global result_df

    att_dict['filename'] = msg.attributes["storage.filename"]

    logger.info('Filename: {} index: {}  count: {}  endofSeq: {}'.format(msg.attributes["storage.filename"], \
                                                                         msg.attributes["storage.fileIndex"], \
                                                                         msg.attributes["storage.fileCount"], \
                                                                         msg.attributes["storage.endOfSequence"]))

    # using file name from attributes of ReadFile
    if not api.config.df_name or api.config.df_name == "DataFrame":
        att_dict['name'] = att_dict['filename'].split(".")[0]

    if isinstance(msg.body, str):
        csv_io = io.StringIO(msg.body)
        logger.debug("Input format: <string>")
    elif isinstance(msg.body, bytes):
        csv_io = io.BytesIO(msg.body)
        logger.debug("Input format: <bytes>")
    elif isinstance(msg.body, io.BytesIO):
        logger.debug("Input format: <io.Bytes>")
        csv_io = msg.body
    else:
        raise TypeError('Message body has unsupported type' +
                        str(type(msg.body)))

    # nrows
    nrows = None
    if not api.config.limit_rows == 0:
        nrows = api.config.limit_rows

    # usecols
    att_dict['config']['use_columns'] = api.config.use_columns
    use_cols = tfp.read_list(api.config.use_columns)

    # dtypes mapping
    att_dict['config']['dtypes'] = api.config.dtypes
    typemap = tfp.read_dict(api.config.dtypes)

    kwargs = tfp.read_dict(text=api.config.keyword_args, map_sep='=')

    ##### Read string from buffer
    logger.debug("Read from input")
    df = pd.read_csv(csv_io, sep=api.config.separator, usecols=use_cols, dtype=typemap, decimal=api.config.decimal, \
                     nrows=nrows, **kwargs)

    # Data from filename
    if api.config.data_from_filename and not api.config.data_from_filename == 'None':
        col = api.config.data_from_filename.split(':')[0].strip().strip(
            "'").strip('"')
        pat = api.config.data_from_filename.split(':')[1].strip().strip(
            "'").strip('"')
        logger.debug('Filename: {}  pattern: {}'.format(
            att_dict['filename'], pat))
        try:
            dataff = re.match(pat, att_dict['filename'])
            df[col] = dataff.group(1)
        except AttributeError:
            raise ValueError(
                'Pattern not found - Filename: {}  pattern: {}'.format(
                    att_dict['filename'], pat))

    # To Datetime
    if api.config.todatetime and not api.config.todatetime == 'None':
        coldate = api.config.todatetime.split(':')[0].strip().strip("'").strip(
            '"')
        dformat = api.config.todatetime.split(':')[1].strip().strip("'").strip(
            '"')
        df[coldate] = pd.to_datetime(df[coldate], format=dformat)

    ###### Downcasting
    # save memory footprint for calculating the savings of the downcast
    att_dict['previous_memory'] = df.memory_usage(deep=True).sum() / 1024**2
    if api.config.downcast_int:
        df, dci = downcast(df, 'int', 'unsigned')
    if api.config.downcast_float:
        df, dcf = downcast(df, 'float', 'float')

    # check if index is provided and set
    index_list = tfp.read_list(api.config.index_cols)
    att_dict['config']['index_cols'] = str(index_list)
    att_dict['index_cols'] = str(index_list)
    if index_list:
        df.set_index(index_list, inplace=True)

    # stores the result in global variable result_df
    if msg.attributes['storage.fileIndex'] == 0:
        logger.debug('Added to DataFrame: {}'.format(att_dict['filename']))
        result_df = df
    else:
        result_df = pd.concat([result_df, df], axis=0, sort=False)

    ##############################################
    #  final infos to attributes and info message
    ##############################################

    att_dict['memory'] = result_df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = list(result_df.columns)
    att_dict['dtypes'] = {
        col: str(ty)
        for col, ty in df.dtypes.to_dict().items()
    }
    att_dict['shape'] = result_df.shape
    att_dict['id'] = str(id(result_df))

    logger.debug('Columns: {}'.format(str(result_df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        result_df.shape[0], result_df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))
    example_rows = EXAMPLE_ROWS if result_df.shape[
        0] > EXAMPLE_ROWS else result_df.shape[0]
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str(
            [str(i)[:10].ljust(10) for i in result_df.iloc[i, :].tolist()])
        logger.debug('Head data: {}'.format(att_dict['row_' + str(i)]))

    # end custom process definition
    msg = api.Message(attributes=att_dict, body=result_df)
    log = log_stream.getvalue()
    return log, msg
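
# A minimal sketch of what a downcast step like the one above could do (the original
# downcast() helper is defined elsewhere and not shown here): shrink integer columns
# with pd.to_numeric to reduce the memory footprint.
import pandas as pd

df_small = pd.DataFrame({'a': [1, 2, 3]})
before = df_small.memory_usage(deep=True).sum()
df_small['a'] = pd.to_numeric(df_small['a'], downcast='unsigned')
after = df_small.memory_usage(deep=True).sum()
print(df_small['a'].dtype, before, '->', after)      # e.g. uint8, fewer bytes
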
Exemple #20
0
def process(msg):

    logger, log_stream = slog.set_logging('DEBUG')

    # start custom process definition
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        logger.error('Message body does not contain a pandas DataFrame')
        raise TypeError('Message body does not contain a pandas DataFrame')

    att_dict = dict()
    att_dict['config'] = dict()

    ###### start of doing calculation

    # segment columns
    att_dict['config']['segment_cols'] = api.config.segment_cols
    segment_cols = tfp.read_list(api.config.segment_cols)

    # regression columns
    att_dict['config']['regression_cols'] = api.config.regression_cols
    regression_cols = tfp.read_list(api.config.regression_cols)
    if not regression_cols:
        logger.error('No Regression Columns - mandatory data')
        raise ValueError('No Regression Columns - mandatory data')

    # prediction column
    att_dict['config']['prediction_col'] = api.config.prediction_col
    prediction_col = tfp.read_value(api.config.prediction_col)
    if not prediction_col:
        raise ValueError('No Prediction Column - mandatory data')

    training_cols = regression_cols + [prediction_col]
    model = LinearRegression(fit_intercept=True)

    def fit(x):
        model.fit(x[regression_cols], x[prediction_col])
        return pd.Series([model.coef_, model.intercept_],
                         index=['coef', 'intercept'])

    if segment_cols:
        coef_df = df.groupby(segment_cols)[training_cols].apply(
            fit).reset_index()
    else:
        model.fit(df[regression_cols], df[prediction_col])
        coef_df = pd.Series([model.coef_, model.intercept_],
                            index=['coef', 'intercept'])

    ##############################################
    #  final infos to attributes and info message
    ##############################################

    if df.empty:
        raise ValueError('DataFrame is empty')

    att_dict['operator'] = 'regressionTrainingDataFrame'
    att_dict['name'] = prev_att['name']
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['number_columns'] = df.shape[1]
    att_dict['number_rows'] = df.shape[0]

    example_rows = EXAMPLE_ROWS if att_dict[
        'number_rows'] > EXAMPLE_ROWS else att_dict['number_rows']
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str(
            [str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])

    # end custom process definition

    log = log_stream.getvalue()
    coef_att = {
        'segmentation_columns': segment_cols,
        'regression_columns': regression_cols,
        'prediction_column': prediction_col
    }

    msg_coef = api.Message(attributes=coef_att, body=coef_df)
    msg_data = api.Message(attributes=att_dict, body=df)

    return log, msg_coef, msg_data
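
# A minimal sketch (toy data assumed) of the per-segment fit above: one
# LinearRegression per group, collecting coefficients and intercept per segment.
import pandas as pd
from sklearn.linear_model import LinearRegression

toy = pd.DataFrame({'seg': ['a', 'a', 'a', 'b', 'b', 'b'],
                    'x': [1, 2, 3, 1, 2, 3],
                    'y': [2, 4, 6, 1, 2, 3]})

def fit_group(g):
    model = LinearRegression(fit_intercept=True)
    model.fit(g[['x']], g['y'])
    return pd.Series([model.coef_, model.intercept_], index=['coef', 'intercept'])

coef_df = toy.groupby('seg')[['x', 'y']].apply(fit_group).reset_index()
print(coef_df)    # coef ~[2.0] for 'a', ~[1.0] for 'b', intercepts ~0.0
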
def process(msg):

    logger, log_stream = slog.set_logging('word_indexing',
                                          loglevel=api.config.debug_mode)
    logger.info("Process started")
    time_monitor = tp.progress()

    articles = msg.body
    word_index = list()
    # as table
    for article in articles:
        word_index.extend([[article[0], article[1], article[2], w[0], w[1]]
                           for w in article[3]])

    attributes = {
        "table": {
            "columns": [{
                "class": "string",
                "name": "HASH_TEXT",
                "nullable": True,
                "type": {
                    "hana": "INTEGER"
                }
            }, {
                "class": "string",
                "name": "LANGUAGE",
                "nullable": True,
                "size": 2,
                "type": {
                    "hana": "NVARCHAR"
                }
            }, {
                "class": "string",
                "name": "TYPE",
                "nullable": True,
                "size": 1,
                "type": {
                    "hana": "NVARCHAR"
                }
            }, {
                "class": "string",
                "name": "WORD",
                "nullable": True,
                "size": 80,
                "type": {
                    "hana": "NVARCHAR"
                }
            }, {
                "class": "string",
                "name": "COUNT",
                "nullable": True,
                "type": {
                    "hana": "INTEGER"
                }
            }],
            "name":
            "DIPROJECTS.WORD_INDEX3",
            "version":
            1
        }
    }

    logger.debug('Process ended, articles processed {}  - {}  '.format(
        len(articles), time_monitor.elapsed_time()))
    api.send(outports[0]['name'], log_stream.getvalue())

    msg = api.Message(attributes=attributes, body=word_index)
    api.send(outports[1]['name'], msg)
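
# A minimal sketch of the flattening above, assuming each article entry carries
# (hash, language, type, [(word, count), ...]) as produced by the word-extraction step.
articles_toy = [[123, 'DE', 'N', [('haus', 3), ('stadt', 1)]]]
word_index_toy = [[a[0], a[1], a[2], w[0], w[1]] for a in articles_toy for w in a[3]]
print(word_index_toy)
# [[123, 'DE', 'N', 'haus', 3], [123, 'DE', 'N', 'stadt', 1]]
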
def process(msg):
    global blacklist
    global last_msg
    global word_counter
    global hash_list

    logger, log_stream = slog.set_logging('word_frequency',
                                          loglevel=api.config.debug_mode)
    logger.info("Process started")
    time_monitor = tp.progress()

    msg = check_for_setup(logger, msg)
    if not msg:
        api.send(outports[0]['name'], log_stream.getvalue())
        return 0

    adict = msg.body
    att_dict = msg.attributes

    end_date = datetime.strptime(api.config.date, '%Y-%m-%d')
    start_date = end_date - timedelta(days=api.config.days_into_past)

    language_filter = tfp.read_value(api.config.language)
    media_filter = tfp.read_value(api.config.media)

    for index_article, article in enumerate(adict):

        # filter article
        adate = datetime.strptime(article['date'], '%Y-%m-%d')
        if not start_date <= adate <= end_date:
            #logger.debug('Date of article out of range: {} ({} - {})'.format(adate,start_date,end_date))
            continue

        # filter language
        if language_filter and not language_filter == language_dict[
                article['media']]:
            #logger.debug('Language filtered out: {} ({})'.format(language_dict[article['media']], language_filter))
            continue

        # filter media
        if media_filter and not media_filter == article['media']:
            #logger.debug('Media filtered out: {} ({})'.format(article['media'], media_filter))
            continue

        # check if article has been processed
        if article['hash_text'] in hash_list:
            logger.debug(
                'Article has already been processed: {} - {} - {}'.format(
                    article['date'], article['media'], article['hash_text']))
            word_counter.update(hash_list[article['hash_text']])
            continue

        language = language_dict[article['media']]
        text = article['text']

        # Language settings
        if language == 'G':
            doc = nlp_g(text)
        elif language == 'F':
            doc = nlp_fr(text)
        elif language == 'S':
            doc = nlp_es(text)
        else:
            logger.warning('Language not implemented')
            doc = None
            words = []

        # only when doc has been created - language exists
        if doc:
            if api.config.mode == 'NOUN':
                words = [
                    token.lemma_[:api.config.max_word_len] for token in doc
                    if token.pos_ in ['PROPN', 'NOUN']
                ]
            elif api.config.mode == 'PROPN':
                words = [
                    token.lemma_[:api.config.max_word_len] for token in doc
                    if token.pos_ == 'PROPN'
                ]
            else:
                words = [
                    token.text[:api.config.max_word_len] for token in doc
                    if not token.is_stop
                ]

            word_counter.update(words)

        hash_list[article['hash_text']] = words

    if api.config.limit > 0:
        common_words = word_counter.most_common(api.config.limit)
    else:
        common_words = word_counter.most_common()

    result, progress_str = test_last_batch(attributes=att_dict,
                                           collect=api.config.collect)
    if result:
        word_freq = [ {'date':api.config.date,'days_into_past':api.config.days_into_past, 'language':api.config.language, \
                       'media':api.config.media,'mode':api.config.mode, 'word':w, 'frequency': f} for w,f in common_words ]
        msg = api.Message(attributes=att_dict, body=word_freq)
        api.send(outports[2]['name'], msg)

        if api.config.json_string_output:
            json_msg = api.Message(attributes=att_dict, body=json.dumps(word_freq))
            api.send(outports[1]['name'], json_msg)

    logger.debug('Process ended,  {}  - {}  '.format(
        progress_str, time_monitor.elapsed_time()))
    api.send(outports[0]['name'], log_stream.getvalue())
def process(db_msg):

    logger, log_stream = slog.set_logging('topic_identification',
                                          loglevel=api.config.debug_mode)
    logger.info("Process started")
    time_monitor = tp.progress()

    columns = [c['name'] for c in db_msg.attributes['table']['columns']]
    df = pd.DataFrame(db_msg.body, columns=columns)

    # Language filter
    language_filter = tfp.read_list(api.config.language_filter)
    if language_filter:
        df = df.loc[df["LANGUAGE"].isin(language_filter)]
    else:
        language_filter = list(df['LANGUAGE'].unique())
    logger.info('Languages : {}'.format(language_filter))

    # Word type filter
    word_type_filter = tfp.read_value(api.config.word_type_filter)
    if word_type_filter:
        types = [c for c in word_type_filter]
        df = df.loc[df["TYPE"].isin(types)]
    logger.info('Word restricted to types : {}'.format(word_type_filter))

    # groupby and concatenate words
    gdf = df.groupby(by=['HASH_TEXT', 'LANGUAGE'])['WORD'].apply(
        lambda x: ' '.join(x)).reset_index()

    logger.info('Topic identification: ')
    for lang in language_filter:
        logger.info('Language: {}  #Documents: {}  #Words: {}'.format(lang,gdf.loc[gdf['LANGUAGE']==lang].shape[0],\
                                                                      df.loc[df['LANGUAGE'] == lang].shape[0]))

    api.send(outports[0]['name'], log_stream.getvalue())
    log_stream.seek(0)

    # create document-term matrix - no tokenization or text prep are needed
    tf_vectorizer = CountVectorizer(analyzer='word',
                                    min_df=1,
                                    lowercase=False,
                                    tokenizer=str.split)

    # tf means term-frequency in a document for each language
    date_today = str(date.today())

    # 2-array with TOPIC, LANGUAGE, TYPE, DATE, EXPIRY_DATE, ATTRIBUTE, KEYWORD_i (num of topics)
    topic_list = list()
    for lang in language_filter:
        logger.info('Process all texts for language: {}'.format(lang))
        lang_gdf = gdf.loc[gdf['LANGUAGE'] == lang]
        dtm_tf = tf_vectorizer.fit_transform(lang_gdf['WORD'])
        # for tf dtm
        lda_tf = LatentDirichletAllocation(n_components=api.config.num_topics,
                                           learning_method='online',
                                           evaluate_every=-1,
                                           n_jobs=-1)
        lda_tf.fit(dtm_tf)
        feature_names = tf_vectorizer.get_feature_names()

        for i, topic in enumerate(lda_tf.components_):
            topic_words = [
                feature_names[f]
                for f in topic.argsort()[:-api.config.topic_num_words - 1:-1]
            ]
            logger.debug('Len: {}  topic_words:{}'.format(
                len(topic_words), topic_words))
            row = [
                date_today + "-" + str(i), lang, 'ALGO', date_today, None, None
            ] + topic_words
            topic_list.append(row)

    attributes = {
        "table": {
            "columns": [{
                "class": "string",
                "name": "TOPIC",
                "nullable": False,
                "size": 80,
                "type": {
                    "hana": "NVARCHAR"
                }
            }, {
                "class": "string",
                "name": "LANGUAGE",
                "nullable": False,
                "size": 2,
                "type": {
                    "hana": "NVARCHAR"
                }
            }, {
                "class": "string",
                "name": "TYPE",
                "nullable": False,
                "size": 10,
                "type": {
                    "hana": "NVARCHAR"
                }
            }, {
                "class": "string",
                "name": "DATE",
                "nullable": True,
                "type": {
                    "hana": "DATE"
                }
            }, {
                "class": "string",
                "name": "EXPIRY_DATE",
                "nullable": True,
                "type": {
                    "hana": "DATE"
                }
            }, {
                "class": "string",
                "name": "ATTRIBUTE",
                "nullable": True,
                "size": 25,
                "type": {
                    "hana": "NVACHAR"
                }
            }],
            "name":
            "DIPROJECTS.WORD_INDEX",
            "version":
            1
        }
    }
    for i in range(1, api.config.topic_num_words + 1):
        attributes['table']['columns'].append({
            "class": "string",
            "name": "KEYWORD_" + str(i),
            "nullable": True,
            "size": 80,
            "type": {
                "hana": "NVARCHAR"
            }
        })

    msg = api.Message(attributes=attributes, body=topic_list)
    logger.debug('Process ended, topics processed {}'.format(
        time_monitor.elapsed_time()))
    api.send(outports[0]['name'], log_stream.getvalue())
    api.send(outports[1]['name'], msg)
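
# A minimal sketch (toy corpus assumed) of the topic step above: CountVectorizer on
# pre-tokenized text, LDA on the document-term matrix, then the top words per topic
# via argsort over the topic components.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

docs = ['haus stadt haus', 'stadt land fluss', 'fluss land land']
vec = CountVectorizer(analyzer='word', lowercase=False, tokenizer=str.split)
dtm = vec.fit_transform(docs)
lda = LatentDirichletAllocation(n_components=2, learning_method='online')
lda.fit(dtm)
features = vec.get_feature_names_out()   # get_feature_names() in older scikit-learn
for i, topic in enumerate(lda.components_):
    top_words = [features[f] for f in topic.argsort()[:-3:-1]]   # top 2 words
    print(i, top_words)
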
Exemple #24
0
def process(test_msg, base_msg):

    logger, log_stream = slog.set_logging('DEBUG')

    # start custom process definition
    test_att = test_msg.attributes
    base_att = base_msg.attributes

    att_dict = dict()

    if test_att['name'] == base_att['name']:
        att_dict['name'] = test_att['name']
    else:
        att_dict['name'] = test_att['name'] + '-' + base_att['name']
    att_dict['config'] = dict()

    att_dict['config']['test_index'] = api.config.test_index
    testdf_index = tfp.read_value(api.config.test_index)
    if not testdf_index:
        logger.error('Index of test data is mandatory')
        raise ValueError('Index of test data is mandatory')

    att_dict['number_rows'] = str(base_msg.body.shape[0])

    # get the columns to check

    mapping = tfp.read_dict(api.config.check_columns)
    df = pd.DataFrame()

    if mapping:

        att_dict['config']['check_columns'] = str(mapping)
        att_dict['config']['limit'] = api.config.limit

        # read stream from memory
        test_df = test_msg.body

        # test if all mapping cols in testdf
        checkcols = [
            elem in list(test_df.columns) for elem in list(mapping.keys())
        ]
        if not all(checkcols):
            error_txt = 'Elements in mapping are not contained in columns of test df : ' + \
                        str(list(mapping.keys())) + '-' + str(list(test_df.columns)) + ' - ' + str(checkcols)
            logger.error(error_txt)
            raise ValueError(error_txt)

        if not testdf_index in test_df.columns:
            logger.error('Test index needs to be column')
            raise ValueError('Test index needs to be column')

        tcols = ['t_' + c for c in list(mapping.keys())]
        tdf = pd.DataFrame(columns=tcols)

        df = base_msg.body
        df = pd.concat([df, tdf], axis=1)

        num_cols = len(mapping)
        # run over all left df rows to test in right_df
        for index, test_row in test_df.iterrows():
            # apply function
            def get_ratio(row):
                sc = 0
                for tcol, bcol in mapping.items():
                    sc = sc + fuzz.token_sort_ratio(test_row[tcol], row[bcol])
                return sc / num_cols

            df['tscore'] = df.apply(get_ratio, axis=1)
            # get best matching and store index in v_dict
            max_score = df['tscore'].max()
            if max_score >= api.config.limit:
                mask = (df['tscore'] == max_score)
                df.loc[mask, 'score'] = max_score
                df.loc[mask, 'external_id'] = test_row[testdf_index]
                for coli in mapping:
                    df.loc[mask, 't_' + coli] = test_row[coli]

            df.drop(columns=['tscore'], inplace=True)

        # remove external_id when test column value has none

        t_cols = ['t_' + t for t in mapping.keys()] + ['external_id', 'score']
        for bcol in mapping.values():
            mask = df[bcol].isna()
            df.loc[mask, t_cols] = np.nan

        if api.config.only_index:
            df = df[list(base_msg.body.columns) + ['external_id']]
        att_dict['config']['only_index'] = api.config.only_index

        if api.config.only_matching_rows:
            df = df.loc[~df['score'].isna()]
        att_dict['config'][
            'only_matching_rows'] = api.config.only_matching_rows

        basedf_index = tfp.read_value(api.config.base_index)
        att_dict['config']['base_index'] = basedf_index

        if api.config.joint_id:
            if not basedf_index:
                raise ValueError(
                    "For <joint_id> a value for <base_index> is necessary ")
            df.loc[~df['external_id'].isna(),
                   'joint_id'] = df.loc[~df['external_id'].isna(),
                                        'external_id']
            df.loc[df['external_id'].isna(),
                   'joint_id'] = df.loc[df['external_id'].isna(), basedf_index]
        att_dict['config']['joint_id'] = api.config.joint_id

        if api.config.add_non_matching:
            # test if same columns
            if not all(
                [elem in test_df.columns for elem in base_msg.body.columns]):
                raise ValueError("Adding test dataframe only possible when having same columns " + str(test_df.columns) \
                                 + ' vs. ' + str(base_msg.body.columns))
            matched_ids = df['external_id'].unique()
            addto_df = test_df.loc[~test_df[testdf_index].isin(matched_ids
                                                               )].copy()
            addto_df['joint_id'] = addto_df[testdf_index]
            df = pd.concat([df, addto_df], axis=0, sort=False)
        att_dict['config']['add_non_matching'] = api.config.add_non_matching

    else:
        logger.warning('No columns to check')

    ##############################################
    #  final infos to attributes and info message
    ##############################################
    if df.empty:
        logger.warning('DataFrame is empty')
    else:
        att_dict['operator'] = 'fuzzyjoinDataFrames'
        att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
        att_dict['columns'] = str(list(df.columns))
        att_dict['number_columns'] = df.shape[1]
        att_dict['number_rows'] = df.shape[0]
        if 'id' in base_att.keys():
            att_dict['id'] = base_att['id'] + '; ' + att_dict[
                'operator'] + ': ' + str(id(df))
        else:
            att_dict['id'] = att_dict['operator'] + ': ' + str(id(df))

        example_rows = EXAMPLE_ROWS if att_dict[
            'number_rows'] > EXAMPLE_ROWS else att_dict['number_rows']
        for i in range(0, example_rows):
            att_dict['row_' + str(i)] = str(
                [str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])
    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
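
# A minimal sketch of the row-scoring idea above, assuming fuzzywuzzy's
# token_sort_ratio and a single mapped column pair; the operator averages the
# score over all mapped columns and keeps the best match above the limit.
import pandas as pd
from fuzzywuzzy import fuzz

base = pd.DataFrame({'company': ['ACME Corp.', 'Foo Bar GmbH']})
test_name = 'Acme Corporation'
base['tscore'] = base['company'].apply(lambda v: fuzz.token_sort_ratio(test_name, v))
print(base.loc[base['tscore'].idxmax()])   # best matching base row
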
Exemple #25
0
def process(msg):
    global blacklist
    global last_msg
    global hash_list

    logger, log_stream = slog.set_logging('word_extraction',
                                          loglevel=api.config.debug_mode)
    logger.info("Process started. Logging level: {}".format(logger.level))
    time_monitor = tp.progress()

    msg = check_for_setup(logger, msg, mode=api.config.use_blacklist)
    if not msg:
        api.send(outports[0]['name'], log_stream.getvalue())
        return 0

    adict = msg.body

    language_filter = tfp.read_value(api.config.language)
    mode = tfp.read_value(api.config.mode)

    if not mode or not any(m in mode for m in supported_modes):
        raise Exception(
            'Mode is mandatory parameter and valid values are: {}'.format(
                supported_modes))

    article_words = list()
    article_count = 0
    for index_article, article in enumerate(adict):

        language = language_dict[article['media']]

        # filter language
        if language_filter and not language_filter == language:
            #logger.debug('Language filtered out: {} ({})'.format(language, language_filter))
            continue

        article_count += 1
        # check if article has been processed
        if article['hash_text'] in hash_list:
            logger.debug(
                'Article has already been processed: {} - {} - {}'.format(
                    article['date'], article['media'], article['hash_text']))
            continue
        hash_list.append(article['hash_text'])

        text = article['text']
        # disabled: lower-casing and digit removal might interfere with the NLP models
        #text = re.sub(r'\d+', '', text.lower())
        #text = re.sub(r'\b[a-z]\b', '', text)

        # Language settings
        if language == 'DE':
            doc = nlp_g(text)
        elif language == 'FR':
            doc = nlp_fr(text)
        elif language == 'ES':
            doc = nlp_es(text)
        else:
            logger.warning('Language not implemented: {}'.format(language))
            doc = None

        words = dict()
        # only when doc has been created - language exists
        if doc:
            if 'P' in api.config.mode:
                words['P'] = [
                    token.lemma_[:api.config.max_word_len] for token in doc
                    if token.pos_ == 'PROPN'
                ]
            if 'N' in api.config.mode:
                words['N'] = [
                    token.lemma_[:api.config.max_word_len] for token in doc
                    if token.pos_ == 'NOUN'
                ]
            if 'X' in api.config.mode:
                words['X'] = [
                    token.text[:api.config.max_word_len] for token in doc
                    if not token.is_stop
                ]

        for m in words:
            # heuristics

            # remove preceding non-alpha characters
            words[m] = [re.sub(r"^[-'\./]", '', w) for w in words[m]]
            # remove trailing non-alpha characters
            words[m] = [re.sub(r'[-\./]$', '', w) for w in words[m]]
            # minimum word length
            words[m] = [
                w for w in words[m] if len(w) >= api.config.min_word_len
            ]
            # remove date like words
            words[m] = [w for w in words[m] if not re.findall(r'\d+\.\d+', w)]
            # Remove blacklist words
            if api.config.use_blacklist:
                words[m] = [w for w in words[m] if w not in blacklist]
            if api.config.counter:
                article_words.append([
                    article['hash_text'], language, m,
                    collections.Counter(words[m]).most_common()
                ])
            else:
                article_words.append(
                    [article['hash_text'], language, m, words[m]])

    attributes = {
        "table": {
            "columns": [{
                "class": "string",
                "name": "HASH_TEXT",
                "nullable": True,
                "type": {
                    "hana": "INTEGER"
                }
            }, {
                "class": "string",
                "name": "LANGUAGE",
                "nullable": True,
                "size": 2,
                "type": {
                    "hana": "NVARCHAR"
                }
            }, {
                "class": "string",
                "name": "TYPE",
                "nullable": True,
                "size": 1,
                "type": {
                    "hana": "NVARCHAR"
                }
            }, {
                "class": "string",
                "name": "WORDS",
                "nullable": True,
                "type": {
                    "hana": "ARRAY"
                }
            }],
            "name":
            "DIPROJECTS.WORD_INDEX3",
            "version":
            1
        },
        "storage.filename": msg.attributes["storage.filename"]
    }

    attributes['counter'] = 'Y' if api.config.counter else 'N'

    table_msg = api.Message(attributes=attributes, body=article_words)
    logger.info('File processed: {} #Articles: {} '.format(
        msg.attributes["storage.filename"], len(adict)))
    api.send(outports[0]['name'], log_stream.getvalue())
    api.send(outports[1]['name'], table_msg)
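
# A minimal sketch of the word clean-up heuristics above on a standalone list
# (assumed data and an assumed minimum word length of 3).
import re

words = ['-Berlin', 'Haus.', 'ab', '12.05', 'Bundesregierung']
words = [re.sub(r"^[-'\./]", '', w) for w in words]            # strip leading punctuation
words = [re.sub(r'[-\./]$', '', w) for w in words]             # strip trailing punctuation
words = [w for w in words if len(w) >= 3]                      # minimum word length
words = [w for w in words if not re.findall(r'\d+\.\d+', w)]   # drop date-like tokens
print(words)   # ['Berlin', 'Haus', 'Bundesregierung']
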
Exemple #26
0
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'categorical2exist'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    logger.info('Start Process')
    time_monitor.get_start_time()

    df = msg.body

    prev_cat_col = len(df.select_dtypes(np.object).columns)

    columns = tfp.read_list(api.config.columns, df.columns, test_number=False)
    info_only = api.config.info_only
    equal_only = api.config.equal_only
    threshold = api.config.threshold
    upper_threshold = api.config.upper_threshold
    num_values = api.config.num_values

    logger.debug('PARAMETER  threshold: {}   upper_threshold: {}  num_values: {}  Modification: {}'\
                 .format(threshold,upper_threshold,num_values,info_only))

    transform_data = {
        'column': [],
        'dtype': [],
        'unique_values': [],
        'action': []
    }
    for col in df[columns].select_dtypes(np.object).columns:
        unique_vals = df.loc[df[col].notnull(), col].unique()
        if (len(unique_vals) == num_values) or (len(unique_vals) <= num_values
                                                and not equal_only):
            population = df[col].count() / df.shape[0]
            if population > upper_threshold and len(unique_vals) == 2:
                transform_data['column'].append(col)
                transform_data['dtype'].append(df[col].dtype)
                v0 = 0
                v1 = 1
                if df.loc[df[col] == unique_vals[0],
                          col].count() > df.shape[0] * 0.5:
                    v0 = 1
                    v1 = 0
                # per definition first unique value 0, second unique value 1
                if v0 == 0:
                    transform_data['unique_values'].append(unique_vals)
                else:
                    transform_data['unique_values'].append(
                        [unique_vals[1], unique_vals[0]])
                transform_data['action'].append('map2')
                # print('{}: {} -> {}'.format(vals[0],df.loc[df[col]==vals[0],col].count(),v0))
                # print('{}: {} -> {}'.format(vals[1],df.loc[df[col]==vals[1],col].count(),v1))
                if not info_only:
                    df.loc[df[col] == unique_vals[0], col] = v0
                    df.loc[df[col] == unique_vals[1], col] = v1
                    df.loc[df[col].isnull(), col] = 0
                    df[col] = df[col].astype('int8')
            elif population < threshold or len(unique_vals) == 1:
                transform_data['column'].append(col)
                transform_data['dtype'].append(df[col].dtype)
                transform_data['unique_values'].append(unique_vals)
                transform_data['action'].append('map1')
                if not info_only:
                    df.loc[df[col].isin(unique_vals), col] = 1
                    df.loc[df[col].isnull(), col] = 0
                    df[col] = df[col].astype('int8')

    logger.info('End of Process: {}'.format(time_monitor.elapsed_time()))

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))

    cat_cols = len(df.select_dtypes(np.object).columns)
    logger.info('Categoricals to Numeric: {} - {} = {}'.format(
        prev_cat_col, cat_cols, prev_cat_col - cat_cols))

    return log_stream.getvalue(), api.Message(attributes={'name':'filter_by_population','type':'DataFrame'},body=df),\
            api.Message(attributes={'name':'transformation','type':'DataFrame'},body=pd.DataFrame(transform_data))
Exemple #27
0
def process(msg):
    att_dict = msg.attributes
    att_dict['operator'] = 'filter_by_population'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    prev_cols = df.shape[1]

    columns = tfp.read_list(api.config.columns, df.columns, test_number=False)
    info_only = api.config.info_only
    threshold = api.config.threshold
    logger.debug('Parameter  Threshold: {}   Data Modification:{} '.format(
        threshold, info_only))

    transform_data = {
        'column': [],
        'dtype': [],
        'unique_vals': [],
        'action': []
    }
    for col in columns:
        population = df[col].count() / df.shape[0]
        unique_vals = df[col].unique()
        if population < threshold and not (len(unique_vals) == 1
                                           and np.isnan(unique_vals[0])):
            unique_vals = df[col].unique()
            transform_data['column'].append(col)
            transform_data['dtype'].append(df[col].dtype)
            transform_data['unique_vals'].append(unique_vals)
            transform_data['action'].append('drop')
            if not info_only:
                df.drop(columns=[col], inplace=True)

    # end custom process definition
    if df.empty:
        raise ValueError('DataFrame is empty')
    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(
        df.memory_usage(deep=True).sum() / 1024**2))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        logger.debug('Row {}: {}'.format(
            i, str([str(i)[:10].ljust(10) for i in df.iloc[i, :].tolist()])))

    progress_str = '<BATCH ENDED><1>'
    if 'storage.fileIndex' in att_dict and 'storage.fileCount' in att_dict and 'storage.endOfSequence' in att_dict:
        if att_dict['storage.fileIndex'] + 1 == att_dict['storage.fileCount']:
            progress_str = '<BATCH ENDED><{}>'.format(
                att_dict['storage.fileCount'])
        else:
            progress_str = '<BATCH IN-PROCESS><{}/{}>'.format(
                att_dict['storage.fileIndex'] + 1,
                att_dict['storage.fileCount'])
    att_dict['process_list'].append(att_dict['operator'])
    logger.debug('Process ended: {}  - {}  '.format(
        progress_str, time_monitor.elapsed_time()))
    logger.debug('Past process steps: {}'.format(att_dict['process_list']))

    return log_stream.getvalue(), api.Message(attributes=att_dict,body=df),\
            api.Message(attributes={'name':'transformation','type':'DataFrame'},body=pd.DataFrame(transform_data))
Exemple #28
0
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'sample'
    logger, log_stream = slog.set_logging(att_dict['operator'])
    if api.config.debug_mode == True:
        logger.setLevel('DEBUG')

    # start custom process definition
    # test if body refers to a DataFrame type
    prev_att = msg.attributes
    df = msg.body
    if not isinstance(df, pd.DataFrame):
        logger.error('Message body does not contain a pandas DataFrame')
        raise TypeError('Message body does not contain a pandas DataFrame')

    att_dict = dict()
    att_dict['config'] = dict()

    ###### start  calculation

    sample_size = api.config.sample_size
    if sample_size < 1:
        sample_size = int(sample_size * df.shape[0])
        if sample_size < 1:
            sample_size = 1
            logger.warning(
                "Fraction of sample size too small. Set sample size to 1.")
    elif sample_size > df.shape[0]:
        logger.warning("Sample size larger than number of rows")

    logger.debug("Samples_size: {}/() ({})".format(sample_size, df.shape[0],
                                                   sample_size / df.shape[0]))
    random_state = api.config.random_state

    invariant_column = tfp.read_value(api.config.invariant_column)
    if invariant_column and sample_size < df.shape[0]:
        # get the average number of records for each value of invariant
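        # estimate how many invariant values are needed to reach the requested sample
        # size, draw that many values, and keep only rows carrying a drawn value, so
        # all records sharing the same invariant value stay together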
        sc_df = df.groupby(invariant_column)[invariant_column].count()
        sample_size_invariant = int(sample_size / sc_df.mean())
        sample_size_invariant = 1 if sample_size_invariant == 0 else sample_size_invariant  # ensure minimum
        sc_df = sc_df.sample(n=sample_size_invariant,
                             random_state=random_state).to_frame()
        sc_df.rename(columns={invariant_column: 'sum'}, inplace=True)
        # sample the df by merge 2 df
        df = pd.merge(df,
                      sc_df,
                      how='inner',
                      right_index=True,
                      left_on=invariant_column)
        df.drop(columns=['sum'], inplace=True)
    else:
        df = df.sample(n=sample_size, random_state=random_state)

    ###### end  calculation

    ##############################################
    #  final infos to attributes and info message
    ##############################################

    if df.empty:
        raise ValueError('DataFrame is empty')

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} MB'.format(att_dict['memory']))
    example_rows = EXAMPLE_ROWS if df.shape[0] > EXAMPLE_ROWS else df.shape[0]
    for i in range(0, example_rows):
        att_dict['row_' + str(i)] = str(
            [str(v)[:10].ljust(10) for v in df.iloc[i, :].tolist()])
        logger.debug('Head data: {}'.format(att_dict['row_' + str(i)]))

    # end custom process definition

    log = log_stream.getvalue()
    msg = api.Message(attributes=att_dict, body=df)
    return log, msg
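
# A small, self-contained sketch (not part of the operator above) of the sample_size
# rule: a value below 1 is a fraction of the rows, an absolute value is capped at the
# row count. Function name, data and numbers are illustrative only.
import pandas as pd

def resolve_sample_size(sample_size, n_rows):
    if sample_size < 1:
        return max(1, int(sample_size * n_rows))   # fraction of the rows, at least 1
    return min(sample_size, n_rows)                # absolute count, capped at n_rows

demo = pd.DataFrame({'x': range(100)})
print(demo.sample(n=resolve_sample_size(0.1, len(demo)), random_state=1).shape)  # (10, 1)
print(demo.sample(n=resolve_sample_size(250, len(demo)), random_state=1).shape)  # (100, 1)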
Exemple #29
0
def process(msg1, msg2, msg3, msg4, msg5):
    adict = msg1.attributes
    msg_list = [msg1, msg2, msg3, msg4, msg5]

    logger, log_stream = slog.set_logging('scrapy', loglevel='DEBUG')

    logger.info("Process started")
    time_monitor = tp.progress()
    # logger, log_stream = slog.set_logging('scrapy',loglevel=api.config.debug_mode)

    scrapy_dir = tfp.read_value(api.config.scrapy_dir)
    if not scrapy_dir:
        logger.error('Scrapy directory is a mandatory entry field')
        raise ValueError('Missing Scrapy Directory')
    logger.info('Change directory to: {}'.format(scrapy_dir))
    os.chdir(scrapy_dir)

    project_dir = tfp.read_value(api.config.project_dir)
    if not project_dir:
        logger.error('Scrapy project directory is a mandatory entry field')
        raise ValueError('Missing Scrapy Project Directory')

    project_dir = os.path.join(scrapy_dir,project_dir)

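    # each incoming message carries one scrapy project file: spider.py is written to
    # the project's spiders/ subfolder, every other file to the project root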
    new_file_list = []
    for msg in msg_list:
        filename = os.path.basename(msg.attributes["file"]["path"])

        if filename == 'spider.py':
            filename = os.path.join(project_dir, 'spiders', filename)
        else:
            filename = os.path.join(project_dir, filename)
        # copy files to directories
        try:
            with open(filename, 'wb') as fout:
                logger.info('Write to filename (binary): {}'.format(filename))
                fout.write(msg.body)
        except IOError:
            logger.warning('Could not write file: {}'.format(filename))
            logger.debug('Current directory: {}'.format(os.getcwd()))
            f = []
            for (dirpath, dirnames, filenames) in os.walk('/home/onlinemedia/'):
                f.extend(filenames)
                break
            logger.debug('Files under directory onlinemedia: {}'.format(f))
            api.send(outports[0]['name'], log_stream.getvalue())
            time.sleep(5)
            exit(-1)
        new_file_list.append(filename)

    for f in new_file_list:
        if os.path.isfile(f):
            logger.info('File successfully written: {} ({})'.format(f, time.ctime(os.path.getmtime(f))))
        else:
            logger.error('File does not exist: {}'.format(f))

    api.send(outports[0]['name'], log_stream.getvalue())
    log_stream.seek(0)
    log_stream.truncate(0)

    spiderlist = tfp.read_list(api.config.spider)
    num_spiders = len(spiderlist)
    num_batches = 0
    num_all_articles = 0

    for i, spider in enumerate(spiderlist) :
        media = spider.split('_')[0]
        today_date = datetime.today().strftime('%Y-%m-%d')
        cmd = ['scrapy', 'crawl', spider]
        logger.info('Start scrapy: {} ({}/{})'.format(cmd,i,num_spiders))

        #proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd = scrapy_dir,universal_newlines=True)
        proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=scrapy_dir,universal_newlines=True)

        logger.info(proc.stderr)
        api.send(outports[0]['name'], log_stream.getvalue())
        log_stream.seek(0)
        log_stream.truncate(0)

        count_articles = 0
        articles_list = list()
        # run through stdout after scrape has ended and add to batch_output
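        # format_check_output (defined elsewhere) is expected to return an article
        # dict for matching stdout lines and a falsy value for everything else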
        last_article = dict()
        for line in  proc.stdout.splitlines():
            adict = format_check_output(line, logger)
            if adict:
                adict['media'] = media
                adict['date'] = today_date
                articles_list.append(adict)
                last_article = adict
                count_articles += 1

        # send result to outport
        if len(articles_list) == 0 :
            logger.warning('No articles found: {}'.format(media))
            continue

        num_batches += 1
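        # per-batch attributes: media metadata plus batch index/count markers so that
        # a downstream operator can detect when the last spider's batch has arrived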
        attributes = { k:v for k,v in last_article.items() if k in ['website','date','columns']}
        attributes['media'] = media
        if media in media_languages :
            attributes['language'] = media_languages[media]
        else :
            attributes['language'] = 'unknown'
        attributes['today_str'] = today_date
        attributes['month'] = datetime.today().strftime("%B")
        attributes['message.indexBatch'] = i
        attributes['message.countBatch'] = num_spiders
        attributes['message.lastBatch'] = (i + 1 == num_spiders)


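        # collect this spider's articles into a DataFrame, drop duplicates by text_id
        # and enforce the column order expected downstream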
        df = pd.DataFrame(articles_list)
        df = df.drop_duplicates(subset=['text_id'])

        df = df[['media','date','text_id','title','rubric','url','paywall','num_characters','text']]
        msg = api.Message(attributes=attributes, body=df)
        api.send(outports[1]['name'], msg)

        logger.info('Spider completed: {} - #articles: {}'.format(spider,count_articles))
        num_all_articles += count_articles

    logger.info('Process ended: {} Articles processed: {}  '.format(time_monitor.elapsed_time(),num_all_articles ))

    api.send(outports[0]['name'], log_stream.getvalue())
    return 0
Exemple #30
0
            operator_description = "Text Sentiment Analysis"
            operator_description_long = "Text Sentiment Analysis using Textblob. "
            add_readme = dict()
            debug_mode = True
            config_params['debug_mode'] = {
                'title': 'Debug mode',
                'description': 'Sending debug level information to log port',
                'type': 'boolean'
            }


last_msg = None
id_set = set()

operator_name = 'sentiment analysis'
logger, log_stream = slog.set_logging(operator_name,
                                      loglevel=api.config.debug_mode)
logger.info("Process started")
time_monitor = tp.progress()


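# compute sentiment (polarity, subjectivity) for an article text with the TextBlob
# flavour matching the article language (TextBlobDE for German, Pattern tagger and
# analyzer for French, ...)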
def get_sentiment(text, language):
    if isinstance(text, str):
        if language == 'DE':
            blob = TextBlobDE(text)
            return [blob.sentiment.polarity, blob.sentiment.subjectivity]
        elif language == 'FR':
            tb = Blobber(pos_tagger=PatternTaggerFR(),
                         analyzer=PatternAnalyzerFR())
            blob = tb(text)
            return blob.sentiment
        else: