Code Example #1
def fix_columns_literal_numbers(data,
                                columns,
                                thousand_separators=None,
                                decimal_separators='.',
                                silent_mode=True,
                                inplace=False,
                                units=None):
    if thousand_separators is None:
        thousand_separators = [' ', ',']
    df = data if inplace else data.copy()
    if not silent_mode:
        kut.display_message('fixing literal numbers')
    for column in columns:
        this_unit = units.get(column, '') if units is not None else ''
        new_column = '_'.join([column, this_unit]) if this_unit else column
        if not silent_mode:
            kut.display_message(column, secondary=True)
            print(df[column].value_counts(dropna=False).head())
            print('type', df[column].dtype)
        # per-column overrides may be passed as dicts; fall back to the defaults
        this_thousand_separators = thousand_separators.get(
            column, [' ', ',']) if isinstance(
                thousand_separators, dict) else thousand_separators
        this_decimal_separator = decimal_separators.get(
            column,
            '.') if isinstance(decimal_separators, dict) else decimal_separators
        df[column] = df[column].apply(lambda x: fix_literal_numbers(
            x,
            thousand_separators=this_thousand_separators,
            decimal_separator=this_decimal_separator,
            unit=this_unit))
        df.rename(columns={column: new_column}, inplace=True)
        if not silent_mode:
            print(df[new_column].value_counts(dropna=False).head())
            print('type', df[new_column].dtype)
    return None if inplace else df
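
Usage sketch (hypothetical data; the column names, separators, and units are
illustrative, and kut plus fix_literal_numbers are assumed importable from the
surrounding module):

import pandas as pd

raw = pd.DataFrame({'price': ['1 234,50', '2 000,00'],
                    'qty': ['1,000', '2,500']})
clean = fix_columns_literal_numbers(
    raw,
    columns=['price', 'qty'],
    thousand_separators={'price': [' '], 'qty': [',']},
    decimal_separators={'price': ',', 'qty': '.'},
    units={'price': 'eur'})
# 'price' is renamed to 'price_eur'; both columns become numeric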
Code Example #2
def add_sentiment_information(data, lemme_column='lemmatizedTweet'):
    kut.display_message('adding sentiment information based on ' +
                        lemme_column)
    print('expected time: roughly 300 s per 1,000 rows')
    start = kut.yet()
    lemme_array = data[lemme_column].tolist()
    # run the pipeline once and join the renamed result, instead of calling
    # nlp() a second time and discarding the renamed frame
    nlp_df = pd.DataFrame(nlp(lemme_array)).rename(columns={
        'label': 'sentiment_label',
        'score': 'sentiment_score'
    })
    out = data.join(nlp_df)
    kut.job_done(start=start)
    return out
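
Usage sketch (assumes `nlp` is a Hugging Face sentiment-analysis pipeline
returning dicts with 'label' and 'score' keys; the model choice and sample
data are illustrative):

from transformers import pipeline

nlp = pipeline('sentiment-analysis')
tweets = pd.DataFrame({'lemmatizedTweet': ['great product', 'terrible service']})
scored = add_sentiment_information(tweets)
print(scored[['sentiment_label', 'sentiment_score']])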
Code Example #3
def clean_tweets(data,
                 tweet_column='tweet',
                 method=lemmatized_cleaning,
                 clean_column=None):
    kut.display_message('cleaning tweet column', 'name: ' + tweet_column)
    if clean_column is None:
        clean_column = tweet_column
    print('output', clean_column)
    start = kut.yet()
    # apply the cleaning method to every tweet
    data[clean_column] = data[tweet_column].apply(method)
    kut.job_done(start=start)
    return data
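
Usage sketch (hypothetical data; lemmatized_cleaning is assumed to be the
text-normalization function defined alongside these helpers):

tweets = pd.DataFrame({'tweet': ['RT @user: loving this!!', 'check this out']})
tweets = clean_tweets(tweets, clean_column='lemmatizedTweet')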
Code Example #4
def parse_columns_date(data,
                       columns,
                       errors='raise',
                       first_day=False,
                       first_year=False,
                       utc=None,
                       format=None,
                       exact=True,
                       unit=None,
                       infer_datetime_format=False,
                       origin='unix',
                       cache=True,
                       inplace=False,
                       silent_mode=True):
    df = data if inplace else data.copy()
    if not silent_mode:
        kut.display_message('parsing dates')
    for column in columns:
        if not silent_mode:
            kut.display_message(column, secondary=True)
            print(df[column].value_counts(dropna=False).head())
            print('type', df[column].dtype)
        if 'datetime' in str(df[column].dtype):
            continue
        df[column] = pd.to_datetime(
            df[column],
            errors=errors,
            dayfirst=first_day,
            yearfirst=first_year,
            utc=utc,
            format=format,
            exact=exact,
            unit=unit,
            infer_datetime_format=infer_datetime_format,
            origin=origin,
            cache=cache,
        )
        if not silent_mode:
            print(df[column].value_counts(dropna=False).head())
            print('type', df[column].dtype)

    return None if inplace else df
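
Usage sketch (hypothetical data; note that infer_datetime_format is deprecated
in pandas 2.x and can simply be left at its default):

events = pd.DataFrame({'created': ['01/02/2021', '15/03/2021']})
events = parse_columns_date(events, columns=['created'], first_day=True,
                            silent_mode=False)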
Code Example #5
def set_client_context(client_row,
                       project_key=None,
                       add_ecrm_context=True,
                       connection='dataiku_workspace'):
    kut.display_message('setting context', secondary=True)
    if not project_key:
        project_name = dataiku.default_project_key()
        # get_project returns a project handle, used for variable access below
        project_key = dataiku.api_client().get_project(project_name)
        print('inferred project:', project_key)
    new_vars = serialize_variables(new_vars=client_row.to_dict(),
                                   project=project_key,
                                   context='local')
    if add_ecrm_context:
        new_vars = add_ECRM_context(new_vars, connection=connection)
    project_key.set_variables(new_vars)
    variables = project_key.get_variables()
    local_variables = variables['local']
    client_name = local_variables['clientName']
    print('client name:', client_name)
    print(local_variables)
    return variables
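
Usage sketch (hypothetical: client_ref is a client referential DataFrame such
as the one read in the build_scenario example below):

for _, client_row in client_ref.iterrows():
    variables = set_client_context(client_row)
    print(variables['local']['clientName'])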
Code Example #6
def fix_columns_literal_boolean(data,
                                columns,
                                true_value='true',
                                false_value=None,
                                silent_mode=True,
                                inplace=False):
    df = data if inplace else data.copy()
    if not silent_mode:
        kut.display_message('fixing literal booleans')
    for column in columns:
        if not silent_mode:
            kut.display_message(column, secondary=True)
            print(df[column].value_counts(dropna=False))
            print('type', df[column].dtype)
        this_true_value = true_value.get(
            column, 'true') if isinstance(true_value, dict) else true_value
        this_false_value = false_value.get(
            column, None) if isinstance(false_value, dict) else false_value
        df[column] = df[column].apply(lambda x: fix_literal_boolean(
            x, this_true_value, this_false_value))
        if not silent_mode:
            print(df[column].value_counts(dropna=False))
            print('type', df[column].dtype)
    return None if inplace else df
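
Usage sketch (hypothetical data; per-column true markers are passed as a dict):

flags = pd.DataFrame({'active': ['yes', 'no'], 'opt_in': ['true', 'false']})
flags = fix_columns_literal_boolean(flags, columns=['active', 'opt_in'],
                                    true_value={'active': 'yes',
                                                'opt_in': 'true'})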
Code Example #7
def build_scenario(build_plan,
                   filter_on='ready',
                   connection='dataiku_workspace',
                   ref_table='referentialclient',
                   ref_project='DIReferential',
                   add_ecrm_context=True,
                   finish_on_client=None,
                   single_client=None):
    scenario = Scenario()
    if not isinstance(filter_on, list):
        filter_on = [filter_on]
    project_name = dataiku.default_project_key()
    project_key = dataiku.api_client().get_project(project_name)
    local_variables = project_key.get_variables()['local']
    env = local_variables['env']
    kut.display_message('reading client context referential')

    executor = SQLExecutor2(connection=connection)
    sql_query_referential_client = "SELECT * FROM " + '_'.join(
        [env, ref_project, ref_table])
    client_ref = executor.query_to_df(sql_query_referential_client)
    filter_query = ' & '.join(filter_on)
    client_ref = client_ref.query(filter_query) if filter_query else client_ref
    kut.display_message('Clients ready for automation: ' +
                        ', '.join(client_ref.clientName.unique()))

    kut.display_message('run configuration')
    print(build_plan)

    if not pd.isnull(finish_on_client):
        finish_client = client_ref[client_ref.clientName == finish_on_client]
        if len(finish_client) == 0:
            kut.display_message(
                'finish client ' + finish_on_client +
                ' not found in the plan; is the client name valid?')
        other_clients = client_ref[client_ref.clientName != finish_on_client]
        client_ref = pd.concat([other_clients, finish_client],
                               ignore_index=True)
    success = []
    if single_client is not None:
        requested_client = client_ref[client_ref.clientName == single_client]
        if not len(requested_client):
            kut.display_message(
                'requested single client not found, building all allowed clients'
            )
        else:
            client_ref = requested_client
    for index, client_row in client_ref.iterrows():
        variables = set_client_context(client_row=client_row,
                                       add_ecrm_context=add_ecrm_context,
                                       connection=connection)
        client_name = variables['local']['clientName']
        kut.display_message('starting builds on ' + client_name)

        run_scenario(table_plan=build_plan, scenario=scenario)
        success.append(client_name)
        scenario.set_global_variables(successfullRun=success)
        print('done_________________' + client_name)
    return success
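
Usage sketch (hypothetical: the build_plan structure is whatever run_scenario
expects; the dict and client name below are purely illustrative):

build_plan = {'datasets': ['clients_enriched', 'tweets_scored']}
completed = build_scenario(build_plan, single_client='ACME')
print('built for:', completed)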
Code Example #8
def batch_me(
        data,
        instructions,
        batch_size=100,
        track_offset=None,
        with_save_dir=None,
        with_load_dir=None,
        task_name='batchedJob',
        reset_job=False,
        clean_backup=True,
):
    import os
    from math import ceil, floor

    current_batch = 0

    kut.display_message('batching', instructions)

    if with_load_dir is True:
        with_load_dir = with_save_dir
    if with_save_dir:
        save_radical = os.path.join(
            with_save_dir, '_'.join([task_name, kut.file_stamp()]))
        print('batch will be saved in:', save_radical + '_*')

    if not isinstance(instructions, list):
        instructions = [instructions]

    # Batching instructions are passed as strings and run with eval() below,
    # because a plain function can't see the caller's variable names. A class
    # attribute or a wrapper might do this more cleanly. For now:
    # instructions=[X.replace(data,'batch_df') for X in instructions]

    remaining_data = data
    treated_stack = load_backup(backup_dir=with_load_dir, task_name=task_name, reset_job=reset_job)
    if len(treated_stack):
        # keep original indices so the set difference against data.index works
        treated_df = pd.concat(treated_stack)
        print('found', len(treated_df), 'backed_up rows')
        remaining_data = data.loc[data.index.difference(treated_df.index)]
        print('remaining to treat', len(remaining_data), 'rows')

    if len(remaining_data):
        batch_size = len(remaining_data) if batch_size > len(remaining_data) else batch_size
        total_batches = ceil(len(remaining_data) / batch_size)

        print(floor(len(remaining_data) / batch_size), 'batches of', batch_size, 'rows')
        if track_offset is None:
            track_offset = max(round(0.25 * total_batches), 1)
        if len(remaining_data) % batch_size:
            print('one batch of', len(remaining_data) % batch_size, 'rows')
        if track_offset:
            print('track offset every', track_offset, 'batch')

    else:
        print('nothing left to batch')

    for batch_df in kut.split_me(data=remaining_data, batch_size=batch_size):
        current_batch += 1

        for instruction in instructions:
            # each instruction string is expected to mutate `batch_df` in place
            eval(instruction)

        treated_stack.append(batch_df)

        # intermediate progress prints and saves
        if not current_batch % track_offset:
            print('done batch', current_batch)
            if with_save_dir is not None:
                save_name = '_'.join([save_radical, str(current_batch)])
                current_treated_df = pd.concat(treated_stack, ignore_index=True)
                current_treated_df.to_csv(save_name, index=False)
                print('saved', len(current_treated_df), 'rows')
                print(save_name)

    out = pd.concat(treated_stack, ignore_index=True)
    if clean_backup:
        kut.display_message('cleaning backups')
        clean_backups(backup_dir=with_load_dir, task_name=task_name)
    return out
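
Usage sketch (hypothetical: each instruction string must mutate `batch_df` in
place, since eval() discards return values; paths and names are illustrative):

result = batch_me(
    data=tweets,
    instructions=["clean_tweets(batch_df, clean_column='lemmatizedTweet')"],
    batch_size=500,
    with_save_dir='/tmp/backups/',
    task_name='cleaning')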