def fix_columns_literal_numbers(data, columns, thousand_separators=None,
                                decimal_separators='.', silent_mode=True,
                                inplace=False, units=None):
    if thousand_separators is None:
        thousand_separators = [' ', ',']
    df = data if inplace else data.copy()
    if not silent_mode:
        kut.display_message('fixing literal numbers')
    for column in columns:
        this_unit = units.get(column, '') if units is not None else None
        new_column = '_'.join([column, this_unit]) if this_unit else column
        if not silent_mode:
            kut.display_message(column, secondary=True)
            print(df[column].value_counts(dropna=False).head())
            print('type', df[column].dtype)
        # per-column overrides: a dict maps column names to their own separators
        this_thousand_separators = thousand_separators.get(column, [' ', ',']) \
            if isinstance(thousand_separators, dict) else thousand_separators
        this_decimal_separator = decimal_separators.get(column, None) \
            if isinstance(decimal_separators, dict) else decimal_separators
        df[column] = df[column].apply(lambda x: fix_literal_numbers(
            x,
            thousand_separators=this_thousand_separators,
            decimal_separator=this_decimal_separator,
            unit=this_unit))
        df.rename(columns={column: new_column}, inplace=True)
        if not silent_mode:
            print(df[new_column].value_counts(dropna=False).head())
            print('type', df[new_column].dtype)
    return None if inplace else df
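
# Usage sketch for fix_columns_literal_numbers (assumptions: `df` is a pandas
# DataFrame whose 'price' column holds strings such as '1 234,56', and
# fix_literal_numbers / kut are available in this module):
#
#   df = fix_columns_literal_numbers(df, columns=['price'],
#                                    thousand_separators={'price': [' ']},
#                                    decimal_separators={'price': ','},
#                                    units={'price': 'EUR'})
#   # the cleaned column is renamed 'price_EUR' because a unit was supplied
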
def add_sentiment_information(data, lemme_column='lemmatizedTweet'):
    kut.display_message('adding sentiment information based on ' + lemme_column)
    print('expected time is 300s/1000rows')
    start = kut.yet()
    lemme_array = data[lemme_column].tolist()
    # run the nlp pipeline once and reuse the renamed result
    nlp_df = pd.DataFrame(nlp(lemme_array)).rename(columns={
        'label': 'sentiment_label',
        'score': 'sentiment_score'
    })
    out = data.join(nlp_df)
    kut.job_done(start=start)
    return out
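
# Usage sketch for add_sentiment_information (assumptions: `nlp` is a
# transformers-style sentiment pipeline returning dicts with 'label' and
# 'score' keys, and `tweets` has a default RangeIndex so the index join
# lines up):
#
#   tweets = add_sentiment_information(tweets, lemme_column='lemmatizedTweet')
#   print(tweets[['sentiment_label', 'sentiment_score']].head())
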
def clean_tweets(data, tweet_column='tweet', method=lemmatized_cleaning,
                 clean_column=None):
    kut.display_message('cleaning tweet column', 'name: ' + tweet_column)
    if clean_column is None:
        clean_column = tweet_column
    print('output', clean_column)
    start = kut.yet()
    # Apply the cleaning method to all texts
    data[clean_column] = data[tweet_column].apply(method)
    kut.job_done(start=start)
    return data
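
# Usage sketch for clean_tweets (assumption: any callable str -> str can be
# passed as `method`; lemmatized_cleaning is the module default):
#
#   tweets = clean_tweets(tweets, tweet_column='tweet',
#                         clean_column='lemmatizedTweet')
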
def parse_columns_date(data, columns, errors='raise', first_day=False,
                       first_year=False, utc=None, format=None, exact=True,
                       unit=None, infer_datetime_format=False, origin='unix',
                       cache=True, inplace=False, silent_mode=True):
    df = data if inplace else data.copy()
    if not silent_mode:
        kut.display_message('parsing dates')
    for column in columns:
        if not silent_mode:
            kut.display_message(column, secondary=True)
            print(df[column].value_counts(dropna=False).head())
            print('type', df[column].dtype)
        # skip columns that are already datetimes
        if 'datetime' in str(df[column].dtype):
            continue
        df[column] = pd.to_datetime(
            df[column],
            errors=errors,
            dayfirst=first_day,
            yearfirst=first_year,
            utc=utc,
            format=format,
            exact=exact,
            unit=unit,
            infer_datetime_format=infer_datetime_format,
            origin=origin,
            cache=cache,
        )
        if not silent_mode:
            print(df[column].value_counts(dropna=False).head())
            print('type', df[column].dtype)
    return None if inplace else df
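
# Usage sketch for parse_columns_date (assumption: 'created_at' holds strings
# like '2021-03-01 12:00:00'; columns already typed as datetime are skipped):
#
#   df = parse_columns_date(df, columns=['created_at'],
#                           format='%Y-%m-%d %H:%M:%S', errors='coerce')
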
def set_client_context(client_row, project_key=None, add_ecrm_context=True,
                       connection='dataiku_workspace'):
    kut.display_message('setting context', secondary=True)
    if not project_key:
        project_name = dataiku.default_project_key()
        project_key = dataiku.api_client().get_project(project_name)
        print('inferring project key:', project_key)
    new_vars = serialize_variables(new_vars=client_row.to_dict(),
                                   project=project_key,
                                   context='local')
    if add_ecrm_context:
        new_vars = add_ECRM_context(new_vars, connection=connection)
    project_key.set_variables(new_vars)
    variables = project_key.get_variables()
    local_variables = variables['local']
    client_name = local_variables['clientName']
    print('client name:', client_name)
    print(local_variables)
    return variables
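
# Usage sketch for set_client_context (assumption: run inside a Dataiku DSS
# project where `client_row` is one row of the client referential; despite
# its name, `project_key` is a dataikuapi project handle, not a key string):
#
#   variables = set_client_context(client_row=client_ref.iloc[0],
#                                  add_ecrm_context=False)
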
def fix_columns_literal_boolean(data, columns, true_value='true',
                                false_value=None, silent_mode=True,
                                inplace=False):
    df = data if inplace else data.copy()
    if not silent_mode:
        kut.display_message('fixing literal booleans')
    for column in columns:
        if not silent_mode:
            kut.display_message(column, secondary=True)
            print(df[column].value_counts(dropna=False))
            print('type', df[column].dtype)
        # per-column overrides: a dict maps column names to their own literals
        this_true_value = true_value.get(column, 'true') \
            if isinstance(true_value, dict) else true_value
        this_false_value = false_value.get(column, None) \
            if isinstance(false_value, dict) else false_value
        df[column] = df[column].apply(lambda x: fix_literal_boolean(
            x, this_true_value, this_false_value))
        if not silent_mode:
            print(df[column].value_counts(dropna=False))
            print('type', df[column].dtype)
    return None if inplace else df
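
# Usage sketch for fix_columns_literal_boolean (assumption: 'active' holds the
# literals 'yes'/'no'; how non-matching values are handled is delegated to
# fix_literal_boolean):
#
#   df = fix_columns_literal_boolean(df, columns=['active'],
#                                    true_value={'active': 'yes'},
#                                    false_value={'active': 'no'})
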
def build_scenario(build_plan, filter_on='ready', connection='dataiku_workspace',
                   ref_table='referentialclient', ref_project='DIReferential',
                   add_ecrm_context=True, finish_on_client=None,
                   single_client=None):
    scenario = Scenario()
    if not isinstance(filter_on, list):
        filter_on = [filter_on]
    project_name = dataiku.default_project_key()
    project_key = dataiku.api_client().get_project(project_name)
    local_variables = project_key.get_variables()['local']
    env = local_variables['env']
    kut.display_message('reading client context referential')
    executor = SQLExecutor2(connection=connection)
    sql_query_referential_client = "SELECT * FROM " + '_'.join(
        [env, ref_project, ref_table])
    client_ref = executor.query_to_df(sql_query_referential_client)
    filter_query = ' & '.join(filter_on)
    client_ref = client_ref.query(filter_query) if filter_query else client_ref
    kut.display_message('Clients ready for automation: '
                        + ', '.join(client_ref.clientName.unique()))
    kut.display_message('run configuration')
    print(build_plan)
    if not pd.isnull(finish_on_client):
        # move the requested client to the end of the build order
        finish_client = client_ref[client_ref.clientName == finish_on_client]
        if len(finish_client) == 0:
            kut.display_message('finish client not found in plan: '
                                + finish_on_client
                                + ' - is the client name valid?')
        other_clients = client_ref[client_ref.clientName != finish_on_client]
        client_ref = pd.concat([other_clients, finish_client],
                               ignore_index=True)
    success = []
    if single_client is not None:
        requested_client = client_ref[client_ref.clientName == single_client]
        if not len(requested_client):
            kut.display_message(
                'requested single client not found, building all allowed clients')
        else:
            client_ref = requested_client
    for index, client_row in client_ref.iterrows():
        variables = set_client_context(client_row=client_row,
                                       add_ecrm_context=add_ecrm_context,
                                       connection=connection)
        client_name = variables['local']['clientName']
        kut.display_message('starting builds on ' + client_name)
        run_scenario(table_plan=build_plan, scenario=scenario)
        success.append(client_name)
        scenario.set_global_variables(successfullRun=success)
        print('done_________________' + client_name)
    return success
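
# Usage sketch for build_scenario (assumptions: run from a Dataiku scenario
# step; `my_build_plan` stands for whatever structure run_scenario expects,
# and the referential table exposes boolean flags such as 'ready' that work
# in a pandas query):
#
#   success = build_scenario(build_plan=my_build_plan,
#                            filter_on=['ready', 'automated'],
#                            finish_on_client='ACME')
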
def batch_me(
        data,
        instructions,
        batch_size=100,
        track_offset=None,
        with_save_dir=None,
        with_load_dir=None,
        task_name='batchedJob',
        reset_job=False,
        clean_backup=True,
):
    from math import ceil, floor
    current_batch = 0
    kut.display_message('batching', instructions)
    if with_load_dir is True:
        with_load_dir = with_save_dir
    if with_save_dir:
        save_radical = with_save_dir + '_'.join([task_name, kut.file_stamp()])
        print('batch will be saved in:', save_radical + '_*')
    if not isinstance(instructions, list):
        instructions = [instructions]
    # I wanted to make the batching instruction intuitive, but you can't access
    # a variable name inside a function. There might be a more convoluted way
    # to do it, either through a class attribute or a wrapper. For now:
    # instructions = [X.replace(data, 'batch_df') for X in instructions]
    remaining_data = data
    treated_stack = load_backup(backup_dir=with_load_dir,
                                task_name=task_name,
                                reset_job=reset_job)
    if len(treated_stack):
        treated_df = pd.concat(treated_stack, ignore_index=True)
        print('found', len(treated_df), 'backed_up rows')
        remaining_data = data.loc[data.index.difference(treated_df.index)]
        print('remaining to treat', len(remaining_data), 'rows')
    if len(remaining_data):
        batch_size = min(batch_size, len(remaining_data))
        total_batches = ceil(len(remaining_data) / batch_size)
        print(floor(len(remaining_data) / batch_size), 'batches of',
              batch_size, 'rows')
        if track_offset is None:
            track_offset = max(round(0.25 * total_batches), 1)
        if len(remaining_data) % batch_size:
            print('one batch of', len(remaining_data) % batch_size, 'rows')
        if track_offset:
            print('track offset every', track_offset, 'batch')
    else:
        print('nothing left to batch')
    for batch_df in kut.split_me(data=remaining_data, batch_size=batch_size):
        current_batch += 1
        for instruction in instructions:
            # instructions are strings evaluated with the current chunk bound
            # to the local name `batch_df`
            eval(instruction)
        treated_stack.append(batch_df)
        # intermediate prints and saves
        if not current_batch % track_offset:
            print('done batch', current_batch)
            if with_save_dir is not None:
                save_name = '_'.join([save_radical, str(current_batch)])
                current_treated_df = pd.concat(treated_stack,
                                               ignore_index=True)
                current_treated_df.to_csv(save_name, index=False)
                print('saved', len(current_treated_df), 'rows')
                print(save_name)
    out = pd.concat(treated_stack, ignore_index=True)
    if clean_backup:
        kut.display_message('cleaning backups')
        clean_backups(backup_dir=with_load_dir, task_name=task_name)
    return out
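
# Usage sketch for batch_me (assumptions: instructions are strings eval'ed
# with the current chunk available as `batch_df`, as the commented-out note
# above suggests; the hypothetical `enrich` stands for any side-effecting
# per-batch function):
#
#   out = batch_me(df,
#                  instructions=["enrich(batch_df)"],
#                  batch_size=500,
#                  with_save_dir='/tmp/backups/',
#                  task_name='enrichmentJob')
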