def insert_dynamo(table_name, dict_data, key_name, force=True):
    aws_session = getSession()
    dynamo = aws_session.client('dynamodb')
    try:
        # Conditional put: fails if an item with this key already exists.
        _ = dynamo.put_item(
            TableName=table_name,
            Item=dict_data,
            ReturnConsumedCapacity='TOTAL',
            ConditionExpression='attribute_not_exists({0})'.format(key_name))
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == 'ConditionalCheckFailedException':
            if force:
                pm.print_info('Forcing rewrite of [{0}:{1}]'.format(
                    key_name, dict_data[key_name]))
                pm.print_dict(dict_data)
                # Retry without the condition to overwrite the existing item.
                _ = dynamo.put_item(TableName=table_name,
                                    Item=dict_data,
                                    ReturnConsumedCapacity='TOTAL')
            else:
                pm.print_warning('Key already exists [{0}:{1}]'.format(
                    key_name, dict_data[key_name]))
        else:
            pm.print_error('Unknown DynamoDB error')
            pm.print_error(str(e), exit_code=1)
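# Usage sketch (table name, key and payload are made up for illustration;
# getSession and pm are this module's helpers): the low-level DynamoDB client
# expects typed attribute values ({'S': ...}, {'N': ...}), not plain Python
# scalars.
#
#   item = {'run_id': {'S': '2020-01-01'}, 'rows': {'N': '42'}}
#   insert_dynamo('pipeline-runs', item, 'run_id', force=True)
#
# With force=False an existing key only triggers a warning and the stored
# item is left untouched.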
def athena_query(query, athena_database, s3_bucketname, file_remote_path,
                 verbose=True):
    athena = athena_resource()
    output_location = 's3://' + '/'.join([s3_bucketname,
                                          file_remote_path]) + '/'
    query_result = None
    response = None
    try:
        response = athena.start_query_execution(
            QueryString=query,
            QueryExecutionContext={'Database': athena_database},
            ResultConfiguration={
                'OutputLocation': output_location,
                'EncryptionConfiguration': {
                    'EncryptionOption': 'SSE_S3'
                }
            })
    except ClientError as err:
        pm.print_warning('Athena [{0}] error'.format(athena_database))
        pm.print_error(err.response['Error']['Message'], exit_code=1)

    try:
        # Poll until the query reaches a terminal state.
        while True:
            status = athena.get_query_execution(
                QueryExecutionId=response['QueryExecutionId'])
            current_status = status['QueryExecution']['Status']['State']

            if current_status not in ['SUCCEEDED', 'FAILED', 'CANCELLED']:
                if verbose:
                    pm.print_info_flush(
                        msg='Query Status: {0}'.format(current_status),
                        wait=True)
            elif current_status == 'SUCCEEDED':
                if verbose:
                    pm.print_info_flush(
                        msg='Query Status: {0}'.format(current_status),
                        wait=False)
                query_result = athena.get_query_results(
                    QueryExecutionId=response['QueryExecutionId'])
                break
            else:
                if verbose:
                    pm.print_error('Query {0}'.format(current_status))
                return None
            # Avoid busy-waiting on the API while the query is running
            # (same pattern as reload_partitions_in_table below).
            time.sleep(5)
    except ClientError as err:
        pm.print_warning('Athena [{0}] error'.format(athena_database))
        pm.print_error(err.response['Error']['Message'], exit_code=1)

    return query_result
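# Usage sketch (bucket, path and query are made up for illustration):
#
#   result = athena_query('SELECT count(*) FROM events',
#                         athena_database='analytics',
#                         s3_bucketname='my-athena-results',
#                         file_remote_path='queries/tmp')
#
# `result` is the raw GetQueryResults response on success, or None if the
# query failed or was cancelled.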
def sns_publish(topic, message):
    sns = sns_resource()
    response = None
    try:
        response = sns.publish(TopicArn=topic, Message=message)
    except ClientError as err:
        pm.print_warning('SNS [{0}] error'.format(topic))
        pm.print_error(err.response['Error']['Message'], exit_code=1)
    return response
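# Usage sketch (the topic ARN is made up for illustration):
#
#   sns_publish('arn:aws:sns:us-east-1:123456789012:alerts',
#               'ETL pipeline finished')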
def deletes3(s3_bucketname, file_remote_path):
    if isfiles3(s3_bucketname, file_remote_path):
        s3 = s3_resource()
        try:
            s3.Object(s3_bucketname, file_remote_path).delete()
        except botocore.exceptions.ClientError as e:
            # Something else has gone wrong.
            s3 = None
            pm.print_error('[AWS][S3] Unknown error')
            pm.print_error(str(e))
            pm.print_error('', exit_code=1)
        else:
            s3 = None
            return True
    else:
        pm.print_warning('File [s3://{0}/{1}] does not exist'.format(
            s3_bucketname, file_remote_path))
        return False
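# Usage sketch (bucket and key are made up for illustration):
#
#   if deletes3('my-bucket', 'exports/2020/data.csv'):
#       pm.print_info('File removed')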
def merge_dicts(a, b, path=None, replacement=True):
    """Merges b into a (in place) and returns a."""
    if path is None:
        path = []
    for key in b:
        if key in a:
            if isinstance(a[key], dict) and isinstance(b[key], dict):
                # Recurse into nested dicts, keeping the replacement policy.
                merge_dicts(a[key], b[key], path + [str(key)], replacement)
            elif a[key] == b[key]:
                pass
            else:
                if replacement:
                    msg = 'Update [{0}]: [{1}] -> [{2}]'.format(
                        key, a[key], b[key])
                    pm.print_warning(msg)
                    a[key] = b[key]
                else:
                    msg = 'Not updated [{0}]: [{1}] -/> [{2}]'.format(
                        key, a[key], b[key])
                    pm.print_warning(msg)
        else:
            a[key] = b[key]
    return a
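# A minimal sketch with made-up dicts (the helper below is illustrative, not
# part of the original module): merge_dicts mutates and returns its first
# argument; nested dicts are merged recursively and scalar conflicts follow
# the `replacement` flag.
def _example_merge_dicts():
    defaults = {'db': {'host': 'localhost', 'port': 5432}, 'debug': False}
    overrides = {'db': {'port': 5433}, 'debug': True}
    merged = merge_dicts(defaults, overrides)
    # merged is defaults itself, now:
    # {'db': {'host': 'localhost', 'port': 5433}, 'debug': True}
    return merged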
def applymodifier(var_value, modifiers=None):
    import pymake.utils.common.text_modifiers as tm

    if modifiers is None:
        modifiers = list()
    if not isinstance(modifiers, list):
        modifiers = [modifiers]

    # Modifiers must be names of functions defined in text_modifiers.
    allowed_modifiers_names = [
        f[0] for f in getmembers(tm) if isfunction(f[1])
    ]

    for m in modifiers:
        if m not in allowed_modifiers_names:
            pm.print_warning('Unknown modifier [{0}] - Unchanged'.format(m))
        else:
            modifier_func = [f[1] for f in getmembers(tm) if f[0] == m][0]
            var_value = modifier_func(var_value)
    return var_value
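# Usage sketch (the modifier names below are assumptions; the real ones are
# whatever functions pymake.utils.common.text_modifiers actually defines):
#
#   applymodifier('HELLO', 'lowercase')              # single modifier
#   applymodifier('hello', ['uppercase', 'bogus'])   # 'bogus' only warns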
def connect(self):
    if not self._connected:
        connection_string = 'DRIVER={ODBC Driver 13 for SQL Server};'
        connection_string += 'SERVER={0};DATABASE={1};UID={2};PWD={3};'.format(
            self._host, self._dbname, self._user, self._pwd)

        try:
            import pyodbc
        except ImportError:
            pm.print_error('Package pyodbc is not installed')
            try:
                # Availability check only: pydockerutils ships the
                # installation recipes for the ODBC driver.
                import pydockerutils  # noqa: F401
            except ImportError:
                pm.print_error('You have installation recipes in package pydockerutils @')
                pm.print_error(' - [https://github.com/nenetto/pydockerutils]')
                pm.print_error('Exiting', exit_code=1)
            pm.print_warning('Please run the command install_pyodb from pydockerutils in the shell')

        try:
            self._connection = pyodbc.connect(connection_string)
        except Exception as e:
            pm.print_error('Error connecting to database')
            pm.print_error(str(e))
            pm.print_separator()
            self._connected = False
            return

        self._connected = True
        if self._verbose:
            pm.print_info('Connection Success')
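# Usage sketch (the enclosing class is not shown in this excerpt; assuming an
# instance `db` of it with _host, _dbname, _user and _pwd already set):
#
#   db.connect()           # no-op if already connected
#   if db._connected:
#       cursor = db._connection.cursor()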
def load_env_var_from_dict(envar_dict, prefix='', update=True):
    newprefix = prefix + '_' if prefix != '' else ''
    for k, v in envar_dict.items():
        if isinstance(v, dict):
            # Recurse into nested dicts, keeping the update policy.
            load_env_var_from_dict(v, newprefix + k, update)
        else:
            varname = newprefix + k
            if update and varname in os.environ:
                pm.print_warning(
                    'Updating environment variable [{0}]:[{1}]->[{2}]'.format(
                        varname, os.environ[varname], str(v)))
                os.environ[varname] = str(v)
            elif varname not in os.environ:
                pm.print_info(
                    'Setting environment variable [{0}]:[{1}]'.format(
                        varname, str(v)))
                os.environ[varname] = str(v)
            else:
                pm.print_warning(
                    'Found environment variable [{0}]:[{1}] - not replaced '
                    'by new value [{2}]'.format(
                        varname, os.environ[varname], str(v)))
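# A minimal sketch (illustrative helper, not part of the original module):
# nested keys are flattened with '_' separators, so the dict below sets
# APP_DB_HOST and APP_DB_PORT.
def _example_load_env_vars():
    config = {'APP': {'DB': {'HOST': 'localhost', 'PORT': 5432}}}
    load_env_var_from_dict(config)
    assert os.environ['APP_DB_HOST'] == 'localhost'
    assert os.environ['APP_DB_PORT'] == '5432'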
def separate_numeric_column(df, column_name, verbose=True):
    """
    Tries to convert a column to numeric. Values where a number is not
    found are set to NaN, and their original content is kept in a new
    '<column_name>_type' column.

    :param df: input dataframe (modified in place for mixed columns)
    :type df: pandas.DataFrame
    :param column_name: name of the column to be fixed
    :type column_name: str
    :param verbose: print extra statistics while processing
    :type verbose: bool
    """
    pm.print_info('Fixing column {0}'.format(column_name))
    if verbose:
        pm.print_info('Number of rows {0}'.format(df.shape[0]))

    # Copy column of interest
    dfx = df[[column_name]].copy()

    # Create type variable
    dfx[column_name + '_type'] = 'num'

    n_num = 0
    n_str = 0
    n_nans = 0
    n_others = 0
    total = dfx.shape[0]

    for i, row in dfx.iterrows():
        pm.print_info_percentage(100 * i / total, 'Processing column',
                                 padding=1)
        try:
            # Try conversion to number
            x = float(row[column_name])
            if np.isnan(x):
                n_nans += 1
            n_num += 1
        except ValueError:
            # Non-numeric string: keep it in the type column, blank the value.
            dfx.loc[i, column_name + '_type'] = row[column_name]
            dfx.loc[i, column_name] = np.nan
            n_str += 1
        except TypeError:
            n_others += 1

    pm.print_info_percentage(100, 'Processed column', padding=1)

    n_nums_no_nans = n_num - n_nans
    pm.print_info('Nums: {0}'.format(n_nums_no_nans))
    pm.print_info('Nans: {0}'.format(n_nans))
    pm.print_info('Strs: {0}'.format(n_str))
    pm.print_info('Unkn: {0}'.format(n_others))

    if verbose:
        pm.print_info(
            'Number of different types reduced from {0} to {1}'.format(
                len(df[column_name].unique()),
                len(dfx[column_name + '_type'].unique())))
        pm.print_info('Classification:')
        for e in dfx[column_name + '_type'].unique():
            n = dfx[dfx[column_name + '_type'] == e].shape[0]
            pm.print_info_2('{0} # {1}'.format(e, n), padding=1)

    # Check the number of non-numeric values
    total_num = dfx[column_name].shape[0]
    if total_num == (n_nums_no_nans + n_nans):
        # Numeric variable - leave as is
        pm.print_warning('Seems to be numeric, please revise')
    elif total_num == n_str:
        # Categorical variable - leave as is
        pm.print_warning('Seems to be categorical, please revise')
    elif total_num == n_others:
        # Unknown type or date - leave as is
        pm.print_warning('Unknown or date, please revise')
    elif n_str > (n_nums_no_nans + n_nans):
        # Categorical variable - leave as is
        pm.print_warning('Seems to be categorical, please revise')
    elif (n_nums_no_nans == 0) and (n_str > 0):
        # Categorical variable - leave as is
        pm.print_warning('Seems to be categorical, please revise')
    else:
        # Mixed variable: split into a numeric column and a type column.
        df[column_name] = dfx[column_name].copy().astype('float')
        df[column_name + '_type'] = dfx[column_name +
                                        '_type'].copy().astype('str')
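# A minimal sketch (illustrative helper, not part of the original module;
# pandas is imported locally since this file only shows an np import): a
# column mixing numbers and strings gets split in place.
def _example_separate_numeric_column():
    import pandas as pd
    df = pd.DataFrame({'price': [1.5, 'n/a', 3, 'unknown', None]})
    separate_numeric_column(df, 'price', verbose=False)
    # df['price'] now holds floats/NaN; df['price_type'] keeps 'num' or the
    # original string for rows that could not be parsed.
    return df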
def reload_partitions_in_table(athena_database, athena_table, s3_bucketname,
                               file_remote_path, verbose=True):
    if not athena_exist(athena_database, s3_bucketname, file_remote_path,
                        False):
        pm.print_error('Database does not exist', exit_code=1)

    athena = athena_resource()
    output_location = 's3://' + '/'.join([s3_bucketname,
                                          file_remote_path]) + '/'
    response = None
    try:
        response = athena.start_query_execution(
            QueryString='MSCK REPAIR TABLE {0};'.format(athena_table),
            QueryExecutionContext={'Database': athena_database},
            ResultConfiguration={
                'OutputLocation': output_location,
                'EncryptionConfiguration': {
                    'EncryptionOption': 'SSE_S3'
                }
            })
    except ClientError as err:
        pm.print_error('Reload partitions failed on table [{0}.{1}]'.format(
            athena_database, athena_table))
        pm.print_error(err.response['Error']['Message'], exit_code=1)

    try:
        # Poll until the repair query reaches a terminal state.
        while True:
            status = athena.get_query_execution(
                QueryExecutionId=response['QueryExecutionId'])
            current_status = status['QueryExecution']['Status']['State']

            if current_status not in ['SUCCEEDED', 'FAILED', 'CANCELLED']:
                if verbose:
                    pm.print_info_flush(
                        msg='Query Status: {0}'.format(current_status),
                        wait=True)
            elif current_status == 'SUCCEEDED':
                if verbose:
                    pm.print_info_flush(
                        msg='Query Status: {0}'.format(current_status),
                        wait=False)
                _ = athena.get_query_results(
                    QueryExecutionId=response['QueryExecutionId'])
                break
            else:
                if verbose:
                    pm.print_error('Query {0}'.format(current_status))
                return None
            time.sleep(5)
    except ClientError as err:
        pm.print_warning('Athena [{0}] error'.format(athena_database))
        pm.print_error(err.response['Error']['Message'], exit_code=1)
    else:
        pm.print_info('Reload partitions succeeded on table [{0}.{1}]'.format(
            athena_database, athena_table))
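# Usage sketch (names are made up for illustration): after writing new
# partition folders under the table's S3 location, refresh the metastore
# with:
#
#   reload_partitions_in_table('analytics', 'events',
#                              'my-athena-results', 'queries/tmp')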