Exemple #1
0
def insert_dynamo(table_name, dict_data, key_name, force=True):
    """Put an item in a DynamoDB table, optionally overwriting an existing one.

    A conditional ``put_item`` guarded by ``attribute_not_exists(key_name)``
    is attempted first. If that condition fails, the item is written again
    without the condition when ``force`` is true; otherwise only a warning
    is printed.

    Args:
        table_name: Value passed as ``TableName`` to ``put_item``.
        dict_data: Value passed as ``Item`` to ``put_item``; it is also
            indexed with ``key_name`` to build the log messages.
        key_name: Attribute name used in the condition expression.
        force: Overwrite the item when the conditional put is rejected.
    """
    client = getSession().client('dynamodb')

    try:
        client.put_item(
            TableName=table_name,
            Item=dict_data,
            ReturnConsumedCapacity='TOTAL',
            ConditionExpression='attribute_not_exists({0})'.format(key_name))
    except botocore.exceptions.ClientError as err:
        error_code = err.response['Error']['Code']

        # Anything other than a rejected condition is reported as unknown.
        if error_code != 'ConditionalCheckFailedException':
            pm.print_error('Dynamo problem unknown')
            pm.print_error(str(err), exit_code=1)
            return

        if not force:
            pm.print_warning('Key already exists [{0}:{1}]'.format(
                key_name, dict_data[key_name]))
            return

        pm.print_info('Forcing to rewrite [{0}:{1}]'.format(
            key_name, dict_data[key_name]))
        pm.print_dict(dict_data)

        # Unconditional put: replaces whatever is stored under the key.
        client.put_item(TableName=table_name,
                        Item=dict_data,
                        ReturnConsumedCapacity='TOTAL')
Exemple #2
0
def replace(rule, configuration):
    """Expand ``{var-name}`` placeholders in *rule* from *configuration*.

    A placeholder is a lowercase letter followed by any mix of lowercase
    letters and hyphens, wrapped in braces (for example ``{bucket-name}``).
    Text that does not match this shape is left untouched.

    Args:
        rule: String that may contain placeholders.
        configuration: Mapping from placeholder name (without braces) to
            its replacement; values are converted with ``str``.

    Returns:
        The rule with every placeholder substituted.

    Raises:
        SystemExit: With code 1 when a placeholder has no entry in
            *configuration* (after printing the configuration).
    """
    replacement = {}

    # Same language as the former '{[a-z]+(-*[a-z]*)*}' pattern, but without
    # nested quantifiers, which backtracked exponentially on inputs such as
    # an unterminated '{aaaaaaaa...'.
    for match in re.finditer(r'\{[a-z][a-z-]*\}', rule):
        var = match.group(0)
        var_name = var[1:-1]

        if var_name not in configuration:
            pm.print_error('Unknown rule for [{0}]'.format(var))
            pm.print_error('Configuration during error:')
            pm.print_dict(configuration)
            # Equivalent to exit(1) without relying on the site builtin.
            raise SystemExit(1)

        replacement[var] = configuration[var_name]

    # Strings are immutable, so no copy of the rule is needed.
    replaced_rule = rule
    for old, new in replacement.items():
        replaced_rule = replaced_rule.replace(old, str(new))
    return replaced_rule
Exemple #3
0
def read_env_var(name):
    """Return the value of the environment variable *name*.

    When the variable is not set, an error is reported through
    ``pm.print_error`` with ``exit_code=1`` and ``None`` is returned if
    that call comes back.
    """
    # Environment values are always strings, so None can only mean "absent".
    value = os.environ.get(name)
    if value is not None:
        return value

    pm.print_error('Environment variable [{0}] not found'.format(name),
                   exit_code=1)
    return None
Exemple #4
0
def run_shell_command(command, wait=True):
    """Run *command* through the shell and report its output via ``pm``.

    Args:
        command: Command line handed to the shell.
        wait: When true, block until the command finishes, print its
            standard output line by line and report a non-zero exit status
            as an error. When false, start the command and return
            immediately; its output is discarded.

    Returns:
        None.
    """
    pm.print_info('Running [{0}]'.format(command))

    # NOTE(security): shell=True executes `command` through the shell; it
    # must never be built from untrusted input.
    if not wait:
        # Nobody will ever read the output of a fire-and-forget command, so
        # do not attach pipes to it: an unread pipe can block the child once
        # full, or break when this side of the pipe is closed.
        subprocess.Popen(command,
                         stdout=subprocess.DEVNULL,
                         stderr=subprocess.DEVNULL,
                         shell=True)
        return

    session = subprocess.Popen(command,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               shell=True)
    # communicate() drains both pipes and waits for the process to exit.
    stdout, stderr = session.communicate()

    # errors='replace' keeps non UTF-8 output from aborting the report.
    for line in stdout.decode("utf-8", errors="replace").split('\n'):
        pm.print_info_2(line, padding=3)

    error_text = stderr.decode("utf-8", errors="replace")

    if session.returncode != 0:
        pm.print_error('Command failed! [{0}]'.format(command))
        pm.print_info_2('Error: {0}'.format(error_text))
        pm.print_error('Exit', exit_code=1)
    else:
        # A successful command may still have written to stderr.
        for line in error_text.split('\n'):
            pm.print_info(line, padding=4)
Exemple #5
0
def sns_publish(topic, message):
    """Publish *message* to the SNS topic identified by *topic*.

    Args:
        topic: Value passed as ``TopicArn`` to ``publish``.
        message: Value passed as ``Message`` to ``publish``.

    Returns:
        The response returned by ``publish``, or ``None`` when the call
        failed with a ``ClientError`` and the error report returned control.
    """
    sns = sns_resource()

    # Defined up front so the final return is valid on the error path too.
    response = None

    try:
        response = sns.publish(TopicArn=topic, Message=message)
    except ClientError as err:
        # The topic is now actually interpolated into the warning.
        pm.print_warning('SNS [{0}] error'.format(topic))
        pm.print_error(err.response['Error']['Message'], exit_code=1)

    return response
Exemple #6
0
    def connect(self):
        """Open the psycopg2 connection if it is not already open.

        On success the connection is stored in ``self._connection`` and
        ``self._connected`` becomes ``True``. On failure the error is
        printed and ``self._connected`` stays ``False``.
        """
        if self._connected:
            return

        connection_string = "host='{}' port={} dbname='{}' user={} password={}".format(
            self._host, self._port, self._dbname, self._user, self._pwd)

        try:
            self._connection = psycopg2.connect(connection_string)
        except Exception as e:
            pm.print_error('Error connecting to database')
            pm.print_separator()
            pm.print_error(str(e))
            pm.print_separator()
            self._connected = False
        else:
            # Only flag the object as connected when connect() succeeded;
            # previously the flag was set even after a failure.
            self._connected = True
Exemple #7
0
def downloads3(file_local_path, s3_bucketname, file_remote_path, verbose=True):
    """Download an S3 object to a local path.

    Args:
        file_local_path: Destination path passed to ``download_file``.
        s3_bucketname: Name of the bucket holding the object.
        file_remote_path: Key of the object inside the bucket.
        verbose: Print a message when the download reports a 404.

    Returns:
        ``False`` when ``isfiles3`` reports the object as missing or the
        download answers with a 404 error code; ``True`` otherwise.
    """
    s3 = s3_resource()

    if not isfiles3(s3_bucketname, file_remote_path):
        return False

    try:
        bucket = s3.Bucket(s3_bucketname)
        bucket.download_file(file_remote_path, file_local_path)
    except botocore.exceptions.ClientError as e:
        not_found = e.response['Error']['Code'] == "404"

        if not_found:
            if verbose:
                pm.print_error('[AWS][S3] The object does not exist.')
            return False

        pm.print_error('[AWS][S3] Unknown error')
        pm.print_error_2(str(e))
        pm.print_error('', exit_code=1)

    return True
Exemple #8
0
    def connect(self):
        """Open the pyathena connection if it is not already open.

        The database is first checked with ``athena_exist``. When
        ``check_aws_env()`` is true, the access key pair is read from the
        ``AWS_ACCESS_KEY_ID`` / ``AWS_SECRET_ACCESS_KEY`` environment
        variables and passed explicitly; otherwise only the staging
        directory and region are passed.

        ``self._connected`` is set to ``True`` only when the connection was
        created; any failure is printed and leaves it ``False``.
        """
        if self._connected:
            return

        # Check for database
        if not athena_exist(self._dbname, self._s3bucket, self._remotepath, verbose=False):
            self._connected = False
            pm.print_error('Athena [{0}] does not exist'.format(self._dbname), exit_code=1)
            # Nothing to connect to if the error report returns control.
            return

        use_env_credentials = check_aws_env()

        try:
            # Environment lookups stay inside the try so that a missing
            # variable is reported like any other connection problem.
            connect_kwargs = {
                's3_staging_dir': self._output_location,
                'region_name': os.environ['AWS_REGION'],
            }
            if use_env_credentials:
                connect_kwargs['aws_access_key_id'] = os.environ['AWS_ACCESS_KEY_ID']
                connect_kwargs['aws_secret_access_key'] = os.environ['AWS_SECRET_ACCESS_KEY']

            self._connection = pyathena.connect(**connect_kwargs)
        except Exception as e:
            pm.print_error('Error connecting to database')
            pm.print_separator()
            pm.print_error(str(e))
            pm.print_separator()
            self._connected = False
        else:
            # Previously the flag was set to True even after a failure.
            self._connected = True
Exemple #9
0
    def get_query(self, query, close=True):
        """Run *query* and return its result as read by ``pd.read_sql``.

        Args:
            query: SQL text passed to ``pd.read_sql``.
            close: Call ``self.disconnect()`` once the query has been
                attempted, whether it succeeded or failed.

        Returns:
            The object returned by ``pd.read_sql``, or ``None`` when no
            result could be read.
        """
        df = None
        self.connect()

        if not self._connected:
            pm.print_error('Data Base not connected')
            pm.print_error('Exiting', exit_code=1)
            if close:
                self.disconnect()
            return df

        try:
            df = pd.read_sql(query, self._connection)
        except Exception as e:
            pm.print_error('Query problem')
            pm.print_separator()
            pm.print_error(query)
            pm.print_separator()
            # NOTE(review): raise_error=Exception presumably makes
            # print_error raise; confirm against pm.
            pm.print_error(str(e), raise_error=Exception)
        finally:
            # Release the connection even when the query failed and the
            # error is being propagated to the caller.
            if close:
                self.disconnect()

        return df
Exemple #10
0
def athena_query(query,
                 athena_database,
                 s3_bucketname,
                 file_remote_path,
                 verbose=True,
                 poll_interval=1):
    """Run *query* on Athena and return its results once it has finished.

    Args:
        query: SQL text passed as ``QueryString``.
        athena_database: Database used as the query execution context.
        s3_bucketname: Bucket used to build the result output location.
        file_remote_path: Path inside the bucket for the output location.
        verbose: Print the query status while polling.
        poll_interval: Seconds to wait between two status requests.

    Returns:
        The response of ``get_query_results`` when the query reaches the
        ``SUCCEEDED`` state; ``None`` when it ends as ``FAILED`` or
        ``CANCELLED`` or could not be started.
    """
    import time

    athena = athena_resource()
    output_location = 's3://' + '/'.join([s3_bucketname, file_remote_path
                                          ]) + '/'

    query_result = None
    response = None

    try:
        response = athena.start_query_execution(
            QueryString=query,
            QueryExecutionContext={'Database': athena_database},
            ResultConfiguration={
                'OutputLocation': output_location,
                'EncryptionConfiguration': {
                    'EncryptionOption': 'SSE_S3'
                }
            })

    except ClientError as err:
        pm.print_warning('Athena [{0}] error'.format(athena_database))
        pm.print_error(err.response['Error']['Message'], exit_code=1)

    if response is None:
        # The query was never started; there is no execution id to poll.
        return None

    execution_id = response['QueryExecutionId']

    try:

        while True:
            status = athena.get_query_execution(QueryExecutionId=execution_id)
            current_status = status['QueryExecution']['Status']['State']

            if current_status == 'SUCCEEDED':
                if verbose:
                    pm.print_info_flush(
                        msg='Query Status: {0}'.format(current_status),
                        wait=False)
                query_result = athena.get_query_results(
                    QueryExecutionId=execution_id)
                break

            if current_status in ('FAILED', 'CANCELLED'):
                if verbose:
                    pm.print_error('Query {0}'.format(current_status))
                return None

            if verbose:
                pm.print_info_flush(
                    msg='Query Status: {0}'.format(current_status),
                    wait=True)

            # Pause between polls instead of requesting the status in a
            # tight loop.
            time.sleep(poll_interval)

    except ClientError as err:
        pm.print_warning('Athena [{0}] error'.format(athena_database))
        pm.print_error(err.response['Error']['Message'], exit_code=1)

    return query_result
Exemple #11
0
def get_pymake_var(var):
    """Return the value stored under *var* in the packaged ``pymakefile.json``.

    The file is located through ``pkg_resources`` inside the ``pymake``
    package and loaded with ``json2dict``. The key ``'project-name'`` must
    be present in addition to *var*; if either is missing the loaded
    content is printed and the process exits with code 1.
    """
    pymakefile = pkg_resources.resource_filename('pymake', 'pymakefile.json')
    pymakevars = json2dict(pymakefile)

    # 'project-name' is validated first, then the requested variable.
    for required in ('project-name', var):
        if required not in pymakevars.keys():
            pm.print_error('Unknown var [{0}]'.format(required))
            pm.print_error('Pymakefile during error:')
            pm.print_dict(pymakevars)
            exit(1)

    return pymakevars[var]
Exemple #12
0
def deletes3(s3_bucketname, file_remote_path):
    """Delete an object from S3 if ``isfiles3`` reports that it exists.

    Args:
        s3_bucketname: Name of the bucket holding the object.
        file_remote_path: Key of the object inside the bucket.

    Returns:
        ``True`` when the delete call completed without a ``ClientError``;
        ``None`` when the object does not exist or the delete failed.
    """
    if not isfiles3(s3_bucketname, file_remote_path):
        pm.print_warning('File [s3://{0}/{1}] does not exist'.format(
            s3_bucketname, file_remote_path))
        return None

    s3 = s3_resource()

    try:
        s3.Object(s3_bucketname, file_remote_path).delete()
    except botocore.exceptions.ClientError as e:
        # Something else has gone wrong.
        pm.print_error('[AWS][S3] Unknown error')
        pm.print_error(str(e))
        pm.print_error('', exit_code=1)
        return None

    return True
Exemple #13
0
def isfiles3(s3_bucketname, file_remote_path):
    """Tell whether an object exists in S3 by loading its metadata.

    Args:
        s3_bucketname: Name of the bucket to look in.
        file_remote_path: Key of the object inside the bucket.

    Returns:
        ``True`` when ``load()`` succeeds, ``False`` when it fails with
        error code ``"404"``, and ``None`` when any other ``ClientError``
        was reported and the error report returned control.
    """
    s3 = s3_resource()

    try:
        s3.Object(s3_bucketname, file_remote_path).load()
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            # The object does not exist.
            return False

        # Something else has gone wrong.
        pm.print_error('[AWS][S3] Unknown error')
        pm.print_error(str(e))
        pm.print_error('', exit_code=1)
        return None

    return True
Exemple #14
0
    def connect(self):
        """Open the pyodbc connection to SQL Server if it is not already open.

        ``pyodbc`` is imported lazily. When it is missing, installation
        hints are printed and the method returns with ``self._connected``
        set to ``False``. When the connection attempt fails, the error is
        printed and ``self._connected`` is set to ``False``. On success
        ``self._connected`` becomes ``True``.
        """
        if self._connected:
            return

        try:
            import pyodbc
        except ImportError:
            pm.print_error('Package pyodbc is not installed')
            try:
                # Imported only to find out whether the helper package
                # with the installation recipe is available.
                import pydockerutils  # noqa: F401
            except ImportError:
                pm.print_error('You have installation recipes in package pydockerutils @')
                pm.print_error('  - [https://github.com/nenetto/pydockerutils]')
                pm.print_error('Exiting', exit_code=1)

            pm.print_warning('Please, run the command install_pyodb from pydockerutils in the shell')

            # Without the driver module there is nothing to try; previously
            # execution fell through and the missing name was reported as a
            # database connection error.
            self._connected = False
            return

        connection_string = 'DRIVER={ODBC Driver 13 for SQL Server};'
        connection_string += 'SERVER={0};DATABASE={1};UID={2};PWD={3};'.format(self._host,
                                                                               self._dbname,
                                                                               self._user,
                                                                               self._pwd)

        try:
            self._connection = pyodbc.connect(connection_string)
        except Exception as e:
            pm.print_error('Error connecting to database')
            pm.print_error(str(e))
            pm.print_separator()
            self._connected = False
            return

        self._connected = True

        if self._verbose:
            pm.print_info('Connection Success')
Exemple #15
0
def _report_summary_failure(log_file, log_title, console_title, error):
    """Append *error* to *log_file* and print it through ``pm``."""
    with open(log_file, 'a') as log:
        log.write(log_title)
        log.write(str(error))
    pm.print_error(console_title)
    pm.print_error(str(error))


def summary_table(df, fixedtable_file_xlsx, schema_file_xlsx,
                  summary_file_xlsx, log_file):
    """Clean a table and write its pydqc schema and summary as Excel files.

    Steps: fix the table with ``fixtable``, log and drop the columns that
    are entirely NaN (together with their ``<column>_type`` companions),
    save the cleaned table, infer its schema and generate the data summary.
    Failures in any step are appended to *log_file*, printed, and end the
    function early.

    Args:
        df: Table to process; used as a pandas DataFrame.
        fixedtable_file_xlsx: Output path for the cleaned table.
        schema_file_xlsx: Output path for the inferred schema.
        summary_file_xlsx: Output path for the data summary.
        log_file: Text file that messages are appended to.

    Returns:
        None.
    """
    try:
        from pydqc.infer_schema import infer_schema
        from pydqc.data_summary import data_summary

    except ImportError:
        pm.print_error(
            'To use this function, you need to install pacakge pydqc')
        pm.print_error('    - https://github.com/nenetto/pydqc')
        pm.print_error('    - pip[3] install https://github.com/nenetto/pydqc')
        pm.print_error('', exit_code=1)

    # Fix table
    try:
        df = fixtable(df)
    except Exception as e:
        _report_summary_failure(log_file, 'Error fixing table:\n',
                                'Error processing file:', e)
        return

    # Save in log the void columns
    nan_columns = df.columns[df.isna().all()].tolist()
    with open(log_file, 'a') as log:
        log.write('Columns with all NaNs:\n')
        log.writelines('  - {0}\n'.format(c) for c in nan_columns)

    # Clean df: drop the all-NaN columns and their '<column>_type' partners.
    dropped = set(nan_columns)
    dropped.update(c + '_type' for c in nan_columns)
    clean_columns = [c for c in df.columns if c not in dropped]

    df = df[clean_columns]
    df.to_excel(fixedtable_file_xlsx, index=False)

    # The context manager removes the scratch directory on every exit path,
    # including the early returns below.
    with tempfile.TemporaryDirectory() as dirpath:

        # Infer schema
        pm.print_info('Infering Schema')

        try:
            infer_schema(df,
                         fname='',
                         output_root=dirpath,
                         sample_size=1.0,
                         type_threshold=0.5,
                         n_jobs=1,
                         base_schema=None)

            shutil.copyfile(os.path.join(dirpath, 'data_schema_.xlsx'),
                            schema_file_xlsx)
            df_schema = get_schema(schema_file_xlsx)
        except Exception as e:
            _report_summary_failure(log_file, 'Error Infering Schema:\n',
                                    'Error Infering Schema', e)
            return

        pm.print_info('Schema Detected')

        pm.print_info('Generating Summary')

        try:
            # Summary generation is inside the try so that its failures are
            # logged like those of the other steps.
            data_summary(table_schema=df_schema,
                         table=df,
                         output_root=dirpath,
                         fname='',
                         sample_size=1.0,
                         keep_images=False)

            shutil.copyfile(os.path.join(dirpath, 'data_summary_.xlsx'),
                            summary_file_xlsx)
        except Exception as e:
            _report_summary_failure(log_file, 'Error Generating Summary:\n',
                                    'Error Generating Summary', e)
            return

        pm.print_info('Summary Generated')
Exemple #16
0
def reload_partitions_in_table(athena_database,
                               athena_table,
                               s3_bucketname,
                               file_remote_path,
                               verbose=True):
    """Run ``MSCK REPAIR TABLE`` on an Athena table and wait for it to end.

    Args:
        athena_database: Database used as the query execution context.
        athena_table: Table named in the ``MSCK REPAIR TABLE`` statement.
        s3_bucketname: Bucket used to build the result output location.
        file_remote_path: Path inside the bucket for the output location.
        verbose: Print the query status while polling.

    Returns:
        None. The status is polled every five seconds until the query
        reaches ``SUCCEEDED``, ``FAILED`` or ``CANCELLED``.
    """
    database_found = athena_exist(athena_database, s3_bucketname,
                                  file_remote_path, False)
    if not database_found:
        pm.print_error('Database does not exist', exit_code=1)

    athena = athena_resource()

    output_location = 's3://' + '/'.join([s3_bucketname, file_remote_path
                                          ]) + '/'

    response = None

    try:
        response = athena.start_query_execution(
            QueryString='MSCK REPAIR TABLE {0};'.format(athena_table),
            QueryExecutionContext={'Database': athena_database},
            ResultConfiguration={
                'OutputLocation': output_location,
                'EncryptionConfiguration': {
                    'EncryptionOption': 'SSE_S3'
                }
            })
    except ClientError as err:
        pm.print_error('Reload partitions failed on table [{0}.{1}]'.format(
            athena_database, athena_table))
        pm.print_error(err.response['Error']['Message'], exit_code=1)

    try:

        while True:
            execution = athena.get_query_execution(
                QueryExecutionId=response['QueryExecutionId'])
            state = execution['QueryExecution']['Status']['State']

            if state == 'SUCCEEDED':
                if verbose:
                    pm.print_info_flush(
                        msg='Query Status: {0}'.format(state),
                        wait=False)
                athena.get_query_results(
                    QueryExecutionId=response['QueryExecutionId'])
                break

            if state in ('FAILED', 'CANCELLED'):
                if verbose:
                    pm.print_error('Query {0}'.format(state))
                return None

            # Any other state means the query has not finished yet.
            if verbose:
                pm.print_info_flush(
                    msg='Query Status: {0}'.format(state),
                    wait=True)
            time.sleep(5)

    except ClientError as err:
        pm.print_warning('Athena [{0}] error'.format(athena_database))
        pm.print_error(err.response['Error']['Message'], exit_code=1)
    else:
        pm.print_info('Reload partitions succeed on table [{0}.{1}]'.format(
            athena_database, athena_table))