Example #1
# https://community.ipswitch.com/s/article/Direct-Download-Links-for-Transfer-and-Automation-2018

import os
import sys
import shutil
import re
import logging
import argparse
import json
import subprocess
import boto3
from botocore.exceptions import ClientError
import lib.logs as log

logger = logging.getLogger(__name__)
log.setup()


def clean_exit(code, message):
    """Exits with a logger message and code"""
    logger.debug('Exiting with code %s : %s', str(code), message)
    sys.exit(code)


# Command line arguments
parser = argparse.ArgumentParser(
    description='GDX Analytics ETL utility for PRMP.')
parser.add_argument(
    '-c',
    '--conf',
    help='Microservice configuration file.')
Example #2
# imports used by main() below
import os
import re
import sys
import json
import logging
import datetime
import itertools
from io import StringIO

import boto3
from botocore.exceptions import ClientError
import numpy as np
import pandas as pd
import psycopg2

import lib.logs as log


def main():
    """Process S3 loaded CMS Lite Metadata file to Redshift"""

    logger = logging.getLogger(__name__)
    log.setup()

    def clean_exit(code, message):
        """Exits with a logger message and code"""
        logger.debug('Exiting with code %s : %s', str(code), message)
        sys.exit(code)

    # we will use this timestamp to write to the cmslite.microservice_log table
    # changes to that table trigger Looker caching.
    # As a result, Looker refreshes its cmslite metadata cache
    # each time this microservice completes
    starttime = str(datetime.datetime.now())

    # Read configuration file
    if len(sys.argv) != 2:  # will be 1 if no arguments, 2 if one argument
        logger.error(
            "Usage: python27 cmslitemetadata_to_redshift.py configfile.json")
        clean_exit(1, 'bad configuration')
    configfile = sys.argv[1]
    if os.path.isfile(configfile) is False:  # confirm that the file exists
        logger.error("Invalid file name %s", configfile)
        clean_exit(1, 'bad configuration')
    with open(configfile) as _f:
        data = json.load(_f)

    # Set up variables from config file
    bucket = data['bucket']
    source = data['source']
    destination = data['destination']
    directory = data['directory']
    doc = data['doc']
    if 'dbschema' in data:
        dbschema = data['dbschema']
    else:
        dbschema = 'microservice'
    dbtable = data['dbtable']

    column_count = data['column_count']
    columns_metadata = data['columns_metadata']
    columns_lookup = data['columns_lookup']
    dbtables_dictionaries = data['dbtables_dictionaries']
    dbtables_metadata = data['dbtables_metadata']
    nested_delim = data['nested_delim']
    columns = data['columns']
    dtype_dic = {}
    if 'dtype_dic_strings' in data:
        for fieldname in data['dtype_dic_strings']:
            dtype_dic[fieldname] = str
    delim = data['delim']
    truncate = data['truncate']
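
    # For illustration only: a minimal configuration shaped like the keys read
    # above (all names and values below are hypothetical, not a real GDX
    # Analytics config; the optional 'rename', 'replace' and 'dateformat'
    # blocks handled later in this script are omitted here).
    #
    # {
    #   "bucket": "example-analytics-bucket",
    #   "source": "client/cmslite",
    #   "directory": "metadata",
    #   "destination": "processed",
    #   "doc": "cms_metadata.*csv",
    #   "dbschema": "cmslite",
    #   "dbtable": "metadata",
    #   "column_count": 5,
    #   "columns": ["node_id", "title", "hr_url", "parent_node_id", "page_type"],
    #   "columns_metadata": ["node_id", "title", "hr_url", "parent_node_id"],
    #   "columns_lookup": ["page_type"],
    #   "dbtables_dictionaries": ["page_type_dict"],
    #   "dbtables_metadata": ["page_type_metadata"],
    #   "nested_delim": ",",
    #   "dtype_dic_strings": ["node_id", "parent_node_id"],
    #   "delim": "|",
    #   "truncate": true
    # }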

    # set up S3 connection
    client = boto3.client('s3')  # low-level functional API
    resource = boto3.resource('s3')  # high-level object-oriented API
    # substitute this for your S3 bucket name.
    my_bucket = resource.Bucket(bucket)
    bucket_name = my_bucket.name

    aws_access_key_id = os.environ['AWS_ACCESS_KEY_ID']
    aws_secret_access_key = os.environ['AWS_SECRET_ACCESS_KEY']

    # prep database call to pull the batch file into redshift
    conn_string = """
    dbname='{dbname}' host='{host}' port='{port}' user='{user}' \
    password={password}
    """.format(dbname='snowplow',
               host='redshift.analytics.gov.bc.ca',
               port='5439',
               user=os.environ['pguser'],
               password=os.environ['pgpass'])
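
    # Credentials are taken from the environment: AWS_ACCESS_KEY_ID and
    # AWS_SECRET_ACCESS_KEY (read above) are embedded in the Redshift COPY
    # statements below, while pguser and pgpass supply the database login.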

    # to_s3 parameters:
    # loc_batchfile = the batch path, which ends with the original object key
    # filename = the name of the batch CSV to write. It is appended to
    # loc_batchfile (eg. part01.csv -> "example.csv/part01.csv")
    # loc_df = the dataframe to write out
    # loc_columnlist = a list of columns to use from the dataframe.
    # Must be in the same order as the SQL table.
    # If null (eg None in Python), all columns are written in order.
    # loc_index = if not None, add an index column with this label
    def to_s3(loc_batchfile, filename, loc_df, loc_columnlist, loc_index):
        """Funcion to write a CSV to S3"""
        # Put the full data set into a buffer and write it
        # to a "   " delimited file in the batch directory
        csv_buffer = StringIO()
        if loc_columnlist is None:  # no column list, no index
            if loc_index is None:
                loc_df.to_csv(csv_buffer,
                              header=True,
                              index=False,
                              sep="	",
                              encoding='utf-8')
            else:  # no column list, include index
                loc_df.to_csv(csv_buffer,
                              header=True,
                              index=True,
                              sep="	",
                              index_label=loc_index,
                              encoding='utf-8')
        else:
            if loc_index is None:  # column list, no index
                loc_df.to_csv(csv_buffer,
                              header=True,
                              index=False,
                              sep="	",
                              columns=loc_columnlist,
                              encoding='utf-8')
            # column list, include index
            else:
                loc_df.to_csv(csv_buffer,
                              header=True,
                              index=True,
                              sep="	",
                              columns=loc_columnlist,
                              index_label=loc_index,
                              encoding='utf-8')

        logger.debug("Writing " + filename + " to " + loc_batchfile)
        resource.Bucket(bucket).put_object(Key=loc_batchfile + "/" + filename,
                                           Body=csv_buffer.getvalue())
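
    # Illustrative call, mirroring how to_s3 is used in the processing loop
    # below: to_s3(batchfile, dbtable + '.csv', df_new, columnlist, key)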

    # Create a dictionary dataframe based on a column
    def to_dict(loc_df, section):
        '''build a dictionary-type dataframe for a column with nested \
        delimiters'''
        # drop any nulls and wrapping delimiters, split and flatten:
        clean = loc_df.copy().dropna(
            subset=[section])[section].str[1:-1].str.split(
                nested_delim).values.flatten()
        # use a set to exclude duplicates
        _l = list(set(itertools.chain.from_iterable(clean)))
        # make a dataframe of the list
        return pd.DataFrame({section: sorted(_l)})
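
    # For example (hypothetical values): with nested_delim set to ',', column
    # entries such as '|Arts,Health|' and '|Health,Transport|' reduce to a
    # one-column dataframe of the sorted distinct terms Arts, Health, Transport.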

    # Check to see if the file has been processed already
    def is_processed(loc_object_summary):
        '''check S3 for objects already processed'''
        # Check to see if the file has been processed already
        loc_key = loc_object_summary.key
        filename = loc_key[loc_key.rfind('/') + 1:]  # get the filename string
        loc_goodfile = destination + "/good/" + loc_key
        loc_badfile = destination + "/bad/" + loc_key
        try:
            client.head_object(Bucket=bucket, Key=loc_goodfile)
        except ClientError:
            pass  # this object does not exist under the good destination path
        else:
            return True
        try:
            client.head_object(Bucket=bucket, Key=loc_badfile)
        except ClientError:
            pass  # this object does not exist under the bad destination path
        else:
            return True
        logger.debug("%s has not been processed.", filename)
        return False
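
    # is_processed depends on client.head_object raising ClientError when no
    # object exists at the good/bad key; if either call succeeds, the file was
    # already copied to /good or /bad by a previous run and is skipped.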

    # This bucket scan will find unprocessed objects.
    # objects_to_process will contain zero or one objects if truncate=True;
    # objects_to_process will contain zero or more objects if truncate=False.
    objects_to_process = []
    for object_summary in my_bucket.objects.filter(Prefix=source + "/" +
                                                   directory + "/"):
        key = object_summary.key
        # skip to next object if already processed
        if is_processed(object_summary):
            continue

        logger.debug("Processing %s", object_summary)
        # only review those matching our configured 'doc' regex pattern
        if re.search(doc + '$', key):
            # under truncate, we will keep list length to 1
            # only adding the most recently modified file to objects_to_process
            if truncate:
                if len(objects_to_process) == 0:
                    objects_to_process.append(object_summary)
                    continue
                # compare last modified dates of the latest and current obj
                if (object_summary.last_modified >
                        objects_to_process[0].last_modified):
                    objects_to_process[0] = object_summary
                else:
                    logger.debug("skipping %s; less recent than %s", key,
                                 objects_to_process[0].last_modified)
            else:
                # no truncate, so the list may exceed 1 element
                objects_to_process.append(object_summary)

    if truncate and len(objects_to_process) == 1:
        logger.info(('truncate is set. processing most recent file match: '
                     '%s (modified %s)'), objects_to_process[0].key,
                    objects_to_process[0].last_modified)

    # process the objects that were found during the earlier directory pass
    for object_summary in objects_to_process:
        # Check to see if the file has been processed already
        batchfile = destination + "/batch/" + object_summary.key
        goodfile = destination + "/good/" + object_summary.key
        badfile = destination + "/bad/" + object_summary.key

        # Load the object from S3 using Boto and set body to be its contents
        obj = client.get_object(Bucket=bucket, Key=object_summary.key)
        body = obj['Body']

        csv_string = body.read().decode('utf-8')

        # XX  temporary fix while we figure out better delimiter handling
        csv_string = csv_string.replace('\t', ' ')

        # Check for an empty or unparsable file. If found, tag it as bad, exit
        _df = None
        try:
            _df = pd.read_csv(StringIO(csv_string),
                              sep=delim,
                              index_col=False,
                              dtype=dtype_dic,
                              usecols=range(column_count))
        except pd.errors.EmptyDataError:
            logger.exception("Empty file:")
            outfile = badfile
            client.copy_object(Bucket=bucket,
                               CopySource=bucket + '/' + object_summary.key,
                               Key=outfile)
            clean_exit(
                1, f'{object_summary.key} was empty and was tagged as bad.')
        except pd.errors.ParserError:
            logger.exception("Parse error:")
            outfile = badfile

            client.copy_object(Bucket=bucket,
                               CopySource=bucket + '/' + object_summary.key,
                               Key=outfile)
            clean_exit(
                1,
                f'{object_summary.key} did not parse and was tagged as bad.')

        # set the data frame to use the columns listed in the .conf file.
        # Note that this overrides the columns in the file, and will give an
        # error if the wrong number of columns is present.
        # It will not validate the existing column names.
        _df.columns = columns

        # Run rename to change column names
        if 'rename' in data:
            for thisfield in data['rename']:
                if thisfield['old'] in _df.columns:
                    _df.rename(columns={thisfield['old']: thisfield['new']},
                               inplace=True)

        # Run replace on some fields to clean the data up
        if 'replace' in data:
            for thisfield in data['replace']:
                _df[thisfield['field']] = _df[thisfield['field']].str.replace(
                    thisfield['old'], thisfield['new'])

        # Clean up date fields, for each field listed in the dateformat array
        # named "field" apply "format". Leaves null entries as blanks instead
        # of NaT.
        if 'dateformat' in data:
            for thisfield in data['dateformat']:
                _df[thisfield['field']] = pd.to_datetime(
                    _df[thisfield['field']]).apply(lambda x: x.strftime(
                        thisfield['format']) if not pd.isnull(x) else '')
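
        # An illustrative 'dateformat' entry (field name is hypothetical):
        #   {"field": "published_date", "format": "%Y-%m-%d %H:%M:%S"}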

        # We loop over the columns listed in the JSON configuration file.
        # Three configuration lists must line up entry-by-entry:
        # - columns_lookup
        # - dbtables_dictionaries
        # - dbtables_metadata

        # The main metadata table is built in the same loop (at index -1) as
        # the lookup and metadata tables, which lets us reuse the code below
        # to write each batch file and assemble the SQL commands.

        dictionary_dfs = {}  # keep the dictionaries in storage
        # loop starts at index -1 to process the main metadata table.

        # build an aggregate query which will be used to make one transaction
        copy_queries = {}
        for i in range(-1, len(columns_lookup) * 2):
            # the metadata table is built once
            if i == -1:
                column = "metadata"
                dbtable = "metadata"
                key = None
                columnlist = columns_metadata
                df_new = _df.copy()
            # the column lookup tables are built
            elif i < len(columns_lookup):
                key = "key"
                column = columns_lookup[i]
                columnlist = [columns_lookup[i]]
                dbtable = dbtables_dictionaries[i]
                df_new = to_dict(_df, column)  # make dict a df of this column
                dictionary_dfs[columns_lookup[i]] = df_new
            # the metadata tables are built
            else:
                i_off = i - len(columns_lookup)
                key = None
                column = columns_lookup[i_off]
                columnlist = ['node_id', 'lookup_id']
                dbtable = dbtables_metadata[i_off]

                # retrieve the dict in mem
                df_dictionary = dictionary_dfs[column]

                # for each row in df
                df_new = pd.DataFrame(columns=columnlist)
                for iterrows_tuple in _df.copy().iterrows():
                    row = iterrows_tuple[1]
                    # iterate over the list of delimited terms
                    if row[column] is not np.nan:
                        # get the full string of delimited
                        # values to be looked up
                        entry = row[column]
                        # remove wrapping delimeters
                        entry = entry[1:-1]
                        if entry:  # skip empties
                            # split on delimiter and iterate on resultant list
                            for lookup_entry in entry.split(nested_delim):
                                node_id = row.node_id
                                # its dictionary index
                                lookup_id = df_dictionary.loc[
                                    df_dictionary[column] ==
                                    lookup_entry].index[0]
                                # create the data frame to concat
                                _d = pd.DataFrame([[node_id, lookup_id]],
                                                  columns=columnlist)
                                df_new = pd.concat([df_new, _d],
                                                   ignore_index=True)

            # output the dataframe as a CSV
            to_s3(batchfile, dbtable + '.csv', df_new, columnlist, key)

            # append the formatted copy query to the copy_queries dictionary
            copy_queries[dbtable] = (
                f"COPY {dbtable}_scratch FROM \n"
                f"'s3://{bucket_name}/{batchfile}/{dbtable}.csv' \n"
                f"CREDENTIALS 'aws_access_key_id={aws_access_key_id};"
                f"aws_secret_access_key={aws_secret_access_key}' \n"
                "IGNOREHEADER AS 1 MAXERROR AS 0 \n"
                "DELIMITER '	' NULL AS '-' ESCAPE;\n")

        # prepare the single-transaction query
        query = f'BEGIN; \nSET search_path TO {dbschema};'
        for table, copy_query in copy_queries.items():
            start_query = (
                f'DROP TABLE IF EXISTS {table}_scratch;\n'
                f'DROP TABLE IF EXISTS {table}_old;\n'
                f'CREATE TABLE {table}_scratch (LIKE {table});\n'
                f'ALTER TABLE {table}_scratch OWNER TO microservice;\n'
                f'GRANT SELECT ON {table}_scratch TO looker;\n')
            end_query = (f'ALTER TABLE {table} RENAME TO {table}_old;\n'
                         f'ALTER TABLE {table}_scratch RENAME TO {table};\n'
                         f'DROP TABLE {table}_old;\n')
            query = query + start_query + copy_query + end_query
        query = query + 'COMMIT;\n'
        logquery = (query.replace(os.environ['AWS_ACCESS_KEY_ID'],
                                  'AWS_ACCESS_KEY_ID').replace(
                                      os.environ['AWS_SECRET_ACCESS_KEY'],
                                      'AWS_SECRET_ACCESS_KEY'))

        logger.debug('\n%s', logquery)
        with psycopg2.connect(conn_string) as conn:
            with conn.cursor() as curs:
                try:
                    curs.execute(query)
                except psycopg2.Error:
                    logger.exception("Executing transaction for %s failed.",
                                     object_summary.key)
                    outfile = badfile
                else:  # if the DB call succeeds, place the file in /good
                    logger.info("Executing transaction for %s succeeded.",
                                object_summary.key)
                    outfile = goodfile

        # Copies the uploaded file from client into processed/good or /bad
        try:
            client.copy_object(Bucket=bucket,
                               CopySource=bucket + '/' + object_summary.key,
                               Key=outfile)
        except ClientError:
            logger.exception("S3 transfer failed")
            clean_exit(
                1, f'S3 transfer of {object_summary.key} to {outfile} failed.')

        # exit with non-zero code if the file was keyed to bad
        if outfile == badfile:
            clean_exit(1, f'{object_summary.key} was processed as bad.')

    # now we run the one-time load of the cmslite.themes table
    query = """
    -- perform this as a transaction.
    -- Either the whole query completes, or it leaves the old table intact
    BEGIN;
    SET search_path TO {dbschema};
    DROP TABLE IF EXISTS {dbschema}.themes;
    CREATE TABLE IF NOT EXISTS {dbschema}.themes (
      "node_id"	       VARCHAR(255),
      "title"		   VARCHAR(2047),
      "hr_url"	       VARCHAR(2047),
      "parent_node_id" VARCHAR(255),
      "parent_title"   VARCHAR(2047),
      "theme_id"	   VARCHAR(255),
      "subtheme_id"	   VARCHAR(255),
      "topic_id"	   VARCHAR(255),
      "subtopic_id"	   VARCHAR(255),
      "subsubtopic_id" VARCHAR(255),
      "theme"		   VARCHAR(2047),
      "subtheme"	   VARCHAR(2047),
      "topic"		   VARCHAR(2047),
      "subtopic"	   VARCHAR(2047),
      "subsubtopic"	   VARCHAR(2047)
    );
    ALTER TABLE {dbschema}.themes OWNER TO microservice;
    GRANT SELECT ON {dbschema}.themes TO looker;

    INSERT INTO {dbschema}.themes
    WITH ids
    AS (SELECT cm.node_id,
      cm.title,
      cm.hr_url,
      cm.parent_node_id,
      cm_parent.title AS parent_title,
      cm.ancestor_nodes,
      CASE
        -- page is root: Gov, Intranet, ALC, MCFD or Training SITE
        WHEN cm.node_id IN ('CA4CBBBB070F043ACF7FB35FE3FD1081',
                            'A9A4B738CE26466C92B45A66DD8C2AFC',
                            '7B239105652B4EBDAB215C59B75A453B',
                            'AFE735F4ADA542ACA830EBC10D179FBE',
                            'D69135AB037140D880A4B0E725D15774')
          THEN '||'
        -- parent page is root: Gov, Intranet, ALC, MCFD or Training SITE
        WHEN cm.parent_node_id IN ('CA4CBBBB070F043ACF7FB35FE3FD1081',
                            'A9A4B738CE26466C92B45A66DD8C2AFC',
                            '7B239105652B4EBDAB215C59B75A453B',
                            'AFE735F4ADA542ACA830EBC10D179FBE',
                            'D69135AB037140D880A4B0E725D15774')
          THEN '|' || cm.node_id || '|'
        -- "first" page is root: Gov, Intranet, ALC, MCFD or Training SITE
        WHEN TRIM(SPLIT_PART(cm.ancestor_nodes, '|', 2)) IN
                           ('CA4CBBBB070F043ACF7FB35FE3FD1081',
                            'A9A4B738CE26466C92B45A66DD8C2AFC',
                            '7B239105652B4EBDAB215C59B75A453B',
                            'AFE735F4ADA542ACA830EBC10D179FBE',
                            'D69135AB037140D880A4B0E725D15774')
          THEN REPLACE(cm.ancestor_nodes, '|' ||
            TRIM(SPLIT_PART(cm.ancestor_nodes, '|', 2)), '') ||
            cm.parent_node_id || '|' || cm.node_id || '|'
        -- an exception for assets, push the parent node to level2 and
        -- leave the node out of the hierarchy
        WHEN cm.ancestor_nodes = '||' AND cm.page_type = 'ASSET'
          THEN cm.ancestor_nodes || cm.parent_node_id
        -- no ancestor nodes
        WHEN cm.ancestor_nodes = '||'
          THEN '|' || cm.parent_node_id || '|' || cm.node_id || '|'
        ELSE cm.ancestor_nodes || cm.parent_node_id || '|' || cm.node_id || '|'
      END AS full_tree_nodes,
      -- The first SPLIT_PART of full_tree_nodes is always blank as the
      -- string has '|' on each end
      CASE
        WHEN TRIM(SPLIT_PART(full_tree_nodes, '|', 2)) <> ''
          THEN TRIM(SPLIT_PART(full_tree_nodes, '|', 2))
        ELSE NULL
      END AS level1_id,
      CASE
        WHEN TRIM(SPLIT_PART(full_tree_nodes, '|', 3)) <> ''
          THEN TRIM(SPLIT_PART(full_tree_nodes, '|', 3))
        ELSE NULL
      END AS level2_id,
      --  exception for Service BC pages:
      -- "promote" FD6DB5BA2A5248038EEF54D9F9F37C4D as a topic and
      -- raise up its children as sub-topics
      CASE
        WHEN TRIM(SPLIT_PART(full_tree_nodes, '|', 7)) =
          'FD6DB5BA2A5248038EEF54D9F9F37C4D'
          THEN 'FD6DB5BA2A5248038EEF54D9F9F37C4D'
        WHEN TRIM(SPLIT_PART(full_tree_nodes, '|', 4)) <> ''
          THEN TRIM(SPLIT_PART(full_tree_nodes, '|', 4))
        ELSE NULL
      END AS level3_id,
      CASE
        WHEN TRIM(SPLIT_PART(full_tree_nodes, '|', 7)) =
            'FD6DB5BA2A5248038EEF54D9F9F37C4D'
          AND TRIM(SPLIT_PART(full_tree_nodes, '|', 8)) <> ''
          THEN TRIM(SPLIT_PART(full_tree_nodes, '|', 8))
        WHEN TRIM(SPLIT_PART(full_tree_nodes, '|', 7)) <>
            'FD6DB5BA2A5248038EEF54D9F9F37C4D'
          AND TRIM(SPLIT_PART(full_tree_nodes, '|', 5)) <> ''
          THEN TRIM(SPLIT_PART(full_tree_nodes, '|', 5))
        ELSE NULL
      END AS level4_id,
      CASE
        WHEN TRIM(SPLIT_PART(full_tree_nodes, '|', 7)) =
            'FD6DB5BA2A5248038EEF54D9F9F37C4D'
          AND TRIM(SPLIT_PART(full_tree_nodes, '|', 9)) <> ''
          THEN TRIM(SPLIT_PART(full_tree_nodes, '|', 9))
        WHEN TRIM(SPLIT_PART(full_tree_nodes, '|', 7)) <>
            'FD6DB5BA2A5248038EEF54D9F9F37C4D'
          AND TRIM(SPLIT_PART(full_tree_nodes, '|', 6)) <> ''
          THEN TRIM(SPLIT_PART(full_tree_nodes, '|', 6))
        ELSE NULL
      END AS level5_id
    FROM {dbschema}.metadata AS cm
      LEFT JOIN {dbschema}.metadata AS cm_parent
        ON cm_parent.node_id = cm.parent_node_id),
biglist
  AS (SELECT
    ROW_NUMBER () OVER ( PARTITION BY ids.node_id ) AS index,
    ids.*,
    l1.title AS theme,
    l2.title AS subtheme,
    l3.title AS topic,
    l4.title AS subtopic,
    l5.title AS subsubtopic,
  CASE
    WHEN theme IS NOT NULL
      THEN level1_ID
    ELSE NULL
  END AS theme_ID,
  CASE
    WHEN subtheme IS NOT NULL
      THEN level2_ID
    ELSE NULL
  END AS subtheme_ID,
  CASE
    WHEN topic IS NOT NULL
      THEN level3_ID
    ELSE NULL
  END AS topic_ID,
  CASE
    WHEN subtopic IS NOT NULL
      THEN level4_ID
    ELSE NULL
  END AS subtopic_ID,
  CASE
    WHEN subsubtopic IS NOT NULL
      THEN level5_ID
    ELSE NULL
  END AS subsubtopic_ID
FROM ids
    LEFT JOIN {dbschema}.metadata AS l1
      ON l1.node_id = ids.level1_id
    LEFT JOIN {dbschema}.metadata AS l2
      ON l2.node_id = ids.level2_id
    LEFT JOIN {dbschema}.metadata AS l3
      ON l3.node_id = ids.level3_id
    LEFT JOIN {dbschema}.metadata AS l4
      ON l4.node_id = ids.level4_id
    LEFT JOIN {dbschema}.metadata AS l5
      ON l5.node_id = ids.level5_id
)
SELECT node_id,
       title,
       hr_url,
       parent_node_id,
       parent_title,
       theme_id,
       subtheme_id,
       topic_id,
       subtopic_id,
       subsubtopic_id,
       theme,
       subtheme,
       topic,
       subtopic,
       subsubtopic
FROM biglist
WHERE index = 1;
COMMIT;
    """.format(dbschema=dbschema)

    # Execute the query and log the outcome
    logger.debug('Executing query:\n%s', query)
    with psycopg2.connect(conn_string) as conn:
        with conn.cursor() as curs:
            try:
                curs.execute(query)
            # if the DB call fails, log the error and exit
            except psycopg2.Error:
                logger.exception("Psycopg2 error processing themes table")
                clean_exit(1, 'Failed to rebuild themes table.')
            # if the DB call succeeds, log the outcome and record the run
            else:
                logger.info("Themes table loaded successfully")
                # if the job was successful, write to cmslite.microservice_log
                endtime = str(datetime.datetime.now())
                query = (f"SET search_path TO {dbschema}; "
                         "INSERT INTO microservice_log VALUES "
                         f"('{starttime}', '{endtime}');")
                try:
                    curs.execute(query)
                except psycopg2.Error:  # if the DB call fails, print error
                    logger.exception("Failed to write to %s.microservice_log",
                                     dbschema)
                    logger.debug(
                        "To manually update, use: "
                        "start time: %s -- end time: %s", starttime, endtime)
                    clean_exit(1, 'microservice_log load failed.')
                else:
                    logger.info("timestamp row added to microservice_log "
                                "table")
                    logger.debug("start time: %s -- end time: %s", starttime,
                                 endtime)

    clean_exit(0, 'Successfully finished cmslitemetadata_to_redshift.')
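

# The excerpt above only defines main(); the entry point below is an assumed
# convention for running it as a script, e.g.:
#   python cmslitemetadata_to_redshift.py config.json
if __name__ == '__main__':
    main()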