Example #1
 def _execute_batches(self, batches: List):
     results = []
     LOG.info("Executing cassandra batches")
     for batch in batches:
         results.append(self._execute_batch(batch))
     LOG.info("finished %s batches", len(results))
     return results
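The per-batch helper self._execute_batch is not part of this excerpt. A minimal sketch, assuming the surrounding class keeps a cassandra-driver Session as self._session, could look like this:

 def _execute_batch(self, batch):
     # Hypothetical helper: execute a single BatchStatement on the assumed session attribute.
     LOG.debug("Executing one cassandra batch")
     return self._session.execute(batch)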
Example #2
 def _show_result(self, execution_id, max_result_size=1000):
     results = self._get_query_result(execution_id, max_result_size)
     column_info = results['ResultSet']['ResultSetMetadata']['ColumnInfo']
     headers = [h['Name'].encode('utf-8') for h in column_info]
     LOG.info(headers)
     csv_writer = csv.writer(sys.stdout, quoting=csv.QUOTE_ALL)
     csv_writer.writerows([[val['VarCharValue'] for val in row['Data']]
                           for row in results['ResultSet']['Rows']])
Example #3
    def create_table(self, table_settings):
        """
        Create a table from given settings
        Args:
            table_settings (dict): Dictionary of settings to create table

        Returns: None
        """
        table_sql = self._build_create_table_sql(table_settings)
        LOG.info(table_sql)
        self.run_query(table_sql)
Example #4
    def _state_manager_connect(self):

        LOG.info("Connecting to Cassandra")

        conn = CassandraConnectionManager(
            self.__settings.etl_state_manager_connection)
        conn.setup_connection(self.__settings.etl_state_manager_keyspace)

        LOG.info("Cassandra connection established")

        sync_etl_state_table()
Example #5
 def get_all_keys(self, key_prefix: str) -> List[str]:
     """
     Sense all keys under a given key prefix
     Args:
         key_prefix (str): the key prefix under which all files will be sensed
     Returns: List[str]
     """
     LOG.info("sensing files from s3://%s/%s ", self.bucket, key_prefix)
     metadata = self.get_object_metadata(key_prefix)
     lines = [file.key for file in metadata]
     LOG.info("found %s s3 keys", len(lines))
     return lines
Example #6
 def _upsert_data_frame(self, data_frame):
     if self.__settings.destination_batch_size > 1:
         LOG.info("Going to upsert batches of size %s", self.__settings.destination_batch_size)
         result = self._get_cassandra_util().upsert_dataframe_in_batches(
             dataframe=data_frame,
             table=self.__settings.destination_table,
             batch_size=self.__settings.destination_batch_size)
     else:
         LOG.info("Going to upsert one row at a time")
         result = self._get_cassandra_util().upsert_dataframe(
             dataframe=data_frame,
             table=self.__settings.destination_table)
     return result
Example #7
 def delete_recursive(self, key_prefix: str) -> None:
     """
     Recursively delete all keys with given prefix from the named bucket
     Args:
         key_prefix (str): Key prefix under which all files will be deleted
     Returns: None
     """
     if not key_prefix.endswith("/"):
         key_prefix = f"{key_prefix}/"
     LOG.info("Recursively deleting s3://%s/%s", self.bucket, key_prefix)
     response = self.get_resource().Bucket(self.bucket).objects.filter(
         Prefix=key_prefix).delete()
     LOG.info(response)
Example #8
 def rename_file(self, key: str, new_file_name: str) -> None:
     """
     Rename a file on s3
     Args:
         key: Current key of the file
         new_file_name: target file name
     Returns: None
     """
     s3 = self.get_resource()
     full_new_file_path = key.rpartition('/')[0] + '/' + new_file_name
     LOG.info("Renaming source: %s to %s", key, full_new_file_path)
     s3.Object(self.bucket, full_new_file_path).copy_from(
         CopySource={'Bucket': self.bucket, 'Key': key})
     s3.Object(self.bucket, key).delete()
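A hypothetical call of rename_file (the key and the target file name below are placeholders, and s3_util stands for an already configured instance of this utility class):

s3_util.rename_file(key="data/2021/01/part-00000.csv", new_file_name="final.csv")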
Example #9
    def select_dataframe(self, query) -> DataFrame:
        """
        Execute a SQL query using the Airflow MySQL hook and retrieve the data as a pandas DataFrame

        Args:
            query (str): Mysql compliant query string

        Returns: DataFrame

        """
        LOG.info("Executing \n %s", query)
        with closing(self._conn_mgr.get_conn()) as connection:
            df = sql.read_sql(query, connection)
        LOG.info("Sql Data frame size: %s", df.shape[0])
        LOG.debug(df.head(2))
        return df
Example #10
def _multi_process_upload_file(settings: AwsConnectionSettings, filename: str, bucket: str,
                               key: str) -> None:
    """
    A standalone copy of the method, kept at module level so it is simple to pickle in a multiprocessing pool
    Args:
        settings: the s3 connection settings to use for upload
        filename: local file name of the file to be uploaded.
        bucket: the s3 bucket to upload file to.
        key: the s3 key to use while uploading the file
    Returns: None
    """
    LOG.info("Uploading File %s to s3://%s/%s", filename, bucket, key)
    S3Util(
        conn=AwsConnectionManager(settings),
        bucket=bucket
    ).upload_file(local_file_path=filename, key=key)
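Because _multi_process_upload_file lives at module level it can be pickled and handed to worker processes. A hypothetical fan-out, where settings is an already constructed AwsConnectionSettings and the file names, bucket and keys are placeholders, might look like this:

from multiprocessing import Pool

upload_args = [
    (settings, "/tmp/part-0.parquet", "my-bucket", "landing/part-0.parquet"),
    (settings, "/tmp/part-1.parquet", "my-bucket", "landing/part-1.parquet"),
]
with Pool() as pool:
    # Each tuple is unpacked into (settings, filename, bucket, key) for one worker call.
    pool.starmap(_multi_process_upload_file, upload_args)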
Example #11
def get_page_as_list_of_dict(page: dict) -> List[OrderedDict]:
    """
    Converts a list of entries from google adwords response into a list of Ordered Dictionaries
    Args:
        page (dict): the response page from google adwords api
    Returns: List[OrderedDict]
    """
    result = []
    if 'entries' in page:
        entries = page['entries']
        # These entries are a list of zeep Objects that need conversion to Dict
        result = [zeep_object_to_dict(entry) for entry in entries]
        LOG.debug("The result from the adword API: %s", result)
    else:
        LOG.info('No entries were found.')
    return result
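The zeep_object_to_dict helper used above is not shown in this excerpt. A plausible sketch, assuming it simply wraps zeep's serialize_object helper (which converts SOAP response objects into OrderedDicts), is:

from collections import OrderedDict

from zeep.helpers import serialize_object


def zeep_object_to_dict(obj) -> OrderedDict:
    # Assumed implementation: delegate to zeep's own serializer, which returns an OrderedDict.
    return serialize_object(obj)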
Example #12
    def parse_and_export_msgs(self, list_of_msgs, interval):
        """
        Converts messages to a pandas dataframe and then exports to s3

        Args:
            list_of_msgs (list(Kafka Message Object)): List of msg objects
            interval (int): Rounding interval for the temporal partitioning
        Returns: None

        """

        good_data, bad_data = convert_msgs_to_dictionary(list_of_msgs)

        self._partition_data_and_upload_to_s3(good_data, interval)
        self._partition_data_and_upload_to_s3(bad_data, interval)

        LOG.info("Data Upload Complete")
Example #13
    def execute(self, query: str, values: Optional[Tuple] = None) -> None:
        """
        execute standalone SQL statements in a mysql database
        Args:
            query (str): Mysql compliant query string
            values (Tuple): n tuple for substituting values in the query

        Returns: None

        """
        LOG.info("Executing \n %s", query)
        conn = self._conn_mgr.get_conn()
        cur = conn.cursor()
        if values:
            cur.execute(query, values)
        else:
            cur.execute(query)
        conn.commit()
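A hypothetical call of execute with parameter substitution (mysql_util stands for an instance of this class; the table and values are placeholders):

mysql_util.execute(
    "INSERT INTO events (id, payload) VALUES (%s, %s)",
    values=(42, '{"source": "test"}'))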
Example #14
 def delete_recursive_match_suffix(self, key_prefix: str, suffix: str) -> None:
     """
     Recursively delete all keys with given key prefix and suffix from the bucket
     Args:
         key_prefix (str): Key prefix under which all files will be deleted.
         suffix (str): suffix of the subset of files in the given prefix directory to be deleted
     Returns: None
     """
     if not key_prefix:
         raise ValueError("key_prefix must not be empty")
     if not suffix:
         raise ValueError("suffix must not be empty")
     s3 = self.get_resource()
     for obj in s3.Bucket(self.bucket).objects.filter(Prefix=key_prefix):
         if obj.key.endswith(suffix):
             LOG.info("deleting s3://%s/%s", self.bucket, obj.key)
             response = obj.delete()
             LOG.info("Response: %s ", response)
Example #15
 def download_directory(self, source_key: str, file_suffix: str, local_directory: str) -> None:
     """
     Download an entire directory from s3 onto local file system
     Args:
         source_key (str): key prefix of the directory to be downloaded from s3
         file_suffix (str): suffix to filter a subset under the source_key to be downloaded
         local_directory (str): local absolute path to store all the files
     Returns: None
     """
     s3 = self.get_resource()
     LOG.info("Downloading s3://%s/%s to %s", self.bucket, source_key, local_directory)
     for obj in s3.Bucket(self.bucket).objects.filter(Prefix=source_key):
         key_path = obj.key.split("/")
         if obj.key.endswith(file_suffix):
             filename = f"{local_directory}/{key_path[-1]}"
             self.download_file(
                 local_file_path=filename,
                 key=obj.key)
Example #16
    def watch_query(self, execution_id, poll_frequency=10):
        """
        Watch the query execution for a given execution id in Athena
        Args:
            execution_id: the execution id of an Athena query
            poll_frequency (int): Frequency in seconds to poll for the query status using the Athena API

        Returns: dictionary of status from Athena

        """
        LOG.info("Watching query with execution id - %s", execution_id)
        while True:
            athena = self.get_client()
            stats = athena.get_query_execution(QueryExecutionId=execution_id)
            status = stats['QueryExecution']['Status']['State']
            if status in ['SUCCEEDED', 'FAILED', 'CANCELLED']:
                LOG.info("Query Completed %s", stats)
                return stats
            time.sleep(poll_frequency)
Example #17
    def upload_all(self) -> (List[dict], List[dict], List[dict]):
        """
        Upload all files from the Athena table onto AdWords offline conversion
        Returns:
            verification_issues List[dict]: a tuple of lists outlining any
            verification failures
            successes List[dict]: The responses for successful uploads to
            the Google Adwords API
            failures List[dict]: The responses for failed uploads to the
            Google Adwords API
        """

        verification_issues, successes, failures = [], [], []

        for key in self.list_source_files():
            issues, success, fail = \
                self._process_data_frame(self.get_data_frame(key))

            verification_issues.extend(issues)
            successes.extend(success)
            failures.extend(fail)

        if len(verification_issues) > 0:
            LOG.warning("There were %s verification failures",
                        len(verification_issues))

            LOG.debug("All verification failures: \n %s", verification_issues)

        if len(failures) > 0:
            LOG.warning(
                "There were %s failures uploading to the adwords "
                "API", len(failures))

            LOG.info("Sample Failure: \n %s", failures[0])

            LOG.debug("All failures: \n %s", failures)

        LOG.info(
            "There were %s records successfully uploaded from a total of %s submitted items",
            len(successes),
            len(successes) + len(failures) + len(verification_issues))

        return verification_issues, successes, failures
Example #18
 def prepare_batches(self,
                     prepared_statement: PreparedStatement,
                     tuples: List[tuple],
                     batch_size: int) -> List:
     """
     Prepares a list of Cassandra BatchStatements out of a list of tuples and a prepared
     statement
     Args:
         prepared_statement (PreparedStatement): the statement to be used for batching.
         tuples (list[tuple]): the data to be inserted.
         batch_size (int): limit on the number of prepared statements in the batch.
     Returns: list[BatchStatement]
     """
     batches = []
     LOG.debug("Preparing cassandra batches out of rows")
     batches_of_tuples = _chunk_list(tuples, batch_size)
     for tpl in batches_of_tuples:
         batch = self._prepare_batch(prepared_statement, tpl)
         batches.append(batch)
     LOG.info("created %s batches out of list of %s tuples", len(batches), len(tuples))
     return batches
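The _chunk_list helper that prepare_batches delegates to is not shown here. A minimal sketch, assuming it only slices the input into fixed-size pieces, would be:

from typing import List


def _chunk_list(items: List[tuple], chunk_size: int) -> List[List[tuple]]:
    # Hypothetical helper: split items into consecutive chunks of at most chunk_size elements.
    return [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]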
Example #19
 def download_strings_from_directory(self, key_prefix: str, encoding: str = UTF8) -> List[str]:
     """
     Read lines from s3 files
     Args:
         key_prefix: the key prefix under which all files will be read
         encoding: the character encoding to use for encoding / decoding content
     Returns: List[str]
     """
     s3 = self.get_resource()
     bucket = s3.Bucket(name=self.bucket)
     lines = []
     LOG.info("reading files from s3://%s/%s", self.bucket, key_prefix)
     file_metadata = bucket.objects.filter(Prefix=key_prefix)
     for file in file_metadata:
         obj = s3.Object(self.bucket, file.key)
         data = obj.get()['Body'].read().decode(encoding)
         lines.append(data.splitlines())
     # Flatten the list of lists
     flat_lines = [item for sublist in lines for item in sublist]
     LOG.info("read %s lines from %s s3 files", len(flat_lines), len(lines))
     return flat_lines
Example #20
    def run_query(self, query_string, return_result=False):
        """
        General purpose query executor that submits an Athena query, uses the execution id
        to poll and monitor the success of the query, and optionally returns the result.
        Args:
            query_string (string): The string containing a valid Athena query
            return_result (boolean): Boolean flag to return the query results

        Returns: result dictionary if return_result is True, else None

        """
        athena = self.get_client()
        output_location = "s3://{bucket}/{key}".format(
            bucket=self.output_bucket, key=self.output_key)
        LOG.info(
            "executing query \n%s \non database - %s with results location %s",
            query_string, self.database, output_location)
        response = athena.start_query_execution(
            QueryString=query_string,
            QueryExecutionContext={'Database': self.database},
            ResultConfiguration={'OutputLocation': output_location})
        execution_id = response['QueryExecutionId']
        stats = self.watch_query(execution_id)
        LOG.info("athena response %s", response)
        if stats['QueryExecution']['Status']['State'] == 'SUCCEEDED':
            LOG.info("Query execution id - %s SUCCEEDED", execution_id)
            if return_result:
                return self._get_query_result(execution_id)
        else:
            raise ValueError("Query exited with {} state because {}".format(
                stats['QueryExecution']['Status']['State'],
                stats['QueryExecution']['Status']['StateChangeReason']))
        return None
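A hypothetical usage of run_query (athena_util stands for a configured instance; the statements and table name are placeholders), either fire-and-forget or returning the Athena result set:

athena_util.run_query("MSCK REPAIR TABLE my_table")
result = athena_util.run_query(
    "SELECT col_a, COUNT(*) AS n FROM my_table GROUP BY col_a",
    return_result=True)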
Example #21
 def upload_directory(self,
                      source_directory: str,
                      extension: str,
                      target_key: str,
                      overwrite: bool = True,
                      rename: bool = True) -> None:
     """
     Upload a local file directory to s3
     Args:
         source_directory (str): Local source directory's absolute path.
         extension (str): the file extension of files in that directory to be uploaded.
         target_key (str): Target location on the s3 bucket for files to be uploaded.
         overwrite (bool): overwrites files on s3 if set to true.
         rename (bool): renames the file when uploading to s3 if set to true.
     Returns: None
     """
     if overwrite:
         LOG.info("Cleaning existing files on s3")
         self.delete_recursive(f"{target_key}/")
     LOG.info("searching for files to upload in %s", source_directory)
     path_list = Path(source_directory).glob(f'**/*.{extension}')
     itr = 0
     upload_data = []
     for path in path_list:
         path_in_str = str(path)
         filename = os.path.basename(path_in_str)
         if rename:
             filename = f"file-{str(uuid.uuid4())}.{extension}"
         destination_key = f"{target_key}/{filename}"
         itr = itr + 1
         upload_data += [(self.conn.settings, path_in_str, self.bucket, destination_key)]
     Pool().starmap(_multi_process_upload_file, upload_data)  # Use all available cores
     LOG.info("Upload of directory complete at s3://%s/%s", self.bucket, target_key)
Example #22
    def _build_create_table_sql(self, table_settings):
        exists = _construct_table_exists_ddl(table_settings["exists"])
        partitions = _construct_table_partition_ddl(
            table_settings["partitions"])
        table_properties = _construct_table_properties_ddl(
            table_settings.get("skip_headers", False),
            table_settings["storage_format_selector"].lower(),
            table_settings["encryption"])

        sql = """
            CREATE EXTERNAL TABLE {exists} {table}(
              {columns}
              )
            {partitions}
            ROW FORMAT SERDE 
              '{row_format_serde}' 
            STORED AS INPUTFORMAT 
              '{inputformat}' 
            OUTPUTFORMAT 
              '{outputformat}'
            LOCATION
              's3://{s3_bucket}/{s3_dir}'
            {table_properties}
            """.format(
            table=table_settings["table"],
            exists=exists,
            columns=zip_columns(table_settings["columns"]),
            partitions=partitions,
            row_format_serde=self.storage_format_lookup[
                table_settings["storage_format_selector"]]["row_format_serde"],
            inputformat=self.storage_format_lookup[
                table_settings["storage_format_selector"]]["inputformat"],
            outputformat=self.storage_format_lookup[
                table_settings["storage_format_selector"]]["outputformat"],
            s3_bucket=table_settings["s3_bucket"],
            s3_dir=table_settings["s3_dir"],
            table_properties=table_properties)
        LOG.info("Query from table settings:\n %s", sql)
        return sql
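A hypothetical table_settings dictionary covering the keys this builder reads (all values are placeholders; the exact shape of the columns and partitions entries depends on zip_columns and _construct_table_partition_ddl, which are not shown here):

table_settings = {
    "table": "my_table",
    "exists": True,
    "columns": [{"column": "col_a", "type": "string"}],
    "partitions": [{"column": "dt", "type": "string"}],
    "storage_format_selector": "parquet",
    "skip_headers": False,
    "encryption": False,
    "s3_bucket": "my-bucket",
    "s3_dir": "warehouse/my_table",
}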
Example #23
 def create_athena_table(self) -> None:
     """
     Creates an athena table on top of the transferred data
     Returns: None
     """
     athena_util = self._get_athena_util()
     s3_util = self._get_s3_util()
     keys = s3_util.get_keys(key_prefix=self._final_target_prefix)
     LOG.debug("gathered files transferred under this ETL %s", keys)
     if keys:
         data = s3_util.download_parquet_as_dataframe(keys[0])
         LOG.info(
             "Downloaded parquet file from s3 to construct Athena create table statement: %s "
             "\n made dataframe of shape %s", keys[0], data.shape)
         if self.__settings.target_table_ddl_progress:
             athena_util.drop_table(self.__settings.target_table)
         athena_table_settings = self._construct_athena_table_settings(data)
         athena_util.create_table(table_settings=athena_table_settings)
     else:
         raise ValueError(
             "No Data has been uploaded to target directory, please load data first, "
             "before creating Athena table")
Example #24
 def get_keys_modified_in_range(self,
                                key_prefix: str,
                                start_date: arrow,
                                end_date: arrow) -> List[str]:
     """
     Sense if there were any files changed or added in the given time period under the given key
     prefix and return a list of keys
     Args:
         key_prefix: the key prefix under which all files will be sensed
         start_date: start of the duration in which the s3 objects were modified
         end_date: end of the duration in which the s3 objects were modified
     Returns: List[str]
     """
     LOG.info("sensing files from s3://%s/%s \n between %s to %s", self.bucket, key_prefix,
              start_date, end_date)
     metadata = self.get_object_metadata(key_prefix)
     lines = []
     for file in metadata:
         if start_date < arrow.get(file.last_modified) <= end_date:
             lines += [file.key]
     LOG.info("found %s s3 files changed", len(lines))
     return lines
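A hypothetical call of get_keys_modified_in_range (s3_util stands for a configured instance; the prefix is a placeholder) that picks up keys changed in the last 24 hours:

import arrow

changed_keys = s3_util.get_keys_modified_in_range(
    key_prefix="landing/events/",
    start_date=arrow.utcnow().shift(days=-1),
    end_date=arrow.utcnow())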
Example #25
 def move_recursive(self,
                    source_dir: str,
                    destination_dir: str,
                    delete_after_copy: bool = True) -> None:
     """
     recursively move files on s3 to a new location on the same bucket
     Args:
         source_dir: Source key prefix representing the directory to move
         destination_dir: destination key prefix
         delete_after_copy: removes the files from source after successful copy if set to true
     Returns: None
     """
     s3 = self.get_resource()
     bucket = s3.Bucket(self.bucket)
     for obj in bucket.objects.filter(Prefix=source_dir):
         # replace the prefix
         new_key = destination_dir + obj.key[len(source_dir):]
         LOG.info("Moving s3 object from : \n%s \nto: \n%s", obj.key, new_key)
         new_obj = bucket.Object(new_key)
         new_obj.copy({'Bucket': self.bucket, 'Key': obj.key})
     if delete_after_copy:
         self.delete_recursive(source_dir)
Example #26
    def instantiate_producer(self):
        """
        Try to connect to the Kafka bootstrap server. A failed connection
        to the queue can optionally be allowed to happen silently instead
        of raising an exception.

        Returns (Producer): Kafka Producer

        """
        try:
            LOG.debug("Instantiating Producer")
            self.producer = create_producer(self.producer_config)
            # Check the connection works by polling for messages
            LOG.debug("Polling Queue")
            self.producer.poll(3)
            LOG.info("Succesfully polled Kafka Queue")

        except Exception as exception:
            self.producer = None
            LOG.error("Kafka Producer failed to instantiate: \n %s", exception)

            if self.raise_exception_on_failed_connection:
                raise NoProducerInstantiatedError()