Example No. 1
    def generate_partitioned_dataframes(self, df_msgs_and_meta_data):
        """
        Generates the batched dataframes to upload to s3
        Args:
            df_msgs_and_meta_data (dataframe): dataframe to be partitioned
        Returns (List[Tuple[DataFrame, str, str]]): list of triples (df, s3_dir, file_name)
        """
        dataframes_and_fld_locs = []
        for cur_interval_np_datetime in \
                pd.unique(df_msgs_and_meta_data[self.partition_key_nm]):
            df_partition = \
                df_msgs_and_meta_data[
                    df_msgs_and_meta_data[self.partition_key_nm] ==
                    cur_interval_np_datetime]

            cur_interval_ts = pd.to_datetime(cur_interval_np_datetime)

            batch_file_path = "{}/date_of_batch={}/time_of_batch={}".format(
                self.root_path, cur_interval_ts.strftime('%Y%m%d'),
                cur_interval_ts.strftime('%H%M%S'))

            file_nm = generate_snapshot_file_name_with_timestamp()

            LOG.debug("Data path : %s", batch_file_path)

            dataframes_and_fld_locs.append(
                (df_partition, batch_file_path, file_nm))

        return dataframes_and_fld_locs
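
A minimal, self-contained sketch of the partition path layout this method builds, using only pandas; the root path and the batch timestamp below are made up for illustration.

import pandas as pd

root_path = "kafka/snapshots"  # hypothetical root path
cur_interval_ts = pd.to_datetime("2021-03-15 09:30:00")  # hypothetical batch interval

batch_file_path = "{}/date_of_batch={}/time_of_batch={}".format(
    root_path,
    cur_interval_ts.strftime('%Y%m%d'),
    cur_interval_ts.strftime('%H%M%S'))

print(batch_file_path)
# kafka/snapshots/date_of_batch=20210315/time_of_batch=093000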
Example No. 2
    def _dataframe_to_cassandra_ddl(self,
                                    data_frame: DataFrame,
                                    primary_key_column_list: List[str],
                                    partition_key_column_list: List[str],
                                    table_name: str,
                                    table_options_statement: str = ""):
        """
        Generates a 'create table' cql statement with primary keys (using compound keys for the partition)
        Args:
            data_frame (DataFrame): dataframe whose columns define the table schema
            primary_key_column_list (list[str]): columns to use as the primary key
            partition_key_column_list (list[str]): columns to use as the partition key; defaults to the first primary key column if empty
            table_name (str): name of the table to create
            table_options_statement (str): optional table options clause appended to the generated statement

        Returns: str - the generated 'create table' cql statement
        """
        column_list = _cql_manage_column_lists(data_frame, primary_key_column_list, partition_key_column_list)
        # default the partition key to the first column of the primary key if not specified
        if not partition_key_column_list:
            partition_key_column_list = [primary_key_column_list[0]]
        partition_key = ["(" + ", ".join(partition_key_column_list) + ")"]
        # the remaining primary key columns become the clustering keys
        clustering_key_column_list = [
            x for x in primary_key_column_list if x not in partition_key_column_list]
        cluster_keys = [", ".join(clustering_key_column_list)] if clustering_key_column_list else []
        cql = f"""
        CREATE TABLE IF NOT EXISTS {self.keyspace}.{table_name} (
            {", ".join(column_list)},
            PRIMARY KEY ({", ".join(partition_key + cluster_keys)})
            )
        {table_options_statement};
        """
        LOG.debug(cql)
        return cql
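
An illustrative, standalone sketch (not the library method itself) of how the primary key splits into a compound partition key plus clustering columns; the keyspace, table, and columns below are hypothetical.

keyspace = "analytics"      # hypothetical keyspace
table_name = "page_views"   # hypothetical table
column_list = ["view_id uuid", "page text", "viewed_at timestamp"]
primary_key_column_list = ["view_id", "viewed_at"]
partition_key_column_list = ["view_id"]

partition_key = ["(" + ", ".join(partition_key_column_list) + ")"]
clustering_key_column_list = [
    x for x in primary_key_column_list if x not in partition_key_column_list]
cluster_keys = [", ".join(clustering_key_column_list)] if clustering_key_column_list else []

cql = f"""
CREATE TABLE IF NOT EXISTS {keyspace}.{table_name} (
    {", ".join(column_list)},
    PRIMARY KEY ({", ".join(partition_key + cluster_keys)})
    );
"""
print(cql)  # contains: PRIMARY KEY ((view_id), viewed_at)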
Example No. 3
 def _execute_batches(self, batches: List):
     results = []
     LOG.info("Executing cassandra batches")
     for batch in batches:
         results.append(self._execute_batch(batch))
     LOG.info("finished %s batches", len(results))
     return results
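
The project's _execute_batch is not shown here; below is a hedged sketch of what executing a single batch with the DataStax Python driver typically looks like, assuming an open session and an already prepared statement.

from cassandra.query import BatchStatement

def execute_one_batch(session, prepared_statement, rows):
    # bind one row of parameters per statement and run them as one batch
    batch = BatchStatement()
    for row in rows:
        batch.add(prepared_statement, row)
    return session.execute(batch)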
Example No. 4
def sync_etl_state_table():
    """
    Utility method to sync (Create) the table as per ORM model
    Returns: None
    """
    LOG.debug("Sinking Cassandra Table using model")
    sync_table(EtlSinkRecordState)
Example No. 5
 def _show_result(self, execution_id, max_result_size=1000):
     results = self._get_query_result(execution_id, max_result_size)
     column_info = results['ResultSet']['ResultSetMetadata']['ColumnInfo']
     headers = [h['Name'].encode('utf-8') for h in column_info]
     LOG.info(headers)
     csv_writer = csv.writer(sys.stdout, quoting=csv.QUOTE_ALL)
     csv_writer.writerows([[val['VarCharValue'] for val in row['Data']]
                           for row in results['ResultSet']['Rows']])
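
A minimal sketch of the CSV streaming used above, with made-up rows shaped like Athena's ResultSet payload.

import csv
import sys

rows = [{'Data': [{'VarCharValue': 'a'}, {'VarCharValue': '1'}]},
        {'Data': [{'VarCharValue': 'b'}, {'VarCharValue': '2'}]}]

csv_writer = csv.writer(sys.stdout, quoting=csv.QUOTE_ALL)
csv_writer.writerows([[val['VarCharValue'] for val in row['Data']] for row in rows])
# "a","1"
# "b","2"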
Example No. 6
 def _cql_upsert_from_dataframe(self, dataframe: DataFrame, table: str):
     upsert_sql = f"""
     INSERT INTO {self.keyspace}.{table} 
     ({", ".join(list(dataframe.columns.values))}) 
     VALUES ({", ".join(['?' for key in dataframe.columns.values])});
         """
     LOG.debug(upsert_sql)
     return upsert_sql
Example No. 7
 def _cql_upsert_from_dict(self, data: dict, table: str):
     upsert_sql = f"""
     INSERT INTO {self.keyspace}.{table} 
     ({", ".join(data)}) 
     VALUES ({", ".join(['?' for key in data])});
         """
     LOG.debug(upsert_sql)
     return upsert_sql
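
An illustrative sketch of the prepared-statement INSERT the method above generates for a sample payload; the keyspace, table, and dict are hypothetical.

keyspace, table = "analytics", "page_views"  # hypothetical names
data = {"view_id": 1, "page": "home", "viewed_at": "2021-03-15"}

upsert_sql = f"""
INSERT INTO {keyspace}.{table}
({", ".join(data)})
VALUES ({", ".join(['?' for key in data])});
"""
print(upsert_sql)
# INSERT INTO analytics.page_views
# (view_id, page, viewed_at)
# VALUES (?, ?, ?);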
Example No. 8
    def _check_msg_is_not_error(msg):

        if msg.error():
            LOG.error("Consumer error: %s", msg.error())
            return None

        LOG.debug("Message from topic: %s", msg.value().decode('utf-8'))

        return msg
Example No. 9
 def get_fields_types(self, field_types_row_number: int) -> List[str]:
     """
     Get the field types as a list
     Args:
         field_types_row_number (int): Row number of field types
     Returns: field types list
     """
     field_types = self._get_worksheet().row_values(field_types_row_number)
     LOG.debug("Field types:\n%s", field_types)
     return field_types
Example No. 10
 def poll_topic_and_upload_to_s3(self):
     """
     Poll at the Kafka topic at set intervals and parse and export the
     messages to S3
     Returns: None
     """
     while True:
         LOG.debug("Polling Kafka for messages")
         self.create_events_snapshot()
         LOG.debug("Upload complete, sleeping")
         time.sleep(self._polling_interval)
Example No. 11
def _validate_partition_key_list(column_dict, primary_key_column_list, partition_key_column_list):
    _validate_primary_key_list(column_dict, primary_key_column_list)
    if partition_key_column_list is None or not partition_key_column_list:
        LOG.debug("partition_key_column_list : %s\nNo partition key specified. Revert to using first column from the primary key for partitioning.",
            str(partition_key_column_list))
        return
    for key in partition_key_column_list:
        if key not in primary_key_column_list:
            raise ValidationError(
                f"The column {key} is not in the primary key list. It cannot be specified as part of the partition key",
            )
Example No. 12
    def create_table(self, table_settings):
        """
        Create a table from given settings
        Args:
            table_settings (dict): Dictionary of settings to create table

        Returns: None
        """
        table_sql = self._build_create_table_sql(table_settings)
        LOG.info(table_sql)
        self.run_query(table_sql)
Example No. 13
    def _state_manager_connect(self):

        LOG.info("Connecting to Cassandra")

        conn = CassandraConnectionManager(
            self.__settings.etl_state_manager_connection)
        conn.setup_connection(self.__settings.etl_state_manager_keyspace)

        LOG.info("Cassandra connection established")

        sync_etl_state_table()
Example No. 14
    def _verify_data_before_upsert(
            self, data: List[dict]) -> (List[dict], List[dict]):
        data, issues = map(list,
                           zip(*[self._sanitise_data(dat) for dat in data]))

        if len(issues) > 0:
            LOG.warning("Issues found in verification, number of issues: %i",
                        len(issues))

        # Remove None from the List
        return [i for i in data if i], [i for i in issues if i]
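
A minimal sketch of the unzip pattern used above: each _sanitise_data call yields a (record, issue) pair, zip(*pairs) splits the pairs into two parallel lists, and the Nones are then filtered out. The sample pairs are made up.

pairs = [({"id": 1}, None), (None, {"error": "not ready"})]

data, issues = map(list, zip(*pairs))
clean_data = [i for i in data if i]      # [{'id': 1}]
real_issues = [i for i in issues if i]   # [{'error': 'not ready'}]
print(clean_data, real_issues)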
Example No. 15
    def get_msgs(self):
        """Get the latest messages from the Kafka topic

        Returns list(Message) : list of Kafka Messages
        """

        LOG.debug("Getting messages from topic %s", self._topic)

        if not self._subscribed_to_topic:
            self._subscribe_consumer()

        return self.poll_kafka_for_messages()
Example No. 16
 def get_all_keys(self, key_prefix: str) -> List[str]:
     """
     Sense all keys under a given key prefix
     Args:
         key_prefix (str): the key prefix under which all files will be sensed
     Returns: List[str]
     """
     LOG.info("sensing files from s3://%s/%s ", self.bucket, key_prefix)
     metadata = self.get_object_metadata(key_prefix)
     lines = [file.key for file in metadata]
     LOG.info("found %s s3 keys", len(lines))
     return lines
Example No. 17
    def _partition_data_and_upload_to_s3(self, data_list, interval):
        """
        Partitions the messages by time in a dataframe, and then uploads to s3

        """
        if data_list:

            for msg_df, key, file_name in \
                    self.partition_msgs_by_kafka_ts(data_list, interval):
                LOG.debug("data path : %s/%s", key, file_name)

                self._s3_client.upload_dataframe_as_parquet(
                    dataframe=msg_df, key=key, file_name=file_name)
Example No. 18
 def _upsert_data_frame(self, data_frame):
     if self.__settings.destination_batch_size > 1:
         LOG.info("Going to upsert batches of size %s", self.__settings.destination_batch_size)
         result = self._get_cassandra_util().upsert_dataframe_in_batches(
             dataframe=data_frame,
             table=self.__settings.destination_table,
             batch_size=self.__settings.destination_batch_size)
     else:
         LOG.info("Going to upsert one row at a time")
         result = self._get_cassandra_util().upsert_dataframe(
             dataframe=data_frame,
             table=self.__settings.destination_table)
     return result
Example No. 19
    def create_events_snapshot(self):
        """
        Get Kafka messages from a topic and export to s3

        Returns: None

        """

        msgs = self._kafka_poller.get_msgs()
        LOG.debug("Json messages : %s", msgs)

        self._kafka_s3_exporter.parse_and_export_msgs(msgs,
                                                      self._polling_interval)
Example No. 20
 def execute(self, query: str, row_factory: callable, **kwargs) -> Result:
     """
     Execute a cql command and retrieve data with the row factory
     Args:
         query (str): the cql query to execute
         row_factory (callable): row factory to set on the session before executing; pass None to keep the current one
         **kwargs: Kwargs to match the session.execute command in cassandra
     Returns: ResultSet
     """
     LOG.debug("Executing query: %s", query)
     if row_factory is not None:
         self._session.row_factory = row_factory
     return self._session.execute(query, **kwargs)
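
A hypothetical usage sketch: dict_factory ships with the DataStax driver and makes each row come back as a plain dict; `conn` stands in for an instance of the class above and the query is made up.

from cassandra.query import dict_factory

rows = conn.execute(
    "SELECT * FROM analytics.page_views LIMIT 10",  # hypothetical query
    row_factory=dict_factory)
for row in rows:
    print(row)  # each row is a dict keyed by column name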
Example No. 21
 def delete_recursive(self, key_prefix: str) -> None:
     """
     Recursively delete all keys with given prefix from the named bucket
     Args:
         key_prefix (str): Key prefix under which all files will be deleted
     Returns: None
     """
     if not key_prefix.endswith("/"):
         key_prefix = f"{key_prefix}/"
     LOG.info("Recursively deleting s3://%s/%s", self.bucket, key_prefix)
     response = self.get_resource().Bucket(self.bucket).objects.filter(
         Prefix=key_prefix).delete()
     LOG.info(response)
Example No. 22
    def _sanitise_data(self, dat):
        try:

            current_state = self._get_sink_manager(dat).current_state()

            LOG.debug("Current state of sink manager %s", current_state)

            if current_state == EtlStates.Ready:
                LOG.debug("Record in ready state with data: %s", dat)

                return dat, None
            else:

                LOG.debug(
                    "Sink state found to be not ready, state is %s, "
                    "the data is: %s", current_state, dat)

                return None, _get_structured_issue(
                    f"Current state is {current_state}", dat)
        except ValidationError as e:
            LOG.warning(
                "Issue while trying to ready a record for the upload \n %s \n %s",
                e, dat)
            return None, _get_structured_issue(str(e), dat)
Example No. 23
 def rename_file(self, key: str, new_file_name: str) -> None:
     """
     Rename a file on s3
     Args:
         key: Current key of the file
         new_file_name: target file name
     Returns: None
     """
     s3 = self.get_resource()
     full_new_file_path = key.rpartition('/')[0] + '/' + new_file_name
     LOG.info("Renaming source: %s to %s", key, full_new_file_path)
     s3.Object(self.bucket, full_new_file_path).copy_from(
         CopySource={'Bucket': self.bucket, 'Key': key})
     s3.Object(self.bucket, key).delete()
Example No. 24
    def add_versions_from_json_file(self, version_file_location):
        """
        Load a json file from local storage and append these versions to the
        version tracking dictionary
        Args:
            version_file_location (str): Path to the json file

        Returns: None

        """
        try:
            with open(version_file_location, "r") as content:
                file_content = content.read()
                version_dict_from_file = json.loads(file_content)

                self.add_dictionary_to_versions(version_dict_from_file)

        except FileNotFoundError as fnf:
            log.error("No versioning file found at : %s",
                      version_file_location)
            raise fnf

        except json.decoder.JSONDecodeError as decode_error:

            log.error("JSON file failed to decode, check version dict is "
                      "correctly formatted")

            raise decode_error

        except Exception as exception:
            log.error("unknown error")
            log.error(traceback.format_exc())
            raise exception
Example No. 25
def _multi_process_upload_file(settings: AwsConnectionSettings, filename: str, bucket: str,
                               key: str) -> None:
    """
    A standalone copy of the method making it simple to pickle in a multi processing pool
    Args:
        settings: the s3 connection settings to use for upload
        filename: local file name of the file to be uploaded.
        bucket: the s3 bucket to upload file to.
        key: the s3 key to use while uploading the file
    Returns: None
    """
    LOG.info("Uploading File %s to s3://%s/%s", filename, bucket, key)
    S3Util(
        conn=AwsConnectionManager(settings),
        bucket=bucket
    ).upload_file(local_file_path=filename, key=key)
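
A hedged usage sketch: because the function above is module-level it pickles cleanly, so it can be fanned out with multiprocessing; the settings object, bucket, and file list below are assumed to exist.

from multiprocessing import Pool

def upload_files_in_parallel(settings, files_and_keys, bucket, processes=4):
    # files_and_keys: iterable of (local_filename, s3_key) pairs (hypothetical)
    args = [(settings, filename, bucket, key) for filename, key in files_and_keys]
    with Pool(processes=processes) as pool:
        pool.starmap(_multi_process_upload_file, args)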
Example No. 26
def get_page_as_list_of_dict(page: dict) -> List[OrderedDict]:
    """
    Converts a list of entries from google adwords response into a list of Ordered Dictionaries
    Args:
        page (dict): the response page from google adwords api
    Returns: List[dict]
    """
    result = []
    if 'entries' in page:
        entries = page['entries']
        # These entries are a list of zeep Objects that need conversion to Dict
        result = [zeep_object_to_dict(entry) for entry in entries]
        LOG.debug("The result from the adword API: %s", result)
    else:
        LOG.info('No entries were found.')
    return result
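
The project's zeep_object_to_dict helper is not shown here; a plausible sketch, assuming it wraps zeep.helpers.serialize_object, would be:

from collections import OrderedDict
from zeep.helpers import serialize_object

def zeep_object_to_dict(entry) -> OrderedDict:
    # walk the zeep object graph and return plain (ordered) dict containers
    return serialize_object(entry, target_cls=OrderedDict)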
Example No. 27
 def add_partitions(self):
     """
     Add the current Data Transfer's partition to Athena's Metadata
     Returns: None
     """
     if self.__settings.is_partitioned_table:
         athena_util = self._get_athena_util()
         athena_util.add_partitions(
             table=self.__settings.target_table,
             partition_keys=[
                 key for (key, value) in self.__settings.partition_values
             ],
             partition_values=[
                 value for (key, value) in self.__settings.partition_values
             ])
     else:
         LOG.warning("The table is not partitioned, this is a NOOP")
Example No. 28
    def parse_and_export_msgs(self, list_of_msgs, interval):
        """
        Converts messages to a pandas dataframe and then exports to s3

        Args:
            list_of_msgs (list(Kafka Message Object)): List of msg objects
            interval (int): Rounding interval for the temporal partitioning
        Returns: None

        """

        good_data, bad_data = convert_msgs_to_dictionary(list_of_msgs)

        self._partition_data_and_upload_to_s3(good_data, interval)
        self._partition_data_and_upload_to_s3(bad_data, interval)

        LOG.info("Data Upload Complete")
Example No. 29
 def download_directory(self, source_key: str, file_suffix: str, local_directory: str) -> None:
     """
     Download an entire directory from s3 onto local file system
     Args:
         source_key (str): key prefix of the directory to be downloaded from s3
         file_suffix (str): suffix to filter a subset under the source_key to be downloaded
         local_directory (str): local absolute path to store all the files
     Returns: None
     """
     s3 = self.get_resource()
     LOG.info("Downloading s3://%s/%s to %s", self.bucket, source_key, local_directory)
     for obj in s3.Bucket(self.bucket).objects.filter(Prefix=source_key):
         key_path = obj.key.split("/")
         if obj.key.endswith(file_suffix):
             filename = f"{local_directory}/{key_path[-1]}"
             self.download_file(
                 local_file_path=filename,
                 key=obj.key)
Example No. 30
 def delete_recursive_match_suffix(self, key_prefix: str, suffix: str) -> None:
     """
     Recursively delete all keys with given key prefix and suffix from the bucket
     Args:
         key_prefix (str): Key prefix under which all files will be deleted.
         suffix (str): suffix of the subset of files in the given prefix directory to be deleted
     Returns: None
     """
     if not key_prefix:
         raise ValueError("key_prefix must not be empty")
     if not suffix:
         raise ValueError("suffix must not be empty")
     s3 = self.get_resource()
     for obj in s3.Bucket(self.bucket).objects.filter(Prefix=key_prefix):
         if obj.key.endswith(suffix):
             LOG.info("deleting s3://%s/%s", self.bucket, obj.key)
             response = obj.delete()
             LOG.info("Response: %s ", response)