Example 1
    def _dataframe_to_cassandra_ddl(self,
                                    data_frame: DataFrame,
                                    primary_key_column_list: List[str],
                                    partition_key_column_list: List[str],
                                    table_name: str,
                                    table_options_statement: str = ""):
        """
        Generates a 'create table' cql statement with primary keys (using compound keys for the partition)
        Args:
            data_frame (DataFrame): the dataframe whose columns define the table schema
            primary_key_column_list (List[str]): columns to be used as the primary key
            partition_key_column_list (List[str]): columns to be used to partition the data
            table_name (str): name of the table to create
            table_options_statement (str): optional CQL table options appended to the statement

        Returns: str
        """
        column_list = _cql_manage_column_lists(data_frame, primary_key_column_list, partition_key_column_list)
        # create list of partition keys from first column of the primary key if not specified
        partition_key_column_list = partition_key_column_list if partition_key_column_list is not None and \
            len(partition_key_column_list) > 0 else [primary_key_column_list[0]]
        partition_key = ["(" + ", ".join(partition_key_column_list) + ")"]
        # create list of cluster keys from the remainder of the primary key columns
        clustering_key_column_list = [x for x in primary_key_column_list if x not in partition_key_column_list]
        cluster_keys = [", ".join(clustering_key_column_list)] if len(clustering_key_column_list) > 0 else []
        cql = f"""
        CREATE TABLE IF NOT EXISTS {self.keyspace}.{table_name} (
            {", ".join(column_list)},
            PRIMARY KEY ({", ".join(partition_key + cluster_keys)})
            )
        {table_options_statement};
        """
        LOG.debug(cql)
        return cql
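For context, the PRIMARY KEY clause produced above wraps the partition key columns in parentheses and appends the remaining primary key columns as clustering keys. A standalone sketch of that assembly, using hypothetical column names:

primary_key_column_list = ["customer_id", "order_ts"]   # hypothetical columns
partition_key_column_list = ["customer_id"]

partition_key = ["(" + ", ".join(partition_key_column_list) + ")"]
clustering_key_column_list = [x for x in primary_key_column_list
                              if x not in partition_key_column_list]
cluster_keys = [", ".join(clustering_key_column_list)] if clustering_key_column_list else []

# Renders the compound key clause used inside the CREATE TABLE statement above
print("PRIMARY KEY ({})".format(", ".join(partition_key + cluster_keys)))
# PRIMARY KEY ((customer_id), order_ts)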
Example 2
def sync_etl_state_table():
    """
    Utility method to sync (create) the table as per the ORM model
    Returns: None
    """
    LOG.debug("Sinking Cassandra Table using model")
    sync_table(EtlSinkRecordState)
Example 3
    def generate_partitioned_dataframes(self, df_msgs_and_meta_data):
        """
        Generates the batched dataframes to upload to s3
        Args:
            df_msgs_and_meta_data (DataFrame): dataframe to be partitioned
        Returns (List[Tuple[DataFrame, str, str]]): list of triples (df, s3_dir, file_name)
        """
        dataframes_and_fld_locs = []
        for cur_interval_np_datetime in \
                pd.unique(df_msgs_and_meta_data[self.partition_key_nm]):
            df_partition = \
                df_msgs_and_meta_data[
                    df_msgs_and_meta_data[self.partition_key_nm] ==
                    cur_interval_np_datetime]

            cur_interval_ts = pd.to_datetime(cur_interval_np_datetime)

            batch_file_path = "{}/date_of_batch={}/time_of_batch={}".format(
                self.root_path, cur_interval_ts.strftime('%Y%m%d'),
                cur_interval_ts.strftime('%H%M%S'))

            file_nm = generate_snapshot_file_name_with_timestamp()

            LOG.debug("Data path : %s", batch_file_path)

            dataframes_and_fld_locs.append(
                (df_partition, batch_file_path, file_nm))

        return dataframes_and_fld_locs
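As an aside, the batch_file_path above encodes each partition timestamp as date and time folders. A minimal, self-contained sketch with an assumed root path and timestamp value:

import numpy as np
import pandas as pd

root_path = "some-root-prefix"  # hypothetical value of self.root_path
cur_interval_np_datetime = np.datetime64("2021-06-01T09:30:00")

cur_interval_ts = pd.to_datetime(cur_interval_np_datetime)
batch_file_path = "{}/date_of_batch={}/time_of_batch={}".format(
    root_path, cur_interval_ts.strftime('%Y%m%d'), cur_interval_ts.strftime('%H%M%S'))

print(batch_file_path)  # some-root-prefix/date_of_batch=20210601/time_of_batch=093000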
Example 4
 def _cql_upsert_from_dataframe(self, dataframe: DataFrame, table: str):
     """
     Generates a prepared-statement style CQL upsert (INSERT) for the columns of a dataframe
     Args:
         dataframe (DataFrame): dataframe whose column names form the insert column list
         table (str): the table to upsert into
     Returns: str
     """
     upsert_sql = f"""
     INSERT INTO {self.keyspace}.{table} 
     ({", ".join(list(dataframe.columns.values))}) 
     VALUES ({", ".join(['?' for key in dataframe.columns.values])});
         """
     LOG.debug(upsert_sql)
     return upsert_sql
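For illustration only, with a hypothetical keyspace, table and dataframe, the method above renders a prepared-statement style INSERT with one ? placeholder per column:

import pandas as pd

keyspace, table = "ks", "orders"  # hypothetical names
dataframe = pd.DataFrame({"id": [1], "amount": [9.99]})

upsert_sql = (
    f"INSERT INTO {keyspace}.{table} "
    f"({', '.join(list(dataframe.columns.values))}) "
    f"VALUES ({', '.join(['?' for _ in dataframe.columns.values])});"
)
print(upsert_sql)  # INSERT INTO ks.orders (id, amount) VALUES (?, ?);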
Example 5
 def _cql_upsert_from_dict(self, data: dict, table: str):
     """
     Generates a prepared-statement style CQL upsert (INSERT) for the keys of a dict
     Args:
         data (dict): dict whose keys form the insert column list
         table (str): the table to upsert into
     Returns: str
     """
     upsert_sql = f"""
     INSERT INTO {self.keyspace}.{table} 
     ({", ".join(data)}) 
     VALUES ({", ".join(['?' for key in data])});
         """
     LOG.debug(upsert_sql)
     return upsert_sql
Example 6
    def _check_msg_is_not_error(msg):
        """
        Returns the message if it does not carry a consumer error; otherwise logs the error
        and returns None
        """
        if msg.error():
            LOG.error("Consumer error: %s", msg.error())
            return None

        LOG.debug("Message from topic: %s", msg.value().decode('utf-8'))

        return msg
Example 7
 def get_fields_types(self, field_types_row_number: int) -> List[str]:
     """
     Get the field types as a list
     Args:
         field_types_row_number (int): Row number of field types
     Returns: field types list
     """
     field_types = self._get_worksheet().row_values(field_types_row_number)
     LOG.debug("Field types:\n%s", field_types)
     return field_types
Example 8
def _validate_partition_key_list(column_dict, primary_key_column_list, partition_key_column_list):
    _validate_primary_key_list(column_dict, primary_key_column_list)
    if not partition_key_column_list:
        LOG.debug(
            "partition_key_column_list: %s\nNo partition key specified. Reverting to using the "
            "first column of the primary key for partitioning.",
            str(partition_key_column_list))
        return
    for key in partition_key_column_list:
        if key not in primary_key_column_list:
            raise ValidationError(
                f"The column {key} is not in the primary key list. It cannot be specified as part of the partition key",
            )
Example 9
 def poll_topic_and_upload_to_s3(self):
     """
     Poll the Kafka topic at set intervals, then parse and export the
     messages to S3
     Returns: None
     """
     while True:
         LOG.debug("Polling Kafka for messages")
         self.create_events_snapshot()
         LOG.debug("Upload complete, sleeping")
         time.sleep(self._polling_interval)
Example 10
    def get_msgs(self):
        """Get the latest messages from the Kafka topic

        Returns (List[Message]): list of Kafka messages
        """

        LOG.debug("Getting messages from topic %s", self._topic)

        if not self._subscribed_to_topic:
            self._subscribe_consumer()

        return self.poll_kafka_for_messages()
Example 11
    def _partition_data_and_upload_to_s3(self, data_list, interval):
        """
        Partitions the messages by time into dataframes, then uploads each partition to s3
        Args:
            data_list (list): list of Kafka messages to partition and upload
            interval: the interval used when partitioning the messages by Kafka timestamp
        """
        if data_list:

            for msg_df, key, file_name in \
                    self.partition_msgs_by_kafka_ts(data_list, interval):
                LOG.debug("data path : %s/%s", key, file_name)

                self._s3_client.upload_dataframe_as_parquet(
                    dataframe=msg_df, key=key, file_name=file_name)
Example 12
    def create_events_snapshot(self):
        """
        Get Kafka messages from a topic and export to s3

        Returns: None

        """

        msgs = self._kafka_poller.get_msgs()
        LOG.debug("Json messages : %s", msgs)

        self._kafka_s3_exporter.parse_and_export_msgs(msgs,
                                                      self._polling_interval)
Example 13
 def execute(self, query: str, row_factory: callable, **kwargs) -> Result:
     """
     Execute a CQL command and retrieve data with the row factory
     Args:
         query (str): the CQL query to execute
         row_factory (callable): row factory to set on the session before executing the query
         **kwargs: Kwargs to match the session.execute command in cassandra
     Returns: ResultSet
     """
     LOG.debug("Executing query: %s", query)
     if row_factory is not None:
         self._session.row_factory = row_factory
     return self._session.execute(query, **kwargs)
Example 14
def get_page_as_list_of_dict(page: dict) -> List[OrderedDict]:
    """
    Converts the list of entries from a Google AdWords response page into a list of ordered dictionaries
    Args:
        page (dict): the response page from the Google AdWords API
    Returns: List[OrderedDict]
    """
    result = []
    if 'entries' in page:
        entries = page['entries']
        # These entries are a list of zeep Objects that need conversion to Dict
        result = [zeep_object_to_dict(entry) for entry in entries]
        LOG.debug("The result from the adword API: %s", result)
    else:
        LOG.info('No entries were found.')
    return result
Example 15
    def select_dataframe(self, query) -> DataFrame:
        """
        Execute a SQL query using the Airflow MySQL hook and retrieve the data as a pandas DataFrame

        Args:
            query (str): Mysql compliant query string

        Returns: DataFrame

        """
        LOG.info("Executing \n %s", query)
        with closing(self._conn_mgr.get_conn()) as connection:
            df = sql.read_sql(query, connection)
        LOG.info("Sql Data frame size: %s", df.shape[0])
        LOG.debug(df.head(2))
        return df
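The same read pattern, shown as a self-contained sketch using sqlite3 in place of the Airflow MySQL hook (an assumption, for illustration only); contextlib.closing ensures the connection is closed once the frame has been read:

import sqlite3
from contextlib import closing

import pandas as pd

with closing(sqlite3.connect(":memory:")) as connection:
    connection.execute("CREATE TABLE t (id INTEGER, name TEXT)")
    connection.execute("INSERT INTO t VALUES (1, 'a'), (2, 'b')")
    df = pd.read_sql("SELECT * FROM t", connection)

print(df.shape[0])  # 2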
Example 16
    def upload_all(self) -> (List[dict], List[dict], List[dict]):
        """
        Upload all files from the Athena table onto AdWords offline conversion
        Returns (Tuple[List[dict], List[dict], List[dict]]):
            verification_issues (List[dict]): the verification failures detected
            before upload
            successes (List[dict]): the responses for successful uploads to
            the Google AdWords API
            failures (List[dict]): the responses for failed uploads to the
            Google AdWords API
        """

        verification_issues, successes, failures = [], [], []

        for key in self.list_source_files():
            issues, success, fail = \
                self._process_data_frame(self.get_data_frame(key))

            verification_issues.extend(issues)
            successes.extend(success)
            failures.extend(fail)

        if len(verification_issues) > 0:
            LOG.warning("There were %s verification failures",
                        len(verification_issues))

            LOG.debug("All verification failures: \n %s", verification_issues)

        if len(failures) > 0:
            LOG.warning(
                "There were %s failures uploading to the adwords "
                "API", len(failures))

            LOG.info("Sample Failure: \n %s", failures[0])

            LOG.debug("All failures: \n %s", failures)

        LOG.info(
            "There were %s records successfully uploaded from a total of %s submitted items",
            len(successes),
            len(successes) + len(failures) + len(verification_issues))

        return verification_issues, successes, failures
Example 17
    def _get_or_create_state(self):
        """
        Creates the ETL sink record state if it does not already exist; when the lightweight
        transaction reports an existing record, fetches and returns that record instead
        """
        try:

            LOG.debug(
                "Creating ETL record sink using etl_signature: "
                "%s, record_identifier: %s", self.etl_signature,
                self.record_identifier)

            return EtlSinkRecordState.if_not_exists().create(
                etl_signature=self.etl_signature,
                record_identifier=self.record_identifier)

        except LWTException as e:

            LOG.debug("LWTException raised: \n %s", e)

            return EtlSinkRecordState.get(
                etl_signature=e.existing['etl_signature'],
                record_identifier=e.existing['record_identifier'])
Example 18
 def prepare_batches(self,
                     prepared_statement: PreparedStatement,
                     tuples: List[tuple],
                     batch_size: int) -> List:
     """
     Prepares a list of Cassandra BatchStatements out of a list of tuples and a prepared
     statement
     Args:
         prepared_statement (PreparedStatement): the statement to be used for batching.
         tuples (list[tuple]): the data to be inserted.
         batch_size (int): limit on the number of prepared statements in the batch.
     Returns: list[BatchStatement]
     """
     batches = []
     LOG.debug("Preparing cassandra batches out of rows")
     batches_of_tuples = _chunk_list(tuples, batch_size)
     for tpl in batches_of_tuples:
         batch = self._prepare_batch(prepared_statement, tpl)
         batches.append(batch)
     LOG.info("created %s batches out of list of %s tuples", len(batches), len(tuples))
     return batches
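_chunk_list is referenced here but not shown; a minimal sketch of what such a chunking helper might look like (an assumption, not necessarily the library's implementation):

from typing import List


def _chunk_list(items: List[tuple], chunk_size: int) -> List[List[tuple]]:
    """Splits a list into consecutive chunks of at most chunk_size elements"""
    return [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]


# e.g. _chunk_list([(1,), (2,), (3,)], 2) -> [[(1,), (2,)], [(3,)]]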
Example 19
    def produce_msg(self, msg):
        """
        Produces a message to the Kafka topic

        Args:
            msg (str): string to be pushed to the Kafka topic
        Returns: None

        """
        self._instantiated_producer_if_required()

        if self.producer is not None:
            LOG.debug("Producing message on topic %s : %s", self.topic, msg)
            self.producer.produce(self.topic, msg)
        else:
            LOG.warning(
                "Kafka Connection not initialised, message not sent, "
                "message body: %s", msg)

            if self.raise_exception_on_failed_connection:
                raise NoProducerInstantiatedError
Example 20
    def _sanitise_data(self, dat):
        """
        Checks the sink state of the record; returns (dat, None) when the record is in the
        Ready state, otherwise (None, structured_issue) explaining why the record was skipped
        """
        try:

            current_state = self._get_sink_manager(dat).current_state()

            LOG.debug("Current state of sink manager %s", current_state)

            if current_state == EtlStates.Ready:
                LOG.debug("Record in ready state with data: %s", dat)

                return dat, None
            else:

                LOG.debug(
                    "Sink state found to be not ready, state is %s, the "
                    "data is: "
                    "%s", current_state, dat)

                return None, _get_structured_issue(
                    f"Current state is {current_state} "
                    "state", dat)
        except ValidationError as e:
            LOG.warning(
                "Issue while trying to ready a record for the upload \n %s \n %s",
                e, dat)
            return None, _get_structured_issue(str(e), dat)
Example 21
 def create_athena_table(self) -> None:
     """
     Creates an Athena table on top of the transferred data
     Returns: None
     """
     athena_util = self._get_athena_util()
     s3_util = self._get_s3_util()
     keys = s3_util.get_keys(key_prefix=self._final_target_prefix)
     LOG.debug("gathered files transferred under this ETL %s", keys)
     if keys:
         data = s3_util.download_parquet_as_dataframe(keys[0])
         LOG.info(
             "Downloaded parquet file from s3 to construct Athena create table statement: %s "
             "\n made dataframe of shape %s", keys[0], data.shape)
         if self.__settings.target_table_ddl_progress:
             athena_util.drop_table(self.__settings.target_table)
         athena_table_settings = self._construct_athena_table_settings(data)
         athena_util.create_table(table_settings=athena_table_settings)
     else:
         raise ValueError(
             "No Data has been uploaded to target directory, please load data first, "
             "before creating Athena table")
Example 22
    def upload_dataframe_as_parquet(self,
                                    dataframe: DataFrame,
                                    key: str,
                                    file_name: str = "data",
                                    **kwargs) -> None:
        """
        Exports a dataframe to a parquet file on s3
        Args:
            dataframe (DataFrame): dataframe to export
            key (str): The path on s3 to upload the file to (excluding bucket name and file name)
            file_name (str): the name of the file at destination
        Returns: None
        """
        LOG.debug(
            "Uploading the dataframe as parquet\nColumn names of the dataframe: %s\nTop 2 rows of "
            "the dataframe: %s\nShape of the dataframe: %s",
            list(dataframe), dataframe.head(2), dataframe.shape)

        tmp_file = NamedTemporaryFile(delete=False)
        destination = f"{key}/{file_name}.parquet"
        dataframe.to_parquet(fname=tmp_file.name, **kwargs)
        self.upload_file(local_file_path=tmp_file.name, key=destination, remove_local=True)
Example 23
    def instantiate_producer(self):
        """
        Try to connect to the Kafka bootstrap server. A failed connection is allowed to
        happen silently unless raise_exception_on_failed_connection is set

        Returns: None (the instantiated Producer, if any, is stored on self.producer)

        """
        try:
            LOG.debug("Instantiating Producer")
            self.producer = create_producer(self.producer_config)
            # Check the connection works by polling for messages
            LOG.debug("Polling Queue")
            self.producer.poll(3)
            LOG.info("Succesfully polled Kafka Queue")

        except Exception as exception:
            self.producer = None
            LOG.error("Kafka Producer failed to instantiate: \n %s", exception)

            if self.raise_exception_on_failed_connection:
                raise NoProducerInstantiatedError()
Example 24
 def _execute_batch(self, batch):
     """Executes a batch statement on the session with an extended timeout"""
     LOG.debug("Executing query: %s", batch)
     return self._session.execute(batch, timeout=300.0)