def _dataframe_to_cassandra_ddl(self,
                                data_frame: DataFrame,
                                primary_key_column_list: List[str],
                                partition_key_column_list: List[str],
                                table_name: str,
                                table_options_statement: str = ""):
    """
    Generates a 'create table' cql statement with primary keys (using compound keys for the
    partition)
    Args:
        data_frame (DataFrame): the dataframe whose columns define the table schema
        primary_key_column_list (list[str]): list of columns specifying the columns to be used
            as the primary key
        partition_key_column_list (list[str]): list of columns specifying the columns to be used
            to partition the data
        table_name (str): name of the table to create
        table_options_statement (str): optional CQL table options appended to the statement
    Returns: str
    """
    column_list = _cql_manage_column_lists(data_frame, primary_key_column_list,
                                           partition_key_column_list)

    # create list of partition keys from first column of the primary key if not specified
    partition_key_column_list = partition_key_column_list \
        if partition_key_column_list is not None and len(partition_key_column_list) > 0 \
        else [primary_key_column_list[0]]
    partition_key = ["(" + ", ".join(partition_key_column_list) + ")"]

    # create list of cluster keys from the remainder of the primary key columns
    clustering_key_column_list = [x for x in primary_key_column_list
                                  if x not in partition_key_column_list]
    cluster_keys = [", ".join(clustering_key_column_list)] if clustering_key_column_list else []

    cql = f"""
    CREATE TABLE IF NOT EXISTS {self.keyspace}.{table_name}
    (
        {", ".join(column_list)},
        PRIMARY KEY ({", ".join(partition_key + cluster_keys)})
    ) {table_options_statement};
    """
    LOG.debug(cql)
    return cql
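# Illustrative, standalone sketch (not part of the class above): how the PRIMARY KEY clause is
# assembled when an explicit partition key is given. The column names here are made up.
example_primary_key_columns = ["customer_id", "order_id", "created_at"]
example_partition_key_columns = ["customer_id", "order_id"]

example_partition_key = ["(" + ", ".join(example_partition_key_columns) + ")"]
example_clustering_columns = [x for x in example_primary_key_columns
                              if x not in example_partition_key_columns]
example_cluster_keys = [", ".join(example_clustering_columns)] if example_clustering_columns else []

# Prints: PRIMARY KEY ((customer_id, order_id), created_at)
print("PRIMARY KEY ({})".format(", ".join(example_partition_key + example_cluster_keys)))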
def sync_etl_state_table():
    """
    Utility method to sync (create) the table as per the ORM model
    Returns: None
    """
    LOG.debug("Syncing Cassandra table using model")
    sync_table(EtlSinkRecordState)
def generate_partitioned_dataframes(self, df_msgs_and_meta_data):
    """
    Generates the batched dataframes to upload to s3
    Args:
        df_msgs_and_meta_data (DataFrame): dataframe to be partitioned
    Returns (List[Tuple[DataFrame, str, str]]): list of triples (df, s3_dir, file_name)
    """
    dataframes_and_fld_locs = []
    for cur_interval_np_datetime in \
            pd.unique(df_msgs_and_meta_data[self.partition_key_nm]):

        df_partition = \
            df_msgs_and_meta_data[
                df_msgs_and_meta_data[self.partition_key_nm] == cur_interval_np_datetime]

        cur_interval_ts = pd.to_datetime(cur_interval_np_datetime)

        batch_file_path = "{}/date_of_batch={}/time_of_batch={}".format(
            self.root_path,
            cur_interval_ts.strftime('%Y%m%d'),
            cur_interval_ts.strftime('%H%M%S'))

        file_nm = generate_snapshot_file_name_with_timestamp()

        LOG.debug("Data path : %s", batch_file_path)

        dataframes_and_fld_locs.append((df_partition, batch_file_path, file_nm))

    return dataframes_and_fld_locs
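# Illustrative, standalone sketch: the S3 prefix layout that generate_partitioned_dataframes
# builds for a single partition timestamp. The root path and timestamp below are made up.
import pandas as pd

example_root_path = "kafka-export/events"
example_ts = pd.to_datetime("2021-05-01 10:15:30")
example_batch_file_path = "{}/date_of_batch={}/time_of_batch={}".format(
    example_root_path,
    example_ts.strftime('%Y%m%d'),
    example_ts.strftime('%H%M%S'))
# example_batch_file_path == "kafka-export/events/date_of_batch=20210501/time_of_batch=101530"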
def _cql_upsert_from_dataframe(self, dataframe: DataFrame, table: str):
    """
    Generates a parameterised INSERT (upsert) cql statement from the dataframe's columns
    Args:
        dataframe (DataFrame): dataframe whose columns are used in the statement
        table (str): name of the table to upsert into
    Returns: str
    """
    upsert_sql = f"""
    INSERT INTO {self.keyspace}.{table}
    ({", ".join(list(dataframe.columns.values))})
    VALUES ({", ".join(['?' for _ in dataframe.columns.values])});
    """
    LOG.debug(upsert_sql)
    return upsert_sql
def _cql_upsert_from_dict(self, data: dict, table: str):
    """
    Generates a parameterised INSERT (upsert) cql statement from the dict's keys
    Args:
        data (dict): dictionary whose keys are used as column names
        table (str): name of the table to upsert into
    Returns: str
    """
    upsert_sql = f"""
    INSERT INTO {self.keyspace}.{table}
    ({", ".join(data)})
    VALUES ({", ".join(['?' for _ in data])});
    """
    LOG.debug(upsert_sql)
    return upsert_sql
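# Illustrative, standalone sketch of the statement shape _cql_upsert_from_dict produces;
# the keyspace, table and record below are hypothetical.
example_keyspace, example_table = "my_keyspace", "my_table"
example_data = {"id": 1, "name": "alice", "updated_at": "2021-05-01"}
example_upsert_sql = f"""
INSERT INTO {example_keyspace}.{example_table}
({", ".join(example_data)})
VALUES ({", ".join(['?' for _ in example_data])});
"""
# Produces: INSERT INTO my_keyspace.my_table (id, name, updated_at) VALUES (?, ?, ?);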
def _check_msg_is_not_error(msg):
    """Returns the Kafka message if it is not an error, otherwise logs the error and returns None"""
    if msg.error():
        LOG.error("Consumer error: %s", msg.error())
        return None
    LOG.debug("Message from topic: %s", msg.value().decode('utf-8'))
    return msg
def get_fields_types(self, field_types_row_number: int) -> List[str]:
    """
    Get the field types as a list
    Args:
        field_types_row_number (int): row number of the field types
    Returns: field types list
    """
    field_types = self._get_worksheet().row_values(field_types_row_number)
    LOG.debug("Field types:\n%s", field_types)
    return field_types
def _validate_partition_key_list(column_dict, primary_key_column_list, partition_key_column_list):
    """
    Validates that every partition key column is part of the primary key
    """
    _validate_primary_key_list(column_dict, primary_key_column_list)
    if partition_key_column_list is None or not partition_key_column_list:
        LOG.debug(
            "partition_key_column_list : %s\nNo partition key specified. Reverting to using the "
            "first column of the primary key for partitioning.",
            str(partition_key_column_list))
        return
    for key in partition_key_column_list:
        if key not in primary_key_column_list:
            raise ValidationError(
                f"The column {key} is not in the primary key list. It cannot be specified as "
                f"part of the partition key")
def poll_topic_and_upload_to_s3(self):
    """
    Polls the Kafka topic at set intervals, then parses and exports the messages to S3
    Returns: None
    """
    while True:
        LOG.debug("Polling Kafka for messages")
        self.create_events_snapshot()
        LOG.debug("Upload complete, sleeping")
        time.sleep(self._polling_interval)
def get_msgs(self):
    """
    Get the latest messages from the Kafka topic
    Returns: list(Message): list of Kafka messages
    """
    LOG.debug("Getting messages from topic %s", self._topic)
    if not self._subscribed_to_topic:
        self._subscribe_consumer()
    return self.poll_kafka_for_messages()
def _partition_data_and_upload_to_s3(self, data_list, interval):
    """
    Partitions the messages by time in a dataframe, and then uploads to s3
    """
    if data_list:
        for msg_df, key, file_name in \
                self.partition_msgs_by_kafka_ts(data_list, interval):
            LOG.debug("data path : %s/%s", key, file_name)
            self._s3_client.upload_dataframe_as_parquet(
                dataframe=msg_df, key=key, file_name=file_name)
def create_events_snapshot(self):
    """
    Get Kafka messages from a topic and export to s3
    Returns: None
    """
    msgs = self._kafka_poller.get_msgs()
    LOG.debug("Json messages : %s", msgs)
    self._kafka_s3_exporter.parse_and_export_msgs(msgs, self._polling_interval)
def execute(self, query: str, row_factory: callable, **kwargs) -> Result:
    """
    Execute a cql command and retrieve data with the row factory
    Args:
        query (str): the cql query to execute
        row_factory (callable): row factory applied to the session before execution
        **kwargs: kwargs to match the session.execute command in cassandra
    Returns: ResultSet
    """
    LOG.debug("Executing query: %s", query)
    if row_factory is not None:
        self._session.row_factory = row_factory
    return self._session.execute(query, **kwargs)
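# Hypothetical usage of execute() with the cassandra-driver's dict_factory, so each row comes
# back as a plain dict. Assumes `cassandra_util` is an instance of the surrounding utility class
# with a live session; the keyspace and table names are made up.
from cassandra.query import dict_factory

rows = cassandra_util.execute(
    "SELECT * FROM my_keyspace.my_table LIMIT 10",
    row_factory=dict_factory)
for row in rows:
    print(row)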
def get_page_as_list_of_dict(page: dict) -> List[OrderedDict]:
    """
    Converts a list of entries from a google adwords response into a list of ordered dictionaries
    Args:
        page (dict): the response page from the google adwords api
    Returns: List[OrderedDict]
    """
    result = []
    if 'entries' in page:
        entries = page['entries']
        # These entries are a list of zeep objects that need conversion to dict
        result = [zeep_object_to_dict(entry) for entry in entries]
        LOG.debug("The result from the adwords API: %s", result)
    else:
        LOG.info('No entries were found.')
    return result
def select_dataframe(self, query) -> DataFrame:
    """
    Executes an SQL query using the Airflow mysql hook and retrieves the data in a pandas
    dataframe
    Args:
        query (str): mysql compliant query string
    Returns: DataFrame
    """
    LOG.info("Executing \n %s", query)
    with closing(self._conn_mgr.get_conn()) as connection:
        df = sql.read_sql(query, connection)
    LOG.info("Sql Data frame size: %s", df.shape[0])
    LOG.debug(df.head(2))
    return df
def upload_all(self) -> (List[dict], List[dict], List[dict]):
    """
    Upload all files from the Athena table onto AdWords offline conversion
    Returns: a tuple of lists:
        verification_issues (List[dict]): any verification failures
        successes (List[dict]): the responses for successful uploads to the Google Adwords API
        failures (List[dict]): the responses for failed uploads to the Google Adwords API
    """
    verification_issues, successes, failures = [], [], []
    for key in self.list_source_files():
        issues, success, fail = \
            self._process_data_frame(self.get_data_frame(key))
        verification_issues.extend(issues)
        successes.extend(success)
        failures.extend(fail)

    if len(verification_issues) > 0:
        LOG.warning("There were %s verification failures", len(verification_issues))
        LOG.debug("All verification failures: \n %s", verification_issues)

    if len(failures) > 0:
        LOG.warning("There were %s failures uploading to the adwords API", len(failures))
        LOG.info("Sample Failure: \n %s", failures[0])
        LOG.debug("All failures: \n %s", failures)

    LOG.info(
        "There were %s records successfully uploaded from a total of %s submitted items",
        len(successes),
        len(successes) + len(failures) + len(verification_issues))
    return verification_issues, successes, failures
def _get_or_create_state(self):
    """
    Gets the existing sink record state, or creates it if it does not yet exist
    """
    try:
        LOG.debug(
            "Creating ETL record sink using etl_signature: %s, record_identifier: %s",
            self.etl_signature, self.record_identifier)
        return EtlSinkRecordState.if_not_exists().create(
            etl_signature=self.etl_signature,
            record_identifier=self.record_identifier)
    except LWTException as e:
        LOG.debug("LWTException raised: \n %s", e)
        return EtlSinkRecordState.get(
            etl_signature=e.existing['etl_signature'],
            record_identifier=e.existing['record_identifier'])
def prepare_batches(self,
                    prepared_statement: PreparedStatement,
                    tuples: List[tuple],
                    batch_size: int) -> List:
    """
    Prepares a list of cassandra batched statements out of a list of tuples and a prepared
    statement
    Args:
        prepared_statement (PreparedStatement): the statement to be used for batching.
        tuples (list[tuple]): the data to be inserted.
        batch_size (int): limit on the number of prepared statements in the batch.
    Returns: list[BatchStatement]
    """
    batches = []
    LOG.debug("Preparing cassandra batches out of rows")
    batches_of_tuples = _chunk_list(tuples, batch_size)
    for tpl in batches_of_tuples:
        batch = self._prepare_batch(prepared_statement, tpl)
        batches.append(batch)
    LOG.info("created %s batches out of list of %s tuples", len(batches), len(tuples))
    return batches
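# Hypothetical end-to-end sketch combining _cql_upsert_from_dataframe, prepare_batches and
# _execute_batch. Assumes `cassandra_util` is an instance of the surrounding utility class with
# a connected session; the dataframe, table name and batch size below are made up.
import pandas as pd

example_df = pd.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]})
upsert_cql = cassandra_util._cql_upsert_from_dataframe(example_df, "my_table")
prepared = cassandra_util._session.prepare(upsert_cql)
for batch in cassandra_util.prepare_batches(
        prepared,
        [tuple(row) for row in example_df.itertuples(index=False)],
        batch_size=2):
    cassandra_util._execute_batch(batch)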
def produce_msg(self, msg):
    """
    Produces a message to the Kafka topic
    Args:
        msg (str): the string to be pushed to the Kafka topic
    Returns: None
    """
    self._instantiated_producer_if_required()
    if self.producer is not None:
        LOG.debug("Producing message on topic %s : %s", self.topic, msg)
        self.producer.produce(self.topic, msg)
    else:
        LOG.warning(
            "Kafka connection not initialised, message not sent, message body: %s", msg)
        if self.raise_exception_on_failed_connection:
            raise NoProducerInstantiatedError
def _sanitise_data(self, dat):
    """
    Checks the sink state of a record and returns it for upload only if the state is Ready
    """
    try:
        current_state = self._get_sink_manager(dat).current_state()
        LOG.debug("Current state of sink manager %s", current_state)
        if current_state == EtlStates.Ready:
            LOG.debug("Record in ready state with data: %s", dat)
            return dat, None
        else:
            LOG.debug(
                "Sink state found to be not ready, state is %s, the data is: %s",
                current_state, dat)
            return None, _get_structured_issue(
                f"Current state is {current_state} state", dat)
    except ValidationError as e:
        LOG.warning(
            "Issue while trying to ready a record for the upload \n %s \n %s", e, dat)
        return None, _get_structured_issue(str(e), dat)
def create_athena_table(self) -> None:
    """
    Creates an athena table on top of the transferred data
    Returns: None
    """
    athena_util = self._get_athena_util()
    s3_util = self._get_s3_util()
    keys = s3_util.get_keys(key_prefix=self._final_target_prefix)
    LOG.debug("Gathered files transferred under this ETL %s", keys)
    if keys:
        data = s3_util.download_parquet_as_dataframe(keys[0])
        LOG.info(
            "Downloaded parquet file from s3 to construct the Athena create table statement: %s"
            "\nMade a dataframe of shape %s", keys[0], data.shape)
        if self.__settings.target_table_ddl_progress:
            athena_util.drop_table(self.__settings.target_table)
        athena_table_settings = self._construct_athena_table_settings(data)
        athena_util.create_table(table_settings=athena_table_settings)
    else:
        raise ValueError(
            "No data has been uploaded to the target directory, please load data first "
            "before creating the Athena table")
def upload_dataframe_as_parquet(self,
                                dataframe: DataFrame,
                                key: str,
                                file_name: str = "data",
                                **kwargs) -> None:
    """
    Exports a dataframe to a parquet file on s3
    Args:
        dataframe (DataFrame): dataframe to export
        key (str): the path on s3 to upload the file to (excluding bucket name and file name)
        file_name (str): the name of the file at the destination
    Returns: None
    """
    LOG.debug(
        "Uploading the dataframe as parquet\nColumn names of the dataframe: %s\nTop 2 rows of "
        "the dataframe: %s\nShape of the dataframe: %s",
        list(dataframe), dataframe.head(2), dataframe.shape)
    tmp_file = NamedTemporaryFile(delete=False)
    destination = f"{key}/{file_name}.parquet"
    dataframe.to_parquet(fname=tmp_file.name, **kwargs)
    self.upload_file(local_file_path=tmp_file.name, key=destination, remove_local=True)
def instantiate_producer(self):
    """
    Tries to connect to the Kafka bootstrap server; failure to connect to the queue can be
    allowed to happen silently
    Returns: None
    """
    try:
        LOG.debug("Instantiating Producer")
        self.producer = create_producer(self.producer_config)
        # Check the connection works by polling for messages
        LOG.debug("Polling Queue")
        self.producer.poll(3)
        LOG.info("Successfully polled the Kafka queue")
    except Exception as exception:
        self.producer = None
        LOG.error("Kafka Producer failed to instantiate: \n %s", exception)
        if self.raise_exception_on_failed_connection:
            raise NoProducerInstantiatedError()
def _execute_batch(self, batch):
    LOG.debug("Executing query: %s", batch)
    return self._session.execute(batch, timeout=300.0)