Example #1
0
    def insert_ptn_ratio(self):
        """
        Insert a transmission-ratio metadata record for every HDFS partition
        in the current load.

        Partitions are processed in numeric order (all non-digit characters
        stripped before comparison). For each partition, the transmitted
        ratio is looked up via the hive manager and a record is inserted
        into the metadata store under the "ratio" category.

        :return: None
        :raises Exception: re-raised after logging if the ratio lookup or
            the metadata insert fails for any partition.
        """
        # Sort partitions by their numeric component, e.g. "ptn=20" -> 20
        ptn_list_sorted = sorted(self.hdfs_ptn_list,
                                 key=lambda s: int(re.sub(r"[^0-9]", "", s)))

        try:
            for ptn in ptn_list_sorted:
                transmitted_ratio = self.hive_mng.load_ptn_transmitted_ratio(ptn)
                load_success_data = {
                        "topic_name": self.topic,
                        "database_name": self.database,
                        "table_name": self.table,
                        "load_start_time": utils.iso_format(self.loadts),
                        "load_end_time": utils.iso_format(datetime.now()),
                        "hdfs_partition": ptn,
                        "transmitted_ratio": transmitted_ratio
                }

                # Log through the module logger instead of a bare debug print,
                # consistent with the rest of this handler.
                logger.info("Inserting ratio metadata %s" % load_success_data)
                self.metadata_mgr.insert(load_success_data, "ratio")
        except Exception:
            logger.error("Error creating transmission ratio metadata")
            raise
Example #2
0
    def execute(self):
        """
        Top level method for LoadHandler; manages the load workflow.

        The method
        (1) decides whether the current load should proceed at all,
        (2) loads pending HDFS directories (server & HDFS timestamp data)
            into Year/Month/Day/Hour-partitioned Hive tables,
        (3) creates a Hive-timestamp partition for each pending source Hive
            partition and pulls the timestamp data into it,
        (4) records load metadata and per-partition transmission ratios in
            the SQL metadata database.

        :return: None
        :raises Exception: re-raised after logging when any stage fails.
        """
        # Determine if there exist hdfs directories and hive partitions to process
        self.hdfs_to_proceed()
        self.hive_to_proceed()
        if not self.hive_proceed and not self.hdfs_proceed:
            logger.info("No partitions or directories to proceed. Ending load")
            return

        logger.info("Proceeding with load")

        if self.hdfs_proceed:
            logger.info("Proceeding with HDFS load")
            try:
                # hdfs_thread_execute retrieves hdfs timestamps from pending
                # directories, writes to the local file system, copies files
                # to the hive warehouse and creates new partitions pointing
                # at the corresponding directories. Its return value is the
                # latest directory processed, kept as the new high-water mark.
                self.hdfs_new_last_dir = hdfs_thread_execute(self.topic, self.table, self.hdfs_dir_pending,
                                                             self.hdfs_ptn_list,
                                                             self.get_config("local_hdfs_ts_path"),
                                                             self.get_config("hive_hdfs_ts_path"))
            except Exception:
                logger.error("Error retrieving server and hdfs timestamp")
                raise
        else:
            logger.info("No new HDFS dir to process.")

        if self.hive_proceed:
            hive_hive_ts_path = self.get_config("hive_hive_ts_path")
            logger.info("Proceeding with Hive load")
            try:
                # Create a FlowView partition for each Hive partition pending
                # processing, then pull its timestamp data into place.
                for partition in self.hive_ptn_pending:
                    self.hive_mgr.create_hive_ts_ptn(partition, hive_hive_ts_path)
                    self.hive_mgr.pull_hive_ts(partition, hive_hive_ts_path)
                # Record the last processed partition as the new high-water mark
                self.hive_new_last_ptn = self.hive_ptn_pending[-1]
            except Exception:
                logger.error("Error creating hive partition")
                raise

            logger.info("Created hive partition for Hive timestamp")

        else:
            logger.info("No new Hive partition to process")

        try:
            # Build the metadata record for the current load.
            # NOTE(review): assumes hdfs_new_last_dir / hive_new_last_ptn are
            # initialized elsewhere (e.g. in __init__) when the corresponding
            # branch above was skipped -- confirm.
            load_metadata = {
                "topic_name": self.topic,
                "database_name": self.database,
                "table_name": self.table,
                "load_start_time": utils.iso_format(self.loadts),
                "load_end_time": utils.iso_format(datetime.now()),
                "last_load_hdfs_dir": self.hdfs_new_last_dir,
                "last_load_hive_partition": self.hive_new_last_ptn
            }
            logger.info("Created load_metadata %s" % load_metadata)
        except Exception:
            logger.error("Error creating metadata")
            raise

        try:
            # Persist metadata for the current load into the SQL metadata database
            self.metadata_mgr.insert(load_metadata, "load")
        except MetadataException:
            logger.error("Error inserting metadata %s" % load_metadata)
            raise

        try:
            self.insert_ptn_ratio()
            logger.info("Calculated load partition data transmission ratio")
        except Exception:
            logger.error("Error in calculating load partition data transmission ratio")
            raise

        logger.info("Load complete")