Ejemplo n.º 1
0
    def close_conn(self, conn_obj):
        """
        Close a pyodbc connection and report the outcome.

        :param conn_obj: pyodbc.Connection to close; None is treated as a
                         no-op and still reported as success
        :return: dict with CONSTANTS.STATUS_KEY set to success/failed and,
                 on failure, CONSTANTS.ERROR_KEY holding the error text
        """
        try:
            if conn_obj is not None:
                # Only close objects that really are pyodbc connections;
                # anything else is a caller error and is surfaced as failed.
                if isinstance(conn_obj, pyodbc.Connection):
                    conn_obj.close()
                    logger.info("closed connection successfully")
                else:
                    status_message = "The Connection object passed is not valid"
                    raise Exception(status_message)

            return {CONSTANTS.STATUS_KEY: CONSTANTS.STATUS_SUCCESS}

        except KeyboardInterrupt:
            # Bare `raise` re-raises the active exception and preserves the
            # original traceback (raise KeyboardInterrupt created a new one).
            raise

        except Exception as e:
            status_message = "ERROR : " + str(traceback.format_exc())
            logger.error(status_message)
            conn_result = {
                CONSTANTS.STATUS_KEY: CONSTANTS.STATUS_FAILED,
                CONSTANTS.ERROR_KEY: str(e)
            }
            return conn_result
Ejemplo n.º 2
0
    def submit_shell_cmd(self, sqoop_cmd):
        """
        Method to run sqoop command on shell
        :param sqoop_cmd: Sqoop command to run
        :return: Status if success or error and no. of records ingested in case of success
        """
        result = {"status": "success", "message": ""}

        try:
            # NOTE(review): shell=True with an externally assembled command
            # string is shell-injection prone; the command must be vetted
            # upstream before reaching this point.
            child = subprocess.Popen(sqoop_cmd,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.STDOUT,
                                     shell=True)
            stream_data = child.communicate()[0]
            # communicate() yields bytes on Python 3; decode so the regex
            # and slicing below work on both Python 2 and Python 3.
            if isinstance(stream_data, bytes):
                stream_data = stream_data.decode("utf-8", errors="replace")
            rc = child.returncode
            if rc != 0:
                logger.error(stream_data)
                result["status"] = "error"
                # Keep only the tail of the log as the error message.
                result["message"] = stream_data[-100:]
                return result
            # logger.debug(stream_data)  todo: uncomment this to see sqoop job logs in case of success

            # fetching no. of records ingested using regex; search only the
            # tail of the output where sqoop prints its summary line
            rc_count = re.search(
                'INFO mapreduce.ImportJobBase: Retrieved(.*)records.',
                stream_data[-10000:])
            if rc_count is not None:
                result["message"] = rc_count.group(1).strip()
            return result

        except Exception:
            logger.error(str(traceback.format_exc()))
            result["status"] = "error"
            result["message"] = str(traceback.format_exc())
            return result
Ejemplo n.º 3
0
    def run_all_sources(self):
        """
        Build one job entry per configured table across every source and
        hand the complete list to submit_sqoop_jobs.

        :return: result of submit_sqoop_jobs, or False on any error
        """
        jobs = []
        try:
            logger.info(
                "Running for all the sources present in the input configuration"
            )
            logger.info("Fetching the list of sources")

            sources = self.job_configuration["source_list"]
            logger.info("total number of sources configured: " +
                        str(len(sources)))

            # Each job is encoded as "<source>$$<table>".
            for src_name in sources:
                for tbl_name in sources[src_name]["tables"]:
                    jobs.append(src_name + "$$" + tbl_name)

            logger.info(
                "total number of tables configured for all the source: " +
                str(len(jobs)))
            logger.info("Job list : " + str(jobs))

            return self.submit_sqoop_jobs(jobs)

        except Exception:
            logger.error(str(traceback.format_exc()))
            return False
Ejemplo n.º 4
0
    def generate_sqoop_command(self, main_dict, table_dict, source_name):
        """
        method to generate sqoop command based on the input configurations
        :param main_dict: dictionary containing the source details
        :param table_dict: dictionary containing the table configurations
        :param source_name: name of the source
        :return: (sqoop_command, hive_retention_command) tuple, or
                 (None, None) on validation or processing failure
        """
        try:
            # Validate against the real dict keys; the previous substring
            # test on str(main_dict) could be satisfied by values too.
            if not all(key in main_dict for key in MANDATORY_SOURCE_KEYS):
                logger.error(
                    "one/some of the mandatory keys are not present : " +
                    str(MANDATORY_SOURCE_KEYS))
                # Always return a 2-tuple so callers can unpack safely
                # (a bare None here broke the caller's tuple unpacking).
                return None, None

            main_dict = self.update_main_dict_env(main_dict, source_name)
            main_dict.update(table_dict)
            main_dict["connection_string"] = CONN_STRING.format(
                main_dict["db_type"], main_dict["hostname"],
                main_dict["database"])
            main_dict["load_date"] = self.load_date
            main_dict["hdfs_base_path"] = str(
                main_dict["hdfs_base_path"]).rstrip("/")

            #with open(os.path.join(self.base_path, "config/master_sqoop.template"), "rb") as f: # TODO, check where to put the sqoop master template
            master_sqoop_template = SQOOP_MASTER_TEMPLATE

            # Substitute every "$$<key>" placeholder with its value.
            # (.items() works on both Python 2 and 3, unlike .iteritems().)
            for key, value in main_dict.items():
                master_sqoop_template = master_sqoop_template.replace(
                    "$$" + key, value)

            # Drop any line that still contains an unsubstituted placeholder.
            final_sqoop_template = ""
            for line in master_sqoop_template.splitlines():
                if "$$" not in str(line):
                    final_sqoop_template = final_sqoop_template + line + " "

            logger.info("Generating hive query to perform retention")
            date_converted = datetime.strptime(self.load_date, '%Y-%m-%d')
            main_dict["retention_date"] = str(
                (date_converted - timedelta(days=RETENTION_PERIOD)
                 ).strftime("%Y-%m-%d")).strip()

            logger.info("Query created for Deleting partitions older than : " +
                        str(main_dict["retention_date"]))

            hive_retention_template = HIVE_RETENTION_TEMPLATE
            for key, value in main_dict.items():
                hive_retention_template = hive_retention_template.replace(
                    "$$" + key, value)

            return final_sqoop_template, hive_retention_template

        except Exception:
            logger.error(str(traceback.format_exc()))
            return None, None
Ejemplo n.º 5
0
 def fetch_json(self, path):
     """
     Method to convert json file to a python dict
     :param path: path to the json file
     :return: python dict loaded with json data, None in case of failure
     """
     try:
         # Use a context manager so the file descriptor is always closed
         # (json.load(open(path)) leaked the handle on every call).
         with open(path) as json_file:
             return json.load(json_file)
     except Exception:
         logger.error(str(traceback.format_exc()))
         return None
Ejemplo n.º 6
0
    def run_single_table(self, source_name, table_name):
        """
        Run the sqoop ingestion for a single table of a single source.

        :param source_name: name of the configured source
        :param table_name: name of the table within that source
        :return: True when the job succeeded, False otherwise
        """
        job_list = []
        try:
            logger.info("Running for the given table in the given source")

            try:
                table_list = self.job_configuration["source_list"][
                    source_name]["tables"].keys()
            except Exception:
                logger.error(
                    "error fetching the source and table configurations for given source: "
                    + source_name)
                logger.error(
                    "verify if the input configurations has the correct structure and source details"
                )
                return False
            if table_name in table_list:
                job_list.append(source_name + "$$" + table_name)
            else:
                logger.error("given table: " + str(table_name) +
                             " not found in the configurations for source: " +
                             str(source_name))
                return False
            logger.info("Job list : " + str(job_list))

            # Propagate the real job outcome instead of returning True
            # unconditionally (failed jobs used to be reported as success).
            return self.submit_sqoop_jobs(job_list)

        except Exception:
            logger.error(str(traceback.format_exc()))
            return False
Ejemplo n.º 7
0
    def execute_query(self, query, conn_obj):
        """
        Execute a SQL query over an open connection and wrap the result.

        :param query: SQL statement; .decode('utf-8') is called on it in the
                      error paths, so a byte string is expected here
                      (Python 2 era code — TODO confirm under Python 3)
        :param conn_obj: open DB connection usable by pandas.read_sql
        :return: on success, dict with CONSTANTS.STATUS_KEY = success and
                 CONSTANTS.RESULT_KEY holding the rows as a column-oriented
                 dict (None when no rows); on failure, dict with
                 CONSTANTS.STATUS_KEY = failed and CONSTANTS.ERROR_KEY
        """
        dict_result = {}
        # Default keeps len(fetched_data) == 0 if read_sql raises the
        # tolerated "no data" error below.
        fetched_data = {}
        try:
            logger.debug("checking if connection is null or empty")
            if conn_obj is None or str(conn_obj) == "":
                logger.error("empty connection object or invalid connection")
                raise Exception
            else:
                logger.debug("Executing the Given Sql query : " + query)

                try:
                    fetched_data = pandas.read_sql(query, conn_obj)
                except Exception as e:
                    # NOTE(review): matching the exact exception text is
                    # fragile — it treats "no rows returned" as a non-error
                    # and continues with the empty default above; anything
                    # else is re-raised to the outer handler.
                    if str(e) == "'NoneType' object is not iterable":
                        status_message = "No data is returned for query " + query.decode('utf-8') \
                                         + "executing the next query."
                        logger.debug(status_message)
                    else:
                        status_message = "Error occurred while executing the query " + query.decode(
                            'utf-8')
                        raise e
                # Formulating the output dictionary based on the number of the records in the data frame
                # Checking for the number of records
                if len(fetched_data) > 0:
                    fetched_data_dict = fetched_data.to_dict(orient='dict')
                    dict_result[CONSTANTS.RESULT_KEY] = fetched_data_dict
                else:
                    dict_result[CONSTANTS.RESULT_KEY] = None
                dict_result[CONSTANTS.STATUS_KEY] = CONSTANTS.STATUS_SUCCESS
                result_dictionary = dict_result
                status_message = "Query has been successfully executed"
                logger.debug(status_message)

                return result_dictionary

        except KeyboardInterrupt:
            raise KeyboardInterrupt

        except Exception as e:
            status_message = "ERROR : " + str(traceback.format_exc())
            logger.error(status_message)
            conn_result = {
                CONSTANTS.STATUS_KEY: CONSTANTS.STATUS_FAILED,
                CONSTANTS.ERROR_KEY: str(e)
            }
            return conn_result
Ejemplo n.º 8
0
    def submit_sqoop_jobs(self, job_list):
        """
        Run every job in job_list on a process pool and summarise results.

        :param job_list: list of "<source>$$<table>" job identifiers
        :return: True when every job succeeded, False otherwise
        """
        try:
            logger.info("Creating pool of threads based on the input job list")
            # Treat an empty list the same as None: there is nothing to run
            # (the old `is not None` check let [] through to ProcessPool(0)).
            if not job_list:
                logger.error(
                    "Job list is empty, Unable to trigger any sqoop jobs")
                return False

            # Never create more workers than there are jobs.
            pool_size = min(len(job_list), self.thread_pool_size)
            thread_pool = Pool.ProcessPool(pool_size)

            job_list_results = thread_pool.map(self.run_sqoop_job, job_list)

            logger.info("completed running the sqoop job/jobs")
            logger.debug(job_list_results)

            df = pd.DataFrame(job_list_results)

            logger.info("Summary of all the jobs configured")
            logger.info("\n" + tabulate(df[[
                'table_name', 'source_name', 'record_count', 'status',
                'error_traceback'
            ]],
                                        headers='keys',
                                        tablefmt='psql'))

            status_list = df['status'].tolist()
            # A single failed job marks the whole batch as failed.
            return "failed" not in status_list

        except Exception:
            logger.error(str(traceback.format_exc()))
            return False
Ejemplo n.º 9
0
 def update_main_dict_env(self, main_dict, source_name):
     """
     method to update the dict with values present in environment variables
     :param main_dict: dict containing key values
     :param source_name: name of the source to which this dict belongs
     :return: updated dict with values from environment variables,
              None on unexpected failure
     """
     try:
         for key in main_dict.keys():
             # Override convention: an env var named <SOURCE>_<KEY>
             # (upper-cased) replaces the configured value.
             env_name = source_name.upper() + "_" + key.upper()
             if env_name in os.environ:
                 logger.info("fetching " + key +
                             " from environment variable: " + env_name)
                 main_dict[key] = os.environ.get(env_name)
         return main_dict
     except Exception:
         # Narrowed from a bare except so SystemExit/KeyboardInterrupt
         # still propagate.
         logger.error(str(traceback.format_exc()))
         return None
Ejemplo n.º 10
0
    def run_single_source(self, source_name):
        """
        Run the sqoop ingestion for every table of one configured source.

        :param source_name: name of the configured source
        :return: True when all jobs succeeded, False otherwise
        """
        job_list = []
        try:
            logger.info(
                "Running for all the tables present in the given source: " +
                source_name)

            try:
                table_list = self.job_configuration["source_list"][
                    source_name]["tables"].keys()
            except Exception:
                logger.error(
                    "error fetching the source and table configurations for given source: "
                    + source_name)
                logger.error(
                    "verify if the input configurations has the correct structure and source names as passed in the arguments"
                )
                return False

            for table in table_list:
                job_list.append(source_name + "$$" + table)

            logger.info(
                "total number of tables configured for given source: " +
                str(len(job_list)))
            logger.info("Job list : " + str(job_list))

            # Propagate the real job outcome instead of returning True
            # unconditionally (failed jobs used to be reported as success).
            return self.submit_sqoop_jobs(job_list)

        except Exception:
            logger.error(str(traceback.format_exc()))
            return False
Ejemplo n.º 11
0
    def create_conn(self, connection_details):
        """
        Open an azure-ms-sql connection through pyodbc.

        :param connection_details: dict with server_name, database_name,
               username, password, port and mssql_driver_string entries
        :return: dict keyed by CONSTANTS.STATUS_KEY; on success the live
                 connection is stored under CONSTANTS.RESULT_KEY, on
                 failure the error text under CONSTANTS.ERROR_KEY
        """
        status_message = ""
        try:
            status_message = "creating azure-ms-sql connection"
            logger.info(status_message)

            # Pull the individual fields out first so a missing key fails
            # fast with a clear KeyError.
            server_name = connection_details["server_name"]
            database_name = connection_details["database_name"]
            username = connection_details["username"]
            password = connection_details["password"]
            port = connection_details["port"]
            mssql_driver_string = connection_details["mssql_driver_string"]

            # Assemble the ODBC connection string and connect.
            odbc_string = ('DRIVER={' + mssql_driver_string +
                           '};SERVER=' + server_name +
                           ';PORT=' + port +
                           ';DATABASE=' + database_name +
                           ';UID=' + username +
                           ';PWD=' + password)
            conn = pyodbc.connect(odbc_string)
            conn.autocommit = True

            status_message = "successfully connected to the given server"
            logger.info(status_message)

            return {
                CONSTANTS.STATUS_KEY: CONSTANTS.STATUS_SUCCESS,
                CONSTANTS.RESULT_KEY: conn
            }

        except KeyboardInterrupt:
            raise KeyboardInterrupt

        except Exception as e:
            status_message = "ERROR : " + str(traceback.format_exc())
            logger.error(status_message)
            return {
                CONSTANTS.STATUS_KEY: CONSTANTS.STATUS_FAILED,
                CONSTANTS.ERROR_KEY: str(e)
            }
Ejemplo n.º 12
0
    def create_mssql_conn(self, connection_details):
        """
        Open a ms-sql connection through pymssql.

        :param connection_details: dict with server_name, database_name,
               username, password and port entries
        :return: dict keyed by CONSTANTS.STATUS_KEY; on success the live
                 connection is stored under CONSTANTS.RESULT_KEY, on
                 failure the error text under CONSTANTS.ERROR_KEY
        """
        status_message = ""
        try:
            status_message = "creating ms-sql connection"
            logger.info(status_message)
            server_name = connection_details["server_name"]
            database_name = connection_details["database_name"]
            username = connection_details["username"]
            password = connection_details["password"]
            port = connection_details["port"]

            conn = pymssql.connect(server=server_name,
                                   user=username,
                                   password=password,
                                   database=database_name,
                                   port=port)
            # pymssql exposes autocommit as a method; the old attribute
            # assignment (conn.autocommit = True) shadowed the method and
            # never actually enabled autocommit.
            conn.autocommit(True)
            status_message = "successfully connected to the given server"
            logger.info(status_message)

            conn_result = {
                CONSTANTS.STATUS_KEY: CONSTANTS.STATUS_SUCCESS,
                CONSTANTS.RESULT_KEY: conn
            }

            return conn_result

        except KeyboardInterrupt:
            raise KeyboardInterrupt

        except Exception as e:
            status_message = "ERROR : " + str(traceback.format_exc())
            logger.error(status_message)
            conn_result = {
                CONSTANTS.STATUS_KEY: CONSTANTS.STATUS_FAILED,
                CONSTANTS.ERROR_KEY: str(e)
            }
            return conn_result
Ejemplo n.º 13
0
    def sqoop_main(self,
                   source_path,
                   source_name=None,
                   table_name=None,
                   thread_pool=None):
        """
        Entry point: load the json job configuration, then run the
        requested scope — all sources, a single source, or a single table.
        Exits the process with status 1 on any failure.

        :param source_path: path to the json configuration file
        :param source_name: optional source to restrict the run to
        :param table_name: optional table (requires source_name as well)
        :param thread_pool: optional override for the worker pool size
        """
        try:
            logger.info("loading source list configuration into python dict ")
            self.job_configuration = self.fetch_json(source_path)
            if self.job_configuration is None:
                logger.error("unable to load source configuration list")
                raise Exception

            if thread_pool is not None:
                logger.info(
                    "Setting up the thread pool size to the given value : " +
                    str(thread_pool))
                self.thread_pool_size = int(thread_pool)

            # Decide the run scope from which identifiers were supplied.
            have_source = source_name is not None
            have_table = table_name is not None

            if not have_source and not have_table:
                succeeded = self.run_all_sources()
            elif have_source and not have_table:
                succeeded = self.run_single_source(source_name)
            elif have_source and have_table:
                succeeded = self.run_single_table(source_name, table_name)
            else:
                # table_name given without source_name — not runnable.
                logger.error(
                    "HINT : if table is provided, Its mandatory to provide source name too...!"
                )
                raise Exception

            if not succeeded:
                raise Exception

        except Exception:
            logger.error(str(traceback.format_exc()))
            sys.exit(1)
Ejemplo n.º 14
0
    def run_sqoop_job(self, input_val):
        """
        Worker body for one job: build the sqoop command for the encoded
        source/table pair, run it, then apply the hive retention command.

        :param input_val: "<source>$$<table>" job identifier string
        :return: dict with table_name, source_name, status, record_count
                 and error_traceback entries (blank on the untouched paths)
        """
        output_dict = {
            "table_name": "",
            "source_name": "",
            "status": "",
            "record_count": "",
            "error_traceback": ""
        }
        try:
            # Decode the "<source>$$<table>" identifier.
            parts = input_val.split("$$")
            source_name, table_name = parts[0], parts[1]
            logger.info(
                "Running Sqoop job for table: {} and source: {}".format(
                    table_name, source_name))

            logger.info("reading source_configuration")
            source_dict = self.job_configuration["source_list"][source_name][
                "source_config"]
            logger.info("reading table configuration")
            table_dict = self.job_configuration["source_list"][source_name][
                "tables"][table_name]

            sqoop_cmd, hive_retention_cmd = self.generate_sqoop_command(
                source_dict, table_dict, source_name)

            if sqoop_cmd is None:
                logger.error(
                    "error generating sqoop command for table: {}, source: {} "
                    .format(table_name, source_name))
                raise Exception

            logger.debug("Running Sqoop Command : " + str(sqoop_cmd))
            result = self.submit_shell_cmd(sqoop_cmd)

            output_dict["table_name"] = table_name
            output_dict["source_name"] = source_name
            if result["status"] == "success":
                output_dict.update(status="success",
                                   record_count=result["message"])
            else:
                output_dict.update(status="failed",
                                   error_traceback=result["message"])

            # Retention is best-effort: a failure here is only warned about
            # and does not change the job's reported status.
            logger.debug(
                "Running Hive Command for dropping older partitions: " +
                str(hive_retention_cmd))
            result_hive = self.submit_shell_cmd(hive_retention_cmd)

            if result_hive["status"] == "success":
                logger.info(
                    "Dropped older partitions as per the retention policy")
            else:
                logger.warn(
                    "Error occurred while dropping older partitions : " +
                    str(result_hive["message"]))

            return output_dict

        except Exception:
            logger.error(str(traceback.format_exc()))
            return output_dict