Example #1
    def __init__(self, connection_info):
        """
        Initialize the Vertica command line

        @type connection_info: dict
        @param connection_info: Vertica connection parameters

        @rtype: None
        @return: None
        """
        self.connection_info = connection_info

        # Set the locale explicitly to UTF-8. Otherwise, loads triggered manually
        # by our headless service account will succeed while loads triggered by the
        # scheduler will fail. The reason is that the 'locale' environment variables
        # differ between the scheduler and our headless service account; this can be
        # verified by comparing the output of the 'locale' command run from the
        # service account's home folder with its output when run by the scheduler.

        self.set_locale = "export LC_ALL='en_US.UTF-8'"
        self.vsql = "%s && %s -k %s -K %s -h %s -p %s -d %s -U %s" \
                    % (self.set_locale,
                       self.connection_info["vertica_vsql_path"],
                       self.connection_info["vertica_krb_svcname"],
                       self.connection_info["vertica_krb_host"],
                       self.connection_info["vertica_host"],
                       self.connection_info["vertica_port"],
                       self.connection_info["vertica_db"],
                       self.connection_info["vertica_user"])
        self.shell_exec = ShellExecutor()
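
For illustration, the connection_info dict this constructor expects might look like the sketch below; the keys come from the code above, while every value is a placeholder assumption:

connection_info = {
    "vertica_vsql_path": "/opt/vertica/bin/vsql",  # assumed vsql location
    "vertica_krb_svcname": "vertica",              # assumed Kerberos service name
    "vertica_krb_host": "kerberos.example.com",    # placeholder
    "vertica_host": "vertica.example.com",         # placeholder
    "vertica_port": "5433",                        # Vertica's default port
    "vertica_db": "analytics",                     # placeholder
    "vertica_user": "svc_loader",                  # placeholder
}
vmgr = VerticaManager(connection_info)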
Example #2
    def __init__(self):
        """
        Instantiate a ShellExecutor to execute Oozie commands

        @rtype: None
        @return: None
        """
        self.shell_exec = ShellExecutor()
Example #3
    def __init__(self, db, table):
        """
        Initializes the manager object for a Hive table
        @type db: str
        @param db: Hive db

        @type table: str
        @param table: Hive table

        @rtype: None
        @return: None
        """
        self.db = db
        self.table = table
        self.shell_exec = ShellExecutor()
Example #4
class OozieManager(object):
    """
    Manager class for Oozie jobs. This class handles launching, fetching the
    status of, and monitoring Oozie jobs via the Oozie command-line interface.
    """
    def __init__(self):
        """
        Instantiate a ShellExecutor to execute Oozie commands

        @rtype: None
        @return: None
        """
        self.shell_exec = ShellExecutor()

    def launch(self, propfile=None):
        """
        Launches oozie job with properties in "propfile"

        @type propfile: str
        @param propfile: Job properties file

        @rtype: str
        @return: Launched Oozie jobid
        """
        try:
            if propfile is None:
                logkv(
                    logger, {
                        "msg": "Workflow properties file not specified",
                        "propfile": propfile
                    }, "error")
                raise OozieManagerException()

            launchcmd = "oozie job -config %s -run" % propfile
            result = self.shell_exec.safe_execute(launchcmd)
            jobid = result.output.split(":")[1].strip()
            logkv(logger, {
                "msg": "Launched Oozie job",
                "jobid": jobid
            }, "info")
            return jobid
        except Exception:
            logkv(logger, {"msg": "Failed to launch Oozie job"}, "error")
            raise OozieManagerException()
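    # For reference: "oozie job -config <propfile> -run" normally prints a single
    # line of the form "job: 0000001-160606000000001-oozie-oozi-W"; the split on
    # ":" above extracts the jobid. The example id shown here is made up.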

    def get_status(self, jobid):
        """
        Parses output of "oozie job -info <jobid>" command to get
        overall and per-step status of the launched Oozie job

        @type jobid: str
        @param jobid: Oozie jobid

        @rtype: dict
        @return: {"step": "<status>"} for all steps in workflow
        """
        # Get raw job status from command line
        pollcmd = "oozie job -info %s" % jobid
        result = self.shell_exec.safe_execute(pollcmd)

        # Get overall job status
        try:
            ostat = re.findall(r"Status\s*:\s*(\w+)", result.output)[0]
        except IndexError:
            logkv(logger, {"msg": "Failed to get overall job status"}, "error")
            raise OozieManagerException()

        # Now get status of individual steps
        # Escape regex metacharacters in jobid (e.g. dashes), otherwise regex
        # matches can fail
        jobid_esc = re.escape(jobid)
        step_status = re.findall(r"%s@(.+?)\n" % jobid_esc, result.output)

        # Combine statuses of individual steps into a status dictionary
        # for easier processing later
        try:
            jobstatus = dict([tuple(ss.split()[:2]) for ss in step_status])
            jobstatus.update({"overall": ostat})
            return jobstatus
        except Exception:
            logkv(logger, {"msg": "Failed to get status of workflow steps"},
                  "error")
            raise OozieManagerException()
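    # Illustrative shape of the returned dictionary (the action names below are
    # hypothetical; the "overall" key is always present):
    #     {"parse-json": "OK", "load-hive": "RUNNING", "overall": "RUNNING"}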

    def get_logtrace(self, jobid):
        """
        Gets the log trace of the Oozie job with the given jobid

        @type jobid: str
        @param jobid: Oozie jobid whose status is desired

        @rtype: str
        @return: log trace as a string
        """
        try:
            logcmd = "oozie job -log %s" % jobid
            result = self.shell_exec.safe_execute(logcmd)
            return result.output
        except ShellException:
            logkv(logger, {"msg": "Failed to get logtrace"}, "error")
            raise OozieManagerException()

    def get_counts(self, jobid):
        """
        Gets HDFS counts after the parse-json job is done. These counts are the
        source of truth for the number of records processed and the number of
        Hive rows generated

        @type jobid: str
        @param jobid: Jobid of Oozie job.

        @rtype: dict
        @return: A dictionary containing the counter name and its value.
        """

        try:
            # Get Hadoop counts from Oozie. The action name is hardcoded for now,
            # but we should think about how to factor it out without leading to
            # config profusion.
            action = "parse-json"
            pollcmd = "oozie job -info %s@%s -verbose" % (jobid, action)
            result = self.shell_exec.safe_execute(pollcmd)

            res = re.findall('{.*}', result.output)[0]
            counts = json.loads(res)
            task_counter = counts.get(
                "org.apache.hadoop.mapreduce.TaskCounter", dict())
            thrive_counter = counts.get("THRIVE", dict())

            return {
                "map_input_records":
                task_counter.get("MAP_INPUT_RECORDS", "0"),
                "map_output_records":
                task_counter.get("MAP_OUTPUT_RECORDS", "0"),
                "reduce_input_records":
                task_counter.get("REDUCE_INPUT_RECORDS", "0"),
                "reduce_output_records":
                task_counter.get("REDUCE_OUTPUT_RECORDS", "0"),
                "skipped":
                thrive_counter.get("SKIPPED", "0")
            }
        except Exception:
            logkv(logger, {"msg": "Error getting Hadoop counts through Oozie"},
                  "error")
            raise OozieManagerException()

    def poll(self, jobid, interval=10):
        """
        Polls the Oozie job to get status

        @type jobid: str
        @param jobid: Oozie jobid

        @type interval: int
        @param interval: Interval between polls

        @rtype: None
        @return: None. Raises OozieManagerException if any workflow step failed
        """
        jobrunning = True
        while jobrunning:
            jobstatus = self.get_status(jobid)
            logkv(logger, {"jobid": jobid, "status": jobstatus}, "info")

            if jobstatus["overall"] != "RUNNING":
                jobrunning = False
            else:
                time.sleep(interval)

        # Once the job finishes, analyse the status of all steps and see if any failed
        for step, status in jobstatus.items():
            if "FAIL" in status.upper() or "ERROR" in status.upper():
                errmsg = "Oozie error. Step: %s, Error: %s" % (step, status)
                logkv(logger, {
                    "msg": "Oozie error",
                    "step": step,
                    "error": status
                }, "error")
                logkv(logger, {"oozie_logtrace": self.get_logtrace(jobid)},
                      "info")
                raise OozieManagerException(errmsg)
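
A minimal end-to-end usage sketch, assuming a valid job.properties file exists (the file name and polling interval are illustrative):

mgr = OozieManager()
jobid = mgr.launch(propfile="job.properties")
mgr.poll(jobid, interval=30)  # raises OozieManagerException if any step fails
counts = mgr.get_counts(jobid)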
Example #5
class VerticaManager(object):
    """
    Manager class for Vertica operations
    """
    def __init__(self, connection_info):
        """
        Initialize the Vertica command line

        @type connection_info: dict
        @param connection_info: Vertica connection parameters

        @rtype: None
        @return: None
        """
        self.connection_info = connection_info

        # Set the locale explicitly to UTF-8. Otherwise, loads triggered manually
        # by our headless service account will succeed while loads triggered by the
        # scheduler will fail. The reason is that the 'locale' environment variables
        # differ between the scheduler and our headless service account; this can be
        # verified by comparing the output of the 'locale' command run from the
        # service account's home folder with its output when run by the scheduler.

        self.set_locale = "export LC_ALL='en_US.UTF-8'"
        self.vsql = "%s && %s -k %s -K %s -h %s -p %s -d %s -U %s" \
                    % (self.set_locale,
                       self.connection_info["vertica_vsql_path"],
                       self.connection_info["vertica_krb_svcname"],
                       self.connection_info["vertica_krb_host"],
                       self.connection_info["vertica_host"],
                       self.connection_info["vertica_port"],
                       self.connection_info["vertica_db"],
                       self.connection_info["vertica_user"])
        self.shell_exec = ShellExecutor()

    @staticmethod
    def getrows(vcopy_output):
        """
        Parses the console output of COPY command and extracts number of rows loaded

        @type vcopy_output: str
        @param vcopy_output: Output of COPY command

        @rtype: str
        @return: Number of rows loaded, as a string
        """
        pattern = r"\s*(Rows Loaded|count|OUTPUT)\s*.*\s*([0-9]*)\s*"

        try:
            return re.findall(pattern, vcopy_output)[0][1]
        except Exception:
            logkv(
                logger, {
                    "msg":
                    "Error retrieving rows loaded from output of COPY command",
                    "voutput": vcopy_output,
                    "pattern": pattern
                }, "error")

            raise VerticaManagerException()
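    # Hedged illustration of getrows: vsql output for COPY ... DIRECT typically
    # includes a "Rows Loaded" banner; the exact sample below is an assumption.
    #     >>> VerticaManager.getrows(" Rows Loaded\n-------------\n     1042\n(1 row)")
    #     '1042'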

    def execute(self, stmt=None, scriptfile=None):
        """
        Executes SQL statement 'stmt' or SQL script file 'scriptfile'

        @type stmt: str
        @param stmt: SQL query string

        @type scriptfile: str
        @param scriptfile: Location of scriptfile containing commands to execute

        @rtype: str
        @return: Output of the shell command enclosing Vertica sql command
        """

        # Get the execution mode and the argument (statement or filename)
        if stmt and not scriptfile:
            # The stmt *needs* to be in double quotes since it could contain
            # single-quoted strings (see load method below)
            vsql_cmd = '%s -c "%s" ' % (self.vsql, stmt)
        elif scriptfile and not stmt:
            vsql_cmd = "%s -f '%s'" % (self.vsql, scriptfile)
        else:
            logkv(
                logger, {
                    "msg": "Received incorrect or conflicting execution mode",
                    "stmt": stmt,
                    "scriptfile": scriptfile
                }, "info")
            raise VerticaManagerException("Incorrect execution mode")
        try:
            vresult = self.shell_exec.safe_execute(vsql_cmd,
                                                   verbose=False,
                                                   as_shell=True,
                                                   splitcmd=False)
            return vresult
        except ShellException:
            logkv(logger, {
                "msg": "VSQL command failed",
                "cmd": vsql_cmd
            }, "error")
            raise VerticaManagerException()

    def create_table(self, ddlfile):
        """
        Creates Vertica schema from the schema file

        @type ddlfile: str
        @param ddlfile: Full path to file containing table-creation SQL

        @rtype: None
        @return: None
        """
        try:
            self.execute(scriptfile=ddlfile)
            logkv(logger, {
                "msg": "Created Vertica table DDL",
                "ddlfile": ddlfile
            }, "info")
        except VerticaManagerException as ex:
            logkv(logger, {"msg": "VSQL table create failed"}, "error", ex)
            raise

    def clone_schema(self, srcschema, srctable, dstschema, dsttable):
        """
        Clones schema of the 'srcschema.srctable' and creates 'dsttable'. If 'dsttable'
        exists already, it'll be deleted.

        @type srcschema: str
        @param srcschema: Vertica schema of the source table

        @type srctable: str
        @param srctable: Source table in Vertica

        @type dstschema: str
        @param dstschema: Vertica schema of the destination table

        @type dsttable: str
        @param dsttable: Destination table in Vertica

        @rtype: None
        @return: None
        """
        try:
            vsql_stmt = "drop table if exists %s.%s; create table %s.%s as select * from %s.%s where false;" \
                        % (dstschema, dsttable, dstschema, dsttable, srcschema, srctable)
            self.execute(stmt=vsql_stmt)
            logkv(
                logger, {
                    "msg": "Cloned schema",
                    "source": "%s.%s" % (srcschema, srctable),
                    "destination": "%s.%s" % (dstschema, dsttable)
                }, "info")
        except VerticaManagerException as ex:
            logkv(
                logger, {
                    "msg": "Failed to clone schema",
                    "source": "%s.%s" % (srcschema, srctable),
                    "destination": "%s.%s" % (dstschema, dsttable)
                }, "error", ex)
            raise

    def drop_table(self, vschema, vtable):
        """
        Drops table 'table' in schema 'schema'
        @type vschema: str
        @param vschema: Vertica schema

        @type vtable: str
        @param vtable: Vertica table

        @rtype: None
        @return: None
        """
        try:
            vsql_stmt = "drop table if exists %s.%s" % (vschema, vtable)
            self.execute(stmt=vsql_stmt)
            logkv(logger, {
                "msg": "Dropped table",
                "vschema": vschema,
                "vtable": vtable
            }, "info")
        except VerticaManagerException as ex:
            logkv(logger, {"msg": "VSQL table drop failed"}, "error", ex)
            raise

    def load(self,
             webhdfs_root,
             hdfs_path,
             vschema,
             dtable,
             rtable,
             mode="direct"):
        """
        Loads data from HDFS into Vertica table 'dtable'

        @type webhdfs_root: str
        @param webhdfs_root: WebHDFS prefix. Same as the Hadoop name node URL

        @type hdfs_path: str
        @param hdfs_path: HDFS path to dataset

        @type vschema: str
        @param vschema: Vertica schema

        @type dtable: str
        @param dtable: Vertica table for data

        @type rtable: str
        @param rtable: Vertica table for rejected rows

        @type mode: str
        @param mode: Copy mode. Possible values: 'direct' or 'decompress'. If mode is
        'decompress', the data in the Hive partition will be decompressed to a plain-text
        format using a single node to a temporary HDFS location. If mode is 'direct',
        an attempt will be made to load the compressed data directly into Vertica using
        the appropriate filter.

        mode='decompress' is required if the MapReduce job outputs data in a compression
        format not supported by a Vertica filter function, for example BZip2. In this
        case, we'll decompress the data before passing it to the COPY command.

        @rtype: str
        @return: Number of rows loaded
        """

        if mode == "direct":
            _filter = "FILTER GZIP()"
        elif mode == "decompress":
            _filter = ""
        else:
            logkv(
                logger, {
                    "msg":
                    "Invalid load mode supplied to Vertica COPY command",
                    "mode": mode
                }, "error")
            raise VerticaManagerException()

        # Discard the leading "/" in HDFS path. We're going to be pre-pending it with
        # webhdfs base URL
        if hdfs_path.startswith("/"):
            hdfs_path = hdfs_path[1:]

        webhdfs_url = os.path.join(webhdfs_root, hdfs_path)

        copy_cmd = '''COPY %s.%s
                      SOURCE Hdfs(url=\'%s\', username=\'%s\', low_speed_limit=1048576)
                      %s
                      DELIMITER E\'\\001\'
                      REJECTMAX 0
                      REJECTED DATA AS TABLE %s.%s
                      DIRECT
                      COMMIT
                    ''' % (vschema, dtable, webhdfs_url,
                           self.connection_info["vertica_user"], _filter,
                           vschema, rtable)

        try:
            vresult = self.execute(stmt=copy_cmd)
            rows_loaded = VerticaManager.getrows(vresult.output)

            logkv(
                logger, {
                    "msg": "Loaded data in HDFS path to Vertica table",
                    "hdfs_path": hdfs_path,
                    "vschema": vschema,
                    "dtable": dtable,
                    "rows_loaded": rows_loaded
                }, "info")
            return rows_loaded
        except VerticaManagerException as ex:
            logkv(logger, {"msg": "Load to Vertica via WebHdfs failed"},
                  "error", ex)
            raise
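    # Hypothetical invocation of load (hosts, paths, and table names are
    # illustrative only):
    #     rows = vmgr.load(webhdfs_root="http://namenode:50070/webhdfs/v1",
    #                      hdfs_path="/data/parsed/2016/06/06/18/0",
    #                      vschema="analytics", dtable="events",
    #                      rtable="events_rejected", mode="direct")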

    def grant(self, privilege, level, vschema, vtable=None, to=None):
        """
        Grants 'privilege' on 'vschema'.'vtable' to 'to'. The grantee can be a
        user or a group.

        @type privilege: str
        @param privilege: 'USAGE', 'SELECT', 'CREATE', 'DELETE' etc.

        @type level: str
        @param level: 'schema' or 'table'; the granularity at which the privilege
                      is granted

        @type vschema: str
        @param vschema: Vertica schema

        @type vtable: str
        @param vtable: Vertica table

        @type to: str
        @param to: user or a group, separated by commas

        @rtype: None
        @return: None
        """

        # If level is 'schema', the privileges are granted on the schema as a whole

        if (level.lower() == "schema") and (privilege.upper() == "SELECT"):
            grant_stmt = "grant SELECT on all tables in schema %s to %s" % (
                vschema, to)
        elif (level.lower() == "schema") and (privilege.upper() == "USAGE"):
            grant_stmt = "grant USAGE on schema %s to %s" % (vschema, to)
        elif (level.lower() == "table") and (privilege.upper() == "SELECT"):
            grant_stmt = "grant SELECT on table %s.%s to %s" % (vschema,
                                                                vtable, to)
        else:
            logkv(
                logger, {
                    "msg": "Incorrect level/privilege combination",
                    "level": level,
                    "privilege": privilege
                }, "error")
            raise VerticaManagerException()

        try:
            self.execute(stmt=grant_stmt)
            logkv(
                logger, {
                    "msg": "Granted privileges",
                    "granted_to": to,
                    "privilege": privilege,
                    "vschema": vschema,
                    "vtable": vtable
                }, "info")
        except VerticaManagerException as ex:
            logkv(logger, {
                "msg": "Error executing grant statement",
                "stmt": grant_stmt
            }, "error", ex)
            raise
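    # The three level/privilege combinations grant supports, as a sketch
    # (schema, table, and grantee names are placeholders):
    #     vmgr.grant("USAGE", level="schema", vschema="analytics", to="ro_group")
    #     vmgr.grant("SELECT", level="schema", vschema="analytics", to="ro_group")
    #     vmgr.grant("SELECT", level="table", vschema="analytics",
    #                vtable="events", to="ro_group")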

    def rollback(self,
                 srcschema,
                 srctable,
                 rollbackschema,
                 rollbacktable,
                 rkey=None):
        """
        Deletes rows in srctable that are present in rollbacktable.

        @type srcschema: str
        @param srcschema: Schema of the source table

        @type srctable: str
        @param srctable: Source table

        @type rollbackschema: str
        @param rollbackschema: Schema of the table containing rollback data

        @type rollbacktable: str
        @param rollbacktable: Table containing the rollback data

        @type rkey: str
        @param rkey: Column used to match rows in 'srctable' against 'rollbacktable'

        @rtype: str
        @return: Count of rows deleted
        """
        try:
            rbk_stmt = '''
                       set session autocommit to on;

                       delete from %s.%s
                       where %s in (
                           select %s
                           from %s.%s
                       );''' % (srcschema, srctable, rkey, rkey,
                                rollbackschema, rollbacktable)

            vresult = self.execute(stmt=rbk_stmt)
            rows = VerticaManager.getrows(vresult.output)

            logkv(
                logger, {
                    "msg": "rollback successful",
                    "source_schema": srcschema,
                    "source_table": srctable,
                    "rows": rows
                }, "info")
            return rows
        except VerticaManagerException as ex:
            logkv(logger, {"msg": "VSQL rollback failed"}, "error", ex)
            raise VerticaManagerException()

    def truncate(self, vschema, vtable):
        """
        Truncates 'vschema.vtable'.

        @type vschema: str
        @param vschema: Schema of the table to be truncated

        @type vtable: str
        @param vtable: Target table

        @rtype: None
        @return: None
        """
        try:
            truncate_stmt = "truncate table %s.%s;" % (vschema, vtable)
            self.execute(stmt=truncate_stmt)
            logkv(logger, {
                "msg": "Truncated table",
                "schema": vschema,
                "table": vtable
            }, "info")
        except VerticaManagerException as ex:
            logkv(
                logger, {
                    "msg": "Failed to truncate table",
                    "schema": vschema,
                    "table": vtable
                }, "error", ex)
            raise
Example #6
    def __init__(self,
                 datacfg_file=None,
                 envcfg_file=None,
                 resources_file=None):
        """
        Parses config file and performs basic checks on filetypes

        @type datacfg_file:  str
        @param datacfg_file: Full or relative path of the dataset-specific config file

        @type envcfg_file:  str
        @param envcfg_file: Full or relative path of the global environment config file

        @type resources_file: str
        @param resources_file: Full or relative path of the resources file

        @rtype: None
        @return: None
        """

        # self.parser = SafeConfigParser()
        # self.parser.read(config_file)

        # Instantiate ConfigLoader for managing dataset-specific config
        self.datacfg = ConfigLoader(datacfg_file)

        # Instantiate ConfigLoader for managing dataset-independent global configs
        self.envcfg = ConfigLoader(envcfg_file)

        # Resources file
        self.resources = resources_file

        if self.resources is not None:
            if not self.resources.endswith(".zip"):
                logkv(
                    logger, {
                        "msg": "Resource is not a zip file",
                        "resource_file": self.resources
                    }, "error")
                raise ThriveHandlerException()

        credtypes = [
            "dbtype", "dbhost", "dbport", "dbuser", "dbpass", "dbname"
        ]
        md_credentials = dict([(cred, self.get_config(cred, configtype="env"))
                               for cred in credtypes])

        self.metadata_mgr = MetadataManager(credentials=md_credentials)

        # Get the timestamp at which the present load started
        self.loadts = datetime.now()

        # Create a ShellExecutor instance for managing execution of Shell commands for
        # all subclasses
        self.shell_exec = ShellExecutor()

        # Instantiate HdfsManager for HDFS-related tasks
        self.hdfs_mgr = HdfsManager()

        # Instantiate Vertica manager for Vertica-related tasks
        vconfigs = [
            "vertica_db", "vertica_vsql_path", "vertica_krb_svcname",
            "vertica_krb_host", "vertica_host", "vertica_port", "vertica_user"
        ]

        # Create connection_info. We're working with Python 2.6, so we cannot use
        # a dictionary comprehension and have to resort to passing tuples to the
        # dict constructor
        vconnection_info = dict(
            (key, self.get_config(key)) for key in vconfigs)
        self.vertica_mgr = VerticaManager(vconnection_info)

        # Instantiate a HiveManager for Hive-related tasks
        self.hive_mgr = HiveManager(db=self.get_config("hive_db"),
                                    table=self.get_config("hive_table"))

        # Create a load_id for this load. Used by 'setup' and 'load' phases
        self.load_id = uuid.uuid1()
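
The enclosing class is not shown in this example; assuming it is the handler class implied by ThriveHandlerException, instantiation would look roughly like the sketch below (the class name and file names are placeholder assumptions):

handler = ThriveHandler(datacfg_file="dataset.cfg",
                        envcfg_file="environment.cfg",
                        resources_file="resources.zip")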
Example #7
class HiveManager(object):
    """
    Manager for Hive tables. Creates partition after each load from HDFS
    """
    def __init__(self, db, table):
        """
        Initializes the manager object for a Hive table
        @type db: str
        @param db: Hive db

        @type table: str
        @param table: Hive table

        @rtype: None
        @return: None
        """
        self.db = db
        self.table = table
        self.shell_exec = ShellExecutor()

    def execute(self, stmt):
        """
        Executes Hive SQL statement 'stmt'

        @type stmt: str
        @param stmt: Hive query

        @rtype: None
        @return: None
        """
        try:
            # Create hive partition
            self.shell_exec.safe_execute(stmt, splitcmd=False, as_shell=True)
            logkv(logger, {"msg": "Executed query", "query": stmt}, "info")
        except ShellException:
            logkv(logger, {
                "msg": "Error executing Hive statement",
                "query": stmt
            }, "error")
            raise HiveManagerException()

    def create_partition(self, ptn_path):
        """
        Creates a timestamp-based partition and points the partition to the location of
        the data

        @type ptn_path: str
        @param ptn_path: location of the parsed JSON data

        @rtype: None
        @return: None
        """

        # Check if HDFS path exists before attempting to point the new partition to it
        hdfs_mgr = HdfsManager()
        if not hdfs_mgr.path_exists(ptn_path):
            logkv(logger, {
                "msg": "Hadoop path does not exist",
                "path": ptn_path
            }, "error")
            raise HiveManagerException()
        else:
            logkv(logger, {
                "msg": "Hadoop path exists",
                "path": ptn_path
            }, "info")

        # Construct a Hive partition string
        ptn_vals = parse_partition(ptn_path)
        ptn_year, ptn_month, ptn_day, ptn_hour, ptn_part = ptn_vals
        ptn_str = "year=%s/month=%s/day=%s/hour=%s/part=%s" % ptn_vals

        # Check if the Hive table partition we're about to create already exists
        if self.check_partition(ptn_str):
            errmsg = "Partition %s for table %s.%s already exists" \
                     % (ptn_str, self.db, self.table)
            logkv(logger, {"msg": errmsg}, "error")
            raise HiveManagerException()

        # If the partition does not exist, proceed to create it
        # Construct partition command
        partition_cmd = ''' \
        hive -e "use %s; \
        alter table %s \
        add partition (year = '%s', month = '%s', day = '%s', hour = '%s', part = '%s') \
        location '%s';" ''' % (self.db, self.table, ptn_year, ptn_month,
                               ptn_day, ptn_hour, ptn_part, ptn_path)
        try:
            self.shell_exec.safe_execute(partition_cmd,
                                         splitcmd=False,
                                         as_shell=True)
            logkv(logger, {
                "msg": "Created partition",
                "partition": ptn_str
            }, "info")

        except ShellException:
            logkv(logger, {
                "msg": "Error creating Hive partition",
                "partition": ptn_str
            }, "error")
            raise HiveManagerException()

    def drop_partition(self, ptn_str):
        """
        Deletes hive partition.

        @type ptn_str: str
        @param ptn_str: partition to delete in the yyyy/mm/dd/hh/part format

        @rtype: None
        @return: None
        """
        # Compose drop partition command
        dropcmd = ''' hive -e "use %s; alter table %s drop if exists partition (%s)"''' \
                  % (self.db, self.table, ptn_str)
        try:
            self.execute(dropcmd)
            logkv(logger, {
                "msg": "Dropped partition",
                "partition": ptn_str
            }, "info")
        except HiveManagerException:
            logkv(logger, {
                "msg": "Error dropping Hive partition",
                "partition": ptn_str
            }, "error")
            raise HiveManagerException()

    def create_table(self, ddlfile):
        """
        Creates a table by executing a 'create table' statement inside 'ddlfile'

        @type ddlfile: str
        @param ddlfile: path to the ddlfile

        @rtype: None
        @return: None
        """
        pass

    def drop_table(self):
        """
        Drops self.table

        @rtype: None
        @return: None
        """
        dropcmd = ''' hive -e "use %s; drop table %s" ''' % (self.db,
                                                             self.table)
        try:
            # Drop the table
            self.shell_exec.safe_execute(dropcmd,
                                         splitcmd=False,
                                         as_shell=True)
            logkv(logger, {
                "msg": "Dropped table",
                "table": "%s.%s" % (self.db, self.table)
            }, "info")
        except ShellException:
            logkv(
                logger, {
                    "msg": "Error dropping table",
                    "table": "%s.%s" % (self.db, self.table)
                }, "error")
            raise HiveManagerException()

    def drop_db(self):
        """
        Drops self.db, deletes all tables inside

        @rtype: None
        @return: None
        """
        dropcmd = ''' hive -e "drop database if exists %s cascade;" ''' % (
            self.db)
        try:
            # Drop the database
            self.shell_exec.safe_execute(dropcmd,
                                         splitcmd=False,
                                         as_shell=True)
            logkv(logger, {
                "msg": "Dropped database",
                "database": self.db
            }, "info")
        except ShellException:
            logkv(logger, {
                "msg": "Error dropping database",
                "database": self.db
            }, "warning")
            raise HiveManagerException()

    def check_partition(self, ptn_str):
        """
        Checks if a Hive partition exists. The purpose of this function is as
        follows: Hive will throw an exception if the partition we're about to
        create already exists, so create_partition uses this method to make
        sure that the partition does *not* exist.

        @type ptn_str: str
        @param ptn_str: Partition string.
                        E.g. "year=2016/month=06/day=06/hour=18/part=1"

        @rtype: bool
        @return: True if a partition exists, False otherwise
        """
        cmd = ''' hive -e "use %s; show partitions %s" ''' % (self.db,
                                                              self.table)
        try:
            result = self.shell_exec.safe_execute(cmd,
                                                  splitcmd=False,
                                                  as_shell=True)
            return bool(re.search(ptn_str, result.output))
        except Exception:
            logkv(logger, {
                "msg": "Error checking Hive partition",
                "partition": ptn_str
            }, "error")
            raise HiveManagerException()
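
A brief usage sketch (the db, table, and partition path are placeholders; the path must follow the yyyy/mm/dd/hh/part layout that parse_partition expects):

hive_mgr = HiveManager(db="thrive", table="events")
hive_mgr.create_partition("/data/parsed/2016/06/06/18/0")
hive_mgr.check_partition("year=2016/month=06/day=06/hour=18/part=0")  # -> True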
Example #8
    def __init__(self):
        self.shell_exec = ShellExecutor()