    def _cleanup_metadata(self):
        try:
            topic = self.get_config("dataset_name")
            self.metadata_mgr.purge(topic)
            logkv(logger, {"msg": "Purged metadata"})
        except MetadataManagerException as ex:
            logkv(logger, {"msg": "Purging metadata failed"}, "warning", ex)
    def __init__(self, config_file):
        """
        Constructor. Each ConfigLoader instance is initialized with a config file that
        it owns and manages

        @type config_file:  str
        @param config_file: Full or relative path of the config file

        @rtype: None
        @return: None
        """

        # Raise exception if config file not found
        if not os.path.exists(config_file):
            logkv(logger, {"msg": "Config file %s not found" % config_file},
                  "error")
            raise ConfigLoaderException()

        self.parser = SafeConfigParser()

        # If the config file has section headers, load it as is. If attempting to load a
        # config file without section headers, create a file object from the string,
        # prepend the section header "[main]" to it, and then populate the parser
        try:
            self.parser.read(config_file)
        except MissingSectionHeaderError:
            with open(config_file) as cf:
                properties = cf.read()
                configs = "[main]\n%s" % properties
                config_file_stringio = StringIO.StringIO(configs)
                self.parser.readfp(config_file_stringio)
        except ConfigParser.Error:
            logkv(logger, {"msg": "ConfigParser Error"}, "error")
            raise ConfigLoaderException()
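
# A minimal standalone sketch of the fallback path above, assuming a
# properties-style file with no section headers; the property names other than
# "dataset_name" are hypothetical. The parser wraps the raw text in a "[main]"
# section exactly as __init__ does.
import StringIO
from ConfigParser import SafeConfigParser

properties = "dataset_name=mydataset\nhdfs_root=/data/thrive"
parser = SafeConfigParser()
parser.readfp(StringIO.StringIO("[main]\n%s" % properties))
print parser.get("main", "dataset_name")   # -> mydataset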
    def _get_response(self, metric, start, end):
        """
        Internal method for making NewRelic API call

        @type metric: str
        @param metric: Endpoint to get counts from.  Acceptable values are "posted" and "produced".

        @type start: str
        @param start: String specifying start time and time zone, e.g. "2016-01-27T02:59:00+00:00"

        @type end: str
        @param end: String specifying end time and time zone, e.g. "2016-01-27T02:59:00+00:00"

        @rtype: str
        @return: Response from NewRelic API call to given endpoint
        """

        curl_cmd = '''curl -X GET '%s' -H '%s' -i -G -d 'names[]=%s.%s&from=%s&to=%s&summarize=true' ''' \
                   % (self.endpt_url, self.api_key, metric, self.dataset, start, end)

        try:
            # Execute curl command
            result = self.shell_exec.safe_execute(curl_cmd,
                                                  splitcmd=False,
                                                  as_shell=True)
            logkv(logger, {"msg": "Queried NewRelic"}, "info")
        except ShellException:
            raise NewRelicManagerException()

        return result.output
    def check_partition(self, ptn_str):
        """
        Checks whether a Hive partition exists. Purpose of this function: Hive
        will throw an exception if the partition we're about to create already
        exists, so create_partition uses this method to make sure that the
        partition does not already exist.

        @type ptn_str: str
        @param ptn_str: Partition string.
                        E.g. "year=2016/month=06/day=06/hour=18/part=1"

        @rtype: bool
        @return: True if a partition exists, False otherwise
        """
        cmd = ''' hive -e "use %s; show partitions %s" ''' % (self.db,
                                                              self.table)
        try:
            result = self.shell_exec.safe_execute(cmd,
                                                  splitcmd=False,
                                                  as_shell=True)
            return bool(re.search(ptn_str, result.output))
        except Exception:
            logkv(logger, {
                "msg": "Error checking Hive partition",
                "partition": ptn_str
            }, "error")
            raise HiveManagerException()
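
# Hedged usage sketch of the check described in the docstring above; the
# HiveManager constructor arguments and the create_partition call are
# assumptions not shown in this listing.
hive_mgr = HiveManager(db="thrive_db", table="mydataset")   # hypothetical constructor
ptn_str = "year=2016/month=06/day=06/hour=18/part=1"
if not hive_mgr.check_partition(ptn_str):
    hive_mgr.create_partition(ptn_str)   # only create when it does not exist yet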
    def truncate(self, vschema, vtable):
        """
        Truncates 'vschema.vtable'.

        @type vschema: str
        @param vschema: Schema of the table to be truncated

        @type vtable: str
        @param vtable: Target table

        @rtype: None
        @return: None
        """
        try:
            truncate_stmt = "truncate table %s.%s;" % (vschema, vtable)
            self.execute(stmt=truncate_stmt)
            logkv(logger, {
                "msg": "Truncated table",
                "schema": vschema,
                "table": vtable
            }, "info")
        except VerticaManagerException as ex:
            logkv(
                logger, {
                    "msg": "Failed to truncate table",
                    "schema": vschema,
                    "table": vtable
                }, "error", ex)
            raise
    def grantall(self, permissions, hdfspath):
        """
        Grants 'permissions' to all on HDFS hdfspath

        @type permissions: str
        @param permissions: Permission string. E.g: "rwx"

        @type hdfspath: str
        @param hdfspath: HDFS path to grant permissions on

        @rtype: None
        @return: None
        """
        try:
            self.shell_exec.safe_execute("hadoop fs -chmod -R a+%s %s" %
                                         (permissions, hdfspath))

            logkv(
                logger, {
                    "msg": "Granted permissions",
                    "permissions": permissions,
                    "path": hdfspath
                }, "info")
        except ShellException:
            logkv(
                logger, {
                    "msg": "Error granting permissions",
                    "permissions": permissions,
                    "path": hdfspath
                }, "warning")
            raise HdfsManagerException()
    def drop_partition(self, ptn_str):
        """
        Deletes hive partition.

        @type ptn_str: str
        @param ptn_str: partition to delete in the yyyy/mm/dd/hh/part format

        @rtype: None
        @return: None
        """
        # Compose drop partition command
        dropcmd = ''' hive -e "use %s; alter table %s drop if exists partition (%s)"''' \
                  % (self.db, self.table, ptn_str)
        try:
            self.execute(dropcmd)
            logkv(logger, {
                "msg": "Dropped partition",
                "partition": ptn_str
            }, "info")
        except ShellException:
            logkv(logger, {
                "msg": "Error dropping Hive partition",
                "partition": ptn_str
            }, "error")
            raise HiveManagerException()
    def _get_xml_saved_searches(self):
        """
        Get XML for all saved searches for given dataset

        @rtype: str
        @return: XML for all saved searches from this dataset's REST endpoint
        """

        curl_cmd = '''curl -k -u %s:%s %s/%s/%s/saved/searches?search=%s*''' \
                   % (self.user, self.passwd, self.url, self.user, self.app, self.base_name)

        try:
            # Execute curl command
            xml = self.shell_exec.safe_execute(curl_cmd,
                                               splitcmd=False,
                                               as_shell=True).output
            logkv(logger, {
                "msg": "Got XML for all alerts",
                "dataset": self.dataset
            }, "info")
            return xml
        except ShellException:
            logkv(
                logger, {
                    "msg": "Error in getting XML for all alerts",
                    "dataset": self.dataset
                }, "error")
            raise SplunkManagerException()
    def cleanup_all_alerts(self):
        """
        Delete all Splunk alerts for this dataset

        @rtype: None
        @return: None
        """

        xml = self._get_xml_saved_searches()

        # Parse to get the list of REST endpoints for all alerts
        alert_name_list = re.findall('<title>(%s.+?)</title>' % self.base_name,
                                     xml)
        logkv(logger, {
            "msg": "Parsed XML for all alerts",
            "dataset": self.dataset
        }, "info")

        if len(alert_name_list) != 0:
            # Call cleanup alert
            for alert in alert_name_list:
                self.cleanup_alert(alert)

        else:
            logkv(logger, {
                "msg": "No alerts to clean up",
                "dataset": self.dataset
            }, "info")
    def setup_dashboard(self, dash_xml):
        """
        Creates a new Splunk dashboard or overwrites it if it already exists

        @type dash_xml: str
        @param dash_xml: Dashboard XML to post

        @rtype: None
        @return: None
        """

        # Construct curl command (change this to use urllib2)
        curl_cmd = '''curl -k -u %s:%s %s/%s/%s/data/ui/views -d 'name=%s&eai:data=%s' ''' \
                   % (self.user, self.passwd, self.url, self.user, self.app, self.base_name, dash_xml)

        try:
            # Execute curl command
            self.shell_exec.safe_execute(curl_cmd,
                                         splitcmd=False,
                                         as_shell=True)
            logkv(logger, {
                "msg": "Set up dashboard",
                "dataset": self.dataset
            }, "info")
        except ShellException:
            logkv(logger, {
                "msg": "Error in setting up dashboard",
                "dataset": self.dataset
            }, "error")
            raise SplunkManagerException()
    def cleanup_alert(self, alert):
        """
        Deletes specified Splunk alert

        @type alert: str
        @param alert: Last part of URL path for alert REST endpoint

        @rtype: None
        @return: None
        """
        curl_cmd = '''curl -k -u %s:%s --request DELETE %s/%s/%s/saved/searches/%s''' \
                   % (self.user, self.passwd, self.url, self.user, self.app, alert)

        try:
            # Execute curl command
            self.shell_exec.safe_execute(curl_cmd,
                                         splitcmd=False,
                                         as_shell=True)
            logkv(logger, {
                "msg": "Removed alert %s" % alert,
                "dataset": self.dataset
            }, "info")
        except ShellException:
            logkv(
                logger, {
                    "msg": "Error in removing alert %s" % alert,
                    "dataset": self.dataset
                }, "error")
            raise SplunkManagerException()
    def cleanup_dashboard(self):
        """
        Delete an existing Splunk dashboard

        @rtype: None
        @return: None
        """

        owner = self._get_dash_owner()

        curl_cmd = '''curl -k -u %s:%s --request DELETE %s/%s/%s/data/ui/views/%s''' \
                   % (self.user, self.passwd, self.url, owner, self.app, self.base_name)

        try:
            # Execute curl command
            self.shell_exec.safe_execute(curl_cmd,
                                         splitcmd=False,
                                         as_shell=True)
            logkv(logger, {
                "msg": "Removed dashboard",
                "dataset": self.dataset
            }, "info")
        except ShellException:
            logkv(logger, {
                "msg": "Error in removing dashboard",
                "dataset": self.dataset
            }, "error")
            raise SplunkManagerException()
    def get_search_str(self):
        """
        Returns the Splunk search string for this alert, based on its type
        (self.alert_type, e.g. "dataset_locked")

        @rtype: str
        @return: Splunk search string corresponding to this alert type
        """
        try:
            base_search_str = "index=%s source=%s/*" % (self.index,
                                                        self.log_path)
            freq_str = "earliest=-%s" % self.time_window
            with open('resources/splunk_qry_str.json') as qry_file:
                qry_dict = json.load(qry_file)
            query_str = qry_dict[self.alert_type]
            search_str = ''''%s %s %s' ''' % (base_search_str, freq_str,
                                              query_str)
            return search_str
        except:
            logkv(logger,
                  {"msg": "Error getting search string for Splunk Alert"},
                  "error")
            raise SplunkAlertException()
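
# Worked sketch of how the pieces above compose, with hypothetical values; the
# real query fragment comes from resources/splunk_qry_str.json, whose contents
# are not shown in this listing.
base_search_str = "index=thrive_prod source=/var/log/thrive/mydataset/*"
freq_str = "earliest=-2h"
query_str = '"run failed"'
search_str = ''''%s %s %s' ''' % (base_search_str, freq_str, query_str)
# -> 'index=thrive_prod source=/var/log/thrive/mydataset/* earliest=-2h "run failed"'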
    def __init__(self, index, log_path, alert_configs):
        """
        Creates a SplunkAlert from the specified configs

        @type index: str
        @param index: Name of Splunk index (thrive_perf or thrive_prod)

        @type log_path: str
        @param log_path: Path where logs are stored

        @type alert_configs: dict(str, str)
        @param alert_configs: Dict of config keys and values for this alert. It must
        specify "type" (one of dataset_locked, run_failed, hive_vertica_mismatch,
        no_new_hdfs_dirs, hdfs_in_parse_mismatch) and "time_window" (a string in
        valid Splunk time format, e.g. "2h")

        @rtype: None
        @return: None
        """

        try:
            self.index = index
            self.log_path = log_path
            self.alert_configs = alert_configs

            self.alert_type = self.alert_configs.pop("type")
            self.time_window = self.alert_configs.pop("time_window")
        except:
            logkv(logger, {"msg": "Error initializing Splunk Alert"}, "error")
            raise SplunkAlertException()
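
# Illustrative construction with placeholder values; "type" and "time_window"
# are required (they are popped in __init__ above), and any remaining keys stay
# in alert_configs for later use. The extra key shown here is hypothetical.
alert = SplunkAlert(index="thrive_prod",
                    log_path="/var/log/thrive/mydataset",
                    alert_configs={"type": "run_failed",
                                   "time_window": "2h",
                                   "alert.severity": "4"})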
    def increment_release_attempt(self, dataset_name):
        """
        Increments release attempt

        @type dataset_name: str
        @param dataset_name: Dataset name

        @rtype: None
        @return: None
        """
        qry = '''
                 update thrive_dataset_lock
                 set release_attempts = release_attempts + 1
                 where dataset_name = '%s';
              ''' % dataset_name

        try:
            self.execute(qry)
        except Exception as ex:
            logkv(
                logger, {
                    "msg": "Failed to increment release attempts",
                    "dataset": dataset_name,
                    "query": qry,
                    "error": ex
                }, "error")
            raise MetadataManagerException()
    def grant_permissions_alert(self, alert_name, group="app"):
        """
        Grant permissions for an existing Splunk alert for the specified group

        @type alert_name: str
        @param alert_name: Name of the alert to grant permissions on

        @type group: str
        @param group: Sharing level to grant permissions at.  Default is "app"

        @rtype: None
        @return: None
        """

        curl_cmd = '''curl -k -u %s:%s %s/%s/%s/saved/searches/%s/acl -d sharing=%s -d owner=%s''' \
                   % (self.user, self.passwd, self.url, self.user, self.app, alert_name, group, self.user)

        try:
            # Execute curl command
            self.shell_exec.safe_execute(curl_cmd,
                                         splitcmd=False,
                                         as_shell=True)
            logkv(logger, {
                "msg": "Granted permissions on alert",
                "dataset": self.dataset
            }, "info")
        except ShellException:
            logkv(
                logger, {
                    "msg": "Error in granting permissions on alert",
                    "dataset": self.dataset
                }, "error")
            raise SplunkManagerException()
    def decompress(self, srcpath, dstpath):
        """
        Decompresses data file(s) from 'srcpath' and outputs to 'dstpath'. Both srcpath
        and dstpath are HDFS paths

        @type srcpath: str
        @param srcpath: Location of compressed files

        @type dstpath: str
        @param dstpath: Location of uncompressed files

        @rtype: None
        @return: None
        """
        cmd = "hadoop fs -text %s | hadoop fs -put - %s" % (srcpath, dstpath)
        try:
            self.shell_exec.safe_execute(cmd,
                                         as_shell=True,
                                         splitcmd=False,
                                         verbose=False)
            logkv(
                logger, {
                    "msg": "Decompressed source data into target",
                    "source": srcpath,
                    "target": dstpath
                }, "info")
        except ShellException:
            logkv(logger, {
                "msg": "HDFS decompress failed",
                "cmd": cmd
            }, "error")
            raise HdfsManagerException()
    def _agg_alert_params(self, alert_name):
        """
        Returns a string that specifies the various fields / values associated with this alert

        @type alert_name: str
        @param alert_name: Name of alert whose params are to be aggregated

        @rtype: str
        @return: String specifying the parameters for the alert to be used in a call to Splunk REST API
        """

        params_curl_cmd = ""
        try:
            alert_dict = self.splunk_alerts[alert_name].alert_configs
        except:
            logkv(
                logger, {
                    "msg": "Error finding alert name %s" % alert_name,
                    "dataset": self.dataset
                }, "error")
            raise SplunkManagerException()

        for key in alert_dict.keys():
            params_curl_cmd += """ -d %s=%s""" % (key, alert_dict[key])
        return params_curl_cmd
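
# Worked example of the parameter string this method builds, with a
# hypothetical alert_configs dict; the key names are assumptions and only
# loosely follow Splunk's saved-search fields.
alert_dict = {"alert.severity": "4", "is_scheduled": "1"}
params_curl_cmd = ""
for key in alert_dict.keys():
    params_curl_cmd += " -d %s=%s" % (key, alert_dict[key])
# params_curl_cmd is now, in some order:
# " -d alert.severity=4 -d is_scheduled=1"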
    def putfile(self, localpath, hdfspath):
        """
        Copies a resource from local path to HDFS path

        @type localpath: str
        @param localpath: Path to resource on local fileystem

        @type hdfspath: str
        @param hdfspath: HDFS destination path

        @rtype: None
        @return: None
        """
        try:
            self.shell_exec.safe_execute("hadoop fs -put %s %s" %
                                         (localpath, hdfspath))
            logkv(logger,
                  {"msg": "Put %s to HDFS path %s" % (localpath, hdfspath)},
                  "info")
        except ShellException:
            logkv(
                logger,
                {"msg": "HDFS putfile failed. %s %s" % (localpath, hdfspath)},
                "error")
            raise HdfsManagerException()
    def get_lastdir(self, dataset_name, hive_table, load_type):
        """
        Returns the last directory processed for 'dataset_name' by querying the
        thrive_load_metadata table

        @type dataset_name: str
        @param dataset_name: Dataset being loaded

        @type hive_table: str
        @param hive_table: Hive table the dataset is loaded into

        @type load_type: str
        @param load_type: Type of the load

        @rtype: str
        @return: Last Camus directory loaded
        """

        qry = '''
                 select last_load_folder
                 from thrive_load_metadata
                 where dataset_name = '%s'
                 and hive_table = '%s'
                 and load_type = '%s'
                 order by hive_end_ts desc
                 limit 1;
              ''' % (dataset_name, hive_table, load_type)

        try:
            return self.execute_return(qry)[0][0]
        except Exception as ex:
            logkv(
                logger, {
                    "msg": "Failed to get last dir for dataset",
                    "dataset": dataset_name,
                    "table": hive_table,
                    "query": qry,
                    "error": ex
                }, "error")
            raise MetadataManagerException()
    def drop_table(self):
        """
        Drops self.table

        @rtype: None
        @return: None
        """
        dropcmd = ''' hive -e "use %s; drop table %s" ''' % (self.db,
                                                             self.table)
        try:
            # Drop the table
            self.shell_exec.safe_execute(dropcmd,
                                         splitcmd=False,
                                         as_shell=True)
            logkv(logger, {
                "msg": "Dropped table",
                "table": "%s.%s" % (self.db, self.table)
            }, "info")
        except ShellException:
            logkv(
                logger, {
                    "msg": "Error dropping table",
                    "table": "%s.%s" % (self.db, self.table)
                }, "error")
            raise HiveManagerException()
    def purge(self, dataset_name):
        """
        Purges metadata entries for 'dataset_name' from the thrive_setup,
        thrive_load_metadata, and thrive_dataset_lock tables

        @type dataset_name: str
        @param dataset_name: Name of the dataset to purge

        @rtype: None
        @return: None
        """
        thrive_tables = ("thrive_setup", "thrive_load_metadata",
                         "thrive_dataset_lock")
        try:
            for md_table in thrive_tables:
                purge_setup_qry = "delete from %s where dataset_name = '%s';" \
                                  % (md_table, dataset_name)
                self.execute(purge_setup_qry)
        except Exception as ex:
            logkv(
                logger, {
                    "msg": "Purge failed for dataset",
                    "dataset": dataset_name,
                    "error": ex
                }, "error")
            raise MetadataManagerException()
    def get_config(self, config, configtype="data"):
        """
        Returns the value of the requested "config" of type "configtype". "data" configs
        are specific to the dataset being processed; "env" configs are global and apply
        to the environment in which the processing is happening (production,
        preproduction, dev, etc.).

        @type configtype: str
        @param configtype: type of configuration "env" or "data"

        @type config: str
        @param config: Key of the config whose value is desired

        @rtype: str
        @return: value of configuration parameter "config"
        """

        if configtype == "data":
            return self.datacfg.get_config("main", config).strip()
        elif configtype == "env":
            return self.envcfg.get_config("main", config).strip()
        else:
            logkv(logger, {
                "msg": "Unknown configuration type",
                "configtype": configtype
            }, "error")
            raise ThriveHandlerException()
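
# Hedged usage sketch; "handler" stands for the object exposing get_config, and
# the config keys other than "dataset_name" (used elsewhere in this listing)
# are assumptions.
dataset = handler.get_config("dataset_name")                     # dataset-level config
vertica_db = handler.get_config("vertica_db", configtype="env")  # environment-level config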
    def lock(self, dataset_name):
        """
        Locks the specified dataset. Each instance of the load process checks for the
        lock state. The lock state must be 0 for the load process to proceed.

        @type dataset_name: str
        @param dataset_name: Name of the dataset to be locked

        @rtype: None
        @return: None
        """
        lock_qry = '''
                      update thrive_dataset_lock
                      set locked = TRUE, release_attempts = 0
                      where  dataset_name = '%s';
                   ''' % dataset_name
        try:
            self.execute(lock_qry)
        except Exception as ex:
            logkv(
                logger, {
                    "msg": "Failed to set dataset lock",
                    "dataset": dataset_name,
                    "query": lock_qry,
                    "error": ex
                }, "error")
            raise MetadataManagerException()
    def zip26(_resources_file, _outdir, _tozip):
        """
        Creation of zip artifact for Python2.6

        @type _resources_file: str
        @param _resources_file: Name of the output zip artifact

        @type _outdir: str
        @param _outdir: Directory location of the zip artifact

        @type _tozip: list
        @param _tozip: List of files to zip into the artifact

        @rtype: None
        @return: None

        @exception: Exception
        """
        zf = None
        try:
            zf = zipfile.ZipFile(_resources_file, "w")
            for _file in _tozip:
                _fpath = os.path.join(_outdir, _file)
                logkv(logger, {"msg": "Zipping", "file": _file}, "info")
                zf.write(_fpath, os.path.basename(_fpath))
        except Exception as ex:
            logkv(logger, {"msg": "Error creating resources file",
                           "resources_file": _resources_file,
                           "error": ex}, "error")
            raise PrepareHandlerException
        finally:
            # Close the archive only if it was actually opened
            if zf is not None:
                zf.close()
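
# Hedged usage sketch; the paths and file names are placeholders, and zip26 is
# assumed to be callable without an instance (it takes no self).
zip26("/tmp/thrive_resources.zip", "/tmp/build",
      ["loader.py", "dataset.cfg"])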
    def __init__(self, credentials):
        """
        Initializes the connection object by establishing connection with
        the metadata DB

        @type credentials: dict
        @param credentials: parameters required for connection

        @rtype: None
        @return: None
        """
        self.credentials = credentials

        try:
            self.connection = pyodbc.connect(
                "DRIVER={%s};SERVER=%s;PORT=%s;UID=%s;PWD=%s;DB=%s" %
                (self.credentials["dbtype"], self.credentials["dbhost"],
                 self.credentials["dbport"], self.credentials["dbuser"],
                 self.credentials["dbpass"], self.credentials["dbname"]))
        except Exception as ex:
            logkv(logger, {
                "msg": "Could not connect to database",
                "credentials": credentials
            }, "error")
            raise MetadataManagerException()
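
# Example credentials dict matching the keys referenced in the connection
# string above; every value here is a placeholder, and the owning class name
# is inferred from the exception it raises.
credentials = {"dbtype": "MySQL ODBC 5.3 Driver",
               "dbhost": "metadata-db.example.com",
               "dbport": "3306",
               "dbuser": "thrive",
               "dbpass": "********",
               "dbname": "thrive_metadata"}
metadata_mgr = MetadataManager(credentials)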
    def _extract_count(response):
        """
        Internal method for getting event count from NewRelic API response.

        @type response: str
        @param response: Response from NewRelic API call, which contains header and JSON containing number of counts

        @rtype: int
        @return: Number of counts in call_count field
        """
        try:
            response_json = re.findall(JSON_PATTERN, response)[0]
            metrics_dict = json.loads(response_json)
            count_value = metrics_dict["metric_data"]["metrics"][0][
                "timeslices"][0]["values"]["call_count"]
            counts = int(count_value)
        except (IndexError, KeyError, ValueError, TypeError) as ex:
            logkv(
                logger, {
                    "msg":
                    "Error in retrieving requested metric. Returning -1",
                    "error": ex.message
                }, "warning")
            counts = -1

        return counts
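
# Minimal sketch of the payload shape this parser expects; the JSON below is
# reconstructed from the keys accessed above, and JSON_PATTERN is assumed to
# pull this JSON body out of the raw curl output (headers + body).
import json

response_json = ('{"metric_data": {"metrics": '
                 '[{"timeslices": [{"values": {"call_count": 12345}}]}]}}')
metrics_dict = json.loads(response_json)
print metrics_dict["metric_data"]["metrics"][0]["timeslices"][0]["values"]["call_count"]
# -> 12345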
    def release(self, dataset_name):
        """
        Releases lock on a dataset.

        @type dataset_name: str
        @param dataset_name: Name of the dataset whose lock is to be released

        @rtype: None
        @return: None
        """
        release_qry = '''
                         update thrive_dataset_lock
                         set locked = FALSE
                         where dataset_name='%s';
                      ''' % dataset_name
        try:
            self.execute(release_qry)
        except Exception as ex:
            logkv(
                logger, {
                    "msg": "Failed to release dataset lock",
                    "dataset": dataset_name,
                    "query": release_qry,
                    "error": ex
                }, "error")
            raise MetadataManagerException()
    def safe_execute(cmd_string, **kwargs):
        """
        Executes command string and raises exception if return code is not 0

        @type cmd_string: string
        @param cmd_string: Command string with arguments separated by spaces

        @type verbose: bool
        @param verbose: Echoes the command being executed

        @type splitcmd: bool
        @param splitcmd: If true, command is split into a list of arguments before
        being passed to Popen

        @type as_shell: bool
        @param as_shell: Sets the "shell" option of Popen

        @rtype: ShellResult
        @return: ShellResult instance containing return code, output, and error messages
        """

        result = ShellExecutor.execute(cmd_string, **kwargs)
        if result.retcode != 0:
            logkv(
                logger, {
                    "msg": "Error in execution of shell command",
                    "cmd": cmd_string,
                    "retcode": result.retcode,
                    "error": result.error
                }, "warning")
            raise ShellException
        else:
            return result
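
# Hedged usage sketch mirroring the calls elsewhere in this listing; the
# command is illustrative, and safe_execute is assumed to be exposed as a
# static/class-level helper on ShellExecutor (it takes no self).
result = ShellExecutor.safe_execute("hadoop fs -ls /data/thrive",
                                    splitcmd=False,
                                    as_shell=True)
print result.output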
    def _cleanup_hive(self):
        try:
            self.hive_mgr.drop_table()
            logkv(logger, {"msg": "Cleaning Hive tables"})
        except HiveManagerException as ex:
            logkv(logger, {"msg": "Cleaning Hive tables failed"}, "warning",
                  ex)