Code example #1
    def on_change(self, attr, prev_value, current_value):
        if attr == "status" and "tendrl/monitor" in NS.node_context.tags:
            _tc = NS.tendrl.objects.TendrlContext(node_id=self.node_id).load()
            # Check node is managed
            _cnc = NS.tendrl.objects.ClusterNodeContext(
                node_id=self.node_id,
                integration_id=_tc.integration_id).load()
            if current_value is None and str(_cnc.is_managed).lower() == "yes":
                self.status = "DOWN"
                self.save()
                msg = "Node {0} is DOWN".format(self.fqdn)
                event_utils.emit_event("node_status",
                                       self.status,
                                       msg,
                                       "node_{0}".format(self.fqdn),
                                       "WARNING",
                                       node_id=self.node_id,
                                       integration_id=_tc.integration_id)
                # Loading cluster_node_context will load node_context
                # and it will be updated with the latest values
                _cnc_new = \
                    NS.tendrl.objects.ClusterNodeContext(
                        node_id=self.node_id,
                        integration_id=_tc.integration_id,
                        first_sync_done=_cnc.first_sync_done,
                        is_managed=_cnc.is_managed
                    )
                _cnc_new.save()
                del _cnc_new
                # Update cluster details
                self.update_cluster_details(_tc.integration_id)
                _tag = "provisioner/%s" % _tc.integration_id
                if _tag in self.tags:
                    _index_key = "/indexes/tags/%s" % _tag
                    self.tags.remove(_tag)
                    self.save()
                    etcd_utils.delete(_index_key)
                if _tc.sds_name in ["gluster", "RHGS"]:
                    bricks = etcd_utils.read(
                        "clusters/{0}/Bricks/all/{1}".format(
                            _tc.integration_id, self.fqdn))

                    for brick in bricks.leaves:
                        try:
                            etcd_utils.write("{0}/status".format(brick.key),
                                             "Stopped")
                        except (etcd.EtcdAlreadyExist, etcd.EtcdKeyNotFound):
                            pass
            elif current_value == "UP" and str(
                    _cnc.is_managed).lower() == "yes":
                msg = "{0} is UP".format(self.fqdn)
                event_utils.emit_event("node_status",
                                       "UP",
                                       msg,
                                       "node_{0}".format(self.fqdn),
                                       "INFO",
                                       node_id=self.node_id,
                                       integration_id=_tc.integration_id)
            del _cnc
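
Every snippet on this page goes through the same thin etcd_utils helpers rather than calling etcd directly. The real module is not shown here, so what follows is only a minimal sketch of what those helpers could look like, inferred from the call sites in these examples; python-etcd's etcd.Client and a single shared client handle on the global NS namespace (as set up in the test example further down) are assumptions.

def write(key, value, quorum=True, **kwargs):
    # quorum is accepted for call-site compatibility; extra kwargs such as
    # ttl or append are handed straight to etcd.Client.write.
    return NS._int.wclient.write(key, value, **kwargs)


def read(key, **kwargs):
    # Returns an EtcdResult, hence the .value / .leaves usage in the examples.
    return NS._int.wclient.read(key, **kwargs)


def delete(key, recursive=False, **kwargs):
    return NS._int.wclient.delete(key, recursive=recursive, **kwargs)


def refresh(key, ttl):
    # etcd.Client.refresh updates a key's TTL without changing its value.
    return NS._int.wclient.refresh(key, ttl=ttl)
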
Code example #2
    def volume_remove_brick_force(self, event):
        time.sleep(self.sync_interval)
        # The event returns the bricks as a single space-separated string
        bricks = event['message']['bricks'].split(" ")
        for brick in bricks:
            fetched_brick = NS.gluster.objects.Brick(
                fqdn=brick.split(":/")[0],
                brick_dir=brick.split(":/")[1].replace('/', '_')).load()

            try:
                NS._int.wclient.delete(
                    "clusters/{0}/Bricks/all/{1}/{2}".format(
                        NS.tendrl_context.integration_id,
                        brick.split(":/")[0],
                        brick.split(":/")[1].replace('/', '_')),
                    recursive=True,
                )
            except etcd.EtcdKeyNotFound:
                pass

            job_id = monitoring_utils.update_dashboard(
                "%s|%s" % (event['message']['volume'], brick),
                RESOURCE_TYPE_BRICK, NS.tendrl_context.integration_id,
                "delete")
            logger.log(
                "debug", NS.publisher_id,
                {"message": "Update dashboard job %s "
                 "created" % job_id})

            job_id = monitoring_utils.delete_resource_from_graphite(
                "%s|%s" % (event['message']['volume'], brick),
                RESOURCE_TYPE_BRICK, NS.tendrl_context.integration_id,
                "delete")
            logger.log(
                "debug", NS.publisher_id, {
                    "message":
                    "Delete resource from graphite job %s "
                    "created" % job_id
                })

        volume_brick_path = "clusters/{0}/Volumes/{1}/"\
                            "Bricks".format(
                                NS.tendrl_context.integration_id,
                                fetched_brick.vol_id,
                            )

        # Remove all the brick information under the volume as the
        # subvolume might have changed; let the next sync handle
        # the update of the brick info
        try:
            NS._int.wclient.delete(volume_brick_path, recursive=True)
        except etcd.EtcdKeyNotFound:
            pass

        _trigger_sync_key = 'clusters/%s/_sync_now' % NS.tendrl_context.integration_id
        etcd_utils.write(_trigger_sync_key, 'true')
        etcd_utils.refresh(_trigger_sync_key, self.sync_interval)
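
The last three lines above are a small pattern of their own: raise a "sync now" flag and give it a TTL so it expires if nothing acts on it within one sync interval. Restated as a hypothetical helper (the name and parameters are illustrative, not the project's API):

def request_cluster_sync(integration_id, interval_seconds):
    # Write a marker key that the cluster sync is assumed to watch, then
    # let it auto-expire so a missed sync does not leave a stale flag.
    sync_key = 'clusters/%s/_sync_now' % integration_id
    etcd_utils.write(sync_key, 'true')
    etcd_utils.refresh(sync_key, interval_seconds)
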
Code example #3
File: __init__.py Project: Tendrl/gluster_bridge
    def shutdown(signum, frame):
        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "Signal handler: stopping"}
        )
        # Remove the node's name from gluster server tag
        try:
            gl_srvr_list = etcd_utils.read(
                "/indexes/tags/gluster/server"
            ).value
            gl_srvr_list = json.loads(gl_srvr_list)
            if NS.node_context.node_id in gl_srvr_list:
                gl_srvr_list.remove(NS.node_context.node_id)
            etcd_utils.write(
                "/indexes/tags/gluster/server",
                json.dumps(gl_srvr_list)
            )
            node_tags = NS.node_context.tags
            if 'provisioner/%s' % NS.tendrl_context.integration_id \
                in node_tags:
                etcd_utils.delete(
                    "/indexes/tags/provisioner/%s" %
                    NS.tendrl_context.integration_id,
                    recursive=True
                )
            int_srvr_list = etcd_utils.read(
                "/indexes/tags/tendrl/integration/gluster"
            ).value
            int_srvr_list = json.loads(int_srvr_list)
            if NS.node_context.node_id in int_srvr_list:
                int_srvr_list.remove(NS.node_context.node_id)
            etcd_utils.write(
                "/indexes/tags/tendrl/integration/gluster",
                json.dumps(int_srvr_list)
            )
        except etcd.EtcdKeyNotFound:
            logger.log(
                "debug",
                NS.publisher_id,
                {
                    "message": "Couldnt remove node from "
                    "gluster servers list tag."
                    "integration_id: %s, node_id: %s" %
                    (
                        NS.tendrl_context.integration_id,
                        NS.node_context.node_id
                    )
                }
            )
            pass

        complete.set()
        m.stop()
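
The handler above repeats the same read-modify-write cycle for each tag index. A hypothetical helper (not part of the code shown here) capturing that pattern, under the same assumption that keys below /indexes/tags/ hold JSON-encoded lists of node ids:

import json

import etcd


def remove_node_from_tag_index(index_key, node_id):
    # Read the JSON-encoded member list, drop the node id, write it back.
    try:
        members = json.loads(etcd_utils.read(index_key).value)
    except etcd.EtcdKeyNotFound:
        return
    if node_id in members:
        members.remove(node_id)
        etcd_utils.write(index_key, json.dumps(members))

As in the original, the cycle is not atomic, so two nodes shutting down at the same time could overwrite each other's update; the node sync repopulating these indexes (as a comment in a later example notes) is what eventually repairs them.
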
Code example #4
    def shutdown(signum, frame):
        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "Signal handler: stopping"}
        )
        # Remove the node's name from gluster server tag
        try:
            gl_srvr_list = etcd_utils.read(
                "/indexes/tags/gluster/server"
            ).value
            gl_srvr_list = json.loads(gl_srvr_list)
            if NS.node_context.node_id in gl_srvr_list:
                gl_srvr_list.remove(NS.node_context.node_id)
            etcd_utils.write(
                "/indexes/tags/gluster/server",
                json.dumps(gl_srvr_list)
            )
            node_tags = json.loads(NS.node_context.tags)
            if 'provisioner/%s' % NS.tendrl_context.integration_id \
                in node_tags:
                etcd_utils.delete(
                    "/indexes/tags/provisioner/%s" %
                    NS.tendrl_context.integration_id,
                    recursive=True
                )
            int_srvr_list = etcd_utils.read(
                "/indexes/tags/tendrl/integration/gluster"
            ).value
            int_srvr_list = json.loads(int_srvr_list)
            if NS.node_context.node_id in int_srvr_list:
                int_srvr_list.remove(NS.node_context.node_id)
            etcd_utils.write(
                "/indexes/tags/tendrl/integration/gluster",
                json.dumps(int_srvr_list)
            )
        except etcd.EtcdKeyNotFound:
            logger.log(
                "debug",
                NS.publisher_id,
                {
                    "message": "Couldnt remove node from "
                    "gluster servers list tag."
                    "integration_id: %s, node_id: %s" %
                    (
                        NS.tendrl_context.integration_id,
                        NS.node_context.node_id
                    )
                }
            )
            pass

        complete.set()
        m.stop()
Code example #5
File: logger.py Project: Tendrl/bridge_common
 def push_operation(self):
     etcd_utils.write(
         "/messages/jobs/%s" % self.message.job_id,
         Message.to_json(self.message),
         append=True)
     etcd_utils.refresh(
         "/messages/jobs/%s" % self.message.job_id,
         ttl=NS.config.data['message_retention_time']
     )
     log_message = ("%s:%s") % (
         self.message.job_id,
         self.message.payload["message"])
     return log_message
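
The append=True call above relies on etcd v2's in-order keys: a POST to the job directory creates a new child key for every message instead of overwriting one value, and the follow-up refresh bounds how long those messages are retained. A hedged restatement of the pattern (the helper name and parameters are illustrative):

def push_job_message(job_id, message_json, retention_seconds):
    dir_key = "/messages/jobs/%s" % job_id
    # append=True makes etcd create an in-order child under dir_key, so
    # successive pushes queue up instead of replacing each other.
    etcd_utils.write(dir_key, message_json, append=True)
    # Refresh the directory TTL so old job messages expire automatically.
    etcd_utils.refresh(dir_key, ttl=retention_seconds)
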
Code example #7
    def on_change(self, attr, prev_value, current_value):
        if attr == "status":
            if current_value is None:
                self.status = "DOWN"
                self.save()
                msg = "Node {0} is DOWN".format(self.fqdn)
                event_utils.emit_event("node_status",
                                       self.status,
                                       msg,
                                       "node_{0}".format(self.fqdn),
                                       "WARNING",
                                       node_id=self.node_id)

                _tc = NS.tendrl.objects.TendrlContext(
                    node_id=self.node_id).load()
                _tag = "provisioner/%s" % _tc.integration_id
                if _tag in self.tags:
                    _index_key = "/indexes/tags/%s" % _tag
                    self.tags.remove(_tag)
                    self.save()
                    etcd_utils.delete(_index_key)
                    _msg = "node_sync, STALE provisioner node "\
                        "found! re-configuring monitoring "\
                        "(job-id: %s) on this node"
                    payload = {
                        "tags": ["tendrl/node_%s" % self.node_id],
                        "run": "tendrl.flows.ConfigureMonitoring",
                        "status": "new",
                        "parameters": {
                            'TendrlContext.integration_id': _tc.integration_id
                        },
                        "type": "node"
                    }
                    _job_id = str(uuid.uuid4())
                    NS.tendrl.objects.Job(job_id=_job_id,
                                          status="new",
                                          payload=payload).save()
                    logger.log("debug", NS.publisher_id,
                               {"message": _msg % _job_id})

                if _tc.sds_name == "gluster":
                    bricks = etcd_utils.read(
                        "clusters/{0}/Bricks/all/{1}".format(
                            _tc.integration_id, self.fqdn))

                    for brick in bricks.leaves:
                        try:
                            etcd_utils.write("{0}/status".format(brick.key),
                                             "Stopped")
                        except (etcd.EtcdAlreadyExist, etcd.EtcdKeyNotFound):
                            pass
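
The stale-provisioner branch above also shows how these agents queue follow-up work: a Job object is saved whose payload names the flow to run, the node tag to target and the parameters it needs. Distilled into a hypothetical helper:

import uuid


def queue_flow_on_node(flow_name, node_id, parameters):
    # Jobs tagged tendrl/node_<node_id> are intended for that node's agent.
    payload = {
        "tags": ["tendrl/node_%s" % node_id],
        "run": flow_name,
        "status": "new",
        "parameters": parameters,
        "type": "node",
    }
    job_id = str(uuid.uuid4())
    NS.tendrl.objects.Job(
        job_id=job_id, status="new", payload=payload
    ).save()
    return job_id
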
Code example #8
File: __init__.py Project: BwithPrashant/commons
    def save(self, update=True, ttl=None):
        hash_key_changed = True
        if "Message" not in self.__class__.__name__:
            # If local object.hash is equal to
            # central_store object.hash, return
            if self.hash_compare_with_central_store(ttl=ttl):
                # No change in hashkey
                hash_key_changed = False
        rendered_obj = self.render()
        watchables = self._defs.get("watch_attrs", [])
        if self.__class__.__name__ in ['Config', 'Definition'] or \
            len(watchables) > 0:
            for item in rendered_obj:
                if item['name'] in watchables:
                    _type = self._defs.get("attrs", {}).get(item['name'],
                                                            {}).get("type")
                    if _type and _type.lower() in ['json', 'list'] and \
                        item['value']:
                        try:
                            item['value'] = json.dumps(item['value'])
                        except ValueError:
                            _msg = "Error save() attr %s for object %s" % \
                                   (item['name'], self.__name__)
                            logger.log("debug", NS.publisher_id,
                                       {"message": _msg})
                    if self._ttl and item['name'] in self._attrs_with_ttl:
                        etcd_utils.write(item['key'],
                                         item['value'],
                                         quorum=True,
                                         ttl=self._ttl)
                    else:
                        etcd_utils.write(item['key'],
                                         item['value'],
                                         quorum=True)
        if hash_key_changed:
            data_key = self.value + '/data'
            etcd_utils.write(data_key, self.json)
            updated_at_key = self.value + '/updated_at'
            hash_key = self.value + '/hash'
            etcd_utils.write(updated_at_key, str(time_utils.now()))
            if hasattr(self, 'hash'):
                etcd_utils.write(hash_key, self.hash)

            if ttl:
                etcd_utils.refresh(self.value, ttl)

        self.watch_attrs()
Code example #9
File: __init__.py Project: Tendrl/bridge_common
    def save(self, update=True, ttl=None):
        hash_key_changed = True
        if "Message" not in self.__class__.__name__:
            # If local object.hash is equal to
            # central_store object.hash, return
            if self.hash_compare_with_central_store(ttl=ttl):
                # No change in hashkey
                hash_key_changed = False
        rendered_obj = self.render()
        watchables = self._defs.get("watch_attrs", [])
        if self.__class__.__name__ in ['Config', 'Definition'] or \
            len(watchables) > 0:
            for item in rendered_obj:
                if item['name'] in watchables:
                    _type = self._defs.get("attrs", {}).get(
                        item['name'],
                        {}
                    ).get("type")
                    if _type and _type.lower() in ['json', 'list'] and \
                        item['value']:
                        try:
                            item['value'] = json.dumps(item['value'])
                        except ValueError:
                            _msg = "Error save() attr %s for object %s" % \
                                   (item['name'], self.__name__)
                            logger.log(
                                "debug",
                                NS.publisher_id,
                                {"message": _msg}
                            )
                    etcd_utils.write(item['key'], item['value'], quorum=True)
        if hash_key_changed:
            data_key = self.value + '/data'
            etcd_utils.write(data_key, self.json)
            updated_at_key = self.value + '/updated_at'
            hash_key = self.value + '/hash'
            etcd_utils.write(updated_at_key, str(time_utils.now()))
            if hasattr(self, 'hash'):
                etcd_utils.write(hash_key, self.hash)

            if ttl:
                etcd_utils.refresh(self.value, ttl)

        self.watch_attrs()
Code example #10
File: disk_sync.py Project: Tendrl/node-agent
def sync():
    try:
        _keep_alive_for = int(NS.config.data.get("sync_interval", 10)) + 250
        disks = get_node_disks()
        disk_map = {}
        for disk in disks:
            # Creating dict with disk name as key and disk_id as value
            # It will help populate block device disk_id attribute
            _map = dict(disk_id=disks[disk]['disk_id'], ssd=False)
            disk_map[disks[disk]['disk_name']] = _map
        block_devices = get_node_block_devices(disk_map)

        for disk in disks:
            if disk_map[disks[disk]['disk_name']]:
                disks[disk]['ssd'] = disk_map[disks[disk]['disk_name']]['ssd']

            if "virtio" in disks[disk]["driver"]:
                # Virtual disk
                NS.tendrl.objects.VirtualDisk(**disks[disk]).save(
                    ttl=_keep_alive_for)
            else:
                # physical disk
                NS.tendrl.objects.Disk(**disks[disk]).save(ttl=_keep_alive_for)

        for device in block_devices['all']:
            NS.tendrl.objects.BlockDevice(**device).save(ttl=_keep_alive_for)
        for device_id in block_devices['used']:
            etcd_utils.write("nodes/%s/LocalStorage/BlockDevices/used/%s" %
                             (NS.node_context.node_id,
                              device_id.replace("/", "_").replace("_", "", 1)),
                             device_id,
                             ttl=_keep_alive_for)
        for device_id in block_devices['free']:
            etcd_utils.write("nodes/%s/LocalStorage/BlockDevices/free/%s" %
                             (NS.node_context.node_id,
                              device_id.replace("/", "_").replace("_", "", 1)),
                             device_id,
                             ttl=_keep_alive_for)
        raw_reference = get_raw_reference()
        etcd_utils.write(
            "nodes/%s/LocalStorage/DiskRawReference" % NS.node_context.node_id,
            raw_reference,
            ttl=_keep_alive_for,
        )
    except (Exception, KeyError) as ex:
        _msg = "node_sync disks sync failed: " + ex.message
        Event(
            ExceptionMessage(priority="error",
                             publisher=NS.publisher_id,
                             payload={
                                 "message": _msg,
                                 "exception": ex
                             }))
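
The chained replace calls above only sanitise a device path into a flat, etcd-safe key component:

# "/dev/sda" -> "_dev_sda" -> "dev_sda": the first replace turns every "/"
# into "_", the second strips just the leading underscore.
assert "/dev/sda".replace("/", "_").replace("_", "", 1) == "dev_sda"
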
Code example #11
def test_write():
    setattr(__builtin__, "NS", maps.NamedDict())
    setattr(NS, "_int", maps.NamedDict())
    NS._int.wclient = importlib.import_module("tendrl.commons"
                                              ".tests.fixtures."
                                              "client").Client()
    NS._int.wreconnect = type("Dummy", (object, ), {})
    with patch.object(Client, "write") as mock_write:
        etcd_utils.write("key", "test_value", False)
        assert mock_write.called
    with patch.object(Client, "write",
                      raise_etcdconnectionfailed) as mock_write:
        with pytest.raises(etcd.EtcdConnectionFailed):
            etcd_utils.write("key", "test_value", False)
    with patch.object(Client, "write", raise_etcdkeynotfound) as mock_write:
        with pytest.raises(etcd.EtcdKeyNotFound):
            etcd_utils.write("key", "test_value", False)
Code example #12
def test_write():
    setattr(__builtin__, "NS", maps.NamedDict())
    setattr(NS, "_int", maps.NamedDict())
    NS._int.wclient = importlib.import_module("tendrl.commons"
                                              ".tests.fixtures."
                                              "client").Client()
    NS._int.wreconnect = type("Dummy", (object,), {})
    with patch.object(Client, "write") as mock_write:
        etcd_utils.write("key", "test_value", False)
        assert mock_write.called
    with patch.object(Client, "write",
                      raise_etcdconnectionfailed) as mock_write:
        with pytest.raises(etcd.EtcdConnectionFailed):
            etcd_utils.write("key", "test_value", False)
    with patch.object(Client, "write",
                      raise_etcdkeynotfound) as mock_write:
        with pytest.raises(etcd.EtcdKeyNotFound):
            etcd_utils.write("key", "test_value", False)
Code example #13
def update_last_seen_at():
    etcd_utils.write(
        '/monitoring/nodes/%s/last_seen_at' % NS.node_context.node_id,
        tendrl_now().isoformat())
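
Other examples on this page (the disk sync, for instance) pass a ttl straight through write so stale keys expire on their own; a hypothetical variant of the heartbeat helper above doing the same:

def update_last_seen_at_with_ttl(ttl_seconds):
    # Same key as above, but with a TTL so a dead node's heartbeat entry
    # disappears instead of going stale.
    etcd_utils.write(
        '/monitoring/nodes/%s/last_seen_at' % NS.node_context.node_id,
        tendrl_now().isoformat(),
        ttl=ttl_seconds)
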
Code example #14
File: __init__.py Project: Tendrl/gluster_bridge
def sync_volumes(
    volumes, index,
    vol_options,
    sync_ttl,
    cluster_short_name,
    devicetree,
    lvs
):
    NS.node_context = NS.tendrl.objects.NodeContext().load()
    tag_list = NS.node_context.tags
    # Raise alerts for volume state change.
    cluster_provisioner = "provisioner/%s" % NS.tendrl_context.integration_id
    if cluster_provisioner in tag_list:
        try:
            _volume = NS.tendrl.objects.GlusterVolume(
                NS.tendrl_context.integration_id,
                vol_id=volumes['volume%s.id' % index]
            ).load()
            if _volume.locked_by and 'job_id' in _volume.locked_by and \
                _volume.current_job.get('status', '') == 'in_progress':
                # There is an active job on the volume; skip the sync
                return
            stored_volume_status = _volume.status
            current_status = volumes['volume%s.status' % index]
            if stored_volume_status not in [None, ""] and \
                current_status != stored_volume_status:
                msg = ("Status of volume: %s in cluster %s "
                       "changed from %s to %s") % (
                           volumes['volume%s.name' % index],
                           cluster_short_name,
                           stored_volume_status,
                           current_status)
                instance = "volume_%s" % volumes[
                    'volume%s.name' % index
                ]
                event_utils.emit_event(
                    "volume_status",
                    current_status,
                    msg,
                    instance,
                    'WARNING' if current_status == 'Stopped'
                    else 'INFO',
                    tags={"entity_type": RESOURCE_TYPE_VOLUME,
                          "volume_name": volumes['volume%s.name' % index]
                          }
                )
        except (KeyError, etcd.EtcdKeyNotFound) as ex:
            if isinstance(ex, KeyError):
                raise ex
            pass

        volume = NS.tendrl.objects.GlusterVolume(
            NS.tendrl_context.integration_id,
            vol_id=volumes['volume%s.id' % index]
        ).load()
        volume.vol_type = "arbiter" \
            if int(volumes['volume%s.arbiter_count' % index]) > 0 \
            else volumes['volume%s.type' % index]
        volume.name = volumes['volume%s.name' % index]
        volume.transport_type = volumes['volume%s.transport_type' % index]
        volume.status = volumes['volume%s.status' % index]
        volume.brick_count = volumes['volume%s.brickcount' % index]
        volume.snap_count = volumes['volume%s.snap_count' % index]
        volume.stripe_count = volumes['volume%s.stripe_count' % index]
        volume.replica_count = volumes['volume%s.replica_count' % index]
        volume.subvol_count = volumes['volume%s.subvol_count' % index]
        volume.arbiter_count = volumes['volume%s.arbiter_count' % index]
        volume.disperse_count = volumes['volume%s.disperse_count' % index]
        volume.redundancy_count = volumes['volume%s.redundancy_count' % index]
        volume.quorum_status = volumes['volume%s.quorum_status' % index]
        volume.snapd_status = volumes[
            'volume%s.snapd_svc.online_status' % index]
        volume.snapd_inited = volumes['volume%s.snapd_svc.inited' % index]
        if NS.tendrl.objects.GlusterVolume(
            NS.tendrl_context.integration_id,
            vol_id=volumes['volume%s.id' % index]
        ).exists():
            existing_vol = NS.tendrl.objects.GlusterVolume(
                NS.tendrl_context.integration_id,
                vol_id=volumes['volume%s.id' % index]
            ).load()
            volume_profiling_old_value = existing_vol.profiling_enabled
        else:
            volume_profiling_old_value = volume.profiling_enabled
        if ('volume%s.profile_enabled' % index) in volumes:
            value = int(volumes['volume%s.profile_enabled' % index])
            if value == 1:
                volume_profiling_new_value = "yes"
            else:
                volume_profiling_new_value = "no"
        else:
            volume_profiling_new_value = None
        volume.profiling_enabled = volume_profiling_new_value
        if volume_profiling_old_value not in [None, ""] and \
            volume_profiling_old_value != volume_profiling_new_value:
            # Raise alert for the same value change
            msg = ("Value of volume profiling for volume: %s "
                   "of cluster %s changed from %s to %s" % (
                       volumes['volume%s.name' % index],
                       cluster_short_name,
                       volume_profiling_old_value,
                       volume_profiling_new_value))
            instance = "volume_%s" % \
                volumes['volume%s.name' % index]
            event_utils.emit_event(
                "volume_profiling_status",
                volume_profiling_new_value,
                msg,
                instance,
                'INFO',
                tags={
                    "entity_type": RESOURCE_TYPE_BRICK,
                    "volume_name": volumes[
                        'volume%s.name' % index
                    ]
                }
            )
        volume.save(ttl=sync_ttl)
        # Save the default values of volume options
        vol_opt_dict = {}
        for opt_count in \
            range(1, int(vol_options['volume%s.options.count' % index])):
            vol_opt_dict[
                vol_options[
                    'volume%s.options.key%s' % (index, opt_count)
                ]
            ] = vol_options[
                'volume%s.options.value%s' % (index, opt_count)
            ]
        volume.options = vol_opt_dict
        volume.save()

    rebal_det = NS.gluster.objects.RebalanceDetails(
        vol_id=volumes['volume%s.id' % index],
        rebal_id=volumes['volume%s.rebalance.id' % index],
        rebal_status=volumes['volume%s.rebalance.status' % index],
        rebal_failures=volumes['volume%s.rebalance.failures' % index],
        rebal_skipped=volumes['volume%s.rebalance.skipped' % index],
        rebal_lookedup=volumes['volume%s.rebalance.lookedup' % index],
        rebal_files=volumes['volume%s.rebalance.files' % index],
        rebal_data=volumes['volume%s.rebalance.data' % index],
        time_left=volumes.get('volume%s.rebalance.time_left' % index),
    )
    rebal_det.save(ttl=sync_ttl)
    georep_details.save_georep_details(volumes, index)

    b_index = 1
    # ipv4 address of current node
    try:
        network_ip = []
        networks = NS.tendrl.objects.NodeNetwork().load_all()
        for network in networks:
            if network.ipv4:
                network_ip.extend(network.ipv4)
    except etcd.EtcdKeyNotFound as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": "Could not find "
                    "any ipv4 networks for node"
                    " %s" % NS.node_context.node_id,
                    "exception": ex
                }
            )
        )
    while True:
        try:
            # Update brick node wise
            hostname = volumes[
                'volume%s.brick%s.hostname' % (index, b_index)
            ]
            ip = socket.gethostbyname(hostname)
            try:
                node_id = etcd_utils.read("indexes/ip/%s" % ip).value
                fqdn = NS.tendrl.objects.ClusterNodeContext(
                    node_id=node_id
                ).load().fqdn
                cluster_node_ids = etcd_utils.read(
                    "indexes/tags/tendrl/integration/%s" %
                    NS.tendrl_context.integration_id
                ).value
                cluster_node_ids = json.loads(cluster_node_ids)
                if NS.node_context.fqdn != fqdn or \
                        node_id not in cluster_node_ids:
                    b_index += 1
                    continue
            except(TypeError, etcd.EtcdKeyNotFound):
                b_index += 1
                continue
            sub_vol_size = (int(
                volumes['volume%s.brickcount' % index]
            )) / int(
                volumes['volume%s.subvol_count' % index]
            )
            brick_name = NS.node_context.fqdn
            brick_name += ":"
            brick_name += volumes['volume%s.brick%s' '.path' % (
                index,
                b_index
            )].split(":")[-1].replace("/", "_")

            # Raise alerts if the brick status changes
            try:
                stored_brick = NS.tendrl.objects.GlusterBrick(
                    NS.tendrl_context.integration_id,
                    NS.node_context.fqdn,
                    brick_dir=brick_name.split(":_")[-1]
                ).load()
                current_status = volumes.get(
                    'volume%s.brick%s.status' % (index, b_index)
                )
                if stored_brick.status and \
                    current_status != stored_brick.status:
                    msg = ("Brick:%s in volume:%s has %s"
                           ) % (
                               volumes['volume%s.brick%s' '.path' % (
                                   index,
                                   b_index
                               )],
                               volumes['volume%s.' 'name' % index],
                               current_status)
                    instance = "volume_%s|brick_%s" % (
                        volumes['volume%s.name' % index],
                        volumes['volume%s.brick%s.path' % (
                            index,
                            b_index
                        )]
                    )
                    event_utils.emit_event(
                        "brick_status",
                        current_status,
                        msg,
                        instance,
                        'WARNING' if current_status == 'Stopped'
                        else 'INFO',
                        tags={"entity_type": RESOURCE_TYPE_BRICK,
                              "volume_name": volumes[
                                  'volume%s.' 'name' % index]
                              }
                    )

            except etcd.EtcdKeyNotFound:
                pass

            brk_pth = "clusters/%s/Volumes/%s/Bricks/subvolume%s/%s"

            vol_brick_path = brk_pth % (
                NS.tendrl_context.integration_id,
                volumes['volume%s.id' % index],
                str((b_index - 1) / sub_vol_size),
                brick_name
            )

            etcd_utils.write(vol_brick_path, "")
            brick = NS.tendrl.objects.GlusterBrick(
                NS.tendrl_context.integration_id,
                NS.node_context.fqdn,
                brick_dir=brick_name.split(":_")[-1]
            ).load()
            brick.integration_id = NS.tendrl_context.integration_id
            brick.fqdn = NS.node_context.fqdn
            brick.brick_dir = brick_name.split(":_")[-1]
            brick.name = brick_name
            brick.vol_id = volumes['volume%s.id' % index]
            brick.sequence_number = b_index
            brick.brick_path = volumes[
                'volume%s.brick%s.path' % (index, b_index)
            ]
            brick.hostname = volumes.get(
                'volume%s.brick%s.hostname' % (index, b_index)
            )
            brick.port = volumes.get(
                'volume%s.brick%s.port' % (index, b_index)
            )
            brick.vol_name = volumes['volume%s.name' % index]
            brick.used = True
            brick.node_id = NS.node_context.node_id
            brick.status = volumes.get(
                'volume%s.brick%s.status' % (index, b_index)
            )
            brick.filesystem_type = volumes.get(
                'volume%s.brick%s.filesystem_type' % (index, b_index)
            )
            brick.mount_opts = volumes.get(
                'volume%s.brick%s.mount_options' % (index, b_index)
            )
            brick.utilization = brick_utilization.brick_utilization(
                volumes['volume%s.brick%s.path' % (index, b_index)],
                lvs
            )
            brick.client_count = volumes.get(
                'volume%s.brick%s.client_count' % (index, b_index)
            )
            brick.is_arbiter = volumes.get(
                'volume%s.brick%s.is_arbiter' % (index, b_index)
            )
            brick.save(ttl=sync_ttl)
            # sync brick device details
            brick_device_details.\
                update_brick_device_details(
                    brick_name,
                    volumes[
                        'volume%s.brick%s.path' % (
                            index, b_index)
                    ],
                    devicetree,
                    sync_ttl
                )

            # Sync the brick client details
            c_index = 1
            if volumes.get(
                'volume%s.brick%s.client_count' % (index, b_index)
            ) > 0:
                while True:
                    try:
                        NS.gluster.objects.ClientConnection(
                            brick_name=brick_name,
                            fqdn=NS.node_context.fqdn,
                            brick_dir=brick_name.split(":_")[-1],
                            hostname=volumes[
                                'volume%s.brick%s.client%s.hostname' % (
                                    index, b_index, c_index
                                )
                            ],
                            bytesread=volumes[
                                'volume%s.brick%s.client%s.bytesread' % (
                                    index, b_index, c_index
                                )
                            ],
                            byteswrite=volumes[
                                'volume%s.brick%s.client%s.byteswrite' % (
                                    index, b_index, c_index
                                )
                            ],
                            opversion=volumes[
                                'volume%s.brick%s.client%s.opversion' % (
                                    index, b_index, c_index
                                )
                            ]
                        ).save(ttl=sync_ttl)
                    except KeyError:
                        break
                    c_index += 1
            sync_ttl += 4
            b_index += 1
        except KeyError:
            break
    return b_index
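
One non-obvious detail in the loop above is how a brick is placed under subvolumeN: with Python 2 integer division, consecutive runs of sub_vol_size bricks share a subvolume. A small stand-alone illustration (floor division made explicit):

def subvolume_index(b_index, brick_count, subvol_count):
    # Bricks per subvolume, e.g. 3 for a 6-brick replica-3 volume
    # (brick_count=6, subvol_count=2).
    sub_vol_size = brick_count // subvol_count
    return (b_index - 1) // sub_vol_size


# Bricks 1-3 land in subvolume0, bricks 4-6 in subvolume1.
assert [subvolume_index(i, 6, 2) for i in range(1, 7)] == [0, 0, 0, 1, 1, 1]
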
Code example #15
File: __init__.py Project: BwithPrashant/commons
    def run(self):
        logger.log(
            "info",
            NS.publisher_id,
            {"message": "Deleting cluster details."},
            job_id=self.parameters['job_id'],
            flow_id=self.parameters['flow_id'],
        )
        integration_id = self.parameters['TendrlContext.integration_id']

        etcd_keys_to_delete = []
        etcd_keys_to_delete.append("/clusters/%s/nodes" % integration_id)
        etcd_keys_to_delete.append("/clusters/%s/Bricks" % integration_id)
        etcd_keys_to_delete.append("/clusters/%s/Volumes" % integration_id)
        etcd_keys_to_delete.append("/clusters/%s/GlobalDetails" %
                                   integration_id)
        etcd_keys_to_delete.append("/clusters/%s/TendrlContext" %
                                   integration_id)
        etcd_keys_to_delete.append("/clusters/%s/Utilization" % integration_id)
        etcd_keys_to_delete.append("/clusters/%s/raw_map" % integration_id)
        etcd_keys_to_delete.append("/alerting/clusters/%s" % integration_id)
        nodes = etcd_utils.read("/clusters/%s/nodes" % integration_id)
        node_ids = []
        for node in nodes.leaves:
            node_id = node.key.split("/")[-1]
            node_ids.append(node_id)
            key = "/alerting/nodes/%s" % node_id
            etcd_keys_to_delete.append(key)
            try:
                # delete node alerts from /alerting/alerts
                node_alerts = etcd_utils.read(key)
                for node_alert in node_alerts.leaves:
                    etcd_keys_to_delete.append("/alerting/alerts/%s" %
                                               node_alert.key.split("/")[-1])
            except etcd.EtcdKeyNotFound:
                # No node alerts, continue
                pass

        # Find the alerting/alerts entries to be deleted
        try:
            cluster_alert_ids = etcd_utils.read("/alerting/clusters/%s" %
                                                integration_id)
            for entry in cluster_alert_ids.leaves:
                ca_id = entry.key.split("/")[-1]
                etcd_keys_to_delete.append("/alerting/alerts/%s" % ca_id)
        except etcd.EtcdKeyNotFound:
            # No cluster alerts, continue
            pass

        try:
            index_key = "/indexes/tags/tendrl/integration/%s" % integration_id
            _node_ids = etcd_utils.read(index_key).value
            _node_ids = json.loads(_node_ids)
            for _node_id in _node_ids[:]:
                node_obj = NS.tendrl.objects.NodeContext(
                    node_id=_node_id).load()
                # Remove cluster indexes for down node
                if node_obj.status.lower() == "down":
                    _node_ids.remove(_node_id)
                    # Removing down node details
                    logger.log(
                        "warning",
                        NS.publisher_id,
                        {
                            "message":
                            "Deleting down node %s details" % node_obj.fqdn
                        },
                        job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id'],
                    )
                    etcd_keys_to_delete.append("/nodes/%s" % _node_id)
            etcd_utils.write(index_key, json.dumps(_node_ids))
        except (etcd.EtcdKeyNotFound, ValueError, TypeError, AttributeError,
                IndexError):
            # If the index details are not present we don't need to stop
            # the un-manage flow, because when the node-agent works
            # properly these details are populated again by the node sync
            pass
        # Remove the cluster details
        for key in list(set(etcd_keys_to_delete)):
            try:
                etcd_utils.delete(key, recursive=True)
            except etcd.EtcdKeyNotFound:
                logger.log(
                    "debug",
                    NS.publisher_id,
                    {"message": "%s key not found for deletion" % key},
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id'],
                )
                continue
        # remove short name
        cluster = NS.tendrl.objects.Cluster(
            integration_id=integration_id).load()
        cluster.short_name = ""
        cluster.save()
        return True
Code example #16
    def volume_remove_brick_force(self, event):
        time.sleep(self.sync_interval)
        # The event returns the bricks as a single space-separated string
        bricks = event['message']['bricks'].split(" ")
        try:
            for brick in bricks:
                # find fqdn using ip
                ip = socket.gethostbyname(brick.split(":/")[0])
                node_id = etcd_utils.read("indexes/ip/%s" % ip).value
                fqdn = NS.tendrl.objects.ClusterNodeContext(
                    node_id=node_id).load().fqdn
                brick = fqdn + ":" + brick.split(":")[-1]
                fetched_brick = NS.tendrl.objects.GlusterBrick(
                    NS.tendrl_context.integration_id,
                    fqdn=brick.split(":/")[0],
                    brick_dir=brick.split(":/")[1].replace('/', '_')).load()

                # delete brick
                etcd_utils.delete(
                    "clusters/{0}/Bricks/all/{1}/{2}".format(
                        NS.tendrl_context.integration_id,
                        brick.split(":/")[0],
                        brick.split(":/")[1].replace('/', '_')),
                    recursive=True,
                )

                # delete alert dashboard
                job_id = monitoring_utils.update_dashboard(
                    "%s|%s" % (event['message']['volume'], brick),
                    RESOURCE_TYPE_BRICK, NS.tendrl_context.integration_id,
                    "delete")
                logger.log(
                    "debug", NS.publisher_id,
                    {"message": "Update dashboard job %s "
                     "created" % job_id})

                # delete brick details from graphite
                job_id = monitoring_utils.delete_resource_from_graphite(
                    "%s|%s" % (event['message']['volume'], brick),
                    RESOURCE_TYPE_BRICK, NS.tendrl_context.integration_id,
                    "delete")
                logger.log(
                    "debug", NS.publisher_id, {
                        "message":
                        "Delete resource from graphite job %s "
                        "created" % job_id
                    })

            volume_brick_path = "clusters/{0}/Volumes/{1}/"\
                                "Bricks".format(
                                    NS.tendrl_context.integration_id,
                                    fetched_brick.vol_id,
                                )

            # Remove all the brick information under the volume as the
            # subvolume might have changed; let the next sync handle
            # the update of the brick info
            etcd_utils.delete(volume_brick_path, recursive=True)

            _trigger_sync_key = 'clusters/%s/_sync_now' % \
                NS.tendrl_context.integration_id
            etcd_utils.write(_trigger_sync_key, 'true')
            etcd_utils.refresh(_trigger_sync_key, self.sync_interval)
        except etcd.EtcdKeyNotFound:
            logger.log("debug", NS.publisher_id,
                       {"message": "Unable to delete bricks %s" % bricks})
Code example #17
def sync_volumes(
    volumes, index,
    vol_options,
    sync_ttl,
    cluster_short_name,
    devicetree
):
    NS.node_context = NS.tendrl.objects.NodeContext().load()
    tag_list = NS.node_context.tags
    # Raise alerts for volume state change.
    cluster_provisioner = "provisioner/%s" % NS.tendrl_context.integration_id
    if cluster_provisioner in tag_list:
        try:
            _volume = NS.tendrl.objects.GlusterVolume(
                NS.tendrl_context.integration_id,
                vol_id=volumes['volume%s.id' % index]
            ).load()
            if _volume.locked_by and 'job_id' in _volume.locked_by and \
                _volume.current_job.get('status', '') == 'in_progress':
                # There is an active job on the volume; skip the sync
                return
            stored_volume_status = _volume.status
            current_status = volumes['volume%s.status' % index]
            if stored_volume_status not in [None, ""] and \
                current_status != stored_volume_status:
                msg = ("Status of volume: %s in cluster %s "
                       "changed from %s to %s") % (
                           volumes['volume%s.name' % index],
                           cluster_short_name,
                           stored_volume_status,
                           current_status)
                instance = "volume_%s" % volumes[
                    'volume%s.name' % index
                ]
                event_utils.emit_event(
                    "volume_status",
                    current_status,
                    msg,
                    instance,
                    'WARNING' if current_status == 'Stopped'
                    else 'INFO',
                    tags={"entity_type": RESOURCE_TYPE_VOLUME,
                          "volume_name": volumes['volume%s.name' % index]
                          }
                )
        except (KeyError, etcd.EtcdKeyNotFound) as ex:
            if isinstance(ex, KeyError):
                raise ex
            pass

        volume = NS.tendrl.objects.GlusterVolume(
            NS.tendrl_context.integration_id,
            vol_id=volumes['volume%s.id' % index]
        ).load()
        volume.vol_type = "arbiter" \
            if int(volumes['volume%s.arbiter_count' % index]) > 0 \
            else volumes['volume%s.type' % index]
        volume.name = volumes['volume%s.name' % index]
        volume.transport_type = volumes['volume%s.transport_type' % index]
        volume.status = volumes['volume%s.status' % index]
        volume.brick_count = volumes['volume%s.brickcount' % index]
        volume.snap_count = volumes['volume%s.snap_count' % index]
        volume.stripe_count = volumes['volume%s.stripe_count' % index]
        volume.replica_count = volumes['volume%s.replica_count' % index]
        volume.subvol_count = volumes['volume%s.subvol_count' % index]
        volume.arbiter_count = volumes['volume%s.arbiter_count' % index]
        volume.disperse_count = volumes['volume%s.disperse_count' % index]
        volume.redundancy_count = volumes['volume%s.redundancy_count' % index]
        volume.quorum_status = volumes['volume%s.quorum_status' % index]
        volume.snapd_status = volumes[
            'volume%s.snapd_svc.online_status' % index]
        volume.snapd_inited = volumes['volume%s.snapd_svc.inited' % index]
        if NS.tendrl.objects.GlusterVolume(
            NS.tendrl_context.integration_id,
            vol_id=volumes['volume%s.id' % index]
        ).exists():
            existing_vol = NS.tendrl.objects.GlusterVolume(
                NS.tendrl_context.integration_id,
                vol_id=volumes['volume%s.id' % index]
            ).load()
            volume_profiling_old_value = existing_vol.profiling_enabled
        else:
            volume_profiling_old_value = volume.profiling_enabled
        if ('volume%s.profile_enabled' % index) in volumes:
            value = int(volumes['volume%s.profile_enabled' % index])
            if value == 1:
                volume_profiling_new_value = "yes"
            else:
                volume_profiling_new_value = "no"
        else:
            volume_profiling_new_value = None
        volume.profiling_enabled = volume_profiling_new_value
        if volume_profiling_old_value not in [None, ""] and \
            volume_profiling_old_value != volume_profiling_new_value:
            # Raise alert for the same value change
            msg = ("Value of volume profiling for volume: %s "
                   "of cluster %s changed from %s to %s" % (
                       volumes['volume%s.name' % index],
                       cluster_short_name,
                       volume_profiling_old_value,
                       volume_profiling_new_value))
            instance = "volume_%s" % \
                volumes['volume%s.name' % index]
            event_utils.emit_event(
                "volume_profiling_status",
                volume_profiling_new_value,
                msg,
                instance,
                'INFO',
                tags={
                    "entity_type": RESOURCE_TYPE_BRICK,
                    "volume_name": volumes[
                        'volume%s.name' % index
                    ]
                }
            )
        volume.save(ttl=sync_ttl)
        # Save the default values of volume options
        vol_opt_dict = {}
        for opt_count in \
            range(1, int(vol_options['volume%s.options.count' % index])):
            vol_opt_dict[
                vol_options[
                    'volume%s.options.key%s' % (index, opt_count)
                ]
            ] = vol_options[
                'volume%s.options.value%s' % (index, opt_count)
            ]
        volume.options = vol_opt_dict
        volume.save()

    rebal_det = NS.gluster.objects.RebalanceDetails(
        vol_id=volumes['volume%s.id' % index],
        rebal_id=volumes['volume%s.rebalance.id' % index],
        rebal_status=volumes['volume%s.rebalance.status' % index],
        rebal_failures=volumes['volume%s.rebalance.failures' % index],
        rebal_skipped=volumes['volume%s.rebalance.skipped' % index],
        rebal_lookedup=volumes['volume%s.rebalance.lookedup' % index],
        rebal_files=volumes['volume%s.rebalance.files' % index],
        rebal_data=volumes['volume%s.rebalance.data' % index],
        time_left=volumes.get('volume%s.rebalance.time_left' % index),
    )
    rebal_det.save(ttl=sync_ttl)
    georep_details.save_georep_details(volumes, index)

    b_index = 1
    # ipv4 address of current node
    try:
        network_ip = []
        networks = NS.tendrl.objects.NodeNetwork().load_all()
        for network in networks:
            if network.ipv4:
                network_ip.extend(network.ipv4)
    except etcd.EtcdKeyNotFound as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": "Could not find "
                    "any ipv4 networks for node"
                    " %s" % NS.node_context.node_id,
                    "exception": ex
                }
            )
        )
    while True:
        try:
            # Update brick node wise
            hostname = volumes[
                'volume%s.brick%s.hostname' % (index, b_index)
            ]
            ip = socket.gethostbyname(hostname)
            try:
                node_id = etcd_utils.read("indexes/ip/%s" % ip).value
                fqdn = NS.tendrl.objects.ClusterNodeContext(
                    node_id=node_id
                ).load().fqdn
                cluster_node_ids = etcd_utils.read(
                    "indexes/tags/tendrl/integration/%s" %
                    NS.tendrl_context.integration_id
                ).value
                cluster_node_ids = json.loads(cluster_node_ids)
                if NS.node_context.fqdn != fqdn or \
                        node_id not in cluster_node_ids:
                    b_index += 1
                    continue
            except(TypeError, etcd.EtcdKeyNotFound):
                b_index += 1
                continue
            sub_vol_size = (int(
                volumes['volume%s.brickcount' % index]
            )) / int(
                volumes['volume%s.subvol_count' % index]
            )
            brick_name = NS.node_context.fqdn
            brick_name += ":"
            brick_name += volumes['volume%s.brick%s' '.path' % (
                index,
                b_index
            )].split(":")[-1].replace("/", "_")

            # Raise alerts if the brick status changes
            try:
                stored_brick = NS.tendrl.objects.GlusterBrick(
                    NS.tendrl_context.integration_id,
                    NS.node_context.fqdn,
                    brick_dir=brick_name.split(":_")[-1]
                ).load()
                current_status = volumes.get(
                    'volume%s.brick%s.status' % (index, b_index)
                )
                if stored_brick.status and \
                    current_status != stored_brick.status:
                    msg = ("Brick:%s in volume:%s has %s"
                           ) % (
                               volumes['volume%s.brick%s' '.path' % (
                                   index,
                                   b_index
                               )],
                               volumes['volume%s.' 'name' % index],
                               current_status)
                    instance = "volume_%s|brick_%s" % (
                        volumes['volume%s.name' % index],
                        volumes['volume%s.brick%s.path' % (
                            index,
                            b_index
                        )]
                    )
                    event_utils.emit_event(
                        "brick_status",
                        current_status,
                        msg,
                        instance,
                        'WARNING' if current_status == 'Stopped'
                        else 'INFO',
                        tags={"entity_type": RESOURCE_TYPE_BRICK,
                              "volume_name": volumes[
                                  'volume%s.' 'name' % index]
                              }
                    )

            except etcd.EtcdKeyNotFound:
                pass

            brk_pth = "clusters/%s/Volumes/%s/Bricks/subvolume%s/%s"

            vol_brick_path = brk_pth % (
                NS.tendrl_context.integration_id,
                volumes['volume%s.id' % index],
                str((b_index - 1) / sub_vol_size),
                brick_name
            )

            etcd_utils.write(vol_brick_path, "")
            brick = NS.tendrl.objects.GlusterBrick(
                NS.tendrl_context.integration_id,
                NS.node_context.fqdn,
                brick_dir=brick_name.split(":_")[-1]
            ).load()
            brick.integration_id = NS.tendrl_context.integration_id
            brick.fqdn = NS.node_context.fqdn
            brick.brick_dir = brick_name.split(":_")[-1]
            brick.name = brick_name
            brick.vol_id = volumes['volume%s.id' % index]
            brick.sequence_number = b_index
            brick.brick_path = volumes[
                'volume%s.brick%s.path' % (index, b_index)
            ]
            brick.hostname = volumes.get(
                'volume%s.brick%s.hostname' % (index, b_index)
            )
            brick.port = volumes.get(
                'volume%s.brick%s.port' % (index, b_index)
            )
            brick.vol_name = volumes['volume%s.name' % index]
            brick.used = True
            brick.node_id = NS.node_context.node_id
            brick.status = volumes.get(
                'volume%s.brick%s.status' % (index, b_index)
            )
            brick.filesystem_type = volumes.get(
                'volume%s.brick%s.filesystem_type' % (index, b_index)
            )
            brick.mount_opts = volumes.get(
                'volume%s.brick%s.mount_options' % (index, b_index)
            )
            brick.utilization = brick_utilization.brick_utilization(
                volumes['volume%s.brick%s.path' % (index, b_index)]
            )
            brick.client_count = volumes.get(
                'volume%s.brick%s.client_count' % (index, b_index)
            )
            brick.is_arbiter = volumes.get(
                'volume%s.brick%s.is_arbiter' % (index, b_index)
            )
            brick.save(ttl=sync_ttl)
            # sync brick device details
            brick_device_details.\
                update_brick_device_details(
                    brick_name,
                    volumes[
                        'volume%s.brick%s.path' % (
                            index, b_index)
                    ],
                    devicetree,
                    sync_ttl
                )

            # Sync the brick client details
            c_index = 1
            if volumes.get(
                'volume%s.brick%s.client_count' % (index, b_index)
            ) > 0:
                while True:
                    try:
                        NS.gluster.objects.ClientConnection(
                            brick_name=brick_name,
                            fqdn=NS.node_context.fqdn,
                            brick_dir=brick_name.split(":_")[-1],
                            hostname=volumes[
                                'volume%s.brick%s.client%s.hostname' % (
                                    index, b_index, c_index
                                )
                            ],
                            bytesread=volumes[
                                'volume%s.brick%s.client%s.bytesread' % (
                                    index, b_index, c_index
                                )
                            ],
                            byteswrite=volumes[
                                'volume%s.brick%s.client%s.byteswrite' % (
                                    index, b_index, c_index
                                )
                            ],
                            opversion=volumes[
                                'volume%s.brick%s.client%s.opversion' % (
                                    index, b_index, c_index
                                )
                            ]
                        ).save(ttl=sync_ttl)
                    except KeyError:
                        break
                    c_index += 1
            sync_ttl += 4
            b_index += 1
        except KeyError:
            break
    return b_index
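
Note: the loop above walks gluster get-state style output, a flat dict keyed like volume<N>.brick<M>.path, and relies on a KeyError to detect the end of the brick list. A minimal standalone sketch of that iteration pattern (the sample dict below is made up for illustration):

def iter_bricks(volumes, index):
    # Yield (b_index, brick_path) pairs for volume number `index`,
    # stopping at the first missing key.
    b_index = 1
    while True:
        try:
            path = volumes['volume%s.brick%s.path' % (index, b_index)]
        except KeyError:
            break
        yield b_index, path
        b_index += 1


sample = {
    'volume1.name': 'vol1',
    'volume1.brick1.path': 'node1.example.com:/bricks/b1',
    'volume1.brick2.path': 'node2.example.com:/bricks/b1',
}
print(list(iter_bricks(sample, 1)))
# [(1, 'node1.example.com:/bricks/b1'), (2, 'node2.example.com:/bricks/b1')]
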
Code Example #18
0
File: __init__.py Project: BwithPrashant/commons
def process_job(jid):
    job = NS.tendrl.objects.Job(job_id=jid).load()
    if job.status in [None, ""]:
        job.status = "new"
        job.save()

    NS.node_context = NS.node_context.load()
    # Skip jobs that are already "finished", "processing", or "failed"
    try:
        if job.status in ["finished", "processing", "failed"]:
            return
    except etcd.EtcdKeyNotFound:
        pass

    try:
        _timeout = None
        _timeout = job.timeout
        if _timeout:
            _timeout = _timeout.lower()
    except etcd.EtcdKeyNotFound:
        pass

    # The tendrl-node-agent tagged as tendrl/monitor ensures that
    # "new" parent jobs older than 10 minutes are timed out and
    # marked as "failed"
    if "tendrl/monitor" in NS.node_context.tags and _timeout == "yes" and \
        job.status == "new" and job.payload.get('parent') is None:
        _valid_until = job.valid_until

        if _valid_until:
            _now_epoch = (time_utils.now() - datetime.datetime(
                1970, 1, 1).replace(tzinfo=utc)).total_seconds()
            if int(_now_epoch) >= int(_valid_until):
                # Job has had "new" status for over 10 minutes;
                # mark status as "failed" and set Job.errors =
                # "Timed out"
                _msg = str("Timed-out (>10min as 'new')")
                job.errors = _msg
                job.status = "failed"
                job.save()
                integration_id = NS.tendrl_context.integration_id
                alert_utils.alert_job_status(
                    "failed",
                    "Job timed out (job_id: %s)" % jid,
                    integration_id=integration_id
                    or job.payload['parameters'].get(
                        'TendrlContext.integration_id'),
                    cluster_name=NS.tendrl_context.cluster_name
                    or job.payload['parameters'].get(
                        'TendrlContext.cluster_name'))
                return
        else:
            _now_plus_10 = time_utils.now() + datetime.timedelta(minutes=10)
            _epoch_start = datetime.datetime(1970, 1, 1).replace(tzinfo=utc)

            _now_plus_10_epoch = (_now_plus_10 - _epoch_start).total_seconds()
            job = NS.tendrl.objects.Job(job_id=jid).load()
            if job.status == "new":
                # To avoid the server and storage nodes saving at the same time
                job.valid_until = int(_now_plus_10_epoch)
                job.save()

    job = NS.tendrl.objects.Job(job_id=jid).load()
    if job.payload["type"] == NS.type and \
            job.status == "new":
        # Job routing
        # Flows created by tendrl-api use 'tags' from flow
        # definition to target jobs
        _tag_match = False
        if job.payload.get("tags", []):
            for flow_tag in job.payload['tags']:
                if flow_tag in NS.node_context.tags:
                    _tag_match = True

        if not _tag_match:
            _job_tags = ", ".join(job.payload.get("tags", []))
            _msg = "Node (%s)(type: %s)(tags: %s) will not " \
                   "process job-%s (tags: %s)" % \
                   (NS.node_context.node_id, NS.type,
                    NS.node_context.tags, jid,
                    _job_tags)
            logger.log("debug", NS.publisher_id, {"message": _msg})
            return

        try:
            try:
                job_status_key = "/queue/%s/status" % job.job_id
                etcd_utils.write(job_status_key, "processing", prevValue="new")
            except etcd.EtcdKeyNotFound:
                # if the status watchable attribute is not present,
                # it will be created when the job is saved
                pass
            lock_info = dict(node_id=NS.node_context.node_id,
                             fqdn=NS.node_context.fqdn,
                             type=NS.type)
            job = NS.tendrl.objects.Job(job_id=jid).load()
            job.locked_by = lock_info
            job.status = "processing"
            job.save(ttl=DEFAULT_JOB_TTL)
        except etcd.EtcdCompareFailed:
            # job is already being processed by some tendrl
            # agent
            return

        the_flow = None
        try:
            current_ns, flow_name, obj_name = \
                _extract_fqdn(job.payload['run'])

            if obj_name:
                runnable_flow = current_ns.ns.get_obj_flow(obj_name, flow_name)
            else:
                runnable_flow = current_ns.ns.get_flow(flow_name)
            time.sleep(2)
            job = NS.tendrl.objects.Job(job_id=jid).load()
            lock_info = dict(node_id=NS.node_context.node_id,
                             fqdn=NS.node_context.fqdn,
                             type=NS.type)
            if job.locked_by != lock_info:
                return

            the_flow = runnable_flow(parameters=job.payload['parameters'],
                                     job_id=job.job_id)
            # Tendrl server does not have fqdn in node_context
            logger.log("info",
                       NS.publisher_id, {
                           "message":
                           "Starting %s Job: %s on %s" %
                           (job.payload['run'].split('.')[-1], job.job_id,
                            NS.node_context.fqdn or "server")
                       },
                       job_id=job.job_id,
                       flow_id=the_flow.parameters['flow_id'])

            logger.log("info",
                       NS.publisher_id, {
                           "message":
                           "Running %s job: %s on %s" %
                           (job.payload['run'].split('.')[-1], job.job_id,
                            NS.node_context.fqdn or "server")
                       },
                       job_id=job.job_id,
                       flow_id=the_flow.parameters['flow_id'])

            the_flow.run()

            try:
                job = NS.tendrl.objects.Job(job_id=jid).load()
                job.status = "finished"
                job.save()
            except etcd.EtcdCompareFailed:
                # This should not happen!
                _msg = "Cannot mark job as 'finished', " \
                       "current job status invalid"
                raise FlowExecutionFailedError(_msg)

            logger.log(
                "info",
                NS.publisher_id,
                {
                    "message":
                    "Job (%s) for %s finished. " %
                    (job.job_id, job.payload['run'].split('.')[-1])
                },
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id'],
            )
            if job.payload.get('parent') is None:
                alert_utils.alert_job_status(
                    "finished",
                    "%s (job ID: %s) completed successfully " %
                    (job.payload['run'].split('.')[-1], job.job_id),
                    integration_id=NS.tendrl_context.integration_id
                    or job.payload['parameters'].get(
                        'TendrlContext.integration_id'),
                    cluster_name=NS.tendrl_context.cluster_name
                    or job.payload['parameters'].get(
                        'TendrlContext.cluster_name'))
        except (FlowExecutionFailedError, AtomExecutionFailedError,
                Exception) as e:
            _trace = str(traceback.format_exc(e))
            _msg = "Failure in Job %s Flow %s with error:" % \
                   (job.job_id, job.payload['run'])
            Event(
                ExceptionMessage(priority="error",
                                 publisher=NS.publisher_id,
                                 payload={
                                     "message": _msg + _trace,
                                     "exception": e
                                 }))
            if the_flow:
                logger.log("error",
                           NS.publisher_id, {"message": _msg + "\n" + _trace},
                           job_id=job.job_id,
                           flow_id=the_flow.parameters['flow_id'])
            else:
                logger.log("error", NS.publisher_id,
                           {"message": _msg + "\n" + _trace})

            try:
                job = NS.tendrl.objects.Job(job_id=jid).load()
                job.status = "failed"
                job.save()
            except etcd.EtcdCompareFailed:
                # This should not happen!
                _msg = "Cannot mark job as 'failed', current" \
                       "job status invalid"
                raise FlowExecutionFailedError(_msg)
            else:
                job = NS.tendrl.objects.Job(job_id=jid).load()
                job.errors = _trace
                if job.payload.get('parent') is None:
                    alert_utils.alert_job_status(
                        "failed",
                        "Job failed (job_id: %s)" % job.job_id,
                        integration_id=NS.tendrl_context.integration_id
                        or job.payload['parameters'].get(
                            'TendrlContext.integration_id'),
                        cluster_name=NS.tendrl_context.cluster_name
                        or job.payload['parameters'].get(
                            'TendrlContext.cluster_name'))
                job.save()
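
Note: the timeout handling in process_job stores valid_until as whole seconds since the Unix epoch (set to now + 10 minutes the first time a "new" job is seen) and compares it against the current epoch time on later passes. A small self-contained sketch of that bookkeeping, assuming utc here is pytz.utc as the snippets appear to use:

import datetime

from pytz import utc

EPOCH = datetime.datetime(1970, 1, 1).replace(tzinfo=utc)


def epoch_seconds(dt):
    # Whole seconds since the Unix epoch for a tz-aware datetime.
    return int((dt - EPOCH).total_seconds())


now = datetime.datetime.now(utc)
valid_until = epoch_seconds(now + datetime.timedelta(minutes=10))

# ...on a later pass:
if epoch_seconds(datetime.datetime.now(utc)) >= valid_until:
    print("job timed out (>10 min as 'new')")
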
Code Example #19
0
File: __init__.py Project: Tendrl/bridge_common
def process_job(jid):
    job = NS.tendrl.objects.Job(job_id=jid).load()
    if job.status in [None, ""]:
        job.status = "new"
        job.save()

    NS.node_context = NS.node_context.load()
    # Skip jobs that are already "finished", "processing", or "failed"
    try:
        if job.status in ["finished",
                          "processing",
                          "failed"]:
            return
    except etcd.EtcdKeyNotFound:
        pass

    try:
        _timeout = None
        _timeout = job.timeout
        if _timeout:
            _timeout = _timeout.lower()
    except etcd.EtcdKeyNotFound:
        pass

    # The tendrl-node-agent tagged as tendrl/monitor ensures that
    # "new" jobs older than 10 minutes are timed out and marked as
    # "failed" (the parent job of these jobs is also marked as
    # "failed")
    if "tendrl/monitor" in NS.node_context.tags and \
        _timeout == "yes" and job.status == "new":
        _valid_until = job.valid_until

        if _valid_until:
            _now_epoch = (time_utils.now() -
                          datetime.datetime(1970, 1,
                                            1).replace(
                              tzinfo=utc)).total_seconds()
            if int(_now_epoch) >= int(_valid_until):
                # Job has had "new" status for over 10 minutes;
                # mark status as "failed" and set Job.errors =
                # "Timed out"
                try:
                    job = job.load()
                    if job.status == "new":
                        job.status = "failed"
                        job.save()
                except etcd.EtcdCompareFailed:
                    pass
                else:
                    job = NS.tendrl.objects.Job(job_id=jid).load()
                    if job.status == "new":
                        _msg = str("Timed-out (>10min as 'new')")
                        job.errors = _msg
                        job.save()
                        if job.payload.get('parent') is None:
                            integration_id = NS.tendrl_context.integration_id
                            alert_utils.alert_job_status(
                                "failed",
                                "Job timed out (job_id: %s)" % jid,
                                integration_id=integration_id or
                                job.payload['parameters'].get(
                                    'TendrlContext.integration_id'
                                ),
                                cluster_name=NS.tendrl_context.cluster_name or
                                job.payload['parameters'].get(
                                    'TendrlContext.cluster_name'
                                )
                            )
                    return
        else:
            _now_plus_10 = time_utils.now() + datetime.timedelta(minutes=10)
            _epoch_start = datetime.datetime(1970, 1, 1).replace(tzinfo=utc)

            _now_plus_10_epoch = (_now_plus_10 -
                                  _epoch_start).total_seconds()
            time.sleep(7)
            job = job.load()
            if job.status == "new":
                # To avoid the server and storage nodes saving at the same time
                job.valid_until = int(_now_plus_10_epoch)
                job.save()

    job = NS.tendrl.objects.Job(job_id=jid).load()
    if job.payload["type"] == NS.type and \
            job.status == "new":
        # Job routing
        # Flows created by tendrl-api use 'tags' from flow
        # definition to target jobs
        _tag_match = False
        if job.payload.get("tags", []):
            for flow_tag in job.payload['tags']:
                if flow_tag in NS.node_context.tags:
                    _tag_match = True

        if not _tag_match:
            _job_tags = ", ".join(job.payload.get("tags", []))
            _msg = "Node (%s)(type: %s)(tags: %s) will not " \
                   "process job-%s (tags: %s)" % \
                   (NS.node_context.node_id, NS.type,
                    NS.node_context.tags, jid,
                    _job_tags)
            logger.log(
                "debug",
                NS.publisher_id,
                {"message": _msg}
            )
            return

        try:
            try:
                job_status_key = "/queue/%s/status" % job.job_id
                etcd_utils.write(job_status_key,
                                 "processing",
                                 prevValue="new")
            except etcd.EtcdKeyNotFound:
                # if the status watchable attribute is not present,
                # it will be created when the job is saved
                pass
            lock_info = dict(node_id=NS.node_context.node_id,
                             fqdn=NS.node_context.fqdn,
                             type=NS.type)
            job = job.load()
            job.locked_by = lock_info
            job.status = "processing"
            job.save(ttl=DEFAULT_JOB_TTL)
        except etcd.EtcdCompareFailed:
            # job is already being processed by some tendrl
            # agent
            return

        the_flow = None
        try:
            current_ns, flow_name, obj_name = \
                _extract_fqdn(job.payload['run'])

            if obj_name:
                runnable_flow = current_ns.ns.get_obj_flow(
                    obj_name, flow_name)
            else:
                runnable_flow = current_ns.ns.get_flow(flow_name)

            job = job.load()
            lock_info = dict(node_id=NS.node_context.node_id,
                             fqdn=NS.node_context.fqdn,
                             type=NS.type)
            if job.locked_by != lock_info:
                return

            the_flow = runnable_flow(parameters=job.payload[
                'parameters'], job_id=job.job_id)
            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Starting Job %s" %
                            job.job_id},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id']
            )

            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Running %s" %
                            job.payload['run'].split('.')[-1]},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id']
            )

            the_flow.run()

            try:
                job = job.load()
                job.status = "finished"
                job.save()
            except etcd.EtcdCompareFailed:
                # This should not happen!
                _msg = "Cannot mark job as 'finished', " \
                       "current job status invalid"
                raise FlowExecutionFailedError(_msg)

            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Job (%s) for %s finished. "
                            % (
                                job.job_id,
                                job.payload['run'].split('.')[-1])},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id'],
            )
            if job.payload.get('parent') is None:
                alert_utils.alert_job_status(
                    "finished",
                    "%s (job ID: %s) completed successfully " % (
                        job.payload['run'].split('.')[-1],
                        job.job_id),
                    integration_id=NS.tendrl_context.integration_id or
                    job.payload['parameters'].get(
                        'TendrlContext.integration_id'
                    ),
                    cluster_name=NS.tendrl_context.cluster_name or
                    job.payload['parameters'].get(
                        'TendrlContext.cluster_name'
                    )
                )
        except (FlowExecutionFailedError,
                AtomExecutionFailedError,
                Exception) as e:
            _trace = str(traceback.format_exc(e))
            _msg = "Failure in Job %s Flow %s with error:" % \
                   (job.job_id, job.payload['run'])
            Event(
                ExceptionMessage(
                    priority="error",
                    publisher=NS.publisher_id,
                    payload={"message": _msg + _trace,
                             "exception": e
                             }
                )
            )
            if the_flow:
                logger.log(
                    "error",
                    NS.publisher_id,
                    {"message": _msg + "\n" + _trace},
                    job_id=job.job_id,
                    flow_id=the_flow.parameters['flow_id']
                )
            else:
                logger.log(
                    "error",
                    NS.publisher_id,
                    {"message": _msg + "\n" + _trace}
                )

            try:
                job = job.load()
                job.status = "failed"
                job.save()
            except etcd.EtcdCompareFailed:
                # This should not happen!
                _msg = "Cannot mark job as 'failed', current" \
                       "job status invalid"
                raise FlowExecutionFailedError(_msg)
            else:
                job = job.load()
                job.errors = _trace
                if job.payload.get('parent') is None:
                    alert_utils.alert_job_status(
                        "failed",
                        "Job failed (job_id: %s)" % job.job_id,
                        integration_id=NS.tendrl_context.integration_id or
                        job.payload['parameters'].get(
                            'TendrlContext.integration_id'
                        ),
                        cluster_name=NS.tendrl_context.cluster_name or
                        job.payload['parameters'].get(
                            'TendrlContext.cluster_name'
                        )
                    )
                job.save()
Code Example #20
0
def process_job(job):
    jid = job.key.split('/')[-1]
    job_status_key = "/queue/%s/status" % jid
    job_lock_key = "/queue/%s/locked_by" % jid
    NS.node_context = NS.node_context.load()
    # Check job not already locked by some agent
    try:
        _locked_by = etcd_utils.read(job_lock_key).value
        if _locked_by:
            return
    except etcd.EtcdKeyNotFound:
        pass

    # Skip jobs that are already "finished" or "processing"
    try:
        _status = etcd_utils.read(job_status_key).value
        if _status in ["finished", "processing"]:
            return
    except etcd.EtcdKeyNotFound:
        pass

    try:
        _job_timeout_key = "/queue/%s/timeout" % jid
        _timeout = None
        _timeout = etcd_utils.read(_job_timeout_key).value
        if _timeout:
            _timeout = _timeout.lower()
    except etcd.EtcdKeyNotFound:
        pass

    # The tendrl-node-agent tagged as tendrl/monitor ensures that
    # "new" jobs older than 10 minutes are timed out and marked as
    # "failed" (the parent job of these jobs is also marked as
    # "failed")
    if "tendrl/monitor" in NS.node_context.tags and \
        _timeout == "yes":
        _job_valid_until_key = "/queue/%s/valid_until" % jid
        _valid_until = None
        try:
            _valid_until = etcd_utils.read(
                _job_valid_until_key).value
        except etcd.EtcdKeyNotFound:
            pass

        if _valid_until:
            _now_epoch = (time_utils.now() -
                          datetime.datetime(1970, 1,
                                            1).replace(
                              tzinfo=utc)).total_seconds()
            if int(_now_epoch) >= int(_valid_until):
                # Job has had "new" status for over 10 minutes;
                # mark status as "failed" and set Job.errors =
                # "Timed out"
                try:
                    etcd_utils.write(job_status_key,
                                     "failed",
                                     prevValue="new")
                except etcd.EtcdCompareFailed:
                    pass
                else:
                    job = NS.tendrl.objects.Job(job_id=jid).load()
                    _msg = str("Timed-out (>10min as 'new')")
                    job.errors = _msg
                    job.save()
                    if job.payload.get('parent') is None:
                        alert_utils.alert_job_status(
                            "failed",
                            "Job timed out (job_id: %s)" % jid,
                            integration_id=NS.tendrl_context.integration_id or
                            job.payload['parameters'].get(
                                'TendrlContext.integration_id'
                            ),
                            cluster_name=NS.tendrl_context.cluster_name or
                            job.payload['parameters'].get(
                                'TendrlContext.cluster_name'
                            )
                        )
                    return
        else:
            _now_plus_10 = time_utils.now() + datetime.timedelta(minutes=10)
            _epoch_start = datetime.datetime(1970, 1, 1).replace(tzinfo=utc)

            # noinspection PyTypeChecker
            _now_plus_10_epoch = (_now_plus_10 -
                                  _epoch_start).total_seconds()
            etcd_utils.write(_job_valid_until_key,
                             int(_now_plus_10_epoch))

    job = NS.tendrl.objects.Job(job_id=jid).load()
    if job.payload["type"] == NS.type and \
            job.status == "new":
        # Job routing
        # Flows created by tendrl-api use 'tags' from flow
        # definition to target jobs
        _tag_match = False
        if job.payload.get("tags", []):
            for flow_tag in job.payload['tags']:
                if flow_tag in NS.node_context.tags:
                    _tag_match = True

        if not _tag_match:
            _job_tags = ", ".join(job.payload.get("tags", []))
            _msg = "Node (%s)(type: %s)(tags: %s) will not " \
                   "process job-%s (tags: %s)" % \
                   (NS.node_context.node_id, NS.type,
                    NS.node_context.tags, jid,
                    _job_tags)
            logger.log(
                "info",
                NS.publisher_id,
                {"message": _msg}
            )
            return

        job_status_key = "/queue/%s/status" % job.job_id
        job_lock_key = "/queue/%s/locked_by" % job.job_id
        try:
            lock_info = dict(node_id=NS.node_context.node_id,
                             fqdn=NS.node_context.fqdn,
                             tags=NS.node_context.tags,
                             type=NS.type)
            etcd_utils.write(job_status_key, "processing",
                             prevValue="new")
            etcd_utils.write(job_lock_key,
                             json.dumps(lock_info))
        except etcd.EtcdCompareFailed:
            # job is already being processed by some tendrl
            # agent
            return

        the_flow = None
        try:
            current_ns, flow_name, obj_name = \
                _extract_fqdn(job.payload['run'])

            if obj_name:
                runnable_flow = current_ns.ns.get_obj_flow(
                    obj_name, flow_name)
            else:
                runnable_flow = current_ns.ns.get_flow(flow_name)

            the_flow = runnable_flow(parameters=job.payload[
                'parameters'], job_id=job.job_id)
            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Processing Job %s" %
                            job.job_id},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id']
            )

            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Running Flow %s" %
                            job.payload['run']},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id']
            )
            the_flow.run()
            try:
                etcd_utils.write(job_status_key,
                                 "finished",
                                 prevValue="processing")
            except etcd.EtcdCompareFailed:
                # This should not happen!
                _msg = "Cannot mark job as 'finished', " \
                       "current job status invalid"
                raise FlowExecutionFailedError(_msg)

            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Job (%s):  Finished "
                            "Flow %s" % (
                                job.job_id,
                                job.payload['run'])},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id'],
            )
            if job.payload.get('parent') is None:
                alert_utils.alert_job_status(
                    "finished",
                    "Job finished successfully (job_id: %s)" % job.job_id,
                    integration_id=NS.tendrl_context.integration_id or
                    job.payload['parameters'].get(
                        'TendrlContext.integration_id'
                    ),
                    cluster_name=NS.tendrl_context.cluster_name or
                    job.payload['parameters'].get(
                        'TendrlContext.cluster_name'
                    )
                )
        except (FlowExecutionFailedError,
                AtomExecutionFailedError,
                Exception) as e:
            _trace = str(traceback.format_exc(e))
            _msg = "Failure in Job %s Flow %s with error:" % \
                   (job.job_id, job.payload['run'])
            Event(
                ExceptionMessage(
                    priority="error",
                    publisher=NS.publisher_id,
                    payload={"message": _msg + _trace,
                             "exception": e
                             }
                )
            )
            if the_flow:
                logger.log(
                    "error",
                    NS.publisher_id,
                    {"message": _msg + "\n" + _trace},
                    job_id=job.job_id,
                    flow_id=the_flow.parameters['flow_id']
                )
            else:
                logger.log(
                    "error",
                    NS.publisher_id,
                    {"message": _msg + "\n" + _trace}
                )

            try:
                etcd_utils.write(job_status_key,
                                 "failed",
                                 prevValue="processing")
            except etcd.EtcdCompareFailed:
                # This should not happen!
                _msg = "Cannot mark job as 'failed', current" \
                       "job status invalid"
                raise FlowExecutionFailedError(_msg)
            else:
                job = job.load()
                job.errors = _trace
                if job.payload.get('parent') is None:
                    alert_utils.alert_job_status(
                        "failed",
                        "Job failed (job_id: %s)" % job.job_id,
                        integration_id=NS.tendrl_context.integration_id or
                        job.payload['parameters'].get(
                            'TendrlContext.integration_id'
                        ),
                        cluster_name=NS.tendrl_context.cluster_name or
                        job.payload['parameters'].get(
                            'TendrlContext.cluster_name'
                        )
                    )
                job.save()
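
Note: all three process_job variants claim a job with an atomic compare-and-swap on the status key, so only one agent wins the job. A hedged sketch of that pattern with python-etcd (the host and port below are placeholders, not taken from the examples):

import json

import etcd

client = etcd.Client(host='127.0.0.1', port=2379)


def try_claim(jid, lock_info):
    # Flip /queue/<jid>/status from "new" to "processing" atomically;
    # EtcdCompareFailed means another agent already claimed the job.
    try:
        client.write("/queue/%s/status" % jid, "processing", prevValue="new")
    except etcd.EtcdCompareFailed:
        return False
    client.write("/queue/%s/locked_by" % jid, json.dumps(lock_info))
    return True
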
Code Example #21
0
File: callback.py Project: Tendrl/gluster_bridge
    def volume_remove_brick_force(self, event):
        time.sleep(self.sync_interval)
        # Event returns bricks list as space separated single string
        bricks = event['message']['bricks'].split(" ")
        try:
            for brick in bricks:
                # find fqdn using ip
                ip = socket.gethostbyname(brick.split(":/")[0])
                node_id = etcd_utils.read("indexes/ip/%s" % ip).value
                fqdn = NS.tendrl.objects.ClusterNodeContext(
                    node_id=node_id
                ).load().fqdn
                brick = fqdn + ":" + brick.split(":")[-1]
                fetched_brick = NS.tendrl.objects.GlusterBrick(
                    NS.tendrl_context.integration_id,
                    fqdn=brick.split(":/")[0],
                    brick_dir=brick.split(":/")[1].replace('/', '_')
                ).load()

                # delete brick
                etcd_utils.delete(
                    "clusters/{0}/Bricks/all/{1}/{2}".format(
                        NS.tendrl_context.integration_id,
                        brick.split(":/")[0],
                        brick.split(":/")[1].replace('/', '_')
                    ),
                    recursive=True,
                )

                # delete alert dashboard
                job_id = monitoring_utils.update_dashboard(
                    "%s|%s" % (event['message']['volume'], brick),
                    RESOURCE_TYPE_BRICK,
                    NS.tendrl_context.integration_id,
                    "delete"
                )
                logger.log(
                    "debug",
                    NS.publisher_id,
                    {
                        "message": "Update dashboard job %s "
                        "created" % job_id
                    }
                )

                # delete brick details from graphite
                job_id = monitoring_utils.delete_resource_from_graphite(
                    "%s|%s" % (event['message']['volume'], brick),
                    RESOURCE_TYPE_BRICK,
                    NS.tendrl_context.integration_id,
                    "delete"
                )
                logger.log(
                    "debug",
                    NS.publisher_id,
                    {
                        "message": "Delete resource from graphite job %s "
                        "created" % job_id
                    }
                )

            volume_brick_path = "clusters/{0}/Volumes/{1}/"\
                                "Bricks".format(
                                    NS.tendrl_context.integration_id,
                                    fetched_brick.vol_id,
                                )

            # remove all the brick information under the volume as the
            # subvolume layout might have changed; let the next sync
            # handle the update of brick info
            etcd_utils.delete(
                volume_brick_path,
                recursive=True
            )

            _trigger_sync_key = 'clusters/%s/_sync_now' % \
                NS.tendrl_context.integration_id
            etcd_utils.write(_trigger_sync_key, 'true')
            etcd_utils.refresh(_trigger_sync_key, self.sync_interval)
        except etcd.EtcdKeyNotFound:
            logger.log(
                "debug",
                NS.publisher_id,
                {
                    "message": "Unable to delete bricks %s" % bricks
                }
            )
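
Note: the callback above relies on a brick-name convention where event payloads carry bricks as "host:/brick/path" and the path part is flattened with '/' replaced by '_' to form the etcd key segment. A minimal sketch of that split:

def split_brick(brick):
    # "host:/brick/path" -> (host, flattened brick dir used in etcd keys)
    host = brick.split(":/")[0]
    brick_dir = brick.split(":/")[1].replace('/', '_')
    return host, brick_dir


print(split_brick("node1.example.com:/bricks/b1"))
# ('node1.example.com', 'bricks_b1')
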
Code Example #22
0
def sync(sync_ttl):
    try:
        NS.node_context = NS.node_context.load()
        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "Running SDS detection"}
        )
        try:
            sds_discovery_manager = sds_manager.SDSDiscoveryManager()
        except ValueError as ex:
            Event(
                ExceptionMessage(
                    priority="debug",
                    publisher=NS.publisher_id,
                    payload={"message": "Failed to init SDSDiscoveryManager.",
                             "exception": ex
                             }
                )
            )
            return

        # Execute the SDS discovery plugins and tag the nodes with data
        for plugin in sds_discovery_manager.get_available_plugins():
            sds_details = plugin.discover_storage_system()
            if sds_details is None:
                break

            if "peers" in sds_details and NS.tendrl_context.integration_id:
                _cnc = NS.tendrl.objects.ClusterNodeContext().load()
                this_peer_uuid = ""
                if _cnc.is_managed != "yes" or not NS.node_context.fqdn:
                    for peer_uuid, data in sds_details.get("peers",
                                                           {}).iteritems():
                        peer = NS.tendrl.objects.GlusterPeer(
                            peer_uuid=peer_uuid,
                            hostname=data['hostname'],
                            connected=data['connected']
                        )
                        peer.save()
                        if data['hostname'] == "localhost":
                            this_peer_uuid = peer_uuid

                    # Figure out the hostname used to probe this peer
                    integration_id_index_key = \
                        "indexes/tags/tendrl/integration/%s" %\
                        NS.tendrl_context.integration_id
                    _node_ids = etcd_utils.read(integration_id_index_key).value
                    _node_ids = json.loads(_node_ids)
                    for _node_id in _node_ids:
                        if _node_id != NS.node_context.node_id:
                            peer = NS.tendrl.objects.GlusterPeer(
                                peer_uuid=this_peer_uuid, node_id=_node_id
                            ).load()
                            if peer.hostname:
                                NS.node_context.pkey = peer.hostname
                                NS.node_context.fqdn = peer.hostname
                                NS.node_context.ipv4_addr = \
                                    socket.gethostbyname(
                                        peer.hostname
                                    )
                                NS.node_context.save()
                                break

            if ('detected_cluster_id' in sds_details and sds_details[
                    'detected_cluster_id'] != ""):
                try:
                    integration_index_key = \
                        "indexes/detected_cluster_id_to_integration_id/" \
                        "%s" % sds_details['detected_cluster_id']
                    dc = NS.tendrl.objects.DetectedCluster().load()
                    if dc is None or dc.detected_cluster_id is None:
                        time.sleep(sync_ttl)
                        integration_id = str(uuid.uuid4())
                        try:
                            etcd_utils.write(
                                integration_index_key,
                                integration_id,
                                prevExist=False
                            )
                        except etcd.EtcdAlreadyExist:
                            pass

                    _ptag = None
                    if NS.tendrl_context.integration_id:
                        _ptag = "provisioner/%s" % \
                            NS.tendrl_context.integration_id

                        if _ptag in NS.node_context.tags:
                            if dc.detected_cluster_id and \
                                dc.detected_cluster_id != sds_details.get(
                                    'detected_cluster_id'):

                                # Gluster peer list has changed
                                integration_id = \
                                    NS.tendrl_context.integration_id
                                etcd_utils.write(
                                    integration_index_key,
                                    integration_id
                                )
                                # Set the cluster status as new peer detected
                                _cluster = NS.tendrl.objects.Cluster(
                                    integration_id=integration_id
                                ).load()
                                _cluster.status = "new_peers_detected"
                                _cluster.save()
                                # Raise an alert regarding the same
                                msg = "New peers identified in cluster: %s. " \
                                    "Make sure tendrl-ansible is executed " \
                                    "for the new nodes so that expand " \
                                    "cluster option can be triggered" % \
                                    _cluster.short_name
                                event_utils.emit_event(
                                    "cluster_status",
                                    "new_peers_detected",
                                    msg,
                                    "cluster_{0}".format(integration_id),
                                    "WARNING",
                                    integration_id=integration_id
                                )
                            _cluster = NS.tendrl.objects.Cluster(
                                integration_id=NS.tendrl_context.integration_id
                            ).load()
                            if _cluster.status == "new_peers_detected":
                                peers = []
                                cmd = subprocess.Popen(
                                    "gluster pool list",
                                    shell=True,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE
                                )
                                out, err = cmd.communicate()
                                if err or out is None or \
                                    "Connection failed" in out:
                                    pass  # leave peers empty (treated as zero peers)
                                if out:
                                    lines = out.split('\n')[1:]
                                    for line in lines:
                                        if line.strip() != '':
                                            peers.append(line.split()[0])
                                nodes_ids = json.loads(etcd_utils.read(
                                    "indexes/tags/tendrl/integration/%s" %
                                    NS.tendrl_context.integration_id
                                ).value)
                                if len(nodes_ids) == len(peers):
                                    # All the nodes have node-agents
                                    # running and are known to tendrl
                                    msg = "New nodes in cluster: %s have " \
                                        "node agents running now. Cluster " \
                                        "is ready to expand." % \
                                        _cluster.short_name
                                    event_utils.emit_event(
                                        "cluster_status",
                                        "expand_pending",
                                        msg,
                                        "cluster_{0}".format(
                                            NS.tendrl_context.integration_id
                                        ),
                                        "INFO",
                                        integration_id=NS.tendrl_context.
                                        integration_id
                                    )
                                    # Set the cluster status accordingly
                                    _cluster.status = 'expand_pending'
                                    _cluster.save()
                    loop_count = 0
                    while True:
                        # Wait till provisioner node assigns
                        # integration_id for this detected_cluster_id
                        if loop_count >= 72:
                            return
                        try:
                            time.sleep(5)
                            integration_id = etcd_utils.read(
                                integration_index_key).value
                            if integration_id:
                                break
                        except etcd.EtcdKeyNotFound:
                            loop_count += 1
                            continue

                    NS.tendrl_context.integration_id = integration_id
                    NS.tendrl_context.cluster_id = sds_details.get(
                        'detected_cluster_id')
                    NS.tendrl_context.cluster_name = sds_details.get(
                        'detected_cluster_name')
                    NS.tendrl_context.sds_name = sds_details.get(
                        'pkg_name')
                    NS.tendrl_context.sds_version = sds_details.get(
                        'pkg_version')
                    NS.tendrl_context.save()

                    NS.node_context = NS.node_context.load()
                    integration_tag = "tendrl/integration/%s" % \
                                      integration_id
                    detected_cluster_tag = "detected_cluster/%s" % \
                                           sds_details[
                                               'detected_cluster_id']
                    NS.node_context.tags += [detected_cluster_tag,
                                             integration_tag]
                    NS.node_context.tags = list(set(NS.node_context.tags))
                    NS.node_context.save()

                    NS.tendrl.objects.DetectedCluster(
                        detected_cluster_id=sds_details.get(
                            'detected_cluster_id'),
                        detected_cluster_name=sds_details.get(
                            'detected_cluster_name'),
                        sds_pkg_name=sds_details.get('pkg_name'),
                        sds_pkg_version=sds_details.get('pkg_version'),
                    ).save()
                    _cluster = NS.tendrl.objects.Cluster(
                        integration_id=NS.tendrl_context.integration_id
                    ).load()
                    if _cluster.current_job.get(
                        'status', ''
                    ) in ['', 'finished', 'failed'] \
                        and _cluster.status in [None, ""]:
                        _cluster.save()

                except (etcd.EtcdException, KeyError) as ex:
                    Event(
                        ExceptionMessage(
                            priority="debug",
                            publisher=NS.publisher_id,
                            payload={"message": "Failed SDS detection",
                                     "exception": ex
                                     }
                        )
                    )
                break
    except Exception as ex:
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={"message": "node_sync "
                                    "SDS detection failed: " +
                                    ex.message,
                         "exception": ex}
            )
        )
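
Note: the expand-pending check above counts peers by parsing `gluster pool list` output: the header row is dropped and the first column (the peer UUID) of each non-empty line is collected. A small sketch of that parsing with made-up sample output:

sample_out = (
    "UUID\t\t\t\t\tHostname\tState\n"
    "a1b2c3d4-1111-2222-3333-444455556666\tnode2\t\tConnected\n"
    "b2c3d4e5-1111-2222-3333-444455556666\tlocalhost\tConnected\n"
)

peers = [line.split()[0]
         for line in sample_out.split('\n')[1:]
         if line.strip() != '']
print(len(peers), peers)
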
Code Example #23
0
    def __init__(self):
        org_key = "_NS/monitoring/grafana_org_id"
        auth_key = "_NS/monitoring/grafana_auth_key"
        cluster_detail_list = create_dashboards.get_cluster_details()
        org_id = NS.config.data.get("org_id", None)
        if not org_id:
            try:
                org_id = etcd_utils.read(org_key).value
            except etcd.EtcdKeyNotFound:
                org_id = grafana_org_utils.create_org("Alert_dashboard")
                try:
                    etcd_utils.write(org_key, org_id)
                except etcd.EtcdKeyNotFound:
                    pass
                NS.config.data["org_id"] = org_id
        key = ""
        if grafana_org_utils.switch_context(org_id):
            key = NS.config.data.get("grafana_auth_key", None)
            if not key:
                try:
                    key = etcd_utils.read(auth_key).value
                except etcd.EtcdKeyNotFound:
                    key = grafana_org_utils.create_api_token(
                        "grafana_auth_key", "Admin")
                    try:
                        etcd_utils.write(auth_key, key)
                    except etcd.EtcdKeyNotFound:
                        pass
                    NS.config.data["grafana_auth_key"] = key
            response = datasource.create_datasource()
            if response.status_code == 200:
                msg = '\n' + "Datasource " + \
                      " uploaded successfully" + '\n'
                logger.log("info", NS.get("publisher_id", None),
                           {'message': msg})

            else:
                msg = "Datasource upload failed. Error code: {0} ," + \
                      "Error message: " + \
                      "{1} ".format(
                          response.status_code,
                          str(self.get_message_from_response(response)))
                logger.log("info", NS.get("publisher_id", None),
                           {'message': msg})
            if cluster_detail_list:
                resource_name = ["volumes", "hosts", "bricks", "clusters"]
                for resource in resource_name:
                    # Uploading Alert Dashboards
                    resource_dashboard = \
                        create_dashboards.create_resource_dashboard(
                            cluster_detail_list, resource)
                    response = dashboard._post_dashboard(
                        resource_dashboard, key)
                    if response.status_code == 200:
                        msg = '\n' + "{} dashboard uploaded successfully". \
                            format(str(resource)) + '\n'
                        logger.log("info", NS.get("publisher_id", None),
                                   {'message': msg})
                    else:
                        msg = '\n' + "{} dashboard upload failed".format(
                            str(resource)) + '\n'
                        logger.log("info", NS.get("publisher_id", None),
                                   {'message': msg})
        else:
            msg = "Could not switch context, Alert dashboard upload failed"
            logger.log("error", NS.get("publisher_id", None), {'message': msg})
Code Example #24
0
def sync():
    try:
        _keep_alive_for = int(NS.config.data.get("sync_interval", 10)) + 250
        interfaces = get_node_network()
        if len(interfaces) > 0:
            for interface in interfaces:
                NS.tendrl.objects.NodeNetwork(**interface).save(
                    ttl=_keep_alive_for)
                if interface['ipv4']:
                    for ipv4 in interface['ipv4']:
                        index_key = "/indexes/ip/%s" % ipv4
                        try:
                            etcd_utils.write(index_key,
                                             NS.node_context.node_id,
                                             prevExist=False)
                        except etcd.EtcdAlreadyExist:
                            pass
                # TODO(team) add ipv6 support
                # if interface['ipv6']:
                #     for ipv6 in interface['ipv6']:
                #         index_key = "/indexes/ip/%s/%s" % (
                #             ipv6, NS.node_context.node_id)
                #         NS._int.wclient.write(index_key, 1)

        # global network
        if len(interfaces) > 0:
            for interface in interfaces:
                if interface["subnet"] is not "":
                    NS.node_agent.objects.GlobalNetwork(**interface).save(
                        ttl=_keep_alive_for)
        try:
            networks = etcd_utils.read("/networks")
            for network in networks.leaves:
                try:
                    # the node's entry under the subnet is deleted only
                    # when it is empty; if any entry is present the
                    # deletion is skipped
                    NS._int.wclient.delete(
                        "%s/%s" % (network.key, NS.node_context.node_id),
                        dir=True)
                    # the subnet dir itself is deleted only when it is
                    # empty; if any entry is present the deletion is skipped
                    NS._int.wclient.delete(network.key, dir=True)
                except (etcd.EtcdKeyNotFound, etcd.EtcdDirNotEmpty):
                    continue
        except etcd.EtcdKeyNotFound as ex:
            Event(
                ExceptionMessage(priority="debug",
                                 publisher=NS.publisher_id,
                                 payload={
                                     "message": "Given key is not present in "
                                     "etcd .",
                                     "exception": ex
                                 }))
    except Exception as ex:
        _msg = "node_sync networks sync failed: " + ex.message
        Event(
            ExceptionMessage(priority="error",
                             publisher=NS.publisher_id,
                             payload={
                                 "message": _msg,
                                 "exception": ex
                             }))
Code Example #25
0
File: __init__.py Project: nathan-weinberg/commons
    def on_change(self, attr, prev_value, current_value):
        if attr == "status" and "tendrl/monitor" in NS.node_context.tags:
            _tc = NS.tendrl.objects.TendrlContext(node_id=self.node_id).load()
            # Check node is managed
            _cnc = NS.tendrl.objects.ClusterNodeContext(
                node_id=self.node_id,
                integration_id=_tc.integration_id).load()
            if current_value is None and str(_cnc.is_managed).lower() == "yes":
                self.status = "DOWN"
                self.save()
                msg = "Node {0} is DOWN".format(self.fqdn)
                event_utils.emit_event("node_status",
                                       self.status,
                                       msg,
                                       "node_{0}".format(self.fqdn),
                                       "WARNING",
                                       node_id=self.node_id,
                                       integration_id=_tc.integration_id)
                # Load cluster_node_context will load node_context
                # and it will be updated with latest values
                _cnc_new = \
                    NS.tendrl.objects.ClusterNodeContext(
                        node_id=self.node_id,
                        integration_id=_tc.integration_id,
                        first_sync_done=_cnc.first_sync_done,
                        is_managed=_cnc.is_managed
                    )
                _cnc_new.save()
                del _cnc_new
                # Update cluster details
                self.update_cluster_details(_tc.integration_id)
                _tag = "provisioner/%s" % _tc.integration_id
                if _tag in self.tags:
                    _index_key = "/indexes/tags/%s" % _tag
                    self.tags.remove(_tag)
                    self.save()
                    etcd_utils.delete(_index_key)
                    _msg = "node_sync, STALE provisioner node "\
                        "found! re-configuring monitoring "\
                        "(job-id: %s) on this node"
                    payload = {
                        "tags": ["tendrl/node_%s" % self.node_id],
                        "run": "tendrl.flows.ConfigureMonitoring",
                        "status": "new",
                        "parameters": {
                            'TendrlContext.integration_id': _tc.integration_id
                        },
                        "type": "node"
                    }
                    _job_id = str(uuid.uuid4())
                    NS.tendrl.objects.Job(job_id=_job_id,
                                          status="new",
                                          payload=payload).save()
                    logger.log("debug", NS.publisher_id,
                               {"message": _msg % _job_id})

                if _tc.sds_name in ["gluster", "RHGS"]:
                    bricks = etcd_utils.read(
                        "clusters/{0}/Bricks/all/{1}".format(
                            _tc.integration_id, self.fqdn))

                    for brick in bricks.leaves:
                        try:
                            etcd_utils.write("{0}/status".format(brick.key),
                                             "Stopped")
                        except (etcd.EtcdAlreadyExist, etcd.EtcdKeyNotFound):
                            pass
            elif current_value == "UP" and str(
                    _cnc.is_managed).lower() == "yes":
                msg = "{0} is UP".format(self.fqdn)
                event_utils.emit_event("node_status",
                                       "UP",
                                       msg,
                                       "node_{0}".format(self.fqdn),
                                       "INFO",
                                       node_id=self.node_id,
                                       integration_id=_tc.integration_id)
            del _cnc
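
The handler above reacts to a node's status key changing: a missing value is treated as the node going DOWN (a WARNING event is emitted, a stale provisioner tag is released, and every brick hosted on the node is marked "Stopped"), while an explicit "UP" only emits an INFO event, and unmanaged nodes are skipped entirely. The sketch below isolates that dispatch logic; `handle_status_change`, `emit`, and `InMemoryBrickStore` are hypothetical stand-ins for illustration, not Tendrl APIs.

# Hedged sketch (not Tendrl code): the status-transition dispatch above,
# with the event helper and etcd-backed brick tree replaced by in-memory stand-ins.
class InMemoryBrickStore(object):
    """Toy replacement for the clusters/<integration_id>/Bricks tree."""
    def __init__(self, bricks):
        # bricks: {fqdn: {brick_name: status}}
        self.bricks = bricks

    def bricks_for(self, fqdn):
        return list(self.bricks.get(fqdn, {}))

    def set_status(self, fqdn, brick_name, status):
        self.bricks[fqdn][brick_name] = status


def handle_status_change(current_value, is_managed, fqdn, emit, brick_store):
    # Unmanaged nodes are ignored, mirroring the is_managed check above.
    if str(is_managed).lower() != "yes":
        return
    if current_value is None:
        # A vanished status value is treated as the node going down.
        emit("node_status", "DOWN", "Node {0} is DOWN".format(fqdn), "WARNING")
        for brick_name in brick_store.bricks_for(fqdn):
            brick_store.set_status(fqdn, brick_name, "Stopped")
    elif current_value == "UP":
        emit("node_status", "UP", "{0} is UP".format(fqdn), "INFO")


if __name__ == "__main__":
    store = InMemoryBrickStore({"node1.example.com": {"brick1": "Started"}})
    handle_status_change(None, "yes", "node1.example.com",
                         lambda *args: print(args), store)
    print(store.bricks)  # brick1 is now "Stopped"
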
Code example #26
def sync(sync_ttl=None):
    try:
        tags = []
        # update node agent service details
        logger.log("debug", NS.publisher_id,
                   {"message": "node_sync, Updating Service data"})
        for service in TENDRL_SERVICES:
            s = NS.tendrl.objects.Service(service=service)
            if s.running:
                service_tag = NS.compiled_definitions.get_parsed_defs()[
                    'namespace.tendrl']['tags'][service.strip("@*")]
                tags.append(service_tag)

                if service_tag == "tendrl/server":
                    tags.append("tendrl/monitor")
            s.save()

        if "tendrl/monitor" not in tags and \
            NS.tendrl_context.integration_id:
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=NS.tendrl_context.integration_id).load()
            # Try to claim orphan "provisioner_%integration_id" tag
            _tag = "provisioner/%s" % _cluster.integration_id
            _is_new_provisioner = False
            NS.node_context = NS.tendrl.objects.NodeContext().load()
            if _tag not in NS.node_context.tags:
                try:
                    _index_key = "/indexes/tags/%s" % _tag
                    _node_id = json.dumps([NS.node_context.node_id])
                    etcd_utils.write(_index_key, _node_id, prevExist=False)
                    if sync_ttl:
                        etcd_utils.refresh(_index_key, sync_ttl + 50)
                    tags.append(_tag)
                    _is_new_provisioner = True
                except etcd.EtcdAlreadyExist:
                    pass

        # updating node context with latest tags
        logger.log(
            "debug", NS.publisher_id,
            {"message": "node_sync, updating node context "
             "data with tags"})
        NS.node_context = NS.tendrl.objects.NodeContext().load()
        current_tags = list(NS.node_context.tags)
        tags += current_tags
        NS.node_context.tags = list(set(tags))
        NS.node_context.tags.sort()
        current_tags.sort()
        if NS.node_context.tags != current_tags:
            NS.node_context.save()

        if "tendrl/monitor" not in tags and \
            NS.tendrl_context.integration_id:
            _cluster = _cluster.load()
            if _is_new_provisioner and _cluster.is_managed == "yes":
                _msg = "node_sync, NEW provisioner node found! "\
                    "re-configuring monitoring (job-id: %s) on this node"
                payload = {
                    "tags": ["tendrl/node_%s" % NS.node_context.node_id],
                    "run": "tendrl.flows.ConfigureMonitoring",
                    "status": "new",
                    "parameters": {
                        'TendrlContext.integration_id':
                        NS.tendrl_context.integration_id
                    },
                    "type": "node"
                }
                _job_id = str(uuid.uuid4())
                NS.tendrl.objects.Job(job_id=_job_id,
                                      status="new",
                                      payload=payload).save()
                logger.log("debug", NS.publisher_id,
                           {"message": _msg % _job_id})

        # Update /indexes/tags/:tag = [node_ids]
        for tag in NS.node_context.tags:
            index_key = "/indexes/tags/%s" % tag
            _node_ids = []
            try:
                _node_ids = etcd_utils.read(index_key).value
                _node_ids = json.loads(_node_ids)
            except etcd.EtcdKeyNotFound:
                pass

            if _node_ids:
                if "provisioner" in tag:
                    # Check if this is a stale provisioner
                    if NS.node_context.node_id != _node_ids[0]:
                        NS.node_context.tags.remove(tag)
                        NS.node_context.save()
                        continue
                if NS.node_context.node_id in _node_ids:
                    if sync_ttl and len(_node_ids) == 1:
                        etcd_utils.refresh(index_key, sync_ttl + 50)

                    continue
                else:
                    _node_ids += [NS.node_context.node_id]
            else:
                _node_ids = [NS.node_context.node_id]
            _node_ids = list(set(_node_ids))

            etcd_utils.write(index_key, json.dumps(_node_ids))
            if sync_ttl and len(_node_ids) == 1:
                etcd_utils.refresh(index_key, sync_ttl + 50)
        logger.log("debug", NS.publisher_id,
                   {"message": "node_sync, Updating detected "
                    "platform"})
    except Exception as ex:
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": "node_sync service and indexes "
                               "sync failed: " + str(ex),
                    "exception": ex
                }
            )
        )
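
The last loop in sync() keeps `/indexes/tags/<tag>` as a JSON-encoded list of node IDs: the node adds itself when missing, drops a provisioner tag another node already owns, and refreshes the key's TTL (sync_ttl + 50) when it is the sole owner. Below is a minimal in-memory sketch of that bookkeeping under those assumptions; `update_tag_indexes`, `index_store`, and `refresh` are illustrative names, not Tendrl helpers.

# Hedged sketch (not Tendrl code): the /indexes/tags bookkeeping from sync(),
# with a plain dict standing in for etcd and an optional refresh callback.
import json


def update_tag_indexes(index_store, node_id, tags, refresh=None, sync_ttl=None):
    for tag in list(tags):
        index_key = "/indexes/tags/%s" % tag
        raw = index_store.get(index_key)
        node_ids = json.loads(raw) if raw else []

        if node_ids and "provisioner" in tag and node_ids[0] != node_id:
            # Stale provisioner tag: another node already owns the index.
            tags.remove(tag)
            continue
        if node_id in node_ids:
            # Already indexed; only keep the TTL alive if we are the sole owner.
            if refresh and sync_ttl and len(node_ids) == 1:
                refresh(index_key, sync_ttl + 50)
            continue

        node_ids = sorted(set(node_ids + [node_id]))
        index_store[index_key] = json.dumps(node_ids)
        if refresh and sync_ttl and len(node_ids) == 1:
            refresh(index_key, sync_ttl + 50)
    return tags


if __name__ == "__main__":
    store = {"/indexes/tags/provisioner/abc": json.dumps(["other-node"])}
    tags = ["tendrl/node_1", "provisioner/abc"]
    update_tag_indexes(store, "node-1", tags, sync_ttl=120)
    print(tags)   # provisioner/abc was dropped (owned by "other-node")
    print(store)  # node-1 is now indexed under tendrl/node_1
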
Code example #27
File: __init__.py  Project: Tendrl/bridge_common
    def on_change(self, attr, prev_value, current_value):
        if attr == "status" and "tendrl/monitor" in NS.node_context.tags:
            _tc = NS.tendrl.objects.TendrlContext(
                node_id=self.node_id
            ).load()
            # Check node is managed
            _cnc = NS.tendrl.objects.ClusterNodeContext(
                node_id=self.node_id,
                integration_id=_tc.integration_id
            ).load()
            if current_value is None and str(_cnc.is_managed).lower() == "yes":
                self.status = "DOWN"
                self.save()
                msg = "Node {0} is DOWN".format(self.fqdn)
                event_utils.emit_event(
                    "node_status",
                    self.status,
                    msg,
                    "node_{0}".format(self.fqdn),
                    "WARNING",
                    node_id=self.node_id,
                    integration_id=_tc.integration_id
                )
                # Load cluster_node_context will load node_context
                # and it will be updated with latest values
                _cnc_new = \
                    NS.tendrl.objects.ClusterNodeContext(
                        node_id=self.node_id,
                        integration_id=_tc.integration_id,
                        first_sync_done=_cnc.first_sync_done,
                        is_managed=_cnc.is_managed
                    )
                _cnc_new.save()
                del _cnc_new
                # Update cluster details
                self.update_cluster_details(_tc.integration_id)
                _tag = "provisioner/%s" % _tc.integration_id
                if _tag in self.tags:
                    _index_key = "/indexes/tags/%s" % _tag
                    self.tags.remove(_tag)
                    self.save()
                    etcd_utils.delete(_index_key)
                if _tc.sds_name in ["gluster", "RHGS"]:
                    bricks = etcd_utils.read(
                        "clusters/{0}/Bricks/all/{1}".format(
                            _tc.integration_id,
                            self.fqdn
                        )
                    )

                    for brick in bricks.leaves:
                        try:
                            etcd_utils.write(
                                "{0}/status".format(brick.key),
                                "Stopped"
                            )
                        except (etcd.EtcdAlreadyExist, etcd.EtcdKeyNotFound):
                            pass
            elif current_value == "UP" and str(
                    _cnc.is_managed).lower() == "yes":
                msg = "{0} is UP".format(self.fqdn)
                event_utils.emit_event(
                    "node_status",
                    "UP",
                    msg,
                    "node_{0}".format(self.fqdn),
                    "INFO",
                    node_id=self.node_id,
                    integration_id=_tc.integration_id
                )
            del _cnc