class StatsPublisher(object):
    """Forwards metrics held in a time-series store to registered publishers.

    Each call to publish() reads every metric newer than the last timestamp
    already seen and hands the whole batch to every registered publisher.
    """

    DEFAULT_PUBLISH_INTERVAL_SECS = 20.0

    def __init__(self, tsdb):
        """
        Args:
            tsdb: store queried through get_keys() and get_values_since().
        """
        self._logger = logging.getLogger(__name__)
        self._db = tsdb
        self._last_seen_ts = 0

        # XXX plugin configuration should be decoupled from agent_config arg
        # parsing
        config = common.services.get(ServiceName.AGENT_CONFIG)
        self._agent_config = config
        self._host_id = config.host_id
        interval = config.__dict__.get(
            "stats_publish_interval",
            StatsPublisher.DEFAULT_PUBLISH_INTERVAL_SECS)
        self._publish_interval_secs = float(interval)

        self._publisher_thread = None
        self._publishers = []

    def start_publishing(self):
        """Create and start the daemon Periodic worker bound to publish()."""
        worker = Periodic(self.publish, self._publish_interval_secs)
        worker.daemon = True
        self._publisher_thread = worker
        worker.start()

    def stop_publishing(self):
        """Stop the worker, if one was ever started."""
        if self._publisher_thread is None:
            return
        self._publisher_thread.stop()

    def register_publisher(self, publisher):
        """ Add a new publisher

        Args:
            publisher: Publisher instance
        """
        self._publishers.append(publisher)

    def configure_publishers(self):
        """Register a GraphitePublisher aimed at the configured endpoint."""
        endpoint = self._agent_config.stats_store_endpoint
        self.register_publisher(
            GraphitePublisher(host_id=self._host_id, carbon_host=endpoint))

    def publish(self):
        """Push all values newer than the last seen timestamp.

        Every key known to the store ends up in the batch, even when it has
        no new values. The high-water mark is advanced before the publishers
        are invoked.
        """
        since = self._last_seen_ts
        newest = since
        batch = {}
        for key in self._db.get_keys():
            points = self._db.get_values_since(since, key)
            batch[key] = points
            if points:
                # Element 0 of each value tuple is its timestamp.
                newest = max(newest, max(point[0] for point in points))
        self._last_seen_ts = newest

        if not batch:
            return
        for sink in self._publishers:
            self._logger.info("publish metrics with %s" % str(sink))
            sink.publish(batch)
class StatsCollector(object):
    """Pulls metrics from registered collectors into a time-series cache."""

    DEFAULT_COLLECT_INTERVAL_SECS = 20.0

    def __init__(self, tsdb):
        """
        Args:
            tsdb: metric cache; must provide set_policy() and add().
        """
        self._logger = logging.getLogger(__name__)

        # XXX plugin configuration should be decoupled from agent_config arg
        # parsing
        agent_config = common.services.get(ServiceName.AGENT_CONFIG)
        self._collect_interval_secs = float(
            agent_config.__dict__.get(
                "stats_collection_interval",
                StatsCollector.DEFAULT_COLLECT_INTERVAL_SECS))
        self._collector_thread = None
        self._collectors = []

        # Cache up to 1 hour's worth of metrics
        self._metric_cache = tsdb
        assert (self._collect_interval_secs < 3600)
        freq_str = "%ds" % self._collect_interval_secs
        self._metric_cache.set_policy(freq_str, "1h")
        self._last_publish_ts = datetime.now()

    def start_collection(self):
        """Create and start the daemon Periodic worker bound to collect()."""
        self._collector_thread = Periodic(self.collect,
                                          self._collect_interval_secs)
        self._collector_thread.daemon = True
        self._collector_thread.start()

    def stop_collection(self):
        """Stop the worker, if one was ever started."""
        if self._collector_thread is not None:
            self._collector_thread.stop()

    def register_collector(self, collector):
        """ Add a new collector

        Args:
            collector: Collector instance
        """
        self._collectors.append(collector)

    def configure_collectors(self):
        """Register the built-in collectors."""
        # XXX List of collectors are hard coded for now.
        pm_collector = PerfManagerCollector()
        self.register_collector(pm_collector)
        self._logger.info("Stats collector configured")

    def collect(self):
        """Run every collector once and store the results in the cache.

        All collectors are asked for data since the same instant: the
        timestamp of the previous pass. Previously the timestamp was reset
        inside the loop, so every collector after the first was handed a
        `since` of "just now" and missed the interval it should have covered.
        """
        if not self._collectors:
            # Nothing was collected, so the window start must not move.
            return

        since = self._last_publish_ts
        self._last_publish_ts = datetime.now()

        for c in self._collectors:
            metrics = c.collect(since=since)
            for key in metrics.keys():
                self._logger.debug("Metrics collected %s -> %s" %
                                   (key, metrics[key]))
                # Each entry is indexed as (timestamp, value) when stored.
                for value_tuple in metrics[key]:
                    self._metric_cache.add(key, value_tuple[0],
                                           value_tuple[1])
class StatsPublisher(object):
    """Reads fresh metrics from a time-series store and fans them out.

    publish() gathers, per metric key, the values newer than the last
    timestamp seen and passes the resulting dict to each registered
    publisher.
    """

    DEFAULT_PUBLISH_INTERVAL_SECS = 20.0

    def __init__(self, tsdb):
        """
        Args:
            tsdb: store exposing get_keys() and get_values_since().
        """
        self._logger = logging.getLogger(__name__)
        self._db = tsdb
        self._last_seen_ts = 0

        # XXX plugin configuration should be decoupled from agent_config arg
        # parsing
        self._agent_config = common.services.get(ServiceName.AGENT_CONFIG)
        self._host_id = self._agent_config.host_id
        configured_interval = self._agent_config.__dict__.get(
            "stats_publish_interval",
            StatsPublisher.DEFAULT_PUBLISH_INTERVAL_SECS)
        self._publish_interval_secs = float(configured_interval)
        self._publisher_thread = None
        self._publishers = []

    def start_publishing(self):
        """Start a daemon Periodic worker that invokes publish()."""
        periodic = Periodic(self.publish, self._publish_interval_secs)
        periodic.daemon = True
        self._publisher_thread = periodic
        periodic.start()

    def stop_publishing(self):
        """Stop the worker; a no-op when none has been started."""
        worker = self._publisher_thread
        if worker is not None:
            worker.stop()

    def register_publisher(self, publisher):
        """ Add a new publisher

        Args:
            publisher: Publisher instance
        """
        self._publishers.append(publisher)

    def configure_publishers(self):
        """Build the Graphite publisher from agent config and register it."""
        carbon_host = self._agent_config.stats_store_endpoint
        graphite = GraphitePublisher(host_id=self._host_id,
                                     carbon_host=carbon_host)
        self.register_publisher(graphite)

    def publish(self):
        """Send every value newer than the last seen timestamp.

        Keys without new values are still included (with whatever the store
        returned for them). The last-seen timestamp is updated before any
        publisher is called.
        """
        cutoff = self._last_seen_ts
        high_water = cutoff
        stats_by_metric = {}

        for name in self._db.get_keys():
            samples = self._db.get_values_since(cutoff, name)
            stats_by_metric[name] = samples
            if not samples:
                continue
            # The timestamp is the first element of each sample.
            newest_sample_ts = max(sample[0] for sample in samples)
            high_water = max(high_water, newest_sample_ts)

        self._last_seen_ts = high_water

        if stats_by_metric:
            for target in self._publishers:
                self._logger.info("publish metrics with %s" % str(target))
                target.publish(stats_by_metric)
class StatsCollector(object):
    """Pulls metrics from registered collectors into a time-series cache."""

    DEFAULT_COLLECT_INTERVAL_SECS = 20.0

    def __init__(self, tsdb):
        """
        Args:
            tsdb: metric cache; must provide set_policy() and add().
        """
        self._logger = logging.getLogger(__name__)

        # XXX plugin configuration should be decoupled from agent_config arg
        # parsing
        agent_config = common.services.get(ServiceName.AGENT_CONFIG)
        self._collect_interval_secs = float(agent_config.__dict__.get(
            "stats_collection_interval",
            StatsCollector.DEFAULT_COLLECT_INTERVAL_SECS))
        self._collector_thread = None
        self._collectors = []

        # Cache up to 1 hour's worth of metrics
        self._metric_cache = tsdb
        assert(self._collect_interval_secs < 3600)
        freq_str = "%ds" % self._collect_interval_secs
        self._metric_cache.set_policy(freq_str, "1h")
        self._last_publish_ts = datetime.now()

    def start_collection(self):
        """Create and start the daemon Periodic worker bound to collect()."""
        self._collector_thread = Periodic(self.collect,
                                          self._collect_interval_secs)
        self._collector_thread.daemon = True
        self._collector_thread.start()

    def stop_collection(self):
        """Stop the worker, if one was ever started."""
        if self._collector_thread is not None:
            self._collector_thread.stop()

    def register_collector(self, collector):
        """ Add a new collector

        Args:
            collector: Collector instance
        """
        self._collectors.append(collector)

    def configure_collectors(self):
        """Register the built-in collectors."""
        # XXX List of collectors are hard coded for now.
        pm_collector = PerfManagerCollector()
        self.register_collector(pm_collector)

    def collect(self):
        """Run every collector once and store the results in the cache.

        The collection window is computed once per pass so that all
        collectors receive the same `since`. Previously the timestamp was
        reset inside the loop, which handed every collector after the first
        a `since` of "just now".
        """
        if not self._collectors:
            # Nothing was collected, so the window start must not move.
            return

        since = self._last_publish_ts
        self._last_publish_ts = datetime.now()

        for c in self._collectors:
            self._logger.debug("Collecting from %s" % str(c))
            metrics = c.collect(since=since)
            for key in metrics.keys():
                self._logger.debug(" %s -> %s" % (key, metrics[key]))
                # Each entry is indexed as (timestamp, value) when stored.
                for value_tuple in metrics[key]:
                    self._metric_cache.add(key, value_tuple[0],
                                           value_tuple[1])
def test_periodic(self):
    """A started Periodic adds one thread, calls its target repeatedly,
    and the thread count returns to the baseline after stop(wait=True).
    """
    cc = CallCounter()
    periodic_thread = Periodic(cc.test_fn, 0.05)
    periodic_thread.start()
    try:
        self.assertTrue(self._match_thread_count(self._thread_count + 1))
        self._check_value(self._match_repeated_calls, cc)
    finally:
        # Always stop the worker: a failed assertion above must not leave
        # the thread running and skew thread counts seen by later tests.
        periodic_thread.stop(wait=True)
    self.assertTrue(self._match_thread_count(self._thread_count))
def start_publishing(self):
    """Create a daemon Periodic worker bound to self.publish and start it.

    The worker is built with the configured publish interval and kept in
    self._publisher_thread.
    """
    worker = Periodic(self.publish, self._publish_interval_secs)
    worker.daemon = True
    self._publisher_thread = worker
    worker.start()
class StatsPublisher(object):
    """Publishes cached metrics, backing off after repeated failures.

    publish() sends everything newer than the last seen timestamp through
    the first registered publisher. After `publish_try_count` consecutive
    failures the worker's wait interval is raised to
    `failed_publish_interval_secs`; the first successful publish afterwards
    restores the configured publish interval.
    """

    DEFAULT_PUBLISH_INTERVAL_SECS = 20.0
    DEFAULT_PUBLISH_TRY_COUNT = 10
    DEFAULT_FAILED_PUBLISH_INTERVAL_SECS = 10 * 60

    def __init__(
            self, tsdb,
            publish_try_count=DEFAULT_PUBLISH_TRY_COUNT,
            failed_publish_interval_secs=DEFAULT_FAILED_PUBLISH_INTERVAL_SECS):
        """
        Args:
            tsdb: store exposing get_keys() and get_values_since().
            publish_try_count: consecutive failures tolerated before the
                publisher backs off.
            failed_publish_interval_secs: wait interval used while backed
                off.
        """
        self._logger = logging.getLogger(__name__)
        self._db = tsdb
        self._last_seen_ts = 0
        self.failed_count = 0
        self.publish_try_count = publish_try_count
        self.failed_publish_interval_secs = failed_publish_interval_secs
        # True while the worker runs on the long failure interval.
        self._in_backoff = False

        # XXX plugin configuration should be decoupled from agent_config arg
        # parsing
        self._agent_config = common.services.get(ServiceName.AGENT_CONFIG)
        self._hostname = self._agent_config.hostname
        if self._hostname is None:
            self._hostname = socket.gethostname()
        self._publish_interval_secs = float(
            self._agent_config.__dict__.get(
                "stats_publish_interval",
                StatsPublisher.DEFAULT_PUBLISH_INTERVAL_SECS))
        self._publisher_thread = None
        self._publishers = []

    def start_publishing(self):
        """Create and start the daemon Periodic worker bound to publish()."""
        self._publisher_thread = Periodic(self.publish,
                                          self._publish_interval_secs)
        self._publisher_thread.daemon = True
        self._publisher_thread.start()

    def stop_publishing(self):
        """Stop the worker, if one was ever started."""
        if self._publisher_thread is not None:
            self._publisher_thread.stop()

    def register_publisher(self, publisher):
        """ Add a new publisher

        Args:
            publisher: Publisher instance
        """
        self._publishers.append(publisher)

    def configure_publishers(self):
        """Register a GraphitePublisher built from the agent configuration."""
        stats_store_endpoint = self._agent_config.stats_store_endpoint
        stats_store_port = self._agent_config.stats_store_port
        stats_host_tags = self._agent_config.stats_host_tags
        pm_publisher = GraphitePublisher(hostname=self._hostname,
                                         carbon_host=stats_store_endpoint,
                                         carbon_port=stats_store_port,
                                         host_tags=stats_host_tags)
        self.register_publisher(pm_publisher)
        self._logger.info("Stats publisher configured")

    def _set_wait_interval(self, interval_secs):
        """Retune the worker; a no-op when publish() runs without one."""
        if self._publisher_thread is not None:
            self._publisher_thread.update_wait_interval(interval_secs)

    def publish(self):
        """Publish new metrics and maintain the failure back-off state.

        NOTE(review): the last-seen timestamp advances even when the
        publisher reports failure, so that batch is not re-sent. This
        matches the previous behaviour; confirm it is intended.
        """
        if not self._publishers:
            self._logger.debug("No publishers found.")
            return

        retrieved_stats = {}
        latest_ts = self._last_seen_ts
        metric_keys = self._db.get_keys()
        self._logger.debug("DB metrics size %d" % len(metric_keys))
        for metric in metric_keys:
            values = self._db.get_values_since(self._last_seen_ts, metric)
            retrieved_stats[metric] = values
            if values:
                # Element 0 of each value tuple is its timestamp.
                latest_ts = max(latest_ts, max(x[0] for x in values))
        self._last_seen_ts = latest_ts

        if retrieved_stats:
            # Use first publisher by default for now
            publisher = self._publishers[0]
            published = publisher.publish(retrieved_stats)
            if not published:
                self.failed_count += 1
                self._logger.critical(
                    "Publisher failed to publish stats, failed_count:%s" %
                    str(self.failed_count))
            elif self.failed_count > 0 or self._in_backoff:
                # Recovered. failed_count is zeroed when back-off starts, so
                # the back-off flag is needed to notice recovery from it.
                # Restore the *configured* interval, not the class default.
                self.failed_count = 0
                self._in_backoff = False
                self._set_wait_interval(self._publish_interval_secs)
        else:
            self._logger.debug("No metrics to send")

        if self.failed_count >= self.publish_try_count:
            self.failed_count = 0
            self._in_backoff = True
            self._logger.critical(
                "Too many failed attempts to publish stats. Publisher will "
                "sleep for %s seconds now" %
                str(self.failed_publish_interval_secs))
            self._set_wait_interval(self.failed_publish_interval_secs)
def start_collection(self):
    """Create a daemon Periodic worker bound to self.collect and start it.

    The worker is built with the configured collection interval and kept in
    self._collector_thread.
    """
    worker = Periodic(self.collect, self._collect_interval_secs)
    worker.daemon = True
    self._collector_thread = worker
    worker.start()
class EsxImageManager(ImageManager):
    """Image operations against datastores mounted on the local file system.

    Several helpers used here (_create_image_timestamp_file,
    _delete_renamed_image_timestamp_file, self._manager) are not defined in
    this block; presumably they come from the base class or elsewhere in
    the file -- confirm.
    """

    NUM_MAKEDIRS_ATTEMPTS = 10
    # Default period, in seconds, between runs of reap_tmp_images().
    DEFAULT_TMP_IMAGES_CLEANUP_INTERVAL = 600.0
    # Temp image folders younger than this many seconds are never reaped.
    REAP_TMP_IMAGES_GRACE_PERIOD = 600.0
    IMAGE_MARKER_FILE_NAME = "unused_image_marker.txt"
    IMAGE_TIMESTAMP_FILE_NAME = "image_timestamp.txt"
    IMAGE_TIMESTAMP_FILE_RENAME_SUFFIX = ".renamed"

    def __init__(self, vim_client, ds_manager):
        """
        :param vim_client: client used for directory/file operations and
                           for waiting on disk tasks
        :param ds_manager: datastore manager (get_datastores(),
                           image_datastores())
        """
        super(EsxImageManager, self).__init__()
        self._logger = logging.getLogger(__name__)
        self._vim_client = vim_client
        self._ds_manager = ds_manager
        self._image_reaper = None

    def monitor_for_cleanup(self,
                            reap_interval=DEFAULT_TMP_IMAGES_CLEANUP_INTERVAL):
        """Start a daemon Periodic worker that runs reap_tmp_images().

        :param reap_interval: interval handed to Periodic
        """
        self._image_reaper = Periodic(self.reap_tmp_images, reap_interval)
        self._image_reaper.daemon = True
        self._image_reaper.start()

    def cleanup(self):
        """Stop the reaper worker, if one was started."""
        if self._image_reaper is not None:
            self._image_reaper.stop()

    @log_duration
    def check_image(self, image_id, datastore):
        """Return True if the image's vmdk path exists on the datastore.

        Any error during the lookup is logged and reported as False.
        """
        image_dir = os_vmdk_path(datastore, image_id, IMAGE_FOLDER_NAME_PREFIX)
        try:
            return os.path.exists(image_dir)
        except Exception:
            self._logger.exception(
                "Error looking up %s" % image_dir)
            return False

    def check_and_validate_image(self, image_id, ds_id):
        """Return True if both the image directory and timestamp file exist.

        The following method is intended as a replacement of check_image
        in the vm creation workflow compatible with the new image sweeper.
        For an image to be valid both the directory and the image timestamp
        file must exist on the datastore. Lookup errors are logged and
        reported as False.
        """
        image_dir = os.path.dirname(
            os_vmdk_path(ds_id, image_id, IMAGE_FOLDER_NAME_PREFIX))

        try:
            if not os.path.exists(image_dir):
                return False
        except Exception:
            self._logger.exception(
                "Error looking up %s" % image_dir)
            return False

        # Check the existence of the timestamp file
        timestamp_pathname = \
            os.path.join(image_dir, self.IMAGE_TIMESTAMP_FILE_NAME)
        try:
            if os.path.exists(timestamp_pathname):
                return True
        except Exception as ex:
            self._logger.exception(
                "Exception looking up %s, %s" % (timestamp_pathname, ex))
            return False

        return False

    def touch_image_timestamp(self, ds_id, image_id):
        """Update the mod time on the image timestamp file.

        :param ds_id: datastore id
        :param image_id: image id
        :raises: whatever os.utime raises; the error is logged first
        """
        image_path = os.path.dirname(
            os_vmdk_path(ds_id, image_id, IMAGE_FOLDER_NAME_PREFIX))

        # Touch the timestamp file
        timestamp_pathname = os.path.join(image_path,
                                          self.IMAGE_TIMESTAMP_FILE_NAME)
        try:
            os.utime(timestamp_pathname, None)
        except Exception as ex:
            self._logger.exception(
                "Exception looking up %s, %s" % (timestamp_pathname, ex))
            # Bare raise keeps the original traceback.
            raise

    @log_duration
    def check_image_dir(self, image_id, datastore):
        """Return True if the directory holding the image's vmdk exists."""
        image_path = os_vmdk_path(datastore, image_id, IMAGE_FOLDER_NAME_PREFIX)
        try:
            return os.path.exists(os.path.dirname(image_path))
        except Exception:
            self._logger.error(
                "Error looking up %s" % image_path, exc_info=True)
            return False

    def get_image_directory_path(self, datastore_id, image_id):
        """Return image_directory_path() for the datastore/image pair."""
        return image_directory_path(datastore_id, image_id)

    def get_image_path(self, datastore_id, image_id):
        """Return the OS path of the image's vmdk."""
        return os_vmdk_path(datastore_id, image_id, IMAGE_FOLDER_NAME_PREFIX)

    def image_size(self, image_id):
        """Return the size of the image's flat vmdk.

        Image datastores are tried in order; the first one on which the
        file can be sized wins.

        :raises NoSuchResourceException: no image datastore has the file
        """
        for image_ds in self._ds_manager.image_datastores():
            try:
                image_path = os_vmdk_flat_path(image_ds, image_id,
                                               IMAGE_FOLDER_NAME_PREFIX)
                return os.path.getsize(image_path)
            except os.error:
                self._logger.info("Image %s not found in DataStore %s" %
                                  (image_id, image_ds))

        self._logger.warning("Failed to get image size:", exc_info=True)
        # Failed to access shared image.
        raise NoSuchResourceException(
            ResourceType.IMAGE, "Image does not exist.")

    def _load_json(self, metadata_path):
        """Return the parsed JSON file, or {} if it is missing or invalid."""
        if os.path.exists(metadata_path):
            with open(metadata_path) as fh:
                try:
                    data = json.load(fh)
                    return data
                except ValueError:
                    self._logger.error(
                        "Error loading metadata file %s" % metadata_path,
                        exc_info=True)
        return {}

    def get_image_metadata(self, image_id, datastore):
        """Load the image's metadata file; {} when absent or unparsable."""
        metadata_path = os_metadata_path(datastore, image_id,
                                         IMAGE_FOLDER_NAME_PREFIX)
        self._logger.info("Loading metadata %s" % metadata_path)
        return self._load_json(metadata_path)

    def _get_datastore_type(self, datastore_id):
        """Return the type of the datastore with the given id.

        :raises IndexError: no known datastore has that id
        """
        datastores = self._ds_manager.get_datastores()
        return [ds.type for ds in datastores if ds.id == datastore_id][0]

    def _prepare_virtual_disk_spec(self, disk_type, adapter_type):
        """
        :param disk_type [vim.VirtualDiskManager.VirtualDiskType]:
        :param adapter_type [vim.VirtualDiskManager.VirtualDiskAdapterType]:
        """
        _vd_spec = vim.VirtualDiskManager.VirtualDiskSpec()
        _vd_spec.diskType = str(disk_type)
        _vd_spec.adapterType = str(adapter_type)

        return _vd_spec

    def _copy_to_tmp_image(self, source_datastore, source_id, dest_datastore,
                           dest_id):
        """ Copy an image into a temp location.
            1. Create the temp directory. Its name contains a random UUID
               to prevent collisions with concurrent copies. On VSAN it is
               placed under the destination image folder.
            2. Copy the metadata file over, if the source has one.
            3. Create the timestamp file.
            4. Copy the vmdk over as a thin disk.

            No lock is taken here.

            @return the tmp image directory on success.
        """
        ds_type = self._get_datastore_type(dest_datastore)
        if ds_type == DatastoreType.VSAN:
            tmp_image_dir = os_datastore_path(
                dest_datastore,
                compond_path_join(IMAGE_FOLDER_NAME_PREFIX, dest_id),
                compond_path_join(TMP_IMAGE_FOLDER_NAME_PREFIX,
                                  str(uuid.uuid4())))
        else:
            tmp_image_dir = os_datastore_path(
                dest_datastore,
                compond_path_join(TMP_IMAGE_FOLDER_NAME_PREFIX,
                                  str(uuid.uuid4())))

        # Create the temp directory
        self._vim_client.make_directory(tmp_image_dir)

        # Copy the metadata file if it exists.
        source_meta = os_metadata_path(source_datastore, source_id,
                                       IMAGE_FOLDER_NAME_PREFIX)
        if os.path.exists(source_meta):
            try:
                dest_meta = os.path.join(tmp_image_dir,
                                         metadata_filename(dest_id))
                shutil.copy(source_meta, dest_meta)
            except:
                # Log-and-reraise only; nothing is swallowed here.
                self._logger.exception("Failed to copy metadata file %s",
                                       source_meta)
                raise

        # Create the timestamp file
        self._create_image_timestamp_file(tmp_image_dir)

        _vd_spec = self._prepare_virtual_disk_spec(
            vim.VirtualDiskManager.VirtualDiskType.thin,
            vim.VirtualDiskManager.VirtualDiskAdapterType.lsiLogic)

        self._manage_disk(
            vim.VirtualDiskManager.CopyVirtualDisk_Task,
            sourceName=vmdk_path(source_datastore, source_id,
                                 IMAGE_FOLDER_NAME_PREFIX),
            destName=os_to_datastore_path(
                os.path.join(tmp_image_dir, "%s.vmdk" % dest_id)),
            destSpec=_vd_spec)
        return tmp_image_dir

    def _move_image(self, image_id, datastore, tmp_dir):
        """
        Move a tmp folder into the image datastore.

        Concurrent moves are serialized with a FileBackedLock on the image
        path (retry=300, wait_secs=0.01). While holding the lock the image
        is checked (and repaired if possible); if it already exists the
        move is abandoned with DiskAlreadyExistException.

        On any failure inside the locked section, including an existing
        image or a lock failure, the error is logged, tmp_dir is deleted
        through the vim client, and the exception is re-raised.

        image_id: String. The image id of the image being moved.
        datastore: String. The datastore id of the datastore.
        tmp_dir: String. The absolute path of the temp image directory.

        raises: ImageNotFoundException if tmp_dir does not exist
                DiskAlreadyExistException if the image is already present
                OsError if the move fails
                AcquireLockFailure, InvalidFile if we fail to lock the
                destination image.
        """
        ds_type = self._get_datastore_type(datastore)
        image_path = os_datastore_path(
            datastore, compond_path_join(IMAGE_FOLDER_NAME_PREFIX, image_id))
        self._logger.info("_move_image: %s => %s, ds_type: %s" %
                          (tmp_dir, image_path, ds_type))

        if not os.path.exists(tmp_dir):
            raise ImageNotFoundException("Temp image %s not found" % tmp_dir)

        try:
            # wait lock for 3 seconds
            with FileBackedLock(image_path, ds_type, retry=300,
                                wait_secs=0.01):
                if self._check_image_repair(image_id, datastore):
                    raise DiskAlreadyExistException("Image already exists")

                if ds_type == DatastoreType.VSAN:
                    # on VSAN, move all files under
                    # [datastore]/image_[image_id]/tmp_image_[uuid]/* to
                    # [datastore]/image_[image_id]/*.
                    # Also we do not delete tmp_image folder in success
                    # case, because VSAN accesses it when creating linked
                    # VM, even the folder is now empty.
                    for entry in os.listdir(tmp_dir):
                        shutil.move(os.path.join(tmp_dir, entry),
                                    os.path.join(image_path, entry))
                else:
                    # on VMFS/NFS/etc, rename [datastore]/tmp_image_[uuid]
                    # to [datastore]/tmp_image_[image_id]
                    self._vim_client.move_file(tmp_dir, image_path)
        except:
            # Cleanup-and-reraise: deliberately catches everything.
            self._logger.exception("Move image %s to %s failed" %
                                   (image_id, image_path))
            self._vim_client.delete_file(tmp_dir)
            raise

    def _check_image_repair(self, image_id, datastore):
        """Check that an image exists, recreating its timestamp file if needed.

        The following method should be used to check and validate the
        existence of a previously created image. With the new image delete
        path the "timestamp" file must exist inside the image directory.
        If the directory exists and the file does not, it may mean that an
        image delete operation was aborted mid-way. In this case the
        following method recreates the timestamp file. All operations are
        performed while holding the image directory lock (FileBackedLock);
        the caller is required to hold the lock.

        :return: True if the vmdk exists and the timestamp file exists or
                 was recreated; False otherwise
        """
        vmdk_pathname = os_vmdk_path(datastore, image_id,
                                     IMAGE_FOLDER_NAME_PREFIX)
        image_dirname = os.path.dirname(vmdk_pathname)
        try:
            # Check vmdk file
            if not os.path.exists(vmdk_pathname):
                self._logger.info("Vmdk path doesn't exists: %s" %
                                  vmdk_pathname)
                return False
        except Exception as ex:
            self._logger.exception(
                "Exception validating %s, %s" % (image_dirname, ex))
            return False

        # Check timestamp file
        timestamp_pathname = \
            os.path.join(image_dirname, self.IMAGE_TIMESTAMP_FILE_NAME)
        try:
            if os.path.exists(timestamp_pathname):
                self._logger.info("Timestamp file exists: %s" %
                                  timestamp_pathname)
                return True
        except Exception as ex:
            self._logger.exception(
                "Exception validating %s, %s" % (timestamp_pathname, ex))

        # The timestamp file is not accessible,
        # try creating one, if successful try to
        # delete the renamed timestamp file if it
        # exists
        try:
            self._create_image_timestamp_file(image_dirname)
            self._delete_renamed_image_timestamp_file(image_dirname)
        except Exception as ex:
            self._logger.exception(
                "Exception creating %s, %s" % (timestamp_pathname, ex))
            return False

        self._logger.info("Image repaired: %s" % image_dirname)
        return True

    def copy_image(self, source_datastore, source_id, dest_datastore, dest_id):
        """Copy an image between datastores.

        This method is used to create a "full clone" of a vmdk.
        It does so by copying a disk to a unique directory in a well known
        temporary directory then moving the disk to the destination image
        location. Data in the temporary directory not properly cleaned up
        will be periodically garbage collected by the reaper thread.

        This minimizes the window during which the vmdk path exists with
        incomplete content. It also works around a hostd issue where
        cp -f does not work.

        If the destination image already exists and is valid, nothing is
        copied and DiskAlreadyExistException is raised.

        source_datastore: id of the source datastore
        source_id: id of the image to copy from
        dest_datastore: id of the destination datastore
        dest_id: id of the new image in the destination datastore

        throws: DiskAlreadyExistException if the destination image exists
        throws: AcquireLockFailure if timed out waiting to acquire lock on
                the destination image
        throws: InvalidFile if unable to lock the destination image or
                some other reasons
        """
        if self.check_and_validate_image(dest_id, dest_datastore):
            # The image is copied, presumably via some other concurrent
            # copy, so we move on.
            self._logger.info("Image %s already copied" % dest_id)
            raise DiskAlreadyExistException("Image already exists")

        # Copy image to the tmp directory.
        tmp_dir = self._copy_to_tmp_image(source_datastore, source_id,
                                          dest_datastore, dest_id)

        self._move_image(dest_id, dest_datastore, tmp_dir)

    def reap_tmp_images(self):
        """ Clean up unused directories in the temp image folder. """
        for ds in self._ds_manager.get_datastores():
            tmp_image_pattern = os_datastore_path_pattern(
                ds.id, TMP_IMAGE_FOLDER_NAME_PREFIX)
            for image_dir in glob.glob(tmp_image_pattern):
                if not os.path.isdir(image_dir):
                    continue

                try:
                    create_time = os.stat(image_dir).st_ctime
                except OSError:
                    # The folder vanished (or became unreadable) after the
                    # isdir() check; skip it rather than abort the sweep.
                    self._logger.info("Unable to stat %s" % image_dir,
                                      exc_info=True)
                    continue
                current_time = time.time()
                if current_time - self.REAP_TMP_IMAGES_GRACE_PERIOD < \
                        create_time:
                    # Skip folders that are newly created in past x minutes
                    # For example, during host-to-host transfer, hostd on
                    # receiving end stores the uploaded file in temp images
                    # folder but does not lock it with FileBackedLock, so we
                    # need to allow a grace period before reaping it.
                    self._logger.info(
                        "Skip folder: %s, created: %s, now: %s" %
                        (image_dir, create_time, current_time))
                    continue

                try:
                    with FileBackedLock(image_dir, ds.type):
                        if os.path.exists(image_dir):
                            self._logger.info("Delete folder %s" % image_dir)
                            shutil.rmtree(image_dir, ignore_errors=True)
                except (AcquireLockFailure, InvalidFile):
                    self._logger.info("Already locked: %s, skipping" %
                                      image_dir)
                except Exception:
                    self._logger.info("Unable to remove %s" % image_dir,
                                      exc_info=True)

    def get_images(self, datastore):
        """ Get image list from datastore
        :param datastore: datastore id
        :return: list of string, image id list
        :raises DatastoreNotFoundException: datastore root does not exist
        """
        image_ids = []

        if not os.path.exists(os_datastore_root(datastore)):
            raise DatastoreNotFoundException()

        # image_folder is /vmfs/volumes/${datastore}/images_*
        image_folder_pattern = os_datastore_path_pattern(
            datastore, IMAGE_FOLDER_NAME_PREFIX)
        for image_dir in glob.glob(image_folder_pattern):
            image_id = image_dir.split(COMPOND_PATH_SEPARATOR)[1]
            if self.check_image(image_id, datastore):
                image_ids.append(image_id)

        return image_ids

    def _unzip(self, src, dst):
        """Decompress the gzip file `src` into `dst`."""
        self._logger.info("unzip %s -> %s" % (src, dst))

        fsrc = gzip.open(src, "rb")
        try:
            # Nested so the source is closed even if opening dst fails.
            fdst = open(dst, "wb")
            try:
                shutil.copyfileobj(fsrc, fdst)
            finally:
                fdst.close()
        finally:
            fsrc.close()

    def _copy_disk(self, src, dst):
        """Copy a virtual disk with CopyVirtualDisk_Task."""
        self._manage_disk(vim.VirtualDiskManager.CopyVirtualDisk_Task,
                          sourceName=src, destName=dst)

    def _manage_disk(self, op, **kwargs):
        """Invoke a virtual disk manager operation and wait for its task.

        :raises DiskAlreadyExistException: on vim.Fault.FileAlreadyExists
        :raises DiskFileException: on vim.Fault.FileFault
        """
        try:
            self._logger.debug("Invoking %s(%s)" % (op.info.name, kwargs))
            task = op(self._manager, **kwargs)
            self._vim_client.wait_for_task(task)
        except vim.Fault.FileAlreadyExists as e:
            raise DiskAlreadyExistException(e.msg)
        except vim.Fault.FileFault as e:
            raise DiskFileException(e.msg)
class EsxImageManager(ImageManager): NUM_MAKEDIRS_ATTEMPTS = 10 DEFAULT_TMP_IMAGES_CLEANUP_INTERVAL = 600.0 IMAGE_TOMBSTONE_FILE_NAME = "image_tombstone.txt" IMAGE_MARKER_FILE_NAME = "unused_image_marker.txt" IMAGE_TIMESTAMP_FILE_NAME = "image_timestamp.txt" IMAGE_TIMESTAMP_FILE_RENAME_SUFFIX = ".renamed" def __init__(self, vim_client, ds_manager): super(EsxImageManager, self).__init__() self._logger = logging.getLogger(__name__) self._vim_client = vim_client self._ds_manager = ds_manager self._image_reaper = None self._uwsim_nas_exist = None agent_config = services.get(ServiceName.AGENT_CONFIG) self._in_uwsim = agent_config.in_uwsim def monitor_for_cleanup(self, reap_interval=DEFAULT_TMP_IMAGES_CLEANUP_INTERVAL): self._image_reaper = Periodic(self.reap_tmp_images, reap_interval) self._image_reaper.daemon = True self._image_reaper.start() def cleanup(self): if self._image_reaper is not None: self._image_reaper.stop() @log_duration def check_image(self, image_id, datastore): image_dir = os_vmdk_path(datastore, image_id, IMAGE_FOLDER_NAME) try: return os.path.exists(image_dir) except: self._logger.exception( "Error looking up %s" % image_dir) return False """ The following method is intended as a replacement of check_image in the vm creation workflow compatible with the new image sweeper. For an image to be valid both the directory and the image timestamp file must exists on the datastore. 
""" def check_and_validate_image(self, image_id, ds_id): image_dir = os.path.dirname( os_vmdk_path(ds_id, image_id, IMAGE_FOLDER_NAME)) try: if not os.path.exists(image_dir): return False except: self._logger.exception( "Error looking up %s" % image_dir) return False # Check the existence of the timestamp file timestamp_pathname = \ os.path.join(image_dir, self.IMAGE_TIMESTAMP_FILE_NAME) try: if os.path.exists(timestamp_pathname): return True except Exception as ex: self._logger.exception( "Exception looking up %s, %s" % (timestamp_pathname, ex)) return False return False """ This method is used to update the mod time on the image timestamp file. It also checks for the existence of a tombstone file for this image. If the tombstone file exists it throws an exception. """ def touch_image_timestamp(self, ds_id, image_id): """ :param ds_id: :param image_id: :return: """ image_path = os.path.dirname( os_vmdk_path(ds_id, image_id, IMAGE_FOLDER_NAME)) # Check the existence of the timestamp file tombstone_pathname = \ os.path.join(image_path, self.IMAGE_TOMBSTONE_FILE_NAME) try: tombstone = os.path.exists(tombstone_pathname) except Exception as ex: self._logger.exception( "Exception looking up %s, %s" % (tombstone_pathname, ex)) if tombstone: raise InvalidImageState # Touch the timestamp file timestamp_pathname = \ os.path.join(image_path, self.IMAGE_TIMESTAMP_FILE_NAME) try: os.utime(timestamp_pathname, None) except Exception as ex: self._logger.exception( "Exception looking up %s, %s" % (timestamp_pathname, ex)) raise ex """ This method is used to create a tombstone marker in the new image management work flow. The tombstone marker is a file under the image directory. 
""" def create_image_tombstone(self, ds_id, image_id): """ :param ds_id: :param image_id: :return: """ image_path = os.path.dirname( os_vmdk_path(ds_id, image_id, IMAGE_FOLDER_NAME)) # Create tombstone file for the image tombstone_pathname = \ os.path.join(image_path, self.IMAGE_TOMBSTONE_FILE_NAME) try: open(tombstone_pathname, 'w').close() except Exception as ex: self._logger.exception( "Exception creating %s, %s" % (tombstone_pathname, ex)) raise ex self._logger.info("Image: %s tombstoned" % tombstone_pathname) @log_duration def check_image_dir(self, image_id, datastore): image_path = os_vmdk_path(datastore, image_id, IMAGE_FOLDER_NAME) try: return os.path.exists(os.path.dirname(image_path)) except: self._logger.error( "Error looking up %s" % image_path, exc_info=True) return False def get_image_directory_path(self, datastore_id, image_id): return image_directory_path(datastore_id, image_id) def get_image_path(self, datastore_id, image_id): return os_vmdk_path(datastore_id, image_id, IMAGE_FOLDER_NAME) def image_size(self, image_id): # TODO(mmutsuzaki) We should iterate over all the image datastores # until we find one that has the image. image_ds = list(self._ds_manager.image_datastores())[0] image_path = os_vmdk_flat_path(image_ds, image_id, IMAGE_FOLDER_NAME) return os.path.getsize(image_path) def _load_json(self, metadata_path): if os.path.exists(metadata_path): with open(metadata_path) as fh: try: data = json.load(fh) return data except ValueError: self._logger.error( "Error loading metadata file %s" % metadata_path, exc_info=True) return {} def get_image_metadata(self, image_id, datastore): metadata_path = os_metadata_path(datastore, image_id, IMAGE_FOLDER_NAME) self._logger.info("Loading metadata %s" % metadata_path) return self._load_json(metadata_path) def get_image_manifest(self, image_id): # This is a shortcut for ttylinux. ttylinux doesn't have manifest file. 
if image_id == "ttylinux": return ImageType.CLOUD, ImageReplication.EAGER # TODO(mmutsuzaki) We should iterate over all the image datastores # until we find one that has the image. image_ds = list(self._ds_manager.image_datastores())[0] manifest_path = os_image_manifest_path(image_ds, image_id) if not os.path.isfile(manifest_path): self._logger.info("Manifest file %s not found" % manifest_path) return None, None self._logger.info("Loading manifest %s" % manifest_path) data = self._load_json(manifest_path) type = ImageType._NAMES_TO_VALUES[data["imageType"]] replication = ImageReplication._NAMES_TO_VALUES[ data["imageReplication"]] return type, replication def _get_datastore_type(self, datastore_id): datastores = self._ds_manager.get_datastores() return [ds.type for ds in datastores if ds.id == datastore_id][0] def _prepare_virtual_disk_spec(self, disk_type, adapter_type): """ :param disk_type [vim.VirtualDiskManager.VirtualDiskType]: :param adapter_type [vim.VirtualDiskManager.VirtualDiskAdapterType]: """ _vd_spec = vim.VirtualDiskManager.VirtualDiskSpec() _vd_spec.diskType = str(disk_type) _vd_spec.adapterType = str(adapter_type) return _vd_spec def _create_tmp_image(self, source_datastore, source_id, dest_datastore, dest_id): """ Copy an image into a temp location. 1. Lock a tmp image destination file with an exclusive lock. This is to prevent the GC thread from garbage collecting directories that are actively being used. The temp directory name contains a random UUID to prevent collisions with concurrent copies 2. Create the temp directory. 3. Copy the metadata file over. 4. Copy the vmdk over. @return the tmp image directory on success. 
""" source = vmdk_path(source_datastore, source_id, IMAGE_FOLDER_NAME) temp_dest = tmp_image_path(dest_datastore, dest_id) ds_type = self._get_datastore_type(dest_datastore) tmp_image_dir_path = os.path.dirname(datastore_to_os_path(temp_dest)) # Try grabbing the lock on the temp directory if it fails # (very unlikely) someone else is copying an image just retry # later. with FileBackedLock(tmp_image_dir_path, ds_type): source_meta = os_metadata_path(source_datastore, source_id, IMAGE_FOLDER_NAME) # Create the temp directory mkdir_p(tmp_image_dir_path) # Copy the metadata file if it exists. if os.path.exists(source_meta): try: shutil.copy(source_meta, tmp_image_dir_path) except: self._logger.exception("Failed to copy metadata file %s", source_meta) raise # Create the timestamp file self._create_image_timestamp_file(tmp_image_dir_path) _vd_spec = self._prepare_virtual_disk_spec( vim.VirtualDiskManager.VirtualDiskType.thin, vim.VirtualDiskManager.VirtualDiskAdapterType.lsiLogic) self._manage_disk(vim.VirtualDiskManager.CopyVirtualDisk_Task, sourceName=source, destName=temp_dest, destSpec=_vd_spec) return tmp_image_dir_path def _move_image(self, image_id, datastore, tmp_dir): """ Atomic move of a tmp folder into the image datastore. Handles concurrent moves by locking a well know derivative of the image_id while doing the atomic move. The exclusive file lock ensures that only one move is successful. Has the following side effects: a - If the destination image already exists, it is assumed that someone else successfully copied the image over and the temp directory is deleted. b - If we fail to acquire the file lock after retrying 3 times, or the atomic move fails, the tmp image directory will be left behind and needs to be garbage collected later. image_id: String.The image id of the image being moved. datastore: String. The datastore id of the datastore. tmp_dir: String. The absolute path of the temp image directory. 
raises: OsError if the move fails AcquireLockFailure, InvalidFile if we fail to lock the destination image. """ ds_type = self._get_datastore_type(datastore) image_path = os.path.dirname(os_vmdk_path(datastore, image_id, IMAGE_FOLDER_NAME)) parent_path = os.path.dirname(image_path) # Create the parent image directory if it doesn't exist. try: mkdir_p(parent_path) except OSError as e: if e.errno == errno.EEXIST and os.path.isdir(parent_path): # Parent directory exists nothing to do. pass else: raise try: with FileBackedLock(image_path, ds_type, retry=300, wait_secs=0.01): # wait lock for 3 seconds if self._check_image_repair(image_id, datastore): raise DiskAlreadyExistException("Image already exists") shutil.move(tmp_dir, image_path) except (AcquireLockFailure, InvalidFile): self._logger.info("Unable to lock %s for atomic move" % image_id) raise except DiskAlreadyExistException: self._logger.info("Image %s already copied" % image_id) rm_rf(tmp_dir) raise """ The following method should be used to check and validate the existence of a previously created image. With the new image delete path the "timestamp" file must exists inside the image directory. If the directory exists and the file does not, it may mean that an image delete operation was aborted mid-way. In this case the following method recreate the timestamp file. All operations are performed while holding the image directory lock (FileBackedLock), the caller is required to hold the lock. 
""" def _check_image_repair(self, image_id, datastore): vmdk_pathname = os_vmdk_path(datastore, image_id, IMAGE_FOLDER_NAME) image_dirname = os.path.dirname(vmdk_pathname) try: # Check vmdk file if not os.path.exists(vmdk_pathname): self._logger.info("Vmdk path doesn't exists: %s" % vmdk_pathname) return False except Exception as ex: self._logger.exception( "Exception validating %s, %s" % (image_dirname, ex)) return False # Check timestamp file timestamp_pathname = \ os.path.join(image_dirname, self.IMAGE_TIMESTAMP_FILE_NAME) try: if os.path.exists(timestamp_pathname): self._logger.info("Timestamp file exists: %s" % timestamp_pathname) return True except Exception as ex: self._logger.exception( "Exception validating %s, %s" % (timestamp_pathname, ex)) # The timestamp file is not accessible, # try creating one, if successful try to # delete the renamed timestamp file if it # exists try: self._create_image_timestamp_file(image_dirname) self._delete_renamed_image_timestamp_file(image_dirname) except Exception as ex: self._logger.exception( "Exception creating %s, %s" % (timestamp_pathname, ex)) return False self._logger.info("Image repaired: %s" % image_dirname) return True def copy_image(self, source_datastore, source_id, dest_datastore, dest_id): """Copy an image between datastores. This method is used to create a "full clone" of a vmdk. It does so by copying a disk to a unique directory in a well known temporary directory then moving the disk to the destination image location. Data in the temporary directory not properly cleaned up will be periodically garbage collected by the reaper thread. This minimizes the window during which the vmdk path exists with incomplete content. It also works around a hostd issue where cp -f does not work. The current behavior for when the destination disk exists is to overwrite said disk. 
source_datastore: id of the source datastore source_id: id of the image to copy from dest_datastore: id of the destination datastore dest_id: id of the new image in the destination datastore throws: AcquireLockFailure if timed out waiting to acquire lock on tmp image directory throws: InvalidFile if unable to lock tmp image directory or some other reasons """ if self.check_and_validate_image(dest_id, dest_datastore): # The image is copied, presumably via some other concurrent # copy, so we move on. self._logger.info("Image %s already copied" % dest_id) raise DiskAlreadyExistException("Image already exists") # Copy image to the tmp directory. tmp_dir = self._create_tmp_image(source_datastore, source_id, dest_datastore, dest_id) self._move_image(dest_id, dest_datastore, tmp_dir) def reap_tmp_images(self): """ Clean up unused directories in the temp image folder. """ for ds in self._ds_manager.get_datastores(): images_dir = tmp_image_folder_os_path(ds.id) for f in os.listdir(images_dir): path = os.path.join(images_dir, f) if not os.path.isdir(path): continue try: with FileBackedLock(path, ds.type): if (os.path.exists(path)): self._logger.info("Delete folder %s" % path) shutil.rmtree(path, ignore_errors=True) except (AcquireLockFailure, InvalidFile): self._logger.info("Already locked: %s, skipping" % path) except: self._logger.info("Unable to remove %s" % path, exc_info=True) def delete_image(self, datastore_id, image_id, ds_type, force): # Check if the image currently exists if not self.check_image_dir(image_id, datastore_id): self._logger.info("Image %s on datastore %s not found" % (image_id, datastore_id)) raise ImageNotFoundException("Image %s not found" % image_id) # Mark image as tombstoned self.create_image_tombstone(datastore_id, image_id) if not force: return # If force try to actively garbage collect the image here if self._lock_data_disk(datastore_id, image_id): self._gc_image_dir(datastore_id, image_id) else: raise ImageInUse("Image %s is currently in use" 
% image_id) # Now attempt GCing the image directory. try: self._clean_gc_dir(datastore_id) except Exception: # Swallow the exception the next clean call will clear it all. self._logger.exception("Failed to delete gc dir on datastore %s" % datastore_id) def _lock_data_disk(self, datastore_id, image_id): """ Lock the data disks associated with the VMs in the provided ref file. Return True if locking was successful false otherwise. """ data_disk = os_vmdk_flat_path(datastore_id, image_id) try: # Its ok to delete the data disk as a subsequent power on will # fail if the data disk is not there. os.remove(data_disk) except OSError: # Remove failed so disk is locked. self._logger.debug("Disk %s on datastore %s is already locked" % (data_disk, datastore_id)) return False return True def get_images(self, datastore): """ Get image list from datastore :param datastore: datastore id :return: list of string, image id list """ image_ids = [] # image_folder is /vmfs/volumes/${datastore}/images image_folder = os_datastore_path(datastore, IMAGE_FOLDER_NAME) if not os.path.exists(image_folder): raise DatastoreNotFoundException() # prefix is the 2-digit prefix of image id for prefix in os.listdir(image_folder): # outer path is something like # /vmfs/volumes/${datastore}/images/${image_id}[0:2] outer_path = os.path.join(image_folder, prefix) if not os.path.isdir(outer_path): continue for image_id in os.listdir(outer_path): if self.check_image(image_id, datastore): image_ids.append(image_id) return image_ids def mark_unused(self, image_scanner): images_dir_path = os_datastore_path(image_scanner.datastore_id, IMAGE_FOLDER_NAME) # Log messages with prefix: "IMAGE SCANNER" are for debugging # and will be removed after basic testing self._logger.info("IMAGE SCANNER: images_dir: %s" % images_dir_path) if not os.path.isdir(images_dir_path): self._logger.info("images_dir_path: images_dir: %s, doesn't exist" % images_dir_path) raise DatastoreNotFoundException( "Image scanner, cannot find image 
" "directory for datastore: %s" % image_scanner.datastore_id) return self._mark_unused_images(image_scanner, images_dir_path) def delete_unused(self, image_sweeper): images_dir_path = os_datastore_path(image_sweeper.datastore_id, IMAGE_FOLDER_NAME) # Log messages with prefix: "IMAGE SWEEPER" are for debugging # and will be removed after basic testing self._logger.info("IMAGE SWEEPER: images_dir: %s" % images_dir_path) if not os.path.isdir(images_dir_path): self._logger.info("images_dir_path: images_dir: %s, doesn't exist" % images_dir_path) raise DatastoreNotFoundException( "Image sweeper, cannot find image " "directory for datastore: %s" % image_sweeper.datastore_id) return self._delete_unused_images(image_sweeper, images_dir_path) def _unzip(self, src, dst): self._logger.info("unzip %s -> %s" % (src, dst)) fsrc = gzip.open(src, "rb") fdst = open(dst, "wb") try: shutil.copyfileobj(fsrc, fdst) finally: fsrc.close() fdst.close() def _copy_disk(self, src, dst): self._manage_disk(vim.VirtualDiskManager.CopyVirtualDisk_Task, sourceName=src, destName=dst) def _manage_disk(self, op, **kwargs): if self._in_uwsim: self._manage_disk_uwsim(op, **kwargs) return try: self._logger.debug("Invoking %s(%s)" % (op.info.name, kwargs)) task = op(self._manager, **kwargs) self._vim_client.wait_for_task(task) except vim.Fault.FileAlreadyExists, e: raise DiskAlreadyExistException(e.msg) except vim.Fault.FileFault, e: raise DiskFileException(e.msg)
class EsxImageManager(ImageManager):
    """Image manager that works on datastore paths of an ESX host.

    Images live in a per-image directory under IMAGE_FOLDER_NAME on a
    datastore.  Besides the vmdk itself the directory may contain marker
    files whose names are given by the IMAGE_*_FILE_NAME constants below.
    Disk copies are delegated to vim.VirtualDiskManager tasks.
    """

    NUM_MAKEDIRS_ATTEMPTS = 10
    DEFAULT_TMP_IMAGES_CLEANUP_INTERVAL = 600.0
    IMAGE_TOMBSTONE_FILE_NAME = "image_tombstone.txt"
    IMAGE_MARKER_FILE_NAME = "unused_image_marker.txt"
    IMAGE_TIMESTAMP_FILE_NAME = "image_timestamp.txt"
    IMAGE_TIMESTAMP_FILE_RENAME_SUFFIX = ".renamed"

    def __init__(self, vim_client, ds_manager):
        """
        :param vim_client: client used to wait for vim tasks
        :param ds_manager: datastore manager used to enumerate datastores
        """
        super(EsxImageManager, self).__init__()
        self._logger = logging.getLogger(__name__)
        self._vim_client = vim_client
        self._ds_manager = ds_manager
        self._image_reaper = None
        self._uwsim_nas_exist = None
        agent_config = services.get(ServiceName.AGENT_CONFIG)
        self._in_uwsim = agent_config.in_uwsim

    def monitor_for_cleanup(self,
                            reap_interval=DEFAULT_TMP_IMAGES_CLEANUP_INTERVAL):
        """Start a daemon Periodic that calls reap_tmp_images.

        :param reap_interval: interval handed to Periodic
        """
        self._image_reaper = Periodic(self.reap_tmp_images, reap_interval)
        self._image_reaper.daemon = True
        self._image_reaper.start()

    def cleanup(self):
        """Stop the reaper started by monitor_for_cleanup, if any."""
        if self._image_reaper is not None:
            self._image_reaper.stop()

    @log_duration
    def check_image(self, image_id, datastore):
        """Return True if the image's vmdk path exists on the datastore.

        Any error during the lookup is logged and reported as False.
        """
        image_dir = os_vmdk_path(datastore, image_id, IMAGE_FOLDER_NAME)
        try:
            return os.path.exists(image_dir)
        except Exception:
            self._logger.exception("Error looking up %s" % image_dir)
            return False

    def check_and_validate_image(self, image_id, ds_id):
        """Return True if the image directory and its timestamp file exist.

        This is intended as a replacement of check_image in the vm creation
        workflow, compatible with the new image sweeper.  For an image to
        be valid both the directory and the image timestamp file must
        exist on the datastore.  Lookup errors are logged and reported as
        False.
        """
        image_dir = os.path.dirname(
            os_vmdk_path(ds_id, image_id, IMAGE_FOLDER_NAME))
        try:
            if not os.path.exists(image_dir):
                return False
        except Exception:
            self._logger.exception("Error looking up %s" % image_dir)
            return False

        # Check the existence of the timestamp file
        timestamp_pathname = \
            os.path.join(image_dir, self.IMAGE_TIMESTAMP_FILE_NAME)
        try:
            if os.path.exists(timestamp_pathname):
                return True
        except Exception as ex:
            self._logger.exception("Exception looking up %s, %s" %
                                   (timestamp_pathname, ex))
            return False
        return False

    def touch_image_timestamp(self, ds_id, image_id):
        """Update the modification time of the image timestamp file.

        The tombstone file is checked first; if it exists the image is
        being deleted and InvalidImageState is raised.

        :param ds_id: datastore id
        :param image_id: image id
        :raises InvalidImageState: the image has a tombstone file
        :raises Exception: whatever os.utime raises (it is logged and
            re-raised)
        """
        image_path = os.path.dirname(
            os_vmdk_path(ds_id, image_id, IMAGE_FOLDER_NAME))

        # Check the existence of the tombstone file.  Default to "not
        # tombstoned" so that a failed lookup, which is only logged, does
        # not turn into an UnboundLocalError below.
        tombstone_pathname = \
            os.path.join(image_path, self.IMAGE_TOMBSTONE_FILE_NAME)
        tombstone = False
        try:
            tombstone = os.path.exists(tombstone_pathname)
        except Exception as ex:
            self._logger.exception("Exception looking up %s, %s" %
                                   (tombstone_pathname, ex))

        if tombstone:
            raise InvalidImageState

        # Touch the timestamp file
        timestamp_pathname = \
            os.path.join(image_path, self.IMAGE_TIMESTAMP_FILE_NAME)
        try:
            os.utime(timestamp_pathname, None)
        except Exception as ex:
            self._logger.exception("Exception looking up %s, %s" %
                                   (timestamp_pathname, ex))
            # Bare raise keeps the original traceback.
            raise

    def create_image_tombstone(self, ds_id, image_id):
        """Create the tombstone marker file inside the image directory.

        The tombstone marker is used by the new image management work flow.

        :param ds_id: datastore id
        :param image_id: image id
        :raises Exception: whatever creating the file raises (it is logged
            and re-raised)
        """
        image_path = os.path.dirname(
            os_vmdk_path(ds_id, image_id, IMAGE_FOLDER_NAME))

        # Create tombstone file for the image
        tombstone_pathname = \
            os.path.join(image_path, self.IMAGE_TOMBSTONE_FILE_NAME)
        try:
            open(tombstone_pathname, 'w').close()
        except Exception as ex:
            self._logger.exception("Exception creating %s, %s" %
                                   (tombstone_pathname, ex))
            # Bare raise keeps the original traceback.
            raise

        self._logger.info("Image: %s tombstoned" % tombstone_pathname)

    @log_duration
    def check_image_dir(self, image_id, datastore):
        """Return True if the directory holding the image's vmdk exists.

        Any error during the lookup is logged and reported as False.
        """
        image_path = os_vmdk_path(datastore, image_id, IMAGE_FOLDER_NAME)
        try:
            return os.path.exists(os.path.dirname(image_path))
        except Exception:
            self._logger.error("Error looking up %s" % image_path,
                               exc_info=True)
            return False

    def get_image_directory_path(self, datastore_id, image_id):
        """Return image_directory_path() for the given datastore/image."""
        return image_directory_path(datastore_id, image_id)

    def get_image_path(self, datastore_id, image_id):
        """Return the os path of the image's vmdk."""
        return os_vmdk_path(datastore_id, image_id, IMAGE_FOLDER_NAME)

    def image_size(self, image_id):
        """Return the size in bytes of the image's flat vmdk file.

        Only the first image datastore is consulted.
        """
        # TODO(mmutsuzaki) We should iterate over all the image datastores
        # until we find one that has the image.
        image_ds = list(self._ds_manager.image_datastores())[0]
        image_path = os_vmdk_flat_path(image_ds, image_id, IMAGE_FOLDER_NAME)

        return os.path.getsize(image_path)

    def _load_json(self, metadata_path):
        """Load a JSON file.

        :return: the decoded data, or {} when the file does not exist or
            does not contain valid JSON (the latter is logged)
        """
        if os.path.exists(metadata_path):
            with open(metadata_path) as fh:
                try:
                    return json.load(fh)
                except ValueError:
                    self._logger.error(
                        "Error loading metadata file %s" % metadata_path,
                        exc_info=True)
        return {}

    def get_image_metadata(self, image_id, datastore):
        """Return the decoded metadata file of an image ({} if missing)."""
        metadata_path = os_metadata_path(datastore,
                                         image_id,
                                         IMAGE_FOLDER_NAME)
        self._logger.info("Loading metadata %s" % metadata_path)
        return self._load_json(metadata_path)

    def get_image_manifest(self, image_id):
        """Return (image type, image replication) read from the manifest.

        Only the first image datastore is consulted.

        :return: (None, None) when the manifest file does not exist
        :raises KeyError: the manifest lacks "imageType" or
            "imageReplication", or names an unknown value
        """
        # This is a shortcut for ttylinux. ttylinux doesn't have manifest
        # file.
        if image_id == "ttylinux":
            return ImageType.CLOUD, ImageReplication.EAGER

        # TODO(mmutsuzaki) We should iterate over all the image datastores
        # until we find one that has the image.
        image_ds = list(self._ds_manager.image_datastores())[0]
        manifest_path = os_image_manifest_path(image_ds, image_id)
        if not os.path.isfile(manifest_path):
            self._logger.info("Manifest file %s not found" % manifest_path)
            return None, None

        self._logger.info("Loading manifest %s" % manifest_path)
        data = self._load_json(manifest_path)
        image_type = ImageType._NAMES_TO_VALUES[data["imageType"]]
        replication = ImageReplication._NAMES_TO_VALUES[
            data["imageReplication"]]

        return image_type, replication

    def _get_datastore_type(self, datastore_id):
        """Return the type of the datastore with the given id.

        :raises IndexError: no datastore with that id is known
        """
        datastores = self._ds_manager.get_datastores()
        return [ds.type for ds in datastores if ds.id == datastore_id][0]

    def _prepare_virtual_disk_spec(self, disk_type, adapter_type):
        """Build a VirtualDiskSpec for the given disk and adapter type.

        :param disk_type [vim.VirtualDiskManager.VirtualDiskType]:
        :param adapter_type [vim.VirtualDiskManager.VirtualDiskAdapterType]:
        """
        _vd_spec = vim.VirtualDiskManager.VirtualDiskSpec()
        _vd_spec.diskType = str(disk_type)
        _vd_spec.adapterType = str(adapter_type)

        return _vd_spec

    def _create_tmp_image(self, source_datastore, source_id, dest_datastore,
                          dest_id):
        """ Copy an image into a temp location.
            1. Lock a tmp image destination file with an exclusive lock.
            This is to prevent the GC thread from garbage collecting
            directories that are actively being used.
            The temp directory name contains a random UUID to prevent
            collisions with concurrent copies
            2. Create the temp directory.
            3. Copy the metadata file over.
            4. Create the timestamp file.
            5. Copy the vmdk over.

            @return the tmp image directory on success.
        """
        source = vmdk_path(source_datastore, source_id, IMAGE_FOLDER_NAME)
        temp_dest = tmp_image_path(dest_datastore, dest_id)
        ds_type = self._get_datastore_type(dest_datastore)
        tmp_image_dir_path = os.path.dirname(datastore_to_os_path(temp_dest))
        # Try grabbing the lock on the temp directory if it fails
        # (very unlikely) someone else is copying an image just retry
        # later.
        with FileBackedLock(tmp_image_dir_path, ds_type):
            source_meta = os_metadata_path(source_datastore, source_id,
                                           IMAGE_FOLDER_NAME)
            # Create the temp directory
            mkdir_p(tmp_image_dir_path)

            # Copy the metadata file if it exists.
            if os.path.exists(source_meta):
                try:
                    shutil.copy(source_meta, tmp_image_dir_path)
                except Exception:
                    self._logger.exception("Failed to copy metadata file %s",
                                           source_meta)
                    raise

            # Create the timestamp file
            self._create_image_timestamp_file(tmp_image_dir_path)

            _vd_spec = self._prepare_virtual_disk_spec(
                vim.VirtualDiskManager.VirtualDiskType.thin,
                vim.VirtualDiskManager.VirtualDiskAdapterType.lsiLogic)

            self._manage_disk(vim.VirtualDiskManager.CopyVirtualDisk_Task,
                              sourceName=source, destName=temp_dest,
                              destSpec=_vd_spec)
        return tmp_image_dir_path

    def _move_image(self, image_id, datastore, tmp_dir):
        """
        Atomic move of a tmp folder into the image datastore. Handles
        concurrent moves by locking a well know derivative of the image_id
        while doing the atomic move.
        The exclusive file lock ensures that only one move is successful.
        Has the following side effects:
            a - If the destination image already exists, it is assumed that
            someone else successfully copied the image over and the temp
            directory is deleted.
            b - If we fail to acquire the file lock (300 attempts, 0.01s
            apart, i.e. about 3 seconds), or the atomic move fails, the tmp
            image directory will be left behind and needs to be garbage
            collected later.

        image_id: String.The image id of the image being moved.
        datastore: String. The datastore id of the datastore.
        tmp_dir: String. The absolute path of the temp image directory.

        raises: OsError if the move fails
                AcquireLockFailure, InvalidFile if we fail to lock the
                destination image.
                DiskAlreadyExistException if the destination image already
                exists.
        """
        ds_type = self._get_datastore_type(datastore)
        image_path = os.path.dirname(
            os_vmdk_path(datastore, image_id, IMAGE_FOLDER_NAME))
        parent_path = os.path.dirname(image_path)
        # Create the parent image directory if it doesn't exist.
        try:
            mkdir_p(parent_path)
        except OSError as e:
            # An already existing parent directory is fine; anything else
            # is a real failure.
            if not (e.errno == errno.EEXIST and os.path.isdir(parent_path)):
                raise
        try:
            # wait lock for 3 seconds
            with FileBackedLock(image_path, ds_type, retry=300,
                                wait_secs=0.01):
                if self._check_image_repair(image_id, datastore):
                    raise DiskAlreadyExistException("Image already exists")

                shutil.move(tmp_dir, image_path)
        except (AcquireLockFailure, InvalidFile):
            self._logger.info("Unable to lock %s for atomic move" % image_id)
            raise
        except DiskAlreadyExistException:
            self._logger.info("Image %s already copied" % image_id)
            rm_rf(tmp_dir)
            raise

    def _check_image_repair(self, image_id, datastore):
        """Check a previously created image and repair its timestamp file.

        With the new image delete path the "timestamp" file must exist
        inside the image directory. If the vmdk exists and the timestamp
        file does not, it may mean that an image delete operation was
        aborted mid-way. In this case the timestamp file is recreated.
        All operations are expected to be performed while holding the
        image directory lock (FileBackedLock); the caller is required to
        hold the lock.

        :return: True if the image exists (possibly after repair), False
            if the vmdk is missing or the repair failed
        """
        vmdk_pathname = os_vmdk_path(datastore, image_id, IMAGE_FOLDER_NAME)
        image_dirname = os.path.dirname(vmdk_pathname)
        try:
            # Check vmdk file
            if not os.path.exists(vmdk_pathname):
                self._logger.info("Vmdk path doesn't exists: %s" %
                                  vmdk_pathname)
                return False
        except Exception as ex:
            self._logger.exception("Exception validating %s, %s" %
                                   (image_dirname, ex))
            return False

        # Check timestamp file
        timestamp_pathname = \
            os.path.join(image_dirname, self.IMAGE_TIMESTAMP_FILE_NAME)
        try:
            if os.path.exists(timestamp_pathname):
                self._logger.info("Timestamp file exists: %s" %
                                  timestamp_pathname)
                return True
        except Exception as ex:
            # Logged only: fall through and attempt the repair below.
            self._logger.exception("Exception validating %s, %s" %
                                   (timestamp_pathname, ex))

        # The timestamp file is not accessible,
        # try creating one, if successful try to
        # delete the renamed timestamp file if it
        # exists
        try:
            self._create_image_timestamp_file(image_dirname)
            self._delete_renamed_image_timestamp_file(image_dirname)
        except Exception as ex:
            self._logger.exception("Exception creating %s, %s" %
                                   (timestamp_pathname, ex))
            return False

        self._logger.info("Image repaired: %s" % image_dirname)
        return True

    def copy_image(self, source_datastore, source_id, dest_datastore,
                   dest_id):
        """Copy an image between datastores.

        This method is used to create a "full clone" of a vmdk.
        It does so by copying a disk to a unique directory in a well known
        temporary directory then moving the disk to the destination image
        location. Data in the temporary directory not properly cleaned up
        will be periodically garbage collected by the reaper thread.

        This minimizes the window during which the vmdk path exists with
        incomplete content. It also works around a hostd issue where
        cp -f does not work.

        If the destination image already exists nothing is copied and
        DiskAlreadyExistException is raised.

        source_datastore: id of the source datastore
        source_id: id of the image to copy from
        dest_datastore: id of the destination datastore
        dest_id: id of the new image in the destination datastore

        throws: DiskAlreadyExistException if the destination image exists
        throws: AcquireLockFailure if timed out waiting to acquire lock on
                tmp image directory
        throws: InvalidFile if unable to lock tmp image directory or some
                other reasons
        """
        if self.check_and_validate_image(dest_id, dest_datastore):
            # The image is copied, presumably via some other concurrent
            # copy, so we move on.
            self._logger.info("Image %s already copied" % dest_id)
            raise DiskAlreadyExistException("Image already exists")

        # Copy image to the tmp directory.
        tmp_dir = self._create_tmp_image(source_datastore, source_id,
                                         dest_datastore, dest_id)

        self._move_image(dest_id, dest_datastore, tmp_dir)

    def reap_tmp_images(self):
        """ Clean up unused directories in the temp image folder. """
        for ds in self._ds_manager.get_datastores():
            images_dir = tmp_image_folder_os_path(ds.id)

            for f in os.listdir(images_dir):
                path = os.path.join(images_dir, f)
                if not os.path.isdir(path):
                    continue
                try:
                    # A directory whose lock cannot be taken is skipped.
                    with FileBackedLock(path, ds.type):
                        if os.path.exists(path):
                            self._logger.info("Delete folder %s" % path)
                            shutil.rmtree(path, ignore_errors=True)
                except (AcquireLockFailure, InvalidFile):
                    self._logger.info("Already locked: %s, skipping" % path)
                except Exception:
                    # Best effort: keep going with the next directory.
                    self._logger.info("Unable to remove %s" % path,
                                      exc_info=True)

    def delete_image(self, datastore_id, image_id, ds_type, force):
        """Tombstone an image and, when forced, garbage collect it now.

        :param datastore_id: datastore id
        :param image_id: image id
        :param ds_type: not used by this method
        :param force: when false the image is only tombstoned
        :raises ImageNotFoundException: the image directory does not exist
        :raises ImageInUse: force was requested but the data disk could
            not be removed
        """
        # Check if the image currently exists
        if not self.check_image_dir(image_id, datastore_id):
            self._logger.info("Image %s on datastore %s not found" %
                              (image_id, datastore_id))
            raise ImageNotFoundException("Image %s not found" % image_id)

        # Mark image as tombstoned
        self.create_image_tombstone(datastore_id, image_id)

        if not force:
            return

        # If force try to actively garbage collect the image here
        if self._lock_data_disk(datastore_id, image_id):
            self._gc_image_dir(datastore_id, image_id)
        else:
            raise ImageInUse("Image %s is currently in use" % image_id)

        # Now attempt GCing the image directory.
        try:
            self._clean_gc_dir(datastore_id)
        except Exception:
            # Swallow the exception the next clean call will clear it all.
            self._logger.exception("Failed to delete gc dir on datastore %s"
                                   % datastore_id)

    def _lock_data_disk(self, datastore_id, image_id):
        """
        Try to remove the flat data disk of the image. A failed removal is
        taken to mean that the disk is locked, i.e. still in use.
        Return True if the disk was removed, False otherwise.
        """
        data_disk = os_vmdk_flat_path(datastore_id, image_id)
        try:
            # Its ok to delete the data disk as a subsequent power on will
            # fail if the data disk is not there.
            os.remove(data_disk)
        except OSError:
            # Remove failed so disk is locked.
            self._logger.debug("Disk %s on datastore %s is already locked"
                               % (data_disk, datastore_id))
            return False
        return True

    def get_images(self, datastore):
        """ Get image list from datastore
        :param datastore: datastore id
        :return: list of string, image id list
        :raises DatastoreNotFoundException: the image folder does not exist
        """
        image_ids = []

        # image_folder is /vmfs/volumes/${datastore}/images
        image_folder = os_datastore_path(datastore, IMAGE_FOLDER_NAME)

        if not os.path.exists(image_folder):
            raise DatastoreNotFoundException()

        # prefix is the 2-digit prefix of image id
        for prefix in os.listdir(image_folder):
            # outer path is something like
            # /vmfs/volumes/${datastore}/images/${image_id}[0:2]
            outer_path = os.path.join(image_folder, prefix)
            if not os.path.isdir(outer_path):
                continue

            for image_id in os.listdir(outer_path):
                if self.check_image(image_id, datastore):
                    image_ids.append(image_id)

        return image_ids

    def mark_unused(self, image_scanner):
        """Run _mark_unused_images on the scanner's datastore image folder.

        :raises DatastoreNotFoundException: the image folder does not exist
        """
        images_dir_path = os_datastore_path(image_scanner.datastore_id,
                                            IMAGE_FOLDER_NAME)
        # Log messages with prefix: "IMAGE SCANNER" are for debugging
        # and will be removed after basic testing
        self._logger.info("IMAGE SCANNER: images_dir: %s" % images_dir_path)
        if not os.path.isdir(images_dir_path):
            self._logger.info(
                "images_dir_path: images_dir: %s, doesn't exist"
                % images_dir_path)
            raise DatastoreNotFoundException(
                "Image scanner, cannot find image "
                "directory for datastore: %s"
                % image_scanner.datastore_id)

        return self._mark_unused_images(image_scanner, images_dir_path)

    def delete_unused(self, image_sweeper):
        """Run _delete_unused_images on the sweeper's datastore image folder.

        :raises DatastoreNotFoundException: the image folder does not exist
        """
        images_dir_path = os_datastore_path(image_sweeper.datastore_id,
                                            IMAGE_FOLDER_NAME)
        # Log messages with prefix: "IMAGE SWEEPER" are for debugging
        # and will be removed after basic testing
        self._logger.info("IMAGE SWEEPER: images_dir: %s" % images_dir_path)
        if not os.path.isdir(images_dir_path):
            self._logger.info(
                "images_dir_path: images_dir: %s, doesn't exist"
                % images_dir_path)
            raise DatastoreNotFoundException(
                "Image sweeper, cannot find image "
                "directory for datastore: %s"
                % image_sweeper.datastore_id)

        return self._delete_unused_images(image_sweeper, images_dir_path)

    def _unzip(self, src, dst):
        """Decompress the gzip file src into the file dst."""
        self._logger.info("unzip %s -> %s" % (src, dst))

        fsrc = gzip.open(src, "rb")
        try:
            # Nested so that the gzip handle is released even when the
            # destination cannot be opened.
            fdst = open(dst, "wb")
            try:
                shutil.copyfileobj(fsrc, fdst)
            finally:
                fdst.close()
        finally:
            fsrc.close()

    def _copy_disk(self, src, dst):
        """Copy a virtual disk with a CopyVirtualDisk_Task."""
        self._manage_disk(vim.VirtualDiskManager.CopyVirtualDisk_Task,
                          sourceName=src, destName=dst)

    def _manage_disk(self, op, **kwargs):
        """Invoke a VirtualDiskManager operation and wait for its task.

        :param op: the VirtualDiskManager task method to invoke
        :param kwargs: keyword arguments passed through to op
        :raises DiskAlreadyExistException: on vim.Fault.FileAlreadyExists
        :raises DiskFileException: on vim.Fault.FileFault
        """
        if self._in_uwsim:
            self._manage_disk_uwsim(op, **kwargs)
            return

        try:
            self._logger.debug("Invoking %s(%s)" % (op.info.name, kwargs))
            task = op(self._manager, **kwargs)
            self._vim_client.wait_for_task(task)
        # FileAlreadyExists is handled before the more general FileFault.
        except vim.Fault.FileAlreadyExists as e:
            raise DiskAlreadyExistException(e.msg)
        except vim.Fault.FileFault as e:
            raise DiskFileException(e.msg)
class ImageManager():
    """Manage VM images stored in per-image directories on datastores.

    An image lives in a directory whose name is the image folder prefix
    joined with the image id. Besides the vmdk (and optional metadata
    file) each valid image directory contains a timestamp file whose
    modification time is compared against the unused-image marker file
    when deciding whether the image may be deleted.
    """

    NUM_MAKEDIRS_ATTEMPTS = 10
    # Interval handed to Periodic for the tmp image reaper.
    DEFAULT_TMP_IMAGES_CLEANUP_INTERVAL = 600.0
    # Seconds (compared against time.time() deltas) during which a tmp
    # image directory is never reaped.
    REAP_TMP_IMAGES_GRACE_PERIOD = 2 * 60.0 * 60.0  # 2 hrs
    # NOTE(review): not referenced inside this class; presumably the value
    # callers pass as delete_image()'s grace_period -- confirm.
    DELETE_IMAGE_GRACE_PERIOD = 60
    UNUSED_IMAGE_MARKER_FILE_NAME = "unused_image_marker.txt"
    IMAGE_TIMESTAMP_FILE_NAME = "image_timestamp.txt"

    def __init__(self, host_client, ds_manager):
        """
        Args:
            host_client: client used for directory/disk operations
                (make_directory, copy_disk, move_file, delete_file, ...)
            ds_manager: datastore manager used to enumerate datastores
        """
        self._logger = logging.getLogger(__name__)
        self._host_client = host_client
        self._ds_manager = ds_manager
        self._image_reaper = None

    def monitor_for_cleanup(self,
                            reap_interval=DEFAULT_TMP_IMAGES_CLEANUP_INTERVAL):
        """Start the background thread that reaps stale tmp image dirs."""
        self._image_reaper = Periodic(self.reap_tmp_images, reap_interval)
        self._image_reaper.daemon = True
        self._image_reaper.start()

    def cleanup(self):
        """Stop the reaper thread if it was started."""
        if self._image_reaper is not None:
            self._image_reaper.stop()

    def datastores_with_image(self, image_id, datastores):
        """Return the subset of datastores on which image_id exists."""
        if image_id is None:
            return []
        return [ds for ds in datastores if self.check_image(image_id, ds)]

    def image_metadata(self, image_id, datastores):
        """Return the metadata of image_id from the first datastore that
        has the image, or None if no datastore has it.
        """
        for ds in datastores:
            if self.check_image(image_id, ds):
                return self.get_image_metadata(image_id, ds)
        return None

    @staticmethod
    def get_image_id_from_disks(disks):
        """Find image id in the disk collection"""
        if not disks:
            return None
        for disk in disks:
            try:
                if disk.image.id is not None:
                    return disk.image.id
            except AttributeError:
                # Disk without an image (or image without an id).
                continue
        return None

    @log_duration
    def check_image(self, image_id, datastore):
        """Return True if the image's vmdk file exists on the datastore."""
        vmdk_pathname = os_vmdk_path(datastore, image_id,
                                     IMAGE_FOLDER_NAME_PREFIX)
        try:
            return os.path.exists(vmdk_pathname)
        except Exception:
            self._logger.exception("Error looking up %s" % vmdk_pathname)
            return False

    def check_and_validate_image(self, image_id, ds_id):
        """Return True if the image is present and valid on the datastore.

        Intended as a replacement of check_image in the vm creation
        workflow, compatible with the image sweeper: for an image to be
        valid both the image directory and the image timestamp file must
        exist on the datastore.
        """
        image_dir = os.path.dirname(
            os_vmdk_path(ds_id, image_id, IMAGE_FOLDER_NAME_PREFIX))

        try:
            if not os.path.exists(image_dir):
                return False
        except Exception:
            self._logger.exception("Error looking up %s" % image_dir)
            return False

        # Check the existence of the timestamp file
        timestamp_pathname = os.path.join(image_dir,
                                          self.IMAGE_TIMESTAMP_FILE_NAME)
        try:
            return os.path.exists(timestamp_pathname)
        except Exception as ex:
            self._logger.exception("Exception looking up %s, %s" %
                                   (timestamp_pathname, ex))
            return False

    def touch_image_timestamp(self, ds_id, image_id):
        """Update the mod time on the image timestamp file.

        Raises:
            whatever os.utime raises (e.g. OSError when the timestamp
            file does not exist); the error is logged first.
        """
        image_path = os.path.dirname(
            os_vmdk_path(ds_id, image_id, IMAGE_FOLDER_NAME_PREFIX))

        # Touch the timestamp file
        timestamp_pathname = os.path.join(image_path,
                                          self.IMAGE_TIMESTAMP_FILE_NAME)
        try:
            os.utime(timestamp_pathname, None)
        except Exception as ex:
            self._logger.exception("Exception looking up %s, %s" %
                                   (timestamp_pathname, ex))
            # Bare raise keeps the original traceback.
            raise

    @log_duration
    def check_image_dir(self, image_id, datastore):
        """Return True if the image directory exists on the datastore."""
        image_path = os_vmdk_path(datastore, image_id,
                                  IMAGE_FOLDER_NAME_PREFIX)
        try:
            return os.path.exists(os.path.dirname(image_path))
        except Exception:
            self._logger.error(
                "Error looking up %s" % image_path, exc_info=True)
            return False

    def get_image_directory_path(self, datastore_id, image_id):
        """Return the image directory path as computed by
        image_directory_path().
        """
        return image_directory_path(datastore_id, image_id)

    def get_image_path(self, datastore_id, image_id):
        """Return the path of the image's vmdk file."""
        return os_vmdk_path(datastore_id, image_id, IMAGE_FOLDER_NAME_PREFIX)

    def image_size(self, image_id):
        """Return the size in bytes of the image's flat vmdk.

        The image datastores are tried in order and the first one holding
        the image wins. On VSAN a fixed 1GB is reported.

        Raises:
            NoSuchResourceException: the image was not found on any image
                datastore.
        """
        for image_ds in self._ds_manager.image_datastores():
            if self._ds_manager.datastore_type(image_ds) == DatastoreType.VSAN:
                if os.path.exists(os_vmdk_path(image_ds, image_id,
                                               IMAGE_FOLDER_NAME_PREFIX)):
                    # VSAN does not have flat.vmdk so we cannot get file
                    # size. Default to 1GB.
                    return 1024 ** 3
            else:
                try:
                    image_path = os_vmdk_flat_path(image_ds, image_id,
                                                   IMAGE_FOLDER_NAME_PREFIX)
                    return os.path.getsize(image_path)
                except os.error:
                    # Log here, while the exception is being handled, so
                    # exc_info refers to this failure.
                    self._logger.warning("Failed to get image size:",
                                         exc_info=True)
            self._logger.info("Image %s not found in DataStore %s" %
                              (image_id, image_ds))

        # Failed to access shared image.
        raise NoSuchResourceException(ResourceType.IMAGE,
                                      "Image does not exist.")

    def get_image_metadata(self, image_id, datastore):
        """Load the image's JSON metadata file.

        Returns:
            the decoded metadata, or {} when the file is missing or is
            not valid JSON.
        """
        metadata_path = os_metadata_path(datastore, image_id,
                                         IMAGE_FOLDER_NAME_PREFIX)
        self._logger.info("Loading metadata %s" % metadata_path)
        if os.path.exists(metadata_path):
            with open(metadata_path) as fh:
                try:
                    return json.load(fh)
                except ValueError:
                    self._logger.error(
                        "Error loading metadata file %s" % metadata_path,
                        exc_info=True)
        return {}

    def _get_datastore_type(self, datastore_id):
        """Return the type of the datastore with the given id.

        Raises:
            IndexError: no datastore with that id is known.
        """
        datastores = self._ds_manager.get_datastores()
        return [ds.type for ds in datastores if ds.id == datastore_id][0]

    def _copy_to_tmp_image(self, source_datastore, source_id,
                           dest_datastore, dest_id):
        """Copy an image into a temp location.

        1. Create the temp directory. Its name contains a random UUID to
           prevent collisions with concurrent copies. On VSAN it is
           created under the destination image directory, otherwise at
           the top level of the datastore.
        2. Copy the metadata file over, if the source has one.
        3. Create the timestamp file.
        4. Copy the vmdk over.

        No lock is taken here. If a step fails the temp directory is left
        behind.

        @return the tmp image directory on success.
        """
        ds_type = self._get_datastore_type(dest_datastore)
        tmp_name = compond_path_join(TMP_IMAGE_FOLDER_NAME_PREFIX,
                                     str(uuid.uuid4()))
        if ds_type == DatastoreType.VSAN:
            tmp_image_dir = os_datastore_path(
                dest_datastore,
                compond_path_join(IMAGE_FOLDER_NAME_PREFIX, dest_id),
                tmp_name)
        else:
            tmp_image_dir = os_datastore_path(dest_datastore, tmp_name)

        # Create the temp directory
        self._host_client.make_directory(tmp_image_dir)

        # Copy the metadata file if it exists.
        source_meta = os_metadata_path(source_datastore, source_id,
                                       IMAGE_FOLDER_NAME_PREFIX)
        if os.path.exists(source_meta):
            try:
                dest_meta = os.path.join(tmp_image_dir,
                                         metadata_filename(dest_id))
                shutil.copy(source_meta, dest_meta)
            except:
                # Catch-all is intentional: log and re-raise unchanged.
                self._logger.exception("Failed to copy metadata file %s",
                                       source_meta)
                raise

        # Create the timestamp file
        self._create_image_timestamp_file(tmp_image_dir)

        self._host_client.copy_disk(
            vmdk_path(source_datastore, source_id, IMAGE_FOLDER_NAME_PREFIX),
            os.path.join(tmp_image_dir, "%s.vmdk" % dest_id))
        return tmp_image_dir

    def _move_image(self, image_id, datastore, tmp_dir):
        """Move a tmp folder into its final image location.

        Handles concurrent moves by holding a FileBackedLock on the
        destination image directory while doing the move.

        On any failure inside the locked section -- including failing to
        acquire the lock (retried 300 times, 0.1s apart) and the
        destination image already existing -- the error is logged, the
        tmp image directory is deleted and the exception is re-raised.

        image_id: String. The image id of the image being moved.
        datastore: String. The datastore id of the datastore.
        tmp_dir: String. The absolute path of the temp image directory.

        raises: ImageNotFoundException if tmp_dir does not exist
                DiskAlreadyExistException if the destination image exists
                plus whatever the lock or the move itself raises
        """
        ds_type = self._get_datastore_type(datastore)
        image_path = os_datastore_path(
            datastore, compond_path_join(IMAGE_FOLDER_NAME_PREFIX, image_id))
        self._logger.info("_move_image: %s => %s, ds_type: %s" %
                          (tmp_dir, image_path, ds_type))

        if not os.path.exists(tmp_dir):
            raise ImageNotFoundException("Temp image %s not found" % tmp_dir)

        try:
            # wait lock for 30 seconds
            with FileBackedLock(image_path, ds_type, retry=300,
                                wait_secs=0.1):
                if self._check_image_repair(image_id, datastore):
                    raise DiskAlreadyExistException("Image already exists")

                if ds_type == DatastoreType.VSAN:
                    # on VSAN, move all files under
                    # [datastore]/image_[image_id]/tmp_image_[uuid]/* to
                    # [datastore]/image_[image_id]/*.
                    # Also we do not delete tmp_image folder in success
                    # case, because VSAN accesses it when creating linked
                    # VM, even the folder is now empty.
                    for entry in os.listdir(tmp_dir):
                        shutil.move(os.path.join(tmp_dir, entry),
                                    os.path.join(image_path, entry))
                else:
                    # on VMFS/NFS/etc, rename [datastore]/tmp_image_[uuid]
                    # to [datastore]/image_[image_id]
                    self._host_client.move_file(tmp_dir, image_path)
        except:
            # Catch-all is intentional: clean up the tmp dir, then
            # re-raise whatever went wrong.
            self._logger.exception("Move image %s to %s failed" %
                                   (image_id, image_path))
            self._host_client.delete_file(tmp_dir)
            raise

    def _check_image_repair(self, image_id, datastore):
        """Check for a previously created image, repairing it if needed.

        The "timestamp" file must exist inside the image directory. If
        the vmdk exists and the timestamp file does not, it may mean that
        an image delete operation was aborted mid-way; in that case the
        timestamp file is recreated.

        The caller is required to hold the image directory lock
        (FileBackedLock).

        Returns:
            True if the image exists (possibly after repair), False if
            the vmdk is missing or the repair failed.
        """
        vmdk_pathname = os_vmdk_path(datastore, image_id,
                                     IMAGE_FOLDER_NAME_PREFIX)
        image_dirname = os.path.dirname(vmdk_pathname)
        try:
            # Check vmdk file
            if not os.path.exists(vmdk_pathname):
                self._logger.info("Vmdk path doesn't exists: %s" %
                                  vmdk_pathname)
                return False
        except Exception as ex:
            self._logger.exception("Exception validating %s, %s" %
                                   (image_dirname, ex))
            return False

        # Check timestamp file
        timestamp_pathname = os.path.join(image_dirname,
                                          self.IMAGE_TIMESTAMP_FILE_NAME)
        try:
            if os.path.exists(timestamp_pathname):
                self._logger.info("Timestamp file exists: %s" %
                                  timestamp_pathname)
                return True
        except Exception as ex:
            # Fall through and try to recreate the file.
            self._logger.exception("Exception validating %s, %s" %
                                   (timestamp_pathname, ex))

        # The timestamp file is not accessible, try creating one
        try:
            self._create_image_timestamp_file(image_dirname)
        except Exception as ex:
            self._logger.exception("Exception creating %s, %s" %
                                   (timestamp_pathname, ex))
            return False

        self._logger.info("Image repaired: %s" % image_dirname)
        return True

    def copy_image(self, source_datastore, source_id, dest_datastore,
                   dest_id):
        """Copy an image between datastores.

        This method is used to create a "full clone" of a vmdk. It does
        so by copying the disk to a uniquely named temporary directory,
        then moving it to the destination image location. Data in the
        temporary directory not properly cleaned up will be periodically
        garbage collected by the reaper thread.

        This minimizes the window during which the vmdk path exists with
        incomplete content.

        If the destination image already exists it is NOT overwritten;
        DiskAlreadyExistException is raised instead.

        source_datastore: id of the source datastore
        source_id: id of the image to copy from
        dest_datastore: id of the destination datastore
        dest_id: id of the new image in the destination datastore

        throws: DiskAlreadyExistException if the destination image exists
        throws: AcquireLockFailure if timed out waiting to acquire lock
                on the destination image directory
        throws: InvalidFile if unable to lock the destination image
                directory or some other reasons
        """
        if self.check_and_validate_image(dest_id, dest_datastore):
            # The image is copied, presumably via some other concurrent
            # copy, so we move on.
            self._logger.info("Image %s already copied" % dest_id)
            raise DiskAlreadyExistException("Image already exists")

        # Copy image to the tmp directory.
        tmp_dir = self._copy_to_tmp_image(source_datastore, source_id,
                                          dest_datastore, dest_id)

        self._move_image(dest_id, dest_datastore, tmp_dir)

    def reap_tmp_images(self):
        """ Clean up unused directories in the temp image folder. """
        for ds in self._ds_manager.get_datastores():
            for image_dir in list_top_level_directory(
                    ds.id, TMP_IMAGE_FOLDER_NAME_PREFIX):
                if not os.path.isdir(image_dir):
                    continue

                try:
                    create_time = os.stat(image_dir).st_ctime
                except OSError:
                    # The directory vanished (or became unreadable) after
                    # the isdir check; don't let that abort the whole
                    # reaping pass.
                    self._logger.info("Unable to stat %s, skipping" %
                                      image_dir, exc_info=True)
                    continue
                current_time = time.time()
                if current_time - self.REAP_TMP_IMAGES_GRACE_PERIOD < \
                        create_time:
                    # Skip folders that are newly created in past x minutes
                    # For example, during host-to-host transfer, hostd on
                    # receiving end stores the uploaded file in temp images
                    # folder but does not lock it with FileBackedLock, so we
                    # need to allow a grace period before reaping it.
                    self._logger.info(
                        "Skip folder: %s, created: %s, now: %s" %
                        (image_dir, create_time, current_time))
                    continue

                try:
                    with FileBackedLock(image_dir, ds.type):
                        if os.path.exists(image_dir):
                            self._logger.info("Delete folder %s" % image_dir)
                            shutil.rmtree(image_dir, ignore_errors=True)
                except (AcquireLockFailure, InvalidFile):
                    self._logger.info("Already locked: %s, skipping" %
                                      image_dir)
                except Exception:
                    self._logger.info("Unable to remove %s" % image_dir,
                                      exc_info=True)

    def get_images(self, datastore):
        """ Get image list from datastore
        :param datastore: datastore id
        :return: list of string, image id list
        :raise DatastoreNotFoundException: datastore root does not exist
        """
        image_ids = []

        if not os.path.exists(os_datastore_root(datastore)):
            raise DatastoreNotFoundException()

        # image folders are /vmfs/volumes/${datastore}/image_*
        for image_dir in list_top_level_directory(datastore,
                                                  IMAGE_FOLDER_NAME_PREFIX):
            # Only look at the directory name, and split once, so that a
            # separator in the datastore name or in the image id itself
            # cannot yield the wrong id.
            dir_name = os.path.basename(image_dir.rstrip(os.sep))
            image_id = dir_name.split(COMPOND_PATH_SEPARATOR, 1)[1]
            if self.check_image(image_id, datastore):
                image_ids.append(image_id)

        return image_ids

    def get_datastore_id_from_path(self, image_path):
        """Extract datastore id from the absolute path of an image.

        The image path looks something like this:

            /vmfs/volumes/datastore1/image_ttylinux/ttylinux.vmdk

        This method returns "datastore1" with this input.
        """
        return image_path.split(os.sep)[3]

    def get_image_id_from_path(self, image_path):
        """Extract image id from the absolute path of an image.

        The image path looks something like this:

            /vmfs/volumes/datastore1/image_ttylinux/ttylinux.vmdk

        This method returns "ttylinux" with this input.
        """
        image_dir_name = image_path.split(os.sep)[4]
        # Split once so an id containing the separator is kept whole.
        return image_dir_name.split(COMPOND_PATH_SEPARATOR, 1)[1]

    def create_image(self, image_id, datastore_id):
        """ Create a temp image on given datastore, return its path. """
        datastore_type = self._get_datastore_type(datastore_id)
        tmp_name = compond_path_join(TMP_IMAGE_FOLDER_NAME_PREFIX,
                                     str(uuid.uuid4()))
        if datastore_type == DatastoreType.VSAN:
            # on VSAN, tmp_dir is
            # [datastore]/image_[image_id]/tmp_image_[uuid]
            # Because VSAN does not allow moving top-level directories,
            # we place tmp_image under image's dir.
            relative_path = os.path.join(
                compond_path_join(IMAGE_FOLDER_NAME_PREFIX, image_id),
                tmp_name)
            tmp_dir = os_datastore_path(datastore_id, relative_path)
        else:
            # on VMFS/NFS/etc, tmp_dir is [datastore]/tmp_image_[uuid]
            tmp_dir = os_datastore_path(datastore_id, tmp_name)

        self._host_client.make_directory(tmp_dir)
        # return datastore path, so that it can be passed to nfc client
        return os_to_datastore_path(tmp_dir)

    def finalize_image(self, datastore_id, tmp_dir, image_id):
        """ Installs an image using image data staged at a temp directory.

        tmp_dir is a datastore path; it is moved into the image location
        and the image timestamp file is then created there.
        """
        self._move_image(image_id, datastore_id,
                         datastore_to_os_path(tmp_dir))
        self._create_image_timestamp_file(
            self._image_directory(datastore_id, image_id))

    def create_image_with_vm_disk(self, datastore_id, tmp_dir, image_id,
                                  vm_disk_os_path):
        """ Fills a temp image directory with a disk from a VM, then
        installs directory in the shared image folder.
        """
        # Destination of the copy inside the (already existing) tmp dir.
        dst_vmdk_path = os.path.join(datastore_to_os_path(tmp_dir),
                                     "%s.vmdk" % image_id)
        if os.path.exists(dst_vmdk_path):
            self._logger.warning("Unexpected disk %s present, overwriting" %
                                 dst_vmdk_path)

        self._host_client.copy_disk(vm_disk_os_path, dst_vmdk_path)

        try:
            self.finalize_image(datastore_id, tmp_dir, image_id)
        except:
            # Catch-all is intentional: remove the copied disk, then
            # re-raise whatever went wrong.
            self._logger.warning("Delete copied disk %s" % dst_vmdk_path)
            self._host_client.delete_disk(dst_vmdk_path)
            raise

    def delete_tmp_dir(self, datastore_id, tmp_dir):
        """ Deletes a temp image directory (recursively, via rm_rf).

        raises: DirectoryNotFound if the directory does not exist
        """
        file_path = os_datastore_path(datastore_id, tmp_dir)
        if not os.path.exists(file_path):
            self._logger.info("Tmp dir %s not found" % file_path)
            raise DirectoryNotFound("Directory %s not found" % file_path)
        rm_rf(file_path)

    @staticmethod
    def _read_marker_file(filename):
        """Return the float stored in the unused-image marker file."""
        with open(filename, "r") as marker_file:
            start_time_str = marker_file.read()
        return float(start_time_str)

    def delete_image(self, datastore_id, image_id, grace_period):
        """Delete a single image if it has not been used since it was
        marked as unused.

        The steps, all performed while holding the image directory lock:
          1) Read the content of the unused_image_marker file (the time
             at which the image was marked unused) and subtract
             grace_period from it.
          2) Read the mod time of the timestamp file.
          3) If the timestamp file exists and its mod time is not older
             than the adjusted marker time, the image is in use: remove
             the marker file and return False.
          4) Otherwise delete the image directory and return True.

        Any exception (lock not acquired, marker file missing or
        malformed, delete failure, ...) is logged and results in False.

        Returns:
            True if the image was removed, False if it was not.
        """
        self._logger.info("delete_image: Starting to delete image: %s, %s" %
                          (datastore_id, image_id))

        image_dir = self._image_directory(datastore_id, image_id)
        ds_type = self._get_datastore_type(datastore_id)
        marker_pathname = os.path.join(image_dir,
                                       self.UNUSED_IMAGE_MARKER_FILE_NAME)
        timestamp_pathname = os.path.join(image_dir,
                                          self.IMAGE_TIMESTAMP_FILE_NAME)

        try:
            with FileBackedLock(image_dir, ds_type):
                # Read marker file to determine when image scanner marked
                # this image as unused
                marker_time = self._read_marker_file(marker_pathname)
                self._logger.info(
                    "delete_image: image was marked as unused at: %s" %
                    marker_time)

                # Subtract grace time to avoid errors due to small
                # difference in clock values on different hosts. Pretend
                # the scan started grace_period earlier.
                marker_time -= grace_period

                # Read timestamp mod_time to determine the latest vm
                # creation using this image
                timestamp_exists, mod_time = \
                    self._get_mod_time(timestamp_pathname)
                self._logger.info(
                    "delete_image: image was last touched at: %s, %s" %
                    (timestamp_exists, mod_time))

                # Image was touched (due to VM creation) after scanner
                # marked it as unused. Remove unused image marker file
                if timestamp_exists and mod_time >= marker_time:
                    self._logger.info(
                        "delete_image: image is in-use, do not delete")
                    os.unlink(marker_pathname)
                    return False

                # Delete image directory
                self._logger.info(
                    "delete_image: removing image directory: %s" % image_dir)
                self._host_client.delete_file(image_dir)
                return True
        except Exception:
            self._logger.exception("delete_image: failed to delete image")
            return False

    def _get_mod_time(self, pathname):
        """Read the mod time of a file.

        Returns:
            (True, mod_time) if the file exists, (False, 0) if it does
            not.

        Raises:
            OSError: for any failure other than the file not existing.
        """
        try:
            mod_time = os.path.getmtime(pathname)
        except OSError as ex:
            self._logger.warning("Cannot read mod time for file: %s, %s" %
                                 (pathname, ex))
            if ex.errno == errno.ENOENT:
                return False, 0
            # Bare raise keeps the original traceback.
            raise
        return True, mod_time

    def get_timestamp_mod_time_from_dir(self, dirname):
        """Return _get_mod_time() of the timestamp file inside dirname."""
        filename = os.path.join(dirname, self.IMAGE_TIMESTAMP_FILE_NAME)
        return self._get_mod_time(filename)

    def _create_image_timestamp_file(self, dirname):
        """Create (or truncate) the timestamp file inside dirname.

        Raises:
            whatever open() raises; the error is logged first.
        """
        try:
            timestamp_pathname = os.path.join(dirname,
                                              self.IMAGE_TIMESTAMP_FILE_NAME)
            open(timestamp_pathname, 'w').close()
        except Exception as ex:
            self._logger.exception("Exception creating %s, %s" %
                                   (dirname, ex))
            # Bare raise keeps the original traceback.
            raise

    def _image_directory(self, datastore_id, image_id):
        """Return the directory that contains the image's vmdk file."""
        return os.path.dirname(os_vmdk_path(datastore_id, image_id,
                                            IMAGE_FOLDER_NAME_PREFIX))
class ImageManager(): NUM_MAKEDIRS_ATTEMPTS = 10 DEFAULT_TMP_IMAGES_CLEANUP_INTERVAL = 600.0 REAP_TMP_IMAGES_GRACE_PERIOD = 2 * 60.0 * 60.0 # 2 hrs DELETE_IMAGE_GRACE_PERIOD = 60 UNUSED_IMAGE_MARKER_FILE_NAME = "unused_image_marker.txt" IMAGE_TIMESTAMP_FILE_NAME = "image_timestamp.txt" def __init__(self, host_client, ds_manager): self._logger = logging.getLogger(__name__) self._host_client = host_client self._ds_manager = ds_manager self._image_reaper = None def monitor_for_cleanup(self, reap_interval=DEFAULT_TMP_IMAGES_CLEANUP_INTERVAL): self._image_reaper = Periodic(self.reap_tmp_images, reap_interval) self._image_reaper.daemon = True self._image_reaper.start() def cleanup(self): if self._image_reaper is not None: self._image_reaper.stop() def datastores_with_image(self, image_id, datastores): if image_id is None: return [] return [ds for ds in datastores if self.check_image(image_id, ds)] def image_metadata(self, image_id, datastores): for ds in datastores: if self.check_image(image_id, ds): return self.get_image_metadata(image_id, ds) @staticmethod def get_image_id_from_disks(disks): """Find image id in the disk collection""" if not disks: return None for disk in disks: try: if disk.image.id is not None: return disk.image.id except AttributeError: continue return None @log_duration def check_image(self, image_id, datastore): image_dir = os_vmdk_path(datastore, image_id, IMAGE_FOLDER_NAME_PREFIX) try: return os.path.exists(image_dir) except: self._logger.exception("Error looking up %s" % image_dir) return False """ The following method is intended as a replacement of check_image in the vm creation workflow compatible with the new image sweeper. For an image to be valid both the directory and the image timestamp file must exists on the datastore. 
""" def check_and_validate_image(self, image_id, ds_id): image_dir = os.path.dirname(os_vmdk_path(ds_id, image_id, IMAGE_FOLDER_NAME_PREFIX)) try: if not os.path.exists(image_dir): return False except: self._logger.exception("Error looking up %s" % image_dir) return False # Check the existence of the timestamp file timestamp_pathname = os.path.join(image_dir, self.IMAGE_TIMESTAMP_FILE_NAME) try: if os.path.exists(timestamp_pathname): return True except Exception as ex: self._logger.exception("Exception looking up %s, %s" % (timestamp_pathname, ex)) return False return False """ This method is used to update the mod time on the image timestamp file. """ def touch_image_timestamp(self, ds_id, image_id): image_path = os.path.dirname(os_vmdk_path(ds_id, image_id, IMAGE_FOLDER_NAME_PREFIX)) # Touch the timestamp file timestamp_pathname = os.path.join(image_path, self.IMAGE_TIMESTAMP_FILE_NAME) try: os.utime(timestamp_pathname, None) except Exception as ex: self._logger.exception("Exception looking up %s, %s" % (timestamp_pathname, ex)) raise ex @log_duration def check_image_dir(self, image_id, datastore): image_path = os_vmdk_path(datastore, image_id, IMAGE_FOLDER_NAME_PREFIX) try: return os.path.exists(os.path.dirname(image_path)) except: self._logger.error( "Error looking up %s" % image_path, exc_info=True) return False def get_image_directory_path(self, datastore_id, image_id): return image_directory_path(datastore_id, image_id) def get_image_path(self, datastore_id, image_id): return os_vmdk_path(datastore_id, image_id, IMAGE_FOLDER_NAME_PREFIX) def image_size(self, image_id): for image_ds in self._ds_manager.image_datastores(): if self._ds_manager.datastore_type(image_ds) is DatastoreType.VSAN: if os.path.exists(os_vmdk_path(image_ds, image_id, IMAGE_FOLDER_NAME_PREFIX)): # VSAN does not have flat.vmdk so we cannot get file size. Default to 1GB. 
return 1024 ** 3 else: try: image_path = os_vmdk_flat_path(image_ds, image_id, IMAGE_FOLDER_NAME_PREFIX) return os.path.getsize(image_path) except os.error: pass self._logger.info("Image %s not found in DataStore %s" % (image_id, image_ds)) self._logger.warning("Failed to get image size:", exc_info=True) # Failed to access shared image. raise NoSuchResourceException(ResourceType.IMAGE, "Image does not exist.") def get_image_metadata(self, image_id, datastore): metadata_path = os_metadata_path(datastore, image_id, IMAGE_FOLDER_NAME_PREFIX) self._logger.info("Loading metadata %s" % metadata_path) if os.path.exists(metadata_path): with open(metadata_path) as fh: try: return json.load(fh) except ValueError: self._logger.error("Error loading metadata file %s" % metadata_path, exc_info=True) return {} def _get_datastore_type(self, datastore_id): datastores = self._ds_manager.get_datastores() return [ds.type for ds in datastores if ds.id == datastore_id][0] def _copy_to_tmp_image(self, source_datastore, source_id, dest_datastore, dest_id): """ Copy an image into a temp location. 1. Lock a tmp image destination file with an exclusive lock. This is to prevent the GC thread from garbage collecting directories that are actively being used. The temp directory name contains a random UUID to prevent collisions with concurrent copies 2. Create the temp directory. 3. Copy the metadata file over. 4. Copy the vmdk over. @return the tmp image directory on success. """ ds_type = self._get_datastore_type(dest_datastore) if ds_type == DatastoreType.VSAN: tmp_image_dir = os_datastore_path(dest_datastore, compond_path_join(IMAGE_FOLDER_NAME_PREFIX, dest_id), compond_path_join(TMP_IMAGE_FOLDER_NAME_PREFIX, str(uuid.uuid4()))) else: tmp_image_dir = os_datastore_path(dest_datastore, compond_path_join(TMP_IMAGE_FOLDER_NAME_PREFIX, str(uuid.uuid4()))) # Create the temp directory self._host_client.make_directory(tmp_image_dir) # Copy the metadata file if it exists. 
source_meta = os_metadata_path(source_datastore, source_id, IMAGE_FOLDER_NAME_PREFIX) if os.path.exists(source_meta): try: dest_meta = os.path.join(tmp_image_dir, metadata_filename(dest_id)) shutil.copy(source_meta, dest_meta) except: self._logger.exception("Failed to copy metadata file %s", source_meta) raise # Create the timestamp file self._create_image_timestamp_file(tmp_image_dir) self._host_client.copy_disk(vmdk_path(source_datastore, source_id, IMAGE_FOLDER_NAME_PREFIX), os.path.join(tmp_image_dir, "%s.vmdk" % dest_id)) return tmp_image_dir def _move_image(self, image_id, datastore, tmp_dir): """ Atomic move of a tmp folder into the image datastore. Handles concurrent moves by locking a well know derivative of the image_id while doing the atomic move. The exclusive file lock ensures that only one move is successful. Has the following side effects: a - If the destination image already exists, it is assumed that someone else successfully copied the image over and the temp directory is deleted. b - If we fail to acquire the file lock after retrying 3 times, or the atomic move fails, the tmp image directory will be left behind and needs to be garbage collected later. image_id: String.The image id of the image being moved. datastore: String. The datastore id of the datastore. tmp_dir: String. The absolute path of the temp image directory. raises: OsError if the move fails AcquireLockFailure, InvalidFile if we fail to lock the destination image. 
""" ds_type = self._get_datastore_type(datastore) image_path = os_datastore_path(datastore, compond_path_join(IMAGE_FOLDER_NAME_PREFIX, image_id)) self._logger.info("_move_image: %s => %s, ds_type: %s" % (tmp_dir, image_path, ds_type)) if not os.path.exists(tmp_dir): raise ImageNotFoundException("Temp image %s not found" % tmp_dir) try: with FileBackedLock(image_path, ds_type, retry=300, wait_secs=0.1): # wait lock for 30 seconds if self._check_image_repair(image_id, datastore): raise DiskAlreadyExistException("Image already exists") if ds_type == DatastoreType.VSAN: # on VSAN, move all files under [datastore]/image_[image_id]/tmp_image_[uuid]/* to # [datastore]/image_[image_id]/*. # Also we do not delete tmp_image folder in success case, because VSAN accesses it # when creating linked VM, even the folder is now empty. for entry in os.listdir(tmp_dir): shutil.move(os.path.join(tmp_dir, entry), os.path.join(image_path, entry)) else: # on VMFS/NFS/etc, rename [datastore]/tmp_image_[uuid] to [datastore]/tmp_image_[image_id] self._host_client.move_file(tmp_dir, image_path) except: self._logger.exception("Move image %s to %s failed" % (image_id, image_path)) self._host_client.delete_file(tmp_dir) raise """ The following method should be used to check and validate the existence of a previously created image. With the new image delete path the "timestamp" file must exists inside the image directory. If the directory exists and the file does not, it may mean that an image delete operation was aborted mid-way. In this case the following method recreate the timestamp file. All operations are performed while holding the image directory lock (FileBackedLock), the caller is required to hold the lock. 
""" def _check_image_repair(self, image_id, datastore): vmdk_pathname = os_vmdk_path(datastore, image_id, IMAGE_FOLDER_NAME_PREFIX) image_dirname = os.path.dirname(vmdk_pathname) try: # Check vmdk file if not os.path.exists(vmdk_pathname): self._logger.info("Vmdk path doesn't exists: %s" % vmdk_pathname) return False except Exception as ex: self._logger.exception("Exception validating %s, %s" % (image_dirname, ex)) return False # Check timestamp file timestamp_pathname = os.path.join(image_dirname, self.IMAGE_TIMESTAMP_FILE_NAME) try: if os.path.exists(timestamp_pathname): self._logger.info("Timestamp file exists: %s" % timestamp_pathname) return True except Exception as ex: self._logger.exception("Exception validating %s, %s" % (timestamp_pathname, ex)) # The timestamp file is not accessible, try creating one try: self._create_image_timestamp_file(image_dirname) except Exception as ex: self._logger.exception("Exception creating %s, %s" % (timestamp_pathname, ex)) return False self._logger.info("Image repaired: %s" % image_dirname) return True def copy_image(self, source_datastore, source_id, dest_datastore, dest_id): """Copy an image between datastores. This method is used to create a "full clone" of a vmdk. It does so by copying a disk to a unique directory in a well known temporary directory then moving the disk to the destination image location. Data in the temporary directory not properly cleaned up will be periodically garbage collected by the reaper thread. This minimizes the window during which the vmdk path exists with incomplete content. It also works around a hostd issue where cp -f does not work. The current behavior for when the destination disk exists is to overwrite said disk. 
source_datastore: id of the source datastore source_id: id of the image to copy from dest_datastore: id of the destination datastore dest_id: id of the new image in the destination datastore throws: AcquireLockFailure if timed out waiting to acquire lock on tmp image directory throws: InvalidFile if unable to lock tmp image directory or some other reasons """ if self.check_and_validate_image(dest_id, dest_datastore): # The image is copied, presumably via some other concurrent # copy, so we move on. self._logger.info("Image %s already copied" % dest_id) raise DiskAlreadyExistException("Image already exists") # Copy image to the tmp directory. tmp_dir = self._copy_to_tmp_image(source_datastore, source_id, dest_datastore, dest_id) self._move_image(dest_id, dest_datastore, tmp_dir) def reap_tmp_images(self): """ Clean up unused directories in the temp image folder. """ for ds in self._ds_manager.get_datastores(): for image_dir in list_top_level_directory(ds.id, TMP_IMAGE_FOLDER_NAME_PREFIX): if not os.path.isdir(image_dir): continue create_time = os.stat(image_dir).st_ctime current_time = time.time() if current_time - self.REAP_TMP_IMAGES_GRACE_PERIOD < create_time: # Skip folders that are newly created in past x minutes # For example, during host-to-host transfer, hostd on # receiving end stores the uploaded file in temp images # folder but does not lock it with FileBackedLock, so we # need to allow a grace period before reaping it. 
self._logger.info("Skip folder: %s, created: %s, now: %s" % (image_dir, create_time, current_time)) continue try: with FileBackedLock(image_dir, ds.type): if os.path.exists(image_dir): self._logger.info("Delete folder %s" % image_dir) shutil.rmtree(image_dir, ignore_errors=True) except (AcquireLockFailure, InvalidFile): self._logger.info("Already locked: %s, skipping" % image_dir) except: self._logger.info("Unable to remove %s" % image_dir, exc_info=True) def get_images(self, datastore): """ Get image list from datastore :param datastore: datastore id :return: list of string, image id list """ image_ids = [] if not os.path.exists(os_datastore_root(datastore)): raise DatastoreNotFoundException() # image_folder is /vmfs/volumes/${datastore}/images_* for dir in list_top_level_directory(datastore, IMAGE_FOLDER_NAME_PREFIX): image_id = dir.split(COMPOND_PATH_SEPARATOR)[1] if self.check_image(image_id, datastore): image_ids.append(image_id) return image_ids def get_datastore_id_from_path(self, image_path): """Extract datastore id from the absolute path of an image. The image path looks something like this: /vmfs/volumes/datastore1/image_ttylinux/ttylinux.vmdk This method returns "datastore1" with this input. """ return image_path.split(os.sep)[3] def get_image_id_from_path(self, image_path): """Extract image id from the absolute path of an image. The image path looks something like this: /vmfs/volumes/datastore1/images_ttylinux/ttylinux.vmdk This method returns "ttylinux" with this input. """ return image_path.split(os.sep)[4].split(COMPOND_PATH_SEPARATOR)[1] def create_image(self, image_id, datastore_id): """ Create a temp image on given datastore, return its path. """ datastore_type = self._get_datastore_type(datastore_id) if datastore_type == DatastoreType.VSAN: # on VSAN, tmp_dir is [datastore]/image_[image_id]/tmp_image_[uuid] # Because VSAN does not allow moving top-level directories, we place tmp_image # under image's dir. 
relative_path = os.path.join(compond_path_join(IMAGE_FOLDER_NAME_PREFIX, image_id), compond_path_join(TMP_IMAGE_FOLDER_NAME_PREFIX, str(uuid.uuid4()))) tmp_dir = os_datastore_path(datastore_id, relative_path) else: # on VMFS/NFS/etc, tmp_dir is [datastore]/tmp_image_[uuid] tmp_dir = os_datastore_path(datastore_id, compond_path_join(TMP_IMAGE_FOLDER_NAME_PREFIX, str(uuid.uuid4()))) self._host_client.make_directory(tmp_dir) # return datastore path, so that it can be passed to nfc client return os_to_datastore_path(tmp_dir) def finalize_image(self, datastore_id, tmp_dir, image_id): """ Installs an image using image data staged at a temp directory. """ self._move_image(image_id, datastore_id, datastore_to_os_path(tmp_dir)) self._create_image_timestamp_file(self._image_directory(datastore_id, image_id)) def create_image_with_vm_disk(self, datastore_id, tmp_dir, image_id, vm_disk_os_path): """ Fills a temp image directory with a disk from a VM, then installs directory in the shared image folder. 
""" # Create parent directory as required by CopyVirtualDisk_Task dst_vmdk_path = os.path.join(datastore_to_os_path(tmp_dir), "%s.vmdk" % image_id) if os.path.exists(dst_vmdk_path): self._logger.warning("Unexpected disk %s present, overwriting" % dst_vmdk_path) self._host_client.copy_disk(vm_disk_os_path, dst_vmdk_path) try: self.finalize_image(datastore_id, tmp_dir, image_id) except: self._logger.warning("Delete copied disk %s" % dst_vmdk_path) self._host_client.delete_disk(dst_vmdk_path) raise def delete_tmp_dir(self, datastore_id, tmp_dir): """ Deletes a temp image directory by moving it to a GC directory """ file_path = os_datastore_path(datastore_id, tmp_dir) if not os.path.exists(file_path): self._logger.info("Tmp dir %s not" % file_path) raise DirectoryNotFound("Directory %s not found" % file_path) rm_rf(file_path) @staticmethod def _read_marker_file(filename): with open(filename, "r") as marker_file: start_time_str = marker_file.read() return float(start_time_str) """ Delete a single image following the delete image steps. This method is supposed to be safe when run concurrently with: a) itself, b) image creation/copy, c) vm creation The steps are outlined here: 1) Read content of the unused_image_marker file. If error, move on to next image. 2) Acquire image-lock. 3) Read the mod time on the t-stamp file. If t-stamp file doesn't exist go to 6. 4) If the mod time of the t-stamp file is newer than the content of the marker file move on to next image. 5) Move the t-stamp file to another name. 6) Check the mod time on the new name of the t-stamp file. if the mod time has changed, move on to next image. 7) move image directory to a trash location This method returns True if the image was removed, False if the image could not be removed. 
""" def delete_image(self, datastore_id, image_id, grace_period): self._logger.info("delete_image: Starting to delete image: %s, %s" % (datastore_id, image_id)) image_dir = self._image_directory(datastore_id, image_id) ds_type = self._get_datastore_type(datastore_id) marker_pathname = os.path.join(image_dir, self.UNUSED_IMAGE_MARKER_FILE_NAME) timestamp_pathname = os.path.join(image_dir, self.IMAGE_TIMESTAMP_FILE_NAME) try: with FileBackedLock(image_dir, ds_type): # Read marker file to determine when image scanner marked this image as unused marker_time = self._read_marker_file(marker_pathname) self._logger.info("delete_image: image was marked as unused at: %s" % marker_time) # Subtract grace time to avoid errors due to small difference in clock # values on different hosts. Pretend the scan started 60 seconds earlier. marker_time -= grace_period # Read timestamp mod_time to determine the latest vm creation using this image timestamp_exists, mod_time = self._get_mod_time(timestamp_pathname) self._logger.info("delete_image: image was last touched at: %s, %s" % (timestamp_exists, mod_time)) # Image was touched (due to VM creation) after scanner marked it as unused. 
# Remove unused image marker file if timestamp_exists and mod_time >= marker_time: self._logger.info("delete_image: image is in-use, do not delete") os.unlink(marker_pathname) return False # Delete image directory self._logger.info("delete_image: removing image directory: %s" % image_dir) if ds_type == DatastoreType.VSAN: # Special handling on VSAN before deleting an image self._delete_image_on_vsan(datastore_id, image_id) else: self._host_client.delete_file(image_dir) return True except Exception: self._logger.exception("delete_image: failed to delete image") return False # Special handling on VSAN # need to delete vdisk then osfs namespace in separate steps def _delete_image_on_vsan(self, datastore_id, image_id): self._logger.info("_delete_image_on_vsan: datastore_id=%s, image_id=%s" % (datastore_id, image_id)) # clear ddb.deletable flag in .vmdk file which would otherwise cause # Vmacore::File::PermissionDeniedException (PR 1704935) vmdk_path = os_vmdk_path(datastore_id, image_id, IMAGE_FOLDER_NAME_PREFIX) temp_path = "%s~" % vmdk_path pattern = re.compile("^ddb.deletable = ") disk_file = open(vmdk_path) temp_file = open(temp_path, "w+") for line in disk_file: if not pattern.match(line): temp_file.write(line) else: self._logger.info("_delete_image_on_vsan: skip %s" % line) temp_file.close() disk_file.close() os.rename(temp_path, vmdk_path) # delete vdisk self._host_client.delete_file(vmdk_path) # delete folder content which would otherwise cause vim.fault.DirectoryNotEmpty (PR 1721520) image_dir = self._image_directory(datastore_id, image_id) for entry in os.listdir(image_dir): if not entry.startswith('.') or entry.endswith(".lck"): self._logger.info("_delete_image_on_vsan: delete %s" % os.path.join(image_dir, entry)) entry_full_path = os.path.join(image_dir, entry) if os.path.isdir(entry_full_path): rm_rf(entry_full_path) else: os.unlink(entry_full_path) # delete folder (osfs namespace) self._host_client.delete_file(image_dir) # Read the mod time on a file, 
returns two values, a boolean which is set to true if the # file exists, otherwise set to false and the mod time of the existing file def _get_mod_time(self, pathname): try: mod_time = os.path.getmtime(pathname) except OSError as ex: self._logger.warning("Cannot read mod time for file: %s, %s" % (pathname, ex)) if ex.errno == errno.ENOENT: return False, 0 else: raise ex return True, mod_time def get_timestamp_mod_time_from_dir(self, dirname): filename = os.path.join(dirname, self.IMAGE_TIMESTAMP_FILE_NAME) return self._get_mod_time(filename) def _create_image_timestamp_file(self, dirname): try: timestamp_pathname = os.path.join(dirname, self.IMAGE_TIMESTAMP_FILE_NAME) open(timestamp_pathname, 'w').close() except Exception as ex: self._logger.exception("Exception creating %s, %s" % (dirname, ex)) raise ex def _image_directory(self, datastore_id, image_id): return os.path.dirname(os_vmdk_path(datastore_id, image_id, IMAGE_FOLDER_NAME_PREFIX))
class StatsPublisher(object):
    """ Periodically publishes metrics read from a time series db.

    Each publish run reads, for every key in the db, the values newer than
    the last seen timestamp and hands them to the first registered publisher.
    After publish_try_count consecutive failed runs the wait interval of the
    periodic thread is raised to failed_publish_interval_secs; the first
    successful run afterwards restores the configured publish interval.
    """
    DEFAULT_PUBLISH_INTERVAL_SECS = 20.0
    DEFAULT_PUBLISH_TRY_COUNT = 10
    DEFAULT_FAILED_PUBLISH_INTERVAL_SECS = 10 * 60

    def __init__(
        self,
        tsdb,
        publish_try_count=DEFAULT_PUBLISH_TRY_COUNT,
        failed_publish_interval_secs=DEFAULT_FAILED_PUBLISH_INTERVAL_SECS,
    ):
        """
        Args:
            tsdb: db object providing get_keys() and get_values_since()
            publish_try_count: number of failed runs before backing off
            failed_publish_interval_secs: wait interval used while backed off
        """
        self._logger = logging.getLogger(__name__)
        self._db = tsdb
        self._last_seen_ts = 0
        self.failed_count = 0
        self.publish_try_count = publish_try_count
        self.failed_publish_interval_secs = failed_publish_interval_secs
        # True while the periodic thread runs at the failure interval.
        self._backed_off = False

        # XXX plugin configuration should be decoupled from agent_config arg
        # parsing
        self._agent_config = common.services.get(ServiceName.AGENT_CONFIG)
        self._hostname = self._agent_config.hostname
        if self._hostname is None:
            self._hostname = socket.gethostname()
        self._publish_interval_secs = float(
            self._agent_config.__dict__.get("stats_publish_interval",
                                            StatsPublisher.DEFAULT_PUBLISH_INTERVAL_SECS)
        )

        self._publisher_thread = None
        self._publishers = []

    def start_publishing(self):
        """ Start the daemon thread that calls publish() periodically. """
        self._publisher_thread = Periodic(self.publish, self._publish_interval_secs)
        self._publisher_thread.daemon = True
        self._publisher_thread.start()

    def stop_publishing(self):
        """ Stop the periodic thread if it was started. """
        if self._publisher_thread is not None:
            self._publisher_thread.stop()

    def register_publisher(self, publisher):
        """ Add a new publisher

        Args:
            publisher: Publisher instance
        """
        self._publishers.append(publisher)

    def configure_publishers(self):
        """ Register a GraphitePublisher built from the agent config. """
        stats_store_endpoint = self._agent_config.stats_store_endpoint
        stats_store_port = self._agent_config.stats_store_port
        stats_host_tags = self._agent_config.stats_host_tags

        pm_publisher = GraphitePublisher(
            hostname=self._hostname,
            carbon_host=stats_store_endpoint,
            carbon_port=stats_store_port,
            host_tags=stats_host_tags,
        )
        self.register_publisher(pm_publisher)
        self._logger.info("Stats publisher configured")

    def _set_wait_interval(self, interval_secs):
        """ Update the periodic thread interval, if the thread exists. """
        # publish() may be called directly before start_publishing().
        if self._publisher_thread is not None:
            self._publisher_thread.update_wait_interval(interval_secs)

    def publish(self):
        """ Publish all values newer than the last seen timestamp.

        Does nothing when no publisher is registered. The last seen
        timestamp is advanced whether or not the publisher succeeds.
        """
        if not self._publishers:
            self._logger.debug("No publishers found.")
            return

        retrieved_stats = {}
        latest_ts = self._last_seen_ts

        metrics = self._db.get_keys()
        self._logger.debug("DB metrics size %d" % len(metrics))
        for metric in metrics:
            values = self._db.get_values_since(self._last_seen_ts, metric)
            retrieved_stats[metric] = values
            if values:
                latest_ts = max(latest_ts, max([x[0] for x in values]))

        self._last_seen_ts = latest_ts
        if retrieved_stats:
            # Use first publisher by default for now
            publisher = self._publishers[0]
            published = publisher.publish(retrieved_stats)
            if not published:
                self.failed_count += 1
                self._logger.critical("Publisher failed to publish stats, failed_count:%s" % str(self.failed_count))
            elif self.failed_count > 0 or self._backed_off:
                # Recovered: go back to the configured interval. The
                # _backed_off flag is needed because failed_count is reset
                # to 0 when the back-off starts.
                self.failed_count = 0
                self._backed_off = False
                self._set_wait_interval(self._publish_interval_secs)
        else:
            self._logger.debug("No metrics to send")

        if self.failed_count >= self.publish_try_count:
            self.failed_count = 0
            self._logger.critical(
                "Too many failed attempts to publish stats. Publisher will sleep for %s seconds now"
                % str(self.failed_publish_interval_secs)
            )
            self._backed_off = True
            self._set_wait_interval(self.failed_publish_interval_secs)
def monitor_for_cleanup(self, reap_interval=DEFAULT_TMP_IMAGES_CLEANUP_INTERVAL):
    """ Start a daemon Periodic that runs reap_tmp_images.

    :param reap_interval: interval passed to Periodic
    """
    reaper = Periodic(self.reap_tmp_images, reap_interval)
    self._image_reaper = reaper
    reaper.daemon = True
    reaper.start()
class EsxImageManager(ImageManager):
    """ Image manager working on datastore paths of an ESX host.

    Images are looked up, copied and moved through os level file calls, the
    vim client and the datastore manager passed to the constructor. A
    periodic reaper removes stale temp image directories.
    """
    NUM_MAKEDIRS_ATTEMPTS = 10
    DEFAULT_TMP_IMAGES_CLEANUP_INTERVAL = 600.0
    REAP_TMP_IMAGES_GRACE_PERIOD = 600.0
    IMAGE_MARKER_FILE_NAME = "unused_image_marker.txt"
    IMAGE_TIMESTAMP_FILE_NAME = "image_timestamp.txt"
    IMAGE_TIMESTAMP_FILE_RENAME_SUFFIX = ".renamed"

    def __init__(self, vim_client, ds_manager):
        super(EsxImageManager, self).__init__()
        self._logger = logging.getLogger(__name__)
        self._vim_client = vim_client
        self._ds_manager = ds_manager
        self._image_reaper = None

    def monitor_for_cleanup(self, reap_interval=DEFAULT_TMP_IMAGES_CLEANUP_INTERVAL):
        """ Start a daemon Periodic that runs reap_tmp_images. """
        self._image_reaper = Periodic(self.reap_tmp_images, reap_interval)
        self._image_reaper.daemon = True
        self._image_reaper.start()

    def cleanup(self):
        """ Stop the reaper if it was started. """
        if self._image_reaper is not None:
            self._image_reaper.stop()

    @log_duration
    def check_image(self, image_id, datastore):
        """ Return True if the image's vmdk path exists on the datastore. """
        image_vmdk = os_vmdk_path(datastore, image_id, IMAGE_FOLDER_NAME_PREFIX)

        try:
            return os.path.exists(image_vmdk)
        except Exception:
            self._logger.exception("Error looking up %s" % image_vmdk)
            return False

    def check_and_validate_image(self, image_id, ds_id):
        """ Return True if both the image directory and its timestamp file exist.

        This method is intended as a replacement of check_image in the vm
        creation workflow compatible with the new image sweeper. For an image
        to be valid both the directory and the image timestamp file must
        exist on the datastore.
        """
        image_dir = os.path.dirname(
            os_vmdk_path(ds_id, image_id, IMAGE_FOLDER_NAME_PREFIX))

        try:
            if not os.path.exists(image_dir):
                return False
        except Exception:
            self._logger.exception("Error looking up %s" % image_dir)
            return False

        # Check the existence of the timestamp file
        timestamp_pathname = os.path.join(image_dir, self.IMAGE_TIMESTAMP_FILE_NAME)
        try:
            if os.path.exists(timestamp_pathname):
                return True
        except Exception as ex:
            self._logger.exception(
                "Exception looking up %s, %s" % (timestamp_pathname, ex))
            return False

        return False

    def touch_image_timestamp(self, ds_id, image_id):
        """ Update the mod time on the image timestamp file.

        :param ds_id: id of the datastore holding the image
        :param image_id: id of the image
        :raises: any exception raised by os.utime, after logging it
        """
        image_path = os.path.dirname(
            os_vmdk_path(ds_id, image_id, IMAGE_FOLDER_NAME_PREFIX))

        # Touch the timestamp file
        timestamp_pathname = os.path.join(image_path, self.IMAGE_TIMESTAMP_FILE_NAME)
        try:
            os.utime(timestamp_pathname, None)
        except Exception as ex:
            self._logger.exception(
                "Exception looking up %s, %s" % (timestamp_pathname, ex))
            # Bare raise keeps the original traceback.
            raise

    @log_duration
    def check_image_dir(self, image_id, datastore):
        """ Return True if the directory holding the image's vmdk exists. """
        image_path = os_vmdk_path(datastore, image_id, IMAGE_FOLDER_NAME_PREFIX)
        try:
            return os.path.exists(os.path.dirname(image_path))
        except Exception:
            self._logger.error(
                "Error looking up %s" % image_path, exc_info=True)
            return False

    def get_image_directory_path(self, datastore_id, image_id):
        return image_directory_path(datastore_id, image_id)

    def get_image_path(self, datastore_id, image_id):
        return os_vmdk_path(datastore_id, image_id, IMAGE_FOLDER_NAME_PREFIX)

    def image_size(self, image_id):
        """ Return the size of the image's flat vmdk file.

        The image datastores are tried in order; the first one where the
        file size can be read wins.

        :raises NoSuchResourceException: if no image datastore has the file
        """
        for image_ds in self._ds_manager.image_datastores():
            try:
                image_path = os_vmdk_flat_path(image_ds, image_id, IMAGE_FOLDER_NAME_PREFIX)
                return os.path.getsize(image_path)
            except os.error:
                self._logger.info("Image %s not found in DataStore %s" % (image_id, image_ds))
                self._logger.warning("Failed to get image size:", exc_info=True)

        # Failed to access shared image.
        raise NoSuchResourceException(
            ResourceType.IMAGE, "Image does not exist.")

    def _load_json(self, metadata_path):
        """ Return the parsed json file, or {} if it is missing or invalid. """
        if os.path.exists(metadata_path):
            with open(metadata_path) as fh:
                try:
                    return json.load(fh)
                except ValueError:
                    self._logger.error(
                        "Error loading metadata file %s" % metadata_path,
                        exc_info=True)
        return {}

    def get_image_metadata(self, image_id, datastore):
        metadata_path = os_metadata_path(datastore, image_id, IMAGE_FOLDER_NAME_PREFIX)
        self._logger.info("Loading metadata %s" % metadata_path)
        return self._load_json(metadata_path)

    def _get_datastore_type(self, datastore_id):
        """ Return the type of the datastore with the given id.

        Raises IndexError if the datastore manager does not list the id.
        """
        datastores = self._ds_manager.get_datastores()
        return [ds.type for ds in datastores if ds.id == datastore_id][0]

    def _prepare_virtual_disk_spec(self, disk_type, adapter_type):
        """
        :param disk_type [vim.VirtualDiskManager.VirtualDiskType]:
        :param adapter_type [vim.VirtualDiskManager.VirtualDiskAdapterType]:
        """
        _vd_spec = vim.VirtualDiskManager.VirtualDiskSpec()
        _vd_spec.diskType = str(disk_type)
        _vd_spec.adapterType = str(adapter_type)

        return _vd_spec

    def _copy_to_tmp_image(self, source_datastore, source_id, dest_datastore, dest_id):
        """ Copy an image into a temp location.

        1. Create the temp directory. The temp directory name contains a
           random UUID to prevent collisions with concurrent copies.
        2. Copy the metadata file over, if the source has one.
        3. Create the timestamp file.
        4. Copy the vmdk over.

        @return the tmp image directory on success.
        """
        ds_type = self._get_datastore_type(dest_datastore)
        tmp_folder_name = compond_path_join(TMP_IMAGE_FOLDER_NAME_PREFIX, str(uuid.uuid4()))
        if ds_type == DatastoreType.VSAN:
            tmp_image_dir = os_datastore_path(dest_datastore,
                                              compond_path_join(IMAGE_FOLDER_NAME_PREFIX, dest_id),
                                              tmp_folder_name)
        else:
            tmp_image_dir = os_datastore_path(dest_datastore, tmp_folder_name)

        # Create the temp directory
        self._vim_client.make_directory(tmp_image_dir)

        # Copy the metadata file if it exists.
        source_meta = os_metadata_path(source_datastore, source_id, IMAGE_FOLDER_NAME_PREFIX)
        if os.path.exists(source_meta):
            try:
                dest_meta = os.path.join(tmp_image_dir, metadata_filename(dest_id))
                shutil.copy(source_meta, dest_meta)
            except:
                self._logger.exception("Failed to copy metadata file %s", source_meta)
                raise

        # Create the timestamp file
        self._create_image_timestamp_file(tmp_image_dir)

        _vd_spec = self._prepare_virtual_disk_spec(
            vim.VirtualDiskManager.VirtualDiskType.thin,
            vim.VirtualDiskManager.VirtualDiskAdapterType.lsiLogic)

        self._manage_disk(vim.VirtualDiskManager.CopyVirtualDisk_Task,
                          sourceName=vmdk_path(source_datastore, source_id, IMAGE_FOLDER_NAME_PREFIX),
                          destName=os_to_datastore_path(os.path.join(tmp_image_dir, "%s.vmdk" % dest_id)),
                          destSpec=_vd_spec)
        return tmp_image_dir

    def _move_image(self, image_id, datastore, tmp_dir):
        """ Move a tmp folder into the image datastore.

        Handles concurrent moves by locking the destination image path while
        doing the move, so that only one move is successful.

        Has the following side effects:
            a - If the destination image already exists, it is assumed that
                someone else successfully copied the image over and
                DiskAlreadyExistException is raised.
            b - On any failure inside the locked section (including failing
                to acquire the lock) the tmp image directory is deleted
                through the vim client before the error is re-raised.

        image_id: String.The image id of the image being moved.
        datastore: String. The datastore id of the datastore.
        tmp_dir: String. The absolute path of the temp image directory.

        raises: ImageNotFoundException if tmp_dir does not exist
                DiskAlreadyExistException if the image already exists
                OsError if the move fails
                AcquireLockFailure, InvalidFile if we fail to lock the
                destination image.
        """
        ds_type = self._get_datastore_type(datastore)
        image_path = os_datastore_path(datastore, compond_path_join(IMAGE_FOLDER_NAME_PREFIX, image_id))
        self._logger.info("_move_image: %s => %s, ds_type: %s" % (tmp_dir, image_path, ds_type))

        if not os.path.exists(tmp_dir):
            raise ImageNotFoundException("Temp image %s not found" % tmp_dir)

        try:
            # 300 retries * 0.01s: wait for the lock for about 3 seconds
            with FileBackedLock(image_path, ds_type, retry=300, wait_secs=0.01):
                if self._check_image_repair(image_id, datastore):
                    raise DiskAlreadyExistException("Image already exists")

                if ds_type == DatastoreType.VSAN:
                    # on VSAN, move all files under [datastore]/image_[image_id]/tmp_image_[uuid]/* to
                    # [datastore]/image_[image_id]/*.
                    # Also we do not delete tmp_image folder in success case, because VSAN accesses it
                    # when creating linked VM, even the folder is now empty.
                    for entry in os.listdir(tmp_dir):
                        shutil.move(os.path.join(tmp_dir, entry), os.path.join(image_path, entry))
                else:
                    # on VMFS/NFS/etc, rename [datastore]/tmp_image_[uuid] to [datastore]/image_[image_id]
                    self._vim_client.move_file(tmp_dir, image_path)
        except:
            # Clean up on any failure, then propagate it to the caller.
            self._logger.exception("Move image %s to %s failed" % (image_id, image_path))
            self._vim_client.delete_file(tmp_dir)
            raise

    def _check_image_repair(self, image_id, datastore):
        """ Check that an image exists, recreating its timestamp file if needed.

        With the new image delete path the "timestamp" file must exist
        inside the image directory. If the vmdk exists and the timestamp file
        does not, it may mean that an image delete operation was aborted
        mid-way. In this case this method recreates the timestamp file.
        The caller is required to hold the image directory lock
        (FileBackedLock).

        :return: True if the image exists (possibly after repair), False if
            the vmdk is missing or the repair failed
        """
        vmdk_pathname = os_vmdk_path(datastore, image_id, IMAGE_FOLDER_NAME_PREFIX)
        image_dirname = os.path.dirname(vmdk_pathname)

        try:
            # Check vmdk file
            if not os.path.exists(vmdk_pathname):
                self._logger.info("Vmdk path doesn't exists: %s" % vmdk_pathname)
                return False
        except Exception as ex:
            self._logger.exception(
                "Exception validating %s, %s" % (image_dirname, ex))
            return False

        # Check timestamp file
        timestamp_pathname = os.path.join(image_dirname, self.IMAGE_TIMESTAMP_FILE_NAME)
        try:
            if os.path.exists(timestamp_pathname):
                self._logger.info("Timestamp file exists: %s" % timestamp_pathname)
                return True
        except Exception as ex:
            # Fall through to the repair below.
            self._logger.exception(
                "Exception validating %s, %s" % (timestamp_pathname, ex))

        # The timestamp file is not accessible,
        # try creating one, if successful try to
        # delete the renamed timestamp file if it
        # exists
        try:
            self._create_image_timestamp_file(image_dirname)
            self._delete_renamed_image_timestamp_file(image_dirname)
        except Exception as ex:
            self._logger.exception(
                "Exception creating %s, %s" % (timestamp_pathname, ex))
            return False

        self._logger.info("Image repaired: %s" % image_dirname)
        return True

    def copy_image(self, source_datastore, source_id, dest_datastore, dest_id):
        """Copy an image between datastores.

        This method is used to create a "full clone" of a vmdk.
        It does so by copying a disk to a unique directory in a well known
        temporary directory then moving the disk to the destination image
        location. Data in the temporary directory not properly cleaned up
        will be periodically garbage collected by the reaper thread.

        This minimizes the window during which the vmdk path exists with
        incomplete content. It also works around a hostd issue where
        cp -f does not work.

        When the destination image already exists (directory and timestamp
        file present) nothing is copied and DiskAlreadyExistException is
        raised.

        source_datastore: id of the source datastore
        source_id: id of the image to copy from
        dest_datastore: id of the destination datastore
        dest_id: id of the new image in the destination datastore

        throws: DiskAlreadyExistException if the destination image exists
        throws: AcquireLockFailure if timed out waiting to acquire lock on
                the destination image directory
        throws: InvalidFile if unable to lock the destination image
                directory or some other reasons
        """
        if self.check_and_validate_image(dest_id, dest_datastore):
            # The image is copied, presumably via some other concurrent
            # copy, so we move on.
            self._logger.info("Image %s already copied" % dest_id)
            raise DiskAlreadyExistException("Image already exists")

        # Copy image to the tmp directory.
        tmp_dir = self._copy_to_tmp_image(source_datastore, source_id,
                                          dest_datastore, dest_id)

        self._move_image(dest_id, dest_datastore, tmp_dir)

    def reap_tmp_images(self):
        """ Clean up unused directories in the temp image folder.

        Removes each temp image directory that is older than
        REAP_TMP_IMAGES_GRACE_PERIOD and whose FileBackedLock can be
        acquired. Failures on one directory are logged and do not stop the
        sweep.
        """
        for ds in self._ds_manager.get_datastores():
            for image_dir in list_top_level_directory(ds.id, TMP_IMAGE_FOLDER_NAME_PREFIX):
                if not os.path.isdir(image_dir):
                    continue

                # NOTE: st_ctime is the inode change time on POSIX systems; it
                # is used here as an approximation of the creation time.
                create_time = os.stat(image_dir).st_ctime
                current_time = time.time()
                if current_time - self.REAP_TMP_IMAGES_GRACE_PERIOD < create_time:
                    # Skip folders that are newly created in past x minutes
                    # For example, during host-to-host transfer, hostd on
                    # receiving end stores the uploaded file in temp images
                    # folder but does not lock it with FileBackedLock, so we
                    # need to allow a grace period before reaping it.
                    self._logger.info(
                        "Skip folder: %s, created: %s, now: %s" %
                        (image_dir, create_time, current_time))
                    continue

                try:
                    with FileBackedLock(image_dir, ds.type):
                        if os.path.exists(image_dir):
                            self._logger.info("Delete folder %s" % image_dir)
                            shutil.rmtree(image_dir, ignore_errors=True)
                except (AcquireLockFailure, InvalidFile):
                    self._logger.info("Already locked: %s, skipping" % image_dir)
                except Exception:
                    # Best effort: log and move on to the next directory, but
                    # do not swallow SystemExit/KeyboardInterrupt.
                    self._logger.info("Unable to remove %s" % image_dir, exc_info=True)

    def get_images(self, datastore):
        """ Get image list from datastore
        :param datastore: datastore id
        :return: list of string, image id list
        :raises DatastoreNotFoundException: if the datastore root path does
            not exist
        """
        image_ids = []

        if not os.path.exists(os_datastore_root(datastore)):
            raise DatastoreNotFoundException()

        # image_folder is /vmfs/volumes/${datastore}/images_*
        for image_folder in list_top_level_directory(datastore, IMAGE_FOLDER_NAME_PREFIX):
            image_id = image_folder.split(COMPOND_PATH_SEPARATOR)[1]
            if self.check_image(image_id, datastore):
                image_ids.append(image_id)

        return image_ids

    def _unzip(self, src, dst):
        """ Decompress the gzip file src into the file dst. """
        self._logger.info("unzip %s -> %s" % (src, dst))

        fsrc = gzip.open(src, "rb")
        try:
            # Nested so the gzip handle is released even when dst cannot be
            # opened.
            fdst = open(dst, "wb")
            try:
                shutil.copyfileobj(fsrc, fdst)
            finally:
                fdst.close()
        finally:
            fsrc.close()

    def _copy_disk(self, src, dst):
        self._manage_disk(vim.VirtualDiskManager.CopyVirtualDisk_Task,
                          sourceName=src, destName=dst)

    def _manage_disk(self, op, **kwargs):
        """ Invoke a virtual disk manager operation and wait for its task.

        :param op: operation to call with self._manager and kwargs
        :raises DiskAlreadyExistException: on vim.Fault.FileAlreadyExists
        :raises DiskFileException: on vim.Fault.FileFault
        """
        try:
            self._logger.debug("Invoking %s(%s)" % (op.info.name, kwargs))
            task = op(self._manager, **kwargs)
            self._vim_client.wait_for_task(task)
        except vim.Fault.FileAlreadyExists as e:
            raise DiskAlreadyExistException(e.msg)
        except vim.Fault.FileFault as e:
            raise DiskFileException(e.msg)