def get(self, token_prefix, dataset_prefix):
    required_privileges = [
        Privileges.RO_WATCH_DATASET,
        Privileges.ADMIN_EDIT_TOKEN
    ]

    # The request must carry a token with at least one of the required privileges.
    _, token = self.token_parser.parse_args(required_any_token_privileges=required_privileges)

    full_dataset_url_prefix = "{}/{}".format(token_prefix, dataset_prefix)

    dataset = DatasetFactory(token).get_dataset(full_dataset_url_prefix)

    # Sum the sizes of all files referenced by the dataset's elements and cache the result on the dataset.
    total_size = global_config.get_storage().get_files_size(
        [element.file_ref_id for element in dataset.elements])

    dataset.size = total_size
    self.session.flush()

    return total_size, 200
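# A minimal in-process sketch of the size computation performed by the handler above,
# assuming a `token` object has already been parsed and authorized elsewhere (token parsing
# is omitted here); "mytoken/mydataset" is a placeholder URL prefix, not a real one.
def example_dataset_size(token):
    dataset = DatasetFactory(token).get_dataset("mytoken/mydataset")
    # Same aggregation the endpoint performs: sum the stored sizes of every referenced file.
    return global_config.get_storage().get_files_size(
        [element.file_ref_id for element in dataset.elements])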
def __init__(self, storage=None):
    if storage is None:
        storage = global_config.get_storage()

    self.storage = storage

    drive_folder = global_config.get_google_drive_folder()

    i("Initializing PYGFolder...")
    self.pygfolder = PyGFolder()
    i("Done")

    # Normalize the configured folder path and drop a marker file holding the current timestamp.
    if drive_folder.endswith("/"):
        drive_folder = drive_folder[:-1]

    init_file = drive_folder + "/init"

    i("Creating init file with current timestamp...")
    self.pygfolder[init_file] = as_bytes(now())
    i("Done")

    i("Accessing folder...")
    self.pygfolder = self.pygfolder[drive_folder]
    i("Done")

    # Background workers for transferring files to and from Google Drive; daemonized so they
    # never block interpreter shutdown.
    self.uploader = Thread(target=self.__uploader__, daemon=True)
    self.downloader = Thread(target=self.__downloader__, daemon=True)
    self.uploader.start()
    self.downloader.start()
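# Construction sketch for the initializer above (assumption: it belongs to the Google Drive
# backed storage component; `GoogleDriveStorage` is a hypothetical stand-in for the actual
# class name). Omitting `storage` falls back to the globally configured storage, and the
# uploader/downloader threads start immediately on construction.
#
#   drive_storage = GoogleDriveStorage()                    # uses global_config.get_storage()
#   drive_storage = GoogleDriveStorage(storage=my_storage)  # or inject an explicit backend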
from mldatahub.config.config import global_config

# The session URI and page size must be configured before the factories and DAOs are imported.
global_config.set_session_uri("mongodb://localhost:27017/unittests")
global_config.set_page_size(2)

from mldatahub.factory.dataset_factory import DatasetFactory
from mldatahub.factory.dataset_element_factory import DatasetElementFactory
from werkzeug.exceptions import Unauthorized, BadRequest, RequestedRangeNotSatisfiable, NotFound, Conflict
from mldatahub.config.privileges import Privileges
import unittest
from mldatahub.odm.dataset_dao import DatasetDAO, DatasetCommentDAO, DatasetElementDAO, DatasetElementCommentDAO
from mldatahub.odm.file_dao import FileDAO
from mldatahub.odm.token_dao import TokenDAO

__author__ = 'Iván de Paz Centeno'

storage = global_config.get_storage()


class TestDatasetElementFactory(unittest.TestCase):

    def setUp(self):
        self.session = global_config.get_session()

        # Start every test from a clean database.
        DatasetDAO.query.remove()
        DatasetCommentDAO.query.remove()
        DatasetElementDAO.query.remove()
        DatasetElementCommentDAO.query.remove()
        FileDAO.query.remove()
        TokenDAO.query.remove()

    def test_dataset_element_creation(self):
        """
class GarbageCollector(object):
    """
    Class whose purpose is to clean the filesystem from files that lost their references.
    This is a GarbageCollector.

    To do this optimally, the storage class keeps a set with all the filenames it has worked with,
    which is guaranteed to be up-to-date and persistent across server reboots.
    It is better to iterate over this structure than over the filesystem itself.
    """
    lock = threading.Lock()
    do_stop = False
    storage = global_config.get_storage()  # type: GenericStorage
    last_tick = now() - datetime.timedelta(minutes=60)

    def __init__(self):
        self.thread = Thread(target=self.__thread_func, daemon=True)
        self.thread.start()
        self.previous_unused_files = {}

    def __stop_requested(self):
        with self.lock:
            value = self.do_stop
        return value

    def __thread_func(self):
        # Tick loop: run a collection pass whenever TIMER_TICK seconds have elapsed since the last one.
        while not self.__stop_requested():
            if (now() - self.last_tick).total_seconds() > TIMER_TICK:
                with self.lock:
                    self.last_tick = now()
                self.do_garbage_collect()
            sleep(1)

        print("[GC] Exited.")

    def collect_unused_files(self):
        """
        Searches for unused files in the DB and returns a list of their ids.
        :return: list with IDs of the unused files.
        """
        unused_files = []
        files_count = len(self.storage)
        print("[GC] {} files to be checked.".format(files_count))

        sleep_batch = 50
        files_per_second = [0]
        files_per_second_avg = 0
        time_remaining = ""

        with Measure() as timing:
            for index, file in enumerate(self.storage):
                # A file is orphan if no dataset element references it anymore.
                if DatasetElementDAO.query.get(file_ref_id=file._id) is None:
                    unused_files.append(file._id)

                if index % sleep_batch == 0:
                    sleep(0.1)

                if len(files_per_second) < 5:
                    # Sample the throughput during the first seconds to estimate the remaining time.
                    files_per_second[-1] += 1

                    if timing.elapsed().seconds >= 1:
                        files_per_second.append(0)
                        timing.reset()

                    files_per_second_avg = sum(files_per_second) / len(files_per_second)
                    time_remaining = ""
                else:
                    time_remaining = " {} remaining".format(
                        time_left_as_str((len(self.storage) - index) // files_per_second_avg))

                if self.__stop_requested():
                    break

                progress(index + 1, files_count, "{} files are orphan.{}".format(len(unused_files), time_remaining))

        print("")
        return unused_files

    def do_garbage_collect(self):
        print("[GC] Collecting garbage...")
        global_config.session.clear()

        # 1. We retrieve the unused files.
        unused_files = self.collect_unused_files()

        # 2. We check how many unused files are in common with the previous pass:
        #    only files that were already unused the last time are actually removed.
        new_unused_files = []
        remove_files = []

        print("[GC] Comparing {} unused files to previous {} unused files.".format(
            len(unused_files), len(self.previous_unused_files)))

        for index, file in enumerate(unused_files):
            if file in self.previous_unused_files:
                remove_files.append(file)
            else:
                new_unused_files.append(file)

        print("[GC] Cleaning {} elements...".format(len(remove_files)))

        # 3. We delete by batches.
        files_count = 0
        for list_ids in segment(remove_files, 50):
            files_count += len(list_ids)
            self.storage.delete_files(list_ids)
            progress(files_count, len(remove_files), "{} files garbage collected.".format(files_count))
            sleep(0.1)

        self.previous_unused_files = set(new_unused_files)
        print("[GC] Cleaned {} elements...".format(len(remove_files)))

        return files_count

    def stop(self, wait_for_finish=True):
        with self.lock:
            self.do_stop = True

        if wait_for_finish:
            self.thread.join()

    def __del__(self):
        self.stop()
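# Minimal usage sketch for the collector above. Instantiating it starts the background
# thread immediately; `stop()` requests termination and, by default, joins the thread.
# The surrounding server wiring is an assumption and not part of the original source.
def example_garbage_collector_lifecycle():
    collector = GarbageCollector()  # the GC thread now ticks every TIMER_TICK seconds
    try:
        pass  # serve requests / do work here while the collector runs in the background
    finally:
        collector.stop(wait_for_finish=True)  # ask the thread to exit and wait for it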