Code example #1
    def get(self, token_prefix, dataset_prefix):
        required_privileges = [
            Privileges.RO_WATCH_DATASET, Privileges.ADMIN_EDIT_TOKEN
        ]

        # Parse the request; any one of the listed privileges grants access.
        _, token = self.token_parser.parse_args(
            required_any_token_privileges=required_privileges)
        full_dataset_url_prefix = "{}/{}".format(token_prefix, dataset_prefix)

        dataset = DatasetFactory(token).get_dataset(full_dataset_url_prefix)

        # Sum the sizes of all files referenced by the dataset's elements and
        # persist the result on the dataset itself.
        total_size = global_config.get_storage().get_files_size(
            [element.file_ref_id for element in dataset.elements])
        dataset.size = total_size
        self.session.flush()

        return total_size, 200
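The same size computation can be done outside the HTTP layer. A minimal sketch, assuming a valid token object and the two URL prefixes are already at hand (imports taken from the test module further below; compute_dataset_size is a hypothetical helper, not part of the project):

# Sketch only: `token` is assumed to be a valid token object carrying the
# required privileges; names are reused from the handler above.
from mldatahub.config.config import global_config
from mldatahub.factory.dataset_factory import DatasetFactory

def compute_dataset_size(token, token_prefix, dataset_prefix):
    full_prefix = "{}/{}".format(token_prefix, dataset_prefix)
    dataset = DatasetFactory(token).get_dataset(full_prefix)

    # get_files_size() aggregates the stored size of every file referenced
    # by the dataset's elements.
    return global_config.get_storage().get_files_size(
        [element.file_ref_id for element in dataset.elements])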
Code example #2
    def __init__(self, storage=None):
        if storage is None:
            storage = global_config.get_storage()

        self.storage = storage

        drive_folder = global_config.get_google_drive_folder()
        i("Initializing PYGFolder...")
        self.pygfolder = PyGFolder()
        i("Done")

        if drive_folder.endswith("/"):
            drive_folder = drive_folder[:-1]

        init_file = drive_folder + "/init"
        i("Creating init file with current timestamp...")
        self.pygfolder[init_file] = as_bytes(now())
        i("Done")
        i("Accessing folder...")
        self.pygfolder = self.pygfolder[drive_folder]
        i("Done")
        # Background daemon workers that handle uploads and downloads.
        self.uploader = Thread(target=self.__uploader__, daemon=True)
        self.downloader = Thread(target=self.__downloader__, daemon=True)
        self.uploader.start()
        self.downloader.start()
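The bodies of __uploader__ and __downloader__ are not part of this excerpt. As an assumption about the pattern the constructor sets up, a daemon worker of this kind usually drains a queue until its owner asks it to stop; a minimal sketch:

# Sketch only: illustrates a typical queue-driven daemon loop, not the real
# __uploader__/__downloader__ implementation.
from queue import Empty

def worker_loop(queue, stop_requested, handle_item):
    while not stop_requested():
        try:
            item = queue.get(timeout=1)  # wake up periodically to re-check stop_requested()
        except Empty:
            continue
        handle_item(item)  # e.g. upload or download the queued file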
Code example #3
from mldatahub.config.config import global_config
# Configure the test database and page size before the rest of the project is imported.
global_config.set_session_uri("mongodb://localhost:27017/unittests")
global_config.set_page_size(2)
from mldatahub.factory.dataset_factory import DatasetFactory
from mldatahub.factory.dataset_element_factory import DatasetElementFactory
from werkzeug.exceptions import Unauthorized, BadRequest, RequestedRangeNotSatisfiable, NotFound, Conflict
from mldatahub.config.privileges import Privileges
import unittest
from mldatahub.odm.dataset_dao import DatasetDAO, DatasetCommentDAO, DatasetElementDAO, DatasetElementCommentDAO
from mldatahub.odm.file_dao import FileDAO
from mldatahub.odm.token_dao import TokenDAO


__author__ = 'Iván de Paz Centeno'

storage = global_config.get_storage()


class TestDatasetElementFactory(unittest.TestCase):

    def setUp(self):
        # Start every test from an empty database.
        self.session = global_config.get_session()
        DatasetDAO.query.remove()
        DatasetCommentDAO.query.remove()
        DatasetElementDAO.query.remove()
        DatasetElementCommentDAO.query.remove()
        FileDAO.query.remove()
        TokenDAO.query.remove()

    def test_dataset_element_creation(self):
        """
Code example #4
class GarbageCollector(object):
    """
    Class whose purpose is to clean the filesystem from files that lost references.
    This is a GarbageCollector.

    In order to do this optimally, fortunately the storage class keeps a set with all the filenames he worked with,
    it is guaranteed that it is up-to-date and persistent across server reboots.
    It is better to iterate over this structure rather than the filesystem itself.
    """
    lock = threading.Lock()
    do_stop = False
    storage = global_config.get_storage() # type: GenericStorage
    last_tick = now() - datetime.timedelta(minutes=60)

    def __init__(self):
        self.thread = Thread(target=self.__thread_func, daemon=True)
        self.thread.start()
        self.previous_unused_files = {}

    def __stop_requested(self):
        with self.lock:
            value = self.do_stop
        return value

    def __thread_func(self):
        while not self.__stop_requested():
            if (now() - self.last_tick).total_seconds() > TIMER_TICK:
                with self.lock:
                    self.last_tick = now()

                self.do_garbage_collect()
            sleep(1)
        print("[GC] Exited.")

    def collect_unused_files(self):
        """
        Searchs for unused files in the DB and returns a list of ids.
        :return: list with IDs of the unused files.
        """
        unused_files = []

        files_count = len(self.storage)

        print("[GC] {} files to be checked.".format(files_count))

        sleep_batch = 50
        files_per_second = [0]
        files_per_second_avg = 0
        time_remaining = -1

        with Measure() as timing:
            for index, file in enumerate(self.storage):
                if DatasetElementDAO.query.get(file_ref_id=file._id) is None:
                    unused_files.append(file._id)

                # Throttle the scan so the storage/DB is not hammered.
                if index % sleep_batch == 0:
                    sleep(0.1)

                # Estimate the throughput over the first few seconds, then use
                # that average to report an ETA for the rest of the scan.
                if len(files_per_second) < 5:
                    files_per_second[-1] += 1
                    if timing.elapsed().seconds >= 1:
                        files_per_second.append(0)
                        timing.reset()

                    files_per_second_avg = sum(files_per_second) / len(files_per_second)
                    time_remaining = ""
                else:
                    time_remaining = " {} remaining".format(time_left_as_str((len(self.storage) - index) // files_per_second_avg))

                if self.__stop_requested():
                    break

                progress(index+1, files_count, "{} files are orphan.{}".format(len(unused_files), time_remaining))

        print("")
        return unused_files

    def do_garbage_collect(self):
        print("[GC] Collecting garbage...")

        global_config.session.clear()

        # 1. We retrieve the unused files.
        unused_files = self.collect_unused_files()

        # 2. We check how many unused files are in common with the previous run.
        #    Only files already unused on the previous pass are removed; files seen
        #    as unused for the first time get one more pass of grace.
        new_unused_files = []
        remove_files = []
        print("[GC] Comparing {} unused files to previous {} unused files.".format(len(unused_files), len(self.previous_unused_files)))

        for file in unused_files:
            if file in self.previous_unused_files:
                remove_files.append(file)
            else:
                new_unused_files.append(file)

        print("[GC] Cleaning {} elements...".format(len(remove_files)))

        # 3. We delete by batches
        files_count = 0
        for list_ids in segment(remove_files, 50):
            files_count += len(list_ids)
            self.storage.delete_files(list_ids)
            progress(files_count, len(remove_files), "{} files garbage collected.".format(files_count))
            sleep(0.1)

        self.previous_unused_files = set(new_unused_files)

        print("[GC] Cleaned {} elements...".format(len(remove_files)))

        return files_count

    def stop(self, wait_for_finish=True):
        with self.lock:
            self.do_stop = True

        if wait_for_finish:
            self.thread.join()

    def __del__(self):
        self.stop()
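The segment() helper used in do_garbage_collect() is not defined in this excerpt; from the call site it presumably yields fixed-size chunks of a list. A minimal sketch of such a helper, together with the collector's typical lifecycle:

# Sketch only: segment() is assumed to chunk a list into fixed-size batches,
# matching how do_garbage_collect() calls it; the real implementation may differ.
def segment(items, batch_size):
    for start in range(0, len(items), batch_size):
        yield items[start:start + batch_size]

# Lifecycle of the collector defined above: the constructor starts the
# background thread, and stop() asks it to finish and waits for it.
gc = GarbageCollector()
# ... server runs ...
gc.stop(wait_for_finish=True)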