def _put_dataset(self, path: str, username: str, owner: str) -> Dataset:
    # Validate that given path contains a dataset
    _ = self.load_dataset_from_directory(path)

    p = os.path.join(self.inventory_root, username, owner, 'datasets')
    dir_name = os.path.basename(path)
    if os.path.exists(p) and dir_name in os.listdir(p):
        raise InventoryException(
            f"Dataset directory {dir_name} already exists")

    if not os.path.exists(p):
        os.makedirs(p, exist_ok=True)

    if os.path.exists(os.path.join(p, dir_name)):
        raise InventoryException(
            f"Dataset directory {dir_name} already exists")

    final_path = shutil.move(path, p)
    assert os.path.dirname(final_path) != 'datasets', "shutil.move used incorrectly"

    ds = self.load_dataset_from_directory(final_path)

    # Link dataset objects
    ds.namespace = owner
    m = Manifest(ds, logged_in_username=username)
    m.link_revision()

    return ds
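
# A minimal sketch of the destination layout _put_dataset produces, relying on the module-level
# `os` import used throughout this file. The inventory root, user, owner, and dataset names below
# are hypothetical; only the <root>/<username>/<owner>/datasets/<name> layout comes from the
# method above.
def _example_put_dataset_layout(inventory_root: str = '/mnt/gigantum') -> str:
    """Return the directory a dataset named 'my-dataset' would be moved into."""
    username = 'alice'   # hypothetical active user
    owner = 'alice'      # hypothetical namespace
    # Mirrors the join performed at the top of _put_dataset
    return os.path.join(inventory_root, username, owner, 'datasets', 'my-dataset')
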
def put_labbook(self, path: str, username: str, owner: str) -> LabBook:
    """Take a given path to a candidate labbook and insert it into its proper place in the file system.

    Args:
        path: Path to a given labbook
        username: Active username
        owner: Intended owner of labbook

    Returns:
        LabBook
    """
    try:
        lb = self._put_labbook(path, username, owner)

        # Init dataset submodules if present
        if len(lb.git.repo.submodules) > 0:
            # Link datasets
            for submodule in lb.git.list_submodules():
                try:
                    namespace, dataset_name = submodule['name'].split("&")
                    rel_submodule_dir = os.path.join(
                        '.gigantum', 'datasets', namespace, dataset_name)
                    submodule_dir = os.path.join(lb.root_dir, rel_submodule_dir)

                    call_subprocess(
                        ['git', 'submodule', 'init', rel_submodule_dir],
                        cwd=lb.root_dir, check=True)
                    call_subprocess(
                        ['git', 'submodule', 'update', rel_submodule_dir],
                        cwd=lb.root_dir, check=True)

                    ds = InventoryManager().load_dataset_from_directory(submodule_dir)
                    ds.namespace = namespace

                    manifest = Manifest(ds, username)
                    manifest.link_revision()
                except Exception as err:
                    logger.exception(
                        f"Failed to import submodule: {submodule['name']}")
                    continue

        return lb
    except Exception as e:
        logger.error(e)
        raise InventoryException(e)
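
# Hedged usage sketch for put_labbook: import a labbook directory that was unpacked elsewhere
# (e.g. from an extracted archive) into the inventory. The path and account names are hypothetical;
# only the put_labbook(path, username, owner) signature comes from the method above.
def _example_import_labbook(im: 'InventoryManager') -> 'LabBook':
    candidate_path = '/tmp/extracted/my-labbook'  # hypothetical unpacked labbook directory
    # Moves the directory into place and re-links any dataset submodules it references
    return im.put_labbook(candidate_path, username='alice', owner='alice')
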
def create_dataset(self, username: str, owner: str, dataset_name: str, storage_type: str,
                   description: Optional[str] = None, author: Optional[GitAuthor] = None) -> Dataset:
    """Create a new Dataset in this Gigantum working directory.

    Args:
        username: Active username
        owner: Namespace in which to place this Dataset
        dataset_name: Name of the Dataset
        storage_type: String identifying the type of Dataset to instantiate
        description: Optional brief description of Dataset
        author: Optional Git Author

    Returns:
        Newly created Dataset instance
    """
    dataset = Dataset(config_file=self.config_file, author=author, namespace=owner)

    if storage_type not in SUPPORTED_STORAGE_BACKENDS:
        raise ValueError(f"Unsupported Dataset storage type: {storage_type}")

    try:
        build_info = Configuration(self.config_file).config['build_info']
    except KeyError:
        logger.warning("Could not obtain build_info from config")
        build_info = None

    # Build data file contents
    dataset._data = {
        "schema": DATASET_CURRENT_SCHEMA,
        "id": uuid.uuid4().hex,
        "name": dataset_name,
        "storage_type": storage_type,
        "description": description or '',
        "created_on": datetime.datetime.utcnow().isoformat(),
        "build_info": build_info
    }
    dataset._validate_gigantum_data()

    logger.info("Creating new Dataset on disk for {}/{}/{}".format(username, owner, dataset_name))

    # Lock while creating initial directory
    with dataset.lock(lock_key=f"new_dataset_lock|{username}|{owner}|{dataset_name}"):
        # Verify or create user subdirectory
        # Make sure you expand a user dir string
        starting_dir = os.path.expanduser(
            dataset.client_config.config["git"]["working_directory"])
        user_dir = os.path.join(starting_dir, username)
        if not os.path.isdir(user_dir):
            os.makedirs(user_dir)

        # Create owner dir - store LabBooks in working dir > logged in user > owner
        owner_dir = os.path.join(user_dir, owner)
        if not os.path.isdir(owner_dir):
            os.makedirs(owner_dir)
            # Create `datasets` subdir in the owner dir
            owner_dir = os.path.join(owner_dir, "datasets")
        else:
            owner_dir = os.path.join(owner_dir, "datasets")

        # Verify name not already in use
        if os.path.isdir(os.path.join(owner_dir, dataset_name)):
            raise ValueError(
                f"Dataset `{dataset_name}` already exists locally. Choose a new Dataset name")

        # Create Dataset subdirectory
        new_root_dir = os.path.join(owner_dir, dataset_name)
        os.makedirs(new_root_dir)
        dataset._set_root_dir(new_root_dir)

        # Init repository
        dataset.git.initialize()

        # Create directory structure
        dirs = [
            'manifest', 'metadata', '.gigantum',
            os.path.join('.gigantum', 'favorites'),
            os.path.join('.gigantum', 'activity'),
            os.path.join('.gigantum', 'activity', 'log')
        ]
        for d in dirs:
            p = os.path.join(dataset.root_dir, d, '.gitkeep')
            os.makedirs(os.path.dirname(p), exist_ok=True)
            with open(p, 'w') as gk:
                gk.write("This file is necessary to keep this directory tracked by Git"
                         " and archivable by compression tools. Do not delete or modify!")

        dataset._save_gigantum_data()

        # Create an empty storage.json file
        dataset.backend_config = {}

        # Create .gitignore default file
        shutil.copyfile(
            os.path.join(resource_filename('gtmcore', 'dataset'), 'gitignore.default'),
            os.path.join(dataset.root_dir, ".gitignore"))

        # Commit
        dataset.git.add_all()

        # NOTE: this string is used to indicate there are no more activity records to get.
        # Changing the string will break activity paging.
        # TODO: Improve method for detecting the first activity record
        dataset.git.commit(f"Creating new empty Dataset: {dataset_name}")

        # Create Activity Record
        adr = ActivityDetailRecord(ActivityDetailType.DATASET, show=False, importance=0)
        adr.add_value('text/plain', f"Created new Dataset: {username}/{dataset_name}")
        ar = ActivityRecord(ActivityType.DATASET,
                            message=f"Created new Dataset: {username}/{dataset_name}",
                            show=True,
                            importance=255,
                            linked_commit=dataset.git.commit_hash)
        ar.add_detail_object(adr)
        store = ActivityStore(dataset)
        store.create_activity_record(ar)

        # Initialize file cache and link revision
        m = Manifest(dataset, username)
        m.link_revision()

        return dataset
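
# Hedged usage sketch for create_dataset, mirroring how the tests below call it. The config file
# path and account names are hypothetical; 'gigantum_object_v1' is the storage type used in the
# tests, and the extra Manifest/link_revision call follows the test pattern rather than being
# strictly required (create_dataset already links the initial revision).
def _example_create_dataset(config_file: str) -> Dataset:
    im = InventoryManager(config_file)
    ds = im.create_dataset('alice', 'alice', 'my-dataset',
                           storage_type='gigantum_object_v1',
                           description='example dataset')
    # Link the file cache revision so the dataset is ready for file operations
    Manifest(ds, 'alice').link_revision()
    return ds
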
def mutate_and_get_payload(cls, root, info, labbook_owner, labbook_name, dataset_owner, dataset_name,
                           action, dataset_url=None, client_mutation_id=None):
    logged_in_username = get_logged_in_username()
    im = InventoryManager()
    lb = im.load_labbook(logged_in_username, labbook_owner, labbook_name,
                         author=get_logged_in_author())

    with lb.lock():
        if action == 'link':
            if dataset_url:
                remote_domain = cls._get_remote_domain(dataset_url, dataset_owner, dataset_name)

                if remote_domain:
                    # Make sure git creds are configured for the remote
                    admin_service = None
                    for remote in lb.client_config.config['git']['remotes']:
                        if remote_domain == remote:
                            admin_service = lb.client_config.config['git']['remotes'][remote]['admin_service']
                            break
                    if "HTTP_AUTHORIZATION" in info.context.headers.environ:
                        token = parse_token(info.context.headers.environ["HTTP_AUTHORIZATION"])
                    else:
                        raise ValueError("Authorization header not provided."
                                         " Must have a valid session to query for collaborators")
                    mgr = GitLabManager(remote_domain, admin_service, token)
                    mgr.configure_git_credentials(remote_domain, logged_in_username)
            else:
                # Link to local dataset
                ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)
                dataset_url = f"{ds.root_dir}/.git"

            # Link the dataset to the labbook
            ds = im.link_dataset_to_labbook(dataset_url, dataset_owner, dataset_name, lb)
            ds.namespace = dataset_owner

            # Preload the dataloader
            info.context.dataset_loader.prime(
                f"{get_logged_in_username()}&{dataset_owner}&{dataset_name}", ds)

            # Relink the revision
            m = Manifest(ds, logged_in_username)
            m.link_revision()
        elif action == 'unlink':
            im.unlink_dataset_from_labbook(dataset_owner, dataset_name, lb)
        elif action == 'update':
            ds = im.update_linked_dataset_reference(dataset_owner, dataset_name, lb)
            m = Manifest(ds, logged_in_username)
            m.force_reload()

            info.context.dataset_loader.prime(
                f"{get_logged_in_username()}&{dataset_owner}&{dataset_name}", ds)
        else:
            raise ValueError("Unsupported action. Use `link`, `unlink`, or `update`")

        info.context.labbook_loader.prime(
            f"{get_logged_in_username()}&{labbook_owner}&{labbook_name}", lb)
        edge = LabbookConnection.Edge(node=Labbook(owner=labbook_owner, name=labbook_name),
                                      cursor=base64.b64encode(f"{0}".encode('utf-8')))

    return ModifyDatasetLink(new_labbook_edge=edge)
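
# Hedged sketch of a GraphQL document that could drive the resolver above, written in the same
# embedded-string style as the tests below. The camelCase field names are assumptions derived from
# the resolver's Python arguments (labbook_owner -> labbookOwner, new_labbook_edge -> newLabbookEdge,
# etc.); verify them against the actual schema before use. Owner and name values are hypothetical.
EXAMPLE_MODIFY_DATASET_LINK_QUERY = """
mutation myMutation {
  modifyDatasetLink(input: {labbookOwner: "alice", labbookName: "my-labbook",
                            datasetOwner: "alice", datasetName: "my-dataset",
                            action: "link"}) {
    newLabbookEdge {
      node {
        id
        name
      }
    }
  }
}
"""
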
def test_make_directory(self, fixture_working_dir, snapshot):
    im = InventoryManager(fixture_working_dir[0])
    ds = im.create_dataset('default', 'default', "dataset-dir",
                           storage_type="gigantum_object_v1",
                           description="testing move")
    m = Manifest(ds, 'default')
    m.link_revision()

    query = """
    mutation myMutation {
      makeDatasetDirectory(input: {datasetOwner: "default", datasetName: "dataset-dir",
                                   key: "test_dir1/"}) {
        newDatasetFileEdge {
          node {
            id
            key
            isDir
            isLocal
            size
          }
        }
      }
    }
    """
    result = fixture_working_dir[2].execute(query)
    assert 'errors' not in result
    assert result['data']['makeDatasetDirectory']['newDatasetFileEdge']['node']['key'] == 'test_dir1/'
    assert result['data']['makeDatasetDirectory']['newDatasetFileEdge']['node']['isDir'] is True
    assert result['data']['makeDatasetDirectory']['newDatasetFileEdge']['node']['isLocal'] is True
    assert result['data']['makeDatasetDirectory']['newDatasetFileEdge']['node']['size'] == '0'
    assert os.path.isdir(os.path.join(m.cache_mgr.current_revision_dir, "test_dir1")) is True

    query = """
    mutation myMutation {
      makeDatasetDirectory(input: {datasetOwner: "default", datasetName: "dataset-dir",
                                   key: "test_dir1/test_dir2/"}) {
        newDatasetFileEdge {
          node {
            id
            key
            isDir
            isLocal
            size
          }
        }
      }
    }
    """
    result = fixture_working_dir[2].execute(query)
    assert 'errors' not in result
    assert result['data']['makeDatasetDirectory']['newDatasetFileEdge']['node']['key'] == 'test_dir1/test_dir2/'
    assert result['data']['makeDatasetDirectory']['newDatasetFileEdge']['node']['isDir'] is True
    assert result['data']['makeDatasetDirectory']['newDatasetFileEdge']['node']['isLocal'] is True
    assert result['data']['makeDatasetDirectory']['newDatasetFileEdge']['node']['size'] == '0'
    assert os.path.isdir(os.path.join(m.cache_mgr.current_revision_dir, "test_dir1")) is True
    assert os.path.isdir(os.path.join(m.cache_mgr.current_revision_dir, "test_dir1", "test_dir2")) is True
def test_pull_objects(self, mock_config_file, mock_dataset_head):
    im = InventoryManager(mock_config_file[0])
    ds = im.create_dataset('default', 'default', "dataset100",
                           storage_type="gigantum_object_v1",
                           description="100")
    m = Manifest(ds, 'default')
    iom = IOManager(ds, m)

    os.makedirs(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, "other_dir"))
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "test1.txt", "asdfadfsdf")
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "test2.txt", "fdsfgfd")
    m.sweep_all_changes()

    obj_to_push = iom.objects_to_push()
    assert len(obj_to_push) == 2
    _, obj_id_1 = obj_to_push[0].object_path.rsplit('/', 1)
    _, obj_id_2 = obj_to_push[1].object_path.rsplit('/', 1)
    obj1_target = obj_to_push[0].object_path
    obj2_target = obj_to_push[1].object_path

    obj1_source = os.path.join('/tmp', uuid.uuid4().hex)
    obj2_source = os.path.join('/tmp', uuid.uuid4().hex)

    assert os.path.exists(obj1_target) is True
    assert os.path.exists(obj2_target) is True
    helper_compress_file(obj1_target, obj1_source)
    helper_compress_file(obj2_target, obj2_source)
    assert os.path.isfile(obj1_target) is False
    assert os.path.isfile(obj2_target) is False
    assert os.path.isfile(obj1_source) is True
    assert os.path.isfile(obj2_source) is True

    # Clear out from linked dir
    os.remove(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test1.txt'))
    os.remove(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test2.txt'))

    with patch.object(Configuration, 'find_default_config', lambda self: mock_config_file[0]):
        with aioresponses() as mocked_responses:
            mocked_responses.get(
                f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_1}',
                payload={
                    "presigned_url": f"https://dummyurl.com/{obj_id_1}?params=1",
                    "namespace": ds.namespace,
                    "obj_id": obj_id_1,
                    "dataset": ds.name
                },
                status=200)

            with open(obj1_source, 'rb') as data1:
                mocked_responses.get(
                    f"https://dummyurl.com/{obj_id_1}?params=1",
                    body=data1.read(),
                    status=200,
                    content_type='application/octet-stream')

            mocked_responses.get(
                f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_2}',
                payload={
                    "presigned_url": f"https://dummyurl.com/{obj_id_2}?params=1",
                    "namespace": ds.namespace,
                    "obj_id": obj_id_2,
                    "dataset": ds.name
                },
                status=200)

            with open(obj2_source, 'rb') as data2:
                mocked_responses.get(
                    f"https://dummyurl.com/{obj_id_2}?params=1",
                    body=data2.read(),
                    status=200,
                    content_type='application/octet-stream')

            dl_kwargs = {
                'logged_in_username': "******",
                'access_token': "asdf",
                'id_token': "1234",
                'dataset_owner': "default",
                'dataset_name': "dataset100",
                'labbook_owner': None,
                'labbook_name': None,
                'keys': ["test1.txt"]
            }
            gtmcore.dispatcher.dataset_jobs.pull_objects(**dl_kwargs)

            # Manually link since this is disabled by default in the job (because in real use,
            # multiple jobs run in parallel and you only want to link once).
            m.link_revision()

            assert os.path.isfile(obj1_target) is True
            assert os.path.isfile(obj2_target) is False

            decompressor = snappy.StreamDecompressor()
            with open(obj1_source, 'rb') as dd:
                source1 = decompressor.decompress(dd.read())
                source1 += decompressor.flush()
            with open(obj1_target, 'rt') as dd:
                dest1 = dd.read()
            assert source1.decode("utf-8") == dest1

            # Download other file
            dl_kwargs = {
                'logged_in_username': "******",
                'access_token': "asdf",
                'id_token': "1234",
                'dataset_owner': "default",
                'dataset_name': "dataset100",
                'labbook_owner': None,
                'labbook_name': None,
                'keys': ["test2.txt"]
            }
            gtmcore.dispatcher.dataset_jobs.pull_objects(**dl_kwargs)

            # Manually link since this is disabled by default in the job (because in real use,
            # multiple jobs run in parallel and you only want to link once).
            m.link_revision()

            assert os.path.isfile(obj1_target) is True
            assert os.path.isfile(obj2_target) is True

            with open(obj1_source, 'rb') as dd:
                source1 = decompressor.decompress(dd.read())
                source1 += decompressor.flush()
            with open(obj1_target, 'rt') as dd:
                dest1 = dd.read()
            assert source1.decode("utf-8") == dest1

            with open(obj2_source, 'rb') as dd:
                source1 = decompressor.decompress(dd.read())
                source1 += decompressor.flush()
            with open(obj2_target, 'rt') as dd:
                dest1 = dd.read()
            assert source1.decode("utf-8") == dest1