def helper_resolve_all_files(self, dataset, kwargs):
    """Helper method to populate the DatasetFileConnection"""
    manifest = Manifest(dataset, get_logged_in_username())

    if "after" in kwargs:
        after_index = int(base64.b64decode(kwargs["after"]))
    else:
        after_index = 0

    # Generate naive cursors
    edges, indexes = manifest.list(first=kwargs.get("first"), after_index=after_index)
    cursors = [base64.b64encode("{}".format(x).encode("UTF-8")).decode("UTF-8") for x in indexes]

    edge_objs = []
    for edge, cursor in zip(edges, cursors):
        create_data = {"owner": self.owner,
                       "name": self.name,
                       "key": edge['key'],
                       "_file_info": edge}
        edge_objs.append(DatasetFileConnection.Edge(node=DatasetFile(**create_data), cursor=cursor))

    has_previous_page = False
    has_next_page = len(edges) > 0
    start_cursor = None
    end_cursor = None
    if cursors:
        start_cursor = cursors[0]
        end_cursor = cursors[-1]
        if indexes[-1] == len(manifest.manifest) - 1:
            has_next_page = False

    if kwargs.get("after"):
        if int(base64.b64decode(kwargs["after"])) > 0:
            has_previous_page = True

    page_info = graphene.relay.PageInfo(has_next_page=has_next_page,
                                        has_previous_page=has_previous_page,
                                        start_cursor=start_cursor,
                                        end_cursor=end_cursor)

    return DatasetFileConnection(edges=edge_objs, page_info=page_info)
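# The resolver above (and several mutations below) uses a "naive" Relay-style cursor: the cursor
# is simply the file's index in the manifest listing, base64-encoded. A minimal, self-contained
# sketch of that round trip; the function names here are illustrative only and not part of the
# gtmcore API:
import base64

def encode_cursor(index: int) -> str:
    """Encode a manifest index as an opaque Relay-style cursor string."""
    return base64.b64encode(str(index).encode("UTF-8")).decode("UTF-8")

def decode_cursor(cursor: str) -> int:
    """Decode a cursor back into the manifest index it represents."""
    return int(base64.b64decode(cursor))

assert decode_cursor(encode_cursor(42)) == 42  # e.g. 42 -> "NDI=" -> 42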
def test_complete_dataset_upload_transaction_simple(self, mock_config_file_background_tests):
    im = InventoryManager(mock_config_file_background_tests[0])
    ds = im.create_dataset('default', 'default', "new-ds",
                           storage_type="gigantum_object_v1",
                           description="100")
    m = Manifest(ds, 'default')

    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "test1.txt", "fake content!")
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "test2.txt", "moar fake content!")

    dl_kwargs = {
        'dispatcher': Dispatcher,
        'logged_in_username': "******",
        'logged_in_email': "*****@*****.**",
        'dataset_owner': "default",
        'dataset_name': "new-ds",
        'config_file': mock_config_file_background_tests[0]
    }

    assert len(m.manifest) == 0
    gtmcore.dispatcher.dataset_jobs.complete_dataset_upload_transaction(**dl_kwargs)

    m = Manifest(ds, 'default')

    # make sure manifest got updated
    assert len(m.manifest) == 2
    assert 'test1.txt' in m.manifest
    assert 'test2.txt' in m.manifest
    assert m.manifest['test1.txt']['b'] == '13'
    assert len(m.manifest['test1.txt']['h']) == 128
    assert 'manifest-' in m.manifest['test1.txt']['fn']
    assert m.manifest['test2.txt']['b'] == '18'
    assert len(m.manifest['test2.txt']['h']) == 128
    assert 'manifest-' in m.manifest['test2.txt']['fn']
    assert m.manifest['test2.txt']['h'] != m.manifest['test1.txt']['h']

    # Make sure activity created
    assert len(ds.git.log()) == 6
    assert "_GTM_ACTIVITY_START_" in ds.git.log()[0]['message']
    assert "Uploaded 2 new file(s)." in ds.git.log()[0]['message']
def generate_bg_hash_job_list(filenames: List[str],
                              manifest: Manifest,
                              dispatcher_obj: Dispatcher) -> List[BackgroundHashJob]:
    """Method to generate batches of files to be hashed, ensuring files aren't added to a batch
    once it is larger than MAX_JOB_BYTES

    Args:
        filenames: list of files to be hashed
        manifest: the Manifest instance
        dispatcher_obj: the Dispatcher instance

    Returns:
        list of BackgroundHashJob instances, one per batch
    """
    num_cores = manifest.get_num_hashing_cpus()
    file_lists: List[List] = [list() for _ in range(num_cores)]
    size_sums = [0 for _ in range(num_cores)]
    revision_dir = manifest.current_revision_dir

    for filename in filenames:
        # Add each file to the batch that currently has the fewest bytes assigned to it
        index = size_sums.index(min(size_sums))
        file_lists[index].append(filename)
        size_sums[index] += os.path.getsize(os.path.join(revision_dir, filename))
        if all(fs > MAX_JOB_BYTES for fs in size_sums):
            # 1GB of data to hash already in every job. Add another.
            file_lists.append(list())
            size_sums.append(0)

    # Prune jobs back if there are lots of cores but not lots of work
    file_lists = [x for x in file_lists if x != []]
    size_sums = [x for x in size_sums if x != 0]

    # Prep hashing jobs
    return [BackgroundHashJob(dispatcher_obj, fl, ss) for ss, fl in zip(size_sums, file_lists)]
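# The batching above is a greedy "least-loaded bin" assignment: each file goes into the batch with
# the smallest running byte total, and a fresh batch is appended once every existing batch exceeds
# MAX_JOB_BYTES. A minimal standalone sketch of that policy, with sizes supplied directly instead
# of via os.path.getsize; the 1GB value for MAX_JOB_BYTES is assumed, mirroring the comment above:
from typing import List, Tuple

MAX_JOB_BYTES = 1 * 1024 ** 3  # assumed 1GB cap per hashing job

def assign_batches(file_sizes: List[Tuple[str, int]], num_batches: int) -> List[List[str]]:
    batches: List[List[str]] = [[] for _ in range(num_batches)]
    totals = [0] * num_batches
    for name, size in file_sizes:
        i = totals.index(min(totals))   # least-loaded batch so far
        batches[i].append(name)
        totals[i] += size
        if all(t > MAX_JOB_BYTES for t in totals):
            batches.append([])          # every batch is "full"; open a new one
            totals.append(0)
    return [b for b in batches if b]    # drop empty batches

# e.g. assign_batches([("a", 10), ("b", 5), ("c", 1)], num_batches=2) -> [["a"], ["b", "c"]]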
def _get_dataset_file_info(self, dataset) -> dict:
    """helper method to iterate over the manifest and get file info for the overview page

    Returns:
        dict
    """
    m = Manifest(dataset, get_logged_in_username())

    count = 0
    total_bytes = 0
    file_type_distribution: OrderedDict = OrderedDict()
    for key in m.manifest:
        item = m.manifest[key]
        if key[-1] == '/':
            # Skip directories
            continue

        filename = os.path.basename(key)
        if filename[0] == '.':
            # Skip hidden files
            continue

        if '.' not in filename:
            # Skip files without an extension
            continue

        # Count file type distribution
        _, ext = os.path.splitext(filename)
        if ext:
            file_type = ext
            if file_type in file_type_distribution:
                file_type_distribution[file_type] += 1
            else:
                file_type_distribution[file_type] = 1

        # Count total file size
        total_bytes += int(item['b'])

        # Count files
        count += 1

    # Format the output for file type distribution
    formatted_file_type_info: List[str] = list()
    file_type_distribution = OrderedDict(sorted(file_type_distribution.items(),
                                                key=itemgetter(1), reverse=True))
    for file_type in file_type_distribution:
        percentage = float(file_type_distribution[file_type]) / float(count)
        formatted_file_type_info.append(f"{percentage:.2f}|{file_type}")

    self._dataset_file_info = {'num_files': count,
                               'total_bytes': total_bytes,
                               'local_bytes': count,
                               'file_type_distribution': formatted_file_type_info}

    return self._dataset_file_info
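# A note on the distribution format produced above: each entry is the *fraction* of counted files
# with a given extension (despite the variable name "percentage"), rendered as
# "<fraction to 2 decimals>|<extension>" and sorted most-common first. A small illustrative
# example of the same formatting, independent of any Manifest:
from collections import OrderedDict
from operator import itemgetter

counts = OrderedDict([('.txt', 3), ('.csv', 1)])
total = sum(counts.values())
ordered = OrderedDict(sorted(counts.items(), key=itemgetter(1), reverse=True))
formatted = [f"{counts[ext] / total:.2f}|{ext}" for ext in ordered]
# formatted == ['0.75|.txt', '0.25|.csv']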
def _push_dataset_objects(self, dataset: Dataset, logged_in_username: str, feedback_callback: Callable,
                          access_token, id_token) -> None:
    dataset.backend.set_default_configuration(logged_in_username, access_token, id_token)

    m = Manifest(dataset, logged_in_username)
    iom = IOManager(dataset, m)

    iom.push_objects(status_update_fn=feedback_callback)
    iom.manifest.link_revision()
def test_move_dataset_file(self, fixture_working_dir, snapshot):
    im = InventoryManager(fixture_working_dir[0])
    ds = im.create_dataset('default', 'default', "dataset-move",
                           storage_type="gigantum_object_v1",
                           description="testing move")
    m = Manifest(ds, 'default')

    revision = m.dataset_revision
    helper_append_file(m.cache_mgr.cache_root, revision, "test1.txt", "asdfasdghndfdf")
    m.sweep_all_changes()

    revision = m.dataset_revision
    cr = m.cache_mgr.cache_root
    assert os.path.exists(os.path.join(cr, revision, "test1.txt")) is True

    query = """
               mutation myMutation {
                 moveDatasetFile(input: {datasetOwner: "default", datasetName: "dataset-move",
                                         srcPath: "test1.txt", dstPath: "test1-renamed.txt"}) {
                     updatedEdges {
                       node {
                         id
                         key
                         isDir
                         isLocal
                         size
                       }
                     }
                 }
               }
               """
    result = fixture_working_dir[2].execute(query)
    assert 'errors' not in result
    snapshot.assert_match(result)

    revision = m.dataset_revision
    cr = m.cache_mgr.cache_root
    assert os.path.exists(os.path.join(cr, revision, "test1.txt")) is False
    assert os.path.exists(os.path.join(cr, revision, "test1-renamed.txt")) is True
def mutate_and_get_payload(cls, root, info, dataset_owner, dataset_name, keys, client_mutation_id=None):
    logged_in_username = get_logged_in_username()
    ds = InventoryManager().load_dataset(logged_in_username, dataset_owner, dataset_name,
                                         author=get_logged_in_author())
    ds.namespace = dataset_owner
    m = Manifest(ds, logged_in_username)

    with ds.lock():
        m.delete(keys)

    return DeleteDatasetFiles(success=True)
def test_delete_dataset_while_linked(self, mock_config_file):
    inv_manager = InventoryManager(mock_config_file[0])
    auth = GitAuthor(name="test", email="*****@*****.**")
    lb = inv_manager.create_labbook("test", "test", "labbook1", description="my first labbook")
    ds = inv_manager.create_dataset("test", "test", "dataset1", "gigantum_object_v1",
                                    description="my first dataset", author=auth)
    ds_root_dir = ds.root_dir
    lb_root_dir = lb.root_dir
    assert os.path.exists(ds_root_dir) is True
    assert os.path.exists(lb_root_dir) is True

    # Link dataset
    inv_manager.link_dataset_to_labbook(f"{ds_root_dir}/.git", "test", "dataset1", lb)

    m = Manifest(ds, 'test')
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "test1.txt", "asdfasdf")
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "test2.txt", "dfg")
    assert os.path.exists(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, "test1.txt")) is True
    assert os.path.exists(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, "test2.txt")) is True

    dataset_delete_job = inv_manager.delete_dataset("test", "test", "dataset1")
    assert os.path.exists(ds_root_dir) is False
    assert os.path.exists(lb_root_dir) is True
    assert os.path.exists(m.cache_mgr.cache_root) is True

    assert dataset_delete_job.namespace == "test"
    assert dataset_delete_job.name == "dataset1"
    assert dataset_delete_job.cache_root == m.cache_mgr.cache_root

    jobs.clean_dataset_file_cache("test", dataset_delete_job.namespace, dataset_delete_job.name,
                                  dataset_delete_job.cache_root, config_file=mock_config_file[0])

    assert os.path.exists(m.cache_mgr.cache_root) is True

    cache_base, _ = m.cache_mgr.cache_root.rsplit(os.path.sep, 1)
    assert os.path.exists(cache_base) is True
def mutate_and_get_payload(cls, root, info, dataset_owner, dataset_name, src_path, dst_path,
                           client_mutation_id=None):
    logged_in_username = get_logged_in_username()
    ds = InventoryManager().load_dataset(logged_in_username, dataset_owner, dataset_name,
                                         author=get_logged_in_author())
    ds.namespace = dataset_owner
    m = Manifest(ds, logged_in_username)

    with ds.lock():
        edge_data = m.move(src_path, dst_path)

    file_edges = list()
    for edge_dict in edge_data:
        file_edges.append(DatasetFile(owner=dataset_owner,
                                      name=dataset_name,
                                      key=edge_dict['key'],
                                      is_dir=edge_dict['is_dir'],
                                      is_favorite=edge_dict['is_favorite'],
                                      modified_at=edge_dict['modified_at'],
                                      is_local=edge_dict['is_local'],
                                      size=str(edge_dict['size'])))

    cursors = [base64.b64encode("{}".format(cnt).encode("UTF-8")).decode("UTF-8")
               for cnt, x in enumerate(file_edges)]

    edge_objs = [DatasetFileConnection.Edge(node=e, cursor=c) for e, c in zip(file_edges, cursors)]

    return MoveDatasetFile(updated_edges=edge_objs)
def test_sync__dataset(self, mock_config_file):
    def update_feedback(msg: str, has_failures: Optional[bool] = None,
                        failure_detail: Optional[str] = None,
                        percent_complete: Optional[float] = None):
        """Method to update the job's metadata and provide feedback to the UI"""
        assert has_failures is None or has_failures is False
        assert failure_detail is None

    def dispatch_query_mock(self, job_key):
        JobStatus = namedtuple("JobStatus", ['status', 'meta'])
        return JobStatus(status='finished', meta={'completed_bytes': '100'})

    def dispatch_mock(self, method_reference, kwargs, metadata, persist):
        return "afakejobkey"

    username = '******'
    im = InventoryManager(mock_config_file[0])
    ds = im.create_dataset(username, username, 'dataset-1', 'gigantum_object_v1')
    m = Manifest(ds, username)
    wf = DatasetWorkflow(ds)

    iom = IOManager(ds, m)
    assert len(glob.glob(f'{iom.push_dir}/*')) == 0
    wf.publish(username=username, feedback_callback=update_feedback)

    # Put a file into the dataset that needs to be pushed
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "test1.txt", "asdfadfsdf")
    m.sweep_all_changes()

    assert len(glob.glob(f'{iom.push_dir}/*')) == 1
    with patch.object(Dispatcher, 'dispatch_task', dispatch_mock):
        with patch.object(Dispatcher, 'query_task', dispatch_query_mock):
            wf.sync(username=username, feedback_callback=update_feedback)
    assert os.path.exists(wf.remote)
    assert len(glob.glob(f'{iom.push_dir}/*')) == 0
def helper_resolve_all_files(self, dataset, kwargs):
    """Helper method to populate the DatasetFileConnection"""
    manifest = Manifest(dataset, get_logged_in_username())

    # Generate naive cursors
    # TODO: Use manifest pagination interface
    edges = manifest.list()
    cursors = [base64.b64encode("{}".format(cnt).encode("UTF-8")).decode("UTF-8")
               for cnt, x in enumerate(edges)]

    # Process slicing and cursor args
    lbc = ListBasedConnection(edges, cursors, kwargs)
    lbc.apply()

    edge_objs = []
    for edge, cursor in zip(lbc.edges, lbc.cursors):
        create_data = {"owner": self.owner,
                       "name": self.name,
                       "key": edge['key'],
                       "_file_info": edge}
        edge_objs.append(DatasetFileConnection.Edge(node=DatasetFile(**create_data), cursor=cursor))

    return DatasetFileConnection(edges=edge_objs, page_info=lbc.page_info)
def _helper_local_bytes(dataset):
    """Helper to compute total size of a dataset on disk"""
    m = Manifest(dataset, get_logged_in_username())
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(m.current_revision_dir):
        for f in filenames:
            if f == '.smarthash':
                continue
            fp = os.path.join(dirpath, f)
            total_size += os.path.getsize(fp)

    return total_size
def test_make_directory_error(self, fixture_working_dir, snapshot):
    im = InventoryManager(fixture_working_dir[0])
    ds = im.create_dataset('default', 'default', "dataset-dir",
                           storage_type="gigantum_object_v1",
                           description="testing move")
    m = Manifest(ds, 'default')

    # Test where the parent dir doesn't exist (because you need to create the parent first)
    query = """
               mutation myMutation {
                 makeDatasetDirectory(input: {datasetOwner: "default", datasetName: "dataset-dir",
                                              key: "test_dir1/test_dir2/"}) {
                     newDatasetFileEdge {
                       node {
                         id
                         key
                         isDir
                         isLocal
                         size
                       }
                     }
                 }
               }
               """
    result = fixture_working_dir[2].execute(query)
    assert 'errors' in result
    assert 'Parent directory' in result['errors'][0]['message']

    # Test where the trailing slash is missing
    query = """
               mutation myMutation {
                 makeDatasetDirectory(input: {datasetOwner: "default", datasetName: "dataset-dir",
                                              key: "test_dir1"}) {
                     newDatasetFileEdge {
                       node {
                         id
                         key
                         isDir
                         isLocal
                         size
                       }
                     }
                 }
               }
               """
    result = fixture_working_dir[2].execute(query)
    assert 'errors' in result
    assert 'Provided relative path must end in' in result['errors'][0]['message']
def mutate_and_get_payload(cls, root, info, dataset_owner, dataset_name, key, client_mutation_id=None):
    logged_in_username = get_logged_in_username()
    ds = InventoryManager().load_dataset(logged_in_username, dataset_owner, dataset_name,
                                         author=get_logged_in_author())
    ds.namespace = dataset_owner
    m = Manifest(ds, logged_in_username)

    if key[-1] != '/':
        raise ValueError("Provided relative path must end in `/` to indicate it is a directory")

    with ds.lock():
        file_info = m.create_directory(key)

    create_data = {'owner': dataset_owner,
                   'name': dataset_name,
                   'key': file_info['key'],
                   '_file_info': file_info}

    # TODO: Fix cursor implementation, this currently doesn't make sense
    cursor = base64.b64encode(f"{0}".encode('utf-8'))

    return MakeDatasetDirectory(
        new_dataset_file_edge=DatasetFileConnection.Edge(node=DatasetFile(**create_data), cursor=cursor))
def delete_dataset(self, username: str, owner: str, dataset_name: str) -> None:
    """Delete a Dataset from this Gigantum working directory.

    Args:
        username: Active username
        owner: Namespace in which this Dataset is stored
        dataset_name: Name of the Dataset

    Returns:
        None
    """
    ds = self.load_dataset(username, owner, dataset_name)

    # Delete dataset contents from file cache
    m = Manifest(ds, username)
    shutil.rmtree(m.cache_mgr.cache_root, ignore_errors=True)

    # Delete dataset repository from working dir
    shutil.rmtree(ds.root_dir, ignore_errors=True)
def test_delete_dataset(self, mock_config_file):
    inv_manager = InventoryManager(mock_config_file[0])
    auth = GitAuthor(name="test", email="*****@*****.**")
    ds = inv_manager.create_dataset("test", "test", "dataset1", "gigantum_object_v1",
                                    description="my first dataset", author=auth)
    root_dir = ds.root_dir
    assert os.path.exists(root_dir) is True

    m = Manifest(ds, 'test')
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "test1.txt", "asdfasdf")
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "test2.txt", "dfg")
    assert os.path.exists(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, "test1.txt")) is True
    assert os.path.exists(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, "test2.txt")) is True

    inv_manager.delete_dataset("test", "test", "dataset1")
    assert os.path.exists(root_dir) is False
    assert os.path.exists(m.cache_mgr.cache_root) is False

    cache_base, _ = m.cache_mgr.cache_root.rsplit(os.path.sep, 1)
    assert os.path.exists(cache_base) is True
def test_delete_dataset_files(self, fixture_working_dir, snapshot):
    im = InventoryManager(fixture_working_dir[0])
    ds = im.create_dataset('default', 'default', "dataset-delete",
                           storage_type="gigantum_object_v1",
                           description="testing delete")
    m = Manifest(ds, 'default')

    os.makedirs(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, "other_dir"))
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "test1.txt", "asdfadfsdf")
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "test2.txt", "fdsfgfd")
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "test3.txt", "ghgdsr")
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "other_dir/test3.txt", "hhgf")
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "other_dir/test1.txt", "jkjghfg")
    m.sweep_all_changes()

    revision = m.dataset_revision
    assert os.path.exists(os.path.join(m.cache_mgr.cache_root, revision, "test1.txt")) is True
    assert os.path.exists(os.path.join(m.cache_mgr.cache_root, revision, "test2.txt")) is True
    assert os.path.exists(os.path.join(m.cache_mgr.cache_root, revision, "test3.txt")) is True
    assert os.path.exists(os.path.join(m.cache_mgr.cache_root, revision, "other_dir", "test3.txt")) is True
    assert os.path.exists(os.path.join(m.cache_mgr.cache_root, revision, "other_dir", "test1.txt")) is True

    query = """
               mutation myMutation {
                 deleteDatasetFiles(input: {datasetOwner: "default", datasetName: "dataset-delete",
                                            keys: ["test1.txt"]}) {
                     success
                 }
               }
               """
    result = fixture_working_dir[2].execute(query)
    assert 'errors' not in result
    assert result['data']['deleteDatasetFiles']['success'] is True

    revision = m.dataset_revision
    assert os.path.exists(os.path.join(m.cache_mgr.cache_root, revision, "test1.txt")) is False
    assert os.path.exists(os.path.join(m.cache_mgr.cache_root, revision, "test2.txt")) is True
    assert os.path.exists(os.path.join(m.cache_mgr.cache_root, revision, "test3.txt")) is True
    assert os.path.exists(os.path.join(m.cache_mgr.cache_root, revision, "other_dir", "test3.txt")) is True
    assert os.path.exists(os.path.join(m.cache_mgr.cache_root, revision, "other_dir", "test1.txt")) is True

    query = """
               mutation myMutation {
                 deleteDatasetFiles(input: {datasetOwner: "default", datasetName: "dataset-delete",
                                            keys: ["test3.txt", "other_dir/"]}) {
                     success
                 }
               }
               """
    result = fixture_working_dir[2].execute(query)
    assert 'errors' not in result
    assert result['data']['deleteDatasetFiles']['success'] is True

    revision = m.dataset_revision
    assert os.path.exists(os.path.join(m.cache_mgr.cache_root, revision, "test1.txt")) is False
    assert os.path.exists(os.path.join(m.cache_mgr.cache_root, revision, "test2.txt")) is True
    assert os.path.exists(os.path.join(m.cache_mgr.cache_root, revision, "test3.txt")) is False
    assert os.path.exists(os.path.join(m.cache_mgr.cache_root, revision, "other_dir", "test3.txt")) is False
    assert os.path.exists(os.path.join(m.cache_mgr.cache_root, revision, "other_dir", "test1.txt")) is False
def test_update_dataset_link(self, fixture_working_dir, snapshot):
    im = InventoryManager(fixture_working_dir[0])
    lb = im.create_labbook('default', 'default', 'test-lb', 'testing dataset links')
    ds = im.create_dataset('default', 'default', "dataset100",
                           storage_type="gigantum_object_v1",
                           description="100")
    manifest = Manifest(ds, 'default')
    helper_append_file(manifest.cache_mgr.cache_root, manifest.dataset_revision, "test1.txt", "12345")
    manifest.sweep_all_changes()

    # Fake publish to a local bare repo
    _MOCK_create_remote_repo2(ds, 'default', None, None)

    assert os.path.exists(os.path.join(lb.root_dir, '.gitmodules')) is False

    overview_query = """
                {
                  labbook(owner: "default", name:"test-lb") {
                    linkedDatasets{
                      name
                      overview {
                        localBytes
                        totalBytes
                      }
                    }
                  }
                }
                """

    query = """
               mutation myMutation($lo: String!, $ln: String!, $do: String!, $dn: String!,
                                   $a: String!, $du: String) {
                 modifyDatasetLink(input: {labbookOwner: $lo, labbookName: $ln, datasetOwner: $do,
                                           datasetName: $dn, action: $a, datasetUrl: $du}) {
                     newLabbookEdge {
                       node {
                         id
                         name
                         description
                         linkedDatasets {
                           name
                         }
                       }
                     }
                 }
               }
               """
    variables = {"lo": "default", "ln": "test-lb", "do": "default", "dn": "dataset100",
                 "a": "link", "du": ds.remote}
    result = fixture_working_dir[2].execute(query, variable_values=variables)
    assert "errors" not in result
    snapshot.assert_match(result)

    assert os.path.exists(os.path.join(lb.root_dir, '.gitmodules')) is True
    dataset_submodule_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets', 'default', 'dataset100')
    assert os.path.exists(dataset_submodule_dir) is True
    assert os.path.exists(os.path.join(dataset_submodule_dir, '.gigantum')) is True
    assert os.path.exists(os.path.join(dataset_submodule_dir, 'test_file.dat')) is False

    with open(os.path.join(lb.root_dir, '.gitmodules'), 'rt') as mf:
        data = mf.read()
    assert len(data) > 0

    # check overview
    result = fixture_working_dir[2].execute(overview_query)
    assert "errors" not in result
    assert result['data']['labbook']['linkedDatasets'][0]['overview']['localBytes'] == '5'
    assert result['data']['labbook']['linkedDatasets'][0]['overview']['totalBytes'] == '5'

    # Make change to published dataset
    git_dir = os.path.join(tempfile.gettempdir(), 'test_update_dataset_link_mutation')
    try:
        os.makedirs(git_dir)
        call_subprocess(['git', 'clone', ds.remote], cwd=git_dir, check=True)
        with open(os.path.join(git_dir, ds.name, 'test_file.dat'), 'wt') as tf:
            tf.write("Test File Contents")
        call_subprocess(['git', 'add', 'test_file.dat'], cwd=os.path.join(git_dir, ds.name), check=True)
        call_subprocess(['git', 'commit', '-m', 'editing repo'], cwd=os.path.join(git_dir, ds.name), check=True)
        call_subprocess(['git', 'push'], cwd=os.path.join(git_dir, ds.name), check=True)

        query = """
                   mutation myMutation($lo: String!, $ln: String!, $do: String!, $dn: String!,
                                       $a: String!) {
                     modifyDatasetLink(input: {labbookOwner: $lo, labbookName: $ln, datasetOwner: $do,
                                               datasetName: $dn, action: $a}) {
                         newLabbookEdge {
                           node {
                             id
                             name
                             description
                             linkedDatasets {
                               name
                             }
                           }
                         }
                     }
                   }
                   """
        variables = {"lo": "default", "ln": "test-lb", "do": "default", "dn": "dataset100", "a": "update"}
        result = fixture_working_dir[2].execute(query, variable_values=variables)
        assert "errors" not in result
        snapshot.assert_match(result)

        # verify change is reflected
        assert os.path.exists(os.path.join(dataset_submodule_dir, 'test_file.dat')) is True

        # Verify activity record
        assert "Updated Dataset `default/dataset100` link to version" in lb.git.log()[0]['message']
    finally:
        if os.path.exists(git_dir):
            shutil.rmtree(git_dir)
def test_push_objects(self, mock_config_file, mock_dataset_head):
    im = InventoryManager(mock_config_file[0])
    ds = im.create_dataset('default', 'default', "dataset100",
                           storage_type="gigantum_object_v1",
                           description="100")
    manifest = Manifest(ds, 'default')
    iom = IOManager(ds, manifest)

    revision = manifest.dataset_revision
    os.makedirs(os.path.join(manifest.cache_mgr.cache_root, revision, "other_dir"))
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test1.txt", "test content 1")
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test2.txt", "test content 2")
    manifest.sweep_all_changes()

    obj_to_push = iom.objects_to_push()
    assert len(obj_to_push) == 2
    _, obj1 = obj_to_push[0].object_path.rsplit('/', 1)
    _, obj2 = obj_to_push[1].object_path.rsplit('/', 1)

    with aioresponses() as mocked_responses:
        mocked_responses.put(f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj1}',
                             payload={
                                 "presigned_url": f"https://dummyurl.com/{obj1}?params=1",
                                 "namespace": ds.namespace,
                                 "key_id": "hghghg",
                                 "obj_id": obj1,
                                 "dataset": ds.name
                             },
                             status=200)
        mocked_responses.put(f"https://dummyurl.com/{obj1}?params=1", payload={}, status=200)

        mocked_responses.put(f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj2}',
                             payload={
                                 "presigned_url": f"https://dummyurl.com/{obj2}?params=1",
                                 "namespace": ds.namespace,
                                 "key_id": "hghghg",
                                 "obj_id": obj2,
                                 "dataset": ds.name
                             },
                             status=200)
        mocked_responses.put(f"https://dummyurl.com/{obj2}?params=1", payload={}, status=200)

        job_kwargs = {
            'objs': obj_to_push,
            'logged_in_username': "******",
            'access_token': "faketoken",
            'id_token': "faketoken",
            'dataset_owner': ds.namespace,
            'dataset_name': ds.name,
            'config_file': ds.client_config.config_file,
        }
        gtmcore.dispatcher.dataset_jobs.push_dataset_objects(**job_kwargs)
def test_verify_contents_linked_dataset(self, mock_dataset_with_local_dir):
    class JobMock():
        def __init__(self):
            self.meta = dict()

        def save_meta(self):
            pass

    CURRENT_JOB = JobMock()

    def get_current_job_mock():
        return CURRENT_JOB

    with patch('gtmcore.dispatcher.jobs.get_current_job', side_effect=get_current_job_mock):
        ds = mock_dataset_with_local_dir[0]
        im = InventoryManager()

        ds.backend.update_from_remote(ds, lambda x: print(x))

        m = Manifest(ds, 'tester')
        assert len(m.manifest.keys()) == 4
        assert os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test1.txt'))
        assert os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test2.txt'))
        assert os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'subdir', 'test3.txt'))

        modified_items = ds.backend.verify_contents(ds, lambda x: print(x))
        assert len(modified_items) == 0

        lb = im.create_labbook("tester", "tester", 'test-labbook')
        im.link_dataset_to_labbook(f"{ds.root_dir}/.git", "tester", ds.name, lb)

        dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets', 'tester', ds.name)
        ds = im.load_dataset_from_directory(dataset_dir)

        test_dir = os.path.join(mock_dataset_with_local_dir[1], "local_data", "test_dir")
        with open(os.path.join(test_dir, 'test1.txt'), 'wt') as tf:
            tf.write("This file got changed in the filesystem")

        kwargs = {
            'logged_in_username': "******",
            'access_token': "asdf",
            'id_token': "1234",
            'dataset_owner': "tester",
            'dataset_name': 'dataset-1',
            'labbook_owner': "tester",
            'labbook_name': 'test-labbook'
        }

        jobs.verify_dataset_contents(**kwargs)
        job = gtmcore.dispatcher.jobs.get_current_job()

        assert 'modified_keys' in job.meta
        assert job.meta['modified_keys'] == ["test1.txt"]
        assert 'Validating contents of 3 files.' in job.meta['feedback']
def test_pull_objects(self, mock_config_file, mock_dataset_head):
    im = InventoryManager(mock_config_file[0])
    ds = im.create_dataset('default', 'default', "dataset100",
                           storage_type="gigantum_object_v1",
                           description="100")
    m = Manifest(ds, 'default')
    iom = IOManager(ds, m)

    os.makedirs(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, "other_dir"))
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "test1.txt", "asdfadfsdf")
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "test2.txt", "fdsfgfd")
    m.sweep_all_changes()

    obj_to_push = iom.objects_to_push()
    assert len(obj_to_push) == 2
    _, obj_id_1 = obj_to_push[0].object_path.rsplit('/', 1)
    _, obj_id_2 = obj_to_push[1].object_path.rsplit('/', 1)
    obj1_target = obj_to_push[0].object_path
    obj2_target = obj_to_push[1].object_path

    obj1_source = os.path.join('/tmp', uuid.uuid4().hex)
    obj2_source = os.path.join('/tmp', uuid.uuid4().hex)

    assert os.path.exists(obj1_target) is True
    assert os.path.exists(obj2_target) is True
    helper_compress_file(obj1_target, obj1_source)
    helper_compress_file(obj2_target, obj2_source)
    assert os.path.isfile(obj1_target) is False
    assert os.path.isfile(obj2_target) is False
    assert os.path.isfile(obj1_source) is True
    assert os.path.isfile(obj2_source) is True

    # Clear out from linked dir
    os.remove(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test1.txt'))
    os.remove(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test2.txt'))

    with patch.object(Configuration, 'find_default_config', lambda self: mock_config_file[0]):
        with aioresponses() as mocked_responses:
            mocked_responses.get(f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_1}',
                                 payload={
                                     "presigned_url": f"https://dummyurl.com/{obj_id_1}?params=1",
                                     "namespace": ds.namespace,
                                     "obj_id": obj_id_1,
                                     "dataset": ds.name
                                 },
                                 status=200)

            with open(obj1_source, 'rb') as data1:
                mocked_responses.get(f"https://dummyurl.com/{obj_id_1}?params=1",
                                     body=data1.read(),
                                     status=200,
                                     content_type='application/octet-stream')

            mocked_responses.get(f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_2}',
                                 payload={
                                     "presigned_url": f"https://dummyurl.com/{obj_id_2}?params=1",
                                     "namespace": ds.namespace,
                                     "obj_id": obj_id_2,
                                     "dataset": ds.name
                                 },
                                 status=200)

            with open(obj2_source, 'rb') as data2:
                mocked_responses.get(f"https://dummyurl.com/{obj_id_2}?params=1",
                                     body=data2.read(),
                                     status=200,
                                     content_type='application/octet-stream')

            dl_kwargs = {
                'logged_in_username': "******",
                'access_token': "asdf",
                'id_token': "1234",
                'dataset_owner': "default",
                'dataset_name': "dataset100",
                'labbook_owner': None,
                'labbook_name': None,
                'keys': ["test1.txt"]
            }

            gtmcore.dispatcher.dataset_jobs.pull_objects(**dl_kwargs)

            # Manually link since this is disabled by default in the job (because in real use,
            # multiple jobs run in parallel and you only want to link once)
            m.link_revision()

            assert os.path.isfile(obj1_target) is True
            assert os.path.isfile(obj2_target) is False

            decompressor = snappy.StreamDecompressor()
            with open(obj1_source, 'rb') as dd:
                source1 = decompressor.decompress(dd.read())
                source1 += decompressor.flush()
            with open(obj1_target, 'rt') as dd:
                dest1 = dd.read()
            assert source1.decode("utf-8") == dest1

            # Download other file
            dl_kwargs = {
                'logged_in_username': "******",
                'access_token': "asdf",
                'id_token': "1234",
                'dataset_owner': "default",
                'dataset_name': "dataset100",
                'labbook_owner': None,
                'labbook_name': None,
                'keys': ["test2.txt"]
            }

            gtmcore.dispatcher.dataset_jobs.pull_objects(**dl_kwargs)

            # Manually link since this is disabled by default in the job (because in real use,
            # multiple jobs run in parallel and you only want to link once)
            m.link_revision()

            assert os.path.isfile(obj1_target) is True
            assert os.path.isfile(obj2_target) is True

            with open(obj1_source, 'rb') as dd:
                source1 = decompressor.decompress(dd.read())
                source1 += decompressor.flush()
            with open(obj1_target, 'rt') as dd:
                dest1 = dd.read()
            assert source1.decode("utf-8") == dest1

            with open(obj2_source, 'rb') as dd:
                source1 = decompressor.decompress(dd.read())
                source1 += decompressor.flush()
            with open(obj2_target, 'rt') as dd:
                dest1 = dd.read()
            assert source1.decode("utf-8") == dest1
def mutate_and_get_payload(cls, root, info, labbook_owner, labbook_name, dataset_owner, dataset_name,
                           action, dataset_url=None, client_mutation_id=None):
    logged_in_username = get_logged_in_username()
    im = InventoryManager()
    lb = im.load_labbook(logged_in_username, labbook_owner, labbook_name,
                         author=get_logged_in_author())

    with lb.lock():
        if action == 'link':
            if dataset_url:
                remote_domain = cls._get_remote_domain(dataset_url, dataset_owner, dataset_name)

                if remote_domain:
                    # Make sure git creds are configured for the remote
                    admin_service = None
                    for remote in lb.client_config.config['git']['remotes']:
                        if remote_domain == remote:
                            admin_service = lb.client_config.config['git']['remotes'][remote]['admin_service']
                            break

                    if "HTTP_AUTHORIZATION" in info.context.headers.environ:
                        token = parse_token(info.context.headers.environ["HTTP_AUTHORIZATION"])
                    else:
                        raise ValueError("Authorization header not provided."
                                         " Must have a valid session to query for collaborators")

                    mgr = GitLabManager(remote_domain, admin_service, token)
                    mgr.configure_git_credentials(remote_domain, logged_in_username)
            else:
                # Link to local dataset
                ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)
                dataset_url = f"{ds.root_dir}/.git"

            # Link the dataset to the labbook
            ds = im.link_dataset_to_labbook(dataset_url, dataset_owner, dataset_name, lb)
            ds.namespace = dataset_owner

            # Preload the dataloader
            info.context.dataset_loader.prime(f"{get_logged_in_username()}&{dataset_owner}&{dataset_name}", ds)

            # Relink the revision
            m = Manifest(ds, logged_in_username)
            m.link_revision()
        elif action == 'unlink':
            im.unlink_dataset_from_labbook(dataset_owner, dataset_name, lb)
        elif action == 'update':
            ds = im.update_linked_dataset_reference(dataset_owner, dataset_name, lb)
            m = Manifest(ds, logged_in_username)
            m.force_reload()

            info.context.dataset_loader.prime(f"{get_logged_in_username()}&{dataset_owner}&{dataset_name}", ds)
        else:
            raise ValueError("Unsupported action. Use `link`, `unlink`, or `update`")

    info.context.labbook_loader.prime(f"{get_logged_in_username()}&{labbook_owner}&{labbook_name}", lb)
    edge = LabbookConnection.Edge(node=Labbook(owner=labbook_owner, name=labbook_name),
                                  cursor=base64.b64encode(f"{0}".encode('utf-8')))

    return ModifyDatasetLink(new_labbook_edge=edge)
def create_dataset(self, username: str, owner: str, dataset_name: str, storage_type: str,
                   description: Optional[str] = None, author: Optional[GitAuthor] = None) -> Dataset:
    """Create a new Dataset in this Gigantum working directory.

    Args:
        username: Active username
        owner: Namespace in which to place this Dataset
        dataset_name: Name of the Dataset
        storage_type: String identifying the type of Dataset to instantiate
        description: Optional brief description of Dataset
        author: Optional Git Author

    Returns:
        Newly created Dataset instance
    """
    dataset = Dataset(config_file=self.config_file, author=author, namespace=owner)

    if storage_type not in SUPPORTED_STORAGE_BACKENDS:
        raise ValueError(f"Unsupported Dataset storage type: {storage_type}")

    try:
        build_info = Configuration(self.config_file).config['build_info']
    except KeyError:
        logger.warning("Could not obtain build_info from config")
        build_info = None

    # Build data file contents
    dataset._data = {
        "schema": DATASET_CURRENT_SCHEMA,
        "id": uuid.uuid4().hex,
        "name": dataset_name,
        "storage_type": storage_type,
        "description": description or '',
        "created_on": datetime.datetime.utcnow().isoformat(),
        "build_info": build_info
    }
    dataset._validate_gigantum_data()

    logger.info("Creating new Dataset on disk for {}/{}/{}".format(username, owner, dataset_name))

    # lock while creating initial directory
    with dataset.lock(lock_key=f"new_dataset_lock|{username}|{owner}|{dataset_name}"):
        # Verify or Create user subdirectory
        # Make sure you expand a user dir string
        starting_dir = os.path.expanduser(dataset.client_config.config["git"]["working_directory"])
        user_dir = os.path.join(starting_dir, username)
        if not os.path.isdir(user_dir):
            os.makedirs(user_dir)

        # Create owner dir - store LabBooks in working dir > logged in user > owner
        owner_dir = os.path.join(user_dir, owner)
        if not os.path.isdir(owner_dir):
            os.makedirs(owner_dir)

            # Create `datasets` subdir in the owner dir
            owner_dir = os.path.join(owner_dir, "datasets")
        else:
            owner_dir = os.path.join(owner_dir, "datasets")

        # Verify name not already in use
        if os.path.isdir(os.path.join(owner_dir, dataset_name)):
            raise ValueError(f"Dataset `{dataset_name}` already exists locally. Choose a new Dataset name")

        # Create Dataset subdirectory
        new_root_dir = os.path.join(owner_dir, dataset_name)
        os.makedirs(new_root_dir)
        dataset._set_root_dir(new_root_dir)

        # Init repository
        dataset.git.initialize()

        # Create Directory Structure
        dirs = [
            'manifest', 'metadata', '.gigantum',
            os.path.join('.gigantum', 'favorites'),
            os.path.join('.gigantum', 'activity'),
            os.path.join('.gigantum', 'activity', 'log')
        ]

        for d in dirs:
            p = os.path.join(dataset.root_dir, d, '.gitkeep')
            os.makedirs(os.path.dirname(p), exist_ok=True)
            with open(p, 'w') as gk:
                gk.write("This file is necessary to keep this directory tracked by Git"
                         " and archivable by compression tools. Do not delete or modify!")

        dataset._save_gigantum_data()

        # Create an empty storage.json file
        dataset.backend_config = {}

        # Create .gitignore default file
        shutil.copyfile(os.path.join(resource_filename('gtmcore', 'dataset'), 'gitignore.default'),
                        os.path.join(dataset.root_dir, ".gitignore"))

        # Commit
        dataset.git.add_all()

        # NOTE: this string is used to indicate there are no more activity records to get.
        # Changing the string will break activity paging.
        # TODO: Improve method for detecting the first activity record
        dataset.git.commit(f"Creating new empty Dataset: {dataset_name}")

        # Create Activity Record
        adr = ActivityDetailRecord(ActivityDetailType.DATASET, show=False, importance=0)
        adr.add_value('text/plain', f"Created new Dataset: {username}/{dataset_name}")
        ar = ActivityRecord(ActivityType.DATASET,
                            message=f"Created new Dataset: {username}/{dataset_name}",
                            show=True,
                            importance=255,
                            linked_commit=dataset.git.commit_hash)
        ar.add_detail_object(adr)
        store = ActivityStore(dataset)
        store.create_activity_record(ar)

        # Initialize file cache and link revision
        m = Manifest(dataset, username)
        m.link_revision()

        return dataset
def test_complete_dataset_upload_transaction_failure(self, mock_config_file_background_tests):
    im = InventoryManager(mock_config_file_background_tests[0])
    ds = im.create_dataset('default', 'default', "new-ds",
                           storage_type="gigantum_object_v1",
                           description="100")
    m = Manifest(ds, 'default')
    dispatcher_obj = Dispatcher()

    helper_write_big_file(m.cache_mgr.cache_root, m.dataset_revision, "test1.dat", "12")
    helper_write_big_file(m.cache_mgr.cache_root, m.dataset_revision, "test2.dat", "23")
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "zztest3.txt", "fake content 3")
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "zztest4.txt", "fake content 4")
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "zztest5.txt", "fake content 5")
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "zztest6.txt", "fake content 6")

    job_kwargs = {
        'dispatcher': Dispatcher,
        'logged_in_username': "******",
        'logged_in_email': "*****@*****.**",
        'dataset_owner': "default",
        'dataset_name': "new-ds",
        'config_file': mock_config_file_background_tests[0]
    }
    job_metadata = {
        'dataset': f"default|default|new-ds",
        'method': 'complete_dataset_upload_transaction'
    }

    assert len(m.manifest) == 0
    job_key = dispatcher_obj.dispatch_task(
        gtmcore.dispatcher.dataset_jobs.complete_dataset_upload_transaction,
        kwargs=job_kwargs, metadata=job_metadata)

    time.sleep(3)

    # Remove files to make them fail
    os.remove(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, "zztest4.txt"))
    os.remove(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, "zztest5.txt"))

    cnt = 0
    while cnt < 120:
        job_status = dispatcher_obj.query_task(job_key)

        if job_status.status == 'finished':
            break

        time.sleep(1)
        cnt += 1

    assert cnt < 119

    m = Manifest(ds, 'default')
    assert len(m.manifest) == 4
    assert 'test1.dat' in m.manifest
    assert 'test2.dat' in m.manifest
    assert 'zztest3.txt' in m.manifest
    assert 'zztest6.txt' in m.manifest
    assert 'zztest5.txt' not in m.manifest
    assert 'zztest4.txt' not in m.manifest

    assert job_status.meta['has_failures'] is True
    assert 'The following files failed to hash. Try re-uploading the files again:\nzztest4.txt \nzztest5.txt' ==\
           job_status.meta['failure_detail']
    assert 'An error occurred while processing some files. Check details and re-upload.' == \
           job_status.meta['feedback']
def test_complete_dataset_upload_transaction_all_types(self, mock_config_file_background_tests):
    im = InventoryManager(mock_config_file_background_tests[0])
    ds = im.create_dataset('default', 'default', "new-ds",
                           storage_type="gigantum_object_v1",
                           description="100")
    m = Manifest(ds, 'default')

    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "test1.txt", "fake content 1")
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "test2.txt", "fake content 2")
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "test3.txt", "fake content 3")

    dl_kwargs = {
        'dispatcher': Dispatcher,
        'logged_in_username': "******",
        'logged_in_email': "*****@*****.**",
        'dataset_owner': "default",
        'dataset_name': "new-ds",
        'config_file': mock_config_file_background_tests[0]
    }

    assert len(m.manifest) == 0
    gtmcore.dispatcher.dataset_jobs.complete_dataset_upload_transaction(**dl_kwargs)

    m = Manifest(ds, 'default')

    # make sure manifest got updated
    assert len(m.manifest) == 3
    assert 'test1.txt' in m.manifest
    assert 'test2.txt' in m.manifest
    assert 'test3.txt' in m.manifest
    hash1 = m.manifest['test1.txt']['h']

    # Make sure activity created
    assert len(ds.git.log()) == 6
    assert "_GTM_ACTIVITY_START_" in ds.git.log()[0]['message']
    assert "Uploaded 3 new file(s)." in ds.git.log()[0]['message']

    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "test1.txt", "fake content changed")
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "test4.txt", "fake content 4")
    os.remove(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, "test3.txt"))

    gtmcore.dispatcher.dataset_jobs.complete_dataset_upload_transaction(**dl_kwargs)

    m = Manifest(ds, 'default')

    # make sure manifest got updated
    assert len(m.manifest) == 3
    assert 'test1.txt' in m.manifest
    assert 'test2.txt' in m.manifest
    assert 'test4.txt' in m.manifest
    assert hash1 != m.manifest['test1.txt']['h']

    # Make sure activity created
    assert len(ds.git.log()) == 8
    assert "_GTM_ACTIVITY_START_" in ds.git.log()[0]['message']
    assert "Uploaded 1 new file(s). Uploaded 1 modified file(s). 1 deleted file(s)." in ds.git.log()[0]['message']
def test_download_dataset_files(self, mock_config_file_background_tests, mock_dataset_head):
    def dispatch_query_mock(self, job_key):
        JobStatus = namedtuple("JobStatus", ['status', 'meta'])
        return JobStatus(status='finished', meta={'completed_bytes': '500'})

    def dispatch_mock(self, method_reference, kwargs, metadata, persist):
        with aioresponses() as mocked_responses:
            mocked_responses.get(f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_1}',
                                 payload={
                                     "presigned_url": f"https://dummyurl.com/{obj_id_1}?params=1",
                                     "namespace": ds.namespace,
                                     "obj_id": obj_id_1,
                                     "dataset": ds.name
                                 },
                                 status=200)

            with open(obj1_source, 'rb') as data1:
                mocked_responses.get(f"https://dummyurl.com/{obj_id_1}?params=1",
                                     body=data1.read(),
                                     status=200,
                                     content_type='application/octet-stream')

            gtmcore.dispatcher.dataset_jobs.pull_objects(**kwargs)

            return "afakejobkey"

    im = InventoryManager(mock_config_file_background_tests[0])
    ds = im.create_dataset('default', 'default', "dataset100",
                           storage_type="gigantum_object_v1",
                           description="100")
    m = Manifest(ds, 'default')
    iom = IOManager(ds, m)

    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "test1.txt", "asdfadfsdf")
    m.sweep_all_changes()

    obj_to_push = iom.objects_to_push()
    assert len(obj_to_push) == 1
    _, obj_id_1 = obj_to_push[0].object_path.rsplit('/', 1)
    obj1_target = obj_to_push[0].object_path

    obj1_source = os.path.join('/tmp', uuid.uuid4().hex)

    assert os.path.exists(obj1_target) is True
    helper_compress_file(obj1_target, obj1_source)
    assert os.path.isfile(obj1_target) is False
    assert os.path.isfile(obj1_source) is True

    # Clear out from linked dir
    os.remove(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test1.txt'))

    with patch.object(Configuration, 'find_default_config',
                      lambda self: mock_config_file_background_tests[0]):
        with patch.object(Dispatcher, 'dispatch_task', dispatch_mock):
            with patch.object(Dispatcher, 'query_task', dispatch_query_mock):
                dl_kwargs = {
                    'logged_in_username': "******",
                    'access_token': "asdf",
                    'id_token': "1234",
                    'dataset_owner': "default",
                    'dataset_name': "dataset100",
                    'labbook_owner': None,
                    'labbook_name': None,
                    'keys': ["test1.txt"],
                    'config_file': mock_config_file_background_tests[0]
                }

                gtmcore.dispatcher.dataset_jobs.download_dataset_files(**dl_kwargs)
                assert os.path.isfile(obj1_target) is True

                decompressor = snappy.StreamDecompressor()
                with open(obj1_source, 'rb') as dd:
                    source1 = decompressor.decompress(dd.read())
                    source1 += decompressor.flush()
                with open(obj1_target, 'rt') as dd:
                    dest1 = dd.read()
                assert source1.decode("utf-8") == dest1
def test_download_dataset_files_file_fail(self, mock_config_file_background_tests):
    def dispatch_query_mock(self, job_key):
        # mock the job actually running and returning status
        JobStatus = namedtuple("JobStatus", ['status', 'meta'])
        return JobStatus(status='finished', meta={'completed_bytes': '0', 'failure_keys': 'test1.txt'})

    def dispatch_mock(self, method_reference, kwargs, metadata, persist):
        gtmcore.dispatcher.dataset_jobs.pull_objects(**kwargs)
        return "afakejobkey"

    im = InventoryManager(mock_config_file_background_tests[0])
    ds = im.create_dataset('default', 'default', "dataset100",
                           storage_type="gigantum_object_v1",
                           description="100")
    m = Manifest(ds, 'default')
    iom = IOManager(ds, m)

    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "test1.txt", "asdfadfsdf")
    m.sweep_all_changes()

    obj_to_push = iom.objects_to_push()
    assert len(obj_to_push) == 1
    _, obj_id_1 = obj_to_push[0].object_path.rsplit('/', 1)
    obj1_target = obj_to_push[0].object_path

    obj1_source = os.path.join('/tmp', uuid.uuid4().hex)

    assert os.path.exists(obj1_target) is True
    helper_compress_file(obj1_target, obj1_source)
    assert os.path.isfile(obj1_target) is False
    assert os.path.isfile(obj1_source) is True

    # Clear out from linked dir
    os.remove(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test1.txt'))

    with patch.object(Configuration, 'find_default_config',
                      lambda self: mock_config_file_background_tests[0]):
        with patch.object(Dispatcher, 'dispatch_task', dispatch_mock):
            with patch.object(Dispatcher, 'query_task', dispatch_query_mock):
                dl_kwargs = {
                    'logged_in_username': "******",
                    'access_token': "asdf",
                    'id_token': "1234",
                    'dataset_owner': "default",
                    'dataset_name': "dataset100",
                    'labbook_owner': None,
                    'labbook_name': None,
                    'keys': ["test1.txt"],
                    'config_file': mock_config_file_background_tests[0]
                }

                with pytest.raises(IOError):
                    gtmcore.dispatcher.dataset_jobs.download_dataset_files(**dl_kwargs)

                assert os.path.isfile(obj1_target) is False
def test_add_file(self, mock_create_dataset):
    """Test adding a new file to a dataset"""
    class DummyContext(object):
        def __init__(self, file_handle):
            self.dataset_loader = None
            self.files = {'uploadChunk': file_handle}

    client = Client(mock_create_dataset[3], middleware=[DataloaderMiddleware()])

    # Create file to upload
    test_file = os.path.join(tempfile.gettempdir(), "myValidFile.dat")
    est_size = 9000000
    try:
        os.remove(test_file)
    except:
        pass
    with open(test_file, 'wb') as tf:
        tf.write(os.urandom(est_size))

    new_file_size = os.path.getsize(tf.name)

    # Get upload params
    chunk_size = 4194000
    file_info = os.stat(test_file)
    file_size = int(file_info.st_size / 1000)
    total_chunks = int(math.ceil(file_info.st_size / chunk_size))

    ds = InventoryManager(mock_create_dataset[0]).load_dataset('default', 'default', 'dataset1')

    fsc = HostFilesystemCache(ds, 'default')
    target_file = os.path.join(fsc.current_revision_dir, "myValidFile.dat")

    txid = "000-unitest-transaction"

    with open(test_file, 'rb') as tf:
        # Check for file to exist (shouldn't yet)
        assert os.path.exists(target_file) is False

        for chunk_index in range(total_chunks):
            # Upload a chunk
            chunk = io.BytesIO()
            chunk.write(tf.read(chunk_size))
            chunk.seek(0)
            file = FileStorage(chunk)

            query = f"""
                        mutation addDatasetFile{{
                          addDatasetFile(input:{{owner:"default",
                                                 datasetName: "dataset1",
                                                 filePath: "myValidFile.dat",
                                                 transactionId: "{txid}",
                            chunkUploadParams:{{
                              uploadId: "fdsfdsfdsfdfs",
                              chunkSize: {chunk_size},
                              totalChunks: {total_chunks},
                              chunkIndex: {chunk_index},
                              fileSizeKb: {file_size},
                              filename: "{os.path.basename(test_file)}"
                            }}
                          }}) {{
                            newDatasetFileEdge {{
                              node{{
                                id
                                key
                                isDir
                                size
                              }}
                            }}
                          }}
                        }}
                        """
            r = client.execute(query, context_value=DummyContext(file))
            assert 'errors' not in r

    # So, these will only be populated once the last chunk is uploaded. Will be None otherwise.
    assert r['data']['addDatasetFile']['newDatasetFileEdge']['node']['isDir'] is False
    assert r['data']['addDatasetFile']['newDatasetFileEdge']['node']['key'] == 'myValidFile.dat'
    assert r['data']['addDatasetFile']['newDatasetFileEdge']['node']['size'] == f"{new_file_size}"

    # When done uploading, file should exist in the dataset's file cache
    assert os.path.exists(target_file)
    assert os.path.isfile(target_file)

    complete_query = f"""
                        mutation completeQuery {{
                          completeDatasetUploadTransaction(input: {{
                            owner: "default",
                            datasetName: "dataset1",
                            transactionId: "{txid}"
                          }}) {{
                            success
                          }}
                        }}
                        """
    r = client.execute(complete_query, context_value=DummyContext(file))
    assert 'errors' not in r

    m = Manifest(ds, 'default')
    status = m.status()
    assert len(status.created) == 0
    assert len(status.modified) == 0
    assert len(status.deleted) == 0

    assert 'Uploaded 1 new file(s)' in ds.git.log()[0]['message']
def test_make_directory(self, fixture_working_dir, snapshot):
    im = InventoryManager(fixture_working_dir[0])
    ds = im.create_dataset('default', 'default', "dataset-dir",
                           storage_type="gigantum_object_v1",
                           description="testing move")
    m = Manifest(ds, 'default')
    m.link_revision()

    query = """
               mutation myMutation {
                 makeDatasetDirectory(input: {datasetOwner: "default", datasetName: "dataset-dir",
                                              key: "test_dir1/"}) {
                     newDatasetFileEdge {
                       node {
                         id
                         key
                         isDir
                         isLocal
                         size
                       }
                     }
                 }
               }
               """
    result = fixture_working_dir[2].execute(query)
    assert 'errors' not in result
    assert result['data']['makeDatasetDirectory']['newDatasetFileEdge']['node']['key'] == 'test_dir1/'
    assert result['data']['makeDatasetDirectory']['newDatasetFileEdge']['node']['isDir'] is True
    assert result['data']['makeDatasetDirectory']['newDatasetFileEdge']['node']['isLocal'] is True
    assert result['data']['makeDatasetDirectory']['newDatasetFileEdge']['node']['size'] == '0'

    assert os.path.isdir(os.path.join(m.cache_mgr.current_revision_dir, "test_dir1")) is True

    query = """
               mutation myMutation {
                 makeDatasetDirectory(input: {datasetOwner: "default", datasetName: "dataset-dir",
                                              key: "test_dir1/test_dir2/"}) {
                     newDatasetFileEdge {
                       node {
                         id
                         key
                         isDir
                         isLocal
                         size
                       }
                     }
                 }
               }
               """
    result = fixture_working_dir[2].execute(query)
    assert 'errors' not in result
    assert result['data']['makeDatasetDirectory']['newDatasetFileEdge']['node']['key'] == 'test_dir1/test_dir2/'
    assert result['data']['makeDatasetDirectory']['newDatasetFileEdge']['node']['isDir'] is True
    assert result['data']['makeDatasetDirectory']['newDatasetFileEdge']['node']['isLocal'] is True
    assert result['data']['makeDatasetDirectory']['newDatasetFileEdge']['node']['size'] == '0'

    assert os.path.isdir(os.path.join(m.cache_mgr.current_revision_dir, "test_dir1")) is True
    assert os.path.isdir(os.path.join(m.cache_mgr.current_revision_dir, "test_dir1", "test_dir2")) is True
def _push_dataset_objects(self, logged_in_username: str, feedback_callback: Callable,
                          access_token, id_token) -> None:
    """Method to schedule a push operation

    Args:
        logged_in_username:
        feedback_callback:
        access_token:
        id_token:

    Returns:

    """
    dispatcher_obj = Dispatcher()

    try:
        self.dataset.backend.set_default_configuration(logged_in_username, access_token, id_token)
        m = Manifest(self.dataset, logged_in_username)
        iom = IOManager(self.dataset, m)

        obj_batches, total_bytes, num_files = iom.compute_push_batches()

        if obj_batches:
            # Schedule jobs for batches
            bg_jobs = list()
            for objs in obj_batches:
                job_kwargs = {
                    'objs': objs,
                    'logged_in_username': logged_in_username,
                    'access_token': access_token,
                    'id_token': id_token,
                    'dataset_owner': self.dataset.namespace,
                    'dataset_name': self.dataset.name,
                    'config_file': self.dataset.client_config.config_file,
                }
                job_metadata = {
                    'dataset': f"{logged_in_username}|{self.dataset.namespace}|{self.dataset.name}",
                    'method': 'pull_objects'
                }

                feedback_callback(f"Preparing to upload {num_files} files. Please wait...")
                job_key = dispatcher_obj.dispatch_task(
                    method_reference=gtmcore.dispatcher.dataset_jobs.push_dataset_objects,
                    kwargs=job_kwargs,
                    metadata=job_metadata,
                    persist=True)
                bg_jobs.append(BackgroundUploadJob(dispatcher_obj, objs, job_key))
                logger.info(f"Schedule dataset object upload job for"
                            f" {logged_in_username}/{self.dataset.namespace}/{self.dataset.name} with"
                            f" {len(objs)} objects to upload")

            while sum([(x.is_complete or x.is_failed) for x in bg_jobs]) != len(bg_jobs):
                # Refresh all job statuses and update status feedback
                [j.refresh_status() for j in bg_jobs]
                total_completed_bytes = sum([j.completed_bytes for j in bg_jobs])
                if total_completed_bytes > 0:
                    pc = (float(total_completed_bytes) / float(total_bytes)) * 100
                    feedback_callback(f"Please wait - Uploading {num_files} files"
                                      f" ({format_size(total_completed_bytes)}"
                                      f" of {format_size(total_bytes)}) - {round(pc)}% complete",
                                      percent_complete=pc)

                time.sleep(1)

            # if you get here, all jobs are done or failed.
            # Remove all the push files so they can be regenerated if needed
            for f in glob.glob(f'{iom.push_dir}/*'):
                os.remove(f)

            # Aggregate failures if they exist
            failure_keys: List[str] = list()
            for j in bg_jobs:
                if j.is_failed:
                    # Background job hard failed. Assume entire batch should get re-uploaded
                    for obj in j.objs:
                        failure_keys.append(f"{obj.dataset_path} at {obj.revision[0:8]}")
                        m.queue_to_push(obj.object_path, obj.dataset_path, obj.revision)
                else:
                    for obj in j.get_failed_objects():
                        # Some individual objects failed
                        failure_keys.append(f"{obj.dataset_path} at {obj.revision[0:8]}")
                        m.queue_to_push(obj.object_path, obj.dataset_path, obj.revision)

            # Set final status for UI
            if len(failure_keys) == 0:
                feedback_callback(f"Upload complete!", percent_complete=100, has_failures=False)
            else:
                failure_str = "\n".join(failure_keys)
                failure_detail_str = f"Files that failed to upload:\n{failure_str}"
                feedback_callback("", percent_complete=100, has_failures=True,
                                  failure_detail=failure_detail_str)

            # Finish up by linking everything just in case
            iom.manifest.link_revision()

            if len(failure_keys) > 0:
                # If any uploads failed, exit non-zero so the UI knows there was an error
                raise IOError(f"{len(failure_keys)} file(s) failed to upload. Check message detail"
                              " for more information and try to sync again.")

    except Exception as err:
        logger.exception(err)
        raise