Example #1
    def helper_resolve_all_files(self, dataset, kwargs):
        """Helper method to populate the DatasetFileConnection"""
        manifest = Manifest(dataset, get_logged_in_username())

        if "after" in kwargs:
            after_index = int(base64.b64decode(kwargs["after"]))
        else:
            after_index = 0

        # Generate naive cursors
        edges, indexes = manifest.list(first=kwargs.get("first"),
                                       after_index=after_index)
        cursors = [
            base64.b64encode("{}".format(x).encode("UTF-8")).decode("UTF-8")
            for x in indexes
        ]

        edge_objs = []
        for edge, cursor in zip(edges, cursors):
            create_data = {
                "owner": self.owner,
                "name": self.name,
                "key": edge['key'],
                "_file_info": edge
            }
            edge_objs.append(
                DatasetFileConnection.Edge(node=DatasetFile(**create_data),
                                           cursor=cursor))

        has_previous_page = False
        has_next_page = len(edges) > 0
        start_cursor = None
        end_cursor = None
        if cursors:
            start_cursor = cursors[0]
            end_cursor = cursors[-1]
            if indexes[-1] == len(manifest.manifest) - 1:
                has_next_page = False

        if kwargs.get("after"):
            if int(base64.b64decode(kwargs["after"])) > 0:
                has_previous_page = True

        page_info = graphene.relay.PageInfo(
            has_next_page=has_next_page,
            has_previous_page=has_previous_page,
            start_cursor=start_cursor,
            end_cursor=end_cursor)

        return DatasetFileConnection(edges=edge_objs, page_info=page_info)
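
The cursors above are intentionally naive: each one is just the manifest index, base64-encoded so the client can treat it as opaque. A minimal, self-contained sketch of that round trip (plain standard library, not part of the Gigantum API):

import base64

def encode_cursor(index: int) -> str:
    # An opaque cursor is nothing more than the stringified index, base64-encoded
    return base64.b64encode(str(index).encode("UTF-8")).decode("UTF-8")

def decode_cursor(cursor: str) -> int:
    # Reverse of encode_cursor(); mirrors int(base64.b64decode(kwargs["after"])) above
    return int(base64.b64decode(cursor))

assert decode_cursor(encode_cursor(41)) == 41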
Example #2
    def test_complete_dataset_upload_transaction_simple(
            self, mock_config_file_background_tests):
        im = InventoryManager(mock_config_file_background_tests[0])
        ds = im.create_dataset('default',
                               'default',
                               "new-ds",
                               storage_type="gigantum_object_v1",
                               description="100")
        m = Manifest(ds, 'default')

        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "test1.txt", "fake content!")
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "test2.txt", "moar fake content!")

        dl_kwargs = {
            'dispatcher': Dispatcher,
            'logged_in_username': "******",
            'logged_in_email': "*****@*****.**",
            'dataset_owner': "default",
            'dataset_name': "new-ds",
            'config_file': mock_config_file_background_tests[0]
        }

        assert len(m.manifest) == 0
        gtmcore.dispatcher.dataset_jobs.complete_dataset_upload_transaction(
            **dl_kwargs)

        m = Manifest(ds, 'default')

        # make sure manifest got updated
        assert len(m.manifest) == 2
        assert 'test1.txt' in m.manifest
        assert 'test2.txt' in m.manifest

        assert m.manifest['test1.txt']['b'] == '13'
        assert len(m.manifest['test1.txt']['h']) == 128
        assert 'manifest-' in m.manifest['test1.txt']['fn']

        assert m.manifest['test2.txt']['b'] == '18'
        assert len(m.manifest['test2.txt']['h']) == 128
        assert 'manifest-' in m.manifest['test2.txt']['fn']

        assert m.manifest['test2.txt']['h'] != m.manifest['test1.txt']['h']

        # Make sure activity created
        assert len(ds.git.log()) == 6
        assert "_GTM_ACTIVITY_START_" in ds.git.log()[0]['message']
        assert "Uploaded 2 new file(s)." in ds.git.log()[0]['message']
Example #3
def generate_bg_hash_job_list(filenames: List[str],
                              manifest: Manifest,
                              dispatcher_obj: Dispatcher) -> List[BackgroundHashJob]:
    """Method to generate batches of files to be hashed, ensuring files aren't added to a batch once it is
    larger than MAX_JOB_BYTES

    Args:
        filenames: list of files to be hashed
        manifest: the Manifest instance
        dispatcher_obj: the Dispatcher instance

    Returns:
        list
    """
    num_cores = manifest.get_num_hashing_cpus()
    file_lists: List[List] = [list() for _ in range(num_cores)]
    size_sums = [0 for _ in range(num_cores)]
    revision_dir = manifest.current_revision_dir

    for filename in filenames:
        index = size_sums.index(min(size_sums))
        file_lists[index].append(filename)
        size_sums[index] += os.path.getsize(os.path.join(revision_dir, filename))
        if all(fs > MAX_JOB_BYTES for fs in size_sums):
            # 1GB of data to hash already in every job. Add another.
            file_lists.append(list())
            size_sums.append(0)

    # Prune Jobs back if there are lots of cores but not lots of work
    file_lists = [x for x in file_lists if x != []]
    size_sums = [x for x in size_sums if x != 0]

    # Prep hashing jobs
    return [BackgroundHashJob(dispatcher_obj, fl, ss) for ss, fl in zip(size_sums, file_lists)]
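
The loop above is a greedy minimum-bin assignment: every file goes into the currently smallest batch, and a fresh batch is appended once every batch already exceeds MAX_JOB_BYTES. A standalone sketch of the same idea over plain byte counts (no Manifest or Dispatcher; the MAX_JOB_BYTES value is an assumption based on the 1GB comment above):

from typing import Dict, List, Tuple

MAX_JOB_BYTES = 1 * 1024 ** 3  # assumed 1 GB cap per batch

def assign_to_batches(file_sizes: Dict[str, int],
                      num_batches: int) -> Tuple[List[List[str]], List[int]]:
    # Greedily place each file into the smallest batch, growing the batch count
    # once every batch is already over the cap
    batches: List[List[str]] = [[] for _ in range(num_batches)]
    totals: List[int] = [0] * num_batches
    for name, size in file_sizes.items():
        i = totals.index(min(totals))
        batches[i].append(name)
        totals[i] += size
        if all(t > MAX_JOB_BYTES for t in totals):
            batches.append([])
            totals.append(0)
    # Prune empty batches when there are more workers than work
    return [b for b in batches if b], [t for t in totals if t]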
Example #4
    def _get_dataset_file_info(self, dataset) -> dict:
        """helper method to iterate over the manifest and get file info for the overview page

        Returns:
            None
        """
        m = Manifest(dataset, get_logged_in_username())

        count = 0
        total_bytes = 0
        file_type_distribution: OrderedDict = OrderedDict()
        for key in m.manifest:
            item = m.manifest[key]
            if key[-1] == '/':
                # Skip directories
                continue

            filename = os.path.basename(key)
            if filename[0] == '.':
                # Skip hidden files
                continue

            if '.' not in filename:
                # Skip files without an extension
                continue

            # Count file type distribution
            _, ext = os.path.splitext(filename)
            if ext:
                file_type = ext
                if file_type in file_type_distribution:
                    file_type_distribution[file_type] += 1
                else:
                    file_type_distribution[file_type] = 1

            # Count total file size
            total_bytes += int(item['b'])

            # Count files
            count += 1

        # Format the output for file type distribution
        formatted_file_type_info: List[str] = list()
        file_type_distribution = OrderedDict(
            sorted(file_type_distribution.items(),
                   key=itemgetter(1),
                   reverse=True))
        for file_type in file_type_distribution:
            percentage = float(
                file_type_distribution[file_type]) / float(count)
            formatted_file_type_info.append(f"{percentage:.2f}|{file_type}")

        self._dataset_file_info = {
            'num_files': count,
            'total_bytes': total_bytes,
            'local_bytes': count,
            'file_type_distribution': formatted_file_type_info
        }

        return self._dataset_file_info
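
The extension tally above reduces to a small standalone computation. A sketch over a plain list of manifest keys (note the formatted value is the fraction of files, rendered in the same "value|extension" layout used above):

import os
from typing import Dict, List

def extension_distribution(keys: List[str]) -> List[str]:
    # Count files per extension, skipping directories, hidden files, and extension-less names
    counts: Dict[str, int] = {}
    total = 0
    for key in keys:
        if key.endswith('/'):
            continue
        filename = os.path.basename(key)
        if filename.startswith('.') or '.' not in filename:
            continue
        _, ext = os.path.splitext(filename)
        counts[ext] = counts.get(ext, 0) + 1
        total += 1
    ranked = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
    return [f"{n / total:.2f}|{ext}" for ext, n in ranked]

print(extension_distribution(["a.txt", "b.txt", "img/c.png", "notes/"]))  # ['0.67|.txt', '0.33|.png']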
Example #5
    def _push_dataset_objects(self, dataset: Dataset, logged_in_username: str,
                              feedback_callback: Callable, access_token,
                              id_token) -> None:
        """Push the dataset's objects to the configured backend and relink the revision"""
        dataset.backend.set_default_configuration(logged_in_username,
                                                  access_token, id_token)
        m = Manifest(dataset, logged_in_username)
        iom = IOManager(dataset, m)
        iom.push_objects(status_update_fn=feedback_callback)
        iom.manifest.link_revision()
Example #6
    def test_move_dataset_file(self, fixture_working_dir, snapshot):
        im = InventoryManager(fixture_working_dir[0])
        ds = im.create_dataset('default',
                               'default',
                               "dataset-move",
                               storage_type="gigantum_object_v1",
                               description="testing move")
        m = Manifest(ds, 'default')

        revision = m.dataset_revision
        helper_append_file(m.cache_mgr.cache_root, revision, "test1.txt",
                           "asdfasdghndfdf")
        m.sweep_all_changes()

        revision = m.dataset_revision
        cr = m.cache_mgr.cache_root
        assert os.path.exists(os.path.join(cr, revision, "test1.txt")) is True

        query = """
                   mutation myMutation {
                     moveDatasetFile(input: {datasetOwner: "default", datasetName: "dataset-move", 
                                             srcPath: "test1.txt", dstPath: "test1-renamed.txt"}) {
                         updatedEdges {
                            node {
                              id
                              key
                              isDir
                              isLocal
                              size
                            }
                         }
                     }
                   }
                   """
        result = fixture_working_dir[2].execute(query)
        assert 'errors' not in result
        snapshot.assert_match(result)

        revision = m.dataset_revision
        cr = m.cache_mgr.cache_root
        assert os.path.exists(os.path.join(cr, revision, "test1.txt")) is False
        assert os.path.exists(os.path.join(cr, revision,
                                           "test1-renamed.txt")) is True
Example #7
    def mutate_and_get_payload(cls,
                               root,
                               info,
                               dataset_owner,
                               dataset_name,
                               keys,
                               client_mutation_id=None):
        logged_in_username = get_logged_in_username()
        ds = InventoryManager().load_dataset(logged_in_username,
                                             dataset_owner,
                                             dataset_name,
                                             author=get_logged_in_author())
        ds.namespace = dataset_owner
        m = Manifest(ds, logged_in_username)

        with ds.lock():
            m.delete(keys)

        return DeleteDatasetFiles(success=True)
Example #8
    def test_delete_dataset_while_linked(self, mock_config_file):
        inv_manager = InventoryManager(mock_config_file[0])
        auth = GitAuthor(name="test", email="*****@*****.**")
        lb = inv_manager.create_labbook("test",
                                        "test",
                                        "labbook1",
                                        description="my first labbook")
        ds = inv_manager.create_dataset("test",
                                        "test",
                                        "dataset1",
                                        "gigantum_object_v1",
                                        description="my first dataset",
                                        author=auth)
        ds_root_dir = ds.root_dir
        lb_root_dir = lb.root_dir
        assert os.path.exists(ds_root_dir) is True
        assert os.path.exists(lb_root_dir) is True

        # Link dataset
        inv_manager.link_dataset_to_labbook(f"{ds_root_dir}/.git", "test",
                                            "dataset1", lb)

        m = Manifest(ds, 'test')
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "test1.txt", "asdfasdf")
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "test2.txt", "dfg")

        assert os.path.exists(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         "test1.txt")) is True
        assert os.path.exists(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         "test2.txt")) is True

        dataset_delete_job = inv_manager.delete_dataset(
            "test", "test", "dataset1")
        assert os.path.exists(ds_root_dir) is False
        assert os.path.exists(lb_root_dir) is True
        assert os.path.exists(m.cache_mgr.cache_root) is True
        assert dataset_delete_job.namespace == "test"
        assert dataset_delete_job.name == "dataset1"
        assert dataset_delete_job.cache_root == m.cache_mgr.cache_root

        jobs.clean_dataset_file_cache("test",
                                      dataset_delete_job.namespace,
                                      dataset_delete_job.name,
                                      dataset_delete_job.cache_root,
                                      config_file=mock_config_file[0])

        assert os.path.exists(m.cache_mgr.cache_root) is True

        cache_base, _ = m.cache_mgr.cache_root.rsplit(os.path.sep, 1)
        assert os.path.exists(cache_base) is True
Example #9
    def mutate_and_get_payload(cls,
                               root,
                               info,
                               dataset_owner,
                               dataset_name,
                               src_path,
                               dst_path,
                               client_mutation_id=None):
        logged_in_username = get_logged_in_username()
        ds = InventoryManager().load_dataset(logged_in_username,
                                             dataset_owner,
                                             dataset_name,
                                             author=get_logged_in_author())
        ds.namespace = dataset_owner
        m = Manifest(ds, logged_in_username)

        with ds.lock():
            edge_data = m.move(src_path, dst_path)

        file_edges = list()
        for edge_dict in edge_data:
            file_edges.append(
                DatasetFile(owner=dataset_owner,
                            name=dataset_name,
                            key=edge_dict['key'],
                            is_dir=edge_dict['is_dir'],
                            is_favorite=edge_dict['is_favorite'],
                            modified_at=edge_dict['modified_at'],
                            is_local=edge_dict['is_local'],
                            size=str(edge_dict['size'])))

        cursors = [
            base64.b64encode("{}".format(cnt).encode("UTF-8")).decode("UTF-8")
            for cnt, x in enumerate(file_edges)
        ]

        edge_objs = [
            DatasetFileConnection.Edge(node=e, cursor=c)
            for e, c in zip(file_edges, cursors)
        ]
        return MoveDatasetFile(updated_edges=edge_objs)
Example #10
    def test_sync__dataset(self, mock_config_file):
        def update_feedback(msg: str,
                            has_failures: Optional[bool] = None,
                            failure_detail: Optional[str] = None,
                            percent_complete: Optional[float] = None):
            """Method to update the job's metadata and provide feedback to the UI"""
            assert has_failures is None or has_failures is False
            assert failure_detail is None

        def dispatch_query_mock(self, job_key):
            JobStatus = namedtuple("JobStatus", ['status', 'meta'])
            return JobStatus(status='finished',
                             meta={'completed_bytes': '100'})

        def dispatch_mock(self, method_reference, kwargs, metadata, persist):
            return "afakejobkey"

        username = '******'
        im = InventoryManager(mock_config_file[0])
        ds = im.create_dataset(username, username, 'dataset-1',
                               'gigantum_object_v1')
        m = Manifest(ds, username)
        wf = DatasetWorkflow(ds)

        iom = IOManager(ds, m)
        assert len(glob.glob(f'{iom.push_dir}/*')) == 0
        wf.publish(username=username, feedback_callback=update_feedback)

        # Put a file into the dataset that needs to be pushed
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "test1.txt", "asdfadfsdf")
        m.sweep_all_changes()

        assert len(glob.glob(f'{iom.push_dir}/*')) == 1
        with patch.object(Dispatcher, 'dispatch_task', dispatch_mock):
            with patch.object(Dispatcher, 'query_task', dispatch_query_mock):
                wf.sync(username=username, feedback_callback=update_feedback)
                assert os.path.exists(wf.remote)
                assert len(glob.glob(f'{iom.push_dir}/*')) == 0
Example #11
    def helper_resolve_all_files(self, dataset, kwargs):
        """Helper method to populate the DatasetFileConnection"""
        manifest = Manifest(dataset, get_logged_in_username())

        # Generate naive cursors
        # TODO: Use manifest pagination interface
        edges = manifest.list()
        cursors = [base64.b64encode("{}".format(cnt).encode("UTF-8")).decode("UTF-8") for cnt, x in enumerate(edges)]

        # Process slicing and cursor args
        lbc = ListBasedConnection(edges, cursors, kwargs)
        lbc.apply()

        edge_objs = []
        for edge, cursor in zip(lbc.edges, lbc.cursors):
            create_data = {"owner": self.owner,
                           "name": self.name,
                           "key": edge['key'],
                           "_file_info": edge}
            edge_objs.append(DatasetFileConnection.Edge(node=DatasetFile(**create_data), cursor=cursor))

        return DatasetFileConnection(edges=edge_objs, page_info=lbc.page_info)
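
ListBasedConnection itself is not shown in these snippets; it applies the relay slicing arguments to the pre-built edge and cursor lists. A hypothetical stand-in for the first/after handling, using the same index-based cursors (the function name and signature are illustrative, not the gtmcore API):

import base64
from typing import Any, List, Optional, Tuple

def slice_edges(edges: List[Any], cursors: List[str],
                first: Optional[int] = None,
                after: Optional[str] = None) -> Tuple[List[Any], List[str]]:
    # `after` is an index-based cursor, so slicing starts just past the decoded index
    start = int(base64.b64decode(after)) + 1 if after else 0
    edges, cursors = edges[start:], cursors[start:]
    if first is not None:
        edges, cursors = edges[:first], cursors[:first]
    return edges, cursors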
Example #12
    def _helper_local_bytes(dataset):
        """Helper to compute total size of a dataset on disk"""
        m = Manifest(dataset, get_logged_in_username())
        total_size = 0

        for dirpath, dirnames, filenames in os.walk(m.current_revision_dir):
            for f in filenames:
                if f == '.smarthash':
                    continue
                fp = os.path.join(dirpath, f)
                total_size += os.path.getsize(fp)

        return total_size
Example #13
    def test_make_directory_error(self, fixture_working_dir, snapshot):
        im = InventoryManager(fixture_working_dir[0])
        ds = im.create_dataset('default',
                               'default',
                               "dataset-dir",
                               storage_type="gigantum_object_v1",
                               description="testing move")
        m = Manifest(ds, 'default')

        # Test where the parent dir doesn't exist (because you need to create the parent first)
        query = """
                   mutation myMutation {
                     makeDatasetDirectory(input: {datasetOwner: "default", datasetName: "dataset-dir", 
                                             key: "test_dir1/test_dir2/"}) {
                         newDatasetFileEdge {
                            node {
                              id
                              key
                              isDir
                              isLocal
                              size
                            }
                         }
                     }
                   }
                   """
        result = fixture_working_dir[2].execute(query)
        assert 'errors' in result
        assert 'Parent directory' in result['errors'][0]['message']

        # Test where the trailing slash is missing
        query = """
                   mutation myMutation {
                     makeDatasetDirectory(input: {datasetOwner: "default", datasetName: "dataset-dir", 
                                             key: "test_dir1"}) {
                         newDatasetFileEdge {
                            node {
                              id
                              key
                              isDir
                              isLocal
                              size
                            }
                         }
                     }
                   }
                   """
        result = fixture_working_dir[2].execute(query)
        assert 'errors' in result
        assert 'Provided relative path must end in' in result['errors'][0][
            'message']
Example #14
    def mutate_and_get_payload(cls,
                               root,
                               info,
                               dataset_owner,
                               dataset_name,
                               key,
                               client_mutation_id=None):
        logged_in_username = get_logged_in_username()
        ds = InventoryManager().load_dataset(logged_in_username,
                                             dataset_owner,
                                             dataset_name,
                                             author=get_logged_in_author())
        ds.namespace = dataset_owner
        m = Manifest(ds, logged_in_username)

        if key[-1] != '/':
            raise ValueError(
                "Provided relative path must end in `/` to indicate it is a directory"
            )

        with ds.lock():
            file_info = m.create_directory(key)

        create_data = {
            'owner': dataset_owner,
            'name': dataset_name,
            'key': file_info['key'],
            '_file_info': file_info
        }

        # TODO: Fix cursor implementation, this currently doesn't make sense
        cursor = base64.b64encode(f"{0}".encode('utf-8'))

        return MakeDatasetDirectory(
            new_dataset_file_edge=DatasetFileConnection.Edge(
                node=DatasetFile(**create_data), cursor=cursor))
Example #15
    def delete_dataset(self, username: str, owner: str,
                       dataset_name: str) -> None:
        """Delete a Dataset from this Gigantum working directory.

        Args:
            username: Active username
            owner: Namespace in which to place this Dataset
            dataset_name: Name of the Dataset

        Returns:
            None

        """
        ds = self.load_dataset(username, owner, dataset_name)

        # Delete dataset contents from file cache
        m = Manifest(ds, username)
        shutil.rmtree(m.cache_mgr.cache_root, ignore_errors=True)

        # Delete dataset repository from working dir
        shutil.rmtree(ds.root_dir, ignore_errors=True)
Example #16
    def test_delete_dataset(self, mock_config_file):
        inv_manager = InventoryManager(mock_config_file[0])
        auth = GitAuthor(name="test", email="*****@*****.**")
        ds = inv_manager.create_dataset("test", "test", "dataset1", "gigantum_object_v1",
                                        description="my first dataset",
                                        author=auth)
        root_dir = ds.root_dir
        assert os.path.exists(root_dir) is True

        m = Manifest(ds, 'test')
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "test1.txt", "asdfasdf")
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "test2.txt", "dfg")

        assert os.path.exists(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, "test1.txt")) is True
        assert os.path.exists(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, "test2.txt")) is True

        inv_manager.delete_dataset("test", "test", "dataset1")
        assert os.path.exists(root_dir) is False
        assert os.path.exists(m.cache_mgr.cache_root) is False

        cache_base, _ = m.cache_mgr.cache_root.rsplit(os.path.sep, 1)
        assert os.path.exists(cache_base) is True
Example #17
    def test_delete_dataset_files(self, fixture_working_dir, snapshot):
        im = InventoryManager(fixture_working_dir[0])
        ds = im.create_dataset('default',
                               'default',
                               "dataset-delete",
                               storage_type="gigantum_object_v1",
                               description="testing delete")
        m = Manifest(ds, 'default')

        os.makedirs(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         "other_dir"))
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "test1.txt", "asdfadfsdf")
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "test2.txt", "fdsfgfd")
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "test3.txt", "ghgdsr")
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "other_dir/test3.txt", "hhgf")
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "other_dir/test1.txt", "jkjghfg")
        m.sweep_all_changes()

        revision = m.dataset_revision
        assert os.path.exists(
            os.path.join(m.cache_mgr.cache_root, revision,
                         "test1.txt")) is True
        assert os.path.exists(
            os.path.join(m.cache_mgr.cache_root, revision,
                         "test2.txt")) is True
        assert os.path.exists(
            os.path.join(m.cache_mgr.cache_root, revision,
                         "test3.txt")) is True
        assert os.path.exists(
            os.path.join(m.cache_mgr.cache_root, revision, "other_dir",
                         "test3.txt")) is True
        assert os.path.exists(
            os.path.join(m.cache_mgr.cache_root, revision, "other_dir",
                         "test1.txt")) is True

        query = """
                   mutation myMutation {
                     deleteDatasetFiles(input: {datasetOwner: "default", datasetName: "dataset-delete", 
                                                keys: ["test1.txt"]}) {
                         success
                     }
                   }
                   """
        result = fixture_working_dir[2].execute(query)
        assert 'errors' not in result
        assert result['data']['deleteDatasetFiles']['success'] is True

        revision = m.dataset_revision
        assert os.path.exists(
            os.path.join(m.cache_mgr.cache_root, revision,
                         "test1.txt")) is False
        assert os.path.exists(
            os.path.join(m.cache_mgr.cache_root, revision,
                         "test2.txt")) is True
        assert os.path.exists(
            os.path.join(m.cache_mgr.cache_root, revision,
                         "test3.txt")) is True
        assert os.path.exists(
            os.path.join(m.cache_mgr.cache_root, revision, "other_dir",
                         "test3.txt")) is True
        assert os.path.exists(
            os.path.join(m.cache_mgr.cache_root, revision, "other_dir",
                         "test1.txt")) is True

        query = """
                   mutation myMutation {
                     deleteDatasetFiles(input: {datasetOwner: "default", datasetName: "dataset-delete", 
                                                keys: ["test3.txt", "other_dir/"]}) {
                         success
                     }
                   }
                   """
        result = fixture_working_dir[2].execute(query)
        assert 'errors' not in result
        assert result['data']['deleteDatasetFiles']['success'] is True

        revision = m.dataset_revision
        assert os.path.exists(
            os.path.join(m.cache_mgr.cache_root, revision,
                         "test1.txt")) is False
        assert os.path.exists(
            os.path.join(m.cache_mgr.cache_root, revision,
                         "test2.txt")) is True
        assert os.path.exists(
            os.path.join(m.cache_mgr.cache_root, revision,
                         "test3.txt")) is False
        assert os.path.exists(
            os.path.join(m.cache_mgr.cache_root, revision, "other_dir",
                         "test3.txt")) is False
        assert os.path.exists(
            os.path.join(m.cache_mgr.cache_root, revision, "other_dir",
                         "test1.txt")) is False
Example #18
    def test_update_dataset_link(self, fixture_working_dir, snapshot):
        im = InventoryManager(fixture_working_dir[0])
        lb = im.create_labbook('default', 'default', 'test-lb',
                               'testing dataset links')
        ds = im.create_dataset('default',
                               'default',
                               "dataset100",
                               storage_type="gigantum_object_v1",
                               description="100")
        manifest = Manifest(ds, 'default')
        helper_append_file(manifest.cache_mgr.cache_root,
                           manifest.dataset_revision, "test1.txt", "12345")
        manifest.sweep_all_changes()

        # Fake publish to a local bare repo
        _MOCK_create_remote_repo2(ds, 'default', None, None)

        assert os.path.exists(os.path.join(lb.root_dir,
                                           '.gitmodules')) is False

        overview_query = """
                {
                  labbook(owner: "default", name:"test-lb")
                  {
                    linkedDatasets{
                      name
                      overview {
                          localBytes
                          totalBytes
                      }
                    }
                  }
                }
                """

        query = """
                   mutation myMutation($lo: String!, $ln: String!, $do: String!, $dn: String!,
                                       $a: String!, $du: String) {
                     modifyDatasetLink(input: {labbookOwner: $lo, labbookName: $ln, datasetOwner: $do, datasetName: $dn,
                                               action: $a, datasetUrl: $du}) {
                         newLabbookEdge {
                           node {
                             id
                             name
                             description
                             linkedDatasets {
                               name
                             }
                           }
                         }
                     }
                   }
                   """
        variables = {
            "lo": "default",
            "ln": "test-lb",
            "do": "default",
            "dn": "dataset100",
            "a": "link",
            "du": ds.remote
        }
        result = fixture_working_dir[2].execute(query,
                                                variable_values=variables)
        assert "errors" not in result
        snapshot.assert_match(result)

        assert os.path.exists(os.path.join(lb.root_dir, '.gitmodules')) is True
        dataset_submodule_dir = os.path.join(lb.root_dir, '.gigantum',
                                             'datasets', 'default',
                                             'dataset100')
        assert os.path.exists(dataset_submodule_dir) is True
        assert os.path.exists(os.path.join(dataset_submodule_dir,
                                           '.gigantum')) is True
        assert os.path.exists(
            os.path.join(dataset_submodule_dir, 'test_file.dat')) is False

        with open(os.path.join(lb.root_dir, '.gitmodules'), 'rt') as mf:
            data = mf.read()
        assert len(data) > 0

        # check overview
        result = fixture_working_dir[2].execute(overview_query)
        assert "errors" not in result
        assert result['data']['labbook']['linkedDatasets'][0]['overview'][
            'localBytes'] == '5'
        assert result['data']['labbook']['linkedDatasets'][0]['overview'][
            'totalBytes'] == '5'

        # Make change to published dataset
        git_dir = os.path.join(tempfile.gettempdir(),
                               'test_update_dataset_link_mutation')
        try:
            os.makedirs(git_dir)
            call_subprocess(['git', 'clone', ds.remote],
                            cwd=git_dir,
                            check=True)
            with open(os.path.join(git_dir, ds.name, 'test_file.dat'),
                      'wt') as tf:
                tf.write("Test File Contents")
            call_subprocess(['git', 'add', 'test_file.dat'],
                            cwd=os.path.join(git_dir, ds.name),
                            check=True)
            call_subprocess(['git', 'commit', '-m', 'editing repo'],
                            cwd=os.path.join(git_dir, ds.name),
                            check=True)
            call_subprocess(['git', 'push'],
                            cwd=os.path.join(git_dir, ds.name),
                            check=True)

            query = """
                       mutation myMutation($lo: String!, $ln: String!, $do: String!, $dn: String!,
                                           $a: String!) {
                         modifyDatasetLink(input: {labbookOwner: $lo, labbookName: $ln, datasetOwner: $do, datasetName: $dn,
                                                   action: $a}) {
                             newLabbookEdge {
                               node {
                                 id
                                 name
                                 description
                                 linkedDatasets {
                                   name
                                 }
                               }
                             }
                         }
                       }
                       """
            variables = {
                "lo": "default",
                "ln": "test-lb",
                "do": "default",
                "dn": "dataset100",
                "a": "update"
            }
            result = fixture_working_dir[2].execute(query,
                                                    variable_values=variables)
            assert "errors" not in result
            snapshot.assert_match(result)

            # verify change is reflected
            assert os.path.exists(
                os.path.join(dataset_submodule_dir, 'test_file.dat')) is True

            # Verify activity record
            assert "Updated Dataset `default/dataset100` link to version" in lb.git.log(
            )[0]['message']

        finally:
            if os.path.exists(git_dir):
                shutil.rmtree(git_dir)
Example #19
    def test_push_objects(self, mock_config_file, mock_dataset_head):
        im = InventoryManager(mock_config_file[0])
        ds = im.create_dataset('default',
                               'default',
                               "dataset100",
                               storage_type="gigantum_object_v1",
                               description="100")
        manifest = Manifest(ds, 'default')
        iom = IOManager(ds, manifest)

        revision = manifest.dataset_revision
        os.makedirs(
            os.path.join(manifest.cache_mgr.cache_root, revision, "other_dir"))
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test1.txt", "test content 1")
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test2.txt", "test content 2")
        manifest.sweep_all_changes()

        obj_to_push = iom.objects_to_push()
        assert len(obj_to_push) == 2
        _, obj1 = obj_to_push[0].object_path.rsplit('/', 1)
        _, obj2 = obj_to_push[1].object_path.rsplit('/', 1)

        with aioresponses() as mocked_responses:
            mocked_responses.put(
                f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj1}',
                payload={
                    "presigned_url": f"https://dummyurl.com/{obj1}?params=1",
                    "namespace": ds.namespace,
                    "key_id": "hghghg",
                    "obj_id": obj1,
                    "dataset": ds.name
                },
                status=200)
            mocked_responses.put(f"https://dummyurl.com/{obj1}?params=1",
                                 payload={},
                                 status=200)

            mocked_responses.put(
                f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj2}',
                payload={
                    "presigned_url": f"https://dummyurl.com/{obj2}?params=1",
                    "namespace": ds.namespace,
                    "key_id": "hghghg",
                    "obj_id": obj2,
                    "dataset": ds.name
                },
                status=200)
            mocked_responses.put(f"https://dummyurl.com/{obj2}?params=1",
                                 payload={},
                                 status=200)

            job_kwargs = {
                'objs': obj_to_push,
                'logged_in_username': "******",
                'access_token': "faketoken",
                'id_token': "faketoken",
                'dataset_owner': ds.namespace,
                'dataset_name': ds.name,
                'config_file': ds.client_config.config_file,
            }
            gtmcore.dispatcher.dataset_jobs.push_dataset_objects(**job_kwargs)
Example #20
    def test_verify_contents_linked_dataset(self, mock_dataset_with_local_dir):
        class JobMock():
            def __init__(self):
                self.meta = dict()

            def save_meta(self):
                pass

        CURRENT_JOB = JobMock()

        def get_current_job_mock():
            return CURRENT_JOB

        with patch('gtmcore.dispatcher.jobs.get_current_job',
                   side_effect=get_current_job_mock):
            ds = mock_dataset_with_local_dir[0]
            im = InventoryManager()

            ds.backend.update_from_remote(ds, lambda x: print(x))

            m = Manifest(ds, 'tester')
            assert len(m.manifest.keys()) == 4
            assert os.path.isfile(
                os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                             'test1.txt'))
            assert os.path.isfile(
                os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                             'test2.txt'))
            assert os.path.isfile(
                os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                             'subdir', 'test3.txt'))

            modified_items = ds.backend.verify_contents(ds, lambda x: print(x))
            assert len(modified_items) == 0

            lb = im.create_labbook("tester", "tester", 'test-labbook')
            im.link_dataset_to_labbook(f"{ds.root_dir}/.git", "tester",
                                       ds.name, lb)

            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets',
                                       'tester', ds.name)
            ds = im.load_dataset_from_directory(dataset_dir)

            test_dir = os.path.join(mock_dataset_with_local_dir[1],
                                    "local_data", "test_dir")
            with open(os.path.join(test_dir, 'test1.txt'), 'wt') as tf:
                tf.write("This file got changed in the filesystem")

            kwargs = {
                'logged_in_username': "******",
                'access_token': "asdf",
                'id_token': "1234",
                'dataset_owner': "tester",
                'dataset_name': 'dataset-1',
                'labbook_owner': "tester",
                'labbook_name': 'test-labbook'
            }

            jobs.verify_dataset_contents(**kwargs)
            job = gtmcore.dispatcher.jobs.get_current_job()

            assert 'modified_keys' in job.meta
            assert job.meta['modified_keys'] == ["test1.txt"]
            assert 'Validating contents of 3 files.' in job.meta['feedback']
Example #21
    def test_pull_objects(self, mock_config_file, mock_dataset_head):
        im = InventoryManager(mock_config_file[0])
        ds = im.create_dataset('default',
                               'default',
                               "dataset100",
                               storage_type="gigantum_object_v1",
                               description="100")
        m = Manifest(ds, 'default')
        iom = IOManager(ds, m)

        os.makedirs(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         "other_dir"))
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "test1.txt", "asdfadfsdf")
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "test2.txt", "fdsfgfd")
        m.sweep_all_changes()

        obj_to_push = iom.objects_to_push()
        assert len(obj_to_push) == 2
        _, obj_id_1 = obj_to_push[0].object_path.rsplit('/', 1)
        _, obj_id_2 = obj_to_push[1].object_path.rsplit('/', 1)
        obj1_target = obj_to_push[0].object_path
        obj2_target = obj_to_push[1].object_path

        obj1_source = os.path.join('/tmp', uuid.uuid4().hex)
        obj2_source = os.path.join('/tmp', uuid.uuid4().hex)

        assert os.path.exists(obj1_target) is True
        assert os.path.exists(obj2_target) is True
        helper_compress_file(obj1_target, obj1_source)
        helper_compress_file(obj2_target, obj2_source)
        assert os.path.isfile(obj1_target) is False
        assert os.path.isfile(obj2_target) is False
        assert os.path.isfile(obj1_source) is True
        assert os.path.isfile(obj2_source) is True

        # Clear out from linked dir
        os.remove(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'test1.txt'))
        os.remove(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'test2.txt'))

        with patch.object(Configuration, 'find_default_config',
                          lambda self: mock_config_file[0]):
            with aioresponses() as mocked_responses:
                mocked_responses.get(
                    f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_1}',
                    payload={
                        "presigned_url":
                        f"https://dummyurl.com/{obj_id_1}?params=1",
                        "namespace": ds.namespace,
                        "obj_id": obj_id_1,
                        "dataset": ds.name
                    },
                    status=200)

                with open(obj1_source, 'rb') as data1:
                    mocked_responses.get(
                        f"https://dummyurl.com/{obj_id_1}?params=1",
                        body=data1.read(),
                        status=200,
                        content_type='application/octet-stream')

                mocked_responses.get(
                    f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_2}',
                    payload={
                        "presigned_url":
                        f"https://dummyurl.com/{obj_id_2}?params=1",
                        "namespace": ds.namespace,
                        "obj_id": obj_id_2,
                        "dataset": ds.name
                    },
                    status=200)

                with open(obj2_source, 'rb') as data2:
                    mocked_responses.get(
                        f"https://dummyurl.com/{obj_id_2}?params=1",
                        body=data2.read(),
                        status=200,
                        content_type='application/octet-stream')

                dl_kwargs = {
                    'logged_in_username': "******",
                    'access_token': "asdf",
                    'id_token': "1234",
                    'dataset_owner': "default",
                    'dataset_name': "dataset100",
                    'labbook_owner': None,
                    'labbook_name': None,
                    'keys': ["test1.txt"]
                }

                gtmcore.dispatcher.dataset_jobs.pull_objects(**dl_kwargs)

                # Manually link since this is disabled by default in the job (because in real use,
                # multiple jobs run in parallel and you only want to link once)
                m.link_revision()

                assert os.path.isfile(obj1_target) is True
                assert os.path.isfile(obj2_target) is False

                decompressor = snappy.StreamDecompressor()
                with open(obj1_source, 'rb') as dd:
                    source1 = decompressor.decompress(dd.read())
                    source1 += decompressor.flush()
                with open(obj1_target, 'rt') as dd:
                    dest1 = dd.read()
                assert source1.decode("utf-8") == dest1

                # Download other file
                dl_kwargs = {
                    'logged_in_username': "******",
                    'access_token': "asdf",
                    'id_token': "1234",
                    'dataset_owner': "default",
                    'dataset_name': "dataset100",
                    'labbook_owner': None,
                    'labbook_name': None,
                    'keys': ["test2.txt"]
                }

                gtmcore.dispatcher.dataset_jobs.pull_objects(**dl_kwargs)

                # Manually link since this is disabled by default in the job (because in real use,
                # multiple jobs run in parallel and you only want to link once)
                m.link_revision()

                assert os.path.isfile(obj1_target) is True
                assert os.path.isfile(obj2_target) is True

                with open(obj1_source, 'rb') as dd:
                    source1 = decompressor.decompress(dd.read())
                    source1 += decompressor.flush()
                with open(obj1_target, 'rt') as dd:
                    dest1 = dd.read()
                assert source1.decode("utf-8") == dest1

                with open(obj2_source, 'rb') as dd:
                    source1 = decompressor.decompress(dd.read())
                    source1 += decompressor.flush()
                with open(obj2_target, 'rt') as dd:
                    dest1 = dd.read()
                assert source1.decode("utf-8") == dest1
Example #22
    def mutate_and_get_payload(cls,
                               root,
                               info,
                               labbook_owner,
                               labbook_name,
                               dataset_owner,
                               dataset_name,
                               action,
                               dataset_url=None,
                               client_mutation_id=None):
        logged_in_username = get_logged_in_username()
        im = InventoryManager()
        lb = im.load_labbook(logged_in_username,
                             labbook_owner,
                             labbook_name,
                             author=get_logged_in_author())

        with lb.lock():
            if action == 'link':
                if dataset_url:
                    remote_domain = cls._get_remote_domain(
                        dataset_url, dataset_owner, dataset_name)

                    if remote_domain:
                        # Make sure git creds are configured for the remote
                        admin_service = None
                        for remote in lb.client_config.config['git'][
                                'remotes']:
                            if remote_domain == remote:
                                admin_service = lb.client_config.config['git'][
                                    'remotes'][remote]['admin_service']
                                break
                        if "HTTP_AUTHORIZATION" in info.context.headers.environ:
                            token = parse_token(info.context.headers.
                                                environ["HTTP_AUTHORIZATION"])
                        else:
                            raise ValueError(
                                "Authorization header not provided."
                                " Must have a valid session to query for collaborators"
                            )
                        mgr = GitLabManager(remote_domain, admin_service,
                                            token)
                        mgr.configure_git_credentials(remote_domain,
                                                      logged_in_username)
                else:
                    # Link to local dataset
                    ds = im.load_dataset(logged_in_username, dataset_owner,
                                         dataset_name)
                    dataset_url = f"{ds.root_dir}/.git"

                # Link the dataset to the labbook
                ds = im.link_dataset_to_labbook(dataset_url, dataset_owner,
                                                dataset_name, lb)
                ds.namespace = dataset_owner

                # Preload the dataloader
                info.context.dataset_loader.prime(
                    f"{get_logged_in_username()}&{dataset_owner}&{dataset_name}",
                    ds)

                # Relink the revision
                m = Manifest(ds, logged_in_username)
                m.link_revision()
            elif action == 'unlink':
                im.unlink_dataset_from_labbook(dataset_owner, dataset_name, lb)
            elif action == 'update':
                ds = im.update_linked_dataset_reference(
                    dataset_owner, dataset_name, lb)
                m = Manifest(ds, logged_in_username)
                m.force_reload()

                info.context.dataset_loader.prime(
                    f"{get_logged_in_username()}&{dataset_owner}&{dataset_name}",
                    ds)
            else:
                raise ValueError(
                    "Unsupported action. Use `link`, `unlink`, or `update`")

            info.context.labbook_loader.prime(
                f"{get_logged_in_username()}&{labbook_owner}&{labbook_name}",
                lb)
            edge = LabbookConnection.Edge(node=Labbook(owner=labbook_owner,
                                                       name=labbook_name),
                                          cursor=base64.b64encode(
                                              f"{0}".encode('utf-8')))

        return ModifyDatasetLink(new_labbook_edge=edge)
Example #23
    def create_dataset(self,
                       username: str,
                       owner: str,
                       dataset_name: str,
                       storage_type: str,
                       description: Optional[str] = None,
                       author: Optional[GitAuthor] = None) -> Dataset:
        """Create a new Dataset in this Gigantum working directory.

        Args:
            username: Active username
            owner: Namespace in which to place this Dataset
            dataset_name: Name of the Dataset
            storage_type: String identifying the type of Dataset to instantiate
            description: Optional brief description of Dataset
            author: Optional Git Author

        Returns:
            Newly created Dataset instance

        """
        dataset = Dataset(config_file=self.config_file,
                          author=author,
                          namespace=owner)

        if storage_type not in SUPPORTED_STORAGE_BACKENDS:
            raise ValueError(
                f"Unsupported Dataset storage type: {storage_type}")

        try:
            build_info = Configuration(self.config_file).config['build_info']
        except KeyError:
            logger.warning("Could not obtain build_info from config")
            build_info = None

        # Build data file contents
        dataset._data = {
            "schema": DATASET_CURRENT_SCHEMA,
            "id": uuid.uuid4().hex,
            "name": dataset_name,
            "storage_type": storage_type,
            "description": description or '',
            "created_on": datetime.datetime.utcnow().isoformat(),
            "build_info": build_info
        }
        dataset._validate_gigantum_data()

        logger.info("Creating new Dataset on disk for {}/{}/{}".format(
            username, owner, dataset_name))
        # lock while creating initial directory
        with dataset.lock(
                lock_key=f"new_dataset_lock|{username}|{owner}|{dataset_name}"
        ):
            # Verify or Create user subdirectory
            # Make sure you expand a user dir string
            starting_dir = os.path.expanduser(
                dataset.client_config.config["git"]["working_directory"])
            user_dir = os.path.join(starting_dir, username)
            if not os.path.isdir(user_dir):
                os.makedirs(user_dir)

            # Create owner dir - store LabBooks in working dir > logged in user > owner
            owner_dir = os.path.join(user_dir, owner)
            if not os.path.isdir(owner_dir):
                os.makedirs(owner_dir)

                # Create `datasets` subdir in the owner dir
                owner_dir = os.path.join(owner_dir, "datasets")
            else:
                owner_dir = os.path.join(owner_dir, "datasets")

            # Verify name not already in use
            if os.path.isdir(os.path.join(owner_dir, dataset_name)):
                raise ValueError(
                    f"Dataset `{dataset_name}` already exists locally. Choose a new Dataset name"
                )

            # Create Dataset subdirectory
            new_root_dir = os.path.join(owner_dir, dataset_name)
            os.makedirs(new_root_dir)
            dataset._set_root_dir(new_root_dir)

            # Init repository
            dataset.git.initialize()

            # Create Directory Structure
            dirs = [
                'manifest', 'metadata', '.gigantum',
                os.path.join('.gigantum', 'favorites'),
                os.path.join('.gigantum', 'activity'),
                os.path.join('.gigantum', 'activity', 'log')
            ]

            for d in dirs:
                p = os.path.join(dataset.root_dir, d, '.gitkeep')
                os.makedirs(os.path.dirname(p), exist_ok=True)
                with open(p, 'w') as gk:
                    gk.write(
                        "This file is necessary to keep this directory tracked by Git"
                        " and archivable by compression tools. Do not delete or modify!"
                    )

            dataset._save_gigantum_data()

            # Create an empty storage.json file
            dataset.backend_config = {}

            # Create .gitignore default file
            shutil.copyfile(
                os.path.join(resource_filename('gtmcore', 'dataset'),
                             'gitignore.default'),
                os.path.join(dataset.root_dir, ".gitignore"))

            # Commit
            dataset.git.add_all()

            # NOTE: this string is used to indicate there are no more activity records to get. Changing the string will
            # break activity paging.
            # TODO: Improve method for detecting the first activity record
            dataset.git.commit(f"Creating new empty Dataset: {dataset_name}")

            # Create Activity Record
            adr = ActivityDetailRecord(ActivityDetailType.DATASET,
                                       show=False,
                                       importance=0)
            adr.add_value('text/plain',
                          f"Created new Dataset: {username}/{dataset_name}")
            ar = ActivityRecord(
                ActivityType.DATASET,
                message=f"Created new Dataset: {username}/{dataset_name}",
                show=True,
                importance=255,
                linked_commit=dataset.git.commit_hash)
            ar.add_detail_object(adr)
            store = ActivityStore(dataset)
            store.create_activity_record(ar)

            # Initialize file cache and link revision
            m = Manifest(dataset, username)
            m.link_revision()

            return dataset
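
A minimal usage sketch for create_dataset, following the pattern repeated in the tests above. The import paths are assumptions, since the snippets reference InventoryManager and Manifest without showing their imports:

# Assumed import locations -- adjust to wherever these classes live in your checkout
from gtmcore.inventory.inventory import InventoryManager
from gtmcore.dataset.manifest import Manifest

im = InventoryManager()  # or InventoryManager(config_file), as in the tests
ds = im.create_dataset('default', 'default', 'my-dataset',
                       storage_type="gigantum_object_v1",
                       description="example dataset")
m = Manifest(ds, 'default')  # file cache is already initialized and the revision linked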
Example #24
    def test_complete_dataset_upload_transaction_failure(
            self, mock_config_file_background_tests):
        im = InventoryManager(mock_config_file_background_tests[0])
        ds = im.create_dataset('default',
                               'default',
                               "new-ds",
                               storage_type="gigantum_object_v1",
                               description="100")
        m = Manifest(ds, 'default')
        dispatcher_obj = Dispatcher()

        helper_write_big_file(m.cache_mgr.cache_root, m.dataset_revision,
                              "test1.dat", "12")
        helper_write_big_file(m.cache_mgr.cache_root, m.dataset_revision,
                              "test2.dat", "23")
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "zztest3.txt", "fake content 3")
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "zztest4.txt", "fake content 4")
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "zztest5.txt", "fake content 5")
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "zztest6.txt", "fake content 6")
        job_kwargs = {
            'dispatcher': Dispatcher,
            'logged_in_username': "******",
            'logged_in_email': "*****@*****.**",
            'dataset_owner': "default",
            'dataset_name': "new-ds",
            'config_file': mock_config_file_background_tests[0]
        }

        job_metadata = {
            'dataset': f"default|default|new-ds",
            'method': 'complete_dataset_upload_transaction'
        }
        assert len(m.manifest) == 0

        job_key = dispatcher_obj.dispatch_task(
            gtmcore.dispatcher.dataset_jobs.
            complete_dataset_upload_transaction,
            kwargs=job_kwargs,
            metadata=job_metadata)

        time.sleep(3)

        # Remove files to make them fail
        os.remove(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         "zztest4.txt"))
        os.remove(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         "zztest5.txt"))

        cnt = 0
        while cnt < 120:
            job_status = dispatcher_obj.query_task(job_key)

            if job_status.status == 'finished':
                break

            time.sleep(1)
            cnt += 1

        assert cnt < 119

        m = Manifest(ds, 'default')
        assert len(m.manifest) == 4
        assert 'test1.dat' in m.manifest
        assert 'test2.dat' in m.manifest
        assert 'zztest3.txt' in m.manifest
        assert 'zztest6.txt' in m.manifest
        assert 'zztest5.txt' not in m.manifest
        assert 'zztest4.txt' not in m.manifest

        assert job_status.meta['has_failures'] is True
        assert 'The following files failed to hash. Try re-uploading the files again:\nzztest4.txt \nzztest5.txt' ==\
               job_status.meta['failure_detail']
        assert 'An error occurred while processing some files. Check details and re-upload.' == \
               job_status.meta['feedback']
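
The failure test above polls the dispatcher by hand in a while loop and uses the final assert on cnt to fail if the job never finishes. A small, hypothetical helper that captures the same polling pattern might look like this; it is not part of the Gigantum codebase, but it assumes only that query_task(job_key) returns an object with a .status attribute, as the test does.

import time


def wait_for_job(dispatcher, job_key, timeout_seconds=120, poll_interval=1.0):
    """Poll a dispatched background job until it reports 'finished' or the timeout expires.

    Hypothetical helper: assumes dispatcher.query_task(job_key) returns an object
    with a .status attribute, matching how the tests above use the Dispatcher.
    """
    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        job_status = dispatcher.query_task(job_key)
        if job_status.status == 'finished':
            return job_status
        time.sleep(poll_interval)
    raise TimeoutError(f"Job {job_key} did not finish within {timeout_seconds} seconds")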
Beispiel #25
0
    def test_complete_dataset_upload_transaction_all_types(
            self, mock_config_file_background_tests):
        im = InventoryManager(mock_config_file_background_tests[0])
        ds = im.create_dataset('default',
                               'default',
                               "new-ds",
                               storage_type="gigantum_object_v1",
                               description="100")
        m = Manifest(ds, 'default')

        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "test1.txt", "fake content 1")
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "test2.txt", "fake content 2")
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "test3.txt", "fake content 3")

        dl_kwargs = {
            'dispatcher': Dispatcher,
            'logged_in_username': "******",
            'logged_in_email': "*****@*****.**",
            'dataset_owner': "default",
            'dataset_name': "new-ds",
            'config_file': mock_config_file_background_tests[0]
        }

        assert len(m.manifest) == 0
        gtmcore.dispatcher.dataset_jobs.complete_dataset_upload_transaction(
            **dl_kwargs)

        m = Manifest(ds, 'default')

        # make sure manifest got updated
        assert len(m.manifest) == 3
        assert 'test1.txt' in m.manifest
        assert 'test2.txt' in m.manifest
        assert 'test3.txt' in m.manifest
        hash1 = m.manifest['test1.txt']['h']

        # Make sure activity created
        assert len(ds.git.log()) == 6
        assert "_GTM_ACTIVITY_START_" in ds.git.log()[0]['message']
        assert "Uploaded 3 new file(s)." in ds.git.log()[0]['message']

        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "test1.txt", "fake content changed")
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "test4.txt", "fake content 4")
        os.remove(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         "test3.txt"))

        gtmcore.dispatcher.dataset_jobs.complete_dataset_upload_transaction(
            **dl_kwargs)
        m = Manifest(ds, 'default')

        # make sure manifest got updated
        assert len(m.manifest) == 3
        assert 'test1.txt' in m.manifest
        assert 'test2.txt' in m.manifest
        assert 'test4.txt' in m.manifest
        assert hash1 != m.manifest['test1.txt']['h']

        # Make sure activity created
        assert len(ds.git.log()) == 8
        assert "_GTM_ACTIVITY_START_" in ds.git.log()[0]['message']
        assert "Uploaded 1 new file(s). Uploaded 1 modified file(s). 1 deleted file(s)." in ds.git.log(
        )[0]['message']
Beispiel #26
0
    def test_download_dataset_files(self, mock_config_file_background_tests,
                                    mock_dataset_head):
        def dispatch_query_mock(self, job_key):
            JobStatus = namedtuple("JobStatus", ['status', 'meta'])
            return JobStatus(status='finished',
                             meta={'completed_bytes': '500'})

        def dispatch_mock(self, method_reference, kwargs, metadata, persist):
            with aioresponses() as mocked_responses:
                mocked_responses.get(
                    f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_1}',
                    payload={
                        "presigned_url":
                        f"https://dummyurl.com/{obj_id_1}?params=1",
                        "namespace": ds.namespace,
                        "obj_id": obj_id_1,
                        "dataset": ds.name
                    },
                    status=200)

                with open(obj1_source, 'rb') as data1:
                    mocked_responses.get(
                        f"https://dummyurl.com/{obj_id_1}?params=1",
                        body=data1.read(),
                        status=200,
                        content_type='application/octet-stream')
                gtmcore.dispatcher.dataset_jobs.pull_objects(**kwargs)

                return "afakejobkey"

        im = InventoryManager(mock_config_file_background_tests[0])
        ds = im.create_dataset('default',
                               'default',
                               "dataset100",
                               storage_type="gigantum_object_v1",
                               description="100")
        m = Manifest(ds, 'default')
        iom = IOManager(ds, m)

        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "test1.txt", "asdfadfsdf")
        m.sweep_all_changes()

        obj_to_push = iom.objects_to_push()
        assert len(obj_to_push) == 1
        _, obj_id_1 = obj_to_push[0].object_path.rsplit('/', 1)
        obj1_target = obj_to_push[0].object_path

        obj1_source = os.path.join('/tmp', uuid.uuid4().hex)

        assert os.path.exists(obj1_target) is True
        helper_compress_file(obj1_target, obj1_source)
        assert os.path.isfile(obj1_target) is False
        assert os.path.isfile(obj1_source) is True

        # Clear out from linked dir
        os.remove(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'test1.txt'))

        with patch.object(Configuration, 'find_default_config',
                          lambda self: mock_config_file_background_tests[0]):
            with patch.object(Dispatcher, 'dispatch_task', dispatch_mock):
                with patch.object(Dispatcher, 'query_task',
                                  dispatch_query_mock):
                    dl_kwargs = {
                        'logged_in_username': "******",
                        'access_token': "asdf",
                        'id_token': "1234",
                        'dataset_owner': "default",
                        'dataset_name': "dataset100",
                        'labbook_owner': None,
                        'labbook_name': None,
                        'keys': ["test1.txt"],
                        'config_file': mock_config_file_background_tests[0]
                    }

                    gtmcore.dispatcher.dataset_jobs.download_dataset_files(
                        **dl_kwargs)
                    assert os.path.isfile(obj1_target) is True

                    decompressor = snappy.StreamDecompressor()
                    with open(obj1_source, 'rb') as dd:
                        source1 = decompressor.decompress(dd.read())
                        source1 += decompressor.flush()
                    with open(obj1_target, 'rt') as dd:
                        dest1 = dd.read()
                    assert source1.decode("utf-8") == dest1
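
helper_compress_file stands in for the remote object store in the download tests: after it runs, the original object file is gone and the destination can be read back with snappy.StreamDecompressor, as the asserts above show. A hedged sketch consistent with that behavior (not the actual test helper):

import os
import snappy


def helper_compress_file(source_path, destination_path):
    """Snappy stream-compress source_path into destination_path, then delete the source.

    Sketch only: behavior inferred from the asserts in the test above.
    """
    compressor = snappy.StreamCompressor()
    with open(source_path, 'rb') as src, open(destination_path, 'wb') as dst:
        while True:
            block = src.read(65536)
            if not block:
                break
            dst.write(compressor.add_chunk(block))
        dst.write(compressor.flush())
    os.remove(source_path)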
Beispiel #27
0
    def test_download_dataset_files_file_fail(
            self, mock_config_file_background_tests):
        def dispatch_query_mock(self, job_key):
            # mock the job actually running and returning status
            JobStatus = namedtuple("JobStatus", ['status', 'meta'])
            return JobStatus(status='finished',
                             meta={
                                 'completed_bytes': '0',
                                 'failure_keys': 'test1.txt'
                             })

        def dispatch_mock(self, method_reference, kwargs, metadata, persist):
            gtmcore.dispatcher.dataset_jobs.pull_objects(**kwargs)
            return "afakejobkey"

        im = InventoryManager(mock_config_file_background_tests[0])
        ds = im.create_dataset('default',
                               'default',
                               "dataset100",
                               storage_type="gigantum_object_v1",
                               description="100")
        m = Manifest(ds, 'default')
        iom = IOManager(ds, m)

        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "test1.txt", "asdfadfsdf")
        m.sweep_all_changes()

        obj_to_push = iom.objects_to_push()
        assert len(obj_to_push) == 1
        _, obj_id_1 = obj_to_push[0].object_path.rsplit('/', 1)
        obj1_target = obj_to_push[0].object_path

        obj1_source = os.path.join('/tmp', uuid.uuid4().hex)

        assert os.path.exists(obj1_target) is True
        helper_compress_file(obj1_target, obj1_source)
        assert os.path.isfile(obj1_target) is False
        assert os.path.isfile(obj1_source) is True

        # Clear out from linked dir
        os.remove(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'test1.txt'))

        with patch.object(Configuration, 'find_default_config',
                          lambda self: mock_config_file_background_tests[0]):
            with patch.object(Dispatcher, 'dispatch_task', dispatch_mock):
                with patch.object(Dispatcher, 'query_task',
                                  dispatch_query_mock):
                    dl_kwargs = {
                        'logged_in_username': "******",
                        'access_token': "asdf",
                        'id_token': "1234",
                        'dataset_owner': "default",
                        'dataset_name': "dataset100",
                        'labbook_owner': None,
                        'labbook_name': None,
                        'keys': ["test1.txt"],
                        'config_file': mock_config_file_background_tests[0]
                    }

                    with pytest.raises(IOError):
                        gtmcore.dispatcher.dataset_jobs.download_dataset_files(
                            **dl_kwargs)
                    assert os.path.isfile(obj1_target) is False
Beispiel #28
0
    def test_add_file(self, mock_create_dataset):
        """Test adding a new file to a labbook"""
        class DummyContext(object):
            def __init__(self, file_handle):
                self.dataset_loader = None
                self.files = {'uploadChunk': file_handle}

        client = Client(mock_create_dataset[3],
                        middleware=[DataloaderMiddleware()])

        # Create file to upload
        test_file = os.path.join(tempfile.gettempdir(), "myValidFile.dat")
        est_size = 9000000
        try:
            os.remove(test_file)
        except OSError:
            pass
        with open(test_file, 'wb') as tf:
            tf.write(os.urandom(est_size))

        new_file_size = os.path.getsize(tf.name)
        # Get upload params
        chunk_size = 4194000
        file_info = os.stat(test_file)
        file_size = int(file_info.st_size / 1000)
        total_chunks = int(math.ceil(file_info.st_size / chunk_size))

        ds = InventoryManager(mock_create_dataset[0]).load_dataset(
            'default', 'default', 'dataset1')

        fsc = HostFilesystemCache(ds, 'default')
        target_file = os.path.join(fsc.current_revision_dir, "myValidFile.dat")

        txid = "000-unitest-transaction"
        with open(test_file, 'rb') as tf:
            # Check for file to exist (shouldn't yet)
            assert os.path.exists(target_file) is False
            for chunk_index in range(total_chunks):
                # Upload a chunk
                chunk = io.BytesIO()
                chunk.write(tf.read(chunk_size))
                chunk.seek(0)
                file = FileStorage(chunk)

                query = f"""
                            mutation addDatasetFile{{
                              addDatasetFile(input:{{owner:"default",
                                                      datasetName: "dataset1",
                                                      filePath: "myValidFile.dat",
                                                      transactionId: "{txid}",
                                chunkUploadParams:{{
                                  uploadId: "fdsfdsfdsfdfs",
                                  chunkSize: {chunk_size},
                                  totalChunks: {total_chunks},
                                  chunkIndex: {chunk_index},
                                  fileSizeKb: {file_size},
                                  filename: "{os.path.basename(test_file)}"
                                }}
                              }}) {{
                                      newDatasetFileEdge {{
                                        node{{
                                          id
                                          key
                                          isDir
                                          size
                                        }}
                                      }}
                                    }}
                            }}
                            """
                r = client.execute(query, context_value=DummyContext(file))
        assert 'errors' not in r

        # These fields are only populated once the last chunk is uploaded; they are None otherwise.
        assert r['data']['addDatasetFile']['newDatasetFileEdge']['node'][
            'isDir'] is False
        assert r['data']['addDatasetFile']['newDatasetFileEdge']['node'][
            'key'] == 'myValidFile.dat'
        assert r['data']['addDatasetFile']['newDatasetFileEdge']['node'][
            'size'] == f"{new_file_size}"
        # When done uploading, the file should exist in the dataset's current revision directory
        assert os.path.exists(target_file)
        assert os.path.isfile(target_file)

        complete_query = f"""
        mutation completeQuery {{
            completeDatasetUploadTransaction(input: {{
                owner: "default",
                datasetName: "dataset1",
                transactionId: "{txid}"
            }}) {{
                success
            }}
        }}
        """
        r = client.execute(complete_query, context_value=DummyContext(file))
        assert 'errors' not in r

        m = Manifest(ds, 'default')
        status = m.status()
        assert len(status.created) == 0
        assert len(status.modified) == 0
        assert len(status.deleted) == 0

        assert 'Uploaded 1 new file(s)' in ds.git.log()[0]['message']
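
Every chunk sent to the addDatasetFile mutation above must carry the same chunkUploadParams values (chunkSize, totalChunks, fileSizeKb, filename). A small hypothetical helper mirroring the arithmetic used in the test:

import math
import os


def chunk_upload_params(file_path, chunk_size=4194000):
    """Compute the per-file values used by the addDatasetFile chunkUploadParams block.

    Hypothetical helper: names and arithmetic mirror the test above, not a library API.
    """
    file_size_bytes = os.stat(file_path).st_size
    return {
        'chunkSize': chunk_size,
        'totalChunks': int(math.ceil(file_size_bytes / chunk_size)),
        'fileSizeKb': int(file_size_bytes / 1000),
        'filename': os.path.basename(file_path),
    }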
Beispiel #29
0
    def test_make_directory(self, fixture_working_dir, snapshot):
        im = InventoryManager(fixture_working_dir[0])
        ds = im.create_dataset('default',
                               'default',
                               "dataset-dir",
                               storage_type="gigantum_object_v1",
                               description="testing move")
        m = Manifest(ds, 'default')
        m.link_revision()

        query = """
                   mutation myMutation {
                     makeDatasetDirectory(input: {datasetOwner: "default", datasetName: "dataset-dir", 
                                             key: "test_dir1/"}) {
                         newDatasetFileEdge {
                            node {
                              id
                              key
                              isDir
                              isLocal
                              size
                            }
                         }
                     }
                   }
                   """
        result = fixture_working_dir[2].execute(query)
        assert 'errors' not in result
        assert result['data']['makeDatasetDirectory']['newDatasetFileEdge'][
            'node']['key'] == 'test_dir1/'
        assert result['data']['makeDatasetDirectory']['newDatasetFileEdge'][
            'node']['isDir'] is True
        assert result['data']['makeDatasetDirectory']['newDatasetFileEdge'][
            'node']['isLocal'] is True
        assert result['data']['makeDatasetDirectory']['newDatasetFileEdge'][
            'node']['size'] == '0'

        assert os.path.isdir(
            os.path.join(m.cache_mgr.current_revision_dir,
                         "test_dir1")) is True

        query = """
                   mutation myMutation {
                     makeDatasetDirectory(input: {datasetOwner: "default", datasetName: "dataset-dir", 
                                             key: "test_dir1/test_dir2/"}) {
                         newDatasetFileEdge {
                            node {
                              id
                              key
                              isDir
                              isLocal
                              size
                            }
                         }
                     }
                   }
                   """
        result = fixture_working_dir[2].execute(query)
        assert 'errors' not in result
        assert result['data']['makeDatasetDirectory']['newDatasetFileEdge'][
            'node']['key'] == 'test_dir1/test_dir2/'
        assert result['data']['makeDatasetDirectory']['newDatasetFileEdge'][
            'node']['isDir'] is True
        assert result['data']['makeDatasetDirectory']['newDatasetFileEdge'][
            'node']['isLocal'] is True
        assert result['data']['makeDatasetDirectory']['newDatasetFileEdge'][
            'node']['size'] == '0'

        assert os.path.isdir(
            os.path.join(m.cache_mgr.current_revision_dir,
                         "test_dir1")) is True
        assert os.path.isdir(
            os.path.join(m.cache_mgr.current_revision_dir, "test_dir1",
                         "test_dir2")) is True
Beispiel #30
0
    def _push_dataset_objects(self, logged_in_username: str,
                              feedback_callback: Callable, access_token,
                              id_token) -> None:
        """Method to schedule a push operta

        Args:
            logged_in_username:
            feedback_callback:
            access_token:
            id_token:

        Returns:

        """
        dispatcher_obj = Dispatcher()

        try:
            self.dataset.backend.set_default_configuration(
                logged_in_username, access_token, id_token)
            m = Manifest(self.dataset, logged_in_username)
            iom = IOManager(self.dataset, m)

            obj_batches, total_bytes, num_files = iom.compute_push_batches()

            if obj_batches:
                # Schedule jobs for batches
                bg_jobs = list()
                for objs in obj_batches:
                    job_kwargs = {
                        'objs': objs,
                        'logged_in_username': logged_in_username,
                        'access_token': access_token,
                        'id_token': id_token,
                        'dataset_owner': self.dataset.namespace,
                        'dataset_name': self.dataset.name,
                        'config_file': self.dataset.client_config.config_file,
                    }
                    job_metadata = {
                        'dataset':
                        f"{logged_in_username}|{self.dataset.namespace}|{self.dataset.name}",
                        'method': 'push_dataset_objects'
                    }

                    feedback_callback(
                        f"Preparing to upload {num_files} files. Please wait..."
                    )
                    job_key = dispatcher_obj.dispatch_task(
                        method_reference=gtmcore.dispatcher.dataset_jobs.
                        push_dataset_objects,
                        kwargs=job_kwargs,
                        metadata=job_metadata,
                        persist=True)
                    bg_jobs.append(
                        BackgroundUploadJob(dispatcher_obj, objs, job_key))
                    logger.info(
                        f"Schedule dataset object upload job for"
                        f" {logged_in_username}/{self.dataset.namespace}/{self.dataset.name} with"
                        f" {len(objs)} objects to upload")

                while sum([(x.is_complete or x.is_failed)
                           for x in bg_jobs]) != len(bg_jobs):
                    # Refresh all job statuses and update status feedback
                    [j.refresh_status() for j in bg_jobs]
                    total_completed_bytes = sum(
                        [j.completed_bytes for j in bg_jobs])
                    if total_completed_bytes > 0:
                        pc = (float(total_completed_bytes) /
                              float(total_bytes)) * 100
                        feedback_callback(
                            f"Please wait - Uploading {num_files} files ({format_size(total_completed_bytes)}"
                            f" of {format_size(total_bytes)}) - {round(pc)}% complete",
                            percent_complete=pc)
                    time.sleep(1)

                # if you get here, all jobs are done or failed.
                # Remove all the push files so they can be regenerated if needed
                for f in glob.glob(f'{iom.push_dir}/*'):
                    os.remove(f)

                # Aggregate failures if they exist
                failure_keys: List[str] = list()
                for j in bg_jobs:
                    if j.is_failed:
                        # Background job hard failed. Assume entire batch should get re-uploaded
                        for obj in j.objs:
                            failure_keys.append(
                                f"{obj.dataset_path} at {obj.revision[0:8]}")
                            m.queue_to_push(obj.object_path, obj.dataset_path,
                                            obj.revision)
                    else:
                        for obj in j.get_failed_objects():
                            # Some individual objects failed
                            failure_keys.append(
                                f"{obj.dataset_path} at {obj.revision[0:8]}")
                            m.queue_to_push(obj.object_path, obj.dataset_path,
                                            obj.revision)

                # Set final status for UI
                if len(failure_keys) == 0:
                    feedback_callback(f"Upload complete!",
                                      percent_complete=100,
                                      has_failures=False)
                else:
                    failure_str = "\n".join(failure_keys)
                    failure_detail_str = f"Files that failed to upload:\n{failure_str}"
                    feedback_callback("",
                                      percent_complete=100,
                                      has_failures=True,
                                      failure_detail=failure_detail_str)

                # Finish up by linking everything just in case
                iom.manifest.link_revision()

                if len(failure_keys) > 0:
                    # If any uploads failed, raise so the UI knows there was an error
                    raise IOError(
                        f"{len(failure_keys)} file(s) failed to upload. Check message detail for more information"
                        " and try to sync again.")
        except Exception as err:
            logger.exception(err)
            raise
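
_push_dataset_objects reports progress with format_size, which is imported elsewhere in the module (plausibly a humanfriendly-style byte formatter, though that is an assumption). To run a snippet like this standalone, a minimal stand-in with the same call shape could be:

def format_size(num_bytes):
    """Render a byte count as a human-readable string, e.g. 1536 -> '1.5 KB'.

    Stand-in sketch only; the formatter actually used by the upload code may differ.
    """
    size = float(num_bytes)
    for unit in ('bytes', 'KB', 'MB', 'GB', 'TB'):
        if size < 1024.0 or unit == 'TB':
            return f"{int(size)} {unit}" if unit == 'bytes' else f"{size:.1f} {unit}"
        size /= 1024.0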