Example #1
    def test_pattern_1(self):
        self._test_pattern(["*"], [
            MetadataPath(".datalad_metadata"),
            MetadataPath("s1"),
            MetadataPath("s2"),
            MetadataPath("d3")
        ])
Example #2
def get_path_info(
    dataset: Dataset,
    element_path: Optional[Path],
    into_dataset_path: Optional[Path] = None
) -> Tuple[MetadataPath, MetadataPath]:
    """
    Determine the dataset tree path and the file tree path.

    If the path is absolute, we can determine the containing dataset
    and the metadatasets around it. If the path is not an element of
    a locally known dataset, we signal an error.

    If the path is relative, we convert it to an absolute path
    by appending it to the dataset path or the current directory
    and perform the above check.
    """
    full_dataset_path = Path(dataset.path).resolve()
    if into_dataset_path is None:
        dataset_tree_path = MetadataPath("")
    else:
        full_into_dataset_path = into_dataset_path.resolve()
        dataset_tree_path = MetadataPath(
            full_dataset_path.relative_to(full_into_dataset_path))

    if element_path is None:
        return dataset_tree_path, MetadataPath("")

    if element_path.is_absolute():
        full_file_path = element_path
    else:
        full_file_path = full_dataset_path / element_path

    file_tree_path = full_file_path.relative_to(full_dataset_path)

    return dataset_tree_path, MetadataPath(file_tree_path)
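The resolution rules described in the docstring rest purely on pathlib semantics; here is a minimal standalone sketch with made-up paths (no DataLad required):

from pathlib import Path

# Made-up locations, for illustration only
full_dataset_path = Path("/data/super/sub")
element_path = Path("docs/readme.md")        # a relative element path

# Relative element paths are anchored at the dataset root
full_file_path = (element_path
                  if element_path.is_absolute()
                  else full_dataset_path / element_path)

# The file tree path is always expressed relative to the dataset
print(full_file_path.relative_to(full_dataset_path))        # docs/readme.md

# A dataset nested in a super-dataset yields its dataset tree path
print(full_dataset_path.relative_to(Path("/data/super")))   # sub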
Example #3
    def test_tree_version(self):
        parser = MetadataURLParser("tree:/a/b/c@00112233:/x/y")
        result = parser.parse()
        self.assertIsInstance(result, TreeMetadataURL)
        self.assertEqual(result.version, "00112233")
        self.assertEqual(result.dataset_path, MetadataPath("/a/b/c"))
        self.assertEqual(result.local_path, MetadataPath("/x/y"))
Example #4
    def test_relative_path(self):
        parser = MetadataURLParser(":a/b/c")
        result = parser.parse()
        self.assertIsInstance(result, TreeMetadataURL)
        self.assertIsNone(result.version)
        self.assertEqual(result.dataset_path, MetadataPath(""))
        self.assertEqual(result.local_path, MetadataPath("a/b/c"))
Example #5
    def test_empty_paths(self):
        parser = MetadataURLParser("tree:@00112233")
        result = parser.parse()
        self.assertIsInstance(result, TreeMetadataURL)
        self.assertEqual(result.version, "00112233")
        self.assertEqual(result.dataset_path, MetadataPath(""))
        self.assertEqual(result.local_path, MetadataPath(""))
Example #6
    def test_auto_list_root_on(self):
        found, failed = self.tree_search.get_matching_paths(
            [""], False, auto_list_root=True)

        self.assertPathsInResult(found, [
            MetadataPath(".datalad_metadata"),
            MetadataPath("s1"),
            MetadataPath("s2"),
            MetadataPath("d3")
        ])

        self.assertListEqual(failed, [])
Example #7
def dump_from_dataset_tree(mapper: str, metadata_store: str,
                           tree_version_list: TreeVersionList,
                           metadata_url: TreeMetadataURL,
                           recursive: bool) -> Generator[dict, None, None]:
    """ Dump dataset tree elements that are referenced in path """

    # Normalize path representation
    if not metadata_url or metadata_url.dataset_path is None:
        metadata_url = TreeMetadataURL(MetadataPath(""), MetadataPath(""))

    # Get specified version, if none is specified, take the first from the
    # tree version list.
    requested_root_dataset_version = metadata_url.version
    if requested_root_dataset_version is None:
        # TODO: add an item() method to VersionList
        requested_root_dataset_version = tuple(
            tree_version_list.versions())[0]

    # Fetch dataset tree for the specified version
    time_stamp, dataset_tree = tree_version_list.get_dataset_tree(
        requested_root_dataset_version)
    root_mrr = dataset_tree.get_metadata_root_record(MetadataPath(""))
    root_dataset_version = root_mrr.dataset_version
    root_dataset_identifier = root_mrr.dataset_identifier

    # Create a tree search object to search for the specified datasets
    tree_search = TreeSearch(dataset_tree)
    matches, not_found_paths = tree_search.get_matching_paths(
        [str(metadata_url.dataset_path)], recursive, auto_list_root=False)

    for missing_path in not_found_paths:
        lgr.error(f"could not locate metadata for dataset path {missing_path} "
                  f"in tree version {metadata_url.version} in "
                  f"metadata_store {mapper}:{metadata_store}")

    for match_record in matches:
        yield from show_dataset_metadata(mapper, metadata_store,
                                         root_dataset_identifier,
                                         root_dataset_version,
                                         match_record.path,
                                         match_record.node.value)

        yield from show_file_tree_metadata(mapper, metadata_store,
                                           root_dataset_identifier,
                                           root_dataset_version,
                                           MetadataPath(match_record.path),
                                           match_record.node.value,
                                           str(metadata_url.local_path),
                                           recursive)

    return
Example #8
    def get_matching_paths(
        self,
        pattern_list: List[str],
        recursive: bool,
        auto_list_root: bool = True
    ) -> Tuple[List[MatchRecord], List[MetadataPath]]:
        """
        Get all metadata paths that are matching the patterns in
        pattern_list.

        - Leading "/" are removed from patterns, since metadata
          paths are not absolute.

        - Empty pattern-specifications, i.e. '', are interpreted
          as root-dataset or root-file-tree nodes.
        """
        pattern_elements_list = [
            MetadataPath(pattern) for pattern in set(pattern_list)
        ]
        matching, failed = self._get_matching_nodes(pattern_elements_list,
                                                    auto_list_root)

        if recursive:
            matching = self._list_recursive(matching[:])
        return matching, failed
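The matching semantics above can be reproduced in isolation with fnmatchcase over a nested dict standing in for the file tree; a rough sketch, with tree contents made up to mirror the test fixture used in these examples:

from fnmatch import fnmatchcase

# Made-up tree, shaped like the test fixture in these examples
tree = {"s1": {"s1.1": {}, "s1.2": {}}, "s2": {"d2.1": {}}, "d3": {}}

def match(parts, node, prefix=()):
    # An exhausted pattern matches the node reached so far
    if not parts:
        return ["/".join(prefix)]
    return [record
            for name, child in node.items()
            if fnmatchcase(name, parts[0])
            for record in match(parts[1:], child, prefix + (name,))]

# Leading "/" is stripped, mirroring get_matching_paths
print(match("/s*/*".lstrip("/").split("/"), tree))
# ['s1/s1.1', 's1/s1.2', 's2/d2.1']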
Example #9
def test_add_file_end_to_end(file_name):

    test_path = "d_0/d_0.0/f_0.0.0"

    with open(file_name, "tw") as f:
        json.dump({
            **metadata_template,
            "type": "file",
            "path": test_path
        }, f)

    with tempfile.TemporaryDirectory() as temp_dir:
        git_repo = GitRepo(temp_dir)

        res = meta_add(metadata=file_name, metadata_store=git_repo.path)
        assert_result_count(res, 1)
        assert_result_count(res, 1, type='file')
        assert_result_count(res, 0, type='dataset')

        # Verify file level metadata was added
        tree_version_list, uuid_set, mrr = _get_top_nodes(
            git_repo,
            UUID(metadata_template["dataset_id"]),
            metadata_template["dataset_version"])

        file_tree = mrr.get_file_tree()
        assert_is_not_none(file_tree)
        assert_true(test_path in file_tree)

        metadata = file_tree.get_metadata(MetadataPath(test_path))
        metadata_content = _get_metadata_content(metadata)
        eq_(metadata_content, metadata_template["extracted_metadata"])
Example #10
def get_file_info(dataset: Dataset, file_path: MetadataPath) -> FileInfo:
    """
    Get information about the file in the dataset or
    None, if the file is not part of the dataset.
    """

    # Convert the metadata file-path into a system file path
    path = Path(file_path)
    try:
        relative_path = path.relative_to(dataset.pathobj)
    except ValueError:
        relative_path = path

    path = dataset.pathobj / relative_path

    path_status = (list(dataset.status(path, result_renderer="disabled"))
                   or [None])[0]

    if path_status is None:
        raise FileNotFoundError("file not found: {}".format(path))

    if path_status["state"] == "untracked":
        raise ValueError("file not tracked: {}".format(path))

    # noinspection PyUnresolvedReferences
    return FileInfo(
        type=path_status["type"],
        git_sha_sum=path_status["gitshasum"],
        byte_size=path_status.get("bytesize", 0),
        state=path_status["state"],
        path=path_status["path"],  # TODO: use the dataset-tree path here?
        intra_dataset_path=str(
            MetadataPath(*PurePath(path_status["path"]).relative_to(
                dataset.pathobj).parts)))
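The intra_dataset_path computation at the end can be tried in isolation; a small sketch with hypothetical values for dataset.pathobj and the reported status path:

from pathlib import PurePath

dataset_root = PurePath("/data/ds")            # hypothetical dataset.pathobj
status_path = "/data/ds/sub/dir/file.txt"      # hypothetical status "path" value

parts = PurePath(status_path).relative_to(dataset_root).parts
print("/".join(parts))                         # sub/dir/file.txt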
Example #11
    def test_auto_list_root_off(self):
        """ Expect a single root record for non-autolist root search """
        found, failed = self.tree_search.get_matching_paths(
            [""], False, auto_list_root=False)

        self.assertListEqual(
            found, [MatchRecord(MetadataPath(""), self.tree_search.tree)])
        self.assertListEqual(failed, [])
Example #12
    def test_uuid_empty(self):
        parser = MetadataURLParser("uuid:00112233-0011-2233-4455-66778899aabb")
        result = parser.parse()
        self.assertIsInstance(result, UUIDMetadataURL)
        self.assertIsNone(result.version)
        self.assertEqual(result.uuid,
                         UUID("00112233-0011-2233-4455-66778899aabb"))
        self.assertEqual(result.local_path, MetadataPath(""))
Example #13
    def __call__(metadata: Union[str, JSONObject],
                 metadata_store: Optional[str] = None,
                 additionalvalues: Optional[Union[str, JSONObject]] = None,
                 allow_override: bool = False,
                 allow_unknown: bool = False):

        additionalvalues = additionalvalues or dict()
        metadata_store = Path(metadata_store or curdir)

        metadata = process_parameters(
            metadata=read_json_object(metadata),
            additional_values=get_json_object(additionalvalues),
            allow_override=allow_override,
            allow_unknown=allow_unknown)

        lgr.debug(f"attempting to add metadata: {json.dumps(metadata)}")

        add_parameter = AddParameter(
            dataset_id=UUID(metadata["dataset_id"]),
            dataset_version=metadata["dataset_version"],
            file_path=(MetadataPath(metadata["path"])
                       if "path" in metadata else None),
            root_dataset_id=(UUID(metadata["root_dataset_id"])
                             if "root_dataset_id" in metadata else None),
            root_dataset_version=metadata.get("root_dataset_version", None),
            dataset_path=MetadataPath(metadata.get("dataset_path", "")),
            extractor_name=metadata["extractor_name"],
            extractor_version=metadata["extractor_version"],
            extraction_time=metadata["extraction_time"],
            extraction_parameter=metadata["extraction_parameter"],
            agent_name=metadata["agent_name"],
            agent_email=metadata["agent_email"],
            extracted_metadata=metadata["extracted_metadata"])

        # If the key "path" is present in the metadata
        # dictionary, we assume that the metadata-dictionary describes
        # file-level metadata. Otherwise, we assume that the
        # metadata-dictionary contains dataset-level metadata.
        if add_parameter.file_path:
            yield from add_file_metadata(metadata_store, add_parameter)
        else:
            yield from add_dataset_metadata(metadata_store, add_parameter)
        return
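The file-versus-dataset decision described in the comment above depends only on the presence of a "path" key; a tiny illustration with made-up records:

# Made-up metadata records, for illustration only
records = [
    {"type": "file", "path": "d_0/f_0.0.0"},   # file-level: has "path"
    {"type": "dataset"},                       # dataset-level: no "path"
]

for record in records:
    level = "file" if "path" in record else "dataset"
    print(level)    # file, then dataset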
Example #14
def test_subdataset_add_file_end_to_end(file_name):

    test_path = "d_1/d_1.0/f_1.0.0"

    with open(file_name, "tw") as f:
        json.dump({
            **metadata_template,
            **additional_keys_template,
            "type": "file",
            "path": test_path
        }, f)

    with tempfile.TemporaryDirectory() as temp_dir:
        git_repo = GitRepo(temp_dir)

        res = meta_add(metadata=file_name, metadata_store=git_repo.path)
        assert_result_count(res, 1)
        assert_result_count(res, 1, type='file')
        assert_result_count(res, 0, type='dataset')

        # Verify dataset level metadata was added
        root_dataset_id = UUID(additional_keys_template["root_dataset_id"])
        root_dataset_version = additional_keys_template["root_dataset_version"]
        dataset_tree_path = MetadataPath(
            additional_keys_template["dataset_path"])

        tree_version_list, uuid_set, mrr = _get_top_nodes(
            git_repo,
            root_dataset_id,
            root_dataset_version)

        _, dataset_tree = tree_version_list.get_dataset_tree(
            root_dataset_version)

        mrr = dataset_tree.get_metadata_root_record(dataset_tree_path)
        eq_(mrr.dataset_identifier, UUID(metadata_template["dataset_id"]))

        file_tree = mrr.get_file_tree()
        assert_is_not_none(file_tree)
        assert_true(test_path in file_tree)

        metadata = file_tree.get_metadata(MetadataPath(test_path))
        metadata_content = _get_metadata_content(metadata)
        eq_(metadata_content, metadata_template["extracted_metadata"])
Example #15
    def _search_matches(self, pattern_parts: Tuple[str, ...], tree: FileTree,
                        accumulated_path: MetadataPath) -> List[MatchRecord]:

        if not pattern_parts:
            return [MatchRecord(MetadataPath(accumulated_path), tree)]

        match_records = []
        for name, sub_tree in tree.child_nodes.items():
            if fnmatchcase(name, pattern_parts[0]):
                match_records.extend(
                    self._search_matches(pattern_parts[1:], sub_tree,
                                         accumulated_path / name))

        return match_records
Example #16
def _get_top_nodes(realm: str, ap: AddParameter):

    if ap.root_dataset_id is None:
        return get_top_nodes_and_metadata_root_record(default_mapper_family,
                                                      realm,
                                                      ap.dataset_id,
                                                      ap.dataset_version,
                                                      MetadataPath(""),
                                                      auto_create=True)

    tree_version_list, uuid_set, mrr = get_top_nodes_and_metadata_root_record(
        default_mapper_family,
        realm,
        ap.root_dataset_id,
        ap.root_dataset_version,
        MetadataPath(""),
        auto_create=True)

    _, dataset_tree = tree_version_list.get_dataset_tree(
        ap.root_dataset_version)

    if ap.dataset_path != MetadataPath("") and ap.dataset_path in dataset_tree:
        mrr = dataset_tree.get_metadata_root_record(ap.dataset_path)
        if mrr.dataset_identifier != ap.dataset_id:
            raise ValueError(
                f"add-metadata claims that the metadata store contains dataset "
                f"id {ap.dataset_id} at path {ap.dataset_path}, but the "
                f"id of the stored dataset is {mrr.dataset_identifier}")
    else:
        dataset_level_metadata = Metadata(default_mapper_family, realm)
        file_tree = FileTree(default_mapper_family, realm)
        mrr = MetadataRootRecord(default_mapper_family, realm, ap.dataset_id,
                                 ap.dataset_version,
                                 Connector.from_object(dataset_level_metadata),
                                 Connector.from_object(file_tree))
        dataset_tree.add_dataset(ap.dataset_path, mrr)
    return tree_version_list, uuid_set, mrr
Example #17
    def parse(self):
        """
        Parse a metadata URL. It can either be a UUID spec or a tree
        spec. If no scheme is provided, a tree spec is assumed. Note
        that if the dataset_path is empty, the root dataset is assumed
        and the primary data version of the youngest metadata record
        will be chosen.

        UUID:   "uuid:" UUID-DIGITS ["@" VERSION-DIGITS] [":" [LOCAL_PATH]]
        TREE:   ["tree:"] [DATASET_PATH] ["@" VERSION-DIGITS] [":" [LOCAL_PATH]]
        """

        # Try to parse a uuid-spec
        if self.match(MetadataURLParser.uuid_header):
            uuid = UUID(self.fetch(MetadataURLParser.uuid_string_length))
            _, version = self.parse_version()
            _, local_path = self.get_path()
            return UUIDMetadataURL(uuid, local_path, version)

        # Expect a tree spec
        self.match(self.tree_header)

        success, dataset_path = self.fetch_upto("@")
        if success:
            dataset_path = MetadataPath(dataset_path)
            _, version = self.parse_version()
            self.match(":")
            local_path = MetadataPath(self.get_remaining())
        else:
            version = None
            success, dataset_path = self.fetch_upto(":")
            if success:
                dataset_path = MetadataPath(dataset_path)
                _, local_path = self.get_path()
            else:
                dataset_path = MetadataPath(self.get_remaining())
                local_path = MetadataPath("")
        return TreeMetadataURL(dataset_path, local_path, version)
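The TREE branch of the grammar can be illustrated with plain string operations; a simplified sketch that ignores the optional "tree:" scheme and assumes well-formed input:

def split_tree_spec(spec: str):
    # [DATASET_PATH] ["@" VERSION-DIGITS] [":" [LOCAL_PATH]]
    dataset_path, at, rest = spec.partition("@")
    if at:
        version, _, local_path = rest.partition(":")
    else:
        version = None
        dataset_path, _, local_path = spec.partition(":")
    return dataset_path, version, local_path

print(split_tree_spec("/a/b/c@00112233:/x/y"))  # ('/a/b/c', '00112233', '/x/y')
print(split_tree_spec(":a/b/c"))                # ('', None, 'a/b/c')
print(split_tree_spec("a/b/c"))                 # ('a/b/c', None, '')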
Example #18
def _get_top_nodes(git_repo, dataset_id, dataset_version):
    # Ensure that metadata was created
    tree_version_list, uuid_set, mrr = \
        get_top_nodes_and_metadata_root_record(
            "git",
            git_repo.path,
            dataset_id,
            dataset_version,
            MetadataPath(""))

    assert_is_not_none(tree_version_list)
    assert_is_not_none(uuid_set)
    assert_is_not_none(mrr)

    return tree_version_list, uuid_set, mrr
Example #19
    def setUp(self) -> None:
        self.path_list = [
            MetadataPath(".datalad_metadata"),
            MetadataPath("s1/s1.1/d1.1.1/.datalad_metadata"),
            MetadataPath("s1/s1.2/d1.2.1/.datalad_metadata"),
            MetadataPath("s2/d2.1/.datalad_metadata"),
            MetadataPath("d3/.datalad_metadata"),
            MetadataPath("d3/some_file")
        ]
        self.tree_search = self.create_tree_search_from_paths(self.path_list)
Example #20
def _check_metadata_record(metadata_record: dict,
                           dataset: Dataset,
                           extractor_name: str,
                           extractor_version: str,
                           extraction_parameter: dict,
                           path: Optional[str] = None):

    assert_in("extraction_time", metadata_record)
    eq_(metadata_record["dataset_id"], UUID(dataset.id))
    eq_(metadata_record["dataset_version"], dataset.repo.get_hexsha())
    eq_(metadata_record["extractor_version"], extractor_version)
    eq_(metadata_record["extractor_name"], extractor_name)
    eq_(metadata_record["extraction_parameter"], extraction_parameter)
    eq_(metadata_record["agent_name"], "DataLad Tester")
    eq_(metadata_record["agent_email"], "*****@*****.**")
    if path is not None:
        eq_(metadata_record["path"], MetadataPath(path))
Example #21
    def _get_matching_nodes(
            self, pattern_list: List[MetadataPath], auto_list_root: bool
    ) -> Tuple[List[MatchRecord], List[MetadataPath]]:

        match_records: List[MatchRecord] = []
        failed_patterns: List[MetadataPath] = []

        for pattern in pattern_list:
            if pattern.parts == ():
                match_records.extend(self._get_root_nodes(auto_list_root))

            else:
                matching_path_records = self._search_matches(
                    pattern.parts, self.tree, MetadataPath(""))

                if matching_path_records:
                    match_records.extend(matching_path_records)
                else:
                    failed_patterns.append(pattern)

        return match_records, failed_patterns
Example #22
    def test_pattern_3(self):
        self._test_pattern(["s*/*"], [
            MetadataPath("s1/s1.1"),
            MetadataPath("s1/s1.2"),
            MetadataPath("s2/d2.1")
        ])
Example #23
    def _get_root_nodes(self, auto_list_root: bool) -> List[MatchRecord]:
        if auto_list_root:
            return [MatchRecord(MetadataPath(name), child_node)
                    for name, child_node in self.tree.child_nodes.items()]
        return [MatchRecord(MetadataPath(""), self.tree)]
Example #24
    def test_pattern_4(self):
        self._test_pattern(["d3/*"], [
            MetadataPath("d3/.datalad_metadata"),
            MetadataPath("d3/some_file")
        ])
Example #25
    def test_blank_path(self):
        parser = MetadataURLParser("a/b/c")
        result = parser.parse()
        self.assertIsInstance(result, TreeMetadataURL)
        self.assertEqual(result.dataset_path, MetadataPath("a/b/c"))
        self.assertEqual(result.local_path, MetadataPath(""))
Example #26
    def test_pattern_5(self):
        self._test_pattern(["*/s*"], [
            MetadataPath("s1/s1.1"),
            MetadataPath("s1/s1.2"),
            MetadataPath("d3/some_file")
        ])
Example #27
    def test_pattern_7(self):
        found, failed = self.tree_search.get_matching_paths(["see"], False)
        self.assertListEqual(found, [])
        self.assertListEqual(failed, [MetadataPath("see")])
Example #28
    def test_recursive_list_2(self):
        self._test_pattern_rec(["d3"], [
            MetadataPath("d3/.datalad_metadata"),
            MetadataPath("d3/some_file")
        ])
Example #29
def legacy_extract_file(ep: ExtractionParameter) -> Iterable[dict]:

    if issubclass(ep.extractor_class, MetadataExtractor):

        # Metalad legacy extractor
        status = [{
            "type": "file",
            "path": str(ep.source_dataset.pathobj / ep.file_tree_path),
            "state": "clean",
            "gitshasum": ep.source_dataset_version
        }]
        extractor = ep.extractor_class()
        ensure_legacy_content_availability(ep, extractor, "content", status)

        for result in extractor(ep.source_dataset, ep.source_dataset_version,
                                "content", status):

            result["action"] = "meta_extract"
            if result["status"] == "ok":
                result["metadata_record"] = dict(
                    type="file",
                    dataset_id=ep.source_dataset_id,
                    dataset_version=ep.source_dataset_version,
                    path=ep.file_tree_path,
                    extractor_name=ep.extractor_name,
                    extractor_version=str(
                        extractor.get_state(ep.source_dataset)["version"]),
                    extraction_parameter=ep.extractor_arguments,
                    extraction_time=time.time(),
                    agent_name=ep.agent_name,
                    agent_email=ep.agent_email,
                    extracted_metadata=result["metadata"])

            yield result

    elif issubclass(ep.extractor_class, BaseMetadataExtractor):

        # Datalad legacy extractor
        path = str(ep.source_dataset.pathobj / ep.file_tree_path)
        if ep.extractor_class.NEEDS_CONTENT:
            ensure_legacy_path_availability(ep, path)

        extractor = ep.extractor_class(ep.source_dataset, [path])
        _, file_result = extractor.get_metadata(False, True)

        for path, metadata in file_result:
            result = dict(action="meta_extract",
                          status="ok",
                          type="file",
                          metadata_record=dict(
                              type="file",
                              dataset_id=ep.source_dataset_id,
                              dataset_version=ep.source_dataset_version,
                              path=MetadataPath(path),
                              extractor_name=ep.extractor_name,
                              extractor_version="un-versioned",
                              extraction_parameter=ep.extractor_arguments,
                              extraction_time=time.time(),
                              agent_name=ep.agent_name,
                              agent_email=ep.agent_email,
                              extracted_metadata=metadata))

            yield result

    else:
        raise ValueError(
            f"unknown extractor class: {ep.extractor_class.__name__}")
Example #30
    def get_path(self):
        if self.match(":"):
            path = MetadataPath(self.get_remaining())
            return True, path
        return False, MetadataPath("")