Ejemplo n.º 1
0
def test_replace_plain():
    """Check ``replace_substring`` with a literal pattern on an array that contains a null."""
    source = pa.array(['foo', 'food', None])
    replaced = pc.replace_substring(
        source,
        pattern='foo',
        replacement='bar',
    )
    expected = ['bar', 'bard', None]
    assert replaced.tolist() == expected
Ejemplo n.º 2
0
    def _generate_examples(self, files, metadata_files, split_name):
        """Yield ``(index, example)`` pairs for every image found in ``files``.

        Args:
            files: Iterable of ``(original_file, downloaded_file_or_dir)``
                pairs. When ``original_file`` is not ``None`` the pair is a
                single file: its extension and directory are taken from
                ``original_file`` and ``downloaded_file_or_dir`` becomes the
                ``"image"`` value. When ``original_file`` is ``None``,
                ``downloaded_file_or_dir`` is iterated and every item is
                treated as a file path (the archive case).
            metadata_files: Mapping from split name to a list of
                ``(metadata_file, downloaded_metadata_file)`` pairs, or a
                falsy value when there is no metadata. A ``metadata_file`` of
                ``None`` marks a metadata file that lives inside an archive.
            split_name: Key used to select this split's entries from
                ``metadata_files``.

        Yields:
            ``(index, example)`` where ``index`` counts the yielded images
            from 0. With metadata enabled, ``example`` contains every key of
            ``self.info.features`` (defaulting to ``None``), the ``"image"``
            value and the image's metadata row. Otherwise it contains
            ``"image"`` and, unless ``self.config.drop_labels`` is set or
            metadata files were given, a ``"label"`` equal to the name of the
            directory that holds the image.

        Raises:
            ValueError: If no metadata file sits in the image's directory or
                one of its parent directories, or if the selected metadata
                file has no entry for the image.
        """

        def is_image(path):
            _, extension = os.path.splitext(path)
            return extension.lower() in self.IMAGE_EXTENSIONS

        def iter_images():
            # Lazily flatten `files` into (lookup_path, image, in_archive):
            # `lookup_path` drives extension/label/metadata resolution and
            # `image` is the value stored in the example.
            for original_file, downloaded_file_or_dir in files:
                if original_file is not None:
                    if is_image(original_file):
                        yield original_file, downloaded_file_or_dir, False
                else:
                    for downloaded_dir_file in downloaded_file_or_dir:
                        if is_image(downloaded_dir_file):
                            yield downloaded_dir_file, downloaded_dir_file, True

        if self.config.drop_metadata or not metadata_files:
            # No usable metadata: emit the image alone, or with a label
            # derived from the name of its directory.
            for file_idx, (lookup_path, image, _) in enumerate(iter_images()):
                example = {"image": image}
                if not (self.config.drop_labels or metadata_files):
                    example["label"] = os.path.basename(
                        os.path.dirname(lookup_path))
                yield file_idx, example
            return

        split_metadata_files = metadata_files.get(split_name, [])
        image_empty_metadata = {
            k: None
            for k in self.info.features if k != "image"
        }

        def find_nearest_metadata(lookup_path, in_archive):
            # Return (metadata_dir, downloaded_metadata_file) for the metadata
            # file closest to `lookup_path` (fewest path segments between
            # them), or None when no metadata file is in the same directory or
            # a parent directory.
            candidates = []
            for metadata_file, downloaded_file in split_metadata_files:
                # Plain files only match metadata files outside archives and
                # archive members only match metadata files inside archives.
                if (metadata_file is None) != in_archive:
                    continue
                # Archive metadata has no original path, so its downloaded
                # location is the reference directory instead.
                base_dir = os.path.dirname(
                    downloaded_file if in_archive else metadata_file)
                relpath = os.path.relpath(lookup_path, base_dir)
                # A leading ".." means the image is not below `base_dir`.
                if not relpath.startswith(".."):
                    candidates.append((relpath, base_dir, downloaded_file))
            if not candidates:
                return None
            _, base_dir, downloaded_file = min(
                candidates, key=lambda c: count_path_segments(c[0]))
            return base_dir, downloaded_file

        def load_metadata(path):
            # Map each "file_name" of the JSON metadata table (with
            # backslashes normalized to "/") to the remaining columns of its
            # row. NOTE(review): assumes pa_table_to_pylist yields one mapping
            # per row, since the rows are later unpacked with `**`.
            with open(path, "rb") as f:
                pa_metadata_table = paj.read_json(f)
            pa_file_name_array = pc.replace_substring(
                pa_metadata_table["file_name"],
                pattern="\\",
                replacement="/")
            pa_metadata_table = pa_metadata_table.drop(["file_name"])
            return dict(
                zip(pa_file_name_array.to_pylist(),
                    pa_table_to_pylist(pa_metadata_table)))

        last_checked_dir = None
        metadata_dir = None
        metadata_dict = None
        downloaded_metadata_file = None

        for file_idx, (lookup_path, image,
                       in_archive) in enumerate(iter_images()):
            current_dir = os.path.dirname(lookup_path)
            # Metadata is only re-resolved when the directory changes, so
            # consecutive images from one directory share a single load.
            if last_checked_dir is None or last_checked_dir != current_dir:
                last_checked_dir = current_dir
                nearest = find_nearest_metadata(lookup_path, in_archive)
                if nearest is None:
                    raise ValueError(
                        f"One or several metadata.jsonl were found, but not in the same directory or in a parent directory of {image}."
                    )
                metadata_dir, downloaded_metadata_file = nearest
                metadata_dict = load_metadata(downloaded_metadata_file)

            file_relpath = os.path.relpath(lookup_path, metadata_dir)
            file_relpath = file_relpath.replace("\\", "/")
            if file_relpath not in metadata_dict:
                raise ValueError(
                    f"Image at {file_relpath} doesn't have metadata in {downloaded_metadata_file}."
                )
            yield file_idx, {
                **image_empty_metadata,
                "image": image,
                **metadata_dict[file_relpath],
            }