Ejemplo n.º 1
0
def read_jpeg(path: str) -> torch.Tensor:
    """
    Load a JPEG file from disk and decode it into an RGB tensor.

    The raw bytes of the file are exposed as a 1-D uint8 tensor (one element
    per byte) and handed to ``decode_jpeg``; its result is returned unchanged.

    Arguments:
        path (str): location of the JPEG file on disk.
    Returns:
        output (Tensor[image_width, image_height, 3]): decoded image with
        uint8 values between 0 and 255, as produced by ``decode_jpeg``.
    Raises:
        ValueError: if ``path`` is not an existing regular file, or if the
        file is empty.
    """
    path_is_file = os.path.isfile(path)
    if not path_is_file:
        raise ValueError("Expected a valid file path.")

    num_bytes = os.path.getsize(path)
    if not num_bytes:
        raise ValueError("Expected a non empty file.")

    # Size is given in elements; with uint8 that is exactly the byte count.
    encoded = torch.from_file(path, dtype=torch.uint8, size=num_bytes)
    return decode_jpeg(encoded)
Ejemplo n.º 2
0
    def test_decode_jpeg(self):
        """Check decode_jpeg against stored reference tensors and bad inputs."""
        # Each .jpg fixture is compared with the tensor saved at the same
        # path with 'jpg' replaced by 'pth'.
        for jpeg_path in get_images(IMAGE_ROOT, ".jpg"):
            expected = torch.load(jpeg_path.replace('jpg', 'pth'))
            num_bytes = os.path.getsize(jpeg_path)
            encoded = torch.from_file(
                jpeg_path, dtype=torch.uint8, size=num_bytes)
            decoded = decode_jpeg(encoded)
            self.assertTrue(decoded.equal(expected))

        # A 2-D uint8 tensor must be rejected with a ValueError.
        two_dim_input = torch.empty((100, 1), dtype=torch.uint8)
        with self.assertRaisesRegex(
                ValueError, "Expected a non empty 1-dimensional tensor."):
            decode_jpeg(two_dim_input)

        # A 1-D tensor of the wrong dtype must be rejected with a ValueError.
        wrong_dtype_input = torch.empty((100, ), dtype=torch.float16)
        with self.assertRaisesRegex(ValueError,
                                    "Expected a torch.uint8 tensor."):
            decode_jpeg(wrong_dtype_input)

        # A well-formed 1-D uint8 tensor with arbitrary contents is expected
        # to fail inside the decoder with a RuntimeError.
        arbitrary_bytes = torch.empty((100, ), dtype=torch.uint8)
        with self.assertRaises(RuntimeError):
            decode_jpeg(arbitrary_bytes)
Ejemplo n.º 3
0
 def make_mmap_file(path, input_size):
     """
     Return a tensor backed by the file at ``path``, viewed as a batch array.

     The leading dimension is ``num_batches * args.batch_size`` (both taken
     from the enclosing scope) followed by the dimensions in ``input_size``.
     """
     leading_dim = torch.Size([num_batches * args.batch_size])
     view_size = leading_dim + input_size
     total_elements = int(np.prod(view_size))
     # shared needs to be true for file to be created
     flat = torch.from_file(path, size=total_elements, shared=True)
     return flat.view(view_size)
Ejemplo n.º 4
0
def preprocess_input_file(config, args):
    """
    Convert the user-supplied input file into a preprocessed binary file.

    The input (``args.input_file``) is read either as a raw binary dump or as
    a delimited text file, remapped with the dataset's node/relation mapping
    files when those exist, optionally partitioned, and written into the
    dataset directory as ``preproc_<stem>.bin``.

    Arguments:
        config: configuration object; this function reads
            ``config.model.learning_task`` and ``config.storage.*``.
        args: parsed arguments; this function reads ``preprocess_input``,
            ``input_file``, ``input_format``, ``delim`` and ``header_length``
            (and passes ``args`` on to the shape/dtype/column helpers).
    Returns:
        tuple ``(input_file, input_file_offsets, storage_backend, shape)``:
        the output file name (relative to the dataset directory), the offsets
        file name or ``None`` when no partitioning was done, the storage
        backend config selected for the learning task, and the inferred
        input shape.
    Raises:
        AssertionError: if ``args.preprocess_input`` is falsy or the input
            file does not exist.
        RuntimeError: if the format is neither binary, CSV nor TSV and no
            delimiter was given.
    """
    assert args.preprocess_input
    assert pathlib.Path(args.input_file).exists()

    is_edges = config.model.learning_task == m.config.LearningTask.LINK_PREDICTION
    storage_backend = config.storage.edges if is_edges else config.storage.nodes

    shape = infer_input_shape(config, args)
    str_dtype, numpy_dtype = get_dtype(storage_backend, args)

    dataset_dir = config.storage.dataset.dataset_dir
    node_mapping_file = dataset_dir + PathConstants.node_mapping_path
    rel_mapping_file = dataset_dir + PathConstants.relation_mapping_path

    # Mapping files are optional; a missing file leaves the mapping as None.
    node_mapping_df = None
    rel_mapping_df = None

    if pathlib.Path(node_mapping_file).exists():
        node_mapping_df = pd.read_csv(node_mapping_file, sep=",", header=None)

    if pathlib.Path(rel_mapping_file).exists():
        rel_mapping_df = pd.read_csv(rel_mapping_file, sep=",", header=None)

    input_format = args.input_format.upper()

    if input_format in ("BINARY", "BIN"):
        # torch.from_file expects a filename, not an array, so the in-memory
        # NumPy buffer has to be wrapped with torch.from_numpy instead. The
        # file read is args.input_file, the same path validated above.
        raw_values = np.fromfile(args.input_file, numpy_dtype)
        # reshape (rather than the deprecated non-in-place resize) raises if
        # the file's element count does not match the inferred shape.
        input_tensor = torch.from_numpy(raw_values).reshape(shape)

        if node_mapping_df is not None:
            if len(input_tensor.shape) == 2:
                input_tensor = apply_mapping_edges(input_tensor, node_mapping_df, rel_mapping_df)
            else:
                input_tensor = apply_mapping1d(input_tensor, node_mapping_df)
    else:
        columns = get_columns(config, args)

        delim = args.delim

        # Fall back to the conventional delimiter for known text formats.
        if delim is None:
            if input_format == "CSV":
                delim = ","
            elif input_format == "TSV":
                delim = "\t"
            else:
                raise RuntimeError("Delimiter must be specified.")

        reader = PandasDelimitedFileReader(
            args.input_file,
            columns=columns,
            header_length=args.header_length,
            delim=delim,
            dtype=str_dtype
        )

        input_df, _, _ = reader.read()

        if node_mapping_df is not None:
            if len(input_df.shape) == 2:
                input_df = apply_mapping_edges(input_df, node_mapping_df, rel_mapping_df)
            else:
                input_df = apply_mapping1d(input_df, node_mapping_df)

        input_tensor = dataframe_to_tensor(input_df)

    # Name outputs after the input's base name without directory or final
    # extension. Splitting on "." failed for names without a dot and carried
    # directory components into the output file name.
    input_stem = pathlib.Path(args.input_file).stem
    input_file = "preproc_" + input_stem + ".bin"
    input_file_offsets = None

    # Embeddings take precedence over features when both use a partition buffer.
    num_partitions = 1
    if config.storage.embeddings is not None and config.storage.embeddings.type == m.config.StorageBackend.PARTITION_BUFFER:
        num_partitions = config.storage.embeddings.options.num_partitions
    elif config.storage.features is not None and config.storage.features.type == m.config.StorageBackend.PARTITION_BUFFER:
        num_partitions = config.storage.features.options.num_partitions

    # Only 2-D inputs are partitioned; the offsets are written one per line.
    if num_partitions > 1 and len(input_tensor.shape) == 2:
        input_file_offsets = input_stem + "_offsets.txt"
        input_tensor, offsets = partition_edges(input_tensor, config.storage.dataset.num_nodes, num_partitions)

        with open(dataset_dir + input_file_offsets, "w") as f:
            f.writelines([str(o) + "\n" for o in offsets])

    with open(dataset_dir + input_file, "wb") as f:
        f.write(bytes(input_tensor.numpy()))

    return input_file, input_file_offsets, storage_backend, shape