def read_jpeg(path: str) -> torch.Tensor:
    """
    Reads a JPEG image into a 3 dimensional RGB Tensor. The values of the
    output tensor are uint8 between 0 and 255.

    Arguments:
        path (str): path of the JPEG image.

    Returns:
        output (Tensor[image_width, image_height, 3])
    """
    if not os.path.isfile(path):
        raise ValueError("Expected a valid file path.")
    size = os.path.getsize(path)
    if size == 0:
        raise ValueError("Expected a non empty file.")
    data = torch.from_file(path, dtype=torch.uint8, size=size)
    return decode_jpeg(data)
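# A minimal usage sketch for read_jpeg above; "dog.jpg" is a hypothetical
# path, not a file shipped with this code.
img = read_jpeg("dog.jpg")
print(img.dtype)    # torch.uint8, values in [0, 255]
print(img.shape)    # 3-dimensional, e.g. (width, height, 3) per the docstring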
def test_decode_jpeg(self):
    for img_path in get_images(IMAGE_ROOT, ".jpg"):
        img_pil = torch.load(img_path.replace('jpg', 'pth'))
        size = os.path.getsize(img_path)
        img_ljpeg = decode_jpeg(
            torch.from_file(img_path, dtype=torch.uint8, size=size))
        self.assertTrue(img_ljpeg.equal(img_pil))

    with self.assertRaisesRegex(
            ValueError, "Expected a non empty 1-dimensional tensor."):
        decode_jpeg(torch.empty((100, 1), dtype=torch.uint8))

    with self.assertRaisesRegex(ValueError,
                                "Expected a torch.uint8 tensor."):
        decode_jpeg(torch.empty((100, ), dtype=torch.float16))

    with self.assertRaises(RuntimeError):
        decode_jpeg(torch.empty((100), dtype=torch.uint8))
def make_mmap_file(path, input_size):
    # num_batches and args are module-level globals in the original script
    view_size = torch.Size([num_batches * args.batch_size]) + input_size
    # shared needs to be true for the file to be created
    return torch.from_file(path, size=int(np.prod(view_size)),
                           shared=True).view(view_size)
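# A self-contained sketch of the same idea without the num_batches/args
# globals (the function name and the /tmp path are hypothetical). With
# shared=True the file is mapped MAP_SHARED, which is also what allows
# torch.from_file to create it if it does not exist; in-place writes to the
# tensor are persisted to disk through the mapping.
import numpy as np
import torch

def make_mmap_tensor(path, view_size):
    n_elems = int(np.prod(view_size))
    return torch.from_file(path, size=n_elems, shared=True,
                           dtype=torch.float32).view(view_size)

buf = make_mmap_tensor("/tmp/activations.bin", torch.Size([4, 8]))
buf.fill_(1.0)    # written through to /tmp/activations.bin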
def preprocess_input_file(config, args):
    assert args.preprocess_input
    assert pathlib.Path(args.input_file).exists()

    is_edges = config.model.learning_task == m.config.LearningTask.LINK_PREDICTION

    if is_edges:
        storage_backend = config.storage.edges
    else:
        storage_backend = config.storage.nodes

    shape = infer_input_shape(config, args)
    str_dtype, numpy_dtype = get_dtype(storage_backend, args)

    node_mapping_file = config.storage.dataset.dataset_dir + PathConstants.node_mapping_path
    rel_mapping_file = config.storage.dataset.dataset_dir + PathConstants.relation_mapping_path

    node_mapping_df = None
    rel_mapping_df = None
    if pathlib.Path(node_mapping_file).exists():
        node_mapping_df = pd.read_csv(node_mapping_file, sep=",", header=None)
    if pathlib.Path(rel_mapping_file).exists():
        rel_mapping_df = pd.read_csv(rel_mapping_file, sep=",", header=None)

    if args.input_format.upper() == "BINARY" or args.input_format.upper() == "BIN":
        # read the raw binary with numpy and wrap it without a copy
        # (torch.from_file takes a filename, not an ndarray)
        input_tensor = torch.from_numpy(np.fromfile(args.input_file, numpy_dtype)).reshape(shape)

        if node_mapping_df is not None:
            if len(input_tensor.shape) == 2:
                input_tensor = apply_mapping_edges(input_tensor, node_mapping_df, rel_mapping_df)
            else:
                input_tensor = apply_mapping1d(input_tensor, node_mapping_df)
    else:
        columns = get_columns(config, args)

        delim = args.delim
        if delim is None:
            if args.input_format.upper() == "CSV":
                delim = ","
            elif args.input_format.upper() == "TSV":
                delim = "\t"
            else:
                raise RuntimeError("Delimiter must be specified.")

        reader = PandasDelimitedFileReader(args.input_file,
                                           columns=columns,
                                           header_length=args.header_length,
                                           delim=delim,
                                           dtype=str_dtype)

        input_df, _, _ = reader.read()

        if node_mapping_df is not None:
            if len(input_df.shape) == 2:
                input_df = apply_mapping_edges(input_df, node_mapping_df, rel_mapping_df)
            else:
                input_df = apply_mapping1d(input_df, node_mapping_df)

        input_tensor = dataframe_to_tensor(input_df)

    # TODO probably not a great way to name the preprocessed file
    input_file = "preproc_" + args.input_file.split(".")[-2] + ".bin"
    input_file_offsets = None

    num_partitions = 1
    if config.storage.embeddings is not None and config.storage.embeddings.type == m.config.StorageBackend.PARTITION_BUFFER:
        num_partitions = config.storage.embeddings.options.num_partitions
    elif config.storage.features is not None and config.storage.features.type == m.config.StorageBackend.PARTITION_BUFFER:
        num_partitions = config.storage.features.options.num_partitions

    if num_partitions > 1 and len(input_tensor.shape) == 2:
        input_file_offsets = args.input_file.split(".")[-2] + "_offsets.txt"
        input_tensor, offsets = partition_edges(input_tensor,
                                                config.storage.dataset.num_nodes,
                                                num_partitions)

        with open(config.storage.dataset.dataset_dir + input_file_offsets, "w") as f:
            f.writelines([str(o) + "\n" for o in offsets])

    with open(config.storage.dataset.dataset_dir + input_file, "wb") as f:
        f.write(bytes(input_tensor.numpy()))

    return input_file, input_file_offsets, storage_backend, shape
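# Two equivalent ways to load the flat binary file handled in the BINARY
# branch above, sketched under the assumption of an int32 edge list with
# three columns stored in "edges.bin" (a hypothetical file):
import os
import numpy as np
import torch

# (a) read with numpy, then wrap the buffer without a copy
t1 = torch.from_numpy(np.fromfile("edges.bin", dtype=np.int32)).reshape(-1, 3)

# (b) memory-map directly; torch.from_file takes a path and a size given
# as an element count of the requested dtype, not a byte count
n_elems = os.path.getsize("edges.bin") // np.dtype(np.int32).itemsize
t2 = torch.from_file("edges.bin", dtype=torch.int32, size=n_elems).reshape(-1, 3)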