def test_fs_detection_on_container_hdfs(self):
    """Check that a zip container stored on HDFS can be opened and read.

    The test archives ``self.tmpdir`` into a local zip, copies the zip to
    the directory that ``'.'`` resolves to on the HDFS connection, opens
    that copy with ``pfio.open_as_container`` and compares the archived
    file's content with ``self.test_string_str``.

    Both the local zip and its HDFS copy are removed in a ``finally``
    clause so that a failing assertion does not leave them behind.
    """
    # Create a container for testing
    zip_file_name = "test"
    zip_file_path = zip_file_name + ".zip"

    # in the zip, the leading slash will be removed
    file_name_zip = self.tmpfile_path.lstrip('/')

    # TODO(tianqi): add functionality to pfio
    from pyarrow import hdfs
    conn = hdfs.connect()
    try:
        hdfs_home = conn.info('.')['path']
    finally:
        # Close the connection even if the lookup fails.
        conn.close()
    hdfs_file_path = os.path.join(hdfs_home, zip_file_path)

    try:
        shutil.make_archive(zip_file_name, "zip", base_dir=self.tmpdir.name)

        # Copy the local archive to HDFS byte for byte.
        with pfio.open(hdfs_file_path, "wb") as hdfs_file:
            with pfio.open(zip_file_path, "rb") as posix_file:
                hdfs_file.write(posix_file.read())

        with pfio.open_as_container(hdfs_file_path) as container:
            with container.open(file_name_zip, "r") as f:
                self.assertEqual(f.read(), self.test_string_str)
    finally:
        # Only remove what was actually created, so that a cleanup error
        # does not mask the original failure.
        for leftover in (zip_file_path, hdfs_file_path):
            if pfio.exists(leftover):
                pfio.remove(leftover)
def __init__(self, dataDir='./facade/base', data_range=(1, 300)):
    """Load image/label pairs into memory.

    For every index ``i`` in ``range(data_range[0], data_range[1])`` the
    files ``cmp_b%04d.jpg`` (image) and ``cmp_b%04d.png`` (label) are read
    from ``dataDir`` through ``pfio``, resized and converted to arrays.
    The resulting ``(img, label)`` tuples are appended to ``self.dataset``.

    Args:
        dataDir: Directory (any location ``pfio.open`` can resolve) that
            contains the ``cmp_b*.jpg`` / ``cmp_b*.png`` files.
        data_range: ``(start, stop)`` pair of file indices; ``stop`` is
            exclusive.
    """
    print("load dataset start")
    print(" from: %s" % dataDir)
    print(" range: [%d, %d)" % (data_range[0], data_range[1]))
    self.dataDir = dataDir
    self.dataset = []
    for i in range(data_range[0], data_range[1]):
        # PFIO add
        # Image.open is lazy, so force the pixel data to be read with
        # load() before the underlying file object is closed.
        with pfio.open(dataDir + "/cmp_b%04d.jpg" % i, mode='rb') as img_data:
            img = Image.open(img_data)
            img.load()
        with pfio.open(dataDir + "/cmp_b%04d.png" % i, mode='rb') as label_data:
            label = Image.open(label_data)
            label.load()
        # PFIO add end

        w, h = img.size
        r = 286 / float(min(w, h))
        # resize images so that min(w, h) == 286
        img = img.resize((int(r * w), int(r * h)), Image.BILINEAR)
        label = label.resize((int(r * w), int(r * h)), Image.NEAREST)

        # Channel-first float array; presumably 8-bit input, which this
        # maps to roughly [-1, 1) -- verify against the data.
        img = np.asarray(img).astype("f").transpose(2, 0, 1) / 128.0 - 1.0
        label_ = np.asarray(label) - 1  # [0, 12)

        # Expand the class-index map into 12 binary planes, one per class.
        label = np.zeros((12, img.shape[1], img.shape[2])).astype("i")
        for j in range(12):
            label[j, :] = label_ == j
        self.dataset.append((img, label))
    print("load dataset done")
def test_remove(self):
    """Exercise ``pfio.remove`` on a single file and on a directory tree."""
    file_name = "test_remove.txt"
    top_dir = "test_dir/"
    inner_dir = os.path.join(top_dir, "nested_file/")
    inner_file = os.path.join(inner_dir, file_name)

    # test remove on one file
    with pfio.open(file_name, 'w') as out:
        out.write('foobar')
    self.assertTrue(pfio.exists(file_name))
    pfio.remove(file_name)
    self.assertFalse(pfio.exists(file_name))

    # test remove on directory
    pfio.makedirs(inner_dir)
    with pfio.open(inner_file, 'w') as out:
        out.write('foobar')

    tree = (top_dir, inner_dir, inner_file)
    for entry in tree:
        self.assertTrue(pfio.exists(entry))

    # The second positional argument is passed as True, as in the
    # original call, so that the whole tree is removed.
    pfio.remove(top_dir, True)

    for entry in tree:
        self.assertFalse(pfio.exists(entry))
def test_root_local_override(self):
    """Check that a full ``file://`` URI bypasses the configured root.

    The root is first set to ``self.tmpdir`` and a file is read relative
    to it. Then this test module itself is opened through an explicit
    ``file://`` URI and compared with the same file read via the builtin
    ``open``.
    """
    pfio.set_root('file://' + self.tmpdir.name)
    with pfio.open(self.tmpfile_name, "r") as fp:
        self.assertEqual(fp.read(), self.test_string_str)

    # override with full URI
    # pfio.open is called without a mode here and is compared against
    # bytes, so read the reference copy in binary mode as well. Reading it
    # as text and re-encoding would depend on the locale encoding and on
    # newline translation.
    with open(__file__, "rb") as my_script:
        with pfio.open('file://' + __file__) as fp:
            self.assertEqual(fp.read(), my_script.read())
def test_set_root(self):
    """Read the same temp file under a 'posix' root and a file:// root."""
    # Set default context globally in this process
    pfio.set_root('posix')

    # Using the context to open local file
    with pfio.open(self.tmpfile_path, "r") as reader:
        via_posix_root = reader.read()
        self.assertEqual(via_posix_root, self.test_string_str)

    # With the temp directory as root, the bare file name is enough.
    tmp_root_uri = 'file://' + self.tmpdir.name
    pfio.set_root(tmp_root_uri)
    with pfio.open(self.tmpfile_name, "r") as reader:
        via_tmp_root = reader.read()
        self.assertEqual(via_tmp_root, self.test_string_str)
def get_example(self, i):
    """Called by the iterator to fetch a data sample.

    A data sample from MSCOCO consists of an image and its corresponding
    caption.

    The returned image has the shape (channel, height, width).

    Args:
        i: Index into ``self.anns``.

    Returns:
        Tuple ``(img, tokens)``: a ``float32`` image array and an
        ``int32`` array of token ids for the caption, wrapped in
        ``<bos>`` / ``<eos>``.

    Raises:
        ValueError: If the image mode is neither ``'RGB'`` nor ``'L'``.
    """
    ann = self.anns[i]

    # Load the image
    img_id = ann['image_id']
    img_file_name = self.coco.loadImgs([img_id])[0]['file_name']

    # PFIO load file
    file_name = os.path.join(self.coco_root, self.coco_data, img_file_name)
    # Decode inside the ``with`` block: Image.open is lazy, and the file
    # must stay open until np.asarray has pulled the pixel data. The
    # handle is then closed instead of being leaked.
    with pfio.open(file_name, 'rb') as img_file:
        img = Image.open(img_file)
        # PFIO load end
        if img.mode == 'RGB':
            img = np.asarray(img, np.float32).transpose(2, 0, 1)
        elif img.mode == 'L':
            # Replicate the single grey channel to three channels.
            img = np.asarray(img, np.float32)
            img = np.broadcast_to(img, (3,) + img.shape)
        else:
            raise ValueError('Invalid image mode {}'.format(img.mode))

    # Load the caption, i.e. sequence of tokens
    tokens = [self.vocab.get(w, _unk) for w in
              ['<bos>'] + split(ann['caption']) + ['<eos>']]
    tokens = np.array(tokens, np.int32)

    return img, tokens
def test_rename(self):
    """Rename a file inside a fresh temp directory and check both paths."""
    # The context manager cleans the directory up on exit, matching the
    # explicit cleanup() in a finally clause.
    with tempfile.TemporaryDirectory() as scratch_dir:
        # NOTE(review): os.path.join discards "file://" when the next
        # component is an absolute path, which a TemporaryDirectory name
        # presumably is, so these end up as plain local paths -- confirm
        # whether a URI was intended.
        src = os.path.join("file://", scratch_dir, 'src')
        dst = os.path.join("file://", scratch_dir, 'dst')

        with pfio.open(src, 'w') as writer:
            writer.write('foobar')

        assert pfio.exists(src)
        assert not pfio.exists(dst)

        pfio.rename(src, dst)

        with pfio.open(dst, 'r') as reader:
            data = reader.read()
            assert data == 'foobar'

        assert not pfio.exists(src)
        assert pfio.exists(dst)
def read_corpus(path, max_size):
    """Parse a corpus file into a list of trees, one per line.

    Each line is stripped and handed to ``SexpParser``; the parsed result
    is collected. Reading stops once ``max_size`` trees have been
    collected; a falsy ``max_size`` disables the limit.

    Args:
        path: Location of the corpus, opened through ``pfio`` as UTF-8
            text.
        max_size: Maximum number of trees to read, or a falsy value for
            no limit.

    Returns:
        List of parsed trees in file order.
    """
    parsed = []
    # PFIO modify
    with pfio.open(path, mode='r', encoding='utf-8') as corpus:
        # PFIO modify end
        for raw_line in corpus:
            parsed.append(SexpParser(raw_line.strip()).parse())
            if not max_size:
                continue
            if len(parsed) >= max_size:
                break
    return parsed
def test_root_fs_override(self):
    """Check that a ``file://`` URI overrides an ``hdfs`` root.

    A small file is written to HDFS through pyarrow, the pfio root is set
    to ``"hdfs"`` and the file is read back by its bare name. This test
    module is then read through an explicit ``file://`` URI, after which
    the HDFS file is read once more to confirm the root is unchanged.

    The HDFS file is deleted and the connection closed in ``finally``
    clauses so that a failing assertion does not leak either.
    """
    from pyarrow import hdfs

    hdfs_tmpfile = "tmpfile_hdfs"
    hdfs_file_string = "this is a test string for hdfs"

    conn = hdfs.connect()
    try:
        with conn.open(hdfs_tmpfile, "wb") as f:
            f.write(hdfs_file_string.encode('utf-8'))

        try:
            pfio.set_root("hdfs")
            with pfio.open(hdfs_tmpfile, "r") as fp:
                self.assertEqual(fp.read(), hdfs_file_string)

            # override with full URI
            with open(__file__, "r") as my_script:
                with pfio.open("file://" + __file__, "r") as fp:
                    self.assertEqual(fp.read(), my_script.read())

            # The one-off URI override must not have changed the root.
            with pfio.open(hdfs_tmpfile, "r") as fp:
                self.assertEqual(fp.read(), hdfs_file_string)
        finally:
            conn.delete(hdfs_tmpfile)
    finally:
        conn.close()
# PFIO import
import pfio
# PFIO end
import numpy as np
import matplotlib
# Select the non-interactive Agg backend.
matplotlib.use('Agg')

mushroomsfile = 'mushrooms.csv'

# uncomment to use HDFS, remember to put the mushroomsfile to HDFS
# pfio.set_root("hdfs")

# PFIO read file start
# Use a context manager so the handle is closed once genfromtxt has read
# the file; the original left it open.
with pfio.open(mushroomsfile, 'r') as mushroomsdata:
    # PFIO read file end
    data_array = np.genfromtxt(
        mushroomsdata, delimiter=',', dtype=str, skip_header=1)

# Replace every column's string values with integer codes: the index of
# each value among that column's sorted unique values.
for col in range(data_array.shape[1]):
    data_array[:, col] = np.unique(data_array[:, col], return_inverse=True)[1]

# Column 0 is used as the target, the remaining columns as features.
X = data_array[:, 1:].astype(np.float32)
Y = data_array[:, 0].astype(np.int32)[:, None]

# First argument of the split is 70% of the row count.
train, test = datasets.split_dataset_random(
    datasets.TupleDataset(X, Y), int(data_array.shape[0] * .7))