Beispiel #1
0
    def test_fs_detection_on_container_hdfs(self):
        """Check that a zip container stored on HDFS can be opened via pfio.

        A zip archive is built locally from ``self.tmpdir.name``, copied
        through pfio to the path reported by ``conn.info('.')``, and then read
        back with ``pfio.open_as_container``.  Both copies of the archive are
        removed even when the copy or the assertion fails, so a failing run
        does not leave ``test.zip`` behind.
        """
        # Create a container for testing
        zip_file_name = "test"
        zip_file_path = zip_file_name + ".zip"

        # in the zip, the leading slash will be removed
        file_name_zip = self.tmpfile_path.lstrip('/')

        # TODO(tianqi): add functionality to pfio
        from pyarrow import hdfs

        conn = hdfs.connect()
        try:
            hdfs_home = conn.info('.')['path']
        finally:
            # Release the connection even if the lookup raises.
            conn.close()

        hdfs_file_path = os.path.join(hdfs_home, zip_file_path)

        shutil.make_archive(zip_file_name, "zip", base_dir=self.tmpdir.name)
        try:
            with pfio.open(hdfs_file_path, "wb") as hdfs_file:
                with pfio.open(zip_file_path, "rb") as posix_file:
                    hdfs_file.write(posix_file.read())

            try:
                with pfio.open_as_container(hdfs_file_path) as container:
                    with container.open(file_name_zip, "r") as f:
                        self.assertEqual(f.read(), self.test_string_str)
            finally:
                # Only reached once the remote copy has been written.
                pfio.remove(hdfs_file_path)
        finally:
            # The local archive exists as soon as make_archive returned.
            pfio.remove(zip_file_path)
    def __init__(self, dataDir='./facade/base', data_range=(1, 300)):
        """Load and preprocess image/label pairs into ``self.dataset``.

        For every index ``i`` in ``range(data_range[0], data_range[1])`` the
        files ``cmp_b%04d.jpg`` (image) and ``cmp_b%04d.png`` (label map) are
        read through ``pfio.open``, resized so that the shorter image side is
        286 pixels, and appended to ``self.dataset`` as an ``(img, label)``
        tuple.  ``img`` is a float array in channel-first order computed as
        ``pixel / 128.0 - 1.0``; ``label`` is a 12-channel integer mask with
        one channel per label value.

        Args:
            dataDir: Directory prefix passed to ``pfio.open``.
            data_range: ``(start, stop)`` pair of file indices; ``stop`` is
                exclusive.
        """
        print("load dataset start")
        print("    from: %s" % dataDir)
        print("    range: [%d, %d)" % (data_range[0], data_range[1]))
        self.dataDir = dataDir
        self.dataset = []

        for i in range(data_range[0], data_range[1]):
            # PFIO add
            # Image.open reads lazily, so the pixel data must be pulled in
            # (here by resize) before the underlying file is closed.
            with pfio.open(dataDir + "/cmp_b%04d.jpg" % i,
                           mode='rb') as img_data:
                img = Image.open(img_data)
                w, h = img.size
                r = 286 / float(min(w, h))
                # resize images so that min(w, h) == 286
                size = (int(r * w), int(r * h))
                img = img.resize(size, Image.BILINEAR)

            with pfio.open(dataDir + "/cmp_b%04d.png" % i,
                           mode='rb') as label_data:
                label = Image.open(label_data).resize(size, Image.NEAREST)
            # PFIO add end

            img = np.asarray(img).astype("f").transpose(2, 0, 1) / 128.0 - 1.0
            label_ = np.asarray(label) - 1  # [0, 12)
            label = np.zeros((12, img.shape[1], img.shape[2])).astype("i")
            for j in range(12):
                # Channel j is 1 where the shifted label map equals j.
                label[j, :] = label_ == j
            self.dataset.append((img, label))
        print("load dataset done")
Beispiel #3
0
    def test_remove(self):
        """Exercise ``pfio.remove`` on a single file and on a directory."""
        file_name = "test_remove.txt"
        top_dir = "test_dir/"
        inner_dir = os.path.join(top_dir, "nested_file/")
        inner_file = os.path.join(inner_dir, file_name)

        # Case 1: a plain file disappears after remove.
        with pfio.open(file_name, 'w') as out:
            out.write('foobar')
        self.assertTrue(pfio.exists(file_name))
        pfio.remove(file_name)
        self.assertFalse(pfio.exists(file_name))

        # Case 2: a directory holding a nested directory and a file.
        pfio.makedirs(inner_dir)
        with pfio.open(inner_file, 'w') as out:
            out.write('foobar')

        created = (top_dir, inner_dir, inner_file)
        for path in created:
            self.assertTrue(pfio.exists(path))

        # Second positional argument True: remove the whole tree.
        pfio.remove(top_dir, True)

        for path in created:
            self.assertFalse(pfio.exists(path))
Beispiel #4
0
    def test_root_local_override(self):
        """A full ``file://`` URI still works while a local root is set."""
        local_root = 'file://' + self.tmpdir.name
        pfio.set_root(local_root)
        with pfio.open(self.tmpfile_name, "r") as rooted:
            self.assertEqual(rooted.read(), self.test_string_str)

        # A complete URI must win over the configured root.
        script_uri = 'file://' + __file__
        with open(__file__, "r") as my_script, pfio.open(script_uri) as via_pfio:
            actual = via_pfio.read()
            expected = my_script.read().encode("utf-8")
            self.assertEqual(actual, expected)
Beispiel #5
0
    def test_set_root(self):
        """Read the temporary file under two different ``pfio`` roots."""
        expected = self.test_string_str

        # Root 'posix': the file is addressed by its full path.
        pfio.set_root('posix')
        with pfio.open(self.tmpfile_path, "r") as by_path:
            self.assertEqual(by_path.read(), expected)

        # Root set to the temporary directory: the bare file name suffices.
        tmp_root = 'file://' + self.tmpdir.name
        pfio.set_root(tmp_root)
        with pfio.open(self.tmpfile_name, "r") as by_name:
            self.assertEqual(by_name.read(), expected)
Beispiel #6
0
    def get_example(self, i):
        """Called by the iterator to fetch a data sample.

        A data sample from MSCOCO consists of an image and its corresponding
        caption.

        The returned image has the shape (channel, height, width).

        Args:
            i: Index into ``self.anns``.

        Returns:
            A tuple ``(img, tokens)``: ``img`` is a float32 array with three
            channels, ``tokens`` is an int32 array of vocabulary ids for the
            caption wrapped in ``<bos>`` / ``<eos>``.

        Raises:
            ValueError: If the image mode is neither ``'RGB'`` nor ``'L'``.
        """
        ann = self.anns[i]

        # Load the image
        img_id = ann['image_id']
        img_file_name = self.coco.loadImgs([img_id])[0]['file_name']

        # PFIO load file
        file_name = os.path.join(self.coco_root, self.coco_data, img_file_name)
        # Image.open is lazy, so the conversion to an array (which reads the
        # pixel data) has to happen while the file is still open; the ``with``
        # block then closes the handle instead of leaking it.
        with pfio.open(file_name, 'rb') as img_file:
            img = Image.open(img_file)

            if img.mode == 'RGB':
                img = np.asarray(img, np.float32).transpose(2, 0, 1)
            elif img.mode == 'L':
                # Replicate the single grey channel three times.
                img = np.asarray(img, np.float32)
                img = np.broadcast_to(img, (3,) + img.shape)
            else:
                raise ValueError('Invalid image mode {}'.format(img.mode))
        # PFIO load end

        # Load the caption, i.e. sequence of tokens
        tokens = [self.vocab.get(w, _unk) for w in
                  ['<bos>'] + split(ann['caption']) + ['<eos>']]
        tokens = np.array(tokens, np.int32)

        return img, tokens
Beispiel #7
0
    def test_rename(self):
        """Rename a file addressed by ``file://`` URIs and check both ends."""
        # The context manager removes the directory even if an assert fails.
        with tempfile.TemporaryDirectory() as new_tmp_dir:
            # os.path.join discards every component that precedes an absolute
            # path, so os.path.join("file://", <tmp dir>, 'src') silently lost
            # the scheme.  Prepend it by concatenation so the URI form is what
            # actually gets tested.
            src = "file://" + os.path.join(new_tmp_dir, 'src')
            dst = "file://" + os.path.join(new_tmp_dir, 'dst')
            with pfio.open(src, 'w') as fp:
                fp.write('foobar')

            assert pfio.exists(src)
            assert not pfio.exists(dst)

            pfio.rename(src, dst)
            with pfio.open(dst, 'r') as fp:
                data = fp.read()
                assert data == 'foobar'

            assert not pfio.exists(src)
            assert pfio.exists(dst)
Beispiel #8
0
def read_corpus(path, max_size):
    """Parse one tree per line from ``path`` and return them as a list.

    Each line is stripped and handed to ``SexpParser``.  Reading stops once
    ``max_size`` trees have been collected; a falsy ``max_size`` disables
    the limit.
    """
    parsed = []
    # PFIO modify
    with pfio.open(path, mode='r', encoding='utf-8') as corpus:
        # PFIO modify end
        for raw_line in corpus:
            parsed.append(SexpParser(raw_line.strip()).parse())
            limit_reached = max_size and len(parsed) >= max_size
            if limit_reached:
                break

    return parsed
Beispiel #9
0
    def test_root_fs_override(self):
        """A full ``file://`` URI overrides an ``hdfs`` root, and only once.

        A temporary file is written to HDFS through pyarrow, read back via
        pfio with the root set to ``"hdfs"``, then a local file is read via a
        full URI, and finally the HDFS file is read again.  The HDFS file and
        the connection are released even when an assertion fails.
        """
        from pyarrow import hdfs

        hdfs_tmpfile = "tmpfile_hdfs"
        hdfs_file_string = "this is a test string for hdfs"

        conn = hdfs.connect()
        try:
            with conn.open(hdfs_tmpfile, "wb") as f:
                f.write(hdfs_file_string.encode('utf-8'))

            try:
                pfio.set_root("hdfs")
                with pfio.open(hdfs_tmpfile, "r") as fp:
                    self.assertEqual(fp.read(), hdfs_file_string)

                # override with full URI
                with open(__file__, "r") as my_script:
                    with pfio.open("file://" + __file__, "r") as fp:
                        self.assertEqual(fp.read(), my_script.read())

                # The root must still be hdfs after the one-off override.
                with pfio.open(hdfs_tmpfile, "r") as fp:
                    self.assertEqual(fp.read(), hdfs_file_string)
            finally:
                # Only reached once the temporary file has been written.
                conn.delete(hdfs_tmpfile)
        finally:
            conn.close()
# PFIO import
import pfio
# PFIO end

import numpy as np

import matplotlib
matplotlib.use('Agg')

# Load the mushrooms CSV through pfio, encode every column as integer
# category ids, and split it 70/30 into random train/test datasets.
mushroomsfile = 'mushrooms.csv'
# uncomment to use HDFS, remember to put the mushroomsfile to HDFS
# pfio.set_root("hdfs")

# PFIO read file start
# The ``with`` block closes the handle once genfromtxt has consumed it.
with pfio.open(mushroomsfile, 'r') as mushroomsdata:
    data_array = np.genfromtxt(mushroomsdata,
                               delimiter=',',
                               dtype=str,
                               skip_header=1)
# PFIO read file end

# Replace each column's string values by the index of the value among that
# column's sorted unique values.
for col in range(data_array.shape[1]):
    data_array[:, col] = np.unique(data_array[:, col], return_inverse=True)[1]

# Column 0 is used as the target, the remaining columns as features.
X = data_array[:, 1:].astype(np.float32)
Y = data_array[:, 0].astype(np.int32)[:, None]
train, test = datasets.split_dataset_random(datasets.TupleDataset(X, Y),
                                            int(data_array.shape[0] * .7))