Example #1
0
    def _save_as_numpy(self,
                       image_path=None,
                       label_path=None,
                       image_save_path=None,
                       label_save_path=None):
        """
        Load the extracted MNIST files and persist them in ``*.npy`` format.

        :param image_path: MNIST image data path (extracted file path)
        :param label_path: MNIST image label path (extracted file path)
        :param image_save_path: destination path for the image ``.npy`` file
        :param label_save_path: destination path for the label ``.npy`` file
        """
        # Read the raw MNIST files into in-memory arrays and remember
        # both the data and where it is going to be stored.
        self._image_data, self._label_data = loadlocal_mnist(
            images_path=image_path, labels_path=label_path)
        self._image_data_path = image_save_path
        self._label_data_path = label_save_path

        # Save each array unless a previous run already wrote it to disk.
        pending_saves = [
            (self._image_data, image_save_path, "Images Already Saved!"),
            (self._label_data, label_save_path, "Labels Already Saved!"),
        ]
        for array, target_path, message in pending_saves:
            if not FileUtils.check_exist_with_message(file_path=target_path,
                                                      message=message):
                np.save(target_path, array)
Example #2
0
    def download(self):
        """
        Downloads the data set and saves it to disk.

        The URL/file-name pair is selected by the ``self._train`` flag:
        training archives when True, test archives otherwise.
        """
        FileUtils.mkdir_branch_with_access(dir_path=self._source_dir)

        # Choose the sample set once, then run the shared
        # download-and-save pipeline on it.
        urls = self._train_urls if self._train else self._test_urls
        file_names = (self._train_file_names
                      if self._train else self._test_file_names)
        self._download_by_type(urls=urls, file_names=file_names)
        self._save_downloads(urls=urls)
Example #3
0
 def _download_by_type(self, urls, file_names):
     """
     Downloads the data by urls and filenames, then extracts each archive.
     TODO: replace the file_names with URL basename
     :param urls: urls from which data is downloaded
     :param file_names: corresponding files (TODO: replace this with basename)
     """
     for url, file_name in zip(urls, file_names):
         # Skip the download when the file is already present on disk.
         if not self.__file_exist(
                 path=os.path.join(self._source_dir, file_name)):
             Downloader.download(url=url,
                                 save_path=self._source_dir,
                                 file_name=file_name)
             # NOTE(review): the download is saved under self._source_dir, but
             # the extraction path is built from self._destination_dir. Confirm
             # the two directories are the same (or that Downloader relocates
             # the file); otherwise extract_archive looks in the wrong place.
             _full_file_name = os.path.join(self._destination_dir,
                                            file_name)
             FileUtils.extract_archive(_full_file_name)
# MPI topology: size/rank come from the Open MPI environment.
world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
world_rank = int(os.environ['OMPI_COMM_WORLD_RANK'])

__data_folder = '/tmp/twister2deepnet/mnist/'

train_data_save_path = "/tmp/parquet/train/"
test_data_save_path = "/tmp/parquet/test/"

# Per-rank file names so parallel workers never clobber each other's output.
train_data_file = f"{world_rank}.data"
test_data_file = f"{world_rank}.data"
train_target_file = f"{world_rank}.target"
test_target_file = f"{world_rank}.target"

# Only the lead rank creates the shared output directories.
if world_rank == 0:
    for _save_dir in (train_data_save_path, test_data_save_path):
        FileUtils.mkdir_branch_with_access(_save_dir)


def save_to_disk(data_set=None, save_path=None, save_file=None):
    """
    Convert a data partition to a Pandas DataFrame and then to an Arrow
    table, intended for persistence under ``save_path + save_file``.

    NOTE(review): the visible body ends immediately after the Arrow table is
    built and ``table`` is never used — the code that actually writes it to
    disk appears to be truncated here; confirm against the full source.

    :param data_set: partition of data to persist; must not be None
    :param save_path: existing directory the file is written into
    :param save_file: file name appended to ``save_path``
    :raises Exception: when any argument is None or ``save_path`` is missing
    """
    # TODO use os.path.join and refactor
    if data_set is None or save_path is None or save_file is None:
        raise Exception("Input Cannot be None")
    elif not os.path.exists(save_path):
        raise Exception("Save Path doesn't exist")
    elif os.path.exists(save_path + save_file):
        # Target file already exists (e.g. from a previous run): do nothing.
        pass
    else:
        utilPanda = UtilPanda()
        dataframe = utilPanda.convert_partition_to_pandas(data_set)
        table = ArrowUtils.create_to_table(dataFrame=dataframe)
Example #5
0
    "instances": PARALLELISM
}])
world_size = PARALLELISM  # int(os.environ['OMPI_COMM_WORLD_SIZE'])
world_rank = env.worker_id

# Per-worker file names keep parallel workers from clobbering each other.
# The rank prefix is computed once and reused for every file name.
_rank_tag = str(world_rank)
TRAIN_DATA_FILE = _rank_tag + ".data"
TRAIN_TARGET_FILE = _rank_tag + ".target"
TEST_DATA_FILE = _rank_tag + ".data"
TEST_TARGET_FILE = _rank_tag + ".target"

TRAIN_DATA_FILES = [TRAIN_DATA_FILE, TRAIN_TARGET_FILE]
TEST_DATA_FILES = [TEST_DATA_FILE, TEST_TARGET_FILE]
DATA_SAVE_PATHS = [TRAIN_DATA_SAVE_PATH, TEST_DATA_SAVE_PATH]

# Only worker 0 creates the shared output directories.
if env.worker_id == 0:
    for _save_dir in (TRAIN_DATA_SAVE_PATH, TEST_DATA_SAVE_PATH):
        FileUtils.mkdir_branch_with_access(_save_dir)

# print("Hello from worker %d" % env.worker_id)


class DataSource(SourceFunc):
    def __init__(self, train=True):
        super().__init__()

        self.is_preprocess = True
        self.is_loaded = False
        self.mniste = None
        self.train_dataset = None
        self.train_targetset = None
        self.test_dataset = None