def _save_as_numpy(self, image_path=None, label_path=None,
                   image_save_path=None, label_save_path=None):
    """
    Save the files in *.npy format
    :param image_path: MNIST image data path (extracted file path)
    :param label_path: MNIST image label path (extracted file path)
    :param image_save_path: MNIST image save path as npy
    :param label_save_path: MNIST label save path as npy
    """
    images, labels = loadlocal_mnist(images_path=image_path,
                                     labels_path=label_path)
    self._image_data = images
    self._label_data = labels
    self._image_data_path = image_save_path
    self._label_data_path = label_save_path
    # save only when the target .npy file is not already on disk
    if not FileUtils.check_exist_with_message(
            file_path=image_save_path, message="Images Already Saved!"):
        np.save(image_save_path, images)
    if not FileUtils.check_exist_with_message(
            file_path=label_save_path, message="Labels Already Saved!"):
        np.save(label_save_path, labels)
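# A minimal standalone sketch of the same IDX -> .npy round trip, assuming
# the extracted MNIST files sit under /tmp/mnist (hypothetical paths). It
# uses only mlxtend's loadlocal_mnist and numpy, which the method above
# already depends on.
import os

import numpy as np
from mlxtend.data import loadlocal_mnist

images, labels = loadlocal_mnist(
    images_path='/tmp/mnist/train-images-idx3-ubyte',
    labels_path='/tmp/mnist/train-labels-idx1-ubyte')
if not os.path.exists('/tmp/mnist/train-images.npy'):
    np.save('/tmp/mnist/train-images.npy', images)  # shape (60000, 784)
if not os.path.exists('/tmp/mnist/train-labels.npy'):
    np.save('/tmp/mnist/train-labels.npy', labels)  # shape (60000,)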
def download(self):
    """
    Downloads the data and saves it to disk
    """
    FileUtils.mkdir_branch_with_access(dir_path=self._source_dir)
    if self._train:
        # download train samples
        self._download_by_type(urls=self._train_urls,
                               file_names=self._train_file_names)
        self._save_downloads(urls=self._train_urls)
    else:
        # download test samples
        self._download_by_type(urls=self._test_urls,
                               file_names=self._test_file_names)
        self._save_downloads(urls=self._test_urls)
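# A sketch of the URL/file-name pairs that download() dispatches on. The
# attribute names mirror those used above, but the concrete values are
# assumptions of this sketch (the canonical MNIST archive names).
_train_urls = [
    'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
    'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',
]
_train_file_names = ['train-images-idx3-ubyte.gz',
                     'train-labels-idx1-ubyte.gz']
_test_urls = [
    'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
    'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz',
]
_test_file_names = ['t10k-images-idx3-ubyte.gz',
                    't10k-labels-idx1-ubyte.gz']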
def _download_by_type(self, urls, file_names):
    """
    Downloads the data by URLs and file names
    TODO: replace file_names with the URL basename
    :param urls: URLs from which data is downloaded
    :param file_names: corresponding file names (TODO: replace with basename)
    """
    for url, file_name in zip(urls, file_names):
        if not self.__file_exist(
                path=os.path.join(self._source_dir, file_name)):
            Downloader.download(url=url, save_path=self._source_dir,
                                file_name=file_name)
        _full_file_name = os.path.join(self._destination_dir, file_name)
        FileUtils.extract_archive(_full_file_name)
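# A minimal sketch of the TODO above: deriving the file name from the URL
# basename with the standard library, which would make the separate
# file_names argument redundant. Pure stdlib; basename_of is a
# hypothetical helper, not part of this codebase.
import os
from urllib.parse import urlparse

def basename_of(url):
    # 'http://host/path/train-images-idx3-ubyte.gz' -> 'train-images-idx3-ubyte.gz'
    return os.path.basename(urlparse(url).path)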
world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
world_rank = int(os.environ['OMPI_COMM_WORLD_RANK'])

__data_folder = '/tmp/twister2deepnet/mnist/'

train_data_save_path = "/tmp/parquet/train/"
test_data_save_path = "/tmp/parquet/test/"

train_data_file = str(world_rank) + ".data"
test_data_file = str(world_rank) + ".data"
train_target_file = str(world_rank) + ".target"
test_target_file = str(world_rank) + ".target"

# only rank 0 creates the shared output directories
if world_rank == 0:
    FileUtils.mkdir_branch_with_access(train_data_save_path)
    FileUtils.mkdir_branch_with_access(test_data_save_path)


def save_to_disk(data_set=None, save_path=None, save_file=None):
    # TODO: use os.path.join and refactor
    if data_set is None or save_path is None or save_file is None:
        raise Exception("Input Cannot be None")
    elif not os.path.exists(save_path):
        raise Exception("Save Path doesn't exist")
    elif os.path.exists(save_path + save_file):
        # already saved by an earlier run; skip
        pass
    else:
        util_panda = UtilPanda()
        dataframe = util_panda.convert_partition_to_pandas(data_set)
        table = ArrowUtils.create_to_table(dataFrame=dataframe)
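# A standalone sketch of the DataFrame -> Arrow -> Parquet step that
# save_to_disk builds toward, using plain pandas/pyarrow instead of the
# UtilPanda/ArrowUtils wrappers; the file name and data are illustrative.
import os

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

os.makedirs('/tmp/parquet/train/', exist_ok=True)
df = pd.DataFrame({'pixel': [0.0, 0.5, 1.0]})
table = pa.Table.from_pandas(df)
pq.write_table(table, '/tmp/parquet/train/example.parquet')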
"instances": PARALLELISM }]) world_size = PARALLELISM # int(os.environ['OMPI_COMM_WORLD_SIZE']) world_rank = env.worker_id TRAIN_DATA_FILE = str(world_rank) + ".data" TRAIN_TARGET_FILE = str(world_rank) + ".target" TEST_DATA_FILE = str(world_rank) + ".data" TEST_TARGET_FILE = str(world_rank) + ".target" TRAIN_DATA_FILES = [TRAIN_DATA_FILE, TRAIN_TARGET_FILE] TEST_DATA_FILES = [TEST_DATA_FILE, TEST_TARGET_FILE] DATA_SAVE_PATHS = [TRAIN_DATA_SAVE_PATH, TEST_DATA_SAVE_PATH] if env.worker_id == 0: FileUtils.mkdir_branch_with_access(TRAIN_DATA_SAVE_PATH) FileUtils.mkdir_branch_with_access(TEST_DATA_SAVE_PATH) # print("Hello from worker %d" % env.worker_id) class DataSource(SourceFunc): def __init__(self, train=True): super().__init__() self.is_preprocess = True self.is_loaded = False self.mniste = None self.train_dataset = None self.train_targetset = None self.test_dataset = None