def extract_archive(source_path, destination_path=None, clean=False):
        """
        Extract an archive file into a destination directory.

        The archive format is selected with the ``FileUtils.is_tar``,
        ``is_targz``, ``is_gzip`` and ``is_zip`` checks, tried in that order.
        Per the original author's note, this follows a similar helper found
        in the PyTorch utilities.

        :param source_path: archive file source (str)
        :param destination_path: specific location to extract data (str);
            when None, the directory of ``source_path`` is used
        :param clean: remove the archive file after extracting (boolean)
        :raises ParameterError: when none of the format checks match
        """
        # Local import: only the gzip branch needs it, and the file-level
        # import block is outside this function.
        import shutil

        if destination_path is None:
            destination_path = os.path.dirname(source_path)

        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only use this on archives from trusted sources.
        if FileUtils.is_tar(source_path):
            with tarfile.open(source_path, 'r') as tar:
                tar.extractall(path=destination_path)
        elif FileUtils.is_targz(source_path):
            with tarfile.open(source_path, 'r:gz') as tar:
                tar.extractall(path=destination_path)
        elif FileUtils.is_gzip(source_path):
            # A plain gzip file holds a single payload: write it next to the
            # other extracted data, named after the archive minus its last
            # extension.
            output_path = os.path.join(
                destination_path,
                os.path.splitext(os.path.basename(source_path))[0])
            with open(output_path, "wb") as output_file, \
                    GzipFile(source_path) as gzip_file:
                # Stream in chunks instead of loading the whole decompressed
                # payload into memory.
                shutil.copyfileobj(gzip_file, output_file)
        elif FileUtils.is_zip(source_path):
            with ZipFile(source_path, 'r') as zip_archive:
                zip_archive.extractall(destination_path)
        else:
            # The original message had no placeholder, so the path was
            # silently dropped from the error.
            raise ParameterError(
                "Unsupported Extract Format: {}".format(source_path))

        if clean:
            os.remove(source_path)
Beispiel #2
0
    def convert_numpy_to_pandas(ndarray=None):
        """
        Wrap a numpy array in a pandas DataFrame.

        :param ndarray: array to convert (np.ndarray)
        :return: ``pd.DataFrame`` constructed from ``ndarray``
        :raises ParameterError: when ``ndarray`` is not an ``np.ndarray``
        """
        if not isinstance(ndarray, np.ndarray):
            # Report np.ndarray itself as the expected type;
            # type(np.ndarray) would only print <class 'type'>.
            raise ParameterError("Input is {}, but expected {}".format(
                type(ndarray), np.ndarray))

        return pd.DataFrame(ndarray)
Beispiel #3
0
    def __wrap_to_numpy(self, partition_set=None):
        """
        Collect the datapoints of a partition into a numpy array.

        :param partition_set: ``Partition`` instance to iterate over
        :return: ``np.ndarray`` built from the items yielded by iterating
            ``partition_set``
        :raises ParameterError: when ``partition_set`` is not a ``Partition``
        """
        if not isinstance(partition_set, Partition):
            # The original called type() with two arguments, which raises
            # TypeError before the intended ParameterError could be built.
            raise ParameterError("Invalid Input type {}, Expected {}".format(
                type(partition_set), Partition))

        return np.array([datapoint for datapoint in partition_set])
Beispiel #4
0
    def __init__(self, source_dir=None, parallelism=4, world_rank=0):
        """
        Record the data source directory and the parallel loading settings.

        :param source_dir: directory of the source data; must not be None
        :param parallelism: total process parallelism in data loading
        :param world_rank: current process id or MPI RANK
        :raises ParameterError: when ``source_dir`` is None
        """
        if source_dir is not None:
            # Stored as-is; world_rank is exposed under the name ``rank``.
            self.source_dir, self.parallelism, self.rank = (
                source_dir, parallelism, world_rank)
        else:
            raise ParameterError("Source directory must be specified")
Beispiel #5
0
    def __init__(self, mean=1, std=1, data=None):
        """
        Store the normalization parameters together with the data.

        As stated by the original author, normalization is meant for
        2 dimensional samples only and this class should not be used for
        3 channel images. Only the type of ``data`` is validated here.

        :param mean: mean
        :param std: standard deviation
        :param data: numpy array, documented as having 3 axes (samples, W, H)
        :raises ParameterError: when ``data`` is not an ``np.ndarray``
        """
        data_is_numpy = isinstance(data, np.ndarray)
        if not data_is_numpy:
            raise ParameterError("Data must be in numpy format")

        self.mean, self.std, self.data = mean, std, data
Beispiel #6
0
    def _save_downloads(self, urls=None):
        """
        Save the downloaded files to numpy format.

        The local file for each url is looked up in ``self._destination_dir``
        under the url's base name cut at its first ".". When both files
        exist they are handed to ``self._save_as_numpy`` with ".npy" save
        paths.

        :param urls: urls from which data is being downloaded; the first
            entry is the image file url, the second the label file url
        :raises ParameterError: when fewer than two urls are given or when
            an expected local file cannot be located
        """
        # Fail with a clear message instead of a bare TypeError/IndexError
        # on the indexing below.
        if urls is None or len(urls) < 2:
            raise ParameterError(
                "Expected image and label urls, but received {}".format(urls))

        image_file_path, label_file_path = (
            os.path.join(self._destination_dir,
                         os.path.basename(url).split(".")[0])
            for url in urls[:2])

        # Name every missing file, not just the image file, so the error
        # points at what is actually absent.
        missing_paths = [path
                         for path in (image_file_path, label_file_path)
                         if not self.__file_exist(path=path)]
        if missing_paths:
            raise ParameterError(
                "File cannot be located {}".format(", ".join(missing_paths)))

        print("Files Exist {}, {}".format(image_file_path,
                                          label_file_path))
        self._save_as_numpy(image_path=image_file_path,
                            label_path=label_file_path,
                            image_save_path=image_file_path + ".npy",
                            label_save_path=label_file_path + ".npy")
Beispiel #7
0
    def __init__(self,
                 dataset="mnist",
                 source_dir=None,
                 destination_dir=None,
                 transform=None):
        """
        Store the dataset configuration and set up the file paths.

        :param dataset: dataset name, "mnist" by default
        :param source_dir: source directory; must not be None
        :param destination_dir: destination directory; falls back to the
            source directory when None
        :param transform: transform object, stored as given
        :raises ParameterError: when ``source_dir`` is None
        """
        if source_dir is None:
            raise ParameterError("Source directory must be specified")

        self.source_dir = source_dir
        self.destination_dir = (self.source_dir if destination_dir is None
                                else destination_dir)
        self.log = False
        self.dataset, self.transform = dataset, transform

        # Start unset; presumably filled in later (not visible in this block).
        self.train_save_files = self.test_save_files = None

        self.__set_file_paths()