import os

# LocalDatasets, RemoteDatasets and Downloader are assumed to live in
# sibling modules of this package.
from .downloader import Downloader
from .local import LocalDatasets
from .remote import RemoteDatasets


class Datasets:
    """
    This is a wrapper for three different classes that together handle the
    datasets (locally and remotely).

    Datasets takes one argument: the path to the local directory of the
    dataset files (e.g. data/ or /tmp/serenata-data). The argument is
    optional and the default value is data/ (following the default usage in
    the main repo, serenata-de-amor).

    The remote part of the class expects to find Amazon credentials in a
    config.ini file with an Amazon section (e.g. config.ini.example).

    Inside this object there are three main objects: local, remote, and
    downloader:

    * `Datasets.local` handles listing all local datasets through the
      property `Datasets.local.all` (hint: it's a generator), and deleting
      local datasets with the method `Datasets.local.delete(filename)`;

    * `Datasets.remote` has the `Datasets.remote.all` property (hint: it's
      also a generator) and the `Datasets.remote.delete(filename)` method,
      just like its local equivalent; in addition, it offers the
      `Datasets.remote.upload(file_path)` method to upload a local file to
      the remote bucket; `Datasets.remote` does not handle downloads because
      `boto3` does not support `asyncio`, and we prefer async tasks to allow
      more than one file to be downloaded in parallel;

    * `Datasets.downloader` implements an async manager to download files
      from the remote bucket. Its `Datasets.downloader.download(files)`
      method takes the path of a single file (str) as an argument, or an
      iterable of paths (str).

    In addition, this wrapper implements the `Datasets.upload_all()` method
    to upload all local datasets that are not present in the remote bucket.

    :param local_directory: (str) path to local directory of the datasets
    """

    def __init__(self, local_directory=None):
        if not local_directory:
            local_directory = 'data'

        self.local = LocalDatasets(local_directory)
        self.remote = RemoteDatasets()
        self.downloader = Downloader(
            local_directory,
            bucket=self.remote.bucket,
            **self.remote.credentials
        )

    @property
    def pending(self):
        """Files that are in the local datasets but not in S3."""
        local = set(self.local.all)
        remote = set(self.remote.all)
        yield from (local - remote)

    def upload_all(self):
        for file_name in self.pending:
            full_path = os.path.join(self.local.directory, file_name)
            self.remote.upload(full_path)
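
# A minimal usage sketch (illustrative, not from the repository): it assumes
# the default data/ directory and a valid config.ini with an Amazon section,
# and relies only on the API described in the docstring above. The dataset
# file name below is hypothetical.
datasets = Datasets('data')

# Datasets already saved to disk (Datasets.local.all is a generator).
for file_name in datasets.local.all:
    print(file_name)

# Upload every local dataset that is missing from the remote bucket.
datasets.upload_all()

# Download a single file (or pass an iterable of paths) asynchronously.
datasets.downloader.download('2017-05-21-reimbursements.xz')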
# This test method belongs to a unittest.TestCase subclass; the mock
# arguments (bucket, s3, config_exists and print_) are injected by @patch
# decorators that are omitted from this excerpt.
def test_upload(self, bucket, s3, config_exists, print_):
    bucket.return_value = 'serenata-de-amor-data'
    remote = RemoteDatasets()
    remote.upload('/root/serenata/data/42.csv')
    s3.return_value.upload_file.assert_called_once_with(
        '/root/serenata/data/42.csv',
        'serenata-de-amor-data',
        '42.csv'
    )
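
# A sketch of how the test above might be wired up. The patch targets, the
# TestCase name, and the use of PropertyMock are assumptions chosen so that
# the mock arguments line up with the signature (decorators apply bottom-up,
# so the innermost @patch maps to the first argument after self).
from unittest import TestCase
from unittest.mock import PropertyMock, patch

from serenata_toolbox.datasets.remote import RemoteDatasets  # assumed path


class TestUpload(TestCase):

    @patch('serenata_toolbox.datasets.remote.print')
    @patch.object(RemoteDatasets, 'config_exists')
    @patch('serenata_toolbox.datasets.remote.boto3.client')
    @patch.object(RemoteDatasets, 'bucket', new_callable=PropertyMock)
    def test_upload(self, bucket, s3, config_exists, print_):
        bucket.return_value = 'serenata-de-amor-data'
        remote = RemoteDatasets()
        remote.upload('/root/serenata/data/42.csv')
        s3.return_value.upload_file.assert_called_once_with(
            '/root/serenata/data/42.csv',
            'serenata-de-amor-data',
            '42.csv'
        )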