# Standard-library and third-party imports used by the snippets below. The
# floyd-internal names (DataConfig, DataConfigManager, AuthConfigManager,
# DataClient, DatasetClient, TusDataClient, DataRequest, Data, FloydException,
# floyd_logger, create_tarfile, get_namespace_from_name, generate_uuid,
# normalize_data_name, current_username, dots, ResourceWaitIter,
# WaitTimeoutException, clint_STREAM, MAX_UPLOAD_SIZE, sizeof_fmt) are assumed
# to be imported from the surrounding floyd-cli codebase.
import os
import sys
import tempfile
import webbrowser
from shutil import rmtree

import click
from tabulate import tabulate

import floyd


def abort_previous_upload(data_config):
    """
    Discard a half-finished upload: delete the temporary tarball and clear
    the saved upload state.
    """
    if data_config.tarball_path and os.path.exists(data_config.tarball_path):
        rmtree(os.path.dirname(data_config.tarball_path))

    data_config.set_tarball_path("")
    data_config.set_data_endpoint("")
    DataConfigManager.set_config(data_config)
def upload(): """ Upload data in the current dir to Floyd. """ data_config = DataConfigManager.get_config() access_token = AuthConfigManager.get_access_token() version = data_config.version # Create data object data_name = "{}/{}:{}".format(access_token.username, data_config.name, version) data = DataRequest(name=data_name, description=version, version=version) data_id = DataClient().create(data) floyd_logger.debug("Created data with id : {}".format(data_id)) floyd_logger.info("Upload finished") # Update expt config including predecessor data_config.increment_version() data_config.set_data_predecessor(data_id) DataConfigManager.set_config(data_config) # Print output table_output = [["DATA ID", "NAME", "VERSION"], [data_id, data_name, version]] floyd_logger.info(tabulate(table_output, headers="firstrow"))
def init(dataset_name): """ Initialize a new dataset at the current dir. After init ensure that your data files are in this directory. Then you can upload them to Floyd. Example: floyd data upload """ dataset_obj = DatasetClient().get_by_name(dataset_name) if not dataset_obj: create_dataset_base_url = "{}/datasets/create".format( floyd.floyd_web_host) create_dataset_url = "{}?name={}".format(create_dataset_base_url, dataset_name) floyd_logger.error( ("Dataset name does not match your list of datasets. " "Create your new dataset in the web dashboard:\n\t%s"), create_dataset_base_url) webbrowser.open(create_dataset_url) return data_config = DataConfig(name=dataset_name, family_id=dataset_obj.id) DataConfigManager.set_config(data_config) floyd_logger.info( "Data source \"{}\" initialized in current directory".format( dataset_name)) floyd_logger.info(""" You can now upload your data to Floyd by: floyd data upload """)
def init(dataset_name): """ Initialize a new dataset at the current dir. Then run the upload command to copy all the files in this directory to FloydHub. floyd data upload """ dataset_obj = DatasetClient().get_by_name(dataset_name) if not dataset_obj: namespace, name = get_namespace_from_name(dataset_name) create_dataset_base_url = "{}/datasets/create".format( floyd.floyd_web_host) create_dataset_url = "{}?name={}&namespace={}".format( create_dataset_base_url, name, namespace) floyd_logger.info( ("Dataset name does not match your list of datasets. " "Create your new dataset in the web dashboard:\n\t%s"), create_dataset_base_url) webbrowser.open(create_dataset_url) name = click.prompt( 'Press ENTER to use dataset name "%s" or enter a different name' % dataset_name, default=dataset_name, show_default=False) dataset_name = name.strip() or dataset_name dataset_obj = DatasetClient().get_by_name(dataset_name) if not dataset_obj: raise FloydException( 'Dataset "%s" does not exist on floydhub.com. Ensure it exists before continuing.' % dataset_name) namespace, name = get_namespace_from_name(dataset_name) data_config = DataConfig(name=name, namespace=namespace, family_id=dataset_obj.id) DataConfigManager.set_config(data_config) floyd_logger.info( "Data source \"{}\" initialized in current directory".format( dataset_name)) floyd_logger.info(""" You can now upload your data to Floyd by: floyd data upload """)
def init(name): """ Initialize a new data upload. After init ensure that your data files are in this directory. Then you can upload them to Floyd. Example: floyd data upload """ data_config = DataConfig(name=name, family_id=generate_uuid()) DataConfigManager.set_config(data_config) floyd_logger.info("Data source \"{}\" initialized in current directory".format(name)) floyd_logger.info(""" You can now upload your data to Floyd by: floyd data upload """)
def add_data(self, source):
    """
    Register an external data source with the dataset the current
    directory was initialized for.
    """
    data_config = DataConfigManager.get_config()
    dataset_id = data_config.family_id
    if not dataset_id:
        sys.exit('Please initialize current directory with '
                 '\'floyd data init DATASET_NAME\' first.')

    response = self.request('POST',
                            '%s/%s' % (self.url, dataset_id),
                            json={'source': source})
    return response.json()
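# Hedged usage sketch for `add_data` above: the method is assumed to live on
# one of the floyd-cli API client classes (it relies on `self.url` and
# `self.request`), and the source string here is hypothetical.
#
#   result = DatasetClient().add_data(source="s3://my-bucket/training-data")
#   print(result)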
def upload(resume, tar_file, message):
    """
    Upload data in the current dir to Floyd.
    """
    data_config = DataConfigManager.get_config()

    if not upload_is_resumable(data_config) or not opt_to_resume(resume):
        abort_previous_upload(data_config)
        access_token = AuthConfigManager.get_access_token()
        initialize_new_upload(data_config, access_token, tar_file, message)

    complete_upload(data_config)
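# Minimal sketches of the two resume helpers referenced above, assuming an
# upload is resumable when a previous run left both a tarball and an upload
# endpoint behind, and that the --resume flag (or an interactive prompt)
# decides whether to pick it up. Illustrative only; the real helpers may
# check additional state such as resource_id.
def _upload_is_resumable_sketch(data_config):
    return bool(data_config.tarball_path and
                os.path.exists(data_config.tarball_path) and
                data_config.data_endpoint)


def _opt_to_resume_sketch(resume_flag):
    if resume_flag:
        return True
    return click.confirm("An unfinished upload exists. Resume it?", default=True)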
def initialize_new_upload(data_config, access_token, description=None, source_dir='.'):
    """
    Compress the data, register it with the API, and open a tus upload
    session for it, persisting state along the way so the upload can be
    resumed later.
    """
    # TODO: hit upload server to check for liveness before moving on
    data_config.set_tarball_path(None)
    data_config.set_data_endpoint(None)
    data_config.set_resource_id(None)

    namespace = data_config.namespace or access_token.username
    data_name = "{}/{}".format(namespace, data_config.name)

    # Create tarball of the data using the ID returned from the API
    # TODO: allow users to choose a different directory for the compression
    temp_dir = tempfile.mkdtemp()
    tarball_path = os.path.join(temp_dir, "floydhub_data.tar.gz")
    floyd_logger.debug("Creating tarfile with contents of current directory: %s",
                       tarball_path)
    floyd_logger.info("Compressing data...")

    # TODO: purge tarball on Ctrl-C
    create_tarfile(source_dir=source_dir, filename=tarball_path)

    # If starting a new upload fails for some reason down the line, we don't
    # want to re-tar, so save off the tarball path now
    data_config.set_tarball_path(tarball_path)
    DataConfigManager.set_config(data_config)

    # Create data object using API
    data = DataRequest(name=data_name,
                       description=description,
                       family_id=data_config.family_id,
                       data_type='gzip')
    data_info = DataClient().create(data)
    if not data_info:
        rmtree(temp_dir)
        sys.exit(1)

    data_config.set_data_id(data_info['id'])
    data_config.set_data_name(data_info['name'])
    DataConfigManager.set_config(data_config)

    # Fetch auth token for the upload server
    creds = DataClient().new_tus_credentials(data_info['id'])
    if not creds:
        # TODO: delete module from server?
        rmtree(temp_dir)
        sys.exit(1)

    data_resource_id = creds[0]
    data_endpoint = TusDataClient().initialize_upload(
        tarball_path,
        metadata={"filename": data_resource_id},
        auth=creds)
    if not data_endpoint:
        # TODO: delete module from server?
        floyd_logger.error("Failed to get upload URL from Floydhub!")
        rmtree(temp_dir)
        sys.exit(1)

    data_config.set_data_endpoint(data_endpoint)
    DataConfigManager.set_config(data_config)
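# A minimal sketch of the `create_tarfile` helper used above, assuming it
# writes a gzip-compressed tar of source_dir to filename. The real helper may
# exclude files (e.g. via .floydignore); this version does not.
import tarfile

def _create_tarfile_sketch(source_dir, filename):
    with tarfile.open(filename, "w:gz") as tar:
        tar.add(source_dir, arcname=os.path.basename(source_dir))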
def get_all(self):
    """
    Return all data objects in the current dataset family, or an empty
    list on API failure.
    """
    try:
        data_config = DataConfigManager.get_config()
        response = self.request("GET",
                                self.url,
                                params={"module_type": "data",
                                        "family_id": data_config.family_id})
        data_dict = response.json()
        return [Data.from_dict(data) for data in data_dict]
    except FloydException as e:
        floyd_logger.error("Error while retrieving data: %s", e.message)
        return []
def complete_upload(data_config):
    """
    Finish a (possibly resumed) upload: push the tarball to the upload
    server if that has not happened yet, then wait for the server to
    unpack it.
    """
    data_endpoint = data_config.data_endpoint
    data_id = data_config.data_id
    tarball_path = data_config.tarball_path

    if not data_id:
        floyd_logger.error("Corrupted upload state, please start a new one.")
        sys.exit(1)

    # Check for tarball upload, upload to server if not done
    if not data_config.resource_id and (tarball_path and data_endpoint):
        floyd_logger.debug("Getting fresh upload credentials")
        creds = DataClient().new_tus_credentials(data_id)
        if not creds:
            sys.exit(1)

        file_size = os.path.getsize(tarball_path)
        # Enforce the upload size limit
        if file_size > MAX_UPLOAD_SIZE:
            try:
                floyd_logger.info("Removing compressed data...")
                rmtree(os.path.dirname(tarball_path))
            except (OSError, TypeError):
                pass
            sys.exit("Data size too large to upload, please keep it under %s.\n" %
                     sizeof_fmt(MAX_UPLOAD_SIZE))

        floyd_logger.info("Uploading compressed data. Total upload size: %s",
                          sizeof_fmt(file_size))
        tus_client = TusDataClient()
        if not tus_client.resume_upload(tarball_path, data_endpoint, auth=creds):
            floyd_logger.error("Failed to finish upload!")
            return

        try:
            floyd_logger.info("Removing compressed data...")
            rmtree(os.path.dirname(tarball_path))
        except (OSError, TypeError):
            pass

        floyd_logger.debug("Created data with id : %s", data_id)
        floyd_logger.info("Upload finished.")

        # Update data config
        data_config.set_tarball_path(None)
        data_config.set_data_endpoint(None)
        data_source = DataClient().get(data_id)
        data_config.set_resource_id(data_source.resource_id)
        DataConfigManager.set_config(data_config)

    # Data tarball uploaded, check for server-side untar
    if data_config.resource_id:
        floyd_logger.info(
            "Waiting for server to unpack data.\n"
            "You can exit at any time and come back to check the status with:\n"
            "\tfloyd data upload -r")
        try:
            for _ in dots(ResourceWaitIter(data_config.resource_id),
                          label='Waiting for unpack...'):
                pass
        except WaitTimeoutException:
            clint_STREAM.write('\n')
            clint_STREAM.flush()
            floyd_logger.info(
                "Looks like it is going to take longer for Floydhub to unpack "
                "your data. Please check back later.")
            sys.exit(1)
        else:
            data_config.set_resource_id(None)
            data_config.set_tarball_path(None)
            data_config.set_data_endpoint(None)
            data_config.set_data_id(None)
            DataConfigManager.set_config(data_config)

    # Print output
    table_output = [["NAME"],
                    [normalize_data_name(data_config.data_name)]]
    floyd_logger.info('')
    floyd_logger.info(tabulate(table_output, headers="firstrow"))
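# A minimal sketch of the `sizeof_fmt` helper used above, assuming the usual
# human-readable base-1024 formatting. The real helper may differ in
# precision or suffixes.
def _sizeof_fmt_sketch(num, suffix="B"):
    for unit in ["", "K", "M", "G", "T"]:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, "P", suffix)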
def current_dataset_namespace():
    return DataConfigManager.get_config().namespace or current_username()
def current_dataset_name():
    return DataConfigManager.get_config().name
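# Hedged usage sketch: combining the two helpers above to build a fully
# qualified dataset name, assuming FloydHub renders names as "namespace/name".
def _qualified_dataset_name_sketch():
    return "{}/{}".format(current_dataset_namespace(), current_dataset_name())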