def download_file(self, context, target_file):
    """Download ``target_file`` from ``self.bucket`` under ``self.key`` into
    ``self.target_folder``, honoring ``self.skip_if_present``.

    Returns the local path of the (possibly pre-existing) file.
    """
    check.str_param(target_file, 'target_file')
    destination = os.path.join(self.target_folder, target_file)

    # Short-circuit when the caller opted into skipping existing files.
    if self.skip_if_present and safe_isfile(destination):
        context.log.info(
            'Skipping download, file already present at {target_path}'.format(
                target_path=destination))
        return destination

    object_key = self.key + '/' + target_file
    parent_dir = os.path.dirname(destination)
    if parent_dir:
        mkdir_p(parent_dir)
    context.log.info(
        'Starting download of {bucket}/{key} to {target_path}'.format(
            bucket=self.bucket, key=object_key, target_path=destination))
    # HEAD first so the progress logger knows the total byte count.
    head = context.resources.s3.head_object(Bucket=self.bucket, Key=object_key)
    progress = S3Logger(
        context.log.debug, self.bucket, object_key, destination,
        int(head['ContentLength']))
    context.resources.s3.download_file(
        Bucket=self.bucket, Key=object_key, Filename=destination, Callback=progress)
    return destination
def _download_from_s3_to_file(session, context, bucket, key, target_folder, skip_if_present):
    """Download ``bucket``/``key`` into ``target_folder`` using the given
    boto3 ``session`` client, returning the local file path.

    When ``skip_if_present`` is true and the file already exists locally,
    the download is skipped.
    """
    # TODO: remove context argument once we support resource logging
    # file name is S3 key path suffix after last /
    destination = os.path.join(target_folder, key.split('/')[-1])

    if skip_if_present and safe_isfile(destination):
        context.log.info(
            'Skipping download, file already present at {target_file}'.format(
                target_file=destination
            )
        )
        return destination

    if not os.path.exists(target_folder):
        mkdir_p(target_folder)
    context.log.info(
        'Starting download of {bucket}/{key} to {target_file}'.format(
            bucket=bucket, key=key, target_file=destination
        )
    )
    # HEAD first so the progress logger knows the total byte count.
    head = session.head_object(Bucket=bucket, Key=key)
    progress = S3Logger(
        context.log.debug, bucket, key, destination, int(head['ContentLength'])
    )
    session.download_file(Bucket=bucket, Key=key, Filename=destination, Callback=progress)
    return destination
def execute_create_notebook(notebook, force_overwrite, **kwargs):
    """Scaffold a new dagstermill notebook at the path named by ``notebook``
    (relative to the current working directory), confirming before
    overwriting unless ``force_overwrite`` is set.

    Raises:
        click.BadOptionUsage: if the notebook name contains characters other
            than alphanumerics, '-', '_', '\\' and '/'.
    """
    # BUG FIX: validate with the optional '.ipynb' suffix stripped. The old
    # code matched the raw argument, so any name ending in '.ipynb' was
    # always rejected (the regex forbids '.') and the suffix-handling branch
    # below was unreachable.
    notebook_name = notebook[:-len('.ipynb')] if notebook.endswith('.ipynb') else notebook
    if not re.match(r'^[a-zA-Z0-9\-_\\/]+$', notebook_name):
        raise click.BadOptionUsage(
            notebook,
            ('Notebook name {name} is not valid, '
             'cannot contain anything except alphanumeric characters, '
             '-, _, \\ and / for path manipulation').format(name=notebook),
        )

    notebook_path = os.path.join(os.getcwd(), notebook_name + '.ipynb')

    notebook_dir = os.path.dirname(notebook_path)
    if not os.path.exists(notebook_dir):
        os.makedirs(notebook_dir)

    if not force_overwrite and safe_isfile(notebook_path):
        click.confirm(
            ('Warning, {notebook_path} already exists and continuing '
             'will overwrite the existing notebook. '
             'Are you sure you want to continue?').format(
                notebook_path=notebook_path),
            abort=True,
        )

    register_repo_info = get_register_repo_info(kwargs)

    with open(notebook_path, 'w') as f:
        f.write(get_notebook_scaffolding(register_repo_info))

    click.echo("Created new dagstermill notebook at {path}".format(
        path=notebook_path))
def download_from_s3(context):
    '''Download one or more objects from S3.

    Each entry of ``context.solid_config`` describes one download with keys
    ``bucket``, ``key``, ``skip_if_present``, and an optional ``target_path``
    (defaults to the S3 key). Requires a boto3 S3 client exposed as the
    ``s3`` resource.

    Returns:
        list[str]: The local paths of the downloaded objects.
    '''
    results = []
    for file_ in context.solid_config:
        bucket = file_['bucket']
        key = file_['key']
        # Default the local path to the S3 key. BUG FIX: a previous
        # `if target_path is None:` fallback to context.resources.tempfile
        # was unreachable dead code, because `or key` already supplied a
        # truthy default; the dead branch has been removed.
        target_path = file_.get('target_path') or key

        if file_['skip_if_present'] and safe_isfile(target_path):
            context.log.info(
                'Skipping download, file already present at {target_path}'.
                format(target_path=target_path))
        else:
            if os.path.dirname(target_path):
                mkdir_p(os.path.dirname(target_path))
            context.resources.s3.download_file(bucket, key, target_path)
        results.append(target_path)
    return results
def download_from_s3(context):
    '''Download the single S3 object described by the solid config
    (``bucket``, ``key``, ``target_folder``, ``skip_if_present``) and
    return the local file path.
    '''
    config = context.solid_config
    bucket = config.get('bucket')
    key = config.get('key')
    target_folder = config.get('target_folder')
    skip_if_present = config.get('skip_if_present')

    # file name is S3 key path suffix after last /
    target_file = os.path.join(target_folder, key.split('/')[-1])

    if skip_if_present and safe_isfile(target_file):
        context.log.info(
            'Skipping download, file already present at {target_file}'.format(
                target_file=target_file))
        return target_file

    if not os.path.exists(target_folder):
        mkdir_p(target_folder)
    context.log.info(
        'Starting download of {bucket}/{key} to {target_file}'.format(
            bucket=bucket, key=key, target_file=target_file))
    client = boto3.client('s3')
    # HEAD first so the progress logger knows the total byte count.
    head = client.head_object(Bucket=bucket, Key=key)
    progress = S3Logger(
        context.log.debug, bucket, key, target_file, int(head['ContentLength']))
    client.download_file(
        Bucket=bucket, Key=key, Filename=target_file, Callback=progress)
    return target_file
def file_exists_at_path_type_check(value):
    """Raise ``Failure`` unless ``value`` is a string naming a file that
    exists on disk."""
    if not isinstance(value, six.string_types):
        raise Failure(
            'FileExistsAtPath must be a string in memory. Got {value}'.format(
                value=repr(value)))
    file_present = safe_isfile(value)
    if not file_present:
        raise Failure(
            ('FileExistsAtPath must be a path that points to a file that '
             'exists. "{value}" does not exist on disk').format(value=value))
def unzip_file(
    context,
    archive_paths,
    archive_members,
    # destination_dir=None
):
    """Unzip each archive in ``archive_paths`` into the directory containing
    the archive. When ``archive_members`` is provided, only the member at the
    matching index is extracted from each archive; otherwise the whole
    archive is extracted. ``context.solid_config['skip_if_present']``
    suppresses extraction over existing files/directories.

    Returns:
        list: one extraction target path per archive.
    """
    # FIXME
    # archive_path = info.config['archive_path']
    # archive_member = info.config['archive_member']
    results = []
    for (i, archive_path) in enumerate(archive_paths):
        destination_dir = (
            # info.config['destination_dir'] or
            os.path.dirname(archive_path))
        if archive_members:
            archive_member = archive_members[i]
        else:
            archive_member = None

        with zipfile.ZipFile(archive_path, 'r') as zip_ref:
            if archive_member is not None:
                target_path = os.path.join(destination_dir, archive_member)
                is_file = safe_isfile(target_path)
                is_dir = os.path.isdir(target_path)
                if not (context.solid_config['skip_if_present'] and (is_file or is_dir)):
                    zip_ref.extract(archive_member, destination_dir)
                else:
                    if is_file:
                        context.log.info(
                            'Skipping unarchive of {archive_member} from {archive_path}, '
                            'file already present at {target_path}'.format(
                                archive_member=archive_member,
                                archive_path=archive_path,
                                target_path=target_path,
                            ))
                    if is_dir:
                        context.log.info(
                            'Skipping unarchive of {archive_member} from {archive_path}, '
                            'directory already present at {target_path}'.format(
                                archive_member=archive_member,
                                archive_path=archive_path,
                                target_path=target_path,
                            ))
            else:
                # BUG FIX: previously neither target_path nor is_dir was
                # assigned on this branch, so the first archive without an
                # explicit member raised NameError at results.append (and at
                # the skip check when skip_if_present was set). The whole
                # archive extracts into destination_dir, so that is the
                # target we record and check.
                # NOTE(review): destination_dir typically already exists
                # (it holds the archive), so skip_if_present=True will skip
                # whole-archive extraction — confirm intended skip target.
                target_path = destination_dir
                is_dir = os.path.isdir(target_path)
                if not (context.solid_config['skip_if_present'] and is_dir):
                    zip_ref.extractall(destination_dir)
                else:
                    context.log.info(
                        'Skipping unarchive of {archive_path}, directory already present '
                        'at {target_path}'.format(
                            archive_path=archive_path, target_path=target_path))
        results.append(target_path)
    return results
def execute_create_notebook(notebook, solid_name, force_overwrite, **kwargs):
    """Scaffold a new dagstermill notebook registered against a
    module-defined repository.

    ``solid_name`` defaults to the notebook's base name. Confirms before
    overwriting an existing notebook unless ``force_overwrite`` is set.

    Raises:
        click.BadOptionUsage: if the notebook name contains characters other
            than alphanumerics, '-', '_', '\\' and '/'.
        click.UsageError: if the repository target is a file-based function
            rather than a module-based one.
    """
    # BUG FIX: validate with the optional '.ipynb' suffix stripped. The old
    # code matched the raw argument, so any name ending in '.ipynb' was
    # always rejected (the regex forbids '.') and the suffix-handling branch
    # below was unreachable.
    notebook_name = notebook[:-len('.ipynb')] if notebook.endswith('.ipynb') else notebook
    if not re.match(r'^[a-zA-Z0-9\-_\\/]+$', notebook_name):
        raise click.BadOptionUsage(
            notebook,
            ('Notebook name {name} is not valid, '
             'cannot contain anything except alphanumeric characters, '
             '-, _, \\ and / for path manipulation').format(name=notebook),
        )

    notebook_path = os.path.join(os.getcwd(), notebook_name + '.ipynb')

    notebook_dir = os.path.dirname(notebook_path)
    if not os.path.exists(notebook_dir):
        os.makedirs(notebook_dir)

    if not force_overwrite and safe_isfile(notebook_path):
        click.confirm(
            ('Warning, {notebook_path} already exists and continuing '
             'will overwrite the existing notebook. '
             'Are you sure you want to continue?').format(
                notebook_path=notebook_path),
            abort=True,
        )

    if not solid_name:
        solid_name = os.path.basename(notebook_path).split(".")[0]

    repository_target_info = load_target_info_from_cli_args(kwargs)
    module_target_info = get_module_target_function(repository_target_info)

    if module_target_info:
        module = module_target_info.module_name
        fn_name = module_target_info.fn_name
        RegisterRepoInfo = namedtuple(
            'RegisterRepoInfo', 'import_statement declaration_statement')
        register_repo_info = RegisterRepoInfo(
            "from {module} import {fn_name}".format(module=module, fn_name=fn_name),
            "dm.declare_as_solid({fn_name}(), '{solid_name}')".format(
                fn_name=fn_name, solid_name=solid_name),
        )
    else:
        raise click.UsageError(
            "Cannot instantiate notebook with repository definition given by a function from a file"
        )

    with open(notebook_path, 'w') as f:
        f.write(get_notebook_scaffolding(register_repo_info))

    click.echo("Created new dagstermill notebook at {path}".format(
        path=notebook_path))
def gunzipper(_, gzip_file):
    """gunzips /path/to/foo.gz to /path/to/raw/2019/01/01/data.json"""
    # TODO: take date as an input
    parent = os.path.dirname(gzip_file)
    out_dir = os.path.join(parent, "raw/2019/01/01")
    out_path = os.path.join(out_dir, "data.json")
    # Decompress only when the target file is not already present.
    if not safe_isfile(out_path):
        mkdir_p(out_dir)
        with gzip.open(gzip_file, "rb") as src:
            with open(out_path, "wb") as dst:
                shutil.copyfileobj(src, dst)
    return [parent]
def gunzipper(_, gzip_file):
    '''gunzips /path/to/foo.gz to /path/to/raw/2019/01/01/data.json
    '''
    # TODO: take date as an input
    prefix = os.path.dirname(gzip_file)
    target_dir = os.path.join(prefix, 'raw/2019/01/01')
    target = os.path.join(target_dir, 'data.json')
    already_present = safe_isfile(target)
    if not already_present:
        # Ensure the dated output directory exists before decompressing.
        mkdir_p(target_dir)
        with gzip.open(gzip_file, 'rb') as compressed, open(target, 'wb') as raw:
            shutil.copyfileobj(compressed, raw)
    return [prefix]
def file_exists_at_path_type_check(_, value):
    """Type-check that ``value`` is a string naming a file that exists on
    disk; returns a failed ``TypeCheck`` describing the problem otherwise.
    """
    failure_description = None
    if not isinstance(value, six.string_types):
        failure_description = (
            'FileExistsAtPath must be a string in memory. Got {value}'.format(
                value=repr(value)
            )
        )
    elif not safe_isfile(value):
        failure_description = (
            'FileExistsAtPath must be a path that points to a file that '
            'exists. "{value}" does not exist on disk'
        ).format(value=value)

    if failure_description is not None:
        return TypeCheck(success=False, description=failure_description)
    # NOTE(review): the success path returns bare True while failures return
    # TypeCheck objects — confirm callers accept both forms.
    return True
def execute_create_notebook(notebook, force_overwrite, kernel):
    """Scaffold a new dagstermill notebook at the given path for the given
    Jupyter kernel, confirming before overwriting unless ``force_overwrite``
    is set."""
    filename = notebook if notebook.endswith('.ipynb') else notebook + ".ipynb"
    notebook_path = os.path.join(os.getcwd(), filename)
    mkdir_p(os.path.dirname(notebook_path))

    would_clobber = not force_overwrite and safe_isfile(notebook_path)
    if would_clobber:
        click.confirm(
            ('Warning, {notebook_path} already exists and continuing '
             'will overwrite the existing notebook. '
             'Are you sure you want to continue?').format(
                notebook_path=notebook_path),
            abort=True,
        )

    scaffolding = get_notebook_scaffolding(get_kernelspec(kernel))
    with open(notebook_path, 'w') as f:
        f.write(scaffolding)
    click.echo("Created new dagstermill notebook at {path}".format(
        path=notebook_path))
def test_safe_isfile():
    # safe_isfile is truthy for a file that exists, falsy otherwise.
    existing = file_relative_path(__file__, 'test_file_utils.py')
    missing = file_relative_path(__file__, 'not_a_file.py')
    assert safe_isfile(existing)
    assert not safe_isfile(missing)
def test_safe_isfile():
    # This test file itself exists; a made-up sibling path does not.
    this_test_file = script_relative_path('test_safe_isfile.py')
    assert safe_isfile(this_test_file)
    bogus_path = script_relative_path('test_safe_isfile_foobar.py')
    assert not safe_isfile(bogus_path)
def test_safe_isfile():
    # An existing sibling file is detected; a nonexistent one is not.
    present = script_relative_path('test_file_utils.py')
    absent = script_relative_path('not_a_file.py')
    assert safe_isfile(present)
    assert not safe_isfile(absent)