def generate_file_list(self):
    """Yield each individual path given a source folder and a set of file-matching expressions.

    Iterates over `self.src` (a list of source URLs/paths) and yields an
    `ExternalURL` for every file that matches one of the glob patterns in
    `self.include`.  Zero-length files are skipped unless
    `self.include_zero_length` is set, in all three branches.
    """
    for src in self.src:
        if src.startswith('s3'):
            # connect lazily as needed:
            if self.s3_conn is None:
                self.s3_conn = ScalableS3Client().s3
            for _bucket, _root, path in generate_s3_sources(self.s3_conn, src, self.include, self.include_zero_length):
                source = url_path_join(src, path)
                yield ExternalURL(source)
        elif src.startswith('hdfs'):
            for source, size in luigi.contrib.hdfs.listdir(src, recursive=True, include_size=True):
                if not self.include_zero_length and size == 0:
                    continue
                elif any(fnmatch.fnmatch(source, include_val) for include_val in self.include):
                    yield ExternalURL(source)
        else:
            # Apply the include patterns to the relative path below the src directory.
            for dirpath, _dirnames, files in os.walk(src):
                for filename in files:
                    filepath = os.path.join(dirpath, filename)
                    relpath = os.path.relpath(filepath, src)
                    if not any(fnmatch.fnmatch(relpath, include_val) for include_val in self.include):
                        continue
                    # Skip zero-length files unless requested, to match the
                    # S3 and HDFS branches above (was a TODO in the original).
                    # Only stat files that already matched a pattern.
                    if not self.include_zero_length and os.path.getsize(filepath) == 0:
                        continue
                    yield ExternalURL(filepath)
def generate_file_list(self):
    """Yield each individual path given a source folder and a set of file-matching expressions.

    Iterates over `self.src` (a list of source URLs/paths) and yields an
    `ExternalURL` for every file that matches one of the glob patterns in
    `self.include`.  Zero-length files are skipped unless
    `self.include_zero_length` is set, in all three branches.
    """
    for src in self.src:
        if src.startswith('s3'):
            # connect lazily as needed:
            if self.s3_conn is None:
                self.s3_conn = ScalableS3Client().s3
            for _bucket, _root, path in generate_s3_sources(self.s3_conn, src, self.include, self.include_zero_length):
                source = url_path_join(src, path)
                yield ExternalURL(source)
        elif src.startswith('hdfs'):
            # NOTE: hdfs support lives under luigi.contrib.hdfs in current
            # Luigi releases; the bare luigi.hdfs module was removed.
            for source, size in luigi.contrib.hdfs.listdir(src, recursive=True, include_size=True):
                if not self.include_zero_length and size == 0:
                    continue
                elif any(fnmatch.fnmatch(source, include_val) for include_val in self.include):
                    yield ExternalURL(source)
        else:
            # Apply the include patterns to the relative path below the src directory.
            for dirpath, _dirnames, files in os.walk(src):
                for filename in files:
                    filepath = os.path.join(dirpath, filename)
                    relpath = os.path.relpath(filepath, src)
                    if not any(fnmatch.fnmatch(relpath, include_val) for include_val in self.include):
                        continue
                    # Skip zero-length files unless requested, to match the
                    # S3 and HDFS branches above (was a TODO in the original).
                    # Only stat files that already matched a pattern.
                    if not self.include_zero_length and os.path.getsize(filepath) == 0:
                        continue
                    yield ExternalURL(filepath)
def list_s3_files(source_url, patterns):
    """List remote s3 files that match one of the patterns.

    Prints one line per matching key: the key's size right-aligned in ten
    columns (or -1 when the key cannot be fetched), then the key's path
    relative to `source_url`.
    """
    s3_conn = connect_s3()
    for bucket, root, path in generate_s3_sources(s3_conn, source_url, patterns):
        source = join_as_s3_url(bucket, root, path)
        src_key = get_s3_key(s3_conn, source)
        # Parenthesized single-expression print: identical output under
        # Python 2, and valid syntax under Python 3.
        print("%10d %s" % (src_key.size if src_key is not None else -1, path))
def list_s3_files(source_url, patterns):
    """List remote s3 files that match one of the patterns.

    For every matching key under `source_url`, prints its size (or -1 when
    the key is missing) followed by its relative path.
    """
    conn = connect_s3()
    for bucket, root, rel_path in generate_s3_sources(conn, source_url, patterns):
        full_url = join_as_s3_url(bucket, root, rel_path)
        key = get_s3_key(conn, full_url)
        # A missing key is reported with a sentinel size of -1.
        size = -1 if key is None else key.size
        print("%10d %s" % (size, rel_path))
def get_s3_files(source_url, dest_root, patterns):
    """Copy remote s3 files that match one of the patterns to a local destination.

    Each matching key is downloaded into `dest_root`; the key's path is
    flattened into a single filename by replacing '/' with '_'.  Keys that
    cannot be fetched are reported but do not abort the copy.
    """
    s3_conn = connect_s3()
    for bucket, root, path in generate_s3_sources(s3_conn, source_url, patterns):
        source = join_as_s3_url(bucket, root, path)
        # Flatten the (possibly nested) key path into one local filename.
        dest_name = path.replace('/', '_')
        destination = os.path.join(dest_root, dest_name)
        src_key = get_s3_key(s3_conn, source)
        if src_key is not None:
            src_key.get_contents_to_filename(destination)
        else:
            # Parenthesized single-expression print: identical output under
            # Python 2, and valid syntax under Python 3.
            print("No key for source " + source)
def get_s3_files(source_url, dest_root, patterns):
    """Copy remote s3 files that match one of the patterns to a local destination.

    Downloads each matching key into `dest_root`, flattening the key's path
    into a single local filename; reports any key that cannot be fetched.
    """
    conn = connect_s3()
    for bucket, root, rel_path in generate_s3_sources(conn, source_url, patterns):
        full_url = join_as_s3_url(bucket, root, rel_path)
        key = get_s3_key(conn, full_url)
        if key is None:
            print("No key for source " + full_url)
            continue
        # Flatten nested key paths into one filename under dest_root.
        local_path = os.path.join(dest_root, rel_path.replace('/', '_'))
        key.get_contents_to_filename(local_path)
def _make_s3_generator(self, bucket_name, root, path_info, patterns):
    """Generates a list of matching S3 sources using a mock S3 connection.

    `path_info` maps a path (relative to `root`) to a fake key size; each
    entry becomes a mock key returned by the mocked bucket listing.
    Returns the list produced by `s3_util.generate_s3_sources`.
    """
    s3_conn = MagicMock()
    s3_bucket = MagicMock()
    s3_conn.get_bucket = MagicMock(return_value=s3_bucket)
    # .items() instead of the Python-2-only .iteritems(): identical
    # iteration behavior under py2, and also valid under py3.
    target_list = [self._make_key("{root}/{path}".format(root=root, path=path), size)
                   for path, size in path_info.items()]
    s3_bucket.list = MagicMock(return_value=target_list)
    # Parenthesized print works identically under Python 2 and 3.
    print([(k.key, k.size) for k in target_list])
    s3_bucket.name = bucket_name
    source = "s3://{bucket}/{root}".format(bucket=bucket_name, root=root)
    generator = s3_util.generate_s3_sources(s3_conn, source, patterns)
    output = list(generator)
    return output
def _make_s3_generator(self, bucket_name, root, path_info, patterns):
    """
    Generates a list of matching S3 sources using a mock S3 connection.

    `path_info` maps a path (relative to `root`) to a fake key size; each
    entry becomes a mock key returned by the mocked bucket listing.
    Returns the list produced by `s3_util.generate_s3_sources`.
    """
    s3_conn = MagicMock()
    s3_bucket = MagicMock()
    s3_conn.get_bucket = MagicMock(return_value=s3_bucket)
    # .items() instead of the Python-2-only .iteritems(): identical
    # iteration behavior under py2, and also valid under py3 (this block
    # already uses the py3-compatible print() call form).
    target_list = [self._make_key("{root}/{path}".format(root=root, path=path), size)
                   for path, size in path_info.items()]
    s3_bucket.list = MagicMock(return_value=target_list)
    print([(k.key, k.size) for k in target_list])
    s3_bucket.name = bucket_name
    source = "s3://{bucket}/{root}".format(bucket=bucket_name, root=root)
    generator = s3_util.generate_s3_sources(s3_conn, source, patterns)
    output = list(generator)
    return output
s3_key.size = size return s3_key def _make_s3_generator(self, bucket_name, root, path_info, patterns): """Generates a list of matching S3 sources using a mock S3 connection.""" s3_conn = MagicMock() s3_bucket = MagicMock() s3_conn.get_bucket = MagicMock(return_value=s3_bucket) target_list = [self._make_key("{root}/{path}".format(root=root, path=path), size) for path, size in path_info.iteritems()] s3_bucket.list = MagicMock(return_value=target_list) print [(k.key, k.size) for k in target_list] s3_bucket.name = bucket_name source = "s3://{bucket}/{root}".format(bucket=bucket_name, root=root) generator = s3_util.generate_s3_sources(s3_conn, source, patterns) output = list(generator) return output def _run_without_filtering(self, bucket_name, root, path_info): """Runs generator and checks output.""" patterns = ['*'] output = self._make_s3_generator(bucket_name, root, path_info, patterns) self.assertEquals(len(output), len(path_info)) expected = [(bucket_name, root, key) for key in path_info] self.assertEquals(set(output), set(expected)) def test_normal_generate(self): bucket_name = "bucket_name" root = "root1/root2" path_info = {