Example #1
 def generate_file_list(self):
     """Yield each individual path given a source folder and a set of file-matching expressions."""
     for src in self.src:
         if src.startswith('s3'):
             # connect lazily as needed:
             if self.s3_conn is None:
                 self.s3_conn = boto.connect_s3()
             for _bucket, _root, path in generate_s3_sources(
                     self.s3_conn, src, self.include,
                     self.include_zero_length):
                 source = url_path_join(src, path)
                 yield ExternalURL(source)
         elif src.startswith('hdfs'):
             for source, size in luigi.hdfs.listdir(src,
                                                    recursive=True,
                                                    include_size=True):
                 if not self.include_zero_length and size == 0:
                     continue
                 elif any(
                         fnmatch.fnmatch(source, include_val)
                         for include_val in self.include):
                     yield ExternalURL(source)
         else:
             # Apply the include patterns to the relative path below the src directory.
             # TODO: implement exclude_zero_length to match S3 case.
             for dirpath, _dirnames, files in os.walk(src):
                 for filename in files:
                     filepath = os.path.join(dirpath, filename)
                     relpath = os.path.relpath(filepath, src)
                     if any(
                             fnmatch.fnmatch(relpath, include_val)
                             for include_val in self.include):
                         yield ExternalURL(filepath)
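Every example on this page calls generate_s3_sources, whose implementation is not shown. A minimal sketch of the contract the callers appear to rely on (an assumption, not the project's actual code) is that it yields (bucket_name, root, relative_path) tuples for keys whose path below the root matches one of the fnmatch patterns:

import fnmatch
from urllib.parse import urlparse

def generate_s3_sources(s3_conn, source, patterns=('*',), include_zero_length=False):
    """Hypothetical sketch: yield (bucket_name, root, relative_path) for
    keys under `source` whose relative path matches one of `patterns`."""
    parsed = urlparse(source)
    # Assumes a boto-style connection: buckets expose list(prefix) and keys
    # expose .key and .size, as the mocks in the test examples below suggest.
    bucket = s3_conn.get_bucket(parsed.netloc)
    root = parsed.path.strip('/')
    for key in bucket.list(root):
        if key.size == 0 and not include_zero_length:
            continue
        relative_path = key.key[len(root):].lstrip('/')
        if any(fnmatch.fnmatch(relative_path, pattern) for pattern in patterns):
            yield (bucket.name, root, relative_path)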
Example #2
def list_s3_files(source_url, patterns):
    """List remote s3 files that match one of the patterns."""
    s3_conn = connect_s3()
    for bucket, root, path in generate_s3_sources(s3_conn, source_url, patterns):
        source = join_as_s3_url(bucket, root, path)
        src_key = get_s3_key(s3_conn, source)
        print "%10d %s" % (src_key.size if src_key is not None else -1, path)
Example #3
 def generate_file_list(self):
     """Yield each individual path given a source folder and a set of file-matching expressions."""
     for src in self.src:
         if src.startswith('s3'):
             # connect lazily as needed:
             if self.s3_conn is None:
                 self.s3_conn = boto.connect_s3()
             for _bucket, _root, path in generate_s3_sources(self.s3_conn, src, self.include, self.include_zero_length):
                 source = url_path_join(src, path)
                 yield ExternalURL(source)
         elif src.startswith('hdfs'):
             for source, size in luigi.hdfs.listdir(src, recursive=True, include_size=True):
                 if not self.include_zero_length and size == 0:
                     continue
                 elif any(fnmatch.fnmatch(source, include_val) for include_val in self.include):
                     yield ExternalURL(source)
         else:
             # Apply the include patterns to the relative path below the src directory.
             # TODO: implement exclude_zero_length to match S3 case.
             for dirpath, _dirnames, files in os.walk(src):
                 for filename in files:
                     filepath = os.path.join(dirpath, filename)
                     relpath = os.path.relpath(filepath, src)
                     if any(fnmatch.fnmatch(relpath, include_val) for include_val in self.include):
                         yield ExternalURL(filepath)
Example #4
def get_s3_files(source_url, dest_root, patterns):
    """Copy remote s3 files that match one of the patterns to a local destination."""
    s3_conn = connect_s3()
    for bucket, root, path in generate_s3_sources(s3_conn, source_url, patterns):
        source = join_as_s3_url(bucket, root, path)
        dest_name = path.replace('/', '_')
        destination = os.path.join(dest_root, dest_name)
        src_key = get_s3_key(s3_conn, source)
        if src_key is not None:
            src_key.get_contents_to_filename(destination)
        else:
            print "No key for source " + source
Example #5
 def generate_file_list(self):
     """Yield each individual path given a source folder and a set of file-matching expressions."""
     if self.src.startswith('s3'):
         # connect lazily as needed:
         if self.s3_conn is None:
             self.s3_conn = boto.connect_s3()
         for _bucket, _root, path in generate_s3_sources(self.s3_conn, self.src, self.include):
             source = url_path_join(self.src, path)
             yield ExternalURL(source)
     else:
         # Apply the include patterns to the relative path below the src directory.
         for dirpath, _dirnames, files in os.walk(self.src):
             for filename in files:
                 filepath = os.path.join(dirpath, filename)
                 relpath = os.path.relpath(filepath, self.src)
                 if any(fnmatch.fnmatch(relpath, include_val) for include_val in self.include):
                     yield ExternalURL(filepath)
Example #6
    def _make_s3_generator(self, bucket_name, root, path_info, patterns):
        """Generates a list of matching S3 sources using a mock S3 connection."""
        s3_conn = MagicMock()
        s3_bucket = MagicMock()
        s3_conn.get_bucket = MagicMock(return_value=s3_bucket)
        target_list = [
            self._make_key("{root}/{path}".format(root=root, path=path), size)
            for path, size in path_info.items()
        ]
        s3_bucket.list = MagicMock(return_value=target_list)
        print([(k.key, k.size) for k in target_list])

        s3_bucket.name = bucket_name
        source = "s3://{bucket}/{root}".format(bucket=bucket_name, root=root)
        generator = s3_util.generate_s3_sources(s3_conn, source, patterns)
        output = list(generator)
        return output
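The _make_key helper is only partially visible (its tail opens Example #10). A sketch consistent with how it is used, assuming a MagicMock standing in for a boto key:

    def _make_key(self, key_name, size):
        """Hypothetical reconstruction: a mock key exposing the two
        attributes the generator inspects, .key and .size."""
        s3_key = MagicMock()
        s3_key.key = key_name
        s3_key.size = size
        return s3_key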
Example #7
 def generate_file_list(self):
     """Yield each individual path given a source folder and a set of file-matching expressions."""
     for src in self.src:
         if src.startswith('s3'):
             # connect lazily as needed:
             if self.s3_conn is None:
                 self.s3_conn = boto.connect_s3()
             for _bucket, _root, path in generate_s3_sources(
                     self.s3_conn, src, self.include):
                 source = url_path_join(src, path)
                 yield ExternalURL(source)
         else:
             # Apply the include patterns to the relative path below the src directory.
             for dirpath, _dirnames, files in os.walk(src):
                 for filename in files:
                     filepath = os.path.join(dirpath, filename)
                     relpath = os.path.relpath(filepath, src)
                     if any(
                             fnmatch.fnmatch(relpath, include_val)
                             for include_val in self.include):
                         yield ExternalURL(filepath)
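In the surrounding luigi task (not shown in these excerpts), a generator like this is typically consumed by requires() so that each matching path becomes an external input. A sketch of that wiring, which is an assumption rather than code from these examples:

 def requires(self):
     # Each ExternalURL yielded by the generator acts as an input dependency.
     for task in self.generate_file_list():
         yield task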
Example #8
    def download_output_files(self):
        self.assertEqual(len(list(generate_s3_sources(self.s3_client.s3, self.test_out))), len(self.output_files))

        self.temporary_dir = tempfile.mkdtemp()
        self.addCleanup(shutil.rmtree, self.temporary_dir)

        self.downloaded_outputs = os.path.join(self.temporary_dir, 'output')
        os.makedirs(self.downloaded_outputs)

        for output_file in self.output_files:
            local_file_name = self.generate_file_name(output_file)

            remote_url = url_path_join(self.test_out, output_file['course_id'], "events", local_file_name + '.gz')

            downloaded_output_path = get_file_from_key(self.s3_client, remote_url, self.downloaded_outputs)

            if downloaded_output_path is None:
                self.fail('Unable to find expected output file {0}'.format(remote_url))

            decompressed_file_name = downloaded_output_path[:-len('.gz')]
            output_file['downloaded_path'] = decompressed_file_name
            fs.decompress_file(downloaded_output_path, decompressed_file_name)
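get_file_from_key is referenced here but not defined in these excerpts. A hedged sketch of its plausible behavior, reusing the get_s3_key helper seen in Examples #2 and #4 and returning None when the remote key is missing:

def get_file_from_key(s3_client, url, output_path):
    """Hypothetical sketch: download the key at `url` into `output_path`
    and return the local file path, or None if no such key exists."""
    key = get_s3_key(s3_client.s3, url)
    if key is None:
        return None
    local_path = os.path.join(output_path, os.path.basename(url))
    key.get_contents_to_filename(local_path)
    return local_path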
Example #9
def list_s3_files(source_url, patterns):
    """List remote s3 files that match one of the patterns."""
    s3_conn = connect_s3()
    for _bucket, _root, path in generate_s3_sources(s3_conn, source_url,
                                                    patterns):
        print(path)
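A hedged usage sketch (bucket and pattern are hypothetical): print the relative path of every .log key below a prefix:

list_s3_files('s3://my-bucket/tracking', ['*.log'])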
Example #10
        s3_key.size = size
        return s3_key

    def _make_s3_generator(self, bucket_name, root, path_info, patterns):
        """Generates a list of matching S3 sources using a mock S3 connection."""
        s3_conn = MagicMock()
        s3_bucket = MagicMock()
        s3_conn.get_bucket = MagicMock(return_value=s3_bucket)
        target_list = [self._make_key("{root}/{path}".format(root=root, path=path), size)
                       for path, size in path_info.items()]
        s3_bucket.list = MagicMock(return_value=target_list)
        print([(k.key, k.size) for k in target_list])

        s3_bucket.name = bucket_name
        source = "s3://{bucket}/{root}".format(bucket=bucket_name, root=root)
        generator = s3_util.generate_s3_sources(s3_conn, source, patterns)
        output = list(generator)
        return output

    def _run_without_filtering(self, bucket_name, root, path_info):
        """Runs generator and checks output."""
        patterns = ['*']
        output = self._make_s3_generator(bucket_name, root, path_info, patterns)
        self.assertEqual(len(output), len(path_info))
        expected = [(bucket_name, root, key) for key in path_info]
        self.assertEqual(set(output), set(expected))

    def test_normal_generate(self):
        bucket_name = "bucket_name"
        root = "root1/root2"
        path_info = {
Example #11
def list_s3_files(source_url, patterns):
    """List remote s3 files that match one of the patterns."""
    s3_conn = connect_s3()
    for _bucket, _root, path in generate_s3_sources(s3_conn, source_url, patterns):
        print(path)