Example #1
    def reset_files_extension(self, storage_name, prefix):
        """Strip the '-done' marker from keys under ``prefix``.

        S3 has no rename operation, so each key is copied to its new
        name and the original key is deleted afterwards.
        """
        bucket = self.__s3.get_bucket(storage_name)
        for key in bucket.list(prefix=prefix):
            if key.name.endswith('-done'):
                new_key_name = key.name.replace('-done', '')
                bucket.copy_key(new_key_name=new_key_name,
                                src_bucket_name=storage_name,
                                src_key_name=key.name)
                bucket.delete_key(key.name)
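
The copy-then-delete pattern above is the usual way to "rename" a key with boto, since S3 itself has no rename call. A minimal standalone sketch of the same idea; the bucket and key names are placeholders, not taken from the example:

import boto

# Sketch: rename an S3 key by copying it and deleting the original.
# 'my-bucket' and the key names are hypothetical placeholders.
conn = boto.connect_s3()  # credentials come from the environment or boto config
bucket = conn.get_bucket('my-bucket')
bucket.copy_key(new_key_name='reports/2020.json',
                src_bucket_name='my-bucket',
                src_key_name='reports/2020.json-done')
bucket.delete_key('reports/2020.json-done')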
Example #2
    def test_write_reservoir_sample_s3(self):
        # given
        sample_size = 10
        items_to_write = [
            BaseRecord({
                u'key1': u'value1{}'.format(i),
                u'key2': u'value2{}'.format(i)
            }) for i in range(100)
        ]
        options = self.get_writer_config()
        options['options'].update({
            'compression': 'none',
            'write_buffer': RESERVOIR_SAMPLING_BUFFER_CLASS,
            'write_buffer_options': {
                'sample_size': sample_size
            }
        })

        # when:
        writer = S3Writer(options, meta())
        try:
            writer.write_batch(items_to_write)
            writer.flush()
        finally:
            writer.close()

        # then:
        bucket = self.s3_conn.get_bucket('fake_bucket')
        saved_keys = [k for k in bucket.list()]
        self.assertEqual(1, len(saved_keys))
        self.assertEqual(saved_keys[0].name, 'tests/0.jl')
        content = saved_keys[0].get_contents_as_string()
        self.assertEqual(len(content.strip().splitlines()), sample_size)
        self.assertNotEqual(content.strip().splitlines(),
                            items_to_write[:sample_size])
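
For context, the write buffer exercised by this test keeps a fixed-size uniform random sample of the records it receives instead of writing all of them. A minimal sketch of the underlying reservoir-sampling step, independent of the exporters library and shown only to illustrate the idea:

import random

def reservoir_sample(stream, sample_size):
    # Classic "Algorithm R": keep a uniform random sample of sample_size
    # items from a stream of unknown length.
    reservoir = []
    for i, item in enumerate(stream):
        if i < sample_size:
            reservoir.append(item)
        else:
            j = random.randint(0, i)
            if j < sample_size:
                reservoir[j] = item
    return reservoir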
Example #3
def download_from_s3(bucket_name, file_name, dest_path, logger, prefix=None,
                     access_key=None, secret_key=None, dry_run=False):
    valid = True
    result_string = ""
    s3_conn = _get_s3_connection(access_key, secret_key)
    if s3_conn.lookup(bucket_name):
        bucket = s3_conn.get_bucket(bucket_name)
        files = bucket.list(prefix=prefix)
        for f in files:
            # Build the full key name that is being looked for.
            if prefix:
                k = os.path.join(prefix, file_name)
            else:
                k = file_name
            if k == str(f.key):
                def percent_cb(complete, total):
                    percentage = int(complete) * 100 // int(total)
                    logger.write('Downloading from S3: ' + str(complete) + ' / ' + str(total)
                                 + ' ( ' + str(percentage) + '%)', multi_line=False)
                    sys.stdout.flush()
                if dry_run:
                    result_string += 'Skipping actual download from S3 due to dry run.\n'
                else:
                    # get_contents_to_filename returns None, so report success
                    # directly after the download completes.
                    f.get_contents_to_filename(os.path.join(dest_path, file_name),
                                               cb=percent_cb, num_cb=5)
                    result_string += 'Downloaded package to ' + os.path.join(dest_path, file_name) + \
                                     ' from S3 bucket ' + bucket_name + '\n'
                    return {"valid": valid, "result_string": result_string}

        result_string += file_name + " does not exist in S3 bucket " + bucket_name + '\n'
        valid = False
    else:
        result_string += "Cannot find S3 bucket with name " + bucket_name + '\n'
        valid = False
    return {"valid": valid, "result_string": result_string}
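
The example relies on a `_get_s3_connection` helper that is not shown on this page. A minimal sketch of what such a helper presumably does with boto, falling back to boto's normal credential lookup when no keys are passed; this is an assumption, not the original helper:

import boto

def _get_s3_connection(access_key=None, secret_key=None):
    # Hypothetical helper, sketched because it is not shown above.
    if access_key and secret_key:
        return boto.connect_s3(aws_access_key_id=access_key,
                               aws_secret_access_key=secret_key)
    # Without explicit keys, boto uses environment variables, ~/.boto, or an IAM role.
    return boto.connect_s3()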
Example #4
def _todos(bucket, prefix, paths, check_removed=True):
    '''
    Return information about upcoming uploads and deletions.

    Returns a tuple: (upload, delete).
    
    'upload' is a dictionary of info about files that need to be uploaded.
    It is keyed on local paths, and maps to a tuple:
    ((hex_md5, base64_md5, filesize), remote_path)

    'delete' is a list of S3 keys that should be removed.  If 'check_removed'
    is False, this list will always be empty.
    '''
    # map rpath -> lpath; we use this to compare md5s for existing keys
    rpath_map = dict((i[1], i[0]) for i in paths)

    # Iterate through the BucketListResultSet only once; we'll add elements to
    # two containers and will return them at the end.
    up = {}
    delete = []

    # Create a set of keys in S3 for comparison later
    s3_keys = set()

    # add entries for keys that have different contents
    for key in bucket.list(prefix):
        # Since we're already iterating through the result set, we'll save
        # key names.
        s3_keys.add(key.name)

        if check_removed and key.name not in rpath_map:
            # this key doesn't exist locally, schedule deletion
            delete.append(key.name)
            continue

        # file exists in both; compare md5s
        lpath = rpath_map[key.name]
        with open(lpath, 'rb') as fp:
            md5 = boto.utils.compute_md5(fp)  # (hex_md5, base64_md5, filesize)
        if key.etag.strip('"') != md5[0].strip('"'):
            up[lpath] = (md5, key.name)

    # schedule uploads for new keys
    for rpath in set(i[1] for i in paths) - s3_keys:
        lpath = rpath_map[rpath]
        with open(lpath, 'rb') as fp:
            md5 = boto.utils.compute_md5(fp)
        up[lpath] = (md5, rpath)

    return up, delete
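
A hedged sketch of how `_todos` might be driven, assuming `paths` is a list of `(local_path, remote_path)` tuples as the docstring and the `rpath_map` construction suggest; the bucket name and file names below are placeholders:

import boto

# Hypothetical driver for _todos; bucket and paths are illustrative only.
conn = boto.connect_s3()
bucket = conn.get_bucket('my-static-site')
paths = [('build/index.html', 'site/index.html'),
         ('build/app.js', 'site/app.js')]

up, delete = _todos(bucket, 'site/', paths)

for lpath, (md5, rpath) in up.items():
    key = bucket.new_key(rpath)
    # boto accepts the precomputed (hex, base64) pair, avoiding a second hash pass.
    key.set_contents_from_filename(lpath, md5=md5[:2])

for rpath in delete:
    bucket.delete_key(rpath)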
Example #5
    def __get_file_contents_list_from_bucket(bucket, prefix, bucket_name):
        json_files_list = []
        for key in bucket.list(prefix=prefix):
            # Skip "directory" placeholders and keys that were already processed.
            if key.name.endswith('/') or key.name.endswith('-done'):
                continue
            try:
                # Mark the key as processed by renaming it (copy + delete),
                # then download and decompress its contents.
                new_key_name = "{}-done".format(key.name)
                bucket.copy_key(new_key_name=new_key_name,
                                src_bucket_name=bucket_name,
                                src_key_name=key.name)
                bucket.delete_key(key.name)
                new_key = bucket.get_key(new_key_name)
                new_key.get_contents_to_filename(filename="tmp.json.gz")
                with gzip.open('tmp.json.gz', 'rb') as f:
                    json_files_list.append(f.read())
            except Exception as ex:
                Logger.log("warning", "{} FAILED: {}".format(key.name, ex))
        return json_files_list
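
Each element of the returned list is the raw, decompressed content of one file. A caller would typically decode and parse these blobs; a small hypothetical helper, assuming every file holds a single JSON document:

import json

def parse_json_blobs(blobs):
    # Hypothetical helper: decode and parse each downloaded blob.
    return [json.loads(blob.decode('utf-8')) for blob in blobs]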
Example #6
    if name is None:
        panic("Bucket name is not specified!")
    regions = dict((r.name, r) for r in boto.s3.regions())
    region = regions.get(region)
    if region is None:
        panic("Region is not specified or is invalid. Valid regions are {}".
              format(", ".join(regions.keys())))
    connection = region.connect(aws_access_key_id=akey,
                                aws_secret_access_key=skey)
    bucket = connection.lookup(name)
    if bucket is None:
        panic(
            "Could not open/find bucket \"{}\"! Are your AWS keys/configuration valid?"
            .format(name))

    tree = collect_files(bucket.list(prefix=prefix))

    # Generate the requested output formats
    outputs = config.get('output', {})
    targets = {
        'html': HtmlGenerator,
        'json': JsonGenerator,
        'txt': TxtGenerator
    }
    for section in outputs:
        tp = outputs[section].get('type', None)
        klass = targets.get(tp, None)
        if tp is None or klass is None:
            panic('Unknown type for section output.{}'.format(section))
        klass(outputs[section]).run(tree)
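
The loop reads its targets from a config mapping with one section per generator under 'output'. A minimal sketch of the shape it appears to expect; only the 'output' and 'type' keys are taken from the code above, the remaining keys are illustrative placeholders:

config = {
    'output': {
        'report_html': {'type': 'html', 'path': 'out/index.html'},
        'report_json': {'type': 'json', 'path': 'out/listing.json'},
    }
}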