def reset_files_extension(self, storage_name, prefix):
    """Restore the original names of keys that were renamed with a '-done' suffix."""
    bucket = self.__s3.get_bucket(storage_name)
    for key in bucket.list(prefix=prefix):
        if key.name.endswith('-done'):
            # Copy the key back under its original name, then drop the '-done' copy.
            new_key_name = key.name.replace('-done', '')
            bucket.copy_key(new_key_name=new_key_name,
                            src_bucket_name=storage_name,
                            src_key_name=key.name)
            bucket.delete_key(key.name)
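# A minimal usage sketch for reset_files_extension, assuming it is defined as a
# method of a class that keeps a boto S3Connection in the private attribute
# self.__s3. The wrapper class name, credentials, bucket, and prefix below are
# hypothetical placeholders, not taken from the original code.
import boto


class S3FileHandler(object):
    def __init__(self, access_key=None, secret_key=None):
        # boto.connect_s3 falls back to environment/boto-config credentials
        # when no keys are passed explicitly.
        self.__s3 = boto.connect_s3(access_key, secret_key)

    # reset_files_extension (shown above) would be defined here as a method, so
    # that the name-mangled self.__s3 attribute resolves correctly inside it.


# handler = S3FileHandler()
# handler.reset_files_extension('my-bucket', 'exports/2016-01-01/')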
def test_write_reservoir_sample_s3(self):
    # given: 100 records and a reservoir-sampling buffer that keeps only 10 of them
    sample_size = 10
    items_to_write = [
        BaseRecord({
            u'key1': u'value1{}'.format(i),
            u'key2': u'value2{}'.format(i)
        })
        for i in range(100)
    ]
    options = self.get_writer_config()
    options['options'].update({
        'compression': 'none',
        'write_buffer': RESERVOIR_SAMPLING_BUFFER_CLASS,
        'write_buffer_options': {'sample_size': sample_size}
    })

    # when: the batch is written and flushed to the fake S3 bucket
    writer = S3Writer(options, meta())
    try:
        writer.write_batch(items_to_write)
        writer.flush()
    finally:
        writer.close()

    # then: a single key is written, containing exactly sample_size sampled lines
    bucket = self.s3_conn.get_bucket('fake_bucket')
    saved_keys = [k for k in bucket.list()]
    self.assertEqual(1, len(saved_keys))
    self.assertEqual(saved_keys[0].name, 'tests/0.jl')
    content = saved_keys[0].get_contents_as_string()
    self.assertEqual(len(content.strip().splitlines()), sample_size)
    self.assertNotEqual(content.strip().splitlines(), items_to_write[:sample_size])
import os
import sys


def download_from_s3(bucket_name, file_name, dest_path, logger, prefix=None,
                     access_key=None, secret_key=None, dry_run=False):
    valid = True
    result_string = ""
    s3_conn = _get_s3_connection(access_key, secret_key)
    if s3_conn.lookup(bucket_name):
        bucket = s3_conn.get_bucket(bucket_name)
        files = bucket.list(prefix=prefix)
        for f in files:
            # Build the full key name we are looking for, honouring the optional prefix.
            if prefix:
                k = os.path.join(prefix, file_name)
            else:
                k = file_name
            if k == str(f.key):
                def percent_cb(complete, total):
                    percentage = int(complete) * 100 / int(total)
                    logger.write('Downloading from S3: ' + str(complete) + ' / ' +
                                 str(total) + ' (' + str(percentage) + '%)',
                                 multi_line=False)
                    sys.stdout.flush()

                if dry_run:
                    result_string += 'Skipping actual download from S3 due to dry run.\n'
                else:
                    if not f.get_contents_to_filename(os.path.join(dest_path, file_name),
                                                      cb=percent_cb, num_cb=5):
                        result_string += ('Downloaded package to ' +
                                          os.path.join(dest_path, file_name) +
                                          ' from S3 bucket ' + bucket_name + '\n')
                return {"valid": valid, "result_string": result_string}
        result_string += file_name + " does not exist in S3 bucket " + bucket_name + '\n'
        valid = False
    else:
        result_string += "Cannot find S3 bucket with name " + bucket_name + '\n'
        valid = False
    return {"valid": valid, "result_string": result_string}
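# A small usage sketch for download_from_s3. The _get_s3_connection helper is
# referenced but not defined in the snippet above, so a minimal assumed
# implementation is given here; the ConsoleLogger class, bucket, prefix, and
# file names are likewise hypothetical. The only interface the function needs
# from `logger` is write(message, multi_line=...), used by the progress callback.
import sys

import boto


def _get_s3_connection(access_key=None, secret_key=None):
    # Assumed helper: boto falls back to environment/config credentials
    # when no keys are given explicitly.
    return boto.connect_s3(access_key, secret_key)


class ConsoleLogger(object):
    def write(self, message, multi_line=True):
        sys.stdout.write(message + ('\n' if multi_line else '\r'))


if __name__ == '__main__':
    result = download_from_s3(
        bucket_name='releases-bucket',      # hypothetical bucket
        file_name='package-1.0.0.tar.gz',   # hypothetical key
        dest_path='/tmp',
        logger=ConsoleLogger(),
        prefix='builds/stable',
        dry_run=True,                       # dry_run reports instead of downloading
    )
    if not result['valid']:
        print(result['result_string'])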
import boto.s3.key


def _todos(bucket, prefix, paths, check_removed=True):
    '''
    Return information about upcoming uploads and deletions.

    Returns a tuple: (upload, delete)

    'upload' is a dictionary of info about files that need to be uploaded.
    It is keyed on local paths, and maps to a tuple:
        ((hex_md5, base64_md5, filesize), remote_path)

    'delete' is a list of S3 keys that should be removed. If 'check_removed'
    is False, this list will always be empty.
    '''
    # map rpath -> lpath; we use this to compare md5s for existing keys
    rpath_map = dict((i[1], i[0]) for i in paths)

    # Iterate through the BucketListResultSet only once; we'll add elements to
    # two containers and will return them at the end.
    up = {}
    delete = []

    # Create a set of keys in S3 for comparison later
    s3_keys = set()

    # add entries for keys that have different contents
    for key in bucket.list(prefix):
        # Since we're already iterating through the result set, we'll save
        # key names.
        s3_keys.add(key.name)
        if key.name not in rpath_map:
            # this key doesn't exist locally; schedule deletion only when
            # removal checking is enabled, and skip the md5 comparison either way
            if check_removed:
                delete.append(key.name)
            continue
        # file exists in both; compare md5s
        lpath = rpath_map[key.name]
        with open(lpath, 'rb') as fp:
            md5 = boto.s3.key.compute_md5(fp)
        if key.etag.strip('"') != md5[0].strip('"'):
            up[lpath] = (md5, key.name)

    # schedule uploads for new keys
    for rpath in set(i[1] for i in paths) - s3_keys:
        lpath = rpath_map[rpath]
        with open(lpath, 'rb') as fp:
            md5 = boto.s3.key.compute_md5(fp)
        up[lpath] = (md5, rpath)

    return up, delete
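# A sketch of how the (upload, delete) plan returned by _todos might be applied,
# assuming boto2's Key API. The sync_bucket helper name is illustrative, not part
# of the original module.
def sync_bucket(bucket, prefix, paths):
    up, delete = _todos(bucket, prefix, paths)

    for lpath, (md5, rpath) in up.items():
        key = bucket.new_key(rpath)
        # Passing the precomputed (hex_md5, base64_md5) pair lets boto skip
        # re-reading the file to compute the checksum.
        key.set_contents_from_filename(lpath, md5=md5[:2])

    if delete:
        # delete_keys accepts an iterable of key names and removes them in bulk.
        bucket.delete_keys(delete)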
import gzip


def __get_file_contents_list_from_bucket(bucket, prefix, bucket_name):
    json_files_list = []
    for key in bucket.list(prefix=prefix):
        # Skip "directory" placeholders and keys that have already been processed.
        if key.name.endswith('/') or key.name.endswith('-done'):
            continue
        try:
            # Mark the key as processed by renaming it (copy + delete), then
            # download and decompress its contents.
            new_key_name = "{}-done".format(key.name)
            bucket.copy_key(new_key_name=new_key_name,
                            src_bucket_name=bucket_name,
                            src_key_name=key.name)
            bucket.delete_key(key.name)
            new_key = bucket.get_key(new_key_name)
            new_key.get_contents_to_filename(filename="tmp.json.gz")
            f = gzip.open('tmp.json.gz', 'rb')
            json_files_list.append(f.read())
            f.close()
        except Exception as ex:
            Logger.log("warning", "{} FAILED: {}".format(key.name, ex))
    return json_files_list
if name is None:
    panic("Bucket name is not specified!")

# Resolve the requested region and open a connection to it.
regions = dict((r.name, r) for r in boto.s3.regions())
region = regions.get(region)
if region is None:
    panic("Region is not specified or is invalid. Valid regions are {}".format(
        ", ".join(regions.keys())))
connection = region.connect(aws_access_key_id=akey,
                            aws_secret_access_key=skey)

bucket = connection.lookup(name)
if bucket is None:
    panic('Could not open/find bucket "{}"! Are your AWS keys/configuration valid?'
          .format(name))

# Build the file tree from the bucket listing.
tree = collect_files(bucket.list(prefix=prefix))

# Generate the requested output formats
outputs = config.get('output', {})
targets = {
    'html': HtmlGenerator,
    'json': JsonGenerator,
    'txt': TxtGenerator
}
for output_name in outputs.keys():
    # Use a distinct loop variable instead of reusing `name`, which holds the
    # bucket name above.
    tp = config['output'][output_name].get('type', None)
    klass = targets.get(tp, None)
    if tp is None or klass is None:
        panic('Unknown type for section output.{}'.format(output_name))
    klass(config['output'][output_name]).run(tree)