def get_latest_aws_manifest_key(self):
    manifestprefix = self.sourcePrefix + utils.get_period_prefix(
        self.year, self.month)
    print "Getting Manifest key for account:[{}] - bucket:[{}] - prefix:[{}]".format(
        self.accountId, self.sourceBucket, manifestprefix)
    manifest_key = ''
    try:
        #Note: list_objects_v2 returns at most 1000 keys per call; this assumes
        #the month's manifest appears in the first page of results
        response = self.s3sourceclient.list_objects_v2(
            Bucket=self.sourceBucket, Prefix=manifestprefix)
        #Get the latest manifest
        if 'Contents' in response:
            for o in response['Contents']:
                key = o['Key']
                post_prefix = key[key.find(manifestprefix) + len(manifestprefix):]
                #The manifest we want sits at the top level after the prefix,
                #not inside one of the folders
                if '-Manifest.json' in key and post_prefix.find("/") < 0:
                    manifest_key = key
                    break
    except Exception as e:
        print "Error when getting manifest key for account:[{}] - bucket:[{}] - key:[{}]".format(
            self.accountId, self.sourceBucket, manifest_key)
        print e.message
        traceback.print_exc()

    if not manifest_key:
        raise ManifestNotFoundError(
            "Could not find manifest file in bucket:[{}], key:[{}]".format(
                self.sourceBucket, manifest_key))

    return manifest_key
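#Illustration of the top-level filter above (the keys are hypothetical
#examples, assuming a prefix like 'reports/20180101-20180201/'):
#  reports/20180101-20180201/MyReport-Manifest.json          -> post_prefix has no '/', selected
#  reports/20180101-20180201/abc-123/MyReport-Manifest.json  -> post_prefix contains '/', skipped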
def test_role(self):
    monthly_report_prefix = ""
    if self.year and self.month:
        monthly_report_prefix = utils.get_period_prefix(self.year, self.month)
    latest_report_keys = self.get_latest_aws_cur_keys(
        self.sourceBucket, self.sourcePrefix + monthly_report_prefix,
        self.s3sourceclient)
    if latest_report_keys:
        print "xAccount Source test passed!"
def create_manifest(self, type, bucket, prefix, report_keys):
    monthly_report_prefix = ""
    if self.year and self.month:
        monthly_report_prefix = utils.get_period_prefix(self.year, self.month)

    manifest = {}
    #report_keys can be any array of keys. If it's not provided, we use the
    #ones generated by AWS
    if not report_keys:
        report_keys = self.get_latest_aws_cur_keys(
            bucket, prefix + monthly_report_prefix, self.s3destclient)

    entries = []
    uris = []
    for key in report_keys:
        #TODO: a manifest cannot point to more than 1000 files (add validation)
        uris.append("s3://" + bucket + "/" + key)
        if type == consts.MANIFEST_TYPE_REDSHIFT:
            entries.append({
                "url": "s3://" + bucket + "/" + key,
                "mandatory": True
            })
        if len(entries) == self.limit:
            break

    manifest_file_name = ""
    if type == consts.MANIFEST_TYPE_REDSHIFT:
        manifest['entries'] = entries
        manifest_file_name = "billing-redshift-manifest-concurrencylabs.json"
    if type == consts.MANIFEST_TYPE_QUICKSIGHT:
        manifest['fileLocations'] = [{"URIs": uris}]
        manifest_file_name = "billing-quicksight-manifest-concurrencylabs.json"

    manifest_body = json.dumps(manifest, indent=4, sort_keys=False)
    print "Manifest ({}):{}".format(type, manifest_body)

    record_count = 0
    if len(uris):
        record_count = len(uris)
    if len(entries):
        record_count = len(entries)
    print "Number of files in manifest: [{}]".format(record_count)

    #TODO: validate that no Athena files exist in the S3 destination before
    #creating the manifest
    manifest_key = prefix + monthly_report_prefix + manifest_file_name
    if record_count:
        self.s3destclient.put_object(Bucket=bucket,
                                     Key=manifest_key,
                                     ACL='private',
                                     Body=manifest_body)
        print "Manifest S3 URL (this is the URL you provide in {}): [https://s3.amazonaws.com/{}/{}]".format(
            type, bucket, manifest_key)
    else:
        print "No entries found - did not write manifest"
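#For reference, a minimal sketch of the two manifest shapes created above
#(the bucket and key names are hypothetical examples):
#
#  MANIFEST_TYPE_REDSHIFT (Redshift COPY manifest):
#    {
#        "entries": [
#            {"url": "s3://example-bucket/curs/report-1.csv.gz", "mandatory": true}
#        ]
#    }
#
#  MANIFEST_TYPE_QUICKSIGHT (QuickSight S3 manifest):
#    {
#        "fileLocations": [
#            {"URIs": ["s3://example-bucket/curs/report-1.csv.gz"]}
#        ]
#    }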
def get_all_aws_manifest_keys(self):
    manifestprefix = self.sourcePrefix + utils.get_period_prefix(
        self.year, self.month)
    print "Getting Manifest keys for account:[{}] - bucket:[{}] - prefix:[{}]".format(
        self.accountId, self.sourceBucket, manifestprefix)
    manifest_key = []
    try:
        response = self.s3sourceclient.list_objects_v2(
            Bucket=self.sourceBucket, Prefix=manifestprefix)  #recursive
        #Collect every manifest for the month, including the copies inside the
        #per-assembly folders.
        #TODO: decide how to deal with a report definition change mid-month
        if 'Contents' in response:
            for o in response['Contents']:
                key = o['Key']
                post_prefix = key[key.find(manifestprefix) + len(manifestprefix):]
                #Unlike get_latest_aws_manifest_key, we don't restrict the
                #search to top-level manifests (post_prefix.find("/") < 0)
                if '-Manifest.json' in key:
                    manifest_key.append(key)
    except BotoClientError as bce:
        self.status = consts.CUR_PROCESSOR_STATUS_ERROR
        if bce.response['Error']['Code'] == 'NoSuchBucket':
            self.statusDetails = bce.response['Error']['Code']
            raise CurBucketNotFoundError("{} - bucket:[{}]".format(
                bce.message, self.sourceBucket))
        else:
            self.statusDetails = 'BotoClientError_' + bce.response['Error']['Code']
            raise
    except Exception as e:
        self.status = consts.CUR_PROCESSOR_STATUS_ERROR
        self.statusDetails = e.message
        print "Error when getting manifest key for account:[{}] - bucket:[{}] - key:[{}]".format(
            self.accountId, self.sourceBucket, manifest_key)
        print e.message
        traceback.print_exc()

    if not manifest_key:
        self.status = consts.CUR_PROCESSOR_STATUS_ERROR
        self.statusDetails = "ManifestNotFoundError - key:[{}]".format(manifest_key)
        raise ManifestNotFoundError(
            "Could not find manifest file in bucket:[{}]".format(
                self.sourceBucket))

    print "Manifest Keys: [{}]".format(manifest_key)
    return manifest_key
def process_latest_aws_cur(self, action):
    if action in (consts.ACTION_PREPARE_ATHENA, consts.ACTION_PREPARE_QUICKSIGHT):
        if not utils.is_valid_prefix(self.destPrefix):
            raise Exception(
                "Invalid Destination S3 Bucket prefix: [{}]".format(
                    self.destPrefix))

    period_prefix = utils.get_period_prefix(self.year, self.month)
    monthSourcePrefix = self.sourcePrefix + period_prefix
    monthDestPrefix = '{}{}/{}'.format(self.destPrefix, self.accountId,
                                       period_prefix)
    report_keys = self.get_latest_aws_cur_keys(self.sourceBucket,
                                               monthSourcePrefix,
                                               self.s3sourceclient)
    destS3keys = []

    #Get content for all report files
    for rk in report_keys:
        tokens = rk.split("/")
        hash = tokens[len(tokens) - 2]  #the report's parent folder is its assembly hash
        response = self.s3sourceclient.get_object(Bucket=self.sourceBucket,
                                                  Key=rk)

        if '/var/task' in os.getcwd():  #executing as a Lambda function
            tmpLocalFolder = '/tmp'
        else:
            tmpLocalFolder = os.getcwd() + '/tmp'

        if not os.path.isdir(tmpLocalFolder):
            os.mkdir(tmpLocalFolder)

        #Temporary file downloaded from S3, before any modifications take place
        tmpLocalKey = tmpLocalFolder + '/tmp_' + rk.replace("/", "-") + '.csv.gz'
        #Final local file after any modifications take place
        finalLocalKey = tmpLocalFolder + '/' + hash + '.csv.gz'
        fileToUpload = ''
        finalS3Key = ''

        #Download the latest report as a tmp local file
        with open(tmpLocalKey, 'wb') as report:
            self.s3resource.Bucket(self.sourceBucket).download_fileobj(
                rk, report)

        #Read through the tmp local file and skip the first line (for Athena)
        record_count = 0
        if action == consts.ACTION_PREPARE_ATHENA:
            fileToUpload = finalLocalKey
            finalS3Key = monthDestPrefix + str(
                uuid.uuid1()) + "cost-and-usage-athena.csv.gz"
            with gzip.open(tmpLocalKey, 'rb') as f:
                f.next()  #skip the CSV header line, which Athena cannot ignore
                #Write contents to another tmp file, which will be uploaded to S3
                with gzip.open(finalLocalKey, 'ab') as no_header:
                    for line in f:
                        no_header.write(line)
                        record_count = record_count + 1
            print "Number of records: [{}]".format(record_count)

        #TODO: if we're using the files for QuickSight, do a Copy operation and don't download
        if action == consts.ACTION_PREPARE_QUICKSIGHT:
            fileToUpload = tmpLocalKey
            finalS3Key = monthDestPrefix + "cost-and-usage-quicksight.csv.gz"

        print "Putting: [{}/{}] in [{}/{}]".format(self.sourceBucket, rk,
                                                   self.destBucket, finalS3Key)

        with open(fileToUpload, 'rb') as data:
            self.s3destclient.upload_fileobj(data,
                                             self.destBucket,
                                             finalS3Key,
                                             ExtraArgs={
                                                 'Metadata': {
                                                     'reportId': hash
                                                 },
                                                 'StorageClass':
                                                 'REDUCED_REDUNDANCY'
                                             })
        destS3keys.append(finalS3Key)

        #Remove temporary files. This is also important to avoid Lambda errors,
        #since the local Lambda storage limit can easily be reached after a few executions.
        os.remove(tmpLocalKey)
        if os.path.isfile(finalLocalKey):  #finalLocalKey is only created for the Athena action
            os.remove(finalLocalKey)

    self.status = consts.CUR_PROCESSOR_STATUS_OK
    return destS3keys
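#A minimal usage sketch for the two steps above (hypothetical: the class name
#and constructor arguments below are illustrative, not defined in this module):
#
#  processor = CostUsageProcessor(accountId='123456789012',
#                                 sourceBucket='cur-source-bucket',
#                                 sourcePrefix='curs/',
#                                 destBucket='cur-dest-bucket',
#                                 destPrefix='athena/',
#                                 year='2018', month='01')
#  keys = processor.process_latest_aws_cur(consts.ACTION_PREPARE_ATHENA)
#  processor.create_manifest(consts.MANIFEST_TYPE_REDSHIFT,
#                            processor.destBucket, processor.destPrefix, keys)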