def multi_output_reducer(self, key, values, output_file):
    """
    Write values to the appropriate file as determined by the key.

    Write to the encrypted file by streaming through gzip, which compresses before encrypting
    """
    _date_string, org_id = key
    recipients = self.recipients_for_org_id[org_id]
    log.info('Encryption recipients: %s', str(recipients))

    def report_progress(num_bytes):
        """Update hadoop counters as the file is written"""
        self.incr_counter('Event Export', 'Bytes Written to Output', num_bytes)

    # Resolve each recipient name to the target holding its GPG public key.
    key_file_targets = []
    for recipient in recipients:
        key_url = url_path_join(self.gpg_key_dir, recipient)
        key_file_targets.append(get_target_from_url(key_url))

    with make_encrypted_file(output_file, key_file_targets, progress=report_progress) as encrypted_output_file:
        # GzipFile closes on context exit, matching the original try/finally close.
        with gzip.GzipFile(mode='wb', fileobj=encrypted_output_file) as compressed_stream:
            for value in values:
                compressed_stream.write(value.strip())
                compressed_stream.write('\n')
                # WARNING: This line ensures that Hadoop knows that our process is not sitting in an infinite loop.
                # Do not remove it.
                self.incr_counter('Event Export', 'Raw Bytes Written', len(value) + 1)
def multi_output_reducer(self, key, values, output_file):
    """
    Write values to the appropriate file as determined by the key.

    Write to the encrypted file by streaming through gzip, which compresses before encrypting
    """
    _date_string, org_id = key
    recipients = self.recipients_for_org_id[org_id]
    log.info('Encryption recipients: %s', str(recipients))

    def report_progress(num_bytes):
        """Update hadoop counters as the file is written"""
        self.incr_counter('Event Export', 'Bytes Written to Output', num_bytes)

    # One key target per recipient, looked up under the configured GPG key dir.
    key_file_targets = [
        get_target_from_url(url_path_join(self.gpg_key_dir, name))
        for name in recipients
    ]

    encryption = make_encrypted_file(output_file, key_file_targets, progress=report_progress)
    with encryption as encrypted_output_file:
        # The with-block closes the gzip stream even on error, as the
        # original try/finally did.
        with gzip.GzipFile(mode='wb', fileobj=encrypted_output_file) as gzipped:
            for value in values:
                gzipped.write(value.strip())
                gzipped.write('\n')
                # WARNING: This line ensures that Hadoop knows that our process is not sitting in an infinite loop.
                # Do not remove it.
                self.incr_counter('Event Export', 'Raw Bytes Written', len(value) + 1)
def test_make_encrypted_file_with_implied_recipients(self):
    """Encrypt a few lines for the default key targets and verify the round trip."""
    expected_lines = ['this', 'is', 'a', 'test']
    with tempfile.NamedTemporaryFile() as output_file:
        with make_encrypted_file(output_file, self.key_file_targets) as encrypted_stream:
            for line in expected_lines:
                encrypted_stream.write(line)
                encrypted_stream.write('\n')
        # Rewind so the checker reads the encrypted bytes from the start.
        output_file.seek(0)
        self.check_encrypted_data(output_file, expected_lines)
def run(self):
    """
    Collect all course files into a temp directory, then write them out as a
    single GPG-encrypted, gzipped tar archive to this task's output target.
    """
    # Always include the explicitly configured recipients; the master key
    # (if any) is added so operators can also decrypt the archive.
    recipients = set(self.recipient)
    if self.gpg_master_key is not None:
        recipients.add(self.gpg_master_key)
    # One public-key target per recipient, under the configured GPG key dir.
    key_file_targets = [
        get_target_from_url(url_path_join(self.gpg_key_dir, recipient))
        for recipient in recipients
    ]
    path_task = PathSetTask([self.course_files_url], ['*.*'])
    with make_temp_directory(prefix='obfuscate-archive.', dir=self.temporary_dir) as tmp_directory:
        # Stage every matched file into tmp_directory, preserving its
        # layout relative to course_files_url.
        for target in path_task.output():
            with target.open('r') as input_file:
                # Get path without urlscheme.
                course_files_path = urlparse.urlparse(self.course_files_url).path
                # Calculates target's relative path to course_files_path by getting the substring that
                # occurs after course_files_path substring in target's path.
                # Needed as target.path returns path with urlscheme for s3target & without for hdfstarget.
                # Examples:
                # target.path: /pipeline/output/edX_Demo_Course/events/edX_Demo_Course-events-2015-08-30.log.gz
                # relative_path: events/edX_Demo_Course-events-2015-08-30.log.gz
                # target.path: s3://some_bucket/output/edX_Demo_Course/state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                # relative_path: state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                r_index = target.path.find(course_files_path) + len(course_files_path)
                relative_path = target.path[r_index:].lstrip('/')
                local_file_path = os.path.join(tmp_directory, relative_path)
                try:
                    os.makedirs(os.path.dirname(local_file_path))
                except OSError as exc:
                    # Parent directory may already exist from an earlier file
                    # in the same subtree; anything else is a real error.
                    if exc.errno != errno.EEXIST:
                        raise
                with open(local_file_path, 'w') as temp_file:
                    copy_file_to_file(input_file, temp_file)

        def report_encrypt_progress(num_bytes):
            """Log encryption progress."""
            log.info('Encrypted %d bytes', num_bytes)

        # Tar + gzip the staged tree, streaming through the encrypting
        # wrapper so plaintext never lands in the output target.
        with self.output().open('w') as output_file:
            with make_encrypted_file(
                output_file, key_file_targets, progress=report_encrypt_progress, dir=self.temporary_dir
            ) as encrypted_output_file:
                # arcname='' keeps archive members relative (no tmp dir prefix).
                with tarfile.open(mode='w:gz', fileobj=encrypted_output_file) as output_archive_file:
                    output_archive_file.add(tmp_directory, arcname='')
def test_make_encrypted_file_with_implied_recipients(self):
    """Write lines through the encrypting wrapper and check the decrypted result."""
    payload = ['this', 'is', 'a', 'test']
    with tempfile.NamedTemporaryFile() as output_file:
        encryptor = make_encrypted_file(output_file, self.key_file_targets)
        with encryptor as encrypted_output:
            for item in payload:
                encrypted_output.write(item)
                encrypted_output.write('\n')
        # Seek back to the beginning before validating the ciphertext.
        output_file.seek(0)
        self.check_encrypted_data(output_file, payload)
def multi_output_reducer(self, key, values, output_file):
    """
    Write values to the appropriate file as determined by the key.

    Write to the encrypted file by streaming through gzip, which compresses before encrypting
    """
    _date_string, org_id = key
    recipients = self._get_recipients(org_id)

    # Map each recipient to the target holding its GPG public key.
    key_file_targets = []
    for recipient in recipients:
        key_file_targets.append(
            get_target_from_url(url_path_join(self.gpg_key_dir, recipient))
        )

    with make_encrypted_file(output_file, key_file_targets) as encrypted_output_file:
        # Context exit closes the gzip stream, as the original try/finally did.
        with gzip.GzipFile(mode='wb', fileobj=encrypted_output_file) as compressed_stream:
            for value in values:
                compressed_stream.write(value.strip())
                compressed_stream.write('\n')
def run(self):
    """
    Gather the course files into a temp directory and emit them as one
    GPG-encrypted gzipped tar archive at this task's output target.
    """
    # Configured recipients plus (optionally) the master key, so operators
    # can always decrypt the archive.
    recipients = set(self.recipient)
    if self.gpg_master_key is not None:
        recipients.add(self.gpg_master_key)
    # One public-key target per recipient under the GPG key directory.
    key_file_targets = [
        get_target_from_url(url_path_join(self.gpg_key_dir, recipient))
        for recipient in recipients
    ]
    path_task = PathSetTask([self.course_files_url], ['*.*'])
    with make_temp_directory(prefix='obfuscate-archive.', dir=self.temporary_dir) as tmp_directory:
        # Stage each matched file locally, keeping its path relative to
        # course_files_url.
        for target in path_task.output():
            with target.open('r') as input_file:
                # Get path without urlscheme.
                course_files_path = urlparse.urlparse(self.course_files_url).path
                # Calculates target's relative path to course_files_path by getting the substring that
                # occurs after course_files_path substring in target's path.
                # Needed as target.path returns path with urlscheme for s3target & without for hdfstarget.
                # Examples:
                # target.path: /pipeline/output/edX_Demo_Course/events/edX_Demo_Course-events-2015-08-30.log.gz
                # relative_path: events/edX_Demo_Course-events-2015-08-30.log.gz
                # target.path: s3://some_bucket/output/edX_Demo_Course/state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                # relative_path: state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                r_index = target.path.find(course_files_path) + len(course_files_path)
                relative_path = target.path[r_index:].lstrip('/')
                local_file_path = os.path.join(tmp_directory, relative_path)
                try:
                    os.makedirs(os.path.dirname(local_file_path))
                except OSError as exc:
                    # The parent directory may already exist from a prior
                    # file in the same subtree; re-raise anything else.
                    if exc.errno != errno.EEXIST:
                        raise
                with open(local_file_path, 'w') as temp_file:
                    copy_file_to_file(input_file, temp_file)

        def report_encrypt_progress(num_bytes):
            """Log encryption progress."""
            log.info('Encrypted %d bytes', num_bytes)

        # Stream the tar+gzip of the staged tree through the encrypting
        # wrapper so no plaintext reaches the output target.
        with self.output().open('w') as output_file:
            with make_encrypted_file(
                output_file, key_file_targets, progress=report_encrypt_progress, dir=self.temporary_dir
            ) as encrypted_output_file:
                # arcname='' strips the temp-directory prefix from members.
                with tarfile.open(mode='w:gz', fileobj=encrypted_output_file) as output_archive_file:
                    output_archive_file.add(tmp_directory, arcname='')
def multi_output_reducer(self, key, values, output_file):
    """
    Write values to the appropriate file as determined by the key.

    Write to the encrypted file by streaming through gzip, which compresses before encrypting
    """
    _date_string, org_id = key
    recipients = self._get_recipients(org_id)

    # Resolve recipient names to GPG public-key targets.
    key_file_targets = [
        get_target_from_url(url_path_join(self.gpg_key_dir, name))
        for name in recipients
    ]

    with make_encrypted_file(output_file, key_file_targets) as encrypted_output_file:
        # Using the gzip stream as a context manager guarantees the close
        # that the original performed in a finally clause.
        with gzip.GzipFile(mode='wb', fileobj=encrypted_output_file) as gzipped:
            for record in values:
                gzipped.write(record.strip())
                gzipped.write('\n')