    def validate_obfuscation(self):
        """Validates obfuscation workflow."""
        output_target = self.get_targets_from_remote_path(self.test_out, "*.tar.gz.gpg")[0]
        output_filename = os.path.basename(output_target.path)
        temp_output_filepath = os.path.join(self.temporary_dir, output_filename)

        with output_target.open("r") as input_file:
            with open(temp_output_filepath, "w") as output_file:
                copy_file_to_file(input_file, output_file)

        decrypted_filepath = temp_output_filepath[: -len(".gpg")]
        fs.decrypt_file(temp_output_filepath, decrypted_filepath, "insecure_secret.key")

        with tarfile.open(decrypted_filepath, "r:gz") as tfile:
            tfile.extractall(self.temporary_dir)

        # Validate package metadata info.
        metadata_filepath = os.path.join(self.temporary_dir, "metadata_file.json")
        with open(metadata_filepath) as metadata_file:
            metadata_info = json.load(metadata_file)
        self.assertItemsEqual(metadata_info["format_version"], self.FORMAT_VERSION)
        self.assertItemsEqual(metadata_info["pipeline_version"], self.PIPELINE_VERSION)

        self.validate_data_obfuscation()
        self.validate_events_obfuscation()
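The decryption step above goes through a helper, fs.decrypt_file, that is not shown in this listing. Below is a minimal sketch of what such a helper could look like, assuming it wraps python-gnupg (already used elsewhere here); the function name, signature, and error handling are illustrative assumptions, not the pipeline's actual code.

# A minimal sketch, assuming a python-gnupg based helper; names and arguments are hypothetical.
import gnupg

def decrypt_file_sketch(encrypted_filepath, decrypted_filepath, key_filepath, gnupghome):
    """Decrypt encrypted_filepath to decrypted_filepath using the private key in key_filepath."""
    gpg = gnupg.GPG(gnupghome=gnupghome)
    # Load the private key needed for decryption.
    with open(key_filepath) as key_file:
        gpg.import_keys(key_file.read())
    # Decrypt directly to the output path.
    with open(encrypted_filepath, 'rb') as encrypted_file:
        result = gpg.decrypt_file(encrypted_file, output=decrypted_filepath)
    if not result.ok:
        raise RuntimeError('Decryption failed: {}'.format(result.status))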
@contextmanager  # contextlib.contextmanager: needed because this function yields and is used via `with`
def make_encrypted_file(output_file, key_file_targets, recipients=None, progress=None, dir=None,
                        hadoop_counter_incr_func=DEFAULT_HADOOP_COUNTER_FUNC):
    """
    Creates a file object to be written to, whose contents will afterwards be encrypted.

    Parameters:
        output_file:  a file object, opened for writing.
        key_file_targets: a list of luigi.Target objects defining the gpg public key files to be loaded.
        recipients:  an optional list of recipients to be loaded.  If not specified, uses all loaded keys.
        progress:  a function that is called periodically as progress is made.
        hadoop_counter_incr_func:  A callback to a function that can generate MR counters so that non-critical GPG
            messages can be promoted to a visible section of the MR run log.
    """
    with make_temp_directory(prefix="encrypt", dir=dir) as temp_dir:
        # Use temp directory to hold gpg keys.
        gpg = gnupg.GPG(gnupghome=temp_dir)
        gpg.encoding = 'utf-8'
        _import_key_files(gpg_instance=gpg, key_file_targets=key_file_targets,
                          hadoop_counter_incr_func=hadoop_counter_incr_func)

        # Create a temp file to contain the unencrypted output, in the same temp directory.
        with tempfile.NamedTemporaryFile(dir=temp_dir, delete=False) as temp_input_file:
            temp_input_filepath = temp_input_file.name
            log.info('Writing data to temporary file: %s', temp_input_filepath)
            yield temp_input_file

        # Encryption produces a second file in the same temp directory.
        temp_encrypted_filepath = "{filepath}.gpg".format(filepath=temp_input_filepath)
        if recipients is None:
            recipients = [key['keyid'] for key in gpg.list_keys()]
        with open(temp_input_filepath, 'r') as temp_input_file:
            _encrypt_file(gpg, temp_input_file, temp_encrypted_filepath, recipients)
        with open(temp_encrypted_filepath) as temp_encrypted_file:
            copy_file_to_file(temp_encrypted_file, output_file, progress)
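Because the function yields the temporary file object, it is used as a context manager (the run methods later in this listing call it that way). A minimal usage sketch follows; output_path and key_urls are hypothetical placeholders, not pipeline parameters.

# Usage sketch only: output_path and key_urls are hypothetical placeholders.
key_file_targets = [get_target_from_url(url) for url in key_urls]
with open(output_path, 'w') as output_file:
    with make_encrypted_file(output_file, key_file_targets) as unencrypted_file:
        unencrypted_file.write('line of sensitive data\n')
# On exit, the temporary plaintext is encrypted to all imported keys and the
# ciphertext is copied into output_file.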
    def run(self):
        with self.output().open('w') as output_file:
            with self.input()['data'][0].open('r') as input_file:
                with make_temp_directory(prefix='obfuscate-course.') as tmp_directory:
                    with tempfile.TemporaryFile() as temp_input_file:
                        # We cannot seek in HDFS streams, so copy the file to the local disk before extracting
                        copy_file_to_file(input_file, temp_input_file)
                        temp_input_file.flush()
                        temp_input_file.seek(0)

                        with tarfile.open(mode='r:gz', fileobj=temp_input_file) as course_archive:
                            course_archive.extractall(tmp_directory)

                        course_dir = os.listdir(tmp_directory)[0]
                        root_dir = os.path.join(tmp_directory, course_dir)

                        self.clean_drafts(root_dir)

                        course_package_ref = self.read_course_package_ref(root_dir)

                        policy_file_path = os.path.join(root_dir, 'policies', course_package_ref, 'policy.json')
                        self.clean_course_policy(course_package_ref, policy_file_path)

                        self.clean_xml_files(root_dir)

                        with tarfile.open(mode='w:gz', fileobj=output_file) as output_archive_file:
                            output_archive_file.add(tmp_directory, arcname='')
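The HDFS comment above reflects a general constraint: tarfile needs a seekable file object, so a non-seekable stream must be buffered to a local temporary file and rewound before extraction. Here is a standalone sketch of that pattern, using shutil.copyfileobj in place of the pipeline's copy_file_to_file helper; the function name is illustrative.

import shutil
import tarfile
import tempfile

def extract_tar_from_stream(stream, destination_dir):
    """Buffer a non-seekable stream into a seekable temp file, then extract the gzipped tarball."""
    with tempfile.TemporaryFile() as buffered:
        # Copy the stream to local disk so tarfile can seek within it.
        shutil.copyfileobj(stream, buffered)
        buffered.flush()
        buffered.seek(0)
        with tarfile.open(mode='r:gz', fileobj=buffered) as archive:
            archive.extractall(destination_dir)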
    def validate_obfuscation(self):
        """Validates obfuscation workflow."""
        output_target = PathSetTask([self.test_out], ['*.tar.gz.gpg']).output()[0]
        output_filename = os.path.basename(output_target.path)
        output_filepath = os.path.join(self.temporary_dir, output_filename)

        if output_target.path.startswith('s3://'):
            output_target = get_target_from_url(output_target.path.replace('s3://', 's3+https://'))

        with output_target.open('r') as input_file:
            with open(output_filepath, 'w') as output_file:
                copy_file_to_file(input_file, output_file)

        decrypted_filepath = output_filepath[:-len('.gpg')]
        fs.decrypt_file(output_filepath, decrypted_filepath, 'insecure_secret.key')

        with tarfile.open(decrypted_filepath, 'r:gz') as tfile:
            tfile.extractall(self.temporary_dir)

        # Validate package metadata info.
        metadata_filepath = os.path.join(self.temporary_dir, 'metadata_file.json')
        with open(metadata_filepath) as metadata_file:
            metadata_info = json.load(metadata_file)
        self.assertItemsEqual(metadata_info['format_version'], self.FORMAT_VERSION)
        self.assertItemsEqual(metadata_info['pipeline_version'], self.PIPELINE_VERSION)

        self.validate_data_obfuscation()
        self.validate_events_obfuscation()
@contextmanager  # contextlib.contextmanager: needed because this function yields and is used via `with`
def make_encrypted_file(output_file, key_file_targets, recipients=None, progress=None):
    """
    Creates a file object to be written to, whose contents will afterwards be encrypted.

    Parameters:
        output_file:  a file object, opened for writing.
        key_file_targets: a list of luigi.Target objects defining the gpg public key files to be loaded.
        recipients:  an optional list of recipients to be loaded.  If not specified, uses all loaded keys.
        progress:  a function that is called periodically as progress is made.
    """
    with make_temp_directory(prefix="encrypt") as temp_dir:
        # Use temp directory to hold gpg keys.
        gpg = gnupg.GPG(gnupghome=temp_dir)
        gpg.encoding = 'utf-8'
        _import_key_files(gpg, key_file_targets)

        # Create a temp file to contain the unencrypted output, in the same temp directory.
        with tempfile.NamedTemporaryFile(dir=temp_dir, delete=False) as temp_input_file:
            temp_input_filepath = temp_input_file.name
            log.info('Writing data to temporary file: %s', temp_input_filepath)
            yield temp_input_file

        # Encryption produces a second file in the same temp directory.
        temp_encrypted_filepath = "{filepath}.gpg".format(filepath=temp_input_filepath)
        if recipients is None:
            recipients = [key['keyid'] for key in gpg.list_keys()]
        with open(temp_input_filepath, 'r') as temp_input_file:
            _encrypt_file(gpg, temp_input_file, temp_encrypted_filepath, recipients)
        with open(temp_encrypted_filepath) as temp_encrypted_file:
            copy_file_to_file(temp_encrypted_file, output_file, progress)
    def run(self):
        recipients = set(self.recipient)
        if self.gpg_master_key is not None:
            recipients.add(self.gpg_master_key)
        key_file_targets = [
            get_target_from_url(url_path_join(self.gpg_key_dir, recipient))
            for recipient in recipients
        ]

        path_task = PathSetTask([self.course_files_url], ['*.*'])
        with make_temp_directory(prefix='obfuscate-archive.', dir=self.temporary_dir) as tmp_directory:
            for target in path_task.output():
                with target.open('r') as input_file:
                    # Get path without urlscheme.
                    course_files_path = urlparse.urlparse(self.course_files_url).path
                    # Calculate the target's path relative to course_files_path by taking the substring
                    # of target.path that follows course_files_path. This is needed because target.path
                    # includes the URL scheme for S3 targets but not for HDFS targets.
                    # Examples:
                    # target.path: /pipeline/output/edX_Demo_Course/events/edX_Demo_Course-events-2015-08-30.log.gz
                    # relative_path: events/edX_Demo_Course-events-2015-08-30.log.gz
                    # target.path: s3://some_bucket/output/edX_Demo_Course/state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                    # relative_path: state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                    r_index = target.path.find(course_files_path) + len(course_files_path)
                    relative_path = target.path[r_index:].lstrip('/')

                    local_file_path = os.path.join(tmp_directory, relative_path)
                    try:
                        os.makedirs(os.path.dirname(local_file_path))
                    except OSError as exc:
                        if exc.errno != errno.EEXIST:
                            raise
                    with open(local_file_path, 'w') as temp_file:
                        copy_file_to_file(input_file, temp_file)

            def report_encrypt_progress(num_bytes):
                """Log encryption progress."""
                log.info('Encrypted %d bytes', num_bytes)

            with self.output().open('w') as output_file:
                with make_encrypted_file(
                    output_file, key_file_targets, progress=report_encrypt_progress, dir=self.temporary_dir
                ) as encrypted_output_file:
                    with tarfile.open(mode='w:gz', fileobj=encrypted_output_file) as output_archive_file:
                        output_archive_file.add(tmp_directory, arcname='')
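The relative-path computation documented in the comments above can be checked in isolation. A small sketch using the first documented example; the base path '/pipeline/output/edX_Demo_Course' and the function name are assumptions made for illustration, not values from a real configuration.

def relative_to_course_files(target_path, course_files_path):
    """Return target_path relative to course_files_path, ignoring any URL scheme prefix."""
    r_index = target_path.find(course_files_path) + len(course_files_path)
    return target_path[r_index:].lstrip('/')

print(relative_to_course_files(
    '/pipeline/output/edX_Demo_Course/events/edX_Demo_Course-events-2015-08-30.log.gz',
    '/pipeline/output/edX_Demo_Course',
))
# -> events/edX_Demo_Course-events-2015-08-30.log.gz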