Ejemplo n.º 1
0
    def multi_output_reducer(self, key, values, output_file):
        """
        Stream every value for one key into ``output_file``, gzipped and then encrypted.

        ``key`` unpacks to a date string (unused here) and an org id; the org id
        selects the encryption recipients from ``self.recipients_for_org_id``.
        Each value is stripped and written on its own line into a gzip stream that
        sits on top of the encrypted file, so data is compressed before it is
        encrypted.
        """
        _date_string, org_id = key
        recipients = self.recipients_for_org_id[org_id]
        log.info('Encryption recipients: %s', str(recipients))

        def report_progress(num_bytes):
            """Forward the number of bytes written to the hadoop counter."""
            self.incr_counter('Event Export', 'Bytes Written to Output', num_bytes)

        # One key-file target per recipient, resolved relative to the GPG key directory.
        key_file_targets = []
        for recipient in recipients:
            key_url = url_path_join(self.gpg_key_dir, recipient)
            key_file_targets.append(get_target_from_url(key_url))

        with make_encrypted_file(output_file, key_file_targets, progress=report_progress) as encrypted_sink:
            gzip_stream = gzip.GzipFile(mode='wb', fileobj=encrypted_sink)
            try:
                for raw_value in values:
                    gzip_stream.write(raw_value.strip())
                    gzip_stream.write('\n')
                    # WARNING: This counter update ensures that Hadoop knows that our process is not
                    # sitting in an infinite loop.  Do not remove it.
                    self.incr_counter('Event Export', 'Raw Bytes Written', len(raw_value) + 1)
            finally:
                # Always close the gzip stream, even if a write fails part-way through.
                gzip_stream.close()
Ejemplo n.º 2
0
    def multi_output_reducer(self, key, values, output_file):
        """
        Stream every value for one key into ``output_file``, gzipped and then encrypted.

        ``key`` unpacks to a date string (unused here) and an org id; the org id
        selects the encryption recipients from ``self.recipients_for_org_id``.
        Each value is stripped and written on its own line into a gzip stream that
        sits on top of the encrypted file, so data is compressed before it is
        encrypted.

        An ``IOError`` raised while producing the file is logged and recorded in a
        per-organization error counter instead of being propagated.
        """
        _date_string, org_id = key
        recipients = self.recipients_for_org_id[org_id]
        log.info('Encryption recipients: %s', str(recipients))

        def report_progress(num_bytes):
            """Forward the number of bytes written to the hadoop counter."""
            self.event_export_counter(counter_title='Bytes Written to Output', incr_value=num_bytes)

        # One key-file target per recipient, resolved relative to the GPG key directory.
        key_file_targets = []
        for recipient in recipients:
            key_url = url_path_join(self.gpg_key_dir, recipient)
            key_file_targets.append(get_target_from_url(key_url))

        try:
            with make_encrypted_file(
                output_file,
                key_file_targets,
                progress=report_progress,
                hadoop_counter_incr_func=self.event_export_counter
            ) as encrypted_sink:
                gzip_stream = gzip.GzipFile(mode='wb', fileobj=encrypted_sink)
                try:
                    for raw_value in values:
                        gzip_stream.write(raw_value.strip())
                        gzip_stream.write('\n')
                        # WARNING: This counter update ensures that Hadoop knows that our process is not
                        # sitting in an infinite loop.  Do not remove it.
                        written = len(raw_value) + 1
                        self.event_export_counter(counter_title='Raw Bytes Written', incr_value=written)
                finally:
                    # Always close the gzip stream, even if a write fails part-way through.
                    gzip_stream.close()
        except IOError as err:
            log.error("Error encountered while encrypting and gzipping Organization: %s file: %s Exception: %s",
                      org_id, key_file_targets, err)
            # This counter is set when there is an error during the generation of the encryption file for an
            # organization for any reason, including encryption errors related to an expired GPG key.
            error_counter_title = "{} org with Errors".format(org_id)
            self.event_export_counter(counter_title=error_counter_title, incr_value=1)
    def test_make_encrypted_file_with_implied_recipients(self):
        """Encrypt a few lines using only the key file targets, then check the encrypted data."""
        expected_lines = ['this', 'is', 'a', 'test']
        with tempfile.NamedTemporaryFile() as scratch_file:
            with make_encrypted_file(scratch_file, self.key_file_targets) as encrypted_sink:
                for line in expected_lines:
                    encrypted_sink.write(line)
                    encrypted_sink.write('\n')

            # Rewind so the check reads the encrypted content from the start.
            scratch_file.seek(0)
            self.check_encrypted_data(scratch_file, expected_lines)
    def test_make_encrypted_file_with_implied_recipients(self):
        """Write sample lines through make_encrypted_file and verify the resulting encrypted data."""
        sample_lines = ['this', 'is', 'a', 'test']
        with tempfile.NamedTemporaryFile() as temp_output:
            with make_encrypted_file(temp_output, self.key_file_targets) as encrypted_stream:
                for sample in sample_lines:
                    encrypted_stream.write(sample)
                    encrypted_stream.write('\n')

            # Return to the beginning of the file before handing it to the checker.
            temp_output.seek(0)
            self.check_encrypted_data(temp_output, sample_lines)
    def run(self):
        """
        Copy the course files into a temporary directory and write them out as an encrypted tar.gz.

        The encryption recipients are ``self.recipient`` plus ``self.gpg_master_key``
        when one is set.  Every target produced for ``self.course_files_url`` is
        copied to a local path that mirrors its location relative to that URL, and
        the temporary directory is then added to a gzipped tar archive that is
        streamed through ``make_encrypted_file`` into this task's output.
        """
        recipients = set(self.recipient)
        if self.gpg_master_key is not None:
            recipients.add(self.gpg_master_key)

        key_file_targets = []
        for recipient in recipients:
            key_url = url_path_join(self.gpg_key_dir, recipient)
            key_file_targets.append(get_target_from_url(key_url))

        def report_encrypt_progress(num_bytes):
            """Log encryption progress."""
            log.info('Encrypted %d bytes', num_bytes)

        path_task = PathSetTask([self.course_files_url], ['*.*'])
        with make_temp_directory(prefix='obfuscate-archive.', dir=self.temporary_dir) as tmp_directory:
            for target in path_task.output():
                with target.open('r') as input_file:
                    # Get path without urlscheme.
                    course_files_path = urlparse.urlparse(self.course_files_url).path
                    # The relative path is whatever follows course_files_path inside target.path.
                    # This is needed because target.path returns the path with the urlscheme for
                    # s3target and without it for hdfstarget.
                    # Examples:
                    # target.path: /pipeline/output/edX_Demo_Course/events/edX_Demo_Course-events-2015-08-30.log.gz
                    # relative_path: events/edX_Demo_Course-events-2015-08-30.log.gz
                    # target.path: s3://some_bucket/output/edX_Demo_Course/state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                    # relative_path: state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                    prefix_end = target.path.find(course_files_path) + len(course_files_path)
                    relative_path = target.path[prefix_end:].lstrip('/')

                    local_file_path = os.path.join(tmp_directory, relative_path)
                    parent_directory = os.path.dirname(local_file_path)
                    try:
                        os.makedirs(parent_directory)
                    except OSError as exc:
                        # An already-existing directory is fine; anything else is a real failure.
                        if exc.errno != errno.EEXIST:
                            raise
                    with open(local_file_path, 'w') as temp_file:
                        copy_file_to_file(input_file, temp_file)

            with self.output().open('w') as output_file:
                with make_encrypted_file(output_file, key_file_targets, progress=report_encrypt_progress,
                                         dir=self.temporary_dir) as encrypted_output_file:
                    with tarfile.open(mode='w:gz', fileobj=encrypted_output_file) as output_archive_file:
                        output_archive_file.add(tmp_directory, arcname='')
Ejemplo n.º 6
0
    def run(self):
        """
        Stage the course files locally, then archive, compress and encrypt them into the output.

        Recipients are taken from ``self.recipient``, with ``self.gpg_master_key``
        added when it is not None.  Each target found under ``self.course_files_url``
        is copied into a temporary directory at its path relative to that URL; the
        directory is then written as a gzipped tar archive through
        ``make_encrypted_file`` into this task's output.
        """
        recipients = set(self.recipient)
        if self.gpg_master_key is not None:
            recipients.add(self.gpg_master_key)

        key_file_targets = []
        for recipient in recipients:
            key_file_targets.append(get_target_from_url(url_path_join(self.gpg_key_dir, recipient)))

        def report_encrypt_progress(num_bytes):
            """Log encryption progress."""
            log.info('Encrypted %d bytes', num_bytes)

        path_task = PathSetTask([self.course_files_url], ['*.*'])
        with make_temp_directory(
                prefix='obfuscate-archive.',
                dir=self.temporary_dir) as tmp_directory:
            for target in path_task.output():
                with target.open('r') as input_file:
                    # Get path without urlscheme.
                    course_files_path = urlparse.urlparse(self.course_files_url).path
                    # Take the part of target.path that follows course_files_path.  This is needed
                    # because target.path returns the path with the urlscheme for s3target and
                    # without it for hdfstarget.
                    # Examples:
                    # target.path: /pipeline/output/edX_Demo_Course/events/edX_Demo_Course-events-2015-08-30.log.gz
                    # relative_path: events/edX_Demo_Course-events-2015-08-30.log.gz
                    # target.path: s3://some_bucket/output/edX_Demo_Course/state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                    # relative_path: state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                    match_start = target.path.find(course_files_path)
                    suffix = target.path[match_start + len(course_files_path):]
                    relative_path = suffix.lstrip('/')

                    local_file_path = os.path.join(tmp_directory, relative_path)
                    try:
                        os.makedirs(os.path.dirname(local_file_path))
                    except OSError as exc:
                        # Tolerate a directory that already exists; re-raise every other error.
                        if exc.errno != errno.EEXIST:
                            raise
                    with open(local_file_path, 'w') as temp_file:
                        copy_file_to_file(input_file, temp_file)

            with self.output().open('w') as output_file:
                with make_encrypted_file(
                        output_file,
                        key_file_targets,
                        progress=report_encrypt_progress,
                        dir=self.temporary_dir) as encrypted_output_file:
                    with tarfile.open(mode='w:gz', fileobj=encrypted_output_file) as output_archive_file:
                        output_archive_file.add(tmp_directory, arcname='')