def validate_obfuscation(self):
        """
        Validate the obfuscation workflow end-to-end.

        Fetches the encrypted archive produced by the pipeline, decrypts and
        unpacks it, checks the package metadata, then delegates to the data-
        and event-level validators.
        """
        output_target = PathSetTask([self.test_out], ['*.tar.gz.gpg']).output()[0]
        output_filename = os.path.basename(output_target.path)
        output_filepath = os.path.join(self.temporary_dir, output_filename)

        # Rewrite to a scheme readable from this host when output is on S3.
        if output_target.path.startswith('s3://'):
            output_target = get_target_from_url(output_target.path.replace('s3://', 's3+https://'))

        with output_target.open('r') as input_file:
            with open(output_filepath, 'w') as output_file:
                copy_file_to_file(input_file, output_file)

        decrypted_filepath = output_filepath[:-len('.gpg')]
        fs.decrypt_file(output_filepath, decrypted_filepath, 'insecure_secret.key')

        with tarfile.open(decrypted_filepath, 'r:gz') as tfile:
            tfile.extractall(self.temporary_dir)

        # Validate package metadata info.
        metadata_filepath = os.path.join(self.temporary_dir, 'metadata_file.json')
        with open(metadata_filepath) as metadata_file:
            metadata_info = json.load(metadata_file)
        # Versions are scalar values: compare directly.  assertItemsEqual
        # treats a string as an unordered collection of characters, so any
        # anagram of the expected version would have passed.
        self.assertEqual(metadata_info['format_version'], self.FORMAT_VERSION)
        self.assertEqual(metadata_info['pipeline_version'], self.PIPELINE_VERSION)

        self.validate_data_obfuscation()
        self.validate_events_obfuscation()
    def validate_obfuscation(self):
        """
        Validate the obfuscation workflow end-to-end.

        Fetches the encrypted archive produced by the pipeline, decrypts and
        unpacks it, checks the package metadata, then delegates to the data-
        and event-level validators.
        """
        output_target = PathSetTask([self.test_out],
                                    ['*.tar.gz.gpg']).output()[0]
        output_filename = os.path.basename(output_target.path)
        output_filepath = os.path.join(self.temporary_dir, output_filename)

        # Rewrite to a scheme readable from this host when output is on S3.
        if output_target.path.startswith('s3://'):
            output_target = get_target_from_url(
                output_target.path.replace('s3://', 's3+https://'))

        with output_target.open('r') as input_file:
            with open(output_filepath, 'w') as output_file:
                copy_file_to_file(input_file, output_file)

        decrypted_filepath = output_filepath[:-len('.gpg')]
        fs.decrypt_file(output_filepath, decrypted_filepath,
                        'insecure_secret.key')

        with tarfile.open(decrypted_filepath, 'r:gz') as tfile:
            tfile.extractall(self.temporary_dir)

        # Validate package metadata info.
        metadata_filepath = os.path.join(self.temporary_dir,
                                         'metadata_file.json')
        with open(metadata_filepath) as metadata_file:
            metadata_info = json.load(metadata_file)
        # Versions are scalar values: compare directly.  assertItemsEqual
        # treats a string as an unordered collection of characters, so any
        # anagram of the expected version would have passed.
        self.assertEqual(metadata_info['format_version'],
                         self.FORMAT_VERSION)
        self.assertEqual(metadata_info['pipeline_version'],
                         self.PIPELINE_VERSION)

        self.validate_data_obfuscation()
        self.validate_events_obfuscation()
# Example #3
# 0
    def run(self):
        """
        Stage every course file into a temp directory, then package the
        result as a single GPG-encrypted gzipped tarball at self.output().
        """
        recipients = set(self.recipient)
        if self.gpg_master_key is not None:
            recipients.add(self.gpg_master_key)
        key_file_targets = [
            get_target_from_url(url_path_join(self.gpg_key_dir, recipient))
            for recipient in recipients
        ]

        path_task = PathSetTask([self.course_files_url], ['*.*'])
        # Get path without urlscheme.  Loop-invariant, so parse the URL once
        # instead of once per target.
        course_files_path = urlparse.urlparse(self.course_files_url).path
        with make_temp_directory(prefix='obfuscate-archive.',
                                 dir=self.temporary_dir) as tmp_directory:
            for target in path_task.output():
                with target.open('r') as input_file:
                    # Calculates target's relative path to course_files_path by getting the substring that
                    # occurs after course_files_path substring in target's path.
                    # Needed as target.path returns path with urlscheme for s3target & without for hdfstarget.
                    # Examples:
                    # target.path: /pipeline/output/edX_Demo_Course/events/edX_Demo_Course-events-2015-08-30.log.gz
                    # relative_path: events/edX_Demo_Course-events-2015-08-30.log.gz
                    # target.path: s3://some_bucket/output/edX_Demo_Course/state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                    # relative_path: state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                    r_index = target.path.find(course_files_path) + len(
                        course_files_path)
                    relative_path = target.path[r_index:].lstrip('/')

                    local_file_path = os.path.join(tmp_directory,
                                                   relative_path)
                    try:
                        os.makedirs(os.path.dirname(local_file_path))
                    except OSError as exc:
                        # The directory may already exist from an earlier file.
                        if exc.errno != errno.EEXIST:
                            raise
                    with open(local_file_path, 'w') as temp_file:
                        copy_file_to_file(input_file, temp_file)

            def report_encrypt_progress(num_bytes):
                """Log encryption progress."""
                log.info('Encrypted %d bytes', num_bytes)

            with self.output().open('w') as output_file:
                with make_encrypted_file(
                        output_file,
                        key_file_targets,
                        progress=report_encrypt_progress,
                        dir=self.temporary_dir) as encrypted_output_file:
                    with tarfile.open(mode='w:gz',
                                      fileobj=encrypted_output_file
                                      ) as output_archive_file:
                        output_archive_file.add(tmp_directory, arcname='')
 def get_targets_from_remote_path(remote_path, pattern='*'):
     """Return the targets under `remote_path` matching `pattern`, adapted for the local server."""
     raw_targets = PathSetTask([remote_path], [pattern]).output()
     return [modify_target_for_local_server(raw_target) for raw_target in raw_targets]
    def __init__(self, *args, **kwargs):
        """
        Resolve the latest dated state dump for the course, and record the
        matching input (data) and output directories on the instance.

        Raises Exception when no auth_userprofile dump can be found.
        """
        super(ObfuscatedCourseDumpTask, self).__init__(*args, **kwargs)

        filename_safe_course_id = opaque_key_util.get_filename_safe_course_id(
            self.course)
        dump_path = url_path_join(self.dump_root, filename_safe_course_id,
                                  'state')
        auth_userprofile_targets = PathSetTask(
            [dump_path], ['*auth_userprofile*']).output()
        # TODO: Refactor out this logic of getting latest file. Right now we expect a date, so we use that
        dates = [
            target.path.rsplit('/', 2)[-2]
            for target in auth_userprofile_targets
        ]
        # TODO: Make the date a parameter that defaults to the most recent, but allows the user to override?
        # Fail with a clear message rather than a cryptic 'index out of range'
        # error when no data is found.
        if not dates:
            raise Exception(
                'Missing auth_userprofile data file in {}'.format(dump_path))
        # The dates are ISO-formatted directory names, so max() picks the
        # latest one in a single O(n) pass -- no need to sort.
        latest_date = max(dates)
        self.data_directory = url_path_join(self.dump_root,
                                            filename_safe_course_id, 'state',
                                            latest_date)
        self.output_directory = url_path_join(self.output_root,
                                              filename_safe_course_id, 'state',
                                              latest_date)
    def run(self):
        """
        Stage every course file into a temp directory, then package the
        result as a single GPG-encrypted gzipped tarball at self.output().
        """
        recipients = set(self.recipient)
        if self.gpg_master_key is not None:
            recipients.add(self.gpg_master_key)
        key_file_targets = [
            get_target_from_url(url_path_join(self.gpg_key_dir, recipient))
            for recipient in recipients
        ]

        path_task = PathSetTask([self.course_files_url], ['*.*'])
        # Get path without urlscheme.  Loop-invariant, so parse the URL once
        # instead of once per target.
        course_files_path = urlparse.urlparse(self.course_files_url).path
        with make_temp_directory(prefix='obfuscate-archive.', dir=self.temporary_dir) as tmp_directory:
            for target in path_task.output():
                with target.open('r') as input_file:
                    # Calculates target's relative path to course_files_path by getting the substring that
                    # occurs after course_files_path substring in target's path.
                    # Needed as target.path returns path with urlscheme for s3target & without for hdfstarget.
                    # Examples:
                    # target.path: /pipeline/output/edX_Demo_Course/events/edX_Demo_Course-events-2015-08-30.log.gz
                    # relative_path: events/edX_Demo_Course-events-2015-08-30.log.gz
                    # target.path: s3://some_bucket/output/edX_Demo_Course/state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                    # relative_path: state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                    r_index = target.path.find(course_files_path) + len(course_files_path)
                    relative_path = target.path[r_index:].lstrip('/')

                    local_file_path = os.path.join(tmp_directory, relative_path)
                    try:
                        os.makedirs(os.path.dirname(local_file_path))
                    except OSError as exc:
                        # The directory may already exist from an earlier file.
                        if exc.errno != errno.EEXIST:
                            raise
                    with open(local_file_path, 'w') as temp_file:
                        copy_file_to_file(input_file, temp_file)

            def report_encrypt_progress(num_bytes):
                """Log encryption progress."""
                log.info('Encrypted %d bytes', num_bytes)

            with self.output().open('w') as output_file:
                with make_encrypted_file(
                    output_file, key_file_targets, progress=report_encrypt_progress, dir=self.temporary_dir
                ) as encrypted_output_file:
                    with tarfile.open(mode='w:gz', fileobj=encrypted_output_file) as output_archive_file:
                        output_archive_file.add(tmp_directory, arcname='')
 def obfuscate_directory(self, input_dir, output_dir):
     """Run each enabled obfuscator over the matching files in `input_dir`."""
     if output_dir is not None:
         create_directory(output_dir)
     # Glob-driven obfuscators, processed in a fixed order.
     glob_jobs = (
         ('wiki', '*wiki_articlerevision-prod-analytics.sql', self.obfuscate_wiki_file),
         ('courseware', '*courseware_studentmodule-prod-analytics.sql', self.obfuscate_courseware_file),
         ('forum', '*.mongo', self.obfuscate_forum_file),
     )
     for flag, pattern, obfuscate in glob_jobs:
         if self.parameters[flag]:
             for filepath in glob.glob(os.path.join(input_dir, pattern)):
                 obfuscate(filepath, output_dir)
     if self.parameters['event']:
         # This is generalized beyond localfs/glob.
         event_task = PathSetTask(src=[input_dir], include=['*-events-*.log.gz'])
         for requirement in event_task.requires():
             self.obfuscate_event_file(requirement.output(), output_dir)
 def requires(self):
     """Declare the data path set (including zero-length files) plus the user-info requirements."""
     requirements = {
         # We want to process files that are zero-length.
         'data': PathSetTask([self.data_directory], [self.file_pattern], include_zero_length=True),
     }
     requirements.update(self.user_info_requirements())
     return requirements
# Example #9
# 0
    def __init__(self, *args, **kwargs):
        """
        Locate the most recent 'dt=YYYY-mm-dd' partition of
        internal_reporting_user_activity under the warehouse path and record
        its date as self.load_date.

        Raises Exception when no partitions exist, ValueError when a partition
        directory name does not match the expected 'dt=' format.
        """
        super(LoadInternalReportingUserActivityToWarehouse, self).__init__(*args, **kwargs)

        path = url_path_join(self.warehouse_path, 'internal_reporting_user_activity')
        path_targets = PathSetTask([path]).output()
        # Each target lives inside a partition directory; collect the distinct
        # directories with a set comprehension (no list(set([...])) round-trip).
        partition_paths = {os.path.dirname(target.path) for target in path_targets}
        dates = [partition_path.rsplit('/', 2)[-1] for partition_path in partition_paths]
        # Fail clearly rather than with a cryptic IndexError when nothing is found.
        if not dates:
            raise Exception('No internal_reporting_user_activity data found under {}'.format(path))
        # max() finds the latest partition name in O(n); no need to sort.
        latest_date = max(dates)

        self.load_date = datetime.datetime.strptime(latest_date, "dt=%Y-%m-%d").date()
# Example #10
# 0
    def read_dfs_directory(url):
        """Given the URL to a directory, read all of the files from it and concatenate them."""
        output_targets = PathSetTask([url], ['*']).output()
        raw_output = []
        for output_target in output_targets:
            if isinstance(output_target, S3HdfsTarget):
                # Rewrite to a Jenkins-safe URL so the S3 target can be read here.
                output_target = get_target_from_url(get_jenkins_safe_url(output_target.path))
            # Close each file promptly instead of leaking the open handle.
            with output_target.open('r') as input_file:
                raw_output.append(input_file.read())

        return ''.join(raw_output)
 def clean_xml_files(self, root_dir):
     """Find all of the XML files in the package and remove any unrecognized or known sensitive fields from them."""
     log.debug('Cleaning XML files')
     # Materialize the path list up front, before any file is rewritten.
     xml_paths = [target.path for target in PathSetTask([root_dir], ['*.xml']).output()]
     for xml_path in xml_paths:
         document = xml.etree.ElementTree.parse(xml_path)
         self.clean_element(document.getroot())
         # Write the cleaned tree back over the original file.
         document.write(xml_path)
 def obfuscate_directory(self, input_dir, output_dir):
     """Run each enabled obfuscator over the matching files in `input_dir`."""
     if output_dir is not None:
         create_directory(output_dir)
     params = self.parameters
     if params['wiki']:
         wiki_pattern = os.path.join(input_dir, '*wiki_articlerevision-prod-analytics.sql')
         for wiki_path in glob.glob(wiki_pattern):
             self.obfuscate_wiki_file(wiki_path, output_dir)
     if params['courseware']:
         courseware_pattern = os.path.join(input_dir, '*courseware_studentmodule-prod-analytics.sql')
         for courseware_path in glob.glob(courseware_pattern):
             self.obfuscate_courseware_file(courseware_path, output_dir)
     if params['forum']:
         for forum_path in glob.glob(os.path.join(input_dir, '*.mongo')):
             self.obfuscate_forum_file(forum_path, output_dir)
     if params['event']:
         # This is generalized beyond localfs/glob.
         event_task = PathSetTask(src=[input_dir], include=['*-events-*.log.gz'])
         for requirement in event_task.requires():
             self.obfuscate_event_file(requirement.output(), output_dir)
    def test_end_to_end_without_vertica(self):
        """
        Run the order/transaction reconciliation task and compare the
        generated report against the expected CSV value-by-value.

        Similar to test_end_to_end, but excludes the vertica part and checks
        data values, not just data shape.
        """
        table_name = 'reconciled_order_transactions'
        output_root = url_path_join(
            self.warehouse_path, table_name, 'dt=' + self.UPPER_BOUND_DATE
        ) + '/'
        self.task.launch([
            'ReconcileOrdersAndTransactionsTask',
            '--import-date', self.UPPER_BOUND_DATE,
            '--n-reduce-tasks', str(self.NUM_REDUCERS),
            '--output-root', output_root,
        ])
        final_output_task = LoadInternalReportingOrderTransactionsToWarehouse(
            import_date=luigi.DateParameter().parse(self.UPPER_BOUND_DATE)
        )
        columns = [x[0] for x in final_output_task.columns]
        output_targets = PathSetTask([output_root], ['*']).output()
        # Collect chunks in a list (avoids quadratic string concatenation) and
        # close each target promptly instead of leaking the open handle.
        raw_chunks = []
        for output_target in output_targets:
            output_target = get_target_from_url(get_jenkins_safe_url(output_target.path))
            with output_target.open('r') as output_file:
                raw_chunks.append(output_file.read())
        raw_output = "".join(raw_chunks)

        expected_output_csv = os.path.join(self.data_dir, 'output', 'expected_financial_report.csv')
        expected = pandas.read_csv(expected_output_csv, parse_dates=True)

        # Hive renders NULL as \N; strip it so pandas sees empty fields.
        output = StringIO(raw_output.replace('\t\\N', '\t'))
        data = pandas.read_table(output, header=None, names=columns, parse_dates=True)
        # Re-order dataframe for consistent comparison:
        # NOTE(review): DataFrame.sort was removed in pandas 0.20 in favor of
        # sort_values -- confirm the pinned pandas version before upgrading.
        for frame in (data, expected):
            frame.sort(['payment_ref_id', 'transaction_type'], inplace=True, ascending=[True, False])
            frame.reset_index(drop=True, inplace=True)
        try:
            assert_frame_equal(data, expected)
        except AssertionError:
            pandas.set_option('display.max_columns', None)
            print('----- The report generated this data: -----')
            print(data)
            print('----- vs expected: -----')
            print(expected)
            if data.shape != expected.shape:
                print("Data shapes differ.")
            else:
                for index, _series in data.iterrows():
                    # Try to print a more helpful/localized difference message:
                    try:
                        assert_series_equal(data.iloc[index, :], expected.iloc[index, :])
                    except AssertionError:
                        print("First differing row: {index}".format(index=index))
            raise
# Example #14
# 0
 def requires(self):
     """Depend on the configured source path set."""
     source_task = PathSetTask(self.src, self.include, self.manifest)
     return source_task
 def requires(self):
     """Depend on every event file dumped for this course."""
     safe_course_id = opaque_key_util.get_filename_safe_course_id(self.course)
     events_url = url_path_join(self.dump_root, safe_course_id, 'events')
     return PathSetTask([events_url], ['*'])
# Example #16
# 0
 def requires(self):
     """Depend on the event path set plus the external geolocation data file."""
     requirements = {}
     requirements['events'] = PathSetTask(self.src, self.include, self.manifest)
     requirements['geoloc_data'] = ExternalURL(self.geolocation_data)
     return requirements
 def requires(self):
     """Depend on the path set rooted at the dump root."""
     # NOTE(review): every other PathSetTask call site in this file passes a
     # *list* of paths (e.g. PathSetTask([url], ...)); confirm self.dump_root
     # is already a list here -- otherwise this likely needs to be
     # PathSetTask([self.dump_root]).
     return PathSetTask(self.dump_root)