Code example #1
    def validate_data_obfuscation(self):
        """Validates data obfuscation."""
        data_dir = os.path.join(self.temporary_dir, 'state', self.EXPORT_DATE)
        for data_filename in os.listdir(data_dir):
            data_filepath = os.path.join(data_dir, data_filename)
            expected_output_filepath = os.path.join(self.data_dir, 'output',
                                                    'obfuscation', 'state',
                                                    data_filename)

            if data_filename.endswith('mongo'):
                with open(data_filepath) as mongo_output_file:
                    output_json = [
                        json.loads(line) for line in mongo_output_file
                    ]
                with open(expected_output_filepath) as expected_output_file:
                    expected_output_json = [
                        json.loads(line) for line in expected_output_file
                    ]
                self.assertItemsEqual(output_json, expected_output_json)
            elif data_filename.endswith('.json'):
                with open(data_filepath) as actual_output_file:
                    output_json = json.load(actual_output_file)
                with open(expected_output_filepath) as expected_output_file:
                    expected_output_json = json.load(expected_output_file)
                self.assertDictEqual(output_json, expected_output_json)
            elif data_filename.endswith('.tar.gz'):
                pass
            else:
                shell.run(['diff', data_filepath, expected_output_filepath])
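Note: assertItemsEqual is the Python 2 name of unittest's order-insensitive comparison; Python 3 renames it to assertCountEqual. As a minimal sketch, the newline-delimited JSON loading used in the 'mongo' branch above could be factored into a standalone helper (load_json_lines is a hypothetical name, not part of the source):

    import json

    def load_json_lines(path):
        """Parse a file of newline-delimited JSON records into a list."""
        with open(path) as json_file:
            return [json.loads(line) for line in json_file]

    # Inside a unittest.TestCase the lists are then compared ignoring order:
    #     self.assertItemsEqual(load_json_lines(actual), load_json_lines(expected))
    #     # Python 3 spelling: self.assertCountEqual(...)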
Code example #2
    def run_legacy_exporter(self, org_id, course_id):
        """
        Preconditions: A text file for courseware_studentmodule has been generated and stored in the external file path.
        External Effect: Runs the legacy exporter which assembles the data package, encrypts it, and uploads it to S3.

        Reads <temporary_dir>/external/<day of month>/edX-E929-2014_T1-courseware_studentmodule-acceptance-analytics.sql
            and copies it into the data package.

        Writes the configuration to <temporary_dir>/acceptance.yml.

        Uploads the package to s3://<exporter_output_bucket>/<output_prefix>edx-<year>-<month>-<day>.zip

        """
        config_file_path = os.path.join(self.temporary_dir,
                                        '{}_acceptance.yml'.format(org_id))

        self.write_exporter_config(org_id, course_id, config_file_path)

        src_url_tuple = urlparse.urlparse(self.test_src)

        command = [
            os.getenv('EXPORTER'), '--work-dir', self.working_dir,
            '--output-bucket',
            self.config.get('exporter_output_bucket'), '--pipeline-bucket',
            src_url_tuple.netloc, '--external-prefix',
            src_url_tuple.path.lstrip('/'), '--output-prefix',
            self.output_prefix, config_file_path, '--env', self.ENVIRONMENT,
            '--org', org_id, '--task', 'StudentModuleTask'
        ]
        shell.run(command)
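A hedged usage sketch for the helper above. The identifiers are illustrative assumptions inferred from the example filename in the docstring; the exact course_id format is not confirmed by the source:

    # Hypothetical invocation from a test method; org and course values are
    # assumptions based on 'edX-E929-2014_T1' in the docstring above.
    self.run_legacy_exporter(org_id='edX', course_id='edX/E929/2014_T1')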
Code example #3
    def run_legacy_exporter(self, org_id, course_id):
        """
        Preconditions: A text file for courseware_studentmodule has been generated and stored in the external file path.
        External Effect: Runs the legacy exporter which assembles the data package, encrypts it, and uploads it to S3.

        Reads <temporary_dir>/external/<day of month>/edX-E929-2014_T1-courseware_studentmodule-acceptance-analytics.sql
            and copies it into the data package.

        Writes the configuration to <temporary_dir>/acceptance.yml.

        Uploads the package to s3://<exporter_output_bucket>/<output_prefix>edx-<year>-<month>-<day>.zip

        """
        config_file_path = os.path.join(self.temporary_dir, '{}_acceptance.yml'.format(org_id))

        self.write_exporter_config(org_id, course_id, config_file_path)

        src_url_tuple = urlparse.urlparse(self.test_src)

        command = [
            os.getenv('EXPORTER'),
            '--work-dir', self.working_dir,
            '--output-bucket', self.config.get('exporter_output_bucket'),
            '--pipeline-bucket', src_url_tuple.netloc,
            '--external-prefix', src_url_tuple.path.lstrip('/'),
            '--output-prefix', self.output_prefix,
            config_file_path,
            '--env', self.ENVIRONMENT,
            '--org', org_id,
            '--task', 'StudentModuleTask'
        ]
        shell.run(command)
Code example #4
    def validate_output_file(self, date, org_id, site, use_master_key=False):
        if use_master_key:
            key_filename = 'insecure_master_secret.key'
        else:
            if org_id == 'edx':
                key_filename = 'insecure_secret.key'
            else:
                key_filename = 'insecure_secret_2.key'

        self.temporary_dir = tempfile.mkdtemp()
        self.addCleanup(shutil.rmtree, self.temporary_dir)

        self.downloaded_outputs = os.path.join(self.temporary_dir, 'output')
        os.makedirs(self.downloaded_outputs)

        local_file_name = '{org}-{site}-events-{date}.log'.format(
            org=org_id,
            site=site,
            date=date,
        )

        year = str(date).split("-")[0]

        remote_url = url_path_join(self.test_out, org_id, site, "events", year,
                                   local_file_name + '.gz.gpg')

        # Files won't appear in S3 instantaneously; wait for them to appear.
        # TODO: exponential backoff
        for _index in range(30):
            key = self.s3_client.get_key(remote_url)
            if key is not None:
                break
            else:
                time.sleep(2)

        if key is None:
            self.fail(
                'Unable to find expected output file {0}'.format(remote_url))

        downloaded_output_path = os.path.join(self.downloaded_outputs,
                                              remote_url.split('/')[-1])
        key.get_contents_to_filename(downloaded_output_path)

        # first decrypt file
        decrypted_file_name = downloaded_output_path[:-len('.gpg')]
        fs.decrypt_file(downloaded_output_path, decrypted_file_name,
                        key_filename)

        # now decompress file
        decompressed_file_name = decrypted_file_name[:-len('.gz')]
        fs.decompress_file(decrypted_file_name, decompressed_file_name)

        shell.run([
            'diff', decompressed_file_name,
            os.path.join(self.data_dir, 'output', local_file_name)
        ])
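The TODO above asks for exponential backoff. A minimal sketch of that polling loop against the same s3_client.get_key interface; the retry count and the 60-second cap are assumptions:

    # Hedged sketch for the TODO: grow the delay between polls instead of
    # sleeping a fixed 2 seconds. Retry count and cap are assumptions.
    key = None
    delay = 1
    for _index in range(8):
        key = self.s3_client.get_key(remote_url)
        if key is not None:
            break
        time.sleep(delay)
        delay = min(delay * 2, 60)  # 1, 2, 4, 8, ... seconds, capped at 60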
Code example #5
    def validate_output_file(self, date, org_id, site, use_master_key=False):
        if use_master_key:
            key_filename = 'insecure_master_secret.key'
        else:
            if org_id == 'edx':
                key_filename = 'insecure_secret.key'
            else:
                key_filename = 'insecure_secret_2.key'

        self.temporary_dir = tempfile.mkdtemp()
        self.addCleanup(shutil.rmtree, self.temporary_dir)

        self.downloaded_outputs = os.path.join(self.temporary_dir, 'output')
        os.makedirs(self.downloaded_outputs)

        local_file_name = '{org}-{site}-events-{date}.log'.format(
            org=org_id,
            site=site,
            date=date,
        )

        year = str(date).split("-")[0]

        remote_url = url_path_join(self.test_out, org_id, site, "events", year, local_file_name + '.gz.gpg')

        # Files won't appear in S3 instantaneously; wait for them to appear.
        # TODO: exponential backoff
        for _index in range(30):
            key = self.s3_client.get_key(remote_url)
            if key is not None:
                break
            else:
                time.sleep(2)

        if key is None:
            self.fail('Unable to find expected output file {0}'.format(remote_url))

        downloaded_output_path = os.path.join(self.downloaded_outputs, remote_url.split('/')[-1])
        key.get_contents_to_filename(downloaded_output_path)

        # first decrypt file
        decrypted_file_name = downloaded_output_path[:-len('.gpg')]
        fs.decrypt_file(downloaded_output_path, decrypted_file_name, key_filename)

        # now decompress file
        decompressed_file_name = decrypted_file_name[:-len('.gz')]
        fs.decompress_file(decrypted_file_name, decompressed_file_name)

        shell.run(['diff', decompressed_file_name, os.path.join(self.data_dir, 'output', local_file_name)])
Code example #6
    def validate_data_deidentification(self):
        """Validates data deid."""
        data_dir = os.path.join(self.temporary_dir, 'state', self.EXPORT_DATE)
        for data_filename in os.listdir(data_dir):
            data_filepath = os.path.join(data_dir, data_filename)
            expected_output_filepath = os.path.join(self.data_dir, 'output', 'deidentification', 'state', data_filename)

            if data_filename.endswith('mongo'):
                with open(data_filepath) as mongo_output_file:
                    output_json = [json.loads(line) for line in mongo_output_file]
                with open(expected_output_filepath) as expected_output_file:
                    expected_output_json = [json.loads(line) for line in expected_output_file]
                self.assertItemsEqual(output_json, expected_output_json)
            else:
                shell.run(['diff', data_filepath, expected_output_filepath])
Code example #7
    def validate_output_file(self, date, org_id, site, use_master_key=False):
        if use_master_key:
            key_filename = 'insecure_master_secret.key'
        else:
            if org_id == 'edx':
                key_filename = 'insecure_secret.key'
            else:
                key_filename = 'insecure_secret_2.key'

        self.temporary_dir = tempfile.mkdtemp()
        self.addCleanup(shutil.rmtree, self.temporary_dir)

        self.downloaded_outputs = os.path.join(self.temporary_dir, 'output')
        os.makedirs(self.downloaded_outputs)

        local_file_name = '{org}-{site}-events-{date}.log'.format(
            org=org_id,
            site=site,
            date=date,
        )

        year = str(date).split("-")[0]

        remote_url = url_path_join(self.test_out, org_id, site, "events", year,
                                   local_file_name + '.gz.gpg')

        downloaded_output_path = get_file_from_key(self.s3_client, remote_url,
                                                   self.downloaded_outputs)

        if downloaded_output_path is None:
            self.fail(
                'Unable to find expected output file {0}'.format(remote_url))

        # first decrypt file
        decrypted_file_name = downloaded_output_path[:-len('.gpg')]
        fs.decrypt_file(downloaded_output_path, decrypted_file_name,
                        key_filename)

        # now decompress file
        decompressed_file_name = decrypted_file_name[:-len('.gz')]
        fs.decompress_file(decrypted_file_name, decompressed_file_name)

        shell.run([
            'diff', decompressed_file_name,
            os.path.join(self.data_dir, 'output', local_file_name)
        ])
Code example #8
File: task.py Project: npoed/edx-analytics-pipeline
    def launch(self, task_args, config_override=None):
        self.delete_existing_logs()

        config_parser = ConfigParser.ConfigParser()
        config_parser.read(os.environ['LUIGI_CONFIG_PATH'])
        self.override_config(config_parser, self.default_config_override)
        if config_override:
            self.override_config(config_parser, config_override)

        with tempfile.NamedTemporaryFile() as temp_config_file:
            config_parser.write(temp_config_file)
            temp_config_file.flush()

            temp_config_file.seek(0)
            log.info('Task Configuration')
            log.info(temp_config_file.read())
            temp_config_file.seek(0)

            command = [
                os.getenv('REMOTE_TASK'),
                '--branch',
                self.config.get('tasks_branch'),
                '--repo',
                self.config.get('tasks_repo'),
                '--remote-name',
                self.identifier,
                '--wait',
                '--log-path',
                self.log_path,
                '--user',
                self.config.get('connection_user'),
                '--override-config',
                temp_config_file.name,
            ]

            if 'job_flow_name' in self.config:
                command.extend(
                    ['--job-flow-name', self.config['job_flow_name']])
            elif 'host' in self.config:
                command.extend(['--host', self.config['host']])

            if 'wheel_url' in self.config:
                command.extend(['--wheel-url', self.config['wheel_url']])

            command.extend(task_args)
            command.append('--local-scheduler')

            try:
                output = shell.run(command)
            finally:
                self.write_logs_to_standard_streams()

        return output
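A hedged usage sketch for launch(). The task name, interval, and override are illustrative assumptions; config_override is assumed to mirror the {section: {option: value}} shape of a ConfigParser:

    # Hypothetical call; task name and arguments are illustrative only.
    output = task.launch(
        ['ImportEnrollmentsIntoMysql', '--interval', '2014-08-01-2014-08-02'],
        config_override={'database-export': {'database': 'test_db'}},
    )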
Code example #9
    def validate_output_file(self, date, org_id, site, use_master_key=False):
        if use_master_key:
            key_filename = 'insecure_master_secret.key'
        else:
            if org_id == 'edx':
                key_filename = 'insecure_secret.key'
            else:
                key_filename = 'insecure_secret_2.key'

        self.temporary_dir = tempfile.mkdtemp()
        self.addCleanup(shutil.rmtree, self.temporary_dir)

        self.downloaded_outputs = os.path.join(self.temporary_dir, 'output')
        os.makedirs(self.downloaded_outputs)

        local_file_name = '{org}-{site}-events-{date}.log'.format(
            org=org_id,
            site=site,
            date=date,
        )

        year = str(date).split("-")[0]

        remote_url = url_path_join(self.test_out, org_id, site, "events", year, local_file_name + '.gz.gpg')

        downloaded_output_path = get_file_from_key(self.s3_client, remote_url, self.downloaded_outputs)

        if downloaded_output_path is None:
            self.fail('Unable to find expected output file {0}'.format(remote_url))

        # first decrypt file
        decrypted_file_name = downloaded_output_path[:-len('.gpg')]
        fs.decrypt_file(downloaded_output_path, decrypted_file_name, key_filename)

        # now decompress file
        decompressed_file_name = decrypted_file_name[:-len('.gz')]
        fs.decompress_file(decrypted_file_name, decompressed_file_name)

        shell.run(['diff', decompressed_file_name, os.path.join(self.data_dir, 'output', local_file_name)])
Code example #10
    def validate_data_obfuscation(self):
        """Validates data obfuscation."""
        data_dir = os.path.join(self.temporary_dir, "state", self.EXPORT_DATE)
        for data_filename in os.listdir(data_dir):
            data_filepath = os.path.join(data_dir, data_filename)
            expected_output_filepath = os.path.join(self.data_dir, "output", "obfuscation", "state", data_filename)

            if data_filename.endswith("mongo"):
                with open(data_filepath) as mongo_output_file:
                    output_json = [json.loads(line) for line in mongo_output_file]
                with open(expected_output_filepath) as expected_output_file:
                    expected_output_json = [json.loads(line) for line in expected_output_file]
                self.assertItemsEqual(output_json, expected_output_json)
            elif data_filename.endswith(".json"):
                with open(data_filepath) as actual_output_file:
                    output_json = json.load(actual_output_file)
                with open(expected_output_filepath) as expected_output_file:
                    expected_output_json = json.load(expected_output_file)
                self.assertDictEqual(output_json, expected_output_json)
            elif data_filename.endswith(".tar.gz"):
                pass
            else:
                shell.run(["diff", data_filepath, expected_output_filepath])
Code example #11
File: hive.py Project: edx/edx-analytics-pipeline
 def execute(self, statement, explicit_db=True):
     if self.is_remote:
         db_parameter = ' --database ' + self.database_name if explicit_db else ''
         return self.task.launch([
             '--user', self.config['connection_user'],
             '--sudo-user', self.config['hive_user'],
             '--shell', ". $HOME/.bashrc && hive --service cli{db} -e \"{stmt}\"".format(
                 db=db_parameter,
                 stmt=statement
             ),
         ])
     else:
         cmd = ['hive', '--service', 'cli']
         if explicit_db:
             cmd.extend(['--database', self.database_name])
         cmd.extend(['-e', statement])
         return shell.run(cmd)
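A hedged usage sketch for execute(); hive_client and the table name are illustrative, not from the source:

    # Hypothetical client instance and table, for illustration only.
    hive_client.execute('SELECT COUNT(*) FROM courseware_studentmodule')
    # Statements that do not target the configured database can skip the
    # --database flag:
    hive_client.execute('SHOW DATABASES', explicit_db=False)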
Code example #12
    def launch(self, task_args, config_override=None):
        self.delete_existing_logs()

        config_parser = ConfigParser.ConfigParser()
        config_parser.read(os.environ['LUIGI_CONFIG_PATH'])
        self.override_config(config_parser, self.default_config_override)
        if config_override:
            self.override_config(config_parser, config_override)

        with tempfile.NamedTemporaryFile() as temp_config_file:
            config_parser.write(temp_config_file)
            temp_config_file.flush()

            temp_config_file.seek(0)
            log.info('Task Configuration')
            log.info(temp_config_file.read())
            temp_config_file.seek(0)

            command = [
                os.getenv('REMOTE_TASK'),
                '--branch', self.config.get('tasks_branch'),
                '--repo', self.config.get('tasks_repo'),
                '--remote-name', self.identifier,
                '--wait',
                '--log-path', self.log_path,
                '--user', self.config.get('connection_user'),
                '--override-config', temp_config_file.name,
            ]

            if 'job_flow_name' in self.config:
                command.extend(['--job-flow-name', self.config['job_flow_name']])
            elif 'host' in self.config:
                command.extend(['--host', self.config['host']])

            if 'wheel_url' in self.config:
                command.extend(['--wheel-url', self.config['wheel_url']])

            command.extend(task_args)
            command.append('--local-scheduler')

            try:
                output = shell.run(command)
            finally:
                self.write_logs_to_standard_streams()

        return output
Code example #13
 def execute(self, statement, explicit_db=True):
     if self.is_remote:
         db_parameter = ' --database ' + self.database_name if explicit_db else ''
         return self.task.launch([
             '--user',
             self.config['connection_user'],
             '--sudo-user',
             self.config['hive_user'],
             '--shell',
             ". $HOME/.bashrc && hive --service cli{db} -e \"{stmt}\"".
             format(db=db_parameter, stmt=statement),
         ])
     else:
         cmd = ['hive', '--service', 'cli']
         if explicit_db:
             cmd.extend(['--database', self.database_name])
         cmd.extend(['-e', statement])
         return shell.run(cmd)
Code example #14
    def validate_exporter_output(self, org_id, exported_filename):
        """
        Preconditions: A complete data package has been uploaded to S3.
        External Effect: Downloads the complete data package, decompresses it, decrypts it and then compares it to the
            static expected output ignoring the ordering of the records in both files.

        Downloads s3://<exporter_output_bucket>/<output_prefix><org_id>-<year>-<month>-<day>.zip to <temporary_dir>/work/validation/.

        """
        today = datetime.datetime.utcnow().strftime('%Y-%m-%d')
        bucket = ScalableS3Client().s3.get_bucket(
            self.config.get('exporter_output_bucket'))
        export_id = '{org}-{date}'.format(org=org_id, date=today)
        filename = export_id + '.zip'
        key = bucket.lookup(self.output_prefix + filename)
        if key is None:
            self.fail(
                'Expected output from legacy exporter not found. Url = s3://{bucket}/{pre}{filename}'
                .format(bucket=self.config.get('exporter_output_bucket'),
                        pre=self.output_prefix,
                        filename=filename))
        exporter_archive_path = os.path.join(self.validation_dir, filename)
        key.get_contents_to_filename(exporter_archive_path)

        shell.run(['unzip', exporter_archive_path, '-d', self.validation_dir])

        gpg = gnupg.GPG(gnupghome=self.gpg_dir)
        with open(os.path.join('gpg-keys', 'insecure_secret.key'),
                  'r') as key_file:
            gpg.import_keys(key_file.read())

        exported_file_path = os.path.join(self.validation_dir,
                                          exported_filename)
        with open(
                os.path.join(self.validation_dir, export_id,
                             exported_filename + '.gpg'),
                'r') as encrypted_file:
            gpg.decrypt_file(encrypted_file, output=exported_file_path)

        sorted_filename = exported_file_path + '.sorted'
        shell.run(['sort', '-o', sorted_filename, exported_file_path])

        expected_output_path = os.path.join(self.data_dir, 'output',
                                            exported_filename + '.sorted')
        shell.run(['diff', sorted_filename, expected_output_path])
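The closing sort/diff pair is what makes the comparison order-insensitive: the decrypted export is sorted into a canonical order and diffed against an expected file that is already stored sorted. A pure-Python sketch of the same check, reusing the variable names defined above:

    # Hedged equivalent of the sort + diff above, ignoring record order.
    with open(exported_file_path) as actual_file:
        actual_lines = sorted(actual_file)
    with open(expected_output_path) as expected_file:
        expected_lines = list(expected_file)  # expected file is pre-sorted
    assert actual_lines == expected_lines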
Code example #15
    def validate_exporter_output(self, org_id, exported_filename):
        """
        Preconditions: A complete data package has been uploaded to S3.
        External Effect: Downloads the complete data package, decompresses it, decrypts it and then compares it to the
            static expected output ignoring the ordering of the records in both files.

        Downloads s3://<exporter_output_bucket>/<output_prefix><org_id>-<year>-<month>-<day>.zip to <temporary_dir>/work/validation/.

        """
        today = datetime.datetime.utcnow().strftime('%Y-%m-%d')
        bucket = boto.connect_s3().get_bucket(self.config.get('exporter_output_bucket'))
        export_id = '{org}-{date}'.format(org=org_id, date=today)
        filename = export_id + '.zip'
        key = bucket.lookup(self.output_prefix + filename)
        if key is None:
            self.fail(
                'Expected output from legacy exporter not found. Url = s3://{bucket}/{pre}{filename}'.format(
                    bucket=self.config.get('exporter_output_bucket'),
                    pre=self.output_prefix,
                    filename=filename
                )
            )
        exporter_archive_path = os.path.join(self.validation_dir, filename)
        key.get_contents_to_filename(exporter_archive_path)

        shell.run(['unzip', exporter_archive_path, '-d', self.validation_dir])

        gpg = gnupg.GPG(gnupghome=self.gpg_dir)
        with open(os.path.join('gpg-keys', 'insecure_secret.key'), 'r') as key_file:
            gpg.import_keys(key_file.read())

        exported_file_path = os.path.join(self.validation_dir, exported_filename)
        with open(os.path.join(self.validation_dir, export_id, exported_filename + '.gpg'), 'r') as encrypted_file:
            gpg.decrypt_file(encrypted_file, output=exported_file_path)

        sorted_filename = exported_file_path + '.sorted'
        shell.run(['sort', '-o', sorted_filename, exported_file_path])

        expected_output_path = os.path.join(self.data_dir, 'output', exported_filename + '.sorted')
        shell.run(['diff', sorted_filename, expected_output_path])
Code example #16
 def validate_output(self):
     for output_file in self.output_files:
         local_file_name = self.generate_file_name(output_file)
         shell.run(['diff', output_file['downloaded_path'], os.path.join(self.data_dir, 'output', local_file_name)])
Code example #17
    def launch(self, task_args, config_override=None):
        self.delete_existing_logs()

        config_parser = ConfigParser.ConfigParser()
        config_parser.read(os.environ['LUIGI_CONFIG_PATH'])
        self.override_config(config_parser, self.default_config_override)
        if config_override:
            self.override_config(config_parser, config_override)

        with tempfile.NamedTemporaryFile() as temp_config_file:
            config_parser.write(temp_config_file)
            temp_config_file.flush()

            temp_config_file.seek(0)
            log.info('Task Configuration')
            log.info(temp_config_file.read())
            temp_config_file.seek(0)

            env = dict(os.environ)
            if self.is_remote:
                command = [
                    os.getenv('REMOTE_TASK'),
                    '--branch',
                    self.config.get('tasks_branch'),
                    '--repo',
                    self.config.get('tasks_repo'),
                    '--remote-name',
                    self.identifier,
                    '--wait',
                    '--log-path',
                    self.log_path,
                    '--user',
                    self.config.get('connection_user'),
                    '--override-config',
                    temp_config_file.name,
                ]

                if 'job_flow_name' in self.config:
                    command.extend(
                        ['--job-flow-name', self.config['job_flow_name']])
                elif 'host' in self.config:
                    command.extend(['--host', self.config['host']])

                if 'wheel_url' in self.config:
                    command.extend(['--wheel-url', self.config['wheel_url']])

                command.extend(task_args)
                command.append('--local-scheduler')
            else:
                # run the command in a shell since that's what is done by remote-task.
                # Otherwise values like '"*"' cause problems since they are properly escaped for the "shell" case but
                # malformed when not interpreted by a shell.
                command = [
                    '/bin/bash', '-c',
                    '. ~/.bashrc && {0} {1} --local-scheduler'.format(
                        os.getenv('LAUNCH_TASK', 'launch-task'),
                        ' '.join(task_args))
                ]
                env['LUIGI_CONFIG_PATH'] = temp_config_file.name

            try:
                output = shell.run(command, env=env)
            finally:
                self.write_logs_to_standard_streams()

        return output