def validate_data_obfuscation(self):
    """Validates data obfuscation."""
    data_dir = os.path.join(self.temporary_dir, 'state', self.EXPORT_DATE)
    for data_filename in os.listdir(data_dir):
        data_filepath = os.path.join(data_dir, data_filename)
        expected_output_filepath = os.path.join(self.data_dir, 'output', 'obfuscation', 'state', data_filename)
        if data_filename.endswith('mongo'):
            # Mongo dumps are newline-delimited JSON; compare record sets ignoring order.
            with open(data_filepath) as mongo_output_file:
                output_json = [json.loads(line) for line in mongo_output_file]
            with open(expected_output_filepath) as expected_output_file:
                expected_output_json = [json.loads(line) for line in expected_output_file]
            self.assertItemsEqual(output_json, expected_output_json)
        elif data_filename.endswith('.json'):
            with open(data_filepath) as actual_output_file:
                output_json = json.load(actual_output_file)
            with open(expected_output_filepath) as expected_output_file:
                expected_output_json = json.load(expected_output_file)
            self.assertDictEqual(output_json, expected_output_json)
        elif data_filename.endswith('.tar.gz'):
            # Archive contents are not compared byte-for-byte here.
            pass
        else:
            shell.run(['diff', data_filepath, expected_output_filepath])
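# Note (an aside, not from the original source): assertItemsEqual is the
# Python 2.7 unittest spelling of an order-insensitive collection comparison;
# Python 3 renamed it to assertCountEqual. A minimal compatibility shim,
# should this suite ever run under Python 3:
#
#     if not hasattr(unittest.TestCase, 'assertItemsEqual'):
#         unittest.TestCase.assertItemsEqual = unittest.TestCase.assertCountEqual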
def run_legacy_exporter(self, org_id, course_id):
    """
    Preconditions: A text file for courseware_studentmodule has been generated and stored in the external file path.
    External Effect: Runs the legacy exporter which assembles the data package, encrypts it, and uploads it to S3.

    Reads <temporary_dir>/external/<day of month>/edX-E929-2014_T1-courseware_studentmodule-acceptance-analytics.sql
    and copies it into the data package.

    Writes the configuration to <temporary_dir>/acceptance.yml.

    Uploads the package to s3://<exporter_output_bucket>/<output_prefix>edx-<year>-<month>-<day>.zip
    """
    config_file_path = os.path.join(self.temporary_dir, '{}_acceptance.yml'.format(org_id))
    self.write_exporter_config(org_id, course_id, config_file_path)

    src_url_tuple = urlparse.urlparse(self.test_src)

    command = [
        os.getenv('EXPORTER'),
        '--work-dir', self.working_dir,
        '--output-bucket', self.config.get('exporter_output_bucket'),
        '--pipeline-bucket', src_url_tuple.netloc,
        '--external-prefix', src_url_tuple.path.lstrip('/'),
        '--output-prefix', self.output_prefix,
        config_file_path,
        '--env', self.ENVIRONMENT,
        '--org', org_id,
        '--task', 'StudentModuleTask',
    ]
    shell.run(command)
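# Hypothetical driver sketch (an assumption, not part of the original suite):
# the docstrings for run_legacy_exporter() and validate_exporter_output()
# suggest the acceptance test runs the exporter and then checks its package.
# The org and course values below are illustrative, inferred from the
# filename in the docstring above.
def test_legacy_exporter_sketch(self):
    self.run_legacy_exporter('edX', 'edX/E929/2014_T1')
    self.validate_exporter_output(
        'edX',
        'edX-E929-2014_T1-courseware_studentmodule-acceptance-analytics.sql'
    )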
def validate_output_file(self, date, org_id, site, use_master_key=False):
    if use_master_key:
        key_filename = 'insecure_master_secret.key'
    else:
        if org_id == 'edx':
            key_filename = 'insecure_secret.key'
        else:
            key_filename = 'insecure_secret_2.key'

    self.temporary_dir = tempfile.mkdtemp()
    self.addCleanup(shutil.rmtree, self.temporary_dir)

    self.downloaded_outputs = os.path.join(self.temporary_dir, 'output')
    os.makedirs(self.downloaded_outputs)

    local_file_name = '{org}-{site}-events-{date}.log'.format(
        org=org_id,
        site=site,
        date=date,
    )
    year = str(date).split("-")[0]
    remote_url = url_path_join(self.test_out, org_id, site, "events", year, local_file_name + '.gz.gpg')

    # Files won't appear in S3 instantaneously; wait for the files to appear.
    # TODO: exponential backoff
    for _index in range(30):
        key = self.s3_client.get_key(remote_url)
        if key is not None:
            break
        else:
            time.sleep(2)

    if key is None:
        self.fail('Unable to find expected output file {0}'.format(remote_url))

    downloaded_output_path = os.path.join(self.downloaded_outputs, remote_url.split('/')[-1])
    key.get_contents_to_filename(downloaded_output_path)

    # First decrypt the file.
    decrypted_file_name = downloaded_output_path[:-len('.gpg')]
    fs.decrypt_file(downloaded_output_path, decrypted_file_name, key_filename)

    # Now decompress the file.
    decompressed_file_name = decrypted_file_name[:-len('.gz')]
    fs.decompress_file(decrypted_file_name, decompressed_file_name)

    shell.run(['diff', decompressed_file_name, os.path.join(self.data_dir, 'output', local_file_name)])
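# Sketch of the exponential backoff flagged in the TODO above (a hypothetical
# helper, not part of the original suite): double the sleep after each miss,
# capped at max_delay, instead of polling at a fixed two-second interval.
def get_key_with_backoff(self, remote_url, attempts=10, initial_delay=1, max_delay=30):
    delay = initial_delay
    for _index in range(attempts):
        key = self.s3_client.get_key(remote_url)
        if key is not None:
            return key
        time.sleep(delay)
        delay = min(delay * 2, max_delay)
    return None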
def validate_data_deidentification(self):
    """Validates data de-identification."""
    data_dir = os.path.join(self.temporary_dir, 'state', self.EXPORT_DATE)
    for data_filename in os.listdir(data_dir):
        data_filepath = os.path.join(data_dir, data_filename)
        expected_output_filepath = os.path.join(self.data_dir, 'output', 'deidentification', 'state', data_filename)
        if data_filename.endswith('mongo'):
            # Mongo dumps are newline-delimited JSON; compare record sets ignoring order.
            with open(data_filepath) as mongo_output_file:
                output_json = [json.loads(line) for line in mongo_output_file]
            with open(expected_output_filepath) as expected_output_file:
                expected_output_json = [json.loads(line) for line in expected_output_file]
            self.assertItemsEqual(output_json, expected_output_json)
        else:
            shell.run(['diff', data_filepath, expected_output_filepath])
def validate_output_file(self, date, org_id, site, use_master_key=False):
    if use_master_key:
        key_filename = 'insecure_master_secret.key'
    else:
        if org_id == 'edx':
            key_filename = 'insecure_secret.key'
        else:
            key_filename = 'insecure_secret_2.key'

    self.temporary_dir = tempfile.mkdtemp()
    self.addCleanup(shutil.rmtree, self.temporary_dir)

    self.downloaded_outputs = os.path.join(self.temporary_dir, 'output')
    os.makedirs(self.downloaded_outputs)

    local_file_name = '{org}-{site}-events-{date}.log'.format(
        org=org_id,
        site=site,
        date=date,
    )
    year = str(date).split("-")[0]
    remote_url = url_path_join(self.test_out, org_id, site, "events", year, local_file_name + '.gz.gpg')

    downloaded_output_path = get_file_from_key(self.s3_client, remote_url, self.downloaded_outputs)
    if downloaded_output_path is None:
        self.fail('Unable to find expected output file {0}'.format(remote_url))

    # First decrypt the file.
    decrypted_file_name = downloaded_output_path[:-len('.gpg')]
    fs.decrypt_file(downloaded_output_path, decrypted_file_name, key_filename)

    # Now decompress the file.
    decompressed_file_name = decrypted_file_name[:-len('.gz')]
    fs.decompress_file(decrypted_file_name, decompressed_file_name)

    shell.run(['diff', decompressed_file_name, os.path.join(self.data_dir, 'output', local_file_name)])
def launch(self, task_args, config_override=None):
    self.delete_existing_logs()

    config_parser = ConfigParser.ConfigParser()
    config_parser.read(os.environ['LUIGI_CONFIG_PATH'])
    self.override_config(config_parser, self.default_config_override)
    if config_override:
        self.override_config(config_parser, config_override)

    with tempfile.NamedTemporaryFile() as temp_config_file:
        config_parser.write(temp_config_file)
        temp_config_file.flush()

        temp_config_file.seek(0)
        log.info('Task Configuration')
        log.info(temp_config_file.read())
        temp_config_file.seek(0)

        command = [
            os.getenv('REMOTE_TASK'),
            '--branch', self.config.get('tasks_branch'),
            '--repo', self.config.get('tasks_repo'),
            '--remote-name', self.identifier,
            '--wait',
            '--log-path', self.log_path,
            '--user', self.config.get('connection_user'),
            '--override-config', temp_config_file.name,
        ]
        if 'job_flow_name' in self.config:
            command.extend(['--job-flow-name', self.config['job_flow_name']])
        elif 'host' in self.config:
            command.extend(['--host', self.config['host']])
        if 'wheel_url' in self.config:
            command.extend(['--wheel-url', self.config['wheel_url']])

        command.extend(task_args)
        command.append('--local-scheduler')
        try:
            output = shell.run(command)
        finally:
            self.write_logs_to_standard_streams()

        return output
def execute(self, statement, explicit_db=True):
    if self.is_remote:
        db_parameter = ' --database ' + self.database_name if explicit_db else ''
        return self.task.launch([
            '--user', self.config['connection_user'],
            '--sudo-user', self.config['hive_user'],
            '--shell', ". $HOME/.bashrc && hive --service cli{db} -e \"{stmt}\"".format(
                db=db_parameter, stmt=statement
            ),
        ])
    else:
        cmd = ['hive', '--service', 'cli']
        if explicit_db:
            cmd.extend(['--database', self.database_name])
        cmd.extend(['-e', statement])
        return shell.run(cmd)
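# Hypothetical usage sketch (statement and variable names are illustrative):
# execute() accepts any HiveQL statement; pass explicit_db=False for
# statements that should not be scoped to the configured database.
#
#     hive.execute("SELECT COUNT(*) FROM course_enrollment")
#     hive.execute("SHOW DATABASES", explicit_db=False)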
def validate_exporter_output(self, org_id, exported_filename):
    """
    Preconditions: A complete data package has been uploaded to S3.
    External Effect: Downloads the complete data package, decompresses it, decrypts it and then
        compares it to the static expected output ignoring the ordering of the records in both files.

    Downloads s3://<exporter_output_bucket>/<output_prefix><org_id>-<year>-<month>-<day>.zip
    to <temporary_dir>/work/validation/.
    """
    today = datetime.datetime.utcnow().strftime('%Y-%m-%d')
    bucket = ScalableS3Client().s3.get_bucket(self.config.get('exporter_output_bucket'))
    export_id = '{org}-{date}'.format(org=org_id, date=today)
    filename = export_id + '.zip'
    key = bucket.lookup(self.output_prefix + filename)
    if key is None:
        self.fail(
            'Expected output from legacy exporter not found. Url = s3://{bucket}/{pre}{filename}'.format(
                bucket=self.config.get('exporter_output_bucket'),
                pre=self.output_prefix,
                filename=filename
            )
        )
    exporter_archive_path = os.path.join(self.validation_dir, filename)
    key.get_contents_to_filename(exporter_archive_path)

    shell.run(['unzip', exporter_archive_path, '-d', self.validation_dir])

    gpg = gnupg.GPG(gnupghome=self.gpg_dir)
    with open(os.path.join('gpg-keys', 'insecure_secret.key'), 'r') as key_file:
        gpg.import_keys(key_file.read())

    exported_file_path = os.path.join(self.validation_dir, exported_filename)
    with open(os.path.join(self.validation_dir, export_id, exported_filename + '.gpg'), 'r') as encrypted_file:
        gpg.decrypt_file(encrypted_file, output=exported_file_path)

    # Sort both sides so the diff ignores record ordering.
    sorted_filename = exported_file_path + '.sorted'
    shell.run(['sort', '-o', sorted_filename, exported_file_path])

    expected_output_path = os.path.join(self.data_dir, 'output', exported_filename + '.sorted')
    shell.run(['diff', sorted_filename, expected_output_path])
def validate_output(self):
    for output_file in self.output_files:
        local_file_name = self.generate_file_name(output_file)
        shell.run(['diff', output_file['downloaded_path'], os.path.join(self.data_dir, 'output', local_file_name)])
def launch(self, task_args, config_override=None):
    self.delete_existing_logs()

    config_parser = ConfigParser.ConfigParser()
    config_parser.read(os.environ['LUIGI_CONFIG_PATH'])
    self.override_config(config_parser, self.default_config_override)
    if config_override:
        self.override_config(config_parser, config_override)

    with tempfile.NamedTemporaryFile() as temp_config_file:
        config_parser.write(temp_config_file)
        temp_config_file.flush()

        temp_config_file.seek(0)
        log.info('Task Configuration')
        log.info(temp_config_file.read())
        temp_config_file.seek(0)

        env = dict(os.environ)

        if self.is_remote:
            command = [
                os.getenv('REMOTE_TASK'),
                '--branch', self.config.get('tasks_branch'),
                '--repo', self.config.get('tasks_repo'),
                '--remote-name', self.identifier,
                '--wait',
                '--log-path', self.log_path,
                '--user', self.config.get('connection_user'),
                '--override-config', temp_config_file.name,
            ]
            if 'job_flow_name' in self.config:
                command.extend(['--job-flow-name', self.config['job_flow_name']])
            elif 'host' in self.config:
                command.extend(['--host', self.config['host']])
            if 'wheel_url' in self.config:
                command.extend(['--wheel-url', self.config['wheel_url']])

            command.extend(task_args)
            command.append('--local-scheduler')
        else:
            # Run the command in a shell since that's what is done by remote-task.
            # Otherwise values like '"*"' cause problems since they are properly
            # escaped for the "shell" case but malformed when not interpreted by a shell.
            command = [
                '/bin/bash', '-c',
                '. ~/.bashrc && {0} {1} --local-scheduler'.format(
                    os.getenv('LAUNCH_TASK', 'launch-task'), ' '.join(task_args)
                )
            ]
            env['LUIGI_CONFIG_PATH'] = temp_config_file.name

        try:
            output = shell.run(command, env=env)
        finally:
            self.write_logs_to_standard_streams()

        return output
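# Hypothetical call sketch (task name and parameters are illustrative):
# task_args is the luigi task name followed by its command-line parameters,
# and config_override is a {section: {option: value}} mapping merged into the
# luigi configuration before the task is launched.
#
#     task.launch(
#         ['TotalEventsDailyTask', '--interval', '2014-06'],
#         config_override={'hive': {'database': 'acceptance_test'}},
#     )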