def test_get_filename_for_invalid_id(self):
    """An invalid legacy course id is still converted to a filename-safe string."""
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(
        opaque_key_util.get_filename_safe_course_id(INVALID_LEGACY_COURSE_ID),
        "org_course_id_course_run",
    )
    # A custom separator replaces the inter-part separators.
    self.assertEqual(
        opaque_key_util.get_filename_safe_course_id(INVALID_LEGACY_COURSE_ID, '-'),
        "org-course_id-course_run",
    )
def test_get_filename(self):
    """A valid course id maps to an underscore- (or custom-) separated safe name."""
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(
        opaque_key_util.get_filename_safe_course_id(VALID_COURSE_ID),
        "org_course_id_course_run",
    )
    self.assertEqual(
        opaque_key_util.get_filename_safe_course_id(VALID_COURSE_ID, '-'),
        "org-course_id-course_run",
    )
def test_get_filename_with_default_separator(self, course_id, expected_filename, expected_filename_with_hyphen):
    """Parameterized check of safe-filename conversion with default and '-' separators."""
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(
        opaque_key_util.get_filename_safe_course_id(course_id),
        expected_filename,
    )
    self.assertEqual(
        opaque_key_util.get_filename_safe_course_id(course_id, '-'),
        expected_filename_with_hyphen,
    )
def test_get_filename_for_invalid_id(self):
    """An invalid legacy course id is still converted to a filename-safe string."""
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(
        opaque_key_util.get_filename_safe_course_id(INVALID_LEGACY_COURSE_ID),
        "org_course_id_course_run",
    )
    self.assertEqual(
        opaque_key_util.get_filename_safe_course_id(INVALID_LEGACY_COURSE_ID, '-'),
        "org-course_id-course_run",
    )
def test_get_filename_for_nonascii_id(self):
    """Non-ASCII characters are replaced with U+FFFD in the safe filename."""
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(
        opaque_key_util.get_filename_safe_course_id(NONASCII_LEGACY_COURSE_ID),
        u"org_course\ufffd_id_course_run",
    )
    self.assertEqual(
        opaque_key_util.get_filename_safe_course_id(NONASCII_LEGACY_COURSE_ID, '-'),
        u"org-course\ufffd_id-course_run",
    )
def test_get_filename_with_colon(self):
    """Colons inside course/run parts are replaced by the separator."""
    course_id = unicode(CourseLocator(org='org', course='course:id', run='course:run'))
    # NOTE(review): the first assertion checks VALID_COURSE_ID, not the colon id
    # built above — preserved as-is; confirm this is intentional.
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(
        opaque_key_util.get_filename_safe_course_id(VALID_COURSE_ID),
        "org_course_id_course_run",
    )
    self.assertEqual(
        opaque_key_util.get_filename_safe_course_id(course_id, '-'),
        "org-course-id-course-run",
    )
def setUp(self):
    """Create temp scratch space and derive the common paths used by the tests."""
    super(DeidentificationAcceptanceTest, self).setUp()
    scratch_dir = tempfile.mkdtemp()
    self.temporary_dir = scratch_dir
    self.addCleanup(shutil.rmtree, scratch_dir)
    self.dump_root = url_path_join(self.test_src, 'course_exports', 'raw')
    self.filename_safe_course_id = get_filename_safe_course_id(self.COURSE_ID)
    self.test_gpg_key_dir = url_path_join(self.test_root, 'gpg-keys')
def __init__(self, *args, **kwargs):
    """Precompute the filename-safe course id and the course-files base URL."""
    super(ObfuscatedPackageTask, self).__init__(*args, **kwargs)
    safe_id = opaque_key_util.get_filename_safe_course_id(self.course)
    self.filename_safe_course_id = safe_id
    self.course_files_url = url_path_join(
        self.obfuscated_output_root,
        self.format_version,
        safe_id,
    )
def setUp(self):
    """Allocate a temp directory and compute paths shared across the test methods."""
    super(ObfuscationAcceptanceTest, self).setUp()
    scratch_dir = tempfile.mkdtemp()
    self.temporary_dir = scratch_dir
    self.addCleanup(shutil.rmtree, scratch_dir)
    self.dump_root = url_path_join(self.test_src, 'course_exports', 'raw')
    self.filename_safe_course_id = get_filename_safe_course_id(self.COURSE_ID)
    self.test_gpg_key_dir = url_path_join(self.test_root, 'gpg-keys')
def __init__(self, *args, **kwargs):
    """Locate the latest state dump for the course and set data/output directories.

    Raises:
        Exception: if no auth_userprofile dump file exists under dump_root.
    """
    super(ObfuscatedCourseDumpTask, self).__init__(*args, **kwargs)
    filename_safe_course_id = opaque_key_util.get_filename_safe_course_id(self.course)
    dump_path = url_path_join(self.dump_root, filename_safe_course_id, 'state')
    auth_userprofile_targets = PathSetTask([dump_path], ['*auth_userprofile*']).output()
    # TODO: Refactor out this logic of getting latest file. Right now we expect a date, so we use that
    dates = [target.path.rsplit('/', 2)[-2] for target in auth_userprofile_targets]
    # TODO: Make the date a parameter that defaults to the most recent, but allows the user to override?
    # Fail fast with a clear message rather than a cryptic 'index out of range' error.
    if not dates:  # idiomatic emptiness test instead of len(dates) == 0
        raise Exception('Missing auth_userprofile data file in {}'.format(dump_path))
    # max() selects the latest ISO-style date without sorting the whole list.
    latest_date = max(dates)
    self.data_directory = url_path_join(self.dump_root, filename_safe_course_id, 'state', latest_date)
    self.output_directory = url_path_join(self.output_root, filename_safe_course_id, 'state', latest_date)
def setUp(self):
    """Prepare a temporary working directory and the URLs used by the acceptance test."""
    super(ObfuscationAcceptanceTest, self).setUp()
    self.temporary_dir = tempfile.mkdtemp()
    self.addCleanup(shutil.rmtree, self.temporary_dir)
    self.filename_safe_course_id = get_filename_safe_course_id(self.COURSE_ID)
    self.dump_root = url_path_join(self.test_src, "course_exports", "raw")
    self.test_gpg_key_dir = url_path_join(self.test_root, "gpg-keys")
def test_get_filename_for_nonascii_id(self):
    """Non-ASCII characters collapse to separators for valid and invalid legacy ids."""
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(
        opaque_key_util.get_filename_safe_course_id(VALID_NONASCII_LEGACY_COURSE_ID),
        u"org_cours__id_course_run",
    )
    self.assertEqual(
        opaque_key_util.get_filename_safe_course_id(VALID_NONASCII_LEGACY_COURSE_ID, '-'),
        u"org-cours-_id-course_run",
    )
    self.assertEqual(
        opaque_key_util.get_filename_safe_course_id(INVALID_NONASCII_LEGACY_COURSE_ID),
        u"org_course__id_course_run",
    )
    self.assertEqual(
        opaque_key_util.get_filename_safe_course_id(INVALID_NONASCII_LEGACY_COURSE_ID, '-'),
        u"org-course-_id-course_run",
    )
def run_task(self):
    """Runs the task with fake targets."""
    output_archive_root = tempfile.mkdtemp()
    self.addCleanup(shutil.rmtree, output_archive_root)
    with tempfile.NamedTemporaryFile() as tmp_input_archive:
        # Pack the prepared course tree into a gzipped tarball that serves as the
        # task's sole input; arcname='' keeps paths relative to the archive root.
        with tarfile.open(mode='w:gz', fileobj=tmp_input_archive) as input_archive_file:
            input_archive_file.add(self.archive_root, arcname='')
        # Rewind so the task can read the archive from the beginning.
        tmp_input_archive.seek(0)
        # All constructor parameters are ignored by the stubbed plumbing below.
        task = obfuscate.CourseContentTask(
            course=sentinel.ignored,
            output_directory=sentinel.ignored,
            data_directory=sentinel.ignored,
            auth_user_path=sentinel.ignored,
            auth_userprofile_path=sentinel.ignored,
        )
        # Stub the luigi input/output: the tarball is the only input; the task's
        # output is captured in an in-memory FakeTarget buffer.
        fake_input = {'data': [LocalTarget(path=tmp_input_archive.name)]}
        task.input = MagicMock(return_value=fake_input)
        output_target = FakeTarget()
        task.output = MagicMock(return_value=output_target)
        task.user_info_requirements = get_mock_user_info_requirements()
        # Clear any cached user info from earlier tests before running.
        reset_user_info_for_testing()
        task.run()
        # Unpack the obfuscated result so assertions can inspect files on disk.
        with tarfile.open(mode='r:gz', fileobj=output_target.buffer) as output_archive_file:
            output_archive_file.extractall(output_archive_root)
        self.output_course_root = os.path.join(output_archive_root, get_filename_safe_course_id(self.COURSE_ID))
def output_path_for_key(self, course_id):
    """Return the output URL for a course's validated-enrollment log."""
    safe_id = opaque_key_util.get_filename_safe_course_id(course_id, '_')
    basename = u'{course_id}_enroll_validated_{dumpdate}.log.gz'.format(
        course_id=safe_id,
        dumpdate=self.dump_date,
    )
    return url_path_join(self.output_root, basename)
def output_path_for_key(self, course_id):
    """Build the per-course studentmodule SQL dump filename under output_root."""
    suffix = (self.output_suffix + '-') if self.output_suffix else ''
    filename = "{course_id}-courseware_studentmodule-{suffix}analytics.sql".format(
        course_id=opaque_key_util.get_filename_safe_course_id(course_id, '-'),
        suffix=suffix,
    )
    return url_path_join(self.output_root, filename)
def output_path_for_key(self, course_id):
    """Return the SQL export path for one course's studentmodule dump."""
    if self.output_suffix:
        suffix = self.output_suffix + '-'
    else:
        suffix = ''
    safe_id = opaque_key_util.get_filename_safe_course_id(course_id, '-')
    basename = "{course_id}-courseware_studentmodule-{suffix}analytics.sql".format(
        course_id=safe_id,
        suffix=suffix,
    )
    return url_path_join(self.output_root, basename)
def requires_local(self):
    """Declare local requirements: auth_user dumps plus an optional whitelist URL."""
    safe_id = opaque_key_util.get_filename_safe_course_id(self.course)
    state_url = url_path_join(self.dump_root, safe_id, 'state')
    results = {'auth_user': PathSetTask([state_url], ['*-auth_user-*'])}
    # Only treat the whitelist as an external URL when it carries a path component
    # (i.e. when it is more than a bare filename).
    if os.path.basename(self.explicit_event_whitelist) != self.explicit_event_whitelist:
        results['explicit_events'] = ExternalURL(url=self.explicit_event_whitelist)
    return results
def create_paths(self, course, dates):
    """Setups directory structure and files as expected by DeidentifyCourseDumpTask task."""
    self.temp_rootdir = tempfile.mkdtemp()
    self.dump_root = os.path.join(self.temp_rootdir, "dump_root")
    self.output_root = os.path.join(self.temp_rootdir, "output_root")
    safe_id = opaque_key_util.get_filename_safe_course_id(course)
    for dump_date in dates:
        path = os.path.join(self.dump_root, safe_id, 'state', dump_date, 'auth_userprofile_file')
        os.makedirs(os.path.dirname(path))
        # Touch an empty marker file for the task under test to discover.
        open(path, 'a').close()
def output_path_for_key(self, key):
    """Events-log path: <output_root>/<safe_course>/events/<safe_course>-events-<key>.log.gz."""
    safe_id = opaque_key_util.get_filename_safe_course_id(self.course)
    basename = '{course}-events-{date}.log.gz'.format(course=safe_id, date=key)
    return url_path_join(self.output_root, safe_id, 'events', basename)
def output_path_for_key(self, key):
    """Build the dated events-log path for a (date, course_id) key."""
    date, course_id = key
    safe_id = opaque_key_util.get_filename_safe_course_id(course_id)
    basename = "{course}-events-{date}.log.gz".format(course=safe_id, date=date)
    return url_path_join(self.output_root, safe_id, "events", basename)
def output_path_for_key(self, course_id):
    """
    Match the course folder hierarchy that is expected by the instructor dashboard.

    The instructor dashboard expects the file to be stored in a folder named
    sha1(course_id). All files in that directory will be displayed on the
    instructor dashboard for that course.
    """
    course_hash = hashlib.sha1(course_id).hexdigest()
    safe_id = opaque_key_util.get_filename_safe_course_id(course_id, '_')
    report_name = u'{course_id}_answer_distribution.csv'.format(course_id=safe_id)
    return url_path_join(self.output_root, course_hash, report_name)
def output_path_for_key(self, key):
    """Return the events-log URL for the given (date, course_id) pair."""
    date, course_id = key
    safe_id = opaque_key_util.get_filename_safe_course_id(course_id)
    return url_path_join(
        self.output_root,
        safe_id,
        "events",
        '{course}-events-{date}.log.gz'.format(course=safe_id, date=date),
    )
def create_paths(self, course, dates):
    """Setups directory structure and files as expected by ObfuscateCourseDumpTask task."""
    root = tempfile.mkdtemp()
    self.temp_rootdir = root
    self.dump_root = os.path.join(root, "dump_root")
    self.output_root = os.path.join(root, "output_root")
    safe_id = get_filename_safe_course_id(course)
    for dump_date in dates:
        target = os.path.join(self.dump_root, safe_id, 'state', dump_date, 'auth_userprofile_file')
        os.makedirs(os.path.dirname(target))
        # Create an empty placeholder file for the task to find.
        open(target, 'a').close()
def __init__(self, *args, **kwargs):
    """Find the most recent dated state dump for the course and set data/output dirs.

    Raises:
        Exception: if no auth_userprofile dump file is found under dump_root.
    """
    super(DeidentifiedCourseDumpTask, self).__init__(*args, **kwargs)
    filename_safe_course_id = opaque_key_util.get_filename_safe_course_id(self.course)
    dump_path = url_path_join(self.dump_root, filename_safe_course_id, 'state')
    auth_userprofile_targets = PathSetTask([dump_path], ['*auth_userprofile*']).output()
    # TODO: Refactor out this logic of getting latest file. Right now we expect a date, so we use that
    dates = [re.search(r"\d{4}-\d{2}-\d{2}", target.path).group() for target in auth_userprofile_targets]
    # TODO: Make the date a parameter that defaults to the most recent, but allows the user to override?
    # Guard against an empty dump directory: fail with a clear message instead of a
    # cryptic IndexError, matching ObfuscatedCourseDumpTask.__init__.
    if not dates:
        raise Exception('Missing auth_userprofile data file in {}'.format(dump_path))
    latest_date = sorted(dates)[-1]
    self.data_directory = url_path_join(self.dump_root, filename_safe_course_id, 'state', latest_date)
    self.output_directory = url_path_join(self.output_root, filename_safe_course_id, 'state', latest_date)
def output_path_for_key(self, course_id):
    """
    Match the course folder hierarchy that is expected by the Analytics API.

    The Analytics API expects the problem response files to be stored in a folder
    named by the course_id, so we sanitize it to create the filename.
    """
    # Guard clause: falsy course ids have no output location.
    if not course_id:
        return None
    safe_course_id = get_filename_safe_course_id(course_id)
    filename = self.report_filename_template.format(course_id=safe_course_id)
    return url_path_join(self.output_root, filename)
def output_path_for_key(self, key):
    """Return the URL of the course's events log for the date given by `key`."""
    safe_id = opaque_key_util.get_filename_safe_course_id(self.course)
    return url_path_join(
        self.output_root,
        safe_id,
        'events',
        '{course}-events-{date}.log.gz'.format(course=safe_id, date=key),
    )
def __init__(self, *args, **kwargs):
    """Locate the latest state dump for the course and set data/output directories.

    Raises:
        Exception: if no auth_userprofile dump file exists under dump_root.
    """
    super(ObfuscatedCourseDumpTask, self).__init__(*args, **kwargs)
    filename_safe_course_id = opaque_key_util.get_filename_safe_course_id(self.course)
    dump_path = url_path_join(self.dump_root, filename_safe_course_id, 'state')
    auth_userprofile_targets = PathSetTask([dump_path], ['*auth_userprofile*']).output()
    # TODO: Refactor out this logic of getting latest file. Right now we expect a date, so we use that
    dates = [target.path.rsplit('/', 2)[-2] for target in auth_userprofile_targets]
    # TODO: Make the date a parameter that defaults to the most recent, but allows the user to override?
    # Fail fast with a clear message rather than a cryptic 'index out of range' error.
    if not dates:  # idiomatic emptiness test instead of len(dates) == 0
        raise Exception('Missing auth_userprofile data file in {}'.format(dump_path))
    # max() selects the latest ISO-style date without sorting the whole list.
    latest_date = max(dates)
    self.data_directory = url_path_join(self.dump_root, filename_safe_course_id, 'state', latest_date)
    self.output_directory = url_path_join(self.output_root, filename_safe_course_id, 'state', latest_date)
def setUp(self):
    """Create a minimal fake course export (course.xml plus an empty policy) on disk."""
    self.archive_root = tempfile.mkdtemp()
    self.addCleanup(shutil.rmtree, self.archive_root)
    safe_course = get_filename_safe_course_id(self.COURSE_ID)
    self.course_root = os.path.join(self.archive_root, safe_course)
    os.makedirs(self.course_root)
    course_xml_path = os.path.join(self.course_root, 'course.xml')
    with open(course_xml_path, 'w') as course_file:
        course_file.write('<course url_name="foo" org="edX" course="DemoX"/>')
    policy_dir_path = os.path.join(self.course_root, 'policies', 'foo')
    os.makedirs(policy_dir_path)
    with open(os.path.join(policy_dir_path, 'policy.json'), 'w') as policy_file:
        json.dump({}, policy_file)
def test_database_export(self):
    """End-to-end: export each course's table and validate the legacy exporter output."""
    # An S3 bucket to store the output in. assertIn gives a clearer failure message
    # than a bare assert and is not stripped when running under python -O.
    self.assertIn('exporter_output_bucket', self.config)
    self.load_data_from_file()
    self.run_export_task()
    for course_id in [self.COURSE_ID2, self.COURSE_ID]:
        org_id = get_org_id_for_course(course_id).lower()
        self.run_legacy_exporter(org_id, course_id)
        exported_filename = '{safe_course_id}-{table}-{suffix}-analytics.sql'.format(
            safe_course_id=get_filename_safe_course_id(course_id, '-'),
            table=self.TABLE,
            suffix=self.ENVIRONMENT,
        )
        self.validate_exporter_output(org_id, exported_filename)
def test_database_export(self):
    """End-to-end: export each course's table and validate the legacy exporter output."""
    # An S3 bucket to store the output in. assertIn gives a clearer failure message
    # than a bare assert and is not stripped when running under python -O.
    self.assertIn('exporter_output_bucket', self.config)
    self.load_data_from_file()
    self.run_export_task()
    for course_id in [self.COURSE_ID2, self.COURSE_ID]:
        org_id = get_org_id_for_course(course_id).lower()
        self.run_legacy_exporter(org_id, course_id)
        exported_filename = '{safe_course_id}-{table}-{suffix}-analytics.sql'.format(
            safe_course_id=get_filename_safe_course_id(course_id, '-'),
            table=self.TABLE,
            suffix=self.ENVIRONMENT,
        )
        self.validate_exporter_output(org_id, exported_filename)
def setUp(self):
    """Lay out a throwaway course archive containing a course.xml and an empty policy."""
    root = tempfile.mkdtemp()
    self.archive_root = root
    self.addCleanup(shutil.rmtree, root)
    course_id_filename = get_filename_safe_course_id(self.COURSE_ID)
    self.course_root = os.path.join(root, course_id_filename)
    os.makedirs(self.course_root)
    with open(os.path.join(self.course_root, 'course.xml'), 'w') as course_file:
        course_file.write('<course url_name="foo" org="edX" course="DemoX"/>')
    policy_dir = os.path.join(self.course_root, 'policies', 'foo')
    os.makedirs(policy_dir)
    with open(os.path.join(policy_dir, 'policy.json'), 'w') as policy_file:
        json.dump({}, policy_file)
def run_task(self):
    """Runs the task with fake targets."""
    output_archive_root = tempfile.mkdtemp()
    self.addCleanup(shutil.rmtree, output_archive_root)
    with tempfile.NamedTemporaryFile() as tmp_input_archive:
        # Pack the prepared course tree into a gzipped tarball that serves as the
        # task's sole input; arcname='' keeps paths relative to the archive root.
        with tarfile.open(mode='w:gz', fileobj=tmp_input_archive) as input_archive_file:
            input_archive_file.add(self.archive_root, arcname='')
        # Rewind so the task can read the archive from the beginning.
        tmp_input_archive.seek(0)
        # All constructor parameters are ignored by the stubbed plumbing below.
        task = obfuscate.CourseContentTask(
            course=sentinel.ignored,
            output_directory=sentinel.ignored,
            data_directory=sentinel.ignored,
            auth_user_path=sentinel.ignored,
            auth_userprofile_path=sentinel.ignored,
        )
        # Stub the luigi input/output: the tarball is the only input; the task's
        # output is captured in an in-memory FakeTarget buffer.
        fake_input = {'data': [LocalTarget(path=tmp_input_archive.name)]}
        task.input = MagicMock(return_value=fake_input)
        output_target = FakeTarget()
        task.output = MagicMock(return_value=output_target)
        task.user_info_requirements = get_mock_user_info_requirements()
        # Clear any cached user info from earlier tests before running.
        reset_user_info_for_testing()
        task.run()
        # Unpack the obfuscated result so assertions can inspect files on disk.
        with tarfile.open(
                mode='r:gz', fileobj=output_target.buffer) as output_archive_file:
            output_archive_file.extractall(output_archive_root)
        self.output_course_root = os.path.join(
            output_archive_root, get_filename_safe_course_id(self.COURSE_ID))
def test_get_filename(self):
    """A valid course id maps to an underscore- (or custom-) separated safe name."""
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(opaque_key_util.get_filename_safe_course_id(VALID_COURSE_ID), "org_course_id_course_run")
    self.assertEqual(opaque_key_util.get_filename_safe_course_id(VALID_COURSE_ID, '-'), "org-course_id-course_run")
def test_get_filename_with_colon(self):
    """Colons inside course/run parts are replaced by the separator."""
    course_id = unicode(CourseLocator(org='org', course='course:id', run='course:run'))
    # NOTE(review): the first assertion checks VALID_COURSE_ID, not the colon id
    # built above — preserved as-is; confirm this is intentional.
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(opaque_key_util.get_filename_safe_course_id(VALID_COURSE_ID), "org_course_id_course_run")
    self.assertEqual(opaque_key_util.get_filename_safe_course_id(course_id, '-'), "org-course-id-course-run")
def test_get_filename_with_default_separator(self, course_id, expected_filename, expected_filename_with_hyphen):
    """Parameterized check of safe-filename conversion with default and '-' separators."""
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(opaque_key_util.get_filename_safe_course_id(course_id), expected_filename)
    self.assertEqual(opaque_key_util.get_filename_safe_course_id(course_id, '-'), expected_filename_with_hyphen)
def output(self):
    """Target for the obfuscated course's metadata_file.json."""
    safe_id = opaque_key_util.get_filename_safe_course_id(self.course)
    metadata_url = url_path_join(
        self.obfuscated_output_root,
        self.format_version,
        safe_id,
        'metadata_file.json',
    )
    return get_target_from_url(metadata_url)
def __init__(self, *args, **kwargs):
    """Cache the filename-safe course id and the base URL of the course files."""
    super(ObfuscatedPackageTask, self).__init__(*args, **kwargs)
    self.filename_safe_course_id = opaque_key_util.get_filename_safe_course_id(self.course)
    self.course_files_url = url_path_join(
        self.obfuscated_output_root,
        self.format_version,
        self.filename_safe_course_id,
    )
def requires(self):
    """Require every file under the course's events dump directory."""
    safe_id = opaque_key_util.get_filename_safe_course_id(self.course)
    events_url = url_path_join(self.dump_root, safe_id, 'events')
    return PathSetTask([events_url], ['*'])
def output(self):
    """Return the target holding this course's obfuscation metadata file."""
    safe_id = opaque_key_util.get_filename_safe_course_id(self.course)
    return get_target_from_url(
        url_path_join(self.obfuscated_output_root, self.format_version, safe_id, 'metadata_file.json')
    )
def requires(self):
    """Depend on all files in the course's events dump directory."""
    course_dir = opaque_key_util.get_filename_safe_course_id(self.course)
    return PathSetTask([url_path_join(self.dump_root, course_dir, 'events')], ['*'])