def _get_required_tasks(self):
    """Internal method to actually calculate required tasks once."""
    start_date = self.interval.date_a  # pylint: disable=no-member
    end_date = self.interval.date_b  # pylint: disable=no-member
    table_name = "student_courseenrollment"
    source_root = url_path_join(self.warehouse_path, table_name)
    today_datestring = datetime.datetime.utcnow().strftime('%Y-%m-%d')
    current_date = start_date
    while current_date <= end_date:
        datestring = current_date.strftime('%Y-%m-%d')
        current_date += datetime.timedelta(days=1)
        src_datestring = "dt={}".format(datestring)
        source_dir = url_path_join(source_root, src_datestring)
        target = get_target_from_url(source_dir)
        output_dir = url_path_join(self.output_root, datestring)
        if datestring == today_datestring:
            yield CreateEnrollmentValidationEventsForTodayTask(
                source_dir=source_dir,
                output_root=output_dir,
                n_reduce_tasks=self.n_reduce_tasks,
                credentials=self.credentials,
            )
        elif target.exists():
            yield CreateEnrollmentValidationEventsTask(
                source_dir=source_dir,
                output_root=output_dir,
                n_reduce_tasks=self.n_reduce_tasks,
            )
def setUp(self):
    super(ObfuscationAcceptanceTest, self).setUp()
    self.temporary_dir = tempfile.mkdtemp()
    self.addCleanup(shutil.rmtree, self.temporary_dir)
    self.dump_root = url_path_join(self.test_src, 'course_exports', 'raw')
    self.filename_safe_course_id = get_filename_safe_course_id(self.COURSE_ID)
    self.test_gpg_key_dir = url_path_join(self.test_root, 'gpg-keys')
def test_enrollment_validation(self):
    # Initial setup.
    context = {
        'days': lambda n: datetime.timedelta(days=n),
        'start_date': self.START_DATE,
    }
    self.upload_tracking_log(self.INPUT_FILE, self.START_DATE, template_context=context)
    self.execute_sql_fixture_file(self.SQL_FIXTURE)
    self.test_validate = url_path_join(self.test_root, 'validate')

    # Run once.  This will generate the new validation events, but
    # will not include them in the validation run (because the
    # requirements for the validation run are computed before any
    # validation events are generated).
    self.test_first_run = url_path_join(self.test_out, 'first_run')
    self.launch_task(self.test_first_run, run_with_validation_events=False)

    # Check that validation took place.
    self.check_validation_events()

    # Run again, with the validation events generated by the first run.
    self.test_second_run = url_path_join(self.test_out, 'second_run')
    self.launch_task(self.test_second_run)

    # Check that synthetic events were created.
    self.check_synthetic_events(self.test_second_run)

    # Run again, with the synthetic events generated by the second run.
    self.test_third_run = url_path_join(self.test_out, 'third_run')
    self.launch_task(self.test_third_run, extra_source=self.test_second_run)

    # Check that no events are output.
    self.check_no_synthetic_events(self.test_third_run)
def setUp(self): """Copy the input data into place.""" super(CourseListPartitionTaskAcceptanceTest, self).setUp() # Copy course list REST API data file_name = 'course_list.json' daily_partition = self.DATE.strftime(self.DAILY_PARTITION_FORMAT) self.upload_file(url_path_join(self.data_dir, 'input', file_name), url_path_join(self.warehouse_path, 'course_list_raw', "dt=" + daily_partition, file_name))
def setUp(self):
    super(InternalReportingUserCourseLoadAcceptanceTest, self).setUp()
    self.upload_file(
        os.path.join(self.data_dir, 'input', 'course_catalog.json'),
        url_path_join(self.warehouse_path, 'course_catalog_raw', 'dt=' + self.DATE, 'course_catalog.json')
    )
    self.upload_file(
        os.path.join(self.data_dir, 'input', 'programs.json'),
        url_path_join(self.warehouse_path, 'programs_raw', 'dt=' + self.DATE, 'programs.json')
    )
def run_obfuscation_task(self):
    """Run ObfuscatedCourseTask."""
    self.task.launch([
        'ObfuscatedCourseTask',
        '--course', self.filename_safe_course_id,
        '--dump-root', self.dump_root,
        '--obfuscated-output-root', url_path_join(self.test_root, 'obfuscated-output'),
        '--format-version', self.FORMAT_VERSION,
        '--pipeline-version', self.PIPELINE_VERSION,
        '--auth-user-path', url_path_join(self.test_root, 'warehouse', 'auth_user'),
        '--auth-userprofile-path', url_path_join(self.test_root, 'warehouse', 'auth_userprofile'),
    ])
def test_answer_distribution(self):
    self.task.launch([
        'AnswerDistributionOneFilePerCourseTask',
        '--src', self.test_src,
        '--dest', url_path_join(self.test_root, 'dst'),
        '--name', 'test',
        '--output-root', self.test_out,
        '--include', '"*"',
        '--manifest', url_path_join(self.test_root, 'manifest.txt'),
        '--base-input-format', self.input_format,
        '--lib-jar', self.oddjob_jar,
        '--n-reduce-tasks', str(self.NUM_REDUCERS),
    ])
    self.validate_output()
def validate_hive(self): """Ensure hive partition was created as expected.""" table_name = 'course_blocks' output_dir = url_path_join(self.data_dir, 'output', table_name) for file_name in ('_SUCCESS', 'part-00000', 'part-00001'): actual_output_file = url_path_join(self.warehouse_path, table_name, self.partition, file_name) actual_output_target = get_target_for_local_server(actual_output_file) self.assertTrue(actual_output_target.exists(), '{} not created'.format(file_name)) actual_output = actual_output_target.open('r').read() expected_output_file = url_path_join(output_dir, file_name) expected_output_target = get_target_for_local_server(expected_output_file) expected_output = expected_output_target.open('r').read() self.assertEqual(actual_output, expected_output)
def test_answer_distribution_mysql(self):
    self.task.launch([
        'AnswerDistributionToMySQLTaskWorkflow',
        '--src', self.test_src,
        '--dest', url_path_join(self.test_root, 'dst'),
        '--name', 'test',
        '--include', '"*"',
        '--manifest', url_path_join(self.test_root, 'manifest.txt'),
        '--base-input-format', self.input_format,
        '--lib-jar', self.oddjob_jar,
        '--n-reduce-tasks', str(self.NUM_REDUCERS),
        '--credentials', self.export_db.credentials_file_url,
    ])
    self.validate_output()
def __init__(self, *args, **kwargs):
    super(ObfuscatedCourseDumpTask, self).__init__(*args, **kwargs)

    filename_safe_course_id = opaque_key_util.get_filename_safe_course_id(self.course)
    dump_path = url_path_join(self.dump_root, filename_safe_course_id, 'state')
    auth_userprofile_targets = PathSetTask([dump_path], ['*auth_userprofile*']).output()
    # TODO: Refactor out this logic of getting latest file.  Right now we expect a date, so we use that.
    dates = [target.path.rsplit('/', 2)[-2] for target in auth_userprofile_targets]
    # TODO: Make the date a parameter that defaults to the most recent, but allows the user to override?
    # This should return an error if no data is found, rather than getting a cryptic
    # 'index out of range' error.
    if len(dates) == 0:
        raise Exception('Missing auth_userprofile data file in {}'.format(dump_path))
    latest_date = sorted(dates)[-1]
    self.data_directory = url_path_join(self.dump_root, filename_safe_course_id, 'state', latest_date)
    self.output_directory = url_path_join(self.output_root, filename_safe_course_id, 'state', latest_date)
def insert_source_task(self): """ We are already exporting vertica tables to S3 using SqoopImportFromVertica through VerticaSchemaToBigQueryTask workflow, so we specify ExternalURL here instead. In the future we can change this to a SqoopImportFromVertica task. """ partition_path_spec = HivePartition('dt', self.date).path_spec intermediate_warehouse_path = url_path_join(self.warehouse_path, 'import/vertica/sqoop/') url = url_path_join(intermediate_warehouse_path, self.vertica_warehouse_name, self.vertica_schema_name, self.table_name, partition_path_spec) + '/' return ExternalURL(url=url)
def output(self):
    return get_target_from_url(
        url_path_join(
            self.hive_partition_path('course_list_raw', partition_value=self.partition_value),
            'course_list.json'
        )
    )
def output_path_for_key(self, course_id):
    filename_safe_course_id = opaque_key_util.get_filename_safe_course_id(course_id, '_')
    filename = u'{course_id}_enroll_validated_{dumpdate}.log.gz'.format(
        course_id=filename_safe_course_id,
        dumpdate=self.dump_date,
    )
    return url_path_join(self.output_root, filename)
def test_event_log_exports_using_manifest(self):
    config_override = {
        'manifest': {
            'threshold': 1
        }
    }
    folders = {
        'edx': self.PROD_FOLDER,
        'edge': self.EDGE_FOLDER,
    }
    for environment in ['edx', 'edge']:
        self.task.launch([
            'EventExportTask',
            '--source', as_list_param(url_path_join(self.test_src, environment)),
            '--output-root', self.test_out,
            '--config', self.test_config,
            '--environment', environment,
            '--interval', '2014-05',
            '--gpg-key-dir', self.test_gpg_key_dir,
            '--gpg-master-key', '*****@*****.**',
            '--required-path-text', folders[environment],
            '--n-reduce-tasks', str(self.NUM_REDUCERS),
        ], config_override)

    self.validate_output()
def insert_source_task(self):
    # Get the columns to request from Sqoop, as a side effect of getting the Vertica columns.
    # The Vertica column names are quoted, so strip the quotes off.
    column_names = [name[1:-1] for (name, _) in self.columns]
    partition_path_spec = HivePartition('dt', self.date.isoformat()).path_spec
    destination = url_path_join(
        self.warehouse_path,
        self.warehouse_subdirectory,
        self.database,
        self.table_name,
        partition_path_spec
    ) + '/'
    # The arguments here to SqoopImportFromMysql should be the same as for BigQuery.
    # The old format used mysql_delimiters and direct mode.  We have now removed direct mode,
    # and that gives us more choices for other settings.  We have already changed null_string and
    # field termination, and we hardcode here the replacement of delimiters (like newlines) with
    # spaces (using Sqoop's --hive-delims-replacement option).
    # We could also set other SqoopImportTask parameters: escaped_by, enclosed_by,
    # optionally_enclosed_by.  If we wanted to model 'mysql_delimiters=True', we would set
    # escaped-by: \ and optionally-enclosed-by: '.  But instead we use the defaults for them,
    # so that there is no escaping or enclosing.
    return SqoopImportFromMysql(
        table_name=self.table_name,
        credentials=self.db_credentials,
        database=self.database,
        destination=destination,
        overwrite=self.overwrite,
        mysql_delimiters=False,
        fields_terminated_by=self.field_delimiter,
        null_string=self.null_marker,
        delimiter_replacement=' ',
        direct=False,
        columns=column_names,
    )
def validate_problem_response_report(self):
    """Run the ProblemResponseReportWorkflow task and test the output."""
    marker_path = url_path_join(self.test_out, 'marker-{}'.format(str(time.time())))
    report_date = self.DATE.strftime('%Y-%m-%d')

    # The test tracking.log file contains problem_check events for 2016-09-06, 09-07, and 09-08.
    # However, to test the interval parameter propagation, we deliberately exclude all but the
    # 2016-09-07 events.
    #
    # This is important because this task can be run multiple times a day, and so must be
    # configurable to have an interval-end of "tomorrow", which will include all events from today.
    interval_start = '2016-09-07'
    interval_end = '2016-09-08'

    self.task.launch([
        'ProblemResponseReportWorkflow',
        '--interval-start', interval_start,
        '--interval-end', interval_end,
        '--date', report_date,
        '--marker', marker_path,
        '--n-reduce-tasks', str(self.NUM_REDUCERS),
    ])

    self.maxDiff = None
    self.validate_marker(marker_path)
    self.validate_hive()
    self.validate_reports()
def check_validation_events(self):
    """Confirm that validation data was properly created."""
    validate_output_dir = url_path_join(self.test_validate, str(self.END_DATE))
    outputs = self.get_targets_from_remote_path(validate_output_dir)

    # There are 2 courses in the test data.
    self.assertEqual(len(outputs), 2)
def insert_source_task(self): hive_table = "user_activity_by_user" # User activity data for each day is stored in a dated directory. # We want to be able to load all that data into Vertica in one go, hence we use # a wildcard('*') here. url = url_path_join(self.warehouse_path, hive_table) + '/dt=*/' return ExternalURL(url=url)
def validate_hive(self): """Ensure hive partition was created.""" hourly_partition = self.DATE.strftime(self.HOURLY_PARTITION_FORMAT) hive_partition = url_path_join(self.warehouse_path, "problem_response_location", "dt=" + hourly_partition) partition_target = get_target_for_local_server(hive_partition) self.assertTrue(partition_target.exists())
def output(self):
    return get_target_from_url(
        url_path_join(
            self.hive_partition_path('program_course_order', partition_value=self.date),
            '{0}.tsv'.format('program_course_order')
        )
    )
def output(self):
    return get_target_from_url(url_path_join(
        self.output_root,
        'transaction',
        'dt=' + self.import_date.isoformat(),  # pylint: disable=no-member
        'transactions.csv'
    ))
def requires(self):
    yield ExternalURL(url=self.vertica_credentials)
    yield ExternalURL(url=self.gcp_credentials)

    if self.bigquery_dataset is None:
        self.bigquery_dataset = self.vertica_schema_name

    intermediate_warehouse_path = url_path_join(self.s3_warehouse_path, 'import/vertica/sqoop/')

    query = "SELECT table_name FROM all_tables WHERE schema_name='{schema_name}' AND table_type='TABLE' " \
            "".format(schema_name=self.vertica_schema_name)
    table_list = [row[0] for row in get_vertica_results(self.vertica_credentials, query)]

    for table_name in table_list:
        if not self.should_exclude_table(table_name):
            yield LoadVerticaTableToBigQuery(
                date=self.date,
                overwrite=self.overwrite,
                intermediate_warehouse_path=intermediate_warehouse_path,
                dataset_id=self.bigquery_dataset,
                credentials=self.gcp_credentials,
                max_bad_records=self.max_bad_records,
                table_name=table_name,
                vertica_schema_name=self.vertica_schema_name,
                vertica_warehouse_name=self.vertica_warehouse_name,
                vertica_credentials=self.vertica_credentials,
                exclude=self.exclude,
            )
def output(self):
    if len(self.input()['data']) == 0:
        raise IOError("Course File '{filename}' not found for course '{course}'".format(
            filename=self.file_pattern, course=self.course
        ))
    output_filename = os.path.basename(self.input()['data'][0].path)
    return get_target_from_url(url_path_join(self.output_directory, output_filename))
def test_end_to_end_without_vertica(self):
    # Similar to test_end_to_end, but this excludes the Vertica part and checks data values,
    # not just data shape.
    table_name = 'reconciled_order_transactions'
    output_root = url_path_join(
        self.warehouse_path, table_name, 'dt=' + self.UPPER_BOUND_DATE
    ) + '/'
    self.task.launch([
        'ReconcileOrdersAndTransactionsTask',
        '--import-date', self.UPPER_BOUND_DATE,
        '--n-reduce-tasks', str(self.NUM_REDUCERS),
        '--output-root', output_root,
    ])

    final_output_task = LoadInternalReportingOrderTransactionsToWarehouse(
        import_date=luigi.DateParameter().parse(self.UPPER_BOUND_DATE)
    )
    columns = [x[0] for x in final_output_task.columns]

    expected_output_csv = os.path.join(self.data_dir, 'output', 'expected_financial_report.csv')
    expected = pandas.read_csv(expected_output_csv, parse_dates=True)

    raw_output = self.read_dfs_directory(output_root)
    output = StringIO(raw_output.replace('\t\\N', '\t'))
    data = pandas.read_table(output, header=None, names=columns, parse_dates=True)

    # Re-order the dataframes for a consistent comparison:
    for frame in (data, expected):
        frame.sort(['payment_ref_id', 'transaction_type'], inplace=True, ascending=[True, False])
        frame.reset_index(drop=True, inplace=True)

    self.assert_data_frames_equal(data, expected)
def multi_output_reducer(self, key, values, output_file):
    """
    Write values to the appropriate file as determined by the key.

    Write to the encrypted file by streaming through gzip, which compresses before encrypting.
    """
    _date_string, org_id = key
    recipients = self.recipients_for_org_id[org_id]
    log.info('Encryption recipients: %s', str(recipients))

    def report_progress(num_bytes):
        """Update hadoop counters as the file is written."""
        self.event_export_counter(counter_title='Bytes Written to Output', incr_value=num_bytes)

    key_file_targets = [
        get_target_from_url(url_path_join(self.gpg_key_dir, recipient))
        for recipient in recipients
    ]
    try:
        with make_encrypted_file(output_file, key_file_targets, progress=report_progress,
                                 hadoop_counter_incr_func=self.event_export_counter) as encrypted_output_file:
            outfile = gzip.GzipFile(mode='wb', fileobj=encrypted_output_file)
            try:
                for value in values:
                    outfile.write(value.strip())
                    outfile.write('\n')
                    # WARNING: This line ensures that Hadoop knows that our process is not sitting
                    # in an infinite loop.  Do not remove it.
                    self.event_export_counter(counter_title='Raw Bytes Written', incr_value=(len(value) + 1))
            finally:
                outfile.close()
    except IOError as err:
        log.error("Error encountered while encrypting and gzipping Organization: %s file: %s Exception: %s",
                  org_id, key_file_targets, err)
        # This counter is set when there is an error during the generation of the encryption file for an
        # organization for any reason, including encryption errors related to an expired GPG key.
        self.event_export_counter(counter_title="{} org with Errors".format(org_id), incr_value=1)
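# A small standalone sketch (standard library only, not from this codebase) of the layering used in
# the reducer above: GzipFile wraps the downstream file object, so plaintext is compressed first and
# only compressed bytes reach the wrapped stream.  The BytesIO buffer below is a hypothetical
# stand-in for the encrypted output file produced by make_encrypted_file.
import gzip
import io

downstream = io.BytesIO()  # stands in for the encrypted output stream
compressor = gzip.GzipFile(mode='wb', fileobj=downstream)
compressor.write(b'example event line\n')
compressor.close()  # closing flushes the gzip trailer into the downstream stream

compressed_bytes = downstream.getvalue()  # these bytes are what would actually be encrypted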
def generate_file_list(self):
    """Yield each individual path given a source folder and a set of file-matching expressions."""
    for src in self.src:
        if src.startswith('s3'):
            # Connect lazily as needed:
            if self.s3_conn is None:
                self.s3_conn = ScalableS3Client().s3
            for _bucket, _root, path in generate_s3_sources(self.s3_conn, src, self.include,
                                                            self.include_zero_length):
                source = url_path_join(src, path)
                yield ExternalURL(source)
        elif src.startswith('hdfs'):
            for source, size in luigi.contrib.hdfs.listdir(src, recursive=True, include_size=True):
                if not self.include_zero_length and size == 0:
                    continue
                elif any(fnmatch.fnmatch(source, include_val) for include_val in self.include):
                    yield ExternalURL(source)
        else:
            # Apply the include patterns to the relative path below the src directory.
            # TODO: implement exclude_zero_length to match S3 case.
            for dirpath, _dirnames, files in os.walk(src):
                for filename in files:
                    filepath = os.path.join(dirpath, filename)
                    relpath = os.path.relpath(filepath, src)
                    if any(fnmatch.fnmatch(relpath, include_val) for include_val in self.include):
                        yield ExternalURL(filepath)
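# A minimal standalone illustration (not part of the task above) of the include-pattern matching
# used for the local-filesystem branch: fnmatch patterns are applied to the path relative to the
# source directory, and '*' in fnmatch also matches '/' characters.  The paths and patterns below
# are hypothetical examples.
import fnmatch


def matches_any(relpath, include_patterns):
    """Return True if the relative path matches any fnmatch-style include pattern."""
    return any(fnmatch.fnmatch(relpath, pattern) for pattern in include_patterns)


# matches_any('tracking.log', ['*.log'])               -> True
# matches_any('2016-09-07/tracking.log', ['*.log'])    -> True ('*' also matches the '/' separator)
# matches_any('tracking.log.gz', ['*.log'])            -> False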
def output(self):
    output_root = url_path_join(
        self.warehouse_path,
        self.partition_task.hive_table_task.table,
        self.partition.path_spec + '/'
    )
    return get_target_from_url(output_root, marker=True)
def partition_location(self): """Provides location of Hive database table's partition data.""" # The actual folder name where the data is stored is expected to be in the format <key>=<value> partition_name = '='.join(self.partition.items()[0]) # Make sure that input path ends with a slash, to indicate a directory. # (This is necessary for S3 paths that are output from Hadoop jobs.) return url_path_join(self.table_location, partition_name + '/')
def upload_public_keys(self):
    gpg_key_dir = os.path.join('gpg-keys')
    for key_filename in os.listdir(gpg_key_dir):
        full_local_path = os.path.join(gpg_key_dir, key_filename)
        remote_url = url_path_join(self.test_gpg_key_dir, key_filename)

        if not key_filename.endswith('.key'):
            self.upload_file(full_local_path, remote_url)
def output_path_for_key(self, key):
    date_string = key
    return url_path_join(
        self.hive_partition_path('video_viewing_by_date', date_string),
        'video_viewing_{date}'.format(date=date_string),
    )
def insert_source_task(self): hive_table = "internal_reporting_user_activity" partition_location = url_path_join(self.warehouse_path, hive_table, self.partition.path_spec) + '/' return ExternalURL(url=partition_location)
def prepare_database(self):
    sql_fixture_base_url = url_path_join(self.data_dir, 'input', 'enterprise')
    for filename in os.listdir(sql_fixture_base_url):
        self.execute_sql_fixture_file(url_path_join(sql_fixture_base_url, filename))
def output(self):
    return get_target_from_url(
        url_path_join(
            self.hive_partition_path('discovery_api_raw', partition_value=self.date),
            'programs.json'
        )
    )
def output(self):
    return get_target_from_url(
        url_path_join(
            self.hive_partition_path('program_course_order', partition_value=self.date),
            '{0}.tsv'.format('program_course_order')
        )
    )
def output(self):
    return get_target_from_url(
        url_path_join(
            self.hive_partition_path(self.table_name, partition_value=self.date),
            '{0}.tsv'.format(self.table_name)
        )
    )
def output_path_for_key(self, key):
    date_string = key
    return url_path_join(
        self.hive_partition_path('user_activity_by_user', date_string),
        'user_activity_{date}'.format(date=date_string),
    )
def output(self):
    return get_target_from_url(
        url_path_join(self.output_root, 'temp/CountProgramCohortEnrollments/')
    )
def output(self):
    output_name = u'answer_distribution_per_course_{name}/'.format(name=self.name)
    return get_target_from_url(url_path_join(self.dest, output_name))
def partition_location(self): """Returns the full URL of the partition. This allows data to be written to the partition by external systems""" return url_path_join(self.hive_table_task.table_location, self.partition.path_spec + '/')
def output(self):  # pragma: no cover
    output_root = url_path_join(
        self.warehouse_path,
        self.partition_task.hive_table_task.table,
        self.partition.path_spec + '/'
    )
    return get_target_from_url(output_root, marker=True)
def output(self): return get_target_from_url( url_path_join( self.destination, "daily_registrations_enrollments_{0}.csv".format(self.name)))
def requires(self):
    yield self.hive_table_task
    yield ExternalURL(
        url=url_path_join(self.warehouse_path, 'course_enrollment_summary', self.partition.path_spec) + '/'
    )
def output(self):
    return get_target_from_url(
        url_path_join(
            self.hive_partition_path('course_catalog_raw', partition_value=self.date),
            'course_catalog.json'
        )
    )
def metadata_output(self):
    """Return target to which metadata about the task execution can be written."""
    return get_target_from_url(url_path_join(self.destination, METADATA_FILENAME))
def output_path_for_key(self, key):
    authoring_institution, program_uuid = key
    filename = u'{}__{}.csv'.format(self.report_name, self.date)
    return url_path_join(self.output_root, authoring_institution, program_uuid, filename)
def table_location(self): """Provides root location of Hive database table's data.""" return url_path_join(self.warehouse_path, self.table) + '/'
def output(self):
    return get_target_from_url(
        url_path_join(self.output_root, 'temp', 'CountCourseEnrollments/')
    )
def partition_location(self): """Provides location of Hive database table's partition data.""" # Make sure that input path ends with a slash, to indicate a directory. # (This is necessary for S3 paths that are output from Hadoop jobs.) return url_path_join(self.table_location, self.partition.path_spec + '/')
def output_path_for_key(self, key):
    org_key, program_uuid = key
    filename = u'{}__{}.csv'.format(self.report_name, self.date)
    return url_path_join(self.output_root, org_key, program_uuid, filename)
def complete(self):
    if self.overwrite and not self.attempted_removal:
        return False
    else:
        return get_target_from_url(url_path_join(self.output_url(), '_SUCCESS')).exists()
def output(self):
    url_with_filename = url_path_join(self.destination, self.filename)
    return get_target_from_url(url_with_filename)
def output_path_for_key(self, key):
    date_string = key
    return url_path_join(
        self.hive_partition_path('last_ip_of_user_id', date_string),
        'last_ip_of_user_{date}'.format(date=date_string),
    )
def output(self):
    output_name = u'seq_open_dist_{name}/'.format(name=self.name)
    return get_target_from_url(url_path_join(self.dest, output_name))
def insert_source_task(self):
    url = url_path_join(self.hive_partition_path('course_seat', self.date), 'course_seat.tsv')
    return ExternalURL(url=url)
def table_location(self):
    return url_path_join(self.destination, self.table_name)
def output(self): return get_target_from_url( url_path_join( self.destination, "incremental_users_and_enrollments_{0}.csv".format(self.name)))
def output(self):
    return get_target_from_url(
        url_path_join(self.output_root, 'event_type_distribution/')
    )
def marker_output(self):
    """Return target for _SUCCESS marker indicating the task was successfully completed."""
    return get_target_from_url(url_path_join(self.destination, "_SUCCESS"))
def complete(self): """ The task is complete if the output_root/_SUCCESS file is present. """ return get_target_from_url(url_path_join(self.output_root, '_SUCCESS')).exists()
def output(self):
    output_name = u'problem_check_events_{name}/'.format(name=self.name)
    return get_target_from_url(url_path_join(self.dest, output_name))