def output(self):
    return get_target_from_url(
        url_path_join(
            self.user_country_output,
            'dt={0}/'.format(self.interval.date_b.strftime('%Y-%m-%d'))  # pylint: disable=no-member
        )
    )
def _get_required_tasks(self):
    """Internal method to actually calculate required tasks once."""
    start_date = self.interval.date_a
    end_date = self.interval.date_b
    table_name = "student_courseenrollment"
    source_root = url_path_join(self.warehouse_path, table_name)
    today_datestring = datetime.datetime.utcnow().strftime('%Y-%m-%d')

    current_date = start_date
    while current_date <= end_date:
        datestring = current_date.strftime('%Y-%m-%d')
        current_date += datetime.timedelta(days=1)

        src_datestring = "dt={}".format(datestring)
        source_dir = url_path_join(source_root, src_datestring)
        target = get_target_from_url(source_dir)
        output_dir = url_path_join(self.output_root, datestring)
        if datestring == today_datestring:
            yield CreateEnrollmentValidationEventsForTodayTask(
                source_dir=source_dir,
                output_root=output_dir,
                n_reduce_tasks=self.n_reduce_tasks,
                credentials=self.credentials,
            )
        elif target.exists():
            yield CreateEnrollmentValidationEventsTask(
                source_dir=source_dir,
                output_root=output_dir,
                n_reduce_tasks=self.n_reduce_tasks,
            )
def output(self):
    return get_target_from_url(
        url_path_join(
            self.output_root,
            'count-user-activity-per-interval-{interval}.tsv/'.format(interval=self.interval),
        )
    )
def validate_obfuscation(self):
    """Validates obfuscation workflow."""
    output_target = PathSetTask([self.test_out], ['*.tar.gz.gpg']).output()[0]
    output_filename = os.path.basename(output_target.path)
    output_filepath = os.path.join(self.temporary_dir, output_filename)

    if output_target.path.startswith('s3://'):
        output_target = get_target_from_url(output_target.path.replace('s3://', 's3+https://'))

    with output_target.open('r') as input_file:
        with open(output_filepath, 'w') as output_file:
            copy_file_to_file(input_file, output_file)

    decrypted_filepath = output_filepath[:-len('.gpg')]
    fs.decrypt_file(output_filepath, decrypted_filepath, 'insecure_secret.key')

    with tarfile.open(decrypted_filepath, 'r:gz') as tfile:
        tfile.extractall(self.temporary_dir)

    # Validate package metadata info.
    metadata_filepath = os.path.join(self.temporary_dir, 'metadata_file.json')
    with open(metadata_filepath) as metadata_file:
        metadata_info = json.load(metadata_file)
    self.assertItemsEqual(metadata_info['format_version'], self.FORMAT_VERSION)
    self.assertItemsEqual(metadata_info['pipeline_version'], self.PIPELINE_VERSION)

    self.validate_data_obfuscation()
    self.validate_events_obfuscation()
def output(self):
    if len(self.input()['data']) == 0:
        raise IOError("Course File '{filename}' not found for course '{course}'".format(
            filename=self.file_pattern, course=self.course
        ))
    output_filename = os.path.basename(self.input()['data'][0].path)
    return get_target_from_url(url_path_join(self.output_directory, output_filename))
def output(self):
    return get_target_from_url(url_path_join(
        self.output_root,
        'transaction',
        'dt=' + self.import_date.isoformat(),  # pylint: disable=no-member
        'transactions.csv'
    ))
def credentials(self):
    """The credentials for connecting to the database, read from a URL."""
    if not hasattr(self, '_credentials'):
        with get_target_from_url(self.vertica_creds_url).open('r') as credentials_file:
            self._credentials = json.load(credentials_file)
    return self._credentials
def output(self): return get_target_from_url( url_path_join( self.destination, "incremental_users_and_enrollments_{0}.csv".format(self.name) ) )
def output(self):
    return get_target_from_url(
        url_path_join(
            self.destination,
            'total_users_and_enrollments_{0}-{1}.csv'.format(self.start_date, self.date)
        )
    )
def multi_output_reducer(self, key, values, output_file):
    """
    Write values to the appropriate file as determined by the key.

    Write to the encrypted file by streaming through gzip, which compresses before encrypting.
    """
    _date_string, org_id = key
    recipients = self.recipients_for_org_id[org_id]
    log.info('Encryption recipients: %s', str(recipients))

    def report_progress(num_bytes):
        """Update hadoop counters as the file is written."""
        self.incr_counter('Event Export', 'Bytes Written to Output', num_bytes)

    key_file_targets = [get_target_from_url(url_path_join(self.gpg_key_dir, recipient)) for recipient in recipients]
    with make_encrypted_file(output_file, key_file_targets, progress=report_progress) as encrypted_output_file:
        outfile = gzip.GzipFile(mode='wb', fileobj=encrypted_output_file)
        try:
            for value in values:
                outfile.write(value.strip())
                outfile.write('\n')
                # WARNING: This line ensures that Hadoop knows that our process
                # is not sitting in an infinite loop.  Do not remove it.
                self.incr_counter('Event Export', 'Raw Bytes Written', len(value) + 1)
        finally:
            outfile.close()
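# Note on the layering above (explanatory comment, not part of the original code): bytes written
# to `outfile` are gzip-compressed first; the compressed stream is then handed to
# `encrypted_output_file`, which make_encrypted_file wraps with GPG encryption for the recipient
# keys before the result reaches `output_file`. Compressing before encrypting is deliberate,
# since encrypted data no longer compresses well.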
def run_job(self, job):
    job.init_hadoop()
    job.init_mapper()
    map_output = StringIO.StringIO()
    input_targets = luigi.task.flatten(job.input_hadoop())
    for input_target in input_targets:
        # If the target is a directory, then assume that it's Hadoop output, and actually loop
        # through its contents.  Entries appended here are picked up later by this same loop.
        if os.path.isdir(input_target.path):
            filenames = os.listdir(input_target.path)
            for filename in filenames:
                url = url_path_join(input_target.path, filename)
                input_targets.append(get_target_from_url(url.strip()))
            continue
        with input_target.open('r') as input_file:
            # S3 files not yet supported since they don't support tell() and seek().
            if input_target.path.endswith('.gz'):
                input_file = gzip.GzipFile(fileobj=input_file)
            elif input_target.path.endswith('.manifest'):
                # Manifest files list additional input URLs, one per line; queue them for processing.
                for url in input_file:
                    input_targets.append(get_target_from_url(url.strip()))
                continue
            os.environ['map_input_file'] = input_target.path
            try:
                outputs = job._map_input((line[:-1] for line in input_file))
                job.internal_writer(outputs, map_output)
            finally:
                del os.environ['map_input_file']
    map_output.seek(0)
    reduce_input = self.group(map_output)
    try:
        reduce_output = job.output().open('w')
    except Exception:
        reduce_output = StringIO.StringIO()
    try:
        job._run_reducer(reduce_input, reduce_output)
    finally:
        try:
            reduce_output.close()
        except Exception:
            pass
def output(self):
    if len(self.input()) == 0:
        raise IOError("Course File '{filename}' not found for course '{course}'".format(
            filename=self.file_pattern, course=self.course
        ))
    # TODO: should we change the filename to indicate that it has been de-identified?
    output_filename = os.path.basename(self.input()[0].path)
    return get_target_from_url(url_path_join(self.output_directory, output_filename))
def reset_external_state(self):
    root_target = get_target_from_url(get_jenkins_safe_url(self.test_root))
    if root_target.exists():
        root_target.remove()
    self.import_db.reset()
    self.export_db.reset()
    self.otto_db.reset()
    self.hive.reset()
    self.vertica.reset()
def requires_hadoop(self):
    # Check first if running locally with Sqoop output.
    target = get_target_from_url(self.source_dir)
    if isinstance(target, luigi.LocalTarget) and os.path.isdir(self.source_dir):
        files = [f for f in os.listdir(self.source_dir) if f.startswith("part")]
        for filename in files:
            yield ExternalURL(url_path_join(self.source_dir, filename))
    else:
        yield ExternalURL(self.source_dir)
def manifest_file_list(self):
    """Write each individual path to a manifest file and yield the path to that file."""
    manifest_target = get_target_from_url(self.manifest)
    if not manifest_target.exists():
        with manifest_target.open('w') as manifest_file:
            for external_url_task in self.generate_file_list():
                manifest_file.write(external_url_task.url + '\n')

    yield ExternalURL(self.manifest)
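# Illustrative sketch (hypothetical paths, not from the original code): the manifest written
# above holds one source URL per line, e.g.
#
#     s3://bucket/input/part-00000
#     s3://bucket/input/part-00001
#
# so a later run can skip regenerating the file list and simply re-read it, roughly like:
#
#     with get_target_from_url(self.manifest).open('r') as manifest_file:
#         urls = [line.strip() for line in manifest_file if line.strip()]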
def when_geolocation_data_available(function):
    config = get_test_config()
    geolocation_data = config.get('geolocation_data')
    geolocation_data_available = bool(geolocation_data)
    if geolocation_data_available:
        geolocation_data_available = get_target_from_url(get_jenkins_safe_url(geolocation_data)).exists()
    return unittest.skipIf(
        not geolocation_data_available,
        'Geolocation data is not available'
    )(function)
def read_config_file(filename):
    """Read a config file from either an external source (S3, HDFS, etc.) or the "share" directory of this repo."""
    if os.path.basename(filename) != filename:
        target = get_target_from_url(filename)
        with target.open('r') as config_file:
            yield config_file
    else:
        file_path = os.path.join(sys.prefix, 'share', 'edx.analytics.tasks', filename)
        with open(file_path, 'r') as config_file:
            yield config_file
def output(self):
    """
    Output is set up so that it can be read as a Hive table with partitions.

    The form is {warehouse_path}/course_structure/dt={CCYY-mm-dd}/courses.tsv.
    """
    date_string = self.run_date.strftime("%Y-%m-%d")  # pylint: disable=no-member
    partition_path_spec = HivePartition("dt", date_string).path_spec
    url_with_filename = url_path_join(self.warehouse_path, "course_structure", partition_path_spec, "courses.tsv")
    return get_target_from_url(url_with_filename)
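# Illustrative sketch (hypothetical values, not from the original code): with
# warehouse_path='s3://bucket/warehouse' and run_date=datetime.date(2016, 1, 1),
# HivePartition("dt", "2016-01-01").path_spec yields 'dt=2016-01-01', so the target URL becomes
#
#     s3://bucket/warehouse/course_structure/dt=2016-01-01/courses.tsv
#
# which Hive can then read as the dt='2016-01-01' partition of the course_structure table.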
def run(self):
    self.remove_output_on_overwrite()
    super(CourseEnrollmentEventsTask, self).run()

    # This makes sure that an output file exists for each date in the interval,
    # as downstream tasks require that they exist.
    for date in self.interval:
        url = self.output_path_for_key(date.isoformat())
        target = get_target_from_url(url)
        if not target.exists():
            target.open("w").close()  # touch the file
def output(self):
    """Output is in the form {output_root}/cybersource/{CCYY-mm}/cybersource_{merchant}_{CCYYmmdd}.csv."""
    month_year_string = self.run_date.strftime('%Y-%m')  # pylint: disable=no-member
    date_string = self.run_date.strftime('%Y%m%d')  # pylint: disable=no-member
    filename = "cybersource_{merchant_id}_{date_string}.{report_format}".format(
        merchant_id=self.merchant_id,
        date_string=date_string,
        report_format=self.REPORT_FORMAT,
    )
    url_with_filename = url_path_join(self.output_root, "cybersource", month_year_string, filename)
    return get_target_from_url(url_with_filename)
def run(self):
    self.remove_output_on_overwrite()
    super(LastDailyIpAddressOfUserTask, self).run()

    # This makes sure that an output file exists for each date in the interval,
    # as downstream tasks require that they exist (as provided by downstream_input_tasks()).
    for date in self.interval:
        url = self.output_path_for_key(date.isoformat())
        target = get_target_from_url(url)
        if not target.exists():
            target.open("w").close()  # touch the file
def output(self):
    """
    Output is set up so it can be read in as a Hive table with partitions.

    The form is {output_root}/payments/dt={CCYY-mm-dd}/cybersource_{merchant}.tsv.
    """
    date_string = self.run_date.strftime('%Y-%m-%d')  # pylint: disable=no-member
    partition_path_spec = HivePartition('dt', date_string).path_spec
    filename = "cybersource_{}.tsv".format(self.merchant_id)
    url_with_filename = url_path_join(self.output_root, "payments", partition_path_spec, filename)
    return get_target_from_url(url_with_filename)
def output(self):
    """
    Output is set up so that it can be read as a Hive table with partitions.

    The form is {warehouse_path}/course_catalog/subjects/dt={CCYY-mm-dd}/subjects.tsv.
    """
    date_string = self.date.strftime('%Y-%m-%d')  # pylint: disable=no-member
    partition_path_spec = HivePartition('dt', date_string).path_spec
    url_with_filename = url_path_join(self.warehouse_path, "course_catalog", "subjects",
                                      partition_path_spec, "subjects.tsv")
    return get_target_from_url(url_with_filename)
def output(self):
    config = configuration.get_config()
    base_url = config.get(CONFIG_SECTION, 'path')
    target = get_target_from_url(url_path_join(base_url, str(hash(self))) + '.manifest')
    lib_jar = config.get(CONFIG_SECTION, 'lib_jar', None)
    if lib_jar:
        target.lib_jar = [lib_jar]
    input_format = config.get(CONFIG_SECTION, 'input_format', None)
    if input_format:
        target.input_format = input_format
    return target
def reducer(self, key, values):
    """Write out values from each key into different output files."""
    output_path = self.output_path_for_key(key)
    if output_path:
        output_file_target = get_target_from_url(output_path)
        with output_file_target.open('w') as output_file:
            self.multi_output_reducer(key, values, output_file)

    # Luigi requires the reducer to return an iterable.
    return iter(tuple())
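# Minimal sketch of how this hook pair is typically used (hypothetical subclass, not part of the
# original code): a concrete task supplies output_path_for_key() and multi_output_reducer(), and
# the reducer() above routes each key's values into the corresponding target.
#
#     class ExampleMultiOutputTask(MultiOutputMapReduceJobTask):
#
#         def output_path_for_key(self, key):
#             # Assumed layout: one TSV file per key under output_root.
#             return url_path_join(self.output_root, '{0}.tsv'.format(key))
#
#         def multi_output_reducer(self, key, values, output_file):
#             for value in values:
#                 output_file.write(value)
#                 output_file.write('\n')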
def reset_external_state(self):
    # The machine running the acceptance test suite may not have hadoop installed on it, so convert
    # S3 paths (which are normally handled by the hadoop DFS client) to S3+https paths, which are
    # handled by the python native S3 client.
    root_target = get_target_from_url(self.test_root.replace('s3://', 's3+https://'))
    if root_target.exists():
        root_target.remove()
    self.import_db.reset()
    self.export_db.reset()
    self.otto_db.reset()
    self.hive.reset()
    self.vertica.reset()
def __init__(self, *args, **kwargs):
    super(MultiOutputMapReduceJobTask, self).__init__(*args, **kwargs)
    if self.delete_output_root:
        # If requested, make sure that the output directory is empty.  This gets rid
        # of any generated data files from a previous run (that might not get
        # regenerated in this run).  It also makes sure that the marker file
        # (i.e. the output target) will be removed, so that external functionality
        # will know that the generation of data files is not complete.
        output_dir_target = get_target_from_url(self.output_root)
        for target in [self.output(), output_dir_target]:
            if target.exists():
                target.remove()
def test_end_to_end_without_vertica(self):
    # Similar to test_end_to_end, but it excludes the Vertica part and checks data values,
    # not just data shape.
    table_name = 'reconciled_order_transactions'
    output_root = url_path_join(
        self.warehouse_path, table_name, 'dt=' + self.UPPER_BOUND_DATE
    ) + '/'
    self.task.launch([
        'ReconcileOrdersAndTransactionsTask',
        '--import-date', self.UPPER_BOUND_DATE,
        '--n-reduce-tasks', str(self.NUM_REDUCERS),
        '--output-root', output_root,
    ])
    final_output_task = LoadInternalReportingOrderTransactionsToWarehouse(
        import_date=luigi.DateParameter().parse(self.UPPER_BOUND_DATE)
    )
    columns = [x[0] for x in final_output_task.columns]

    output_targets = PathSetTask([output_root], ['*']).output()
    raw_output = ""
    for output_target in output_targets:
        output_target = get_target_from_url(get_jenkins_safe_url(output_target.path))
        raw_output += output_target.open('r').read()

    expected_output_csv = os.path.join(self.data_dir, 'output', 'expected_financial_report.csv')
    expected = pandas.read_csv(expected_output_csv, parse_dates=True)

    output = StringIO(raw_output.replace('\t\\N', '\t'))
    data = pandas.read_table(output, header=None, names=columns, parse_dates=True)

    # Re-order the dataframes for a consistent comparison:
    for frame in (data, expected):
        frame.sort(['payment_ref_id', 'transaction_type'], inplace=True, ascending=[True, False])
        frame.reset_index(drop=True, inplace=True)

    try:
        assert_frame_equal(data, expected)
    except AssertionError:
        pandas.set_option('display.max_columns', None)
        print('----- The report generated this data: -----')
        print(data)
        print('----- vs expected: -----')
        print(expected)
        if data.shape != expected.shape:
            print("Data shapes differ.")
        else:
            for index, series in data.iterrows():
                # Try to print a more helpful/localized difference message:
                try:
                    assert_series_equal(data.iloc[index, :], expected.iloc[index, :])
                except AssertionError:
                    print("First differing row: {index}".format(index=index))
                    raise
def multi_output_reducer(self, key, values, output_file):
    """
    Write values to the appropriate file as determined by the key.

    Write to the encrypted file by streaming through gzip, which compresses before encrypting.
    """
    _date_string, org_id = key
    recipients = self._get_recipients(org_id)

    key_file_targets = [get_target_from_url(url_path_join(self.gpg_key_dir, recipient)) for recipient in recipients]
    with make_encrypted_file(output_file, key_file_targets) as encrypted_output_file:
        outfile = gzip.GzipFile(mode='wb', fileobj=encrypted_output_file)
        try:
            for value in values:
                outfile.write(value.strip())
                outfile.write('\n')
        finally:
            outfile.close()
def run(self):
    recipients = set(self.recipient)
    if self.gpg_master_key is not None:
        recipients.add(self.gpg_master_key)
    key_file_targets = [
        get_target_from_url(url_path_join(self.gpg_key_dir, recipient))
        for recipient in recipients
    ]

    path_task = PathSetTask([self.course_files_url], ['*.*'])
    with make_temp_directory(prefix='obfuscate-archive.', dir=self.temporary_dir) as tmp_directory:
        for target in path_task.output():
            with target.open('r') as input_file:
                # Get the path without the urlscheme.
                course_files_path = urlparse.urlparse(self.course_files_url).path
                # Calculate the target's path relative to course_files_path by taking the substring
                # that occurs after the course_files_path substring in the target's path.
                # Needed because target.path includes the urlscheme for an S3 target but not for an
                # HDFS target.
                # Examples:
                #   target.path: /pipeline/output/edX_Demo_Course/events/edX_Demo_Course-events-2015-08-30.log.gz
                #   relative_path: events/edX_Demo_Course-events-2015-08-30.log.gz
                #
                #   target.path: s3://some_bucket/output/edX_Demo_Course/state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                #   relative_path: state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                r_index = target.path.find(course_files_path) + len(course_files_path)
                relative_path = target.path[r_index:].lstrip('/')

                local_file_path = os.path.join(tmp_directory, relative_path)
                try:
                    os.makedirs(os.path.dirname(local_file_path))
                except OSError as exc:
                    if exc.errno != errno.EEXIST:
                        raise
                with open(local_file_path, 'w') as temp_file:
                    copy_file_to_file(input_file, temp_file)

        def report_encrypt_progress(num_bytes):
            """Log encryption progress."""
            log.info('Encrypted %d bytes', num_bytes)

        with self.output().open('w') as output_file:
            with make_encrypted_file(
                output_file, key_file_targets, progress=report_encrypt_progress, dir=self.temporary_dir
            ) as encrypted_output_file:
                with tarfile.open(mode='w:gz', fileobj=encrypted_output_file) as output_archive_file:
                    output_archive_file.add(tmp_directory, arcname='')
def complete(self):
    return get_target_from_url(url_path_join(self.output_root, '_SUCCESS')).exists()
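# Note (general Hadoop convention, not stated in the original code): a MapReduce job that finishes
# successfully writes an empty _SUCCESS marker file into its output directory, so checking for
# '{output_root}/_SUCCESS' treats this task as complete only if the underlying job ran to completion.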
def output(self):
    return get_target_from_url(url_path_join(self.output_root, 'event_type_distribution/'))
def output(self):
    return get_target_from_url(self.output_root)
def output(self):
    return get_target_from_url(self.course_country_output + "/")