def requires(self):
    # The end date is not included in the result, so we have to add a day
    # to the provided date in order to ensure user registration data is
    # gathered for that date.
    end_date = self.date + timedelta(1)

    # In order to compute the cumulative sum of user registrations we need
    # all changes in registrations up to (and including) the provided date.
    registrations = UserRegistrationsPerDay(
        credentials=self.credentials,
        destination=self.destination,
        date_interval=Custom(MINIMUM_DATE, end_date)
    )

    results = {
        'enrollments': CourseEnrollmentChangesPerDay(
            name=self.name,
            src=self.src,
            dest=self.destination,
            include=self.include,
            manifest=self.manifest,
            mapreduce_engine=self.mapreduce_engine,
            lib_jar=self.lib_jar,
            n_reduce_tasks=self.n_reduce_tasks
        ),
        'registrations': registrations,
    }

    if self.offsets:
        results.update({'offsets': ExternalURL(self.offsets)})
    if self.history:
        results.update({'history': ExternalURL(self.history)})
    if self.blacklist:
        results.update({'blacklist': ExternalURL(self.blacklist)})

    return results
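The dict returned by requires() fixes the shape of self.input() in run(): each key names one upstream output. A minimal, self-contained sketch of that contract, assuming ExternalURL behaves like a luigi.ExternalTask over a local path (the stand-in class and the file path below are hypothetical):

import luigi


class ExternalURL(luigi.ExternalTask):
    # Hypothetical stand-in for the pipeline's ExternalURL: it produces no
    # data itself, it only points at a file some other process created.
    url = luigi.Parameter()

    def output(self):
        return luigi.LocalTarget(self.url)


class ShowInputs(luigi.Task):
    # Toy consumer: the keys used in requires() reappear in self.input().
    def requires(self):
        return {'blacklist': ExternalURL(url='/tmp/blacklist.tsv')}

    def run(self):
        with self.input()['blacklist'].open('r') as blacklist_file:
            print(blacklist_file.read())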
def generate_file_list(self):
    """Yield each individual path given a source folder and a set of file-matching expressions."""
    for src in self.src:
        if src.startswith('s3'):
            # Connect lazily, only when an S3 source is actually present.
            if self.s3_conn is None:
                self.s3_conn = boto.connect_s3()
            for _bucket, _root, path in generate_s3_sources(
                    self.s3_conn, src, self.include, self.include_zero_length):
                source = url_path_join(src, path)
                yield ExternalURL(source)
        elif src.startswith('hdfs'):
            for source, size in luigi.hdfs.listdir(src, recursive=True, include_size=True):
                if not self.include_zero_length and size == 0:
                    continue
                elif any(fnmatch.fnmatch(source, include_val) for include_val in self.include):
                    yield ExternalURL(source)
        else:
            # Apply the include patterns to the relative path below the src directory.
            # TODO: honor include_zero_length here to match the S3 and HDFS cases.
            for dirpath, _dirnames, files in os.walk(src):
                for filename in files:
                    filepath = os.path.join(dirpath, filename)
                    relpath = os.path.relpath(filepath, src)
                    if any(fnmatch.fnmatch(relpath, include_val) for include_val in self.include):
                        yield ExternalURL(filepath)
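The include patterns are plain fnmatch globs applied to the path relative to src; a quick illustration (the file names are made up):

import fnmatch

include = ['*.log.gz', 'tracking/*.log']
candidates = ['2014-01-01.log.gz', 'tracking/events.log', 'notes.txt']
matched = [
    path for path in candidates
    if any(fnmatch.fnmatch(path, pattern) for pattern in include)
]
print(matched)  # ['2014-01-01.log.gz', 'tracking/events.log']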
def requires_hadoop(self):
    # Check first if running locally with Sqoop output.
    target = get_target_from_url(self.source_dir)
    if isinstance(target, luigi.LocalTarget) and os.path.isdir(self.source_dir):
        # Sqoop names its output files 'part-m-00000', 'part-r-00000', etc.
        files = [f for f in os.listdir(self.source_dir) if f.startswith("part")]
        for filename in files:
            yield ExternalURL(url_path_join(self.source_dir, filename))
    else:
        yield ExternalURL(self.source_dir)
def requires(self): """Require the external config if we are not using the default one""" reqs = super(XBlockConfigMixin, self).requires() if os.path.basename(self.xblock_obfuscation_config ) != self.xblock_obfuscation_config: reqs['xblock_config'] = ExternalURL(self.xblock_obfuscation_config) return reqs
def requires(self):
    if self.required_tasks is None:
        self.required_tasks = {
            'credentials': ExternalURL(url=self.credentials),
            'insert_source': self.insert_source_task,
        }
    return self.required_tasks
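Caching the dict means every call to requires() hands back the same task instances rather than freshly constructed ones. A toy version of the memoisation pattern (class and key names are made up):

class CachedRequirements(object):
    def __init__(self):
        self.required_tasks = None

    def requires(self):
        if self.required_tasks is None:
            self.required_tasks = {'credentials': object()}
        return self.required_tasks


task = CachedRequirements()
assert task.requires() is task.requires()  # same dict, same instances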
def requires(self):
    return {
        'insert_source': LoadInternalReportingUserActivityToWarehouse(
            n_reduce_tasks=self.n_reduce_tasks,
            date=self.date,
            warehouse_path=self.warehouse_path,
            overwrite=self.overwrite,
            schema=self.schema,
            credentials=self.credentials
        ),
        'credentials': ExternalURL(self.credentials),
    }
def requires_local(self): """Adds geolocation_data as a local requirement.""" result = super(GeolocationMixin, self).requires_local() # Default is an empty list, but assume that any real data added is done # so as a dict. if not result: result = {} result['geolocation_data'] = ExternalURL(self.geolocation_data) return result
def manifest_file_list(self):
    """Write each individual path to a manifest file and yield the path to that file."""
    manifest_target = get_target_from_url(self.manifest)
    if not manifest_target.exists():
        with manifest_target.open('w') as manifest_file:
            for external_url_task in self.generate_file_list():
                manifest_file.write(external_url_task.url + '\n')
    yield ExternalURL(self.manifest)
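The yielded ExternalURL points at the manifest itself; downstream code is then expected to read one input URL per line. A hypothetical sketch of that consuming side (the helper name is made up):

def read_manifest(manifest_target):
    # Each non-empty line of the manifest is one input URL.
    with manifest_target.open('r') as manifest_file:
        return [line.strip() for line in manifest_file if line.strip()]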
def requires(self):
    results = {
        'source': CourseEnrollmentChangesPerDay(
            name=self.name,
            src=self.src,
            dest=self.destination,
            include=self.include,
            manifest=self.manifest,
            mapreduce_engine=self.mapreduce_engine,
            lib_jar=self.lib_jar,
            n_reduce_tasks=self.n_reduce_tasks
        )
    }
    if self.offsets:
        results.update({'offsets': ExternalURL(self.offsets)})
    if self.statuses:
        results.update({'statuses': ExternalURL(self.statuses)})
    return results
def generate_file_list(self):
    """Yield each individual path given a source folder and a set of file-matching expressions."""
    for src in self.src:
        if src.startswith('s3'):
            # Connect lazily, only when an S3 source is actually present.
            if self.s3_conn is None:
                self.s3_conn = boto.connect_s3()
            for _bucket, _root, path in generate_s3_sources(self.s3_conn, src, self.include):
                source = url_path_join(src, path)
                yield ExternalURL(source)
        else:
            # Apply the include patterns to the relative path below the src directory.
            for dirpath, _dirnames, files in os.walk(src):
                for filename in files:
                    filepath = os.path.join(dirpath, filename)
                    relpath = os.path.relpath(filepath, src)
                    if any(fnmatch.fnmatch(relpath, include_val) for include_val in self.include):
                        yield ExternalURL(filepath)
def requires(self):
    return {
        'source': LoadWarehouseTask(
            date=self.date,
            schema=self.schema,
            credentials=self.credentials,
            marker_schema=self.marker_schema,
            overwrite=self.overwrite,
            n_reduce_tasks=self.n_reduce_tasks
        ),
        'credentials': ExternalURL(self.credentials),
    }
def get_mysql_query_results(credentials, database, query):
    """Executes a mysql query on the provided database and returns the results."""
    credentials_target = ExternalURL(url=credentials).output()
    cred = None
    with credentials_target.open('r') as credentials_file:
        cred = json.load(credentials_file)

    connection = mysql.connector.connect(
        user=cred.get('username'),
        password=cred.get('password'),
        host=cred.get('host'),
        port=cred.get('port'),
        database=database
    )

    try:
        cursor = connection.cursor()
        cursor.execute(query)
        results = cursor.fetchall()
    finally:
        connection.close()

    return results
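A hypothetical call (the credentials URL, database, and query are made up); the credentials file is expected to be JSON with username, password, host, and port keys:

rows = get_mysql_query_results(
    credentials='s3://some-bucket/mysql-creds.json',
    database='reports',
    query='SELECT course_id, COUNT(*) FROM course_enrollment GROUP BY course_id',
)
for course_id, count in rows:
    print(course_id, count)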
def requires(self):
    results = {
        'events': ProblemCheckEvent(
            mapreduce_engine=self.mapreduce_engine,
            input_format=self.base_input_format,
            lib_jar=self.lib_jar,
            n_reduce_tasks=self.n_reduce_tasks,
            name=self.name,
            src=self.src,
            dest=self.dest,
            include=self.include,
            manifest=self.manifest,
        ),
    }
    if self.answer_metadata:
        results.update({'answer_metadata': ExternalURL(self.answer_metadata)})
    return results
def requires(self):
    # The end date is not included in the result, so add a day to make sure
    # registration data is gathered for the provided date as well.
    end_date = self.date + timedelta(1)
    results = {
        'enrollments': CourseEnrollmentChangesPerDay(
            name=self.name,
            src=self.src,
            dest=self.destination,
            include=self.include,
            manifest=self.manifest,
            mapreduce_engine=self.mapreduce_engine,
            lib_jar=self.lib_jar,
            n_reduce_tasks=self.n_reduce_tasks
        ),
        'registrations': UserRegistrationsPerDay(
            credentials=self.credentials,
            destination=self.destination,
            date_interval=Custom(MINIMUM_DATE, end_date)
        ),
    }
    if self.blacklist:
        results.update({'blacklist': ExternalURL(self.blacklist)})
    return results
def requires(self):
    return {'credentials': ExternalURL(self.credentials)}
def insert_source_task(self): hive_table = "course_enrollment" partition_location = url_path_join(self.warehouse_path, hive_table, self.partition.path_spec) + '/' return ExternalURL(url=partition_location)
def requires_local(self):
    return ExternalURL(self.geolocation_data)
def insert_source_task(self):
    return ExternalURL(url=self.insert_source)
def insert_source_task(self): hive_table = "internal_reporting_d_country" partition_location = url_path_join(self.warehouse_path, hive_table, self.partition.path_spec) + '/' return ExternalURL(url=partition_location)
def requires(self):
    return ExternalURL(url_path_join(self.course_files_url, 'metadata_file.json'))
def requires(self):
    yield ExternalURL(
        url=url_path_join(self.warehouse_path, 'course_enrollment', self.partition.path_spec) + '/'
    )
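When requires() yields tasks instead of returning a dict, luigi collects their outputs into a list, so the single target here is self.input()[0]. A toy consumer, reusing the hypothetical ExternalURL stand-in from the first sketch above (the path is made up):

class PartitionInput(luigi.Task):
    def requires(self):
        yield ExternalURL(url='/tmp/warehouse/course_enrollment/dt=2014-06-01/')

    def run(self):
        # Generator-style requires() surfaces as a list in self.input().
        print(self.input()[0].path)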
def requires(self):
    yield ExternalURL(
        url=url_path_join(self.warehouse_path, 'last_country_of_user', self.partition_spec()) + '/'
    )
def insert_source_task(self):
    return ExternalURL(url=self.hive_partition_path('course_enrollment_summary', self.date))
def requires(self):
    results = {
        'events': PathSetTask(self.src, self.include, self.manifest),
        'geoloc_data': ExternalURL(self.geolocation_data),
    }
    return results
def requires_local(self):
    results = super(ObfuscateCourseEventsTask, self).requires_local()
    if os.path.basename(self.explicit_event_whitelist) != self.explicit_event_whitelist:
        results['explicit_events'] = ExternalURL(url=self.explicit_event_whitelist)
    return results
def data_task(self):
    # The parent's data_task is accessed as an attribute (not called), and its
    # output path is normalised to exactly one trailing slash.
    url = super(ExternalLastCountryOfUserToHiveTask, self).data_task.output().path
    return ExternalURL(url.rstrip('/') + '/')
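The rstrip-and-append idiom collapses any number of trailing slashes down to exactly one:

for url in ['s3://bucket/data', 's3://bucket/data/', 's3://bucket/data//']:
    assert url.rstrip('/') + '/' == 's3://bucket/data/'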
def input_hadoop(self):
    # The requires() method will return the Hive target, but really we need
    # the file dump instead.
    yield ExternalURL(self.source_dir).output()
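Instantiating a task purely to call .output() is cheap bookkeeping: construction just builds the target description, and nothing is scheduled or run. A self-contained toy demonstration (the class and path are made up):

import luigi


class Dump(luigi.ExternalTask):
    def output(self):
        return luigi.LocalTarget('/tmp/dump/')


print(Dump().output().path)  # /tmp/dump/ -- no task has executed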
def requires_local(self):
    return ExternalURL(url=self.config)
def requires(self):
    return ExternalURL(self.counts)
def requires_local(self):
    results = {}
    if os.path.basename(self.explicit_event_whitelist) != self.explicit_event_whitelist:
        results['explicit_events'] = ExternalURL(url=self.explicit_event_whitelist)
    results['geolocation_data'] = ExternalURL(self.geolocation_data)
    return results