def run_job(self, job):
    job.init_hadoop()
    job.init_mapper()
    map_output = StringIO.StringIO()
    input_targets = luigi.task.flatten(job.input_hadoop())
    for input_target in input_targets:
        # if file is a directory, then assume that it's Hadoop output,
        # and actually loop through its contents:
        if os.path.isdir(input_target.path):
            filenames = os.listdir(input_target.path)
            for filename in filenames:
                url = url_path_join(input_target.path, filename)
                input_targets.append(get_target_from_url(url.strip()))
            continue
        with input_target.open('r') as input_file:
            # S3 files not yet supported since they don't support tell() and seek()
            if input_target.path.endswith('.gz'):
                input_file = gzip.GzipFile(fileobj=input_file)
            elif input_target.path.endswith('.manifest'):
                for url in input_file:
                    input_targets.append(get_target_from_url(url.strip()))
                continue
            os.environ['map_input_file'] = input_target.path
            try:
                outputs = job._map_input((line[:-1] for line in input_file))
                job.internal_writer(outputs, map_output)
            finally:
                del os.environ['map_input_file']

    map_output.seek(0)
    reduce_input = self.group(map_output)
    try:
        reduce_output = job.output().open('w')
    except Exception:
        reduce_output = StringIO.StringIO()
    try:
        job._run_reducer(reduce_input, reduce_output)
    finally:
        try:
            reduce_output.close()
        except Exception:
            pass
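# A minimal standalone sketch (not part of the runner above; filenames are hypothetical)
# of why appending to input_targets while iterating works: a Python for-loop over a list
# picks up elements appended during iteration, so directory and .manifest entries can
# expand into additional targets that are visited on later passes of the same loop.
targets = ['inputs.manifest', 'part-00000.gz']
for target in targets:
    if target.endswith('.manifest'):
        # pretend these paths were read out of the manifest file
        targets.extend(['part-00001.gz', 'part-00002.gz'])
        continue
    print('would map', target)
# would map part-00000.gz
# would map part-00001.gz
# would map part-00002.gz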
def hive_partition_path(self, table_name, partition_value, partition_key='dt'):
    """
    Given a table name and a partition value, return the full URL of the folder for that
    partition in the warehouse.

    Arguments:
        table_name (str): The name of the hive table.
        partition_value (object): Usually a string specifying the partition in the table.
            If it is a `date` object, it will be serialized to an ISO8601-formatted date
            string. This is a common use case.
        partition_key (str): The partition key. This is usually "dt".
    """
    if hasattr(partition_value, 'isoformat'):
        partition_value = partition_value.isoformat()
    partition = HivePartition(partition_key, partition_value)
    return url_path_join(self.warehouse_path, table_name, partition.path_spec) + '/'
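# A minimal usage sketch with hypothetical values, assuming warehouse_path is
# 's3://bucket/warehouse/' and that HivePartition renders its path_spec as 'dt=<value>':
#     >>> self.hive_partition_path('course_list', datetime.date(2016, 1, 15))
#     's3://bucket/warehouse/course_list/dt=2016-01-15/'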
def complete(self):
    """
    The task is complete if the output_root/_SUCCESS file is present.
    """
    return get_target_from_url(url_path_join(self.output_root, '_SUCCESS')).exists()
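# Hadoop writes an empty _SUCCESS file into an output directory when a job finishes
# cleanly, so checking for that marker (rather than for the directory itself) avoids
# treating a partially written output as complete. A sketch with a hypothetical root:
#     >>> get_target_from_url(url_path_join('s3://bucket/output/', '_SUCCESS')).exists()
#     False  # until the job writing s3://bucket/output/ has finished successfully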
def output(self):
    return get_target_from_url(
        url_path_join(
            self.hive_partition_path('course_list_raw', partition_value=self.partition_value),
            'course_list.json'
        )
    )
def output(self):
    marker_url = url_path_join(self.marker, str(hash(self)))
    return get_target_from_url(marker_url)
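# A minimal sketch (hypothetical values) of how the marker URL is formed: the marker
# directory plus the task's hash yields a per-task target whose existence records
# that this particular task has already run.
#     >>> url_path_join('s3://bucket/markers/', str(hash(task)))
#     's3://bucket/markers/1387036245'  # hash value is illustrative only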
def get_manifest_file_path(manifest_id):
    # Construct the manifest file URL from the manifest_id and the configuration.
    base_url = configuration.get_config().get(CONFIG_SECTION, 'path')
    manifest_file_path = url_path_join(base_url, manifest_id + '.manifest')
    return manifest_file_path
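# A minimal usage sketch, assuming the configuration section's 'path' entry is
# 's3://bucket/manifests/' (both values are hypothetical):
#     >>> get_manifest_file_path('course-list-2016-01-15')
#     's3://bucket/manifests/course-list-2016-01-15.manifest'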
def output(self):  # pragma: no cover
    output_root = url_path_join(
        self.warehouse_path,
        self.partition_task.hive_table_task.table,
        self.partition.path_spec + '/'
    )
    return get_target_from_url(output_root, marker=True)
def partition_location(self):
    """
    Returns the full URL of the partition. This allows data to be written to the
    partition by external systems.
    """
    return url_path_join(self.hive_table_task.table_location, self.partition.path_spec + '/')
def table_location(self):
    """Provides root location of Hive database table's data."""
    return url_path_join(self.warehouse_path, self.table) + '/'
def partition_location(self):
    """Provides location of Hive database table's partition data."""
    # Make sure that input path ends with a slash, to indicate a directory.
    # (This is necessary for S3 paths that are output from Hadoop jobs.)
    return url_path_join(self.table_location, self.partition.path_spec + '/')
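# A small sketch (hypothetical paths) of the resulting URL; the trailing slash marks the
# partition as a directory prefix rather than a bare key, which matters when S3 listings
# would otherwise match sibling keys that merely share the 'dt=2016-01-15' prefix:
#     >>> url_path_join('s3://bucket/warehouse/course_enrollment/', 'dt=2016-01-15' + '/')
#     's3://bucket/warehouse/course_enrollment/dt=2016-01-15/'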