def run_job(self, job):
    """
    Run a Hadoop map/reduce job entirely in the local process.

    Feeds every input target through the job's mapper, collects the
    intermediate key/value output in memory, groups it, and then runs
    the reducer over the grouped stream.

    Arguments:
        job: a luigi Hadoop job task providing init_hadoop(), init_mapper(),
            input_hadoop(), _map_input(), internal_writer(), output() and
            _run_reducer().
    """
    job.init_hadoop()
    job.init_mapper()
    # All mapper output is buffered in memory before the reduce phase.
    map_output = StringIO.StringIO()
    input_targets = luigi.task.flatten(job.input_hadoop())
    # NOTE: input_targets is intentionally extended while being iterated;
    # entries appended below (directory contents, manifest URLs) are picked
    # up by later iterations of this same loop.
    for input_target in input_targets:
        # if file is a directory, then assume that it's Hadoop output,
        # and actually loop through its contents:
        if os.path.isdir(input_target.path):
            filenames = os.listdir(input_target.path)
            for filename in filenames:
                url = url_path_join(input_target.path, filename)
                input_targets.append(get_target_from_url(url.strip()))
            # Directory itself produces no lines; its files are processed later.
            continue
        with input_target.open('r') as input_file:
            # S3 files not yet supported since they don't support tell() and seek()
            if input_target.path.endswith('.gz'):
                # Transparently decompress gzipped inputs.
                input_file = gzip.GzipFile(fileobj=input_file)
            elif input_target.path.endswith('.manifest'):
                # A manifest is a list of URLs, one per line; queue each
                # as an additional input instead of mapping the file itself.
                for url in input_file:
                    input_targets.append(get_target_from_url(url.strip()))
                continue
            # Mimic Hadoop streaming, which exposes the current input path
            # to the mapper via this environment variable.
            os.environ['map_input_file'] = input_target.path
            try:
                # Strip the trailing newline from each line before mapping.
                outputs = job._map_input((line[:-1] for line in input_file))
                job.internal_writer(outputs, map_output)
            finally:
                # Always clear the env var, even if the mapper raised.
                del os.environ['map_input_file']
    # Rewind the in-memory mapper output so it can be re-read for grouping.
    map_output.seek(0)
    reduce_input = self.group(map_output)
    try:
        reduce_output = job.output().open('w')
    except Exception:
        # Fall back to an in-memory sink when the job's output target
        # cannot be opened (e.g. the job produces no file output).
        reduce_output = StringIO.StringIO()
    try:
        job._run_reducer(reduce_input, reduce_output)
    finally:
        # Best-effort close; in-memory buffers and some targets may not
        # need (or support) closing.
        try:
            reduce_output.close()
        except Exception:
            pass
def manifest_file_list(self):
    """Write each individual path to a manifest file and yield the path to that file."""
    target = get_target_from_url(self.manifest)
    # Only materialize the manifest once; subsequent runs reuse the existing file.
    if not target.exists():
        with target.open('w') as output_file:
            for url_task in self.generate_file_list():
                output_file.write(url_task.url + '\n')
    yield ExternalURL(self.manifest)
def remove_manifest_target_if_exists(manifest_id):
    """Given an id and configuration, construct a target that can check and remove a manifest file."""
    path = get_manifest_file_path(manifest_id)
    # we don't need the mixin in order to check for existence or to remove the manifest file.
    target = get_target_from_url(path)
    if not target.exists():
        return
    log.info('Removing existing manifest found at %s', target.path)
    target.remove()
def __init__(self, *args, **kwargs):
    super(MultiOutputMapReduceJobTask, self).__init__(*args, **kwargs)
    if not self.delete_output_root:
        return
    # When deletion is requested, wipe both the marker (the output target)
    # and the data directory.  This discards any files left over from a
    # previous run that this run might not regenerate, and removing the
    # marker signals to external consumers that generation is incomplete
    # until the task finishes again.
    marker_target = self.output()
    data_dir_target = get_target_from_url(self.output_root)
    for existing in (marker_target, data_dir_target):
        if existing.exists():
            existing.remove()
def reducer(self, key, values):
    """
    Write out values from each key into different output files.
    """
    output_path = self.output_path_for_key(key)
    if not output_path:
        # No destination for this key; emit nothing.
        # Luigi requires the reducer to return an iterable
        return iter(tuple())
    log.info('Writing output file: %s', output_path)
    target = get_target_from_url(output_path)
    with target.open('w') as output_file:
        self.multi_output_reducer(key, values, output_file)
    # Luigi requires the reducer to return an iterable
    return iter(tuple())
def complete(self):
    """
    The task is complete if the output_root/_SUCCESS file is present.
    """
    success_marker_url = url_path_join(self.output_root, '_SUCCESS')
    return get_target_from_url(success_marker_url).exists()
def output(self):
    """Expose the data location target as the output."""
    data_location = self.output_root
    return get_target_from_url(data_location)
def output(self):
    """Target for the raw course-list JSON file inside its hive partition."""
    partition_path = self.hive_partition_path(
        'course_list_raw', partition_value=self.partition_value
    )
    return get_target_from_url(url_path_join(partition_path, 'course_list.json'))
def output(self):
    """Marker target whose name is derived from this task's hash."""
    task_hash = str(hash(self))
    marker_url = url_path_join(self.marker, task_hash)
    return get_target_from_url(marker_url)
def output(self):
    """Return the target at the configured output root."""
    destination = self.output_root
    return get_target_from_url(destination)
def output(self):  # pragma: no cover
    """Marker-checked target for the partition directory under the warehouse path."""
    table_name = self.partition_task.hive_table_task.table
    partition_dir = self.partition.path_spec + '/'
    output_root = url_path_join(self.warehouse_path, table_name, partition_dir)
    return get_target_from_url(output_root, marker=True)
def output(self):
    """Target for the partition location, normalized to end with exactly one slash."""
    normalized_location = self.partition_location.rstrip('/') + '/'
    return get_target_from_url(normalized_location)