Beispiel #1
0
    def run_job(self, job):
        """
        Run a Hadoop-style job entirely in-process (local test runner).

        Feeds every input target through the job's mapper into an in-memory
        buffer, then groups the buffer and runs the reducer over it.
        Directory targets and ``.manifest`` targets are expanded into
        additional input targets on the fly.

        Arguments:
            job: A luigi Hadoop job task providing init_hadoop/init_mapper,
                input_hadoop, _map_input, internal_writer, _run_reducer and
                output.
        """
        job.init_hadoop()
        job.init_mapper()
        # All mapper output is collected in memory (Python 2 StringIO).
        map_output = StringIO.StringIO()
        input_targets = luigi.task.flatten(job.input_hadoop())
        for input_target in input_targets:
            # if file is a directory, then assume that it's Hadoop output,
            # and actually loop through its contents:
            if os.path.isdir(input_target.path):
                filenames = os.listdir(input_target.path)
                for filename in filenames:
                    url = url_path_join(input_target.path, filename)
                    # NOTE: appending while iterating is deliberate — the new
                    # targets are visited by later iterations of this loop.
                    input_targets.append(get_target_from_url(url.strip()))
                continue

            with input_target.open('r') as input_file:

                # S3 files not yet supported since they don't support tell() and seek()
                if input_target.path.endswith('.gz'):
                    # Wrap, don't reopen: decompress the already-open stream.
                    input_file = gzip.GzipFile(fileobj=input_file)
                elif input_target.path.endswith('.manifest'):
                    # A manifest is a list of URLs, one per line; queue each
                    # as a fresh input target instead of mapping it directly.
                    for url in input_file:
                        input_targets.append(get_target_from_url(url.strip()))
                    continue

                # Mimic Hadoop streaming, which exposes the current input
                # file to the mapper via this environment variable.
                os.environ['map_input_file'] = input_target.path
                try:
                    # line[:-1] strips the trailing newline from each record.
                    outputs = job._map_input(
                        (line[:-1] for line in input_file))
                    job.internal_writer(outputs, map_output)
                finally:
                    # Always unset, even if the mapper raises.
                    del os.environ['map_input_file']

        # Rewind so the grouping/reduce phase reads from the beginning.
        map_output.seek(0)

        reduce_input = self.group(map_output)
        try:
            reduce_output = job.output().open('w')
        except Exception:
            # Best-effort fallback: if the output target can't be opened,
            # reduce into a throwaway in-memory buffer instead of failing.
            reduce_output = StringIO.StringIO()

        try:
            job._run_reducer(reduce_input, reduce_output)
        finally:
            # close() commits the output for file-like targets; swallow
            # errors so reducer exceptions aren't masked by close failures.
            try:
                reduce_output.close()
            except Exception:
                pass
Beispiel #2
0
    def hive_partition_path(self,
                            table_name,
                            partition_value,
                            partition_key='dt'):
        """
        Build the full warehouse URL of the folder holding one partition.

        Arguments:
            table_name (str): The name of the hive table.
            partition_value (object): Usually a string naming the partition.
                Anything exposing an ``isoformat`` method (e.g. a `date`) is
                serialized to an ISO8601 string first — a common use case.
            partition_key (str): The partition key, usually "dt".
        """
        serialize = getattr(partition_value, 'isoformat', None)
        value = serialize() if serialize is not None else partition_value
        spec = HivePartition(partition_key, value).path_spec
        return url_path_join(self.warehouse_path, table_name, spec) + '/'
Beispiel #3
0
 def complete(self):
     """
     The task is complete if the output_root/_SUCCESS file is present.
     """
     success_url = url_path_join(self.output_root, '_SUCCESS')
     return get_target_from_url(success_url).exists()
Beispiel #4
0
 def output(self):
     """Target pointing at course_list.json inside this task's partition folder."""
     partition_dir = self.hive_partition_path(
         'course_list_raw', partition_value=self.partition_value)
     file_url = url_path_join(partition_dir, 'course_list.json')
     return get_target_from_url(file_url)
Beispiel #5
0
 def output(self):
     """Marker target whose name is derived from this task's hash."""
     return get_target_from_url(url_path_join(self.marker, str(hash(self))))
Beispiel #6
0
def get_manifest_file_path(manifest_id):
    """
    Return the URL of the manifest file for the given manifest_id.

    The base path is read from the 'path' option of the configured
    manifest section.
    """
    config = configuration.get_config()
    base_url = config.get(CONFIG_SECTION, 'path')
    return url_path_join(base_url, manifest_id + '.manifest')
Beispiel #7
0
 def output(self):  # pragma: no cover
     """Directory-style marker target for this partition under the warehouse."""
     root = url_path_join(
         self.warehouse_path,
         self.partition_task.hive_table_task.table,
         self.partition.path_spec + '/',
     )
     return get_target_from_url(root, marker=True)
Beispiel #8
0
 def partition_location(self):
     """Returns the full URL of the partition. This allows data to be written to the partition by external systems"""
     table_root = self.hive_table_task.table_location
     return url_path_join(table_root, self.partition.path_spec + '/')
Beispiel #9
0
 def table_location(self):
     """Provides root location of Hive database table's data."""
     location = url_path_join(self.warehouse_path, self.table)
     return location + '/'
Beispiel #10
0
 def partition_location(self):
     """Provides location of Hive database table's partition data."""
     # The trailing slash marks the path as a directory — required for
     # S3 paths that come out of Hadoop jobs.
     partition_spec = self.partition.path_spec + '/'
     return url_path_join(self.table_location, partition_spec)