Ejemplo n.º 1
0
class TableTask(luigi.Task):
    """Luigi task that creates a log table and optionally updates its schema.

    Requires the ``DatabaseTask`` for ``database_name`` and produces a
    ``TableTarget`` built from the table parameters.
    """

    # NOTE: the declaration order of the parameters below is kept as-is.
    config = get_config()
    database_name = luigi.Parameter()
    table_name = luigi.Parameter()
    # ``action`` is declared but not referenced anywhere inside this class.
    action = luigi.Parameter(default='create')
    schema = luigi.Parameter(default=None, significant=False)
    empty = luigi.BooleanParameter(default=False, significant=False)

    def requires(self):
        """Return the ``DatabaseTask`` for this task's database."""
        database_task = DatabaseTask(self.database_name)
        return database_task

    def output(self):
        """Return the ``TableTarget`` describing the table this task makes."""
        positional = (self.database_name, self.table_name, self.schema)
        return TableTarget(*positional, empty=self.empty)

    def run(self):
        """Create the log table, then push the schema when one was given."""
        td = self.config.get_client()
        logger.debug('%s: creating table: %s.%s', self, self.database_name,
                     self.table_name)
        td.create_log_table(self.database_name, self.table_name)

        # Nothing more to do when no schema was supplied.
        if self.schema is None:
            return

        logger.debug('%s: updating schema for %s.%s', self,
                     self.database_name, self.table_name)
        # Each schema entry is split on ':' before being handed to the client
        # (presumably "name:type" pairs -- confirm against the client API).
        columns = [entry.split(':') for entry in self.schema]
        td.update_schema(self.database_name, self.table_name, columns)
Ejemplo n.º 2
0
class DatabaseTask(luigi.Task):
    """Luigi task that creates a database through the configured client."""

    config = get_config()
    database_name = luigi.Parameter()
    # ``action`` is declared but not referenced anywhere inside this class.
    action = luigi.Parameter(default='create')

    def output(self):
        """Return the ``DatabaseTarget`` for ``database_name``."""
        target = DatabaseTarget(self.database_name)
        return target

    def run(self):
        """Obtain a client, log the request and create the database."""
        td = self.config.get_client()
        name = self.database_name
        logger.debug('%s: creating database: %s', self, name)
        td.create_database(name)
Ejemplo n.º 3
0
class Query(luigi.Task):
    """Luigi task that submits a query through the configured client.

    The query text comes from the template named by ``source`` (or by a
    ``query_file`` attribute, when one exists) or, failing that, from the
    ``query()`` method, which subclasses are expected to override.
    """

    config = get_config()
    debug = False
    # Seconds to wait for the job before raising ``Timeout``; falsy = no limit.
    timeout = None
    priority = None
    retry_limit = None
    type = 'hive'
    database = None
    # Template name passed to ``load_query``; falsy means "use query()".
    source = None
    # Extra template variables. This dict lives on the class, so it is shared
    # by every instance that does not override it -- treat it as read-only.
    variables = {}

    def query(self):
        """Return the query text; must be overridden when ``source`` is unset.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        # Returning the ``NotImplemented`` sentinel here would let ``run``
        # hand that object to ``client.query`` as if it were query text.
        raise NotImplementedError(
            '{0} must define `source` or override `query()`'.format(
                type(self).__name__))

    def load_query(self, source):
        """Render the Jinja2 template ``source`` and return the result.

        The template is looked up with a ``PackageLoader`` rooted at this
        task's module, and rendered with ``task=self`` plus ``variables``.
        """
        env = jinja2.Environment(
            loader=jinja2.PackageLoader(self.__module__, '.'))
        template = env.get_template(source)
        return template.render(task=self, **self.variables)

    def run_query(self, query):
        """Submit ``query``, wait for the job to finish and wrap the result.

        Returns:
            A ``ResultProxy`` wrapping the finished job.

        Raises:
            Timeout: if ``timeout`` is set and the job outlives it.
            RuntimeError: if the job finishes without success.
        """
        result = self.output()
        result_url = None
        if isinstance(result, ResultTarget):
            result_url = result.get_result_url()
        client = self.config.get_client()
        job = client.query(self.database,
                           query,
                           priority=self.priority,
                           retry_limit=self.retry_limit,
                           type=self.type,
                           result_url=result_url)
        job.update()
        logger.info("%s: td.job.url: %s", self, job.url)

        # Wait for the result, polling every two seconds.
        deadline = time.time() + self.timeout if self.timeout else None
        try:
            while not job.finished():
                if deadline and time.time() > deadline:
                    raise Timeout(
                        '{0} (Job ID {1}) timed out after {2} seconds'.format(
                            self, job.job_id, self.timeout))
                time.sleep(2)
        except BaseException:
            # Deliberately catches everything (KeyboardInterrupt included) so
            # the job is killed before the exception propagates.
            job.kill()
            raise
        job.update()

        logger.info("%s: td.job.result: job_id=%s status=%s", self, job.job_id,
                    job.status())

        if not job.success():
            stderr = job.debug['stderr']
            if stderr:
                logger.error(stderr)
            raise RuntimeError("job {0} {1}\n\nOutput:\n{2}".format(
                job.job_id, job.status(), job.debug['cmdout']))

        return ResultProxy(job)

    def run(self):
        """Resolve the query text, run it and save state on a ResultTarget."""
        # A ``query_file`` attribute, when present, replaces ``source``.
        if hasattr(self, 'query_file'):
            self.source = self.query_file
        query = self.load_query(self.source) if self.source else self.query()
        result = self.run_query(query)
        target = self.output()
        if target and isinstance(target, ResultTarget):
            target.save_result_state(result)
Ejemplo n.º 4
0
 def __init__(self, path, result_url=None, config=None):
     """Store the path, the optional result URL and the configuration.

     A falsy ``config`` is replaced by the value of ``get_config()``.
     """
     self.path, self.result_url = path, result_url
     self.config = config if config else get_config()
Ejemplo n.º 5
0
 def __init__(self, database_name, table_name, schema=None, empty=False, config=None):
     """Store the table identity, schema, ``empty`` flag and configuration.

     A falsy ``schema`` becomes a fresh empty list, and a falsy ``config``
     is replaced by the value of ``get_config()``.
     """
     self.database_name, self.table_name = database_name, table_name
     self.schema = schema if schema else []
     self.empty = empty
     self.config = config if config else get_config()
Ejemplo n.º 6
0
 def __init__(self, database_name, config=None):
     """Store the database name and the configuration.

     A falsy ``config`` is replaced by the value of ``get_config()``.
     """
     self.database_name = database_name
     self.config = config if config else get_config()