Example #1
class SqoopImportFromMysql(SqoopImportTask):
    """
    An abstract task that uses Sqoop to read data out of a database and write it to a file in CSV format.

    By default, the output format is defined by the --mysql-delimiters option, which applies the defaults used by
    the mysqldump tool:

    * fields delimited by comma
    * lines delimited by \n
    * delimiters escaped by backslash
    * delimiters optionally enclosed by single quotes (')

    Parameters:
        direct: use mysqldump's "direct" mode.  Requires that no set of columns be selected. Defaults to True.
        mysql-delimiters:  use standard mysql delimiters (on by default).
    """
    mysql_delimiters = luigi.BooleanParameter(default=True)
    direct = luigi.BooleanParameter(default=True, significant=False)

    def connection_url(self, cred):
        """Construct connection URL from provided credentials."""
        return 'jdbc:mysql://{host}/{database}'.format(host=cred['host'], database=self.database)

    def import_args(self):
        """Returns list of arguments specific to Sqoop import from a Mysql database."""
        arglist = super(SqoopImportFromMysql, self).import_args()
        if self.direct:
            arglist.append('--direct')
        if self.mysql_delimiters:
            arglist.append('--mysql-delimiters')
        return arglist
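
A minimal usage sketch (hypothetical values; the credentials/destination/table_name parameters come from the base SqoopImportTask, as exercised by StudentModulePerCourseAfterImportWorkflow further below):

import luigi

# Run a one-off import with the local scheduler.  With the defaults above,
# '--direct' and '--mysql-delimiters' are appended to the Sqoop arguments.
luigi.build([
    SqoopImportFromMysql(
        credentials='/edx/etc/database-credentials.json',
        destination='s3://my-bucket/dumps/courseware_studentmodule',
        table_name='courseware_studentmodule',
    )
], local_scheduler=True)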
class CellDatasetBandChunkTask(workflow.Task):

    __metaclass__ = abc.ABCMeta

    x = luigi.IntParameter()
    y = luigi.IntParameter()

    acq_min = luigi.DateParameter()
    acq_max = luigi.DateParameter()

    satellites = luigi.Parameter(is_list=True)

    output_directory = luigi.Parameter()

    csv = luigi.BooleanParameter()
    dummy = luigi.BooleanParameter()

    mask_pqa_apply = luigi.BooleanParameter()
    mask_pqa_mask = luigi.Parameter()

    dataset_type = luigi.Parameter()
    band = luigi.Parameter()

    x_offset = luigi.IntParameter()
    y_offset = luigi.IntParameter()

    chunk_size_x = luigi.IntParameter()
    chunk_size_y = luigi.IntParameter()
class VectorizerTask(luigi.Task):
    cached_result = luigi.Parameter()

    use_idf = luigi.BooleanParameter(default=True)
    sublinear_tf = luigi.BooleanParameter(default=True)

    dim_red = luigi.Parameter(default='svd')
    dim_red_N = luigi.IntParameter(default=1000)

    def run(self):
        with open(self.input().path, 'rb') as f:
            mlp_isv = pickle.load(f)

        vectors = ivs.vectorize(mlp_isv,
                                use_idf=self.use_idf,
                                sublinear_tf=self.sublinear_tf)
        vectors = dimred(vectors, algo=self.dim_red, N=self.dim_red_N)

        with open(self.output().path, 'wb') as f:
            pickle.dump(vectors, f)

    def requires(self):
        return IdentifierVsmRepresentationTask()

    def output(self):
        return LocalTarget(self.cached_result)
Example #4
class CourseEnrollmentValidationDownstreamMixin(
        EventLogSelectionDownstreamMixin, MapReduceJobTaskMixin):
    """
    Defines parameters for passing upstream to tasks that use CourseEnrollmentValidationTask.

    """
    # location to write output
    output_root = luigi.Parameter(
        description='A URL to a path where output event files will be written.',
    )

    # Flag indicating whether to output synthetic events or tuples
    tuple_output = luigi.BooleanParameter(
        default=False,
        description=
        'A flag indicating that output should be in the form of tuples, not events. '
        'Default is False (output is events).',
    )

    # If set, generates events that occur before the start of the specified interval.
    # Default is incremental validation.
    generate_before = luigi.BooleanParameter(
        default=False,
        description=
        'A flag indicating that events should be created preceding the '
        'specified interval. Default behavior is to suppress the generation of events '
        'before the specified interval.',
    )

    # If set, events are included for transitions that don't result in a
    # change in enrollment state.  (For example, two activations in a row.)
    include_nonstate_changes = luigi.BooleanParameter(
        default=False,
        description='A flag indicating that events should be created '
        'for all transitions, even those that don\'t result in a change in enrollment '
        'state.  An "activate" following another "activate" is one such example. '
        'Default behavior is to skip generating events for non-state changes.',
    )

    # If set, events that would be generated before this timestamp would instead
    # be assigned this timestamp.
    earliest_timestamp = luigi.DateHourParameter(
        default=None,
        description='A "DateHour" parameter ("yyyy-mm-ddThh"), which if set, '
        'specifies the earliest timestamp that should occur in the output.  Events '
        'that would be generated before this timestamp would instead be assigned this '
        'timestamp.  This is left unspecified by default.',
    )

    # If set, users with events before this timestamp would be expected to have
    # a corresponding validation event.
    expected_validation = luigi.DateHourParameter(
        default=None,
        description='A "DateHour" parameter ("yyyy-mm-ddThh"), which if set, '
        'specifies a point in time where every user with events before this time '
        'should also have a corresponding validation event.  Those without such a '
        'validation event were not really created, and events should be synthesized '
        'to simulate "roll back" of the events.',
    )
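
As a quick illustration of the "yyyy-mm-ddThh" format that these DateHour parameters accept (a standalone sketch, not part of the mixin):

import luigi

# DateHourParameter parses '%Y-%m-%dT%H' strings into datetime objects.
print(luigi.DateHourParameter().parse('2014-04-01T10'))
# -> datetime.datetime(2014, 4, 1, 10, 0)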
Example #5
class InsertToMysqlVideoTimelineTask(VideoTableDownstreamMixin,
                                     MysqlTableTask):
    """Insert information about video timelines from a Hive table into MySQL."""

    overwrite = luigi.BooleanParameter(
        default=True,
        description=
        'Overwrite the table when writing to it by default. Allow users to override this behavior if they '
        'want.',
        significant=False)
    allow_empty_insert = luigi.BooleanParameter(
        default=True,
        description=
        'Allow the video table to be empty (e.g. if no video activity has occurred)',
        config_path={
            'section': 'videos',
            'name': 'allow_empty_insert'
        },
        significant=False,
    )

    @property
    def table(self):  # pragma: no cover
        return 'video_timeline'

    @property
    def columns(self):  # pragma: no cover
        return VideoTimelineRecord.get_sql_schema()

    @property
    def insert_query(self):
        return """
            SELECT
                pipeline_video_id,
                segment,
                num_users,
                num_views
            FROM video_usage
        """

    @property
    def indexes(self):  # pragma: no cover
        return [
            ('pipeline_video_id', ),
        ]

    def requires(self):
        for req in super(InsertToMysqlVideoTimelineTask, self).requires():
            yield req
        # the process that generates the source table used by this query
        yield (VideoTimelineDataTask(
            source=self.source,
            interval=self.interval,
            pattern=self.pattern,
            overwrite_n_days=self.overwrite_n_days,
        ))
class InsertToMysqlCourseActivityTask(WeeklyIntervalMixin,
                                      UserActivityDownstreamMixin,
                                      MysqlInsertTask):
    """
    Creates/populates the `course_activity` Result store table.
    """

    overwrite_hive = luigi.BooleanParameter(
        default=False,
        description=
        'Overwrite the hive data used as source for this task. Users should set this to True '
        'when using a persistent Hive metastore.',
        significant=False)

    overwrite_mysql = luigi.BooleanParameter(
        default=False,
        description=
        'Overwrite the table if set to True. Allow users to override this behavior if they '
        'want.',
        significant=False)

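    # Shadow the inherited 'overwrite' parameter so it is no longer registered
    # as a luigi Parameter; __init__ below rebinds it to overwrite_mysql, while
    # overwrite_hive is passed through to the Hive source task instead.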
    overwrite = None

    def __init__(self, *args, **kwargs):
        super(InsertToMysqlCourseActivityTask, self).__init__(*args, **kwargs)
        self.overwrite = self.overwrite_mysql

    @property
    def table(self):
        return "course_activity"

    @property
    def columns(self):
        return [
            ('course_id', 'VARCHAR(255) NOT NULL'),
            ('interval_start', 'DATETIME NOT NULL'),
            ('interval_end', 'DATETIME NOT NULL'),
            ('label', 'VARCHAR(255) NOT NULL'),
            ('count', 'INT(11) NOT NULL'),
        ]

    @property
    def indexes(self):
        return [('course_id', 'label'), ('interval_end', )]

    @property
    def insert_source_task(self):
        return CourseActivityPartitionTask(
            warehouse_path=self.warehouse_path,
            end_date=self.end_date,
            weeks=self.weeks,
            n_reduce_tasks=self.n_reduce_tasks,
            overwrite=self.overwrite_hive,
            overwrite_n_days=self.overwrite_n_days,
        )
Example #7
class WarehouseWorkflowMixin(WarehouseMixin):
    """
    Parameters for running warehouse workflow.
    """

    date = luigi.DateParameter()

    n_reduce_tasks = luigi.Parameter()

    # We are not using VerticaCopyTaskMixin as OverwriteOutputMixin changes the complete() method behavior.
    schema = luigi.Parameter(
        config_path={
            'section': 'vertica-export',
            'name': 'schema'
        },
        description='The schema to which to write.',
    )
    credentials = luigi.Parameter(
        config_path={
            'section': 'vertica-export',
            'name': 'credentials'
        },
        description='Path to the external access credentials file.',
    )

    overwrite = luigi.BooleanParameter(default=False, significant=False)

    # We rename the schema after warehouse loading step. This causes
    # Luigi to think that tasks are not complete as it cannot find the
    # marker table entries. Using a different schema for the marker table
    # solves this issue.
    marker_schema = luigi.Parameter()
Example #8
class AnswerDistributionToMySQLTaskWorkflow(
    InsertToMysqlAnswerDistributionTableBase,
    AnswerDistributionDownstreamMixin,
    MapReduceJobTaskMixin
):

    # Override the parameter that normally defaults to false. This ensures that the table will always be overwritten.
    overwrite = luigi.BooleanParameter(default=True)

    @property
    def insert_source_task(self):
        """
        Write to the answer_distribution table from AnswerDistributionPerCourse.
        """
        return AnswerDistributionPerCourse(
            mapreduce_engine=self.mapreduce_engine,
            lib_jar=self.lib_jar,
            base_input_format=self.base_input_format,
            n_reduce_tasks=self.n_reduce_tasks,
            src=self.src,
            dest=self.dest,
            include=self.include,
            name=self.name,
            answer_metadata=self.answer_metadata,
            manifest=self.manifest,
        )
Example #9
class InputText(luigi.Task):
    ''' Generate random 'text'
    '''
    date = luigi.DateParameter()
    hdfs = luigi.BooleanParameter(default=False)

    def output(self):
        if self.hdfs:
            return luigi.hdfs.HdfsTarget(self.date.strftime('/tmp/text/%Y-%m-%d.txt'))
        else:
            return luigi.LocalTarget(self.date.strftime('/var/tmp/text/%Y-%m-%d.txt'))

    def run(self):
        f = self.output().open('w')

        def random_word():
            return ''.join([random.choice(string.ascii_letters) for i in xrange(random.randrange(1, 3))])

        def random_line():
            return [random_word() for i in xrange(random.randrange(10, 20))]

        for line in xrange(random.randrange(0, 1000)):
            f.write(' '.join(random_line()) + '\n')

        f.close()
Example #10
class TableTask(luigi.Task):
    config = get_config()
    database_name = luigi.Parameter()
    table_name = luigi.Parameter()
    action = luigi.Parameter(default='create')
    schema = luigi.Parameter(default=None, significant=False)
    empty = luigi.BooleanParameter(default=False, significant=False)

    def requires(self):
        return DatabaseTask(self.database_name)

    def output(self):
        return TableTarget(self.database_name,
                           self.table_name,
                           self.schema,
                           empty=self.empty)

    def run(self):
        client = self.config.get_client()
        logger.debug('%s: creating table: %s.%s', self, self.database_name,
                     self.table_name)
        client.create_log_table(self.database_name, self.table_name)
        if self.schema is not None:
            logger.debug('%s: updating schema for %s.%s', self,
                         self.database_name, self.table_name)
            client.update_schema(self.database_name, self.table_name,
                                 [s.split(':') for s in self.schema])
Example #11
class InsertToMysqlVideoTask(VideoTableDownstreamMixin, MysqlInsertTask):
    """Insert summary information into the video table in MySQL."""

    overwrite = luigi.BooleanParameter(
        default=True,
        description='Overwrite the table when writing to it by default. Allow users to override this behavior if they '
                    'want.',
        significant=False
    )

    @property
    def table(self):  # pragma: no cover
        return 'video'

    @property
    def insert_source_task(self):  # pragma: no cover
        return VideoDataTask(
            mapreduce_engine=self.mapreduce_engine,
            n_reduce_tasks=self.n_reduce_tasks,
            source=self.source,
            interval=self.interval,
            pattern=self.pattern,
            warehouse_path=self.warehouse_path,
        )

    @property
    def columns(self):  # pragma: no cover
        return VideoSegmentSummaryRecord.get_sql_schema()

    @property
    def indexes(self):  # pragma: no cover
        return [
            ('course_id', 'encoded_module_id'),
        ]
Example #12
class Top10Artists(luigi.Task):
    date_interval = luigi.DateIntervalParameter()
    use_hadoop = luigi.BooleanParameter()

    def requires(self):
        if self.use_hadoop:
            return AggregateArtistsHadoop(self.date_interval)
        else:
            return AggregateArtists(self.date_interval)

    def output(self):
        return luigi.LocalTarget("data/top_artists_%s.tsv" %
                                 self.date_interval)

    def run(self):
        top_10 = nlargest(10, self._input_iterator())
        with self.output().open('w') as out_file:
            for streams, artist in top_10:
                print >> out_file, self.date_interval.date_a, self.date_interval.date_b, artist, streams

    def _input_iterator(self):
        with self.input().open('r') as in_file:
            for line in in_file:
                artist, streams = line.strip().split()
                yield int(streams), int(artist)
class BuildFinancialReportsTask(MapReduceJobTaskMixin, VerticaCopyTaskMixin,
                                luigi.WrapperTask):
    """Provide entry-point for generating finance reports."""

    # Instead of importing all of DatabaseImportMixin at this level, we just define
    # what we need and are willing to pass through.  That way the use of "credentials"
    # for the output of the report data does not conflict.
    import_date = luigi.DateParameter()

    # Redefine the overwrite parameter to change its default to True.
    # This will cause the reports to reload when loading into internal reporting.
    overwrite = luigi.BooleanParameter(default=True)

    def requires(self):
        yield (
            TransactionReportTask(
                import_date=self.import_date,
                n_reduce_tasks=self.n_reduce_tasks,
            ),
            LoadInternalReportingOrderTransactionsToWarehouse(
                import_date=self.import_date,
                n_reduce_tasks=self.n_reduce_tasks,
                schema=self.schema,
                credentials=self.credentials,
                overwrite=self.overwrite,
            ),
            LoadInternalReportingEdServicesReportToWarehouse(
                import_date=self.import_date,
                n_reduce_tasks=self.n_reduce_tasks,
                schema=self.schema,
                credentials=self.credentials,
                overwrite=self.overwrite,
            ),
        )
Example #14
class InsertToMysqlVideoTimelineTask(VideoTableDownstreamMixin, MysqlInsertTask):
    """Insert information about video timelines from a Hive table into MySQL."""

    overwrite = luigi.BooleanParameter(
        default=True,
        description='Overwrite the table when writing to it by default. Allow users to override this behavior if they '
                    'want.',
        significant=False
    )

    @property
    def table(self):  # pragma: no cover
        return 'video_timeline'

    @property
    def insert_source_task(self):  # pragma: no cover
        return VideoTimelineDataTask(
            mapreduce_engine=self.mapreduce_engine,
            n_reduce_tasks=self.n_reduce_tasks,
            source=self.source,
            interval=self.interval,
            pattern=self.pattern,
            warehouse_path=self.warehouse_path,
        )

    @property
    def columns(self):  # pragma: no cover
        return VideoTimelineRecord.get_sql_schema()

    @property
    def indexes(self):  # pragma: no cover
        return [
            ('pipeline_video_id',),
        ]
class StudentModulePerCourseAfterImportWorkflow(StudentModulePerCourseTask):
    """
    Generates a raw SQL dump of the courseware_studentmodule table
    and splits it into a separate TSV file for each course.

    Parameters:
        dump_root: a URL location of the database dump.
        output_root: a URL location where the split files will be stored.
        output_suffix: added to the filenames for identification.
        delete_output_root: if True, recursively deletes the output_root at task creation.
        credentials: Path to the external access credentials file.
        num_mappers: The number of map tasks to ask Sqoop to use.
        where:  A 'where' clause to be passed to Sqoop.
        verbose: Sqoop prints more information while working.

    """
    credentials = luigi.Parameter(default_from_config={
        'section': 'database-import',
        'name': 'credentials'
    })
    num_mappers = luigi.Parameter(default=None,
                                  significant=False)  # TODO: move to config
    where = luigi.Parameter(default=None)
    verbose = luigi.BooleanParameter(default=False, significant=False)

    def requires(self):
        table_name = 'courseware_studentmodule'
        return SqoopImportFromMysql(credentials=self.credentials,
                                    destination=url_path_join(
                                        self.dump_root, table_name),
                                    table_name=table_name,
                                    num_mappers=self.num_mappers,
                                    where=self.where,
                                    verbose=self.verbose)
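
A hypothetical invocation (dump_root and output_root are defined by the parent StudentModulePerCourseTask; all values here are placeholders):

task = StudentModulePerCourseAfterImportWorkflow(
    dump_root='s3://my-bucket/dump/',
    output_root='s3://my-bucket/output/',
    where='id<50',
)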
class CellDatasetBandTask(workflow.Task):

    __metaclass__ = abc.ABCMeta

    x = luigi.IntParameter()
    y = luigi.IntParameter()

    acq_min = luigi.DateParameter()
    acq_max = luigi.DateParameter()

    satellites = luigi.Parameter(is_list=True)

    output_directory = luigi.Parameter()

    csv = luigi.BooleanParameter()
    dummy = luigi.BooleanParameter()

    mask_pqa_apply = luigi.BooleanParameter()
    mask_pqa_mask = luigi.Parameter()

    dataset_type = luigi.Parameter()
    band = luigi.Parameter()

    chunk_size_x = luigi.IntParameter()
    chunk_size_y = luigi.IntParameter()

    def requires(self):

        return [
            self.create_cell_dataset_band_chunk_task(x_offset, y_offset)
            for x_offset, y_offset in self.get_chunks()
        ]

    def get_chunks(self):

        import itertools

        for x_offset, y_offset in itertools.product(
                range(0, 4000, self.chunk_size_x),
                range(0, 4000, self.chunk_size_y)):
            yield x_offset, y_offset

    @abc.abstractmethod
    def create_cell_dataset_band_chunk_task(self, x_offset, y_offset):

        raise Exception("Abstract method should be overridden")
Example #17
class MultipleInputText(luigi.Task):
    date_interval = luigi.DateIntervalParameter()
    hdfs = luigi.BooleanParameter(default=False)

    def requires(self):
        return [
            InputText(date, self.hdfs) for date in self.date_interval.dates()
        ]
class ImportMysqlToVerticaTask(MysqlToVerticaTaskMixin, luigi.WrapperTask):
    """Provides entry point for importing a mysql database into Vertica."""

    schema = luigi.Parameter(
        config_path={
            'section': 'vertica-export',
            'name': 'schema'
        },
        description='The schema to which to write.',
    )
    credentials = luigi.Parameter(
        config_path={
            'section': 'vertica-export',
            'name': 'credentials'
        },
        description='Path to the external access credentials file.',
    )
    date = luigi.DateParameter(default=datetime.datetime.utcnow().date())
    overwrite = luigi.BooleanParameter(
        default=False,
        significant=False,
    )

    exclude = luigi.Parameter(
        is_list=True,
        default=(),
    )

    def __init__(self, *args, **kwargs):
        super(ImportMysqlToVerticaTask, self).__init__(*args, **kwargs)
        self.table_list = []

    def should_exclude_table(self, table_name):
        """Determines whether to exlude a table during the import."""
        if any(re.match(pattern, table_name) for pattern in self.exclude):
            return True
        return False

    def requires(self):
        if not self.table_list:
            results = get_mysql_query_results(self.db_credentials,
                                              self.database, 'show tables')
            self.table_list = [result[0].strip() for result in results]

        for table_name in self.table_list:
            if not self.should_exclude_table(table_name):
                yield LoadMysqlToVerticaTableTask(
                    credentials=self.credentials,
                    schema=self.schema,
                    db_credentials=self.db_credentials,
                    database=self.database,
                    warehouse_path=self.warehouse_path,
                    table_name=table_name,
                    overwrite=self.overwrite,
                    date=self.date,
                )
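
For example, passing exclude=('auth_.*', 'django_.*') (patterns illustrative) would skip Django bookkeeping tables while importing everything else:

import re

exclude = ('auth_.*', 'django_.*')
print(any(re.match(pattern, 'auth_user') for pattern in exclude))                  # True
print(any(re.match(pattern, 'courseware_studentmodule') for pattern in exclude))   # False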
Example #19
class HasGlobalParam(luigi.Task):
    x = luigi.Parameter()
    global_param = luigi.IntParameter(is_global=True, default=123)  # global parameters need default values
    global_bool_param = luigi.BooleanParameter(is_global=True, default=False)

    def run(self):
        self.complete = lambda: True

    def complete(self):
        return False
Example #20
class DummyTask(luigi.Task):

    param = luigi.Parameter()
    bool_param = luigi.BooleanParameter()
    int_param = luigi.IntParameter()
    float_param = luigi.FloatParameter()
    date_param = luigi.DateParameter()
    datehour_param = luigi.DateHourParameter()
    timedelta_param = luigi.TimeDeltaParameter()
    list_param = luigi.Parameter(is_list=True)
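
A quick sketch (values arbitrary) of how luigi coerces command-line strings for a few of these parameter types:

import luigi

print(luigi.DateParameter().parse('2014-04-01'))   # datetime.date(2014, 4, 1)
print(luigi.IntParameter().parse('123'))           # 123
print(luigi.BooleanParameter().parse('true'))      # True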
class CourseSummaryEnrollmentDownstreamMixin(
        CourseEnrollmentDownstreamMixin,
        LoadInternalReportingCourseCatalogMixin):
    """Combines course enrollment and catalog parameters."""

    enable_course_catalog = luigi.BooleanParameter(
        config_path={
            'section': 'course-summary-enrollment',
            'name': 'enable_course_catalog'
        },
        default=False,
        description="Enables course catalog data jobs.")
class TagsDistributionWorkflow(TagsDistributionDownstreamMixin,
                               EventLogSelectionDownstreamMixin,
                               MapReduceJobTaskMixin, MysqlInsertTask,
                               luigi.WrapperTask):
    """
    This task calculates total and correct submissions for each unique pair: problem id + connected tag.
    It makes sense to use this task only if:

    - xblock asides are enabled in your CMS and LMS settings (this can be done through the Django admin panel)
    - you have already populated the MySQL tables `tagging_tagcategories` and `tagging_tagavailablevalues` with the
      necessary tag values
    - you have already tagged some questions in the CMS with existing tags

    Once the LMS is properly configured, the assigned tags will be included in the emitted event each time
    a learner answers a tagged question. This task can then calculate aggregates based on these tagged events.

    The results of this task will be stored in the `tags_distribution` table in the resultstore database.
    When enabled in Insights, this table is used to display counts for a given set of tags.
    When the tags are used to label learning outcomes for individual problems, Insights can provide a sense of how
    learners in the LMS perform on different learning outcomes.
    """

    # Override the parameter that normally defaults to false. This ensures that the table will always be overwritten.
    overwrite = luigi.BooleanParameter(
        default=True,
        description="Whether or not to overwrite existing outputs",
        significant=False)

    @property
    def insert_source_task(self):
        """
        Write to tags_distribution table.
        """
        return TagsDistributionPerCourse(n_reduce_tasks=self.n_reduce_tasks,
                                         output_root=self.output_root,
                                         interval=self.interval,
                                         source=self.source)

    @property
    def table(self):
        return "tags_distribution"

    @property
    def columns(self):
        return TagsDistributionRecord.get_sql_schema()

    @property
    def indexes(self):
        return [
            ('course_id', ),
            ('module_id', ),
            ('course_id', 'module_id'),
        ]
class BadReqTask(luigi.Task):
    succeed = luigi.BooleanParameter()

    def requires(self):
        assert self.succeed
        yield BadReqTask(False)

    def run(self):
        pass

    def complete(self):
        return False
Example #24
class VariantsLoading(luigi.Task):
    """
    Run the OpenCGA variant loading tool, whose options are (* are mandatory):

      -b, --backend
          Storage to save files into: mongo (default) or hbase (pending)
    * -c, --credentials
          Path to the file where the backend credentials are stored
      --include-effect
          Save variant effect information (optional). Default: false
      --include-samples
          Save samples information (optional). Default: false
      --include-stats
          Save statistics information (optional). Default: false
    * -i, --input
          Prefix of files to save in the selected backend
    """

    # TODO Possible FileParameter or PathParameter class?
    file = luigi.Parameter(description='Input VCF file to process and load')
    vcf_dir = luigi.Parameter(
        description='Folder for storage of EVA VCF files')
    version = luigi.Parameter(
        description='EVA version where the file is released')
    json_dir = luigi.Parameter(
        description='Folder for storage of EVA JSON files')

    aggregated = luigi.BooleanParameter(default=False)

    def requires(self):
        return VariantsTransformation(self.file, self.version, self.vcf_dir,
                                      self.json_dir, self.aggregated)

    def run(self):
        # Get the input file's root name (strip .gz, then .json, then the original extension)
        (root_name, extension) = os.path.splitext(
            os.path.splitext(os.path.splitext(self.input()[0].fn)[0])[0])
        print 'Root name = ' + root_name

        # TODO --include-effect when VEP is ready
        config = configuration.get_opencga_config('pipeline_config.conf')
        command = '{opencga-root}/bin/opencga.sh load-variants -i {input} -b mongo ' \
                  '-c /home/cyenyxe/appl/opencga/mongo.properties --include-samples --include-stats'
        kwargs = {'opencga-root': config['root_folder'], 'input': root_name}

        # Launch tool
        shellout_no_stdout(command, **kwargs)

        print "Variants loaded"
Example #25
class SqoopImportMixin(object):
    """Mixin to expose useful parameters when importing from a database using Sqoop.

    In order to protect the database access credentials they are
    loaded from an external file which can be secured appropriately.
    The credentials file is expected to be JSON formatted and contain
    a simple map specifying the host, port, username, password and
    database.

    Example Credentials File::

        {
            "host": "db.example.com",
            "port": "3306",
            "username": "******",
            "password": "******"
        }
    """
    destination = luigi.Parameter(
        config_path={
            'section': 'database-import',
            'name': 'destination'
        },
        description='The directory to write the output files to.',
    )
    credentials = luigi.Parameter(
        config_path={
            'section': 'database-import',
            'name': 'credentials'
        },
        description='Path to the external access credentials file.',
    )
    database = luigi.Parameter(config_path={
        'section': 'database-import',
        'name': 'database'
    })
    num_mappers = luigi.Parameter(
        default=None,
        significant=False,
        description='The number of map tasks to ask Sqoop to use.',
    )
    verbose = luigi.BooleanParameter(
        default=False,
        significant=False,
        description='Print more information while working.',
    )
    where = luigi.Parameter(
        default=None,
        description='A "where" clause to be passed to Sqoop.  Note that '
        'no spaces should be embedded and special characters should '
        'be escaped.  For example:  --where "id\<50". ',
    )
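
A hedged sketch of how such a credentials file feeds the JDBC connection URL (mirroring connection_url in SqoopImportFromMysql above; the path and database name are placeholders):

import json

with open('/edx/etc/database-credentials.json') as cred_file:
    cred = json.load(cred_file)

url = 'jdbc:mysql://{host}/{database}'.format(host=cred['host'], database='edxapp')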
class DatabaseImportMixin(object):
    """
    Provides general parameters needed for accessing RDBMS databases.

    Example Credentials File::

        {
            "host": "db.example.com",
            "port": "3306",
            "username": "******",
            "password": "******"
        }
    """
    destination = luigi.Parameter(
        config_path={
            'section': 'database-import',
            'name': 'destination'
        },
        description='The directory to write the output files to.')
    credentials = luigi.Parameter(
        config_path={
            'section': 'database-import',
            'name': 'credentials'
        },
        description='Path to the external access credentials file.',
    )
    database = luigi.Parameter(default_from_config={
        'section': 'database-import',
        'name': 'database'
    })
    import_date = luigi.DateParameter(
        default=None,
        description=
        'Date to assign to Hive partition.  Default is today\'s date, UTC.',
    )
    num_mappers = luigi.Parameter(
        default=None,
        significant=False,
        description='The number of map tasks to ask Sqoop to use.',
    )
    verbose = luigi.BooleanParameter(
        default=False,
        significant=False,
        description='Print more information while working.',
    )

    def __init__(self, *args, **kwargs):
        super(DatabaseImportMixin, self).__init__(*args, **kwargs)

        if not self.import_date:
            self.import_date = datetime.datetime.utcnow().date()
class OverwriteOutputMixin(object):
    """
    Provides support to allow a workflow to force execution of a task.

    For tasks that may generate the most current version of a data import,
    we may want the import to be run whenever asked, rather than on a
    schedule with labelled outputs.

    Assumes that the same task object is accessed later, and can
    hold the state.  This assumption may be flawed.  (If the task
    is recreated from its arguments, the new task object doesn't
    contain the state that the task did when it was executed.)

    Note that this should be included in a task definition *before*
    the Task base class, so that the complete() method is overridden.
    """
    overwrite = luigi.BooleanParameter(
        default=False,
        description=
        'Whether or not to overwrite existing outputs; set to False by default for now.',
        significant=False)
    attempted_removal = False

    def complete(self):
        """
        Wrap Task.complete() to check for overwrite flag.
        """
        # Force complete() to return False any time before the job is
        # actually run, but defer the removal of output until the job is
        # actually run.  This is better than performing the removal
        # at task construction time, since side effects at task
        # definition are less intuitive than having all side effects
        # occur only during execution.
        if self.overwrite and not self.attempted_removal:
            return False
        else:
            return super(OverwriteOutputMixin, self).complete()

    def remove_output_on_overwrite(self):
        """
        Remove output only if it exists and needs to be removed.

        This is a default implementation.  It can be overridden by
        classes that need to do something else to remove output.
        """
        if self.overwrite:
            self.attempted_removal = True
            if self.output().exists():
                log.info("Removing existing output for task %s", str(self))
                self.output().remove()
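
A minimal sketch of the ordering requirement noted in the docstring: the mixin must precede the Task base class so that its complete() is the one invoked, and run() is responsible for calling remove_output_on_overwrite() (the task below is hypothetical):

import luigi

class RebuildableReport(OverwriteOutputMixin, luigi.Task):

    def output(self):
        return luigi.LocalTarget('/tmp/report.tsv')

    def run(self):
        # Defer removal of stale output to execution time, as described above.
        self.remove_output_on_overwrite()
        with self.output().open('w') as out_file:
            out_file.write('report contents\n')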
Example #28
class TileTask(workflow.Task):

    __metaclass__ = abc.ABCMeta

    tile = workflow.ComplexParameter()

    x = luigi.IntParameter()
    y = luigi.IntParameter()

    acq_min = luigi.DateParameter()
    acq_max = luigi.DateParameter()

    satellites = luigi.Parameter(is_list=True)

    output_directory = luigi.Parameter()

    csv = luigi.BooleanParameter()
    dummy = luigi.BooleanParameter()

    mask_pqa_apply = luigi.BooleanParameter()
    mask_pqa_mask = luigi.Parameter()

    mask_wofs_apply = luigi.BooleanParameter()
    mask_wofs_mask = luigi.Parameter()
Example #29
class ArtistToplistToDatabase(luigi.postgres.CopyToTable):
    date_interval = luigi.DateIntervalParameter()
    use_hadoop = luigi.BooleanParameter()

    host = "localhost"
    database = "toplists"
    user = "******"
    password = "******"  # ;)
    table = "top10"

    columns = [("date_from", "DATE"), ("date_to", "DATE"), ("artist", "TEXT"),
               ("streams", "INT")]

    def requires(self):
        return Top10Artists(self.date_interval, self.use_hadoop)
Example #30
class ProblemResponseReportWorkflow(ProblemResponseTableMixin,
                                    luigi.WrapperTask):
    """
    Workflow task that generates the problem response reports from the hive table.
    """
    output_root = luigi.Parameter(
        config_path={
            'section': 'problem-response',
            'name': 'report_output_root'
        },
        description='Location where the report files will be stored.',
    )
    marker = luigi.Parameter(
        significant=False,
        description=
        'URL directory where a marker file will be written on task completion.'
        ' Note that the report task will not run if this marker file exists.',
    )
    overwrite = luigi.BooleanParameter(
        default=False,
        description=
        'Set to True to force rebuild hive data and reports from tracking logs.'
    )

    def requires(self):
        """
        Initialize the problem response report task
        """
        yield ProblemResponseReportTask(
            # ProblemResponseTableMixin
            date=self.date,
            partition_format=self.partition_format,
            interval=self.interval,
            interval_start=self.interval_start,
            interval_end=self.interval_end,
            mapreduce_engine=self.mapreduce_engine,
            input_format=self.input_format,
            lib_jar=self.lib_jar,
            n_reduce_tasks=self.n_reduce_tasks,
            remote_log_level=self.remote_log_level,

            # OverwriteMixin
            overwrite=self.overwrite,

            # MultiOutputMapReduceJobTask
            output_root=self.output_root,
            marker=self.marker,
        )