コード例 #1
0
class CourseEnrollmentTableDownstreamMixin(WarehouseMixin, EventLogSelectionDownstreamMixin, MapReduceJobTaskMixin):
    """
    All parameters needed to run the CourseEnrollmentTableTask task.

    `interval` is redeclared here with a default of None so that it may be omitted.
    When it is falsy after the normal parameter handling, `__init__` replaces it with
    an interval built from `interval_start` and `interval_end`.
    """

    # Make the interval be optional:
    interval = luigi.DateIntervalParameter(
        default=None,
        description='The range of dates to export logs for. '
        'If not specified, `interval_start` and `interval_end` are used to construct the `interval`.',
    )

    # Define optional parameters, to be used if 'interval' is not defined.
    # `interval_start` has no code-level default; it is looked up under
    # [enrollments] interval_start in the configuration when not passed explicitly.
    interval_start = luigi.DateParameter(
        config_path={'section': 'enrollments', 'name': 'interval_start'},
        significant=False,
        description='The start date to export logs for.  Ignored if `interval` is provided.',
    )
    # NOTE(review): the default below is evaluated once, when this class body is
    # executed, so a long-running process keeps the date from that moment.
    interval_end = luigi.DateParameter(
        default=datetime.datetime.utcnow().date(),
        significant=False,
        description='The end date to export logs for.  Ignored if `interval` is provided. '
        'Default is today, UTC.',
    )

    def __init__(self, *args, **kwargs):
        """Run the parent initialisation, then derive `interval` if none was supplied."""
        super(CourseEnrollmentTableDownstreamMixin, self).__init__(*args, **kwargs)

        # No explicit interval: build one from the start/end date parameters.
        if not self.interval:
            self.interval = luigi.date_interval.Custom(self.interval_start, self.interval_end)
コード例 #2
0
class PushToVerticaEventTypeDistributionTask(VerticaCopyTask):
    """Copy the output of EventTypeDistributionTask into the `event_type_distribution` table."""
    output_root = luigi.Parameter()
    interval = luigi.DateIntervalParameter()
    n_reduce_tasks = luigi.Parameter()
    events_list_file_path = luigi.Parameter(default=None)

    @property
    def table(self):
        """Name of the destination table."""
        return "event_type_distribution"

    @property
    def columns(self):
        """Ordered (column name, column type) pairs of the destination table."""
        text_type = 'VARCHAR(255)'
        schema = [('event_date', 'DATETIME')]
        schema.extend(
            (name, text_type)
            for name in ('event_category', 'event_type', 'event_source')
        )
        schema.append(('exported', 'BOOLEAN'))
        schema.append(('event_count', 'INT'))
        return schema

    @property
    def insert_source_task(self):
        """Task producing the rows to copy, configured from this task's own parameters."""
        source_kwargs = dict(
            output_root=self.output_root,
            interval=self.interval,
            n_reduce_tasks=self.n_reduce_tasks,
            events_list_file_path=self.events_list_file_path,
        )
        return EventTypeDistributionTask(**source_kwargs)
コード例 #3
0
class OpenNTPWeek(luigi.Task):
    """
    Copy the single parsed NTP scan file for a week from the remote host to S3.

    The remote directory ``/tmp/ntp-scan/`` is searched for a file named
    ``parsed-<day>.out`` for each day in ``week``. Exactly one such file must exist.
    """
    week = luigi.DateIntervalParameter()

    def output(self):
        """Return the S3 target the week's file is copied to."""
        return luigi.s3.S3Target("s3://test-bucket/raw/ntp-scan/parsed-{}".format(self.week))

    def run(self):
        """
        Locate the week's file on the remote host and stream it into the output target.

        :raises RuntimeError: if no file, or more than one file, matches a day of the week.
        """
        ssh = luigi.contrib.ssh.RemoteFileSystem("127.0.0.1")
        dir_list = ssh.listdir("/tmp/ntp-scan/")

        # Collect every (day, remote path) match up front so both error conditions are
        # checked against the whole week before anything is written to the output.
        matches = [
            (day, remote_path)
            for day in self.week.dates()
            for remote_path in dir_list
            if os.path.basename(remote_path) == 'parsed-{}.out'.format(day)
        ]
        if not matches:
            raise RuntimeError("didn't find anything for the week")
        if len(matches) > 1:
            raise RuntimeError("Multiple files for week")

        day, remote_path = matches[0]
        logging.info("found {} in {}".format(day, dir_list))
        tmpfile = luigi.LocalTarget(is_tmp=True)
        ssh.get(remote_path, tmpfile.path)
        with self.output().open("w") as out_file, tmpfile.open('r') as in_file:
            for line in in_file:
                out_file.write(line)
コード例 #4
0
class ArtistS3ToDatabase(luigi.contrib.postgres.CopyToTable):
    """
    Copy the output of :py:class:`~.Top10ArtistsS3` into the PostgreSQL table ``artist_streams``.

    This class defines no ``run`` or ``output`` of its own; it only supplies the
    connection settings, the table layout and the upstream dependency to
    :py:class:`luigi.contrib.postgres.CopyToTable`.

    The connection settings are read from the ``LUIGI_DB*`` environment variables when
    the class body is executed, so a missing variable raises ``KeyError`` at that point.
    """

    date_interval = luigi.DateIntervalParameter()
    sleep_seconds = luigi.Parameter()

    # Connection settings, taken from the environment at class-definition time.
    host = os.environ["LUIGI_DBHOST"]
    database = os.environ["LUIGI_DBDATABASE"]
    user = os.environ["LUIGI_DBUSER"]
    password = os.environ["LUIGI_DBPASSWORD"]
    table = "artist_streams"

    # (column name, column type) pairs of the destination table.
    columns = [("date_from", "DATE"), ("date_to", "DATE"), ("artist", "TEXT"),
               ("streams", "INT")]

    def requires(self):
        """
        This task's dependency:
        * :py:class:`~.Top10ArtistsS3`, built from this task's ``date_interval`` and ``sleep_seconds``
        :return: a single task object (:py:class:`luigi.task.Task`)
        """
        return Top10ArtistsS3(self.date_interval, self.sleep_seconds)
コード例 #5
0
class EventLogSelectionDownstreamMixin(object):
    """
    Defines parameters for passing upstream to tasks that use EventLogSelectionMixin.

    `source`, `expand_interval` and `pattern` declare a `config_path` in the
    [event-logs] configuration section; `interval` has neither a default nor a
    config path, and `date_pattern` defaults to '%Y%m%d'.
    """

    # List-valued parameter: one or more URLs to search for log files.
    source = luigi.Parameter(
        is_list=True,
        config_path={'section': 'event-logs', 'name': 'source'},
        description='A URL to a path that contains log files that contain the events. (e.g., s3://my_bucket/foo/).',
    )
    interval = luigi.DateIntervalParameter(
        description='The range of dates to export logs for.',
    )
    # Padding applied on both sides of `interval` when selecting files (see description).
    expand_interval = luigi.TimeDeltaParameter(
        config_path={'section': 'event-logs', 'name': 'expand_interval'},
        description='A time interval to add to the beginning and end of the interval to expand the windows of '
        'files captured.',
    )
    # List-valued parameter: one or more regexes, each with a named date capture group.
    pattern = luigi.Parameter(
        is_list=True,
        config_path={'section': 'event-logs', 'name': 'pattern'},
        description='A regex with a named capture group for the date that approximates the date that the events '
        'within were emitted. Note that the search interval is expanded, so events don\'t have to be in exactly '
        'the right file in order for them to be processed.',
    )

    # strftime-style format of the date captured by `pattern`.
    date_pattern = luigi.Parameter(
        default='%Y%m%d',
        description='The format of the date as it appears in the source file name. Note that this correlates with the '
        'named capture group for date in the pattern parameter. This is intended to select relevant event log files '
        'by making sure the date is within the interval.',
    )
 def create_link_clicked_task(self, interval='2013-01-01'):
     """Build an LMSCoursewareLinkClickedTask for `interval` and store it on `self.task`."""
     parsed_interval = luigi.DateIntervalParameter().parse(interval)
     self.task = LMSCoursewareLinkClickedTask(interval=parsed_interval, output_root="/fake/output")
コード例 #7
0
 def create_task_distribution_task(self, interval='2013-01-01'):
     """Build a TagsDistributionPerCourse task for `interval` and store it on `self.task`."""
     parsed_interval = luigi.DateIntervalParameter().parse(interval)
     self.task = TagsDistributionPerCourse(interval=parsed_interval, output_root="/fake/output")
コード例 #8
0
class SongMetaPreprocess(luigi.Task):
    """
    Join the encoded song indices from the taste data with song metadata.

    Reads song id, title and artist name from the SQLite database at
    ``./origin/track_metadata.db``, left-joins them onto the distinct
    (song_id, song_index) pairs of the upstream CSV, and writes the result
    without the ``song_id`` column.
    """
    date_interval = luigi.DateIntervalParameter()

    def requires(self):
        """Depend on the taste preprocessing task for the same interval."""
        return UserTastePreprocess(self.date_interval)

    def run(self):
        """Build the song metadata table and write it to the output target as CSV."""
        track_meta_path = './origin/track_metadata.db'
        conn = sqlite3.connect(track_meta_path)
        try:
            q = "SELECT song_id, title, artist_name FROM songs"
            echonest_meta = conn.execute(q).fetchall()
        finally:
            # Release the database handle even if the query fails.
            conn.close()
        song_meta = pd.DataFrame(
            echonest_meta, columns=['song_id', 'song_title', 'artist_name'])

        with self.input().open('r') as in_file:
            taste = pd.read_csv(in_file)
        song_encode = taste[['song_id', 'song_index']].drop_duplicates()

        # Left join keeps every encoded song, even without metadata.
        song_encode_meta = pd.merge(song_encode,
                                    song_meta,
                                    how='left',
                                    on='song_id')

        with self.output().open('w') as out_file:
            song_encode_meta.drop('song_id', axis=1).to_csv(out_file,
                                                            index=False,
                                                            encoding='utf-8')

    def output(self):
        """Return the local CSV target, named after the date interval."""
        return luigi.LocalTarget("./data/song_encode_meta_%s.csv" %
                                 self.date_interval)
コード例 #9
0
ファイル: move_to_hdfs.py プロジェクト: ukwa/ukwa-manage
class ScanForLaunches(luigi.WrapperTask):
    """
    Walk the crawl output folder and require one task per (job, launch) folder found.

    For every date in ``date_interval`` each job directory is searched for launch
    directories whose name starts with that date formatted as YYYYMMDD.

    Sub-class this and override the scan_job_launch method as needed.
    """
    task_namespace = 'scan'
    date_interval = luigi.DateIntervalParameter(default=get_large_interval())
    timestamp = luigi.DateMinuteParameter(default=datetime.datetime.today())

    def requires(self):
        """Yield the task returned by scan_job_launch for every launch found."""
        for job, launch in self.enumerate_launches():
            logger.info("Processing %s/%s" % (job, launch))
            yield self.scan_job_launch(job, launch)

    def enumerate_launches(self):
        """Yield (job name, launch name) pairs for launch directories matching the interval."""
        for date in self.date_interval:
            logger.info("Looking at date %s" % date)
            for job_dir in glob.glob("%s/*" % CRAWL_OUTPUT_FOLDER):
                job = os.path.basename(job_dir)
                # Only directories are jobs; skip stray files.
                if not os.path.isdir(job_dir):
                    continue
                launch_glob = "%s/%s*" % (job_dir, date.strftime('%Y%m%d'))
                logger.info("Looking for job launch folders matching %s" % launch_glob)
                for launch_dir in glob.glob(launch_glob):
                    logger.info("Found %s" % launch_dir)
                    if not os.path.isdir(launch_dir):
                        continue
                    yield (job, os.path.basename(launch_dir))
コード例 #10
0
class PushToVerticaLMSCoursewareLinkClickedTask(VerticaCopyTask):
    """Copy the output of LMSCoursewareLinkClickedTask into `lms_courseware_link_clicked_events`."""
    output_root = luigi.Parameter()
    interval = luigi.DateIntervalParameter()
    n_reduce_tasks = luigi.Parameter()

    @property
    def table(self):
        """Name of the destination table."""
        return "lms_courseware_link_clicked_events"

    @property
    def columns(self):
        """Ordered (column name, column type) pairs of the destination table."""
        schema = [
            ('course_id', 'VARCHAR(255)'),
            ('event_date', 'DATE'),
        ]
        schema.extend(
            (counter, 'INT')
            for counter in ('external_link_clicked_events', 'link_clicked_events')
        )
        return schema

    @property
    def insert_source_task(self):
        """Task producing the rows to copy, configured from this task's own parameters."""
        source_kwargs = dict(
            output_root=self.output_root,
            interval=self.interval,
            n_reduce_tasks=self.n_reduce_tasks,
        )
        return LMSCoursewareLinkClickedTask(**source_kwargs)

    @property
    def auto_primary_key(self):
        """Use 'record_number' as primary key to match the schema"""
        key_name, key_type = 'record_number', 'AUTO_INCREMENT'
        return (key_name, key_type)

    @property
    def default_columns(self):
        """Automatically-filled columns: none are defined for this table."""
        return None
コード例 #11
0
class UserRegistrationsPerDay(MysqlSelectTask):
    """
    Count, per calendar day, the accounts registered within `date_interval`.

    """

    date_interval = luigi.DateIntervalParameter(
        description='The range of dates to gather data for.',
    )

    @property
    def query(self):
        """SQL selecting (day, count) rows for a half-open [start, end) range of `date_joined`."""
        sql = ("SELECT DATE(date_joined), COUNT(1) FROM `auth_user`"
               " WHERE `date_joined` >= %s AND `date_joined` < %s GROUP BY DATE(date_joined) ORDER BY 1 ASC")
        return sql

    @property
    def query_parameters(self):
        """Return the (start, exclusive end) bounds for the query placeholders."""
        interval_days = self.date_interval.dates()  # pylint: disable=no-member
        first_day = interval_days[0]
        # The upper bound is exclusive: midnight of the day after the last day in the
        # interval, which avoids having to express "23:59:59" on the last day.
        day_after_last = interval_days[-1] + datetime.timedelta(days=1)
        lower_bound = mysql_datetime(first_day)
        upper_bound = mysql_datetime(day_after_last)
        return (lower_bound, upper_bound)

    @property
    def filename(self):
        """Name of the TSV output file, derived from the date interval."""
        return 'user_registrations_{0}.tsv'.format(self.date_interval)
class AggregateInternalReportingUserActivityTableHive(HiveTableFromQueryTask):
    """Aggregate the user activity table in Hive."""
    interval = luigi.DateIntervalParameter()
    n_reduce_tasks = luigi.Parameter()

    def requires(self):
        """
        This task reads from auth_user and user_activity_daily, so require that they be
        loaded into Hive (via MySQL loads into Hive or via the pipeline as needed).
        """
        return [ImportAuthUserTask(overwrite=False, destination=self.warehouse_path),
                UserActivityTableTask(interval=self.interval, warehouse_path=self.warehouse_path,
                                      n_reduce_tasks=self.n_reduce_tasks)]

    @property
    def table(self):
        return 'internal_reporting_user_activity'

    @property
    def columns(self):
        return [
            ('user_id', 'INT'),
            ('course_id', 'STRING'),
            ('date', 'STRING'),
            ('activity_type', 'STRING'),
            ('number_of_activities', 'INT'),
        ]

    @property
    def partition(self):
        return HivePartition('dt', self.interval.date_b.isoformat())  # pylint: disable=no-member

    @property
    def insert_query(self):
        return """
コード例 #13
0
 def test_use_interval(self):
     """Instantiate CourseEnrollmentTask with an explicit interval and no interval_start."""
     parsed_interval = luigi.DateIntervalParameter().parse('2013-01-01')
     CourseEnrollmentTask(
         interval=parsed_interval,
         interval_start=None,
         output_root="/fake/output",
         overwrite_n_days=5,
     )
コード例 #14
0
ファイル: mixins.py プロジェクト: initialkommit/fika
class DateIntervalMixin(luigi.Task):
    """Task base class that receives the date range to query as a parameter."""
    # All dates below are computed once, when this class body is executed, from the
    # local (naive) clock; they do not follow the calendar in a long-running process.
    today = datetime.datetime.now().date()
    luigi_today = luigi_date_interval.Date(today.year, today.month, today.day)
    yesterday = today - datetime.timedelta(days=1)
    luigi_yesterday = luigi_date_interval.Date(yesterday.year, yesterday.month, yesterday.day)
    # NOTE(review): two positional arguments are passed, so only `luigi_yesterday`
    # occupies the first parameter slot and `luigi_today` lands in whatever the second
    # positional parameter of luigi.Parameter is. Presumably a yesterday..today default
    # interval was intended — TODO confirm against the luigi.Parameter signature and, if
    # so, pass a single interval object via `default=`.
    date_interval = luigi.DateIntervalParameter(luigi_yesterday, luigi_today)
コード例 #15
0
class Top10Artists(luigi.Task):
    """
    Find the top 10 artists from the aggregated list.

    Reads "<artist> <streams>" lines from the upstream aggregation and writes the ten
    entries with the most streams, one per line, prefixed with the interval bounds.
    """
    date_interval = luigi.DateIntervalParameter()
    use_hadoop = luigi.BoolParameter()

    def requires(self):
        """Depend on the Spark aggregation when `use_hadoop` is set, else the local one."""
        if self.use_hadoop:
            return AggregateArtistsSpark(self.date_interval)
        else:
            return AggregateArtists(self.date_interval)

    def output(self):
        """Return the local TSV target, named after the date interval."""
        # Fixed attribute name: the original read `self.daterinterval`, which does not exist.
        return luigi.LocalTarget("data/top_artists_%s.tsv" % self.date_interval)

    def run(self):
        """Write the ten largest (streams, artist) entries to the output target."""
        top_10 = nlargest(10, self._input_iterator())
        with self.output().open('w') as out_file:
            for streams, artist in top_10:
                # Same space-separated line the Python 2 `print >> out_file, ...` form
                # produced, written in a way that also works on Python 3.
                out_file.write("%s %s %s %s\n" % (
                    self.date_interval.date_a,
                    self.date_interval.date_b,
                    artist,
                    streams,
                ))

    def _input_iterator(self):
        """Yield (streams, artist) integer pairs parsed from the upstream output."""
        with self.input().open('r') as in_file:
            for line in in_file:
                artist, streams = line.strip().split()
                # Streams first so nlargest orders by stream count.
                yield int(streams), int(artist)
コード例 #16
0
ファイル: dates.py プロジェクト: arapso-scaffoldings/python
class CurrencyRatesForInterval(luigi.WrapperTask):
    """Wrapper task requiring one CurrencyRatesDaily task per date in `dates`."""

    dates = luigi.DateIntervalParameter()

    def requires(self):
        """Yield a CurrencyRatesDaily task for each date of the interval."""
        for single_day in self.dates:
            daily_task = CurrencyRatesDaily(date=single_day)
            yield daily_task
コード例 #17
0
class ScanForPackages(luigi.WrapperTask):
    """
    Walk the state folder and require a ProcessPackages task per launch folder found.

    For every date in ``date_interval`` each job directory is searched for launch
    directories whose name starts with that date formatted as YYYYMMDD.
    """
    task_namespace = 'output'
    date_interval = luigi.DateIntervalParameter(default=[
        datetime.date.today() - datetime.timedelta(days=1),
        datetime.date.today()
    ])

    def requires(self):
        """Yield a ProcessPackages task for every matching launch directory."""
        for date in self.date_interval:
            for job_dir in glob.glob("%s/*/*" % state().state_folder):
                # The job lookup happens for every entry, before the directory check.
                job = Jobs[os.path.basename(job_dir)]
                if not os.path.isdir(job_dir):
                    continue
                launch_glob = "%s/%s*" % (job_dir, date.strftime('%Y%m%d'))
                for launch_dir in glob.glob(launch_glob):
                    if not os.path.isdir(launch_dir):
                        continue
                    launch = os.path.basename(launch_dir)
                    # TODO Limit total number of processes?
                    logger.info("ScanForPackages - looking at %s %s" % (job, launch_dir))
                    yield ProcessPackages(job, launch, launch_dir)
コード例 #18
0
class WordCount(luigi.hadoop.JobTask):
    """
    Count word occurrences over the InputText outputs for every date in the interval.

    The job is a :py:class:`luigi.hadoop.JobTask`: ``mapper`` emits ``(word, 1)`` for
    each whitespace-separated word and ``reducer`` sums those counts per word.
    """

    date_interval = luigi.DateIntervalParameter()

    def requires(self):
        """
        This task's dependencies:
        * one :py:class:`~.InputText` per date in ``date_interval``
        :return: list of object (:py:class:`luigi.task.Task`)
        """
        return [InputText(date) for date in self.date_interval.dates()]

    def output(self):
        """
        Returns the target output for this task: an HDFS path under /tmp/text-count/
        named after the date interval.
        :return: the target output for this task.
        :rtype: object (:py:class:`luigi.target.Target`)
        """
        return luigi.hdfs.HdfsTarget('/tmp/text-count/%s' % self.date_interval)

    def mapper(self, line):
        """Emit ``(word, 1)`` for every whitespace-separated word in ``line``."""
        for word in line.strip().split():
            yield word, 1

    def reducer(self, key, values):
        """Emit ``(word, total)`` where total is the sum of the counts for ``key``."""
        # The original always emitted 1 here, so every word was reported with a count
        # of one regardless of how often it occurred.
        yield key, sum(values)
コード例 #19
0
 def create_task(self, interval='2013-01-01'):
     """Build a CourseEnrollmentTask for `interval` and store it on `self.task`."""
     parsed_interval = luigi.DateIntervalParameter().parse(interval)
     self.task = CourseEnrollmentTask(interval=parsed_interval, output_root="/fake/output")
コード例 #20
0
 def setUp(self):
     """Prepare a TotalEventsDailyTask for 2014-12-17 and a sample timestamp key."""
     self.interval = '2014-12-17'
     parsed_interval = luigi.DateIntervalParameter().parse(self.interval)
     self.task = TotalEventsDailyTask(interval=parsed_interval, output_root="/fake/output")
     self.key = '2014-12-17T00:00:01'
コード例 #21
0
class WordCount(luigi.Task):
    """
    Count word occurrences over the InputText outputs for every date in the interval.

    The result is written as one "<word>\\t<count>" line per distinct word.
    """
    date_interval = luigi.DateIntervalParameter()

    def requires(self):
        """Depend on one InputText task per date in the interval."""
        return [InputText(date) for date in self.date_interval.dates()]

    def output(self):
        """Return the local TSV target, named after the date interval."""
        return luigi.LocalTarget('data/text-count-%s.tsv' % self.date_interval)

    def run(self):
        """Tally the words of every input target and write the totals to the output."""
        counts = {}
        # input() wraps requires() and returns the corresponding Target objects.
        for target in self.input():
            # Context manager so each input stream is closed once it has been read.
            with target.open('r') as in_file:
                for line in in_file:
                    for word in line.strip().split():
                        counts[word] = counts.get(word, 0) + 1

        # The output must be closed for the write to be finalised; `with` does that
        # and keeps the handle from leaking if a write raises.
        with self.output().open('w') as out_file:
            for word, total in counts.items():
                out_file.write("%s\t%d\n" % (word, total))
コード例 #22
0
ファイル: top_artists.py プロジェクト: leochencipher/luigi
class Top10Artists(luigi.Task):
    """
    Pick the ten artists with the most streams from the upstream aggregation.

    Each output line is tab-separated: interval start, interval end, artist, streams.
    """
    date_interval = luigi.DateIntervalParameter()
    use_hadoop = luigi.BoolParameter()

    def requires(self):
        """Depend on the Hadoop aggregation when `use_hadoop` is set, else the local one."""
        if not self.use_hadoop:
            return AggregateArtists(self.date_interval)
        return AggregateArtistsHadoop(self.date_interval)

    def output(self):
        """Return the local TSV target, named after the date interval."""
        target_path = "data/top_artists_%s.tsv" % self.date_interval
        return luigi.LocalTarget(target_path)

    def run(self):
        """Write the ten largest (streams, artist) entries to the output target."""
        top_10 = nlargest(10, self._input_iterator())
        with self.output().open('w') as out_file:
            for streams, artist in top_10:
                fields = (
                    str(self.date_interval.date_a),
                    str(self.date_interval.date_b),
                    artist,
                    str(streams),
                )
                out_file.write('\t'.join(fields) + '\n')

    def _input_iterator(self):
        """Yield (streams as int, artist) pairs parsed from the upstream output."""
        with self.input().open('r') as in_file:
            for raw_line in in_file:
                artist, stream_count = raw_line.strip().split()
                yield int(stream_count), artist
コード例 #23
0
class Top10ArtistsS3(luigi.Task):
    """
    Pick the ten artists with the most streams and write them to an S3 target.

    Each output line is tab-separated: interval start, interval end, artist, streams.
    """

    date_interval = luigi.DateIntervalParameter()
    sleep_seconds = luigi.Parameter()

    def requires(self):
        """Depend on the S3 aggregation for the same interval and sleep setting."""
        return AggregateArtistsS3(self.date_interval, self.sleep_seconds)

    def output(self):
        """Return the S3 target built from the LUIGIS3_EXAMPLES environment variable."""
        location = os.environ["LUIGIS3_EXAMPLES"]
        file_name = "top_artists_{}.tsv".format(self.date_interval)
        return luigi_s3.S3Target("s3:{0}{1}".format(location, file_name))

    def run(self):
        """Write the ten largest (streams, artist) entries to the output target."""
        top_10 = nlargest(10, self._input_iterator())
        with self.output().open('w') as out_file:
            for streams, artist in top_10:
                fields = (
                    str(self.date_interval.date_a),
                    str(self.date_interval.date_b),
                    artist,
                    str(streams),
                )
                out_file.write('\t'.join(fields) + '\n')

    def _input_iterator(self):
        """Yield (streams as int, artist) pairs parsed from the upstream output."""
        with self.input().open('r') as in_file:
            for raw_line in in_file:
                artist, stream_count = raw_line.strip().split()
                yield int(stream_count), artist
コード例 #24
0
class CalendarDownstreamMixin(OverwriteOutputMixin):
    """
    The parameters needed to generate a complete calendar.

    `interval` declares a config path of [calendar] interval.
    """

    interval = luigi.DateIntervalParameter(
        config_path={'section': 'calendar', 'name': 'interval'},
    )
コード例 #25
0
    def create_validation_task(self,
                               generate_before=True,
                               tuple_output=True,
                               include_nonstate_changes=True,
                               earliest_timestamp=None,
                               expected_validation=None):
        """
        Build a CourseEnrollmentValidationTask on `self.task` and call its init_local().

        `earliest_timestamp` and `expected_validation` are date-hour strings; a falsy
        value is passed to the task as None.
        """
        interval_value = luigi.DateIntervalParameter().parse('2013-01-01-2014-10-10')

        if earliest_timestamp:
            earliest_timestamp_value = luigi.DateHourParameter().parse(earliest_timestamp)
        else:
            earliest_timestamp_value = None

        if expected_validation:
            expected_validation_value = luigi.DateHourParameter().parse(expected_validation)
        else:
            expected_validation_value = None

        task_kwargs = dict(
            interval=interval_value,
            output_root="/fake/output",
            generate_before=generate_before,
            tuple_output=tuple_output,
            include_nonstate_changes=include_nonstate_changes,
            earliest_timestamp=earliest_timestamp_value,
            expected_validation=expected_validation_value,
        )
        self.task = CourseEnrollmentValidationTask(**task_kwargs)
        self.task.init_local()
コード例 #26
0
ファイル: geotweet.py プロジェクト: elnikkis/snlocest
class AggregateCountArea(luigi.Task):
    '''Task: Aggregate the count for each file

    Sums up the areas that were counted per file.

    Args:
        date_range (datetime-datetime):
        output_dir (string, optional): Output directory
        output_prefix (string, optional): default='aggarea'
    '''
    date_range = luigi.DateIntervalParameter()
    output_dir = luigi.Parameter(
        default=os.path.join(PREPROCESS_GEOTWEETS_DIR, 'aggarea'))
    output_prefix = luigi.Parameter(default='aggarea')

    def requires(self):
        '''Depend on one CountArea task per date in the range.'''
        return [CountArea(date=date) for date in self.date_range]

    def output(self):
        '''Return the local TSV target "<output_prefix>-<date_range>.tsv" in output_dir.'''
        path = os.path.join(
            self.output_dir, '{}-{}.tsv'.format(self.output_prefix,
                                                self.date_range))
        return luigi.LocalTarget(path)

    def run(self):
        '''Run the aggregation script over all inputs and sort its output into the target.'''
        # Local import keeps this block self-contained; shlex is standard library.
        import shlex

        # Quote every path so names containing spaces or shell metacharacters are
        # passed through the shell as single arguments.
        input_files = ' '.join(shlex.quote(i.path) for i in self.input())
        with self.output().temporary_path() as temp_output_path:
            # NOTE(review): with a plain pipe, `check=True` only sees the exit status of
            # `sort`; a failure of the Python script is presumably not detected — confirm.
            run('python -m snlocest.scripts.agg_count {} | LC_ALL=C sort > {}'.
                format(input_files, shlex.quote(temp_output_path)),
                shell=True,
                check=True)
コード例 #27
0
ファイル: top_artists.py プロジェクト: zhaohc10/luigi
class ArtistToplistToDatabase(luigi.contrib.postgres.CopyToTable):
    """
    Copy the output of :py:class:`~.Top10Artists` into the PostgreSQL table ``top10``.

    This class defines no ``run`` or ``output`` of its own; it only supplies the
    connection settings, the table layout and the upstream dependency to
    :py:class:`luigi.contrib.postgres.CopyToTable`.
    """

    date_interval = luigi.DateIntervalParameter()
    use_hadoop = luigi.BoolParameter()

    # Connection settings; user and password are masked placeholders in this file.
    host = "localhost"
    database = "toplists"
    user = "******"
    password = "******"  # ;)
    table = "top10"

    # (column name, column type) pairs of the destination table.
    columns = [("date_from", "DATE"), ("date_to", "DATE"), ("artist", "TEXT"),
               ("streams", "INT")]

    def requires(self):
        """
        This task's dependency:

        * :py:class:`~.Top10Artists`, built from this task's ``date_interval`` and ``use_hadoop``

        :return: a single task object (:py:class:`luigi.task.Task`)
        """
        return Top10Artists(self.date_interval, self.use_hadoop)
コード例 #28
0
ファイル: geotweet.py プロジェクト: elnikkis/snlocest
class SelectMajorityHomeLocation(luigi.Task):
    '''Task: Select the home location by majority voting

    Args:
        date_range (datetime-datetime):
        min_majoritynum (int, optional): The minimum number of the tweet in the majority area (default=1; no limit)
        min_totalnum (int, optional): The total number of the tweet of the user (default=1; no limit)
        output_dir (string, optional): output directory
    '''
    date_range = luigi.DateIntervalParameter()
    #method = luigi.Parameter(default='MajorityVote') # used in future
    min_majoritynum = luigi.IntParameter(default=1)
    min_totalnum = luigi.IntParameter(default=1)
    output_dir = luigi.Parameter(default=os.path.join(
        PREPROCESS_GEOTWEETS_DIR, 'homelocation', 'majority'))

    def requires(self):
        '''Depend on the aggregated area counts for the same date range.'''
        return AggregateCountArea(date_range=self.date_range)

    def output(self):
        '''Return the local TSV target, named after the date range and both thresholds.'''
        path = os.path.join(
            self.output_dir, '{}_MinMajorityNum-{}_MinTotalNum-{}.tsv'.format(
                self.date_range, self.min_majoritynum, self.min_totalnum))
        return luigi.LocalTarget(path)

    def run(self):
        '''Pipe the aggregated counts through the home-location script into the target.'''
        # Local import keeps this block self-contained; shlex is standard library.
        import shlex

        cmd = 'cat {} | python -m snlocest.scripts.decidehomelocation --min-majoritynum {} --min-totalnum {} > {}'
        with self.output().temporary_path() as temp_output_path:
            # Quote both paths so names containing spaces or shell metacharacters are
            # passed through the shell intact; the two thresholds are IntParameters.
            run(cmd.format(shlex.quote(self.input().path), self.min_majoritynum,
                           self.min_totalnum, shlex.quote(temp_output_path)),
                shell=True,
                check=True)
コード例 #29
0
ファイル: crawl_reports.py プロジェクト: min2ha/ukwa-monitor
class ScanForOutputs(luigi.WrapperTask):
    """
    List the crawl stats folder on HDFS and require one task per (job, launch) found.

    For every date in ``date_interval`` each job folder is listed and the launches
    whose name starts with that date formatted as YYYYMMDD are selected.

    Sub-class this and provide the process_output method, which `requires` calls
    for each launch.
    """
    task_namespace = 'scan'
    date_interval = luigi.DateIntervalParameter(default=get_modest_interval())
    timestamp = luigi.DateMinuteParameter(default=datetime.datetime.today())

    def requires(self):
        """Yield the task returned by process_output for every launch found."""
        for job, launch in self.enumerate_launches():
            yield self.process_output(job, launch)

    def enumerate_launches(self):
        """Yield (job name, launch name) pairs for launches matching the interval."""
        hdfs_client = luigi.contrib.hdfs.WebHdfsClient()
        for date in self.date_interval:
            logger.info("Scanning date %s..." % date)
            for job_path in hdfs_client.listdir(CRAWL_STATS_PREFIX):
                job = os.path.basename(job_path)
                launch_glob = date.strftime('%Y%m%d')
                logger.debug(
                    "Looking for job launch folders matching %s in %s/%s" %
                    (launch_glob, CRAWL_STATS_PREFIX, job))
                job_folder = "%s/%s" % (CRAWL_STATS_PREFIX, job)
                for launch_path in hdfs_client.listdir(job_folder):
                    logger.debug("Looking at %s" % launch_path)
                    launch = os.path.basename(launch_path)
                    # Launch folder names begin with the launch date.
                    if not launch.startswith(launch_glob):
                        continue
                    yield (job, launch)
コード例 #30
0
class WordCountHadoopTask(luigi.contrib.hadoop.JobTask):
    """
    Hadoop word count over the InputText outputs for every date in the interval.

    ``mapper`` emits ``(word, 1)`` for each whitespace-separated word and ``reducer``
    sums the counts per word. Output compression is switched off through job confs.
    """
    # NOTE(review): luigi's namespace attribute is presumably `task_namespace`
    # (singular), so this one may have no effect as spelled. Left unchanged because
    # renaming it would change the task's registered name — TODO confirm.
    task_namespaces = 'examples'

    date_interval = luigi.DateIntervalParameter()

    def requires(self):
        """Depend on one InputText task per date in the interval."""
        return [InputText(date) for date in self.date_interval.dates()]

    def output(self):
        """Return the plain-format HDFS target, named after the date interval."""
        return luigi.contrib.hdfs.HdfsTarget(
            '/pgc/baoqiang/examples/{}.output'.format(self.date_interval),
            format=luigi.contrib.hdfs.Plain)

    def jobconfs(self):
        """Return the inherited job confs plus settings that disable compression."""
        # Start the lookup after this class, not after JobTask: the original
        # `super(luigi.contrib.hadoop.JobTask, self)` skipped JobTask's own jobconfs().
        jcs = super(WordCountHadoopTask, self).jobconfs()
        jcs.append('mapred.compress.map.output=false')
        jcs.append('mapred.output.compress=false')
        jcs.append('mapred.output.fileoutputformat.compress=false')
        return jcs

    def mapper(self, line):
        """Emit ``(word, 1)`` for every whitespace-separated word in ``line``."""
        for word in line.strip().split():
            yield word, 1

    def reducer(self, key, values):
        """Emit ``(word, total)`` where total is the sum of the counts for ``key``."""
        yield key, sum(values)