class CourseEnrollmentTableDownstreamMixin(WarehouseMixin, EventLogSelectionDownstreamMixin, MapReduceJobTaskMixin):
    """All parameters needed to run the CourseEnrollmentTableTask task."""

    # Make the interval be optional:
    interval = luigi.DateIntervalParameter(
        default=None,
        description='The range of dates to export logs for. '
                    'If not specified, `interval_start` and `interval_end` are used to construct the `interval`.',
    )

    # Define optional parameters, to be used if 'interval' is not defined.
    interval_start = luigi.DateParameter(
        config_path={'section': 'enrollments', 'name': 'interval_start'},
        significant=False,
        description='The start date to export logs for. Ignored if `interval` is provided.',
    )
    interval_end = luigi.DateParameter(
        default=datetime.datetime.utcnow().date(),
        significant=False,
        description='The end date to export logs for. Ignored if `interval` is provided. '
                    'Default is today, UTC.',
    )

    def __init__(self, *args, **kwargs):
        super(CourseEnrollmentTableDownstreamMixin, self).__init__(*args, **kwargs)

        if not self.interval:
            self.interval = luigi.date_interval.Custom(self.interval_start, self.interval_end)
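# A hedged usage sketch (not from the source): DateIntervalParameter accepts several
# interval spellings, so the mixin above can be driven either by a single interval
# argument or by the interval_start/interval_end fallback built in __init__.
import datetime
import luigi
import luigi.date_interval

param = luigi.DateIntervalParameter()
print(param.parse('2014-01'))                # a whole month
print(param.parse('2014-W05'))               # an ISO week
print(param.parse('2014-01-01-2014-02-01'))  # a custom range (end date exclusive)
# What the mixin's __init__ constructs when `interval` is omitted:
print(luigi.date_interval.Custom(datetime.date(2014, 1, 1), datetime.date(2014, 2, 1)))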
class PushToVerticaEventTypeDistributionTask(VerticaCopyTask):
    """Push the event type distribution task data to Vertica."""
    output_root = luigi.Parameter()
    interval = luigi.DateIntervalParameter()
    n_reduce_tasks = luigi.Parameter()
    events_list_file_path = luigi.Parameter(default=None)

    @property
    def table(self):
        return "event_type_distribution"

    @property
    def columns(self):
        return [
            ('event_date', 'DATETIME'),
            ('event_category', 'VARCHAR(255)'),
            ('event_type', 'VARCHAR(255)'),
            ('event_source', 'VARCHAR(255)'),
            ('exported', 'BOOLEAN'),
            ('event_count', 'INT'),
        ]

    @property
    def insert_source_task(self):
        return EventTypeDistributionTask(
            output_root=self.output_root,
            interval=self.interval,
            n_reduce_tasks=self.n_reduce_tasks,
            events_list_file_path=self.events_list_file_path,
        )
class OpenNTPWeek(luigi.Task):
    week = luigi.DateIntervalParameter()

    def output(self):
        # return luigi.LocalTarget("data/OpenNTP/week-{}".format(self.week))
        return luigi.s3.S3Target("s3://test-bucket/raw/ntp-scan/parsed-{}".format(self.week))

    def run(self):
        ssh = luigi.contrib.ssh.RemoteFileSystem("127.0.0.1")
        dir_list = ssh.listdir("/tmp/ntp-scan/")
        found_a_file = False
        for day in self.week.dates():
            d_file = 'parsed-{}.out'.format(day)
            for f in dir_list:
                if d_file == os.path.basename(f):
                    if found_a_file:
                        raise RuntimeError("Multiple files for week")
                    logging.info("found {} in {}".format(day, dir_list))
                    tmpfile = luigi.LocalTarget(is_tmp=True)
                    ssh.get(f, tmpfile.path)
                    with self.output().open("w") as out_file, tmpfile.open('r') as in_file:
                        for line in in_file:
                            out_file.write(line)
                    found_a_file = True
        if not found_a_file:
            raise RuntimeError("didn't find anything for the week")
class ArtistS3ToDatabase(luigi.contrib.postgres.CopyToTable):
    """
    This task runs a :py:class:`luigi.contrib.postgres.CopyToTable` task
    over the target data returned by :py:meth:`~.Top10ArtistsS3.output` and
    writes the result into its :py:meth:`~.ArtistS3ToDatabase.output` target which,
    by default, is :py:class:`luigi.contrib.postgres.PostgresTarget` (a table in PostgreSQL).

    This class uses :py:meth:`luigi.contrib.postgres.CopyToTable.run` and
    :py:meth:`luigi.contrib.postgres.CopyToTable.output`.
    """

    date_interval = luigi.DateIntervalParameter()
    sleep_seconds = luigi.Parameter()

    host = os.environ["LUIGI_DBHOST"]
    database = os.environ["LUIGI_DBDATABASE"]
    user = os.environ["LUIGI_DBUSER"]
    password = os.environ["LUIGI_DBPASSWORD"]
    table = "artist_streams"

    columns = [("date_from", "DATE"),
               ("date_to", "DATE"),
               ("artist", "TEXT"),
               ("streams", "INT")]

    def requires(self):
        """
        This task's dependencies:

        * :py:class:`~.Top10ArtistsS3`

        :return: object (:py:class:`luigi.task.Task`)
        """
        return Top10ArtistsS3(self.date_interval, self.sleep_seconds)
class EventLogSelectionDownstreamMixin(object):
    """Defines parameters for passing upstream to tasks that use EventLogSelectionMixin."""

    source = luigi.Parameter(
        is_list=True,
        config_path={'section': 'event-logs', 'name': 'source'},
        description='A URL to a path that contains log files that contain the events. (e.g., s3://my_bucket/foo/).',
    )
    interval = luigi.DateIntervalParameter(
        description='The range of dates to export logs for.',
    )
    expand_interval = luigi.TimeDeltaParameter(
        config_path={'section': 'event-logs', 'name': 'expand_interval'},
        description='A time interval to add to the beginning and end of the interval to expand the windows of '
                    'files captured.',
    )
    pattern = luigi.Parameter(
        is_list=True,
        config_path={'section': 'event-logs', 'name': 'pattern'},
        description='A regex with a named capture group for the date that approximates the date that the events '
                    'within were emitted. Note that the search interval is expanded, so events don\'t have to be in '
                    'exactly the right file in order for them to be processed.',
    )
    date_pattern = luigi.Parameter(
        default='%Y%m%d',
        description='The format of the date as it appears in the source file name. Note that this correlates with '
                    'the named capture group for date in the pattern parameter. This is intended to select relevant '
                    'event log files by making sure the date is within the interval.',
    )
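# A hedged illustration (not from the source) of how `pattern` and `date_pattern`
# cooperate: the regex names a `date` capture group, and the captured text is parsed
# with the strptime-style `date_pattern` to test membership in the interval.
import datetime
import re

pattern = r'.*tracking\.log-(?P<date>\d{8})\.gz'   # hypothetical filename pattern
date_pattern = '%Y%m%d'

match = re.match(pattern, 's3://my_bucket/foo/tracking.log-20140105.gz')
if match:
    file_date = datetime.datetime.strptime(match.group('date'), date_pattern).date()
    print(file_date)  # datetime.date(2014, 1, 5) -> compare against the interval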
def create_link_clicked_task(self, interval='2013-01-01'):
    """Create a task for testing purposes."""
    fake_param = luigi.DateIntervalParameter()
    self.task = LMSCoursewareLinkClickedTask(
        interval=fake_param.parse(interval),
        output_root="/fake/output",
    )
def create_task_distribution_task(self, interval='2013-01-01'):
    """Create a task for testing purposes."""
    fake_param = luigi.DateIntervalParameter()
    self.task = TagsDistributionPerCourse(
        interval=fake_param.parse(interval),
        output_root="/fake/output",
    )
class SongMetaPreprocess(luigi.Task):
    date_interval = luigi.DateIntervalParameter()

    def requires(self):
        return UserTastePreprocess(self.date_interval)

    def run(self):
        track_meta_path = './origin/track_metadata.db'
        conn = sqlite3.connect(track_meta_path)
        q = "SELECT song_id, title, artist_name FROM songs"
        res = conn.execute(q)
        echonest_meta = res.fetchall()
        song_meta = pd.DataFrame(
            echonest_meta, columns=['song_id', 'song_title', 'artist_name'])
        with self.input().open('r') as in_file:
            taste = pd.read_csv(in_file)
        song_encode = taste[['song_id', 'song_index']].drop_duplicates()
        song_encode_meta = pd.merge(song_encode, song_meta, how='left', on='song_id')
        with self.output().open('w') as out_file:
            song_encode_meta.drop('song_id', axis=1).to_csv(out_file, index=False, encoding='utf-8')

    def output(self):
        return luigi.LocalTarget("./data/song_encode_meta_%s.csv" % self.date_interval)
class ScanForLaunches(luigi.WrapperTask):
    """
    This task scans the output folder for jobs and instances of those jobs, looking for crawled content to process.

    Sub-class this and override the scan_job_launch method as needed.
    """
    task_namespace = 'scan'
    date_interval = luigi.DateIntervalParameter(default=get_large_interval())
    timestamp = luigi.DateMinuteParameter(default=datetime.datetime.today())

    def requires(self):
        # Enumerate the jobs:
        for (job, launch) in self.enumerate_launches():
            logger.info("Processing %s/%s" % (job, launch))
            yield self.scan_job_launch(job, launch)

    def enumerate_launches(self):
        # Look for jobs that need to be processed:
        for date in self.date_interval:
            logger.info("Looking at date %s" % date)
            for job_item in glob.glob("%s/*" % CRAWL_OUTPUT_FOLDER):
                job = os.path.basename(job_item)
                if os.path.isdir(job_item):
                    launch_glob = "%s/%s*" % (job_item, date.strftime('%Y%m%d'))
                    logger.info("Looking for job launch folders matching %s" % launch_glob)
                    for launch_item in glob.glob(launch_glob):
                        logger.info("Found %s" % launch_item)
                        if os.path.isdir(launch_item):
                            launch = os.path.basename(launch_item)
                            yield (job, launch)
class PushToVerticaLMSCoursewareLinkClickedTask(VerticaCopyTask):
    """Push the LMS courseware link clicked task data to Vertica."""
    output_root = luigi.Parameter()
    interval = luigi.DateIntervalParameter()
    n_reduce_tasks = luigi.Parameter()

    @property
    def table(self):
        return "lms_courseware_link_clicked_events"

    @property
    def columns(self):
        return [
            ('course_id', 'VARCHAR(255)'),
            ('event_date', 'DATE'),
            ('external_link_clicked_events', 'INT'),
            ('link_clicked_events', 'INT'),
        ]

    @property
    def insert_source_task(self):
        return LMSCoursewareLinkClickedTask(
            output_root=self.output_root,
            interval=self.interval,
            n_reduce_tasks=self.n_reduce_tasks,
        )

    @property
    def auto_primary_key(self):
        """Use 'record_number' as primary key to match the schema."""
        return ('record_number', 'AUTO_INCREMENT')

    @property
    def default_columns(self):
        """List of tuples defining name and definition of automatically-filled columns."""
        return None
class UserRegistrationsPerDay(MysqlSelectTask):
    """Determine the number of users that registered accounts each day."""

    date_interval = luigi.DateIntervalParameter(
        description='The range of dates to gather data for.',
    )

    @property
    def query(self):
        return ("SELECT DATE(date_joined), COUNT(1) FROM `auth_user`"
                " WHERE `date_joined` >= %s AND `date_joined` < %s GROUP BY DATE(date_joined) ORDER BY 1 ASC")

    @property
    def query_parameters(self):
        dates = self.date_interval.dates()  # pylint: disable=no-member
        start_date = dates[0]
        # Note that we could probably use the end date at 23:59:59, however, it's easier to just add a day and use
        # the next day as an excluded upper bound on the interval. So we actually select all data earlier than
        # 00:00:00.000 on the day following the last day in the interval.
        end_date = dates[-1] + datetime.timedelta(1)
        return (
            mysql_datetime(start_date),
            mysql_datetime(end_date)
        )

    @property
    def filename(self):
        return 'user_registrations_{0}.tsv'.format(self.date_interval)
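# A small worked sketch (not from the source) of the exclusive-upper-bound logic above:
# the interval's last day plus one becomes the excluded end of the WHERE clause.
import datetime
import luigi

interval = luigi.DateIntervalParameter().parse('2014-01-01-2014-01-08')
dates = interval.dates()      # [date(2014, 1, 1), ..., date(2014, 1, 7)]
start_date = dates[0]
end_date = dates[-1] + datetime.timedelta(1)
print(start_date, end_date)   # 2014-01-01 2014-01-08: rows with
                              # date_joined < 2014-01-08 00:00:00 are selected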
class AggregateInternalReportingUserActivityTableHive(HiveTableFromQueryTask):
    """Aggregate the user activity table in Hive."""

    interval = luigi.DateIntervalParameter()
    n_reduce_tasks = luigi.Parameter()

    def requires(self):
        """
        This task reads from auth_user and user_activity_daily, so require that they be loaded into Hive
        (via MySQL loads into Hive or via the pipeline as needed).
        """
        return [ImportAuthUserTask(overwrite=False, destination=self.warehouse_path),
                UserActivityTableTask(interval=self.interval, warehouse_path=self.warehouse_path,
                                      n_reduce_tasks=self.n_reduce_tasks)]

    @property
    def table(self):
        return 'internal_reporting_user_activity'

    @property
    def columns(self):
        return [
            ('user_id', 'INT'),
            ('course_id', 'STRING'),
            ('date', 'STRING'),
            ('activity_type', 'STRING'),
            ('number_of_activities', 'INT'),
        ]

    @property
    def partition(self):
        return HivePartition('dt', self.interval.date_b.isoformat())  # pylint: disable=no-member

    @property
    def insert_query(self):
        return """
def test_use_interval(self):
    interval = luigi.DateIntervalParameter().parse('2013-01-01')
    interval_start = None
    CourseEnrollmentTask(
        interval=interval,
        interval_start=interval_start,
        output_root="/fake/output",
        overwrite_n_days=5,
    )
class DateIntervalMixin(luigi.Task):
    """Mixin that takes the date range to query as a parameter."""
    today = datetime.datetime.now().date()
    yesterday = today - datetime.timedelta(days=1)
    # DateIntervalParameter expects a single DateInterval object as its default;
    # Custom(date_a, date_b) covers [date_a, date_b), so this defaults to yesterday.
    date_interval = luigi.DateIntervalParameter(
        default=luigi_date_interval.Custom(yesterday, today))
class Top10Artists(luigi.Task):
    '''Find top 10 artists from aggregated list.'''
    date_interval = luigi.DateIntervalParameter()
    use_hadoop = luigi.BoolParameter()

    def requires(self):
        if self.use_hadoop:
            return AggregateArtistsSpark(self.date_interval)
        else:
            return AggregateArtists(self.date_interval)

    def output(self):
        return luigi.LocalTarget("data/top_artists_%s.tsv" % self.date_interval)

    def run(self):
        top_10 = nlargest(10, self._input_iterator())
        with self.output().open('w') as out_file:
            for streams, artist in top_10:
                print >> out_file, self.date_interval.date_a, self.date_interval.date_b, artist, streams

    def _input_iterator(self):
        with self.input().open('r') as in_file:
            for line in in_file:
                artist, streams = line.strip().split()
                yield int(streams), int(artist)
class CurrencyRatesForInterval(luigi.WrapperTask):
    dates = luigi.DateIntervalParameter()

    def requires(self):
        for date in self.dates:
            yield CurrencyRatesDaily(date=date)
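# A hedged sketch (not from the source): a luigi DateInterval iterates one
# datetime.date per day, which is what lets the wrapper above fan out into a
# CurrencyRatesDaily task for each day of the range.
import luigi

interval = luigi.DateIntervalParameter().parse('2014-W05')
for date in interval:
    print(date)  # 2014-01-27 through 2014-02-02, one date per day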
class ScanForPackages(luigi.WrapperTask):
    """
    This task scans the output folder for jobs and instances of those jobs, looking for crawled content to process.
    """
    task_namespace = 'output'
    # Default to a DateInterval covering yesterday and today (Custom's end date is
    # exclusive, so extend one day past today).
    date_interval = luigi.DateIntervalParameter(
        default=luigi.date_interval.Custom(
            datetime.date.today() - datetime.timedelta(days=1),
            datetime.date.today() + datetime.timedelta(days=1)))

    def requires(self):
        # Look for jobs that need to be processed:
        for date in self.date_interval:
            for job_item in glob.glob("%s/*/*" % state().state_folder):
                job = Jobs[os.path.basename(job_item)]
                if os.path.isdir(job_item):
                    launch_glob = "%s/%s*" % (job_item, date.strftime('%Y%m%d'))
                    # self.set_status_message("Looking for job launch folders matching %s" % launch_glob)
                    for launch_item in glob.glob(launch_glob):
                        if os.path.isdir(launch_item):
                            launch = os.path.basename(launch_item)
                            # TODO Limit total number of processes?
                            logger.info("ScanForPackages - looking at %s %s" % (job, launch_item))
                            yield ProcessPackages(job, launch, launch_item)
class WordCount(luigi.hadoop.JobTask):
    """
    This task runs a :py:class:`luigi.contrib.hadoop.JobTask` over the target data returned by
    :py:meth:`~.InputText.output` and writes the result into its :py:meth:`~.WordCount.output` target.

    This class uses :py:meth:`luigi.contrib.hadoop.JobTask.run`.
    """

    date_interval = luigi.DateIntervalParameter()

    def requires(self):
        """
        This task's dependencies:

        * :py:class:`~.InputText`

        :return: list of object (:py:class:`luigi.task.Task`)
        """
        return [InputText(date) for date in self.date_interval.dates()]

    def output(self):
        """
        Returns the target output for this task.
        In this case, a successful execution of this task will create a file in HDFS.

        :return: the target output for this task.
        :rtype: object (:py:class:`luigi.target.Target`)
        """
        return luigi.hdfs.HdfsTarget('/tmp/text-count/%s' % self.date_interval)

    def mapper(self, line):
        for word in line.strip().split():
            yield word, 1

    def reducer(self, key, values):
        yield key, sum(map(int, values))
def create_task(self, interval='2013-01-01'):
    """Create a task for testing purposes."""
    fake_param = luigi.DateIntervalParameter()
    self.task = CourseEnrollmentTask(
        interval=fake_param.parse(interval),
        output_root="/fake/output",
    )
def setUp(self):
    self.interval = '2014-12-17'
    fake_param = luigi.DateIntervalParameter()
    self.task = TotalEventsDailyTask(
        interval=fake_param.parse(self.interval),
        output_root="/fake/output",
    )
    self.key = '2014-12-17T00:00:01'
class WordCount(luigi.Task):
    date_interval = luigi.DateIntervalParameter()

    def requires(self):
        return [InputText(date) for date in self.date_interval.dates()]

    def output(self):
        return luigi.LocalTarget('data/text-count-%s.tsv' % self.date_interval)

    def run(self):
        count = {}
        # The input() method is a wrapper around requires() that returns Target objects.
        for file in self.input():
            # Target objects are a file system/format abstraction and this will return a file stream object.
            for line in file.open('r'):
                for word in line.strip().split():
                    count[word] = count.get(word, 0) + 1

        # output data
        f = self.output().open('w')
        for word, count in count.iteritems():
            f.write("%s\t%d\n" % (word, count))
        f.close()  # Note that this is essential because file system operations are atomic.
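# A hedged usage sketch (not from the source), assuming this class lives in a file
# named wordcount.py: luigi.run() turns it into a command-line task, and
# --date-interval accepts dates, months, years, ISO weeks, or custom ranges.
if __name__ == '__main__':
    luigi.run()

# Hypothetical invocations from the shell:
#   python wordcount.py WordCount --date-interval 2014-W04 --local-scheduler
#   python wordcount.py WordCount --date-interval 2014-01-01-2014-01-08 --local-scheduler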
class Top10Artists(luigi.Task):
    date_interval = luigi.DateIntervalParameter()
    use_hadoop = luigi.BoolParameter()

    def requires(self):
        if self.use_hadoop:
            return AggregateArtistsHadoop(self.date_interval)
        else:
            return AggregateArtists(self.date_interval)

    def output(self):
        return luigi.LocalTarget("data/top_artists_%s.tsv" % self.date_interval)

    def run(self):
        top_10 = nlargest(10, self._input_iterator())
        with self.output().open('w') as out_file:
            for streams, artist in top_10:
                out_line = '\t'.join([
                    str(self.date_interval.date_a),
                    str(self.date_interval.date_b),
                    artist,
                    str(streams),
                ])
                out_file.write(out_line + '\n')

    def _input_iterator(self):
        with self.input().open('r') as in_file:
            for line in in_file:
                artist, streams = line.strip().split()
                yield int(streams), artist
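# A hypothetical CLI sketch (not from the source), assuming this task lives in
# top_artists.py: the BoolParameter surfaces as a bare --use-hadoop flag.
#   python top_artists.py Top10Artists --date-interval 2012-06 --local-scheduler
#   python top_artists.py Top10Artists --date-interval 2012-06 --use-hadoop --local-scheduler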
class Top10ArtistsS3(luigi.Task):
    date_interval = luigi.DateIntervalParameter()
    sleep_seconds = luigi.Parameter()

    def requires(self):
        return AggregateArtistsS3(self.date_interval, self.sleep_seconds)

    def output(self):
        s3_string = "s3:{0}{1}".format(
            os.environ["LUIGIS3_EXAMPLES"],
            "top_artists_{}.tsv".format(self.date_interval))
        return luigi_s3.S3Target(s3_string)

    def run(self):
        top_10 = nlargest(10, self._input_iterator())
        with self.output().open('w') as out_file:
            for streams, artist in top_10:
                out_line = '\t'.join([
                    str(self.date_interval.date_a),
                    str(self.date_interval.date_b),
                    artist,
                    str(streams),
                ])
                out_file.write(out_line + '\n')

    def _input_iterator(self):
        with self.input().open('r') as in_file:
            for line in in_file:
                artist, streams = line.strip().split()
                yield int(streams), artist
class CalendarDownstreamMixin(OverwriteOutputMixin):
    """The parameters needed to generate a complete calendar."""

    interval = luigi.DateIntervalParameter(
        config_path={'section': 'calendar', 'name': 'interval'}
    )
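# A hedged configuration sketch (not from the source): with config_path set as above,
# the default interval can come from the Luigi configuration file instead of the
# command line, e.g. a luigi.cfg containing:
#
#   [calendar]
#   interval = 2013-01-01-2014-01-01
#
# An explicit --interval on the command line still overrides the configured value.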
def create_validation_task(self, generate_before=True, tuple_output=True, include_nonstate_changes=True,
                           earliest_timestamp=None, expected_validation=None):
    """Create a task for testing purposes."""
    interval = '2013-01-01-2014-10-10'
    interval_value = luigi.DateIntervalParameter().parse(interval)
    earliest_timestamp_value = (
        luigi.DateHourParameter().parse(earliest_timestamp) if earliest_timestamp else None)
    expected_validation_value = (
        luigi.DateHourParameter().parse(expected_validation) if expected_validation else None)
    self.task = CourseEnrollmentValidationTask(
        interval=interval_value,
        output_root="/fake/output",
        generate_before=generate_before,
        tuple_output=tuple_output,
        include_nonstate_changes=include_nonstate_changes,
        earliest_timestamp=earliest_timestamp_value,
        expected_validation=expected_validation_value,
    )
    self.task.init_local()
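# A hedged sketch (not from the source) of the timestamp formats parsed above:
# DateHourParameter expects '%Y-%m-%dT%H', while the interval string names a
# custom start/end date range.
import luigi

print(luigi.DateIntervalParameter().parse('2013-01-01-2014-10-10'))
print(luigi.DateHourParameter().parse('2014-10-09T23'))  # datetime.datetime(2014, 10, 9, 23, 0)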
class AggregateCountArea(luigi.Task):
    '''Task: Sum the area counts computed for each file into a single total.

    Args:
        date_range (datetime-datetime):
        output_dir (string, optional): Output directory
        output_prefix (string, optional): default='aggarea'
    '''
    date_range = luigi.DateIntervalParameter()
    output_dir = luigi.Parameter(
        default=os.path.join(PREPROCESS_GEOTWEETS_DIR, 'aggarea'))
    output_prefix = luigi.Parameter(default='aggarea')

    def requires(self):
        return [CountArea(date=date) for date in self.date_range]

    def output(self):
        path = os.path.join(
            self.output_dir,
            '{}-{}.tsv'.format(self.output_prefix, self.date_range))
        return luigi.LocalTarget(path)

    def run(self):
        # If a path includes any spaces, invalid arguments would be produced.
        input_files = ' '.join([i.path for i in self.input()])
        with self.output().temporary_path() as temp_output_path:
            run('python -m snlocest.scripts.agg_count {} | LC_ALL=C sort > {}'.format(
                input_files, temp_output_path),
                shell=True, check=True)
class ArtistToplistToDatabase(luigi.contrib.postgres.CopyToTable):
    """
    This task runs a :py:class:`luigi.contrib.postgres.CopyToTable` task
    over the target data returned by :py:meth:`~.Top10Artists.output` and
    writes the result into its :py:meth:`~.ArtistToplistToDatabase.output` target which,
    by default, is :py:class:`luigi.contrib.postgres.PostgresTarget` (a table in PostgreSQL).

    This class uses :py:meth:`luigi.contrib.postgres.CopyToTable.run` and
    :py:meth:`luigi.contrib.postgres.CopyToTable.output`.
    """

    date_interval = luigi.DateIntervalParameter()
    use_hadoop = luigi.BoolParameter()

    host = "localhost"
    database = "toplists"
    user = "******"
    password = "******"  # ;)
    table = "top10"

    columns = [("date_from", "DATE"),
               ("date_to", "DATE"),
               ("artist", "TEXT"),
               ("streams", "INT")]

    def requires(self):
        """
        This task's dependencies:

        * :py:class:`~.Top10Artists`

        :return: object (:py:class:`luigi.task.Task`)
        """
        return Top10Artists(self.date_interval, self.use_hadoop)
class SelectMajorityHomeLocation(luigi.Task):
    '''Task: Select the home location by majority voting.

    Args:
        date_range (datetime-datetime):
        min_majoritynum (int, optional): The minimum number of tweets in the majority area (default=1; no limit)
        min_totalnum (int, optional): The minimum total number of tweets for the user (default=1; no limit)
        output_dir (string, optional): Output directory
    '''
    date_range = luigi.DateIntervalParameter()
    # method = luigi.Parameter(default='MajorityVote')  # to be used in the future
    min_majoritynum = luigi.IntParameter(default=1)
    min_totalnum = luigi.IntParameter(default=1)
    output_dir = luigi.Parameter(default=os.path.join(
        PREPROCESS_GEOTWEETS_DIR, 'homelocation', 'majority'))

    def requires(self):
        return AggregateCountArea(date_range=self.date_range)

    def output(self):
        path = os.path.join(
            self.output_dir,
            '{}_MinMajorityNum-{}_MinTotalNum-{}.tsv'.format(
                self.date_range, self.min_majoritynum, self.min_totalnum))
        return luigi.LocalTarget(path)

    def run(self):
        cmd = 'cat {} | python -m snlocest.scripts.decidehomelocation --min-majoritynum {} --min-totalnum {} > {}'
        with self.output().temporary_path() as temp_output_path:
            run(cmd.format(self.input().path, self.min_majoritynum,
                           self.min_totalnum, temp_output_path),
                shell=True, check=True)
class ScanForOutputs(luigi.WrapperTask):
    """
    This task scans the output folder for jobs and instances of those jobs, looking for crawled content to process.

    Sub-class this and override the scan_job_launch method as needed.
    """
    task_namespace = 'scan'
    date_interval = luigi.DateIntervalParameter(default=get_modest_interval())
    timestamp = luigi.DateMinuteParameter(default=datetime.datetime.today())

    def requires(self):
        # Enumerate the jobs:
        for (job, launch) in self.enumerate_launches():
            # logger.debug("Yielding %s/%s" % (job, launch))
            yield self.process_output(job, launch)

    def enumerate_launches(self):
        # Get HDFS client:
        client = luigi.contrib.hdfs.WebHdfsClient()
        # Look for jobs that need to be processed:
        for date in self.date_interval:
            logger.info("Scanning date %s..." % date)
            for job_item in client.listdir(CRAWL_STATS_PREFIX):
                job = os.path.basename(job_item)
                launch_glob = date.strftime('%Y%m%d')
                logger.debug("Looking for job launch folders matching %s in %s/%s"
                             % (launch_glob, CRAWL_STATS_PREFIX, job))
                for launch_item in client.listdir("%s/%s" % (CRAWL_STATS_PREFIX, job)):
                    logger.debug("Looking at %s" % launch_item)
                    launch = os.path.basename(launch_item)
                    if launch.startswith(launch_glob):
                        yield (job, launch)
class WordCountHadoopTask(luigi.contrib.hadoop.JobTask):
    task_namespace = 'examples'
    date_interval = luigi.DateIntervalParameter()

    def requires(self):
        return [InputText(date) for date in self.date_interval.dates()]

    def output(self):
        return luigi.contrib.hdfs.HdfsTarget(
            '/pgc/baoqiang/examples/{}.output'.format(self.date_interval),
            format=luigi.contrib.hdfs.Plain)

    def jobconfs(self):
        # Extend the base job configuration; super() must name this class, not the
        # parent, or JobTask's own jobconfs would be skipped.
        jcs = super(WordCountHadoopTask, self).jobconfs()
        jcs.append('mapred.compress.map.output=false')
        jcs.append('mapred.output.compress=false')
        jcs.append('mapred.output.fileoutputformat.compress=false')
        return jcs

    def mapper(self, line):
        for word in line.strip().split():
            yield word, 1

    def reducer(self, key, values):
        yield key, sum(values)