class DateHourParameterTask(FireflowerTask):
    """Task bounded by an hourly ``start_datetime`` / ``end_datetime`` window.

    Bounds that are not supplied as keyword arguments default to
    "one day before the current UTC time" and "the current UTC time"
    respectively. Both bounds are passed through
    ``to_datetime(..., raise_=True)`` before the parent constructor runs.
    """

    start_datetime = luigi.DateHourParameter(default=None)
    end_datetime = luigi.DateHourParameter(default=None)

    def __init__(self, *args, **kwargs):
        # Only keyword arguments are inspected here; positional arguments
        # are forwarded to the parent constructor unchanged.
        if 'start_datetime' in kwargs:
            window_start = kwargs['start_datetime']
        else:
            window_start = datetime.utcnow() - timedelta(1)

        if 'end_datetime' in kwargs:
            window_end = kwargs['end_datetime']
        else:
            window_end = datetime.utcnow()

        kwargs['start_datetime'] = to_datetime(window_start, raise_=True)
        kwargs['end_datetime'] = to_datetime(window_end, raise_=True)
        super(DateHourParameterTask, self).__init__(*args, **kwargs)

    @property
    def start_date_str(self):
        """Start bound formatted as ``YYYY-MM-DD``."""
        return self.start_datetime.strftime('%Y-%m-%d')

    @property
    def end_date_str(self):
        """End bound formatted as ``YYYY-MM-DD``."""
        return self.end_datetime.strftime('%Y-%m-%d')

    @property
    def start_datetime_str(self):
        """Start bound formatted as ``YYYY-MM-DD HH:00`` (minutes fixed at 00)."""
        return self.start_datetime.strftime('%Y-%m-%d %H:00')

    @property
    def end_datetime_str(self):
        """End bound formatted as ``YYYY-MM-DD HH:00`` (minutes fixed at 00)."""
        return self.end_datetime.strftime('%Y-%m-%d %H:00')
def create_validation_task(self, generate_before=True, tuple_output=True,
                           include_nonstate_changes=True,
                           earliest_timestamp=None, expected_validation=None):
    """Build a CourseEnrollmentValidationTask for tests and store it on ``self.task``.

    ``earliest_timestamp`` and ``expected_validation`` are DateHour strings;
    falsy values are passed to the task as None. The task is created with a
    fixed interval and a fake output root, then ``init_local()`` is called.
    """

    def parse_datehour(raw):
        # Falsy input (None, '') means the option is not set.
        return luigi.DateHourParameter().parse(raw) if raw else None

    task_kwargs = {
        'interval': luigi.DateIntervalParameter().parse('2013-01-01-2014-10-10'),
        'earliest_timestamp': parse_datehour(earliest_timestamp),
        'expected_validation': parse_datehour(expected_validation),
        'output_root': "/fake/output",
        'generate_before': generate_before,
        'tuple_output': tuple_output,
        'include_nonstate_changes': include_nonstate_changes,
    }
    self.task = CourseEnrollmentValidationTask(**task_kwargs)
    self.task.init_local()
class CourseEnrollmentValidationDownstreamMixin(
        EventLogSelectionDownstreamMixin, MapReduceJobTaskMixin):
    """
    Defines parameters for passing upstream to tasks that use
    CourseEnrollmentValidationTask.
    """

    # Location to write output.
    output_root = luigi.Parameter(
        description='A URL to a path where output event files will be written.',
    )

    # Flag indicating whether to output synthetic events or tuples.
    tuple_output = luigi.BooleanParameter(
        default=False,
        description=(
            'A flag indicating that output should be in the form of tuples, not events. '
            'Default is False (output is events).'
        ),
    )

    # If set, generates events that occur before the start of the specified
    # interval. Default is incremental validation.
    generate_before = luigi.BooleanParameter(
        default=False,
        description=(
            'A flag indicating that events should be created preceding the '
            'specified interval. Default behavior is to suppress the generation of events '
            'before the specified interval.'
        ),
    )

    # If set, events are included for transitions that don't result in a
    # change in enrollment state. (For example, two activations in a row.)
    include_nonstate_changes = luigi.BooleanParameter(
        default=False,
        description=(
            'A flag indicating that events should be created '
            'to fix all transitions, even those that don\'t result in a change in enrollment '
            'state. An "activate" following another "activate" is one such example. '
            'Default behavior is to skip generating events for non-state changes.'
        ),
    )

    # If set, events that would be generated before this timestamp would
    # instead be assigned this timestamp.
    earliest_timestamp = luigi.DateHourParameter(
        default=None,
        description=(
            'A "DateHour" parameter ("yyyy-mm-ddThh"), which if set, '
            'specifies the earliest timestamp that should occur in the output. Events '
            'that would be generated before this timestamp would instead be assigned this '
            # The literal below was previously split by a raw line break,
            # which is not valid inside a single-quoted string.
            'timestamp.  This is left unspecified by default.'
        ),
    )

    # If set, users with events before this timestamp would be expected to
    # have a corresponding validation event.
    expected_validation = luigi.DateHourParameter(
        default=None,
        description=(
            'A "DateHour" parameter ("yyyy-mm-ddThh"), which if set, '
            'specifies a point in time where every user with events before this time '
            'should also have a corresponding validation event. Those without such an '
            'validation event were not really created, and events should be synthesized '
            'to simulate "roll back" of the events.'
        ),
    )
class RangeHourlyBase(RangeBase):
    """
    Produces a contiguous completed range of an hourly recurring task.
    """

    start = luigi.DateHourParameter(
        default=None,
        description="beginning datehour, inclusive. Default: None - work backward forever (requires reverse=True)")
    stop = luigi.DateHourParameter(
        default=None,
        description="ending datehour, exclusive. Default: None - work forward forever")
    hours_back = luigi.IntParameter(
        default=100 * 24,  # slightly more than three months
        description=("extent to which contiguousness is to be assured into "
                     "past, in hours from current time. Prevents infinite "
                     "loop when start is none. If the dataset has limited "
                     "retention (i.e. old outputs get removed), this should "
                     "be set shorter to that, too, to prevent the oldest "
                     "outputs flapping. Increase freely if you intend to "
                     "process old dates - worker's memory is the limit"))
    # TODO always entire interval for reprocessings (fixed start and stop)?
    hours_forward = luigi.IntParameter(
        default=0,
        description="extent to which contiguousness is to be assured into future, in hours from current time. Prevents infinite loop when stop is none")

    def datetime_to_parameter(self, dt):
        """Identity conversion: the parameter value is the datetime itself."""
        return dt

    def parameter_to_datetime(self, p):
        """Identity conversion: the datetime is the parameter value itself."""
        return p

    def moving_start(self, now):
        """Return ``now`` shifted back by ``hours_back`` hours."""
        return now - timedelta(hours=self.hours_back)

    def moving_stop(self, now):
        """Return ``now`` shifted forward by ``hours_forward`` hours."""
        return now + timedelta(hours=self.hours_forward)

    def finite_datetimes(self, finite_start, finite_stop):
        """
        Return every whole hour ``t`` with ``finite_start <= t < finite_stop``.
        """
        one_hour = timedelta(hours=1)
        # Begin at the top of the hour that contains finite_start; that first
        # candidate is dropped below if it precedes finite_start.
        cursor = datetime(finite_start.year, finite_start.month,
                          finite_start.day, finite_start.hour)
        whole_hours = []
        while cursor < finite_stop:
            if cursor >= finite_start:
                whole_hours.append(cursor)
            cursor += one_hour
        return whole_hours

    def _format_datetime(self, dt):
        """Serialize ``dt`` using ``luigi.DateHourParameter``."""
        return luigi.DateHourParameter().serialize(dt)
class FindAllTopPages(luigi.Task):
    """Fan out to one ``FindTopPages`` task per item of the requested time window."""

    start_time = luigi.DateHourParameter()
    end_time = luigi.DateHourParameter()

    def requires(self):
        """Return a ``FindTopPages`` task for each pair from ``WikiUtils._url_generator()``."""
        window = WikiUtils(self.start_time, self.end_time)
        dependencies = []
        for stamp, dump_file in window._url_generator():
            dependencies.append(FindTopPages(stamp, dump_file))
        return dependencies

    def run(self):
        """Log completion and create an empty output file."""
        logging.info('Top 25 computation complete')
        marker = self.output().open('w')
        marker.close()

    def output(self):
        # NOTE(review): an empty path with is_tmp=True presumably yields a
        # new temporary location on each call -- confirm against LocalTarget.
        return luigi.LocalTarget('', is_tmp=True)
def testSerialize(self):
    """Check the string produced by ``serialize`` for each date-like parameter."""
    day = datetime.date(2013, 2, 3)
    date_cases = (
        (luigi.DateParameter, '2013-02-03'),
        (luigi.YearParameter, '2013'),
        (luigi.MonthParameter, '2013-02'),
    )
    for param_cls, expected in date_cases:
        self.assertEqual(param_cls().serialize(day), expected)

    # DateHour keeps the hour and drops the minutes.
    moment = datetime.datetime(2013, 2, 3, 4, 5)
    self.assertEqual(luigi.DateHourParameter().serialize(moment), '2013-02-03T04')
class DatabaseHourly(luigi.WrapperTask):
    """Wrapper that groups the hourly database tasks for one ``date_hour``."""

    date_hour = luigi.DateHourParameter()

    def requires(self):
        """Yield each hourly database task, built from this task's own parameters."""
        hourly_steps = (InsertHourlyValues, HourlyValuesCleanup, UpdateCoinsRank)
        for step_cls in hourly_steps:
            yield step_cls(**self.param_kwargs)
class UpdateCoinsRank(DatabaseQuery):
    """Copy ``Rank`` into ``PreviousRank`` and then write the current ranks."""

    date_hour = luigi.DateHourParameter()
    table = _coins_table

    @property
    def sql(self):
        """Yield the SQL to run: the rank shift first, then the batched updates."""
        yield (
            'UPDATE {table} SET {previous_rank}={current_rank}'
            .format(table=self.table, previous_rank='PreviousRank', current_rank='Rank')
        )
        for query in self._update_current_rank:
            yield query

    def transform(self, df):
        """Keep only the ``rank``, ``name`` and ``symbol`` columns, in that order."""
        return df[['rank', 'name', 'symbol']]

    @property
    def _update_current_rank(self):
        """Yield one multi-statement UPDATE string per non-empty batch of rows."""
        # NOTE(review): values are interpolated straight into the SQL text;
        # confirm the symbols come from a trusted source.
        statement = "UPDATE {table} SET Rank={current_rank} WHERE Symbol='{symbol}';\n"
        for batch in np.array_split(self.get_data().values, self.number_of_batches):
            # array_split pads with empty arrays when there are fewer rows
            # than batches; skip those so no empty query is emitted.
            if len(batch) == 0:
                continue
            yield ''.join(
                statement.format(table=self.table, current_rank=rank, symbol=symbol)
                for rank, _, symbol in batch
            )

    def requires(self):
        """Depend on the CryptoWatch ingress for the same ``date_hour``."""
        return CryptoWatchHourlyIngress(date_hour=self.date_hour)
class TaskB(luigi.Task):
    """Task whose output path embeds both ``complicator`` and the date-hour."""

    dh = luigi.DateHourParameter()
    complicator = luigi.Parameter()

    def output(self):
        # '%%s' comes out of strftime as a literal '%s', which is then filled
        # with the complicator by the %-formatting step.
        path_template = self.dh.strftime('TaskB/%%s%Y-%m-%d/%H')
        return MockFile(path_template % self.complicator)
class Scrapes(luigi.WrapperTask):
    """Wrapper that groups the scrape-update tasks for one ``date_hour``."""

    date_hour = luigi.DateHourParameter()

    def requires(self):
        """Yield each scrape-update task, built from this task's own parameters."""
        scrape_steps = (
            UpdateCoinDeskScrapes,
            UpdateAmbCryptoScrapes,
            UpdateScrapeCoinTelegraph,
        )
        for step_cls in scrape_steps:
            yield step_cls(**self.param_kwargs)
class CommonWrapperTask(luigi.WrapperTask):
    """Wrapper requiring ``TaskA`` and ``TaskB`` for the same date-hour."""

    dh = luigi.DateHourParameter()

    def requires(self):
        stamp = self.dh
        yield TaskA(dh=stamp)
        # str(self.dh) would complicate beyond working
        yield TaskB(dh=stamp, complicator='no/worries')
class SocialHarvestTask(luigi.WrapperTask):
    """Wrapper that requires the Telegram and Twitter database-load tasks."""

    # NOTE(review): both defaults are computed once, when the class body is
    # executed, not each time a task is created.
    date = luigi.DateParameter(default=date.today())
    hour = luigi.DateHourParameter(default=get_current_hour())
    debug = luigi.BoolParameter(default=False)

    def requires(self):
        """Yield both loader tasks with this task's date, hour and debug flag."""
        shared = {'date': self.date, 'hour': self.hour, 'debug': self.debug}
        for loader_cls in (TelegramMembersToDatabaseTask, TwitterMembersToDatabaseTask):
            yield loader_cls(**shared)
class TrainAll9DNetworks(luigi.WrapperTask):
    """Wrapper that requires one ``TrainNarrow9DBatch`` per generated target-name set."""

    submit_date = luigi.DateHourParameter()

    def requires(self):
        """Yield a batch task for every value from ``target_names_generator()``."""
        for target_names in target_names_generator():
            yield TrainNarrow9DBatch(self.submit_date, target_names)
class BulkCompleteTask(luigi.Task):
    """Task whose completeness is answered only through ``bulk_complete``."""

    dh = luigi.DateHourParameter()

    @classmethod
    def bulk_complete(cls, parameter_tuples):
        """Report every parameter tuple except the last two as complete.

        The first argument of a classmethod is the class, so it is named
        ``cls`` rather than ``self``.
        """
        return parameter_tuples[:-2]

    def output(self):
        """Always raise: dependency resolution must go through ``bulk_complete``."""
        raise RuntimeError("Shouldn't get called while resolving deps via bulk_complete")
class HourlyCron(luigi.WrapperTask):
    """Top-level wrapper for everything scheduled for one ``date_hour``."""

    date_hour = luigi.DateHourParameter()

    def requires(self):
        """Yield each hourly task, built from this task's own parameters."""
        hourly_steps = (
            CryptoWatchResult,
            DatabaseHourly,
            InsertCoinAggregates,
            HourlyDbAggregatesOutput,
            Scrapes,
        )
        for step_cls in hourly_steps:
            yield step_cls(**self.param_kwargs)
class HourlyValuesCleanup(DatabaseQuery):
    """Delete rows of the values table that fall outside the window period."""

    date_hour = luigi.DateHourParameter()

    @property
    def sql(self):
        """Build the DELETE statement for this task's ``date_hour``.

        The date cutoff is ``date_hour``'s date minus ``_window_period`` days;
        the hour bound is ``date_hour``'s hour.
        """
        cutoff_date = self.date_hour.date() - timedelta(days=_window_period)
        template = (
            "DELETE FROM {table} WHERE Date <= CONVERT(DATE, '{date:%Y-%m-%d}') AND Hour <= {hour}"
        )
        return template.format(
            table=_values_table,
            date=cutoff_date,
            hour=self.date_hour.hour,
        )
class DateHourTaskOk(luigi.Task):
    """Task that reports itself complete for a fixed set of three hours."""

    hour = luigi.DateHourParameter()

    def complete(self):
        # test against 2000.03.01T02
        finished_hours = (
            datetime.datetime(2000, 2, 29, 22),
            datetime.datetime(2000, 3, 1, 2),
            datetime.datetime(2000, 3, 1, 3),
        )
        return self.hour in finished_hours
class DummyTask(luigi.Task):
    """Task that only declares one parameter of each listed kind."""

    # Scalar parameters.
    param = luigi.Parameter()
    bool_param = luigi.BoolParameter()
    int_param = luigi.IntParameter()
    float_param = luigi.FloatParameter()

    # Date and time parameters.
    date_param = luigi.DateParameter()
    datehour_param = luigi.DateHourParameter()
    timedelta_param = luigi.TimeDeltaParameter()

    # Declared with significant=False.
    insignificant_param = luigi.Parameter(significant=False)
class DummyTask(luigi.Task):
    """Task that only declares one parameter of each listed kind."""

    # Scalar parameters.
    param = luigi.Parameter()
    bool_param = luigi.BoolParameter()
    int_param = luigi.IntParameter()
    float_param = luigi.FloatParameter()

    # Date and time parameters.
    date_param = luigi.DateParameter()
    datehour_param = luigi.DateHourParameter()
    timedelta_param = luigi.TimeDeltaParameter()

    # Declared with is_list=True.
    list_param = luigi.Parameter(is_list=True)
class TrainBatch(luigi.WrapperTask):
    """Wrapper that requires one ``TrainNN`` task per entry of ``settings_list``."""

    submit_date = luigi.DateHourParameter()
    train_dims = luigi.ListParameter()
    settings_list = luigi.ListParameter()

    def requires(self):
        """Check each settings entry, then yield its ``TrainNN`` task.

        This wrapper's ``task_id`` is passed as the third argument to every
        ``TrainNN``.
        """
        for run_settings in self.settings_list:
            check_settings_dict(run_settings)
            yield TrainNN(run_settings, self.train_dims, self.task_id)
class MoveWarcProxFiles(luigi.Task):
    """Move web-render WARC files into the ``warcs`` folder named by their filename.

    Files are read from ``<prefix>/heritrix/wren/`` and moved to
    ``<prefix>/heritrix/output/<group 1>/<group 2>/warcs``, where the groups
    come from the filename pattern in ``run``.
    """

    task_namespace = 'ingest'
    # NOTE(review): this default is computed once, when the class body runs.
    date = luigi.DateHourParameter(default=datetime.datetime.today())
    prefix = luigi.Parameter(default="/mnt/gluster/fc")

    # Count of files moved; reported by get_metrics.
    total_moved = 0

    def output(self):
        return IngestTaskDBTarget('mv-warcprox-files', self.task_id)

    def run(self):
        """Move every matching file, then touch the output target.

        Raises:
            Exception: if the destination folder is missing or is not a
                directory, or if the destination file already exists.
        """
        # Expected filenaming (raw string: the pattern contains backslash
        # escapes that are not valid string escapes):
        name_pattern = re.compile(
            r"BL-....-WEBRENDER-([a-z\-0-9]+)-([0-9]{14})-([a-z\-0-9]+)\.warc\.gz")

        # List all matching files in source directory:
        webrender_path = os.path.join(self.prefix, 'heritrix/wren/')
        for file_name in os.listdir(webrender_path):
            if not file_name.endswith('.warc.gz'):
                continue
            matches = name_pattern.search(file_name)
            if not matches:
                continue

            destination_folder_path = "%s/heritrix/output/%s/%s/warcs" % (
                self.prefix, matches.group(1), matches.group(2))
            if not os.path.exists(destination_folder_path):
                raise Exception(
                    "Expected destination folder does not exist! :: %s" % destination_folder_path)
            if not os.path.isdir(destination_folder_path):
                raise Exception(
                    "Expected destination folder is not a folder! :: %s" % destination_folder_path)

            source_file_path = os.path.join(webrender_path, file_name)
            destination_file_path = os.path.join(destination_folder_path, file_name)
            # Never overwrite a file that is already in place.
            if os.path.exists(destination_file_path):
                raise Exception(
                    "Destination file already exists! :: %s" % destination_file_path)

            shutil.move(source_file_path, destination_file_path)
            self.total_moved += 1

        # Record that all went well:
        self.output().touch()

    def get_metrics(self, registry):
        # type: (CollectorRegistry) -> None
        """Register a gauge holding ``total_moved`` under kind 'warcprox-warcs'."""
        g = Gauge('ukwa_files_moved_total_count',
                  'Total number of files moved by this task.',
                  labelnames=['kind'], registry=registry)
        g.labels(kind='warcprox-warcs').set(self.total_moved)
class KeywordFilter(luigi.WrapperTask):
    """Wrapper that forwards all four of its parameters to ``SaveResult``."""

    string = luigi.Parameter()
    integer = luigi.IntParameter()
    float = luigi.FloatParameter()
    datehourparam = luigi.DateHourParameter()

    def requires(self):
        """Return a ``SaveResult`` task built from this task's parameter values."""
        forwarded = {
            'string': self.string,
            'integer': self.integer,
            'float': self.float,
            'datehourparam': self.datehourparam,
        }
        return SaveResult(**forwarded)
class Bar(luigi.Task):
    """Task that records completion in the ``comp`` flag on the instance."""

    datehour = luigi.DateHourParameter()

    def __init__(self, *args, **kwargs):
        super(Bar, self).__init__(*args, **kwargs)
        # Set to True by run(); read by complete().
        self.comp = False

    def run(self):
        """Mark this instance as complete."""
        self.comp = True

    def complete(self):
        """Return the current value of the ``comp`` flag."""
        return self.comp
class HourlyIngress(ReadableTask):
    """Base for hourly ingress tasks; subclasses supply ``name``."""

    date_hour = luigi.DateHourParameter()

    @abc.abstractproperty
    def name(self):
        """Name used in both the output directory and the file name."""
        pass

    @property
    def _out_path(self):
        """Output path: the formatted hourly directory plus the hour/file suffix."""
        stamp = self.date_hour
        directory = _hourly_output_path.format(self.name, stamp.date())
        leaf = 'hour={}/{}.snappy.parquet'.format(stamp.hour, self.name)
        return directory + leaf
class TrainRepeatingBatch(luigi.WrapperTask):
    """Wrapper that requires ``repeat`` copies of ``TrainNN`` with the same settings."""

    submit_date = luigi.DateHourParameter()
    train_dims = luigi.ListParameter()
    settings = luigi.DictParameter()
    repeat = luigi.IntParameter(significant=False)

    def requires(self):
        """Check the settings once, then yield one ``TrainNN`` per repetition.

        Each ``TrainNN`` gets ``<task_id>_<index>`` as its third argument.
        """
        check_settings_dict(self.settings)
        for index in range(self.repeat):
            run_label = '_'.join((self.task_id, str(index)))
            yield TrainNN(self.settings, self.train_dims, run_label)
class InconsistentlyOutputtingDateHourTask(luigi.Task):
    """Task whose output shape depends on whether the hour is even or odd."""

    dh = luigi.DateHourParameter()

    def output(self):
        """Return one target for even hours, a dict of two targets for odd hours."""
        base = self.dh.strftime('/even/%Y%m%d%H')
        if self.dh.hour % 2:
            # Odd hour: two targets underneath the base path.
            return {
                'spi': MockTarget(base + '/something.spi'),
                'spl': MockTarget(base + '/something.spl'),
            }
        return MockTarget(base)
class ForecastCityWeatherCSV(luigi.Task):
    """Convert a city's forecast JSON into a CSV sorted by forecast time."""

    # NOTE(review): this default is computed once, when the class body runs.
    timestamp = luigi.DateHourParameter(default=dt.now())
    city = luigi.Parameter()

    def requires(self):
        return ForecastCityWeatherJson(self.timestamp, self.city)

    def output(self):
        """Local CSV target under DATADIR/<city>/<year>/<month>/<day>/forecast/."""
        stamp = self.timestamp
        ts = stamp.strftime("%HH%M")  # 16H35
        template = os.path.join(DATADIR, self.city, '{year}', '{month:02d}',
                                '{day:02d}', 'forecast', '{ts}.csv')
        resolved = template.format(year=stamp.year, month=stamp.month,
                                   day=stamp.day, ts=ts)
        return luigi.LocalTarget(resolved, format=UTF8)

    def run(self):
        """Read the JSON 'list' entries, flatten each one, and write the CSV."""
        with self.input().open('r') as source:
            entries = json.load(source)['list']

        def flatten(entry):
            """Turn a single forecast entry into a flat row dict."""
            rain = entry.get('rain', {'3h': None})
            snow = entry.get('snow', {'3h': None})
            weather = entry['weather'][0]
            return {
                "forecast_at": self.timestamp,
                "weather_id": weather['id'],
                "weather_desc": weather['main'],
                "wind_speed": entry['wind']['speed'],  # m/s unit
                "humidity": entry['main']['humidity'],  # in %
                "temp": entry['main']['temp'],  # C unit
                "temp_min": entry['main']['temp_min'],  # C unit
                "temp_max": entry['main']['temp_max'],  # C unit
                "pressure": entry['main']['pressure'],  # hPa unit
                "rain_3h": rain.get('3h', None),  # rain volume mm unit
                "snow_3h": snow.get('3h', None),  # snow volume
                "cloudiness": entry['clouds']['all'],  # in %
                "ts": pd.Timestamp.fromtimestamp(entry['dt']),
            }

        # Column order of the written CSV.
        columns = [
            "forecast_at", "ts", "weather_id", "weather_desc", "temp",
            "temp_min", "temp_max", "rain_3h", "snow_3h", "pressure",
            "humidity", "wind_speed", "cloudiness",
        ]
        rows = [flatten(entry) for entry in entries]
        frame = pd.DataFrame(rows).sort_values(by="ts")
        with self.output().open('w') as sink:
            frame[columns].to_csv(sink, index=False)
class BulkCompleteHourlyTask(luigi.Task):
    """Hourly task whose completeness is answered only through ``bulk_complete``."""

    non_positional_arbitrary_argument = luigi.Parameter(
        default="whatever", positional=False, significant=False)
    dh = luigi.DateHourParameter()
    arbitrary_argument = luigi.BoolParameter()

    @classmethod
    def bulk_complete(cls, parameter_tuples):
        """Report every parameter tuple except the last two as complete.

        Each entry is first used to build a task, and the task's
        ``arbitrary_argument`` is asserted to be truthy.
        """
        for params in parameter_tuples:
            task = cls(params)
            assert task.arbitrary_argument
        return parameter_tuples[:-2]

    def output(self):
        """Always raise: dependency resolution must go through ``bulk_complete``."""
        raise RuntimeError("Shouldn't get called while resolving deps via bulk_complete")
class ParseTelegramMemberCountTask(luigi.Task):
    """Fetch member counts for Telegram links and write per-name aggregates to CSV."""

    # NOTE(review): both defaults are computed once, when the class body runs.
    date = luigi.DateParameter(default=date.today())
    hour = luigi.DateHourParameter(default=get_current_hour())
    limit = luigi.Parameter(default=None)  # DEBUG: REMOVE THIS!!!!

    def requires(self):
        return [base_pipe.CreateDateFolder(date=self.date),
                base_pipe.ParseTelegramJSONtoCSVTask(date=self.date)]

    def output(self):
        """CSV target inside the first input's folder, named after ``hour``."""
        path = Path(str(self.input()[0].path)) / f'Telegram_Data_{self.hour}.csv'
        return luigi.LocalTarget(str(path))

    def run(self):
        """Read the link CSV, fetch member counts in parallel, write aggregates.

        ``limit`` caps the number of data rows read. It is a plain Parameter,
        so it may arrive as a string and is converted to int here. A falsy or
        non-positive value means no cap.
        """
        row_limit = int(self.limit) if self.limit else 0

        telegram_links = []
        with self.input()[1].open('r') as f:
            reader = csv.reader(f)
            next(reader, None)  # skip the header row; tolerate an empty file
            for index, row in enumerate(reader):
                # Check before appending so at most row_limit rows are kept.
                if row_limit > 0 and index >= row_limit:
                    break
                telegram_links.append({'name': row[0], 'link': row[1]})  # IDEA: tuple

        max_processes = cpu_count() * 2
        with Pool(max_processes) as p:
            member_records = p.map(parse_member_count, telegram_links)

        # NOTE(review): with no records nothing is written, so output() will
        # not exist afterwards -- confirm that is intended.
        if member_records:
            df = pd.DataFrame(member_records)
            df.set_index('name', inplace=True)
            grouped = df.groupby(df.index)
            members = grouped['members']
            data = pd.concat(
                [members.sum(), members.mean(), members.median(), grouped.count()],
                axis=1)
            data.columns = ['sum', 'mean', 'median', 'link_count']
            data.dropna(inplace=True)
            data.to_csv(self.output().path)
class TwitterMembersToDatabaseTask(luigi.Task):
    """Load the parsed Twitter member-count CSV into the Twitter table."""

    # NOTE(review): both defaults are computed once, when the class body runs.
    date = luigi.DateParameter(default=date.today())
    hour = luigi.DateHourParameter(default=get_current_hour())
    debug = luigi.BoolParameter(default=False)  # NOTE: SUPER DANGEROUS WILL SCRUB DATABASE

    def requires(self):
        return ParseTwitterMemberCountTask(date=self.date, hour=self.hour)

    def run(self):
        """Create the table if it is missing, then insert one record per CSV row."""
        if not Twitter.table_exists():
            create_twitter_table()

        if self.complete():
            return

        frame = pd.read_csv(self.input().path)
        frame.set_index('name', inplace=True)
        for name, row in frame.iterrows():
            record = dict(
                name=name,
                followers=row['followers'],
                following=row['following'],
                likes=row['likes'],
                tweets=row['tweets'],
                date=self.hour,
            )
            Twitter.add_member_data(**record)
        # TODO: Twitter LINKS, RAW DATA. rename csv files

    def complete(self):
        """Return True when the CSV and the table rows for ``hour`` have equal length.

        When ``debug`` is set the Twitter table is cleaned first. A missing
        CSV file or a KeyError counts as not complete.
        """
        # TODO: Add task to create a DB/Table or
        # IDEA: Add an except for no table - create table then check databsse for complete
        if self.debug:
            clean_twitter_table()  # DEBUG: REMOVE
            print('DELETING TABLE FOR DEBUGGING!!!!!!!!!!!!!!!!!')

        banner = '#' * 25
        try:
            local_ = pd.read_csv(self.input().path)
            dbase_ = Twitter.data_by_date(self.hour)
            print(banner)
            print(len(local_))  # TODO: Logging
            print(len(dbase_))  # TODO: Logging
            # TODO: If else raise data not written to db
            same_length = len(local_.index) == len(dbase_.index)
            print(same_length)
            print(banner)
            return same_length
        except (FileNotFoundError, KeyError):
            print()
            return False