class CreateRecommendations(Task):
    """
    Task that generates recommendations for each user and saves them to the
    filesystem.
    """
    date = DateParameter(default=datetime.date.today())

    def requires(self):
        return CreateModel()

    def run(self):
        logger = getLogger("luigi-interface")
        model_input = yield CreateModel()
        (dataset, train_interactions, model) = model_input.get()
        logger.info("Generating recommendations")
        recommendations = recommend_movies(dataset, train_interactions, model)
        logger.info("Backing up recommendations on disk")
        directory = './generated/recommendations/'
        if not exists(directory):
            makedirs(directory)
        with self.output().open('w') as f:
            json.dump(recommendations, f)

    def output(self):
        return LocalTarget(
            path='./generated/recommendations/{}.json'.format(self.date))

class ExecuteClientBatch(WrapperTask):
    client = Parameter()
    run_date = DateParameter()

    def requires(self):
        return {
            'client_holdings': GetClientHoldingsFactData(
                run_date=self.run_date, client=self.client),
            'security_reference': GetSecurityMasterDimension(
                run_date=self.run_date),
        }

    output = TargetOutput('data/', target_class=ParquetTarget)

    def run(self):
        redis_cache = RedisCache()
        sec_master_key = ('client_' + self.run_date.strftime('%Y-%m-%d') +
                          '_sec_master_dimension')
        df_dimension = pd.read_msgpack(
            redis_cache.get_pickle('dataframe', sec_master_key))
        numcols = ["mktval_btl"]
        df_fact = self.input()['client_holdings'].read_dask()
        df_fact = df_fact.astype(dtype=dict.fromkeys(numcols, 'float64'))
        df_fact = df_fact.merge(df_dimension, on='asset_id', how='left')
        self.output().write_dask(df_fact)

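# The RedisCache used above (and by the GetClient*/GetSecurityMaster tasks
# further down) is not part of this listing. It is assumed to be a thin
# wrapper over a Redis hash; a minimal sketch under that assumption, using
# redis-py (the host/port defaults are illustrative, not from the original):
import redis


class RedisCache:
    def __init__(self, host='localhost', port=6379):
        self._client = redis.Redis(host=host, port=port)

    def store_pickle(self, namespace, key, payload):
        # Store a serialized payload as one field of a Redis hash.
        self._client.hset(namespace, key, payload)

    def get_pickle(self, namespace, key):
        # Return the raw bytes previously stored under (namespace, key).
        return self._client.hget(namespace, key)
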
class Dump(Task):
    '''
    Dumps the entire ``observatory`` schema to a local file using the
    `binary <https://www.postgresql.org/docs/9.4/static/app-pgdump.html>`_
    Postgres dump format.

    Automatically updates :class:`~.meta.OBSDumpVersion`.

    :param timestamp: Optional date parameter, defaults to today.
    '''
    timestamp = DateParameter(default=date.today())

    def requires(self):
        yield ConfirmTablesDescribedExist()
        yield OBSMetaToLocal()

    def run(self):
        session = current_session()
        try:
            self.output().makedirs()
            session.execute(
                'INSERT INTO observatory.obs_dump_version (dump_id) '
                "VALUES ('{task_id}')".format(task_id=self.task_id))
            session.commit()
            shell('pg_dump -Fc -Z0 -x -n observatory -f {output}'.format(
                output=self.output().path))
        except Exception as err:
            session.rollback()
            raise err

    def output(self):
        return LocalTarget(
            os.path.join('tmp', classpath(self), self.task_id + '.dump'))

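# Dump (and DumpS3 / PDFCatalogToS3 below) shell out through a shell() helper
# that is not shown in this listing. A minimal sketch of the assumed behaviour,
# running a command and returning its output (an assumption, not the original
# utility):
import subprocess


def shell(cmd):
    return subprocess.check_output(cmd, shell=True).decode('utf-8')
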
class DayAggTask(Task):
    date = DateParameter(default=datetime.today().date())

    def requires(self):
        return {
            "workday": WorkDayTask(date=self.date),
            "season": SeasonTask(month=self.date.month),
            "horoscope": HoroscopeTask(date=self.date),
            "zodiac": ZodiacTask(year=self.date.year),
        }

    def output(self):
        return LocalTarget("filesystem/DATEAGG-{}".format(
            self.date.strftime("%Y%m%d")))

    def run(self):
        with open(self.requires()["workday"].output().path, "r") as f:
            work_day = f.read().strip()
            assert work_day
        with open(self.requires()["season"].output().path, "r") as f:
            season = f.read().strip()
            assert season
        with open(self.requires()["zodiac"].output().path, "r") as f:
            zodiac = f.read().strip()
            assert zodiac
        with open(self.requires()["horoscope"].output().path, "r") as f:
            horoscope = f.read().strip()
            assert horoscope
        with open(self.output().path, 'a') as f:
            f.write("\n".join([zodiac, season, work_day, horoscope]))

class DaysBack_90(Task):
    'This job transforms the date aggregates into 90 day "logs"'
    date = DateParameter(default=datetime.today().date())

    def requires(self):
        for day_back in range(0, 90):
            retro = self.date - timedelta(days=day_back)
            yield DayAggTask(date=retro)

    def output(self):
        return LocalTarget("filesystem/retrospective-{}.log".format(
            self.date.strftime("%Y%m%d")))

    def run(self):
        zodiacs = {z: 0 for z in ZODIAC.values()}
        seasons = {s: 0 for s in SEASON.values()}
        horoscopes = {s: 0 for s in SIGNS}
        logs = []
        for date_task in self.requires():
            with open(date_task.output().path, "r") as f:
                date_info = f.read().strip()
                zodi, seas, work, horo = date_info.split("\n")
                logs.append(",".join([zodi, seas, work, horo]))
        with open(self.output().path, 'a') as f:
            f.write("\n".join(logs))

class DumpS3(Task):
    '''
    Uploads ``observatory`` schema dumped from :class:`~.carto.Dump` to
    `Amazon S3 <https://aws.amazon.com/s3/>`_, using credentials from ``.env``.

    Automatically updates :class:`~.meta.OBSDumpVersion`.

    :param timestamp: Optional date parameter, defaults to today.
    '''
    timestamp = DateParameter(default=date.today())
    force = BooleanParameter(default=False, significant=False)

    def requires(self):
        return Dump(timestamp=self.timestamp)

    def run(self):
        shell('aws s3 cp {input} {output}'.format(input=self.input().path,
                                                  output=self.output().path))

    def output(self):
        path = self.input().path.replace('tmp/carto/Dump_', 'do-release-')
        path = path.replace('.dump', '/obs.dump')
        path = 's3://cartodb-observatory-data/{path}'.format(path=path)
        LOGGER.info(path)
        target = S3Target(path)
        if self.force:
            shell('aws s3 rm {output}'.format(output=path))
            self.force = False
        return target

class GetClientMetaData(Task):
    client = Parameter()
    run_date = DateParameter()

    def run(self):
        df_metadata = pd.read_sql(
            "SELECT {} from {} where client_id='{}'".format(
                '*', 'iced.client', self.client),
            con=get_connection())
        df_metadata = df_metadata.set_index('client_id')
        redis_cache = RedisCache()
        client_key = 'client_' + self.client + '_metadata'
        redis_cache.store_pickle('dataframe', client_key,
                                 df_metadata.to_msgpack(compress='zlib'))
        with self.output().open('w') as f:
            df_metadata.to_csv(f)

    @property
    def root_path(self):
        return 'data/client-{}/run_date-{}/metadata/client_metadata.csv'.format(
            self.client, self.run_date)

    def output(self):
        return LocalTarget(self.root_path)

class PDFCatalogToS3(Task):
    timestamp = DateParameter(default=date.today())
    force = BoolParameter(significant=False)

    def __init__(self, **kwargs):
        if kwargs.get('force'):
            try:
                shell('aws s3 rm s3://data-observatory/observatory.pdf')
            except Exception:
                pass
        super(PDFCatalogToS3, self).__init__(**kwargs)

    def run(self):
        for target in self.output():
            shell('aws s3 cp catalog/build/observatory.pdf {output} '
                  '--acl public-read'.format(output=target.path))

    def output(self):
        return [
            S3Target('s3://data-observatory/observatory.pdf'),
            S3Target(
                's3://data-observatory/observatory-{timestamp}.pdf'.format(
                    timestamp=self.timestamp)),
        ]

class MonthTask(Task):
    date = DateParameter(default=datetime.today().date())

    def output(self):
        return LocalTarget("filesystem/m-{}".format(self.date.strftime("%m")))

    def run(self):
        open(self.output().path, 'a').close()

class HoroscopeTask(Task):
    date: datetime.date = DateParameter()

    def output(self):
        return LocalTarget("filesystem/HOROSCOPE_{}".format(
            self.date.strftime("%Y%m%d")))

    def run(self):
        with open(self.output().path, 'a') as f:
            f.write(_horoscope(self.date.day, self.date.month))

class SortedDataRaw(DownloadFromUrl, ExternalTask):
    date = DateParameter()

    def output(self):
        return LocalTarget('../StateData/NC/sorted/{}.zip'.format(
            self.date.strftime('%Y%m%d')))

    def url(self):
        url = self.BASE_URL + 'ENRS/{}/results_sort_{}.zip'.format(
            self.date.strftime('%Y_%m_%d'), self.date.strftime('%Y%m%d'))
        return url

class ShapeData(DownloadFromUrl, ExternalTask):
    date = DateParameter()
    level = Parameter(default='VTD')
    ftp_date_format = Parameter(default='%Y%m%d')

    def output(self):
        return LocalTarget('../StateData/NC/shapefiles/SBE_{}_{}.zip'.format(
            self.level, self.date.strftime('%Y%m%d')))

    def url(self):
        return self.BASE_URL + 'ShapeFiles/{}/SBE_{}_{}.zip'.format(
            self.level, self.level, self.date.strftime(self.ftp_date_format))

class S3FlagDatedDummyTask(Task):
    date = DateParameter()

    def output(self):
        return S3FlagTarget('s3://verve-home/scottstewart/luigi/%s/%s/' %
                            (self.__class__.__name__, self.date))

    def run(self):
        outPath = self.output().path
        for i in range(2):
            s3.put(outPath + ('part-0000%s' % i), rand())
        s3.put(outPath + self.output().flag, '')

class LoadJsonBase(ABC, CopyToTable):
    date = DateParameter(default=date.today())
    file_path = Parameter()

    host = "localhost"
    database = "datawarehouse"
    user = "******"
    password = "******"

    columns = [
        ("date", "DATE"),
        ("json_content", "JSON"),
    ]

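# LoadJsonBase is abstract: it fixes the connection settings and the
# (date, json_content) column layout, but leaves the destination table and the
# row source to subclasses. A minimal sketch of a concrete loader follows;
# the class name, table name, and single-row strategy are illustrative
# assumptions, not taken from the original code.
class LoadRawEventsJson(LoadJsonBase):
    table = "raw_events_json"  # assumed destination table

    def rows(self):
        # Emit one row per run: the task date plus the raw JSON payload
        # read from the configured file_path parameter.
        with open(self.file_path) as f:
            yield str(self.date), f.read()
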
class HoroscopeReportTask(Task):
    'Aggregates the 90-day logs into a per-sign report using a simple map/reduce'
    date = DateParameter(default=datetime.today().date())

    def requires(self):
        return DaysBack_90(date=self.date)

    def output(self):
        return LocalTarget("filesystem/horoscope_report-{}.tsv".format(
            self.date.strftime("%Y%m%d")))

    def run(self):
        logs = []
        with open(self.requires().output().path, "r") as f:
            logs = f.read().strip().split("\n")

        def map_log(row):
            zodi, seas, work, horo = row.split(",")
            is_workday = work == "work"
            is_weekend = work == "weekend"
            is_holiday = work == "holiday"
            return (
                horo,  # key
                (
                    1 if is_workday else 0,
                    1 if is_holiday else 0,
                    1 if is_weekend else 0,
                    1,  # count
                ),
            )

        mapped_logs = map(map_log, logs)
        Row = namedtuple("Row", ["work", "holiday", "weekend", "total"])
        reduced_logs = reduce_by_key(
            lambda l, r: Row(
                work=l[0] + r[0],
                holiday=l[1] + r[1],
                weekend=l[2] + r[2],
                total=l[3] + r[3],
            ), mapped_logs)
        tsv = ["sign\tworking_days\tholidays\tweekends\ttotal_days"]
        for row in reduced_logs:
            tsv.append("{}\t{}\t{}\t{}\t{}".format(
                row[0],
                row[1].work,
                row[1].holiday,
                row[1].weekend,
                row[1].total,
            ))
        with open(self.output().path, 'a') as f:
            f.write("\n".join(tsv))

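# HoroscopeReportTask relies on a reduce_by_key helper that is not shown in
# this listing. A minimal sketch of the (key, value) reduction it assumes,
# grouping mapped pairs by key and folding values with the supplied reducer
# (a hypothetical implementation, not necessarily the original):
def reduce_by_key(reducer, pairs):
    grouped = {}  # plain dicts preserve insertion order on Python 3.7+
    for key, value in pairs:
        grouped[key] = reducer(grouped[key], value) if key in grouped else value
    return grouped.items()
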
class SortedDataRaw(DownloadFromUrl, ExternalTask):
    date = DateParameter()

    directory = os.path.join(os.pardir, 'stateData', 'NC', 'sorted')
    if not os.path.exists(directory):
        os.makedirs(directory)

    def output(self):
        return LocalTarget(
            os.path.join(self.directory,
                         '{}.zip').format(self.date.strftime('%Y%m%d')))

    def url(self):
        url = self.BASE_URL + 'ENRS/{}/results_sort_{}.zip'.format(
            self.date.strftime('%Y_%m_%d'), self.date.strftime('%Y%m%d'))
        return url

class DatedDummyTask(Task):
    date = DateParameter()

    def output(self):
        return LocalTarget("pocOutput/%s/%s.tsv" %
                           (self.__class__.__name__, self.date))

    def run(self):
        with self.output().open('w') as outFile:
            for target in self.input():
                with target.open('r') as inFile:
                    for line in inFile:
                        outFile.write('%s-%s' %
                                      (self.__class__.__name__, line))

class UnzippedSortedData(Task):
    date = DateParameter()

    def requires(self):
        return [SortedDataRaw(date=self.date)]

    def output(self):
        return LocalTarget('../StateData/NC/sorted/results_sort_{}.txt'.format(
            self.date.strftime('%Y%m%d')))

    def run(self):
        for infile in self.input():
            z = zipfile.ZipFile(infile.path)
            z.extractall('../StateData/NC/sorted/')

class ShapeData(DownloadFromUrl, ExternalTask):
    date = DateParameter()
    level = Parameter(default='VTD')
    ftp_date_format = Parameter(default='%Y%m%d')

    directory = os.path.join(os.pardir, 'stateData', 'NC', 'shapefiles')
    if not os.path.exists(directory):
        os.makedirs(directory)

    def output(self):
        return LocalTarget(
            os.path.join(self.directory, 'SBE_{}_{}.zip').format(
                self.level, self.date.strftime('%Y%m%d')))

    def url(self):
        return self.BASE_URL + 'ShapeFiles/{}/SBE_{}_{}.zip'.format(
            self.level, self.level, self.date.strftime(self.ftp_date_format))

class DateTask(Task):
    date = DateParameter(default=datetime.today().date())

    def requires(self):
        return {
            "day": DayTask(date=self.date),
            "month": MonthTask(date=self.date),
            "year": YearTask(date=self.date),
        }

    def output(self):
        return LocalTarget("filesystem/date-{}".format(
            self.date.strftime("%Y%m%d")))

    def run(self):
        open(self.output().path, 'a').close()

class WorkDayTask(Task):
    'WorkDayTask is a simple daily task to check if a given day is a workday'
    date = DateParameter()

    def output(self):
        return LocalTarget("filesystem/WORKDAY_{}".format(
            self.date.strftime("%Y%m%d")))

    def run(self):
        us_holidays = holidays.US()
        is_holiday = self.date in us_holidays
        is_workday = WORKWEEK[self.date.weekday()]
        with open(self.output().path, 'a') as f:
            if not is_workday:
                f.write("weekend")
            elif is_holiday:
                f.write("holiday")
            else:
                f.write("work")

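# WORKWEEK is referenced above but not defined in this listing. It is assumed
# to map date.weekday() (0=Monday .. 6=Sunday) to an "is a workday" flag; a
# plausible definition (an assumption, not the original source):
WORKWEEK = {0: True, 1: True, 2: True, 3: True, 4: True, 5: False, 6: False}
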
class UnzippedSortedData(Task):
    date = DateParameter()

    directory = os.path.join(os.pardir, 'stateData', 'NC', 'sorted')
    if not os.path.exists(directory):
        os.makedirs(directory)

    def requires(self):
        return [SortedDataRaw(date=self.date)]

    def output(self):
        return LocalTarget(
            os.path.join(self.directory, 'results_sort_{}').format(
                self.date.strftime('%Y%m%d')))

    def run(self):
        for infile in self.input():
            z = zipfile.ZipFile(infile.path)
            z.extractall(
                os.path.join(self.directory, 'results_sort_{}').format(
                    self.date.strftime('%Y%m%d')))

class GetClientHoldingsFactData(Task):
    client = Parameter()
    run_date = DateParameter()

    def requires(self):
        return GetClientMetaData(client=self.client, run_date=self.run_date)

    output = TargetOutput('data/', target_class=ParquetTarget)

    def run(self):
        redis_cache = RedisCache()
        client_key = 'client_' + self.client + '_metadata'
        df_c = pd.read_msgpack(redis_cache.get_pickle('dataframe', client_key))
        fund_id = df_c['fund_id'].to_list()
        print(fund_id)
        df_p = pd.read_sql(
            "SELECT {} from {} where client_id='{}' and fund_id in {}".format(
                '*', 'iced.position', self.client, tuple(fund_id)),
            con=get_connection())
        df = dd.from_pandas(df_p, chunksize=1000)
        self.output().write_dask(df)

class GetSecurityMasterDimension(Task):
    run_date = DateParameter()

    def run(self):
        redis_cache = RedisCache()
        df_dimension = pd.read_sql(
            "SELECT {} from {}".format('*', 'iced.master'),
            con=get_connection())
        sec_master_key = ('client_' + self.run_date.strftime('%Y-%m-%d') +
                          '_sec_master_dimension')
        redis_cache.store_pickle('dataframe', sec_master_key,
                                 df_dimension.to_msgpack(compress='zlib'))
        print('data stored in redis')
        with self.output().open('w') as f:
            df_dimension.to_csv(f)

    @property
    def root_path(self):
        return '{}/{}/{}/abc.csv'.format('data', 'security_master',
                                         self.run_date)

    def output(self):
        return LocalTarget(self.root_path)

class ExecuteDashboard(Task):
    pd.options.display.float_format = '{:20,.2f}'.format
    run_date = DateParameter()

    output = TargetOutput('data/dashboard', target_class=ParquetTarget)

    def requires(self):
        return {
            'client_5294': self.clone(ExecuteClientBatch,
                                      run_date=self.run_date,
                                      client='JP Morgan'),
            'client_6000': self.clone(ExecuteClientBatch,
                                      run_date=self.run_date,
                                      client='Visa'),
            'client_7000': self.clone(ExecuteClientBatch,
                                      run_date=self.run_date,
                                      client='Chase'),
            'client_8000': self.clone(ExecuteClientBatch,
                                      run_date=self.run_date,
                                      client='BOFA'),
            'client_9000': self.clone(ExecuteClientBatch,
                                      run_date=self.run_date,
                                      client='AMEX'),
        }

    def run(self):
        # Accumulate the per-client market-value totals into a single frame.
        for i, key in enumerate(self.input()):
            if i == 0:
                df_calc = (self.input()[key].read_dask()
                           .groupby(by='client_id').mktval_btl.sum()
                           .round(2).to_frame())
            else:
                df_calc2 = (self.input()[key].read_dask()
                            .groupby(by='client_id').mktval_btl.sum()
                            .round(2).to_frame())
                df_calc = dd.concat([df_calc, df_calc2],
                                    interleave_partitions=True).compute()
        df_calc = df_calc.assign(asof=str(self.run_date))
        df_calc = df_calc.reset_index()
        numcols = ["mktval_btl"]
        df_calc = df_calc.astype(dtype=dict.fromkeys(numcols, 'float64'))
        df_final = dd.from_pandas(df_calc, chunksize=1000)
        self.output().write_dask(df_final)
        self.draw_plot(df_calc)

    def draw_plot(self, df_calc):
        sns.set(style="whitegrid")
        sns.lineplot(x='client_id', y='mktval_btl', data=df_calc,
                     color="coral", label="Market Value")
        plt.show()

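# A minimal way to kick off the dashboard pipeline from Python, assuming the
# module containing these tasks is importable; the run date below is purely
# illustrative:
import datetime

import luigi

if __name__ == '__main__':
    luigi.build([ExecuteDashboard(run_date=datetime.date(2020, 1, 31))],
                local_scheduler=True)
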
class SoiaEmailFetcher(sqla.CopyToTable):
    email_address = Parameter()
    password = Parameter()
    date = DateParameter()

    columns = [
        (["id", Integer()], {"autoincrement": True, "primary_key": True}),
        (["start", BigInteger()], {}),
        (["end", BigInteger()], {}),
        (["insert_date", BigInteger()], {}),
    ]
    connection_string = "sqlite:///data/soia_email.db"
    table = "soia"

    regexes = [(r"<b>Duration:</b>.*<br>", ["<b>Duration:</b>", "<br>"]),
               (r"(\d{2,4}.){2,4}.*<o", ["<o"])]

    def rows(self):
        for start, end in deduplicated(self.generate_rows()):
            yield "auto", start, end, datetime.now().strftime('%s')

    def copy(self, conn, ins_rows, table):
        bound_cols = dict((c, bindparam("_" + c.key)) for c in table.columns
                          if c.key != "id")
        ins = table.insert().values(bound_cols)
        conn.execute(ins, ins_rows)

    def generate_rows(self):
        imap_client = create_imap_client(self.email_address, self.password)
        soia_timestamps = []
        try:
            code, data = imap_client.search(None, "ALL")
            # iterate over emails
            for number in data[0].split(b" "):
                code, data = imap_client.fetch(number, '(RFC822)')
                message = email.message_from_string(data[0][1].decode())
                # get the actual email content
                date = dateparser.parse(message["Date"])
                content = message.get_payload()
                # handle base64 content
                try:
                    unbased_content = unbase64_content(content)
                except ValueError:
                    continue
                # iterate over regexes trying to match a date
                for regex, replaces in self.regexes:
                    match = re.search(regex, unbased_content)
                    if match is not None:
                        dates = remove_occurances(match.group(), replaces)
                        start, end = dates.rsplit("-", maxsplit=1)
                        if " " not in end.strip():
                            end = f"{date.year}-{date.month}-{date.day} {end}"
                        if " " not in start.strip():
                            start = f"{date.year}-{date.month}-{date.day} {start}"
                        parsed_start = dateparser.parse(start)
                        parsed_end = dateparser.parse(end)
                        if parsed_end is None or parsed_start is None:
                            logger.warning("couldn't parse the following: %s",
                                           match)
                            continue
                        row = (parsed_start.strftime('%s'),
                               parsed_end.strftime('%s'))
                        logger.debug("Adding the following row: %s", row)
                        soia_timestamps.append(row)
                        break
        except Exception as err:
            logger.error("Something went terribly wrong! %s", err)
        finally:
            imap_client.close()
            imap_client.logout()
        return soia_timestamps

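# SoiaEmailFetcher above (and SoiaMetricsFetcher below) call a deduplicated()
# helper that is not part of this listing. A minimal, order-preserving sketch
# of what it is assumed to do, dropping repeated rows while keeping the first
# occurrence (a hypothetical implementation):
def deduplicated(rows):
    seen = set()
    for row in rows:
        key = tuple(row)
        if key not in seen:
            seen.add(key)
            yield row
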
class MySqlDatedDummyTask(Task):
    date = DateParameter()

    def output(self):
        return MySqlTarget()

class ExternalDatedS3DummyFlagTask(ExternalTask):
    date = DateParameter()

    def output(self):
        return S3FlagTarget('s3://verve-home/scottstewart/luigi/%s/%s/' %
                            (self.__class__.__name__, self.date))

class SoiaMetricsFetcher(sqla.CopyToTable):
    columns = [
        (["id", Integer()], {"autoincrement": True, "primary_key": True}),
        (["start", BigInteger()], {}),
        (["end", BigInteger()], {}),
        (["insert_date", BigInteger()], {}),
        (["path", Text()], {}),
        (["metric_anomaly", Text()], {}),
        (["metric_whole", Text()], {}),
    ]
    connection_string = "sqlite:///data/soia_email.db"
    table = "soia_with_values"

    path = Parameter()
    date = DateParameter()

    def requires(self):
        return SoiaEmailFetcher(date=datetime.now()), MetricFetcher(
            path_prefix=self.path)

    def copy(self, conn, ins_rows, table):
        bound_cols = dict((c, bindparam("_" + c.key)) for c in table.columns
                          if c.key != "id")
        ins = table.insert().values(bound_cols)
        conn.execute(ins, ins_rows)

    def rows(self):
        for start, end, path, metric, whole in deduplicated(
                self.generate_rows()):
            yield ("auto", start, end, datetime.now().strftime('%s'), path,
                   metric, whole)

    def generate_rows(self):
        now = int(datetime.now().strftime('%s'))
        _14_days_ago = int(
            (datetime.now() - timedelta(days=14)).strftime('%s'))
        _, preloaded_metrics = self.input()
        metrics = json.loads(preloaded_metrics.open('r').read())

        conn = sqlite3.connect('data/soia_email.db')
        c = conn.cursor()
        c.execute("select distinct start, end from soia;")
        rows = c.fetchall()
        conn.close()

        formed_rows = []
        for start, end in rows:
            if start < _14_days_ago or end < _14_days_ago:
                logging.warning(
                    f"date too early :C - {datetime.fromtimestamp(start)}, "
                    f"{datetime.fromtimestamp(end)}")
            else:
                logging.info(
                    f"date good to go! - {datetime.fromtimestamp(start)}, "
                    f"{datetime.fromtimestamp(end)}")
                for metric in metrics:
                    shorter = list(
                        filter(lambda tup: tup[1] >= start and tup[1] <= end,
                               metric['datapoints']))
                    formed_rows.append(
                        (start, end, metric['target'], json.dumps(shorter),
                         json.dumps(metric['datapoints'])))
        print(len(formed_rows[0]))
        return formed_rows