Code Example #1
class CreateRecommendations(Task):
    """
    Task that generates recommendations for each user and saves them to the filesystem.
    """
    date = DateParameter(default=datetime.date.today())

    def requires(self):
        return CreateModel()

    def run(self):
        logger = getLogger("luigi-interface")
        model_target = yield CreateModel()  # renamed to avoid shadowing the builtin input()
        (dataset, train_interactions, model) = model_target.get()

        logger.info("Generating recommendations")
        recommendations = recommend_movies(dataset, train_interactions, model)

        logger.info("Backing up recommendations on disk")
        directory = './generated/recommendations/'
        makedirs(directory, exist_ok=True)  # avoids the exists()/makedirs() race
        with self.output().open('w') as f:
            json.dump(recommendations, f)

    def output(self):
        return LocalTarget(
            path='./generated/recommendations/{}.json'.format(self.date))
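
The '{}'.format(self.date) in the output path works because DateParameter round-trips dates as ISO strings; a minimal sketch of that round-trip:

import datetime
from luigi import DateParameter

# DateParameter serializes with '%Y-%m-%d', so interpolating self.date
# into a path yields names like '2021-06-01.json'.
p = DateParameter()
print(p.serialize(datetime.date(2021, 6, 1)))  # '2021-06-01'
print(p.parse('2021-06-01'))                   # datetime.date(2021, 6, 1)
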
Code Example #2
File: batch.py | Project: manishbabel/ice
class ExecuteClientBatch(Task):
    # a WrapperTask is marked complete as soon as its requirements are,
    # which would skip run(); this task writes output, so it must be a Task
    client = Parameter()
    run_date = DateParameter()

    def requires(self):
        return {
            'client_holdings':
            GetClientHoldingsFactData(run_date=self.run_date,
                                      client=self.client),
            'security_reference':
            GetSecurityMasterDimension(run_date=self.run_date)
        }

    output = TargetOutput('data/', target_class=ParquetTarget)

    def run(self):
        redis_cache = RedisCache()
        sec_master_key = 'client_' + self.run_date.strftime(
            '%Y-%m-%d') + '_sec_master_dimension'
        df_dimension = pd.read_msgpack(
            redis_cache.get_pickle('dataframe', sec_master_key))
        numcols = ["mktval_btl"]
        df_fact = self.input()['client_holdings'].read_dask()
        df_fact = df_fact.astype(dtype=dict.fromkeys(numcols, 'float64'))
        df_fact = df_fact.merge(df_dimension, on='asset_id', how='left')
        self.output().write_dask(df_fact)
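
Note that pandas.read_msgpack/to_msgpack were deprecated in pandas 0.25 and removed in 1.0. A hedged sketch of the same Redis round-trip using pickle instead (RedisCache is this project's own helper, assumed to store and return raw bytes):

import pickle

# Only safe against a trusted cache, since unpickling executes code.
def cache_frame(redis_cache, key, df):
    redis_cache.store_pickle('dataframe', key, pickle.dumps(df))

def load_frame(redis_cache, key):
    return pickle.loads(redis_cache.get_pickle('dataframe', key))
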
Code Example #3
class Dump(Task):
    '''
    Dumps the entire ``observatory`` schema to a local file using the
    `binary <https://www.postgresql.org/docs/9.4/static/app-pgdump.html>`_
    Postgres dump format.

    Automatically updates :class:`~.meta.OBSDumpVersion`.

    :param timestamp: Optional date parameter, defaults to today.
    '''

    timestamp = DateParameter(default=date.today())

    def requires(self):
        yield ConfirmTablesDescribedExist()
        yield OBSMetaToLocal()

    def run(self):
        session = current_session()
        try:
            self.output().makedirs()
            session.execute(
                'INSERT INTO observatory.obs_dump_version (dump_id) '
                "VALUES ('{task_id}')".format(task_id=self.task_id))
            session.commit()
            shell('pg_dump -Fc -Z0 -x -n observatory -f {output}'.format(
                output=self.output().path))
        except Exception as err:
            session.rollback()
            raise err

    def output(self):
        return LocalTarget(
            os.path.join('tmp', classpath(self), self.task_id + '.dump'))
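
A caveat that applies here and to several examples below: DateParameter(default=date.today()) evaluates its default once, at module import, so a long-lived worker keeps serving the import-time date. A minimal sketch of pinning the date explicitly instead:

import datetime
import luigi

# Hypothetical driver: pass the timestamp rather than relying on the
# import-time default.
if __name__ == '__main__':
    luigi.build([Dump(timestamp=datetime.date(2021, 6, 1))],
                local_scheduler=True)
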
Code Example #4
File: pipeline.py | Project: fenimore/example_pipeline
class DayAggTask(Task):
    date = DateParameter(default=datetime.today().date())

    def requires(self):
        return {
            "workday": WorkDayTask(date=self.date),
            "season": SeasonTask(month=self.date.month),
            "horoscope": HoroscopeTask(date=self.date),
            "zodiac": ZodiacTask(year=self.date.year),
        }

    def output(self):
        return LocalTarget("filesystem/DATEAGG-{}".format(
            self.date.strftime("%Y%m%d")))

    def run(self):
        with open(self.requires()["workday"].output().path, "r") as f:
            work_day = f.read().strip()
            assert work_day
        with open(self.requires()["season"].output().path, "r") as f:
            season = f.read().strip()
            assert season
        with open(self.requires()["zodiac"].output().path, "r") as f:
            zodiac = f.read().strip()
            assert zodiac
        with open(self.requires()["horoscope"].output().path, "r") as f:
            horoscope = f.read().strip()
            assert horoscope
        with open(self.output().path, 'a') as f:
            f.write("\n".join([zodiac, season, work_day, horoscope]))
Code Example #5
File: pipeline.py | Project: fenimore/example_pipeline
class DaysBack_90(Task):
    'This job transforms the date aggregates into 90 day "logs"'
    date = DateParameter(default=datetime.today().date())

    def requires(self):
        for day_back in range(0, 90):
            retro = self.date - timedelta(days=day_back)
            yield DayAggTask(date=retro)

    def output(self):
        return LocalTarget("filesystem/retrospective-{}.log".format(
            self.date.strftime("%Y%m%d")))

    def run(self):
        zodiacs = {z: 0 for z in ZODIAC.values()}
        seasons = {s: 0 for s in SEASON.values()}
        horoscopes = {s: 0 for s in SIGNS}
        logs = []
        for date_task in self.requires():
            with open(date_task.output().path, "r") as f:
                date_info = f.read().strip()
                zodi, seas, work, horo = date_info.split("\n")
                logs.append(",".join([zodi, seas, work, horo]))
        with open(self.output().path, 'a') as f:
            f.write("\n".join(logs))
Code Example #6
class DumpS3(Task):
    '''
    Uploads ``observatory`` schema dumped from :class:`~.carto.Dump` to
    `Amazon S3 <https://aws.amazon.com/s3/>`_, using credentials from ``.env``.

    Automatically updates :class:`~.meta.OBSDumpVersion`.

    :param timestamp: Optional date parameter, defaults to today.
    '''
    timestamp = DateParameter(default=date.today())
    force = BoolParameter(default=False, significant=False)

    def requires(self):
        return Dump(timestamp=self.timestamp)

    def run(self):
        shell('aws s3 cp {input} {output}'.format(input=self.input().path,
                                                  output=self.output().path))

    def output(self):
        path = self.input().path.replace('tmp/carto/Dump_', 'do-release-')
        path = path.replace('.dump', '/obs.dump')
        path = 's3://cartodb-observatory-data/{path}'.format(path=path)
        LOGGER.info(path)
        target = S3Target(path)
        if self.force:
            shell('aws s3 rm {output}'.format(output=path))
            self.force = False
        return target
Code Example #7
File: etl_flow.py | Project: manishbabel/ice
class GetClientMetaData(Task):
    client = Parameter()
    run_date = DateParameter()

    def run(self):
        df_metadata = pd.read_sql(
            "SELECT * from iced.client where client_id='{}'".format(self.client),
            con=get_connection())
        df_metadata = df_metadata.set_index('client_id')
        redis_cache = RedisCache()
        client_key = 'client_' + self.client + '_metadata'
        redis_cache.store_pickle('dataframe', client_key,
                                 df_metadata.to_msgpack(compress='zlib'))

        with self.output().open('w') as f:
            df_metadata.to_csv(f)

    @property
    def root_path(self):
        return 'data/client-{}/run_date-{}/metadata/client_metadata.csv'.format(
            self.client, self.run_date)

    def output(self):
        return LocalTarget(self.root_path)
Code Example #8
class PDFCatalogToS3(Task):

    timestamp = DateParameter(default=date.today())
    force = BoolParameter(significant=False)

    def __init__(self, **kwargs):
        if kwargs.get('force'):
            try:
                shell('aws s3 rm s3://data-observatory/observatory.pdf')
            except Exception:
                pass
        super(PDFCatalogToS3, self).__init__(**kwargs)

    def run(self):
        for target in self.output():
            shell('aws s3 cp catalog/build/observatory.pdf {output} '
                  '--acl public-read'.format(output=target.path))

    def output(self):
        return [
            S3Target('s3://data-observatory/observatory.pdf'),
            S3Target(
                's3://data-observatory/observatory-{timestamp}.pdf'.format(
                    timestamp=self.timestamp)),
        ]
Code Example #9
class MonthTask(Task):
    date = DateParameter(default=datetime.today().date())

    def output(self):
        return LocalTarget("filesystem/m-{}".format(self.date.strftime("%m")))

    def run(self):
        open(self.output().path, 'a').close()
Code Example #10
File: pipeline.py | Project: fenimore/example_pipeline
class HoroscopeTask(Task):
    date: datetime.date = DateParameter()

    def output(self):
        return LocalTarget("filesystem/HOROSCOPE_{}".format(
            self.date.strftime("%Y%m%d")))

    def run(self):
        with open(self.output().path, 'a') as f:
            f.write(_horoscope(self.date.day, self.date.month))
Code Example #11
class SortedDataRaw(DownloadFromUrl, ExternalTask):
    date = DateParameter()

    def output(self):
        return LocalTarget('../StateData/NC/sorted/{}.zip'.format(
            self.date.strftime('%Y%m%d')))

    def url(self):
        url = self.BASE_URL + 'ENRS/{}/results_sort_{}.zip'.format(
            self.date.strftime('%Y_%m_%d'), self.date.strftime('%Y%m%d'))
        return url
Code Example #12
class ShapeData(DownloadFromUrl, ExternalTask):
    date = DateParameter()
    level = Parameter(default='VTD')
    ftp_date_format = Parameter(default='%Y%m%d')

    def output(self):
        return LocalTarget('../StateData/NC/shapefiles/SBE_{}_{}.zip'.format(
            self.level, self.date.strftime('%Y%m%d')))

    def url(self):
        return self.BASE_URL + 'ShapeFiles/{}/SBE_{}_{}.zip'.format(
            self.level, self.level, self.date.strftime(self.ftp_date_format))
Code Example #13
class S3FlagDatedDummyTask(Task):
    date = DateParameter()

    def output(self):
        return S3FlagTarget('s3://verve-home/scottstewart/luigi/%s/%s/' %
                            (self.__class__.__name__, self.date))

    def run(self):
        outPath = self.output().path
        for i in range(2):
            s3.put(outPath + ('part-0000%s' % i), rand())
        s3.put(outPath + self.output().flag, '')
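
S3FlagTarget.exists() checks only for the flag object (by default '_SUCCESS'), which is why the flag is uploaded last above; a short sketch of the completion check, reusing the same path layout:

from luigi.contrib.s3 import S3FlagTarget

# Complete once the flag object exists, regardless of how many part
# files were uploaded before it.
target = S3FlagTarget('s3://verve-home/scottstewart/luigi/S3FlagDatedDummyTask/2000-01-01/')
print(target.exists())
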
Code Example #14
class LoadJsonBase(ABC, CopyToTable):
    date = DateParameter(default=date.today())
    file_path = Parameter()

    host = "localhost"
    database = "datawarehouse"
    user = "******"
    password = "******"

    columns = [
        ("date", "DATE"),
        ("json_content", "JSON"),
    ]
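
Because LoadJsonBase is abstract, a concrete loader only needs to name its table and yield rows in the declared column order; a hypothetical subclass, assuming file_path holds one JSON object per line:

import json

# Hypothetical concrete loader built on the base class above.
class LoadDailyEvents(LoadJsonBase):
    table = 'daily_events'

    def rows(self):
        with open(self.file_path) as f:
            for line in f:
                yield (self.date, json.dumps(json.loads(line)))
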
Code Example #15
File: pipeline.py | Project: fenimore/example_pipeline
class HoroscopeReportTask(Task):
    'This is a Map Reduce job'
    date = DateParameter(default=datetime.today().date())

    def requires(self):
        return DaysBack_90(date=self.date)

    def output(self):
        return LocalTarget("filesystem/horoscope_report-{}.tsv".format(
            self.date.strftime("%Y%m%d")))

    def run(self):
        logs = []
        with open(self.requires().output().path, "r") as f:
            logs = f.read().strip().split("\n")

        def map_log(row):
            zodi, seas, work, horo = row.split(",")
            is_workday = work == "work"
            is_weekend = work == "weekend"
            is_holiday = work == "holiday"
            return (
                horo,  # key
                (
                    1 if is_workday else 0,
                    1 if is_holiday else 0,
                    1 if is_weekend else 0,
                    1,  # count
                ),
            )

        mapped_logs = map(map_log, logs)
        Row = namedtuple("Row", ["work", "holiday", "weekend", "total"])
        reduced_logs = reduce_by_key(
            lambda l, r: Row(
                work=l[0] + r[0],
                holiday=l[1] + r[1],
                weekend=l[2] + r[2],
                total=l[3] + r[3],
            ), mapped_logs)
        tsv = ["sign\tworking_days\tholidays\tweekends\ttotal_days"]
        for row in reduced_logs:
            tsv.append("{}\t{}\t{}\t{}\t{}".format(
                row[0],
                row[1].work,
                row[1].holiday,
                row[1].weekend,
                row[1].total,
            ))
        with open(self.output().path, 'a') as f:
            f.write("\n".join(tsv))
Code Example #16
class SortedDataRaw(DownloadFromUrl, ExternalTask):
    date = DateParameter()
    directory = os.path.join(os.pardir, 'stateData', 'NC', 'sorted')
    os.makedirs(directory, exist_ok=True)

    def output(self):
        return LocalTarget(
            os.path.join(self.directory,
                         '{}.zip').format(self.date.strftime('%Y%m%d')))

    def url(self):
        url = self.BASE_URL + 'ENRS/{}/results_sort_{}.zip'.format(
            self.date.strftime('%Y_%m_%d'), self.date.strftime('%Y%m%d'))
        return url
Code Example #17
class DatedDummyTask(Task):
    date = DateParameter()

    def output(self):
        return LocalTarget("pocOutput/%s/%s.tsv" %
                           (self.__class__.__name__, self.date))

    def run(self):
        with self.output().open('w') as outFile:
            for target in self.input():
                with target.open('r') as inFile:
                    for line in inFile:
                        outFile.write('%s-%s' %
                                      (self.__class__.__name__, line))
Code Example #18
class UnzippedSortedData(Task):
    date = DateParameter()

    def requires(self):
        return [SortedDataRaw(date=self.date)]

    def output(self):
        return LocalTarget('../StateData/NC/sorted/results_sort_{}.txt'.format(
            self.date.strftime('%Y%m%d')))

    def run(self):
        for infile in self.input():
            z = zipfile.ZipFile(infile.path)
            z.extractall('../StateData/NC/sorted/')
Code Example #19
class ShapeData(DownloadFromUrl, ExternalTask):
    date = DateParameter()
    level = Parameter(default='VTD')
    ftp_date_format = Parameter(default='%Y%m%d')
    directory = os.path.join(os.pardir, 'stateData', 'NC', 'shapefiles')
    os.makedirs(directory, exist_ok=True)

    def output(self):
        return LocalTarget(
            os.path.join(self.directory,
                         'SBE_{}_{}.zip').format(self.level,
                                                 self.date.strftime('%Y%m%d')))

    def url(self):
        return self.BASE_URL + 'ShapeFiles/{}/SBE_{}_{}.zip'.format(
            self.level, self.level, self.date.strftime(self.ftp_date_format))
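
The class-body makedirs in this variant (and in SortedDataRaw/UnzippedSortedData) runs as a side effect of merely importing the module. A hedged helper that defers directory creation until output() or run() needs it:

import os

# Deferring makedirs keeps module import side-effect free; exist_ok
# also removes the exists()/makedirs() race.
def ensure_dir(path):
    os.makedirs(path, exist_ok=True)
    return path
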
Code Example #20
class DateTask(Task):
    date = DateParameter(default=datetime.today().date())

    def requires(self):
        return {
            "day": DayTask(date=self.date),
            "month": MonthTask(date=self.date),
            "year": YearTask(date=self.date),
        }

    def output(self):
        return LocalTarget("filesystem/date-{}".format(
            self.date.strftime("%Y%m%d")))

    def run(self):
        open(self.output().path, 'a').close()
Code Example #21
File: pipeline.py | Project: fenimore/example_pipeline
class WorkDayTask(Task):
    'WorkDayTask is a simple daily task to check if a given day is a workday'
    date = DateParameter()

    def output(self):
        return LocalTarget("filesystem/WORKDAY_{}".format(
            self.date.strftime("%Y%m%d")))

    def run(self):
        us_holidays = holidays.US()
        is_holiday = self.date in us_holidays
        is_workday = WORKWEEK[self.date.weekday()]
        with open(self.output().path, 'a') as f:
            if not is_workday:
                f.write("weekend")
            elif is_holiday:
                f.write("holiday")
            else:
                f.write("work")
Code Example #22
class UnzippedSortedData(Task):
    date = DateParameter()
    directory = os.path.join(os.pardir, 'stateData', 'NC', 'sorted')
    os.makedirs(directory, exist_ok=True)

    def requires(self):
        return [SortedDataRaw(date=self.date)]

    def output(self):
        return LocalTarget(
            os.path.join(self.directory, 'results_sort_{}').format(
                self.date.strftime('%Y%m%d')))

    def run(self):
        for infile in self.input():
            z = zipfile.ZipFile(infile.path)
            z.extractall(
                os.path.join(self.directory, 'results_sort_{}').format(
                    self.date.strftime('%Y%m%d')))
Code Example #23
File: etl_flow.py | Project: manishbabel/ice
class GetClientHoldingsFactData(Task):
    client = Parameter()
    run_date = DateParameter()

    def requires(self):
        return GetClientMetaData(client=self.client, run_date=self.run_date)

    output = TargetOutput('data/', target_class=ParquetTarget)

    def run(self):
        redis_cache = RedisCache()
        client_key = 'client_' + self.client + '_metadata'
        df_c = pd.read_msgpack(redis_cache.get_pickle('dataframe', client_key))

        fund_id = df_c['fund_id'].to_list()
        print(fund_id)
        df_p = pd.read_sql(
            "SELECT * from iced.position where client_id='{}' and fund_id in {}".format(
                self.client, tuple(fund_id)),
            con=get_connection())
        df = dd.from_pandas(df_p, chunksize=1000)
        self.output().write_dask(df)
Code Example #24
File: etl_flow.py | Project: manishbabel/ice
class GetSecurityMasterDimension(Task):
    run_date = DateParameter()

    def run(self):
        redis_cache = RedisCache()
        df_dimension = pd.read_sql("SELECT * from iced.master",
                                   con=get_connection())
        sec_master_key = 'client_' + self.run_date.strftime(
            '%Y-%m-%d') + '_sec_master_dimension'
        redis_cache.store_pickle('dataframe', sec_master_key,
                                 df_dimension.to_msgpack(compress='zlib'))

        print('data stored in redis')
        with self.output().open('w') as f:
            df_dimension.to_csv(f)

    @property
    def root_path(self):
        return '{}/{}/{}/abc.csv'.format('data', 'security_master',
                                         self.run_date)

    def output(self):
        return LocalTarget(self.root_path)
Code Example #25
class ExecuteDashboard(Task):
    pd.options.display.float_format = '{:20,.2f}'.format
    run_date = DateParameter()

    output = TargetOutput('data/dashboard', target_class=ParquetTarget)

    def requires(self):
        return {
            'client_5294': self.clone(ExecuteClientBatch, run_date=self.run_date, client='JP Morgan'),
            'client_6000': self.clone(ExecuteClientBatch, run_date=self.run_date, client='Visa'),
            'client_7000': self.clone(ExecuteClientBatch, run_date=self.run_date, client='Chase'),
            'client_8000': self.clone(ExecuteClientBatch, run_date=self.run_date, client='BOFA'),
            'client_9000': self.clone(ExecuteClientBatch, run_date=self.run_date, client='AMEX'),
        }

    def run(self):
        df_calc = None
        for key in self.input():
            df_part = self.input()[key].read_dask().groupby(
                by='client_id').mktval_btl.sum().round(2).to_frame()
            # concatenate lazily; compute once after the loop
            df_calc = df_part if df_calc is None else dd.concat(
                [df_calc, df_part], interleave_partitions=True)
        df_calc = df_calc.compute()
        df_calc = df_calc.assign(asof=str(self.run_date))
        df_calc = df_calc.reset_index()
        numcols = ["mktval_btl"]
        df_calc = df_calc.astype(dtype=dict.fromkeys(numcols, 'float64'))

        df_final = dd.from_pandas(df_calc, chunksize=1000)
        self.output().write_dask(df_final)
        self.draw_plot(df_calc)

    def draw_plot(self, df_calc):
        sns.set(style="whitegrid")
        sns.lineplot(x='client_id', y='mktval_btl', data=df_calc,
                     color="coral", label="Market Value")
        plt.show()
Code Example #26
class SoiaEmailFetcher(sqla.CopyToTable):
    email_address = Parameter()
    password = Parameter()
    date = DateParameter()

    columns = [(["id", Integer()], {
        "autoincrement": True,
        "primary_key": True
    }), (["start", BigInteger()], {}), (["end", BigInteger()], {}),
               (["insert_date", BigInteger()], {})]
    connection_string = "sqlite:///data/soia_email.db"
    table = "soia"
    regexes = [(r"<b>Duration:</b>.*<br>", ["<b>Duration:</b>", "<br>"]),
               (r"(\d{2,4}.){2,4}.*<o", ["<o"])]

    def rows(self):
        for start, end in deduplicated(self.generate_rows()):
            yield "auto", start, end, datetime.now().strftime('%s')

    def copy(self, conn, ins_rows, table):
        bound_cols = dict((c, bindparam("_" + c.key)) for c in table.columns
                          if c.key != "id")
        ins = table.insert().values(bound_cols)
        conn.execute(ins, ins_rows)

    def generate_rows(self):
        imap_client = create_imap_client(self.email_address, self.password)
        try:
            code, data = imap_client.search(None, "ALL")

            soia_timestamps = []
            # iterate over emails
            for number in data[0].split(b" "):
                code, data = imap_client.fetch(number, '(RFC822)')
                message = email.message_from_string(data[0][1].decode())

                # get actual email content
                date = dateparser.parse(message["Date"])
                content = message.get_payload()

                # handle base64 content
                try:
                    unbased_content = unbase64_content(content)
                except ValueError:
                    continue

                # iterate over regexes trying to match date
                for regex, replaces in self.regexes:
                    match = re.search(regex, unbased_content)
                    if match is not None:
                        dates = remove_occurances(match.group(), replaces)

                        start, end = dates.rsplit("-", maxsplit=1)
                        if " " not in end.strip():
                            end = f"{date.year}-{date.month}-{date.day} {end}"
                        if " " not in start.strip():
                            start = f"{date.year}-{date.month}-{date.day} {start}"

                        parsed_start = dateparser.parse(start)
                        parsed_end = dateparser.parse(end)
                        if parsed_end is None or parsed_start is None:
                            logger.warning("coudn't parse the following: %s",
                                           match)
                            continue
                        row = (parsed_start.strftime('%s'),
                               parsed_end.strftime('%s'))
                        logger.debug("Adding the following row: %s", row)
                        soia_timestamps.append(row)
                        break
        except Exception as err:
            logger.error("Something went terribly wrong! %s", err)

        finally:
            imap_client.close()
            imap_client.logout()
        return soia_timestamps
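
A portability note on the strftime('%s') calls used for the epoch columns here and in SoiaMetricsFetcher below: '%s' is a platform extension (glibc), not a documented Python format code. A hedged equivalent:

from datetime import datetime

# int(timestamp()) yields the same epoch seconds without relying on the
# non-standard '%s' code.
epoch_seconds = int(datetime.now().timestamp())
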
Code Example #27
class MySqlDatedDummyTask(Task):
    date = DateParameter()

    def output(self):
        return MySqlTarget()
Code Example #28
class ExternalDatedS3DummyFlagTask(ExternalTask):
    date = DateParameter()

    def output(self):
        return S3FlagTarget('s3://verve-home/scottstewart/luigi/%s/%s/' %
                            (self.__class__.__name__, self.date))
Code Example #29
class SoiaMetricsFetcher(sqla.CopyToTable):

    columns = [(["id", Integer()], {
        "autoincrement": True,
        "primary_key": True
    }), (["start", BigInteger()], {}), (["end", BigInteger()], {}),
               (["insert_date", BigInteger()], {}), (["path", Text()], {}),
               (["metric_anomaly", Text()], {}), (["metric_whole",
                                                   Text()], {})]
    connection_string = "sqlite:///data/soia_email.db"
    table = "soia_with_values"

    path = Parameter()
    date = DateParameter()

    def requires(self):
        return SoiaEmailFetcher(date=datetime.now()), MetricFetcher(
            path_prefix=self.path)

    def copy(self, conn, ins_rows, table):
        bound_cols = dict((c, bindparam("_" + c.key)) for c in table.columns
                          if c.key != "id")
        ins = table.insert().values(bound_cols)
        conn.execute(ins, ins_rows)

    def rows(self):
        for start, end, path, metric, whole in deduplicated(
                self.generate_rows()):
            yield "auto", start, end, datetime.now().strftime(
                '%s'), path, metric, whole

    def generate_rows(self):

        now = int(datetime.now().strftime('%s'))
        _14_days_ago = int(
            (datetime.now() - timedelta(days=14)).strftime('%s'))
        _, preloaded_metrics = self.input()

        metrics = json.loads(preloaded_metrics.open('r').read())

        conn = sqlite3.connect('data/soia_email.db')
        c = conn.cursor()
        c.execute("select distinct start, end from soia;")
        rows = c.fetchall()
        conn.close()
        formed_rows = []
        for start, end in rows:
            if start < _14_days_ago or end < _14_days_ago:
                logging.warning(
                    f"date too early :C - {datetime.fromtimestamp(start)}, {datetime.fromtimestamp(end)}"
                )
            else:
                logging.info(
                    f"date good to go! - {datetime.fromtimestamp(start)}, {datetime.fromtimestamp(end)}"
                )
                for metric in metrics:
                    shorter = list(
                        filter(lambda tup: tup[1] >= start and tup[1] <= end,
                               metric['datapoints']))
                    formed_rows.append(
                        (start, end, metric['target'], json.dumps(shorter),
                         json.dumps(metric['datapoints'])))
                    print(len(formed_rows[0]))

        return formed_rows
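
Finally, deduplicated is another project helper shared by both fetchers' rows(); a hypothetical order-preserving implementation consistent with that usage:

# Hypothetical sketch of deduplicated(): drops repeated rows while
# keeping their first-seen order, so re-runs don't insert duplicates.
def deduplicated(rows):
    seen = set()
    for row in rows:
        if row not in seen:
            seen.add(row)
            yield row
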