コード例 #1
0
class YellowTaxiDateRangeTask(luigi.WrapperTask):
    """Wrapper task requiring one ``CopyTaxiTripData2SQLite`` per month.

    ``start`` and ``stop`` are both inclusive month boundaries.
    """

    start = luigi.MonthParameter()
    stop = luigi.MonthParameter()

    def requires(self):
        """Yield a copy task for every month from ``start`` through ``stop``."""
        one_month = relativedelta(months=1)
        month = self.start
        while True:
            if month > self.stop:
                break
            yield CopyTaxiTripData2SQLite(date=month)
            month = month + one_month
コード例 #2
0
ファイル: advanced.py プロジェクト: andrewsosa/luigi-demo
class PandasDFDemo(luigi.Task):
    """Read the generated customers CSV into a dataframe and dump its text form."""

    month: datetime.date = luigi.MonthParameter(default=datetime.date.today())
    profession: str = luigi.Parameter(default="Engineer")

    def requires(self):
        """Depend on the customers file generated for the same month."""
        return GenerateCustomers(month=self.month)

    def output(self):
        """Return a local text target named after the profession and the month."""
        target_path = f"data/customers#{self.profession}_{self.month}.txt"
        return luigi.LocalTarget(target_path)

    def run(self):
        """Load the input CSV, write ``str(dataframe)`` to the output, return the frame."""
        column_names = ["Name", "Address", "Birthdate", "Job", "Company", "Email"]

        with self.input().open() as source:
            frame = pandas.read_csv(source, names=column_names)

        with self.output().open("w") as sink:
            sink.write(str(frame))

        return frame
コード例 #3
0
class NominetDomainListToHDFS(luigi.Task):
    """
    Copy the monthly Nominet domain list from its remote source to HDFS.

    The input is the target of ``NominetDomainListFTP`` for the same ``date``;
    the output is an ``HdfsTarget`` whose file name embeds the year and month.
    """
    date = luigi.MonthParameter(default=datetime.date.today())

    task_namespace = 'ingest'

    def requires(self):
        """Require the remote domain list for the same month."""
        return NominetDomainListFTP(date=self.date)

    def output(self):
        """
        Return the HDFS target for this month's domain list.

        :return: the target output for this task.
        :rtype: object (:py:class:`~luigi.target.Target`)
        """
        filename = "/1_data/nominet/domains.%s.csv.gz" % self.date.strftime(
            '%Y%m')
        return luigi.contrib.hdfs.HdfsTarget(path=filename,
                                             format=WebHdfsPlainFormat())

    def run(self):
        """Stream the input to the output in ``DEFAULT_BUFFER_SIZE`` chunks."""
        # Read the file in and write it to HDFS
        with self.input().open('r') as reader:
            with self.output().open('w') as writer:
                # Pass the values as logging arguments so the message is only
                # formatted when the record is actually emitted.
                logger.info("Uploading %s to %s",
                            self.input().path, self.output().path)
                while True:
                    chunk = reader.read(DEFAULT_BUFFER_SIZE)
                    if not chunk:
                        # An empty read marks the end of the input.
                        break
                    writer.write(chunk)
コード例 #4
0
class NominetDomainListFTP(luigi.ExternalTask):
    """
    Monthly Nominet domain list published on a remote SFTP service.

    NOTE: the remote host key must already be known on the machine that runs
    this task. On a new server, or after the remote server changes, run a
    manual `ssh USER@HOST` once to register the key.

    """
    date = luigi.MonthParameter(default=datetime.date.today())

    task_namespace = 'ingest'

    def output(self):
        """
        Build the remote target for this month's release file.

        The file name embeds the task's ``date`` formatted as year and month.

        :return: the target output for this task.
        :rtype: object (:py:class:`~luigi.target.Target`)
        """
        year_month = self.date.strftime('%Y%m')
        remote_path = '/home/bl/domains.%s.csv.gz' % year_month
        return luigi.contrib.ftp.RemoteTarget(
            remote_path,
            NOM_HOST,
            username=NOM_USER,
            password=NOM_PWD,
            sftp=True,
        )
コード例 #5
0
ファイル: simple.py プロジェクト: andrewsosa/luigi-demo
class GenerateCustomers(luigi.Task):
    """ Write a CSV of :count: fake customer records for :month:. """

    month: datetime.date = luigi.MonthParameter()
    count: int = luigi.IntParameter(default=10000)

    def output(self):
        """Return the local CSV target named after the month."""
        return luigi.LocalTarget(f"data/customers_{self.month}.csv")

    def run(self):
        """Generate ``count`` rows with Faker and write them as CSV."""
        fake = faker.Faker()
        earliest_birthdate = datetime.date(1930, 1, 1)
        latest_birthdate = datetime.date(2000, 1, 1)

        def customer_row():
            # Column order: name, address, birthdate, job, company, email.
            return [
                fake.name(),
                # Flatten the address so each record stays on one CSV line.
                fake.address().replace("\n", " "),
                fake.date_between_dates(
                    date_start=earliest_birthdate,
                    date_end=latest_birthdate,
                ),
                fake.job(),
                fake.company(),
                fake.company_email(),
            ]

        with self.output().open("w") as outfile:
            csv.writer(outfile).writerows(
                customer_row() for _ in range(self.count))
コード例 #6
0
ファイル: advanced.py プロジェクト: andrewsosa/luigi-demo
class CustomerSalaries(luigi.Task):
    """ Join the generated customers with company salaries and save as CSV. """

    month: datetime.date = luigi.MonthParameter(default=datetime.date.today())
    job: str = luigi.Parameter(default="Engineer")

    def output(self):
        """Return the local CSV target named after the job and the month."""
        return luigi.LocalTarget(f"data/salaries_{self.job}_{self.month}.csv")

    def requires(self):
        """Require the customers file ("a") and the salaries file ("b")."""
        return {
            "a": GenerateCustomers(month=self.month),
            "b": CompanyEngineerSalary(month=self.month, job=self.job),
        }

    def run(self):
        """Merge customers and salaries on ``Company`` and write the result."""
        with self.input()["a"].open() as customer_file:
            customer_df = pandas.read_csv(
                customer_file,
                names=[
                    "Name", "Address", "Birthdate", "Job", "Company", "Email"
                ],
            )

        with self.input()["b"].open() as salaries_file:
            salaries_df = pandas.read_csv(
                salaries_file,
                delimiter=",",
                names=["Position", "Company", "Salary"])

        employees_df = customer_df[['Name', 'Company']]
        employee_salaries_df = employees_df.merge(salaries_df, on="Company")

        # Write to a temporary path that is renamed into place on success, so
        # an interrupted write cannot leave a partial file that the scheduler
        # would treat as a completed output.
        with self.output().temporary_path() as temp_output_path:
            employee_salaries_df.to_csv(temp_output_path)

        print(employee_salaries_df.head())
コード例 #7
0
class FollowFilteredEdgelist(luigi.Task):
    '''Edge list with edges whose left-hand side is an unknown user removed,
    leaving only data from users that have a home location assigned.

    Args:
        --name  name used in the save path so that the LocationUserList and
                UnknownList in use can be identified
        --month
    '''
    month = luigi.MonthParameter()
    name = luigi.Parameter()
    type = luigi.ChoiceParameter(choices=['followers', 'following'])
    sources = luigi.TupleParameter(default=('followers', 'following'))

    def requires(self):
        '''Require the raw edge list and the remaining home locations.'''
        edgelist_task = TwitterFollowRawEdgelist(month=self.month,
                                                 type=self.type)
        home_location_task = RemainedHomeLocation(name=self.name,
                                                  month=self.month)
        return {'edgelist': edgelist_task, 'hl': home_location_task}

    def output(self):
        '''Return the gzipped TSV target under NETWORK_DIR/filtered/<name>.'''
        # The edge type is baked into the strftime pattern before formatting.
        file_pattern = '%Y%m_{}.tsv.gz'.format(self.type)
        file_name = self.month.strftime(file_pattern)
        return luigi.LocalTarget(
            os.path.join(NETWORK_DIR, 'filtered', self.name, file_name))

    def run(self):
        '''Pipe the edge list through the edge filter into a temporary file.'''
        command_template = 'zcat {edgelist.path} | python -m snlocest.scripts.edgefilter -i {hl.path} | gzip > {}'
        with self.output().temporary_path() as temp_output_path:
            cmd = command_template.format(temp_output_path, **self.input())
            run(cmd, shell=True, check=True)
コード例 #8
0
ファイル: parameter_test.py プロジェクト: thejens/luigi
 def testSerialize(self):
     """Check that each date-like parameter serializes to the expected string."""
     day = datetime.date(2013, 2, 3)
     expected_by_parameter = (
         (luigi.DateParameter, '2013-02-03'),
         (luigi.YearParameter, '2013'),
         (luigi.MonthParameter, '2013-02'),
     )
     for parameter_cls, expected in expected_by_parameter:
         self.assertEqual(parameter_cls().serialize(day), expected)

     moment = datetime.datetime(2013, 2, 3, 4, 5)
     self.assertEqual(luigi.DateHourParameter().serialize(moment), '2013-02-03T04')
コード例 #9
0
class WriteRollingAveragesToDB(sqla.CopyToTable):
    """Copy the 45-day rolling averages for one month into the database."""

    month = luigi.MonthParameter()

    def requires(self):
        """Require the rolling-average calculation for the same month."""
        return CalculateTripDurationRollingAverage45Days(month=self.month)

    columns = [
        (["date", sa.DATE], {"primary_key": True}),
        (["rolling_average_45d", sa.Float], {"nullable": True}),
    ]
    connection_string = settings.db.url
    table = "trip_duration_rolling_average"
    column_separator = ","

    def rows(self):
        """Yield ``(date, duration)`` rows whose month number matches ``month``.

        Falsy durations (such as an empty field) are emitted as ``None``.
        """
        wanted_month = self.month.month
        for raw_date, duration in super().rows():
            day = parse(raw_date).date()
            # Only the month number is compared, not the year.
            if day.month == wanted_month:
                yield day, (duration or None)
コード例 #10
0
class Indicator(luigi.Task):
    """Base task computing one indicator column and saving it as CSV.

    ``FN`` is called with the input dataframe to produce the column and
    ``COLUMN_NAME`` names it; both are ``None``/empty on this class.
    """

    pair = luigi.Parameter()
    exchange = luigi.Parameter()
    month = luigi.MonthParameter()
    period = luigi.Parameter(default="1d")
    destination_path = luigi.Parameter()
    FN = None
    COLUMN_NAME = ""

    def column_name(self):
        """Return the name of the column this indicator writes."""
        return self.COLUMN_NAME

    def output(self):
        """Yield the local target, remembering it on ``self.target``."""
        class_name = type(self).__name__
        params = self.to_str_params()
        params["class"] = class_name
        relative_path = hamp.path(hamp.DEFINITIONS[class_name], **params)
        self.target = luigi.LocalTarget(
            os.path.join(self.destination_path, relative_path))
        yield self.target

    def run(self):
        """Compute the indicator, keep the task's month and write it out."""
        # Uses self.target, which is only assigned while output() is iterated.
        self.target.makedirs()
        frame = hamt.input_df(self.requires())
        column = self.column_name()
        frame[column] = self.FN(frame)
        month_end = hamt.next_month(self.month, False)
        frame = frame[self.month:month_end]
        frame[[column]].to_csv(self.target.path,
                               date_format=hamt.DATE_FORMAT)
コード例 #11
0
class NYTaxiTripDurationAnalytics(luigi.WrapperTask):
    """Wrapper task requiring the daily, monthly and rolling DB writers."""

    month = luigi.MonthParameter()

    def requires(self):
        """Yield each database-writing task for the requested month."""
        writer_tasks = (
            WriteDailyAveragesToDB,
            WriteMonthlyAveragesToDB,
            WriteRollingAveragesToDB,
        )
        for writer_cls in writer_tasks:
            yield writer_cls(self.month)
コード例 #12
0
class RemainedHomeLocation(luigi.Task):
    '''Take the generated home-location data (LocationUserList), remove the
    users that became unknown, intersect it with the user list for which
    social networks were collected, and save the result.

    Args:
        --homelocation-path  path to the home-location data file
    '''
    name = luigi.Parameter()
    month = luigi.MonthParameter()
    sources = luigi.TupleParameter(default=('followers', 'following'))
    homelocation_path = luigi.Parameter()

    def requires(self):
        '''Require the unknown list, the location user list and the seed list.'''
        unknown_task = UnknownList(month=self.month, sources=self.sources)
        userlist_task = LocationUserList(path=self.homelocation_path)
        seed_task = SeedUserList(month=self.month)
        return {
            'unknown': unknown_task,
            'userlist': userlist_task,
            'seed': seed_task,
        }

    def output(self):
        '''Return a target that reuses the base name of the user-list input.'''
        userlist_file_name = os.path.basename(self.input()['userlist'].path)
        return luigi.LocalTarget(
            os.path.join('data/datasets', self.name, 'groundtruth',
                         userlist_file_name))

    def run(self):
        '''Run the two-stage edge filter pipeline into a temporary file.'''
        command_template = 'cat {userlist.path} | python -m snlocest.scripts.edgefilter -e {unknown.path} | python -m snlocest.scripts.edgefilter -i {seed.path} > {}'
        with self.output().temporary_path() as temp_output_path:
            pipeline = command_template.format(temp_output_path,
                                               **self.input())
            run(pipeline, shell=True, check=True)
コード例 #13
0
ファイル: ingest_data.py プロジェクト: iamshri8/nyc-taxi-trip
class CalculateRollingAverage(luigi.Task):
    """ Task computing the 45-day rolling average from the daily averages. """

    year_month = luigi.MonthParameter()

    def requires(self):
        """Require the task that saves the daily averages."""
        return SaveDailyAverage(self.year_month)

    def run(self):
        """Read the daily averages table and write the rolling average CSV."""
        daily_averages = pd.read_sql_table(
            "daily_average_duration",
            con=self.input().engine,
            parse_dates=["date"],
            index_col="date",
        )

        s.rolling_average_n_days(
            daily_averages, num_of_days=45).to_csv(self.output().path)

    def output(self):
        """Return the CSV target inside the ``DATA_DIR`` directory."""
        data_dir = os.getenv("DATA_DIR")
        file_name = (
            f"rolling_average_{u.year_month_to_str(self.year_month)}.csv")
        return luigi.LocalTarget(os.path.join(data_dir, file_name))
コード例 #14
0
ファイル: ingest_data.py プロジェクト: iamshri8/nyc-taxi-trip
class IngestData(luigi.WrapperTask):
    """ Entry-point wrapper task for the data pipeline. """

    year_month = luigi.MonthParameter()

    def requires(self):
        """Yield the rolling-average task for the requested month."""
        rolling_average_task = CalculateRollingAverage(self.year_month)
        yield rolling_average_task
コード例 #15
0
class DownloadDataByDate(luigi.Task):
    '''
    Download by year and month as date string (formatted YYYY-MM)
    and by taxi color (green or yellow)
    '''

    date = luigi.MonthParameter()
    taxi_color = luigi.Parameter(default='yellow')

    def output(self):
        '''Return the local target named after the taxi color and the date.'''
        return luigi.LocalTarget('tmp/taxi_{color}.{date}.csv'.format(
            date=self.date, color=self.taxi_color))

    def download(self):
        '''Fill in ``download_url`` and fetch it with wget.'''
        # The same date object is passed for both placeholders.
        url = download_url.format(color=self.taxi_color,
                                  year=self.date,
                                  month=self.date)
        # NOTE(review): wget's -P option sets a directory prefix, so this
        # probably creates a directory at the output path rather than a
        # file - confirm whether -O was intended.
        shell('wget -P {output} {url}'.format(output=self.output().path,
                                              url=url))

    def run(self):
        '''Create the parent directory and download; clean up on failure.'''
        try:
            self.output().makedirs()
            self.download()
        except Exception:
            # Remove a partial download, but only when a file is really
            # there: an unconditional os.remove raises on a missing file and
            # hides the error that actually caused the failure.
            partial_path = self.output().path
            if os.path.isfile(partial_path):
                os.remove(partial_path)
            raise
コード例 #16
0
class FollowSocialNetworks(luigi.WrapperTask):
    """Wrapper task requiring all four follow-network tasks."""

    month = luigi.MonthParameter()
    name = luigi.Parameter()

    def requires(self):
        """Return one task per network type, sharing ``month`` and ``name``."""
        shared_params = dict(month=self.month, name=self.name)
        network_tasks = (
            MutualNetwork,
            FollowerNetwork,
            FolloweeNetwork,
            LinkedNetwork,
        )
        return [network_cls(**shared_params) for network_cls in network_tasks]
コード例 #17
0
class SeedUserList(luigi.ExternalTask):
    """Externally provided user-id list, selected by ``number``."""

    month = luigi.MonthParameter()
    basedir = luigi.Parameter(default='data/twitter-following-followers-geo')
    # 2: users with at least 5 tweets in some municipality during 2014;
    # 1: those users who additionally tweeted at least 365 times in 2014.
    number = luigi.IntParameter(default=2)

    def output(self):
        """Return the ``user_id_<number>.txt`` target inside ``basedir``."""
        file_name = 'user_id_{}.txt'.format(self.number)
        return luigi.LocalTarget(os.path.join(self.basedir, file_name))
コード例 #18
0
class TwitterFollowingFollowers(luigi.ExternalTask):
    '''Relationship data for Twitter users.
    '''
    month = luigi.MonthParameter()
    type = luigi.ChoiceParameter(choices=['followers', 'following'])
    basedir = luigi.Parameter(default='data/twitter-following-followers-geo')

    def output(self):
        '''Return the monthly archive target for the chosen relation type.'''
        # The relation type is baked into the strftime pattern first.
        archive_pattern = '%Y%m-{}.tar.gz'.format(self.type)
        archive_name = self.month.strftime(archive_pattern)
        return luigi.LocalTarget(os.path.join(self.basedir, archive_name))
コード例 #19
0
        class Bar(luigi.Task):
            """Task whose completion is tracked by an in-memory flag."""

            month = luigi.MonthParameter()

            def __init__(self, *args, **kwargs):
                super(Bar, self).__init__(*args, **kwargs)
                # Not complete until run() has been called on this instance.
                self.comp = False

            def complete(self):
                """Report whether run() has been called."""
                return self.comp

            def run(self):
                """Mark the task as complete."""
                self.comp = True
コード例 #20
0
class GetNYTaxiMontlyData(luigi.Task):
    """Download one month of taxi data into the local cache directory."""

    month = luigi.MonthParameter()

    def run(self):
        """Download the month's file and move it to the output path."""
        source_url = h.get_url(self.month)
        self.output().makedirs()
        downloaded_file = h.download(source_url)
        destination = self.output()
        destination.fs.move(downloaded_file, destination.path)

    def output(self):
        """Return the cached CSV target named after the month."""
        file_name = f"taxi_data_{self.month}.csv"
        return luigi.LocalTarget(
            os.path.join(settings.local_cache_dir, file_name))
コード例 #21
0
class FingridMonthlyTask(FingridTask, luigi.Task):
    """Fingrid task covering one calendar month of a chosen measurement."""

    measurement_name = luigi.ChoiceParameter(
        choices=fingrid.MEASUREMENTS.keys())
    month = luigi.MonthParameter()

    def __init__(self, *args, **kwargs):
        """Initialise the task and pass the month's time range to ``fingrid_init``."""
        super().__init__(*args, **kwargs)

        assert self.month.day == 1
        midnight = datetime.min.time()
        month_start = datetime.combine(self.month, midnight)
        # 32 days after the 1st always falls in the following month; snapping
        # back to day 1 gives the first of next month.
        in_next_month = month_start + timedelta(days=32)
        next_month_start = in_next_month.replace(day=1)
        self.fingrid_init(month_start, next_month_start)
コード例 #22
0
ファイル: ingest_data.py プロジェクト: iamshri8/nyc-taxi-trip
class DownloadData(luigi.Task):
    """ Task for downloading the data. """

    year_month = luigi.MonthParameter()

    def output(self):
        """Return the local target at the path reported by ``u.get_local_path``."""
        local_path = u.get_local_path(self.year_month)
        return luigi.LocalTarget(local_path)

    def run(self):
        """Ensure the ``DATA_DIR`` directory exists, then download the month."""
        # exist_ok avoids the check-then-create race when several workers
        # start at once.
        os.makedirs(os.getenv("DATA_DIR"), exist_ok=True)
        # The return value was never used, so it is no longer bound.
        u.download_data(self.year_month)
コード例 #23
0
class DownloadGeolite2CityDatabase(luigi.Task):
    """Download the GeoLite2 City archive and keep a month-stamped copy of the database."""

    task_namespace = "dc"
    date = luigi.MonthParameter(default=datetime.datetime.today())

    download = "http://geolite.maxmind.com/download/geoip/database/GeoLite2-City.tar.gz"
    # The City archive unpacks into a GeoLite2-City_<date>/ directory; the
    # previous "GeoLite2-Country_*" pattern could not match it.
    match_glob = "GeoLite2-City_*/GeoLite2-City.mmdb"

    def output(self):
        """Return the local target for the dated copy of the database."""
        return luigi.LocalTarget("GeoLite2-City-%s.mmdb" % self.date)

    def run(self):
        """Download, unpack and copy the database, failing loudly on errors.

        :raises subprocess.CalledProcessError: if any command exits non-zero.
        """
        import subprocess

        # check_call raises on a non-zero exit status; os.system ignored it,
        # so a failed download or copy went unnoticed. The shell is still
        # needed to expand the glob in the cp command.
        subprocess.check_call("curl -O %s" % self.download, shell=True)
        subprocess.check_call("tar xvfz GeoLite2-City.tar.gz", shell=True)
        subprocess.check_call(
            "cp %s %s" % (self.match_glob, self.output().path), shell=True)
コード例 #24
0
class CalculateTripDurations(luigi.Task):
    """Compute trip durations from the cleaned taxi data of one month."""

    month = luigi.MonthParameter()

    def requires(self):
        """Require the cleaned taxi data for the same month."""
        return CleanUpTaxiData(month=self.month)

    def output(self):
        """Return the cached pickle target named after the month."""
        file_name = f"trip_durations_{self.month}.pickle"
        return luigi.LocalTarget(
            os.path.join(settings.local_cache_dir, file_name))

    def run(self):
        """Load the cleaned frame, compute and reindex durations, pickle them."""
        cleaned = pd.read_pickle(self.input().path)
        durations = p.calculate_durations(cleaned)
        reindexed = durations.pipe(p.reindex_on_pickup)
        reindexed.to_pickle(self.output().path)
コード例 #25
0
class CleanUpTaxiData(luigi.Task):
    """Rename columns and filter the raw monthly taxi data, then pickle it."""

    month = luigi.MonthParameter()

    def requires(self):
        """Require the raw download for the same month."""
        return GetNYTaxiMontlyData(month=self.month)

    def output(self):
        """Return the cached pickle target named after the month."""
        file_name = f"taxi_data_clean_{self.month}.pickle"
        return luigi.LocalTarget(
            os.path.join(settings.local_cache_dir, file_name))

    def run(self):
        """Load the CSV, apply the cleaning steps in order and save the result."""
        raw = p.load_csv(self.input().path)
        renamed = raw.pipe(p.rename_columns)
        cleaned = renamed.pipe(p.filter_by_month_year, self.month)
        cleaned.to_pickle(self.output().path)
コード例 #26
0
class KgdTaxPaymentsForMonth(Runner):
    """  Runner producing three files for the given month:
        the payments data in a .csv file, all processed bins in a .prs file,
        and the bins that have no payments data.
    """
    month = luigi.MonthParameter(default=previous_month(1))
    date = luigi.Parameter(default=datetime.today().replace(day=1))
    name = 'kgd_taxpayments'

    def requires(self):
        """Yield the gzip-to-FTP task covering the month's date range."""
        suffix = self.suff
        period = month_as_dates_range(self.month)
        yield GzipKgdTaxPaymentsToFtp(
            suff=suffix,
            period=period,
            struct=TaxPaymentsRow,
            **self.params
        )
コード例 #27
0
class WriteDailyAveragesToDB(sqla.CopyToTable):
    """Copy the daily average trip durations for one month into the database."""

    month = luigi.MonthParameter()

    def requires(self):
        """Require the daily-average calculation for the same month."""
        return CalculateDailyAverageTripDuration(month=self.month)

    columns = [
        (["date", sa.DATE], {"primary_key": True}),
        (["duration", sa.Float], {}),
    ]
    connection_string = settings.db.url
    table = "trip_duration_daily_average"
    column_separator = ","

    def rows(self):
        """Yield ``(date, duration)`` with the date string parsed to a date."""
        for raw_date, duration in super().rows():
            day = parse(raw_date).date()
            yield day, duration
コード例 #28
0
class CalculateMonthlyAverageTripDuration(luigi.Task):
    """Compute the monthly average trip duration and store it as CSV."""

    month = luigi.MonthParameter()

    def requires(self):
        """Require the trip durations for the same month."""
        return CalculateTripDurations(month=self.month)

    def output(self):
        """Return the cached CSV target named after the month."""
        file_name = f"trip_duration_monthly_average_{self.month}.csv"
        return luigi.LocalTarget(
            os.path.join(settings.local_cache_dir, file_name))

    def run(self):
        """Load the durations, average them per month, write a headerless CSV."""
        durations = pd.read_pickle(self.input().path)
        monthly_averages = p.monthly_average_durations(durations)
        monthly_averages.to_csv(self.output().path, header=False)
コード例 #29
0
class FolloweeNetwork(luigi.Task):
    """Decompress the master follow edge list into ``f_followee.tsv``."""

    month = luigi.MonthParameter()
    name = luigi.Parameter()

    def requires(self):
        """Require the master follow edge list."""
        return MasterFollowEdgelist(month=self.month, name=self.name)

    def output(self):
        """Return the network target under ``data/datasets/<name>/networks``."""
        networks_dir = os.path.join('data/datasets', self.name, 'networks')
        return luigi.LocalTarget(os.path.join(networks_dir, 'f_followee.tsv'))

    def run(self):
        """Write the zcat output of the input to a temporary path."""
        with self.output().temporary_path() as temp_output_path:
            cmd = 'zcat {} > {}'.format(self.input().path, temp_output_path)
            run(cmd, shell=True, check=True)
コード例 #30
0
class LinkedNetwork(luigi.Task):
    """Build ``f_linked.tsv`` with every edge present in both directions."""

    month = luigi.MonthParameter()
    name = luigi.Parameter()

    def requires(self):
        """Require the master follow edge list."""
        return MasterFollowEdgelist(month=self.month, name=self.name)

    def output(self):
        """Return the network target under ``data/datasets/<name>/networks``."""
        networks_dir = os.path.join('data/datasets', self.name, 'networks')
        return luigi.LocalTarget(os.path.join(networks_dir, 'f_linked.tsv'))

    def run(self):
        """Emit each edge in both directions, then sort and de-duplicate."""
        command_template = '''zcat %s | awk -F"\t" 'BEGIN{OFS="\t"}{print $1,$2;print $2,$1}' | LC_ALL=C sort | LC_ALL=C uniq > %s '''
        with self.output().temporary_path() as temp_output_path:
            cmd = command_template % (self.input().path, temp_output_path)
            run(cmd, shell=True, check=True)