Example #1
0
class YellowTaxiDateRangeTask(luigi.WrapperTask):
    """Wrapper task spanning every month from ``start`` to ``stop`` inclusive."""

    start = luigi.MonthParameter()
    stop = luigi.MonthParameter()

    def requires(self):
        """Yield one ``CopyTaxiTripData2SQLite`` task per month in the range."""
        month = self.start
        while True:
            if month > self.stop:
                return
            yield CopyTaxiTripData2SQLite(date=month)
            # Step forward by exactly one calendar month.
            month = month + relativedelta(months=1)
Example #2
0
class PandasDFDemo(luigi.Task):
    """Render the generated customer CSV as a dataframe and save its text form.

    The customer file comes from ``GenerateCustomers`` for the same ``month``.
    """

    month: datetime.date = luigi.MonthParameter(default=datetime.date.today())
    profession: str = luigi.Parameter(default="Engineer")

    def requires(self):
        """Depend on the customer CSV for ``month``."""
        return GenerateCustomers(month=self.month)

    def output(self):
        """Return the local text target, keyed by profession and month."""
        target_path = f"data/customers#{self.profession}_{self.month}.txt"
        return luigi.LocalTarget(target_path)

    def run(self):
        """Read the customer CSV, write ``str(dataframe)`` out, and return it."""
        column_names = [
            "Name", "Address", "Birthdate", "Job", "Company", "Email"
        ]
        with self.input().open() as source:
            frame = pandas.read_csv(source, names=column_names)

        with self.output().open("w") as sink:
            sink.write(str(frame))

        return frame
Example #3
0
class NominetDomainListToHDFS(luigi.Task):
    """
    Copy the monthly Nominet domain list from its remote source onto HDFS.

    The input is the ``NominetDomainListFTP`` target for ``date``; the output
    is a month-stamped ``.csv.gz`` file under ``/1_data/nominet``.
    """
    date = luigi.MonthParameter(default=datetime.date.today())

    task_namespace = 'ingest'

    def requires(self):
        """Depend on the remote domain list for the same month."""
        return NominetDomainListFTP(date=self.date)

    def output(self):
        """Return the HDFS target named after the year and month of ``date``."""
        month_stamp = self.date.strftime('%Y%m')
        hdfs_path = "/1_data/nominet/domains.%s.csv.gz" % month_stamp
        return luigi.contrib.hdfs.HdfsTarget(path=hdfs_path,
                                             format=WebHdfsPlainFormat())

    def run(self):
        """Stream the input to the output in ``DEFAULT_BUFFER_SIZE`` chunks."""
        with self.input().open('r') as source, \
                self.output().open('w') as sink:
            logger.info("Uploading %s to %s" %
                        (self.input().path, self.output().path))
            # An empty read signals end of input.
            chunk = source.read(DEFAULT_BUFFER_SIZE)
            while chunk:
                sink.write(chunk)
                chunk = source.read(DEFAULT_BUFFER_SIZE)
Example #4
0
class NominetDomainListFTP(luigi.ExternalTask):
    """
    Monthly domain-list release on the remote SFTP service.

    NOTE: the host key must already be set up and known on the server that
    runs this task, e.g. run a one-off `ssh USER@HOST` to register the key
    when setting up a new server or when the remote server changes.

    """
    date = luigi.MonthParameter(default=datetime.date.today())

    task_namespace = 'ingest'

    def output(self):
        """
        Build the remote target for the month given by ``date``.

        :return: the month-stamped ``domains.<YYYYMM>.csv.gz`` file on
                 ``NOM_HOST``, accessed over SFTP.
        :rtype: object (:py:class:`~luigi.target.Target`)
        """
        remote_path = '/home/bl/domains.%s.csv.gz' % self.date.strftime('%Y%m')
        connection = dict(username=NOM_USER, password=NOM_PWD, sftp=True)
        return luigi.contrib.ftp.RemoteTarget(remote_path, NOM_HOST,
                                              **connection)
Example #5
0
class GenerateCustomers(luigi.Task):
    """ Write a CSV with :count: fake customer rows for :month:. """

    month: datetime.date = luigi.MonthParameter()
    count: int = luigi.IntParameter(default=10000)

    def output(self):
        """Return the local CSV target for this month."""
        csv_path = f"data/customers_{self.month}.csv"
        return luigi.LocalTarget(csv_path)

    def run(self):
        """Generate the rows with Faker and write them one by one."""
        generator = faker.Faker()

        with self.output().open("w") as sink:
            csv_writer = csv.writer(sink)
            for _row_number in range(self.count):
                # Faker calls are kept in column order.
                name = generator.name()
                # Flatten multi-line addresses onto a single line.
                address = generator.address().replace("\n", " ")
                birthdate = generator.date_between_dates(
                    date_start=datetime.date(1930, 1, 1),
                    date_end=datetime.date(2000, 1, 1),
                )
                job = generator.job()
                company = generator.company()
                email = generator.company_email()
                csv_writer.writerow(
                    [name, address, birthdate, job, company, email])
Example #6
0
class CustomerSalaries(luigi.Task):
    """ Join generated customers with company salaries and save the result.

    Inputs:
        "a": customer CSV from ``GenerateCustomers`` for ``month``.
        "b": salary CSV from ``CompanyEngineerSalary`` for ``month``/``job``.

    The output CSV holds the customers' ``Name`` and ``Company`` columns
    merged with the salary rows on ``Company``.
    """

    month: datetime.date = luigi.MonthParameter(default=datetime.date.today())
    job: str = luigi.Parameter(default="Engineer")

    def output(self):
        """Return the local CSV target, keyed by job and month."""
        return luigi.LocalTarget(f"data/salaries_{self.job}_{self.month}.csv")

    def requires(self):
        """Depend on the customer list and the salary list."""
        return {
            "a": GenerateCustomers(month=self.month),
            "b": CompanyEngineerSalary(month=self.month, job=self.job),
        }

    def run(self):
        """Load both inputs, merge them on company, and write the CSV."""
        with self.input()["a"].open() as customer_file:
            customer_df = pandas.read_csv(
                customer_file,
                names=[
                    "Name", "Address", "Birthdate", "Job", "Company", "Email"
                ],
            )

        with self.input()["b"].open() as salaries_file:
            salaries_df = pandas.read_csv(
                salaries_file,
                delimiter=",",
                names=["Position", "Company", "Salary"])

        employees_df = customer_df[['Name', 'Company']]
        employee_salaries_df = employees_df.merge(salaries_df, on="Company")

        # Write to a temporary path that is renamed into place on success, so
        # a failure mid-write never leaves a partial file at the final output
        # location (which luigi would treat as a completed task).
        with self.output().temporary_path() as temp_output_path:
            employee_salaries_df.to_csv(temp_output_path)
        print(employee_salaries_df.head())
Example #7
0
class FollowFilteredEdgelist(luigi.Task):
    '''Edge list with every edge whose left side is an unknown user removed,
    leaving only data from users that have a home location attached.

    Args:
        --name  name used in the save path so that the LocationUserList and
                UnknownList it belongs to can be identified
        --month
    '''
    month = luigi.MonthParameter()
    name = luigi.Parameter()
    type = luigi.ChoiceParameter(choices=['followers', 'following'])
    sources = luigi.TupleParameter(default=('followers', 'following'))

    def requires(self):
        '''Depend on the raw edge list and the remaining home locations.'''
        return dict(
            edgelist=TwitterFollowRawEdgelist(month=self.month,
                                              type=self.type),
            hl=RemainedHomeLocation(name=self.name, month=self.month),
        )

    def output(self):
        '''Return the gzipped TSV target under NETWORK_DIR/filtered/<name>.'''
        # The relation type is embedded in the strftime pattern itself.
        filename_pattern = '%Y%m_{}.tsv.gz'.format(self.type)
        target_path = os.path.join(NETWORK_DIR, 'filtered', self.name,
                                   self.month.strftime(filename_pattern))
        return luigi.LocalTarget(target_path)

    def run(self):
        '''Run the shell pipeline, writing to a temporary path first.'''
        pipeline = 'zcat {edgelist.path} | python -m snlocest.scripts.edgefilter -i {hl.path} | gzip > {}'
        with self.output().temporary_path() as tmp_path:
            shell_command = pipeline.format(tmp_path, **self.input())
            run(shell_command, shell=True, check=True)
Example #8
0
 def testSerialize(self):
     """Check the serialized string form of each date-like parameter type."""
     day = datetime.date(2013, 2, 3)
     expected_by_parameter = (
         (luigi.DateParameter, '2013-02-03'),
         (luigi.YearParameter, '2013'),
         (luigi.MonthParameter, '2013-02'),
     )
     for parameter_cls, expected in expected_by_parameter:
         self.assertEqual(parameter_cls().serialize(day), expected)
     moment = datetime.datetime(2013, 2, 3, 4, 5)
     self.assertEqual(luigi.DateHourParameter().serialize(moment),
                      '2013-02-03T04')
Example #9
0
class WriteRollingAveragesToDB(sqla.CopyToTable):
    """Copy the 45-day rolling averages for ``month`` into the database.

    Rows come from ``CalculateTripDurationRollingAverage45Days`` and are
    written to the ``trip_duration_rolling_average`` table.
    """
    month = luigi.MonthParameter()

    def requires(self):
        """Depend on the rolling-average calculation for the same month."""
        return CalculateTripDurationRollingAverage45Days(month=self.month)

    columns = [
        (["date", sa.DATE], {
            "primary_key": True
        }),
        (["rolling_average_45d", sa.Float], {
            "nullable": True
        }),
    ]
    connection_string = settings.db.url
    table = "trip_duration_rolling_average"
    column_separator = ","

    def rows(self):
        """Yield ``(date, duration)`` rows that fall inside ``month``.

        Dates are parsed from their string form. Rows outside the task's
        year and month are skipped, and an empty duration becomes ``None``
        (the column is nullable).
        """
        wanted = (self.month.year, self.month.month)
        for date_str, duration in super().rows():
            date = parse(date_str).date()
            # Compare year AND month: matching on the month number alone
            # would also accept the same month of a different year.
            if (date.year, date.month) != wanted:
                continue
            if not duration:
                duration = None
            yield date, duration
Example #10
0
class Indicator(luigi.Task):
    """Task that adds one computed column to its input data and saves it as CSV.

    ``FN`` and ``COLUMN_NAME`` are placeholders here (``None`` and ``""``);
    presumably subclasses override them -- confirm against the subclasses.
    """
    pair = luigi.Parameter()
    exchange = luigi.Parameter()
    month = luigi.MonthParameter()
    period = luigi.Parameter(default="1d")
    destination_path = luigi.Parameter()
    # Callable invoked by run() with the input data; its result becomes the
    # new column.
    FN = None
    # Name of the column that run() adds and exports.
    COLUMN_NAME = ""

    def column_name(self):
        """Return the name of the column this task writes."""
        return self.COLUMN_NAME

    def output(self):
        """Yield the local target for this task and store it on ``self.target``.

        The relative path is built by ``hamp.path`` from the
        ``hamp.DEFINITIONS`` entry for this class name and the task's string
        parameters (plus a ``"class"`` key), then joined onto
        ``destination_path``.
        """
        parms = self.to_str_params()
        cls = self.__class__.__name__
        parms["class"] = cls
        path = hamp.path(hamp.DEFINITIONS[cls], **parms)
        path = os.path.join(self.destination_path, path)
        # NOTE: this method is a generator, so self.target is only assigned
        # once the returned generator is iterated. run() reads self.target
        # without setting it and therefore relies on that having happened.
        self.target = luigi.LocalTarget(path)
        yield self.target

    def run(self):
        """Compute the indicator column and write it to the target as CSV."""
        self.target.makedirs()
        data = hamt.input_df(self.requires())
        name = self.column_name()
        data[name] = self.FN(data)
        next_m = hamt.next_month(self.month, False)
        # Slice from this month up to next_m; presumably the data is indexed
        # by date -- TODO confirm against hamt.input_df.
        data = data[self.month:next_m]
        # Only the computed column (plus the index) is exported.
        data[[name]].to_csv(self.target.path, date_format=hamt.DATE_FORMAT)
Example #11
0
class NYTaxiTripDurationAnalytics(luigi.WrapperTask):
    """Wrapper task requiring all three database-writing tasks for ``month``."""

    month = luigi.MonthParameter()

    def requires(self):
        """Yield the daily, monthly and rolling average writers, in that order."""
        writer_tasks = (
            WriteDailyAveragesToDB,
            WriteMonthlyAveragesToDB,
            WriteRollingAveragesToDB,
        )
        for writer_cls in writer_tasks:
            yield writer_cls(self.month)
Example #12
0
class RemainedHomeLocation(luigi.Task):
    '''Take the prepared home-location data (LocationUserList), remove the
    users that turned out to be unknown, intersect the rest with the user list
    whose social network was collected, and save the result.

    Args:
        --homelocation-path  path to the home-location data file
    '''
    name = luigi.Parameter()
    month = luigi.MonthParameter()
    sources = luigi.TupleParameter(default=('followers', 'following'))
    homelocation_path = luigi.Parameter()

    def requires(self):
        '''Depend on the unknown list, the location user list and the seeds.'''
        return dict(
            unknown=UnknownList(month=self.month, sources=self.sources),
            userlist=LocationUserList(path=self.homelocation_path),
            seed=SeedUserList(month=self.month),
        )

    def output(self):
        '''Return the target under data/datasets/<name>/groundtruth.

        The file keeps the base name of the input user list.
        '''
        userlist_name = os.path.basename(self.input()['userlist'].path)
        return luigi.LocalTarget(
            os.path.join('data/datasets', self.name, 'groundtruth',
                         userlist_name))

    def run(self):
        '''Run the two-stage filter pipeline, writing to a temporary path.'''
        pipeline = 'cat {userlist.path} | python -m snlocest.scripts.edgefilter -e {unknown.path} | python -m snlocest.scripts.edgefilter -i {seed.path} > {}'
        with self.output().temporary_path() as tmp_path:
            shell_command = pipeline.format(tmp_path, **self.input())
            run(shell_command, shell=True, check=True)
Example #13
0
class CalculateRollingAverage(luigi.Task):
    """ Compute the 45-day rolling average from the stored daily averages. """

    year_month = luigi.MonthParameter()

    def requires(self):
        """Depend on the saved daily averages for the same month."""
        return SaveDailyAverage(self.year_month)

    def output(self):
        """Return the CSV target inside the ``DATA_DIR`` directory."""
        data_dir = os.getenv("DATA_DIR")
        file_name = (
            f"rolling_average_{u.year_month_to_str(self.year_month)}.csv")
        return luigi.LocalTarget(os.path.join(data_dir, file_name))

    def run(self):
        """Read the daily averages table and write the rolling average CSV."""
        daily_averages = pd.read_sql_table(
            "daily_average_duration",
            con=self.input().engine,
            parse_dates=["date"],
            index_col="date",
        )

        result = s.rolling_average_n_days(daily_averages, num_of_days=45)
        result.to_csv(self.output().path)
Example #14
0
class IngestData(luigi.WrapperTask):
    """ Entry-point wrapper task for the data pipeline. """

    year_month = luigi.MonthParameter()

    def requires(self):
        """Yield the rolling-average task for ``year_month``."""
        final_task = CalculateRollingAverage(self.year_month)
        yield final_task
class DownloadDataByDate(luigi.Task):
    '''
    Download taxi data for one month (``date``, formatted YYYY-MM on the
    command line) and one taxi color (green or yellow).
    '''

    date = luigi.MonthParameter()
    taxi_color = luigi.Parameter(default='yellow')

    def output(self):
        '''Return the local target under ``tmp/`` for this color and date.'''
        return luigi.LocalTarget('tmp/taxi_{color}.{date}.csv'.format(
            date=self.date, color=self.taxi_color))

    def download(self):
        '''Build the download URL and fetch it with ``wget``.'''
        # The same date object is passed as both ``year`` and ``month``;
        # presumably the ``download_url`` template applies its own format
        # specs to each -- confirm against its definition.
        url = download_url.format(color=self.taxi_color,
                                  year=self.date,
                                  month=self.date)
        # NOTE(review): ``wget -P`` sets a directory prefix, so the output
        # path is used as a directory here rather than a file name -- confirm
        # this is intended (``-O`` would write to the path as a file).
        shell('wget -P {output} {url}'.format(output=self.output().path,
                                              url=url))

    def run(self):
        '''Create the parent directory and download; clean up on failure.

        Any exception from the download is re-raised after a best-effort
        removal of whatever was left at the output path.
        '''
        try:
            self.output().makedirs()
            self.download()
        except Exception:
            # Cleanup must not mask the original error: the output may not
            # exist yet, or may not be removable as a plain file.
            try:
                os.remove(self.output().path)
            except OSError:
                pass
            raise
Example #16
0
class FollowSocialNetworks(luigi.WrapperTask):
    """Wrapper task requiring all four network variants for a dataset."""

    month = luigi.MonthParameter()
    name = luigi.Parameter()

    def requires(self):
        """Return one task per network type, all sharing ``month`` and ``name``."""
        network_classes = (
            MutualNetwork,
            FollowerNetwork,
            FolloweeNetwork,
            LinkedNetwork,
        )
        required = []
        for network_cls in network_classes:
            required.append(network_cls(month=self.month, name=self.name))
        return required
Example #17
0
class SeedUserList(luigi.ExternalTask):
    """Externally provided seed user-id list, selected by ``number``."""

    month = luigi.MonthParameter()
    basedir = luigi.Parameter(default='data/twitter-following-followers-geo')
    # 2: users with 5 or more tweets in some municipality in 2014;
    # 1: those users who additionally have 365 or more tweets in 2014.
    number = luigi.IntParameter(default=2)

    def output(self):
        """Return the ``user_id_<number>.txt`` target inside ``basedir``."""
        file_name = 'user_id_{}.txt'.format(self.number)
        return luigi.LocalTarget(os.path.join(self.basedir, file_name))
Example #18
0
class TwitterFollowingFollowers(luigi.ExternalTask):
    '''Relationship data between Twitter users.
    '''
    month = luigi.MonthParameter()
    type = luigi.ChoiceParameter(choices=['followers', 'following'])
    basedir = luigi.Parameter(default='data/twitter-following-followers-geo')

    def output(self):
        '''Return the ``<YYYYMM>-<type>.tar.gz`` target inside ``basedir``.'''
        # The relation type is embedded in the strftime pattern itself.
        archive_pattern = '%Y%m-{}.tar.gz'.format(self.type)
        archive_name = self.month.strftime(archive_pattern)
        return luigi.LocalTarget(os.path.join(self.basedir, archive_name))
Example #19
0
        class Bar(luigi.Task):
            """Task whose completeness is tracked by an in-memory flag."""

            month = luigi.MonthParameter()

            def __init__(self, *args, **kwargs):
                """Initialise the task as not yet complete."""
                super().__init__(*args, **kwargs)
                self.comp = False

            def complete(self):
                """Report whether ``run`` has been called on this instance."""
                return self.comp

            def run(self):
                """Mark this instance as complete."""
                self.comp = True
Example #20
0
class GetNYTaxiMontlyData(luigi.Task):
    """Download the taxi data CSV for ``month`` into the local cache."""

    month = luigi.MonthParameter()

    def output(self):
        """Return the cached CSV target for this month."""
        file_name = f"taxi_data_{self.month}.csv"
        return luigi.LocalTarget(
            os.path.join(settings.local_cache_dir, file_name))

    def run(self):
        """Download the month's file and move it to the output path."""
        source_url = h.get_url(self.month)
        self.output().makedirs()
        downloaded_file = h.download(source_url)
        destination = self.output()
        destination.fs.move(downloaded_file, destination.path)
Example #21
0
class FingridMonthlyTask(FingridTask, luigi.Task):
    """Fingrid task covering exactly one calendar month of a measurement."""

    measurement_name = luigi.ChoiceParameter(
        choices=fingrid.MEASUREMENTS.keys())
    month = luigi.MonthParameter()

    def __init__(self, *args, **kwargs):
        """Derive the month's time window and pass it to ``fingrid_init``."""
        super().__init__(*args, **kwargs)

        assert self.month.day == 1
        midnight = datetime.min.time()
        period_start = datetime.combine(self.month, midnight)
        # 32 days after the 1st always lands in the following month; snapping
        # back to day 1 gives the first of next month.
        somewhere_next_month = period_start + timedelta(days=32)
        period_end = somewhere_next_month.replace(day=1)
        self.fingrid_init(period_start, period_end)
Example #22
0
class DownloadData(luigi.Task):
    """ Task for downloading the data for ``year_month``. """

    year_month = luigi.MonthParameter()

    def output(self):
        """Return the local target at the path given by ``u.get_local_path``."""
        local_path = u.get_local_path(self.year_month)
        return luigi.LocalTarget(local_path)

    def run(self):
        """Ensure the ``DATA_DIR`` directory exists, then download the data."""
        # exist_ok avoids the check-then-create race when several workers
        # start at the same time.
        os.makedirs(os.getenv("DATA_DIR"), exist_ok=True)
        u.download_data(self.year_month)
Example #23
0
class DownloadGeolite2CityDatabase(luigi.Task):
    """Download the GeoLite2 City archive and extract its ``.mmdb`` file.

    The extracted database is copied to a target in the working directory
    whose name includes ``date``.
    """
    task_namespace = "dc"
    date = luigi.MonthParameter(default=datetime.datetime.today())

    download = "http://geolite.maxmind.com/download/geoip/database/GeoLite2-City.tar.gz"
    # NOTE(review): the glob looks for a "GeoLite2-Country_*" directory while
    # the archive downloaded above is GeoLite2-City -- confirm this matches
    # the directory the tarball actually extracts to.
    match_glob = "GeoLite2-Country_*/GeoLite2-City.mmdb"

    def output(self):
        """Return the local ``GeoLite2-City-<date>.mmdb`` target."""
        return luigi.LocalTarget("GeoLite2-City-%s.mmdb" % self.date)

    def run(self):
        """Download, unpack and copy the database, failing loudly on error.

        Raises:
            subprocess.CalledProcessError: if any of the shell commands
                exits with a non-zero status.
        """
        # Imported locally so this block stays self-contained.
        import subprocess

        commands = (
            "curl -O %s" % self.download,
            "tar xvfz GeoLite2-City.tar.gz",
            # A shell is needed so that the glob in match_glob is expanded.
            "cp %s %s" % (self.match_glob, self.output().path),
        )
        for command in commands:
            # check=True surfaces a failed step instead of silently moving on
            # to the next one and leaving the task without an output.
            subprocess.run(command, shell=True, check=True)
Example #24
0
class CalculateTripDurations(luigi.Task):
    """Compute trip durations from the cleaned taxi data for ``month``."""

    month = luigi.MonthParameter()

    def requires(self):
        """Depend on the cleaned taxi data for the same month."""
        return CleanUpTaxiData(month=self.month)

    def output(self):
        """Return the cached pickle target for this month."""
        file_name = f"trip_durations_{self.month}.pickle"
        return luigi.LocalTarget(
            os.path.join(settings.local_cache_dir, file_name))

    def run(self):
        """Load the input pickle, compute durations, reindex, and pickle."""
        cleaned = pd.read_pickle(self.input().path)
        durations = p.calculate_durations(cleaned)
        reindexed = durations.pipe(p.reindex_on_pickup)
        reindexed.to_pickle(self.output().path)
Example #25
0
class CleanUpTaxiData(luigi.Task):
    """Load the raw monthly taxi CSV, rename columns, and filter by month."""

    month = luigi.MonthParameter()

    def requires(self):
        """Depend on the downloaded taxi data for the same month."""
        return GetNYTaxiMontlyData(month=self.month)

    def output(self):
        """Return the cached pickle target for the cleaned data."""
        file_name = f"taxi_data_clean_{self.month}.pickle"
        return luigi.LocalTarget(
            os.path.join(settings.local_cache_dir, file_name))

    def run(self):
        """Apply the cleaning steps in order and pickle the result."""
        raw = p.load_csv(self.input().path)
        renamed = raw.pipe(p.rename_columns)
        filtered = renamed.pipe(p.filter_by_month_year, self.month)
        filtered.to_pickle(self.output().path)
Example #26
0
class KgdTaxPaymentsForMonth(Runner):
    """ Runner for one month of KGD tax payments.

        As a result we get three files: payments data in a .csv file, all
        processed bins in a .prs file, and bins with no payments data for
        the given month.
    """
    month = luigi.MonthParameter(default=previous_month(1))
    date = luigi.Parameter(default=datetime.today().replace(day=1))
    name = 'kgd_taxpayments'

    def requires(self):
        """Yield the gzip-to-FTP task covering the date range of ``month``."""
        task_kwargs = dict(
            suff=self.suff,
            period=month_as_dates_range(self.month),
            struct=TaxPaymentsRow,
        )
        yield GzipKgdTaxPaymentsToFtp(**task_kwargs, **self.params)
Example #27
0
class WriteDailyAveragesToDB(sqla.CopyToTable):
    """Copy the daily average trip durations for ``month`` into the database.

    Rows are written to the ``trip_duration_daily_average`` table.
    """

    month = luigi.MonthParameter()

    connection_string = settings.db.url
    table = "trip_duration_daily_average"
    column_separator = ","
    columns = [
        (["date", sa.DATE], {"primary_key": True}),
        (["duration", sa.Float], {}),
    ]

    def requires(self):
        """Depend on the daily average calculation for the same month."""
        return CalculateDailyAverageTripDuration(month=self.month)

    def rows(self):
        """Yield ``(date, duration)`` rows with the date string parsed."""
        for raw_date, duration in super().rows():
            day = parse(raw_date).date()
            yield day, duration
Example #28
0
class CalculateMonthlyAverageTripDuration(luigi.Task):
    """Compute the monthly average trip duration for ``month``."""

    month = luigi.MonthParameter()

    def requires(self):
        """Depend on the trip durations for the same month."""
        return CalculateTripDurations(month=self.month)

    def output(self):
        """Return the cached CSV target for this month."""
        file_name = f"trip_duration_monthly_average_{self.month}.csv"
        return luigi.LocalTarget(
            os.path.join(settings.local_cache_dir, file_name))

    def run(self):
        """Load the durations, average them, and write a headerless CSV."""
        durations = pd.read_pickle(self.input().path)
        monthly_averages = p.monthly_average_durations(durations)
        monthly_averages.to_csv(self.output().path, header=False)
Example #29
0
class FolloweeNetwork(luigi.Task):
    """Followee network: the master follow edge list, decompressed as-is."""

    month = luigi.MonthParameter()
    name = luigi.Parameter()

    def requires(self):
        """Depend on the master follow edge list for this dataset."""
        return MasterFollowEdgelist(month=self.month, name=self.name)

    def output(self):
        """Return the ``f_followee.tsv`` target under the dataset's networks."""
        networks_dir = os.path.join('data/datasets', self.name, 'networks')
        return luigi.LocalTarget(os.path.join(networks_dir, 'f_followee.tsv'))

    def run(self):
        """Decompress the input with ``zcat`` into a temporary path."""
        with self.output().temporary_path() as tmp_path:
            shell_command = 'zcat {} > {}'.format(self.input().path, tmp_path)
            run(shell_command, shell=True, check=True)
Example #30
0
class LinkedNetwork(luigi.Task):
    """Linked network: every follow edge emitted in both directions, deduplicated."""

    month = luigi.MonthParameter()
    name = luigi.Parameter()

    def requires(self):
        """Depend on the master follow edge list for this dataset."""
        return MasterFollowEdgelist(month=self.month, name=self.name)

    def output(self):
        """Return the ``f_linked.tsv`` target under the dataset's networks."""
        networks_dir = os.path.join('data/datasets', self.name, 'networks')
        return luigi.LocalTarget(os.path.join(networks_dir, 'f_linked.tsv'))

    def run(self):
        """Run the zcat/awk/sort/uniq pipeline into a temporary path."""
        # awk prints each edge and its reverse; sort + uniq drop duplicates.
        template = '''zcat %s | awk -F"\t" 'BEGIN{OFS="\t"}{print $1,$2;print $2,$1}' | LC_ALL=C sort | LC_ALL=C uniq > %s '''
        with self.output().temporary_path() as tmp_path:
            shell_command = template % (self.input().path, tmp_path)
            run(shell_command, shell=True, check=True)