Exemple #1
0
class DateHourParameterTask(FireflowerTask):
    """Task bounded by a ``start_datetime`` / ``end_datetime`` pair.

    A bound that is not passed as a keyword argument falls back to
    ``datetime.utcnow()`` minus one day (start) or ``datetime.utcnow()``
    (end), evaluated when the task is constructed.  Both bounds go through
    ``to_datetime(..., raise_=True)`` before the parent constructor runs.
    """

    start_datetime = luigi.DateHourParameter(default=None)
    end_datetime = luigi.DateHourParameter(default=None)

    def __init__(self, *args, **kwargs):
        # (keyword name, fallback) pairs; the fallbacks are always computed,
        # whether or not the caller supplied the keyword.
        bounds = (
            ('start_datetime', datetime.utcnow() - timedelta(1)),
            ('end_datetime', datetime.utcnow()),
        )
        chosen = [(key, kwargs.get(key, fallback)) for key, fallback in bounds]
        for key, value in chosen:
            # NOTE(review): to_datetime is defined elsewhere; raise_=True
            # presumably makes it raise on unparseable input - confirm.
            kwargs[key] = to_datetime(value, raise_=True)
        super(DateHourParameterTask, self).__init__(*args, **kwargs)

    @property
    def start_date_str(self):
        """Start bound rendered with the ``%Y-%m-%d`` format."""
        moment = self.start_datetime
        return moment.strftime('%Y-%m-%d')

    @property
    def end_date_str(self):
        """End bound rendered with the ``%Y-%m-%d`` format."""
        moment = self.end_datetime
        return moment.strftime('%Y-%m-%d')

    @property
    def start_datetime_str(self):
        """Start bound rendered with the ``%Y-%m-%d %H:00`` format."""
        moment = self.start_datetime
        return moment.strftime('%Y-%m-%d %H:00')

    @property
    def end_datetime_str(self):
        """End bound rendered with the ``%Y-%m-%d %H:00`` format."""
        moment = self.end_datetime
        return moment.strftime('%Y-%m-%d %H:00')
Exemple #2
0
    def create_validation_task(self,
                               generate_before=True,
                               tuple_output=True,
                               include_nonstate_changes=True,
                               earliest_timestamp=None,
                               expected_validation=None):
        """
        Build a CourseEnrollmentValidationTask for tests and keep it on ``self.task``.

        The interval is fixed to '2013-01-01-2014-10-10' and the output root
        to "/fake/output".  ``earliest_timestamp`` and ``expected_validation``
        are parsed as DateHour strings when truthy, otherwise None is passed.
        ``init_local()`` is called on the new task before returning.
        """
        def parse_datehour(text):
            # Falsy input (None, '') means "parameter not set".
            if not text:
                return None
            return luigi.DateHourParameter().parse(text)

        task_kwargs = dict(
            interval=luigi.DateIntervalParameter().parse('2013-01-01-2014-10-10'),
            output_root="/fake/output",
            generate_before=generate_before,
            tuple_output=tuple_output,
            include_nonstate_changes=include_nonstate_changes,
            earliest_timestamp=parse_datehour(earliest_timestamp),
            expected_validation=parse_datehour(expected_validation),
        )
        self.task = CourseEnrollmentValidationTask(**task_kwargs)
        self.task.init_local()
Exemple #3
0
class CourseEnrollmentValidationDownstreamMixin(
        EventLogSelectionDownstreamMixin, MapReduceJobTaskMixin):
    """
    Defines parameters for passing upstream to tasks that use CourseEnrollmentValidationTask.

    Parameters declared here:

    * ``output_root`` (required): URL of the path where output is written.
    * ``tuple_output``, ``generate_before``, ``include_nonstate_changes``:
      boolean flags, each defaulting to False.
    * ``earliest_timestamp``, ``expected_validation``: optional DateHour
      values, each defaulting to None.

    """
    # Location to write output.  No default is given, so it must be supplied.
    output_root = luigi.Parameter(
        description='A URL to a path where output event files will be written.',
    )

    # Flag indicating whether to output tuples (True) or synthetic events (False).
    tuple_output = luigi.BooleanParameter(
        default=False,
        description=
        'A flag indicating that output should be in the form of tuples, not events. '
        'Default is False (output is events).',
    )

    # If set, generates events that occur before the start of the specified interval.
    # Default is incremental validation (no events before the interval).
    generate_before = luigi.BooleanParameter(
        default=False,
        description=
        'A flag indicating that events should be created preceding the '
        'specified interval. Default behavior is to suppress the generation of events '
        'before the specified interval.',
    )

    # If set, events are included for transitions that don't result in a
    # change in enrollment state.  (For example, two activations in a row.)
    include_nonstate_changes = luigi.BooleanParameter(
        default=False,
        description='A flag indicating that events should be created '
        'to fix all transitions, even those that don\'t result in a change in enrollment '
        'state.  An "activate" following another "activate" is one such example. '
        'Default behavior is to skip generating events for non-state changes.',
    )

    # If set, events that would be generated before this timestamp are instead
    # assigned this timestamp.  Unset (None) by default.
    earliest_timestamp = luigi.DateHourParameter(
        default=None,
        description='A "DateHour" parameter ("yyyy-mm-ddThh"), which if set, '
        'specifies the earliest timestamp that should occur in the output.  Events '
        'that would be generated before this timestamp would instead be assigned this '
        'timestamp.  This is left unspecified by default.',
    )

    # If set, users with events before this timestamp are expected to have
    # a corresponding validation event.  Unset (None) by default.
    expected_validation = luigi.DateHourParameter(
        default=None,
        description='A "DateHour" parameter ("yyyy-mm-ddThh"), which if set, '
        'specifies a point in time where every user with events before this time '
        'should also have a corresponding validation event.  Those without such an '
        'validation event were not really created, and events should be synthesized '
        'to simulate "roll back" of the events.',
    )
Exemple #4
0
class RangeHourlyBase(RangeBase):
    """
    Produces a contiguous completed range of an hourly recurring task.

    ``start``/``stop`` bound the range explicitly; ``hours_back`` and
    ``hours_forward`` define a moving window around the current time.
    """
    start = luigi.DateHourParameter(
        default=None,
        description=(
            "beginning datehour, inclusive. Default: None - work backward forever (requires reverse=True)"
        ),
    )
    stop = luigi.DateHourParameter(
        default=None,
        description=(
            "ending datehour, exclusive. Default: None - work forward forever"
        ),
    )
    hours_back = luigi.IntParameter(
        default=100 * 24,  # slightly more than three months
        description=("extent to which contiguousness is to be assured into "
                     "past, in hours from current time. Prevents infinite "
                     "loop when start is none. If the dataset has limited "
                     "retention (i.e. old outputs get removed), this should "
                     "be set shorter to that, too, to prevent the oldest "
                     "outputs flapping. Increase freely if you intend to "
                     "process old dates - worker's memory is the limit"),
    )
    # TODO always entire interval for reprocessings (fixed start and stop)?
    hours_forward = luigi.IntParameter(
        default=0,
        description=(
            "extent to which contiguousness is to be assured into future, in hours from current time. Prevents infinite loop when stop is none"
        ),
    )

    def datetime_to_parameter(self, dt):
        """Return ``dt`` unchanged; the parameter value is the datetime itself."""
        return dt

    def parameter_to_datetime(self, p):
        """Return ``p`` unchanged; the parameter value is already a datetime."""
        return p

    def moving_start(self, now):
        """Return ``now`` shifted back by ``hours_back`` hours."""
        lookback = timedelta(hours=self.hours_back)
        return now - lookback

    def moving_stop(self, now):
        """Return ``now`` shifted forward by ``hours_forward`` hours."""
        lookahead = timedelta(hours=self.hours_forward)
        return now + lookahead

    def finite_datetimes(self, finite_start, finite_stop):
        """
        Simply returns the points in time that correspond to whole hours.

        The result contains every whole hour ``t`` with
        ``finite_start <= t < finite_stop``, in ascending order.
        """
        # Truncate the start to the top of its hour; that first candidate may
        # precede finite_start and is then filtered out below.
        hour_floor = datetime(finite_start.year, finite_start.month,
                              finite_start.day, finite_start.hour)
        whole_hours = []
        offset = 0
        while True:
            candidate = hour_floor + timedelta(hours=offset)
            if candidate >= finite_stop:
                return whole_hours
            if candidate >= finite_start:
                whole_hours.append(candidate)
            offset += 1

    def _format_datetime(self, dt):
        """Serialize ``dt`` with a fresh ``luigi.DateHourParameter``."""
        serializer = luigi.DateHourParameter()
        return serializer.serialize(dt)
Exemple #5
0
class FindAllTopPages(luigi.Task):
    """Require one FindTopPages task per (datetime, file) pair in a time window.

    The pairs come from ``WikiUtils(start_time, end_time)._url_generator()``.
    On ``run`` the task logs a completion message and touches a temporary
    marker target.
    """

    start_time = luigi.DateHourParameter()
    end_time = luigi.DateHourParameter()

    def requires(self):
        """Return the list of FindTopPages dependencies for the window."""
        wiki_util = WikiUtils(self.start_time, self.end_time)
        return [FindTopPages(dt, dump_file)
                for dt, dump_file in wiki_util._url_generator()]

    def run(self):
        """Log completion and create the (empty) marker file."""
        logging.info('Top 25 computation complete')
        self.output().open('w').close()

    def output(self):
        """Return this task's temporary marker target.

        ``LocalTarget('', is_tmp=True)`` picks a fresh random temp path each
        time it is constructed, so the target is created once per task
        instance and reused.  Otherwise ``run()`` would write one path while
        ``complete()`` checks another.
        """
        try:
            return self._marker_target
        except AttributeError:
            self._marker_target = luigi.LocalTarget('', is_tmp=True)
            return self._marker_target
Exemple #6
0
 def testSerialize(self):
     """Check the serialized text of date, year, month and date-hour parameters."""
     day = datetime.date(2013, 2, 3)
     date_cases = (
         (luigi.DateParameter, '2013-02-03'),
         (luigi.YearParameter, '2013'),
         (luigi.MonthParameter, '2013-02'),
     )
     for parameter_cls, expected in date_cases:
         self.assertEqual(parameter_cls().serialize(day), expected)
     # The minutes (5) do not appear in the expected date-hour text.
     moment = datetime.datetime(2013, 2, 3, 4, 5)
     self.assertEqual(luigi.DateHourParameter().serialize(moment), '2013-02-03T04')
Exemple #7
0
class DatabaseHourly(luigi.WrapperTask):
    """Wrapper task grouping the hourly database tasks for one ``date_hour``."""
    date_hour = luigi.DateHourParameter()

    def requires(self):
        """Yield each dependency, constructed with this task's own parameters."""
        dependencies = (InsertHourlyValues, HourlyValuesCleanup, UpdateCoinsRank)
        for task_cls in dependencies:
            yield task_cls(**self.param_kwargs)
        yield UpdateCoinsRank(**self.param_kwargs)
Exemple #8
0
class UpdateCoinsRank(DatabaseQuery):
    """Database query task that rotates and refreshes coin ranks.

    ``sql`` first yields a statement copying ``Rank`` into ``PreviousRank``
    for the whole table, then one string of per-symbol ``UPDATE`` statements
    per batch of input rows.
    """
    date_hour = luigi.DateHourParameter()
    table = _coins_table

    @property
    def sql(self):
        """Yield the SQL strings to execute, in order."""
        shift_ranks = 'UPDATE {table} SET {previous_rank}={current_rank}'.format(
            table=self.table, previous_rank='PreviousRank', current_rank='Rank')
        yield shift_ranks

        for statement_batch in self._update_current_rank:
            yield statement_batch

    def transform(self, df):
        """Keep only the rank, name and symbol columns, in that order."""
        wanted = ['rank', 'name', 'symbol']
        return df[wanted]

    @property
    def _update_current_rank(self):
        """Yield one newline-separated block of UPDATE statements per batch."""
        # NOTE(review): symbol is interpolated directly into the SQL text; a
        # quote character in a symbol would break the statement - confirm the
        # upstream data is trusted.
        row_template = "UPDATE {table} SET Rank={current_rank} WHERE Symbol='{symbol}';\n"
        batches = np.array_split(self.get_data().values, self.number_of_batches)
        for batch in batches:
            # Each row unpacks as (rank, name, symbol); the name is not used.
            yield ''.join(
                row_template.format(table=self.table, current_rank=rank, symbol=symbol)
                for rank, _, symbol in batch
            )

    def requires(self):
        """Depend on the hourly ingress for the same ``date_hour``."""
        return CryptoWatchHourlyIngress(date_hour=self.date_hour)
class TaskB(luigi.Task):
    """Task whose output path embeds ``complicator`` ahead of the date-hour."""
    dh = luigi.DateHourParameter()
    complicator = luigi.Parameter()

    def output(self):
        """Return a MockFile at ``TaskB/<complicator><YYYY-MM-DD>/<HH>``."""
        # '%%s' comes out of strftime as a literal '%s', which the second
        # formatting step then fills with the complicator.
        template = self.dh.strftime('TaskB/%%s%Y-%m-%d/%H')
        return MockFile(template % self.complicator)
class Scrapes(luigi.WrapperTask):
    """Wrapper task grouping the scrape-update tasks for one ``date_hour``."""
    date_hour = luigi.DateHourParameter()

    def requires(self):
        """Yield each scrape task, constructed with this task's own parameters."""
        scrape_tasks = (
            UpdateCoinDeskScrapes,
            UpdateAmbCryptoScrapes,
            UpdateScrapeCoinTelegraph,
        )
        for task_cls in scrape_tasks:
            yield task_cls(**self.param_kwargs)
class CommonWrapperTask(luigi.WrapperTask):
    """Wrapper requiring TaskA and TaskB for the same date-hour."""
    dh = luigi.DateHourParameter()

    def requires(self):
        """Yield TaskA, then TaskB with a fixed complicator value."""
        datehour = self.dh
        yield TaskA(dh=datehour)
        # str(self.dh) would complicate beyond working
        yield TaskB(dh=datehour, complicator='no/worries')
Exemple #12
0
class SocialHarvestTask(luigi.WrapperTask):
    """Wrapper requiring the Telegram and Twitter database-load tasks."""
    # Both defaults are evaluated once, when this class body is executed.
    date = luigi.DateParameter(default=date.today())
    hour = luigi.DateHourParameter(default=get_current_hour())
    debug = luigi.BoolParameter(default=False)

    def requires(self):
        """Yield both loader tasks with this task's date, hour and debug flag."""
        shared = dict(date=self.date, hour=self.hour, debug=self.debug)
        for task_cls in (TelegramMembersToDatabaseTask, TwitterMembersToDatabaseTask):
            yield task_cls(**shared)
class TrainAll9DNetworks(luigi.WrapperTask):
    """Wrapper requiring one TrainNarrow9DBatch per generated target-name set."""
    submit_date = luigi.DateHourParameter()

    #train_dims = luigi.ListParameter()
    #scan = luigi.DictParameter()

    def requires(self):
        """Yield a batch task for every value from ``target_names_generator()``."""
        for dims in target_names_generator():
            batch = TrainNarrow9DBatch(self.submit_date, dims)
            yield batch
Exemple #14
0
        class BulkCompleteTask(luigi.Task):
            """Task whose completeness is resolved only through ``bulk_complete``."""
            dh = luigi.DateHourParameter()

            @classmethod
            def bulk_complete(cls, parameter_tuples):
                """Report every entry except the last two as complete."""
                completed = parameter_tuples[:-2]
                return completed

            def output(self):
                """Always raise: callers must rely on ``bulk_complete`` instead."""
                raise RuntimeError("Shouldn't get called while resolving deps via bulk_complete")
Exemple #15
0
class HourlyCron(luigi.WrapperTask):
    """Top-level hourly wrapper; every dependency receives the same ``date_hour``."""
    date_hour = luigi.DateHourParameter()

    def requires(self):
        """Yield each hourly dependency, constructed with this task's parameters."""
        hourly_tasks = (
            CryptoWatchResult,
            DatabaseHourly,
            InsertCoinAggregates,
            HourlyDbAggregatesOutput,
            Scrapes,
        )
        for task_cls in hourly_tasks:
            yield task_cls(**self.param_kwargs)
Exemple #16
0
class HourlyValuesCleanup(DatabaseQuery):
    """Database query task deleting hourly values older than the retention window."""
    date_hour = luigi.DateHourParameter()

    @property
    def sql(self):
        """Build the DELETE statement for rows at or before the cutoff.

        The cutoff date is ``date_hour``'s date minus ``_window_period`` days;
        the hour bound is ``date_hour``'s hour.
        """
        # NOTE(review): both conditions are ANDed, so rows from older dates
        # with a later hour are not matched by this statement - confirm intent.
        cutoff_date = self.date_hour.date() - timedelta(days=_window_period)
        statement = (
            "DELETE FROM {table} WHERE Date <= CONVERT(DATE, '{date:%Y-%m-%d}') AND Hour <= {hour}"
        )
        return statement.format(
            table=_values_table, date=cutoff_date, hour=self.date_hour.hour)
Exemple #17
0
class DateHourTaskOk(luigi.Task):
    """Task that reports itself complete for three hard-coded hours only."""
    hour = luigi.DateHourParameter()

    def complete(self):
        """Return True when ``hour`` is one of the fixed completed hours."""
        # test against 2000.03.01T02
        finished_hours = (
            datetime.datetime(2000, 2, 29, 22),
            datetime.datetime(2000, 3, 1, 2),
            datetime.datetime(2000, 3, 1, 3),
        )
        return self.hour in finished_hours
Exemple #18
0
class DummyTask(luigi.Task):
    """Task declaring one parameter of each of several luigi parameter types.

    It defines no ``requires``, ``output`` or ``run`` of its own; only the
    parameter declarations below.
    """

    # Declaration order is significant: it is the order positional
    # arguments are matched against.
    param = luigi.Parameter()
    bool_param = luigi.BoolParameter()
    int_param = luigi.IntParameter()
    float_param = luigi.FloatParameter()
    date_param = luigi.DateParameter()
    datehour_param = luigi.DateHourParameter()
    timedelta_param = luigi.TimeDeltaParameter()
    # Declared with significant=False.
    insignificant_param = luigi.Parameter(significant=False)
Exemple #19
0
class DummyTask(luigi.Task):
    """Task declaring one parameter of each of several luigi parameter types.

    It defines no ``requires``, ``output`` or ``run`` of its own; only the
    parameter declarations below.
    """

    # Declaration order is significant: it is the order positional
    # arguments are matched against.
    param = luigi.Parameter()
    bool_param = luigi.BoolParameter()
    int_param = luigi.IntParameter()
    float_param = luigi.FloatParameter()
    date_param = luigi.DateParameter()
    datehour_param = luigi.DateHourParameter()
    timedelta_param = luigi.TimeDeltaParameter()
    # Declared with is_list=True.
    # NOTE(review): is_list looks like an older luigi keyword - confirm the
    # installed luigi version still accepts it.
    list_param = luigi.Parameter(is_list=True)
class TrainBatch(luigi.WrapperTask):
    """Wrapper requiring one TrainNN task per entry of ``settings_list``."""
    submit_date = luigi.DateHourParameter()
    train_dims = luigi.ListParameter()
    #scan = luigi.DictParameter()
    settings_list = luigi.ListParameter()

    def requires(self):
        """Validate each settings entry, then yield its TrainNN task.

        Every TrainNN receives this wrapper's ``task_id`` as third argument.
        """
        for candidate in self.settings_list:
            check_settings_dict(candidate)
            job = TrainNN(candidate, self.train_dims, self.task_id)
            yield job
class MoveWarcProxFiles(luigi.Task):
    """Move WEBRENDER ``.warc.gz`` files into their output ``warcs`` folder.

    Files are read from ``<prefix>/heritrix/wren/`` and moved to
    ``<prefix>/heritrix/output/<group 1>/<group 2>/warcs``, where the groups
    come from the file-name pattern in ``run``.  The number of files moved is
    tracked in ``total_moved`` and exposed through ``get_metrics``.
    """

    task_namespace = 'ingest'
    # The default is evaluated once, when this class body is executed.
    date = luigi.DateHourParameter(default=datetime.datetime.today())
    prefix = luigi.Parameter(default="/mnt/gluster/fc")

    total_moved = 0

    def output(self):
        """Return the DB target that records completion of this task."""
        return IngestTaskDBTarget('mv-warcprox-files', self.task_id)

    def run(self):
        """Move every matching file, then mark the task as done.

        Raises:
            Exception: if a destination folder is missing or is not a
                directory, or if the destination file already exists.
        """
        # Expected filenaming (raw string: '\-' and '\.' are regex escapes,
        # not string escapes, and are invalid in a normal string literal):
        p = re.compile(
            r"BL-....-WEBRENDER-([a-z\-0-9]+)-([0-9]{14})-([a-z\-0-9]+)\.warc\.gz"
        )
        # List all matching files in source directory:
        webrender_path = os.path.join(self.prefix, 'heritrix/wren/')
        for file_path in os.listdir(webrender_path):
            if not file_path.endswith('.warc.gz'):
                continue
            file_name = os.path.basename(file_path)
            matches = p.search(file_name)
            if not matches:
                continue

            # NOTE(review): groups 1 and 2 presumably identify the job and
            # its launch timestamp - confirm against the naming scheme.
            destination_folder_path = "%s/heritrix/output/%s/%s/warcs" % (
                self.prefix, matches.group(1), matches.group(2))
            if not os.path.exists(destination_folder_path):
                raise Exception(
                    "Expected destination folder does not exist! :: %s"
                    % destination_folder_path)
            if not os.path.isdir(destination_folder_path):
                raise Exception(
                    "Expected destination folder is not a folder! :: %s"
                    % destination_folder_path)

            source_file_path = os.path.join(webrender_path, file_name)
            destination_file_path = os.path.join(
                destination_folder_path, file_name)
            # Refuse to overwrite an existing file.
            if os.path.exists(destination_file_path):
                raise Exception(
                    "Destination file already exists! :: %s" %
                    destination_file_path)
            shutil.move(source_file_path, destination_file_path)
            self.total_moved += 1

        # Record that all went well:
        self.output().touch()

    def get_metrics(self, registry):
        # type: (CollectorRegistry) -> None
        """Register a gauge holding the number of files moved by this task."""
        g = Gauge('ukwa_files_moved_total_count',
                  'Total number of files moved by this task.',
                  labelnames=['kind'],
                  registry=registry)
        g.labels(kind='warcprox-warcs').set(self.total_moved)
class KeywordFilter(luigi.WrapperTask):
    """Wrapper forwarding all four of its parameters to SaveResult by keyword."""
    string = luigi.Parameter()
    integer = luigi.IntParameter()
    float = luigi.FloatParameter()
    datehourparam = luigi.DateHourParameter()

    def requires(self):
        """Return a SaveResult built from this task's parameter values."""
        forwarded = {
            'string': self.string,
            'integer': self.integer,
            'float': self.float,
            'datehourparam': self.datehourparam,
        }
        return SaveResult(**forwarded)
        class Bar(luigi.Task):
            """Task that tracks its own completion in an in-memory flag."""
            datehour = luigi.DateHourParameter()

            def __init__(self, *args, **kwargs):
                """Initialise the task and start out as not complete."""
                super(Bar, self).__init__(*args, **kwargs)
                self.comp = False

            def run(self):
                """Mark the task as complete."""
                self.comp = True

            def complete(self):
                """Return the flag set by ``run``."""
                finished = self.comp
                return finished
class HourlyIngress(ReadableTask):
    """Base class for hourly ingress tasks; subclasses supply ``name``."""
    date_hour = luigi.DateHourParameter()

    @abc.abstractproperty
    def name(self):
        """Name used in both the directory and file part of the output path."""
        pass

    @property
    def _out_path(self):
        """Return the output path for this task's name and ``date_hour``.

        The directory part comes from ``_hourly_output_path`` formatted with
        the name and the date; the rest is
        ``hour=<hour>/<name>.snappy.parquet``.
        """
        directory_part = _hourly_output_path.format(
            self.name, self.date_hour.date())
        file_part = 'hour={}/{}.snappy.parquet'.format(
            self.date_hour.hour, self.name)
        return directory_part + file_part
class TrainRepeatingBatch(luigi.WrapperTask):
    """Wrapper requiring ``repeat`` TrainNN tasks that share the same settings."""
    submit_date = luigi.DateHourParameter()
    train_dims = luigi.ListParameter()
    #scan = luigi.DictParameter()
    settings = luigi.DictParameter()
    repeat = luigi.IntParameter(significant=False)

    def requires(self):
        """Validate the settings once, then yield one TrainNN per repetition.

        Each TrainNN gets ``<task_id>_<index>`` as its third argument.
        """
        check_settings_dict(self.settings)
        for index in range(self.repeat):
            run_label = '_'.join((self.task_id, str(index)))
            yield TrainNN(self.settings, self.train_dims, run_label)
Exemple #26
0
        class InconsistentlyOutputtingDateHourTask(luigi.Task):
            """Task whose output shape depends on whether the hour is even or odd."""
            dh = luigi.DateHourParameter()

            def output(self):
                """Return one target for even hours, a dict of two for odd hours."""
                base = self.dh.strftime('/even/%Y%m%d%H')
                if self.dh.hour % 2 != 0:
                    return {
                        'spi': MockTarget(base + '/something.spi'),
                        'spl': MockTarget(base + '/something.spl'),
                    }
                return MockTarget(base)
Exemple #27
0
class ForecastCityWeatherCSV(luigi.Task):
    """Convert a city's forecast JSON into a CSV sorted by forecast time."""
    # The default is evaluated once, when this class body is executed.
    timestamp = luigi.DateHourParameter(default=dt.now())
    city = luigi.Parameter()

    def requires(self):
        """Depend on the JSON forecast for the same timestamp and city."""
        return ForecastCityWeatherJson(self.timestamp, self.city)

    def output(self):
        """Return the CSV target.

        The path is ``DATADIR/<city>/<year>/<MM>/<DD>/forecast/<HH>H<MM>.csv``.
        """
        stamp = self.timestamp
        ts = stamp.strftime("%HH%M")  # 16H35
        template = os.path.join(DATADIR, self.city, '{year}', '{month:02d}',
                                '{day:02d}', 'forecast', '{ts}.csv')
        csv_path = template.format(year=stamp.year,
                                   month=stamp.month,
                                   day=stamp.day,
                                   ts=ts)
        return luigi.LocalTarget(csv_path, format=UTF8)

    def run(self):
        """Read the JSON 'list' entries and write them out as CSV rows."""
        with self.input().open('r') as source:
            entries = json.load(source)['list']

        def to_record(entry):
            """Flatten a single forecast entry into one CSV row dict."""
            # A missing 'rain'/'snow' section yields None for its 3h value.
            rain = entry.get('rain', {'3h': None})
            snow = entry.get('snow', {'3h': None})
            return {
                "forecast_at": self.timestamp,
                "weather_id": entry['weather'][0]['id'],
                "weather_desc": entry['weather'][0]['main'],
                "wind_speed": entry['wind']['speed'],  # m/s unit,
                "humidity": entry['main']['humidity'],  # in %
                "temp": entry['main']['temp'],  # C unit,
                "temp_min": entry['main']['temp_min'],  # C unit
                "temp_max": entry['main']['temp_max'],  # C unit
                "pressure": entry['main']['pressure'],  # hPa unit
                "rain_3h": rain.get('3h', None),  # rain volume mm unit
                "snow_3h": snow.get('3h', None),  # snow volume
                "cloudiness": entry['clouds']['all'],  # in %
                "ts": pd.Timestamp.fromtimestamp(entry['dt'])
            }

        # Column order of the written CSV.
        columns = [
            "forecast_at", "ts", "weather_id", "weather_desc", "temp",
            "temp_min", "temp_max", "rain_3h", "snow_3h", "pressure",
            "humidity", "wind_speed", "cloudiness"
        ]
        records = [to_record(entry) for entry in entries]
        frame = pd.DataFrame(records).sort_values(by="ts")
        with self.output().open('w') as sink:
            frame[columns].to_csv(sink, index=False)
Exemple #28
0
        class BulkCompleteHourlyTask(luigi.Task):
            """Hourly task whose completeness is resolved only via ``bulk_complete``."""
            non_positional_arbitrary_argument = luigi.Parameter(
                default="whatever", positional=False, significant=False)
            dh = luigi.DateHourParameter()
            arbitrary_argument = luigi.BoolParameter()

            @classmethod
            def bulk_complete(cls, parameter_tuples):
                """Check each entry builds a task with ``arbitrary_argument`` set.

                Returns every entry except the last two.
                """
                for params in parameter_tuples:
                    task = cls(params)
                    assert task.arbitrary_argument
                return parameter_tuples[:-2]

            def output(self):
                """Always raise: callers must rely on ``bulk_complete`` instead."""
                raise RuntimeError("Shouldn't get called while resolving deps via bulk_complete")
Exemple #29
0
class ParseTelegramMemberCountTask(luigi.Task):
    """Collect Telegram member counts per name and write aggregate statistics.

    Reads (name, link) rows from the second input, resolves each link with
    ``parse_member_count`` in a process pool, then writes per-name sum, mean,
    median and count to ``Telegram_Data_<hour>.csv`` inside the folder given
    by the first input.
    """
    # Both defaults are evaluated once, when this class body is executed.
    date = luigi.DateParameter(default=date.today())
    hour = luigi.DateHourParameter(default=get_current_hour())
    limit = luigi.Parameter(default=None)  # DEBUG: REMOVE THIS!!!!

    def requires(self):
        """Depend on the date folder and on the Telegram JSON-to-CSV task."""
        return [base_pipe.CreateDateFolder(date=self.date), base_pipe.ParseTelegramJSONtoCSVTask(date=self.date)]

    def output(self):
        """Return the CSV target inside the first input's folder."""
        path = Path(str(self.input()[0].path)) / f'Telegram_Data_{self.hour}.csv'
        return luigi.LocalTarget(str(path))

    def run(self):
        """Parse the links, fetch member counts in parallel, write aggregates.

        No output file is written when no member records come back.
        """
        # A plain luigi.Parameter hands command-line values over as strings;
        # coerce to int so the comparison with the row index is valid.
        row_limit = int(self.limit) if self.limit else None

        telegram_links = []

        with self.input()[1].open('r') as f:
            reader = csv.reader(f)
            next(reader)  # skip the header row

            for i, row in enumerate(reader):
                telegram_links.append({'name': row[0], 'link': row[1]})  # IDEA: tuple
                # The row is appended before the check, so rows up to index
                # row_limit + 1 are kept (unchanged debug behaviour).
                if row_limit is not None and i > row_limit:
                    break

        max_processes = cpu_count() * 2

        with Pool(max_processes) as p:
            member_records = p.map(parse_member_count, telegram_links)

        if member_records:
            df = pd.DataFrame(member_records)
            df.set_index('name', inplace=True)

            # One grouping of the 'members' column serves all three statistics.
            members_by_name = df.groupby(df.index)['members']
            sum_series = members_by_name.sum()
            mean_series = members_by_name.mean()
            median_series = members_by_name.median()

            # Counts every remaining column per name.
            # NOTE(review): the four column names below only fit if exactly
            # one count column results - confirm the record shape.
            count_series = df.groupby(df.index).count()

            data = pd.concat([sum_series,
                              mean_series,
                              median_series,
                              count_series], axis=1)
            data.columns = ['sum', 'mean', 'median', 'link_count']
            data.dropna(inplace=True)
            data.to_csv(self.output().path)
class TwitterMembersToDatabaseTask(luigi.Task):
    """Load the parsed Twitter member-count CSV into the Twitter table.

    Completion is decided by comparing the row count of the input CSV with
    the rows the database returns for ``hour``.
    """
    # Both defaults are evaluated once, when this class body is executed.
    date = luigi.DateParameter(default=date.today())
    hour = luigi.DateHourParameter(default=get_current_hour())
    debug = luigi.BoolParameter(default=False)  # NOTE: SUPER DANGEROUS WILL SCRUB DATABASE

    def requires(self):
        """Depend on the parsed member counts for the same date and hour."""
        return ParseTwitterMemberCountTask(date=self.date, hour=self.hour)

    def run(self):
        """Create the table if needed, then insert one record per CSV row."""
        if not Twitter.table_exists():
            create_twitter_table()

        # Nothing to insert when the database already matches the CSV.
        if self.complete():
            return

        frame = pd.read_csv(self.input().path)
        frame.set_index('name', inplace=True)
        for name, row in frame.iterrows():
            Twitter.add_member_data(
                name=name,
                followers=row['followers'],
                following=row['following'],
                likes=row['likes'],
                tweets=row['tweets'],
                date=self.hour,
            )

        # TODO: Twitter LINKS, RAW DATA. rename csv files

    def complete(self):
        """Return True when the CSV and database row counts for ``hour`` match.

        With ``debug`` set, the Twitter table is cleaned first on every call.
        A missing input file or a KeyError counts as "not complete".
        """
        # TODO: Add task to create a DB/Table or
        # IDEA: Add an except for no table - create table then check databsse for complete
        if self.debug:
            clean_twitter_table()  # DEBUG: REMOVE
            print('DELETING TABLE FOR DEBUGGING!!!!!!!!!!!!!!!!!')

        try:
            csv_rows = pd.read_csv(self.input().path)
            db_rows = Twitter.data_by_date(self.hour)
            print('#' * 25)
            print(len(csv_rows))  # TODO: Logging
            print(len(db_rows))  # TODO: Logging
            # TODO: If else raise data not written to db
            counts_match = len(csv_rows.index) == len(db_rows.index)
            print(counts_match)
            print('#' * 25)
            return counts_match
        except (FileNotFoundError, KeyError):
            print()
            return False