Example #1
class JobCleanupTask(BaseTask):
    job = Parameter(visibility=ParameterVisibility.HIDDEN)
    interval = Parameter()
    start_date = DateParameter()
    owner = Parameter()
    email = Parameter()
    time = DateParameter()

    def __init__(self, *args, **kwargs):
        super(JobCleanupTask, self).__init__(*args, **kwargs)

    def run(self):
        self.log.critical('job-cleanup')
        self._completed = True
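`BaseTask` and its `complete()` logic are not part of this listing, so `_completed` only matters by assumption. A minimal sketch of the override that could consume it (hypothetical, not the actual BaseTask code):

    # hypothetical complete() that the _completed flag could back;
    # BaseTask is not shown in this listing, so this is an assumption
    def complete(self):
        return getattr(self, '_completed', False)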
Example #2
class DataPreProcessing(luigi.Task):

    dataset_version = DateParameter(default=datetime.date.today())
    dataset_name = Parameter(default="covidIT")

    def requires(self):
        return DownloadDataset(self.dataset_version, self.dataset_name)

    output_folder = os.path.join(output_dir, "processed")

    def output(self):
        return LocalTarget(
            f"{self.output_folder}/{self.dataset_name}_processed_v{self.dataset_version}.csv"
        )

    def run(self):
        df_data = pd.read_csv(self.input().path, index_col="date")
        df_data = self.preprocess_data(df_data)

        Path(self.output_folder).mkdir(parents=True, exist_ok=True)
        df_data.to_csv(self.output().path)

    def preprocess_data(self, df_data):
        df_data["diff_death"] = df_data["death"].diff()
        df_data["diff_intensive_care"] = df_data["intensive_care"].diff()
        df_data["diff_performed_tests"] = df_data["performed_tests"].diff()
        df_data["diff_recovered"] = df_data["recovered"].diff()
        df_data["ratio_molecular"] = (
            df_data["total_positives_molecular_test"] /
            df_data["swabs_test_molecular"])
        df_data["ratio_antigenic"] = (
            df_data["total_positives_antigenic_test_rapid"] /
            df_data["swabs_test_antigenic_rapid"])
        return df_data
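The covidIT examples in this listing (#2, #4, #5, #6, #10, #12, #16, #17) are excerpts and share a module-level preamble that is not shown. A plausible reconstruction, inferred from usage, with `output_dir` as an assumed constant:

# assumed preamble for the covidIT pipeline snippets; names are inferred
# from usage, and the real value of output_dir is not shown in this listing
import datetime
import os
from pathlib import Path

import luigi
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from luigi import DateParameter, IntParameter, LocalTarget, Parameter
from matplotlib.dates import DateFormatter

output_dir = "output"  # assumption; the original constant is elided

With that in place, the whole chain can be materialized with, e.g., `luigi.build([AggregateInReport()], local_scheduler=True)`.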
Example #3
    def test_use_interval_start(self):
        interval = None
        interval_start = DateParameter().parse('2013-01-01')
        LastCountryOfUser(
            interval=interval,
            interval_start=interval_start,
        )
Example #4
class DataTransform(luigi.Task):

    dataset_version = DateParameter(default=datetime.date.today())
    dataset_name = Parameter(default="covidIT")
    attribute = Parameter(default="total_positive")
    window_size = IntParameter(default=7)

    def requires(self):
        return DataPreProcessing(self.dataset_version, self.dataset_name)

    output_folder = os.path.join(output_dir, "transformed_window")

    def output(self):
        return LocalTarget(
            os.path.join(
                self.output_folder,
                f"{self.dataset_name}_transformed_window_w{self.window_size}_{self.attribute}_v{self.dataset_version}.csv",
            ))

    def run(self):
        df_data = pd.read_csv(self.input().path, index_col="date")
        df_windows = self.getXyWindow_df(df_data[self.attribute],
                                         window=self.window_size)
        df_windows = df_windows.dropna()

        Path(self.output_folder).mkdir(parents=True, exist_ok=True)
        df_windows.to_csv(self.output().path)

    def getXyWindow_df(self, d_attribute, window=7):
        attribute = d_attribute.name
        df_windows = pd.DataFrame(d_attribute)
        for i in range(1, window + 1):
            df_windows[f"v_t-{i}"] = df_windows[attribute].shift(i)
        # select the target column followed by the lagged features
        df_windows = df_windows[[attribute] +
                                [f"v_t-{i}" for i in range(1, window + 1)]]
        return df_windows
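For intuition, `getXyWindow_df` turns a series into rows of lagged values, so each row pairs a value with its `window` predecessors. A standalone toy illustration (assuming only pandas):

# toy illustration of the lagged-window layout built by getXyWindow_df
import pandas as pd

s = pd.Series([10, 11, 12, 13, 14], name="total_positive")
df = pd.DataFrame(s)
for i in range(1, 3):  # window = 2
    df[f"v_t-{i}"] = df["total_positive"].shift(i)
print(df.dropna())
#    total_positive  v_t-1  v_t-2
# 2              12   11.0   10.0
# 3              13   12.0   11.0
# 4              14   13.0   12.0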
Example #5
class Modeling(luigi.Task):

    dataset_version = DateParameter(default=datetime.date.today())
    dataset_name = Parameter(default="covidIT")
    attribute = Parameter(default="total_positive")
    window_size = IntParameter(default=7)
    model_name = Parameter(default="LR")

    def requires(self):
        return DataTransform(self.dataset_version, self.dataset_name,
                             self.attribute, self.window_size)

    output_folder = os.path.join(output_dir, "model")

    def output(self):
        return LocalTarget(
            os.path.join(
                self.output_folder,
                f"{self.dataset_name}_model_{self.attribute}_w{self.window_size}_{self.model_name}_v{self.dataset_version}.pkl",
            ))

    def run(self):
        df_windows = pd.read_csv(self.input().path, index_col="date")
        df_windows.index = pd.to_datetime(df_windows.index)
        regr = self.modeling(
            df_windows,
            self.attribute,
            regressor=self.model_name,
        )

        Path(self.output_folder).mkdir(parents=True, exist_ok=True)

        import pickle

        with open(self.output().path, "wb") as f:
            pickle.dump(regr, f)

    def modeling(self,
                 df_windows,
                 attribute,
                 regressor="LR",
                 date_end_train=None):
        from sklearn.ensemble import GradientBoostingRegressor
        from sklearn.linear_model import LinearRegression

        if regressor not in ["LR", "GBR"]:
            raise ValueError(f"unknown regressor: {regressor}")
        regr = (LinearRegression() if regressor == "LR"
                else GradientBoostingRegressor())
        if date_end_train is None:
            date_end_train = df_windows.index[-1]

        X = df_windows.drop(columns=attribute)[:date_end_train].values
        y = df_windows[attribute][:date_end_train].values
        regr.fit(X, y)
        return regr
Example #6
class DownloadDataset(luigi.Task):

    dataset_version = DateParameter(default=datetime.date.today())
    dataset_name = Parameter(default="covidIT")

    columns_ita_eng = {
        "data": "date",
        "stato": "country",
        "ricoverati_con_sintomi": "hospitalized_with_symptoms",
        "terapia_intensiva": "intensive_care",
        "totale_ospedalizzati": "total_hospitalized",
        "isolamento_domiciliare": "home_confinement",
        "totale_positivi": "total_positive",
        "variazione_totale_positivi": "total_positive_change",
        "nuovi_positivi": "new_positives",
        "dimessi_guariti": "recovered",
        "deceduti": "death",
        "casi_da_sospetto_diagnostico":
        "positive_cases_from_clinical_activity",
        "casi_da_screening": "screening_cases",
        "totale_casi": "total_cases",
        "tamponi": "performed_tests",
        "casi_testati": "total_people_tested",
        "note": "notes",
        "ingressi_terapia_intensiva": "new_entries_intensive_care",
        "note_test": "notes_tests",
        "note_casi": "notes_cases",
        "totale_positivi_test_molecolare": "total_positives_molecular_test",
        "totale_positivi_test_antigenico_rapido":
        "total_positives_antigenic_test_rapid",
        "tamponi_test_molecolare": "swabs_test_molecular",
        "tamponi_test_antigenico_rapido": "swabs_test_antigenic_rapid",
    }
    data_url = "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-andamento-nazionale/dpc-covid19-ita-andamento-nazionale.csv"
    output_folder = os.path.join(output_dir, "dataset")

    def output(self):
        return LocalTarget(
            f"{self.output_folder}/{self.dataset_name}_v{self.dataset_version}.csv"
        )

    def run(self):
        df_data = self.load_data(self.data_url,
                                 columns_new_names=self.columns_ita_eng)

        Path(self.output_folder).mkdir(parents=True, exist_ok=True)
        df_data.to_csv(self.output().path)

    def load_data(self, data_url, columns_new_names=None):
        data = pd.read_csv(data_url)
        if columns_new_names:
            data.rename(columns=columns_new_names, inplace=True)
        data["date"] = pd.to_datetime(data["date"])
        data.set_index("date", inplace=True)
        return data
Example #7
class Fetch(Task):
    from datetime import date, timedelta

    # A date is passed in as a parameter
    date = DateParameter(default=date.today())

    # PRAW works with time intervals;
    # to import a single day, data is
    # imported from day N to day N+1
    delta = timedelta(days=1)

    # The LocalTarget for the raw data;
    # the data is stored under
    # "daily/<date>/roh.csv"
    def output(self):
        prefix = self.date.strftime("%m-%d-%Y")
        return LocalTarget("daily/%s/roh.csv" % prefix)

    # The posts for one day
    # are downloaded,
    # converted to a dataframe,
    # and written to the target as CSV
    def run(self):
        start = self.date
        end = start + self.delta
        posts = self.fetch(start, end)
        frame = self.konvertiere(posts)
        self.speichern(frame, self.output())

    def fetch(self, start, end):
        import time
        import praw
        subreddits = ["datascience", "gameofthrones"]
        reddit = praw.Reddit(user_agent="test",
                             client_id="wpaIV3-b3AYOJQ",
                             client_secret="-M_LPtLCpkqlJTCyg--Rg9ePAwg")
        subreddit = reddit.subreddit('+'.join(subreddits))
        # use the passed-in date boundaries and convert them
        # to the Unix timestamps PRAW expects
        start = time.mktime(start.timetuple())
        end = time.mktime(end.timetuple())
        filtered = list(subreddit.submissions(start=start, end=end))
        return filtered
    
    def konvertiere(self, posts):
        import pandas
        dataframe = pandas.DataFrame([f.__dict__ for f in posts])[['id', 'title', 'selftext', 'subreddit']]
        return dataframe

    def speichern(self, dataframe, target):
        with target.open("w") as out:
            dataframe.to_csv(out, encoding='utf-8', index=False, sep=';')
Example #8
class FetchLichessApiJSON(Task):

    player = Parameter(default='thibault')
    perf_type = Parameter(default='blitz')
    since = DateParameter(default=datetime.today().date() - timedelta(days=1))
    single_day = BoolParameter()

    def output(self):
        import os

        file_location = (f'~/Temp/luigi/{self.since}-raw-games-'
                         f'{self.player}-{self.perf_type}-json.pckl')
        return LocalTarget(os.path.expanduser(file_location), format=Nop)

    def run(self):
        import lichess.api
        from lichess.format import JSON
        from pandas import json_normalize
        from calendar import timegm

        self.output().makedirs()

        if self.single_day:
            unix_time_until = timegm((self.since
                                      + timedelta(days=1)).timetuple())
        else:
            unix_time_until = timegm(datetime.today().date().timetuple())
        self.until = int(1000 * unix_time_until)

        unix_time_since = timegm(self.since.timetuple())
        self.since_unix = int(1000 * unix_time_since)

        token = lichess_token().token

        games = lichess.api.user_games(self.player,
                                       since=self.since_unix,
                                       until=self.until,
                                       perfType=self.perf_type,
                                       auth=token,
                                       evals='false',
                                       clocks='false',
                                       moves='false',
                                       format=JSON)

        df = json_normalize(list(games), sep='_')

        with self.output().temporary_path() as temp_output_path:
            df.to_pickle(temp_output_path, compression=None)
Example #9
class Filter(Task):
    earliest_date = DateParameter(default=date.today())

    def requires(self):
        return Scrape(self.earliest_date)

    def run(self):
        # open the saved tweets
        with self.output().open('w') as output_file:
            with self.input().open('r') as input_file:
                records = csv.DictReader(input_file)
                for row in records:
                    # filter out retweets (starting with 'RT')
                    if not row['tweet'].startswith('RT'):
                        output_file.write(row['tweet'] + '\n')

    def output(self):
        return LocalTarget('tweets_filtered_since_%s.csv' % self.earliest_date)
Example #10
class AggregateInReport(luigi.Task):

    dataset_version = DateParameter(default=datetime.date.today())
    dataset_name = Parameter(default="covidIT")
    output_folder = os.path.join(output_dir, "report_trends")

    # --> Alternative for a dynamic report
    # run as --attributes '["total_positive", "recovered", "ratio_molecular"]'
    # attributes = ListParameter(default=["total_positive", "recovered", "ratio_molecular"])
    #
    attributes = ["total_positive", "recovered", "ratio_molecular"]

    def requires(self):
        return {
            attribute: PlotTrend(self.dataset_version, self.dataset_name,
                                 attribute)
            for attribute in self.attributes
        }

    def output(self):
        return LocalTarget(
            os.path.join(
                self.output_folder,
                f"{self.dataset_name}_report_trends_v{self.dataset_version}.html",
            ))

    def run(self):
        path_by_attribute = {k: self.input()[k].path for k in self.input()}

        plots_html = self.getHTMLTrends(path_by_attribute)

        Path(self.output_folder).mkdir(parents=True, exist_ok=True)

        with open(self.output().path, "w") as fp:
            for plot_html in plots_html:
                fp.write(plot_html)

    def getHTMLTrends(self, path_by_attribute):
        plots_html = [
            f"<h2 style='text-align: center'>{k}</h2>\n<p style='text-align: center'><img src='{path_by_attribute[k]}'  style='width: 50%; height: 50%' /> </p>"
            for k in path_by_attribute
        ]
        return plots_html
Example #11
class GetBidens(Task):
    earliest_date = DateParameter(default=date.today())

    def requires(self):
        return Filter(self.earliest_date)

    def run(self):
        with self.input().open('r') as file:
            doc = nlp(file.read())

            # print all mentioned persons
            for entity in doc.ents:
                if entity.label_ == 'PERSON':
                    print(entity.text, entity.label_)

            # count the number of Bidens in the tweets
            num_bidens = len([
                e for e in doc.ents if e.label_ == 'PERSON' and (
                    'biden' in e.text.lower() or 'joe' in e.text.lower())
            ])
            print('Number of Bidens in Trump\'s tweets since ' +
                  str(self.earliest_date) + ': ' + str(num_bidens))
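`nlp` is a module-level global that this snippet does not define; judging by the `doc.ents` / `entity.label_` API it is a spaCy pipeline, loaded along these lines (an assumption):

# assumed preamble for Example #11; any English spaCy model with an
# NER component would do, the exact model name is an assumption
import spacy

nlp = spacy.load('en_core_web_sm')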
Example #12
class PlotTrend(luigi.Task):

    dataset_version = DateParameter(default=datetime.date.today())
    dataset_name = Parameter(default="covidIT")
    attribute = Parameter(default="ratio_molecular")

    def requires(self):
        return DataPreProcessing(self.dataset_version, self.dataset_name)

    output_folder = os.path.join(output_dir, "trend")

    def output(self):
        return LocalTarget(
            os.path.join(
                self.output_folder,
                f"{self.dataset_name}_trend_{self.attribute}_v{self.dataset_version}.png",
            ))

    def run(self):
        df_data = pd.read_csv(self.input().path, index_col="date")
        fig = self.plotDateTrend(df_data.index, df_data[self.attribute],
                                 self.attribute)

        Path(self.output_folder).mkdir(parents=True, exist_ok=True)
        fig.savefig(self.output().path)

    def plotDateTrend(self, x_date, y, attribute, interval=40):
        fig, ax = plt.subplots(figsize=(12, 5))
        ax.grid()
        from datetime import datetime

        x_date = [datetime.strptime(d, "%Y-%m-%d %H:%M:%S") for d in x_date]
        ax.scatter(x_date, y, s=3)
        ax.set(xlabel="Date", ylabel=attribute, title=attribute)

        date_form = DateFormatter("%d-%m")
        ax.xaxis.set_major_formatter(date_form)
        ax.xaxis.set_major_locator(mdates.DayLocator(interval=interval))
        return fig
Example #13
class Scrape(Task):
    earliest_date = DateParameter(default=date.today())

    def run(self):
        output_filename = 'tweets_since_%s.csv' % self.earliest_date

        # remove old output file if it exists
        if os.path.exists(output_filename):
            os.remove(output_filename)

        # scrape tweets
        c = twint.Config()
        c.Username = "******"
        c.Since = str(self.earliest_date)
        c.Store_csv = True
        c.Custom_csv = ["tweet"]
        # save the scraped tweets into this file
        c.Output = output_filename
        # run the actual twitter search
        twint.run.Search(c)

    def output(self):
        return LocalTarget('tweets_since_%s.csv' % self.earliest_date)
Example #14
class Clean(Task):
    from datetime import date
    import nltk
    nltk.download('punkt')
    nltk.download('stopwords')

    # A date is passed in as a parameter
    date = DateParameter(default=date.today())

    # The list of stop words
    # that get filtered out
    stoppwoerter = nltk.corpus.stopwords.words('english')

    # The tokenizer used
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

    # The stemmer for English words
    stemmer = nltk.SnowballStemmer("english")

    # The *Fetch* task is returned
    # as a dependency
    def requires(self):
        return Fetch(self.date)

    # The LocalTarget for the cleaned data;
    # the data is stored under
    # "daily/<date>/cleaned.csv"
    def output(self):
        prefix = self.date.strftime("%m-%d-%Y")
        return LocalTarget("daily/%s/cleaned.csv" % prefix)

    # The raw data is tokenized,
    # filtered through the stop-word list,
    # and reduced to word stems
    def run(self):
        csv = self.lade()
        tokenized = self.tokenize(csv)
        gefiltert = self.entferne(tokenized)
        wortstamm = self.stemme(gefiltert)
        csv["cleaned_words"] = wortstamm
        self.speichern(csv, self.output())

    def lade(self):
        import pandas
        dataset = pandas.read_csv(self.input().path, encoding='utf-8', sep=';').fillna('')
        return dataset

    def tokenize(self, csv):
        def tok(post):
            tokenized = self.tokenizer.tokenize(post["title"] + " " + post["selftext"])
            return tokenized
        tokenized = csv.apply(tok, axis=1)
        return tokenized

    def entferne(self, tokenized):
        lowercase = tokenized.apply(lambda post: [wort.lower() for wort in post])
        filtered = lowercase.apply(lambda post: [wort for wort in post if wort not in self.stoppwoerter])
        return filtered

    def stemme(self, gefiltert):
        wortstamm = gefiltert.apply(lambda post: [self.stemmer.stem(wort) for wort in post])
        wortstamm = wortstamm.apply(lambda post: " ".join(post))
        return wortstamm
    
    def speichern(self, dataframe, target):
        with target.open("w") as out:
            dataframe[["id", "cleaned_words", "subreddit"]].to_csv(out, encoding='utf-8', index=False, sep=';')
Example #15
class Classify(PySparkTask):
    from datetime import date

    date = DateParameter(default=date.today())
    version = IntParameter(default=1)

    # PySpark parameters
    driver_memory = '1g'
    executor_memory = '2g'
    executor_cores = '2'
    num_executors = '4'
    master = 'local'

    # The *Clean* and *ModelExists*
    # tasks are returned
    # as dependencies
    def requires(self):
        return [ModelExists(self.version), Clean(self.date)]

    # The LocalTarget for the classification result;
    # the data is stored under
    # "daily/<date>/ergebnis.csv"
    def output(self):
        prefix = self.date.strftime("%m-%d-%Y")
        return LocalTarget("daily/%s/ergebnis.csv" % prefix)

    def main(self, sc, *args):
        from pyspark.sql.session import SparkSession
        from pyspark.ml import PipelineModel
        from pyspark.sql.functions import when

        # Initialize the SparkSession
        sql = SparkSession.builder\
            .enableHiveSupport() \
            .config("hive.exec.dynamic.partition", "true") \
            .config("hive.exec.dynamic.partition.mode", "nonstrict") \
            .config("hive.exec.max.dynamic.partitions", "4096") \
            .getOrCreate()

        # Load the cleaned data
        df = sql.read.format("com.databricks.spark.csv") \
                     .option("delimiter", ";") \
                     .option("header", "true") \
                     .load(self.input()[1].path)

        # Load the model previously trained with SparkML
        model = PipelineModel.load(self.input()[0].path)

        # Classify one day's records with the model
        ergebnis = model.transform(df)[["id",
                                        "subreddit",
                                        "probability",
                                        "prediction"]]

        # A small touch-up of the data, since
        # class "1" is named "datascience"
        ergebnis = ergebnis.withColumn("prediction_label",
                                        when(ergebnis.prediction==1,
                                            "datascience") \
                                        .otherwise("gameofthrones"))

        # For simplicity the dataframe is
        # converted to a pandas dataframe.
        # This should be avoided for large datasets.
        with self.output().open("w") as out:
            ergebnis.toPandas().to_csv(out,
                                       encoding='utf-8',
                                       index=False,
                                       sep=';')
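Examples #7, #14 and #15 form a small Reddit pipeline (Fetch → Clean → Classify). A hedged driver for one day of data; it assumes these tasks live in an importable module and that `ModelExists(version=1)` already points at a trained PipelineModel:

# hypothetical driver for the Fetch -> Clean -> Classify chain
from datetime import date

import luigi

luigi.build([Classify(date=date.today(), version=1)],
            local_scheduler=True)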
Example #16
class PlotFutureTrend(luigi.Task):

    dataset_version = DateParameter(default=datetime.date.today())

    dataset_name = Parameter(default="covidIT")
    attribute = Parameter(default="total_positive")
    window_size = IntParameter(default=7)
    model_name = Parameter(default="LR")
    n_days_to_predict = IntParameter(default=7)

    def requires(self):
        return {
            "data_pred":
            PredictTrend(
                self.dataset_version,
                self.dataset_name,
                self.attribute,
                self.window_size,
                self.model_name,
                self.n_days_to_predict,
            ),
            "data_transformed":
            DataTransform(
                self.dataset_version,
                self.dataset_name,
                self.attribute,
                self.window_size,
            ),
        }

    output_folder = os.path.join(output_dir, "report_future_trend")

    def output(self):
        return LocalTarget(
            os.path.join(
                self.output_folder,
                f"{self.dataset_name}_future_trend_{self.attribute}_w{self.window_size}_N{self.n_days_to_predict}_{self.model_name}_v{self.dataset_version}.png",
            ))

    def run(self):
        df_windows_pred = pd.read_csv(self.input()["data_pred"].path,
                                      index_col="date")
        df_windows_pred.index = pd.to_datetime(df_windows_pred.index)
        df_date = pd.read_csv(self.input()["data_transformed"].path,
                              index_col="date")
        df_date.index = pd.to_datetime(pd.to_datetime(df_date.index).date)

        fig = self.plotEstimatedTrend(df_date, df_windows_pred, self.attribute)

        Path(self.output_folder).mkdir(parents=True, exist_ok=True)

        fig.savefig(self.output().path)

    def plotEstimatedTrend(
        self,
        df_date,
        df_windows_predicted,
        attribute,
        start_train=None,
        date_end_train=None,
        interval=40,
    ):
        import datetime

        # Starting date of the plot
        start_train = df_date.index[0] if start_train is None else start_train
        # End date of the true label/value of the plot
        date_end_train = (df_date.index[-1]
                          if date_end_train is None else date_end_train)

        start_test = date_end_train.date() + datetime.timedelta(days=1)
        if df_windows_predicted[start_test:].empty:
            raise ValueError("no predicted samples after the training window")

        fig, ax = plt.subplots(figsize=(12, 5))
        ax.grid()
        # Observed trend until training date
        x_date = df_date[start_train:date_end_train].index
        y_train = df_date[start_train:date_end_train][attribute].values
        ax.scatter(x_date, y_train, s=3, color="blue", label=attribute)
        # Predicted future trend
        ax.scatter(
            df_windows_predicted[start_test:].index,
            df_windows_predicted[start_test:][f"y_pred_{attribute}"].values,
            s=4,
            color="orange",
            label=f"{attribute} predicted",
        )
        ax.legend()
        ax.set(xlabel="Date", ylabel=attribute, title=attribute)
        date_form = DateFormatter("%d-%m")
        ax.xaxis.set_major_formatter(date_form)
        ax.xaxis.set_major_locator(mdates.DayLocator(interval=interval))
        return fig
Example #17
class PredictTrend(luigi.Task):

    dataset_version = DateParameter(default=datetime.date.today())
    dataset_name = Parameter(default="covidIT")
    attribute = Parameter(default="total_positive")
    window_size = IntParameter(default=7)
    model_name = Parameter(default="LR")
    n_days_to_predict = IntParameter(default=7)

    def requires(self):
        return {
            "model":
            Modeling(
                self.dataset_version,
                self.dataset_name,
                self.attribute,
                self.window_size,
                self.model_name,
            ),
            "data_transformed":
            DataTransform(
                self.dataset_version,
                self.dataset_name,
                self.attribute,
                self.window_size,
            ),
        }

    output_folder = os.path.join(output_dir, "prediction")

    def output(self):
        return LocalTarget(
            os.path.join(
                self.output_folder,
                f"{self.dataset_name}_prediction_{self.attribute}_w{self.window_size}_{self.model_name}_N{self.n_days_to_predict}_v{self.dataset_version}.csv",
            ))

    def run(self):
        df_date = pd.read_csv(self.input()["data_transformed"].path,
                              index_col="date")
        df_date.index = pd.to_datetime(df_date.index)

        import pickle

        with open(self.input()["model"].path, "rb") as f:
            regr = pickle.load(f)

        df_windows_pred = self.predictWindowing(
            df_date,
            self.attribute,
            regr,
            self.window_size,
            self.n_days_to_predict,
        )

        Path(self.output_folder).mkdir(parents=True, exist_ok=True)

        df_windows_pred.to_csv(self.output().path)

    def predictWindowing(self,
                         df_windows_train,
                         attribute,
                         regr,
                         window,
                         n_days_to_predict=10):

        date_end_train_date = df_windows_train.index[-1].date()
        date_previous_window = date_end_train_date - datetime.timedelta(
            days=window + 1)
        df_test_window = pd.DataFrame(
            df_windows_train[date_previous_window:date_end_train_date]
            [attribute])
        test_window = df_test_window[attribute].values
        start_i = len(test_window)
        X_test_prog = []
        y_pred_prog = []
        for i in range(start_i, start_i + n_days_to_predict):
            # X: |window| preceding samples, newest first
            X_test_i = test_window[i - window:i][::-1]
            X_test_prog.append(X_test_i)
            # y: regressor estimate given the |window| preceding samples;
            # the prediction is fed back into the window for the next day
            y_pred_i = regr.predict([X_test_i])[0]
            y_pred_prog.append(y_pred_i)
            test_window = np.append(test_window, y_pred_i)

        # Dataframe X |window| preceding samples
        df_pred = pd.DataFrame(
            X_test_prog, columns=[f"v_t-{i}" for i in range(1, window + 1)])
        # y_predicted --> y estimated by the regressor
        df_pred[f"y_pred_{attribute}"] = y_pred_prog

        # Add dates and set them as the index
        start_pred_date = date_end_train_date + datetime.timedelta(days=1)
        datelist = pd.date_range(start_pred_date, periods=n_days_to_predict)
        df_pred.set_index(datelist, inplace=True)
        df_pred.index.name = "date"

        return df_pred
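`predictWindowing` forecasts recursively: each predicted value is appended to the window and becomes an input for the next day. A self-contained toy run of the same scheme (assuming only numpy and scikit-learn):

# toy recursive multi-step forecast in the style of predictWindowing
import numpy as np
from sklearn.linear_model import LinearRegression

window, horizon = 3, 4
series = np.arange(10.0)  # a perfectly linear toy series
# lagged training rows: [v_t-1, v_t-2, v_t-3] -> v_t
X = np.array([series[i - window:i][::-1] for i in range(window, len(series))])
y = series[window:]
regr = LinearRegression().fit(X, y)

history = list(series)
for _ in range(horizon):
    x_next = history[-1:-window - 1:-1]  # last |window| values, newest first
    history.append(regr.predict([x_next])[0])
print(np.round(history[-horizon:], 2))  # -> [10. 11. 12. 13.]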
Example #18
class FetchLichessApiPGN(Task):

    player = Parameter(default='thibault')
    perf_type = Parameter(default='blitz')
    since = DateParameter(default=datetime.today().date() - timedelta(days=1))
    single_day = BoolParameter()

    def output(self):
        import os

        file_location = (f'~/Temp/luigi/{self.since}-raw-games-'
                         f'{self.player}-{self.perf_type}-pgn.pckl')
        return LocalTarget(os.path.expanduser(file_location), format=Nop)

    def run(self):
        import lichess.api
        from lichess.format import PYCHESS
        from pandas import DataFrame, read_pickle
        from calendar import timegm
        from pipeline_import.visitors import EvalsVisitor, ClocksVisitor
        from pipeline_import.visitors import QueenExchangeVisitor
        from pipeline_import.visitors import CastlingVisitor, PositionsVisitor
        from pipeline_import.visitors import PromotionsVisitor, MaterialVisitor

        self.output().makedirs()

        with self.input().open('r') as f:
            json = read_pickle(f, compression=None)
            game_count = len(json)

        if self.single_day:
            unix_time_until = timegm((self.since
                                      + timedelta(days=1)).timetuple())
        else:
            unix_time_until = timegm(datetime.today().date().timetuple())
        self.until = int(1000 * unix_time_until)

        unix_time_since = timegm(self.since.timetuple())
        self.since_unix = int(1000 * unix_time_since)

        token = lichess_token().token

        games = lichess.api.user_games(self.player,
                                       since=self.since_unix,
                                       until=self.until,
                                       perfType=self.perf_type,
                                       auth=token,
                                       clocks='true',
                                       evals='true',
                                       opening='true',
                                       format=PYCHESS)

        visitors = [EvalsVisitor,
                    ClocksVisitor,
                    QueenExchangeVisitor,
                    CastlingVisitor,
                    PromotionsVisitor,
                    PositionsVisitor,
                    MaterialVisitor,
                    ]

        header_infos = []

        counter = 0

        for game in games:
            game_infos = parse_headers(game, visitors)
            header_infos.append(game_infos)

            # progress bar stuff
            counter += 1

            current = f'{game_infos["UTCDate"]} {game_infos["UTCTime"]}'

            current_progress = counter / game_count
            self.set_status_message(f'Parsed until {current} :: '
                                    f'{counter} / {game_count}')
            self.set_progress_percentage(round(current_progress * 100, 2))

        df = DataFrame(header_infos)

        self.set_status_message('Parsed all games')
        self.set_progress_percentage(100)

        with self.output().temporary_path() as temp_output_path:
            df.to_pickle(temp_output_path, compression=None)
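`parse_headers` comes from `pipeline_import` and is not shown here. Example #19 below inlines what looks like the same logic, so a plausible reconstruction (not the library source, and the visitor-stat copying is elided) would be:

# hypothetical reconstruction of parse_headers, based on the inlined
# loop body in Example #19; the real helper also copies the attributes
# each visitor computes (evals, clocks, ...) into game_infos
def parse_headers(game, visitors):
    game_infos = dict(game.headers)
    if game.headers['Variant'] == 'From Position':
        game.headers['Variant'] = 'Standard'
    for visitor in visitors:
        game.accept(visitor(game))
    game_infos['moves'] = [x.san() for x in game.mainline()]
    return game_infos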
Example #19
class FetchLichessApiPGN(Task):

    player = Parameter(default='thibault')
    perf_type = Parameter(default='blitz')
    since = DateParameter(default=datetime.today().date() - timedelta(days=1))
    single_day = BoolParameter()

    def output(self):
        import os

        file_location = (f'~/Temp/luigi/{self.since}-raw-games-'
                         f'{self.player}-pgn.pckl')
        return LocalTarget(os.path.expanduser(file_location), format=Nop)

    def run(self):
        import lichess.api
        from lichess.format import PYCHESS
        from pandas import DataFrame, read_pickle
        from calendar import timegm
        from pipeline_import.visitors import EvalsVisitor, ClocksVisitor
        from pipeline_import.visitors import QueenExchangeVisitor
        from pipeline_import.visitors import CastlingVisitor, PositionsVisitor
        from pipeline_import.visitors import PromotionsVisitor, MaterialVisitor

        self.output().makedirs()

        with self.input().open('r') as f:
            json = read_pickle(f, compression=None)
            game_count = len(json)

        if self.single_day:
            unix_time_until = timegm((self.since
                                      + timedelta(days=1)).timetuple())
        else:
            unix_time_until = timegm(datetime.today().date().timetuple())
        self.until = int(1000 * unix_time_until)

        unix_time_since = timegm(self.since.timetuple())
        self.since_unix = int(1000 * unix_time_since)

        token = lichess_token().token

        games = lichess.api.user_games(self.player,
                                       since=self.since_unix,
                                       until=self.until,
                                       perfType=self.perf_type,
                                       auth=token,
                                       clocks='true',
                                       evals='true',
                                       opening='true',
                                       format=PYCHESS)

        visitors = [EvalsVisitor,
                    ClocksVisitor,
                    QueenExchangeVisitor,
                    CastlingVisitor,
                    PromotionsVisitor,
                    PositionsVisitor,
                    MaterialVisitor,
                    ]

        visitor_stats = {'clocks': 'clocks',
                         'evaluations': 'evals',
                         'eval_depths': 'eval_depths',
                         'queen_exchange': 'queen_exchange',
                         'castling_sides': 'castling',
                         'has_promotion': 'has_promotion',
                         'promotion_count_white': 'promotion_count_white',
                         'promotion_count_black': 'promotion_count_black',
                         'promotions_white': 'promotions_white',
                         'promotions_black': 'promotions_black',
                         'positions': 'positions',
                         'black_berserked': 'black_berserked',
                         'white_berserked': 'white_berserked',
                         'material_by_move': 'material_by_move',
                         }

        header_infos = []

        counter = 0

        for game in games:
            game_infos = dict(game.headers)
            if game.headers['Variant'] == 'From Position':
                game.headers['Variant'] = 'Standard'
            for visitor in visitors:
                game.accept(visitor(game))
            for k, v in visitor_stats.items():
                game_infos[k] = getattr(game, v)
            game_infos['moves'] = [x.san() for x in game.mainline()]
            header_infos.append(game_infos)

            # progress bar stuff
            counter += 1

            current = f'{game_infos["UTCDate"]} {game_infos["UTCTime"]}'

            current_progress = counter / game_count
            self.set_status_message(f'Parsed until {current} :: '
                                    f'{counter} / {game_count}')
            self.set_progress_percentage(round(current_progress * 100, 2))

        df = DataFrame(header_infos)

        self.set_status_message('Parsed all games')
        self.set_progress_percentage(100)

        with self.output().temporary_path() as temp_output_path:
            df.to_pickle(temp_output_path, compression=None)
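The visitor classes imported from `pipeline_import.visitors` are not part of this listing; `game.accept(visitor(game))` follows python-chess's `BaseVisitor` protocol. A minimal sketch of what one such visitor might look like (hypothetical, for illustration only):

# hypothetical visitor in the style of pipeline_import.visitors;
# python-chess calls visit_comment() once per PGN comment node
import chess.pgn

class EvalsVisitor(chess.pgn.BaseVisitor):
    def __init__(self, game):
        self.game = game
        self.game.evals = []

    def visit_comment(self, comment):
        # lichess PGN comments carry engine evals like "[%eval 0.24]"
        if '%eval' in comment:
            self.game.evals.append(comment.split('%eval')[1].strip(' ]'))

    def result(self):
        return None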