class JobCleanupTask(BaseTask):
    job = Parameter(visibility=ParameterVisibility.HIDDEN)
    interval = Parameter()
    start_date = DateParameter()
    owner = Parameter()
    email = Parameter()
    time = DateParameter()

    def __init__(self, *args, **kwargs):
        super(JobCleanupTask, self).__init__(*args, **kwargs)

    def run(self):
        self.log.critical('job-cleanup')
        self._completed = True
class DataPreProcessing(luigi.Task):
    dataset_version = DateParameter(default=datetime.date.today())
    dataset_name = Parameter(default="covidIT")

    def requires(self):
        return DownloadDataset(self.dataset_version, self.dataset_name)

    output_folder = os.path.join(output_dir, "processed")

    def output(self):
        return LocalTarget(
            f"{self.output_folder}/{self.dataset_name}_processed_v{self.dataset_version}.csv"
        )

    def run(self):
        df_data = pd.read_csv(self.input().path, index_col="date")
        df_data = self.preprocess_data(df_data)
        Path(self.output_folder).mkdir(parents=True, exist_ok=True)
        df_data.to_csv(self.output().path)

    def preprocess_data(self, df_data):
        df_data["diff_death"] = df_data["death"].diff()
        df_data["diff_intensive_care"] = df_data["intensive_care"].diff()
        df_data["diff_performed_tests"] = df_data["performed_tests"].diff()
        df_data["diff_recovered"] = df_data["recovered"].diff()
        df_data["ratio_molecular"] = (
            df_data["total_positives_molecular_test"]
            / df_data["swabs_test_molecular"])
        df_data["ratio_antigenic"] = (
            df_data["total_positives_antigenic_test_rapid"]
            / df_data["swabs_test_antigenic_rapid"])
        return df_data
def test_use_interval_start(self):
    interval = None
    interval_start = DateParameter().parse('2013-01-01')
    LastCountryOfUser(
        interval=interval,
        interval_start=interval_start,
    )
class DataTransform(luigi.Task):
    dataset_version = DateParameter(default=datetime.date.today())
    dataset_name = Parameter(default="covidIT")
    attribute = Parameter(default="total_positive")
    window_size = IntParameter(default=7)

    def requires(self):
        return DataPreProcessing(self.dataset_version, self.dataset_name)

    output_folder = os.path.join(output_dir, "transformed_window")

    def output(self):
        return LocalTarget(
            os.path.join(
                self.output_folder,
                f"{self.dataset_name}_transformed_window_w{self.window_size}_{self.attribute}_v{self.dataset_version}.csv",
            ))

    def run(self):
        df_data = pd.read_csv(self.input().path, index_col="date")
        df_windows = self.getXyWindow_df(df_data[self.attribute],
                                         window=self.window_size)
        df_windows = df_windows.dropna()
        Path(self.output_folder).mkdir(parents=True, exist_ok=True)
        df_windows.to_csv(self.output().path)

    def getXyWindow_df(self, d_attribute, window=7):
        attribute = d_attribute.name
        df_windows = pd.DataFrame(d_attribute)
        # One column per lagged value: v_t-1 ... v_t-window
        for i in range(1, window + 1):
            df_windows[f"v_t-{i}"] = df_windows[attribute].shift(i)
        # Keep the target column followed by the lag columns
        df_windows = df_windows[[attribute] +
                                [f"v_t-{i}" for i in range(1, window + 1)]]
        return df_windows
class Modeling(luigi.Task):
    dataset_version = DateParameter(default=datetime.date.today())
    dataset_name = Parameter(default="covidIT")
    attribute = Parameter(default="total_positive")
    window_size = IntParameter(default=7)
    model_name = Parameter(default="LR")

    def requires(self):
        return DataTransform(self.dataset_version, self.dataset_name,
                             self.attribute, self.window_size)

    output_folder = os.path.join(output_dir, "model")

    def output(self):
        return LocalTarget(
            os.path.join(
                self.output_folder,
                f"{self.dataset_name}_model_{self.attribute}_w{self.window_size}_{self.model_name}_v{self.dataset_version}.pkl",
            ))

    def run(self):
        df_windows = pd.read_csv(self.input().path, index_col="date")
        df_windows.index = pd.to_datetime(df_windows.index)
        regr = self.modeling(
            df_windows,
            self.attribute,
            regressor=self.model_name,
        )
        Path(self.output_folder).mkdir(parents=True, exist_ok=True)
        import pickle
        with open(self.output().path, "wb") as f:
            pickle.dump(regr, f)

    def modeling(self, df_windows, attribute, regressor="LR", date_end_train=None):
        from sklearn.ensemble import GradientBoostingRegressor
        from sklearn.linear_model import LinearRegression
        if regressor not in ["LR", "GBR"]:
            raise ValueError(f"Unsupported regressor: {regressor}")
        regr = (LinearRegression()
                if regressor == "LR" else GradientBoostingRegressor())
        if date_end_train is None:
            date_end_train = df_windows.index[-1]
        X = df_windows.drop(columns=attribute)[:date_end_train].values
        y = df_windows[attribute][:date_end_train].values
        regr.fit(X, y)
        return regr
class DownloadDataset(luigi.Task):
    dataset_version = DateParameter(default=datetime.date.today())
    dataset_name = Parameter(default="covidIT")

    columns_ita_eng = {
        "data": "date",
        "stato": "country",
        "ricoverati_con_sintomi": "hospitalized_with_symptoms",
        "terapia_intensiva": "intensive_care",
        "totale_ospedalizzati": "total_hospitalized",
        "isolamento_domiciliare": "home_confinement",
        "totale_positivi": "total_positive",
        "variazione_totale_positivi": "total_positive_change",
        "nuovi_positivi": "new_positives",
        "dimessi_guariti": "recovered",
        "deceduti": "death",
        "casi_da_sospetto_diagnostico": "positive_cases_from_clinical_activity",
        "casi_da_screening": "screening_cases",
        "totale_casi": "total_cases",
        "tamponi": "performed_tests",
        "casi_testati": "total_people_tested",
        "note": "notes",
        "ingressi_terapia_intensiva": "new_entries_intensive_care",
        "note_test": "notes_tests",
        "note_casi": "notes_cases",
        "totale_positivi_test_molecolare": "total_positives_molecular_test",
        "totale_positivi_test_antigenico_rapido": "total_positives_antigenic_test_rapid",
        "tamponi_test_molecolare": "swabs_test_molecular",
        "tamponi_test_antigenico_rapido": "swabs_test_antigenic_rapid",
    }

    data_url = "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-andamento-nazionale/dpc-covid19-ita-andamento-nazionale.csv"
    output_folder = os.path.join(output_dir, "dataset")

    def output(self):
        return LocalTarget(
            f"{self.output_folder}/{self.dataset_name}_v{self.dataset_version}.csv"
        )

    def run(self):
        df_data = self.load_data(self.data_url,
                                 columns_new_names=self.columns_ita_eng)
        Path(self.output_folder).mkdir(parents=True, exist_ok=True)
        df_data.to_csv(self.output().path)

    def load_data(self, data_url, columns_new_names=None):
        data = pd.read_csv(data_url)
        if columns_new_names:
            data.rename(columns=columns_new_names, inplace=True)
        data["date"] = pd.to_datetime(data["date"])
        data.set_index("date", inplace=True)
        return data
class Fetch(Task):
    from datetime import date, timedelta

    # A date is passed as a parameter
    date = DateParameter(default=date.today())

    # PRAW works with time intervals.
    # To import a single day, data is
    # imported from day N to day N+1.
    delta = timedelta(days=1)

    # The LocalTarget for the raw data.
    # The data is stored under
    # "daily/<date>/roh.csv"
    def output(self):
        prefix = self.date.strftime("%m-%d-%Y")
        return LocalTarget("daily/%s/roh.csv" % prefix)

    # The posts for one day are downloaded,
    # converted into a dataframe,
    # and written to the target as CSV
    def run(self):
        start = self.date
        end = start + self.delta
        posts = self.fetch(start, end)
        frame = self.konvertiere(posts)
        self.speichern(frame, self.output())

    def fetch(self, start, end):
        import time
        import praw
        subreddits = ["datascience", "gameofthrones"]
        reddit = praw.Reddit(user_agent="test",
                             client_id="wpaIV3-b3AYOJQ",
                             client_secret="-M_LPtLCpkqlJTCyg--Rg9ePAwg")
        subreddits = '+'.join(subreddits)
        subreddit = reddit.subreddit(subreddits)
        start = time.mktime(self.date.timetuple())
        end = self.date + self.delta
        end = time.mktime(end.timetuple())
        filtered = list(subreddit.submissions(start=start, end=end))
        return filtered

    def konvertiere(self, posts):
        import pandas
        dataframe = pandas.DataFrame([f.__dict__ for f in posts])[['id', 'title', 'selftext', 'subreddit']]
        return dataframe

    def speichern(self, dataframe, target):
        with target.open("w") as out:
            dataframe.to_csv(out, encoding='utf-8', index=False, sep=';')
class FetchLichessApiJSON(Task):
    player = Parameter(default='thibault')
    perf_type = Parameter(default='blitz')
    since = DateParameter(default=datetime.today().date() - timedelta(days=1))
    single_day = BoolParameter()

    def output(self):
        import os
        file_location = (f'~/Temp/luigi/{self.since}-raw-games-'
                         f'{self.player}-{self.perf_type}-json.pckl')
        return LocalTarget(os.path.expanduser(file_location), format=Nop)

    def run(self):
        import lichess.api
        from lichess.format import JSON
        from pandas import json_normalize
        from calendar import timegm

        self.output().makedirs()

        if self.single_day:
            unix_time_until = timegm((self.since + timedelta(days=1)).timetuple())
        else:
            unix_time_until = timegm(datetime.today().date().timetuple())
        self.until = int(1000 * unix_time_until)

        unix_time_since = timegm(self.since.timetuple())
        self.since_unix = int(1000 * unix_time_since)

        token = lichess_token().token

        games = lichess.api.user_games(self.player,
                                       since=self.since_unix,
                                       until=self.until,
                                       perfType=self.perf_type,
                                       auth=token,
                                       evals='false',
                                       clocks='false',
                                       moves='false',
                                       format=JSON)

        df = json_normalize([game for game in games], sep='_')

        with self.output().temporary_path() as temp_output_path:
            df.to_pickle(temp_output_path, compression=None)
class Filter(Task):
    earliest_date = DateParameter(default=date.today())

    def requires(self):
        return Scrape(self.earliest_date)

    def run(self):
        # open the saved tweets
        with self.output().open('w') as output_file:
            with self.input().open('r') as input_file:
                records = csv.DictReader(input_file)
                for row in records:
                    # filter out retweets (starting with 'RT')
                    if not row['tweet'].startswith('RT'):
                        output_file.write(row['tweet'])

    def output(self):
        return LocalTarget('tweets_filtered_since_%s.csv' % self.earliest_date)
class AggregateInReport(luigi.Task):
    dataset_version = DateParameter(default=datetime.date.today())
    dataset_name = Parameter(default="covidIT")
    output_folder = os.path.join(output_dir, "report_trends")

    # --> Alternative for a dynamic report:
    # run as --attributes '["total_positive", "recovered", "ratio_molecular"]'
    # attributes = ListParameter(default=["total_positive", "recovered", "ratio_molecular"])
    attributes = ["total_positive", "recovered", "ratio_molecular"]

    def requires(self):
        return {
            attribute: PlotTrend(self.dataset_version, self.dataset_name,
                                 attribute)
            for attribute in self.attributes
        }

    def output(self):
        return LocalTarget(
            os.path.join(
                self.output_folder,
                f"{self.dataset_name}_report_trends_v{self.dataset_version}.html",
            ))

    def run(self):
        path_by_attribute = {k: self.input()[k].path for k in self.input()}
        plots_html = self.getHTMLTrends(path_by_attribute)
        Path(self.output_folder).mkdir(parents=True, exist_ok=True)
        with open(self.output().path, "w") as fp:
            for plot_html in plots_html:
                fp.write(plot_html)

    def getHTMLTrends(self, path_by_attribute):
        plots_html = [
            f"<h2 style='text-align: center'>{k}</h2>\n<p style='text-align: center'><img src='{path_by_attribute[k]}' style='width: 50%; height: 50%' /> </p>"
            for k in path_by_attribute
        ]
        return plots_html
class GetBidens(Task):
    earliest_date = DateParameter(default=date.today())

    def requires(self):
        return Filter(self.earliest_date)

    def run(self):
        with self.input().open('r') as file:
            doc = nlp(file.read())

        # print all mentioned persons
        for entity in doc.ents:
            if entity.label_ == 'PERSON':
                print(entity.text, entity.label_)

        # count the number of Bidens in the tweets
        num_bidens = len([
            e for e in doc.ents if e.label_ == 'PERSON' and (
                'biden' in e.text.lower() or 'joe' in e.text.lower())
        ])
        print('Number of Bidens in Trump\'s tweets since ' +
              str(self.earliest_date) + ': ' + str(num_bidens))
class PlotTrend(luigi.Task):
    dataset_version = DateParameter(default=datetime.date.today())
    dataset_name = Parameter(default="covidIT")
    attribute = Parameter(default="ratio_molecular")

    def requires(self):
        return DataPreProcessing(self.dataset_version, self.dataset_name)

    output_folder = os.path.join(output_dir, "trend")

    def output(self):
        return LocalTarget(
            os.path.join(
                self.output_folder,
                f"{self.dataset_name}_trend_{self.attribute}_v{self.dataset_version}.png",
            ))

    def run(self):
        df_data = pd.read_csv(self.input().path, index_col="date")
        fig = self.plotDateTrend(df_data.index, df_data[self.attribute],
                                 self.attribute)
        Path(self.output_folder).mkdir(parents=True, exist_ok=True)
        fig.savefig(self.output().path)

    def plotDateTrend(self, x_date, y, attribute, interval=40):
        fig, ax = plt.subplots(figsize=(12, 5))
        ax.grid()
        from datetime import datetime
        x_date = [datetime.strptime(d, "%Y-%m-%d %H:%M:%S") for d in x_date]
        ax.scatter(x_date, y, s=3)
        ax.set(xlabel="Date", ylabel=attribute, title=attribute)
        date_form = DateFormatter("%d-%m")
        ax.xaxis.set_major_formatter(date_form)
        ax.xaxis.set_major_locator(mdates.DayLocator(interval=interval))
        return fig
class Scrape(Task):
    earliest_date = DateParameter(default=date.today())

    def run(self):
        output_filename = 'tweets_since_%s.csv' % self.earliest_date

        # remove old output file if it exists
        if os.path.exists(output_filename):
            os.remove(output_filename)

        # scrape tweets
        c = twint.Config()
        c.Username = "******"
        c.Since = str(self.earliest_date)
        c.Store_csv = True
        c.Custom_csv = ["tweet"]
        # save the scraped tweets into this file
        c.Output = output_filename
        # run the actual twitter search
        twint.run.Search(c)

    def output(self):
        return LocalTarget('tweets_since_%s.csv' % self.earliest_date)
class Clean(Task):
    from datetime import date
    import nltk
    nltk.download('punkt')
    nltk.download('stopwords')

    # A date is passed as a parameter
    date = DateParameter(default=date.today())

    # The list of stop words
    # that are filtered out
    stoppwoerter = nltk.corpus.stopwords.words('english')
    # The tokenizer that is used
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    # The stemmer for English words
    stemmer = nltk.SnowballStemmer("english")

    # The task *Fetch* is returned
    # as a dependency
    def requires(self):
        return Fetch(self.date)

    # The LocalTarget for the cleaned data.
    # The data is stored under
    # "daily/<date>/cleaned.csv"
    def output(self):
        prefix = self.date.strftime("%m-%d-%Y")
        return LocalTarget("daily/%s/cleaned.csv" % prefix)

    # The raw data is tokenized,
    # filtered through the stop-word list,
    # and reduced to its word stems
    def run(self):
        csv = self.lade()
        tokenized = self.tokenize(csv)
        gefiltert = self.entferne(tokenized)
        wortstamm = self.stemme(gefiltert)
        csv["cleaned_words"] = wortstamm
        self.speichern(csv, self.output())

    def lade(self):
        import pandas
        dataset = pandas.read_csv(self.input().path, encoding='utf-8', sep=';').fillna('')
        return dataset

    def tokenize(self, csv):
        def tok(post):
            tokenized = self.tokenizer.tokenize(post["title"] + " " + post["selftext"])
            return tokenized
        tokenized = csv.apply(tok, axis=1)
        return tokenized

    def entferne(self, tokenized):
        lowercase = tokenized.apply(lambda post: [wort.lower() for wort in post])
        filtered = lowercase.apply(lambda post: [wort for wort in post if wort not in self.stoppwoerter])
        return filtered

    def stemme(self, gefiltert):
        wortstamm = gefiltert.apply(lambda post: [self.stemmer.stem(wort) for wort in post])
        wortstamm = wortstamm.apply(lambda post: " ".join(post))
        return wortstamm

    def speichern(self, dataframe, target):
        with target.open("w") as out:
            dataframe[["id", "cleaned_words", "subreddit"]].to_csv(out, encoding='utf-8', index=False, sep=';')
class Classify(PySparkTask):
    from datetime import date

    date = DateParameter(default=date.today())
    version = IntParameter(default=1)

    # PySpark parameters
    driver_memory = '1g'
    executor_memory = '2g'
    executor_cores = '2'
    num_executors = '4'
    master = 'local'

    # The tasks *Clean* and *ModelExists*
    # are returned as dependencies
    def requires(self):
        return [ModelExists(self.version), Clean(self.date)]

    # The LocalTarget for the classification.
    # The data is stored under
    # "daily/<date>/ergebnis.csv"
    def output(self):
        prefix = self.date.strftime("%m-%d-%Y")
        return LocalTarget("daily/%s/ergebnis.csv" % prefix)

    def main(self, sc, *args):
        from pyspark.sql.session import SparkSession
        from pyspark.ml import PipelineModel
        from pyspark.sql.functions import when

        # Initialize the SQLContext
        sql = SparkSession.builder\
            .enableHiveSupport() \
            .config("hive.exec.dynamic.partition", "true") \
            .config("hive.exec.dynamic.partition.mode", "nonstrict") \
            .config("hive.exec.max.dynamic.partitions", "4096") \
            .getOrCreate()

        # Load the cleaned data
        df = sql.read.format("com.databricks.spark.csv") \
            .option("delimiter", ";") \
            .option("header", "true") \
            .load(self.input()[1].path)

        # Load the model that was previously trained with SparkML
        model = PipelineModel.load(self.input()[0].path)

        # Classify one day's records with the model
        ergebnis = model.transform(df)[["id", "subreddit", "probability", "prediction"]]

        # A small bit of post-processing, since
        # class "1" is named "datascience"
        ergebnis = ergebnis.withColumn("prediction_label",
                                       when(ergebnis.prediction == 1, "datascience") \
                                       .otherwise("gameofthrones"))

        # For simplicity, the dataframe is converted
        # into a pandas dataframe.
        # This should be avoided for large amounts of data.
        with self.output().open("w") as out:
            ergebnis.toPandas().to_csv(out, encoding='utf-8', index=False, sep=';')
class PlotFutureTrend(luigi.Task):
    dataset_version = DateParameter(default=datetime.date.today())
    dataset_name = Parameter(default="covidIT")
    attribute = Parameter(default="total_positive")
    window_size = IntParameter(default=7)
    model_name = Parameter(default="LR")
    n_days_to_predict = IntParameter(default=7)

    def requires(self):
        return {
            "data_pred": PredictTrend(
                self.dataset_version,
                self.dataset_name,
                self.attribute,
                self.window_size,
                self.model_name,
                self.n_days_to_predict,
            ),
            "data_transformed": DataTransform(
                self.dataset_version,
                self.dataset_name,
                self.attribute,
                self.window_size,
            ),
        }

    output_folder = os.path.join(output_dir, "report_future_trend")

    def output(self):
        return LocalTarget(
            os.path.join(
                self.output_folder,
                f"{self.dataset_name}_future_trend_{self.attribute}_w{self.window_size}_N{self.n_days_to_predict}_{self.model_name}_v{self.dataset_version}.png",
            ))

    def run(self):
        df_windows_pred = pd.read_csv(self.input()["data_pred"].path,
                                      index_col="date")
        df_windows_pred.index = pd.to_datetime(df_windows_pred.index)
        df_date = pd.read_csv(self.input()["data_transformed"].path,
                              index_col="date")
        df_date.index = pd.to_datetime(pd.to_datetime(df_date.index).date)
        fig = self.plotEstimatedTrend(df_date, df_windows_pred, self.attribute)
        Path(self.output_folder).mkdir(parents=True, exist_ok=True)
        fig.savefig(self.output().path)

    def plotEstimatedTrend(
        self,
        df_date,
        df_windows_predicted,
        attribute,
        start_train=None,
        date_end_train=None,
        interval=40,
    ):
        import datetime

        # Starting date of the plot
        start_train = df_date.index[0] if start_train is None else start_train
        # End date of the true label/value of the plot
        date_end_train = (df_date.index[-1]
                          if date_end_train is None else date_end_train)
        start_test = date_end_train.date() + datetime.timedelta(days=1)
        if df_windows_predicted[start_test:].empty:
            # TODO
            raise ValueError
        fig, ax = plt.subplots(figsize=(12, 5))
        ax.grid()
        # Observed trend until training date
        x_date = df_date[start_train:date_end_train].index
        y_train = df_date[start_train:date_end_train][attribute].values
        ax.scatter(x_date, y_train, s=3, color="blue", label=attribute)
        # Predicted future trend
        ax.scatter(
            df_windows_predicted[start_test:].index,
            df_windows_predicted[start_test:][f"y_pred_{attribute}"].values,
            s=4,
            color="orange",
            label=f"{attribute} predicted",
        )
        ax.legend()
        ax.set(xlabel="Date", ylabel=attribute, title=attribute)
        date_form = DateFormatter("%d-%m")
        ax.xaxis.set_major_formatter(date_form)
        ax.xaxis.set_major_locator(mdates.DayLocator(interval=interval))
        return fig
class PredictTrend(luigi.Task):
    dataset_version = DateParameter(default=datetime.date.today())
    dataset_name = Parameter(default="covidIT")
    attribute = Parameter(default="total_positive")
    window_size = IntParameter(default=7)
    model_name = Parameter(default="LR")
    n_days_to_predict = IntParameter(default=7)

    def requires(self):
        return {
            "model": Modeling(
                self.dataset_version,
                self.dataset_name,
                self.attribute,
                self.window_size,
                self.model_name,
            ),
            "data_transformed": DataTransform(
                self.dataset_version,
                self.dataset_name,
                self.attribute,
                self.window_size,
            ),
        }

    output_folder = os.path.join(output_dir, "prediction")

    def output(self):
        return LocalTarget(
            os.path.join(
                self.output_folder,
                f"{self.dataset_name}_prediction_{self.attribute}_w{self.window_size}_{self.model_name}_N{self.n_days_to_predict}_v{self.dataset_version}.csv",
            ))

    def run(self):
        df_date = pd.read_csv(self.input()["data_transformed"].path,
                              index_col="date")
        df_date.index = pd.to_datetime(df_date.index)
        import pickle
        with open(self.input()["model"].path, "rb") as f:
            regr = pickle.load(f)
        df_windows_pred = self.predictWindowing(
            df_date,
            self.attribute,
            regr,
            self.window_size,
            self.n_days_to_predict,
        )
        Path(self.output_folder).mkdir(parents=True, exist_ok=True)
        df_windows_pred.to_csv(self.output().path)

    def predictWindowing(self, df_windows_train, attribute, regr, window,
                         n_days_to_predict=10):
        date_end_train_date = df_windows_train.index[-1].date()
        date_previous_window = date_end_train_date - datetime.timedelta(
            days=window + 1)
        df_test_window = pd.DataFrame(
            df_windows_train[date_previous_window:date_end_train_date]
            [attribute])
        test_window = df_test_window[attribute].values
        start_i = len(test_window)
        X_test_prog = []
        y_pred_prog = []
        for i in range(start_i, start_i + n_days_to_predict):
            # X: |window| preceding samples
            X_test_i = test_window[i - window:i][::-1]
            X_test_prog.append(X_test_i)
            # y: regressor estimation given |window| preceding samples
            y_pred_prog.append(regr.predict([X_test_i])[0])
            test_window = np.append(test_window, regr.predict([X_test_i])[0])
        # Dataframe X |window| preceding samples
        df_pred = pd.DataFrame(
            X_test_prog, columns=[f"v_t-{i}" for i in range(1, window + 1)])
        # y_predicted --> y estimated by the regressor
        df_pred[f"y_pred_{attribute}"] = y_pred_prog
        # Add date and set as index
        start_pred_date = date_end_train_date + datetime.timedelta(days=1)
        datelist = pd.date_range(start_pred_date, periods=n_days_to_predict)
        df_pred.set_index(datelist, inplace=True)
        df_pred.index.name = "date"
        return df_pred
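# A minimal, hypothetical driver for the forecasting chain above, shown only as
# an illustration: it assumes the task classes (DownloadDataset through
# PlotFutureTrend) are importable from wherever they are defined, and runs the
# whole dependency graph in-process with Luigi's local scheduler. All parameter
# values are the defaults the tasks already declare.
import datetime

import luigi


if __name__ == "__main__":
    luigi.build(
        [
            PlotFutureTrend(
                dataset_version=datetime.date.today(),
                dataset_name="covidIT",
                attribute="total_positive",
                window_size=7,
                model_name="LR",
                n_days_to_predict=7,
            ),
        ],
        local_scheduler=True,
    )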
class FetchLichessApiPGN(Task):
    player = Parameter(default='thibault')
    perf_type = Parameter(default='blitz')
    since = DateParameter(default=datetime.today().date() - timedelta(days=1))
    single_day = BoolParameter()

    def output(self):
        import os
        file_location = (f'~/Temp/luigi/{self.since}-raw-games-'
                         f'{self.player}-{self.perf_type}-pgn.pckl')
        return LocalTarget(os.path.expanduser(file_location), format=Nop)

    def run(self):
        import lichess.api
        from lichess.format import PYCHESS
        from pandas import DataFrame, read_pickle
        from calendar import timegm
        from pipeline_import.visitors import EvalsVisitor, ClocksVisitor
        from pipeline_import.visitors import QueenExchangeVisitor
        from pipeline_import.visitors import CastlingVisitor, PositionsVisitor
        from pipeline_import.visitors import PromotionsVisitor, MaterialVisitor

        self.output().makedirs()

        with self.input().open('r') as f:
            json = read_pickle(f, compression=None)

        game_count = len(json)

        if self.single_day:
            unix_time_until = timegm((self.since + timedelta(days=1)).timetuple())
        else:
            unix_time_until = timegm(datetime.today().date().timetuple())
        self.until = int(1000 * unix_time_until)

        unix_time_since = timegm(self.since.timetuple())
        self.since_unix = int(1000 * unix_time_since)

        token = lichess_token().token

        games = lichess.api.user_games(self.player,
                                       since=self.since_unix,
                                       until=self.until,
                                       perfType=self.perf_type,
                                       auth=token,
                                       clocks='true',
                                       evals='true',
                                       opening='true',
                                       format=PYCHESS)

        visitors = [EvalsVisitor,
                    ClocksVisitor,
                    QueenExchangeVisitor,
                    CastlingVisitor,
                    PromotionsVisitor,
                    PositionsVisitor,
                    MaterialVisitor,
                    ]

        header_infos = []
        counter = 0

        for game in games:
            game_infos = parse_headers(game, visitors)
            header_infos.append(game_infos)

            # progress bar stuff
            counter += 1
            current = f'{game_infos["UTCDate"]} {game_infos["UTCTime"]}'
            current_progress = counter / game_count
            self.set_status_message(f'Parsed until {current} :: '
                                    f'{counter} / {game_count}')
            self.set_progress_percentage(round(current_progress * 100, 2))

        df = DataFrame(header_infos)

        self.set_status_message('Parsed all games')
        self.set_progress_percentage(100)

        with self.output().temporary_path() as temp_output_path:
            df.to_pickle(temp_output_path, compression=None)
class FetchLichessApiPGN(Task):
    player = Parameter(default='thibault')
    perf_type = Parameter(default='blitz')
    since = DateParameter(default=datetime.today().date() - timedelta(days=1))
    single_day = BoolParameter()

    def output(self):
        import os
        file_location = (f'~/Temp/luigi/{self.since}-raw-games-'
                         f'{self.player}-pgn.pckl')
        return LocalTarget(os.path.expanduser(file_location), format=Nop)

    def run(self):
        import lichess.api
        from lichess.format import PYCHESS
        from pandas import DataFrame, read_pickle
        from calendar import timegm
        from pipeline_import.visitors import EvalsVisitor, ClocksVisitor
        from pipeline_import.visitors import QueenExchangeVisitor
        from pipeline_import.visitors import CastlingVisitor, PositionsVisitor
        from pipeline_import.visitors import PromotionsVisitor, MaterialVisitor

        self.output().makedirs()

        with self.input().open('r') as f:
            json = read_pickle(f, compression=None)

        game_count = len(json)

        if self.single_day:
            unix_time_until = timegm((self.since + timedelta(days=1)).timetuple())
        else:
            unix_time_until = timegm(datetime.today().date().timetuple())
        self.until = int(1000 * unix_time_until)

        unix_time_since = timegm(self.since.timetuple())
        self.since_unix = int(1000 * unix_time_since)

        token = lichess_token().token

        games = lichess.api.user_games(self.player,
                                       since=self.since_unix,
                                       until=self.until,
                                       perfType=self.perf_type,
                                       auth=token,
                                       clocks='true',
                                       evals='true',
                                       opening='true',
                                       format=PYCHESS)

        visitors = [EvalsVisitor,
                    ClocksVisitor,
                    QueenExchangeVisitor,
                    CastlingVisitor,
                    PromotionsVisitor,
                    PositionsVisitor,
                    MaterialVisitor,
                    ]

        visitor_stats = {'clocks': 'clocks',
                         'evaluations': 'evals',
                         'eval_depths': 'eval_depths',
                         'queen_exchange': 'queen_exchange',
                         'castling_sides': 'castling',
                         'has_promotion': 'has_promotion',
                         'promotion_count_white': 'promotion_count_white',
                         'promotion_count_black': 'promotion_count_black',
                         'promotions_white': 'promotions_white',
                         'promotions_black': 'promotions_black',
                         'positions': 'positions',
                         'black_berserked': 'black_berserked',
                         'white_berserked': 'white_berserked',
                         'material_by_move': 'material_by_move',
                         }

        header_infos = []
        counter = 0

        for game in games:
            game_infos = {x: y for x, y in game.headers.items()}
            if game.headers['Variant'] == 'From Position':
                game.headers['Variant'] = 'Standard'
            for visitor in visitors:
                game.accept(visitor(game))
            for k, v in visitor_stats.items():
                game_infos[k] = getattr(game, v)
            game_infos['moves'] = [x.san() for x in game.mainline()]
            header_infos.append(game_infos)

            # progress bar stuff
            counter += 1
            current = f'{game_infos["UTCDate"]} {game_infos["UTCTime"]}'
            current_progress = counter / game_count
            self.set_status_message(f'Parsed until {current} :: '
                                    f'{counter} / {game_count}')
            self.set_progress_percentage(round(current_progress * 100, 2))

        df = DataFrame(header_infos)

        self.set_status_message('Parsed all games')
        self.set_progress_percentage(100)

        with self.output().temporary_path() as temp_output_path:
            df.to_pickle(temp_output_path, compression=None)
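# Hypothetical sketch of a visitor in the style consumed by the loop above (the
# real ones live in pipeline_import.visitors): it subclasses python-chess's
# chess.pgn.BaseVisitor, takes the game in its constructor, and writes its
# statistic back onto the game object so it can later be collected with
# getattr(). The class name and the move_count attribute are illustrative only.
import chess.pgn


class MoveCountVisitor(chess.pgn.BaseVisitor):
    def __init__(self, gm):
        self.gm = gm
        self.gm.move_count = 0

    def visit_move(self, board, move):
        # Called once per move while game.accept() traverses the game
        self.gm.move_count += 1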