class worker(Config):
    """Configuration parameters for the worker.

    Parameters declared with a ``config_path`` also name a legacy option in
    the ``core`` section (see the NOTE below).
    """

    # NOTE: `section.config-variable` in the config_path argument is deprecated
    # in favor of `worker.config_variable`

    ping_interval = FloatParameter(
        default=1.0,
        config_path=dict(section='core', name='worker-ping-interval'))
    keep_alive = BoolParameter(
        default=False,
        config_path=dict(section='core', name='worker-keep-alive'))
    count_uniques = BoolParameter(
        default=False,
        config_path=dict(section='core', name='worker-count-uniques'),
        description='worker-count-uniques means that we will keep a '
        'worker alive only if it has a unique pending task, as '
        'well as having keep-alive true')
    wait_interval = FloatParameter(
        default=1.0,
        config_path=dict(section='core', name='worker-wait-interval'))
    wait_jitter = FloatParameter(default=5.0)
    max_reschedules = IntParameter(
        default=1,
        config_path=dict(section='core', name='worker-max-reschedules'))
    timeout = IntParameter(
        default=0,
        config_path=dict(section='core', name='worker-timeout'))
    task_limit = IntParameter(
        default=None,
        config_path=dict(section='core', name='worker-task-limit'))
    retry_external_tasks = BoolParameter(
        default=False,
        config_path=dict(section='core', name='retry-external-tasks'),
        description='If true, incomplete external tasks will be '
        'retested for completion while Luigi is running.')
    # Adjacent literals are concatenated verbatim, so the first piece must
    # end with a space (it previously rendered as "willNOT").
    no_install_shutdown_handler = BoolParameter(
        default=False,
        description='If true, the SIGUSR1 shutdown handler will '
        'NOT be install on the worker')
class worker(Config):
    """Configuration parameters for the worker.

    Parameters declared with a ``config_path`` also name a legacy option in
    the ``core`` section (see the NOTE below).
    """

    # NOTE: `section.config-variable` in the config_path argument is deprecated
    # in favor of `worker.config_variable`

    ping_interval = FloatParameter(
        default=1.0,
        config_path=dict(section='core', name='worker-ping-interval'))
    keep_alive = BoolParameter(
        default=False,
        config_path=dict(section='core', name='worker-keep-alive'))
    count_uniques = BoolParameter(
        default=False,
        config_path=dict(section='core', name='worker-count-uniques'),
        description='worker-count-uniques means that we will keep a '
        'worker alive only if it has a unique pending task, as '
        'well as having keep-alive true')
    count_last_scheduled = BoolParameter(
        default=False,
        description='Keep a worker alive only if there are '
        'pending tasks which it was the last to '
        'schedule.')
    wait_interval = FloatParameter(
        default=1.0,
        config_path=dict(section='core', name='worker-wait-interval'))
    wait_jitter = FloatParameter(default=5.0)
    max_reschedules = IntParameter(
        default=1,
        config_path=dict(section='core', name='worker-max-reschedules'))
    timeout = IntParameter(
        default=0,
        config_path=dict(section='core', name='worker-timeout'))
    task_limit = IntParameter(
        default=None,
        config_path=dict(section='core', name='worker-task-limit'))
    retry_external_tasks = BoolParameter(
        default=False,
        config_path=dict(section='core', name='retry-external-tasks'),
        description='If true, incomplete external tasks will be '
        'retested for completion while Luigi is running.')
    # Adjacent literals are concatenated verbatim, so each non-final piece
    # must end with a space (these previously rendered as "workeron" and
    # "willNOT").
    send_failure_email = BoolParameter(
        default=True,
        description='If true, send e-mails directly from the worker '
        'on failure')
    no_install_shutdown_handler = BoolParameter(
        default=False,
        description='If true, the SIGUSR1 shutdown handler will '
        'NOT be install on the worker')
    check_unfulfilled_deps = BoolParameter(
        default=True,
        description='If true, check for completeness of '
        'dependencies before running a task')
class postgres_cfg_music(Config):
    """Database connection settings declared as configuration parameters.

    Every field is non-significant and marked ``ParameterVisibility.PRIVATE``.
    """

    # Primary credentials.
    user = Parameter(
        significant=False,
        visibility=ParameterVisibility.PRIVATE,
    )
    password = Parameter(
        significant=False,
        visibility=ParameterVisibility.PRIVATE,
    )

    # Server location and database name.
    host = Parameter(
        significant=False,
        visibility=ParameterVisibility.PRIVATE,
    )
    port = IntParameter(
        significant=False,
        visibility=ParameterVisibility.PRIVATE,
    )
    database = Parameter(
        significant=False,
        visibility=ParameterVisibility.PRIVATE,
    )

    # Separate credentials; presumably for read-only access — confirm.
    read_user = Parameter(
        significant=False,
        visibility=ParameterVisibility.PRIVATE,
    )
    read_password = Parameter(
        significant=False,
        visibility=ParameterVisibility.PRIVATE,
    )
class ThePayneMixin(SlurmMixin, BaseTask):
    """Mix-in declaring the parameters shared by ThePayne tasks.

    Each parameter has a ``config_path`` pointing at an option of the same
    name in the ``ThePayne`` section.
    """

    task_namespace = "ThePayne"

    # Training hyper-parameters.
    n_steps = IntParameter(
        default=100000,
        config_path={"section": task_namespace, "name": "n_steps"},
    )
    n_neurons = IntParameter(
        default=300,
        config_path={"section": task_namespace, "name": "n_neurons"},
    )
    weight_decay = FloatParameter(
        default=0.0,
        config_path={"section": task_namespace, "name": "weight_decay"},
    )
    learning_rate = FloatParameter(
        default=0.001,
        config_path={"section": task_namespace, "name": "learning_rate"},
    )

    # Path of the training set; declared without a default.
    training_set_path = Parameter(
        config_path={"section": task_namespace, "name": "training_set_path"},
    )
class worker(Config):
    """Configuration parameters for the worker.

    Every parameter names a legacy option in the ``core`` section through
    its ``config_path``.
    """

    ping_interval = FloatParameter(
        default=1.0,
        config_path={'section': 'core', 'name': 'worker-ping-interval'},
    )
    keep_alive = BoolParameter(
        default=False,
        config_path={'section': 'core', 'name': 'worker-keep-alive'},
    )
    count_uniques = BoolParameter(
        default=False,
        config_path={'section': 'core', 'name': 'worker-count-uniques'},
        description=(
            'worker-count-uniques means that we will keep a '
            'worker alive only if it has a unique pending task, as '
            'well as having keep-alive true'
        ),
    )
    wait_interval = IntParameter(
        default=1,
        config_path={'section': 'core', 'name': 'worker-wait-interval'},
    )
    max_reschedules = IntParameter(
        default=1,
        config_path={'section': 'core', 'name': 'worker-max-reschedules'},
    )
    timeout = IntParameter(
        default=0,
        config_path={'section': 'core', 'name': 'worker-timeout'},
    )
    task_limit = IntParameter(
        default=None,
        config_path={'section': 'core', 'name': 'worker-task-limit'},
    )
    retry_external_tasks = BoolParameter(
        default=False,
        config_path={'section': 'core', 'name': 'retry-external-tasks'},
        description=(
            'If true, incomplete external tasks will be '
            'retested for completion while Luigi is running.'
        ),
    )
class Download(Task):
    """Download hot posts from the training subreddits into one raw CSV file."""

    import praw

    # Version of the model
    version = IntParameter(default=1)

    # At most 500 posts are loaded per class
    limit = IntParameter(default=500)

    # Subreddits the decision tree is trained with
    subreddits = ["datascience", "gameofthrones"]

    # PRAW needs a Reddit account, including a registered application
    # with client ID and secret.
    # NOTE(review): these credentials are hard-coded in source control;
    # consider loading them from configuration and rotating the secret.
    reddit = praw.Reddit(user_agent="test",
                         client_id="wpaIV3-b3AYOJQ",
                         client_secret="-M_LPtLCpkqlJTCyg--Rg9ePAwg")

    def output(self):
        """Return the local target for the raw data.

        The data is stored under "model/<version>/raw.csv".
        """
        return LocalTarget("model/%d/raw.csv" % self.version)

    def run(self):
        """Fetch the posts, combine them and write them as CSV to the target."""
        from pandas import concat

        # DataFrame.append was removed in pandas 2.0; concat combines all
        # per-subreddit frames in one step and keeps their original indices.
        dataset = concat(list(self.fetch_reddit_data()))
        with self.output().open("w") as out:
            dataset.to_csv(out, encoding='utf-8', index=False, sep=';')

    def fetch_reddit_data(self):
        """Yield one DataFrame per subreddit.

        Each frame holds the columns title, selftext and subreddit of up to
        ``limit`` hot posts.
        """
        from pandas import DataFrame

        for sub in self.subreddits:
            posts = list(self.reddit.subreddit(sub).hot(limit=self.limit))
            relevant = DataFrame([p.__dict__ for p in posts])[['title', 'selftext', "subreddit"]]
            yield relevant
class ExtractDataset(ExternalProgramTask):
    """Unpack the downloaded dataset archive with the ``unzip`` program."""

    dataset_version = IntParameter(default=1)
    dataset_name = Parameter(default="dataset")

    def requires(self):
        """Depend on the download task for the same version and name."""
        return DownloadDataset(self.dataset_version, self.dataset_name)

    def output(self):
        """Return the versioned target the archive is extracted into."""
        extract_dir = "datasets/fruit-images-dataset/%d" % self.dataset_version
        return LocalTarget(extract_dir)

    def program_args(self):
        """Build the ``unzip`` command line, calling ``makedirs()`` on the output first."""
        destination = self.output()
        destination.makedirs()
        archive = self.input()
        return ["unzip", "-u", "-q", "-d", destination.path, archive.path]
class DownloadDataset(ExternalProgramTask):
    """Fetch a versioned dataset archive with ``curl`` and store it under /tmp."""

    dataset_version = IntParameter(default=1)
    dataset_name = Parameter(default="dataset")

    base_url = "http://plainpixels.work/resources/datasets"
    file_fomat = "zip"

    def output(self):
        """Return the local target of the downloaded archive."""
        archive_name = "%s_v%d.%s" % (
            self.dataset_name,
            self.dataset_version,
            self.file_fomat,
        )
        return LocalTarget("/tmp/%s" % archive_name)

    def program_args(self):
        """Build the ``curl`` command line that downloads the archive."""
        remote_name = "%s_v%d.%s" % (
            self.dataset_name,
            self.dataset_version,
            self.file_fomat,
        )
        url = "%s/%s" % (self.base_url, remote_name)
        destination = self.output().path
        return ["curl", "-L", "-o", destination, url]
class ClassifyWhiteDwarfMixin(BaseTask):

    """
    Mix-in declaring the parameters used for white dwarf classification.
    """

    model_path = Parameter()

    # Each entry is a [lower, upper] pair; the groups are labelled by the
    # spectral feature they cover.
    wavelength_regions = ListParameter(
        default=[
            # Balmer lines
            [3860, 3900],
            [3950, 4000],
            [4085, 4120],
            [4320, 4360],
            [4840, 4880],
            [6540, 6580],
            # He I/II lines
            [3880, 3905],
            [3955, 3975],
            [3990, 4056],
            [4110, 4140],
            [4370, 4410],
            [4450, 4485],
            [4705, 4725],
            [4900, 4950],
            [5000, 5030],
            [5860, 5890],
            [6670, 6700],
            [7050, 7090],
            [7265, 7300],
            # Molecular C absorption bands
            [4600, 4750],
            [5000, 5160],
            # Ca H/K lines
            [3925, 3940],
            [3960, 3975],
        ],
    )

    polyfit_order = IntParameter(default=5)

    # [lower, upper] pairs; presumably the regions used for the polynomial
    # fit of order ``polyfit_order`` — confirm against the consumer.
    polyfit_regions = ListParameter(
        default=[
            [3850, 3870],
            [4220, 4245],
            [5250, 5400],
            [6100, 6470],
            [7100, 9000],
        ],
    )
class BaselineValidation(Task):
    """Compute the baseline accuracy on the validation set and store it as JSON."""

    dataset_version = IntParameter(default=1)
    dataset_name = Parameter(default="dataset")
    config_name = Parameter(default="standard")

    validation_set = "Test"
    baseline_name = "find_round_objects"

    def requires(self):
        """Yield the extracted dataset first, then the configuration."""
        yield ExtractDataset(self.dataset_version, self.dataset_name)
        yield Configure(self.config_name)

    def output(self):
        """Return the JSON target named after the baseline."""
        return LocalTarget("baseline/%s.json" % self.baseline_name)

    def run(self):
        """Build the validation generator, score the baseline and dump the result."""
        # Inputs arrive in the order requires() yields them.
        dataset_target, config_target = self.input()
        dataset = dataset_target.path
        config = config_target.path

        test_data = build_generator(config, dataset, self.validation_set)
        result = calc_baseline_acc(test_data, dataset, self.validation_set)

        # json.dump writes str, so the target must be opened in text mode.
        with self.output().open("w") as f:
            json.dump(result, f)
class stockfish_cfg(Config):
    """Configuration parameters for Stockfish: a depth and a location."""

    # Integer depth; presumably the engine search depth — confirm.
    depth = IntParameter()

    # Location value, non-significant and hidden via PRIVATE visibility.
    location = Parameter(
        significant=False,
        visibility=ParameterVisibility.PRIVATE,
    )
class worker(Config):
    """Configuration parameters for the worker.

    Parameters declared with a ``config_path`` also name a legacy option in
    the ``core`` section (see the NOTE below).
    """

    # NOTE: `section.config-variable` in the config_path argument is deprecated
    # in favor of `worker.config_variable`

    ping_interval = FloatParameter(
        default=1.0,
        config_path=dict(section='core', name='worker-ping-interval'))
    keep_alive = BoolParameter(
        default=False,
        config_path=dict(section='core', name='worker-keep-alive'))
    count_uniques = BoolParameter(
        default=False,
        config_path=dict(section='core', name='worker-count-uniques'),
        description='worker-count-uniques means that we will keep a '
        'worker alive only if it has a unique pending task, as '
        'well as having keep-alive true')
    count_last_scheduled = BoolParameter(
        default=False,
        description='Keep a worker alive only if there are '
        'pending tasks which it was the last to '
        'schedule.')
    wait_interval = FloatParameter(
        default=1.0,
        config_path=dict(section='core', name='worker-wait-interval'))
    wait_jitter = FloatParameter(default=5.0)
    max_reschedules = IntParameter(
        default=1,
        config_path=dict(section='core', name='worker-max-reschedules'))
    timeout = IntParameter(
        default=0,
        config_path=dict(section='core', name='worker-timeout'))
    task_limit = IntParameter(
        default=None,
        config_path=dict(section='core', name='worker-task-limit'))
    retry_external_tasks = BoolParameter(
        default=False,
        config_path=dict(section='core', name='retry-external-tasks'),
        description='If true, incomplete external tasks will be '
        'retested for completion while Luigi is running.')
    # Adjacent literals are concatenated verbatim, so each non-final piece
    # must end with a space (these previously rendered as "workeron" and
    # "willNOT").
    send_failure_email = BoolParameter(
        default=True,
        description='If true, send e-mails directly from the worker '
        'on failure')
    no_install_shutdown_handler = BoolParameter(
        default=False,
        description='If true, the SIGUSR1 shutdown handler will '
        'NOT be install on the worker')
    check_unfulfilled_deps = BoolParameter(
        default=True,
        description='If true, check for completeness of '
        'dependencies before running a task')
    force_multiprocessing = BoolParameter(
        default=False,
        description='If true, use multiprocessing also when '
        'running with 1 worker')
    task_process_context = Parameter(
        default=None,
        description='If set to a fully qualified class name, the class will '
        'be instantiated with a TaskProcess as its constructor parameter and '
        'applied as a context manager around its run() call, so this can be '
        'used for obtaining high level customizable monitoring or logging of '
        'each individual Task run.')
class TrainTheCannonBase(TheCannonMixin):

    """
    Base task that trains a Cannon model and writes it to disk.

    :param label_names:
        A list of label names.

    :param order: (optional)
        The polynomial order to use for this model (default: 2).

    :param regularization: (optional)
        The strength of L1-regularization to apply during training.

    :param threads: (optional)
        The number of threads to use (default: 1).

    :param plot: (optional)
        A boolean flag to indicate whether to produce post-training quality
        plots.
    """

    regularization = FloatParameter(default=0.0)
    threads = IntParameter(default=1, significant=False)
    plot = BoolParameter(default=True, significant=False)

    def run(self):
        """ Train the model, save it, and optionally write quality plots. """

        # Read the training set labels and spectra from the input.
        training_set = read_training_set(self.input().path)
        labels, dispersion, training_set_flux, training_set_ivar = training_set

        # Label names are sorted so that changing their order does not make
        # luigi re-train the model.
        label_vectorizer = tc.vectorizer.PolynomialVectorizer(
            sorted(self.label_names),
            self.order,
        )

        model = tc.model.CannonModel(
            labels,
            training_set_flux,
            training_set_ivar,
            vectorizer=label_vectorizer,
            dispersion=dispersion,
            regularization=self.regularization,
        )

        log.info(f"Training The Cannon model {model}")
        model.train(threads=self.threads)

        output_path = self.output().path
        log.info(f"Writing The Cannon model {model} to disk {output_path}")
        model.write(output_path)

        if not self.plot:
            return

        # Inside a method the bare name `plot` resolves to the module-level
        # `plot` object, not to the `plot` parameter declared on the class.

        # Zeroth and first order coefficients.
        coefficient_indices = np.arange(1 + len(model.vectorizer.label_names))
        theta_figure = plot.theta(
            model,
            indices=coefficient_indices,
            normalize=False,
        )
        theta_figure.savefig(f"{self.task_id}-theta.png")

        # Scatter.
        scatter_figure = plot.scatter(model)
        scatter_figure.savefig(f"{self.task_id}-scatter.png")

        # One-to-one comparison, from testing the model on its training set.
        test_labels, test_cov, test_meta = model.test(
            training_set_flux,
            training_set_ivar,
            initial_labels=model.training_set_labels,
        )
        one_to_one_figure = plot.one_to_one(model, test_labels, cov=test_cov)
        one_to_one_figure.savefig(f"{self.task_id}-one-to-one.png")

    def output(self):
        """ Return the pickle target named after this task's id. """
        model_path = os.path.join(self.output_base_dir, f"{self.task_id}.pkl")
        return LocalTarget(model_path)
class PredictTrend(luigi.Task):
    """Predict the next ``n_days_to_predict`` values of ``attribute``.

    Loads the trained model and the transformed data, rolls the model
    forward one day at a time and writes the predictions as CSV.
    """

    # NOTE: this default is evaluated once, when the class body executes.
    dataset_version = DateParameter(default=datetime.date.today())
    dataset_name = Parameter(default="covidIT")
    attribute = Parameter(default="total_positive")
    window_size = IntParameter(default=7)
    model_name = Parameter(default="LR")
    n_days_to_predict = IntParameter(default=7)

    def requires(self):
        """Require the trained model and the transformed data set."""
        return {
            "model":
            Modeling(
                self.dataset_version,
                self.dataset_name,
                self.attribute,
                self.window_size,
                self.model_name,
            ),
            "data_transformed":
            DataTransform(
                self.dataset_version,
                self.dataset_name,
                self.attribute,
                self.window_size,
            ),
        }

    output_folder = os.path.join(output_dir, "prediction")

    def output(self):
        """Return the CSV target named after all task parameters."""
        return LocalTarget(
            os.path.join(
                self.output_folder,
                f"{self.dataset_name}_prediction_{self.attribute}_w{self.window_size}_{self.model_name}_N{self.n_days_to_predict}_v{self.dataset_version}.csv",
            ))

    def run(self):
        """Load data and model, predict the future window and save it."""
        import pickle

        df_date = pd.read_csv(self.input()["data_transformed"].path,
                              index_col="date")
        df_date.index = pd.to_datetime(df_date.index)

        # NOTE(review): pickle.load can execute arbitrary code; only load
        # model files produced by this pipeline.
        with open(self.input()["model"].path, "rb") as f:
            regr = pickle.load(f)

        df_windows_pred = self.predictWindowing(
            df_date,
            self.attribute,
            regr,
            self.window_size,
            self.n_days_to_predict,
        )

        Path(self.output_folder).mkdir(parents=True, exist_ok=True)
        df_windows_pred.to_csv(self.output().path)

    def predictWindowing(self,
                         df_windows_train,
                         attribute,
                         regr,
                         window,
                         n_days_to_predict=10):
        """Predict ``n_days_to_predict`` values, feeding predictions back in.

        :param df_windows_train: date-indexed frame containing ``attribute``.
        :param attribute: name of the column to predict.
        :param regr: fitted regressor exposing ``predict``.
        :param window: number of preceding samples used as features.
        :param n_days_to_predict: number of future days to predict.
        :return: frame indexed by prediction date with the feature columns
            ``v_t-1`` .. ``v_t-<window>`` and ``y_pred_<attribute>``.
        """
        date_end_train_date = df_windows_train.index[-1].date()
        date_previous_window = date_end_train_date - datetime.timedelta(
            days=window + 1)

        df_test_window = pd.DataFrame(
            df_windows_train[date_previous_window:date_end_train_date]
            [attribute])
        test_window = df_test_window[attribute].values

        start_i = len(test_window)
        X_test_prog = []
        y_pred_prog = []
        for i in range(start_i, start_i + n_days_to_predict):
            # X: |window| preceding samples, most recent first
            X_test_i = test_window[i - window:i][::-1]
            X_test_prog.append(X_test_i)
            # y: regressor estimation given |window| preceding samples.
            # Predict once and reuse the value, so the recorded prediction
            # and the value fed into the next window are the same.
            y_pred_i = regr.predict([X_test_i])[0]
            y_pred_prog.append(y_pred_i)
            test_window = np.append(test_window, y_pred_i)

        # Dataframe X: |window| preceding samples
        df_pred = pd.DataFrame(
            X_test_prog, columns=[f"v_t-{i}" for i in range(1, window + 1)])
        # y_predicted --> y estimated by the regressor
        df_pred[f"y_pred_{attribute}"] = y_pred_prog

        # Add the dates and set them as index
        start_pred_date = date_end_train_date + datetime.timedelta(days=1)
        datelist = pd.date_range(start_pred_date, periods=n_days_to_predict)
        df_pred.set_index(datelist, inplace=True)
        df_pred.index.name = "date"
        return df_pred
class Classify(PySparkTask):
    """Classify one day's cleaned posts with the previously trained model."""

    from datetime import date

    date = DateParameter(default=date.today())
    version = IntParameter(default=1)

    # PySpark parameters
    driver_memory = '1g'
    executor_memory = '2g'
    executor_cores = '2'
    num_executors = '4'
    master = 'local'

    def requires(self):
        """Return the tasks *ModelExists* and *Clean* as dependencies."""
        return [ModelExists(self.version), Clean(self.date)]

    def output(self):
        """Return the local target for the classification.

        The data is stored under "daily/<date>/ergebnis.csv".
        """
        day_folder = self.date.strftime("%m-%d-%Y")
        return LocalTarget("daily/%s/ergebnis.csv" % day_folder)

    def main(self, sc, *args):
        """Load data and model, classify the records and write them as CSV."""
        from pyspark.ml import PipelineModel
        from pyspark.sql.functions import when
        from pyspark.sql.session import SparkSession

        # Initialise the Spark session with Hive support
        builder = SparkSession.builder.enableHiveSupport()
        hive_options = (
            ("hive.exec.dynamic.partition", "true"),
            ("hive.exec.dynamic.partition.mode", "nonstrict"),
            ("hive.exec.max.dynamic.partitions", "4096"),
        )
        for option_key, option_value in hive_options:
            builder = builder.config(option_key, option_value)
        spark = builder.getOrCreate()

        # Load the cleaned data (second requirement)
        reader = (
            spark.read.format("com.databricks.spark.csv")
            .option("delimiter", ";")
            .option("header", "true")
        )
        cleaned = reader.load(self.input()[1].path)

        # Load the model previously trained with SparkML (first requirement)
        model = PipelineModel.load(self.input()[0].path)

        # Classify the records of one day with the model
        scored = model.transform(cleaned)
        classified = scored[["id", "subreddit", "probability", "prediction"]]

        # A little post-processing of the data, because
        # class "1" has the name "datascience"
        label_column = when(
            classified.prediction == 1, "datascience"
        ).otherwise("gameofthrones")
        classified = classified.withColumn("prediction_label", label_column)

        # For simplicity the dataframe is converted into a pandas dataframe.
        # This should be avoided for large amounts of data.
        with self.output().open("w") as out:
            classified.toPandas().to_csv(out, encoding='utf-8', index=False, sep=';')
class ModelExists(WrapperTask):
    """Task whose output points at the stored model of a given version."""

    # NOTE(review): a WrapperTask's completeness is normally derived from its
    # requirements rather than from output(); confirm that a missing model
    # is actually detected here — an ExternalTask may be what is intended.
    version = IntParameter(default=1)

    def output(self):
        """Return the local target "model/<version>/model"."""
        model_path = "model/%d/model" % self.version
        return LocalTarget(model_path)
class PlotFutureTrend(luigi.Task):
    """Plot the observed trend of ``attribute`` together with its prediction."""

    # NOTE: this default is evaluated once, when the class body executes.
    dataset_version = DateParameter(default=datetime.date.today())
    dataset_name = Parameter(default="covidIT")
    attribute = Parameter(default="total_positive")
    window_size = IntParameter(default=7)
    model_name = Parameter(default="LR")
    n_days_to_predict = IntParameter(default=7)

    def requires(self):
        """Require the predictions and the transformed data set."""
        return {
            "data_pred":
            PredictTrend(
                self.dataset_version,
                self.dataset_name,
                self.attribute,
                self.window_size,
                self.model_name,
                self.n_days_to_predict,
            ),
            "data_transformed":
            DataTransform(
                self.dataset_version,
                self.dataset_name,
                self.attribute,
                self.window_size,
            ),
        }

    output_folder = os.path.join(output_dir, "report_future_trend")

    def output(self):
        """Return the PNG target named after all task parameters."""
        return LocalTarget(
            os.path.join(
                self.output_folder,
                f"{self.dataset_name}_future_trend_{self.attribute}_w{self.window_size}_N{self.n_days_to_predict}_{self.model_name}_v{self.dataset_version}.png",
            ))

    def run(self):
        """Read predictions and observations, draw the figure and save it."""
        df_windows_pred = pd.read_csv(self.input()["data_pred"].path,
                                      index_col="date")
        df_windows_pred.index = pd.to_datetime(df_windows_pred.index)

        df_date = pd.read_csv(self.input()["data_transformed"].path,
                              index_col="date")
        # Parse the index, reduce it to dates, then convert it back to
        # datetimes.
        df_date.index = pd.to_datetime(pd.to_datetime(df_date.index).date)

        fig = self.plotEstimatedTrend(df_date, df_windows_pred, self.attribute)

        Path(self.output_folder).mkdir(parents=True, exist_ok=True)
        fig.savefig(self.output().path)

    def plotEstimatedTrend(
        self,
        df_date,
        df_windows_predicted,
        attribute,
        start_train=None,
        date_end_train=None,
        interval=40,
    ):
        """Draw observed values and predicted future values of ``attribute``.

        :param df_date: date-indexed frame with the observed ``attribute``.
        :param df_windows_predicted: date-indexed frame with the column
            ``y_pred_<attribute>``.
        :param attribute: name of the plotted column.
        :param start_train: first plotted date (default: first index entry).
        :param date_end_train: last observed date (default: last index entry).
        :param interval: day interval between major x-axis ticks.
        :return: the created figure.
        :raises ValueError: if there is no prediction after ``date_end_train``.
        """
        # Starting date of the plot
        start_train = df_date.index[0] if start_train is None else start_train
        # End date of the true label/value of the plot
        date_end_train = (df_date.index[-1]
                          if date_end_train is None else date_end_train)

        start_test = date_end_train.date() + datetime.timedelta(days=1)

        df_future = df_windows_predicted[start_test:]
        if df_future.empty:
            raise ValueError(
                f"No predicted values available from {start_test} onwards")

        fig, ax = plt.subplots(figsize=(12, 5))
        ax.grid()

        # Observed trend until training date
        df_observed = df_date[start_train:date_end_train]
        ax.scatter(df_observed.index,
                   df_observed[attribute].values,
                   s=3,
                   color="blue",
                   label=attribute)

        # Predicted future trend
        ax.scatter(
            df_future.index,
            df_future[f"y_pred_{attribute}"].values,
            s=4,
            color="orange",
            label=f"{attribute} predicted",
        )

        ax.legend()
        ax.set(xlabel="Date", ylabel=attribute, title=attribute)
        date_form = DateFormatter("%d-%m")
        ax.xaxis.set_major_formatter(date_form)
        ax.xaxis.set_major_locator(mdates.DayLocator(interval=interval))
        return fig