def save_train_predictions(prediction, objetivo, model_name, hyperparams):
    """Persist a trimmed copy of the training predictions.

    Keeps a fixed set of columns from ``prediction``, renames
    ``prediction`` -> ``score`` and ``label`` -> ``label_value``, tags every
    row with the model's ``s3_name`` and a ``fecha`` string built from
    ``dayofmonth``, converts the result to pandas and hands it to
    ``save_rds_pandas`` under the name ``"predictions.train"``.

    Args:
        prediction: DataFrame exposing the Spark DataFrame API (``columns``,
            ``select``, ``withColumn``, ``toPandas``).
        objetivo: Forwarded to ``parse_filename`` to build the model name.
        model_name: Forwarded to ``parse_filename``.
        hyperparams: Forwarded to ``parse_filename``.
    """
    s3_name = parse_filename(objetivo, model_name, hyperparams)
    # Drop the first two characters of the generated name
    # (presumably a leading path prefix -- TODO confirm against parse_filename).
    s3_name = s3_name[2:]

    # Columns taken from the raw predictions; any that are absent are skipped.
    vars_bias = [
        'dayofmonth', 'flight_number_reporting_airline', 'prediction',
        'originwac', 'label', 'distance'
    ]
    df_bias = prediction.select(
        [c for c in prediction.columns if c in vars_bias])
    df_bias = df_bias.withColumnRenamed(
        "prediction", "score").withColumnRenamed("label", "label_value")
    df_bias = df_bias.withColumn('s3_name', lit(s3_name))

    # Zero-pad single-digit days so ``fecha`` is always eight characters.
    # The threshold must be 10: with ``< 9`` day 9 was left unpadded and
    # produced "2019129" instead of "20191209".
    df_bias = df_bias.withColumn(
        'aux', f.when(f.col('dayofmonth') < 10, "0").otherwise(""))
    # NOTE: year and month are hard-coded to December 2019.
    df_bias = df_bias.withColumn(
        'fecha',
        concat(lit("2019"), lit("12"), col('aux'), col('dayofmonth')))

    # Final projection: the helper columns 'dayofmonth' and 'aux' are dropped.
    vars_bias = [
        'flight_number_reporting_airline', 'prediction', 'originwac',
        'label_value', 'distance', 'score', 's3_name', 'fecha'
    ]
    df_bias = df_bias.select([c for c in df_bias.columns if c in vars_bias])

    df_pandas = df_bias.toPandas()
    save_rds_pandas(df_pandas, "predictions.train")
def output(self):
    """Return the S3 target for this task's ``.model.zip`` object.

    The object key comes from ``parse_filename`` called with the task's
    ``obj``, ``model`` and the integer-cast ``numIt`` / ``numPCA`` values;
    its first character is dropped before it is appended to the bucket name.
    """
    target_name, algorithm = self.obj, self.model
    settings = {"iter": int(self.numIt), "pca": int(self.numPCA)}
    key = parse_filename(target_name, algorithm, settings)
    s3_uri = "".join(["s3://", str(self.bucname), key[1:], ".model.zip"])
    return luigi.contrib.s3.S3Target(path=s3_uri)
def add_meta_data(objetivo, model_name, hyperparams, log, train_time,
                  test_split, train_nrows):
    """Insert one row describing a trained model into ``metadatos.models``.

    Args:
        objetivo: Stored as-is and forwarded to ``parse_filename``.
        model_name: Stored as-is and forwarded to ``parse_filename``.
        hyperparams: Forwarded to ``parse_filename`` and stored as a JSON
            string.
        log: Mapping that must contain the keys ``'AUROC'``, ``'AUPR'``,
            ``'precision'``, ``'recall'`` and ``'F1 Measure'``.
        train_time: Stored as-is.
        test_split: Stored as-is.
        train_nrows: Stored as-is.

    Raises:
        KeyError: If ``log`` lacks one of the metric keys listed above.
    """
    # The stored name is the generated file name minus its first two chars.
    s3_name = parse_filename(objetivo, model_name, hyperparams)[2:]

    # Metric values, in the same order as the INSERT column list.
    metric_keys = ('AUROC', 'AUPR', 'precision', 'recall', 'F1 Measure')
    metrics = tuple(log[key] for key in metric_keys)

    # Today's date formatted as YYYYMMDD.
    fecha = date.today().strftime("%Y%m%d")

    query = """ INSERT INTO metadatos.models (fecha, objetivo, model_name, s3_name, hyperparams, AUROC, AUPR, precision, recall, f1, train_time, test_split, train_nrows ) VALUES ( %s, %s,%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s ) """
    values = (
        (fecha, objetivo, model_name, s3_name, json.dumps(hyperparams))
        + metrics
        + (train_time, test_split, train_nrows)
    )
    insert_query(query, values)