def test_with_life_goal(self):
    source_data = [("jose", 1), ("li", 2)]
    source_df = get_spark().createDataFrame(source_data, ["name", "age"])

    actual_df = with_life_goal(source_df)

    expected_data = [("jose", 1, "escape!"), ("li", 2, "escape!")]
    expected_df = get_spark().createDataFrame(
        expected_data, ["name", "age", "life_goal"])

    assert expected_df.collect() == actual_df.collect()
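# with_life_goal is exercised above but not defined in this section. A minimal
# sketch inferred from the expected data: the real implementation may differ,
# but it must append a literal "escape!" column named life_goal.
from pyspark.sql import functions as F

def with_life_goal(df):
    # add a constant life_goal column to the incoming DataFrame
    return df.withColumn("life_goal", F.lit("escape!"))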
def print_summary(jmodel):
    '''Print train and valid summary for a fitted model.

    Args:
        jmodel (ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel)
    '''
    # get spark and logger
    spark = get_spark(app_name="pyspark-xgb")
    logger = get_logger(spark, "app")

    # collect the per-round objective history for train and each eval set
    train_summary = jmodel.summary().trainObjectiveHistory()
    valid_summary = jmodel.summary().validationObjectiveHistory()
    dataset_summary = [train_summary]
    dataset_name = ['train']
    for idx in range(valid_summary.size()):
        eval_name = valid_summary.apply(idx)._1()
        eval_summary = valid_summary.apply(idx)._2()
        dataset_name.append(eval_name)
        dataset_summary.append(eval_summary)

    # print one line per boosting round; a metric of 0 marks the padded
    # rounds after early stopping, so stop printing there
    stop_flg = False
    for round_idx, row in enumerate(zip(*dataset_summary), 1):
        printString = "{:6} ".format('[{}]'.format(round_idx))
        for idx, r in enumerate(row):
            if r == 0:
                stop_flg = True
                break
            printString += "{:5}\t{:10}\t".format(dataset_name[idx], r)
        if stop_flg is True:
            break
        logger.info(printString)
def read_table(table_name):
    from spark import get_spark

    spark_session = get_spark()
    result = spark_session.sql("SELECT * FROM %s" % table_name)
    pandas_df = result.toPandas()
    json_result = pandas_df.to_json(orient='split')
    return json_result
def count_length(string):
    from spark import get_spark

    spark_session = get_spark()
    sc = spark_session.sparkContext
    # distribute the characters and count them with Spark
    list_string = list(string)
    rdd = sc.parallelize(list_string)
    count = rdd.count()
    return count
def spark_query():
    if request.method == 'POST':
        query = str(request.form["query_text"])
        from spark import get_spark

        spark_session = get_spark()
        df = spark_session.sql(query)
        pandas_df = df.toPandas()
        return render_template('index.html', text=pandas_df.to_html())
def spark_process():
    if request.method == 'POST':
        from spark import get_spark

        spark_session = get_spark()
        sc = spark_session.sparkContext
        array = session.get('list_tables', None)
        rdd = sc.parallelize(array)
        count = str(rdd.count())
        return render_template(
            'index.html',
            text="This is calculated using Spark: it counts the number of table "
                 "results from the 'Get Table' feature.\n Result is: %s" % count)
def main():
    try:
        # init spark
        spark = get_spark(app_name="pyspark-xgb")

        # get logger
        logger = get_logger(spark, "app")

        # load data
        df = spark.read.csv(DATASET_PATH + "/iris.data", get_mtrain_schema())

        # preprocess
        LABEL = 'label'
        FEATURES = 'features'
        N_CLASS = 3
        features = [c for c in df.columns if c != "class"]
        assembler = VectorAssembler(inputCols=features, outputCol=FEATURES)
        strIdxer = StringIndexer(inputCol="class", outputCol=LABEL)
        pipeline = Pipeline(stages=[assembler, strIdxer])
        df = pipeline.fit(df).transform(df).select(FEATURES, LABEL)
        train, test = df.randomSplit([0.8, 0.2])

        # training
        logger.info('training')
        xgb_params = {
            "eta": 0.1, "gamma": 0, "max_depth": 4,
            "num_round": 100, "num_early_stopping_rounds": 10,
            "num_workers": 1, "use_external_memory": False,
            "missing": np.nan, "num_class": N_CLASS,
            "eval_metric": "mlogloss", "min_child_weight": 1,
            "train_test_ratio": 0.8, "objective": "multi:softprob"
        }
        scala_map = spark._jvm.PythonUtils.toScalaMap(xgb_params)
        j = JavaWrapper._new_java_obj(
            "ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier", scala_map) \
            .setFeaturesCol(FEATURES).setLabelCol(LABEL)
        jmodel = j.train(train._jdf)
        logger.info(jmodel.summary().toString())

        # get validation metric
        preds = jmodel.transform(test._jdf)
        pred = DataFrame(preds, spark)
        slogloss = pred.withColumn('log_loss',
                                   udf_logloss('label', 'probability', N_CLASS)) \
            .agg({"log_loss": "mean"}).collect()[0]['avg(log_loss)']
        logger.info('valid logloss: {}'.format(slogloss))

        # save or update model
        model_path = MODEL_PATH + '/model.bin'
        if os.path.exists(model_path):
            shutil.rmtree(model_path)
            logger.info('model exists, rm old model')
        jmodel.save(model_path)
        logger.info('save model to {}'.format(model_path))

    except Exception:
        logger.error(traceback.format_exc())
    finally:
        # stop spark
        spark.stop()
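# udf_logloss is called above (and in the later training scripts) but never
# defined in this section. The sketch below is one plausible implementation,
# assuming the label is an integer class index and 'probability' is the
# per-class probability vector produced by the model; the helper in the
# source repo may differ.
import math

from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType


def udf_logloss(label_col, prob_col, n_class=None, eps=1e-15):
    # per-row negative log-likelihood; n_class is accepted only for
    # signature compatibility with the calls above
    def logloss(label, prob):
        p = min(max(float(prob[int(label)]), eps), 1.0 - eps)
        return -math.log(p)
    return udf(logloss, DoubleType())(label_col, prob_col)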
from pyspark.sql import SparkSession
from spark import get_spark

if __name__ == "__main__":
    spark = get_spark()
    l = [('Alice', 30)]
    ar = spark.createDataFrame(l, ['name', 'age'])
    ar.show()
    spark.stop()
def terminate_spark():
    from spark import get_spark

    spark_session = get_spark()
    spark_session.stop()
def main():
    try:
        # init spark
        spark = get_spark(app_name="pyspark-xgb")

        # get logger
        logger = get_logger(spark, "app")

        # load data
        train = spark.read.csv(DATASET_PATH + "/iris_train.csv",
                               get_mtrain_schema(), header=True)
        test = spark.read.csv(DATASET_PATH + "/iris_test.csv",
                              get_mtrain_schema(), header=True)

        # preprocess
        # the label encoding comes from the csv, since StringIndexer can assign
        # different indices on different datasets
        STR_LABEL = 'class'
        LABEL = 'label'
        FEATURES = 'features'
        N_CLASS = 3
        features = [c for c in train.columns if c not in [STR_LABEL, LABEL]]
        assembler = VectorAssembler(inputCols=features, outputCol=FEATURES)
        pipeline = Pipeline(stages=[assembler])
        preprocess = pipeline.fit(train)
        train = preprocess.transform(train).select(FEATURES, LABEL)
        test = preprocess.transform(test).select(FEATURES, LABEL)

        # set param map
        xgb_params = {
            "eta": 0.1,
            "eval_metric": "mlogloss",
            "gamma": 0,
            "max_depth": 5,
            "min_child_weight": 1.0,
            "objective": "multi:softprob",
            "seed": 0,
            "num_class": N_CLASS,
            # xgboost4j only
            "num_round": 100,
            "num_early_stopping_rounds": 10,
            "maximize_evaluation_metrics": False,
            "num_workers": 1,
            "use_external_memory": False,
            "missing": np.nan,
        }
        scala_map = spark._jvm.PythonUtils.toScalaMap(xgb_params)

        # set evaluation set
        eval_set = {'eval': test._jdf}
        scala_eval_set = spark._jvm.PythonUtils.toScalaMap(eval_set)

        logger.info('training')
        j = JavaWrapper._new_java_obj(
            "ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier", scala_map) \
            .setFeaturesCol(FEATURES).setLabelCol(LABEL) \
            .setEvalSets(scala_eval_set)
        jmodel = j.fit(train._jdf)
        print_summary(jmodel)

        # get validation metric
        preds = jmodel.transform(test._jdf)
        pred = DataFrame(preds, spark)
        slogloss = pred.withColumn('log_loss',
                                   udf_logloss(LABEL, 'probability', N_CLASS)) \
            .agg({"log_loss": "mean"}).collect()[0]['avg(log_loss)']
        logger.info('[xgboost4j] valid logloss: {}'.format(slogloss))

        # save model - use the native booster so single-node libraries can read it
        model_path = MODEL_PATH + '/model.bin'
        logger.info('save model to {}'.format(model_path))
        jbooster = jmodel.nativeBooster()
        jbooster.saveModel(model_path)

        # get feature score
        imp_type = "gain"
        feature_map_path = MODEL_PATH + '/feature.map'
        create_feature_map(feature_map_path, features)
        jfeatureMap = jbooster.getScore(feature_map_path, imp_type)
        f_imp = dict()
        for feature in features:
            if not jfeatureMap.get(feature).isEmpty():
                f_imp[feature] = jfeatureMap.get(feature).get()
        feature_imp_path = MODEL_PATH + '/feature.imp'
        create_feature_imp(feature_imp_path, f_imp)

        # [Optional] load a model trained by xgboost, predict and get the
        # validation metric for comparison
        local_model_path = LOCAL_MODEL_PATH + '/model.bin'
        if os.path.exists(local_model_path):
            logger.info('load model from {}'.format(local_model_path))
            scala_xgb = spark.sparkContext._jvm.ml.dmlc.xgboost4j.scala.XGBoost
            jbooster = scala_xgb.loadModel(local_model_path)
            # uid, num_class, booster
            xgb_cls_model = JavaWrapper._new_java_obj(
                "ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel",
                "xgbc", N_CLASS, jbooster)
            jpred = xgb_cls_model.transform(test._jdf)
            pred = DataFrame(jpred, spark)
            slogloss = pred.withColumn('log_loss',
                                       udf_logloss(LABEL, 'probability', N_CLASS)) \
                .agg({"log_loss": "mean"}).collect()[0]['avg(log_loss)']
            logger.info('[xgboost] valid logloss: {}'.format(slogloss))
        else:
            logger.info(
                "local model does not exist; run python_xgb/train_multi.py to get "
                "the model and compare logloss between xgboost and xgboost4j")

    except Exception:
        logger.error(traceback.format_exc())
    finally:
        # stop spark
        spark.stop()
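# create_feature_map and create_feature_imp are called above but not shown in
# this section. Plausible sketches, assuming the standard xgboost feature-map
# format ("<index>\t<name>\t<type>") and a simple "name\tscore" dump for the
# importances; the originals may differ.
def create_feature_map(path, features):
    # xgboost fmap format: index, feature name, feature type ('q' = quantitative)
    with open(path, 'w') as f:
        for i, feat in enumerate(features):
            f.write('{0}\t{1}\tq\n'.format(i, feat))


def create_feature_imp(path, feature_imp):
    # dump feature importances sorted by descending score
    with open(path, 'w') as f:
        for feat, imp in sorted(feature_imp.items(), key=lambda kv: -kv[1]):
            f.write('{0}\t{1}\n'.format(feat, imp))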
def spark():
    sc = get_spark()
    sc.setLogLevel("WARN")
    yield sc
    sc.stop()
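# The generator above reads like a pytest yield fixture. A minimal sketch of a
# test consuming it, assuming the function is registered with @pytest.fixture
# (the decorator is not shown in the original); the test name and data are
# hypothetical.
def test_create_dataframe(spark):
    df = spark.createDataFrame([("a", 1), ("b", 2)], ["key", "value"])
    assert df.count() == 2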
def main():
    try:
        # init spark
        spark = get_spark(app_name="pyspark-xgb")

        # get logger
        logger = get_logger(spark, "app")

        # load data
        train = spark.read.schema(get_btrain_schema()).option('header', True).csv(
            DATASET_PATH + '/emp_train.csv')
        test = spark.read.schema(get_btrain_schema()).option('header', True).csv(
            DATASET_PATH + '/emp_test.csv')

        # preprocess
        LABEL = 'Attrition'
        FEATURES = 'features'
        features = [c for c in train.columns if c != LABEL]
        assembler = VectorAssembler(inputCols=features, outputCol=FEATURES)
        train = assembler.transform(train).select(FEATURES, LABEL)
        test = assembler.transform(test).select(FEATURES, LABEL)

        # training
        logger.info('training')
        xgb_params = {
            "eta": 0.1, "gamma": 0, "max_depth": 4,
            "num_round": 100, "num_early_stopping_rounds": 10,
            "num_workers": 1, "use_external_memory": False,
            "missing": np.nan, "eval_metric": "logloss",
            "min_child_weight": 1, "train_test_ratio": 0.8,
            "objective": "binary:logistic"
        }
        scala_map = spark._jvm.PythonUtils.toScalaMap(xgb_params)
        j = JavaWrapper._new_java_obj(
            "ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier", scala_map) \
            .setFeaturesCol(FEATURES).setLabelCol(LABEL)
        jmodel = j.train(train._jdf)
        logger.info(jmodel.summary().toString())

        # get validation metric
        preds = jmodel.transform(test._jdf)
        pred = DataFrame(preds, spark)
        slogloss = pred.withColumn('log_loss', udf_logloss(LABEL, 'probability')) \
            .agg({"log_loss": "mean"}).collect()[0]['avg(log_loss)']
        logger.info('valid logloss: {}'.format(slogloss))

        # save or update model
        model_path = MODEL_PATH + '/model.bin'
        if os.path.exists(model_path):
            shutil.rmtree(model_path)
            logger.info('model exists, rm old model')
        jw = JavaWrapper._new_java_obj(
            "ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel"
            ".XGBoostClassificationModelWriter", jmodel)
        jw.saveImpl(model_path)
        logger.info('save model to {}'.format(model_path))

    except Exception:
        logger.error(traceback.format_exc())
    finally:
        # stop spark
        spark.stop()
    ## Keep only rows whose coordinates are valid float types ==> this step can
    ## be skipped if coordinates are not needed for training, in which case
    ## df_1 is sufficient.
    #df_2 = df_1[df_1.coordinates.apply(lambda x: type(x['latitude']) in [int, np.int64, float, np.float64])]
    #df_2 = df_1[df_1.coordinates.apply(lambda x: type(x['longitude']) in [int, np.int64, float, np.float64])]
    return df_1


if __name__ == "__main__":
    #spark = SparkSession \
    #    .builder \
    #    .appName("Python Spark SQL data source example") \
    #    .getOrCreate()
    spark = spark.get_spark()

    if len(sys.argv) < 4 or len(sys.argv) > 5:
        print(
            "Usage: kmeans <input file> <mode(trainning or predicting)> <k> <output file> "
            "OR kmeans <input file> <mode(trainning or predicting)> <k>",
            file=sys.stderr)
        sys.exit(-1)

    mode = sys.argv[2]  # mode is 'trainning' or 'predicting'
    k = int(sys.argv[3])  # number of clusters to use

    currentdate = datetime.now().strftime("%Y-%m-%d")

    ###### LOADING DATA #####
    # INPUT
    input_data_path = os.path.join(PATH, "datasets", "input", sys.argv[1])
    output_data_path = os.path.join(PATH, "datasets", "output")
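# The script above stops after resolving the input/output paths. A minimal
# sketch of how the two modes could continue, assuming pyspark.ml.clustering
# KMeans and a 'features' vector column; the helper name, column name, model
# path, and branch structure are assumptions, not the original code.
from pyspark.ml.clustering import KMeans, KMeansModel


def run_kmeans(df, mode, k, model_path):
    if mode == 'trainning':
        # fit a k-means model on the feature vectors and persist it
        model = KMeans(k=k, featuresCol='features').fit(df)
        model.write().overwrite().save(model_path)
        return model.transform(df)
    else:
        # load the persisted model and assign clusters to new rows
        model = KMeansModel.load(model_path)
        return model.transform(df)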