.build()

# Evaluator handed to the cross-validator below; it is constructed with no
# arguments, so whatever defaults BinaryClassificationEvaluator ships with apply.
evaluator = BinaryClassificationEvaluator()
numFolds = 2
# Cross-validate the full pipeline over every parameter combination in
# paramGrid, scoring each candidate with `evaluator`.
crossval_full = CrossValidator(
    estimator=full_pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=numFolds)  # use 3+ folds in practice

############################################################
# Schema for the raw CSV files: two string columns, "spam" and "message".
userSchema = StructType().add("spam", "string").add("message", "string")

sqlContext = SQLContext(sc)

# Create an empty DataFrame that carries the schema above.
# NOTE(review): presumably streamed batches are appended to it later - confirm
# against the streaming code further down the file.
dataset_df = sqlContext.createDataFrame(sc.emptyRDD(), userSchema)

# Alternatively, populate the initial DataFrame from an existing dataset file.
# The example below reads a gs:// path and splits each line on tabs, so it is
# neither a local file nor comma-separated.
#dataset_df = sc.textFile("gs://drive3/data/spark/8_cs1_dataset/SMSSpamCollection").map(lambda line: re.split('\t', line)).toDF(["spam", "message"])
#dataset_df = feature_pipeline.fit(dataset_df).transform(dataset_df)

# No model exists at start-up.
# NOTE(review): presumably assigned by training code outside this view - confirm.
model = None
# NOTE(review): looks like the dataset size seen at the previous training run;
# verify against the code that updates it.
prev_length = 0

# Whether to split the dataset into train and evaluation parts before training.
evaluate = True

# Duration of training a model on the whole batch dataset.
train_duration = 20  # train a model every n seconds

############################################################
# Append each batch of the training stream to dataset_df
# as part of structured streaming.
df = spark \