Code Example #1
0
    .build()
    evaluator = BinaryClassificationEvaluator()
    numFolds = 2
    crossval_full = CrossValidator(
        estimator=full_pipeline,
        estimatorParamMaps=paramGrid,
        evaluator=evaluator,
        numFolds=numFolds)  # use 3+ folds in practice
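    # For reference only: once the training DataFrame (dataset_df, built below) holds
    # labelled rows, the cross-validator could be fit and scored roughly like this
    # (train_df/test_df are illustrative names, not defined in this example):
    #   train_df, test_df = dataset_df.randomSplit([0.8, 0.2], seed=42)
    #   cv_model = crossval_full.fit(train_df)
    #   auc = evaluator.evaluate(cv_model.transform(test_df))  # areaUnderROC by default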
    ############################################################

    # schema for the raw CSV files
    userSchema = StructType().add("spam", "string").add("message", "string")
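    # the SMSSpamCollection file is tab-separated: a label ("ham" or "spam")
    # followed by the raw message text, hence the two string columns above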

    sqlContext = SQLContext(sc)
    # create an empty DataFrame with this schema
    dataset_df = sqlContext.createDataFrame(sc.emptyRDD(), userSchema)
    # or populate the initial DataFrame from an existing CSV file
    #dataset_df = sc.textFile("gs://drive3/data/spark/8_cs1_dataset/SMSSpamCollection").map(lambda line: re.split('\t', line)).toDF(["spam", "message"])
    #dataset_df = feature_pipeline.fit(dataset_df).transform(dataset_df)
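    # a roughly equivalent load using the DataFrame reader and the schema above
    # might look like:
    #   dataset_df = spark.read.csv("gs://drive3/data/spark/8_cs1_dataset/SMSSpamCollection",
    #                               sep="\t", schema=userSchema)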

    model = None  # no model has been trained yet

    prev_length = 0  # number of rows seen at the previous training step
    # whether to split the dataset into training and evaluation sets before training
    evaluate = True
    # how often to train a model on the whole accumulated dataset
    train_duration = 20  # train a model every n seconds
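    # illustrative only: these settings suggest a periodic retraining loop in which,
    # every train_duration seconds, the accumulated data is optionally split before
    # fitting, e.g.:
    #   if evaluate:
    #       train_df, test_df = dataset_df.randomSplit([0.8, 0.2])
    #   else:
    #       train_df, test_df = dataset_df, None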
    ############################################################
    # append each batch of the training stream to dataset_df
    # as part of Structured Streaming
    df = spark \