Example no. 1
    def test_nested_pipeline_persistence(self):
        """
        Pipeline[HashingTF, Pipeline[PCA]]
        """
        sqlContext = SQLContext(self.sc)
        temp_path = tempfile.mkdtemp()

        try:
            df = sqlContext.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
            tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
            pca = PCA(k=2, inputCol="features", outputCol="pca_features")
            p0 = Pipeline(stages=[pca])
            pl = Pipeline(stages=[tf, p0])
            model = pl.fit(df)

            pipeline_path = temp_path + "/pipeline"
            pl.save(pipeline_path)
            loaded_pipeline = Pipeline.load(pipeline_path)
            self._compare_pipelines(pl, loaded_pipeline)

            model_path = temp_path + "/pipeline-model"
            model.save(model_path)
            loaded_model = PipelineModel.load(model_path)
            self._compare_pipelines(model, loaded_model)
        finally:
            try:
                rmtree(temp_path)
            except OSError:
                pass
    def train_lg(training_data, collection):
        # Configure the feature pipeline, which consists of two stages, hashingTF and idf; lr is added in a second pipeline below.
        hashingTF = HashingTF(inputCol="filtered", outputCol="TF_features")
        idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
        pipeline1 = Pipeline(stages=[hashingTF, idf])

        # Fit the pipeline1 to training documents.
        model1 = pipeline1.fit(training_data)

        lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
        pipeline2 = Pipeline(stages=[model1, lr])

        paramGrid = ParamGridBuilder() \
            .addGrid(hashingTF.numFeatures, [10, 100, 1000, 10000]) \
            .addGrid(lr.regParam, [0.1, 0.01]) \
            .build()

        crossval = CrossValidator(estimator=pipeline2,
                                  estimatorParamMaps=paramGrid,
                                  evaluator=BinaryClassificationEvaluator(),
                                  numFolds=5)

        # Run cross-validation, and choose the best set of parameters.
        cvModel = crossval.fit(training_data)

    #     model_path = os.path.join(models_dir , time.strftime("%Y%m%d-%H%M%S") + '_'
    #                             + collection["Id"] + '_'
    #                             + collection["name"])
    #     cvModel.save(sc, model_path)
        return cvModel
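
The commented-out save above uses an older RDD-style signature; with the DataFrame-based ML API, the model selected by cross-validation can be persisted through its MLWriter instead. A minimal sketch, with an illustrative models_dir (not from the original code):

import os
import time
from pyspark.ml import PipelineModel

models_dir = "/tmp/models"  # illustrative location
model_path = os.path.join(models_dir, time.strftime("%Y%m%d-%H%M%S"))

# Persist only the best PipelineModel selected by the CrossValidator ...
cvModel.bestModel.write().overwrite().save(model_path)
# ... and reload it later for scoring.
best_model = PipelineModel.load(model_path)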
Example no. 3
def fit_kmeans(spark, products_df):
    step = 0

    step += 1
    tokenizer = Tokenizer(inputCol="title", outputCol=str(step) + "_tokenizer")

    step += 1
    stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol=str(step) + "_stopwords")

    step += 1
    tf = HashingTF(inputCol=stopwords.getOutputCol(), outputCol=str(step) + "_tf", numFeatures=16)

    step += 1
    idf = IDF(inputCol=tf.getOutputCol(), outputCol=str(step) + "_idf")

    step += 1
    normalizer = Normalizer(inputCol=idf.getOutputCol(), outputCol=str(step) + "_normalizer")

    step += 1
    kmeans = KMeans(featuresCol=normalizer.getOutputCol(), predictionCol=str(step) + "_kmeans", k=2, seed=20)

    kmeans_pipeline = Pipeline(stages=[tokenizer, stopwords, tf, idf, normalizer, kmeans])

    model = kmeans_pipeline.fit(products_df)
    words_prediction = model.transform(products_df)
    model.save("./kmeans")  # the whole machine learning instance is saved in a folder
    return model, words_prediction
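
Because the fitted pipeline is saved as a folder, it can be reloaded later with PipelineModel.load. A minimal sketch, where new_products_df is an assumed DataFrame with the same "title" column as products_df:

from pyspark.ml import PipelineModel

# Reload the persisted KMeans pipeline and reuse it on new data.
reloaded = PipelineModel.load("./kmeans")
new_predictions = reloaded.transform(new_products_df)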
Example no. 4
def model(classifiers, training, testing, week):

    results = ""
    timing = []

    for classifier in classifiers:

        timeStart = time.time()

        clf = get_classifier(classifier)

        labelIndexer = StringIndexer(inputCol="label", outputCol="indexed")
        featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures")

        pipeline = Pipeline(stages=[labelIndexer, featureIndexer, clf])
        model = pipeline.fit(training)

        prediction = model.transform(testing)

        metrics = BinaryClassificationMetrics(prediction.select("label","prediction").rdd)

        results = results + "new," + classifier + "," + week + "," + str(metrics.areaUnderROC) + "," +str(metrics.areaUnderPR) + "\n"

        timing.append(time.time()-timeStart)

    return results, timing
Example no. 5
def main(sc, spark):
    # Load the Corpus
    corpus = load_corpus(sc, spark)

    # Create the vector/cluster pipeline
    pipeline = Pipeline(stages=[
        Tokenizer(inputCol="text", outputCol="tokens"),
        Word2Vec(vectorSize=7, minCount=0, inputCol="tokens", outputCol="vecs"),
        BisectingKMeans(k=10, featuresCol="vecs", maxIter=10),
    ])

    # Fit the model
    model = pipeline.fit(corpus)
    corpus = model.transform(corpus)

    # Evaluate clustering.
    bkm = model.stages[-1]
    cost = bkm.computeCost(corpus)
    sizes = bkm.summary.clusterSizes

    # TODO: compute cost of each cluster individually
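
    # A sketch for the TODO above (not part of the original): per-cluster sum of squared
    # distances, using the fitted centers and the clustering's default "prediction" column.
    from pyspark.sql.functions import col, udf
    from pyspark.sql.types import DoubleType

    centers = bkm.clusterCenters()
    sqdist = udf(lambda v, cid: float(v.squared_distance(centers[cid])), DoubleType())
    per_cluster_cost = (corpus
                        .withColumn("sqdist", sqdist(col("vecs"), col("prediction")))
                        .groupBy("prediction")
                        .sum("sqdist"))
    per_cluster_cost.show()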

    # Get the text representation of each cluster.
    wvec = model.stages[-2]
    table = [["Cluster", "Size", "Terms"]]
    for ci, c in enumerate(bkm.clusterCenters()):
        ct = wvec.findSynonyms(c, 7)
        size = sizes[ci]
        terms = " ".join([row.word for row in ct.take(7)])
        table.append([ci, size, terms])

    # Print Results
    print(tabulate(table))
    print("Sum of square distance to center: {:0.3f}".format(cost))
Example no. 6
 def testLogisticMLPipeline1(self):
     training = sqlCtx.createDataFrame([
         ("a b c d e spark", 1.0),
         ("b d", 2.0),
         ("spark f g h", 1.0),
         ("hadoop mapreduce", 2.0),
         ("b spark who", 1.0),
         ("g d a y", 2.0),
         ("spark fly", 1.0),
         ("was mapreduce", 2.0),
         ("e spark program", 1.0),
         ("a e c l", 2.0),
         ("spark compile", 1.0),
         ("hadoop software", 2.0)
         ], ["text", "label"])
     tokenizer = Tokenizer(inputCol="text", outputCol="words")
     hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20)
     lr = LogisticRegression(sqlCtx)
     pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
     model = pipeline.fit(training)
     test = sqlCtx.createDataFrame([
         ("spark i j k", 1.0),
         ("l m n", 2.0),
         ("mapreduce spark", 1.0),
         ("apache hadoop", 2.0)], ["text", "label"])
     result = model.transform(test)
     predictionAndLabels = result.select("prediction", "label")
     evaluator = MulticlassClassificationEvaluator()
     score = evaluator.evaluate(predictionAndLabels)
     self.failUnless(score == 1.0)
def main(input_file):
    # Load and parse the data file, converting it to a DataFrame.
    data = MLUtils.loadLabeledPoints(sc, input_file)

    # Automatically identify categorical features, and index them.
    # Set maxCategories so features with > 4 distinct values are treated as continuous.
    featureIndexer =\
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=10).fit(data)

    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    rf = RandomForestRegressor(featuresCol="indexedFeatures")

    # Chain indexer and forest in a Pipeline
    pipeline = Pipeline(stages=[featureIndexer, rf])

    # Train model.  This also runs the indexer.
    model = pipeline.fit(trainingData)

    # Make predictions.
    predictions = model.transform(testData)

    # Select example rows to display.
    predictions.select("prediction", "label", "features").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(
        labelCol="label", predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = {}".format(rmse))

    rfModel = model.stages[1]
    print(rfModel)  # summary only
    def test_nnclassifier_in_pipeline(self):

        if self.sc.version.startswith("1"):
            from pyspark.mllib.linalg import Vectors

            df = self.sqlContext.createDataFrame(
                [(Vectors.dense([2.0, 1.0]), 1.0),
                 (Vectors.dense([1.0, 2.0]), 2.0),
                 (Vectors.dense([2.0, 1.0]), 1.0),
                 (Vectors.dense([1.0, 2.0]), 2.0),
                 ], ["features", "label"])

            scaler = MinMaxScaler().setInputCol("features").setOutputCol("scaled")
            model = Sequential().add(Linear(2, 2))
            criterion = ClassNLLCriterion()
            classifier = NNClassifier(model, criterion, MLlibVectorToTensor([2]))\
                .setBatchSize(4) \
                .setLearningRate(0.01).setMaxEpoch(1).setFeaturesCol("scaled")

            pipeline = Pipeline(stages=[scaler, classifier])

            pipelineModel = pipeline.fit(df)

            res = pipelineModel.transform(df)
            assert type(res).__name__ == 'DataFrame'
Example no. 9
def run(start1, end1, start2, end2, df, sc, sql_context, is_pred):
    lp_data = get_labeled_points(start1, end2, df, sc, sql_context)
    print(lp_data.count())

    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(lp_data)
    td = labelIndexer.transform(lp_data)
    label2index = {}
    for each in  sorted(set([(i[0], i[1]) for i in td.select(td.label, td.indexedLabel).distinct().collect()]),
                key=lambda x: x[0]):
        label2index[int(each[0])] = int(each[1])
    print(label2index)

    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(lp_data)

    rf = get_model()

    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf])

    lp_train = lp_data.filter(lp_data.date3<end1).filter(lp_data.is_labeled == 1)
    model = pipeline.fit(lp_train)
    lp_check = lp_data.filter(lp_data.date2>start2)
    predictions = model.transform(lp_check)
    predictions = val(predictions, label2index, sql_context)

    if is_pred:
        predictions = predictions.filter(predictions.is_labeled ==0).filter(predictions.date2 == get_cur()).sort(predictions.prob.desc())
        dfToTableWithPar(sql_context, predictions, "predictions", get_cur())
        for each in predictions.take(10):
            print(each)
def RunRandomForest(tf, ctx):
	sqlContext = SQLContext(ctx)
	rdd = tf.map(parseForRandomForest)
	# The schema is encoded in a string.
	schema = ['genre', 'track_id', 'features']
	# Apply the schema to the RDD.
	songDF = sqlContext.createDataFrame(rdd, schema)

	# Register the DataFrame as a table.
	songDF.registerTempTable("genclass")
	labelIndexer = StringIndexer().setInputCol("genre").setOutputCol("indexedLabel").fit(songDF)

	trainingData, testData = songDF.randomSplit([0.8, 0.2])

	labelConverter = IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)

	rfc = RandomForestClassifier().setMaxDepth(10).setNumTrees(2).setLabelCol("indexedLabel").setFeaturesCol("features")
	#rfc = SVMModel([.5, 10, 20], 5)
	#rfc = LogisticRegression(maxIter=10, regParam=0.01).setLabelCol("indexedLabel").setFeaturesCol("features")

	pipeline = Pipeline(stages=[labelIndexer, rfc, labelConverter])
	model = pipeline.fit(trainingData)

	predictions = model.transform(testData)
	predictions.show()

	evaluator = MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("precision")
	accuracy = evaluator.evaluate(predictions)
	print('Accuracy of RandomForest = ', accuracy * 100)
	print("Test Error = ", (1.0 - accuracy) * 100)
    def test_cv_lasso_with_mllib_featurization(self):
        data = [('hi there', 0.0),
                ('what is up', 1.0),
                ('huh', 1.0),
                ('now is the time', 5.0),
                ('for what', 0.0),
                ('the spark was there', 5.0),
                ('and so', 3.0),
                ('were many socks', 0.0),
                ('really', 1.0),
                ('too cool', 2.0)]
        data = self.sql.createDataFrame(data, ["review", "rating"])

        # Feature extraction using MLlib
        tokenizer = Tokenizer(inputCol="review", outputCol="words")
        hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20000)
        pipeline = Pipeline(stages=[tokenizer, hashingTF])
        data = pipeline.fit(data).transform(data)

        df = self.converter.toPandas(data.select(data.features.alias("review"), "rating"))

        pipeline = SKL_Pipeline([
            ('lasso', SKL_Lasso(max_iter=1))
        ])
        parameters = {
            'lasso__alpha': (0.001, 0.005, 0.01)
        }

        grid_search = GridSearchCV(self.sc, pipeline, parameters)
        skl_gs = grid_search.fit(df.review.values, df.rating.values)
        assert len(skl_gs.cv_results_['params']) == len(parameters['lasso__alpha'])
def textPredict(request):
    """6.文本聚类,热度预测"""
    label = request.POST['label']
    title = request.POST['title']

    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    """处理数据集,生成特征向量"""
    dfTitles = sqlContext.read.parquet('data/roll_news_sina_com_cn.parquet')
    print(dfTitles.dtypes)
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(dfTitles)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.show()
    for features_label in rescaledData.select("features", "rawFeatures").take(3):
        print(features_label)
    """决策树模型培训"""
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
    featureIndexer =\
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(rescaledData)
    (trainingData, testData) = rescaledData.randomSplit([0.7, 0.3])
    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
    model = pipeline.fit(trainingData)
    """模型测试"""
    predictions = model.transform(testData)
    predictions.show()
    predictions.select("prediction", "indexedLabel", "features").show(5)
    """用户数据测试,单个新闻测试"""
    sentenceData = sqlContext.createDataFrame([
        (label,title),
    ],['label',"title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    print("==================================================")
    myprediction.show()
    resultList = convertDfToList(myprediction)

    """模型评估"""
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g " % (1.0 - accuracy))

    treeModel = model.stages[2]
    print(treeModel)

    sc.stop()
    return render(request,{'resultList':resultList})
def sparking_your_interest():
	df = SQLContext.read.json('speeches_dataset.json')
	df_fillna=df.fillna("")
	print(df_fillna.count())
	print(df_fillna.printSchema())

	df_utf=call_utf_encoder(df)
	df_cleaned=call_para_cleanup(df_utf)
	print(df_cleaned)
	df_with_bigrams = call_ngrams(df_cleaned, 2)
	df_with_trigrams = call_ngrams(df_with_bigrams, 3)
	df_with_4grams = call_ngrams(df_with_trigrams, 4)
	df_with_5grams = call_ngrams(df_with_4grams, 5)
	df_with_6grams = call_ngrams(df_with_5grams, 6)
	df_with_vocab_score = call_speech_vocab(df_with_6grams)

	df_with_2grams_idf_vectors = tf_feature_vectorizer(df_with_vocab_score,100,'2grams')
	df_with_3grams_idf_vectors = tf_feature_vectorizer(df_with_2grams_idf_vectors,100,'3grams')
	df_with_4grams_idf_vectors = tf_feature_vectorizer(df_with_3grams_idf_vectors,100,'4grams')
	assembler = VectorAssembler(
	    inputCols=["2gramsfeatures", "2gramsfeatures", "2gramsfeatures", "vocab_score"],
	    outputCol="features")
	assembler_output = assembler.transform(df_with_4grams_idf_vectors)
	output = assembler_output.selectExpr('speaker','speech_id','para_cleaned_text','features')
	print(output.show())
	print(output.count())

	output_tordd = output.rdd
	train_rdd,test_rdd = output_tordd.randomSplit([0.8, 0.2], 123)
	train_df = train_rdd.toDF()
	test_df = test_rdd.toDF()
	print(train_df)
	print(test_df)

	print('Train DF - Count: ')
	print(train_df.count())
	print('Test DF - Count: ')
	print(test_df.count())

	print("Initializing RF Model")
	labelIndexer = StringIndexer(inputCol="speaker", outputCol="indexedLabel").fit(train_df)       
	rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features",numTrees=1000, featureSubsetStrategy="auto", impurity='gini', maxDepth=4, maxBins=32)
	pipeline = Pipeline(stages=[labelIndexer,rf])
	model = pipeline.fit(output)
	print("Completed RF Model")

	predictions = model.transform(test_df)
	evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
	accuracy = evaluator.evaluate(predictions)
	print("Test Error = %g" % (1.0 - accuracy))
	rfModel = model.stages[1]
	print(rfModel)  # summary only
	print("Predictions: ")
	print(predictions.show())
Example no. 14
def model(classifier, ftrain, fvalid, fprediction):

    startTime = time.time()

    ctx = SparkContext(appName="model_on_Spark")
    sqlContext = SQLContext(ctx)
    logger = SparkLogger(ctx)
    logger.set_level('ERROR')

    # load and prepare training and validation data
    rawTrain, train = prepData(sqlContext, ctx, ftrain)
    rawValid, valid = prepData(sqlContext, ctx, fvalid)

    # an index column is needed to join the predictions back to the raw data later
    valid = indexData(valid)
    rawValid = indexData(rawValid)

    classifiers = {
        "RandomForestClassifier" : RFC
    }

    clf = classifiers[classifier]()

    labelIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures")

    # train and predict
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, clf])
    model = pipeline.fit(train)

    predictions = model.transform(valid)

    # write to file:

    subsetPrediction = predictions.select("prediction", "index")
    subsetValidData = rawValid.select("dataset", "index")

    output = (subsetValidData
               .join(subsetPrediction, subsetPrediction.index == subsetValidData.index)
                    .drop("index")
                    .drop("index"))

    lines = output.map(toCSVLine)
    lines.saveAsTextFile('output')

    evaluator = MulticlassClassificationEvaluator(
       labelCol="label", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print "Test Error = %g" % (1.0 - accuracy)

    executionTime = time.time() - startTime
    row=classifier+','+str(executionTime)
    ctx.parallelize([row]).saveAsTextFile("timing")
def event_pipeline(dataset):
    """
    """
    EventCodeI = StringIndexer(inputCol="EventCode", outputCol="EventCodeI")
    EventBaseCodeI = StringIndexer(inputCol="EventBaseCode", outputCol="EventBaseCodeI")
    EventRootCodeI = StringIndexer(inputCol="EventRootCode", outputCol="EventRootCodeI")
    assembler = VectorAssembler(inputCols=["IsRootEvent", "EventCodeI", "EventBaseCodeI","EventRootCodeI", "QuadClass","GoldsteinScale","NumMentions","NumSources","NumArticles","AvgTone"], outputCol="features")
    featureIndexer =\
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=310)
    pipeline = Pipeline(stages=[EventCodeI, EventBaseCodeI, EventRootCodeI,assembler,featureIndexer])
    model = pipeline.fit(dataset)
    output = model.transform(dataset)

    data = output.map(lambda row: LabeledPoint(row[0], row[-1])).cache()
    print "Data:"
    print data.take(1)
    return data
    def group(self):
        reTokenizer = RegexTokenizer(inputCol=self.query_colname, outputCol="words", minTokenLength=2) #, pattern='\W'
        hashingTF = HashingTF(numFeatures=self.num_features, inputCol="words", outputCol="tf")


        if self.idf == True:
            idf = IDF(minDocFreq=self.min_doc_freq, inputCol="tf", outputCol="idf")
            kmeans = KMeans(featuresCol="idf", predictionCol="cluster_id", k=self.n)
            pipeline = Pipeline(stages=[reTokenizer, hashingTF, idf, kmeans])

        else:
            kmeans = KMeans(featuresCol="tf", predictionCol="cluster_id", k=self.n)
            pipeline = Pipeline(stages=[reTokenizer, hashingTF, kmeans])

        model = pipeline.fit(self.df)
        prediction = model.transform(self.df)
        return prediction
Example no. 17
    def getPipeline(self, df):
        # notify pipeline 
        self.success('Initializing ML Pipeline ...')

        # initialize our tokenizer, we're going to tokenize features
        tokenizer = Tokenizer(inputCol='tag_features', outputCol='words')
        # convert the tokenize data to vectorize data
        hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol='features')
        # initialize logistic regression algorithm
        lr        = LogisticRegression(maxIter=10, regParam=0.01)
        # create / initialize the ml pipeline
        pipeline  = Pipeline(stages=[tokenizer, hashingTF, lr])

        # fit the pipeline on our training dataframe
        model = pipeline.fit(df)

        return model
def event_pipeline(dataset):
    EventCodeI = StringIndexer(inputCol="EventCode", outputCol="EventCodeI")
    EventCodeV = OneHotEncoder(dropLast=True, inputCol="EventCodeI", outputCol="EventCodeV")

    EventRootCodeI = StringIndexer(inputCol="EventRootCode", outputCol="EventRootCodeI")
    EventRootCodeV = OneHotEncoder(dropLast=True, inputCol="EventRootCodeI", outputCol="EventRootCodeV")

    EventBaseCodeI = StringIndexer(inputCol="EventBaseCode", outputCol="EventBaseCodeI")
    EventBaseCodeV = OneHotEncoder(dropLast=True, inputCol="EventBaseCodeI", outputCol="EventBaseCodeV")

    assembler = VectorAssembler(inputCols=["IsRootEvent", "EventCodeV", "EventBaseCodeV","EventRootCodeV", "QuadClass","GoldsteinScale","NumMentions","NumSources","NumArticles","AvgTone"], outputCol="features")

    pipeline = Pipeline(stages=[EventCodeI, EventCodeV, EventRootCodeI, EventRootCodeV,EventBaseCodeI,EventBaseCodeV,assembler])

    model = pipeline.fit(dataset)
    output = model.transform(dataset)
    data = output.map(lambda row: LabeledPoint(row[0], row[-1])).toDF().cache()
    return data
Example no. 19
    def test_pipeline_persistence(self):
        sqlContext = SQLContext(self.sc)
        temp_path = tempfile.mkdtemp()

        try:
            df = sqlContext.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
            tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
            pca = PCA(k=2, inputCol="features", outputCol="pca_features")
            pl = Pipeline(stages=[tf, pca])
            model = pl.fit(df)
            pipeline_path = temp_path + "/pipeline"
            pl.save(pipeline_path)
            loaded_pipeline = Pipeline.load(pipeline_path)
            self.assertEqual(loaded_pipeline.uid, pl.uid)
            self.assertEqual(len(loaded_pipeline.getStages()), 2)

            [loaded_tf, loaded_pca] = loaded_pipeline.getStages()
            self.assertIsInstance(loaded_tf, HashingTF)
            self.assertEqual(loaded_tf.uid, tf.uid)
            param = loaded_tf.getParam("numFeatures")
            self.assertEqual(loaded_tf.getOrDefault(param), tf.getOrDefault(param))

            self.assertIsInstance(loaded_pca, PCA)
            self.assertEqual(loaded_pca.uid, pca.uid)
            self.assertEqual(loaded_pca.getK(), pca.getK())

            model_path = temp_path + "/pipeline-model"
            model.save(model_path)
            loaded_model = PipelineModel.load(model_path)
            [model_tf, model_pca] = model.stages
            [loaded_model_tf, loaded_model_pca] = loaded_model.stages
            self.assertEqual(model_tf.uid, loaded_model_tf.uid)
            self.assertEqual(model_tf.getOrDefault(param), loaded_model_tf.getOrDefault(param))

            self.assertEqual(model_pca.uid, loaded_model_pca.uid)
            self.assertEqual(model_pca.pc, loaded_model_pca.pc)
            self.assertEqual(model_pca.explainedVariance, loaded_model_pca.explainedVariance)
        finally:
            try:
                rmtree(temp_path)
            except OSError:
                pass
def build_decision_tree(sqlContext, features, interested):
	print('-----------------------------------------')
	data = sqlContext.createDataFrame(
			[Row(label=interested[i], features=Vectors.dense(features[i])) for i in range(len(features))])
	data.printSchema()
	data.show(5)
	print('created data frame')

	# Index the label column and add metadata.
	labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
	print('created label indexer')

	# Mark the features with < 4 distinct values as categorical
	featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

	# Split the data into training and test sets
	(trainingData, testData) = data.randomSplit([0.8, 0.2])

	# Train a DecisionTree model
	dt = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
#	dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
#	dt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=10)

	# Chain the indexers together with DecisionTree
	pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

	# Train the model
	model = pipeline.fit(trainingData)

	# Make predictions
	predictions = model.transform(testData)

	predictions.select("prediction", "indexedLabel", "features").show(5)

	# Select (prediction, true label) & compute test error
	evaluator = MulticlassClassificationEvaluator(
			labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
	precision = evaluator.evaluate(predictions)

	treeModel = model.stages[2]
	return (1 - precision, model)
    def test_featurizer_in_pipeline(self):
        """
        Tests that featurizer fits into an MLlib Pipeline.
        Does not test how good the featurization is for generalization.
        """
        featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                         modelName=self.name)
        lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")
        pipeline = Pipeline(stages=[featurizer, lr])

        # add arbitrary labels to run logistic regression
        # TODO: it's weird that the test fails on some combinations of labels. check why.
        label_udf = udf(lambda x: abs(hash(x)) % 2, IntegerType())
        train_df = self.imageDF.withColumn("label", label_udf(self.imageDF["filePath"]))

        lrModel = pipeline.fit(train_df)
        # see if we at least get the training examples right.
        # with 5 examples and e.g. 131k features (for InceptionV3), it ought to.
        pred_df_collected = lrModel.transform(train_df).collect()
        for row in pred_df_collected:
            self.assertEqual(int(row.prediction), row.label)
Example no. 22
 def test_pipeline(self):
     dataset = MockDataset()
     estimator0 = MockEstimator()
     transformer1 = MockTransformer()
     estimator2 = MockEstimator()
     transformer3 = MockTransformer()
     pipeline = Pipeline(stages=[estimator0, transformer1, estimator2, transformer3])
     pipeline_model = pipeline.fit(dataset, {estimator0.fake: 0, transformer1.fake: 1})
     model0, transformer1, model2, transformer3 = pipeline_model.stages
     self.assertEqual(0, model0.dataset_index)
     self.assertEqual(0, model0.getFake())
     self.assertEqual(1, transformer1.dataset_index)
     self.assertEqual(1, transformer1.getFake())
     self.assertEqual(2, dataset.index)
     self.assertIsNone(model2.dataset_index, "The last model shouldn't be called in fit.")
     self.assertIsNone(transformer3.dataset_index, "The last transformer shouldn't be called in fit.")
     dataset = pipeline_model.transform(dataset)
     self.assertEqual(2, model0.dataset_index)
     self.assertEqual(3, transformer1.dataset_index)
     self.assertEqual(4, model2.dataset_index)
     self.assertEqual(5, transformer3.dataset_index)
     self.assertEqual(6, dataset.index)
def main():
    '''
    takes one input argument :: Location of the directory for training and test data files.
    :return: Print output on console for the area under the ROC curve.
    '''

    conf = SparkConf().setAppName("MLPipeline")
    sc = SparkContext(conf=conf)

    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet("20news_train.parquet")

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # Fit the pipeline to training data.
    model = pipeline.fit(trainDF)

    numFeatures = (1000, 5000, 10000)
    regParam = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
    paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, numFeatures).addGrid(lr.regParam, regParam).build()


    cv = CrossValidator().setEstimator(pipeline).setEvaluator(BinaryClassificationEvaluator()).setEstimatorParamMaps(paramGrid).setNumFolds(2)

    # Evaluate the model on testing data
    testDF = sqlCt.read.parquet("20news_test.parquet")
    prediction = model.transform(testDF)
    evaluator = BinaryClassificationEvaluator()


    model_cv = cv.fit(trainDF)
    prediction_cv = model_cv.transform(testDF)
    print(evaluator.evaluate(prediction))
    print(evaluator.evaluate(prediction_cv))
def main():
    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet(training_input)
    testDF = sqlCt.read.parquet(testing_input)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    evaluator = BinaryClassificationEvaluator()

    # no parameter tuning
    hashingTF_notuning = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr_notuning = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline_notuning = Pipeline(stages=[tokenizer, hashingTF_notuning, lr_notuning])
    model_notuning = pipeline_notuning.fit(trainDF)

    prediction_notuning = model_notuning.transform(testDF)
    notuning_output = evaluator.evaluate(prediction_notuning)

    # for cross validation
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=20)

    paramGrid = ParamGridBuilder()\
        .addGrid(hashingTF.numFeatures, [1000, 5000, 10000])\
        .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])\
        .build()

    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=2)
    cvModel = cv.fit(trainDF)

    # Make predictions on test documents. cvModel uses the best model found.
    best_prediction = cvModel.transform(testDF)
    best_output = evaluator.evaluate(best_prediction)

    s = str(notuning_output) + '\n' + str(best_output)
    output_data = sc.parallelize([s])
    output_data.saveAsTextFile(output)
def build_ngrams_wocs(inputCol=["Text","Sentiment"], n=3):
    tokenizer = [Tokenizer(inputCol="Text", outputCol="words")]
    ngrams = [
        NGram(n=i, inputCol="words", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]

    cv = [
        CountVectorizer(vocabSize=5460,inputCol="{0}_grams".format(i),
            outputCol="{0}_tf".format(i))
        for i in range(1, n + 1)
    ]
    idf = [IDF(inputCol="{0}_tf".format(i), outputCol="{0}_tfidf".format(i), minDocFreq=5) for i in range(1, n + 1)]

    assembler = [VectorAssembler(
        inputCols=["{0}_tfidf".format(i) for i in range(1, n + 1)],
        outputCol="features"
    )]
    label_stringIdx = [StringIndexer(inputCol = "Sentiment", outputCol = "label")]
    lr = [LogisticRegression(maxIter=100)]
    return Pipeline(stages=tokenizer + ngrams + cv + idf+ assembler + label_stringIdx+lr)
    pipeline = Pipeline(stages=[tokenizer, ngrams, cv, idf,assembler, label_stringIdx])
    pipelineFit = pipeline.fit(df)
    dataset = pipelineFit.transform(df)



document_assembler = DocumentAssembler()\
 .setInputCol("text")\
 .setOutputCol("document")
sentenceDetector = SentenceDetector()\
 .setInputCols(["document"])\
 .setOutputCol("sentences")
tokenizer = Tokenizer() \
 .setInputCols(["sentences"]) \
 .setOutputCol("token")
normalizer = Normalizer()\
 .setInputCols(["token"])\
 .setOutputCol("normal")
word_embeddings = WordEmbeddingsModel.pretrained()\
 .setInputCols(["document", "normal"])\
 .setOutputCol("embeddings")
nlpPipeline = Pipeline(stages=[
 document_assembler, 
 sentenceDetector,
 tokenizer,
 normalizer,
 word_embeddings,
 ])
pipelineModel = nlpPipeline.fit(df)


### LightPipeline(someTrainedPipeline).annotate(someStringOrArray)
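
The line above refers to Spark NLP's LightPipeline, a lightweight wrapper for fast, driver-local inference on a fitted pipeline. A minimal usage sketch against the pipelineModel fitted above:

from sparknlp.base import LightPipeline

# annotate() accepts a single string or a list of strings.
light = LightPipeline(pipelineModel)
result = light.annotate("Spark NLP makes pipelines easy to use.")
print(result.keys())  # one entry per annotator output column, e.g. "token", "embeddings"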
Example no. 27
    for indexer in indexers
]
assembler_onehot = ft.VectorAssembler(
    inputCols=[encoder.getOutputCol() for encoder in encoders],
    outputCol="onehot_features")

#scaler
assembler_numeric = ft.VectorAssembler(inputCols=numeric_features,
                                       outputCol="numeric_features")
std_scaler = ft.StandardScaler(inputCol="numeric_features",
                               outputCol="numeric_features_scaled")

assembler_final = ft.VectorAssembler(
    inputCols=['onehot_features', 'numeric_features_scaled'],
    outputCol="final_features")

pca_model = ft.PCA(k=6, inputCol="final_features", outputCol="pca_features")

pipeline = Pipeline(stages=indexers + encoders + [
    assembler_onehot, assembler_numeric, std_scaler, assembler_final, pca_model
])
preprocess_model = pipeline.fit(df)
scaledData = preprocess_model.transform(df)

# Save and load the model
from pyspark.ml import PipelineModel
outpath = "/dbfs/classification_models/model-maxDepth{}-maxBins{}".format(
    MAXDEPTH, MAXBINS)

pipelineModel.write().overwrite().save(outpath)
model_in = PipelineModel.load(outpath)
Example no. 28
resultF = result.select("result", "OpenStatus_cat")
resultF.show()

# In[16]:

final_data = resultF.select('result', 'OpenStatus_cat')
train_data, test_data = final_data.randomSplit([0.7, 0.3])
train_data.describe().show()

# In[17]:

dt = DecisionTreeClassifier(labelCol="OpenStatus_cat", featuresCol="result")

pipeline = Pipeline(stages=[dt])

model = pipeline.fit(train_data)

# In[18]:

predictions = model.transform(test_data)

predictions.select("prediction", "OpenStatus_cat", "result").show(5)

evaluator = MulticlassClassificationEvaluator(labelCol="OpenStatus_cat",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy))

treeModel = model.stages[0]
Example no. 29
encoded_cols = ["job", "marital", "education", "month", "pdays"]
indexers = [
    StringIndexer().setInputCol(col).setOutputCol(col + "_labeled")
    for col in labeled_cols + encoded_cols
]

# One Hot Encoders
encoders = [
    OneHotEncoder(inputCol=col + "_labeled", outputCol=col + "_encoded")
    for col in encoded_cols
]

# Create the preprocessing pipeline
input_stages = assemblers + scalers + indexers + encoders
pipeline = Pipeline(stages=input_stages)
preprocessor = pipeline.fit(df)
df = preprocessor.transform(df)

# Remove unnecessary columns
labeled_cols += [s + "_labeled" for s in encoded_cols]
feature_list += [s + "_vec" for s in feature_list]
df = df.drop(*feature_list, *labeled_cols, *encoded_cols, "features")

# Write the pre-processed data to a csv file
df.write.csv("bank-processed.csv", header=True)

# Train test split
(trainingData, testData) = df.randomSplit([0.8, 0.2])

# Create a vector for all the features
features = [col for col in df.columns if col != "y_labeled"]
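
The snippet stops after collecting the feature column names; a minimal sketch of the assembling step the comment points toward (the downstream use is an assumption, not part of the original):

from pyspark.ml.feature import VectorAssembler

# Assemble every remaining column except the label into a single feature vector.
assembler = VectorAssembler(inputCols=features, outputCol="features")
trainingData = assembler.transform(trainingData)
testData = assembler.transform(testData)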
def run_pipeline(target, feature, num, cat, n_feat):
    train = spark.table('predictmodel_train')        
    train.persist()
    stages = []
    
    indexer = [StringIndexer(inputCol=s, outputCol=s+'_indexed',handleInvalid='keep') for s in cat]
    encoder = [OneHotEncoderEstimator(inputCols=[s+'_indexed'],outputCols=[s+"_encoded"],handleInvalid='keep') for s in cat]
        
    imputer = [Imputer(strategy='mean',inputCols=num, outputCols=num)]
    
    num_assmblr = [VectorAssembler(inputCols=[n], outputCol=n+'_vect') for n in num]
    num_scaler = [MinMaxScaler(inputCol=n+'_vect', outputCol=n+"_scaled") for n in num]
    
#    pipeline_num = Pipeline(stages=indexer + encoder + imputer + num_assmblr + num_scaler)
#    train = pipeline_num.fit(train).transform(train)

#    print("*** show encoded categorical variables ....")
#    train.select(*[s+'_encoded' for s in cat]).show(10, truncate=False)
    
#    unpack_list = F.udf(lambda x: round(float(list(x)[0]),3), DoubleType())
#    for n in num:
#        train = train.withColumn(n+"_scaled", unpack_list(n+"_scaled")).drop(n+"_vect") 
#    print("*** show scaled numeric variables ....")
#    train.select(*[n+'_scaled' for n in num]).summary("count", "min", "25%", "75%", "max").show(10, truncate=False)
    
#    assembler = VectorAssembler(inputCols=[num_scaler.getOutputCol()] + [s+"_encoded" for s in cat], outputCol=feature)
    assembler = VectorAssembler(inputCols=[n+'_scaled' for n in num] + [s+"_encoded" for s in cat], outputCol=feature)
    
    target_indexed = target+"_indx"
    labelIndexer = StringIndexer(inputCol = target, outputCol = target_indexed, handleInvalid = 'keep')   
    
    model = clf_rf(feature, target_indexed)
#    model = clf_gbt(feature, target)
#    model = clf_lr(feature, target)
    validator = _val(target_indexed, model)
    
    stages += [assembler, labelIndexer, validator]
    print('*** stages are created and now are running... ***')
    
    pipeline = Pipeline(stages=indexer + encoder + imputer + num_assmblr + num_scaler + stages)
    pipeline_model = pipeline.fit(train)
    last_stage = pipeline_model.stages[-1]
    transformedData = pipeline_model.transform(train)
    
    transformedData.write.mode('overwrite').saveAsTable('us_marketing_usecase.transformedData')
    print('*** transformed data is saved for modeling... ***')
    
#    print('*** Transformed training set ***')
#    cols = num + cat
#    transformedData.select(target_indexed,feature,*cols).show(10, truncate=False)
# =============================================================================
# RandomForest/GradientBoosting
# =============================================================================
    print('*** Model performance ***')
    evaluate(transformedData,target_indexed)
    
    print('*** Feature Importances ***')
    featImp = last_stage.bestModel.featureImportances
    
    print('*** Show important ' + str(n_feat) +  ' features ***')
    list_extract = []
    for i in transformedData.schema['features'].metadata["ml_attr"]["attrs"]:
        list_extract = list_extract + transformedData.schema['features'].metadata["ml_attr"]["attrs"][i]
    
    pd.set_option('display.max_rows', 500)
    varlist = pd.DataFrame(list_extract)
    varlist['score'] = varlist['idx'].apply(lambda x:featImp[x])
    selected_feat = varlist.sort_values('score', ascending=False)
    
    print(selected_feat.iloc[0:n_feat, :])
    
    # Get the best hyperparameters:
    print('MaxDepth: ' + str(last_stage.bestModel._java_obj.getMaxDepth()))
    print('NumTrees: ' + str(last_stage.bestModel.getNumTrees))
    
# =============================================================================
# Logistic Regression
# =============================================================================
#    print('*** Model performance ***')
#    evaluate(transformedData,target)
#    
#    print('*** Model feature attributes ***')
#    trainingSummary = last_stage.bestModel.summary
#    trainingSummary.roc.show()
#    print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

# =============================================================================
# Prediction and Evaluation
# =============================================================================
    predicted = predict(pipeline_model,target_indexed)
    evaluate(predicted,target_indexed)
    
    train.unpersist()
Example no. 31
def main():
    parser = argparse.ArgumentParser(description="app inputs and outputs")
    parser.add_argument("--s3_input_bucket", type=str, help="s3 input bucket")
    parser.add_argument("--s3_input_key_prefix",
                        type=str,
                        help="s3 input key prefix")
    parser.add_argument("--s3_output_bucket",
                        type=str,
                        help="s3 output bucket")
    parser.add_argument("--s3_output_key_prefix",
                        type=str,
                        help="s3 output key prefix")
    args = parser.parse_args()

    spark = SparkSession.builder.appName("PySparkApp").getOrCreate()

    # This is needed to save RDDs, which is the only way to write nested DataFrames in CSV format
    spark.sparkContext._jsc.hadoopConfiguration().set(
        "mapred.output.committer.class",
        "org.apache.hadoop.mapred.FileOutputCommitter")

    # Defining the schema corresponding to the input data. The input data does not contain the headers
    schema = StructType([
        StructField("sex", StringType(), True),
        StructField("length", DoubleType(), True),
        StructField("diameter", DoubleType(), True),
        StructField("height", DoubleType(), True),
        StructField("whole_weight", DoubleType(), True),
        StructField("shucked_weight", DoubleType(), True),
        StructField("viscera_weight", DoubleType(), True),
        StructField("shell_weight", DoubleType(), True),
        StructField("rings", DoubleType(), True)
    ])

    # Downloading the data from S3 into a Dataframe
    total_df = spark.read.csv(('s3://' + os.path.join(
        args.s3_input_bucket, args.s3_input_key_prefix, 'abalone.csv')),
                              header=False,
                              schema=schema)

    # StringIndexer on the sex column, which has categorical values
    sex_indexer = StringIndexer(inputCol="sex", outputCol="indexed_sex")

    #one-hot-encoding is being performed on the string-indexed sex column (indexed_sex)
    sex_encoder = OneHotEncoder(inputCol="indexed_sex", outputCol="sex_vec")

    #vector-assembler will bring all the features to a 1D vector for us to save easily into CSV format
    assembler = VectorAssembler(inputCols=[
        "sex_vec", "length", "diameter", "height", "whole_weight",
        "shucked_weight", "viscera_weight", "shell_weight"
    ],
                                outputCol="features")

    # The pipeline comprises the steps added above
    pipeline = Pipeline(stages=[sex_indexer, sex_encoder, assembler])

    # This step trains the feature transformers
    model = pipeline.fit(total_df)

    # This step transforms the dataset with information obtained from the previous fit
    transformed_total_df = model.transform(total_df)

    # Split the overall dataset into 80-20 training and validation
    (train_df, validation_df) = transformed_total_df.randomSplit([0.8, 0.2])

    # Convert the train dataframe to RDD to save in CSV format and upload to S3
    train_rdd = train_df.rdd.map(lambda x: (x.rings, x.features))
    train_lines = train_rdd.map(csv_line)
    train_lines.saveAsTextFile('s3://' + os.path.join(
        args.s3_output_bucket, args.s3_output_key_prefix, 'train'))

    # Convert the validation dataframe to RDD to save in CSV format and upload to S3
    validation_rdd = validation_df.rdd.map(lambda x: (x.rings, x.features))
    validation_lines = validation_rdd.map(csv_line)
    validation_lines.saveAsTextFile('s3://' + os.path.join(
        args.s3_output_bucket, args.s3_output_key_prefix, 'validation'))
Example no. 32
indexers = [StringIndexer(inputCol=x, outputCol=x + '_tmp') for x in cols_now]

encoders = [
    OneHotEncoder(dropLast=False, inputCol=x + "_tmp", outputCol=y)
    for x, y in zip(cols_now, cols_now1)
]
tmp = [[i, j] for i, j in zip(indexers, encoders)]
tmp = [i for sublist in tmp for i in sublist]

assembler_features = VectorAssembler(inputCols=cols_now, outputCol='features')
labelIndexer = StringIndexer(inputCol='binary_response', outputCol='label')
tmp += [assembler_features, labelIndexer]
pipeline = Pipeline(stages=tmp)

allData = pipeline.fit(joinrdd).transform(joinrdd)
allData.cache()
trainingData, testData = allData.randomSplit([0.8, 0.2], seed=0)

# COMMAND ----------

# COMMAND ----------
# Decision Tree
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
model = dt.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)
Example no. 33
# We can now define our classifier and pipeline. With this done, we can split our labeled data into train and test sets and fit a model.
#
# To train the decision tree, give it the feature vector column and the label column.
#
# The Pipeline is defined by its stages: index the plan column, index the label column, assemble the feature vectors, then define the decision tree (a sketch of those earlier stages follows below).
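
# The plan_indexer, label_indexer, and assembler stages referenced here are defined
# earlier in the original notebook. A minimal sketch of what they might look like,
# with assumed (hypothetical) column names:

from pyspark.ml.feature import StringIndexer, VectorAssembler

plan_indexer = StringIndexer(inputCol='intl_plan', outputCol='intl_plan_indexed')  # assumed column
label_indexer = StringIndexer(inputCol='churned', outputCol='label')               # assumed column
assembler = VectorAssembler(
    inputCols=['intl_plan_indexed', 'account_length', 'total_day_calls'],          # assumed columns
    outputCol='features')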

from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier

classifier = DecisionTreeClassifier(labelCol='label', featuresCol='features')

pipeline = Pipeline(
    stages=[plan_indexer, label_indexer, assembler, classifier])

(train, test) = churn_data.randomSplit([0.7, 0.3])
model = pipeline.fit(train)

# ## Model Evaluation
#
# The most important question to ask:
#
#     Is my predictor better than random guessing?
#
# How do we quantify that?

# Measure the area under the ROC curve, abbreviated to AUROC.
#
# The ROC curve plots the True Positive Rate against the False Positive Rate for a binary classification system.
#
# [More Info](https://en.wikipedia.org/wiki/Receiver_operating_characteristic)
#
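
# A minimal sketch of that evaluation with BinaryClassificationEvaluator; the column
# names are Spark ML defaults and assume the pipeline above left them unchanged.

from pyspark.ml.evaluation import BinaryClassificationEvaluator

predictions = model.transform(test)
evaluator = BinaryClassificationEvaluator(labelCol='label',
                                          rawPredictionCol='rawPrediction',
                                          metricName='areaUnderROC')
print('AUROC = %g' % evaluator.evaluate(predictions))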
Example no. 34
from pyspark.ml.pipeline import Pipeline

labelPredDF1 = labelPredDF.withColumn(
    "proportion",
    (col("pos")) / (col("neg") + col("neu") / 3 +
                    .000000001))  # When include neu in numerator, dominates it
trainDF, testDF = labelPredDF1.randomSplit([.8, .2], seed=42)

rf = RFormula(formula="label ~ neg + pos + neu + compound")
lr = LogisticRegression(fitIntercept=True)
ir = IsotonicRegression(featuresCol='proportion',
                        predictionCol='prediction',
                        isotonic=True)
pipeline = Pipeline(stages=[ir])

pipelineModel = pipeline.fit(trainDF)
testPredDF = pipelineModel.transform(testDF)

# COMMAND ----------

display(testPredDF)

# COMMAND ----------

# DBTITLE 1,Logistic Regression Model
from pyspark.ml.feature import RFormula
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.regression import IsotonicRegression
from pyspark.ml.pipeline import Pipeline

trainDF, testDF = labelPredDF.randomSplit([.8, .2], seed=42)
Example no. 35
conf = SparkConf().setAppName("pet_adoption").setMaster("yarn")
sc = SparkContext(conf=conf)
# sc = init_nncontext("HowCute_train")
sqlCtx = SQLContext(sc)
df = sqlCtx.read.csv('hdfs:///project_data/pets/train/train.csv', header=True, inferSchema='True').drop('Name').drop(
    'State')
df_test = sqlCtx.read.csv('hdfs:///project_data/pets/train/train.csv', header=True, inferSchema='True').drop(
    'Name').drop('State')
# spark = SparkSession.builder.appName("pet_adoption").getOrCreate()
##pandas frame is easier to read
# df_pd.drop('Name', axis=1, inplace=True)
input_cols = [a for a, b in df.dtypes if b == 'int']
indexers = [StringIndexer(inputCol=column, outputCol=column + "_index").fit(df) for column in ["AdoptionSpeed"]]
pipeline = Pipeline(stages=indexers)
df = pipeline.fit(df).transform(df)
df_test = pipeline.fit(df_test).transform(df_test)

feature = VectorAssembler(inputCols=input_cols, outputCol="features")
feature_vector = feature.transform(df)

feature_vector_test = feature.transform(df_test)
(trainingData, testData) = feature_vector.randomSplit([0.8, 0.2], seed=11)
testData.printSchema()
#testData.show(10)
lr = DecisionTreeClassifier(labelCol="AdoptionSpeed_index", featuresCol="features")
lrModel = lr.fit(trainingData)
lrModel.write().overwrite().save("hdfs:///treemodelofcsv")
modelloaded = DecisionTreeClassificationModel.load("hdfs:///treemodelofcsv")
lr_prediction = modelloaded.transform(testData)
# lr_prediction.select("prediction", "Survived", "features").show()
Example no. 36
                   subsamplingRate=1.0)
rf = RandomForestClassifier(featuresCol="features",
                            labelCol="label",
                            predictionCol="prediction",
                            probabilityCol="probability",
                            rawPredictionCol="rawPrediction",
                            maxDepth=5,
                            maxBins=32,
                            minInstancesPerNode=1,
                            minInfoGain=0.0,
                            maxMemoryInMB=256,
                            cacheNodeIds=False,
                            checkpointInterval=10,
                            impurity="gini",
                            numTrees=20,
                            featureSubsetStrategy="auto",
                            seed=12345)
pipe = Pipeline(stages=[featurizer, gb])
pipe_model = pipe.fit(train_df)

predictions = pipe_model.transform(test_df)

predictions.select("filePath", "prediction").show(10, False)

predictionAndLabels = predictions.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Training set accuracy = " +
      str(evaluator.evaluate(predictionAndLabels)))

#ZEND
Example no. 37
# Prepare training documents from a list of (id, text, label) tuples.
training = spark.createDataFrame([
    (0, "a b c d e spark", 1.0),
    (1, "b d", 0.0),
    (2, "spark f g h", 1.0),
    (3, "hadoop mapreduce", 0.0)
], ["id", "text", "label"])

# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.001)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# Fit the pipeline to training documents.
model = pipeline.fit(training)

# Prepare test documents, which are unlabeled (id, text) tuples.
test = spark.createDataFrame([
    (4, "spark i j k"),
    (5, "l m n"),
    (6, "spark hadoop spark"),
    (7, "apache hadoop")
], ["id", "text"])

# Make predictions on test documents and print columns of interest.
prediction = model.transform(test)
selected = prediction.select("id", "text", "probability", "prediction")
for row in selected.collect():
    rid, text, prob, prediction = row  # type: ignore
    print(
        "(%d, %s) --> prob=%s, prediction=%f" % (rid, text, str(prob), prediction)
    )
Example no. 38
exploder = TileExploder()

# To "vectorize" the the band columns we use the SparkML `VectorAssembler`
assembler = VectorAssembler() \
    .setInputCols(bandColNames) \
    .setOutputCol("features")

# Configure our clustering algorithm
k = 5
kmeans = KMeans().setK(k)

# Combine the stages into a single pipeline
pipeline = Pipeline().setStages([exploder, assembler, kmeans])

# Compute clusters
model = pipeline.fit(joinedRF)

# Run the data through the model to assign cluster IDs to each
clustered = model.transform(joinedRF)
clustered.show(8)

# If we want to inspect the model statistics, the SparkML API requires us to go
# through this unfortunate contortion:
clusterResults = list(
    filter(lambda x: str(x).startswith('KMeans'), model.stages))[0]

# Compute sum of squared distances of points to their nearest center
metric = clusterResults.computeCost(clustered)
print("Within set sum of squared errors: %s" % metric)

tlm = joinedRF.tileLayerMetadata()
Example no. 39
    # compose a pipeline that includes feature transform, pretrained model and Logistic Regression
    transformer = ChainedPreprocessing([
        RowToImageFeature(),
        ImageResize(256, 256),
        ImageCenterCrop(224, 224),
        ImageChannelNormalize(123.0, 117.0, 104.0),
        ImageMatToTensor(),
        ImageFeatureToTensor()
    ])

    preTrainedNNModel = NNModel(Model.loadModel(model_path), transformer) \
        .setFeaturesCol("image") \
        .setPredictionCol("embedding")

    lrModel = Sequential().add(Linear(1000, 2)).add(LogSoftMax())
    classifier = NNClassifier(lrModel, ClassNLLCriterion(), SeqToTensor([1000])) \
        .setLearningRate(0.003).setBatchSize(40).setMaxEpoch(20).setFeaturesCol("embedding")

    pipeline = Pipeline(stages=[preTrainedNNModel, classifier])

    catdogModel = pipeline.fit(trainingDF)
    predictionDF = catdogModel.transform(validationDF).cache()
    predictionDF.show()

    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictionDF)
    # expected error should be less than 10%
    print("Test Error = %g " % (1.0 - accuracy))
Example no. 40
trainer = ADAG(keras_model=model, worker_optimizer='adam', loss='categorical_crossentropy',
               num_workers=1, batch_size=100, communication_window=5, num_epoch=50,
               features_col="matrix", label_col="label_encoded"
               )
trained_model = trainer.train(training_set)
from distkeras.predictors import *
from distkeras.transformers import *
from distkeras.evaluators import *
from distkeras.utils import *

print("Training time: " + str(trainer.get_training_time()))
print("Accuracy: " + str(evaluate_accuracy(trained_model, test_set)))
print("Number of parameter server updates: " + str(trainer.parameter_server.num_updates))

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[string_indexer, scaler, trainer_model])

from pyspark.mllib.evaluation import MulticlassMetrics

fitted_pipeline = pipeline.fit(dataset_train) # Fit model to data

prediction = fitted_pipeline.transform(dataset_train) # Evaluate on train data.
# prediction = fitted_pipeline.transform(test_df) # <-- The same code evaluates test data.
pnl = prediction.select("index_category", "prediction")
pnl.show(100)

prediction_and_label = pnl.map(lambda row: (row.index_category, row.prediction))
metrics = MulticlassMetrics(prediction_and_label)
print(metrics.precision())
    stop_words_remover = StopWordsRemover(
        inputCol="words", outputCol="filtered").setStopWords(stop_words)

    # bag of words count
    count_vectors = CountVectorizer(inputCol="filtered",
                                    outputCol="features",
                                    vocabSize=10000,
                                    minDF=5)
    label_string_index = StringIndexer(inputCol="category", outputCol="label")
    label_string_index.setHandleInvalid("keep")

    pipeline = Pipeline(stages=[
        regex_tokenizer, stop_words_remover, count_vectors, label_string_index
    ])
    (training_data, test_data) = df.randomSplit([0.8, 0.2], seed=100)
    pipeline_fit = pipeline.fit(training_data)
    pipeline_fit.save("lr_pipeline")

    training_data_set = pipeline_fit.transform(training_data)
    training_data_set.show(5)

    # stages = pipeline_fit.stages
    # vec = [s for s in stages if isinstance(s, CountVectorizerModel)]
    # v1 = vec[0].vocabulary
    # print(len(v1))

    print("Training: " + str(training_data_set.count()))
    print("Test: " + str(test_data.count()))

    lr = LogisticRegression(maxIter=100, regParam=0.2, elasticNetParam=0)
    lr_model = lr.fit(training_data_set)
    ).toDF()

df_new = sc.parallelize(
    [
        Row(p=u'p1', owner=u'u1', f1=0.1, f2=0.3, f3=0.5),
        Row(p=u'p2', owner=u'u1', f1=0.3, f2=0.5, f3=0.5),
        Row(p=u'p3', owner=u'u1', f1=0.6, f2=0.6, f3=0.9),
        Row(p=u'p4', owner=u'u1', f1=0.8, f2=0.1, f3=0.6),
        Row(p=u'p5', owner=u'u1', f1=0.0, f2=0.2, f3=0.2),
        Row(p=u'p1', owner=u'u2', f1=0.0, f2=0.4, f3=0.1),
        Row(p=u'p2', owner=u'u2', f1=0.3, f2=0.7, f3=0.4),
        Row(p=u'p3', owner=u'u2', f1=0.4, f2=0.6, f3=0.6),
        Row(p=u'p4', owner=u'u2', f1=0.6, f2=0.1, f3=0.7),
        Row(p=u'p5', owner=u'u2', f1=0.0, f2=0.0, f3=0.8),
    ]
    ).toDF()

owner_training = df_training.where(col('owner') == 'u1')
owner_new = df_new.where(col('owner') == 'u1')

label_indexer = StringIndexer(inputCol="status",
                             outputCol="indexedStatus")
assembler = VectorAssembler(inputCols=['f1', 'f2', 'f3'],
                            outputCol='features')
rf = RandomForestClassifier(labelCol="indexedStatus", featuresCol="features")

pipeline = Pipeline(stages=[label_indexer, assembler, rf])
model = pipeline.fit(owner_training)
predictions = model.transform(owner_new)
predictions.show()
    stages += [stringIndexer, encoder]

label_stringIdx = StringIndexer(inputCol='base_plan_id',
                                outputCol='label',
                                handleInvalid='skip')
stages += [label_stringIdx]
assemblerInputs = [c + "stringEnc" for c in string_cols] + numeric_cols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

import time

start_time = time.time()

pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(final_data)
df = pipelineModel.transform(final_data)
end_time = time.time()
print("total time taken for Pipeline loop in seconds: ", end_time - start_time)

selectedCols = ['label', 'features'] + final_data.columns
df = df.select(selectedCols)
#df.printSchema()

# ## Random Forest Classification

from pyspark.ml.classification import RandomForestClassifier
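
# The Random Forest step itself is cut off in this excerpt; a minimal sketch of
# what typically follows, assuming the 'label'/'features' DataFrame df built
# above (the split fractions and numTrees are illustrative assumptions).
train, test = df.randomSplit([0.8, 0.2], seed=42)
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=50)
rfModel = rf.fit(train)
rfModel.transform(test).select("label", "prediction", "probability").show(5)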

### MinMax Scaling
from pyspark.ml.feature import MinMaxScaler
vectorAssembler = VectorAssembler()\
  .setInputCols(["UnitPrice", "Quantity", "day_of_week_encoded"])\
  .setOutputCol("features")


# COMMAND ----------

from pyspark.ml import Pipeline

transformationPipeline = Pipeline()\
  .setStages([indexer, encoder, vectorAssembler])


# COMMAND ----------

fittedPipeline = transformationPipeline.fit(trainDataFrame)


# COMMAND ----------

transformedTraining = fittedPipeline.transform(trainDataFrame)


# COMMAND ----------

from pyspark.ml.clustering import KMeans
kmeans = KMeans()\
  .setK(20)\
  .setSeed(1)
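
# The example stops before the clustering itself; a minimal sketch, assuming the
# transformedTraining DataFrame produced by the fitted pipeline above.
kmModel = kmeans.fit(transformedTraining)
clustered = kmModel.transform(transformedTraining)
clustered.groupBy("prediction").count().show()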

Esempio n. 45
0
# Create 'features' vector: 'km', 'org_dummy', 'dow_dummy'
assembler = VectorAssembler(inputCols=['km', 'org_dummy', 'dow_dummy'],
                            outputCol='features')

# Split the data into training and testing sets
flights_train, flights_test = flites.randomSplit([0.8, 0.2], seed=23)

# Create a regression object and train on training data
regression = LinearRegression(labelCol="duration")

# Combine steps into a pipeline
pipeline = Pipeline(stages=[indexer, onehot, assembler, regression])

# run fit on training data
pipeline = pipeline.fit(flights_train)

# Create predictions for the testing data and take a look at the predictions
predictions = pipeline.transform(flights_test)
print(predictions.toPandas().sample(12))

# Calculate the RMSE
print("\nRMSE", RegressionEvaluator(labelCol="duration").evaluate(predictions))

# Print the coefficients and intercept for linear regression
print("\nCoefficients: %s" %
      str(pipeline.stages[REGRESSION_STAGE].coefficients))
print("Intercept: %s" % str(pipeline.stages[REGRESSION_STAGE].intercept))

# Summarize the model over the training set and print out some metrics
trainingSummary = pipeline.stages[REGRESSION_STAGE].summary
Esempio n. 46
0
tokenizer= Tokenizer(inputCol='text',outputCol='token_text')
stop_remove=StopWordsRemover(inputCol='token_text',outputCol='stop_token')
count_vec= CountVectorizer(inputCol='stop_token',outputCol='c_vec')
idf=IDF(inputCol='c_vec',outputCol='tf_idf')
ham_spam_to_numeric= StringIndexer(inputCol='class',outputCol='label')

from pyspark.ml.feature import VectorAssembler
clean_up= VectorAssembler(inputCols=['tf_idf','length'],outputCol='features')

from pyspark.ml.classification import NaiveBayes
nb= NaiveBayes()

from pyspark.ml import Pipeline
data_prep_pipe= Pipeline(stages=[ham_spam_to_numeric,tokenizer,
                                stop_remove,count_vec,idf,clean_up])
cleaner=data_prep_pipe.fit(data)
clean_data=cleaner.transform(data)
clean_data.columns
"""
['class',
 'text',
 'length',
 'label',
 'token_text',
 'stop_token',
 'c_vec',
 'tf_idf',
 'features']
"""

clean_data=clean_data.select('label','features')
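
# The excerpt ends after data preparation; a minimal sketch of the training and
# evaluation that typically follows (split fractions and metric are assumptions).
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

training, testing = clean_data.randomSplit([0.7, 0.3], seed=42)
spam_predictor = nb.fit(training)
test_results = spam_predictor.transform(testing)
acc_eval = MulticlassClassificationEvaluator(metricName="accuracy")
print("Accuracy at detecting spam: {}".format(acc_eval.evaluate(test_results)))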
Esempio n. 47
0
############# Model training
###### Words to vectors with Word2Vec

from pyspark.ml.feature import Word2Vec
from pyspark.ml import Pipeline

w2v = Word2Vec(vectorSize=100,
               minCount=0,
               inputCol="filtro_conectores",
               outputCol="vectores")
redesSociales_word2vec_modelo = w2v.fit(redesSociales_df)
redesSociales_df = redesSociales_word2vec_modelo.transform(redesSociales_df)
redesSociales_df.select("filtro_conectores",
                        "vectores").orderBy(rand()).show(5)

modelo_sentimiento = clasificador_rl_pipeline.fit(
    aprendizajemaquina_entrenamiento_df)
prediccion_sentimiento_redesSociales_df = modelo_sentimiento.transform(
    redesSociales_df)


def prediccionLiteral(column):
    if column == 1.0:
        return "Bueno"
    else:
        return "Malo"


prediccionLiteral_udf = udf(prediccionLiteral)
prediccion_sentimiento_redesSociales_df=prediccion_sentimiento_redesSociales_df\
 .withColumn('sentimiento',prediccionLiteral_udf(prediccion_sentimiento_redesSociales_df.prediccion))
Esempio n. 48
0
    'age', 'fnlwgt', 'capital-gain', 'educational-num', 'capital-loss',
    'hours-per-week'
]
assemblerInputs = [c + 'classVec' for c in CATE_FEATURES] + CONTI_FEATURES
assemblerInputs

# 4) assemble the steps
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol='features')
stages += [assembler]

# create a pipeline
# TODO: other examples split into train/test first and only then apply the transformations (that seems more correct; investigate further)
df_remove.show()

pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(df_remove)
model = pipelineModel.transform(df_remove)

model.take(1)
'''
Out[116]: [Row(age=25, age_square=625.0, 
workclass='Private', fnlwgt=226802, education='11th', educational-num=7, 
marital-status='Never-married', occupation='Machine-op-inspct', 
relationship='Own-child', race='Black', gender='Male', 
capital-gain=0, capital-loss=0, hours-per-week=40, native-country='United-States', 
label='<=50K', 
workclassIndex=0.0, 
workclassclassVec=SparseVector(8, {0: 1.0}), 
educationIndex=5.0, 
educationclassVec=SparseVector(15, {5: 1.0}), 
marital-statusIndex=1.0, 
Esempio n. 49
0
interactor.fit(df_train).transform(df_train).select("features").show()

from pyspark.ml.classification import LogisticRegression

classifier = LogisticRegression(maxIter=20,
                                regParam=0.000,
                                elasticNetParam=0.000)

stages = [interactor, classifier]

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=stages)

model = pipeline.fit(df_train)

predictions = model.transform(df_test)

predictions.cache()

predictions.show()

from pyspark.ml.evaluation import BinaryClassificationEvaluator

ev = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                   metricName="areaUnderROC")
print(ev.evaluate(predictions))

spark.stop()
# Create a DecisionTreeRegressor
dt = DecisionTreeRegressor(maxDepth = 8)

dt.setLabelCol("TOTAL_BENEFICIARY_AMT")\
  .setPredictionCol("Predicted_EXP")\
  .setFeaturesCol("features")\
  .setMaxBins(10000)


# Create a Pipeline
dtPipeline = Pipeline()

# Set the stages of the Pipeline
dtPipeline.setStages([vectorizer, dt])
model = dtPipeline.fit(train_data)
train_data_output=model.transform(train_data)

from pyspark.ml.evaluation import RegressionEvaluator

# Create an R2 evaluator using the label and predicted columns
regEval = RegressionEvaluator(predictionCol="Predicted_EXP", labelCol="TOTAL_BENEFICIARY_AMT", metricName="r2")

# Run the evaluator on the DataFrame
r2 = regEval.evaluate(train_data_output)

print("Root Mean Squared Error: %.2f" % r2)

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# We can reuse the RegressionEvaluator, regEval, to judge the model based on the best R2 score
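
# A minimal sketch of the grid search the comment above refers to; the grid
# values and fold count are assumptions, not from the original.
paramGrid = (ParamGridBuilder()
             .addGrid(dt.maxDepth, [4, 6, 8])
             .build())
crossval = CrossValidator(estimator=dtPipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=regEval,
                          numFolds=3)
cvModel = crossval.fit(train_data)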
Esempio n. 51
0
def multiclass(algorithms, dataset):
    # create label
    indexer = StringIndexer(inputCol='genre', outputCol='label') #https://spark.apache.org/docs/2.1.0/ml-features.html|https://stackoverflow.com/questions/36942233/apply-stringindexer-to-several-columns-in-a-pyspark-dataframe

    # set up pipeline
    pipeline = Pipeline(stages=[indexer])
    df = pipeline.fit(dataset).transform(dataset)

    # split training / test
    training, test = df.randomSplit([0.7, 0.3])

    # get class distribution
    class_dist = (
        df
        .groupBy(['genre', 'label'])
        .count()
        .withColumn('fraction', F.when(F.col('count') < 5000, 1).otherwise(5000 / F.col('count')))
        .orderBy('label')
    )

    # create dictionary of fraction of each class
    fractions = dict()
    for row in class_dist.collect():
        fractions[row.label] = row.fraction

    # down-sampling audio features using fractions
    training_bal = training.sampleBy('label', fractions, seed=1)

    # print head
    fmt = '{:>10}|{:>10}|{:>10}|{:>10}|{}'
    print(fmt.format('accuracy', 'precision', 'recall', 'f1', 'algorithm'))

    # iterate algorithms
    for name, normal, cv in algorithms:
        # train
        model_nor = normal.fit(training)
        model_bal = normal.fit(training_bal)
        model_ovr = OneVsRest(classifier=normal).fit(training)

        # predict
        predict_nor = model_nor.transform(test)
        predict_bal = model_bal.transform(test)
        predict_ovr = model_ovr.transform(test)

        metrics = [('normal', predict_nor), ('balance', predict_bal), ('one-vs-rest', predict_ovr)]

        # cv
        if cv is not None:
            model_cv = cv.fit(training)
            predict_cv = model_cv.transform(test)
            metrics.append(('cv', predict_cv))

        # metrics
        for mtype, d in metrics:
            eval_multi = MulticlassClassificationEvaluator()
            accuracy = eval_multi.evaluate(d, {eval_multi.metricName: 'accuracy'})
            precision = eval_multi.evaluate(d, {eval_multi.metricName: 'weightedPrecision'})
            recall = eval_multi.evaluate(d, {eval_multi.metricName: 'weightedRecall'})
            f1 = eval_multi.evaluate(d, {eval_multi.metricName: 'f1'})
            fmt = '{:>10.3f}|{:>10.3f}|{:>10.3f}|{:>10.3f}|{}'
            print(fmt.format(accuracy, precision, recall, f1, name + ' (' + mtype + ')'))
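
# Hedged usage example: `algorithms` is expected to be a list of
# (name, estimator, cross_validator_or_None) triples; the entries and parameter
# values below are illustrative, not from the original.
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

lr = LogisticRegression(maxIter=20, regParam=0.01)
lr_cv = CrossValidator(estimator=lr,
                       estimatorParamMaps=ParamGridBuilder().addGrid(lr.regParam, [0.1, 0.01]).build(),
                       evaluator=MulticlassClassificationEvaluator(),
                       numFolds=3)
algorithms = [('logistic regression', lr, lr_cv),
              ('random forest', RandomForestClassifier(numTrees=50), None)]
multiclass(algorithms, dataset)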
    "VectorC1", "Vector_banner_pc", "Vector_site_category",
    "Vector_app_category", "Vector_device_type", "Vector_device_conn_type",
    "VectorC15", "VectorC16", "VectorC18", "VectorC19", "VectorC21"
],
                                   outputCol="VectoredFeatures")

# Using pipeline
pipelineTmp = Pipeline(stages=[
    C1Indexer, BannerPcIndexer, SiteCategoryIndexer, AppCategoryIndexer,
    DeviceTypeIndexer, DeviceConnTypeIndexer, C15Indexer, C16Indexer,
    C18Indexer, C19Indexer, C21Indexer, C1Encoder, BannerPcEncoder,
    SiteCategoryEncoder, AppCategoryEncoder, DeviceTypeEncoder,
    DeviceConnTypeEncoder, C15Encoder, C16Encoder, C18Encoder, C19Encoder,
    C21Encoder, FeatureAssembler
])
modelTmp = pipelineTmp.fit(schemaClick)
tmp = modelTmp.transform(schemaClick).select("click", "VectoredFeatures")
tmp.registerTempTable("CLICK")

# Selecting click and VectoredFeatures from Table "CLICK" and creating new dataFrame as results
results = sqlContext.sql("SELECT click, VectoredFeatures from CLICK")
results.show()

# Creating label points for attributes click and VectoredFeatures
click_transformed = results.select(
    'click', 'VectoredFeatures').rdd.map(lambda row: LabeledPoint(
        float(row.click), Vectors.dense((row.VectoredFeatures).toArray())))
click_transformed.take(2)

#Divide the data into training and test sets
weights = [.8, .2]
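
# Assumed continuation: split the LabeledPoint RDD with the weights defined above.
trainData, testData = click_transformed.randomSplit(weights, seed=42)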
Esempio n. 53
0
class_indexer = StringIndexer(inputCol="C4", outputCol="label")

# Read in data for sensitivity analysis
test_data = sql_context.read.load('tests/resources/iris_test_data.csv',
                                  format='com.databricks.spark.csv',
                                  header='false',
                                  inferSchema='true')

# Train a DecisionTree model.
dt = DecisionTreeRegressor(featuresCol="features", labelCol="label")

# Chain indexer and tree in a Pipeline
pipeline = Pipeline(stages=[assembler, class_indexer, dt])

# Train model.  This also runs the indexer.
model = pipeline.fit(data)

# Get our data_info frame, courtesy of PSAML
cols_to_analyze = ['C0', 'C1', 'C2', 'C3']
data_info = psaml.make_data_info(sql_context, test_data, cols_to_analyze, 'C4')

# Make predictions.
predictions = psaml.do_continuous_input_analysis(sc, model, 5, 5, data_info)


# Select example rows to display.
# predictions.show()  # opt param: number of records to show

fig = plotly.tools.make_subplots(rows=len(cols_to_analyze), cols=1)
sql_context.registerDataFrameAsTable(predictions, "predictions")
Esempio n. 54
0
# define stage 1: tokenize the tweet text
stage_1 = RegexTokenizer(inputCol='tweet', outputCol='tokens', pattern='\\W')
# define stage 2: remove the stop words
stage_2 = StopWordsRemover(inputCol='tokens', outputCol='filtered_words')
# define stage 3: create a word vector of the size 100
stage_3 = Word2Vec(inputCol='filtered_words',
                   outputCol='vector',
                   vectorSize=100)
# define stage 4: Logistic Regression Model
model = LogisticRegression(featuresCol='vector', labelCol='positive')
# setup the pipeline
pipeline = Pipeline(stages=[stage_1, stage_2, stage_3, model])

# fit the pipeline model with the training data
pipelineFit = pipeline.fit(training_set)

modelSummary = pipelineFit.stages[-1].summary
modelSummary.accuracy


def get_prediction_json(key, rdd):
    print("********************")
    tweet = rdd.map(lambda kv: json.loads(kv[1])).map(
        lambda json_object: json_object["text"])
    tweetstr = tweet.collect()
    if not tweetstr:
        print("No Tweet")
        return

    print("********************")
Esempio n. 55
0
# convert features into vector
assembler = (
    VectorAssembler()
    .setInputCols([x for x in features.columns if x.startswith('method')])
    .setOutputCol('features')
)

# normalize vector
scaler = StandardScaler(inputCol='features', outputCol='scfeatures', withStd=True, withMean=False)

# set up pipeline
pipeline = Pipeline(stages=[assembler, scaler])
dataset = pipeline.fit(features).transform(features)

#model = Pipeline(stages=[assembler,scaler]).fit(features).transform(features)


# print table
dataset.select(['track_id', 'genre', 'features', 'scfeatures']).show(3, 30)

# +------------------+--------------+------------------------------+------------------------------+
# |          track_id|         genre|                      features|                    scfeatures|
# +------------------+--------------+------------------------------+------------------------------+
# |TRAAABD128F429CF47|      Pop_Rock|[0.1308,9.587,459.9,27280.0...|[2.022118802771498,2.624321...|
# |TRAAADT12903CCC339|Easy_Listening|[0.08392,7.541,423.7,36060....|[1.2973716355396339,2.06425...|
# |TRAAAEF128F4273421|      Pop_Rock|[0.1199,9.381,474.5,26990.0...|[1.8536089025405398,2.56793...|
# +------------------+--------------+------------------------------+------------------------------+
# only showing top 3 rows
Esempio n. 56
0
rdd = data.filter(lambda row: row != header)

r = rdd.mapPartitions(lambda x: csv.reader(x))
r = r.map(lambda x: (processTweet(x[3]), int(x[1])))

r = r.map(lambda x: Row(sentence=x[0], label=int(x[1])))
df = spark.createDataFrame(r).orderBy(rand()).limit(500000)

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="base_words")
hashingTF = HashingTF(numFeatures=10000,
                      inputCol="base_words",
                      outputCol="features")

lr = LogisticRegression(maxIter=10000, regParam=0.001, elasticNetParam=0.0001)

pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, lr])

splits = df.randomSplit([0.6, 0.4], 223)
trainSet = splits[0]
testSet = splits[1]

lrModel = pipeline.fit(trainSet)
lrResult = lrModel.transform(testSet)

testSet.show(truncate=False)
lrResult.show(truncate=False)

avg = lrResult.where('label == prediction').count() / lrResult.count()
print(avg)
Esempio n. 57
0
def main(dict):

    filename = dict['filename']
    savedmodelName = dict['modelname']

    def myFunc(input):
        lines = input.split("\n")
        for line in lines:
            parts = line.split(";")
            Category = parts[-1]
            Sentence = parts[1]
            url_pattern = re.compile(r'(https?://[\w./]+)*')
            rt_pattern = re.compile(r'RT @\w+: ')
            r_pattern = re.compile(r'@\w+ ')
            Sentence = r_pattern.sub(
                r'', rt_pattern.sub(r'',
                                    url_pattern.sub(r'', Sentence))).replace(
                                        '\n', ' ').strip()
        return (Category, Sentence)

    file = sc.textFile("4CVTweets/" + filename)
    lines = file.map(myFunc)
    sentenceDataFrame = spark.createDataFrame(lines, ["label", "sentence"])
    (trainingData, testData) = sentenceDataFrame.randomSplit([0.7, 0.3])
    df = spark.createDataFrame([(0, "NO"), (1, "crash"), (2, "fire"),
                                (3, "shooting")], ["id", "label"])

    # start building the pineline
    # No: 0,Crash:1,Fire:2,Shooting:3

    indexer = StringIndexer(inputCol="label", outputCol="categoryIndex")
    indexer.fit(df)

    tokenizer = RegexTokenizer(pattern="\\w+",
                               inputCol="sentence",
                               outputCol="words",
                               gaps=False)
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    hashingTF = HashingTF(inputCol="filtered",
                          outputCol="rawFeatures",
                          numFeatures=10000)
    idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)

    # # Compute the Inverse Document Frequency (IDF) given a collection of documents.

    rf = RandomForestClassifier(labelCol="categoryIndex",
                                featuresCol="features",
                                numTrees=100,
                                maxDepth=10)

    # Using randomForest
    # mlr = LogisticRegression(maxIter=100, regParam=0.3, elasticNetParam=0.8, family="multinomial",featuresCol="features",labelCol="categoryIndex")
    # Naive Bayers
    nb = NaiveBayes(labelCol="categoryIndex",
                    featuresCol="features",
                    smoothing=1)

    # converter = IndexToString(inputCol="prediction", outputCol="originalCategory")
    pipeline = Pipeline(
        stages=[indexer, tokenizer, remover, hashingTF, idf, nb])
    model = pipeline.fit(trainingData)

    # Start to count accuracy to evaluate the model using just the offline model

    predictionsForTraining = model.transform(trainingData)

    predictionsForTraining.show(100, False)

    joindf = spark.createDataFrame([(0.0, "NO"), (1.0, "crash"), (2.0, "fire"),
                                    (3.0, "shooting")],
                                   ["prediction", "Predictlabel"])
    innerjoin = predictionsForTraining.join(
        joindf, joindf.prediction == predictionsForTraining.prediction).drop(
            joindf.prediction)

    # innerjoin.select("label","categoryIndex","prediction","Predictlabel").show(1000,False)
    innerjoin.select("label", "Predictlabel").show(1000, False)

    evaluator1 = MulticlassClassificationEvaluator(labelCol="categoryIndex",
                                                   predictionCol="prediction",
                                                   metricName="accuracy")
    accuracy = evaluator1.evaluate(predictionsForTraining)
    print("Test Accuracy = %g " % (accuracy))
    print("Train Error = %g " % (1.0 - accuracy))

    predictions = model.transform(testData)
    evaluator2 = MulticlassClassificationEvaluator(labelCol="categoryIndex",
                                                   predictionCol="prediction",
                                                   metricName="accuracy")

    accuracy = evaluator2.evaluate(predictions)
    print("Test Accuracy = %g " % (accuracy))
    print("Test Error = %g " % (1.0 - accuracy))

    savePath = "tmp/pipeline/" + savedmodelName
    model.write().overwrite().save(savePath)
    print("model for Location", savedmodelName, "save successfully.")
# Q2

#(a)
from pyspark.ml.recommendation import ALS
from pyspark.ml.feature import StringIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline

user_stringIdx = StringIndexer(inputCol="USER_ID", outputCol="USER_INDEX")
song_stringIdx = StringIndexer(inputCol="SONG_ID", outputCol="SONG_INDEX")

pipeline1 = Pipeline(stages=[user_stringIdx, song_stringIdx])

# Fit the pipeline to training
pipelineFit = pipeline1.fit(user_song)
dataset1 = pipelineFit.transform(user_song)

trainingData1, testData1 = split_Data(dataset1, 10)

als = ALS(maxIter=5,
          regParam=0.01,
          implicitPrefs=True,
          userCol="USER_INDEX",
          itemCol="SONG_INDEX",
          ratingCol="COUNT")
ALSModel = als.fit(trainingData1)

# Generate top 10 recommendations for 10 SELECTED users

users = testData1.select(["USER_INDEX"]).distinct().limit(5)
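
# Assumed continuation: top-10 recommendations for the selected users
# (ALSModel.recommendForUserSubset is available from Spark 2.3 onwards).
userRecs = ALSModel.recommendForUserSubset(users, 10)
userRecs.show(truncate=False)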
REDDIT_AUG = "swift://reddit3.sjc01/RC_2015-08"
REDDIT_SEPT = "swift://reddit3.sjc01/RC_2015-09"

if __name__ == "__main__":
    
    # Configure Spark
    sc = SparkContext(appName=APP_NAME)
    sqlContext = SQLContext(sc)

    # Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # prepare Reddit json files as sql Dataframes for pyspark.ml
    aug_comments =  sqlContext.read.json(REDDIT_AUG)
    sep_comments = sqlContext.read.json(REDDIT_SEPT)

    # transform data for log_reg model by dividing karma score by 1000
    #  error: Classification labels should be in {0 to 8114} Found 2576839 invalid labels.
    training = aug_comments.select('id', 'body', (aug_comments.score / 1000.0).cast("double").alias('label'))
    test = sep_comments.select('id', 'body')
    test_actual = sep_comments.select('id', (sep_comments.score / 1000.0).alias('actual'))

    model = pipeline.fit(training)
    prediction = model.transform(test)
    selected = prediction.select("id", "body", "prediction").join(test_actual, prediction.id == test_actual.id)
    selected.write.format('json').save("hdfs://master/usr/hadoop/karma_predictions")
    sc.stop()
multiclass_dataset = multiclass_dataset.drop("genre_id")

df = multiclass_dataset
cols = df.columns

stages = []
label_stringIdx = StringIndexer(inputCol='int_genre_id', outputCol='label')
stages += [label_stringIdx]
numericCols = multiclass_dataset.schema.names[0:-1]

assembler = VectorAssembler(inputCols=numericCols, outputCol="features")
stages += [assembler]

from pyspark.ml import Pipeline
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(df)
df = pipelineModel.transform(df)
selectedCols = ['label', 'features'] + cols

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = df.randomSplit([0.7, 0.3])

# Train a Random Forest model.
rf = RandomForestClassifier(labelCol="label",
                            featuresCol="features",
                            numTrees=100,
                            maxDepth=10,
                            impurity="entropy")
rf = OneVsRest(classifier=rf)

# Chain RF in a Pipeline
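
# The excerpt is cut off here; a minimal sketch of how the chained pipeline
# typically continues (the accuracy metric is an assumption).
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

pipeline = Pipeline(stages=[rf])
ovrModel = pipeline.fit(trainingData)
predictions = ovrModel.transform(testData)
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
print("Test accuracy = %g" % evaluator.evaluate(predictions))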