Example #1
    def test_nested_pipeline_persistence(self):
        """
        Pipeline[HashingTF, Pipeline[PCA]]
        """
        sqlContext = SQLContext(self.sc)
        temp_path = tempfile.mkdtemp()

        try:
            df = sqlContext.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
            tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
            pca = PCA(k=2, inputCol="features", outputCol="pca_features")
            p0 = Pipeline(stages=[pca])
            pl = Pipeline(stages=[tf, p0])
            model = pl.fit(df)

            pipeline_path = temp_path + "/pipeline"
            pl.save(pipeline_path)
            loaded_pipeline = Pipeline.load(pipeline_path)
            self._compare_pipelines(pl, loaded_pipeline)

            model_path = temp_path + "/pipeline-model"
            model.save(model_path)
            loaded_model = PipelineModel.load(model_path)
            self._compare_pipelines(model, loaded_model)
        finally:
            try:
                rmtree(temp_path)
            except OSError:
                pass
    def train_lg(training_data, collection):
        # Configure an ML pipeline, which consists of the following stages: hashingTF, idf, and lr.
        hashingTF = HashingTF(inputCol="filtered", outputCol="TF_features")
        idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
        pipeline1 = Pipeline(stages=[hashingTF, idf])

        # Fit the pipeline1 to training documents.
        model1 = pipeline1.fit(training_data)

        lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
        pipeline2 = Pipeline(stages=[model1, lr])

        paramGrid = ParamGridBuilder() \
            .addGrid(hashingTF.numFeatures, [10, 100, 1000, 10000]) \
            .addGrid(lr.regParam, [0.1, 0.01]) \
            .build()

        crossval = CrossValidator(estimator=pipeline2,
                                  estimatorParamMaps=paramGrid,
                                  evaluator=BinaryClassificationEvaluator(),
                                  numFolds=5)

        # Run cross-validation, and choose the best set of parameters.
        cvModel = crossval.fit(training_data)

    #     model_path = os.path.join(models_dir , time.strftime("%Y%m%d-%H%M%S") + '_'
    #                             + collection["Id"] + '_'
    #                             + collection["name"])
    #     cvModel.save(sc, model_path)
        return cvModel
Example #3
def main(sc, spark):
    # Load the Corpus
    corpus = load_corpus(sc, spark)

    # Create the vector/cluster pipeline
    pipeline = Pipeline(stages=[
        Tokenizer(inputCol="text", outputCol="tokens"),
        Word2Vec(vectorSize=7, minCount=0, inputCol="tokens", outputCol="vecs"),
        BisectingKMeans(k=10, featuresCol="vecs", maxIter=10),
    ])

    # Fit the model
    model = pipeline.fit(corpus)
    corpus = model.transform(corpus)

    # Evaluate clustering.
    bkm = model.stages[-1]
    cost = bkm.computeCost(corpus)
    sizes = bkm.summary.clusterSizes

    # TODO: compute cost of each cluster individually

    # Get the text representation of each cluster.
    wvec = model.stages[-2]
    table = [["Cluster", "Size", "Terms"]]
    for ci, c in enumerate(bkm.clusterCenters()):
        ct = wvec.findSynonyms(c, 7)
        size = sizes[ci]
        terms = " ".join([row.word for row in ct.take(7)])
        table.append([ci, size, terms])

    # Print Results
    print(tabulate(table))
    print("Sum of square distance to center: {:0.3f}".format(cost))
Example #4
def fit_kmeans(spark, products_df):
    step = 0

    step += 1
    tokenizer = Tokenizer(inputCol="title", outputCol=str(step) + "_tokenizer")

    step += 1
    stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol=str(step) + "_stopwords")

    step += 1
    tf = HashingTF(inputCol=stopwords.getOutputCol(), outputCol=str(step) + "_tf", numFeatures=16)

    step += 1
    idf = IDF(inputCol=tf.getOutputCol(), outputCol=str(step) + "_idf")

    step += 1
    normalizer = Normalizer(inputCol=idf.getOutputCol(), outputCol=str(step) + "_normalizer")

    step += 1
    kmeans = KMeans(featuresCol=normalizer.getOutputCol(), predictionCol=str(step) + "_kmeans", k=2, seed=20)

    kmeans_pipeline = Pipeline(stages=[tokenizer, stopwords, tf, idf, normalizer, kmeans])

    model = kmeans_pipeline.fit(products_df)
    words_prediction = model.transform(products_df)
    model.save("./kmeans")  # the whole machine learning instance is saved in a folder
    return model, words_prediction
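
A minimal usage sketch, assuming an active SparkSession named spark; the prediction column produced by the pipeline is named "6_kmeans" (the step counter reaches 6), and note that fit_kmeans also writes the fitted PipelineModel to ./kmeans.

products_df = spark.createDataFrame(
    [("red running shoes",), ("blue running shoes",),
     ("stainless steel kettle",), ("electric kettle",)],
    ["title"])
model, words_prediction = fit_kmeans(spark, products_df)
words_prediction.select("title", "6_kmeans").show()
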
 def testLogisticMLPipeline1(self):
     training = sqlCtx.createDataFrame([
         ("a b c d e spark", 1.0),
         ("b d", 2.0),
         ("spark f g h", 1.0),
         ("hadoop mapreduce", 2.0),
         ("b spark who", 1.0),
         ("g d a y", 2.0),
         ("spark fly", 1.0),
         ("was mapreduce", 2.0),
         ("e spark program", 1.0),
         ("a e c l", 2.0),
         ("spark compile", 1.0),
         ("hadoop software", 2.0)
         ], ["text", "label"])
     tokenizer = Tokenizer(inputCol="text", outputCol="words")
     hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20)
     lr = LogisticRegression(sqlCtx)
     pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
     model = pipeline.fit(training)
     test = sqlCtx.createDataFrame([
         ("spark i j k", 1.0),
         ("l m n", 2.0),
         ("mapreduce spark", 1.0),
         ("apache hadoop", 2.0)], ["text", "label"])
     result = model.transform(test)
     predictionAndLabels = result.select("prediction", "label")
     evaluator = MulticlassClassificationEvaluator()
     score = evaluator.evaluate(predictionAndLabels)
     self.assertTrue(score == 1.0)
    def test_nnclassifier_in_pipeline(self):

        if self.sc.version.startswith("1"):
            from pyspark.mllib.linalg import Vectors

            df = self.sqlContext.createDataFrame(
                [(Vectors.dense([2.0, 1.0]), 1.0),
                 (Vectors.dense([1.0, 2.0]), 2.0),
                 (Vectors.dense([2.0, 1.0]), 1.0),
                 (Vectors.dense([1.0, 2.0]), 2.0),
                 ], ["features", "label"])

            scaler = MinMaxScaler().setInputCol("features").setOutputCol("scaled")
            model = Sequential().add(Linear(2, 2))
            criterion = ClassNLLCriterion()
            classifier = NNClassifier(model, criterion, MLlibVectorToTensor([2]))\
                .setBatchSize(4) \
                .setLearningRate(0.01).setMaxEpoch(1).setFeaturesCol("scaled")

            pipeline = Pipeline(stages=[scaler, classifier])

            pipelineModel = pipeline.fit(df)

            res = pipelineModel.transform(df)
            assert type(res).__name__ == 'DataFrame'
Example #7
def model(classifiers, training, testing, week):

    results = ""
    timing = []

    for classifier in classifiers:

        timeStart = time.time()

        clf = get_classifier(classifier)

        labelIndexer = StringIndexer(inputCol="label", outputCol="indexed")
        featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures")

        pipeline = Pipeline(stages=[labelIndexer, featureIndexer, clf])
        model = pipeline.fit(training)

        prediction = model.transform(testing)

        # BinaryClassificationMetrics expects (score, label) pairs
        metrics = BinaryClassificationMetrics(prediction.select("prediction", "label").rdd)

        results = results + "new," + classifier + "," + week + "," + str(metrics.areaUnderROC) + "," +str(metrics.areaUnderPR) + "\n"

        timing.append(time.time()-timeStart)

    return results, timing
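
The get_classifier() helper is assumed by this example; a hypothetical sketch follows (the name keys are illustrative), configured for the "indexed" label and "indexedFeatures" columns produced by the indexers above.

from pyspark.ml.classification import (DecisionTreeClassifier,
                                       GBTClassifier,
                                       RandomForestClassifier)

def get_classifier(name):
    # Hypothetical mapping from a classifier name to a configured estimator.
    classifiers = {
        "dt": DecisionTreeClassifier(labelCol="indexed", featuresCol="indexedFeatures"),
        "rf": RandomForestClassifier(labelCol="indexed", featuresCol="indexedFeatures"),
        "gbt": GBTClassifier(labelCol="indexed", featuresCol="indexedFeatures"),
    }
    return classifiers[name]
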
Example #8
def main(sc, spark):
    # Load and vectorize the corpus
    corpus = load_corpus(sc, spark)
    vector = make_vectorizer().fit(corpus)

    # Index the labels of the classification
    labelIndex = StringIndexer(inputCol="label", outputCol="indexedLabel")
    labelIndex = labelIndex.fit(corpus)

    # Split the data into training and test sets
    training, test = corpus.randomSplit([0.8, 0.2])

    # Create the classifier
    clf = LogisticRegression(
        maxIter=10, regParam=0.3, elasticNetParam=0.8,
        family="multinomial", labelCol="indexedLabel", featuresCol="tfidf")

    # Create the model
    model = Pipeline(stages=[
        vector, labelIndex, clf
    ]).fit(training)

    # Make predictions
    predictions = model.transform(test)
    predictions.select("prediction", "indexedLabel", "tfidf").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    lrModel = model.stages[2]
    print(lrModel)  # summary only
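
make_vectorizer() is another helper that is not shown; a hypothetical sketch is given below, inferred from the classifier reading featuresCol="tfidf" and the corpus exposing "text" and "label" columns.

from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

def make_vectorizer():
    # Tokenize the "text" column and produce TF-IDF vectors in a "tfidf" column.
    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    tf = HashingTF(inputCol="tokens", outputCol="tf")
    idf = IDF(inputCol="tf", outputCol="tfidf")
    return Pipeline(stages=[tokenizer, tf, idf])
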
def main(input_file):
    # Load and parse the data file, converting it to a DataFrame.
    data = MLUtils.loadLabeledPoints(sc, input_file).toDF()

    # Automatically identify categorical features, and index them.
    # Set maxCategories so features with > 4 distinct values are treated as continuous.
    featureIndexer =\
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=10).fit(data)

    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    rf = RandomForestRegressor(featuresCol="indexedFeatures")

    # Chain indexer and forest in a Pipeline
    pipeline = Pipeline(stages=[featureIndexer, rf])

    # Train model.  This also runs the indexer.
    model = pipeline.fit(trainingData)

    # Make predictions.
    predictions = model.transform(testData)

    # Select example rows to display.
    predictions.select("prediction", "label", "features").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(
        labelCol="label", predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = {}".format(rmse))

    rfModel = model.stages[1]
    print(rfModel)  # summary only
Example #10
def run(start1, end1, start2, end2, df, sc, sql_context, is_pred):
    lp_data= get_labeled_points(start1, end2, df, sc, sql_context)
    print(lp_data.count())

    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(lp_data)
    td = labelIndexer.transform(lp_data)
    label2index = {}
    for each in sorted(set([(i[0], i[1]) for i in td.select(td.label, td.indexedLabel).distinct().collect()]),
                       key=lambda x: x[0]):
        label2index[int(each[0])] = int(each[1])
    print(label2index)

    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(lp_data)

    rf = get_model()

    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf])

    lp_train = lp_data.filter(lp_data.date3<end1).filter(lp_data.is_labeled == 1)
    model = pipeline.fit(lp_train)
    lp_check = lp_data.filter(lp_data.date2>start2)
    predictions = model.transform(lp_check)
    predictions = val(predictions, label2index, sql_context)

    if is_pred:
        predictions = predictions.filter(predictions.is_labeled ==0).filter(predictions.date2 == get_cur()).sort(predictions.prob.desc())
        dfToTableWithPar(sql_context, predictions, "predictions", get_cur())
        for each in predictions.take(10):
            print(each)
def RunRandomForest(tf, ctx):
	sqlContext = SQLContext(ctx)
	rdd = tf.map(parseForRandomForest)
	# The schema is encoded in a string.
	schema = ['genre', 'track_id', 'features']
	# Apply the schema to the RDD.
	songDF = sqlContext.createDataFrame(rdd, schema)

	# Register the DataFrame as a table.
	songDF.registerTempTable("genclass")
	labelIndexer = StringIndexer().setInputCol("genre").setOutputCol("indexedLabel").fit(songDF)

	trainingData, testData = songDF.randomSplit([0.8, 0.2])

	labelConverter = IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)

	rfc = RandomForestClassifier().setMaxDepth(10).setNumTrees(2).setLabelCol("indexedLabel").setFeaturesCol("features")
	#rfc = SVMModel([.5, 10, 20], 5)
	#rfc = LogisticRegression(maxIter=10, regParam=0.01).setLabelCol("indexedLabel").setFeaturesCol("features")

	pipeline = Pipeline(stages=[labelIndexer, rfc, labelConverter])
	model = pipeline.fit(trainingData)

	predictions = model.transform(testData)
	predictions.show()

	evaluator = MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("precision")
	accuracy = evaluator.evaluate(predictions)
	print('Accuracy of RandomForest = ', accuracy * 100)
	print("Test Error = ", (1.0 - accuracy) * 100)
    def test_cv_lasso_with_mllib_featurization(self):
        data = [('hi there', 0.0),
                ('what is up', 1.0),
                ('huh', 1.0),
                ('now is the time', 5.0),
                ('for what', 0.0),
                ('the spark was there', 5.0),
                ('and so', 3.0),
                ('were many socks', 0.0),
                ('really', 1.0),
                ('too cool', 2.0)]
        data = self.sql.createDataFrame(data, ["review", "rating"])

        # Feature extraction using MLlib
        tokenizer = Tokenizer(inputCol="review", outputCol="words")
        hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20000)
        pipeline = Pipeline(stages=[tokenizer, hashingTF])
        data = pipeline.fit(data).transform(data)

        df = self.converter.toPandas(data.select(data.features.alias("review"), "rating"))

        pipeline = SKL_Pipeline([
            ('lasso', SKL_Lasso(max_iter=1))
        ])
        parameters = {
            'lasso__alpha': (0.001, 0.005, 0.01)
        }

        grid_search = GridSearchCV(self.sc, pipeline, parameters)
        skl_gs = grid_search.fit(df.review.values, df.rating.values)
        assert len(skl_gs.cv_results_['params']) == len(parameters['lasso__alpha'])
def textPredict(request):
    """6.文本聚类,热度预测"""
    label = request.POST['label']
    title = request.POST['title']

    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    """处理数据集,生成特征向量"""
    dfTitles = sqlContext.read.parquet('data/roll_news_sina_com_cn.parquet')
    print(dfTitles.dtypes)
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(dfTitles)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.show()
    for features_label in rescaledData.select("features", "rawFeatures").take(3):
        print(features_label)
    """决策树模型培训"""
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
    featureIndexer =\
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(rescaledData)
    (trainingData, testData) = rescaledData.randomSplit([0.7, 0.3])
    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
    model = pipeline.fit(trainingData)
    """模型测试"""
    predictions = model.transform(testData)
    predictions.show()
    predictions.select("prediction", "indexedLabel", "features").show(5)
    """用户数据测试,单个新闻测试"""
    sentenceData = sqlContext.createDataFrame([
        (label,title),
    ],['label',"title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    print("==================================================")
    myprediction.show()
    resultList = convertDfToList(myprediction)

    """模型评估"""
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g " % (1.0 - accuracy))

    treeModel = model.stages[2]
    print(treeModel)

    sc.stop()
    return render(request,{'resultList':resultList})
def sparking_your_interest():
	df = SQLContext.read.json('speeches_dataset.json')
	df_fillna=df.fillna("")
	print(df_fillna.count())
	print(df_fillna.printSchema())

	df_utf=call_utf_encoder(df)
	df_cleaned=call_para_cleanup(df_utf)
	print(df_cleaned)
	df_with_bigrams = call_ngrams(df_cleaned, 2)
	df_with_trigrams = call_ngrams(df_with_bigrams, 3)
	df_with_4grams = call_ngrams(df_with_trigrams, 4)
	df_with_5grams = call_ngrams(df_with_4grams, 5)
	df_with_6grams = call_ngrams(df_with_5grams, 6)
	df_with_vocab_score = call_speech_vocab(df_with_6grams)

	df_with_2grams_idf_vectors = tf_feature_vectorizer(df_with_vocab_score,100,'2grams')
	df_with_3grams_idf_vectors = tf_feature_vectorizer(df_with_2grams_idf_vectors,100,'3grams')
	df_with_4grams_idf_vectors = tf_feature_vectorizer(df_with_3grams_idf_vectors,100,'4grams')
	assembler = VectorAssembler(
	    inputCols=["2gramsfeatures", "2gramsfeatures", "2gramsfeatures", "vocab_score"],
	    outputCol="features")
	assembler_output = assembler.transform(df_with_4grams_idf_vectors)
	output = assembler_output.selectExpr('speaker','speech_id','para_cleaned_text','features')
	print(output.show())
	print(output.count())

	output_tordd = output.rdd
	train_rdd,test_rdd = output_tordd.randomSplit([0.8, 0.2], 123)
	train_df = train_rdd.toDF()
	test_df = test_rdd.toDF()
	print(train_df)
	print(test_df)

	print('Train DF - Count: ')
	print(train_df.count())
	print('Test DF - Count: ')
	print(test_df.count())

	print("Initializing RF Model")
	labelIndexer = StringIndexer(inputCol="speaker", outputCol="indexedLabel").fit(train_df)       
	rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features",numTrees=1000, featureSubsetStrategy="auto", impurity='gini', maxDepth=4, maxBins=32)
	pipeline = Pipeline(stages=[labelIndexer,rf])
	model = pipeline.fit(train_df)
	print("Completed RF Model")

	predictions = model.transform(test_df)
	evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
	accuracy = evaluator.evaluate(predictions)
	print("Test Error = %g" % (1.0 - accuracy))
	rfModel = model.stages[1]
	print(rfModel)  # summary only
	print("Predictions: ")
	print(predictions.show())
Example #15
def model(classifier, ftrain, fvalid, fprediction):

    startTime = time.time()

    ctx = SparkContext(appName="model_on_Spark")
    sqlContext = SQLContext(ctx)
    logger = SparkLogger(ctx)
    logger.set_level('ERROR')

    # load and prepare training and validation data
    rawTrain, train = prepData(sqlContext, ctx, ftrain)
    rawValid, valid = prepData(sqlContext, ctx, fvalid)

    # an index column is needed to join the predictions back to the raw data later
    valid = indexData(valid)
    rawValid = indexData(rawValid)

    classifiers = {
        "RandomForestClassifier" : RFC
    }

    clf = classifiers[classifier]()

    labelIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures")

    # train and predict
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, clf])
    model = pipeline.fit(train)

    predictions = model.transform(valid)

    # write to file:

    subsetPrediction = predictions.select("prediction", "index")
    subsetValidData = rawValid.select("dataset", "index")

    output = (subsetValidData
               .join(subsetPrediction, subsetPrediction.index == subsetValidData.index)
                    .drop("index")
                    .drop("index"))

    lines = output.map(toCSVLine)
    lines.saveAsTextFile('output')

    evaluator = MulticlassClassificationEvaluator(
       labelCol="label", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print "Test Error = %g" % (1.0 - accuracy)

    executionTime = time.time() - startTime
    row=classifier+','+str(executionTime)
    ctx.parallelize([row]).saveAsTextFile("timing")
def event_pipeline(dataset):
    """
    """
    EventCodeI = StringIndexer(inputCol="EventCode", outputCol="EventCodeI")
    EventBaseCodeI = StringIndexer(inputCol="EventBaseCode", outputCol="EventBaseCodeI")
    EventRootCodeI = StringIndexer(inputCol="EventRootCode", outputCol="EventRootCodeI")
    assembler = VectorAssembler(inputCols=["IsRootEvent", "EventCodeI", "EventBaseCodeI","EventRootCodeI", "QuadClass","GoldsteinScale","NumMentions","NumSources","NumArticles","AvgTone"], outputCol="features")
    featureIndexer =\
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=310)
    pipeline = Pipeline(stages=[EventCodeI, EventBaseCodeI, EventRootCodeI,assembler,featureIndexer])
    model = pipeline.fit(dataset)
    output = model.transform(dataset)

    data = output.map(lambda row: LabeledPoint(row[0], row[-1])).cache()
    print "Data:"
    print data.take(1)
    return data
    def group(self):
        reTokenizer = RegexTokenizer(inputCol=self.query_colname, outputCol="words", minTokenLength=2) #, pattern='\W'
        hashingTF = HashingTF(numFeatures=self.num_features, inputCol="words", outputCol="tf")


        if self.idf == True:
            idf = IDF(minDocFreq=self.min_doc_freq, inputCol="tf", outputCol="idf")
            kmeans = KMeans(featuresCol="idf", predictionCol="cluster_id", k=self.n)
            pipeline = Pipeline(stages=[reTokenizer, hashingTF, idf, kmeans])

        else:
            kmeans = KMeans(featuresCol="tf", predictionCol="cluster_id", k=self.n)
            pipeline = Pipeline(stages=[reTokenizer, hashingTF, kmeans])

        model = pipeline.fit(self.df)
        prediction = model.transform(self.df)
        return prediction
Example #18
    def getPipeline(self, df):
        # notify pipeline 
        self.success('Initializing ML Pipeline ...')

        # initialize our tokenizer, we're going to tokenize features
        tokenizer = Tokenizer(inputCol='tag_features', outputCol='words')
        # convert the tokenized data into feature vectors
        hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol='features')
        # initialize logistic regression algorithm
        lr        = LogisticRegression(maxIter=10, regParam=0.01)
        # create / initialize the ml pipeline
        pipeline  = Pipeline(stages=[tokenizer, hashingTF, lr])

        # fit the pipeline on our training dataframe
        model = pipeline.fit(df)

        return model
def event_pipeline(dataset):
    EventCodeI = StringIndexer(inputCol="EventCode", outputCol="EventCodeI")
    EventCodeV = OneHotEncoder(dropLast=True, inputCol="EventCodeI", outputCol="EventCodeV")

    EventRootCodeI = StringIndexer(inputCol="EventRootCode", outputCol="EventRootCodeI")
    EventRootCodeV = OneHotEncoder(dropLast=True, inputCol="EventRootCodeI", outputCol="EventRootCodeV")

    EventBaseCodeI = StringIndexer(inputCol="EventBaseCode", outputCol="EventBaseCodeI")
    EventBaseCodeV = OneHotEncoder(dropLast=True, inputCol="EventBaseCodeI", outputCol="EventBaseCodeV")

    assembler = VectorAssembler(inputCols=["IsRootEvent", "EventCodeV", "EventBaseCodeV","EventRootCodeV", "QuadClass","GoldsteinScale","NumMentions","NumSources","NumArticles","AvgTone"], outputCol="features")

    pipeline = Pipeline(stages=[EventCodeI, EventCodeV, EventRootCodeI, EventRootCodeV,EventBaseCodeI,EventBaseCodeV,assembler])

    model = pipeline.fit(dataset)
    output = model.transform(dataset)
    data = output.map(lambda row: LabeledPoint(row[0], row[-1])).toDF().cache()
    return data
Example #20
    def test_pipeline_persistence(self):
        sqlContext = SQLContext(self.sc)
        temp_path = tempfile.mkdtemp()

        try:
            df = sqlContext.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
            tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
            pca = PCA(k=2, inputCol="features", outputCol="pca_features")
            pl = Pipeline(stages=[tf, pca])
            model = pl.fit(df)
            pipeline_path = temp_path + "/pipeline"
            pl.save(pipeline_path)
            loaded_pipeline = Pipeline.load(pipeline_path)
            self.assertEqual(loaded_pipeline.uid, pl.uid)
            self.assertEqual(len(loaded_pipeline.getStages()), 2)

            [loaded_tf, loaded_pca] = loaded_pipeline.getStages()
            self.assertIsInstance(loaded_tf, HashingTF)
            self.assertEqual(loaded_tf.uid, tf.uid)
            param = loaded_tf.getParam("numFeatures")
            self.assertEqual(loaded_tf.getOrDefault(param), tf.getOrDefault(param))

            self.assertIsInstance(loaded_pca, PCA)
            self.assertEqual(loaded_pca.uid, pca.uid)
            self.assertEqual(loaded_pca.getK(), pca.getK())

            model_path = temp_path + "/pipeline-model"
            model.save(model_path)
            loaded_model = PipelineModel.load(model_path)
            [model_tf, model_pca] = model.stages
            [loaded_model_tf, loaded_model_pca] = loaded_model.stages
            self.assertEqual(model_tf.uid, loaded_model_tf.uid)
            self.assertEqual(model_tf.getOrDefault(param), loaded_model_tf.getOrDefault(param))

            self.assertEqual(model_pca.uid, loaded_model_pca.uid)
            self.assertEqual(model_pca.pc, loaded_model_pca.pc)
            self.assertEqual(model_pca.explainedVariance, loaded_model_pca.explainedVariance)
        finally:
            try:
                rmtree(temp_path)
            except OSError:
                pass
def build_decision_tree(sqlContext, features, interested):
	print('-----------------------------------------')
	data = sqlContext.createDataFrame(
			[Row(label=interested[i], features=Vectors.dense(features[i])) for i in range(len(features))])
	data.printSchema()
	data.show(5)
	print('created data frame')

	# Index the label column & adding metadata.
	labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
	print('created label indexer')

	# Mark the features with < 4 distinct values as categorical
	featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

	# Split the data into training and test sets
	(trainingData, testData) = data.randomSplit([0.8, 0.2])

	# Train a classifier (RandomForest here; DecisionTree and GBT alternatives are commented out below)
	dt = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
#	dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
#	dt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=10)

	# Chain the indexers together with the classifier
	pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

	# Train the model
	model = pipeline.fit(trainingData)

	# Make predictions
	predictions = model.transform(testData)

	predictions.select("prediction", "indexedLabel", "features").show(5)

	# Select (prediction, true label) & compute test error
	evaluator = MulticlassClassificationEvaluator(
			labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
	precision = evaluator.evaluate(predictions)

	treeModel = model.stages[2]
	return (1 - precision, model)
    def test_featurizer_in_pipeline(self):
        """
        Tests that featurizer fits into an MLlib Pipeline.
        Does not test how good the featurization is for generalization.
        """
        featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                         modelName=self.name)
        lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")
        pipeline = Pipeline(stages=[featurizer, lr])

        # add arbitrary labels to run logistic regression
        # TODO: it's weird that the test fails on some combinations of labels. check why.
        label_udf = udf(lambda x: abs(hash(x)) % 2, IntegerType())
        train_df = self.imageDF.withColumn("label", label_udf(self.imageDF["filePath"]))

        lrModel = pipeline.fit(train_df)
        # see if we at least get the training examples right.
        # with 5 examples and e.g. 131k features (for InceptionV3), it ought to.
        pred_df_collected = lrModel.transform(train_df).collect()
        for row in pred_df_collected:
            self.assertEqual(int(row.prediction), row.label)
Example #23
 def test_pipeline(self):
     dataset = MockDataset()
     estimator0 = MockEstimator()
     transformer1 = MockTransformer()
     estimator2 = MockEstimator()
     transformer3 = MockTransformer()
     pipeline = Pipeline(stages=[estimator0, transformer1, estimator2, transformer3])
     pipeline_model = pipeline.fit(dataset, {estimator0.fake: 0, transformer1.fake: 1})
     model0, transformer1, model2, transformer3 = pipeline_model.stages
     self.assertEqual(0, model0.dataset_index)
     self.assertEqual(0, model0.getFake())
     self.assertEqual(1, transformer1.dataset_index)
     self.assertEqual(1, transformer1.getFake())
     self.assertEqual(2, dataset.index)
     self.assertIsNone(model2.dataset_index, "The last model shouldn't be called in fit.")
     self.assertIsNone(transformer3.dataset_index, "The last transformer shouldn't be called in fit.")
     dataset = pipeline_model.transform(dataset)
     self.assertEqual(2, model0.dataset_index)
     self.assertEqual(3, transformer1.dataset_index)
     self.assertEqual(4, model2.dataset_index)
     self.assertEqual(5, transformer3.dataset_index)
     self.assertEqual(6, dataset.index)
def main():
    '''
    takes one input argument :: Location of the directory for training and test data files.
    :return: Print output on console for the area under the ROC curve.
    '''

    conf = SparkConf().setAppName("MLPipeline")
    sc = SparkContext(conf=conf)

    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet("20news_train.parquet")

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # Fit the pipeline to training data.
    model = pipeline.fit(trainDF)

    numFeatures = (1000, 5000, 10000)
    regParam = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
    paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, numFeatures).addGrid(lr.regParam, regParam).build()


    cv = CrossValidator().setEstimator(pipeline).setEvaluator(BinaryClassificationEvaluator()).setEstimatorParamMaps(paramGrid).setNumFolds(2)

    # Evaluate the model on testing data
    testDF = sqlCt.read.parquet("20news_test.parquet")
    prediction = model.transform(testDF)
    evaluator = BinaryClassificationEvaluator()


    model_cv = cv.fit(trainDF)
    prediction_cv = model_cv.transform(testDF)
    print(evaluator.evaluate(prediction))
    print(evaluator.evaluate(prediction_cv))
def main():
    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet(training_input)
    testDF = sqlCt.read.parquet(testing_input)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    evaluator = BinaryClassificationEvaluator()

    # no parameter tuning
    hashingTF_notuning = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr_notuning = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline_notuning = Pipeline(stages=[tokenizer, hashingTF_notuning, lr_notuning])
    model_notuning = pipeline_notuning.fit(trainDF)

    prediction_notuning = model_notuning.transform(testDF)
    notuning_output = evaluator.evaluate(prediction_notuning)

    # for cross validation
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=20)

    paramGrid = ParamGridBuilder()\
        .addGrid(hashingTF.numFeatures, [1000, 5000, 10000])\
        .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])\
        .build()

    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=2)
    cvModel = cv.fit(trainDF)

    # Make predictions on test documents. cvModel uses the best model found.
    best_prediction = cvModel.transform(testDF)
    best_output = evaluator.evaluate(best_prediction)

    s = str(notuning_output) + '\n' + str(best_output)
    output_data = sc.parallelize([s])
    output_data.saveAsTextFile(output)
def build_ngrams_wocs(inputCol=["Text","Sentiment"], n=3):
    tokenizer = [Tokenizer(inputCol="Text", outputCol="words")]
    ngrams = [
        NGram(n=i, inputCol="words", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]

    cv = [
        CountVectorizer(vocabSize=5460,inputCol="{0}_grams".format(i),
            outputCol="{0}_tf".format(i))
        for i in range(1, n + 1)
    ]
    idf = [IDF(inputCol="{0}_tf".format(i), outputCol="{0}_tfidf".format(i), minDocFreq=5) for i in range(1, n + 1)]

    assembler = [VectorAssembler(
        inputCols=["{0}_tfidf".format(i) for i in range(1, n + 1)],
        outputCol="features"
    )]
    label_stringIdx = [StringIndexer(inputCol = "Sentiment", outputCol = "label")]
    lr = [LogisticRegression(maxIter=100)]
    return Pipeline(stages=tokenizer + ngrams + cv + idf + assembler + label_stringIdx + lr)
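
A minimal usage sketch for the returned pipeline, assuming an active SparkSession named spark and a DataFrame with string "Text" and categorical "Sentiment" columns:

df = spark.createDataFrame(
    [("loved every minute of it", "pos"), ("terrible and boring", "neg"),
     ("great acting and story", "pos"), ("awful plot and weak cast", "neg")],
    ["Text", "Sentiment"])
pipelineFit = build_ngrams_wocs().fit(df)
pipelineFit.transform(df).select("Sentiment", "label", "prediction").show()
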
Example #27
# We freeze layers from input to pool4/3x3_s2 inclusive.

model.freeze_up_to(["pool4/3x3_s2"])

# ### Add a few new layers

inputNode = Input(name="input", shape=(3, 224, 224))
inception = model.to_keras()(inputNode)
flatten = Flatten()(inception)
logits = Dense(2)(flatten)
lrModel = Model(inputNode, logits)
classifier = NNClassifier(
    lrModel, CrossEntropyCriterion(),
    transformer).setLearningRate(0.003).setBatchSize(64).setMaxEpoch(
        1).setFeaturesCol("image").setCachingSample(False)
pipeline = Pipeline(stages=[classifier])

# # Train the model
# The transfer learning can finish in a few minutes.

catdogModel = pipeline.fit(trainingDF)
predictionDF = catdogModel.transform(validationDF).cache()

predictionDF.select("name", "label",
                    "prediction").sort("label", ascending=False).show(10)
predictionDF.select("name", "label", "prediction").show(10)
correct = predictionDF.filter("label=prediction").count()
overall = predictionDF.count()
accuracy = correct * 1.0 / overall
print("Test Error = %g " % (1.0 - accuracy))
Example #28
assembler = VectorAssembler(
    inputCols = input_cols,
    outputCol = 'features')

from pyspark.ml import Pipeline


from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(featuresCol = 'features', labelCol = 'CANCELLED', maxIter=100)

pipeline = Pipeline(stages=[op_carrier_indexer, 
                            op_carrier_encoder, 
                            origin_indexer,
                            origin_encoder,
                            dest_indexer,
                            dest_encoder,
                            crs_dep_hour_indexer,
                            crs_dep_hour_encoder,
                            assembler,
                            lr])

(train, test) = flight_df.randomSplit([0.7, 0.3])

lrModel = pipeline.fit(train)

from pyspark.ml.evaluation import BinaryClassificationEvaluator

predictionslr = lrModel.transform(test)
evaluator = BinaryClassificationEvaluator(labelCol="CANCELLED",metricName="areaUnderROC")
evaluator.evaluate(predictionslr)
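
The op_carrier/origin/dest/crs_dep_hour stages are defined earlier in the original source (not shown here); a hedged sketch of how the first pair could be built follows, with column names assumed from the stage names.

from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer

op_carrier_indexer = StringIndexer(inputCol="OP_CARRIER", outputCol="OP_CARRIER_IDX")
op_carrier_encoder = OneHotEncoderEstimator(inputCols=["OP_CARRIER_IDX"],
                                            outputCols=["OP_CARRIER_VEC"])
# The origin, dest and crs_dep_hour stages follow the same indexer/encoder pattern,
# and input_cols collects the encoded vector columns plus the numeric features.
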
Example #29
# OneHot encode type
onehot = OneHotEncoderEstimator(inputCols=['type_idx'],
                                outputCols=['type_dummy'])

# Create 'features' vector: 'weight_kg', 'cyl', 'type_dummy'
assembler = VectorAssembler(inputCols=['weight_kg', 'cyl', 'type_dummy'],
                            outputCol='features')

# Split the data into training and testing sets
kars_train, kars_test = kars.randomSplit([0.8, 0.2], seed=23)

# Create a Linear Regression model to predict consumption
regression = LinearRegression(labelCol='consumption')

# Combine steps into a pipeline
pipeline = Pipeline(stages=[indexer, onehot, assembler, regression])

# object to evaluate performance
evaluator = RegressionEvaluator(labelCol='consumption')

# build grid of parameter values (now empty)
params = ParamGridBuilder().build()

# create cross-validation object
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=params,
                    evaluator=evaluator,
                    numFolds=10,
                    seed=13)

# run fit on training data
Example #30
#fulldata=fulldata.select(['product_uid','id','tf_idf_plus','tf_idfs_plus','relevance'])

#COMPUTE COSINE

# create NEW features & train and evaluate regression model
# Step 1: create features
fulldata = fulldata.withColumnRenamed('relevance',
                                      'label').select(['label', 'features'])

# Simple evaluation : train and test split
(train, test) = fulldata.rdd.randomSplit([0.8, 0.2])

#Initialize regresion model
#lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
featureIndexer = VectorIndexer(inputCol="features",
                               outputCol="indexedFeatures",
                               maxCategories=4).fit(
                                   sqlContext.createDataFrame(train))
rf = RandomForestRegressor(featuresCol="indexedFeatures")
pipeline = Pipeline(stages=[featureIndexer, rf])

# Fit the model
#lrModel = lr.fit(sqlContext.createDataFrame(train))
lrModel = pipeline.fit(sqlContext.createDataFrame(train))

# Apply model to test data
result = lrModel.transform(sqlContext.createDataFrame(test))
# Compute mean squared error metric
MSE = result.rdd.map(lambda r: (r['label'] - r['prediction'])**2).mean()
print("Mean Squared Error = " + str(MSE))
Example #31

# prepare labeled sets
cols_now = ['prod_price',
            'prod_feat_1',
            'prod_feat_2',
            'cust_age',
            'prod_feat_3_reduced_catVec',
            'cust_region_catVec',
            'prod_type_catVec',
            'cust_sex_catVec',
            'cust_title_catVec']
assembler_features = VectorAssembler(inputCols=cols_now, outputCol='features')
labelIndexer = StringIndexer(inputCol='binary_response', outputCol="label")
tmp += [assembler_features, labelIndexer]
pipeline = Pipeline(stages=tmp)



# prepare a pipeline

allData = pipeline.fit(df).transform(df)
allData.cache()
trainingData, testData = allData.randomSplit([0.8,0.2], seed=0) # need to ensure same split for each time
print("Distribution of Pos and Neg in trainingData is: ", trainingData.groupBy("label").count().take(3))

# prediction and evaluation data 


from pyspark.mllib.evaluation import BinaryClassificationMetrics as metric
results = transformed.select(['probability', 'label'])
Example #32
label_stringIdx = StringIndexer(inputCol = 'CRASH_FLAG', outputCol = 'label')
stages += [label_stringIdx]

assemblerInputs = [c + "classVec" for c in Categoric_features] + numeric_features

assembler = VectorAssembler()\
            .setInputCols(assemblerInputs) \
            .setOutputCol("vec_features") 
stages += [assembler]

scaler = StandardScaler()\
         .setInputCol("vec_features") \
         .setOutputCol("features") 
stages += [scaler]

pipeline = Pipeline(stages = stages)
pipelineModel = pipeline.fit(new_df)

testdf=pipelineModel.transform(new_df)

pipelineModel.write().overwrite().save(PipelineLoc)

#split the rows into 70% training and 30% testing sets
splits=testdf.randomSplit([0.7, 0.3], 2018)

train_df=splits[0]
test_df=splits[1]

#use Binomial Logistic regression to predict "CRASH_FLAG"

lr = LogisticRegression(featuresCol= 'features', labelCol='label', maxIter=10)
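
A short sketch of the training and evaluation step this snippet sets up; using BinaryClassificationEvaluator is an assumption here, based on the binomial CRASH_FLAG label noted above.

from pyspark.ml.evaluation import BinaryClassificationEvaluator

lrModel = lr.fit(train_df)
predictions = lrModel.transform(test_df)

evaluator = BinaryClassificationEvaluator(labelCol="label",
                                          rawPredictionCol="rawPrediction")
print("Area under ROC: {}".format(evaluator.evaluate(predictions)))
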
def graph(t1, t2, time_min, time_sec, fb, fi, ft, fd, t1tk, t2tk, t1ik, t2ik,
          first_dragon, first_rift_herald):
    copyfile('lib_full.txt', 'lib3.txt')

    for k in range(0, 10):
        for j in range(0, 10):
            line = [
                100 * time_min + time_sec, 9, fb, ft, fi, fd, first_dragon,
                first_rift_herald, t1[0], t1[1], t1[2], t1[3], t1[4], k, t1ik,
                t2[0], t2[1], t2[2], t2[3], t2[4], j, t2ik
            ]
            test_line = [2]
            for i in range(len(line)):
                new_item = "%s:%s" % (i + 1, line[i])
                test_line.append(new_item)

            print(test_line)

            with open('lib3.txt', 'a') as f:
                for item in test_line:
                    f.write("%s " % item)
                f.write("\n")

    data = spark.read.format("libsvm").option("numFeatures",
                                              "22").load("lib3.txt")
    (trainingData, testData) = split_by_row_index(data)

    labelIndexer = StringIndexer(inputCol="label",
                                 outputCol="indexedLabel").fit(data)
    featureIndexer = VectorIndexer(inputCol="features",
                                   outputCol="indexedFeatures",
                                   maxCategories=32).fit(data)
    gbt = GBTClassifier(labelCol="indexedLabel",
                        featuresCol="indexedFeatures",
                        maxIter=10)
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, gbt])
    model = pipeline.fit(trainingData)

    predictions = model.transform(testData)
    predictions.show(100, False)
    result_list = predictions.collect()
    # try:
    #     os.remove("lib3.txt")
    # except:
    #     print "file not existed"

    x1 = []
    y1 = []

    x2 = []
    y2 = []

    result1 = []
    result2 = []
    #13 , 20
    for i in range(0, 10):
        for j in range(0, 10):
            if result_list[len(result_list) - 1 - 99 + i * 10 +
                           j]['prediction'] == 0:
                result1.append('Team 1 Win')
                x1.append(i)
                y1.append(j)
            else:
                result2.append('Team 2 Win')
                x2.append(i)
                y2.append(j)

    plt = dcc.Graph(id='life-exp-vs-gdp',
                    figure={
                        'data': [
                            go.Scatter(
                                x=x1,
                                y=y1,
                                text=result1,
                                mode='markers',
                                opacity=0.7,
                                marker={
                                    'size': 15,
                                    'line': {
                                        'width': 0.5,
                                        'color': 'white'
                                    }
                                },
                            ),
                            go.Scatter(
                                x=x2,
                                y=y2,
                                text=result2,
                                mode='markers',
                                opacity=0.7,
                                marker={
                                    'size': 15,
                                    'line': {
                                        'width': 0.5,
                                        'color': 'blue'
                                    }
                                },
                            )
                        ],
                        'layout':
                        go.Layout(xaxis={
                            'type': 'log',
                            'title': 'Team 1 tower kill'
                        },
                                  yaxis={'title': 'Team 2 tower kill'},
                                  margin={
                                      'l': 40,
                                      'b': 40,
                                      't': 10,
                                      'r': 10
                                  },
                                  legend={
                                      'x': 0,
                                      'y': 1
                                  },
                                  hovermode='closest')
                    })

    return plt
def build_indep_vars(df,
                     independent_vars,
                     categorical_vars=None,
                     keep_intermediate=False,
                     summarizer=True):
    """
    Data verification
    df               : DataFrame
    independent_vars : List of column names
    categorical_vars : None or list of column names, e.g. ['col1', 'col2']
    """
    assert (
        type(df) is pyspark.sql.dataframe.DataFrame
    ), 'pyspark_glm: A pySpark dataframe is required as the first argument.'
    assert (
        type(independent_vars) is list
    ), 'pyspark_glm: A list of independent variable column names must be the second argument.'
    for iv in independent_vars:
        assert (
            type(iv) is str
        ), 'pyspark_glm: Independent variables must be column name strings.'
        assert (
            iv in df.columns
        ), 'pyspark_glm: Independent variable name is not a dataframe column.'
    if categorical_vars:
        for cv in categorical_vars:
            assert (
                type(cv) is str
            ), 'pyspark_glm: Categorical variables must be column name strings.'
            assert (
                cv in df.columns
            ), 'pyspark_glm: Categorical variable name is not a dataframe column.'
            assert (
                cv in independent_vars
            ), 'pyspark_glm: Categorical variables must be independent variables.'
    """
    Code
    """
    from pyspark.ml import Pipeline
    from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
    from pyspark.ml.regression import GeneralizedLinearRegression

    if categorical_vars:
        string_indexer = [
            StringIndexer(inputCol=x, outputCol='{}_index'.format(x))
            for x in categorical_vars
        ]

        encoder = [
            OneHotEncoder(dropLast=True,
                          inputCol='{}_index'.format(x),
                          outputCol='{}_vector'.format(x))
            for x in categorical_vars
        ]

        independent_vars = [
            '{}_vector'.format(x) if x in categorical_vars else x
            for x in independent_vars
        ]
    else:
        string_indexer, encoder = [], []

    assembler = VectorAssembler(inputCols=independent_vars,
                                outputCol='indep_vars')
    pipeline = Pipeline(stages=string_indexer + encoder + [assembler])
    model = pipeline.fit(df)
    df = model.transform(df)

    # for building the crosswalk between indices and column names
    if summarizer:
        param_crosswalk = {}

        i = 0
        for x in independent_vars:
            if '_vector' in x[-7:]:
                xrs = x[:-len('_vector')]  # strip the suffix (rstrip('_vector') would drop arbitrary trailing characters)
                dst = df[[xrs, '{}_index'.format(xrs)]].distinct().collect()

                for row in dst:
                    param_crosswalk[int(row['{}_index'.format(xrs)] +
                                        i)] = row[xrs]
                maxind = max(param_crosswalk.keys())
                del param_crosswalk[maxind]  #for droplast
                i += len(dst)
            elif '_index' in x[-6:]:
                pass
            else:
                param_crosswalk[i] = x
                i += 1
        """
        {0: 'carat',
         1: u'SI1',
         2: u'VS2',
         3: u'SI2',
         4: u'VS1',
         5: u'VVS2',
         6: u'VVS1',
         7: u'IF'}
        """
        make_summary = Summarizer(param_crosswalk)

    if not keep_intermediate:
        fcols = [
            c for c in df.columns
            if '_index' not in c[-6:] and '_vector' not in c[-7:]
        ]
        df = df[fcols]

    if summarizer:
        return df, make_summary
    else:
        return df
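
A minimal usage sketch, assuming an active SparkSession named spark and the module-level `import pyspark` that the type assertions rely on; summarizer=False is passed because Summarizer is an external helper not shown here.

df = spark.createDataFrame(
    [(1.0, 'a', 3.2), (2.0, 'b', 4.1), (3.0, 'a', 5.0), (4.0, 'b', 6.3)],
    ['x1', 'x2', 'y'])
df = build_indep_vars(df, ['x1', 'x2'], categorical_vars=['x2'],
                      summarizer=False)
df.select('indep_vars', 'y').show()
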
    ).toDF()

df_new = sc.parallelize(
    [
        Row(p=u'p1', owner=u'u1', f1=0.1, f2=0.3, f3=0.5),
        Row(p=u'p2', owner=u'u1', f1=0.3, f2=0.5, f3=0.5),
        Row(p=u'p3', owner=u'u1', f1=0.6, f2=0.6, f3=0.9),
        Row(p=u'p4', owner=u'u1', f1=0.8, f2=0.1, f3=0.6),
        Row(p=u'p5', owner=u'u1', f1=0.0, f2=0.2, f3=0.2),
        Row(p=u'p1', owner=u'u2', f1=0.0, f2=0.4, f3=0.1),
        Row(p=u'p2', owner=u'u2', f1=0.3, f2=0.7, f3=0.4),
        Row(p=u'p3', owner=u'u2', f1=0.4, f2=0.6, f3=0.6),
        Row(p=u'p4', owner=u'u2', f1=0.6, f2=0.1, f3=0.7),
        Row(p=u'p5', owner=u'u2', f1=0.0, f2=0.0, f3=0.8),
    ]
    ).toDF()

owner_training = df_training.where(col('owner') == 'u1')
owner_new = df_new.where(col('owner') == 'u1')

label_indexer = StringIndexer(inputCol="status",
                             outputCol="indexedStatus")
assembler = VectorAssembler(inputCols=['f1', 'f2', 'f3'],
                            outputCol='features')
rf = RandomForestClassifier(labelCol="indexedStatus", featuresCol="features")

pipeline = Pipeline(stages=[label_indexer, assembler, rf])
model = pipeline.fit(owner_training)
predictions = model.transform(owner_new)
predictions.show()
Example #36
        .getOrCreate()

# Prepare training documents from a list of (id, text, label) tuples, where label ham=0/spam=1
training = spark.createDataFrame([
    (0, "Meetup Spark user group Dublin", 0.0),
    (1, "Quick Loans availuble!", 1.0),
    (2, "New: The 20 pounds-per-day diet. Must try.", 1.0),
    (3, "hadoop mapreduce", 0.0),
    (4, "GET YOUR UUNIVERSITY DEGREE IN DATA ANALYSTICS. IN JUST 1 DAY", 1.0)
], ["id", "text", "label"])

# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.001)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# Fit the pipeline to training documents.
model = pipeline.fit(training)

# Prepare test documents, which are unlabeled (id, text) tuples.
test = spark.createDataFrame(
    [(5, "I am not a spam, I promise!!!"),
     (6, "Spark can work on top of hadoop or standalone"),
     (7, "New release available for Spark on DataSets")], ["id", "text"])

# Make predictions on test documents and print columns of interest.
prediction = model.transform(test)
selected = prediction.select("id", "text", "probability", "prediction")
for row in selected.collect():
    rid, text, prob, prediction = row
Example #37
print('Building Pipeline.\n')
regexTokenizer = RegexTokenizer(inputCol='review',
                                outputCol='tokenized',
                                pattern="\\W")

stopwordsRemover = StopWordsRemover(inputCol='tokenized',
                                    outputCol='removed_sw').setStopWords(sw)

countVectorizer = CountVectorizer(inputCol="removed_sw",
                                  outputCol="features",
                                  vocabSize=10000,
                                  minDF=5)

lr = LogisticRegression(featuresCol='features', labelCol='label')

pipeline = Pipeline(
    stages=[regexTokenizer, stopwordsRemover, countVectorizer, lr])

#Split data into training and testing partitions
print('Splitting data.\n')
train_data, test_data = data.randomSplit([0.9, 0.1])

#Fit and transform training data
print('Fitting training data to pipeline.\n')
pipelineFit = pipeline.fit(train_data)
transformed_train_data = pipelineFit.transform(train_data)

#Predict on test data
print('Predicting on testing data.\n')
test_predictions = pipelineFit.transform(test_data)

#Calculate Metrics
# Splitting data into train test data and streaming data
train_test_data, streaming_data = df_indexed.randomSplit([0.95, 0.05])

# SAVING STREAMING DATA
streaming_data.write.save("OutputGStore\\" + unique_key + "-streaming-data.csv", format="csv", header="true")
del streaming_data

## STEP 2: Prepare, train and validate the data
print("STEP 2: Train and validate the model")

feature_cols = train_test_data.columns
feature_cols.remove('Installs indexed')

assembler = VectorAssembler(inputCols = feature_cols, outputCol = "features", handleInvalid = "error")
pipeline = Pipeline(stages=[assembler])
outputModel = pipeline.fit(train_test_data)
output = outputModel.transform(train_test_data)
final_data = output.select("features", "Installs indexed")

train_data, test_data = final_data.randomSplit([0.7, 0.3])


# CLASSIFICATION CODE
# Random forest classifier
rf = RandomForestClassifier(labelCol="Installs indexed", featuresCol="features", numTrees=32, maxBins=120)
model = rf.fit(train_data)

predictions = model.transform(test_data)

evaluator = MulticlassClassificationEvaluator(
    labelCol="Installs indexed", predictionCol="prediction")
Example #39
        .getOrCreate()

    # Prepare training documents from a list of (id, text, label) tuples.
    training = sparkSession.createDataFrame([(0, "a b c d e spark", 1.0),
                                             (1, "b d", 0.0),
                                             (2, "spark f g h", 1.0),
                                             (3, "hadoop mapreduce", 0.0)],
                                            ["id", "text", "label"])

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and logistic regression.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                          outputCol="features")
    logistic_regression = LogisticRegression(maxIter=10, regParam=0.001)

    pipeline = Pipeline(stages=[tokenizer, hashingTF, logistic_regression])

    # Fit the pipeline to training documents.
    model = pipeline.fit(training)

    # Prepare test documents, which are unlabeled (id, text) tuples.
    test = sparkSession.createDataFrame([(4, "spark i j k"), (5, "l m n"),
                                         (6, "spark hadoop spark"),
                                         (7, "apache hadoop")], ["id", "text"])

    # Make predictions on test documents and print columns of interest.
    prediction = model.transform(test)
    selected = prediction.select("id", "text", "probability", "prediction")
    for row in selected.collect():
        rid, text, prob, prediction = row
        print("(%d, %s) --> prob=%s, prediction=%f" %
Example #40
# Change categorical values into numeric
indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_index")
    for column in catColumns
]
encoder = OneHotEncoderEstimator(
    inputCols=[c + "_index" for c in catColumns],
    outputCols=[c + "_vector" for c in catColumns])

assembler = VectorAssembler(inputCols=encoder.getOutputCols() + numColumns,
                            outputCol="features")

label_stringIdx = StringIndexer(inputCol="income", outputCol="label")

pipeline = Pipeline(stages=indexers + [label_stringIdx, encoder, assembler])
encoded_df = pipeline.fit(df).transform(df)

selectedCols = ['label', 'features'] + cols
dataset = encoded_df.select(selectedCols)

# Randomly split data into training and test sets. set seed for reproducibility
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)
print(trainingData.count())
print(testData.count())

# fit model and train
lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
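
A short evaluation sketch for the predictions above; the binary evaluator is an assumption, based on the indexed "income" label being binary.

from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol="label",
                                          rawPredictionCol="rawPrediction")
print("Test Area Under ROC: {}".format(evaluator.evaluate(predictions)))
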
# COMMAND ----------

# Regularization Rates
from pyspark.ml.classification import LogisticRegression

# try a regularization rate in a Logistic Regression model
reg = 0
print("Regularization rate: {}".format(reg))
# create a bunch of child runs
#with root_run.child_run("reg-" + str(reg)) as run:
# create a new Logistic Regression model.

lr = LogisticRegression(regParam=reg)

# put together the pipeline
pipe = Pipeline(stages=[*si_xvars, *ohe_xvars, si_label, assembler, lr])

# train the model
model_pipeline = pipe.fit(trainingData)

# make prediction
predictions = model_pipeline.transform(testData)

# evaluate. Note that BinaryClassificationEvaluator supports only two metrics out of the box: areaUnderROC and areaUnderPR.
bce = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction')
au_roc = bce.setMetricName('areaUnderROC').evaluate(predictions)
au_prc = bce.setMetricName('areaUnderPR').evaluate(predictions)
truePositive = predictions.select("label").filter(
    "label = 1 and prediction = 1").count()
falsePositive = predictions.select("label").filter(
    "label = 0 and prediction = 1").count()

# COMMAND ----------

from pyspark.ml.feature import VectorAssembler

vectorAssembler = VectorAssembler()\
  .setInputCols(["UnitPrice", "Quantity", "day_of_week_encoded"])\
  .setOutputCol("features")


# COMMAND ----------

from pyspark.ml import Pipeline

transformationPipeline = Pipeline()\
  .setStages([indexer, encoder, vectorAssembler])


# COMMAND ----------

fittedPipeline = transformationPipeline.fit(trainDataFrame)


# COMMAND ----------

transformedTraining = fittedPipeline.transform(trainDataFrame)


# COMMAND ----------

from pyspark.ml.clustering import KMeans
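# A minimal sketch (an assumed continuation, not code from the original notebook):
# fit a KMeans model on the transformed training data produced by the pipeline above.
kmeans = KMeans(k=20, seed=1)
kmModel = kmeans.fit(transformedTraining)
for center in kmModel.clusterCenters():
    print(center)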
Example #43
0
def train_model_sentences_with_person():
    sentences_with_person_collection = get_db_collection_object(
        'SentencesWithPerson')

    with open("sentences_with_person.txt", "w",
              encoding='utf-8') as file_sentences_with_person:
        for sen in sentences_with_person_collection.find():
            file_sentences_with_person.write('{0}\n'.format(sen['sentence']))

    spark = SparkSession \
        .builder \
        .appName("SentenceProcessor") \
        .getOrCreate()

    input_data = spark.sparkContext.textFile('./sentences_with_person.txt')
    prepared_data = input_data.map(lambda x: (x, len(x)))
    prepared_data = prepared_data.filter(lambda x: x[1] > 0)

    prepared_df = prepared_data.toDF().selectExpr('_1 as sentence',
                                                  '_2 as length')
    # prepared_df.show(truncate=False)

    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    words_data = tokenizer.transform(prepared_df)
    # words_data.show(truncate=False)

    # Filter the tokens, keeping only words
    filtered_words_data = words_data.rdd.map(
        lambda x: (x[0], x[1], get_only_words(x[2])))
    filtered_df = filtered_words_data.toDF().selectExpr(
        '_1 as sentence', '_2 as length', '_3 as words')
    # filtered_df.show()

    # Remove stop words (conjunctions, prepositions, pronouns, etc.)
    stop_words = stopwords.words('russian')
    remover = StopWordsRemover(inputCol='words',
                               outputCol='filtered',
                               stopWords=stop_words)
    filtered = remover.transform(filtered_df)

    # Normalize the words in each sentence
    normalize_words_data = filtered.rdd.map(
        lambda x: (x[0], x[1], x[2], normalization_sentence(x[3])))
    normalized_df = normalize_words_data.toDF().selectExpr(
        '_1 as sentence', '_2 as length', '_3 as words',
        '_4 as normalize_words')
    # normalized_df.show()

    # Build term-frequency vectors with CountVectorizer
    vectorizer = CountVectorizer(inputCol='normalize_words',
                                 outputCol='raw_features').fit(normalized_df)
    featurized_data = vectorizer.transform(normalized_df)
    featurized_data.cache()

    # Re-weight the raw term frequencies with IDF
    idf = IDF(inputCol='raw_features', outputCol='features')
    idf_model = idf.fit(featurized_data)
    rescaled_data = idf_model.transform(featurized_data)

    # Build the Word2Vec model
    word2Vec = Word2Vec(vectorSize=300,
                        minCount=0,
                        inputCol='normalize_words',
                        outputCol='result')
    # Fit the Word2Vec estimator directly (the tokenizer has already been applied above)
    model = word2Vec.fit(rescaled_data)
    w2v_df = model.transform(rescaled_data)
    # w2v_df.show(truncate=False)

    # print(model.findSynonyms('бочаров', 2).show())

    # sc = spark.sparkContext
    path = './models/model_person'
    #
    # print(sc, path)
    model.write().overwrite().save(path)

    #m = Word2Vec.load('./models/model_person/')
    # pickle.dump(model, './models/model_person/mp.model')
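    # A brief sketch (an assumption, not part of the original script): the saved model is
    # restored with Word2VecModel.load rather than Word2Vec.load as attempted above.
    from pyspark.ml.feature import Word2VecModel
    loaded_w2v = Word2VecModel.load(path)
    # loaded_w2v.findSynonyms('бочаров', 2).show()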

    spark.stop()
spark = SparkSession.builder.appName("Convolutional Neural Networks - Transfer Learning - Image Recognition").getOrCreate()

# (3) Load the Plane and Bird images into Spark DataFrames and define a literal label column
path_to_img_directory = '/data/workspaces/jillur.quddus/jupyter/notebooks/Machine-Learning-with-Apache-Spark-QuickStart-Guide/chapter07/data/image-recognition-data'
birds_df = ImageSchema.readImages(path_to_img_directory + "/birds").withColumn("label", lit(0))
planes_df = ImageSchema.readImages(path_to_img_directory + "/planes").withColumn("label", lit(1))

# (4) Create Training and Test DataFrames respectively
planes_train_df, planes_test_df = planes_df.randomSplit([0.75, 0.25], seed=12345)
birds_train_df, birds_test_df = birds_df.randomSplit([0.75, 0.25], seed=12345)
train_df = planes_train_df.unionAll(birds_train_df)
test_df = planes_test_df.unionAll(birds_test_df)

# (5) Transform the Images into Numeric Feature Vectors using Transfer Learning and the pre-trained InceptionV3 Convolutional Neural Network
featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3")

# (6) Train a Logistic Regression Model to classify our images
logistic_regression = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")

# (7) Execute the Featurizer and Logistic Regression estimator within a Pipeline to generate the Trained Model
pipeline = Pipeline(stages=[featurizer, logistic_regression])
model = pipeline.fit(train_df)

# (8) Apply the Trained Image Classification Model to the Test DataFrame to make predictions
test_predictions_df = model.transform(test_df)
test_predictions_df.select("image.origin", "prediction").show(truncate=False)

# (9) Compute the accuracy of our Trained Image Classification Model
accuracy_evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Accuracy on Test Dataset = %g" % accuracy_evaluator.evaluate(test_predictions_df.select("label", "prediction")))
# Read the source file and create a DataFrame
ModalDF = sqlContext.read.csv(TrainSource,header="True",inferSchema="True").selectExpr("*",ConditionExpr)

# Assemble the input columns into a single vector column called "features"
vectorizer = VectorAssembler()
vectorizer.setInputCols(["Lat", "Long", "Ele","LocalTime"])
vectorizer.setOutputCol("features")

# Declare an estimator for each regression target and the condition classifier
lr0 = LogisticRegression(labelCol="Condition",predictionCol="Predicted_Cond",maxIter=100, regParam=0, family="multinomial")
lr1 = LinearRegression(labelCol="Temp",predictionCol="Predicted_Temp",maxIter=100,regParam=0.1)
lr2 = LinearRegression(labelCol="Pres",predictionCol="Predicted_Pres",maxIter=100,regParam=0.1)
lr3 = LinearRegression(labelCol="Humid",predictionCol="Predicted_Humid",maxIter=100,regParam=0.1)

# Combine all the regressions in a pipeline and fit the dataset to create a model
lrPipeline = Pipeline()
lrPipeline.setStages([vectorizer, lr1, lr2,lr3,lr0])
lrModel = lrPipeline.fit(ModalDF)

# COMMAND ----------

"""
The Following code Take the Test Dataset and perform following actions
    - Gets GeoInformation (Latitude, Longitude,Elevation)
    - Gets Monthly Data&Timestamps for each record
    - Predict the Temperature, Pressure, Humidity & Condition using Pipeline Model
    - Change the Fomat and write it in a file
"""

# Extract and Transform TestRDD
TestRDD = sc.textFile(TestSource).map(lambda line: list(get_geo_info(line))).flatMap(lambda line: list(get_datetime_info(line)))
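# A rough sketch (assumptions: each transformed TestRDD record is a tuple of Lat, Long, Ele
# and LocalTime, matching the vectorizer's input columns; the output path is hypothetical)
# of the remaining steps described above: build a DataFrame, predict with lrModel, write out.
TestDF = TestRDD.toDF(["Lat", "Long", "Ele", "LocalTime"])
TestPredictions = lrModel.transform(TestDF)
TestPredictions.select("Lat", "Long", "LocalTime", "Predicted_Temp", "Predicted_Pres",
                       "Predicted_Humid", "Predicted_Cond") \
    .write.mode("overwrite").csv("/tmp/weather_predictions")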
Example #46
0
def data_processing(df):
    '''
    :param data: A PySpark dataframe
    :return: A preprocessed data that has been cleaned, indexed and assembled
    '''
    df.createOrReplaceTempView("data")

    processed_data = spark.sql("""
    select
        host_id,
        price,
        bathrooms,
        bedrooms,
        room_type,
        property_type,
        case when host_is_superhost = True
            then 1.0
            else 0.0
        end as host_is_superhost,
        accommodates,
        cancellation_policy,
        minimum_nights,
        maximum_nights,
        availability_30,
        availability_60,
        availability_90,
        availability_365,
        case when security_deposit is null
            then 0.0
            else security_deposit
        end as security_deposit,
        case when number_of_reviews is null
            then 0.0
            else number_of_reviews
        end as number_of_reviews,
        case when extra_people is null
            then 0.0
            else extra_people
        end as extra_people,
        case when instant_bookable = True
            then 1.0
            else 0.0
        end as instant_bookable,
        case when cleaning_fee is null
            then 0.0
            else cleaning_fee
        end as cleaning_fee,
        case when review_scores_rating is null
            then 0.0
            else review_scores_rating
        end as review_scores_rating,
        case when review_scores_accuracy is null
            then 0.0
            else review_scores_accuracy
        end as review_scores_accuracy,
        case when review_scores_cleanliness is null
            then 0.0
            else review_scores_cleanliness
        end as review_scores_cleanliness,
        case when review_scores_checkin is null
            then 0.0
            else review_scores_checkin
        end as review_scores_checkin,
        case when review_scores_communication is null
            then 0.0
            else review_scores_communication
        end as review_scores_communication,
        case when review_scores_location is null
            then 0.0
            else review_scores_location
        end as review_scores_location,
        case when review_scores_value is null
            then 0.0
            else review_scores_value
        end as review_scores_value,
        case when square_feet is not null and square_feet > 100
            then square_feet
            when (square_feet is null or square_feet <=100) and (bedrooms is null or bedrooms = 0)
            then 350.0
            else 380 * bedrooms
        end as square_feet,
        case when bathrooms >= 2
            then 1.0
            else 0.0
        end as n_bathrooms_more_than_two,
        case when amenity_wifi = True
            then 1.0
            else 0.0
        end as amenity_wifi,
        case when amenity_heating = True
            then 1.0
            else 0.0
        end as amenity_heating,
        case when amenity_essentials = True
            then 1.0
            else 0.0
        end as amenity_essentials,
        case when amenity_kitchen = True
            then 1.0
            else 0.0
        end as amenity_kitchen,
        case when amenity_tv = True
            then 1.0
            else 0.0
        end as amenity_tv,
        case when amenity_smoke_detector = True
            then 1.0
            else 0.0
        end as amenity_smoke_detector,
        case when amenity_washer = True
            then 1.0
            else 0.0
        end as amenity_washer,
        case when amenity_hangers = True
            then 1.0
            else 0.0
        end as amenity_hangers,
        case when amenity_laptop_friendly_workspace = True
            then 1.0
            else 0.0
        end as amenity_laptop_friendly_workspace,
        case when amenity_iron = True
            then 1.0
            else 0.0
        end as amenity_iron,
        case when amenity_shampoo = True
            then 1.0
            else 0.0
        end as amenity_shampoo,
        case when amenity_hair_dryer = True
            then 1.0
            else 0.0
        end as amenity_hair_dryer,
        case when amenity_family_kid_friendly = True
            then 1.0
            else 0.0
        end as amenity_family_kid_friendly,
        case when amenity_dryer = True
            then 1.0
            else 0.0
        end as amenity_dryer,
        case when amenity_fire_extinguisher = True
            then 1.0
            else 0.0
        end as amenity_fire_extinguisher,
        case when amenity_hot_water = True
            then 1.0
            else 0.0
        end as amenity_hot_water,
        case when amenity_internet = True
            then 1.0
            else 0.0
        end as amenity_internet,
        case when amenity_cable_tv = True
            then 1.0
            else 0.0
        end as amenity_cable_tv,
        case when amenity_carbon_monoxide_detector = True
            then 1.0
            else 0.0
        end as amenity_carbon_monoxide_detector,
        case when amenity_first_aid_kit = True
            then 1.0
            else 0.0
        end as amenity_first_aid_kit,
        case when amenity_host_greets_you = True
            then 1.0
            else 0.0
        end as amenity_host_greets_you,
        case when amenity_translation_missing_en_hosting_amenity_50 = True
            then 1.0
            else 0.0
        end as amenity_translation_missing_en_hosting_amenity_50,
        case when amenity_private_entrance = True
            then 1.0
            else 0.0
        end as amenity_private_entrance,
        case when amenity_bed_linens = True
            then 1.0
            else 0.0
        end as amenity_bed_linens,
        case when amenity_refrigerator = True
            then 1.0
            else 0.0
        end as amenity_refrigerator
    from data
    where bedrooms is not null
    """)

    processed_data = processed_data.na.drop()

    cat_cols = [
        f.name for f in processed_data.schema.fields
        if isinstance(f.dataType, StringType)
    ]
    num_cols = [
        f.name for f in processed_data.schema.fields
        if isinstance(f.dataType, IntegerType)
    ]
    decimal_cols = [
        f.name for f in processed_data.schema.fields
        if isinstance(f.dataType, DecimalType)
    ]
    double_cols = [
        f.name for f in processed_data.schema.fields
        if isinstance(f.dataType, DoubleType)
    ]
    num_features = num_cols + decimal_cols + double_cols
    dataset_imputed = processed_data.persist()

    stages = []
    for x in cat_cols:
        cats_indexer = StringIndexer(inputCol=x, outputCol=x + 'Index')
        encoder = OneHotEncoderEstimator(
            inputCols=[cats_indexer.getOutputCol()], outputCols=[x + "encode"])
        stages += [cats_indexer, encoder]

    assembler_inputs = [c + "encode" for c in cat_cols] + num_features
    assembler = VectorAssembler(inputCols=assembler_inputs,
                                outputCol="features")
    stages += [assembler]
    pipeline = Pipeline(stages=stages)
    pipeline_model = pipeline.fit(dataset_imputed)
    df = pipeline_model.transform(dataset_imputed)

    return df
credit.printSchema()

featureCols = [
    "balance", "duration", "history", "purpose", "amount", "savings",
    "employment", "instPercent", "sexMarried", "guarantors",
    "residenceDuration", "assets", "age", "concCredit", "apartment", "credits",
    "occupation", "dependents", "hasPhone", "foreign"
]

lindexer = StringIndexer().setInputCol("creditability").setOutputCol("label")

assembler = VectorAssembler().setInputCols(featureCols).setOutputCol(
    "features")

pipeline = Pipeline().setStages([assembler, lindexer])

credit = pipeline.fit(credit).transform(credit)

(training, test) = credit.randomSplit([0.7, 0.3], seed=1234)

classifier = RandomForestClassifier().setImpurity("gini").setMaxDepth(
    3).setNumTrees(20).setFeatureSubsetStrategy("auto").setSeed(1234)
model = classifier.fit(training)

predictions = model.transform(test)

predictions.show()

evaluator = BinaryClassificationEvaluator().setLabelCol("label")
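# A natural follow-up (not shown in the original snippet): compute area under ROC
# for the random forest predictions with the evaluator defined above.
auc = evaluator.evaluate(predictions)
print("Area under ROC = %g" % auc)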
        labelCol="label")
elif algo == "xgboost":
    ## Create H2OXGBoost model
    algoStage = H2OXGBoost(convertUnknownCategoricalLevelsToNa=True,
                           featuresCols=[idf.getOutputCol()],
                           labelCol="label")
## Remove all helper columns
colPruner = ColumnPruner(columns=[
    idf.getOutputCol(),
    hashingTF.getOutputCol(),
    stopWordsRemover.getOutputCol(),
    tokenizer.getOutputCol()
])

## Create the pipeline by defining all the stages
pipeline = Pipeline(
    stages=[tokenizer, stopWordsRemover, hashingTF, idf, algoStage, colPruner])

## Test exporting and importing the pipeline. On systems where HDFS & Hadoop are not available, this call stores the pipeline
## to a local file in the current directory. If HDFS & Hadoop are available, it stores the pipeline in the HDFS home
## directory of the current user. Absolute paths can be used as well. The same holds for the model import/export below.
pipeline.write().overwrite().save("examples/build/pipeline")
loaded_pipeline = Pipeline.load("examples/build/pipeline")

## Train the pipeline model
data = load()
model = loaded_pipeline.fit(data)

model.write().overwrite().save("examples/build/model")
loaded_model = PipelineModel.load("examples/build/model")

kmeans_df = sqlContext.read.format("com.databricks.spark.csv") \
  .option("header", "false").option("delimiter"," ").option("inferschema", "true") \
  .load("/FileStore/tables/1x1xr57q1502297004187/kmeans_data.txt")
  
# Prepare data for training (see the explanation of ML Pipelines later)
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

assembler = VectorAssembler(inputCols=["_c0","_c1","_c2"], outputCol="features") 
# The result of this standalone transform is not used; the Pipeline below applies the assembler itself
assembler.transform(kmeans_df)

# Create the KMeans model
kmeans_estimator = KMeans().setFeaturesCol("features").setPredictionCol("prediction")
    
# Pipeline stages definition
pipeline = Pipeline(stages=[assembler, kmeans_estimator])

# Pipeline training
model = pipeline.fit(kmeans_df)

# Get the results: 
results = model.transform(kmeans_df)

# Check results:
display(results) 


# Without using Pipelines: 


# Clustering
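# A minimal sketch (an assumption, since the non-pipeline variant is not shown here):
# apply the assembler and the KMeans estimator step by step instead of through a Pipeline.
assembled_df = assembler.transform(kmeans_df)
kmeans_model = kmeans_estimator.fit(assembled_df)
clustered = kmeans_model.transform(assembled_df)
display(clustered)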
Example #50
0
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(output)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10)

# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)

# Chain indexers and forest in a Pipeline
pipeline = Pipeline(stages=[assembler, labelIndexer, featureIndexer, rf, labelConverter])

# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("predictedLabel", "species", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))
Example #51
0
    .csv(path_train)

train.persist()
print("Numero de casos en el train: %d" % train.count())

ignore_c = ['MachineIdentifier', 'HasDetections']
train_cols = [c for c in train.columns if c not in ignore_c]

# Convert the TRAIN data into a vector column so it can be passed to the estimator
print('Converting data with VectorAssembler')
assembler_features = VectorAssembler(inputCols=train_cols,
                                     outputCol='features')
train_2 = train.limit(10000)
train_data = assembler_features.transform(train_2)
train_data = train_data.select('features', 'HasDetections')\
    .withColumnRenamed('HasDetections', 'label')

xgboost = XGBoostEstimator(featuresCol="features",
                           labelCol="label",
                           predictionCol="prediction")

pipeline = Pipeline().setStages([xgboost])

trainDF, testDF = train_data.randomSplit([0.8, 0.2], seed=24)

model = pipeline.fit(trainDF)
preds = model.transform(testDF)

preds.select(col("label"), col("prediction")).show()
preds.show()
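# A short sketch (not part of the original script): measure the accuracy of the XGBoost
# predictions using the label/prediction columns selected above.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
acc_evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
print("Accuracy = %g" % acc_evaluator.evaluate(preds))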
Example #52
0
    outputCol="features")

# COMMAND ----------

trainingFeatureTest = featureAssembler.transform(trainingFeatureTest)
display(trainingFeatureTest.select("Survived", "indexedLabel", "Embarked", "feature2", "features"))

# COMMAND ----------

# Train a logistic regression model (a GBT classifier is left commented out below as an alternative).
logisticRegression = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, labelCol="indexedLabel", featuresCol="features")
#gbtClassifier = GBTClassifier(labelCol="indexedLabel", featuresCol="features", maxIter=10)

# COMMAND ----------

pipeline = Pipeline(stages=[labelIndexer, featureIndexer1, featureIndexer2, featureAssembler, logisticRegression])

# COMMAND ----------

model = pipeline.fit(training)

# COMMAND ----------

treeModel = model.stages[-1]
# summary only
#display(treeModel)

# COMMAND ----------

training_predictions = model.transform(training)
Example #53
0
# A VectorIndexer with maxCategories=4 would treat features with > 4 distinct values as continuous,
# but it is not used at the moment:
#  feature_indexer = VectorIndexer(inputCol="features", outputCol="indexed")
class_indexer = StringIndexer(inputCol="C4", outputCol="label")

# Read in data for sensitivity analysis
test_data = sql_context.read.load('tests/resources/iris_test_data.csv',
                                  format='com.databricks.spark.csv',
                                  header='false',
                                  inferSchema='true')

# Train a DecisionTree model.
dt = DecisionTreeRegressor(featuresCol="features", labelCol="label")

# Chain indexer and tree in a Pipeline
pipeline = Pipeline(stages=[assembler, class_indexer, dt])

# Train model.  This also runs the indexer.
model = pipeline.fit(data)

# Get our data_info frame, courtesy of PSAML
cols_to_analyze = ['C0', 'C1', 'C2', 'C3']
data_info = psaml.make_data_info(sql_context, test_data, cols_to_analyze, 'C4')

# Make predictions.
predictions = psaml.do_continuous_input_analysis(sc, model, 5, 5, data_info)


# Select example rows to display.
# predictions.show()  # opt param: number of records to show
                              outputCol="embarkedVec")

# Create the vector structured data (label,features(vector))
assembler = VectorAssembler(inputCols=[
    "Pclass", "sexVec", "Age", "SibSp", "Parch", "Fare", "embarkedVec"
],
                            outputCol="features")

from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(labelCol="Survived",
                        featuresCol="features",
                        maxIter=10)

# set up pipeline
pipeline = Pipeline(stages=[
    genderIndexer, embarkIndexer, genderEncoder, embarkEncoder, assembler, lr
])

# split the data
from pyspark.ml.tuning import TrainValidationSplit
train, test = df.randomSplit([0.7, 0.3], seed=41)

# fit the model
model = pipeline.fit(train)

# make prediction
predictions = model.transform(test)

lrmodel = model.stages[-1]
print("Coefficients: " + str(lrmodel.coefficientMatrix))
    'CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
    'IsActiveMember', 'EstimatedSalary'
],
                            outputCol="features")

scaler = StandardScaler(inputCol="features",
                        outputCol="scaledFeatures",
                        withStd=True,
                        withMean=False)

#model preparation

#create Logistic Regression object
LR = LogisticRegression(labelCol='Exited',
                        featuresCol='scaledFeatures',
                        predictionCol='Prediction')

#create pipeline
pipeline = Pipeline(stages=[assembler, scaler, LR])

#train model
model = pipeline.fit(ds_Churn_Modelling)

#make prediction
predictions = model.transform(ds_Churn_Modelling)
predictions.select('Prediction', 'Exited').show()

#save the model in hdfs.
#Note - you need a valid hdfs location with livy permission
model.write().overwrite().save('/LR_chrun_modelling_pyspark')
Example #56
0
def main():
    # Initialize spark and MLOps
    spark = SparkSession.builder.appName("RandomForestClassifier").getOrCreate()
    mlops.init(spark.sparkContext)

    # parse the arguments to component
    options = parse_args()
    print("PM: Configuration:")
    print("PM: Number of trees:                [{}]".format(options.num_trees))
    print("PM: Maximum depth:                  [{}]".format(options.max_depth))
    print("PM: Output model:                   [{}]".format(options.output_model))
    print("PM: Temp shared path:               [{}]".format(options.temp_shared_path))

    # Generate synthetic data using scikit learn
    num_samples = 50
    num_features = 20
    num_classes = 3
    X, y = make_classification(n_samples=num_samples, n_features=num_features, n_informative=2, n_redundant=1,
                               n_classes=num_classes, n_clusters_per_class=1, random_state=42)
    X = X + np.random.uniform(0, 5) * np.random.normal(0, 1, (num_samples, num_features))

    feature_names = [ascii_lowercase[a] for a in range(num_features + 1)]
    feature_names[0] = "label"

    # Create a spark dataframe from the synthetic data generated 
    trainingData = spark.createDataFrame(
        pd.DataFrame(np.concatenate((y.reshape(-1, 1), X), axis=1), columns=feature_names))

    # Histogram of label distribution
    value, counts = np.unique(y, return_counts=True)
    label_distribution = np.asarray((value, counts)).T
    column_names = value.astype(str).tolist()
    print("Label distributions: \n {0}".format(label_distribution))

    # Output label distribution as a BarGraph using MCenter
    bar = BarGraph().name("Label Distribution").cols((label_distribution[:, 0]).astype(str).tolist()).data(
        (label_distribution[:, 1]).tolist())
    mlops.set_stat(bar)

    # Output Health Statistics to MCenter
    # Report features whose distribution should be compared during inference
    mlops.set_data_distribution_stat(trainingData)

    # Fit a random forest classification model
    assembler = VectorAssembler(inputCols=feature_names[1:num_features + 1], outputCol="features")
    classifier = RandomForestClassifier(numTrees=int(options.num_trees), maxDepth=int(options.max_depth))

    pipeline = Pipeline(stages=[assembler, classifier])
    model = pipeline.fit(trainingData)
    predictions = model.transform(trainingData)

    # Select (prediction, true label) and compute training error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)

    # Report accuracy of the chosen model using MCenter
    mlops.set_stat("Accuracy", accuracy, st.TIME_SERIES)

    # Save the spark model 
    SparkPipelineModelHelper() \
        .set_shared_context(spark_context=spark.sparkContext) \
        .set_local_path(local_path=options.output_model) \
        .set_shared_path_prefix(shared_path_prefix=options.temp_shared_path) \
        .save_sparkml_model(model)

    # Stop spark context and MLOps
    spark.sparkContext.stop()
    mlops.done()
        [RowToImageFeature(), ImageResize(256, 256), ImageCenterCrop(224, 224),
         ImageChannelNormalize(123.0, 117.0, 104.0), ImageMatToTensor(), ImageFeatureToTensor()])

    preTrainedNNModel = NNModel(Model.loadModel(options.model_path), transformer) \
        .setFeaturesCol("image") \
        .setPredictionCol("embedding")

    lrModel = Sequential().add(Linear(1000, 2)).add(LogSoftMax())
    classifier = NNClassifier(lrModel, ClassNLLCriterion(), SeqToTensor([1000])) \
        .setLearningRate(options.learning_rate) \
        .setOptimMethod(Adam()) \
        .setBatchSize(options.batch_size) \
        .setMaxEpoch(options.nb_epoch) \
        .setFeaturesCol("embedding") \
        .setCachingSample(False)

    pipeline = Pipeline(stages=[preTrainedNNModel, classifier])

    catdogModel = pipeline.fit(trainingDF)
    predictionDF = catdogModel.transform(validationDF).cache()
    predictionDF.sample(False, 0.1).show()

    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictionDF)
    # expected error should be less than 10%
    print("Test Error = %g " % (1.0 - accuracy))

    print("finished...")
    sc.stop()
Example #58
0
train_file = sys.argv[1]
trainingData = sqlContext.read.format("libsvm").load(train_file)

test_file = sys.argv[8]
testData = sqlContext.read.format("libsvm").load(test_file)

# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(trainingData)

# Train a DecisionTree model.
dt = DecisionTreeRegressor(featuresCol="indexedFeatures")

# Chain indexer and tree in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, dt])


'''
use union to add first col in testData
need to convert row into df
need to remove first row in testData, using original.subtract(firstRowDF)
'''
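# A rough sketch (an interpretation of the note above, not code from the original):
# move the first row of testData into trainingData via union, then drop it from
# testData with subtract.
first_row_df = testData.limit(1)
trainingData = trainingData.union(first_row_df)
testData = testData.subtract(first_row_df)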
tmp_min1 = find_min_label(trainingData.collect())
tmp_min2 = find_min_label(testData.collect())
tmp_min = min(tmp_min1, tmp_min2)
# convert dataframe into list
test = testData.collect()
test = log_sinh_transform(test,tmp_min)
# test_id, test = collect_id(test)
train = trainingData.collect()
# Module Constants
APP_NAME = "reddit-comment-karma-regression"
REDDIT_AUG = "swift://reddit3.sjc01/RC_2015-08"
REDDIT_SEPT = "swift://reddit3.sjc01/RC_2015-09"

if __name__ == "__main__":
    
    # Configure Spark
    sc = SparkContext(appName=APP_NAME)
    sqlContext = SQLContext(sc)

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # prepare Reddit json files as sql Dataframes for pyspark.ml
    aug_comments =  sqlContext.read.json(REDDIT_AUG)
    sep_comments = sqlContext.read.json(REDDIT_SEPT)

    # transform data for log_reg model by dividing karma score by 1000
    #  error: Classification labels should be in {0 to 8114} Found 2576839 invalid labels.
    training = aug_comments.select('id', 'body', (aug_comments.score / 1000.0).cast("double").alias('label'))
    test = sep_comments.select('id', 'body')
    test_actual = sep_comments.select('id', (sep_comments.score / 1000.0).alias('actual'))

    model = pipeline.fit(training)
    prediction = model.transform(test)
    selected = prediction.select("id", "body", "prediction").join(test_actual, prediction.id == test_actual.id)
    selected.write.format('json').save("hdfs://master/usr/hadoop/karma_predictions")
    # Load the data stored in LIBSVM format as a DataFrame.
    data = spark.read.format("libsvm").load("data/mllib/sample_linear_regression_data.txt")

    # Automatically identify categorical features, and index them.
    # We specify maxCategories so features with > 4 distinct values are treated as continuous.
    featureIndexer =\
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a DecisionTree model.
    dt = DecisionTreeRegressor(featuresCol="indexedFeatures")

    # Chain indexer and tree in a Pipeline
    pipeline = Pipeline(stages=[featureIndexer, dt])

    # Train model.  This also runs the indexer.
    model = pipeline.fit(trainingData)

    # Make predictions.
    predictions = model.transform(testData)

    # Select example rows to display.
    predictions.select("prediction", "label", "features").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(
        labelCol="label", predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)