Example #1
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.util import MLUtils


def cross_validation_task_C(X,
                            estimator,
                            sqlContext,
                            class_type,
                            features_col,
                            sc,
                            k_folds=10):
    """k-fold cross-validation for task C.

    If class_type is True, trains an MLlib DecisionTree on each fold;
    otherwise fits the given pyspark.ml estimator (e.g. NaiveBayes).
    Returns the means over the folds of the two error metrics computed by
    the external helper mae_ms().
    """
    kf = KFold(n_splits=k_folds)
    maem = []    # first metric returned by mae_ms(), per fold
    maeni = []   # second metric returned by mae_ms(), per fold

    for train_index, test_index in kf.split(X):
        sparse_data = []
        test_data = []
        cl_cl = []

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]

        train_topic = sqlContext.createDataFrame(X_train)
        test_topic = sqlContext.createDataFrame(X_test)

        # True:  DecisionTree
        # False: NaiveBayes

        if (class_type):
            pred = pd.DataFrame(columns=['class', 'prediction'])
            train_topic = MLUtils.convertVectorColumnsFromML(
                train_topic, features_col)
            test_topic = MLUtils.convertVectorColumnsFromML(
                test_topic, features_col)

            for index, row in train_topic.toPandas().iterrows():
                sparse_data.append(
                    LabeledPoint(float(row['class']), row[features_col]))

            for index, row in test_topic.toPandas().iterrows():
                cl_cl.append(row['class'])
                test_data.append(row[features_col])

            # MLlib DecisionTree: numClasses=5, no categorical features
            model = DecisionTree.trainClassifier(sc.parallelize(sparse_data),
                                                 5, {})

            pred['class'] = cl_cl
            pred['prediction'] = model.predict(
                sc.parallelize(test_data)).collect()
            maem_aux, maeni_aux = mae_ms(pred)

        else:
            pred = estimator.fit(train_topic).transform(test_topic).select(
                'class', 'prediction').toPandas()
            maem_aux, maeni_aux = mae_ms(pred)

        maem.append(maem_aux)
        maeni.append(maeni_aux)

    return (np.mean(maem), np.mean(maeni))
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.util import MLUtils
from pyspark.sql.functions import col


def eval_logreg(new_df, filename):
    """Train an MLlib logistic regression (LBFGS, 50 classes) on an 80/20
    split of new_df, save the model to `filename`, and print the training
    and test error rates. Assumes an active SparkSession `spark`."""
    (train, test) = new_df.randomSplit([0.8, 0.2], 24)
    train = train.withColumnRenamed('prediction', 'label')
    test = test.withColumnRenamed('prediction', 'label')
    df = MLUtils.convertVectorColumnsFromML(train, "features")
    parsedData = df.select(col("label"), col("features")).rdd.map(
        lambda row: LabeledPoint(row.label, row.features))
    model = LogisticRegressionWithLBFGS.train(parsedData, numClasses=50)
    model.save(spark.sparkContext, filename)
    # sameModel = LogisticRegressionModel.load(spark.sparkContext, "LogRegLBFGSModel")
    labelsAndPreds = parsedData.map(lambda p:
                                    (p.label, model.predict(p.features)))
    trainErr = labelsAndPreds.filter(
        lambda lp: lp[0] != lp[1]).count() / float(parsedData.count())
    print("LogReg Small Training Error = " + str(trainErr))
    df = MLUtils.convertVectorColumnsFromML(test, "features")
    parsed_test = df.select(col("label"), col("features")).rdd.map(
        lambda row: LabeledPoint(row.label, row.features))
    # Predict on the held-out test split and compute the test error rate.
    testLabelsAndPreds = parsed_test.map(lambda p:
                                         (p.label, model.predict(p.features)))
    testErr = testLabelsAndPreds.filter(
        lambda lp: lp[0] != lp[1]).count() / float(parsed_test.count())
    print("LogReg Small Test Error = " + str(testErr))
from pyspark.ml.feature import HashingTF as MLHashingTF, IDF as MLIDF
from pyspark.mllib.clustering import KMeans
from pyspark.mllib.util import MLUtils


def analysis(df):
    """Build TF-IDF features with pyspark.ml, convert the vectors to the
    mllib format, and cluster them with mllib KMeans. Assumes an active
    SparkContext `sc` in scope.
    """
    htf = MLHashingTF(inputCol="message", outputCol="tf")
    tf = htf.transform(df)
    idf = MLIDF(inputCol="tf", outputCol="idf")
    tfidf = idf.fit(tf).transform(tf)
    #tfidf.show(truncate=True)

    #sum_ = udf(lambda v: float(v.values.sum()), DoubleType())
    #res_df = tfidf.withColumn("idf_sum", sum_("idf"))
    res_df = MLUtils.convertVectorColumnsFromML(tfidf, 'idf')
    # Cluster the idf vectors directly from the DataFrame's underlying RDD
    # with mllib KMeans (k=5, maxIterations=50).
    ml_dataset = res_df.rdd.map(lambda x: x.idf)
    model = KMeans.train(ml_dataset, 5, 50)

    return res_df, model
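
A hypothetical usage sketch (not part of the original example): it assumes an active SparkSession `spark` and SparkContext `sc`, and feeds analysis() a toy DataFrame whose `message` column holds token arrays, since HashingTF expects an array-of-strings input column. The toy data and variable names are illustrative only.

# Hypothetical usage of analysis(); six tiny tokenized documents so that
# KMeans can choose k=5 centers.
docs = spark.createDataFrame(
    [(0, ["spark", "ml", "tfidf"]),
     (1, ["kmeans", "clustering"]),
     (2, ["spark", "mllib", "vectors"]),
     (3, ["vector", "conversion"]),
     (4, ["hashing", "tf"]),
     (5, ["idf", "weighting"])],
    ["id", "message"])

res_df, kmeans_model = analysis(docs)
res_df.select("idf").show(3, truncate=False)
print("k =", kmeans_model.k)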
Example #4
    def __index_row_matrix_rdd(self, scale_df):
        """Convert a DataFrame holding an 'id' column and a 'scaled_features'
        ML vector column into an mllib IndexedRowMatrix keyed by 'id'.

        :param scale_df: scaled-features DataFrame
        :return: pyspark.mllib.linalg.distributed.IndexedRowMatrix
        """
        try:
            vector_mllib = MLUtils.convertVectorColumnsFromML(
                scale_df, 'scaled_features').drop('features')
            vector_rdd = vector_mllib.select(
                'scaled_features',
                'id').rdd.map(lambda x: IndexedRow(x[1], x[0]))
            self.__logger.info("Build Index Row Matrix RDD")
            return IndexedRowMatrix(vector_rdd)
        except TypeError as te:
            raise OpheliaMLException(
                f"An error occurred while calling __index_row_matrix_rdd() method: {te}"
            )
        input = "data/mllib/sample_libsvm_data.txt"

    # Load input data
    print("Loading LIBSVM file with UDT from " + input + ".")
    df = spark.read.format("libsvm").load(input).cache()
    print("Schema from LIBSVM:")
    df.printSchema()
    print("Loaded training data as a DataFrame with " +
          str(df.count()) + " records.")

    # Show statistical summary of labels.
    labelSummary = df.describe("label")
    labelSummary.show()

    # Convert features column to an RDD of vectors.
    features = MLUtils.convertVectorColumnsFromML(df, "features") \
        .select("features").rdd.map(lambda r: r.features)
    summary = Statistics.colStats(features)
    print("Selected features column with average values:\n" +
          str(summary.mean()))

    # Save the records in a parquet file.
    tempdir = tempfile.NamedTemporaryFile(delete=False).name
    os.unlink(tempdir)
    print("Saving to " + tempdir + " as Parquet file.")
    df.write.parquet(tempdir)

    # Load the records back.
    print("Loading Parquet file with UDT from " + tempdir)
    newDF = spark.read.parquet(tempdir)
    print("Schema from Parquet:")
    newDF.printSchema()
Example #6
    input = "sample_libsvm_data.txt"

    # Load input data
    print("Loading LIBSVM file with UDT from " + input + ".")
    df = spark.read.format("libsvm").load(input).cache()
    print("Schema from LIBSVM:")
    df.printSchema()
    print("Loaded training data as a DataFrame with " +
          str(df.count()) + " records.")

    # Show statistical summary of labels.
    labelSummary = df.describe("label")
    labelSummary.show()

    # Convert features column to an RDD of vectors.
    features = MLUtils.convertVectorColumnsFromML(df, "features") \
        .select("features").rdd.map(lambda r: r.features)
    summary = Statistics.colStats(features)
    print("Selected features column with average values:\n" +
          str(summary.mean()))

    # Save the records in a parquet file.
    tempdir = tempfile.NamedTemporaryFile(delete=False).name
    os.unlink(tempdir)
    print("Saving to " + tempdir + " as Parquet file.")
    df.write.parquet(tempdir)

    # Load the records back.
    print("Loading Parquet file with UDT from " + tempdir)
    newDF = spark.read.parquet(tempdir)
    print("Schema from Parquet:")
    newDF.printSchema()
Example #7
    def transform(self, candidate_set):
        """
        Add a ranking score to each paper in each user's candidate set, based
        on the trained model(s) and the paper's feature vector.

        :param candidate_set: DataFrame with a user id column and a "candidate_set" column of paper ids
        :return: DataFrame with columns user_id | paper_id | ranking_score
        """
        Logger.log("LTR: transform.")
        predictions = None
        # format user_id, paper_id
        candidate_set = candidate_set.select(
            self.userId_col,
            F.explode("candidate_set").alias(self.paperId_col))

        # schema for the final prediction result DataFrame
        predictions_scheme = StructType([
            #  name, dataType, nullable
            StructField("user_id", StringType(), False),
            StructField("paper_id", IntegerType(), False),
            StructField("ranking_score", FloatType(), True)
        ])

        self.paper_profiles_model.setPaperIdCol(self.paperId_col)
        self.paper_profiles_model.setOutputCol(self.features_col)

        # add paper representation to each paper in the candidate set
        # candidate set format - user_id, paper_id
        predictions = self.paper_profiles_model.transform(candidate_set)
        # convert the feature vectors to mllib format before predicting
        predictions = MLUtils.convertVectorColumnsFromML(
            predictions, self.features_col)

        if (self.model_training == "gm"
            ):  #self.Model_Training.SINGLE_MODEL_ALL_USERS):
            Logger.log("Prediction gm ...")

            model = self.models[0]
            # set threshold to NONE to receive raw predictions from the model
            model._threshold = None
            predictions = predictions.rdd.map(lambda p: (
                p.user_id, p.paper_id, float(model.predict(p.features))))
            predictions = predictions.toDF(predictions_scheme)

        elif (self.model_training == "imp"):
            Logger.log("Predicting imp...")
            model = self.models[0]
            # set threshold to NONE to receive raw predictions from the model
            model.threshold = None

            # broadcast weight vectors for all models
            model_br = self.spark.sparkContext.broadcast(model)

            predictions_rdd = predictions.rdd.map(lambda p: (
                p.user_id, p.paper_id,
                float(model_br.value.predict(p.user_id, p.features))))
            predictions = predictions_rdd.toDF(predictions_scheme)

        elif (self.model_training == "ims"):

            Logger.log("Predicting ims ...")

            # broadcast weight vectors for all models
            weights_br = self.spark.sparkContext.broadcast(self.models)

            def predict(id, features):
                weights = weights_br.value
                weight = weights[int(id)]
                prediction = weight.dot(features)
                return float(prediction)

            predict_udf = F.udf(predict, FloatType())

            predictions = predictions.withColumn("ranking_score", predict_udf("user_id", "features")) \
                    .select(self.userId_col, self.paperId_col, "ranking_score")

        elif (self.model_training == "cms"):
            Logger.log("Predicting cms ...")
            # add a cluster id to each user; predictions are made based on it
            users_in_cluster = self.user_clusters.withColumn(
                self.userId_col, F.explode("user_ids")).drop("user_ids")
            predictions = predictions.join(users_in_cluster, self.userId_col)

            for clusterId, model in self.models.items():
                # set threshold to NONE to receive raw predictions from the model
                model._threshold = None

            # broadcast weight vectors for all models
            models_br = self.spark.sparkContext.broadcast(self.models)

            def predict(id, features):
                models = models_br.value
                model = models[id]
                prediction = model.predict(features)
                return float(prediction)

            predict_udf = F.udf(predict, FloatType())

            predictions = predictions.withColumn("ranking_score", predict_udf("cluster_id", "features"))\
                .select(self.userId_col, self.paperId_col, "ranking_score")
        elif (self.model_training == "cmp"):
            # format user_id, paper_id, feature
            # add a cluster id to each user; predictions are made based on it
            predictions = predictions.join(self.user_clusters, self.userId_col)
            model = self.models[0]

            # set threshold to NONE to receive raw predictions from the model
            model.threshold = None
            # broadcast weight vectors for all models
            model_br = self.spark.sparkContext.broadcast(model)

            predictions_rdd = predictions.rdd.map(lambda p: (
                p.user_id, p.paper_id,
                float(model_br.value.predict(p.cluster_id, p.features))))
            predictions = predictions_rdd.toDF(predictions_scheme)
        else:
            # raise an error - unsupported option
            raise ValueError('The option ' + self.model_training +
                             ' is not supported.')
        # user_id | paper_id | ranking_score|
        return predictions
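
A hypothetical usage sketch (not part of the original class): the instance name `ltr_model`, the assumption that `self.userId_col` is "user_id", and the toy candidate set below are all illustrative; only the expected input columns (a user id column plus a "candidate_set" array of paper ids) and the returned user_id | paper_id | ranking_score columns come from the method above.

# Hypothetical call to transform() on a trained ranker instance `ltr_model`.
candidates = spark.createDataFrame(
    [("u1", [11, 12, 13]),
     ("u2", [12, 14])],
    ["user_id", "candidate_set"])

ranked = ltr_model.transform(candidates)
ranked.orderBy("user_id", F.desc("ranking_score")).show()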
Example #8
# COMMAND ----------

vectorized = (spark.read.format("delta")
                        .load(delta_gold_path)
                        .select(glow.array_to_sparse_vector(glow.genotype_states(fx.col("genotypes"))).alias("features"))
                        .cache())

# COMMAND ----------

# MAGIC %md
# MAGIC #### Use `pyspark.mllib` to calculate principal components on the sparse vectors

# COMMAND ----------

matrix = RowMatrix(MLUtils.convertVectorColumnsFromML(vectorized, "features").rdd.map(lambda x: x.features))
pcs = matrix.computeSVD(num_pcs)

# COMMAND ----------

pd.DataFrame(pcs.V.toArray()).to_csv(principal_components_path)

# COMMAND ----------

# MAGIC %md
# MAGIC #### Read sample information in and plot out principal components
# MAGIC 
# MAGIC Here we annotate sample info with principal components by joining both DataFrames on index
# MAGIC 
# MAGIC Note: indexing is performed using the Spark SQL function `monotonically_increasing_id()`
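
A hedged sketch of the join described in the note above (the corresponding cell is not shown here): `sample_info_path` and the CSV format of the sample metadata are placeholder assumptions; `principal_components_path` is the file written a few cells earlier, and `monotonically_increasing_id()` only lines the two DataFrames up if both preserve their row order.

# COMMAND ----------

# Sketch: annotate sample info with principal components by joining on an index.
from pyspark.sql.functions import monotonically_increasing_id

sample_info = spark.read.csv(sample_info_path, header=True)      # assumed metadata source
pcs_df = spark.read.csv(principal_components_path, header=True)  # PCs saved above

annotated = (sample_info.withColumn("index", monotonically_increasing_id())
             .join(pcs_df.withColumn("index", monotonically_increasing_id()), "index")
             .drop("index"))
display(annotated)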