def writeLumbarReadings(time, rdd):
	try:
		# Convert RDDs of the words DStream to DataFrame and run SQL query
		connectionProperties = MySQLConnection.getDBConnectionProps('/home/erik/mysql_credentials.txt')
		sqlContext = SQLContext(rdd.context)
		if not rdd.isEmpty():
			lumbarReadings = sqlContext.jsonRDD(rdd)
			lumbarReadingsIntermediate = lumbarReadings.selectExpr(
				"readingID", "readingTime", "deviceID", "metricTypeID", "uomID",
				"actual.y AS actualYaw", "actual.p AS actualPitch", "actual.r AS actualRoll",
				"setPoints.y AS setPointYaw", "setPoints.p AS setPointPitch", "setPoints.r AS setPointRoll")
			assembler = VectorAssembler(
						inputCols=["actualPitch"], # Must be in same order as what was used to train the model.  Testing using only pitch since model has limited dataset.
						outputCol="features")
			lumbarReadingsIntermediate = assembler.transform(lumbarReadingsIntermediate)

			
			predictions = loadedModel.predict(lumbarReadingsIntermediate.map(lambda x: x.features))
			predictionsDF = lumbarReadingsIntermediate.map(lambda x: x.readingID).zip(predictions).toDF(["readingID","positionID"])
			combinedDF = lumbarReadingsIntermediate.join(predictionsDF, lumbarReadingsIntermediate.readingID == predictionsDF.readingID).drop(predictionsDF.readingID)
			
			combinedDF = combinedDF.drop("features")
			
			combinedDF.show()


			combinedDF.write.jdbc("jdbc:mysql://localhost/biosensor", "SensorReadings", properties=connectionProperties)
	except Exception:
		# Swallow per-batch errors so a single bad micro-batch does not stop the stream.
		pass
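
# The streaming handler above relies on `loadedModel` and a `MySQLConnection` helper that
# are defined elsewhere in the original script. A minimal sketch of what that setup might
# look like, assuming an MLlib RandomForestModel saved on disk and a two-line credentials
# file (user, then password) -- the paths, file format, and helper name are assumptions:
from pyspark.mllib.tree import RandomForestModel

loadedModel = RandomForestModel.load(sc, "/home/erik/lumbarPositionModel")

class MySQLConnection(object):
    @staticmethod
    def getDBConnectionProps(path):
        # Read the MySQL user and password and return JDBC connection properties.
        with open(path) as credentialsFile:
            user, password = [line.strip() for line in credentialsFile.readlines()[:2]]
        return {"user": user, "password": password, "driver": "com.mysql.jdbc.Driver"}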
Example #2
def seg_model_gb(train_data, test_data, loss_type, num_iter, maxDepth):
    removelist_train= set(['stars', 'business_id', 'bus_id', 'b_id','review_id', 'user_id'])
    newlist_train = [v for i, v in enumerate(train_data.columns) if v not in removelist_train]

    # Putting data in vector assembler form
    assembler_train = VectorAssembler(inputCols=newlist_train, outputCol="features")

    transformed_train = assembler_train.transform(train_data.fillna(0))

    # Creating input dataset in the form of labeled point for training the model
    data_train= (transformed_train.select("features", "stars")).map(lambda row: LabeledPoint(row.stars, row.features))

    # Training the model using Gradient Boosted Trees regressor
    model_train = GradientBoostedTrees.trainRegressor(sc.parallelize(data_train.collect(),5), categoricalFeaturesInfo={},
                                                      loss=loss_type,
                                                      numIterations=num_iter, maxDepth=maxDepth)

    # Creating a list of features to be used for predictions
    removelist_final = set(['business_id', 'bus_id', 'b_id','review_id', 'user_id'])
    newlist_final = [v for i, v in enumerate(test_data.columns) if v not in removelist_final]

    # Putting data in vector assembler form
    assembler_final = VectorAssembler(inputCols=newlist_final,outputCol="features")

    transformed_final= assembler_final.transform(test_data.fillna(0))

    # Creating input dataset to be used for predictions
    data_final = transformed_final.select("features", "review_id")

    # Predicting ratings using the developed model
    predictions = model_train.predict(data_final.map(lambda x: x.features))
    labelsAndPredictions = data_final.map(lambda data_final: data_final.review_id).zip(predictions)
    return labelsAndPredictions
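
# A hedged usage sketch for seg_model_gb, assuming the SparkContext `sc` and the MLlib
# imports used above are in scope and that train_df/test_df already carry the numeric
# feature columns plus the id columns removed above. The loss and depth settings are
# illustrative only:
preds = seg_model_gb(train_df, test_df, loss_type="leastSquaresError",
                     num_iter=20, maxDepth=4)
preds_df = preds.toDF(["review_id", "predicted_stars"])
preds_df.show(5)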
Example #3
def seg_model_lr(train_data, test_data, regType, num_iter):
    removelist_train= set(['stars', 'business_id', 'bus_id', 'b_id','review_id', 'user_id'])
    newlist_train = [v for i, v in enumerate(train_data.columns) if v not in removelist_train]

    # Putting data in vector assembler form
    assembler_train = VectorAssembler(inputCols=newlist_train, outputCol="features")

    transformed_train = assembler_train.transform(train_data.fillna(0))

    # Creating input dataset in the form of labeled point for training the model
    data_train= (transformed_train.select("features", "stars")).map(lambda row: LabeledPoint(row.stars, row.features))

    # Training the model using Logistic regression Classifier
    model_train = LogisticRegressionWithLBFGS.train(sc.parallelize(data_train.collect(),5),
                                                    regType =regType, iterations=num_iter, numClasses=5)

    # Creating a list of features to be used for predictions
    removelist_final = set(['business_id', 'bus_id', 'b_id','review_id', 'user_id'])
    newlist_final = [v for i, v in enumerate(test_data.columns) if v not in removelist_final]

    # Putting data in vector assembler form
    assembler_final = VectorAssembler(inputCols=newlist_final,outputCol="features")

    transformed_final= assembler_final.transform(test_data.fillna(0))

    # Creating input dataset to be used for predictions
    data_final = transformed_final.select("features", "review_id")

    # Predicting ratings using the developed model
    predictions = model_train.predict(data_final.map(lambda x: x.features))
    labelsAndPredictions = data_final.map(lambda data_final: data_final.review_id).zip(predictions)
    return labelsAndPredictions
Example #4
def text_features(p_df):
    """
    Extracts features derived from the quora question texts.
    :param p_df: A DataFrame.
    :return: A DataFrame.  
    """
    diff_len = udf(lambda arr: arr[0] - arr[1], IntegerType())
    common_words = udf(lambda arr: len(set(arr[0]).intersection(set(arr[1]))), IntegerType())
    unique_chars = udf(lambda s: len(''.join(set(s.replace(' ', '')))), IntegerType())


    p_df = p_df.withColumn("len_q1", length("question1")).withColumn("len_q2", length("question2"))
    p_df = p_df.withColumn("diff_len", diff_len(array("len_q1", "len_q2")))
    p_df = p_df.withColumn("words_q1", size("question1_words")).withColumn("words_q2", size("question2_words"))
    p_df = p_df.withColumn("common_words", common_words(array("question1_words", "question2_words")))
    p_df = p_df.withColumn(
        "unique_chars_q1", unique_chars("question1")
    ).withColumn("unique_chars_q2", unique_chars("question2"))

    assembler = VectorAssembler(
        inputCols=["len_q1", "len_q2", "diff_len", "words_q1", "words_q2", "common_words", "unique_chars_q1", "unique_chars_q2"],
        outputCol="text_features"
    )
    p_df = assembler.transform(p_df)    
    return p_df
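
# A small, self-contained usage sketch for text_features. It assumes the imports the
# function relies on (udf, length, array, size, IntegerType); the word columns are
# normally produced by an upstream tokenizer and are faked here with split():
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, length, array, size, split
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.getOrCreate()
questions = spark.createDataFrame(
    [("how do I learn spark", "how can I learn spark quickly")],
    ["question1", "question2"])
questions = questions.withColumn("question1_words", split("question1", " ")) \
                     .withColumn("question2_words", split("question2", " "))
text_features(questions).select("text_features").show(truncate=False)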
    def _convertPythonXToJavaObject(self, X):
        """
        Converts the input python object X to a java-side object (either MatrixBlock or Java DataFrame)

        Parameters
        ----------
        X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix or PySpark DataFrame
        """
        if isinstance(X, SUPPORTED_TYPES) and self.transferUsingDF:
            pdfX = convertToPandasDF(X)
            df = assemble(
                self.sparkSession,
                pdfX,
                pdfX.columns,
                self.features_col).select(
                self.features_col)
            return df._jdf
        elif isinstance(X, SUPPORTED_TYPES):
            return convertToMatrixBlock(self.sc, X)
        elif hasattr(X, '_jdf') and self.features_col in X.columns:
            # No need to assemble as input DF is likely coming via MLPipeline
            return X._jdf
        elif hasattr(X, '_jdf'):
            assembler = VectorAssembler(
                inputCols=X.columns, outputCol=self.features_col)
            df = assembler.transform(X)
            return df._jdf
        else:
            raise Exception('Unsupported input type')
Example #6
 def predict(self, X):
     if isinstance(X, SUPPORTED_TYPES):
         if self.transferUsingDF:
             pdfX = convertToPandasDF(X)
             df = assemble(self.sqlCtx, pdfX, pdfX.columns, 'features').select('features')
             retjDF = self.model.transform(df._jdf)
             retDF = DataFrame(retjDF, self.sqlCtx)
             retPDF = retDF.sort('ID').select('prediction').toPandas()
             if isinstance(X, np.ndarray):
                 return retPDF.as_matrix().flatten()
             else:
                 return retPDF
         else:
             retNumPy = convertToNumpyArr(self.sc, self.model.transform(convertToMatrixBlock(self.sc, X)))
             if isinstance(X, np.ndarray):
                 return retNumPy
             else:
                 return retNumPy # TODO: Convert to Pandas
     elif hasattr(X, '_jdf'):
         if 'features' in X.columns:
             # No need to assemble as input DF is likely coming via MLPipeline
             df = X
         else:
             assembler = VectorAssembler(inputCols=X.columns, outputCol='features')
             df = assembler.transform(X)
         retjDF = self.model.transform(df._jdf)
         retDF = DataFrame(retjDF, self.sqlCtx)
         # Return DF
         return retDF.sort('ID')
     else:
         raise Exception('Unsupported input type')
Example #7
    def scaleVecCol(self, columns, nameOutputCol):
        """
        This function groups the specified columns into a single vector column and then scales
        that column. The scaling procedure is Spark's default MinMaxScaler (see the example
        below).

        +---------+----------+
        |Price    |AreaLiving|
        +---------+----------+
        |1261706.9|16        |
        |1263607.9|16        |
        |1109960.0|19        |
        |978277.0 |19        |
        |885000.0 |19        |
        +---------+----------+

                    |
                    |
                    |
                    V
        +----------------------------------------+
        |['Price', 'AreaLiving']                 |
        +----------------------------------------+
        |[0.1673858972637624,0.5]                |
        |[0.08966137157852398,0.3611111111111111]|
        |[0.11587093205757598,0.3888888888888889]|
        |[0.1139820728616421,0.3888888888888889] |
        |[0.12260126542983639,0.4722222222222222]|
        +----------------------------------------+
        only showing top 5 rows

        """

        # Check that the columns argument is a string or a list:
        self.__assertTypeStrOrList(columns, "columns")

        # Check that the columns to be processed exist in the dataframe:
        self.__assertColsInDF(columnsProvided=columns, columnsDF=self.__df.columns)

        # Check that the nameOutputCol argument is a string:
        self.__assertTypeStr(nameOutputCol, "nameOutputCol")

        # VectorAssembler model to group the columns into one vector column:
        vecAssembler = VectorAssembler(inputCols=columns, outputCol="features_assembler")
        # Model for scaling feature column:
        mmScaler = MinMaxScaler(inputCol="features_assembler", outputCol=nameOutputCol)
        # Dataframe with feature_assembler column
        tempDF = vecAssembler.transform(self.__df)
        # Fitting scaler model with transformed dataframe
        model = mmScaler.fit(tempDF)

        exprs = list(filter(lambda x: x not in columns, self.__df.columns))

        exprs.extend([nameOutputCol])

        self.__df = model.transform(tempDF).select(*exprs)
        self.__addTransformation()  # checkpoint in case

        return self
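
# A standalone sketch of the assemble-then-scale flow that scaleVecCol wraps, using the
# Price/AreaLiving columns from the docstring example (the values are illustrative; the
# scaled numbers shown in the docstring come from a larger dataset):
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, MinMaxScaler

spark = SparkSession.builder.getOrCreate()
houses = spark.createDataFrame(
    [(1261706.9, 16), (1263607.9, 16), (1109960.0, 19), (978277.0, 19), (885000.0, 19)],
    ["Price", "AreaLiving"])

assembled = VectorAssembler(inputCols=["Price", "AreaLiving"],
                            outputCol="features_assembler").transform(houses)
scaled = MinMaxScaler(inputCol="features_assembler",
                      outputCol="features_scaled").fit(assembled).transform(assembled)
scaled.select("features_scaled").show(truncate=False)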
Example #8
def convert_to_flat_by_sparkpy(df):
    subkeys = df.select("subkey").dropDuplicates().collect()
    subkeys = [s[0] for s in subkeys]
    assembler = VectorAssembler().setInputCols(subkeys).setOutputCol("features")
    spark_df = assembler.transform(df.groupBy("key", "parameter").pivot("subkey").agg(first(col("reference"))))
    spark_df = spark_df.withColumnRenamed("parameter", "label")
    spark_df = spark_df.select("label", "features")
    return spark_df
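
# convert_to_flat_by_sparkpy expects a "long" DataFrame with key/parameter/subkey/reference
# columns: it pivots subkey into one column per distinct value and assembles those columns
# into a vector. A small illustrative input (the data values are made up, and the first/col
# imports the function uses are assumed to be in scope):
from pyspark.sql import SparkSession
from pyspark.sql.functions import first, col

spark = SparkSession.builder.getOrCreate()
long_df = spark.createDataFrame(
    [(1, 0.0, "s1", 1.5), (1, 0.0, "s2", 2.5),
     (2, 1.0, "s1", 3.5), (2, 1.0, "s2", 4.5)],
    ["key", "parameter", "subkey", "reference"])
convert_to_flat_by_sparkpy(long_df).show(truncate=False)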
def sparking_your_interest():
	df = sqlContext.read.json('speeches_dataset.json')
	df_fillna=df.fillna("")
	print(df_fillna.count())
	print(df_fillna.printSchema())

	df_utf=call_utf_encoder(df)
	df_cleaned=call_para_cleanup(df_utf)
	print(df_cleaned)
	df_with_bigrams = call_ngrams(df_cleaned, 2)
	df_with_trigrams = call_ngrams(df_with_bigrams, 3)
	df_with_4grams = call_ngrams(df_with_trigrams, 4)
	df_with_5grams = call_ngrams(df_with_4grams, 5)
	df_with_6grams = call_ngrams(df_with_5grams, 6)
	df_with_vocab_score = call_speech_vocab(df_with_6grams)

	df_with_2grams_idf_vectors = tf_feature_vectorizer(df_with_vocab_score,100,'2grams')
	df_with_3grams_idf_vectors = tf_feature_vectorizer(df_with_2grams_idf_vectors,100,'3grams')
	df_with_4grams_idf_vectors = tf_feature_vectorizer(df_with_3grams_idf_vectors,100,'4grams')
	assembler = VectorAssembler(
	    inputCols=["2gramsfeatures", "2gramsfeatures", "2gramsfeatures", "vocab_score"],
	    outputCol="features")
	assembler_output = assembler.transform(df_with_4grams_idf_vectors)
	output = assembler_output.selectExpr('speaker','speech_id','para_cleaned_text','features')
	print(output.show())
	print(output.count())

	output_tordd = output.rdd
	train_rdd,test_rdd = output_tordd.randomSplit([0.8, 0.2], 123)
	train_df = train_rdd.toDF()
	test_df = test_rdd.toDF()
	print(train_df)
	print(test_df)

	print('Train DF - Count: ')
	print(train_df.count())
	print('Test DF - Count: ')
	print(test_df.count())

	print("Initializing RF Model")
	labelIndexer = StringIndexer(inputCol="speaker", outputCol="indexedLabel").fit(train_df)       
	rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features",numTrees=1000, featureSubsetStrategy="auto", impurity='gini', maxDepth=4, maxBins=32)
	pipeline = Pipeline(stages=[labelIndexer,rf])
	model = pipeline.fit(train_df)
	print("Completed RF Model")

	predictions = model.transform(test_df)
	evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
	accuracy = evaluator.evaluate(predictions)
	print("Test Error = %g" % (1.0 - accuracy))
	rfModel = model.stages[1]
	print(rfModel)  # summary only
	print("Predictions: ")
	print(predictions.show())
    def _prepare_data_spark(self, data):
        """ Prepare data for spark format, output data will have the feature format and other useful information """

        keys = list(data.keys().difference({self.CHANGE_AMOUNT, self.CHANGE_DIRECTION, self.TARGET_PRICE,
                                            self.TODAY_PRICE}))

        df = self._spark.createDataFrame(data)
        ass = VectorAssembler(inputCols=keys, outputCol="features")
        output = ass.transform(df)
        # output.select('features', 'ChangeDirection', 'ChangeAmount').write.save('test.parquet')
        return output
def predictPopularity(features):
    print(features)
    features = tuple(features)
    feature_label = []    
    for i in range(0, len(features)):
        feature_label.append('feature' +str(i))
    data_frame = spark.createDataFrame([features], feature_label)
    assembler = VectorAssembler(inputCols= feature_label, outputCol = 'features')
    data_frame = assembler.transform(data_frame)
    data_frame = data_frame.select('features')
    result = rfc_model.transform(data_frame)
    return result.select('prediction').head(1)[0][0]
Example #12
def test_train_data(overall_segment):
    removelist_train= set(['stars', 'business_id', 'bus_id', 'b_id','review_id', 'user_id'])
    newlist_train = [v for i, v in enumerate(overall_segment.columns) if v not in removelist_train]

    # Putting data in vector assembler form
    assembler_train = VectorAssembler(inputCols=newlist_train, outputCol="features")

    transformed_train = assembler_train.transform(overall_segment.fillna(0))

    # Creating input dataset in the form of labeled point for training the model
    data_train= (transformed_train.select("features", "stars")).map(lambda row: LabeledPoint(row.stars, row.features))

    (trainingData, testData) = sc.parallelize(data_train.collect(),5).randomSplit([0.7, 0.3])
    return (trainingData, testData)
    def commit(self):
        self.update_domain_role_hints()
        if self.in_df is not None:
            attributes = [att for att in self.used_attrs._list]
            class_var = [var for var in self.class_attrs._list]
            metas = [meta for meta in self.meta_attrs._list]
            VA = VectorAssembler(inputCols = attributes, outputCol = 'features')
            self.out_df = VA.transform(self.in_df)
            if len(class_var):
                self.out_df = self.out_df.withColumn('label', self.out_df[class_var[0]].cast('double'))

            self.send("DataFrame", self.out_df)
        else:
            self.send("DataFrame", None)
Example #14
def tf_idf_features_quora(p_df):
    """
    Extracts TF-IDF features from quora dataset.
    :param p_df: A DataFrame.
    :return: A DataFrame.    
    """     
    tf_df = extract_tf_features(p_df, "question1_meaningful_words", "tf1")
    tf_df = extract_tf_features(tf_df, "question2_meaningful_words", "tf2")
    tf_idf_df = extract_idf_features(tf_df, "tf1", "tf-idf1")
    tf_idf_df = extract_idf_features(tf_idf_df, "tf2", "tf-idf2")
    assembler = VectorAssembler(
        inputCols=["tf-idf1", "tf-idf2"],
        outputCol="tf_idf_features"
    )
    return assembler.transform(tf_idf_df)
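
# extract_tf_features and extract_idf_features are project helpers that are not shown in
# this snippet. A plausible sketch using HashingTF and IDF -- the signatures, the
# numFeatures value, and the helper bodies are assumptions, not the original implementation:
from pyspark.ml.feature import HashingTF, IDF

def extract_tf_features(p_df, words_col, tf_col, num_features=1 << 16):
    # Hash each token list into a fixed-length term-frequency vector.
    hashing_tf = HashingTF(inputCol=words_col, outputCol=tf_col, numFeatures=num_features)
    return hashing_tf.transform(p_df)

def extract_idf_features(p_df, tf_col, tf_idf_col):
    # Rescale term frequencies by inverse document frequency.
    idf = IDF(inputCol=tf_col, outputCol=tf_idf_col)
    return idf.fit(p_df).transform(p_df)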
Example #15
def convert_to_flat_by_sparkpy(df):
    subkeys = df.select("subkey").dropDuplicates().collect()
    subkeys = [s[0] for s in subkeys]

    n = len(df.select("reference").first()[0])
    # df = df.groupBy("key").agg(array(*[avg(col("reference")[i]) for i in range(n)]).alias("averages"))
    df = df.groupBy("key").agg(array(*[collect_list(col("reference")[i]) for i in range(n)]).alias("averages"))
    df.show()
    r = df.collect()

    # changedTypedf = joindf.withColumn("label", joindf["show"].cast(DoubleType()))
    assembler = VectorAssembler().setInputCols(subkeys).setOutputCol("features")
    spark_df = assembler.transform(df.groupBy("key", "parameter").pivot("subkey").agg(first(col("reference"))))
    spark_df = spark_df.withColumnRenamed("parameter", "label")
    spark_df = spark_df.select("label", "features")
    return spark_df
Example #16
    def predict(self, X):
        """
        Invokes the transform method on the Estimator object on the JVM if X and y are one of the supported data types

        Parameters
        ----------
        X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix or PySpark DataFrame
        """
        try:
            if self.estimator is not None and self.model is not None:
                self.estimator.copyProperties(self.model)
        except AttributeError:
            pass
        if isinstance(X, SUPPORTED_TYPES):
            if self.transferUsingDF:
                pdfX = convertToPandasDF(X)
                df = assemble(self.sparkSession, pdfX, pdfX.columns, self.features_col).select(self.features_col)
                retjDF = self.model.transform(df._jdf)
                retDF = DataFrame(retjDF, self.sparkSession)
                retPDF = retDF.sort('__INDEX').select('prediction').toPandas()
                if isinstance(X, np.ndarray):
                    return self.decode(retPDF.as_matrix().flatten())
                else:
                    return self.decode(retPDF)
            else:
                try:
                    retNumPy = self.decode(convertToNumPyArr(self.sc, self.model.transform(convertToMatrixBlock(self.sc, X))))
                except Py4JError:
                    traceback.print_exc()
                if isinstance(X, np.ndarray):
                    return retNumPy
                else:
                    return retNumPy # TODO: Convert to Pandas
        elif hasattr(X, '_jdf'):
            if self.features_col in X.columns:
                # No need to assemble as input DF is likely coming via MLPipeline
                df = X
            else:
                assembler = VectorAssembler(inputCols=X.columns, outputCol=self.features_col)
                df = assembler.transform(X)
            retjDF = self.model.transform(df._jdf)
            retDF = DataFrame(retjDF, self.sparkSession)
            # Return DF
            return retDF.sort('__INDEX')
        else:
            raise Exception('Unsupported input type')
Example #17
    def transform(self, df, featureCols, targetCol):
        """Keep the K most important features of the Spark DataFrame

        Parameters
        ----------
        df : Spark DataFrame
        featureCols: array, names of feature columns
            to consider in the feature selection algorithm
        targetCol: str, name of target column, i.e., the column against which
            each feature is compared.

        Returns
        -------
        transformed_df : New Spark DataFrame with only the most important
            feature columns.

        """

        # build features assemble
        assembler = VectorAssembler(
            inputCols = featureCols,
            outputCol = 'features')
        assembled_df = assembler.transform(df)

        # rename target column
        assembled_df = assembled_df.withColumnRenamed(targetCol,'target')

        # extract features and target
        feats = assembled_df.select('features').rdd
        feats = feats.map(lambda x: x['features'])
        target = assembled_df.select('target').rdd
        target = target.map(lambda x: x['target'])

        # compute per-column metric
        scores = []
        for i,feat in enumerate(featureCols):
            vector = feats.map(lambda x: x[i])
            scores.append(self.sfunc_(vector,target))
        self.scores_ = scores
        
        # sort scores
        idx = sorted(range(len(self.scores_)),reverse=True,key=self.scores_.__getitem__)
        
        # return dataframe with k-best columns 
        return df.select(*[featureCols[idd] for idd in idx[:self.k_]])
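
# The class around transform() is not shown here; judging by self.sfunc_ and self.k_ it is
# constructed with a per-feature scoring function and the number of columns to keep. A
# hedged usage sketch with a simple correlation-based score -- the class name and
# constructor arguments are assumptions:
from pyspark.mllib.stat import Statistics

def abs_corr(feature_rdd, target_rdd):
    # Score a single feature by its absolute Pearson correlation with the target.
    return abs(Statistics.corr(feature_rdd, target_rdd, method="pearson"))

selector = KBestFeatureSelector(k=5, sfunc=abs_corr)  # hypothetical class name
reduced_df = selector.transform(df, featureCols=numeric_cols, targetCol="label")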
Example #18
def convertToLabeledDF(sparkSession, X, y=None):
    from pyspark.ml.feature import VectorAssembler
    if y is not None:
        pd1 = pd.DataFrame(X)
        pd2 = pd.DataFrame(y, columns=['label'])
        pdf = pd.concat([pd1, pd2], axis=1)
        inputColumns = ['C' + str(i) for i in pd1.columns]
        outputColumns = inputColumns + ['label']
    else:
        pdf = pd.DataFrame(X)
        inputColumns = ['C' + str(i) for i in pdf.columns]
        outputColumns = inputColumns
    assembler = VectorAssembler(inputCols=inputColumns, outputCol='features')
    out = assembler.transform(sparkSession.createDataFrame(pdf, outputColumns))
    if y is not None:
        return out.select('features', 'label')
    else:
        return out.select('features')
Example #19
def merge_features(ddfs, join_column, merge_column, output_column='features', drop_merged_columns=True):
    """
    join (inner) several DataFrames on the same id and merge their columns (merge_column) into one column using pyspark.ml.feature.VectorAssembler

    Example:
        ddf_merge = merge_features(ddfs=[ddf_pivot1,ddf_pivot2], join_column='customer_id', merge_column='features')
    :param ddfs:
    :param join_column: id column to join by (each ddf must have this column)
    :param merge_column: column to merge (each ddf must have this column)
    :param output_column:
    :param drop_merged_columns:
    :return:
    """
    from pyspark.ml.feature import VectorAssembler

    ddf_res = ddfs.pop(0)
    merge_column_renamed = merge_column + str(0)
    merge_columns = [merge_column_renamed]
    ddf_res = ddf_res.withColumnRenamed(merge_column, merge_column_renamed)

    for i,ddf in enumerate(ddfs):
        merge_column_renamed = merge_column + str(i+1)
        merge_columns.append(merge_column_renamed)
        ddf_r = ddf.withColumnRenamed(merge_column, merge_column_renamed)
        ddf_res = ddf_res.join(ddf_r, on=join_column, how='inner')

    assembler = VectorAssembler(inputCols=merge_columns, outputCol=output_column)
    res = assembler.transform(ddf_res)

    if drop_merged_columns:
        res = drop_columns(res, columns=merge_columns)

    return res
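
# A small end-to-end sketch for merge_features. Each input DataFrame carries a vector
# column named "features" plus the join id; drop_merged_columns=False is used here
# because the drop_columns() helper it would call is not shown in this snippet:
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.getOrCreate()
ddf_pivot1 = spark.createDataFrame(
    [(1, Vectors.dense([1.0, 2.0])), (2, Vectors.dense([3.0, 4.0]))],
    ["customer_id", "features"])
ddf_pivot2 = spark.createDataFrame(
    [(1, Vectors.dense([5.0])), (2, Vectors.dense([6.0]))],
    ["customer_id", "features"])

ddf_merge = merge_features(ddfs=[ddf_pivot1, ddf_pivot2],
                           join_column="customer_id",
                           merge_column="features",
                           drop_merged_columns=False)
ddf_merge.select("customer_id", "features").show(truncate=False)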


# def pivot_aggregate(ddf, grpby_columns, pivot_column, aggs, pivot_filter_values=None, pivot_filter_support=None):
#     if pivot_filter_support and not pivot_filter_values:
#         frequent = ddf.freqItems([pivot_column], support=pivot_filter_support).first().asDict()[pivot_column+'_freqItems']
#         pivot_filter_values = map(str,frequent)
#
#     ddf_gr = ddf.groupBy(*grpby_columns)
#     ddf_pivot = ddf_gr.pivot(pivot_column, pivot_filter_values)
#     ddf_agg = ddf_pivot.agg(*aggs)
#     return ddf_agg
Example #20
def preprocess(data):
  data = data.select('Year','Month','DayofMonth','DayOfWeek','DepTime','CRSDepTime','ArrTime','CRSArrTime','UniqueCarrier'\
                            ,'FlightNum','TailNum','ActualElapsedTime','CRSElapsedTime','AirTime','ArrDelay','DepDelay', 'Origin'\
                            ,'Dest','Distance','TaxiIn','TaxiOut','Cancelled')
  data = data.na.fill('999999')
  for t in data.dtypes:
    if t[1]=='string' and t[0] not in ['Origin','Dest','TailNum','UniqueCarrier','FlightNum']:
      data = data.withColumn(t[0], data[t[0]].cast('integer'))
  data = data.na.fill(999999)
  data = data.withColumnRenamed('Cancelled','label')
  data = data.withColumn('label',data.label.cast('double'))
  assembler = VectorAssembler(
	    inputCols=['Year','Month','DayofMonth','DayOfWeek'
		,'DepTime','CRSDepTime','ArrTime','CRSArrTime',
		'ActualElapsedTime','CRSElapsedTime','AirTime',
		'ArrDelay','DepDelay','Distance','TaxiIn','TaxiOut'],
	    outputCol='features')
  data = assembler.transform(data)
  data = data.select('features','label')
  return data
Example #21
def to_numeric_df(kdf: 'ks.DataFrame') -> Tuple[pyspark.sql.DataFrame, List[str]]:
    """
    Takes a dataframe and turns it into a dataframe containing a single numerical
    vector of doubles. This dataframe has a single field called '_correlation_output'.

    TODO: index is not preserved currently
    :param kdf: the koalas dataframe.
    :return: a pair of dataframe, list of strings (the name of the columns
             that were converted to numerical types)

    >>> to_numeric_df(ks.DataFrame({'A': [0, 1], 'B': [1, 0], 'C': ['x', 'y']}))
    (DataFrame[_correlation_output: vector], ['A', 'B'])
    """
    # TODO, it should be more robust.
    accepted_types = {np.dtype(dt) for dt in [np.int8, np.int16, np.int32, np.int64,
                                              np.float32, np.float64, np.bool_]}
    numeric_fields = [fname for fname in kdf._metadata.data_columns
                      if kdf[fname].dtype in accepted_types]
    numeric_df = kdf._sdf.select(*numeric_fields)
    va = VectorAssembler(inputCols=numeric_fields, outputCol=CORRELATION_OUTPUT_COLUMN)
    v = va.transform(numeric_df).select(CORRELATION_OUTPUT_COLUMN)
    return v, numeric_fields
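
# A sketch of how the output of to_numeric_df is typically consumed: feed the assembled
# vector column to pyspark.ml.stat.Correlation. `kdf` is assumed to be a koalas DataFrame
# and CORRELATION_OUTPUT_COLUMN the constant referenced by the function above:
from pyspark.ml.stat import Correlation

numeric_sdf, numeric_names = to_numeric_df(kdf)
corr_matrix = Correlation.corr(numeric_sdf, CORRELATION_OUTPUT_COLUMN).head()[0].toArray()
print(numeric_names)
print(corr_matrix)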
Example #22
def cluster():
    ld = load(open(DATAP + '\\temp\\olangdict.json', 'r', encoding='UTF-8'))

    spark = SparkSession.builder\
                        .master("local")\
                        .appName("Word Count")\
                        .config("spark.some.config.option", "some-value")\
                        .getOrCreate()

    df = spark.createDataFrame([["0"],
                                ["1"],
                                ["2"],
                                ["3"],
                                ["4"]],
                               ["id"])
    df.show()

    vecAssembler = VectorAssembler(inputCols=["feat1", "feat2"], outputCol="features")
    new_df = vecAssembler.transform(df)

    kmeans = KMeans(k=2, seed=1)  # 2 clusters here
    model = kmeans.fit(new_df.select('features'))
    transformed = model.transform(new_df)
    print(transformed.show())
modelVars.remove('AgeCb')
modelVars.remove('CurrentLimit')
modelVars.remove('InitialLimit')
modelVars.remove('LimitReached')
modelVars.remove('InitialLimitEqualsCurrent')
modelVars.remove('LimitReachedInLast3Months')
modelVars.remove('LimitChangedInLast3Months')
modelVars.remove('ExcessPaymentAmmountCurrent')
modelVars.remove('NumberOfExcessPayments3M')
modelVars.remove('ExcessPaymentAmmount3M')
modelVars.remove('StartDate')
#############################################################
#### Assemble all feature columns into one vector column ####
####   called "features" and transform to RDD of lists   ####
#############################################################
allToOne = VectorAssembler(inputCols = modelVars, outputCol = "features")
assembledRDD = allToOne.transform(MyTrain.select(modelVars)).select("features").rdd.map(lambda line: line[0]).persist()

#### Calculate correlation matrix and set it to write mode ####
from pyspark.mllib.stat import Statistics
from numpy import array, fill_diagonal

corM = Statistics.corr(assembledRDD)
corM.setflags(write = True)
fill_diagonal(corM, 0.0)

#######################################################
####     Iterate by rows of correlation matrix     ####
####   If there is a value greater than threshold  ####
#### in the current row set the row and the column ####
####            with same index to 0.              ####
#######################################################
nVar = corM.shape[0]
corToMod = array(corM.tolist()) # This is to ensure that a copy is made
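
# The snippet breaks off before the pruning loop that the comment block above describes.
# One reading of that description, as a hedged sketch: whenever a correlation above some
# threshold is found in row i, zero out row j and column j and mark variable j for removal
# (corrThreshold and reducedVars are assumed names, not part of the original):
corrThreshold = 0.9
dropped = set()
for i in range(nVar):
    for j in range(nVar):
        if abs(corToMod[i, j]) > corrThreshold:
            # Zero the offending row and the column with the same index so the
            # correlated partner is not considered again on later iterations.
            corToMod[j, :] = 0.0
            corToMod[:, j] = 0.0
            dropped.add(modelVars[j])

reducedVars = [v for v in modelVars if v not in dropped]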
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

dataset = spark.createDataFrame(
    [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],
    ["id", "hour", "mobile", "userFeatures", "clicked"])

assembler = VectorAssembler(inputCols=["hour", "mobile", "userFeatures"],
                            outputCol="features")

output = assembler.transform(dataset)
print(
    "Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'"
)
output.select("features", "clicked").show(truncate=False)

spark.stop()
Example #25
spark = SparkSession.builder.appName('Popularity').getOrCreate()
data = spark.read.csv('OnlineNewsPopularity.csv',
                      inferSchema=True,
                      header=True)
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=[
    'timedelta', 'n_tokens_title', 'n_tokens_content', 'n_unique_tokens',
    'n_non_stop_words', 'n_non_stop_unique_tokens', 'num_hrefs',
    'num_self_hrefs', 'num_imgs', 'num_videos', 'average_token_length',
    'num_keywords', 'data_channel_is_lifestyle',
    'data_channel_is_entertainment', 'data_channel_is_bus',
    'data_channel_is_socmed', 'data_channel_is_tech', 'data_channel_is_world',
    'self_reference_max_shares', 'self_reference_avg_sharess',
    'weekday_is_monday', 'weekday_is_tuesday', 'weekday_is_wednesday',
    'weekday_is_thursday', 'weekday_is_friday', 'weekday_is_saturday',
    'weekday_is_sunday', 'is_weekend', 'global_subjectivity',
    'global_sentiment_polarity', 'title_subjectivity',
    'title_sentiment_polarity', 'abs_title_subjectivity',
    'abs_title_sentiment_polarity'
],
                            outputCol='features')
new_data = assembler.transform(data)

final_data = new_data.select('features', 'shares')
from pyspark.ml.feature import QuantileDiscretizer

discretizer = QuantileDiscretizer(numBuckets=2,
                                  inputCol="shares",
                                  outputCol="result")
Example #26
def main(base_path):

    APP_NAME = "make_predictions_streaming.py"

    # Process data every 10 seconds
    PERIOD = 10
    BROKERS = 'localhost:9092'
    PREDICTION_TOPIC = 'flight_delay_classification_request'

    try:
        sc and ssc
    except NameError as e:
        import findspark

        # Add the streaming package and initialize
        findspark.add_packages(
            ["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0"])
        findspark.init()

        import pyspark
        import pyspark.sql
        import pyspark.streaming

        from pyspark import SparkContext, SparkConf
        from pyspark.sql import SparkSession, Row
        from pyspark.streaming import StreamingContext
        from pyspark.streaming.kafka import KafkaUtils

        import pymongo_spark
        pymongo_spark.activate()

        conf = SparkConf().set("spark.default.parallelism", 1)
        sc = SparkContext(
            appName="Agile Data Science: PySpark Streaming 'Hello, World!'",
            conf=conf)
        ssc = StreamingContext(sc, PERIOD)
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # Load all models to be used in making predictions
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string field vectorizer pipelines into a dict
    from pyspark.ml.feature import StringIndexerModel

    string_indexer_models = {}
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model = StringIndexerModel.load(
            string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Process Prediction Requests in Streaming
    #
    stream = KafkaUtils.createDirectStream(ssc, [PREDICTION_TOPIC], {
        "metadata.broker.list": BROKERS,
        "group.id": "0",
    })

    object_stream = stream.map(lambda x: json.loads(x[1]))
    object_stream.pprint()

    row_stream = object_stream.map(
        lambda x: Row(FlightDate=iso8601.parse_date(x['FlightDate']),
                      Origin=x['Origin'],
                      Distance=x['Distance'],
                      DayOfMonth=x['DayOfMonth'],
                      DayOfYear=x['DayOfYear'],
                      UUID=x['UUID'],
                      DepDelay=x['DepDelay'],
                      DayOfWeek=x['DayOfWeek'],
                      FlightNum=x['FlightNum'],
                      Dest=x['Dest'],
                      Timestamp=iso8601.parse_date(x['Timestamp']),
                      Carrier=x['Carrier']))
    row_stream.pprint()

    #
    # Create a dataframe from the RDD-based object stream
    #

    def classify_prediction_requests(rdd):

        from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
        from pyspark.sql.types import StructType, StructField

        prediction_request_schema = StructType([
            StructField("Carrier", StringType(), True),
            StructField("DayOfMonth", IntegerType(), True),
            StructField("DayOfWeek", IntegerType(), True),
            StructField("DayOfYear", IntegerType(), True),
            StructField("DepDelay", DoubleType(), True),
            StructField("Dest", StringType(), True),
            StructField("Distance", DoubleType(), True),
            StructField("FlightDate", DateType(), True),
            StructField("FlightNum", StringType(), True),
            StructField("Origin", StringType(), True),
            StructField("Timestamp", TimestampType(), True),
            StructField("UUID", StringType(), True),
        ])

        prediction_requests_df = spark.createDataFrame(
            rdd, schema=prediction_request_schema)
        prediction_requests_df.show()

        #
        # Add a Route variable to replace FlightNum
        #

        from pyspark.sql.functions import lit, concat
        prediction_requests_with_route = prediction_requests_df.withColumn(
            'Route',
            concat(prediction_requests_df.Origin, lit('-'),
                   prediction_requests_df.Dest))
        prediction_requests_with_route.show(6)

        # Vectorize string fields with the corresponding pipeline for that column
        # Turn category fields into categoric feature vectors, then drop intermediate fields
        for column in ["Carrier", "Origin", "Dest", "Route"]:
            string_indexer_model = string_indexer_models[column]
            prediction_requests_with_route = string_indexer_model.transform(
                prediction_requests_with_route)

        # Vectorize numeric columns: DepDelay, Distance and index columns
        final_vectorized_features = vector_assembler.transform(
            prediction_requests_with_route)

        # Inspect the vectors
        final_vectorized_features.show()

        # Drop the individual index columns
        index_columns = [
            "Carrier_index", "Origin_index", "Dest_index", "Route_index"
        ]
        for column in index_columns:
            final_vectorized_features = final_vectorized_features.drop(column)

        # Inspect the finalized features
        final_vectorized_features.show()

        # Make the prediction
        predictions = rfc.transform(final_vectorized_features)

        # Drop the features vector and prediction metadata to give the original fields
        predictions = predictions.drop("Features_vec")
        final_predictions = predictions.drop("indices").drop("values").drop(
            "rawPrediction").drop("probability")

        # Inspect the output
        final_predictions.show()

        # Store to Mongo
        if final_predictions.count() > 0:
            final_predictions.rdd.map(lambda x: x.asDict()).saveToMongoDB(
                "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response"
            )

    # Do the classification and store to Mongo
    row_stream.foreachRDD(classify_prediction_requests)

    ssc.start()
    ssc.awaitTermination()
def preprocess_test(test, model=None):
    # test = test.dropna(axis=1, how='all', inplace=False)
    # for c in test.columns:
    #     if test.filter(col(c).isNotNull()).count() == 0:
    #         test = test.drop(c)

    print('Length of test : ' + str(len(test.columns)))

    if model == 'xgb':
        cols = [x for x in test.columns if x not in ['datetime']]
        print('Test Columns : ' + str(len(test.columns)))
        print('Test Rows : ' + str(test.count()))

        test = clip(test, cols)

        # test = test.resample('H').mean()

        # test = test.rolling(window=50).mean()

        test = get_mean_of_cyl_values(test)
        test = test.fillna(0)

        return test

    elif model == 'lstm':

        cols = [x for x in test.columns if x not in ['datetime']]
        print('Test Columns : ' + str(len(test.columns)))
        print('Test Rows : ' + str(test.count()))
        test = clip(test, cols)

        test = get_mean_of_cyl_values(test)

        print('Test Columns : ' + str(len(test.columns)))
        print('Test Rows : ' + str(test.count()))
        print(test.schema)
        test = test.fillna(0)

        cols = [x for x in test.columns if x not in ['datetime']]


        assembler = VectorAssembler().setInputCols(cols).setOutputCol("features")
        print('assembler')
        transformed = assembler.transform(test)

        # Normalize each Vector using $L^1$ norm.
        normalizer = Normalizer(inputCol="features",
                                outputCol="normFeatures",
                                p=1.0)
        l1NormData = normalizer.transform(transformed)

        scaler = StandardScaler(inputCol="normFeatures",
                                outputCol="scaledFeatures",
                                withStd=True,
                                withMean=False)

        # Compute summary statistics by fitting the StandardScaler
        scalerModel = scaler.fit(l1NormData)
        # Normalize each feature to have unit standard deviation.
        scaledData = scalerModel.transform(l1NormData)
        # train = scaledData.drop(*cols)
        del test, transformed, l1NormData

        n_components_ = 50

        pca = PCA(k=n_components_,
                  inputCol="scaledFeatures",
                  outputCol="pcaFeatures")
        model = pca.fit(scaledData)

        vds_5 = model.transform(scaledData).select(['pcaFeatures', 'datetime'])
        print(vds_5)

        def extract(row):
            return (row.datetime, ) + tuple(row.pcaFeatures.toArray().tolist())

        vds_5 = vds_5.rdd.map(extract).toDF(["datetime"])
        print(vds_5)

        vds_5 = vds_5.drop(*['pcaFeatures', 'datetime'])

        return vds_5

    elif model == 'svm':
        # test = test.toPandas()
        # test = clip_data(test)
        cols = [x for x in test.columns if x not in ['datetime']]
        print('Test Columns : ' + str(len(test.columns)))
        print('Test Rows : ' + str(test.count()))

        test = clip(test, cols)

        print('Test Columns : ' + str(len(test.columns)))
        print('Test Rows : ' + str(test.count()))

        # test = test.toPandas()
        # test_max = test.resample('H').max().add_suffix('_max')
        # test_min = test.resample('H').min().add_suffix('_min')
        # test_std = test.resample('H').std().add_suffix('_std')
        # test = test.resample('H').mean()
        #
        # test = pd.concat([test, test_max], axis=1, sort=False)
        # test = pd.concat([test, test_min], axis=1, sort=False)
        # test = pd.concat([test, test_std], axis=1, sort=False)
        # del test_max, test_min,
        # gc.collect()

        # test = test.toHandy()

        test = get_mean_of_cyl_values(test)

        # vds_5 = test
        print('Test Columns : ' + str(len(test.columns)))
        print('Test Rows : ' + str(test.count()))

        # vds_5 = vds_5.replace(to_replace=0, value=1)

        # vds_5 = vds_5.pct_change(periods=1, fill_method='ffill')

        # window = Window.orderBy('datetime') \
        #     .rowsBetween(-sys.maxsize, 0)
        #
        # def ffill(column):
        #     return last(column, ignorenulls=True).over(window)
        #
        # def bfill(column):
        #     return last(column, ignorenulls=True).over(window)
        #
        # for column in cols:
        #     vds_5 = vds_5.withColumn(column,ffill(col(column)))
        #
        # for column in cols:
        #     vds_5 = vds_5.withColumn(column,bfill(col(column)))

        test = test.fillna(0)
        # vds_5 = vds_5.fillna(method='ffill')
        # vds_5 = vds_5.fillna(method='bfill')

        return test

    elif model == 'perm':

        # test = test.resample('H').mean()
        # test = test.rolling(window=20).mean()

        cols = [x for x in test.columns if x not in ['datetime']]
        print('Test Columns : ' + str(len(test.columns)))
        print('Test Rows : ' + str(test.count()))

        test = test.fillna(0)
        test = clip(test, cols)

        # window = Window.orderBy('datetime') \
        #     .rowsBetween(-sys.maxsize, 0)
        #
        # def ffill(column):
        #     return last(column, ignorenulls=True).over(window)
        #
        # def bfill(column):
        #     return last(column, ignorenulls=True).over(window)
        #
        # for column in cols:
        #     test = test.withColumn(column,ffill(col(column)))
        #
        # for column in cols:
        #     test = test.withColumn(column,bfill(col(column)))

        test = test.fillna(0)

        return test
    spark = SparkSession\
        .builder\
        .appName("VectorSizeHintExample")\
        .getOrCreate()

    # $example on$
    dataset = spark.createDataFrame(
        [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0),
         (0, 18, 1.0, Vectors.dense([0.0, 10.0]), 0.0)],
        ["id", "hour", "mobile", "userFeatures", "clicked"])

    sizeHint = VectorSizeHint(
        inputCol="userFeatures",
        handleInvalid="skip",
        size=3)

    datasetWithSize = sizeHint.transform(dataset)
    print("Rows where 'userFeatures' is not the right size are filtered out")
    datasetWithSize.show(truncate=False)

    assembler = VectorAssembler(
        inputCols=["hour", "mobile", "userFeatures"],
        outputCol="features")

    # This dataframe can be used by downstream transformers as before
    output = assembler.transform(datasetWithSize)
    print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
    output.select("features", "clicked").show(truncate=False)
    # $example off$

    spark.stop()
Example #29
# In[ ]:


# Load the training data into a dataframe
data = spark.read.format('json').load('train.jsonl')
data = clean_tokenize_remove_stopwords_quora(data)

# Get the tf-idf features
data = tf_idf_features_quora(data)
# Get the text features
data = text_features(data)

# combine all the features
feature_assembler = VectorAssembler(
    inputCols=["tf_idf_features", "text_features"],
    outputCol="combined_features"
)
data = feature_assembler.transform(data)


# Normalizing each feature to have unit standard deviation
scaler = StandardScaler(inputCol="combined_features", outputCol="features",
                        withStd=True, withMean=False)
scalerModel = scaler.fit(data)
# Normalize each feature to have unit standard deviation.
data = scalerModel.transform(data)


# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
label_indexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import os

df = sqlContext.read.json(os.environ['WORKDIR'] + "user_features.json")

df_restaurants = df.filter("category = \"Restaurants\"")


assembler = VectorAssembler(
    inputCols=["average_stars", "cat_avg_review_len", "cat_avg_stars", "cat_business_count", "cat_review_count", "months_yelping", "review_count", "votes_cool", "votes_funny", "votes_useful" ],
    outputCol="features")
output = assembler.transform(df_restaurants)

(trainingData, testData) = output.randomSplit([0.7, 0.3])

dt = DecisionTreeRegressor(labelCol = "elite", featuresCol="features")
pipeline = Pipeline(stages=[dt])
model = pipeline.fit(trainingData)
predictions = model.transform(testData)

predictions.select("prediction", "elite", "features").show(5)


evaluator = RegressionEvaluator(
    labelCol="elite", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
Example #31
numColumns = [
    item[0] for item in df.dtypes if not item[1].startswith('string')
]
catColVectors = [c + '_vector' for c in catColumns]

# Change categorical values into numeric
indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_index")
    for column in catColumns
]
encoder = OneHotEncoderEstimator(
    inputCols=[c + "_index" for c in catColumns],
    outputCols=[c + "_vector" for c in catColumns])

assembler = VectorAssembler(inputCols=encoder.getOutputCols() + numColumns,
                            outputCol="features")

label_stringIdx = StringIndexer(inputCol="income", outputCol="label")

pipeline = Pipeline(stages=indexers + [label_stringIdx, encoder, assembler])
encoded_df = pipeline.fit(df).transform(df)

selectedCols = ['label', 'features'] + cols
dataset = encoded_df.select(selectedCols)

# Randomly split data into training and test sets. set seed for reproducibility
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)
print(trainingData.count())
print(testData.count())

# Fit model and train
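
# The snippet ends before a model is actually fit; a minimal sketch, assuming a logistic
# regression on the assembled features (the original model choice is not shown here):
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=10)
lrModel = lr.fit(trainingData)
lrModel.transform(testData).select("label", "prediction").show(5)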
Example #32
def with_features(raw_df, feature_cols):
    vector_assembler = VectorAssembler().setInputCols(feature_cols).setOutputCol('features')
    pipeline = Pipeline().setStages([vector_assembler])

    df = pipeline.fit(raw_df).transform(raw_df)
    return df
Example #33
out

# In[19]:

df_train.dtypes

# In[13]:

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# In[14]:

featureassembler = VectorAssembler(inputCols=[
    "a", "area", "ci", "pi", "eccentricity", "kx", "ky", "m00", "m01", "m10",
    "minAreaPercent", "minEnclosingCircleArea", "mu02", "mu03", "mu11", "mu20",
    "mu30", "sx", "sy", "d"
],
                                   outputCol="features")

# In[20]:

output = featureassembler.transform(df_train)

# In[21]:

output.select("features").show(5)

# In[344]:

output.columns
Example #34
    def Logistic_regression(dataset_add, feature_colm, label_colm):

        dataset = spark.read.csv(dataset_add,
                                 header=True,
                                 inferSchema=True,
                                 sep=";")

        dataset.show()

        dataset.groupBy("y").count().show()
        label = ''
        for y in label_colm:
            label = y

        print(label)

        # using the rformula for indexing, encoding and vectorising

        # f = ""
        # f = label + " ~ "
        #
        # for x in features:
        #     f = f + x + "+"
        # f = f[:-1]
        # f = (f)

        # extracting the schema

        val = dataset.schema

        string_features = []
        integer_features = []

        for x in val:
            if (str(x.dataType) == "StringType"):
                for y in feature_colm:
                    if x.name == y:
                        string_features.append(x.name)
            else:
                for y in feature_colm:
                    if x.name == y:
                        integer_features.append(x.name)

        print(string_features)
        print(integer_features)
        print(val)
        # print(label)
        # label = 'y'

        for z in val:
            if (z.name == label and str(z.dataType) == "StringType"):
                label_indexer = StringIndexer(inputCol=label,
                                              outputCol='indexed_' +
                                              label).fit(dataset)
                dataset = label_indexer.transform(dataset)
            if (z.name == label and str(z.dataType)
                    in ("IntegerType", "FloatType", "DoubleType")):
                dataset = dataset.withColumnRenamed(label, 'indexed_' + label)

        ###########################################################################
        indexed_features = []
        encoded_features = []
        for col in string_features:
            indexer = StringIndexer(inputCol=col,
                                    outputCol='indexed_' + col).fit(dataset)
            indexed_features.append('indexed_' + col)
            dataset = indexer.transform(dataset)
            # dataset.show()
            # encoder = OneHotEncoderEstimator(inputCols=['indexed_'+col], outputCols=['encoded_'+col]).fit(dataset)
            # encoded_features.append('encoded_'+col)
            # dataset = encoder.transform(dataset)
            # dataset.show()

        print(indexed_features)
        print(encoded_features)

        # combining both the features colm together

        final_features = integer_features + indexed_features

        print(final_features)

        # now using the vector assembler

        featureassembler = VectorAssembler(inputCols=final_features,
                                           outputCol="features")

        dataset = featureassembler.transform(dataset)
        dataset.show()

        # combining both the features colm together

        # output.show()
        # output.select("features").show()

        # output_features = dataset.select("features")

        # using the vector indexer (for categorical data kind of one hot encoding)

        vec_indexer = VectorIndexer(inputCol='features',
                                    outputCol='vec_indexed_features',
                                    maxCategories=15).fit(dataset)

        categorical_features = vec_indexer.categoryMaps
        print("Chose %d categorical features: %s" %
              (len(categorical_features), ", ".join(
                  str(k) for k in categorical_features.keys())))

        vec_indexed = vec_indexer.transform(dataset)
        vec_indexed.show()

        # preparing the finalized data

        finalized_data = vec_indexed.select('indexed_' + label,
                                            'vec_indexed_features')
        finalized_data.show()

        # formula = RFormula(formula=f,
        #                    featuresCol="features",
        #                    labelCol="label")
        #
        # output = formula.fit(dataset).transform(dataset)
        #
        # output_2 = output.select("features", "label")
        #
        # output_2.show()

        # splitting the dataset into train and test

        train_data, test_data = finalized_data.randomSplit([0.75, 0.25],
                                                           seed=40)

        # implementing the logistic regression
        # lr1 =LogisticRegression()

        Accuracy_list = []
        # Accuracy_list.append(accuracy)
        FPR_list = []
        # FPR_list.append(falsePositiveRate)
        TPR_list = []
        precision_list = []
        recall_list = []

        y = 0.1
        # x=[]
        for i in range(0, 3):
            y = round(y + 0.1, 2)

            lr = LogisticRegression(featuresCol='vec_indexed_features',
                                    labelCol='indexed_' + label,
                                    maxIter=5,
                                    regParam=0.1,
                                    elasticNetParam=1.0,
                                    threshold=0.3)

            # fit the model

            lrModel = lr.fit(train_data)
            lrModel

            # print the coefficients and the intercept for the logistic regression

            print("coefficients:" + str(lrModel.coefficientMatrix))
            # mat = (lrModel.coefficientMatrix)
            # print mat
            print("intercept: " + str(lrModel.interceptVector))

            # getting the summary of the model

            # f-measure calculation
            from pyspark.ml.classification import BinaryLogisticRegressionTrainingSummary

            training_summary = lrModel.summary

            BinaryLogisticRegressionTrainingSummary.accuracy

            print(" area under roc : ", training_summary.areaUnderROC)
            print("  roc : ", training_summary.roc)
            roc = training_summary.roc
            roc.show()
            print(" pr value : ", training_summary.pr)
            pr = training_summary.pr
            pr.show()
            print(" precision by threshold : ",
                  training_summary.precisionByThreshold)
            prec_by_threshold = training_summary.precisionByThreshold
            prec_by_threshold.show()

            print(" accuracy : ", training_summary.accuracy)
            accuracy_d = training_summary.accuracy
            print(accuracy_d)

            fMeasure = training_summary.fMeasureByThreshold

            fMeasure.show()

            maxFMeasure = fMeasure.groupBy().max('F-Measure').select(
                'max(F-Measure)').head()
            bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
                .select('threshold').head()['threshold']
            lr.setThreshold(bestThreshold)

            # obtain the objective per iteration

            objectiveHistory = training_summary.objectiveHistory
            print("objectiveHistory")
            for objective in objectiveHistory:
                print(objective)

            # for a multiclass we can inspect  a matrix on a per label basis

            print("false positive rate by label:")
            for i, rate in enumerate(
                    training_summary.falsePositiveRateByLabel):
                print("label %d: %s" % (i, rate))

            print("True positive rate")
            for i, rate in enumerate(training_summary.truePositiveRateByLabel):
                print("label %d : %s" % (i, rate))
            #
            # print("True Negative rate")
            # for i, rate in enumerate(training_summary)

            print("Precision by label:")
            for i, prec in enumerate(training_summary.precisionByLabel):
                print("label %d: %s" % (i, prec))

            print("Recall by label:")
            for i, rec in enumerate(training_summary.recallByLabel):
                print("label %d: %s" % (i, rec))

            print("F-measure by label:")
            for i, f in enumerate(training_summary.fMeasureByLabel()):
                print("label %d: %s" % (i, f))

            accuracy = training_summary.accuracy
            falsePositiveRate = training_summary.weightedFalsePositiveRate
            truePositiveRate = training_summary.weightedTruePositiveRate
            fMeasure = training_summary.weightedFMeasure()
            precision = training_summary.weightedPrecision
            recall = training_summary.weightedRecall
            print(
                "Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
                % (accuracy, falsePositiveRate, truePositiveRate, fMeasure,
                   precision, recall))
            # Accuracy_list = []
            Accuracy_list.append(accuracy)
            # FPR_list = []
            FPR_list.append(falsePositiveRate)
            # TPR_list=[]
            TPR_list.append(truePositiveRate)
            precision_list.append(precision)
            recall_list.append(recall)

        print(Accuracy_list)
        print(FPR_list)
        print(TPR_list)
        print(precision_list)
        print(recall_list)

        import matplotlib.pyplot as plt
        #
        # plt.plot(recall_list, FPR_list)
        # plt.show()

        #
        # fpr = [0.0,0.0,0.0,0.0,0.003067484662576687, 0.003067484662576687, 0.006134969325153374, 0.11042944785276074, 0.1165644171779141, 0.1165644171779141, 0.23006134969325154, 0.9723926380368099, 0.9846625766871165 ]
        # tpr = [0.0, 0.09767441860465116, 0.10232558139534884, 0.13488372093023257 ,0.17674418604651163 ,0.3674418604651163 , 0.37209302325581395  , 0.7534883720930232, 0.8651162790697674 , 0.8697674418604651 , 0.9069767441860465, 0.9953488372093023, 1.0]
        # data visualization

        # ROC graph
        fpr = roc.select("FPR").toPandas()

        tpr = roc.select("TPR").toPandas()

        plt.plot(fpr, tpr)
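        # Hedged addition (not in the original): label the ROC axes.
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")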
        plt.show()

        # PR graph

        pr_recall = pr.select("recall").toPandas()
        pr_precision = pr.select("precision").toPandas()

        plt.plot(pr_recall, pr_precision)  # recall on the x-axis, precision on the y-axis
        plt.show()

        # now applying the fit on the test data

        prediction_val = lrModel.transform(test_data)
        prediction_val.groupBy('indexed_' + label, "prediction").count().show()
        prediction_val.show()

        prediction_val.groupBy("prediction").count().show()

        prediction_val.groupBy("prediction", "probability").count().show()
Example #35
0
filterer = SQLTransformer(statement="select * from __THIS__ where cancelled = 0")

# Cast `star_rating` to double for the Binarizer:
converter = SQLTransformer(statement="select *, cast(star_rating as double) as star_rating_double from __THIS__")

# Binarize `star_rating_double`:
from pyspark.ml.feature import Binarizer
binarizer = Binarizer(inputCol="star_rating_double", outputCol="five_star_rating", threshold=4.5)

# Extract the `reviewed` feature:
extractor = SQLTransformer(statement="select *, review is not null as reviewed from __THIS__")

# Assemble the features:
from pyspark.ml.feature import VectorAssembler
selected = ["reviewed"]
assembler = VectorAssembler(inputCols=selected, outputCol="features")

# Specify the decision tree classifier:
from pyspark.ml.classification import DecisionTreeClassifier
classifier = DecisionTreeClassifier(featuresCol="features", labelCol="five_star_rating")

# Specify the pipeline:
from pyspark.ml import Pipeline
stages = [filterer, converter, binarizer, extractor, assembler, classifier]
pipeline = Pipeline(stages=stages)


# ## Save and load the machine learning pipeline

# Save the `Pipeline` instance to our local directory in HDFS:
pipeline.write().overwrite().save("models/pipeline")
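# Hedged sketch of the matching load step; since the pipeline above was saved
# unfitted, it reloads as a Pipeline (a fitted one would use PipelineModel.load):
from pyspark.ml import Pipeline
loaded_pipeline = Pipeline.load("models/pipeline")
print(loaded_pipeline.getStages())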
def preprocess_train(train, model=None, spark=None):
    if model == 'xgb':
        # train = train.dropna(axis=1, how='all', inplace=False)
        cols = [x for x in train.columns if x not in ['datetime']]
        print('Test Columns : ' + str(len(train.columns)))
        print('Test Rows : ' + str(train.count()))
        train = clip(train, cols)

        # train = train.resample('H').mean()

        train = get_mean_of_cyl_values(train)

        train = train.fillna(0)

        # train.show(n=5)

        return train

    elif model == 'lstm':

        # train = train.dropna(axis=1, how='all', inplace=False)

        cols = [x for x in train.columns if x not in ['datetime']]
        print('Test Columns : ' + str(len(train.columns)))
        print('Test Rows : ' + str(train.count()))
        train = clip(train, cols)

        train = get_mean_of_cyl_values(train)

        # train_max = train.resample('H').max().add_suffix('_max')
        # train_min = train.resample('H').min().add_suffix('_min')
        # train_std = train.resample('H').std().add_suffix('_std')
        # train = train.resample('H').mean()
        #
        # train = pd.concat([train, train_max], axis=1, sort=False)
        # train = pd.concat([train, train_min], axis=1, sort=False)
        # train = pd.concat([train, train_std], axis=1, sort=False)
        # del train_max, train_min,
        # gc.collect()

        # train = train.rolling(window=150).mean()

        cols = [x for x in train.columns if x not in ['datetime']]
        # function to calculate number of seconds from number of days
        days = lambda i: i * 86400
        #
        # train = train.withColumn('datetime', train.datetime.cast('timestamp'))
        #
        # # create window by casting timestamp to long (number of seconds)
        # w = (Window.orderBy('datetime').rowsBetween(-50, 0))
        # for column in cols:
        #     train = train.withColumn(column, avg(train[column]).over(w))

        print('Test Columns : ' + str(len(train.columns)))
        print('Test Rows : ' + str(train.count()))
        print(train.schema)
        train = train.fillna(0)
        #
        # window = Window.orderBy('datetime') \
        #     .rowsBetween(-sys.maxsize, 0)
        #
        # def ffill(column):
        #     return last(column, ignorenulls=True).over(window)
        #
        # def bfill(column):
        #     return last(column, ignorenulls=True).over(window)
        #
        # for column in cols:
        #     train = train.withColumn(column, ffill(col(column)))
        #
        # for column in cols:
        #     train = train.withColumn(column, bfill(col(column)))

        # train = train.fillna(0)
        #
        # vds_5 = train

        # del train
        # gc.collect()
        #
        # vds_5 = vds_5.replace(to_replace=0, value=1)
        #
        # vds_5 = vds_5.pct_change(periods=1, fill_method='ffill')
        # #
        # vds_5 = vds_5.fillna(method='ffill')
        # vds_5 = vds_5.fillna(method='bfill')
        cols = [x for x in train.columns if x not in ['datetime']]
        # vds_55 = normalize(vds_5)
        # vds_55 = scale(vds_55)

        assembler = VectorAssembler(inputCols=cols, outputCol="features")
        print('assembler')
        transformed = assembler.transform(train)

        # Normalize each Vector using $L^1$ norm.
        normalizer = Normalizer(inputCol="features",
                                outputCol="normFeatures",
                                p=1.0)
        l1NormData = normalizer.transform(transformed)

        scaler = StandardScaler(inputCol="normFeatures",
                                outputCol="scaledFeatures",
                                withStd=True,
                                withMean=False)

        # Compute summary statistics by fitting the StandardScaler
        scalerModel = scaler.fit(l1NormData)
        # Normalize each feature to have unit standard deviation.
        scaledData = scalerModel.transform(l1NormData)
        # train = scaledData.drop(*cols)
        del train, transformed, l1NormData

        n_components_ = 50
        # pca = FastICA(n_components=n_components_)
        #
        # dump(pca, 'pca.joblib')
        #
        # pca2_results = pca.fit_transform(scaledData)
        # # n_comp=pca.n_components_
        # n_comp = n_components_
        # print('Number of componeds : ' + str(n_comp))
        # print(pca2_results)
        # print (len(pca2_results[:, 1]))

        # for i in range(0, n_comp):
        #     vds_5['pca_' + str(i)] = 0
        #     # print(len(vds_5['pca_' + str(i)]))
        #     # print(len(pca2_results[:, i]))
        #     vds_5['pca_' + str(i)] = pca2_results[:, i]

        # pca_columns = [x for x in vds_5.columns if x.startswith('pca')]
        # vds_5 = vds_5[pca_columns]

        pca = PCA(k=n_components_,
                  inputCol="scaledFeatures",
                  outputCol="pcaFeatures")
        model = pca.fit(scaledData)
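        # Hedged addition: PCAModel reports the variance captured by each of the
        # k components, a quick sanity check on the choice of n_components_.
        print(model.explainedVariance)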

        vds_5 = model.transform(scaledData).select(['pcaFeatures', 'datetime'])
        print(vds_5)

        # flatten each row's PCA vector into a plain tuple of component values,
        # keeping datetime as the first element
        def extract(row):
            return (row.datetime, ) + tuple(row.pcaFeatures.toArray().tolist())

        vds_5 = vds_5.rdd.map(extract).toDF(["datetime"])
        print(vds_5)

        vds_5 = vds_5.drop(*['pcaFeatures', 'datetime'])

        return vds_5

    elif model == 'svm':

        cols = [x for x in train.columns if x not in ['datetime']]
        print('Test Columns : ' + str(len(train.columns)))
        print('Test Rows : ' + str(train.count()))
        train = clip(train, cols)

        print('Test Columns : ' + str(len(train.columns)))
        print('Test Rows : ' + str(train.count()))

        # train_max = train.resample('H').max().add_suffix('_max')
        # train_min = train.resample('H').min().add_suffix('_min')
        # train_std = train.resample('H').std().add_suffix('_std')
        # train = train.resample('H').mean()
        #
        # train = pd.concat([train, train_max], axis=1, sort=False)
        # train = pd.concat([train, train_min], axis=1, sort=False)
        # train = pd.concat([train, train_std], axis=1, sort=False)
        # del train_max, train_min,
        # gc.collect()

        train = get_mean_of_cyl_values(train)

        vds_5 = train
        print('Test Columns : ' + str(len(train.columns)))
        print('Test Rows : ' + str(train.count()))

        # vds_5 = vds_5.replace(to_replace=0, value=1)

        # vds_5 = vds_5.pct_change(periods=1, fill_method='ffill')

        # window = Window.orderBy('datetime') \
        #     .rowsBetween(-sys.maxsize, 0)
        #
        # def ffill(column):
        #     return last(column, ignorenulls=True).over(window)
        #
        # def bfill(column):
        #     return last(column, ignorenulls=True).over(window)
        #
        # for column in cols:
        #     vds_5 = vds_5.withColumn(column, ffill(col(column)))
        #
        # for column in cols:
        #     vds_5 = vds_5.withColumn(column, bfill(col(column)))

        vds_5 = vds_5.fillna(0)
        # vds_5 = vds_5.fillna(method='ffill')
        # vds_5 = vds_5.fillna(method='bfill')

        return vds_5
data = spark.read.format("csv").option("header", True).option(
    "inferSchema", True
).option("delimiter", ",").load(
    "/home/charan/workspaces/big_data_programming/bigdata_progamming_m2_icp/icp7/apps/datasets/adult.data"
)

# data = data.select("*", F.when(data.X == ' <=50K', 1).when(data.X == ' >50K', 2).otherwise(0).alias('label'))

data = data.withColumnRenamed("age", "label").select("label", "education-num",
                                                     "hours-per-week")

data = data.select(data.label.cast("double"), "education-num",
                   "hours-per-week")

# Create vector assembler for feature columns
assembler = VectorAssembler(inputCols=data.columns[1:], outputCol="features")
data = assembler.transform(data)

# Split data into training and test data set
training, test = data.select("label", "features").randomSplit([0.85, 0.15])

# Create a Naive Bayes model and fit it with the training dataset
nb = NaiveBayes()
model = nb.fit(training)

# Generate prediction from test dataset
predictions = model.transform(test)

# Evaluate the accuracy of the model
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
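# Hedged addition: report the evaluated accuracy (the message text is ours).
print("Naive Bayes accuracy on the test split: %g" % accuracy)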
    """
    clean_riskdata = nostring_riskdata.na.fill({"loan":avg("loan") ,
                                                "mortdue":avg("mortdue"), 
                                                "value":avg("value"),
                                                "derog":avg("derog"),
                                                "delinq":0,
                                                "clage":avg("clage"),
                                                "ninq":avg("ninq"),
                                                "clno":avg("clno"),
                                                "debtinc":avg("debtinc")
                                                })
    """
    #Define Input-output columns, i.e. transform to MLP features vector
    ignore=['bad']
    assembler = VectorAssembler(
    inputCols=[k for k in clean_riskdata.columns if k not in ignore],
    outputCol="predictors")
    Triskdata = assembler.transform(clean_riskdata)
    # Split the data into train and test
    splits = Triskdata.randomSplit([0.4, 0.6], 1234)
    train = splits[0]
    test = splits[1]

    #################################################################
    # Preliminary analysis
    #################################################################
    clean_riskdata.describe().show()
    riskdata.stat.crosstab("bad","job").show()
    riskdata.stat.crosstab("bad","reason").show()
    #################################################################
    # Multilayer Perceptron Classifier
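    # Hedged sketch of the multilayer perceptron this heading refers to; the
    # layer sizes, maxIter and seed are assumptions, and 'bad' is assumed to be
    # a numeric 0/1 label column.
    from pyspark.ml.classification import MultilayerPerceptronClassifier
    input_size = len([k for k in clean_riskdata.columns if k not in ignore])
    mlp = MultilayerPerceptronClassifier(featuresCol="predictors", labelCol="bad",
                                         layers=[input_size, 5, 2],
                                         maxIter=100, seed=1234)
    mlp_model = mlp.fit(train)
    mlp_model.transform(test).select("bad", "prediction").show(5)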
from pyspark.ml.feature import VectorAssembler
va = VectorAssembler()\
  .setInputCols(["Quantity", "UnitPrice"])\
  .setOutputCol("features")

sales = va.transform(spark.read.format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load("/data/retail-data/by-day/*.csv")
  .limit(50)
  .coalesce(1)
  .where("Description IS NOT NULL"))

sales.cache()


# COMMAND ----------

from pyspark.ml.clustering import KMeans
km = KMeans().setK(5)
print(km.explainParams())
kmModel = km.fit(sales)


# COMMAND ----------

summary = kmModel.summary
print(summary.clusterSizes)  # number of points per cluster
print(kmModel.computeCost(sales))  # within-set sum of squared errors
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)
Example #40
0
# this will convert each unique string into a numeric
#indexer = StringIndexer(inputCol="property_state", outputCol="loc_state")
#indexed = indexer.fit(lndf).transform(lndf)
# indexed.show(5)
## First try a logistic regression 
# now we need to create  a  "label" and "features"
# input for using the sparkML library

## This runs in the Cloudera Spark Cluster
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors

#
# the debt to income col has nulls
assembler = VectorAssembler(
    inputCols=[ "sensor1", "sensor2", "sensor3", "sensor4" ],
    outputCol="features")
      

# note the column headers - label and features are keywords
lrdf = assembler.transform(iotdf)
lrdf.show(5)
lrdf.count()

from pyspark.ml.classification import LogisticRegression

# Create a LogisticRegression instance. This instance is an Estimator.
lr = LogisticRegression(maxIter=10, regParam=0.01)
# Print out the parameters, documentation, and any default values.
print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
#remove data_df from memory
data_df.unpersist() 

#encode the dependent variable - category_predict
classifyIndexer = StringIndexer(inputCol="Category", outputCol="Category_Index")
classifymodel = classifyIndexer.fit(encoded)
encoded2 = classifymodel.transform(encoded)



#keep the following columns: x, y, hour, day, month, year, dayofweek, week, x_sim, y_sim
#drop the following
cleaned = encoded2.select([c for c in encoded2.columns if c not in{'DayOfWeek','Category','Address','Dates','Descript','PdDistrict','Resolution','PdDistrict_Index'}])

ignore = ['Category_Index']
assembler = VectorAssembler(inputCols=[x for x in cleaned.columns if x not in ignore],outputCol='features')

transformed = assembler.transform(cleaned)


data_transformed = transformed.select(col("Category_Index").alias("label"), col("features")).map(lambda row: LabeledPoint(row.label, row.features))

#********************************************************************************
# split the training set
train, test = data_transformed.randomSplit([0.7, 0.3], seed = 2)

#naivebayes classifier
#lambda = 1.0
# initialize classifier:
nb_model = mllib_class.NaiveBayes.train(train, 1.0)
#this step will take 50 seconds
Example #42
0
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.shell import spark
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import DoubleType

df=spark.read.format("csv").option("inferSchema", "true").option("header", "true").option("sep", ";")\
    .load("TestDataset.csv")

### RENAMING THE COLUMNS ###
df = df.toDF("c1", "c2", "c3", "c4", "c5", "c6", "c7", "c8", "c9", "c10",
             "c11", "quality")
featureassembler = VectorAssembler(inputCols=[
    "c1", "c2", "c3", "c4", "c5", "c6", "c7", "c8", "c9", "c10", "c11"
],
                                   outputCol="Independent Features")
output = featureassembler.transform(df)

test_data = output.select("Independent Features", "quality")

#### LOADING AND EVALUATING THE MODEL ####
reg = RandomForestClassificationModel.load("ModelV1")
pred = reg.transform(test_data)

pred.select('Independent Features', "quality", 'prediction').show(5)
evaluator = MulticlassClassificationEvaluator(labelCol="quality",
                                              predictionCol="prediction",
                                              metricName="accuracy")
Accuracy = evaluator.evaluate(pred)
Example #43
0
def number_transformers(spark):
    return [
        VectorAssembler(),
        StandardScaler(),
    ]
# Alternative way of filtering data
#data.registerTempTable("taxi")
#df_sql = sqlContext.sql("SELECT * FROM taxi WHERE passenger_count >0 and passenger_count < 10 and trip_time_in_secs > 0 and trip_time_in_secs < 3000 and trip_distance > 0 AND trip_distance < 25 AND fare_amount > 0 AND fare_amount < 50 AND total_amount>0 AND total_amount < 100 and tip_amount > 0 and tip_amount < 20")
#df_sql.show() # Show my result
# df_sql.registerTempTable("taxi_clean")
# sqlContext.sql("SELECT count(*) FROM taxi_clean").show()# calculate the number of rows 

############## regression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import LinearRegressionModel
from pyspark.ml.evaluation import RegressionEvaluator

# Merge all needed columns as one column called features 
assembler = VectorAssembler(inputCols = ['trip_time_in_secs', 'trip_distance'], outputCol="features")
# Select features and rename total_amount as label.
regression_data = assembler.transform(data_cleaned).select([col for col in data_cleaned.columns if col != "total_amount"] + ["features", data_cleaned["total_amount"].alias("label")])
regression_data.show()

# Setup the linear regression solver
lr = LinearRegression(maxIter=1000, regParam=0.3, elasticNetParam=0)
# Fit the model
lrModel = lr.fit(regression_data)
# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(lrModel.coefficients))
print("Intercept: %s" % str(lrModel.intercept))

# Summarize the model over the training set and print out some metrics
trainingSummary = lrModel.summary
print("numIterations: %d" % trainingSummary.totalIterations)
from __future__ import print_function

# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("VectorAssemblerExample")\
        .getOrCreate()

    # $example on$
    dataset = spark.createDataFrame(
        [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],
        ["id", "hour", "mobile", "userFeatures", "clicked"])

    assembler = VectorAssembler(
        inputCols=["hour", "mobile", "userFeatures"],
        outputCol="features")

    output = assembler.transform(dataset)
    print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
    output.select("features", "clicked").show(truncate=False)
    # $example off$

    spark.stop()
Example #46
0
def assemble_features_pipeline_model(df, features, label, algorithm,
                                     set_features, set_label, prediction, keep,
                                     emit, task_id):
    """
    Prepare features and label to be processed by a ML algorithm. Features and
    labels are indexed (StringIndexer) if they are categorical.
    During the process, temporary columns are created but they're removed as
    soon as the pipeline ends.
    :arg df Input data frame
    :arg features array with column names to be used as features
    :arg label name of the column with label
    :arg algorithm algorithm to be used; can be an ML or a feature extraction one
    :arg set_features name of the method used to set the features
    :arg set_label name of the method used to set the label
    :arg prediction name of the prediction column (generated)
    :arg keep list of the columns to be kept after the processing
    :arg emit emit messages function
    :arg task_id task identifier
    :returns processing pipeline model
    """
    if keep is None:
        keep = []
    final_keep = [c.name for c in df.schema]
    final_keep.extend(keep)

    clean_null_rows = 'SELECT * FROM __THIS__ WHERE {}'
    if len(features) > 1 and not isinstance(
            df.schema[str(features[0])].dataType, VectorUDT):

        emit(name='update task',
             message=_(
                 'Features are not assembled as a vector. They will be '
                 'implicitly assembled and rows with null values will be '
                 'discarded. If this is undesirable, explicitly add a '
                 'attribute vectorizer, handle missing data and '
                 'categorical attributes in the workflow.'),
             level='warning',
             status='RUNNING',
             identifier=task_id)
        stages = []
        to_assemble = []
        for f in features:
            if not dataframe_util.is_numeric(df.schema, f):
                name = f + '__tmp__'
                to_assemble.append(name)
                stages.append(
                    StringIndexer(inputCol=f,
                                  outputCol=name,
                                  handleInvalid='keep'))
            else:
                to_assemble.append(f)

        # Remove rows with null (VectorAssembler doesn't support it)
        cond = ' AND '.join(['{} IS NOT NULL '.format(c) for c in to_assemble])
        stages.append(SQLTransformer(statement=clean_null_rows.format(cond)))

        final_features = 'features__tmp__'
        stages.append(
            VectorAssembler(inputCols=to_assemble, outputCol=final_features))

        getattr(algorithm, set_features)(final_features)

        if label is not None:
            final_label = '{}__tmp__'.format(label)
            getattr(algorithm, set_label)(final_label)
            stages.append(
                StringIndexer(inputCol=label,
                              outputCol=final_label,
                              handleInvalid='keep'))

        stages.append(algorithm)
        pipeline = Pipeline(stages=stages)
        model = pipeline.fit(df)

        last_stages = [model]
        if label is not None:
            last_stages.append(
                IndexToString(inputCol=prediction,
                              outputCol='{}'.format(prediction),
                              labels=model.stages[-2].labels))

        # Remove temporary columns
        sql = 'SELECT {} FROM __THIS__'.format(', '.join(final_keep))
        last_stages.append(SQLTransformer(statement=sql))

        pipeline = Pipeline(stages=last_stages)
        model = pipeline.fit(df)

    else:
        if label is not None:
            final_label = '{}__tmp__'.format(label)

            getattr(algorithm, set_label)(final_label)
            stages = [
                StringIndexer(inputCol=label,
                              outputCol=final_label,
                              handleInvalid='keep'), algorithm
            ]

            pipeline = Pipeline(stages=stages)
            model = pipeline.fit(df)
            last_stages = [model]
            if label is not None:
                last_stages.append(
                    IndexToString(inputCol=final_label,
                                  outputCol='{}_str'.format(prediction),
                                  labels=model.stages[-2].labels))

            # Remove temporary columns
            sql = 'SELECT {} FROM __THIS__'.format(', '.join(final_keep))
            last_stages.append(SQLTransformer(statement=sql))

            pipeline = Pipeline(stages=last_stages)
            model = pipeline.fit(df)

        else:
            getattr(algorithm, set_features)(features[0])
            model = algorithm.fit(df)

    return model
Example #47
0
# Read the training dataset.
raw_dataset_train = reader.read.format('com.databricks.spark.csv') \
                          .options(header='true', inferSchema='true') \
                          .load(path_train)
# Read the testing dataset.
raw_dataset_test = reader.read.format('com.databricks.spark.csv') \
                         .options(header='true', inferSchema='true') \
                         .load(path_test)

# First, we would like to extract the desired features from the raw dataset.
# We do this by constructing a list with all desired columns.
# This is identical for the test set.
features = raw_dataset_train.columns
features.remove('label')

# Next, we use Spark's VectorAssembler to "assemble" (create) a vector of all desired features.
# http://spark.apache.org/docs/latest/ml-features.html#vectorassembler
vector_assembler = VectorAssembler(inputCols=features, outputCol="features")
# This transformer will take all columns specified in features, and create an additional column
# "features" which will contain all the desired features aggregated into a single vector.
dataset_train = vector_assembler.transform(raw_dataset_train)
dataset_test = vector_assembler.transform(raw_dataset_test)

# Define the number of output classes.
nb_classes = 10
encoder = OneHotTransformer(nb_classes, input_col="label", output_col="label_encoded")
dataset_train = encoder.transform(dataset_train)
dataset_test = encoder.transform(dataset_test)

# Allocate a MinMaxTransformer from Distributed Keras to normalize the features.
# o_min -> original_minimum
# n_min -> new_minimum
transformer = MinMaxTransformer(n_min=0.0, n_max=1.0, \
Example #48
0
stringIndexerStages = [
    StringIndexer(inputCol = col, outputCol = col + '_INDEX', handleInvalid = 'skip')
    for col in COLUMNS_OHE + COLUMNS_HIGH_CARD
]
pipelineStages += stringIndexerStages

OHEStage = OneHotEncoderEstimator(
    inputCols = [col + '_INDEX' for col in COLUMNS_OHE],
    outputCols = [col + '_VEC' for col in COLUMNS_OHE]
)
pipelineStages += [OHEStage]

sparseVectorCols = [col + '_VEC' for col in COLUMNS_OHE] + [col + '_INDEX' for col in COLUMNS_HIGH_CARD]
assembler = VectorAssembler(
    inputCols = sparseVectorCols, 
    outputCol = 'features'
)
pipelineStages += [assembler]

normalizer = Normalizer(
    inputCol = 'features',
    outputCol = 'normFeatures'
)
pipelineStages += [normalizer]

decisionTree = DecisionTreeClassifier(
    featuresCol = 'normFeatures',
    labelCol = 'HasDetections'
)
pipelineStages += [decisionTree]
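# Hedged continuation: assemble the stages above into a Pipeline; `train_df` is
# a hypothetical DataFrame holding the raw columns and the HasDetections label.
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=pipelineStages)
# pipelineModel = pipeline.fit(train_df)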
label_stringIdx = StringIndexer(inputCol = "income", outputCol = "label")
label_model = label_stringIdx.fit(dataset)
label_indexed = label_model.transform(dataset)
print(label_indexed.take(1))

# COMMAND ----------

# MAGIC %md
# MAGIC Next, we will use the VectorAssembler() to combine all the feature columns into a single vector column. This will include both the numeric columns and the one-hot encoded binary vector columns in our dataset.

# COMMAND ----------

# Transform all features into a vector using VectorAssembler
assembler = VectorAssembler(
    inputCols=["age","workclassclassVec","fnlwgt","educationclassVec","education_num","marital_statusclassVec",
               "occupationclassVec","relationshipclassVec","raceclassVec", "sexclassVec", "capital_gain", "capital_loss", "hours_per_week",
               "native_countryclassVec"],
    outputCol="features")
output = assembler.transform(label_indexed)

# Keep relevant columns
selectedcols = ["label", "features"] + cols
dataset = output.select(selectedcols)
display(dataset)

# COMMAND ----------

### Randomly split data into training and test sets; set seed for reproducibility
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
print(trainingData.count())
print(testData.count())
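# Hedged continuation (an assumed next step, not part of the original snippet):
# fit a classifier such as logistic regression on the training split.
from pyspark.ml.classification import LogisticRegression
lrModel = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10).fit(trainingData)
lrModel.transform(testData).select("label", "prediction").show(5)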
Example #50
0
with open(json_file_path, 'r') as j:
    contents = json.load(j)
cluster = contents['cluster']
for item in cluster:
    path_aggregated_df = item['path_aggregated_df']
    path_metrics_kmeans_sse = item['path_metrics_kmeans_sse']
clustering_df = spark.read.parquet(path_aggregated_df)
columns_clustering_features = [
    "calls_outgoing_count", "user_spendings", "sms_incoming_count",
    "user_use_gprs", "sms_outgoing_count", "user_no_outgoing_activity_in_days",
    "calls_outgoing_spendings", "user_lifetime"
]

print("before assemble")
# data preparation
vector_assembler = VectorAssembler(inputCols=columns_clustering_features,
                                   outputCol="initial_features")

standard_scaler = StandardScaler(inputCol="initial_features",
                                 outputCol="features",
                                 withStd=True,
                                 withMean=True)
print("after scale")
# manual featurization: assemble the features, then fit and apply the scaler
vectorized_df = vector_assembler.transform(clustering_df)
model_scaler = standard_scaler.fit(vectorized_df)
featurized_clustering_df = model_scaler.transform(vectorized_df)
# equivalent pipeline-based featurization (overwrites the results computed above)
featurization_pipeline = Pipeline(stages=[vector_assembler, standard_scaler])
featurization_pipeline_model = featurization_pipeline.fit(clustering_df)
model_scaler = featurization_pipeline_model.stages[-1]
featurized_clustering_df = featurization_pipeline_model.transform(
    clustering_df)
sse_cost = np.zeros(20)
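# Hedged sketch of what `sse_cost` is presumably for: sweep k and record the
# within-cluster SSE for an elbow plot (the range, the seed and the use of
# KMeansSummary.trainingCost, available in Spark >= 2.4, are assumptions).
from pyspark.ml.clustering import KMeans
for k in range(2, 20):
    kmeans = KMeans(featuresCol="features", k=k, seed=1)
    kmeans_model = kmeans.fit(featurized_clustering_df)
    sse_cost[k] = kmeans_model.summary.trainingCost
print(sse_cost)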