def __build_pca(self, df, metadata_path):
    """Fit a PCA on ``df`` and return ``df`` with a ``pca_features`` column.

    The estimator reads the ``scaled_features`` column and keeps ``self.k``
    components.

    :param df: DataFrame to fit on and to transform.
    :param metadata_path: location the fitted model is saved to (overwriting
        any existing model) when ``self.__metadata`` is truthy.
    :return: the transformed DataFrame.
    """
    estimator = PCA(k=self.k, inputCol='scaled_features', outputCol='pca_features')

    if not self.__metadata:
        # No persistence requested: use the in-memory model directly.
        return estimator.fit(df).transform(df)

    # Persist the fitted model, then transform with the copy read back from
    # ``metadata_path``.
    estimator.fit(df).write().overwrite().save(metadata_path)
    return PCAModel.load(metadata_path).transform(df)
def PCA_transform(sc, samples_df, feature_count, threshold, k):
    """Reduce the ``features`` column of ``samples_df`` with PCA.

    Two modes are supported:

    * ``threshold`` is not ``None``: fit PCA with ``min(feature_count, 200)``
      components, let ``ml_util.ml_get_n_components`` pick the number of
      components from the explained variance, and truncate every projected
      vector to that many entries.
    * otherwise, ``k`` > 0: fit PCA with exactly ``k`` components.

    :param sc: SparkContext used to build the SQLContext in threshold mode.
    :param samples_df: DataFrame with ``hash``, ``label`` and ``features``.
    :param feature_count: number of input features.
    :param threshold: value within 0 to 1, or ``None``.
    :param k: number of components, used when ``threshold`` is ``None``.
    :return: ``(df_pcaed, k, pca_model)``; ``(None, None, None)`` on invalid
        input, and ``(None, k, None)`` when neither mode applies.
    """
    # check input
    if threshold and ((threshold > 1) or (threshold < 0)):
        print("ERROR: PCA_transform: Input threshold should be within 0 to 1")
        return (None, None, None)
    if k and k < 0:
        print("ERROR: transform: Input k should be greater than 0")
        return (None, None, None)

    # Initialised up front so the final return never hits an unbound name
    # when neither branch below runs (e.g. threshold is None and k is 0/None).
    df_pcaed = None
    pca_model = None

    if threshold is not None:
        # by threshold ===============
        fk = feature_count
        if feature_count > 200:
            fk = 200
            print("INFO: force k to " + str(fk) + " for PCA.")

        pca = PCA(k=fk, inputCol="features", outputCol="pcaFeatures")
        pca_model = pca.fit(samples_df)

        # get ratio array and find n_components
        var_arr = pca_model.explainedVariance
        print("RESULT: PCA ratio_vec= %s" % (var_arr,))
        n_components = ml_util.ml_get_n_components(var_arr, threshold)
        k = n_components

        df_pcaed_all = pca_model.transform(samples_df).select(
            "hash", "label", "pcaFeatures")

        # get k column only
        sqlCtx = SQLContext(sc)
        df_pcaed = sqlCtx.createDataFrame(
            df_pcaed_all.rdd.map(lambda p: Row(
                hash=p["hash"],
                label=p["label"],
                pcaFeatures=DenseVector(p["pcaFeatures"].toArray()[:k]))))
        print("INFO: PCA_transform: n_components = %s , threshold= %s"
              % (n_components, threshold))
    elif k is not None and k > 0:
        # by n_components ===============
        pca = PCA(k=k, inputCol="features", outputCol="pcaFeatures")
        pca_model = pca.fit(samples_df)
        df_pcaed = pca_model.transform(samples_df).select(
            "hash", "label", "pcaFeatures")
        print("INFO: PCA_transform: n_components = %s" % (k,))

    return (df_pcaed, k, pca_model)
def _perform_pca(self, dataset: DataFrame, k: int):
    """Project ``features`` onto ``k`` components and save a cluster plot.

    For ``k == 2`` a single scatter plot is drawn; for ``k == 3`` a pair grid
    over the three components. Points are coloured by ``clusterNum``. The
    current figure is written to
    ``analysis/results/charts/pca-<k>-<model_name>.png``.
    """
    projected = (
        PCA(k=k, inputCol="features", outputCol="pcaFeatures")
        .fit(dataset)
        .transform(dataset)
    )
    rows = projected.select("clusterNum", "pcaFeatures").collect()

    # One sequence per principal component, plus a categorical cluster label
    # so the plotting library treats clusters as discrete groups.
    axes = zip(*[row["pcaFeatures"] for row in rows])
    colors = pd.Categorical([row["clusterNum"] for row in rows])

    if k == 2:
        x, y = axes
        plt.figure(figsize=(15, 15))
        sns.scatterplot(x=x, y=y, hue=colors)
    elif k == 3:
        x, y, z = axes
        plot_df = pd.DataFrame(
            {"PCA 1": x, "PCA 2": y, "PCA 3": z, "cluster": colors})
        grid = sns.PairGrid(plot_df, hue="cluster", palette="coolwarm")
        grid = grid.map(sns.scatterplot, linewidths=0.75, edgecolor="w", s=40)
        grid = grid.add_legend()
        grid.fig.set_size_inches(15, 15)

    # The file name records the number of components and the model name.
    image_path = os.path.join(
        "analysis", "results", "charts", f"pca-{k}-{self.model_name}.png")
    plt.savefig(image_path)
def get_preprocessed_data(input_train, input_test):
    """Load train/test CSVs, reduce the features with PCA and re-attach labels.

    Column ``_c0`` of each CSV is treated as the label; the remaining columns
    are the features. A 50-component PCA is fitted on the training features
    only and applied to both sets.

    :return: ``(train_data, test_data)`` as built by
        ``combine_features_labels``.
    """
    def _load(path, label_name):
        # Headerless CSV: split into a label vector and a feature vector.
        raw = spark.read.csv(path, header=False, inferSchema="true")
        labels = get_vector(raw.select('_c0'), label_name)
        features = get_vector(raw.drop('_c0'), 'feature')
        return labels, features

    train_labels, train_features = _load(input_train, 'train_label')
    test_labels, test_features = _load(input_test, 'test_label')

    pca_model = PCA(k=50, inputCol="feature",
                    outputCol="pca_feature").fit(train_features)

    def _project(features, new_name):
        # Keep only the PCA output and give it a split-specific name.
        reduced = pca_model.transform(features).select("pca_feature")
        return reduced.withColumnRenamed("pca_feature", new_name)

    train_features_pca = _project(train_features, "train_feature")
    test_features_pca = _project(test_features, "test_feature")

    train_data = combine_features_labels(train_features_pca, train_labels,
                                         'train')
    test_data = combine_features_labels(test_features_pca, test_labels,
                                        'test')
    return train_data, test_data
def test_model_polynomial_expansion(self):
    """Convert a fitted 2-component Spark ML PCA model to ONNX and compare
    the ONNX output with Spark's own ``transform`` output.

    NOTE(review): despite its name, this test exercises PCA.
    """
    rows = [
        (Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), ),
        (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), ),
        (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]), ),
    ]
    data = self.spark.createDataFrame(rows, ["features"])
    model = PCA(k=2, inputCol="features", outputCol="pca_features").fit(data)

    # The ONNX input is named after the PCA input column.
    feature_count = data.first()[0].size
    N = data.count()
    initial_types = [('features', FloatTensorType([N, feature_count]))]
    model_onnx = convert_sparkml(model, 'Sparkml PCA', initial_types)
    self.assertTrue(model_onnx is not None)

    def _as_float32_matrix(vector_column):
        # Expand a pandas column of Spark vectors into a float32 matrix.
        return vector_column.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)

    # run the model
    predicted = model.transform(data)
    expected = _as_float32_matrix(predicted.toPandas().pca_features)
    data_np = _as_float32_matrix(data.toPandas().features)

    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlPCA")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['pca_features'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def pca_model(data, k=7, inputcol='scale_features', outputcol='pca_features'):
    """Fit a ``k``-component PCA on ``data``.

    :return: ``(variance, pca_data)`` - the fitted model's
        ``explainedVariance`` and ``data`` with the ``outputcol`` column added.
    """
    fitted = PCA(k=k, inputCol=inputcol, outputCol=outputcol).fit(data)
    return fitted.explainedVariance, fitted.transform(data)
def pca(self, df, k=1):
    """Run PCA on every column of ``df`` and return the pieces as a dict.

    :return: dict with
        ``components`` - ``RowMatrix.computePrincipalComponents(k)`` as array,
        ``score`` - rows projected by a ``k``-component ML PCA model,
        ``eigVectors`` / ``eigValues`` - eigen-decomposition of the covariance
        matrix, ordered by descending eigenvalue.
    """
    def _as_list(record):
        return list(record)

    covariance = RowMatrix(df.rdd.map(_as_list)).computeCovariance().toArray()

    # ``eigh`` does the decomposition; argsort is ascending, so reverse it
    # to put the largest eigenvalue first.
    eig_vals, eig_vecs = np.linalg.eigh(covariance)
    descending = np.argsort(eig_vals)[::-1]
    eig_vecs = eig_vecs.T[descending]
    eig_vals = eig_vals[descending]

    components = RowMatrix(
        df.rdd.map(_as_list)).computePrincipalComponents(k)

    train_data = df.rdd.map(
        lambda x: Row(features=Vectors.dense(x))).toDF()
    model = PCA(k=k, inputCol="features", outputCol="pcaFeatures").fit(train_data)
    score = model.transform(train_data)
    projected = score.select("pcaFeatures").rdd.map(
        lambda x: list(x[0])).collect()

    return {
        "components": components.toArray(),
        "score": np.array(projected),
        "eigVectors": eig_vecs,
        "eigValues": eig_vals,
    }
def _get_pca_model(feat_train, k):
    """Fit a ``k``-component PCA on ``feat_train`` and return the model.

    The sum of the model's explained variance is logged through ``logr``
    under the event name ``'Training Accuracy'``.
    """
    from pyspark.ml.feature import PCA

    fitted = PCA(k=k, inputCol="features",
                 outputCol="pca_features").fit(feat_train)

    # Explained Variance
    total_variance = sum(fitted.explainedVariance)
    logr.log_event('Training Accuracy', f"{total_variance}")
    return fitted
def PCA_setting(spark, rdd, n):
    """Build a one-column ``features`` DataFrame from ``rdd``, project it onto
    ``n`` principal components and return the collected ``pca_features`` rows.
    """
    frame = spark.createDataFrame(rdd, schema=['features'])
    fitted = PCA(k=n, inputCol='features', outputCol='pca_features').fit(frame)
    projected = fitted.transform(frame).select('pca_features')
    return projected.collect()
def pca(inputdir, df, alg, k):
    """Project ``df['features']`` onto ``int(k)`` components and hand the
    ``labels`` / ``pca_features`` columns to ``writeOut``.

    :return: whatever ``writeOut(inputdir, pcaFeatures, alg, k)`` returns.
    """
    from pyspark.ml.feature import PCA

    reducer = PCA(k=int(k), inputCol="features", outputCol="pca_features")
    projected = reducer.fit(df).transform(df)
    labelled_features = projected.select("labels", "pca_features")
    return writeOut(inputdir, labelled_features, alg, k)
def pca_generic(data, dimens, input_col, output_col="pcaFeatures"):
    """Fit a ``dimens``-component PCA on ``data[input_col]``, show the result
    and return it together with the fitted model.

    :param data: DataFrame containing ``input_col``.
    :param dimens: number of principal components.
    :param input_col: name of the vector column to reduce.
    :param output_col: name of the column the projection is written to.
    :return: ``(transformed DataFrame, fitted PCA model)``.
    """
    # Report the column actually written to rather than the default name.
    print('PCA Result with dimentions = ' + str(dimens) +
          ' with output column ' + str(output_col))
    # Local names are kept distinct from the function's own name.
    estimator = PCA(k=dimens, inputCol=input_col, outputCol=output_col)
    model = estimator.fit(data)
    result = model.transform(data)
    result.show()
    print('\n')
    return result, model
def runPCA(vector_features, k=3):
    """Project the ``features`` column of ``vector_features`` onto ``k``
    principal components.

    :param vector_features: DataFrame with a ``features`` vector column.
    :param k: number of principal components to keep.
    :return: DataFrame with the single column ``pcaFeatures``.
    """
    from pyspark.ml.feature import PCA

    # convert df to feature_vec
    feature_vec = vector_features.select('features')
    # PySpark ML constructors only accept keyword arguments, so ``k`` must
    # be passed by name.
    pca = PCA(k=k, inputCol="features", outputCol="pcaFeatures")
    model = pca.fit(feature_vec)
    return model.transform(feature_vec).select("pcaFeatures")
def clustering(input_df, input_col_name, n):
    """
    KMeans and PCA

    Selects ``state``, ``categories``, ``stars`` and ``input_col_name``,
    L1-normalises ``input_col_name`` into ``features``, fits an ``n``-cluster
    KMeans on it, and returns a cached 2-component PCA projection (column
    ``pc``).
    """
    selected = input_df.select('state', 'categories', 'stars', input_col_name)
    normalizer = Normalizer(inputCol=input_col_name, outputCol="features", p=1.0)
    df = normalizer.transform(selected)

    KMmodel = KMeans(k=n, seed=2).fit(df)
    predicted = KMmodel.transform(df).cache()

    # NOTE(review): ``dfsample`` is not defined in this function, so it must
    # come from an enclosing/module scope; ``predicted`` above is never used.
    # Confirm whether the PCA was meant to run on ``predicted``.
    reducer = PCA(k=2, inputCol='features', outputCol="pc")
    return reducer.fit(dfsample).transform(dfsample).cache()
async def plot_cluster(self, df, x='_3', y='_4'):
    """Scatter-plot a 2-component PCA projection of ``df['features']``,
    coloured by the ``prediction`` column.

    :param x: column of the flattened frame used for the x axis.
    :param y: column of the flattened frame used for the y axis.
    """
    projected = PCAml(k=2, inputCol="features",
                      outputCol="pca").fit(df).transform(df)

    def _flatten(row):
        # (customer, prediction, <pca component>, <pca component>)
        return (row.customer, row.prediction) + tuple(
            row.pca.toArray().tolist())

    # Only the first two columns are named here; presumably Spark assigns
    # positional names (``_3``, ``_4``) to the rest, matching the defaults
    # of ``x``/``y`` - TODO confirm.
    pcadf = projected.rdd.map(_flatten).toDF(["customer", "prediction"])
    pcadf.show(10, False)

    pcadf.toPandas().plot.scatter(x=x, y=y, c='prediction',
                                  colormap='viridis')
    plt.show()
def train(df, hiperparameter):
    '''
    Fit a PCA model to the input dataset.

    Input/Parameters:
        df - input dataset, an instance of pyspark.sql.DataFrame
        hiperparameter - hyperparameter configuration; a mapping with the
            keys 'k', 'inputCol' and 'outputCol'
    Output/Returns:
        the fitted model
    '''
    estimator = PCA(
        k=hiperparameter['k'],
        inputCol=hiperparameter['inputCol'],
        outputCol=hiperparameter['outputCol'],
    )
    return estimator.fit(df)
def _compute_cluster_analysis(spark_df, clusters=5):
    """Cluster ``spark_df`` with k-means and return a 2-D PCA view as JSON.

    All columns of ``spark_df`` are assembled into one vector, clustered with
    KMeans, optionally sampled (with replacement) down to roughly
    ``CLUSTERING_ANALYSIS_SAMPLE_SIZE`` rows, and reduced to two dimensions
    with PCA.

    NOTE(review): every column of ``spark_df`` is used as-is; presumably the
    caller has already dropped non-numeric columns - confirm.

    :param spark_df: DataFrame to analyse.
    :param clusters: number of k-means clusters.
    :return: the parsed JSON of the resulting pandas frame (PCA features and
        a ``clusters`` column).
    :raises ValueError: if ``spark_df`` has fewer than two columns.
    """
    fs = constants.FEATURE_STORE
    numeric_columns = [name for name, _dtype in spark_df.dtypes]

    if not numeric_columns:
        raise ValueError("The provided spark dataframe does not contain any numeric columns. "
                         "Cannot compute cluster analysis with k-means on categorical columns. "
                         "The numeric datatypes are: {}"
                         " and the number of numeric datatypes in the dataframe is: {} ({})".format(
            constants.SPARK_CONFIG.SPARK_NUMERIC_TYPES, len(spark_df.dtypes), spark_df.dtypes))
    if len(numeric_columns) == 1:
        raise ValueError("The provided spark dataframe does contains only one numeric column. "
                         "Cluster analysis will filter out numeric columns and then "
                         "use pca to reduce dataset dimension to 2 dimensions and "
                         "then apply KMeans, this is not possible when the input data have only one numeric column."
                         "The numeric datatypes are: {}"
                         " and the number of numeric datatypes in the dataframe is: {} ({})".format(
            constants.SPARK_CONFIG.SPARK_NUMERIC_TYPES, len(spark_df.dtypes), spark_df.dtypes))

    # Assemble all columns into one vector column and cluster on it.
    assembler = VectorAssembler(inputCols=numeric_columns,
                                outputCol=fs.CLUSTERING_ANALYSIS_INPUT_COLUMN)
    assembled = assembler.transform(spark_df)
    kmeans = KMeans(k=clusters, seed=1, maxIter=20,
                    featuresCol=fs.CLUSTERING_ANALYSIS_INPUT_COLUMN,
                    predictionCol=fs.CLUSTERING_ANALYSIS_OUTPUT_COLUMN)
    kmeans_model = kmeans.fit(assembled.select(fs.CLUSTERING_ANALYSIS_INPUT_COLUMN))
    clustered = kmeans_model.transform(assembled).select(
        [fs.CLUSTERING_ANALYSIS_INPUT_COLUMN, fs.CLUSTERING_ANALYSIS_OUTPUT_COLUMN])

    # Sample only when there are at least SAMPLE_SIZE rows.
    count = clustered.count()
    if count < fs.CLUSTERING_ANALYSIS_SAMPLE_SIZE:
        sampled = clustered
    else:
        sampled = clustered.sample(
            True, float(fs.CLUSTERING_ANALYSIS_SAMPLE_SIZE) / float(count))

    # Reduce the feature vector to two dimensions.
    pca = PCA(k=2,
              inputCol=fs.CLUSTERING_ANALYSIS_INPUT_COLUMN,
              outputCol=fs.CLUSTERING_ANALYSIS_PCA_COLUMN)
    pca_model = pca.fit(sampled)
    reduced = pca_model.transform(sampled).select(
        [fs.CLUSTERING_ANALYSIS_PCA_COLUMN, fs.CLUSTERING_ANALYSIS_OUTPUT_COLUMN])

    renamed = reduced.withColumnRenamed(
        fs.CLUSTERING_ANALYSIS_PCA_COLUMN,
        fs.CLUSTERING_ANALYSIS_FEATURES_COLUMN,
    ).withColumnRenamed(fs.CLUSTERING_ANALYSIS_OUTPUT_COLUMN, "clusters")
    return json.loads(renamed.toPandas().to_json())
def view(self, data, pred):
    """Use PCA to reduce ``data['scaled']`` to 3 dimensions and show a 3-D
    scatter plot coloured by the ``prediction`` column.

    ``pred`` is accepted but not used by this method.
    """
    projected = PCA(k=3, inputCol="scaled",
                    outputCol="pca-3").fit(data).transform(data)

    # Turn the PCA vector into an array column, then into three scalar
    # columns placed after ``prediction``.
    with_axis = projected.select("prediction", "pca-3").withColumn(
        "axis", self.to_array(column("pca-3")))
    axis_columns = [column("axis")[i] for i in range(3)]
    dataframe = with_axis.select(["prediction"] + axis_columns).toPandas()

    fig = pyplot.figure(figsize=(20, 20))
    ax = fig.add_subplot(111, projection="3d")
    ax.scatter(
        dataframe.iloc[:, 1],
        dataframe.iloc[:, 2],
        dataframe.iloc[:, 3],
        c=dataframe.iloc[:, 0] + 2,
    )
    pyplot.show()
def feature_engi(df):
    '''
    Assemble the "sex", "time_gap" and "chgrd" columns into a vector, reduce
    it to 2 PCA components and mean-centre the result.

    Returns the input frame with the added "NumFeatures", "pca" and
    "features" columns.
    '''
    # Combine the three input columns into a single vector column.
    assembler = VectorAssembler(inputCols=["sex", "time_gap", "chgrd"],
                                outputCol="NumFeatures")
    assembled = assembler.transform(df)

    # k is the number of dims
    reducer = PCA(k=2, inputCol="NumFeatures", outputCol="pca")
    reduced = reducer.fit(assembled).transform(assembled)

    # Centre the vectors (mean removal only; no scaling to unit std).
    scaler = StandardScaler(inputCol="pca", outputCol="features",
                            withMean=True, withStd=False)
    return scaler.fit(reduced).transform(reduced)
def __init__(self, Movie):
    """Build per-user genre-count features, then fit k-means and PCA on them.

    :param Movie: object exposing the ``usersMovies`` and ``movies``
        DataFrames read below.

    Sets ``usersMovies``, ``usersGenres``, ``genres_str``, ``users``,
    ``usersGenresTemplate``, ``usersGenresFilled``, ``usersFeatures``,
    ``datapoints``, ``model`` (k-means, k=3) and ``pcaModel`` (PCA, k=2).
    """
    # Regex matching the literal '|' separator. A raw string is used because
    # "\|" in a normal string literal is an invalid escape sequence.
    genre_sep = r"\|"

    # Get all movies watched by all users, distinct of (ratings union of tags)
    self.usersMovies = Movie.usersMovies

    # Join with movies to get genres
    self.usersGenres = self.usersMovies.join(Movie.movies, 'movieId').\
        select('userId', explode(split('genres', genre_sep).alias('genres')).alias('genre'))

    # All the 20 genres
    self.genres_str = 'Crime|Romance|Thriller|Adventure|Drama|War|Documentary|Fantasy|Mystery|Musical|Animation|Film-Noir|(no genres listed)|IMAX|Horror|Western|Comedy|Children|Action|Sci-Fi'

    # Get all users
    self.users = Movie.usersMovies.select('userId').distinct()

    # Form a template with users X genres
    self.usersGenresTemplate = self.users.withColumn('genres', lit(self.genres_str)).\
        select('userId', explode(split('genres', genre_sep).alias('genres')).alias('genre'))

    # Fill in the template with the actual values, and zero where null
    self.usersGenresFilled = self.usersGenres.groupBy('userId', 'genre').agg(count('genre').alias('count')).\
        join(self.usersGenresTemplate, ['userId', 'genre'], 'right').fillna(0)

    # Sort by Genre and form genre array and counts
    # NOTE(review): ``sum`` and ``count`` here presumably refer to the Spark
    # SQL column functions, not the Python builtins - confirm the imports.
    self.usersFeatures = self.usersGenresFilled.groupBy('userId', 'genre').agg(sum('count').alias('count_')).\
        sort('genre', ascending=True).groupBy('userId').\
        agg(collect_list('genre').alias('genres'), collect_list('count_').alias('count')).cache()

    # Drop the genre names; only the per-genre counts feed the feature vector.
    userGenres = self.usersFeatures.drop('genres')
    self.datapoints = userGenres.select(
        'userId', normalizeUdf(col('count')).alias('features'))

    # Trains a k-means model.
    kmeans = KMeans(maxIter=10).setK(3).setSeed(1)
    self.model = kmeans.fit(self.datapoints.select('features'))
    # kmeans.save(data_path + "/kmeans")

    # PCA reduction for visual
    pca = PCA(k=2, inputCol="features", outputCol="pcaFeatures")
    self.pcaModel = pca.fit(self.datapoints.select('features'))
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result") model = word2Vec.fit(documentDF) result = model.transform(documentDF) for row in result.collect(): text, vector = row print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector))) # COMMAND ---------- from pyspark.ml.feature import PCA pca = PCA().setInputCol("features").setK(2) pca.fit(scaleDF).transform(scaleDF).show(20, False) # COMMAND ---------- from pyspark.ml.feature import PolynomialExpansion pe = PolynomialExpansion().setInputCol("features").setDegree(2).setOutputCol( "polyFeatures") pe.transform(scaleDF).show() # COMMAND ---------- from pyspark.ml.feature import ChiSqSelector, Tokenizer tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut") tokenized = tkn\
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 25 20:48:04 2017

@author: vishal
"""
from __future__ import print_function

from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

session = SparkSession.builder.appName('PCA').getOrCreate()

# Three 5-dimensional sample vectors (one sparse, two dense).
data = [
    (Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), ),
    (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), ),
    (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]), ),
]
data_frame = session.createDataFrame(data, ['features'])
#data_frame.show()

# Project the 5-dimensional features onto 3 principal components.
pca = PCA(inputCol='features', outputCol="pca_feature", k=3)
model = pca.fit(data_frame)
pca_df = model.transform(data_frame)
pca_df.show()

session.stop()
print("Binarizer output with Threshold = %f" % binarizer.getThreshold()) binarizedDataFrame.show() # COMMAND ---------- ###PCA is a statistical procedure used to reduce the vector's dimensions. This example reduces a 5 dimensional feature into a 3 dimensional pca feature from pyspark.ml.feature import PCA from pyspark.ml.linalg import Vectors data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), ), (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), ), (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]), )] df = spark.createDataFrame(data, ["features"]) pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures") model = pca.fit(df) result = model.transform(df).select("pcaFeatures") result.show(truncate=False) # COMMAND ---------- ###Polynomial expansion is a process of expanding features in polynomial dimensions. This example expand the given features into 3 degree polynomial dimension from pyspark.ml.feature import PolynomialExpansion from pyspark.ml.linalg import Vectors df = spark.createDataFrame([(Vectors.dense([2.0, 1.0]), ), (Vectors.dense([0.0, 0.0]), ), (Vectors.dense([3.0, -1.0]), )], ["features"]) polyExpansion = PolynomialExpansion(degree=3,
plt.show()

# ### Principal Components Analysis
# PCA performs an orthogonal transformation that converts a set of possibly
# correlated variables into a set of linearly uncorrelated variables called
# <b>principal components</b>
# * the pcaTransformer extracts the principal components from the features
# * the number of components is set by the value of <b>k</b>

# In[23]:

from pyspark.ml.feature import PCA

pca = PCA(k=8, inputCol='features', outputCol='pcaFeatures')

# In[24]:

pcaTransformer = pca.fit(vectorDF)

# #### View the principal components in the transformed space

# In[25]:

pcaFeatureData = pcaTransformer.transform(vectorDF).select('pcaFeatures')
pcaFeatureData.toPandas().head()

# #### The principal components are stored as a DenseVector

# In[26]:

pcaFeatureData.toPandas()['pcaFeatures'][0]
def learn_pca_embedding(raw_data_frame):
    """Fit and return a PCA model with ``NBITS`` components.

    The model reads the ``features`` column of ``raw_data_frame`` and is
    configured to write its projection to a ``pca`` column.
    """
    return PCA(k=NBITS, inputCol='features', outputCol='pca').fit(raw_data_frame)
], ["text"]) # Learn a mapping from words to Vectors. word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result") model = word2Vec.fit(documentDF) result = model.transform(documentDF) for row in result.collect(): text, vector = row print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector))) # COMMAND ---------- from pyspark.ml.feature import PCA pca = PCA().setInputCol("features").setK(2) pca.fit(scaleDF).transform(scaleDF).show(20, False) # COMMAND ---------- from pyspark.ml.feature import PolynomialExpansion pe = PolynomialExpansion().setInputCol("features").setDegree(2) pe.transform(scaleDF).show() # COMMAND ---------- from pyspark.ml.feature import ChiSqSelector, Tokenizer tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut") tokenized = tkn\ .transform(sales.select("Description", "CustomerId"))\
from pyspark.sql import SparkSession
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors
from redisai import save_sparkml
from redisai import onnx_utils
from redisai import DType

# Point Spark at the installed pyspark package and make driver and workers
# use the interpreter that is running this script.
executable = sys.executable
os.environ["SPARK_HOME"] = pyspark.__path__[0]
os.environ["PYSPARK_PYTHON"] = executable
os.environ["PYSPARK_DRIVER_PYTHON"] = executable

spark = SparkSession.builder.appName("redisai_trial").getOrCreate()

# Three 5-dimensional sample vectors (one sparse, two dense).
data = spark.createDataFrame(
    [
        (Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), ),
        (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), ),
        (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]), ),
    ],
    ["features"],
)
pca = PCA(k=2, inputCol="features", outputCol="pca_features")
model = pca.fit(data)

# Describe the model input as a float32 tensor of shape (rows, features).
feature_count = data.first()[0].size
N = data.count()
featurestype = onnx_utils.get_tensortype(
    node_name='features',
    dtype=DType.float32,
    shape=(N, feature_count),
)

save_sparkml(model,
             'spark.onnx',
             initial_types=[featurestype],
             spark_session=spark)
from pyspark.sql import SparkSession

if __name__ == "__main__":
    # Validate the command line before starting Spark, so a usage error does
    # not create (and then abandon) a session.
    if len(sys.argv) != 3:
        print("bin/spark-submit pca-pyspark.py <data_set.csv> <param_K>")
        sys.exit(1)

    K = int(sys.argv[2])

    spark = SparkSession\
        .builder\
        .appName("PCAExample")\
        .getOrCreate()

    try:
        # Named ``raw_df`` to avoid shadowing the builtin ``input``.
        raw_df = spark.read.load(sys.argv[1],
                                 format="csv",
                                 inferSchema="true",
                                 header="false")

        # Every CSV column becomes one entry of the feature vector.
        assembler = VectorAssembler(inputCols=raw_df.columns,
                                    outputCol="features")
        dataset = assembler.transform(raw_df)
        dataset.show()

        pca = PCA(k=K, inputCol="features", outputCol="pcaFeatures")
        model = pca.fit(dataset)

        print("Principal Components: ", model.pc, sep='\n')
        print("Explained Variance: ", model.explainedVariance, sep='\n')
    finally:
        # Stop the session even when loading or fitting fails.
        spark.stop()
#need to first register dataframe as a SQL temporary view in order to use spark sql
from pyspark.sql.functions import desc

df_with_distance.createOrReplaceTempView("df")

# Keep the ``num_outliar`` rows with the largest distance.
result_df = df_with_distance.sort(desc('distance')).limit(num_outliar)

# Drop the vector columns while preserving the original column order, so the
# CSV layout is the same on every run (set arithmetic gives arbitrary order).
excluded_cols = {'scaledFeatures', 'features'}
cols = [c for c in result_df.columns if c not in excluded_cols]
result_df = result_df.select(cols)

#write to dumbo local:
result_df.toPandas().to_csv(filename+"_numeric_data_result.csv", sep=',')
#write to hdfs
#result_df.write.csv("numeric_data_result.csv", sep=',')

######################
#pca:
from pyspark.ml.feature import PCA

pca = PCA(k=3, inputCol="scaledFeatures", outputCol="pcaFeatures")
pca_model = pca.fit(df_with_distance)
result_pca = pca_model.transform(df_with_distance).select('pcaFeatures','prediction','distance')

#will download to dumbo local
result_pca.toPandas().to_csv(filename+"_pca_result.csv")
# Rows whose prediction differs from the Occupancy column, per test set.
count1 = test1_df.filter(" prediction!=Occupancy").count()
total1 = test1_df.count()
count2 = test2_df.filter(" prediction!=Occupancy").count()
total2 = test2_df.count()

# Fraction of mismatching rows over both test sets combined.
total = total1 + total2
tc = count1 + count2
ans = float(tc) / float(total)
print(ans)

#### Convert to PCA ####
pca = PCA(k=2, inputCol="features", outputCol="pcaFeatures")
pcamodel = pca.fit(train_df)

# Project the features of both test sets with the model fitted on train_df.
# Features of Data Set 1
pca_ds1_features = pcamodel.transform(
    test1_df.select("features", "prediction"))
# Features of Data Set 2
pca_ds2_features = pcamodel.transform(
    test2_df.select("features", "prediction"))
Vectors.dense([inStr[1],inStr[2],inStr[3], \ inStr[4],inStr[5],inStr[6],inStr[7], \ inStr[8],inStr[9],inStr[10] ])) return lp bankLp = bankVectors.map(transformToLabeledPoint) bankLp.collect() bankDF = sqlContext.createDataFrame(bankLp, ["label", "features"]) bankDF.select("label", "features").show(10) #Perform PCA from pyspark.ml.feature import PCA bankPCA = PCA(k=3, inputCol="features", outputCol="pcaFeatures") pcaModel = bankPCA.fit(bankDF) pcaResult = pcaModel.transform(bankDF).select("label", "pcaFeatures") pcaResult.show(truncate=False) #Indexing needed as pre-req for Decision Trees from pyspark.ml.feature import StringIndexer stringIndexer = StringIndexer(inputCol="label", outputCol="indexed") si_model = stringIndexer.fit(pcaResult) td = si_model.transform(pcaResult) td.collect() #Split into training and testing data (trainingData, testData) = td.randomSplit([0.7, 0.3]) trainingData.count() testData.count() testData.collect()
def pca(df):
    """Project ``df['features']`` onto 10 principal components and write the
    ``labels`` / ``pca_features`` columns out via ``dfwrite``.

    :param df: DataFrame with ``labels`` and ``features`` columns.
    """
    estimator = PCA(k=10, inputCol="features", outputCol="pca_features")
    model = estimator.fit(df)
    # Transform the same frame the model was fitted on (the parameter),
    # not an unrelated outer name.
    pcaFeatures = model.transform(df).select("labels", "pca_features")
    dfwrite(pcaFeatures, 'pcaFeatures')
# Scale the numeric columns of the data. scalerModel = scaler.fit(transformed) scaled_genres_features = scalerModel.transform(transformed).distinct() # Select the desired columns from the data. scaled_genres_features = (scaled_genres_features.select( F.col('Track_ID'), F.col('scaledFeatures').alias('features'), F.col('Genre'))) scaled_genres_features.show() # Define the principle component analysis object. We only want the top 10 features. pca = PCA(k=10, inputCol="features", outputCol="pca_features") # Fit and transform the data into PCA features. pca_model = pca.fit(scaled_genres_features) scaled_genres_features = pca_model.transform(scaled_genres_features) ########################## # Convert the genre column into a column representing if the song is "Electronic" or some other genre # as a binary label. scaled_genres_features = (scaled_genres_features.withColumn( 'label', F.when((F.col('Genre') == 'Electronic'), 1).otherwise(0))) scaled_genres_features.show(20) # Show the class balance of the binary label. (scaled_genres_features.groupBy('label').agg(F.count(F.col('label'))).select( F.col('label'),
train_df = spark.read.csv(train_datafile, header=False, inferSchema="true")

# Turn test_df into a two-column frame (label, features): column _c0 is the
# label and the remaining columns are assembled into one vector.
assembler_test = VectorAssembler(inputCols=test_df.columns[1:],
                                 outputCol="features")
test_vectors_withlabel = assembler_test.transform(test_df).selectExpr(
    "_c0 as label", "features")

# Same (label, features) layout for train_df.
assembler_train = VectorAssembler(inputCols=train_df.columns[1:],
                                  outputCol="features")
train_vectors_withlabel = assembler_train.transform(train_df).selectExpr(
    "_c0 as label", "features")

# Fit PCA on the training vectors only, then use that single model to
# transform both the train and the test data.
# NOTE(review): the variable/column names (model_200, pca200) and the
# original comment ("k=200 ... keep 90% data") say 200 components, but the
# code uses k=10 - confirm which value is intended.
pca = PCA(k=10, inputCol="features", outputCol="pca200")
model_200 = pca.fit(train_vectors_withlabel)
pca_train_result = model_200.transform(train_vectors_withlabel).select(
    'label', 'pca200')
pca_test_result = model_200.transform(test_vectors_withlabel).select(
    'label', 'pca200')

# Convert the DataFrames into RDDs.
test_rdd = pca_test_result.rdd
train_rdd = pca_train_result.rdd

# Broadcast the collected training rows.
trainbc = spark.sparkContext.broadcast(train_rdd.collect())

# Number of neighbours for KNN, broadcast as well.
k = 5
kbc = spark.sparkContext.broadcast(k)
import shutil

# ``os.system("export ...")`` only affects a short-lived child shell, so the
# options are set on this process's environment instead, before the
# SparkContext is created below.
os.environ["_JAVA_OPTIONS"] = "-Xms1g -Xmx40g"
conf = (SparkConf().set("spark.driver.maxResultSize", "5g"))
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# Each input line is "<label> <float> <float> ...".
lines = sc.textFile(inputpath).map(lambda x: x.split(" "))
lines = lines.map(lambda x: (x[0], [float(y) for y in x[1:]]))
df = lines.map(lambda x: Row(labels=x[0], features=Vectors.dense(x[1]))).toDF()

####Run####
pca = PCA(k=int(k), inputCol="features", outputCol="pca_features")
model = pca.fit(df)
outData = model.transform(df)
pcaFeatures = outData.select("labels", "pca_features")

####Write Out####
output_dir = inputdir + "/pca" + str(k) + "_Features"
output_data = inputdir + "/pca" + str(k) + "_Data"
n_data = 0
n_features = 0
if os.path.isdir(output_dir):
    # Remove the previous output without building a shell command string.
    shutil.rmtree(output_dir)
# NOTE(review): this saves the untransformed ``df``; ``pcaFeatures`` is not
# used in the visible code - confirm which frame should be written.
df.rdd.repartition(1).saveAsTextFile(output_dir)
outputfile = open(output_data, 'w')
inputfile = open(output_dir + '/part-00000', 'r')
# map feature matrix to spark vectors
from pyspark.mllib.linalg import Vectors
Feat = Feat.map(lambda vec: (Vectors.dense(vec),))

## Define a df with feature matrix
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
dfFeat = sqlContext.createDataFrame(Feat, ["features"])
dfFeat.printSchema()

## PCA to project the feature matrix onto numComponents (3) dimensions
from pyspark.ml.feature import PCA
numComponents = 3
pca = PCA(k=numComponents, inputCol="features", outputCol="pcaFeatures")
model = pca.fit(dfFeat)
dfComp = model.transform(dfFeat).select("pcaFeatures")


def _component(axis):
    # The first maxWordsVis values of one principal component.
    return dfComp.map(lambda vec: vec[0][axis]).take(maxWordsVis)


# get the three components as lists to be plotted
compX = _component(0)
compY = _component(1)
compZ = _component(2)

## finish Spark session
sc.stop()

## plot
fs = 20  #fontsize
w = words[0:maxWordsVis]
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
#input
rdd = sc.textFile("/user/demo/train.csv").filter(lambda x: x != titile).\
    map(lambda x: x.split(","))
D = 2 ** 24


def helper1(r):
    """Turn one split CSV record into ``(label, feature vector)``.

    ``r[0]`` is the ID, ``r[-1]`` the target and everything in between is
    hashed (together with its column position) into a bucket in ``[0, D)``.
    Records that cannot be parsed map to label 0.0 and an all-zero vector.
    """
    try:
        features = [
            float(abs(hash("VAR_" + '{0:04}'.format(i) + value))) % D
            for i, value in enumerate(r[1:-1])
        ]
        target = float(r[-1])
        # The ID itself is unused, but a non-numeric ID still sends the
        # record to the fallback below, as before.
        float(r[0])
        return target, Vectors.dense(features)
    except (ValueError, TypeError):
        # Return a vector (not a plain list) so every row has the same type
        # in the ``features`` column.
        return (0.0, Vectors.dense([0.0] * 1932))


# Keep only records with the expected 1934 fields (ID + 1932 values + target).
new_rdd = rdd.filter(lambda i: len(i) == 1934)
rdd_after_trans = new_rdd.map(helper1)
rdd_after_trans.cache()
df = sqlContext.createDataFrame(rdd_after_trans, ["label", "features"])

# Reduce the hashed features to 1000 principal components.
pca = PCA(k=1000, inputCol="features", outputCol="pca_features")
model_pca = pca.fit(df)
rdd_pca = model_pca.transform(df).select(["label", "pca_features"])
rdd_pca1 = rdd_pca.withColumnRenamed('pca_features', 'features')

# Train logistic regression on 70% and write "label,probability[0]" for the rest.
(trainingData, testData) = rdd_pca1.randomSplit([0.7, 0.3])
lr = LogisticRegression(maxIter=100, regParam=0.01)
model = lr.fit(trainingData)
result = model.transform(testData).rdd.map(lambda r: str(r.label)+','+str(r.probability[0]))
result.saveAsTextFile("/user/demo/lr_pca_1000_001")
num_train_samples = 60000
test_df = spark.read.csv(test_datafile, header=False, inferSchema="true")
train_df = spark.read.csv(train_datafile, header=False, inferSchema="true")

# Formatting the Dataframe: column 0 is the label, the remaining columns are
# assembled into one ``features`` vector. The same assembler (built from the
# test frame's column names) is applied to both frames.
assembler = VectorAssembler(inputCols=test_df.columns[1:],
                            outputCol="features")
test_vectors = assembler.transform(test_df).select(test_df[0].alias('label'),
                                                   "features").repartition(16)
train_vectors = assembler.transform(train_df).select(
    train_df[0].alias('label'), "features").repartition(16)

# PCA implementing
# The projection is learned from the training vectors only and then applied
# to both sets, so no information from the test set leaks into the model.
pca = PCA(k=PCA_D, inputCol='features', outputCol='pca')
model = pca.fit(train_vectors)
train_data = model.transform(train_vectors).select('label', 'pca')
test_data = model.transform(test_vectors).select('label', 'pca')

# KNN Data Preprocessing
# ONE time collect() function
train_rows = train_data.rdd.collect()
train_matrix = [row.pca for row in train_rows]
train_label = [row.label for row in train_rows]

# Broadcast the labels and the projected training matrix as numpy arrays.
train_label = sc.broadcast(np.array(train_label))
train_matrix = sc.broadcast(np.array(train_matrix))