def seg_model_lr(train_data, test_data, regType, num_iter):
    removelist_train = set(['stars', 'business_id', 'bus_id', 'b_id', 'review_id', 'user_id'])
    newlist_train = [v for i, v in enumerate(train_data.columns) if v not in removelist_train]
    # Putting data in vector assembler form
    assembler_train = VectorAssembler(inputCols=newlist_train, outputCol="features")
    transformed_train = assembler_train.transform(train_data.fillna(0))
    # Creating input dataset in the form of labeled point for training the model
    data_train = (transformed_train.select("features", "stars")).map(lambda row: LabeledPoint(row.stars, row.features))
    # Training the model using Logistic Regression classifier
    model_train = LogisticRegressionWithLBFGS.train(sc.parallelize(data_train.collect(), 5), regType=regType, iterations=num_iter, numClasses=5)
    # Creating a list of features to be used for predictions
    removelist_final = set(['business_id', 'bus_id', 'b_id', 'review_id', 'user_id'])
    newlist_final = [v for i, v in enumerate(test_data.columns) if v not in removelist_final]
    # Putting data in vector assembler form
    assembler_final = VectorAssembler(inputCols=newlist_final, outputCol="features")
    transformed_final = assembler_final.transform(test_data.fillna(0))
    # Creating input dataset to be used for predictions
    data_final = transformed_final.select("features", "review_id")
    # Predicting ratings using the developed model
    predictions = model_train.predict(data_final.map(lambda x: x.features))
    labelsAndPredictions = data_final.map(lambda data_final: data_final.review_id).zip(predictions)
    return labelsAndPredictions
def seg_model_gb(train_data, test_data, loss_type, num_iter, maxDepth):
    removelist_train = set(['stars', 'business_id', 'bus_id', 'b_id', 'review_id', 'user_id'])
    newlist_train = [v for i, v in enumerate(train_data.columns) if v not in removelist_train]
    # Putting data in vector assembler form
    assembler_train = VectorAssembler(inputCols=newlist_train, outputCol="features")
    transformed_train = assembler_train.transform(train_data.fillna(0))
    # Creating input dataset in the form of labeled point for training the model
    data_train = (transformed_train.select("features", "stars")).map(lambda row: LabeledPoint(row.stars, row.features))
    # Training the model using Gradient Boosted Trees regressor
    model_train = GradientBoostedTrees.trainRegressor(sc.parallelize(data_train.collect(), 5), categoricalFeaturesInfo={}, loss=loss_type, numIterations=num_iter, maxDepth=maxDepth)
    # Creating a list of features to be used for predictions
    removelist_final = set(['business_id', 'bus_id', 'b_id', 'review_id', 'user_id'])
    newlist_final = [v for i, v in enumerate(test_data.columns) if v not in removelist_final]
    # Putting data in vector assembler form
    assembler_final = VectorAssembler(inputCols=newlist_final, outputCol="features")
    transformed_final = assembler_final.transform(test_data.fillna(0))
    # Creating input dataset to be used for predictions
    data_final = transformed_final.select("features", "review_id")
    # Predicting ratings using the developed model
    predictions = model_train.predict(data_final.map(lambda x: x.features))
    labelsAndPredictions = data_final.map(lambda data_final: data_final.review_id).zip(predictions)
    return labelsAndPredictions
def _convertPythonXToJavaObject(self, X):
    """
    Converts the input python object X to a java-side object (either MatrixBlock or Java DataFrame)

    Parameters
    ----------
    X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix or PySpark DataFrame
    """
    if isinstance(X, SUPPORTED_TYPES) and self.transferUsingDF:
        pdfX = convertToPandasDF(X)
        df = assemble(self.sparkSession, pdfX, pdfX.columns, self.features_col).select(self.features_col)
        return df._jdf
    elif isinstance(X, SUPPORTED_TYPES):
        return convertToMatrixBlock(self.sc, X)
    elif hasattr(X, '_jdf') and self.features_col in X.columns:
        # No need to assemble as input DF is likely coming via MLPipeline
        return X._jdf
    elif hasattr(X, '_jdf'):
        assembler = VectorAssembler(inputCols=X.columns, outputCol=self.features_col)
        df = assembler.transform(X)
        return df._jdf
    else:
        raise Exception('Unsupported input type')
def writeLumbarReadings(time, rdd):
    try:
        # Convert RDDs of the words DStream to DataFrame and run SQL query
        connectionProperties = MySQLConnection.getDBConnectionProps('/home/erik/mysql_credentials.txt')
        sqlContext = SQLContext(rdd.context)
        if rdd.isEmpty() == False:
            lumbarReadings = sqlContext.jsonRDD(rdd)
            lumbarReadingsIntermediate = lumbarReadings.selectExpr("readingID", "readingTime", "deviceID", "metricTypeID", "uomID", "actual.y AS actualYaw", "actual.p AS actualPitch", "actual.r AS actualRoll", "setPoints.y AS setPointYaw", "setPoints.p AS setPointPitch", "setPoints.r AS setPointRoll")
            assembler = VectorAssembler(
                inputCols=["actualPitch"],  # Must be in same order as what was used to train the model. Testing using only pitch since model has limited dataset.
                outputCol="features")
            lumbarReadingsIntermediate = assembler.transform(lumbarReadingsIntermediate)

            predictions = loadedModel.predict(lumbarReadingsIntermediate.map(lambda x: x.features))
            predictionsDF = lumbarReadingsIntermediate.map(lambda x: x.readingID).zip(predictions).toDF(["readingID", "positionID"])
            combinedDF = lumbarReadingsIntermediate.join(predictionsDF, lumbarReadingsIntermediate.readingID == predictionsDF.readingID).drop(predictionsDF.readingID)
            combinedDF = combinedDF.drop("features")
            combinedDF.show()
            combinedDF.write.jdbc("jdbc:mysql://localhost/biosensor", "SensorReadings", properties=connectionProperties)
    except:
        pass
def text_features(p_df):
    """
    Extracts features derived from the quora question texts.
    :param p_df: A DataFrame.
    :return: A DataFrame.
    """
    diff_len = udf(lambda arr: arr[0] - arr[1], IntegerType())
    common_words = udf(lambda arr: len(set(arr[0]).intersection(set(arr[1]))), IntegerType())
    unique_chars = udf(lambda s: len(''.join(set(s.replace(' ', '')))), IntegerType())
    p_df = p_df.withColumn("len_q1", length("question1")).withColumn("len_q2", length("question2"))
    p_df = p_df.withColumn("diff_len", diff_len(array("len_q1", "len_q2")))
    p_df = p_df.withColumn("words_q1", size("question1_words")).withColumn("words_q2", size("question2_words"))
    p_df = p_df.withColumn("common_words", common_words(array("question1_words", "question2_words")))
    p_df = p_df.withColumn(
        "unique_chars_q1", unique_chars("question1")
    ).withColumn("unique_chars_q2", unique_chars("question2"))
    assembler = VectorAssembler(
        inputCols=["len_q1", "len_q2", "diff_len", "words_q1", "words_q2", "common_words", "unique_chars_q1", "unique_chars_q2"],
        outputCol="text_features"
    )
    p_df = assembler.transform(p_df)
    return p_df
def predict(self, X):
    if isinstance(X, SUPPORTED_TYPES):
        if self.transferUsingDF:
            pdfX = convertToPandasDF(X)
            df = assemble(self.sqlCtx, pdfX, pdfX.columns, 'features').select('features')
            retjDF = self.model.transform(df._jdf)
            retDF = DataFrame(retjDF, self.sqlCtx)
            retPDF = retDF.sort('ID').select('prediction').toPandas()
            if isinstance(X, np.ndarray):
                return retPDF.as_matrix().flatten()
            else:
                return retPDF
        else:
            retNumPy = convertToNumpyArr(self.sc, self.model.transform(convertToMatrixBlock(self.sc, X)))
            if isinstance(X, np.ndarray):
                return retNumPy
            else:
                return retNumPy  # TODO: Convert to Pandas
    elif hasattr(X, '_jdf'):
        if 'features' in X.columns:
            # No need to assemble as input DF is likely coming via MLPipeline
            df = X
        else:
            assembler = VectorAssembler(inputCols=X.columns, outputCol='features')
            df = assembler.transform(X)
        retjDF = self.model.transform(df._jdf)
        retDF = DataFrame(retjDF, self.sqlCtx)
        # Return DF
        return retDF.sort('ID')
    else:
        raise Exception('Unsupported input type')
def scaleVecCol(self, columns, nameOutputCol):
    """
    This function groups the specified columns into a list array in one column,
    then a scaling process is applied. The scaling procedure is the Spark scaling
    default (see the example below).

    +---------+----------+
    |Price    |AreaLiving|
    +---------+----------+
    |1261706.9|16        |
    |1263607.9|16        |
    |1109960.0|19        |
    |978277.0 |19        |
    |885000.0 |19        |
    +---------+----------+
               |
               |
               V
    +----------------------------------------+
    |['Price', 'AreaLiving']                 |
    +----------------------------------------+
    |[0.1673858972637624,0.5]                |
    |[0.08966137157852398,0.3611111111111111]|
    |[0.11587093205757598,0.3888888888888889]|
    |[0.1139820728616421,0.3888888888888889] |
    |[0.12260126542983639,0.4722222222222222]|
    +----------------------------------------+
    only showing top 5 rows
    """
    # Check if columns argument is a string or list datatype:
    self.__assertTypeStrOrList(columns, "columns")
    # Check if columns to be processed are in dataframe
    self.__assertColsInDF(columnsProvided=columns, columnsDF=self.__df.columns)
    # Check if nameOutputCol argument is a string datatype:
    self.__assertTypeStr(nameOutputCol, "nameOutputCol")
    # Model to use vectorAssembler:
    vecAssembler = VectorAssembler(inputCols=columns, outputCol="features_assembler")
    # Model for scaling feature column:
    mmScaler = MinMaxScaler(inputCol="features_assembler", outputCol=nameOutputCol)
    # Dataframe with features_assembler column
    tempDF = vecAssembler.transform(self.__df)
    # Fitting scaler model with transformed dataframe
    model = mmScaler.fit(tempDF)
    exprs = list(filter(lambda x: x not in columns, self.__df.columns))
    exprs.extend([nameOutputCol])
    self.__df = model.transform(tempDF).select(*exprs)
    self.__addTransformation()  # checkpoint in case
    return self
def convert_to_flat_by_sparkpy(df): subkeys = df.select("subkey").dropDuplicates().collect() subkeys = [s[0] for s in subkeys] assembler = VectorAssembler().setInputCols(subkeys).setOutputCol("features") spark_df = assembler.transform(df.groupBy("key", "parameter").pivot("subkey").agg(first(col("reference")))) spark_df = spark_df.withColumnRenamed("parameter", "label") spark_df = spark_df.select("label", "features") return spark_df
def sparking_your_interest():
    df = SQLContext.read.json('speeches_dataset.json')
    df_fillna = df.fillna("")
    print(df_fillna.count())
    print(df_fillna.printSchema())

    df_utf = call_utf_encoder(df)
    df_cleaned = call_para_cleanup(df_utf)
    print(df_cleaned)
    df_with_bigrams = call_ngrams(df_cleaned, 2)
    df_with_trigrams = call_ngrams(df_with_bigrams, 3)
    df_with_4grams = call_ngrams(df_with_trigrams, 4)
    df_with_5grams = call_ngrams(df_with_4grams, 4)
    df_with_6grams = call_ngrams(df_with_5grams, 4)
    df_with_vocab_score = call_speech_vocab(df_with_6grams)

    df_with_2grams_idf_vectors = tf_feature_vectorizer(df_with_vocab_score, 100, '2grams')
    df_with_3grams_idf_vectors = tf_feature_vectorizer(df_with_2grams_idf_vectors, 100, '3grams')
    df_with_4grams_idf_vectors = tf_feature_vectorizer(df_with_3grams_idf_vectors, 100, '4grams')

    assembler = VectorAssembler(
        inputCols=["2gramsfeatures", "2gramsfeatures", "2gramsfeatures", "vocab_score"],
        outputCol="features")
    assembler_output = assembler.transform(df_with_4grams_idf_vectors)
    output = assembler_output.selectExpr('speaker', 'speech_id', 'para_cleaned_text', 'features')
    print(output.show())
    print(output.count())

    output_tordd = output.rdd
    train_rdd, test_rdd = output_tordd.randomSplit([0.8, 0.2], 123)
    train_df = train_rdd.toDF()
    test_df = test_rdd.toDF()
    print(train_df)
    print(test_df)
    print('Train DF - Count: ')
    print(train_df.count())
    print('Test DF - Count: ')
    print(test_df.count())

    print("Initializing RF Model")
    labelIndexer = StringIndexer(inputCol="speaker", outputCol="indexedLabel").fit(train_df)
    rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features", numTrees=1000, featureSubsetStrategy="auto", impurity='gini', maxDepth=4, maxBins=32)
    pipeline = Pipeline(stages=[labelIndexer, rf])
    model = pipeline.fit(output)
    print("Completed RF Model")

    predictions = model.transform(test_df)
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))
    rfModel = model.stages[1]
    print(rfModel)  # summary only
    print("Predictions: ")
    print(predictions.show())
def _prepare_data_spark(self, data):
    """
    Prepare data for spark format, output data will have the feature format and other useful information
    """
    keys = list(data.keys().difference({self.CHANGE_AMOUNT, self.CHANGE_DIRECTION, self.TARGET_PRICE, self.TODAY_PRICE}))
    df = self._spark.createDataFrame(data)
    ass = VectorAssembler(inputCols=keys, outputCol="features")
    output = ass.transform(df)
    # output.select('features', 'ChangeDirection', 'ChangeAmount').write.save('test.parquet')
    return output
def predictPopularity(features):
    print(features)
    features = tuple(features)
    feature_label = []
    for i in range(0, len(features)):
        feature_label.append('feature' + str(i))
    data_frame = spark.createDataFrame([features], feature_label)
    assembler = VectorAssembler(inputCols=feature_label, outputCol='features')
    data_frame = assembler.transform(data_frame)
    data_frame = data_frame.select('features')
    result = rfc_model.transform(data_frame)
    return result.select('prediction').head(1)[0][0]
def commit(self):
    self.update_domain_role_hints()
    if self.in_df is not None:
        attributes = [att for att in self.used_attrs._list]
        class_var = [var for var in self.class_attrs._list]
        metas = [meta for meta in self.meta_attrs._list]
        VA = VectorAssembler(inputCols=attributes, outputCol='features')
        self.out_df = VA.transform(self.in_df)
        if len(class_var):
            self.out_df = self.out_df.withColumn('label', self.out_df[class_var[0]].cast('double'))
        self.send("DataFrame", self.out_df)
    else:
        self.send("DataFrame", None)
def test_train_data(overall_segment):
    removelist_train = set(['stars', 'business_id', 'bus_id', 'b_id', 'review_id', 'user_id'])
    newlist_train = [v for i, v in enumerate(overall_segment.columns) if v not in removelist_train]
    # Putting data in vector assembler form
    assembler_train = VectorAssembler(inputCols=newlist_train, outputCol="features")
    transformed_train = assembler_train.transform(overall_segment.fillna(0))
    # Creating input dataset in the form of labeled point for training the model
    data_train = (transformed_train.select("features", "stars")).map(lambda row: LabeledPoint(row.stars, row.features))
    (trainingData, testData) = sc.parallelize(data_train.collect(), 5).randomSplit([0.7, 0.3])
    return (trainingData, testData)
def tf_idf_features_quora(p_df):
    """
    Extracts TF-IDF features from the quora dataset.
    :param p_df: A DataFrame.
    :return: A DataFrame.
    """
    tf_df = extract_tf_features(p_df, "question1_meaningful_words", "tf1")
    tf_df = extract_tf_features(tf_df, "question2_meaningful_words", "tf2")
    tf_idf_df = extract_idf_features(tf_df, "tf1", "tf-idf1")
    tf_idf_df = extract_idf_features(tf_idf_df, "tf2", "tf-idf2")
    assembler = VectorAssembler(
        inputCols=["tf-idf1", "tf-idf2"],
        outputCol="tf_idf_features"
    )
    return assembler.transform(tf_idf_df)
def convert_to_flat_by_sparkpy(df): subkeys = df.select("subkey").dropDuplicates().collect() subkeys = [s[0] for s in subkeys] n = len(df.select("reference").first()[0]) # df = df.groupBy("key").agg(array(*[avg(col("reference")[i]) for i in range(n)]).alias("averages")) df = df.groupBy("key").agg(array(*[collect_list(col("reference")[i]) for i in range(n)]).alias("averages")) df.show() r = df.collect() # changedTypedf = joindf.withColumn("label", joindf["show"].cast(DoubleType())) assembler = VectorAssembler().setInputCols(subkeys).setOutputCol("features") spark_df = assembler.transform(df.groupBy("key", "parameter").pivot("subkey").agg(first(col("reference")))) spark_df = spark_df.withColumnRenamed("parameter", "label") spark_df = spark_df.select("label", "features") return spark_df
def predict(self, X): """ Invokes the transform method on Estimator object on JVM if X and y are on of the supported data types Parameters ---------- X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix or PySpark DataFrame """ try: if self.estimator is not None and self.model is not None: self.estimator.copyProperties(self.model) except AttributeError: pass if isinstance(X, SUPPORTED_TYPES): if self.transferUsingDF: pdfX = convertToPandasDF(X) df = assemble(self.sparkSession, pdfX, pdfX.columns, self.features_col).select(self.features_col) retjDF = self.model.transform(df._jdf) retDF = DataFrame(retjDF, self.sparkSession) retPDF = retDF.sort('__INDEX').select('prediction').toPandas() if isinstance(X, np.ndarray): return self.decode(retPDF.as_matrix().flatten()) else: return self.decode(retPDF) else: try: retNumPy = self.decode(convertToNumPyArr(self.sc, self.model.transform(convertToMatrixBlock(self.sc, X)))) except Py4JError: traceback.print_exc() if isinstance(X, np.ndarray): return retNumPy else: return retNumPy # TODO: Convert to Pandas elif hasattr(X, '_jdf'): if self.features_col in X.columns: # No need to assemble as input DF is likely coming via MLPipeline df = X else: assembler = VectorAssembler(inputCols=X.columns, outputCol=self.features_col) df = assembler.transform(X) retjDF = self.model.transform(df._jdf) retDF = DataFrame(retjDF, self.sparkSession) # Return DF return retDF.sort('__INDEX') else: raise Exception('Unsupported input type')
def transform(self, df, featureCols, targetCol):
    """Keep the K most important features of the Spark DataFrame

    Parameters
    ----------
    df : Spark DataFrame
    featureCols: array, names of feature columns to consider in the feature selection algorithm
    targetCol: str, name of target column, i.e., the column against which each feature is compared.

    Returns
    -------
    transformed_df : New Spark DataFrame with only the most important feature columns.
    """
    # build features assembler
    assembler = VectorAssembler(inputCols=featureCols, outputCol='features')
    assembled_df = assembler.transform(df)
    # rename target column
    assembled_df = assembled_df.withColumnRenamed(targetCol, 'target')
    # extract features and target
    feats = assembled_df.select('features').rdd
    feats = feats.map(lambda x: x['features'])
    target = assembled_df.select('target').rdd
    target = target.map(lambda x: x['target'])
    # compute per-column metric
    scores = []
    for i, feat in enumerate(featureCols):
        vector = feats.map(lambda x: x[i])
        scores.append(self.sfunc_(vector, target))
    self.scores_ = scores
    # sort scores
    idx = sorted(range(len(self.scores_)), reverse=True, key=self.scores_.__getitem__)
    # return dataframe with k-best columns
    return df.select(*[featureCols[idd] for idd in idx[:self.k_]])
def convertToLabeledDF(sparkSession, X, y=None):
    from pyspark.ml.feature import VectorAssembler
    if y is not None:
        pd1 = pd.DataFrame(X)
        pd2 = pd.DataFrame(y, columns=['label'])
        pdf = pd.concat([pd1, pd2], axis=1)
        inputColumns = ['C' + str(i) for i in pd1.columns]
        outputColumns = inputColumns + ['label']
    else:
        pdf = pd.DataFrame(X)
        inputColumns = ['C' + str(i) for i in pdf.columns]
        outputColumns = inputColumns
    assembler = VectorAssembler(inputCols=inputColumns, outputCol='features')
    out = assembler.transform(sparkSession.createDataFrame(pdf, outputColumns))
    if y is not None:
        return out.select('features', 'label')
    else:
        return out.select('features')
def merge_features(ddfs, join_column, merge_column, output_column='features', drop_merged_columns=True):
    """
    Join (inner) several DataFrames on the same id and merge their columns (merge_column) into one column
    using pyspark.ml.feature.VectorAssembler

    Example: ddf_merge = merge_features(ddfs=[ddf_pivot1, ddf_pivot2], join_column='customer_id', merge_column='features')

    :param ddfs:
    :param join_column: id column to join by (each ddf must have this column)
    :param merge_column: column to merge (each ddf must have this column)
    :param output_column:
    :param drop_merged_columns:
    :return:
    """
    from pyspark.ml.feature import VectorAssembler
    ddf_res = ddfs.pop(0)
    merge_column_renamed = merge_column + str(0)
    merge_columns = [merge_column_renamed]
    ddf_res = ddf_res.withColumnRenamed(merge_column, merge_column_renamed)
    for i, ddf in enumerate(ddfs):
        merge_column_renamed = merge_column + str(i + 1)
        merge_columns.append(merge_column_renamed)
        ddf_r = ddf.withColumnRenamed(merge_column, merge_column_renamed)
        ddf_res = ddf_res.join(ddf_r, on=join_column, how='inner')
    assembler = VectorAssembler(inputCols=merge_columns, outputCol=output_column)
    res = assembler.transform(ddf_res)
    if drop_merged_columns:
        res = drop_columns(res, columns=merge_columns)
    return res

# def pivot_aggregate(ddf, grpby_columns, pivot_column, aggs, pivot_filter_values=None, pivot_filter_support=None):
#     if pivot_filter_support and not pivot_filter_values:
#         frequent = ddf.freqItems([pivot_column], support=pivot_filter_support).first().asDict()[pivot_column+'_freqItems']
#         pivot_filter_values = map(str, frequent)
#
#     ddf_gr = ddf.groupBy(*grpby_columns)
#     ddf_pivot = ddf_gr.pivot(pivot_column, pivot_filter_values)
#     ddf_agg = ddf_pivot.agg(*aggs)
#     return ddf_agg
def preprocess(data):
    data = data.select('Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime', 'UniqueCarrier',
                       'FlightNum', 'TailNum', 'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay', 'DepDelay', 'Origin',
                       'Dest', 'Distance', 'TaxiIn', 'TaxiOut', 'Cancelled')
    data = data.na.fill('999999')
    for t in data.dtypes:
        if t[1] == 'string' and t[0] not in ['Origin', 'Dest', 'TailNum', 'UniqueCarrier', 'FlightNum']:
            data = data.withColumn(t[0], data[t[0]].cast('integer'))
    data = data.na.fill(999999)
    data = data.withColumnRenamed('Cancelled', 'label')
    data = data.withColumn('label', data.label.cast('double'))
    assembler = VectorAssembler(
        inputCols=['Year', 'Month', 'DayofMonth', 'DayOfWeek',
                   'DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime',
                   'ActualElapsedTime', 'CRSElapsedTime', 'AirTime',
                   'ArrDelay', 'DepDelay', 'Distance', 'TaxiIn', 'TaxiOut'],
        outputCol='features')
    data = assembler.transform(data)
    data = data.select('features', 'label')
    return data
def to_numeric_df(kdf: 'ks.DataFrame') -> Tuple[pyspark.sql.DataFrame, List[str]]:
    """
    Takes a dataframe and turns it into a dataframe containing a single numerical
    vector of doubles. This dataframe has a single field called '_1'.

    TODO: index is not preserved currently
    :param kdf: the koalas dataframe.
    :return: a pair of dataframe, list of strings (the name of the columns
             that were converted to numerical types)

    >>> to_numeric_df(ks.DataFrame({'A': [0, 1], 'B': [1, 0], 'C': ['x', 'y']}))
    (DataFrame[_correlation_output: vector], ['A', 'B'])
    """
    # TODO, it should be more robust.
    accepted_types = {np.dtype(dt) for dt in [np.int8, np.int16, np.int32, np.int64, np.float32, np.float64, np.bool_]}
    numeric_fields = [fname for fname in kdf._metadata.data_columns if kdf[fname].dtype in accepted_types]
    numeric_df = kdf._sdf.select(*numeric_fields)
    va = VectorAssembler(inputCols=numeric_fields, outputCol=CORRELATION_OUTPUT_COLUMN)
    v = va.transform(numeric_df).select(CORRELATION_OUTPUT_COLUMN)
    return v, numeric_fields
def cluster():
    ld = load(open(DATAP + '\\temp\\olangdict.json', 'r', encoding='UTF-8'))

    spark = SparkSession.builder \
        .master("local") \
        .appName("Word Count") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    df = spark.createDataFrame([["0"], ["1"], ["2"], ["3"], ["4"]], ["id"])
    df.show()

    vecAssembler = VectorAssembler(inputCols=["feat1", "feat2"], outputCol="features")
    new_df = vecAssembler.transform(df)
    kmeans = KMeans(k=2, seed=1)  # 2 clusters here
    model = kmeans.fit(new_df.select('features'))
    transformed = model.transform(new_df)
    print(transformed.show())
irisNormDf = si_model.transform(responses)
print(irisNormDf.select("Species", "SPECIES_Catogery").distinct().collect())
for i in irisNormDf.columns:
    if not (isinstance(irisNormDf.select(i).take(1)[0][0], six.string_types)):
        print("Correlation to for ", i, irisNormDf.stat.corr('SPECIES_Catogery', i))
# [Row(Species='versicolor', SPECIES_Catogery=0.0), Row(Species='setosa', SPECIES_Catogery=2.0), Row(Species='virginica', SPECIES_Catogery=1.0)]

iris_final = irisNormDf.drop('Species')
vectorAssembler = VectorAssembler(
    inputCols=['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth'],
    outputCol='features')
iris_final = vectorAssembler.transform(iris_final)
iris_final = iris_final.select(['features', 'SPECIES_Catogery'])
# print(vauto_df)
iris_final.show(3)

random.seed(100)
splits = iris_final.randomSplit([0.8, 0.2])
train_df = splits[0]
test_df = splits[1]
print(train_df.count())
print(test_df.count())

##############################----DECISION TREE CLASSIFICATION----########################################
dtreeeClassifer = DecisionTreeClassifier(maxDepth=2,
# Load the training data into a dataframe
data = spark.read.format('json').load('train.jsonl')
data = clean_tokenize_remove_stopwords_quora(data)

# Get the tf-idf features
data = tf_idf_features_quora(data)

# Get the text features
data = text_features(data)

# Combine all the features
feature_assembler = VectorAssembler(
    inputCols=["tf_idf_features", "text_features"],
    outputCol="combined_features"
)
data = feature_assembler.transform(data)

# Normalizing each feature to have unit standard deviation
scaler = StandardScaler(inputCol="combined_features", outputCol="features", withStd=True, withMean=False)
scalerModel = scaler.fit(data)

# Normalize each feature to have unit standard deviation.
data = scalerModel.transform(data)

# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
label_indexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)

# Automatically identify categorical features, and index them.
feature_indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=2).fit(data)
.builder\
        .appName("VectorSizeHintExample")\
        .getOrCreate()

    # $example on$
    dataset = spark.createDataFrame(
        [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0),
         (0, 18, 1.0, Vectors.dense([0.0, 10.0]), 0.0)],
        ["id", "hour", "mobile", "userFeatures", "clicked"])

    sizeHint = VectorSizeHint(
        inputCol="userFeatures",
        handleInvalid="skip",
        size=3)

    datasetWithSize = sizeHint.transform(dataset)
    print("Rows where 'userFeatures' is not the right size are filtered out")
    datasetWithSize.show(truncate=False)

    assembler = VectorAssembler(
        inputCols=["hour", "mobile", "userFeatures"],
        outputCol="features")

    # This dataframe can be used by downstream transformers as before
    output = assembler.transform(datasetWithSize)
    print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
    output.select("features", "clicked").show(truncate=False)
    # $example off$

    spark.stop()
'Room_Board', 'Books', 'Personal', 'PhD', 'Terminal', 'S_F_Ratio',
    'perc_alumni', 'Expend', 'Grad_Rate'],
    outputCol="features")

# In[86]:

output = assembler.transform(data)

# Deal with Private column being "yes" or "no"

# In[87]:

from pyspark.ml.feature import StringIndexer

# In[88]:

indexer = StringIndexer(inputCol="Private", outputCol="PrivateIndex")
output_fixed = indexer.fit(output).transform(output)
def combine_columns(columns, df, out_col):
    assembler = VectorAssembler(inputCols=columns, outputCol=out_col)
    return assembler.transform(df)
#Loading the Student_Grades_Data.csv file, uploaded in previous step
data = spark.read.csv('Student_Grades_Data.csv', header=True, inferSchema=True)

#Taking a look at the data type of each column to see what data types inferSchema=True has set for each column
data.printSchema()

#Display first few rows of data
data.show()

#Create a Feature array by omitting the last column
feature_cols = data.columns[:-1]
from pyspark.ml.feature import VectorAssembler
vect_assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

#Utilize the Assembler created above in order to add the feature column
data_w_features = vect_assembler.transform(data)

#Display the data having the additional column named features. Had it been a multiple linear regression problem, you could see all the
#independent variable values combined in one list
data_w_features.show()

#Select only Features and Label from previous dataset as we need these two entities for building machine learning model
finalized_data = data_w_features.select("features", "Grades")
finalized_data.show()

#Split the data into training and test sets with 70% obs. going in training and 30% in testing
train_dataset, test_dataset = finalized_data.randomSplit([0.7, 0.3])

#Peek into training data
train_dataset.describe().show()
dfAvgStock = dfStock.groupby('stock_hour', 'company').agg(F.mean('close'), F.mean('volume'))
dfJoin = dfAvgSent.join(dfAvgStock, (dfAvgSent.comp == dfAvgStock.company) & (dfAvgSent.tweet_hour == dfAvgStock.stock_hour + 5))
dfJoin = dfJoin.withColumnRenamed("avg(sentiment)", "avg-sentiment")
dfJoin = dfJoin.withColumnRenamed("avg(close)", "avg-close")
dfJoin = dfJoin.withColumnRenamed("avg(volume)", "avg-volume")
dfJoin = dfJoin.withColumnRenamed("avg(followers_count)", "avg-followers")
dfJoin.show()

# COMMAND ----------

from pyspark.ml.feature import VectorAssembler

dfJoin1 = dfJoin.select("avg-sentiment", "avg-followers", "avg-volume")
inputFeatures = ["avg-sentiment", "avg-followers", "avg-volume"]
assembler = VectorAssembler(inputCols=inputFeatures, outputCol="features")
dfJoin2 = assembler.transform(dfJoin1)

# COMMAND ----------

# Scaling features
scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")
scalerModel = scaler.fit(dfJoin2)
scaledData = scalerModel.transform(dfJoin2)
scaledData.select("features", "scaledFeatures").show()

# COMMAND ----------

# Elbow method
import numpy as np
cost = np.zeros(10)
for k in range(2, 10):
import findspark
findspark.init()

spark = SparkSession.builder.appName("SICP7").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

# Load data and select feature and label columns
data = spark.read.format("csv").option("header", True).option("inferSchema", True).option("delimiter", ",").load("C:/Users/bharani/PycharmProjects/SICP7/adult.data")
data = data.withColumnRenamed("age", "label").select("label", "education-num", "hours-per-week")
data = data.select(data.label.cast("double"), "education-num", "hours-per-week")

# Create vector assembler for feature columns
assembler = VectorAssembler(inputCols=data.columns[1:], outputCol="features")
data = assembler.transform(data)
data.show()

# Split data into training and test data set
training, test = data.select("label", "features").randomSplit([0.85, 0.15])

# Create Naive Bayes model and fit the model with training dataset
nb = NaiveBayes()
model = nb.fit(training)

# Generate prediction from test dataset
predictions = model.transform(test)

# Evaluate the accuracy of the model
evaluator = MulticlassClassificationEvaluator()
accuracy = evaluator.evaluate(predictions)

# Show model accuracy
print("Accuracy:", accuracy)

# Report
predictionAndLabels = predictions.select("label", "prediction").rdd
def main(): appName = "ukhouseprices" spark = s.spark_session(appName) spark.sparkContext._conf.setAll(v.settings) sc = s.sparkcontext() # # Get data from Hive table regionname = "Kensington and Chelsea" tableName = "ukhouseprices" fullyQualifiedTableName = v.DSDB + "." + tableName summaryTableName = v.DSDB + "." + "summary" start_date = "2010" end_date = "2020" lst = (spark.sql( "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') " )).collect() print("\nStarted at") uf.println(lst) # Model predictions spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true") #summary_df = spark.sql(f"""SELECT cast(date_format(datetaken, "yyyyMM") as int) as datetaken, flatprice, terracedprice, semidetachedprice, detachedprice FROM {summaryTableName}""") summary_df = spark.sql( f"""SELECT cast(Year as int) as year, AVGFlatPricePerYear, AVGTerracedPricePerYear, AVGSemiDetachedPricePerYear, AVGDetachedPricePerYear FROM {v.DSDB}.yearlyhouseprices""" ) df_10 = summary_df.filter( col("year").between(f'{start_date}', f'{end_date}')) print(df_10.toPandas().columns.tolist()) # show pandas column list ['Year', 'AVGPricePerYear', 'AVGFlatPricePerYear', 'AVGTerracedPricePerYear', 'AVGSemiDetachedPricePerYear', 'AVGDetachedPricePerYear'] p_dfm = df_10.toPandas() # converting spark DF to Pandas DF data = p_dfm.values # Non-Linear Least-Squares Minimization and Curve Fitting model = LorentzianModel() n = len(p_dfm.columns) for i in range(n): if p_dfm.columns[i] != 'year': # year is x axis in integer # it goes through the loop and plots individual average curves one by one and then prints a report for each y value vcolumn = p_dfm.columns[i] print(vcolumn) params = model.guess(p_dfm[vcolumn], x=p_dfm['year']) result = model.fit(p_dfm[vcolumn], params, x=p_dfm['year']) result.plot_fit() # do linear regression here # Prepare data for Machine Learning.And we need two columns only — features and label(p_dfm.columns[i]]): inputCols = ['year'] vectorAssembler = VectorAssembler(inputCols=inputCols, outputCol='features') vhouse_df = vectorAssembler.transform(df_10) vhouse_df = vhouse_df.select( ['features', 'AVGFlatPricePerYear']) vhouse_df.show(20) if vcolumn == "AVGFlatPricePerYear": plt.xlabel("Year", fontdict=v.font) plt.ylabel("Flat house prices in millions/GBP", fontdict=v.font) plt.title( f"""Flat price fluctuations in {regionname} for the past 10 years """, fontdict=v.font) plt.text(0.35, 0.45, "Best-fit based on Non-Linear Lorentzian Model", transform=plt.gca().transAxes, color="grey", fontsize=10) print(result.fit_report()) plt.xlim(left=2009) plt.xlim(right=2022) plt.show() plt.close() elif vcolumn == "AVGTerracedPricePerYear": plt.xlabel("Year", fontdict=v.font) plt.ylabel("Terraced house prices in millions/GBP", fontdict=v.font) plt.title( f"""Terraced house price fluctuations in {regionname} for the past 10 years """, fontdict=v.font) plt.text(0.35, 0.45, "Best-fit based on Non-Linear Lorentzian Model", transform=plt.gca().transAxes, color="grey", fontsize=10) print(result.fit_report()) plt.show() plt.close() elif vcolumn == "AVGSemiDetachedPricePerYear": plt.xlabel("Year", fontdict=v.font) plt.ylabel("semi-detached house prices in millions/GBP", fontdict=v.font) plt.title( f"""semi-detached house price fluctuations in {regionname} for the past 10 years """, fontdict=v.font) plt.text(0.35, 0.45, "Best-fit based on Non-Linear Lorentzian Model", transform=plt.gca().transAxes, color="grey", fontsize=10) print(result.fit_report()) plt.show() plt.close() elif vcolumn == "AVGDetachedPricePerYear": 
plt.xlabel("Year", fontdict=v.font) plt.ylabel("detached house prices in millions/GBP", fontdict=v.font) plt.title( f"""detached house price fluctuations in {regionname} for the past 10 years """, fontdict=v.font) plt.text(0.35, 0.45, "Best-fit based on Non-Linear Lorentzian Model", transform=plt.gca().transAxes, color="grey", fontsize=10) print(result.fit_report()) plt.show() plt.close() p_df = df_10.select('AVGFlatPricePerYear', 'AVGTerracedPricePerYear', 'AVGSemiDetachedPricePerYear', 'AVGDetachedPricePerYear').toPandas().describe() print(p_df) #axs = scatter_matrix(p_df, figsize=(10, 10)) # Describe returns a DF where count,mean, min, std,max... are values of the index y = p_df.loc[['min', 'mean', 'max']] #y = p_df.loc[['averageprice', 'flatprice']] ax = y.plot(linewidth=2, colormap='jet', marker='.', markersize=20) plt.grid(True) plt.xlabel("UK House Price Index, January 2020", fontdict=v.font) plt.ylabel("Property Prices in millions/GBP", fontdict=v.font) plt.title( f"""Property price fluctuations in {regionname} for the past 10 years """, fontdict=v.font) plt.legend(p_df.columns) plt.show() plt.close() lst = (spark.sql( "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') " )).collect() print("\nFinished at") uf.println(lst)
# Setup the Spark context and SQL Context (note: this is for Spark < 2.0.0)
sc = SparkContext(appName="DistKeras ATLAS Higgs example")
sqlContext = SQLContext(sc)

# Read the Higgs dataset.
dataset = sqlContext.read.format('com.databricks.spark.csv')\
    .options(header='true', inferSchema='true').load("data/atlas_higgs.csv")

# Print the schema of the dataset.
dataset.printSchema()

# Vectorize the features into the features column.
features = dataset.columns
features.remove('EventId')
features.remove('Weight')
features.remove('Label')
assembler = VectorAssembler(inputCols=features, outputCol="features")
dataset = assembler.transform(dataset)

# Since the output layer will not be able to read the string label, convert it to a double.
labelIndexer = StringIndexer(inputCol="Label", outputCol="label_index").fit(dataset)
dataset = labelIndexer.transform(dataset)

# Feature normalization.
standardScaler = StandardScaler(inputCol="features", outputCol="features_normalized", withStd=True, withMean=True)
standardScalerModel = standardScaler.fit(dataset)
dataset = standardScalerModel.transform(dataset)

# Define the structure of the dataset.
nb_features = len(features)
nb_classes = 2
spark = SparkSession.builder.appName("RateSourceLKF").getOrCreate() spark.sparkContext.setLogLevel("WARN") noise_param = 1 input_df = spark.readStream.format("rate").option("rowsPerSecond", mps).load()\ .withColumn("mod", F.col("value") % num_states)\ .withColumn("stateKey", F.col("mod").cast("String"))\ .withColumn("trend", (F.col("value")/num_states).cast("Integer") + F.randn() * noise_param) lkf = LinearKalmanFilter()\ .setStateKeyCol("stateKey")\ .setMeasurementCol("measurement")\ .setInitialStateMean(Vectors.dense([0.0, 0.0]))\ .setInitialStateCovariance(Matrices.dense(2, 2, [10000.0, 0.0, 0.0, 10000.0]))\ .setProcessModel(Matrices.dense(2, 2, [1.0, 0.0, 1.0, 1.0]))\ .setProcessNoise(Matrices.dense(2, 2, [0.0001, 0.0, 0.0, 0.0001]))\ .setMeasurementNoise(Matrices.dense(1, 1, [noise_param]))\ .setMeasurementModel(Matrices.dense(1, 2, [1.0, 0.0])) assembler = VectorAssembler(inputCols=["trend"], outputCol="measurement") measurements = assembler.transform(input_df) query = lkf.transform(measurements)\ .writeStream\ .queryName("RateSourceLKF")\ .outputMode("append")\ .format("console")\ .start() query.awaitTermination()
def main():
    # Setup Spark
    spark = SparkSession.builder.master("local[*]").getOrCreate()

    # Nice way to write a tmp file onto the system
    temp_csv_file = tempfile.mktemp()
    with open(temp_csv_file, mode="wb") as f:
        data_https = requests.get("https://teaching.mrsharky.com/data/iris.data")
        f.write(data_https.content)

    iris_df = spark.read.csv(temp_csv_file, inferSchema="true", header="true")
    iris_df = iris_df.toDF("sepal_length", "sepal_width", "petal_length", "petal_width", "class")
    iris_df.createOrReplaceTempView("iris")
    iris_df.persist(StorageLevel.DISK_ONLY)

    # Simple SQL
    results = spark.sql("SELECT * FROM iris")
    results.show()

    # Average for each of the 4
    average_overall = spark.sql(
        """
        SELECT
                AVG(sepal_length) AS avg_sepal_length
                , AVG(sepal_width) AS avg_sepal_width
                , AVG(petal_length) AS avg_petal_length
                , AVG(petal_width) AS avg_petal_width
            FROM iris
        """
    )
    average_overall.show()

    # Average for each of the 4 by class
    average_by_class = spark.sql(
        """
        SELECT
                class
                , AVG(sepal_length) AS avg_sepal_length
                , AVG(sepal_width) AS avg_sepal_width
                , AVG(petal_length) AS avg_petal_length
                , AVG(petal_width) AS avg_petal_width
            FROM iris
            GROUP BY class
        """
    )
    average_by_class.show()

    # Add a new column
    iris_df = iris_df.withColumn("rand", functions.rand(seed=42))
    iris_df.createOrReplaceTempView("iris")
    results = spark.sql("SELECT * FROM iris ORDER BY rand")
    results.show()

    vector_assembler = VectorAssembler(
        inputCols=["sepal_length", "sepal_width", "petal_length", "petal_width"],
        outputCol="vector",
    )
    iris_df = vector_assembler.transform(iris_df)
    iris_df.show()

    # Numberize the class column of iris
    string_indexer = StringIndexer(inputCol="class", outputCol="indexed")
    indexer_fitted = string_indexer.fit(iris_df)
    iris_df = indexer_fitted.transform(iris_df)
    iris_df.createOrReplaceTempView("iris")
    results = spark.sql("SELECT * FROM iris ORDER BY rand")
    results.show()
    return

    # Random Forest
    random_forest_classifier = RandomForestClassifier(featuresCol="vector", labelCol="indexed")
    random_forest_classifier_fitted = random_forest_classifier.fit(iris_df)
    iris_df = random_forest_classifier_fitted.transform(iris_df)
    iris_df.createOrReplaceTempView("iris")
    results = spark.sql("SELECT * FROM iris ORDER BY rand")
    results.show()

    # Calculate the model's Accuracy
    print_heading("Accuracy")
    iris_df_accuracy = spark.sql(
        """
        SELECT SUM(correct) / COUNT(*) AS accuracy
            FROM
                (SELECT CASE WHEN prediction == class_idx THEN 1 ELSE 0 END AS correct
                    FROM predicted) AS TMP
        """
    )
    iris_df_accuracy.show()
"mortdue":avg("mortdue"), "value":avg("value"), "derog":avg("derog"), "delinq":0, "clage":avg("clage"), "ninq":avg("ninq"), "clno":avg("clno"), "debtinc":avg("debtinc") }) """ #Define Input-output columns, i.e. transform to MLP features vector ignore=['bad'] assembler = VectorAssembler( inputCols=[k for k in clean_riskdata.columns if k not in ignore], outputCol="predictors") Triskdata = assembler.transform(clean_riskdata) # Split the data into train and test splits = Triskdata.randomSplit([0.4, 0.6], 1234) train = splits[0] test = splits[1] ################################################################# # Preliminary analysis ################################################################# print(clean_riskdata.describe().show()) print(riskdata.stat.crosstab("bad","job").show()) print(riskdata.stat.crosstab("bad","reason").show()) ################################################################# # Multilayer Perceptron Classifier #################################################################
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import os

df = sqlContext.read.json(os.environ['WORKDIR'] + "user_features.json")
df_restaurants = df.filter("category = \"Restaurants\"")

assembler = VectorAssembler(
    inputCols=["average_stars",
               "cat_avg_review_len",
               "cat_avg_stars",
               "cat_business_count",
               "cat_review_count",
               "months_yelping",
               "review_count",
               "votes_cool",
               "votes_funny",
               "votes_useful"],
    outputCol="features")
output = assembler.transform(df_restaurants)

(trainingData, testData) = output.randomSplit([0.7, 0.3])

dt = DecisionTreeRegressor(labelCol="elite", featuresCol="features")
pipeline = Pipeline(stages=[dt])
model = pipeline.fit(trainingData)

predictions = model.transform(testData)
predictions.select("prediction", "elite", "features").show(5)

evaluator = RegressionEvaluator(labelCol="elite", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print "Root Mean Squared Error (RMSE) on test data = %g" % rmse
df_second_round = df_second_round.join(dosage_mapping, df_second_round.DOSAGE == dosage_mapping.CPA_DOSAGE, how="left").na.fill("")
df_second_round = df_second_round.withColumn("EFFTIVENESS_DOSAGE_SE",
                                             dosage_replace(df_second_round.MASTER_DOSAGE,
                                                            df_second_round.DOSAGE_STANDARD,
                                                            df_second_round.EFFTIVENESS_DOSAGE))
df_second_round = df_second_round.withColumn("EFFTIVENESS_PRODUCT_NAME_SE",
                                             prod_name_replace(df_second_round.MOLE_NAME,
                                                               df_second_round.MOLE_NAME_STANDARD,
                                                               df_second_round.MANUFACTURER_NAME,
                                                               df_second_round.MANUFACTURER_NAME_STANDARD,
                                                               df_second_round.MANUFACTURER_NAME_EN_STANDARD))
df_second_round = df_second_round.withColumn("EFFTIVENESS_PACK_QTY_SE",
                                             pack_replace(df_second_round.EFFTIVENESS_PACK_QTY,
                                                          df_second_round.SPEC_ORIGINAL,
                                                          df_second_round.PACK_QTY,
                                                          df_second_round.PACK_QTY_STANDARD))
assembler = VectorAssembler(
    inputCols=["EFFTIVENESS_MOLE_NAME", "EFFTIVENESS_PRODUCT_NAME_SE", "EFFTIVENESS_DOSAGE_SE",
               "EFFTIVENESS_SPEC", "EFFTIVENESS_PACK_QTY_SE", "EFFTIVENESS_MANUFACTURER"],
    outputCol="features")
df_second_round = assembler.transform(df_second_round)
# df_second_round.repartition(10).write.mode("overwrite").parquet("s3a://ph-max-auto/2020-08-11/BPBatchDAG/refactor/alfred/second_round_dt")

predictions_second_round = model.transform(df_second_round)
predictions_second_round.write.mode("overwrite").parquet("s3a://ph-max-auto/2020-08-11/BPBatchDAG/refactor/zyyin/second_round_prediction1106_1")
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions_second_round)
print("Test Error = %g " % (1.0 - accuracy))
print("Test set accuracy = " + str(accuracy))

# Second-round accuracy check
df_true_positive_se = predictions_second_round.where(predictions_second_round.prediction == 1.0)
ph_positive_prodict_se = df_true_positive_se.count()
print("Second-round TP rows judged by the model = " + str(ph_positive_prodict_se))
ph_positive_hit_se = df_true_positive_se.where((df_true_positive_se.prediction == df_true_positive_se.label) & (df_true_positive_se.label == 1.0)).count()
'Avg Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership'],
    outputCol='features')

# COMMAND ----------

#type(vectorassember)

# COMMAND ----------

print(vectorassember)

# COMMAND ----------

output = vectorassember.transform(data)

# COMMAND ----------

#df3.printSchema()

# COMMAND ----------

#from pyspark.sql.types import IntegerType
#df3 = df3.withColumn("air_time", df3['air_time'].cast(IntegerType()))

# COMMAND ----------

#df3.printSchema()

# COMMAND ----------
# Create training and test data sets
training_data, test_data = data.randomSplit([0.8, 0.2], seed=7)

print('Test data')
print(test_data.groupby('label').count().show())
print('Training data')
print(training_data.groupby('label').count().show())
print(training_data.show())

# New data set with the following columns:
#   - 'label'    - class.
#   - 'features' - a vector containing the particular attributes.
assembler = VectorAssembler(inputCols=column_names, outputCol='features')

training_data = assembler.transform(training_data)
training_data = training_data.select('label', 'features')
# training_data = training_data.drop(*column_names)

test_data = assembler.transform(test_data)
test_data = test_data.select('label', 'features')
# test_data = test_data.drop(*column_names)

print(training_data.take(1))

# Scale
training_scale, _ = standardScale(training_data)
print('\nScaled training data (Standard)')
print(training_scale.take(1))
# training_scale.write.csv('db/training_scale', header=True)
# training_scale.rdd.saveAsPickleFile('db/training_scale')
training_scale, _ = minMaxScale(training_data)
# Initialize SparkSession
spark = (SparkSession
         .builder
         .appName("news")
         .enableHiveSupport()
         .getOrCreate())

# Read raw data
df = spark.read.csv('/home/worker/data/news.csv', header=True, inferSchema=True, mode="DROPMALFORMED", encoding='UTF-8')
print("==== Raw data ====")
df.show(truncate=False)

assembler = VectorAssembler(inputCols=df.columns[1:], outputCol="変量")
feature_vectors = assembler.transform(df)
feature_vectors.show()

print("==== Training LightGBM ====")
model = LightGBMRegressor(alpha=0.3,
                          learningRate=0.3,
                          numIterations=100,
                          numLeaves=31,
                          featuresCol='変量',
                          labelCol='スポーツ').fit(feature_vectors)

print("==== Row count of the original DataFrame ====")
print((df.count(), len(df.columns)))
StructField("\"\"\"\"chlorides\"\"\"\"", FloatType(), True), StructField("\"\"\"\"free sulfur dioxide\"\"\"\"", FloatType(), True), StructField("\"\"\"\"total sulfur dioxide\"\"\"\"", FloatType(), True), StructField("\"\"\"\"density\"\"\"\"", FloatType(), True), StructField("\"\"\"\"pH\"\"\"\"", FloatType(), True), StructField("\"\"\"\"sulphates\"\"\"\"", FloatType(), True), StructField("\"\"\"\"alcohol\"\"\"\"", FloatType(), True), StructField("\"\"\"\"quality\"\"\"\"", FloatType(), True) ]) testing = spark.read.format("csv").option("header", "true").option( "delimiter", ";").schema(schema).load("s3n://643-pa2/ValidationDataset.csv") vectorAssembler = VectorAssembler(inputCols=[ "\"\"\"\"\"fixed acidity\"\"\"\"", "\"\"\"\"volatile acidity\"\"\"\"", "\"\"\"\"citric acid\"\"\"\"", "\"\"\"\"residual sugar\"\"\"\"", "\"\"\"\"chlorides\"\"\"\"", "\"\"\"\"free sulfur dioxide\"\"\"\"", "\"\"\"\"total sulfur dioxide\"\"\"\"", "\"\"\"\"density\"\"\"\"", "\"\"\"\"pH\"\"\"\"", "\"\"\"\"sulphates\"\"\"\"", "\"\"\"\"alcohol\"\"\"\"" ], outputCol='features') test_data = vectorAssembler.transform(testing) predictions = Model.transform(test_data) predictionAndLabels = predictions.select( ['prediction', "\"\"\"\"quality\"\"\"\""]).rdd # Instantiate metrics object metrics = MulticlassMetrics(predictionAndLabels) # Overall statistics print("F1 Score: " + str(metrics.weightedFMeasure()))
df = process(df_base)

from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark import StorageLevel
from pyspark.ml.clustering import GaussianMixture

for i in range(6):
    n = 10**i
    for k in [5, 25, 50, 100, 500, 1000]:
        with Timer('limit', 'Limiting data, n={}, k={}'.format(n, k)):
            df_ik = df.limit(n)
        with Timer('clustering', 'n={}, k={}'.format(n, k)):
            gmm = GaussianMixture(k=k)
            va = VectorAssembler(inputCols=["pickup_latitude", "pickup_longitude"], outputCol="features")
            df_t = va.transform(df_ik)
            model = gmm.fit(df_t)
            df_p = model.transform(df_t)
            df_pp = df_p.select('pickup_latitude', 'pickup_longitude', 'prediction').toPandas()
ds = ds_raw.select([ds_raw.columns[3]] + [to_float(col(column)).alias(column) for column in ds_raw.columns[4:]])

from pyspark.ml.feature import StringIndexer
categoryIndexer = StringIndexer(inputCol="alchemy_category", outputCol="alchemy_category_index")
categoryTransformer = categoryIndexer.fit(ds)
df1 = categoryTransformer.transform(ds)

from pyspark.ml.feature import OneHotEncoder
encoder = OneHotEncoder(dropLast=False, inputCol="alchemy_category_index", outputCol="alchemy_category_index_vector")
df2 = encoder.transform(df1)

from pyspark.ml.feature import VectorAssembler
assemblerInput = ['alchemy_category_index_vector'] + ds.columns[1:-1]
assembler = VectorAssembler(inputCols=assemblerInput, outputCol="features")
df3 = assembler.transform(df2)

# deal with categorical label
from pyspark.ml.feature import StringIndexer
# Index labels, adding metadata to the label column
labelIndexer = StringIndexer(inputCol='label', outputCol='indexedLabel').fit(df3)
df4 = labelIndexer.transform(df3)

from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="features", impurity="gini", maxDepth=10, maxBins=14)
dt_model = dt.fit(df4)
df5 = dt_model.transform(df4)

# Convert indexed labels back to original labels.
from pyspark.ml.feature import IndexToString
def main(): """ - Downloads outstanding data - Sets up Spark environment - Loads data - Summarises data - Merges data - Prepares data for modelling :return: None """ # --- Download data (if its not already downloaded) if config.MODE == 'prod': # In production mode we want to download all the csv files # Datasets in develop are controlled by the user # Download and save taxi journey data download_data(config.TAXI_DATA_URLS, config.TAXI_DATA_DIR) # Download and save road traffic accident data download_data(config.ACCIDENT_DATA_URLS, config.ACCIDENT_DATA_DIR) # --- Set up Spark environment spark = SparkSession.builder.appName('Basics').getOrCreate() # --- Load data # Load and parse taxi data, add an id column taxi_df = load_data(spark, data_dir=config.TAXI_DATA_DIR, schema=config.TAXI_DATA_SCHEMA)\ .withColumn(config.TAXI_ID_COL, monotonically_increasing_id()) logging.info( f'Selecting a random {config.SAMPLE_RATE * 100}% of data before merging' ) # Get a random 1% of data with random seed=1 splits = taxi_df.randomSplit([1 - config.SAMPLE_RATE, config.SAMPLE_RATE], seed=1) taxi_df = splits[1] # Load and parse accident data, add an id column and a timestamp column accident_df = load_data(spark, data_dir=config.ACCIDENT_DATA_DIR, schema=config.ACCIDENT_DATA_SCHEMA)\ .withColumn(config.ACCIDENT_ID_COL, monotonically_increasing_id())\ .withColumn('accident_timestamp', unix_timestamp('date', 'MM/dd/yyyy').cast('timestamp')) # --- Summarise data # Plot and save data summary (if its not already saved) # plot_summary(taxi_df, 'pickup_datetime', config.TAXI_ID_COL, config.TAXI_VOLUME_PLOT_FILE) # plot_summary(accident_df, 'accident_timestamp', config.ACCIDENT_ID_COL, config.ACCIDENT_VOLUME_PLOT_FILE) # --- Create ML features # Merge nearby accidents with taxi trips (this is a very long running process) df = merge_accidents(taxi_df, accident_df) # Create day of week, hour of day values from time stamp df = timestamps_to_features(df, 'pickup_datetime') df = timestamps_to_features(df, 'dropoff_datetime') # --- Log the results of the data preparation stages logging.info('Data preparation complete') # RDD has a countApprox() function that is quicker than .count(), unsure if the conversion from DataFrame to RDD # cancels out the savings logging.info(f'Number of rows: {df.rdd.countApprox(10)}') logging.info(f'Data schema: \n{df._jdf.schema().treeString()}') # TODO extract all this to function(s) # --- Train a gradient boosted trees model to predict the duration of a trip # Create the target variable ('label' is a special column name in MLlib logging.info( 'Creating label column (based on the delta between dropoff_datetime and pickup_datetime)' ) df = df.withColumn( 'label', unix_timestamp(df['dropoff_datetime']) - unix_timestamp(df['pickup_datetime'])) logging.info('Dropping rows that contain null (for a subset of columns)') # Drop any samples with a NULL value df = df.na.drop( how="any", subset=[config.TAXI_DATA_SCHEMA.fieldNames()].extend([ 'pickup_datetime_day_of_week', 'pickup_datetime_hour_of_day', 'pickup_datetime_month_of_year', 'dropoff_datetime_day_of_week', 'dropoff_datetime_hour_of_day', 'dropoff_datetime_month_of_year' ])) logging.info('Filling any remaining null values with 99999') # Filling na values with code 99999 df = df.na.fill(value=99999) logging.info('Spliting records into training and testing sets') # Split the data into training and test sets (30% held out for testing) (training_df, testing_df) = df.randomSplit([0.7, 0.3]) # Define an "VectorAssembler", which joins multiple columns 
    # into a single vector
    ignore = ['label', 'pickup_datetime', 'dropoff_datetime', config.ACCIDENT_ID_COL, config.TAXI_ID_COL]
    assembler = VectorAssembler(
        inputCols=[col for col in df.columns if col not in ignore],
        outputCol='features')

    # Transform the data using the defined assembler
    logging.info('Transforming data using VectorAssembler')
    df = assembler.transform(df)

    # Define a "VectorIndexer" which converts categorical fields (defined by having less than 20 unique values) into
    # one-hot (?) encoded data
    feature_indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures").fit(df.select('features'))

    # Train a GBT model
    gbt = GBTRegressor(featuresCol="indexedFeatures", maxIter=10)

    # Chain assembler, indexer and GBT in a Pipeline
    pipeline = Pipeline(stages=[assembler, feature_indexer, gbt])

    logging.info('Running model pipeline')
    # Train model. This also runs the indexer.
    model_pipeline = pipeline.fit(training_df)

    logging.info('Making model predictions')
    # Make predictions
    predictions = model_pipeline.transform(testing_df)

    # Select example rows to display
    predictions.select("prediction", "label", "features").show(5)

    logging.info('Evaluating predictions')
    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    logging.info(f'Root Mean Squared Error (RMSE) on test data: {rmse}')

    # TODO extract this to a function
    # Save the outputs (predictions and model)
    import uuid
    run_id = uuid.uuid4()
    date = datetime.date.today()
    model_pipeline.save(f'{config.MODEL_DIR}/{run_id}/model-{config.MODE}-{date}')
    predictions.select("prediction", "label").write.csv(
        f'{config.PREDICTIONS_DIR}/{run_id}/predictions-{config.MODE}-{date}.csv')

    # Print some cool stuff about the model
    gbt_model = model_pipeline.stages[2]
    logging.info(gbt_model)
    attrs = sorted(
        (attr["idx"], attr["name"])
        for attr in (chain(*df.schema["features"].metadata["ml_attr"]["attrs"].values())))
    for idx, name in attrs:
        if gbt_model.featureImportances[idx]:
            print(name, gbt_model.featureImportances[idx])
def main():
    # TODO - Check if valid CSV file path
    input_file = sys.argv[1]

    spark = SparkSession \
        .builder \
        .master("local[*]") \
        .appName("cs643-prediction") \
        .getOrCreate()

    # TODO - This is how docker container file structure should be
    loaded_regression_model = LinearRegressionModel.load("/data/model/trained-model")

    # read dataset to predict
    input_dataset = spark.read.csv(input_file, header='true', inferSchema='true', sep=';')

    assembler = VectorAssembler(inputCols=[
        input_dataset.columns[1], input_dataset.columns[2], input_dataset.columns[3],
        input_dataset.columns[4], input_dataset.columns[5], input_dataset.columns[6],
        input_dataset.columns[7], input_dataset.columns[8], input_dataset.columns[9],
        input_dataset.columns[10]
    ], outputCol="Attributes")
    valid_output = assembler.transform(input_dataset)
    valid_finalized_data = valid_output.select("Attributes", input_dataset.columns[11])

    # predict the quality
    input_predictions = loaded_regression_model.transform(valid_finalized_data)

    data_eval = RegressionEvaluator(labelCol=input_dataset.columns[11], predictionCol="prediction", metricName="rmse")

    # r2 - coefficient of determination
    r2 = data_eval.evaluate(input_predictions, {data_eval.metricName: "r2"})
    print("\n\n\n")
    print("r2: %.3f" % r2)

    # Root Mean Square Error
    rmse = data_eval.evaluate(input_predictions)
    print("Root Mean Squared Error (RMSE): %g" % rmse)

    # Mean Square Error
    mse = data_eval.evaluate(input_predictions, {data_eval.metricName: "mse"})
    print("MSE: %g" % mse)

    # Mean Absolute Error
    mae = data_eval.evaluate(input_predictions, {data_eval.metricName: "mae"})
    print("MAE: %g" % mae)

    # Check if user provided how many rows to print
    if args.o is not None:
        input_predictions.show(int(args.o), truncate=False)
    else:
        input_predictions.show(truncate=False)
# map() applies the given function to every element of
# an iterable and returns the result.
OneHot = map(lambda c: c + "classVec", categoricalColumns)  # Target not included
OneHot = list(OneHot)
OneHot  # List of names

# Compile a list of all OneHot and numerical cols for assembling into 'features'
assemblerInputs = OneHot + numericCols
assemblerInputs
len(assemblerInputs)  # 8 (OneHot) + 6 (numeric) = 14

# Create a VectorAssembler to apply to assemblerInputs.
# The output column is conventionally named 'features'; it holds all predictors.
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
df = assembler.transform(df)

# Examine df
df.take(2)
df.columns
df.dtypes
len(df.columns)  # 18 (one per generated column above) + 15 (original cols) = 35

######################### CC. Modeling Data #########################

# Keep only the relevant columns.
# Note that we now need just two columns: label and features.
# You can experiment by removing others (cols).
selectedcols = ["label", "features"] + cols  # i.e. 15 + 2 = 17; ignore the others
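# For context, the *classVec names above assume each categorical column was
# indexed and one-hot encoded in an earlier step. A minimal sketch of that
# step for a single column, using the Spark 3.x OneHotEncoder API; the column
# name "workclass" and the raw_df variable are only illustrative.
from pyspark.ml.feature import OneHotEncoder, StringIndexer

# Index the string column, then one-hot encode the index into <col>classVec;
# repeating this per categorical column yields the inputs assembled above.
indexed = StringIndexer(inputCol="workclass",
                        outputCol="workclassIndex").fit(raw_df).transform(raw_df)
encoded = OneHotEncoder(inputCols=["workclassIndex"],
                        outputCols=["workclassclassVec"]).fit(indexed).transform(indexed)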
# MAGIC A decision tree is a simple representation for classifying examples. For this section, assume that all of the input features have finite discrete domains, and that there is a single target feature called the "classification". Each element of the domain of the classification is called a class. A decision tree or a classification tree is a tree in which each internal (non-leaf) node is labeled with an input feature. The arcs coming from a node labeled with an input feature are labeled with each of the possible values of the target or output feature, or the arc leads to a subordinate decision node on a different input feature. Each leaf of the tree is labeled with a class or a probability distribution over the classes, signifying that the data set has been classified by the tree into either a specific class or a particular probability distribution over the classes.

# COMMAND ----------

# MAGIC %md
# MAGIC Let's build a decision tree using the training data set.

# COMMAND ----------

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier

# Vectorize the features (all columns excluding the first one, Survived)
features = trainDF.columns[1:]
assembler = VectorAssembler(inputCols=features, outputCol="features")
assembledTrainDF = assembler.transform(trainDF)

# Train a decision tree, setting the maxDepth parameter to 2
dtc = DecisionTreeClassifier(featuresCol="features", labelCol="Survived", maxDepth=2)
dtcModel = dtc.fit(assembledTrainDF)

# Print the constructed tree
print(dtcModel.toDebugString)

# COMMAND ----------

# Visualize the decision tree
display(dtcModel)

# COMMAND ----------
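# A natural next step, not shown in this notebook excerpt, is to score a
# held-out set and measure accuracy. A minimal sketch, assuming a testDF with
# the same columns as trainDF exists; the metric choice is illustrative.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Assemble the test set with the same assembler and score it with the fitted tree
assembledTestDF = assembler.transform(testDF)
predictions = dtcModel.transform(assembledTestDF)

evaluator = MulticlassClassificationEvaluator(labelCol="Survived",
                                              predictionCol="prediction",
                                              metricName="accuracy")
print("Test accuracy: {:.3f}".format(evaluator.evaluate(predictions)))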
'''
Create a Spark DataFrame and add features/labels for MLlib
'''
if DEBUG_SMALL:
    print("Running training on small data-set")
    traindf = sqlContext.createDataFrame(train_df[0:5000])
else:
    print("Running training on 80% data-set")
    traindf = sqlContext.createDataFrame(train_df)

# The transformations below bring the data frame into the format MLlib requires:
# a data frame with two columns, label and features, where the features column
# collects all predictor columns into a single vector.
labelIndexer = StringIndexer(inputCol="status_group", outputCol="indexedLabel").fit(traindf)
assembler = VectorAssembler(inputCols=train_columns, outputCol="features")
traindf = assembler.transform(traindf)
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=3).fit(traindf)

(trainingData, testData) = traindf.randomSplit([0.8, 0.2])

# Best params from sklearn: {'n_estimators': 120, 'random_state': 1, 'min_samples_split': 5,
#                            'max_features': 50, 'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 1}
#
# MLlib RandomForestClassifier signature (from the documentation):
# class pyspark.ml.classification.RandomForestClassifier(self, featuresCol="features", labelCol="label",
#     predictionCol="prediction", probabilityCol="probability", rawPredictionCol="rawPrediction",
#     maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256,
#     cacheNodeIds=False, checkpointInterval=10, impurity="gini", numTrees=20,
#     featureSubsetStrategy="auto", seed=None, subsamplingRate=1.0)

#########################################################################
# The code below is adapted from the Spark MLlib main guide
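# Continuing from the sklearn parameters listed above, a rough mapping onto the
# MLlib estimator could look like the sketch below. The equivalences are
# approximate, not one-to-one (e.g. sklearn's max_depth=None has no direct
# counterpart, since MLlib requires a finite maxDepth of at most 30).
from pyspark.ml.classification import RandomForestClassifier

# Approximate translation of the sklearn settings:
#   n_estimators=120   -> numTrees=120
#   min_samples_leaf=1 -> minInstancesPerNode=1
#   random_state=1     -> seed=1
#   max_depth=None     -> a large finite depth (assumption: 20)
rf = RandomForestClassifier(labelCol="indexedLabel",
                            featuresCol="indexedFeatures",
                            numTrees=120,
                            minInstancesPerNode=1,
                            maxDepth=20,
                            seed=1)

# Apply the already-fitted indexers to the training split, then fit the forest
train_prepared = featureIndexer.transform(labelIndexer.transform(trainingData))
rf_model = rf.fit(train_prepared)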
from pyspark.ml.feature import VectorAssembler

va = VectorAssembler()\
    .setInputCols(["Quantity", "UnitPrice"])\
    .setOutputCol("features")

sales = va.transform(spark.read.format("csv")
                     .option("header", "true")
                     .option("inferSchema", "true")
                     .load("/data/retail-data/by-day/*.csv")
                     .limit(50)
                     .coalesce(1)
                     .where("Description IS NOT NULL"))
sales.cache()

# COMMAND ----------

from pyspark.ml.clustering import KMeans

km = KMeans().setK(5)
print(km.explainParams())
kmModel = km.fit(sales)

# COMMAND ----------

summary = kmModel.summary
print(summary.clusterSizes)  # number of points per cluster
kmModel.computeCost(sales)
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
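# On recent Spark versions KMeansModel.computeCost is deprecated in favor of
# ClusteringEvaluator. A minimal sketch of the replacement, reusing the fitted
# kmModel and the cached sales frame from above.
from pyspark.ml.evaluation import ClusteringEvaluator

# Score the clustering with the silhouette metric instead of computeCost
predictions = kmModel.transform(sales)
evaluator = ClusteringEvaluator(featuresCol="features",
                                predictionCol="prediction")
print("Silhouette:", evaluator.evaluate(predictions))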
indexer = StringIndexer(inputCol="education", outputCol="new_education")
indexed = indexer.fit(new_data).transform(new_data)

indexer1 = StringIndexer(inputCol="sex", outputCol="new_sex")
indexed1 = indexer1.fit(indexed).transform(indexed)

indexer2 = StringIndexer(inputCol="relationship", outputCol="new_rel")
indexed2 = indexer2.fit(indexed1).transform(indexed1)

indexed2 = indexed2.drop("sex", "education", "relationship")
indexed2.show()

# Create a vector assembler for the feature columns
assembler = VectorAssembler(inputCols=indexed2.columns[1:], outputCol="features")
data = assembler.transform(indexed2)

# Split data into training and test data sets
training, test = data.select("label", "features").randomSplit([0.6, 0.4])

# Create a Random Forest model and fit it with the training dataset
rf = RandomForestClassifier()
model = rf.fit(training)

# Generate predictions from the test dataset
predictions = model.transform(test)

# Evaluate the accuracy of the model
# (the evaluator's default metric is f1, so request accuracy explicitly)
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

# Show model accuracy
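# The trailing comment suggests reporting the result. A minimal continuation,
# also computing f1 for comparison; the print format is illustrative.
print("Test accuracy: {:.3f}".format(accuracy))
f1 = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})
print("Test f1 score: {:.3f}".format(f1))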
# Encode the dependent variable, Category, as a numeric index
classifyIndexer = StringIndexer(inputCol="Category", outputCol="Category_Index")
classifymodel = classifyIndexer.fit(encoded)
encoded2 = classifymodel.transform(encoded)

# Keep the following columns: x, y, hour, day, month, year, dayofweek, week, x_sim, y_sim
# and drop the rest
cleaned = encoded2.select([c for c in encoded2.columns if c not in
                           {'DayOfWeek', 'Category', 'Address', 'Dates', 'Descript',
                            'PdDistrict', 'Resolution', 'PdDistrict_Index'}])

ignore = ['Category_Index']
assembler = VectorAssembler(inputCols=[x for x in cleaned.columns if x not in ignore],
                            outputCol='features')
transformed = assembler.transform(cleaned)
data_transformed = transformed.select(col("Category_Index").alias("label"),
                                      col("features")).map(
                                          lambda row: LabeledPoint(row.label, row.features))

# ********************************************************************************
# Split the training set
train, test = data_transformed.randomSplit([0.7, 0.3], seed=2)

# Naive Bayes classifier (lambda = 1.0)
# Initialize and train the classifier; this step takes about 50 seconds
nb_model = mllib_class.NaiveBayes.train(train, 1.0)

# Make predictions and test accuracy.
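# Following the last comment, a minimal sketch of scoring the held-out split
# with the RDD-based model: accuracy is the fraction of matching
# (prediction, label) pairs.
predictionAndLabel = test.map(lambda p: (float(nb_model.predict(p.features)), p.label))
accuracy = predictionAndLabel.filter(lambda pl: pl[0] == pl[1]).count() / float(test.count())
print("Naive Bayes test accuracy: %.3f" % accuracy)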
def fit(
    self,
    sdf: DataFrame,
    label_col: str = "label",
    sdf_validation: Optional[DataFrame] = None,
    estimator_params: Optional[Dict[str, object]] = None,
    explainer_type_params: Optional[Dict[str, object]] = None,
    explainer_params: Optional[Dict[str, object]] = None,
    broadcast: bool = True,
) -> "SparkSelector":
    """Fit the Spark selector with the provided estimator.

    Args:
        sdf: The training input samples.
        label_col: The target column name.
        sdf_validation: The validation input samples.
        estimator_params: Additional parameters for the underlying estimator's fit method.
        explainer_type_params: Additional parameters for the explainer's init.
        explainer_params: Additional parameters for the explainer's shap_values method.
        broadcast: Whether to broadcast the target column when joining.
    """
    # Check if pyspark and pyarrow are installed
    if DataFrame is None or importlib.util.find_spec("pyarrow") is None:
        raise ImportError(
            "SparkSelector requires both pyspark and pyarrow.")

    # Validate parameters
    self._validate_params()

    # Set estimator parameters
    self.estimator.setFeaturesCol(SPARK_FEATURES_NAME)
    self.estimator.setLabelCol(label_col)

    # Make sure that check_additivity is disabled (it's not supported for Spark estimators)
    explainer_params = self._set_additivity_false(explainer_params)

    # Assemble the features vector
    features = [col for col in sdf.columns if col != label_col]
    assembler = VectorAssembler(inputCols=features,
                                outputCol=SPARK_FEATURES_NAME,
                                handleInvalid="keep")
    sdf = assembler.transform(sdf)

    # With the progress bar
    with tqdm(total=self.n_iter, disable=not self.verbose) as pbar:
        # Get the true shap values (i.e. without shuffling)
        pbar.set_description("Computing true SHAP values")
        true_pos_shap_values, true_neg_shap_values = self._get_shap_values(
            sdf,
            label_col=label_col,
            shuffle=False,
            sdf_validation=sdf_validation,
            estimator_params=estimator_params,
            explainer_type_params=explainer_type_params,
            explainer_params=explainer_params,
        )

        # Get the null shap values (i.e. with shuffling)
        pbar.set_description("Computing null SHAP values")
        null_pos_shap_values = [None] * self._n_outputs
        null_neg_shap_values = [None] * self._n_outputs
        for i in range(self.n_iter):
            self._current_iter = i + 1
            if self.verbose:
                logger.info(
                    f"Iteration {self._current_iter}/{self.n_iter}")
            pos_shap_values, neg_shap_values = self._get_shap_values(
                sdf,
                label_col=label_col,
                shuffle=True,
                sdf_validation=sdf_validation,
                estimator_params=estimator_params,
                explainer_type_params=explainer_type_params,
                explainer_params=explainer_params,
                broadcast=broadcast,
            )
            for j in range(self._n_outputs):
                if i == 0:
                    null_pos_shap_values[j] = pos_shap_values[j].to_frame()
                    null_neg_shap_values[j] = neg_shap_values[j].to_frame()
                else:
                    null_pos_shap_values[j] = null_pos_shap_values[j].join(
                        pos_shap_values[j], rsuffix=f"_{self._current_iter}")
                    null_neg_shap_values[j] = null_neg_shap_values[j].join(
                        neg_shap_values[j], rsuffix=f"_{self._current_iter}")
            pbar.update(1)

    # Compute p-values
    self.p_values_ = self._compute_p_values(true_pos_shap_values,
                                            null_pos_shap_values,
                                            true_neg_shap_values,
                                            null_neg_shap_values)

    # Cleanup
    self._n_outputs = None
    self._X_with_index = None
    self._X_for_shap = None

    return self
# Train and calibrate the model, adjusting its internal parameters and checking the results on the evaluation set
from pyspark.sql.types import *
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler

# Define the dataset for predicting the ARR_DEL15 label
# Build a DataFrame with the label column and the assembled features column
ignore = ['label']
assembler = VectorAssembler(
    inputCols=[x for x in train.columns if x not in ignore],
    outputCol='features')
train_LP = assembler.transform(train).select(['label', 'features'])
evaluation_LP = assembler.transform(evaluation).select(['label', 'features'])

# Define the model algorithm (decision tree)
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", maxDepth=10, maxBins=64)

# Fit the model
model = dt.fit(train_LP)

# Save the model
# model.save("dbfs:/dataset/modelo_binario_DT")

# Make predictions.
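# Following the final comment, a minimal sketch of scoring the evaluation
# split with the evaluator imported above; the choice of the accuracy metric
# is illustrative.
predictions = model.transform(evaluation_LP)
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
print("Evaluation accuracy:", evaluator.evaluate(predictions))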
# LOAD DATA
dataset = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load("/mapreduce-test/pro_1/shot_logs.csv")

# FILTER DATA TO GROUP BY PLAYER
# (df is assumed to be the same shot log loaded separately as a pandas DataFrame)
df = df[df.SHOT_RESULT != 'missed']
df = df.drop(columns=['GAME_ID', 'SHOT_RESULT', 'MATCHUP', 'LOCATION', 'W', 'FINAL_MARGIN',
                      'SHOT_NUMBER', 'PERIOD', 'GAME_CLOCK', 'DRIBBLES', 'CLOSEST_DEFENDER',
                      'CLOSEST_DEFENDER_PLAYER_ID', 'FGM', 'PTS', 'player_id', 'PTS_TYPE',
                      'TOUCH_TIME'])
df = df.groupby('player_name')
df = df.mean()
training_set = x = df[['SHOT_CLOCK', 'SHOT_DIST', 'CLOSE_DEF_DIST']]

# EXPLORATION OF K VALUES
# Cast the columns of interest to DoubleType so they can be assembled into a feature vector
# (data_cleaned is assumed to be the cleaned Spark DataFrame prepared from `dataset` earlier)
data_of_interest = data_cleaned \
    .withColumn('CLOSE_DEF_DIST', data_cleaned['CLOSE_DEF_DIST'].cast(DoubleType())) \
    .withColumn('SHOT_DIST', data_cleaned['SHOT_DIST'].cast(DoubleType())) \
    .withColumn('SHOT_CLOCK', data_cleaned['SHOT_CLOCK'].cast(DoubleType()))
feature_vector = VectorAssembler(inputCols=['CLOSE_DEF_DIST', 'SHOT_DIST', 'SHOT_CLOCK'],
                                 outputCol="features")
transform_data = feature_vector.transform(data_of_interest)

player_names = transform_data.select("player_name").distinct().collect()
list_items = list()
evaluator = ClusteringEvaluator()

# Get the silhouette (squared Euclidean distance) for k ranging from 2 to 7
TotalSED = []
for player in player_names:
    features = transform_data.where(transform_data["player_name"] == player[0]).select("features")
    for k in range(2, 8):
        kmeans = KMeans(featuresCol='features', k=k)
        model = kmeans.fit(features)
        predictions = model.transform(features)
        silhouette = evaluator.evaluate(predictions)
        print("With K={}".format(k))
        print("Silhouette with squared euclidean distance = " + str(silhouette))
from __future__ import print_function

# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("VectorAssemblerExample")\
        .getOrCreate()

    # $example on$
    dataset = spark.createDataFrame(
        [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],
        ["id", "hour", "mobile", "userFeatures", "clicked"])

    assembler = VectorAssembler(
        inputCols=["hour", "mobile", "userFeatures"],
        outputCol="features")

    output = assembler.transform(dataset)
    print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
    output.select("features", "clicked").show(truncate=False)
    # $example off$

    spark.stop()
# from pyspark.ml.feature import StringIndexer
# StringIndexer converts each unique string into a numeric index
# indexer = StringIndexer(inputCol="txtlabel", outputCol="label")
# indexed = indexer.fit(mydf).transform(mydf)
# indexed.show(5)

# Now we need to create "label" and "features" columns,
# the input expected by the Spark ML library
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors

# Use these features to predict whether there will be a fatality
assembler = VectorAssembler(
    inputCols=["age_n", "wt_n", "gndr_n", "druglisthash", "medcount"],
    outputCol="features")

output = assembler.transform(fda20k)
# Note the column headers - label and features are the default names Spark ML expects
output.show(3)

from pyspark.ml.classification import LogisticRegression

# Create a LogisticRegression instance. This instance is an Estimator.
lr = LogisticRegression(maxIter=10, regParam=0.01)
# Print out the parameters, documentation, and any default values.
print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

# Learn a LogisticRegression model. This uses the parameters stored in lr.
model1 = lr.fit(output)

#### Major shortcut - no train and test data!!!
# Since model1 is a Model (i.e., a transformer produced by an Estimator),
# we can view the parameters it used during fit().
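# The "major shortcut" comment flags that the model is fit and inspected on
# the same data. A minimal sketch of the missing train/test split on the
# assembled output frame; the 70/30 ratio and seed are illustrative, and a
# binary 'label' column is assumed.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Hold out 30% of the assembled data for testing instead of fitting on everything
train_df, test_df = output.randomSplit([0.7, 0.3], seed=42)
model2 = lr.fit(train_df)
test_predictions = model2.transform(test_df)

# Area under the ROC curve on the held-out data
evaluator = BinaryClassificationEvaluator(labelCol="label")
print("Test AUC:", evaluator.evaluate(test_predictions))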
result.show()

# Rename column count(IP) to IP
result = result.withColumnRenamed("count(IP)", "IP")

# Drop null values
result = result.dropna(how="any", subset=["IP", "Time"])
result.show()

# Convert the datetime column to a unix timestamp
result = result.withColumn("Time", unix_timestamp(result.Time))
result.show()

# Cast the IP count column to an integer
result = result.withColumn("IP", result["IP"].cast(IntegerType()))

# Convert features to vectors with VectorAssembler - required by ML models
# Note: 'Time' appears both in the feature vector and as the label below, so the fit is close to trivial
assembler = VectorAssembler(inputCols=['IP', 'Time'], outputCol='features')
v_result = assembler.transform(result)
v_result = v_result.select(['features', 'Time'])

splits = v_result.randomSplit([0.7, 0.3])
train_df = splits[0]
test_df = splits[1]

lr = LinearRegression(featuresCol='features', labelCol='Time',
                      maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(train_df)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

lrModelSummary = lr_model.summary
print("Train R2 Score: ", lrModelSummary.r2)
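# Only the training R2 is reported above. A minimal sketch of scoring the
# 30% hold-out split as well, using the same label column.
from pyspark.ml.evaluation import RegressionEvaluator

test_predictions = lr_model.transform(test_df)
rmse_eval = RegressionEvaluator(labelCol='Time', predictionCol='prediction', metricName='rmse')
r2_eval = RegressionEvaluator(labelCol='Time', predictionCol='prediction', metricName='r2')
print("Test RMSE:", rmse_eval.evaluate(test_predictions))
print("Test R2 Score:", r2_eval.evaluate(test_predictions))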
"neighbourhood_cleansed", "room_type", "bedrooms", "bathrooms", "number_of_reviews", "price", ).show(5) trainDF, testDF = airbnbDF.randomSplit([0.8, 0.2], seed=42) print( f"""There are {trainDF.count()} rows in the training set, and {testDF.count()} in the test set""" ) vecAssembler = VectorAssembler(inputCols=["bedrooms"], outputCol="features") vecTrainDF = vecAssembler.transform(trainDF) vecTrainDF.select("bedrooms", "features", "price").show(10) lr = LinearRegression(featuresCol="features", labelCol="price") lrModel = lr.fit(vecTrainDF) m = round(lrModel.coefficients[0], 2) b = round(lrModel.intercept, 2) print(f"""The formula for the linear regression line is price = {m}*bedrooms + {b}""") pipeline = Pipeline(stages=[vecAssembler, lr]) pipelineModel = pipeline.fit(trainDF) predDF = pipelineModel.transform(testDF) predDF.select("bedrooms", "features", "price", "prediction").show(10)
print(label_indexed.take(1))

# COMMAND ----------

# MAGIC %md
# MAGIC Next, we will use the VectorAssembler() to combine all the feature columns into a single vector column. This will include both the numeric columns and the one-hot encoded binary vector columns in our dataset.

# COMMAND ----------

# Transform all features into a vector using VectorAssembler
assembler = VectorAssembler(
    inputCols=["age", "workclassclassVec", "fnlwgt", "educationclassVec", "education_num",
               "marital_statusclassVec", "occupationclassVec", "relationshipclassVec",
               "raceclassVec", "sexclassVec", "capital_gain", "capital_loss",
               "hours_per_week", "native_countryclassVec"],
    outputCol="features")
output = assembler.transform(label_indexed)

# Keep relevant columns
selectedcols = ["label", "features"] + cols
dataset = output.select(selectedcols)
display(dataset)

# COMMAND ----------

### Randomly split data into training and test sets; set the seed for reproducibility
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)
print(trainingData.count())
print(testData.count())

# COMMAND ----------
& (col("startLon") >= lonWest) & (col("startLon") <= lonEast) & (col("startLat") >= latSouth) & (col("startLat") <= latNorth) & (col("endLon") >= lonWest) & (col("endLon") <= lonEast) & (col("endLat") >= latSouth) & (col("endLat") <= latNorth)) taxi = taxi.select('startLon', 'startLat', 'tip') GA = taxi.rdd.map(lambda row: row.asDict()) GA.saveToMongoDB('mongodb://localhost:27017/POC1.données') vec_assembler = VectorAssembler(inputCols=taxi.columns, outputCol='features') final_data = vec_assembler.transform(taxi) kmeans = KMeans(featuresCol='features', k=12) model = kmeans.fit(final_data) centers = model.clusterCenters() print("Cluster Centers: ") A = [] for center in centers: print(center.tolist()) A.append(center.tolist()) resultat= sc.parallelize(A)\ .toDF(['startlon','startlat','tip']) GA = resultat.rdd.map(lambda row: row.asDict())