def _convert_vector(df, dtype):
    from pyspark.ml.functions import vector_to_array
    from pyspark.ml.linalg import VectorUDT
    from pyspark.mllib.linalg import VectorUDT as OldVectorUDT

    for field in df.schema:
        col_name = field.name
        if isinstance(field.dataType, (VectorUDT, OldVectorUDT)):
            df = df.withColumn(col_name, vector_to_array(df[col_name], dtype))
    return df
def cast_spark_df_with_vector_to_array(input_spark_df):
    """
    Finds columns of vector type in a spark dataframe and casts them to array<double> type.

    :param input_spark_df: the input spark dataframe
    :return: a spark dataframe with vector columns transformed to array<double> type
    """
    from functools import reduce
    from pyspark.ml.functions import vector_to_array
    from pyspark.ml.linalg import VectorUDT

    vector_type_columns = [
        _field.name for _field in input_spark_df.schema if isinstance(_field.dataType, VectorUDT)
    ]
    return reduce(
        lambda df, vector_col: df.withColumn(vector_col, vector_to_array(vector_col)),
        vector_type_columns,
        input_spark_df,
    )
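# Usage sketch for cast_spark_df_with_vector_to_array (not part of the original
# snippet): builds a toy DataFrame with a vector column and shows the cast to
# array<double>. Assumes PySpark >= 3.0 and a local SparkSession; column names
# are made up for illustration.
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.master("local[1]").getOrCreate()
toy_df = spark.createDataFrame(
    [(1, Vectors.dense([0.5, 1.5])), (2, Vectors.sparse(2, [0], [2.0]))],
    ["id", "features"],
)
casted_df = cast_spark_df_with_vector_to_array(toy_df)
casted_df.printSchema()  # "features" is now array<double> instead of vector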
def _convert_vector(df, precision):
    from pyspark.ml.linalg import VectorUDT
    from pyspark.mllib.linalg import VectorUDT as OldVectorUDT

    found_vectors = any(
        isinstance(struct_field.dataType, (VectorUDT, OldVectorUDT))
        for struct_field in df.schema
    )
    if not found_vectors:
        return df

    import pyspark
    from distutils.version import LooseVersion
    if LooseVersion(pyspark.__version__) < LooseVersion('3.0'):
        raise ValueError("Vector columns are not supported for pyspark<3.0.0.")
    # pylint: disable=import-error
    from pyspark.ml.functions import vector_to_array
    # pylint: enable=import-error

    for struct_field in df.schema:
        col_name = struct_field.name
        if isinstance(struct_field.dataType, (VectorUDT, OldVectorUDT)):
            df = df.withColumn(col_name, vector_to_array(df[col_name], precision))
    return df
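# Usage sketch for the _convert_vector variant defined directly above (the one
# taking a precision argument); not from the original source. The second
# argument is forwarded to vector_to_array as the element dtype, so "float32"
# yields array<float> and "float64" yields array<double>. Assumes PySpark >= 3.0
# and a local SparkSession.
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.master("local[1]").getOrCreate()
vec_df = spark.createDataFrame([(Vectors.dense([0.1, 0.2]),)], ["features"])
_convert_vector(vec_df, "float32").printSchema()  # features: array<float>
_convert_vector(vec_df, "float64").printSchema()  # features: array<double>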
def _validate_and_convert_feature_col_as_array_col(dataset, features_col_name):
    from pyspark.ml.functions import vector_to_array
    from pyspark.ml.linalg import VectorUDT
    from pyspark.sql.functions import col
    from pyspark.sql.types import (
        ArrayType,
        DoubleType,
        FloatType,
        IntegerType,
        LongType,
        ShortType,
    )

    features_col_datatype = dataset.schema[features_col_name].dataType
    features_col = col(features_col_name)
    if isinstance(features_col_datatype, ArrayType):
        if not isinstance(
            features_col_datatype.elementType,
            (DoubleType, FloatType, LongType, IntegerType, ShortType),
        ):
            raise ValueError(
                "If the feature column is array type, its elements must be numeric."
            )
        features_array_col = features_col.cast(ArrayType(FloatType())).alias("values")
    elif isinstance(features_col_datatype, VectorUDT):
        features_array_col = vector_to_array(features_col, dtype="float32").alias("values")
    else:
        raise ValueError(
            "The feature column must be array type or `pyspark.ml.linalg.Vector` type. "
            "If you want to use multiple numeric columns as features, please use "
            "`pyspark.ml.feature.VectorAssembler` to assemble them into a vector "
            "type column first."
        )
    return features_array_col
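# Usage sketch for _validate_and_convert_feature_col_as_array_col (column names
# here are illustrative, not from the original source). Given either an
# array-of-numbers column or a pyspark.ml.linalg.Vector column, the helper
# returns a Column expression that selects as a float array named "values".
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.master("local[1]").getOrCreate()
dataset = spark.createDataFrame(
    [(Vectors.dense([1.0, 2.0]), [3.0, 4.0])], ["vec_features", "arr_features"]
)
values_from_vector = _validate_and_convert_feature_col_as_array_col(dataset, "vec_features")
values_from_array = _validate_and_convert_feature_col_as_array_col(dataset, "arr_features")
dataset.select(values_from_vector).show()  # one array<float> column named "values"
dataset.select(values_from_array).show()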
def process_bus_data(bus_df):
    """ Method to process raw business data from Yelp."""

    def select_eligible_bus(row):
        """ Select businesses which fall into selected categories."""
        global categories
        try:
            # Return true if business falls into the category list, else false.
            row_cats = row.split(',')
            for cat in row_cats:
                if cat.strip() in categories:
                    return True
            return False
        except (TypeError, AttributeError):
            # Return false if business has no defined categories.
            return False

    def unpack_bus_attributes(row):
        """ Unpacks business attributes and assigns them an index value."""
        # List to store business attributes.
        unpacked = list()

        # Unpack all attributes except those handled separately below
        # (PriceRange, Parking and WiFi).
        temp = [row[s] for s in bus_attributes]

        # Process PriceRange.
        try:
            priceRange = int(row["RestaurantsPriceRange2"])
        except (TypeError, ValueError):
            # If no price range is specified, default to 2.
            priceRange = 2

        # Process Parking.
        try:
            parking = 1 if (row["BusinessParking"].find("True")) != -1 else -1
        except AttributeError:
            parking = 0

        # Process WiFi.
        if row["WiFi"] == 'no' or row["WiFi"] == "u'no'":
            wifi = -1
        elif row["WiFi"] is None:
            wifi = 0
        else:
            wifi = 1

        # Tokenize all Boolean attributes.
        for i in temp:
            if i == "True":
                unpacked.append(1)
            elif i == "False":
                unpacked.append(-1)
            else:
                unpacked.append(0)

        # Append the WiFi, Parking and PriceRange attributes.
        unpacked.append(wifi)
        unpacked.append(parking)
        unpacked.append(priceRange)

        # Print any arrays that are not of the desired length (=30).
        if len(unpacked) != 30:
            print(unpacked)

        return _convert_to_vector(
            csc_matrix(np.asarray(unpacked).astype(float)).T)

    def unpack_bus_categories(row):
        """ Unpacks all business categories."""
        # List to store business categories.
        unpacked = list()
        for cat in row.split(','):
            unpacked.append(cat.strip())
        return unpacked

    def unpack_price_range(row):
        """ Returns the price range."""
        return int(row[-1])

    # Package the functions above into Spark SQL user-defined functions.
    udf_select_eligible_bus = udf(select_eligible_bus, BooleanType())
    udf_unpack_bus_attributes = udf(unpack_bus_attributes, VectorUDT())
    udf_unpack_bus_categories = udf(unpack_bus_categories, ArrayType(StringType()))
    udf_unpack_price_range = udf(unpack_price_range, IntegerType())

    # Find businesses to include.
    eligible_bus = bus_df.withColumn("include", udf_select_eligible_bus(col("categories"))) \
                         .filter(col("include") == True)

    # Process the business attributes feature.
    all_bus_attributes = set(
        bus_df.select("attributes").take(1)[0].attributes.asDict().keys())
    bus_attributes_to_exclude = {
        'AcceptsInsurance', 'AgesAllowed', 'ByAppointmentOnly', 'Caters',
        'Corkage', 'DietaryRestrictions', 'HairSpecializesIn', 'Open24Hours',
        'RestaurantsAttire', 'RestaurantsPriceRange2', 'BusinessParking', 'WiFi'
    }
    bus_attributes = list(all_bus_attributes - bus_attributes_to_exclude)
    bus_attributes.sort()
    eligible_attr = eligible_bus.withColumn(
        "unpackedAttr", udf_unpack_bus_attributes(col("attributes")))

    # Process the business categories feature.
    eligible_cats = eligible_attr.withColumn(
        "unpackedCats", udf_unpack_bus_categories(col("categories")))
    cv = CountVectorizer(inputCol="unpackedCats", outputCol="vectorizedCats")
    vectorized_cats = cv.fit(eligible_cats).transform(eligible_cats)

    # Un-bundle price range from all other attributes.
    unpacked_pr = vectorized_cats.withColumn(
        "priceRange", udf_unpack_price_range(col("unpackedAttr")))
    unpacked_pr.take(1)

    # Reduce dimensions of the attributes and categories features, respectively.
    pca_attr = PCA(k=3, inputCol="unpackedAttr", outputCol="pcaAttr").fit(unpacked_pr)
    temp = pca_attr.transform(unpacked_pr)
    temp.show()
    pca_cats = PCA(k=1, inputCol="vectorizedCats", outputCol="pcaCats").fit(temp)
    temp2 = pca_cats.transform(temp)
    temp2.show()

    # Assemble into the final feature vector.
    va = VectorAssembler(
        inputCols=["stars", "priceRange", "pcaAttr", "pcaCats"],
        outputCol="featureVec")
    features = va.transform(temp2).select("business_id", "stars", "categories",
                                          "featureVec")
    features.take(1)

    # Unpack the feature vector into one numeric column per element.
    n_features = len(features.select("featureVec").take(1)[0].featureVec)
    final = features.withColumn("f", vector_to_array(col("featureVec"))) \
                    .select(["business_id", "stars", "categories"] +
                            [col("f")[i] for i in range(n_features)])

    return final, n_features
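# Isolated sketch of the unpacking step used at the end of process_bus_data
# (toy data and column names, not the Yelp schema): vector_to_array turns the
# vector column into an array, and indexing that array yields one numeric
# column per element. Assumes PySpark >= 3.0 and a local SparkSession.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.linalg import Vectors
from pyspark.ml.functions import vector_to_array

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([("a", Vectors.dense([1.0, 2.0, 3.0]))], ["id", "featureVec"])

n = len(df.select("featureVec").take(1)[0].featureVec)
flat = df.withColumn("f", vector_to_array(col("featureVec"))) \
         .select(["id"] + [col("f")[i] for i in range(n)])
flat.show()  # columns: id, f[0], f[1], f[2]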
#     url=db_url,
#     table="text_data_train_nm_tfidf",
#     mode="overwrite",
#     properties=db_properties,
# )
# test_finished.write.jdbc(
#     url=db_url,
#     table="text_data_test_nm_tfidf",
#     mode="overwrite",
#     properties=db_properties,
# )

# SVM
train_pred = train_pred.withColumn("rawPrediction", vector_to_array("rawPrediction"))
test_pred = test_pred.withColumn("rawPrediction", vector_to_array("rawPrediction"))

train_pred.createOrReplaceTempView("train_pred")
test_pred.createOrReplaceTempView("test_pred")

train_finished = spark.sql("""
    SELECT review_id,
           ROUND(rawPrediction[1], 3) AS svm_pred
    FROM train_pred
""")
test_finished = spark.sql("""
    SELECT review_id,
           ROUND(rawPrediction[1], 3) AS svm_pred
    FROM test_pred
""")
    )

    if scaler_model is None:
        scaler = StandardScaler(
            inputCol=temp_vector_col,
            outputCol=temp_normalized_vector_col,
            withStd=parse_parameter(bool, scale, "scale", True),
            withMean=parse_parameter(bool, center, "center", False),
        )
        scaler_model = fit_and_save_model(trained_parameters, "scaler_model", scaler, assembled_wo_nans)

    output_df = transform_using_trained_model(scaler_model, assembled, scaler_model_loaded)

    # Convert the resulting vector back to numeric.
    temp_flattened_vector_col = temp_col_name(output_df)
    output_df = output_df.withColumn(temp_flattened_vector_col, vector_to_array(temp_normalized_vector_col))

    # Keep only the final scaled column.
    output_column = input_column if output_column is None or not output_column else output_column
    output_column_value = sf.col(temp_flattened_vector_col)[0].alias(output_column)
    output_df = output_df.withColumn(output_column, output_column_value)
    final_columns = list(dict.fromkeys((list(df.columns) + [output_column])))
    output_df = output_df.select(final_columns)

    return default_spark_with_trained_parameters(output_df, trained_parameters)


def process_numeric_robust_scaler(
    df, input_column=None, lower_quantile=None,