def na_imputer(self, strategy, out_columns="*", na=None, columns="*"):
    """Replace missing values in the selected columns with their mean or median.

    A custom missing-value marker (e.g. 999) can be supplied via ``na``;
    otherwise Spark's Imputer default (NaN/null) applies.  By default the
    imputed values overwrite the input columns.

    :param strategy: "mean" or "median".
    :param out_columns: output column name(s); "*" means "same as columns",
        so the original columns are overridden.
    :param na: custom missing-value marker; None keeps the Imputer default.
    :param columns: input column name(s); "*" means every column.
    :return: the transformed DataFrame (also stored back on ``self._df``).
    """
    # Normalize the input-column argument to a list.
    if columns == "*":
        columns = self._df.schema.names
    elif isinstance(columns, str):
        columns = [columns]
    else:
        assert isinstance(
            columns, list), "Error: columns argument must be a string or a list!"
    # Normalize the output-column argument to a list.  Per the documented
    # contract, "*" means "overwrite the input columns" — previously it
    # expanded to *all* schema columns, which broke the length assertion
    # below whenever only a subset of columns was imputed.
    if out_columns == "*":
        out_columns = list(columns)
    elif isinstance(out_columns, str):
        out_columns = [out_columns]
    else:
        assert isinstance(
            out_columns, list
        ), "Error: output columns argument must be a string or a list!"
    # Input and output column lists must pair up one-to-one.
    assert len(columns) == len(
        out_columns
    ), "Error: inconsistent lengths for argument of columns list and output columns list"
    # Only the two strategies supported by Spark's Imputer are allowed.
    assert (strategy == "mean"
            or strategy == "median"), "Error: strategy can only be 'mean' or 'median'."
    # Spark's Imputer requires float/double input columns, so cast first.
    for col in columns:
        self._df = self._df.withColumn(col, self._df[col].cast(FloatType()))
    # Fit the imputation model, optionally with a custom missing marker.
    imputer = Imputer(inputCols=columns, outputCols=out_columns)
    if na is None:
        model = imputer.setStrategy(strategy).fit(self._df)
    else:
        model = imputer.setStrategy(strategy).setMissingValue(na).fit(
            self._df)
    self._df = model.transform(self._df)
    return self._df
def prepocess_data(df):
    """Drop unused columns, mean-impute every remaining column in place,
    print the resulting schema, and return the DataFrame."""
    # Dimension reduction: these columns are not needed downstream.
    df = df.drop('Date', 'Time', 'Sub_metering_1', 'Sub_metering_2',
                 'Sub_metering_3')
    # Fixing missing values (dataset uses ? as NaN for missing values):
    # input and output column lists are identical, so the imputed values
    # overwrite the originals.
    mean_imputer = Imputer(inputCols=df.columns, outputCols=df.columns)
    mean_imputer.setStrategy("mean")
    df = mean_imputer.fit(df).transform(df)
    # Print the column name and datatype
    print(df.dtypes)
    return df
def cleanDraftData(postion):
    """Load and clean the NFL draft data for the requested position.

    Null ages are filled with the median age of the data set (the median
    was chosen over the mean).  Passing "RB", "QB" or "WR" keeps only that
    position; any other value returns all offensive skill players
    (WR, RB, TE, QB, FB).
    """
    unCleanData = spark.read.format("csv").option("header", "true").option(
        "inferSchema", "true").load("./data/NflDraftData/draftData.csv")
    # drop columns we don't need
    unCleanData = unCleanData.select("Rnd", "Pick", "Player Name", "Pos",
                                     'Age', 'College', 'Draft Year')
    if (postion == "RB" or postion == "QB" or postion == "WR"):
        unCleanData = unCleanData.where(unCleanData["Pos"] == postion)
    else:
        # Return all of the offensive skill players (WR, RB, TE, QB, FB):
        # drop linemen on both offense and defense as well as defensive
        # players and special teams.
        droppedPostions = [
            'DE', 'DT', 'T', 'O', 'G', 'C', 'K', 'NT', 'DL', 'OL', 'LS',
            'LB', 'DB', 'P', 'OLB', 'CB', 'S', 'ILB'
        ]
        # With only offensive players we are down to ~2000 data points.
        # BUG FIX: the loop variable used to be named `postion`, silently
        # shadowing the function parameter.
        for droppedPos in droppedPostions:
            unCleanData = unCleanData.where(unCleanData["Pos"] != droppedPos)
    # Cast the numeric columns to doubles so the Imputer can fit them.
    doubleCols = ['Age', 'Rnd', 'Pick', 'Draft Year']
    for c in doubleCols:
        unCleanData = unCleanData.withColumn(c,
                                             unCleanData[c].cast(DoubleType()))
    # Fill null ages with the median age.
    imputer = Imputer(inputCols=["Age"], outputCols=["Age"])
    cleanData = imputer.setStrategy("median").fit(unCleanData).transform(
        unCleanData)
    #cleanData.show()
    return cleanData
def impute_missing(df, columns, out_cols, strategy='mean'):
    """
    Imputes missing data from specified columns using the mean or median.

    Parameters
    ----------
    columns : List of columns to be analyze.
    out_cols: List of output columns with missing values imputed.
    strategy: String that specifies the way of computing missing data. Can be "mean" or "median"

    return : Transformer object (DF with columns that has the imputed values).
    """
    # Every requested input column must exist in the DataFrame.
    assert_cols_in_df(df, columns_provided=columns, columns_df=df.columns)
    assert isinstance(columns, list), "Error: columns argument must be a list"
    assert isinstance(out_cols, list), "Error: out_cols argument must be a list"
    # The strategy argument must be a string naming a supported method.
    assert_type_str(df, strategy, "strategy")
    assert strategy in ("mean", "median"), \
        "Error: strategy has to be 'mean' or 'median'. 'mean' is default"
    # Fit the imputation model and apply it to the frame.
    model = Imputer(inputCols=columns,
                    outputCols=out_cols).setStrategy(strategy).fit(df)
    return model.transform(df)
def impute(input_cols, output_cols, strategy="mean"):
    """
    Imputes missing data from specified columns using the mean or median.

    :param input_cols: List of columns to be analyze.
    :param output_cols: List of output columns with missing values imputed.
    :param strategy: String that specifies the way of computing missing data. Can be "mean" or "median"
    :return: Dataframe object (DF with columns that has the imputed values).
    """
    # Resolve the column arguments against the DataFrame.  NOTE(review):
    # `self` is a free variable here (the enclosing DataFrame, presumably
    # bound by a surrounding accessor) — confirm before reusing elsewhere.
    resolved_in = parse_columns(self, input_cols)
    resolved_out = val_to_list(output_cols)
    df = self
    # Fit an Imputer on the resolved columns and transform the frame.
    fitted = Imputer(inputCols=resolved_in,
                     outputCols=resolved_out).setStrategy(strategy).fit(df)
    return fitted.transform(df)
def impute(columns, strategy="mean"):
    """
    Imputes missing data from specified columns using the mean or median.

    :param columns: List of columns to be analyze.
    :param strategy: String that specifies the way of computing missing data. Can be "mean" or "median"
    :return: Dataframe object (DF with columns that has the imputed values).
    """
    # Restrict to numeric columns; the Imputer cannot handle other dtypes.
    # NOTE(review): `self` is a free variable (the enclosing DataFrame,
    # presumably bound by a surrounding accessor).
    columns = parse_columns(self, columns,
                            filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)
    df = self
    # Spark's Imputer requires float/double input, not merely numeric.
    for name in columns:
        df = df.cols.cast(name, "float")
    # Imputed values land in new "<column><suffix>" output columns.
    output_cols = [name + IMPUTE_SUFFIX for name in columns]
    model = Imputer(inputCols=columns,
                    outputCols=output_cols).setStrategy(strategy).fit(df)
    return model.transform(df)
def impute(cols):
    # Impute `cols` in place on the wrapped DataFrame.  NOTE(review):
    # `out_cols`, `strategy` and `self` are free variables resolved from
    # the enclosing scope — confirm they are bound before calling.
    model = Imputer(inputCols=cols,
                    outputCols=out_cols).setStrategy(strategy).fit(self._df)
    self._df = model.transform(self._df)
# All numeric columns of the renamed housing frame.
colNum = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
          'total_bedrooms', 'population', 'households', 'median_income',
          'median_house_value', 'bedrooms_per_room',
          'population_per_household']


def report_null_counts(frame):
    # Print, for every column, how many rows hold a null value.
    for c in frame.columns:
        print(c, " has null values : ",
              frame.filter(frame[c].isNull()).count())


# Show null counts before imputation.
report_null_counts(renamedHousing)

# Median-impute the two columns with missing data into new "out_" columns;
# the value 414 is additionally treated as a missing-value marker.
imputer = Imputer()
imputer.setInputCols(["total_bedrooms", "bedrooms_per_room"])
imputer.setOutputCols(["out_total_bedrooms", "out_bedrooms_per_room"])
imputedHousing = (imputer.setStrategy('median').setMissingValue(414)
                  .fit(renamedHousing).transform(renamedHousing))
# The raw columns are superseded by their imputed copies.
imputedHousing = imputedHousing.drop('total_bedrooms', 'bedrooms_per_room')

# Verify no nulls remain after imputation.
report_null_counts(imputedHousing)

colNum_to_scale = ['longitude', 'latitude', 'housing_median_age',
                   'total_rooms', 'population', 'households',
                   'median_income', 'rooms_per_household',
                   'population_per_household', 'out_total_bedrooms',
                   'out_bedrooms_per_room']

# Assemble the numeric columns into a single feature vector...
va = VectorAssembler().setInputCols(colNum_to_scale).setOutputCol('features')
featuredHousing = va.transform(imputedHousing)
featuredHousing.show()

# ...then standardize it to zero mean and unit variance.
scaler = StandardScaler(withMean=True, withStd=True)
scaler.setInputCol("features").setOutputCol("scaled_features")
scaledHousing = scaler.fit(featuredHousing).transform(featuredHousing)
scaledHousing.select('scaled_features').show()
df1.select([count(when(isnull(c), c)).alias(c) for c in df1.columns]).show() #Vi tæller antal gange hvornår værdierne i c (kolonne) er null, i hver c. Det selecter vi, og renammer (alias) denne count #til hvad end kolonnen c hedder. Det gøres i et for loop. imputer = Imputer(inputCols=[ "Plasma glucose concentration", "Diastolic blood pressure", "Triceps skinfold thickness", "2-Hour serum insulin", "Body mass index" ], outputCols=[ "Plasma glucose concentration", "Diastolic blood pressure", "Triceps skinfold thickness", "2-Hour serum insulin", "Body mass index" ]) imputer = imputer.setStrategy('median').setMissingValue(np.nan) new_df = imputer.fit(df1).transform(df1) new_col_names = new_df.columns features2 = new_df.rdd.map(lambda row: row[0:]) corr_mat2 = Statistics.corr(features2, method="pearson") corr_df2 = pd.DataFrame(corr_mat2) corr_df2.index, corr_df2.columns = new_col_names, new_col_names #print(corr_df2.to_string()) #imputer.fit(df1).transform(df1).show() #imputer.setStrategy('median').setMissingValue(np.nan).fit(df1).transform(df1).show() sns.set(style='ticks') sns.pairplot(new_df.toPandas(),