Example #1
    def na_imputer(self, strategy, out_columns="*", na=None, columns="*"):
        """
        replace missing value with mean or median according to users' choice
        user can also customize the definition of missing value, e.g. 999
        the default missing value is 'nan' or null
        the default setting of out_columns is just columns, so the original columns will be overrided if not specially defined
        """

        #check columns
        if columns == "*":
            columns = self._df.schema.names
        elif isinstance(columns, str):
            columns = [columns]
        else:
            assert isinstance(
                columns,
                list), "Error: columns argument must be a string or a list!"

        if out_columns == "*":
            out_columns = self._df.schema.names

        #check output columns
        if isinstance(out_columns, str):
            out_columns = [out_columns]
        else:
            assert isinstance(
                out_columns, list
            ), "Error: output columns argument must be a string or a list!"

        #check input and output columns have consistent lengths
        assert len(columns) == len(out_columns), \
            "Error: columns and out_columns lists must have the same length"

        #check strategy argument
        assert strategy in ("mean", "median"), \
            "Error: strategy can only be 'mean' or 'median'."

        #first cast the input columns to FloatType, since Imputer only accepts float/double columns
        for col in columns:
            self._df = self._df.withColumn(col,
                                           self._df[col].cast(FloatType()))

        #fit the model
        imputer = Imputer(inputCols=columns, outputCols=out_columns)

        if na is None:
            model = imputer.setStrategy(strategy).fit(self._df)
        else:
            model = imputer.setStrategy(strategy).setMissingValue(na).fit(
                self._df)

        self._df = model.transform(self._df)

        return self._df
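
A minimal usage sketch for this method; the DataCleaner wrapper class and the sample data are assumptions, since Example #1 only shows the method and its self._df attribute:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1.0, 999.0), (None, 3.0), (4.0, 5.0)], "a double, b double")

cleaner = DataCleaner(df)   # hypothetical wrapper that stores df as self._df
result = cleaner.na_imputer(strategy="median", na=999.0, columns=["a", "b"])
result.show()   # nulls and the custom sentinel 999.0 are replaced by per-column medians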
Example #2
File: app.py Project: mledl/BDMA_HW
def prepocess_data(df):
    # Preprocessing the data
    # Dimension reduction
    cols_reduce = [
        'Date', 'Time', 'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3'
    ]
    df = df.drop(*cols_reduce)

    # Fix missing values (the dataset uses '?' to mark missing values)
    imputer = Imputer(inputCols=df.columns, outputCols=df.columns)
    imputer.setStrategy("mean")
    df = imputer.fit(df).transform(df)

    # Print the column names and datatypes
    print(df.dtypes)
    return df
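
Note that Spark's Imputer only accepts float/double input columns, so the '?' placeholders mentioned above must already have been turned into nulls and the measurement columns cast to numeric before this function runs. A minimal preparation sketch; the file path, separator, and column names are assumptions based on the UCI household power consumption dataset:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()
raw = spark.read.csv("household_power_consumption.txt", sep=";", header=True)  # assumed path and format

numeric_cols = ["Global_active_power", "Global_reactive_power", "Voltage", "Global_intensity"]
for c in numeric_cols:
    # Casting the string column to double turns unparseable values such as '?' into null,
    # which the Imputer above then fills with the column mean.
    raw = raw.withColumn(c, col(c).cast("double"))

df = prepocess_data(raw)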
Example #3
def cleanDraftData(position):
    '''
        [X] need to fill in nulls for Age with the average or the median of all the ages --> opted for the median
    '''
    unCleanData = spark.read.format("csv").option("header", "true").option(
        "inferSchema", "true").load("./data/NflDraftData/draftData.csv")

    # drop columns we don't need
    unCleanData = unCleanData.select("Rnd", "Pick", "Player Name", "Pos",
                                     'Age', 'College', 'Draft Year')

    if (position == "RB" or position == "QB" or position == "WR"):
        unCleanData = unCleanData.where(unCleanData["Pos"] == position)
    else:  # Return all of the skill offensive players (WR, RB, TE, QB, FB)
        # drop linemen, both offense and defense, as well as defensive players and special teams
        droppedPositions = [
            'DE', 'DT', 'T', 'O', 'G', 'C', 'K', 'NT', 'DL', 'OL', 'LS', 'LB',
            'DB', 'P', 'OLB', 'CB', 'S', 'ILB'
        ]  # With only offensive players we are down to ~2000 data points
        for dropped in droppedPositions:
            unCleanData = unCleanData.where(unCleanData["Pos"] != dropped)

    # Cast values to doubles
    doubleCols = ['Age', 'Rnd', 'Pick', 'Draft Year']
    for c in doubleCols:
        unCleanData = unCleanData.withColumn(c,
                                             unCleanData[c].cast(DoubleType()))

    # Fill in null Age values with the median
    imputer = Imputer(inputCols=["Age"], outputCols=["Age"])
    cleanData = imputer.setStrategy("median").fit(unCleanData).transform(
        unCleanData)
    #cleanData.show()
    return cleanData
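
A possible call site, assuming the module already defines a SparkSession named spark and the hard-coded CSV path exists (both are taken from the snippet above); the aggregation is only illustrative:

rb_data = cleanDraftData("RB")   # only running backs, with Age imputed by the median
rb_data.groupBy("Draft Year").avg("Age").orderBy("Draft Year").show()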
Example #4
def impute_missing(df, columns, out_cols, strategy='mean'):
    """
    Imputes missing data from specified columns using the mean or median.

    Parameters
    ----------
    columns : List of columns to be analyze.
    out_cols: List of output columns with missing values imputed.
    strategy: String that specifies the way of computing missing data. Can be "mean" or "median"
    
    return  : Transformer object (DF with columns that has the imputed values).
    """

    # Check that the columns to be processed are in the dataframe
    assert_cols_in_df(df, columns_provided=columns, columns_df=df.columns)

    assert isinstance(columns, list), "Error: columns argument must be a list"
    assert isinstance(out_cols,
                      list), "Error: out_cols argument must be a list"

    # Check that the strategy argument is a string:
    assert_type_str(df, strategy, "strategy")

    assert strategy in ("mean", "median"), \
        "Error: strategy has to be 'mean' or 'median'. 'mean' is the default."

    imputer = Imputer(inputCols=columns, outputCols=out_cols)
    model = imputer.setStrategy(strategy).fit(df)
    df = model.transform(df)

    return df
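
A short usage sketch for impute_missing; assert_cols_in_df and assert_type_str are helpers from the surrounding library and are assumed to be in scope, and the sample data is made up:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1.80, None), (None, 72.0), (1.65, 80.0)], "height double, weight double")

# Keep the original columns and write the imputed values to new ones.
imputed = impute_missing(df, columns=["height", "weight"],
                         out_cols=["height_imputed", "weight_imputed"],
                         strategy="median")
imputed.show()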
Example #5
    def impute(input_cols, output_cols, strategy="mean"):
        """
        Imputes missing data from specified columns using the mean or median.
        :param input_cols: List of columns to be analyze.
        :param output_cols: List of output columns with missing values imputed.
        :param strategy: String that specifies the way of computing missing data. Can be "mean" or "median"
        :return: Dataframe object (DF with columns that has the imputed values).
        """

        input_cols = parse_columns(self, input_cols)
        output_cols = val_to_list(output_cols)

        imputer = Imputer(inputCols=input_cols, outputCols=output_cols)

        df = self
        model = imputer.setStrategy(strategy).fit(df)
        df = model.transform(df)

        return df
Example #6
    def impute(columns, strategy="mean"):
        """
        Imputes missing data from specified columns using the mean or median.
        :param columns: List of columns to be analyze.
        :param strategy: String that specifies the way of computing missing data. Can be "mean" or "median"
        :return: Dataframe object (DF with columns that has the imputed values).
        """

        columns = parse_columns(self, columns, filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)

        df = self
        output_cols = []
        for col_name in columns:
            # Imputer requires float or double columns, not just any numeric type
            df = df.cols.cast(col_name, "float")
            output_cols.append(col_name + IMPUTE_SUFFIX)

        imputer = Imputer(inputCols=columns, outputCols=output_cols)

        model = imputer.setStrategy(strategy).fit(df)
        df = model.transform(df)

        return df
Example #7
    def impute(self, cols, out_cols, strategy="mean"):
        # Impute missing values in `cols`, writing the results to `out_cols` on the wrapped DataFrame.
        imputer = Imputer(inputCols=cols, outputCols=out_cols)
        model = imputer.setStrategy(strategy).fit(self._df)
        self._df = model.transform(self._df)
Example #8
colNum = ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population',
          'households', 'median_income', 'median_house_value', 'bedrooms_per_room', 'population_per_household']
# Alternative idea: build colNum from the schema instead of listing it by hand
# (the author noted they were not sure how to do this):
# for col in renamedHousing.head(0):
#   if col != "label" or "ocean_proximity":
#      colNum.append(col)
# print(colNum)

for c in renamedHousing.columns:
    print(c, " has null values : ", renamedHousing.filter(renamedHousing[c].isNull()).count())

imputer = Imputer()
imputer.setInputCols(["total_bedrooms", "bedrooms_per_room"])
imputer.setOutputCols(["out_total_bedrooms", "out_bedrooms_per_room"])
imputedHousing = imputer.setStrategy('median').setMissingValue(414).fit(renamedHousing).transform(renamedHousing)
imputedHousing = imputedHousing.drop('total_bedrooms').drop('bedrooms_per_room')

for c in imputedHousing.columns:
    print(c, " has null values : ", imputedHousing.filter(imputedHousing[c].isNull()).count())

colNum_to_scale = ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'population',
                   'households', 'median_income', 'rooms_per_household', 'population_per_household',
                   'out_total_bedrooms', 'out_bedrooms_per_room']
va = VectorAssembler().setInputCols(colNum_to_scale).setOutputCol('features')
featuredHousing = va.transform(imputedHousing)
featuredHousing.show()

scaler = StandardScaler(withMean=True, withStd=True)
scaler.setInputCol("features").setOutputCol("scaled_features")
scaledHousing = scaler.fit(featuredHousing).transform(featuredHousing)
scaledHousing.select('scaled_features').show()
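
The three stages above (Imputer, VectorAssembler, StandardScaler) can also be chained as a pyspark.ml Pipeline so that a single fit/transform covers the whole preprocessing step. A sketch reusing the same column names and settings; the missingValue of 414 simply mirrors the original call, and nulls are imputed regardless:

from pyspark.ml import Pipeline
from pyspark.ml.feature import Imputer, StandardScaler, VectorAssembler

imputer = Imputer(inputCols=["total_bedrooms", "bedrooms_per_room"],
                  outputCols=["out_total_bedrooms", "out_bedrooms_per_room"],
                  strategy="median", missingValue=414)
assembler = VectorAssembler(inputCols=colNum_to_scale, outputCol="features")
scaler = StandardScaler(inputCol="features", outputCol="scaled_features",
                        withMean=True, withStd=True)

pipeline = Pipeline(stages=[imputer, assembler, scaler])
scaledHousing = pipeline.fit(renamedHousing).transform(renamedHousing)
scaledHousing.select("scaled_features").show()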
Example #9
df1.select([count(when(isnull(c), c)).alias(c) for c in df1.columns]).show()
#For each column c we count how many of its values are null, select those counts,
#and alias each count to the column's own name; all of this happens inside one list comprehension.

imputer = Imputer(inputCols=[
    "Plasma glucose concentration", "Diastolic blood pressure",
    "Triceps skinfold thickness", "2-Hour serum insulin", "Body mass index"
],
                  outputCols=[
                      "Plasma glucose concentration",
                      "Diastolic blood pressure", "Triceps skinfold thickness",
                      "2-Hour serum insulin", "Body mass index"
                  ])

imputer = imputer.setStrategy('median').setMissingValue(np.nan)
new_df = imputer.fit(df1).transform(df1)

new_col_names = new_df.columns
features2 = new_df.rdd.map(lambda row: row[0:])
corr_mat2 = Statistics.corr(features2, method="pearson")
corr_df2 = pd.DataFrame(corr_mat2)
corr_df2.index, corr_df2.columns = new_col_names, new_col_names

#print(corr_df2.to_string())

#imputer.fit(df1).transform(df1).show()
#imputer.setStrategy('median').setMissingValue(np.nan).fit(df1).transform(df1).show()

sns.set(style='ticks')
sns.pairplot(new_df.toPandas(),