Example #1
def _handle_missing(df):
    from pyspark.ml.feature import Imputer
    from pyspark.sql import functions as F

    # handle missing values
    columns = list(
        filter(lambda col: col not in ('class', 'weight', 'crime_pair'),
               df.columns))
    dtypes = dict(df.dtypes)

    # integer (non-float) columns: fill nulls with the rounded column mean
    int_columns = list(
        filter(lambda col: dtypes[col] not in ('float', 'double'), columns))
    stats = df.agg(*(F.avg(c).alias(c) for c in int_columns))
    fillers = {
        k: round(v)
        for k, v in stats.first().asDict().items() if v is not None
    }
    df = df.na.fill(fillers)

    # float/double columns: impute with Spark's Imputer, then drop the originals
    float_columns = list(
        filter(lambda col: dtypes[col] in ('float', 'double'), columns))
    print(float_columns)
    imputer = Imputer(
        inputCols=float_columns,
        outputCols=["{}_imputed".format(c) for c in float_columns])
    df = imputer.fit(df).transform(df)
    df = df.drop(*float_columns)

    return df
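A minimal driver for Example #1, offered only as a sketch: the toy DataFrame and its column names ('age', 'score', 'class') are assumptions chosen so that one column takes the rounded-mean fill path and one takes the Imputer path.
# Hypothetical usage of _handle_missing(); the columns below are made up.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("HandleMissingDemo").getOrCreate()
toy = spark.createDataFrame(
    [(1, 2.0, "x"), (None, float("nan"), "y"), (3, 4.0, "x")],
    ["age", "score", "class"])  # 'class' is excluded by _handle_missing

# 'age' (bigint) gets the rounded mean; 'score' (double) is replaced by 'score_imputed'
_handle_missing(toy).show()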
Example #2
def cleanDraftData(position):
    '''
        [X] need to fill in nulls for the Age with the AVG or with the median of all the ages --> opted for the median
    '''
    unCleanData = spark.read.format("csv").option("header", "true").option(
        "inferSchema", "true").load("./data/NflDraftData/draftData.csv")

    # drop columns we don't need
    unCleanData = unCleanData.select("Rnd", "Pick", "Player Name", "Pos",
                                     'Age', 'College', 'Draft Year')

    if position in ("RB", "QB", "WR"):
        unCleanData = unCleanData.where(unCleanData["Pos"] == position)
    else:  # Return all of the skill offensive players (WR, RB, TE, QB, FB)
        #drop lineman both offense and defense as well as defensive players and special teams
        droppedPostions = [
            'DE', 'DT', 'T', 'O', 'G', 'C', 'K', 'NT', 'DL', 'OL', 'LS', 'LB',
            'DB', 'P', 'OLB', 'CB', 'S', 'ILB'
        ]  # With only offensive players we are down to 2000 data points
        for pos in droppedPostions:
            unCleanData = unCleanData.where(unCleanData["Pos"] != pos)

    # Cast values to doubles
    doubleCols = ['Age', 'Rnd', 'Pick', 'Draft Year']
    for c in doubleCols:
        unCleanData = unCleanData.withColumn(c,
                                             unCleanData[c].cast(DoubleType()))

    # Used to fill in null values with the median
    imputer = Imputer(inputCols=["Age"], outputCols=["Age"])
    cleanData = imputer.setStrategy("median").fit(unCleanData).transform(
        unCleanData)
    #cleanData.show()
    return cleanData
Example #3
def impute_missing(df, columns, out_cols, strategy='mean'):
    """
    Imputes missing data from specified columns using the mean or median.

    Parameters
    ----------
    columns : List of columns to be analyzed.
    out_cols: List of output columns with missing values imputed.
    strategy: String that specifies the way of computing missing data. Can be "mean" or "median"

    return  : DataFrame with the imputed values in the output columns.
    """

    # Check that the columns to be processed are present in the dataframe
    assert_cols_in_df(df, columns_provided=columns, columns_df=df.columns)

    assert isinstance(columns, list), "Error: columns argument must be a list"
    assert isinstance(out_cols,
                      list), "Error: out_cols argument must be a list"

    # Check that the strategy argument is a string:
    assert_type_str(df, strategy, "strategy")

    assert (
        strategy == "mean" or strategy == "median"
    ), "Error: strategy has to be 'mean' or 'median'. 'mean' is default"

    imputer = Imputer(inputCols=columns, outputCols=out_cols)
    model = imputer.setStrategy(strategy).fit(df)
    df = model.transform(df)

    return df
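Example #3 relies on two validation helpers that are not shown. The sketch below reconstructs them from their call sites only; the real implementations in the source project may differ.
# Hypothetical reconstructions of the missing helpers, inferred from the call sites above.
def assert_cols_in_df(df, columns_provided, columns_df):
    # Fail fast if any requested column is absent from the DataFrame
    missing = [c for c in columns_provided if c not in columns_df]
    assert not missing, "Error: columns {} not found in dataframe".format(missing)


def assert_type_str(df, variable, name_arg):
    # Fail fast if the given argument is not a string
    assert isinstance(variable, str), "Error: {} argument must be a string".format(name_arg)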
Example #4
def imputeMonthlyIncome(df):
    imputer = Imputer(inputCols=['MonthlyIncome'],
                      outputCols=['imputed_MonthlyIncome'],
                      strategy='median')

    # The Imputer requires columns to be double or float, so cast first...
    df = df.withColumn(
        'double_MonthlyIncome',
        df.MonthlyIncome.cast(DoubleType())
    ).drop('MonthlyIncome') \
     .withColumnRenamed('double_MonthlyIncome', 'MonthlyIncome')

    df = imputer.fit(df).transform(df).drop('MonthlyIncome')

    df = df.withColumnRenamed('imputed_MonthlyIncome', 'MonthlyIncome')

    # Addressing MonthlyIncome of 0
    incomeMedian = np.median(df.select('MonthlyIncome').collect())

    # Apply income median if the MonthlyIncome is 0
    df = df.withColumn(
        'MonthlyIncome',
        F.when((F.col('MonthlyIncome') == 0),
               incomeMedian).otherwise(F.col('MonthlyIncome')))

    return df
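A small driver for Example #4, again only a sketch: the toy rows are made up, and the imports below stand in for the module-level imports assumed by the original file.
# Hypothetical driver for imputeMonthlyIncome(); data and imports are assumptions.
import numpy as np
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import Imputer

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame([(3000,), (None,), (0,), (5200,)], ["MonthlyIncome"])

# Nulls are imputed with the median, then zero incomes are replaced by the overall median
imputeMonthlyIncome(toy).show()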
Example #5
    def _imputer_test_single(self):
        data = self.spark.createDataFrame([(1.0, float("nan")),
                                           (2.0, float("nan")),
                                           (float("nan"), 3.0), (4.0, 4.0),
                                           (5.0, 5.0)], ["a", "b"])
        imputer = Imputer(inputCols=["a"], outputCols=["out_a"])
        model = imputer.fit(data)

        # the input name should match the inputCols above
        model_onnx = convert_sparkml(model, 'Sparkml Imputer',
                                     [('a', FloatTensorType([None, 1]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.select("out_a").toPandas().values.astype(
            numpy.float32)
        data_np = data.toPandas().a.values.astype(numpy.float32)
        data_np = data_np.reshape((-1, 1))
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlImputerSingle")
        onnx_model_path = paths[-1]
        output, output_shapes = run_onnx_model(['out_a'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #6
def preprocess(df):
    cont_col = ['_c{0}'.format(i) for i in range(0, 14)]
    for i in cont_col:
        df = df.withColumn(i, df[i].cast("float"))
    # Fill nulls in the continuous columns with the column mean
    imputer = Imputer(inputCols=cont_col, outputCols=cont_col).setStrategy('mean')

    return imputer.fit(df).transform(df)
Example #7
def imputaciones(VarLimpias):
    # C: string columns (left untouched); I: integer columns (cast to double for the Imputer)
    C = [i[0] for i in VarLimpias.dtypes if 'string' in i[1]]
    I = [i[0] for i in VarLimpias.dtypes if 'int' in i[1]]

    for f in I:
        VarLimpias = VarLimpias.withColumn(f, VarLimpias[f].cast(DoubleType()))
    imputer = Imputer(inputCols=[c for c in VarLimpias.columns if c not in C],
                      outputCols=[c for c in VarLimpias.columns if c not in C])
    Pba = imputer.fit(VarLimpias)
    return Pba.transform(VarLimpias)
Example #8
    def imputation(self):
        C=[i[0] for i in self.data.dtypes if 'string' in i[1]]
        I=[i[0] for i in self.data.dtypes if 'int' in i[1]]

        for f in I: self.data = self.data.withColumn(f, self.data[f].cast(DoubleType()))
        imputer = Imputer(
            inputCols=[c for c in self.data.columns if c not in C],
            outputCols=[c for c in self.data.columns if c not in C])
        Pba=imputer.fit(self.data)
        return Pba.transform(self.data)
Example #9
    def na_imputer(self, strategy, out_columns="*", na=None, columns="*"):
        """
        replace missing value with mean or median according to users' choice
        user can also customize the definition of missing value, e.g. 999
        the default missing value is 'nan' or null
        the default setting of out_columns is just columns, so the original columns will be overrided if not specially defined
        """

        #check columns
        if columns == "*":
            columns = self._df.schema.names
        elif isinstance(columns, str):
            columns = [columns]
        else:
            assert isinstance(
                columns,
                list), "Error: columns argument must be a string or a list!"

        if out_columns == "*":
            out_columns = self._df.schema.names

        #check output columns
        if isinstance(out_columns, str):
            out_columns = [out_columns]
        else:
            assert isinstance(
                out_columns, list
            ), "Error: output columns argument must be a string or a list!"

        #check input and output columns have consistent lengths
        assert len(columns) == len(
            out_columns
        ), "Error: inconsistent lengths for argument of columns list and output columns list"

        #check strategy argument
        assert (strategy == "mean" or strategy
                == "median"), "Error: strategy can only be 'mean' or 'median'."

        #firstly convert the type in input columns to FloatType for Imputer
        for col in columns:
            self._df = self._df.withColumn(col,
                                           self._df[col].cast(FloatType()))

        #fit the model
        imputer = Imputer(inputCols=columns, outputCols=out_columns)

        if na is None:
            model = imputer.setStrategy(strategy).fit(self._df)
        else:
            model = imputer.setStrategy(strategy).setMissingValue(na).fit(
                self._df)

        self._df = model.transform(self._df)

        return self._df
Example #10
def imputer_continuous_features(df, data_types_map):
    continuous_features = list(
        set(data_types_map['DoubleType']) - set(['DEP_DEL15']))
    continuous_features_imputed = [
        var + "_imputed" for var in continuous_features
    ]
    imputer = Imputer(inputCols=continuous_features,
                      outputCols=continuous_features_imputed)
    tmp = imputer.fit(df).transform(df)
    get_missing_info(tmp)
    return [imputer]
Example #11
def imputers(dataframe):
    inputCols = []
    outputCols = []
    for i in range(1,14):
        feature = 'I-'+str(i)
        dataframe =  dataframe.withColumn(feature, dataframe[feature].cast(DoubleType())) 
        inputCols.append(feature)
        outputCols.append(feature)
    imputer = Imputer(strategy="mean",
        inputCols=inputCols,
        outputCols=outputCols)
    return imputer.fit(dataframe).transform(dataframe)
Example #12
    def fill_na_numerical(self,data,columns):
        '''
        FILL NULL VALUES FOR NUMERICAL DATA
        args:
        1.data: <SPARK DATAFRAME> actual spark dataframe
        2.columns: <LIST> of numerical columns we want to Impute

        return: <SPARK DATAFRAME>Imputed spark dataframe
        '''
        columns=list(columns)
        imputer=Imputer(inputCols=columns,outputCols=['imputed_'+str(col) for col in columns])
        dataCopy=imputer.fit(data).transform(data)
        return dataCopy    
Example #13
    def fill_na_numerical(self, data, columns):
        '''
        Purpose: Fill null values for numerical data
        Inputs : Data(spark dataframe), column(numerical columns)
        Output : Imputed spark dataframe

        '''
        columns = list(columns)
        imputer = Imputer(
            inputCols=columns,
            outputCols=['imputed_' + str(col) for col in columns])
        dataCopy = imputer.fit(data).transform(data)
        return dataCopy
Example #14
def imputer_usecase():
    """
    Compute the missing values in the dataset and fill them in using the
    specified strategy; `strategy` selects how the fill value is computed.
    """
    spark = getSparkSession()
    df = spark.createDataFrame([(1.0, float("nan")), (2.0, float("nan")),
                                (float("nan"), 3.0), (4.0, 4.0), (5.0, 5.0)],
                               ["a", "b"])

    imputer = Imputer(inputCols=["a", "b"], outputCols=["out_a", "out_b"])
    model = imputer.fit(df)

    model.transform(df).show()
Example #15
    def preprocessing(self):

        model = GBTRegressor(labelCol="bicycle_rentals")

        cols = [
            "part_time", "holiday", "week_days", "weather_description_mf",
            "month"
        ]

        imputer = Imputer(inputCols=["humidity", "pressure"],
                          outputCols=["humidity_input", "pressure_input"])

        indexers = [
            StringIndexer(inputCol=col, outputCol="{0}_indexed".format(col))
            for col in cols
        ]

        assembler = VectorAssembler(inputCols=[
            "part_time_indexed", "holiday_indexed", "month_indexed",
            "week_days_indexed", "weather_description_mf_indexed",
            "humidity_input", "pressure_input", "temperature", "wind_speed",
            "from_station_id", "mean_dpcapacity_start", "mean_dpcapacity_end",
            "sum_subscriber", "sum_customer"
        ],
                                    outputCol="features")

        pipeline = Pipeline(stages=[imputer] + indexers + [assembler] +
                            [model])

        return pipeline
Example #16
File: app.py Project: mledl/BDMA_HW
def preprocess_data(df):
    # Preprocessing the data
    # Dimension reduction
    cols_reduce = [
        'Date', 'Time', 'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3'
    ]
    df = df.drop(*cols_reduce)

    # Fixing missing values (dataset uses ? as NaN for missing values)
    imputer = Imputer(inputCols=df.columns, outputCols=df.columns)
    imputer.setStrategy("mean")
    df = imputer.fit(df).transform(df)

    # Print the column name and datatype
    print(df.dtypes)
    return df
Example #17
def get_ml1_pipeline():
    stages = []

    imputer = Imputer(inputCols=ML1_NUMERICAL_COLUMNS, outputCols=ML1_NUMERICAL_COLUMNS)
    stages.append(imputer)

    ohe_input_cols = []
    ohe_output_cols = []
    for categorical_column in ML1_CATEGORICAL_COLUMNS:
        str_indexer = StringIndexer(inputCol=categorical_column, outputCol=categorical_column + "_index", handleInvalid='keep')
        ohe_input_cols.append(str_indexer.getOutputCol())
        ohe_output_cols.append(categorical_column + "_class_vec")
        stages.append(str_indexer)

    encoder = OneHotEncoderEstimator(inputCols=ohe_input_cols, outputCols=ohe_output_cols, handleInvalid="error", dropLast=False)
    stages.append(encoder)

    numerical_vector_assembler = VectorAssembler(inputCols=ML1_NUMERICAL_COLUMNS , outputCol="numerial_cols_vec", handleInvalid="keep")
    scaler = MinMaxScaler(inputCol="numerial_cols_vec", outputCol= "scaled_numerical_cols")
    stages.append(numerical_vector_assembler)
    stages.append(scaler)

    label_str_indexer = StringIndexer(inputCol="result", outputCol="label", handleInvalid="keep")
    stages.append(label_str_indexer)

    assembler_input = encoder.getOutputCols() + [scaler.getOutputCol()]
    assembler = VectorAssembler(inputCols= assembler_input, outputCol="features", handleInvalid="skip")
    stages.append(assembler)

    pipeline = Pipeline(stages = stages)
    return pipeline
Example #18
def _fit_crossvalidator(train, features, target):
    """
    Helper function that fits a CrossValidator model to predict a binary label
    `target` on the passed-in training DataFrame using the columns in `features`.
    :param train: Spark DataFrame containing training data
    :param features: List of strings containing column names to use as features from `train`
    :param target: String name of binary target column of `train` to predict
    """
    train = train.select(features + [target])
    model_matrix_stages = [
        Imputer(inputCols=features, outputCols=features),
        VectorAssembler(inputCols=features, outputCol="features"),
        StringIndexer(inputCol="bad_loan", outputCol="label")
    ]
    lr = LogisticRegression(maxIter=10,
                            elasticNetParam=0.5,
                            featuresCol="features")
    pipeline = Pipeline(stages=model_matrix_stages + [lr])
    paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.1, 0.01]).build()
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=BinaryClassificationEvaluator(),
                              numFolds=5)
    with mlflow.start_run():
        mlflow.log_param("data_version", version_to_load)
        mlflow.log_param("data_path", DELTA_TABLE_DEFAULT_PATH)
        cvModel = crossval.fit(train)
        return cvModel.bestModel
Example #19
    def handle_missing(self, non_feature_col=["ID", "TIME_SPAN"]):
        import pyspark
        if type(self) == data_run_experiment:
            raise NotImplementedError(
                "Method need to be called in sub-class but currently called in base class"
            )

        try:
            ret_data_frame = self.spark.read.parquet(self.temp_missing_drop)
            self.logger.info(self.temp_missing_drop)
            return ret_data_frame
        except pyspark.sql.utils.AnalysisException as ex:
            template = "An exception of type {0} occurred. Arguments:\n{1!r}"
            message = template.format(type(ex).__name__, ex.args)
            self.logger.info(message)
            self.logger.info("PROCESS")

            #impute only. aggregation will be done after adding demographics
            cur_df = self.spark.read.parquet(self.out_file_name)
            cur_cols = cur_df.columns
            categorical_cols = list()
            numerical_cols = list()
            for i in non_feature_col:
                cur_cols.remove(i)
            for i in cur_cols:
                if i.find("C_") == 0:
                    categorical_cols.append(i)
                else:
                    numerical_cols.append(i)

            cur_df = cur_df.fillna(
                0, subset=categorical_cols).repartition(400).checkpoint()
            self.logger.info(cur_df.count())

            from pyspark.ml.feature import Imputer
            imputedCols = ["imp_{0}".format(x) for x in numerical_cols]
            imputer = Imputer(inputCols=numerical_cols,
                              outputCols=imputedCols).setStrategy("mean")
            imputer_model = imputer.fit(cur_df)
            ret_data_frame = imputer_model.transform(cur_df)
            ret_data_frame.select(non_feature_col + imputedCols +
                                  categorical_cols).show()
            ret_data_frame.select(non_feature_col + imputedCols +
                                  categorical_cols).write.save(
                                      self.temp_missing_drop)
            ret_data_frame = self.spark.read.parquet(self.temp_missing_drop)
            return ret_data_frame
Example #20
def _fit_crossvalidator(train, features, target, version):
    """
  Helper function that fits a CrossValidator model to predict a binary label
  `target` on the passed-in training DataFrame using the columns in `features`
  :param: train: Spark DataFrame containing training data
  :param: features: List of strings containing column names to use as features from `train`
  :param: target: String name of binary target column of `train` to predict
  """
    train = train.select(features + [target])
    model_matrix_stages = [
        Imputer(inputCols=features, outputCols=features),
        VectorAssembler(inputCols=features, outputCol="features"),
        StringIndexer(inputCol="bad_loan", outputCol="label")
    ]
    lr = LogisticRegression(maxIter=10,
                            elasticNetParam=0.5,
                            featuresCol="features")
    pipeline = Pipeline(stages=model_matrix_stages + [lr])
    paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.1, 0.01]).build()
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=BinaryClassificationEvaluator(),
                              numFolds=5)

    import matplotlib.pyplot as plt
    from mlflow import spark as mlflow_spark
    from mlflow import sklearn as mlflow_sk

    mlflow.start_run()
    cvModel = crossval.fit(train)
    best_model = cvModel.bestModel

    roc = best_model.stages[len(best_model.stages) - 1].summary.roc.toPandas()
    fig1 = plt.figure()
    fig1.clf()
    plt.clf()
    plt.plot(roc['FPR'], roc['TPR'])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    fig1.savefig("roc.png")
    mlflow.log_artifact("roc.png")
    fig1.clf()
    plt.clf()

    lr_summary = best_model.stages[len(best_model.stages) - 1].summary
    mlflow.log_metric("accuracy", lr_summary.accuracy)
    mlflow.log_metric("weightedFalsePositiveRate",
                      lr_summary.weightedFalsePositiveRate)
    mlflow.log_metric("weightedFalsePositiveRate",
                      lr_summary.weightedFalsePositiveRate)
    mlflow.log_metric("weightedFMeasure", lr_summary.weightedFMeasure())
    mlflow.log_metric("weightedPrecision", lr_summary.weightedPrecision)
    mlflow.log_metric("weightedRecall", lr_summary.weightedRecall)

    mlflow_spark.log_model(best_model, "loan-classifier-mllib")
    mlflow.end_run()
    return best_model
Example #21
def imputer_mean(df):

    weather_numeric_with_nulls = [
        'origin_WND_speed_rate', 'origin_CIG_ceiling_height',
        'origin_VIS_distance', 'origin_TMP_air_temperature',
        'origin_DEW_dew_point_temp', 'dest_WND_speed_rate',
        'dest_CIG_ceiling_height', 'dest_VIS_distance',
        'dest_TMP_air_temperature', 'dest_DEW_dew_point_temp',
        'origin_aa1_rain_depth', 'dest_aa1_rain_depth',
        'origin_aj1_snow_depth', 'dest_aj1_snow_depth'
    ]

    imputer = Imputer(inputCols=weather_numeric_with_nulls,
                      outputCols=weather_numeric_with_nulls)
    model = imputer.fit(filter_to_train(df))
    df = model.transform(df)

    return df
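The example above fits the Imputer on filter_to_train(df) rather than on the full DataFrame, presumably so that the imputation statistics are learned from training rows only and do not leak information from the evaluation split. filter_to_train is not shown; below is a hypothetical stand-in that assumes an 'is_train' flag column.
# Hypothetical stand-in for the missing filter_to_train(); the 'is_train' column is an assumption.
from pyspark.sql import functions as F

def filter_to_train(df):
    # Keep only rows flagged as belonging to the training split
    return df.where(F.col("is_train") == 1)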
Example #22
    def impute(input_cols, output_cols, strategy="mean"):
        """
        Imputes missing data from specified columns using the mean or median.
        :param input_cols: List of columns to be analyzed.
        :param output_cols: List of output columns with missing values imputed.
        :param strategy: String that specifies the way of computing missing data. Can be "mean" or "median"
        :return: Dataframe object (DF with columns that has the imputed values).
        """

        input_cols = parse_columns(self, input_cols)
        output_cols = val_to_list(output_cols)

        imputer = Imputer(inputCols=input_cols, outputCols=output_cols)

        df = self
        model = imputer.setStrategy(strategy).fit(df)
        df = model.transform(df)

        return df
Example #23
    def preprocessing(self, trainDF, validDF, testDF):
        """
        Data preprocessing steps involving  the following transformations:

        1. One-Hot encoding of categorical variables
        2. Imputation of missing values in numerical variables
        3. Standardization of numerical variables

        Parameters
        -----------
        trainDF: training data set
        validDF: validation data set
        testDF: test data set

        Returns
        -----------
        Transformed training, validation, and test data sets, along with the assembler
        """
        # Extract numerical and categorical column names
        cat_cols = [field for (field, dataType) in trainDF.dtypes if dataType == "string"]
        num_cols = [field for (field, dataType) in trainDF.dtypes if ((dataType == "double") & \
                    (field != self.label_col))]

        # Create output columns
        index_output_cols = [x + "Index" for x in cat_cols]
        ohe_output_cols = [x + "OHE" for x in cat_cols]
        # num_output_cols = [x + "scaled" for x in num_cols]

        # String indexer for categorical variables
        s_indexer = StringIndexer(inputCols = cat_cols, outputCols = index_output_cols, 
                                    handleInvalid="skip")

        # One-hot code categorical columns
        cat_encoder = OneHotEncoder(inputCols = index_output_cols, outputCols = ohe_output_cols)

        # Impute missing values in numerical columns
        num_imputer = Imputer(inputCols = num_cols, outputCols = num_cols)

        # Vector assembler
        assembler_inputs = ohe_output_cols + num_cols
        assembler = VectorAssembler(inputCols = assembler_inputs, outputCol = "unscaled_features")

        # Features scaling using StandardScaler
        scaler = StandardScaler(inputCol = assembler.getOutputCol(), outputCol = "features")
        
        # Create pipeline
        stages = [s_indexer, cat_encoder, num_imputer, assembler, scaler]
        pipeline = Pipeline(stages = stages)
        pipelineModel = pipeline.fit(trainDF)

        # Preprocess training and test data
        trainDF_scaled = pipelineModel.transform(trainDF)
        validDF_scaled = pipelineModel.transform(validDF)
        testDF_scaled = pipelineModel.transform(testDF)
        return assembler, trainDF_scaled, validDF_scaled, testDF_scaled
Example #24
    def missing_val_imput(self):
        check = self.input.select(*(sum(col(c).isNull().cast("int")).alias(c)
                                    for c in self.input.columns))
        check.show()
        print("||| Above table shows missing values accross columns |||")
        check_pd = self.input.toPandas()
        val = check_pd.isnull().any().any()

        if val == True:
            imputer = Imputer(
                inputCols=self.input.columns,
                outputCols=["{}".format(c) for c in self.input.columns])
            cleaned_input = imputer.fit(self.input).transform(self.input)
            print("Missing values replaced with mean accross columns")
            print("Returning cleaned data")
            return cleaned_input

        else:
            print("No missing value found")
            return self.input
Example #25
    def replace_missings(self, test=False):
        """
        Replace missing values with a default value
        """

        for col in list(self.config_dict.keys()):
            # check if the replace missing transformation needs to be applied
            if self.config_dict[col]["replace_missings"]["apply"]:
                imputer = Imputer(
                    inputCols=[col],
                    outputCols=[
                        "{}_replace_missings".format(col)
                    ]).setMissingValue(
                        self.config_dict[col]["replace_missings"]["value"])
                if test:
                    self.test_data = imputer.fit(self.test_data).transform(
                        self.test_data)
                else:
                    self.train_data = imputer.fit(self.train_data).transform(
                        self.train_data)
Example #26
def imputeNumeric(numeric_DF):
    '''
    Takes a spark df with continuous numeric columns and outputs a spark df
    where all null values are replaced with the column average.

    The first column, which holds the outcome values, is preserved.
    '''
    outputColumns=["{}".format(c) for c in numeric_DF.columns[1:11]]
    catColumns = ["{}".format(c) for c in numeric_DF.columns[11:]]

    imputer = Imputer(
        inputCols=numeric_DF.columns[1:11],
        outputCols=["{}".format(c) for c in numeric_DF.columns[1:11]]
    )

    model = imputer.fit(numeric_DF)

    imputedDF = model.transform(numeric_DF).select(['_1']+outputColumns+catColumns)

    return imputedDF
Example #27
    def impute(self):
        from pyspark.ml.feature import Imputer, ImputerModel

        df = self.session.createDataFrame([(1.0, float("nan")),
                                           (2.0, float("nan")),
                                           (float("nan"), 3.0), (4.0, 4.0),
                                           (5.0, 5.0)], ["a", "b"])

        # By default, missing values are filled with the mean
        imputer = Imputer(inputCols=["a", "b"], outputCols=["out_a", "out_b"])
        model = imputer.fit(df)
        model.transform(df).show()

        # We can also switch to the median and specify which value counts as missing
        # null is always treated as a missing value
        imputer = Imputer(inputCols=["a", "b"],
                          outputCols=["out_a", "out_b"],
                          strategy="median",
                          missingValue=float("nan"))
        model = imputer.fit(df)
        model.transform(df).show()

        ## The fit step is generally regarded as a learning step, and the fitted model can be persisted
        ## Unfortunately, there is currently no way to change its parameters afterwards
        model.write().overwrite().save("/tmp/wow")
        model = ImputerModel.read().load("/tmp/wow")
        model.transform(df).show()
Example #28
    def main(self, sc, *args):
        """ For each input files, i.e. train and test 'initiated, apply the same set of transformatons
        """

        sqlContext = SQLContext(sc)
        # For each key in the output dictionary of the Initiate task, i.e. train and test
        for inputFile in Initiate(self.input_file, self.output_path).output():
            df = sqlContext.read.csv(Initiate(
                self.input_file, self.output_path).output()[inputFile].path,
                                     sep=",",
                                     header=True,
                                     inferSchema=True)

            # Select final list of features
            list_features = ["Age", "Sex_indexed", "Fare", "Survived"]
            df = df.select(*list_features)

            # Replace missing values
            cols_missing = ["Age"]
            for col in cols_missing:
                imputer = Imputer(inputCols=[col],
                                  outputCols=[
                                      "{}_replace_missings".format(col)
                                  ]).setMissingValue(26.0)
                df = imputer.fit(df).transform(df)

            # Discretize
            cols_disc = {
                "Age_replace_missings":
                [-math.inf, 0.83, 21.0, 26.0, 33.0, 71.0, math.inf],
                "Fare": [-math.inf, 7.225, 8.122, 26.0, 83.475, math.inf],
            }
            for col in cols_disc:
                bucketizer = Bucketizer(splits=cols_disc[col],
                                        inputCol=col,
                                        outputCol="{}_discretized".format(col))
                df = bucketizer.transform(df)

            df.write.csv(self.output()[inputFile].path, header=True)
Example #29
    def impute(columns, strategy="mean"):
        """
        Imputes missing data from specified columns using the mean or median.
        :param columns: List of columns to be analyzed.
        :param strategy: String that specifies the way of computing missing data. Can be "mean" or "median"
        :return: Dataframe object (DF with columns that has the imputed values).
        """

        columns = parse_columns(self, columns, filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)

        df = self
        output_cols = []
        for col_name in columns:
            # Imputer requires columns to be float or double, not just any numeric type
            df = df.cols.cast(col_name, "float")
            output_cols.append(col_name + IMPUTE_SUFFIX)

        imputer = Imputer(inputCols=columns, outputCols=output_cols)

        model = imputer.setStrategy(strategy).fit(df)
        df = model.transform(df)

        return df
Example #30
    def imputer(features_name, strategy="mean", missing_value=None, footer="_imputer"):
        """
        Spark experiment method
        Args:
            features_name:
            strategy:
            missing_value:
            footer:

        Returns:

        """
        output_names = [name+footer for name in features_name]

        imputer = Imputer() \
            .setInputCols(features_name) \
            .setOutputCols(output_names)\
            .setStrategy(strategy)

        if missing_value is not None:
            imputer.setMissingValue(missing_value)

        return imputer
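A short usage sketch for the factory in Example #30, treating imputer() as a standalone function for illustration (in the source it appears to be defined inside a class); the DataFrame and column names are assumptions.
# Hypothetical usage of the imputer() factory above.
from pyspark.sql import SparkSession
from pyspark.ml.feature import Imputer

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame(
    [(1.0, float("nan")), (float("nan"), 2.0), (3.0, 4.0)], ["x", "y"])

stage = imputer(["x", "y"], strategy="median")
stage.fit(toy).transform(toy).select("x_imputer", "y_imputer").show()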
Example #31
"""
Run with:
  bin/spark-submit examples/src/main/python/ml/imputer_example.py
"""
# $example on$
from pyspark.ml.feature import Imputer
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("ImputerExample")\
        .getOrCreate()

    # $example on$
    df = spark.createDataFrame([
        (1.0, float("nan")),
        (2.0, float("nan")),
        (float("nan"), 3.0),
        (4.0, 4.0),
        (5.0, 5.0)
    ], ["a", "b"])

    imputer = Imputer(inputCols=["a", "b"], outputCols=["out_a", "out_b"])
    model = imputer.fit(df)

    model.transform(df).show()
    # $example off$

    spark.stop()