Code example #1
def _handle_missing(df):
    from pyspark.ml.feature import Imputer
    from pyspark.sql import functions as F

    # handle missing values
    columns = list(
        filter(lambda col: col not in ('class', 'weight', 'crime_pair'),
               df.columns))
    dtypes = dict(df.dtypes)

    # for int columns
    int_columns = list(
        filter(lambda col: dtypes[col] not in ('float', 'double'), columns))
    stats = df.agg(*(F.avg(c).alias(c) for c in int_columns))
    fillers = {
        k: round(v)
        for k, v in stats.first().asDict().items() if v is not None
    }
    df = df.na.fill(fillers)

    # for float columns
    float_columns = list(
        filter(lambda col: dtypes[col] in ('float', 'double'), columns))
    print(float_columns)
    imputer = Imputer(
        inputCols=float_columns,
        outputCols=["{}_imputed".format(c) for c in float_columns])
    df = imputer.fit(df).transform(df)
    df = df.drop(*float_columns)

    return df
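The snippet above handles the integer columns with na.fill because the Imputer only accepts float or double input columns. A minimal alternative sketch (assuming every feature column is numeric; the function name is illustrative) casts the integers to double so a single Imputer covers all feature columns:

def _handle_missing_single_imputer(df, skip=('class', 'weight', 'crime_pair')):
    from pyspark.ml.feature import Imputer
    from pyspark.sql.types import DoubleType

    feature_cols = [c for c in df.columns if c not in skip]
    # cast non-float feature columns to double so the Imputer accepts them
    # (assumes these columns are numeric; all-null columns would make fit() fail)
    for c, t in df.dtypes:
        if c in feature_cols and t not in ('float', 'double'):
            df = df.withColumn(c, df[c].cast(DoubleType()))
    imputer = Imputer(inputCols=feature_cols, outputCols=feature_cols)
    return imputer.fit(df).transform(df)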
Code example #2
def cleanDraftData(position):
    '''
        [X] need to fill in nulls for Age with the average or the median of all the ages --> opted for the median
    '''
    # assumes an existing SparkSession named `spark`
    from pyspark.ml.feature import Imputer
    from pyspark.sql.types import DoubleType

    unCleanData = spark.read.format("csv").option("header", "true").option(
        "inferSchema", "true").load("./data/NflDraftData/draftData.csv")

    # drop columns we don't need
    unCleanData = unCleanData.select("Rnd", "Pick", "Player Name", "Pos",
                                     'Age', 'College', 'Draft Year')

    if position in ("RB", "QB", "WR"):
        unCleanData = unCleanData.where(unCleanData["Pos"] == position)
    else:  # Return all of the offensive skill players (WR, RB, TE, QB, FB)
        # drop linemen (offense and defense) as well as defensive players and special teams
        droppedPositions = [
            'DE', 'DT', 'T', 'O', 'G', 'C', 'K', 'NT', 'DL', 'OL', 'LS', 'LB',
            'DB', 'P', 'OLB', 'CB', 'S', 'ILB'
        ]  # With only offensive players we are down to 2000 data points
        unCleanData = unCleanData.where(
            ~unCleanData["Pos"].isin(droppedPositions))

    # Cast values to doubles (the Imputer requires float or double columns)
    doubleCols = ['Age', 'Rnd', 'Pick', 'Draft Year']
    for c in doubleCols:
        unCleanData = unCleanData.withColumn(c,
                                             unCleanData[c].cast(DoubleType()))

    # Used to fill in null values with the median
    imputer = Imputer(inputCols=["Age"], outputCols=["Age"])
    cleanData = imputer.setStrategy("median").fit(unCleanData).transform(
        unCleanData)
    return cleanData
Code example #3
def impute_missing(df, columns, out_cols, strategy='mean'):
    """
    Imputes missing data from specified columns using the mean or median.

    Parameters
    ----------
    columns : List of columns to be analyzed.
    out_cols: List of output columns with missing values imputed.
    strategy: String that specifies how missing data is computed. Can be "mean" or "median".

    return  : DataFrame with columns that hold the imputed values.
    """

    # Check that the columns to be processed are in the dataframe
    assert_cols_in_df(df, columns_provided=columns, columns_df=df.columns)

    assert isinstance(columns, list), "Error: columns argument must be a list"
    assert isinstance(out_cols,
                      list), "Error: out_cols argument must be a list"

    # Check that the strategy argument is a string:
    assert_type_str(df, strategy, "strategy")

    assert (
        strategy == "mean" or strategy == "median"
    ), "Error: strategy has to be 'mean' or 'median'. 'mean' is default"

    imputer = Imputer(inputCols=columns, outputCols=out_cols)
    model = imputer.setStrategy(strategy).fit(df)
    df = model.transform(df)

    return df
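A hypothetical call, assuming a DataFrame df with a numeric age column (assert_cols_in_df and assert_type_str are project-specific helpers; the column names here are illustrative):

imputed = impute_missing(df, columns=['age'], out_cols=['age_imputed'],
                         strategy='median')
imputed.select('age', 'age_imputed').show(5)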
Code example #4
def imputeMonthlyIncome(df):
    imputer = Imputer(inputCols=['MonthlyIncome'],
                      outputCols=['imputed_MonthlyIncome'],
                      strategy='median')

    # Columns are required to be either double or float by the Imputer...
    df = df.withColumn(
        'double_MonthlyIncome',
        df.MonthlyIncome.cast(DoubleType())
    ).drop('MonthlyIncome') \
     .withColumnRenamed('double_MonthlyIncome', 'MonthlyIncome')

    df = imputer.fit(df).transform(df).drop('MonthlyIncome')

    df = df.withColumnRenamed('imputed_MonthlyIncome', 'MonthlyIncome')

    # Addressing MonthlyIncome of 0
    incomeMedian = np.median(df.select('MonthlyIncome').collect())

    # Apply the income median when MonthlyIncome is 0
    df = df.withColumn(
        'MonthlyIncome',
        F.when((F.col('MonthlyIncome') == 0),
               incomeMedian).otherwise(F.col('MonthlyIncome')))

    return df
Code example #5
File: test_imputer.py Project: xadupre/onnxmltools
    def _imputer_test_single(self):
        data = self.spark.createDataFrame([(1.0, float("nan")),
                                           (2.0, float("nan")),
                                           (float("nan"), 3.0), (4.0, 4.0),
                                           (5.0, 5.0)], ["a", "b"])
        imputer = Imputer(inputCols=["a"], outputCols=["out_a"])
        model = imputer.fit(data)

        # the input name should match the inputCols above
        model_onnx = convert_sparkml(model, 'Sparkml Imputer',
                                     [('a', FloatTensorType([None, 1]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.select("out_a").toPandas().values.astype(
            numpy.float32)
        data_np = data.toPandas().a.values.astype(numpy.float32)
        data_np = data_np.reshape((-1, 1))
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlImputerSingle")
        onnx_model_path = paths[-1]
        output, output_shapes = run_onnx_model(['out_a'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Code example #6
File: gen_tfrecords.py Project: zhongqin1/recsys
def preprocess(df):
    cont_col = ['_c{0}'.format(i) for i in range(0, 14)]
    for i in cont_col:
        df = df.withColumn(i, df[i].cast("float"))
    # Continuous columns fill null with mean
    imputer = Imputer(inputCols=cont_col, outputCols=cont_col).setStrategy('mean')

    return imputer.fit(df).transform(df)
Code example #7
File: Semma.py Project: gatecesar/CRM_SEMMA_MEXICO
def imputaciones(VarLimpias):
    C = [i[0] for i in VarLimpias.dtypes if 'string' in i[1]]
    I = [i[0] for i in VarLimpias.dtypes if 'int' in i[1]]

    for f in I:
        VarLimpias = VarLimpias.withColumn(f, VarLimpias[f].cast(DoubleType()))
    imputer = Imputer(inputCols=[c for c in VarLimpias.columns if c not in C],
                      outputCols=[c for c in VarLimpias.columns if c not in C])
    Pba = imputer.fit(VarLimpias)
    return Pba.transform(VarLimpias)
Code example #8
    def imputation(self):
        C=[i[0] for i in self.data.dtypes if 'string' in i[1]]
        I=[i[0] for i in self.data.dtypes if 'int' in i[1]]

        for f in I:
            self.data = self.data.withColumn(f, self.data[f].cast(DoubleType()))
        imputer = Imputer(
            inputCols=[c for c in self.data.columns if c not in C],
            outputCols=[c for c in self.data.columns if c not in C])
        Pba=imputer.fit(self.data)
        return Pba.transform(self.data)
Code example #9
    def na_imputer(self, strategy, out_columns="*", na=None, columns="*"):
        """
        replace missing value with mean or median according to users' choice
        user can also customize the definition of missing value, e.g. 999
        the default missing value is 'nan' or null
        the default setting of out_columns is just columns, so the original columns will be overrided if not specially defined
        """

        #check columns
        if columns == "*":
            columns = self._df.schema.names
        elif isinstance(columns, str):
            columns = [columns]
        else:
            assert isinstance(
                columns,
                list), "Error: columns argument must be a string or a list!"

        if out_columns == "*":
            out_columns = self._df.schema.names

        #check output columns
        if isinstance(out_columns, str):
            out_columns = [out_columns]
        else:
            assert isinstance(
                out_columns, list
            ), "Error: output columns argument must be a string or a list!"

        #check input and output columns have consistent lengths
        assert len(columns) == len(
            out_columns
        ), "Error: inconsistent lengths for argument of columns list and output columns list"

        #check strategy argument
        assert (strategy == "mean" or strategy
                == "median"), "Error: strategy can only be 'mean' or 'median'."

        # first convert the input columns to FloatType for the Imputer
        for col in columns:
            self._df = self._df.withColumn(col,
                                           self._df[col].cast(FloatType()))

        #fit the model
        imputer = Imputer(inputCols=columns, outputCols=out_columns)

        if na is None:
            model = imputer.setStrategy(strategy).fit(self._df)
        else:
            model = imputer.setStrategy(strategy).setMissingValue(na).fit(
                self._df)

        self._df = model.transform(self._df)

        return self._df
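A hypothetical call, assuming the surrounding class wraps a DataFrame in self._df and is instantiated as prep (all names are illustrative); here 999 is treated as the missing-value marker:

cleaned = prep.na_imputer('median', na=999, columns=['score'],
                          out_columns=['score_imputed'])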
Code example #10
def imputer_continuous_features(df, data_types_map):
    continuous_features = list(
        set(data_types_map['DoubleType']) - set(['DEP_DEL15']))
    continuous_features_imputed = [
        var + "_imputed" for var in continuous_features
    ]
    imputer = Imputer(inputCols=continuous_features,
                      outputCols=continuous_features_imputed)
    tmp = imputer.fit(df).transform(df)
    get_missing_info(tmp)
    return [imputer]
Code example #11
def imputers(dataframe):
    inputCols = []
    outputCols = []
    for i in range(1, 14):
        feature = 'I-' + str(i)
        dataframe = dataframe.withColumn(feature, dataframe[feature].cast(DoubleType()))
        inputCols.append(feature)
        outputCols.append(feature)
    imputer = Imputer(strategy="mean",
        inputCols=inputCols,
        outputCols=outputCols)
    return imputer.fit(dataframe).transform(dataframe)
Code example #12
    def fill_na_numerical(self,data,columns):
        '''
        FILL NULL VALUES FOR NUMERICAL DATA
        args:
        1.data: <SPARK DATAFRAME> actual spark dataframe
        2.columns: <LIST> of numerical columns we want to Impute

        return: <SPARK DATAFRAME>Imputed spark dataframe
        '''
        columns = list(columns)
        imputer = Imputer(inputCols=columns,
                          outputCols=['imputed_' + str(col) for col in columns])
        dataCopy = imputer.fit(data).transform(data)
        return dataCopy
Code example #13
    def fill_na_numerical(self, data, columns):
        '''
        Purpose: Fill null values for numerical data
        Inputs : Data(spark dataframe), column(numerical columns)
        Output : Imputed spark dataframe

        '''
        columns = list(columns)
        imputer = Imputer(
            inputCols=columns,
            outputCols=['imputed_' + str(col) for col in columns])
        dataCopy = imputer.fit(data).transform(data)
        return dataCopy
Code example #14
def imputer_usecase():
    """
        用于计算数据集中的缺失值,使用指定的策略进行数据填充,
        strategy指定数据填充策略,
    """
    spark = getSparkSession()
    df = spark.createDataFrame([(1.0, float("nan")), (2.0, float("nan")),
                                (float("nan"), 3.0), (4.0, 4.0), (5.0, 5.0)],
                               ["a", "b"])

    imputer = Imputer(inputCols=["a", "b"], outputCols=["out_a", "out_b"])
    model = imputer.fit(df)

    model.transform(df).show()
Code example #15
    def preprocessing(self):

        model = GBTRegressor(labelCol="bicycle_rentals")

        cols = [
            "part_time", "holiday", "week_days", "weather_description_mf",
            "month"
        ]

        imputer = Imputer(inputCols=["humidity", "pressure"],
                          outputCols=["humidity_input", "pressure_input"])

        indexers = [
            StringIndexer(inputCol=col, outputCol="{0}_indexed".format(col))
            for col in cols
        ]

        assembler = VectorAssembler(inputCols=[
            "part_time_indexed", "holiday_indexed", "month_indexed",
            "week_days_indexed", "weather_description_mf_indexed",
            "humidity_input", "pressure_input", "temperature", "wind_speed",
            "from_station_id", "mean_dpcapacity_start", "mean_dpcapacity_end",
            "sum_subscriber", "sum_customer"
        ],
                                    outputCol="features")

        pipeline = Pipeline(stages=[imputer] + indexers + [assembler] +
                            [model])

        return pipeline
Code example #16
File: app.py Project: mledl/BDMA_HW
def preprocess_data(df):
    # Preprocessing the data
    # Dimension reduction
    cols_reduce = [
        'Date', 'Time', 'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3'
    ]
    df = df.drop(*cols_reduce)

    # Fixing missing values (dataset uses ? as NaN for missing values)
    imputer = Imputer(inputCols=df.columns, outputCols=df.columns)
    imputer.setStrategy("mean")
    df = imputer.fit(df).transform(df)

    # Print the column name and datatype
    print(df.dtypes)
    return df
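Note that the Imputer only treats NaN/null (or a single numeric missingValue) as missing, so the '?' placeholders mentioned in the comment must be converted to nulls and the columns cast to a numeric type before imputing. A hedged sketch (the function name and column handling are illustrative, not part of the project):

def replace_question_marks(df, cols):
    from pyspark.sql import functions as F
    # turn '?' strings into nulls, then cast to double so the Imputer accepts the columns
    for c in cols:
        df = df.withColumn(
            c,
            F.when(F.col(c) == '?', F.lit(None)).otherwise(F.col(c)).cast('double'))
    return df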
Code example #17
def get_ml1_pipeline():
    stages = []

    imputer = Imputer(inputCols=ML1_NUMERICAL_COLUMNS, outputCols=ML1_NUMERICAL_COLUMNS)
    stages.append(imputer)

    ohe_input_cols = []
    ohe_output_cols = []
    for categorical_column in ML1_CATEGORICAL_COLUMNS:
        str_indexer = StringIndexer(inputCol=categorical_column, outputCol=categorical_column + "_index", handleInvalid='keep')
        ohe_input_cols.append(str_indexer.getOutputCol())
        ohe_output_cols.append(categorical_column + "_class_vec")
        stages.append(str_indexer)

    encoder = OneHotEncoderEstimator(inputCols=ohe_input_cols, outputCols=ohe_output_cols, handleInvalid="error", dropLast=False)
    stages.append(encoder)

    numerical_vector_assembler = VectorAssembler(inputCols=ML1_NUMERICAL_COLUMNS, outputCol="numerical_cols_vec", handleInvalid="keep")
    scaler = MinMaxScaler(inputCol="numerical_cols_vec", outputCol="scaled_numerical_cols")
    stages.append(numerical_vector_assembler)
    stages.append(scaler)

    label_str_indexer = StringIndexer(inputCol="result", outputCol="label", handleInvalid="keep")
    stages.append(label_str_indexer)

    assembler_input = encoder.getOutputCols() + [scaler.getOutputCol()]
    assembler = VectorAssembler(inputCols=assembler_input, outputCol="features", handleInvalid="skip")
    stages.append(assembler)

    pipeline = Pipeline(stages=stages)
    return pipeline
Code example #18
def _fit_crossvalidator(train, features, target):
    """
  Helper function that fits a CrossValidator model to predict a binary label
  `target` on the passed-in training DataFrame using the columns in `features`
  :param: train: Spark DataFrame containing training data
  :param: features: List of strings containing column names to use as features from `train`
  :param: target: String name of binary target column of `train` to predict
  """
    train = train.select(features + [target])
    model_matrix_stages = [
        Imputer(inputCols=features, outputCols=features),
        VectorAssembler(inputCols=features, outputCol="features"),
        StringIndexer(inputCol="bad_loan", outputCol="label")
    ]
    lr = LogisticRegression(maxIter=10,
                            elasticNetParam=0.5,
                            featuresCol="features")
    pipeline = Pipeline(stages=model_matrix_stages + [lr])
    paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.1, 0.01]).build()
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=BinaryClassificationEvaluator(),
                              numFolds=5)
    with mlflow.start_run():
        mlflow.log_param("data_version", version_to_load)
        mlflow.log_param("data_path", DELTA_TABLE_DEFAULT_PATH)
        cvModel = crossval.fit(train)
        return cvModel.bestModel
Code example #19
    def handle_missing(self, non_feature_col=["ID", "TIME_SPAN"]):
        import pyspark
        if type(self) == data_run_experiment:
            raise NotImplementedError(
                "Method need to be called in sub-class but currently called in base class"
            )

        try:
            ret_data_frame = self.spark.read.parquet(self.temp_missing_drop)
            self.logger.info(self.temp_missing_drop)
            return ret_data_frame
        except pyspark.sql.utils.AnalysisException as ex:
            template = "An exception of type {0} occurred. Arguments:\n{1!r}"
            message = template.format(type(ex).__name__, ex.args)
            self.logger.info(message)
            self.logger.info("PROCESS")

            # impute only; aggregation will be done after adding demographics
            cur_df = self.spark.read.parquet(self.out_file_name)
            cur_cols = cur_df.columns
            categorical_cols = list()
            numerical_cols = list()
            for i in non_feature_col:
                cur_cols.remove(i)
            for i in cur_cols:
                if i.find("C_") == 0:
                    categorical_cols.append(i)
                else:
                    numerical_cols.append(i)

            cur_df = cur_df.fillna(
                0, subset=categorical_cols).repartition(400).checkpoint()
            self.logger.info(cur_df.count())

            from pyspark.ml.feature import Imputer
            imputedCols = ["imp_{0}".format(x) for x in numerical_cols]
            imputer = Imputer(inputCols=numerical_cols,
                              outputCols=imputedCols).setStrategy("mean")
            imputer_model = imputer.fit(cur_df)
            ret_data_frame = imputer_model.transform(cur_df)
            ret_data_frame.select(non_feature_col + imputedCols +
                                  categorical_cols).show()
            ret_data_frame.select(non_feature_col + imputedCols +
                                  categorical_cols).write.save(
                                      self.temp_missing_drop)
            ret_data_frame = self.spark.read.parquet(self.temp_missing_drop)
            return ret_data_frame
Code example #20
def _fit_crossvalidator(train, features, target, version):
    """
  Helper function that fits a CrossValidator model to predict a binary label
  `target` on the passed-in training DataFrame using the columns in `features`
  :param: train: Spark DataFrame containing training data
  :param: features: List of strings containing column names to use as features from `train`
  :param: target: String name of binary target column of `train` to predict
  """
    train = train.select(features + [target])
    model_matrix_stages = [
        Imputer(inputCols=features, outputCols=features),
        VectorAssembler(inputCols=features, outputCol="features"),
        StringIndexer(inputCol="bad_loan", outputCol="label")
    ]
    lr = LogisticRegression(maxIter=10,
                            elasticNetParam=0.5,
                            featuresCol="features")
    pipeline = Pipeline(stages=model_matrix_stages + [lr])
    paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.1, 0.01]).build()
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=BinaryClassificationEvaluator(),
                              numFolds=5)

    import matplotlib.pyplot as plt
    from mlflow import spark as mlflow_spark
    from mlflow import sklearn as mlflow_sk

    mlflow.start_run()
    cvModel = crossval.fit(train)
    best_model = cvModel.bestModel

    roc = best_model.stages[len(best_model.stages) - 1].summary.roc.toPandas()
    fig1 = plt.figure()
    fig1.clf()
    plt.clf()
    plt.plot(roc['FPR'], roc['TPR'])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    fig1.savefig("roc.png")
    mlflow.log_artifact("roc.png")
    fig1.clf()
    plt.clf()

    lr_summary = best_model.stages[len(best_model.stages) - 1].summary
    mlflow.log_metric("accuracy", lr_summary.accuracy)
    mlflow.log_metric("weightedFalsePositiveRate",
                      lr_summary.weightedFalsePositiveRate)
    mlflow.log_metric("weightedFalsePositiveRate",
                      lr_summary.weightedFalsePositiveRate)
    mlflow.log_metric("weightedFMeasure", lr_summary.weightedFMeasure())
    mlflow.log_metric("weightedPrecision", lr_summary.weightedPrecision)
    mlflow.log_metric("weightedRecall", lr_summary.weightedRecall)

    mlflow_spark.log_model(best_model, "loan-classifier-mllib")
    mlflow.end_run()
    return best_model
Code example #21
def imputer_mean(df):

    weather_numeric_with_nulls = [
        'origin_WND_speed_rate', 'origin_CIG_ceiling_height',
        'origin_VIS_distance', 'origin_TMP_air_temperature',
        'origin_DEW_dew_point_temp', 'dest_WND_speed_rate',
        'dest_CIG_ceiling_height', 'dest_VIS_distance',
        'dest_TMP_air_temperature', 'dest_DEW_dew_point_temp',
        'origin_aa1_rain_depth', 'dest_aa1_rain_depth',
        'origin_aj1_snow_depth', 'dest_aj1_snow_depth'
    ]

    imputer = Imputer(inputCols=weather_numeric_with_nulls,
                      outputCols=weather_numeric_with_nulls)
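    # Note: fitting on filter_to_train(df) and transforming the full df means the
    # imputation statistics come from the training rows only, avoiding leakage
    # from held-out data.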
    model = imputer.fit(filter_to_train(df))
    df = model.transform(df)

    return df
Code example #22
    def impute(input_cols, output_cols, strategy="mean"):
        """
        Imputes missing data from specified columns using the mean or median.
        :param input_cols: List of columns to be analyzed.
        :param output_cols: List of output columns with missing values imputed.
        :param strategy: String that specifies how missing data is computed. Can be "mean" or "median"
        :return: Dataframe object (DF with columns that hold the imputed values).
        """

        input_cols = parse_columns(self, input_cols)
        output_cols = val_to_list(output_cols)

        imputer = Imputer(inputCols=input_cols, outputCols=output_cols)

        df = self
        model = imputer.setStrategy(strategy).fit(df)
        df = model.transform(df)

        return df
Code example #23
    def preprocessing(self, trainDF, validDF, testDF):
        """
        Data preprocessing steps involving  the following transformations:

        1. One-Hot encoding of categorical variables
        2. Imputation of missing values in numerical variables
        3. Standardization of numerical variables

        Parameters
        -----------
        trainDF: training data set
        validDF: validation data set
        testDF: test data set

        Returns
        -----------
        Transformed training and test data sets with the assembler vector
        """
        # Extract numerical and categorical column names
        cat_cols = [field for (field, dataType) in trainDF.dtypes if dataType == "string"]
        num_cols = [field for (field, dataType) in trainDF.dtypes if ((dataType == "double") & \
                    (field != self.label_col))]

        # Create output columns
        index_output_cols = [x + "Index" for x in cat_cols]
        ohe_output_cols = [x + "OHE" for x in cat_cols]
        # num_output_cols = [x + "scaled" for x in num_cols]

        # string indexer for categorical variables
        s_indexer = StringIndexer(inputCols = cat_cols, outputCols = index_output_cols, 
                                    handleInvalid="skip")

        # One-hot code categorical columns
        cat_encoder = OneHotEncoder(inputCols = index_output_cols, outputCols = ohe_output_cols)

        # Impute missing values in numerical columns
        num_imputer = Imputer(inputCols = num_cols, outputCols = num_cols)

        # Vector assembler
        assembler_inputs = ohe_output_cols + num_cols
        assembler = VectorAssembler(inputCols = assembler_inputs, outputCol = "unscaled_features")

        # Features scaling using StandardScaler
        scaler = StandardScaler(inputCol = assembler.getOutputCol(), outputCol = "features")
        
        # Create pipeline
        stages = [s_indexer, cat_encoder, num_imputer, assembler, scaler]
        pipeline = Pipeline(stages = stages)
        pipelineModel = pipeline.fit(trainDF)

        # Preprocess training and test data
        trainDF_scaled = pipelineModel.transform(trainDF)
        validDF_scaled = pipelineModel.transform(validDF)
        testDF_scaled = pipelineModel.transform(testDF)
        return assembler, trainDF_scaled, validDF_scaled, testDF_scaled
Code example #24
    def missing_val_imput(self):
        check = self.input.select(*(sum(col(c).isNull().cast("int")).alias(c)
                                    for c in self.input.columns))
        check.show()
        print("||| Above table shows missing values accross columns |||")
        check_pd = self.input.toPandas()
        val = check_pd.isnull().any().any()

        if val:
            imputer = Imputer(
                inputCols=self.input.columns,
                outputCols=["{}".format(c) for c in self.input.columns])
            cleaned_input = imputer.fit(self.input).transform(self.input)
            print("Missing values replaced with mean accross columns")
            print("Returning cleaned data")
            return cleaned_input

        else:
            print("No missing value found")
            return self.input
Code example #25
    def replace_missings(self, test=False):
        """
        Replace missing values with a default value
        """

        for col in list(self.config_dict.keys()):
            # check if the replace missing transformation needs to be applied
            if self.config_dict[col]["replace_missings"]["apply"]:
                imputer = Imputer(
                    inputCols=[col],
                    outputCols=[
                        "{}_replace_missings".format(col)
                    ]).setMissingValue(
                        self.config_dict[col]["replace_missings"]["value"])
                if test:
                    self.test_data = imputer.fit(self.test_data).transform(
                        self.test_data)
                else:
                    self.train_data = imputer.fit(self.train_data).transform(
                        self.train_data)
Code example #26
def imputeNumeric(numeric_DF):
    '''
    takes a spark df with continuous numeric columns
    outputs a spark df where all null values are replaced with the column average

    the first column, which holds the outcome values, is preserved
    '''
    outputColumns=["{}".format(c) for c in numeric_DF.columns[1:11]]
    catColumns = ["{}".format(c) for c in numeric_DF.columns[11:]]

    imputer = Imputer(
        inputCols=numeric_DF.columns[1:11],
        outputCols=["{}".format(c) for c in numeric_DF.columns[1:11]]
    )

    model = imputer.fit(numeric_DF)

    imputedDF = model.transform(numeric_DF).select(['_1']+outputColumns+catColumns)

    return imputedDF
Code example #27
    def impute(self):
        from pyspark.ml.feature import Imputer, ImputerModel

        df = self.session.createDataFrame([(1.0, float("nan")),
                                           (2.0, float("nan")),
                                           (float("nan"), 3.0), (4.0, 4.0),
                                           (5.0, 5.0)], ["a", "b"])

        # the mean is used for filling by default
        imputer = Imputer(inputCols=["a", "b"], outputCols=["out_a", "out_b"])
        model = imputer.fit(df)
        model.transform(df).show()

        # we can also set the strategy to median and specify which values count as missing
        # null is automatically treated as a missing value
        imputer = Imputer(inputCols=["a", "b"],
                          outputCols=["out_a", "out_b"],
                          strategy="median",
                          missingValue=float("nan"))
        model = imputer.fit(df)
        model.transform(df).show()

        ## the fit step can be seen as a learning step, and we can persist its result
        ## unfortunately, there is currently no way to change the parameters afterwards
        model.write().overwrite().save("/tmp/wow")
        model = ImputerModel.read().load("/tmp/wow")
        model.transform(df).show()
Code example #28
    def main(self, sc, *args):
        """ For each input files, i.e. train and test 'initiated, apply the same set of transformatons
        """

        sqlContext = SQLContext(sc)
        # For each key in the output dictionary of the Initiate task, i.e. train and test
        for inputFile in Initiate(self.input_file, self.output_path).output():
            df = sqlContext.read.csv(Initiate(
                self.input_file, self.output_path).output()[inputFile].path,
                                     sep=",",
                                     header=True,
                                     inferSchema=True)

            # Select final list of features
            list_features = ["Age", "Sex_indexed", "Fare", "Survived"]
            df = df.select(*list_features)

            # Replace missing values
            cols_missing = ["Age"]
            for col in cols_missing:
                imputer = Imputer(inputCols=[col],
                                  outputCols=[
                                      "{}_replace_missings".format(col)
                                  ]).setMissingValue(26.0)
                df = imputer.fit(df).transform(df)

            # Discretize
            cols_disc = {
                "Age_replace_missings":
                [-math.inf, 0.83, 21.0, 26.0, 33.0, 71.0, math.inf],
                "Fare": [-math.inf, 7.225, 8.122, 26.0, 83.475, math.inf],
            }
            for col in cols_disc:
                bucketizer = Bucketizer(splits=cols_disc[col],
                                        inputCol=col,
                                        outputCol="{}_discretized".format(col))
                df = bucketizer.transform(df)

            df.write.csv(self.output()[inputFile].path, header=True)
Code example #29
File: columns.py Project: marcelomata/Optimus
    def impute(columns, strategy="mean"):
        """
        Imputes missing data from specified columns using the mean or median.
        :param columns: List of columns to be analyze.
        :param strategy: String that specifies the way of computing missing data. Can be "mean" or "median"
        :return: Dataframe object (DF with columns that has the imputed values).
        """

        columns = parse_columns(self, columns, filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)

        df = self
        output_cols = []
        for col_name in columns:
            # Imputer require not only numeric but float or double
            df = df.cols.cast(col_name, "float")
            output_cols.append(col_name + IMPUTE_SUFFIX)

        imputer = Imputer(inputCols=columns, outputCols=output_cols)

        model = imputer.setStrategy(strategy).fit(df)
        df = model.transform(df)

        return df
Code example #30
    def imputer(features_name, strategy="mean", missing_value=None, footer="_imputer"):
        """
        Spark experiment method
        Args:
            features_name:
            strategy:
            missing_value:
            footer:

        Returns:

        """
        output_names = [name+footer for name in features_name]

        imputer = Imputer() \
            .setInputCols(features_name) \
            .setOutputCols(output_names)\
            .setStrategy(strategy)

        if missing_value is not None:
            imputer.setMissingValue(missing_value)

        return imputer
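A hypothetical usage of this factory (the DataFrame and column names are illustrative):

stage = imputer(['age', 'income'], strategy='median', missing_value=-1.0)
df_imputed = stage.fit(df).transform(df)  # adds age_imputer and income_imputer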
Code example #31
"""
An example demonstrating Imputer.
Run with:
  bin/spark-submit examples/src/main/python/ml/imputer_example.py
"""
# $example on$
from pyspark.ml.feature import Imputer
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("ImputerExample")\
        .getOrCreate()

    # $example on$
    df = spark.createDataFrame([
        (1.0, float("nan")),
        (2.0, float("nan")),
        (float("nan"), 3.0),
        (4.0, 4.0),
        (5.0, 5.0)
    ], ["a", "b"])

    imputer = Imputer(inputCols=["a", "b"], outputCols=["out_a", "out_b"])
    model = imputer.fit(df)

    model.transform(df).show()
    # $example off$

    spark.stop()