def impute_missing_values(self, data):
        """
        Method Name: impute_missing_values
        Description: Replaces all the missing values (np.nan) in the Dataframe
                     using a KNN Imputer (3 neighbours, uniform weights).
                     The input is kept on self.data, the imputed nd-array on
                     self.new_array and the rebuilt Dataframe on self.new_data.
        Output: A Dataframe carrying the input's column labels which has all
                the missing values imputed. The input's index is not passed
                on, so the result gets pandas' default index.
        On Failure: Logs the error and raises Exception, chained to the
                    original error so its message and traceback stay visible.
        """
        self.logger_object.log(self.file_object, 'Entered the impute_missing_values method of the Preprocessor class')
        self.data = data
        try:
            imputer = KNNImputer(n_neighbors=3, weights='uniform', missing_values=np.nan)
            self.new_array = imputer.fit_transform(self.data)  # impute the missing values
            # convert the nd-array returned in the step above to a Dataframe
            self.new_data = pd.DataFrame(data=self.new_array, columns=self.data.columns)
            self.logger_object.log(self.file_object, 'Imputing missing values Successful. Exited the impute_missing_values method of the Preprocessor class')
            return self.new_data
        except Exception as e:
            self.logger_object.log(self.file_object,'Exception occured in impute_missing_values method of the Preprocessor class. Exception message:  ' + str(e))
            self.logger_object.log(self.file_object,'Imputing missing values failed. Exited the impute_missing_values method of the Preprocessor class')
            # Same exception type as before, but no longer empty: keep the
            # message and chain the root cause for whoever handles it upstream.
            raise Exception('Imputing missing values failed: ' + str(e)) from e
Ejemplo n.º 2
0
    def impute_feature(data, feature):
        """Impute missing values of one feature across counties with KNN.

        The feature is pivoted into a county x date table, the table is
        completed with a 5-neighbour ``KNNImputer`` and the imputed values are
        written back into ``data`` (which is modified in place and returned).

        Args:
            data: DataFrame with at least the columns ``county_fips``,
                ``date`` and ``feature``.
            feature: name of the column to impute.

        Returns:
            The same ``data`` object with ``feature`` imputed. Counties that
            had no observed value at all keep NaN for every date.

        Note:
            Values are moved between ``data`` and the pivot table positionally
            (plain lists), so this assumes every date has one row per county,
            in the same county order — TODO confirm against the caller.
        """
        # Negative readings are treated as missing. ``np.nan`` is used because
        # the ``np.NaN`` alias no longer exists in NumPy 2.x.
        data.loc[data[feature] < 0, feature] = np.nan
        value_count = data.groupby('county_fips').count()
        counties_with_all_nulls = value_count[value_count[feature] == 0]
        dates = data['date'].unique()
        temp = pd.DataFrame(index=data['county_fips'].unique().tolist(),
                            columns=dates.tolist())

        # One column per date, one row per county.
        for i in dates:
            temp[i] = data.loc[data['date'] == i, feature].tolist()
        X = np.array(temp)
        imputer = KNNImputer(n_neighbors=5)
        imp = imputer.fit_transform(X)
        imp = pd.DataFrame(imp)
        imp.columns = temp.columns
        imp.index = temp.index
        for i in dates:
            data.loc[data['date'] == i, feature] = imp[i].tolist()
        # Counties without a single observation were filled purely from their
        # neighbours; restore them to missing rather than keep those values.
        if len(counties_with_all_nulls) > 0:
            data.loc[data['county_fips'].isin(counties_with_all_nulls.index),
                     feature] = np.nan
        return data
Ejemplo n.º 3
0
def fill_data(train_data, test_data):
    """Impute missing values in both splits with a KNN imputer fitted on train.

    A 3-neighbour, distance-weighted ``KNNImputer`` is fitted on
    ``train_data`` only and then applied to ``train_data`` and ``test_data``.

    Returns:
        Tuple ``(train, test)`` holding the two transformed data sets.
    """
    knn = KNNImputer(weights='distance', n_neighbors=3)
    knn.fit(train_data)
    return knn.transform(train_data), knn.transform(test_data)
Ejemplo n.º 4
0
def knn_impute(bigarray):
    """Fill incomplete rows of ``bigarray`` by KNN imputation against samples.

    Sample rows obtained from ``get_sample_rows`` are stacked and completed
    first. Every row for which ``rowprop(row, 0)`` equals ``False`` is then
    stacked on top of the completed samples, imputed together with them
    (5 neighbours, distance weighting) and written back into ``bigarray``.

    Start/finish banners with timestamps and a progress line every 50000 rows
    are printed. ``bigarray`` is modified in place and also returned.
    """
    print('*STARTING IMPUTING*')
    print(datetime.datetime.now())

    # Complete the sample rows so they can act as neighbours.
    sample_stack = np.vstack((get_sample_rows(bigarray)[:]))
    complete_samples = KNNImputer(
        weights='distance', n_neighbors=5).fit_transform(sample_stack)

    total_rows = len(bigarray)
    for i in range(total_rows):
        if i % 50000 == 0:
            print(f"Imputing row - {i}")
        needs_fill = rowprop(bigarray[i], 0) == False  # row is not full
        if needs_fill:
            # The current row goes first so it can be read back at index 0.
            row_with_samples = np.vstack((bigarray[i], complete_samples))
            row_imputer = KNNImputer(weights='distance', n_neighbors=5)
            bigarray[i] = row_imputer.fit_transform(row_with_samples)[0]

    print('*FINISHED IMPUTING*')
    print(datetime.datetime.now())

    return bigarray
Ejemplo n.º 5
0
def knn_impute_by_item(matrix, valid_data, k):
    """Impute missing entries by item (question) similarity and score them.

    The matrix is transposed before being handed to a ``KNNImputer`` with
    ``k`` neighbours and transposed back afterwards; the completed matrix is
    then scored with ``sparse_matrix_evaluate``.

    :param matrix: 2D matrix with missing entries
    :param valid_data: A dictionary {user_id: list, question_id: list,
    is_correct: list}
    :param k: int, number of neighbours
    :return: float, the accuracy on valid_data
    """
    # KNNImputer measures distance with its default NaN-aware Euclidean metric.
    item_imputer = KNNImputer(n_neighbors=k)
    imputed_by_item = item_imputer.fit_transform(matrix.transpose())
    completed = imputed_by_item.transpose()
    return sparse_matrix_evaluate(valid_data, completed)
Ejemplo n.º 6
0
def knn_impute_by_item(matrix, valid_data, k):
    """Impute missing entries by item (question) similarity and score them.

    The transposed matrix is completed with a ``k``-neighbour ``KNNImputer``,
    transposed back and scored with ``sparse_matrix_evaluate``. The resulting
    accuracy is printed and returned.

    :param matrix: 2D matrix with missing entries
    :param valid_data: A dictionary {user_id: list, question_id: list,
    is_correct: list}
    :param k: int, number of neighbours
    :return: float, the accuracy on valid_data
    """
    completed = KNNImputer(n_neighbors=k).fit_transform(matrix.T).T
    acc = sparse_matrix_evaluate(valid_data, completed)
    report = "Validation Accuracy Item_based with k = {} : {}".format(k, acc)
    print(report)
    return acc
def imputer(df, numerical, binary):
    """Impute binary columns by most-frequent value and numerical ones by KNN.

    Works on a copy of ``df``.

    Args:
        df: source DataFrame.
        numerical: column labels imputed with a default ``KNNImputer``.
        binary: column labels imputed with a most-frequent ``SimpleImputer``.

    Returns:
        Tuple ``(imputed_df, fitted_simple_imputer, fitted_knn_imputer)``.
    """
    result = df.copy()

    # Take both raw value arrays before anything is written back.
    numerical_values = result[numerical].values
    binary_values = result[binary].values

    # Binary features: replace NaN with the most frequent value.
    mode_imputer = SimpleImputer(strategy='most_frequent',
                                 missing_values=np.nan).fit(binary_values)
    binary_filled = mode_imputer.transform(binary_values)

    # Numerical features: KNN imputation with default settings.
    knn_imputer = KNNImputer().fit(numerical_values)
    numerical_filled = knn_imputer.transform(numerical_values)

    # Put the arrays back under their original column labels and index.
    result[binary] = binary_filled
    result[numerical] = numerical_filled

    return result, mode_imputer, knn_imputer
Ejemplo n.º 8
0
def knn_missings(df, n_ngb=3):
    """
    Impute NaN values in the numeric columns of a dataframe with KNN.

    The numeric columns are selected with ``num_columns``; a ``KNNImputer``
    is fitted on them and the imputed values are written into a copy of the
    dataframe, which is returned.

    Params:
        df = dataframe.
        n_ngb = number of neighbors of KNN, by default 3.
    """
    result = df.copy()
    numeric_cols = num_columns(result)

    knn = KNNImputer(n_neighbors=n_ngb)
    knn.fit(df[numeric_cols])
    result[numeric_cols] = knn.transform(result[numeric_cols])

    return result
Ejemplo n.º 9
0
def impute_last_new_job(df, cat_var):
    """Impute ``last_new_job`` with KNN and replace it by one-hot bin columns.

    Steps:
      1. Map the labels ``'never'`` -> 0 and ``'>4'`` -> 5.
      2. Drop the ``cat_var`` columns and run a default ``KNNImputer`` over
         the remaining columns.
      3. Cut the imputed ``last_new_job`` into six bins with edges
         -1, 0, 1, 2, 3, 4, 5 and one-hot encode the bin labels.
      4. Return the frame without ``last_new_job`` but with the six
         ``lnj_*`` dummy columns appended.

    Args:
        df: DataFrame containing a ``last_new_job`` column.
        cat_var: column labels to exclude from the KNN imputation.

    Returns:
        A new DataFrame; the caller's ``df`` is left untouched.
    """
    # Work on a copy so the recoding below does not leak into the caller's
    # frame (the original overwrote the input's ``last_new_job`` column).
    df = df.copy()
    df['last_new_job'] = df['last_new_job'].replace(['never'], 0)
    df['last_new_job'] = df['last_new_job'].replace(['>4'], 5)

    df1 = df.drop(cat_var, axis=1)
    imputer = KNNImputer()
    df1_imputed = imputer.fit_transform(df1)
    df1_imputed = pd.DataFrame(df1_imputed,
                               index=df1.index,
                               columns=df1.columns)

    bins = np.linspace(-1, 5, 7)
    labels = [
        'lnj_zero', 'lnj_one', 'lnj_two', 'lnj_three', 'lnj_four', 'lnj_five'
    ]
    df1_imputed['lnj_bins'] = pd.cut(df1_imputed['last_new_job'],
                                     bins=bins,
                                     labels=labels)
    df2 = pd.get_dummies(df1_imputed['lnj_bins'])
    df = df.drop(['last_new_job'], axis=1)
    df = pd.concat([df, df2], axis=1)
    return df
def get_imputed(from_depth=0, to_depth=2, mode=MODE_MEAN):
    """Build a daily table from the CSV files in ``base_path`` and impute gaps.

    For every ``*.csv`` file in ``base_path`` the rows whose ``depth`` lies in
    ``[from_depth, to_depth]`` are kept, indexed by their ``time`` column and
    aggregated per calendar day according to ``mode``. The ``value`` column is
    renamed to the file name without ``.csv`` and outer-merged into a frame
    indexed by the days from ``FROM_CUTOFF`` to ``TO_CUTOFF``. Remaining gaps
    are filled with a default ``KNNImputer``.

    Args:
        from_depth: lower depth bound (inclusive).
        to_depth: upper depth bound (inclusive).
        mode: one of ``MODE_MAX``, ``MODE_MIN``, ``MODE_MEDIAN``,
            ``MODE_MEAN``; selects the daily aggregation.

    Returns:
        DataFrame with one column per non-empty CSV file, imputed.

    Raises:
        Exception: if ``mode`` is not one of the known modes (only checked
            once a file with rows in the depth range is encountered).
    """
    out = pd.DataFrame(
        index=pd.DatetimeIndex(pd.date_range(FROM_CUTOFF, TO_CUTOFF)))
    print("OUT:", out)

    for csv_path in base_path.glob('*.csv'):
        print(csv_path)
        with open(csv_path, 'r') as f:
            df = pd.read_csv(f)

        df = df[(df.depth >= from_depth) & (df.depth <= to_depth)]
        df.index = pd.to_datetime(df['time'])

        df = df.drop(columns=['depth'])
        df = df.drop(columns=['time'])

        if df.empty:
            # Nothing in the requested depth range for this file.
            continue
        elif mode == MODE_MAX:
            df = df.groupby(pd.Grouper(freq='D')).max()
        elif mode == MODE_MIN:
            # Was ``.max()``, which made MODE_MIN identical to MODE_MAX.
            df = df.groupby(pd.Grouper(freq='D')).min()
        elif mode == MODE_MEDIAN:
            df = df.groupby(pd.Grouper(freq='D')).median()
        elif mode == MODE_MEAN:
            df = df.groupby(pd.Grouper(freq='D')).mean()
        else:
            raise Exception(mode)

        df = df.rename(columns={'value': csv_path.name.replace('.csv', '')})
        out = pd.merge(out, df, left_index=True, right_index=True, how='outer')
        print(out)

    imputer = KNNImputer()
    out = pd.DataFrame(imputer.fit_transform(out),
                       columns=out.columns,
                       index=out.index)
    return out
Ejemplo n.º 11
0
def knn(data_mat,
        n_neighbors=5,
        weights='uniform',
        metric='nan_euclidean',
        copy=True,
        add_indicator=False):
    """
    Impute missing values with scikit-learn's KNNImputer.

    The input is copied before imputation; every keyword argument is
    forwarded unchanged to ``KNNImputer``.

    @param data_mat: numpy 2d array, missing values are represented by np.nan
    @param n_neighbors: number of neighbors
    @param weights: forwarded to KNNImputer
    @param metric: forwarded to KNNImputer
    @param copy: forwarded to KNNImputer
    @param add_indicator: forwarded to KNNImputer
    @return: numpy 2d array after imputed
    """
    working_copy = data_mat.copy()
    from sklearn.impute import KNNImputer

    settings = {
        'n_neighbors': n_neighbors,
        'weights': weights,
        'metric': metric,
        'copy': copy,
        'add_indicator': add_indicator,
    }
    return KNNImputer(**settings).fit_transform(working_copy)
Ejemplo n.º 12
0
def impute_by_age(train_df, test_df):
    """
    Impute missing data in train and test, one interview period at a time.

    For every distinct ``interview_period`` found in the train set (in sorted
    order) a KNN imputer with ``ut.neighbors`` neighbours is fitted on that
    period's train rows and applied to the matching train and test rows.
    Columns whose name matches ``subjectkey|interview|respon|relation`` are
    left untouched.

    The original author documents the periods as (presumably months):
    P1: [0; 30m]
    P2: (30; 72]
    P3: (72; 156]
    P4: (156; 204]
    P5: >204

    Parameters
    ----------
    train_df: dataframe
    test_df: dataframe
    Returns
    ------
    imputed dataframe train
    imputed dataframe test
    """
    knnimpute = KNNImputer(n_neighbors=ut.neighbors)
    excluded = re.compile('subjectkey|interview|respon|relation')
    col_n = [name for name in train_df.columns
             if excluded.search(name) is None]

    train_parts, test_parts = [], []
    for period in sorted(train_df.interview_period.unique()):
        in_train = train_df.interview_period == period
        in_test = test_df.interview_period == period
        train_slice = train_df.loc[in_train].copy()
        test_slice = test_df.loc[in_test].copy()
        # Fit on this period's train rows only, then reuse the fit for test.
        train_slice[col_n] = knnimpute.fit_transform(train_slice[col_n])
        test_slice[col_n] = knnimpute.transform(test_slice[col_n])
        train_parts.append(train_slice)
        test_parts.append(test_slice)

    return pd.concat(train_parts), pd.concat(test_parts)