Ejemplo n.º 1
0
    def impute_feature(data, feature):
        """Fill missing values of ``feature`` with KNN over a county x date grid.

        Negative values of ``feature`` are first marked as missing. The column
        is then laid out as one row per ``county_fips`` and one column per
        ``date``, imputed with ``KNNImputer(n_neighbors=5)`` and written back.
        Counties that had no observed value at all are reset to NaN afterwards.

        NOTE(review): the pivot and the write-back are positional; they assume
        every date has exactly one row per county, in the same county order
        for every date -- confirm against the caller.

        :param data: DataFrame with ``county_fips``, ``date`` and ``feature``
            columns; it is modified in place.
        :param feature: name of the column to impute.
        :return: the same ``data`` object with ``feature`` imputed.
        """
        # ``np.NaN`` was removed in NumPy 2.0; ``np.nan`` works everywhere.
        data.loc[data[feature] < 0, feature] = np.nan

        # Remember counties without a single observation: KNN would invent
        # values for them, so they are blanked again at the end.
        non_null_counts = data.groupby('county_fips').count()
        counties_with_all_nulls = non_null_counts[non_null_counts[feature] == 0]

        dates = data['date'].unique()
        temp = pd.DataFrame(index=data['county_fips'].unique().tolist(),
                            columns=dates.tolist())
        for day in dates:
            temp[day] = data.loc[data['date'] == day, feature].tolist()

        imputer = KNNImputer(n_neighbors=5)
        imp = pd.DataFrame(imputer.fit_transform(np.array(temp)))
        imp.columns = temp.columns
        imp.index = temp.index

        for day in dates:
            data.loc[data['date'] == day, feature] = imp[day].tolist()

        if len(counties_with_all_nulls) > 0:
            data.loc[data['county_fips'].isin(counties_with_all_nulls.index),
                     feature] = np.nan
        return data
Ejemplo n.º 2
0
    def impute_missing_values(self, data):
        """
        Method Name: impute_missing_values
        Description: Replaces all missing values in the DataFrame using a
                     KNN imputer (3 neighbours, uniform weights).
        Output: A new DataFrame with the same columns as ``data`` and all
                missing values imputed. The row index is not carried over.
        On Failure: Logs the error and raises ``Exception`` carrying the
                    original message, chained to the original error.

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        self.logger_object.log(
            self.file_object,
            'Entered the impute_missing_values method of the Preprocessor class'
        )
        self.data = data
        try:
            imputer = KNNImputer(n_neighbors=3,
                                 weights='uniform',
                                 missing_values=np.nan)
            # impute the missing values
            self.new_array = imputer.fit_transform(self.data)
            # convert the nd-array returned in the step above to a Dataframe
            self.new_data = pd.DataFrame(data=self.new_array,
                                         columns=self.data.columns)
            self.logger_object.log(
                self.file_object,
                'Imputing missing values Successful. Exited the impute_missing_values method of the Preprocessor class'
            )
            return self.new_data
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occured in impute_missing_values method of the Preprocessor class. Exception message:  '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Imputing missing values failed. Exited the impute_missing_values method of the Preprocessor class'
            )
            # Same exception type as before, but no longer an empty one: the
            # message and the cause stay visible to whoever handles it.
            raise Exception(str(e)) from e
Ejemplo n.º 3
0
def impute_last_new_job(df, cat_var):
    """Impute ``last_new_job`` with KNN and replace it by one-hot bin columns.

    The labels ``'never'`` and ``'>4'`` are mapped to 0 and 5. The columns in
    ``cat_var`` are left out of the imputation, the remaining columns are
    imputed with a default ``KNNImputer``, and the imputed ``last_new_job`` is
    cut into six unit-wide bins that are one-hot encoded.

    The caller's frame is not modified (the previous version overwrote its
    ``last_new_job`` column as a side effect).

    :param df: input DataFrame containing a ``last_new_job`` column.
    :param cat_var: column label(s) to exclude from the imputation.
    :return: a new DataFrame without ``last_new_job`` and with the columns
        ``lnj_zero`` ... ``lnj_five`` appended.
    """
    # ``assign`` returns a new frame, so the input stays untouched.
    df = df.assign(
        last_new_job=df['last_new_job'].replace({'never': 0, '>4': 5}))

    numeric_part = df.drop(cat_var, axis=1)
    imputer = KNNImputer()
    imputed = pd.DataFrame(imputer.fit_transform(numeric_part),
                           index=numeric_part.index,
                           columns=numeric_part.columns)

    # Edges -1, 0, ..., 5 give the intervals (-1, 0], (0, 1], ..., (4, 5].
    bins = np.linspace(-1, 5, 7)
    labels = [
        'lnj_zero', 'lnj_one', 'lnj_two', 'lnj_three', 'lnj_four', 'lnj_five'
    ]
    imputed['lnj_bins'] = pd.cut(imputed['last_new_job'],
                                 bins=bins,
                                 labels=labels)
    dummies = pd.get_dummies(imputed['lnj_bins'])

    df = df.drop(['last_new_job'], axis=1)
    return pd.concat([df, dummies], axis=1)
Ejemplo n.º 4
0
def impute_df(df):
    """Impute a mixed-type DataFrame and return a new, fully filled frame.

    Non-object columns are filled in place on ``df``: with the mean when the
    column's skew lies strictly between -1 and 1, otherwise with the median.
    Object columns are passed through the ``encode`` helper, the whole frame
    is run through ``KNNImputer(n_neighbors=2)``, the result is rounded, and
    the object columns are decoded with the stored encoders.

    NOTE(review): ``encode`` is defined elsewhere; this code only relies on
    its second return item offering ``inverse_transform``. It presumably also
    writes the encoded values back into ``df`` -- confirm.

    :param df: input DataFrame; its non-object columns are modified in place.
    :return: new DataFrame with the columns of ``df`` and a fresh RangeIndex.
    """
    text_cols = list(df.select_dtypes(include=['object']).columns)
    other_cols = list(set(df.columns) - set(text_cols))

    for name in other_cols:
        series = df[name]
        roughly_symmetric = -1 < series.skew() < 1
        fill_value = series.mean() if roughly_symmetric else series.median()
        df[name] = series.fillna(fill_value)

    # Encode every object column and keep its encoder for decoding later.
    encoders = {}
    for name in text_cols:
        encoders[name] = encode(df[name])[1]

    knn = KNNImputer(n_neighbors=2)
    filled = pd.DataFrame(np.round(knn.fit_transform(df)),
                          columns=df.columns)

    for name in text_cols:
        codes = np.array(filled[name]).reshape(-1, 1)
        filled[name] = encoders[name].inverse_transform(codes)
    return filled
Ejemplo n.º 5
0
def get_estimator():
    """Build the unfitted preprocessing + random-forest pipeline.

    The listed columns are KNN-imputed (3 neighbours, distance weights) and
    standardised; every other column is dropped. The result feeds a
    ``RandomForestRegressor`` with 10 trees of depth at most 8, registered
    under the step name ``'classifier'``.

    :return: an unfitted ``Pipeline``.
    """
    feature_columns = [
        'P_MHD', 'DAUD', 'PDD', 'PAD', 'PADHD', 'DMSUD', 'PBD',
        'Current health expenditure', 'Current health expenditure per capita',
        'Out-of-pocket expenditure', 'Unemployment',
        'School enrollment primary', 'School enrollment secondary',
        'School enrollment tertiary', 'ghs', 'media integrity',
        'military expenditure'
    ]

    knn = KNNImputer(missing_values=np.nan,
                     n_neighbors=3,
                     weights="distance")
    impute_then_scale = make_pipeline(knn, StandardScaler())
    preprocessing = ColumnTransformer(
        transformers=[('prep', impute_then_scale, feature_columns)],
        remainder='drop')

    forest = RandomForestRegressor(n_estimators=10, max_depth=8)
    return Pipeline(steps=[('prep', preprocessing), ('classifier', forest)])
Ejemplo n.º 6
0
def fit_imputed(v, train, valid):
    """Fit an L1 logistic regression on ``train`` and evaluate it on ``valid``.

    The features ``v`` are standardised and then KNN-imputed; both steps are
    fitted on the training sample only and re-applied to the validation
    sample. The regularisation strength is chosen by ``LogisticRegressionCV``
    using the module-level ``inner`` splitter and ``roc_auc_scorer``.

    :param v: feature column selection used on both frames.
    :param train: training frame with the features and a ``'y'`` column.
    :param valid: validation frame with the same columns.
    :return: dict with the fitted classifier, the training size, the
        transformed train/validation features, the targets, the predicted
        labels and the predicted probability of the positive class.
    """
    scaler = StandardScaler()
    imputer = KNNImputer()

    # Training sample: select, then scale and impute.
    raw_train = train[v]
    n_train = np.shape(raw_train)[0]
    y_train = train['y']
    X_train = imputer.fit_transform(scaler.fit_transform(raw_train))

    # Inner cross-validation over 50 log-spaced values of C.
    model = LogisticRegressionCV(cv=inner,
                                 penalty='l1',
                                 Cs=10**np.linspace(0.1, -3, 50),
                                 random_state=42,
                                 solver='liblinear',
                                 scoring=roc_auc_scorer)
    clf = model.fit(X_train, y_train)

    # Validation sample: reuse the transformers fitted above.
    y_test = valid['y']
    X_test = imputer.transform(scaler.transform(valid[v]))

    return {
        'clf': clf,
        'n_train': n_train,
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test,
        'y_pred': clf.predict(X_test),
        'y_prob': clf.predict_proba(X_test)[:, 1],
    }
def MLP_model_pred(model,df_train,df_test,sentence_vector=False):
  '''
  Predict sentiment scores for df_test with a trained MLP model.

  With sentence_vector=False the PMI and 1- to 4-gram features are added to
  both frames and the raw text columns are dropped; otherwise both frames are
  replaced by their W2V sentence embeddings. Missing values in the test
  features are KNN-imputed in 20 row chunks before prediction.

  NOTE(review): the imputer is fitted on the training features and then
  re-fitted on every test chunk, so the training fit does not influence the
  test imputation, and a chunk whose column is entirely NaN may lose that
  column. This is kept as it was; switching to imputer.transform needs
  train and test to share exactly the same columns -- confirm first.

  :param model: trained model exposing predict(X, batch_size=...).
  :param df_train: training DataFrame.
  :param df_test: test DataFrame.
  :param sentence_vector: use sentence embeddings instead of n-gram features.
  :return: pandas Series holding the first output of the model per test row.
  '''

  #Filter Data
  if not sentence_vector:
    #Feature Engineering
    df_train,df_test=PMI(df_train,df_test)
    for gram in (1,2,3,4):
      df_train,df_test=rf_ngram(df_train,df_test,gram=gram)

    # `columns=` instead of a positional axis: pandas 2.x rejects drop(labels, 1).
    df_train=df_train.drop(columns=['cashtag','spans','text','clean_text','base_text','source'])
    df_test=df_test.drop(columns=['clean_text','base_text'])
  else:
    df_train=W2V_sentence_embedding(df_train)
    df_test=W2V_sentence_embedding(df_test)

  #Split data into dependent and independent variable
  if 'sentiment score' in df_train.columns:
    X_train=df_train.drop(columns=['sentiment score'])
  else:
    X_train=df_train.copy()
  X_test=df_test.copy()

  #Impute missing values
  imputer = KNNImputer(n_neighbors=3)
  X_train=pd.DataFrame(imputer.fit_transform(X_train))

  # Split row positions rather than the frame itself: same chunk sizes as
  # np.array_split(X_test, 20), without relying on DataFrame.swapaxes.
  # Empty chunks (fewer than 20 test rows) are skipped because the imputer
  # cannot be fitted on zero samples.
  imputed_chunks=[
    pd.DataFrame(imputer.fit_transform(X_test.iloc[rows]))
    for rows in np.array_split(np.arange(len(X_test)), 20)
    if len(rows) > 0
  ]
  X_test=pd.concat(imputed_chunks,ignore_index=True)

  #Predict
  y_pred=model.predict(X_test,batch_size=32)
  y_pred=pd.Series(y_pred.tolist()).apply(lambda x: x[0])
  return y_pred
Ejemplo n.º 8
0
def knn_impute_by_item(matrix, valid_data, k):
    """ Fill in the missing values using k-Nearest Neighbors based on
    question similarity. Return the accuracy on valid_data.

    The matrix is transposed so that the imputer compares its columns with
    each other, and the imputed result is transposed back before scoring.

    :param matrix: 2D sparse matrix
    :param valid_data: A dictionary {user_id: list, question_id: list,
    is_correct: list}
    :param k: int
    :return: float
    """
    # KNNImputer uses the NaN-Euclidean distance measure by default.
    imputer = KNNImputer(n_neighbors=k)
    item_major = matrix.transpose()
    filled = imputer.fit_transform(item_major).transpose()
    return sparse_matrix_evaluate(valid_data, filled)
Ejemplo n.º 9
0
def experiment_setting_5(X, y, runs=5, missingness=0.1):
    """Score a C4.5 tree trained on KNN-imputed data, tested on complete data.

    For each run, values are removed from ``X`` with ``make_missing_random``
    and a stratified K-fold split is drawn, both seeded with the run number.
    In every fold the training rows of the corrupted matrix are KNN-imputed,
    a tree is fitted on them and scored on the untouched rows of ``X``.

    :param X: feature matrix, indexable by integer arrays.
    :param y: target vector, indexable by integer arrays.
    :param runs: number of repetitions.
    :param missingness: value forwarded to ``make_missing_random``.
    :return: list of accuracies, one per run and fold.
    """
    scores = []
    for seed in range(runs):
        np.random.seed(seed)
        corrupted = make_missing_random(X, missingness)
        folds = StratifiedKFold(shuffle=True, random_state=seed)

        for train_rows, test_rows in folds.split(X, y):
            train_features = KNNImputer().fit_transform(corrupted[train_rows])

            tree = C45DecisionTree(criterion='c45', max_depth=20)
            tree.fit(train_features, y[train_rows])

            # The test rows come from the original matrix, not the corrupted one.
            predicted = tree.predict(X[test_rows])
            scores.append(accuracy_score(predicted, y[test_rows]))

    return scores
Ejemplo n.º 10
0
def knn_impute_by_item(matrix, valid_data, k):
    """ Fill in the missing values using k-Nearest Neighbors based on
    question similarity. Return the accuracy on valid_data.

    The imputer runs on the transposed matrix; the result is transposed back
    before it is scored, and the accuracy is printed.

    :param matrix: 2D sparse matrix
    :param valid_data: A dictionary {user_id: list, question_id: list,
    is_correct: list}
    :param k: int
    :return: float
    """
    imputer = KNNImputer(n_neighbors=k)
    filled_by_item = imputer.fit_transform(matrix.T)
    acc = sparse_matrix_evaluate(valid_data, filled_by_item.T)
    print("Validation Accuracy Item_based with k = {} : {}".format(k, acc))
    return acc
Ejemplo n.º 11
0
def make_pipeline(df):
    """Build an encode-then-impute pipeline for ``df``.

    Object columns are encoded with ``CatBoostEncoder``, int64/float64 columns
    are passed through unchanged, and the encoder output is then KNN-imputed
    as a whole.

    :param df: DataFrame whose column dtypes decide the routing (through the
        ``get_types`` helper, which is indexed by dtype name here).
    :return: tuple ``(pipeline, columns)`` with the unfitted pipeline and the
        column names in the order the encoder emits them.
    """
    col_dtypes = get_types(df)
    categorical = col_dtypes['object']
    numeric = col_dtypes['int64'] + col_dtypes['float64']
    output_columns = categorical + numeric

    encoder = ColumnTransformer(
        [('categorical', CatBoostEncoder(), categorical),
         # could use passthrough=remainder, but this way makes column ordering more obvious
         ('numeric', FunctionTransformer(), numeric)
         ]
    )

    # The mask selects columns of the encoder output, so it needs one entry
    # per output column. It used to be sized by the number of rows, which
    # only matches by accident.
    # NOTE(review): assumes the encoder emits exactly one column per input
    # column -- confirm for CatBoostEncoder.
    all_columns_idx = np.full(len(output_columns), True, dtype=bool)
    imputer = ColumnTransformer(
        [('knn_imputer', KNNImputer(), all_columns_idx)]
    )

    pipeline = Pipeline(steps=[
        ('encoder', encoder),
        ('imputer', imputer),
    ])

    return pipeline, output_columns
def imputer(df, numerical, binary):
    """Impute binary and numerical columns on a copy of ``df``.

    Binary columns are filled with their most frequent value, numerical
    columns with a default ``KNNImputer``. Both imputers are fitted on the
    raw values of the corresponding columns.

    :param df: input DataFrame; it is not modified.
    :param numerical: labels of the numerical columns.
    :param binary: labels of the binary columns.
    :return: tuple ``(imputed_frame, fitted_simple_imputer, fitted_knn_imputer)``.
    """
    result = df.copy()
    binary_values = result[binary].values
    numerical_values = result[numerical].values

    # Most frequent value for the binary columns.
    s_imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    s_imp = s_imp.fit(binary_values)

    # K nearest neighbours for the numerical columns.
    KNNimp = KNNImputer().fit(numerical_values)

    # Writing back into the copy keeps the original column names and index.
    result[binary] = s_imp.transform(binary_values)
    result[numerical] = KNNimp.transform(numerical_values)

    return result, s_imp, KNNimp
Ejemplo n.º 13
0
def knn_missings(df, n_ngb=3):
    """
    Fill the NaN values of the columns selected by ``num_columns`` using a
    KNN imputer.

    The imputation is applied to a copy; the input dataframe is left
    unchanged and the filled copy is returned.

    Params:
        df = dataframe.
        n_ngb = number of neighbors of KNN, by default 3.
    """
    filled = df.copy()
    selected_cols = num_columns(filled)

    knn = KNNImputer(n_neighbors=n_ngb)
    knn.fit(df[selected_cols])
    filled[selected_cols] = knn.transform(filled[selected_cols])

    return filled
Ejemplo n.º 14
0
    def impute_missing_values(self, data):
        """
        Method Name: impute_missing_values
        Description: Replaces all missing values in the DataFrame using a
                     KNN imputer (3 neighbours, uniform weights) and rounds
                     the result to whole numbers.
        Output: A new DataFrame with the same columns as ``data`` and all
                missing values imputed. The row index is not carried over.
        On Failure: Logs the error and raises ``Exception`` carrying the
                    original message, chained to the original error.
        """
        self.logger_object.log(
            self.file_object,
            'Entered the impute_missing_values method of the Preprocessor class'
        )
        self.data = data
        try:
            imputer = KNNImputer(n_neighbors=3,
                                 weights='uniform',
                                 missing_values=np.nan)
            # impute the missing values
            self.new_array = imputer.fit_transform(self.data)
            # Convert the nd-array to a DataFrame. The values are rounded
            # because the imputer averages neighbours and can return
            # fractions, while either 0 or 1 is needed here.
            self.new_data = pd.DataFrame(data=np.round(self.new_array),
                                         columns=self.data.columns)
            self.logger_object.log(
                self.file_object,
                'Imputing missing values Successful. Exited the impute_missing_values method of the Preprocessor class'
            )
            return self.new_data
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occured in impute_missing_values method of the Preprocessor class. Exception message:  '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Imputing missing values failed. Exited the impute_missing_values method of the Preprocessor class'
            )
            # Same exception type as before, but no longer an empty one: the
            # message and the cause stay visible to whoever handles it.
            raise Exception(str(e)) from e
def get_imputed(from_depth=0, to_depth=2, mode=MODE_MEAN):
    """Build a daily table from every CSV under ``base_path`` and impute gaps.

    Each CSV is filtered to the requested depth range, aggregated to one row
    per day according to ``mode`` and outer-joined onto a daily index that
    spans ``FROM_CUTOFF`` to ``TO_CUTOFF``. Its ``value`` column is renamed to
    the file name without ``.csv``. Remaining gaps are filled with a default
    ``KNNImputer``.

    :param from_depth: smallest depth to keep (inclusive).
    :param to_depth: largest depth to keep (inclusive).
    :param mode: one of ``MODE_MAX``, ``MODE_MIN``, ``MODE_MEDIAN``,
        ``MODE_MEAN``.
    :return: DataFrame indexed by day with the imputed columns.
    :raises Exception: if ``mode`` is not one of the supported values and a
        non-empty file is encountered.
    """
    out = pd.DataFrame(
        index=pd.DatetimeIndex(pd.date_range(FROM_CUTOFF, TO_CUTOFF)))
    print("OUT:", out)

    for json_path in base_path.glob('*.csv'):
        print(json_path)
        with open(json_path, 'r') as f:
            df = pd.read_csv(f)

        df = df[(df.depth >= from_depth) & (df.depth <= to_depth)]
        df.index = pd.to_datetime(df['time'])
        df = df.drop(columns=['depth', 'time'])

        if df.empty:
            continue

        daily = df.groupby(pd.Grouper(freq='D'))
        if mode == MODE_MAX:
            df = daily.max()
        elif mode == MODE_MIN:
            # This branch used to call max(), so MODE_MIN returned maxima.
            df = daily.min()
        elif mode == MODE_MEDIAN:
            df = daily.median()
        elif mode == MODE_MEAN:
            df = daily.mean()
        else:
            raise Exception(mode)

        df = df.rename(columns={'value': json_path.name.replace('.csv', '')})
        out = pd.merge(out, df, left_index=True, right_index=True, how='outer')
        print(out)

    imputer = KNNImputer()
    out = pd.DataFrame(imputer.fit_transform(out),
                       columns=out.columns,
                       index=out.index)
    return out
Ejemplo n.º 16
0
def knn(data_mat,
        n_neighbors=5,
        weights='uniform',
        metric='nan_euclidean',
        copy=True,
        add_indicator=False):
    """
    Impute missing values with scikit-learn's KNNImputer.

    The imputer works on a copy of ``data_mat``; every keyword argument is
    forwarded to ``KNNImputer`` unchanged.

    @param data_mat: numpy 2d array, missing values are represented by np.nan
    @param n_neighbors: number of neighbors
    @param weights: forwarded to KNNImputer
    @param metric: forwarded to KNNImputer
    @param copy: forwarded to KNNImputer
    @param add_indicator: forwarded to KNNImputer
    @return: numpy 2d array after imputation
    """
    from sklearn.impute import KNNImputer

    # Tested and working.
    working_copy = data_mat.copy()
    imputer = KNNImputer(n_neighbors=n_neighbors,
                         weights=weights,
                         metric=metric,
                         copy=copy,
                         add_indicator=add_indicator)
    return imputer.fit_transform(working_copy)
Ejemplo n.º 17
0
 def impute_missing_values(self, data):
     """Replace the missing values of ``data`` using a KNN imputer.

     Uses 3 neighbours with uniform weights. The input is stored on
     ``self.data``, the raw imputed array on ``self.new_array`` and the
     resulting frame on ``self.new_data``.

     :param data: DataFrame to impute.
     :return: new DataFrame with the columns of ``data`` and imputed values.
     :raises Exception: the original error is logged and re-raised.
     """
     self.logger_object.log(
         self.file_object,
         'Entered the Impute_Missing_Values  method of Data Proprocessing ')
     self.data = data
     try:
         imputer = KNNImputer(n_neighbors=3,
                              weights='uniform',
                              missing_values=np.nan)
         self.new_array = imputer.fit_transform(self.data)
         # Build the frame from the freshly imputed array. The previous code
         # passed ``self.new_data``, which is not assigned yet at this point.
         self.new_data = pd.DataFrame(data=self.new_array,
                                      columns=self.data.columns)
         self.logger_object.log(self.file_object,
                                'Imputing missing values Successful.')
         return self.new_data
     except Exception as e:
         # The message is built by concatenation, so the stray '%s'
         # placeholder that used to be logged literally is removed.
         self.logger_object.log(
             self.file_object,
             'Exception occured in impute_missing_values method Exception message:  '
             + str(e))
         self.logger_object.log(self.file_object,
                                'Imputing missing values failed.')
         raise
Ejemplo n.º 18
0
def stats_preprocess():
    """Impute every Seoul station's data with KNN and write one combined CSV.

    The raw data is loaded once. For each station in ``SEOUL_STATIONS`` the
    station frame is imputed with ``KNNImputer`` (5 neighbours, distance
    weights), keeping its columns and index. All imputed frames are
    concatenated and written to
    ``/input/python/input_seoul_imputed_hourly_pandas.csv``.
    """
    print("Data preprocessing(imputation) start...")

    raw_df = data.load(datecol=[1])
    dfs_h = []

    for station_name in tqdm.tqdm(SEOUL_STATIONS.keys(),
                                  total=len(SEOUL_STATIONS.keys())):
        sdf = data.load_station(raw_df, SEOUL_STATIONS[station_name])

        # ``np.NaN`` was removed in NumPy 2.0; ``np.nan`` works everywhere.
        imputer = KNNImputer(n_neighbors=5,
                             weights="distance",
                             missing_values=np.nan)
        _df = pd.DataFrame(imputer.fit_transform(sdf),
                           columns=sdf.columns,
                           index=sdf.index)
        dfs_h.append(_df)

    df = pd.concat(dfs_h)
    df.to_csv("/input/python/input_seoul_imputed_hourly_pandas.csv")
Ejemplo n.º 19
0
def impute_by_age(train_df, test_df):
    """
    Function that perform missing data imputation
    on both train and test stratified by interview period.
    P1: [0; 30m]
    P2: (30; 72]
    P3: (72; 156]
    P4: (156; 204]
    P5: >204

    For every period found in the training set the imputer is fitted on the
    training rows of that period and re-used for the test rows of the same
    period. Columns whose name matches ``subjectkey``, ``interview``,
    ``respon`` or ``relation`` are left untouched. Rows are returned grouped
    by period in ascending order.

    A period without any test row is skipped on the test side; previously
    this failed because the imputer cannot transform zero samples.

    NOTE(review): test rows whose period never occurs in the training set
    are not part of the returned test frame (unchanged behaviour).

    Parameters
    ----------
    train_df: dataframe
    test_df: dataframe
    Returns
    ------
    imputed dataframe train
    imputed dataframe test
    """
    knnimpute = KNNImputer(n_neighbors=ut.neighbors)
    col_n = [
        nc for nc in train_df.columns
        if not re.search('subjectkey|interview|respon|relation', nc)
    ]
    train_parts, test_parts = [], []
    for yr in sorted(train_df.interview_period.unique()):
        tmp_tr = train_df.loc[train_df.interview_period == yr].copy()
        tmp_tr[col_n] = knnimpute.fit_transform(tmp_tr[col_n])
        train_parts.append(tmp_tr)

        tmp_ts = test_df.loc[test_df.interview_period == yr].copy()
        if tmp_ts.empty:
            continue
        tmp_ts[col_n] = knnimpute.transform(tmp_ts[col_n])
        test_parts.append(tmp_ts)

    new_tr = pd.concat(train_parts)
    # pd.concat rejects an empty list, so fall back to an empty frame that
    # still has the test columns.
    new_ts = pd.concat(test_parts) if test_parts else test_df.iloc[0:0].copy()
    return new_tr, new_ts
Ejemplo n.º 20
0
def test_knn_imputer_with_simple_example(na, working_memory):
    """Check the default KNNImputer against hand-computed neighbour means.

    ``na`` is the missing-value marker placed in the input and passed to the
    imputer; ``working_memory`` is applied through ``config_context`` while
    the imputer runs.
    """
    X = np.array(
        [
            [0, na, 0, na],
            [1, 1, 1, na],
            [2, 2, na, 2],
            [3, 3, 3, 3],
            [4, 4, 4, 4],
            [5, 5, 5, 5],
            [6, 6, 6, 6],
            [na, 7, 7, 7],
        ]
    )

    # Expected value of each missing cell: the mean of the column over the
    # rows listed in the slice or index on the right.
    expected_fill = {
        (0, 1): np.mean(X[1:6, 1]),
        (0, 3): np.mean(X[2:-1, -1]),
        (1, 3): np.mean(X[2:-1, -1]),
        (2, 2): np.mean(X[[0, 1, 3, 4, 5], 2]),
        (7, 0): np.mean(X[2:-1, 0]),
    }

    # Float copy so that fractional means are not truncated when ``na`` is an
    # integer marker.
    X_imputed = X.astype(float)
    for cell, value in expected_fill.items():
        X_imputed[cell] = value

    with config_context(working_memory=working_memory):
        imputer_comp = KNNImputer(missing_values=na)
        assert_allclose(imputer_comp.fit_transform(X), X_imputed)
    def test_sklearn_knn_imputer_cdist(self):
        """Convert a fitted KNNImputer with the ``cdist`` optimisation.

        An unknown option name (``optim2``) must raise ``NameError``. With
        ``optim='cdist'`` the converted graph must contain a ``cdist`` node
        and no ``scan`` node for every listed opset not above
        ``TARGET_OPSET``; data and model are dumped for each of them.
        """
        nan = numpy.nan
        x_train = numpy.array([[1, 2, nan, 12], [3, nan, 3, 13],
                               [1, 4, nan, 1], [nan, 4, 3, 12]],
                              dtype=numpy.float32)
        x_test = numpy.array([[1.3, 2.4, nan, 1], [-1.3, nan, 3.1, nan]],
                             dtype=numpy.float32)
        model = KNNImputer(n_neighbors=3, metric='nan_euclidean').fit(x_train)

        def input_types():
            # A fresh list per conversion, exactly as if written inline.
            return [("input", FloatTensorType((None, x_test.shape[1])))]

        with self.assertRaises(NameError):
            convert_sklearn(model,
                            "KNN imputer",
                            input_types(),
                            target_opset=TARGET_OPSET,
                            options={id(model): {'optim2': 'cdist'}})

        candidate_opsets = [TARGET_OPSET, 12, 11, 10, 9]
        for opset in (o for o in candidate_opsets if o <= TARGET_OPSET):
            model_onnx = convert_sklearn(
                model,
                "KNN imputer",
                input_types(),
                target_opset=opset,
                options={id(model): {'optim': 'cdist'}})
            self.assertIsNotNone(model_onnx)
            graph_text = str(model_onnx).lower()
            self.assertIn('op_type: "cdist"', graph_text)
            self.assertNotIn('scan', graph_text)
            dump_data_and_model(x_test,
                                model,
                                model_onnx,
                                basename="SklearnKNNImputer%dcdist" % opset)
Ejemplo n.º 22
0
def get_train_test(fnc_file,loadings_file,lablels_file):
    '''
    Load the FNC, loadings and label CSV files and build the train/test sets.

    Works with Rapids.ai ONLY: both returned frames are cudf DataFrames.

    Rows that have labels form the training set, the remaining rows the test
    set. Features are standardised with a scaler fitted on the training rows,
    the FNC features are additionally scaled by 1/600, and missing values in
    the training frame are KNN-imputed (5 neighbours, distance weights).

    Returns (train_df, test_df, features, target_cols).
    '''
    from sklearn.impute import KNNImputer
    from sklearn.preprocessing import StandardScaler

    path = "../input/trends-assessment-prediction/"

    def read(file_name):
        return pd.read_csv(os.path.join(path, file_name))

    fnc_df = read(fnc_file)
    loading_df = read(loadings_file)
    # The first column of each file is skipped when collecting feature names.
    fnc_features = list(fnc_df.columns[1:])
    loading_features = list(loading_df.columns[1:])

    labels_df = read(lablels_file)
    labels_df["is_train"] = True
    df = fnc_df.merge(loading_df, on="Id").merge(labels_df, on="Id", how="left")

    # After the left merge, rows without labels hold NaN in "is_train", so
    # the comparison with True has to stay explicit.
    has_labels = df["is_train"] == True
    target_cols = ['age', 'domain1_var1', 'domain1_var2', 'domain2_var1', 'domain2_var2']
    train_df = df[has_labels].copy().drop(['is_train'], axis=1)
    test_df = df[~has_labels].copy().drop(target_cols + ['is_train'], axis=1)
    features = loading_features + fnc_features

    # ----------------- Normalizing ------------------------
    scaler = StandardScaler()
    train_df[features] = scaler.fit_transform(train_df[features], train_df[target_cols])
    test_df[features] = scaler.transform(test_df[features])

    # Giving less importance to FNC features since they are easier to overfit due to high dimensionality.
    fnc_weight = 1/600
    train_df[fnc_features] = train_df[fnc_features].mul(fnc_weight)
    test_df[fnc_features] = test_df[fnc_features].mul(fnc_weight)

    # Impute the missing values in the targets.
    imputer = KNNImputer(n_neighbors = 5, weights="distance")
    imputed_train = pd.DataFrame(imputer.fit_transform(train_df),
                                 columns = list(train_df.columns))
    train_df = cudf.from_pandas(imputed_train)
    test_df = cudf.from_pandas(test_df)  # necessary for casting to gpu matrix

    del df
    gc.collect()
    return train_df,test_df,features,target_cols
Ejemplo n.º 23
0
def input_missing_data(exploration, df):
    """Streamlit step that fills missing values in the selected numeric columns.

    The user picks a missing-percentage limit and a method. The columns taken
    from ``exploration`` are those whose ``'NA %'`` is below the limit and
    whose ``'types'`` is ``int64`` or ``float64``. The result is offered as a
    download link.

    With ``Mean`` / ``Median`` the result contains only the selected columns,
    filled with the column statistic, and its first 10 rows are shown. With
    ``KNN_Imputer`` the selected columns are imputed and put back next to the
    untouched columns of ``df``.

    :param exploration: DataFrame with ``'names'``, ``'types'`` and ``'NA %'``
        columns describing ``df``.
    :param df: DataFrame to impute; it is not modified.
    """
    percentual = st.slider(
        'Choose the missing percentage limit for the columns you want to input data',
        min_value=0,
        max_value=100)
    is_numeric = ((exploration['types'] == 'int64')
                  | (exploration['types'] == 'float64'))
    columns_list = list(
        exploration[(exploration['NA %'] < percentual) & is_numeric]['names'])
    select_method = st.radio('Choose a metod :',
                             ('Mean', 'Median', 'KNN_Imputer'))
    st.markdown('You chosse : ' + str(select_method))

    if select_method in ('Mean', 'Median'):
        selected = df[columns_list]
        if select_method == 'Mean':
            fill_values = selected.mean()
        else:
            fill_values = selected.median()
        df_inputed = selected.fillna(fill_values)
        st.table(df_inputed[columns_list].head(10))
        st.subheader('Download data : ')
        st.markdown(get_table_download_link(df_inputed),
                    unsafe_allow_html=True)

    elif select_method == 'KNN_Imputer':
        imputer = KNNImputer(n_neighbors=3)
        st.markdown(columns_list)
        # Keep the original index so the imputed columns line up with the
        # rest of the frame.
        df_inputed = pd.DataFrame(imputer.fit_transform(df[columns_list]),
                                  columns=columns_list,
                                  index=df.index)
        # Join side by side. The default axis stacked the two frames on top
        # of each other and filled the result with NaN.
        df_inputed = pd.concat([df.drop(columns_list, axis=1), df_inputed],
                               axis=1)
        st.subheader('Download data : ')
        st.markdown(get_table_download_link(df_inputed),
                    unsafe_allow_html=True)
Ejemplo n.º 24
0
def compare_to_lasso_analysis(adata, ccdtranscript):
    '''Perform a comparison of pseudotime analysis to LASSO analysis for finding CCD proteins.

    A multi-task LASSO model predicting the two FUCCI intensities
    (``Red585``, ``Green530``) from KNN-imputed expression is loaded from
    ``output/pickles/fucci_rna_imputed_lasso.pkl`` when that file exists and
    is fitted and saved otherwise. Genes with a nonzero summed coefficient
    are reported, and UMAP plots are written for the selected and the
    remaining genes.

    The figure size and the warning filter are changed while the function
    runs and are reset on exit, including when an error occurs.

    :param adata: AnnData-like object providing ``obs``, ``X``, ``var_names``
        and column slicing.
    :param ccdtranscript: boolean array over genes, indexed with the
        nonzero-coefficient mask.
    '''
    prevPlotSize = plt.rcParams['figure.figsize']
    plt.rcParams['figure.figsize'] = (6, 5)

    print("ANALYZING SC-RNA-SEQ WITH LASSO")
    warnings.filterwarnings("ignore")
    try:
        fucci_rna_path = "output/pickles/fucci_rna_imputed_lasso.pkl"
        if os.path.exists(fucci_rna_path):
            # SECURITY: this unpickles the file. Only load pickles produced
            # by this pipeline; never point this at untrusted data.
            with open(fucci_rna_path, 'rb') as cached:
                fucci_rna = np.load(cached, allow_pickle=True)
        else:
            # The imputation is only needed for fitting, so it is skipped
            # when a cached model is available.
            # NOTE(review): pairing the columns positionally assumes the obs
            # index is non-integer (obs names are strings) -- confirm.
            fucci_rna_data = list(zip(adata.obs["Red585"], adata.obs["Green530"]))
            imputer = KNNImputer(missing_values=0)
            expression = imputer.fit_transform(adata.X)
            fucci_rna = MultiTaskLassoCV()
            fucci_rna.fit(expression, fucci_rna_data)
            with open(fucci_rna_path, 'wb') as cached:
                pickle.dump(fucci_rna, cached)

        summed_coef = np.sum(fucci_rna.coef_, axis=0)
        nz_coef = summed_coef != 0
        print(f"{sum(nz_coef)}: number of nonzero lasso coefficients")
        print(f"{adata.var_names[nz_coef]}: genes with nonzero lasso coeff")
        print(f"{sum(ccdtranscript[nz_coef]) / sum(nz_coef)}: % nonzero lasso found as CCD transcripts")
        print(f"{summed_coef[nz_coef]}: coefficients for nonzero lasso coeff")

        # Generate UMAP for CCD and nonCCD for the LASSO model
        adataCCd = adata[:,nz_coef]
        sc.pp.neighbors(adataCCd, n_neighbors=10, n_pcs=40)
        sc.tl.umap(adataCCd)
        sc.pl.umap(adataCCd, color="fucci_time", show=False, save=True)
        shutil.move("figures/umap.pdf", "figures/umapRNALassoCCD.pdf")
        adataNonCCd = adata[:,~nz_coef]
        sc.pp.neighbors(adataNonCCd, n_neighbors=10, n_pcs=40)
        sc.tl.umap(adataNonCCd)
        sc.pl.umap(adataNonCCd, color="fucci_time", show=False, save=True)
        shutil.move("figures/umap.pdf", "figures/umapRNALassoNonCCD.pdf")
    finally:
        plt.rcParams['figure.figsize'] = prevPlotSize
        warnings.filterwarnings("default")
def create_preprocessor(df):
    """
    Build the imputation step for the columns of ``df``.

    The ``"kms"`` column is imputed with its mean; every other column is
    imputed with a 2-neighbour, uniformly weighted ``KNNImputer``.

    Returns the unfitted ``ColumnTransformer``.
    """
    # Column groups: "kms" is the only numeric feature, the rest is categoric.
    features_numeric = ["kms"]
    features_categoric = list(df)
    features_categoric.remove("kms")

    # Numeric columns: mean imputation wrapped in a one-step pipeline.
    imputer_numeric = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
    ])

    # Categoric columns: KNN imputation. An earlier variant used
    # SimpleImputer(strategy='most_frequent') inside a pipeline instead.
    imputer_categoric = KNNImputer(n_neighbors=2, weights="uniform")

    # Route each column group to its imputer.
    transformers = [
        ('imputer_numeric', imputer_numeric, features_numeric),
        ('imputer_categoric', imputer_categoric, features_categoric),
    ]
    return ColumnTransformer(transformers=transformers)
Ejemplo n.º 26
0
def replace_missing_numbers(df, strat='median'):
    """Replace missing numeric data using the chosen strategy (median, mean or kNN).

    Any ``strat`` other than ``'mean'`` or ``'median'`` selects a default
    ``KNNImputer``. The ``class`` and ``income_>50K`` columns are rounded
    after imputation. ``df`` itself is not modified.
    """
    # Create the imputer for the chosen strategy.
    if strat in ('mean', 'median'):
        imp = SimpleImputer(strategy=strat)
    else:
        imp = KNNImputer()

    # Fill in the missing values and wrap the result in a dataframe.
    x = pd.DataFrame(imp.fit_transform(df.copy()))

    # Name the columns and rows as in the original dataframe.
    x.columns = df.columns
    x.index = df.index

    for name in ('class', 'income_>50K'):
        x[name] = x[name].round()

    return x
Ejemplo n.º 27
0
def imputation_statique(df, statique):
    """Impute the missing data of ``df`` in place and return it.

    Only columns that are partly missing are processed; columns without any
    missing value, or with no observed value at all, are left as they are.

    If ``statique`` is true, object columns are filled with their mode and
    all other columns with their median. Otherwise a ``KNNImputer`` with 3
    neighbours is run on the frame without ``CustomerID``, with object
    columns one-hot encoded. Object columns are then rebuilt from the dummy
    column holding the largest value, and other columns are rounded to 2
    decimals.

    NOTE(review): in the KNN branch the dummy columns are matched by the
    ``<column>_`` prefix, so a column whose name is a prefix of another
    (``a`` and ``a_b``) would also pick up the other column's dummies.

    :param df: DataFrame to impute; it is modified in place.
    :param statique: selects static (median / mode) imputation when true,
        KNN imputation when false.
    :return: the same ``df`` object.
    """
    # Share of missing values per column; keep the partly missing ones.
    null_share = df.isnull().mean()
    columns_MissingData = null_share[(null_share > 0) & (null_share < 1)].index

    # The flag is tested here. The previous code tested the function object
    # itself, which is always true, so the KNN branch could never run.
    if statique:
        for col in columns_MissingData:
            if df[col].dtype == 'O':
                df[col] = df[col].fillna(df[col].mode().iloc[0])
            else:
                df[col] = df[col].fillna(df[col].median())
    else:
        imputer = KNNImputer(n_neighbors=3)
        features = df.drop('CustomerID', axis=1)
        X = pd.concat([
            pd.get_dummies(features.select_dtypes('O')),
            features.select_dtypes(exclude='O')
        ],
                      axis=1)
        # Keep the original index so the assignments below align with df.
        X_filled_knn = pd.DataFrame(imputer.fit_transform(X),
                                    columns=X.columns,
                                    index=X.index)
        for col in columns_MissingData:
            print(col)
            if df[col].dtypes == 'O':
                # Literal prefix match; the earlier regex '^' + col + '*'
                # treated the last character of the name as repeatable.
                prefix = col + '_'
                dummy_cols = [
                    c for c in X_filled_knn.columns
                    if isinstance(c, str) and c.startswith(prefix)
                ]
                df_temp = X_filled_knn[dummy_cols]
                df_temp.columns = [c[len(prefix):] for c in dummy_cols]
                df[col] = df_temp.idxmax(axis=1)
            else:
                df[col] = np.round(X_filled_knn[col], 2)
    return df
Ejemplo n.º 28
0
def KNN_imputer(food_data, missed_features):
    """Restore the missing nutrition values of one food item with KNN.

    The reference table is read from the module-level ``url`` (without its
    ``class`` column), the item is appended as the last row with NaN for
    every feature absent from ``food_data``, and a 2-neighbour
    ``KNNImputer`` fills the gaps.

    NOTE(review): assumes the remaining CSV columns are exactly the six
    features below, in this order -- confirm against the data file.

    :param food_data: mapping from feature name to known value.
    :param missed_features: not used by this function.
    :return: array of shape (1, n_features) with the restored item.
    """
    features = [
        'protein', 'fat', 'carbohydrates', 'sugar', 'sodium', 'calories'
    ]
    nan = np.nan

    # ``columns=`` instead of a positional axis: pandas 2.x rejects
    # drop('class', 1).
    reference = pd.read_csv(url).drop(columns='class').to_numpy()

    query = np.array([[
        food_data[name] if name in food_data else nan for name in features
    ]])
    print('Vector before restoring {}'.format(query))

    stacked = np.concatenate((reference, query))
    imputer = KNNImputer(n_neighbors=2, weights="uniform")
    # The item was appended last, so its imputed version is the last row.
    X = imputer.fit_transform(stacked)[-1].reshape(1, -1)
    print('Restored via KNNImputer vector {}'.format(X))
    return X
Ejemplo n.º 29
0
# Missing values per column (non-NaN count, NaN count, NaN percentage):
# RainTomorrow:	    non-NaN: 142193 	NaN: 3267 	%: 2.245978275814657
# RainToday:	    non-NaN: 142199 	NaN: 3261 	%: 2.241853430496356
# Rainfall:	        non-NaN: 142199 	NaN: 3261 	%: 2.241853430496356
# WindSpeed3pm:	    non-NaN: 142398 	NaN: 3062 	%: 2.105046060772721
# Humidity9am:	    non-NaN: 142806 	NaN: 2654 	%: 1.8245565791282827
# WindSpeed9am:	    non-NaN: 143693 	NaN: 1767 	%: 1.214766946239516
# Temp9am:	        non-NaN: 143693 	NaN: 1767 	%: 1.214766946239516
# MinTemp:	        non-NaN: 143975 	NaN: 1485 	%: 1.0208992162793895
# MaxTemp:	        non-NaN: 144199 	NaN: 1261 	%: 0.8669049910628351


# Median imputation for MinTemp.
x_train['MinTemp'] = x_train['MinTemp'].fillna(x_train['MinTemp'].median())

from sklearn.impute import KNNImputer

# KNN imputation over the whole training frame; only Pressure9am is taken
# from the result.
imputer = KNNImputer(n_neighbors=3)
x_train_knn_imp = imputer.fit_transform(x_train)

# ``iloc`` only accepts positions, so the label has to go through ``loc``.
# The source column is looked up by name instead of the hard-coded position 15.
# NOTE(review): assumes the imputer kept every column of x_train (it drops
# columns that are entirely NaN) -- confirm.
pressure_pos = x_train.columns.get_loc('Pressure9am')
x_train.loc[:, 'Pressure9am'] = x_train_knn_imp[:, pressure_pos]



# Features and target side by side, labelled with the columns of data_c2 that
# pass the selector's support mask plus one extra trailing column for the
# target.
df_train_c2 = pd.DataFrame(np.concatenate([x_train_c2, y_train_c2[:, np.newaxis]], axis=1),
                        columns=data_c2.columns[np.concatenate([sel.get_support(), [True]])])


# Pearson correlation matrix and mutual information with the target.
r = df_train.corr(method='pearson')
MI = mutual_info_regression(x_train_cca, y_train_cca)

fig, ax = plt.subplots(2,1, figsize=(22,15))
ax[0].set_title('Información mutua')
Ejemplo n.º 30
0
def fit_neighbours(data: pd.DataFrame, neighbors: int = 5) -> pd.DataFrame:
    """Return ``data`` with missing values filled by a KNN imputer.

    The imputer is fitted on the raw values of ``data``; the result keeps the
    original column labels and index.

    :param data: frame to impute.
    :param neighbors: number of neighbours used by the imputer.
    :return: new DataFrame holding the imputed values.
    """
    imputer = KNNImputer(n_neighbors=neighbors)
    filled = imputer.fit_transform(data.values)
    return pd.DataFrame(filled, columns=data.columns, index=data.index)