Example #1
def k_means(input_df):
    input_df['pH'] = input_df['pH'] * 100  # temporarily rescale pH (undone at the end)
    k = input_df.loc[:, 'pH']  # pH column carried along as the split 'target'
    X_pH_train, X_pH_test, y_pH_train, y_pH_test = train_test_split(
        input_df, k, test_size=0.33, random_state=42)
    X_pH_test = X_pH_test.drop(columns='pH')
    final = pd.concat([X_pH_train, X_pH_test], ignore_index=True)  # test rows now have pH = NaN
    imputer = KNNImputer(n_neighbors=5, weights='uniform')
    np.set_printoptions(suppress=True)
    final = imputer.fit_transform(final)
    df = pd.DataFrame(final)
    df.columns = [
        'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
        'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
        'pH', 'sulphates', 'alcohol', 'quality'
    ]
    df['pH'] = df['pH'] / 100
    return df
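
Despite its name, k_means performs KNN imputation rather than clustering: it drops pH from the test split and lets KNNImputer fill it back in. A minimal usage sketch, assuming the UCI wine-quality CSV layout (semicolon-separated) and the imports the snippet omits; the file path is hypothetical.

import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split

wine = pd.read_csv("winequality-red.csv", sep=";")  # hypothetical path
imputed = k_means(wine)
print(imputed["pH"].isna().sum())  # 0: the dropped pH values were imputed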
Example #2
def process_data(filename, fit_encoder=True):
    df = pd.read_csv(filename)

    # Handle categorical attributes
    df['CabinLetter'] = df['Cabin'].apply(get_cabin_letter)
    df['TicketLabel'] = df['Ticket'].apply(get_ticket_label)
    df['Title'] = df['Name'].apply(get_title)
    df['Embarked'].fillna('S', inplace=True)
    X_cat = df[['Pclass', 'Sex', 'Embarked', 'Title', 'CabinLetter']]
    if fit_encoder:
        encoder.fit(X_cat)
    X_cat = encoder.transform(X_cat).toarray()

    # Handle numerical attributes
    df['CabinNumber'] = df['Cabin'].apply(get_cabin_number)
    #median_cabin = df['CabinNumber'].median()
    #df['CabinNumber'].fillna(median_cabin,inplace=True)
    df['TicketNumber'] = df['Ticket'].apply(get_ticket_number)
    median_ticket = df['TicketNumber'].median()
    df['TicketNumber'].fillna(median_ticket, inplace=True)
    X_num = df[['Age', 'SibSp', 'Parch', 'Fare', 'TicketNumber']]
    scaler = MinMaxScaler()
    X_num = scaler.fit_transform(X_num)
    imputer = KNNImputer()
    X_num = imputer.fit_transform(X_num)

    # Impute Age
    #median_age = df['Age'].median()
    #df['Age'].fillna(median_age,inplace=True)
    # Impute Fare
    #median_fare = df['Fare'].median()
    #df['Fare'].fillna(median_fare,inplace=True)

    # Final X matrix
    X = np.hstack((X_cat, X_num))

    # Final y array
    if 'Survived' in df:
        y = df['Survived'].array
    else:
        y = None
    return X, y
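
process_data relies on a module-level encoder so the encoding fitted on the training file (fit_encoder=True) is reused unchanged for the test file. A minimal sketch of the assumed setup; the OneHotEncoder choice and its handle_unknown flag are assumptions, not confirmed by the source.

from sklearn.preprocessing import OneHotEncoder

# Assumed module-level encoder shared across process_data calls;
# handle_unknown='ignore' keeps transform from failing on categories
# that were not seen during fit.
encoder = OneHotEncoder(handle_unknown='ignore')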
Example #3
    def impute_feature(data, feature):
        # Negative readings are sentinels; treat them as missing.
        data.loc[data[feature] < 0, feature] = np.nan
        value_count = data.groupby('county_fips').count()
        counties_with_all_nulls = value_count[value_count[feature] == 0]
        # Pivot to a (county x date) matrix so neighbors are counties.
        temp = pd.DataFrame(index=data['county_fips'].unique().tolist(),
                            columns=data['date'].unique().tolist())

        for i in data['date'].unique():
            temp[i] = data.loc[data['date'] == i, feature].tolist()
        X = np.array(temp)
        imputer = KNNImputer(n_neighbors=5)
        imp = imputer.fit_transform(X)
        imp = pd.DataFrame(imp, columns=temp.columns, index=temp.index)
        for i in data['date'].unique():
            data.loc[data['date'] == i, feature] = imp[i].tolist()
        # Counties that had no observed values at all are reset to NaN
        # rather than kept as imputed values.
        if len(counties_with_all_nulls) > 0:
            data.loc[data['county_fips'].isin(counties_with_all_nulls.index), feature] = np.nan
        return data
Example #4
    def impute_missing_values(self, data):
        self.logger_object.log(self.file_object,
                               "Entered impute_missing_values method")
        self.data = data
        try:
            imputer = KNNImputer(n_neighbors=3,
                                 weights='uniform',
                                 missing_values=np.nan)
            self.new_array = imputer.fit_transform(self.data)
            self.new_data = pd.DataFrame(data=self.new_array,
                                         columns=self.data.columns)
            self.logger_object.log(self.file_object,
                                   "Imputing missing values succeeded")
            return self.new_data
        except Exception:
            self.logger_object.log(self.file_object,
                                   "Error occurred in impute_missing_values")
            self.logger_object.log(self.file_object, "Imputation unsuccessful")
            raise
Example #5
def cv_preprocessing(X_train, X_test=None, random_state=None):
    variables_path = r"variables.json"
    with open(variables_path) as f:
        variables = json.load(f)
        t1_features, cogni = variables['t1_features'], variables['cogni']
        pcl = variables['questionnaires']['PCL'][:17]

    imputer = KNNImputer()  # KNN imputation, fitted on the training fold only
    columns = X_train.columns
    X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=columns)

    #X_train = stds(X_train)
    #X_train = stats(X_train)
    #X_train = removal_correlated(X_train)
    # ss = StandardScaler()
    # X_train = ss.fit_transform(X_train)
    # X_train = pd.DataFrame(ss.fit_transform(X_train), columns=columns)
    if X_test is not None:
        X_test = pd.DataFrame(imputer.transform(X_test), columns=columns)
        #X_test = stds(X_test)
        #X_test = stats(X_train, X_test)
        #_, X_test = removal_correlated(X_train, X_test)
        # X_test = ss.transform(X_test)
        # X_test = pd.DataFrame(ss.transform(X_test), columns=columns)

        X_train, X_test = outliers(
            X_train,
            X_test,
            features=[f"T1q5.{i}" for i in range(1, 10)],
            name='phq9')
        #X_train, X_test = outliers(X_train, X_test, features=pcl, name='PCL')
        X_train, X_test = outliers(X_train,
                                   X_test,
                                   features=cogni,
                                   name='cogni')
        X_train, X_test = outliers(X_train,
                                   X_test,
                                   features=t1_features,
                                   name='t1')

        return X_train, X_test
    else:
        return X_train
Example #6
def test_knn_imputer_callable_metric():

    # Define callable metric that returns the l1 norm:
    def custom_callable(x, y, missing_values=np.nan, squared=False):
        x = np.ma.array(x, mask=np.isnan(x))
        y = np.ma.array(y, mask=np.isnan(y))
        dist = np.nansum(np.abs(x - y))
        return dist

    X = np.array([[4, 3, 3, np.nan], [6, 9, 6, 9], [4, 8, 6, 9],
                  [np.nan, 9, 11, 10.0]])

    X_0_3 = (9 + 9) / 2
    X_3_0 = (6 + 4) / 2
    X_imputed = np.array([[4, 3, 3, X_0_3], [6, 9, 6, 9], [4, 8, 6, 9],
                          [X_3_0, 9, 11, 10.0]])

    imputer = KNNImputer(n_neighbors=2, metric=custom_callable)
    assert_allclose(imputer.fit_transform(X), X_imputed)
Example #7
def naKNN(train_x, test_x):
    """
    Sostituisce i valori mancanti nel training set e nel test set con KNNImputer().
    :param train_x: training set
    :param test_x: test set
    :return: None
    """
    getNaCount(train_x)  # calcola il numero di NaN per il training set
    imputer = KNNImputer(n_neighbors=3)

    imputed_train = imputer.fit_transform(train_x.data)
    train_x.data = pd.DataFrame(imputed_train, columns=train_x.data.columns)
    save_object(
        imputer, 'imputer.pkl'
    )  # salva imputer nel file 'imputer.pkl' (serve successivamente per il test finale)

    if test_x is not None:
        imputed_test = imputer.transform(test_x.data)
        test_x.data = pd.DataFrame(imputed_test, columns=test_x.data.columns)
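
save_object is not shown in the snippet; a minimal sketch of what it appears to do, using pickle (the implementation is an assumption).

import pickle

def save_object(obj, filename):
    # Hypothetical helper: persist the fitted imputer so the final test
    # pass can reload and reuse the same neighbors model.
    with open(filename, 'wb') as f:
        pickle.dump(obj, f)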
Example #8
    def impute_missing_values(self, data):

        self.data = data
        try:
            self.logger.info('Start of imputing missing values...')
            imputer = KNNImputer(n_neighbors=3,
                                 weights='uniform',
                                 missing_values=np.nan)
            self.new_array = imputer.fit_transform(
                self.data)  # impute the missing values
            # convert the nd-array returned in the step above to a Data frame
            self.new_data = pd.DataFrame(data=self.new_array,
                                         columns=self.data.columns)
            self.logger.info('End of imputing missing values...')
            return self.new_data
        except Exception:
            self.logger.exception(
                'Exception raised while imputing missing values')
            raise
Example #9
def test_knn_imputer_not_enough_valid_distances(na, weights):
    # Samples with needed feature has nan distance
    X1 = np.array([
        [na, 11],
        [na, 1],
        [3, na]
    ])
    X1_imputed = np.array([
        [3, 11],
        [3, 1],
        [3, 6]
    ])

    knn = KNNImputer(missing_values=na, n_neighbors=1, weights=weights)
    assert_allclose(knn.fit_transform(X1), X1_imputed)

    X2 = np.array([[4, na]])
    X2_imputed = np.array([[4, 6]])
    assert_allclose(knn.transform(X2), X2_imputed)
Example #10
def knn_impute_by_user(matrix, valid_data, k):
    """ Fill in the missing values using k-Nearest Neighbors based on
    student similarity. Return the accuracy on valid_data.

    See https://scikit-learn.org/stable/modules/generated/sklearn.impute.KNNImputer.html
    for details.

    :param matrix: 2D sparse matrix
    :param valid_data: A dictionary {user_id: list, question_id: list,
    is_correct: list}
    :param k: int
    :return: float
    """
    nbrs = KNNImputer(n_neighbors=k)
    # We use NaN-Euclidean distance measure.
    mat = nbrs.fit_transform(matrix)
    acc = sparse_matrix_evaluate(valid_data, mat)
    print("user Validation Accuracy: {}".format(acc))
    return acc
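
The NaN-Euclidean distance mentioned in the comment is KNNImputer's default metric: coordinates missing in either sample are skipped, and the result is rescaled by the fraction of coordinates actually used. It can be computed directly for reference:

import numpy as np
from sklearn.metrics.pairwise import nan_euclidean_distances

X = np.array([[3.0, np.nan, 5.0]])
Y = np.array([[1.0, 0.0, 0.0]])
# sqrt(3/2 * ((3-1)^2 + (5-0)^2)) ~= 6.595: two of three coordinates used.
print(nan_euclidean_distances(X, Y))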
Example #11
def impute_values(df, imp_strategy, neighbors, numeric_vars):

    df = convert_to_numeric(df, numeric_vars)
    X = df[numeric_vars].to_numpy()
    other_vars = list(set(df.columns) - set(numeric_vars))
    X_strings = df[other_vars].reset_index(drop=True)
    if imp_strategy == "knn":
        imputer = KNNImputer(n_neighbors=neighbors)  # weights=weight_type
        imputed = imputer.fit_transform(X)  # This is very costly
        # A faster alternative is impyute's fast_knn:
        # https://impyute.readthedocs.io/en/master/api/cross_sectional_imputation.html
        #         imputed = fast_knn(X, k=neighbors)
    else:
        imputer = SimpleImputer(missing_values = np.nan, strategy = imp_strategy)
        imputer.fit(X)
        imputed = imputer.transform(X)
    X_imputed = pd.DataFrame.from_records(imputed, columns = numeric_vars)
    rv = X_strings.join(X_imputed)
    return rv
Example #12
    def fill_categorical_na(self, df):
        """
        Impute categorical NaN values with KNNImputer
        Args:
            :df: Dataframe
        Returns:
            :df_result: Dataframe without NaN
        """

        df_result = df.copy()

        imputer = KNNImputer(n_neighbors=2, weights="uniform")
        data_cat_imputed = imputer.fit_transform(df_result)

        for i in range(data_cat_imputed.shape[1]):
            df_result[df_result.columns[i]] = data_cat_imputed[:, i]

        return df_result
Example #13
def test_model(k, v, data):
    results = {}
    if k[4] == 'nosoc':
        # If excluding nosocomial patients
        if data['nosoc'].sum() == 0:
            return False
        else:
            data = data[data['nosoc'] == 0]
    X = data[v['X']]
    y = data[v['y']].astype('int')
    # Scale, impute
    scaler, clf = v['scaler'], v['clf']
    imputer = KNNImputer()
    X = scaler.transform(X)
    X = imputer.fit_transform(X)
    # 1. Pre-trained model ------------------------------------------------
    y_prob = clf.predict_proba(X)[:, 1]
    y_pred = clf.predict(X)
    results['pretrained'] = get_summaries(clf, X, y, y_prob, y_pred)
    # Get 'treat all' line for net benefit
    results['treat_all'] = net_benefit(clf, X, y, treat_all=True)
    # 2. Re-scaled model [based in internal validation] -------------------
    scale_coef = np.sum(X * (clf.coef_ * v['shrink_slope']), axis=1)
    scale_int = clf.intercept_ + v['shrink_int']
    odds = np.exp(scale_coef + scale_int)
    y_prob = odds / (1 + odds)
    y_pred = np.where(y_prob > 0.5, 1, 0)
    results['rescaled'] = get_summaries(clf, X, y, y_prob, y_pred)
    # 3. Re-calibrated model [based on validation sample] -----------------
    clf_recal = CalibratedClassifierCV(clf, method='sigmoid',
                                       cv='prefit').fit(X, y)
    y_pred = clf_recal.predict(X)
    y_prob = clf_recal.predict_proba(X)[:, 1]
    y_logp = np.log(y_prob / (1 - y_prob))
    results['recal'] = get_summaries(clf_recal,
                                     X,
                                     y,
                                     y_prob,
                                     y_pred,
                                     lp=y_logp)
    # Store outcome rate
    results['meany'] = np.mean(y)
    return results
Example #14
def knn_impute_by_item(matrix, valid_data, k):
    """ Fill in the missing values using k-Nearest Neighbors based on
    question similarity. Return the accuracy on valid_data.

    :param matrix: 2D sparse matrix
    :param valid_data: A dictionary {user_id: list, question_id: list,
    is_correct: list}
    :param k: int
    :return: float
    """
    #####################################################################
    # Implement the function as described in the docstring.             #
    #####################################################################
    nbrs = KNNImputer(n_neighbors=k)
    # We use NaN-Euclidean distance measure.
    mat = nbrs.fit_transform(matrix.T)
    acc = sparse_matrix_evaluate(valid_data, mat.T)
    print("Validation Accuracy on question: {}".format(acc))

    return acc
Example #15
def run_knn(train_data, val_data, k):
    """Create matrix prediction using KNN trained on train data. k is k nearest neighbors

    :param train_data: A dictionary {user_id: list, question_id: list,
        is_correct: list}
    :param val_data: A dictionary {user_id: list, question_id: list,
        is_correct: list}
    :param k: int
    :return: num_users by num_questions matrix of predictions
    """

    sparse_matrix = load_train_sparse("../data").toarray()
    size = len(train_data['user_id'])
    train_data_bootstrap = bootstrap_data(train_data, size)  # currently unused below

    nbrs = KNNImputer(n_neighbors=k)
    # We use NaN-Euclidean distance measure.
    mat = nbrs.fit_transform(sparse_matrix)

    return mat
Example #16
    def _fill_missing_data(self, data):
        """
        :param data: Data-frame after merging user data and job description data, and feature engineered.
        :return: Data-frame after filling in missing values in the features using KNN imputing.
        """

        sys.setrecursionlimit(100000)

        non_imput_cols = ['has_applied']
        data_to_imput = data.drop(non_imput_cols, axis=1)
        imput_cols = list(data_to_imput)
        non_imputed_data = data.drop(imput_cols, axis=1)

        imp_mean_knn = KNNImputer(n_neighbors=30)
        imputed_data = imp_mean_knn.fit_transform(data_to_imput)
        imputed_data = pd.DataFrame(imputed_data, columns=imput_cols)
        resultant_imputed_data = pd.concat([non_imputed_data, imputed_data],
                                           axis=1,
                                           join='inner')
        return resultant_imputed_data
Example #17
def imputations(df1, cols):
    df = df1.copy()
    for variable in cols:
        mappings = find_category_mappings(df, variable)
        mappin[variable] = mappings

    for variable in cols:
        integer_encode(df, variable, mappin[variable])

    # mm is a module-level MinMaxScaler; scaling first lets every feature
    # contribute comparably to the KNN distance.
    sca = mm.fit_transform(df)
    knn_imputer = KNNImputer()
    knn = knn_imputer.fit_transform(sca)
    df.iloc[:, :] = mm.inverse_transform(knn)
    for i in df.columns:
        df[i] = round(df[i]).astype('int')

    for i in cols:
        inv_map = {v: k for k, v in mappin[i].items()}
        df[i] = df[i].map(inv_map)
    return df
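
imputations depends on module-level state the snippet does not show: a mappin dict, a MinMaxScaler named mm, and the two mapping helpers. A minimal sketch of plausible definitions; all four names are reconstructed from usage, not taken from the original source.

from sklearn.preprocessing import MinMaxScaler

mappin = {}            # variable -> {category: integer code}
mm = MinMaxScaler()    # scales features to [0, 1] before KNN imputation

def find_category_mappings(df, variable):
    # Hypothetical helper: assign each observed category an integer code.
    return {cat: code for code, cat in enumerate(df[variable].dropna().unique())}

def integer_encode(df, variable, mapping):
    # Hypothetical helper: replace categories with codes in place;
    # unmapped values (NaN) stay NaN for the imputer to fill.
    df[variable] = df[variable].map(mapping)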
Example #18
def _gen_inits_clustering(X, K, n_iter=10, skip_spectral=True):

    Xs = [X]

    Xn = X.copy().astype('float')
    Xn[Xn == 0] = np.nan  # treat zeros as missing so the imputers fill them

    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    Xs.append(imp.fit_transform(Xn))

    imputer = KNNImputer()
    Xs.append(imputer.fit_transform(Xn))

    lb_inits = []

    for Xt in Xs:
        li = gen_inits_for_X(Xt, K, n_iter, skip_spectral)
        lb_inits = lb_inits + li

    return dedup_labels(lb_inits)
Example #19
    def impute_missing_values(self, data):
        """
        Method Name: impute_missing_values
        Description: This method replaces all the missing values in the Dataframe using KNN Imputer.
        Output: A Dataframe which has all the missing values imputed.
        On Failure: Raise Exception
        """
        self.logger_object.log(self.file_object, 'Entered the impute_missing_values method of the Preprocessor class')
        self.data = data
        try:
            imputer = KNNImputer(n_neighbors=3, weights='uniform', missing_values=np.nan)
            self.new_array = imputer.fit_transform(self.data)  # impute the missing values
            # convert the nd-array returned in the step above to a Dataframe
            self.new_data = pd.DataFrame(data=self.new_array, columns=self.data.columns)
            self.logger_object.log(self.file_object, 'Imputing missing values Successful. Exited the impute_missing_values method of the Preprocessor class')
            return self.new_data
        except Exception as e:
            self.logger_object.log(self.file_object, 'Exception occurred in impute_missing_values method of the Preprocessor class. Exception message:  ' + str(e))
            self.logger_object.log(self.file_object, 'Imputing missing values failed. Exited the impute_missing_values method of the Preprocessor class')
            raise
Example #20
def impute_experience(df, cat_var):
    df['experience'] = df['experience'].replace(['<1'], 0)
    df['experience'] = df['experience'].replace(['>20'], 21)
    df1 = df.drop(cat_var + ['last_new_job'], axis=1)
    imputer = KNNImputer()
    df1_imputed = imputer.fit_transform(df1)
    df1_imputed = pd.DataFrame(df1_imputed,
                               index=df1.index,
                               columns=df1.columns)

    bins = np.linspace(0, 25, 6)
    labels = ['exp_one', 'exp_two', 'exp_three', 'exp_four', 'exp_five']
    # include_lowest=True keeps experience == 0 (mapped from '<1') in the
    # first bin instead of falling outside the left edge.
    df1_imputed['exp_bins'] = pd.cut(df1_imputed['experience'],
                                     bins=bins,
                                     labels=labels,
                                     include_lowest=True)
    df2 = pd.get_dummies(df1_imputed['exp_bins'])
    df = df.drop(['experience'], axis=1)
    df = pd.concat([df, df2], axis=1)
    return df
Example #21
def compare_to_lasso_analysis(adata, ccdtranscript):
    '''Perform a comparison of pseudotime analysis to LASSO analysis for finding CCD proteins'''
    prevPlotSize = plt.rcParams['figure.figsize']
    plt.rcParams['figure.figsize'] = (6, 5)

    print("ANALYZING SC-RNA-SEQ WITH LASSO")
    warnings.filterwarnings("ignore")
    fucci_rna_data = [(adata.obs["Red585"][ii], adata.obs["Green530"][ii])
                      for ii in np.arange(len(adata.obs))]
    imputer = KNNImputer(missing_values=0)
    expression = imputer.fit_transform(adata.X)
    fucci_rna_path = "output/pickles/fucci_rna_imputed_lasso.pkl"
    if os.path.exists(fucci_rna_path):
        fucci_rna = np.load(open(fucci_rna_path, 'rb'), allow_pickle=True)
    else:
        fucci_rna = MultiTaskLassoCV()
        fucci_rna.fit(expression, fucci_rna_data)
        pickle.dump(fucci_rna, open(fucci_rna_path, 'wb'))
    nz_coef = np.sum(fucci_rna.coef_, axis=0) != 0
    print(f"{sum(nz_coef)}: number of nonzero lasso coefficients")
    print(f"{adata.var_names[nz_coef]}: genes with nonzero lasso coeff")
    print(
        f"{sum(ccdtranscript[nz_coef]) / sum(nz_coef)}: % nonzero lasso found as CCD transcripts"
    )
    print(
        f"{np.sum(fucci_rna.coef_, axis=0)[nz_coef]}: coefficients for nonzero lasso coeff"
    )

    # Generate UMAP for CCD and nonCCD for the LASSO model
    adataCCd = adata[:, nz_coef]
    sc.pp.neighbors(adataCCd, n_neighbors=10, n_pcs=40)
    sc.tl.umap(adataCCd)
    sc.pl.umap(adataCCd, color="fucci_time", show=False, save=True)
    shutil.move("figures/umap.pdf", f"figures/umapRNALassoCCD.pdf")
    adataNonCCd = adata[:, ~nz_coef]
    sc.pp.neighbors(adataNonCCd, n_neighbors=10, n_pcs=40)
    sc.tl.umap(adataNonCCd)
    sc.pl.umap(adataNonCCd, color="fucci_time", show=False, save=True)
    shutil.move("figures/umap.pdf", "figures/umapRNALassoNonCCD.pdf")
    plt.rcParams['figure.figsize'] = prevPlotSize
    warnings.filterwarnings("default")
Example #22
    def impute_missing_values(self, data):
        """
        Method Name: impute_missing_values
        Description: This method replaces all the missing values in the Dataframe using KNN Imputer.
        Output: A Dataframe which has all the missing values imputed.
        On Failure: Raise Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        self.logger_object.log(
            self.file_object,
            'Entered the impute_missing_values method of the Preprocessor class'
        )
        self.data = data
        try:
            imputer = KNNImputer(n_neighbors=3,
                                 weights='uniform',
                                 missing_values=np.nan)
            self.new_array = imputer.fit_transform(
                self.data)  # impute the missing values
            # convert the nd-array returned in the step above to a Dataframe
            # rounding the value because KNNimputer returns value between 0 and 1, but we need either 0 or 1
            self.new_data = pd.DataFrame(data=np.round(self.new_array),
                                         columns=self.data.columns)
            self.logger_object.log(
                self.file_object,
                'Imputing missing values Successful. Exited the impute_missing_values method of the Preprocessor class'
            )
            return self.new_data
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in impute_missing_values method of the Preprocessor class. Exception message:  '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Imputing missing values failed. Exited the impute_missing_values method of the Preprocessor class'
            )
            raise Exception()
Example #23
    def impute(self):
        """
        This method performs the Nearest Neighbor Imputation.
        """

        batches = self.__get_batches()
        # return batches

        # The weight parameter was set to distance in order to increase the influence of closer elements
        imputer = KNNImputer(n_neighbors=self.__n_neighbors,
                             weights='distance')

        df = pd.DataFrame(columns=list(self.__df.columns))

        print('Performing imputations...')
        for batch in tqdm(batches):
            data = imputer.fit_transform(batch.drop(['name'], axis=1))
            batch.iloc[:, 1:] = data
            df = pd.concat([df, batch])

        self.__update_df(df)
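
The batching above bounds the memory needed for KNNImputer's pairwise-distance computation, at the cost of only searching neighbors within each batch. A standalone sketch of the same idea without the class plumbing; the batch_size and column handling are assumptions.

import pandas as pd
from sklearn.impute import KNNImputer

def impute_in_batches(df, n_neighbors=5, batch_size=1000):
    # One fresh fit per batch keeps the distance matrix at most
    # batch_size x batch_size; neighbors never cross batch boundaries.
    imputer = KNNImputer(n_neighbors=n_neighbors, weights='distance')
    parts = []
    for start in range(0, len(df), batch_size):
        batch = df.iloc[start:start + batch_size]
        parts.append(pd.DataFrame(imputer.fit_transform(batch),
                                  columns=df.columns, index=batch.index))
    return pd.concat(parts)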
Example #24
def test_model(feature_set, dataset):
    """
    Test validation sample on pre-trained model for a given feature set.
    """
    if 'y' not in list(dataset):
        raise ValueError('Dataset must contain binary outcome, y')
    if not set(models[feature_set]).issubset(list(dataset)):
        raise ValueError('Dataset must contain required features')
    clf = pretrained[feature_set]
    y = dataset['y']
    X = dataset[models[feature_set]]
    # Scale/impute
    scaler = StandardScaler()
    imputer = KNNImputer()
    X = scaler.fit_transform(X)
    X = imputer.fit_transform(X)
    # Predict
    y_pred = clf.predict(X)
    y_prob = clf.predict_proba(X)[:, 1]
    # Return
    return {'clf': clf, 'X': X, 'y': y, 'y_pred': y_pred, 'y_prob': y_prob}
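
Note the order here: the scaler is fitted before imputation. This works because scikit-learn scalers treat NaNs as missing values, disregarding them in fit and keeping them as NaN in transform, so the imputer still sees them afterwards. A quick check:

import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.array([[1.0], [np.nan], [3.0]])
print(StandardScaler().fit_transform(X).ravel())  # [-1. nan  1.]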
Example #25
def fit_imputed(v, train, valid):
    """
    Function to test a single model in validation sample [valid], having
    trained on the training [train] sample, after scaling and imputation.
    """
    # Select features/outcome
    X_train = train[v]
    y_train = train['y']
    n_train = np.shape(X_train)[0]
    # Scale/impute
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    imputer = KNNImputer()
    X_train = imputer.fit_transform(X_train)
    # Train Logistic Regression with inner CV using training sample
    clf = LogisticRegressionCV(cv=inner,
                               penalty='l1',
                               Cs=10**np.linspace(0.1, -3, 50),
                               random_state=42,
                               solver='liblinear',
                               scoring=roc_auc_scorer).fit(X_train, y_train)
    # Predict in validation sample
    X_test = valid[v]
    y_test = valid['y']
    X_test = scaler.transform(X_test)
    X_test = imputer.transform(X_test)
    y_pred = clf.predict(X_test)
    y_prob = clf.predict_proba(X_test)[:, 1]
    # Return
    return {
        'clf': clf,
        'n_train': n_train,
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test,
        'y_pred': y_pred,
        'y_prob': y_prob
    }
Example #26
def knn_impute_by_item(matrix, valid_data, k):
    """ Fill in the missing values using k-Nearest Neighbors based on
    question similarity. Return the accuracy on valid_data.

    :param matrix: 2D sparse matrix
    :param valid_data: A dictionary {user_id: list, question_id: list,
    is_correct: list}
    :param k: int
    :return: float
    """
    #####################################################################
    # TODO:                                                             #
    # Implement the function as described in the docstring.             #
    #####################################################################
    nbrs = KNNImputer(n_neighbors=k)
    mat = nbrs.fit_transform(matrix.T)
    acc = sparse_matrix_evaluate(valid_data, mat.T)
    print("Validation Accuracy Item_based with k = {} : {}".format(k, acc))
    #####################################################################
    #                       END OF YOUR CODE                            #
    #####################################################################
    return acc
Example #27
def impute_df(df):
    # imputer = KNN()
    imputer = KNNImputer(n_neighbors=2)
    object_types = list(df.select_dtypes(include=['object']).columns)
    num_types = list(set(df.columns) - set(object_types))
    encoders_store = {}
    for column in num_types:
        skew = df[column].skew()
        if (-1 < skew < 1):
            df[column] = df[column].fillna(df[column].mean())
        else:
            df[column] = df[column].fillna(df[column].median())
    # Integer-encode each object column, keeping the fitted encoder so the
    # codes can be mapped back after imputation.
    for column in object_types:
        df[column], encoders_store[column] = encode(df[column])
    # Only the encoded categorical columns still contain NaN here; round the
    # imputed values so they map back to valid category codes.
    imputed_data = pd.DataFrame(np.round(imputer.fit_transform(df)),
                                columns=df.columns)
    for column in object_types:
        imputed_data[column] = encoders_store[column].inverse_transform(
            imputed_data[column].to_numpy().reshape(-1, 1)).ravel()
    return imputed_data
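
encode is not shown; from its usage (it returns a value plus a fitted encoder whose inverse_transform accepts a (-1, 1)-shaped array) it plausibly wraps an OrdinalEncoder. A hypothetical sketch:

from sklearn.preprocessing import OrdinalEncoder

def encode(series):
    # Hypothetical helper: integer-encode the non-null values and return
    # the encoded series (NaN preserved) plus the fitted encoder.
    enc = OrdinalEncoder()
    out = series.copy()
    notna = series.notna()
    out[notna] = enc.fit_transform(
        series[notna].to_numpy().reshape(-1, 1)).ravel()
    return out, enc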
Example #28
def impute_last_new_job(df, cat_var):
    df['last_new_job'] = df['last_new_job'].replace(['never'], 0)
    df['last_new_job'] = df['last_new_job'].replace(['>4'], 5)
    df1 = df.drop(cat_var, axis=1)
    imputer = KNNImputer()
    df1_imputed = imputer.fit_transform(df1)
    df1_imputed = pd.DataFrame(df1_imputed,
                               index=df1.index,
                               columns=df1.columns)

    bins = np.linspace(-1, 5, 7)
    labels = [
        'lnj_zero', 'lnj_one', 'lnj_two', 'lnj_three', 'lnj_four', 'lnj_five'
    ]
    df1_imputed['lnj_bins'] = pd.cut(df1_imputed['last_new_job'],
                                     bins=bins,
                                     labels=labels)
    df2 = pd.get_dummies(df1_imputed['lnj_bins'])
    df = df.drop(['last_new_job'], axis=1)
    df = pd.concat([df, df2], axis=1)
    return df
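
For reference, np.linspace(-1, 5, 7) gives the bin edges [-1, 0, 1, 2, 3, 4, 5], and pd.cut's right-closed intervals put each imputed integer into its own labeled bin. A quick check:

import numpy as np
import pandas as pd

edges = np.linspace(-1, 5, 7)  # [-1., 0., 1., 2., 3., 4., 5.]
labels = ['lnj_zero', 'lnj_one', 'lnj_two', 'lnj_three', 'lnj_four', 'lnj_five']
print(pd.cut(pd.Series([0, 2.4, 5]), bins=edges, labels=labels).tolist())
# ['lnj_zero', 'lnj_three', 'lnj_five']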
Example #29
def experiment_setting_5(X, y, runs=5, missingness=0.1):
    results = []
    for i in range(runs):
        np.random.seed(i)
        X_missing = make_missing_random(X, missingness)

        ss = StratifiedKFold(shuffle=True, random_state=i)

        for train_index, test_index in ss.split(X, y):
            X_train = X_missing[train_index]
            y_train = y[train_index]
            X_test = X[test_index]
            y_test = y[test_index]

            imputer = KNNImputer()
            X_train = imputer.fit_transform(X_train)

            dt = C45DecisionTree(criterion='c45', max_depth=20)
            dt.fit(X_train, y_train)
            results.append(accuracy_score(dt.predict(X_test), y_test))

    return results
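
make_missing_random is not shown; a plausible sketch matching its usage (inject NaNs completely at random at the given rate; the implementation is an assumption).

import numpy as np

def make_missing_random(X, missingness):
    # Hypothetical helper: copy X as float and blank out a random
    # fraction of entries so the imputer has something to fill.
    X = np.asarray(X, dtype=float).copy()
    mask = np.random.rand(*X.shape) < missingness
    X[mask] = np.nan
    return X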
Example #30
def get_imputed(from_depth=0, to_depth=2, mode=MODE_MEAN):
    out = pd.DataFrame(
        index=pd.DatetimeIndex(pd.date_range(FROM_CUTOFF, TO_CUTOFF)))
    print("OUT:", out)

    for csv_path in base_path.glob('*.csv'):
        print(csv_path)
        with open(csv_path, 'r') as f:
            df = pd.read_csv(f)

        df = df[(df.depth >= from_depth) & (df.depth <= to_depth)]
        df.index = pd.to_datetime(df['time'])

        df = df.drop(columns=['depth'])
        df = df.drop(columns=['time'])

        if df.empty:
            continue
        elif mode == MODE_MAX:
            df = df.groupby(pd.Grouper(freq='D')).max()
        elif mode == MODE_MIN:
            df = df.groupby(pd.Grouper(freq='D')).min()
        elif mode == MODE_MEDIAN:
            df = df.groupby(pd.Grouper(freq='D')).median()
        elif mode == MODE_MEAN:
            df = df.groupby(pd.Grouper(freq='D')).mean()
        else:
            raise Exception(mode)

        df = df.rename(columns={'value': csv_path.name.replace('.csv', '')})
        out = pd.merge(out, df, left_index=True, right_index=True, how='outer')
        print(out)

    imputer = KNNImputer()
    out = pd.DataFrame(imputer.fit_transform(out),
                       columns=out.columns,
                       index=out.index)
    return out