Code Example #1
import fancyimpute
import joblib
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score

# Corruptor, em, and impute_SOM are project-local helpers assumed importable.


def impute_parameter_adjustment(method, param_grid, impute_radio, x_init,
                                y_init, reference_x, reference_y):
    """Plot the classification error rate (CER) against the imputation
    parameter for several missingness ratios."""
    model = joblib.load('..\\models\\vote_model_hard.joblib')
    markers = ['o', '*', '1', 's', '2']
    n_reps = 20  # Monte Carlo repetitions per missingness ratio
    for radio, marker in zip(impute_radio, markers):
        acc_1 = {i: 0 for i in param_grid}  # CER vs. full-data predictions
        acc_2 = {i: 0 for i in param_grid}  # CER vs. true labels
        for m in range(n_reps):
            # Corrupt the data missing-completely-at-random (MCAR).
            corruptor = Corruptor(x_init, radio)
            x_miss = corruptor.mcar()
            for n in param_grid:
                stacked = np.vstack((x_miss, reference_x))
                if method == 'knn':
                    x_impute = fancyimpute.KNN(k=n).fit_transform(
                        stacked)[:x_init.shape[0]]
                elif method == 'mice':
                    # Average n posterior draws from the iterative imputer.
                    data_impute_list = []
                    for i in range(n):
                        imputer = fancyimpute.IterativeImputer(
                            n_iter=13, sample_posterior=True, random_state=i)
                        data_impute_list.append(
                            imputer.fit_transform(stacked)[:x_init.shape[0]])
                    x_impute = np.mean(data_impute_list, 0)
                    print(radio, m, n)  # progress logging
                elif method == 'em':
                    x_impute = em(stacked, loops=n)[:x_init.shape[0]]
                elif method == 'som':
                    x_impute = impute_SOM(x_miss, n)[:x_init.shape[0]]
                y_pred1 = model.predict(x_impute)
                y_pred2 = model.predict(x_init)
                # Accumulate classification error rates (CER = 1 - accuracy).
                acc_1[n] += 1 - accuracy_score(y_pred1, y_pred2)
                acc_2[n] += 1 - accuracy_score(y_pred1, y_init)
        # Average the error rates over the Monte Carlo repetitions.
        acc_1 = {i: (j / n_reps) for i, j in acc_1.items()}
        acc_2 = {i: (j / n_reps) for i, j in acc_2.items()}
        plt.subplot(121)
        plt.plot(list(acc_1.keys()),
                 list(acc_1.values()),
                 marker=marker,
                 label='%.1f%%' % (radio * 100))
        plt.xlabel('K')
        plt.ylabel('CER between imputation and prediction')
        plt.subplot(122)
        plt.plot(list(acc_2.keys()),
                 list(acc_2.values()),
                 marker=marker,
                 label='%.1f%%' % (radio * 100))
        plt.xlabel('K')
        plt.ylabel('CER between imputation and real label')
        plt.legend(loc=0, bbox_to_anchor=(0.3, -0.05), ncol=5)
    plt.show()
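A minimal driver sketch for the function above, assuming the project-local Corruptor, em, and impute_SOM helpers are importable; the array names and the parameter grid are hypothetical placeholders, and five ratios are used to match the five plot markers.

# Hypothetical usage: sweep k = 1..10 for KNN imputation at five
# missingness ratios; x_train/y_train and x_ref/y_ref stand in for the
# project's own feature matrices and labels.
impute_parameter_adjustment(method='knn',
                            param_grid=list(range(1, 11)),
                            impute_radio=[0.05, 0.10, 0.15, 0.20, 0.25],
                            x_init=x_train, y_init=y_train,
                            reference_x=x_ref, reference_y=y_ref)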
Code Example #2
import fancyimpute
import numpy as np
import scipy.stats
from sklearn.linear_model import BayesianRidge

# glm_testing is a project-local module assumed importable here.


def multi_imp_conf(perc, c, m=10):
    """Multiple imputation for confidence intervals."""
    df, y = glm_testing.create_missing(perc=perc, c=c)
    drug_vals, drug_true = glm_testing.test_drug(c=c)
    prob_sum = np.zeros(20)  # pooled Gaussian densities, one per test point
    for k in range(m):
        clf = BayesianRidge()
        # Each pass draws a fresh completed design matrix from the
        # imputer's posterior, then refits the Bayesian ridge model.
        design = fancyimpute.IterativeImputer(
            n_iter=10, sample_posterior=True,
            random_state=int(k * 243624) % 2**31).fit_transform(df)
        clf.fit(design, y)
        drug_preds, std = clf.predict(drug_vals, return_std=True)
        # Pool across imputations by averaging the predictive densities.
        prob_sum += (1 / m) * scipy.stats.norm(drug_preds, std).pdf(drug_true)
    # Count test points whose pooled density falls below 0.05 (95% level).
    return sum(prob_sum < 0.05)
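As a rough usage sketch (the semantics of perc and c come from the project-local glm_testing module, so the values below are assumptions): the return value counts how many of the 20 test points have a pooled predictive density below 0.05.

# Hypothetical sweep over missingness levels, m=10 imputations each.
for perc in (0.1, 0.2, 0.3):
    n_low = multi_imp_conf(perc=perc, c=1.0, m=10)
    print('missing %.0f%%: %d of 20 points below the 0.05 density cutoff'
          % (perc * 100, n_low))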
Code Example #3
def multi_imp(perc, c, m=10):
    """Multiple imputation for log probability."""
    df, y = glm_testing.create_missing(perc=perc, c=c)
    drug_vals, drug_true = glm_testing.test_drug(c=c)
    prob_sum = 0  # pooled joint Gaussian probability
    for k in range(m):
        clf = BayesianRidge()
        design = fancyimpute.IterativeImputer(
            n_iter=10, sample_posterior=True,
            random_state=int(k * 243624) % 2**31).fit_transform(df)
        clf.fit(design, y)
        drug_preds, std = clf.predict(drug_vals, return_std=True)
        # Joint density of the held-out outcomes under this imputation,
        # averaged over the m imputations.
        prob_sum += (1 / m) * scipy.stats.norm(drug_preds,
                                               std).pdf(drug_true).prod()
    return -np.log(prob_sum)
Code Example #4
import fancyimpute
import numpy as np

# em() is a project-local EM-based imputer assumed importable.


def impute(data, method='knn', n=5):
    """Impute missing values in ``data`` with the chosen method."""
    if method == 'knn':
        data_impute = fancyimpute.KNN(k=n).fit_transform(data)
    elif method == 'mice':
        # Average several posterior draws from the iterative imputer.
        data_impute_list = []
        for i in range(11):
            imputer = fancyimpute.IterativeImputer(n_iter=13,
                                                   sample_posterior=True,
                                                   random_state=i)
            data_impute_list.append(imputer.fit_transform(data))
        data_impute = np.mean(data_impute_list, 0)
    elif method == 'em':
        data_impute = em(data)
    elif method == 'mean':
        data_impute = fancyimpute.SimpleFill(
            fill_method='mean').fit_transform(data)
    else:
        raise ValueError('unknown imputation method: %s' % method)
    return data_impute
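This helper only needs a numeric array with NaNs, so a self-contained smoke test is straightforward; the 'em' branch is skipped because it relies on the project-local em() function.

import numpy as np

# Tiny matrix with two missing entries; compare the library-backed methods.
data = np.array([[1.0, 2.0, np.nan],
                 [4.0, np.nan, 6.0],
                 [7.0, 8.0, 9.0],
                 [2.0, 3.0, 4.0]])
for method in ('knn', 'mice', 'mean'):
    filled = impute(data.copy(), method=method, n=2)
    print(method, filled[0, 2], filled[1, 1])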
Code Example #5
import fancyimpute
import numpy as np
from sklearn.tree import DecisionTreeClassifier


def dec_multi(perc, c, m=10):
    """
    Decision tree classifier with multiple imputation
    """
    clf = DecisionTreeClassifier()
    df, y = glm_testing.create_missing(perc=perc, c=c)
    drug_vals, drug_true = glm_testing.test_drug(c=c)
    # Binarize targets: 1 if the value exceeds 1, else 0.
    y = y.apply(lambda x: 1 if x > 1 else 0)
    drug_true = drug_true.apply(lambda x: 1 if x > 1 else 0)
    prob_sum = 0
    for k in range(m):
        design = fancyimpute.IterativeImputer(
            n_iter=10, sample_posterior=True,
            random_state=int(k * 243624) % 2**31).fit_transform(df)
        clf.fit(design, y)
        # Joint likelihood of the true labels under the refit tree,
        # averaged over the m imputations.
        proba = clf.predict_proba(drug_vals)
        prob_sum += (1 / m) * (proba[:, 1][drug_true == 1].prod() *
                               proba[:, 0][drug_true == 0].prod())
    return np.log(prob_sum)
Code Example #6
from sklearn.linear_model import LogisticRegression


def log_multi(perc, c, m=10, extras=0):
    """
    Logistic regression classifier with multiple imputation
    """
    clf = LogisticRegression(solver="liblinear")
    df, y = glm_testing.create_missing(perc=perc, c=c, extras=extras)
    drug_vals, drug_true = glm_testing.test_drug(c=c, extras=extras)
    y = y.apply(lambda x: 1 if x > 1 else 0)
    drug_true = drug_true.apply(lambda x: 1 if x > 1 else 0)
    prob_sum = 0  # pooled joint likelihood
    for k in range(m):
        design = fancyimpute.IterativeImputer(
            n_iter=10, sample_posterior=True,
            random_state=int(k * 243624) % 2**31).fit_transform(df)
        clf.fit(design, y)
        proba = clf.predict_proba(drug_vals)
        prob_sum += (1 / m) * (proba[:, 1][drug_true == 1].prod() *
                               proba[:, 0][drug_true == 0].prod())
    return np.log(prob_sum)
Code Example #7
import warnings

import numpy as np
import pandas as pd
import fancyimpute as fi


def impute(df, method, verbose=False):
    """
    Impute missing data using specified imputation method.
    
    Parameters
    ----------
    df: pd.DataFrame
        Stat DataFrame with source columns and player/team multi-index.
    method: str/bool
        Imputation method for missing data.
            - False: Do not impute missing data.
            - None: Do not impute missing data.
            - 'BiScaler'
            - 'IterativeImpute'
            - 'IterativeSVD'
            - 'KNN': Impute with nearest neighbors.
            - 'MatrixFactorization'
            - 'Mean': Impute missing with average of other sources.
            - 'Median': Impute missing with median of other sources.
            - 'NuclearNorm'
            - 'SoftImpute'
    verbose: bool, default=False
        If True, print debugging information.
        
    Returns
    -------
    df: pd.DataFrame
        Imputed DataFrame with no NaNs.
    """
    warnings.filterwarnings('ignore', category=RuntimeWarning)

    # Subset DataFrame to include only projection columns.
    ignored_cols = ['Player', 'Team', 'Pos', 'Week', 'STATS']
    impute_cols = [col for col in list(df) if col not in ignored_cols]
    X = df[impute_cols].copy().T

    # Impute DataFrame.
    v = verbose
    if method in [None, False]:
        imputed_vals = X.values
    elif np.sum(np.sum(X.isnull())) == 0:
        # No missing values.
        imputed_vals = X.values
    elif method == 'BiScaler':
        imputed_vals = fi.BiScaler(verbose=v).fit_transform(X)
    elif method == 'IterativeImpute':
        imputed_vals = fi.IterativeImputer(verbose=v).fit_transform(X)
    elif method == 'IterativeSVD':
        imputed_vals = fi.IterativeSVD(verbose=v).fit_transform(X)
    elif method == 'KNN':
        imputed_vals = fi.KNN(k=3, verbose=v).fit_transform(X)
    elif method == 'MatrixFactorization':
        imputed_vals = fi.MatrixFactorization(verbose=v).fit_transform(X)
    elif method == 'Mean':
        imputed_vals = fi.SimpleFill('mean').fit_transform(X)
    elif method == 'Median':
        imputed_vals = fi.SimpleFill('median').fit_transform(X)
    elif method == 'NuclearNorm':
        imputed_vals = fi.NuclearNormMinimization(verbose=v).fit_transform(X)
    elif method == 'SoftImpute':
        imputed_vals = fi.SoftImpute(verbose=v).fit_transform(X)
    else:
        raise ValueError(f'Unknown imputation method: {method}')

    # Recombine ignored columns with imputed data.
    imputed_df = pd.DataFrame(imputed_vals.T, columns=X.index)
    for col in impute_cols:
        if len(imputed_df[col]) != len(df[col]):
            print(f'df: {len(df[col])}\nimp: {len(imputed_df[col])}')
        df[col] = imputed_df[col].values

    return df
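A minimal sketch of calling this version on a toy stat DataFrame; the source column names below are hypothetical stand-ins for the projection sources the docstring describes.

import numpy as np
import pandas as pd

# Toy frame: three projection sources for four players, with one gap each.
toy = pd.DataFrame({
    'Player': ['A', 'B', 'C', 'D'],
    'SourceX': [10.0, np.nan, 8.0, 12.0],
    'SourceY': [11.0, 9.5, np.nan, 12.5],
    'SourceZ': [np.nan, 9.0, 8.5, 13.0],
})
filled = impute(toy, method='Mean')
print(filled)  # each NaN replaced by the mean of that player's other sources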
Code Example #8
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel
from sklearn.feature_selection import RFE, RFECV
from sklearn.preprocessing import StandardScaler  # used by the pipeline below
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import fancyimpute

model_tree = RandomForestClassifier(random_state=100, n_estimators=50)

steps = [('imputation', fancyimpute.IterativeImputer(verbose=0)),
         ('scaler', StandardScaler()), ("over", SMOTE(random_state=42)),
         ("under", RandomUnderSampler()), ('tree', RandomForestClassifier())]

# Create the pipeline: pipeline
pipeline = Pipeline(steps)

# Best result with selected parameters
# Number of trees in random forest
n_estimators = [1400]
# Number of features to consider at every split
max_features = ['auto']
# Maximum number of levels in tree
max_depth = [130]
# Minimum number of samples required to split a node
min_samples_split = [8]
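The snippet ends after defining single-value hyperparameter lists. One plausible continuation, shown here as an assumption rather than the original code, is to wire them into a cross-validated search over the pipeline's 'tree' step:

from sklearn.model_selection import GridSearchCV

# Hypothetical continuation: parameter grid keyed by the 'tree' pipeline
# step defined above; with one value per list this fits a single candidate.
param_grid = {
    'tree__n_estimators': n_estimators,
    'tree__max_features': max_features,
    'tree__max_depth': max_depth,
    'tree__min_samples_split': min_samples_split,
}
search = GridSearchCV(pipeline, param_grid=param_grid, cv=3)
# search.fit(X_train, y_train)  # X_train/y_train supplied by the caller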
Code Example #9
File: Util.py  Project: piyushkhadgi/Butterfly
import numpy
import pandas
import fancyimpute

# scaleColumns is a project-local helper assumed importable here.


def feature_create(config):
    """Create modeling features from the raw data file."""
    df = pandas.read_csv(config.raw_file, delimiter=',')

    # Feature creation title

    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    df.loc[df["Title"] == 'Mr', 'Title2'] = 'Mr'
    df.loc[df["Title"] == 'Mrs', 'Title2'] = 'Mrs'
    df.loc[df["Title"] == 'Miss', 'Title2'] = 'Miss'
    df.loc[df["Title"] == 'Master', 'Title2'] = 'Master'
    df.loc[df["Title"] == 'Ms', 'Title2'] = 'Ms'
    df.loc[df["Title"] == 'Mlle', 'Title2'] = 'Miss'
    df.loc[df["Title"] == 'Ms', 'Title2'] = 'Miss'
    df.loc[df["Title"] == 'Mme', 'Title2'] = 'Mrs'
    df.loc[df["Title2"].isna(), 'Title2'] = 'Rare'
    df_title = pandas.get_dummies(df.Title2, prefix='Title')
    df = pandas.concat([df, df_title], axis=1)
    df['Name_len'] = df.Name.str.len()
    df['Name_space'] = df.Name.str.count(' ')
    df = df.drop(['Name', 'Title', 'Title2'], axis=1)

    # Missing Value treatment for Cabin

    df['Cabin_new'] = df.Cabin.str[:1]
    df = df.drop(['Cabin'], axis=1)
    df_cabin = pandas.get_dummies(df.Cabin_new, prefix='Cabin', dummy_na=True)
    df = pandas.concat([df, df_cabin], axis=1)
    df = df.drop(['Cabin_new'], axis=1)

    # Feature creation Sex

    df['Gender'] = df['Sex'].map({'female': 1, 'male': 0}).astype(int)
    df_sex = pandas.get_dummies(df.Sex, prefix='Sex')
    df = pandas.concat([df, df_sex], axis=1)
    df = df.drop(['Sex'], axis=1)

    # Family

    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['withsomebody'] = df['SibSp'] + df['Parch']
    df["isalone"] = df['withsomebody'].copy()
    df["isalone"].loc[df['withsomebody'] > 0] = 0
    df["isalone"].loc[df['withsomebody'] == 0] = 1

    # Missing Value treatment for Embarked

    df['Embarked'] = df['Embarked'].fillna('C')
    df_Embarked = pandas.get_dummies(df.Embarked, prefix='Embarked')
    df = pandas.concat([df, df_Embarked], axis=1)
    df = df.drop(['Embarked'], axis=1)

    # Missing Value treatment for Ticket

    new = df["Ticket"].str.split(" ", n=2, expand=True)
    new[3] = numpy.where(new[2].isna(), new[1], new[2])
    new['Ticket1'] = numpy.where(new[3].isna(), new[0], new[3])
    new['Ticket2'] = new[0].str.extract('([A-Za-z]+)', expand=False)
    new['T_length'] = new.Ticket1.str.len()
    new['T_First'] = new.Ticket1.str[:1]
    new = new.drop([0, 1, 2, 3, 'Ticket1'], axis=1)
    df = pandas.concat([df, new], axis=1)
    df = df.drop(['Ticket'], axis=1)

    df.loc[df['T_length'] < 5, 'T_l_new'] = 'S'
    df.loc[df['T_length'] == 5, 'T_l_new'] = 'M'
    df.loc[df['T_length'] > 5, 'T_l_new'] = 'L'
    df.loc[df['T_First'] == '1', 'T_f_new'] = 'S'
    df.loc[df['T_First'] == '2', 'T_f_new'] = 'M'
    df.loc[df['T_f_new'].isna(), 'T_f_new'] = 'L'
    df['High_ticket'] = df['Ticket2'].isin(['PP', 'PC', 'C', 'P'])
    df_t1 = pandas.get_dummies(df.T_l_new, prefix='T_l')
    df = pandas.concat([df, df_t1], axis=1)
    df_t2 = pandas.get_dummies(df.T_f_new, prefix='T_F')
    df = pandas.concat([df, df_t2], axis=1)

    df = df.drop(['T_l_new', 'T_f_new', 'T_First', 'T_length', 'Ticket2'],
                 axis=1)

    # interaction between class and age

    df['Age*Class'] = df["Age"] * df["Pclass"]

    # interaction between class and child

    df.loc[df['Age'] < 16, 'ischild'] = 1
    df.loc[df.ischild.isna(), 'ischild'] = 0
    df["Child*Class"] = df["ischild"] * df["Pclass"]

    # interaction between class and gender

    df["Gender*Class"] = df["Gender"] * df["Pclass"]

    # Missing Value treatment for Age

    # Impute remaining gaps (chiefly Age) with an iterative imputer, keeping
    # Survived out of the design matrix.
    df_ii = pandas.DataFrame(fancyimpute.IterativeImputer().fit_transform(
        df.drop('Survived', axis=1)))
    df_ii.columns = df.drop('Survived', axis=1).columns
    df_ii.index = df.index
    df_ii = pandas.concat([df_ii, df.Survived], axis=1)
    # Rows flagged _data_ == 0 (presumably the unlabeled test split) get
    # their Survived label blanked again.
    df_ii.loc[df_ii['_data_'] == 0, 'Survived'] = float('NaN')
    df = df_ii

    df.loc[df['Age'] <= 16, 'Age_new'] = 'A'
    df.loc[(df['Age'] > 16) & (df['Age'] <= 26), 'Age_new'] = 'B'
    df.loc[(df['Age'] > 26) & (df['Age'] <= 36), 'Age_new'] = 'C'
    df.loc[(df['Age'] > 36) & (df['Age'] <= 62), 'Age_new'] = 'D'
    df.loc[df['Age'] > 62, 'Age_new'] = 'E'

    df_age = pandas.get_dummies(df.Age_new, prefix='Age')
    df = pandas.concat([df, df_age], axis=1)
    df = df.drop(['Age_new'], axis=1)

    df = scaleColumns(config, df)

    df.to_csv(config.feature_file, index=False)

    return None
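feature_create only touches config.raw_file and config.feature_file (plus whatever scaleColumns reads), so a minimal hypothetical config object is enough to drive it:

from types import SimpleNamespace

# Hypothetical config; the file paths are placeholders.
config = SimpleNamespace(raw_file='titanic_raw.csv',
                         feature_file='titanic_features.csv')
feature_create(config)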
Code Example #10
File: cnn_cf_hf.py  Project: ZJU-BMI/ITE-estimation
import pickle

import numpy as np
import pandas as pd
import fancyimpute as fi

# DataSet comes from the project's own data package.
from data.read_data import DataSet

with open('split_hf.pickle', 'rb') as f:
    sssList = pickle.load(f)

med_data = pd.read_csv('follow_up_data_2_1y_2_drug_8.csv', encoding='gbk')
med_data = med_data.values

# t: treatment indicator (last column of the medication table).
t = med_data[:, -1]
t = t.reshape(-1, 1)

patient_data = pd.read_csv('follow_up_data_2_1y_2_feature_drug.csv',
                           encoding='gbk')
patient_data = patient_data.values
patient_data = patient_data[:, 1:106]  # keep columns 1-105; skip the first
patient_data = fi.IterativeImputer().fit_transform(patient_data)
# patient_data = minmax_scale(patient_data, feature_range=(0, 1))
x = patient_data

outcome = pd.read_csv('follow_up_data_2_1y_2_outcome.csv', encoding='gbk')
outcome = outcome.values
outcome = outcome[:, 1]
y = outcome.reshape(-1, 1)

Acc_tol = np.zeros(shape=(10, 20))
precision_tol = np.zeros(shape=(10, 20))
recall_tol = np.zeros(shape=(10, 20))
f1_tol = np.zeros(shape=(10, 20))
for i in range(0, 10):  # hyperparameter tuning
    print("iteration number: %d" % i)
    sss = sssList[i]