Example #1
    def test_accuracy(self, pb, lmbd=0.1, max_iter=100):
        # Draw 8000 labelled samples and recover class indices from the
        # one-hot labels.
        test_x, test_y = pb.get_batch_with_label(8000)
        test_y = np.argmax(test_y, axis=1)

        # Compute codes for the first 6000 samples and fit a logistic
        # regression on them.
        train_data = self.optimize(X=test_x[:6000], lmbd=lmbd,
                                   max_iter=max_iter)
        lgrs = lgr()
        lgrs.fit(train_data, test_y[:6000])

        # Encode the held-out 2000 samples and score the classifier.
        lis_out = self.optimize(X=test_x[6000:], lmbd=lmbd, max_iter=max_iter)
        y_pre = lgrs.predict(lis_out)
        return accuracy_score(test_y[6000:], y_pre)
Example #2
    def test_accuracy(self, pb, lmbd=0.1):
        # Draw 8000 labelled samples and recover class indices from the
        # one-hot labels.
        test_x, test_y = pb.get_batch_with_label(8000)
        test_y = np.argmax(test_y, axis=1)
        zs_test = np.zeros((8000, self.D.shape[0]))

        # Run the network on the first 6000 samples and fit a logistic
        # regression on its outputs.
        feed_final = {"Z": zs_test[:6000], "X": test_x[:6000], "lmbd": lmbd}
        train_data = self.output(**feed_final)
        lgrs = lgr()
        lgrs.fit(train_data, test_y[:6000])

        # Same for the held-out 2000 samples, then score the classifier.
        feed_final = {"Z": zs_test[:2000], "X": test_x[6000:], "lmbd": lmbd}
        lis_out = self.output(**feed_final)
        y_pre = lgrs.predict(lis_out)

        return accuracy_score(test_y[6000:], y_pre)
Example #3
def SNP_loop(x_train, x_test, y_train, num_samples):
    # x_train and y_train are assumed to be TensorFlow tensors; convert them
    # to NumPy arrays for scikit-learn.
    x_train = x_train.numpy()
    y_train = y_train.numpy()
    a, b, c = x_train.shape
    output_shap = np.zeros((num_samples, 0, c))

    model_outputs = []

    for i in range(b):
        x_train1 = np.squeeze(x_train[:num_samples, i, :])
        x_test1 = np.squeeze(x_test[:num_samples, i, :])
        y_train1 = y_train[:num_samples, ...]
        model = lgr().fit(x_train1, y_train1)
        # shap_scores is an external helper (see the sketch after this
        # function) that returns per-feature attribution scores.
        scores = shap_scores(model, x_train1[:num_samples, ...],
                             x_test1[:num_samples, ...])
        scores3 = np.expand_dims(scores, axis=1)
        output_shap = np.append(output_shap, scores3, axis=1)
        model_outputs.append((model.coef_, model.intercept_))

    return [output_shap], model_outputs
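SNP_loop relies on a shap_scores helper that is not defined in the snippet. Below is a minimal sketch of what it could look like, assuming the shap package's LinearExplainer; the helper's name, signature, and return shape are assumptions rather than part of the original example.

import numpy as np
import shap


def shap_scores(model, x_background, x_explain):
    # Hypothetical helper: explain a fitted linear model with SHAP, using
    # x_background as the background distribution.
    explainer = shap.LinearExplainer(model, x_background)
    # For a binary classifier this yields an array of shape
    # (n_samples, n_features).
    return np.asarray(explainer.shap_values(x_explain))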
Example #4
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.savefig(name + "_confusion_matrix.jpg")
    plt.close()
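Only the last few lines of the plotting helper survive in this example. For context, a minimal self-contained version of such a helper might look like the following sketch; the function name and signature are assumptions, not the original code.

import itertools
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix


def plot_confusion_matrix(y_true, y_pred, name):
    # Hypothetical helper; only its closing lines appear in the snippet above.
    cm = confusion_matrix(y_true, y_pred)
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.colorbar()
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j], horizontalalignment='center')
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.savefig(name + "_confusion_matrix.jpg")
    plt.close()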


x_train, x_validation, x_test, x_train_SMOTE, x_train_undersample,\
y_train, y_validation, y_test, y_train_SMOTE, y_train_undersample = get_data()

params = {'C': [0.0001, 0.001, 0.01, 0.1, 1,
                10, 100, 1000, 10000],
          'penalty': ['l1', 'l2']}

# liblinear supports both the l1 and l2 penalties searched in the grid
# (the default lbfgs solver rejects l1).
grid_normal = gscv(lgr(solver='liblinear'), params, cv=10)
grid_undersample = gscv(lgr(solver='liblinear'), params, cv=10)
grid_SMOTE = gscv(lgr(solver='liblinear'), params, cv=10)

grid_normal.fit(x_train, y_train)
grid_undersample.fit(x_train_undersample, y_train_undersample)
grid_SMOTE.fit(x_train_SMOTE, y_train_SMOTE)

result_normal = pd.DataFrame(grid_normal.cv_results_)
result_undersample = pd.DataFrame(grid_undersample.cv_results_)
result_SMOTE = pd.DataFrame(grid_SMOTE.cv_results_)

#best1 = np.argmax(result1.mean_test_score.values)

name = ["normal", "undersample", "SMOTE"]
y_normal_predict = grid_normal.predict(x_test)
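The example is cut off after the first prediction. The best configuration found by each grid search can be read directly off the fitted objects; a short sketch of how this could continue (the variable names for the extra predictions are illustrative, not from the original):

for label, grid in zip(name, [grid_normal, grid_undersample, grid_SMOTE]):
    print(label, grid.best_params_, grid.best_score_)

y_undersample_predict = grid_undersample.predict(x_test)
y_SMOTE_predict = grid_SMOTE.predict(x_test)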
Example #5
        # NOTE: this snippet begins part-way through create_dictionary_dl;
        # fname, N, K and lmbd are defined in the elided part of the function.
        mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
        im = mnist.train.next_batch(N)[0]
        im = im.reshape(N, 28, 28)
        # im = [imresize(a, (17, 17), interp='bilinear', mode='L') - .5
        #       for a in im]
        X = np.array(im).reshape(N, -1)
        # model = ResNet50(weights='imagenet',include_top=False)
        # X = feat_extract(model,im)
        print(X.shape)

        dl = DictionaryLearning(K, alpha=lmbd*N, fit_algorithm='cd',
                                n_jobs=-1, verbose=1)
        dl.fit(X)
        D = dl.components_.reshape(K, -1)
        np.save(fname, D)
    return D
if __name__ == '__main__':
    D = create_dictionary_dl(0.1)
    pb = MnistProblemGenerator(D, 0.1)
    test_x, test_y = pb.get_batch_with_label(8000)
    test_y = np.argmax(test_y, axis=1)

    from sklearn.linear_model import LogisticRegression as lgr
    from sklearn.metrics import accuracy_score

    # Baseline: logistic regression fitted directly on the raw samples.
    lgrs = lgr()
    lgrs.fit(test_x[:6000], test_y[:6000])
    y_pre = lgrs.predict(test_x[6000:])
    print(accuracy_score(test_y[6000:], y_pre))
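The __main__ block only scores a baseline fitted on the raw samples. To classify on sparse codes with respect to the learned dictionary D (the idea behind Examples #1 and #2), scikit-learn's SparseCoder is one way to obtain the codes. The lines below could be appended to the block above; they are a sketch under that assumption, not part of the original example.

    from sklearn.decomposition import SparseCoder

    # Sketch: encode the same samples against D with an l1-penalised solver,
    # then reuse the logistic-regression baseline on the codes.
    coder = SparseCoder(dictionary=D, transform_algorithm='lasso_cd',
                        transform_alpha=0.1)
    codes = coder.transform(test_x)
    lgrs_codes = lgr()
    lgrs_codes.fit(codes[:6000], test_y[:6000])
    print(accuracy_score(test_y[6000:], lgrs_codes.predict(codes[6000:])))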



Example #6
def classify(s):
    import pandas as pd
    import numpy
    import pandas_montecarlo  # registers the .montecarlo() accessor on pandas Series
    from scipy.stats import shapiro, kruskal, f_oneway  # used only by the commented-out checks below
    from sklearn.ensemble import RandomForestClassifier as rfc
    from sklearn.neighbors import KNeighborsClassifier as knc
    from sklearn.svm import SVC as svc
    from sklearn.linear_model import LogisticRegression as lgr
    ## Several classifiers with an optional Monte-Carlo-simulated training set
    numpy.random.seed(s)

    #df = pd.read_csv("mc_test_data.csv")
    #df = pd.read_csv("rndf_filt_data.csv")
    df = pd.read_csv("data.csv")
    # random forest previously selected these columns as most predictive
    df = df[['diagnosis', 'area_worst', 'concave points_mean',
             'concave points_worst', 'perimeter_worst', 'radius_worst']]

    #print(df.head())
    #df = df.drop(["id","Unnamed: 32"],axis=1)
    #df = df.drop(["Unnamed: 0"],axis=1)
    df = df.replace({'diagnosis': "M"}, 1)
    df = df.replace({'diagnosis': "B"}, 0)

    #split dataset for mc seed and testing

    df_mc, df = numpy.split(df, [int(.7*len(df))])

    #split dataset by class
    #df_1 = pd.read_csv("mc_data_M.csv").drop(["Unnamed: 0"],axis=1)
    #df_0 = pd.read_csv("mc_data_B.csv").drop(["Unnamed: 0"],axis=1)
    df_1 = df_mc.loc[df_mc.diagnosis==1]
    df_0 = df_mc.loc[df_mc.diagnosis==0]
    df_1 = df_1.drop(["diagnosis"],axis=1)
    df_0 = df_0.drop(["diagnosis"],axis=1)

    #simulate class 0 data
    mc_sim_df_0 = pd.DataFrame()
    mc_sim_df_0['diagnosis']= ['0'] * len(df_0.index)
    for col in df_0.columns:
        col_sim = df_0[col].montecarlo(sims=2, bust=0, goal=0).data
        col_sim = col_sim.drop(["original"], axis=1)
        # keep the last simulated series for this column
        for col2 in col_sim.columns:
            mc_sim_df_0[col] = col_sim[col2]
            #if(shapiro(mc_sim_df_0[col])[1]>0.05):
                #print(kruskal(mc_sim_df_0[col],df_0[col]))
            #else:
                #print(f_oneway(mc_sim_df_0[col],df_0[col]))

    #simulate class 1 data
    mc_sim_df_1 = pd.DataFrame()
    mc_sim_df_1['diagnosis']= ['1'] * len(df_1.index)
    for col in df_1.columns:
        col_sim = df_1[col].montecarlo(sims=2, bust=0, goal=0).data
        col_sim = col_sim.drop(["original"], axis=1)
        # keep the last simulated series for this column
        for col2 in col_sim.columns:
            mc_sim_df_1[col] = col_sim[col2]
            #if(shapiro(mc_sim_df_1[col])[1]>0.05):
                #print(kruskal(mc_sim_df_1[col],df_1[col]))
            #else:
                #print(f_oneway(mc_sim_df_1[col],df_1[col]))


    #diag = pd.concat([mc_sim_df_1, mc_sim_df_0])['diagnosis']
    mc_sim_df = pd.concat([mc_sim_df_1, mc_sim_df_0])  # DataFrame.append no longer exists in pandas >= 2.0
    #shuffling dataframe for good luck
    #mc_sim_df = mc_sim_df.sample(frac=1)
    #mc_sim_df['diagnosis']=diag
    mc_sim_df.head(20)  # no-op left over from notebook exploration


    #values formatted
    labels = df["diagnosis"]
    df = df.drop("diagnosis",axis=1)
    dfDev, dfTes = numpy.split(df, [int(.7*len(df))])
    DDev, DTes = numpy.split(labels, [int(.7*len(labels))])

    #DTrn =  mc_sim_df['diagnosis']
    #dfTrn = mc_sim_df.drop(['diagnosis'], axis = 1)
    DTrn =  df_mc['diagnosis']
    dfTrn = df_mc.drop(['diagnosis'], axis = 1)
    
    scores = []

    # Fit each candidate model on the training split and record its
    # accuracy on the dev split.
    candidates = [
        rfc(),                  # random forest
        knc(),                  # k-nearest neighbours
        svc(kernel="linear"),   # linear SVM
        svc(kernel="rbf"),      # RBF SVM
        lgr(),                  # logistic regression
    ]
    for model in candidates:
        model = model.fit(dfTrn.values, DTrn)
        preds = model.predict(dfDev.values)
        hit = 0
        for i in range(len(preds)):
            if int(preds[i]) == int(DDev.iloc[i]):
                hit += 1
        scores.append(hit / len(preds))
    
    return scores
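classify returns the five dev-set accuracies in the order random forest, kNN, linear SVM, RBF SVM, logistic regression, with s seeding NumPy's global RNG. A possible driver, assumed rather than taken from the original, that averages the scores over a few seeds:

import numpy as np

all_scores = np.array([classify(seed) for seed in range(5)])
print(all_scores.mean(axis=0))  # mean accuracy per model across seeds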
Example #7
# (DiabetesMiddle, DiabetesNoMiddle and predictarray are defined earlier in the
# original script; predictarray presumably holds per-row predicted probabilities
# for the "middle" rows, thresholded here at 0.75.)
MiddleDF75 = DiabetesMiddle.loc[predictarray < 0.75]

FinalTrain = pd.concat([DiabetesNoMiddle, MiddleDF75], axis=0)

#Get the logistic regression fit object, after removing specific columns:

TrainLR = FinalTrain.drop(['diabfeat_neurologic', 'race_AfricanAmerican', 'A1Cresult_>7', 'primarydiag_injury', 'number_diagnoses', 
    'med_glimepiride', 'med_insulin', 'diag_infection', 'medical_specialty_Orthopedics', 'med_nateglinide', 'discharge_disposition_leftAMA', 
    'admission_source_id_3', 'change_Ch', 'diag_circulatory', 'medical_specialty_Gastroenterology', 'medical_specialty_Surgery',
    'primarydiag_infection', 'primarydiag_mentaldis'], axis=1)
TrainLRX = TrainLR.drop('readmitted', axis=1)
TrainLRY = TrainLR['readmitted'].replace([2,1], [1,0])

from sklearn.linear_model import LogisticRegression as lgr

lgr = lgr()  # note: rebinds lgr from the imported class to this estimator instance
lgr.set_params(C=0.1, class_weight={0: .2, 1: .8})

lgr.fit(TrainLRX, TrainLRY)

#Get random forest fit object:

from sklearn.ensemble import RandomForestClassifier as rfc

rfc = rfc()  # likewise rebinds rfc from the imported class to this estimator instance
rfc.set_params(n_estimators=1000, min_samples_split=5, min_samples_leaf=1, max_features='sqrt', 
               max_depth=60, random_state=42, class_weight={0:.2, 1:.8})

FinalTrainX = FinalTrain.drop('readmitted', axis=1)
FinalTrainY = FinalTrain['readmitted'].replace([2,1], [1,0])
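The snippet ends once the final training matrices are assembled; presumably the configured random forest is fitted on them next. A sketch of that step (not the original code; the held-out data it would be applied to is assumed):

rfc.fit(FinalTrainX, FinalTrainY)
# Class-1 probabilities can then be thresholded the same way as above, e.g.
# rfc.predict_proba(HeldOutX)[:, 1] >= 0.75, where HeldOutX is hypothetical.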