# NOTE(review): this fragment reads names that are defined outside the
# visible chunk (k, J, count, F1max, cv, f) -- it appears to sit inside a
# loop sweeping candidate feature counts k over CV folds; confirm against
# the full file before relying on this reconstruction.

# Select the k best features by chi-squared score and train a logistic
# regression on the reduced training matrix.
# (The original bound the ndarray returned by fit_transform() to `clf` and
# then called clf.predict(), which cannot work: keep the fitted selector so
# the CV fold can be reduced the same way, and predict with the fitted
# classifier instead.)
model = LogisticRegression()
selector = SelectKBest(chi2, k=k).fit(Xtrain, ytrain)
model.fit(selector.transform(Xtrain), ytrain)

# Score the current fold with F1 and fold it into the running average.
Xcv = cv[:, 2:]
ycv = cv[:, 1]
p = int(0)  # kept from the original; unused here -- TODO confirm
n = int(0)  # kept from the original; unused here -- TODO confirm
pred = model.predict(selector.transform(Xcv))
J += float(f1_score(ycv, pred))
count += 1
f1score = float(J / count)
print("No of features %d f1_score %f" % (k, f1score))

# Track the best-scoring feature count seen so far.
if f1score > F1max:
    F1max = f1score
    n_features = k
print(n_features, F1max)

# Refit RFE on the full training data using the best feature count found.
Xtrain = f[:, 2:]
ytrain = f[:, 1]
model = LogisticRegression()
rfe = RFE(model, n_features)
rfe = rfe.fit(Xtrain, ytrain)
def acc(classifier, mdict, splits=10, fselect='', nfeat=100, fmin=0, fmax=1000, a=.05, thresh=0): acc = [] acc_tr = [] # load data phi = mdict.get('phi') testPhi = mdict.get('testPhi') for i in range(0, splits): X = phi[(i, 0)] s = X.shape if len(s) == 3: X = np.reshape(X, [s[0], s[1] * s[2]]) else: X = np.reshape(X, [s[0], s[1]]) # random data rows = random.sample(range(0, s[0]), 4) rcts = X[rows, ] rlabs = range(0, 4) classifier.fit(rcts, rlabs) y = classifier.predict(X) y = np.reshape(y, s[0]) X_test = testPhi[(i, 0)] s = X_test.shape if len(s) == 3: X_test = np.reshape(X_test, [s[0], s[1] * s[2]]) else: X_test = np.reshape(X_test, [s[0], s[1]]) y_test = classifier.predict(X_test) y_test = np.reshape(y_test, s[0]) # subset features if 'min' in fselect: cols = X.astype(bool).sum(axis=0) > fmin X = X[:, cols] X_test = X_test[:, cols] if 'max' in fselect: cols = X.astype(bool).sum(axis=0) < fmax X = X[:, cols] X_test = X_test[:, cols] if 'thresh' in fselect: X[X < thresh] = 0 X_test[X_test < thresh] = 0 if 'MI' in fselect: model = SelectKBest(mutual_info_classif, k=nfeat).fit(X, y) X = model.transform(X) X_test = model.transform(X_test) elif 'PCA'in fselect: model = PCA(n_components=nfeat).fit(X) X = model.transform(X) X_test = model.transform(X_test) elif 'reg' in fselect: model = SelectFpr(f_classif, alpha=a).fit(X, y) X = model.transform(X) X_test = model.transform(X_test) elif 'kbest' in fselect: model = SelectKBest(f_classif, k=nfeat).fit(X, y) X = model.transform(X) X_test = model.transform(X_test) # fit model model = classifier.fit(X, y) # Compute accuracy for validation set y_hat = model.predict(X_test) acc.append(sum(y_hat == y_test)/len(y_test)) # Compute accuracy for training set y_hat = model.predict(X) acc_tr.append(sum(y_hat == y) / len(y)) i += 1 results = stats.ttest_1samp(acc, popmean=755/2126) p_val = results[1] results = stats.ttest_1samp(acc_tr, popmean=755/2126) p_val_tr = results[1] return np.mean(acc), np.std(acc), p_val, np.mean(acc_tr), 
np.std(acc_tr), p_val_tr
def acc(classifier, fname, yfname=None, root='./data/', fselect='min', nfeat=100, fmin=0, fmax=1000, a=.05, thresh=0):
    """Fit *classifier* on one train/test split loaded from MATLAB files.

    Loads feature matrices ('phi', 'testPhi') from ``root + fname`` and
    labels ('asd', 'asdTe') from ``root + yfname`` (or the same file when
    *yfname* is None), optionally subsets/rescales/selects features, fits
    the classifier once, and returns ``(test_accuracy, train_accuracy)``.

    fselect is interpreted by substring: any of 'min'/'max'/'thresh' plus
    one of 'MI'/'PCA'/'reg'/'kbest'.
    """
    # load data
    mdict = scipy.io.loadmat(root + fname)  # import dataset from matlab
    if yfname is None:
        ymdict = mdict
    else:
        ymdict = scipy.io.loadmat(root + yfname)  # import dataset from matlab
    phi = mdict.get('phi')
    testPhi = mdict.get('testPhi')
    asd = ymdict.get('asd')
    testASD = ymdict.get('asdTe')

    X = phi[(0, 0)]
    if isinstance(X, np.void):
        # MATLAB struct case: presumably (indices, values, size) fields of a
        # sparse matrix -- TODO confirm against the .mat writer. Rebuilt as
        # a dense array via a torch sparse tensor; the ``- 1`` converts
        # MATLAB's 1-based indices to 0-based.
        s = X[2][0]
        X = torch.sparse.FloatTensor(
            torch.from_numpy(X[0].astype(dtype='float32') - 1).t().type(
                torch.LongTensor),
            torch.from_numpy(X[1][:, 0].astype(dtype='float32')),
            torch.Size(tuple(s)))
        X = X.to_dense().reshape(s[0], -1).numpy()
    else:
        X = phi
        s = X.shape
    y = asd
    y = np.reshape(y, s[0])

    Xt = testPhi[(0, 0)]
    if isinstance(Xt, np.void):
        # Same sparse-struct reconstruction for the test matrix.
        s = Xt[2][0]
        Xt = torch.sparse.FloatTensor(
            torch.from_numpy(Xt[0].astype(dtype='float32') - 1).t().type(
                torch.LongTensor),
            torch.from_numpy(Xt[1][:, 0].astype(dtype='float32')),
            torch.Size(tuple(s)))
        Xt = Xt.to_dense().reshape(s[0], -1).numpy()
    else:
        Xt = testPhi
        s = Xt.shape
    X_test = Xt
    y_test = testASD
    y_test = np.reshape(y_test, s[0])

    # subset features (column masks are computed on the training matrix and
    # applied identically to the test matrix)
    if 'min' in fselect:
        cols = X.astype(bool).sum(axis=0) > fmin
        X = X[:, cols]
        X_test = X_test[:, cols]
    if 'max' in fselect:
        cols = X.astype(bool).sum(axis=0) < fmax
        X = X[:, cols]
        X_test = X_test[:, cols]
    if 'thresh' in fselect:
        X[X < thresh] = 0
        X_test[X_test < thresh] = 0

    # rescale; with_mean=False keeps a sparse X sparse (centering would
    # densify it)
    if sparse.issparse(X):
        scaler = StandardScaler(with_mean=False)
    else:
        scaler = StandardScaler()
    scaler.fit(X)
    X = scaler.transform(X)
    X_test = scaler.transform(X_test)

    # feature selection (mutually exclusive branches)
    if 'MI' in fselect:
        model = SelectKBest(mutual_info_classif, k=nfeat).fit(X, y)
        X = model.transform(X)
        X_test = model.transform(X_test)
    elif 'PCA' in fselect:
        model = PCA(n_components=nfeat).fit(X)
        X = model.transform(X)
        X_test = model.transform(X_test)
    elif 'reg' in fselect:
        model = SelectFpr(f_classif, alpha=a).fit(X, y)
        X = model.transform(X)
        X_test = model.transform(X_test)
    elif 'kbest' in fselect:
        model = SelectKBest(f_classif, k=nfeat).fit(X, y)
        X = model.transform(X)
        X_test = model.transform(X_test)

    # fit model
    model = classifier.fit(X, y)
    """
    if i == 0:
        coeffs = np.array(model.coef_).transpose()
    else:
        coeffs = np.c_[coeffs, np.array(model.coef_).transpose()]
    """

    # Compute accuracy for validation set
    y_hat = model.predict(X_test)
    acc = sum(y_hat == y_test) / len(y_test)

    # Compute accuracy for training set
    y_hat = model.predict(X)
    acc_tr = sum(y_hat == y) / len(y)
    # pd.DataFrame(model.coef_).to_csv('data/cancer_coef_' + str(i) + '.csv')
    #np.savetxt("data/cancer_coeffs.csv", coeffs, delimiter=",")
    return acc, acc_tr
# Cluster the rows with k-means and attach the cluster id as a new feature.
kmean = KMeans(n_clusters=3,max_iter = 1000).fit(data)
cluster = kmean.predict(data)
data['Cluster'] = cluster

# using my algorithm (chi method) to find the best features
# NOTE(review): the original comment said "best 5 features" but k=3 below
# selects three -- confirm which was intended. chi2 also requires
# non-negative feature values; verify `data` satisfies that.
y = result['Life expectancy at birth (years)']
x = data
new = SelectKBest(chi2, k=3).fit_transform(x,y)
new = pd.DataFrame(new)
a = new
b = classlabel

# Train the knn model and produce the accuracy score
# NOTE(review): `new` is rebound from the selected-feature DataFrame to the
# KNN classifier two lines below; any later code must expect the classifier
# under that name -- consider distinct names.
X_train, X_test, y_train, y_test = train_test_split(new,classlabel, train_size=(2/3), test_size=(1/3), random_state=100)
new = neighbors.KNeighborsClassifier(n_neighbors=5)
new.fit(X_train, y_train)
new_pred=new.predict(X_test)
print("Accuracy of feature engineering: "+ str(round(accuracy_score(y_test, new_pred)*100,3))+"%")

################# PCA #############################
# separating the source and target into two tables
# (first key is taken as the target column, the rest as features)
# NOTE(review): the comprehension variable `result` shadows the outer
# `result` DataFrame used above -- harmless in Python 3 (comprehensions
# scope their variable) but confusing; confirm no Python 2 use.
pcakeys = [str(result) for result in pcadata.keys()]
pcafeacture = pcadata[pcakeys[1:]]
pcatarget = pcadata[pcakeys[0]]
# pca normalization, since pca can be inflated by scale
# (impute missing values first, preserving the original columns/index)
idf=pd.DataFrame(imp.fit_transform(pcafeacture))
idf.columns=pcafeacture.columns
idf.index=pcafeacture.index
pcafeacture = idf
scaler = preprocessing.StandardScaler().fit(pcafeacture)
def acc(classifier, fname, yfname=None, splits=10, fselect='min',
        root='./data/', nfeat=100, fmin=0, fmax=1000, a=.05, thresh=0):
    """Cross-validated accuracy of *classifier* on MATLAB-stored CV splits.

    Loads per-split feature matrices ('phi', 'testPhi') from
    ``root + fname`` and labels ('cvTrainASD', 'cvTestASD') from
    ``root + yfname`` (same file when *yfname* is None); each entry is
    indexed by ``(i, 0)`` per split (presumably MATLAB cell arrays --
    confirm against the .mat writer). For every split the features are
    optionally subset, standardized, and selected, the classifier is
    fitted, and train/test accuracies are recorded.

    Parameters
    ----------
    classifier : estimator with sklearn-style fit/predict.
    fname, yfname, root : .mat file locations.
    splits : number of CV splits to iterate.
    fselect : substring flags -- any of 'min', 'max', 'thresh', and one of
        'MI' / 'PCA' / 'reg' / 'kbest'.
    nfeat, fmin, fmax, a, thresh : feature-selection knobs.

    Returns
    -------
    (mean test acc, std test acc, test p-value,
     mean train acc, std train acc, train p-value)
    where p-values come from a one-sample t-test against 755/2126
    (presumably the dataset's base rate, 755 of 2126 -- confirm).
    """
    acc = []
    acc_tr = []

    # load data
    mdict = scipy.io.loadmat(root + fname)  # import dataset from matlab
    if yfname is None:
        ymdict = mdict
    else:
        ymdict = scipy.io.loadmat(root + yfname)  # import dataset from matlab
    phi = mdict.get('phi')
    testPhi = mdict.get('testPhi')
    asd = ymdict.get('cvTrainASD')
    testASD = ymdict.get('cvTestASD')

    # (removed the original's dead `i = 0` before the loop and the
    # redundant `i += 1` inside it; `i` is the for-loop variable)
    for i in range(0, splits):
        X = phi[(i, 0)]
        s = X.shape
        if len(s) == 3:
            X = np.reshape(X, [s[0], s[1] * s[2]])
        else:
            X = np.reshape(X, [s[0], s[1]])
        y = asd[(i, 0)]
        y = np.reshape(y, s[0])

        X_test = testPhi[(i, 0)]
        s = X_test.shape
        if len(s) == 3:
            X_test = np.reshape(X_test, [s[0], s[1] * s[2]])
        else:
            X_test = np.reshape(X_test, [s[0], s[1]])
        y_test = testASD[(i, 0)]
        y_test = np.reshape(y_test, s[0])

        # add zero columns if dims don't match (test matrix may have fewer
        # features than the training matrix for this split)
        dif = X.shape[1] - X_test.shape[1]
        if dif > 0:
            z = np.zeros((X_test.shape[0], dif))
            X_test = np.append(X_test, z, 1)

        # subset features (masks computed on train, applied to both)
        if 'min' in fselect:
            cols = X.astype(bool).sum(axis=0) > fmin
            X = X[:, cols]
            X_test = X_test[:, cols]
        if 'max' in fselect:
            cols = X.astype(bool).sum(axis=0) < fmax
            X = X[:, cols]
            X_test = X_test[:, cols]
        if 'thresh' in fselect:
            X[X < thresh] = 0
            X_test[X_test < thresh] = 0

        # rescale; with_mean=False keeps a sparse X sparse (centering
        # would densify it)
        if sparse.issparse(X):
            scaler = StandardScaler(with_mean=False)
        else:
            scaler = StandardScaler()
        scaler.fit(X)
        X = scaler.transform(X)
        X_test = scaler.transform(X_test)

        # feature selection (mutually exclusive branches)
        if 'MI' in fselect:
            model = SelectKBest(mutual_info_classif, k=nfeat).fit(X, y)
            X = model.transform(X)
            X_test = model.transform(X_test)
        elif 'PCA' in fselect:
            model = PCA(n_components=nfeat).fit(X)
            X = model.transform(X)
            X_test = model.transform(X_test)
        elif 'reg' in fselect:
            model = SelectFpr(f_classif, alpha=a).fit(X, y)
            X = model.transform(X)
            X_test = model.transform(X_test)
        elif 'kbest' in fselect:
            model = SelectKBest(f_classif, k=nfeat).fit(X, y)
            X = model.transform(X)
            X_test = model.transform(X_test)

        # fit model
        model = classifier.fit(X, y)

        # Compute accuracy for validation set
        y_hat = model.predict(X_test)
        acc.append(sum(y_hat == y_test) / len(y_test))

        # Compute accuracy for training set
        y_hat = model.predict(X)
        acc_tr.append(sum(y_hat == y) / len(y))

    results = stats.ttest_1samp(acc, popmean=755 / 2126)
    p_val = results[1]
    results = stats.ttest_1samp(acc_tr, popmean=755 / 2126)
    p_val_tr = results[1]
    return np.mean(acc), np.std(acc), p_val, np.mean(acc_tr), np.std(
        acc_tr), p_val_tr
#f regression from sklearn.linear_model import LinearRegression from sklearn.feature_selection import SelectKBest, f_regression x = df.drop(['Sales', 'Sales_Bin'], axis='columns') y = df.Sales model = SelectKBest(score_func=f_regression, k=5) results = model.fit(x, y) results.scores_ scores = pd.DataFrame(results.scores_, index=x.columns) results.pvalues_ scores.sort_values(by=0, ascending=True) ''' 최종적으로 r squared 가 0.915인 모델로 선정 ''' #residual plot y_pred = model.predict(x) residual = y - y_pred std_residual = residual / np.std(residual) plt.scatter(y_pred, std_residual) plt.grid() #remove index 9 record df = pd.read_excel('cravens.xlsx') df = df.drop(9) x = df[['Time', 'Poten', 'AdvExp', 'Share', 'Change']] y = df.Sales x = sm.add_constant(x) model = sm.OLS(y, x).fit() model.summary() '''