def _variance_threshold(self, variance):
    '''Remove columns that do not meet the variance threshold.'''
    logging.info('Removing data that has variance less than %f.' % variance)
    vt = VarianceThreshold(variance)
    vt.fit(self.X)  # XXX: Because idx should have high variance we pass all of X
    self.X = vt.transform(self.X)
    self.X_submit = vt.transform(self.X_submit)
    # Repeat this process for X_submit
    # XXX: This might not be kosher outside of competition
    vt.fit(self.X_submit)
    self.X = vt.transform(self.X)
    self.X_submit = vt.transform(self.X_submit)
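# A minimal sketch of the more conventional pattern the snippet above deviates
# from: fit the selector on the training matrix only, then apply the same
# fitted mask to both matrices so train and submission keep identical columns.
# Names (X, X_submit) follow the snippet; the threshold value is an assumption.
from sklearn.feature_selection import VarianceThreshold

def variance_threshold_train_only(X, X_submit, variance=0.01):
    vt = VarianceThreshold(variance)
    X = vt.fit_transform(X)            # learn the column mask from training data only
    X_submit = vt.transform(X_submit)  # reuse that mask unchanged on the submission set
    return X, X_submit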
def main():
    args = getOptions()
    print args
    print "train file read"
    train_x, train_y = readfile_noid(args.train, 'train')
    train_x_new, id = extractID(train_x)
    train_x_clean, contentdict = cityclean(train_x_new)
    del id
    print "test file read"
    test_x, test_y = readfile_noid(args.test, 'test')
    test_x_new, id = extractID(test_x)
    test_x_clean, contentdict = cityclean(test_x_new, contentdict)
    del contentdict
    # remove features with no distinction and low importance
    print "remove feature with no distinction and less important"
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_clean)
    test_x_uniq = sel.transform(test_x_clean)
    # normalization
    print "normalization"
    train_x_nor, mean, std = normalize(train_x_uniq)
    test_x_nor, mean, std = normalize(test_x_uniq, mean, std)
    # feature selection and modeling
    print "feature selection and modeling"
    exclusivefs(train_x_nor, train_y, test_x_nor, test_y)
def main():
    args = getOptions()
    print args
    print "train file read"
    train_x, train_y = readfile_noid(args.train, 'train')
    train_x_new, id = extractID(train_x)
    del id
    print "test file read"
    test_x, test_y = readfile_noid(args.test, 'test')
    test_x_new, id = extractID(test_x)
    # remove features with no distinction and low importance
    print "remove feature with no distinction and less important"
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_new)
    test_x_uniq = sel.transform(test_x_new)
    # normalization
    print "normalization"
    train_x_nor, mean, std = normalize(train_x_uniq)
    test_x_nor, mean, std = normalize(test_x_uniq, mean, std)
    # feature selection
    print "feature selection"
    # Create the RFE object and compute a cross-validated score.
    svc = SVC(kernel="linear")
    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(train_y, 10),
                  scoring='accuracy')
    rfecv.fit(train_x_nor, train_y)
    print("Optimal number of features : %d" % rfecv.n_features_)
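# Note: StratifiedKFold(train_y, 10) is the pre-0.18 scikit-learn signature.
# A minimal sketch of the same RFECV setup against the modern API, assuming
# the same svc/train_x_nor/train_y names as the snippet above:
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV

rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(n_splits=10),
              scoring='accuracy')
rfecv.fit(train_x_nor, train_y)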
def vectorize_EX(self, columns, variance_thresh=0, train_only=False):
    print('Start vectorizing')
    start_time = time.time()
    hasher = CountVectorizer(binary=True, tokenizer=LemmaTokenizer(),
                             stop_words='english')
    train_dtm = hasher.fit_transform(
        self.ga_bm_train[columns].apply(lambda x: ','.join(x), axis=1))
    print(hasher.get_feature_names())
    print('dtm train shape: ', train_dtm.shape)
    selector = VarianceThreshold(variance_thresh)
    train_dtm = selector.fit_transform(train_dtm)
    print('dtm train shape after variance thresh: ', train_dtm.shape)
    if not train_only:
        test_dtm = hasher.transform(
            self.ga_bm_test[columns].apply(lambda x: ','.join(x), axis=1))
        print('dtm test shape: ', test_dtm.shape)
        test_dtm = selector.transform(test_dtm)
        print('dtm test shape after variance thresh: ', test_dtm.shape)
    print("Time: ", round(((time.time() - start_time) / 60), 2))
    print('Complete vectorizing')
    if train_only:
        return train_dtm
    else:
        return (train_dtm, test_dtm)
def main():
    args = getOptions()
    print args
    fn = "destreeSub.csv"
    print fn
    print "train file read"
    train_x, train_y = readfile_noid(args.train, 'train')
    train_x_new, id = extractID(train_x)
    del id
    print "test file read"
    test_x, test_y = readfile_noid(args.test, 'test')
    test_x_new, id = extractID(test_x)
    # remove features with no distinction and low importance
    print "remove feature with no distinction and less important"
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_new)
    test_x_uniq = sel.transform(test_x_new)
    # indices = [i for i in range(len(train_x[0]))]
    # frqIndex = trimfrq(train_x)
    # for i in frqIndex:
    #     indices.remove(i)
    # train_x_uniq = indexTodata(train_x, indices)
    # test_x_uniq = indexTodata(test_x, indices)
    # normalization
    print "normalization"
    train_x_nor, mean, std = normalize(train_x_uniq)
    test_x_nor, mean, std = normalize(test_x_uniq, mean, std)
    # feature selection
    print "feature selection"
    if args.fts == 'cor':
        train_x_sel, test_x_sel = correlationSelect(train_x_nor, train_y, test_x_nor)
    elif args.fts == 'extraTrees':
        train_x_sel, test_x_sel = ExtraTreesSelect(train_x_nor, train_y, test_x_nor)
    else:
        train_x_sel = copy.deepcopy(train_x_nor)
        test_x_sel = copy.deepcopy(test_x_nor)
    del train_x_nor, test_x_nor, train_x_uniq, test_x_uniq
    print "modeling"
    clf = ExtraTreesClassifier(max_depth=3, n_estimators=10, random_state=0,
                               class_weight='auto')
    clf.fit(train_x_sel, train_y)
    train_pdt = clf.predict(train_x_sel)
    MCC, Acc_p, Acc_n, Acc_all = get_Accs(train_y, train_pdt)
    print "MCC, Acc_p , Acc_n, Acc_all(train): "
    print "%s,%s,%s,%s" % (str(MCC), str(Acc_p), str(Acc_n), str(Acc_all))
    test_pdt = clf.predict_proba(test_x_sel)
    # MCC, Acc_p , Acc_n, Acc_all = get_Accs(test_y, test_pdt)
    # print "MCC, Acc_p , Acc_n, Acc_all(test): "
    # print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all))
    fout = open(fn, 'w')
    fout.write("ID,target\n")
    for index, eachline in enumerate(test_pdt):
        fout.write("%s,%s\n" % (str(int(id[index])), str(test_pdt[index][1])))
    fout.close()
def main():
    args = getOptions()
    fn = ("submission_cor_%s_%s_%s.csv"
          % (str(args.lrate).replace('.', 'dian'), str(args.nest), str(args.maxdepth)))
    print fn
    print "train file read"
    train_x, train_y = readfile_noid(args.train, 'train')
    train_x_new, id = extractID(train_x)
    del id
    print "test file read"
    test_x, test_y = readfile_noid(args.test, 'test')
    test_x_new, id = extractID(test_x)
    # remove features with no distinction and low importance
    print "remove feature with no distinction and less important"
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_new)
    test_x_uniq = sel.transform(test_x_new)
    # indices = [i for i in range(len(train_x[0]))]
    # frqIndex = trimfrq(train_x)
    # for i in frqIndex:
    #     indices.remove(i)
    # train_x_uniq = indexTodata(train_x, indices)
    # test_x_uniq = indexTodata(test_x, indices)
    # normalization
    print "normalization"
    train_x_nor, mean, std = normalize(train_x_uniq)
    test_x_nor, mean, std = normalize(test_x_uniq, mean, std)
    # feature selection
    print "feature selection"
    train_x_sel, test_x_sel = correlationSelect(train_x_nor, train_y, test_x_nor)
    # ftsel = correlationSel()
    # ftsel.dosel(train_x_nor, train_y)
    # train_x_sel = ftsel.transform(train_x_nor)
    # test_x_sel = ftsel.transform(test_x_nor)
    print "modeling"
    clf = GradientBoostingClassifier(loss='deviance', learning_rate=args.lrate,
                                     n_estimators=args.nest,
                                     max_depth=args.maxdepth, verbose=1)
    clf.fit(train_x_sel, train_y)
    train_pdt = clf.predict(train_x_sel)
    MCC, Acc_p, Acc_n, Acc_all = get_Accs(train_y, train_pdt)
    print "MCC, Acc_p , Acc_n, Acc_all(train): "
    print "%s,%s,%s,%s" % (str(MCC), str(Acc_p), str(Acc_n), str(Acc_all))
    test_pdt = clf.predict_proba(test_x_sel)
    # MCC, Acc_p , Acc_n, Acc_all = get_Accs(test_y, test_pdt)
    # print "MCC, Acc_p , Acc_n, Acc_all(test): "
    # print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all))
    fout = open(fn, 'w')
    fout.write("ID,target\n")
    for index, eachline in enumerate(test_pdt):
        fout.write("%s,%s\n" % (str(int(id[index])), str(test_pdt[index][1])))
    fout.close()
def featureSelection(X_train, X_test, X_val, y_train, log, tech, C):
    if tech == 'VarTh':
        sel = VarianceThreshold(threshold=0.01)
        X_train_new = sel.fit_transform(X_train.todense())
        X_test_new = sel.transform(X_test.todense())
        X_val_new = sel.transform(X_val.todense())
        if log:
            X_train_new = np.log(X_train_new + 1)
            X_test_new = np.log(X_test_new + 1)
            X_val_new = np.log(X_val_new + 1)
    if tech == 'LinearSVC':
        mod = LinearSVC(C=C, penalty="l1", dual=False)
        X_train_new = mod.fit_transform(X_train.todense(), y_train)
        X_test_new = mod.transform(X_test.todense())
        X_val_new = mod.transform(X_val.todense())
        if log:
            X_train_new = np.log(X_train_new + 1)
            X_test_new = np.log(X_test_new + 1)
            X_val_new = np.log(X_val_new + 1)
    return X_train_new, X_test_new, X_val_new
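# LinearSVC.fit_transform / transform for feature selection was removed from
# scikit-learn (0.19+). A minimal sketch of the equivalent with SelectFromModel,
# assuming the same X_train/X_test/X_val/y_train/C names as the snippet above:
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

mod = LinearSVC(C=C, penalty="l1", dual=False).fit(X_train.todense(), y_train)
sfm = SelectFromModel(mod, prefit=True)  # select columns with non-zero l1 coefficients
X_train_new = sfm.transform(X_train.todense())
X_test_new = sfm.transform(X_test.todense())
X_val_new = sfm.transform(X_val.todense())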
class VarianceThresholdStep(SklearnStep):
    def __init__(self, threshold):
        super(VarianceThresholdStep, self).__init__()
        self._threshold = threshold

    def fit_transform(self):
        self._model = VarianceThreshold(threshold=self._threshold)
        x, y = load_svmlight(self.input_path)
        x = self._model.fit_transform(x, y)
        save_svmlight(x, y, self._output_path)

    def transform(self, x=None):
        if x is None:
            x, y = load_svmlight(self._test_input_path)
            x = self._model.transform(x)
            save_svmlight(x, y, self._test_output_path)
        else:
            transformed_x = self._model.transform(x)
            return transformed_x

    def get_param(self):
        return {'threshold': self._threshold}
def main():
    args = getOptions()
    print "train file read"
    train_x, train_y = readfile_noid(args.train, 'train')
    train_x_new, id = extractID(train_x)
    del id
    print "test file read"
    test_x, test_y = readfile_noid(args.test, 'test')
    test_x_new, id = extractID(test_x)
    # remove features with no distinction and low importance
    print "remove feature with no distinction and less important"
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_new)
    test_x_uniq = sel.transform(test_x_new)
    # normalization
    print "normalization"
    train_x_nor, mean, std = normalize(train_x_uniq)
    test_x_nor, mean, std = normalize(test_x_uniq, mean, std)
    # feature selection
    print "feature selection"
    ftsel = ExtraTreesClassifier()
    ftsel.fit(train_x_nor, train_y)
    # importances = ftsel.feature_importances_
    # indices_test = np.argsort(importances)[::-1]
    # indices_test = indices_test.tolist()
    train_x_trans = ftsel.transform(train_x_nor)
    test_x_trans = ftsel.transform(test_x_nor)
    # modeling
    print "modeling"
    train = xgb.DMatrix(train_x_trans, label=train_y)
    test = xgb.DMatrix(test_x_trans, label=test_y)
    gbm = xgb.train({'max_depth': 3, 'n_estimators': 1500, 'learning_rate': 0.1,
                     'objective': 'binary:logistic', 'eval_metric': 'auc'}, train)
    train_pdt = gbm.predict(train)
    MCC, Acc_p, Acc_n, Acc_all = get_Accs(train_y, train_pdt)
    print "MCC, Acc_p , Acc_n, Acc_all(train): "
    print "%s,%s,%s,%s" % (str(MCC), str(Acc_p), str(Acc_n), str(Acc_all))
    test_pdt = gbm.predict(test)
    MCC, Acc_p, Acc_n, Acc_all = get_Accs(test_y, test_pdt)
    print "MCC, Acc_p , Acc_n, Acc_all(test): "
    print "%s,%s,%s,%s" % (str(MCC), str(Acc_p), str(Acc_n), str(Acc_all))
    fout = open("submission_xgbtrain.csv", 'w')
    fout.write("ID,target\n")
    for index, eachline in enumerate(test_pdt):
        fout.write("%s,%s\n" % (str(int(id[index])), str(test_pdt[index])))
    fout.close()
def chooseFeatures(train_x, train_y, test_x, kB):
    sel = VarianceThreshold()
    trainingX = sel.fit_transform(train_x)
    testingExamples = sel.transform(test_x)
    if kB > trainingX.shape[1]:
        kB = trainingX.shape[1]
    kBest = SelectKBest(chi2, k=kB)
    # Select from the variance-filtered matrices so the threshold step
    # is not silently discarded (the original selected from train_x/test_x).
    train_x = kBest.fit_transform(trainingX, train_y)
    test_x = kBest.transform(testingExamples)
    return train_x, test_x
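# A minimal sketch of the same two-stage selection as a Pipeline, which keeps
# the fit/transform bookkeeping in one object (train_x/train_y/test_x/kB
# assumed as in the snippet above; kB must not exceed the surviving columns):
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2

selector = Pipeline([
    ('var', VarianceThreshold()),        # drop constant columns first
    ('kbest', SelectKBest(chi2, k=kB)),  # then keep the kB best by chi2
])
train_x_sel = selector.fit_transform(train_x, train_y)
test_x_sel = selector.transform(test_x)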
def runRF(directory, name):
    print(time.time(), time.clock())
    test = directory + "transformed.test.SquibDWTFFT.npy"
    train = directory + "transformed.train.SquibDWTFFT.npy"
    data = loadTransformed(train)
    X, Y = splitData(data)
    y = Y.flatten()
    ocp = np.sum(y == 1)
    oci = np.sum(y == 0)
    ratio = oci.astype(float) / ocp.astype(float)
    nest = int(300 * ratio.round())
    print ratio.round()
    threshold = 0.8
    sel = VarianceThreshold(threshold=(threshold * (1 - threshold)))
    Xr = sel.fit_transform(X)
    # clf = RandomForestClassifier(n_estimators=10, criterion='entropy',
    #                              max_depth=None, min_samples_split=1, random_state=0)
    print "Fit Random forest on " + name
    clf = RandomForestClassifier(n_estimators=nest, criterion="entropy",
                                 max_features="auto", min_samples_split=1,
                                 bootstrap=False, n_jobs=8, random_state=0)
    clf.fit(Xr, y, sample_weight=np.array([ratio.round() if i == 0 else 1 for i in y]))
    print(time.time(), time.clock())
    print "Predict " + name
    dtest = loadTransformed(test)
    XT, YT = splitData(dtest)
    yt = YT.flatten()
    XTr = sel.transform(XT)
    predictions = clf.predict_proba(XTr)
    filename = name + "_predictions.csv"
    print(time.time(), time.clock())
    print "Writing out results ... "
    finalFile = open(filename, "w")
    count = 1
    for pre in predictions:
        pr = '%.6f' % pre[1]
        count4 = "%04d" % (count,)
        print name + " " + count4 + " " + pr
        finalFile.write(name + "_test_segment_" + count4 + ".mat," + str(pr) + "\n")
        count += 1
    finalFile.close()
    print(time.time(), time.clock())
def get_rmvar(train_x, test_x, threshold=20):
    # Use the threshold argument instead of the hard-coded value the
    # original passed (which silently ignored the parameter).
    selector = VarianceThreshold(threshold=threshold)
    selector.fit(train_x)
    train_var = selector.transform(train_x)
    test_var = selector.transform(test_x)
    return train_var, test_var
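# A variance cutoff of 20 only makes sense for unscaled features. A quick
# sanity check before choosing a threshold is to inspect the distribution of
# per-column variances; a minimal sketch (train_x assumed as above):
import numpy as np

col_var = np.var(train_x, axis=0)
print(np.percentile(col_var, [0, 25, 50, 75, 100]))  # pick the cutoff from this spread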
def main():
    args = getOptions()
    print args
    if args.model == 'gBoosting':
        fn = ("submissionv4_%s_gBoosting_%s_%s_%s_%s_%s.csv"
              % (args.fts, args.loss, str(args.minsamplessplit),
                 str(args.lrate).replace('.', 'dian'), str(args.nest),
                 str(args.maxdepth)))
    elif args.model == 'randomForest':
        fn = ("submissionv4_%s_randomForest_%s.csv" % (args.fts, args.nest))
    print fn
    print "train file read"
    train_x, train_y = readfile_noid(args.train, 'train')
    train_x_new, id = extractID(train_x)
    train_x_clean, contentdict = cityclean(train_x_new)
    del id
    print "test file read"
    test_x, test_y = readfile_noid(args.test, 'test')
    test_x_new, id = extractID(test_x)
    test_x_clean, contentdict = cityclean(test_x_new, contentdict)
    del contentdict
    # remove features with no distinction and low importance
    print "remove feature with no distinction and less important"
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_clean)
    test_x_uniq = sel.transform(test_x_clean)
    # indices = [i for i in range(len(train_x[0]))]
    # frqIndex = trimfrq(train_x)
    # for i in frqIndex:
    #     indices.remove(i)
    # train_x_uniq = indexTodata(train_x, indices)
    # test_x_uniq = indexTodata(test_x, indices)
    # normalization
    print "normalization"
    train_x_nor, mean, std = normalize(train_x_uniq)
    test_x_nor, mean, std = normalize(test_x_uniq, mean, std)
    # feature selection
    print "feature selection"
    if args.fts == 'cor':
        train_x_sel, test_x_sel = correlationSelect(train_x_nor, train_y, test_x_nor)
    elif args.fts == 'extraTrees':
        train_x_sel, test_x_sel = ExtraTreesSelect(train_x_nor, train_y, test_x_nor)
    elif args.fts == 'randomTree':
        train_x_sel, test_x_sel = randomTreesSelect(train_x_nor, train_y, test_x_nor)
    else:
        train_x_sel = copy.deepcopy(train_x_nor)
        test_x_sel = copy.deepcopy(test_x_nor)
    print len(train_x_nor[0])
    print len(train_x_sel[0])
    del train_x_nor, test_x_nor, train_x_uniq, test_x_uniq
    print "modeling"
    if args.model == 'gBoosting':
        clf = GradientBoostingClassifier(loss=args.loss,
                                         learning_rate=args.lrate,
                                         n_estimators=args.nest,
                                         max_depth=args.maxdepth,
                                         min_samples_split=args.minsamplessplit,
                                         verbose=1)
    elif args.model == 'randomForest':
        clf = RandomForestClassifier(n_estimators=args.nest, class_weight='auto')
    clf.fit(train_x_sel, train_y)
    train_pdt = clf.predict(train_x_sel)
    MCC, Acc_p, Acc_n, Acc_all = get_Accs(train_y, train_pdt)
    print "MCC, Acc_p , Acc_n, Acc_all(train): "
    print "%s,%s,%s,%s" % (str(MCC), str(Acc_p), str(Acc_n), str(Acc_all))
    test_pdt = clf.predict_proba(test_x_sel)
    # MCC, Acc_p , Acc_n, Acc_all = get_Accs(test_y, test_pdt)
    # print "MCC, Acc_p , Acc_n, Acc_all(test): "
    # print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all))
    fout = open(fn, 'w')
    fout.write("ID,target\n")
    for index, eachline in enumerate(test_pdt):
        fout.write("%s,%s\n" % (str(int(id[index])), str(test_pdt[index][1])))
    fout.close()
# -*- coding: utf-8 -*-
"""
Created on Tue May 29 18:16:50 2018

@author: juanferna.perez
"""
from pandas import DataFrame
from sklearn.feature_selection import VarianceThreshold
from sklearn import datasets

iris = datasets.load_iris()
X = iris.data
Xdf = DataFrame(X)
print(Xdf.describe())
print(Xdf.var(ddof=0))

selector = VarianceThreshold(threshold=(0.8 * 0.8))
selector.fit(X)
print(selector.get_support())
Xbar = selector.transform(X)
print(Xbar)
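# The threshold above (0.8 * 0.8 = 0.64) differs from the usual rule of thumb
# for Boolean features, where a Bernoulli(p) column has variance p * (1 - p).
# A minimal sketch of that rule with p = 0.8 as an assumed cutoff: it drops
# 0/1 columns that take the same value in more than 80% of samples.
p = 0.8
bool_selector = VarianceThreshold(threshold=p * (1 - p))  # 0.16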
def remove_low_var_features(xtrain, xtest):
    selector = VarianceThreshold()
    xtrain = selector.fit_transform(xtrain)
    xtest = selector.transform(xtest)
    return xtrain, xtest, selector.get_support(indices=True)
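# A hypothetical usage example, assuming pandas dataframes: the returned
# support indices map the kept columns back to their original names.
xtrain_new, xtest_new, kept_idx = remove_low_var_features(xtrain.values, xtest.values)
kept_names = xtrain.columns[kept_idx]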
    XX = []
    for i in xrange(len(Y)):
        if Y[i] == value:
            XX.append(X[i])
    return XX

out = open(sys.argv[1], "r")
model = svm.OneClassSVM(kernel='rbf')
X, Y = read_fea(sys.argv[1])
sel = VarianceThreshold(threshold=0)
model.fit(sample_data(sel.fit_transform(X), Y, 1))
warning("useful features dim: " + str(len(sel.get_support(True))))
if hasattr(model, 'score'):
    warning("accuracy on training set: " + str(model.score(sel.transform(X), Y)))
if len(sys.argv) > 2:
    X, Y = read_fea(sys.argv[2])
    warning("accuracy on cv set: " + str(model.score(sel.transform(X), Y)))
if len(sys.argv) > 3:
    X, Y = read_fea(sys.argv[3])
    warning("accuracy on dev set: " + str(model.score(sel.transform(X), Y)))
if len(sys.argv) > 4:
    ref = model.decision_function(sel.transform(X))
    X, Y = read_fea(sys.argv[4], True)
    Z = model.decision_function(sel.transform(X)).tolist()
    Z = (Z - ref.mean()) / ref.std()
    for i in xrange(len(Y)):
def compute(train, test):
    # Train data
    train_X = []
    train_restaurant_ids = []
    test_X = []
    test_restaurant_ids = []
    train_Y = []
    # Common feature values in train/test
    train_feature_val = {}
    test_feature_val = {}
    build_FeatureVal(train, train_feature_val)
    build_FeatureVal(test, test_feature_val)
    buildFeatures(train, train_feature_val, test_feature_val, train_X, train_Y,
                  train_restaurant_ids, "train")
    buildFeatures(test, train_feature_val, test_feature_val, test_X, None,
                  test_restaurant_ids, "test")
    train_Y = np.array(train_Y)
    enc = OneHotEncoder(categorical_features=np.array(
        [3, 4, 5, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42]),
        sparse=False, n_values=100)
    enc.fit(test_X)
    train_X = enc.transform(train_X)
    test_X = enc.transform(test_X)
    print("No of train features " + str(len(train_X[0])))
    print("No of test features " + str(len(test_X[0])))
    # Remove features with similar values
    selector = VarianceThreshold()
    selector.fit(train_X)
    train_X = selector.transform(train_X)
    test_X = selector.transform(test_X)
    print("No of train features " + str(len(train_X[0])))
    print("No of test features " + str(len(test_X[0])))
    parameters_to_try = generateParams()
    print("No of parameters to test " + str(len(parameters_to_try)))
    # Construct parameters as a list
    models_to_try = [(copy.copy(train_X), copy.copy(train_Y), parameters_to_try[i])
                     for i in range(0, len(parameters_to_try))]
    # Create a thread pool.
    pool = Pool(8)
    results = pool.map(train_model_wrapper, models_to_try)
    pool.close()
    pool.join()
    best_params = None
    best_rmse = sys.float_info.max
    for i in range(0, len(results)):
        if results[i][1] < best_rmse:
            best_rmse = results[i][1]
            best_params = results[i][0]
    print("Best Params : " + str(best_params))
    print("Best RMSE : " + str(best_rmse))
    # estimator = SVR(**params)
    # estimator = RandomForestRegressor(**best_params)
    estimator = GradientBoostingRegressor(**best_params)
    estimator.fit(train_X, train_Y)
    print("Writing Output")
    predict_and_save(estimator, test_X, test_restaurant_ids)
def preprocess(self):
    print 'Preprocess...'
    print 'Start: ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    data = self.data.copy()
    label = self.label.copy()
    m = data.shape[0]
    print data['MarriageStatus'].dtype
    # fillna
    for i in data.columns:
        if i != 'AppId' and i != 'InstallmentStartedOn':
            if data[i].hasnans:
                t0 = pd.DataFrame(np.ones((data.shape[0], 1), dtype=np.int),
                                  columns=[i + '_Ex'], index=data.index)
                ind0 = data[data[i].isnull()].index
                t0.ix[ind0] = 0
                data[i + '_Ex'] = t0
                if data[i].dtype == np.object:
                    if data[i].value_counts().sort_values().shape[0] > 0:
                        data[i].fillna(data[i].value_counts().sort_values().index[-1],
                                       inplace=True, downcast='infer')
                    else:
                        data[i].fillna('0', inplace=True, downcast='infer')
                else:
                    if np.isnan(data[i].mean()) == False:
                        data[i].fillna(data[i].mean(), inplace=True, downcast='infer')
                    else:
                        data[i].fillna(0, inplace=True, downcast='infer')
    train, train_label, test, test_label = self.split(data, label)
    self.raw_train = train.copy()
    self.raw_train_label = train_label.copy()
    self.raw_test = test.copy()
    self.raw_test_label = test_label.copy()
    # delete AppId and InstallmentStartedOn
    data.drop(['AppId', 'InstallmentStartedOn'], axis=1, inplace=True)
    train.drop(['AppId', 'InstallmentStartedOn'], axis=1, inplace=True)
    test.drop(['AppId', 'InstallmentStartedOn'], axis=1, inplace=True)
    data.reset_index(inplace=True, drop=True)
    train.reset_index(inplace=True, drop=True)
    test.reset_index(inplace=True, drop=True)
    # preprocess
    enc0 = LabelEncoder()
    enc1 = OneHotEncoder()
    scaler = MinMaxScaler()
    for i in train.columns:
        if train[i].dtype == np.object:
            t0 = enc0.fit_transform(train[i].values.reshape(-1, 1))
            t1 = enc1.fit_transform(t0.reshape(-1, 1)).toarray()
            tf = pd.DataFrame(t1, index=train.index)
            tf.rename(columns=lambda x: i + '_' + str(x) + '_E', inplace=True)
            train.drop(i, inplace=True, axis=1)
            train = train.join(tf, how='inner')
            clas = enc0.classes_
            if test[i][~test[i].isin(clas)].size != 0:
                ind = test[i][~test[i].isin(clas)].index
                test[i].iloc[ind] = clas[0]
            t0 = enc0.transform(test[i].values.reshape(-1, 1))
            t1 = enc1.transform(t0.reshape(-1, 1)).toarray()
            tf = pd.DataFrame(t1, index=test.index)
            tf.rename(columns=lambda x: i + '_' + str(x) + '_E', inplace=True)
            test.drop(i, inplace=True, axis=1)
            test = test.join(tf, how='inner')
        else:
            tt0 = train[i].values.reshape(-1, 1)
            tt0_s = scaler.fit_transform(tt0)
            train[i + '_S'] = tt0_s
            train.drop(i, inplace=True, axis=1)
            tt2 = test[i].values.reshape(-1, 1)
            tt2_s = scaler.transform(tt2)
            test[i + '_S'] = tt2_s
            test.drop(i, inplace=True, axis=1)
    # feature selection
    sel = VarianceThreshold(threshold=0.0002)
    train_new = sel.fit_transform(train)
    sup = sel.get_support()
    features = train.columns.tolist()
    for i in xrange(train.shape[1]):
        if sup[i] == False:
            features.remove(train.columns[i])
    train = pd.DataFrame(train_new, columns=features)
    test_new = sel.transform(test)
    test = pd.DataFrame(test_new, columns=features)
    self.train = train.copy()
    self.train_label = train_label.copy()
    self.test = test.copy()
    self.test_label = test_label.copy()
    print 'End: ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    return train, train_label, test, test_label
selector = VarianceThreshold()
feature_train = selector.fit_transform(feature_train)
scaler = preprocessing.StandardScaler().fit(feature_train)
feature_train = scaler.transform(feature_train)

feature_ER = np.zeros((length_ER, num_feature))
textfile = open(data_path + "ER_" + feature_type)
count = 0
while length_ER > count:
    x = textfile.readline()
    x = x.strip()
    result = np.array([list(map(float, x.split()))])
    feature_ER[count, ] = result
    count = count + 1
feature_ER = selector.transform(feature_ER)
feature_ER = scaler.transform(feature_ER)

feature_GPCR = np.zeros((length_GPCR, num_feature))
textfile = open(data_path + "GPCR_" + feature_type)
count = 0
while length_GPCR > count:
    x = textfile.readline()
    x = x.strip()
    result = np.array([list(map(float, x.split()))])
    feature_GPCR[count, ] = result
    count = count + 1
feature_GPCR = selector.transform(feature_GPCR)
feature_GPCR = scaler.transform(feature_GPCR)
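# The manual read loops above re-implement whitespace-separated float parsing;
# a minimal equivalent sketch with numpy's own loader (same file naming and
# fitted selector/scaler assumed as above):
feature_ER = np.loadtxt(data_path + "ER_" + feature_type)
feature_ER = scaler.transform(selector.transform(feature_ER))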
# Import data-sets
train_exp, train_cnv, train_ess, leader_exp, leader_cnv, prioritized_genes = read_data_sets()

# Setup
genes = train_ess.axes[1]
samples = leader_exp.axes[0]
predictions = DataFrame(None, index=genes, columns=samples)
spearman = make_scorer(spearm_cor_func, greater_is_better=True)
X_train_pre = train_exp
X_test_pre = leader_exp

# Filter by coefficient of variation
var_thres = VarianceThreshold(best_var).fit(X_train_pre)
X_train_pre = var_thres.transform(X_train_pre)
X_test_pre = var_thres.transform(X_test_pre)

for gene in genes:
    # Assemble prediction variables
    X_train = X_train_pre
    y_train = train_ess.ix[:, gene]
    X_test = X_test_pre

    # Feature selection
    fs = SelectKBest(f_regression, k=best_k).fit(X_train, y_train)
    X_train = fs.transform(X_train)
    X_test = fs.transform(X_test)

    # Estimation
    clf = PassiveAggressiveRegressor(epsilon=best_epsilon, n_iter=best_n_iter).fit(X_train, y_train)
def get_low_variance_columns(dframe=None, columns=None, skip_columns=None,
                             thresh=0.0, autoremove=False):
    """
    Wrapper for sklearn VarianceThreshold for use on pandas dataframes.
    """
    print("Finding low-variance features.")
    try:
        # get list of all the original df columns
        all_columns = dframe.columns

        # remove `skip_columns`
        remaining_columns = all_columns.drop(skip_columns)

        # get length of new index
        max_index = len(remaining_columns) - 1

        # get indices for `skip_columns`
        skipped_idx = [all_columns.get_loc(column) for column in skip_columns]

        # adjust insert location by the number of columns removed
        # (for non-zero insertion locations) to keep relative
        # locations intact
        for idx, item in enumerate(skipped_idx):
            if item > max_index:
                diff = item - max_index
                skipped_idx[idx] -= diff
            if item == max_index:
                diff = item - len(skip_columns)
                skipped_idx[idx] -= diff
            if idx == 0:
                skipped_idx[idx] = item

        # get values of `skip_columns`
        skipped_values = dframe.iloc[:, skipped_idx].values

        # get dataframe values
        X = dframe.loc[:, remaining_columns].values

        # instantiate VarianceThreshold object
        vt = VarianceThreshold(threshold=thresh)

        # fit vt to data
        vt.fit(X)

        # get the indices of the features that are being kept
        feature_indices = vt.get_support(indices=True)

        # remove low-variance columns from index
        feature_names = [remaining_columns[idx]
                         for idx, _ in enumerate(remaining_columns)
                         if idx in feature_indices]

        # get the columns to be removed
        removed_features = list(np.setdiff1d(remaining_columns, feature_names))
        print("Found {0} low-variance columns.".format(len(removed_features)))

        # remove the columns
        if autoremove:
            print("Removing low-variance features.")
            # remove the low-variance columns
            X_removed = vt.transform(X)
            print("Reassembling the dataframe (with low-variance "
                  "features removed).")
            # re-assemble the dataframe
            dframe = pd.DataFrame(data=X_removed, columns=feature_names)

            # add back the `skip_columns`
            for idx, index in enumerate(skipped_idx):
                dframe.insert(loc=index, column=skip_columns[idx],
                              value=skipped_values[:, idx])
            print("Successfully removed low-variance columns.")

        # do not remove columns
        else:
            print("No changes have been made to the dataframe.")

    except Exception as e:
        print(e)
        print("Could not remove low-variance features. Something "
              "went wrong.")
        pass

    return dframe
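# A hypothetical usage example for the wrapper above, assuming a dataframe
# df with an 'id' column to preserve and a variance cutoff of 0.1:
df_reduced = get_low_variance_columns(dframe=df, skip_columns=['id'],
                                      thresh=0.1, autoremove=True)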
labels_cols = train.columns[0]
features_train = train[features_cols]
labels_train = train[labels_cols]
features_test = test

# Create cross-validation set
train_X, test_X, train_y, test_y = cross_validation.train_test_split(
    features_train, labels_train, test_size=0.2, random_state=0)

# Feature selection: fit on a 5000-row subsample for speed, and assign the
# results (the original discarded the transformed matrices).
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
sel.fit(train_X[:5000])
train_X = sel.transform(train_X)
test_X = sel.transform(test_X)

# Create and train classifier
clf = GaussianNB()
clf.fit(train_X, train_y)

# Get accuracy score
pred_train = clf.predict(train_X[:5000])
pred_test = clf.predict(test_X[:5000])
accuracy_train = accuracy_score(train_y[:5000], pred_train)
accuracy_test = accuracy_score(test_y[:5000], pred_test)
print('Accuracy score on training data is: ' + str(accuracy_train))
print('Accuracy score on testing data is: ' + str(accuracy_test))
        tmpList, par = mult_clean_list(el[i], t)
        cln.extend(tmpList)
        t += 1
print "HERE"
X = []
print sz
for j in range(0, sz):
    tmp = []
    for i in range(0, len(cln)):
        if parList[i] not in blockList:
            tmp.append(cln[i][j])
    X.append(tmp)
X = sel.transform(X)
for j in np.array(X[:3]):
    print j
py = clf.predict(X)
ans = []
for i in range(0, len(ids)):
    ans.append([ids[i], py[i]])
# sorted() returns a new list; assign it (the original dropped the result)
ans = sorted(ans, key=lambda x: x[0])
with open('results.csv', 'wb') as testfile:
    csv_writer = csv.writer(testfile)
def remove_features_with_low_variance(x_data):
    variance = VarianceThreshold(threshold=1.4)
    print('before transform', len(x_data[4]), x_data[4])
    variance.fit(x_data)
    transformed_x = variance.transform(x_data)
    print('after transform', len(transformed_x[4]), transformed_x[4])
    # Return the reduced matrix (the original computed it but never returned it)
    return transformed_x
X_train, y_train = rus.fit_sample(X_train, y_train)
radioFeat_train = copy.deepcopy(X_train[:, :1692])
clinical_semanticFeat_train = copy.deepcopy(X_train[:, 1692:])
radioFeat_test = copy.deepcopy(X_test.iloc[:, :1692])
clinical_semanticFeat_test = copy.deepcopy(X_test.iloc[:, 1692:])
print('------------------ Starting feature selection ---------------------')
print('Number of original radiomics features: {}'.format(radioFeat_train.shape[1]))
print('Number of original clinical_semantic features: {}'.format(
    clinical_semanticFeat_train.shape[1]))

################## Variance-based feature selection ##################
from sklearn.feature_selection import VarianceThreshold
# Remove features whose variance falls below the threshold
# (removing features with low variance).
vad = VarianceThreshold(threshold=0.01)
radioFeat_train = vad.fit_transform(radioFeat_train)  # returns the selected feature matrix
radioFeat_test = vad.transform(radioFeat_test)
print('train_test_split_seed={} number of radiomics features after variance selection: {}'.format(
    seeds, radioFeat_train.shape[1]))

###################### Normalize features to [-1, 1] ######################
# max_abs_scaler = preprocessing.MaxAbsScaler()
# max_abs_scaler.fit(xmantrain)
# xabstrain = max_abs_scaler.transform(xmantrain)
# xabstest = max_abs_scaler.transform(xmantest)

################## Variance-based feature selection ##################
# from sklearn.feature_selection import VarianceThreshold
# sel = VarianceThreshold(threshold=0.01)  # remove features with variance below the threshold
# ss = sel.fit(xmantrain)
# xvartrain = sel.transform(xmantrain)
# xvartest = sel.transform(xmantest)
# print("Number of features after variance selection:", xvartest.shape[1])
#############################################################################
#
# Feature Selection
#
##########################################

# Low Variance Filter
if lv_filter == 1:
    print('--LOW VARIANCE FILTER ON--', '\n')

    # LV Threshold
    # Removes any feature with variance below 0.5 (the original comment
    # claimed a 20% cutoff, which did not match the code)
    sel = VarianceThreshold(threshold=0.5)
    fit_mod = sel.fit(data_np)
    fitted = sel.transform(data_np)
    sel_idx = fit_mod.get_support()

    # Get lists of selected and non-selected features (names and indexes)
    temp = []
    temp_idx = []
    temp_del = []
    for i in range(len(data_np[0])):
        if sel_idx[i] == 1:
            # Selected features get added to temp header
            temp.append(header[i + feat_start])
            temp_idx.append(i)
        else:
            # Indexes of non-selected features get added to delete array
            temp_del.append(i)
    print('Selected', temp)
    print('Features (total, selected):', len(data_np[0]), len(temp))
def fit(self, x_or, y, w=None):
    """
    Fits upper and lower bounds on p(y|x)
    """
    if self.standardize:
        xselector = VarianceThreshold(threshold=.1).fit(x_or)
        temp_x = xselector.transform(x_or)
        xscaler = StandardScaler().fit(temp_x)
        self.xscaler = lambda x: xscaler.transform(xselector.transform(x))
        x = self.xscaler(x_or)
    else:
        x = x_or.copy()

    if self.kernel == 'linear':
        self.kernel_fit = lambda x: x
        x = self.kernel_fit(x)
    elif self.kernel == 'poly':
        if self.p is None:
            raise ValueError('Need polynomial value')
        self.kernel_fit = lambda x: np.hstack([x**i for i in range(1, self.p + 1)])
        x = self.kernel_fit(x)
    elif self.kernel == 'rbf':
        if self.sig is None:
            raise ValueError('Need Length scale value')
        self.x_tr = x.copy()
        self.kernel_fit = lambda x_ts: RBF(length_scale=self.sig).__call__(x_ts, self.x_tr)
        x = self.kernel_fit(x)
    elif self.kernel == 'rbf_approx':
        if self.sig is None:
            raise ValueError('Need Length scale value')
        rbf_fit = RBFSampler(gamma=1 / self.sig, n_components=50).fit(x.copy())
        self.kernel_fit = lambda x_ts: rbf_fit.transform(x_ts)
        x = self.kernel_fit(x)

    n, d = x.shape[0], x.shape[1]
    mdl = grb.Model("qp")
    mdl.ModelSense = 1
    mdl.setParam('OutputFlag', False)
    mdl.reset()
    L = 1e5
    us = [mdl.addVar(name="u%d" % i, lb=-L, ub=L) for i in range(n)]
    ls = [mdl.addVar(name="l%d" % i, lb=-L, ub=L) for i in range(n)]
    bsU = [mdl.addVar(name="bu%d" % i, lb=-L, ub=L) for i in range(d + 1)]
    bsL = [mdl.addVar(name="bl%d" % i, lb=-L, ub=L) for i in range(d + 1)]
    rUs = [mdl.addVar(name="ru%d" % i, lb=0, ub=L) for i in range(n)]
    rLs = [mdl.addVar(name="rl%d" % i, lb=0, ub=L) for i in range(n)]
    slackU = 0
    slackL = 0
    if w is None:
        w = np.ones(n) / n
    obj_terms = []
    for i in range(n):
        mdl.addConstr(us[i] >= ls[i])
        mdl.addConstr(us[i] == np.dot(x[i, ], bsU[:d]) + bsU[-1])
        mdl.addConstr(ls[i] == np.dot(x[i, ], bsL[:d]) + bsL[-1])
        mdl.addConstr(rUs[i] >= y[i] - us[i])
        mdl.addConstr(rLs[i] >= ls[i] - y[i])
        slackU += w[i] * rUs[i]
        slackL += w[i] * rLs[i]
        if self.loss == 'square':
            obj_terms.append(w[i] * (us[i] - ls[i]) * (us[i] - ls[i]))
        elif self.loss == 'linear':
            if self.agg == 'max':
                obj_terms.append((us[i] - ls[i]))
            else:
                obj_terms.append(w[i] * (us[i] - ls[i]))
        else:
            raise Exception('Unrecognized loss: %s' % self.loss)
    if self.agg == 'max':
        o = mdl.addVar(name="o", lb=-L, ub=L)
        os = []
        for i in range(n):
            oi = mdl.addVar(name="o%d" % i, lb=-L, ub=L)
            mdl.addConstr(oi == obj_terms[i])
            os += [oi]
        mdl.addConstr(o == grb.max_(os))
        obj = o
    else:
        obj = grb.quicksum(obj_terms)
    # ----add the values of the objectives
    obj_reg_u, obj_reg_l = 0, 0
    for k in range(d):
        obj_reg_u += bsU[k] * bsU[k]
        obj_reg_l += bsL[k] * bsL[k]
    obj_reg = self.alphau * obj_reg_u + self.alphal * obj_reg_l
    mdl.addConstr(slackU <= self.lamdau)
    mdl.addConstr(slackL <= self.lamdal)
    obj_f = obj + obj_reg
    mdl.setObjective(obj_f)
    mdl.optimize()
    self.bu = np.array([bsU[j].x for j in range(d + 1)])
    self.bl = np.array([bsL[j].x for j in range(d + 1)])
    # print(obj.getValue(), obj_slack.getValue())
    return self
__author__ = 'pierregagliardi'

import numpy as np
import pickle

from sklearn.feature_selection import VarianceThreshold

from projet_sentiment_analysis.code.utilities import extract_data

if __name__ == "__main__":
    general_path = '/Users/pierregagliardi/DossierTravail/Programmation/PythonPath/projet_sentiment_analysis/'
    path_to_training_set = general_path + 'training_set_60000/training_set_unigram_all_features/'
    path_to_pickle = general_path + 'pickle_hyper_parameters/'
    (X_train, y_train, X_test, y_test,
     number_training, number_testing) = extract_data.extract_training_and_testing_set(
        path_to_training_set + 'metrics_training_set_7000.data',
        path_to_training_set + 'metrics_testing_set_7000.data')
    sel = VarianceThreshold(threshold=(.999 * (1 - .999)))
    X_train = sel.fit_transform(X_train)
    X_test = sel.transform(X_test)
    with open(path_to_pickle + 'metrics_60000_all_features_7000.pkl', 'wb') as fid:
        pickle.dump((X_train, y_train, X_test, y_test), fid)
def main():
    train_data = pd.read_csv(train_path, index_col='Id')
    kaggl_data = pd.read_csv(kaggl_path, index_col='Id')

    # Train/Test Split
    X = train_data.drop('SalePrice', axis=1)
    y = train_data['SalePrice']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
    print('Training data has {} rows.'.format(X_train.shape[0]))
    print('Testing data has {} rows.'.format(X_test.shape[0]))
    print('Kaggle data has {} rows.'.format(kaggl_data.shape[0]))

    # Manual Feature Engineering
    print('Manual Feature Engineering...')
    # Create an 'EDA' dataframe we'll use to do some exploring
    EDA = X_train.copy()
    EDA['SalePrice'] = y_train
    # There are 27 neighborhoods. Let's put them into groups of 9:
    neighborhood_ranks = EDA.groupby('Neighborhood')['SalePrice'].mean().sort_values().index
    low_neigh = neighborhood_ranks[:9]
    mid_neigh = neighborhood_ranks[9:18]
    high_neigh = neighborhood_ranks[18:]

    def manual_feature_eng(data):
        '''Some basic manual feature engineering based on EDA of X_train'''
        eng_data = data.copy()
        # Years info:
        eng_data['Years_Old'] = 2018 - eng_data['Year Built']
        eng_data['Garage Age'] = 2018 - eng_data['Garage Yr Blt']
        eng_data['Years Since Sale'] = 2018 - eng_data['Yr Sold']
        eng_data['Years Since Remodel'] = 2018 - eng_data['Year Remod/Add']
        eng_data.drop(['Year Built', 'Garage Yr Blt', 'Yr Sold', 'Year Remod/Add'],
                      axis=1, inplace=True)
        # Neighborhood info:
        eng_data['High_Neigh'] = [1 if x in high_neigh else 0 for x in eng_data['Neighborhood']]
        eng_data['Mid_Neigh'] = [1 if x in mid_neigh else 0 for x in eng_data['Neighborhood']]
        eng_data['Low_Neigh'] = [1 if x in low_neigh else 0 for x in eng_data['Neighborhood']]
        eng_data.drop('Neighborhood', axis=1, inplace=True)
        # Is there miscellaneous furniture?
        eng_data['MiscFurn'] = eng_data['Misc Val'] > 0
        return eng_data

    X_train = manual_feature_eng(X_train)
    X_test = manual_feature_eng(X_test)
    kaggl_data = manual_feature_eng(kaggl_data)

    # Data Preprocessing: Categorical Data
    print('Processing Categorical Data...')
    # Before we begin, let's check to see if there are any columns in the Kaggle
    # set that aren't in the training set:
    assert [col for col in kaggl_data.columns if col not in X_train.columns] == []
    # And vice versa:
    assert [col for col in X_train.columns if col not in kaggl_data.columns] == []

    # All of our preprocessing will ultimately go here:
    def preprocessing(data):
        try:
            cleaned_data = data.drop('PID', axis=1)
        except:
            cleaned_data = data
        fillna_dict = {
            'Pool QC': 'No Pool',
            'Alley': 'No Alley',
            # Let's let the get_dummies drop 'Misc Features' if NA
            'Fence': 'No Fence',
            'Fireplace Qu': 'No Fireplace',
            # Lot frontage can be mean imputed
            'Garage Finish': 'No Garage',
            'Garage Qual': 'No Garage',
            'Garage Cond': 'No Garage',
            'Garage Type': 'No Garage',
            'Bsmt Exposure': 'No Basement',
            'BsmtFin Type 2': 'No Basement',
            'BsmtFin Type 1': 'No Basement',
            'Bsmt Cond': 'No Basement',
            'Bsmt Qual': 'No Basement',
            'Mas Vnr Type': 'No Mas Vnr'
        }
        cleaned_data = cleaned_data.fillna(fillna_dict)
        return cleaned_data

    X_train = preprocessing(X_train)
    X_test = preprocessing(X_test)
    kaggl_data = preprocessing(kaggl_data)

    # Grab the string columns:
    string_cols = X_train.select_dtypes(exclude=[np.number]).columns
    # Get some dummies:
    X_train = pd.get_dummies(X_train, columns=string_cols)
    X_test = pd.get_dummies(X_test, columns=string_cols)
    kaggl_data = pd.get_dummies(kaggl_data, columns=string_cols)

    # Addressing Column Mismatch After Dummifying
    print('Addressing column mismatch...')
    # Add columns of zeros to test and kaggle sets for columns that *do* appear in
    # the training set.
    model_cols = X_train.columns

    def add_model_cols(data, model_cols):
        new_data = data.copy()
        for missing_col in [col for col in model_cols if col not in data.columns]:
            new_data[missing_col] = 0
        return new_data

    X_test = add_model_cols(X_test, model_cols=model_cols)
    kaggl_data = add_model_cols(kaggl_data, model_cols=model_cols)

    # Now, let's only consider columns in X_test and kaggl_data that appear in
    # the training set. We'll call these 'model columns':
    kaggl_data = kaggl_data[model_cols]
    X_test = X_test[model_cols]

    # Make sure we've done this correctly:
    assert X_train.shape[1] == X_test.shape[1] == kaggl_data.shape[1]
    assert X_train.columns.all() == X_test.columns.all() == kaggl_data.columns.all()

    # Imputing Numerical Missing Data: Handling Numerical Data
    print('Imputing missing numerical data...')
    imp = Imputer(strategy='mean')
    imp.fit(X_train)
    X_train = imp.transform(X_train)
    X_test = imp.transform(X_test)
    kaggl_data = imp.transform(kaggl_data)

    def array_null_check(array):
        '''Turns an array into a dataframe so that we can check for null values'''
        return pd.DataFrame(array).isnull().sum().sum()

    assert array_null_check(X_train) == array_null_check(X_test) == array_null_check(kaggl_data)

    # Brute Force Feature Engineering
    if brute:
        print('Brute force feature engineering...')
        pf = PolynomialFeatures(interaction_only=interaction_only)
        X_train = pf.fit_transform(X_train)
        X_test = pf.transform(X_test)
        kaggl_data = pf.transform(kaggl_data)

    # Maybe this is too many columns???
    print('X_train has:\n---{} rows\n---{} columns'.format(X_train.shape[0], X_train.shape[1]))

    # Scaling
    print('Scaling all columns...')
    ss = StandardScaler()
    X_train = ss.fit_transform(X_train)
    X_test = ss.transform(X_test)
    kaggl_data = ss.transform(kaggl_data)

    # Feature Elimination
    if brute:
        print('Performing automatic feature elimination')
        # Only do feature elimination if feature engineering happened by brute force
        feature_variances = np.apply_along_axis(np.var, axis=0, arr=X_train)
        # Define a percentile threshold. Do I want the top 1% of features by variance?
        perc_thresh = np.percentile(feature_variances, 99)
        vt = VarianceThreshold(threshold=perc_thresh)
        X_train_reduced = vt.fit_transform(X_train)
        X_test_reduced = vt.transform(X_test)
        kaggl_reduced = vt.transform(kaggl_data)
        print('X_train now has:\n---{} rows\n---{} columns'.format(
            X_train_reduced.shape[0], X_train_reduced.shape[1]))
    else:
        X_train_reduced = X_train
        X_test_reduced = X_test
        kaggl_reduced = kaggl_data

    # Or do I want to select the top 1% of features according
    # to the f_regression function?
    # sp = SelectPercentile(score_func=f_regression, percentile=1)
    # X_train_reduced = sp.fit_transform(X_train, y_train)
    # X_test_reduced = sp.transform(X_test)
    # kaggl_reduced = sp.transform(kaggl_data)
    # print(X_train.shape[1])

    ## Modeling

    # Linear Regression
    if run_lin:
        lin = LinearRegression()
        lin.fit(X_train_reduced, y_train)
        cv_scores = cross_val_score(lin, X_train_reduced, y_train, cv=3).mean()
        print('{} model has average performance of {}'
              .format(str(lin).split('(')[0], cv_scores.mean()))

    # Ridge Regression
    if run_ridge:
        rid = RidgeCV()
        rid.fit(X_train_reduced, y_train)
        cv_scores = cross_val_score(rid, X_train_reduced, y_train, cv=3).mean()
        print('{} model has average performance of {}'
              .format(str(rid).split('(')[0], cv_scores.mean()))

    # Lasso Regression
    if run_las:
        # Define a reasonable range of alphas based on previous LASSO fits:
        alphas = np.logspace(2, 4, 20)
        las = LassoCV(alphas=alphas, n_jobs=-1)
        las.fit(X_train_reduced, y_train)
        cv_scores = cross_val_score(las, X_train_reduced, y_train, cv=3).mean()
        best_alpha = las.alpha_
        print('{} model has average performance of {}'
              .format(str(las).split('(')[0], cv_scores.mean()))
        las = Lasso(alpha=best_alpha, max_iter=2000)
        cv_scores = cross_val_score(las, X_train_reduced, y_train, cv=3).mean()
        las.fit(X_train_reduced, y_train)
        print('{} model has average performance of {}'
              .format(str(las).split('(')[0], cv_scores.mean()))

    # ElasticNet Regression
    if run_elnet:
        elnet = ElasticNetCV(n_alphas=10)
        elnet.fit(X_train_reduced, y_train)
        cv_scores = cross_val_score(elnet, X_train_reduced, y_train, cv=3).mean()
        print('{} model has average performance of {}'
              .format(str(elnet).split('(')[0], cv_scores.mean()))

    # Final Model Test
    models = {}
    try:
        lin_score = lin.score(X_test_reduced, y_test)
        models[lin_score] = lin
        print('Test set performance of {}: {}'.format(str(lin).split('(')[0], lin_score))
    except:
        pass
    try:
        rid_score = rid.score(X_test_reduced, y_test)
        models[rid_score] = rid
        print('Test set performance of {}: {}'.format(str(rid).split('(')[0], rid_score))
    except:
        pass
    try:
        las_score = las.score(X_test_reduced, y_test)
        models[las_score] = las
        print('Test set performance of {}: {}'.format(str(las).split('(')[0], las_score))
    except:
        pass
    try:
        elnet_score = elnet.score(X_test_reduced, y_test)
        models[elnet_score] = elnet
        print('Test set performance of {}: {}'.format(str(elnet).split('(')[0], elnet_score))
    except:
        pass

    high_score = max(models.keys())
    print('Best performing model was {},\nwith test set performance of {}'.format(
        str(models[high_score]).split('(')[0], round(high_score, 5)))

    # Choosing a Model and Outputting Submission:
    # Choose a model based on test set performance:
    chosen_model = models[high_score]
    if submission_path:
        kaggl_preds = chosen_model.predict(kaggl_reduced)
        kaggl_id = pd.read_csv('data/test.csv')['Id']
        sample_submission = pd.read_csv('data/sample_submission.csv')
        submission_columns = sample_submission.columns
        submission = pd.DataFrame({submission_columns[0]: kaggl_id,
                                   submission_columns[1]: kaggl_preds})
        submission.to_csv(submission_path, index=False)
def main():
    df = joblib.load('modelDataset.pkl')

    # Split dataframe into features and target
    y = df.iloc[:, 1]  # .as_matrix()
    X = df.iloc[:, 2:]  # .as_matrix()
    id = df.iloc[:, 0]

    # Scaling
    sc = StandardScaler()
    # Apply scaler
    colNames = X.columns
    X = sc.fit_transform(X)
    X = pd.DataFrame(X, columns=colNames)

    # Remove features with less than 20% variance
    colNames = X.columns
    sel = VarianceThreshold(threshold=0.16)
    X = sel.fit_transform(X)
    # Get column names back (see the helper sketch after this function)
    newCols = []
    for remain, col in zip(sel.get_support(), colNames):
        if remain == True:
            newCols.append(col)
    X = pd.DataFrame(X, columns=newCols)

    # Perform univariate feature selection (ANOVA F-values)
    colNames = X.columns
    selection_Percent = SelectPercentile(percentile=5)
    X = selection_Percent.fit_transform(X, y)
    # Get column names back
    newCols = []
    for remain, col in zip(selection_Percent.get_support(), colNames):
        if remain == True:
            newCols.append(col)
    X = pd.DataFrame(X, columns=newCols)

    # Perform tree-based feature selection
    clf = ExtraTreesRegressor()
    clf = clf.fit(X, y)
    colNames = X.columns
    sel = SelectFromModel(clf, prefit=True)
    X = sel.transform(X)
    newCols = []
    for remain, col in zip(sel.get_support(), colNames):
        if remain == True:
            newCols.append(col)
    X = pd.DataFrame(X, columns=newCols)

    # Split train/test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                        random_state=1555)

    def testRegressor(clf):
        '''
        # RF grid
        param_grid = [{'n_estimators': range(320, 350, 10),
                       'min_samples_split': range(2, 20, 2),
                       'min_samples_leaf': range(2, 20, 2),
                       'max_leaf_nodes': range(140, 170, 5)}]
        grid = GridSearchCV(clf, param_grid, cv=3, verbose=1, n_jobs=-1)
        fitted_classifier = grid.fit(X_train, y_train)
        print(grid.best_score_, grid.best_params_)
        predictions = fitted_classifier.predict(X_train)
        '''
        '''
        # XGB tuning - concept, not in use
        param_grid = [{'max_depth': range(2, 4, 1),
                       'min_child_weight': range(3, 6, 1),
                       'n_estimators': range(80, 110, 10),
                       'learning_rate': [0.1],
                       'gamma': [0],
                       'subsample': [0.9, 1],
                       'colsample_bytree': [0.7],
                       'reg_alpha': [15, 50, 100, 150, 200],
                       'reg_lambda': [15, 20, 25, 30, 40, 50]}]
        fit_params = {"early_stopping_rounds": 8,
                      "eval_metric": "mae",
                      "eval_set": [[X_test, y_test]],
                      "verbose": False}
        grid = GridSearchCV(clf, param_grid, fit_params=fit_params, cv=3,
                            verbose=1, n_jobs=-1)
        fitted_classifier = grid.fit(X_train, y_train)
        print(grid.best_score_, grid.best_params_)
        predictions = fitted_classifier.predict(X_train)
        '''
        fitted = clf.fit(X_train, y_train)
        scoresCV = cross_val_score(clf, X_train, y_train, cv=3, verbose=0, n_jobs=-1)
        trainPredictionsCV = cross_val_predict(clf, X_train, y_train, cv=3, verbose=0, n_jobs=-1)
        trainPredictions = clf.predict(X_train)
        testPredictions = clf.predict(X_test)
        score1 = metrics.explained_variance_score(y_test.values, testPredictions)
        score2 = metrics.mean_absolute_error(y_test.values, testPredictions)
        score3 = metrics.mean_squared_error(y_test.values, testPredictions)
        score4 = metrics.r2_score(y_test.values, testPredictions)
        print('Train score: ', metrics.mean_absolute_error(y_train.values, trainPredictions))
        print('CV score: ', scoresCV)
        print('Explained Variance Score, MAE, MSE, R^2')
        print(score1, score2, score3, score4)
        tempIndex = range(0, len(y_test.values), 1)
        plt.scatter(tempIndex, y_test.values, color='black', s=20, alpha=0.8)
        plt.scatter(tempIndex, testPredictions, color='red', s=20, alpha=0.4)
        plt.show()

    # Results appear to be highly interesting.
    # MSE (and thus penalising large errors more) suggests that the model does
    # not deal well with particular categories of retweets where there is a
    # significant difference between true and predicted values.
    # Data appears to have high selection bias, as if tweets were drawn from
    # specific pools based on retweet value.
    # While the random forest deals well with those particular types of tweets,
    # more analysis is needed.
    # Further steps would start by understanding the sampling procedure that
    # produced these tweets. From there, features need to be relooked at;
    # dimensionality reduction (such as PCA) might be needed, and simpler or
    # more powerful models then appropriately applied.
    # The target retweets actually seem to be created from a Decision Tree Model.
    print('x')
    lr = LinearRegression()
    dt = DecisionTreeRegressor()
    rf = RandomForestRegressor()
    gb = xgboost.XGBRegressor()
    # print('LR')
    # testRegressor(lr)
    # print('DT')
    # testRegressor(dt)
    print('RF')
    testRegressor(rf)  # fixed: the original passed dt here, re-running the decision tree
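# The three "get column names back" loops in main() above repeat the same
# pattern. A minimal helper sketch that does it in one place (selector is any
# fitted sklearn selector exposing get_support(); names are hypothetical):
def keep_selected_columns(X_array, selector, colNames):
    newCols = [col for remain, col in zip(selector.get_support(), colNames) if remain]
    return pd.DataFrame(X_array, columns=newCols)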
def learn(X: pd.DataFrame, y: pd.DataFrame, s: pd.DataFrame,
          outer_folds: list, inner_folds: list) -> pd.DataFrame:
    """Apply the entire machine learning procedure.

    Arguments:
    - X: A m*n dataframe containing features, that is used as input for classifier
    - y: A boolean vector of length n, containing the targets
    - s: A boolean vector of length n, indicating whether a sample belongs to
      the sensitive group.
    - outer_folds, inner_folds: Result of src.get_folds.

    Returns a pd.DataFrame containing the performance over all folds.
    """
    assert all(X.index == y.index)
    assert all(X.index == s.index)

    # Convert X, y, s to np.arrays for compatibility reasons.
    X = np.ascontiguousarray(X.values)
    y = np.ascontiguousarray(y.values.ravel()) > 1
    s = np.ascontiguousarray(s.values.ravel())

    params = [
        (int(max_depth), int(n_bins), float(orthogonality))
        for n_bins in (2,)
        for max_depth in np.arange(1, 11)
        for orthogonality in np.linspace(0, 1, 11)
    ]

    # Learn on every outer fold
    iterations = [
        (max_depth, n_bins, ortho, fold, trainval_idx, test_idx)
        for max_depth, n_bins, ortho in params
        for fold, (trainval_idx, test_idx) in outer_folds
        if not isfile(f'models/outer_folds/{max_depth}-{ortho:.2f}-{n_bins}-{fold}.pkl')
    ]
    for max_depth, n_bins, ortho, fold, trainval_idx, test_idx in tqdm(iterations):
        X_trainval = X[trainval_idx]
        y_trainval = y[trainval_idx]
        s_trainval = s[trainval_idx]
        vt = VarianceThreshold()
        vt.fit(X_trainval)
        X_trainval = vt.transform(X_trainval)
        clf = FairRandomForestClassifier(
            orthogonality=ortho,
            max_depth=max_depth,
            n_bins=n_bins)
        start_fit = time()
        clf.fit(X_trainval, y_trainval, s_trainval)
        clf.fit_time = time() - start_fit
        fp = f'models/outer_folds/{max_depth}-{ortho:.2f}-{n_bins}-{fold}.pkl'
        joblib.dump(clf, fp)

    # Learn on every inner fold
    iterations = [
        (max_depth, n_bins, ortho, outer_fold, inner_fold, train_idx, val_idx)
        for max_depth, n_bins, ortho in params
        for (outer_fold, inner_fold), (train_idx, val_idx) in inner_folds
        if not isfile(f'models/inner_folds/{max_depth}-{ortho:.2f}-{n_bins}-{outer_fold}-{inner_fold}.pkl')
    ]
    for max_depth, n_bins, ortho, outer_fold, inner_fold, train_idx, val_idx in tqdm(iterations):
        X_train = X[train_idx]
        y_train = y[train_idx]
        s_train = s[train_idx]
        vt = VarianceThreshold()
        vt.fit(X_train)
        X_train = vt.transform(X_train)
        clf = FairRandomForestClassifier(
            orthogonality=ortho,
            max_depth=max_depth,
            n_bins=n_bins)
        start_fit = time()
        clf.fit(X_train, y_train, s_train)
        clf.fit_time = time() - start_fit
        fp = f'models/inner_folds/{max_depth}-{ortho:.2f}-{n_bins}-{outer_fold}-{inner_fold}.pkl'
        joblib.dump(clf, fp)

    # Predict on all outer folds
    iterations = [
        (max_depth, n_bins, ortho, fold, trainval_idx, test_idx)
        for max_depth, n_bins, ortho in params
        for fold, (trainval_idx, test_idx) in outer_folds
        if not isfile(f'models/outer_folds/{max_depth}-{ortho:.2f}-{n_bins}-{fold}.npy')
    ]
    for max_depth, n_bins, ortho, fold, trainval_idx, test_idx in tqdm(iterations):
        X_trainval = X[trainval_idx]
        X_test = X[test_idx]
        vt = VarianceThreshold()
        vt.fit(X_trainval)
        X_trainval = vt.transform(X_trainval)
        X_test = vt.transform(X_test)
        fp = f'models/outer_folds/{max_depth}-{ortho:.2f}-{n_bins}-{fold}'
        clf = joblib.load(f'{fp}.pkl')
        y_score = clf.predict_proba(X_test)[:, 1]
        np.save(f'{fp}.npy', y_score)

    # Predict on all inner folds
    iterations = [
        (max_depth, n_bins, ortho, outer_fold, inner_fold, train_idx, val_idx)
        for max_depth, n_bins, ortho in params
        for (outer_fold, inner_fold), (train_idx, val_idx) in inner_folds
        if not isfile(f'models/inner_folds/{max_depth}-{ortho:.2f}-{n_bins}-{outer_fold}-{inner_fold}.npy')
    ]
    for max_depth, n_bins, ortho, outer_fold, inner_fold, train_idx, val_idx in tqdm(iterations):
        X_train = X[train_idx]
        X_val = X[val_idx]
        vt = VarianceThreshold()
        vt.fit(X_train)
        X_train = vt.transform(X_train)
        X_val = vt.transform(X_val)
        fp = f'models/inner_folds/{max_depth}-{ortho:.2f}-{n_bins}-{outer_fold}-{inner_fold}'
        clf = joblib.load(f'{fp}.pkl')
        y_score = clf.predict_proba(X_val)[:, 1]
        np.save(f'{fp}.npy', y_score)

    # Measure performance for every outer loop
    iterations = [
        (max_depth, n_bins, orthogonality, outer_fold, inner_fold, train_idx, val_idx)
        for max_depth, n_bins, orthogonality in params
        for (outer_fold, inner_fold), (train_idx, val_idx) in inner_folds
    ]
    performance_all_candidates = list()
    for max_depth, n_bins, ortho, outer_fold, inner_fold, train_idx, val_idx in tqdm(iterations):
        fp = f'models/inner_folds/{max_depth}-{ortho:.2f}-{n_bins}-{outer_fold}-{inner_fold}'
        y_score = np.load(f'{fp}.npy')
        y_val = y[val_idx]
        s_val = s[val_idx]
        auc_y = roc_auc_score(y_val, y_score)
        auc_s = roc_auc_score(s_val, y_score)
        auc_s = max(auc_s, 1 - auc_s)
        performance_this_run = dict(
            max_depth=max_depth,
            n_bins=n_bins,
            orthogonality=ortho,
            outer_fold=outer_fold,
            inner_fold=inner_fold,
            auc_y=auc_y,
            auc_s=auc_s)
        performance_all_candidates.append(performance_this_run)
    return pd.DataFrame(performance_all_candidates)
# then we know that for real those are the features that are not helpful
xtrain_aud = sio.loadmat('xtrain_all_aud.mat')
xtrain_aud = xtrain_aud['xtrain']
ytrain_aud = sio.loadmat('ytrain_all_aud.mat')
ytrain_aud = ytrain_aud['ytrain']

# method 1: variance threshold
Var_selector = VarThresh(.5)
# Without any parameters passed, VarThresh defaults to removing only features
# where all samples have the exact same value; am going to start with .1
Var_selector.fit(xtrain_aud)
which_feats = Var_selector.get_support()
x_aud_fitted = Var_selector.transform(xtrain_aud)
print x_aud_fitted.shape

xtrunclength = sio.loadmat('xtrunclength.mat')
xtrunclength = xtrunclength['xtrunclength']
xtesting = sio.loadmat('xtesting.mat')
xtesting = xtesting['xtesting']
xtesting = xtesting[~np.isnan(xtesting).any(axis=1), :]
xtesting = xtesting[~np.isinf(xtesting).any(axis=1), :]

from CurrentThingsNeededtoRun import FinalClassifier
class TextClassifier(BaseEstimator):

    def __init__(self, base_classifiers=[SGDClassifier()]):
        """
        Parameters
        ----------
        base_classifiers: array, shape = [n_estimators], optional, default: [SGDClassifier()]
            estimator objects implementing fit and predict used for
            classification; the best combination is chosen

        Attributes
        ----------
        multilabel_: boolean, optional, default: True
        with_titles_: boolean, optional, default: False
        """
        self.base_classifiers = base_classifiers

    def __feature_selection(self, text_data):
        """
        Parameters
        ----------
        text_data: array, shape = [n_samples]

        Returns
        -------
        sparse matrix of text features
        """
        X = self.count_vect_.fit_transform(text_data)
        X_tfidf = self.tfidf_transformer_.fit_transform(X)
        return X_tfidf

    def __transform_features(self, text_data):
        """
        Transform data by using tf-idf

        Parameters
        ----------
        text_data: array, shape = [n_samples]

        Returns
        -------
        sparse matrix of text features
        """
        X = self.count_vect_.transform(text_data)
        X_tfidf = self.tfidf_transformer_.transform(X)
        return X_tfidf

    def fit(self, X, y, titles=None, multilabel=True):
        """
        Fit base_classifiers, choose the best model

        Parameters
        ----------
        X: array, shape = [n_samples]
        y: array, shape = [n_samples]
        titles: array, shape = [n_samples], optional, default: None
        multilabel: boolean, optional, default: True

        Returns
        -------
        self
        """
        self.with_titles_ = titles is not None
        self.multilabel_ = multilabel
        self.tfidf_transformer_ = TfidfTransformer()
        self.count_vect_ = CountVectorizer(decode_error='ignore')
        self.best_classifier_ = self.base_classifiers[0]
        best_quality = 0.0
        if self.with_titles_:
            X_train = [X[i] + ' ' + titles[i] for i in range(len(X))]
        else:
            X_train = X
        X_features = self.__feature_selection(X_train)
        if self.multilabel_:
            # remove target labels that are identical across all objects
            self.selector_ = VarianceThreshold()
            Y = self.selector_.fit_transform(y)
            self.best_classifier_ = OneVsRestClassifier(self.best_classifier_)
        else:
            for classifier in self.base_classifiers:
                new_quality = np.mean(cross_val_score(classifier, X_features, np.array(y)))
                if new_quality > best_quality:
                    best_quality = new_quality
                    self.best_classifier_ = classifier
            Y = y
        self.best_classifier_.fit(X_features, Y)
        return self

    def predict(self, X, titles=None):
        """
        Parameters
        ----------
        X: array, shape = [n_samples]
        titles: array, shape = [n_samples], optional, default: None

        Returns
        -------
        y_pred: array, shape = [n_samples]
        """
        self.with_titles_ = titles is not None
        if self.with_titles_:
            X_train = [X[i] + ' ' + titles[i] for i in range(len(X))]
        else:
            X_train = X
        X_features = self.__transform_features(X_train)
        y_pred = self.best_classifier_.predict(X_features)
        return y_pred

    def predict_proba(self, X):
        """
        Compute probabilities of possible outcomes for samples in X.

        Parameters
        ----------
        X: array, shape = [n_samples]

        Returns
        -------
        Returns the probability of the sample for each class in the model.
        The columns correspond to the classes in sorted order, as they
        appear in the attribute classes_.
        """
        X_features = self.__transform_features(X)
        return self.best_classifier_.predict_proba(X_features)

    def get_support(self):
        """
        Get a mask, or integer index, of the features selected

        Returns
        -------
        T: array, shape = [n_features]
            returns the mask of selected features
        """
        return self.selector_.get_support()

    def score(self, X, y_true):
        """
        Parameters
        ----------
        X: array, shape = [n_samples]
        y_true: true labels for X

        Returns
        -------
        Mean accuracy of self.predict(X) wrt. y_true.
        """
        if self.multilabel_:
            Y = self.selector_.transform(y_true)
            return np.mean(Y == self.predict(X))
        else:
            return accuracy_score(y_true, self.predict(X))

    def load(self, path):
        """
        Load model parameters from path

        Parameters
        ----------
        path: path to load from
        """
        with open(path, 'rb') as f:
            sys.modules['textclassifier'] = sys.modules[__name__]
            state = pickle.load(f)
            self.__dict__ = state.__dict__
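# A minimal usage sketch of the TextClassifier above, on toy single-label data;
# the docs/labels below are illustrative assumptions, and the defaults rely on
# the same sklearn objects the class already uses.
docs = ['cheap flights to paris', 'book a hotel and flights', 'flight deals today',
        'hotel rooms near the airport', 'last minute flight offers',
        'machine learning with python', 'deep learning tutorial',
        'training neural networks', 'python data science guide',
        'gradient descent explained']
labels = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
clf = TextClassifier()                     # defaults to [SGDClassifier()]
clf.fit(docs, labels, multilabel=False)    # cross-validates each base classifier
print(clf.predict(['python tutorial for beginners']))  # likely the second class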
def fit(self, x, y, w=None): """ Fits upper and lower bounds on p(y|x) Args: x, y are lists with control groups first """ # -----preprocessing if self.standardize: x0selector = VarianceThreshold(threshold=.1).fit(x[0]) temp_x0 = x0selector.transform(x[0]) x0scaler = StandardScaler().fit(temp_x0) self.x0scaler = lambda x: x0scaler.transform(x0selector.transform(x)) x1selector = VarianceThreshold(threshold=.1).fit(x[1]) temp_x1 = x1selector.transform(x[1]) x1scaler = StandardScaler().fit(temp_x1) self.x1scaler = lambda x: x1scaler.transform(x1selector.transform(x)) x00 = self.x0scaler(x[0]) x01 = self.x1scaler(x[0]) x11 = self.x1scaler(x[1]) x10 = self.x0scaler(x[1]) else: x00, x01 = x[0], x[0] x11, x10 = x[1], x[1] if self.kernel == 'linear': self.kernel_fit = lambda x: x x00 = self.kernel_fit(x00) x01 = self.kernel_fit(x01) x11 = self.kernel_fit(x11) x10 = self.kernel_fit(x10) elif self.kernel == 'poly': if self.p is None: raise ValueError('Need polynomial value') self.kernel_fit = lambda x: np.hstack([x**i for i in range(1, self.p + 1)]) x00 = self.kernel_fit(x00) x01 = self.kernel_fit(x01) x11 = self.kernel_fit(x11) x10 = self.kernel_fit(x10) elif self.kernel == 'rbf': if self.sig is None: raise ValueError('Need Length scale value') self.x0_tr = x00.copy() self.x1_tr = x11.copy() self.kernel_fit = lambda x, tg: RBF(length_scale=self.sig).__call__(x, self.x1_tr) if tg == 1 else \ RBF(length_scale=self.sig).__call__(x, self.x0_tr) x00 = self.kernel_fit(x00, tg=0) x01 = self.kernel_fit(x01, tg=1) x11 = self.kernel_fit(x11, tg=1) x10 = self.kernel_fit(x10, tg=0) elif self.kernel == 'rbf_approx': if self.sig is None: raise ValueError('Need Length scale value') self.x0_tr = x00.copy() self.x1_tr = x11.copy() self.rbf_approx1 = RBFSampler(gamma=1 / self.sig, n_components=100, random_state=0).fit(self.x1_tr) self.rbf_approx0 = RBFSampler(gamma=1 / self.sig, n_components=100, random_state=0).fit(self.x0_tr) self.kernel_fit = lambda x, tg: self.rbf_approx1.transform(x) if tg == 1 else \ self.rbf_approx0.transform(x) x00 = self.kernel_fit(x00, tg=0) x01 = self.kernel_fit(x01, tg=1) x11 = self.kernel_fit(x11, tg=1) x10 = self.kernel_fit(x10, tg=0) n1, d1 = x11.shape[0], x11.shape[1] n0, d0 = x00.shape[0], x00.shape[1] y0 = y[0] y1 = y[1] n = n1 + n0 mdl = grb.Model("cqp") mdl.ModelSense = 1 mdl.setParam('OutputFlag', False) mdl.reset() L = 1e5 u0 = [mdl.addVar(name="u0_%d" % i, lb=-L, ub=L) for i in range(n)] l0 = [mdl.addVar(name="l0_%d" % i, lb=-L, ub=L) for i in range(n)] u1 = [mdl.addVar(name="u1_%d" % i, lb=-L, ub=L) for i in range(n)] l1 = [mdl.addVar(name="l1_%d" % i, lb=-L, ub=L) for i in range(n)] bU0 = [mdl.addVar(name="bu0_%d" % i, lb=-L, ub=L) for i in range(d0 + 1)] bL0 = [mdl.addVar(name="bl0_%d" % i, lb=-L, ub=L) for i in range(d0 + 1)] bU1 = [mdl.addVar(name="bu1_%d" % i, lb=-L, ub=L) for i in range(d1 + 1)] bL1 = [mdl.addVar(name="bl1_%d" % i, lb=-L, ub=L) for i in range(d1 + 1)] rUs = [mdl.addVar(name="ru%d" % i, lb=0, ub=L) for i in range(n)] rLs = [mdl.addVar(name="rl%d" % i, lb=0, ub=L) for i in range(n)] slackU1 = 0 slackL1 = 0 slackU0 = 0 slackL0 = 0 if w is None: w0, w1= np.ones(n0) / n0, np.ones(n1) / n1 else: w0 = w[0] w1 = w[1] obj_terms = [] for i in range(n): mdl.addConstr(u1[i] >= l1[i]) mdl.addConstr(u0[i] >= l0[i]) for i in range(n0): mdl.addConstr(u1[i] == np.dot(x01[i, ], bU1[:d1]) + bU1[-1]) mdl.addConstr(l1[i] == np.dot(x01[i, ], bL1[:d1]) + bL1[-1]) mdl.addConstr(u0[i] == np.dot(x00[i, ], bU0[:d0]) + bU0[-1]) mdl.addConstr(l0[i] == np.dot(x00[i, ], bL0[:d0]) + 
bL0[-1]) mdl.addConstr(rUs[i] >= y0[i] - u0[i]) mdl.addConstr(rLs[i] >= l0[i] - y0[i]) slackU0 += w0[i] * rUs[i] slackL0 += w0[i] * rLs[i] if self.loss == 'square': obj_terms.append(w0[i] * ((u0[i] - l0[i]) * (u0[i] - l0[i]) + (u1[i] - l1[i]) * (u1[i] - l1[i]))) elif self.loss == 'linear': if self.agg == "max": obj_terms.append(((u0[i] - l0[i]) + (u1[i] - l1[i]))) else: obj_terms.append(w0[i]*((u0[i] - l0[i])+ (u1[i] - l1[i]))) else: raise Exception('Unrecognized loss: %s' % self.loss) for i in range(n0, n1+n0): mdl.addConstr(u1[i] == np.dot(x11[i - n0, ], bU1[:d1]) + bU1[-1]) mdl.addConstr(l1[i] == np.dot(x11[i - n0, ], bL1[:d1]) + bL1[-1]) mdl.addConstr(u0[i] == np.dot(x10[i - n0, ], bU0[:d0]) + bU0[-1]) mdl.addConstr(l0[i] == np.dot(x10[i - n0, ], bL0[:d0]) + bL0[-1]) mdl.addConstr(rUs[i] >= y1[i - n0] - u1[i]) mdl.addConstr(rLs[i] >= l1[i] - y1[i - n0]) slackU1 += w1[i - n0] * rUs[i] slackL1 += w1[i - n0] * rLs[i] if self.loss == 'square': obj_terms.append(w1[i - n0] * ((u1[i] - l1[i]) * (u1[i] - l1[i]))) elif self.loss == 'linear': if self.agg == "max": obj_terms.append(((u1[i] - l1[i]) + (u0[i] - l0[i]))) else: obj_terms.append(w1[i - n0] * ((u1[i] - l1[i]) + (u0[i] - l0[i]))) else: raise Exception('Unrecognized loss: %s' % self.loss) if self.agg == 'max': o = mdl.addVar(name="o", lb=-L, ub=L) os = [] for i in range(n): oi = mdl.addVar(name="o%d" % i, lb=-L, ub=L) mdl.addConstr(oi == obj_terms[i]) os += [oi] mdl.addConstr(o == grb.max_(os)) obj = o# + .01*grb.quicksum(obj_terms) else: obj = grb.quicksum(obj_terms) obj_reg_u0, obj_reg_l0, obj_reg_u1, obj_reg_l1 = 0, 0, 0, 0 for k in range(d1): obj_reg_u1 += bU1[k] * bU1[k] obj_reg_l1 += bL1[k] * bL1[k] for k in range(d0): obj_reg_u0 += bU0[k] * bU0[k] obj_reg_l0 += bL0[k] * bL0[k] obj_reg = self.alphau1 * obj_reg_u1 + self.alphal1 * obj_reg_l1 + \ self.alphau0 * obj_reg_u0 + self.alphal0 * obj_reg_l0 obj = obj + obj_reg mdl.addConstr((slackU0 <= self.lamdau0)) mdl.addConstr((slackL0 <= self.lamdal0)) mdl.addConstr((slackU1 <= self.lamdau1)) mdl.addConstr((slackL1 <= self.lamdal1)) mdl.setObjective(obj) mdl.optimize() self.bu0 = np.array([bU0[j].x for j in range(d0 + 1)]) self.bl0 = np.array([bL0[j].x for j in range(d0 + 1)]) self.bu1 = np.array([bU1[j].x for j in range(d1 + 1)]) self.bl1 = np.array([bL1[j].x for j in range(d1 + 1)]) return self
def preProcessData(trainFeatureMatrix, testFeatureMatrix):
    totalFeatureNum = 52
    singleValueIndexList = [17, 19, 20, 23]
    categoricalAttriIndexList = [0, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 44, 45, 46]
    categoricalFeatureValueNumList = [13, 112, 2, 13, 13, 112, 2, 13, 145, 4, 3031, 4, 138, 102, 102, 2090]
    cateNumericIndexList = [1, 6, 15, 16, 18, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 49, 50, 51]
    numericAttriIndexList = [1, 6, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 47, 48, 49, 50, 51]

#    for i in range(len(trainFeatureSpace[0])):
#        if not i in categoricalAttriIndexList:
#            #print 'numerical', i, len(list(set(trainFeatureSpace[:,i])))
#            print '%s, numerical, train: %s, test:%s' % (i, len(list(set(trainFeatureMatrix[:,i]))), len(list(set(testFeatureMatrix[:,i]))))
#        else:
#            print '%s, categorical, train: %s, test:%s' % (i, len(list(set(trainFeatureMatrix[:,i]))), len(list(set(testFeatureMatrix[:,i]))))

    # stack train and test so the encoder sees every categorical value
    tempResultMatrix = np.concatenate((trainFeatureMatrix, testFeatureMatrix), axis=0)
#    print len(trainFeatureMatrix), len(trainFeatureMatrix[0])
#    print len(testFeatureMatrix), len(testFeatureMatrix[0])
#    print len(tempResultMatrix), len(tempResultMatrix[0])
#    exit()

#    for i in range(len(trainFeatureMatrix)):
#        for j in range(len(trainFeatureMatrix[0])):
#            if j in cateNumericIndexList:
#                trainFeatureMatrix[i][j] = int(trainFeatureMatrix[i][j])
#    for i in range(len(testFeatureMatrix)):
#        for j in range(len(testFeatureMatrix[0])):
#            if j in cateNumericIndexList:
#                testFeatureMatrix[i][j] = int(testFeatureMatrix[i][j])

#    selectedFeatureList = []
#    for i in range(53):
#        if not i in singleValueIndexList:
#            selectedFeatureList.append(i)
#    trainFeatureMatrix = trainFeatureMatrix[ : , selectedFeatureList]
#    testFeatureMatrix = testFeatureMatrix[ : , selectedFeatureList]

    from sklearn.preprocessing import OneHotEncoder
    enc = OneHotEncoder(categorical_features=categoricalAttriIndexList + cateNumericIndexList)
    enc.fit(tempResultMatrix)
    trainFeatureMatrix = enc.transform(trainFeatureMatrix).toarray()
    testFeatureMatrix = enc.transform(testFeatureMatrix).toarray()
    print 'old feature num is ', len(trainFeatureMatrix[0]), len(testFeatureMatrix[0])

    #tempResultMatrix = np.concatenate((trainFeatureMatrix, testFeatureMatrix), axis=0)
    # fit the variance filter on train only, then apply the same mask to both
    sel = VarianceThreshold()
    sel.fit(trainFeatureMatrix)
    trainFeatureMatrix = sel.transform(trainFeatureMatrix)
    testFeatureMatrix = sel.transform(testFeatureMatrix)
    print 'new feature num is ', len(trainFeatureMatrix[0]), len(testFeatureMatrix[0])
    #exit()
    return trainFeatureMatrix, testFeatureMatrix
#vtFT = VarianceThreshold(threshold=(0.2)) vtFT = VarianceThreshold(0.00025) print(trainFts.shape) trainFts = vtFT.fit_transform(trainFts) print(vtFT.variances_) print(min(vtFT.variances_)) print(max(vtFT.variances_)) _, ax = plt.subplots() bins = np.linspace(0.00030, 0.008, 1000) ax.hist(vtFT.variances_, bins) devFts = vtFT.transform(devFts) goldFts = vtFT.transform(goldFts) print(trainFts.shape) if DR_SVD_ON: svd = TruncatedSVD(n_components=200, n_iter=7, random_state=42) svd.fit(trainFts) #print(svd.explained_variance_ratio_) print(trainFts.shape) #print(type(trainFts)) trainFts = sp.csr_matrix(svd.transform(trainFts)) devFts = sp.csr_matrix(svd.transform(devFts)) goldFts = sp.csr_matrix(svd.transform(goldFts))
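# A hedged alternative to eyeballing the histogram above: scan a few candidate
# thresholds and print the fraction of features each one would keep (sketch;
# reuses the vtFT.variances_ already fitted above).
for t in (1e-4, 2.5e-4, 1e-3):
    print(t, np.mean(vtFT.variances_ > t))  # fraction of features kept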
eliminated_feats = list(pool_features - selected_features)[:-1]

# In[ ]:

print("Eliminated features:", eliminated_feats)

# In[ ]:

len(eliminated_feats)

# We eliminated 14 features. Do the same with the TEST subset:

# In[ ]:

X_high_variance_ts = featFilter.transform(X_ts)

# In[ ]:

X_high_variance_ts.shape

# In[ ]:

X_high_variance.shape

# In[ ]:

df_selVar = df[df.columns[featFilter.get_support(indices=True)]]

# Add Y:
class Model: # model related attribute model = None feat_transformer =None label_encoder = None selector = None pipeline = None # other parameters classification = True binarize = False; verbose = False # params for cloning params = dict() def __init__(self, model_type=None, \ model_params="", \ f_select=None, \ f_select_params="", \ sparse=True, \ n_features=100, \ n_components=10): self.params = {"model_type": model_type, "model_params": model_params, "f_select": f_select, "f_select_params": f_select_params, "sparse": sparse, "n_features": n_features, "n_components": n_components} if (model_type == None): return # initialize "default" values # models specific values should be set in model-specific if-else branch self.feat_transformer = DictVectorizer(sparse=sparse) self.label_encoder = LabelEncoder() # selector to remove zero-variance features self.var_selector = VarianceThreshold() #scaler = StandardScaler(with_mean=False) #selector = SelectKBest(chi2, k=n_features) #combined_features = FeatureUnion([('selector', selector)]) #self.pipeline = Pipeline([('vectorizer', feat_vectorizer), ('features', combined_features), ('scaler', scaler), ('model', self.model)]) # Choose model type if (model_type == "linear_svm"): self.model = eval("LinearSVC(" + model_params + ")") elif (model_type == "svm"): self.model = eval("SVC(" + model_params + ")") elif (model_type == "knn"): self.model = eval("KNeighborsClassifier(" + model_params + ")") elif (model_type == "ridge_classifier"): self.model = eval("RidgeClassifier(" + model_params + ")") elif (model_type == "ridge_regression"): self.classification = False self.model = eval("Ridge(" + model_params + ")") elif (model_type == "lasso"): self.classification = False self.model = eval("Lasso(" + model_params + ")") elif (model_type == "bayesian_ridge"): self.classification = False self.model = eval("BayesianRidge(" + model_params + ")") elif (model_type == "gaussian_bayes"): self.model = GaussianNB() elif (model_type == "decision_trees"): self.model = DecisionTreeClassifier(random_state=0) elif (model_type == "log_regression"): self.model = eval("LogisticRegression(" + model_params + ")") elif (model_type == "linear_regression"): self.classification = False self.model = eval("LinearRegression(" + model_params + ")") elif (model_type == "perceptron"): self.model = eval("Perceptron(" + model_params + ")") elif (model_type == "extra_trees"): self.feat_transformer = DictVectorizer(sparse=False) self.model = eval("ExtraTreesClassifier(" + model_params + ")") elif (model_type == "random_forest"): self.feat_transformer = DictVectorizer(sparse=False) self.model = eval("RandomForestClassifier(" + model_params + ")") elif (model_type == "ada_boost"): # self.feat_transformer = DictVectorizer(sparse=False) self.model = eval("AdaBoostClassifier(" + model_params + ")") elif (model_type == "sgd_classifier"): self.model = eval("SGDClassifier(" + model_params + ")") elif (model_type == "baseline"): self.model = eval("DummyClassifier(" + model_params + ")") else: print >> sys.stderr, "Model of type " + model_type + " is not supported." 
# Choose feature selector if (f_select == None): self.selector = EmptyModel() elif (f_select == "kbest"): # params: score_func, k self.selector = eval("SelectKBest(" + f_select_params + ")") elif (f_select == "percentile"): self.selector = eval("SelectPercentile(" + f_select_params + ")") elif (f_select == "kbest_anova"): #self.selector = SelectKBest(f_classif, k=n) self.selector = eval("SelectKBest(" + "f_classif," + f_select_params + ")") elif (f_select == "lassocv"): self.selector = eval("SelectFromModel(LassoCV()," + f_select_params + ")") elif (f_select == "rfe_svm"): self.selector = eval("RFE(LinearSVC()," + f_select_params + ")") elif (f_select == "rfecv"): self.selector = eval("RFECV(" + f_select_params + ")") elif (f_select == "rlregr"): self.selector = eval("RandomizedLogisticRegression(" + f_select_params + ")") elif (f_select == "svm"): print "SelectFromModel(LinearSVC(" + f_select_params + "))" self.selector = eval("SelectFromModel(LinearSVC(" + f_select_params + "))") elif (f_select == "extra_trees"): self.selector = eval("SelectFromModel(ExtraTreesClassifier(" + f_select_params + "))") elif (f_select == "random_forest"): self.selector = eval("SelectFromModel(RandomForestClassifier(" + f_select_params + "))") elif (f_select == "from_model"): self.selector = eval("SelectFromModel(" + f_select_params + ")") def fit(self, X, Y): Xtr = X if (self.classification): Xtr = [self.transform_features(i) for i in X] Xtr = self.feat_transformer.fit_transform(Xtr) Xtr = self.var_selector.fit_transform(Xtr) Y = self.label_encoder.fit_transform(Y) Xtr = self.selector.fit_transform(Xtr, Y) self.model.fit(Xtr,Y) def transform(self, Y): """ For evaluation, we want transformed predicted values. """ return self.label_encoder.transform(Y) def predict(self, X): if not isinstance(X, list) and not isinstance(X, numpy.ndarray): sys.stderr.write("Warning: X is not a list (type=%s)\n" % (str(type(X)))) #raise ValueError(X) if (self.classification): Xtr = [self.transform_features(i) for i in X] Xtr = self.feat_transformer.transform(Xtr) Xtr = self.var_selector.transform(Xtr) Xtr = self.selector.transform(Xtr) return self.label_encoder.inverse_transform(self.model.predict(Xtr)) else: return self.model.predict(Xtr) def predict_proba(self, X): if not isinstance(X, list) and not isinstance(X, numpy.ndarray): sys.stderr.write("Warning: X is not a list (type=%s)\n" % (str(type(X)))) #raise ValueError(X) if (self.classification): Xtr = [self.transform_features(i) for i in X] Xtr = self.feat_transformer.transform(Xtr) Xtr = self.var_selector.transform(Xtr) Xtr = self.selector.transform(Xtr) return self.model.predict_proba(Xtr) else: return self.model.predict_proba(Xtr) def score(self, X, Y): if (self.classification): Xtr = [self.transform_features(i) for i in X] Xtr = self.feat_transformer.transform(Xtr) Xtr = self.var_selector.transform(Xtr) Xtr = self.selector.transform(Xtr) Y = self.label_encoder.transform(Y) return self.model.score(Xtr, Y) def get_classes(self): return self.label_encoder.inverse_transform(self.model.classes_) def set_params(self, **params): self.model = self.model.set_params(**params) return self def get_params(self, deep=True): return self.params def print_params(self, file_path): f = open(file_path, "w") if (self.model.__class__.__name__ == "DecisionTreeClassifier"): f = tree.export_graphviz(self.model, out_file=f) f.close() def transform_features(self, features): """ Transform features with string values into new sets of features. 
""" transformed = dict() if not self.binarize: return features for name, value in features.iteritems(): if isinstance(value, basestring): name = "%s_%s" % (name,value) value = 1. transformed[name] = float(value) return transformed def setVerbose(self): self.verbose = True
# In[13]:

constant_filter.get_support().sum()

# In[14]:

constant_list = [not temp for temp in constant_filter.get_support()]
constant_list

# In[15]:

x.columns[constant_list]

# In[16]:

x_train_filter = constant_filter.transform(x_train)
x_test_filter = constant_filter.transform(x_test)

# In[17]:

x_train_filter.shape, x_test_filter.shape, x_train.shape

# ## Quasi-Constant Feature Removal
# * These features take the same value in almost every row, so they carry little information
# * Keeping them only adds load to the machine learning model

# In[18]:

quasi_constant_filter = VarianceThreshold(threshold=0.01)
quasi_constant_filter.fit(x_train_filter)
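# In[ ]:

# Why threshold=0.01 catches quasi-constant columns: a boolean feature equal in
# 99% of rows has variance p*(1-p) = 0.99*0.01 = 0.0099, just under 0.01.
# A minimal synthetic check (sketch):
import numpy as np
col = np.zeros((1000, 1))
col[:10] = 1                                                     # 99% zeros
print(col.var())                                                 # ~0.0099
print(VarianceThreshold(threshold=0.01).fit(col).get_support())  # [False] -> dropped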
X_train_T = y.T
y_train = pd.DataFrame(X_train_T)
X_test_T = y1.T
y_test = pd.DataFrame(X_test_T)
#X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state = 0,stratify = y)

## constant feature removal
constant_filter = VarianceThreshold(threshold=0)
constant_filter.fit(X)
print(constant_filter.get_support().sum())
constant_list = [not temp for temp in constant_filter.get_support()]
print(constant_list)
print(X.columns[constant_list])
X_train_filter = constant_filter.transform(X)
X_test_filter = constant_filter.transform(X_test)
print(X_train_filter.shape)
print(X_test_filter.shape)
print(X.shape)

## quasi-constant feature removal
quasi_constant_filter = VarianceThreshold(threshold=0.01)
quasi_constant_filter.fit(X_train_filter)
print(quasi_constant_filter.get_support().sum())
X_train_quasi_filter = quasi_constant_filter.transform(X_train_filter)
X_test_quasi_filter = quasi_constant_filter.transform(X_test_filter)
print(X_train_quasi_filter.shape)
print(X_test_quasi_filter.shape)
sel.fit(x_train)

sum(sel.get_support())

# another way
len(x_train.columns[sel.get_support()])

print(len([x for x in x_train.columns
           if x not in x_train.columns[sel.get_support()]]))

[x for x in x_train.columns if x not in x_train.columns[sel.get_support()]]

x_train['ind_var2_0'].unique()

x_train = sel.transform(x_train)
x_test = sel.transform(x_test)

# short and easy: drop zero-variance columns directly from the DataFrame
# (note: this needs the original DataFrame, before sel.transform turns
# x_train into a numpy array)
constant_features = [
    feat for feat in x_train.columns if x_train[feat].std() == 0
]
len(constant_features)

x_train.drop(labels=constant_features, axis=1, inplace=True)
x_test.drop(labels=constant_features, axis=1, inplace=True)

# for categorical variables
def feature_selector(x_train, x_test):
    vt = VarianceThreshold(0.01)
    vt.fit(x_train)
    return vt.transform(x_train), vt.transform(x_test)
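# Minimal usage sketch of feature_selector above, on synthetic arrays; the
# shapes and the constant column are assumptions for illustration.
import numpy as np

rng = np.random.RandomState(0)
x_tr = rng.rand(100, 20)
x_tr[:, 0] = 1.0                       # constant column: variance 0 <= 0.01
x_te = rng.rand(30, 20)
x_tr_sel, x_te_sel = feature_selector(x_tr, x_te)
print(x_tr_sel.shape, x_te_sel.shape)  # (100, 19) (30, 19) if only col 0 drops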
from sklearn.preprocessing import StandardScaler from sklearn.model_selection import train_test_split from hyperopt import fmin, tpe, STATUS_OK, STATUS_FAIL, Trials #lightGBM parameters filename = "train_classify.csv" data = pd.read_csv(filename, index_col=0, nrows=10) float_cols = [c for c in data if data[c].dtype == "float64"] float32_cols = {c: np.float32 for c in float_cols} data = pd.read_csv(filename, index_col=0, dtype=float32_cols) x = data.values[:, :-1] y = data.label print("pre", x.shape) scaler = VarianceThreshold() scaler.fit(x) x = scaler.transform(x) stdScaler = StandardScaler() stdScaler.fit(x) x = stdScaler.transform(x) print("after", x.shape) xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, random_state=420) lgb_reg_params = { 'learning_rate': 0.2, 'max_depth': 10,
def do_t_recur(t_data, filenames, mode):
    # FEATURE SELECTION
    # Scale, apply VarianceThreshold, and Select From RF Model
    scaler = MinMaxScaler()
    thresholding = VarianceThreshold()
    fs_data = []
    for i, d in enumerate(t_data):
        print("\nFILENAME: {}".format(filenames[i]))
        t_rows = list(d.index)
        t_columns = d.columns[:-3]
        # Replace NaN values with the column mean
        t_data[i]['Recurrence'].fillna((t_data[i]['Recurrence'].mean()), inplace=True)
        # Scale
        fs_data.append(pd.DataFrame(scaler.fit_transform(t_data[i].iloc[:, :-3]),
                                    columns=t_columns, index=t_rows))
        if mode == 'show':
            print("Scaling data -\n", fs_data[i].head())
        # Variance Threshold
        thresholding.fit(fs_data[i])
        t_columns = t_columns[thresholding.get_support()]
        fs_data[i] = pd.DataFrame(thresholding.transform(fs_data[i]),
                                  columns=t_columns, index=t_rows)
        if mode == 'show':
            print("After variance thresholding -\n", fs_data[i].head())
        # Select From RF
        classifier = RandomForestClassifier(n_estimators=1)
        classifier = classifier.fit(fs_data[i], d['Recurrence'])
        selector = SelectFromModel(classifier, prefit=True)
        t_columns = t_columns[selector.get_support()]
        fs_data[i] = pd.DataFrame(selector.transform(fs_data[i]),
                                  columns=t_columns, index=t_rows)
        fs_data[i]['Recurrence'] = d['Recurrence']
        if mode == 'show':
            print("Selecting data from RF model -\n", fs_data[i].head())
            print("Shape after feature selection: {}".format(fs_data[i].shape), end="\n\n")

    # RESAMPLING data - SMOTEENN
    balanced_data = [[] for _ in range(2)]
    for i, d in enumerate(fs_data):
        sme = SMOTEENN(random_state=42, smote=SMOTE(random_state=42, k_neighbors=2))
        # x are the features and y are the targets
        x, y = sme.fit_resample(fs_data[i], t_data[i]['Recurrence'])
        balanced_data[i].append(x)
        balanced_data[i].append(y)
        print("Upsampling the data... in {}".format(filenames[i]))
        if mode == 'show':
            print("FILENAME: {}".format(filenames[i]), Counter(balanced_data[i][1]))

    # DIMENSIONALITY REDUCTION
    # Kernel PCA (can be toggled on or off)
    pca = True
    pca_dim = 20
    dr_data = []
    if pca:
        for i in range(len(filenames)):
            print("\nFILENAME: {}".format(filenames[i]))
            decomposer = KernelPCA(n_components=pca_dim, kernel='rbf', gamma=0.5, degree=7)
            dr_data.append(decomposer.fit_transform(balanced_data[i][0]))
            print("Shape and type after PCA: ", dr_data[i].shape, type(dr_data[i]))
    else:
        dr_data.append(balanced_data[0][0])
        dr_data.append(balanced_data[1][0])

    # CLASSIFICATION
    splits = 10
    seed = 7
    kfold = KFold(n_splits=splits, random_state=seed, shuffle=True)
    results = {'SVM': [], 'RF': [], 'KNN': [], 'NB': []}
    for i, d in enumerate(dr_data):
        # SVM
        classifier = SVC(gamma='auto')
        results['SVM'].append(cross_val_score(classifier, pd.DataFrame(dr_data[i]),
                                              balanced_data[i][1], cv=kfold))
        results['SVM'][i] = results['SVM'][i].mean()
        # RF
        # rf = RandomForestClassifier(n_estimators=100,n_jobs=-1,max_depth=10,max_features='auto')
        classifier = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=7,
                                            max_features='auto', criterion='gini', n_jobs=-1)
        results['RF'].append(cross_val_score(classifier, pd.DataFrame(dr_data[i]),
                                             balanced_data[i][1], cv=kfold))
        results['RF'][i] = results['RF'][i].mean()
        # KNN: try k = 1..15 and keep the best mean score
        k_scores = []
        for n in range(1, 16):
            knn = KNeighborsClassifier(n_neighbors=n)
            scores = cross_val_score(knn, pd.DataFrame(dr_data[i]),
                                     balanced_data[i][1], cv=kfold)
            k_scores.append(scores.mean())
        results['KNN'].append(max(k_scores))
        # NB
        nb = GaussianNB()
        results['NB'].append(cross_val_score(nb, pd.DataFrame(dr_data[i]),
                                             balanced_data[i][1], cv=kfold))
        results['NB'][i] = results['NB'][i].mean()

    print("\nFinal Results for datasets: {0}, {1} -".format(filenames[0], filenames[1]))
    pprint(results)

    # PLOTTING
    # PCA
    pca = PCA(n_components=3)
    x_pca = pca.fit_transform(balanced_data[0][0])
    fig = plt.figure(figsize=(13, 7))
    plt.suptitle("3-D plot for resampled data using dimensionality reduction (Biomedical Recurrence)\n\n")
    ax = fig.add_subplot(111, projection='3d')
    ax.set_title("PCA\n\n")
    ax.view_init(elev=177, azim=-96)
    for i in range(len(balanced_data[0][1])):
        if balanced_data[0][1][i] == 0:
            false = ax.scatter(x_pca[i][0], x_pca[i][1], x_pca[i][2], c='y',
                               label=balanced_data[0][1][i])
        elif balanced_data[0][1][i] == 1:
            true = ax.scatter(x_pca[i][0], x_pca[i][1], x_pca[i][2], c='g',
                              label=balanced_data[0][1][i])
    plt.legend((false, true), ("Didn't recur", "Recurred"), scatterpoints=1,
               loc='upper right', ncol=1, fontsize=10)
    #plt.show()
    return results
def trainclassifier(feat_m1, patientinfo, config, parameter_file, output_svm,
                    output_json, feat_m2=None, feat_m3=None, verbose=True):
    # Load variables from the config file
    config = config_io.load_config(config)
    if type(parameter_file) is list:
        parameter_file = ''.join(parameter_file)
    if type(patientinfo) is list:
        patientinfo = ''.join(patientinfo)
    if type(config) is list:
        config = ''.join(config)
    with open(parameter_file) as data_file:
        parameters = json.load(data_file)
    label_type = config['Genetics']['mutation_type']

    # Read the features and classification data
    image_features_select, labels, label_data =\
        readdata(feat_m1, feat_m2, feat_m3, patientinfo, label_type, parameters)

    # Delete features which are the same in more than 99% of patients
    # TODO: Separate this into a different tool
    sel = VarianceThreshold(threshold=0.99 * (1 - 0.99))
    sel = sel.fit(image_features_select)
    image_features_select = sel.transform(image_features_select)
    labels = sel.transform(labels).tolist()[0]

    # Only proceed if we have enough features left
    if len(image_features_select[1]) > 7:
        # Create tempdir name from parameter file name
        basename = os.path.basename(parameter_file)
        filename, _ = os.path.splitext(basename)
        path = parameter_file
        for i in range(4):
            # Use temp dir: result -> sample# -> parameters -> temppath
            path = os.path.dirname(path)
        _, path = os.path.split(path)
        path = os.path.join(path, 'trainclassifier', filename)

        # Construct the required classifier
        classifier, param_grid =\
            cc.construct_classifier(config, image_features_select[0])

        # For N_iter, perform k-fold crossvalidation
        if config['Classification']['fastr']:
            trained_classifier = cv.crossvalfastr(config, label_data,
                                                  image_features_select,
                                                  classifier, param_grid, path)
        else:
            trained_classifier = cv.crossval(config, label_data,
                                             image_features_select,
                                             classifier, param_grid, path)

        # Add labels to dataframe
        # TODO: Works only if single mutation is present
        labels_pd =\
            pd.Series([labels],
                      index=[trained_classifier.keys()[0]],
                      name='feature_labels')
        classifier = classifier.append(labels_pd)

        # Calculate statistics of performance
        statistics = plot_single_SVM(classifier, label_data)
    else:
        statistics = "None"
        labels = ["Too Few Features."]
        feat = ["None"]
        panda_dict = dict(zip(labels, feat))
        classifier = pd.Series(panda_dict)

    # Save output
    savedict = dict()
    savedict["Parameters"] = parameters
    savedict["Statistics"] = statistics

    print("Saving data!")
    if type(output_svm) is list:
        output_svm = ''.join(output_svm)
    if type(output_json) is list:
        output_json = ''.join(output_json)

    # TODO: output_svm/json are list objects!
    classifier.to_hdf(output_svm, 'SVMdata')
    with open(output_json, 'w') as fp:
        json.dump(savedict, fp, indent=4)
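# The threshold above, 0.99 * (1 - 0.99), is the Bernoulli variance p*(1-p):
# a binary feature taking its majority value in more than 99% of patients has
# variance below 0.0099 and gets dropped. Quick arithmetic check (sketch):
for p in (0.995, 0.99, 0.95):
    print(p, p * (1 - p))   # 0.004975 (dropped), 0.0099 (at threshold, dropped), 0.0475 (kept)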
def perform_variance_threshold(self, v_threshold): selector = VarianceThreshold(v_threshold) self.train_x = selector.fit_transform(self.train_x, self.train_y) self.test_x = selector.transform(self.test_x)
def get_mapper(dataframe): beta = 0.0 opt = Nadam(lr=0.001) print(dataframe.head(10)) x_train, x_test = train_test_split(dataframe, random_state=6, test_size=0.2) scaler = MinMaxScaler() var_thresh = VarianceThreshold() var_thresh = var_thresh.fit(x_train) x_train = var_thresh.transform(x_train) x_test = var_thresh.transform(x_test) scaler = scaler.fit(x_train) x_train = scaler.transform(x_train) x_test = scaler.transform(x_test) input = Input(x_train.shape[1:]) batch_norm_1 = BatchNormalization() batch_norm_2 = BatchNormalization() batch_norm_3 = BatchNormalization() batch_norm_4 = BatchNormalization() batch_norm_5 = BatchNormalization() batch_norm_6 = BatchNormalization() batch_norm_7 = BatchNormalization() batch_norm_8 = BatchNormalization() batch_norm_9 = BatchNormalization() batch_norm_10 = BatchNormalization() batch_norm_11 = BatchNormalization() batch_norm_12 = BatchNormalization() batch_norm_neck = BatchNormalization() dense_input = Dense(x_train.shape[1:][0], kernel_regularizer=l2(beta)) dense_1 = Dense(int(x_train.shape[1:][0] / 2), kernel_regularizer=l2(beta)) dense_2 = Dense(int(x_train.shape[1:][0] / 4), kernel_regularizer=l2(beta)) dense_3 = Dense(256, kernel_regularizer=l2(beta)) dense_4 = Dense(128, kernel_regularizer=l2(beta)) dense_5 = Dense(64, kernel_regularizer=l2(beta)) dense_6 = Dense(64, kernel_regularizer=l2(beta)) dense_7 = Dense(128, kernel_regularizer=l2(beta)) dense_8 = Dense(256, kernel_regularizer=l2(beta)) dense_9 = Dense(int(x_train.shape[1:][0] / 4), kernel_regularizer=l2(beta)) dense_10 = Dense(int(x_train.shape[1:][0] / 2), kernel_regularizer=l2(beta)) dense_11 = Dense(x_train.shape[1:][0], kernel_regularizer=l2(beta)) desc_decoder = Dense(x_train.shape[1:][0], activation="linear", kernel_regularizer=l2(beta)) neck = Dense(3, kernel_regularizer=l2(beta)) p_relu = PReLU() p_relu2 = PReLU() p_relu3 = PReLU() p_relu4 = PReLU() p_relu5 = PReLU() p_relu6 = PReLU() p_relu7 = PReLU() p_relu8 = PReLU() p_relu9 = PReLU() p_relu10 = PReLU() p_relu11 = PReLU() p_relu12 = PReLU() p_relu_neck = PReLU() layer1 = batch_norm_1(p_relu(dense_input(input))) layer2 = batch_norm_2(p_relu2(dense_1(layer1))) layer3 = batch_norm_3(p_relu3(dense_2(layer2))) neck_out = p_relu_neck(neck(layer3)) layer10 = batch_norm_10(p_relu4(dense_9(batch_norm_neck(neck_out)))) layer11 = batch_norm_11(p_relu5(dense_10(layer10))) layer12 = batch_norm_12(p_relu6(dense_11(layer11))) decoded_descs = desc_decoder(layer12) autoencoder = Model(input, decoded_descs) print(autoencoder.summary()) plot_model(autoencoder, to_file='model_graph.png') autoencoder.compile(optimizer=opt, loss='mean_squared_error') reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.7, patience=3, min_lr=0, verbose=1, epsilon=0.00001) earlystopping = EarlyStopping(monitor='val_loss', min_delta=0.00001, patience=15, verbose=1, mode='auto') # Save the model for best validation loss checkpointer = ModelCheckpoint(filepath='checkpoint.h5', monitor='val_loss', verbose=1, save_best_only=True) model_history_tmp = autoencoder.fit( x_train, x_train, validation_data=(x_test, x_test), epochs=10000, batch_size=32, callbacks=[checkpointer, earlystopping, reduce_lr], shuffle=False, verbose=0) plot_train_history(model_history_tmp, 'compressor_0_1', '') # load the best model base on validation results for this fold autoencoder = load_model('checkpoint.h5') latent_to_map = Model(input, neck_out) latent_to_map.save('smi2lat.h5') return latent_to_map, var_thresh, scaler
cols.remove('wheezy-copper-turtle-magic') oof = np.zeros(len(train)) preds = np.zeros(len(test)) # BUILD 512 SEPARATE MODELS for i in range(512): # ONLY TRAIN WITH DATA WHERE WHEEZY EQUALS I train2 = train[train['wheezy-copper-turtle-magic'] == i] test2 = test[test['wheezy-copper-turtle-magic'] == i] idx1 = train2.index idx2 = test2.index train2.reset_index(drop=True, inplace=True) # FEATURE SELECTION (USE APPROX 40 OF 255 FEATURES) sel = VarianceThreshold(threshold=1.5).fit(train2[cols]) train3 = sel.transform(train2[cols]) test3 = sel.transform(test2[cols]) # STRATIFIED K-FOLD skf = StratifiedKFold(n_splits=11, random_state=42, shuffle=True) for train_index, test_index in skf.split(train3, train2['target']): # MODEL AND PREDICT WITH QDA clf = QuadraticDiscriminantAnalysis(reg_param=0.5) clf.fit(train3[train_index, :], train2.loc[train_index]['target']) oof[idx1[test_index]] = clf.predict_proba(train3[test_index, :])[:, 1] preds[idx2] += clf.predict_proba(test3)[:, 1] / skf.n_splits #if i%64==0: print(i) # PRINT CV AUC
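# A hedged sanity check on the threshold=1.5 above: if the unhelpful columns
# are roughly standard normal noise (variance ~1) while the informative ones
# come from a wider mixture, 1.5 cleanly separates the two groups. Synthetic
# sketch under that assumption:
import numpy as np
rng = np.random.RandomState(42)
noise = rng.randn(512)                                # variance ~ 1
signal = rng.randn(512) * 2 + rng.choice([-2, 2], size=512)
print(noise.var(), signal.var())                      # ~1 vs ~8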
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.4, random_state=123)

# select from model
clf_feat = ExtraTreesClassifier()
clf_feat.fit(X_train, y_train)
model = SelectFromModel(clf_feat, prefit=True)
#X_train = model.transform(X_train)
#X_test = model.transform(X_test)

# select with variance: threshold 0.6*(1-0.6) drops boolean features that take
# the same value in more than 60% of samples
sel = VarianceThreshold(threshold=(0.6 * (1 - 0.6)))
X_train = sel.fit_transform(X_train)
X_test = sel.transform(X_test)

#test = SelectKBest(score_func=chi2,k=10)
#fit = test.fit(X_train,y_train)
#X_train = fit.transform(X_train)
#X_test = fit.transform(X_test)

print('shape of train data:', X_train.shape)
print('shape of test data:', X_test.shape)

# check for class imbalance in train & test
print('positive class in train', np.sum(y_train) / y_train.shape[0])
print('positive class in test', np.sum(y_test) / y_test.shape[0])

# default classifier = logistic regression
clf1 = []
allData = hstack([data_prepared_numSparse, catArray]) y = sss[label].values y = y.flatten() # remove categorical variables with low variance selector_variance = VarianceThreshold(threshold=.0025) selector_variance.fit(allData) c = selector_variance.get_support(indices=False) d = selector_variance.get_support(indices=True) featureItemize = featureNumToName.items() featureItemize = [x for x, z in zip(featureItemize, c) if (z == 1)] featureNumToName2 = dict([(i, x[1]) for i, x in enumerate(featureItemize)]) allDataVarThreshold = selector_variance.transform(allData) # Perform l1 feature selection clf_l = linear_model.LogisticRegression(C=.07, penalty='l1', tol=1e-6, max_iter=500) std_scaler = StandardScaler() allDataScaled = std_scaler.fit_transform(allDataVarThreshold.toarray()) clf_l.fit(allDataScaled, y) selector_l1 = SelectFromModel(clf_l, prefit=True) c = selector_l1.get_support(indices=False) d = selector_l1.get_support(indices=True)
score.append(once)

plt.plot(threshold, score)
plt.show()

# In[]:
# Wrapper method:
from sklearn.feature_selection import RFE

RFC_ = RFC(n_estimators=10, random_state=0)
# Recursive elimination: drop 50 features per iteration
selector = RFE(RFC_, n_features_to_select=340, step=50).fit(X, y)
selector.support_.sum()  # boolean mask of the features kept at the end; sums to 340
selector.ranking_  # ranking of the features by aggregated importance across the iterations
X_wrapper = selector.transform(X)
cross_val_score(RFC_, X_wrapper, y, cv=5).mean()

# In[]:
# Learning curve:
# ====== [TIME WARNING: 15 mins] ======
score = []
for i in range(1, 751, 50):
    X_wrapper = RFE(RFC_, n_features_to_select=i, step=50).fit_transform(X, y)
    once = cross_val_score(RFC_, X_wrapper, y, cv=5).mean()
    score.append(once)
plt.figure(figsize=[20, 5])
plt.plot(range(1, 751, 50), score)
plt.xticks(range(1, 751, 50))
plt.show()
# test[col] = sc.transform(test[col].values) ### create feature matrix and target vector X = train.drop(["id", "loss"], axis=1).as_matrix() y = np.array(train["loss"].values) ### Feature reduction (optional) sel = VarianceThreshold() sel.fit(X, y) print "Train before removing low-variance features", X.shape X = sel.transform(X) print "Train after removing low-variance features", X.shape ### define models and hyperparameters lr = LinearRegression() br = BayesianRidge() net = ElasticNetCV(l1_ratio=[.1, .7, .95, .99, 1], normalize=False) rf = RandomForestRegressor(n_estimators=75) ### build neural net model early_stopping = EarlyStopping(monitor='val_loss', patience=2, mode="auto") X_train, X_val, y_train, y_val = train_test_split(X,
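# Note (sketch): VarianceThreshold is unsupervised, so the y passed to
# sel.fit(X, y) above is accepted only for pipeline compatibility and ignored;
# fitting on X alone learns exactly the same mask:
assert (VarianceThreshold().fit(X).get_support() == sel.get_support()).all()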
def main(): args = getOptions() print args if args.model == 'gBoosting': fn = ("submission_%s_gBoosting_%s_%s_%s_%s_%s.csv" % (args.fts, args.loss, str(args.minsamplessplit), str(args.lrate).replace('.','dian'),str(args.nest),str(args.maxdepth))) elif args.model == 'randomForest': fn = ("submission_%s_randomForest_%s.csv" % (args.fts, args.nest)) print fn print "train file read" train_x, train_y = readfile_noid(args.train,'train') train_x_new, id = extractID(train_x) train_x_clean, contentdict = cityclean(train_x_new) del id print "test file read" test_x, test_y = readfile_noid(args.test,'test') test_x_new, id = extractID(test_x) test_x_clean, contentdict = cityclean(test_x_new, contentdict) #remove feature with no distinction and less important print "remove feature with no distinction and less important" sel = VarianceThreshold() train_x_uniq = sel.fit_transform(train_x_clean) test_x_uniq = sel.transform(test_x_clean) # indices = [i for i in range(len(train_x[0]))] # frqIndex = trimfrq(train_x) # for i in frqIndex: # indices.remove(i) # train_x_uniq = indexTodata(train_x, indices) # test_x_uniq = indexTodata(test_x, indices) #normalization print "normalization" train_x_nor, mean, std = normalize(train_x_uniq) test_x_nor, mean, std = normalize(test_x_uniq, mean, std) #feature selection print "feature selection" if args.fts == 'cor': train_x_sel, test_x_sel = correlationSelect(train_x_nor, train_y, test_x_nor) elif args.fts == 'extraTrees': train_x_sel, test_x_sel = ExtraTreesSelect(train_x_nor, train_y, test_x_nor) elif args.fts == 'randomTree': train_x_sel, test_x_sel = randomTreesSelect(train_x_nor, train_y, test_x_nor) else: train_x_sel = copy.deepcopy(train_x_nor) test_x_sel = copy.deepcopy(test_x_nor) print len(train_x_nor[0]) print len(train_x_sel[0]) del train_x_nor, test_x_nor, train_x_uniq, test_x_uniq print "modelsing" if args.model == 'gBoosting': clf = GradientBoostingClassifier(loss=args.loss, learning_rate=args.lrate, n_estimators=args.nest, max_depth=args.maxdepth, min_samples_split=args.minsamplessplit, verbose=1) elif args.model == 'randomForest': clf = RandomForestClassifier(n_estimators=args.nest, class_weight='auto') clf.fit(train_x_sel, train_y) train_pdt = clf.predict(train_x_sel) MCC, Acc_p , Acc_n, Acc_all = get_Accs(train_y, train_pdt) print "MCC, Acc_p , Acc_n, Acc_all(train): " print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all)) test_pdt = clf.predict_proba(test_x_sel) # MCC, Acc_p , Acc_n, Acc_all = get_Accs(test_y, test_pdt) # print "MCC, Acc_p , Acc_n, Acc_all(test): " # print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all)) fout=open(fn,'w') fout.write("ID,target\n") for index, eachline in enumerate(test_pdt): fout.write("%s,%s\n" % (str(int(id[index])),str(test_pdt[index][1]))) fout.close()
def sample_data(X, Y, value=0): XX=[] for i in xrange(len(Y)): if Y[i]==value: XX.append(X[i]) return XX out=open(sys.argv[1],"r") model=svm.OneClassSVM(kernel='rbf') X, Y = read_fea(sys.argv[1]) sel = VarianceThreshold(threshold=0) model.fit(sample_data(sel.fit_transform(X),Y, 1)) warning("useful features dim: "+str(len(sel.get_support(True)))) if hasattr(model,'score'): warning("accuracy on training set: "+str(model.score(sel.transform(X), Y))) if len(sys.argv)>2: X, Y = read_fea(sys.argv[2]) warning("accuracy on cv set: "+str(model.score(sel.transform(X), Y))) if len(sys.argv)>3: X, Y = read_fea(sys.argv[3]) warning("accuracy on dev set: "+str(model.score(sel.transform(X), Y))) if len(sys.argv)>4: ref = model.decision_function(sel.transform(X)) X, Y = read_fea(sys.argv[4], True) Z = model.decision_function(sel.transform(X)).tolist() Z = (Z-ref.mean())/ref.std() for i in xrange(len(Y)): print('S'+str(Y[i])+' '+str(Z[i]))
# test_with_new_data.py
# This python script will first train the svm with training data set
# then test it against the training data set provided
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

import VarianceThresholdTest
import dataframe

train_x, train_y = dataframe.get_dataset_from_file('proper.train.data')
test_x, test_y = dataframe.get_dataset_from_file('corrected')

v_threshold = 0.15
debug = True

selector = VarianceThreshold(v_threshold)
# fit on the training features, then apply the same mask to both sets
new_train_x = selector.fit_transform(train_x)
new_test_x = selector.transform(test_x)

if debug:
    print 'After fit'
    print 'Train contains %d features' % len(new_train_x[0])
    print 'Test contains %d features' % len(new_test_x[0])
sel.fit(X_train) # fit finds the features with low variance sum(sel.get_support()) # how many not quasi-constant? # In[8]: features_to_keep = X_train.columns[sel.get_support()] # In[9]: X_train = sel.transform(X_train) X_test = sel.transform(X_test) X_train.shape, X_test.shape # In[10]: # sklearn transformations lead to numpy arrays # here I transform the arrays back to dataframes # please be mindful of getting the columns assigned # correctly # In[11]:
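# A minimal sketch of the reconstruction described above, assuming X_train and
# X_test are now numpy arrays and features_to_keep holds the surviving columns:
import pandas as pd
X_train = pd.DataFrame(X_train, columns=features_to_keep)
X_test = pd.DataFrame(X_test, columns=features_to_keep)
X_train.head()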
import matplotlib.pyplot as plt

# Import Data
df = pd.read_csv(r'C:\Utkarsh\GIT\Python\PredictSatisfiedCustomers\Data\train.csv')
df_test = pd.read_csv(r'C:\Utkarsh\GIT\Python\PredictSatisfiedCustomers\Data\test.csv')
y = df['TARGET']
df = df.drop('TARGET', axis=1)
df = df.drop('ID', axis=1)
df_test = df_test.drop('ID', axis=1)

# Drop the columns with the least variance
sel2 = VarianceThreshold(threshold=.9)
np2 = sel2.fit_transform(df)
df = pd.DataFrame(np2)
np_test2 = sel2.transform(df_test)
df_prediction = pd.DataFrame(np_test2)

# Hold out a validation split to guard against over fitting
df_fit, df_eval, y_fit, y_eval = train_test_split(df, y, test_size=0.1, random_state=2)

# First predictive model using XGBoost
xgboosting_model = xgb.XGBClassifier(missing=9999999999, max_depth=5, n_estimators=100,
                                     learning_rate=0.1, nthread=4, subsample=.7)
xgboosting_model.fit(df_fit, y_fit)
predict_target = xgboosting_model.predict_proba(df_eval)[:, 1]
validAUC = auc(y_eval, predict_target)
print("AUC with missing value imputation: " + str(validAUC))

# ROC curve and comparison with other models
names = ["etsc", "abc", "xgb", "gbc"]
# In[39]: ### if we sum over get_support, we get the number of features that are not constant # In[178]: sum(sel.get_support()) # In[179]: x_train = sel.transform(x_train) test = sel.transform(test) # In[180]: test.shape # In[181]: x_train.shape
class Model(object): __metaclass__ = ABCMeta default_train_file = "train_subset.pickle" default_test_file = "test_subset.pickle" default_model_param_file = "model_param.pickle" def __init__(self, **kwargs): self.verbose = kwargs.get("verbose", 1) if self.verbose: print "Opening HTML zip file" self.__html_zip = zipfile.ZipFile(kwargs.get("html_cleaned_zip", config.html_cleaned_zip)) self.__train_classes_filename = kwargs.get("train_file", self.default_train_file) self.__test_classes_filename = kwargs.get("test_file", self.default_test_file) self.__predict_classes_in_filename = kwargs.get("predict_in_file", None) self.__predict_classes_out_filename = kwargs.get("predict_out_file", None) self.__use_tfidf = kwargs.get("use_tfidx", False) self.__tfidf_transformer = None self.__use_variance_threshold = kwargs.get("use_variance_threshold", False) self.__variance_threshold = 0.8 self.__variance_threshold_selector = None self.__model_param_filename = kwargs.get("model_param_file", self.default_model_param_file) self.__dtype = kwargs.get("dtype", np.float32) self.__filenames = [] self.__contents = [] self.__is_file_handle = True self.__class_vector = [] if "vocabulary_file" in kwargs: self.__vocabulary = sorted(load_pickle(kwargs["vocabulary_file"])) else: self.__vocabulary = None self.__docmat = None def __load(self, filename, use_file_handles): self.__is_file_handle = use_file_handles if self.verbose: print "Reading data" with open(filename, "r") as pf: classes = pickle.load(pf) self.__file_names = classes.keys() self.__class_vector = np.empty(len(self.__file_names), dtype=self.__dtype) self.__content = [] for i, f in enumerate(self.__file_names): self.__class_vector[i] = classes[f] def iterfn(f): if self.__is_file_handle: return self.__html_zip.open(f, "r") else: with self.__html_zip.open(f, "r") as zf: return self.__content.append(zf.read()) self.__contents = imap(iterfn, self.__file_names) def load_training_data(self, use_file_handles=True): self.__load(self.__train_classes_filename, use_file_handles) def load_testing_data(self, use_file_handles=True): self.__load(self.__test_classes_filename, use_file_handles) def load_prediction_data(self, use_file_handles=True): self.__load(self.__predict_classes_in_filename, use_file_handles) def save_prediction_data(self): if self.verbose: print "Writing data" classes = {} for i, f in enumerate(self.__file_names): classes[f] = 1 if self.__class_vector[i] >= 0.5 else 0 with open(self.__predict_classes_out_filename, "w") as pf: pickle.dump(classes, pf, pickle.HIGHEST_PROTOCOL) def make_word_vectors(self): if self.verbose: print "Computing word vectors" if self.__vocabulary is None: cv = CountVectorizer( stop_words=config.common_words, input=("file" if self.__is_file_handle else "content"), dtype=self.__dtype, ) self.__docmat = cv.fit_transform(self.__contents) self.__vocabulary = cv.vocabulary_ else: cv = CountVectorizer( stop_words=config.common_words, input=("file" if self.__is_file_handle else "content"), dtype=self.__dtype, vocabulary=self.__vocabulary, ) self.__docmat = cv.transform(self.__contents) if self.__tfidf_transformer is None: if self.__use_tfidf: self.__tfidf_transformer = TfidfTransformer() self.__docmat = self.__tfidf_transformer.fit_transform(self.__docmat) else: self.__docmat = self.__tfidf_transformer.transform(self.__docmat) print "BEFORE", self.__docmat.shape if self.__variance_threshold_selector is None: if self.__use_variance_threshold: self.__variance_threshold_selector = VarianceThreshold( self.__variance_threshold * (1.0 - 
self.__variance_threshold) ) self.__docmat = self.__variance_threshold_selector.fit_transform(self.__docmat) else: self.__docmat = self.__variance_threshold_selector.transform(self.__docmat) print "AFTER ", self.__docmat.shape def get_document_matrix(self): return self.__docmat def get_document_class_vector(self): return self.__class_vector def set_document_class_vector(self, class_vector): self.__class_vector = class_vector @abstractmethod def getstate(self): return {"vocabulary": self.__vocabulary, "var_thresh_sel": self.__variance_threshold_selector} @abstractmethod def setstate(self, state): self.__vocabulary = state["vocabulary"] self.__variance_threshold_selector = state["var_thresh_sel"] def save(self): if self.verbose: print "Saving model" with open(self.__model_param_filename, "w") as pf: pickle.dump(self.getstate(), pf, pickle.HIGHEST_PROTOCOL) def load(self): if self.verbose: print "Loading model" with open(self.__model_param_filename, "r") as pf: state = pickle.load(pf) self.setstate(state) @abstractmethod def train(self): pass @abstractmethod def test(self): pass @abstractmethod def predict(self): pass