def select_indices():
    """Save the indices of features that vary in both train and test data.

    Loads the padded caffe features of the test set and of the train set,
    fits a default ``VarianceThreshold`` on each, and keeps only the column
    indices selected for both. Every kept index is incremented by one and
    the result is written as a single CSV row to
    ``caffefeature_indices_padded.csv`` in the current working directory.

    Returns:
        None. The only outputs are the progress messages and the CSV file.
    """
    print('reading in features')
    test_features = Input.load_testdata_caffefeatures(padded=True)
    train_features = Input.load_traindata_caffefeatures(padded=True)

    print('selecting indices')
    # Get indices of features that have a non-zero variance in the test data.
    # Only the support mask is needed, so fit() is enough; the transformed
    # matrix that fit_transform() would build was never used.
    selector1 = VarianceThreshold()
    selector1.fit(test_features)
    indices_test = selector1.get_support(indices=True)

    # Get indices of features that have a non-zero variance in the train data.
    selector2 = VarianceThreshold()
    selector2.fit(train_features)
    indices_train = selector2.get_support(indices=True)

    # Only keep indices that have variance in both test and train data.
    # Set iteration order is not guaranteed, so sort to make the written
    # file deterministic.
    indices = sorted(set(indices_test) & set(indices_train))

    # Add 1 to all indices.
    # NOTE(review): presumably the consumer of this file treats column 0 as
    # a non-feature column -- confirm against the loader.
    indices = [x + 1 for x in indices]

    # Save indices to csv file. The context manager guarantees the row is
    # flushed and the handle closed even if writing fails.
    # NOTE: 'wb' is the mode the Python 2 csv module expects; a Python 3
    # port would need open(..., 'w', newline='') instead.
    with open('caffefeature_indices_padded.csv', 'wb') as myfile:
        wr = csv.writer(myfile)
        wr.writerow(indices)
# Manual smoke test for the caffe-feature loaders in ``Input``.
# Each entry is (loader, keyword arguments, expectation to print). After every
# call the shape of the returned features is printed, followed by the
# expectation, so the two can be compared by eye.
# NOTE(review): the 'should print error message' cases still access
# ``features.shape`` afterwards; if the loader returns None on invalid
# arguments that access will raise -- confirm the loader's behaviour.
_loader_checks = [
    (Input.load_traindata_caffefeatures,
     {'userows': range(3000, 5500)},
     'should be: 2500x3983'),
    (Input.load_validationset_caffefeatures,
     {'featureSelectionMethod': 'chi2', 'Percentile': 100},
     'should be: 8061x3983'),
    # Unknown feature selection method.
    (Input.load_validationset_caffefeatures,
     {'featureSelectionMethod': 'hoi', 'Percentile': 90},
     'should print error message'),
    # Percentile above 100.
    (Input.load_validationset_caffefeatures,
     {'featureSelectionMethod': 'chi2', 'Percentile': 210},
     'should print error message'),
    (Input.load_traindata_caffefeatures,
     {'featureSelectionMethod': 'chi2', 'Percentile': 5},
     'should be: 22424x200'),
    (Input.load_testdata_caffefeatures,
     {'featureSelectionMethod': 'chi2', 'Percentile': 2,
      'userows': range(20200, 30200)},
     'should be: 10000x80'),
]

for _loader, _kwargs, _expectation in _loader_checks:
    features = _loader(**_kwargs)
    print(features.shape)
    print(_expectation)
import time

import pandas as pd
from sklearn.ensemble import RandomForestClassifier

from IO import Input, Output

# Script: train a random forest on the caffe features of the train data.
# Each stage reports its wall-clock duration, rounded to two decimals.

# --- Stage 1: load the data -------------------------------------------------
start_time = time.time()

# Train features and their labels.
df_traindata_caf = Input.load_traindata_caffefeatures()
df_traindata_lab = Input.load_traindata_labels()

# Test features.
df_testdata_caf = Input.load_testdata_caffefeatures()

elapsed = round(time.time() - start_time, 2)
print("--- load data: %s seconds ---" % elapsed)

# --- Stage 2: fit the model -------------------------------------------------
start_time = time.time()

# Conventional estimator-facing names for the loaded data.
x_train, y_train, x_test = df_traindata_caf, df_traindata_lab, df_testdata_caf

# Random forest with 500 trees, fitted on the full train set.
rf = RandomForestClassifier(n_estimators=500)
rf.fit(x_train, y_train)

elapsed = round(time.time() - start_time, 2)
print("--- train model: %s seconds ---" % elapsed)

# Restart the timer for the next stage.
start_time = time.time()

# Predict