def process(discrete, cont):
    # Create discrete and continuous data matrices
    discrete_X = np.array(discrete)
    cont_X = np.array(cont)
    # Impute discrete values
    imp = Imputer(strategy='most_frequent')
    discrete_X = imp.fit_transform(discrete_X)
    # Impute continuous values
    imp_c = Imputer(strategy='mean')
    cont_X = imp_c.fit_transform(cont_X)
    # Discrete basis representation
    enc = OneHotEncoder()
    enc.fit(discrete_X)
    discrete_X = enc.transform(discrete_X).toarray()
    # Continuous scaling
    scaler = StandardScaler()
    scaler.fit(cont_X)
    cont_X = scaler.transform(cont_X)
    # Merge to one array
    X = np.concatenate((discrete_X, cont_X), axis=1)
    return X
def preprocess(data, feat_type):
    # Replace missing values by the most common value if categorical
    # and by the mean if numerical.
    try:
        if data.getformat() == 'csr':
            return data
    except AttributeError:
        print(feat_type)
    # separate numerical and categorical columns
    idx_num = [i for i in range(len(feat_type)) if feat_type[i] == 'Numerical']
    data_num = data[:, idx_num]
    idx_cat = [i for i in range(len(feat_type)) if feat_type[i] == 'Categorical']
    data_cat = data[:, idx_cat]
    # fill missing values
    imp_num = Imputer(axis=0)
    data_num = imp_num.fit_transform(data_num)
    imp_cat = Imputer(axis=0, strategy='most_frequent')
    data_cat = imp_cat.fit_transform(data_cat)
    # subtract the mean and divide by the standard deviation
    data_num = scale(data_num)
    # one-hot encode using pandas; done column by column because of pandas
    data_cat_pd = pd.DataFrame(data_cat)
    for i in range(data_cat.shape[1]):
        data_cat_pd = pd.concat((data_cat_pd, pd.get_dummies(data_cat[:, i])),
                                join='outer', axis=1)
    # delete the columns that have been one-hot encoded; need to rename first,
    # otherwise some columns may be suppressed unwillingly
    data_cat_pd.columns = [i for i in range(data_cat_pd.shape[1])]
    # after renaming, the original categorical columns carry the labels
    # 0..n-1, so they can be dropped by label
    data_cat_pd = data_cat_pd.drop(list(range(data_cat.shape[1])), axis=1)
    data_cat = np.asarray(data_cat_pd)
    # regroup categorical and numerical variables
    return np.hstack((data_num, data_cat))
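# Both functions above hand-roll the same split / impute / encode / scale
# pattern. A minimal sketch of that pattern on the modern API (Imputer was
# deprecated in scikit-learn 0.20 in favour of sklearn.impute.SimpleImputer);
# num_cols and cat_cols are hypothetical column-index lists, not names taken
# from the code above.
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

def make_preprocessor(num_cols, cat_cols):
    numeric = Pipeline([('impute', SimpleImputer(strategy='mean')),
                        ('scale', StandardScaler())])
    categorical = Pipeline([('impute', SimpleImputer(strategy='most_frequent')),
                            ('onehot', OneHotEncoder(handle_unknown='ignore'))])
    # ColumnTransformer concatenates both blocks, replacing the manual
    # np.concatenate/np.hstack calls above
    return ColumnTransformer([('num', numeric, num_cols),
                              ('cat', categorical, cat_cols)])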
def predict(self, raw_array, results, aux_data_a_d=None, diff=False,
            holdout_col=0, lag=1, positive_control=False, **kwargs):
    """
    Given the input results model, predicts the year of data immediately
    succeeding the last year of the input array. Axis 0 indexes observations
    (schools) and axis 1 indexes years. For holdout_col > 0, the last
    holdout_col years of data will be withheld from the prediction, which is
    ideal for finding the error of the algorithm.
    """
    if positive_control:
        if holdout_col > 0:
            if diff:
                if holdout_col == 1:
                    control_array = np.diff(raw_array[:, -2:], 1, axis=1)
                else:
                    control_array = \
                        np.diff(raw_array[:, -holdout_col-1:-holdout_col+1],
                                1, axis=1)
            else:
                control_array = raw_array[:, -holdout_col]
        else:
            control_array = np.random.randn(raw_array.shape[0], 1)
    if holdout_col > 0:
        raw_array = raw_array[:, :-holdout_col]
    prediction_raw_array = raw_array
    if diff:
        array = np.diff(raw_array, 1, axis=1)
        X = array[:, -lag:]
        if positive_control:
            X = np.concatenate((X, control_array.reshape(-1, 1)), axis=1)
        if aux_data_a_d:
            for feature_s in aux_data_a_d:
                if holdout_col > 0:
                    raw_array = aux_data_a_d[feature_s][:, :-holdout_col]
                else:
                    raw_array = aux_data_a_d[feature_s]
                array = np.diff(raw_array, 1, axis=1)
                X = np.concatenate((X, array[:, -lag:]), axis=1)
        estimatorX = Imputer(axis=0)
        X = estimatorX.fit_transform(X)
        predicted_change_a = results.predict(X)
        estimator_orig = Imputer(axis=0)
        orig_a = estimator_orig.fit_transform(
            prediction_raw_array[:, -1].reshape(-1, 1))
        prediction_a = orig_a + predicted_change_a.reshape(-1, 1)
    else:
        array = raw_array
        X = array[:, -lag:]
        if positive_control:
            X = np.concatenate((X, control_array.reshape(-1, 1)), axis=1)
        if aux_data_a_d:
            for feature_s in aux_data_a_d:
                if holdout_col > 0:
                    raw_array = aux_data_a_d[feature_s][:, :-holdout_col]
                else:
                    raw_array = aux_data_a_d[feature_s]
                array = raw_array
                X = np.concatenate((X, array[:, -lag:]), axis=1)
        estimatorX = Imputer(axis=0)
        X = estimatorX.fit_transform(X)
        prediction_a = results.predict(X)
    return prediction_a.reshape((-1, 1))
def fill_missing_values(_df, dis_features, cont_features):
    # for discrete features we will use the 'most_frequent' strategy
    imp_discrete = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
    _df[dis_features] = imp_discrete.fit_transform(_df[dis_features].values)
    # for continuous features we will use the 'mean' strategy
    imp_continuous = Imputer(missing_values='NaN', strategy='mean', axis=0)
    _df[cont_features] = imp_continuous.fit_transform(_df[cont_features].values)
    return _df
def main():
    weather, train, spray, test = load_data()
    target = train.WnvPresent.values
    idcol = test.Id.values
    weather = wnvutils.clean_weather(weather)
    train = wnvutils.clean_train_test(train)
    test = wnvutils.clean_train_test(test)
    train, test = wnvutils.clean_train_test2(train, test)
    train = train.merge(weather, on="Date")
    test = test.merge(weather, on="Date")
    train.drop("Date", axis=1, inplace=True)
    test.drop("Date", axis=1, inplace=True)
    desc_df(train)
    train = train.loc[:, pd.notnull(train).any(axis=0)]
    test = test.loc[:, pd.notnull(test).any(axis=0)]

    def min_dist_to_spray_(x):
        return wnvutils.min_dist_to_spray(x.Latitude, x.Longitude, spray)

    train["DistToSpray"] = train.apply(min_dist_to_spray_, axis=1)
    test["DistToSpray"] = test.apply(min_dist_to_spray_, axis=1)
    desc_df(train)
    imputer = Imputer()
    traina = imputer.fit_transform(train)
    testa = imputer.fit_transform(test)
    training = np.random.choice([True, False], size=train.shape[0], p=[0.8, 0.2])
    rfc = ensemble.RandomForestClassifier()  # oob_score=True)
    rfc.fit(traina[training], target[training])
    # print("oob score:", rfc.oob_score_)
    # the with-block is restored so that the print(..., file=fout) call
    # below has a valid file handle
    with open("output/feature_imp.txt", "w") as fout:
        for name, imp in sorted(zip(train.columns, rfc.feature_importances_),
                                key=lambda x: x[1], reverse=True):
            print(name, ":", imp)
            print(name, ":", imp, file=fout)
    predictions = rfc.predict(traina[~training])
    print("Accuracy:", (predictions == target[~training]).mean())
    predictions = rfc.predict_proba(traina[~training])
    np.savetxt("/tmp/predictions.txt", predictions[:, 1])
    print(predictions[:, 1])
    print("ROC AUC Score:", roc_auc_score(target[~training], predictions[:, 1]))
def test_model(data, stat_as_index, make_vector, model, do_pca=False,
               target='score'):
    # compile the stats
    print('Compiling stats...')
    fv, sc = [], []
    for year in ['2014', '2015', '2016']:
        f, s = build_fvs(data, year, stat_as_index, make_vector, target)
        fv.append(f)
        sc.append(s)
    # Compile into single vectors: predict 2016 from 2014 and 2015
    fv_train, fv_test = np.vstack(fv[0:2]), fv[2]
    sc_train, sc_test = np.concatenate(sc[0:2]), sc[2]
    # Impute NaNs; zero out NaNs in the first row so no column is entirely
    # NaN and the Imputer keeps every column
    train_nan = np.isnan(fv_train)
    test_nan = np.isnan(fv_test)
    for i in range(fv_train.shape[1]):
        if np.isnan(fv_train[0, i]):
            fv_train[0, i] = 0
    for i in range(fv_test.shape[1]):
        if np.isnan(fv_test[0, i]):
            fv_test[0, i] = 0
    print('Imputing...')
    if train_nan.any():
        i1 = Imputer()
        fv_train = i1.fit_transform(fv_train)
        # print(i1.statistics_)
    if test_nan.any():
        i2 = Imputer()
        fv_test = i2.fit_transform(fv_test)
        # print(i2.statistics_)
    if do_pca:
        print('Performing PCA...')
        pca = PCA(whiten=True)
        fv_train = pca.fit_transform(fv_train)
        fv_test = pca.transform(fv_test)
    print('Building test/train sets...')
    # Exclude players with missing scores
    train_nan, test_nan = np.isnan(sc_train), np.isnan(sc_test)
    fv_train, sc_train = fv_train[~train_nan], sc_train[~train_nan]
    fv_test, sc_test = fv_test[~test_nan], sc_test[~test_nan]
    print('Building model...')
    # Build the model
    mod = model
    mod.fit(fv_train, sc_train)
    print('Predicting output...')
    # kluge to allow for both classifier and regressor evaluation
    try:
        pred = mod.predict_proba(fv_test)
    except AttributeError:
        pred = mod.predict(fv_test)
    return pred, sc_test, mod
def fillData(trainFeatures, testFeatures, missing_values=np.NaN,
             strategy='mean', axis=0, verbose=0, copy=True, all=True):
    imp = Imputer(missing_values, strategy, axis, verbose, copy)
    if all:
        trainCount = len(trainFeatures)
        full = np.vstack((trainFeatures, testFeatures))
        full = imp.fit_transform(full)
        trainFeatures, testFeatures = (np.array(full[:trainCount]),
                                       np.array(full[trainCount:]))
        return trainFeatures, testFeatures
    else:
        return imp.fit_transform(trainFeatures), imp.fit_transform(testFeatures)
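# fillData's all=True branch fits the imputer on train and test stacked
# together, so test-set statistics leak into the fill values. A minimal
# leak-free sketch using the same Imputer API: fit on the training features
# only, then apply the learned column statistics to the test features.
def fillDataNoLeak(trainFeatures, testFeatures, strategy='mean'):
    imp = Imputer(missing_values=np.NaN, strategy=strategy, axis=0)
    trainFeatures = imp.fit_transform(trainFeatures)  # learn the means here
    testFeatures = imp.transform(testFeatures)        # reuse them here
    return trainFeatures, testFeatures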
def fill_missing_imputation(electionsData, most_frequent):
    most_frequent = electionsData.columns.intersection(most_frequent)
    im = Imputer(strategy="most_frequent")
    electionsData[most_frequent] = im.fit_transform(electionsData[most_frequent])
    # Fill all of the rest (numeric) using the median
    im = Imputer(strategy="median")
    electionsData[:] = im.fit_transform(electionsData[:])
def imputing_most_frequent(dataset):
    '''
    :param dataset: pandas DataFrame dataset.
    :return: The same dataset where the missing values are replaced with the
             column's most common value
    '''
    imp = Imputer(missing_values='NaN', strategy='most_frequent', copy=False)
    # assign the result back: fit_transform returns an array and does not
    # reliably fill a DataFrame in place, even with copy=False
    dataset[:] = imp.fit_transform(dataset)
    return dataset
def test_imputation_shape(self):
    """Verify the shapes of the imputed matrix for different strategies."""
    X = np.random.randn(10, 2)
    X[::2] = np.nan
    for strategy in ['mean', 'median', 'most_frequent']:
        imputer = Imputer(strategy=strategy)
        X_imputed = imputer.fit_transform(X)
        assert_equal(X_imputed.shape, (10, 2))
        X_imputed = imputer.fit_transform(sparse.csr_matrix(X))
        assert_equal(X_imputed.shape, (10, 2))
def preprocess(self):
    # impute missing values
    true_ids = set([urlid for urlid, label in self.target.items() if label])
    true_data = [v for k, v in self.data.items() if k in true_ids]
    false_data = [v for k, v in self.data.items() if k not in true_ids]
    self.target = ([1 for x in range(len(true_data))] +
                   [0 for x in range(len(false_data))])
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    true_data = imp.fit_transform(true_data)
    false_data = imp.fit_transform(false_data)
    self.data = np.concatenate((true_data, false_data), axis=0)
    self.test_data = imp.fit_transform(list(self.test_data.values()))
def median_impute(self):
    """ impute missing values (coded as -1) with the per-column median """
    tr = HFile(self.trfile)
    te = HFile(self.tefile)
    self.attributes = tr.attributes
    self.class_index = tr.class_index
    imp = Imputer(missing_values=-1, strategy='median')
    self.tr = imp.fit_transform(tr.data)
    self.ta = tr.classes
    self.te = imp.fit_transform(te.data)
def solve_missing_values(data):
    """
    Fill in missing values

    Parameters
    ----------
    data: values whose missing entries should be imputed
    """
    from sklearn.preprocessing import Imputer
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    # fit_transform returns the filled array; it does not modify data in place
    data = imp.fit_transform(data)
    return data
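# A point worth making explicit for helpers like solve_missing_values and
# imputing_most_frequent above: with the default copy=True,
# Imputer.fit_transform returns a filled copy and leaves its input untouched,
# which is why the result has to be assigned back. A quick self-contained
# check (a sketch, not from the original code):
import numpy as np
from sklearn.preprocessing import Imputer

a = np.array([[1.0, np.nan], [3.0, 4.0]])
filled = Imputer(strategy='mean', axis=0).fit_transform(a)
assert np.isnan(a).any()           # the input still contains NaN
assert not np.isnan(filled).any()  # the returned copy is filled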
def get_modelKmeans():
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    # Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
    # benign_h2o.summary()
    benign_sci = np.genfromtxt(pyunit_utils.locate("smalldata/logreg/benign.csv"),
                               delimiter=",")
    # Impute missing values with column mean
    imp = Imputer(missing_values="NaN", strategy="mean", axis=0)
    benign_sci = imp.fit_transform(benign_sci)

    for i in range(2, 7):
        # Log.info("H2O K-Means")
        km_h2o = H2OKMeansEstimator(k=i)
        km_h2o.train(x=range(benign_h2o.ncol), training_frame=benign_h2o)
        km_h2o.show()
        model = h2o.get_model(km_h2o._id)
        model.show()
        km_sci = KMeans(n_clusters=i, init="k-means++", n_init=1)
        km_sci.fit(benign_sci)
        print("scikit-learn centers")
        print(km_sci.cluster_centers_)
def get_features(frame):
    '''
    Transforms and scales the input data and returns a numpy array that
    is suitable for use with scikit-learn.

    Note that in unsupervised learning there are no labels.
    '''
    # Replace missing values with 0.0
    # or we can use scikit-learn to calculate missing values below
    # frame[frame.isnull()] = 0.0

    # Convert values to floats
    arr = np.array(frame, dtype=np.float)

    # Impute missing values from the mean of their entire column
    from sklearn.preprocessing import Imputer
    imputer = Imputer(strategy='mean')
    arr = imputer.fit_transform(arr)

    # Normalize the entire data set to mean=0.0 and variance=1.0
    from sklearn.preprocessing import scale
    arr = scale(arr)
    return arr
def gettestdata(fil):
    data = np.genfromtxt(fil, delimiter=',')
    imp = Imputer(missing_values='NaN', strategy='median', axis=0)
    X = imp.fit_transform(data[:, 2:])
    X = scale(X).copy()
    # spr.eliminate_zeros()
    return np.array(X)
def benignKmeans():
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    # Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
    # benign_h2o.summary()
    benign_sci = np.genfromtxt(pyunit_utils.locate("smalldata/logreg/benign.csv"),
                               delimiter=",")
    # Impute missing values with column mean
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    benign_sci = imp.fit_transform(benign_sci)

    # Log.info(paste("H2O K-Means with ", i, " clusters:\n", sep = ""))
    from h2o.estimators.kmeans import H2OKMeansEstimator
    for i in range(1, 7):
        benign_h2o_km = H2OKMeansEstimator(k=i)
        benign_h2o_km.train(x=range(benign_h2o.ncol), training_frame=benign_h2o)
        print("H2O centers")
        print(benign_h2o_km.centers())
        benign_sci_km = KMeans(n_clusters=i, init='k-means++', n_init=1)
        benign_sci_km.fit(benign_sci)
        print("scikit-learn centers")
        print(benign_sci_km.cluster_centers_)
def run_whole_video(exp_folder, lims_ID):
    # initialize video pointer for the video of interest based on lims ID
    file_string = get_file_string(exp_folder, lims_ID)
    video_pointer = cv2.VideoCapture(file_string)

    # import wheel data
    wheel = joblib.load('dxds2.pkl')
    first_non_nan = next(x for x in wheel if not isnan(x))
    first_index = np.where(wheel == first_non_nan)[0]
    k = first_index[0]
    imp = Imputer(missing_values='NaN', strategy='mean')
    wheel = imp.fit_transform(wheel)
    wheel = preprocessing.MinMaxScaler((-1, 1)).fit(wheel).transform(wheel)

    # self.video_pointer.set(1, 41000)
    ret, frame = video_pointer.read()

    # crop and convert the frame into the desired format
    frame = cv2.cvtColor(frame[160:400, 100:640], cv2.COLOR_BGR2GRAY)
    prvs = frame
    nex = frame

    # initialize vectors to keep track of data
    count = 0
    mod = 0
    opticals = []
    angles = []
    frames = []

    # length of movie (cv2.cv constant name is from the OpenCV 2.x API)
    limit = int(video_pointer.get(cv2.cv.CV_CAP_PROP_FRAME_COUNT))

    # create hdf file
    hf = h5py.File('data_' + str(lims_ID) + '.h5', 'w')
    g = hf.create_group('feature space')
    vector = np.zeros((limit, 4321))
    table = g.create_dataset('features', data=vector, shape=(limit, 4321))

    while count <= limit:
        prvs = nex
        frames = process_input(prvs)
        ret, frame = video_pointer.read()
        nex = cv2.cvtColor(frame[160:400, 100:640], cv2.COLOR_BGR2GRAY)
        optical = optical_flow(prvs, nex)
        opticals = optical['mag']
        angles = optical['ang']
        vector_data = np.concatenate((np.reshape(wheel[k], (1)),
                                      frames, opticals, angles))
        table[count, :] = vector_data
        count += 1
        if count % 1000 == 0:
            print(count)
def learn():
    global classifier, INPUT
    print(1)
    data = np.genfromtxt(INPUT, delimiter=' ', dtype='f8')
    np.random.shuffle(data)
    n = len(data)
    y = data[:, 1]
    x = data[:, 2:54]
    # test_x = []
    # test_y = []
    train_x = []
    train_y = []
    print(2)
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    x = imp.fit_transform(x)
    print(3)
    for i in range(0, n):
        if y[i] == 0:
            continue
        train_x.append(x[i])
        train_y.append(y[i])
        # if i % 100 == 0:
        #     test_x.append(x[i])
        #     test_y.append(y[i])
        # else:
        #     train_x.append(x[i])
        #     train_y.append(y[i])
    print(4)
    classifier.fit(train_x, train_y)
    print(5)
def impute_and_scale(df, scaling='std'):
    """Impute missing values with mean and scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    """
    df = df.dropna(axis=1, how='all')
    imputer = Imputer(strategy='mean', axis=0)
    mat = imputer.fit_transform(df)
    if scaling is None or scaling.lower() == 'none':
        return pd.DataFrame(mat, columns=df.columns)
    if scaling == 'maxabs':
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()
    mat = scaler.fit_transform(mat)
    df = pd.DataFrame(mat, columns=df.columns)
    return df
def preprocess(data):
    non_sparse_only = True
    use_all_category_only = False
    use_all_impute_mean_mode = False
    if non_sparse_only:
        nominal_samples = data.loc[:, ['var4', 'dummy']]
        onehot_samples = onehot.transform(nominal_samples, ['var4', 'dummy'])
        onehot_samples = pd.DataFrame(onehot_samples.toarray())
        numbered_samples = data.loc[:, ['var7', 'var8', 'var10', 'var11',
                                        'var13', 'var15', 'var17']]
        # var7 and var8 are ordinal; converting to floats (NaNs included)
        # allows mean-imputing of their missing values
        numbered_samples[['var7', 'var8']] = \
            numbered_samples[['var7', 'var8']].convert_objects(convert_numeric=True)
        # all the continuous vars
        other_samples = data.loc[:, 'crimeVar1':'weatherVar236']
        # nothing in this feature
        other_samples = other_samples.drop(['weatherVar115'], axis=1)
        # combine with the cleaned-up other vars
        samples = pd.concat([onehot_samples, numbered_samples, other_samples],
                            axis=1)
        imp_nan = Imputer(missing_values=np.nan, strategy='mean', axis=0)
        samples_imp = imp_nan.fit_transform(samples)
    if use_all_category_only:
        pass  # TODO
    if use_all_impute_mean_mode:
        pass  # TODO
    return samples_imp
def test_3_stage(self):
    from sklearn.preprocessing import Imputer

    infile_name = path_of_data('missing_vals.csv')

    p = Pipeline()

    csv_read_node = p.add(CSVRead(infile_name))
    csv_write_node = p.add(CSVWrite(self._tmp_files.get('out.csv')))
    impute_node = p.add(wrap_and_make_instance(Imputer))

    csv_read_node['output'] > impute_node['X_train']
    impute_node['X_new'] > csv_write_node['input']

    self.run_pipeline(p)

    ctrl_imputer = Imputer()
    ctrl_X_sa = np.genfromtxt(infile_name, dtype=None, delimiter=",",
                              names=True)
    num_type = ctrl_X_sa[0][0].dtype
    ctrl_X_nd, ctrl_X_sa_type = np_sa_to_nd(ctrl_X_sa)
    ctrl_X_new_nd = ctrl_imputer.fit_transform(ctrl_X_nd)
    control = ctrl_X_new_nd

    result = self._tmp_files.csv_read('out.csv', True)

    self.assertTrue(np.allclose(result, control))
def fill_and_remove(self, s_strategy="zeros", l_features=False, b_remove=True):
    '''
    Fill all NaN values in numerical data and then remove data points
    whose features are all equal to zero.
    l_features: a list of features to be tested. If empty, all features
        will be used
    b_remove: boolean indicating whether keys where all data is 0 should
        be removed
    s_strategy: string with the strategy used to fill NaNs. Can be "mean",
        "median" or "zeros"
    '''
    df = self.getData()
    # pre-process data
    if not l_features:
        l_features = self.payments_features + self.stock_features
        l_features += self.email_features
    df.loc[:, l_features] = df.loc[:, l_features].astype(float)
    # fill NaN with the selected strategy
    if s_strategy == "zeros":
        df.loc[:, l_features] = df.loc[:, l_features].fillna(0)
    else:
        na_X = df.loc[:, l_features].values
        imp = Imputer(missing_values='NaN', strategy=s_strategy, axis=0)
        df.loc[:, l_features] = imp.fit_transform(na_X)
    # exclude data points where every number is equal to 0
    if b_remove:
        df = df.loc[((df.loc[:, l_features] != 0).sum(axis=1) != 0), :]
    # save the new dataframe
    self.setData(df)
    # correct the scaled df
    if type(self.df_scaled) != list:
        df2 = self.df_scaled
        df2 = df2.loc[((df.loc[:, l_features] != 0).sum(axis=1) != 0).index, :]
        self.df_scaled = df2
def Train_And_Test(self):
    HOG_data = np.loadtxt('dataset.csv', delimiter=",")
    tmpdata = HOG_data[:, 0:-2]
    target = HOG_data[:, -2]
    print(target)
    tmpdata[tmpdata == 0] = np.nan
    imp = Imputer(missing_values='NaN', strategy='mean')
    data = imp.fit_transform(tmpdata)
    data_train, data_test, target_train, target_test = \
        train_test_split(data, target, test_size=0.3)
    model = SVC(C=1.0, gamma=0.0, kernel='linear', class_weight='auto')
    model.fit(data_train, target_train)
    print(data_train)
    print(target_train)
    opencv_data_train = np.float32(data_train)
    opencv_target_train = np.float32(target_train)
    # cv2.SVM and these params use the OpenCV 2.x API
    svm_params = dict(kernel_type=cv2.SVM_LINEAR, svm_type=cv2.SVM_C_SVC,
                      C=2.67, gamma=5.383)
    svm = cv2.SVM()
    svm.train(opencv_data_train, opencv_target_train, params=svm_params)
    svm.save("hog_classifier.xml")
    print(model)
    expected = target_test
    predicted = model.predict(data_test)
    target_names = ['Not Human', 'Human']
    print(metrics.classification_report(expected, predicted,
                                        target_names=target_names))
    print(metrics.confusion_matrix(expected, predicted))
    print(metrics.roc_curve(expected, predicted))
    pickle.dump(model, open("svm.p", "wb"))
def impute_missing_train(dataframe, missing_values='NaN', strategy='mean'):
    '''
    Given a dataframe, imputes missing values with a given strategy.
    Supported strategies: 'mean', 'median', 'most_frequent'.
    Returns the imputed dataframe plus a dictionary mapping each transformed
    column to its imputed value.
    '''
    from sklearn.preprocessing import Imputer
    imp = Imputer(missing_values=missing_values, strategy=strategy, axis=0)
    imputed = imp.fit_transform(dataframe)
    df = pd.DataFrame(imputed)
    df.columns = list(dataframe.columns)
    imputers = {}
    if strategy == 'mean':
        for col in df.columns:
            imputers[col] = df[col].mean()
    if strategy == 'median':
        for col in df.columns:
            imputers[col] = df[col].median()
    if strategy == 'most_frequent':
        for col in df.columns:
            # mode() returns a Series; take the first value
            imputers[col] = df[col].mode()[0]
    return df, imputers
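# impute_missing_train recomputes every column's fill value from the imputed
# frame; the fitted Imputer already exposes the same numbers through its
# statistics_ attribute. A compact sketch of the same function (assuming no
# column is entirely missing, since such columns are dropped by Imputer and
# would shift the statistics_ alignment):
def impute_missing_train_compact(dataframe, missing_values='NaN',
                                 strategy='mean'):
    from sklearn.preprocessing import Imputer
    imp = Imputer(missing_values=missing_values, strategy=strategy, axis=0)
    df = pd.DataFrame(imp.fit_transform(dataframe), columns=dataframe.columns)
    imputers = dict(zip(df.columns, imp.statistics_))  # per-column fill values
    return df, imputers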
def run_clfList(clfList, stringList="", normalize=False):
    """
    Run 100-fold 80/20 cross-validation on each classifier in clfList and
    print the average AUC for each classifier.
    :param clfList: list of classifiers to run
    :param stringList: names of the classifiers
    :param normalize: whether or not to normalize the data
    :return: the average AUC for each classifier in clfList
    """
    # data, labels = six_features(force=False)
    # data, labels = six_and_time_features(force=False)
    # data, labels = five_features(force=False)
    # data, labels = five_and_rts(force=False)
    data, labels = new_features()
    if normalize:
        data = normalize_data(data)
    imp = Imputer(missing_values=np.NaN, strategy="mean")
    data = imp.fit_transform(data)

    # Cross-validate all clfs 100 times
    means = kfoldcvList(data, labels, clfList, 100)
    if stringList == "":
        stringList = ["" for i in range(len(labels))]

    # Print out the mean AUCs
    for i, mean in enumerate(means):
        print(stringList[i] + ": " + str(mean))
    for mean in means:
        sys.stdout.write(str(mean) + " & ")
    sys.stdout.write("\n")
    return means
def plot_ROCList(clfList, data, labels, stringList=""):
    """
    Plot an ROC curve for each classifier in clfList, training on a single
    80/20 split.
    :param clfList: list of classifiers to run
    :param data: feature matrix
    :param labels: target labels
    :param stringList: names of the classifiers
    :return: (void) saves one ROC plot per classifier
    """
    if stringList == "":
        stringList = ["" for i in range(len(labels))]
    imp = Imputer(missing_values=np.NaN, strategy="mean")
    data = imp.fit_transform(data)

    # Cross-validate on the data once using each model to get a ROC curve
    AUCs, fprs, tprs, threshs = cvList(data, labels, clfList)

    # Plot a ROC for each clf in clfList
    for i in range(len(clfList)):
        fpr = fprs[i]
        tpr = tprs[i]
        plt.plot(fpr, tpr)
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title(stringList[i] + " ROC Curve, AUC = " + str(AUCs[i]))
        plt.savefig(stringList[i] + "_ROC.png")
        plt.close()
        print(stringList[i] + ":" + str(AUCs[i]))
def run_importance(clf, data, labels, feature_labels=[""], string=""):
    """
    Fit a classifier using all the data and plot the feature importances.
    :param clf: Classifier object that has a feature_importances_ member
    :param feature_labels: names of the features
    :param string: classifier name
    :return: (void) plot Gini importance vs feature
    """
    num_features = data.shape[1]
    importances = [0] * num_features
    imp = Imputer(missing_values=np.NaN, strategy="mean")
    data = imp.fit_transform(data)

    # run the classifier 100 times and average the importance found after each fit
    for r in range(100):
        clf.fit(data, labels)
        importances = [importances[i] + clf.feature_importances_[i]
                       for i in range(num_features)]
    importances = [importance / 100 for importance in importances]

    # Filter out the features that have 0 importance (e.g. values are all 0)
    # non_zeros are the indices in feature_importances that are not 0
    non_zeros = [i for i in range(num_features) if not importances[i] == 0]
    importances = [importances[i] for i in non_zeros]
    feature_labels = [feature_labels[i] for i in non_zeros]

    # Plot the features
    bar_width = 0.7
    plt.bar(range(len(feature_labels)), importances, bar_width)
    plt.xticks([ind + float(bar_width) / 2 for ind in range(len(feature_labels))],
               feature_labels, rotation="vertical")
    plt.gcf().subplots_adjust(bottom=0.35)
    plt.xlabel("Feature")
    plt.ylabel("Gini Importance")
    plt.title("Gini Importance v. Features for " + string + " Classifier")
    plt.show()
def avg_message_count_by_group(df_users, df_messages, df_user_features):
    columns = ["f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9", "f10"]
    features = df_user_features[list(columns)].values

    # Impute missing values to retain all sample data
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    X = imp.fit_transform(features)

    # Preprocess dataset and standardize features to have normally distributed data
    # MaxAbsScaler allows scaled features to lie between -1 and +1
    X = MaxAbsScaler().fit_transform(X)

    # Apply PCA decomposition and use first 3 components that explain 75% of variance
    reduced_data = decomposition.PCA(n_components=3).fit_transform(X)
    kmeans = KMeans(init='k-means++', n_clusters=5, n_init=10)

    # Predict which group each user belongs to
    cluster_labels = kmeans.fit_predict(reduced_data)
    df_user_features['group.id'] = cluster_labels

    # Call utility function to join the two dataframes
    df_joined_users_messages = get_merged_dataframes(df_users, df_messages)
    df_joined_users_messages_features = get_merged_dataframes(
        df_user_features, df_joined_users_messages)

    # Only keep messages that were received since signing up
    df_joined_users_messages_features = df_joined_users_messages_features[
        df_joined_users_messages_features['message.date'] >=
        df_joined_users_messages_features['signup.date']]

    # Get the average message count grouped by group.id
    avg_message_count = df_joined_users_messages_features.groupby(
        'group.id')['message.count'].mean()

    # Return the average message count grouped by user groups and rounded to 2 decimals
    return np.round(avg_message_count.tolist(), decimals=2)
def impute_missing_data(datapoints, strategy='mean'):
    """
    Imputes values for the 8 features missing data

    Arguments:
    datapoints -- X, a dataset with missing values represented as 999.0 and 9999.0
    strategy [optional] -- an imputation strategy, e.g., mean, median, or most_frequent

    Returns:
    X_imputed -- a dataset with missing values imputed according to the
    provided or default (mean) strategy. Uses the scikit-learn Imputer class.
    """
    # First we will replace our placeholder values with NaN to only have
    # to run one imputation.
    np.putmask(datapoints, datapoints == 999.0, np.NaN)
    np.putmask(datapoints, datapoints == 9999.0, np.NaN)
    # Now create an imputer over NaN values, and average over axis=0 (columns)
    # Then, fit the imputer to the dataset.
    imp = Imputer(strategy=strategy, axis=0)
    X_imputed = imp.fit_transform(datapoints)
    return X_imputed
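# The putmask step above is needed because a single Imputer instance accepts
# only one missing_values sentinel; with one placeholder the conversion to
# NaN could be skipped entirely (a sketch, assuming 999 is the sole sentinel):
imp_single = Imputer(missing_values=999, strategy='mean', axis=0)
# X_imputed = imp_single.fit_transform(datapoints)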
def read_split_aug(filepath, filename, rmv, finalNames):
    # read the csv
    try:
        dataset = genfromtxt(open(filepath + '/' + filename, 'rb'),
                             delimiter=',', dtype='f8')[0:]
        # Clean the dataset
        # Sort the observations according to the timestamp
        dataset = dataset[dataset[:, 0].argsort()]
        dataset = dataset[12:, :]    # exclude some nan observations
        dataset = dataset[0:360, :]  # exclude some nan observations
        # Remove redundant resources
        dataset = np.delete(dataset, np.s_[rmv], axis=1)
        target = dataset[:, 1865]  # values of the target variable
        tt = target[:, np.newaxis]
        rm_dataset = np.delete(dataset, np.s_[2453:2455], axis=1)     # exclude VOD
        rm_dataset = np.delete(rm_dataset, np.s_[1863:1869], axis=1)  # exclude NDVI
        rm_dataset = np.delete(rm_dataset, np.s_[1849:1853], axis=1)  # exclude VOD
        # put the target column at the end
        dataset = np.concatenate((rm_dataset, tt), axis=1)
        dataset = DataFrame(dataset)
        dataset = dataset.fillna(0)
        dataset.columns = finalNames.ravel()
        names = dataset.columns[3:dataset.shape[1]]
        # Create the new dataset
        X = dataset.iloc[:, 3:dataset.shape[1] - 1]
        y = dataset.iloc[:, dataset.shape[1] - 1]
        # import the lags of NDVI (target)
        win = 13
        new_datasetAuto = np.empty((len(y), win))
        for i in range(1, win):
            new_datasetAuto[:, i - 1] = shift2(y, i)  # , cval=np.NaN)
        new_datasetAuto[:, win - 1] = y
        # Impute the missing values with the mean
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        dataImputedAuto = imp.fit_transform(new_datasetAuto)
        X1 = dataImputedAuto[:, 0:dataImputedAuto.shape[1] - 1]
        X = np.concatenate((X, X1), axis=1)
        new_dataset = np.concatenate((X, DataFrame(y)), axis=1)
        new_dataset = DataFrame(new_dataset)
        # Impute the remaining missing values with zero
        new_dataset.replace([np.inf, -np.inf], np.nan, inplace=True)
        new_dataset = new_dataset.fillna(0)
        predictor_names = names[0:len(names) - 1].tolist()
        target_name = names[len(names) - 1]
        for i in range(1, 13):
            predictor_names.append(target_name + str(i))
        predictor_names.append(target_name)
        predictor_names = np.array(predictor_names)
        new_dataset.columns = predictor_names.ravel()
        return new_dataset
    except IOError as e:
        # if the file does not exist, return an empty list
        # print(e)
        return []
print(recall)
precision = precision_score(realclass, predict, average='weighted')
print(precision)

# In[5]:

from pandas import read_csv
from sklearn.preprocessing import Imputer
import numpy

dataset = read_csv('/home/ajit/Downloads/hepatitis_csv.csv', header=None)
# mark zero values as missing or NaN
dataset[[1, 2, 3, 4, 5]] = dataset[[1, 2, 3, 4, 5]].replace(0, numpy.NaN)
# fill missing values with mean column values
values = dataset.values
imputer = Imputer()
transformed_values = imputer.fit_transform(values)
# count the number of NaN values in each column
print(numpy.isnan(transformed_values).sum())

# In[47]:

# logistic_regression
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import optimize as op
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# In[275]:

df = pd.read_csv("./data/final_project_dataset.csv")

# In[276]:

ndf = df.drop(["Unnamed: 0", "email_address", "poi"], axis=1)
# exclude_features = ["director_fees", "loan_advances", "restricted_stock_deferred"]
exclude_features = []
ndf = ndf.drop(exclude_features, axis=1)
dfmtx = ndf.values
dfmtx.astype(float)
label = df["poi"].values
# Fill in NaN
imp = Imputer(axis=0, strategy="median")
ndfmtx = imp.fit_transform(dfmtx)

# ## Use random forest to select features

# In[277]:

from sklearn.ensemble import RandomForestClassifier

train_X = ndfmtx
train_y = label
rf = RandomForestClassifier()
rf.fit(train_X, train_y)
rfi = rf.feature_importances_

def list_feature_imp(name, score):
    sorted_rfi_idx = np.argsort(score)
    'BMI', 'WEIGHT', 'WEIGHT', 'LV_MASS_INDEX']

# check for missing or NaN in the dataset:
pd.isnull(dat_main).sum() > 0

# dataset for analysis; outcome = CHF 60
dat_chf = dat_main[list_my_features].copy()  # the copy() is important to create a new dataframe
dat_chf.head()
describe = dat_chf.describe()  # easy descriptive statistics

# A simple way to fill NA (one column at a time)
median_glu = dat_chf['GLUCOSE'].median()
dat_chf['GLUCOSE'] = dat_chf['GLUCOSE'].fillna(median_glu)

# sklearn Imputer (better option to fill missing values) for the whole df
# note: axis=1 imputes along rows; axis=0 would impute per-column medians
imputer = Imputer(strategy='median', axis=1)
dat_chf = pd.DataFrame(imputer.fit_transform(dat_chf),
                       columns=dat_chf.columns)  # the imputation
pd.isnull(dat_chf).sum() > 0

# descriptive statistics (table 1)
columns = ["age", "gender", "ECHO1_ef", "pre_MI", "pre_DM"]
categorical = ['gender', 'pre_MI', 'pre_DM']
groupby = ["CHF60"]
labels = {'ECHO1_ef': 'Ejection fraction', 'pre_MI': 'Previous MI',
          'CHF60': 'CHF 60 days'}
mytable = TableOne(dat_chf, columns=columns, categorical=categorical,
                   groupby=groupby, labels=labels, isnull=True,
                   remarks=False, pval=True)
y = home_data.SalePrice
train = home_data.drop([
    'SalePrice', 'EnclosedPorch', 'LowQualFinSF', 'MiscVal', 'OpenPorchSF',
    'PoolArea', 'ScreenPorch'
], axis=1)
test = test_data.drop([
    'EnclosedPorch', 'LowQualFinSF', 'MiscVal', 'OpenPorchSF', 'PoolArea',
    'ScreenPorch'
], axis=1)

one_hot_encoded_training_predictors = pd.get_dummies(train)
one_hot_encoded_test_predictors = pd.get_dummies(test)
train, test = one_hot_encoded_training_predictors.align(
    one_hot_encoded_test_predictors, join='left', axis=1)

my_imputer = Imputer()
train = my_imputer.fit_transform(train)
test = my_imputer.transform(test)

model = XGBRegressor()
model.fit(train, y)
preds = model.predict(test)

# outputting
pd.DataFrame({
    'Id': test_data.Id,
    'SalePrice': preds
}).to_csv('submission.csv', index=False)
continuous_vars = observations.loc[:, observations.columns.
                                   isin(list(CONTINUOUS_KEYS))]
categorical_vars = observations.loc[:, observations.columns.
                                    isin(list(CATEGORICAL_KEYS))]
categorical_vars_imputed = convert_to_categorical(observations,
                                                  CATEGORICAL_KEYS,
                                                  cat_only=True,
                                                  key_var='ID')

# combine continuous and categorical variables
# add commented-out Random Forest Classifier code below here

# Fit a Gaussian NB on continuous vars
cont_imputer = Imputer(strategy='mean', axis=1, copy=False)
imputed_continuous_vars = cont_imputer.fit_transform(continuous_vars)
gauss_nb = GaussianNB()
continuous_predictions = cross_val_predict(gauss_nb, imputed_continuous_vars,
                                           target, cv=10)
output_metrics("Continuous NB", target, continuous_predictions)

# Fit multinomial NB on categorical vars
mult_nb = MultinomialNB()
categorical_predictions = cross_val_predict(mult_nb, categorical_vars,
                                            target, cv=10)
output_metrics("Categorical NB", target, categorical_predictions)
data = data.drop("FireplaceQu", 1)
all_columns = data.columns.values
non_categorical = [
    "LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2",
    "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "LowQualFinSF",
    "GrLivArea", "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch",
    "3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal"
]
categorical = [value for value in all_columns if value not in non_categorical]

# One Hot Encoding and NaN transformation
data = pd.get_dummies(data)
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
data = imp.fit_transform(data)

# Log transformation
data = np.log(data)
labels = np.log(labels)

# Change -inf to 0 again
data[data == -np.inf] = 0

# Split training and test
data_offset = np.average(data, axis=0)
data -= data_offset
labels_offset = np.average(labels, axis=0)
labels -= labels_offset
train = data[:1460]
import statsmodels.formula.api as sm
from sklearn.preprocessing import MinMaxScaler

veriler = pd.read_csv("veriler_nf.csv")
Nhdort = veriler[["NH4N"]]
ALF = veriler[["Averageleachateflow"]]
SS = veriler[["SS"]]
Sicaklik = veriler[["Temperature"]]
Cod = veriler[["COD"]]
MLSSAero = veriler[["MLSSaerobic"]]
Nnf = veriler[["NNF"]]
Codnf = veriler[["CODNF"]]
print(veriler)

imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
SS = imputer.fit_transform(SS)
MLSSAero = imputer.fit_transform(MLSSAero)
SS = pd.DataFrame(data=SS, index=range(275), columns=['SS'])
MLSSAero = pd.DataFrame(data=MLSSAero, index=range(275), columns=['MLSSAero'])

scaler = MinMaxScaler()
Cod = scaler.fit_transform(Cod)
Nhdort = scaler.fit_transform(Nhdort)
SS = scaler.fit_transform(SS)
Sicaklik = scaler.fit_transform(Sicaklik)
MLSSAero = scaler.fit_transform(MLSSAero)
ALF = scaler.fit_transform(ALF)
Nnf = scaler.fit_transform(Nnf)
Codnf = scaler.fit_transform(Codnf)
from pandas import DataFrame
hold = pd.DataFrame(hold)
y_pred = pd.DataFrame(y_pred)
hold = pd.concat((hold, y_pred), axis=1)
hold.to_csv('out.csv', index=False)
'''

X = train.values
y = y.values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.667)

from sklearn.preprocessing import Imputer
imp = Imputer()
X_train = imp.fit_transform(X_train)
X_test = imp.transform(X_test)

from sklearn.preprocessing import StandardScaler
s = StandardScaler()
X_train = s.fit_transform(X_train)
X_test = s.transform(X_test)

from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

regressor = KNeighborsClassifier(n_neighbors=10)
## Here again we use the MAE. XGBoost essentially combines a large number of
## models and can build very precise models from all that information, which
## is why it is considerably more accurate. You use it as follows:

# 1) first load the data, deal with missing data,
# and split the data into train and test sets
data = pd.read_csv('train.csv')
data.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = data.SalePrice
X = data.drop(['SalePrice'], axis=1).select_dtypes(exclude=['object'])
train_X, test_X, train_y, test_y = train_test_split(X.as_matrix(),
                                                    y.as_matrix(),
                                                    test_size=0.25)

my_imputer = Imputer()
train_X = my_imputer.fit_transform(train_X)
test_X = my_imputer.transform(test_X)

# 2) just as with the sklearn package, we build the model; for now
# the naive model
from xgboost import XGBRegressor

my_model = XGBRegressor()
# verbose=False keeps all the per-cycle output from being printed
my_model.fit(train_X, train_y, verbose=False)

# 3) as before, we let the model make predictions and judge
# the model by its MAE
# make predictions
predictions = my_model.predict(test_X)

from sklearn.metrics import mean_absolute_error
# print(X_train.columns)
X_train[feats_cat] = X_train[feats_cat].astype(object)
X_train = pd.get_dummies(X_train, dummy_na=True)
# print(X_train.columns)

# print(X_test.columns)
X_test[feats_cat] = X_test[feats_cat].astype(object)
X_test = pd.get_dummies(X_test, dummy_na=True)
# print(X_test.columns)

# fillna
from sklearn.preprocessing import Imputer
fillnan = Imputer()
X_train = fillnan.fit_transform(X_train)
fillnan = Imputer()
X_test = fillnan.fit_transform(X_test)

############################## PARS SEARCH ##############################

def gridSearch(X, y):
    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import make_scorer
    from sklearn.preprocessing import Imputer
    from sklearn.model_selection import ShuffleSplit
    from numpy.random import randint
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split
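# The block above one-hot encodes X_train and X_test independently and fits a
# separate Imputer on each, so the two matrices can end up with different
# dummy columns and different fill statistics. A minimal sketch of the safer
# pattern (align the dummies, then fit the imputer on the training set only):
def encode_and_fill(train_df, test_df):
    train_enc, test_enc = pd.get_dummies(train_df).align(
        pd.get_dummies(test_df), join='left', axis=1, fill_value=0)
    fillnan = Imputer()
    return fillnan.fit_transform(train_enc), fillnan.transform(test_enc)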
dataset = pd.read_csv('dataset/sal.csv',
                      names=[
                          'age', 'workclass', 'fnlwgt', 'education',
                          'education-num', 'marital-status', 'occupation',
                          'relationship', 'race', 'gender', 'capital-gain',
                          'capital-loss', 'hours-per-week', 'native-country',
                          'salary'
                      ],
                      na_values=' ?')
X = dataset.iloc[:, 0:14].values
y = dataset.iloc[:, -1].values

from sklearn.preprocessing import Imputer
imp = Imputer()
X[:, [0, 2, 4, 10, 11, 12]] = imp.fit_transform(X[:, [0, 2, 4, 10, 11, 12]])

test = pd.DataFrame(X[:, [1, 3, 5, 6, 7, 8, 9, 13]])
test[0].value_counts()
test[1].value_counts()
test[2].value_counts()
test[3].value_counts()
test[4].value_counts()
test[5].value_counts()
test[6].value_counts()
test[7].value_counts()
test[0] = test[0].fillna(' Private')
test[0].value_counts()
def cv_get_mae_imputednans(X, y):
    model = RandomForestRegressor()
    my_imputer = Imputer()
    X_imputed = my_imputer.fit_transform(X)
    mae_avg = -1 * cross_val_score(model, X_imputed, y,
                                   scoring='neg_mean_absolute_error').mean()
    return mae_avg
# In[ ]:

print(data1.isnull().sum(), data2.isnull().sum())

# We can now use Imputer for imputing missing data

# In[ ]:

from sklearn.preprocessing import Imputer, LabelEncoder, OneHotEncoder

le = LabelEncoder()
x_train['Embarked'] = x_train['Embarked'].fillna('$')
x_train['Embarked'] = le.fit_transform(x_train['Embarked'])
x_train['Cabin'] = le.fit_transform(x_train['Cabin'])
imr = Imputer(missing_values=8, strategy='median', axis=0, copy=False)
x_train[['Cabin']] = imr.fit_transform(x_train[['Cabin']])
imr.set_params(missing_values=np.nan, strategy='mean')
x_train[['Age']] = imr.fit_transform(x_train[['Age']])
imr.set_params(missing_values=3, strategy='most_frequent')
x_train[['Embarked']] = imr.fit_transform(x_train[['Embarked']])
ohe = OneHotEncoder(categorical_features=[1])
x_train['Sex'] = le.fit_transform(x_train['Sex'])
print(x_train.head())

# In[ ]:

fig, ax1 = plt.subplots(figsize=(10, 10))
sns.heatmap(data=x_train.corr(), annot=True, fmt='.1f', linewidths=.1)

# In[ ]:
def clean_data_ML(df1):
    '''
    Cleans the dataframe and performs feature extraction and engineering.
    Input: df1 (DataFrame)
    Output: cleaned_df (DataFrame): cleaned df DataFrame
    '''
    # drop columns with more than 20% of missing values
    print("Drop columns with more than 20% of missing values and drop unnecessary columns")
    print("dropping columns EINGEFUEGT_AM and D19_LETZTER_KAUF_BRANCHE because they contain too many different items")
    df1.drop([
        'ALTER_KIND1', 'ALTER_KIND2', 'ALTER_KIND3', 'ALTER_KIND4',
        'EXTSEL992', 'KK_KUNDENTYP', 'RT_KEIN_ANREIZ', 'CJT_TYP_6',
        'D19_VERSI_ONLINE_QUOTE_12', 'CJT_TYP_2', 'EINGEZOGENAM_HH_JAHR',
        'D19_LOTTO', 'CJT_KATALOGNUTZER', 'VK_ZG11', 'UMFELD_ALT',
        'RT_SCHNAEPPCHEN', 'AGER_TYP', 'ALTER_HH',
        'D19_BANKEN_ONLINE_QUOTE_12', 'D19_GESAMT_ONLINE_QUOTE_12',
        'D19_KONSUMTYP', 'D19_VERSAND_ONLINE_QUOTE_12', 'GEBURTSJAHR',
        'KBA05_BAUMAX', 'TITEL_KZ', 'D19_BANKEN_DATUM',
        'D19_BANKEN_OFFLINE_DATUM', 'D19_BANKEN_ONLINE_DATUM',
        'D19_GESAMT_DATUM', 'D19_GESAMT_OFFLINE_DATUM',
        'D19_GESAMT_ONLINE_DATUM', 'D19_TELKO_DATUM',
        'D19_TELKO_OFFLINE_DATUM', 'D19_TELKO_ONLINE_DATUM',
        'D19_VERSAND_DATUM', 'D19_VERSAND_OFFLINE_DATUM',
        'D19_VERSAND_ONLINE_DATUM', 'D19_VERSI_DATUM',
        'D19_VERSI_OFFLINE_DATUM', 'D19_VERSI_ONLINE_DATUM',
        'CAMEO_DEU_2015', 'LP_FAMILIE_FEIN', 'LP_STATUS_FEIN', 'ANREDE_KZ',
        'GREEN_AVANTGARDE', 'SOHO_KZ', 'VERS_TYP', 'LP_LEBENSPHASE_GROB',
        'LP_LEBENSPHASE_FEIN', 'EINGEFUEGT_AM', 'D19_LETZTER_KAUF_BRANCHE',
        'CAMEO_INTL_2015', 'PRAEGENDE_JUGENDJAHRE', 'PLZ8_BAUMAX'
    ], axis=1, inplace=True)

    print("creating a copy of dataframe")
    df = df1.copy()
    try:
        df.drop(['PRODUCT_GROUP', 'CUSTOMER_GROUP', 'ONLINE_PURCHASE'],
                axis=1, inplace=True)
    except KeyError:
        pass

    # replace O with 0 and W with 1
    print("Re-encode OST_WEST_KZ attribute")
    df['OST_WEST_KZ'].replace(['O', 'W'], [0, 1], inplace=True)

    # feature-engineer the neighbourhood column into three parts:
    # Rural (0), Not Rural (1) and Rural But Good Neighbourhood (2)
    print("Feature engineering WOHNLAGE")
    df['TYPE_QUALITY_NEIGHBOURHOOD'] = df['WOHNLAGE']
    df['TYPE_QUALITY_NEIGHBOURHOOD'].replace(
        [-1, 0, 1, 2, 3, 4, 5, 7, 8],
        [np.nan, np.nan, 0, 0, 2, 2, 0, 1, 1], inplace=True)
    print("Dropping WOHNLAGE column")
    # delete 'WOHNLAGE'
    df.drop(['WOHNLAGE'], axis=1, inplace=True)

    # change object type of CAMEO_DEUG_2015 to numeric type
    print("Feature extracting CAMEO_DEUG_2015")
    df['CAMEO_DEUG_2015'] = df['CAMEO_DEUG_2015'].apply(
        lambda x: check_value(x))

    # remove columns that start with KBA05
    print("remove columns which start with KBA05")
    kba_cols = df.columns[df.columns.str.startswith('KBA05')]
    df.drop(list(kba_cols), axis='columns', inplace=True)

    # print the name of any df column that contains the 'XX' string
    for i in df.columns:
        df[i].astype('str').apply(lambda x: print(df[i].name)
                                  if x.startswith('XX') else 'pass')

    # impute NaN values
    print("Imputing NaN values")
    imp = Imputer(missing_values=np.nan, strategy='most_frequent')
    df[df.columns] = imp.fit_transform(df)
    print("Counting NaN values", np.isnan(df).sum().sum())
    return df
def preprocessing(configparms, training_preprocessing_flag):
    if training_preprocessing_flag:
        df = pd.read_csv(configparms['training_file'])
    else:
        # empty list since test data does not include target variables
        configparms['target_name_list'] = list()
        df = pd.read_csv(configparms['test_file'])
        df['stabilityVec'] = 0
    reportCMTVEM = list()
    reportCMTVEM.append(
        "---------------------------------------------------------------")
    reportCMTVEM.append("input parameters from configuration file")
    reportCMTVEM.append(configparms)

    # remove chemically/physically meaningless features
    df.drop(list(configparms['remove_features'].values()), axis=1, inplace=True)

    # revise features that need corrections
    for item in list(configparms['revised_features'].values()):
        feature_name_temp = item.split(',')[0].split()[0]
        element_name_temp = item.split(',')[1].split()[0]
        correct_value_temp = item.split(',')[2].split()[0]
        df.loc[df.formulaA == element_name_temp,
               feature_name_temp] = float(correct_value_temp)
        df.loc[df.formulaB == element_name_temp,
               feature_name_temp] = float(correct_value_temp)

    # replace non-numerical features with categorical dummies on the data frame
    df_added_categoricals = pd.get_dummies(
        df, columns=list(configparms['categorical_features'].values()))
    configparms['categorical_features_fullnames'] = list(
        set(df_added_categoricals.columns) - set(df.columns))
    # copy back
    df = copy.deepcopy(df_added_categoricals)

    # handle missing values: since zero values in certain features are
    # physically meaningless (e.g. bulk modulus), they are changed to NaN
    if configparms['imputation_method'] != 'none':
        for item in list(configparms['impute_features'].values()):
            df[item] = df[item].replace(0, np.nan)
    df_features_data = df.drop(['formulaA', 'formulaB', 'stabilityVec'], axis=1)
    if configparms['imputation_method'] == 'mn':
        imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
        for item in list(configparms['impute_features'].values()):
            df_features_data[item] = imputer.fit_transform(
                df_features_data[item].values.reshape(-1, 1)).ravel()
    # reflect the changes on the original data frame
    df[df_features_data.columns] = df_features_data[df_features_data.columns]

    if training_preprocessing_flag:
        # prepare the target variable (training set only)
        for item in list(configparms['element_removal'].values()):
            df = df[df.formulaA.str.contains(item) == False]
        df = df.reset_index(drop=True)
        df['target_vector'] = df['target_vector'].map(
            lambda x: x.lstrip('[').rstrip(']'))
        configparms['target_name_list'] = [
            'target1', 'target2', 'target3', 'target4', 'target5'
        ]
        df[configparms['target_name_list']] = df['target_vector'].str.split(
            ',', expand=True)
        df[configparms['target_name_list']] = df[
            configparms['target_name_list']].astype(np.float)
        # write the preprocessed training data into a csv file
        df.to_csv('training_preprocessed.csv')
    output_writer(reportCMTVEM, configparms)
    return (df, configparms)
def impute_mem(self, memory):
    imputer = Imputer()
    imputed_memory = imputer.fit_transform(memory)
    return imputed_memory
def clean_data(df_set, strategy):
    imputer = Imputer(strategy=strategy)
    np_arr = imputer.fit_transform(df_set)
    return pd.DataFrame(np_arr, columns=df_set.columns)
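# clean_data above rebuilds the frame with a fresh RangeIndex; when the
# original row index carries meaning (IDs, timestamps), pass it through
# explicitly. A sketch of the index-preserving variant:
def clean_data_keep_index(df_set, strategy):
    imputer = Imputer(strategy=strategy)
    return pd.DataFrame(imputer.fit_transform(df_set),
                        columns=df_set.columns, index=df_set.index)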
count_null_embarked = len(train_df['Embarked'][train_df.Embarked.isnull()])
value_to_fill_embarked = train_df['Embarked'].dropna().mode().values[0]
train_df.loc[train_df['Embarked'].isnull(), 'Embarked'] = value_to_fill_embarked
lb2 = LabelEncoder()
train_df['Embarked'] = lb2.fit_transform(train_df['Embarked'])

# Set the target column to Survived
targets = train_df.Survived

# Drop unwanted columns, including the target column.
train_df = train_df.drop(
    ['Name', 'Sex', 'Ticket', 'Cabin', 'PassengerId', 'Survived'], axis=1)

# Imputer is used to fill all occurrences of NaN with the mean of that column.
im = Imputer()
predictors = im.fit_transform(train_df)

# Using Decision Tree Classifier
classifier = DecisionTreeClassifier(max_depth=3, min_samples_leaf=5)
classifier = classifier.fit(predictors, targets)

# Cleaning test data
# Test data is cleaned in the same way as the training data
lb3 = LabelEncoder()
test_df['Sex'] = lb3.fit_transform(test_df['Sex'])  # male: 1, female: 0
count_null_embarked = len(test_df.Embarked[test_df.Embarked.isnull()])
value_to_fill_embarked = test_df.Embarked.dropna().mode().values[0]
test_df.loc[test_df.Embarked.isnull(), 'Embarked'] = value_to_fill_embarked
lb4 = LabelEncoder()
test_df['Embarked'] = lb4.fit_transform(test_df['Embarked'])
    votes = np.array([[np.argmax(t) for t in c.predict(test_data)]
                      for c in classes])
    winners = np.reshape(mode(votes)[0], -1)
    return winners

data = read_csv(open('train_pca.csv', 'r'), na_values='').as_matrix()
X = data[:, 1:-1]              # input features
Y = data[:, -1].astype('int')  # target labels
Y1 = to_categorical(Y)
classes = []
imp = Imputer()  # default arguments will suffice
X = imp.fit_transform(X)

# Dropout(rate, noise_shape=None, seed=None)
i = 0
for train_i, test_i in StratifiedShuffleSplit(n_splits=3,
                                              random_state=None).split(X, Y):
    np.random.shuffle(train_i)
    X_train = X[train_i]
    Y_train = Y1[train_i]
    brain = Sequential()
    brain.add(
        Dense(871, input_dim=871, activation='relu',
              kernel_regularizer=l2(1e-2)))
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer

directory = '/Users/garethjones/Documents/Data Analysis/Kaggle Intro/Data/'
file = 'train.csv'
data = pd.read_csv(directory + file)

''' CLEAN DATA '''

# A nice way to write a for loop and if statement
cols_with_missing = [col for col in data.columns if data[col].isnull().any()]

# Use Imputer to fill in NaN values with the mean for that column
my_imputer = Imputer()
data_imputed = my_imputer.fit_transform(data)

''' SETUP TEST AND TRAIN VARIABLES '''

# These are the variables we will use to predict something else
predictors = [
    'LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath',
    'BedroomAbvGr', 'TotRmsAbvGrd'
]
X = data[predictors]

# This is what we want to predict
y = data.SalePrice

# Split our dataset into training and testing data
train_X, val_X, train_y, val_y = train_test_split(X, y, train_size=0.7,
# drop rows with any missing values
titanic.dropna().shape

# drop rows where Age is missing
titanic[titanic.Age.notnull()].shape

# Sometimes a better strategy is to **impute missing values**:

# fill missing values for Age with the mean age
titanic.Age.fillna(titanic.Age.mean(), inplace=True)

# equivalent function in scikit-learn, supports mean/median/most_frequent
from sklearn.preprocessing import Imputer
imp = Imputer(strategy='mean', axis=1)
titanic['Age'] = imp.fit_transform(titanic.Age).T

# include Age as a feature
feature_cols = ['Pclass', 'Parch', 'Age']
X = titanic[feature_cols]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
logreg.fit(X_train, y_train)
y_pred_class = logreg.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred_class))

# ## Part 2: Confusion Matrix

# confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)

# calculate the sensitivity
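# The axis=1 call above works because titanic.Age, a 1-D Series, is treated
# as a single row, so the row mean equals the column mean of Age. The
# column-wise equivalent, matching the axis=0 style used in most of the other
# snippets here, would be (sketch):
imp_col = Imputer(strategy='mean', axis=0)
titanic['Age'] = imp_col.fit_transform(titanic[['Age']]).ravel()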
    'adhe'
]]
Selected_RiskFactor = RiskFactor[[
    'MotherBC', 'Preg1Age_cate', 'Signature_1', 'Signature_2', 'Signature_3',
    'Nonsense_mutation'
]]

# X = Selected_RiskFactor
# X = Selected_Genetics
X = pd.concat([Selected_RiskFactor.reset_index(drop=True), Selected_Genetics],
              axis=1)
X = Selected_Genetics
X = Selected_RiskFactor

imputer = Imputer(missing_values='NaN', strategy='median', axis=0)
X["Benign_Age"] = imputer.fit_transform(X[["Benign_Age"]]).ravel()
imputer = Imputer(missing_values='NaN', strategy='median', axis=0)
X["AgeofMerche"] = imputer.fit_transform(X[["AgeofMerche"]]).ravel()

mapper3 = DataFrameMapper([
    ('Class', sklearn.preprocessing.LabelEncoder()),
    ('MotherBC', sklearn.preprocessing.LabelEncoder()),
    ('Preg1Age_cate', sklearn.preprocessing.LabelEncoder()),
    ('scar', sklearn.preprocessing.LabelEncoder()),
    ('adhe', sklearn.preprocessing.LabelEncoder()),
], default=None)
mapper3 = DataFrameMapper(
    [('MotherBC', sklearn.preprocessing.LabelEncoder()),
     ('Preg1Age_cate', sklearn.preprocessing.LabelEncoder())],
# print(model)

# Loading a saved model
model = gensim.models.Word2Vec.load('OpinionMiningModel')

# Shuffling the dataset
dataset = dataset.sample(frac=1).reset_index(drop=True)

X = dataset.tweets.values
y = dataset.sentiment.values
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2)

trainDataVecs = getAvgFeatureVecs(X_train, model, num_features)
testDataVecs = getAvgFeatureVecs(X_test, model, num_features)

# Using an imputer because simply removing the null values changes the
# dimensions of the vectors. Fit on the training vectors only and reuse the
# learned statistics on the test vectors.
imp = Imputer(missing_values=np.nan, strategy='mean')
trainDataVecs = imp.fit_transform(trainDataVecs)
testDataVecs = imp.transform(testDataVecs)

trainDataVecs = trainDataVecs.reshape(len(X_train), -1)
testDataVecs = testDataVecs.reshape(len(X_test), -1)
# print(trainDataVecs)
print(trainDataVecs.shape)
print(testDataVecs.shape)

svd = TruncatedSVD()
trainDataVecs = svd.fit_transform(trainDataVecs)
testDataVecs = svd.transform(testDataVecs)  # transform, not fit_transform
print(trainDataVecs.shape)
print(testDataVecs.shape)

models = ['Logistic Regression', 'Random Forest', 'SVM']
dictionary = modelselectionword2vec(trainDataVecs, testDataVecs, y_train,
                                    y_test, models)
# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Importing dataset
dataset = pd.read_csv('preprocessing.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values

# taking care of missing values
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

# Encoding values for categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelEncoderX = LabelEncoder()
labelEncodery = LabelEncoder()
X[:, 0] = labelEncoderX.fit_transform(X[:, 0])
y = labelEncodery.fit_transform(y)
oneHotEncoder = OneHotEncoder(categorical_features=[0])
X = oneHotEncoder.fit_transform(X).toarray()

# Splitting dataset for training and testing
# (sklearn.cross_validation was renamed sklearn.model_selection in 0.18)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
def remove_missing_values(dataframe):
    # despite the name, this imputes (fills) missing values with column means
    imr = Imputer(missing_values='NaN', strategy='mean', axis=0)
    dataframe.iloc[:, 3:] = imr.fit_transform(dataframe.iloc[:, 3:])
    return dataframe
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    dataset_train = LoadFile(p=r'F:\ODL\dataset\data_train.pickle')
    dataset_test = LoadFile(p=r'F:\ODL\dataset\data_test.pickle')
    # note: np.min/np.max propagate NaN, so columns containing NaN come out
    # of this scaling as all-NaN; np.nanmin/np.nanmax would avoid that
    dataset_train = (dataset_train - np.min(dataset_train, axis=0)) / (
        np.max(dataset_train, axis=0) - np.min(dataset_train, axis=0))
    dataset_test = (dataset_test - np.min(dataset_test, axis=0)) / (
        np.max(dataset_test, axis=0) - np.min(dataset_test, axis=0))
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=0,
                  copy=True)
    dataset_train = imp.fit_transform(dataset_train)
    dataset_test = imp.fit_transform(dataset_test)
    rng.shuffle(dataset_train)
    rng.shuffle(dataset_test)
    print(dataset_train.shape, dataset_test.shape)  # inspect the data
    # print(dataset_train.shape, dataset_test.shape)
    # num_train = Counter(dataset_train[:, -1])
    # num_test = Counter(dataset_test[:, -1])
    # print(num_train)
    # print(num_test)
    # preprocessing: denoise the data
    dataset = np.vstack((dataset_train, dataset_test))
    dataset = dataset[:, :225]
    rng.shuffle(dataset)
    denoising(dataset, training_time=1, is_finishing=True)
def clean_df4(df, del_rows=True):
    '''
    INPUT: (pandas dataframe) df
    OUTPUT: (pandas dataframe) cleaned df

    This function returns the df cleaned:
    1. Converts unknown values to NaN
    2. Drops columns with more than 50% missing values
    3. Removes rows with more than 10% missing values
    4. Cleans and converts object columns to numeric, in some cases by
       hot-encoding.
    5. Drops the id column
    6. Fills NaNs with the mode.
    7. Drops highly correlated columns
    '''
    for column in list(df.columns.values):
        df[column].replace(-1, np.NaN, inplace=True)
    null0 = [
        'ALTERSKATEGORIE_GROB', 'ALTER_HH', 'ANREDE_KZ', 'CJT_GESAMTTYP',
        'GEBAEUDETYP', 'HH_EINKOMMEN_SCORE', 'KBA05_BAUMAX', 'KBA05_GBZ',
        'KKK', 'NATIONALITAET_KZ', 'PRAEGENDE_JUGENDJAHRE', 'REGIOTYP',
        'RETOURTYP_BK_S', 'TITEL_KZ', 'WOHNDAUER_2008', 'W_KEIT_KIND_HH'
    ]
    for column in null0:
        try:
            df[column].replace(0, np.NaN, inplace=True)
        except KeyError:
            continue
    null9 = [
        'KBA05_ALTER1', 'KBA05_ALTER2', 'KBA05_ALTER3', 'KBA05_ALTER4',
        'KBA05_ANHANG', 'KBA05_AUTOQUOT', 'KBA05_CCM1', 'KBA05_CCM2',
        'KBA05_CCM3', 'KBA05_CCM4', 'KBA05_DIESEL', 'KBA05_FRAU',
        'KBA05_HERST1', 'KBA05_HERST2', 'KBA05_HERST3', 'KBA05_HERST4',
        'KBA05_HERST5', 'KBA05_KRSAQUOT', 'KBA05_KRSHERST1',
        'KBA05_KRSHERST2', 'KBA05_KRSHERST3', 'KBA05_KRSKLEIN',
        'KBA05_KRSOBER', 'KBA05_KRSVAN', 'KBA05_KRSZUL', 'KBA05_KW1',
        'KBA05_KW2', 'KBA05_KW3', 'KBA05_MAXAH', 'KBA05_MAXBJ',
        'KBA05_MAXHERST', 'KBA05_MAXSEG', 'KBA05_MAXVORB', 'KBA05_MOD1',
        'KBA05_MOD2', 'KBA05_MOD3', 'KBA05_MOD4', 'KBA05_MOD8',
        'KBA05_MOTOR', 'KBA05_MOTRAD', 'KBA05_SEG1', 'KBA05_SEG2',
        'KBA05_SEG3', 'KBA05_SEG4', 'KBA05_SEG5', 'KBA05_SEG6',
        'KBA05_SEG7', 'KBA05_SEG8', 'KBA05_SEG9', 'KBA05_SEG10',
        'KBA05_VORB0', 'KBA05_VORB1', 'KBA05_VORB2', 'KBA05_ZUL1',
        'KBA05_ZUL2', 'KBA05_ZUL3', 'KBA05_ZUL4', 'RELAT_AB', 'SEMIO_SOZ',
        'SEMIO_FAM', 'SEMIO_REL', 'SEMIO_MAT', 'SEMIO_VERT', 'SEMIO_LUST',
        'SEMIO_ERL', 'SEMIO_KULT', 'SEMIO_RAT', 'SEMIO_KRIT', 'SEMIO_DOM',
        'SEMIO_KAEM', 'SEMIO_PFLICHT', 'SEMIO_TRADV', 'ZABEOTYP',
        'KBA05_HERSTTEMP'
    ]
    for column in null9:
        try:
            df[column].replace(9, np.NaN, inplace=True)
        except KeyError:
            continue
    dropcol = [
        'ALTER_KIND4', 'ALTER_KIND3', 'ALTER_KIND2', 'ALTER_KIND1',
        'TITEL_KZ', 'AGER_TYP', 'EXTSEL992', 'KK_KUNDENTYP', 'KBA05_BAUMAX'
    ]
    for col in dropcol:
        try:
            df.drop(col, axis=1, inplace=True)
        except KeyError:
            continue
    if del_rows:
        row_nulls = (df.isnull().sum(axis=1) / df.shape[1])
        df.drop(list(row_nulls[row_nulls > 0.1].index.values), inplace=True)
    df['CAMEO_DEUG_2015'].replace('X', np.NaN, inplace=True)
    df['CAMEO_INTL_2015'].replace('XX', np.NaN, inplace=True)
    df['CAMEO_DEUG_2015'] = df['CAMEO_DEUG_2015'].astype(float)
    df['CAMEO_INTL_2015'] = df['CAMEO_INTL_2015'].astype(float)
    df['OST_WEST_KZ'] = df.OST_WEST_KZ.map({'W': 0, 'O': 1})
    columna = 'EINGEFUEGT_AM'
    if columna in (list(df.columns.values)):
        df['year'] = pd.DatetimeIndex(df['EINGEFUEGT_AM']).year
        df.drop('EINGEFUEGT_AM', axis=1, inplace=True)
    df['CAMEO_DEU_2015'].replace('XX', np.NaN, inplace=True)
    df = pd.get_dummies(df,
                        columns=['D19_LETZTER_KAUF_BRANCHE', 'CAMEO_DEU_2015'])
    df = df.astype(float)
    df.drop('LNR', axis=1, inplace=True)
    imputer = Imputer(strategy='most_frequent')
    df_col = list(df.columns.values)
    df_imp = imputer.fit_transform(df)
    df = pd.DataFrame(df_imp, columns=df_col)
    drop = [
        'KBA13_HERST_SONST', 'PLZ8_GBZ', 'PLZ8_HHZ', 'CAMEO_INTL_2015',
        'ANZ_STATISTISCHE_HAUSHALTE', 'LP_LEBENSPHASE_GROB',
        'LP_STATUS_GROB', 'KBA13_KMH_250'
    ]
    df.drop(drop, axis=1, inplace=True)
    return df
svm_dict = {}
train_directory = '/home/akash/learn/504/final_project/EECS_504_Project/data_train_json'
test_directory = '/home/akash/learn/504/final_project/EECS_504_Project/data_bad_json'

# Pepper will take care of getting the data in a nice insidious format
pepper = DataPrepper()
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)

for k, v in train_dir_dict.items():
    train_data = []
    train_data = pepper.multi_step(v)
    clf = svm.OneClassSVM(nu=0.1, kernel='poly')
    train_data_imp = imp.fit_transform(train_data)
    clf.fit(train_data_imp)
    svm_dict[k] = clf
    y_train_pred = clf.predict(train_data_imp)
    # print("Train array len", len(y_train_pred))
    # print("Train diff", (sum(y_train_pred)))

# test_directory = '/home/akash/learn/504/final_project/EECS_504_Project/data_bad_json'
# y_test = pepper.multi_step(test_directory)
# y_test_file = '/home/akash/learn/504/final_project/EECS_504_Project/data_train_json/4519661B00000578-4955690-image-a-23_1507304927988_000000000000_keypoints.json'
# y_test_file = '/home/akash/learn/504/final_project/EECS_504_Project/data_train_json/8_000000000000_keypoints.json'
def replace_testnan(dataframe):
    imr = Imputer(missing_values='NaN', strategy='mean', axis=0)
    dataframe = imr.fit_transform(dataframe)
    return dataframe
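# Every snippet in this collection targets sklearn.preprocessing.Imputer,
# which was deprecated in scikit-learn 0.20 and removed in 0.22. The
# replacement is sklearn.impute.SimpleImputer; the mapping is mostly
# mechanical (a sketch):
#
#   Imputer(missing_values='NaN', strategy='mean', axis=0)
#     -> SimpleImputer(missing_values=np.nan, strategy='mean')
#
# Differences to watch for: missing_values takes np.nan rather than the
# string 'NaN', and the axis parameter is gone -- SimpleImputer always
# imputes column-wise, so the axis=1 tricks above need an explicit transpose.
import numpy as np
from sklearn.impute import SimpleImputer

imp = SimpleImputer(missing_values=np.nan, strategy='mean')
# X_filled = imp.fit_transform(X)  # X is whichever matrix is being cleaned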