# Build the regression matrix from the census-income frame.
#
# The columns listed below are cast straight to int; every other column among
# the first 41 is treated as nominal and label-encoded.
int_valued_cols = [0, 5, 16, 17, 18, 29, 38]

X = incomedata.to_numpy()
incomedata_np = incomedata.to_numpy()

# Nominal part: the first 41 columns with the integer-valued ones removed.
X = np.delete(X[:, 0:41], int_valued_cols, axis=1)

# Replace the '?' placeholder with the most frequent value of each column.
nominal_imputer = SimpleImputer(missing_values='?', strategy='most_frequent')
X = nominal_imputer.fit_transform(X)

# Map every nominal column to integer codes (label encoding, not one-hot).
# 34 = 41 selected columns minus the 7 integer-valued ones removed above.
enc = preprocessing.LabelEncoder()
for col in range(34):
    X[:, col] = enc.fit_transform(X[:, col])
X = X.astype(int)

# Integer-valued part: impute '?' the same way, then cast to int.
Y = (
    SimpleImputer(missing_values='?', strategy='most_frequent')
    .fit_transform(incomedata_np[:, int_valued_cols])
    .astype(int)
)

# Final layout: encoded nominal columns first, integer-valued columns appended.
X = np.hstack((X, Y))

# Persist the matrix as plain text.
np.savetxt('incomedata/LRtestData', X)

########################################################################################################################
# Next section: reload the census-income data from disk.
incomedata = pd.read_csv('incomedata/census-income.data', header=None)
X = incomedata.to_numpy()
incomedata_np = incomedata.to_numpy()
# Export a series of datasets as .npy matrices named UCI_dataset_NN.

# Dataset 04: the rating columns of `df`, with NaN entries replaced by the
# column mean.
mat = df[Ratings_cols].values
mat = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(mat)
np.save("UCI_dataset_04", mat)

# Dataset 05: whitespace-separated file. The separator is a regex, so it is
# written as a raw string -- "\s" is not a valid escape in a normal string
# literal and triggers a Deprecation/SyntaxWarning on recent Python versions.
np.save(
    "UCI_dataset_05",
    pd.read_csv("yacht_hydrodynamics.data", header=None, sep=r"\s+").values,
)

# Dataset 06: Excel sheet with a header row, stored as floats.
np.save(
    "UCI_dataset_06",
    pd.read_excel("Concrete_Data.xls", header=0).values.astype(float),
)

# Dataset 07: tab-separated file without a header.
np.save(
    "UCI_dataset_07",
    pd.read_csv("airfoil_self_noise.dat", header=None, sep="\t").values,
)

# Dataset 08: drop the first five columns, turn the '?' missing-value marker
# into NaN, convert to float, then mean-impute the gaps.
mat = pd.read_csv("communities.data", header=None, sep=",").values[:, 5:]
# Mark missing entries as NaN *before* the float cast; comparing a float
# array against None afterwards would never match anything.
mat[mat == "?"] = np.nan
mat = mat.astype(float)
mat = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(mat)
np.save("UCI_dataset_08", mat)

# Dataset 09: the nine F* columns plus RMSD, stored as floats.
casp_columns = ['F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'RMSD']
np.save(
    "UCI_dataset_09",
    pd.read_csv("CASP.csv", header=0)[casp_columns].values.astype(float),
)

# Dataset 10: skip the first column and store the rest as floats.
np.save(
    "UCI_dataset_10",
    pd.read_csv("Relation Network (Directed).data", header=None,
                sep=",").values[:, 1:].astype(float),
)