import pandas as pd
from sklearn.preprocessing import MinMaxScaler as minmax

def load_data(train_file='./fashion-mnist_train.csv', test_file='fashion-mnist_test.csv'):
    # Column 0 of each CSV holds the label (10 classes); the remaining columns are pixel values.
    train_df = pd.read_csv(train_file)
    test_df = pd.read_csv(test_file)
    train_labels = train_df.iloc[:, 0].to_numpy()
    train_features = train_df.iloc[:, 1:].to_numpy()
    test_labels = test_df.iloc[:, 0].to_numpy()
    test_features = test_df.iloc[:, 1:].to_numpy()
    # Fit the scaler on the training features only, then reuse it on the test set.
    scaler = minmax()
    train_features = scaler.fit_transform(train_features)
    test_features = scaler.transform(test_features)
    return (train_features, train_labels, test_features, test_labels)
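# A minimal usage sketch, assuming the two Fashion-MNIST CSVs exist at the
# paths above; it only confirms the shapes that load_data() returns.
train_features, train_labels, test_features, test_labels = load_data()
print("train:", train_features.shape, train_labels.shape)  # e.g. (60000, 784) (60000,)
print("test:", test_features.shape, test_labels.shape)     # e.g. (10000, 784) (10000,)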
from sklearn.preprocessing import StandardScaler as standard
# 2) minmax scaling: maps the minimum of each column to 0 and the maximum to 1
from sklearn.preprocessing import MinMaxScaler as minmax

# 6.2 Running the scalers
# 1) standard scaling
m_sc1 = standard()
m_sc1.fit(train_x)                        # compute mean and std of each feature
m_sc1.transform(train_x)                  # standardize using those statistics
m_sc1.transform(train_x).mean(axis=0)     # effectively 0
m_sc1.transform(train_x).std(axis=0)      # 1

# 2) minmax scaling
m_sc2 = minmax()
m_sc2.fit(train_x)                        # find min and max of each feature
m_sc2.transform(train_x)                  # rescale so min -> 0 and max -> 1
m_sc2.transform(train_x).min(axis=0)      # 0
m_sc2.transform(train_x).max(axis=0)      # 1

# If the train set is scaled, the test set must be scaled the same way
m_sc1.transform(test_x)
m_sc2.transform(test_x)
m_sc2.transform(test_x).min(axis=0)       # not exactly 0, because the scaler was fit on train
m_sc2.transform(test_x).max(axis=0)       # not exactly 1, for the same reason
# Fitting a fresh scaler on test brings the min/max back to exactly 0 and 1
m_sc3 = minmax()
m_sc3.fit(test_x)
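# A self-contained sketch of the point above, using made-up data (the arrays
# here are illustrative, not part of the original exercise): a MinMaxScaler
# fit on train generally does not map the test columns exactly onto [0, 1].
import numpy as np
from sklearn.preprocessing import MinMaxScaler

rng = np.random.default_rng(0)
demo_train = rng.uniform(0, 10, size=(100, 3))
demo_test = rng.uniform(-2, 12, size=(20, 3))   # wider range than train

sc = MinMaxScaler().fit(demo_train)             # learns the train min/max only
print(sc.transform(demo_test).min(axis=0))      # can fall below 0
print(sc.transform(demo_test).max(axis=0))      # can rise above 1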
import pandas as pd
from sklearn.preprocessing import MinMaxScaler as minmax

# Assumes df (a DataFrame with a 'target' column) and
# real_data (the target == 0 rows) were defined above.
def get_data():
    fake_data = df[df['target'] == 1]
    fake_data = fake_data.iloc[::10, :]          # keep every 10th fake row
    original_data = pd.concat([real_data, fake_data], axis=0)
    dataset = original_data.to_numpy()
    X_test = dataset[:, :4]
    # Take every other real row plus all fake rows as X_train
    rd_ = real_data.iloc[::2, :]
    fd_ = df[df['target'] == 1]
    X_train = pd.concat([rd_, fd_], axis=0)
    X_train = X_train.to_numpy()
    X_train = X_train[:, :4]
    return (X_train, X_test)

X_train, X_test = get_data()
# Fit the scaler on X_train only and reuse it on X_test
scaler = minmax()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print("Shape of X_train and X_test:", X_train.shape, X_test.shape)
batch_size = X_train.shape[0]

def train_next_batch(i, batch_size):
    global X_train
    if i + batch_size > X_train.shape[0]:
        return X_train[i:X_train.shape[0], :]
    else:
        return X_train[i:i + batch_size, :]

def test_next_batch(i, batch_size):
    global X_test
    if i + batch_size > X_test.shape[0]:
        return X_test[i:X_test.shape[0], :]
    else:
        return X_test[i:i + batch_size, :]
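# A minimal sketch of how these helpers could drive a training loop; the
# epoch count and the train_step() call are hypothetical placeholders, not
# part of the original code.
num_epochs = 10                                  # hypothetical value
for epoch in range(num_epochs):
    i = 0
    while i < X_train.shape[0]:
        batch = train_next_batch(i, batch_size)
        # train_step(batch)                      # hypothetical training call
        i += batch_size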
import pandas as pd
import tensorflow as tf  # TF 1.x-style API
from sklearn.preprocessing import MinMaxScaler as minmax

init = tf.global_variables_initializer()

# Load the vectors and labels from the two CSV files.
def get_data(file_='./Fashion_Mnist_Vectors.csv'):
    df = pd.read_csv(file_)
    df2 = pd.read_csv('./Fashion_Mnist_10.csv')
    labels = df2['label'].to_numpy()
    dataset = df.iloc[:, 1:].to_numpy()
    return (dataset, labels)

X_train, y_train = get_data()
X_train = minmax().fit_transform(X_train)
print("Shape of X_train and y_train:", X_train.shape, y_train.shape)

batch_size = X_train.shape[0]                    # a single full-dataset batch
num_batches = X_train.shape[0] // batch_size     # = 1 with the batch size above
num_steps = num_batches * num_epochs             # num_epochs is assumed defined earlier
print(num_steps)

def train_next_batch(i, batch_size):
    global X_train
    if i + batch_size > X_train.shape[0]:
        return X_train[i:X_train.shape[0], :]
    else:
        return X_train[i:i + batch_size, :]
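# A TF 1.x-style sketch of how init and the batch helper fit together; the
# loss/optimizer graph is omitted here, so the session only runs the
# initializer (illustrative, not the original training code).
with tf.Session() as sess:
    sess.run(init)
    for step in range(num_steps):
        batch = train_next_batch((step * batch_size) % X_train.shape[0], batch_size)
        # sess.run(train_op, feed_dict={X: batch})   # hypothetical training op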
from sklearn.preprocessing import StandardScaler as standard
m_sc1 = standard()
m_sc1.fit(train_x)
train_x_sc1 = m_sc1.transform(train_x)
test_x_sc1 = m_sc1.transform(test_x)       # reuse the train-fitted scaler; do not refit on test

# step 2) fit and evaluate the model
m_knn.fit(train_x_sc1, train_y)
m_knn.score(test_x_sc1, test_y)            # 0.947 => with standard scaling, same as before scaling

# 2-2) min-max method
# step 1) build the scaled data
from sklearn.preprocessing import MinMaxScaler as minmax
m_sc2 = minmax()
m_sc2.fit(train_x)
train_x_sc2 = m_sc2.transform(train_x)
test_x_sc2 = m_sc2.transform(test_x)       # again, transform only; the scaler stays fit on train

# step 2) fit and evaluate the model
m_knn.fit(train_x_sc2, train_y)
m_knn.score(test_x_sc2, test_y)            # 0.959 => min-max scaling improves on the unscaled score

# 3) fit on all interaction terms (with the min-max scaled data)
# step 1) build the model
from sklearn.preprocessing import PolynomialFeatures as poly
m_poly = poly(degree=2)                    # up to second-degree terms
m_poly.fit(train_x_sc2)
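# A sketch of the natural next step, assuming m_knn is the KNN model used
# above: expand both sets with the fitted PolynomialFeatures, then refit
# and rescore (the exact score is not taken from the original notes).
train_x_poly = m_poly.transform(train_x_sc2)
test_x_poly = m_poly.transform(test_x_sc2)
m_knn.fit(train_x_poly, train_y)
print(m_knn.score(test_x_poly, test_y))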
# 5) check how the data points are distributed (scatter plot)
import mglearn
mglearn.discrete_scatter(train_x_pca[:, 0], train_x_pca[:, 1], train_y)

# -------------------------------- Exercises -------------------------------- #
# Exercise 7. Apply PCA + SVM to the cancer data
# 1) data loading
import pandas as pd
from sklearn.model_selection import train_test_split

df1 = pd.read_csv('cancer.csv')
Y = df1.iloc[:, 1]
X = df1.iloc[:, 2:]
train_x, test_x, train_y, test_y = train_test_split(X, Y, random_state=0)

# 2) scaling
from sklearn.preprocessing import MinMaxScaler as minmax
m_sc = minmax()
m_sc.fit(train_x)
train_x_sc = m_sc.transform(train_x)
test_x_sc = m_sc.transform(test_x)

# 3) derive the artificial variables (principal components)
from sklearn.decomposition import PCA
m_pca = PCA(n_components=2)
m_pca.fit(train_x_sc)
train_x_sc_pca = m_pca.transform(train_x_sc)
test_x_sc_pca = m_pca.transform(test_x_sc)
m_pca.components_

# 4) apply SVM
from sklearn.svm import SVC
m_svm = SVC()
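# A sketch of completing the exercise: fit the SVM on the two principal
# components and score on the test split (the resulting accuracy depends
# on the data; no figure is claimed here).
m_svm.fit(train_x_sc_pca, train_y)
print(m_svm.score(test_x_sc_pca, test_y))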