Example #1
import pandas as pd
from sklearn.preprocessing import MinMaxScaler as minmax

def load_data(train_file='./fashion-mnist_train.csv',test_file='fashion-mnist_test.csv'):
    # the label (10 classes) is the first column of each CSV; the remaining columns are pixels
    train_df=pd.read_csv(train_file)
    test_df=pd.read_csv(test_file)
    train_labels=train_df.iloc[:,0].to_numpy()
    train_features=train_df.iloc[:,1:].to_numpy()
    train_features=minmax().fit_transform(train_features)
    test_labels=test_df.iloc[:,0].to_numpy()
    test_features=test_df.iloc[:,1:].to_numpy()
    test_features=minmax().fit_transform(test_features)
    return (train_features,train_labels,test_features,test_labels)
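
# A minimal usage sketch for the loader above; it assumes the two Fashion-MNIST
# CSVs named in the default arguments are present in the working directory.
if __name__ == '__main__':
    train_X, train_y, test_X, test_y = load_data()
    print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)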
Example #2
from sklearn.preprocessing import StandardScaler as standard

# 2) minmax scaling: rescale each feature so its minimum becomes 0 and its maximum becomes 1
from sklearn.preprocessing import MinMaxScaler as minmax

# 6.2 run the scaling
# 1) standard scaling
m_sc1 = standard()
m_sc1.fit(train_x)        # compute the mean and standard deviation of each feature
m_sc1.transform(train_x)  # standardize using the values computed above

m_sc1.transform(train_x).mean(axis = 0)    # effectively 0
m_sc1.transform(train_x).std(axis = 0)     # 1
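
# A quick self-contained check of what standard() computes, column by column:
# z = (x - mean) / std. The toy array below is an assumption, not course data.
import numpy as np
_toy = np.array([[1., 10.], [2., 20.], [3., 30.]])
_by_hand = (_toy - _toy.mean(axis=0)) / _toy.std(axis=0)
print(np.allclose(_by_hand, standard().fit_transform(_toy)))   # True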

# 2) minmax scaling
m_sc2 = minmax()
m_sc2.fit(train_x)        # compute the min and max of each feature
m_sc2.transform(train_x)  # map the min to 0 and the max to 1

m_sc2.transform(train_x).min(axis = 0)    # 0
m_sc2.transform(train_x).max(axis = 0)    # 1
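
# The same kind of check for minmax(): scaled = (x - min) / (max - min) per column.
# The toy array is again an assumption.
import numpy as np
_toy = np.array([[1., 10.], [2., 20.], [3., 30.]])
_by_hand = (_toy - _toy.min(axis=0)) / (_toy.max(axis=0) - _toy.min(axis=0))
print(np.allclose(_by_hand, minmax().fit_transform(_toy)))   # True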

# once the train set is scaled, the test set must be scaled as well
m_sc1.transform(test_x)
m_sc2.transform(test_x)

m_sc2.transform(test_x).min(axis = 0)    # not exactly 0 => the scaler was fit on the train set
m_sc2.transform(test_x).max(axis = 0)    # not exactly 1 => the scaler was fit on the train set
# fitting on the test set and checking again does give 0 and 1 again
m_sc3 = minmax()
m_sc3.fit(test_x)
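
# A small sketch of the point above, with made-up arrays (an assumption): a scaler
# fit on the train data can push test values outside [0, 1], while re-fitting on
# the test data forces them back to exactly 0 and 1.
import numpy as np
_tr = np.array([[0.], [5.], [10.]])
_te = np.array([[-2.], [12.]])
print(minmax().fit(_tr).transform(_te).ravel())   # [-0.2  1.2] -> outside [0, 1]
print(minmax().fit_transform(_te).ravel())        # [0. 1.]
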
# assumes df (the full frame) and real_data (the rows with target==0) are
# defined by earlier lines of this example that are not shown here
def get_data():
    fake_data=df[df['target']==1]
    fake_data=fake_data.iloc[::10,:]
    original_data=pd.concat([real_data,fake_data],axis=0)
    dataset=original_data.to_numpy()
    X_test=dataset[:,:4]

    # take 70% of the original data as X_train and 30% as X_val
    rd_=real_data.iloc[::2,:]
    fd_=df[df['target']==1]
    X_train=pd.concat([rd_,fd_],axis=0)
    X_train=X_train.to_numpy()
    X_train=X_train[:,:4]
    return (X_train,X_test)

X_train,X_test=get_data()
X_train=minmax().fit_transform(X_train)
X_test=minmax().fit_transform(X_test)
print("Shape of X_train and X_test:",X_train.shape,X_test.shape)
batch_size=X_train.shape[0]

def train_next_batch(i,batch_size):
    global X_train
    if(i+batch_size > X_train.shape[0]):
       return X_train[i:X_train.shape[0],:]
    else:
       return X_train[i:i+batch_size,:]

def test_next_batch(i,batch_size):
    global X_test
    if(i+batch_size > X_test.shape[0]):
       return X_test[i:X_test.shape[0],:]
    else:
       return X_test[i:i+batch_size,:]
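
# A sketch of how the helpers above would typically be driven (the batch size of
# 128 is an assumption; the actual TensorFlow training step is omitted here).
for offset in range(0, X_train.shape[0], 128):
    batch_xs = train_next_batch(offset, 128)
    # feed batch_xs to the model, e.g. sess.run(train_op, feed_dict={x: batch_xs})
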
init = tf.global_variables_initializer()


# load the feature vectors and labels
# from the two CSV files.
def get_data(file_='./Fashion_Mnist_Vectors.csv'):
    df = pd.read_csv(file_)
    df2 = pd.read_csv('./Fashion_Mnist_10.csv')
    labels = df2['label'].to_numpy()
    dataset = df.iloc[:, 1:].to_numpy()

    return (dataset, labels)


X_train, y_train = get_data()
X_train = minmax().fit_transform(X_train)
print("Shape of X_train and y_train:", X_train.shape, y_train.shape)
batch_size = X_train.shape[0]

# num_epochs must be defined before this point (not shown in this excerpt);
# with batch_size equal to the full dataset, num_batches is 1, so num_steps equals num_epochs
num_batches = X_train.shape[0] // batch_size
num_steps = num_batches * num_epochs
print(num_steps)


def train_next_batch(i, batch_size):
    global X_train
    if (i + batch_size > X_train.shape[0]):
        return X_train[i:X_train.shape[0], :]
    else:
        return X_train[i:i + batch_size, :]
from sklearn.preprocessing import StandardScaler as standard
m_sc1 = standard()
m_sc1.fit(train_x)
train_x_sc1 = m_sc1.transform(train_x)

# note: this re-fits the scaler on the test set; reusing the train-fitted scaler
# (as in the exercise further down) is the usual practice
m_sc1.fit(test_x)
test_x_sc1 = m_sc1.transform(test_x)

# step 2) fit and evaluate the model
m_knn.fit(train_x_sc1, train_y)
m_knn.score(test_x_sc1, test_y)  # 0.947 => with standard scaling, same score as before scaling

# 2-2) min-max scaling
# step 1) create the scaled data
from sklearn.preprocessing import MinMaxScaler as minmax
m_sc2 = minmax()
m_sc2.fit(train_x)
train_x_sc2 = m_sc2.transform(train_x)

m_sc2.fit(test_x)
test_x_sc2 = m_sc2.transform(test_x)

# step 2) fit and evaluate the model
m_knn.fit(train_x_sc2, train_y)
m_knn.score(test_x_sc2, test_y)  # 0.959 => with min-max scaling, the score improves over the unscaled run*
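
# A hedged alternative to the manual fit/transform steps above: sklearn's Pipeline
# keeps the scaler fit on the training data only. The KNN settings are assumptions.
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
m_pipe = make_pipeline(minmax(), KNeighborsClassifier())
m_pipe.fit(train_x, train_y)
m_pipe.score(test_x, test_y)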

# 3) fit with all interaction terms (on the min-max scaled data)
# step 1) create the model
from sklearn.preprocessing import PolynomialFeatures as poly
m_poly = poly(degree=2)  # include terms up to degree 2
m_poly.fit(train_x_sc2)
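
# A sketch of the step that would naturally follow (an assumption about usage):
# expand the scaled data with the fitted transformer and check the new width.
train_x_sc2_poly = m_poly.transform(train_x_sc2)
test_x_sc2_poly = m_poly.transform(test_x_sc2)
print(train_x_sc2_poly.shape)   # bias term, original features, squares, and pairwise products
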
# 5) check the distribution of the data points (scatter plot)
import mglearn
mglearn.discrete_scatter(train_x_pca[:, 0], train_x_pca[:, 1], train_y)
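
# An equivalent plot without mglearn (plain matplotlib; the marker size is an
# assumption, and labels are factorized so string classes also work).
import matplotlib.pyplot as plt
_codes, _ = pd.factorize(pd.Series(train_y))
plt.scatter(train_x_pca[:, 0], train_x_pca[:, 1], c=_codes, s=15)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()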

# -------------------------------- Exercises -------------------------------- #
# Exercise 7. Apply PCA + SVM to the cancer data
# 1) data loading
df1 = pd.read_csv('cancer.csv')
Y = df1.iloc[:, 1]
X = df1.iloc[:, 2:]

from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(X, Y, random_state = 0)

# 2) scaling
from sklearn.preprocessing import MinMaxScaler as minmax
m_sc = minmax()
m_sc.fit(train_x)
train_x_sc = m_sc.transform(train_x)
test_x_sc = m_sc.transform(test_x)

# 3) derive the principal components (derived variables)
from sklearn.decomposition import PCA
m_pca = PCA(n_components = 2)
m_pca.fit(train_x_sc)
train_x_sc_pca = m_pca.transform(train_x_sc)
test_x_sc_pca = m_pca.transform(test_x_sc)

m_pca.components_
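
# Optional check (an assumption about what one would inspect next): how much of the
# total variance the two retained components explain.
print(m_pca.explained_variance_ratio_)        # per-component share
print(m_pca.explained_variance_ratio_.sum())  # total share kept by 2 components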

# 4) apply SVM
from sklearn.svm import SVC
m_svm = SVC()
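
# A sketch of how the exercise would typically finish; the fit/score calls below
# are assumptions, since the original snippet stops at the constructor.
m_svm.fit(train_x_sc_pca, train_y)
m_svm.score(test_x_sc_pca, test_y)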