Example #1
def app_main():
    try:
        logger = lgr.Logger()
        print("--------application starting--------------")
        logger.info("--------application starting--------------")
        print("--------loading data----------------------")
        logger.info("--------loading data----------------------")
        dl = dataLoader.DataLoader()
        train_X, test_X = train_test_split(dl.DataFrame.copy(),
                                           test_size=0.3,
                                           random_state=42)
        # label = train_X["median_house_value"].copy()
        # housing = train_X.drop("median_house_value", axis=1)
        housing = test_X.drop("median_house_value", axis=1)
        #label_test = test_X["median_house_value"].copy()
        numeric_cols = list(housing.columns.values)
        category_cols = ["ocean_proximity"]
        numeric_cols.remove("ocean_proximity")
        dp = preproc.DataPreProcess(numeric_cols, category_cols)
        print("--------processing data-------------------")
        dataProcessed = dp.getProcessedData(housing)
        print(dataProcessed.shape)
        configMagt = cmfmagt.ConfigManager()
        engine = mlEngine.ModelEngine()
        print("------------loading model------------------")
        bestModel = engine.loadML(configMagt.config["APPSETTING"]["ml_path"])
        result = bestModel.predict(dataProcessed)
        print(result.shape)
    except Exception as e:
        print("Error : ", str(e))
Example #2
import numpy as np


def MakeDataset(data_path):
    # Build the dataset
    total_data = DataPreProcess.GetData(data_path)
    day_nums = len(total_data)//400  # reshape to (60, 400, 24): 60 days, 400 grids, 24 hours
    total_data = np.reshape(np.array(total_data), [day_nums, 400, 24])
    print(total_data.shape)
    train, test = [], []
    # Start from day 7: each sample needs the previous three days plus the same hour one week
    # earlier, so starting any earlier would index out of bounds.
    for day in range(7, total_data.shape[0]):
        for hour in range(total_data.shape[2]):
            # Build the closeness component
            closeness = []
            for city in range(total_data.shape[1]):
                temp = []
                if hour < 3:
                    # Keep the order consistent: append yesterday's hours first, then today's, never the reverse.
                    for i in range(0, 3-hour):
                        temp.append(total_data[day-1, city, -(3-hour-i)])
                    for i in range(hour):
                        temp.append(total_data[day, city, i])
                else:
                    for i in range(3):
                        temp.append(total_data[day, city, hour-3+i])
                closeness.append(temp)
            
            # Build the period component
            period = []
            for city in range(total_data.shape[1]):
                temp = []
                for i in range(3):
                    temp.append(total_data[day-3+i, city, hour])
                period.append(temp)
            
            # Build the trend component
            trend = total_data[day-7, :, hour]
            
            # Build the label
            label = total_data[day, :, hour]
            
            # Reshape each component onto the 20x20 grid
            closeness = np.reshape(np.array(closeness), [20, 20, 3])
            period = np.reshape(np.array(period), [20, 20, 3])
            trend = np.reshape(np.array(trend), [20, 20, 1])
            label = np.reshape(np.array(label), [20, 20, 1])
            
            # Stack the components along the channel axis
            data = np.c_[closeness, period, trend, label]  # shape (20, 20, 8)
            # print(data.shape)
            
            # Days 7..46 (40 days) go to the training set; the remaining days go to the test set
            if day <= 46:
                train.append(data)
            else:
                test.append(data)
    
    train = np.array(train) # [960, 20, 20, 8] 40*24 = 960
    test = np.array(test)   # [312, 20, 20, 8] 13*24 = 312
    
    return train, test
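A hypothetical usage sketch: the file name is illustrative, and it assumes DataPreProcess.GetData returns the 60 days x 400 grids x 24 hours of values described above.

train, test = MakeDataset("grid_counts.txt")   # hypothetical path
print(train.shape)   # expected (960, 20, 20, 8): days 7..46, 24 hours each
print(test.shape)    # expected (312, 20, 20, 8): days 47..59, 24 hours each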
Example #3
import codecs

import numpy as np


def DecodeData(data_path, max_min_path):
    data = np.array(DataPreProcess.GetData(data_path))
    # Undo the min-max normalization
    with codecs.open(max_min_path, 'r', 'utf-8') as r:
        max_min_str = r.readlines()
    max_min = [[float(value) for value in line.split()] for line in max_min_str]

    # After transposing, the shape is 2 x 24000: the max and the min for each cell
    max_min = np.array(max_min).T
    data = data * (max_min[0] - max_min[1]) + max_min[1]
    return data
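For reference, a minimal sketch of the forward min-max scaling that DecodeData inverts, under the assumption that each line of the max-min file holds a cell's maximum followed by its minimum.

import numpy as np

def EncodeData(data, max_min):
    # max_min: [max, min] rows, one per cell; transposed so that row 0 holds the
    # maxima and row 1 the minima, matching DecodeData above
    max_min = np.array(max_min).T
    return (data - max_min[1]) / (max_min[0] - max_min[1])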
Example #4
def load_data(filename, batch_size, n_words=10000):
    # Extract the data
    PreProcessObj = DataPreProcess.DataPreProcess(filename, batch_size)
    # Create batches of equal-length sentences for training and target
    trainSet, decodeInputSet, targetSet = PreProcessObj.makeInputAndLabels()
    # Build the dictionary
    word_to_id = PreProcessObj.build_vocab(n_words=n_words)
    # Get the training, decoder-input and target sets
    train_data = PreProcessObj.encodeSentencesToIds(trainSet, word_to_id)
    decoder_inputdata = PreProcessObj.encodeSentencesToIds(
        decodeInputSet, word_to_id)
    test_data = PreProcessObj.encodeSentencesToIds(targetSet, word_to_id)
    vocabulary = PreProcessObj.VocabularySize
    sentence_ids = PreProcessObj.sentence_id
    num_steps = PreProcessObj.seq_len
    max_len = PreProcessObj.max_len

    print("Number of sentences is ", train_data.shape[0])
    print("Print seq lens")
    print("Max len of sentences")
    print(max_len)
    return train_data, decoder_inputdata, test_data, vocabulary, sentence_ids, num_steps, max_len
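encodeSentencesToIds belongs to the author's DataPreProcess class; the following is a rough sketch of what such a step typically does (an assumption, shown only to make the id mapping concrete).

def encode_sentences(sentences, word_to_id, unk_token="<UNK>"):
    # map every token to its vocabulary id, falling back to the <UNK> id
    unk_id = word_to_id.get(unk_token, 0)
    return [[word_to_id.get(word, unk_id) for word in sentence.split()]
            for sentence in sentences]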
Example #5
def main():
    print("Enter main()")
    #=============================================================================================
    # Unsupervised dimensionality reduction / feature extraction with PCA (Principal Component Analysis)
    # using the scikit-learn PCA implementation
    #=============================================================================================

    #====================================================
    #   Data Preprocessing
    #====================================================
    #----------------------------------------------------
    #   read & set data
    #----------------------------------------------------
    prePro = DataPreProcess.DataPreProcess()

    # Load the Wine dataset
    prePro.setDataFrameFromCsvFile( "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data" )
    
    # The Wine dataset at the URL above has no header row, so set the column names
    prePro.setColumns( 
        [
            'Class label', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium', 'Total phenols',
            'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
            'Color intensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline'
        ] 
    )

    prePro.print("Wine dataset")

    X_train, X_test, y_train, y_test \
    = DataPreProcess.DataPreProcess.dataTrainTestSplit( 
        X_input = prePro.df_.iloc[:, 1:].values,   # iloc: select by position (0-based); df_.iloc[:, 1:] = all rows, columns 1 onward
        y_input = prePro.df_.iloc[:, 0].values,
        ratio_test = 0.3
    )

    # Print the split data (training and test sets)
    print( "Training data : \n", X_train )
    print("Test data : \n", X_test )
    print("Training labels : \n", y_train )
    print("Test labels : \n", y_test )

    # Feature scaling (standardization)
    X_train_std, X_test_std \
    = DataPreProcess.DataPreProcess.standardizeTrainTest( X_train, X_test )

    # Print the standardized data
    print( "Training data [standardized] :\n", X_train_std )
    print("Test data [standardized] : \n", X_test_std )
    
    #========================================================================
    # Learning Process (scikit-learn PCA class) & logistic regression
    #========================================================================
    # Dimensionality reduction with PCA
    pca = PCA( n_components = 2 )   # n_components : number of principal components (PC1, PC2)
    
    X_train_pca = pca.fit_transform( X_train_std )
    X_test_pca = pca.transform( X_test_std )

    # Logistic regression
    logReg = LogisticRegression()
    logReg = logReg.fit( X_train_pca, y_train )     # fit on the dimensionality-reduced training data X_train_pca

    #====================================================
    #   Evaluating generalization performance
    #====================================================

    #-------------------------------
    # Compute & print the classification accuracy
    #-------------------------------
    y_predict1 = logReg.predict( X_train_pca )
    y_predict2 = logReg.predict( X_test_pca )

    print("<Classification results>")

    print("classifier1 : logistic regression \n ( training data reduced by PCA )")
    # Print the number of misclassified samples
    print( "Misclassified samples : %d" % (y_train != y_predict1).sum() )
    # Print the classification accuracy
    print( "Accuracy : %.2f" % accuracy_score(y_train, y_predict1) )

    print("classifier2 : logistic regression \n ( test data reduced by PCA )")
    # Print the number of misclassified samples
    print( "Misclassified samples : %d" % (y_test != y_predict2).sum() )
    # Print the classification accuracy
    print( "Accuracy : %.2f" % accuracy_score(y_test, y_predict2) )

    #--------------------------------------------------------
    # Scatter plot in the principal-component space (13 dims reduced to 2)
    #--------------------------------------------------------
    # Training data
    Plot2D.Plot2D.drawDiscriminantRegions( 
        dat_X = X_train_pca, dat_y = y_train,
        classifier = logReg
    )
    plt.title("Classification result - training data \n Logistic Regression (dimensionality reduced by PCA)")
    plt.xlabel('PC 1')
    plt.ylabel('PC 2')
    plt.legend(loc='upper left')
    plt.tight_layout()

    # Save & show the figure
    plt.savefig("./PCA_scikit-learn_4.png", dpi = 300, bbox_inches = 'tight' )
    plt.show()

    # Test data
    Plot2D.Plot2D.drawDiscriminantRegions( 
        dat_X = X_test_pca, dat_y = y_test,
        classifier = logReg
    )

    plt.title("Idefication Result - test data \n Logistic Regression (dimension is deleted by PCA)")
    plt.xlabel('PC 1')
    plt.ylabel('PC 2')
    plt.legend(loc='upper left')
    plt.tight_layout()

    # Save & show the figure
    plt.savefig("./PCA_scikit-learn_5.png", dpi = 300, bbox_inches = 'tight' )
    plt.show()

    print("Finish main()")
    return
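The same standardize -> PCA -> logistic regression flow can also be written as a scikit-learn Pipeline. A compact sketch follows; it uses plain scikit-learn classes (not the author's wrappers) and assumes X_train, X_test, y_train, y_test from the split above.

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

pipe = Pipeline([("scl", StandardScaler()),
                 ("pca", PCA(n_components=2)),
                 ("clf", LogisticRegression())])
pipe.fit(X_train, y_train)
print("Test accuracy : %.2f" % pipe.score(X_test, y_test))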
                        "BidPrice5","AskPrice5","BidPrice6","AskPrice6","BidPrice7","AskPrice7","BidPrice8","AskPrice8",
                        "BidPrice9","AskPrice9","BidPrice10","AskPrice10","BidPrice11","AskPrice11","BidPrice12","AskPrice12",
                        "BidPrice13","AskPrice13","BidPrice14","AskPrice14","BidPrice15","AskPrice15","BidPrice16","AskPrice16",
                        "BidPrice17","AskPrice17","BidPrice18","AskPrice18","BidPrice19","AskPrice19","BidPrice20","AskPrice20"]]\
            =book_dat.loc[:,["BidPrice1","AskPrice1","BidPrice2","AskPrice2","BidPrice3","AskPrice3","BidPrice4","AskPrice4",
                        "BidPrice5","AskPrice5","BidPrice6","AskPrice6","BidPrice7","AskPrice7","BidPrice8","AskPrice8",
                        "BidPrice9","AskPrice9","BidPrice10","AskPrice10","BidPrice11","AskPrice11","BidPrice12","AskPrice12",
                        "BidPrice13","AskPrice13","BidPrice14","AskPrice14","BidPrice15","AskPrice15","BidPrice16","AskPrice16",
                        "BidPrice17","AskPrice17","BidPrice18","AskPrice18","BidPrice19","AskPrice19","BidPrice20","AskPrice20"]]/100

        # =============================================================================
        # Combine all message orders arriving at the same time
        # and split buy and sell MOs
        # =============================================================================

        message_new_dat, book_new_dat = DataPreProcess.mergemessage(
            message_dat, book_dat)

        # Pick out buy and sell market orders
        MO_Plus = message_new_dat.loc[(message_new_dat['Order type'] == 'MO')
                                      & (message_new_dat['Direction'] == 83)]
        MO_Minus = message_new_dat.loc[(message_new_dat['Order type'] == 'MO')
                                       & (message_new_dat['Direction'] == 66)]

        # =============================================================================
        # Preset unequally spaced action times based on the following principles:
        # 1. pi^+ = pi^- = 0.4
        # 2. The number of arrivals between two consecutive action times is around 1
        # Note: pi(1,1) is not set here; it is computed after fixing the action times
        # =============================================================================

        # Split the time interval into 5 equally spaced parts between two consecutive even-indexed MOs
Example #7
"""
코드 작성일시 : 2020년 1월 11일
코드 내용 : 각종 테스트를 위한 임시코드들
"""
import cv2
import DataPreProcess as DPP

cap = cv2.VideoCapture(0)

print('width :%d, height : %d' % (cap.get(3), cap.get(4)))

while True:
    ret, frame = cap.read()  # read success flag and frame

    if ret:
        image = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)  # convert the captured frame to grayscale
        image = cv2.GaussianBlur(image, (5, 5), 0)
        result1 = DPP.filter_binary(image)
        result2 = DPP.filter_binaryinv(image)
        cv2.imshow('binary', result1)  # show the binarized frame
        cv2.imshow('binaryinv', result2)  # show the inverse-binarized frame

        #cv2.imshow('frame_gray', image)    # show the grayscale frame
        if cv2.waitKey(1) == ord('q'):
            break
cap.release()
cv2.destroyAllWindows()
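filter_binary and filter_binaryinv are the author's own helpers; a plausible sketch of what they might wrap, using plain OpenCV thresholding (the threshold value 127 is an illustrative assumption).

import cv2

def filter_binary(gray):
    # pixels above the threshold become white, the rest black
    _, out = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY)
    return out

def filter_binaryinv(gray):
    # inverted variant of the same thresholding
    _, out = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY_INV)
    return out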
Example #8
def char_removal(x):
    return re.findall("[0-9]*$", x)[0]


df['Ticket'] = df['Ticket'].apply(char_removal)
maxTicket = max(df['Ticket'])

# Selecting data to apply the neural network to:
data_nn = df.loc[:, ('Survived', 'PassengerId', 'Pclass', 'Sex', 'Age',
                     'SibSp', 'Parch', 'Ticket', 'Fare', 'Embarked')]
data_nn = data_nn.apply(pd.to_numeric, axis=1)
data_nn = data_nn.dropna()
data_nn['Ticket'] = data_nn['Ticket'] / max(data_nn['Ticket'])
normColumns = ['PassengerId', 'Age', 'Fare']
data_nn.loc[:, normColumns] = dpp.normalize_data(data_nn.loc[:, normColumns])
data_nn['Pclass'] = data_nn['Pclass'] - 2
data_nn['Embarked'] = data_nn['Embarked'] - 1
Y = dpp.data_std_shape(data_nn['Survived'])
X = dpp.data_std_shape(data_nn.iloc[:, 1:])

X_seg, Y_seg = dpp.data_seg(X, Y, [0.75, 0.25, 0])
X_seg = X_seg[:2]
Y_seg = Y_seg[:2]

# Applying NN:
net1 = nn.Network(X_seg[0], Y_seg[0])
net1.train(2, [2, 2], alpha=20, printIteration=True, momentum=True)
net1.set_cross_data(X_seg[1], Y_seg[1])
net1.predict_targets()
sum(net1.prediction)
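dpp here is the author's preprocessing module; minimal sketches of what normalize_data and data_std_shape are assumed to do (illustrative only, not the original code).

import numpy as np

def normalize_data(df):
    # scale each column to the [0, 1] range
    return (df - df.min()) / (df.max() - df.min())

def data_std_shape(data):
    # return a 2-D float array with one row per sample
    arr = np.asarray(data, dtype=float)
    return arr.reshape(len(arr), -1)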
Example #9
def main():
    print("Enter main()")
    #=============================================================================================
    # Unsupervised dimensionality reduction / feature extraction with PCA (Principal Component Analysis)
    # without using the scikit-learn PCA implementation
    #=============================================================================================

    #====================================================
    #   Data Preprocessing
    #====================================================
    #----------------------------------------------------
    #   read & set data
    #----------------------------------------------------
    prePro = DataPreProcess.DataPreProcess()

    # Load the Wine dataset
    prePro.setDataFrameFromCsvFile(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
    )

    # The Wine dataset at the URL above has no header row, so set the column names
    prePro.setColumns([
        'Class label', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash',
        'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols',
        'Proanthocyanins', 'Color intensity', 'Hue',
        'OD280/OD315 of diluted wines', 'Proline'
    ])

    prePro.print("Wine dataset")

    X_train, X_test, y_train, y_test \
    = DataPreProcess.DataPreProcess.dataTrainTestSplit(
        X_input = prePro.df_.iloc[:, 1:].values,   # iloc: select by position (0-based); df_.iloc[:, 1:] = all rows, columns 1 onward
        y_input = prePro.df_.iloc[:, 0].values,
        ratio_test = 0.3
    )

    # Print the split data (training and test sets)
    print("Training data : \n", X_train)
    print("Test data : \n", X_test)
    print("Training labels : \n", y_train)
    print("Test labels : \n", y_test)

    # Feature scaling (standardization)
    X_train_std, X_test_std \
    = DataPreProcess.DataPreProcess.standardizeTrainTest( X_train, X_test )

    # Print the standardized data
    print("Training data [standardized] :\n", X_train_std)
    print("Test data [standardized] : \n", X_test_std)

    #========================================================================
    # PCA: compute the eigenvalues & proportions of variance and reduce the dimensionality (feature extraction)
    #========================================================================
    # Build the covariance matrix from (the transpose of) the standardized training data
    Conv_mat = numpy.cov(X_train_std.T)

    # Compute the eigenvalues and eigenvectors
    # numpy.linalg.eig() performs the eigendecomposition,
    # producing 13 eigenvalues and the corresponding eigenvectors.
    eigen_values, eigen_vecs = numpy.linalg.eig(Conv_mat)

    print('\nEigenvalues \n%s' % eigen_values)
    print('\nEigenvectors \n%s' % eigen_vecs)

    # Sum of the eigenvalues
    eigen_total = sum(eigen_values)

    # Proportion of the variance explained by each component (list comprehension)
    var_ratio = [(ramda / eigen_total)
                 for ramda in sorted(eigen_values, reverse=True)]

    print("\nProportion of the variance \n|%-5s|" % var_ratio)

    # Cumulative proportion of the variance
    cum_var_ratio = numpy.cumsum(var_ratio)

    print("\nCumulative proportion of the variance \n|%-5s|" % cum_var_ratio)

    # Feature transformation (build the projection matrix)
    # List of (eigenvalue, eigenvector) tuples
    eigen_pairs = [(numpy.abs(eigen_values[i]), eigen_vecs[:, i])
                   for i in range(len(eigen_values))]

    # Sort the tuples in descending order of eigenvalue
    eigen_pairs.sort(key=lambda k: k[0], reverse=True)

    # Build the 13x2 projection matrix from the top two eigenvectors
    W_mat = numpy.hstack((eigen_pairs[0][1][:, numpy.newaxis],
                          eigen_pairs[1][1][:, numpy.newaxis])
                         )

    print('Matrix W:\n', W_mat)

    # Transform the training data with the projection matrix
    X_train_pca = X_train_std.dot(W_mat)

    #====================================================
    #   Evaluating generalization performance
    #====================================================
    #------------------------------------
    # Plot of each principal component's eigenvalue
    #------------------------------------
    # Clear the current figure
    plt.clf()

    # Bar chart (1st and 2nd principal components), red bars
    plt.bar(
        range(1, 3),
        eigen_values[0:2],
        alpha=1.0,
        align='center',
        #label = 'Eigenvalues',
        color="red")

    # Bar chart (3rd principal component onward), blue bars
    plt.bar(
        range(3, 14),
        eigen_values[2:13],
        alpha=1.0,
        align='center',
        #label = 'Eigenvalues',
        color="blue")

    #plt.grid()
    plt.axhline(1.0, color='gray', linestyle='--', linewidth=1)
    plt.axhline(2.0, color='gray', linestyle='--', linewidth=1)
    plt.axhline(3.0, color='gray', linestyle='--', linewidth=1)
    plt.axhline(4.0, color='gray', linestyle='--', linewidth=1)
    plt.axhline(5.0, color='gray', linestyle='--', linewidth=1)

    plt.xticks(range(1, 14), [
        "lamda_1", "lamda_2", "lamda_3", "lamda_4", "lamda_5", "lamda_6",
        "lamda_7", "lamda_8", "lamda_9", "lamda_10", "lamda_11", "lamda_12",
        "lamda_13"
    ],
               rotation=90)

    plt.title("Principal components - Eigenvalues (PCA)")
    plt.xlabel('Principal components')
    plt.ylabel('Eigenvalues')
    plt.legend(loc='best')
    plt.tight_layout()

    # Save & show the figure
    plt.savefig("./PCA_scikit-learn_1.png", dpi=300, bbox_inches='tight')
    plt.show()

    #----------------------------------------
    # Plot of the proportion & cumulative proportion of the variance per component
    #----------------------------------------
    # Clear the current figure
    plt.clf()

    # Bar chart (1st and 2nd principal components), red bars
    plt.bar(range(1, 3),
            var_ratio[0:2],
            alpha=1.0,
            align='center',
            label='Eigenvalues (principal component 1 and 2)',
            color="red")

    # Bar chart (3rd principal component onward), blue bars
    plt.bar(range(3, 14),
            var_ratio[2:13],
            alpha=1.0,
            align='center',
            label='Eigenvalues (principal component 3 and so on)',
            color="blue")

    # Step plot of the cumulative proportion of the variance
    plt.step(range(1, 14),
             cum_var_ratio,
             where='mid',
             label='cumulative proportion of the variance')

    plt.axhline(0.1, color='gray', linestyle='--', linewidth=1)
    plt.axhline(0.2, color='gray', linestyle='--', linewidth=1)
    plt.axhline(0.3, color='gray', linestyle='--', linewidth=1)
    plt.axhline(0.4, color='gray', linestyle='--', linewidth=1)
    plt.axhline(0.5, color='gray', linestyle='--', linewidth=1)
    plt.axhline(0.6, color='gray', linestyle='--', linewidth=1)
    plt.axhline(0.7, color='gray', linestyle='--', linewidth=1)
    plt.axhline(0.8, color='gray', linestyle='--', linewidth=1)
    plt.axhline(0.9, color='gray', linestyle='--', linewidth=1)
    plt.axhline(1.0, color='gray', linestyle='--', linewidth=1)

    plt.xticks(range(1, 14), range(1, 14))

    plt.title("Principal components - Proportion of the variance (PCA)")
    plt.xlabel('Principal components')
    plt.ylabel('Proportion of the variance \n individual explained variance')
    plt.legend(loc='best')
    plt.tight_layout()

    # Save & show the figure
    plt.savefig("./PCA_scikit-learn_2.png", dpi=300, bbox_inches='tight')
    plt.show()

    #--------------------------------------------------------
    # Scatter plot in the principal-component space (13 dims reduced to 2)
    #--------------------------------------------------------
    # Clear the current figure
    plt.clf()
    plt.grid()

    # Palette
    colors = ['r', 'b', 'g']
    markers = ['s', 'x', 'o']

    for l, c, m in zip(numpy.unique(y_train), colors, markers):
        plt.scatter(
            X_train_pca[y_train == l, 0],  # PC1 : class l (l=1,2,3)
            X_train_pca[y_train == l, 1],  # PC2 : class l (l=1,2,3)
            c=c,
            label=l,
            marker=m)

    plt.title(
        "Dimensionality-reduced Wine data (PCA) \n 13×178 dim → 2×124 dim [feature extraction]"
    )
    plt.xlabel('PC 1')
    plt.ylabel('PC 2')
    plt.legend(loc='upper left')
    plt.tight_layout()

    # Save & show the figure
    plt.savefig("./PCA_scikit-learn_3.png", dpi=300, bbox_inches='tight')
    plt.show()

    print("Finish main()")
    return
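As a cross-check, scikit-learn's PCA on the same standardized training data should reproduce the hand-computed proportions of variance. This short sketch assumes X_train_std and var_ratio from the example above.

from sklearn.decomposition import PCA

pca_check = PCA(n_components=2)
X_train_pca_check = pca_check.fit_transform(X_train_std)
print(pca_check.explained_variance_ratio_)   # compare with var_ratio[:2]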
Example #10
def main():
    """
    Machine learning workflow built on a machine learning pipeline (the scikit-learn Pipeline class).
    Evaluating model generalization with learning curves and validation curves.
    """
    print("Enter main()")

    # Load the data
    prePro = DataPreProcess.DataPreProcess()
    prePro.setDataFrameFromCsvFile(
        "https://raw.githubusercontent.com/rasbt/python-machine-learning-book/master/code/datasets/wdbc/wdbc.data"
    )
    #prePro.print( "Breast Cancer Wisconsin dataset" )

    dat_X = prePro.df_.loc[:, 2:].values
    dat_y = prePro.df_.loc[:, 1].values

    #===========================================
    # Preprocessing
    #===========================================
    # Handling missing data
    #prePro.meanImputationNaN()

    # Encode the label data
    prePro.encodeClassLabelByLabelEncoder(colum=1)
    prePro.print("Breast Cancer Wisconsin dataset")

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test \
    = DataPreProcess.DataPreProcess.dataTrainTestSplit( X_input = dat_X, y_input = dat_y, ratio_test = 0.2 )

    #-------------------------------------------
    # Pipeline setup
    #-------------------------------------------
    # Register each transformer and the estimator in the pipeline
    pipe_logReg = Pipeline(steps=[  # specified as tuples of (arbitrary name, transformer or estimator)
        ("scl", StandardScaler()),  # scaling: a transformer (has a fit() method)
        ("clf", LogisticRegression(penalty='l2', random_state=0)
         )  # logistic regression (L2 regularization): an estimator (has a predict() method)
    ])

    # Run the fit() method of the transformers registered in the pipeline
    #pipe_logReg.fit( X_train, y_train )

    #
    #print( "Test Accuracy: %.3f" % pipe_logReg.score( X_test, y_test ) )

    #============================================
    # Learning Process
    #===========================================
    # Run the predict() method of the estimator registered in the pipeline
    #y_predict = pipe_logReg.predict(X_test)
    #print("predict : ", y_predict )

    #===========================================
    # Checking generalization with a learning curve
    #===========================================
    # learning_curve() computes cross-validated accuracy scores
    train_sizes, train_scores, test_scores \
    = learning_curve(
          estimator = pipe_logReg,                      # estimator: the logistic regression configured in the Pipeline
          X = X_train,                                  #
          y = y_train,                                  #
          train_sizes = numpy.linspace(0.1, 1.0, 10),   # absolute or relative numbers of training samples:
                                                        # 10 evenly spaced relative values of the training-set size
          cv = 10,                                      # number of cross-validation folds
          n_jobs = -1                                   # run on all CPUs in parallel
      )

    # Compute means and standard deviations
    train_means = numpy.mean(train_scores, axis=1)  # axis = 1 : across the CV folds
    train_stds = numpy.std(train_scores, axis=1)
    test_means = numpy.mean(test_scores, axis=1)
    test_stds = numpy.std(test_scores, axis=1)

    #
    print("train_sizes : \n",
          train_sizes)  # 10 evenly spaced training-set sizes
    print("train_scores : \n", train_scores)
    print("test_scores : \n", test_scores)
    print("train_means : \n", train_means)
    print("train_stds : \n", train_stds)
    print("test_means : \n", test_means)
    print("test_stds : \n", test_stds)

    #-------------------------------------------
    # Plot the learning curve
    #-------------------------------------------
    Plot2D.Plot2D.drawLearningCurve(train_sizes=train_sizes,
                                    train_means=train_means,
                                    train_stds=train_stds,
                                    test_means=test_means,
                                    test_stds=test_stds)
    plt.title("Learning Curve \n LogisticRegression (L2 regularization)")
    plt.xlabel('Number of training samples')
    plt.ylabel('Accuracy')
    plt.legend(loc='best')
    plt.ylim([0.8, 1.01])
    plt.tight_layout()

    plt.savefig("./MachineLearningPipeline_scikit-learn_1.png",
                dpi=300,
                bbox_inches='tight')
    plt.show()

    #===========================================
    # Checking generalization with a validation curve
    #===========================================
    param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

    # validation_curve() computes cross-validated accuracy scores
    train_scores, test_scores \
    = validation_curve(
          estimator = pipe_logReg,      #
          X = X_train,
          y = y_train,
          param_name = 'clf__C',        # parameter of the "clf" pipeline step to vary
          param_range = param_range,
          cv = 10
      )

    # Overwrite the previous statistics
    train_means = numpy.mean(train_scores, axis=1)
    train_stds = numpy.std(train_scores, axis=1)
    test_means = numpy.mean(test_scores, axis=1)
    test_stds = numpy.std(test_scores, axis=1)

    #
    print("param_range : \n", param_range)
    print("train_scores : \n", train_scores)
    print("test_scores : \n", test_scores)
    print("train_means : \n", train_means)
    print("train_stds : \n", train_stds)
    print("test_means : \n", test_means)
    print("test_stds : \n", test_stds)

    #-------------------------------------------
    # Plot the validation curve
    #-------------------------------------------
    Plot2D.Plot2D.drawValidationCurve(param_range=param_range,
                                      train_means=train_means,
                                      train_stds=train_stds,
                                      test_means=test_means,
                                      test_stds=test_stds)
    plt.xscale('log')  # log scale
    plt.title("Validation Curve \n LogisticRegression (L2 regularization)")
    plt.xlabel('Parameter C [Reverse regularization parameter]')
    plt.ylabel('Accuracy')
    plt.legend(loc='best')
    plt.ylim([0.8, 1.01])
    plt.tight_layout()

    plt.savefig("./MachineLearningPipeline_scikit-learn_2.png",
                dpi=300,
                bbox_inches='tight')
    plt.show()

    print("Finish main()")
    return
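The param_name 'clf__C' follows scikit-learn's "step__parameter" naming convention for pipelines; the valid names can be listed from the pipeline itself. A small sketch, assuming pipe_logReg from the example above:

for name in sorted(pipe_logReg.get_params().keys()):
    if name.startswith("clf__"):
        print(name)   # e.g. clf__C, clf__penalty, ...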
Example #11
def main():
    #=========================================
    # Practice program for preprocessing in machine learning
    #=========================================
    print("Enter main()")

    #-----------------------------------------
    # Practice 1 : imputing missing values (NaN)
    #-----------------------------------------
    prePro1 = DataPreProcess.DataPreProcess()

    csv_data = '''
                  A,B,C,D
                  1.0,2.0,3.0,4.0
                  5.0,6.0,,8.0
                  10.0,11.0,12.0,
               '''

    prePro1.setDataFrameFromCsvData(csv_data)
    prePro1.print("csv data")

    prePro1.meanImputationNaN()
    prePro1.print("Mean imputation of missing values (NaN)")

    #--------------------------------------------------
    # Practice 2 : handling categorical data
    # nominal features and ordinal features
    #--------------------------------------------------
    prePro2 = DataPreProcess.DataPreProcess()

    # Build a pandas DataFrame from a list
    prePro2.setDataFrameFromList(
        list=[['green', 'M', 10.1, 'class1'], ['red', 'L', 13.5, 'class2'],
              ['blue', 'XL', 15.3, 'class1']])

    prePro2.print("Build a pandas DataFrame from a list")

    # Add column names to the pandas DataFrame
    prePro2.setColumns(['color', 'size', 'price', 'classlabel'])
    prePro2.print("Add column names to the pandas DataFrame")

    # Build a mapping (dictionary) for the ordinal feature 'size'
    dict_size = {'XL': 3, 'L': 2, 'M': 1}
    # Encode the ordinal feature as integers with the mapping
    prePro2.mappingOrdinalFeatures(key='size', input_dict=dict_size)
    prePro2.print("Build a mapping (dictionary) for the ordinal feature 'size' and encode it as integers")

    # Encode the class labels (dictionary-mapping approach)
    prePro2.encodeClassLabel("classlabel")
    prePro2.print("Encode the class labels (dictionary-mapping approach)")

    # One-hot encoding of the categorical data
    prePro2.oneHotEncode(categories=['color', 'size', 'price'], col=0)
    prePro2.print("One-hot encoding of the categorical data")

    #--------------------------------------------------
    # Practice 3 : splitting the dataset
    # into training data and test data
    #--------------------------------------------------
    prePro3 = DataPreProcess.DataPreProcess()

    # Load the Wine dataset
    prePro3.setDataFrameFromCsvFile(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
    )

    # The Wine dataset at the URL above has no header row, so set the column names
    prePro3.setColumns([
        'Class label', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash',
        'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols',
        'Proanthocyanins', 'Color intensity', 'Hue',
        'OD280/OD315 of diluted wines', 'Proline'
    ])

    prePro3.print("Wine dataset")

    X_train, X_test, y_train, y_test \
    = DataPreProcess.DataPreProcess.dataTrainTestSplit(
        X_input = prePro3.df_.iloc[:, 1:].values,   # iloc: select by position (0-based); df_.iloc[:, 1:] = all rows, columns 1 onward
        y_input = prePro3.df_.iloc[:, 0].values,
        ratio_test = 0.3
    )

    # Print the split data (training and test sets)
    print("Training data : \n", X_train)
    print("Test data : \n", X_test)
    print("Training labels : \n", y_train)
    print("Test labels : \n", y_test)

    #--------------------------------------------------
    # Practice 4 : feature scaling
    # normalization and standardization
    #--------------------------------------------------
    # Normalization
    X_train_norm, X_test_norm \
    = DataPreProcess.DataPreProcess.normalizedTrainTest( X_train, X_test )

    # Print the normalized data
    print("Training data [normalized] :\n", X_train_norm)
    print("Test data [normalized] : \n", X_test_norm)

    # Standardization
    X_train_std, X_test_std \
    = DataPreProcess.DataPreProcess.standardizeTrainTest( X_train, X_test )

    # Print the standardized data
    print("Training data [standardized] :\n", X_train_std)
    print("Test data [standardized] : \n", X_test_std)

    #--------------------------------------------------------
    # Practice 5 : selecting informative features
    # sparse solutions via L1 regularization (verified with a logistic regression model)
    #--------------------------------------------------------
    logReg = LogisticRegression(
        penalty='l1',  # L1 regularization
        C=0.1  # inverse regularization parameter
    )

    logReg.fit(X_train_std, y_train)

    print('Training accuracy:', logReg.score(X_train_std, y_train))
    print('Test accuracy:', logReg.score(X_test_std, y_test))

    print("切片 :", logReg.intercept_)
    print("重み係数 : \n", logReg.coef_)

    #----------------------------------------
    # Plotting
    fig = plt.figure()
    ax = plt.subplot(1, 1, 1)

    # Color list for each coefficient (feature)
    colors = [
        'blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'pink',
        'lightgreen', 'lightblue', 'gray', 'indigo', 'orange'
    ]

    # Lists of weight coefficients and inverse regularization parameters (initialized empty)
    weights = []
    params = []

    # Repeat for each value of the inverse regularization parameter
    for c in numpy.arange(-4., 6.):
        lr = LogisticRegression(penalty='l1', C=10.**c, random_state=0)
        lr.fit(X_train_std, y_train)
        weights.append(lr.coef_[1])
        params.append(10.**c)

    weights = numpy.array(weights)  # convert the weight coefficients to a numpy array

    # Plot each weight coefficient
    for column, color in zip(range(weights.shape[1]), colors):
        # line plot
        plt.plot(params,
                 weights[:, column],
                 label=prePro3.df_.columns[column + 1],
                 color=color)

    plt.grid()
    plt.axhline(0, color='black', linestyle='--', linewidth=3)
    plt.xlim([10**(-5), 10**5])
    plt.ylabel('weight coefficient')
    plt.xlabel('C [Reverse regularization parameter] (log scale)')
    plt.xscale('log')  # log scale on the x axis
    plt.legend(loc='lower left')
    """
    ax.legend(
        loc = 'upper center', 
        #bbox_to_anchor = (1.38, 1.03),
        ncol = 1
        #fancybox = True
    )
    """
    plt.tight_layout()

    plt.savefig('DataPreProcess_scikit-learn_1.png',
                dpi=300,
                bbox_inches='tight')
    plt.show()

    print("Finish main()")
    return
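Since L1 regularization drives many weights exactly to zero, a quick sparsity check on the fitted model makes that visible. A sketch, assuming logReg from this example:

import numpy
print("Zero weights :", int(numpy.sum(logReg.coef_ == 0)), "of", logReg.coef_.size)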
Example #12
import tensorflow as tf
import os
import nltk
import numpy as np
import jieba
import json
import DataPreProcess

batchSize = 32
cn = "你是谁"
cn_seg = ' '.join(jieba.cut(cn))
data = DataPreProcess.Dataset()
with open(data.w2idx_cn_name, "r", encoding="utf8") as fp:
    w2idx_cn = json.load(fp)

with open(data.w2idx_en_name, "r", encoding="utf8") as fp:
    w2idx_en = json.load(fp)

idx2w_cn = {idx: word for word, idx in w2idx_cn.items()}
idx2w_en = {idx: word for word, idx in w2idx_en.items()}

cn_ids = [w2idx_cn.get(item, w2idx_cn["<UNK>"]) for item in cn_seg.split()]

if len(cn_ids) >= data.seqLength:
    xIds = cn_ids[:data.seqLength]
else:
    xIds = cn_ids + [w2idx_cn["<PAD>"]] * (data.seqLength - len(cn_ids))

graph = tf.Graph()
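The pad-or-truncate step above can be factored into a small helper; an illustrative sketch (not part of DataPreProcess):

def pad_or_truncate(ids, seq_length, pad_id):
    # cut the sequence to seq_length, or pad it on the right with pad_id
    if len(ids) >= seq_length:
        return ids[:seq_length]
    return ids + [pad_id] * (seq_length - len(ids))

# xIds = pad_or_truncate(cn_ids, data.seqLength, w2idx_cn["<PAD>"])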
Example #13
def main():
    """
    Ensemble learning.
    Bagging
    """
    print("Enter main()")

    # Load the data
    """
    # Iris data
    # Features of three species (Setosa, Versicolor, Virginica): sepal and petal length and width.
    #iris = datasets.load_iris()
    #print(iris)

    #X_features = iris.data[ 50:, [1, 2] ]
    #y_labels = iris.target[50:]
    """

    # Wine dataset
    prePro = DataPreProcess.DataPreProcess()
    prePro.setDataFrameFromCsvFile(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
    )
    
    prePro.setColumns( 
        ['Class label',
         'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium', 'Total phenols',
         'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins', 'Color intensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline']
    )

    prePro.print("Wine dataset")

    # Features and labels to use
    # Exclude samples with Class label 1; use only labels 2 and 3
    prePro.df_ = prePro.df_[ prePro.df_['Class label'] != 1 ]
    X_features = prePro.df_[ ['Alcohol', "Hue" ] ].values       # feature matrix : 2 features x number of samples
    y_labels = prePro.df_[ ['Class label'] ].values             # class label : 2 or 3
    
    #X_features, y_labels = DataPreProcess.DataPreProcess.generateCirclesDataSet()
    #X_features, y_labels = DataPreProcess.DataPreProcess.generateMoonsDataSet()
        
    #print( X_features )

    ratio_test = 0.4

    #===========================================
    # Preprocessing
    #===========================================
    # Handling missing data
    #prePro.meanImputationNaN()

    # Encode the label data
    #prePro.encodeClassLabelByLabelEncoder( colum = 1 )
    #prePro.print( "" )
    encoder = LabelEncoder()
    y_labels = encoder.fit_transform( y_labels )

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test \
    = DataPreProcess.DataPreProcess.dataTrainTestSplit( X_input = X_features, y_input = y_labels, ratio_test = ratio_test, input_random_state = 1 )

    test_idx = []
    #test_idx = range( 26,50 )

    #
    stdScaler = StandardScaler()

    # Compute the mean and standard deviation of X_train
    stdScaler.fit( X_train )

    # Standardize using the computed mean and standard deviation
    X_train_std = stdScaler.transform( X_train )
    X_test_std  = stdScaler.transform( X_test )

    # Stack the split data back together (used later, e.g. for plotting)
    X_combined_std = numpy.vstack( (X_train_std, X_test_std) )  # specified as a tuple (X_train_std, X_test_std)
    y_combined     = numpy.hstack( (y_train, y_test) )


    #print( "X_train :\n",  X_train )
    #print( "X_test :\n",  X_test )
    #print( "y_train :\n",  y_train )
    #print( "y_test :\n",  y_test )

    #-------------------------------------------
    # Model construction
    #-------------------------------------------
    # Decision tree
    decition_tree = DecisionTreeClassifier(
                        criterion = 'entropy',       # use cross-entropy as the impurity measure
                        max_depth = None,            # None : nodes are expanded until all leaves are pure
                                                     # or until all leaves contain less than min_samples_split samples (default=None)
                        random_state = 1
                    )

    # Bagging
    bagging = BaggingClassifier(
                  base_estimator = decition_tree,   # use the decision tree as the weak learner
                  n_estimators = 501,               # number of weak learners in the bagging ensemble
                  max_samples = 1.0,                # The number of samples to draw from X to train each base estimator.
                                                    # If float, then draw max_samples * X.shape[0] samples.
                  max_features = 1.0,               # The number of features to draw from X to train each base estimator.
                                                    # If float, then draw max_features * X.shape[1] features.
                  bootstrap = True,                 # use bootstrap sampling
                  bootstrap_features = False,       #
                  n_jobs = -1, 
                  random_state = 1
              )
    
    #-------------------------------------------
    # Pipeline setup
    #-------------------------------------------
    # Register each transformer and the estimator in the pipeline
    # specified as tuples of (arbitrary name, transformer or estimator)

    #-------------------------------------------
    # List of all classifiers
    #-------------------------------------------
    # List of classifiers used when computing the various scores (intended for use in for loops)
    all_clf = [ decition_tree, bagging ]
    print( "all_clf :", all_clf )

    # List of classifier labels used when computing the various scores (intended for use in for loops)
    all_clf_labels = [ 
                        "Decision Tree ( criterion = 'entropy' )",
                        "Bagging ( base_estimator = decition_tree, n_estimators = 501 )"
                     ]

    print( "all_clf_labels :", all_clf_labels )

    #============================================
    # Learning Process
    #===========================================
    # Fit the configured estimators on the training data
    bagging = bagging.fit( X_train_std, y_train )
    decition_tree = decition_tree.fit( X_train_std, y_train )

    #print( "decition_tree : ", decition_tree.tree_.max_depth  )
    #print( "bagging : ", bagging )

    #===========================================
    # Checking generalization performance
    #===========================================

    #-------------------------------------------
    # Accuracy / misclassification rate
    #-------------------------------------------
    # Run k-fold CV and compute the accuracy with cross_val_score( scoring = 'accuracy' )
    print( "[Accuracy]")
    # train data
    for clf, label in zip( all_clf, all_clf_labels ):
        scores = cross_val_score(
                     estimator = clf,
                     X = X_train_std,
                     y = y_train,
                     cv = 10,
                     n_jobs = -1,
                     scoring = 'accuracy'    # accuracy
                 )
        print( "Accuracy <train data> : %0.2f (+/- %0.2f) [%s]" % ( scores.mean(), scores.std(), label) )
    
    # test data
    for clf, label in zip( all_clf, all_clf_labels ):
        scores = cross_val_score(
                     estimator = clf,
                     X = X_test_std,
                     y = y_test,
                     cv = 10,
                     n_jobs = -1,
                     scoring = 'accuracy'    # accuracy
                 )
        print( "Accuracy <test data> : %0.2f (+/- %0.2f) [%s]" % ( scores.mean(), scores.std(), label) )    

    
    #-------------------------------------------
    # AUC
    #-------------------------------------------
    # Run k-fold CV and compute the AUC with cross_val_score( scoring = 'roc_auc' )
    print( "[AUC]")
    for clf, label in zip( all_clf, all_clf_labels ):
        scores = cross_val_score(
                     estimator = clf,
                     X = X_train_std,
                     y = y_train,
                     cv = 10,
                     n_jobs = -1,
                     scoring = 'roc_auc'    # AUC
                 )
        print( "AUC <train data> : %0.2f (+/- %0.2f) [%s]" % ( scores.mean(), scores.std(), label) )

    for clf, label in zip( all_clf, all_clf_labels ):
        scores = cross_val_score(
                     estimator = clf,
                     X = X_test_std,
                     y = y_test,
                     cv = 10,
                     n_jobs = -1,
                     scoring = 'roc_auc'    # AUC
                 )
        print( "AUC <test data> : %0.2f (+/- %0.2f) [%s]" % ( scores.mean(), scores.std(), label) )


    #-------------------------------------------
    # Decision boundaries
    #-------------------------------------------
    plt.clf()

    for (idx, clf, label) in zip( range( 1,len(all_clf)+2 ),  all_clf, all_clf_labels ):
        print( "decision boundary loop idx : ", idx )
        print( "decision boundary loop clf : ", clf )

        # idx-th subplot
        plt.subplot( 1, 2, idx )

        Plot2D.Plot2D.drawDiscriminantRegions( X_combined_std, y_combined, classifier = all_clf[idx-1] )
        plt.title( label )
        plt.xlabel( "Hue [standardized]" )
        plt.ylabel( "Alcohol [standardized]" )
        plt.legend(loc = "best")
        plt.tight_layout()

    plt.savefig("./EnsembleLearning_scikit-learn_5.png", dpi = 300, bbox_inches = 'tight' )
    plt.show()    

    #-------------------------------------------
    # Learning curves
    #-------------------------------------------
    plt.clf()

    for (idx, clf, label) in zip( range( 1,len(all_clf)+2 ),  all_clf, all_clf_labels ):
        print( "learning curve loop idx : ", idx )
        print( "learning curve loop clf : ", clf )

        train_sizes, train_scores, test_scores \
        = learning_curve(
              estimator = clf,                              # estimator
              X = X_train_std,                              # training data, since we want the accuracy on it
              y = y_train,                                  #
              train_sizes = numpy.linspace(0.1, 1.0, 10),   # absolute or relative numbers of training samples:
                                                            # 10 evenly spaced relative values of the training-set size
              cv = 10,                                      # number of cross-validation folds
              n_jobs = -1                                   #
        )

        # Compute means and standard deviations
        train_means = numpy.mean( train_scores, axis = 1 )   # axis = 1 : across the CV folds
        train_stds = numpy.std( train_scores, axis = 1 )
        test_means = numpy.mean( test_scores, axis = 1 )
        test_stds = numpy.std( test_scores, axis = 1 )

        print( "learning curve loop : \n")
        print( "train_sizes", train_sizes )
        print( "train_means", train_means )
        print( "train_stds", train_stds )
        print( "test_means", test_means )
        print( "test_stds", test_stds )

        # idx-th subplot
        plt.subplot( 1, 2, idx )
        Plot2D.Plot2D.drawLearningCurve(
            train_sizes = train_sizes,
            train_means = train_means,
            train_stds = train_stds,
            test_means = test_means,
            test_stds = test_stds,
            train_label = "training accuracy",
            test_label = "k-fold cross validation accuracy (cv=10)"
        )
        plt.title( "Learning Curve \n" + label )
        plt.xlabel( "Number of training samples" )
        plt.ylabel( "Accuracy" )
        plt.legend( loc = "best" )
        plt.ylim( [0.5, 1.01] )
        plt.tight_layout()

    plt.savefig("./EnsembleLearning_scikit-learn_6.png", dpi = 300, bbox_inches = 'tight' )
    plt.show()    
  
    #-------------------------------------------
    # ROC curves
    #-------------------------------------------
    plt.clf()
    Plot2D.Plot2D.drawROCCurveFromClassifiers( 
        classifilers = all_clf, 
        class_labels = all_clf_labels, 
        X_train = X_train_std, y_train = y_train,
        X_test = X_test_std, y_test = y_test
    )

    plt.savefig("./EnsembleLearning_scikit-learn_7.png", dpi = 300, bbox_inches = 'tight' )
    plt.show()    
    
    
    print("Finish main()")
    return
Example #14
def main():
    """
    Machine learning workflow built on a machine learning pipeline (the scikit-learn Pipeline class).
    Checking generalization with cross-validation.
    """
    print("Enter main()")

    # Load the data
    prePro = DataPreProcess.DataPreProcess()
    prePro.setDataFrameFromCsvFile(
        "https://raw.githubusercontent.com/rasbt/python-machine-learning-book/master/code/datasets/wdbc/wdbc.data"
    )
    #prePro.print( "Breast Cancer Wisconsin dataset" )

    dat_X = prePro.df_.loc[:, 2:].values
    dat_y = prePro.df_.loc[:, 1].values

    #===========================================
    # Preprocessing
    #===========================================
    # Handling missing data
    #prePro.meanImputationNaN()

    # Encode the label data
    prePro.encodeClassLabelByLabelEncoder(colum=1)
    prePro.print("Breast Cancer Wisconsin dataset")

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test \
    = DataPreProcess.DataPreProcess.dataTrainTestSplit( X_input = dat_X, y_input = dat_y, ratio_test = 0.2 )

    #-------------------------------------------
    # Pipeline setup
    #-------------------------------------------
    # Register each transformer and the estimator in the pipeline
    pipe_logReg = Pipeline(steps=[  # specified as tuples of (arbitrary name, transformer or estimator)
        ("scl", StandardScaler()),  # scaling: a transformer (has a fit() method)
        ("pca", PCA(n_components=2)),  # dimensionality reduction with PCA
        ("clf", LogisticRegression(
            random_state=1))  # logistic regression: an estimator (has a predict() method)
    ])

    #
    """ 
    pipe_logReg.set_params(
        [
            ( "scl", StandardScaler() ),            # scaling: a transformer (has a fit() method)
            ( "pca", PCA( n_components=2 ) ),       # dimensionality reduction with PCA
            ( "clf", LogisticRegression( random_state=1 ) ) # logistic regression: an estimator (has a predict() method)
        ]    
    )
    """

    # Run the fit() method of the transformers registered in the pipeline
    pipe_logReg.fit(X_train, y_train)

    #
    print("Test Accuracy: %.3f" % pipe_logReg.score(X_test, y_test))

    #============================================
    # Learning Process
    #===========================================
    # Run the predict() method of the estimator registered in the pipeline
    y_predict = pipe_logReg.predict(X_test)
    print("predict : ", y_predict)

    # Inspect the pipeline object
    #print( "pipe_logReg.get_params() : \n", pipe_logReg.get_params( deep = True ) )
    #print( "pipe_logReg.get_params() : \n", pipe_logReg.get_params( deep = False ) )

    #===========================================
    # Checking generalization performance
    #===========================================

    #-------------------------------------------
    # Cross-validation (CV)
    #-------------------------------------------
    scores = cross_val_score(
        estimator=pipe_logReg,  # estimator
        X=X_train,  #
        y=y_train,  #
        cv=10,  # number of cross-validation folds
        n_jobs=-1  # run on all CPUs in parallel
    )

    print('CV accuracy scores: %s' % scores)
    print('CV accuracy: %.3f +/- %.3f' %
          (numpy.mean(scores), numpy.std(scores)))

    print("Finish main()")
    return
Example #15
def main():
    """
    Machine learning workflow built on a machine learning pipeline (the scikit-learn Pipeline class).
    Evaluating model generalization with ROC curves.
    """
    print("Enter main()")

    # Load the data
    prePro = DataPreProcess.DataPreProcess()
    prePro.setDataFrameFromCsvFile(
        "https://raw.githubusercontent.com/rasbt/python-machine-learning-book/master/code/datasets/wdbc/wdbc.data"
    )
    #prePro.print( "Breast Cancer Wisconsin dataset" )

    #===========================================
    # Preprocessing
    #===========================================
    # Extract the feature data and the label (target) data
    dat_X = prePro.df_.loc[:, 2:].values
    dat_y = prePro.df_.loc[:, 1].values

    # Handling missing data
    #prePro.meanImputationNaN()

    # Encode the categorical data
    #prePro.encodeClassLabelByLabelEncoder( colum = 1, bPrint = True )
    encoder = LabelEncoder()
    dat_y = encoder.fit_transform(dat_y)  #
    encoder.transform(["M", "B"])  #

    print("encoder.fit_transform( dat_y ) : \n", encoder.fit_transform(dat_y))
    print("encoder.classes_ : \n", encoder.classes_)
    prePro.print("Breast Cancer Wisconsin dataset")

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test \
    = DataPreProcess.DataPreProcess.dataTrainTestSplit( X_input = dat_X, y_input = dat_y, ratio_test = 0.2 )

    print(X_train)
    print(y_train)

    #-------------------------------------------
    # Pipeline setup
    #-------------------------------------------
    # Register each transformer and the estimator in the pipeline
    pipe_logReg = Pipeline(
        # specified as tuples of (arbitrary name, transformer or estimator)
        [
            ("scl", StandardScaler()),  # standardization: a transformer (has a fit() method)
            ('pca', PCA(n_components=2)),  # reduce to 2 dimensions with PCA (feature extraction)
            ('clf', LogisticRegression(penalty='l2', random_state=0,
                                       C=100.0))  # logistic regression (L2 regularization):
            # an estimator (has a predict() method)
        ])

    # Run the fit() method of the transformers registered in the pipeline
    #pipe_logReg.fit( X_train, y_train )

    # Predictions
    #y_predict =pipe_logReg.predict( X_test )

    # Inspect the pipeline object
    #print( "Pipeline.get_params( deep = True ) : \n", pipe_logReg.get_params( deep = True ) )
    #print( "Pipeline.get_params( deep = False ) : \n", pipe_logReg.get_params( deep = False ) )

    #print( "Pipeline.predict( X_test ) : \n", y_predict )
    #print( "Pipeline.predict( X_test )[0] : \n", y_predict[0] )
    #print( "Pipeline.predict( X_test )[1] : \n", y_predict[1] )
    #print( "Pipeline.predict( X_test )[2] : \n", y_predict[2] )
    #print( "Test Accuracy: %.3f" % pipe_csvm.score( X_test, y_test ) )

    # Use only a subset of the features,
    # deliberately chosen so that the ROC curve has a shape useful for inspection
    # (otherwise the AUC would be almost exactly 1.0)
    X_train2 = X_train[:, [4, 14]]

    #-------------------------------------------
    # Cross-validation setup
    #-------------------------------------------
    # Create a StratifiedKFold object so the ROC curve can be drawn for every
    # cross-validation fold.
    cv = StratifiedKFold(n_splits=3, random_state=1)

    # Call split() and turn the result into a list so it can be iterated over
    list_cv = list(cv.split(X_train2, y_train))

    #print( "StratifiedKFold() : \n", cv )
    #print( "list( StratifiedKFold().split() ) : \n", list_cv )

    #------------------------------------
    # ROC curves
    #------------------------------------
    Plot2D.Plot2D.drawROCCurveFromTrainTestIterator(
        classifiler=pipe_logReg,  # estimator : has fit() and predict() methods
        iterator=list_cv,  #
        X_train=X_train2,  # draw the ROC curve with the extracted feature subset
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        positiveLabel=1  # label value treated as positive (1 = "M" after LabelEncoder)
    )

    plt.savefig("./MachineLearningPipeline_scikit-learn_5.png",
                dpi=300,
                bbox_inches='tight')
    plt.show()

    print("Finish main()")
    return
Example #16
def main():
    """
    Ensemble learning.
    Classifying spiral data with ensemble methods.
    """
    print("Enter main()")

    # Load the data
    # Spiral data
    prePro = DataPreProcess.DataPreProcess()
    prePro.setDataFrameFromCsvFile("naruto.csv")
    prePro.setColumns(["x", "y", "class labels"])

    prePro.print("Spiral data")

    X_features = prePro.df_[["x", "y"]].values
    y_labels = prePro.df_[["class labels"]].values

    #print( X_features )

    ratio_test = 0.3

    #===========================================
    # Preprocessing
    #===========================================
    # Handling missing data
    #prePro.meanImputationNaN()

    # Encode the label data
    encoder = LabelEncoder()
    y_labels = encoder.fit_transform(y_labels)

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test \
    = DataPreProcess.DataPreProcess.dataTrainTestSplit( X_input = X_features, y_input = y_labels, ratio_test = ratio_test, input_random_state = 1 )

    test_idx = []
    #test_idx = range( 26,50 )

    #
    stdScaler = StandardScaler()

    # Compute the mean and standard deviation of X_train
    stdScaler.fit(X_train)

    # Standardize using the computed mean and standard deviation
    X_train_std = stdScaler.transform(X_train)
    X_test_std = stdScaler.transform(X_test)

    # Stack the split data back together (used later, e.g. for plotting)
    X_combined_std = numpy.vstack(
        (X_train_std, X_test_std))  # specified as a tuple (X_train_std, X_test_std)
    y_combined = numpy.hstack((y_train, y_test))

    #print( "X_train :\n",  X_train )
    #print( "X_test :\n",  X_test )
    #print( "y_train :\n",  y_train )
    #print( "y_test :\n",  y_test )

    #-------------------------------------------
    # Model construction
    #-------------------------------------------
    # Decision tree
    decition_tree = DecisionTreeClassifier(
        criterion='entropy',  # use cross-entropy as the impurity measure
        max_depth=None,  # None : nodes are expanded until all leaves are pure
        # or until all leaves contain less than min_samples_split samples (default=None)
        random_state=0)
    # k-NN
    kNN = KNeighborsClassifier(n_neighbors=3, p=2, metric='minkowski')

    # SVM
    svm = SVC(
        kernel='rbf',  # rbf : use the RBF kernel (kernel trick)
        gamma=10.0,  # gamma of the RBF kernel
        C=0.1,  # C of the C-SVM
        random_state=1,  #
        probability=True  # enable predict_proba() probability estimates after fitting
    )

    # LogisticRegression
    logReg = LogisticRegression(penalty='l2', C=0.001, random_state=1)

    # Bagging
    bagging = BaggingClassifier(
        base_estimator=decition_tree,  # use the decision tree as the weak learner
        n_estimators=501,  # number of weak learners in the bagging ensemble
        max_samples=1.0,  # The number of samples to draw from X to train each base estimator.
        # If float, then draw max_samples * X.shape[0] samples.
        max_features=1.0,  # The number of features to draw from X to train each base estimator.
        # If float, then draw max_features * X.shape[1] features.
        bootstrap=True,  # use bootstrap sampling
        bootstrap_features=False,  #
        n_jobs=-1,
        random_state=1)

    # AdaBoost
    ada = AdaBoostClassifier(
        base_estimator=decition_tree,  # use the decision tree as the weak learner
        n_estimators=501,  # number of weak learners in the ensemble
        learning_rate=0.1,  #
        random_state=1  #
    )

    # Random Forest
    forest = RandomForestClassifier(
        criterion="gini",  # impurity measure
        bootstrap=True,  # whether bootstrap samples are used when building trees (default: True)
        n_estimators=501,  # number of weak learners (decision trees)
        n_jobs=-1,  # The number of jobs to run in parallel for both fit and predict (-1 : all CPU cores)
        random_state=1,  #
        oob_score=True  # Whether to use out-of-bag samples to estimate the generalization accuracy (default=False)
    )

    #-------------------------------------------
    # Pipeline setup
    #-------------------------------------------
    # Register each transformer and the estimator in the pipeline
    # specified as tuples of (arbitrary name, transformer or estimator)

    #-----------------------------------------------------------
    # Configure the ensemble classifier (EnsembleModelClassifier)
    #-----------------------------------------------------------
    ensemble_clf1 = EnsembleModelClassifier.EnsembleModelClassifier(
        classifiers=[bagging, ada, forest, decition_tree, logReg, kNN, svm],
        class_labels=[
            "Bagging ( base_estimator = decition_tree, n_estimators = 501 )",
            "AdaBoost ( base_estimator = decition_tree, n_estimators = 501 )",
            "Random Forest ( criterion = 'gini', n_estimators = 501 )",
            "Decision Tree ( criterion = 'entropy' )",
            "Logistic Regression ( penalty = 'l2', C = 0.001 )",
            "k-NN ( n_neighbors = 3, metric='minkowski' )",
            "SVM ( kernel = 'rbf', C = 0.1, gamma = 10.0 )"
        ])

    #-------------------------------------------
    # List of all classifiers
    #-------------------------------------------
    # List of classifiers used when computing the various scores (intended for use in for loops)
    all_clf = [
        bagging, ada, forest, decition_tree, logReg, kNN, svm, ensemble_clf1
    ]

    # 各種スコア計算時に使用する識別器のラベルのリスト ( for 文の in で使用を想定)
    all_clf_labels = [
        "Decision Tree \n ( criterion = 'entropy' )",
        "Bagging \n ( base_estimator = decition_tree, n_estimators = 501 )",
        "AdaBoost \n (base_estimator = decition_tree, n_estimators = 501 )",
        "RamdomForest \n (base_estimator = decition_tree, n_estimators = 501 )",
        "Logistic Regression \n ( penalty = 'l2', C = 0.001 )",
        "k-NN \n ( n_neighbors = 3, metric='minkowski' )",
        "SVM \n ( kernel = 'rbf', C = 0.1, gamma = 10.0 )",
        "Ensemble Model \n ( Bagging, AdaBoost, RandamForest, Decision Tree, LogisticRegression, k-NN, SVM )"
    ]

    print("all_clf :", all_clf)
    print("len(all_clf) :", len(all_clf))
    print("all_clf_labels :", all_clf_labels)
    print("len(all_clf_labels) :", len(all_clf_labels))

    #============================================
    # Learning Process
    #===========================================
    # Fit each configured estimator on the training data
    decition_tree = decition_tree.fit(X_train_std, y_train)
    logReg = logReg.fit(X_train_std, y_train)
    kNN = kNN.fit(X_train_std, y_train)
    svm = svm.fit(X_train_std, y_train)

    bagging = bagging.fit(X_train_std, y_train)
    ada = ada.fit(X_train_std, y_train)
    forest = forest.fit(X_train_std, y_train)
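    # (Added sketch) Because oob_score=True was set on the random forest above,
    # scikit-learn exposes an out-of-bag estimate of the generalization accuracy
    # after fitting; printing it is a cheap sanity check that needs no extra data:
    #print("forest.oob_score_ : ", forest.oob_score_)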

    ensemble_clf1.fit(X_train_std, y_train)

    #print( "decition_tree : ", decition_tree.tree_.max_depth  )
    #print( "bagging : ", bagging )

    #===========================================
    # Evaluate generalization performance
    #===========================================

    #-------------------------------------------
    # Accuracy / misclassification rate
    #-------------------------------------------
    # Run k-fold CV and compute the accuracy with cross_val_score( scoring = 'accuracy' )
    print("[Accuracy]")
    # train data
    for clf, label in zip(all_clf, all_clf_labels):
        scores = cross_val_score(
            estimator=clf,
            X=X_train_std,
            y=y_train,
            cv=10,
            n_jobs=-1,
            scoring='accuracy'  # accuracy
        )
        print("Accuracy <train data> : %0.2f (+/- %0.2f) [%s]" %
              (scores.mean(), scores.std(), label))

    # test data
    for clf, label in zip(all_clf, all_clf_labels):
        scores = cross_val_score(
            estimator=clf,
            X=X_test_std,
            y=y_test,
            cv=10,
            n_jobs=-1,
            scoring='accuracy'  # accuracy
        )
        print("Accuracy <test data> : %0.2f (+/- %0.2f) [%s]" %
              (scores.mean(), scores.std(), label))
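    # (Added note) cross_val_score clones each estimator and refits it inside every
    # fold, so the fits from the Learning Process section do not leak into these
    # scores; the "test data" loop above is therefore a separate 10-fold CV run on
    # X_test_std rather than an evaluation of the models trained on X_train_std.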

    #-------------------------------------------
    # AUC
    #-------------------------------------------
    # Run k-fold CV and compute the AUC with cross_val_score( scoring = 'roc_auc' )
    print("[AUC]")
    for clf, label in zip(all_clf, all_clf_labels):
        scores = cross_val_score(
            estimator=clf,
            X=X_train_std,
            y=y_train,
            cv=10,
            n_jobs=-1,
            scoring='roc_auc'  # AUC
        )
        print("AUC <train data> : %0.2f (+/- %0.2f) [%s]" %
              (scores.mean(), scores.std(), label))

    for clf, label in zip(all_clf, all_clf_labels):
        scores = cross_val_score(
            estimator=clf,
            X=X_test_std,
            y=y_test,
            cv=10,
            n_jobs=-1,
            scoring='roc_auc'  # AUC
        )
        print("AUC <test data> : %0.2f (+/- %0.2f) [%s]" %
              (scores.mean(), scores.std(), label))

    #-------------------------------------------
    # Decision boundaries
    #-------------------------------------------
    plt.clf()

    for (idx, clf, label) in zip(range(1,
                                       len(all_clf) + 1), all_clf,
                                 all_clf_labels):
        print("decision boundary loop idx : ", idx)
        print("decision boundary loop clf : ", clf)

        # idx-th subplot
        plt.subplot(2, 4, idx)

        Plot2D.Plot2D.drawDiscriminantRegions(X_combined_std,
                                              y_combined,
                                              classifier=all_clf[idx - 1])
        plt.title(label)
        plt.legend(loc="best")
        plt.tight_layout()

    plt.savefig("./EnsembleLearning_scikit-learn_naruto_x-1.png",
                dpi=300,
                bbox_inches='tight')
    plt.show()

    #-------------------------------------------
    # Learning curves
    #-------------------------------------------
    plt.clf()

    for (idx, clf, label) in zip(range(1,
                                       len(all_clf) + 1), all_clf,
                                 all_clf_labels):
        print("learning curve loop idx : ", idx)
        print("learning curve loop clf : ", clf)

        train_sizes, train_scores, test_scores \
        = learning_curve(
              estimator = clf,                              # estimator
              X = X_train_std,                              # training data, so the training accuracy can be computed
              y = y_train,                                  #
              train_sizes = numpy.linspace(0.1, 1.0, 10),   # absolute or relative numbers of training samples:
                                                            # here, 10 evenly spaced relative training-set sizes
              cv = 10                                       # number of cross-validation folds
        )

        # Compute the means and standard deviations
        train_means = numpy.mean(train_scores, axis=1)  # axis = 1 : average over the CV folds
        train_stds = numpy.std(train_scores, axis=1)
        test_means = numpy.mean(test_scores, axis=1)
        test_stds = numpy.std(test_scores, axis=1)

        print("学習曲線 for ループ : \n")
        print("train_sizes", train_sizes)
        print("train_means", train_means)
        print("train_stds", train_stds)
        print("test_means", test_means)
        print("test_stds", test_stds)

        # idx-th subplot
        plt.subplot(2, 4, idx)
        Plot2D.Plot2D.drawLearningCurve(
            train_sizes=train_sizes,
            train_means=train_means,
            train_stds=train_stds,
            test_means=test_means,
            test_stds=test_stds,
            train_label="training accuracy",
            test_label="k-fold cross validation accuracy (cv=10)")
        plt.title("Learning Curve \n" + label)
        plt.xlabel("Number of training samples")
        plt.ylabel("Accuracy")
        plt.legend(loc="best")
        plt.ylim([0.5, 1.01])
        plt.tight_layout()

    plt.savefig("./EnsembleLearning_scikit-learn_naruto_x-2.png",
                dpi=300,
                bbox_inches='tight')
    plt.show()

    #-------------------------------------------
    # ROC curves
    #-------------------------------------------
    plt.clf()
    Plot2D.Plot2D.drawROCCurveFromClassifiers(classifilers=all_clf,
                                              class_labels=all_clf_labels,
                                              X_train=X_train_std,
                                              y_train=y_train,
                                              X_test=X_test_std,
                                              y_test=y_test)

    plt.savefig("./EnsembleLearning_scikit-learn_naruto_x-3.png",
                dpi=300,
                bbox_inches='tight')
    plt.show()

    print("Finish main()")
    return
예제 #17
0
"""
코드 작성일시 : 2020년 1
import cv2
import DataPreProcess as DPP
월 11일
작성자 : Park Jinsuk
코드 내용 : 전반적인 코드
"""
import cv2
import DataPreProcess as DPP

if __name__ == '__main__':
    print("this is main")

    # image = cv2.imread('test_image/1_cam-image_array_.jpg')  # read the image
    # image1 = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)  # convert the input frame to grayscale
    # image1 = cv2.Canny(image1, threshold1 = 200, threshold2=300)
    # image1 = cv2.GaussianBlur(image1, (5, 5), 0)
    # cv2.imshow('orange', image1)
    # cv2.waitKey(0)

    # image preprocessing needed for training
    DPP.data_preprocess()
예제 #18
0
def main():
    """
    機械学習パイプラインによる、機械学習処理フロー(scikit-learn ライブラリの Pipeline クラスを使用)
    混同行列と適合率、再現率、F1スコアによるモデルの汎化能力の評価
    """
    print("Enter main()")

    # Load the data
    prePro = DataPreProcess.DataPreProcess()
    prePro.setDataFrameFromCsvFile(
        "https://raw.githubusercontent.com/rasbt/python-machine-learning-book/master/code/datasets/wdbc/wdbc.data"
    )
    #prePro.print( "Breast Cancer Wisconsin dataset" )

    dat_X = prePro.df_.loc[:, 2:].values
    dat_y = prePro.df_.loc[:, 1].values

    #===========================================
    # Preprocessing
    #===========================================
    # Handle missing data
    #prePro.meanImputationNaN()
    """
    # Encode the class labels
    prePro.setColumns( ['ID', 'B/M'] )
    map_encode = {
        'B': 0,
        'M': 1
    }

    prePro.encodeClassLabelByLabelEncoder( key = "B/M" )
    """
    prePro.encodeClassLabelByLabelEncoder(colum=0)
    prePro.print("Breast Cancer Wisconsin dataset")

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test \
    = DataPreProcess.DataPreProcess.dataTrainTestSplit( X_input = dat_X, y_input = dat_y, ratio_test = 0.2 )

    #-------------------------------------------
    # Pipeline setup
    #-------------------------------------------
    # Each transformer / estimator is set in the pipeline
    pipe_csvm = Pipeline(steps=[  # as a tuple (arbitrary identifier string, transformer or estimator instance)
        ("scl", StandardScaler()),  # standardization : a transformer (provides fit())
        ('clf', SVC(C=10.0, kernel="rbf", gamma=0.01)
         )  # C-SVM : an estimator (provides predict())
    ])

    # Fit the whole pipeline (transformers + estimator) on the training data
    pipe_csvm.fit(X_train, y_train)

    # Predictions
    y_predict = pipe_csvm.predict(X_test)

    # Inspect the pipeline object
    print("Pipeline.get_params( deep = True ) : \n",
          pipe_csvm.get_params(deep=True))
    print("Pipeline.get_params( deep = False ) : \n",
          pipe_csvm.get_params(deep=False))
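    # (Added note) Individual pipeline steps can also be reached by name through the
    # standard named_steps attribute, e.g. the fitted scaler or the SVM itself.
    # A minimal, commented-out sketch:
    #print("scaler step : ", pipe_csvm.named_steps['scl'])
    #print("SVM step    : ", pipe_csvm.named_steps['clf'])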

    print("Pipeline.predict( X_test ) : \n", y_predict)
    print("Pipeline.predict( X_test )[0] : \n", y_predict[0])
    print("Pipeline.predict( X_test )[1] : \n", y_predict[1])
    print("Pipeline.predict( X_test )[2] : \n", y_predict[2])
    print("Test Accuracy: %.3f" % pipe_csvm.score(X_test, y_test))

    #-------------------------------------------
    # Confusion matrix
    #-------------------------------------------
    # Build the confusion matrix from the test labels and the predictions
    mat_confusion = confusion_matrix(y_true=y_test, y_pred=y_predict)
    print("mat_confusion : \n", mat_confusion)

    # Plot the confusion matrix as a heat map
    Plot2D.Plot2D.drawHeatMapFromConfusionMatrix(mat_confusion=mat_confusion)
    plt.title(
        "Heat Map of Confusion Matrix \n classifier : RBF-kernel SVM (C = 10.0, gamma = 0.01)"
    )

    plt.savefig("./MachineLearningPipeline_scikit-learn_4.png",
                dpi=300,
                bbox_inches='tight')
    #plt.show()

    #-------------------------------------------
    # Precision, recall, F1 score
    #-------------------------------------------
    """
    for pred in range(0, len(y_predict)):
        if( y_predict[pred] == "M"):
            y_predict[pred] = 1
        else:
            y_predict[pred] = 0

    print( "Pipeline.predict( X_test ) : \n", y_predict )
    
    # UnboundLocalError: local variable 'precision_score' referenced before assignment
    # ValueError("pos_label=1 is not a valid label: array(['B', 'M'], \n      dtype='<U1')",)
    # ValueError: Can't handle mix of binary and unknown
    score_precision = precision_score( y_true = y_test, y_pred = y_predict )

    score_recall    = recall_score( y_true = y_test, y_pred = y_predict )
    score_f1        = f1_score( y_true = y_test, y_pred = y_predict )
    """
    # PRE = TP/(TP+FP)
    score_precision = mat_confusion[1, 1] / (mat_confusion[1, 1] +
                                             mat_confusion[0, 1])

    # REC = TP/(TP+FN)
    score_recall = mat_confusion[1, 1] / (mat_confusion[1, 1] +
                                          mat_confusion[1, 0])

    # F1 = 2*PRE*REC/(PRE+REC)
    score_f1 = 2 * score_precision * (score_recall /
                                      (score_precision + score_recall))

    print('Precision: %.3f' % score_precision)
    print('Recall: %.3f' % score_recall)
    print('F1: %.3f' % score_f1)

    print("Finish main()")
    return
예제 #19
0
import numpy as np
import pandas as pd

import DataPreProcess as dpp

filePath = "../dataPool/"
confirm = "ConfirmedTimeSeries"
death = "DeathsTimeSeries"
CRTS = filePath + confirm + ".csv"
DRTS = filePath + death + ".csv"
datelist = dpp.DateStartFrom("2020-04-12")
dateNum = len(datelist)


def extract(path, daterange, fname):
    f = open(path, "r")
    data = pd.read_csv(f)
    print(path + " opened successfully,start extracting feature...")
    columnTag = []
    for state in data.index:
        name = str(data.iloc[state, :].loc["Province_State"])
        #columnTag.append(name + "_Lat")
        #columnTag.append(name + "_Long")
        for i in range(batchSize):
            columnTag.append(name + str(i + 1) + "day")
    Feature_Size = len(columnTag)
    result = pd.DataFrame(np.zeros([dateNum - batchSize + 1, Feature_Size]),
                          columns=columnTag)
    head = 0
    while (head + batchSize <= dateNum):
        for state in data.index:  # For each state in raw dataset
def main():
    """
    機械学習パイプラインによる、機械学習処理フロー(scikit-learn ライブラリの Pipeline クラスを使用)
    グリッドサーチによるハイパーパラメータのチューニング
    """
    print("Enter main()")
    
    # データの読み込み
    prePro = DataPreProcess.DataPreProcess()
    prePro.setDataFrameFromCsvFile(
        "https://raw.githubusercontent.com/rasbt/python-machine-learning-book/master/code/datasets/wdbc/wdbc.data"
    )
    #prePro.print( "Breast Cancer Wisconsin dataset" )
    
    dat_X = prePro.df_.loc[:, 2:].values
    dat_y = prePro.df_.loc[:, 1].values

    #===========================================
    # Preprocessing
    #===========================================
    # Handle missing data
    #prePro.meanImputationNaN()

    # Encode the class labels
    prePro.encodeClassLabelByLabelEncoder( colum = 1 )
    prePro.print( "Breast Cancer Wisconsin dataset" )

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test \
    = DataPreProcess.DataPreProcess.dataTrainTestSplit( X_input = dat_X, y_input = dat_y, ratio_test = 0.2 )

    #-------------------------------------------
    # Pipeline setup
    #-------------------------------------------
    # Each transformer / estimator is set in the pipeline
    pipe_csvm = Pipeline(
                      steps = [                                     # as a tuple (arbitrary identifier string, transformer or estimator instance)
                                  ( "scl", StandardScaler() ),      # standardization : a transformer (provides fit())
                                  ('clf', SVC( random_state=1 ) )   # C-SVM : an estimator (provides predict())
                              ]
                  )

    
    # Fit the pipeline (left commented out here; the grid search below fits it as part of the search)
    #pipe_csvm.fit( X_train, y_train )

    # Inspect the pipeline object
    print( "Pipeline.get_params() : \n", pipe_csvm.get_params( deep = True ) )
    print( "Pipeline.get_params() : \n", pipe_csvm.get_params( deep = False ) )

    #print( "Test Accuracy: %.3f" % pipe_csvm.score( X_test, y_test ) )

    
    #==============================
    # grid search
    #==============================
    # Parameters targeted by the grid search : here the regularization parameter C and gamma of the C-SVM
    param_range_C = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
    param_range_gamma = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

    # Models and parameters to tune with the grid search : specified as a list of dictionaries
    param_grid = [
        { 'clf__C': param_range_C, 'clf__kernel': ['linear'] },                                # linear C-SVM
        { 'clf__C': param_range_C, 'clf__gamma': param_range_gamma, 'clf__kernel': ['rbf'] }   # RBF-kernel C-SVM
    ]

    # Create the GridSearchCV object that runs the grid search
    gs = GridSearchCV(
            estimator = pipe_csvm,      # estimator
            param_grid = param_grid,    # parameter grid to search
            scoring = 'accuracy',       #
            cv = 10,                    # number of cross-validation folds
            n_jobs = -1                 # run in parallel on all CPU cores
         )
    # Run the grid search
    gs = gs.fit( X_train, y_train )


    # Print the grid-search results
    print( "sklearn.model_selection.GridSearchCV.best_score_ : \n", gs.best_score_ )        # score of the best-scoring candidate model
    print( "sklearn.model_selection.GridSearchCV.best_params_ : \n", gs.best_params_ )      # parameters of the best-scoring model
    #print( "sklearn.model_selection.GridSearchCV.grid_scores_ : \n",gs.grid_scores_ )       # full results
    
    # Extract the best-scoring model and evaluate it on the test data
    clf = gs.best_estimator_
    clf.fit( X_train, y_train )     # retrain the extracted model on the training data
    print('sklearn.model_selection.GridSearchCV.best_estimator_ in Test accuracy: %.3f' % clf.score( X_test, y_test ) )     # test accuracy of the best-scoring model
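    # (Added note) With the default refit=True, GridSearchCV has already refit
    # best_estimator_ on the whole training set, so the clf.fit() call above just
    # repeats that work; gs.score( X_test, y_test ) would report the same test accuracy.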

    #-----------------------------------------------
    # Heat-map plot of the grid-search results
    #-----------------------------------------------
    # Re-run the search restricted to the RBF-kernel SVM
    param_grid = [
        { 'clf__C': param_range_C, 'clf__gamma': param_range_gamma, 'clf__kernel': ['rbf'] }    # RBF-kernel C-SVM
    ]

    # Create the GridSearchCV object that runs the grid search
    gs = GridSearchCV(
            estimator = pipe_csvm,      # estimator
            param_grid = param_grid,    # parameter grid to search
            scoring = 'accuracy',       #
            cv = 10,                    # number of cross-validation folds
            n_jobs = -1                 # run in parallel on all CPU cores
         )

    # Run the grid search
    gs = gs.fit( X_train, y_train )

    # Extract the accuracies from the grid_scores_ attribute
    gs_params = []
    gs_mean_scores = []
    gs_scores = []

    for parames, mean_score, scores in gs.grid_scores_:
        gs_params.append( parames )
        gs_mean_scores.append( mean_score )
        gs_scores.append( scores )
    
    
    gs_mean_scores = numpy.reshape( gs_mean_scores , ( len(param_range_C), len(param_range_gamma) ) )
    #gs_scores = numpy.reshape( gs_scores , (8,8) )

    print( "sklearn.model_selection.GridSearchCV.grid_scores_.parmes : \n", gs_params )
    print( "sklearn.model_selection.GridSearchCV.grid_scores_.mean_scores : \n", gs_mean_scores )
    print( "sklearn.model_selection.GridSearchCV.grid_scores_.scores : \n", gs_scores )

    # Data for the heat map
    heatmap_Z = gs_mean_scores
    heatmap_x = param_range_gamma
    heatmap_y = param_range_C

    # Draw the heat map
    Plot2D.Plot2D.drawHeatMapFromGridSearch(
        dat_Z = heatmap_Z,        # heat-map values : accuracy of the RBF-kernel SVM
        dat_x = heatmap_x,        # x-axis ticks
        dat_y = heatmap_y         # y-axis ticks
    )

    plt.title("Heat Map (Grid Search) \n values : Accuracy , classifier : RBF-kernel SVM")
    plt.ylabel( "C : RBF-kernel SVM parameter" )
    plt.xlabel( "gamma : RBF-kernel parameter" )

    plt.savefig("./MachineLearningPipeline_scikit-learn_3.png", dpi = 300, bbox_inches = 'tight' )
    plt.show()


    print("Finish main()")
    return