Code Example #1
import pandas as pd

import data_utils


def do_transform(dataset, mrs, test_data):
    # training data
    id_df, bi_df, mrs_df, nih_df = data_utils.get_tsr(mrs, '')
    # Using the correlation matrix is equivalent to standardizing each
    # variable (mean 0, standard deviation 1); we want the covariance
    # matrix here, so the data is left unscaled.
    # scaled_bi, scaler = data_utils.scale(bi_df)
    bi_df_pca, pca = data_utils.pca_reduction(bi_df)

    # testing data
    labels = test_data[['label']]
    test_bi = test_data.drop(['label'], axis=1)

    # test_bi_scaled = scaler.transform(test_bi.values)
    test_bi_pca = pca.transform(test_bi.values)
    test_bi_pca_df = pd.DataFrame(data=test_bi_pca,
                                  index=test_data.index,
                                  columns=['pca_1', 'pca_2'])
    test_bi_pca_df['label'] = labels.values
    return test_bi_pca_df
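The example above depends on the project helper data_utils.pca_reduction, whose body is not shown here. Below is a minimal sketch of what such a helper might look like, assuming a 2-component scikit-learn PCA fitted on the unscaled features and returning both the reduced DataFrame and the fitted PCA object; the column names 'pca_1'/'pca_2' come from the example, everything else is an assumption.

# Hypothetical sketch of data_utils.pca_reduction -- not the project's actual helper.
import pandas as pd
from sklearn.decomposition import PCA


def pca_reduction(df, n_components=2):
    # Fit PCA on the raw (unscaled) features so the covariance matrix,
    # not the correlation matrix, drives the components.
    pca = PCA(n_components=n_components)
    reduced = pca.fit_transform(df.values)
    columns = ['pca_%d' % (i + 1) for i in range(n_components)]
    reduced_df = pd.DataFrame(data=reduced, index=df.index, columns=columns)
    return reduced_df, pca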
Code Example #2
    # plt.title('mRS-3')
    # Cutoff: take the 0.85 quantile of the LOF scores.
    threshold = pd.Series(X_scores).quantile(0.85)
    # quantile used per mRS score -- 0: 0.95, 1: 0.90, 2: 0.90, 3: 0.90, 4: 0.90, 5: 0.85
    plt.annotate('Elbow point',
                 xy=(1104, threshold),
                 arrowprops=dict(arrowstyle='->'),
                 xytext=(900, threshold + 0.09))
    print(threshold)
    return threshold


if __name__ == '__main__':
    mrs = 5
    test_dataset = 'tnk'
    id_df, bi_df, mrs_df, nih_df = data_utils.get_tsr(mrs, 'is')
    bi_df_unique = bi_df.drop_duplicates()
    bi_df_pca, pca = data_utils.pca_reduction(bi_df)
    bi_df_pca_unique = bi_df_pca.drop_duplicates()

    k_neighbors = 10
    outliers_fraction = 0.01
    # Fit a Local Outlier Factor model on the PCA-reduced data
    clf = LocalOutlierFactor(n_neighbors=k_neighbors,
                             contamination=outliers_fraction)
    # determine the score cutoff from the LOF score distribution
    cutoff = score_cutoff(bi_df_pca_unique, clf)
    # label each unique sample as an inlier or outlier using the cutoff
    labels = make_score_label(bi_df_pca_unique, clf, cutoff)
    data_labeled_all, data_labeled_unique = data_utils.label_data(
        bi_df, bi_df_pca_unique, labels)
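Code Example #2 picks up mid-way through score_cutoff and also calls make_score_label, neither of which is shown in full. For orientation, here is a minimal sketch of how X_scores and the inlier/outlier labels could be produced with scikit-learn's LocalOutlierFactor; the lof_scores helper, the sign convention (negating negative_outlier_factor_ so that larger means more anomalous), and the 0/1 label values are assumptions, not the project's actual code.

# Hypothetical sketch -- shows where X_scores and the labels could come from.
import numpy as np


def lof_scores(X, clf):
    # Fit the LOF model and return positive outlier scores
    # (larger score = more anomalous).
    clf.fit(X)
    return -clf.negative_outlier_factor_


def make_score_label(X, clf, cutoff):
    # Mark samples whose LOF score exceeds the cutoff as outliers (1),
    # everything else as inliers (0).
    scores = lof_scores(X, clf)
    return np.where(scores > cutoff, 1, 0)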