# Disabled alternative input, kept for reference:
# X, Y, err, names = datasets.read_dan_data()
# n_bins = 64
# if args.use_restricted_bins:
#     X = np.take(X, range(17, 34)+range(n_bins+12, n_bins+17), axis=1)

# Each recognised args.dataset value maps to its reader and to the number of
# leading columns that are kept when args.thermal_only is set.
_readers = {
    'equatorial': (datasets.read_acs_grid_data, 64),
    'polar': (datasets.read_polar_data, 161),
}
if args.dataset in _readers:
    _read, _n_keep = _readers[args.dataset]
    X, Y = _read(shuffle=False)
    if args.thermal_only:
        X = np.take(X, range(_n_keep), axis=1)

if args.normalize:
    X = datasets.normalize_counts(X)

# Fit PCA model and project data into PC space
pca = PCA(n_components=args.n_components)
pca.fit(X)
transformed = pca.transform(X)

# Plot clusters in PC space: two separate figures, each with one 3-D axes.
# Only the first axes gets its axis labels ('1', '2', '3') here.
fig = plt.figure()
ax1 = fig.add_subplot(111, projection='3d')
for _set_label, _text in ((ax1.set_xlabel, '1'),
                          (ax1.set_ylabel, '2'),
                          (ax1.set_zlabel, '3')):
    _set_label(_text)

fig2 = plt.figure()
ax2 = fig2.add_subplot(111, projection='3d')
parser.add_argument('--n_components', type=int, default=3,
                    help='number of principal components to use for PCA')
parser.add_argument(
    '--use_restricted_bins', action='store_true',
    help='only use bins 18-34 and 13-17 for thermal and epithermal')
args = parser.parse_args()

# Load the data sets
X, y = datasets.read_acs_grid_data()
dan_X, dan_y, _, names = datasets.read_dan_data()

# Normalize counts to approximately same range
X = datasets.normalize_counts(X)
dan_X = datasets.normalize_counts(dan_X)

if args.use_restricted_bins:
    n_bins = 64
    # Zero-based columns 17..33 plus n_bins+12..n_bins+16 (the bins named in
    # the --use_restricted_bins help text). The list is built once and shared
    # so both arrays are guaranteed to keep the same columns. list(range(...))
    # is required because Python 3 range objects cannot be concatenated
    # with `+`; on Python 2 the result is unchanged.
    restricted_cols = (list(range(17, 34)) +
                       list(range(n_bins + 12, n_bins + 17)))
    X = np.take(X, restricted_cols, axis=1)
    dan_X = np.take(dan_X, restricted_cols, axis=1)

# Project the data into principal subspace of model data: the PCA is fitted
# on X only, then the same fitted transform is applied to X and dan_X.
pca = PCA(n_components=args.n_components)
pca.fit(X)
X = pca.transform(X)
dan_X = pca.transform(dan_X)
X, Y = datasets.read_acs_grid_data() X_test, Y_test, Y_chi2, test_names = datasets.read_dan_data(limit_2000us=False, label_source='asu') n_bins = len(datasets.time_bins_dan)-1 elif args.model_grid == 'both': X_full, Y_full = datasets.read_sim_data(use_dan_bins=True) X_rover, Y_rover = datasets.read_grid_data(limit_2000us=True) X = np.concatenate([X_full, X_rover]) Y = np.concatenate([Y_full, Y_rover]) X_test, Y_test, Y_test_error, test_names = datasets.read_dan_data(limit_2000us=True) n_bins = 34 X_train = X Y_train = Y n_test = X_test.shape[0] # Normalize counts to approximately same range X_train = datasets.normalize_counts(X_train) X_test = datasets.normalize_counts(X_test) # DAN bins have some count overlap in the early bins # between CTN (total neutrons) and CETN, leading to # negative thermal counts in the early bins if args.ignore_early_bins: X_train = np.take(X_train, range(5, n_bins)+range(n_bins+5, n_bins*2), axis=1) X_test = np.take(X_test, range(5, n_bins)+range(n_bins+5, n_bins*2), axis=1) # These bins demonstrate the most dynamic range with respect to changing # subsurface geochemistry: 18-34 for CTN and 13-17 for CETN if args.use_restricted_bins: X_train = np.take(X_train, range(17, 34)+range(n_bins+12, n_bins+17), axis=1) X_test = np.take(X_test, range(17, 34)+range(n_bins+12, n_bins+17), axis=1) print X_train.shape
type=int, default=3, help='number of principal components to use for PCA')
# Boolean flag: when given, the loaded data is passed through
# datasets.normalize_counts before the PCA is fitted.
parser.add_argument(
    '--normalize', action='store_true',
    help='normalize the data by dividing each bin by total counts')
args = parser.parse_args()

# Prepare the plot
# 3x2 grid of axes; all panels share both the x and the y axis.
fig, axes = plt.subplots(nrows=3, ncols=2, sharey=True, sharex=True)

# Plot the principal components of full grid data
# Only the first value returned by read_sim_data is used here.
data, _ = datasets.read_sim_data()
if args.normalize:
    data = datasets.normalize_counts(data)
# NOTE(review): the component count is the literal 3 rather than a parsed
# command-line value; the step plots below index components 0, 1 and 2, so
# at least three components are required -- confirm this is intentional.
pca = PCA(n_components=3)
pca.fit(data)
# Each component is sliced to its first len(datasets.time_bins_sim) entries
# so that it has the same length as the x values passed to step().
axes[0, 0].step(datasets.time_bins_sim,
                pca.components_[0][:len(datasets.time_bins_sim)],
                where='post', linewidth=2, label='PC 1')
axes[0, 0].step(datasets.time_bins_sim,
                pca.components_[1][:len(datasets.time_bins_sim)],
                where='post', linewidth=2, label='PC 2')
axes[0, 0].step(datasets.time_bins_sim,
                pca.components_[2][:len(datasets.time_bins_sim)],
                where='post',
help='number of principal components to use for PCA') parser.add_argument('--normalize', action='store_true', help='normalize the data before PCA') args = parser.parse_args() X_sebina, Y_sebina = datasets.read_acs_grid_data() print Y_sebina.shape X_dan, Y_dan, err_dan, names_dan = datasets.read_dan_data() print Y_dan.shape time_bins = datasets.time_bins_dan n_bins = 64 if args.normalize: X_sebina = datasets.normalize_counts(X_sebina) X_dan = datasets.normalize_counts(X_dan) pca = PCA(n_components=args.n_components) X_t = pca.fit_transform(X_sebina) # Plot the Sebina grid points in PC space fig = plt.figure() ax1 = fig.add_subplot(1, 1, 1, projection='3d') ax1.set_xlabel('PC 1') ax1.set_ylabel('PC 2') ax1.set_zlabel('PC 3') for x_t, (h, acs) in zip(X_t, Y_sebina): exists = False for [h_dan, acs_dan] in Y_dan: