def pcAnalysis(X, Xtest, w=None, ncomp=2, useTSNE=False):
    """Project train and test sets into a shared low-dimensional space (TruncatedSVD or t-SNE) and plot both."""
    X_all = pd.concat([Xtest, X])
    if useTSNE:
        print("t-SNE analysis for train/test")
        # sklearn's TSNE has no transform(); embed train and test jointly.
        pca = TSNE(n_components=ncomp)
        X_r = pca.fit_transform(X_all.values)
    else:
        print("PC analysis for train/test")
        pca = TruncatedSVD(n_components=ncomp)
        pca.fit(X)
        X_r = pca.transform(X_all.values)
    print(pca)
    n_test = len(Xtest.index)
    plt.scatter(X_r[n_test:, 0], X_r[n_test:, 1], c='r', label="train", alpha=0.5)
    plt.scatter(X_r[:n_test, 0], X_r[:n_test, 1], c='g', label="test", alpha=0.5)
    if not useTSNE:
        # explained_variance_ratio_ only exists on the TruncatedSVD branch.
        print("Total variance:", np.sum(pca.explained_variance_ratio_))
        print("Explained variance:", pca.explained_variance_ratio_)
    plt.legend()
    plt.show()
def process_data(train_df, test_df, ylabel='target', standarization=False,
                 discretization=False, transform=None):
    numerical_features = train_df.columns
    if standarization:
        standarized_features = numerical_features
        standarize_feature(train_df, test_df, standarized_features)
    if discretization:
        discretized_features = numerical_features
        discretize_feature(train_df, test_df, discretized_features,
                           num_bins=10, how='equal_freq')

    # .as_matrix() was removed from pandas; use .values instead.
    X = train_df.drop(ylabel, axis=1).values
    y = train_df[ylabel].values
    X_submission = test_df.values

    if transform == 'log':
        X = np.log1p(X)
        X_submission = np.log1p(X_submission)
    elif transform == 'sqrt':
        X = np.sqrt(X + 3.0 / 8)
        X_submission = np.sqrt(X_submission + 3.0 / 8)
    elif transform == 'pca':
        pca = PCA(n_components=3).fit(X)
        X = pca.transform(X)
        X_submission = pca.transform(X_submission)
    elif transform == 'tsne':
        # sklearn's TSNE has no transform(); embed train and test jointly.
        n_train = X.shape[0]
        embedded = TSNE(n_components=3).fit_transform(np.vstack((X, X_submission)))
        X, X_submission = embedded[:n_train], embedded[n_train:]
    elif transform == 'pca+':
        pca = PCA(n_components=3).fit(X)
        X = np.hstack((X, pca.transform(X)))
        # Fixed: the submission features were being built from X, not X_submission.
        X_submission = np.hstack((X_submission, pca.transform(X_submission)))
    elif transform == 'tsne+':
        # Same joint-embedding workaround as above, appended to the raw features.
        n_train = X.shape[0]
        embedded = TSNE(n_components=3).fit_transform(np.vstack((X, X_submission)))
        X = np.hstack((X, embedded[:n_train]))
        X_submission = np.hstack((X_submission, embedded[n_train:]))

    return X, y, X_submission
def get_twenty_dataset(remove_stop_word=False, preprocessing_trick=None, n_components=2):
    # Note: the remove option must be 'quotes' (plural); 'quote' is silently ignored.
    twenty_train = fetch_20newsgroups(subset='train',
                                      remove=['headers', 'footers', 'quotes'],
                                      shuffle=True)
    twenty_test = fetch_20newsgroups(subset='test',
                                     remove=['headers', 'footers', 'quotes'],
                                     shuffle=True)

    if remove_stop_word:
        count_vect = CountVectorizer(stop_words=stopwords.words('english')
                                     + list(string.punctuation))
    else:
        count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(twenty_train.data)
    X_test_counts = count_vect.transform(twenty_test.data)
    _, vocab_size = X_train_counts.shape

    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    X_test_tfidf = tfidf_transformer.transform(X_test_counts)
    X_train, X_test = X_train_tfidf, X_test_tfidf

    if preprocessing_trick == 'SVD':
        pca = TruncatedSVD(n_components=n_components)
        X_train = pca.fit_transform(X_train)
        X_test = pca.transform(X_test)
    elif preprocessing_trick == 'LDA':
        lda = LinearDiscriminantAnalysis()
        X_train = lda.fit_transform(X_train.toarray(), twenty_train.target)
        X_test = lda.transform(X_test.toarray())
    elif preprocessing_trick == 'TSNE':
        # sklearn's TSNE has no transform(); embed train and test jointly.
        tsne = TSNE(n_components=n_components)
        n_train = X_train.shape[0]
        embedded = tsne.fit_transform(np.vstack((X_train.toarray(), X_test.toarray())))
        X_train, X_test = embedded[:n_train], embedded[n_train:]
    elif preprocessing_trick == 'autoencoder':
        # n_components = 256
        num_samples, feature_dim = X_train.shape
        print('autoencoder: ', num_samples, feature_dim, n_components)
        input_sample = Input(shape=(feature_dim,))
        encoded = Dense(1024, activation='relu')(input_sample)
        encoded = Dense(512, activation='relu')(encoded)
        encoded = Dense(256, activation='relu')(encoded)
        decoded = Dense(512, activation='relu')(encoded)
        decoded = Dense(1024, activation='relu')(decoded)
        decoded = Dense(feature_dim, activation='sigmoid')(decoded)
        autoencoder = Model(input_sample, decoded)
        autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')
        autoencoder.fit(X_train.todense(), X_train.todense(),
                        epochs=50, batch_size=256, shuffle=True,
                        validation_data=(X_test.todense(), X_test.todense()))
        X_train = autoencoder.predict(X_train.todense())
        X_test = autoencoder.predict(X_test.todense())

    # Largest value in the training matrix (kept under the original 'max_length' key).
    max_length = np.amax(X_train)
    embedding_dict = {'vocab_size': vocab_size, 'max_length': max_length}
    return X_train, twenty_train.target, X_test, twenty_test.target, embedding_dict
def TSNE(X_train, y_train=None, X_test=None, n=2):
    """t-SNE embedding of X_train, and of X_test if given.

    Note: Barnes-Hut t-SNE supports only n_components < 4, so a large
    default such as n=100 cannot run; y_train is accepted for API
    compatibility but ignored by t-SNE.
    """
    # The local import deliberately shadows this function's (unfortunate)
    # name with sklearn's TSNE class inside the body.
    import numpy as np
    from sklearn.manifold import TSNE
    mod = TSNE(n_components=n)
    if X_test is None:
        # sklearn's TSNE has no transform(); fit_transform returns the embedding.
        return mod.fit_transform(X_train)
    # Unseen data cannot be projected onto a fitted map; embed jointly instead.
    joint = mod.fit_transform(np.vstack((X_train, X_test)))
    return joint[:len(X_train)], joint[len(X_train):]
def plot_data(args, seq, original_seq=None):
    if args.delta:
        plt.figure()
        dist = np.sum((seq[1:, ...] - seq[:-1, ...])**2, axis=1)**0.5
        plt.hist(dist)
        if args.save:
            plt.savefig(args.save, dpi=120)
        else:
            plt.show()
        return

    if args.pca:
        pca = PCA(n_components=args.pca)
        if original_seq is None:
            seq = pca.fit_transform(seq)
        else:
            original_seq = pca.fit_transform(original_seq)
            seq = pca.transform(seq)

    if args.tsne:
        tsne = TSNE(n_components=2, perplexity=30.0, n_iter=2000, verbose=2)
        if original_seq is None:
            seq = tsne.fit_transform(seq)
        else:
            # sklearn's TSNE has no transform(); embed both sequences jointly
            # and keep the rows belonging to seq.
            joint = tsne.fit_transform(np.vstack((original_seq, seq)))
            seq = joint[len(original_seq):]

    if seq.shape[1] == 2:
        plt.figure()
        x, y = zip(*seq[:, :])
        color_list = cm.get_cmap(name="viridis")
        if args.strip:
            n, m = tuple(args.strip)
            for i in range(0, seq.shape[0] - 1, m):
                plt.plot(x[i:(i + n)], y[i:(i + n)], '-',
                         color=color_list(i / (seq.shape[0] - 1)))
        else:
            for i in range(seq.shape[0] - 1):
                plt.plot(x[i:(i + 2)], y[i:(i + 2)], '.',
                         color=color_list(i / (seq.shape[0] - 1)))
        plt.axis('equal')
        if args.save:
            plt.savefig(args.save, dpi=120)
        else:
            plt.show()
    else:
        print("Cannot plot sequence: data is of size {}".format(seq.shape))
def do_tsne(train_std, val_std=np.array([]), num_dim=2):
    '''
    DESCRIPTION:
        Perform t-SNE dimensionality reduction on the training and validation sets.
    INPUT:
        |--- train_std: [array] 2D array of standardized train feature vectors, one row per training sample
        |--- val_std: [array] 2D array of validation feature vectors, one row per validation sample, standardized using training statistics
        |--- num_dim: [int] number of dimensions of the final subspace
    OUTPUT:
        |--- tsne_train: [array] 2D array (nb training samples x num_dim) with the t-SNE embedding of the training matrix
        |--- tsne_val: [array] 2D array (nb validation samples x num_dim) with the t-SNE embedding of the validation matrix
    '''
    tsne = TSNE(n_components=num_dim, random_state=0)
    if val_std.any():
        # sklearn's TSNE cannot project unseen data onto a fitted map, so the
        # two sets are embedded jointly and the result split back apart.
        joint = tsne.fit_transform(np.vstack((train_std, val_std)))
        tsne_train = joint[:len(train_std)]
        tsne_val = joint[len(train_std):]
    else:
        tsne_train = tsne.fit_transform(train_std)
        tsne_val = np.array([])
    return tsne_train, tsne_val
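# The same limitation recurs throughout this collection: sklearn's TSNE
# computes its embedding during fit/fit_transform and exposes no transform()
# for unseen samples. A minimal standalone sketch of the joint-embedding
# workaround used in the fixes above, with hypothetical random arrays
# standing in for real features:
import numpy as np
from sklearn.manifold import TSNE

X_train = np.random.rand(100, 20)  # hypothetical standardized training features
X_val = np.random.rand(30, 20)     # hypothetical validation features

# Embed both sets in one map, then split the rows back apart.
joint = TSNE(n_components=2, random_state=0).fit_transform(np.vstack((X_train, X_val)))
emb_train, emb_val = joint[:len(X_train)], joint[len(X_train):]
# Trade-off: the validation rows influence the map, so this is a visualization
# aid rather than a leak-free train/test projection.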
class TSNERepresentation(Representation):

    @staticmethod
    def default_config():
        default_config = Representation.default_config()

        # parameters
        default_config.parameters = Dict()
        default_config.parameters.perplexity = 30.0
        default_config.parameters.init = "random"
        default_config.parameters.random_state = None

        return default_config

    def __init__(self, n_features=28 * 28, n_latents=10, config={}, **kwargs):
        Representation.__init__(self, config=config, **kwargs)

        # input size (flatten)
        self.n_features = n_features
        # latent size
        self.n_latents = n_latents
        # feature range
        self.feature_range = (0.0, 1.0)

        # Barnes-Hut t-SNE supports only n_components < 4; the exact method
        # is required for a latent size like the default n_latents=10.
        self.algorithm = TSNE(n_components=self.n_latents, method="exact")
        self.update_algorithm_parameters()

    def fit(self, X_train, update_range=True):
        '''
        X_train: array-like (n_samples, n_features)
        '''
        X_train = np.nan_to_num(X_train)
        if update_range:
            # save (min, max) for normalization
            self.feature_range = (X_train.min(axis=0), X_train.max(axis=0))
        X_train = (X_train - self.feature_range[0]) / (self.feature_range[1] - self.feature_range[0])
        self.algorithm.fit(X_train)

    def calc_embedding(self, x):
        x = (x - self.feature_range[0]) / (self.feature_range[1] - self.feature_range[0])
        # Caveat: sklearn's TSNE has no transform() for unseen points, so the
        # query batch is embedded from scratch here; its coordinates do not
        # live in the same map as the data passed to fit().
        x = self.algorithm.fit_transform(x)
        return x

    def update_algorithm_parameters(self):
        self.algorithm.set_params(**self.config.parameters, verbose=False)
def get_IMDB_dataset(remove_stop_word=False, preprocessing_trick=None, n_components=2):
    with open('./dataset/IMDB.pickle', 'rb') as data:
        dataset = pickle.load(data)
    train_x_raw, train_y = dataset['train']
    test_x_raw, test_y = dataset['test']

    if remove_stop_word:
        count_vect = CountVectorizer(stop_words=stopwords.words('english')
                                     + list(string.punctuation))
    else:
        count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(train_x_raw)
    X_test_counts = count_vect.transform(test_x_raw)
    _, vocab_size = X_train_counts.shape

    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    X_test_tfidf = tfidf_transformer.transform(X_test_counts)
    X_train, X_test = X_train_tfidf, X_test_tfidf

    if preprocessing_trick == 'PCA':
        # TruncatedSVD rather than PCA, since the tf-idf matrix is sparse.
        pca = TruncatedSVD(n_components=n_components)
        X_train = pca.fit_transform(X_train)
        X_test = pca.transform(X_test)
    elif preprocessing_trick == 'LDA':
        lda = LinearDiscriminantAnalysis()
        X_train = lda.fit_transform(X_train.toarray(), train_y)
        X_test = lda.transform(X_test.toarray())
    elif preprocessing_trick == 'TSNE':
        # sklearn's TSNE has no transform(); embed train and test jointly.
        tsne = TSNE(n_components=n_components)
        n_train = X_train.shape[0]
        embedded = tsne.fit_transform(np.vstack((X_train.toarray(), X_test.toarray())))
        X_train, X_test = embedded[:n_train], embedded[n_train:]

    # Largest value in the training matrix (kept under the original 'max_length' key).
    max_length = np.amax(X_train)
    embedding_dict = {'vocab_size': vocab_size, 'max_length': max_length}
    return X_train, train_y, X_test, test_y, embedding_dict
class Visual(object):

    def __init__(self, mode='pca', dim=2, full=True, save=False):
        self._mode = mode
        self._model = None
        self._dim = dim
        self._data = None
        self._labels = None
        self._sizes = []
        self._counter = 0
        self._result = None
        self.size = 1  # size of dots
        self._full = full
        self._save = save

    @property
    def data(self):
        return self._data

    @data.setter
    def data(self, new_data):
        self._data = join_data(self._data, new_data, np.vstack)

    @property
    def labels(self):
        return self._labels

    @labels.setter
    def labels(self, new_labels):
        self._labels = join_data(self._labels, new_labels, np.hstack)
        self._sizes += [self.size] * len(new_labels)

    @timing
    def fit_data(self, reduce=None):
        if self._mode == 'pca':
            self._model = PCA(n_components=self._dim, random_state=opt.seed)
        if self._mode == 'tsne':
            self._model = TSNE(n_components=self._dim, perplexity=15,
                               random_state=opt.seed)
        if self._full or self._mode == 'tsne':
            # t-SNE must always embed the full data: sklearn's TSNE has no
            # transform(), so the fit-on-subset path below is PCA-only.
            self._result = self._model.fit_transform(self._data)
        else:
            self._model.fit(self._data[:reduce])
            self._result = self._model.transform(self._data)

    def plot(self, iter=0, show=True, gt_plot=0, prefix=''):
        if iter is not None:
            self._counter = iter
        plt.scatter(self._result[..., 0], self._result[..., 1],
                    c=self._labels, s=self._sizes, alpha=0.5)
        plt.grid(True)
        if self._save:
            # plt.figure(figsize=(1))
            dir_check(join(opt.dataset_root, 'plots'))
            dir_check(join(opt.dataset_root, 'plots', opt.subaction))
            pose_segm = ['!pose_', ''][opt.pose_segm]
            name = ['iter%d' % self._counter, 'gt', 'time'][gt_plot]
            name += '_%s.png' % self._mode
            name = prefix + '%s_%s_' % (opt.subaction, opt.tr_type) + name
            # if opt.grid_search:
            weight = ['w%d_' % int(opt.time_weight), ''][opt.time_weight == 1]
            folder_name = '%s_%slr_%.1e_dim_%d_ep_%d' % \
                          (opt.prefix, pose_segm, opt.lr, opt.embed_dim, opt.epochs)
            folder_name = opt.prefix + weight + folder_name
            dir_check(join(opt.dataset_root, 'plots', opt.subaction, folder_name))
            plt.savefig(join(opt.dataset_root, 'plots', opt.subaction,
                             folder_name, name), dpi=400)
            # else:
            #     plt.savefig(join(opt.dataset_root, 'plots', opt.subaction, name), dpi=400)
        if show:
            plt.show()

    def reset(self):
        plt.clf()
        self._counter += 1
        self._data = None
        self._labels = None
        self._sizes = []
        self.size = 1
#### T-Distributed Stochastic Neighbor Embedding ####
model = TSNE(learning_rate=100)
transformed = model.fit_transform(data2)
x = transformed[:, 0]
y = transformed[:, 1]
plt.scatter(x, y, c=color_list)
plt.xlabel('pelvic_radius')
plt.ylabel('degree_spondylolisthesis')
plt.show()

#### Principal Component Analysis ####
model = PCA()
model.fit(data3)
transformed = model.transform(data3)
print('Principal components: ', model.components_)

#### PCA variance ####
scaler = StandardScaler()
pca = PCA()
pipeline = make_pipeline(scaler, pca)
pipeline.fit(data3)

plt.bar(range(pca.n_components_), pca.explained_variance_)
plt.xlabel('PCA feature')
plt.ylabel('variance')
plt.show()

#### PCA ####
print(reconstruct_model.summary())

# %%
predict_model = K.models.Sequential()
for i in reconstruct_model.layers[6:]:
    predict_model.add(i)
predict_model.build(input_shape=(None, 14, 14, 1))
print(predict_model.summary())

# %%
import numpy as np
from sklearn.manifold import TSNE

# sklearn's TSNE has no separate fit()/transform() workflow; embed the test
# samples and the k-means centroids jointly so they share one 2-D map.
t_sne = TSNE(n_components=2)
joint = t_sne.fit_transform(np.vstack((tX_test, k_means.cluster_centers_)))
pr_test = joint[:len(tX_test)]
pr_clusters = joint[len(tX_test):]

# %%
centroids = k_means.cluster_centers_.reshape(10, 14, 14, 1)
cent_img = predict_model.predict(centroids)

# %%
def centroid_dist(sample):
    return np.array(
        [np.linalg.norm(sample - c) for c in k_means.cluster_centers_])
class Visual(object):

    def __init__(self, mode='pca', dim=2, reduce=None, save=False, svg=False,
                 saved_dots=''):
        # mpl.rcParams['image.cmap'] = 'cool'
        self._mode = mode
        self._model = None
        self._dim = dim
        self._data = None
        self._labels = None
        self._sizes = []
        self._counter = 0
        self._result = None
        self.size = 1  # size of dots
        self.reduce = reduce
        self._save = save
        self.svg = svg
        self.saved_dots = saved_dots

    @property
    def data(self):
        return self._data

    @data.setter
    def data(self, new_data):
        self._data = join_data(self._data, new_data, np.vstack)

    @property
    def labels(self):
        return self._labels

    @labels.setter
    def labels(self, new_labels):
        self._labels = join_data(self._labels, new_labels, np.hstack)
        self._sizes += [self.size] * len(new_labels)

    @timing
    def fit_data(self):
        if self.saved_dots:
            self._result = np.loadtxt(self.saved_dots)
        else:
            if self._mode == 'pca':
                self._model = PCA(n_components=self._dim, random_state=opt.seed)
            if self._mode == 'tsne':
                self._model = TSNE(n_components=self._dim, perplexity=15,
                                   random_state=opt.seed)
            if self.reduce is None or self._mode == 'tsne':
                # sklearn's TSNE has no transform(); the fit-on-a-fraction
                # path below therefore only works for PCA.
                self._result = self._model.fit_transform(self._data)
            else:
                fraction = int(self._data.shape[0] * self.reduce / 100)
                self._model.fit(self._data[:fraction])
                self._result = self._model.transform(self._data)

    def plot(self, iter=0, show=True, prefix=''):
        if iter is not None:
            self._counter = iter
        if 20 in self._labels:
            self._labels = np.array(self._labels)
            mask = self._labels == 20
            self._labels[mask] = 10
        plt.axis('off')
        plt.scatter(self._result[..., 0], self._result[..., 1],
                    c=self._labels, s=self._sizes, alpha=1)
        plt.grid(True)
        if prefix == 'time_':
            plt.colorbar()
        if self._save:
            # plt.figure(figsize=(1))
            dir_check(join(opt.dataset_root, 'plots'))
            dir_check(join(opt.dataset_root, 'plots', opt.subaction))
            # name = ['iter%d_' % self._counter, 'gt_'][gt_plot]
            name = prefix + '%s_%s_' % (opt.subaction, opt.model_name)
            folder_name = opt.log_str
            dir_check(join(opt.dataset_root, 'plots', opt.subaction, folder_name))
            folder_name = join(opt.log_str, opt.vis_mode)
            dir_check(join(opt.dataset_root, 'plots', opt.subaction, folder_name))
            if self.svg:
                name += '_%s.svg' % self._mode
            else:
                name += '_%s.png' % self._mode
            # plt.savefig(join(opt.dataset_root, 'plots', opt.subaction,
            #                  folder_name, name), dpi=400)
            plt.savefig(join(opt.dataset_root, 'plots', opt.subaction,
                             folder_name, name),
                        transparent=True, dpi=300)
            np.savetxt(join(opt.dataset_root, 'plots', opt.subaction,
                            folder_name, '%s.txt' % opt.vis_mode),
                       self._result)
        if show:
            plt.show()

    def reset(self):
        plt.clf()
        self._counter += 1
        self._data = None
        self._labels = None
        self._sizes = []
        self.size = 1

    def color(self, labels, prefix, reset=False):
        plt.clf()
        self._labels = labels
        self.plot(show=False, prefix=prefix)
        if reset:
            self.reset()

    def fit(self, data, labels, prefix, reset=True):
        self._data = data
        self._labels = labels
        self._sizes += [self.size] * len(labels)
        self.fit_data()
        self.plot(show=False, prefix=prefix)
        if reset:
            self.reset()
def tsne(X, n_components):
    # Honor the n_components argument (previously hard-coded to 2) and use
    # fit_transform, since sklearn's TSNE has no separate transform().
    model = TSNE(n_components=n_components, perplexity=40)
    return model.fit_transform(X)
def train_delta(matrix):
    import numpy as np
    import tensorflow as tf
    import pandas as pd
    import matplotlib.pyplot as plt
    from sklearn.preprocessing import MinMaxScaler
    from tensorflow.keras import Model, Input
    from tensorflow.keras.layers import (Dense, Dropout, GRU, Flatten,
                                         GaussianNoise, concatenate)
    from tensorflow.keras.models import load_model
    from tensorflow.keras.callbacks import Callback
    from tensorflow.keras.callbacks import EarlyStopping
    from tensorflow.keras.callbacks import ModelCheckpoint
    from tensorflow.keras.optimizers import Adam
    import supplemental_functions
    from supplemental_functions import (sampling_fix, prepareinput,
                                        prepareinput_nozero, prepareoutput)

    tf.keras.backend.clear_session()

    [newdim, percent_drilled, start, stop, inc_layer1, inc_layer2, data_layer1,
     data_layer2, dense_layer, range_max, memory, predictions, drop1, drop2,
     lr, bs, ensemble_count] = matrix

    drop1 = drop1 / 100
    drop2 = drop2 / 100
    inc_layer2 = inc_layer2 / 1000
    lr = lr / 10000
    percent_drilled = percent_drilled / 100

    df = pd.read_csv('F9ADepth.csv')
    df_target = df.copy()

    droplist = [
        'nameWellbore', 'name', 'Pass Name unitless',
        'MWD Continuous Inclination dega', 'Measured Depth m',
        'MWD Continuous Azimuth dega', "Unnamed: 0", "Unnamed: 0.1"
    ]
    for i in droplist:
        df = df.drop(columns=i)
    for i in list(df):
        if df[i].count() < 1000:
            del df[i]
            info(f'dropped {i}')

    step = 0.230876
    X = np.arange(start, stop, step)
    X = X.reshape(X.shape[0], 1)

    my_data1 = sampling_fix(df_target, 'MWD Continuous Inclination dega',
                            start, stop, 1.7, 1, 0).predict(X)

    data_array = []
    for i in list(df):
        sampled = sampling_fix(df_target, i, start, stop, 1.7, 3, 0).predict(X)
        if not np.isnan(np.sum(sampled)):
            data_array.append(sampled)
            info(f'Using {i}')
    data_array = np.asarray(data_array)

    dftemp = pd.DataFrame()
    dftemp['dinc'] = my_data1
    dftemp['dinc'] = dftemp['dinc'].diff(1).rolling(3, center=True).mean()
    my_data1 = dftemp['dinc'].ffill().bfill().values

    data_array = data_array.T
    pre_PCA_scaler = MinMaxScaler()
    data_array = pre_PCA_scaler.fit_transform(data_array)

    from sklearn.decomposition import PCA
    from sklearn.manifold import TSNE

    # =========================================================================
    # pca = PCA().fit(data_array)
    # plt.plot(np.cumsum(pca.explained_variance_ratio_))
    # plt.xlabel('number of components')
    # plt.ylabel('cumulative explained variance')
    # plt.show()
    # =========================================================================

    sampcount = int(len(data_array) * percent_drilled)
    # sklearn's TSNE cannot fit on the drilled portion and then project the
    # remainder (it has no transform()); PCA supports that out-of-sample
    # pattern, so it is used here. A t-SNE alternative would have to embed
    # the full array in one fit_transform, leaking undrilled rows.
    pca = PCA(n_components=newdim).fit(data_array[:sampcount])
    projected = pca.transform(data_array)

    my_data = []
    for i in range(newdim):
        my_data.append(projected[:, i])

    my_data1 = my_data1[:, np.newaxis]
    my_data_newaxis = []
    for i in my_data:
        my_data_newaxis.append(i[:, np.newaxis])

    temp_data1 = pd.DataFrame(my_data1)
    range1 = temp_data1[0].diff(memory + predictions)
    range2 = np.amax(range1)

    RNN_scaler = MinMaxScaler()
    my_data1 = RNN_scaler.fit_transform(my_data1)
    my_data_scaled = []
    for i in my_data_newaxis:
        my_data_scaled.append(MinMaxScaler().fit_transform(i))

    X1 = prepareinput(my_data1, memory)
    Xdata = []
    for i in my_data_scaled:
        Xn = prepareinput_nozero(i, memory, predictions)
        Xdata.append(Xn)
    y_temp = prepareoutput(my_data1, memory, predictions)

    stack = []
    for i in range(memory):
        stack.append(np.roll(my_data1, -i))
    X_temp = np.hstack(stack)
    y = y_temp

    data_length = len(my_data1) - memory - predictions
    testing_cutoff = 0.80
    border1 = int(data_length * (percent_drilled * 0.8))
    border2 = int(data_length * percent_drilled)
    border3 = int(data_length * (percent_drilled + 0.2))

    X1_train = X1[:border1]
    X1_test = X1[border1:border2]
    X1_test2 = X1[border2:border3]
    Xdata_train = []
    Xdata_test = []
    Xdata_test2 = []
    for i in Xdata:
        Xdata_train.append(i[:border1])
        Xdata_test.append(i[border1:border2])
        Xdata_test2.append(i[border2:border3])
    y_train, y_test, y_test2 = y[:border1], y[border1:border2], y[border2:border3]

    X1_train = X1_train.reshape((X1_train.shape[0], X1_train.shape[1], 1))
    X1_test = X1_test.reshape((X1_test.shape[0], X1_test.shape[1], 1))
    X1_test2 = X1_test2.reshape((X1_test2.shape[0], X1_test2.shape[1], 1))

    Xdata_train_r = []
    Xdata_test_r = []
    Xdata_test2_r = []
    for i in range(newdim):
        Xdata_train_r.append(Xdata_train[i].reshape(
            (Xdata_train[i].shape[0], Xdata_train[i].shape[1], 1)))
        Xdata_test_r.append(Xdata_test[i].reshape(
            (Xdata_test[i].shape[0], Xdata_test[i].shape[1], 1)))
        Xdata_test2_r.append(Xdata_test2[i].reshape(
            (Xdata_test2[i].shape[0], Xdata_test2[i].shape[1], 1)))

    X_train_con = np.concatenate(Xdata_train_r, axis=2)
    X_test_con = np.concatenate(Xdata_test_r, axis=2)
    X_test2_con = np.concatenate(Xdata_test2_r, axis=2)

    X_train = [X1_train, X_train_con]
    X_test = [X1_test, X_test_con]
    X_test2 = [X1_test2, X_test2_con]

    input1 = Input(shape=(memory, 1))
    input2 = Input(shape=(memory + predictions, newdim))

    x1 = GaussianNoise(inc_layer2, input_shape=(memory, 1))(input1)
    x1 = GRU(units=inc_layer1,
             kernel_initializer='glorot_uniform',
             recurrent_initializer='orthogonal',
             bias_initializer='zeros',
             kernel_regularizer='l2',
             recurrent_regularizer=None,
             bias_regularizer=None,
             activity_regularizer=None,
             kernel_constraint=None,
             recurrent_constraint=None,
             bias_constraint=None,
             return_sequences=False,
             return_state=False,
             stateful=False)(x1)
    x1 = Dropout(drop1)(x1)
    x1 = Model(inputs=input1, outputs=x1)

    x2 = Dense(data_layer1, input_shape=(memory + predictions, 3))(input2)
    x2 = Dropout(drop2)(x2)
    x2 = Flatten()(x2)
    x2 = Dense(data_layer2)(x2)
    x2 = Model(inputs=input2, outputs=x2)

    combined = concatenate([x1.output, x2.output])
    z = Dense(dense_layer, activation="relu")(combined)
    z = Dense(predictions, activation="linear")(z)

    # define the model
    myadam = Adam(learning_rate=lr, beta_1=0.9, beta_2=0.999, amsgrad=False)

    class PlotResuls(Callback):

        def on_train_begin(self, logs={}):
            self.i = 0
            self.x = []
            self.losses = []
            self.val_losses = []
            # self.fig = plt.figure()
            self.logs = []

        def on_epoch_end(self, epoch, logs={}):
            self.logs.append(logs)
            self.x.append(self.i)
            self.losses.append(logs.get('loss'))
            self.val_losses.append(logs.get('val_loss'))
            self.i += 1
            # print(".", end='')
            if (epoch % 14999 == 0) & (epoch > 0):
                print(epoch)
                plt.plot(self.x, np.log(self.losses), label="loss")
                plt.plot(self.x, np.log(self.val_losses), label="val_loss")
                plt.grid(color='gray', linestyle='-', linewidth=1, alpha=0.2)
                plt.title("Loss")
                plt.legend()
                plt.show()
                # mymanyplots(epoch, data, model)

    # data = [X1, X2, X3, X4, y, X1_train, X_train, X_test, X1_test, border1,
    #         border2, y_train, y_test, memory, y_temp, predictions]

    plot_results = PlotResuls()
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=25)

    ens_val_array = np.zeros(ensemble_count)
    ens_test_array = np.zeros(ensemble_count)

    for ens_no in range(ensemble_count):
        tf.keras.backend.clear_session()
        mc = ModelCheckpoint(f'best_model_ens_{ens_no}.h5',
                             monitor='val_loss', mode='min',
                             save_best_only=True, verbose=0)
        model = Model(inputs=[x1.input, x2.input], outputs=z)
        model.compile(optimizer=myadam, loss='mean_squared_error')
        history = model.fit(X_train, y_train,
                            validation_data=(X_test, y_test),
                            epochs=2000, verbose=0, batch_size=bs,
                            callbacks=[plot_results, es, mc])
        model = load_model(f'best_model_ens_{ens_no}.h5')
        valresult = np.log(model.evaluate(x=X_test, y=y_test, verbose=0))
        testresult = np.log(model.evaluate(x=X_test2, y=y_test2, verbose=0))
        ens_val_array[ens_no] = valresult
        ens_test_array[ens_no] = testresult

    winner = ens_val_array.argmin()
    model = load_model(f'best_model_ens_{winner}.h5')
    info(ens_val_array)
    info(ens_test_array)
    info(f'Validation winner {winner}')

    sample_count = len(X_test2[0])
    y_pred = model.predict(X_test2)

    plt.plot(np.log(history.history['loss']), label='loss')
    plt.plot(np.log(history.history['val_loss']), label='test')
    plt.grid(color='gray', linestyle='-', linewidth=1, alpha=0.2)
    plt.legend()
    plt.show()  # show before clearing the figure
    plt.clf()

    for i in range(5):
        rand = np.random.randint(0, len(X_test[0]))
        y_test_descaled = RNN_scaler.inverse_transform(y_test[rand, np.newaxis])
        y_in_descaled = RNN_scaler.inverse_transform(X_test[0][rand, :])
        y_in_descaled = y_in_descaled.flatten()
        y_test_descaled = y_test_descaled.flatten()
        y_pred = model.predict(X_test)
        y_pred_descaled = RNN_scaler.inverse_transform(y_pred[rand, np.newaxis])
        y_pred_descaled = y_pred_descaled.flatten()

        plt.plot(y_test_descaled, label="true")
        plt.plot(y_pred_descaled, label="predicted")
        plt.title('Inclination delta')
        # plt.ylim(0, 1)
        plt.legend()
        plt.show()

        plt.figure(figsize=(5, 4))
        x_after = np.linspace(0, 23, 100)
        x_before = np.linspace(-23, -0.23, 100)
        plt.plot(x_before, np.cumsum(y_in_descaled), label="measured",
                 linestyle="-", c="black")
        commonpoint = np.cumsum(y_in_descaled)[-1]
        plt.plot(x_after, commonpoint + np.cumsum(y_test_descaled),
                 label="actual", linestyle='-.', c='black')
        plt.plot(x_after, commonpoint + np.cumsum(y_pred_descaled),
                 label="predicted", linestyle=':', c='black')
        plt.ylim(-1, 7)
        plt.grid()
        plt.xlabel("Distance to sensor [m]")
        plt.ylabel("Inclination, local coordinates, [deg]")
        plt.legend()
        plt.tight_layout()
        plt.savefig(f'Sample, {percent_drilled}, no.{i}.pdf')
        plt.show()

    # #### Different ensemble, voting ######
    # ypred_array = []
    # for i in range(ensemble_count):
    #     model = load_model(f'best_model_ens_{i}.h5')
    #     y_pred = model.predict(X_test2)
    #     ypred_array.append(y_pred)
    # y_pred = np.average(ypred_array, axis=0)
    # ######## Different ensemble ends here

    y_test_descaled = RNN_scaler.inverse_transform(y_test2)
    y_pred = model.predict(X_test2)
    y_pred_descaled = RNN_scaler.inverse_transform(y_pred)
    error_matrix = np.cumsum(y_pred_descaled, axis=1) - np.cumsum(y_test_descaled, axis=1)

    def rand_jitter(arr):
        stdev = .004 * (max(arr) - min(arr))
        return arr + np.random.randn(len(arr)) * stdev

    def jitter(x, y, s=20, c='b', marker='o', cmap=None, norm=None, vmin=None,
               vmax=None, alpha=None, linewidths=None, verts=None, **kwargs):
        return plt.scatter(rand_jitter(x), rand_jitter(y), s=s, c=c,
                           marker=marker, cmap=cmap, norm=norm, vmin=vmin,
                           vmax=vmax, alpha=alpha, linewidths=linewidths,
                           verts=verts, **kwargs)

    plt.figure(figsize=(5, 5), dpi=200)
    for i in range(sample_count):
        _ = jitter(x_after, error_matrix[i], alpha=1, s=0.5, marker=".",
                   c="black")
    plt.title(f"delta, drilled {percent_drilled}")
    plt.xlabel("Distance to sensor [m]")
    plt.ylabel("Prediction error [deg]")
    plt.grid()
    plt.tight_layout()
    plt.savefig(f'Birdflock, {percent_drilled}.pdf')
    plt.show()

    # plt.plot(np.median(error_matrix, axis=0), linewidth=8, alpha=1, c="white")
    # plt.plot(np.median(error_matrix, axis=0), linewidth=2, alpha=1, c="black")
    aae = np.average(np.abs(error_matrix), axis=0)
    plt.scatter(np.arange(0, 100, 1), aae, marker="o", s=40, alpha=0.7,
                c="white", zorder=2)

    c_array = np.empty(100, dtype=object)
    for i in range(100):
        if aae[i] <= 0.4:
            c_array[i] = "green"
        elif aae[i] <= 0.8:
            c_array[i] = "orange"
        else:
            c_array[i] = "red"
    plt.scatter(np.arange(0, 100, 1), aae, marker=".", s=20, alpha=1,
                c=c_array, zorder=3, label="Average Absolute Error")
    plt.ylim((-3, 3))
    plt.axhline(y=0, xmin=0, xmax=1, linewidth=2, c="black")
    plt.axhline(y=0.4, xmin=0, xmax=1, linewidth=1, c="black")
    plt.axhline(y=0.8, xmin=0, xmax=1, linewidth=1, c="black")
    plt.legend()
    plt.show()

    # The ensemble winner is already loaded above; no 'best_model.h5'
    # checkpoint is ever written by this function.
    # model = load_model('best_model.h5')
    # mymanyplots(-1, data, model)
    # myerrorplots(data, model)

    valresult = np.log(model.evaluate(x=X_test, y=y_test, verbose=0))
    testresult = np.log(model.evaluate(x=X_test2, y=y_test2, verbose=0))
    return valresult, testresult, aae
# 2 t-SNE ---------------------------------------------------------------------------
# Parameter settings
n_components = 2
learning_rate = 300
perplexity = 30
early_exaggeration = 12
init = 'random'
random_state = 2018

# Create the estimator
tSNE = TSNE(n_components=n_components, learning_rate=learning_rate,
            perplexity=perplexity, early_exaggeration=early_exaggeration,
            init=init, random_state=random_state)

# Fit and embed in one step: sklearn's TSNE has no separate transform()
X_train_tSNE = tSNE.fit_transform(X_train_PCA.loc[:5000, :9])

# Convert to a DataFrame
X_train_tSNE = pd.DataFrame(data=X_train_tSNE, index=train_index[:5001])

# Plot
scatterPlot(X_train_tSNE, y_train, "t-SNE")
def get_representation(embeddings):
    # sklearn's TSNE has no transform(); fit_transform returns the embedding.
    tsne = TSNE()
    return tsne, tsne.fit_transform(embeddings)
"""for row_id in range(0, 15): target_word = word2vec.words()[row_id] x = reduced_matrix[row_id, 0] y = reduced_matrix[row_id, 1] print("{} = ({}, {})".format(target_word, x, y)) plt.annotate(target_word, (x, y)) # end for""" """for index, pred in enumerate(predictions): for word in pred.keys(): reducted_vector = model.transform(pred[word]) plt.scatter(reducted_vector[:, 0], reducted_vector[:, 1], 10) plt.annotate(word, (reducted_vector[0, 0], reducted_vector[0, 1])) # end for # end for""" for word in average_vectors: reducted_vector = model.transform(average_vectors[word]) plt.scatter(reducted_vector[0, 0], reducted_vector[0, 1], 10) plt.annotate(word, (reducted_vector[0, 0], reducted_vector[0, 1]), arrowprops=dict(facecolor='red', shrink=0.025)) # end for plt.show() # end if # Continue answer = raw_input("Continue? ").lower() if answer == "n": cont = False # end if # Reset reservoir
def main(cfg):
    model_dir = os.path.abspath(cfg["model_dir"])
    X_test, Y_test = get_data(cfg)
    print(f"Data loaded. X_test shape: {X_test.shape}, Y_test shape: "
          f"{Y_test.shape}")

    # Binarize outcome if need be
    Y_test[Y_test >= 0.5] = 1
    Y_test[Y_test < 0.5] = 0

    model = load_model(model_dir)
    model.summary()
    print("Model loaded")

    if cfg["task"].startswith("dpsom"):
        probas_test = model.predict(X_test)
    else:
        probas_test = model.predict([X_test[:, :, 7:], X_test[:, :, :7]])
    ix_pred_a = (probas_test < 0.5).flatten()
    ix_pred_d = (probas_test >= 0.5).flatten()
    ix_a = (Y_test == 0).flatten()
    ix_d = (Y_test == 1).flatten()
    ix_tn = ix_a & ix_pred_a
    ix_fp = ix_a & ix_pred_d
    ix_fn = ix_d & ix_pred_a
    ix_tp = ix_d & ix_pred_d

    X_anl, Y_anl = get_analysis_subsets(X_test, Y_test, cfg["num_for_analysis"])
    if cfg["write_out"]:
        pickle.dump(X_test, open(pj(bm_config.output_dir, "X_test.pkl"), "wb"))
        pickle.dump(Y_test, open(pj(bm_config.output_dir, "Y_test.pkl"), "wb"))

    # Note, data are *right-padded*, i.e. padded with zeros to the right
    # if there are < 200 actual data samples.
    # Y_test is {0,1}, 1 = death, about 12% mortality

    if cfg["cluster"]:
        bilstm_name = "bilstm_2"
        bilstm_layer = model.get_layer(bilstm_name)
        bilstm_layer.return_sequences = True
        bilstm_model = Model(inputs=model.input, outputs=bilstm_layer.output)
        if cfg["task"].startswith("dpsom"):
            bilstm_seqs = bilstm_model.predict(X_test)
        else:
            bilstm_seqs = bilstm_model.predict(
                [X_test[:, :, 7:], X_test[:, :, :7]])
        print("Shape of BiLSTM output:", bilstm_seqs.shape)
        bilstm_seqs = np.concatenate(
            [bilstm_seqs[:, :, :64], bilstm_seqs[:, ::-1, 64:]], axis=2)

        reducer = cfg["reducer"]
        if reducer == "tsne":
            reducer_model = TSNE(n_components=2)
        elif reducer == "isomap":
            reducer_model = Isomap(n_components=2,
                                   n_neighbors=cfg["n_neighbors"])
        else:
            raise NotImplementedError(reducer)
        probas_out = bilstm_seqs[:, -1, :]
        print("Shape of final probas matrix:", probas_out.shape)
        print(f"Fitting {reducer} model...")
        proj_X = reducer_model.fit_transform(probas_out)
        # Should really be training the reducer with training data, but oh well
        print("...Done")

        plt.figure(figsize=(16, 16))
        plt.scatter(proj_X[ix_tn, 0], proj_X[ix_tn, 1], s=12, c="r")
        plt.scatter(proj_X[ix_fn, 0], proj_X[ix_fn, 1], s=12, c="g")
        plt.scatter(proj_X[ix_fp, 0], proj_X[ix_fp, 1], s=12, c="y")
        plt.scatter(proj_X[ix_tp, 0], proj_X[ix_tp, 1], s=12, c="b")
        plt.savefig(pj(model_dir, f"{reducer}.png"))
        plt.close()

        inc = cfg["plot_every_nth"]
        slices_dir = pj(model_dir, f"{reducer}_slices")
        if not pe(slices_dir):
            os.makedirs(slices_dir)
        seq_len = bilstm_seqs.shape[1]
        start_idx = seq_len - cfg["plot_last_n"]
        bilstm_seqs = bilstm_seqs[::inc, start_idx:]
        print("Creating sequence projections...")
        # The per-timestep projection below requires a reducer implementing
        # transform() on new data, i.e. the Isomap option; sklearn's TSNE has
        # no transform() and cannot be used for this step.
        data_mat = np.zeros((bilstm_seqs.shape[0], bilstm_seqs.shape[1], 2))
        for j in range(seq_len - start_idx):
            slice_j = bilstm_seqs[:, j, :]
            data_mat[:, j, :] = reducer_model.transform(slice_j)
        print("...Done")
        color_d = {
            "r": (ix_tn[::inc], 12),
            "g": (ix_fn[::inc], 24),
            "y": (ix_fp[::inc], 12),
            "b": (ix_tp[::inc], 24)
        }
        trajectories = Trajectories(data_mat, color_dict=color_d,
                                    final_extra=20)
        trajectories.save(pj(model_dir, f"{reducer}_{len(data_mat)}.gif"))
        plt.show()

    # Uses all subjects
    if cfg["confusion_matrix"]:
        print(f"X_test shape: {X_test.shape}, Y_test shape: {Y_test.shape}")
        print(f"Inferred probabilities, output shape {probas_test.shape}")
        fpr_mort, tpr_mort, thresholds = roc_curve(Y_test, probas_test)
        roc_auc_mort = auc(fpr_mort, tpr_mort)
        TN, FP, FN, TP = confusion_matrix(Y_test, probas_test.round()).ravel()
        PPV = TP / (TP + FP)
        NPV = TN / (TN + FN)
        cm = np.array([[TN, FP], [FN, TP]])
        save_path = pj(cfg["model_dir"], "confusion_matrix.png")
        classes = ["False", "True"]
        plot_confusion_matrix(cm, save_path, classes, normalize=False,
                              title='Confusion matrix')
        print("Inference:")
        print(f"PPV: {PPV:0.4f}, NPV: {NPV:0.4f}, roc_auc: "
              f"{roc_auc_mort:0.4f}")
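# The comment in main() about fitting the reducer on training data matters
# more than it looks: of the two reducers above, only Isomap can project new
# samples after fitting. A minimal sketch, with random arrays standing in for
# the hypothetical BiLSTM features:
import numpy as np
from sklearn.manifold import Isomap

X_train_feats = np.random.rand(200, 128)  # hypothetical training features
X_test_feats = np.random.rand(50, 128)    # hypothetical held-out features

iso = Isomap(n_components=2, n_neighbors=10).fit(X_train_feats)
proj_test = iso.transform(X_test_feats)   # Isomap, unlike sklearn's TSNE, supports this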
print('te_pca shape*****', te_pca.shape)
print(te_pca.head())

X_train = tr_pca.values
X_test = te_pca.values

''' tsne time '''
# TSNE.transform() does not exist in sklearn; until we switch to a parametric
# t-SNE, e.g.
# https://github.com/kylemcdonald/Parametric-t-SNE/blob/master/Parametric%20t-SNE%20(Keras).ipynb,
# embed train and test jointly and split the result.
tsne = TSNE(n_components=3, perplexity=40, verbose=2)
embedded = tsne.fit_transform(np.vstack((X_train, X_test)))
X_train_embedded = embedded[:len(X_train)]
X_test_embedded = embedded[len(X_train):]

train_principalDf = pd.DataFrame(data=X_train_embedded,
                                 columns=['tsne_1', 'tsne_2', 'tsne_3'])
tr = pd.concat([train_principalDf, train[['SK_ID_CURR']]], axis=1)
print('tr shape', tr.shape)
print(tr.head())

test_principalDf = pd.DataFrame(data=X_test_embedded,
                                columns=['tsne_1', 'tsne_2', 'tsne_3'])
te = pd.concat([test_principalDf, test[['SK_ID_CURR']]], axis=1)
print('te shape', te.shape)
print(te.head())

# DataFrame.append was removed from pandas; use pd.concat instead.
tr_te = pd.concat([tr, te]).reset_index()
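# If a genuine out-of-sample transform is needed before a parametric t-SNE is
# wired in, one assumed alternative (not part of the original pipeline) is the
# openTSNE package, whose fitted embedding exposes a transform() method:
import numpy as np
from openTSNE import TSNE as OpenTSNE

X_tr = np.random.rand(200, 10)  # hypothetical feature arrays
X_te = np.random.rand(50, 10)

embedding_train = OpenTSNE(n_components=2).fit(X_tr)
embedding_test = embedding_train.transform(X_te)  # projects unseen rows onto the fitted map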
def visualize_clusters(self,
                       algorithm=None,
                       fit: bool = True,
                       xlim=None,
                       ylim=None,
                       cmap=None,
                       markers=None,
                       markersize=None,
                       filename: str = None,
                       block: bool = True):
    """Visualize the clusters from create_clusters().

    Args:
        algorithm: the visualization algorithm to map data into 2D (default TSNE).
        fit: True means fit the data, False means algorithm is pre-trained,
            so use it to just transform the data into 2D without fitting the
            data first. Note that TSNE does not support fit=False yet. If you
            want fit=False, use another dimension-reduction algorithm like
            PCA(...).
        xlim (Pair[float,float]): optional axis limits for the X axis.
        ylim (Pair[float,float]): optional axis limits for the Y axis.
        cmap (Union[ColorMap,str]): optional color map for the cluster colors,
            or the name of a color map.
            See https://matplotlib.org/3.1.1/tutorials/colors/colormaps.html.
            Default is 'brg', which has a wide range of colors going from blue
            through red to green, and prints in black and white okay - though
            very non-linear - because it does not go all the way to white.
        markers (matplotlib.markers.MarkerStyle): optional marker styles for
            clusters. If this is a string, then the i'th character in the
            string will be used for the i'th marker style.
            See https://matplotlib.org/3.1.1/api/markers_api.html for the
            available marker characters. Note that clusters will be drawn from
            0 up to n-1, so later clusters will be on top. Also, the earlier
            clusters tend to have more elements. One approach to improve
            readability is to use line-based shapes (from "1234+x|_") for the
            first few clusters (which have many points), and then filled
            shapes (from ".o<^>vsphPXd*") for the later clusters (which have
            few points). Note also that you can use a space for the marker
            character of a cluster if you want to not display that cluster at
            all. However, if your markers string is shorter than the number of
            clusters, all remaining clusters will be displayed using the "o"
            marker.
        markersize (float): size of the markers in points (only when markers
            is a str). The default seems to be about 6 points.
        filename (str): optional file name to save image into, as well as
            displaying it.
        block (bool): True (the default) means wait for user to close figure
            before returning. False means non-blocking.

    Limitations:
        if you call this multiple times with different numbers of clusters,
        the color map will not be exactly the same.
    """
    data = self._cluster_data
    if data is None or self._clusters is None:
        raise Exception(
            "You must call create_clusters() before visualizing them!")
    num_clusters = max(self._clusters) + 1
    if algorithm is None:
        if not fit:
            raise Exception(
                "You must supply pre-fitted algorithm when fit=False")
        algorithm = TSNE()
    alg_name = str(algorithm).split("(")[0]
    self.message(f"running {alg_name} on {len(data)} traces.")
    if fit:
        tsne_obj = algorithm.fit_transform(data)
    else:
        tsne_obj = algorithm.transform(data)
    # print(tsne_obj[0:5])

    # All the following complex stuff is for adding a 'show label on mouse over'
    # feature to the visualisation scatter graph.
    # It works when run from command line, but not in Jupyter/Spyder!
    # Surely there must be an easier way than doing all this...
    # Code adapted from:
    # https://stackoverflow.com/questions/55891285/how-to-make-labels-appear-
    # when-hovering-over-a-point-in-multiple-axis/55892690#55892690
    fig, ax = plt.subplots()  # figsize=(8, 6))  # 25% larger, for better printing
    if xlim:
        ax.set_xlim(xlim)
    if ylim:
        ax.set_ylim(ylim)
    if cmap is None:
        # Choose a default colormap. See bottom of the matplotlib page:
        # https://matplotlib.org/3.1.0/tutorials/colors/colormaps.html
        cmap = pltcm.get_cmap('brg')  # sequential map with nice b&w printing.
    elif isinstance(cmap, str):
        cmap = pltcm.get_cmap(cmap)  # it is the name of a matplotlib color map
    if markers is None:
        markers = "o"
    if isinstance(markers, str) and len(markers) > 1:
        # loop through the marker styles
        clusters = np.ma.array(self._clusters)
        markchars = markers + "o" * num_clusters
        for curr in range(max(num_clusters, len(markers))):
            # prepare for masking arrays - 'conventional' arrays won't do it
            mask = clusters != curr  # False means unmasked
            x_masked = np.ma.array(tsne_obj[:, 0], mask=mask)
            y_masked = np.ma.array(tsne_obj[:, 1], mask=mask)
            color = cmap(curr / num_clusters)
            # c_masked = np.ma.array(clusters, mask=mask)
            # print(f"DEBUG: mark {curr} is '{markers[curr]}' x={x_masked[0:10]} cl={c_masked[0:10]} color={color}")
            sc = ax.plot(x_masked, y_masked, color=color, linewidth=0,
                         label=f"c{curr}", marker=markchars[curr],
                         markersize=markersize)
        leg = ax.legend(loc='best')  # , ncol=2, mode="expand", shadow=True, fancybox=True)
        leg.get_frame().set_alpha(0.5)
    else:
        sc = plt.scatter(tsne_obj[:, 0], tsne_obj[:, 1], c=self._clusters,
                         cmap=cmap, marker=markers)
    if filename:
        plt.savefig(filename)
    names = [str(tr) for tr in self.traces]  # these are in same order as tsne_df rows.
    annot = ax.annotate(
        "",
        xy=(0, 0),
        xytext=(0, 20),
        textcoords="offset points",
        bbox=dict(boxstyle="round", fc="w"),
        arrowprops=dict(arrowstyle="->"),
    )
    annot.set_visible(False)

    def update_annot(ind):
        pos = sc.get_offsets()[ind["ind"][0]]
        annot.xy = pos
        # text = "{}, {}".format(" ".join(list(map(str, ind["ind"]))),
        #                        " ".join([str(names[n]) for n in ind["ind"]]))
        anns = [
            f"{n} ({self._clusters[n]}): {str(names[n])}" for n in ind["ind"]
        ]
        text = "\n".join(anns)
        annot.set_text(text)
        # annot.get_bbox_patch().set_facecolor(cmap(norm(c[ind["ind"][0]])))
        # annot.get_bbox_patch().set_alpha(0.4)

    def hover(event):
        vis = annot.get_visible()
        if event.inaxes == ax:
            cont, ind = sc.contains(event)
            if cont:
                update_annot(ind)
                annot.set_visible(True)
                fig.canvas.draw_idle()
            else:
                if vis:
                    annot.set_visible(False)
                    fig.canvas.draw_idle()

    fig.canvas.mpl_connect("motion_notify_event", hover)
    plt.show(block=block)
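# A hypothetical usage sketch for the fit=False path described in the
# docstring above: pre-fit a PCA (which, unlike TSNE, supports transform())
# and reuse it across calls. `tm` (an object of this class on which
# create_clusters() has been called) and `reference_data` (a 2-D feature
# array) are assumed names, not part of the original code.
from sklearn.decomposition import PCA

pca = PCA(n_components=2).fit(reference_data)
tm.visualize_clusters(algorithm=pca, fit=False,
                      markers="1234+x.o",  # line shapes first, filled later
                      block=False)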