def scikit_pca(model, rel_wds, plot_lims, title, cluster="kmeans"):
    """
    Given a word2vec model and a cluster (choice of "kmeans" or "spectral")
    Make a plot of all word-vectors in the model.
    """
    X, keys = make_data_matrix(model)

    for i, key in enumerate(keys):
        X[i,] = model[key]

    if cluster == "kmeans":
        k_means = KMeans(n_clusters=8)
        labels = k_means.fit_predict(X)
    elif cluster == "spectral":
        sp_clust = SpectralClustering()
        labels = sp_clust.fit_predict(X)

    # PCA
    X_std = StandardScaler().fit_transform(X)
    sklearn_pca = PCA(n_components=2)
    X_transf = sklearn_pca.fit_transform(X_std)

    scatter_plot(X_transf[:, 0], X_transf[:, 1], rel_wds, labels, title, keys, plot_lims)

    return sklearn_pca.explained_variance_ratio_
def plot_data(data, has_label=True):
    import numpy as np
    import pandas as pd
    import seaborn as sns
    import matplotlib.pyplot as plt
    from sklearn.manifold import TSNE
    from sklearn.decomposition import PCA

    if not has_label:
        data = data.copy()
        data['label'] = np.zeros(len(data))

    LIMIT = 4000
    if data.shape[0] > LIMIT:
        dt = data.sample(n=LIMIT, replace=False)
        X = dt.iloc[:, :-1]
        labels = dt.iloc[:, -1]
    else:
        X = data.iloc[:, :-1]
        labels = data.iloc[:, -1]

    tsne_model = TSNE(n_components=2, random_state=0)
    np.set_printoptions(suppress=True)
    points1 = tsne_model.fit_transform(X)
    df1 = pd.DataFrame(data=np.column_stack([points1, labels]), columns=["x", "y", "class"])
    sns.lmplot("x", "y", data=df1, hue='class', fit_reg=False,
               palette=sns.color_palette('colorblind'))
    plt.title('TSNE')

    pca = PCA(n_components=2)
    pca.fit(X)
    points2 = pca.transform(X)
    df2 = pd.DataFrame(data=np.column_stack([points2, labels]), columns=["x", "y", "class"])
    sns.lmplot("x", "y", data=df2, hue='class', fit_reg=False,
               palette=sns.color_palette('colorblind'))
    plt.title('PCA')
def _analyze_pca(csv_filepath):
    """
    Analyze how much data can be compressed.

    Parameters
    ----------
    csv_filepath : str
        Path relative to dataset_path to a CSV file which points to images
    """
    from sklearn.decomposition import PCA
    import itertools as it

    symbol_id2index = generate_index(csv_filepath)
    data, y = load_images(csv_filepath, symbol_id2index, one_hot=False)
    data = data.reshape(data.shape[0], data.shape[1] * data.shape[2])

    pca = PCA()
    pca.fit(data)

    sum_ = 0.0
    done_values = [None, None, None]
    done_points = [False, False, False]
    chck_points = [0.9, 0.95, 0.99]
    for counter, el in enumerate(pca.explained_variance_ratio_):
        sum_ += el
        for check_point, done, i in zip(chck_points, done_points, it.count()):
            if not done and sum_ >= check_point:
                done_points[i] = counter
                done_values[i] = sum_
    for components, variance in zip(done_points, done_values):
        print("%i components explain %0.2f of the variance" % (components, variance))
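# A compact alternative sketch of the same variance check-point idea (not part of the
# original code): the cumulative explained-variance vector makes each cut-off a one-liner.
# It assumes `pca` has already been fitted as in the function above; note it reports the
# number of components (index + 1), whereas the loop above prints the 0-based index.
import numpy as np

cumvar = np.cumsum(pca.explained_variance_ratio_)
for check_point in [0.9, 0.95, 0.99]:
    n_needed = int(np.searchsorted(cumvar, check_point)) + 1
    print("%i components explain %0.2f of the variance" % (n_needed, cumvar[n_needed - 1]))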
def fit_pca(trajs):
    print 'fitting PCA...'
    pca = PCA(2, copy=True, whiten=False)
    X = np.vstack(trajs.values())
    pca.fit(X)
    print 'done'
    return pca
def pcafunction(dataList, countList, nameList):
    from sklearn.decomposition import PCA
    import matplotlib.pyplot as plt

    pcadataArray = np.array(dataList)
    pcaCountArray = np.array(countList)
    pca = PCA(n_components=2)
    X = pca.fit(pcadataArray).transform(pcadataArray)

    pcaNameList = []
    for i in range(0, len(nameList)):
        if nameList[i] not in pcaNameList:
            pcaNameList.append(nameList[i])

    print('explained variance ratio (first two components): %s'
          % str(pca.explained_variance_ratio_))

    plt.plot(X[pcaCountArray == 0, 0], X[pcaCountArray == 0, 1], 'or',
             X[pcaCountArray == 1, 0], X[pcaCountArray == 1, 1], '^b',
             X[pcaCountArray == 2, 0], X[pcaCountArray == 2, 1], 'sg')
    plt.xlabel('PC1 (explained variance ratio: ' + str(pca.explained_variance_ratio_[0]) + ')', fontsize=14)
    plt.ylabel('PC2 (explained variance ratio: ' + str(pca.explained_variance_ratio_[1]) + ')', fontsize=14)
    plt.legend((str(pcaNameList[0]), str(pcaNameList[1])), loc='best', fontsize=14)
    plt.title('PCA', fontsize=16)
def add_tsne_features(x_train, x_test):
    print('add_tsne_features <<')

    x_train_data = x_train.data_
    x_test_data = x_test.data_
    x = np.vstack((x_train_data, x_test_data))

    print('applying pca...')
    pca = PCA(n_components=25)
    x_pca = pca.fit_transform(x)

    print('applying t-SNE...')
    tsne_model = TSNE(n_components=2, random_state=0)
    x_tsne = tsne_model.fit_transform(x_pca)

    x_train_data = np.hstack((x_train_data, x_tsne[:x_train_data.shape[0], :]))
    x_test_data = np.hstack((x_test_data, x_tsne[-x_test_data.shape[0]:, :]))

    assert(x_train.columns_ == x_test.columns_)
    columns = x_train.columns_ + ['tsne_1', 'tsne_2']
    x_train = DataSet(x_train.ids_, columns, x_train_data)
    x_test = DataSet(x_test.ids_, columns, x_test_data)

    print('add_tsne_features >>')
    return x_train, x_test
def test_pca():
    # PCA on dense arrays
    X = iris.data

    for n_comp in np.arange(X.shape[1]):
        pca = PCA(n_components=n_comp, svd_solver='full')

        X_r = pca.fit(X).transform(X)
        np.testing.assert_equal(X_r.shape[1], n_comp)

        X_r2 = pca.fit_transform(X)
        assert_array_almost_equal(X_r, X_r2)

        X_r = pca.transform(X)
        X_r2 = pca.fit_transform(X)
        assert_array_almost_equal(X_r, X_r2)

        # Test get_covariance and get_precision
        cov = pca.get_covariance()
        precision = pca.get_precision()
        assert_array_almost_equal(np.dot(cov, precision), np.eye(X.shape[1]), 12)

    # test explained_variance_ratio_ == 1 with all components
    pca = PCA(svd_solver='full')
    pca.fit(X)
    assert_almost_equal(pca.explained_variance_ratio_.sum(), 1.0, 3)
def compute_pca(data2d):
    """
    Compute PCA using sklearn
    :param data2d: 2d array. PCA will be computed on non-zeros values.
    :return: coordsrc: 2d array: centered non-zero coordinates
             pca: object: PCA result.
             centermass: 2x1 array: 2d coordinates of the center of mass
    """
    # round it and make it int (otherwise end up with values like 10-7)
    data2d = data2d.round().astype(int)
    # get non-zero coordinates, and transpose to obtain nx2 dimensions
    coordsrc = np.array(data2d.nonzero()).T
    # get center of mass
    centermass = coordsrc.mean(0)
    # center data
    coordsrc = coordsrc - centermass
    # normalize data
    coordsrc /= coordsrc.std()
    # Performs PCA
    from sklearn.decomposition import PCA
    pca = PCA(n_components=2, copy=False, whiten=False)
    pca.fit(coordsrc)
    # pca_score = pca.explained_variance_ratio_
    # V = pca.components_
    return coordsrc, pca, centermass
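# A minimal usage sketch for compute_pca above (not part of the original code).
# It assumes numpy/sklearn are installed and compute_pca is in scope; the synthetic
# binary mask is illustrative only.
import numpy as np

mask = np.zeros((20, 20))
mask[5:15, 8:12] = 1  # a small elongated blob of non-zero pixels
coords, pca, center = compute_pca(mask)
print(center)                         # center of mass of the non-zero pixels
print(pca.components_[0])             # first principal axis of the blob
print(pca.explained_variance_ratio_)  # how elongated the blob is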
def make_tsne_plot(model, rel_wds, plot_lims, title):

    dim = 30
    X, keys = make_data_matrix(model)

    # first we actually do PCA to reduce the
    # dimensionality to make tSNE easier to calculate
    X_std = StandardScaler().fit_transform(X)
    sklearn_pca = PCA(n_components=dim)  # keep `dim` components before slicing below
    X = sklearn_pca.fit_transform(X_std)[:, :dim]

    # do downsample
    k = 5000
    sample = []
    important_words = []
    r_wds = [word[0] for word in rel_wds]
    for i, key in enumerate(keys):
        if key in r_wds:
            sample.append(i)
    sample = np.concatenate((np.array(sample),
                             np.random.choice(len(keys), k - 10, replace=False),
                             ))
    X = X[sample, :]
    keys = [keys[i] for i in sample]

    # Do tSNE
    tsne = TSNE(n_components=2, random_state=0, metric="cosine")
    X_transf = tsne.fit_transform(X)

    k_means = KMeans(n_clusters=8)
    labels = k_means.fit_predict(X_transf)

    scatter_plot(X_transf[:, 0], X_transf[:, 1], rel_wds, labels, title, keys, plot_lims)
def pca(df, n_components=2, mean_center=False, *args, **kwargs):
    if not sklearn:
        raise ImportError('This library depends on scikit-learn (sklearn) to perform PCA analysis')

    from sklearn.decomposition import PCA

    df = df.copy()

    # We have to zero fill, nan errors in PCA
    df[np.isnan(df)] = 0

    if mean_center:
        mean = np.mean(df.values, axis=0)
        df = df - mean

    pca = PCA(n_components=n_components, *args, **kwargs)
    pca.fit(df.values.T)

    scores = pd.DataFrame(pca.transform(df.values.T)).T
    scores.index = ['Principal Component %d' % (n + 1) for n in range(0, scores.shape[0])]
    scores.columns = df.columns

    weights = pd.DataFrame(pca.components_).T
    weights.index = df.index
    weights.columns = ['Weights on Principal Component %d' % (n + 1) for n in range(0, weights.shape[1])]

    return scores, weights
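# A minimal usage sketch for the pca() helper above (not part of the original code).
# It assumes numpy/pandas/scikit-learn are installed; `sklearn` here is the module-level
# availability flag the function checks, and the toy DataFrame (columns = samples,
# rows = features, matching the transpose inside the function) is illustrative only.
import numpy as np
import pandas as pd

sklearn = True  # assumption: the library sets this flag when scikit-learn imports successfully

toy = pd.DataFrame(np.random.rand(6, 4),
                   index=['f%d' % i for i in range(6)],     # features
                   columns=['s%d' % j for j in range(4)])   # samples
scores, weights = pca(toy, n_components=2, mean_center=True)
print(scores)   # one row per principal component, one column per sample
print(weights)  # one row per feature, one column per component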
def fit(self, x, y, i=0): # if gaussian processes are being used, data dimensionality needs to be reduced before fitting if self.method[i] == 'GP': if self.reduce_dim == 'FastICA': print('Reducing dimensionality with ICA') do_ica = FastICA(n_components=self.n_components) self.do_reduce_dim = do_ica.fit(x) if self.reduce_dim == 'PCA': print('Reducing dimensionality with PCA') do_pca = PCA(n_components=self.n_components) self.do_reduce_dim = do_pca.fit(x) x = self.do_reduce_dim.transform(x) #try: print('Training model...') try: self.model.fit(x, y) self.goodfit = True print(self.model) except: self.goodfit = False if self.method[i] == 'GP': print('Model failed to train! (For GP this does not always indicate a problem, especially for low numbers of components.)') pass else: print('Model failed to train!') traceback.print_stack() if self.ransac: self.outliers = np.logical_not(self.model.inlier_mask_) print(str(np.sum(self.outliers)) + ' outliers removed with RANSAC')
def main():
    inp = np.loadtxt('../../out_files/bivar_regress.txt', usecols=(1, 2, 3))
    X = inp[:, [1, 2]]
    ncomp = int(sys.argv[3])
    pca = PCA(n_components=ncomp)
    pca.fit(X)
    l = pca.transform(X)
    print "Doing an \t" + str(ncomp) + "\t component PCA \n\n----------------"

    # linear regression fit
    res = sm.OLS(inp[:, 0], l).fit()
    t2_new = float(sys.argv[1])
    err_t2_new = float(sys.argv[2])

    # array for 1000 realisations with slope and slope error -0.0264 and 0.004
    ar = np.array([(rn(-0.0264, 0.004) * rn(pca.transform([rn(t2_new, err_t2_new)]), 0.85)
                    + rn(np.mean(inp[:, 0]), 0.07)) / rn(2.0, 0.3) for k in range(1000)])

    print "The estimated L_max is\t " + str(np.mean(ar))
    print "The error from the PCA is\t " + str(np.std(ar))
    print "Standard error on y mean is \t " + str(np.std(inp[:, 0]) / np.sqrt(len(inp[:, 0])))
    print "Error by bootstrapping is \t" + str(np.std(boots(inp[:, 0])))
class Transformer:

    def __init__(self, use_PCA=True):
        self._clf = DecisionTreeClassifier(min_samples_leaf=10)
        self._idx = None
        self._scaler = StandardScaler()
        self._trans = PCA('mle')
        self._use_PCA = use_PCA

    def fit(self, X, y):
        X = np.array(X)
        self._clf.fit(X, y)
        self._idx = filter(lambda x: self._clf.feature_importances_[x] > 0,
                           range(len(self._clf.feature_importances_)))
        new_set = [X[i][self._idx] for i in xrange(len(X))]
        # new_set = self._scaler.fit_transform(new_set)
        if self._use_PCA:
            new_set = self._trans.fit_transform(new_set)
        return new_set

    def transform(self, features):
        features = features[self._idx]
        # features = self._scaler.transform(features.astype(float))
        if self._use_PCA:
            features = self._trans.transform(features)
        return features
def __init__(self): super(RegressionDriver, self).__init__() if REGRESSOR == "LOG": self.driver = LogisticRegression() elif REGRESSOR == "RFR": self.driver = RandomForestRegressor(n_estimators=N_ESTIMATORS, n_jobs=N_JOBS) elif REGRESSOR == "GBR": self.driver = GradientBoostingClassifier(n_estimators=300, max_depth=5, learning_rate=0.05) elif REGRESSOR == "PCA": self.driver = PCA(n_components=1) else: raise Exception("Regressor: %s not supported." % REGRESSOR) genuineX = [] forgeryX = [] genuineY = [] forgeryY = [] # Training process for sigs in self.train_set: personTrain = PersonTraining(sigs) genuine, forgery = personTrain.calc_train_set() genuineX.extend(genuine) forgeryX.extend(forgery) # To adjust PCA result, 0 means genuine and 1 means forgery genuineY = [0.0] * len(genuineX) forgeryY = [1.0] * len(forgeryX) trainX = genuineX + forgeryX trainY = genuineY + forgeryY self.driver.fit(trainX, trainY)
def load_bipolar_cells(micronsPerDeg=50.):
    '''
    Returns list of tuples (space, spatial receptive field)
    '''
    data_path, this_filename = os.path.split(__file__)
    file_name1 = data_path + '/data/B1.txt'
    file_name2 = data_path + '/data/B2.txt'
    data_b1 = np.loadtxt(file_name1, delimiter="\t")  # 50 time x 100 space
    data_b2 = np.loadtxt(file_name2, delimiter="\t")  # 50 time x 100 space
    data_b = [data_b1, data_b2]

    # get spacing for all bipolar spatial receptive fields
    spatialDelta = 0.022  # mm

    # since receptive fields are noisy, use PCA
    spatial_rfs = []
    for b in data_b:
        pca = PCA(n_components=2)
        pca.fit(b)
        b_pca = pca.components_[0]
        sign_of_pc = -1 * np.sign(b_pca[abs(b_pca) == np.max(abs(b_pca))])
        space = get_space(b_pca, spatialDelta, micronsPerDeg)
        spatial_rfs.append((space, sign_of_pc * b_pca))

    return spatial_rfs
def Ploting3D(data, n_dimension=3): pca = PCA(n_components = n_dimension) colors = ['r', 'g', 'b', 'm'] labels = ['label_1', 'label_2', 'label_3', 'label_4'] fig = plt.figure() ax = fig.add_subplot(111, projection='3d') idx = [0, len(data[0])] combined = np.array(data[0]) # Combined all data for i in xrange(1, len(data)): combined = np.insert(combined, len(combined), data[i], axis=0) idx.append(idx[i]+len(data[i])) combined = pca.fit_transform(combined) for i in xrange(len(data)): ax.scatter(combined[idx[i]:idx[i+1], 0], combined[idx[i]:idx[i+1], 1], combined[idx[i]:idx[i+1], 2], c=colors[i], marker='o', s=70) ax.set_xlabel('1st_component') ax.set_ylabel('2nd_component') ax.set_zlabel('3rd_component') ax.set_xlim3d(-100, 100) ax.set_ylim3d(-60, 50) ax.set_zlim3d(-60, 50) plt.show()
def pc_analysis(self):
    pca_result = {}
    pca = PCA(n_components=2)
    pca_result['result'] = self.pca_reduced = pca.fit_transform(self.player_stats, self.player_value)
    pca_result['ratios'] = pca.explained_variance_ratio_
    pca_result['components'] = pca.components_
    return pca_result
def load_ganglion_cells(micronsPerDeg=50., pca_mode='space'): ''' Returns list of tuples (space, spatial receptive field) ''' data_path, this_filename = os.path.split(__file__) filename = data_path + '/data/allGC.txt' data_gc = np.loadtxt(filename, delimiter="\t") data_gc = data_gc.reshape((100,80,28)) nCells = data_gc.shape[2] # get spacing for spatial receptive fields spatialDelta = 0.027 # mm # since receptive fields are noisy, use PCA spatial_rfs = [] for n in range(nCells): pca = PCA(n_components=2) if pca_mode == 'space': pca.fit(data_gc[:,:,n]) g_pca = pca.components_[0] elif pca_mode == 'time': pca.fit(data_gc[:,:,n].T) g_pca = np.dot(data_gc[:,:,n].T, pca.components_[0]) sign_of_pc = -1 * np.sign(g_pca[abs(g_pca) == np.max(abs(g_pca))]) space = get_space(g_pca, spatialDelta, micronsPerDeg) spatial_rfs.append((space, sign_of_pc * g_pca)) return spatial_rfs
def main(): x = 10 y = 10 steps = 10000 history = [] world = np.array([i for i in xrange(625)]) world.resize((25, 25)) for _ in xrange(steps): active = getActive(world, x, y) assert len(active) == 25, "{}, {}: {}".format(x, y, active) history.append(active) x, y = getNewLocation(x, y, 25, 2, False) correlation = computeCorrelation(history) #plt.imshow(correlation, cmap="hot", interpolation="nearest") #plt.show() pca = PCA(n_components=25) pca.fit(correlation) print 'components' print pca.components_ #negativeMask = (pca.components_ < 0) #pca.components_[negativeMask] = 0 print 'transform:' transform = pca.transform(correlation) #negativeMask = (transform < 0) #transform[negativeMask] = 0 print transform.shape for i in xrange(25): plt.imshow(transform[:,i].reshape((25, 25)), cmap="hot", interpolation="nearest") plt.show()
def pca(inF,MIN): df = pd.read_table(inF, header=0) dc = list(df.columns) dc[0]='GeneID' df.columns = dc print(df.shape) sel = True for i in range(4, df.shape[1]-1): sel = (df.ix[:,i] < MIN) & (df.ix[:,i+1]< MIN) df = df.ix[~sel,:] print(df.shape) X = df.ix[:,4:df.shape[1]].values.T y = df.columns[4:df.shape[1]].values X_std = StandardScaler().fit_transform(X) #pca = PCA(n_components=2) pca = PCA() Y_sklearn = pca.fit_transform(X_std) fig = plt.figure() plt.style.use('ggplot') #plt.style.use('seaborn-whitegrid') ax = fig.add_subplot(111) for lab, col in zip(y,['r','g','b','c'] + sns.color_palette("cubehelix", df.shape[1]-4-4)): ax.scatter(Y_sklearn[y==lab, 0],Y_sklearn[y==lab, 1],label=lab,c=col, s=80) ax.set_xlabel('Principal Component 1 : %.2f'%(pca.explained_variance_ratio_[0]*100) + '%') ax.set_ylabel('Principal Component 2 : %.2f'%(pca.explained_variance_ratio_[1]*100) + '%') ax.legend(loc='upper right', prop={'size':8}) plt.tight_layout() plt.savefig(inF + '-MIN' + str(MIN) + '.pdf')
def plot_2d_results(X, y, preds):
    pca = PCA(n_components=2)
    X_r = pca.fit(X).transform(X)

    # Plot scatter
    plt.figure()
    cs = "cm"
    cats = [1, -1]
    target_names = ["positive", "negative"]
    for c, i, target_name in zip(cs, cats, target_names):
        plt.scatter(X_r[y == i, 0], X_r[y == i, 1], c=c, label=target_name)
    plt.legend()
    plt.title("PCA of 2d data")
    plt.savefig("figures/data-scatter.png")

    # Plot mispredictions (compare the labels passed in against the predictions)
    plt.figure()
    diff = np.array([1 if y[i] == preds[i] else 0 for i in range(len(y))])
    cs = "rg"
    cats = [0, 1]
    target_names = ["incorrect", "correct"]
    for c, i, target_name in zip(cs, cats, target_names):
        plt.scatter(X_r[diff == i, 0], X_r[diff == i, 1], c=c, label=target_name)
    plt.legend()
    plt.title("PCA of correct/incorrect predictions")
    # plt.show()
    plt.savefig("figures/residual-scatter.png")
def pca_variance(df):
    # inputs are original data frame
    df_pca = PCA()
    df_pca.fit(df)
    ratio = df_pca.explained_variance_ratio_
    components = [('component' + str(x)) for x in range(1, (df.shape[1] + 1))]
    df2 = pd.Series(ratio, index=components)
    return df2
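# A minimal usage sketch for pca_variance above (not part of the original code).
# It assumes pandas/numpy and sklearn's PCA are imported as in the snippet; the random
# 100x5 DataFrame is illustrative only.
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.rand(100, 5), columns=list("abcde"))
ratios = pca_variance(df)
print(ratios)           # explained variance ratio per component
print(ratios.cumsum())  # cumulative share of variance captured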
def classification_level_SGDReg_pipeline(classifications_DF):
    X = classifications_DF.iloc[:, 3:89]
    # assign the target (session length) to y and convert to float
    y_actual = classifications_DF.iloc[:, 2:3].astype(float)

    # scaling the data for feature selection
    X_scaled = preprocessing.scale(X)
    X_scaled_train, X_scaled_test, y_actual_train, y_actual_test = train_test_split(
        X_scaled, y_actual, test_size=0.5, random_state=0)

    # fit the PCA on the scaled training features
    pca_selection = PCA(n_components=2)
    X_features = pca_selection.fit(X_scaled_train).transform(X_scaled_train)

    SGDReg = SGDRegressor(alpha=0.0001)

    # Do grid search over k, n_components and SVR parameters:
    pipeline = Pipeline([('pca', pca_selection), ('SGDReg', SGDReg)])
    tuned_params = dict(pca__n_components=[5, 30, 40, 50],
                        SGDReg__alpha=[0.1, 0.01, 0.001, 0.0001, 0.00001],
                        SGDReg__l1_ratio=[.05, .15, .5, .7, .9, .95, .99, 1],
                        SGDReg__penalty=['l2', 'l1', 'elasticnet'])
    grid_search = GridSearchCV(pipeline, param_grid=tuned_params,
                               scoring='mean_squared_error', cv=3, verbose=10)
    grid_search.fit(X_scaled_train, y_actual_train['session_length'].values)
    print(grid_search.best_estimator_)

    y_true, y_pred = y_actual_test['session_length'].values, grid_search.best_estimator_.predict(X_scaled_test)
    print "Mean squared error:" + str(mean_squared_error(y_true, y_pred))
    pd.DataFrame(y_true, y_pred).to_csv("SGDReg_pred_true.csv")
def sentence_to_vec(sentence_list: List[Sentence], embedding_size: int, a: float = 1e-3):
    sentence_set = []
    for sentence in sentence_list:
        vs = np.zeros(embedding_size)  # add all word2vec values into one vector for the sentence
        sentence_length = sentence.len()
        for word in sentence.word_list:
            a_value = a / (a + get_word_frequency(word.text))  # smooth inverse frequency, SIF
            vs = np.add(vs, np.multiply(a_value, word.vector))  # vs += sif * word_vector
        vs = np.divide(vs, sentence_length)  # weighted average
        sentence_set.append(vs)  # add to our existing re-calculated set of sentences

    # calculate PCA of this sentence set
    pca = PCA(n_components=embedding_size)
    pca.fit(np.array(sentence_set))
    u = pca.components_[0]  # the PCA vector
    u = np.multiply(u, np.transpose(u))  # u x uT

    # pad the vector?  (occurs if we have less sentences than embeddings_size)
    if len(u) < embedding_size:
        for i in range(embedding_size - len(u)):
            u = np.append(u, 0)  # add needed extension for multiplication below

    # resulting sentence vectors, vs = vs - u x uT x vs
    sentence_vecs = []
    for vs in sentence_set:
        sub = np.multiply(u, vs)
        sentence_vecs.append(np.subtract(vs, sub))

    return sentence_vecs
def test_feature_union_weights(): # test feature union with transformer weights iris = load_iris() X = iris.data y = iris.target pca = PCA(n_components=2, svd_solver='randomized', random_state=0) select = SelectKBest(k=1) # test using fit followed by transform fs = FeatureUnion([("pca", pca), ("select", select)], transformer_weights={"pca": 10}) fs.fit(X, y) X_transformed = fs.transform(X) # test using fit_transform fs = FeatureUnion([("pca", pca), ("select", select)], transformer_weights={"pca": 10}) X_fit_transformed = fs.fit_transform(X, y) # test it works with transformers missing fit_transform fs = FeatureUnion([("mock", Transf()), ("pca", pca), ("select", select)], transformer_weights={"mock": 10}) X_fit_transformed_wo_method = fs.fit_transform(X, y) # check against expected result # We use a different pca object to control the random_state stream assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X)) assert_array_equal(X_transformed[:, -1], select.fit_transform(X, y).ravel()) assert_array_almost_equal(X_fit_transformed[:, :-1], 10 * pca.fit_transform(X)) assert_array_equal(X_fit_transformed[:, -1], select.fit_transform(X, y).ravel()) assert_equal(X_fit_transformed_wo_method.shape, (X.shape[0], 7))
def pca(inF,MIN): df = pd.read_table(inF, header=0) dc = list(df.columns) dc[0]='GeneID' df.columns = dc print(df.shape) sel = ~((df.ix[:,2] < MIN) & (df.ix[:,3]< MIN) & (df.ix[:,4]< MIN) & (df.ix[:,5]< MIN) & (df.ix[:,6]< MIN) & (df.ix[:,7]< MIN) & (df.ix[:,8]< MIN) & (df.ix[:,9]< MIN)) df = df.ix[sel,:] print(df.shape) X = df.ix[:,2:df.shape[1]].values.T y = df.columns[2:df.shape[1]].values X_std = StandardScaler().fit_transform(X) #pca = PCA(n_components=2) pca = PCA() Y_sklearn = pca.fit_transform(X_std) fig = plt.figure() plt.style.use('ggplot') #plt.style.use('seaborn-whitegrid') ax = fig.add_subplot(111) for lab, col in zip(y,('red','red', 'green','green', 'blue','blue','m','m')): ax.scatter(Y_sklearn[y==lab, 0],Y_sklearn[y==lab, 1],label=lab,c=col, s=80) ax.set_xlabel('Principal Component 1 : %.2f'%(pca.explained_variance_ratio_[0]*100) + '%') ax.set_ylabel('Principal Component 2 : %.2f'%(pca.explained_variance_ratio_[1]*100) + '%') ax.legend(loc='lower right', prop={'size':8}) plt.tight_layout() plt.savefig(inF + '-RNASeq-MIN' + str(MIN) + '.pdf')
def reduced_dimension(posture): i_user = 1 session = 1 while i_user <= 31: currentdirectory = os.getcwd() # get the directory. parentdirectory = os.path.abspath(currentdirectory + "/../..") # Get the parent directory(2 levels up) path = parentdirectory + '\Output Files\Reduced Dimensional Dataset/posture-'+str(posture)+'/GenuineUser'+str(i_user)+'' if not os.path.exists(path): os.makedirs(path) while session <= 8: data = np.genfromtxt("../../Output Files/E2-Genuine User-Session Split/Posture-"+str(posture)+"/GenuineUser-"+str(i_user)+"/1-"+str(i_user)+"-"+str(posture)+"-"+str(session)+".csv", dtype=float, delimiter=",") userinformation = data[:, [0,1,2,3,4]] sample_train = data[:, [5,6,7,8,9,10,11,13,15,16,17,18,19,20,21]] scaler = preprocessing.MinMaxScaler().fit(sample_train) sample_train_scaled = scaler.transform(sample_train) pca = PCA(n_components=7) sample_train_pca = pca.fit(sample_train_scaled).transform(sample_train_scaled) completedata = np.column_stack((userinformation, sample_train_pca)) np.savetxt("../../Output Files/Reduced Dimensional Dataset/Posture-"+str(posture)+"/GenuineUser"+str(i_user)+"/1-"+str(i_user)+"-"+str(posture)+"-"+str(session)+".csv", completedata, delimiter=',') session += 1 session = 1 i_user += 1
def pca_project(vecs, n_components=2, whiten=False):
    pca = PCA(n_components=n_components, whiten=whiten)
    vecs_projected = pca.fit_transform(vecs)
    print "=== PCA projection ==="
    print pca.explained_variance_ratio_
    print "chosen explained: %.2f" % np.sum(pca.explained_variance_ratio_[:n_components])
    return vecs_projected
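# A minimal usage sketch for pca_project above (not part of the original code).
# Python 2 prints to match the snippet; numpy/sklearn assumed to be imported, and the
# random 200x50 matrix is illustrative only.
import numpy as np

vecs = np.random.rand(200, 50)           # e.g. 200 embeddings of dimension 50
coords = pca_project(vecs, n_components=2)
print coords.shape                        # (200, 2), ready for a scatter plot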
def feature_scaled_nn_acc(mds, type):
    train, validation = validation_split(mds)

    # Multiply by 1 to convert to bool
    y_train = train['Up'] * 1
    X_train = train.drop('Up', axis=1)
    y_validation = validation['Up'] * 1
    X_validation = validation.drop('Up', axis=1)

    pre = PCA(n_components=19, whiten=True)
    X_train_pca = pre.fit_transform(X_train)
    # project the validation set with the PCA fitted on the training set
    X_validation_pca = pre.transform(X_validation)

    model = create_model(X_train_pca.shape[1], type)

    # Convert to Keras format
    y_train = to_categorical(y_train.values)
    y_validation = to_categorical(y_validation.values)

    model.fit(X_train_pca, y_train, nb_epoch=5, batch_size=16)
    time.sleep(0.1)

    # Fit and guess
    guess_train = model.predict_classes(X_train_pca)
    guess_train = to_categorical(guess_train)
    guess_validation = model.predict_classes(X_validation_pca)
    guess_validation = to_categorical(guess_validation)

    train_acc = accuracy_score(y_train, guess_train)
    validation_acc = accuracy_score(y_validation, guess_validation)
    print "\n neural net train accuracy is {}".format(train_acc)
    print "\n neural net validation accuracy is {}".format(validation_acc)
    return guess_validation
def cluster_kmeans(): from sklearn.cluster import KMeans from sklearn.decomposition import PCA # import sklearn.decomposition.pca limit = 10000 # X,real_labels=data_dict.get_training_set() filepath = '/home/wenjusun/bigdata/data/adult-income/adult.data' record_list = data_parser.parse_file_fetch_records(filepath, limit) X = np.array(data_parser.records_to_vector(record_list, enable_label=False)) pca_estimator = PCA(n_components=1) X=pca_estimator.fit_transform(X) kmeans_model = KMeans(n_clusters=4).fit(X) labels = kmeans_model.labels_ # print kmeans_model.cluster_centers_ # print labels[:100] print len(X),len(labels) print labels[:40] # print array(real_labels) # count=0 # for xLabel,eLabel in zip(X[-1],labels): # if xLabel==eLabel: # count +=1 # # print "count=%d,ratio:%f" %(count,1.0*count/len(labels)) # print np.sum(labels) plt.figure(1) plt.scatter(X,labels) plt.show()
                n_init=10, random_state=0)

# create 'cluster' column
matrix['cluster'] = cluster.fit_predict(matrix[matrix.columns[1:]])
matrix.head()

# Code ends here


# --------------
# import packages
from sklearn.decomposition import PCA

# Code starts here

# initialize pca object with 2 components
pca = PCA(n_components=2, random_state=0)

# create 'x' and 'y' columns denoting observation locations in decomposed form
reduced = pca.fit_transform(matrix[matrix.columns[1:]])
matrix['x'] = reduced[:, 0]
matrix['y'] = reduced[:, 1]

# dataframe to visualize clusters by customer names
clusters = matrix.iloc[:, [0, 33, 34, 35]]

# visualize clusters
clusters.plot.scatter(x='x', y='y', c='cluster', colormap='viridis')

# Code ends here


# --------------
# Code starts here
# print(f"x.shape:{x.shape}") # print(f"y.shape:{y.shape}") #1. 데이터 입력 x_train,x_test,y_train,y_test=tts(x,y,train_size=0.8) # scale scale = StandardScaler() x_train = scale.fit_transform(x_train) x_test = scale.transform(x_test) #PCA pca = PCA(pca_n) x_train=pca.fit_transform(x_train) x_test=pca.transform(x_test) #다중분류니까, to_categorical y_train = np_utils.to_categorical(y_train) y_test = np_utils.to_categorical(y_test) # print(f"x_train.shape:{x_train.shape}") # print(f"y_train.shape:{y_train.shape}")
clf = get_classifier(classifier_name, params)

# Classification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

acc = accuracy_score(y_test, y_pred)
st.write("Classifier: {}".format(classifier_name))
st.write("Accuracy = {}".format(acc))

# Plot
pca = PCA(2)
X_projected = pca.fit_transform(X)

x1 = X_projected[:, 0]
x2 = X_projected[:, 1]

fig = plt.figure()
plt.scatter(x1, x2, c=y, alpha=0.8, cmap="viridis")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.colorbar()

st.pyplot(fig)
if random.random() < percent_training: training_labels.append(sample['label']) training_continuous.append(sample['continuous_features']) training_discrete.append(sample['discrete_features']) if sample['label']: percent_training_popular += 1 else: testing_labels.append(sample['label']) testing_continuous.append(sample['continuous_features']) testing_discrete.append(sample['discrete_features']) percent_training_popular /= float(len(training_labels)) training_continuous = np.array(training_continuous) testing_continuous = np.array(testing_continuous) pca = PCA(n_components=num_components) pca.fit(training_continuous) training_continuous = training_continuous.dot(np.transpose( pca.components_)) testing_continuous = testing_continuous.dot(np.transpose(pca.components_)) # pprint.pprint(pca.components_) cont_nb = GaussianNB() disc_nb = MultinomialNB() cont_nb = cont_nb.fit(training_continuous, training_labels) disc_nb = disc_nb.fit(training_discrete, training_labels) # Combine discrete and continuous by multiplying and handling double prior cont_pred = cont_nb.predict_proba(testing_continuous)
print(peak)
evoked.plot(window_title="Evoked")

#################### PCA & ICA ######################
from mne.decoding import UnsupervisedSpatialFilter
from sklearn.decomposition import PCA, FastICA
import matplotlib.pyplot as plt

X = epochs.get_data()  # the number of channels == 30

print("==============PCA==================")
pca = UnsupervisedSpatialFilter(PCA(30), average=False)
pca_data = pca.fit_transform(X)
ev = mne.EvokedArray(np.mean(pca_data, axis=0),
                     mne.create_info(30, epochs.info['sfreq'], ch_types='eeg'),
                     tmin=tmin)
ev.plot(show=False, window_title="PCA")

print("==============ICA==================")
ica = UnsupervisedSpatialFilter(FastICA(30), average=False)
ica_data = ica.fit_transform(X)
ev1 = mne.EvokedArray(np.mean(ica_data, axis=0),
                      mne.create_info(30, epochs.info['sfreq'], ch_types='eeg'),
                      tmin=tmin)
ev1.plot(show=False, window_title='ICA')
def main_function(testCaseNumber): t1 = cv2.getTickCount() #Defining constants basePath = "./Images/" print "Example Number : ", testCaseNumber tNo = "1" pNo = "2" testCaseNumber = str(testCaseNumber) trainingImagePath = basePath + testCaseNumber + "/" + tNo + ".jpg" grayscaleImagePath = basePath + testCaseNumber + "/" + pNo + "G.jpg" outputImagePath = basePath + testCaseNumber + "/output.jpg" k = 5 try: os.stat("./../temp/" + testCaseNumber + "/") except: os.mkdir("./../temp/" + testCaseNumber + "/") #Reading Training Image trainingImage = cv2.imread(trainingImagePath) trainingImage = cv2.cvtColor(trainingImage, cv2.COLOR_BGR2LAB) m, n, _ = trainingImage.shape print "Color Quantization : " #Preprocessing variable from image l = trainingImage[:, :, 0] a = trainingImage[:, :, 1] b = trainingImage[:, :, 2] scaler = preprocessing.MinMaxScaler() pca = PCA(32) qab, centroid = quantization(a, b, k) print centroid # with open('./../temp/'+testCaseNumber+'/centroids', 'w') as csvfile: # writer = csv.writer(csvfile) # [writer.writerow(r) for r in centroid] t2 = cv2.getTickCount() t = (t2 - t1) / cv2.getTickFrequency() print "Time for quantization : ", t, " seconds" print "Feature extraction : " feat, classes = getKeyPointFeatures(l, qab) print "Length of feature descriptor before PCA : ", len(feat[0]) feat = scaler.fit_transform(feat) feat = pca.fit_transform(feat) print "Length of feature descriptor after PCA : ", len(feat[0]) t3 = cv2.getTickCount() t = (t3 - t2) / cv2.getTickFrequency() print "Time for feature extraction : ", t, " seconds" print "Training : " svm_classifier = train(feat, classes, k) t4 = cv2.getTickCount() t = (t4 - t3) / cv2.getTickFrequency() print "Time for training: ", t, " seconds" print "Prediction : " grayscaleImage = cv2.imread(grayscaleImagePath, 0) outputImage, probabilityValues = predict(svm_classifier, grayscaleImage, centroid, scaler, pca) #Writing temporary objects to disk #Remove later #cv2.imwrite("./../temp/"+testCaseNumber+"/labTempOut.jpg",outputImage) #outputTempImageBGR = cv2.cvtColor(outputImage,cv2.COLOR_LAB2BGR) #cv2.imwrite("./../temp/"+testCaseNumber+"/BGRTempOut.jpg",outputTempImageBGR) #with open('./../temp/'+testCaseNumber+'/probVal', 'w') as csvfile: # writer = csv.writer(csvfile) # [writer.writerow(r) for r in probabilityValues] outputImage = postProcess(outputImage, centroid, probabilityValues) t5 = cv2.getTickCount() t = (t5 - t4) / cv2.getTickFrequency() print "Time for prediction : ", t, " seconds" t = (t5 - t1) / cv2.getTickFrequency() print "Total time : ", t, " seconds" outputImage = cv2.cvtColor(outputImage, cv2.COLOR_LAB2BGR) trainingImage = cv2.cvtColor(trainingImage, cv2.COLOR_LAB2BGR) cv2.imwrite(outputImagePath, outputImage) cv2.imshow("Training", trainingImage) cv2.imshow("Original", grayscaleImage) cv2.imshow("Predicted", outputImage) cv2.waitKey() cv2.destroyAllWindows()
# Flatten each image into an r,g,b vector and run PCA on 1000 of them
import os
import cv2
import numpy as np
from keras.models import load_model
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from keras.layers import Activation, Conv2D, Dense, Flatten, MaxPooling2D, Dropout
from keras.models import Sequential, load_model
from keras.utils.np_utils import to_categorical

pca = PCA(n_components=48)
img_file_name_list = os.listdir("./face_scratch_image/")
print(len(img_file_name_list))

for i in range(len(img_file_name_list)):
    n = os.path.join("./face_scratch_image", img_file_name_list[i])
    img = cv2.imread(n)
    if img is None:
        img_file_name_list.pop(i)
        continue

# print(img_file_name_list[0:2])
X_train = np.array([])
mushrooms = mushrooms[:150] tree = CobwebTree() mushrooms_no_class = [{a: mushroom[a] for a in mushroom if a != 'classification'} for mushroom in mushrooms] clusters = next(cluster(tree, mushrooms_no_class)) mushroom_class = [mushroom[a] for mushroom in mushrooms for a in mushroom if a == 'classification'] ari = adjusted_rand_score(clusters, mushroom_class) dv = DictVectorizer(sparse=False) mushroom_X = dv.fit_transform(mushrooms_no_class) pca = PCA(n_components=2) mushroom_2d_x = pca.fit_transform(mushroom_X) colors = ['b', 'g', 'r', 'y', 'k', 'c', 'm'] clust_set = {v: i for i, v in enumerate(list(set(clusters)))} class_set = {v: i for i, v in enumerate(list(set(mushroom_class)))} for class_idx, class_label in enumerate(class_set): x = [v[0] for i, v in enumerate( mushroom_2d_x) if mushroom_class[i] == class_label] y = [v[1] for i, v in enumerate( mushroom_2d_x) if mushroom_class[i] == class_label] c = [colors[clust_set[clusters[i]]] for i, v in enumerate(mushroom_2d_x) if mushroom_class[i] == class_label] plt.scatter(x, y, color=c, marker=r"$ {} $".format( class_label[0]), label=class_label)
def model_selection(self): """ hyperparameter tuning is performed using GridSearchCV technique uses cross-validation when applying the default values of a 5-fold cross validation as a means of splitting the training data into a training and validation sets. model score is representen with the R-squared metrics """ models = [] models_1 = ["Ridge","Lasso","LinearRegression","PoissonRegressor"] models_2 = ["RandomForestRegressor","GradientBoostingRegressor"] model_3 = ["SVR"] models += models_1 + models_2 + model_3 models_dictionary = {"Ridge":Ridge(),"Lasso":Lasso(),"LinearRegression":LinearRegression(fit_intercept=True), "RandomForestRegressor":RandomForestRegressor(random_state=0),"GradientBoostingRegressor":GradientBoostingRegressor(random_state=0), "SVR":SVR(epsilon=0.5),"PoissonRegressor":PoissonRegressor(max_iter=200)} models_score = {} # Tuning of parameters for regression by cross-validation # Number of cross valiations is 5 for model in models: if model in models_1: pipe = Pipeline([ ('scaler', StandardScaler()), ('reduce_dim', PCA()), ('regressor', models_dictionary[model]) ]) pipe = pipe.fit(self.X_train, self.y_train) n_features_to_test = np.arange(1, 13) alpha_to_test = 2.0**np.arange(-6, +6) if model == "LinearRegression": params = {'reduce_dim__n_components': n_features_to_test, 'scaler' : [StandardScaler(), RobustScaler()]} else: params = {'reduce_dim__n_components': n_features_to_test, 'regressor__alpha': alpha_to_test, 'scaler' : [StandardScaler(), RobustScaler()]} gridsearch = GridSearchCV(pipe, params, verbose=1).fit(self.X_train, self.y_train) elif model in models_2: if model == "RandomForestRegressor": model_estimator =models_dictionary[model] params={'n_estimators':[20,30,40,60,80,100], 'max_depth': [5,10,15,20],'max_features':[2,5,8]} else: model_estimator = models_dictionary[model] params = {'learning_rate': [0.01,0.02,0.03,0.04], 'subsample' : [0.9, 0.5, 0.2, 0.1], 'n_estimators' : [20,30,40,60,80,100], 'max_depth' : [4,6,8,10] } gridsearch = GridSearchCV(estimator = model_estimator,param_grid = params,n_jobs=-1).fit(self.X_train, self.y_train) else: parameters = {'gamma': [1e-4, 1e-3, 0.01, 0.1, 0.2, 0.5, 0.6, 0.9],'C': [1, 2.5, 5,7.5,10,15]} gridsearch = GridSearchCV(models_dictionary[model], parameters).fit(self.X_train, self.y_train) print(" Results from Grid Search:",model) print("\n The best estimator across ALL searched params:\n",gridsearch.best_estimator_) print("\n The best score across ALL searched params:\n",gridsearch.best_score_) print("\n The best parameters across ALL searched params:\n",gridsearch.best_params_) print('\n Final score is: ', gridsearch.score(self.X_test, self.y_test)) print("") models_score[model] = gridsearch.score(self.X_test, self.y_test) self.models_score = models_score
ax.set_title(modelname, fontsize = 20) #title targets = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'] colors = ['r', 'g', 'b'] for target, color in zip(targets,colors): #new iterator with both variables combined (random) indicesToKeep = finalDf['target'] == target ax.scatter(finalDf.loc[indicesToKeep,'component 1'], finalDf.loc[indicesToKeep, 'component 2'] , c = color , s = 20) ax.legend(targets) ax.grid() df.corr()#correlation ##PCA (linear) #Transform higher-dimensional set of features that could be possibly correlated #into a lower-dimensional set of linearly uncorrelated features. from sklearn.decomposition import PCA pca = PCA(n_components=2)#,whiten=True,random_state=20 #n_components - Number of components to keep #When True (False by default) the components_ vectors are multiplied by the square root of n_samples #and then divided by the singular values to ensure uncorrelated outputs with unit component-wise #variances. pc = pca.fit_transform(x) plotgraph('PCA',pc) #Nonlinear kernelPCA from sklearn.decomposition import KernelPCA kpca = KernelPCA(n_components = 2, kernel = 'rbf') #kernel=linear,rbf(radial basis function),sigmoid,cosine xkpca = kpca.fit_transform(x) plotgraph('kernel pca',xkpca)
def runSVM():
    # One-at-a-time sweep: vary a single hyperparameter while the others stay at these defaults.
    defaults = {'scale': False, 'test_size': 0.4, 'C': 1.0, 'kernel': 'rbf',
                'cancer_dim': 10, 'dna_dim': 180}

    def run_trial(X_all, y_all, out_file, dim_label, scale, test_size, C, kernel, dim, full_dim, use_pca):
        # one train/score run, appending a result line to out_file
        X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=test_size)
        if scale:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.fit_transform(X_test)
        start = time.time()
        if use_pca:
            if dim != full_dim:
                pca = PCA(n_components=dim)
                X_train = pca.fit_transform(X_train)
                X_test = pca.transform(X_test)
            if scale:
                scaler = StandardScaler()
                X_train = scaler.fit_transform(X_train)
                X_test = scaler.fit_transform(X_test)
        model = SVC(C=C, kernel=kernel)
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        end = time.time()
        with open(out_file, 'a') as f:
            f.write("scale=%s, test_size=%f, C=%f, kernel=%s, %s=%d, score=%f, time=%fs\n"
                    % (str(scale), test_size, C, kernel, dim_label, dim, score, end - start))

    for param in ['scale', 'test_size', 'C', 'kernel', 'cancer_dim', 'dna_dim']:
        for value in params_svm[param]:
            cfg = dict(defaults)
            cfg[param] = value
            if param != 'dna_dim':  # the dna_dim sweep only touches the DNA data set
                run_trial(cancer_X, cancer_y,
                          'cancer_dim.txt' if param == 'cancer_dim' else 'cancer_%s.txt' % param,
                          'cancer_dim', cfg['scale'], cfg['test_size'], cfg['C'], cfg['kernel'],
                          cfg['cancer_dim'], 10, use_pca=(param == 'cancer_dim'))
            if param != 'cancer_dim':  # the cancer_dim sweep only touches the cancer data set
                run_trial(dna_X, dna_y,
                          'dna_dim.txt' if param == 'dna_dim' else 'dna_%s.txt' % param,
                          'dna_dim', cfg['scale'], cfg['test_size'], cfg['C'], cfg['kernel'],
                          cfg['dna_dim'], 180, use_pca=(param == 'dna_dim'))
def Score(self):
    ## data
    ########################################################################
    model_1 = RandomForestRegressor(max_depth=15, random_state=0)
    model_2 = LinearRegression(fit_intercept=True)
    model_3 = Ridge(alpha=5)
    model_4 = Lasso(alpha=10)
    model_5 = SVR(C=2.5, epsilon=0.5)
    model_6 = GradientBoostingRegressor(random_state=0)
    model_7 = PoissonRegressor()

    MSE = []
    R2 = []
    for mymodels in [model_1, model_2, model_3, model_4, model_5, model_6, model_7]:
        model_pipeline = Pipeline(steps=[('pre_processing', self.pre_process),
                                         ('scaler', StandardScaler()),
                                         ('reduce_dim', PCA()),
                                         ('model', mymodels)])
        model_pipeline.fit(self.X_train, self.y_train)
        MSE.append(mean_squared_error(self.y_train, model_pipeline.predict(self.X_train)) ** 0.5)
        R2.append(r2_score(self.y_train, model_pipeline.predict(self.X_train)))
    print(np.round(MSE, 2))
    print(np.round(R2, 2))
activity = pd.read_csv('./evaluate/thirtydays_final.csv', delimiter=',')
# activity1 = pd.read_csv('./datas/new_novins.csv', delimiter=',')
# pd.to_datetime(activity['date'])
activity.dropna(inplace=True)
del activity['date']

# convert to standard form
s = StandardScaler().fit_transform(activity)
normalize = pd.DataFrame(data=s)
print(normalize.head())

# do the PCA
pca = PCA(n_components=3)
prin_Comp = pca.fit_transform(s)
prin_CompDf = pd.DataFrame(data=prin_Comp, columns=['prin_comp1', 'prin_comp2', 'prin_comp3'])
prin_CompDf.head()

# Join the label to the data and un-comment 'for label data below'
# pca_data = pd.concat([prin_CompDf, activity[['0']]], axis=1)
# print(pca_data.head(5))

# for no-label data
# normalize.to_csv('./datas/normalize.csv')
prin_CompDf.to_csv('./evaluate/thirtydays_feature.csv')

plt.semilogy(prin_CompDf, '--o')
plt.title('Feature after PCA')
def runMLP():
    # One-at-a-time sweep: vary a single hyperparameter while the others stay at these defaults.
    defaults = {'scale': False, 'test_size': 0.4, 'layers': (100, ), 'activation': 'relu',
                'alpha': 0.0001, 'lr': 'adaptive', 'cancer_dim': 10, 'dna_dim': 180}

    def run_trial(X_all, y_all, out_file, dim_label, scale, test_size, layers, activation,
                  alpha, lr, dim, full_dim, use_pca):
        # one train/score run, appending a result line to out_file
        X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=test_size)
        if scale:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.fit_transform(X_test)
        start = time.time()
        if use_pca:
            if dim != full_dim:
                pca = PCA(n_components=dim)
                X_train = pca.fit_transform(X_train)
                X_test = pca.transform(X_test)
            if scale:
                scaler = StandardScaler()
                X_train = scaler.fit_transform(X_train)
                X_test = scaler.fit_transform(X_test)
        model = MLPClassifier(hidden_layer_sizes=layers, activation=activation, alpha=alpha,
                              learning_rate=lr, max_iter=1000)
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        end = time.time()
        with open(out_file, 'a') as f:
            f.write("scale=%s, test_size=%f, layers=%s, activation=%s, alpha=%f, lr=%s, %s=%d, score=%f, time=%fs\n"
                    % (str(scale), test_size, str(layers), activation, alpha, lr, dim_label, dim, score, end - start))

    for param in ['scale', 'test_size', 'layers', 'activation', 'alpha', 'lr', 'cancer_dim', 'dna_dim']:
        for value in params_mlp[param]:
            cfg = dict(defaults)
            cfg[param] = value
            if param != 'dna_dim':  # the dna_dim sweep only touches the DNA data set
                run_trial(cancer_X, cancer_y,
                          'cancer_dim.txt' if param == 'cancer_dim' else 'cancer_%s.txt' % param,
                          'cancer_dim', cfg['scale'], cfg['test_size'], cfg['layers'], cfg['activation'],
                          cfg['alpha'], cfg['lr'], cfg['cancer_dim'], 10, use_pca=(param == 'cancer_dim'))
            if param != 'cancer_dim':  # the cancer_dim sweep only touches the cancer data set
                run_trial(dna_X, dna_y,
                          'dna_dim.txt' if param == 'dna_dim' else 'dna_%s.txt' % param,
                          'dna_dim', cfg['scale'], cfg['test_size'], cfg['layers'], cfg['activation'],
                          cfg['alpha'], cfg['lr'], cfg['dna_dim'], 180, use_pca=(param == 'dna_dim'))
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.linear_model import RidgeCV

COLUMN_NAMES = ["GRE", "TOEFL", "University Rating", "SOP", "Recommendation", "CGPA", "Research"]

A = np.loadtxt("Admissions.csv", delimiter=",")
X = A[:, 0:-1]  # Independent variables
y = A[:, -1]    # Dependent variable (chance of acceptance)
print(X.shape)

# Plot PCA
plt.figure(figsize=(10, 10))
pca = PCA(n_components=2)
Y = pca.fit_transform(X)
plt.scatter(Y[:, 0], Y[:, 1], c=y)
plt.colorbar()

# Perform cross-validated ridge regression
clf = RidgeCV(alphas=[1e-2, 1e-1, 1, 10]).fit(X, y)
print(clf.score(X, y))

# Do a scatterplot of predicted versus actual
# (use predict() so the fitted intercept is included; X.dot(clf.coef_) would drop it)
ypred = clf.predict(X)
plt.figure(figsize=(8, 8))
plt.scatter(y, ypred)
plt.xlabel("Actual Chance")
plt.ylabel("Predicted Chance")
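# Hedged aside: the admissions columns sit on very different ranges (GRE in the
# hundreds, CGPA below 10, Research a 0/1 flag), so the unscaled 2-D PCA above is
# dominated by the widest column. A standardized projection is a quick sanity check;
# this reuses X and y from the block above.
from sklearn.preprocessing import StandardScaler

plt.figure(figsize=(10, 10))
Y_scaled = PCA(n_components=2).fit_transform(StandardScaler().fit_transform(X))
plt.scatter(Y_scaled[:, 0], Y_scaled[:, 1], c=y)
plt.colorbar()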
def reconstructRF(): """ run KFOLD method for random forest regression """ #import packages import os import numpy as np import pandas as pd #from sklearn import metrics #from scipy import stats #import seaborn as sns #import matplotlib.pyplot as plt #from sklearn.model_selection import KFold from datetime import datetime from sklearn.ensemble import RandomForestRegressor from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler #defining directories dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged" dir_out = "/lustre/fs0/home/mtadesse/rfReconstruction" surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef" # #load KFOLD result csv file # os.chdir('F:\\06_eraint_results\\sonstig') # kf_dat = pd.read_csv('eraint_randForest_kfold.csv') # #edit the tg names to be usable later on # editName = lambda x: x.split('.csv')[0] # kf_dat['tg'] = pd.DataFrame(list(map(editName, kf_dat['tg'])), columns= ['tg']) #cd to the lagged predictors directory os.chdir(dir_in) x = 129 y = 130 #looping through for tg in range(x,y): os.chdir(dir_in) tg_name = os.listdir()[tg] print(tg, tg_name) #load predictor pred = pd.read_csv(tg_name) pred.drop('Unnamed: 0', axis = 1, inplace = True) #add squared and cubed wind terms (as in WPI model) pickTerms = lambda x: x.startswith('wnd') wndTerms = pred.columns[list(map(pickTerms, pred.columns))] wnd_sqr = pred[wndTerms]**2 wnd_cbd = pred[wndTerms]**3 pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis = 1) #standardize predictor data dat = pred.iloc[:,1:] scaler = StandardScaler() print(scaler.fit(dat)) dat_standardized = pd.DataFrame(scaler.transform(dat), \ columns = dat.columns) pred_standardized = pd.concat([pred['date'], dat_standardized], axis = 1) #load surge data os.chdir(surge_path) surge = pd.read_csv(tg_name) surge.drop('Unnamed: 0', axis = 1, inplace = True) #remove duplicated surge rows surge.drop(surge[surge['ymd'].duplicated()].index, axis = 0, inplace = True) surge.reset_index(inplace = True) surge.drop('index', axis = 1, inplace = True) #adjust surge time format to match that of pred time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d')) surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])), columns = ['date']) time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S')) surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]], axis = 1) #merge predictors and surge to find common time frame pred_surge = pd.merge(pred_standardized, surge_new.iloc[:,:2], on='date', how='right') pred_surge.sort_values(by = 'date', inplace = True) #find rows that have nans and remove them row_nan = pred_surge[pred_surge.isna().any(axis =1)] pred_surge.drop(row_nan.index, axis = 0, inplace = True) pred_surge.reset_index(inplace = True) pred_surge.drop('index', axis = 1, inplace = True) #in case pred and surge don't overlap if pred_surge.shape[0] == 0: print('-'*80) print('Predictors and Surge don''t overlap') print('-'*80) continue pred_surge['date'] = pd.DataFrame(list(map(time_stamp, \ pred_surge['date'])), \ columns = ['date']) #prepare data for training/testing X = pred_surge.iloc[:,1:-1] y = pd.DataFrame(pred_surge['surge']) y = y.reset_index() y.drop(['index'], axis = 1, inplace = True) #apply PCA #get the number of PCs used during validation # pc_num = kf_dat.loc[kf_dat['tg'] == tg_name]['num_95pcs'] pca = PCA(0.95) pca.fit(X) X_pca = pca.transform(X) {# #apply 10 fold cross validation # kf = KFold(n_splits=10, random_state=29) # metric_corr = []; metric_rmse = []; #combo = pd.DataFrame(columns = ['pred', 'obs']) 
# for train_index, test_index in kf.split(X): # X_train, X_test = X_pca[train_index], X_pca[test_index] # y_train, y_test = y['surge'][train_index], y['surge'][test_index] # #train regression model # rf = RandomForestRegressor(n_estimator = 50, min_samples_leaf = 1) # lm.fit(X_train, y_train) # #predictions # predictions = lm.predict(X_test) # # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)), \ # # pd.DataFrame(np.array(y_test))], \ # # axis = 1) # # pred_obs.columns = ['pred', 'obs'] # # combo = pd.concat([combo, pred_obs], axis = 0) # #evaluation matrix - check p value # if stats.pearsonr(y_test, predictions)[1] >= 0.05: # print("insignificant correlation!") # continue # else: # #print(stats.pearsonr(y_test, predictions)) # metric_corr.append(stats.pearsonr(y_test, predictions)[0]) # #print(np.sqrt(metrics.mean_squared_error(y_test, predictions))) # metric_rmse.append(np.sqrt(metrics.mean_squared_error(y_test, predictions))) # #number of years used to train/test model # num_years = np.ceil((pred_surge['date'][pred_surge.shape[0]-1] -\ # pred_surge['date'][0]).days/365) } longitude = surge['lon'][0] latitude = surge['lat'][0] num_pc = X_pca.shape[1] #number of principal components # corr = np.mean(metric_corr) # rmse = np.mean(metric_rmse) # print('num_year = ', num_years, ' num_pc = ', num_pc ,'avg_corr = ',\ # np.mean(metric_corr), ' - avg_rmse (m) = ', \ # np.mean(metric_rmse), '\n') #%% #surge reconstruction pred_for_recon = pred[~pred.isna().any(axis = 1)] pred_for_recon = pred_for_recon.reset_index().drop('index', axis = 1) #standardize predictor data dat = pred_for_recon.iloc[:,1:] scaler = StandardScaler() print(scaler.fit(dat)) dat_standardized = pd.DataFrame(scaler.transform(dat), \ columns = dat.columns) pred_standardized = pd.concat([pred_for_recon['date'], dat_standardized], axis = 1) X_recon = pred_standardized.iloc[:, 1:] #apply PCA pca = PCA(num_pc) #use the same number of PCs used for training pca.fit(X_recon) X_pca_recon = pca.transform(X_recon) #%% #model preparation #defining the rf model with number of trees and minimum leaves rf = RandomForestRegressor(n_estimators=50, min_samples_leaf=1, \ random_state = 29) rf.fit(X_pca, y) #get prediction interval def pred_ints(model, X_pca_recon, percentile = 95): """ function to construct prediction interval taking into account the result of each regression tree """ err_down = []; err_up = []; preds= []; for pred in model.estimators_: preds.append(pred.predict(X_pca_recon)) preds = np.vstack(preds).T err_down = np.percentile(preds, (100 - percentile)/2., axis = 1, \ keepdims = True) err_up = np.percentile(preds, 100 - (100 - percentile)/2., axis =1, \ keepdims = True) return err_down.reshape(-1), err_up.reshape(-1) #compute 95% prediction intervals err_down, err_up = pred_ints(rf, X_pca_recon, percentile = 95); #reconstructed surge goes here truth = rf.predict(X_pca_recon); correct = 0.; for i, val in enumerate(truth): if err_down[i] <= val <= err_up[i]: correct +=1 print(correct*100/len(truth), '\n') #final dataframe final_dat = pd.concat([pred_standardized['date'], \ pd.DataFrame([truth, err_down, err_up]).T], axis = 1) final_dat['lon'] = longitude final_dat['lat'] = latitude final_dat.columns = ['date', 'surge_reconsturcted', 'pred_int_lower',\ 'pred_int_upper', 'lon', 'lat'] {#plot - optional # time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S')) # final_dat['date'] = pd.DataFrame(list(map(time_stamp, final_dat['date'])), columns = ['date']) # surge['date'] = pd.DataFrame(list(map(time_stamp, 
surge['date'])), columns = ['date']) # sns.set_context('notebook', font_scale = 2) # plt.figure() # plt.plot(final_dat['date'], final_dat['mean'], color = 'green') # plt.scatter(surge['date'], surge['surge'], color = 'blue') #prediction intervals # plt.plot(final_dat['date'], final_dat['obs_ci_lower'], color = 'red', linestyle = "--", lw = 0.8) # plt.plot(final_dat['date'], final_dat['obs_ci_upper'], color = 'red', linestyle = "--", lw = 0.8) #confidence intervals # plt.plot(final_dat['date'], final_dat['mean_ci_upper'], color = 'black', linestyle = "--", lw = 0.8) # plt.plot(final_dat['date'], final_dat['mean_ci_lower'], color = 'black', linestyle = "--", lw = 0.8) } #save df as cs - in case of interruption os.chdir(dir_out) final_dat.to_csv(tg_name) #cd to dir_in os.chdir(dir_in)
def fit_pca(fv):
    # standardize the feature vectors, then fit and apply PCA to the scaled copy
    scaled_new_fv = scale_data(fv)
    pca = PCA()
    pca.fit(scaled_new_fv)
    pca_fv = pca.transform(scaled_new_fv)
    return pca_fv
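# Hedged usage sketch on synthetic data; `scale_data` above is assumed to behave like
# a column-wise StandardScaler, which is what this stand-in uses.
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

fv_demo = np.random.rand(200, 6)
pca_demo = PCA().fit(StandardScaler().fit_transform(fv_demo))
print(pca_demo.explained_variance_ratio_.cumsum())  # cumulative variance per component count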
# # License: BSD 3 clause
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

iris = load_iris()
X, y = iris.data, iris.target

# This dataset is way too high-dimensional. Better do PCA:
pca = PCA(n_components=2)

# Maybe some original features were good, too?
selection = SelectKBest(k=1)

# Build estimator from PCA and univariate selection:
combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])

# Use combined features to transform dataset:
X_features = combined_features.fit(X, y).transform(X)
print("Combined space has", X_features.shape[1], "features")

svm = SVC(kernel="linear")

# Do grid search over k, n_components and C:
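# The comment above announces a grid search that is cut off here. A plausible
# continuation, following the usual scikit-learn FeatureUnion example, tunes the PCA
# dimensionality, the SelectKBest k and the SVM C inside one Pipeline:
pipeline = Pipeline([("features", combined_features), ("svm", svm)])
param_grid = dict(features__pca__n_components=[1, 2, 3],
                  features__univ_select__k=[1, 2],
                  svm__C=[0.1, 1, 10])
grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=10)
grid_search.fit(X, y)
print(grid_search.best_estimator_)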
sns.heatmap(correlation, annot=True)
plt.show()
print('end')

from sklearn.cluster import KMeans
kmeans_model = KMeans(n_clusters=5, random_state=1)
good_columns = nba._get_numeric_data().dropna(axis=1)
kmeans_model.fit(good_columns)
labels = kmeans_model.labels_
print(labels)

from sklearn.decomposition import PCA
pca_2 = PCA(2)
plot_columns = pca_2.fit_transform(good_columns)
plt.scatter(x=plot_columns[:, 0], y=plot_columns[:, 1], c=labels)
plt.show()

# Find player LeBron (raw strings keep the backslash in the player id literal)
LeBron = good_columns.loc[nba['Player'] == r'LeBron James\jamesle01', :]
# Find player Durant
Durant = good_columns.loc[nba['Player'] == r'Kevin Durant\duranke01', :]
# Print the players
print(LeBron)
print(Durant)
# Change the dataframes to a list
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 6 15:36:36 2019

@author: KIIT
"""
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

df = pd.read_csv('crime_data.csv')
features = df.iloc[:, [1, 2, 4]].values

pca = PCA(n_components=2)
features = pca.fit_transform(features)

kmeans = KMeans(n_clusters=3, init='k-means++', random_state=0)
pred_cluster = kmeans.fit_predict(features)

plt.scatter(features[pred_cluster == 0, 0], features[pred_cluster == 0, 1], c='blue', label='LowCrime')
plt.scatter(features[pred_cluster == 1, 0], features[pred_cluster == 1, 1], c='red', label='MedCrime')
plt.scatter(features[pred_cluster == 2, 0], features[pred_cluster == 2, 1], c='green', label='HighCrime')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], c='yellow', label='Centroids')
plt.title('Crime Data')
plt.xlabel('P1 Features')
plt.ylabel('P2 Features')
plt.legend()
plt.show()
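# Optional sanity check (reuses the fitted `pca` above): the three crime columns are
# kept on their raw scales, so it is worth seeing how much variance the 2-D
# projection actually retains.
print(pca.explained_variance_ratio_)        # per-component share
print(pca.explained_variance_ratio_.sum())  # total share kept by the two components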
def pca2d_OnClickFit(atoms, colors): from sklearn.decomposition import PCA from matplotlib.lines import Line2D #from numpy import arange, cos, linspace, pi, sin, random from scipy.interpolate import splprep, splev ### Calculate eigenvectors pca = PCA(n_components=2) X_reduced = pca.fit_transform(atoms) # draw a scatter plot of the generated values fig = plt.figure(figsize=(20, 16)) ax = fig.add_subplot(111) # legend elements base_colors = [ 'blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'indigo', 'burlywood', 'darksalmon', 'darkviolet' ] legend_elements = [ Line2D([0], [0], marker='o', color=base_colors[0], label='5%', markerfacecolor=base_colors[0], markersize=10), Line2D([0], [0], marker='o', color=base_colors[1], label='10%', markerfacecolor=base_colors[1], markersize=10), Line2D([0], [0], marker='o', color=base_colors[2], label='20%', markerfacecolor=base_colors[2], markersize=10), Line2D([0], [0], marker='o', color=base_colors[3], label='30%', markerfacecolor=base_colors[3], markersize=10), Line2D([0], [0], marker='o', color=base_colors[4], label='40%', markerfacecolor=base_colors[4], markersize=10), Line2D([0], [0], marker='o', color=base_colors[5], label='50%', markerfacecolor=base_colors[5], markersize=10), Line2D([0], [0], marker='o', color=base_colors[6], label='60%', markerfacecolor=base_colors[6], markersize=10), Line2D([0], [0], marker='o', color=base_colors[7], label='70%', markerfacecolor=base_colors[7], markersize=10), Line2D([0], [0], marker='o', color=base_colors[8], label='80%', markerfacecolor=base_colors[8], markersize=10), Line2D([0], [0], marker='o', color=base_colors[9], label='90%', markerfacecolor=base_colors[9], markersize=10), Line2D([0], [0], marker='o', color=base_colors[10], label='100%', markerfacecolor=base_colors[10], markersize=10), ] plt.legend(loc='upper left', handles=legend_elements, frameon=False) plt.title( 'Para-aminobenzamidine Morphology - Principal Components Analysis 5-atoms' ) # extract the scatterplot drawing in a separate function so we ca re-use the code ax.scatter(X_reduced[:, 0], X_reduced[:, 1], c=colors, alpha=0.8, edgecolor='none', s=50, picker=True) # define the behaviour -> what happens when you pick a dot on the scatterplot by clicking close to it def onpick(event): # Create embedded figure # spline parameters s = 3.0 # smoothness parameter k = 2 # spline order figi = plt.figure() for subplotnum, dataind in enumerate(event.ind): ax = Axes3D(figi) data = atoms[dataind].reshape(12, 3) tckp, u = splprep([data[:, 0], data[:, 1], data[:, 2]], s=s, k=k, nest=-1) # find the knot points xnew, ynew, znew = splev(np.linspace(0, 1, 400), tckp) print xnew, ynew, znew ax.plot(data[:, 0], data[:, 1], data[:, 2], 'o', c=colors[dataind]) ax.plot(xnew, ynew, znew, 'r-', label='fit', c='k') ax.set_xlabel('X Axis') ax.set_ylabel('Y Axis') ax.set_zlabel('Y Axis') ax.set_title('Spacer Arm Coordinates') figi.show() # connect the click handler function to the scatterplot fig.canvas.mpl_connect('pick_event', onpick) plt.show() fig.savefig('pca2d-interactive.png')
def validateRF(): """ run KFOLD method for regression """ #defining directories dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged" dir_out = "/lustre/fs0/home/mtadesse/merraRFValidation" surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef" #cd to the lagged predictors directory os.chdir(dir_in) x = 66 y = 67 #empty dataframe for model validation df = pd.DataFrame(columns = ['tg', 'lon', 'lat', 'num_year', \ 'num_95pcs','corrn', 'rmse']) #looping through for tg in range(x,y): os.chdir(dir_in) #filter only .csv files tgNames = [] for file in glob.glob("*.csv"): tgNames.append(file) tg_name = sorted(tgNames)[tg] print(tg_name) ########################################## #check if this tg is already taken care of ########################################## os.chdir(dir_out) if os.path.isfile(tg_name): print("this tide gauge is already taken care of") return "file already analyzed!" os.chdir(dir_in) #load predictor pred = pd.read_csv(tg_name) pred.drop('Unnamed: 0', axis = 1, inplace = True) #add squared and cubed wind terms (as in WPI model) pickTerms = lambda x: x.startswith('wnd') wndTerms = pred.columns[list(map(pickTerms, pred.columns))] wnd_sqr = pred[wndTerms]**2 wnd_cbd = pred[wndTerms]**3 pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis = 1) #standardize predictor data dat = pred.iloc[:,1:] scaler = StandardScaler() print(scaler.fit(dat)) dat_standardized = pd.DataFrame(scaler.transform(dat), \ columns = dat.columns) pred_standardized = pd.concat([pred['date'], dat_standardized], axis = 1) #load surge data os.chdir(surge_path) surge = pd.read_csv(tg_name) surge.drop('Unnamed: 0', axis = 1, inplace = True) #remove duplicated surge rows surge.drop(surge[surge['ymd'].duplicated()].index, axis = 0, inplace = True) surge.reset_index(inplace = True) surge.drop('index', axis = 1, inplace = True) #adjust surge time format to match that of pred time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d')) surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])), columns = ['date']) time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S')) surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]], axis = 1) #merge predictors and surge to find common time frame pred_surge = pd.merge(pred_standardized, surge_new.iloc[:,:2], on='date', how='right') pred_surge.sort_values(by = 'date', inplace = True) #find rows that have nans and remove them row_nan = pred_surge[pred_surge.isna().any(axis =1)] pred_surge.drop(row_nan.index, axis = 0, inplace = True) pred_surge.reset_index(inplace = True) pred_surge.drop('index', axis = 1, inplace = True) #in case pred and surge don't overlap if pred_surge.shape[0] == 0: print('-'*80) print('Predictors and Surge don''t overlap') print('-'*80) continue pred_surge['date'] = pd.DataFrame(list(map(time_stamp, \ pred_surge['date'])), \ columns = ['date']) #prepare data for training/testing X = pred_surge.iloc[:,1:-1] y = pd.DataFrame(pred_surge['surge']) y = y.reset_index() y.drop(['index'], axis = 1, inplace = True) #apply PCA pca = PCA(.95) pca.fit(X) X_pca = pca.transform(X) #apply 10 fold cross validation kf = KFold(n_splits=10, random_state=29) metric_corr = []; metric_rmse = []; #combo = pd.DataFrame(columns = ['pred', 'obs']) for train_index, test_index in kf.split(X): X_train, X_test = X_pca[train_index], X_pca[test_index] y_train, y_test = y['surge'][train_index], y['surge'][test_index] #train regression model rf= RandomForestRegressor(n_estimators = 50, random_state = 101, \ min_samples_leaf = 1) rf.fit(X_train, y_train) #predictions 
predictions = rf.predict(X_test) # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)), \ # pd.DataFrame(np.array(y_test))], \ # axis = 1) # pred_obs.columns = ['pred', 'obs'] # combo = pd.concat([combo, pred_obs], axis = 0) #evaluation matrix - check p value if stats.pearsonr(y_test, predictions)[1] >= 0.05: print("insignificant correlation!") continue else: print(stats.pearsonr(y_test, predictions)) metric_corr.append(stats.pearsonr(y_test, predictions)[0]) print(np.sqrt(metrics.mean_squared_error(y_test, predictions))) print() metric_rmse.append(np.sqrt(metrics.mean_squared_error(y_test, predictions))) #number of years used to train/test model num_years = (pred_surge['date'][pred_surge.shape[0]-1] -\ pred_surge['date'][0]).days/365 longitude = surge['lon'][0] latitude = surge['lat'][0] num_pc = X_pca.shape[1] #number of principal components corr = np.mean(metric_corr) rmse = np.mean(metric_rmse) print('num_year = ', num_years, ' num_pc = ', num_pc ,'avg_corr = ',np.mean(metric_corr), ' - avg_rmse (m) = ', \ np.mean(metric_rmse), '\n') #original size and pca size of matrix added new_df = pd.DataFrame([tg_name, longitude, latitude, num_years, num_pc, corr, rmse]).T new_df.columns = ['tg', 'lon', 'lat', 'num_year', \ 'num_95pcs','corrn', 'rmse'] df = pd.concat([df, new_df], axis = 0) #save df as cs - in case of interruption os.chdir(dir_out) df.to_csv(tg_name)
def test_CNN_model_Cifar(num_classes, model_name, x_train_bis, y_train_bis, x_test_bis, y_test_bis, x_other, y_other, training = False, plot = True): print("Testing CNN Model : {} on Cifar10 ...".format(model_name)) # -------------------------------------- # CNN network definition # -------------------------------------- # Network params conv_depth_1 = 100 kernel_size_1 = 3 conv_depth_2 = 100 kernel_size_2 = 3 pool_size_2 = 2 conv_depth_3 = 200 kernel_size_3 = 3 conv_depth_4 = 200 kernel_size_4 = 3 conv_depth_5 = 400 kernel_size_5 = 3 pool_size_5 = 2 hidden_size_1 = 600 weight_penalty = 0.0001 model = Sequential() model.add(Conv2D(conv_depth_1, (kernel_size_1, kernel_size_1), padding='same', input_shape=x_train_bis.shape[1:])) model.add(Activation('relu')) model.add(Conv2D(conv_depth_2, (kernel_size_2, kernel_size_2), padding='same')) model.add(Activation('relu')) model.add(MaxPooling2D(pool_size=(pool_size_2, pool_size_2))) model.add(Dropout(0.3)) model.add(Conv2D(conv_depth_3, (kernel_size_3, kernel_size_3), padding='same', kernel_regularizer=regularizers.l2(weight_penalty))) model.add(Activation('relu')) model.add(Conv2D(conv_depth_4, (kernel_size_4, kernel_size_4), padding='same', kernel_regularizer=regularizers.l2(weight_penalty))) model.add(Activation('relu')) model.add(Conv2D(conv_depth_5, (kernel_size_5, kernel_size_5), padding='same', kernel_regularizer=regularizers.l2(weight_penalty))) model.add(Activation('relu')) model.add(MaxPooling2D(pool_size=(pool_size_5, pool_size_5))) model.add(Dropout(0.3)) model.add(Flatten()) model.add(Dropout(0.5)) model.add(Dense(hidden_size_1, kernel_regularizer=regularizers.l2(weight_penalty))) model.add(Activation('relu')) model.add(Dropout(0.5)) model.add(Dense(num_classes)) model.add(Activation('softmax')) # Training on whole dataset if (training): learning_rate = 0.0001 batch_size = 32 num_epochs = 50 opt = Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, amsgrad=False) #opt = RMSprop(lr=learning_rate,decay=1e-6) model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy']) history = model.fit(x_train_bis, y_train_bis, verbose = True, epochs = num_epochs, batch_size=batch_size, validation_split=0.2, shuffle = True) model.save(model_name) # Loading the model: if not training: print("Loading model...") model = load_model(model_name) print("Done.") # Test print("Evaluating Model...") print("Performance on train:", model.evaluate(x_train_bis, y_train_bis)) print("Performance on test:", model.evaluate(x_test_bis, y_test_bis)) ##### FEATURE EXTRACTION ##### print("Extracting Feature for some Examples...") intermediate_layer_model = Model(inputs=model.input, outputs=model.layers[-3].output) results = intermediate_layer_model.predict(x_other) # Extraction of some examples nb_examples = 700 state = True i = 0 results_per_class = [0]*(10-num_classes) while state: x = intermediate_layer_model.predict(np.array([x_other[i]]))[0] y = y_other[i,0] if type(results_per_class[y - num_classes]) == type(0): results_per_class[y - num_classes] = [x] else: if len(results_per_class[y - num_classes]) < nb_examples: results_per_class[y - num_classes].append(x) else: stop = True for elem in results_per_class: if type(elem) == type(0): stop = False break if len(elem) < nb_examples: stop = False break if stop: state = False i += 1 results_per_class = np.array(results_per_class) print("Done.") # Ploting the last feature for the new classes with dimension reduction # methods: if (plot): print("Ploting results with PCA and TSNE...") N = 3000 feat_cols = 
[ 'index'+str(i) for i in range(results.shape[1]) ] df = pd.DataFrame(results,columns=feat_cols) df['y'] = y_other df['label'] = df['y'].apply(lambda i: str(i)) print('Size of the dataframe: {}'.format(df.shape)) ########### PCA ########## # np.random.seed(42) rndperm = np.random.permutation(df.shape[0]) df_subset = df.loc[rndperm[:N],:].copy() data_subset = df_subset[feat_cols].values pca = PCA(n_components=3) pca_result = pca.fit_transform(data_subset) df_subset['pca-one'] = pca_result[:,0] df_subset['pca-two'] = pca_result[:,1] df_subset['pca-three'] = pca_result[:,2] print('Explained variation per \ principal component: {}'.format(pca.explained_variance_ratio_)) print('Size of the dataframe: {}'.format(df_subset.shape)) ax = plt.figure(figsize=(16,10)).gca(projection='3d') ax.scatter( xs=df_subset.loc[rndperm,:]["pca-one"], ys=df_subset.loc[rndperm,:]["pca-two"], zs=df_subset.loc[rndperm,:]["pca-three"], c=df.loc[rndperm,:]["y"], cmap='tab10' ) ax.set_xlabel('pca-one') ax.set_ylabel('pca-two') ax.set_zlabel('pca-three') plt.show() ######## T-SNE Method ######## data_subset = df_subset[feat_cols].values time_start = time.time() tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300) tsne_results = tsne.fit_transform(data_subset) print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start)) df_subset['tsne-2d-one'] = tsne_results[:,0] df_subset['tsne-2d-two'] = tsne_results[:,1] plt.figure(figsize=(16,10)) sns.scatterplot( x="tsne-2d-one", y="tsne-2d-two",hue="y", palette=sns.color_palette("hls", 10-num_classes), data=df_subset, legend="full", alpha=0.3) plt.show() print("Done.") # Prediction evaluation on new classes: print("Computing the confusion matrix for one-shot learning on 300 different references...") # Tests: confusion_matrix = np.zeros((10-num_classes, 10 - num_classes), dtype = int) # ligne : prediction # colonne : reference nb_of_different_references = 100 accuracies = np.zeros((10-num_classes, nb_of_different_references)) for p in range(nb_of_different_references): index_ref = rd.randint(0, nb_examples-1) references = results_per_class[:,index_ref] for i in range(10-num_classes): nb_mistakes = 0 k = 0 for elem in results_per_class[i,:]: if k != index_ref: index = compute_nearest_neighbor(elem, references) confusion_matrix[index, i] += 1 if index != i: nb_mistakes += 1 k += 1 accuracies[i, p] = 1 - nb_mistakes/(nb_examples-1) for num_class in range(10-num_classes): print("Class n°", num_class+num_classes,"accuracy:", np.mean(accuracies[num_class,:])) print("Confusion Matrix:") print(confusion_matrix) print("Done.") print("Computing the confusion matrix for 3-shots learning on 100 different references...") confusion_matrix = np.zeros((10-num_classes, 10 - num_classes), dtype = int) # ligne : prediction # colonne : reference nb_of_different_references = 100 accuracies = np.zeros((10-num_classes, nb_of_different_references)) for p in range(nb_of_different_references): index_ref = rd.randint(0, nb_examples-3) references = results_per_class[:,index_ref] + results_per_class[:, index_ref +1 ] + \ results_per_class[:,index_ref+2] references /= 3 for i in range(10-num_classes): nb_mistakes = 0 k = 0 for elem in results_per_class[i,:]: if k != index_ref: index = compute_nearest_neighbor(elem, references) confusion_matrix[index, i] += 1 if index != i: nb_mistakes += 1 k += 1 accuracies[i, p] = 1 - nb_mistakes/(nb_examples-1) for num_class in range(10-num_classes): print("Class n°", num_class+num_classes,"accuracy:", np.mean(accuracies[num_class,:])) 
print("Confusion Matrix:") print(confusion_matrix) print("Done. End Testing.")
)
parser.add_argument("--verbose", type=int, default=0)
parser.add_argument(
    "--pca-components",
    type=int,
    default=50,
    help="Number of principal components for preprocessing.",
)
args = parser.parse_args()

print("Used number of threads: {}".format(_openmp_effective_n_threads()))
X, y = load_data(order=args.order)

if args.pca_components > 0:
    t0 = time()
    X = PCA(n_components=args.pca_components).fit_transform(X)
    print(
        "PCA preprocessing down to {} dimensions took {:0.3f}s".format(
            args.pca_components, time() - t0
        )
    )

methods = []

# Put TSNE in methods
tsne = TSNE(
    n_components=2,
    init="pca",
    perplexity=args.perplexity,
    verbose=args.verbose,
    n_iter=1000,
df = pd.DataFrame(data=datas, columns=['x1', 'x2', 'x3', 'x4', 'x5'])
axes = pd.plotting.scatter_matrix(df, alpha=0.9, figsize=(7, 7), c='blue', s=80)
plt.tight_layout()
plt.savefig('scatter_matrix.png')  # save before show(), otherwise the saved figure is blank
plt.show()

fig = plt.figure(figsize=(7, 7))
d = spc(datas) + Tsquare_single()
print(d)

d2 = PCA(n_components=5)
datas_standard = (datas - np.mean(datas, axis=0)) / np.std(datas, axis=0)
transformed = d2.fit_transform(datas_standard)
Lambda = d2.explained_variance_
UCL = 3 * sqrt(Lambda[0])
LCL = -3 * sqrt(Lambda[0])

fig = plt.figure(figsize=(7, 7))
plt.plot(transformed[:, 0], marker='.', markersize=15, linestyle='--',
        cluster_df = new_df.loc[df['Cluster'] == i]
        stratified_data = stratified_data.append(cluster_df.sample(frac=.25))
    stratified_data = stratified_data.drop('Cluster', axis=1)
    pca = PCA(n_components=3)
    pca.fit_transform(stratified_data)
    loadings = pd.DataFrame(pca.components_.T, columns=['PC1', 'PC2', 'PC3'],
                            index=stratified_data.columns)
    load = loadings.apply(np.square)
    load["Sum_of_Squares"] = load.apply(np.sum, axis=1)
    load = load.sort_values(by=['Sum_of_Squares'], ascending=False)
    top2PCA = list(zip(stratified_data.values[:, 3],
                       stratified_data.values[:, 4],
                       stratified_data.values[:, 6]))
    return json.dumps(top2PCA)


# StandardScaler must be instantiated before fit_transform is called
pca = PCA(n_components=17, random_state=0)
original_df = pca.fit_transform(StandardScaler().fit_transform(df))
PCA_Array = pca.explained_variance_ratio_
calculate_percent = toPercent(PCA_Array)
cumulative_sum_var = cumulative_sum(calculate_percent)

random_df = pca.fit_transform(StandardScaler().fit_transform(randomSampledData()))
PCA_Array_random = pca.explained_variance_ratio_
calculate_percent_random = toPercent(PCA_Array_random)
cumulative_sum_random = cumulative_sum(calculate_percent_random)

stratified_df = pca.fit_transform(StandardScaler().fit_transform(stratified_data()))
PCA_Array_stratified = pca.explained_variance_ratio_
calculate_percent_stratified = toPercent(PCA_Array_stratified)
cumulative_sum_stratified = cumulative_sum(calculate_percent_stratified)
def pca2d(atoms, colors):
    from sklearn.decomposition import PCA
    from matplotlib.lines import Line2D

    ### Calculate eigenvectors
    pca = PCA(n_components=2)
    X_reduced = pca.fit_transform(atoms)

    plt.title('PCA2D Clustering')
    plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=colors, alpha=0.8, edgecolor='none')

    base_colors = [
        'blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black',
        'indigo', 'burlywood', 'darksalmon', 'darkviolet'
    ]
    percent_labels = ['5%', '10%', '20%', '30%', '40%', '50%',
                      '60%', '70%', '80%', '90%', '100%']
    # one legend handle per colour/percentage pair
    legend_elements = [
        Line2D([0], [0], marker='o', color=c, label=lab, markerfacecolor=c, markersize=10)
        for c, lab in zip(base_colors, percent_labels)
    ]
    plt.legend(loc='upper left', handles=legend_elements, frameon=False)

    plt.savefig('pca2d.png')  # save before show(), otherwise the saved figure is blank
    plt.show()
# Logistic regression
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

# naive_bayes
# See D:\opensource\scrapy-work\wolf_nlp\算法学习笔记\NLP汉语自然语言处理原理与实践-读书笔记/20180424-bayes.md
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
# Collinearity can be a problem.
# Repeated words: the multinomial model counts every occurrence, the Bernoulli model
# counts a word at most once, and the mixed model counts a word once for the sentence
# probability but every occurrence for the statistics.

# Dimensionality reduction: t-SNE and PCA
# t-SNE keeps more representative structure, i.e. it best reflects the differences between samples.
# t-SNE is very slow to run, while PCA is comparatively fast.
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
data_pca = PCA(n_components=50).fit_transform(data)
data_pca_tsne = TSNE(n_components=2).fit_transform(data_pca)

# Build a bag-of-words model
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
# ngram_range is a (min_n, max_n) pair: (1, 3) keeps unigrams, bigrams and trigrams
vectorizer = CountVectorizer(max_features=1000, ngram_range=(1, 3))
# encode the corpus with the bag-of-words model; `corpus` is a placeholder for the list of raw documents
train_data_features = vectorizer.fit_transform(corpus).toarray()

# Data preprocessing
# Feature scaling / standardization
from sklearn.preprocessing import MinMaxScaler, StandardScaler, scale
# One-hot encoding
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import minmax_scale
# minmax_scale is a plain function; the estimator form is MinMaxScaler
minmax_scale(data)
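# Hedged end-to-end sketch tying the notes above together on a toy corpus; the
# documents and component counts are made up purely for illustration.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

docs = ["the cat sat", "the dog sat", "cats and dogs play", "dogs chase cats"]
bow = CountVectorizer(ngram_range=(1, 2)).fit_transform(docs).toarray()
coarse = PCA(n_components=3).fit_transform(bow)                       # fast coarse reduction first
embedding = TSNE(n_components=2, perplexity=2).fit_transform(coarse)  # slower t-SNE on top
print(embedding.shape)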
          % (name, (time() - t0), estimator.inertia_,
             metrics.homogeneity_score(labels, estimator.labels_),
             metrics.completeness_score(labels, estimator.labels_),
             metrics.v_measure_score(labels, estimator.labels_),
             metrics.adjusted_rand_score(labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(labels, estimator.labels_),
             metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size)))

bench_k_means(KMeans(init='k-means++', n_clusters=n_digits, n_init=10),
              name="k-means++", data=data)

bench_k_means(KMeans(init='random', n_clusters=n_digits, n_init=10),
              name="random", data=data)

# in this case the seeding of the centers is deterministic, hence we run the
# kmeans algorithm only once with n_init=1
pca = PCA(n_components=n_digits).fit(data)
bench_k_means(KMeans(init=pca.components_, n_clusters=n_digits, n_init=1),
              name="PCA-based", data=data)
print(79 * '_')

reduced_data = PCA(n_components=2).fit_transform(data)
kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
kmeans.fit(reduced_data)

# Step size of the mesh. Decrease to increase the quality of the VQ.
h = .02     # point in the mesh [x_min, x_max]x[y_min, y_max].

# Plot the decision boundary. For that, we will assign a color to each
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
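# The snippet stops just before the mesh is used. The usual continuation of this
# scikit-learn k-means/digits example (assumed here, with numpy as np and
# matplotlib.pyplot as plt already imported) labels every mesh point to draw the
# Voronoi cells and overlays the centroids:
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired, aspect='auto', origin='lower')
plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=169,
            linewidths=3, color='w', zorder=10)
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.show()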
def principal_component_analysis(n):
    pca = PCA(n_components=n)
    data = pca.fit_transform(x)
    predictions = cross_val_predict(clf, data, y, cv=10)
    print(metrics.r2_score(y, predictions))
    simple_plot(predictions, "PCA n=" + str(n))
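# Hypothetical driver (assumes the globals x, y, clf and simple_plot defined earlier
# in this script): sweep a few component counts and compare the cross-validated R^2.
for n_components in (2, 5, 10):
    principal_component_analysis(n_components)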