from numpy import around
from sklearn.decomposition import PCA


def PComponent_(train_Set, test_Set, var_Threshold=None, components=None):
    if var_Threshold is None and components is None:
        print("please give a threshold for PComponent - either var threshold or components")
        quit()
    if var_Threshold is not None and components is not None:
        print("give only one threshold")
        quit()
    if var_Threshold is not None:
        pca = PCA()
        pca.fit(train_Set)
        # variance ratio in percentage
        explain_Variance = around(pca.explained_variance_ratio_, decimals=4)
        explain_Variance = explain_Variance.tolist()
        explain_Variance = [x * 100 for x in explain_Variance]
        # cumulative variance
        temp = 0
        for x in range(len(explain_Variance)):
            explain_Variance[x] = temp + explain_Variance[x]
            temp = explain_Variance[x]
        # keep the components whose cumulative variance stays below the threshold
        explain_Variance = [x for x in explain_Variance if x < var_Threshold]
        n_components = len(explain_Variance)
        pca = PCA(n_components=n_components)
        return (pca.fit_transform(train_Set), pca.transform(test_Set))
    else:
        pca = PCA(n_components=components)
        return (pca.fit_transform(train_Set), pca.transform(test_Set))
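# A minimal usage sketch for PComponent_ above (synthetic data; the 95%
# threshold and the shapes are illustrative assumptions, not from the source):
import numpy as np

rng = np.random.RandomState(0)
train = rng.rand(100, 20)
test = rng.rand(30, 20)
# keep the leading components whose cumulative explained variance stays below 95%
train_pca, test_pca = PComponent_(train, test, var_Threshold=95)
# ...or request a fixed number of components instead:
train_pca5, test_pca5 = PComponent_(train, test, components=5)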
import traceback

import numpy as np
import sklearn.naive_bayes as nb
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold


def classify_for_benchmark(data_set_df, user_info_df, features, label='gender', classifier=None, num=None):
    instance_num = len(data_set_df.columns)
    x = data_set_df.loc[features]
    x = x.dropna(how='all', axis=0)
    x = x.dropna(how='all', axis=1)
    # impute each feature (row) with its most frequent value
    imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    x_replaced = x.replace([np.inf, -np.inf], np.nan)
    x_imp = imp.fit_transform(x_replaced.T).T
    y = user_info_df.get(label)
    y_filtered = y[list(map(int, x.columns.values))]
    clf = nb.BernoulliNB() if classifier is None else classifier
    cv_num = min(len(y_filtered), 10)
    if cv_num <= 1 or len(y_filtered.unique()) <= 1:
        return 0.0, 100.0
    final_score = 0.0
    for i in range(100):
        score = 0.0
        cnt = 0
        skf = StratifiedKFold(n_splits=cv_num, shuffle=True)
        for tr_index, te_index in skf.split(x_imp.T, y_filtered):
            x_train, x_test = x_imp.T[tr_index], x_imp.T[te_index]
            y_train, y_test = y_filtered.iloc[tr_index], y_filtered.iloc[te_index]
            dims = [len(x_train), len(x_train.T), len(x_test), len(x_test.T)]
            if num is not None:
                dims.append(num)
            min_num = min(dims)
            pca = PCA(min_num)
            x_train = pca.fit_transform(x_train)
            # project the test fold with the PCA fitted on the training fold
            x_test = pca.transform(x_test)
            try:
                clf.fit(x_train, y_train)
                score += clf.score(x_test, y_test)
                cnt += 1
                # cv_score = cross_validation.cross_val_score(clf, x_imp.T, y_filtered, cv=cv_num)
            except ValueError:
                traceback.print_exc()
                print(i, "why error? skip!")
        if cnt > 0:
            score /= cnt
            print(i, score)
        else:
            return 0.0, (float(instance_num - len(y_filtered)) / instance_num)
        final_score += score
    final_score /= 100
    miss_clf_rate = float(instance_num - len(y_filtered)) / instance_num
    return final_score, miss_clf_rate
def cross_validate(self, train_x, train_y, test_x, test_y, **params):
    if not params:
        params = {"dummy": [0]}
    keys, values = list(zip(*list(params.items())))
    for param_list in itertools.product(*values):
        cv_params = list(self.params.items()) + list(zip(keys, param_list))
        for use_pca in (False, True):
            if self.have_tested(cv_params, use_pca):
                continue
            if use_pca:
                # keep enough components to explain 99% of the variance
                pca = PCA(n_components=0.99)
                proc_train_x = pca.fit_transform(train_x)
                proc_test_x = pca.transform(test_x)
            else:
                proc_train_x = train_x
                proc_test_x = test_x
            if "dummy" in params:
                model = self.func().fit(proc_train_x, train_y)
            else:
                model = self.func(**dict(cv_params)).fit(proc_train_x, train_y)
            predictions = model.predict_proba(proc_test_x)
            if len(predictions.shape) == 2:
                predictions = predictions[:, 1]
            num_right = (test_y == predictions.round()).sum()
            self.json["tests"].append({})
            test_data = self.json["tests"][-1]
            test_data["use_pca"] = use_pca
            test_data["pct_right"] = 100 * num_right / float(len(test_y))
            test_data["loss"] = log_loss(test_y, predictions)
            test_data["num_right"] = num_right
            test_data["num_tests"] = len(test_y)
            test_data["params"] = dict(cv_params)
            self._write()
            print(self.print_test(test_data))
def write_predictions(self, model):
    if not os.path.exists(self.pred_dir):
        os.mkdir(self.pred_dir)
    raw_train_x, train_y = features_labels(self.season + 1)
    scaler = StandardScaler()
    train_x = scaler.fit_transform(raw_train_x)
    pca = PCA()
    if model.json.get("use_pca", False):
        train_x = pca.fit_transform(train_x)
    clf = model.func(**model.best_params()["params"]).fit(train_x, train_y)
    features, ids = self.get_features_and_ids()
    features = scaler.transform(features)
    if model.json.get("use_pca", False):
        features = pca.transform(features)
    predictions = clf.predict_proba(features)
    if len(predictions.shape) == 2:
        predictions = predictions[:, 1]
    with open(self.pred_path, 'w') as buff:
        buff.write("id,pred\n")
        for (label, pred) in zip(ids, predictions):
            buff.write("{:s},{:s}\n".format(label, str(pred)))
class PCACCLayer(Layer):
    def __init__(self, n_out):
        self.pca = PCA(n_components=n_out)

    def get_train_output_for(self, inputX):
        batches, n_in, rows, cols = inputX.shape
        # normalization
        # inputX = norm4d(inputX)
        # inputX, self.P1 = whiten4d(inputX)
        myUtils.visual.save_map(inputX[[10, 100, 1000]], dir_name, 'norm4d')
        # flatten the spatial grid so PCA sees one row per pixel position
        inputX = inputX.transpose((0, 2, 3, 1)).reshape((-1, n_in))
        outputX = self.pca.fit_transform(inputX)
        outputX = outputX.reshape((batches, rows, cols, -1)).transpose((0, 3, 1, 2))
        myUtils.visual.save_map(outputX[[10, 100, 1000]], dir_name, 'pca')
        return outputX

    def get_test_output_for(self, inputX):
        batches, n_in, rows, cols = inputX.shape
        # normalization
        # inputX = norm4d(inputX)
        # inputX = whiten4d(inputX, self.P1)
        myUtils.visual.save_map(inputX[[10, 100, 1000]], dir_name, 'norm4dte')
        inputX = inputX.transpose((0, 2, 3, 1)).reshape((-1, n_in))
        outputX = self.pca.transform(inputX)
        outputX = outputX.reshape((batches, rows, cols, -1)).transpose((0, 3, 1, 2))
        myUtils.visual.save_map(outputX[[10, 100, 1000]], dir_name, 'pcate')
        return outputX
def recommender_system_using_svd_pca(user_id, user_movies, movie_tag_vector, genome_tags, model):
    movies_watched = list(user_movies[user_id])
    movies_watched_tags = {}
    for movie in movies_watched:
        movie_tags = movie_tag_vector[movie]
        for tag in list(movie_tags.keys()):
            movies_watched_tags[tag] = 1
    tags = list(movies_watched_tags.keys())
    # tags = list(genome_tags.keys())
    all_movies = list(movie_tag_vector.keys())
    all_movies, movie_tag_matrix = build_movie_tag_matrix(all_movies, tags, movie_tag_vector)
    if model == 'PCA':
        pca = PCA(n_components=min(10, len(tags)))
        U = pca.fit_transform(movie_tag_matrix)
    else:
        U, S, Vt = np.linalg.svd(movie_tag_matrix, full_matrices=False)
    watched_indexed, U_watched, rest_indexed, U_rest = split_output(U, movies_watched, all_movies)
    similarity_mapping = get_similarity_mapping(watched_indexed, U_watched, rest_indexed, U_rest)
    weighted_similarities = weigh_similarities(user_id, similarity_mapping)
    return weighted_similarities
import matplotlib.pyplot as plt
from sklearn import cluster
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE


def build_images_KMeans(spectra, spectrum_columns, spectra_distances,
                        TSNE_learning_rate=500, TSNE_n_iter=1500, TSNE_learning_rate2=300):
    colors = ['red', 'black']
    c = spectra['marked'].apply(lambda x: colors[x])
    plt.subplots(figsize=(18, 6))
    plt.subplot(131)
    plt.title("PCA")
    pca = PCA(n_components=2, random_state=42)
    spectra_2D = pca.fit_transform(spectra[spectrum_columns])
    plt.scatter(spectra_2D[:, 0], spectra_2D[:, 1], c=c, alpha=0.5)
    plt.subplot(132)
    plt.title("TSNE, Euclidean distance")
    tsne = TSNE(n_components=2, random_state=42, learning_rate=TSNE_learning_rate, n_iter=TSNE_n_iter)
    spectra_2D = tsne.fit_transform(spectra[spectrum_columns])
    plt.scatter(spectra_2D[:, 0], spectra_2D[:, 1], c=c, alpha=0.5)
    plt.subplot(133)
    plt.title("TSNE, Chosen distance")
    tsne = TSNE(n_components=2, random_state=42, metric="precomputed", learning_rate=TSNE_learning_rate2)
    spectra_2D = tsne.fit_transform(spectra_distances)
    plt.scatter(spectra_2D[:, 0], spectra_2D[:, 1], c=c, alpha=0.5)
    plt.show()

    # visualization - tsne with chosen distance
    print('Clustering')
    plt.subplots(figsize=(18, 12))
    plt.subplot(3, 3, 1)
    colors = ['red', 'black']
    c = spectra['marked'].apply(lambda x: colors[x])
    plt.title("true labels")
    plt.scatter(spectra_2D[:, 0], spectra_2D[:, 1], c=c, alpha=0.5)
    colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple',
              'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'tab:cyan']
    for n in range(2, 10):
        kmeans = cluster.KMeans(n_clusters=n, random_state=42)
        cluster_labels = kmeans.fit_predict(spectra_distances)
        plt.subplot(3, 3, n)
        c = [colors[l] for l in cluster_labels]
        plt.title("cluster labels ({} clusters)".format(n))
        plt.scatter(spectra_2D[:, 0], spectra_2D[:, 1], c=c, alpha=0.5)
    plt.show()
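# Hypothetical usage sketch for build_images_KMeans (all data synthetic and all
# shapes assumed): spectra carry a binary 'marked' column, and the third
# argument is a precomputed square distance matrix.
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform

rng = np.random.RandomState(42)
spectrum_columns = ['bin_{}'.format(i) for i in range(50)]
spectra = pd.DataFrame(rng.rand(200, 50), columns=spectrum_columns)
spectra['marked'] = rng.randint(0, 2, 200)
spectra_distances = squareform(pdist(spectra[spectrum_columns]))
build_images_KMeans(spectra, spectrum_columns, spectra_distances)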
class LogisticClassifier(object):
    def __init__(self, learning_rate=0.01, reg=0., momentum=0.5):
        self.classifier = LogisticRegression(learning_rate, reg, momentum)
        self.pca = None
        self.scaler = None

    def sgd_optimize(self, data, n_epochs, mini_batch_size):
        data = self._preprocess_data(data)
        sgd_optimization(data, self.classifier, n_epochs, mini_batch_size)

    def _preprocess_data(self, data):
        # center data and scale to unit std
        if self.scaler is None:
            self.scaler = StandardScaler()
            data = self.scaler.fit_transform(data)
        else:
            data = self.scaler.transform(data)
        if self.pca is None:
            # use Minka's MLE to guess an appropriate dimension
            self.pca = PCA(n_components='mle')
            data = self.pca.fit_transform(data)
        else:
            data = self.pca.transform(data)
        return data
class EnsembleModel:
    def __init__(self, models, **params):
        self.models = models.values()
        self.model_funcs = [j.model for j in models.values()]
        self.params = params
        self._pca = PCA(n_components=0.99)
        self._clf = None

    def fit(self, x, y):
        train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2)
        pca_train_x = self._pca.fit_transform(train_x)
        pca_test_x = self._pca.transform(test_x)
        for model, model_func in zip(self.models, self.model_funcs):
            # pick PCA-projected features only for the models trained on them,
            # without clobbering the raw arrays for the remaining models
            if model.json.get("use_pca", False):
                cur_train_x = pca_train_x
            else:
                cur_train_x = train_x
            model_func.fit(cur_train_x, train_y)
        self._fit_meta_estimator(test_x, test_y)
        return self

    def _fit_meta_estimator(self, x, y):
        predictions = self._predictions(x).T
        y = numpy.atleast_2d(y).T
        # label each sample with the index of the model whose prediction was closest
        labels = numpy.argmin(abs(predictions - y * numpy.ones((1, predictions.shape[1]))), 1)
        self._clf = GaussianNB().fit(x, labels)

    def _predictions(self, x):
        pca_x = self._pca.transform(x)
        predictions = []
        weights = []
        for model, model_func in zip(self.models, self.model_funcs):
            if model.json.get("use_pca", False):
                test_x = pca_x
            else:
                test_x = x
            predictions.append(model_func.predict_proba(test_x)[:, 1])
            weights.append(model.best_params()["loss"])
        return numpy.array(predictions)

    def predict_proba(self, x):
        blend = self.params.get("blend", "mean")
        predictions = self._predictions(x)
        if blend == "median":
            return numpy.median(predictions, 0)
        if blend == "meta":
            probs = self._clf.predict_proba(x)
            preds = []
            for row, prob in zip(predictions.T, probs):
                if max(prob) > 0.99:
                    preds.append(row[numpy.argmax(prob)])
                else:
                    preds.append(numpy.median(row))
            return numpy.array(preds)
        return predictions.mean(0)
def plot_pca_2d(data, ax=None, fpath='', show_fig=False):
    ax = _get_ax(ax)
    pca_2d = PCA(n_components=2)
    data_hat = pca_2d.fit_transform(data)
    ax.scatter(data_hat[:, 0], data_hat[:, 1])
    _save_show_fig(fpath, show_fig)
    return pca_2d
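# A quick usage sketch for plot_pca_2d (synthetic data; relies on the module's
# own _get_ax and _save_show_fig helpers being importable):
import numpy as np

data = np.random.rand(150, 8)
pca_2d = plot_pca_2d(data, show_fig=True)
print(pca_2d.explained_variance_ratio_)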
def test_pass_pca_corr_pca_out():
    X, y = iris_data()
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)
    eigen = pca.explained_variance_
    plot_pca_correlation_graph(X, variables_names=['1', '2', '3', '4'],
                               X_pca=X_pca, explained_variance=eigen)
def pca_scatter2d(data: np.ndarray,
                  labels: Optional[np.ndarray] = None,
                  label_mapping: Optional[Mapping[int, str]] = None,
                  ax=None, fpath='', show_fig=False, title=None, **kwargs):
    pca_2d = PCA(n_components=2)
    data_hat = pca_2d.fit_transform(data)
    scatter2d(data_hat, labels, label_mapping, ax, fpath, show_fig, title, **kwargs)
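# A usage sketch for pca_scatter2d (synthetic data and labels; the label names
# are illustrative assumptions, and the module's scatter2d helper must exist):
import numpy as np

data = np.random.rand(120, 6)
labels = np.random.randint(0, 3, 120)
pca_scatter2d(data, labels, label_mapping={0: 'a', 1: 'b', 2: 'c'}, show_fig=True)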
def build_images_KMeans(spectra, spectrum_columns, spectra_distances, colors,
                        TSNE_learning_rate=500, TSNE_n_iter=1500, TSNE_learning_rate2=300):
    markers = ['x', 'o']  # one marker style per 'marked' value
    cols = spectra['marked'].apply(lambda x: colors[x])
    col = spectra['marked']
    plt.subplots(figsize=(18, 6))
    plt.subplot(131)
    plt.title("PCA")
    pca = PCA(n_components=2, random_state=42)
    spectra_2D = pca.fit_transform(spectra[spectrum_columns])
    for i in range(len(spectra_2D)):
        plt.scatter(spectra_2D[i, 0], spectra_2D[i, 1], c=cols[i], alpha=0.5,
                    marker=markers[col[i]], s=markersizes[col[i]])
    plt.subplot(132)
    plt.title("TSNE, Euclidean distance")
    tsne = TSNE(n_components=2, random_state=42, learning_rate=TSNE_learning_rate, n_iter=TSNE_n_iter)
    spectra_2D = tsne.fit_transform(spectra[spectrum_columns])
    for i in range(len(col)):
        plt.scatter(spectra_2D[i, 0], spectra_2D[i, 1], c=cols[i], alpha=0.5,
                    marker=markers[col[i]], s=markersizes[col[i]])
    plt.subplot(133)
    plt.title("TSNE, Chosen distance")
    tsne = TSNE(n_components=2, random_state=42, metric="precomputed", learning_rate=TSNE_learning_rate2)
    spectra_2D = tsne.fit_transform(spectra_distances)
    for i in range(len(col)):
        plt.scatter(spectra_2D[i, 0], spectra_2D[i, 1], c=cols[i], alpha=0.5,
                    marker=markers[col[i]], s=markersizes[col[i]])

    # visualization - tsne with chosen distance
    print('Clustering')
    plt.subplots(figsize=(18, 12))
    for n in range(2, 10):
        kmeans = cluster.KMeans(n_clusters=n, random_state=42)
        cluster_labels = kmeans.fit_predict(spectra_distances)
        plt.subplot(3, 3, n - 1)
        cols = [colors[l] for l in cluster_labels]
        plt.title("cluster labels ({} clusters)".format(n))
        for i in range(len(col)):
            plt.scatter(spectra_2D[i, 0], spectra_2D[i, 1], c=cols[i], alpha=0.5,
                        marker=markers[col[i]], s=markersizes[col[i]])
    plt.show()
    return spectra_2D
def compute_pca(self, model):
    X = model.wv.vectors
    #self.plot_matrix(X, 'Before')
    scaler = preprocessing.StandardScaler()
    X_scaled = scaler.fit_transform(X)
    #self.plot_matrix(X_scaled, 'After')
    pca = PCA(n_components=2)
    principalComponents = pca.fit_transform(X_scaled)
    print(pca.explained_variance_ratio_)
    self.scatterplot(principalComponents)
    return principalComponents
def test_X_PCA_but_no_explained_variance():
    with pytest.raises(ValueError,
                       match='If `X_pca` is not None, the `explained variance` '
                             'values should not be `None`.'):
        X, y = iris_data()
        pca = PCA(n_components=2)
        X_pca = pca.fit_transform(X)
        plot_pca_correlation_graph(X, variables_names=['1', '2', '3', '4'],
                                   X_pca=X_pca, explained_variance=None)
import logging

import pandas as pd
from sklearn.decomposition import PCA


def apply_pca(X_train, X_test, pca_thresh):
    """Apply principal component analysis to reduce dimensionality of feature vectors."""
    feature_labels = X_train.columns
    pca = PCA(n_components=pca_thresh)
    shape_orig = X_train.shape
    X_train = pca.fit_transform(X_train)
    shape_reduced = X_train.shape
    X_test = pca.transform(X_test)
    logging.info("reduced dimensionality from {} to {}".format(shape_orig, shape_reduced))
    rows = ["PC-{}".format(i) for i in range(len(pca.components_))]
    pcs = pd.DataFrame(pca.components_, columns=feature_labels, index=rows)
    return X_train, X_test, pcs
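# A usage sketch for apply_pca (synthetic DataFrames; passing 0.95 as
# pca_thresh asks sklearn's PCA to keep enough components for 95% of the
# variance, which is one plausible way to call this helper):
import numpy as np
import pandas as pd

cols = ['f{}'.format(i) for i in range(10)]
X_train = pd.DataFrame(np.random.rand(80, 10), columns=cols)
X_test = pd.DataFrame(np.random.rand(20, 10), columns=cols)
X_train_red, X_test_red, pcs = apply_pca(X_train, X_test, pca_thresh=0.95)
print(pcs)  # loadings of each principal component on the original features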
def find_distributions(query_areas, query_energies, binned_image, n_classes=2,
                       bimages_path='/home/fin/Documents/Timepix/particle-tk/datagen/images.pkl',
                       segments_path='/home/fin/Documents/Timepix/particle-tk/datagen/segments.pkl'):
    # Load binned images and segments
    b_im = pkl.load(open(bimages_path, 'rb'))
    segments = pkl.load(open(segments_path, 'rb'))
    reductor = PCA(n_components=3)
    b_im = reductor.fit_transform(b_im)
    queried_binned_image = reductor.transform(binned_image.reshape(1, -1))
    areas = [[] for i in range(0, n_classes)]
    pixel_energies = [[] for i in range(0, n_classes)]
    binned_images = [[] for i in range(0, n_classes)]
    binned_images_energies = [[] for i in range(0, n_classes)]
    for segment in segments:
        for lbl in range(1, n_classes + 1):
            if segment.get_metadata('label') == lbl:
                areas[lbl - 1].append(area(segment.get_bitmap()))
                nonzeroE = segment.get_bitmap().flatten()[segment.get_bitmap().flatten() > 0]
                for e in nonzeroE:
                    pixel_energies[lbl - 1].append(e)
                    binned_images_energies[lbl - 1].append(b_im[segment.get_metadata('parent_im_id')])
                binned_images[lbl - 1].append(b_im[segment.get_metadata('parent_im_id')])
                break
    # Estimation of size density given image
    sizes = list()  # for each particle type, one array of sizes
    sizes.append(np.linspace(0, 20, 100))
    sizes.append(np.linspace(0, 10, 100))
    energies = list()
    energies.append(np.linspace(0, 400, 100))
    energies.append(np.linspace(0, 400, 100))
    p_SgX = list()
    p_EgX = list()
    for lbl in range(1, n_classes + 1):
        print(areas[lbl - 1])
        estimator_P_SgX = estimate_P_SgX(areas[lbl - 1], binned_images[lbl - 1])
        estimator_P_EgX = estimate_P_SgX(pixel_energies[lbl - 1], binned_images_energies[lbl - 1])
        p_SgX.append(estimator_P_SgX.score_samples(
            query_areas[lbl - 1, :],
            np.repeat(np.atleast_2d(queried_binned_image), query_areas[lbl - 1, :].shape[0], axis=0)))
        p_EgX.append(estimator_P_EgX.score_samples(
            query_energies[lbl - 1, :],
            np.repeat(np.atleast_2d(queried_binned_image), query_energies[lbl - 1, :].shape[0], axis=0)))
    return np.array(p_SgX), np.array(p_EgX)
def test_not_enough_components():
    s = 'Number of principal components must match the number of eigenvalues. Got 2 != 1'
    with pytest.raises(ValueError, match=s):
        X, y = iris_data()
        pca = PCA(n_components=2)
        X_pca = pca.fit_transform(X)
        eigen = pca.explained_variance_
        plot_pca_correlation_graph(X, variables_names=['1', '2', '3', '4'],
                                   X_pca=X_pca, explained_variance=eigen[:-1])
def do_pca(X, c=3):
    """Do PCA"""
    from sklearn import preprocessing
    from sklearn.decomposition import PCA

    # standardize the data
    #S = standardize_data(X)
    S = pd.DataFrame(preprocessing.scale(X), columns=X.columns)
    pca = PCA(n_components=c)
    pca.fit(S)
    print(pca.explained_variance_ratio_)
    # component loadings per feature
    w = pd.DataFrame(pca.components_, columns=S.columns)
    #print(w.T.max(1).sort_values())
    pX = pca.fit_transform(S)
    pX = pd.DataFrame(pX, index=X.index)
    return pX
def build_images_DBSCAN(spectra, spectrum_columns, spectra_distances, colors,
                        eps_l=[0.01 * i for i in range(12, 0, -2)],
                        TSNE_learning_rate=500, TSNE_n_iter=1500, TSNE_learning_rate2=300):
    markers = ['x', 'o']
    cols = spectra['marked'].apply(lambda x: colors[x])
    col = spectra['marked']
    plt.subplots(figsize=(18, 6))
    plt.subplot(131)
    plt.title("PCA")
    pca = PCA(n_components=2, random_state=42)
    spectra_2D = pca.fit_transform(spectra[spectrum_columns])
    for i in range(len(col)):
        plt.scatter(spectra_2D[i, 0], spectra_2D[i, 1], c=cols[i], alpha=0.5,
                    marker=markers[col[i]], s=markersizes[col[i]])
    plt.subplot(132)
    plt.title("TSNE, Euclidean distance")
    tsne = TSNE(n_components=2, random_state=42, learning_rate=TSNE_learning_rate, n_iter=TSNE_n_iter)
    spectra_2D = tsne.fit_transform(spectra[spectrum_columns])
    for i in range(len(col)):
        plt.scatter(spectra_2D[i, 0], spectra_2D[i, 1], c=cols[i], alpha=0.5,
                    marker=markers[col[i]], s=markersizes[col[i]])
    plt.subplot(133)
    plt.title("TSNE, Chosen distance")
    tsne = TSNE(n_components=2, random_state=42, metric="precomputed", learning_rate=TSNE_learning_rate2)
    spectra_2D = tsne.fit_transform(spectra_distances)
    for i in range(len(col)):
        plt.scatter(spectra_2D[i, 0], spectra_2D[i, 1], c=cols[i], alpha=0.5,
                    marker=markers[col[i]], s=markersizes[col[i]])
    plt.show()

    # visualization - tsne with chosen distance
    print('Clustering')
    plt.subplots(figsize=(18, 18))
    for i, eps in enumerate(eps_l):
        dbscan = cluster.DBSCAN(eps=eps, min_samples=4)
        cluster_labels = dbscan.fit_predict(spectra_distances)
        plt.subplot(3, 3, i + 1)
        cols = [colors[l] for l in cluster_labels]
        plt.title("cluster labels (eps = {:.2})".format(eps))
        for j in range(len(col)):
            plt.scatter(spectra_2D[j, 0], spectra_2D[j, 1], c=cols[j], alpha=0.5,
                        marker=markers[col[j]], s=markersizes[col[j]])
    plt.show()
    return spectra_2D
def test_pipeline_transform():
    # Test whether pipeline works with a transformer at the end.
    # Also test pipeline.transform and pipeline.inverse_transform
    iris = load_iris()
    X = iris.data
    pca = PCA(n_components=2)
    pipeline = Pipeline([('pca', pca)])

    # test transform and fit_transform:
    X_trans = pipeline.fit(X).transform(X)
    X_trans2 = pipeline.fit_transform(X)
    X_trans3 = pca.fit_transform(X)
    assert_array_almost_equal(X_trans, X_trans2)
    assert_array_almost_equal(X_trans, X_trans3)

    X_back = pipeline.inverse_transform(X_trans)
    X_back2 = pca.inverse_transform(X_trans)
    assert_array_almost_equal(X_back, X_back2)
def do_pca(X, c=3):
    """Do PCA"""
    from sklearn import preprocessing
    from sklearn.decomposition import PCA

    # remove non-numeric columns, then standardize
    X = X._get_numeric_data()
    S = pd.DataFrame(preprocessing.scale(X), columns=X.columns)
    pca = PCA(n_components=c)
    pca.fit(S)
    out = 'explained variance %s' % pca.explained_variance_ratio_
    print(out)
    # component loadings per feature
    w = pd.DataFrame(pca.components_, columns=S.columns)
    #print(w.T.max(1).sort_values())
    pX = pca.fit_transform(S)
    pX = pd.DataFrame(pX, index=X.index)
    return pX, pca
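# A usage sketch for do_pca above (iris as a stand-in dataset; any numeric
# DataFrame would do):
import pandas as pd
from sklearn.datasets import load_iris

iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
pX, pca = do_pca(df, c=2)
print(pX.head())
print(pca.explained_variance_ratio_)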
def plot(self):
    from functools import reduce

    pca = PCA(n_components=2)
    df = pca.fit_transform(self.data.values)
    df = pd.DataFrame(df, index=self.data.index)
    ids = sorted(list(self.data.columns))
    df = df.reset_index()
    fig = plt.figure()
    ax = sns.scatterplot(x=0, y=1, data=df, hue='class', edgecolor='black')
    sns.despine(fig=fig, top=True, right=True)
    plt.xlabel('PC1 ({}%)'.format(round(pca.explained_variance_ratio_[0] * 100, 2)))
    plt.ylabel('PC2 ({}%)'.format(round(pca.explained_variance_ratio_[1] * 100, 2)))
    annot = reduce(lambda x, y: f'{x}\n{y}', ids)
    plt.title("Silhouette Score={}".format(round(self.score, 3)))
    plt.annotate('Proteins\n' + annot, xycoords=ax.transAxes, xy=(1, 0.1))
    if self.filename is None:
        plt.show()
    else:
        plt.savefig(self.filename, dpi=300, bbox_inches='tight')
        print('Results saved to "{}"'.format(self.filename))
class Classifier_pca(Layer):
    def __init__(self, C, n_times):
        self.C = C
        self.n_times = n_times
        self.pca = PCA(n_components=0.99)

    def get_train_output_for(self, inputX, inputy=None):
        inputX = self.pca.fit_transform(inputX)
        n_hidden = int(self.n_times * inputX.shape[1])
        self.W = init.GlorotNormal().sample((inputX.shape[1], n_hidden))
        self.b = init.Normal().sample(n_hidden)
        H = dotbiasact_decomp(inputX, self.W, self.b)
        self.beta = compute_beta(H, inputy, self.C)
        out = dot_decomp(H, self.beta)
        return out

    def get_test_output_for(self, inputX):
        inputX = self.pca.transform(inputX)
        H = dotbiasact_decomp(inputX, self.W, self.b)
        out = dot_decomp(H, self.beta)
        return out
def compute_pca_explain_power(self, model, num_dims=2):
    '''
    Given a computed course-vectors embedding model, extract the
    vectors, normalize them, perform a `num_dims`-dimensional PCA,
    and return each dimension's explained-variance ratio.

    @param model: course context model as trained by the neural net
    @type model: gensim.models.word_vectors
    @return: ratio of explained variance for each of the dimensions
    @rtype: tuple of float
    '''
    vectors = model.wv.vectors
    # standardization is skipped; the vectors are only L2-normalized
    #vectors_standardized = preprocessing.scale(vectors)
    vectors_standardized = vectors
    vectors_standardized_normalized = preprocessing.normalize(vectors_standardized, norm='l2')
    pca = PCA(n_components=num_dims)
    _principalComponents = pca.fit_transform(vectors_standardized_normalized)
    explained_variance_ratios = pca.explained_variance_ratio_
    return explained_variance_ratios
X_test = pd.read_csv('Data/test.csv', header=None).to_numpy()
y = pd.read_csv('Data/trainLabels.csv', header=None)[0].to_numpy()
X = pd.read_csv('Data/train.csv', header=None).to_numpy()

# 2-D PCA on train + test combined, for visualization
pca2 = PCA(n_components=2, whiten=True)
pca2.fit(np.r_[X, X_test])
X_pca = pca2.transform(X)
i0 = np.argwhere(y == 0)[:, 0]
i1 = np.argwhere(y == 1)[:, 0]
X0 = X_pca[i0, :]
X1 = X_pca[i1, :]
plt.plot(X0[:, 0], X0[:, 1], 'ro')
plt.plot(X1[:, 0], X1[:, 1], 'b*')

# full PCA to inspect the variance spectrum
pca = PCA(whiten=True)
X_all = pca.fit_transform(np.r_[X, X_test])
print(pca.explained_variance_ratio_)


def kde_plot(x):
    from scipy.stats import gaussian_kde
    kde = gaussian_kde(x)
    positions = np.linspace(x.min(), x.max())
    smoothed = kde(positions)
    plt.plot(positions, smoothed)


def qq_plot(x):
    from scipy.stats import probplot
    probplot(x, dist='norm', plot=plt)


kde_plot(X_all[:, 0])
kde_plot(X_all[:, 2])
                   sep='\s+', header=None, names=cols)

#%%
X = data.iloc[:, 1:8].values
X = StandardScaler().fit_transform(X)
y = data.iloc[:, 8:]

#%%
data.groupby('Target')['Name'].nunique().plot(kind='bar')
plt.show()

#%%
from pandas.plotting import scatter_matrix
scatter_matrix(data, alpha=0.2, figsize=(12, 12), diagonal='kde')

#%%
model = PCA(n_components=3)
principle_comps = model.fit_transform(X)

#%%
principle_df = pd.DataFrame(data=principle_comps, columns=["PC1", "PC2", "PC3"])
final_df = pd.concat([principle_df, y], axis=1)

#%%
new_y = [entry[0] for entry in y.values.tolist()]

#%%
labels = set(new_y)
print(labels)

#%%
sns.lmplot(x='PC1', y='PC2', data=final_df, hue='Target', fit_reg=False, height=10)
plt.subplot(len(datasets), len(clustering_algorithms), plot_num)
if i_dataset == 0:
    plt.title(name, size=18)
# colours = numpy.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a',
#                                          '#f781bf', '#a65628', '#984ea3',
#                                          '#999999', '#e41a1c', '#dede00']),
#                                    int(max(y_pred) + 1))))
import random

# one random colour per predicted cluster
r = lambda: random.randint(0, 255)
colours = numpy.array(['#%02X%02X%02X' % (r(), r(), r()) for i in range(max(y_pred) + 1)])
print(colours)
pca = PCA()
to_plot = pca.fit_transform(X)
c = Counter(y_pred)
print(c.most_common())
exp_mx.loc[:, "cluster"] = y_pred
exp_mx.to_csv("exp_w_clusters.csv")
plt.scatter(to_plot[:, 0], to_plot[:, 1], alpha=0.5, c=colours[y_pred])
# plt.xlim(-2.5, 2.5)
# plt.ylim(-2.5, 2.5)
plt.xticks(())
plt.yticks(())
plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
        df_info = pd.read_excel(os.path.join(d, f))
        df_info["Instrument"] = pd.Series([d] * len(df_info), index=df_info.index)
    else:
        df = pd.read_excel(os.path.join(d, f))
        df["Instrument"] = pd.Series([d] * len(df), index=df.index)
        df_info = df_info.append(df, ignore_index=True)

# %% Apply PCA, plot some components, see variance explained etc.
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

mdl = PCA()
new_data = mdl.fit_transform(df_seg)
p = df_info["Instrument"] == "Piano"
v = df_info["Instrument"] == "Violin"

plt.figure()
plt.scatter(new_data[p, 0], new_data[p, 1], label="Piano")
plt.scatter(new_data[v, 0], new_data[v, 1], label="Violin")
plt.legend()
plt.grid(True)
plt.savefig("pca_2d.png", dpi=300, transparent=True)

plt.figure()
plt.plot(mdl.explained_variance_)
plt.grid(True)
plt.savefig("explained_variance.png", dpi=300, transparent=True)
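# A sketch of how one might pick a component count from the curve just saved:
# keep the smallest number of components reaching 95% cumulative explained
# variance (the 95% threshold is an illustrative assumption).
import numpy as np

cumvar = np.cumsum(mdl.explained_variance_ratio_)
n_keep = int(np.searchsorted(cumvar, 0.95) + 1)
print("components needed for 95% variance:", n_keep)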
df = pd.read_pickle(data_path / "1_koinworks_cleaned.pkl")
df = df[["id", "date", "username", "cleaned", "tweet", "name"]]
df["date"] = pd.to_datetime(df["date"])
print(f"before drop duplicate: {len(df)}")
df = df.drop_duplicates(subset=["cleaned"])
print(f"after drop duplicate: {len(df)}")
df.dropna(inplace=True)

# TFIDF embeddings
vectorizer = TfidfVectorizer()
pca = PCA(n_components=2, svd_solver="full")
um = UMAP(n_components=5, n_neighbors=15, metric='euclidean')
X = vectorizer.fit_transform(df.cleaned.values)
X_umap = um.fit_transform(X.toarray())
X_pca = pca.fit_transform(X.toarray())
df["pca"] = [a for a in X_pca]
um = UMAP(n_components=2, n_neighbors=15, metric='euclidean')
df["umap_2d"] = [a for a in um.fit_transform(X)]
df["umap"] = [a for a in X_umap]
df["tfidf"] = X
df.to_pickle(data_path / "2_koinworks_fix.pkl")

tweets = df.cleaned.values
x, y = train_test_split(tweets)
y_test, y_val = train_test_split(y)
with open(data_path / "flair_format/train/train.txt", "w") as f:
    for t in tweets:
        f.write(f"{t}\n")
with open(data_path / "flair_format/test.txt", "w") as f:
    for t in y_test:
        f.write(f"{t}\n")
# eng = matlab.engine.start_matlab()
X = []
for i in range(1, 30):
    # each file holds space-separated floats; pad every sample to 5000*9 values
    infile = open('J_Left/' + str(i))
    mylist = []
    for line in infile:
        line = line[:-1]
        temp = line.split(' ')
        for j in range(len(temp) - 1):
            mylist.append(float(temp[j]))
    mylist = mylist + [0] * (5000 * 9 - len(mylist))
    X.append(mylist)
print('len of X', len(X))

pca = PCA(n_components=4)
t = pca.fit_transform(X)
l = []
for v in t:
    arr = []
    for e in v:
        arr.append(float(e))
    l.append(arr)
# ret = eng.moh_pca(l)
print(l)
print(type(l))
print('len of t', len(t))
gc = GridSearchCV(estimator=Lasso(), param_grid=gs_params)
lasso_model = gc.fit(X_train, Y_train)
Y_pred = lasso_model.predict(X_test)
best_alpha = lasso_model.best_params_['alpha']
print('The best lasso model is obtained with an alpha of', best_alpha)
print('RMSE of the best standardized lasso regression model:',
      mean_squared_error(Y_test, Y_pred) ** 0.5)

# split data into train, dev and test
# todo: do this before, so that we have errors measured on the same test set
for i in range(1, 9):
    n_components = 2 ** i
    pca = PCA(n_components=n_components)
    X_reduced_train = pca.fit_transform(X_train)
    X_reduced_test = pca.transform(X_test)

    vanilla_lr = LinearRegression().fit(X_reduced_train, Y_train)
    Y_pred = vanilla_lr.predict(X_reduced_test)
    print('RMSE for', n_components, 'components with LR',
          mean_squared_error(Y_test, Y_pred) ** 0.5)

    gs_params = {'alpha': [2 ** i for i in range(-10, 20)]}
    gc = GridSearchCV(estimator=Ridge(), param_grid=gs_params)
    ridge_model = gc.fit(X_reduced_train, Y_train)
    Y_pred = ridge_model.predict(X_reduced_test)
    print('RMSE for', n_components, 'components with Ridge',
          mean_squared_error(Y_test, Y_pred) ** 0.5)
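# A sketch of an alternative to the manual loop above: search n_components and
# alpha jointly with a Pipeline, so PCA is refit inside each CV split. The grid
# values mirror the loop above; the scoring choice is an assumption.
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

pipe = Pipeline([('pca', PCA()), ('ridge', Ridge())])
grid = {'pca__n_components': [2 ** i for i in range(1, 9)],
        'ridge__alpha': [2.0 ** i for i in range(-10, 20)]}
search = GridSearchCV(pipe, grid, scoring='neg_mean_squared_error')
search.fit(X_train, Y_train)
print(search.best_params_)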
# plt.show()
plt.draw()
# plt.savefig("some_digits.png")

### Preprocessing data ###
# Standardize features by scaling them to the range (0,1)
# Note: this standardization is often used as an alternative to
# zero mean, unit variance scaling (performed with sklearn.preprocessing.scale)
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_minmax = min_max_scaler.fit_transform(train)

# PCA
n_pc = 78  # principal components to keep
pca = PCA(n_components=n_pc, whiten=True)  # whitening to remove correlations
train_pca = pca.fit_transform(train_minmax)

# plot all the principal components with their relative explained variance
features = [x for x in range(1, n_pc + 1)]
plt.figure(2)
# percentage of variance explained by each of the selected components;
# the sum of all explained variances is equal to 1.0
# plt.plot(features, pca.explained_variance_ratio_, 'g--', marker='o')
plt.semilogy(features, pca.explained_variance_ratio_, "g--", marker="o")
plt.axis([1, n_pc, 0, pca.explained_variance_ratio_.max()])
plt.grid(True)
plt.xlabel("principal components")
plt.ylabel("variance explained (log)")
plt.title("scree plot")
# plt.savefig("screeplot_" + str(n_pc) + "_PC.png")

### Train a SVM Classifier ###
def bhtsne(vectors, vecs_with_center, args):
    # if args.bhtsne or not(args.timeline or args.bhtsne or args.wordclouds):
    # reduce to 50 dims with PCA before running Barnes-Hut t-SNE
    pca = PCA(n_components=50)
    vectors = pca.fit_transform(vectors)

    print('Bhtsne..')
    Y = tsne(vectors, perplexity=args["tsne_perplexity"])
    pd.DataFrame(Y).to_csv('{}/bhtsne.csv'.format(args['path']))
    plt.scatter(Y[:, 0], Y[:, 1], s=0.3)
    plt.savefig('{}/bhtsne.svg'.format(args['path']), bbox_inches='tight')
    plt.savefig('{}/bhtsne.png'.format(args['path']), bbox_inches='tight')
    pd.DataFrame(Y).to_csv('{}/bhtsne_2d.csv'.format(args['path']))
    pvtm_utils.svg_to_pdf('{}/bhtsne.svg'.format(args['path']))
    plt.close()

    print('Bhtsne with center..')
    Y = tsne(vecs_with_center.values, perplexity=args["tsne_perplexity"])
    pd.DataFrame(Y).to_csv('{}/bhtsne_with_center.csv'.format(args['path']))
    plt.scatter(Y[:len(vectors), 0], Y[:len(vectors), 1], s=0.3)
    plt.scatter(Y[len(vectors):, 0], Y[len(vectors):, 1], s=0.8, c='r', marker='x')
    plt.savefig('{}/bhtsne_with_center.svg'.format(args['path']), bbox_inches='tight')
    plt.savefig('{}/bhtsne_with_center.png'.format(args['path']), bbox_inches='tight')
    pd.DataFrame(Y).to_csv('{}/bhtsne_with_center_2d.csv'.format(args['path']))
    pvtm_utils.svg_to_pdf('{}/bhtsne_with_center.svg'.format(args['path']))
    plt.close()

    print('3D tsne...')
    Y = tsne(vectors, dimensions=3, perplexity=args["tsne_perplexity"])
    fig = pyplot.figure(frameon=False, figsize=(8, 5))
    ax = Axes3D(fig)
    ax.scatter(Y[:, 0], Y[:, 1], Y[:, 2], s=1, c='b', marker='^')
    ax.scatter(Y[:, 0], Y[:, 1], Y[:, 2], s=20, c='r', marker='^')
    # pyplot.axis('off')
    xmax, ymax, zmax = Y[:len(vectors), 0].max(), Y[:len(vectors), 1].max(), Y[:len(vectors), 2].max()
    xmin, ymin, zmin = Y[:len(vectors), 0].min(), Y[:len(vectors), 1].min(), Y[:len(vectors), 2].min()
    ax.set_xlim(xmin + 4, xmax - 4)
    ax.set_ylim(ymin + 4, ymax - 4)
    ax.set_zlim(zmin + 4, zmax - 4)
    pyplot.savefig('{}/bhtsne_3d.svg'.format(args['path']), bbox_inches='tight')
    pyplot.savefig('{}/bhtsne_3d.png'.format(args['path']), bbox_inches='tight')
    pvtm_utils.svg_to_pdf('{}/bhtsne_3d.svg'.format(args['path']))
    pd.DataFrame(Y).to_csv('{}/bhtsne_3d.csv'.format(args['path']))

    Y = tsne(vecs_with_center.values, dimensions=3, perplexity=args["tsne_perplexity"])
    fig = pyplot.figure(frameon=False, figsize=(8, 5))
    ax = Axes3D(fig)
    ax.scatter(Y[:len(vectors), 0], Y[:len(vectors), 1], Y[:len(vectors), 2], s=1, c='b', marker='^')
    ax.scatter(Y[len(vectors):, 0], Y[len(vectors):, 1], Y[len(vectors):, 2], s=20, c='r', marker='^')
    # pyplot.axis('off')
    xmax, ymax, zmax = Y[:len(vectors), 0].max(), Y[:len(vectors), 1].max(), Y[:len(vectors), 2].max()
    xmin, ymin, zmin = Y[:len(vectors), 0].min(), Y[:len(vectors), 1].min(), Y[:len(vectors), 2].min()
    ax.set_xlim(xmin + 4, xmax - 4)
    ax.set_ylim(ymin + 4, ymax - 4)
    ax.set_zlim(zmin + 4, zmax - 4)
    pyplot.savefig('{}/bhtsne_with_center_3d.svg'.format(args['path']), bbox_inches='tight')
    pyplot.savefig('{}/bhtsne_with_center_3d.png'.format(args['path']), bbox_inches='tight')
    pvtm_utils.svg_to_pdf('{}/bhtsne_with_center_3d.svg'.format(args['path']))
    pd.DataFrame(Y).to_csv('{}/bhtsne_with_center_3d.csv'.format(args['path']))
np.max(sel.variances_)

# plot
import matplotlib.pyplot as plt
plt.figure(figsize=(7, 4))
plt.plot(sel.variances_)
plt.text(55, 0.135, "0.135, 5th nt", style='italic',
         bbox={'facecolor': 'red', 'alpha': 0.5, 'pad': 1})
plt.title("Cleavage site reactivity variance (" + deg + ")")
plt.show()

# generate files
for length in [21, 71, 121]:
    for deg in ["wt", "xrn4"]:
        X = np.loadtxt("cs_datasets/cs_reactivity_" + deg + "_" + str(length) + ".csv", delimiter=",")
        #y = np.loadtxt("cs_efficiency/cs_efficiency_log_" + deg + ".csv", delimiter=",")
        model = PCA(length // 2)  # n_components must be an integer count here
        X2 = model.fit_transform(X)
        print(X2.shape)
        print(sum(model.explained_variance_ratio_))
        #np.savetxt("cs_datasets/cs_reactivity_" + deg + "_" + str(length) + "_pca2.csv", X2, delimiter=",", fmt="%0.7g")