from sklearn import (decomposition, discriminant_analysis, ensemble,
                     manifold, random_projection)


def get_embedding(X, y, type_embedding):
    n_neighbors = 30
    X_projected = None
    if type_embedding == "Random":
        rp = random_projection.SparseRandomProjection(n_components=2,
                                                      random_state=42)
        X_projected = rp.fit_transform(X)
    elif type_embedding == "PCA":
        X_projected = decomposition.TruncatedSVD(
            n_components=2).fit_transform(X)
    elif type_embedding == "LDA":
        X2 = X.copy()
        X2.flat[::X.shape[1] + 1] += 0.01  # Make X invertible
        X_projected = discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=2).fit_transform(X2, y)
    elif type_embedding == "Isomap":
        X_projected = manifold.Isomap(n_neighbors=n_neighbors,
                                      n_components=2).fit_transform(X)
    elif type_embedding == "LLE":
        clf = manifold.LocallyLinearEmbedding(n_neighbors=n_neighbors,
                                              n_components=2,
                                              method='standard')
        X_projected = clf.fit_transform(X)
    elif type_embedding == "mLLE":
        clf = manifold.LocallyLinearEmbedding(n_neighbors=n_neighbors,
                                              n_components=2,
                                              method='modified')
        X_projected = clf.fit_transform(X)
    elif type_embedding == "hLLE":
        clf = manifold.LocallyLinearEmbedding(n_neighbors=n_neighbors,
                                              n_components=2,
                                              method='hessian')
        X_projected = clf.fit_transform(X)
    elif type_embedding == "ltsa":
        clf = manifold.LocallyLinearEmbedding(n_neighbors=n_neighbors,
                                              n_components=2,
                                              method='ltsa')
        X_projected = clf.fit_transform(X)
    elif type_embedding == "MDS":
        clf = manifold.MDS(n_components=2, n_init=1, max_iter=100)
        X_projected = clf.fit_transform(X)
    elif type_embedding == "RF":
        hasher = ensemble.RandomTreesEmbedding(n_estimators=200,
                                               random_state=0,
                                               max_depth=5)
        X_transformed = hasher.fit_transform(X)
        pca = decomposition.TruncatedSVD(n_components=2)
        X_projected = pca.fit_transform(X_transformed)
    elif type_embedding == "Spectral":
        embedder = manifold.SpectralEmbedding(n_components=2,
                                              random_state=0,
                                              eigen_solver="arpack")
        X_projected = embedder.fit_transform(X)
    elif type_embedding == "T-SNE":
        tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
        X_projected = tsne.fit_transform(X)
    else:
        print("""Valid options are:
        Random   => Random Projections
        PCA      => Principal Component Analysis
        LDA      => Linear Discriminant Analysis
        Isomap   => Isomap
        LLE      => Locally Linear Embedding
        mLLE     => Modified Locally Linear Embedding
        hLLE     => Hessian Locally Linear Embedding
        ltsa     => Locally Linear Embedding (ltsa)
        MDS      => Multidimensional Scaling
        RF       => Random Forest Embedding
        Spectral => Spectral Embedding
        T-SNE    => T-SNE
        """)
    return X_projected
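# A minimal usage sketch (an illustration, not part of the original script):
# project scikit-learn's digits dataset with a few of the embeddings offered
# by get_embedding above.
from sklearn import datasets

digits = datasets.load_digits()
X_digits, y_digits = digits.data, digits.target

for kind in ("PCA", "Isomap", "T-SNE"):
    X_2d = get_embedding(X_digits, y_digits, kind)
    print(kind, X_2d.shape)  # each embedding is (n_samples, 2)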
corpus.loc[:, "review"] = corpus.review.apply(clean_text) # collect only the text in review column corpus = corpus.review.values # initialize TfidfVectorizer tfv = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None) # fit the vectorizer to corpus tfv.fit(corpus) # transform the corpus using tfidf corpus_transformed = tfv.transform(corpus) # initialize SVD with 10 components svd = decomposition.TruncatedSVD(n_components=10) # fit SVD corpus_svd = svd.fit(corpus_transformed) # choose first sample and create a dictionary # of feature names and their scores from svd # you can change the sample_index variable to # get dictionary for any other sample sample_index = 0 feature_scores = dict( zip(tfv.get_feature_names(), corpus_svd.components_[sample_index])) # once we have the dictionary, we can now # sort it in decreasing order and get the # top N topics
import numpy as np
from sklearn import cluster, decomposition

n = 300000
X = np.fromfile('train_feat', dtype=np.float32,
                count=n * 2048).reshape(-1, 2048)
X_test = np.fromfile('test_feat', dtype=np.float32,
                     count=500000 * 2048).reshape(-1, 2048)
print(X.shape, X_test.shape)

#XX = np.concatenate([X, X_test])
#print (XX.shape)

mean = X.mean(axis=0)
print(mean.shape)
X -= mean
X_test -= mean

pca = decomposition.TruncatedSVD(n_components=128)
pca.fit(X)
X = pca.transform(X).astype(np.float32)
X.tofile('train_feat_128')
X_test = pca.transform(X_test).astype(np.float32)
X_test.tofile('test_feat_128')

#print (X.shape)
#
#print (pca.explained_variance_ratio_.sum())
#print (pca.components_.dtype)
#
#print (pca.components_.shape)
#pca.mean_.tofile('mean')
#pca.components_.tofile('comp')
fit_train_transform_result = pca.fit_transform(x_train_tfidf.toarray())
fit_test_transform_result = pca.transform(x_test_tfidf.toarray())
fit_train_transform_result.shape
fit_test_transform_result.shape
run_model(fit_train_transform_result, y_train,
          fit_test_transform_result, y_test)

pca = decomposition.PCA(n_components=i)
fit_train_transform_result = pca.fit_transform(x_train_extended)
fit_test_transform_result = pca.transform(x_test_extended)
fit_train_transform_result.shape
fit_test_transform_result.shape
run_model(fit_train_transform_result, y_train,
          fit_test_transform_result, y_test)

# trunc SVD #
from sklearn import decomposition, metrics, model_selection
import numpy as np

ks = [10, 20, 30, 50, 75, 100, 150, 250, 300]
ttrain = [x_train_tfidf, x_train_extended]
ttest = [x_test_tfidf, x_test_extended]
for tt in range(2):
    for kk in ks:
        tSVD = decomposition.TruncatedSVD(n_components=kk, random_state=2017)
        xt_train = tSVD.fit_transform(ttrain[tt])
        xt_test = tSVD.transform(ttest[tt])
        run_model(xt_train, y_train, xt_test, y_test)
print(f1_score(yvalid, predictions, average=None))
print("Accuracy")
print(accuracy_score(yvalid, predictions))

# Fitting a simple Naive Bayes on Counts
clf = MultinomialNB()
clf.fit(xtrain_ctv, ytrain)
predictions = clf.predict(xvalid_ctv)

print("f1 Score")
print(f1_score(yvalid, predictions, average='macro'))
print("f1 Score Individual")
print(f1_score(yvalid, predictions, average=None))
print("Accuracy")
print(accuracy_score(yvalid, predictions))

# Apply SVD; I chose 120 components. 120-200 components are good enough for an SVM model.
svd = decomposition.TruncatedSVD(n_components=120)
svd.fit(xtrain_tfv)
xtrain_svd = svd.transform(xtrain_tfv)
xvalid_svd = svd.transform(xvalid_tfv)

# Scale the data obtained from SVD. Renaming variables to reuse without scaling.
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)

# Fitting a simple SVM
clf = SVC(C=1.0)
clf.fit(xtrain_svd_scl, ytrain)
predictions = clf.predict(xvalid_svd_scl)

print("f1 Score")
    max_depth=5).fit_transform(data)
print_results("tree kmeans", labels, compute_kmeans(tree_embedding_data))

0 / 0  # deliberate crash to stop execution here (debugging leftover)

srp_emb_data = random_projection.SparseRandomProjection(
    n_components=20, random_state=42).fit_transform(data)
print_results("SparseRandomProjection kmeans", labels,
              compute_kmeans(srp_emb_data))

iso_emb_data = manifold.Isomap(n_neighbors=30,
                               n_components=2).fit_transform(data)
print_results("iso kmeans", labels, compute_kmeans(iso_emb_data))

# lle_emb_data = manifold.LocallyLinearEmbedding(10, n_components=2, method='ltsa').fit_transform(data)
# print_results("lle kmeans", labels, compute_kmeans(lle_emb_data))

svd = decomposition.TruncatedSVD(n_components=2).fit_transform(data)
print_results("svd kmeans", labels, compute_kmeans(svd))

tsne_emb_data = manifold.TSNE(n_components=3, init="pca",
                              random_state=17).fit_transform(data)
print_results("tsne kmeans", labels, compute_kmeans(tsne_emb_data))

0 / 0  # second deliberate crash (debugging leftover)

# print_results("tree kmeans+rdc", labels, compute_kmeans_rdc(tree_embedding_data))

# Bug fix: MDS is an estimator, so it must be fit before clustering its output.
mds_embedding_data = manifold.MDS(n_components=10, n_init=1,
                                  max_iter=100).fit_transform(data)
print_results("mds kmeans", labels, compute_kmeans(mds_embedding_data))
print_results("mds kmeans+rdc", labels,
              compute_kmeans_rdc(mds_embedding_data))
def __init__(self, **kwargs):
    super().__init__()
    self.truncated_svd = decomposition.TruncatedSVD(**kwargs)
def main():
    class name:
        """ algebra of LaTeX names of representations and transformations """

        def __init__(self, n, is_compound=False):
            self.n = n
            self.is_compound = is_compound

        def bracket(self):
            if not self.is_compound:
                return self
            return name("$(" + self.n.strip("$") + ")$",
                        is_compound=self.is_compound)

        def __mul__(self, other):
            return name(self.n.rstrip("$") + "\\cdot" + other.n.lstrip("$"),
                        is_compound=True)

        def __or__(self, other):
            return name(self.bracket().n.rstrip("$") + " | " +
                        other.bracket().n.lstrip("$"),
                        is_compound=False)

        def __call__(self, *params):
            return name(self.n % params, is_compound=self.is_compound)

        def __hash__(self):
            return hash(self.n)

        def __eq__(self, other):
            return self.n == other.n

        def __ne__(self, other):
            return not self.__eq__(other)

        def __str__(self):
            return self.n

        def __repr__(self):
            return "name({})".format(self.n)

    class r:
        bow = name("$\\mathrm{Bow}$")
        bow_norm = name("$\\mathrm{BowNorm}$")
        sentiment = name("$\\mathrm{Sentiment}$")
        ratios = name("$\\mathrm{Ratios}$")
        svd = name("$\\mathrm{SVD}(%s)$")
        lda = name("$\\mathrm{LDA}(%s)$")
        diff = name("$\\mathrm{Diff}$")
        ewm = name("$\\mathrm{Ewm}(%s)$")
        user_stats = name("$\\mathrm{UserStats}$")
        day = name("$\\mathrm{Day}$")
        returns = name("$\\mathrm{Returns}_t$")
        speed = name("$\\mathrm{Speed}$")

    class c:
        lr = name("$\\mathrm{LR}$")
        xgb = name("$\\mathrm{XGB}$")

    class pair:
        def __init__(self, clf, rep):
            self.clf = clf
            self.rep = rep
            self.str = "$\\langle {}, {} \\rangle$".format(
                self.clf.n.strip("$"), self.rep.n.strip("$"))

        def __str__(self):
            return self.str

        def __repr__(self):
            return str(self)

        def __float__(self):
            raise ValueError

        def __hash__(self):
            return hash((self.clf, self.rep))

        def __eq__(self, other):
            return (self.clf, self.rep) == (other.clf, other.rep)

        def __ne__(self, other):
            return not self.__eq__(other)

    df_price = models.load_price()
    df_ug, widx = models.load_unigrams()

    reps = {}
    reps[r.bow] = df_ug
    reps[r.bow_norm] = np.log1p(df_ug).div(
        np.log1p(df_ug).apply(np.linalg.norm, axis=1), axis=0)
    reps[r.bow_norm * r.diff] = reps[r.bow_norm].diff().dropna()
    reps[r.bow_norm * r.svd(32)] = models.sklearn_transform_in_sample(
        decomp.TruncatedSVD(n_components=32), reps[r.bow_norm])
    reps[r.bow_norm * r.diff * r.svd(32)] = models.sklearn_transform_in_sample(
        decomp.TruncatedSVD(n_components=32), reps[r.bow_norm * r.diff])
    reps[r.bow_norm * r.svd(32) * r.ewm(55)] = \
        reps[r.bow_norm * r.svd(32)].ewm(55).mean()
    reps[r.bow_norm * r.diff * r.svd(32) * r.ewm(120)] = \
        reps[r.bow_norm * r.diff * r.svd(32)].ewm(120).mean()
    reps[r.ratios] = models.Ratios.load().get()
    reps[r.ratios * r.diff] = reps[r.ratios].diff().dropna()
    reps[r.ratios * r.svd(32)] = models.sklearn_transform_in_sample(
        decomp.TruncatedSVD(n_components=32), reps[r.ratios])
    reps[r.ratios * r.diff * r.svd(32)] = models.sklearn_transform_in_sample(
        decomp.TruncatedSVD(n_components=32), reps[r.ratios * r.diff])
    reps[r.ratios * r.svd(32) * r.ewm(30)] = \
        reps[r.ratios * r.svd(32)].ewm(30).mean()
    reps[r.sentiment] = models.load_sentiment().fillna(0)
    reps[r.sentiment * r.diff] = reps[r.sentiment].diff()
    reps[r.sentiment * r.ewm(90)] = reps[r.sentiment].ewm(90).mean()
    reps[r.sentiment | (r.sentiment * r.ewm(90))] = util.join(
        reps[r.sentiment], reps[r.sentiment * r.ewm(90)])
    reps[r.lda(50)] = df_lda  # models.load_lda()
    reps[r.lda(50) * r.ewm(90)] = reps[r.lda(50)].ewm(90).mean()
    reps[(r.lda(50) * r.ewm(90)) | r.lda(50)] = util.join(
        reps[r.lda(50)], reps[r.lda(50) * r.ewm(90)])

    df_user_stats = np.log1p(models.load_user_stats())
    reps[r.user_stats] = df_user_stats - df_user_stats.shift(56)

    df_time = pd.DataFrame(dict(time=np.arange(df_price.shape[0])),
                           index=df_price.index)
    reps[(r.lda(50) * r.ewm(90)) | (r.ratios * r.svd(32)) |
         (r.sentiment * r.ewm(90)) | r.day] = util.join(
             reps[r.lda(50) * r.ewm(90)], reps[r.ratios * r.svd(32)],
             reps[r.sentiment * r.ewm(90)], df_time)
    reps[(r.lda(50) * r.ewm(90)) | (r.ratios * r.svd(32)) |
         (r.sentiment * r.ewm(90)) | r.user_stats | r.day] = util.join(
             reps[r.lda(50) * r.ewm(90)], reps[r.ratios * r.svd(32)],
             reps[r.sentiment * r.ewm(90)], reps[r.user_stats], df_time)

    new_rep = {}
    new_rep[(r.lda(50) * r.ewm(90)) | (r.ratios * r.svd(32)) |
            (r.sentiment * r.ewm(90)) | r.user_stats | r.returns |
            r.day] = util.join(reps[r.lda(50) * r.ewm(90)],
                               reps[r.ratios * r.svd(32)],
                               reps[r.sentiment * r.ewm(90)],
                               reps[r.user_stats],
                               df_price[["log_return", "up_down"]], df_time)

    # df_ratios_speed = reps[r.ratios * r.svd(32)]\
    #     .diff()\
    #     .dropna()\
    #     .apply(np.linalg.norm, axis=1)\
    #     .to_frame("ratio_speed")

    # df_svd_speed = reps[r.bow_norm * r.svd(32)]\
    #     .diff()\
    #     .dropna()\
    #     .apply(np.linalg.norm, axis=1)\
    #     .to_frame("svd_speed")

    in_sample = slice(data_config.date_begin, data_config.date_is_end)
    in_sample_recent = slice(data_config.date_turning_point,
                             data_config.date_is_end)
    out_of_sample = slice(data_config.date_oos_begin,
                          data_config.date_oos_end - dt.timedelta(days=1))
    # .loc replaces the long-deprecated .ix for label-based slicing
    df_price_is, df_price_oos = (df_price.loc[in_sample],
                                 df_price.loc[out_of_sample])

    clfs = {
        c.lr: lambda: lm.LogisticRegression(),
        c.xgb: lambda: xgboost.XGBClassifier(reg_lambda=2, max_depth=3,
                                             subsample=.5),
    }

    results_cv = []
    # sort by the LaTeX string; name objects themselves define no ordering
    for rep_name, rep in sorted(new_rep.items(), key=lambda x: str(x[0])):
        for clf_name, clf_f in clfs.items():
            # parentheses make the original operator precedence explicit
            if (rep_name == r.bow_norm or rep_name == r.bow_norm * r.diff
                    or (rep_name == r.bow and clf_name == c.xgb)):
                continue
            print(rep_name.n, clf_name.n)
            acc, roc_auc, matthews = zip(*training.cv_test(
                clf_f(), rep.loc[in_sample], df_price_is, k=100))
            df_results = pd.DataFrame(
                dict(acc=acc, roc_auc=roc_auc, matthews=matthews))
            df_results["rep"] = rep_name
            df_results["clf"] = clf_name
            results_cv.append(df_results)

    results_cv_all = pd.concat(results_cv)
    results_cv_all["rep_clf_name"] = [
        pair(a, b) for a, b in zip(results_cv_all.clf, results_cv_all.rep)
    ]

    rank_by = "acc"
    top_3_cv = results_cv_all\
        .query("rep != 'user_stats'")\
        .groupby("rep_clf_name")\
        .quantile(.1)\
        .sort_values(rank_by, ascending=False)\
        .head(3)
    bottom_3_cv = results_cv_all\
        .query("rep != 'user_stats'")\
        .groupby("rep_clf_name")\
        .quantile(.1)\
        .sort_values(rank_by)\
        .head(3)
    plots.plot_best_worst_cv(results_cv_all, top_3_cv, bottom_3_cv, rank_by)

    df_soft_returns = pd.DataFrame(index=df_price_oos.index)
    df_hard_returns = pd.DataFrame(index=df_price_oos.index)
    results_oos = []
    for pair_clf_rep in top_3_cv.index.values:
        rep, clf_f = reps[pair_clf_rep.rep], clfs[pair_clf_rep.clf]
        pair_name = str(pair_clf_rep)
        ([acc, roc_auc, matthews],
         [realized_returns_hard, realized_returns_soft]) = training.oos_test(
             clf_f(), rep.loc[in_sample], rep.loc[out_of_sample],
             df_price_is, df_price_oos)
        results_oos.append([pair_name, acc, roc_auc, matthews])
        df_soft_returns[pair_name] = realized_returns_soft
        df_hard_returns[pair_name] = realized_returns_hard

    results_oos_all = pd.DataFrame(
        results_oos, columns=["rep", "acc", "roc_auc", "matthews"])
    oos_sharpe = (df_hard_returns.diff().mean() /
                  df_hard_returns.diff().std()).to_frame("sharpe")
    oos_log_return = df_hard_returns.iloc[-1].to_frame("log_return")

    df_random_classifier = random_returns.random_returns_stats(df_price_oos,
                                                               n=1000)
    df_buy_hold = pd.DataFrame(
        dict(log_return=[df_price_oos.log_return.sum()],
             sharpe=[
                 df_price_oos.log_return.mean() /
                 df_price_oos.log_return.std()
             ]))
    df_oos = results_oos_all.join(oos_sharpe.join(oos_log_return), on="rep")

    plots.plot_returns(df_price_oos, df_hard_returns)
    random_returns.plot_random_returns(
        df_price_oos, random_returns.random_returns_stats(df_price_oos))
Height, Width = Base_img.shape[0], Base_img.shape[1]

# File size of the original picture, in bytes ('rb' avoids text decoding).
f_original = open(Picture, 'rb')
f_original.seek(0, os.SEEK_END)
Base_weight = f_original.tell()
f_original.close()

# Number of SVD components to keep, capped by the image width.
rate_components = int(Max_conversion_rate * Height / 100)
if rate_components < Width:
    Range = rate_components
elif Height == Width:
    Range = Width - 1
else:
    Range = Width

RGB_Encoded_array = []
RGB_Compressed_array = []
svd = decomposition.TruncatedSVD(n_components=Range if Range != 0 else 1)
for RGB_each_array in range(3):
    RGB = Base_img[:, :, RGB_each_array]
    RGB_simple_array = svd.fit_transform(RGB)
    RGB_Compressed_array.append(RGB_simple_array)
    RGB_Encoded_array.append(svd.inverse_transform(RGB_simple_array))

if Normalization == 1:
    Image_encoded = normalization(np.dstack(RGB_Encoded_array))
    Image_coded = normalization(np.dstack(RGB_Compressed_array))
    RGB_Encoded_array = normalization(RGB_Encoded_array)
else:
    Image_encoded = np.dstack(RGB_Encoded_array)
    Image_coded = np.dstack(RGB_Compressed_array)
def plot_other_manifold(X, y, n_neighbors, n_estimators=200, max_depth=5,
                        random_state=0):
    # ----------------------------------------------------------------------
    # Modified Locally linear embedding of the digits dataset
    print("Computing modified LLE embedding")
    clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2,
                                          method='modified')
    t0 = time()
    X_mlle = clf.fit_transform(X)
    print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
    plot_embedding(
        X_mlle, y,
        "Modified Locally Linear Embedding of the digits (time %.2fs)" %
        (time() - t0))

    # ----------------------------------------------------------------------
    # HLLE embedding of the digits dataset
    print("Computing Hessian LLE embedding")
    clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2,
                                          method='hessian')
    t0 = time()
    X_hlle = clf.fit_transform(X)
    print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
    plot_embedding(
        X_hlle, y,
        "Hessian Locally Linear Embedding of the digits (time %.2fs)" %
        (time() - t0))

    # ----------------------------------------------------------------------
    # LTSA embedding of the digits dataset
    print("Computing LTSA embedding")
    clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2,
                                          method='ltsa')
    t0 = time()
    X_ltsa = clf.fit_transform(X)
    print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
    plot_embedding(
        X_ltsa, y,
        "Local Tangent Space Alignment of the digits (time %.2fs)" %
        (time() - t0))

    # ----------------------------------------------------------------------
    # Random Trees embedding of the digits dataset
    print("Computing Totally Random Trees embedding")
    hasher = ensemble.RandomTreesEmbedding(n_estimators=n_estimators,
                                           random_state=random_state,
                                           max_depth=max_depth)
    t0 = time()
    X_transformed = hasher.fit_transform(X)
    pca = decomposition.TruncatedSVD(n_components=2)
    X_reduced = pca.fit_transform(X_transformed)
    plot_embedding(
        X_reduced, y,
        "Random forest embedding of the digits (time %.2fs)" % (time() - t0))

    # ----------------------------------------------------------------------
    # Spectral embedding of the digits dataset
    print("Computing Spectral embedding")
    embedder = manifold.SpectralEmbedding(n_components=2,
                                          random_state=random_state,
                                          eigen_solver="arpack")
    t0 = time()
    X_se = embedder.fit_transform(X)
    plot_embedding(
        X_se, y,
        "Spectral embedding of the digits (time %.2fs)" % (time() - t0))
#%% Tests regarding flameplot
import flameplot as flameplot
import numpy as np
from sklearn import (manifold, decomposition)

# %% Load data
X, y = flameplot.import_example()

# %% PCA
X_pca_50 = decomposition.TruncatedSVD(n_components=50).fit_transform(X)
X_pca_2 = decomposition.TruncatedSVD(n_components=2).fit_transform(X)

# tSNE
X_tsne = manifold.TSNE(n_components=2, init='pca').fit_transform(X)

# Random
X_rand = np.c_[np.random.permutation(X_pca_2[:, 0]),
               np.random.permutation(X_pca_2[:, 1])]

# %% Scatter
flameplot.scatter(X_pca_2[:, 0], X_pca_2[:, 1], label=y, title='PCA')
flameplot.scatter(X_tsne[:, 0], X_tsne[:, 1], label=y, title='tSNE')
flameplot.scatter(X_rand[:, 0], X_rand[:, 1], label=y, title='Random')

# %% Compare PCA(50) vs. tSNE
scores = flameplot.compare(X_pca_50, X_tsne, n_steps=25)
fig = flameplot.plot(scores, xlabel='PCA (50d)', ylabel='tSNE (2d)')

# Compare PCA(2) vs. tSNE
scores = flameplot.compare(X_pca_2, X_tsne, n_steps=25)
fig = flameplot.plot(scores, xlabel='PCA (2d)', ylabel='tSNE (2d)')

# Compare random vs. tSNE
scores = flameplot.compare(X_rand, X_tsne, n_steps=25)
fig = flameplot.plot(scores, xlabel='Random (2d)', ylabel='tSNE (2d)')
def generate_transformers(x, dataset, global_dir, min_variance=10,
                          additional_scale_tsvd=1):
    """ This function returns a dictionary with callables for a given dataset. """
    transform_functions = {
        'vae': (lambda x: transform_vae(x, VAE_net)),
        'pca': (lambda x: transform_pca(x, pca, var_pca)),
        'tsvd': (lambda x: transform_tsvd(x, tsvd)),
        'kpca': (lambda x: transform_kpca(x, kpca)),
        'spca': (lambda x: transform_spca(x, spca)),
        'iso': (lambda x: transform_iso(x, iso)),
        'lle': (lambda x: transform_lle(x, lle)),
    }
    """ Note that below, we could have dynamically generated most transformer
    functions. However, doing so would potentially lose overview, and we do
    not have to optimize for efficiency here, while we actually have to
    preserve readability. """

    ################ Regular PCA ################
    pca = decomposition.PCA(n_components=2)
    # We do this in one call, since we don't need latent_X for now
    var_pca = np.var(pca.fit_transform(x))
    # print(np.sum(pca.explained_variance_ratio_))  # Could be interesting to explain results with

    def transform_pca(x, pca, var_pca):
        return np.matmul(x, np.transpose(pca.components_)) / math.sqrt(
            var_pca) * math.sqrt(min_variance)

    ################ Truncated SVD ################
    tsvd = decomposition.TruncatedSVD(n_components=2, n_iter=7,
                                      random_state=42)
    var_tsvd = np.var(tsvd.fit_transform(x))

    def transform_tsvd(x, tsvd):
        return np.matmul(x, np.transpose(tsvd.components_)) / math.sqrt(
            var_tsvd) * math.sqrt(min_variance) * additional_scale_tsvd

    ################ Kernel PCA ################
    kpca = decomposition.KernelPCA(n_components=2, kernel="sigmoid",
                                   fit_inverse_transform=True, gamma=None,
                                   random_state=42)
    var_kpca = np.var(kpca.fit_transform(x))
    if 0. in kpca.lambdas_:
        # KPCA with Sigmoid kernel does not work for this set
        del transform_functions['kpca']

    def transform_kpca(x, kpca):
        x = np.array(x)
        if len(x.shape) == 1:
            x = x.reshape(1, -1)
        return kpca.transform(x) / math.sqrt(var_kpca) * math.sqrt(
            min_variance)

    ################ Sparse PCA ################
    spca = decomposition.SparsePCA(n_components=2, alpha=0.0001,
                                   random_state=42, n_jobs=-1)
    var_spca = np.var(spca.fit_transform(x))

    def transform_spca(x, spca):
        return np.matmul(x, np.transpose(spca.components_)) / math.sqrt(
            var_spca) * math.sqrt(min_variance)

    ################ ISO ################
    iso = manifold.Isomap(n_neighbors=8, n_components=2, eigen_solver='dense')
    var_iso = np.var(iso.fit_transform(x))

    def transform_iso(x, iso):
        x = np.array(x)
        if len(x.shape) == 1:
            x = x.reshape(1, -1)
        return iso.transform(x) / math.sqrt(var_iso) * math.sqrt(min_variance)

    ################ LLE ################
    lle = manifold.LocallyLinearEmbedding(n_neighbors=8, n_components=2,
                                          eigen_solver='dense')
    var_lle = np.var(lle.fit_transform(x))

    def transform_lle(x, lle):
        x = np.array(x)
        if len(x.shape) == 1:
            x = x.reshape(1, -1)
        return lle.transform(x) / math.sqrt(var_lle) * math.sqrt(min_variance)

    ################ SCVIS VAE ################
    VAE_save_file = global_dir + "/results/vae_models/" + dataset + ".pt"
    if not os.path.isfile(VAE_save_file):
        # Auto-encoder needs to be trained on the model first
        print('Training new VAE model on %s dataset' % dataset)
        # normalizing using np.max(np.abs(x)) not necessary as it equals 1
        trainVAE(x, global_dir, dataset)

    # Once trained, it loads the existing model, also for reproducibility
    VAE_model = torch.load(VAE_save_file)['model_state_dict']
    print('Loaded VAE model for %s dataset' % dataset)
    VAE_net = VAE(input_dim=x.shape[1], latent_dim=2)
    VAE_net.load_state_dict(VAE_model)
    VAE_net.eval()

    def transform_vae(x, VAE_net):
        x = np.array(x)
        if len(x.shape) == 1:
            x = x.reshape(1, -1)
        with torch.no_grad():
            x_batch = torch.from_numpy(x).float()
            encoder_mu, encoder_log_var = VAE_net.encoder(x_batch, p=1.0)
            batch_z = VAE_net.sampling(encoder_mu, encoder_log_var,
                                       batch_size=len(x), eval=True).numpy()
        return np.array(batch_z, dtype=float)

    return transform_functions
def _eval_search_params(params_builder):
    search_params = {}

    for p in params_builder['param_set']:
        search_list = p['sp_list'].strip()
        if search_list == '':
            continue

        param_name = p['sp_name']
        if param_name.lower().endswith(NON_SEARCHABLE):
            print("Warning: `%s` is not eligible for search and was "
                  "omitted!" % param_name)
            continue

        if not search_list.startswith(':'):
            safe_eval = SafeEval(load_scipy=True, load_numpy=True)
            ev = safe_eval(search_list)
            search_params[param_name] = ev
        else:
            # A `:` before the search list asks for estimator evaluation
            safe_eval_es = SafeEval(load_estimators=True)
            search_list = search_list[1:].strip()
            # TODO maybe add regular expression check
            ev = safe_eval_es(search_list)

            preprocessings = (
                preprocessing.StandardScaler(), preprocessing.Binarizer(),
                preprocessing.MaxAbsScaler(), preprocessing.Normalizer(),
                preprocessing.MinMaxScaler(),
                preprocessing.PolynomialFeatures(),
                preprocessing.RobustScaler(),
                feature_selection.SelectKBest(),
                feature_selection.GenericUnivariateSelect(),
                feature_selection.SelectPercentile(),
                feature_selection.SelectFpr(),
                feature_selection.SelectFdr(),
                feature_selection.SelectFwe(),
                feature_selection.VarianceThreshold(),
                decomposition.FactorAnalysis(random_state=0),
                decomposition.FastICA(random_state=0),
                decomposition.IncrementalPCA(),
                decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                decomposition.LatentDirichletAllocation(random_state=0,
                                                        n_jobs=N_JOBS),
                decomposition.MiniBatchDictionaryLearning(random_state=0,
                                                          n_jobs=N_JOBS),
                decomposition.MiniBatchSparsePCA(random_state=0,
                                                 n_jobs=N_JOBS),
                decomposition.NMF(random_state=0),
                decomposition.PCA(random_state=0),
                decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.TruncatedSVD(random_state=0),
                kernel_approximation.Nystroem(random_state=0),
                kernel_approximation.RBFSampler(random_state=0),
                kernel_approximation.AdditiveChi2Sampler(),
                kernel_approximation.SkewedChi2Sampler(random_state=0),
                cluster.FeatureAgglomeration(),
                skrebate.ReliefF(n_jobs=N_JOBS),
                skrebate.SURF(n_jobs=N_JOBS),
                skrebate.SURFstar(n_jobs=N_JOBS),
                skrebate.MultiSURF(n_jobs=N_JOBS),
                skrebate.MultiSURFstar(n_jobs=N_JOBS),
                imblearn.under_sampling.ClusterCentroids(random_state=0,
                                                         n_jobs=N_JOBS),
                imblearn.under_sampling.CondensedNearestNeighbour(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.EditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.RepeatedEditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.InstanceHardnessThreshold(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NearMiss(random_state=0,
                                                 n_jobs=N_JOBS),
                imblearn.under_sampling.NeighbourhoodCleaningRule(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.OneSidedSelection(random_state=0,
                                                          n_jobs=N_JOBS),
                imblearn.under_sampling.RandomUnderSampler(random_state=0),
                imblearn.under_sampling.TomekLinks(random_state=0,
                                                   n_jobs=N_JOBS),
                imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.RandomOverSampler(random_state=0),
                imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SVMSMOTE(random_state=0,
                                                n_jobs=N_JOBS),
                imblearn.over_sampling.BorderlineSMOTE(random_state=0,
                                                       n_jobs=N_JOBS),
                imblearn.over_sampling.SMOTENC(categorical_features=[],
                                               random_state=0,
                                               n_jobs=N_JOBS),
                imblearn.combine.SMOTEENN(random_state=0),
                imblearn.combine.SMOTETomek(random_state=0))

            newlist = []
            for obj in ev:
                if obj is None:
                    newlist.append(None)
                elif obj == 'all_0':
                    newlist.extend(preprocessings[0:35])
                elif obj == 'sk_prep_all':  # no KernelCenterer()
                    newlist.extend(preprocessings[0:7])
                elif obj == 'fs_all':
                    newlist.extend(preprocessings[7:14])
                elif obj == 'decomp_all':
                    newlist.extend(preprocessings[14:25])
                elif obj == 'k_appr_all':
                    newlist.extend(preprocessings[25:29])
                elif obj == 'reb_all':
                    newlist.extend(preprocessings[30:35])
                elif obj == 'imb_all':
                    newlist.extend(preprocessings[35:54])
                elif type(obj) is int and -1 < obj < len(preprocessings):
                    newlist.append(preprocessings[obj])
                elif hasattr(obj, 'get_params'):  # user uploaded object
                    if 'n_jobs' in obj.get_params():
                        newlist.append(obj.set_params(n_jobs=N_JOBS))
                    else:
                        newlist.append(obj)
                else:
                    sys.exit("Unsupported estimator type: %r" % (obj))

            search_params[param_name] = newlist

    return search_params
def test_singular_values():
    # Check that the TruncatedSVD output has the correct singular values
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 80

    X = rng.randn(n_samples, n_features)
    dX = da.from_array(X, chunks=(n_samples // 2, n_features))

    apca = dd.TruncatedSVD(n_components=2, algorithm="tsqr",
                           random_state=rng).fit(dX)
    rpca = sd.TruncatedSVD(n_components=2, algorithm="arpack",
                           random_state=rng).fit(X)
    assert_array_almost_equal(apca.singular_values_, rpca.singular_values_,
                              12)

    # Compare to the Frobenius norm
    X_apca = apca.transform(X)
    X_rpca = rpca.transform(X)
    assert_array_almost_equal(np.sum(apca.singular_values_**2.0),
                              np.linalg.norm(X_apca, "fro")**2.0, 12)
    assert_array_almost_equal(np.sum(rpca.singular_values_**2.0),
                              np.linalg.norm(X_rpca, "fro")**2.0, 12)

    # Compare to the 2-norms of the score vectors
    assert_array_almost_equal(apca.singular_values_,
                              np.sqrt(np.sum(X_apca**2.0, axis=0)), 12)
    assert_array_almost_equal(rpca.singular_values_,
                              np.sqrt(np.sum(X_rpca**2.0, axis=0)), 12)

    # Set the singular values and see what we get back
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 110

    X = rng.randn(n_samples, n_features)
    dX = da.from_array(X, chunks=(50, n_features))

    apca = dd.TruncatedSVD(n_components=3, algorithm="randomized",
                           random_state=0)
    rpca = sd.TruncatedSVD(n_components=3, algorithm="randomized",
                           random_state=0)
    X_apca = apca.fit_transform(dX).compute()
    X_rpca = rpca.fit_transform(X)

    X_apca /= np.sqrt(np.sum(X_apca**2.0, axis=0))
    X_rpca /= np.sqrt(np.sum(X_rpca**2.0, axis=0))
    X_apca[:, 0] *= 3.142
    X_apca[:, 1] *= 2.718
    X_rpca[:, 0] *= 3.142
    X_rpca[:, 1] *= 2.718

    X_hat_apca = np.dot(X_apca, apca.components_)
    X_hat_rpca = np.dot(X_rpca, rpca.components_)
    apca.fit(da.from_array(X_hat_apca, chunks=(50, n_features)))
    rpca.fit(X_hat_rpca)
    assert_array_almost_equal(apca.singular_values_, [3.142, 2.718, 1.0], 14)
    assert_array_almost_equal(rpca.singular_values_, [3.142, 2.718, 1.0], 14)
def find_components(input_data, n_components=2, method='pca', **kwargs):
    '''
    Extract components from an array of data

    input_data : np.array
        The input data matrix
    n_components : int
        The number of components to extract
    method : str
        The dimensionality reduction technique to use
    kwargs : optional arguments to pass to the construction of the estimator

    Note: this function is basically a wrapper for a bunch of the standard
    estimators found in sklearn. Please refer to the sklearn documentation
    for the keyword arguments to pass to the various estimators.
    http://scikit-learn.org/stable/modules/decomposition.html

    Examples
    --------
    >>> components = find_components(data, method='k-means', tol=1e-3,
    ...                              batch_size=100, max_iter=50)
    >>> plot(components.T)

    >>> components = find_components(copy(all_traces.T),
    ...                              method='incremental pca',
    ...                              batch_size=100)
    >>> plot(components.T)

    DEV: Automatically compute the batch sizes for incremental and minibatch
    PCA. "The computational overhead of each SVD is
    O(batch_size * n_features ** 2), but only 2 * batch_size samples remain
    in memory at a time. There will be n_samples / batch_size SVD
    computations to get the principal components, versus 1 large SVD of
    complexity O(n_samples * n_features ** 2) for PCA."

    Issues
    ------
    LDA returns components that are not orthogonal; need to apply
    Gram-Schmidt. Kernel PCA is currently broken, and LDA is erratic.
    http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.KernelPCA.html#sklearn.decomposition.KernelPCA
    '''
    if not has_sklearn:
        warnings.warn(
            'scikit-learn not found. This function will not work correctly.')
        return None

    n_samples, n_features = input_data.shape
    rng = RandomState(0)

    if n_samples < n_features:
        warnings.warn(
            'More features than samples; assuming input data matrix was transposed.'
        )
        input_data = input_data.T
        n_samples, n_features = n_features, n_samples

    if method in [
            'pca', 'ica', 'k-means', 'incremental pca', 'kernel pca',
            'random pca'
    ]:
        data = input_data - input_data.mean(axis=0)
    else:
        data = input_data

    if method == 'pca':
        estimator = decomposition.PCA(n_components=n_components, **kwargs)
    elif method == 'k-means':
        estimator = MiniBatchKMeans(n_clusters=n_components,
                                    random_state=rng, **kwargs)
    elif method == 'ica':
        estimator = decomposition.FastICA(n_components=n_components,
                                          whiten=True, **kwargs)
    elif method == 'incremental pca':
        estimator = decomposition.IncrementalPCA(n_components=n_components,
                                                 whiten=True)
    elif method == 'svd':
        estimator = decomposition.TruncatedSVD(n_components=n_components,
                                               random_state=rng, **kwargs)
    elif method == 'kernel pca':
        estimator = decomposition.KernelPCA(n_components=n_components,
                                            **kwargs)
    elif method == 'lda':
        estimator = decomposition.LatentDirichletAllocation(
            n_topics=n_components, random_state=rng, **kwargs)
        data = data + abs(min(ravel(data)))
    elif method == 'random pca':
        estimator = decomposition.RandomizedPCA(n_components=n_components,
                                                whiten=True, **kwargs)
    else:
        warnings.warn(
            'Unknown \'method\' argument given; falling back to PCA')
        estimator = decomposition.PCA(n_components=n_components)

    estimator.fit(data)
    if hasattr(estimator, 'cluster_centers_'):
        components = estimator.cluster_centers_
    else:
        components = estimator.components_

    return components
print('Pipeline...')
fp = pipeline.Pipeline([(
    'union',
    pipeline.FeatureUnion(
        n_jobs=-1,
        transformer_list=[
            ('standard', cust_regression_vals()),
            ('pi1',
             pipeline.Pipeline([
                 ('Gene', cust_txt_col('Gene')),
                 ('count_Gene',
                  feature_extraction.text.CountVectorizer(
                      analyzer=u'char', ngram_range=(1, 8))),
                 ('tsvd1',
                  decomposition.TruncatedSVD(n_components=20, n_iter=30,
                                             random_state=12))
             ])),
            ('pi2',
             pipeline.Pipeline([
                 ('Variation', cust_txt_col('Variation')),
                 ('count_Variation',
                  feature_extraction.text.CountVectorizer(
                      analyzer=u'char', ngram_range=(1, 8))),
                 ('tsvd2',
                  decomposition.TruncatedSVD(n_components=20, n_iter=30,
                                             random_state=12))
             ])),
            # commented for Kaggle Limits
            ('pi3',
def as_local(self):
    target = sk_dec.TruncatedSVD(self.n_components,
                                 algorithm=self.algorithm,
                                 n_iter=self.n_iter,
                                 random_state=self.random_state,
                                 tol=self.tol)
    copy_attrs_as_local(self, target, 'components_', 'explained_variance_',
                        'explained_variance_ratio_', 'singular_values_')
    return target
def relationPrediction(self, inputDir):
    print("Model may take at least 20 mins for predictions..")

    # Read the user-like data..
    relationDataFrame = pd.read_csv("/data/training/relation/relation.csv")

    # Get the count of likes for each page..
    counts = relationDataFrame['like_id'].value_counts()

    # Filter out the least-liked and most-liked pages..
    filteredFrame = relationDataFrame[relationDataFrame['like_id'].isin(
        counts[counts > 25].index)]
    filteredFrame = filteredFrame[filteredFrame['like_id'].isin(
        counts[counts < 1000].index)]

    # Read the test data..
    relationTestDataFrame = pd.read_csv(inputDir + "/relation/relation.csv")

    # Get unique like id's from the test data..
    uniqueLikeIds = relationTestDataFrame.like_id.unique()
    filteredFrame = filteredFrame[filteredFrame.like_id.isin(uniqueLikeIds)]
    tempTestDataFrame = relationTestDataFrame[
        relationTestDataFrame.like_id.isin(filteredFrame.like_id.unique())]
    uselessTestUsers = relationTestDataFrame[
        ~relationTestDataFrame.userid.isin(tempTestDataFrame.userid.unique())]
    print(len(uselessTestUsers.userid.unique()))
    uselessTestUsers = uselessTestUsers[
        ~uselessTestUsers.duplicated(subset='userid')]
    # pd.concat replaces the deprecated DataFrame.append
    tempTestDataFrame = pd.concat([tempTestDataFrame, uselessTestUsers])
    relationTestDataFrame = tempTestDataFrame

    # Append the test data to the training data..
    filteredFrame = pd.concat([filteredFrame, relationTestDataFrame])
    print(filteredFrame.shape)

    # Get a user-like cross matrix..
    crossedTable = pd.crosstab(filteredFrame['userid'],
                               filteredFrame['like_id'])
    crossedFrameWithIndex = pd.DataFrame(crossedTable.to_records())

    # Reduce dimensionality (TruncatedSVD, named pca here)..
    X = crossedFrameWithIndex.iloc[:, 1:]
    pca = decomposition.TruncatedSVD(n_components=100)
    X = pca.fit_transform(X)
    pcaedDataFrame = pd.DataFrame(
        X, index=crossedFrameWithIndex.loc[:, 'userid'])
    pcaedDataFrame.reset_index(level=0, inplace=True)

    # Separate the test data from the training data..
    trainDataFrame = pcaedDataFrame[
        ~pcaedDataFrame.userid.isin(relationTestDataFrame.userid.unique())]
    testDataFrame = pcaedDataFrame[pcaedDataFrame.userid.isin(
        relationTestDataFrame.userid.unique())]

    # Merge the test userid's and training userid's with their profiles..
    trainProfileDataFrame = pd.read_csv("/data/training/profile/profile.csv")
    testProfileDataFrame = pd.read_csv(inputDir + "/profile/profile.csv")
    trainDataFrame = pd.merge(trainDataFrame, trainProfileDataFrame,
                              on='userid')
    testDataFrame = pd.merge(testDataFrame, testProfileDataFrame,
                             on='userid')

    gender = RelationPredictor.predictGender(self,
                                             trainingData=trainDataFrame,
                                             testingData=testDataFrame,
                                             doKFold=False)
    age = RelationPredictor.predictAge(self,
                                       trainingData=trainDataFrame,
                                       testingData=testDataFrame,
                                       doKFold=False)
    ope, con, ext, agr, neu = RelationPredictor.predictPersonalityScores(
        self, trainingData=trainDataFrame, testingData=testDataFrame,
        doKFold=False)

    finDataFrame = pd.DataFrame({
        'userid': testDataFrame['userid'],
        'gender': gender,
        'age': age,
        'ope': ope,
        'con': con,
        'ext': ext,
        'agr': agr,
        'neu': neu
    })

    # print("Gender Acc: " + str(accuracy_score(testDataFrame.loc[:, 'gender'], finDataFrame['gender'])))
    # testingAge = pd.cut(testDataFrame.loc[:, 'age'], [0.0, 24.0, 34.0, 49.0, 1000.0], labels=[0, 1, 2, 3], retbins=False, include_lowest=True)
    # print("Age Acc: " + str(accuracy_score(testingAge, finDataFrame['age'])))
    # print("Ope RMSE: " + str(sqrt(mean_squared_error(testDataFrame.loc[:, 'ope'], finDataFrame['ope']))))
    # print("Con RMSE: " + str(sqrt(mean_squared_error(testDataFrame.loc[:, 'con'], finDataFrame['con']))))
    # print("Ext RMSE: " + str(sqrt(mean_squared_error(testDataFrame.loc[:, 'ext'], finDataFrame['ext']))))
    # print("Agr RMSE: " + str(sqrt(mean_squared_error(testDataFrame.loc[:, 'agr'], finDataFrame['agr']))))
    # print("Neu RMSE: " + str(sqrt(mean_squared_error(testDataFrame.loc[:, 'neu'], finDataFrame['neu']))))

    return finDataFrame
def perform(self):
    lsa = skd.TruncatedSVD(n_components=self.n_components,
                           random_state=self.random_state)
    transform = lsa.fit_transform(self.data)
    return transform
    ngram_range=(1, 3),
    use_idf=1,
    smooth_idf=1,
    sublinear_tf=1,
    stop_words='english')

# Fitting TF-IDF to both training and test sets (semi-supervised learning)
tfv.fit(list(xtrain) + list(xtest))  # Learn vocabulary and idf from the data.

# Sparse matrices mapping (sentence index, vocabulary term) -> tf-idf score
xtrain_tfv = tfv.transform(xtrain)
xvalid_tfv = tfv.transform(xtest)

# Apply SVD; 120-200 components are good enough for an SVM model.
print("SVM + TFIDF")
svd = decomposition.TruncatedSVD(
    n_components=200)  # can we test with more features?! up to 200...
svd.fit(xtrain_tfv)
xtrain_svd = svd.transform(xtrain_tfv)
xvalid_svd = svd.transform(xvalid_tfv)

# Scale the data obtained from SVD. Renaming variables to reuse without scaling.
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)

# Fitting a simple SVM
clf = SVC(C=1.0, probability=True)  # since we need probabilities
clf.fit(xtrain_svd_scl, ytrain)
predictions = clf.predict_proba(xvalid_svd_scl)
predictions_classes = clf.predict(xvalid_svd_scl)
import joblib  # formerly sklearn.externals.joblib, now a standalone package
from sklearn import decomposition

matrix_ppmi = joblib.load('matrix_ppmi')
svd = decomposition.TruncatedSVD(n_components=300)
matrix_300 = svd.fit_transform(matrix_ppmi)
joblib.dump(matrix_300, 'matrix_300')
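# A hedged follow-up (illustrative, not part of the original script): with
# matrix_300 still in memory, nearest neighbours in the reduced space can be
# read off with cosine similarity.
import numpy as np

norms = matrix_300 / np.linalg.norm(matrix_300, axis=1, keepdims=True)
sims = norms @ norms[0]           # cosine similarity of every row to row 0
print(np.argsort(-sims)[:10])     # indices of the 10 most similar rows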
def run_category():
    # Build the classification model
    t = time.time()

    # Read the raw data
    data = pd.read_excel(os.path.join(root_dir, r'全部数据/附件2.xlsx'))
    data['留言'] = (2 * data['留言主题'] + data['留言详情']).apply(
        lambda i: process(i))  # create a new column and run process() on it
    data['text_split_list'] = data['留言'].apply(
        lambda i: jieba.lcut(i, cut_all=True))  # tokenize with jieba in full mode
    data['text_split'] = [' '.join(i) for i in data['text_split_list']]
    data.head()  # inspect the preprocessed data

    lbl_enc = preprocessing.LabelEncoder()  # encode text labels as integers
    y = lbl_enc.fit_transform(data.一级标签.values)

    # Split the DataFrame 9:1 into training and validation sets
    data_train, data_valid, ytrain, yvalid = train_test_split(
        data, y, stratify=y, random_state=42, test_size=0.1, shuffle=True)
    xtrain = data_train.text_split.values  # training-set text_split values
    xvalid = data_valid.text_split.values  # validation-set text_split values

    # Read the stop-word file and store the stop words in a list
    stwlist = [
        line.strip()
        for line in open(os.path.join(root_dir, r'stopword.txt'),
                         'r', encoding='gbk').readlines()
    ]

    # Convert the text to a term-frequency matrix with TF-IDF
    tfv = TfidfVectorizer(min_df=3,
                          max_df=0.5,
                          max_features=None,
                          ngram_range=(1, 2),
                          use_idf=True,
                          smooth_idf=True,
                          stop_words=stwlist)

    # Fit TF-IDF on both the training and validation sets
    tfv.fit(list(xtrain) + list(xvalid))
    xtrain_tfv = tfv.transform(xtrain)
    xvalid_tfv = tfv.transform(xvalid)

    # Reduce dimensionality with SVD; n_components is set to 150. For SVM,
    # a reasonable range for the number of SVD components is 120-200.
    svd = decomposition.TruncatedSVD(n_components=150, random_state=42)
    svd.fit(xtrain_tfv)
    xtrain_svd = svd.transform(xtrain_tfv)
    xvalid_svd = svd.transform(xvalid_tfv)

    # Scale the data obtained from SVD
    scl = preprocessing.StandardScaler()
    scl.fit(xtrain_svd)
    xtrain_svd_scl = scl.transform(xtrain_svd)
    xvalid_svd_scl = scl.transform(xvalid_svd)

    # # Use grid search to find the best SVM hyperparameters
    # from sklearn.model_selection import GridSearchCV
    # # List the parameters to tune and their candidate values
    # param_grid = {"gamma": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    #               "C": [0.001, 0.01, 0.1, 1, 10, 100]}
    # print("Parameters:{}".format(param_grid))
    #
    # clf = SVC(random_state=11)  # instantiate an SVC
    #
    # grid_search = GridSearchCV(clf, param_grid, cv=10, scoring='f1_micro')  # instantiate a GridSearchCV
    # grid_search.fit(xtrain_svd_scl, ytrain)  # fit, find the best parameters, and refit a new SVC with them
    # print("Test set score:{:.2f}".format(grid_search.score(xvalid_svd_scl, yvalid)))
    # print("Best parameters:{}".format(grid_search.best_params_))
    # print("Best score on train set:{:.2f}".format(grid_search.best_score_))

    # Train the SVM model
    clf = SVC(C=10, gamma=0.001, random_state=151)
    clf.fit(xtrain_svd_scl, ytrain)  # fit on the training set
    predictions = clf.predict(xvalid_svd_scl)  # predict on the validation set

    print("Model f1_score: %0.3f " %
          f1_score(yvalid, predictions, average='micro'))

    model_path = os.path.join(root_dir, "结果数据/留言分类模型.m")
    joblib.dump(clf, model_path)
    # # Load the model
    # clf = joblib.load("my_model.m")

    # Save the validation-set classification results
    result = data_valid.copy()
    result.drop(['留言', 'text_split_list', 'text_split'], axis=1,
                inplace=True)
    result['模型结果'] = lbl_enc.inverse_transform(predictions)
    result_path = os.path.join(root_dir, r'结果数据/分类结果.xlsx')
    result.to_excel(result_path, index=False)
    print(result.head(5))

    print("Training finished; elapsed time: %f" % (time.time() - t))
    print("Task 1 classification model built; model saved to %s, "
          "classification results saved to %s" % (model_path, result_path))
def factors_em(self, max_iter=50, tol=math.sqrt(0.000001)):
    """
    Estimates factors with an EM algorithm to handle missings

    Inputs:
        max_iter: Maximum number of iterations
        tol: Tolerance for convergence between iterations of predicted
             series values

    Algorithm:
        1) initial_nas: Boolean mask of locations of NaNs
        2) working_data: Create standardized data matrix with NaNs replaced
           with means
        3) F: Preliminary factor estimates
        4) data_hat_last: Predicted standardized values of last SVD model.
           data_hat and data_hat_last will not exactly be mean 0, variance 1
        5) Iterate data_hat until convergence
        6) Fill in NaNs from original data

    Saves:
        1) self.svdmodel: sklearn pipeline with standardization step and
           SVD model
        2) self.series_filled: self.series with any NaNs filled in with
           predicted values from self.svdmodel
    """
    # Define our estimation pipeline
    pipe = skpipe.Pipeline([
        ('Standardize',
         self.factor_standardizer_method(self.standard_method)),
        ('Factors', skd.TruncatedSVD(self.Nfactor, algorithm='arpack'))
    ])
    initial_scaler = self.factor_standardizer_method(self.standard_method)

    # Make numpy arrays for calculations
    actual_data = self.series.to_numpy(copy=True)
    initial_nas = self.series.isna().to_numpy(copy=True)
    working_data = initial_scaler.fit_transform(
        self.series.fillna(value=self.series.mean(),
                           axis='index').to_numpy(copy=True))

    # Estimate the initial model
    F = pipe.fit_transform(working_data)
    data_hat_last = pipe.inverse_transform(F)

    # Iterate until the model converges
    n_iter = 0
    distance = tol + 1
    while (n_iter < max_iter) and (distance > tol):
        F = pipe.fit_transform(working_data)
        data_hat = pipe.inverse_transform(F)
        distance = (np.linalg.norm(data_hat - data_hat_last, 2) /
                    np.linalg.norm(data_hat_last, 2))
        data_hat_last = data_hat.copy()
        working_data[initial_nas] = data_hat[initial_nas]
        n_iter += 1

    # Print results
    if n_iter == max_iter:
        print(f"EM algorithm failed to converge after the maximum of "
              f"{max_iter} iterations. Distance = {distance}, "
              f"tolerance was {tol}")
    else:
        print(f"EM algorithm converged after {n_iter} iterations")

    # Save results
    actual_data[initial_nas] = initial_scaler.inverse_transform(
        working_data)[initial_nas]
    self.svdmodel = pipe
    self.series_filled = pd.DataFrame(actual_data,
                                      index=self.series.index,
                                      columns=self.series.columns)
    self.factors = pd.DataFrame(
        F,
        index=self.series_filled.index,
        columns=[f"F{i}" for i in range(1, F.shape[1] + 1)])
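# A self-contained sketch of the same EM-imputation idea on a toy matrix.
# Names and sizes here are illustrative assumptions, not part of the class
# above: standardize, factor with TruncatedSVD, reconstruct, refresh only
# the originally missing cells, and stop when reconstructions stabilize.
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 10))
X[rng.random(X.shape) < 0.1] = np.nan    # knock out ~10% of the entries

nas = np.isnan(X)
work = np.where(nas, np.nanmean(X, axis=0), X)  # start from column means

pipe = Pipeline([("scale", StandardScaler()),
                 ("svd", TruncatedSVD(n_components=3, algorithm="arpack"))])

last = np.zeros_like(work)
for _ in range(50):
    # low-rank reconstruction in the original (unstandardized) space
    hat = pipe.inverse_transform(pipe.fit_transform(work))
    work[nas] = hat[nas]                 # update only the missing cells
    if np.linalg.norm(hat - last) <= 1e-3 * np.linalg.norm(hat):
        break
    last = hat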
def __init__(self, **kwargs):
    self.truncated_svd = decomp.TruncatedSVD(**kwargs)
    wordBucket[key] = plc
    plc += 1

print('the place is: ' + str(plc))
outVec = scipy.sparse.lil_matrix((len(globalDat), len(wordBucket.keys())))
print(str(outVec.shape))
for descN in range(len(text)):
    for importance, word in text[descN]:
        if wordBucket.get(word) is not None:
            outVec[descN, wordBucket[word]] = importance

#wordVecer = txt.CountVectorizer()
#outVec = wordVecer.fit_transform(text)

# cut down on the insane number of text features
#decomp = dec.PCA(n_components=40000)
print('starting truncation')
decomp = dec.TruncatedSVD(n_components=int(outVec.shape[1] / 3))
outVec = decomp.fit_transform(outVec)
outVec = scipy.sparse.csr_matrix(outVec, dtype=float)

#trunc = dec.TruncatedSVD(n_components=outVec.shape[1]/10)
#trunc.fit(outVec)
#outVec = trunc.transform(outVec)
print('description vector shape: ' + str(outVec.shape) + " " +
      str(type(outVec)))

# collect the target
# put all information in usable format
shp = (len(formatted), len(formatted[0]) + 1 + outVec.shape[1])
withTarget = scipy.sparse.lil_matrix(shp, dtype=float)
print("WithTarget: " + str(withTarget.shape))
withTarget = withTarget.transpose()
def pca():
    print("PCA projection is selected")
    embedder = decomposition.TruncatedSVD(n_components=n_components)
    return embedder
def run(fold):
    # load the full training data with folds
    df = pd.read_csv("train_folds.csv")

    # all columns are features except id, target and kfold columns
    features = [f for f in df.columns if f not in ("id", "target", "kfold")]

    # fill all NaN values with NONE
    # note that I am converting all columns to "strings"
    # it doesn't matter because all are categories
    for col in features:
        df.loc[:, col] = df[col].astype(str).fillna("NONE")

    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)

    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # initialize OneHotEncoder from scikit-learn
    ohe = preprocessing.OneHotEncoder()

    # fit ohe on training + validation features
    full_data = pd.concat([df_train[features], df_valid[features]], axis=0)
    ohe.fit(full_data[features])

    # transform training data
    x_train = ohe.transform(df_train[features])

    # transform validation data
    x_valid = ohe.transform(df_valid[features])

    # initialize Truncated SVD
    # we are reducing the data to 120 components
    svd = decomposition.TruncatedSVD(n_components=120)

    # fit svd on the full sparse training data
    full_sparse = sparse.vstack((x_train, x_valid))
    svd.fit(full_sparse)

    # transform sparse training data
    x_train = svd.transform(x_train)

    # transform sparse validation data
    x_valid = svd.transform(x_valid)

    # initialize the model
    model = ensemble.RandomForestClassifier(n_jobs=-1)

    # fit model on training data
    model.fit(x_train, df_train.target.values)

    # predict on validation data
    # we need the probability values as we are calculating AUC
    # we will use the probability of 1s
    valid_preds = model.predict_proba(x_valid)[:, 1]

    # get roc auc score
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)

    # print auc
    print(f"Fold = {fold}, AUC = {auc}")
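# A minimal driver sketch, assuming the usual five-fold setup where the
# kfold column takes values 0..4 (an assumption about train_folds.csv):
if __name__ == "__main__":
    for fold_ in range(5):
        run(fold_)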
import sys

import numpy as np
from minisom import MiniSom
from sklearn import decomposition, manifold
from sklearn.neighbors import LocalOutlierFactor
import networkx as nx

algorithms = [
    decomposition.TruncatedSVD, manifold.MDS, manifold.Isomap,
    manifold.LocallyLinearEmbedding, manifold.TSNE
]

fname = sys.argv[1]
algorithm = int(sys.argv[2])
n_comps = int(sys.argv[3])

x = np.loadtxt(fname)

if algorithm == 0:
    model = decomposition.TruncatedSVD(n_components=n_comps)
    X = model.fit_transform(x)
elif algorithm == 1:
    model = manifold.MDS(n_components=n_comps)
    X = model.fit_transform(x)
elif algorithm == 2:
    model = manifold.Isomap(n_components=n_comps)
    X = model.fit_transform(x)
elif algorithm == 3:
    model = manifold.TSNE(n_components=n_comps)
    X = model.fit_transform(x)
elif algorithm == 4:
    n_points, input_size = x.shape
    som_size = int(np.sqrt(n_points) / 2)
    model = MiniSom(som_size, som_size,
    if title is not None:
        plt.title(title)


# ========================================================================================
# Random projection
print("Computing random projection")
rp = random_projection.SparseRandomProjection(n_components=2, random_state=42)
X_projected = rp.fit_transform(X)
plot_embedding_2d(X_projected, "Random Projection")

# ========================================================================================
# PCA
print("Computing PCA projection")
t0 = time.time()
X_pca = decomposition.TruncatedSVD(n_components=3).fit_transform(X)
plot_embedding_2d(X_pca[:, 0:2], "PCA 2D")
plot_embedding_3d(X_pca, "PCA 3D (time %.2fs)" % (time.time() - t0))

# ========================================================================================
# LDA
print("Computing LDA projection")
X2 = X.copy()
X2.flat[::X.shape[1] + 1] += 0.01  # Make X invertible
t0 = time.time()
lda = LinearDiscriminantAnalysis(n_components=3)
X_lda = lda.fit_transform(X2, y)
plot_embedding_2d(X_lda[:, 0:2], "LDA 2D")
plot_embedding_3d(X_lda, "LDA 3D (time %.2fs)" % (time.time() - t0))

# ========================================================================================
def plot_2d_per_time():
    fig_style = dict(cmap="viridis", marker='o')
    df, _ = models.load_unigrams()
    df_ratios = models.load_ratios()

    ug_normalized = (np.log1p(df).T / np.log1p(df).sum(axis=1)).T
    ug_smoothed = ug_normalized.rolling(
        data_config.unigram_rm_smoothing).mean()

    # .loc/.to_numpy replace the deprecated .ix/.as_matrix
    d_svd_unsmooth = pre.scale(
        decomp.TruncatedSVD(n_components=2).fit_transform(
            pre.StandardScaler().fit_transform(
                ug_normalized.loc[data_config.date_begin:].to_numpy())))
    d_svd_smooth = pre.scale(
        decomp.TruncatedSVD(n_components=2).fit_transform(
            pre.StandardScaler().fit_transform(
                ug_smoothed.loc[data_config.date_begin:].to_numpy())))

    ratios_smoothed = df_ratios.ewm(com=21).mean()
    d_ratios_ica = pre.scale(
        decomp.FastICA(n_components=2).fit_transform(
            pre.scale(
                ratios_smoothed.loc[data_config.date_begin:].to_numpy())))
    d_ratios_ica_restricted = pre.scale(
        decomp.FastICA(n_components=2).fit_transform(
            pre.StandardScaler().fit_transform(ratios_smoothed.loc[
                data_config.date_turning_point:].to_numpy())))

    d_ratios_ica_rotated = d_ratios_ica.dot(
        scipy.linalg.orthogonal_procrustes(d_ratios_ica, d_svd_smooth)[0])
    d_ratios_ica_restricted_rotated = d_ratios_ica_restricted.dot(
        scipy.linalg.orthogonal_procrustes(
            d_ratios_ica_restricted,
            d_svd_smooth[-d_ratios_ica_restricted.shape[0]:])[0])

    fig, ax = plt.subplots(ncols=3)
    idx = [d.toordinal() for d in df.loc[data_config.date_begin:].index.date]
    scatter_svd_unsmooth = ax[0].scatter(*d_svd_unsmooth.T, c=idx, s=15,
                                         **fig_style)
    ax[1].scatter(*d_svd_smooth.T, c=idx, s=15, **fig_style)
    scatter_ica_restricted = ax[2].scatter(
        *d_ratios_ica_restricted_rotated.T,
        c=idx[-d_ratios_ica_restricted.shape[0]:],
        s=15,
        vmin=min(idx),
        vmax=max(idx),
        **fig_style)
    for a in ax:
        a.set_frame_on(False)
        a.get_xaxis().set_visible(False)
        a.get_yaxis().set_visible(False)
    cb = fig.colorbar(scatter_svd_unsmooth,
                      orientation='vertical',
                      ticks=dates.YearLocator(),
                      format=dates.DateFormatter('%Y'))
    cb.outline.set_visible(False)
    fig.savefig("thesis/plots/unigram_decomp.png")