def __init__(
    self,
    n_neighbors=15,
    min_dist=0.1,
    metric="euclidean",
    n_components=2,
    spread=1.0,
    random_state=None,
):
    """Create the wrapped UMAP estimator.

    All arguments are forwarded unchanged to ``UMAP``. ``random_state``
    is forwarded as well, so a caller asking for a seeded embedding
    actually gets one (it used to be accepted and silently dropped).
    """
    self._inst = UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        metric=metric,
        n_components=n_components,
        spread=spread,
        random_state=random_state,
    )
def test_umap_negative_n_neighbours(nn_data):
    """Fitting with a negative ``n_neighbors`` must raise ``ValueError``."""
    model = UMAP(n_neighbors=-1)
    assert_raises(ValueError, model.fit, nn_data)
def test_umap_non_integer_n_components(nn_data):
    """Fitting with a fractional ``n_components`` must raise ``ValueError``."""
    model = UMAP(n_components=1.5)
    assert_raises(ValueError, model.fit, nn_data)
def test_umap_negative_min_dist(nn_data):
    """Fitting with a negative ``min_dist`` must raise ``ValueError``."""
    model = UMAP(min_dist=-1)
    assert_raises(ValueError, model.fit, nn_data)
def test_umap_bad_hellinger_data(nn_data):
    """Fitting the hellinger metric on the negated data must raise ``ValueError``."""
    model = UMAP(metric="hellinger")
    negated = -nn_data
    assert_raises(ValueError, model.fit, negated)
def test_umap_negative_op(nn_data):
    """Fitting with a negative ``set_op_mix_ratio`` must raise ``ValueError``."""
    model = UMAP(set_op_mix_ratio=-1.0)
    assert_raises(ValueError, model.fit, nn_data)
def test_umap_unique_and_precomputed(nn_data):
    """Combining ``unique=True`` with a precomputed metric must raise ``ValueError``."""
    model = UMAP(metric="precomputed", unique=True)
    assert_raises(ValueError, model.fit, nn_data)
def test_densmap_var_shift(nn_data):
    """A negative ``dens_var_shift`` with densMAP enabled must raise ``ValueError``."""
    model = UMAP(densmap=True, dens_var_shift=-1.0)
    assert_raises(ValueError, model.fit, nn_data)
def test_bad_transform_data(nn_data):
    """Transforming with a model fitted on a single row must raise ``ValueError``."""
    single_row = [[1, 1, 1, 1]]
    query = [[0, 0, 0, 0]]
    fitted = UMAP().fit(single_row)
    with pytest.raises(ValueError):
        fitted.transform(query)
'small', 'by', 'this', 'have', 'in', 'obviously', 'ten', 'those', 'vessel', 'good', 'up', 'will', 'combination', 'rather', 'should', 'if', 'so', 'plan', 'interesting', 'chat', 'let', 'now', 'imply', 'the', 'image', 'information', 'get', 'particular', 'test', 'show', 'about', 'strong', 'seventy', 'would', 'two', 'eighty', 'grey', 'at', 'last', 'always', 'blood', 'on', 'first', 'light', 'can', 'point', 'family', 'take', 'between', 'must', 'than', 'dr', 'honest', 'which', 'do', 'seem', 'an', 'all', 'black', '10', ' ', ' ', ' ', 'johnson', 'gosh', 'when', 'far', 'mean', 'with', 'absolutely', 'for', 'make', 'as', 'somewhere', 'screen', 'true', '20', 'correct', 'into', 'specifically', '90', 'dark', 'start', 'bottom', 'then', 'd', '100', 'out', 'line', 'where', 'pass', 'ct', 'i', 'round', 'open', 'mrs', 'clog' ] vectors = [] for word in words: vectors.append(nlp(word).vector) vectors = array(vectors) st.sidebar.subheader('Embedding Visualization') n_words = st.sidebar.slider("Number of words", 1, len(words), 30) words = array(words[:n_words]) vectors = vectors[:n_words, :] reducer = UMAP(n_components=3) scaled_data = StandardScaler().fit_transform(vectors) embedding = reducer.fit_transform(scaled_data) fig = scatter_3d(x=embedding[:, 0], y=embedding[:, 1], z=embedding[:, 2], text=words, hover_name=words) st.plotly_chart(fig)
# Sanitise the distance matrix: replace NaNs, clip negatives to zero and
# rescale by the maximum entry.
distmat = nan_to_num(distmat)
distmat[distmat < 0] = 0.0
# NOTE(review): `max` must be an array-aware maximum here (e.g. numpy's);
# the builtin cannot reduce a 2-D array -- confirm which one is in scope.
distmat = distmat / max(distmat)
print(distmat.shape)

from umap import UMAP
from py.utils.safe_pickle import pickle_dump
import os

dirname = "../../../exact_embeddings/" + distname + "_" + dataset
# makedirs also creates missing parent directories and tolerates the
# directory already existing, which the exists()+mkdir() pair did not.
os.makedirs(dirname, exist_ok=True)

n_neighbors = 40

# Sweep the embedding size, min_dist and spread; every embedding is pickled
# to "<dirname>/<n_components>-<min_dist>-<spread>.p".
for n_components in [50, 100, 300, 1000]:
    for min_dist in [1.0, 1.5, 2.0]:
        for spread in [1.0, 2.5]:
            # Combinations where min_dist exceeds spread are skipped.
            if min_dist > spread:
                continue
            print(n_components, n_neighbors, min_dist, spread)
            t = UMAP(
                n_components=n_components,
                n_neighbors=n_neighbors,
                min_dist=min_dist,
                metric="precomputed",
                random_state=42,
                n_epochs=1000,
                spread=spread,
            )
            embeddings = t.fit_transform(distmat)
            print(embeddings.shape)
            pickle_dump(
                embeddings,
                dirname + "/" + str(n_components) + "-" + str(min_dist)
                + "-" + str(spread) + ".p",
            )
def latent_scatter(var_unk_pred, y_unk_pred, acquisition, **kwargs):
    """Plot observed vs. unknown chemicals in PCA, UMAP and t-SNE space.

    Reads ``chems``, ``chem2feature``, ``idx_obs``, ``idx_unk``,
    ``regress_type`` and ``prot_target`` from ``kwargs``. Prints the
    Spearman and Pearson correlation between each unknown chemical's
    distance to its nearest observed chemical and ``var_unk_pred``, then
    writes three PNG figures per projection under ``figures/``.
    ``y_unk_pred`` is accepted but not used by this function.
    """
    chems = kwargs['chems']
    chem2feature = kwargs['chem2feature']
    idx_obs = kwargs['idx_obs']
    idx_unk = kwargs['idx_unk']
    regress_type = kwargs['regress_type']
    prot_target = kwargs['prot_target']

    # Unique chemical indices taken from the first element of each pair.
    obs_chem_ids = sorted({i for i, _ in idx_obs})
    unk_chem_ids = sorted({i for i, _ in idx_unk})

    feature_obs = np.array([chem2feature[chems[i]] for i in obs_chem_ids])
    feature_unk = np.array([chem2feature[chems[i]] for i in unk_chem_ids])

    # Distance from every unknown chemical to its closest observed one.
    from sklearn.neighbors import NearestNeighbors
    nn_index = NearestNeighbors(n_neighbors=1).fit(feature_obs)
    nn_distances = nn_index.kneighbors(feature_unk)[0]
    dist = np.ravel(nn_distances)

    spearman = ss.spearmanr(dist, var_unk_pred)
    print('Distance Spearman r = {}, P = {}'.format(*spearman))
    pearson = ss.pearsonr(dist, var_unk_pred)
    print('Distance Pearson rho = {}, P = {}'.format(*pearson))

    # Observed rows first (label 0), unknown rows second (label 1).
    X = np.vstack([feature_obs, feature_unk])
    labels = np.concatenate([
        np.zeros(len(obs_chem_ids)),
        np.ones(len(unk_chem_ids)),
    ])
    # Computed as in the original code; the result is not read below.
    sidx = np.argsort(-var_unk_pred)

    from fbpca import pca
    U, s, Vt = pca(X, k=3)
    X_pca = U * s

    from umap import UMAP
    umap_model = UMAP(
        n_neighbors=15,
        min_dist=0.5,
        n_components=2,
        metric='euclidean',
    )
    X_umap = umap_model.fit_transform(X)

    from MulticoreTSNE import MulticoreTSNE as TSNE
    tsne_model = TSNE(n_components=2, n_jobs=20)
    X_tsne = tsne_model.fit_transform(X)

    suffix = '' if prot_target is None else '_' + prot_target

    is_unk = labels == 1
    is_obs = labels == 0

    projections = (('pca', X_pca), ('umap', X_umap), ('tsne', X_tsne))
    for name, coords in projections:
        unk_x, unk_y = coords[is_unk, 0], coords[is_unk, 1]

        # Figure 1: unknown points in blue, observed points as orange crosses.
        plt.figure()
        sns.scatterplot(x=unk_x, y=unk_y, color='blue', alpha=0.1)
        plt.scatter(
            x=coords[is_obs, 0],
            y=coords[is_obs, 1],
            color='orange',
            alpha=1.0,
            marker='x',
            linewidths=10,
        )
        plt.savefig(
            'figures/latent_scatter_{}_ypred_{}{}.png'.format(
                name, regress_type, suffix),
            dpi=300,
        )
        plt.close()

        # Figure 2: unknown points coloured by the rank of var_unk_pred.
        plt.figure()
        plt.scatter(x=unk_x, y=unk_y, c=ss.rankdata(var_unk_pred),
                    alpha=0.1, cmap='coolwarm')
        plt.savefig(
            'figures/latent_scatter_{}_var_{}{}.png'.format(
                name, regress_type, suffix),
            dpi=300,
        )
        plt.close()

        # Figure 3: unknown points coloured by the negated acquisition value.
        plt.figure()
        plt.scatter(x=unk_x, y=unk_y, c=-acquisition,
                    alpha=0.1, cmap='hot')
        plt.savefig(
            'figures/latent_scatter_{}_acq_{}{}.png'.format(
                name, regress_type, suffix),
            dpi=300,
        )
        plt.close()
def visualize_topics(topic_model,
                     topics: List[int] = None,
                     top_n_topics: int = None,
                     width: int = 650,
                     height: int = 650) -> go.Figure:
    """ Visualize topics, their sizes, and their corresponding words

    This visualization is highly inspired by LDAvis, a great visualization
    technique typically reserved for LDA.

    Arguments:
        topic_model: A fitted BERTopic instance.
        topics: A selection of topics to visualize
        top_n_topics: Only select the top n most frequent topics
        width: The width of the figure.
        height: The height of the figure.

    Returns:
        The figure built by `_plotly_topic_visualization`.

    Usage:

    To visualize the topics simply run:

    ```python
    topic_model.visualize_topics()
    ```

    Or if you want to save the resulting figure:

    ```python
    fig = topic_model.visualize_topics()
    fig.write_html("path/to/file.html")
    ```
    """
    # Select topics based on top_n and topics args
    if topics is not None:
        topics = list(topics)
    elif top_n_topics is not None:
        topics = sorted(
            topic_model.get_topic_freq().Topic.to_list()[1:top_n_topics + 1])
    else:
        topics = sorted(list(topic_model.get_topics().keys()))

    # Extract topic words and their frequencies
    topic_list = sorted(topics)
    frequencies = [topic_model.topic_sizes[topic] for topic in topic_list]
    words = [
        " | ".join([word[0] for word in topic_model.get_topic(topic)[:5]])
        for topic in topic_list
    ]

    # Embed c-TF-IDF into 2D.
    # The rows are selected in `topic_list` (sorted) order so that every
    # embedding row lines up with its entry in `topic_list`, `words` and
    # `frequencies`; indexing by the caller's `topics` order misaligned them
    # whenever an unsorted selection was passed in.
    all_topics = sorted(list(topic_model.get_topics().keys()))
    indices = np.array([all_topics.index(topic) for topic in topic_list])
    embeddings = topic_model.c_tf_idf.toarray()[indices]
    embeddings = MinMaxScaler().fit_transform(embeddings)
    embeddings = UMAP(n_neighbors=2, n_components=2,
                      metric='hellinger').fit_transform(embeddings)

    # Visualize with plotly.
    # NOTE(review): the [1:] slices drop the first (smallest) topic of the
    # sorted selection -- presumably the -1 outlier topic; confirm this is
    # still intended when `topics` or `top_n_topics` excludes it.
    df = pd.DataFrame({
        "x": embeddings[1:, 0],
        "y": embeddings[1:, 1],
        "Topic": topic_list[1:],
        "Words": words[1:],
        "Size": frequencies[1:]
    })
    return _plotly_topic_visualization(df, topic_list, width, height)
def plot_embedding(X, labels, classes=None, method='tSNE', cmap='tab20',
                   figsize=(4, 4), markersize=4, marker=None,
                   return_emb=False, save=False, save_emb=False,
                   show_legend=True, show_axis_label=True, **legend_params):
    """Scatter-plot a 2-D embedding of ``X`` coloured by ``labels``.

    Parameters
    ----------
    X : array
        Data matrix. If it does not already have two columns it is reduced
        with the projection named by ``method``.
    labels : array-like
        One label per row of ``X``; lists are accepted and converted to an
        array so the per-class boolean masks work.
    classes : sequence, optional
        Labels to draw, in drawing order. Defaults to ``np.unique(labels)``.
    method : {'tSNE', 'UMAP', 'PCA'}
        Projection to apply, also used for the axis labels. Any other value
        leaves ``X`` untouched.
    cmap : str or None
        Palette name handed to seaborn. ``None`` picks ``'tab10'``,
        ``'tab20'`` or ``'husl'`` from the number of classes.
    figsize, markersize
        Figure size and scatter marker size.
    marker : array, optional
        Extra rows appended to ``X`` before projection and drawn as black
        stars.
    return_emb : bool
        Return the 2-D coordinates (including marker rows) when true.
    save : str or False
        Path to write a PDF to; when falsy the figure is shown instead.
    save_emb : str or False
        Path to write the coordinates to with ``np.savetxt``.
    show_legend, show_axis_label : bool
        Toggle the legend and the axis labels.
    **legend_params
        Overrides for the default legend keyword arguments.

    Returns
    -------
    The embedding array when ``return_emb`` is true, otherwise ``None``.
    """
    # A plain list would make `labels == c` a scalar and break the masks.
    labels = np.asarray(labels)

    if marker is not None:
        X = np.concatenate([X, marker], axis=0)
    N = len(labels)

    if X.shape[1] != 2:
        if method == 'tSNE':
            from MulticoreTSNE import MulticoreTSNE as TSNE
            X = TSNE(n_components=2, random_state=124,
                     n_jobs=32).fit_transform(X)
        elif method == 'UMAP':
            from umap import UMAP
            X = UMAP(n_neighbors=30, min_dist=0.3,
                     metric='correlation').fit_transform(X)
        elif method == 'PCA':
            from sklearn.decomposition import PCA
            X = PCA(n_components=2, random_state=124).fit_transform(X)

    plt.figure(figsize=figsize)
    if classes is None:
        classes = np.unique(labels)

    # Only choose a palette automatically when the caller passed cmap=None.
    if cmap is None:
        if len(classes) <= 10:
            cmap = 'tab10'
        elif len(classes) <= 20:
            cmap = 'tab20'
        else:
            cmap = 'husl'
    colors = sns.color_palette(cmap, n_colors=len(classes))

    # The first N rows are the labelled points; marker rows come after them.
    points = X[:N]
    for i, c in enumerate(classes):
        mask = labels == c
        plt.scatter(points[mask, 0], points[mask, 1], s=markersize,
                    color=colors[i], label=c)
    if marker is not None:
        plt.scatter(X[N:, 0], X[N:, 1], s=10 * markersize,
                    color='black', marker='*')

    legend_params_ = {
        'loc': 'center left',
        'bbox_to_anchor': (1.0, 0.45),
        'fontsize': 10,
        'ncol': 1,
        'frameon': False,
        'markerscale': 1.5,
    }
    legend_params_.update(**legend_params)
    if show_legend:
        plt.legend(**legend_params_)
    sns.despine(offset=10, trim=True)
    if show_axis_label:
        plt.xlabel(method + ' dim 1', fontsize=12)
        plt.ylabel(method + ' dim 2', fontsize=12)

    if save:
        plt.savefig(save, format='pdf', bbox_inches='tight')
    else:
        plt.show()

    if save_emb:
        np.savetxt(save_emb, X)
    if return_emb:
        return X
class TUmap(Transform):
    """
    Transform that embeds a feature pool with UMAP.

    n_neighbors: This determines the number of neighboring points used in
        local approximations of manifold structure. Larger values will result
        in more global structure being preserved at the loss of detailed local
        structure. In general this parameter should often be in the range 5 to
        50, with a choice of 10 to 15 being a sensible default.

    min_dist: This controls how tightly the embedding is allowed to compress
        points together. Larger values ensure embedded points are more evenly
        distributed, while smaller values allow the algorithm to optimise more
        accurately with regard to local structure. Sensible values are in the
        range 0.001 to 0.5, with 0.1 being a reasonable default.

    metric: This determines the choice of metric used to measure distance in
        the input space. A wide variety of metrics are already coded, and a
        user defined function can be passed as long as it has been JITd by
        numba.

    n_components, spread, random_state: Forwarded unchanged to ``UMAP``.
    """

    def __init__(
        self,
        n_neighbors=15,
        min_dist=0.1,
        metric="euclidean",
        n_components=2,
        spread=1.0,
        random_state=None,
    ):
        # random_state used to be accepted but never passed on, so seeded
        # runs were silently unseeded.
        self._inst = UMAP(
            n_neighbors=n_neighbors,
            min_dist=min_dist,
            metric=metric,
            n_components=n_components,
            spread=spread,
            random_state=random_state,
        )

    def transform(self, fp):
        """Fit UMAP on the pool's array and yield one Feature per column."""
        x = FeaturePool(fp).array()
        logger.info("TUmap: starting UMAP transform ...")
        x_emb = self._inst.fit_transform(x)
        logger.info("TUmap: Done")
        for f_id in range(x_emb.shape[1]):
            yield Feature(
                "UMAP feature #{}".format(f_id),
                x_emb[:, f_id]
            )

    @staticmethod
    def plot_embedding(efp: FeaturePool, split_by=None):
        """Scatter-plot a 2-column embedded pool, optionally coloured.

        ``split_by`` is a feature whose ``data`` supplies the point colours
        and whose ``name`` goes into the title. An ``AssertionError`` is
        raised when the pool's array does not have exactly two columns.
        """
        x = efp.array()
        assert x.shape[1] == 2, "Embedding is expected to be with the size 2 to plot, got {}".format(x.shape[1])

        fig = plt.figure(figsize=(7, 7))
        ax = fig.add_subplot(111)

        if split_by is not None:
            ax.scatter(x[:, 0], x[:, 1], c=split_by.data, alpha=0.5)
            ax.set_title(
                "UMAP for a feature pool splitted by feature `{}`".format(split_by.name)
            )
        else:
            ax.scatter(x[:, 0], x[:, 1], alpha=0.5)
            ax.set_title(
                "UMAP for a feature pool"
            )
        fig.show()
from sklearn.datasets import fetch_openml from sklearn.model_selection import train_test_split from reval.best_nclust_cv import FindBestClustCV from sklearn.neighbors import KNeighborsClassifier from sklearn.cluster import AgglomerativeClustering from sklearn.metrics import zero_one_loss, adjusted_mutual_info_score import matplotlib.pyplot as plt from umap import UMAP from reval.visualization import plot_metrics from reval.relative_validation import _kuhn_munkres_algorithm # MNIST dataset with 10 classes mnist, label = fetch_openml('mnist_784', version=1, return_X_y=True) transform = UMAP(n_neighbors=30, min_dist=0.0, n_components=10, random_state=42) # Stratified subsets of 7000 elements for both training and test set mnist_tr, mnist_ts, label_tr, label_ts = train_test_split(mnist, label, train_size=0.1, test_size=0.1, random_state=42, stratify=label) # Dimensionality reduction with UMAP as pre-processing step mnist_tr = transform.fit_transform(mnist_tr) mnist_ts = transform.transform(mnist_ts) plt.scatter(mnist_tr[:, 0],
def test_blobs_cluster():
    """KMeans on the UMAP embedding of five blobs must recover the blobs exactly."""
    data, labels = make_blobs(n_samples=500, n_features=10, centers=5)
    reducer = UMAP(n_epochs=100)
    embedding = reducer.fit_transform(data)
    predicted = KMeans(5).fit_predict(embedding)
    assert adjusted_rand_score(labels, predicted) == 1.0
def test_densmap_frac(nn_data):
    """``dens_frac`` values of -1.0 and 2.0 must both raise ``ValueError``."""
    for bad_frac in (-1.0, 2.0):
        model = UMAP(densmap=True, dens_frac=bad_frac)
        assert_raises(ValueError, model.fit, nn_data)
for neighbors in n_neighbors: for min_d in min_dist: for k in ks: print(dataset + ', ' + metric + ', ' + scaler + ', n_components=' + str(components) + ', n_neighbors=' + str(neighbors) + ', min_dist=' + str(min_d) + ', k=' + str(k)) # Se estandariza usando el scaler correspondiente df = scalers[scaler].fit_transform( datasets[dataset]) # Se aplica UMAP um = UMAP(n_components=components, n_neighbors=neighbors, min_dist=min_d, metric=metric) embedding = um.fit_transform(df) # Se calculan las validaciones internas sil = get_silhouette_avg(embedding, k) sse = get_sse(embedding, k) # Se aplica KMeans km = KMeans(n_clusters=k, random_state=0).fit(embedding) # Se calcula la matriz de confusión tmp = pd.DataFrame({ 'Generos': metadata.genre, 'data': km.labels_
def test_densmap_bad_output_metric(nn_data):
    """densMAP combined with the haversine output metric must raise ``ValueError``."""
    model = UMAP(densmap=True, output_metric="haversine")
    assert_raises(ValueError, model.fit, nn_data)
#all_docs_tagged = [TaggedDocument(doc, [i]) for i, doc in all_docs.items()] tagged_docs = [TaggedDocument(doc, [i]) for i, doc in X.items()] print("Train Doc2Vec model...") #model = Doc2Vec(vector_size=50, min_count=2, epochs=40) #model.build_vocab(all_docs_tagged) model = Doc2Vec.load(model_path) print("Infer doc vectors...") docvecs = X.progress_apply(lambda x: model.infer_vector(x)) docvecs = list(docvecs) #docvecs = docvecs.to_numpy() #print("dim reduction ...") dim_reduced_vecs = UMAP(metric="cosine", set_op_mix_ratio=0, n_components=n_comps, random_state=42).fit_transform(docvecs) print("dim reduction 2D ...") vecs_2d = UMAP(metric="cosine", set_op_mix_ratio=0, n_components=2, random_state=42).fit_transform(docvecs) #print("Local outlier factor ...") #df["predicted"] = LocalOutlierFactor( # novelty=False, metric="euclidean", contamination=d["contamination"]).fit_predict(dim_reduced_vecs) print("HDBSCAN ...") #dim_reduced_vecs = normalize(dim_reduced_vecs, norm="l2") clusterer = HDBSCAN(min_cluster_size=min_cluster_size,
def test_umap_bad_n_jobs(nn_data):
    """``n_jobs`` values of -2 and 0 must both raise ``ValueError``."""
    for bad_jobs in (-2, 0):
        model = UMAP(n_jobs=bad_jobs)
        assert_raises(ValueError, model.fit, nn_data)
def _show_label_scatter(points, labels, title, cmap='tab20'):
    """Show a 2-D scatter of ``points`` coloured by ``labels`` with a legend."""
    fig, ax = plt.subplots(figsize=(10, 8))
    scatter = ax.scatter(points[:, 0], points[:, 1],
                         c=labels, cmap=cmap, s=0.1)
    legend = ax.legend(*scatter.legend_elements())
    ax.add_artist(legend)
    plt.title(title)
    plt.show()


def example2():
    """Run relative clustering validation on UMAP-embedded MNIST.

    Embeds the first 60000 samples with UMAP and transforms the rest with the
    fitted model, selects the number of clusters with ``FindBestClustCV``
    (KNN classifier + HDBSCAN), logs stability and AMI metrics, and compares
    against silhouette- and Davies-Bouldin-based selection. Each labelling
    is also shown as a scatter plot.
    """
    mnist = fetch_openml('mnist_784', version=1)
    mnist.target = mnist.target.astype(int)

    X_tr, y_tr = mnist['data'][:60000], mnist.target[:60000]
    X_ts, y_ts = mnist['data'][60000::], mnist.target[60000::]
    transform = UMAP(n_components=2,
                     random_state=42,
                     n_neighbors=30,
                     min_dist=0.0)
    X_tr = transform.fit_transform(X_tr)
    X_ts = transform.transform(X_ts)

    s = KNeighborsClassifier(n_neighbors=30)
    c = hdbscan.HDBSCAN(min_samples=10,
                        min_cluster_size=200)

    reval = FindBestClustCV(s=s,
                            c=c,
                            nfold=2,
                            nrand=10,
                            n_jobs=N_JOBS)

    metrics, nclustbest, tr_lab = reval.best_nclust(X_tr, iter_cv=10, strat_vect=y_tr)

    plot_metrics(metrics)

    out = reval.evaluate(X_tr, X_ts, nclust=nclustbest, tr_lab=tr_lab)
    perm_lab = kuhn_munkres_algorithm(y_ts, out.test_cllab)

    logging.info(f"Validation stability: {metrics['val'][nclustbest]}")
    logging.info(f"Best number of clusters during CV: {nclustbest}")
    logging.info(f"Best number of clusters on test set: "
                 f"{len([lab for lab in np.unique(out.test_cllab) if lab >= 0])}")
    logging.info(f'AMI (true labels vs predicted labels) = '
                 f'{adjusted_mutual_info_score(y_ts, out.test_cllab)}')
    logging.info('\n\n')

    logging.info("Metrics from true label comparisons on test set:")
    class_scores = compute_metrics(y_ts, perm_lab)
    for k, val in class_scores.items():
        logging.info(f'{k}, {val}')
    logging.info('\n\n')

    # Visualization
    _show_label_scatter(X_tr, y_tr,
                        "Train set true labels (digits dataset)",
                        cmap='rainbow_r')
    _show_label_scatter(X_tr, kuhn_munkres_algorithm(y_tr, tr_lab),
                        "Train set predicted labels (digits dataset)")
    _show_label_scatter(X_ts, y_ts,
                        "Test set true labels (digits dataset)")
    _show_label_scatter(X_ts, perm_lab,
                        "Test set clustering labels (digits dataset)")

    # Internal measures
    # SILHOUETTE
    logging.info("Silhouette score based selection")
    sil_score_tr, sil_best_tr, sil_label_tr = select_best(X_tr, c, silhouette_score, select='max')
    sil_score_ts, sil_best_ts, sil_label_ts = select_best(X_ts, c, silhouette_score, select='max')
    logging.info(
        f"Best number of clusters (and scores) for tr/ts independent runs: "
        f"{sil_best_tr}({sil_score_tr})/{sil_best_ts}({sil_score_ts})")
    logging.info(f'AMI (true labels vs clustering labels) training = '
                 f'{adjusted_mutual_info_score(y_tr, kuhn_munkres_algorithm(y_tr, sil_label_tr))}')
    logging.info(f'AMI (true labels vs clustering labels) test = '
                 f'{adjusted_mutual_info_score(y_ts, kuhn_munkres_algorithm(y_ts, sil_label_ts))}')
    logging.info('\n\n')

    # DAVIES-BOULDIN
    logging.info("Davies-Bouldin score based selection")
    db_score_tr, db_best_tr, db_label_tr = select_best(X_tr, c, davies_bouldin_score,
                                                       select='min')
    db_score_ts, db_best_ts, db_label_ts = select_best(X_ts, c, davies_bouldin_score,
                                                       select='min')

    logging.info(
        f"Best number of clusters (and scores) for tr/ts independent runs: "
        f"{db_best_tr}({db_score_tr})/{db_best_ts}({db_score_ts})")
    logging.info(f'AMI (true labels vs clustering labels) training = '
                 f'{adjusted_mutual_info_score(y_tr, kuhn_munkres_algorithm(y_tr, db_label_tr))}')
    logging.info(f'AMI (true labels vs clustering labels) test = '
                 f'{adjusted_mutual_info_score(y_ts, kuhn_munkres_algorithm(y_ts, db_label_ts))}')
    logging.info('\n\n')

    # Visualization
    _show_label_scatter(X_tr, sil_label_tr,
                        "Train set silhouette labels (digits dataset)")
    # The original added the legend twice on this figure; once is enough.
    _show_label_scatter(X_ts, sil_label_ts,
                        "Test set silhouette labels (digits dataset)")
    _show_label_scatter(X_tr, db_label_tr,
                        "Train set Davies-Bouldin labels (digits dataset)")
    _show_label_scatter(X_ts, db_label_ts,
                        "Test set Davies-Bouldin labels (digits dataset)")
def test_umap_too_large_op(nn_data):
    """A ``set_op_mix_ratio`` of 1.5 must raise ``ValueError`` on fit."""
    model = UMAP(set_op_mix_ratio=1.5)
    assert_raises(ValueError, model.fit, nn_data)
def example3(n_jobs, preprocess=None):
    """
    Select the best classifier/clustering pair for each UCI benchmark dataset.

    :param n_jobs: number of jobs passed on to ``SCParamSelection``.
    :param preprocess: it can be 'scaled', 'umap', 'scaled+umap'
        (the spelling 'umap+scaled' is accepted as well), default None for
        raw processing.
    :type preprocess: str
    :return:
    """
    # Candidate classifiers
    s = [LogisticRegression(solver='liblinear',
                            random_state=42),
         RandomForestClassifier(n_estimators=100,
                                random_state=42),
         KNeighborsClassifier(n_neighbors=1,
                              metric='euclidean'),
         SVC(C=1,
             random_state=42)]

    # Candidate clustering algorithms
    c = [AgglomerativeClustering(),
         KMeans(random_state=42),
         hdbscan.HDBSCAN()]

    scparam = {'s': s,
               'c': c}

    transform = UMAP(n_neighbors=30, min_dist=0.0, random_state=42)
    scale = StandardScaler()

    # Import benchmark datasets
    uci_data = build_ucidatasets()

    # Run ensemble learning algorithm
    best_results = {}
    for data, name in zip(uci_data, uci_data._fields):
        # The SVC (last classifier) gets gamma = 1 / number of rows.
        scparam['s'][-1].gamma = (1 / data['data'].shape[0])
        nclass = len(np.unique(data['target']))
        logging.info(f"Processing dataset {name}")
        logging.info(f"True number of classes: {nclass}\n")
        # Only the training part of the split is used below.
        X_tr, _, y_tr, _ = train_test_split(data['data'],
                                            data['target'],
                                            test_size=0.40,
                                            random_state=42,
                                            stratify=data['target'])
        # The docstring advertised 'scaled+umap' while only 'umap+scaled'
        # was matched, so the documented value silently ran on raw data.
        # Both spellings now scale first and then embed.
        if preprocess in ('umap+scaled', 'scaled+umap'):
            X_tr = transform.fit_transform(scale.fit_transform(X_tr))
        elif preprocess == 'umap':
            X_tr = transform.fit_transform(X_tr)
        elif preprocess == 'scaled':
            X_tr = scale.fit_transform(X_tr)

        scparam_select = SCParamSelection(sc_params=scparam,
                                          cv=2,
                                          nrand=10,
                                          clust_range=list(range(2, nclass + 3, 1)),
                                          n_jobs=n_jobs,
                                          iter_cv=10,
                                          strat=y_tr)
        scparam_select.fit(X_tr, nclass=nclass)
        best_results[name] = scparam_select.best_param_
        # Uncomment to save the results
        # pkl.dump(best_results, open('./best_resultUCI_scaledumap.pkl', 'wb'))
        logging.info('*' * 100)
        logging.info('\n\n')
def test_umap_negative_n_components(nn_data):
    """Fitting with a negative ``n_components`` must raise ``ValueError``."""
    model = UMAP(n_components=-1)
    assert_raises(ValueError, model.fit, nn_data)
def test_haversine_on_highd(nn_data):
    """Fitting ``nn_data`` with the haversine input metric must raise ``ValueError``."""
    model = UMAP(metric="haversine")
    assert_raises(ValueError, model.fit, nn_data)
def test_umap_too_small_n_neighbours(nn_data):
    """An ``n_neighbors`` of 0.5 must raise ``ValueError`` on fit."""
    model = UMAP(n_neighbors=0.5)
    assert_raises(ValueError, model.fit, nn_data)
def test_umap_haversine_embed_to_highd(nn_data):
    """A haversine output metric with three components must raise ``ValueError``."""
    model = UMAP(n_components=3, output_metric="haversine")
    assert_raises(ValueError, model.fit, nn_data)
from datetime import datetime

from util import getKaggleMNIST
from sklearn.linear_model import LogisticRegression
from umap import UMAP

# Load the train/test split.
Xtrain, Ytrain, Xtest, Ytest = getKaggleMNIST()

# Baseline: logistic regression on the untransformed features.
print("Score without transformation:")
model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print(model.score(Xtrain, Ytrain))
print(model.score(Xtest, Ytest))

# Embed into 10 dimensions, timing the fit and the transform separately.
umapper = UMAP(n_neighbors=5, n_components=10)

t0 = datetime.now()
Ztrain = umapper.fit_transform(Xtrain)
elapsed = datetime.now() - t0
print("umap fit_transform took:", elapsed)

t0 = datetime.now()
Ztest = umapper.transform(Xtest)
elapsed = datetime.now() - t0
print("umap transform took:", elapsed)

# Same classifier on the embedded features; only the fit itself is timed.
print("Score with transformation")
model = LogisticRegression()
t0 = datetime.now()
model.fit(Ztrain, Ytrain)
elapsed = datetime.now() - t0
print("logistic regression fit took:", elapsed)
print(model.score(Ztrain, Ytrain))
print(model.score(Ztest, Ytest))
def test_umap_too_many_neighbors_warns(nn_data):
    """Fit with ``n_neighbors`` above the sample count and check ``a``/``b`` are kept."""
    model = UMAP(a=1.2, b=1.75, n_neighbors=2000, n_epochs=11, init="random")
    subset = nn_data[:100, ]
    model.fit(subset)
    assert_equal(model._a, 1.2)
    assert_equal(model._b, 1.75)
def test_densmap_lambda(nn_data):
    """A negative ``dens_lambda`` with densMAP enabled must raise ``ValueError``."""
    model = UMAP(densmap=True, dens_lambda=-1.0)
    assert_raises(ValueError, model.fit, nn_data)
def get_factor_df(self, ids=None, embedding_dim=2, batch_size=128, valid_ages=None):
    """Build a long-format DataFrame of factors per (day, entity) pair.

    Entities are pushed through ``self.forward`` in batches of
    ``batch_size`` together with all day indices. Each row holds ``age``,
    ``day`` and one ``f_<k>`` column per factor. ``bee_id`` and
    ``valid_age`` columns are prepended when ``ids`` / ``valid_ages`` are
    given, and ``e_<k>`` columns are appended when ``embedding_dim`` is not
    None. The embedding values are the absolute embedding weights, used
    directly when ``embedding_dim`` equals ``self.num_embeddings`` and
    reduced with UMAP otherwise. Rows with a negative age are dropped.
    """
    if embedding_dim is not None:
        if embedding_dim != self.num_embeddings:
            from umap import UMAP
            reducer = UMAP(n_components=embedding_dim)
            embs_reduced = reducer.fit_transform(
                np.abs(self.embeddings.weight.data.cpu().numpy()))
        else:
            embs_reduced = np.abs(self.embeddings.weight.data.cpu().numpy())

    frames = []
    with torch.no_grad():
        day_idxs = torch.arange(self.num_days, dtype=torch.long)
        start = 0
        while start < self.num_entities:
            stop = min((start + batch_size, self.num_entities))
            batch_idxs = torch.arange(start, stop)
            # Only the fourth of the six forward outputs is used.
            _, _, _, factors_by_emb, _, _ = self.forward(day_idxs, batch_idxs)
            start += batch_size

            n_batch = len(batch_idxs)
            ages_col = self.ages[:, batch_idxs].numpy().flatten()
            factor_cols = factors_by_emb.data.cpu().numpy().reshape(
                -1, self.num_factors)
            # Day index repeated once per entity of the batch.
            days_col = np.repeat(np.arange(self.num_days), n_batch)

            columns = ["age", "day"]
            columns += [f"f_{f}" for f in range(self.num_factors)]
            table = np.column_stack((ages_col, days_col, factor_cols))

            if ids is not None:
                id_col = np.tile(ids[batch_idxs][None, :],
                                 (self.num_days, 1)).flatten()
                columns = ["bee_id"] + columns
                table = np.column_stack((id_col, table))

            if valid_ages is not None:
                valid_col = valid_ages[:, batch_idxs].flatten()
                columns = ["valid_age"] + columns
                table = np.column_stack((valid_col, table))

            if embedding_dim is not None:
                emb_cols = np.tile(embs_reduced[batch_idxs][None, :],
                                   (self.num_days, 1)).reshape(-1, embedding_dim)
                columns += [f"e_{f}" for f in range(embedding_dim)]
                table = np.column_stack((table, emb_cols))

            frames.append(pd.DataFrame(table, columns=columns))

    combined = pd.concat(frames).reset_index(drop=True)
    return combined[combined.age >= 0]