Example #1
import time

import cuml
import cupy as cp
import dash_bootstrap_components as dbc
import plotly.express as px


# `gdf` is assumed to be a cudf.DataFrame of transactions loaded elsewhere.
def update_graph(amt, hrs):
    t0 = time.time()
    # First, filter based on the slider values
    time_mask = (gdf.Time >= hrs[0]) & (gdf.Time <= hrs[1])
    amount_mask = (gdf.Amount >= amt[0]) & (gdf.Amount <= amt[1])
    filt_df = gdf.loc[time_mask & amount_mask]

    # Then, select the features and train a UMAP model with cuML
    features = filt_df.loc[:, "V1":"V28"].values
    reducer = cuml.UMAP()
    embedding = reducer.fit_transform(features)

    # Convert the embedding back to numpy
    embedding = cp.asnumpy(embedding)
    amount = cp.asnumpy(filt_df.Amount.values.round(2))

    # Create a plotly.express scatter plot
    fig = px.scatter(
        x=embedding[:, 0],
        y=embedding[:, 1],
        color=amount,
        labels={"color": "Amount ($)"},
        title="UMAP projection of credit card transactions",
    )

    t1 = time.time()
    out_msg = f"Projected {embedding.shape[0]} transactions in {t1-t0:.2f}s."
    alert = dbc.Alert(out_msg, color="success", dismissable=True)

    return fig, alert
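
For context, a sketch of how this callback might be wired up, assuming a Dash app named app and hypothetical component IDs (none of these names appear in the snippet itself):

# A minimal sketch; `app`, the component IDs, and the layout are assumptions.
from dash.dependencies import Input, Output

app.callback(
    [Output("umap-graph", "figure"), Output("alert-box", "children")],
    [Input("amount-slider", "value"), Input("hours-slider", "value")],
)(update_graph)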
Example #2
import warnings

import cuml
import umap
from numba.core.errors import NumbaPerformanceWarning


def get_UMAP_prjs(input_data, cpu=True, **kwargs):
    "Compute the projections of `input_data` using UMAP, with a configuration contained in `**kwargs`."
    warnings.filterwarnings(
        "ignore",
        category=NumbaPerformanceWarning)  # silence NumbaPerformanceWarning
    reducer = umap.UMAP(**kwargs) if cpu else cuml.UMAP(**kwargs)
    projections = reducer.fit_transform(input_data)
    return projections
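
A quick usage sketch (with hypothetical random data; the keyword arguments pass straight through to the UMAP constructor):

# Usage sketch: project 1,000 random 50-d points to 2-D on the CPU.
import numpy as np

X = np.random.rand(1000, 50).astype(np.float32)
projs = get_UMAP_prjs(X, cpu=True, n_components=2, n_neighbors=15)
print(projs.shape)  # (1000, 2)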
Example #3
def getMapper(self, X, y=None, **kwargs):
    if self._mapper is None:
        t0 = time.time()
        print(f"Computing embedding, input shape = {X.shape}")
        input_data: cudf.DataFrame = self.getDataFrame(X)
        self._mapper = cuml.UMAP(init=self.init,
                                 n_neighbors=self.n_neighbors,
                                 n_components=self.n_components,
                                 n_epochs=self.n_epochs,
                                 min_dist=self.min_dist,
                                 output_type="numpy")
        self._mapper.fit(input_data)
        # The fitted embedding lives on the mapper, not on self
        print(
            f"Completed UMAP fit in {time.time() - t0:.2f} sec, "
            f"embedding shape = {self._mapper.embedding_.shape}"
        )
    return self._mapper
Example #4
def cuml_umap(config, feature):
    # Import RAPIDS lazily so the module loads without a GPU
    import cudf, cuml
    import numpy as np

    print("INSIDE CUML_UMAP")
    print(feature.shape)
    num_fr = feature.shape[0]
    embed = np.zeros((num_fr, config['n_components']))

    # Possible memory optimization to try later: cast the input to float32
    # and compare embeddings to see whether the smaller dtype changes them.
    # df = cudf.DataFrame(feature, dtype='float32')
    df = cudf.DataFrame(feature)

    cu_embed = cuml.UMAP(n_components=config['n_components'],
                         n_neighbors=config['n_neighbors'],
                         n_epochs=config['n_epochs'],
                         min_dist=config['min_dist'],
                         spread=config['spread'],
                         negative_sample_rate=config['negative_sample_rate'],
                         init=config['init'],
                         repulsion_strength=config['repulsion_strength'],
                         output_type='numpy').fit_transform(df)
    embed[:, 0:config['n_components']] = cu_embed
    return embed
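
The commented-out float32 idea above could be checked along these lines (a sketch; note that UMAP is stochastic, so the comparison is only indicative unless a fixed seed is threaded through `config`):

# Sketch: compare embeddings from float64 vs. float32 inputs, as the
# comment in cuml_umap suggests. Assumes `config` and `feature` as above.
embed64 = cuml_umap(config, feature)
embed32 = cuml_umap(config, feature.astype('float32'))
print("max abs difference:", abs(embed64 - embed32).max())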
Example #5
def prep_trumap_inputs(
        interpret_session: InterpretationSession, sid_idx: torch.Tensor,
        topk_sim_full_embeds: List, target_text: str,
        target_pred_ind: int) -> Tuple[List[Tuple], Tuple, Tuple]:
    full_embed_df = cudf.DataFrame([tuple(k) for k in topk_sim_full_embeds])
    # .to_numpy() replaces the long-deprecated .as_matrix()
    umap_xformed_embeds = cuml.UMAP(
        n_neighbors=5, n_components=3, n_epochs=500,
        min_dist=0.1).fit_transform(full_embed_df).to_numpy()
    umap_normed_embeds = (umap_xformed_embeds /
                          np.linalg.norm(umap_xformed_embeds)).tolist()
    max_truth_umap, max_falsehood_umap, target_sent_umap = [
        umap_normed_embeds.pop() for _ in range(3)
    ]
    trumap_spectrum = []
    for idx, umapval in zip(sid_idx.tolist(), umap_normed_embeds):
        trumap_spectrum.append(
            (interpret_session.stmt_embed_dict['stext'][idx],
             interpret_session.stmt_embed_dict['labels'][idx], tuple(umapval)))
    target_token_tup = (target_text, target_pred_ind, tuple(target_sent_umap))
    umap_bounds_tup = (tuple(max_falsehood_umap), tuple(max_truth_umap))
    return trumap_spectrum, target_token_tup, umap_bounds_tup
Example #6
def reduce_to_3D(data, labels, dimReductionMethod, trainedEmbeddingModel=None):

    startTime = time.time()

    preTrainedStr = ''
    '''
    if dimReductionMethod == 'TSNE':        
        embeddingModel = None
        embeddedData = cuml.TSNE( n_components = 2 ).fit_transform ( X = data )
        embeddedData.add_column('3', cudf.Series(np.zeros((data.shape[0]))) )
    else:
    '''
    if trainedEmbeddingModel is not None:
        preTrainedStr = 'pre-trained '
        embeddingModel = trainedEmbeddingModel
    else:
        if dimReductionMethod == 'PCA':
            embeddingModel = cuml.PCA(copy=True,
                                      n_components=3,
                                      random_state=0,
                                      svd_solver='full',
                                      verbose=True,
                                      whiten=False).fit(X=data)

        elif dimReductionMethod == 'UMAP':
            embeddingModel = cuml.UMAP(n_components=3).fit(X=data, y=labels)
        else:
            # A bare assert on a non-empty string never fires; raise instead
            raise ValueError(
                'unable to find an embedding model matching the user query')

    embeddedData = embeddingModel.transform(X=data)

    elapsedTime = time.time() - startTime
    print(
        f'{embeddedData.shape} via {preTrainedStr}{dimReductionMethod} -- completed in: {elapsedTime:.3f} seconds'
    )

    return embeddedData, embeddingModel
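
A usage sketch for reduce_to_3D (assumes cudf/cuml and a GPU; 'PCA' and 'UMAP' are the two supported methods, and the data here is hypothetical):

import cudf
import numpy as np

data = cudf.DataFrame(np.random.rand(500, 16).astype('float32'))
labels = cudf.Series(np.random.randint(0, 3, 500))

embedded, model = reduce_to_3D(data, labels, 'UMAP')
# Reuse the trained model on the same data, skipping the fit:
embedded2, _ = reduce_to_3D(data, labels, 'UMAP', trainedEmbeddingModel=model)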
Example #7
import cuml
import pytest

cluster_models = dict(KMeans=cuml.KMeans())

decomposition_models = dict(
    PCA=cuml.PCA(),
    TruncatedSVD=cuml.TruncatedSVD(),
)

decomposition_models_xfail = dict(
    GaussianRandomProjection=cuml.GaussianRandomProjection(),
    SparseRandomProjection=cuml.SparseRandomProjection())

neighbor_models = dict(NearestNeighbors=cuml.NearestNeighbors())

dbscan_model = dict(DBSCAN=cuml.DBSCAN())

umap_model = dict(UMAP=cuml.UMAP())


def unit_param(*args, **kwargs):
    return pytest.param(*args, **kwargs, marks=pytest.mark.unit)


def quality_param(*args, **kwargs):
    return pytest.param(*args, **kwargs, marks=pytest.mark.quality)


def stress_param(*args, **kwargs):
    return pytest.param(*args, **kwargs, marks=pytest.mark.stress)
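
The three helpers above attach pytest marks to parameter sets; a usage sketch with hypothetical row counts:

@pytest.mark.parametrize("nrows", [unit_param(500),
                                   quality_param(5000),
                                   stress_param(500000)])
def test_fit_runs(nrows):
    ...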


def pickle_save_load(tmpdir, model):
Example #8
decomposition_models = {
    "PCA": lambda: cuml.PCA(),
    "TruncatedSVD": lambda: cuml.TruncatedSVD(),
}

decomposition_models_xfail = {
    "GaussianRandomProjection": lambda: cuml.GaussianRandomProjection(),
    "SparseRandomProjection": lambda: cuml.SparseRandomProjection()
}

neighbor_models = {"NearestNeighbors": lambda: cuml.NearestNeighbors()}

dbscan_model = {"DBSCAN": lambda: cuml.DBSCAN()}

umap_model = {"UMAP": lambda: cuml.UMAP()}

rf_models = {
    "rfc": lambda: cuml.RandomForestClassifier(),
    "rfr": lambda: cuml.RandomForestRegressor()
}

all_models = {
    **regression_models,
    **solver_models,
    **cluster_models,
    **decomposition_models,
    **decomposition_models_xfail,
    **neighbor_models,
    **dbscan_model,
    **umap_model,
}

Example #9
def plot_comparison(checkpoint_id,
                    num_progressions=5,
                    num_classes=20,
                    num_samples=10000,
                    algorithm='umap'):

    checkpoints = os.listdir('pcl_cifar10_{}'.format(checkpoint_id[0]))
    checkpoints.sort()
    assert num_progressions <= len(
        checkpoints), 'Not enough checkpoints saved.'
    checkpoints = [
        checkpoints[i]
        for i in range(0, len(checkpoints),
                       len(checkpoints) // (num_progressions - 1))
    ][:num_progressions - 1] + [checkpoints[-1]]

    # fig, axes = plt.subplots(len(checkpoints), 2, figsize=(30,50))
    fig, axes = plt.subplots(2, len(checkpoints), figsize=(60, 30))

    print(checkpoints)

    for i, checkpoint_file in enumerate(checkpoints):
        # print(checkpoints)
        # print(checkpoint_file)
        epoch = checkpoint_file[11:15]  # just the 4-digit checkpoint epoch

        print('epoch: {}'.format(epoch))
        checkpoint = torch.load('pcl_cifar10_{}/{}'.format(
            checkpoint_id[0], checkpoint_file))
        model.load_state_dict(checkpoint['state_dict'])
        features, classes = compute_features(eval_loader,
                                             model,
                                             low_dim=low_dim,
                                             gpu=gpu)
        # account for the few samples that are computed twice
        features[torch.norm(features, dim=1) > 1.5] /= 2
        features = features.numpy()
        restricted_classes = np.array([i for i in classes if i < num_classes])
        features = features[np.array(classes) < num_classes]

        features = features[:num_samples]
        restricted_classes = restricted_classes[:num_samples]

        if algorithm == 'umap':
            # reducer = umap.UMAP(n_neighbors = 60, min_dist=0.1, n_components=2, metric='cosine')
            reducer = cuml.UMAP(n_neighbors=60,
                                min_dist=0.1,
                                n_components=2,
                                n_epochs=1000)
            y = reducer.fit_transform(features)
        elif algorithm == 'tsne':
            tsne = cudaTSNE(n_components=2,
                            perplexity=50,
                            learning_rate=600,
                            verbose=1,
                            n_iter=2500,
                            metric='euclidean')
            y = tsne.fit_transform(features)

        scatter = axes.flat[i].scatter(y[:, 0],
                                       y[:, 1],
                                       c=restricted_classes,
                                       cmap='Spectral',
                                       s=3)
        with torch.no_grad():
            results = run_dbscan(features,
                                 minPts=200,
                                 minSamples=0,
                                 temperature=0.2,
                                 eps=0.3)
            # results = run_kmeans(features, num_cluster=['250'])
            # remember to turn this back to a list
            im2cluster = results['im2cluster'][0].tolist()
        scatter = axes.flat[i + num_progressions].scatter(
            y[:, 0], y[:, 1], c=im2cluster, cmap='Spectral',
            s=3)  # restricting num_classes does not work here

        # legend = axes.flat[i].legend(*scatter.legend_elements(), loc='lower left', title="Classes")
        # axes.flat[i].add_artist(legend)

        axes.flat[i].set_title('Cifar Classes, epoch: {}'.format(epoch))
        axes.flat[i + num_progressions].set_title(
            'Clustering Classes, epoch: {}'.format(epoch))

    # axes.flat[-1].legend(*scatter.legend_elements(), loc='lower left', title="Classes", bbox_to_anchor=(1.00, 0), prop={'size': 25})
    fig.suptitle('{}_{} Comparison'.format(algorithm, checkpoint_id[0]),
                 fontsize=50)

    if not os.path.exists('imgs/{}_{}'.format(algorithm, checkpoint_id[0])):
        os.makedirs('imgs/{}_{}'.format(algorithm, checkpoint_id[0]))
    save_path = 'imgs/{}_{}/comparison'.format(algorithm, checkpoint_id[0])

    fig.savefig(save_path)
    print('Figure saved to : {}'.format(save_path))
Example #10
        print('PCA has been skipped')

    task_start_time = datetime.now()
    n_clusters = 7
    if enable_gpu:
        kmeans_float = cuml.KMeans(n_clusters=n_clusters)
    else:
        kmeans_float = sklearn.cluster.KMeans(n_clusters=n_clusters)
    kmeans_float.fit(df_fingerprints)
    print('Runtime Kmeans time (hh:mm:ss.ms) {}'.format(datetime.now() -
                                                        task_start_time))

    # UMAP
    task_start_time = datetime.now()
    if enable_gpu:
        # Name the instance umap_model so it does not shadow the umap module
        umap_model = cuml.UMAP(n_neighbors=100, a=1.0, b=1.0, learning_rate=1.0)
    else:
        umap_model = umap.UMAP()

    Xt = umap_model.fit_transform(df_fingerprints)
    print('Runtime UMAP time (hh:mm:ss.ms) {}'.format(datetime.now() -
                                                      task_start_time))

    if enable_gpu:
        # Column assignment replaces the removed cudf add_column API
        df_fingerprints['x'] = Xt[0]
        df_fingerprints['y'] = Xt[1]
        df_fingerprints['cluster'] = kmeans_float.labels_
    else:
        df_fingerprints['x'] = Xt[:, 0]
        df_fingerprints['y'] = Xt[:, 1]
        df_fingerprints['cluster'] = kmeans_float.labels_
Example #11
distances = cupy.ravel(cupy.fromDlpack(dist_mlarr.to_dlpack()))
indices = cupy.ravel(cupy.fromDlpack(ind_mlarr.to_dlpack()))
print(
    f"Computed KNN graph, distances shape = {distances.shape}, indices shape = {indices.shape}, distances[0:5]= {distances[0:5]}, indices[0:5]= {indices[0:5]}"
)
n_samples = indices.shape[0]
n_nonzero = n_samples * n_neighbors
rowptr = cupy.arange(0, n_nonzero + 1, n_neighbors)
knn_graph = cupyx.scipy.sparse.csr_matrix((distances, indices, rowptr),
                                          shape=(n_samples, n_samples))

print(f"Completed KNN, graph shape = {knn_graph.shape}")

reducer = cuml.UMAP(n_neighbors=15,
                    n_components=3,
                    n_epochs=500,
                    min_dist=0.1,
                    output_type="numpy")
embedding = reducer.fit_transform(data, knn_graph=knn_graph)
print(f"Completed embedding, shape = {embedding.shape}")

# df = embedding.to_pandas()
# df.columns = ["x", "y"]
# df['class'] = pd.Series([str(x) for x in target.to_array()], dtype="category")
#
# cvs = ds.Canvas(plot_width=400, plot_height=400)
# agg = cvs.points(df, 'x', 'y', ds.count_cat('class'))
# img = tf.shade(agg, color_key=color_key, how='eq_hist')
#
# utils.export_image(img, filename='fashion-mnist', background='black')
#
Example #12
decomposition_models_xfail = {
    "GaussianRandomProjection": lambda: cuml.GaussianRandomProjection(),
    "SparseRandomProjection": lambda: cuml.SparseRandomProjection()
}

neighbor_models = {
    "NearestNeighbors": lambda: cuml.NearestNeighbors()
}

dbscan_model = {
    "DBSCAN": lambda: cuml.DBSCAN()
}

umap_model = {
    "UMAP": lambda: cuml.UMAP()
}

rf_classification_model = {
    "rfc": lambda: cuml.RandomForestClassifier()
}

rf_regression_model = {
    "rfr": lambda: cuml.RandomForestRegressor()
}


def pickle_save_load(tmpdir, func_create_model, func_assert):

    model, X_test = func_create_model()
    pickle_file = tmpdir.join('cu_model.pickle')
Example #13
    def __init__(self, df, n_clusters, chembl_ids, enable_gpu=True, pca_model=False):
        self.app = dash.Dash(
            __name__, external_stylesheets=external_stylesheets)
        self.df = df
        self.n_clusters = n_clusters
        self.chembl_ids = chembl_ids
        self.enable_gpu = enable_gpu
        self.pca = pca_model

        # Fetch relevant properties from the database.
        self.prop_df = self.create_dataframe_molecule_properties(chembl_ids)

        self.df['chembl_id'] = chembl_ids
        self.df['id'] = self.df.index
        self.orig_df = df.copy()
        # initialize UMAP
        if enable_gpu:
            self.umap = cuml.UMAP(n_neighbors=100,
                                  a=1.0,
                                  b=1.0,
                                  learning_rate=1.0)
        else:
            self.umap = umap.UMAP()

        # Construct the UI
        self.app.layout = self.constuct_layout()

        # Register callbacks for selection inside main figure
        self.app.callback(
            [Output('selected_clusters', 'value'),
             Output('selected_point_cnt', 'children')],
            [Input('main-figure', 'clickData'),
             Input('main-figure', 'selectedData'),
             Input('bt_recluster_clusters', 'n_clicks'),
             Input('bt_recluster_points', 'n_clicks'),
             Input('northstar_cluster', 'children')],
            [State("selected_clusters", "value")]) (self.handle_data_selection)

        # Register callbacks for buttons for reclustering selected data
        self.app.callback(
            [Output('main-figure', 'figure'),
             Output('northstar_cluster', 'children'),
             Output('north_star_clusterid_map', 'children')],
            [Input('bt_recluster_clusters', 'n_clicks'),
             Input('bt_recluster_points', 'n_clicks'),
             Input('bt_north_star', 'n_clicks'),
             Input('hidden_northstar', 'value'),
             Input('sl_prop_gradient', 'value'),
             Input('sl_nclusters', 'value'),],
            [State("selected_clusters", "value"),
             State("main-figure", "selectedData"),
             State('north_star', 'value'),]) (self.handle_re_cluster)

        # Register callbacks for selection inside main figure to update module details
        self.app.callback(
            [Output('tb_selected_molecules', 'children'),
             Output('sl_mol_props', 'options'),
             Output('current_page', 'children'),
             Output('total_page', 'children'),
             Output('section_molecule_details', 'style')],
            [Input('main-figure', 'selectedData'),
             Input('sl_mol_props', 'value'),
             Input('sl_prop_gradient', 'value'),
             Input('bt_page_prev', 'n_clicks'),
             Input('bt_page_next', 'n_clicks'),
             Input('north_star_clusterid_map', 'children')],
            State('current_page', 'children')) (self.handle_molecule_selection)

        self.app.callback(
            Output("hidden1", "children"),
            [Input("bt_reset", "n_clicks")]) (self.handle_reset)

        self.app.callback(
            [Output('north_star', 'value'),
             Output('hidden_northstar', 'value')],
            [Input({'role': 'bt_star_candidate', 'index': ALL}, 'n_clicks')],
            State('north_star', 'value')) \
                (self.handle_mark_north_star)
Example #14
import numpy as np
import pandas as pd
import cudf, cuml

df = pd.read_csv("data/data.csv")

columns = [
    'name', 'artists', 'acousticness', 'danceability', 'energy',
    'instrumentalness', 'key', 'liveness', 'loudness', 'speechiness', 'tempo',
    'valence'
]
df_mod = df[columns]

keys = df_mod.iloc[:, :2].values.tolist()
features = df_mod.iloc[:, 2:].to_numpy()
features = (features - features.min()) / (features.max() - features.min())

df = cudf.DataFrame(features)
embed = cuml.UMAP(n_neighbors=20, n_epochs=100, min_dist=0.1,
                  init='spectral').fit_transform(df)
np_embed = embed.to_pandas().to_numpy()

np.save("result/embeddings.npy", np_embed)
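
Note that the min-max scaling above uses the global min and max of the whole feature matrix; a per-feature variant (a common alternative, sketched here) scales each column independently:

# Sketch: per-column min-max scaling as an alternative to the global
# scaling used above.
col_min = features.min(axis=0)
col_max = features.max(axis=0)
features_scaled = (features - col_min) / (col_max - col_min)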