Ejemplo n.º 1
0
def find_matching_image_with_rapids():
    """Embed the training images and visualise each sample row's neighbours.

    Relies on names defined outside this function: ``train``,
    ``DataGenerator``, ``EfficientNetB0``, ``NearestNeighbors``,
    ``displayDF``, ``plt``, ``np`` and ``cupy``.

    Steps:
      1. Run every image of ``train`` through an ImageNet-pretrained
         EfficientNetB0 (no classification head, average pooling).
      2. Fit a nearest-neighbour index on the embeddings and query the
         ``KNN`` closest rows for every row.
      3. For train rows 180..189, plot the neighbour distances and display
         the 8 closest images.

    Returns nothing; the function only prints and plots.
    """
    KNN = 50

    embedder = EfficientNetB0(weights='imagenet',
                              include_top=False,
                              pooling='avg',
                              input_shape=None)
    train_gen = DataGenerator(train, batch_size=128)
    image_embeddings = embedder.predict(train_gen, verbose=1)
    print('image embeddings shape is', image_embeddings.shape)
    # The network is no longer needed once the embeddings exist; drop the
    # reference now (the original did so implicitly by rebinding `model`).
    del embedder

    knn_model = NearestNeighbors(n_neighbors=KNN)
    knn_model.fit(image_embeddings)
    distances, indices = knn_model.kneighbors(image_embeddings)

    for k in range(180, 190):
        plt.figure(figsize=(20, 3))
        # One x position per returned neighbour: tie the axis length to KNN
        # instead of repeating the literal 50.
        plt.plot(np.arange(KNN), cupy.asnumpy(distances[k, ]), 'o-')
        plt.title('Image Distance From Train Row %i to Other Train Rows' % k,
                  size=16)
        plt.ylabel('Distance to Train Row %i' % k, size=14)
        plt.xlabel('Index Sorted by Distance to Train Row %i' % k, size=14)
        plt.show()

        # Show the 8 nearest rows in a 2 x 4 grid.
        cluster = train.loc[cupy.asnumpy(indices[k, :8])]
        displayDF(cluster, random=False, ROWS=2, COLS=4)
def get_image_predictions(df, embeddings, threshold=0.0):
    """Predict matching ``posting_id`` values for every row from image embeddings.

    A cosine-distance nearest-neighbour search is run on ``embeddings`` and,
    for each row ``k``, matches are chosen as follows:

    * take all neighbours closer than ``threshold``;
    * if that yields at least two postings, retry with the stricter limit
      ``threshold - 0.08888`` and keep the stricter result when it still has
      at least two postings, otherwise keep the first result;
    * if it yields fewer than two postings, fall back to the fixed limit
      ``0.51313`` and keep at most the two closest postings.

    Parameters
    ----------
    df
        Table with a ``posting_id`` column, indexed positionally in the same
        order as ``embeddings``.
    embeddings
        2-D array with one embedding per row.
    threshold
        Primary cosine-distance cut-off.

    Returns
    -------
    list
        One array of posting ids per row of ``embeddings``.
    """
    n_rows = embeddings.shape[0]
    if n_rows == 0:
        # Nothing to index; a neighbour search on an empty set would fail.
        return []

    KNN = 50 if len(df) > 3 else 3
    # A query cannot return more neighbours than there are indexed rows;
    # without this cap tiny inputs make the search raise.
    KNN = min(KNN, n_rows)

    model = NearestNeighbors(n_neighbors=KNN, metric='cosine')
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    predictions = []
    for k in tqdm(range(n_rows)):
        idx = np.where(distances[k, ] < threshold)[0]
        ids = indices[k, idx]
        posting_ids = df['posting_id'].iloc[ids].values
        if len(posting_ids) >= 2:
            # Enough matches: prefer a stricter cut-off when it still leaves
            # at least two postings.
            idx_s = np.where(distances[k, ] < threshold - 0.08888)[0]
            ids_s = indices[k, idx_s]
            posting_ids_b = df['posting_id'].iloc[ids_s].values
            if len(posting_ids_b) >= 2:
                predictions.append(posting_ids_b)
            else:
                predictions.append(posting_ids)
        else:
            # Too few matches: use the fixed fallback cut-off but keep at
            # most the two closest postings.
            idx = np.where(distances[k, ] < 0.51313)[0]
            ids = indices[k, idx]
            posting_ids = df['posting_id'].iloc[ids].values
            predictions.append(posting_ids[:2])

    del model, distances, indices
    gc.collect()
    return predictions
Ejemplo n.º 3
0
def find_similar_image():
    """Store, for every row of ``test``, the posting ids of its similar images.

    Reads the module-level ``image_embeddings`` and ``test``, queries the
    nearest neighbours in chunks of 4096 rows, keeps neighbours whose
    distance is below 6.0 and writes the resulting arrays of posting ids to
    ``test['preds2']``.

    The module-level ``image_embeddings`` is only read here. Deleting it
    inside this function (as the previous version attempted) turns the name
    into a local variable, so the very first read raised
    ``UnboundLocalError``. Callers that want to free the embeddings should
    ``del`` them at module level after this call.
    """
    KNN = 2 if len(test) == 3 else 50

    model = NearestNeighbors(n_neighbors=KNN)
    model.fit(image_embeddings)
    preds = []
    CHUNK = 1024 * 4

    print('Finding similar images...')
    n_rows = len(image_embeddings)
    # Pre-bind so the cleanup below is valid even if there are no chunks.
    distances = indices = None
    for a in range(0, n_rows, CHUNK):
        b = min(a + CHUNK, n_rows)
        print('chunk', a, 'to', b)
        distances, indices = model.kneighbors(image_embeddings[a:b, ])

        for k in range(b - a):
            IDX = np.where(distances[k, ] < 6.0)[0]
            IDS = indices[k, IDX]
            o = test.iloc[IDS].posting_id.values
            preds.append(o)

    # Release the index and the last chunk's results before attaching the
    # predictions to the dataframe.
    del model, distances, indices
    _ = gc.collect()

    test['preds2'] = preds
Ejemplo n.º 4
0
def compute_neighbors_rapids(X: np.ndarray,
                             n_neighbors: int,
                             metric: _Metric = 'euclidean'):
    """Find every row's nearest neighbours with RAPIDS cuML.

    Parameters
    ----------
    X
        Array of shape (n_samples, n_features); it is both the index and the
        query set.
    n_neighbors
        Number of neighbours to return per row.
    metric
        Distance metric name; must be one that cuML's ``NearestNeighbors``
        accepts.

    Returns
    -------
    knn_indices, knn_dists
        The neighbour indices and the matching distances, in that order
        (indices first), each of shape (n_observations, n_neighbors).
    """
    from cuml.neighbors import NearestNeighbors

    # The search is run on a C-contiguous float32 copy/view of the input.
    data = np.ascontiguousarray(X, dtype=np.float32)
    searcher = NearestNeighbors(n_neighbors=n_neighbors, metric=metric)
    searcher.fit(data)
    dists, idxs = searcher.kneighbors(data)
    return idxs, dists
Ejemplo n.º 5
0
def get_image_neighbors(df, embeddings, threshold=args.threshold):
    """Return, per row, the posting ids of neighbours closer than ``threshold``.

    The number of neighbours searched is ``args.n_neighbors_max`` when ``df``
    has more than three rows and ``args.n_neighbors_min`` otherwise. Note
    that the default for ``threshold`` is read from ``args`` once, when the
    function is defined.

    Returns a list with one array of ``posting_id`` values per row of
    ``embeddings``.
    """
    if len(df) > 3:
        n_neighbors = args.n_neighbors_max
    else:
        n_neighbors = args.n_neighbors_min

    knn = NearestNeighbors(n_neighbors=n_neighbors)
    knn.fit(embeddings)
    distances, indices = knn.kneighbors(embeddings)

    predictions = []
    for row in range(embeddings.shape[0]):
        # Positions (within this row's neighbour list) that are close enough.
        close = np.where(distances[row] < threshold)[0]
        matched_rows = indices[row, close]
        predictions.append(df['posting_id'].iloc[matched_rows].values)

    del knn, distances, indices
    gc.collect()
    return predictions
Ejemplo n.º 6
0
 def _find_distance_threshold(
     self,
     features,
     posting_ids: np.ndarray,
     thresholds: List[float],
 ) -> Tuple[float, float, List[List[str]]]:
     """Pick the distance threshold that maximises the mean validation F1.

     The features are normalised, a nearest-neighbour search against all
     ``len(self.valid_df)`` rows is run, and the resulting distance and
     index matrices are saved under ``self.config.dir_config.output_dir``.
     Each candidate threshold is then turned into per-row predictions and
     scored against ``self.valid_df["target"]``.

     Side effects: the ``score``, ``precision`` and ``recall`` columns of
     ``self.valid_df`` are overwritten on every threshold, so after the call
     they hold the values of the LAST threshold tried, not necessarily the
     best one. A summary line is printed per threshold.

     Parameters
     ----------
     features
         NumPy array of features (it is passed to ``torch.from_numpy``).
         Presumably one row per row of ``self.valid_df`` -- TODO confirm.
     posting_ids
         Array of posting ids, indexed with the neighbour indices.
     thresholds
         Candidate distance thresholds to evaluate.

     Returns
     -------
     (best_score, best_threshold, best_y_pred)
         The best mean F1, the threshold that produced it and the matching
         predictions. If no threshold scores above 0 the result is
         ``(0, -1, [])``.
     """
     # NOTE(review): F is presumably torch.nn.functional, whose normalize
     # defaults to L2 normalisation along dim 1 -- confirm against imports.
     features = F.normalize(torch.from_numpy(features)).numpy()
     with TimeUtil.timer("nearest neighbor search"):
         # n_neighbors equals the validation set size, i.e. every row is
         # ranked against all indexed rows.
         model = NearestNeighbors(n_neighbors=len(self.valid_df), n_jobs=32)
         model.fit(features)
         distances, indices = model.kneighbors(features)
         # Persist the raw search results, tagged with fold and epoch.
         FileUtil.save_npy(
             distances,
             self.config.dir_config.output_dir
             / f"distances_{self.fold}_{self.current_epoch:02d}.npy",
         )
         FileUtil.save_npy(
             indices,
             self.config.dir_config.output_dir
             / f"indices_{self.fold}_{self.current_epoch:02d}.npy",
         )
     best_score = 0
     best_threshold = -1
     best_y_pred: List[List[str]] = []
     for threshold in thresholds:
         y_pred = []
         for i in range(len(distances)):
             IDX = np.where(distances[i] < threshold)[0]
             # Guarantee a minimum number of predictions: when too few
             # neighbours pass the threshold, take the first `min_indices`
             # neighbour positions instead.
             if len(IDX) < self.config.inference_config.min_indices:
                 IDX = list(range(self.config.inference_config.min_indices))
             idxs = indices[i, IDX]
             y_pred.append(posting_ids[idxs])
         scores = MetricUtil.f1_scores(self.valid_df["target"].tolist(), y_pred)
         precisions, recalls = MetricUtil.precision_recall(
             self.valid_df["target"].tolist(), y_pred
         )
         # These columns are overwritten for every threshold.
         self.valid_df["score"] = scores
         self.valid_df["precision"] = precisions
         self.valid_df["recall"] = recalls
         selected_score = self.valid_df["score"].mean()
         _p_mean = self.valid_df["precision"].mean()
         _r_mean = self.valid_df["recall"].mean()
         print(
             f"----------- valid f1: {selected_score} precision: {_p_mean} recall: {_r_mean} threshold: {threshold} ------------"
         )
         # Strict '>' keeps the earliest threshold when scores tie.
         if selected_score > best_score:
             best_score = selected_score
             best_threshold = threshold
             best_y_pred = y_pred
     return best_score, best_threshold, best_y_pred
Ejemplo n.º 7
0
def test_fuzzy_simplicial_set(n_rows, n_features, n_neighbors,
                              precomputed_nearest_neighbors):
    """Check the cu fuzzy simplicial set against the reference implementation.

    Blob data is generated, both ``cu_fuzzy_simplicial_set`` and
    ``ref_fuzzy_simplicial_set`` are run on it (optionally with a
    precomputed KNN graph), and the two dense graphs are compared with
    ``correctness_sparse``.

    When ``precomputed_nearest_neighbors`` is true, the KNN arrays are
    passed as returned to the cu implementation and converted with
    ``.get()`` for the reference one; ``X`` itself is passed unchanged to
    both. Otherwise ``X`` is converted with ``.get()`` before the reference
    call.
    """
    metric = 'euclidean'
    random_state = 42
    n_clusters = 30

    X, _ = make_blobs(n_samples=n_rows,
                      centers=n_clusters,
                      n_features=n_features,
                      random_state=random_state)

    if not precomputed_nearest_neighbors:
        # Let each implementation compute its own neighbours.
        gpu_graph = cu_fuzzy_simplicial_set(X, n_neighbors, random_state,
                                            metric)
        host_X = X.get()
        cpu_graph = ref_fuzzy_simplicial_set(host_X, n_neighbors,
                                             random_state,
                                             metric)[0].tocoo()
    else:
        # Share one KNN result between both implementations.
        searcher = NearestNeighbors(n_neighbors=n_neighbors, metric=metric)
        searcher.fit(X)
        dev_dists, dev_indices = searcher.kneighbors(X,
                                                     n_neighbors,
                                                     return_distance=True)
        gpu_graph = cu_fuzzy_simplicial_set(X,
                                            n_neighbors,
                                            random_state,
                                            metric,
                                            knn_indices=dev_indices,
                                            knn_dists=dev_dists)
        cpu_graph = ref_fuzzy_simplicial_set(
            X,
            n_neighbors,
            random_state,
            metric,
            knn_indices=dev_indices.get(),
            knn_dists=dev_dists.get())[0].tocoo()

    dense_gpu = gpu_graph.todense()
    dense_cpu = cp.sparse.coo_matrix(cpu_graph).todense()
    assert correctness_sparse(dense_cpu,
                              dense_gpu,
                              atol=0.1,
                              rtol=0.2,
                              threshold=0.95)
def get_image_neighbors(df, embeddings, KNN=50):
    """Collect, per row, the posting ids of neighbours closer than 4.5.

    A nearest-neighbour search with ``KNN`` neighbours is run on
    ``embeddings``; neighbours whose distance is below the fixed cut-off of
    4.5 are mapped to ``df['posting_id']`` by position.

    Returns
    -------
    (df, predictions)
        ``df`` unchanged, and a list holding one array of posting ids per
        row of ``embeddings``.
    """
    threshold = 4.5

    knn = NearestNeighbors(n_neighbors=KNN)
    knn.fit(embeddings)
    distances, indices = knn.kneighbors(embeddings)

    predictions = []
    for row in tqdm(range(embeddings.shape[0])):
        close = np.where(distances[row, ] < threshold)[0]
        matched_rows = indices[row, close]
        predictions.append(df['posting_id'].iloc[matched_rows].values)

    del knn, distances, indices
    gc.collect()
    return df, predictions
Ejemplo n.º 9
0
def compute_neighbors_rapids(X: np.ndarray, n_neighbors: int):
    """Find every row's nearest neighbours with RAPIDS cuML.

    Parameters
    ----------
    X
        Array of shape (n_samples, n_features); it is both the index and the
        query set.
    n_neighbors
        Number of neighbours to return per row.

    Returns
    -------
    knn_indices, knn_dists
        The neighbour indices and the square root of the values returned by
        ``kneighbors``, in that order (indices first), each of shape
        (n_observations, n_neighbors).
    """
    from cuml.neighbors import NearestNeighbors

    data = np.ascontiguousarray(X, dtype=np.float32)
    searcher = NearestNeighbors(n_neighbors=n_neighbors)
    searcher.fit(data)
    sq_dists, idxs = searcher.kneighbors(data)
    # The values are treated as squared Euclidean distances, hence the sqrt.
    # NOTE(review): whether cuML returns squared distances depends on the
    # installed version -- confirm for the version in use.
    dists = np.sqrt(sq_dists)
    return idxs, dists
Ejemplo n.º 10
0
def get_image_neighbors(df, embeddings, KNN=50):
    """Return, per row, the posting ids of images considered matches.

    A nearest-neighbour search with ``KNN`` neighbours is run on
    ``embeddings``. Neighbours closer than ``CFG.img_thres`` are matches;
    when exactly one neighbour passes, the limit is relaxed to
    ``CFG.img_thres + CFG.addition`` and the row is matched again.

    Returns a list with one array of ``posting_id`` values per row.
    """
    knn = NearestNeighbors(n_neighbors=KNN)  # build the KNN model
    knn.fit(embeddings)  # index the features
    # Distances (similarity) between the images and their neighbours.
    distances, indices = knn.kneighbors(embeddings)

    predictions = []
    # Compare every image against its neighbours.
    for k in tqdm(range(embeddings.shape[0])):
        row = distances[k, ]
        # The threshold controls how strict the matching is.
        idx = np.where(row < CFG.img_thres)[0]
        # For an image with a single hit (presumably only itself), relax the
        # threshold and match once more.
        if len(idx) == 1:
            idx = np.where(row < (CFG.img_thres + CFG.addition))[0]
        matched_rows = indices[k, idx]
        # Posting ids of the matched images.
        predictions.append(df['posting_id'].iloc[matched_rows].values)

    del knn, distances, indices
    gc.collect()
    return predictions
def compute_neighbors_sklearn(X: np.ndarray, n_neighbors: int):
    """Find every row's nearest neighbours with scikit-learn.

    Parameters
    ----------
    X
        Array of shape (n_samples, n_features); it is both the index and the
        query set.
    n_neighbors
        Number of neighbours to return per row.

    Returns
    -------
    knn_indices, knn_dists
        The neighbour indices and the matching distances, in that order
        (indices first), each of shape (n_observations, n_neighbors).

    The elapsed wall-clock time of the search is printed.
    """
    import time
    from sklearn.neighbors import NearestNeighbors

    started = time.time()
    searcher = NearestNeighbors(n_neighbors=n_neighbors)
    data = np.ascontiguousarray(X, dtype=np.float32)
    searcher.fit(data)
    dists, idxs = searcher.kneighbors(data)
    elapsed = time.time() - started
    print("Here", elapsed)
    return idxs, dists
Ejemplo n.º 12
0
def spreadXY(X_2d, threshold, speed):
    """Push 2-D points away from close neighbours for a fixed 20 iterations.

    The first two columns of ``X_2d`` are loaded into a cudf DataFrame and
    scaled by 10000 (``threshold`` and ``speed`` are scaled the same way).
    In every iteration the 8 nearest neighbours of each point are queried
    and, per neighbour rank, an offset vector is normalised to unit length
    when its length is in ``(0, threshold]`` (zero otherwise) and subtracted
    from the point, multiplied by ``speed``.

    Note that the loop always runs 20 times; it does not stop once all
    distances exceed ``threshold``.

    Returns the adjusted coordinates, divided by the scale again, as a
    NumPy array.

    NOTE(review): this relies on older cudf APIs (``apply_rows``,
    ``as_matrix``, ``drop`` with a bare list/label) and on ``math`` being
    available at module level -- confirm against the installed versions.
    """

    import cudf
    from cuml.neighbors import NearestNeighbors

    # Row kernel for apply_rows: writes the unit vector of (x, y) when its
    # length d satisfies 0 < d <= threshold2, else writes zeros.
    def kernel(x, y, outx, outy, threshold2):
        for i, (x2, y2) in enumerate(zip(x, y)):
            d = math.sqrt(x2 * x2 + y2 * y2)
            if 0 < d <= threshold2:
                outx[i] = x2 / d
                outy[i] = y2 / d
            else:
                outx[i] = 0
                outy[i] = 0

    print('spreadXY')
    length = len(X_2d)
    X = cudf.DataFrame()
    X['x'] = X_2d[0:length, 0]
    X['y'] = X_2d[0:length, 1]
    k = 8
    scale = 10000
    # Work in scaled units; threshold and speed are scaled to match.
    threshold *= scale
    speed *= scale
    X = X.mul(scale)
    #X = np.copy(X_2d[:length])
    for i in range(20):
        # Re-fit on the current positions every iteration.
        nn = NearestNeighbors(n_neighbors=k)
        nn.fit(X)
        distances, indices = nn.kneighbors(X)
        #print(distances.shape)
        joins = []  # never used below

        # Despite the label, this prints the column sums of X.
        s = X.sum()
        print("iteration", i, "sum dist", s)

        newX = X
        for j in range(k):
            # Keep only neighbour-rank column j of the index table
            # (presumably; depends on cudf's drop semantics).
            join = indices.drop([x for x in range(k) if x != j
                                 ])  #.rename(mapper={j: 'x'}, columns=[j])
            # Look up the coordinates of the j-th neighbour of every point.
            join = join.merge(X, how='left', left_on=[j], right_index=True)
            join = join.drop(j)
            # Offset between the joined neighbour coordinates and X.
            v = join.sub(X)
            v = v.apply_rows(kernel,
                             incols=['x', 'y'],
                             outcols=dict(outx=np.float32, outy=np.float32),
                             kwargs=dict(threshold2=threshold))
            v = v.drop(['x', 'y'])
            v = v.rename(columns={'outx': 'x', 'outy': 'y'})
            # Step against the offset direction, scaled by speed.
            newX = newX.sub(v.mul(speed))
            #newX = newX.add(1)
            #v = v.query('x * x + y * y <= ' + str(threshold * threshold))
        #print("newX")
        #print(newX)
        X = newX

        s = X.sum()
        print("iteration", i, "sum dist", s)
    # Undo the scaling and hand back a NumPy array.
    X = X.truediv(scale)
    X = np.array(X.as_matrix())
    print(X.shape)
    return X
Ejemplo n.º 13
0
from cuml.neighbors import NearestNeighbors

# Build a sparse KNN distance graph for the Fashion-MNIST training set.
#
# Using cudf Dataframe here is not likely to help with performance
# However, it's a good opportunity to get familiar with the API
source_df: cudf.DataFrame = cudf.read_csv(
    '/att/nobackup/tpmaxwel/data/fashion-mnist-csv/fashion_train.csv')
# All columns but the last are used as features; the last one as target.
data = source_df.loc[:, source_df.columns[:-1]]
target = source_df[source_df.columns[-1]]
n_neighbors = 5

# fit model (use the shared constant so model and graph layout agree)
model = NearestNeighbors(n_neighbors=n_neighbors)
model.fit(data)

# get nearest neighbors
dist_mlarr, ind_mlarr = model.kneighbors(data, return_distance=True)

# create sparse matrix
# Both results are flattened to 1-D arrays holding n_neighbors consecutive
# entries per sample.
distances = cupy.ravel(cupy.fromDlpack(dist_mlarr.to_dlpack()))
indices = cupy.ravel(cupy.fromDlpack(ind_mlarr.to_dlpack()))
print(
    f"Computed KNN graph, distances shape = {distances.shape}, indices shape = {indices.shape}, distances[0:5]= {distances[0:5]}, indices[0:5]= {indices[0:5]}"
)
# `indices` is already flattened, so its length is the number of stored
# entries, not the number of samples. Deriving n_samples from it directly
# made the matrix n_neighbors times too large and the row pointer
# inconsistent with the data.
n_nonzero = indices.shape[0]
n_samples = n_nonzero // n_neighbors
# Every row owns exactly n_neighbors consecutive entries.
rowptr = cupy.arange(0, n_nonzero + 1, n_neighbors)
knn_graph = cupyx.scipy.sparse.csr_matrix((distances, indices, rowptr),
                                          shape=(n_samples, n_samples))

print(f"Completed KNN, graph shape = {knn_graph.shape}")
Ejemplo n.º 14
0
class gpActivationFlow(ActivationFlow):
    """Activation flow that keeps a nearest-neighbour graph of the node data.

    The node data is loaded into a cudf DataFrame, a ``NearestNeighbors``
    model is fitted on it, and the neighbour distances (``D``) and indices
    (``I``) are kept for building a sparse connection matrix.
    """

    def __init__(self, nodes_data: xa.DataArray, n_neighbors: int, **kwargs):
        """Initialise the base class and build the graph from ``nodes_data``."""
        ActivationFlow.__init__(self, n_neighbors, **kwargs)
        # I / D: neighbour indices and distances set by setNodeData.
        # P / C: only initialised here; not used elsewhere in this class.
        self.I: cudf.DataFrame = None
        self.D: cudf.DataFrame = None
        self.P: cudf.DataFrame = None
        self.C: cudf.DataFrame = None
        self.nodes: cudf.DataFrame = None
        self.setNodeData(nodes_data, **kwargs)

    def setNodeData(self, nodes_data: xa.DataArray, **kwargs):
        """(Re)compute the neighbour graph for ``nodes_data``.

        The graph is rebuilt only when ``self.reset`` is set or no nodes
        have been stored yet, and only when ``nodes_data`` is non-empty.
        """
        print(
            f"{self.__class__.__name__}[{hex(id(self))}].setNodeData: input shape = {nodes_data.shape}"
        )
        if self.reset or (self.nodes is None):
            if (nodes_data.size > 0):
                t0 = time.time()
                # One DataFrame column per feature column of the input.
                self.nodes = cudf.DataFrame({
                    icol: nodes_data[:, icol]
                    for icol in range(nodes_data.shape[1])
                })
                self.nnd = NearestNeighbors(n_neighbors=self.nneighbors)
                self.nnd.fit(self.nodes)
                self.D, self.I = self.nnd.kneighbors(self.nodes,
                                                     return_distance=True)
                dt = (time.time() - t0)
                print(
                    f"Computed NN Graph with {self.nnd.n_neighbors} neighbors and {nodes_data.shape[0]} verts in {dt} sec ({dt/60} min)"
                )
                print(
                    f"  ---> Indices shape = {self.I.shape}, Distances shape = {self.D.shape} "
                )
            else:
                print("No data available for this block")

    def getGraph(self):
        """Return ``None``; this implementation exposes no graph object."""
        return None

    def getConnectionMatrix(self) -> csr_matrix:
        """Return the neighbour graph as a sparse CSR distance matrix.

        Requires that ``setNodeData`` has populated ``self.D`` and
        ``self.I``. The result has shape (n_samples, n_samples), with
        ``self.nneighbors`` stored distances per row.
        """
        distances = cupy.ravel(cupy.fromDlpack(self.D.to_dlpack()))
        indices = cupy.ravel(cupy.fromDlpack(self.I.to_dlpack()))
        # `indices` is flattened, so its length is the number of stored
        # entries. Using it as the sample count made the matrix
        # `nneighbors` times too large and the row pointer inconsistent
        # with the data.
        n_nonzero = indices.shape[0]
        n_samples = n_nonzero // self.nneighbors
        # Every row owns exactly `nneighbors` consecutive entries.
        rowptr = cupy.arange(0, n_nonzero + 1, self.nneighbors)
        knn_graph = cupyx.scipy.sparse.csr_matrix((distances, indices, rowptr),
                                                  shape=(n_samples, n_samples))
        print(f"Completed KNN, sparse graph shape = {knn_graph.shape}")
        return knn_graph

    def spread(self,
               sample_data: np.ndarray,
               nIter: int = 1,
               **kwargs) -> Optional[bool]:
        """Clear ``self.reset`` and report convergence (always ``True``).

        NOTE(review): the body looks unfinished. ``G`` and ``source_pid``
        are not defined in this class, ``sample_data`` and ``nIter`` are
        unused, and the computed ``distances`` is discarded. Confirm the
        intended behaviour before relying on this method.
        """
        converged = True

        spdf = shortest_path(G, source_pid)
        spdf.sort_by("vertex")
        distances = spdf["distance"]

        self.reset = False
        return converged
Ejemplo n.º 15
0
def KNN_predict(df, embeddings, KNN=50, thresh=None, thresh_range=None):
    """Predict matching posting ids per row, optionally tuning the threshold.

    Exactly one of ``thresh`` and ``thresh_range`` must be given.

    * With ``thresh``: neighbours closer than ``thresh`` are the matches.
    * With ``thresh_range``: every candidate threshold is scored against
      ``df['matches']`` with ``f1_score``, the best-scoring one (the first
      in case of a tie) is selected and then used as above. As a side
      effect ``df['pred_matches']`` is overwritten for every candidate and
      ends up holding the predictions of the last one.

    Parameters
    ----------
    df
        Table with a ``posting_id`` column (and a ``matches`` column when
        ``thresh_range`` is used).
    embeddings
        2-D array with one embedding per row of ``df``.
    KNN
        Number of neighbours to search.
    thresh
        Fixed distance threshold. Values noted by the original author:
        image 2.7, tfidf 0.6.
    thresh_range
        Candidate thresholds, e.g. ``list(np.arange(2, 10, 0.5))`` for
        images or ``list(np.arange(0.1, 1, 0.1))`` for text.

    Returns
    -------
    (df, predictions, thresholds_scores)
        ``predictions`` holds one array of posting ids per row;
        ``thresholds_scores`` is a DataFrame of per-threshold metrics, or
        ``None`` when ``thresh`` was given.
    """
    # Exactly one of the two must be set. The previous check also accepted
    # both being None, which only failed later when iterating over None.
    assert (thresh is None) != (thresh_range is None), "Must provide either `thresh` or `thresh_range`"
    if thresh_range is not None:
        assert 'matches' in df.columns, "Cannot perform threshold selection on testing data"

    model = NearestNeighbors(n_neighbors = KNN)
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    # Iterate through different thresholds to maximize cv, run this in interactive mode, then replace else clause with a solid threshold
    thresholds_scores = None
    if thresh is None:
        thresholds = thresh_range

        scores = []
        recalls = []
        precisions = []
        for threshold in thresholds:
            predictions = []
            for k in range(embeddings.shape[0]):
                idx = np.where(distances[k,] < threshold)[0]
                ids = indices[k,idx]
                # The scorer receives space-separated id strings.
                posting_ids = ' '.join(df['posting_id'].iloc[ids].values)
                predictions.append(posting_ids)
            df['pred_matches'] = predictions
            f1, precision, recall = f1_score(df['matches'], df['pred_matches'])
            print(f'Threshold {threshold:.2f}: F1 {f1.mean():.4f} Precision {precision.mean():.4f} Recall {recall.mean():.4f}')
            scores.append(f1.mean())
            recalls.append(recall.mean())
            precisions.append(precision.mean())
        thresholds_scores = pd.DataFrame({
            'thresholds': thresholds,
            'scores': scores,
            'recalls': recalls,
            'precisions': precisions
            })
        # First row wins when several thresholds share the maximum score.
        max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
        best_threshold = max_score['thresholds'].values[0]
        best_score = max_score['scores'].values[0]
        print(f'Our best score is {best_score} and has a threshold {best_threshold}')

        thresh = best_threshold
    # Because we are predicting the test set that have 70K images and different label groups, confidence should be smaller
    predictions = []
    for k in tqdm(range(embeddings.shape[0])):
        idx = np.where(distances[k,] < thresh)[0]
        ids = indices[k,idx]
        posting_ids = df['posting_id'].iloc[ids].values
        predictions.append(posting_ids)

    del model, distances, indices
    gc.collect()
    return df, predictions, thresholds_scores