def _get_nns(self, x):
    """Collect nearest-neighbour indices for ``x`` across all hidden representations.

    ``self._get_hidden_repr(x)`` must return a pair whose first element is a
    sequence of representations; the second element is ignored here.  Each
    representation is paired, in order, with the matching entry of
    ``self._nns`` and queried through that entry's ``kneighbors`` method
    (indices only, no distances).

    :param x: input forwarded unchanged to ``self._get_hidden_repr``.
    :return: the per-representation index arrays concatenated along axis 1.
    """
    hidden_reprs, _ignored = self._get_hidden_repr(x)

    neighbour_blocks = []
    for representation, searcher in zip(hidden_reprs, self._nns):
        found = searcher.kneighbors(representation, return_distance=False)
        neighbour_blocks.append(found)

    # Side-by-side stacking: one group of columns per representation.
    return np.concatenate(neighbour_blocks, axis=1)
def EpsDBSCAN(D, k):
    """Propose candidate DBSCAN ``eps`` values from the sorted k-distance curve.

    For every row of ``D`` the mean distance to its ``k`` nearest neighbours is
    computed.  The sorted means are min-max normalised, bucketed with
    ``np.digitize`` over ``np.linspace(0, 1, 10)``, and every inspected bucket
    holding at least ``k`` points contributes the mean of its (un-normalised)
    members as one candidate.

    Side effects: the sorted curve is drawn in blue and each candidate as a
    red horizontal line on the current matplotlib axes (``plt``); the figure
    is not shown or saved here.

    :param D: data matrix accepted by ``NearestNeighbors.fit``.
    :param k: number of neighbours averaged per point; also the minimum bucket
        population for a bucket to yield a candidate.
    :return: list of candidate eps values in ascending bucket order.  Empty
        when no bucket qualifies, including the degenerate case where every
        point has the same mean neighbour distance.
    """
    nn = NearestNeighbors(n_neighbors=k + 1)
    nn.fit(D)
    distances, _ = nn.kneighbors(D)
    # k + 1 neighbours are requested and column 0 is discarded -- presumably
    # because each queried point is returned as its own nearest neighbour.
    distances = distances[:, 1:]

    avg_sorted = np.sort(distances.sum(axis=1) / k)
    num = len(avg_sorted)
    plt.plot(avg_sorted, 'b')

    lowest = avg_sorted[0]
    span = avg_sorted[-1] - lowest
    if span == 0:
        # All means identical: normalising would divide by zero and no bucket
        # could qualify, so report "no candidates" without NaN warnings.
        return []

    normalized = (avg_sorted - lowest) / span
    bin_indice = np.digitize(normalized, np.linspace(0, 1, 10))

    # NOTE(review): np.digitize yields indices 1..10 for these edges, so
    # bucket 0 is always empty and bucket 10 (values equal to the maximum) is
    # never inspected.  Kept as-is so returned candidates do not change.
    Eps = []
    for bucket in range(10):
        members = avg_sorted[bin_indice == bucket]
        count = members.size
        if count >= k:
            e = np.sum(members, axis=0) / count
            plt.hlines(e, xmin=0, xmax=num, colors='r')
            Eps.append(e)

    return Eps
def EpsValue(D, k):
    """Return a starting eps value and an increment derived from k-NN distances.

    ``k + 1`` neighbours are queried for every row of ``D`` and the first
    distance column is dropped.  From the remaining ``k`` columns two per-row
    statistics are taken: the largest distance and the mean distance.

    :param D: data matrix accepted by ``NearestNeighbors.fit``.
    :param k: number of neighbours considered per point.
    :return: tuple ``(start, step)`` where ``start`` is the smallest per-row
        mean distance and ``step`` is one hundredth of the gap between the
        largest per-row maximum distance and ``start``.
    """
    model = NearestNeighbors(n_neighbors=k + 1)
    model.fit(D)
    all_dists, _unused_indices = model.kneighbors(D)

    # Discard column 0 so only the k remaining neighbour distances are used.
    neighbour_dists = all_dists[:, 1:]

    farthest_per_row = neighbour_dists.max(axis=1)
    mean_per_row = neighbour_dists.sum(axis=1) / k

    start = min(mean_per_row)
    step = (max(farthest_per_row) - start) / 100
    return start, step
Exemple #4
0
def uniformly_random_subsample(pairs_file, n_samples, out_file):
    """Subsample rows of a pairs table so they spread evenly over feature space.

    ``n_samples`` points are drawn uniformly from the unit hypercube spanned
    by the similarity columns, and for each point the closest row of the
    table is selected.  Rows hit more than once are kept only once, so the
    output may contain fewer than ``n_samples`` rows.

    :param pairs_file: path to a tab-separated file containing at least the
        columns ``vec_sim``, ``jac_sim``, ``len_sim`` and ``top_sim``.
    :param n_samples: number of random probe points to draw.
    :param out_file: path the selected rows are written to (tab-separated,
        without the DataFrame index, with an extra ``index`` column holding
        each row's position in the input).
    """
    feature_columns = ['vec_sim', 'jac_sim', 'len_sim', 'top_sim']

    pairs = pd.read_csv(pairs_file, sep='\t')

    # The probes must have exactly one coordinate per fitted feature; deriving
    # the width from the file's column count only worked for six-column files.
    samples = np.random.uniform(size=(n_samples, len(feature_columns)))

    # n_neighbors must be passed by keyword: scikit-learn made estimator
    # constructor arguments keyword-only.
    nn = NearestNeighbors(n_neighbors=1, n_jobs=-1)
    # Fit on a plain array so querying with the ndarray of probes is consistent.
    nn.fit(pairs[feature_columns].values)

    nearest = nn.kneighbors(samples, return_distance=False)
    index = pd.DataFrame(nearest, columns=['index'])

    # reset_index() exposes the row position as an 'index' column, which is
    # the only column shared with `index`, so the merge joins on it.
    df = pairs.reset_index().merge(index).drop_duplicates()

    df.to_csv(out_file, sep='\t', index=False)
Exemple #5
0
def _eval(feats_labels_sk, feats_labels_im, n=200):
    """
    Score sketch-to-image retrieval by ranking all images for every sketch.

    A brute-force nearest-neighbour search with the Hamming metric is fitted
    on the image features and asked for as many neighbours as there are
    images, so each sketch receives a full ranking.  A retrieved image counts
    as a match when its label equals the label of the querying sketch.

    :param feats_labels_sk: a two-element tuple [features_of_sketches, labels_of_sketches];
        the labels are scalars (class ids).
    :param feats_labels_im: a two-element tuple [features_of_images, labels_of_images];
        the labels are scalars (class ids).
    :param n: the top n ranked elements passed to the precision computation
    :return: precision@n, mAP@all
    """
    image_feats = feats_labels_im[0]
    ranker = NN(
        n_neighbors=image_feats.shape[0],
        metric='hamming',
        algorithm='brute',
    ).fit(image_feats)

    _distances, ranking = ranker.kneighbors(feats_labels_sk[0])

    # Label of every retrieved image, one row per sketch query.
    retrieved_classes = np.array(feats_labels_im[1])[ranking]

    hit_rows = []
    for row in range(retrieved_classes.shape[0]):
        hit_rows.append(retrieved_classes[row] == feats_labels_sk[1][row])
    matches = np.vstack(hit_rows).astype(np.uint16)

    precision_at_n = _get_pre_from_matches(matches[:, :n])
    mean_ap = _get_map_from_matches(matches)
    return precision_at_n, mean_ap
Exemple #6
0
def EpsDBSCAN(D, k):
    """Pick a DBSCAN ``eps`` value from the sorted k-distance curve.

    For every row of ``D`` the mean distance to its ``k`` nearest neighbours is
    computed.  The sorted means are min-max normalised, bucketed with
    ``np.digitize`` over ``np.linspace(0, 1, 10)``, and every inspected bucket
    holding at least ``k`` points contributes the mean of its (un-normalised)
    members as one candidate.  The second candidate, in ascending order, is
    returned.

    :param D: data matrix accepted by ``NearestNeighbors.fit``.
    :param k: number of neighbours averaged per point; also the minimum bucket
        population for a bucket to yield a candidate.
    :return: the second-smallest candidate eps value.
    :raises ValueError: if fewer than two buckets qualify, so no second
        candidate exists (this previously surfaced as an unrelated
        ``ZeroDivisionError``).
    """
    nn = NearestNeighbors(n_neighbors=k + 1)
    nn.fit(D)
    distances, _ = nn.kneighbors(D)
    # k + 1 neighbours are requested and column 0 is discarded -- presumably
    # because each queried point is returned as its own nearest neighbour.
    distances = distances[:, 1:]

    avg_sorted = np.sort(distances.sum(axis=1) / k)

    Eps = []
    lowest = avg_sorted[0]
    span = avg_sorted[-1] - lowest
    # A zero span means every mean is identical; normalising would divide by
    # zero and no bucket could qualify, so the candidate list stays empty.
    if span != 0:
        normalized = (avg_sorted - lowest) / span
        bin_indice = np.digitize(normalized, np.linspace(0, 1, 10))

        # NOTE(review): np.digitize yields indices 1..10 for these edges, so
        # bucket 0 is always empty and bucket 10 (values equal to the maximum)
        # is never inspected.  Kept as-is so the chosen eps does not change.
        for bucket in range(10):
            members = avg_sorted[bin_indice == bucket]
            count = members.size
            if count >= k:
                Eps.append(np.sum(members, axis=0) / count)

    if len(Eps) < 2:
        raise ValueError(
            "EpsDBSCAN needs at least two distance buckets with >= k points "
            "to choose eps; found {} (k={})".format(len(Eps), k)
        )

    # The earlier slope-based selection was always overwritten by this choice,
    # so only the final rule is kept.
    return Eps[1]