from collections import Counter

import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN, KMeans
from sklearn.neighbors import NearestNeighbors


def Cluster(data, algo, peso=None):

    #Two algorithms are supported: K-Means and DBSCAN.
    #The first one needs the number of clusters, while the second one
    #needs two parameters: eps and min_samples.

    if algo == "K-Means":

        #Using the Elbow Method for having the optimal number of clusters
        print()
        print("Evaluating number of Clusters")
        print()

        #For each candidate number of clusters we compute the sum of
        #squared distances within the clusters (inertia), then we plot
        #the result for the Elbow Method

        centers = []
        inertia = []
        for i in range(1, 9):
            kk = KMeans(n_clusters=i, init='k-means++',
                        random_state=0).fit(data)
            inertia.append(kk.inertia_)
            centers.append(i)
        print()

        print("Make the plot of the ELBOW Method")
        #Making the Plot of the Elbow Method
        plt.plot(centers, inertia, 'bx-')
        plt.xlabel('k')
        plt.ylabel('Sum_of_squared_distances')
        plt.title('Elbow Method For Optimal k')
        plt.savefig("Elbow Method.png", dpi=150, figsize=(12, 6))
        #plt.show()
        plt.close()

        #K-Means Algorithm

        #Here we set 5 because we know the number of classes being analyzed
        kmeans = KMeans(n_clusters=5, init='k-means++', random_state=0)
        y_result = kmeans.fit_predict(data)

        centroidi = kmeans.cluster_centers_

    elif algo == "DBSCAN":

        #DBSCAN Algorithm

        nbrs = NearestNeighbors(n_neighbors=5).fit(data)
        distances, indices = nbrs.kneighbors(data)
        print("The mean distance is about : " + str(np.mean(distances)))
        #np.median(distances)

        #dbscan = DBSCAN(eps= 0.0000000005, min_samples= 30700, metric="euclidean",
        #                n_jobs = 1)

        #dbscan = DBSCAN(eps= 0.000005, min_samples= 700, metric="euclidean", n_jobs = -1)
        dbscan = DBSCAN(eps=0.003,
                        min_samples=1000,
                        metric="euclidean",
                        n_jobs=-1)

        print(Counter(peso))
        print()

        y_result = dbscan.fit_predict(data, sample_weight=peso)
        centroidi = "In DBSCAN there aren not Centroids"

    return y_result, centroidi
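
A minimal usage sketch (not part of the original snippet), assuming the imports above and a hypothetical scaled feature matrix; peso is only used as DBSCAN sample weights:

from sklearn.preprocessing import StandardScaler

X = StandardScaler().fit_transform(np.random.rand(500, 4))  #toy data
labels_km, centroids_km = Cluster(X, "K-Means")
labels_db, _ = Cluster(X, "DBSCAN", peso=np.ones(len(X)))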
Example #2
    def _upsampling(self, X, y, sigmafactor):
        """
        :param X:
        :param y:
        :param sigmafactor:
        :return:
        """
        k = 10
        negative = X[y == self.majority_target]
        positive = X[y == self.minority_target]
        size0 = len(negative)
        size1 = len(positive)
        newDataNum = round(abs(size0 - size1) * self.ratio)
        # print(newDataNum)
        nbrs0 = NearestNeighbors(n_neighbors=k).fit(negative)
        distances0, indices0 = nbrs0.kneighbors(negative)
        nbrs1 = NearestNeighbors(n_neighbors=k).fit(positive)
        distances1, indices1 = nbrs1.kneighbors(positive)
        gsigma0 = np.mean(distances0, 1) * sigmafactor
        gsigma1 = np.mean(distances1, 1) * sigmafactor
        # negative
        pdf0from0, density0from0 = self.get_pdf_of_points(
            gsigma0, negative, negative)
        pdf0from1, density0from1 = self.get_pdf_of_points(
            gsigma1, positive, negative)
        # positive
        pdf1from1, density1from1 = self.get_pdf_of_points(
            gsigma1, positive, positive)
        # print(pdf1from1)
        pdf1from0, density1from0 = self.get_pdf_of_points(
            gsigma0, negative, positive)
        # Calculate Posterior Probability
        confidence0 = self.get_confidence(pdf0from0, pdf0from1, size0, size1)
        confidence1 = self.get_confidence(pdf1from1, pdf1from0, size1, size0)

        # search for seed in negative data
        # Compute confidence ratio for negative data upon new data added to negative data
        pdf0from0_mat = np.tile(pdf0from0.reshape(-1, 1), (1, size0))
        pdf0from0_mat = (pdf0from0_mat * size0 + density0from0) / (size0 + 1)
        pdf0from0_mat = np.r_[pdf0from0_mat,
                              np.diag(pdf0from0_mat).reshape(1, -1)]

        pdf0from1_mat = np.tile(pdf0from1.reshape(-1, 1), (1, size0))
        pdf0from1_mat = np.r_[pdf0from1_mat,
                              np.diag(pdf0from1_mat).reshape(1, -1)]

        confidence0_new = self.get_confidence(pdf0from0_mat, pdf0from1_mat,
                                              size0 + 1, size1)
        # Compute confidence ratio for positive data upon new data added to negative data
        pdf1from0_mat = np.tile(pdf1from0.reshape(-1, 1), (1, size0))
        pdf1from0_mat = (pdf1from0_mat * size0 + density1from0) / (size0 + 1)

        pdf1from1_mat = np.tile(pdf1from1.reshape(-1, 1), (1, size0))
        confidence1_new = self.get_confidence(pdf1from1_mat, pdf1from0_mat,
                                              size1, size0 + 1)

        confidence_new_0 = np.r_[confidence0_new, confidence1_new]
        confidence_old_0 = np.concatenate([
            np.r_[np.tile(confidence0.reshape(-1, 1), (1, size0)),
                  confidence0.reshape(1, -1)],
            np.tile(confidence1.reshape(-1, 1), (1, size0))
        ],
                                          axis=0)
        # Relative Certainty Change
        confidence0_ratio = (confidence_new_0 -
                             confidence_old_0) / confidence_old_0
        confidence0_ratio = 0.5 * (
            np.mean(confidence0_ratio[0:size0 + 1, :], axis=0) +
            np.mean(confidence0_ratio[size0 + 1:, :], axis=0))
        # Search for seed in positive data
        # Compute confidence ratio for positive data upon new data added to positive data
        pdf1from1_mat = np.tile(pdf1from1.reshape(-1, 1), (1, size1))
        pdf1from1_mat = (pdf1from1_mat * size1 + density1from1) / (size1 + 1)
        pdf1from1_mat = np.r_[pdf1from1_mat,
                              np.diag(pdf1from1_mat).reshape(1, -1)]

        pdf1from0_mat = np.tile(pdf1from0.reshape(-1, 1), (1, size1))
        pdf1from0_mat = np.r_[pdf1from0_mat,
                              np.diag(pdf1from0_mat).reshape(1, -1)]

        confidence1_new = self.get_confidence(pdf1from1_mat, pdf1from0_mat,
                                              size1 + 1, size0)
        # Compute confidence ratio for negative data upon new data added to positive data
        pdf0from1_mat = np.tile(pdf0from1.reshape(-1, 1), (1, size1))
        pdf0from1_mat = (pdf0from1_mat * size1 + density0from1) / (size1 + 1)

        pdf0from0_mat = np.tile(pdf0from0.reshape(-1, 1), (1, size1))
        confidence0_new = self.get_confidence(pdf0from0_mat, pdf0from1_mat,
                                              size0, size1 + 1)

        confidence_new_1 = np.r_[confidence0_new, confidence1_new]
        confidence_old_1 = np.concatenate([
            np.tile(confidence0.reshape(-1, 1),
                    (1, size1)), np.r_[np.tile(confidence1.reshape(-1, 1),
                                               (1, size1)),
                                       confidence1.reshape(1, -1)]
        ],
                                          axis=0)
        # Relative Certainty Change
        confidence1_ratio = (confidence_new_1 -
                             confidence_old_1) / confidence_old_1
        confidence1_ratio = 0.5 * (
            np.mean(confidence1_ratio[0:size0, :], axis=0) +
            np.mean(confidence1_ratio[size0:, :], axis=0))

        confidence = np.append(confidence0_ratio, confidence1_ratio)

        X_resampled, y_resampled = self.getNewDataByInterpolationRandomSimplex3(
            X, y, gsigma0, gsigma1, confidence, newDataNum)

        return X_resampled, y_resampled
Example #3
print('Gaussian Kernel density. The index of the lowest density object:\n {},\n score:\n {} '.
      format(i[:10], np.round(density[i[0:10]], 4)))

# Plot possible outliers
#figure(2)
#for k in range(1,21):
#    subplot(4,5,k)
#    imshow(np.reshape(X[i[k],:], (16,16)).T, cmap=cm.binary)
#    xticks([]); yticks([])
#    if k==3: title('Gaussian Kernel Density: Possible outliers')

### K-neighbors density estimator
# Neighbor to use:
K = 5

# Find the k nearest neighbors
knn = NearestNeighbors(n_neighbors=K).fit(X)
D, i = knn.kneighbors(X)

density = 1. / (D.sum(axis=1) / K)

# Sort the scores
i = density.argsort()
density = density[i]

# Plot k-neighbor estimate of outlier score (distances)
figure(2)
bar(range(20), density[:20])
title('KNN density: Outlier score')
show()
print(
    'KNN density. The index of the lowest density object:\n {},\n score:\n {} '.
    format(i[:10], np.round(density[i[0:10]], 4)))
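
One subtlety worth noting (a variant sketch, not in the original snippet): kneighbors queried on the training data returns each point as its own first neighbor at distance 0, which inflates the density values; dropping that self-neighbor gives:

knn = NearestNeighbors(n_neighbors=K + 1).fit(X)
D, i = knn.kneighbors(X)
density = 1.0 / D[:, 1:].mean(axis=1)  # column 0 is the point itself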
Example #4
    instance_img_path = Config.image_dir + "/fast_mask_roi_3.jpg"
    instance_output = get_single_patch_feature(instance_img_path, espcn_model)

    instance_img_path_2 = Config.image_dir + "/fast_mask_roi_6.jpg"
    instance_output_2 = get_single_patch_feature(instance_img_path_2,
                                                 espcn_model)

    feature_list = []
    for i, img in enumerate(img_list):
        img_torch = Variable(img).cuda()
        feature = extract_crow_feature(img_torch, cls_model)
        feature_list.append(feature)

    feature_list = np.array(feature_list)

    from sklearn.neighbors import NearestNeighbors
    model = NearestNeighbors(n_neighbors=5,
                             algorithm="ball_tree",
                             metric="minkowski",
                             n_jobs=4,
                             leaf_size=5,
                             p=2)

    model.fit(feature_list)

    d2, i2 = model.kneighbors(instance_output.reshape(1, -1))
    print(d2)
    d3, i3 = model.kneighbors(instance_output_2.reshape(1, -1))
    print(d3)
Example #5
def generate_fake_test_from_train_labels(train_seen_label, attribute, seenclasses, unseenclasses, num, per_seen=0.10, \
                                        per_unseen=0.40, per_seen_unseen= 0.50):
    """
    Input:
        train_seen_label-> images with labels containing objects less than opt.N
        attribute-> array containing word embeddings
        seenclasses-> array containing seen class indices
        unseenclasses-> array containing unseen class indices
        num-> number of generated synthetic labels
    Output:
        gzsl -> tensor containing synthetic labels for the seen-only, unseen-only, and seen-unseen cases.
    
    """
    if train_seen_label.min() == 0:
        print("Training data already trimmed and converted")
    else:
        print("original training data received (-1,1)'s ")
        train_seen_label = torch.clamp(train_seen_label, 0, 1)

    #remove all zero labeled images while training
    train_seen_label = train_seen_label[(train_seen_label.sum(1) !=
                                         0).nonzero().flatten()]
    seen_attributes = attribute[seenclasses]
    unseen_attributes = attribute[unseenclasses]
    seen_percent, unseen_percent, seen_unseen_percent = per_seen, per_unseen, per_seen_unseen

    print("seen={}, unseen={}, seen-unseen={}".format(seen_percent,
                                                      unseen_percent,
                                                      seen_unseen_percent))
    print("syn num={}".format(num))
    gzsl = []
    for i in range(0, num):
        new_gzsl_syn_list = []
        seen_unseen_label_pairs = {}
        nbrs = NearestNeighbors(n_neighbors=1,
                                algorithm='auto').fit(unseen_attributes)
        for seen_idx, seen_att in zip(seenclasses, seen_attributes):
            _, indices = nbrs.kneighbors(seen_att[None, :])
            seen_unseen_label_pairs[seen_idx.tolist()] = unseenclasses[
                indices[0][0]].tolist()

        #ADDING ONLY SEEN LABELS
        idx = torch.randperm(
            len(train_seen_label))[0:int(len(train_seen_label) * seen_percent)]
        seen_labels = train_seen_label[idx]
        _new_gzsl_syn_list = torch.zeros(seen_labels.shape[0],
                                         attribute.shape[0])
        _new_gzsl_syn_list[:, :len(seenclasses)] = seen_labels
        new_gzsl_syn_list.append(_new_gzsl_syn_list)

        #ADDING ONLY UNSEEN LABELS
        idx = torch.randperm(len(
            train_seen_label))[0:int(len(train_seen_label) * unseen_percent)]
        temp_label = train_seen_label[idx]
        _new_gzsl_syn_list = torch.zeros(temp_label.shape[0],
                                         attribute.shape[0])
        for m, lab in enumerate(temp_label):
            new_lab = torch.zeros(attribute.shape[0])
            unseen_lab = lab.nonzero().flatten()
            u = []
            for i in unseen_lab:
                u.append(seen_unseen_label_pairs[i.tolist()])
            new_lab[u] = 1
            _new_gzsl_syn_list[m, :] = new_lab
        unseen_labels = _new_gzsl_syn_list
        new_gzsl_syn_list.append(unseen_labels)

        #ADDING BOTH SEEN AND UNSEEN LABELS 50% OF THE SELECTED SEEN LABELS IS MAPPED TO UNSEEN LABELS
        idx = torch.randperm(
            len(train_seen_label
                ))[0:int(len(train_seen_label) * seen_unseen_percent)]
        temp_label = train_seen_label[idx]
        _new_gzsl_syn_list = torch.zeros(temp_label.shape[0],
                                         attribute.shape[0])
        for m, lab in enumerate(temp_label):
            u = []
            new_lab = torch.zeros(attribute.shape[0])
            seen_unseen_lab = lab.nonzero().flatten()
            temp_seen_label = np.random.choice(
                seen_unseen_lab, int(len(seen_unseen_lab) * 0.50))
            u.extend(temp_seen_label)
            rem_seen_label = np.setxor1d(temp_seen_label, seen_unseen_lab)
            for i in rem_seen_label:
                u.append(seen_unseen_label_pairs[i.tolist()])
            new_lab[u] = 1
            _new_gzsl_syn_list[m, :] = new_lab
        seen_unseen_labels = _new_gzsl_syn_list
        new_gzsl_syn_list.append(seen_unseen_labels)

        new_gzsl_syn_list = torch.cat(new_gzsl_syn_list)
        gzsl.append(new_gzsl_syn_list)

    gzsl = torch.cat(gzsl)
    tmp_list = gzsl.sum(0)
    ## To make sure every unseen label gets covered
    empty_lab = torch.arange(tmp_list.numel())[tmp_list == 0]
    min_uc = int(tmp_list[len(seenclasses):][
        tmp_list[len(seenclasses):] > 0].min().item())
    for el in empty_lab:
        idx = torch.randperm(gzsl.size(0))[:min_uc]
        gzsl[idx, el] = 1
    gzsl = gzsl.long()
    print("GZSL TEST LABELS:", gzsl.shape)
    return gzsl
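
A small, isolated sketch of the seen-to-unseen nearest-neighbor mapping step used above, on hypothetical random attributes (shapes and names are illustrative only):

import torch
from sklearn.neighbors import NearestNeighbors

attribute = torch.randn(10, 300)  # 6 seen + 4 unseen class embeddings
seenclasses, unseenclasses = torch.arange(0, 6), torch.arange(6, 10)
nbrs = NearestNeighbors(n_neighbors=1).fit(attribute[unseenclasses].numpy())
_, idx = nbrs.kneighbors(attribute[seenclasses].numpy())
pairs = {s.item(): unseenclasses[j[0]].item() for s, j in zip(seenclasses, idx)}
print(pairs)  # each seen class index -> its closest unseen class index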
Example #6
def pre_auto_cluster(PhoneValueVector_Chinese, ValueVectorArray_Chi,
                     n_neighbor, plot):

    value_size, feasure_size = ValueVectorArray_Chi.shape
    nbrs = NearestNeighbors(n_neighbors=n_neighbor + 1,
                            algorithm='brute',
                            metric='cosine').fit(ValueVectorArray_Chi)
    knn_matrix = nbrs.kneighbors(ValueVectorArray_Chi, return_distance=False)
    cosine_dist = (1 - cosine_similarity(ValueVectorArray_Chi))
    #local density
    k_dis_list = []
    for q in range(knn_matrix.shape[0]):
        k_dis = 0
        for p in range(1, knn_matrix.shape[1]):
            dis = cosine_dist[q][knn_matrix[q][p]]
            k_dis += dis
        k_dis_mean = (knn_matrix.shape[1] - 1) / (k_dis + 1
                                                  )  # sys.float_info.min
        k_dis_list.append(k_dis_mean)
    #density base distance
    min_dist_list = []
    dist_matrix = pairwise_distances(ValueVectorArray_Chi,
                                     Y=None,
                                     metric='cosine')
    k_dis_sort = sorted(enumerate(k_dis_list), key=lambda x: x[1])
    for n in range(len(k_dis_sort)):
        dist_higher_list = []
        for m in range(n + 1, len(k_dis_sort)):
            dist_higher = dist_matrix[k_dis_sort[n][0]][k_dis_sort[m][0]]
            dist_higher_list.append({
                'value index': k_dis_sort[n][0],
                'shortest index': k_dis_sort[m][0],
                'dist': dist_higher
            })
        if len(dist_higher_list) > 0:
            min_dist = min(dist_higher_list, key=lambda x: x['dist'])
            min_dist_list.append(min_dist)
        else:
            index = dist_matrix[k_dis_sort[n][0]].tolist().index(
                max(dist_matrix[k_dis_sort[n][0]]))

            max_dist = ({
                'value index': k_dis_sort[n][0],
                'shortest index': index,
                'dist': dist_matrix[k_dis_sort[n][0]][index]
            })
            min_dist_list.append(max_dist)
    ld_dbd = pd.DataFrame(min_dist_list)
    ld_dbd['local density'] = sorted(k_dis_list)
    ld_dbd = ld_dbd.sort_values(by='value index').reset_index(drop=True)

    value_name_list = []
    for j in ld_dbd['value index'].tolist():
        value_name_list.append(PhoneValueVector_Chinese.loc[j]['value'])
    ld_dbd['value name'] = value_name_list

    combine = np.array(k_dis_list) * np.array(ld_dbd['dist'].tolist())
    combine_list = ({
        'value index':
        list(range(combine.shape[0])),
        'combine':
        combine,
        'value name':
        PhoneValueVector_Chinese.loc[list(range(combine.shape[0]))]['value']
    })

    combine_df = pd.DataFrame(combine_list)
    combine_df.index = range(len(combine_df))
    combine_ = sorted(list(combine))
    combine_df = combine_df.sort_values(by='combine',
                                        ascending=False).reset_index(drop=True)
    if plot != 0:
        #plot to identify the cluster center and size
        plt.figure(figsize=(14, 10))
        plt.scatter(list(range(len(combine_))), combine_)
        plt.show()
        diff_list = []
        combine_sort = combine_df['combine'].tolist()
        for q in range(len(combine_df) - 1):
            diff = abs(combine_sort[q + 1] - combine_sort[q])
            diff_list.append(diff)
        plt.figure(figsize=(14, 8))
        plt.bar(list(range(plot)), diff_list[:plot])
        plt.show()
    return ld_dbd, combine_df
Example #7
    def __init__(self,
                 ratio='auto',
                 random_state=None,
                 verbose=True,
                 k=5,
                 m=10,
                 out_step=0.5,
                 kind='regular',
                 n_jobs=-1,
                 **kwargs):
        """Initialisation of SMOTE object.

        Parameters
        ----------
        ratio : str or float, optional (default='auto')
            If 'auto', the ratio will be defined automatically to balance
            the dataset. Otherwise, the ratio corresponds to the number of
            samples in the minority class over the number of samples in the
            majority class.

        random_state : int or None, optional (default=None)
            Seed for random number generation.

        verbose : bool, optional (default=True)
            Whether or not to print information about the processing.

        k : int, optional (default=5)
            Number of nearest neighbours to use to construct synthetic
            samples.

        m : int, optional (default=10)
            Number of nearest neighbours to use to determine if a minority
            sample is in danger.

        out_step : float, optional (default=0.5)
            Step size when extrapolating.

        kind : str, optional (default='regular')
            The type of SMOTE algorithm to use; one of the following
            options: 'regular', 'borderline1', 'borderline2', 'svm'.

        n_jobs : int, optional (default=-1)
            Number of threads to run the algorithm with, when possible.

        """
        super(SMOTE, self).__init__(ratio=ratio,
                                    random_state=random_state,
                                    verbose=verbose)

        # Check the number of thread to use
        self.n_jobs = n_jobs

        # --- The type of smote
        # This object can perform regular smote over-sampling, borderline 1,
        # borderline 2 and svm smote. Since the algorithms are fairly simple
        # they share most methods.
        possible_kind = ('regular', 'borderline1', 'borderline2', 'svm')
        if kind in possible_kind:
            self.kind = kind
        else:
            raise ValueError('Unknown kind for SMOTE algorithm.')

        # --- Verbose
        # Control whether or not status and progress information should be printed
        self.verbose = verbose

        # --- Nearest Neighbours for synthetic samples
        # The smote algorithm uses the k-th nearest neighbours of a minority
        # sample to generate new synthetic samples.
        self.k = k

        # --- NN object
        # Import the NN object from scikit-learn library. Since in the smote
        # variations we must first find samples that are in danger, we
        # initialize the NN object differently depending on the method chosen
        if kind == 'regular':
            # Regular smote does not look for samples in danger; instead it
            # creates synthetic samples directly from the k-th nearest
            # neighbours without any filtering
            self.nearest_neighbour_ = NearestNeighbors(n_neighbors=k + 1,
                                                       n_jobs=self.n_jobs)
        else:
            # Borderline1, 2 and SVM variations of smote must first look for
            # samples that could be considered noise and samples that live
            # near the boundary between the classes. Therefore, before
            # creating synthetic samples from the k-th nns, it first looks
            # for the m nearest neighbours to decide whether or not a sample
            # is noise or near the boundary.
            self.nearest_neighbour_ = NearestNeighbors(n_neighbors=m + 1,
                                                       n_jobs=self.n_jobs)

            # --- Nearest Neighbours for noise and boundary (in danger)
            # Before creating synthetic samples we must first decide if
            # a given entry is noise or in danger. We use m nns in this step
            self.m = m

        # --- SVM smote
        # Unlike the borderline variations, the SVM variation uses the support
        # vectors to decide which samples are in danger (near the boundary).
        # Additionally, it introduces extrapolation for samples that are
        # considered safe (far from the boundary) and interpolation for
        # samples in danger (near the boundary). The level of extrapolation
        # is controlled by out_step.
        if kind == 'svm':
            # Store extrapolation size
            self.out_step = out_step

            # Store SVM object with any parameters
            self.svm_ = SVC(random_state=self.rs_, **kwargs)
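
For reference, a rough present-day equivalent of this older API, assuming the imbalanced-learn package is installed (the class and parameter names below come from current imblearn, not from the snippet above):

import numpy as np
from imblearn.over_sampling import SMOTE

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0, 1, (90, 2)), rng.normal(3, 1, (10, 2))])
y = np.array([0] * 90 + [1] * 10)

sm = SMOTE(k_neighbors=5, random_state=0)  # 'regular' SMOTE
X_res, y_res = sm.fit_resample(X, y)
print(np.bincount(y_res))  # classes are now balanced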
Example #8
    'totalRatingCount >= @popularity_threshold')
rating_popular_book.head()

moviemat = Movies_with_Rating.pivot_table(index='title',
                                          columns='user_id',
                                          values='rating').fillna(0)

moviemat.head()

Movie_ratingCount.sort_values('totalRatingCount', ascending=False).head(10)
#Filter
user_rating_matrix = csr_matrix(moviemat.values)

from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(user_rating_matrix)

query_index = np.random.choice(moviemat.shape[0])
us = np.array(moviemat.iloc[query_index, :]).reshape(1, -1)
distances, indices = model_knn.kneighbors(us, n_neighbors=6)
x_text = []
for i in range(0, len(distances.flatten())):

    if i == 0:

        print('Recommendations for {0}:\n'.format(moviemat.index[query_index]))
    else:
        x_text.append(moviemat.index[indices.flatten()[i]])
        print('{0}: {1}, with distance of {2}:'.format(
            i, moviemat.index[indices.flatten()[i]],
            distances.flatten()[i]))
Example #9
def knn_predictor(x_train, y_train, x_test, y_test):
    # NearestNeighbors has no predict()/score(); use KNeighborsClassifier instead
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import precision_recall_fscore_support
    clf = KNeighborsClassifier(n_neighbors=5)
    clf.fit(x_train, y_train)
    accuracy = clf.score(x_test, y_test)
    f1 = precision_recall_fscore_support(y_test, clf.predict(x_test),
                                         average='weighted')[2]
    print(accuracy, f1)
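
A quick sanity check on a toy dataset (an assumed calling convention, matching the signature above):

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
x_tr, x_te, y_tr, y_te = train_test_split(X, y, random_state=0)
knn_predictor(x_tr, y_tr, x_te, y_te)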
Example #10
def knn_predictor(audio_feats, k=100):
    """
    differences_df = knn_predictor(audio_features)
    """

    # Scale the data with standard scaler
    scaler = StandardScaler()
    spotify_scaled = scaler.fit_transform(spotify)

    ################################################
    audio_feats_scaled = scaler.transform([audio_feats])

    ## Nearest Neighbors model
    knn = NearestNeighbors(n_neighbors=k, algorithm='kd_tree')
    knn.fit(spotify_scaled)

    # JOBLIB dump
    dump(knn, 'knn_final.joblib', compress=True)

    # make prediction
    prediction = knn.kneighbors(audio_feats_scaled)

    # create an index for similar songs
    similar_songs_index = prediction[1][0][:25].tolist()

    # Create an empty list to store similar song names
    similar_song_ids = []
    similar_song_names = []

    # loop over the indexes and append song names to empty list above
    for i in similar_songs_index:
        song_id = identify['track_id'].iloc[i]
        similar_song_ids.append(song_id)
        song_name = identify['track_name'].iloc[i]
        similar_song_names.append(song_name)

    #################################################

    column_names = spotify.columns.tolist()

    # put scaled audio features into a dataframe
    audio_feats_scaled_df = pd.DataFrame(audio_feats_scaled,
                                         columns=column_names)

    # create empty list of similar songs' features
    similar_songs_features = []

    # loop through the indexes of similar songs to get audio features for
    # each similar song
    for index in similar_songs_index:
        list_of_feats = spotify.iloc[index].tolist()
        similar_songs_features.append(list_of_feats)

    # scale the features and turn them into a dataframe
    similar_feats_scaled = scaler.transform(similar_songs_features)
    similar_feats_scaled_df = pd.DataFrame(similar_feats_scaled,
                                           columns=column_names)

    # get the % difference between the outputs and input songs
    col_names = similar_feats_scaled_df.columns.to_list()
    diff_df = pd.DataFrame(columns=col_names)
    for i in range(25):
        diff = abs(similar_feats_scaled_df.iloc[i] -
                   audio_feats_scaled_df.iloc[0])
        # print('type: ', type(similar_feats_scaled_df.iloc[i]))
        diff_df.loc[i] = diff

    # add sums of differences
    diff_df['sum'] = diff_df.sum(axis=1)
    diff_df = diff_df.sort_values(by=['sum'])
    diff_df = diff_df.reset_index(drop=True)

    # add track_id to DF
    diff_df['track_id'] = similar_song_ids

    # reorder cols to have track_id as first column
    cols = list(diff_df)
    cols.insert(0, cols.pop(cols.index('track_id')))
    diff_df = diff_df.loc[:, cols]

    # Remove the suggestion of the same song (all 0's)
    diff_df = diff_df[~(diff_df == 0).any(axis=1)]

    # Grab only the unique 10 songs
    diff_df = diff_df.drop_duplicates(subset=['sum'])[:10]

    diff_df = diff_df.reset_index(drop=True)

    return diff_df
Example #11
    #allsubspaces=range(max_count-1-7,0,-1)
    frmt = str(col) + 'b'
    factor = 1
    output = eig_vecs[:, :].dot(X)
    output = output.T
    count = 0
    for i in allsubspaces:
        bin_value = str(format(i, frmt))
        bin_value = bin_value[::-1]
        subspace_col = [
            col - 2 - index for index, value in enumerate(bin_value)
            if value == '1'
        ]
        print("%d : %s : '%s'" % (i, subspace_col, bin_value[::-1]))
        np_subspace = output[:, subspace_col]
        nbrs = NearestNeighbors(n_neighbors=k,
                                algorithm='ball_tree').fit(np_subspace)
        temp = nbrs.kneighbors_graph(np_subspace).toarray()
        temp = temp.astype(np.uint64)
        heidi_matrix = heidi_matrix * 2 + temp
        temp_array.append([heidi_matrix])
        factor = factor * 2
        count += 1

    max_count = max_count - 1
    r = int(max_count / 3)
    g = int(max_count / 3)
    b = max_count - r - g

    x = heidi_matrix >> (max_count - r)
    y = (heidi_matrix & ((pow(2, g) - 1) << b)) >> b
    z = (heidi_matrix & (pow(2, b) - 1))
Example #12
def compute_velocity_on_grid(X_emb,
                             V_emb,
                             density=None,
                             smooth=None,
                             n_neighbors=None,
                             min_mass=None,
                             autoscale=True,
                             adjust_for_stream=False,
                             cutoff_perc=None):
    # remove invalid cells
    idx_valid = np.isfinite(X_emb.sum(1) + V_emb.sum(1))
    X_emb = X_emb[idx_valid]
    V_emb = V_emb[idx_valid]

    # prepare grid
    n_obs, n_dim = X_emb.shape
    density = 1 if density is None else density
    smooth = .5 if smooth is None else smooth

    grs = []
    for dim_i in range(n_dim):
        m, M = np.min(X_emb[:, dim_i]), np.max(X_emb[:, dim_i])
        m = m - .01 * np.abs(M - m)
        M = M + .01 * np.abs(M - m)
        gr = np.linspace(m, M, int(50 * density))
        grs.append(gr)

    meshes_tuple = np.meshgrid(*grs)
    X_grid = np.vstack([i.flat for i in meshes_tuple]).T

    # estimate grid velocities
    if n_neighbors is None: n_neighbors = int(n_obs / 50)
    nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=-1)
    nn.fit(X_emb)
    dists, neighs = nn.kneighbors(X_grid)

    scale = np.mean([(g[1] - g[0]) for g in grs]) * smooth
    weight = normal.pdf(x=dists, scale=scale)
    p_mass = weight.sum(1)

    V_grid = (V_emb[neighs] * weight[:, :, None]).sum(1) / np.maximum(
        1, p_mass)[:, None]
    if min_mass is None: min_mass = 1

    if adjust_for_stream:
        X_grid = np.stack([np.unique(X_grid[:, 0]), np.unique(X_grid[:, 1])])
        ns = int(np.sqrt(len(V_grid[:, 0])))
        V_grid = V_grid.T.reshape(2, ns, ns)

        mass = np.sqrt((V_grid**2).sum(0))
        min_mass = 10**(min_mass - 6)  # default min_mass = 1e-5
        min_mass = np.clip(min_mass, None, np.max(mass) * .9)
        cutoff = mass.reshape(V_grid[0].shape) < min_mass

        if cutoff_perc is None: cutoff_perc = 5
        length = np.sum(np.mean(np.abs(V_emb[neighs]), axis=1),
                        axis=1).T.reshape(ns, ns)
        cutoff |= length < np.percentile(length, cutoff_perc)

        V_grid[0][cutoff] = np.nan
    else:
        min_mass *= np.percentile(p_mass, 99) / 100
        X_grid, V_grid = X_grid[p_mass > min_mass], V_grid[p_mass > min_mass]

        if autoscale: V_grid /= 3 * quiver_autoscale(X_grid, V_grid)

    return X_grid, V_grid
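
A minimal call sketch on synthetic embeddings (assumes the function's own module-level imports, e.g. normal for scipy.stats.norm, are available as in the original file; autoscale is disabled so the quiver_autoscale helper is not needed):

import numpy as np

X_emb = np.random.randn(1000, 2)
V_emb = 0.1 * np.random.randn(1000, 2)
X_grid, V_grid = compute_velocity_on_grid(X_emb, V_emb, autoscale=False)
print(X_grid.shape, V_grid.shape)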
Example #13
ur_test = pandas.read_csv("predictions.dat", sep='\t')
ur_train = ur

max_movie_id = len(np.unique(ur["movieID"]))
max_user_id = len(np.unique(ur["userID"]))

movieMap = np.unique(ur["movieID"])
userMap = np.unique(ur["userID"])

ratingsMatrix = np.ones((max_user_id, max_movie_id)) * 3.7123

for index, row in ur_train.iterrows():
    movieIdx = np.where(movieMap == row["movieID"])
    userIdx = np.where(userMap == row["userID"])
    ratingsMatrix[userIdx, movieIdx] = row["rating"]

predictedRatings = np.zeros((2, testInstances))

model = NearestNeighbors()

idx = 0

for index, row in ur_test.iterrows():
    getRating(row, ratingsMatrix, movieMap, userMap, idx, predictedRatings)
    idx += 1
    print(idx, idx / float(testInstances))

df = pandas.DataFrame(data=predictedRatings.T,
                      columns=["testID", "predicted rating"])
df[['testID']] = df[['testID']].astype(int)
df.to_csv("mypredictions.dat", index=False)
Example #14
filename = "Patients.csv"
rows = []

with open(filename, 'r') as csvfile:
    csvreader = csv.reader(csvfile)

    for row in csvreader:
        rows.append(list(map(int, row)))

data = np.array(rows)
data_nor = StandardScaler().fit_transform(data)
dimens = data.shape[1]

min_poin = 2 * dimens
print("Min_points = ", min_poin)
neigh = NearestNeighbors(n_neighbors=min_poin - 1)
nbrs = neigh.fit(data_nor)
distance, indices = nbrs.kneighbors(data_nor)

distance = np.sort(distance, axis=0)
distances = distance[:, 1]
plt.plot(distances)
plt.show()
y = distances
x = range(1, len(y) + 1)
kn = KneeLocator(x, y, curve='convex', direction='increasing')
elb = distances[kn.knee - 1]

plt.xlabel('points sorted by k-distance')
plt.ylabel('Distance (candidate for epsilon)')
plt.plot(x, y)
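
A possible continuation (not in the original snippet): feed the detected knee value into DBSCAN as eps, together with the min_points heuristic computed above.

from sklearn.cluster import DBSCAN

db = DBSCAN(eps=elb, min_samples=min_poin).fit(data_nor)
n_clusters = len(set(db.labels_)) - (1 if -1 in db.labels_ else 0)
print("Estimated number of clusters:", n_clusters)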
Example #15
    features_array[:data.shape[0], 4] = np.array(features_dict['energy']).T
    features_array[:data.shape[0],
                   5] = np.array(features_dict['instrumentalness']).T
    features_array[:data.shape[0], 6] = np.array(features_dict['liveness']).T
    features_array[:data.shape[0], 7] = np.array(features_dict['loudness']).T
    features_array[:data.shape[0], 8] = np.array(features_dict['mode']).T
    features_array[:data.shape[0],
                   9] = np.array(features_dict['speechiness']).T
    features_array[:data.shape[0], 10] = np.array(features_dict['tempo']).T
    features_array[:data.shape[0], 11] = np.array(features_dict['year']).T

    features_array[data.shape[0], :] = input_

    features_array = features_array.astype('float64')

    model = NearestNeighbors(n_neighbors=5000, algorithm='ball_tree')

    scalar = StandardScaler()
    scalar.fit(features_array)
    features_array = scalar.transform(features_array)
    input_2 = features_array[data.shape[0], :]
    model.fit(features_array[:data.shape[0], :])
    distances, indices = model.kneighbors([input_2])
    recorded_indices = indices

    closest_1000_point = {
        'valence': [],
        'year': [],
        'popularity': [],
        'mode': [],
        'acousticness': [],
Example #16
    def fit(self, X, y, training_indices=None):
        """Specify data to be plotted, and fit classifier only if required (the
        specified classifier is only trained if it has not been trained yet).

        All the input data is provided in the matrix X, and corresponding
        binary labels (values taking 0 or 1) in the vector y

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            An [n_samples, n_features] matrix containing the data

        y : array-like, shape = [n_samples]
            Labels

        training_indices : array-like or float, optional (default=None)
            Indices on which the classifier has been trained / should be trained.
            If float, it is converted to a random sample with the specified proportion
            of the full dataset.

        Returns
        -------
        self : returns an instance of self.
        """
        if set(np.array(y, dtype=int).tolist()) != set([0, 1]):
            raise Exception(
                "Currently only implemented for binary classification. Make sure you pass in two classes (0 and 1)"
            )

        if training_indices is None:
            train_idx = range(len(y))
        elif isinstance(training_indices, float):
            # interpret the float as the proportion of data used for training
            train_idx, test_idx = train_test_split(range(len(y)),
                                                   train_size=training_indices)
        else:
            train_idx = training_indices

        self.X = X
        self.y = y
        self.train_idx = train_idx
        #self.test_idx = np.setdiff1d(np.arange(len(y)), self.train_idx, assume_unique=False)
        self.test_idx = list(
            set(range(len(y))).difference(set(self.train_idx)))

        # fit classifier if necessary
        try:
            self.classifier.predict([X[0]])
        except:
            self.classifier.fit(X[train_idx, :], y[train_idx])

        self.y_pred = self.classifier.predict(self.X)

        # fit DR method if necessary
        try:
            self.dimensionality_reduction.transform([X[0]])
        except:
            self.dimensionality_reduction.fit(X, y)

        try:
            self.dimensionality_reduction.transform([X[0]])
        except:
            raise Exception(
                "Please make sure your dimensionality reduction method has an exposed transform() method! If in doubt, use PCA or Isomap"
            )

        # transform data
        self.X2d = self.dimensionality_reduction.transform(self.X)
        self.mean_2d_dist = np.mean(pdist(self.X2d))
        self.X2d_xmin, self.X2d_xmax = np.min(self.X2d[:,
                                                       0]), np.max(self.X2d[:,
                                                                            0])
        self.X2d_ymin, self.X2d_ymax = np.min(self.X2d[:,
                                                       1]), np.max(self.X2d[:,
                                                                            1])

        self.majorityclass = 0 if list(y).count(0) > list(y).count(1) else 1
        self.minorityclass = 1 - self.majorityclass
        minority_idx, majority_idx = np.where(
            y == self.minorityclass)[0], np.where(y == self.majorityclass)[0]
        self.Xminor, self.Xmajor = X[minority_idx], X[majority_idx]
        self.Xminor2d, self.Xmajor2d = self.X2d[minority_idx], self.X2d[
            majority_idx]

        # set up efficient nearest neighbor models for later use
        self.nn_model_2d_majorityclass = NearestNeighbors(n_neighbors=2)
        self.nn_model_2d_majorityclass.fit(self.X2d[majority_idx, :])

        self.nn_model_2d_minorityclass = NearestNeighbors(n_neighbors=2)
        self.nn_model_2d_minorityclass.fit(self.X2d[minority_idx, :])

        # step 1. look for decision boundary points between corners of majority &
        # minority class distribution
        minority_corner_idx, majority_corner_idx = [], []
        for extremum1 in [np.min, np.max]:
            for extremum2 in [np.min, np.max]:
                _, idx = self.nn_model_2d_minorityclass.kneighbors([[
                    extremum1(self.Xminor2d[:, 0]),
                    extremum2(self.Xminor2d[:, 1])
                ]])
                minority_corner_idx.append(idx[0][0])
                _, idx = self.nn_model_2d_majorityclass.kneighbors([[
                    extremum1(self.Xmajor2d[:, 0]),
                    extremum2(self.Xmajor2d[:, 1])
                ]])
                majority_corner_idx.append(idx[0][0])

        # optimize to find new db keypoints between corners
        self._linear_decision_boundary_optimization(minority_corner_idx,
                                                    majority_corner_idx,
                                                    all_combinations=True,
                                                    step=1)

        # step 2. look for decision boundary points on lines connecting randomly
        # sampled points of majority & minority class
        n_samples = int(self.n_connecting_keypoints)
        from_idx = list(
            random.sample(list(np.arange(len(self.Xminor))), n_samples))
        to_idx = list(
            random.sample(list(np.arange(len(self.Xmajor))), n_samples))

        # optimize to find new db keypoints between minority and majority class
        self._linear_decision_boundary_optimization(from_idx,
                                                    to_idx,
                                                    all_combinations=False,
                                                    step=2)

        if len(self.decision_boundary_points_2d) < 2:
            print(
                "Failed to find initial decision boundary. Retrying... If this keeps happening, increasing the acceptance threshold might help. Also, make sure the classifier is able to find a point with 0.5 prediction probability (usually requires an even number of estimators/neighbors/etc)."
            )
            return self.fit(X, y, training_indices)

        # step 3. look for decision boundary points between already known db
        # points that are too distant (search on connecting line first, then on
        # surrounding hypersphere surfaces)
        edges, gap_distances, gap_probability_scores = self._get_sorted_db_keypoint_distances(
        )  # find gaps
        self.nn_model_decision_boundary_points = NearestNeighbors(
            n_neighbors=2)
        self.nn_model_decision_boundary_points.fit(
            self.decision_boundary_points)

        i = 0
        retries = 0
        while i < self.n_interpolated_keypoints:
            if self.verbose:
                print("Step 3/{}:{}/".format(self.steps, i,
                                             self.n_interpolated_keypoints))
            if self.random_gap_selection:
                # randomly sample from sorted DB keypoint gaps?
                gap_idx = np.random.choice(len(gap_probability_scores),
                                           1,
                                           p=gap_probability_scores)[0]
            else:
                # get largest gap
                gap_idx = 0
            from_point = self.decision_boundary_points[edges[gap_idx][0]]
            to_point = self.decision_boundary_points[edges[gap_idx][1]]

            # optimize to find new db keypoint along line connecting two db keypoints
            # with large gap
            db_point = self._find_decision_boundary_along_line(
                from_point,
                to_point,
                penalize_tangent_distance=self.penalties_enabled)

            if self.decision_boundary_distance(
                    db_point) > self.acceptance_threshold:
                if self.verbose:
                    print(
                        "No good solution along straight line - trying to find decision boundary on hypersphere surface around known decision boundary point"
                    )

                # hypersphere radius half the distance between from and to db keypoints
                R = euclidean(from_point, to_point) / 2.0
                # search around either source or target keypoint, with 0.5 probability,
                # hoping to find decision boundary in between
                if random.random() > 0.5:
                    from_point = to_point

                # optimize to find new db keypoint on hypersphere surphase around known keypoint
                db_point = self._find_decision_boundary_on_hypersphere(
                    from_point, R)
                if self.decision_boundary_distance(
                        db_point) <= self.acceptance_threshold:
                    db_point2d = self.dimensionality_reduction.transform(
                        [db_point])[0]
                    self.decision_boundary_points.append(db_point)
                    self.decision_boundary_points_2d.append(db_point2d)
                    i += 1
                    retries = 0
                else:
                    retries += 1
                    if retries > self.hypersphere_max_retry_budget:
                        i += 1
                        dist = self.decision_boundary_distance(db_point)
                        msg = "Found point is too distant from decision boundary ({}), but retry budget exceeded ({})"
                        print(
                            msg.format(dist,
                                       self.hypersphere_max_retry_budget))
                    elif self.verbose:
                        dist = self.decision_boundary_distance(db_point)
                        print(
                            "Found point is too distant from decision boundary ({}) retrying..."
                            .format(dist))

            else:
                db_point2d = self.dimensionality_reduction.transform(
                    [db_point])[0]
                self.decision_boundary_points.append(db_point)
                self.decision_boundary_points_2d.append(db_point2d)
                i += 1
                retries = 0

            edges, gap_distances, gap_probability_scores = self._get_sorted_db_keypoint_distances(
            )  # reload gaps

        self.decision_boundary_points = np.array(self.decision_boundary_points)
        self.decision_boundary_points_2d = np.array(
            self.decision_boundary_points_2d)

        if self.verbose:
            print("Done fitting! Found {} decision boundary keypoints.".format(
                len(self.decision_boundary_points)))

        return self
Example #17
    # Hyperparameters are described here.
    parser.add_argument("--n_neighbors", type=int, default=10)
    parser.add_argument("--metric", type=str, default="cosine")

    # Sagemaker specific arguments. Defaults are set in the environment variables.
    parser.add_argument("--output-data-dir", type=str)
    parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
    parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"])

    args = parser.parse_args()

    # Load the training data into a Pandas dataframe and make sure it is in the appropriate format
    embeddings = pd.read_csv(
        os.path.join(args.train, "embeddings.csv.tar.gz"),
        compression="gzip",
        index_col=False,
        header=None,
    )

    # Supply the hyperparameters of the nearest neighbors model
    n_neighbors = args.n_neighbors
    metric = args.metric

    # Now, fit the nearest neighbors model
    nn = NearestNeighbors(n_neighbors=n_neighbors, metric=metric)
    model_nn = nn.fit(embeddings)
    print("model has been fitted")

    # Save the model to the output location in S3
    joblib.dump(model_nn, os.path.join(args.model_dir, "model.joblib"))
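
A hypothetical follow-up (not part of the training script): load the persisted model and query the neighbors of one embedding row.

import joblib

model = joblib.load(os.path.join(args.model_dir, "model.joblib"))
distances, indices = model.kneighbors(embeddings.iloc[[0]].values, n_neighbors=5)
print(indices[0])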
Example #18
    def _generate_testpoints(self, tries=100):
        """Generate random demo points around decision boundary keypoints
        """
        nn_model = NearestNeighbors(n_neighbors=3)
        nn_model.fit(self.decision_boundary_points)

        nn_model_2d = NearestNeighbors(n_neighbors=2)
        nn_model_2d.fit(self.decision_boundary_points_2d)
        #max_radius = 2*np.max([nn_model_2d.kneighbors([self.decision_boundary_points_2d[i]])[0][0][1] for i in range(len(self.decision_boundary_points_2d))])

        self.X_testpoints = np.zeros((0, self.X.shape[1]))
        self.y_testpoints = []
        for i in range(len(self.decision_boundary_points)):
            if self.verbose:
                msg = "Generating testpoint for plotting {}/{}"
                print(msg.format(i, len(self.decision_boundary_points)))
            testpoints = np.zeros((0, self.X.shape[1]))
            # generate Np points in Gaussian around decision_boundary_points[i] with
            # radius depending on the distance to the next point
            d, idx = nn_model.kneighbors([self.decision_boundary_points[i]])
            radius = d[0][1] if d[0][1] != 0 else d[0][2]
            if radius == 0:
                radius = np.mean(pdist(self.decision_boundary_points_2d))
            max_radius = radius * 2
            radius /= 5.0

            # add demo points, keeping some balance
            max_imbalance = 5.0
            y_testpoints = []
            for j in range(self.n_generated_testpoints_per_keypoint - 2):
                c_radius = radius
                freq = np.array(np.unique(y_testpoints,
                                          return_counts=True)).T.astype(float)
                imbalanced = freq.shape[0] != 0
                if freq.shape[0] == 2 and (
                        freq[0, 1] / freq[1, 1] < 1.0 / max_imbalance
                        or freq[0, 1] / freq[1, 1] > max_imbalance):
                    imbalanced = True

                for try_i in range(tries):
                    testpoint = np.random.normal(
                        self.decision_boundary_points[i], radius,
                        (1, self.X.shape[1]))
                    try:
                        testpoint2d = self.dimensionality_reduction.transform(
                            testpoint)[0]
                    except:  # DR can fail e.g. if NMF gets negative values
                        testpoint = []
                        continue
                    # demo point needs to be close to current key point
                    if euclidean(
                            testpoint2d,
                            self.decision_boundary_points_2d[i]) <= max_radius:
                        if not imbalanced:  # needs to be not imbalanced
                            break
                        y_pred = self.classifier.predict(testpoint)[0]
                        # imbalanced but this would actually improve things
                        if freq.shape[0] == 2 and freq[y_pred,
                                                       1] < freq[1 - y_pred,
                                                                 1]:
                            break
                    c_radius /= 2.0
                if len(testpoint) != 0:
                    testpoints = np.vstack((testpoints, testpoint))
                    y_testpoints.append(self.classifier.predict(testpoint)[0])

            self.X_testpoints = np.vstack((self.X_testpoints, testpoints))
            self.y_testpoints = np.hstack((self.y_testpoints, y_testpoints))
            self.X_testpoints_2d = self.dimensionality_reduction.transform(
                self.X_testpoints)

        idx_within_bounds = np.where(
            (self.X_testpoints_2d[:, 0] >= self.X2d_xmin)
            & (self.X_testpoints_2d[:, 0] <= self.X2d_xmax)
            & (self.X_testpoints_2d[:, 1] >= self.X2d_ymin)
            & (self.X_testpoints_2d[:, 1] <= self.X2d_ymax))[0]
        self.X_testpoints = self.X_testpoints[idx_within_bounds]
        self.y_testpoints = self.y_testpoints[idx_within_bounds]
        self.X_testpoints_2d = self.X_testpoints_2d[idx_within_bounds]
Example #19
def auto_cluster(filepath, number_of_cluster, n_neighbor, top_num, plot):

    PhoneValueVector_Chinese, ValueVectorArray_Chi = gainvector(filepath)
    ld_dbd, combine_df = pre_auto_cluster(PhoneValueVector_Chinese,
                                          ValueVectorArray_Chi, n_neighbor,
                                          plot)

    #Start clustering
    value_size, feasure_size = ValueVectorArray_Chi.shape
    nbrs_ = NearestNeighbors(n_neighbors=20,
                             algorithm='brute',
                             metric='cosine').fit(ValueVectorArray_Chi)
    knn_matrix_ = nbrs_.kneighbors(ValueVectorArray_Chi, return_distance=False)
    dist_matrix = pairwise_distances(ValueVectorArray_Chi,
                                     Y=None,
                                     metric='cosine')
    cluster = []
    centers_ = combine_df['value index'].tolist()[0:number_of_cluster]

    cluster_len1 = 0
    cluster_len2 = 1
    value_list = []
    while len(cluster) != ValueVectorArray_Chi.shape[
            0] - number_of_cluster and cluster_len1 != cluster_len2:
        cluster_len1 = len(cluster)
        for j in range(len(PhoneValueVector_Chinese)):
            if PhoneValueVector_Chinese.loc[j]['value'] not in [
                    k['value'] for k in cluster
            ]:
                if j not in centers_:
                    near_index = ld_dbd['shortest index'][j]
                    near_value = PhoneValueVector_Chinese.loc[near_index][
                        'value']
                    if near_index in centers_:
                        cluster.append({
                            'label':
                            PhoneValueVector_Chinese.loc[near_index]['value'],
                            'value':
                            PhoneValueVector_Chinese.loc[j]['value'],
                            'dist':
                            dist_matrix[near_index][j]
                        })
                        value_list.append(
                            PhoneValueVector_Chinese.loc[near_index]['value'])
                        value_list.append(
                            PhoneValueVector_Chinese.loc[j]['value'])
                    elif len([k for k in cluster if k['value'] == near_value
                              ]) > 0:
                        cluster.append({
                            'label': [
                                k['label'] for k in cluster
                                if k['value'] == near_value
                            ][0],
                            'value':
                            PhoneValueVector_Chinese.loc[j]['value'],
                            'dist':
                            dist_matrix[near_index][j]
                        })
                        value_list.append([
                            k['label'] for k in cluster
                            if k['value'] == near_value
                        ][0])
                        value_list.append(
                            PhoneValueVector_Chinese.loc[j]['value'])
                    else:
                        continue
        cluster_len2 = len(cluster)
    extra = [
        a for a in PhoneValueVector_Chinese['value'].tolist()
        if a not in list(set(value_list))
    ]
    print('clustered', len(list(set(value_list))), 'not yet clustered', len(extra),
          'total', len(PhoneValueVector_Chinese))
    cluster_df = pd.DataFrame(cluster)
    cluster_df = cluster_df.sort_values(by=['label', 'dist'])
    result = []
    center_top_df = cluster_df.loc[cluster_df['label'].isin(
        combine_df['value name'][:number_of_cluster])]
    for i in center_top_df['label'].drop_duplicates():
        center_top = center_top_df.loc[center_top_df['label'] == i]
        cluster_ = center_top['value'].tolist()[:top_num]
        result.append({
            'center':
            i,
            'cluster':
            cluster_,
            'combine':
            combine_df.loc[combine_df['value name'] == i]['combine'].tolist()
            [0]
        })
    result.append({'center': 0, 'cluster': extra, 'combine': 0})
    result_df = pd.DataFrame(result)
    result_df = result_df[['center', 'cluster', 'combine']]
    result_df = result_df.sort_values(by='combine',
                                      ascending=False).reset_index()
    #return result_df,cluster_df

    # top k of clusters
    cluster_size = [0]
    name_of_label = cluster_df['label'].drop_duplicates().tolist()
    cluster_df_top = pd.DataFrame()
    auto_cluster = []
    auto_remain = []
    for k in name_of_label:  #top_num
        auto_cluster.append(k)
        cluster_df_top = pd.concat([
            cluster_df_top, cluster_df.loc[cluster_df['label'] == k][:top_num]
        ])
        cluster_list = cluster_df.loc[cluster_df['label'] ==
                                      k]['value'][:top_num].tolist()
        cluster_remain_list = cluster_df.loc[cluster_df['label'] ==
                                             k]['value'][top_num:].tolist()
        for m in cluster_list:
            auto_cluster.append(m)
        for n in cluster_remain_list:
            auto_remain.append(n)
        cluster_size.append(len(auto_cluster))

    return result_df, cluster_df_top
Example #20
import numpy as np
from sklearn.neighbors import NearestNeighbors
import torch
from PIL import Image
import glob
import cv2
from torch.nn.functional import interpolate, softmax
from torchvision.utils import make_grid
from skimage.color import lab2rgb

ab_bins = np.load('pts_in_hull.npy')
nbrs = NearestNeighbors(n_neighbors=5, algorithm='kd_tree', p=2).fit(ab_bins)
ab_bins = torch.from_numpy(ab_bins).cuda().float()


def soft_encode_ab(raw_ab):

    raw_ab = raw_ab.numpy()

    # Flatten (C, A, H, W) array into (C*H*W, A) array

    nax = np.setdiff1d(np.arange(0, raw_ab.ndim), np.array((1)))
    axorder = np.concatenate((nax, np.array(1).flatten()), axis=0)

    flat_ab = raw_ab.transpose((axorder)).reshape((-1, 2))

    # Calculate encodings for each element

    distances, indices = nbrs.kneighbors(flat_ab)

    dist_w = np.exp(-distances**2 / (2 * 5**2))
Example #21
def affinity(X,
             algo='lle',
             n_neighbors=5,
             epsilon='auto',
             kernel='rbf',
             gamma=1,
             theta=1,
             lle_diag_fill=False,
             row_norm=True,
             n_jobs=-1):
    """
    Compute the affinity matrix.

    :param X:
    :param algo:
    :param n_neighbors:
    :param epsilon:
    :param kernel:
    :param gamma:
    :param lle_diag_fill:
    :param row_norm:
    :param n_jobs:
    :return:
    """
    algo = algo.lower()
    assert algo in ['lle', 'epsilon', 'knn', 'kernel']

    if algo == 'lle':
        # locally linear embedding's reconstruction matrix
        n_neighbors = X.shape[0]
        knn = NearestNeighbors(n_neighbors=n_neighbors + 1, n_jobs=n_jobs).fit(
            torch.cat([X, torch.zeros(1, X.shape[1])]))
        X = knn._fit_X
        n_samples = X.shape[0] - 1
        ind = knn.kneighbors(X, return_distance=False)[:-1, 1:]
        data = _barycenter_weights(X[:-1, :], X[ind])
        indptr = np.arange(0, n_samples * n_neighbors + 1, n_neighbors)
        af_mat = torch.from_numpy(
            csr_matrix((data.ravel(), ind.ravel(), indptr),
                       shape=(n_samples, n_samples)).todense()).float()
        if lle_diag_fill:
            af_mat += torch.eye(n_samples)
        return af_mat

    elif algo == 'knn':
        # k-nearest neighbors
        if n_neighbors < 0:
            n_neighbors = X.shape[0]
        knn = NearestNeighbors(n_neighbors=n_neighbors + 1, n_jobs=n_jobs).fit(
            torch.cat([X, torch.zeros(1, X.shape[1])]))
        X = knn._fit_X
        n_samples = X.shape[0] - 1
        ind = knn.kneighbors(X, return_distance=False)
        af_mat = torch.zeros(n_samples, n_samples)
        for i in range(n_samples):
            for j in range(min(n_samples + 1, ind.shape[1])):
                if ind[i, j] < n_samples:
                    if j < n_neighbors:
                        af_mat[i, ind[i, j]] = 1
                    else:
                        break
        # af_mat = torch.from_numpy(csr_matrix((data.ravel(), ind.ravel(), indptr),
        #                                      shape=(n_samples, n_samples)).todense()).float()
        return _row_norm(af_mat) if row_norm else af_mat

    elif algo == 'epsilon':
        # epsilon nearest neighbors
        n_samples = X.shape[0]
        knn = NearestNeighbors(n_neighbors=n_samples + 1, n_jobs=n_jobs).fit(
            torch.cat([X, torch.zeros(1, X.shape[1])]))
        X = knn._fit_X
        dist, ind = knn.kneighbors(X, return_distance=True)
        if isinstance(epsilon, str):
            assert epsilon in ['auto']
            epsilon = np.mean(dist) / 2
        af_mat = torch.zeros(n_samples, n_samples)
        for i in range(n_samples):
            for j in range(n_samples + 1):
                if ind[i, j] < n_samples and dist[i, j] <= epsilon:
                    af_mat[i, ind[i, j]] = 1
        return _row_norm(af_mat) if row_norm else af_mat

    elif algo == 'kernel':
        # heat kernel (rbf or laplacian)
        if isinstance(kernel, str):
            assert kernel in ['rbf', 'laplacian']
            if kernel == 'rbf':
                kernel = rbf_kernel
            elif kernel == 'laplacian':
                kernel = laplacian_kernel
        # else predefined kernel func
        af_mat = torch.from_numpy(kernel(X, gamma=gamma)).float() / theta
        if n_neighbors > 0:
            mask = affinity(X,
                            algo='knn',
                            n_neighbors=n_neighbors,
                            row_norm=False,
                            n_jobs=n_jobs)
            mask -= torch.eye(mask.shape[0])
            print(af_mat[0])
            print(mask[0])
            print()
            af_mat *= mask
        return _row_norm(af_mat) if row_norm else af_mat
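# A small usage sketch (ours, not from the source) for affinity(); it assumes
# the module-level imports used above (torch, NearestNeighbors, rbf_kernel,
# csr_matrix, _row_norm, _barycenter_weights) are available:
# X = torch.randn(100, 16)
# W_knn = affinity(X, algo='knn', n_neighbors=10)              # row-normalized 0/1 kNN graph
# W_rbf = affinity(X, algo='kernel', kernel='rbf', gamma=0.5)  # heat-kernel affinities masked by kNN
# W_eps = affinity(X, algo='epsilon', epsilon='auto', row_norm=False)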
Example #22
    def perform_type_prediction(self, df):

        def create_binary_type_vector(t_types, a_types):
            vector = np.zeros(len(all_types))
            i = [a_types.index(_) for _ in t_types]
            vector[i] = 1
            return vector

        def create_binary_type_prediction_vector(t_types, a_types):
            vector = np.zeros(len(all_types))
            i = [a_types.index(_) for _ in itertools.chain.from_iterable(t_types)]
            vector[i] += 1
            return vector

        # get the types. Mapping from the index of subject to the index of object
        type_info = ut.deserializer(path=self.p_folder, serialized_name='type_info')

        # get the index of objects / get type information =>>> s #type o
        all_types = sorted(set.union(*list(type_info.values())))


        # Consider only points with type infos.
        e_w_types = df.loc[list(type_info.keys())]

        neigh = NearestNeighbors(n_neighbors=101, algorithm='kd_tree', metric='euclidean', n_jobs=-1).fit(
            e_w_types)

        # Get similarity results for selected entities
        df_most_similars = pd.DataFrame(neigh.kneighbors(e_w_types, return_distance=False))

        # Reindex the target
        df_most_similars.index = e_w_types.index.values

        # sklearn's kneighbors returns each point itself as its most similar
        # point, so drop that first column
        df_most_similars.drop(columns=[0], inplace=True)


        # Map back to the original indexes, since KNN works on positional
        # indices and ignores the DataFrame index.
        mapper = dict(zip(list(range(len(e_w_types))), e_w_types.index.values))
        # The values of most similars are mapped to original vocabulary positions
        df_most_similars = df_most_similars.applymap(lambda x: mapper[x])


        k_values = [1, 3, 5, 10, 15, 30, 50, 100]

        print('K values:', k_values)
        for k in k_values:
            print('#####', k, '####')
            similarities = list()
            for _, S in df_most_similars.iterrows():
                true_types = type_info[_]
                type_predictions = [type_info[_] for _ in S.values[:k]]

                vector_true = create_binary_type_vector(true_types, all_types)
                vector_prediction = create_binary_type_prediction_vector(type_predictions, all_types)

                sim = cosine(vector_true, vector_prediction)
                similarities.append(1 - sim)

            report = pd.DataFrame(similarities)
            print('Mean type prediction', report.mean().values)
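        # Worked example (ours): with all_types = ['Actor', 'City', 'Person'],
        # create_binary_type_vector({'Person', 'Actor'}, all_types) gives
        # array([1., 0., 1.]); the cosine between two such vectors then measures
        # how well the neighbors' types match the true types.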
Example #23
import pickle
import random

from fuzzywuzzy import fuzz
from flask import Flask, request, render_template
from scipy.spatial.distance import cosine
from sklearn.neighbors import NearestNeighbors
from surprise import SVD

# load data
movie_user_mat_sparse = pickle.load(open('movie_user_mat_sparse', 'rb'))
movie_to_idx = pickle.load(open('movie_to_idx', 'rb'))
model = pickle.load(open('model_svd_100', 'rb'))
trainset = pickle.load(open('trainset', 'rb'))
mname = pickle.load(open('moviename', 'rb'))

# fit knn
model_knn = NearestNeighbors(metric='cosine',
                             algorithm='brute',
                             n_neighbors=50,
                             n_jobs=-1)
model_knn.fit(movie_user_mat_sparse)


def fuzzy_2(favs, m=mname):
    final = []
    for i in favs:
        lst = []
        for j in m:
            ratio = fuzz.ratio(i.lower(), j.lower())
            if ratio >= 70:
                lst.append([j, ratio])

        lst.sort(key=lambda x: x[1], reverse=True)
        final.append(lst[0][0])
    return final
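# A sketch (ours) of how the fitted model_knn is typically queried afterwards:
# map a matched title to its row in the sparse movie-user matrix and take its
# nearest neighbors as recommendations (the movie_to_idx lookup is assumed):
# idx = movie_to_idx[final[0]]
# distances, indices = model_knn.kneighbors(
#     movie_user_mat_sparse[idx], n_neighbors=11)
# recommended_rows = indices.flatten()[1:]   # drop the movie itself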
Example #24
    def _sample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : ndarray, shape (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new)
            The corresponding label of `X_resampled`

        idx_under : ndarray, shape (n_samples, )
            If `return_indices` is `True`, an array containing the indices of
            the samples that have been selected is also returned.

        """

        # Assign the parameter of the element of this class
        # Check that the version asked is implemented
        if self.version not in [1, 2, 3]:
            raise ValueError('Parameter `version` must be 1, 2 or 3, got'
                             ' {}'.format(self.version))

        # Start with the minority class
        X_min = X[y == self.min_c_]
        y_min = y[y == self.min_c_]

        # All the minority class samples will be preserved
        X_resampled = X_min.copy()
        y_resampled = y_min.copy()

        # Compute the number of samples to keep from each majority class
        if self.ratio == 'auto':
            num_samples = self.stats_c_[self.min_c_]
        else:
            num_samples = int(self.stats_c_[self.min_c_] / self.ratio)

        # If we need to offer support for the indices
        if self.return_indices:
            idx_under = np.flatnonzero(y == self.min_c_)

        # For each element of the current class, find the set of NN
        # of the minority class
        # Call the constructor of the NN
        nn_obj = NearestNeighbors(n_neighbors=self.size_ngh,
                                  n_jobs=self.n_jobs,
                                  **self.kwargs)

        # Fit on the minority class since we want to know the distances
        # to these points
        nn_obj.fit(X[y == self.min_c_])

        # Loop over the other classes and under-sample each of them
        for key in self.stats_c_.keys():

            # Skip the minority class itself
            if key == self.min_c_:
                continue

            # Get the samples corresponding to the current class
            sub_samples_x = X[y == key]
            sub_samples_y = y[y == key]

            if self.version == 1:
                # Find the NN
                dist_vec, idx_vec = nn_obj.kneighbors(
                    sub_samples_x, n_neighbors=self.size_ngh)

                # Select the right samples
                sel_x, sel_y, idx_tmp = self._selection_dist_based(
                    X, y, dist_vec, num_samples, key, sel_strategy='nearest')

            elif self.version == 2:
                # Find the NN
                dist_vec, idx_vec = nn_obj.kneighbors(
                    sub_samples_x, n_neighbors=self.stats_c_[self.min_c_])

                # Select the right samples
                sel_x, sel_y, idx_tmp = self._selection_dist_based(
                    X, y, dist_vec, num_samples, key, sel_strategy='nearest')

            elif self.version == 3:
                # We need a new NN object to fit the current class
                nn_obj_cc = NearestNeighbors(n_neighbors=self.ver3_samp_ngh,
                                             n_jobs=self.n_jobs,
                                             **self.kwargs)
                nn_obj_cc.fit(sub_samples_x)

                # Find the set of NN to the minority class
                dist_vec, idx_vec = nn_obj_cc.kneighbors(X_min)

                # Create the subset containing the samples found during the NN
                # search. Flatten the indices and remove duplicates
                idx_vec = np.unique(idx_vec.reshape(-1))

                # Create the subset
                sub_samples_x = sub_samples_x[idx_vec, :]
                sub_samples_y = sub_samples_y[idx_vec]

                # Compute the NN considering the current class
                dist_vec, idx_vec = nn_obj.kneighbors(
                    sub_samples_x, n_neighbors=self.size_ngh)

                sel_x, sel_y, idx_tmp = self._selection_dist_based(
                    sub_samples_x,
                    sub_samples_y,
                    dist_vec,
                    num_samples,
                    key,
                    sel_strategy='farthest')
            else:
                raise NotImplementedError

            # If we need to offer support for the indices selected
            if self.return_indices:
                idx_under = np.concatenate((idx_under, idx_tmp), axis=0)

            X_resampled = np.concatenate((X_resampled, sel_x), axis=0)
            y_resampled = np.concatenate((y_resampled, sel_y), axis=0)

        self.logger.info('Under-sampling performed: %s', Counter(y_resampled))

        # Check if the indices of the samples selected should be returned too
        if self.return_indices:
            # Return the indices of interest
            return X_resampled, y_resampled, idx_under
        else:
            return X_resampled, y_resampled
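        # Usage sketch (ours): this resampling logic matches the NearMiss
        # family of under-samplers, so an equivalent end-to-end call with
        # imbalanced-learn would look roughly like the following (exact API
        # depends on the imblearn version):
        # from imblearn.under_sampling import NearMiss
        # X_res, y_res = NearMiss(version=1, n_neighbors=3).fit_resample(X, y)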
Example #25
def nearest_neighbors(values, nbr_neighbors=15):
    nn = NearestNeighbors(n_neighbors=nbr_neighbors, metric='euclidean',
                          algorithm='brute').fit(values)
    dists, idxs = nn.kneighbors(values)
    return (dists, idxs)
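# For example (ours): dists, idxs = nearest_neighbors(embeddings) returns, for
# every row of `embeddings`, the distances and indices of its 15 nearest rows;
# the first column is typically the row itself at distance 0.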
Example #26
def knn_prob(X, y, k):
    """Returns a part of y=1 among the k nearest neighbors."""
    knn = NearestNeighbors(n_neighbors=k, n_jobs=24).fit(X)
    nbh = knn.kneighbors(return_distance=False)
    return y.reshape(-1)[nbh].mean(1)
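# A tiny usage sketch (ours) for knn_prob(): for each sample it returns the
# share of positive labels among its k nearest neighbors (the sample itself is
# excluded because kneighbors() is called without a query set).
# import numpy as np
# rng = np.random.default_rng(0)
# X_demo = rng.normal(size=(200, 3))
# y_demo = (X_demo[:, 0] > 0).astype(int)
# p = knn_prob(X_demo, y_demo, 5)   # values in [0, 1], shape (200,)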
    def fill_missvalue(
            self,
            dataset,
            value='mean',
            consider_cat=True,
            label_based=False):  # methods of dealing with missing value
        if value == 'linear_knn':  # use the k nearest neighbors to fit a linear regression that fills in the missing values
            catagorical_set = [
                3, 22, 24, 30, 31, 47, 52, 56, 66, 71, 74, 75, 79, 91, 107,
                110, 112, 113, 125
            ]
            for i in range(len(catagorical_set)):
                catagorical_set[i] -= 1

            missing_stat = stat.Missing_stat().numofmissing_ofrow(dataset)
            nonMissingValSet = []
            missingIndex = missing_stat[0]
            dataset, target, id = Pandas_dataProcess().label_extract(dataset,
                                                                     id=True)
            dataset = dataset.values.tolist()
            for i in missingIndex:
                nonMissingValSet.append(dataset[i])
            print('1')

            ########################################## KNN ##########################
            K = 10
            filledSet = []
            for record in dataset:
                nonMissTmp = copy(nonMissingValSet)
                missedIndex = []
                fullSetDic = {}
                for index in range(len(record)):
                    if record[index] == '':
                        missedIndex.append(index)
                for index in range(len(nonMissTmp)):
                    distance = 0.0
                    line = nonMissTmp[index]
                    for i in range(len(line)):
                        if i in missedIndex or i in catagorical_set:
                            continue
                        else:
                            distance += pow(
                                (float(record[i]) - float(line[i])), 2)
                    distance = sqrt(distance)
                    fullSetDic[distance] = []
                    fullSetDic[distance].extend(nonMissTmp[index])
                od = collections.OrderedDict(sorted(fullSetDic.items()))
                topK = []
                for i in range(K):
                    # pop from the front of the distance-sorted dict so topK
                    # really holds the K nearest complete records
                    key, value = od.popitem(last=False)
                    topK.append(value)
                print('2')

                ########################################## Linear regression ########################
                fillIn = []
                train_x = []
                for row in range(len(topK)):
                    train_x.append([])
                    for col in range(len(topK[0])):
                        if col in missedIndex:
                            continue
                        else:
                            train_x[-1].append(topK[row][col])
                test_x = []
                for i in range(len(record)):
                    if record[i] == '':
                        continue
                    else:
                        test_x.append(record[i])
                for i in range(len(record)):
                    if record[i] == '':
                        train_y = []
                        for row in range(len(topK)):
                            train_y.append(topK[row][i])
                        regr = linear_model.LinearRegression()
                        regr.fit(train_x, train_y)
                        predict_label = regr.predict([test_x])[0]
                        fillIn.append(predict_label)
                    else:
                        fillIn.append(record[i])
                filledSet.append(fillIn)
                print('3')
            return filledSet
        if value == 'nn':  # choose the nearest neighbor value to fill missing value
            copydataset = dataset
            copydataset = self.remove_categorical(copydataset)
            copydataset = self.fill_missvalue(copydataset, consider_cat=False)
            listset = array(copydataset.values.tolist())
            nbrs = NearestNeighbors(n_neighbors=2,
                                    algorithm='auto').fit(listset)
            distances, indices = nbrs.kneighbors(listset)
            print(indices)
            a = 0
            for (dataset_name, dataset_series) in dataset.iteritems():
                print(a)
                a += 1
                nan_num = len(dataset[dataset_series.isnull()])
                if nan_num > 0:
                    for index, values in dataset_series.iteritems():
                        if not dataset.ix[index, dataset_name] < inf:
                            dataset.ix[index, dataset_name] = dataset.ix[
                                indices[index][1], dataset_name]
            print(dataset)
            return dataset
        t = 1
        if label_based:
            dataset = dataset.sort(['target'], ascending=False)
        for (dataset_name, dataset_series) in dataset.iteritems():
            print(t)
            t += 1
            if value == 'mean':  # use mean value to fill the missing value
                nan_num = len(dataset[dataset_series.isnull()])
                if nan_num > 0:
                    if label_based:
                        if consider_cat:
                            if dataset_name in self.categorical_set:
                                for (values, times
                                     ) in dataset[dataset_name].value_counts(
                                     ).iteritems():
                                    mode = values
                                    break
                                dataset.loc[dataset_series.isnull(),
                                            dataset_name] = mode
                            else:
                                labelbased_mean = []
                                for (dataset_name2, dataset_series2
                                     ) in dataset.groupby('target'):
                                    labelbased_mean.append(
                                        dataset_series2[dataset_name].mean())
                                dataset[:87022].loc[
                                    dataset_series.isnull(),
                                    dataset_name] = labelbased_mean[1]
                                dataset[87022:].loc[
                                    dataset_series.isnull(),
                                    dataset_name] = labelbased_mean[0]
                        else:
                            dataset.loc[dataset_series.isnull(),
                                        dataset_name] = dataset_series.mean()
                    else:
                        if consider_cat:
                            if dataset_name in self.categorical_set:
                                for (values, times
                                     ) in dataset[dataset_name].value_counts(
                                     ).iteritems():
                                    mode = values
                                    break
                                dataset.loc[dataset_series.isnull(),
                                            dataset_name] = mode
                            else:
                                dataset.loc[
                                    dataset_series.isnull(),
                                    dataset_name] = dataset_series.mean()
                        else:
                            dataset.loc[dataset_series.isnull(),
                                        dataset_name] = dataset_series.mean()
            elif value == 'std':  # use standard deviation to fill the missing value
                nan_num = len(dataset[dataset_series.isnull()])
                if nan_num > 0:
                    if consider_cat:
                        if dataset_name in self.categorical_set:
                            for (values,
                                 times) in dataset[dataset_name].value_counts(
                                 ).iteritems():
                                mode = values
                                break
                            dataset.loc[dataset_series.isnull(),
                                        dataset_name] = mode
                        else:
                            dataset.loc[dataset_series.isnull(),
                                        dataset_name] = dataset_series.std()
                    else:
                        dataset.loc[dataset_series.isnull(),
                                    dataset_name] = dataset_series.std()
            elif value == 'mode':  # use the mode to fill the missing value
                nan_num = len(dataset[dataset_series.isnull()])
                if nan_num > 0:
                    for (values, times
                         ) in dataset[dataset_name].value_counts().iteritems():
                        mode = values
                        break
                    dataset.loc[dataset_series.isnull(), dataset_name] = mode
            else:
                nan_num = len(dataset[dataset_series.isnull()])
                if nan_num > 0:
                    dataset.loc[dataset_series.isnull(), dataset_name] = value
        return dataset
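        # A compact sketch (ours) of the 'linear_knn' idea implemented above:
        # for a record with a missing column, take its K nearest complete
        # records on the observed numeric columns and regress the missing
        # column on the observed ones.
        # from sklearn.linear_model import LinearRegression
        # from sklearn.neighbors import NearestNeighbors
        # def impute_linear_knn(row, complete_rows, miss_col, K=10):
        #     obs = [c for c in range(complete_rows.shape[1]) if c != miss_col]
        #     nn = NearestNeighbors(n_neighbors=K).fit(complete_rows[:, obs])
        #     _, idx = nn.kneighbors([row[obs]])
        #     nbr = complete_rows[idx[0]]
        #     reg = LinearRegression().fit(nbr[:, obs], nbr[:, miss_col])
        #     return reg.predict([row[obs]])[0]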
Example #28
x = x / 255.0
x = x.reshape(-1, 32, 32, 3)
x = x.reshape(-1, 3072)

pca = PCA(128)
pca.fit(x)
x_transformed = pca.transform(x)

n = randrange(60000)
print(n)
query = x_transformed[n]
label = y[n]
n_neigh = 6
x_transformed = x_transformed.reshape(-1, 128)
query = query.reshape(1, 128)
nbrs = NearestNeighbors(n_neighbors=n_neigh, n_jobs=-1).fit(x_transformed)
distances, indices = nbrs.kneighbors(np.array(query))
n_label_names = [label_names[y[i]] for i in indices]
closest_images = x[indices]
closest_images = closest_images.reshape(-1, 32, 32, 3)

plt.imshow(x[n].reshape(32, 32, 3))
plt.title(label_names[label])
plt.show()
plt.figure(figsize=(20, 6))
for i in range(1, n_neigh):
    # display original
    ax = plt.subplot(1, n_neigh, i + 1)
    ax.set_title(n_label_names[0][i])
    plt.imshow(closest_images[i].reshape(32, 32, 3))
    plt.gray()
    def create_detector(self):
        self.nearest_neighbor = NearestNeighbors(
            n_neighbors=1,
            metric='mahalanobis',
            metric_params={'V': self.cov, 'VI': np.linalg.inv(self.cov)},
            algorithm='brute')
vec, vocab, dim = read_vector_file(vec_file)
vocab_index = dict()
for i in range(len(vocab)):
    vocab_index[vocab[i]] = i
num_users = len(vocab)
print "num users in train sequences", num_users
# print "users removed from vocab", len(set(users_train)-set(vocab))
# print "users in test sequences but not in vocab", len(users_test-set(vocab))

# building kd-tree
tic = time.perf_counter()
# kd = KDTree(vec, leafsize=10)
neigh = NearestNeighbors(n_neighbors=5,
                         radius=1.0,
                         algorithm='ball_tree',
                         leaf_size=100,
                         metric='minkowski',
                         p=2)  #'ball_tree', 'kd_tree', 'auto'
neigh.fit(vec)
toc = time.perf_counter()
print("ball tree built in", (toc - tic) * 1000)


def get_candidate_set(query_set, next_adopters, N):
    try:
        query_set_ind = [vocab_index[query] for query in query_set]
    except KeyError:
        print "query word not present"
        return
    query_vec = [vec[i] for i in query_set_ind]
    # query using scipy kdtree