Example #1
File: nn.py Project: tianfudhe/ids
class NNScope:

    def get_minolab(self):
        tmp = pd.Series(self.y)
        tmp = tmp.value_counts()
        return min(tmp.keys(), key=lambda o: tmp[o])

    def normalization(self):
        self.X -= np.mean(self.X, axis=0)
        self.X /= np.sqrt(np.var(self.X, axis=0))

    def __init__(self, X, y, k):
        self.X = np.array(X, dtype='float64')
        self.normalization()
        self.y = y
        self.minolab = self.get_minolab()
        self.nn = NearestNeighbors(n_neighbors=k, n_jobs=-1)
        self.nn.fit(self.X)
        self.nn_maj = NearestNeighbors(n_neighbors=k, n_jobs=-1)
        self.nn_maj.fit(self.X[y != self.minolab])
        self.distr = None

    # ratio of each minority sample's distance to its k nearest neighbours overall
    # vs. its k nearest majority-class neighbours (WBNR)
    def calc_ratio(self):
        dis_all, _ = self.nn.kneighbors()
        dis_all = dis_all[self.y == self.minolab]
        dis_maj, _ = self.nn_maj.kneighbors(self.X[self.y == self.minolab])
        self.WBNR = np.sqrt(np.mean(dis_all ** 2, axis=1) /
                            np.mean(dis_maj ** 2, axis=1))

    def show_ratio_distr(self):
        plt.hist(self.WBNR, bins=20)
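
A minimal usage sketch for the class above (not part of the original project). It assumes the module-level imports the snippet omits (numpy as np, pandas as pd, matplotlib.pyplot as plt, and NearestNeighbors from sklearn.neighbors) and uses a synthetic imbalanced dataset:

import numpy as np
from sklearn.datasets import make_classification

# roughly 10% minority class
X, y = make_classification(n_samples=300, n_features=5, weights=[0.9, 0.1], random_state=0)
scope = NNScope(X, y, k=5)
scope.calc_ratio()
scope.show_ratio_distr()  # histogram of the per-minority-sample distance ratios (needs matplotlib)
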
Example #2
def k_nearest_neighbors_scores(k, eng_vec_dict, fr_vec_dict):
	eng_mat, fr_mat, index_map = build_parallel_mats_from_dicts(eng_vec_dict, fr_vec_dict, translation_dict)
	# k + 1 since we discard the top neighbor, which is itself
	neighbors_en = NearestNeighbors(n_neighbors=k+1, algorithm='ball_tree').fit(eng_mat)
	dist_en, indices_en = neighbors_en.kneighbors(eng_mat)
	neighbors_fr = NearestNeighbors(n_neighbors=k+1, algorithm='ball_tree').fit(fr_mat)
	dist_fr, indices_fr = neighbors_fr.kneighbors(fr_mat)
	# since we built the matrices in parallel, we know now that indices map to each other,
	# so we simply check the overlap of those to calculate precision and recall. 
	# calculate avg recall for k-recall
	avg_recall = 0.
	num_points = len(indices_en) + 0.
	knearest_map_en = dict()
	knearest_map_fr = dict()
	for i in range(0, int(num_points)):
		w_en = index_map[i][0]
		w_fr = index_map[i][1]
		index_set_en = set(indices_en[i][1:]) # should be size k
		index_set_fr = set(indices_fr[i][1:]) # should be size k
		if w_en not in knearest_map_en:
			knearest_map_en[w_en] = map(lambda z: index_map[z], index_set_en)
		if w_fr not in knearest_map_fr:
			knearest_map_fr[w_fr] = map(lambda z: index_map[z], index_set_fr)
		recall_count = sum(1 for i in index_set_fr if i in index_set_en)
		# precision = recall for this task
		recall = (recall_count + 0.)/len(index_set_en)
		avg_recall += recall
	return (avg_recall/num_points), knearest_map_en, knearest_map_fr
Example #3
def nearestN():
    X = [[125,1], [200,0], [70,0], [240,1], [114,0], [120,0], [264,1], [85,0], [150,0], [90,0]]
#    y = [ 0, 0, 0, 0, 1, 0, 0, 1, 0,1 ]
    model = NN(n_neighbors=1, radius=1)
    model.fit(X)
    y = [[98., 0.]]  # a single query sample must be passed as a 2-D array
    print model.kneighbors(y)
Example #4
def main():
    vectorizer = CountVectorizer(ngram_range=(1,2),max_df=1.0, min_df=0.0)

    nei = NearestNeighbors(algorithm='brute', metric='jaccard')
    matrix = vectorizer.fit_transform(training_set).todense()
    new_matrix = vectorizer.transform(new_comments).todense()
    nei.fit(matrix)
    path =  '{0}/'.format(pathsplit(abspath(__file__))[0])
    jsonfile = open(path + '{0}-nn.json'.format(n_neighbors), 'w')

    nodes = [{'name': (training_set+new_comments)[i],
              'group':(groups + new_groups)[i]}
             for i in range(len(training_set+new_comments))]
    links = []

    for i in range(len(matrix)):
        dist, idnei = nei.kneighbors(matrix[i], n_neighbors=n_neighbors + 1)
        dist, idnei = dist[0], idnei[0]

        for j in range(len(idnei[1:])):
            links.append({"source":i,"target":idnei[j+1],"value":10*(1 - dist[j+1])})

    for i in range(len(new_comments)):
        dist, idnei = nei.kneighbors(new_matrix[i], n_neighbors=n_neighbors + 1)
        dist, idnei = dist[0], idnei[0]
        for j in range(len(idnei[1:])):
            links.append({"source":len(matrix) + i,"target":idnei[j],"value":10*(1 - dist[j+1])})

    jsondumped = json.dumps({'nodes':nodes, 'links':links}, indent=2)

    jsonfile.write(jsondumped)
Example #5
def estimator_knn_cv(X, y, clf, n_neigh):
    neigh = NearestNeighbors(n_neigh, metric="euclidean", algorithm="brute")
    neigh_est = NearestNeighbors(n_neigh, metric="manhattan", algorithm="brute")
    acc = []
    for train, test in StratifiedKFold(y, 5):
        X_train = X[train]
        y_train = y[train]
        X_test = X[test]
        y_test = y[test]
        clf.fit(X_train, y_train)
        estimators = clf.estimators_
        preds_train = np.array(map(lambda e: e.predict(X_train), estimators)).T
        preds_test = np.array(map(lambda e: e.predict(X_test), estimators)).T
        preds_train_proba = np.array(map(lambda e: e.predict_proba(X_train), estimators))
        preds_test_proba = np.array(map(lambda e: e.predict_proba(X_test), estimators))
        p_train = preds_train_proba.swapaxes(0, 1)[:, :, 0]
        p_test = preds_test_proba.swapaxes(0, 1)[:, :, 0]
        neigh.fit(X_train)
        dist, knn = neigh.kneighbors(X_test)
        neigh_est.fit(preds_train)
        dist, knn_est = neigh_est.kneighbors(preds_test)
        # neigh_est.fit(p_train);dist, knn_est = neigh_est.kneighbors(p_test)
        knn_combined_uniq = np.array(map(np.unique, np.hstack((knn[:, :30], knn_est[:, :30]))))
        pp_uniq = np.array([stats.mode(y_train[nn])[0][0] for nn in knn_combined_uniq])
        # pp_uniq = np.array([stats.mode(y_train[nn])[0][0] for nn in knn[:,:30]])
        preds_test_est_knn = np.array(
            [[stats.mode(y_train[nn])[0][0] for nn in knn_est[:, :i]] for i in xrange(1, n_neigh, 2)]
        )
        acc.append(
            [accuracy_score(y_test, pred) for pred in np.vstack((preds_test_est_knn, clf.predict(X_test), pp_uniq))]
        )
    mean_acc = np.mean(acc, axis=0)
    print " ".join("{:.3f}".format(v) for v in mean_acc), " max:{:.3f}".format(mean_acc.max())
Example #6
    def resample(self):
        # Start with the minority class
        minx = self.x[self.y == self.minc]
        miny = self.y[self.y == self.minc]

        # Finding nns
        # Import the k-NN classifier
        from sklearn.neighbors import NearestNeighbors

        nearest_neighbour = NearestNeighbors(n_neighbors=self.k + 1)
        nearest_neighbour.fit(minx)
        nns = nearest_neighbour.kneighbors(minx, return_distance=False)[:, 1:]

        # Creating synthetic samples
        sx, sy = self.make_samples(
            minx, minx, self.minc, nns, int(self.ratio * len(miny)), random_state=self.rs, verbose=self.verbose
        )

        # Concatenate the newly generated samples to the original data set
        ret_x = concatenate((self.x, sx), axis=0)
        ret_y = concatenate((self.y, sy), axis=0)

        # Find the nearest neighbour of every point
        nn = NearestNeighbors(n_neighbors=2)
        nn.fit(ret_x)
        nns = nn.kneighbors(ret_x, return_distance=False)[:, 1]

        # Send the information to is_tomek function to get boolean vector back
        links = self.is_tomek(ret_y, nns, self.minc, self.verbose)

        if self.verbose:
            print("Over-sampling performed:" " " + str(Counter(ret_y[logical_not(links)])))

        # Return data set without majority Tomek links.
        return ret_x[logical_not(links)], ret_y[logical_not(links)]
Example #7
def estimate_dimension(X, n_neighbors='auto', neighbors_estimator=None):
    """Estimate intrinsic dimensionality.

    Based on "Manifold-Adaptive Dimension Estimation"
    Farahmand, Szepesvári, Audibert, ICML 2007.

    Parameters
    ----------
    X : nd-array, shape (n_samples, n_features)
        Input data.

    n_neighbors : int or auto, default='auto'
        Number of neighbors used for estimate.
        'auto' means ``np.floor(2 * np.log(n_samples))``.

    neighbors_estimator : NearestNeighbors object or None, default=None
        A pre-fitted neighbors object to speed up calculations.
    """
    if n_neighbors == 'auto':
        n_neighbors = np.floor(2 * np.log(X.shape[0])).astype("int")

    if neighbors_estimator is None:
        neighbors_estimator = NearestNeighbors(n_neighbors=n_neighbors)
        neighbors_estimator.fit(X)
    full_dist = neighbors_estimator.kneighbors(X, n_neighbors=n_neighbors)[0][:, -1]
    half_dist = neighbors_estimator.kneighbors(X, n_neighbors=n_neighbors // 2)[0][:, -1]
    est = np.log(2) / np.log(full_dist / half_dist)
    est = np.minimum(est, X.shape[1])
    return np.round(np.mean(est))
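
A quick sanity-check sketch (not from the original code): data lying on a 2-D subspace of a 5-D space should give an estimate close to 2. It assumes numpy and NearestNeighbors are imported as the function above requires.

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(1000, 2).dot(rng.randn(2, 5))  # points on a 2-D subspace embedded in 5-D
print(estimate_dimension(X))  # should come out near 2
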
Example #8
    def resample(self):
        # Start with the minority class
        minx = self.x[self.y == self.minc]
        miny = self.y[self.y == self.minc]

        # Finding nns
        # Import the k-NN classifier
        from sklearn.neighbors import NearestNeighbors

        nearest_neighbour = NearestNeighbors(n_neighbors=self.k + 1)
        nearest_neighbour.fit(minx)
        nns = nearest_neighbour.kneighbors(minx, return_distance=False)[:, 1:]

        # Creating synthetic samples
        sx, sy = self.make_samples(
            minx, minx, self.minc, nns, int(self.ratio * len(miny)), random_state=self.rs, verbose=self.verbose
        )

        # Concatenate the newly generated samples to the original data set
        ret_x = concatenate((self.x, sx), axis=0)
        ret_y = concatenate((self.y, sy), axis=0)

        # Create a k-NN to fit the whole data
        nn_obj = NearestNeighbors(n_neighbors=self.size_ngh)

        # Fit the whole dataset
        nn_obj.fit(ret_x)

        # Loop over the classes to apply the nearest-neighbour cleaning rule
        for key_idx, key in enumerate(self.ucd.keys()):

            # Get the sample of the current class
            sub_samples_x = ret_x[ret_y == key]
            sub_samples_y = ret_y[ret_y == key]

            # Find the NN for the current class
            nnhood_idx = nn_obj.kneighbors(sub_samples_x, return_distance=False)

            # Get the labels corresponding to those neighbour indices
            nnhood_label = ret_y[nnhood_idx] == key

            # Check which samples have the same label as the current class
            # by AND-ing across the k neighbours
            nnhood_bool = np.all(nnhood_label, axis=1)

            # Get the samples which agree all together
            sel_x = np.squeeze(sub_samples_x[np.nonzero(nnhood_bool), :])
            sel_y = sub_samples_y[np.nonzero(nnhood_bool)]

            if key_idx == 0:
                underx = sel_x[:, :]
                undery = sel_y[:]
            else:
                underx = concatenate((underx, sel_x), axis=0)
                undery = concatenate((undery, sel_y), axis=0)

        if self.verbose:
            print("Over-sampling performed: " + str(Counter(undery)))

        return underx, undery
Example #9
def eucl_distance(a, b):
    nbrs_a = NearestNeighbors(n_neighbors=1, algorithm='kd_tree').fit(a) if a.size > 0 else None
    nbrs_b = NearestNeighbors(n_neighbors=1, algorithm='kd_tree').fit(b) if b.size > 0 else None
    distances_a, _ = nbrs_a.kneighbors(b) if nbrs_a and b.size > 0 else ([np.inf], None)
    distances_b, _ = nbrs_b.kneighbors(a) if nbrs_b and a.size > 0 else ([np.inf], None)

    return [distances_a, distances_b]
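
A short usage sketch (not from the source, assuming the numpy and sklearn imports the snippet relies on): for two point sets it returns, for each point of one set, the distance to the nearest point of the other set, the usual ingredient of Chamfer-style distances.

import numpy as np

a = np.array([[0.0, 0.0], [1.0, 1.0]])
b = np.array([[0.0, 1.0], [2.0, 2.0]])
d_b_to_a, d_a_to_b = eucl_distance(a, b)  # distances from each b to nearest a, and from each a to nearest b
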
Example #10
 def on_pick(self, event):
     ind = event.ind[0]
     arty = event.artist
     for key in nld.layers.keys():
         layer = nld.layers[key]
         for plot in layer.plots:
             if plot is arty:
                 self.neighb_sec = key
                 break
     nbrs = NearestNeighbors(n_neighbors=50, n_jobs=1).fit(X)
     distances, indices = nbrs.kneighbors(X)
     #nbrs.fit(X)
     #W = barycenter_kneighbors_graph(
     #    nbrs, n_neighbors=50, reg=1e-3, n_jobs=1)
     #knn = kneighbors_graph(X, 10).to_array()
     try:
         self.scatters.remove()
         self.two_scatters.remove()
     except:
         pass
     self.points = indices[ind]
     neighb_layer = nld.get_layer(self.neighb_sec)
     self.points = X[self.points]
     # self.points = [neighb_layer.x_data[0][ind], neighb_layer.y_data[0][self.points], neighb_layer.z_data[0][self.points]]
     section_num = int(self.neighb_sec[-1])
     section_ax = self.fig.get_axes()[section_num + 1]
     section_layer = section_ax.get_layer(section_ax.title._text + ' proj')
     self.scatters = nld.scatter(self.points[:, 0], self.points[:, 1], self.points[:, 2], c='yellow', s=80)
     two_mat = np.column_stack((section_layer.x_data[0], section_layer.y_data[0]))
     two_nbrs = NearestNeighbors(n_neighbors=50, n_jobs=1).fit(two_mat)
     two_dists, two_inds = two_nbrs.kneighbors(two_mat)
     two_points = two_mat[two_inds[ind]] 
     self.two_scatters = section_ax.scatter(two_points[:, 0], two_points[:, 1], c='green', s=80)
Example #11
class KDTrees:

    def __init__(self, nb_neighbours, leaf_size):
        self.nbrs = NearestNeighbors(n_neighbors=nb_neighbours, algorithm='ball_tree', metric = 'haversine', leaf_size=leaf_size)
    # Compute distance in time between two points on the map
    def mapDistance(self, x, y):
        if (len(x) > 2):
            return np.sum((x - y) ** 2)
        else:
            if(x[0] < y[0]):
                tmp = y
                y = x
                x = tmp
            pos1 = str(x[0]) + ", " + str(x[1])
            pos2 = str(y[0]) + ", " + str(y[1])
            timestamp = datetime.now()
            sec_to_add = 32 * 3600 + (timestamp - datetime(1970, 1, 1)).total_seconds() - 2*3600 - timestamp.hour * 3600 - timestamp.minute * 60 - timestamp.second
            traject = gmaps.directions(pos1, pos2, mode="transit", departure_time=timestamp.fromtimestamp(sec_to_add))
            try:
                print('ok')
                return (traject[0]["legs"][0]["arrival_time"]["value"] - traject[0]["legs"][0]["departure_time"]["value"])
            except:
                print('bug')
                return 1000000000


    def addPoints(self, points):
        self.nbrs.fit(points)

    def getNeighbours(self, points):
        return self.nbrs.kneighbors(points)
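
A hypothetical usage sketch (not from the original project, assuming a numpy import). Note that scikit-learn's haversine metric expects [latitude, longitude] in radians:

import numpy as np

pts = np.radians([[48.8566, 2.3522], [51.5074, -0.1278], [40.7128, -74.0060]])
kdt = KDTrees(nb_neighbours=2, leaf_size=30)
kdt.addPoints(pts)
dist, idx = kdt.getNeighbours(np.radians([[45.7640, 4.8357]]))  # angular distances on the unit sphere
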
Example #12
def adasyn_sample(X,Y,minclass,K=5,n=200):
    indices = np.nonzero(Y==minclass)
    Ymin = Y[indices]
    Xmin = X[indices]
    Cmin = len(indices[0])
    Xs = []
    if n > Cmin:
        Xs.append(Xmin)
        n -= len(Ymin)
    else:
        # simple random without replacement undersampling
        return Xmin[random.sample(range(Cmin),n)]
    neigh = NearestNeighbors(n_neighbors=30)
    neigh.fit(X)
    nindices = neigh.kneighbors(Xmin,K,False)
    gamma = [float(sum(Y[i]==minclass))/K for i in nindices]
    gamma = gamma / np.linalg.norm(gamma,ord = 1)
    neigh = NearestNeighbors(n_neighbors=30)
    neigh.fit(Xmin)
    N = np.round(gamma*n).astype(int)
    assert len(N) == Cmin
    for (i,nn) in enumerate(N):
        nindices = neigh.kneighbors(Xmin[i],K,False)[0]
        for j in range(nn):
            alpha = random.random()
            Xnn = X[random.choice(nindices)]
            Xs.append((1.-alpha)*Xmin[i]+alpha*Xnn)
    Xadasyn = sparse.vstack(Xs)  
    return Xadasyn
Example #13
    def RunAllKnnScikit(q):
      totalTimer = Timer()

      # Load input dataset.
      # If the dataset contains two files then the second file is the query file 
      # In this case we add this to the command line.
      Log.Info("Loading dataset", self.verbose)
      if len(self.dataset) == 2:
        referenceData = np.genfromtxt(self.dataset[0], delimiter=',')
        queryData = np.genfromtxt(self.dataset[1], delimiter=',')
      else:
        referenceData = np.genfromtxt(self.dataset, delimiter=',')

      with totalTimer:
        # Get all the parameters.
        k = re.search("-k (\d+)", options)
        leafSize = re.search("-l (\d+)", options)

        if not k:
          Log.Fatal("Required option: Number of furthest neighbors to find.")
          q.put(-1)
          return -1
        else:
          k = int(k.group(1))
          if (k < 1 or k > referenceData.shape[0]):
            Log.Fatal("Invalid k: " + k.group(1) + "; must be greater than 0"
              + " and less or equal than " + str(referenceData.shape[0]))
            q.put(-1)
            return -1

        if not leafSize:
          l = 20
        elif int(leafSize.group(1)) < 0:
          Log.Fatal("Invalid leaf size: " + str(leafSize.group(1)) + ". Must" +
              " be greater than or equal to 0.")
          q.put(-1)
          return -1
        else:
          l = int(leafSize.group(1))
  
        try:
          # Perform All K-Nearest-Neighbors.
          model = NearestNeighbors(n_neighbors=k, algorithm='kd_tree', leaf_size=l)
          model.fit(referenceData)

          if len(self.dataset) == 2:
            out = model.kneighbors(queryData, k, return_distance=True)
          else:
            # We have to increment k by one because mlpack ignores the
            # self-neighbor, whereas scikit-learn will happily return the
            # nearest neighbor of point 0 as point 0.
            out = model.kneighbors(referenceData, k + 1, return_distance=True)
        except Exception as e:
          q.put(-1)
          return -1

      time = totalTimer.ElapsedTime()
      q.put(time)
      return time
Example #14
def findKNN(frequencyVector,newVector):
    samples = np.array(frequencyVector)
    neigh = NearestNeighbors(n_neighbors=5, metric="euclidean")
    neigh.fit(samples)
    indexList = neigh.kneighbors(newVector,return_distance=False).tolist()
    a=neigh.kneighbors(newVector)
    print a
    return indexList
Example #15
class KNearestNeighbours(MLClassifierBase):
    """k Nearest Neighbours multi-label classifier."""
    BRIEFNAME = "MLkNN"

    def __init__(self, k = 10, s = 1.0):
        super(KNearestNeighbours, self).__init__(None)
        self.k = k # Number of neighbours
        self.s = s # Smooth parameter

    def compute_prior(self, y):
        prior_prob_true = []
        prior_prob_false = []
        for label in xrange(self.num_labels):
            prior_prob_true.append(float(self.s + sum(instance[label] == 1 for instance in y)) / (self.s * 2 + self.num_instances))
            prior_prob_false.append(1 - prior_prob_true[-1])
        return prior_prob_true, prior_prob_false

    def compute_cond(self, X, y):
        self.knn = NearestNeighbors(self.k).fit(X)
        c = [[0] * (self.k + 1) for label in xrange(self.num_labels)]
        cn = [[0] * (self.k + 1) for label in xrange(self.num_labels)]
        for instance in xrange(self.num_instances):
            neighbors = self.knn.kneighbors(X[instance], self.k, return_distance=False)
            for label in xrange(self.num_labels):
                delta = sum(y[neighbor][label] for neighbor in neighbors[0])
                (c if y[instance][label] == 1 else cn)[label][delta] += 1

        cond_prob_true = [[0] * (self.k + 1) for label in xrange(self.num_labels)]
        cond_prob_false = [[0] * (self.k + 1) for label in xrange(self.num_labels)]
        for label in xrange(self.num_labels):
            for neighbor in xrange(self.k + 1):
                cond_prob_true[label][neighbor] = (self.s + c[label][neighbor]) / (self.s * (self.k + 1) + sum(c[label]))
                cond_prob_false[label][neighbor] = (self.s + cn[label][neighbor]) / (self.s * (self.k + 1) + sum(cn[label]))
        return cond_prob_true, cond_prob_false

    def fit(self, X, y):
        self.predictions = y;
        self.num_instances = len(y)
        self.num_labels = len(y[0])
        # Computing the prior probabilities
        self.prior_prob_true, self.prior_prob_false = self.compute_prior(y)
        # Computing the posterior probabilities
        self.cond_prob_true, self.cond_prob_false = self.compute_cond(X, y)
        return self

    def predict(self, X):
        result = np.zeros((len(X), self.num_labels), dtype='i8')
        for instance in xrange(len(X)):
            neighbors = self.knn.kneighbors(X[instance], self.k, return_distance=False)
            for label in xrange(self.num_labels):
                delta = sum(self.predictions[neighbor][label] for neighbor in neighbors[0])
                p_true = self.prior_prob_true[label] * self.cond_prob_true[label][delta]
                p_false = self.prior_prob_false[label] * self.cond_prob_false[label][delta]
                prediction = (p_true >= p_false)
                result[instance][label] = int(prediction)
        return result
Example #16
    def sample(s):
        if s.data is None:
            raise ValueError('data not loaded.')
        mdl = NearestNeighbors(n_neighbors=s.k1, n_jobs=-1)
        mdl.fit(s.X)
        _, nei_table = mdl.kneighbors()
        # indices of the minority points that have at least one minority neighbor
        noise_mino_idx = filter(lambda o: sum(s.y[nei_table[o]] == s.minolab) != 0 and s.y[o] == s.minolab,
                                range(s.X.shape[0]))
        minoX = s.X[s.y == s.minolab]
        majX = s.X[s.y == s.majlab]

        mdl_maj = NearestNeighbors(n_neighbors=s.k2, n_jobs=-1)
        mdl_maj.fit(majX)
        # all majority examples on the boundary
        _, tmp = mdl_maj.kneighbors(s.X[noise_mino_idx])
        # remove duplicate examples
        bound_maj_idx = np.unique(np.reshape(tmp, (1, -1))[0])

        mdl_mino = NearestNeighbors(n_neighbors=s.k3, n_jobs=-1)
        mdl_mino.fit(minoX)
        # working backwards, find the minority examples on the boundary
        _, tmp = mdl_mino.kneighbors(majX[bound_maj_idx])
        bound_mino_idx = np.unique(np.reshape(tmp, (1, -1))[0])

        bound_maj = majX[bound_maj_idx]
        bound_mino = minoX[bound_mino_idx]

        # difference matrix, shape = (majN, minoN).
        # Due to broadcasting (stretching), diff[i][j][k] is maj[i][k]-mino[j][k],
        # so the vector diff[i][j] = maj[i]-mino[j] holds the pairwise differences.
        diff = bound_maj[:, None, :] - bound_mino
        Cf = lambda o: min(s.X.shape[1] / np.linalg.norm(o, 2), s.Cfth) * 1.0 / s.Cfth
        CM = np.apply_along_axis(Cf, 2, diff)

        W = np.mean(((CM * CM).T / np.sum(CM, axis=1)).T, axis=0)

        # P is the normalized weight vector: the probability of each boundary minority example being chosen for synthesis
        P = W / np.sum(W)

        # np.save(open('W-{0}.ndarray'.format(s.mdl_args["gamma"]), 'w'), CM)

        # choose N boundary minority examples to synthesize from, with selection probability according to their weights
        chosen = np.random.choice(range(len(P)), size=s.N, p=P)
        chosenp = bound_mino[chosen]

        # The clustering step of MWMOTE is not implemented here; it appears to add only computation time, not benefit.
        _, nei = mdl_mino.kneighbors(chosenp, s.k1)
        dualp = minoX[[i[int(np.random.rand() * s.k1)] for i in nei]]

        generated = chosenp + np.random.rand(s.N, 1) * (dualp - chosenp)
        ret = np.hstack((np.vstack((minoX, generated, majX)),
                         np.array([s.minolab] * (minoX.shape[0] + s.N) + [s.majlab] * majX.shape[0])[:, None]))
        np.random.shuffle(ret)
        
        return ret
Example #17
  def get_station_nearest_neighbors_list(self, station, nps, n):
    """
    Returns the n nearest neighbor stations to the given station, chosen from the
    stations in the data frame "df".

    Args:
      station <string>: The station code for which nearest neighbors are needed.
      nps <int>: Number of previous stations to choose stations having nps model.
      n <int>: Number of nearest neighbors needed.
    """
    # Choose the stations who have the respective nps models.
    # If the unknown station occurs as 3rd station in the complete journey, then
    # the nearest known station should have a 3 previous station model and so on.
    stns_hvng_nps_mdls = self._pdr.get_stations_having_nps_model_list(nps)

    # Get the station features data frame for known stations having nps models
    df = self._pdr.get_known_596_stations_features_df()
    df = df[df.Station.isin(stns_hvng_nps_mdls)]

    query_stn_feature = [[self._stn_geo_crdnates[station][0],
                         self._stn_geo_crdnates[station][1],
                         self._stn_deg_strength[station],
                         self._stn_tfc_strength[station]]]
    # First choose neighbors which are geographically closer
    lat_lon_df = df[["Latitude", "Longitude"]]

    lat_lon_query_stn_ftr = [[self._stn_geo_crdnates[station][0],
                              self._stn_geo_crdnates[station][1]]]
    ll_nbrs = NN(n_neighbors=n, algorithm="auto").fit(lat_lon_df)
    # ll_indices are directly indexed corresponding to stns_hvng_nps_mdls
    ll_distances, ll_indices = ll_nbrs.kneighbors(lat_lon_query_stn_ftr)

    # Subselect the chosen stations features from the complete station
    # features df.
    selected_station_fts_df = self._get_selected_stations_df(ll_indices[0], df)

    # Then choose neighbors based on degree and traffic strength among the
    # above chosen geographically closer stations.
    deg_tfc_df = selected_station_fts_df[["Degree_Strength", "Traffic_Strength"]]
    deg_tfc_query_stn_ftr = [[self._stn_deg_strength[station],
                              self._stn_tfc_strength[station]]]
    dt_nbrs = NN(n_neighbors=n, algorithm="auto").fit(deg_tfc_df)
    # dt_indices are indexed with 0, so not directly related to
    # stns_hvng_nps_mdls
    dt_distances, dt_indices = dt_nbrs.kneighbors(deg_tfc_query_stn_ftr)

    # Once the dt_indices are obtained where the stations are arranged as per
    # increasing distance of degree and traffic strength features, get the
    # station codes from the df at those indices (since the dt_indices are
    # indexed from 0 onwards with respect to the ll_indices, hence the following
    # code). Also the ll_indices are with respect to the df.
    final_nearest_neighbors_stns_list = [df.iloc[ll_indices[0][idx]].Station
        for idx in dt_indices[0]]
    return final_nearest_neighbors_stns_list
Example #18
def main():
    if len(sys.argv) != 3:
        sys.stderr.write('Error: wrong number of arguments.\n')
        sys.stderr.write(
            'Usage: %s <corpus path> <model path>\n' % (sys.argv[0],))
        return 1
    logging.basicConfig(level=logging.INFO, format=LOGGING_FORMAT)
    text = get_soap_data(sys.argv[1])
    embedding = _get_w2v_embedding(sys.argv[2])
    data = pandas.DataFrame()
    data["Transcript"] = text[0:200000]

    data["Transcript"] = data["Transcript"].str.lower()
    data["index_value"] = data.index
    vals = data["Transcript"].values

    logger.info('Averaging')
    at = Timer()
    vector_rep = [average_vector(v, embedding) for v in vals]
    logger.info('Averaging took %s s', at.elapsed())
    # logger.info('Reassembling')
    # vector_rep = reduce(lambda a, b: a + b, vector_reps)
    # vector_rep = [average_vector(s, embedding) for s in vals]
    # logger.info('Saving vector...')
    # quick_save("big_ver", vector_rep)

    logger.info('Nearest neighbors fit')
    nnt = Timer()
    neighbors = NearestNeighbors(
        n_neighbors=10, metric="euclidean", algorithm='ball_tree')
    neighbors.fit(vector_rep)
    logger.info('Fitting took %s s', nnt.elapsed())

    threshold = .6  # Of the top N, take the longest response

    for i in range(5):
        t = Timer()
        embedded = average_vector(
            'how many women have you slept with', embedding)
        distance, indices = neighbors.kneighbors([embedded])
        print 'Query time: %s s' % (t.elapsed(),)

    while True:
        sentence = raw_input("Enter some text:\n")
        sentence = sentence.lower()
        embedded = average_vector(sentence, embedding)
        distance, indices = neighbors.kneighbors([embedded])
        for best in indices[0][0:5]:
            # Get the correct location
            best_match_index = data.iloc[best].index_value
            print 'Best match: %s' % (
                data['Transcript'][best_match_index],)
            print 'Response1:   %s' % (
                data['Transcript'][best_match_index + 1],)
Example #19
def match(GA_orig, GB_orig, order=3, max_depth=10, complexity=4):
    if len(GA_orig) > len(GB_orig):
        GA, GB = GB_orig.copy(), GA_orig.copy()
        logging.warning('Warning: reference graph is B not A')
    else:
        GA, GB = GA_orig.copy(), GB_orig.copy()
    # logging.warning('Matching graph A (%d nodes) to graph B (%d nodes)' % (len(GA_orig), len(GB_orig)))

    GA, GB = make_same_size(GA, GB)

    M = vertex_vectorize([GA, GB], complexity=complexity, normalization=True, inner_normalization=True)
    MA, MB = M[0], M[1]

    nnA = NearestNeighbors(n_neighbors=len(GA)).fit(MA)
    d, BprefA = nnA.kneighbors(MB)

    nnB = NearestNeighbors(n_neighbors=len(GB)).fit(MB)
    d, AprefB = nnB.kneighbors(MA)

    # mark bfv in vec attribute
    GA, GB = init_vec(GA), init_vec(GB)
    for k in range(order):
        ds = d[:, 0]
        id_max_A = np.argsort(ds)[k]
        id_max_B = AprefB[id_max_A][0]

        GA = annotate_with_bfs(GA, id_max_A, max_depth=max_depth)
        GB = annotate_with_bfs(GB, id_max_B, max_depth=max_depth)
    # draw_graph_set([GA,GB],n_graphs_per_line=2, size=9, secondary_vertex_label='vec')

    # vectorize 2nd time with real values this time
    M = vertex_vectorize([GA, GB], complexity=complexity, discrete=False, normalization=False, inner_normalization=False)
    MA, MB = M[0], M[1]

    nnA = NearestNeighbors(n_neighbors=len(GA)).fit(MA)
    d, BprefA = nnA.kneighbors(MB)

    nnB = NearestNeighbors(n_neighbors=len(GB)).fit(MB)
    d, AprefB = nnB.kneighbors(MA)

    A = ['A%d' % (i + 1) for i in range(len(GA))]
    B = ['B%d' % (i + 1) for i in range(len(GB))]

    Arankings = dict(((A[i], j + 1), B[AprefB[i, j]]) for i, j in product(range(len(GA)), range(len(GA))))
    Brankings = dict(((B[i], j + 1), A[BprefA[i, j]]) for i, j in product(range(len(GB)), range(len(GB))))

    rankings = Arankings
    rankings.update(Brankings)
    pairings = stable(rankings, A, B)

    # remove dummy node pairings
    npairings = trim_pairings(pairings, GA_orig, GB_orig)
    orderA, orderB = list(zip(*sorted(npairings)))
    return orderB
Example #20
class KNearestNeighbours(ClassifierMixin):
    '''ML-KNN'''

    def __init__(self, k = 10, s = 1.0):
        super(KNearestNeighbours, self).__init__()
        self.k = k
        self.s = s

    def compute_prior(self, y):
        prior_prob_true = []
        prior_prob_false = []
        for label in xrange(self.num_labels):
            prior_prob_true.append(float(self.s + sum(instance[label] == 1 for instance in y)) / (self.s * 2 + self.num_instances))
            prior_prob_false.append(1 - prior_prob_true[-1])
        return prior_prob_true, prior_prob_false

    def compute_cond(self, X, y):
        self.knn = NearestNeighbors(self.k).fit(X)
        c = [[0] * (self.k + 1) for label in xrange(self.num_labels)]
        cn = [[0] * (self.k + 1) for label in xrange(self.num_labels)]
        for instance in xrange(self.num_instances):
            neighbors = self.knn.kneighbors(X[instance], self.k, return_distance=False)
            for label in xrange(self.num_labels):
                delta = sum(y[neighbor][label] for neighbor in neighbors[0])
                (c if y[instance][label] == 1 else cn)[label][delta] += 1

        cond_prob_true = [[0] * (self.k + 1) for label in xrange(self.num_labels)]
        cond_prob_false = [[0] * (self.k + 1) for label in xrange(self.num_labels)]
        for label in xrange(self.num_labels):
            for neighbor in xrange(self.k + 1):
                cond_prob_true[label][neighbor] = (self.s + c[label][neighbor]) / (self.s * (self.k + 1) + sum(c[label]))
                cond_prob_false[label][neighbor] = (self.s + cn[label][neighbor]) / (self.s * (self.k + 1) + sum(cn[label]))
        return cond_prob_true, cond_prob_false

    def fit(self, X, y):
        self.predictions = y;
        self.num_instances = len(y)
        self.num_labels = len(y[0])
        self.prior_prob_true, self.prior_prob_false = self.compute_prior(y)
        self.cond_prob_true, self.cond_prob_false = self.compute_cond(X, y)
        return self

    def predict(self, X):
        result = np.zeros((len(X), self.num_labels), dtype='i8')
        for instance in xrange(len(X)):
            neighbors = self.knn.kneighbors(X[instance], self.k, return_distance=False)
            for label in xrange(self.num_labels):
                delta = sum(self.predictions[neighbor][label] for neighbor in neighbors[0])
                p_true = self.prior_prob_true[label] * self.cond_prob_true[label][delta]
                p_false = self.prior_prob_false[label] * self.cond_prob_false[label][delta]
                prediction = (p_true >= p_false)
                result[instance][label] = int(prediction)
        return result
Example #21
def pointwise_test(data, significance=0.05, standardize=False, plot=False):
    if standardize:
        data = standardize_mvn(data)
    n, p = data.shape
    k1, k2 = get_nbh_sizes(n, p)

    ## Step I: finding candidate modes
    nn = NearestNeighbors(k1, metric='euclidean').fit(data)
    possible_candidates = np.ones(n, dtype=bool)
    candidates = []
    while np.sum(possible_candidates) > 0:
        distances, indices = nn.kneighbors(data[possible_candidates])
        ind_new = np.argmin(distances[:, -1])
        new_candidate = np.arange(n)[possible_candidates][ind_new]
        candidates.append(new_candidate)
        possible_candidates[indices[ind_new, :k2]] = False

    ## Step II: Thin out candidates
    non_modes = []
    for i in candidates:
        mu = data[i, :]
        _, ind = nn.kneighbors(mu)
        ind = ind.ravel()
        X = data[ind[:k2], :]
        if hotelling_pval(X, mu) < 0.01:
            non_modes.append(i)
    modes = [i for i in candidates if not i in non_modes]

    ## Step III: SB-plot
    K = len(modes)
    in_other_modal_region = []
    for i in range(K):
        if i in in_other_modal_region:
            continue
        for j in range(i+1, K):
            if j in in_other_modal_region:
                continue
            x = data[modes[i], :]
            y = data[modes[j], :]
            alpha = np.linspace(0, 1, 200).reshape(-1, 1)
            x_alpha = alpha*x + (1-alpha)*y
            dist_k1nn, _ = nn.kneighbors(x_alpha)
            d_k1nn = dist_k1nn[:, -1]
            SB_alpha = p*(np.log(d_k1nn) - np.log(max(d_k1nn[0], d_k1nn[-1])))
            if (SB_alpha >= np.sqrt(2./k1)*norm.ppf(1-significance)).any() and plot:
                plt.plot(alpha, SB_alpha*np.sqrt(k1*1./2))
            else:
                in_other_modal_region.append(j)
    modal_regions = [mode for j, mode in enumerate(modes) if not j in in_other_modal_region]
    if len(modal_regions) > 1:
        return True
    return False
Example #22
    def fit(self, XMeta, YMeta, YCaMeta, folder = "data/dataForMeta/"): #X ... features, y... trueValue, yC ... values predicted by classifier
        self.nrOfClassifiers = YCaMeta.shape[1]
        wholeTime, timeForRegion = 0,0
        start = time.time()
        if self.printing: print("Starting to fit MetaDes")
        metaFeatures = []
        metaResponse = []
        nearestNeigbourRegion = NearestNeighbors(n_neighbors=self.K, metric=self.metric)
        nearestNeigbourRegion.fit(XMeta)
        nearestNeigbourOutputRegion = NearestNeighbors(n_neighbors=self.Kp, metric=self.metric)
        nearestNeigbourOutputRegion.fit(np.round(YCaMeta))
        with open(folder+"MetaFeatures_K"+str(self.K)+"_Kp"+str(self.Kp)+".csv", "w") as fMetaFeatures:
            # we use this because the output can get very big, so we save it to the file incrementally
            for i, x in enumerate(XMeta):
            # for i in range(2000):
                x = XMeta[i]
                if(i%1000 == 0): print("Training examples covered: %d/%d" %(i, len(XMeta)))
                doc = DOC(np.round(YCaMeta[i]), mode=1)  # degree of consensus; maybe rethink how to determine this differently
                if(doc <= self.hC): # we keep instances where the classifiers' consensus is below the threshold
                    reg, opReg = {},{}
                    start2 = time.time()
                    # idxsReg = findRegion(XMeta, x, self.K, method='normalRegion')
                    idxsReg = nearestNeigbourRegion.kneighbors(x, n_neighbors=self.K+1, return_distance=False)[0,1:]
                    timeForRegion+= time.time() - start2
                    reg["X"], reg["Y"] = XMeta[idxsReg], YMeta[idxsReg]

                    start2 = time.time()
                    #idxsOP = findRegion(np.round(YCaMeta), np.round(YCaMeta[i]), self.Kp, method='outputProfileRegion')
                    idxsOP = nearestNeigbourOutputRegion.kneighbors(np.round(YCaMeta[i]), n_neighbors=self.Kp + 1, return_distance=False)[0,1:]
                    timeForRegion += time.time() - start2

                    opReg["X"], opReg["Y"] = XMeta[idxsOP], YMeta[idxsOP]
                    for j, cls in enumerate(YCaMeta[i]):
                        reg["YC"] = YCaMeta[idxsReg][:,j] #vzamemo vse response j-tega classifierja v okolici x
                        opReg["YC"] = YCaMeta[idxsOP][:,j]
                        f = computeMetaFeatures(reg, opReg)
                        metaFeatures.append(list(f))
                        res = 1 if int(np.round(cls)) == int(np.round(YMeta[i])) else 0
                        metaResponse.append(res)
                        [(fMetaFeatures.write(str(feat)), fMetaFeatures.write(",") if i != len(f)-1 else None) for i, feat in enumerate(f)]
                        fMetaFeatures.write("\n")
        metaResponse = np.array(metaResponse)
        np.savetxt(folder+"MetaResponse_K"+str(self.K)+"_Kp"+str(self.Kp)+".csv", metaResponse, delimiter="\n")
        metaFeatures = np.array(metaFeatures)

        print("Fitting meta cls...")
        self.fitMetaCls(metaFeatures, metaResponse)
        print("Done!")

        wholeTime = time.time()-start
        print("For training metaDes we needed %d time for finding region out of %d \n "
              "so we spent %.3f for region seeking" %(timeForRegion, wholeTime, timeForRegion/wholeTime))
Example #23
def embedding_refinement(data_matrix_highdim,
                         data_matrix_lowdim,
                         n_neighbors=8,
                         emb_quality_th=1,
                         n_iter=20):
    # extract neighbors list for high dimensional case
    neigh_high = NearestNeighbors(n_neighbors=n_neighbors)
    neigh_high.fit(data_matrix_highdim)
    neighbors_list_highdim = neigh_high.kneighbors(data_matrix_highdim, return_distance=0)
    n_instances = data_matrix_lowdim.shape[0]
    logger.debug('refinements max num iters: %d  k in neqs: %d num insts: %d' %
                 (n_iter, n_neighbors, n_instances))
    for it in range(n_iter):
        average_embedding_quality_score, scores = knn_quality_score(data_matrix_lowdim,
                                                                    neighbors_list_highdim,
                                                                    n_neighbors)
        # select low quality embedded instances
        ids = [i for i, s in enumerate(scores)
               if relative_quality(i, scores, neighbors_list_highdim) <= emb_quality_th]
        # find average position of true knns and move point there
        new_data_matrix_lowdim = compute_average(ids, data_matrix_lowdim, neighbors_list_highdim)
        new_average_embedding_quality_score, new_scores = knn_quality_score(new_data_matrix_lowdim,
                                                                            neighbors_list_highdim,
                                                                            n_neighbors)
        if new_average_embedding_quality_score > average_embedding_quality_score:
            data_matrix_lowdim = new_data_matrix_lowdim
            n_refinements = len(ids)
            frac_refinements = float(n_refinements) / n_instances
            logger.debug('r %.2d neqs: %.3f \t %.2f (%d insts)' %
                         (it + 1, new_average_embedding_quality_score,
                          frac_refinements, n_refinements))
        else:
            break
    return data_matrix_lowdim
Example #24
    def resample(self):
        # Start with the minority class
        minx = self.x[self.y == self.minc]
        miny = self.y[self.y == self.minc]

        # Finding nns
        from sklearn.neighbors import NearestNeighbors

        print("Finding the %i nearest neighbours..." % self.k, end = "")

        NN = NearestNeighbors(n_neighbors = self.k + 1)
        NN.fit(minx)
        nns = NN.kneighbors(minx, return_distance=False)[:, 1:]

        print("done!")

        # Creating synthetic samples
        print("Creating synthetic samples...", end="")
        sx, sy = make_samples(minx, minx, self.minc, nns, int(self.ratio * len(miny)), random_state=self.rs)
        print("done!")

        # Concatenate the newly generated samples to the original data set
        ret_x = concatenate((self.x, sx), axis = 0)
        ret_y = concatenate((self.y, sy), axis = 0)

        return ret_x, ret_y
Example #25
def find_k_neighbors(points, neighbor_number=5):
    from sklearn.neighbors import NearestNeighbors
    import numpy as np
    X = np.array(points)
    neighbors = NearestNeighbors(n_neighbors=neighbor_number + 1, algorithm='ball_tree').fit(X)
    distances, indices = neighbors.kneighbors(X)
    return [[str(point), [str(x) for x in indices[point][1:]]] for point in range(len(points))]
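
A small usage sketch (not in the original): each returned entry pairs a point index with the indices of its nearest neighbours, all as strings.

points = [[0, 0], [0, 1], [1, 0], [5, 5], [5, 6], [6, 5]]
print(find_k_neighbors(points, neighbor_number=2))  # e.g. point 0 is paired with points 1 and 2
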
Example #26
    def resample(self):
        from sklearn.neighbors import NearestNeighbors

        # Start with the minority class
        minx = self.x[self.y == self.minc]
        miny = self.y[self.y == self.minc]

        # Find the NNs for all samples in the data set.
        print("Finding the %i nearest neighbours..." % self.m, end = "")
        NN = NearestNeighbors(n_neighbors = self.m + 1)
        NN.fit(self.x)

        print("done!")

        # Boolean array with True for minority samples in danger
        index = asarray([in_danger(x, self.y, self.m, miny[0], NN) for x in minx])

        # If all minority samples are safe, return the original data set.
        if not any(index):
            print('There are no samples in danger. No borderline synthetic samples created.')
            return self.x, self.y

        # Find the NNs among the minority class
        NN.set_params(**{'n_neighbors' : self.k + 1})
        NN.fit(minx)
        nns = NN.kneighbors(minx[index], return_distance=False)[:, 1:]

        # Create synthetic samples for borderline points.
        sx, sy = make_samples(minx[index], minx, miny[0], nns, int(self.ratio * len(miny)), random_state=self.rs)

        # Concatenate the newly generated samples to the original data set
        ret_x = concatenate((self.x, sx), axis = 0)
        ret_y = concatenate((self.y, sy), axis = 0)

        return ret_x, ret_y
Example #27
class KNNmodel():
    def __init__(self):
        self.knnModel = None

    def train(self, userFeatureTable, ratingsMat):
        userFeatureTable.loc[:, "age"] = userFeatureTable.loc[:, "age"] / 10.
        # ad hoc fix, make sure feature's range is similar
        self.knnModel = NearestNeighbors(n_neighbors=10, algorithm='ball_tree').fit(userFeatureTable)

        # ratingMat is the rating matrix
        self.ratingsMat = ratingsMat
        self.userFeatureTable = userFeatureTable
        self.userIds = self.userFeatureTable.index  # the actual order seen by the knnmodel

    def predict(self, userFeature):
        distances, indices = self.knnModel.kneighbors(userFeature)

        # indices are the nearest neighbors' index in the matrix, which is different from userId.
        return self.userIds[indices[0]]

    def provideRec(self, userId):
        # data is a tuple of (user feature, item feature)
        userIds = self.predict(self.userFeatureTable.loc[userId].as_matrix().reshape(1, -1))
        # remove himself as a nearest neighbor
        userIds = np.array(list(set(userIds) - set([userId])))

        # for all nearest neighbors, compute the average score, sorted from large to small
        # then report the item ids
        return self.ratingsMat[userIds - 1].mean(axis=0).argsort()[::-1] + 1
Example #28
def random_forest_single_predict(test_filename, name, feature_file, train_file, k):
    name_list, data = readfile_real_name(test_filename)
    print 'reading file...'
    test_data = data[name_list.index(name)]
    with open(train_file, 'rb') as f:
        clf = cPickle.load(f)
    print 'done'
    result_rate = (clf.predict_proba(test_data))[0]
    class_name = clf.classes_
    print name
    num = map(get_num, result_rate)
    name_list, feature_list = readfile_real_name_group(feature_file, class_name, num)
    neigh = NearestNeighbors()
    neigh.fit(feature_list)
    kneighbors_result_list = neigh.kneighbors(test_data, k, False)[0]
    print kneighbors_result_list
    for x in kneighbors_result_list:
        print name_list[x]
    classification_result = []
    average_list = []
    real_name = (name.split('_'))[0]
    counter = Counter(kneighbors_result_list)
    if real_name == name_list[counter.most_common(1)[0][0]].split('_')[0]:
        classification_result.append(1)
    else:
        classification_result.append(0)
    num = 0
    for i in kneighbors_result_list:
        if (name_list[i].split('_'))[0] == real_name:
            num += 1
    average_list.append((float)(num) / (float)(k))
    print classification_result, average_list
    return classification_result, average_list
Example #29
    def sample(s):
        if s.data is None:
            raise ValueError('data not loaded.')
        mdl = NearestNeighbors(n_neighbors=s.k, n_jobs=-1)
        minoX = s.X[s.y == s.minolab]
        majX = s.X[s.y == s.majlab]
        mdl.fit(minoX)
        _, nei_table = mdl.kneighbors()

        generated = None
        for cnt, nei_idx in enumerate(nei_table):
            x = minoX[cnt]
            if s.rate >= 0.5 * s.k:
                nei = minoX[np.random.choice(nei_idx, int(s.rate))]
                new = x + np.random.rand(int(s.rate), 1) * (nei - x)

            else:
                nei = minoX[nei_idx]
                new = x + np.random.rand(s.k, 1) * (nei - x)
                # each of the k synthesized points has an N/k * 100 % probability of being chosen
                new = new[np.random.rand(s.k) > s.rate * 1.0 / s.k]
            if generated is None:
                generated = new
            else:
                generated = np.vstack((generated, new))
        # number of generated instances
        N = len(generated)
        ret = np.hstack((np.vstack((minoX, generated, majX)),
                         np.array([s.minolab] * (minoX.shape[0] + N) + [s.majlab] * majX.shape[0])[:, None]))
        np.random.shuffle(ret)
        return ret
Example #30
def test_kernel_density_sampling(n_samples=100, n_features=3):
    rng = np.random.RandomState(0)
    X = rng.randn(n_samples, n_features)

    bandwidth = 0.2

    for kernel in ['gaussian', 'tophat']:
        # draw a tophat sample
        kde = KernelDensity(bandwidth, kernel=kernel).fit(X)
        samp = kde.sample(100)
        assert_equal(X.shape, samp.shape)

        # check that samples are in the right range
        nbrs = NearestNeighbors(n_neighbors=1).fit(X)
        dist, ind = nbrs.kneighbors(X, return_distance=True)

        if kernel == 'tophat':
            assert np.all(dist < bandwidth)
        elif kernel == 'gaussian':
            # 5 standard deviations is safe for 100 samples, but there's a
            # very small chance this test could fail.
            assert np.all(dist < 5 * bandwidth)

    # check unsupported kernels
    for kernel in ['epanechnikov', 'exponential', 'linear', 'cosine']:
        kde = KernelDensity(bandwidth, kernel=kernel).fit(X)
        assert_raises(NotImplementedError, kde.sample, 100)

    # non-regression test: used to return a scalar
    X = rng.randn(4, 1)
    kde = KernelDensity(kernel="gaussian").fit(X)
    assert_equal(kde.sample().shape, (1, 1))
Example #31
class KNearestDatasets(object):
    def __init__(self, metric='l1', random_state=None, metric_params=None):
        self.logger = get_logger(__name__)

        self.metric = metric
        self.model = None
        self.metric_params = metric_params
        self.metafeatures = None
        self.runs = None
        self.best_configuration_per_dataset = None
        self.random_state = sklearn.utils.check_random_state(random_state)
        self.scaler = MinMaxScaler()

        if self.metric_params is None:
            self.metric_params = {}

    def fit(self, metafeatures, runs):
        """Fit the Nearest Neighbor model.

        Parameters
        ----------
        metafeatures : pandas.DataFrame
            A pandas dataframe. Each row represents a dataset, each column a
            metafeature.
        runs : dict
            Dictionary containing a list of runs for each dataset.
        """
        assert isinstance(metafeatures, pd.DataFrame)
        assert metafeatures.values.dtype in (np.float32, np.float64)
        assert np.isfinite(metafeatures.values).all()
        assert isinstance(runs, pd.DataFrame)
        assert runs.shape[1] == metafeatures.shape[0], \
            (runs.shape[1], metafeatures.shape[0])

        self.metafeatures = metafeatures
        self.runs = runs
        self.num_datasets = runs.shape[1]

        # Fit the metafeatures for scaler
        self.scaler.fit(self.metafeatures)

        # for each dataset, sort the runs according to their result
        best_configuration_per_dataset = {}
        for dataset_name in runs:
            if not np.isfinite(runs[dataset_name]).any():
                best_configuration_per_dataset[dataset_name] = None
            else:
                configuration_idx = runs[dataset_name].index[np.nanargmin(
                    runs[dataset_name].values)]
                best_configuration_per_dataset[
                    dataset_name] = configuration_idx

        self.best_configuration_per_dataset = best_configuration_per_dataset

        if callable(self.metric):
            self._metric = self.metric
            self._p = 0
        elif self.metric.lower() == "l1":
            self._metric = "minkowski"
            self._p = 1
        elif self.metric.lower() == "l2":
            self._metric = "minkowski"
            self._p = 2
        else:
            raise ValueError(self.metric)

        self._nearest_neighbors = NearestNeighbors(
            n_neighbors=self.num_datasets,
            radius=None,
            algorithm="brute",
            leaf_size=30,
            metric=self._metric,
            p=self._p,
            metric_params=self.metric_params)

    def kNearestDatasets(self, x, k=1, return_distance=False):
        """Return the k most similar datasets with respect to self.metric

        Parameters
        ----------
        x : pandas.Series
            A pandas Series object with the metafeatures for one dataset

        k : int
            Number of nearest datasets to return. If k == -1,
            return all datasets sorted by similarity.

        return_distance : bool, optional. Defaults to False
            If true, distances to the new dataset will be returned.

        Returns
        -------
        list
            Names of the most similar datasets, sorted by similarity

        list
            Sorted distances. Only returned if return_distance is set to True.
        """
        assert type(x) == pd.Series
        if k < -1 or k == 0:
            raise ValueError(
                'Number of neighbors k cannot be zero or negative.')
        elif k == -1:
            k = self.num_datasets

        X_train = self.scaler.transform(self.metafeatures)
        x = x.values.reshape((1, -1))
        x = self.scaler.transform(x)
        self._nearest_neighbors.fit(X_train)
        distances, neighbor_indices = self._nearest_neighbors.kneighbors(
            x, n_neighbors=k, return_distance=True)

        assert k == neighbor_indices.shape[1]

        rval = [
            self.metafeatures.index[i]
            # Neighbor indices is 2d, each row is the indices for one
            # dataset in x.
            for i in neighbor_indices[0]
        ]

        if return_distance is False:
            return rval
        else:
            return rval, distances[0]

    def kBestSuggestions(self, x, k=1, exclude_double_configurations=True):
        assert type(x) == pd.Series
        if k < -1 or k == 0:
            raise ValueError(
                'Number of neighbors k cannot be zero or negative.')
        nearest_datasets, distances = self.kNearestDatasets(
            x, -1, return_distance=True)

        kbest = []

        added_configurations = set()
        for dataset_name, distance in zip(nearest_datasets, distances):
            best_configuration = self.best_configuration_per_dataset[
                dataset_name]

            if best_configuration is None:
                self.logger.warning("Found no best configuration for instance "
                                    "%s" % dataset_name)
                continue

            if exclude_double_configurations:
                if best_configuration not in added_configurations:
                    added_configurations.add(best_configuration)
                    kbest.append((dataset_name, distance, best_configuration))
            else:
                kbest.append((dataset_name, distance, best_configuration))

            if k != -1 and len(kbest) >= k:
                break

        if k == -1:
            k = len(kbest)
        return kbest[:k]
Example #32
import numpy as np
from sklearn.neighbors import NearestNeighbors
from glob import glob
import os
from skimage import io
import matplotlib.pyplot as plt

""" Use kNN as baseline algorithm for finidng nearest neighbors.  Validate results by observing classification error.  
The concept being that a model with less classification error will find nearest neighbors better as well."""

imgs = np.load(r'D:\pycharm_projects\AWSgeo\data.npy')
# imgs_labels = np.load(r'D:\pycharm_projects\AWSgeo\labels.npy')

logits = np.load(r'D:\pycharm_projects\AWSgeo\Tensorboard\model_2019-12-18-08-45-59\data_logits.npy')

############## sklearn ##############
rand_arrange = np.random.permutation(len(logits))
ind = -1
neigh = NearestNeighbors(5)
neigh.fit(logits[rand_arrange[:-1000]])
knns = neigh.kneighbors(logits[rand_arrange[ind]].reshape(1,-1), 6, return_distance=False)

plt.figure();plt.imshow(imgs[rand_arrange[ind]])
io.imshow_collection(imgs[rand_arrange[knns[0]]])




########## openCV ##################

Example #33
# Load the libraries.
from sklearn import datasets
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# Load the data.
iris = datasets.load_iris()
features = iris.data

# Create a StandardScaler instance.
standardizer = StandardScaler()

# Standardize the features.
features_standardized = standardizer.fit_transform(features)

# Two nearest neighbors.
nearest_neighbors = NearestNeighbors(n_neighbors=2).fit(features_standardized)

# Create an observation.
new_observation = [ 1,  1,  1,  1]

# Compute the distances and find the indices of the observation's nearest neighbors.
distances, indices = nearest_neighbors.kneighbors([new_observation])

# Display the nearest neighbors.
features_standardized[indices]
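
A small follow-up sketch (not in the original snippet): the fitted estimator can be queried again with a different k and will return distances alongside the indices.

# Three nearest neighbors of the same observation, with their distances.
distances, indices = nearest_neighbors.kneighbors([new_observation], n_neighbors=3)
print(indices, distances)
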
Example #34
    TruncatedSVD_X = svd.transform(X)

    max_indices = np.argmax(TruncatedSVD_X, axis=1)
    id2cluster = {}
    for i in range(len(sethastypeSig)):
        no = sethastypeSig[i]
        id2cluster[no] = max_indices[i]
        #print no
    print 'start to get nearest neighbors'
    X_latent_neigh = np.loadtxt(latentfileName)
    nbrs = NearestNeighbors(n_neighbors=10,
                            algorithm='kd_tree').fit(X_latent_neigh)
    print 'end to get nearest neighbors'
    for key in set_miss_rel:
        # print 'set_miss_rel',key
        distances, indices = nbrs.kneighbors(X_latent_neigh[key:key + 1])
        neighbors = indices
        indice_num = np.shape(neighbors)[1]
        neighbours = {}
        neighbours_no = 0
        for i in range(indice_num):
            ner = neighbors[0, i]
            temp = id2cluster.get(ner)
            if temp != None:
                id2cluster[key] = temp
                break
#                if neighbours.get(temp)!=None:
#                    neighbours[temp] = neighbours[temp] + 1
#                else:
#                    neighbours[temp] = 1
#                neighbours_no = neighbours_no + 1
Example #35
def getReviewSentencesNNs(dictRestIDToFoodItemFoodItemVecSentSentVec,
                          posNegSeeds,
                          withinDis=.25):
    restIDfoodItemFoodItemVecSentSentVecs = makeDictAFiveLists(
        dictRestIDToFoodItemFoodItemVecSentSentVec)
    print "  length all reviews", dictCount(
        dictRestIDToFoodItemFoodItemVecSentSentVec), len(
            restIDfoodItemFoodItemVecSentSentVecs)
    X = list(map(itemgetter(4), restIDfoodItemFoodItemVecSentSentVecs))
    Y = list(map(itemgetter(4), posNegSeeds))  #posNegSeed[4]
    NNs = []
    #X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    #Y = np.array([[0,0]])
    nbrs = NearestNeighbors(n_neighbors=20, algorithm='ball_tree').fit(X)
    try:
        allDistances, indices = nbrs.kneighbors(Y)
    except Exception as e:
        print("nbrs.kneighbors(Y) failed:", e)
        raise
    for i, distancesFromASeed in enumerate(allDistances):
        for j, dist in enumerate(
                distancesFromASeed
        ):  #0 since Y is just one seed we are looking at at a time
            if dist <= withinDis:
                try:
                    if (dist == 0.0) and (posNegSeeds[i][1] in map(
                            itemgetter(0),
                            dictRestIDToFoodItemFoodItemVecSentSentVec.get(
                                restIDfoodItemFoodItemVecSentSentVecs[indices[
                                    i, j]][0])
                    )) and (posNegSeeds[i][3] in map(
                            itemgetter(2),
                            dictRestIDToFoodItemFoodItemVecSentSentVec.get(
                                restIDfoodItemFoodItemVecSentSentVecs[indices[
                                    i, j]][0]))):
                        #and (restIDfoodItemFoodItemVecSentSentVecs[indices[i,j]][1] == posNegSeeds[i][1]) and (restIDfoodItemFoodItemVecSentSentVecs[indices[i,j]][3] == posNegSeeds[i][3]):
                        beforeCount = dictCount(
                            dictRestIDToFoodItemFoodItemVecSentSentVec)
                        removeSeedsFromDict(
                            dictRestIDToFoodItemFoodItemVecSentSentVec,
                            [posNegSeeds[i]]
                        )  # perhaps this happens when it wasn't found earlier; not sure yet, still investigating
                        if dictCount(dictRestIDToFoodItemFoodItemVecSentSentVec
                                     ) == beforeCount:
                            #print("try to remove a seed during getReviewSentencesNNs: no sucess")
                            removeSeedsFromDict(
                                dictRestIDToFoodItemFoodItemVecSentSentVec,
                                [posNegSeeds[i]])
                        else:
                            dictRestIDToFoodItemFoodItemVecSentSentVec.get(
                                restIDfoodItemFoodItemVecSentSentVecs[indices[
                                    i, j]][0])
                            #print("try to remove a seed during getReviewSentencesNNs: sucess")
                    else:
                        NNs.append(
                            restIDfoodItemFoodItemVecSentSentVecs[indices[i,
                                                                          j]])
                except TypeError as e:
                    pass
                    #print restIDfoodItemFoodItemVecSentSentVecs[indices[i,j]][0]
                    #dictRestIDToFoodItemFoodItemVecSentSentVec.get(restIDfoodItemFoodItemVecSentSentVecs[indices[i,j]][0])
            else:
                break  #the rest of the distances will only get further away
    return NNs
    np.save(GLOVE_VEC_PATH, vecs.astype(np.float32))


# Compute the real nearest neighbors for a set of test words.
if not (os.path.exists(GLOVE_KNN_PATH)):

    with open(GLOVE_VOC_PATH) as fp:
        words = list(map(str.strip, fp))
        word2idx = {w: i for i, w in enumerate(words)}

    vecs = np.load(GLOVE_VEC_PATH)
    knn = NearestNeighbors(n_neighbors=5, algorithm='brute', metric='euclidean')
    knn.fit(vecs)

    test_ii = list(map(word2idx.get, GLOVE_TEST_WORDS))
    nbrs = knn.kneighbors(vecs[test_ii], return_distance=False)

    with open(GLOVE_KNN_PATH, "w") as fp:
        for word, nbrs_ in zip(GLOVE_TEST_WORDS, nbrs):
            fp.write("%s %s\n" % (word, " ".join([words[i] for i in nbrs_])))

# Fit LSH models and compute the hash from each model on each word vector.
if not os.path.exists(LSH_HASHES_PATH):

    with open(GLOVE_VOC_PATH) as fp:
        words = list(map(str.strip, fp))
        word2idx = {w: i for i, w in enumerate(words)}

    vecs = np.load(GLOVE_VEC_PATH)

    lsh_models = [LSHModel(seed=i, H=H).fit(vecs) for i in range(L)]
Example #37
0
            genre_of_movie = cur.fetchall()
            genre_of_movie = [x[0] for x in genre_of_movie]
            movies_list.append((
                i[0],
                i[1],
                len(set(genre_of_movie).intersection(genre_list)),
            ))

movies_list = sorted(movies_list, key=cmp2, reverse=True)
movies_list = movies_list[:50]

# create a training dataset and find the k nearest neighbors.
training_data = [[item[1]] for item in movies_list]
train = np.array(training_data)
nbrs = NearestNeighbors()
nbrs.fit(train)
indices = nbrs.kneighbors([[item[1]] for item in mov_id],
                          n_neighbors=7,
                          return_distance=False)

# print indices

# print the results
for value in indices[0]:
    temp = cur.execute('''
        SELECT movie FROM Movies WHERE id = (?)''',
                       (movies_list[value][0], )).fetchone()[0]
    print(temp)

conn.close()

# http://scikit-learn.org/stable/modules/neighbors.html

from sklearn.neighbors import NearestNeighbors
import numpy as np
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(X)
distances, indices = nbrs.kneighbors(X)
indices                                           
distances
nbrs.kneighbors_graph(X).toarray()

-------------------------------------------------------------------------------------------


>>> from sklearn.neighbors import KDTree
>>> import numpy as np
>>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
>>> kdt = KDTree(X, leaf_size=30, metric='euclidean')
>>> kdt.query(X, k=2, return_distance=False)
Example #39
0
    def n_closest(
        self,
        word,
        num_closest=5,
        metric='cosine',
        return_similarity=True,
        mode_bidirectional='mean',
        mode_sequence='mean',
    ):
        """
        find nearest words based on a word.

        Parameters
        ----------
        word: str
            Eg, 'najib'
        num_closest: int, (default=5)
            number of words closest to the result.
        metric: str, (default='cosine')
            vector distance algorithm.
        return_similarity: bool, (default=True)
            if True, will return between 0-1 represents the distance.

        Returns
        -------
        word_list: list of nearest words
        """
        if not isinstance(word, str):
            raise ValueError('input must be a string')
        if not isinstance(num_closest, int):
            raise ValueError('num_closest must be an integer')
        if not isinstance(metric, str):
            raise ValueError('metric must be a string')
        if not isinstance(return_similarity, bool):
            raise ValueError('return_similarity must be a boolean')
        if return_similarity:
            nn = NearestNeighbors(n_neighbors=num_closest + 1,
                                  metric=metric).fit(self._embed_matrix)
            distances, idx = nn.kneighbors(
                self.get_vector_by_name(
                    word,
                    mode_bidirectional=mode_bidirectional,
                    mode_sequence=mode_sequence,
                ).reshape((1, -1)))
            word_list = []
            for i in range(1, idx.shape[1]):
                word_list.append(
                    [self._reverse_dictionary[idx[0, i]], 1 - distances[0, i]])
            return word_list
        else:
            wv = self.get_vector_by_name(
                word,
                mode_bidirectional=mode_bidirectional,
                mode_sequence=mode_sequence,
            )
            closest_indices = self.closest_row_indices(wv, num_closest + 1,
                                                       metric)
            word_list = []
            for i in closest_indices:
                word_list.append(self._reverse_dictionary[i])
            if word in word_list:
                word_list.remove(word)
            return word_list
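
A hedged usage sketch for the method above; `model` stands for an already-constructed instance of the surrounding class (its construction is not shown in this excerpt), and the query word is taken from the docstring example:

# Hypothetical usage: the five words closest to 'najib', with cosine similarities.
neighbours = model.n_closest('najib', num_closest=5, metric='cosine')
for word, similarity in neighbours:
    print(word, similarity)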
Example #40
0
class MLkNN(MLClassifierBase):
    """kNN classification method adapted for multi-label classification

    Parameters
    ----------

    k : integer
        number of neighbours of each input instance to take into account

    s : float
        the smoothing parameter

    ignore_first_neighbours : integer
        ability to ignore the first N neighbours, useful for comparing with other classification software; if you
        don't know what it does, the default is safe, see https://github.com/scikit-multilearn/scikit-multilearn/issues/22

    """
    BRIEFNAME = "MLkNN"

    def __init__(self, k=10, s=1.0, ignore_first_neighbours=0):
        super(MLkNN, self).__init__()
        self.k = k  # Number of neighbours
        self.s = s  # Smooth parameter
        self.ignore_first_neighbours = ignore_first_neighbours
        self.copyable_attrs = ['k', 's', 'ignore_first_neighbours']

    def compute_prior(self, y):
        prior_prob_true = np.array(
            (self.s + y.sum(axis=0)) / (self.s * 2 + self.num_instances))[0]
        prior_prob_false = 1 - prior_prob_true

        return prior_prob_true, prior_prob_false

    def compute_cond(self, X, y):
        self.knn = NearestNeighbors(n_neighbors=self.k).fit(X)
        c = sparse.lil_matrix((self.num_labels, self.k + 1), dtype='i8')
        cn = sparse.lil_matrix((self.num_labels, self.k + 1), dtype='i8')

        label_info = get_matrix_in_format(y, 'dok')

        neighbors = [
            a[self.ignore_first_neighbours:]
            for a in self.knn.kneighbors(X,
                                         self.k + self.ignore_first_neighbours,
                                         return_distance=False)
        ]

        for instance in range(self.num_instances):
            deltas = label_info[neighbors[instance], :].sum(axis=0)
            for label in range(self.num_labels):
                if label_info[instance, label] == 1:
                    c[label, deltas[0, label]] += 1
                else:
                    cn[label, deltas[0, label]] += 1

        c_sum = c.sum(axis=1)
        cn_sum = cn.sum(axis=1)

        cond_prob_true = sparse.lil_matrix((self.num_labels, self.k + 1),
                                           dtype='float')
        cond_prob_false = sparse.lil_matrix((self.num_labels, self.k + 1),
                                            dtype='float')
        for label in range(self.num_labels):
            for neighbor in range(self.k + 1):
                cond_prob_true[label,
                               neighbor] = (self.s + c[label, neighbor]) / (
                                   self.s * (self.k + 1) + c_sum[label, 0])
                cond_prob_false[label,
                                neighbor] = (self.s + cn[label, neighbor]) / (
                                    self.s * (self.k + 1) + cn_sum[label, 0])
        return cond_prob_true, cond_prob_false

    def fit(self, X, y):
        """Fit classifier with training data

        :param X: input features
        :type X: dense or sparse matrix (n_samples, n_features)
        :param y: binary indicator matrix with label assignments
        :type y: dense or sparse matrix of {0, 1} (n_samples, n_labels)
        :returns: Fitted instance of self

        """

        self.train_labels = get_matrix_in_format(y, 'lil')
        self.num_instances = self.train_labels.shape[0]
        self.num_labels = self.train_labels.shape[1]
        # Computing the prior probabilities
        self.prior_prob_true, self.prior_prob_false = self.compute_prior(
            self.train_labels)
        # Computing the posterior probabilities
        self.cond_prob_true, self.cond_prob_false = self.compute_cond(
            X, self.train_labels)
        return self

    def predict(self, X):
        """Predict labels for X

        :param X: input features
        :type X: dense or sparse matrix (n_samples, n_features)
        :returns: binary indicator matrix with label assignments
        :rtype: sparse matrix of int (n_samples, n_labels)

        """

        result = np.zeros((X.shape[0], self.num_labels))
        neighbors = [
            a[self.ignore_first_neighbours:]
            for a in self.knn.kneighbors(X,
                                         self.k + self.ignore_first_neighbours,
                                         return_distance=False)
        ]
        for instance in range(X.shape[0]):
            deltas = self.train_labels[neighbors[instance], ].sum(axis=0)

            for label in range(self.num_labels):
                p_true = self.prior_prob_true[label] * self.cond_prob_true[
                    label, deltas[0, label]]
                p_false = self.prior_prob_false[label] * self.cond_prob_false[
                    label, deltas[0, label]]

                result[instance, label] = p_true / (p_false + p_true)
                # Originally this returned int(p_true > p_false); the source was changed to return the probability to match the paper.
        return result

    def predict_proba(self, X):
        """Predict probabilities of label assignments for X

        :param X: input features
        :type X: dense or sparse matrix (n_samples, n_features)
        :returns: matrix with label assignment probabilities
        :rtype: sparse matrix of float (n_samples, n_labels)
        
        """
        result = sparse.lil_matrix((X.shape[0], self.num_labels),
                                   dtype='float')
        neighbors = [
            a[self.ignore_first_neighbours:]
            for a in self.knn.kneighbors(X,
                                         self.k + self.ignore_first_neighbours,
                                         return_distance=False)
        ]
        for instance in range(X.shape[0]):
            deltas = self.train_labels[neighbors[instance], ].sum(axis=0)

            for label in range(self.num_labels):
                p_true = self.prior_prob_true[label] * self.cond_prob_true[
                    label, deltas[0, label]]
                p_false = self.prior_prob_false[label] * self.cond_prob_false[
                    label, deltas[0, label]]
                result[instance, label] = p_true
        return result
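
A short usage sketch for the classifier above, assuming the scikit-multilearn base class and helpers imported by this module are available; the toy data is illustrative only:

# Hypothetical usage: fit MLkNN on a tiny random multi-label problem.
import numpy as np
from scipy import sparse

rng = np.random.RandomState(0)
X_toy = rng.rand(20, 4)                                      # 20 samples, 4 features
y_toy = sparse.lil_matrix(rng.randint(0, 2, size=(20, 3)))   # 3 binary labels

classifier = MLkNN(k=3, s=1.0)
classifier.fit(X_toy, y_toy)
scores = classifier.predict(X_toy)   # per-label scores, one row per sample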
        test_y_all = []
        nr_events_all = []
        offline_time_fit = 0
        current_online_event_times = []

        for _, dt_test_bucket in dt_test_prefixes.groupby(
                dataset_manager.case_id_col):

            # select current test case
            test_y_all.extend(
                dataset_manager.get_label_numeric(dt_test_bucket))
            nr_events_all.append(len(dt_test_bucket))

            start = time.time()
            encoded_case = bucket_encoder.fit_transform(dt_test_bucket)
            _, knn_idxs = bucketer.kneighbors(encoded_case)
            knn_idxs = knn_idxs[0]

            relevant_cases_bucket = encoded_train.iloc[knn_idxs].index
            dt_train_bucket = dataset_manager.get_relevant_data_by_indexes(
                dt_train_prefixes, relevant_cases_bucket)  # one row per event
            train_y = dataset_manager.get_label_numeric(dt_train_bucket)

            if len(set(train_y)) < 2:
                preds_all.append(train_y[0])
            else:
                feature_combiner = FeatureUnion([
                    (method,
                     EncoderFactory.get_encoder(method, **cls_encoder_args))
                    for method in methods
                ])
Example #42
0
class KNearestNeighborsAssignement(object):
    """   """
    def __init__(self,
                 feature_name,
                 max_distance,
                 data_directory="",
                 n_neighbors=1,
                 algorithm="ball_tree",
                 weights="distance"):
        """
        """
        self.feature_dim = None
        self.feature_name = feature_name
        self.trained = False
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.nb_samples = 0
        self.algorithm = algorithm
        self.max_distance = max_distance
        if data_directory == "":
            self.data_directory = "/tmp"
        else:
            self.data_directory = data_directory
        self.model = NearestNeighbors(n_neighbors=self.n_neighbors,
                                      algorithm=self.algorithm)
        try:
            data = np.load(self.data_directory + "/" + self.feature_name +
                           "_knn_classif.npz")
            self.X = list(data["x"])
            self.Y = list(data["y"])
            self.nb_samples = len(self.X)
            self.feature_dim = len(self.X[0])
            if self.n_neighbors is None:
                self.n_neighbors = math.sqrt(len(self.X[0]))
            self.train()
        except Exception:
            self.X = []
            self.Y = []

    def train(self):
        """
        """
        self.model.fit(np.array(self.X))

    def update(self, feature, label):
        """
        """
        if self.feature_dim is None:
            self.feature_dim = len(feature)
        self.X.append(feature)
        self.Y.append(label)
        self.nb_samples += 1

    def predict(self, feature):
        """
        """
        distances, matchs = self.model.kneighbors([feature])
        distance = distances[0][0]
        if distance > self.max_distance:
            return False, "unknown", 0.0
        indice = matchs[0][0]
        label = self.Y[indice]
        return True, label, distance

    def __del__(self):
        """
        """
        pass
        # TODO: save data
    def save(self, file):
        file = open(file, 'wb')
        pickle.dump(self.model, file)
        file.close()

    def load(self, file):
        file = open(file, 'rb')
        self.model = pickle.load(file)
        file.close()
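
A brief usage sketch for the class above; the feature vectors and labels are made up, and the data directory is left at its default:

# Hypothetical usage: accumulate labelled features, fit the model, then query it.
assigner = KNearestNeighborsAssignement('face_embedding', max_distance=0.5)
assigner.update([0.10, 0.20, 0.30], 'alice')
assigner.update([0.90, 0.80, 0.70], 'bob')
assigner.train()
known, label, distance = assigner.predict([0.12, 0.21, 0.31])
print(known, label, distance)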
    ax2.legend(targets)
    ax2.grid()
    plt.show()

    from sklearn.neighbors import NearestNeighbors
    # test_data_size = tst.shape[0]

    test_data = portfolio_points
    # test_data = tst

    X = cropped_components
    # X = data[:-test_data_size]
    # test_data = data[-test_data_size:]
    nbrs = NearestNeighbors(n_neighbors=6, algorithm='ball_tree').fit(X)
    # distances, indices = nbrs.kneighbors(X)
    test_distances, test_indices = nbrs.kneighbors(test_data)
    dic = {name: 0 for name in names}

    print ("Calculating success rate for test using Nearest Neighboirs")
    for name in finalDfPerIm.loc[test_indices.flatten()]['target']:
        dic[name] += 1
    m = test_indices.size
    for k, v in sorted(dic.items(), key=operator.itemgetter(0), reverse=True):
        print (k , "\t : \t", (int)((v / m) * 100) , "%")
    pass


    from sklearn.neighbors.nearest_centroid import NearestCentroid
    y = np.array(list(filter(lambda name: "Portfolio" in name, targs)))
    clf = NearestCentroid()
    clf.fit(X, y)
Example #44
0

kmeans = KMeans(n_clusters=800)
preprocessed_image = []
files = [x for x in os.listdir() if "jpg" in x]
print(files)
images = [cv2.imread(img) for img in files]
descriptor_list = np.array([])
for image in images:
    image = gray(image)
    keypoint, descriptor = features(image, extractor)
    if len(descriptor_list) == 0:
        descriptor_list = np.array(descriptor)
    else:
        descriptor_list = np.vstack((descriptor_list, descriptor))
kmeans.fit(descriptor_list)
for image in images:
    image = gray(image)
    keypoint, descriptor = features(image, extractor)
    if (descriptor is not None):
        histogram = build_histogram(descriptor, kmeans)
        preprocessed_image.append(histogram)

data = cv2.imread("book1.jpg")
data = gray(data)
keypoint, descriptor = features(data, extractor)
histogram = build_histogram(descriptor, kmeans)
neighbor = NearestNeighbors(n_neighbors=5)
neighbor.fit(preprocessed_image)
dist, result = neighbor.kneighbors([histogram])
print([files[i] for i in result[0]])
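
The snippet above calls three helpers (`gray`, `features`, `build_histogram`) that are not shown in this excerpt; a plausible minimal sketch of what they might look like, offered as an assumption rather than the project's actual code:

import cv2
import numpy as np

def gray(image):
    # Convert a BGR image to grayscale before keypoint detection.
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

def features(image, extractor):
    # Detect keypoints and compute descriptors with the given extractor (e.g. SIFT or ORB).
    keypoints, descriptors = extractor.detectAndCompute(image, None)
    return keypoints, descriptors

def build_histogram(descriptor, kmeans):
    # Count how many descriptors fall into each visual word of the fitted vocabulary.
    histogram = np.zeros(kmeans.n_clusters)
    for word in kmeans.predict(descriptor):
        histogram[word] += 1
    return histogram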
  instances_per_class[i,0] = np.size(elements)


for ind, cnn_layer in enumerate(layer_names): #iterate through layers and count precision and recall for test data
  print(cnn_layer)
  features = Model(inputs=new_model.input, outputs=new_model.get_layer(cnn_layer).output)
  train_pred = features.predict(train_data)
  test_pred = features.predict(test_data)
  
  #find the k nearest neighbors of an image
  nn_model = NearestNeighbors(n_neighbors=n_nearest_imgs, metric='cosine')
  nn_model.fit(train_pred)
  
  for j in range(0,test_data.shape[0]):  #iterate throught test images
    an_img = test_pred[j,:].reshape(1, -1)
    distances, indices = nn_model.kneighbors(an_img)
    
    s=0
    for w in np.nditer(indices): #iterate through the most similar images
      if train_labels[w,0] == test_labels[j,0]:
        score[ind,0] += 1
        s += 1
        
      if train_labels[w,1] == test_labels[j,1]:
        score[ind,1] += 1
    
    score[ind,2] += s/instances_per_class[int(test_labels[j,0]),0] #recall per image
    
    
  score[ind,2] /= test_data.shape[0] #recall of breed
  score[ind,0] /= max_images #precision of breed
Example #46
0
    imgs_train_reconstruct = model.decoder.predict(E_train)
    if modelName == "simpleAE":
        imgs_train_reconstruct = imgs_train_reconstruct.reshape(
            (-1, ) + shape_img_resize)
    plot_reconstructions(imgs_train,
                         imgs_train_reconstruct,
                         os.path.join(outDir,
                                      "{}_reconstruct.png".format(modelName)),
                         range_imgs=[0, 255],
                         range_imgs_reconstruct=[0, 1])

# Fit kNN model on training images
print("Fitting k-nearest-neighbour model on training images...")
knn = NearestNeighbors(n_neighbors=5, metric="cosine")
knn.fit(E_train_flatten)

# Perform image retrieval on test images
print("Performing image retrieval on test images...")
for i, emb_flatten in enumerate(E_test_flatten):
    _, indices = knn.kneighbors([emb_flatten
                                 ])  # find k nearest train neighbours
    img_query = imgs_test[i]  # query image
    imgs_retrieval = [imgs_train[idx]
                      for idx in indices.flatten()]  # retrieval images
    outFile = os.path.join(outDir, "{}_retrieval_{}.png".format(modelName, i))
    plot_query_retrieval(img_query, imgs_retrieval, outFile)

# Plot t-SNE visualization
print("Visualizing t-SNE on training images...")
outFile = os.path.join(outDir, "{}_tsne.png".format(modelName))
plot_tsne(E_train_flatten, imgs_train, outFile)
Example #47
0
    def over_sampling(self):
        if self.k + 1 > self.n_train_less:
            print(
                'Expected n_neighbors <= n_samples, but n_samples = {} and n_neighbors = {}; '
                'n_neighbors has been changed to {}'.format(
                    self.n_train_less, self.k + 1, self.n_train_less))
            self.k = self.n_train_less - 1
        data_less_filter = []
        num_maj_filter = []
        length_less = len(self.train_less)
        num_maj = number_maj(self.train[:, 1:], self.train_less[:, 1:],
                             self.tp_less, self.train[:, 0])
        for m in range(len(num_maj)):
            if num_maj[m] < self.k:
                data_less_filter.append(self.train_less[m])
                num_maj_filter.append(num_maj[m])
        self.train_less = np.array(data_less_filter)
        distance_more, nn_array_more = NearestNeighbors(
            n_neighbors=self.k + 1).fit(self.train_more[:, 1:]).kneighbors(
                self.train_less[:, 1:], return_distance=True)
        distance_less, nn_array = NearestNeighbors(n_neighbors=self.k + 1).fit(
            self.train_less[:, 1:]).kneighbors(self.train_less[:, 1:],
                                               return_distance=True)

        distance_less = distance_less.sum(axis=1)
        distance_more = distance_more.sum(axis=1)
        distance = distance_less / distance_more
        # print(distance)
        density = 1 / distance  # calculate density

        density = list(
            map(lambda x: min(100, x),
                density))  # Control the maximum density range at 100

        # The density is sorted below, and the minority samples are also sorted in order of density.
        density_sorted = sorted(range(len(density)),
                                key=lambda a: density[a],
                                reverse=True)  # sorted
        data_resorted = []
        density_sorted_data = []
        num_sorted = []
        for i in range(len(self.train_less)):
            data_resorted.append(self.train_less[density_sorted[i]])
            density_sorted_data.append(density[density_sorted[i]])
            num_sorted.append(num_maj_filter[density_sorted[i]])

        density = np.array(density_sorted_data)
        cluster_big_density = []
        cluster_small_density = []
        cluster_big_data = []
        cluster_small_data = []
        cluster_big_num = []
        cluster_small_num = []
        cluster = k_means(X=density.reshape((len(density), 1)), n_clusters=2)
        for i in range(cluster[1].shape[0] - 1):
            if cluster[1][i] != cluster[1][i + 1]:  # Partition the clusters at the boundary
                cluster_big_density = density[:i + 1]
                cluster_big_data = np.array(data_resorted)[:i + 1, :]
                cluster_big_num = num_sorted[:i + 1]
                cluster_small_density = density[i + 1:]
                cluster_small_data = np.array(data_resorted)[i + 1:, :]
                cluster_small_num = num_sorted[i + 1:]
                break

        # If there is only one point in a cluster, do not divide the cluster
        if len(cluster_big_data) < 2 or len(cluster_small_data) < 2:
            cluster_big_data = np.array(data_resorted)
            cluster_big_density = density
            cluster_big_num = num_sorted
            flag = 1  # if flag==1 only run big cluster once
        else:
            flag = 2
        sum_0 = 0
        sum_1 = 0
        # Calculate weight
        for p in range(len(cluster_big_num)):
            sum_0 += (5 - cluster_big_num[p]) / (self.k + 1)
        for p in range(len(cluster_small_num)):
            sum_1 += (5 - cluster_small_num[p]) / (self.k + 1)

        ratio = []  # save each cluster's total weight
        ratio.append(sum_0)
        ratio.append(sum_1)
        wight = [5 / 6, 4 / 6, 3 / 6, 2 / 6, 1 / 6]
        kk = self.k
        diff = len(self.train_more
                   ) - length_less  # the number of samples that need to be synthesized
        totol_less = len(self.train_less)

        for i in range(flag):
            if i == 0:  # big cluster
                density = cluster_big_density
                self.n_train_less = len(cluster_big_data)
                self.train_less = cluster_big_data
                maj_num_ab = cluster_big_num
            else:  # small cluster
                density = cluster_small_density
                self.n_train_less = len(cluster_small_data)
                self.train_less = cluster_small_data
                maj_num_ab = cluster_small_num

            self.k = min(
                len(self.train_less) - 1,
                kk)  # if len(self.train_less) <= k, set k = len(self.train_less) - 1

            # The number of sample points that need to be inserted at each point
            if flag == 1:
                number_synthetic = int(
                    len(self.train_more) / self.IR - len(self.train_less))
            else:
                if i == 0:
                    number_synthetic = int(
                        (len(self.train_less) / totol_less) * diff)
                    len_big = number_synthetic
                else:
                    number_synthetic = diff - len_big

            # Calculate how many points should be inserted for each sample
            N = list(
                map(lambda x: int((x / ratio[i]) * number_synthetic), wight))
            self.reminder = number_synthetic - sum(N)
            self.num = 0

            neighbors = NearestNeighbors(n_neighbors=self.k + 1).fit(
                self.train_less[:, 1:])
            nn_array = neighbors.kneighbors(self.train_less[:, 1:],
                                            return_distance=False)

            self.synthetic = np.zeros((number_synthetic, self.n_attrs - 1))
            for p in range(self.train_less.shape[0]):
                self._populate(p, nn_array[p][1:], number_synthetic, N,
                               maj_num_ab)

            label_synthetic = np.array([self.tp_less] *
                                       number_synthetic).reshape(
                                           (number_synthetic, 1))
            np.random.seed(self.random_state)
            synthetic_dl = self.synthetic
            synthetic_dl = np.hstack(
                (label_synthetic, synthetic_dl))  # class column

            data_res = synthetic_dl
            if i == 0:
                return_data = np.vstack((copy.deepcopy(self.train), data_res))
                if flag == 1:
                    return return_data
                self.new_index = 0
            else:
                return_data = np.vstack((copy.deepcopy(return_data), data_res))

                return return_data
Example #48
0
def filterFunc(mrdModel, testData):
    qDim = mrdModel.X.mean.values.shape[1]

    scales1 = mrdModel.Y0.kern.input_sensitivity(summarize=False)
    scales2 = mrdModel.Y1.kern.input_sensitivity(summarize=False)

    scales1 = scales1 / scales1.max()
    scales2 = scales2 / scales2.max()

    # get the number of dimensions
    yThresh = 0.05
    indices = np.asarray(range(qDim))
    active1 = indices[scales1 >= yThresh]
    active2 = indices[scales2 >= yThresh]
    sharedDims = np.intersect1d(active1, active2)
    nShared = len(sharedDims)

    # get init latent state from optimization
    hybridFPS = 10000.0
    deltaT = 1.0 / hybridFPS

    # state transition matrix
    def f_cv(x, dt):
        nShared = len(x) // 2
        F = np.eye(2 * nShared)
        F[:nShared, nShared:] = dt * np.eye(nShared)
        return np.dot(F, x)

    def h_cv(x):
        nShared = len(x) // 2
        return x[:nShared]

    # create kalman filter
    sigmas = filterpy.kalman.MerweScaledSigmaPoints(n=2 * nShared,
                                                    alpha=0.1,
                                                    beta=2.0,
                                                    kappa=1.0)
    kf = UKF(dim_x=2 * nShared,
             dim_z=nShared,
             fx=f_cv,
             hx=h_cv,
             dt=deltaT,
             points=sigmas)

    # init state
    yIn = testData['Cloud'][0, :]
    [xPredict, infX] = mrdModel.Y0.infer_newX(yIn[None, :], optimize=True)
    xPredict = xPredict.mean

    kf.x = np.zeros((2 * nShared))
    kf.x[:nShared] = xPredict[0, sharedDims]

    # init covariance
    kf.P *= 1e-4

    # process and measurement noise
    kf.Q *= 1e-5
    kf.R *= 1e-3

    # model variables
    kKey = 'Cloud'
    mKey = 'TopCoord'

    qDim = mrdModel.X.mean.shape[1]
    nDimIn = testData[kKey].shape[1]
    nDimOut = testData[mKey].shape[1]
    nSamples = testData[mKey].shape[0]
    latentVals = np.zeros((nSamples, qDim))
    predictVals = np.zeros((nSamples, nDimOut))

    # obtain the training data latent positions
    latentPositions = mrdModel.X.mean
    nn = NearestNeighbors(n_neighbors=5,
                          algorithm='kd_tree').fit(mrdModel.Y0.Y)

    startTime = time.time()
    for n in range(nSamples):
        yIn = testData[kKey][n, :]
        yTrueOut = testData[mKey][n, :]

        kf.predict()
        if n % hybridFPS == 0:
            [xPredict, infX] = mrdModel.Y0.infer_newX(yIn[None, :],
                                                      optimize=True)
            xPredict = xPredict.mean
            kf.update(xPredict[0, sharedDims], R=1e-6 * np.eye(nShared))
        else:
            _, indices = nn.kneighbors(np.atleast_2d(yIn))
            xPredict = latentPositions[indices[0], :].mean(axis=0)
            kf.update(xPredict[sharedDims])

        # how to apply hybrid here??
        # kalman filter
        latentVal = np.atleast_2d(xPredict)
        latentVal[0, sharedDims] = kf.x[:nShared]

        yOut = mrdModel.predict(latentVal, Yindex=1)
        latentVals[n, :] = latentVal
        predictVals[n, :] = yOut[0]
        sys.stdout.write('.')
        sys.stdout.flush()
    stopTime = time.time()
    print('\nFinished Strategy Hybrid')

    nrmse = np.divide(
        np.sqrt(
            metrics.mean_squared_error(testData[mKey],
                                       predictVals,
                                       multioutput='raw_values')),
        testData[mKey].max(axis=0) - testData[mKey].min(axis=0))
    rmse = np.sqrt(
        metrics.mean_squared_error(testData[mKey],
                                   predictVals,
                                   multioutput='raw_values'))
    corr = np.zeros((1, nDimOut))
    for d in range(nDimOut):
        corr[0, d], _ = stats.pearsonr(testData[mKey][:, d], predictVals[:, d])

    results = {}
    results['corr'] = corr
    results['rmse'] = rmse
    results['nrmse'] = nrmse
    results['pred'] = predictVals
    results['latent'] = latentVals
    results['time'] = nSamples / (stopTime - startTime)
    return results
Example #49
0
import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
neighbors = NearestNeighbors(n_neighbors=2).fit(X)
distances, indices = neighbors.kneighbors(X, return_distance=True)
print(distances)
print(indices)
print(neighbors.kneighbors_graph(X).toarray())
Example #50
0
class Database(object):
    def __init__(self, database_vectors, targets, metric='cosine'):
        self.nn = NearestNeighbors(n_neighbors=database_vectors.shape[0],
                                   algorithm='brute',
                                   metric=metric)
        self.nn.fit(database_vectors)
        self.targets = np.asarray(targets, dtype=int)
        bins = np.bincount(self.targets)
        idx = np.nonzero(bins)[0]
        self.instances_per_target = dict(zip(idx, bins[idx]))
        self.number_of_instances = float(len(targets))
        self.recall_levels = np.arange(0, 1.01, 0.1)
        self.fine_recall_levels = np.arange(0, 1.01, 0.05)

    def get_binary_relevances(self, queries, targets):
        """
        Executes the queries and returns the binary relevance vectors (one vector for each query)
        :param queries: the queries
        :param targets: the label of each query
        :return:
        """
        distances, indices = self.nn.kneighbors(queries)
        relevant_vectors = np.zeros_like(indices)
        for i in range(targets.shape[0]):
            relevant_vectors[i, :] = self.targets[indices[i, :]] == targets[i]
        return relevant_vectors

    def get_metrics(self, relevant_vectors, targets):
        """
        Evaluates the retrieval performance
        :param relevant_vectors: the relevant vectors for each query
        :param targets: labels of the queries
        :return:
        """
        # Calculate precisions per query
        precision = np.cumsum(relevant_vectors, axis=1) / np.arange(
            1, self.number_of_instances + 1)

        # Calculate interpolated precision
        for i in reversed(range(len(precision) - 1)):
            precision[:, i] = np.maximum(precision[:, i], precision[:, i + 1])

        # Calculate recall per query
        instances_per_query = np.zeros((targets.shape[0], 1))
        for i in range(targets.shape[0]):
            instances_per_query[i] = self.instances_per_target[targets[i]]
        recall = np.cumsum(relevant_vectors, axis=1) / instances_per_query

        # Calculate precision @ 11 recall point
        precision_at_recall_levels = np.zeros(
            (targets.shape[0], self.recall_levels.shape[0]))
        for i in range(len(self.recall_levels)):
            idx = np.argmin(np.abs(recall - self.recall_levels[i]), axis=1)
            precision_at_recall_levels[:, i] = precision[
                np.arange(targets.shape[0]), idx]

        # Calculate fine-grained precision
        precision_at_fine_recall_levels = np.zeros(
            (targets.shape[0], self.fine_recall_levels.shape[0]))
        for i in range(len(self.fine_recall_levels)):
            idx = np.argmin(np.abs(recall - self.fine_recall_levels[i]),
                            axis=1)
            precision_at_fine_recall_levels[:, i] = precision[
                np.arange(targets.shape[0]), idx]

        # Calculate the means values of the metrics
        ap = np.mean(precision_at_recall_levels, axis=1)
        m_ap = np.mean(ap)
        interpolated_precision = np.mean(precision, axis=0)
        interpolated_fine_precision = np.mean(precision_at_fine_recall_levels,
                                              axis=0)

        return m_ap, interpolated_precision, interpolated_fine_precision, self.fine_recall_levels,

    def evaluate(self, queries, targets, batch_size=128):
        """
        Evaluates the performance of the database using the following metrics: interpolated map, interpolated precision,
        and precision-recall curve
        :param queries: the queries
        :param targets: the labels
        :return: the evaluated metrics
        """
        n_batches = len(targets) // batch_size
        m_ap, fine_precision, raw_precision = None, None, None

        for i in tqdm(range(n_batches)):
            cur_queries = queries[i * batch_size:(i + 1) * batch_size]
            cur_targets = targets[i * batch_size:(i + 1) * batch_size]

            relevant_vectors = self.get_binary_relevances(
                cur_queries, cur_targets)
            (c_m_ap, c_raw_precision, c_fine_precision, self.fine_recall_levels,) = \
                self.get_metrics(relevant_vectors, cur_targets)

            if m_ap is None:
                m_ap = c_m_ap * batch_size
                fine_precision = c_fine_precision * batch_size
                raw_precision = c_raw_precision * batch_size
            else:
                m_ap += c_m_ap * batch_size
                fine_precision += c_fine_precision * batch_size
                raw_precision += c_raw_precision * batch_size

        if batch_size * n_batches < len(targets):
            cur_queries = queries[batch_size * n_batches:]
            cur_targets = targets[batch_size * n_batches:]

            relevant_vectors = self.get_binary_relevances(
                cur_queries, cur_targets)
            (c_m_ap, c_raw_precision, c_fine_precision, self.fine_recall_levels,) = \
                self.get_metrics(relevant_vectors, cur_targets)

            m_ap += c_m_ap * len(cur_targets)
            fine_precision += c_fine_precision * len(cur_targets)
            raw_precision += c_raw_precision * len(cur_targets)

        m_ap = m_ap / float(len(targets))
        fine_precision = fine_precision / float(len(targets))
        raw_precision = raw_precision / float(len(targets))

        results = {
            'map': m_ap,
            'precision': fine_precision,
            'recall_levels': self.fine_recall_levels,
            'raw_precision': raw_precision
        }

        return results
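
A short usage sketch for the `Database` retrieval evaluator above; the vectors and labels below are random placeholders, and the module's own imports (numpy, tqdm) are assumed to be in scope:

# Hypothetical usage: index random vectors and evaluate a batch of random queries.
rng = np.random.RandomState(0)
database = Database(rng.rand(1000, 32), rng.randint(0, 10, size=1000), metric='cosine')
retrieval_results = database.evaluate(rng.rand(200, 32), rng.randint(0, 10, size=200), batch_size=64)
print(retrieval_results['map'])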
    def explain(self, out_num):
        x = self.x
        y = self.y
        inliners = x[y == 1]
        outliers = x[y == -1]

        # Resample outlier to form cluster.
        n_dimens = len(outliers[0])
        n_samples = 5
        resampled = np.random.normal(outliers[out_num].tolist(),
                                     [0.01 for _ in range(n_dimens)],
                                     (n_samples, n_dimens))
        resampled = np.append(resampled, [outliers[out_num, :]], axis=0)

        # Find context of outlier.
        n_neigh = 30
        nbrs = NearestNeighbors(n_neighbors=n_neigh,
                                algorithm='kd_tree',
                                metric='euclidean').fit(inliners)
        distances, neighbors = nbrs.kneighbors(outliers)

        # Clustering outlier context.
        out_neigh = neighbors[out_num]
        context = inliners[out_neigh, :]
        db = DBSCAN(eps=0.1, min_samples=3).fit(context)
        labels = db.labels_

        # Number of clusters in labels, ignoring noise if present.
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

        if n_clusters_ == 0:
            return "Could not find enough clusters in outlier context."

        # Mask for outlier neighbors' clusters.
        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True

        n_features = len(x[0])
        s_il = np.empty([n_clusters_, n_features])
        cluster_sizes = np.empty([n_clusters_])
        unique_labels = set(labels)

        for l in unique_labels:
            if l != -1:
                class_member_mask = (labels == l)
                cluster = context[class_member_mask & core_samples_mask]
                cluster_sizes[l] = (len(cluster))

                # Compute coefficients.
                X = np.concatenate((cluster, resampled), axis=0)
                y = [1 if i < len(cluster) else 0 for i in range(len(X))]
                clf = LinearSVC(penalty="l1",
                                dual=False,
                                random_state=0,
                                tol=1e-5)
                clf.fit(X, y)

                # Find nearest neighbors in cluster.
                # should the nearest neighbor be from the cluster?
                nbrs = NearestNeighbors(n_neighbors=2,
                                        algorithm='kd_tree',
                                        metric='euclidean').fit(cluster)
                cluster_nbrs = nbrs.kneighbors(
                    cluster, return_distance=False)[:,
                                                    1]  # [0] -- the same point

                for m in range(n_features):
                    dist = [
                        abs(context[n][m] - cluster[i][m])
                        for i, n in enumerate(cluster_nbrs)
                    ]
                    gamma = sum(dist) / len(cluster)
                    s_il[l][m] = abs(clf.coef_[0][m]) / gamma

        importance = np.empty([n_features])

        for m in range(n_features):
            s_sum = 0
            for l in unique_labels:
                if l != -1:
                    s_sum += cluster_sizes[l] * s_il[l][m]
            importance[m] = s_sum / n_neigh
        return importance
Example #52
0
# In[19]:

ir.describe()  #showing the fitted data

# In[20]:

# creating a test data
import numpy as np
test = np.array([5.4, 2, 2, 2.3])
test1 = test.reshape(1, -1)
test1.shape

# In[21]:

nn.kneighbors(test1, 5)

# In[22]:

ir.iloc[[98, 93, 57, 60, 79]]  # displaying specific rows using iloc()
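
# The cells above use `ir` (an iris DataFrame) and `nn` (a fitted NearestNeighbors model)
# without showing how they were created. A plausible reconstruction of those earlier setup
# cells, offered as an assumption rather than the notebook's actual code:
import pandas as pd
from sklearn import datasets
from sklearn.neighbors import NearestNeighbors

iris = datasets.load_iris()
ir = pd.DataFrame(iris.data, columns=iris.feature_names)
nn = NearestNeighbors(n_neighbors=5).fit(ir)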

# ### KNeighborsClassifier Algorithm

# In[23]:

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets

n_neighbors = 15
Example #53
0
class ActiveLearningWithCostEmbedding(QueryStrategy):
    """Active Learning with Cost Embedding (ALCE)

    Cost sensitive multi-class algorithm.
    Assume each class has at least one sample in the labeled pool.

    Parameters
    ----------
    cost_matrix : array-like, shape=(n_classes, n_classes)
        The ith row, jth column represents the cost of the ground truth being
        ith class and prediction as jth class.

    mds_params : dict, optional
        http://scikit-learn.org/stable/modules/generated/sklearn.manifold.MDS.html

    nn_params : dict, optional
        http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html

    embed_dim : int, optional (default: None)
        if is None, embed_dim = n_classes

    base_regressor : sklearn regressor

    random_state : {int, np.random.RandomState instance, None}, optional (default=None)
        If int or None, random_state is passed as parameter to generate
        np.random.RandomState instance. if np.random.RandomState instance,
        random_state is the random number generate.

    Attributes
    ----------
    nn_ : sklearn.neighbors.NearestNeighbors object instance

    Examples
    --------
    Here is an example of declaring a ActiveLearningWithCostEmbedding
    query_strategy object:

    .. code-block:: python

       import numpy as np
       from sklearn.svm import SVR

       from libact.query_strategies.multiclass import ActiveLearningWithCostEmbedding as ALCE

       cost_matrix = 2000. * np.random.rand(n_classes, n_classes)
       qs3 = ALCE(dataset, cost_matrix, SVR())

    References
    ----------
    .. [1] Kuan-Hao Huang, and Hsuan-Tien Lin. "A Novel Uncertainty Sampling Algorithm
           for Cost-sensitive Multiclass Active Learning", In Proceedings of the
           IEEE International Conference on Data Mining (ICDM), 2016
    """
    def __init__(self,
                 dataset,
                 cost_matrix,
                 base_regressor,
                 embed_dim=None,
                 mds_params={},
                 nn_params={},
                 random_state=None):
        super(ActiveLearningWithCostEmbedding, self).__init__(dataset)

        self.cost_matrix = cost_matrix
        self.base_regressor = base_regressor

        self.n_classes = len(cost_matrix)
        if embed_dim is None:
            self.embed_dim = self.n_classes
        else:
            self.embed_dim = embed_dim
        self.regressors = [
            copy.deepcopy(self.base_regressor) for _ in range(self.embed_dim)
        ]

        self.random_state_ = seed_random_state(random_state)

        self.mds_params = {
            'metric': False,
            'n_components': self.embed_dim,
            'n_uq': self.n_classes,
            'max_iter': 300,
            'eps': 1e-6,
            'dissimilarity': "precomputed",
            'n_init': 8,
            'n_jobs': 1,
            'random_state': self.random_state_
        }
        self.mds_params.update(mds_params)

        self.nn_params = {}
        self.nn_params.update(nn_params)
        self.nn_ = NearestNeighbors(n_neighbors=1, **self.nn_params)

        dissimilarity = np.zeros((2 * self.n_classes, 2 * self.n_classes))
        dissimilarity[:self.n_classes, self.n_classes:] = self.cost_matrix
        dissimilarity[self.n_classes:, :self.n_classes] = self.cost_matrix.T
        mds_ = MDSP(**self.mds_params)
        embedding = mds_.fit(dissimilarity).embedding_

        self.class_embed = embedding[:self.n_classes, :]
        self.nn_.fit(embedding[self.n_classes:, :])

    @inherit_docstring_from(QueryStrategy)
    def make_query(self):
        dataset = self.dataset
        unlabeled_entry_ids, pool_X = dataset.get_unlabeled_entries()
        # The input class should be 0-n_classes
        X, y = dataset.get_labeled_entries()

        pred_embed = np.zeros((len(pool_X), self.embed_dim))
        for i in range(self.embed_dim):
            self.regressors[i].fit(X, self.class_embed[y, i])
            pred_embed[:, i] = self.regressors[i].predict(pool_X)

        dist, _ = self.nn_.kneighbors(pred_embed)
        dist = dist[:, 0]

        ask_idx = self.random_state_.choice(
            np.where(np.isclose(dist, np.max(dist)))[0])
        return unlabeled_entry_ids[ask_idx]
Example #54
0
def k_distance(dataset, k):
    nbrs = NearestNeighbors(n_neighbors=k).fit(dataset)
    distances, indices = nbrs.kneighbors(dataset)
    return distances[:, k - 1].mean()
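
The mean k-distance returned above is often used as a rough starting point when choosing an eps value for density-based clustering; a hedged sketch of that use (the data and k below are illustrative):

# Hypothetical usage: derive an eps guess for DBSCAN from the mean 4-distance.
import numpy as np
from sklearn.cluster import DBSCAN

points = np.random.RandomState(0).rand(200, 2)
eps_guess = k_distance(points, k=4)
cluster_labels = DBSCAN(eps=eps_guess, min_samples=4).fit_predict(points)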
Example #55
0
%matplotlib

cells2 = pd.read_csv('CRC_clusters_neighborhoods_markers.csv')

tissue_col = 'spots'
neigh_col = 'neighborhood10'
patient_col = 'patients'
group_col = 'groups'
X = 'X:X'
Y = 'Y:Y'

# calculate neighbors for each spot
for spot in cells2[tissue_col].unique():
    tissue = cells2[cells2[tissue_col] == spot]
    fit = NearestNeighbors(n_neighbors=1).fit(tissue[[X, Y]].values)
    m = fit.kneighbors()[1]

    cells2.loc[tissue.index, 'neigh_neigh'] = tissue.iloc[m[:, 0], :][neigh_col].values
cells2['neigh_neigh'] = cells2['neigh_neigh'].astype(int)


# compute, for each patient in each tissue and neighborhood, the number of cells in that neighborhood
counts = cells2.groupby([group_col,patient_col,tissue_col,neigh_col]).apply(lambda x: len(x)).unstack()

# compute, for each patient in each tissue and neighborhood, how many of the cells in that neighborhood are next to a cell in the other neighborhood
neighs = cells2.groupby([group_col,patient_col,tissue_col,neigh_col]).apply(lambda x:x['neigh_neigh'].value_counts(sort = False)).unstack()

#specify which neighborhoods you want to calculate
neigh1,neigh2 = 0,4

# Comment out if you wish to average each spot for each patient
Example #56
0
def res(jobfile):
    Resume_Vector = []
    Ordered_list_Resume = []
    Ordered_list_Resume_Score = []
    LIST_OF_FILES = []
    LIST_OF_FILES_PDF = []
    LIST_OF_FILES_DOC = []
    LIST_OF_FILES_DOCX = []
    Resumes = []
    Temp_pdf = []
    os.chdir('./Original_Resumes')
    for file in glob.glob('**/*.pdf', recursive=True):
        LIST_OF_FILES_PDF.append(file)
    for file in glob.glob('**/*.doc', recursive=True):
        LIST_OF_FILES_DOC.append(file)
    for file in glob.glob('**/*.docx', recursive=True):
        LIST_OF_FILES_DOCX.append(file)

    LIST_OF_FILES = LIST_OF_FILES_DOC + LIST_OF_FILES_DOCX + LIST_OF_FILES_PDF
    # LIST_OF_FILES.remove("antiword.exe")
    print("This is LIST OF FILES")
    print(LIST_OF_FILES)

    # print("Total Files to Parse\t" , len(LIST_OF_PDF_FILES))
    print("####### PARSING ########")
    for nooo, i in enumerate(LIST_OF_FILES):
        Ordered_list_Resume.append(i)
        Temp = i.split(".")
        if Temp[1] == "pdf" or Temp[1] == "Pdf" or Temp[1] == "PDF":
            try:
                print("This is PDF", nooo)
                with open(i, 'rb') as pdf_file:
                    read_pdf = PyPDF2.PdfFileReader(pdf_file)
                    # page = read_pdf.getPage(0)
                    # page_content = page.extractText()
                    # Resumes.append(Temp_pdf)

                    number_of_pages = read_pdf.getNumPages()
                    for page_number in range(number_of_pages):

                        page = read_pdf.getPage(page_number)
                        page_content = page.extractText()
                        page_content = page_content.replace('\n', ' ')
                        # page_content.replace("\r", "")
                        Temp_pdf = str(Temp_pdf) + str(page_content)
                        # Temp_pdf.append(page_content)
                        # print(Temp_pdf)
                    Resumes.extend([Temp_pdf])
                    Temp_pdf = ''
                    # f = open(str(i)+str("+") , 'w')
                    # f.write(page_content)
                    # f.close()
            except Exception as e:
                print(e)
        if Temp[1] == "doc" or Temp[1] == "Doc" or Temp[1] == "DOC":
            print("This is DOC", i)

            try:
                a = textract.process(i)
                a = a.replace(b'\n', b' ')
                a = a.replace(b'\r', b' ')
                b = str(a)
                c = [b]
                Resumes.extend(c)
            except Exception as e:
                print(e)

        if Temp[1] == "docx" or Temp[1] == "Docx" or Temp[1] == "DOCX":
            print("This is DOCX", i)
            try:
                a = textract.process(i)
                a = a.replace(b'\n', b' ')
                a = a.replace(b'\r', b' ')
                b = str(a)
                c = [b]
                Resumes.extend(c)
            except Exception as e:
                print(e)

        if Temp[1] == "ex" or Temp[1] == "Exe" or Temp[1] == "EXE":
            print("This is EXE", i)
            pass

    print("Done Parsing.")

    Job_Desc = 0
    LIST_OF_TXT_FILES = []
    os.chdir('../Job_Description')
    f = open(jobfile, 'r')
    text = f.read()

    try:
        tttt = str(text)
        tttt = summarize(tttt, word_count=100)
        text = [tttt]
    except:
        text = 'None'

    f.close()

    vectorizer = TfidfVectorizer(stop_words='english')
    # print(text)
    vectorizer.fit(text)
    vector = vectorizer.transform(text)

    Job_Desc = vector.toarray()
    # print("\n\n")
    # print("This is job desc : " , Job_Desc)

    os.chdir('../')
    for i in Resumes:

        text = i
        tttt = str(text)
        try:
            tttt = summarize(tttt, word_count=100)
            text = [tttt]
            vector = vectorizer.transform(text)

            aaa = vector.toarray()
            Resume_Vector.append(vector.toarray())
        except:
            pass
    # print(Resume_Vector)

    for i in Resume_Vector:

        samples = i
        neigh = NearestNeighbors(n_neighbors=1)
        neigh.fit(samples)
        NearestNeighbors(algorithm='auto', leaf_size=30)

        Ordered_list_Resume_Score.extend(
            neigh.kneighbors(Job_Desc)[0][0].tolist())

    Z = [
        x
        for _, x in sorted(zip(Ordered_list_Resume_Score, Ordered_list_Resume))
    ]
    print(Ordered_list_Resume)
    print(Ordered_list_Resume_Score)
    flask_return = []
    # for n,i in enumerate(Z):
    #     print("Rankkkkk\t" , n+1, ":\t" , i)

    for n, i in enumerate(Z):
        # print("Rank\t" , n+1, ":\t" , i)
        # flask_return.append(str("Rank\t" , n+1, ":\t" , i))
        name = getfilepath(i)
        #name = name.split('.')[0]
        rank = n + 1
        res = ResultElement(rank, name)
        flask_return.append(res)
        # res.printresult()
        print(f"Rank{res.rank+1} :\t {res.filename}")
    return flask_return
Example #57
0
x = SimilarityVectors.cosine_vecs(utt)

top_n = 5

random_idc = set()
while (len(random_idc) != top_n):
    random_idc.add(random.randint(0, len(x) - 1))
print('------------------------------------')
print(random_idc)
print('------------------------------------\n')
test = [x[y] for y in random_idc]

for alg in ['ball_tree', 'kd_tree']:

    print('Used Search Algorithm :', alg)

    nneighbors = NearestNeighbors(n_neighbors=top_n,
                                  algorithm=alg).fit(x)

    dist, idc = nneighbors.kneighbors([x[y] for y in random_idc])

    print()
    for i, idx in enumerate(random_idc):
        print('Looking for neighbors of : "', utt[idx], '"')
        for k, j in enumerate(idc[i]):
            print('Index :', j, '\nDistance :', dist[i][k], '\n' + utt[j])
        print()

    print('Indices :', idc, '\nDistances :', dist)
    print('------------------------------------\n')
v = np.zeros([const.num_particles, 3]).astype(np.float32)
plt.ion()
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
for iteration in range(100):
    timer[iteration][0] = time.time()
    plt.xlim(-10, 10)
    plt.ylim(-10, 10)
    ax.set_zlim(-10, 10)
    ax.scatter3D(r1.T[0], r1.T[1], r1.T[2])
    plt.draw()
    plt.pause(0.000001)
    ax.cla()
    timer[iteration][1] = time.time()
    nn.fit(r1)
    neighbors = nn.kneighbors(r1,
                              return_distance=False).flatten().astype(np.int32)

    timer[iteration][2] = time.time()
    r_dash = r1.flatten().astype(np.float32)
    v_n = v.flatten().astype(np.float32)
    density = np.zeros([const.num_particles,
                        const.num_neighbhours]).flatten().astype(np.float32)
    force = np.zeros([const.num_particles, const.num_neighbhours,
                      3]).astype(np.float32)
    color_field_lap_val = np.zeros(
        [const.num_particles, const.num_neighbhours]).astype(np.float32)
    color_field_grad_val = np.zeros(
        [const.num_particles, const.num_neighbhours, 3]).astype(np.float32)

    timer[iteration][3] = time.time()
    calc_density(drv.Out(density),
Example #59
0
def clusterHead(left_eyes, right_eyes, fullHeads=False):
    #We use NN to cluster head objects: eyes and nose, assuming there is at least one pair of eyes
    if not left_eyes or not right_eyes:
        heads = {}
        if fullHeads:
            for headsita in list(range(len(left_eyes))):
                newHead = head(left_eye=headsita)
                heads[headsita] = newHead
            for headsita in list(range(len(right_eyes))):
                newHead = head(right_eye=headsita)
                heads[headsita] = newHead
    elif len(left_eyes) > 1:
        neigh = NearestNeighbors(n_neighbors=2)
        neigh.fit(left_eyes)
        distances, from_right_to_left = neigh.kneighbors(right_eyes)
        index_taken = {}  # maps index_left_eye -> [index_right_eye, distance]
        queue = list(range(len(right_eyes)))
        heads = {}
        j = -1
        # we examine the terms and correct previous choices
        while queue:
            index_right_eye = queue[0]
            queue = queue[1:]
            # we grab the closest left eye to the current right eye
            index_left_eye = from_right_to_left[index_right_eye][0]
            if index_left_eye == [] and fullHeads:
                # if the point is isolated
                newHead = head(right_eye=index_right_eye)
                heads[j] = newHead
                j = j - 1

            elif index_left_eye not in index_taken:
                #new index
                newHead = head(left_eye=index_left_eye,
                               right_eye=index_right_eye,
                               distance=distances[index_right_eye][0])
                heads[index_left_eye] = newHead
                index_taken[index_left_eye] = [
                    index_right_eye, distances[index_right_eye][0]
                ]
            else:
                # we need to compare distances
                newdist = distances[index_right_eye][0]
                olddist = index_taken[index_left_eye][1]
                if olddist < newdist:
                    # wrong left eye
                    index_left_eye = from_right_to_left[index_right_eye][1]
                    newdist = distances[index_right_eye][1]
                    olddist = index_taken.get(index_left_eye, [[], None])[1]
                    if index_left_eye not in index_taken:
                        newHead = head(left_eye=index_left_eye,
                                       right_eye=index_right_eye,
                                       distance=distances[index_right_eye][1])
                        heads[index_left_eye] = newHead
                        index_taken[index_left_eye] = [
                            index_right_eye, distances[index_right_eye][1]
                        ]
                    elif olddist < newdist and fullHeads:  # olddist<newdist
                        newHead = head(right_eye=index_right_eye)
                        heads[j] = newHead
                        j = j - 1
                    else:
                        queue = queue + [index_taken[index_left_eye][0]]
                        newHead = head(left_eye=index_left_eye,
                                       right_eye=index_right_eye,
                                       distance=newdist)
                        heads[index_left_eye] = newHead
                        index_taken[index_left_eye] = [
                            index_right_eye, distances[index_right_eye][1]
                        ]
                else:
                    # correct left eye already taken
                    queue = queue + [index_taken[index_left_eye][0]]
                    newHead = head(left_eye=index_left_eye,
                                   right_eye=index_right_eye,
                                   distance=newdist)
                    heads[index_left_eye] = newHead
                    index_taken[index_left_eye] = [index_right_eye, newdist]
        if fullHeads:
            missingheads = set(range(len(left_eyes))).difference(index_taken)
        else:
            missingheads = []
        for headsita in missingheads:
            newHead = head(left_eye=headsita)
            heads[headsita] = newHead
    else:
        neigh = NearestNeighbors(n_neighbors=1)
        neigh.fit(right_eyes)
        distances, from_left_to_right = neigh.kneighbors(left_eyes)
        newHead = head(left_eye=0, right_eye=from_left_to_right[0][0])
        heads = {0: newHead}
    return heads
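The head class used by clusterHead is defined elsewhere in that project; the sketch below (with made-up eye coordinates) only shows the core matching step: fit on the left-eye centers and query with the right-eye centers, so each right eye gets its closest and second-closest left eye.

# Sketch of the pairing step only (hypothetical coordinates; head class omitted).
from sklearn.neighbors import NearestNeighbors

left_eyes = [[10, 20], [50, 22], [90, 25]]    # assumed (x, y) eye centers
right_eyes = [[18, 20], [58, 23], [97, 24]]

neigh = NearestNeighbors(n_neighbors=2).fit(left_eyes)
distances, from_right_to_left = neigh.kneighbors(right_eyes)

for i, (cands, dists) in enumerate(zip(from_right_to_left, distances)):
    print(f"right eye {i}: best left eye {cands[0]} (d={dists[0]:.1f}), "
          f"runner-up {cands[1]} (d={dists[1]:.1f})")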
Example #60
0
def findNextAstar(image, giTarg, profile, thresh):
    open = []
    closed = []

    query = getFeatures(image)

    start = Node(0, 0, giTarg, query, -1, 0)
    open.append(start)

    while open:
        q = findLowestF(open)
        open.remove(q)

        nbors = NearestNeighbors(n_neighbors=5)
        nbors.fit(knndata[:, 0:12])
        knn = nbors.kneighbors([q.index[0:12]])
        knn = knn[1][0]
        dirs = [[0, 0], [0, 0], [0, 0], [0, 0]]
        for n in knn:
            ug = findGIList(n)
            for d in range(0, 4):
                dirs[d][0] += 1
                dirs[d][1] += ug[d]

        successorGain = [d[1] / (d[0] - .00001) for d in dirs]

        print(successorGain)
        #exit()

        for i in range(4):
            index = random.randint(0, len(knn) - 1)

            sI = findFeatures(knn[index])  # use the randomly chosen neighbor index
            gi = float(q.index[10])
            sG = q.g + (gi / successorGain[i])

            sH = ((q.g - gi) / gi) * (sum(profile) / len(profile))
            giAcc = q.gi + gi
            sF = sG + sH
            sP = q.index

            print(sG)

            #print(giAcc)

            if (giAcc > giTarg):
                return [sG, giAcc]  #lowest E for direction, don't return

            else:
                skip = False
                for c in closed:
                    if ((c.index[0:12] == sI[0:12]).all() and c.f <= sF):
                        skip = True
                if not skip:
                    s = Node(sF, sG, sH, sI, sP, giAcc)
                    open.append(s)

        #print([len(open),len(closed)])
        closed.append(q)
    highest = 0
    ret = None
    for i in closed:
        if (i.gi > highest):
            highest = i.gi
            ret = [i.g, i.gi]
    return ret
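The successor-generation step above relies on the shape of what kneighbors returns; the sketch below (with a random stand-in for knndata, which is defined elsewhere in that project) shows why the code takes [1][0]: element [1] is the index array and [0] selects the row for the single query.

# Sketch with stand-in data (knndata and the 12-feature layout are assumptions).
import numpy as np
from sklearn.neighbors import NearestNeighbors

knndata = np.random.rand(50, 12)
query = knndata[7]

nbors = NearestNeighbors(n_neighbors=5).fit(knndata)
dist, idx = nbors.kneighbors([query])   # each has shape (1, 5): one row per query
knn = idx[0]                            # equivalent to nbors.kneighbors([query])[1][0]
print(knn)                              # indices of the 5 nearest rows, including 7 itself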