Example #1
    def _compute_tolerance_distance(self, sample, symbol):
        """Compute the distance tolerance.

        Computes the distance tolerance in feature-vector space below
        which a sample is considered similar to the symbol, then saves
        it to the proper file.

        Args:
            sample (list of lists of int): feature vectors on which
                                           the tolerance is based.
            symbol (String): name of the symbol to compute the tolerance for
        """
        nbrs = NearestNeighbors(n_neighbors=3, algorithm='ball_tree')\
            .fit(sample)
        distances, _ = nbrs.kneighbors(sample)
        print(distances)
        means = []
        for distances_row in distances:
            row = np.delete(distances_row, [0])
            means.append(np.mean(row))
        means.sort()
        critical_index = math.ceil(0.8 * len(means)) - 1
        tolerance_distance = means[critical_index] * 1.3
        print("tolerance distance: %.16f" % tolerance_distance)

        tolerance_distance_path = \
            Classifier._get_file_path(
                self.files[DISTANCE_TOLERANCE_FILE], symbol)

        with open(tolerance_distance_path, 'w') as handle:
            handle.write("%.16f\n" % tolerance_distance)

        return tolerance_distance
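The method above depends on the surrounding Classifier class and its file layout. Below is a self-contained sketch of just the tolerance computation on made-up feature vectors, with the file-writing step dropped.

# Sketch only: toy data, no Classifier/file plumbing.
import math
import numpy as np
from sklearn.neighbors import NearestNeighbors

sample = np.random.RandomState(0).rand(25, 8)   # hypothetical feature vectors
distances, _ = NearestNeighbors(n_neighbors=3, algorithm='ball_tree').fit(sample).kneighbors(sample)
means = sorted(np.mean(row[1:]) for row in distances)   # drop the zero self-distance
critical_index = math.ceil(0.8 * len(means)) - 1        # 80th-percentile index
tolerance_distance = means[critical_index] * 1.3
print("tolerance distance: %.16f" % tolerance_distance)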
Example #2
def embedding_refinement(data_matrix_highdim,
                         data_matrix_lowdim,
                         n_neighbors=8,
                         emb_quality_th=1,
                         n_iter=20):
    # extract neighbors list for high dimensional case
    neigh_high = NearestNeighbors(n_neighbors=n_neighbors)
    neigh_high.fit(data_matrix_highdim)
    neighbors_list_highdim = neigh_high.kneighbors(data_matrix_highdim, return_distance=False)
    n_instances = data_matrix_lowdim.shape[0]
    logger.debug('refinements max num iters: %d  k in neqs: %d num insts: %d' %
                 (n_iter, n_neighbors, n_instances))
    for it in range(n_iter):
        average_embedding_quality_score, scores = knn_quality_score(data_matrix_lowdim,
                                                                    neighbors_list_highdim,
                                                                    n_neighbors)
        # select low quality embedded instances
        ids = [i for i, s in enumerate(scores)
               if relative_quality(i, scores, neighbors_list_highdim) <= emb_quality_th]
        # find average position of true knns and move point there
        new_data_matrix_lowdim = compute_average(ids, data_matrix_lowdim, neighbors_list_highdim)
        new_average_embedding_quality_score, new_scores = knn_quality_score(new_data_matrix_lowdim,
                                                                            neighbors_list_highdim,
                                                                            n_neighbors)
        if new_average_embedding_quality_score > average_embedding_quality_score:
            data_matrix_lowdim = new_data_matrix_lowdim
            n_refinements = len(ids)
            frac_refinements = float(n_refinements) / n_instances
            logger.debug('r %.2d neqs: %.3f \t %.2f (%d insts)' %
                         (it + 1, new_average_embedding_quality_score,
                          frac_refinements, n_refinements))
        else:
            break
    return data_matrix_lowdim
Example #3
    def resample(self):
        # Start with the minority class
        minx = self.x[self.y == self.minc]
        miny = self.y[self.y == self.minc]

        # Finding nns
        from sklearn.neighbors import NearestNeighbors

        print("Finding the %i nearest neighbours..." % self.k, end = "")

        NN = NearestNeighbors(n_neighbors = self.k + 1)
        NN.fit(minx)
        nns = NN.kneighbors(minx, return_distance=False)[:, 1:]

        print("done!")

        # Creating synthetic samples
        print("Creating synthetic samples...", end="")
        sx, sy = make_samples(minx, minx, self.minc, nns, int(self.ratio * len(miny)), random_state=self.rs)
        print("done!")

        # Concatenate the newly generated samples to the original data set
        ret_x = concatenate((self.x, sx), axis = 0)
        ret_y = concatenate((self.y, sy), axis = 0)

        return ret_x, ret_y
Example #4
class ContentBased(object):
    """
    Article recommendation model based on each article's most relevant tags.
    The model vectorizes every article so that the similarity between articles can be computed.
    """
    def __init__(self, stop_words=None, token_pattern=None, metric='cosine', n_neighbors=5):
        if stop_words is None:
            stop_words =  stopwords.words("english")
            
        if token_pattern is None:
            token_pattern = '(?u)\\b[a-zA-Z]\\w\\w+\\b'
            
        self.tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words, token_pattern=token_pattern)
        self.nearest_neigbors = NearestNeighbors(metric=metric, n_neighbors=n_neighbors, algorithm='brute')
        
    def fit(self, datos, columna_descripcion):
        """
        Entrenamos el modelo:
        1/ Vectorizacion de cada articulo (Extracción y ponderación de atributos)
        2/ Calculamos los articulos mas cercanos
        """
        self.datos = datos
        datos_por_tags = self.tfidf_vectorizer.fit_transform(datos[columna_descripcion])        
        self.nearest_neigbors.fit(datos_por_tags)
        
    def predict(self, descripcion):
        """
        Devuelve los articulos mas parecidos a la descripcion propuesta
        """
        descripcion_tags = self.tfidf_vectorizer.transform(descripcion)        
        if descripcion_tags.sum() == 0:
            return pd.DataFrame(columns=self.datos.columns)
        else:
            _, indices = self.nearest_neigbors.kneighbors(descripcion_tags)
            return self.datos.iloc[indices[0], :]
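A minimal usage sketch for the ContentBased class above, assuming the imports it relies on (pandas as pd, nltk's stopwords, TfidfVectorizer, NearestNeighbors) are available and the nltk stopword corpus is downloaded; the DataFrame and query text are made up for illustration.

import pandas as pd

articles = pd.DataFrame({
    'title': ['Beach holidays', 'Deep learning intro', 'Mountain hiking'],
    'tags': ['sun beach sea sand holiday',
             'neural network training gradient descent',
             'mountain trail hiking boots outdoors'],
})

recommender = ContentBased(n_neighbors=2)
recommender.fit(articles, 'tags')                        # vectorize the 'tags' column and index it
similar = recommender.predict(['sea and sand holiday'])  # DataFrame with the closest articles
print(similar['title'].tolist())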
Example #5
def find_k_neighbors(points, neighbor_number=5):
    from sklearn.neighbors import NearestNeighbors
    import numpy as np
    X = np.array(points)
    neighbors = NearestNeighbors(n_neighbors=neighbor_number + 1, algorithm='ball_tree').fit(X)
    distances, indices = neighbors.kneighbors(X)
    return [[str(point), list([str(x) for x in indices[point][1:]])] for point in range(len(points))]
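Hypothetical toy call for find_k_neighbors: five 2-D points, asking for each point's 2 nearest neighbours (indices are returned as strings).

pts = [[0, 0], [0, 1], [1, 0], [5, 5], [5, 6]]
print(find_k_neighbors(pts, neighbor_number=2))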
Example #6
    def sample(s):
        if s.data is None:
            raise ValueError('data not loaded.')
        mdl = NearestNeighbors(n_neighbors=s.k, n_jobs=-1)
        minoX = s.X[s.y == s.minolab]
        majX = s.X[s.y == s.majlab]
        mdl.fit(minoX)
        _, nei_table = mdl.kneighbors()

        generated = None
        for cnt, nei_idx in enumerate(nei_table):
            x = minoX[cnt]
            if s.rate >= 0.5 * s.k:
                nei = minoX[np.random.choice(nei_idx, int(s.rate))]
                new = x + np.random.rand(int(s.rate), 1) * (nei - x)

            else:
                nei = minoX[nei_idx]
                new = x + np.random.rand(s.k, 1) * (nei - x)
                # each of the synthesized k points has N/k * 100 % probability to be chosen
                new = new[np.random.rand(s.k) > s.rate * 1.0 / s.k]
            if generated is None:
                generated = new
            else:
                generated = np.vstack((generated, new))
        # number of generated instances
        N = len(generated)
        ret = np.hstack((np.vstack((minoX, generated, majX)),
                         np.array([s.minolab] * (minoX.shape[0] + N) + [s.majlab] * majX.shape[0])[:, None]))
        np.random.shuffle(ret)
        return ret
Example #7
def test_kernel_density_sampling(n_samples=100, n_features=3):
    rng = np.random.RandomState(0)
    X = rng.randn(n_samples, n_features)

    bandwidth = 0.2

    for kernel in ['gaussian', 'tophat']:
        # draw a tophat sample
        kde = KernelDensity(bandwidth, kernel=kernel).fit(X)
        samp = kde.sample(100)
        assert_equal(X.shape, samp.shape)

        # check that samples are in the right range
        nbrs = NearestNeighbors(n_neighbors=1).fit(X)
        dist, ind = nbrs.kneighbors(X, return_distance=True)

        if kernel == 'tophat':
            assert np.all(dist < bandwidth)
        elif kernel == 'gaussian':
            # 5 standard deviations is safe for 100 samples, but there's a
            # very small chance this test could fail.
            assert np.all(dist < 5 * bandwidth)

    # check unsupported kernels
    for kernel in ['epanechnikov', 'exponential', 'linear', 'cosine']:
        kde = KernelDensity(bandwidth, kernel=kernel).fit(X)
        assert_raises(NotImplementedError, kde.sample, 100)

    # non-regression test: used to return a scalar
    X = rng.randn(4, 1)
    kde = KernelDensity(kernel="gaussian").fit(X)
    assert_equal(kde.sample().shape, (1, 1))
Example #8
def findKNN(frequencyVector, newVector):
    samples = np.array(frequencyVector)
    neigh = NearestNeighbors(n_neighbors = 5, metric = "euclidean")
    neigh.fit(samples)
    indexList = neigh.kneighbors(newVector, return_distance = False).tolist()

    return indexList
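Toy call for findKNN, assuming numpy and NearestNeighbors are imported as in the snippet; note the query is passed as a 2-D array-like (one row).

freq_vectors = [[1, 0, 2], [0, 1, 1], [2, 2, 0], [1, 1, 1], [0, 0, 3], [3, 1, 0]]
query = [[1, 0, 1]]                    # one query row, 2-D shape
print(findKNN(freq_vectors, query))    # indices of the 5 closest frequency vectors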
Example #9
def random_forest_single_predict(test_filename, name, feature_file, train_file, k):
    name_list, data = readfile_real_name(test_filename)
    print 'reading file...'
    test_data = data[name_list.index(name)]
    with open(train_file, 'rb') as f:
        clf = cPickle.load(f)
    print 'done'
    result_rate = (clf.predict_proba(test_data))[0]
    class_name = clf.classes_
    print name
    num = map(get_num, result_rate)
    name_list, feature_list = readfile_real_name_group(feature_file, class_name, num)
    neigh = NearestNeighbors()
    neigh.fit(feature_list)
    kneighbors_result_list = neigh.kneighbors(test_data, k, False)[0]
    print kneighbors_result_list
    for x in kneighbors_result_list:
        print name_list[x]
    classification_result = []
    average_list = []
    real_name = (name.split('_'))[0]
    counter = Counter(kneighbors_result_list)
    if real_name == name_list[counter.most_common(1)[0][0]].split('_')[0]:
        classification_result.append(1)
    else:
        classification_result.append(0)
    num = 0
    for i in kneighbors_result_list:
        if (name_list[i].split('_'))[0] == real_name:
            num += 1
    average_list.append((float)(num) / (float)(k))
    print classification_result, average_list
    return classification_result, average_list
Example #10
    def resample(self):
        """
        :return:
            Return the data with majority samples that form a Tomek link
            removed.
        """

        from sklearn.neighbors import NearestNeighbors

        # Find the nearest neighbour of every point
        nn = NearestNeighbors(n_neighbors=2)
        nn.fit(self.x)
        nns = nn.kneighbors(self.x, return_distance=False)[:, 1]

        # Send the information to is_tomek function to get boolean vector back
        if self.verbose:
            print("Looking for majority Tomek links...")
        links = self.is_tomek(self.y, nns, self.minc, self.verbose)

        if self.verbose:
            print("Under-sampling "
                  "performed: " + str(Counter(self.y[logical_not(links)])))

        # Return data set without majority Tomek links.
        return self.x[logical_not(links)], self.y[logical_not(links)]
Example #11
def SMOTE(minority_samples, N, k):
    """
    The SMOTE algorithm, please refer to: [JAIR'02]SMOTE - Synthetic Minority Over-sampling Technique
    minority_samples The minority sample array
    N Amount of SMOTE N%
    k Number of nearest neighbors
    
    @return (N/100)*len(minority_samples) synthetic minority class samples
    """
    T = len(minority_samples) # number of minority samples
    if N < 100:
        T = int(N * 1.0 / 100 * T)
        N = 100
    N = int(N * 1.0 / 100)
    
    neigh = NearestNeighbors(n_neighbors = k, radius=1.0, algorithm='auto', leaf_size=30, p=2)
    neigh = neigh.fit(minority_samples)
    
    synthetic_samples = []
    for i in range(T):
        target_sample = minority_samples[i]
        tmp = neigh.kneighbors(target_sample, k, return_distance=False)
        nnarray = tmp[0]
        populate(minority_samples, N, k, i, nnarray, synthetic_samples)
        
    return np.array(synthetic_samples, float)
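The SMOTE function above relies on a populate() helper that is not shown here. As a hedge, here is a self-contained sketch of the same neighbour-interpolation idea (pick one of the k nearest minority neighbours and interpolate), not the author's exact implementation.

import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(0)
minority = rng.rand(20, 2)                         # toy minority samples
k = 3
nn = NearestNeighbors(n_neighbors=k + 1).fit(minority)
neighbour_idx = nn.kneighbors(minority, return_distance=False)[:, 1:]   # drop self

synthetic = []
for i, row in enumerate(minority):
    j = neighbour_idx[i, rng.randint(k)]           # random neighbour of sample i
    gap = rng.rand()                               # interpolation factor in [0, 1)
    synthetic.append(row + gap * (minority[j] - row))
synthetic = np.array(synthetic)
print(synthetic.shape)                             # (20, 2): one synthetic sample per original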
Example #12
def _wpca_analysis(L, C, intensities):
    """
    Determine the eccentricity of each cluster using weighted PCA (See
    Jolliffe 2002, 14.2.1). The smallest normalized explained variance
    is small for flat or filiform objects.

    - L is a numpy matrix (one point on each row)
    - C is the list of cluster centers (objects with x, y, z coordinates)
    - intensities are gray levels of each point

    No cluster assignment is used here: a ball of radius 10 around each
    center is used to find the cloud of points.
    """
    np.set_printoptions(threshold=50000)
    n_points, n_features = L.shape
    tee.log('WPCA - Fitting NearestNeighbors on', n_points, 'points')
    nbrs = NearestNeighbors(radius=10.0).fit(L)
    for i, c in enumerate(C):
        array_c = np.array([c.x, c.y, c.z])
        i_nbrs = nbrs.radius_neighbors([array_c], 10.0, return_distance=False)[0]
        points_within = L[i_nbrs]
        if len(points_within) < 64:  # too small set, there is no point in running PCA
            c.EVR = [0.499, 0.499, 0.002]
            c.last_variance = c.EVR[2]
        else:
            w = np.sqrt(intensities[i_nbrs]/255.0)
            wX = np.dot(np.diag(w), points_within)
            pca = sklearn.decomposition.PCA(n_components=3)
            X_r = pca.fit(wX).transform(wX)
            c.EVR = pca.explained_variance_ratio_
            c.last_variance = c.EVR[2]
        print('WPCA done on', i, '/', len(C), 'name=', c.name, 'EVR=', c.EVR)
Example #13
def k_nearest_neighbors_scores(k, eng_vec_dict, fr_vec_dict):
	eng_mat, fr_mat, index_map = build_parallel_mats_from_dicts(eng_vec_dict, fr_vec_dict, translation_dict)
	# k + 1 since we discard the top neighbor, which is itself
	neighbors_en = NearestNeighbors(n_neighbors=k+1, algorithm='ball_tree').fit(eng_mat)
	dist_en, indices_en = neighbors_en.kneighbors(eng_mat)
	neighbors_fr = NearestNeighbors(n_neighbors=k+1, algorithm='ball_tree').fit(fr_mat)
	dist_fr, indices_fr = neighbors_fr.kneighbors(fr_mat)
	# since we built the matrices in parallel, we know now that indices map to each other,
	# so we simply check the overlap of those to calculate precision and recall. 
	# calculate avg recall for k-recall
	avg_recall = 0.
	num_points = len(indices_en) + 0.
	knearest_map_en = dict()
	knearest_map_fr = dict()
	for i in range(0, int(num_points)):
		w_en = index_map[i][0]
		w_fr = index_map[i][1]
		index_set_en = set(indices_en[i][1:]) # should be size k
		index_set_fr = set(indices_fr[i][1:]) # should be size k
		if w_en not in knearest_map_en:
			knearest_map_en[w_en] = map(lambda z: index_map[z], index_set_en)
		if w_fr not in knearest_map_fr:
			knearest_map_fr[w_fr] = map(lambda z: index_map[z], index_set_fr)
		recall_count = sum(1 for i in index_set_fr if i in index_set_en)
		# precision = recall for this task
		recall = (recall_count + 0.)/len(index_set_en)
		avg_recall += recall
	return (avg_recall/num_points), knearest_map_en, knearest_map_fr
Example #14
def knn_find(train, test, k = 2):
    """find first K knn neighbors of test samples from train samples
    
    [Args]
    ----
    train: train data {array like, m x n, m samples, n features}
        list of sample, each sample are list of features.
        e.g. [[age = 18, weight = 120, height = 167],
              [age = 45, weight = 180, height = 173],
              ..., ]
        
    test: test data {array like, m x n, m samples, n features}
        data format is the same as train data
    
    k: number of neighbors
        how many neighbors you want to find
        
    [Returns]
    -------
    distances: list of distance of knn-neighbors from test data
        [[dist(test1, train_knn1), dist(test1, train_knn2), ...],
         [dist(test2, train_knn1), dist(test2, train_knn2), ...],
         ..., ]
    
    indices: list of indice of knn-neighbors from test data
        [[test1_train_knn1_index, test1_train_knn2_index, ...],
         [test2_train_knn1_index, test2_train_knn2_index, ...],
         ..., ]    
    """
    nbrs = NearestNeighbors(n_neighbors=k, algorithm="kd_tree").fit(train)  # use the "kd_tree" algorithm explicitly
    return nbrs.kneighbors(test)
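Toy call for knn_find using rows like those in its docstring, assuming NearestNeighbors is imported as above.

train = [[18, 120, 167], [45, 180, 173], [30, 150, 170]]
test = [[29, 149, 171]]
distances, indices = knn_find(train, test, k=2)
print(indices)    # indices of the two closest training rows for the query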
Example #15
def createSyntheticSamples(X,Y,nearestneigh,numNeighbors,majoritylabel,minoritylabel): 
    (Xminority,Xmajority) = partitionSamples(X,Y)
    numFeatures = Xminority.shape[1]
    Xreduced = pca(Xminority)
    numOrigMinority=len(Xminority)
    #reducedMinoritykmeans = KMeans(init='k-means++', max_iter=500,verbose=False,tol=1e-4,k=numCentroids, n_init=5, n_neighbors=3).fit(Xreduced)
    reducedNN = NearestNeighbors(nearestneigh, algorithm='auto')
    reducedNN.fit(Xreduced)
    #Xsyn=np.array([numOrigMinority,numNeighbors*numFeatures])
    trylist=[]
    #LOOPHERE  for EACH (minority) point...
    for i,row in enumerate(Xreduced):
        neighbor_index = reducedNN.kneighbors(row, return_distance=False) 
        closestPoints = Xminority[neighbor_index]
        #randomly choose one of the k nearest neighbors
        chosenNeighborsIndex = chooseNeighbor(neighbor_index,numNeighbors,i)
        chosenNeighbor = Xminority[chosenNeighborsIndex]
        #Calculate linear combination:        
        #Take the difference between the orig minority sample and its selected neighbor, where X[1,] is the orig point
        diff = Xminority[i,]-chosenNeighbor
        #Multiply this difference by a number between 0 and 1
        r = random.uniform(0,1)
        #Add it back to the orig minority vector and voila, this is the synthetic sample
        syth_sample =Xminority[i,:]+r*diff
        syth_sample2 = syth_sample.tolist()
        trylist.append(syth_sample2)
    Xsyn=np.asarray(trylist).reshape(numNeighbors*numOrigMinority,numFeatures)
    maj_col=majoritylabel*np.ones([Xmajority.shape[0],1])
    min_col=minoritylabel*np.ones([Xsyn.shape[0],1])
    syth_Y = np.concatenate((maj_col,min_col),axis=0)
    syth_X = np.concatenate((Xmajority,Xsyn),axis=0)
    if(syth_X.shape[0]!=syth_Y.shape[0]):
        raise Exception("dim mismatch between features matrix and response matrix")
    return (syth_X, syth_Y)
Example #16
def construct_A(X, k, binary=False):

    nbrs = NearestNeighbors(n_neighbors=1 + k).fit(X)
    if binary:
        return nbrs.kneighbors_graph(X)
    else:
        return nbrs.kneighbors_graph(X, mode='distance')
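Sketch of using construct_A on random data, assuming numpy and NearestNeighbors are imported: a binary k-NN connectivity graph versus a distance-weighted one.

import numpy as np
X = np.random.RandomState(0).rand(30, 5)
A_binary = construct_A(X, k=4, binary=True)    # sparse (30, 30) connectivity matrix
A_dist = construct_A(X, k=4, binary=False)     # same sparsity, weighted by distance
print(A_binary.shape, A_binary.nnz, A_dist.nnz)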
Example #17
def shift(ep):
    ids,x1,y1,mk,mj,x2,y2 = np.genfromtxt(ep,unpack=True)

    local_mask = np.in1d(ids,bid)
    lid,lx1,ly1,lx2,ly2  = np.transpose([ids,x1,y1,x2,y2])[local_mask].T

    loc_xy = np.transpose([lx2,ly2])
    nbrs   = NN(n_neighbors=vecinos, algorithm='auto').fit(loc_xy)

    coo_xy = np.transpose([x2,y2])
    dist, idx = nbrs.kneighbors(coo_xy)
    idx  = idx[:,1:]
    dist = dist[:,1:]

    ctx = np.zeros(x1.size)
    cty = np.zeros(y1.size)

    for i in range(x1.size):
        star_locales = loc_xy[idx[i]]
        ep1_x = lx1[idx[i]]
        ep1_y = ly1[idx[i]]

        poptx, pcovx = curve_fit(linear,star_locales.T,ep1_x)
        popty, pcovy = curve_fit(linear,star_locales.T,ep1_y)

        ctx[i] += linear([x2[i],y2[i]],*poptx)
        cty[i] += linear([x2[i],y2[i]],*popty)

    shift_x = x1 - ctx
    shift_y = y1 - cty

    hdr  = 'ID X Y MAG_K MAG_J PMX PMY'
    fmt  = '%d %.3f %.3f %.3f %.3f %f %f'
    data = np.transpose([ids,x1,y1,mk,mj,shift_x,shift_y])
    np.savetxt('./%s/%s' % (pm_folder,ep.split('/')[-1].replace('.mfma','.pm')), data, header=hdr, fmt=fmt)
Example #18
def test_connectivity_propagation():
    """
    Check that connectivity in the ward tree is propagated correctly during
    merging.
    """
    from sklearn.neighbors import NearestNeighbors

    X = np.array(
        [
            (0.014, 0.120),
            (0.014, 0.099),
            (0.014, 0.097),
            (0.017, 0.153),
            (0.017, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.152),
            (0.018, 0.149),
            (0.018, 0.144),
        ]
    )
    nn = NearestNeighbors(n_neighbors=10).fit(X)
    connectivity = nn.kneighbors_graph(X)
    ward = Ward(n_clusters=4, connectivity=connectivity)
    # If changes are not propagated correctly, fit crashes with an
    # IndexError
    ward.fit(X)
Example #19
class KDTrees:

    def __init__(self, nb_neighbours, leaf_size):
        self.nbrs = NearestNeighbors(n_neighbors=nb_neighbours, algorithm='ball_tree', metric = 'haversine', leaf_size=leaf_size)
    # Compute distance in time between two points on the map
    def mapDistance(self, x, y):
        if (len(x) > 2):
            return np.sum((x - y) ** 2)
        else:
            if(x[0] < y[0]):
                tmp = y
                y = x
                x = tmp
            pos1 = str(x[0]) + ", " + str(x[1])
            pos2 = str(y[0]) + ", " + str(y[1])
            timestamp = datetime.now()
            sec_to_add = 32 * 3600 + (timestamp - datetime(1970, 1, 1)).total_seconds() - 2*3600 - timestamp.hour * 3600 - timestamp.minute * 60 - timestamp.second
            traject = gmaps.directions(pos1, pos2, mode="transit", departure_time=timestamp.fromtimestamp(sec_to_add))
            try:
                print 'ok'
                return (traject[0]["legs"][0]["arrival_time"]["value"] - traject[0]["legs"][0]["departure_time"]["value"])
            except:
                print 'bug'
                return 1000000000


    def addPoints(self, points):
        self.nbrs.fit(points)

    def getNeighbours(self, points):
        return self.nbrs.kneighbors(points)
Example #20
def nearestN():
    X = [[125,1], [200,0], [70,0], [240,1], [114,0], [120,0], [264,1], [85,0], [150,0], [90,0]]
#    y = [ 0, 0, 0, 0, 1, 0, 0, 1, 0,1 ]
    model = NN(n_neighbors=1, radius=1)
    model.fit(X)
    y = [98.,0.]
    print model.kneighbors(y)
Example #21
    def _set_widths_nearest_neighbor(self):
        # Nearest neighbors contain the center itself, so find one more.
        nbrs = NearestNeighbors(n_neighbors=self.n_neighbors + 1, algorithm='ball_tree').fit(self.centers)
        for i in range(len(self.centers)):
            distances, indices = nbrs.kneighbors([self.centers[i]])
            width = sum(distances[0]) / (len(distances[0]) - 1)
            self.kernels[i].set_param(self.p / width)
Example #22
    def resample(self):
        """
        """

        # Start with the minority class
        underx = self.x[self.y == self.minc]
        undery = self.y[self.y == self.minc]

        # Import the k-NN classifier
        from sklearn.neighbors import NearestNeighbors

        # Create a k-NN to fit the whole data
        nn_obj = NearestNeighbors(n_neighbors=self.size_ngh)

        # Fit the whole dataset
        nn_obj.fit(self.x)

        idx_to_exclude = []
        # Loop over the other classes under picking at random
        for key in self.ucd.keys():

            # Get the sample of the current class
            sub_samples_x = self.x[self.y == key]

            # Get the samples associated
            idx_sub_sample = np.nonzero(self.y == key)[0]

            # Find the NN for the current class
            nnhood_idx = nn_obj.kneighbors(sub_samples_x, return_distance=False)

            # Get the labels corresponding to the neighbour indices
            nnhood_label = (self.y[nnhood_idx] == key)

            # Check which one are the same label than the current class
            # Make an AND operation through the three neighbours
            nnhood_bool = np.logical_not(np.all(nnhood_label, axis=1))

            # If this is the minority class, remove the majority samples (as in politics!!!! ;))
            if key == self.minc:
                # Get the index to exclude
                idx_to_exclude += nnhood_idx[np.nonzero(nnhood_label[np.nonzero(nnhood_bool)])].tolist()
            else:
                # Get the index to exclude
                idx_to_exclude += idx_sub_sample[np.nonzero(nnhood_bool)].tolist()

        # Create a vector with the sample to select
        sel_idx = np.ones(self.y.shape)
        sel_idx[idx_to_exclude] = 0

        # Get the samples from the majority classes
        sel_x = np.squeeze(self.x[np.nonzero(sel_idx), :])
        sel_y = self.y[np.nonzero(sel_idx)]

        underx = concatenate((underx, sel_x), axis=0)
        undery = concatenate((undery, sel_y), axis=0)

        if self.verbose:
            print("Under-sampling performed: " + str(Counter(undery)))

        return underx, undery
Example #23
def test_method(method, k = 10, tests=5):
    from sklearn.neighbors import NearestNeighbors
    t0 = time.time()
    nn = NearestNeighbors(leaf_size=data.shape[0]).fit(data)

    score = 0.0
    t_nn = 0.0
    t_meth = 0.0
    np.random.seed(0)

    for i in range(tests):
        d = data[np.random.randint(data.shape[0])]

        t0 = time.time()
        method_res = method(d, k)
        t_meth += time.time()-t0

        t0 = time.time()
        nn_res = nn.kneighbors(d, n_neighbors=k, return_distance=False)
        t_nn += time.time()-t0

        score += np.mean(np.in1d(nn_res, method_res))

    t_nn /= tests
    t_meth /= tests

    r1 = 'NN time: %1.10f method time: %1.10f speedup: %1.10f' % (t_nn, t_meth, t_nn/t_meth)

    r2 = '%1.2f%% overlap' % ((score/tests) * 100)
    return r1 + '\n' + r2
Example #24
def predict_appliance(home, appliance, feature):
    if home in all_homes[appliance]:
        home_to_pick=home
    else:
        home_to_pick=all_homes[appliance][0]
    print home_to_pick

    feature_dict = json.load(open("../data/output/sensitivity-numfeatures-allhomes/%s_%s_%d.json" %(appliance,feature, home_to_pick),"r"))
    f = feature_dict['f']
    k = feature_dict['k']
    clf = KNeighborsRegressor(n_neighbors=k)
    nn = NearestNeighbors(n_neighbors=k)
    df_new =df.copy()
    df_new = df_new.ix[all_homes[appliance]]
    df_new = df_new.ix[~df_new.index.isin([home])]
    #df_new = df_new.drop(home, axis=1)
    nn.fit(df_new[f].dropna())
    distances, indices = nn.kneighbors(df.ix[home][f])
    out = []
    nghbrs_list = df_new.index[indices].values[0]

    for month in range(1, 13):
        if len(nghbrs_list) > 1:
            out.append(df_new[["%s_%d" %(appliance, month) ]].ix[nghbrs_list].sum().values[0]/k)
        else:
            out.append(df_new[["%s_%d" %(appliance, month) ]].ix[nghbrs_list].values[0]/k)
    return out
Example #25
def knn_recommender():
    category_feature_matrix = pickle.load(open('bi_feature_matrix/category_feature_matrix', 'rb'))
    global_image_mapping = pickle.load(open('bi_feature_matrix/global_image_mapping', 'rb'))
    image_model = NearestNeighbors(n_neighbors=10, algorithm="auto").fit(category_feature_matrix)
    features_image, probs_image = generate_tags(
        "http://images.wisegeek.com/beach.jpg")

    # features_image = [u'bridge', u'water', u'river', u'reflection', u'no person', u'sunset', u'sky', u'travel',
    #                   u'evening', u'architecture', u'dawn', u'city', u'dusk', u'light', u'suspension', u'urban',
    #                   u'landscape', u'transportation system', u'suspension bridge', u'lake']
    #
    # probs_image = [0.9987363815307617, 0.9966945648193359, 0.9950028657913208, 0.9752582311630249, 0.9750866889953613,
    #                0.9703925848007202, 0.9699936509132385, 0.9686242341995239, 0.9574745893478394, 0.949645459651947,
    #                0.9459954500198364, 0.9424264430999756, 0.9072239398956299, 0.898352324962616, 0.8937841653823853,
    #                0.8838391304016113, 0.8808287382125854, 0.8789186477661133, 0.8749411106109619, 0.8702283501625061]

    feature_image_row_vector = np.zeros((1, len(features)))
    for j in range(len(features_image)):
        feature = features_image[j]
        if feature in features:
            feature_index = features.index(feature)
            feature_image_row_vector[0, feature_index] = probs_image[j]
    distance_near_image, images_near = image_model.kneighbors(feature_image_row_vector)
    for image_near in images_near[0]:
        id_image = global_image_mapping[image_near]
        cursor = train_collection.find({"1": id_image})
        for data in cursor:
            print data["field14"]
Example #26
def sampling_vectorized_file(file_number):
    j_train = j_train_prefix + format('%02d' % file_number)
    s_train = s_train_prefix + format('%02d' % file_number)
    if os.path.isfile(s_train):
        print s_train, 'already exists...'
        # this file is already pending
        return
    touch(s_train)
    print 'creating', s_train, 'from', j_train, '...'
    # load pre-vectorized train text as multiple batch of nrows
    assert  os.path.isfile(j_train)
    (X_train,y_train) = joblib.load(j_train)
    n = X_test.shape[0] # 35065
    m = X_train.shape[0]/size_c*neighbor_c # 500000/10000
    dist=np.zeros(shape=(n,m),dtype=float)
    indx=np.zeros(shape=(n,m),dtype=int)
    for i in range(m/neighbor_c):
        print j_train,':',i,'/',m/neighbor_c
        off_c = i*size_c
        X_c = X_train[off_c:off_c+size_c]
        nbrs = NearestNeighbors(n_neighbors=5, algorithm='brute',metric='cosine').fit(X_c)
        t_dist,t_indx = nbrs.kneighbors(X_test)
        dist[:,neighbor_c*i:neighbor_c*(i+1)] = t_dist
        indx[:,neighbor_c*i:neighbor_c*(i+1)] = t_indx+off_c+file_number*500000
    joblib.dump((dist,indx),s_train)
    return dist,indx
Example #27
    def _compute_neighbors(self):
        V, dim = self.data_frame.shape
        neighbors = NearestNeighbors(n_neighbors=self.num_neighbors, algorithm='auto').fit(self.data_frame)
        _, indices = neighbors.kneighbors(self.data_frame)
        self._adjacency_graph = neighbors.kneighbors_graph(self.data_frame, mode='connectivity')
        self._knn_graph = neighbors.kneighbors_graph(self.data_frame, mode='distance')
        self._neighbors = indices
Example #28
	def removeRedundantFrames(self):
		h, w, d = self.keyframes[0].shape
		n = len(self.keyframes)
		frames = np.zeros((n, 256))
		self.frameHistFeats
		for i, kf in enumerate(self.keyframes):
			frames[i] = tools.getColorHist(kf).ravel()
		
		k = int(np.sqrt(n))
		kmeans = KMeans(k)
		print("Clustering frames into {0} code vectors.".format(k))
		kmeans.fit(self.frameHistFeats)

		bestFrameIndices = []
		bestFrames = []
		NN = NearestNeighbors(1)
		NN.fit(frames)
		centers = kmeans.cluster_centers_
		for center in centers:
			nearest = NN.kneighbors(center, return_distance=False)
			bestFrameIndices.append(nearest[0])
		bestFrameIndices.sort()
		for i in bestFrameIndices:
			bestFrames.append(self.keyframes[i])
		return bestFrames
Example #29
def main():
    vectorizer = CountVectorizer(ngram_range=(1,2),max_df=1.0, min_df=0.0)

    nei = NearestNeighbors(algorithm='brute', metric='jaccard')
    matrix = vectorizer.fit_transform(training_set).todense()
    new_matrix = vectorizer.transform(new_comments).todense()
    nei.fit(matrix)
    path =  '{0}/'.format(pathsplit(abspath(__file__))[0])
    jsonfile = open(path + '{0}-nn.json'.format(n_neighbors), 'w')

    nodes = [{'name': (training_set+new_comments)[i],
              'group':(groups + new_groups)[i]}
             for i in range(len(training_set+new_comments))]
    links = []

    for i in range(len(matrix)):
        dist, idnei = nei.kneighbors(matrix[i], n_neighbors=n_neighbors + 1)
        dist, idnei = dist[0], idnei[0]

        for j in range(len(idnei[1:])):
            links.append({"source":i,"target":idnei[j+1],"value":10*(1 - dist[j+1])})

    for i in range(len(new_comments)):
        dist, idnei = nei.kneighbors(new_matrix[i], n_neighbors=n_neighbors + 1)
        dist, idnei = dist[0], idnei[0]
        for j in range(len(idnei[1:])):
            links.append({"source":len(matrix) + i,"target":idnei[j],"value":10*(1 - dist[j+1])})

    jsondumped = json.dumps({'nodes':nodes, 'links':links}, indent=2)

    jsonfile.write(jsondumped)
Example #30
    def move(self, event):
        # add the knn scheme to decide selected region when moving mouse

        if SKLEARN_INSTALLED:
            if event.button == 1 and event.is_dragging:

                # TODO: support multiple datasets here
                data = get_map_data_scatter(self.active_layer_artist.layer,
                                            self.active_layer_artist.visual,
                                            self._vispy_widget)

                # calculate the threshold and call draw visual
                width = event.pos[0] - self.selection_origin[0]
                height = event.pos[1] - self.selection_origin[1]
                drag_distance = math.sqrt(width**2 + height**2)
                canvas_diag = math.sqrt(self._vispy_widget.canvas.size[0]**2 +
                                        self._vispy_widget.canvas.size[1]**2)

                mask = np.zeros(self.active_layer_artist.layer.shape)

                # neighbor num proportioned to mouse moving distance
                n_neighbors = drag_distance / canvas_diag * self.active_layer_artist.layer.shape[0]
                if n_neighbors >= 1:
                    neigh = NearestNeighbors(n_neighbors=int(n_neighbors))
                    neigh.fit(data)
                    select_index = neigh.kneighbors([self.selection_origin])[1]
                    mask[select_index] = 1
                self.mark_selected(mask, self.active_layer_artist.layer)
Example #31
def entropy(data=None,
            prob=None,
            method='nearest-neighbors',
            bins=None,
            errorVal=1e-5,
            units='bits'):
    '''
    given a probability distribution (prob) or an iterable of symbols (data), compute and
    return its continuous entropy.

    inputs:
    ------
        data:       samples by dimensions ndarray

        prob:       iterable with probabilities

        method:     'nearest-neighbors', 'gaussian', or 'bin'

        bins:       either a list of num_bins, or a list of lists containing
                    the bin edges

        errorVal:   if prob is given, 'entropy' checks that the sum is about 1.
                    It raises an error if abs(sum(prob)-1) >= errorVal

        units:      either 'bits' or 'nats'

        Different Methods:

        'nearest-neighbors' computes the binless entropy (bits) of a random vector
        using average nearest neighbors distance (Kozachenko and Leonenko, 1987).
        For a review see Beirlant et al., 2001 or Chandler & Field, 2007.

        'gaussian' computes the binless entropy based on estimating the covariance
        matrix and assuming the data is normally distributed.

        'bin' discretizes the data and computes the discrete entropy.

    '''

    if prob is None and data is None:
        raise ValueError(
            "%s.entropy requires either 'prob' or 'data' to be defined" %
            __name__)

    if prob is not None and data is not None:
        raise ValueError(
            "%s.entropy requires only 'prob' or 'data' to be given, but not both"
            % __name__)

    if prob is not None and not isinstance(prob, np.ndarray):
        raise TypeError("'entropy' in '%s' needs 'prob' to be an ndarray" %
                        __name__)

    if prob is not None and abs(prob.sum() - 1) > errorVal:
        raise ValueError("parameter 'prob' in '%s.entropy' should sum to 1" %
                         __name__)

    if data is not None and data.any():
        num_samples = data.shape[0]
        if len(data.shape) == 1:
            num_dimensions = 1
        else:
            num_dimensions = data.shape[1]

    if method == 'nearest-neighbors':
        from sklearn.neighbors import NearestNeighbors
        from scipy.special import gamma

        if data is None:
            raise ValueError(
                'Nearest neighbors entropy requires original data')

        if len(data.shape) > 1:
            k = num_dimensions
        else:
            k = 1

        nbrs = NearestNeighbors(n_neighbors=2, algorithm='auto').fit(data)
        #        pdb.set_trace()
        distances, indices = nbrs.kneighbors(data)
        rho = distances[:,
                        1]  # take nearest-neighbor distance (first column is always zero)
        Ak = (k *
              np.pi**(float(k) / float(2))) / gamma(float(k) / float(2) + 1)

        if units == 'bits':
            # 0.577215... is the Euler-Mascheroni constant (np.euler_gamma)
            return k * np.mean(np.log2(rho)) + np.log2(
                num_samples * Ak / k) + np.log2(np.exp(1)) * np.euler_gamma
        elif units == 'nats':
            # 0.577215... is the Euler-Mascheroni constant (np.euler_gamma)
            return k * np.mean(np.log(rho)) + np.log(
                num_samples * Ak / k) + np.log(np.exp(1)) * np.euler_gamma
        else:
            print('Units not recognized: {}'.format(units))

    elif method == 'gaussian':
        from numpy.linalg import det

        if data is None:
            raise ValueError(
                'Gaussian entropy requires original data')
        detCov = det(np.dot(data.transpose(), data) / num_samples)
        normalization = (2 * np.pi * np.exp(1))**num_dimensions

        if detCov == 0:
            return -np.inf
        else:
            if units == 'bits':
                return 0.5 * np.log2(normalization * detCov)
            elif units == 'nats':
                return 0.5 * np.log(normalization * detCov)
            else:
                print('Units not recognized: {}'.format(units))

    elif method == 'bin':
        if prob is None and bins is None:
            raise ValueError('Either prob or bins must be specified.')

        if data is not None:
            prob = symbols_to_prob(data, bins=bins)

        if units == 'bits':
            # compute the log2 of the probability and change any -inf to 0
            logProb = np.log2(prob)
            logProb[logProb == -np.inf] = 0
        elif units == 'nats':
            # compute the natural log of the probability and change any -inf to 0
            logProb = np.log(prob)
            logProb[logProb == -np.inf] = 0
        else:
            print('Units not recognized: {}'.format(units))

        # return sum of product of logProb and prob
        # (not using np.dot here because prob, logprob are nd arrays)
        return -float(np.sum(prob * logProb))
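A hedged usage sketch for entropy(): the differential entropy of a 2-D standard Gaussian is 0.5*log2((2*pi*e)**2), about 4.09 bits, so both estimators below should land roughly in that neighbourhood given enough samples.

import numpy as np
samples = np.random.RandomState(0).randn(2000, 2)
print(entropy(data=samples, method='nearest-neighbors', units='bits'))
print(entropy(data=samples, method='gaussian', units='bits'))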
Example #32
# Calculating the minimums
MinX = min(TopVoxels['X'])
MinY = min(TopVoxels['Y'])
MinZ = min(TopVoxels['Z'])

# Subtracting the minimum from the column (to move it to 0,0,0)
TopVoxels['X'] = TopVoxels['X'].apply(lambda x: x - MinX)
TopVoxels['Y'] = TopVoxels['Y'].apply(lambda x: x - MinY)
TopVoxels['Z'] = TopVoxels['Z'].apply(lambda x: x - MinZ)

# Create a deep copy of TopVoxels to reuse later, since it already has the format we want
TopVoxelNormals = TopVoxels.copy(deep=True)

# Distance List
k_value = 10
nbrs = NearestNeighbors(n_neighbors=k_value,
                        algorithm='ball_tree').fit(TopVoxels)
distances, indices = nbrs.kneighbors(TopVoxels)

#Adding normal columns
for index, row in TopVoxels.iterrows():
    vectorMatrix = np.zeros(shape=[3, k_value])
    sum_centroids = np.zeros(shape=[1, 3])
    centroid = np.zeros(shape=[1, 3])
    covariance_matrix = np.zeros(shape=[3, 3])

    for j in range(k_value):
        vectorMatrix[0, j] = TopVoxels['X'][indices[index][j]]
        vectorMatrix[1, j] = TopVoxels['Y'][indices[index][j]]
        vectorMatrix[2, j] = TopVoxels['Z'][indices[index][j]]

        sum_centroids = np.add(sum_centroids, vectorMatrix[:, j])
Example #33
def points_match_generator(ptrs, k_ptrs, affine_level, rand_move_level, batch_size):
    nbors=NearestNeighbors(n_neighbors=k_ptrs+1).fit(ptrs)
    while 1:
        coordinates_ref_batch = np.zeros((batch_size,k_ptrs*3+1),dtype='float32')
        coordinates_tgt_batch = np.zeros((batch_size,k_ptrs*3+1),dtype='float32')
        matching = np.zeros((batch_size,1),dtype='int')
        for batch in range(batch_size):
            # affine transform the reference point set
            ptrs_affine=affine_transform(ptrs,affine_level,rand_move_level)
            
            ##########################################################################
            # add additional errors to target set to simulate segmentation mistakes
            ##########################################################################
            random_idx = np.arange(np.shape(ptrs)[0])
            np.random.shuffle(random_idx)
            idx_errors = random_idx[0:k_ptrs+1]
            errors = (np.random.rand(k_ptrs,3)-0.5)*10
            ptrs_target = np.copy(ptrs_affine)
            ptrs_target[idx_errors[0:k_ptrs],:] = ptrs_affine[idx_errors[0:k_ptrs],:]+errors
            
            ###############################################################
            # calculate (relative) coordinates of a single reference point
            ###############################################################
            distances_ref,nbors_idx=nbors.kneighbors(ptrs[idx_errors[k_ptrs:k_ptrs+1],:],
                                     return_distance=True)
            mean_distance = np.mean(distances_ref)                                  
            coordinates_ref_relative=(ptrs[nbors_idx[0,1:k_ptrs+1],:]-ptrs[nbors_idx[0,0],:])/mean_distance
            coordinates_ref = np.zeros(k_ptrs*3+1)
            coordinates_ref[0:k_ptrs*3] = coordinates_ref_relative.reshape(k_ptrs*3)
            coordinates_ref[k_ptrs*3] = mean_distance       
            
            nbors_target=NearestNeighbors(n_neighbors=k_ptrs+1).fit(ptrs_target)            
            matching_flag=np.random.rand()
            
            if matching_flag>0.5:                  
                ###################################################################
                # calculate coordinates of corresponding point in target point set
                ###################################################################
                distances_tgt_true,nbors_idx=nbors_target.kneighbors(ptrs_target[idx_errors[k_ptrs:k_ptrs+1],:],
                                         return_distance=True)
                mean_distance=np.mean(distances_tgt_true)                                  
                coordinates_tgt_relative=(ptrs_target[nbors_idx[0,1:k_ptrs+1],:]-ptrs_target[nbors_idx[0,0],:])/mean_distance
                coordinates_tgt=np.zeros(k_ptrs*3+1)
                coordinates_tgt[0:k_ptrs*3]=coordinates_tgt_relative.reshape(k_ptrs*3)
                coordinates_tgt[k_ptrs*3]=mean_distance   
                
            elif matching_flag<=0.5:
                #########################################################################
                # calculate coordinates of a non-corresponding point in target point set
                #########################################################################
                nbors_idx=nbors_target.kneighbors(ptrs_target[idx_errors[k_ptrs:k_ptrs+1],:],
                                         return_distance=False)
                random_nbors_idx=np.copy(nbors_idx[0,1:k_ptrs+1])
                np.random.shuffle(random_nbors_idx)
                distances_tgt_false,nbors_idx_false=nbors_target.kneighbors(ptrs_target[random_nbors_idx[0:1],:],
                                         return_distance=True)                                   
                mean_distance=np.mean(distances_tgt_false)                                  
                coordinates_tgt_relative=(ptrs_target[nbors_idx_false[0,1:k_ptrs+1],:]-ptrs_target[nbors_idx_false[0,0],:])/mean_distance
                coordinates_tgt=np.zeros(k_ptrs*3+1)
                coordinates_tgt[0:k_ptrs*3]=coordinates_tgt_relative.reshape(k_ptrs*3)
                coordinates_tgt[k_ptrs*3]=mean_distance
                
            else: raise NameError("matching_flag has an abnormal value")
            
            coordinates_ref_batch[batch,:]=coordinates_ref.reshape(1,k_ptrs*3+1)
            coordinates_tgt_batch[batch,:]=coordinates_tgt.reshape(1,k_ptrs*3+1)
            matching_flag=int(matching_flag>0.5)
            matching[batch,:]=np.array(matching_flag).reshape(1,1)
            
        yield ([coordinates_ref_batch, coordinates_tgt_batch], matching)
Example #34
def get_sizes_and_z_from_cell_list(tracked_cells_df, cells, frame_num, scale_xy, scale_z, neighbors, density, scaled_vol=1):
   new_metric = []
   new_z = []
   total_metric = []
   total_z = []
   EXCLUDED_metric = []
   EXCLUDED_z = []
   
   ### get list of all cell locations on current frame
   all_x = tracked_cells_df.iloc[np.where(tracked_cells_df.FRAME == frame_num)].X
   all_y =  tracked_cells_df.iloc[np.where(tracked_cells_df.FRAME == frame_num)].Y
   all_z =  tracked_cells_df.iloc[np.where(tracked_cells_df.FRAME == frame_num)].Z
   
   all_series_num = tracked_cells_df.iloc[np.where(tracked_cells_df.FRAME == frame_num)].SERIES
   
   if not scaled_vol:
       all_coords = np.asarray(tracked_cells_df.iloc[np.where(tracked_cells_df.FRAME == frame_num)].coords)

   else:
       all_vols = np.asarray(tracked_cells_df.iloc[np.where(tracked_cells_df.FRAME == frame_num)].vol_rescaled)

   
   all_centroids = [np.asarray(all_x) * scale_xy, np.asarray(all_y) * scale_xy, np.asarray(all_z) * scale_z]
   all_centroids = np.transpose(all_centroids)
          
   
   if density:
       nbrs = NearestNeighbors(n_neighbors=neighbors, algorithm='ball_tree').fit(all_centroids)
       distances, indices = nbrs.kneighbors(all_centroids)        
              
       """ Get all distances to see distribution comparing DEPTH and density/distance """
       for dist_idx, obj in enumerate(distances):
           cur_dist = obj[1:-1]
           mean = np.mean(cur_dist)
           #total_dists.append(cur_dist)
           total_metric = np.concatenate((total_metric, [mean]))   
           
           """ ALSO GET LIST EXCLUDING CURRENT METRIC """
           series_num = np.asarray(all_series_num)[dist_idx]
           if series_num not in cells:     

               EXCLUDED_metric = np.concatenate((EXCLUDED_metric, [mean]))
               EXCLUDED_z = np.concatenate((EXCLUDED_z, [all_centroids[dist_idx, -1]]))               
               
       total_z = np.concatenate((total_z, all_centroids[:, -1]))
       
       """ Go cell by cell through NEW cells only """
       for cur_cell in cells:
           
           dist_idx = np.where(all_series_num == cur_cell)
           cur_dists = distances[dist_idx][0][1:-1]
           mean_dist = np.mean(cur_dists)
           new_metric = np.concatenate((new_metric, [mean_dist]))
           
           ### compare it with all the other cells at the same depth that are NOT new
           cur_z = np.asarray(tracked_cells_df.iloc[np.where(tracked_cells_df.SERIES == cur_cell)].Z)[0] * scale_z
           new_z = np.concatenate((new_z, [cur_z]))    


   elif not density:
       """ Get all distances to see distribution comparing DEPTH and density/distance """
       if not scaled_vol:
           for obj in all_coords:
               cur_vol = len(obj)
               total_metric = np.concatenate((total_metric, [cur_vol]))
       else:
           total_metric = np.concatenate((total_metric, all_vols))

            
       total_z = np.concatenate((total_z, np.asarray(all_z) * scale_z))            


       """ Get terminated cells """
       for cur_cell in cells:
            
            dist_idx = np.where(all_series_num == cur_cell)
            
            if not scaled_vol:
                cur_vol = len(all_coords[dist_idx][0])
                new_metric = np.concatenate((new_metric, [cur_vol]))
            else:
                cur_vol = all_vols[dist_idx]
                new_metric = np.concatenate((new_metric, cur_vol))
            
            ### compare it with all the other cells at the same depth that are NOT new
            cur_z = np.asarray(tracked_cells_df.iloc[np.where(tracked_cells_df.SERIES == cur_cell)].Z)[0] * scale_z
            new_z = np.concatenate((new_z, [cur_z]))

                
   return total_metric, total_z, new_metric, new_z, EXCLUDED_metric, EXCLUDED_z
Example #35
def perform_KNN(train_vecs, val_vecs, n_neighbors=1):
    print("-> Preparing KNN...")
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=-1).fit(train_vecs)
    print("-> Finding closest neighbors...")
    distances, indices = nbrs.kneighbors(val_vecs)
    return indices
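Toy call for perform_KNN, assuming numpy and NearestNeighbors are imported: index 100 random training vectors and find the single nearest neighbour (the default n_neighbors=1 above) of 5 validation vectors.

import numpy as np
rng = np.random.RandomState(0)
train_vecs = rng.rand(100, 16)
val_vecs = rng.rand(5, 16)
print(perform_KNN(train_vecs, val_vecs))   # array of shape (5, 1) with training indices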
Example #36
    def fit(self, X, y=None):
        """Fit the DPA clustering on the data.

        Parameters
        ----------
        X : array [n_samples, n_samples] if metric == "precomputed", or
            [n_samples, n_features] otherwise
            The input samples. Similarities / affinities between
            instances if ``affinity='precomputed'``.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self : object
            Returns self.
        """
        # Input validation
        X = check_array(X,
                        accept_sparse=['csr', 'csc', 'coo'],
                        dtype=np.float64,
                        ensure_min_samples=2)

        allow_squared = self.affinity in [
            "precomputed", "precomputed_nearest_neighbors"
        ]
        if X.shape[0] == X.shape[1] and not allow_squared:
            warnings.warn("The DPA clustering API has changed. ``fit`` "
                          "now constructs an affinity matrix from data. To use"
                          " a custom affinity matrix, "
                          "set ``affinity=precomputed``.")

        self.k_max_ = self.k_max
        self.dim_ = self.dim
        if not self.dim:
            if self.dim_algo == "auto":
                self.dim_ = X.shape[1]
            elif self.dim_algo == "twoNN":
                if self.block_ratio >= X.shape[0]:
                    raise ValueError(
                        "block_ratio is larger than the sample size; the minimum size for "
                        "block analysis would be zero. Please set a value lower than "
                        + str(X.shape[0]))
                self.dim_ = twoNearestNeighbors(blockAn=self.blockAn,
                                                block_ratio=self.block_ratio,
                                                metric=self.metric,
                                                frac=self.frac,
                                                n_jobs=self.n_jobs).fit(X).dim_
            else:
                pass

        # If densities, uncertainties and k_hat are provided as input, compute only the
        # matrix of nearest neighbor:
        self.densities_ = self.densities
        self.err_densities_ = self.err_densities
        self.k_hat_ = self.k_hat
        if self.densities_ is not None and self.err_densities_ is not None and self.k_hat_ is not None:
            # If the nearest neighbors matrix is precomputed:
            if self.nn_distances is not None and self.nn_indices is not None:
                self.k_max_ = max(self.k_hat_)
                self.nn_distances_ = self.nn_distances
                self.nn_indices_ = self.nn_indices
            else:
                self.k_max_ = max(self.k_hat_)
                if self.metric == "precomputed":
                    nbrs = NearestNeighbors(
                        n_neighbors=self.k_max_ +
                        1,  # The point i is counted in its neighborhood 
                        algorithm="brute",
                        metric=self.metric,
                        n_jobs=self.n_jobs).fit(X)
                else:
                    nbrs = NearestNeighbors(
                        n_neighbors=self.k_max_ +
                        1,  # The point i is counted in its neighborhood 
                        algorithm="auto",
                        metric=self.metric,
                        n_jobs=self.n_jobs).fit(X)
                self.nn_distances_, self.nn_indices_ = nbrs.kneighbors(X)
        elif self.density_algo == "PAk":
            # If the nearest neighbors matrix is precomputed:
            if self.nn_distances is not None and self.nn_indices is not None:
                self.k_max_ = self.nn_distances.shape[1] - 1
                PAk = PointAdaptive_kNN(k_max=self.k_max_,
                                        D_thr=self.D_thr,
                                        metric=self.metric,
                                        nn_distances=self.nn_distances,
                                        nn_indices=self.nn_indices,
                                        dim_algo=self.dim_algo,
                                        blockAn=self.blockAn,
                                        block_ratio=self.block_ratio,
                                        frac=self.frac,
                                        dim=self.dim_,
                                        n_jobs=self.n_jobs).fit(X)
            else:
                PAk = PointAdaptive_kNN(k_max=self.k_max_,
                                        D_thr=self.D_thr,
                                        metric=self.metric,
                                        dim_algo=self.dim_algo,
                                        blockAn=self.blockAn,
                                        block_ratio=self.block_ratio,
                                        frac=self.frac,
                                        dim=self.dim_,
                                        n_jobs=self.n_jobs).fit(X)
            self.nn_distances_ = PAk.distances_
            self.nn_indices_ = PAk.indices_
            self.densities_ = PAk.densities_
            self.err_densities_ = PAk.err_densities_
            self.k_hat_ = PAk.k_hat_
            self.k_max_ = max(self.k_hat_)
        else:
            # TODO: implement option for kNN
            pass
        self.labels_, self.halos_, self.topography_, self.g_, self.centers_ = _DensityPeakAdvanced(
            self.densities_, self.err_densities_, self.k_hat_,
            self.nn_distances_, self.nn_indices_, self.Z)

        self.is_fitted_ = True

        return self
Example #37
tf_value_of_input_data = input_tf(input_data)

def input_idf(input_data):
    idf_vec_input_data = []
    idf_each_doc_vec_input_data = []
    for each_word_input_data,val in word_dict.items():
        if each_word_input_data in input_data:
            word_value_in_each_doc_input_data = countIdfforwordvalue.get(each_word_input_data)
            idf_each_doc_vec_input_data.append(mth.log(length_of_docs / word_value_in_each_doc_input_data))
        else:
            idf_each_doc_vec_input_data.append(0)
    return idf_each_doc_vec_input_data


idf_value_of_input_data = input_idf(input_data)

def computeTfIdf_input(tf_value_of_input_data, idf_value_of_input_data):
    tfidf_input_vec = [a * b for a, b in zip(tf_value_of_input_data, idf_value_of_input_data)]
    return tfidf_input_vec

TfIdf_value_of_input_data = computeTfIdf_input(tf_value_of_input_data,idf_value_of_input_data)
value_for_predict = np.array(TfIdf_value_of_input_data).reshape(1,-1)

print("prediction")

from sklearn.neighbors import NearestNeighbors
neigh = NearestNeighbors(n_neighbors=5)
neigh.fit(features) 
print(neigh.kneighbors(value_for_predict)) 
Example #38
# KNN brute Force Searching

import numpy as np
from sklearn.neighbors import NearestNeighbors
import timeit

#data creation
X = np.random.random((1000, 3))
print(X)

#training
nn = NearestNeighbors(n_neighbors=5, algorithm='kd_tree')
nn.fit(X)

# Testing
test = np.array([0.3, 0.4, 0.4])
test1 = test.reshape(1, -1)
print(nn.kneighbors(test1, 5))
Example #39
def addAvgDiff(train_df, test_df, nn=20):
    #adding features
    """
    ['diff_avg_price','diff_price_per_bed','diff_price_per_bath','diff_price_per_room']
    """
    #Note that removing outliers is moved into this function
    setOutlierNan(train_df)
    setOutlierNan(test_df)
    train_test = pd.concat([train_df.drop('interest_level', axis=1),
                            test_df]).dropna()

    nnfinder = NearestNeighbors(n_neighbors=nn)
    nnfinder.fit(train_test.ix[:, ['latitude', 'longitude']])

    train_df['diff_avg_price'] = np.nan
    train_df['diff_price_per_bed'] = np.nan
    train_df['diff_price_per_bath'] = np.nan
    train_df['diff_price_per_room'] = np.nan

    for i in train_df.index:
        if ~np.isnan(train_df.loc[i, 'latitude']) and ~np.isnan(
                train_df.loc[i, 'longitude']):
            _, idx = nnfinder.kneighbors(
                train_df.loc[i, ['latitude', 'longitude']]
                .to_numpy(dtype=float).reshape(1, -1))
            neighbours = train_test.iloc[idx.reshape(idx.size).tolist()]

            # price
            temp = neighbours['price']
            temp = temp[temp != np.inf]
            train_df.loc[i, 'diff_avg_price'] = \
                train_df.loc[i, 'price'] - temp.mean()

            temp = neighbours['price_per_bed']
            temp = temp[temp != np.inf]
            train_df.loc[i, 'diff_price_per_bed'] = \
                train_df.loc[i, 'price_per_bed'] - temp.mean()

            temp = neighbours['price_per_bath']
            temp = temp[temp != np.inf]
            train_df.loc[i, 'diff_price_per_bath'] = \
                train_df.loc[i, 'price_per_bath'] - temp.mean()

            temp = neighbours['price_per_room']
            temp = temp[temp != np.inf]
            train_df.loc[i, 'diff_price_per_room'] = \
                train_df.loc[i, 'price_per_room'] - temp.mean()

    test_df['diff_avg_price'] = np.nan
    test_df['diff_price_per_bed'] = np.nan
    test_df['diff_price_per_bath'] = np.nan
    test_df['diff_price_per_room'] = np.nan

    for i in test_df.index:
        if ~np.isnan(test_df.loc[i, 'latitude']) and ~np.isnan(
                test_df.loc[i, 'longitude']):
            _, idx = nnfinder.kneighbors(
                test_df.loc[i, ['latitude', 'longitude']]
                .to_numpy(dtype=float).reshape(1, -1))
            neighbours = train_test.iloc[idx.reshape(idx.size).tolist()]

            # price
            temp = neighbours['price']
            temp = temp[temp != np.inf]
            test_df.loc[i, 'diff_avg_price'] = \
                test_df.loc[i, 'price'] - temp.mean()

            temp = neighbours['price_per_bed']
            temp = temp[temp != np.inf]
            test_df.loc[i, 'diff_price_per_bed'] = \
                test_df.loc[i, 'price_per_bed'] - temp.mean()

            temp = neighbours['price_per_bath']
            temp = temp[temp != np.inf]
            test_df.loc[i, 'diff_price_per_bath'] = \
                test_df.loc[i, 'price_per_bath'] - temp.mean()

            temp = neighbours['price_per_room']
            temp = temp[temp != np.inf]
            test_df.loc[i, 'diff_price_per_room'] = \
                test_df.loc[i, 'price_per_room'] - temp.mean()

    train_df['longitude'] = train_df['longitude'].fillna(0)
    train_df['latitude'] = train_df['latitude'].fillna(0)
    test_df['longitude'] = test_df['longitude'].fillna(0)
    test_df['latitude'] = test_df['latitude'].fillna(0)
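The function above boils down to one pattern: for each row, subtract the mean of a price column over its nn nearest (latitude, longitude) neighbours. A condensed sketch of that pattern with current pandas/scikit-learn APIs, skipping the outlier and NaN handling of the original (column names are assumptions mirroring the function above, and the query point counts as one of its own neighbours, as in the original):

import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

def add_avg_diff(df, price_cols=("price",), k=20):
    # Fit on the (lat, lon) coordinates and look up each row's k nearest listings.
    coords = df[["latitude", "longitude"]].to_numpy(dtype=float)
    nn = NearestNeighbors(n_neighbors=k).fit(coords)
    _, idx = nn.kneighbors(coords)                 # shape (n_rows, k)
    for col in price_cols:
        values = df[col].to_numpy(dtype=float)
        neigh_mean = values[idx].mean(axis=1)      # mean price over each row's neighbours
        df["diff_" + col] = values - neigh_mean
    return df

# Hypothetical toy usage:
df = pd.DataFrame({"latitude": [40.70, 40.80, 40.75, 40.71],
                   "longitude": [-74.00, -73.90, -73.95, -74.01],
                   "price": [1000.0, 2000.0, 1500.0, 1200.0]})
print(add_avg_diff(df, price_cols=("price",), k=3))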
Example #40
0
    def perform_type_prediction(self, df, based_on_num_neigh=3):
        def create_binary_type_vector(t_types, a_types):
            vector = np.zeros(len(all_types))
            i = [a_types.index(_) for _ in t_types]
            vector[i] = 1
            return vector

        def create_binary_type_prediction_vector(t_types, a_types):
            vector = np.zeros(len(all_types))
            i = [
                a_types.index(_)
                for _ in itertools.chain.from_iterable(t_types)
            ]
            vector[i] += 1
            return vector

        # get the types. Mapping from the index of subject to the index of object
        type_info = deserializer(path=self.p_folder,
                                 serialized_name='type_info')

        # get the index of objects / get type information =>>> s #type o
        all_types = sorted(set.union(*list(type_info.values())))

        # Consider only points with type infos.
        e_w_types = df.loc[list(type_info.keys())]

        neigh = NearestNeighbors(n_neighbors=based_on_num_neigh,
                                 algorithm='kd_tree',
                                 metric='euclidean',
                                 n_jobs=-1).fit(e_w_types)

        # Get similarity results for selected entities
        df_most_similars = pd.DataFrame(
            neigh.kneighbors(e_w_types, return_distance=False))

        # Reindex the target
        df_most_similars.index = e_w_types.index.values

        # The sklearn implementation of kneighbors returns the point itself as the
        # most similar point, so drop that first column.
        df_most_similars.drop(columns=[0], inplace=True)

        # Map positional indices back to the original DataFrame index
        # (kneighbors works on positions, not on the DataFrame's index labels).
        mapper = dict(zip(list(range(len(e_w_types))), e_w_types.index.values))
        # The values of the most-similar table are mapped back to the original index labels
        df_most_similars = df_most_similars.applymap(lambda x: mapper[x])

        k_values = [1, 3, 5, 10, 15, 30, 50, 100]

        self.logger.info('K values: {0}'.format(k_values))
        for k in k_values:
            self.logger.info('##### {0} #####'.format(k))
            similarities = list()
            for _, S in df_most_similars.iterrows():
                true_types = type_info[_]
                type_predictions = [type_info[_] for _ in S.values[:k]]

                vector_true = create_binary_type_vector(true_types, all_types)
                vector_prediction = create_binary_type_prediction_vector(
                    type_predictions, all_types)

                sim = cosine(vector_true, vector_prediction)
                similarities.append(1 - sim)

            report = pd.DataFrame(similarities)
            self.logger.info('Mean type prediction: {0}'.format(
                report.mean().values))
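Two details in the example above recur whenever kneighbors is run against the same data it was fitted on: the first returned neighbour is the point itself, and the returned indices are positional, so they must be mapped back to the DataFrame's own index. A compact sketch of that pattern on a toy DataFrame:

import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

emb = pd.DataFrame(np.random.rand(5, 4), index=["a", "b", "c", "d", "e"])

neigh = NearestNeighbors(n_neighbors=3).fit(emb)
idx = neigh.kneighbors(emb, return_distance=False)        # positional indices; column 0 is the point itself
similars = pd.DataFrame(idx[:, 1:], index=emb.index)      # drop the self column
similars = similars.applymap(lambda pos: emb.index[pos])  # positions -> original index labels
print(similars)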
Example #41
0
from sklearn.neighbors import NearestNeighbors
from carotid import carotid_data_util as cdu
from my_util import data_util
import matplotlib.pyplot as plt

id_all, x_data_all, y_data_all = cdu.get_ex_normal()
# id_all, x_data_all, y_data_all = cdu.get_ex_data('RCCA')
# x_data_all = data_util.scale(x_data_all)
# a minimum minPts can be derived from the number of dimensions D in the data set, as minPts ≥ D + 1
minPts = x_data_all.shape[1] + 1
# The value for ε can then be chosen from a k-distance graph, plotting the distance to the (k = minPts - 1)-th nearest neighbour sorted in descending order
k = minPts - 1
nbrs = NearestNeighbors(n_neighbors=k).fit(x_data_all)
distances, indices = nbrs.kneighbors(x_data_all)
distanceDec = sorted(distances[:, k - 1], reverse=True)
plt.plot(list(range(1, x_data_all.shape[0] + 1)), distanceDec)
plt.show()
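The comments above describe the usual DBSCAN heuristic: take minPts ≥ D + 1 and read ε off the knee of the sorted k-distance curve. A minimal follow-up sketch, assuming x_data_all and minPts from the example above and treating the eps value as a placeholder to be read from the plot:

# Continuation sketch: x_data_all and minPts come from the example above.
from sklearn.cluster import DBSCAN

eps = 0.5  # placeholder: read this value off the knee of the k-distance plot
labels = DBSCAN(eps=eps, min_samples=minPts).fit_predict(x_data_all)
print("clusters found:", len(set(labels)) - (1 if -1 in labels else 0))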
Example #42
0
    def _label_clusters_first_pass(self, n_datasets, n_sym_ops):
        """First pass labelling of clusters.

        Labels points into clusters such that each cluster contains exactly one
        copy of each dataset.

        Args:
          n_datasets (int): The number of datasets.
          n_sym_ops (int): The number of symmetry operations.

        Returns:
          cluster_labels (np.ndarray): A label for each coordinate, labelled from
          0 .. n_sym_ops.
        """
        # initialise cluster labels: -1 signifies doesn't belong to a cluster
        cluster_labels = np.full(self.coords.shape[0], -1, dtype=int)

        cluster_id = 0
        while (cluster_labels == -1).sum() > 0:
            coord_ids = np.arange(n_datasets * n_sym_ops)
            dataset_ids = coord_ids % n_datasets

            # select only those points that don't already belong to a cluster
            sel = np.where(cluster_labels == -1)
            X = self.coords[sel]
            dataset_ids = dataset_ids[sel]
            coord_ids = coord_ids[sel]

            # choose a high density point as seed for cluster
            nbrs = NearestNeighbors(n_neighbors=min(11, len(X)),
                                    algorithm="brute",
                                    metric="cosine").fit(X)
            distances, indices = nbrs.kneighbors(X)
            average_distance = np.array(
                [dist[1:].mean() for dist in distances])
            i = average_distance.argmin()

            d_id = dataset_ids[i]
            cluster = np.array([coord_ids[i]])
            cluster_dataset_ids = np.array([d_id])
            xis = np.array([X[i]])

            for j in range(n_datasets - 1):
                # select only those rows that don't correspond to a dataset already
                # present in current cluster
                sel = np.where(dataset_ids != d_id)
                X = X[sel]
                dataset_ids = dataset_ids[sel]
                coord_ids = coord_ids[sel]

                assert len(X) > 0

                # Find nearest neighbour in cosine-space to the current cluster centroid
                nbrs = NearestNeighbors(n_neighbors=min(1, len(X)),
                                        algorithm="brute",
                                        metric="cosine").fit(X)
                distances, indices = nbrs.kneighbors([xis.mean(axis=0)])
                k = indices[0][0]
                d_id = dataset_ids[k]
                cluster = np.append(cluster, coord_ids[k])
                cluster_dataset_ids = np.append(cluster_dataset_ids, d_id)
                xis = np.append(xis, [X[k]], axis=0)

            # label this cluster
            cluster_labels[cluster] = cluster_id
            cluster_id += 1
        return cluster_labels
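The seed-selection step above is a reusable pattern: the "densest" point is the one with the smallest mean cosine distance to its nearest neighbours (skipping the self-distance in column 0). A standalone sketch on random data:

import numpy as np
from sklearn.neighbors import NearestNeighbors

coords = np.random.rand(50, 6)                     # stand-in for self.coords

nbrs = NearestNeighbors(n_neighbors=min(11, len(coords)),
                        algorithm="brute", metric="cosine").fit(coords)
distances, _ = nbrs.kneighbors(coords)
mean_dist = distances[:, 1:].mean(axis=1)          # drop column 0: the point itself
seed = int(mean_dist.argmin())                     # densest point = cluster seed
print("seed index:", seed)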
Example #43
0
def run_xmap(dataset=None,
             n_neighbors=None,
             negative_sample_rate=None,
             seed=None,
             model_folder="models",
             return_step=STEP.DATA_CLEANED,
             learn_mode=LEARN.UNSUPERVISED,
             runall=False):
    global f, SEED, NNK, NS, cmap, NFEATURE
    # runall = True
    SEED = seed
    NNK = n_neighbors
    NS = negative_sample_rate
    # define the distance measure to be used in UMAP. Many options are available in UMAP
    distancem = "euclidean"
    np.random.seed(SEED)
    # define the path to store intermediate outputs
    pathname = model_folder + "/" + "xmap_k{}_ns{}_s{}_".format(NNK, NS,
                                                                SEED) + dataset
    current_step = STEP.INITIALIZING
    t0 = time.time()
    """
    STEP 1: Loading the dataset
    """
    if runall or not os.path.exists(pathname + ".cleandata"):
        f = open(
            "outputs/xmap_k{}_ns{}_s{}_".format(NNK, NS, SEED) + dataset +
            ".log", 'w')
        print_output("Loading Data ...")
        print_output("\tData set: {}".format(dataset))
        data = pd.read_csv("../data/{}.csv".format(dataset))
        feature_names = [
            c.replace("_", "ubar").replace(".", "dot") for c in data.columns
        ][1:]
        target_name = data.columns[0]
        nfeatures = len(feature_names)
        NFEATURE = nfeatures
        data = data.values
        Y = data[:, 0].reshape(-1, 1)
        X = data[:, 1:]
        # scale the dataset
        scaler = MinMaxScaler()
        scaler.fit(X)
        X_norm = scaler.transform(X)
        last_step = STEP.DATA_CLEANED
        pickle.dump((X_norm, Y, scaler, nfeatures, feature_names, target_name),
                    open(pathname + ".cleandata", "wb"))
    else:
        f = open(
            "outputs/xmap_k{}_ns{}_s{}_".format(NNK, NS, SEED) + dataset +
            ".log", 'w+')
        print_output("\tLoad cleaned data from " + pathname + ".cleandata")
        X_norm, Y, scaler, nfeatures, feature_names, target_name = pickle.load(
            open(pathname + ".cleandata", "rb"))
    current_step = STEP.DATA_CLEANED
    if current_step == return_step:
        return X_norm, Y, scaler, nfeatures, feature_names, target_name
    """
    STEP 2: Learn the latent representation of the dataset (unsupervised or supervised)
    """
    print_output("Learning UMAP ...")
    print(learn_mode)
    if learn_mode == LEARN.UNSUPERVISED:
        umapname = ".unsupervised_umap"
    else:
        umapname = ".supervised_umap"
    if runall or not os.path.exists(pathname + umapname):
        reducer = umap.UMAP(random_state=SEED,
                            n_neighbors=NNK,
                            negative_sample_rate=NS,
                            metric=distancem)
        # reducer = PCA(n_components=2)
        if learn_mode == LEARN.UNSUPERVISED:
            reducer.fit(X_norm)
        else:
            reducer.fit(X_norm, y=Y)
        last_step = STEP.UMAP_TRAINED
        pickle.dump(reducer, open(pathname + umapname, "wb"))
    else:
        print_output("\tLoad trained umap from " + pathname + umapname)
        reducer = pickle.load(open(pathname + umapname, "rb"))
    embedding = reducer.transform(X_norm)
    current_step = STEP.UMAP_TRAINED
    if current_step == return_step:
        return embedding
    lnames = ["Negative", "Positive"]
    Y = Y.reshape(-1)
    cmap = [
        "blue", "red", "purple", "hotpink", "black", "green", "orange", "teal",
        "brown", "lightsteelblue", "gray", "lime", "coral", "plum", "gold",
        "c", "tomato", "blueviolet", "darkseagreen"
    ]
    if return_step == None:
        plot_embedding_space(embedding,
                             labels=Y,
                             label_index=[0, 1],
                             lnames=lnames,
                             data_name="gt_" + dataset)
    """
    STEP 3: summarise latent data using ATL and cluster data using learned graph from ATL
    """
    # nepoch: number of times the data is passed to ATL; age_max: maximum age of a connection; age increases if the
    # connection (different from the second best) links to the BMU. If max_age is too small, the topological
    # relationships will be prematurely destroyed. If max_age is too large, some useless connections may survive
    # because of randomness or noise --> ATL needs to run longer to get accurate results, and more relationships
    # will be preserved.
    # lamb: the number of steps (number of processed inputs) before ATL checks and cleans up the network. Lambda has
    # a similar effect to max_age: a small lamb leads to an unstable network (unable to establish topological
    # relationships), while a large lamb may lead to redundant nodes and connections.
    print_output(
        "Learning topological relations and Determining contexts ....")
    if learn_mode == LEARN.UNSUPERVISED:
        soinnname = ".unsupervised_soinn"
    else:
        soinnname = ".supervised_soinn"
    if runall or not os.path.exists(pathname + soinnname):
        lamb = 200
        # if data.shape[0] < lamb:
        #     lamb = data.shape[0]
        nodes, connection, classes = atl.learning(input_data=embedding,
                                                  max_nepoch=5,
                                                  spread_factor=1.0,
                                                  lamb=lamb)

        classes = 0 * classes
        cmap = cmap * 10
        if return_step == None:
            plot_atl(nodes, connection)
        # create a network representation of the learned ATL graph
        G = nx.Graph()
        for i in range(0, nodes.shape[0]):
            for j in range(0, nodes.shape[0]):
                if connection[i, j] != 0:
                    G.add_edge(i, j, weight=1.0)

        # use community detection algorithms to discover the subgraph community
        network_cd_alg = "best"
        n_components = nx.number_connected_components(G)
        max_context = int(n_components + np.sqrt(n_components))
        threshold = 0.2
        if network_cd_alg == "gn":
            from networkx.algorithms import community
            communities_generator = community.girvan_newman(G)
            while True:
                level_communities = next(communities_generator)
                size_com = [len(c) for c in level_communities if len(c) > 1]
                if min(size_com) < threshold * sum(
                    [len(c) for c in level_communities]):
                    break
            cms = sorted(map(sorted, level_communities))
        elif network_cd_alg == "best" or network_cd_alg == "dendo":
            import community
            if network_cd_alg == "best":
                cms = community.best_partition(G, resolution=0.5)
            else:
                dendrogram = community.generate_dendrogram(G)
                sized = len(dendrogram)
                cms = community.partition_at_level(dendrogram, sized - 2)
            coms = set([cms[i] for i in cms])
            cdict = {}
            for k in coms:
                cdict[k] = []
            for i in cms:
                cdict[cms[i]].append(i)
            cms = []
            for k in coms:
                cms.append(list(cdict[k]))
        else:
            # cms = list(nx.connected_components(G))
            from networkx.algorithms import community
            communities_generator = community.girvan_newman(G)
            level_communities = next(communities_generator)
            cms = sorted(map(sorted, level_communities))

        components = [c for c in cms if len(c) > 1]
        # print(components)
        count = 1
        for comp in components:
            for n in comp:
                classes[n] = count
            count += 1

        # Each component (subgraph) can be treated as a cluster, since ATL only links nodes with similar patterns
        # together. The lack of a connection between two nodes indicates that those two nodes, and the data matched
        # to them, should not belong to the same cluster.
        nclusters = len(components)
        nbrs = NearestNeighbors(n_neighbors=1).fit(nodes)
        distances, indices = nbrs.kneighbors(embedding)
        node_indices = list(indices.reshape(-1))
        indices = np.array(
            [classes[node_indices[i]] for i in range(len(node_indices))])
        last_step = STEP.ATL_TRAINED
        pickle.dump(
            (nodes, connection, classes, nclusters, node_indices, indices),
            open(pathname + soinnname, "wb"))
    else:
        print_output("\tLoad trained umap from " + pathname + soinnname)
        nodes, connection, classes, nclusters, node_indices, indices = pickle.load(
            open(pathname + soinnname, "rb"))
    current_step = STEP.ATL_TRAINED
    if current_step == return_step:
        return embedding, nodes, connection, classes, nclusters, node_indices, indices
    """
    STEP 4: Try to explain each context/cluster using Context Description Approximation (CDA). This can work with
    the original features or with interaction terms (representing or/and relations).
    """
    cid = [c for c in range(nclusters + 1)]
    cid = cid + [100]
    if learn_mode == LEARN.UNSUPERVISED:
        explainnname = ".unsupervised_explain"
    else:
        explainnname = ".supervised_explain"
    if True or runall or not os.path.exists(pathname + explainnname):
        finteraction = False  # True if interaction terms are considered
        interactionAND = False  # True if the AND relation is used
        n_identity_feature = 5  # number of features/variables used to describe the cluster/context
        active_threshold = 0.01  # threshold to determine if a feature value can represent a given cluster/context
        cmap = 10 * [
            "red", "blue", "purple", "hotpink", "black", "green", "orange",
            "teal", "brown", "lightsteelblue", "gray", "lime", "coral", "plum",
            "gold", "c", "tomato", "blueviolet", "darkseagreen"
        ]
        if return_step == None:
            plot_embedding_space(embedding,
                                 labels=indices,
                                 label_index=cid,
                                 lnames=["context__# " + str(c) for c in cid],
                                 data_name="pointcontext_" + dataset)
            cluster_sizes = plot_embedding_space(
                embedding,
                labels=indices,
                label_index=cid,
                lnames=["context__# " + str(c) for c in cid],
                data_name="highlightcontext_" + dataset)
            cluster_id_ranked_by_size = (-np.array(cluster_sizes)).argsort()
        poly = PolynomialFeatures(interaction_only=True, include_bias=False)
        cluster_explainer_dict = {}
        if nclusters > 1:
            print_output("Explaining contexts ...")
            xcluster_id = np.zeros(embedding.shape[0])
            xcluster_id_details = np.zeros((embedding.shape[0], nclusters))
            outputs = np.zeros((nclusters, len(feature_names)))
            cluster_characteristic_dict = {}
            feature_names_I = None
            XX = X_norm
            feature_names = [
                ff.replace("ubar", "_").replace("dot", ".")
                for ff in feature_names
            ]
            if finteraction:
                if not interactionAND:
                    XX = poly.fit_transform(1 - X_norm)
                    XX = 1 - XX
                else:
                    XX = poly.fit_transform(X_norm)
                feature_names_I = str(poly.get_feature_names())
                for fi in range(nfeatures):
                    feature_names_I = feature_names_I.replace(
                        "'x" + str(fi) + "'", feature_names[fi])
                    feature_names_I = feature_names_I.replace(
                        "'x" + str(fi) + " ", feature_names[fi] + " ")
                    feature_names_I = feature_names_I.replace(
                        " x" + str(fi) + "'", " " + feature_names[fi])
                if not interactionAND:
                    feature_names_I = feature_names_I.replace("[", "").replace(
                        "]", "").replace("'",
                                         "").replace(", ",
                                                     ",").replace(" ", " or ")
                else:
                    feature_names_I = feature_names_I.replace("[", "").replace(
                        "]",
                        "").replace("'",
                                    "").replace(", ",
                                                ",").replace(" ", " and ")
                feature_names_I = feature_names_I.split(",")
                feature_names = feature_names_I
                outputs = np.zeros((nclusters, len(feature_names)))
            # for each cluster/context, repetitive patterns will be determined
            for i in range(nclusters):
                cluster_id = i  #cluster_id_ranked_by_size[i]
                print_output("Context #" + str(cluster_id + 1))
                Xc = XX[indices == cluster_id + 1]
                from scipy.stats import iqr
                for fi in range(len(feature_names)):
                    outputs[cluster_id][fi] = min(
                        np.sum(Xc[::, fi]) / len(Xc[::, fi]),
                        1 - np.sum(Xc[::, fi]) / len(Xc[::, fi]))
                true_features = []
                false_features = []
                numeric_features = []
                impure_features = []
                ranked_features = np.argsort(outputs[cluster_id])
                for fi in ranked_features:
                    if outputs[cluster_id][fi] <= active_threshold:
                        (values, counts) = np.unique(Xc[::, fi],
                                                     return_counts=True)
                        ind = np.argmax(counts)
                        val = values[ind]
                        if val == 1.0:
                            true_features.append(fi)
                        elif val == 0.0:
                            false_features.append(fi)
                        else:
                            numeric_features.append(fi)
                    else:
                        impure_features.append(
                            (fi, np.min(Xc[::, fi]), np.max(Xc[::, fi]),
                             np.average(Xc[::, fi])))
                nzeros = len(feature_names) - np.count_nonzero(
                    outputs[cluster_id])
                mask = np.ones((embedding.shape[0], ), dtype=bool)
                countf = 0
                print_output("\tTrue Features")
                count = 0
                filter_true = []
                for fi in true_features:
                    if countf >= n_identity_feature:
                        break
                    countf += 1
                    count += 1
                    fmask = XX[::, fi] == 1.0
                    mask = mask & fmask
                    filter_true.append(fi)
                if count > 0:
                    print_output("\t\t" + str(
                        sorted([
                            feature_names[ii] for ii in true_features[:count]
                        ])))
                print_output("\tFalse Features")
                count = 0
                filter_false = []
                for fi in false_features:
                    if countf >= n_identity_feature:
                        break
                    countf += 1
                    count += 1
                    fmask = XX[::, fi] == 0.0
                    mask = mask & fmask
                    filter_false.append(fi)
                true_features, false_features = filter_true, filter_false
                cluster_explainer_dict[cluster_id] = (finteraction,
                                                      true_features,
                                                      false_features,
                                                      numeric_features,
                                                      impure_features)
                if count > 0:
                    print_output("\t\t" + str(
                        sorted([
                            feature_names[ii] for ii in false_features[:count]
                        ])))
                print_output("\tNumeric Features")
                count = 0
                for fi in numeric_features:
                    if countf >= n_identity_feature:
                        break
                    countf += 1
                    count += 1
                if count > 0:
                    print_output("\t\t" +
                                 str([(feature_names[ii[0]], ii[1], ii[2])
                                      for ii in numeric_features[:count]]))
                xcluster_id_details[mask, cluster_id] = 1
                print_output("\t" + 20 * '-')
            print_output("\t" + 20 * '=')
            print_output("")

        last_step = STEP.CONTEXT_EXPLAINED
        pickle.dump(last_step, open(pathname + ".notes", "wb"))
        pickle.dump(
            (cluster_explainer_dict, xcluster_id_details, feature_names),
            open(pathname + explainnname, "wb"))
    else:
        print_output("\tLoad explainer from " + pathname + explainnname)
        cluster_explainer_dict, xcluster_id_details, feature_names = pickle.load(
            open(pathname + explainnname, "rb"))
    current_step = STEP.CONTEXT_EXPLAINED
    if current_step == return_step:
        return embedding, nodes, connection, classes, nclusters, node_indices, indices, cluster_explainer_dict, xcluster_id_details, feature_names

    run_time = time.time() - t0
    print_output('Run in %.3f s' % run_time)

    print_output("Complete!!!")
Example #44
0
import numpy as np
from sklearn.neighbors import NearestNeighbors

def sort_min_diff(amat):
    # Reorder the rows of amat by increasing Manhattan distance from the most
    # "central" row (the one with the smallest total distance to all others).
    mb = NearestNeighbors(n_neighbors=len(amat), metric='manhattan').fit(amat)
    distances, indices = mb.kneighbors(amat)
    smallest = np.argmin(distances.sum(axis=1))
    return amat[indices[smallest]]