Example No. 1
def gower_distance(X):
    """
    This function expects a pandas DataFrame as input.
    The data frame should contain the features along the columns. Based on these features, a
    distance matrix is returned containing the pairwise Gower distance between the rows.
    All variables of object dtype are treated as nominal variables and the others are treated as
    numeric variables.

    Distance metrics used for:

    Nominal variables: Dice distance (https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient)
    Numeric variables: Manhattan distance normalized by the range of the variable (https://en.wikipedia.org/wiki/Taxicab_geometry)
    """
    individual_variable_distances = []

    for i in range(X.shape[1]):
        feature = X.iloc[:,[i]]
        if feature.dtypes[0] == object:  # np.object was removed in NumPy 1.24
            feature_dist = DistanceMetric.get_metric('dice').pairwise(pd.get_dummies(feature))
        else:
            feature_dist = DistanceMetric.get_metric('manhattan').pairwise(feature) / np.ptp(feature.values)
            
        individual_variable_distances.append(feature_dist)

    return np.array(individual_variable_distances).mean(0)
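A minimal usage sketch, assuming the function above together with its module-level imports (numpy as np, pandas as pd, and sklearn's DistanceMetric):

import pandas as pd

df = pd.DataFrame({
    'color': ['red', 'blue', 'red'],   # object dtype -> Dice distance on one-hot dummies
    'price': [10.0, 20.0, 15.0],       # numeric -> Manhattan distance scaled by the range
})
print(gower_distance(df))              # 3x3 symmetric matrix of pairwise Gower distances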
Example No. 2
def calc_mahalanobis(x, y, n_neighbors):
    from sklearn.neighbors import DistanceMetric, NearestNeighbors
    DistanceMetric.get_metric('mahalanobis', V=np.cov(x))

    nn = NearestNeighbors(n_neighbors=n_neighbors,
                          algorithm='brute',
                          metric='mahalanobis',
                          metric_params={'V': np.cov(x)})
    return nn.fit(x).kneighbors(y)
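If x is an (n_samples, n_features) array, np.cov(x) yields an n_samples-by-n_samples matrix, so a sketch of the same idea with the covariance taken over features (rowvar=False, an assumption about the intended data layout) would be:

import numpy as np
from sklearn.neighbors import NearestNeighbors

def calc_mahalanobis_over_features(x, y, n_neighbors):
    # Covariance across features so V has shape (n_features, n_features)
    V = np.cov(x, rowvar=False)
    nn = NearestNeighbors(n_neighbors=n_neighbors,
                          algorithm='brute',
                          metric='mahalanobis',
                          metric_params={'V': V})
    return nn.fit(x).kneighbors(y)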
Example No. 3
File: knn.py Project: salceson/mro
def mahalonobis(X):
    cov = np.cov(X, rowvar=0)
    try:
        metric = DistanceMetric.get_metric('mahalanobis', V=cov) if X.shape[0] > 1 \
            else DistanceMetric.get_metric('euclidean')
    except LinAlgError:
        metric = DistanceMetric.get_metric('euclidean')

    def distance(x, y):
        return metric.pairwise([x], [y])[0][0]

    return distance
Example No. 4
    def get_full_metric(self, train_pairs):
        train_pairs_flat = [item for subtuple in train_pairs for item in subtuple]
        
        pca = PCA(n_components = self.pca_components)
        pca.fit(train_pairs_flat)

        train_pairs_pca_flat = pca.transform(train_pairs_flat)

        train_pairs_pca = list()

        for i in xrange(0, len(train_pairs_pca_flat), 2):
            a = i 
            b = i + 1
            train_pairs_pca.append((train_pairs_pca_flat[a],
              train_pairs_pca_flat[b]))
        
        ys = ys_from_pairs(train_pairs_pca)

        file_id = str(random.random())[2:]

        save_cvx_params(ys, file_id)
        run_cvx(file_id)
        M = load_cvx_result(file_id)

        dist = DistanceMetric.get_metric('mahalanobis', VI = M)

        return dist, M, pca
Example No. 5
def dist(X, Y, distance_function = "euclidean"):
    """calculate X, Y distance matrix
    [Args]
    ------
    X : m samples
    Y : n samples
    distance_function : name of the distance metric to use (see the built-in options below)
    
    [Returns]
    ---------
    distance_matrix: m * n distance matrix
    
    
    The following metrics are built in. Default = "euclidean"
    
    "euclidean"    EuclideanDistance    sqrt(sum((x - y)^2))
    "manhattan"    ManhattanDistance    sum(|x - y|)
    "chebyshev"    ChebyshevDistance    sum(max(|x - y|))
    "minkowski"    MinkowskiDistance    sum(|x - y|^p)^(1/p)
    "wminkowski"    WMinkowskiDistance    sum(w * |x - y|^p)^(1/p)
    "seuclidean"    SEuclideanDistance    sqrt(sum((x - y)^2 / V))
    "mahalanobis"    MahalanobisDistance    sqrt((x - y)' V^-1 (x - y))
    """
    distance_calculator = DistanceMetric.get_metric(distance_function)
    return distance_calculator.pairwise(X, Y)
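A minimal usage sketch (numpy assumed):

import numpy as np

X = np.array([[0.0, 0.0], [1.0, 1.0]])   # m = 2 samples
Y = np.array([[1.0, 0.0]])               # n = 1 sample
print(dist(X, Y, "manhattan"))           # [[1.] [1.]] -- shape (m, n)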
Example No. 6
File: knn.py Project: salceson/mro
def euclid(_):
    metric = DistanceMetric.get_metric('euclidean')

    def distance(x, y):
        return metric.pairwise([x], [y])[0][0]

    return distance
Example No. 7
    def run_single_trial(self, train_pairs, test_pairs, train_tune_data, test_tune_data):
        print "Running PCA..."
        train_pairs_pca, test_pairs_pca = self.fit_pca(train_pairs, test_pairs)
        ys = ys_from_pairs(train_pairs_pca)

        file_id = str(random.random())[2:]

        save_cvx_params(ys, file_id)
        run_cvx(file_id)
        M = load_cvx_result(file_id)
        dist = DistanceMetric.get_metric('mahalanobis', VI = M)
        train_a_sections = [x[0] for x in train_pairs_pca]
        train_b_sections = [x[1] for x in train_pairs_pca]
        test_a_sections = [x[0] for x in test_pairs_pca]
        test_b_sections = [x[1] for x in test_pairs_pca]

        train_given_sections = train_a_sections
        train_to_match_sections = train_b_sections
        test_given_sections = test_a_sections
        test_to_match_sections = test_b_sections
        if self.match_a_to_b:
            train_given_sections = train_b_sections
            train_to_match_sections = train_a_sections
            test_given_sections = test_b_sections
            test_to_match_sections = test_a_sections

        print "Constructing BallTrees..."
        train_bt = BallTree(train_to_match_sections, metric=dist)
        test_bt = BallTree(test_to_match_sections, metric=dist)

        train_top_fraction = int(len(train_given_sections) * self.correct_within_top_fraction)
        test_top_fraction = int(len(test_given_sections) * self.correct_within_top_fraction)

        print "Querying the BallTrees..."
        train_result = train_bt.query(train_given_sections, train_top_fraction)
        test_result = test_bt.query(test_given_sections, test_top_fraction)

        print "Looking at correctness of results..."
        train_correct = sum([int(i in train_result[1][i]) for i in xrange(len(train_given_sections))])
        test_correct = sum([int(i in test_result[1][i]) for i in xrange(len(test_given_sections))])

        print "Finding indices of correct matches..."
        test_result_full = test_bt.query(test_given_sections, len(test_given_sections))
        def default_index(lst, i):
          ind = -1
          try:
            ind = lst.index(i)
          except:
            pass
          return ind
        test_indices = [default_index(list(test_result_full[1][i]), i) for i in xrange(len(test_given_sections))]
        test_indices = [x for x in test_indices if x != -1]

        with open("successful_tunes_{}".format(file_id), 'w') as successful_tunes_f:
          for i, index in enumerate(test_indices):
            if index == 0:
              successful_tunes_f.write(str(test_tune_data[i]) + '\n\n')

        return [[train_correct, len(train_given_sections)],
            [test_correct, len(test_given_sections)]], test_indices
Example No. 8
def standardizedEulideanDistance(wide, p):
    """ Calculate the standardized Euclidean distance and return an array of distances to the center and a matrix of pairwise distances.

    :Arguments:
        :type wide: pandas.DataFrame
        :param wide: A wide formatted data frame with samples as columns and compounds as rows.

        :type p: float
        :param p: Percentile (between 0 and 1) used when computing the distance cutoffs.

    :Returns:
        :return: Return 4 pd.DataFrames with SED values and cutoffs.
        :rtype: pd.DataFrames
    """

    # Estimated Variance from the data
    varHat = wide.var(axis=1, ddof=1)
    varHat[varHat==0] = 1
    dist = DistanceMetric.get_metric('seuclidean', V=varHat)

    # Column means
    colMean = wide.mean(axis=1)

    # Calculate the standardized Euclidean Distance from all samples to the center

    SEDtoCenter = dist.pairwise(wide.values.T, pd.DataFrame(colMean).T)
    SEDtoCenter = pd.DataFrame(SEDtoCenter, columns = ['SED_to_Center'], index = wide.columns)
    
    # Calculate the pairwise standardized Euclidean Distance of all samples
    SEDpairwise = dist.pairwise(wide.values.T)
    SEDpairwise = pd.DataFrame(SEDpairwise, columns = wide.columns, index = wide.columns)
    for index, row in SEDpairwise.iterrows():
        SEDpairwise.loc[index, index] = np.nan
    
    # Calculate cutoffs
    # For SEDtoCenter: 
    #   Beta: sqrt((p-1)^2/p*(sum of n iid Beta(1/2, p/2)));        (It's the exact distribution.)
    #   Normal: sqrt(N((p-1)/p*n, 2*(p-2)*(p-1)^2/p^2/(p+1)*n));    (It's normal approximation. Works well when n is large.)
    #   Chisq: sqrt((p-1)/p*Chi-sq(n));                             (It's Chi-sq approximation. Works well when p is decent and p/n is not small.)
    # For SEDpairwise:
    #   Beta: sqrt(2*(p-1)*(sum of n iid Beta(1/2, p/2)));
    #   Normal: sqrt(N(2*n, 8*(p-2)/(p+1)*n));
    #   Chisq: sqrt(2*Chi-sq(n));
    # where n = # of compounds and p = # of samples
    pSamples  = float(wide.shape[1])
    nFeatures = float(wide.shape[0])
    nIterate  = 20000 #100000
    #p = 0.95
    betaP     = np.percentile(pd.DataFrame(stats.beta.rvs(0.5, 0.5*(pSamples-2), size=nIterate*nFeatures).reshape(nIterate, nFeatures)).sum(axis=1), p*100)
    betaCut1  = np.sqrt((pSamples-1)**2/pSamples*betaP)
    normCut1  = np.sqrt(stats.norm.ppf(p, (pSamples-1)/pSamples*nFeatures, np.sqrt(2*nFeatures*(pSamples-2)*(pSamples-1)**2/pSamples**2/(pSamples+1))))
    chisqCut1 = np.sqrt((pSamples-1)/pSamples*stats.chi2.ppf(p, nFeatures))
    betaCut2  = np.sqrt((pSamples-1)*2*betaP)
    normCut2  = np.sqrt(stats.norm.ppf(p, 2*nFeatures, np.sqrt(8*nFeatures*(pSamples-2)/(pSamples+1))))
    chisqCut2 = np.sqrt(2*stats.chi2.ppf(p, nFeatures))
    cutoff1   = pd.DataFrame([[betaCut1, normCut1, chisqCut1]], columns=['Beta(Exact)', 'Normal', 'Chi-sq'])
    cutoff2   = pd.DataFrame([[betaCut2, normCut2, chisqCut2]], columns=['Beta(Exact)', 'Normal', 'Chi-sq'])

    # TODO: Create a flag based on values greater than one of the cutoffs.
    return SEDtoCenter, cutoff1, SEDpairwise, cutoff2
Example No. 9
def example2():
    """using customized distance
    """
    from HSH.Misc.shgeo import dist
    def earthdist(x, y): # latitude, longitude earth surface distance
        return dist((x[0], x[1]), (y[0], y[1]))
    
    dist_cal = DistanceMetric.get_metric(earthdist)
    train = np.array([[32.5, 101.0], [32.5, 102.0]])
    test = np.array([[31.5, 101.0], [39.5, 101.0]])
    print(dist_cal.pairwise(train, test))
Example No. 10
def example1():
    dist = DistanceMetric.get_metric("euclidean")
    train = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    test = np.array([[0.5, 0.5], [-0.5, -0.5]])
    
    distance_matrix = dist.pairwise(train, test)
    print(distance_matrix) # distance_matrix
    
    reduced_distance_matrix = dist.dist_to_rdist(distance_matrix) # reduced_distance_matrix
    print(reduced_distance_matrix) # for euclidean, it's squared distance_matrix
    
    print(dist.rdist_to_dist(reduced_distance_matrix))
Example No. 11
 def load_model(self):
     if self.file_cache and os.path.isfile(self.file_cache):
         self._log.debug("Loading mode: %s", self.file_cache)
         with numpy.load(self.file_cache) as cache:
             tail = tuple(cache['tail'])
             s = (cache['data_arr'], cache['idx_array_arr'],
                  cache['node_data_arr'], cache['node_bounds_arr']) +\
                 tail + (DistanceMetric.get_metric('hamming'),)
         #: :type: sklearn.neighbors.BallTree
         self.bt = BallTree.__new__(BallTree)
         self.bt.__setstate__(s)
         self._log.debug("Loading mode: Done")
Example No. 12
def distance(X, distance_measure='euclidean'):

    X = np.array(X)

    if distance_measure in SKLEARN_METRICS:
        distance_ = DistanceMetric.get_metric(distance_measure).pairwise(X)
    elif distance_measure == 'pearson':
        distance_ = np.corrcoef(X)
    else:
        distance_ = None

    return distance_
Example No. 13
 def entropy(x,k=3,base=np.exp(1),intens=1e-10):
     """ The classic K-L k-nearest neighbor continuous entropy estimator
         x should be a list of vectors, e.g. x = [[1.3],[3.7],[5.1],[2.4]]
         if x is a one-dimensional scalar and we have four samples
     """
     assert k <= len(x)-1, "Set k smaller than num. samples - 1"
     d = len(x[0])
     N = len(x)
     x +=  intens*nr.rand(N,d)
     tree = KDTree(x, metric=DistanceMetric.get_metric("minkowski",p=np.float64('inf') ))
     nn = tree.query(x,k+1)[0][:,k]   # no need to reshape with new query_radius method
     const = digamma(N)-digamma(k) + d*log(2)
     return (const + d*np.mean(map(log,nn)))/log(base)
Example No. 14
    def find_computed_cluster_metrics(self):
        """Initialises cluster metric computation over every cluster that is
        found by the given clustering algorithm.
        """
        for cluster in self.computed_clusters:
            cluster.compute_metrics(self.original_corpus,
                                    self.original_article_pos)

        centroid_locs = [x.centroid for x in self.computed_clusters]
        dist = DistanceMetric.get_metric('euclidean')
        dist_pair = dist.pairwise(centroid_locs)
        self.max_centroid_dist = max(list(itertools.chain.from_iterable(
            dist_pair)))
Example No. 15
    def __init__(self, proc, Xss, yss, valid_set=0.1, validation_set=None):
        self.seen_states = set()
        self.state_set = []
        self.proc = proc
        self.valid_set = 0.1
        self.surr_loss = DistanceMetric.get_metric('hamming')

        if validation_set is None:
            self._split(Xss, yss)
        else:
            self.Xss = Xss
            self.yss = yss
            self.valid_Xss, self.valid_yss = validation_set
Example No. 16
def __max_score_mapping(rr, predicted, test, max_angle=1.0e-2):
    angle = DistanceMetric.get_metric("pyfunc", func=spherical_angle)
    d = angle.pairwise(predicted, test)

    # Each true sample maps to closest
    test_mapping = np.zeros(shape=(test.shape[0],), dtype=float)

    for i in xrange(test.shape[0]):
        if np.any(d[:, i] < max_angle):
            close_predictions = d[:, i] < max_angle
            scores = [rr(p) for p in predicted[close_predictions, :]]
            test_mapping[i] = np.max(scores)

    return test_mapping
Example No. 17
def update_prediction(prediction, real_pic, metric_name='euclidean'):
    """
    Update a prediction after receiving the actual picture from the webcam.

    Parameters
    ----------
    prediction : Prediction
        The model object of the prediction to update
    real_pic : Picture
        The model object of the actual picture received

    Return
    ------
    float : the prediction error
    """
    pred_pic = prediction.as_picture()
    cam_id = prediction.params.webcam.webcam_id
    if metric_name == 'wminkowski-pca':
        with webcam_fs.get_dataset(cam_id) as dataset:
            if 'pca' not in dataset.imgset.feature_sets:
                raise ValueError("""wminkowski-pca cannnot be used
                                    without a PCA feature set""")

            pca_extractor = dataset.imgset.feature_sets['pca'].extractor
            weights = pca_extractor.pca.explained_variance_ratio_
            pred_data = pca_extractor.extract(pred_pic.pixels)
            real_data = pca_extractor.extract(real_pic.pixels)
            metric = DistanceMetric.get_metric('wminkowski', p=2, w=weights)
    else:
        pred_data = pred_pic.pixels
        real_data = real_pic.pixels
        metric = DistanceMetric.get_metric(metric_name)

    error = metric.pairwise([pred_data], [real_data])[0]
    prediction.error = error
    prediction.save()
    return error
Example No. 18
def entropy(data, ball='euclidean', k=1, units='nats'):
    """
    Estimates the entropy of the given data using the k-nearest neighbors method

    input
    -----
    data (nd-array):
        An (n by p) matrix containing n samples of p-dimensional data

    ball (string):
        Which ball (e.g. l1, euclidean, etc.) to use when computing the volume.
        Acceptable strings include:
            'l1'   : l1 or Manhattan distance
            'l2'   : l2 or Euclidean distance; default
            'linf' : l-infinity or Chebyshev distance

    k (integer):
        How many nearest-neighbors to use when computing radii. Must be at least 1.

    units (string):
        Which unit the entropy output has.
        Acceptable strings include:
            'nats' : base e
            'bits' : base 2

    """
    
    # Get number of samples and dimensionality
    (n,p)  = data.shape
    
    # Determine radii and volumes for a given metric space
    metric = getball(ball)
    if metric == 1:
        m = 'manhattan'
    elif metric == 2:
        m = 'euclidean'
    elif metric == inf:
        m = 'chebyshev'
        
    dist  = DistanceMetric.get_metric(m)
    D_mat = dist.pairwise(data)
    D_mat.sort(axis=1)
    radii = D_mat[:,k]
    Vs    = volume(radii, ball=str(metric), dimension=p)
    
    if units.lower() == 'nats':
        return sum([np.log(vol) for vol in Vs])/float(n) + np.log(n) - L(k - 1) + 0.577215665
    if units.lower() == 'bits':
        return sum([np.log2(vol) for vol in Vs])/float(n) + np.log2(n) - L(k - 1) + 0.577215665
Example No. 19
def __mappings(predicted, test, max_angle=1.0e-2):
    angle = DistanceMetric.get_metric("pyfunc", func=spherical_angle)
    d = angle.pairwise(predicted, test)

    # Each true sample maps to closest
    test_mapping = np.zeros(shape=(test.shape[0],), dtype=int)
    predicted_mapping = np.zeros(shape=(predicted.shape[0],), dtype=int)

    for i in xrange(test.shape[0]):
        test_mapping[i] = 1 if np.any(d[:, i] < max_angle) else 0

    for i in xrange(predicted.shape[0]):
        predicted_mapping[i] = 1 if np.any(d[i, :] < max_angle) else 0

    return predicted_mapping, test_mapping
Example No. 20
    def mi_LNC(x,y,k=5,base=np.exp(1),alpha=0.25,intens = 1e-10,metric='minkowski',p=np.float64('inf')):
        '''The mutual information estimator by PCA-based local non-uniform correction (LNC)
           ith row of X represents ith dimension of the data, e.g. X = [[1.0,3.0,3.0],[0.1,1.2,5.4]], if X has two dimensions and we have three samples
           alpha is a threshold parameter related to k and d(dimensionality), please refer to our paper for details about this parameter
        '''
        #N is the number of samples
        N = x.shape[0]

        #First Step: calculate the mutual information using the Kraskov mutual information estimator
        #adding small noise to X, e.g., x<-X+noise
        x += intens*nr.rand(x.shape[0],x.shape[1])
        y += intens*nr.rand(x.shape[0],x.shape[1])
        points = np.hstack((x,y))

        tree = KDTree(points, metric=DistanceMetric.get_metric(metric, p=p))
        try:
            dvec, knn_idx = tree.query(points, k+1)   # no need to reshape with new query_radius method
        except ValueError:
          return (float("NaN"))

        a = MI.avgdigamma(x,dvec[:,-1]*x.shape[1]/points.shape[1], metric=metric, p=p)
        b = MI.avgdigamma(y,dvec[:,-1]*y.shape[1]/points.shape[1], metric=metric, p=p)
        c = digamma(k)
        d = digamma(len(x))

        # a,b,c,d = MI.avgdigamma(x,dvec), MI.avgdigamma(y,dvec), digamma(k), digamma(len(x))
        # print("ee_acc: %s, %s, %s, %s" %( a,b,c,d))
        ret = (-a-b+c+d)/np.log(base)

        # LNC correction
        logV_knn = np.sum(np.log(np.abs(points - points[knn_idx[:,-1],:])), axis=1)
        logV_projected = np.zeros(logV_knn.shape)
        for i in range(points.shape[0]):
            knn_points = points[knn_idx[i,:],:]
            knn_centered = knn_points - points[i,:]
            u,s,v = la.svd(knn_centered)
            knn_proj = knn_centered.dot(v.T)
            max_dims = np.max(np.abs(knn_proj), axis=0)   # max-norm per dimension
            logV_projected[i] = np.sum(np.log(max_dims))

        diff = logV_projected - logV_knn
        if (alpha>1): alpha = 1
        diff[diff >= log(alpha)] = 0
        e = -np.sum(diff) / N

        return (ret + e)/log(base)
Example No. 21
def distance_from_most_visited_place(place, user):
    q = select([func.count(),visits_10min.c.placeid]).where(visits_10min.c.userid == user).group_by(visits_10min.c.placeid).order_by(func.count().desc())
    most_visited_places = [r[1] for r in connection.execute(q).fetchall()]
    def get_lat_long(place_q):
        try:
            return connection.execute(select([places_location.c.longitude, places_location.c.latitude]).where(and_(places_location.c.placeid == place_q, places_location.c.userid == user))).fetchall()[0]
        except Exception as e:
            return None
            
    dist = DistanceMetric.get_metric('haversine')
    X = []
    X.append(get_lat_long(place))
    for p in most_visited_places:
        ret = get_lat_long(p)
        if ret is not None:
            X.append((ret[0], ret[1]))
            break
    return dist.pairwise(X)[0][1]
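Worth noting: sklearn's 'haversine' metric expects [latitude, longitude] pairs in radians and returns the central angle on the unit sphere, while the query above selects (longitude, latitude), presumably in degrees. A hedged conversion sketch:

import numpy as np
from sklearn.metrics import DistanceMetric  # sklearn.neighbors.DistanceMetric on older releases

lon_lat_deg = np.array([[21.0122, 52.2297],    # Warsaw, stored as (longitude, latitude) in degrees
                        [12.5113, 41.8919]])   # Rome
lat_lon_rad = np.radians(lon_lat_deg[:, ::-1]) # reorder to (lat, lon) and convert to radians
angle = DistanceMetric.get_metric('haversine').pairwise(lat_lon_rad)[0, 1]
print(6371.0 * angle)                          # great-circle distance in km, roughly 1315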
Example No. 22
def calcMDS(pltnum, flag, dmetric):
    if flag == 1:
        clf = PCA(n_components=5)
        Y = clf.fit_transform(X)
        title  = 'PCA-MDS'
    elif flag == 2:
        clf = TruncatedSVD(n_components=5)
        Y = clf.fit_transform(X)
    else:
        Y = X
        title = 'MDS DistanceMetric: ' + str(dmetric)
    dist = DistanceMetric.get_metric(dmetric)
    Y    = dist.pairwise(Y)
    # Y = euclidean_distances(Y)
    mds = manifold.MDS(n_components=2, dissimilarity='precomputed')#, init='pca', random_state=0)
    Y = mds.fit_transform(Y)
    for i in range(1, 3):
        mdsPlot(int(str(pltnum) + str(i)), i, Y, title)
Example No. 23
	def MDS(self,typeof='classic',dist=False,groups=None,dpi=300,textsize=10,interactive=False,
	        samemarker=False,markersize=8,numbered=False,legend=False,of='pdf',rotate=0,MD=False):
		'''
		Perform Multidimensional Scaling, either classic (PCoA) or non-metric.
		If you have the upper triangle of a distance matrix as a dictionary,
		pass the dictionary as dist.
		'''
		# Rotation instance
		self.clf = PCA(n_components=self.ncomp)		

		seed = np.random.RandomState(seed=3)

		if typeof == 'classic': 
			metric = True
			self.type = 'cMDS'
		else: 
			metric = False
			self.type = "nMDS"

		if dist:
			similarities=self.dict2array2matrix(dist)
		else:
			#similarities = euclidean_distances(self.data)
			dist = DistanceMetric.get_metric('euclidean')
			similarities = dist.pairwise(self.data)
		# Initiate multidimensional scaling
		mds = manifold.MDS(n_components=self.ncomp, metric = metric, max_iter=3000, eps=1e-9, 
		                   random_state=seed, dissimilarity="precomputed", n_jobs=-1)

		#fit the data the MDS
		pos = mds.fit(similarities).embedding_
		if typeof != 'classic': pos = mds.fit_transform(similarities, init=pos)

		# Rescale the data
		pos *= np.sqrt((np.array(self.data)** 2).sum()) / np.sqrt((pos ** 2).sum())

		# Rotate the data
		self.fit = self.clf.fit_transform(pos)
		self.Plot(dpi=dpi,textsize=textsize,interactive=interactive,samemarker=samemarker,
		          markersize=markersize,numbered=numbered,legend=legend,of=of,rotate=rotate,
		          groups=groups,MD=MD)
Example No. 24
    def compute_metrics(self, corpus, article_pos):
        """Computes metrics for the given cluster. Metrics computed are:
        diameter, radius, centroid, closest article to centroid, the distance
        of the closest article to the centroid.

        Args:
            corpus: A corpus in LSI space
            article_pos (dict): Maps the article id to the actual
                                positions of the article in the corpus
        """
        dist_corpus = [corpus[article_pos[x]] for x in self.articles_id]

        # Centroid calculation
        self.centroid = np.average(dist_corpus, axis=0)

        # Diameter calculation
        dist = DistanceMetric.get_metric('euclidean')
        dist_pair = dist.pairwise(dist_corpus)
        self.diameter = max(list(itertools.chain.from_iterable(dist_pair)))

        # Radius calculation
        dist_corpus.append(self.centroid)
        dist_pair = dist.pairwise(dist_corpus)
        centroid_dist = [x for x in dist_pair[-1] if x > 0]
        if len(centroid_dist) > 0:
            self.radius = max(centroid_dist)

            # Closest article computation
            closest_article = self.articles_id[0]
            min_dist = self.radius
            tmp_content = []

            for k, id in enumerate(self.articles_id):
                if centroid_dist[k] < min_dist:
                    closest_article = id
                    min_dist = centroid_dist[k]
                    tmp_content = self.data[k]

            self.closest_article_id = closest_article
            self.closest_article_distance = min_dist
            self.closest_article_content = tmp_content
Example No. 25
    def predict_proba(self, X):

        # Check if fit has been called by confirming that the distances_ dictionary has been set up
        check_is_fitted(self, ['distances_'])

        # Check that the input features match the type and shape of the training features
        X = check_array(X)

        # Initialise an array to store the prediction scores generated
        predictions = np.zeros((len(X), len(self.classes_)))

        distance_metric_model = DistanceMetric.get_metric(self.dist_param)

        unique = []
        dist = []
        for i in self.distances_.keys():
            unique.append(i)
            dist.append(self.distances_.get(i))

        # Iterate through the query instances in the query dataset
        predictions_prob = []
        my_dict = dict(zip(unique, dist))

        for instance in X:
            prob_dist = []
            for item in my_dict:
                current_label = my_dict[item]
                array = np.vstack((current_label, instance))
                dist = distance_metric_model.pairwise(array)

                var = np.amin(np.array(dist)[dist != np.amin(dist)])
                prob_dist.append(1 / var)

            sum_value = sum(prob_dist)
            dict_model_prob = dict(zip(unique, (prob_dist / sum_value)))

            predictions_prob.append(dict_model_prob)

        return predictions_prob
Example No. 26
    def _test(y_pred, y, batch_size):
        def update_fn(engine, batch):
            idx = (engine.state.iteration - 1) * batch_size
            y_true_batch = np_y[idx:idx + batch_size]
            y_pred_batch = np_y_pred[idx:idx + batch_size]
            return torch.from_numpy(y_pred_batch), torch.from_numpy(
                y_true_batch)

        engine = Engine(update_fn)

        m = CanberraMetric()
        m.attach(engine, "cm")

        np_y = y.numpy().ravel()
        np_y_pred = y_pred.numpy().ravel()

        canberra = DistanceMetric.get_metric("canberra")

        data = list(range(y_pred.shape[0] // batch_size))
        cm = engine.run(data, max_epochs=1).metrics["cm"]

        assert canberra.pairwise([np_y_pred, np_y])[0][1] == pytest.approx(cm)
Example No. 27
    def fit_predict(self, X):
        # definition of  distance metric
        dist = DistanceMetric.get_metric(self.metric)
        # initialization of KDTree with corresponding metric
        tree = KDTree(X, metric=dist)
        cluster_counter = -1
        X = self.data2Point(X)
        for i, point in enumerate(X):
            if point.label == -2:
                neigh_ind, _ = tree.query_radius([point.coordinate],
                                                 r=self.eps,
                                                 return_distance=True,
                                                 sort_results=True)
                neigh_ind = neigh_ind[0]
                # mark points with fewer than min_samples neighbors as noise/outliers
                if neigh_ind.size < self.min_samples:
                    X[i].label = -1
                else:
                    cluster_counter += 1
                    X[i].label = cluster_counter
                    neigh_ind = neigh_ind[1:].tolist()
                    for j in neigh_ind:
                        # we mark neighbors that don't belong to any cluster as current cluster
                        if X[j].label < 0:
                            X[j].label = cluster_counter
                            q_neigh_ind, _ = tree.query_radius(
                                [X[j].coordinate],
                                r=self.eps,
                                return_distance=True,
                                sort_results=True)
                            q_neigh_ind = q_neigh_ind[0].tolist()
                            # we add current node as neighbor if density of neighbors is high enough
                            if len(q_neigh_ind) >= self.min_samples:
                                new_el = list(
                                    set(q_neigh_ind).difference(
                                        set(neigh_ind)))
                                neigh_ind.extend(new_el)

        return np.array([x.label for x in X])
Example No. 28
    def closest_target(self, agent_states):
        """
        Compute the nearest neighbor based on the Chebyshev distance
        """
        agent_states = np.array(list(agent_states.values()))
        all_together = np.vstack((self.current_state, agent_states))
        dist = DistanceMetric.get_metric('chebyshev')
        distances = dist.pairwise(all_together)

        # Compute distances for two dirs (Periodic Boundary) - row 0 = pred
        id_dist_1 = np.argsort(distances[0, :])[1]
        id_dist_2 = np.flip(np.argsort(self.obs_space_size -
                                       distances[0, :]))[1]

        if id_dist_1 == id_dist_2:
            target_agent_id = id_dist_1
        else:
            d1 = distances[0, id_dist_1]
            d2 = self.obs_space_size - distances[0, id_dist_2]
            target_agent_id = id_dist_1 if d1 < d2 else id_dist_2
        # Subtract one (since we included pred in dist calc) to get agent_id
        return target_agent_id - 1
Example No. 29
    def _test(y_pred, y, batch_size):
        def update_fn(engine, batch):
            idx = (engine.state.iteration - 1) * batch_size
            y_true_batch = np_y[idx:idx + batch_size]
            y_pred_batch = np_y_pred[idx:idx + batch_size]
            return idx, torch.from_numpy(y_pred_batch), torch.from_numpy(
                y_true_batch)

        engine = Engine(update_fn)

        m = ManhattanDistance(output_transform=lambda x: (x[1], x[2]))
        m.attach(engine, "md")

        np_y = y.numpy()
        np_y_pred = y_pred.numpy()

        manhattan = DistanceMetric.get_metric("manhattan")

        data = list(range(y_pred.shape[0] // batch_size))
        md = engine.run(data, max_epochs=1).metrics["md"]

        assert manhattan.pairwise([np_y_pred, np_y])[0][1] == pytest.approx(md)
Example No. 30
 def predict(self, X_arr):
     """
     conventional kNN prediction that use weighted summation distance
     :param X_arr:
     :return:
     """
     from sklearn.neighbors import DistanceMetric as DM
     dis = DM.get_metric('euclidean')
     distances = []
     for i in range(X_arr.__len__()):
         X_arr[i], self.X_arr[i] = normalize(X_arr[i]), normalize(
             self.X_arr[i])  # force convert into range
         distances.append(dis.pairwise(X_arr[i], self.X_arr[i]))
     distances = np.array(distances)
     multi_dis = np.zeros((distances.shape[1], distances.shape[2]))
     for w, d in zip(self.weights, distances):
         multi_dis += w * d  # weighted distances
     sorted_ss_indicies = multi_dis.argsort()  # sort dis
     k_neighbors_lables = self.y[sorted_ss_indicies][:, :self.k]
     from scipy.stats import mode
     y_predicted, t = mode(k_neighbors_lables, axis=1)
     return y_predicted.reshape(-1)
Example No. 31
    def euclidean_distance(dataset1: pd.DataFrame, dataset2: pd.DataFrame) -> float:
        """
        Pair up the datasets and compute the euclidean distances between the sequences of values. Both datasets must
        have the same columns.

        :param dataset1: First DataFrame.
        :param dataset2: Second DataFrame.
        :return: Euclidean distance between all rows in the datasets.
        """

        dist = DistanceMetric.get_metric('euclidean')
        if not len(dataset1.index) == len(dataset2.index):
            return -1
        distance = 0

        for i in range(0, len(dataset1.index)):
            data_row1 = dataset1.iloc[:, i:i + 1].transpose()
            data_row2 = dataset2.iloc[:, i:i + 1].transpose()
            ecl_dist = dist.pairwise(data_row1, data_row2)
            distance = distance + ecl_dist

        return distance
Example No. 32
def compute_normals(_pcd,
                    height_encoded=True,
                    max_dist=0.1,
                    n_neighbors=9,
                    n_iter=20):
    _zero_array = np.array([0.0, 0.0, 0.0])

    n_neighbors += 1  # It will always find itself too
    if len(_pcd.shape) == 3:
        n_elem = _pcd.shape[0] * _pcd.shape[1]
    else:
        n_elem = _pcd.shape[0]
    pcd = _pcd.copy()
    pcd = pcd.reshape(n_elem, 3)

    tree = KDTree(pcd, leaf_size=10)
    metric = DistanceMetric.get_metric('euclidean')

    distances, indices = tree.query(pcd, k=n_neighbors)
    normals = numba_compute_normals(n_elem, indices, distances, max_dist, pcd)
    normals = normals.reshape(_pcd.shape)
    return normals
Example No. 33
def calc_distance_matrix(X, method):
    if method in ['chebyshev', 'euclidean', 'l1', 'l2']:
        DM = DistanceMetric.get_metric(method).pairwise(X)
    elif method in ['cosine']:
        DM = pairwise.cosine_distances(X)
    elif method in ['correlation', 'cityblock', 'braycurtis', 'canberra', 'hamming', 'jaccard', 'kulsinski']:
        DM = squareform(pdist(X, method))
    elif method in ['minkowski3']:
        DM = squareform(pdist(X, 'minkowski', 3))
    elif method in ['dot']:
        DM = squareform(pdist(X, lambda u, v: np.dot(u, v)))
    elif method in ['emd']:
        from scipy.stats import wasserstein_distance
        l = len(X)
        DM = np.zeros((l, l))
        for x in range(l):
            for y in range(l):
                DM[x, y] = wasserstein_distance(X[x], X[y])
    else:
        return None

    return DM
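A minimal usage sketch (assumes the module-level imports the function relies on: DistanceMetric, sklearn.metrics.pairwise, and scipy's pdist/squareform):

import numpy as np

X = np.random.rand(5, 3)
print(calc_distance_matrix(X, 'euclidean').shape)   # (5, 5)
print(calc_distance_matrix(X, 'cosine').shape)      # (5, 5)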
Example No. 34
    def __init__(self,
                 t=100,
                 n=16,
                 contamination=0.1,
                 metric='euclidean',
                 tol=1e-8,
                 verbose=False):
        super(iNNe, self).__init__()

        self.t = int(t)
        self.n = int(n)
        self.contamination = float(contamination)
        if metric == 'cosine':
            self.metric = cosine_similarity
        else:
            try:
                self.metric = DistanceMetric.get_metric(metric)
            except ValueError as e:
                raise BaseException(e)

        self.tol = float(tol)
        self.verbose = bool(verbose)
Example No. 35
    def year_lat_lon(self, x, y):
        haversine = DistanceMetric.get_metric("haversine")
        #try:
        x_year = x['year']
        x_lat = radians(x['artist_latitude'])
        x_lon = radians(x['artist_longitude'])
        y_year = y['year']
        y_lat = radians(y['artist_latitude'])
        y_lon = radians(y['artist_longitude'])
        #except:
        #    raise IOError("Problem parsing features.")
        #    return None

        rad = 6367.44
        haversine = 2 * rad * asin(
            sqrt(
                sin((y_lat - x_lat) / 2)**2 +
                cos(x_lat) * cos(y_lat) * sin((y_lon - x_lon) / 2)**2))
        norm_year = (abs(x_year - y_year)) / ((2010 - 1926) * 2)

        dist = (1 / (20003 * 2)) * haversine + norm_year
        return dist
Example No. 36
def train(num_result_images=25):
    # Convert 2D image matrix => 1D bottleneck vector
    print('\n** GENERATING BOTTLENECKS bottlenecks.csv **')

    # Setup model to convert 2D image matrix => 1D bottleneck vector
    base_model = Xception(include_top=False,
                          weights='imagenet',
                          input_shape=(img_w_size, img_h_size, 3),
                          pooling='avg')

    bottlenecks = base_model.predict(images)

    # TODO:  Change this to json
    np.savetxt("bottleneck.csv", bottlenecks, delimiter=",")
    print('\n** GENERATED BOTTLENECKS to bottleneck.csv **')

    bottlenecks = np.loadtxt("bottleneck.csv", delimiter=",")

    print('\n** GENERATING PAIRWISE pairwise_top_25.json **')
    dist = DistanceMetric.get_metric('euclidean')

    # Calculate pairwise distance -- O(n^2)
    bottleneck_pairwise_dist = dist.pairwise(bottlenecks)

    # Find the top 100 similar images per image
    retrieved_images = []
    for image_idx in range(0, len(bottleneck_pairwise_dist)):
        retrieved_indexes = pd.Series(
            bottleneck_pairwise_dist[image_idx]).sort_values().head(
                num_result_images).index.tolist()
        retrieved_indexes_int = list(
            map(lambda index: int(index), retrieved_indexes))

        pairwise_top_25[image_idx] = retrieved_indexes_int

    with open('pairwise_top_25.json', 'w') as fp:
        json.dump(pairwise_top_25, fp)

    print('\n** GENERATED PAIRWISE to pairwise_top_25.json **')
Example No. 37
    def transform(self, X):
        """
        Compute the topological vector for each persistence diagram individually and concatenate the results.

        Parameters:
            X (list of n x 2 numpy arrays): input persistence diagrams.
    
        Returns:
            numpy array with shape (number of diagrams) x (**threshold**): output topological vectors.
        """
        if self.threshold == -1:
            thresh = np.array([X[i].shape[0] for i in range(len(X))]).max()
        else:
            thresh = self.threshold

        num_diag = len(X)
        Xfit = np.zeros([num_diag, thresh])

        for i in range(num_diag):

            diagram, num_pts_in_diag = X[i], X[i].shape[0]
            pers = 0.5 * (diagram[:, 1] - diagram[:, 0])
            min_pers = np.minimum(pers, np.transpose(pers))
            # Works fine with sklearn 1.0, but a ValueError exception is thrown on older versions
            try:
                distances = DistanceMetric.get_metric("chebyshev").pairwise(
                    diagram)
            except ValueError:
                # Empty persistence diagram case - https://github.com/GUDHI/gudhi-devel/issues/507
                assert len(diagram) == 0
                distances = np.empty(shape=[0, 0])
            vect = np.flip(
                np.sort(np.triu(np.minimum(distances, min_pers)), axis=None),
                0)
            dim = min(len(vect), thresh)
            Xfit[i, :dim] = vect[:dim]

        return Xfit
Example No. 38
    def findClusters(self):
        reassignedClusterPoints = 0

        v = np.cov(self.dataFrame.iloc[:, :-1])
        #print(v)
        #print(self.__centroidsAsDataFrame__())
        #distance_metric = DistanceMetric.get_metric('euclidean')
        #dist = distance_metric.pairwise(self.dataFrame.loc[:, :'petal_width'], self.__centroidsAsDataFrame__())
        #print(dist)
        for index, row in self.dataFrame.iterrows():
            currentCluster = row['cluster']

            nearestCluster = {'name': "", 'distance': None}

            for cluster in self.clusterLabels:
                distance_metric = DistanceMetric.get_metric('chebyshev')
                dist = distance_metric.pairwise([row.iloc[:-1]],
                                                [self.centroids[cluster]])
                #print(dist)
                #dist = (row[0] - self.centroids[cluster][0])**2 +\
                #(row[1] - self.centroids[cluster][1])**2 +\
                #(row[2] - self.centroids[cluster][2])**2 +\
                #(row[3] - self.centroids[cluster][3])**2

                if not nearestCluster[
                        'distance'] or dist < nearestCluster['distance']:
                    nearestCluster['name'] = cluster
                    nearestCluster['distance'] = dist

            self.dataFrame.at[index, 'cluster'] = nearestCluster['name']

            if currentCluster != nearestCluster['name']:
                reassignedClusterPoints += 1
                #print("cluster assignment for row {} changed from {} to {}".format(index, currentCluster, nearestCluster['name']))

        self.__computeCentroids__(
        )  # UPDATE CENTROIDS AFTER REASSIGNMENT OF CLUSTER LABELS
        return reassignedClusterPoints
Example No. 39
def ClusterCV(features, linkage='single', n_folds=10):
    """ Cluster-cross-validation.
    linkage - 'single', 'complete', or 'average'
    """
    # Get distance matrix
    dist = DistanceMetric.get_metric('jaccard')
    distmatrix = dist.pairwise(features)
    # Find best number of clusters
    num_samples = features.shape[0]
    scores = np.zeros(num_samples)
    for i in range(n_folds * 2, num_samples):
        clustering = AgglomerativeClustering(n_clusters=i,
                                             affinity='precomputed',
                                             linkage=linkage)
        clustering = clustering.fit(distmatrix)
        scores[i] = silhouette_score(distmatrix,
                                     labels=clustering.labels_,
                                     metric='precomputed')
    max_score = max(scores)
    n_clusters = np.where(scores == max(scores))[0][0]
    #print(scores)
    print("Number of clusters:", n_clusters)
    # Cluster
    clustering = AgglomerativeClustering(n_clusters=n_clusters,
                                         affinity='precomputed',
                                         linkage=linkage)
    clustering = clustering.fit(distmatrix)
    # Randomly assign each cluster to one fold
    cluster_nums = list(range(n_clusters))
    shuffle(cluster_nums)
    fold_assignments = {}
    for n in range(n_clusters):
        fold_assignments[cluster_nums[n]] = n % n_folds
    # Assign samples to folds
    folds = np.zeros(num_samples)
    for j in range(num_samples):
        folds[j] = fold_assignments[clustering.labels_[j]]
    return folds, max_score
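A hedged usage sketch, assuming binary feature vectors (the Jaccard metric expects boolean-style data), the module-level imports the function relies on (DistanceMetric, AgglomerativeClustering, silhouette_score, shuffle), and a scikit-learn version that still accepts affinity='precomputed':

import numpy as np

features = np.random.rand(60, 12) > 0.5   # 60 samples, 12 binary features
folds, score = ClusterCV(features, linkage='average', n_folds=5)
print(folds[:10], score)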
Example No. 40
    def __init__(
            self,
            t=100,  # number of ensemble members
            n=16,  # sample for each ensemble member
            contamination=0.1,  # expected proportion of anomalies in the data
            metric='euclidean',  # distance metric to use
            tol=1e-8,  # tolerance
            verbose=False):
        super().__init__()

        # instantiate the parameters
        self.t = int(t)
        self.n = int(n)
        self.contamination = float(contamination)
        if metric == 'cosine':
            self.metric = cosine_similarity
        else:
            try:
                self.metric = DistanceMetric.get_metric(metric)
            except ValueError as e:
                raise BaseException(e)
        self.tol = float(tol)
        self.verbose = bool(verbose)
Example No. 41
 def load_hdf(self, fname, copydata=None, noisy=False):
     if noisy:
         print 'start load_hdf'
     f = h5py.File(fname, "r")
     state = []
     for i in range(4):       
         array_data = f["state_%s"%(i)][:]
         state.append(array_data)
         if noisy:
             print "load state_%s"%(i)
             print array_data
     if noisy:
         print 'done'
     int_array = f["int_values"][:]
     for val in int_array:
         state.append(int(val)) ## ensure type int not 'numpy.int64' 
     euc_dist = DistanceMetric.get_metric('euclidean')
     state.append(euc_dist)
     state = tuple(state)
     self.check_state_compatibility(state)
     self.__setstate__(state)
     if noisy:
         print 'end load_hdf'
Example No. 42
    def __init__(self,
                 contamination=0.1,
                 metric='euclidean',
                 tol=1e-10,
                 verbose=False):
        super().__init__()

        # contamination
        if not(0.0 < contamination <= 1.0):
            raise ValueError(contamination, 'is not a float in (0.0, 1.0]')
        self.c = float(contamination)

        # distance metric
        try:
            self.metric = DistanceMetric.get_metric(metric)
        except:
            raise ValueError(metric, 'is not an accepted distance metric')

        self.tol = float(tol)
        self.verbose = bool(verbose)
        
        # internal
        self.derived_squashed_ = False
Example No. 43
def knn_mahalanobis(x_train, x_test, y_train, y_test, k=50):
    dist = DistanceMetric.get_metric('manhattan')
    x_train_temp = x_train.reset_index(drop=True)
    x_dev_temp = x_test.reset_index(drop=True)
    y_train_temp = y_train.reset_index(drop=True)
    y_dev_temp = y_test.reset_index(drop=True)

    preds_train = np.zeros((y_train_temp.shape[0], 1))
    similarity_dist = dist.pairwise(x_train_temp, x_train_temp)
    for i in range(similarity_dist.shape[1]):
        min_ind_dev = similarity_dist[:, i].argsort()[:int(k)]
        preds_train[i] = np.mean(y_train_temp[y_train_temp.index.isin(min_ind_dev)])

    mse_train_reg = math.sqrt(mean_squared_error(y_train_temp, preds_train))

    preds_dev = np.zeros((y_dev_temp.shape[0],1))
    similarity_dist = dist.pairwise(x_train_temp, x_dev_temp)
    for i in range(similarity_dist.shape[1]):
        min_ind_dev = similarity_dist[:,i].argsort()[:int(k)]
        preds_dev[i] = np.mean(y_train_temp[y_train_temp.index.isin(min_ind_dev)])

    mse_dev_reg = math.sqrt(mean_squared_error(y_dev_temp, preds_dev))
    return mse_train_reg, mse_dev_reg
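A hedged usage sketch (note the function, despite its name, uses the 'manhattan' metric; assumes pandas/numpy plus the math and sklearn.metrics.mean_squared_error imports the body relies on):

import numpy as np
import pandas as pd

X = pd.DataFrame(np.random.randn(200, 4))
y = pd.Series(np.random.randn(200))
rmse_train, rmse_dev = knn_mahalanobis(X.iloc[:150], X.iloc[150:],
                                       y.iloc[:150], y.iloc[150:], k=10)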
Example No. 44
    def mi_Kraskov(x,y,k=5,base=np.exp(1),intens=1e-10,metric="minkowski",p=np.float64('inf')):
        '''The mutual information estimator by Kraskov et al.
           Inputs are 2D arrays, with each column being a dimension and each row being a data point
        '''
        assert len(x)==len(y), "Lists should have same length"
        assert k <= len(x) - 1, "Set k smaller than num. samples - 1"
        x +=  intens*nr.rand(x.shape[0],x.shape[1])
        y +=  intens*nr.rand(x.shape[0],x.shape[1])
        points = np.hstack((x,y))

        #Find nearest neighbors in joint space, p=inf means max-norm
        tree = KDTree(points, metric=DistanceMetric.get_metric(metric,p=p))
        try:
          dvec = tree.query(points,k+1)[0][:,k]   # no need to reshape with new query_radius method
        except ValueError:
          return (float("NaN"))

        a = MI.avgdigamma(x,dvec*x.shape[1]/points.shape[1],metric=metric,p=p)
        b = MI.avgdigamma(y,dvec*y.shape[1]/points.shape[1],metric=metric,p=p)
        c = digamma(k)
        d = digamma(len(x))
        # print("ee_acc: %s, %s, %s, %s" %( a,b,c,d))
        return (-a-b+c+d)/np.log(base)
Example No. 45
def mahalanobis(X,y,nr_bins,mode='continuous'):
    """ Estimate the mahalanobis distance between the average stimulus 
    pattern defined by classes in y across features in X. For every time point 
    """

    stim_mat,y = gen_RSA_matrix(y,nr_bins,mode=mode)

    evidence_RSA = np.zeros(X.shape[2])*np.nan
    matrix = np.zeros((nr_bins,nr_bins,X.shape[2]))*np.nan
    for tp in range(X.shape[2]):
        
        
        X_in = X[:,:,tp]
        
        #pca
        pca = PCA(n_components=.95)
        X_in = pca.fit(X_in).transform(X_in)
        
        #estimate covariance 
        emp_cov = EmpiricalCovariance().fit(X_in)
        maha = DistanceMetric.get_metric('mahalanobis',VI=emp_cov.covariance_)
    
    
        #scale data
        scaler = StandardScaler().fit(X_in)
        X_s = scaler.transform(X_in)
        
        
        X_stim = np.zeros((nr_bins,X_s.shape[1]))
        for i, stim in enumerate(np.unique(y)):
            X_stim[i,:] = X_s[(y==stim) ,:].mean(0).T
        
    
        matrix[:,:,tp] = maha.pairwise(X_stim)
        evidence_RSA[tp] = np.mean(np.mean(np.multiply(matrix[:,:,tp],stim_mat)))
        
    return evidence_RSA, matrix
Example No. 46
def __main__():
    data = load_data('pd_speech_features.csv')

    # params = {'max_depth': 2, 'eta': 0.1, 'n_estimators': 50, 'booster': 'dart',
    #           'gamma': 0, 'reg_lambda': 0.05, 'objective': 'binary:logistic'}

    features = [
        "gender", "baselineFeats", "intensityFeats", "formantFeats",
        "bandwidthFeats", "vocalFeats", "mfccFeats", "waveletFeats",
        "tqwtFeats"
    ]

    X = convert_data(data, features)
    params = {
        "hid_layers": [(280, "relu"), (1, "sigmoid")],
        "compile": {
            "loss": 'binary_crossentropy',
            "optimizer": tf.keras.optimizers.Adam(lr=0.001),
            "metrics": ["accuracy"]
        },
        "epochs": 10,
        "batch_size": None
    }
    # scores = run_model("mlp", X.values, data['label'], **params)

    # params = {'hid_dim': 150, 'func': 'sigm', 'init': {'norm': 0.05},
    #           'train_args': ['OP', 'c'], 'train_kwargs': {'kmax': 100}}
    #
    # scores = run_model("elm", X, data['label'], **params)
    params = {"kernel": "poly", "degree": 20}
    # scores = run_model("svm", X, data['label'], **params)
    params = {
        "n_neighbors": 1,
        "metric": DistanceMetric.get_metric("manhattan")
    }
    scores = run_model("knn", X, data['label'], **params)
    print_results(scores, features, params)
Example No. 47
def linearRect(loc, norm):
    indarr = np.arange(loc.shape[0])
    flag = np.zeros(loc.shape[0], dtype=int)
    dist = dm.get_metric("euclidean")
    dist_matrix = dist.pairwise(loc)
    nbrs = NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(loc)
    nndistances, nnindices = nbrs.kneighbors(loc)
    # dist_sorted_ind = np.argsort(dist_matrix, axis=1)[:,1:]
    flag_closest_rected = np.ones_like(flag, dtype=int) * -1
    flag_closest_rected_dist = np.ones_like(flag, dtype=float) * 10
    ind = np.argmax(np.linalg.norm(loc, axis=1))
    right_drct = loc[ind] / np.linalg.norm(loc[ind])
    for i in range(loc.shape[0]):
        if i != 0:
            unmarkind = np.argmin(flag_closest_rected_dist[flag<1])
            ind = indarr[flag<1][unmarkind]
            right_drct = get_right_drct(norm, ind, nnindices[ind], nndistances[ind], flag)
            # print(ind, flag_closest_rected[ind], "flag_closest_rected_dist[ind]", flag_closest_rected_dist[ind])
        norm[ind] = rectify(norm[ind], right_drct)
        flag[ind] = 1
        flag_closest_rected_dist_new = np.minimum(dist_matrix[:, ind],flag_closest_rected_dist)
        flag_closest_rected = np.where(flag_closest_rected_dist_new<flag_closest_rected_dist, ind, flag_closest_rected)
        flag_closest_rected_dist = flag_closest_rected_dist_new
    return norm
Example No. 48
def test_mahattan_distance():
    a = np.random.randn(4)
    b = np.random.randn(4)
    c = np.random.randn(4)
    d = np.random.randn(4)
    ground_truth = np.random.randn(4)

    m = ManhattanDistance()

    manhattan = DistanceMetric.get_metric("manhattan")

    m.update((torch.from_numpy(a), torch.from_numpy(ground_truth)))
    np_sum = np.abs(ground_truth - a).sum()
    assert m.compute() == pytest.approx(np_sum)
    assert manhattan.pairwise([a, ground_truth])[0][1] == pytest.approx(np_sum)

    m.update((torch.from_numpy(b), torch.from_numpy(ground_truth)))
    np_sum += np.abs(ground_truth - b).sum()
    assert m.compute() == pytest.approx(np_sum)
    v1 = np.hstack([a, b])
    v2 = np.hstack([ground_truth, ground_truth])
    assert manhattan.pairwise([v1, v2])[0][1] == pytest.approx(np_sum)

    m.update((torch.from_numpy(c), torch.from_numpy(ground_truth)))
    np_sum += np.abs(ground_truth - c).sum()
    assert m.compute() == pytest.approx(np_sum)
    v1 = np.hstack([v1, c])
    v2 = np.hstack([v2, ground_truth])
    assert manhattan.pairwise([v1, v2])[0][1] == pytest.approx(np_sum)

    m.update((torch.from_numpy(d), torch.from_numpy(ground_truth)))
    np_sum += np.abs(ground_truth - d).sum()
    assert m.compute() == pytest.approx(np_sum)
    v1 = np.hstack([v1, d])
    v2 = np.hstack([v2, ground_truth])
    assert manhattan.pairwise([v1, v2])[0][1] == pytest.approx(np_sum)
Example No. 49
def _plot_kmeans_distortions(df_ct_stats, path_to_figure=None):

    clusters = np.arange(1, 21)

    max_stat = df_ct_stats.loc['gl_max', :].dropna()
    max_stat = max_stat.values[:, np.newaxis]

    euc_dist = DistanceMetric.get_metric('euclidean')
    dist_mat = euc_dist.pairwise(max_stat)

    distortions = []
    for cluster in clusters:
        model = KMeans(n_clusters=cluster, random_state=0).fit(dist_mat)
        distortions.append(
            sum(
                np.min(cdist(dist_mat, model.cluster_centers_, 'euclidean'),
                       axis=1)) / dist_mat.shape[0])
    plt.figure()
    plt.plot(clusters, distortions, marker='o', color='yellow')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Distortion')

    #plt.xlim([0.01, 20.01])

    x_coords = np.linspace(1, np.size(clusters), 21, dtype=int)
    y_coords = np.linspace(min(distortions), max(distortions), 6)
    y_ticks = _ticks(y_coords)
    plt.xticks(x_coords, x_coords)
    plt.yticks(y_coords, y_ticks)

    if path_to_figure is not None:
        plt.savefig(
            path_to_figure,
            bbox_inches='tight',
            dpi=CONFIG.DPI,
        )
Example No. 50
 def load_model(self):
     """
     Load a btree index from the configured cache element. This only occurs
     if there is a cache element configured and there are bytes there to
     read.
     """
     with self._model_lock:
         if self.cache_element and not self.cache_element.is_empty():
             self._log.debug("Loading model from cache: %s",
                             self.cache_element)
             buff = BytesIO(self.cache_element.get_bytes())
             # noinspection PyTypeChecker
             with np.load(buff, allow_pickle=True) as cache:
                 tail = tuple(cache['tail'])
                 s = [cache['data_arr'], cache['idx_array_arr'],
                      cache['node_data_arr'], cache['node_bounds_arr']]
                 s.extend(tail)
                 s[11] = DistanceMetric.get_metric('hamming')
                 s = tuple(s)
             # noinspection PyTypeChecker
             #: :type: sklearn.neighbors.BallTree
             self.bt = BallTree.__new__(BallTree)
             self.bt.__setstate__(s)
             self._log.debug("Loading mode: Done")
Example No. 51
    def transform(self, X):

        if self.threshold_ == -1:
            thresh = np.array([X[i].shape[0] for i in range(len(X))]).max()
        else:
            thresh = self.threshold_

        num_diag = len(X)
        Xfit = np.zeros([num_diag, thresh])

        for i in range(num_diag):

            diagram, num_pts_in_diag = X[i], X[i].shape[0]
            pers = 0.5 * np.matmul(diagram, np.array([[-1.0], [1.0]]))
            min_pers = np.minimum(pers, np.transpose(pers))
            distances = DistanceMetric.get_metric("chebyshev").pairwise(
                diagram)
            vect = np.flip(
                np.sort(np.triu(np.minimum(distances, min_pers)), axis=None),
                0)
            dim = min(len(vect), thresh)
            Xfit[i, :dim] = vect[:dim]

        return Xfit
Example No. 52
def test_compute():
    a = np.random.randn(4)
    b = np.random.randn(4)
    c = np.random.randn(4)
    d = np.random.randn(4)
    ground_truth = np.random.randn(4)

    m = CanberraMetric()

    canberra = DistanceMetric.get_metric("canberra")

    m.update((torch.from_numpy(a), torch.from_numpy(ground_truth)))
    np_sum = (np.abs(ground_truth - a) / (np.abs(a) + np.abs(ground_truth))).sum()
    assert m.compute() == pytest.approx(np_sum)
    assert canberra.pairwise([a, ground_truth])[0][1] == pytest.approx(np_sum)

    m.update((torch.from_numpy(b), torch.from_numpy(ground_truth)))
    np_sum += ((np.abs(ground_truth - b)) / (np.abs(b) + np.abs(ground_truth))).sum()
    assert m.compute() == pytest.approx(np_sum)
    v1 = np.hstack([a, b])
    v2 = np.hstack([ground_truth, ground_truth])
    assert canberra.pairwise([v1, v2])[0][1] == pytest.approx(np_sum)

    m.update((torch.from_numpy(c), torch.from_numpy(ground_truth)))
    np_sum += ((np.abs(ground_truth - c)) / (np.abs(c) + np.abs(ground_truth))).sum()
    assert m.compute() == pytest.approx(np_sum)
    v1 = np.hstack([v1, c])
    v2 = np.hstack([v2, ground_truth])
    assert canberra.pairwise([v1, v2])[0][1] == pytest.approx(np_sum)

    m.update((torch.from_numpy(d), torch.from_numpy(ground_truth)))
    np_sum += (np.abs(ground_truth - d) / (np.abs(d) + np.abs(ground_truth))).sum()
    assert m.compute() == pytest.approx(np_sum)
    v1 = np.hstack([v1, d])
    v2 = np.hstack([v2, ground_truth])
    assert canberra.pairwise([v1, v2])[0][1] == pytest.approx(np_sum)
Example No. 53
            time.sleep(10.0)

            if(init == 0):
                np.save('maternal_fetal_feature_vectors1k', maternal_fetal_feature_vectors, allow_pickle=False)
                np.save('maternal_feature_vectors1k', maternal_feature_vectors, allow_pickle=False)
                np.save('linear_regression_coefs1k', linear_regression_coefs, allow_pickle=False)
                np.save('linear_regression_intercepts1k', linear_regression_intercepts, allow_pickle=False)
            # figz.data = []

        if ((n_svrs % 25) == 0):
            print(['n_svrs:  ' + str(n_svrs)])


# Get histogram of token - token distances for clustering:
#
dist = DistanceMetric.get_metric('manhattan')
token_dists = dist.pairwise(maternal_fetal_feature_vectors[0:200,:])

# token_dists = distance_matrix(maternal_fetal_feature_vectors, maternal_fetal_feature_vectors, p=1, threshold=100000000)
# token_dists = distance.cdist(maternal_fetal_feature_vectors[0:5,:], maternal_fetal_feature_vectors[0:5,:], metric='cityblock')
token_dists = distance.pdist(maternal_fetal_feature_vectors, metric='cityblock')

# token_dist_hist = np.histogram(token_dists, bins=1000)
# token_dist_hist_idxs = np.arange(len(token_dist_hist))

token_dists_sorted = np.sort(token_dists)
token_dist_idxs = np.arange(len(token_dists_sorted))
fig = make_subplots(rows=1, cols=1)
fig.append_trace(go.Scatter(x=token_dist_idxs, y=token_dists_sorted), row=1, col=1)
fig.show()
Example No. 54
  [0,0,1,1,0,1,0,1,1,0,1,1,0,1,1,0,0,1,1,1,1,1],
  [1,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0],
  [1,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0],
  [1,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0],
  [1,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0],
  [0,0,1,0,0,1,0,1,0,1,1,0,0,0,0,0,0,1,1,1,1,0],
  [1,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
  ]

AttributeClassifier = joblib.load('Dumps/AttributeClassifierKnowledgeTransfer.pkl')

features = scipy.io.loadmat("./UIUC1/UIUC1_win_feature.mat")
labels = scipy.io.loadmat("./UIUC1/UIUC1_labels.mat")
action_actor = open("./UIUC1/action_actor.txt")

dist = DistanceMetric.get_metric('euclidean')

mapping = [{}]
for line in action_actor:
    line = line.split()
    actionvector = numpy.zeros(14, dtype=int)  # numpy.int was removed in NumPy 1.24
    actionvector[int(line[0])]=1
    mapping.append({'action':int(line[0]),'actionvector':actionvector, 'actor':int(line[1])})

total = len(labels['vlabels'][0])
ConfusionMatrix=numpy.array([[0,0],[0,0]])
NovelClassList=[[0,1],[2,3],[4,5],[6,7],[8,9],[10,11],[12,13]]
for NovelClass in NovelClassList:

    ConfusionMatrix2=numpy.array([[0,0],[0,0]])
Example No. 55
def brute_force_neighbors(X, Y, k, metric, **kwargs):
    D = DistanceMetric.get_metric(metric, **kwargs).pairwise(Y, X)
    ind = np.argsort(D, axis=1)[:, :k]
    dist = D[np.arange(Y.shape[0])[:, None], ind]
    return dist, ind
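A minimal usage sketch (numpy assumed); the metric name and any keyword arguments are forwarded to DistanceMetric.get_metric:

import numpy as np

X = np.random.rand(20, 3)      # reference points
Y = np.random.rand(4, 3)       # query points
dist, ind = brute_force_neighbors(X, Y, k=3, metric='minkowski', p=2)
print(dist.shape, ind.shape)   # (4, 3) (4, 3)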
Example No. 56
def haversine_distance(p1, p2):
    d = DistanceMetric.get_metric('haversine')
    X = [p1, p2]
    return d.pairwise(X)[0][1]
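A hedged usage note: the haversine metric treats each point as [latitude, longitude] in radians and returns the central angle, so multiplying by the Earth's radius gives a distance. A minimal sketch:

import numpy as np

paris = np.radians([48.8566, 2.3522])
berlin = np.radians([52.5200, 13.4050])
print(6371.0 * haversine_distance(paris, berlin))   # roughly 880 km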
Example No. 57
from scipy.spatial import distance
import numpy as np

distance.euclidean([1, 0, 0], [0, 1, 0])
distance.euclidean([20, 25], [25, 22])  #closest : S1 with S2
np.sqrt(((20 - 25)**2 + (25 - 22)**2))  #sqrt(sum(x-y)^2)

distance.euclidean([20, 25], [35, 40])
distance.euclidean([20, 25], [40, 35])
distance.euclidean([35, 40], [40, 35])

#distance of all points in DF
from sklearn.neighbors import DistanceMetric

dist = DistanceMetric.get_metric('euclidean')
dist
df.to_numpy()
dist.pairwise(df.to_numpy())

#Kmeans clustering
df
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=2).fit(df)
centroids = kmeans.cluster_centers_
print(centroids)
df
plt.scatter(df['math'],
            df['science'],
            c=kmeans.labels_.astype(float),
Example No. 58
def dist(X_1, X_2, param='euclidean'):
    dist = DistanceMetric.get_metric(param)
    X = [X_1,X_2]
    return dist.pairwise(X)[0,1]
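A minimal usage sketch:

print(dist([0, 0], [3, 4]))               # 5.0 with the default Euclidean metric
print(dist([0, 0], [3, 4], 'manhattan'))  # 7.0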
Example No. 59
def mean_distance_to_closest(predicted, event):
    angle = DistanceMetric.get_metric("pyfunc", func=spherical_angle)
    nn = BallTree(event.tracks, leaf_size=5, metric=angle)

    return np.sum([nn.query(predicted[i, :], k=1) for i in xrange(predicted.shape[0])]) / event.tracks.shape[0]
Example No. 60
## KNN PREDICTOR ##

# do some lambda magic on text columns

traindata = list(train.apply(lambda x:'%s %s %s' % (x['query'],x['product_title'], x['product_description']),axis=1))
testdata = list(test.apply(lambda x:'%s %s %s' % (x['query'],x['product_title'], x['product_description']),axis=1))

# Fit TFIDF

tfv.fit(traindata)
X = tfv.transform(traindata)
X_test = tfv.transform(testdata)

clf = pipeline.Pipeline([('tSVD',tSVD),('scl',scl),('knn',knn)])
param_grid = {'knn__n_neighbors':[2],'knn__metric':[DistanceMetric.get_metric('manhattan')],'tSVD__n_components':[400]}

model = grid_search.GridSearchCV(estimator = clf, param_grid = param_grid, scoring = kappa_scorer, refit = True, cv = 2, n_jobs = -1)

# Fit Model

model.fit(X, y)
model.best_estimator_.fit(X,y)
trainPred = model.best_estimator_.predict(X_test)

# Averaging predicted relevance values

finalPred = [int(floor((int(stemPred[i])+trainPred[i])*0.5)) for i in range(len(stemPred))]

#print "Kappa Score for Training Data\nStemming+KNN\nScore=%f" %(quadratic_weighted_kappa(y, finalPred))