Example #1
def gower_distance(X):
    """
    This function expects a pandas dataframe as input
    The data frame is to contain the features along the columns. Based on these features a
    distance matrix will be returned which will contain the pairwise gower distance between the rows
    All variables of object type will be treated as nominal variables and the others will be treated as 
    numeric variables.

    Distance metrics used for:

    Nominal variables: Dice distance (https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient)
    Numeric variables: Manhattan distance normalized by the range of the variable (https://en.wikipedia.org/wiki/Taxicab_geometry)
    """
    individual_variable_distances = []

    for i in range(X.shape[1]):
        feature = X.iloc[:,[i]]
        if feature.dtypes[0] == np.object:
            feature_dist = DistanceMetric.get_metric('dice').pairwise(pd.get_dummies(feature))
        else:
            feature_dist = DistanceMetric.get_metric('manhattan').pairwise(feature) / np.ptp(feature.values)
            
        individual_variable_distances.append(feature_dist)

    return np.array(individual_variable_distances).mean(0)
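A minimal usage sketch (the toy DataFrame is illustrative; note that DistanceMetric lives in sklearn.neighbors in older scikit-learn releases and in sklearn.metrics in newer ones):

import numpy as np
import pandas as pd
from sklearn.neighbors import DistanceMetric  # sklearn.metrics in newer scikit-learn

df = pd.DataFrame({'color': ['red', 'blue', 'red'],   # object dtype -> Dice distance
                   'price': [10.0, 30.0, 20.0]})      # numeric -> range-normalized Manhattan
print(gower_distance(df))   # 3x3 symmetric matrix of pairwise Gower distances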
Example #2
def calc_mahalanobis(x, y, n_neighbors):
    from sklearn.neighbors import DistanceMetric, NearestNeighbors
    # Constructed once up front and otherwise unused: this fails fast
    # (LinAlgError) if the covariance matrix is singular.
    DistanceMetric.get_metric('mahalanobis', V=np.cov(x, rowvar=False))

    # Rows of x are samples, so the feature covariance needs rowvar=False.
    nn = NearestNeighbors(n_neighbors=n_neighbors,
                          algorithm='brute',
                          metric='mahalanobis',
                          metric_params={'V': np.cov(x, rowvar=False)})
    return nn.fit(x).kneighbors(y)
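A usage sketch (toy arrays; numpy imported as np):

import numpy as np

x = np.random.rand(20, 3)   # 20 reference samples, 3 features
y = np.random.rand(5, 3)    # 5 query samples
dists, idx = calc_mahalanobis(x, y, n_neighbors=3)
print(dists.shape, idx.shape)   # (5, 3) (5, 3): distances to, and row indices of, the 3 nearest samples in x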
Example #3
File: knn.py Project: salceson/mro
def mahalonobis(X):
    cov = np.cov(X, rowvar=0)
    try:
        # Mahalanobis needs more than one sample and an invertible covariance
        # (get_metric raises numpy.linalg.LinAlgError if V is singular);
        # otherwise fall back to plain euclidean distance.
        metric = DistanceMetric.get_metric('mahalanobis', V=cov) if X.shape[0] > 1 \
            else DistanceMetric.get_metric('euclidean')
    except LinAlgError:
        metric = DistanceMetric.get_metric('euclidean')

    def distance(x, y):
        return metric.pairwise([x], [y])[0][0]

    return distance
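A usage sketch for the returned closure (hypothetical data; assumes numpy as np, DistanceMetric, and numpy.linalg.LinAlgError are imported as in the project):

X = np.random.rand(10, 3)
d = mahalonobis(X)      # builds a distance function from the sample covariance of X
print(d(X[0], X[1]))    # Mahalanobis distance between the first two rows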
Example #4
def dist(X, Y, distance_function="euclidean"):
    """calculate the X, Y distance matrix
    [Args]
    ------
    X : m samples
    Y : n samples
    distance_function : built-in metric name or user-defined distance
    
    [Returns]
    ---------
    distance_matrix: m * n distance matrix
    
    
    These built-in metrics are available. Default = "euclidean"
    
    "euclidean"     EuclideanDistance    sqrt(sum((x - y)^2))
    "manhattan"     ManhattanDistance    sum(|x - y|)
    "chebyshev"     ChebyshevDistance    max(|x - y|)
    "minkowski"     MinkowskiDistance    sum(|x - y|^p)^(1/p)
    "wminkowski"    WMinkowskiDistance   sum(w * |x - y|^p)^(1/p)
    "seuclidean"    SEuclideanDistance   sqrt(sum((x - y)^2 / V))
    "mahalanobis"   MahalanobisDistance  sqrt((x - y)' V^-1 (x - y))
    """
    distance_calculator = DistanceMetric.get_metric(distance_function)
    return distance_calculator.pairwise(X, Y)
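For instance (toy arrays; assumes numpy as np and DistanceMetric are imported):

X = np.array([[0.0, 0.0], [3.0, 4.0]])   # m = 2 samples
Y = np.array([[0.0, 0.0]])               # n = 1 sample
print(dist(X, Y))                 # [[0.], [5.]] -- euclidean, shape (m, n) = (2, 1)
print(dist(X, Y, "manhattan"))    # [[0.], [7.]]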
Example #5
    def get_full_metric(self, train_pairs):
        train_pairs_flat = [item for subtuple in train_pairs for item in subtuple]
        
        pca = PCA(n_components = self.pca_components)
        pca.fit(train_pairs_flat)

        train_pairs_pca_flat = pca.transform(train_pairs_flat)

        train_pairs_pca = list()

        # Rows 2i and 2i+1 of the flattened list came from the same pair,
        # so stitch them back together after the PCA transform.
        for i in xrange(0, len(train_pairs_pca_flat), 2):
            train_pairs_pca.append((train_pairs_pca_flat[i],
                                    train_pairs_pca_flat[i + 1]))
        
        ys = ys_from_pairs(train_pairs_pca)

        file_id = str(random.random())[2:]

        save_cvx_params(ys, file_id)
        run_cvx(file_id)
        M = load_cvx_result(file_id)

        dist = DistanceMetric.get_metric('mahalanobis', VI = M)

        return dist, M, pca
Example #6
    def run_single_trial(self, train_pairs, test_pairs, train_tune_data, test_tune_data):
        print "Running PCA..."
        train_pairs_pca, test_pairs_pca = self.fit_pca(train_pairs, test_pairs)
        ys = ys_from_pairs(train_pairs_pca)

        file_id = str(random.random())[2:]

        save_cvx_params(ys, file_id)
        run_cvx(file_id)
        M = load_cvx_result(file_id)
        dist = DistanceMetric.get_metric('mahalanobis', VI = M)
        train_a_sections = [x[0] for x in train_pairs_pca]
        train_b_sections = [x[1] for x in train_pairs_pca]
        test_a_sections = [x[0] for x in test_pairs_pca]
        test_b_sections = [x[1] for x in test_pairs_pca]

        train_given_sections = train_a_sections
        train_to_match_sections = train_b_sections
        test_given_sections = test_a_sections
        test_to_match_sections = test_b_sections
        if self.match_a_to_b:
            train_given_sections = train_b_sections
            train_to_match_sections = train_a_sections
            test_given_sections = test_b_sections
            test_to_match_sections = test_a_sections

        print "Constructing BallTrees..."
        train_bt = BallTree(train_to_match_sections, metric=dist)
        test_bt = BallTree(test_to_match_sections, metric=dist)

        train_top_fraction = int(len(train_given_sections) * self.correct_within_top_fraction)
        test_top_fraction = int(len(test_given_sections) * self.correct_within_top_fraction)

        print "Querying the BallTrees..."
        train_result = train_bt.query(train_given_sections, train_top_fraction)
        test_result = test_bt.query(test_given_sections, test_top_fraction)

        print "Looking at correctness of results..."
        train_correct = sum([int(i in train_result[1][i]) for i in xrange(len(train_given_sections))])
        test_correct = sum([int(i in test_result[1][i]) for i in xrange(len(test_given_sections))])

        print "Finding indices of correct matches..."
        test_result_full = test_bt.query(test_given_sections, len(test_given_sections))
        def default_index(lst, i):
          ind = -1
          try:
            ind = lst.index(i)
          except ValueError:  # i not present in lst
            pass
          return ind
        test_indices = [default_index(list(test_result_full[1][i]), i) for i in xrange(len(test_given_sections))]
        test_indices = [x for x in test_indices if x != -1]

        with open("successful_tunes_{}".format(file_id), 'w') as successful_tunes_f:
          for i, index in enumerate(test_indices):
            if index == 0:
              successful_tunes_f.write(str(test_tune_data[i]) + '\n\n')

        return [[train_correct, len(train_given_sections)],
            [test_correct, len(test_given_sections)]], test_indices
Example #7
File: knn.py Project: salceson/mro
def euclid(_):
    metric = DistanceMetric.get_metric('euclidean')

    def distance(x, y):
        return metric.pairwise([x], [y])[0][0]

    return distance
Example #8
def standardizedEulideanDistance(wide, p):
    """ Calculate the standardized Euclidean distance and return an array of distances to the center and a matrix of pairwise distances.

    :Arguments:
        :type wide: pandas.DataFrame
        :param wide: A wide formatted data frame with samples as columns and compounds as rows.

        :type p: float
        :param p: The percentile (e.g. 0.95) at which the distribution-based cutoffs are computed.

    :Returns:
        :return: Return 4 pd.DataFrames with SED values and cutoffs.
        :rtype: pd.DataFrames
    """

    # Estimated Variance from the data
    varHat = wide.var(axis=1, ddof=1)
    varHat[varHat==0] = 1
    dist = DistanceMetric.get_metric('seuclidean', V=varHat)

    # Column means
    colMean = wide.mean(axis=1)

    # Calculate the standardized Euclidean Distance from all samples to the center

    SEDtoCenter = dist.pairwise(wide.values.T, pd.DataFrame(colMean).T)
    SEDtoCenter = pd.DataFrame(SEDtoCenter, columns = ['SED_to_Center'], index = wide.columns)
    
    # Calculate the pairwise standardized Euclidean Distance of all samples
    SEDpairwise = dist.pairwise(wide.values.T)
    SEDpairwise = pd.DataFrame(SEDpairwise, columns = wide.columns, index = wide.columns)
    for index, row in SEDpairwise.iterrows():
        SEDpairwise.loc[index, index] = np.nan
    
    # Calculate cutoffs
    # For SEDtoCenter: 
    #   Beta: sqrt((p-1)^2/p*(sum of n iid Beta(1/2, p/2)));        (It's the exact distribution.)
    #   Normal: sqrt(N((p-1)/p*n, 2*(p-2)*(p-1)^2/p^2/(p+1)*n));    (It's normal approximation. Works well when n is large.)
    #   Chisq: sqrt((p-1)/p*Chi-sq(n));                             (It's Chi-sq approximation. Works well when p is decent and p/n is not small.)
    # For SEDpairwise:
    #   Beta: sqrt(2*(p-1)*(sum of n iid Beta(1/2, p/2)));
    #   Normal: sqrt(N(2*n, 8*(p-2)/(p+1)*n));
    #   Chisq: sqrt(2*Chi-sq(n));
    # where n = # of compounds and p = # of samples
    pSamples  = float(wide.shape[1])
    nFeatures = float(wide.shape[0])
    nIterate  = 20000 #100000
    #p = 0.95
    betaP     = np.percentile(pd.DataFrame(stats.beta.rvs(0.5, 0.5*(pSamples-2), size=nIterate*nFeatures).reshape(nIterate, nFeatures)).sum(axis=1), p*100)
    betaCut1  = np.sqrt((pSamples-1)**2/pSamples*betaP)
    normCut1  = np.sqrt(stats.norm.ppf(p, (pSamples-1)/pSamples*nFeatures, np.sqrt(2*nFeatures*(pSamples-2)*(pSamples-1)**2/pSamples**2/(pSamples+1))))
    chisqCut1 = np.sqrt((pSamples-1)/pSamples*stats.chi2.ppf(p, nFeatures))
    betaCut2  = np.sqrt((pSamples-1)*2*betaP)
    normCut2  = np.sqrt(stats.norm.ppf(p, 2*nFeatures, np.sqrt(8*nFeatures*(pSamples-2)/(pSamples+1))))
    chisqCut2 = np.sqrt(2*stats.chi2.ppf(p, nFeatures))
    cutoff1   = pd.DataFrame([[betaCut1, normCut1, chisqCut1]], columns=['Beta(Exact)', 'Normal', 'Chi-sq'])
    cutoff2   = pd.DataFrame([[betaCut2, normCut2, chisqCut2]], columns=['Beta(Exact)', 'Normal', 'Chi-sq'])

    # TODO: Create a flag based on values greater than one of the cutoffs.
    return SEDtoCenter, cutoff1, SEDpairwise, cutoff2
Example #9
def example2():
    """using a customized distance
    """
    from HSH.Misc.shgeo import dist
    def earthdist(x, y): # latitude, longitude earth surface distance
        return dist((x[0], x[1]), (y[0], y[1]))
    
    dist_cal = DistanceMetric.get_metric(earthdist)
    train = np.array([[32.5, 101.0], [32.5, 102.0]])
    test = np.array([[31.5, 101.0], [39.5, 101.0]])
    print(dist_cal.pairwise(train, test))
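get_metric accepts a plain Python callable here because it wraps it in a PyFuncDistance; the equivalent explicit spelling, used by Examples #16 and #19 below, should be interchangeable:

dist_cal = DistanceMetric.get_metric("pyfunc", func=earthdist)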
Example #10
def example1():
    dist = DistanceMetric.get_metric("euclidean")
    train = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    test = np.array([[0.5, 0.5], [-0.5, -0.5]])
    
    distance_matrix = dist.pairwise(train, test)
    print(distance_matrix)
    
    # For euclidean, the "reduced" distance is simply the squared distance.
    reduced_distance_matrix = dist.dist_to_rdist(distance_matrix)
    print(reduced_distance_matrix)
    
    # Converting back recovers the original distance matrix.
    print(dist.rdist_to_dist(reduced_distance_matrix))
Example #11
    def load_model(self):
        if self.file_cache and os.path.isfile(self.file_cache):
            self._log.debug("Loading model: %s", self.file_cache)
            with numpy.load(self.file_cache) as cache:
                tail = tuple(cache['tail'])
                s = (cache['data_arr'], cache['idx_array_arr'],
                     cache['node_data_arr'], cache['node_bounds_arr']) +\
                    tail + (DistanceMetric.get_metric('hamming'),)
            #: :type: sklearn.neighbors.BallTree
            self.bt = BallTree.__new__(BallTree)
            self.bt.__setstate__(s)
            self._log.debug("Loading model: Done")
Example #12
def distance(X, distance_measure='euclidean'):

    X = np.array(X)

    if distance_measure in SKLEARN_METRICS:
        distance_ = DistanceMetric.get_metric(distance_measure).pairwise(X)
    elif distance_measure == 'pearson':   # 'is' tested identity, not string equality
        distance_ = np.corrcoef(X)
    else:
        distance_ = None

    return distance_
Example #13
    def __init__(self, proc, Xss, yss, valid_set=0.1, validation_set=None):
        self.seen_states = set()
        self.state_set = []
        self.proc = proc
        self.valid_set = valid_set
        self.surr_loss = DistanceMetric.get_metric('hamming')

        if validation_set is None:
            self._split(Xss, yss)
        else:
            self.Xss = Xss
            self.yss = yss
            self.valid_Xss, self.valid_yss = validation_set
Example #14
    def find_computed_cluster_metrics(self):
        """Initialises cluster metric computation over every cluster that is
        found by the given clustering algorithm.
        """
        for cluster in self.computed_clusters:
            cluster.compute_metrics(self.original_corpus,
                                    self.original_article_pos)

        centroid_locs = [x.centroid for x in self.computed_clusters]
        dist = DistanceMetric.get_metric('euclidean')
        dist_pair = dist.pairwise(centroid_locs)
        self.max_centroid_dist = max(list(itertools.chain.from_iterable(
            dist_pair)))
Example #15
def entropy(x, k=3, base=np.exp(1), intens=1e-10):
    """ The classic K-L k-nearest neighbor continuous entropy estimator
        x should be a list of vectors, e.g. x = [[1.3],[3.7],[5.1],[2.4]]
        if x is one-dimensional scalar data and we have four samples
    """
    assert k <= len(x) - 1, "Set k smaller than num. samples - 1"
    d = len(x[0])
    N = len(x)
    x += intens * nr.rand(N, d)  # break ties with a small amount of noise
    tree = KDTree(x, metric=DistanceMetric.get_metric("minkowski", p=np.float64('inf')))
    nn = tree.query(x, k + 1)[0][:, k]   # no need to reshape with new query_radius method
    const = digamma(N) - digamma(k) + d * log(2)
    return (const + d * np.mean(map(log, nn))) / log(base)
Example #16
def __max_score_mapping(rr, predicted, test, max_angle=1.0e-2):
    angle = DistanceMetric.get_metric("pyfunc", func=spherical_angle)
    d = angle.pairwise(predicted, test)

    # Each true sample maps to closest
    test_mapping = np.zeros(shape=(test.shape[0],), dtype=float)

    for i in xrange(test.shape[0]):
        if np.any(d[:, i] < max_angle):
            close_predictions = d[:, i] < max_angle
            scores = [rr(p) for p in predicted[close_predictions, :]]
            test_mapping[i] = np.max(scores)

    return test_mapping
Example #17
def update_prediction(prediction, real_pic, metric_name='euclidean'):
    """
    Update a prediction after receiving the actual picture from the webcam.

    Parameters
    ----------
    prediction : Prediction
        The model object of the prediction to update
    real_pic : Picture
        The model object of the actual picture received
    metric_name : str
        Name of the metric passed to DistanceMetric.get_metric, or
        'wminkowski-pca' to weight PCA features by explained variance

    Return
    ------
    float : the prediction error
    """
    pred_pic = prediction.as_picture()
    cam_id = prediction.params.webcam.webcam_id
    if metric_name == 'wminkowski-pca':
        with webcam_fs.get_dataset(cam_id) as dataset:
            if 'pca' not in dataset.imgset.feature_sets:
                raise ValueError("""wminkowski-pca cannot be used
                                    without a PCA feature set""")

            pca_extractor = dataset.imgset.feature_sets['pca'].extractor
            weights = pca_extractor.pca.explained_variance_ratio_
            pred_data = pca_extractor.extract(pred_pic.pixels)
            real_data = pca_extractor.extract(real_pic.pixels)
            metric = DistanceMetric.get_metric('wminkowski', p=2, w=weights)
    else:
        pred_data = pred_pic.pixels
        real_data = real_pic.pixels
        metric = DistanceMetric.get_metric(metric_name)

    error = metric.pairwise([pred_data], [real_data])[0]
    prediction.error = error
    prediction.save()
    return error
Example #18
def entropy(data, ball='euclidean', k=1, units='nats'):
    """
    Estimates the entropy of the given data using the k-nearest neighbors method

    input
    -----
    data (nd-array):
        An (n by p) matrix containing n samples of p-dimensional data

    ball (string):
        Which ball (e.g. l1, euclidean, etc.) to use when computing the volume.
        Acceptable strings include:
            'l1'   : l1 or Manhattan distance
            'l2'   : l2 or Euclidean distance; default
            'linf' : l-infinity or Chebyshev distance

    k (integer):
        How many nearest-neighbors to use when computing radii. Must be at least 1.

    units (string):
        Which unit the entropy output has.
        Acceptable strings include:
            'nats' : base e
            'bits' : base 2

    """
    
    # Get number of samples and dimensionality
    (n,p)  = data.shape
    
    # Determine radii and volumes for a given metric space
    metric = getball(ball)
    if metric == 1:
        m = 'manhattan'
    elif metric == 2:
        m = 'euclidean'
    elif metric == inf:
        m = 'chebyshev'
        
    dist  = DistanceMetric.get_metric(m)
    D_mat = dist.pairwise(data)
    D_mat.sort(axis=1)
    radii = D_mat[:,k]
    Vs    = volume(radii, ball=str(metric), dimension=p)
    
    # 0.577215665 is the Euler-Mascheroni constant
    if units.lower() == 'nats':
        return sum([np.log(vol) for vol in Vs])/float(n) + np.log(n) - L(k - 1) + 0.577215665
    if units.lower() == 'bits':
        return sum([np.log2(vol) for vol in Vs])/float(n) + np.log2(n) - L(k - 1) + 0.577215665
Example #19
def __mappings(predicted, test, max_angle=1.0e-2):
    angle = DistanceMetric.get_metric("pyfunc", func=spherical_angle)
    d = angle.pairwise(predicted, test)

    # Each true sample maps to closest
    test_mapping = np.zeros(shape=(test.shape[0],), dtype=int)
    predicted_mapping = np.zeros(shape=(predicted.shape[0],), dtype=int)

    for i in xrange(test.shape[0]):
        test_mapping[i] = 1 if np.any(d[:, i] < max_angle) else 0

    for i in xrange(predicted.shape[0]):
        predicted_mapping[i] = 1 if np.any(d[i, :] < max_angle) else 0

    return predicted_mapping, test_mapping
Example #20
    def mi_LNC(x,y,k=5,base=np.exp(1),alpha=0.25,intens = 1e-10,metric='minkowski',p=np.float64('inf')):
        '''The mutual information estimator by PCA-based local non-uniform correction(LNC)
           ith row of X represents ith dimension of the data, e.g. X = [[1.0,3.0,3.0],[0.1,1.2,5.4]], if X has two dimensions and we have three samples
           alpha is a threshold parameter related to k and d(dimensionality), please refer to our paper for details about this parameter
        '''
        #N is the number of samples
        N = x.shape[0]

        #First Step: calculate the mutual information using the Kraskov mutual information estimator
        #adding small noise to X, e.g., x<-X+noise
        x += intens*nr.rand(x.shape[0],x.shape[1])
        y += intens*nr.rand(x.shape[0],x.shape[1])
        points = np.hstack((x,y))

        tree = KDTree(points, metric=DistanceMetric.get_metric(metric, p=p))
        try:
            dvec, knn_idx = tree.query(points, k+1)   # no need to reshape with new query_radius method
        except ValueError:
            return float("NaN")

        a = MI.avgdigamma(x,dvec[:,-1]*x.shape[1]/points.shape[1], metric=metric, p=p)
        b = MI.avgdigamma(y,dvec[:,-1]*y.shape[1]/points.shape[1], metric=metric, p=p)
        c = digamma(k)
        d = digamma(len(x))

        # a,b,c,d = MI.avgdigamma(x,dvec), MI.avgdigamma(y,dvec), digamma(k), digamma(len(x))
        # print("ee_acc: %s, %s, %s, %s" %( a,b,c,d))
        ret = (-a-b+c+d)/np.log(base)

        # LNC correction
        logV_knn = np.sum(np.log(np.abs(points - points[knn_idx[:,-1],:])), axis=1)
        logV_projected = np.zeros(logV_knn.shape)
        for i in range(points.shape[0]):
            knn_points = points[knn_idx[i,:],:]
            knn_centered = knn_points - points[i,:]
            u,s,v = la.svd(knn_centered)
            knn_proj = knn_centered.dot(v.T)
            max_dims = np.max(np.abs(knn_proj), axis=0)   # max-norm per dimension
            logV_projected[i] = np.sum(np.log(max_dims))

        diff = logV_projected - logV_knn
        if alpha > 1: alpha = 1
        diff[diff >= log(alpha)] = 0
        e = -np.sum(diff) / N

        return (ret + e) / log(base)
Example #21
def calcMDS(pltnum, flag, dmetric):
    if flag == 1:
        clf = PCA(n_components=5)
        Y = clf.fit_transform(X)
        title = 'PCA-MDS'
    elif flag == 2:
        clf = TruncatedSVD(n_components=5)
        Y = clf.fit_transform(X)
        title = 'SVD-MDS'
    else:
        Y = X
        title = 'MDS DistanceMetric: ' + str(dmetric)
    dist = DistanceMetric.get_metric(dmetric)
    Y    = dist.pairwise(Y)
    # Y = euclidean_distances(Y)
    mds = manifold.MDS(n_components=2, dissimilarity='precomputed')#, init='pca', random_state=0)
    Y = mds.fit_transform(Y)
    for i in range(1, 3):
        mdsPlot(int(str(pltnum) + str(i)), i, Y, title)
Example #22
def distance_from_most_visited_place(place, user):
    q = select([func.count(),visits_10min.c.placeid]).where(visits_10min.c.userid == user).group_by(visits_10min.c.placeid).order_by(func.count().desc())
    most_visited_places = [r[1] for r in connection.execute(q).fetchall()]
    def get_lat_long(place_q):
        try:
            return connection.execute(select([places_location.c.longitude, places_location.c.latitude]).where(and_(places_location.c.placeid == place_q, places_location.c.userid == user))).fetchall()[0]
        except Exception as e:
            return None
            
    # Note: scikit-learn's haversine metric expects (latitude, longitude) in
    # radians, while the query above returns (longitude, latitude) in degrees.
    dist = DistanceMetric.get_metric('haversine')
    X = []
    X.append(get_lat_long(place))
    for p in most_visited_places:
        ret = get_lat_long(p)
        if ret is not None:
            X.append((ret[0], ret[1]))
            break
    return dist.pairwise(X)[0][1]
Example #23
    def compute_metrics(self, corpus, article_pos):
        """Computes metrics for the given cluster. Metrics computed are:
        diameter, radius, centroid, closest article to centroid, the distance
        of the closest article to the centroid.

        Args:
            corpus: A corpus in LSI space
            article_pos (dict): Maps the article id to the actual
                                positions of the article in the corpus
        """
        dist_corpus = [corpus[article_pos[x]] for x in self.articles_id]

        # Centroid calculation
        self.centroid = np.average(dist_corpus, axis=0)

        # Diameter calculation
        dist = DistanceMetric.get_metric('euclidean')
        dist_pair = dist.pairwise(dist_corpus)
        self.diameter = max(list(itertools.chain.from_iterable(dist_pair)))

        # Radius calculation
        dist_corpus.append(self.centroid)
        dist_pair = dist.pairwise(dist_corpus)
        centroid_dist = [x for x in dist_pair[-1] if x > 0]
        if len(centroid_dist) > 0:
            self.radius = max(centroid_dist)

            # Closest article computation
            closest_article = self.articles_id[0]
            min_dist = self.radius
            tmp_content = []

            for k, id in enumerate(self.articles_id):
                if centroid_dist[k] < min_dist:
                    closest_article = id
                    min_dist = centroid_dist[k]
                    tmp_content = self.data[k]

            self.closest_article_id = closest_article
            self.closest_article_distance = min_dist
            self.closest_article_content = tmp_content
Example #24
	def MDS(self,typeof='classic',dist=False,groups=None,dpi=300,textsize=10,interactive=False,
	        samemarker=False,markersize=8,numbered=False,legend=False,of='pdf',rotate=0,MD=False):
		'''
		Perform Multidimensional Scaling, either classic (PCoA) or non-metric.
		If you have the upper triangle of a distance matrix as a dictionary,
		pass the dictionary as dist.
		'''
		# Rotation instance
		self.clf = PCA(n_components=self.ncomp)		

		seed = np.random.RandomState(seed=3)

		if typeof == 'classic': 
			metric = True
			self.type = 'cMDS'
		else: 
			metric = False
			self.type = "nMDS"

		if dist:
			similarities=self.dict2array2matrix(dist)
		else:
			#similarities = euclidean_distances(self.data)
			dist = DistanceMetric.get_metric('euclidean')
			similarities = dist.pairwise(self.data)
		# Initiate multidimensional scaling
		mds = manifold.MDS(n_components=self.ncomp, metric = metric, max_iter=3000, eps=1e-9, 
		                   random_state=seed, dissimilarity="precomputed", n_jobs=-1)

		#fit the data the MDS
		pos = mds.fit(similarities).embedding_
		if typeof != 'classic': pos = mds.fit_transform(similarities, init=pos)

		# Rescale the data
		pos *= np.sqrt((np.array(self.data)** 2).sum()) / np.sqrt((pos ** 2).sum())

		# Rotate the data
		self.fit = self.clf.fit_transform(pos)
		self.Plot(dpi=dpi,textsize=textsize,interactive=interactive,samemarker=samemarker,
		          markersize=markersize,numbered=numbered,legend=legend,of=of,rotate=rotate,
		          groups=groups,MD=MD)
Example #25
    def mi_Kraskov(x,y,k=5,base=np.exp(1),intens=1e-10,metric="minkowski",p=np.float64('inf')):
        '''The mutual information estimator by Kraskov et al.
           Inputs are 2D arrays, with each column being a dimension and each row being a data point
        '''
        assert len(x)==len(y), "Lists should have same length"
        assert k <= len(x) - 1, "Set k smaller than num. samples - 1"
        x +=  intens*nr.rand(x.shape[0],x.shape[1])
        y +=  intens*nr.rand(x.shape[0],x.shape[1])
        points = np.hstack((x,y))

        #Find nearest neighbors in joint space, p=inf means max-norm
        tree = KDTree(points, metric=DistanceMetric.get_metric(metric, p=p))
        try:
            dvec = tree.query(points, k+1)[0][:, k]   # no need to reshape with new query_radius method
        except ValueError:
            return float("NaN")

        a = MI.avgdigamma(x,dvec*x.shape[1]/points.shape[1],metric=metric,p=p)
        b = MI.avgdigamma(y,dvec*y.shape[1]/points.shape[1],metric=metric,p=p)
        c = digamma(k)
        d = digamma(len(x))
        # print("ee_acc: %s, %s, %s, %s" %( a,b,c,d))
        return (-a-b+c+d)/np.log(base)
Example #26
def mean_distance_to_closest(predicted, event):
    angle = DistanceMetric.get_metric("pyfunc", func=spherical_angle)
    nn = BallTree(event.tracks, leaf_size=5, metric=angle)

    return np.sum([nn.query(predicted[i, :], k=1) for i in xrange(predicted.shape[0])]) / event.tracks.shape[0]
Example #27
def haversine_distance(p1, p2):
    d = DistanceMetric.get_metric('haversine')
    X = [p1, p2]
    return d.pairwise(X)[0][1]
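One caveat: scikit-learn's haversine metric expects each point as (latitude, longitude) in radians and returns the central angle on the unit sphere, so the result must be scaled by the Earth's radius to get a physical distance. A quick sketch (the 6371 km mean radius is this example's assumption):

import numpy as np

paris = np.radians([48.8566, 2.3522])     # (lat, lon) in radians
london = np.radians([51.5074, -0.1278])
print(haversine_distance(paris, london) * 6371)   # roughly 344 km great-circle distance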
Example #28
## KNN PREDICTOR ##

# do some lambda magic on text columns

traindata = list(train.apply(lambda x:'%s %s %s' % (x['query'],x['product_title'], x['product_description']),axis=1))
testdata = list(test.apply(lambda x:'%s %s %s' % (x['query'],x['product_title'], x['product_description']),axis=1))

# Fit TFIDF

tfv.fit(traindata)
X = tfv.transform(traindata)
X_test = tfv.transform(testdata)

clf = pipeline.Pipeline([('tSVD',tSVD),('scl',scl),('knn',knn)])
param_grid = {'knn__n_neighbors':[2],'knn__metric':[DistanceMetric.get_metric('manhattan')],'tSVD__n_components':[400]}

model = grid_search.GridSearchCV(estimator = clf, param_grid = param_grid, scoring = kappa_scorer, refit = True, cv = 2, n_jobs = -1)

# Fit Model

model.fit(X, y)
model.best_estimator_.fit(X,y)
trainPred = model.best_estimator_.predict(X_test)

# Averaging predicted relevance values

finalPred = [int(floor((int(stemPred[i])+trainPred[i])*0.5)) for i in range(len(stemPred))]

#print "Kappa Score for Training Data\nStemming+KNN\nScore=%f" %(quadratic_weighted_kappa(y, finalPred))
Example #29
  [0,0,1,1,0,1,0,1,1,0,1,1,0,1,1,0,0,1,1,1,1,1],
  [1,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0],
  [1,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0],
  [1,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0],
  [1,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0],
  [0,0,1,0,0,1,0,1,0,1,1,0,0,0,0,0,0,1,1,1,1,0],
  [1,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
  ]

AttributeClassifier = joblib.load('Dumps/AttributeClassifierKnowledgeTransfer.pkl')

features = scipy.io.loadmat("./UIUC1/UIUC1_win_feature.mat")
labels = scipy.io.loadmat("./UIUC1/UIUC1_labels.mat")
action_actor = open("./UIUC1/action_actor.txt")

dist = DistanceMetric.get_metric('euclidean')

mapping = [{}]
for line in action_actor:
    line = line.split()
    actionvector = numpy.zeros(14, dtype=numpy.int)
    actionvector[int(line[0])]=1
    mapping.append({'action':int(line[0]),'actionvector':actionvector, 'actor':int(line[1])})

total = len(labels['vlabels'][0])
ConfusionMatrix=numpy.array([[0,0],[0,0]])
NovelClassList=[[0,1],[2,3],[4,5],[6,7],[8,9],[10,11],[12,13]]
for NovelClass in NovelClassList:

    ConfusionMatrix2=numpy.array([[0,0],[0,0]])
Example #30
def dist(X_1, X_2, param='euclidean'):
    dist = DistanceMetric.get_metric(param)
    X = [X_1,X_2]
    return dist.pairwise(X)[0,1]
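A quick check (assumes numpy and DistanceMetric are imported):

print(dist([0, 0], [3, 4]))                 # 5.0 -- euclidean
print(dist([0, 0], [3, 4], 'manhattan'))    # 7.0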