Example #1
def gower_distance(X):
    """
    This function expects a pandas DataFrame as input.
    The DataFrame should contain the features along its columns. Based on these features, a
    distance matrix is returned containing the pairwise Gower distance between the rows.
    All variables of object dtype are treated as nominal variables and the others are treated as
    numeric variables.

    Distance metrics used for:

    Nominal variables: Dice distance (https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient)
    Numeric variables: Manhattan distance normalized by the range of the variable (https://en.wikipedia.org/wiki/Taxicab_geometry)
    """
    individual_variable_distances = []

    for i in range(X.shape[1]):
        feature = X.iloc[:,[i]]
        if feature.dtypes[0] == object:
            feature_dist = DistanceMetric.get_metric('dice').pairwise(pd.get_dummies(feature))
        else:
            feature_dist = DistanceMetric.get_metric('manhattan').pairwise(feature) / np.ptp(feature.values)
            
        individual_variable_distances.append(feature_dist)

    return np.array(individual_variable_distances).mean(0)
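A minimal usage sketch for gower_distance (hedged: it assumes pandas as pd, numpy as np and sklearn's DistanceMetric are imported as the snippet expects; the toy DataFrame and its column names are made up for illustration):

toy = pd.DataFrame({
    'color': ['red', 'blue', 'red'],   # object dtype -> nominal, compared with Dice distance
    'price': [10.0, 20.0, 15.0],       # numeric -> range-normalized Manhattan distance
})
print(gower_distance(toy))             # 3 x 3 symmetric matrix of pairwise Gower distances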
Example #2
def calc_mahalanobis(x, y, n_neighbors):
    from sklearn.neighbors import DistanceMetric, NearestNeighbors
    # Feature covariance (rowvar=False: rows are samples, columns are features)
    V = np.cov(x, rowvar=False)
    DistanceMetric.get_metric('mahalanobis', V=V)  # sanity-check that the metric can be built

    nn = NearestNeighbors(n_neighbors=n_neighbors,
                          algorithm='brute',
                          metric='mahalanobis',
                          metric_params={'V': V})
    return nn.fit(x).kneighbors(y)
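A quick usage sketch for calc_mahalanobis (hedged: numpy as np is assumed; the random arrays are purely illustrative):

rng = np.random.RandomState(0)
x = rng.randn(50, 3)                        # 50 reference samples with 3 features (synthetic)
y = rng.randn(5, 3)                         # 5 query samples (synthetic)
dists, idx = calc_mahalanobis(x, y, n_neighbors=2)
print(dists.shape, idx.shape)               # (5, 2) (5, 2): distances and indices of the 2 nearest neighbors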
Example #3
File: knn.py Project: salceson/mro
def mahalonobis(X):
    cov = np.cov(X, rowvar=0)
    try:
        metric = DistanceMetric.get_metric('mahalanobis', V=cov) if X.shape[0] > 1 \
            else DistanceMetric.get_metric('euclidean')
    except LinAlgError:
        metric = DistanceMetric.get_metric('euclidean')

    def distance(x, y):
        return metric.pairwise([x], [y])[0][0]

    return distance
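A small usage sketch for the factory above (hedged: numpy as np is assumed and the training matrix is synthetic):

X_train = np.random.RandomState(1).randn(20, 4)   # illustrative data only
d = mahalonobis(X_train)                          # returns a distance(x, y) callable
print(d(X_train[0], X_train[1]))                  # Mahalanobis distance between the first two rows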
Example #4
File: knn.py Project: salceson/mro
def euclid(_):
    metric = DistanceMetric.get_metric('euclidean')

    def distance(x, y):
        return metric.pairwise([x], [y])[0][0]

    return distance
Example #5
def dist(X, Y, distance_function = "euclidean"):
    """calculate X, Y distance matrix
    [Args]
    ------
    X : m samples
    Y : n samples
    distance_function : user_defined distance
    
    [Returns]
    ---------
    distance_matrix: n * m distance matrix
    
    
    we have those built-in function. Default = euclidean
    
    "euclidean"    EuclideanDistance    sqrt(sum((x - y)^2))
    "manhattan"    ManhattanDistance    sum(|x - y|)
    "chebyshev"    ChebyshevDistance    sum(max(|x - y|))
    "minkowski"    MinkowskiDistance    sum(|x - y|^p)^(1/p)
    "wminkowski"    WMinkowskiDistance    sum(w * |x - y|^p)^(1/p)
    "seuclidean"    SEuclideanDistance    sqrt(sum((x - y)^2 / V))
    "mahalanobis"    MahalanobisDistance    sqrt((x - y)' V^-1 (x - y))
    """
    distance_calculator = DistanceMetric.get_metric(distance_function)
    return distance_calculator.pairwise(X, Y)
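A brief usage sketch (hedged: numpy as np and the DistanceMetric import are assumed, as in the snippet; the arrays are illustrative):

X = np.array([[0.0, 0.0], [1.0, 1.0], [2.0, 2.0]])   # m = 3 samples
Y = np.array([[0.0, 1.0], [1.0, 0.0]])               # n = 2 samples
D = dist(X, Y, distance_function="manhattan")
print(D.shape)                                       # (3, 2): one row per sample of X, one column per sample of Y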
Example #6
    def get_full_metric(self, train_pairs):
        train_pairs_flat = [item for subtuple in train_pairs for item in subtuple]
        
        pca = PCA(n_components = self.pca_components)
        pca.fit(train_pairs_flat)

        train_pairs_pca_flat = pca.transform(train_pairs_flat)

        train_pairs_pca = list()

        for i in xrange(0, len(train_pairs_pca_flat), 2):
            a = i 
            b = i + 1
            train_pairs_pca.append((train_pairs_pca_flat[a],
              train_pairs_pca_flat[b]))
        
        ys = ys_from_pairs(train_pairs_pca)

        file_id = str(random.random())[2:]

        save_cvx_params(ys, file_id)
        run_cvx(file_id)
        M = load_cvx_result(file_id)

        dist = DistanceMetric.get_metric('mahalanobis', VI = M)

        return dist, M, pca
Example #7
    def run_single_trial(self, train_pairs, test_pairs, train_tune_data, test_tune_data):
        print "Running PCA..."
        train_pairs_pca, test_pairs_pca = self.fit_pca(train_pairs, test_pairs)
        ys = ys_from_pairs(train_pairs_pca)

        file_id = str(random.random())[2:]

        save_cvx_params(ys, file_id)
        run_cvx(file_id)
        M = load_cvx_result(file_id)
        dist = DistanceMetric.get_metric('mahalanobis', VI = M)
        train_a_sections = [x[0] for x in train_pairs_pca]
        train_b_sections = [x[1] for x in train_pairs_pca]
        test_a_sections = [x[0] for x in test_pairs_pca]
        test_b_sections = [x[1] for x in test_pairs_pca]

        train_given_sections = train_a_sections
        train_to_match_sections = train_b_sections
        test_given_sections = test_a_sections
        test_to_match_sections = test_b_sections
        if self.match_a_to_b:
            train_given_sections = train_b_sections
            train_to_match_sections = train_a_sections
            test_given_sections = test_b_sections
            test_to_match_sections = test_a_sections

        print "Constructing BallTrees..."
        train_bt = BallTree(train_to_match_sections, metric=dist)
        test_bt = BallTree(test_to_match_sections, metric=dist)

        train_top_fraction = int(len(train_given_sections) * self.correct_within_top_fraction)
        test_top_fraction = int(len(test_given_sections) * self.correct_within_top_fraction)

        print "Querying the BallTrees..."
        train_result = train_bt.query(train_given_sections, train_top_fraction)
        test_result = test_bt.query(test_given_sections, test_top_fraction)

        print "Looking at correctness of results..."
        train_correct = sum([int(i in train_result[1][i]) for i in xrange(len(train_given_sections))])
        test_correct = sum([int(i in test_result[1][i]) for i in xrange(len(test_given_sections))])

        print "Finding indices of correct matches..."
        test_result_full = test_bt.query(test_given_sections, len(test_given_sections))
        def default_index(lst, i):
          ind = -1
          try:
            ind = lst.index(i)
          except ValueError:
            pass
          return ind
        test_indices = [default_index(list(test_result_full[1][i]), i) for i in xrange(len(test_given_sections))]
        test_indices = [x for x in test_indices if x != -1]

        with open("successful_tunes_{}".format(file_id), 'w') as successful_tunes_f:
          for i, index in enumerate(test_indices):
            if index == 0:
              successful_tunes_f.write(str(test_tune_data[i]) + '\n\n')

        return [[train_correct, len(train_given_sections)],
            [test_correct, len(test_given_sections)]], test_indices
def standardizedEulideanDistance(wide, p):
    """ Calculate the standardized Euclidean distance and return an array of distances to the center and a matrix of pairwise distances.

    :Arguments:
        :type wide: pandas.DataFrame
        :param wide: A wide formatted data frame with samples as columns and compounds as rows.

        :type p: float
        :param p: Percentile level (e.g. 0.95) at which the cutoff values are computed.

    :Returns:
        :return: Return 4 pd.DataFrames with SED values and cutoffs.
        :rtype: pd.DataFrames
    """

    # Estimated Variance from the data
    varHat = wide.var(axis=1, ddof=1)
    varHat[varHat==0] = 1
    dist = DistanceMetric.get_metric('seuclidean', V=varHat)

    # Column means
    colMean = wide.mean(axis=1)

    # Calculate the standardized Euclidean Distance from all samples to the center

    SEDtoCenter = dist.pairwise(wide.values.T, pd.DataFrame(colMean).T)
    SEDtoCenter = pd.DataFrame(SEDtoCenter, columns = ['SED_to_Center'], index = wide.columns)
    
    # Calculate the pairwise standardized Euclidean Distance of all samples
    SEDpairwise = dist.pairwise(wide.values.T)
    SEDpairwise = pd.DataFrame(SEDpairwise, columns = wide.columns, index = wide.columns)
    for index, row in SEDpairwise.iterrows():
        SEDpairwise.loc[index, index] = np.nan
    
    # Calculate cutoffs
    # For SEDtoCenter: 
    #   Beta: sqrt((p-1)^2/p*(sum of n iid Beta(1/2, p/2)));        (It's the exact distribution.)
    #   Normal: sqrt(N((p-1)/p*n, 2*(p-2)*(p-1)^2/p^2/(p+1)*n));    (It's normal approximation. Works well when n is large.)
    #   Chisq: sqrt((p-1)/p*Chi-sq(n));                             (It's Chi-sq approximation. Works well when p is decent and p/n is not small.)
    # For SEDpairwise:
    #   Beta: sqrt(2*(p-1)*(sum of n iid Beta(1/2, p/2)));
    #   Normal: sqrt(N(2*n, 8*(p-2)/(p+1)*n));
    #   Chisq: sqrt(2*Chi-sq(n));
    # where n = # of compounds and p = # of samples
    pSamples  = float(wide.shape[1])
    nFeatures = float(wide.shape[0])
    nIterate  = 20000 #100000
    #p = 0.95
    betaP     = np.percentile(pd.DataFrame(stats.beta.rvs(0.5, 0.5*(pSamples-2), size=int(nIterate*nFeatures)).reshape(nIterate, int(nFeatures))).sum(axis=1), p*100)
    betaCut1  = np.sqrt((pSamples-1)**2/pSamples*betaP)
    normCut1  = np.sqrt(stats.norm.ppf(p, (pSamples-1)/pSamples*nFeatures, np.sqrt(2*nFeatures*(pSamples-2)*(pSamples-1)**2/pSamples**2/(pSamples+1))))
    chisqCut1 = np.sqrt((pSamples-1)/pSamples*stats.chi2.ppf(p, nFeatures))
    betaCut2  = np.sqrt((pSamples-1)*2*betaP)
    normCut2  = np.sqrt(stats.norm.ppf(p, 2*nFeatures, np.sqrt(8*nFeatures*(pSamples-2)/(pSamples+1))))
    chisqCut2 = np.sqrt(2*stats.chi2.ppf(p, nFeatures))
    cutoff1   = pd.DataFrame([[betaCut1, normCut1, chisqCut1]], columns=['Beta(Exact)', 'Normal', 'Chi-sq'])
    cutoff2   = pd.DataFrame([[betaCut2, normCut2, chisqCut2]], columns=['Beta(Exact)', 'Normal', 'Chi-sq'])

    # TODO: Create a flag based on values greater than one of the cutoffs.
    return SEDtoCenter, cutoff1, SEDpairwise, cutoff2
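A minimal usage sketch for standardizedEulideanDistance (hedged: it assumes numpy as np, pandas as pd, scipy.stats as stats and DistanceMetric are imported as the function expects; the wide DataFrame below, compounds as rows and samples as columns, is synthetic):

rng = np.random.RandomState(0)
wide = pd.DataFrame(rng.randn(100, 6),                               # 100 compounds x 6 samples (synthetic)
                    columns=['sample_%d' % i for i in range(6)])
SEDtoCenter, cutoff1, SEDpairwise, cutoff2 = standardizedEulideanDistance(wide, p=0.95)
print(SEDtoCenter.shape, SEDpairwise.shape)                          # (6, 1) (6, 6)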
def example2():
    """using customized distance
    """
    from HSH.Misc.shgeo import dist
    def earthdist(x, y): # latitude, longitude earth surface distance
        return dist((x[0], x[1]), (y[0], y[1]))
    
    dist_cal = DistanceMetric.get_metric(earthdist)
    train = np.array([[32.5, 101.0], [32.5, 102.0]])
    test = np.array([[31.5, 101.0], [39.5, 101.0]])
    print(dist_cal.pairwise(train, test))
Example #10
def distance(X, distance_measure='euclidean'):

    X = np.array(X)

    if distance_measure in SKLEARN_METRICS:
        distance_ = DistanceMetric.get_metric(distance_measure).pairwise(X)
    elif distance_measure == 'pearson':
        distance_ = np.corrcoef(X)
    else:
        distance_ = None

    return distance_
Example #11
 def load_model(self):
     if self.file_cache and os.path.isfile(self.file_cache):
         self._log.debug("Loading mode: %s", self.file_cache)
         with numpy.load(self.file_cache) as cache:
             tail = tuple(cache['tail'])
             s = (cache['data_arr'], cache['idx_array_arr'],
                  cache['node_data_arr'], cache['node_bounds_arr']) +\
                 tail + (DistanceMetric.get_metric('hamming'),)
         #: :type: sklearn.neighbors.BallTree
         self.bt = BallTree.__new__(BallTree)
         self.bt.__setstate__(s)
         self._log.debug("Loading mode: Done")
Example #12
def example1():
    dist = DistanceMetric.get_metric("euclidean")
    train = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    test = np.array([[0.5, 0.5], [-0.5, -0.5]])
    
    distance_matrix = dist.pairwise(train, test)
    print(distance_matrix) # distance_matrix
    
    reduced_distance_matrix = dist.dist_to_rdist(distance_matrix) # reduced_distance_matrix
    print(reduced_distance_matrix) # for euclidean, it's squared distance_matrix
    
    print(dist.rdist_to_dist(reduced_distance_matrix))
Example #13
 def entropy(x,k=3,base=np.exp(1),intens=1e-10):
     """ The classic K-L k-nearest neighbor continuous entropy estimator
         x should be a list of vectors, e.g. x = [[1.3],[3.7],[5.1],[2.4]]
         if x is one-dimensional and we have four samples
     """
     assert k <= len(x)-1, "Set k smaller than num. samples - 1"
     d = len(x[0])
     N = len(x)
     x +=  intens*nr.rand(N,d)
     tree = KDTree(x, metric=DistanceMetric.get_metric("minkowski",p=np.float64('inf') ))
     nn = tree.query(x,k+1)[0][:,k]   # no need to reshape with new query_radius method
     const = digamma(N)-digamma(k) + d*log(2)
     return (const + d*np.mean(np.log(nn)))/log(base)
Example #14
    def find_computed_cluster_metrics(self):
        """Initialises cluster metric computation over every cluster that is
        found by the given clustering algorithm.
        """
        for cluster in self.computed_clusters:
            cluster.compute_metrics(self.original_corpus,
                                    self.original_article_pos)

        centroid_locs = [x.centroid for x in self.computed_clusters]
        dist = DistanceMetric.get_metric('euclidean')
        dist_pair = dist.pairwise(centroid_locs)
        self.max_centroid_dist = max(list(itertools.chain.from_iterable(
            dist_pair)))
Example #15
    def __init__(self, proc, Xss, yss, valid_set=0.1, validation_set=None):
        self.seen_states = set()
        self.state_set = []
        self.proc = proc
        self.valid_set = valid_set
        self.surr_loss = DistanceMetric.get_metric('hamming')

        if validation_set is None:
            self._split(Xss, yss)
        else:
            self.Xss = Xss
            self.yss = yss
            self.valid_Xss, self.valid_yss = validation_set
Example #16
def __max_score_mapping(rr, predicted, test, max_angle=1.0e-2):
    angle = DistanceMetric.get_metric("pyfunc", func=spherical_angle)
    d = angle.pairwise(predicted, test)

    # Each true sample maps to closest
    test_mapping = np.zeros(shape=(test.shape[0],), dtype=float)

    for i in xrange(test.shape[0]):
        if np.any(d[:, i] < max_angle):
            close_predictions = d[:, i] < max_angle
            scores = [rr(p) for p in predicted[close_predictions, :]]
            test_mapping[i] = np.max(scores)

    return test_mapping
Example #17
def update_prediction(prediction, real_pic, metric_name='euclidean'):
    """
    Update a prediction after receiving the actual picture from the webcam.

    Parameters
    ----------
    prediction : Prediction
        The model object of the prediction to update
    real_pic : Picture
        The model object of the actual picture received

    Return
    ------
    float : the prediction error
    """
    pred_pic = prediction.as_picture()
    cam_id = prediction.params.webcam.webcam_id
    if metric_name == 'wminkowski-pca':
        with webcam_fs.get_dataset(cam_id) as dataset:
            if 'pca' not in dataset.imgset.feature_sets:
                raise ValueError("""wminkowski-pca cannnot be used
                                    without a PCA feature set""")

            pca_extractor = dataset.imgset.feature_sets['pca'].extractor
            weights = pca_extractor.pca.explained_variance_ratio_
            pred_data = pca_extractor.extract(pred_pic.pixels)
            real_data = pca_extractor.extract(real_pic.pixels)
            metric = DistanceMetric.get_metric('wminkowski', p=2, w=weights)
    else:
        pred_data = pred_pic.pixels
        real_data = real_pic.pixels
        metric = DistanceMetric.get_metric(metric_name)

    error = metric.pairwise([pred_data], [real_data])[0]
    prediction.error = error
    prediction.save()
    return error
Example #18
def entropy(data, ball='euclidean', k=1, units='nats'):
    """
    Estimates the entropy of the given data using the k-nearest neighbors method

    input
    -----
    data (nd-array):
        An (n by p) matrix containing n samples of p-dimensional data

    ball (string):
        Which ball (e.g. l1, euclidean, etc.) to use when computing the volume.
        Acceptable strings include:
            'l1'   : l1 or Manhattan distance
            'l2'   : l2 or Euclidean distance; default
            'linf' : l-infinity or Chebyshev distance

    k (integer):
        How many nearest-neighbors to use when computing radii. Must be at least 1.

    units (string):
        Which unit the entropy output has.
        Acceptable strings include:
            'nats' : base e
            'bits' : base 2

    """
    
    # Get number of samples and dimensionality
    (n,p)  = data.shape
    
    # Determine radii and volumes for a given metric space
    metric = getball(ball)
    if metric == 1:
        m = 'manhattan'
    elif metric == 2:
        m = 'euclidean'
    elif metric == np.inf:
        m = 'chebyshev'
        
    dist  = DistanceMetric.get_metric(m)
    D_mat = dist.pairwise(data)
    D_mat.sort(axis=1)
    radii = D_mat[:,k]
    Vs    = volume(radii, ball=str(metric), dimension=p)
    
    if units.lower() == 'nats':
        return sum([np.log(vol) for vol in Vs])/float(n) + np.log(n) - L(k - 1) + 0.577215665
    if units.lower() == 'bits':
        return sum([np.log2(vol) for vol in Vs])/float(n) + np.log2(n) - L(k - 1) + 0.577215665
Example #19
def __mappings(predicted, test, max_angle=1.0e-2):
    angle = DistanceMetric.get_metric("pyfunc", func=spherical_angle)
    d = angle.pairwise(predicted, test)

    # Each true sample maps to closest
    test_mapping = np.zeros(shape=(test.shape[0],), dtype=int)
    predicted_mapping = np.zeros(shape=(predicted.shape[0],), dtype=int)

    for i in xrange(test.shape[0]):
        test_mapping[i] = 1 if np.any(d[:, i] < max_angle) else 0

    for i in xrange(predicted.shape[0]):
        predicted_mapping[i] = 1 if np.any(d[i, :] < max_angle) else 0

    return predicted_mapping, test_mapping
Example #20
    def mi_LNC(x,y,k=5,base=np.exp(1),alpha=0.25,intens = 1e-10,metric='minkowski',p=np.float64('inf')):
        '''The mutual information estimator by PCA-based local non-uniform correction(LNC)
           each row of x is a sample and each column a dimension, e.g. x = [[1.0,3.0,3.0],[0.1,1.2,5.4]] holds two samples of three-dimensional data
           alpha is a threshold parameter related to k and d(dimensionality), please refer to our paper for details about this parameter
        '''
        #N is the number of samples
        N = x.shape[0]

        #First Step: calculate the mutual information using the Kraskov mutual information estimator
        #adding small noise to X, e.g., x<-X+noise
        x += intens*nr.rand(x.shape[0],x.shape[1])
        y += intens*nr.rand(x.shape[0],x.shape[1])
        points = np.hstack((x,y))

        tree = KDTree(points, metric=DistanceMetric.get_metric(metric, p=p))
        try:
            dvec, knn_idx = tree.query(points, k+1)   # no need to reshape with new query_radius method
        except ValueError:
          return (float("NaN"))

        a = MI.avgdigamma(x,dvec[:,-1]*x.shape[1]/points.shape[1], metric=metric, p=p)
        b = MI.avgdigamma(y,dvec[:,-1]*y.shape[1]/points.shape[1], metric=metric, p=p)
        c = digamma(k)
        d = digamma(len(x))

        # a,b,c,d = MI.avgdigamma(x,dvec), MI.avgdigamma(y,dvec), digamma(k), digamma(len(x))
        # print("ee_acc: %s, %s, %s, %s" %( a,b,c,d))
        ret = (-a-b+c+d)/np.log(base)

        # LNC correction
        logV_knn = np.sum(np.log(np.abs(points - points[knn_idx[:,-1],:])), axis=1)
        logV_projected = np.zeros(logV_knn.shape)
        for i in range(points.shape[0]):
            knn_points = points[knn_idx[i,:],:]
            knn_centered = knn_points - points[i,:]
            u,s,v = la.svd(knn_centered)
            knn_proj = knn_centered.dot(v.T)
            max_dims = np.max(np.abs(knn_proj), axis=0)   # max-norm per dimension
            logV_projected[i] = np.sum(np.log(max_dims))

        diff = logV_projected - logV_knn
        if (alpha>1): alpha = 1
        diff[diff >= log(alpha)] = 0
        e = -np.sum(diff) / N

        return (ret + e)/log(base)
def calcMDS(pltnum, flag, dmetric):
    if flag == 1:
        clf = PCA(n_components=5)
        Y = clf.fit_transform(X)
        title  = 'PCA-MDS'
    elif flag == 2:
        clf = TruncatedSVD(n_components=5)
        Y = clf.fit_transform(X)
        title = 'SVD-MDS'
    else:
        Y = X
        title = 'MDS DistanceMetric: ' + str(dmetric)
    dist = DistanceMetric.get_metric(dmetric)
    Y    = dist.pairwise(Y)
    # Y = euclidean_distances(Y)
    mds = manifold.MDS(n_components=2, dissimilarity='precomputed')#, init='pca', random_state=0)
    Y = mds.fit_transform(Y)
    for i in range(1, 3):
        mdsPlot(int(str(pltnum) + str(i)), i, Y, title)
Example #22
def distance_from_most_visited_place(place, user):
    q = select([func.count(),visits_10min.c.placeid]).where(visits_10min.c.userid == user).group_by(visits_10min.c.placeid).order_by(func.count().desc())
    most_visited_places = [r[1] for r in connection.execute(q).fetchall()]
    def get_lat_long(place_q):
        try:
            return connection.execute(select([places_location.c.longitude, places_location.c.latitude]).where(and_(places_location.c.placeid == place_q, places_location.c.userid == user))).fetchall()[0]
        except Exception as e:
            return None
            
    dist = DistanceMetric.get_metric('haversine')
    X = []
    X.append(get_lat_long(place))
    for p in most_visited_places:
        ret = get_lat_long(p)
        if ret is not None:
            X.append((ret[0], ret[1]))
            break
    return dist.pairwise(X)[0][1]
Example #23
	def MDS(self,typeof='classic',dist=False,groups=None,dpi=300,textsize=10,interactive=False,
	        samemarker=False,markersize=8,numbered=False,legend=False,of='pdf',rotate=0,MD=False):
		'''
		Perform Multidimensional Scaling, either classic (PCoA) or non-metric.
		If you have the upper triangle of a distance matrix as a dictionary,
		pass the dictionary as dist.
		'''
		# Rotation instance
		self.clf = PCA(n_components=self.ncomp)		

		seed = np.random.RandomState(seed=3)

		if typeof == 'classic': 
			metric = True
			self.type = 'cMDS'
		else: 
			metric = False
			self.type = "nMDS"

		if dist:
			similarities=self.dict2array2matrix(dist)
		else:
			#similarities = euclidean_distances(self.data)
			dist = DistanceMetric.get_metric('euclidean')
			similarities = dist.pairwise(self.data)
		# Initiate multidimensional scaling
		mds = manifold.MDS(n_components=self.ncomp, metric = metric, max_iter=3000, eps=1e-9, 
		                   random_state=seed, dissimilarity="precomputed", n_jobs=-1)

		#fit the data the MDS
		pos = mds.fit(similarities).embedding_
		if typeof != 'classic': pos = mds.fit_transform(similarities, init=pos)

		# Rescale the data
		pos *= np.sqrt((np.array(self.data)** 2).sum()) / np.sqrt((pos ** 2).sum())

		# Rotate the data
		self.fit = self.clf.fit_transform(pos)
		self.Plot(dpi=dpi,textsize=textsize,interactive=interactive,samemarker=samemarker,
		          markersize=markersize,numbered=numbered,legend=legend,of=of,rotate=rotate,
		          groups=groups,MD=MD)
Example #24
    def compute_metrics(self, corpus, article_pos):
        """Computes metrics for the given cluster. Metrics computed are:
        diameter, radius, centroid, closest article to centroid, the distance
        of the closest article to the centroid.

        Args:
            corpus: A corpus in LSI space
            article_pos (dict): Maps the article id to the actual
                                positions of the article in the corpus
        """
        dist_corpus = [corpus[article_pos[x]] for x in self.articles_id]

        # Centroid calculation
        self.centroid = np.average(dist_corpus, axis=0)

        # Diameter calculation
        dist = DistanceMetric.get_metric('euclidean')
        dist_pair = dist.pairwise(dist_corpus)
        self.diameter = max(list(itertools.chain.from_iterable(dist_pair)))

        # Radius calculation
        dist_corpus.append(self.centroid)
        dist_pair = dist.pairwise(dist_corpus)
        centroid_dist = [x for x in dist_pair[-1] if x > 0]
        if len(centroid_dist) > 0:
            self.radius = max(centroid_dist)

            # Closest article computation
            closest_article = self.articles_id[0]
            min_dist = self.radius
            tmp_content = []

            for k, id in enumerate(self.articles_id):
                if centroid_dist[k] < min_dist:
                    closest_article = id
                    min_dist = centroid_dist[k]
                    tmp_content = self.data[k]

            self.closest_article_id = closest_article
            self.closest_article_distance = min_dist
            self.closest_article_content = tmp_content
Example #25
    def mi_Kraskov(x,y,k=5,base=np.exp(1),intens=1e-10,metric="minkowski",p=np.float64('inf')):
        '''The mutual information estimator by Kraskov et al.
           Inputs are 2D arrays, with each column being a dimension and each row being a data point
        '''
        assert len(x)==len(y), "Lists should have same length"
        assert k <= len(x) - 1, "Set k smaller than num. samples - 1"
        x +=  intens*nr.rand(x.shape[0],x.shape[1])
        y +=  intens*nr.rand(x.shape[0],x.shape[1])
        points = np.hstack((x,y))

        #Find nearest neighbors in joint space, p=inf means max-norm
        tree = KDTree(points, metric=DistanceMetric.get_metric(metric,p=p))
        try:
          dvec = tree.query(points,k+1)[0][:,k]   # no need to reshape with new query_radius method
        except ValueError:
          return (float("NaN"))

        a = MI.avgdigamma(x,dvec*x.shape[1]/points.shape[1],metric=metric,p=p)
        b = MI.avgdigamma(y,dvec*y.shape[1]/points.shape[1],metric=metric,p=p)
        c = digamma(k)
        d = digamma(len(x))
        # print("ee_acc: %s, %s, %s, %s" %( a,b,c,d))
        return (-a-b+c+d)/np.log(base)
Example #26
def compute_medoid(raw_points):
    points = numpy.radians([[p[0], p[1]] for p in raw_points])
    d = DistanceMetric.get_metric('haversine')
    dists = d.pairwise(points)
    index = numpy.argmin(dists.sum(axis=0))
    return index
Example #27
def mst(data_path, k):

    df_pd = pd.read_csv(data_path)  # import the data
    N = len(df_pd)  # length of the data
    df_pd.columns = [i for i in range(0, len(df_pd.columns))
                     ]  # name the columns with numbers

    # Compute distances
    dist = DistanceMetric.get_metric('euclidean')
    df = df_pd.to_numpy()

    distance = dist.pairwise(
        df
    )  # matrix with the pair-wise euclidean distance between all data points

    # Sort the edges in ascending order (according to the distance in between vertices)
    sorted_edges = np.transpose(
        np.unravel_index(np.argsort(distance, axis=None),
                         distance.shape)).tolist()

    # Drop the first N entries (zero self-distances); the remaining edges each appear twice ((i, j) and (j, i)), so keep every other one
    sorted_edges = sorted_edges[N::2]

    # Add a cluster column to keep track of each point's cluster; points not yet assigned to any cluster are marked 'Non visited'
    df_pd['cluster'] = ['Non visited' for i in range(0, N)]

    # Start algorithm

    # Initiate a dictionary with cluster and the data points it has
    cluster_dic = {-1: []}

    # Initiate a counter for the number of edges we add
    counter = 0
    t0 = time.time()
    for edge in sorted_edges:  # iterate over all the edges (sorted)

        parent_0 = df_pd.loc[
            edge[0],
            'cluster']  # the cluster to which the first data point of the edge belongs
        parent_1 = df_pd.loc[
            edge[1],
            'cluster']  # the cluster to which the second data point of the edge belongs

        # If both data points belong to the same cluster, then do nothing because it would create a cycle
        if (parent_0 == parent_1) and (parent_0 != 'Non visited'):
            pass

        # If both data points have no cluster assigned, create a new cluster with both data points
        elif (parent_0 == 'Non visited') and (parent_1 == 'Non visited'):
            counter += 1
            max_cluster = max(cluster_dic)
            cluster_dic[max_cluster + 1] = [edge[0]]
            cluster_dic[max_cluster + 1] += [edge[1]]

            # Keep track that these two data points have now a cluster by including the cluster number to df_pd
            df_pd.loc[edge[0], 'cluster'] = max_cluster + 1
            df_pd.loc[edge[1], 'cluster'] = max_cluster + 1

        # If exactly one of the data points has no cluster yet, add it to the other point's cluster
        # and record the assignment by writing the cluster number to df_pd
        elif (parent_0 == 'Non visited') or (parent_1 == 'Non visited'):
            counter += 1
            if parent_0 == 'Non visited':
                cluster_dic[parent_1] += [edge[0]]
                df_pd.loc[edge[0], 'cluster'] = parent_1
            else:
                cluster_dic[parent_0] += [edge[1]]
                df_pd.loc[edge[1], 'cluster'] = parent_0

        # If the two data points belong to different cluster, add the vertices of the second cluster to the first,
        # delete the second cluster and change the cluster in df_pd for points in second cluster
        else:
            counter += 1
            cluster_dic[parent_0] += cluster_dic[parent_1]
            del cluster_dic[parent_1]

            df_pd.loc[df_pd['cluster'] == parent_1, 'cluster'] = parent_0

        # Stop iterating once we have k clusters, that is, once N - k edges have been added; the remaining
        # unassigned points become 'alone' clusters (clusters with only one data point)
        if N - counter == k:
            for i in range(0, N):
                if df_pd.loc[i, 'cluster'] == 'Non visited':
                    new_cluster = max(cluster_dic) + 1
                    df_pd.loc[i, 'cluster'] = new_cluster
                    cluster_dic[new_cluster] = [i]
            t1 = time.time()
            print(t1 - t0)
            return df_pd
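A hedged usage sketch for mst(): it writes a small synthetic CSV (the file name and column names are made up) and asks for k = 2 clusters; pandas as pd and numpy as np are assumed to be imported, as in the function itself.

rng = np.random.RandomState(0)
pts = np.vstack([rng.randn(20, 2), rng.randn(20, 2) + 8])      # two well-separated blobs (synthetic)
pd.DataFrame(pts, columns=['x', 'y']).to_csv('toy_points.csv', index=False)

clustered = mst('toy_points.csv', k=2)
print(clustered['cluster'].value_counts())                     # roughly 20 points per cluster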
Example #28
def dist(X_1, X_2, param='euclidean'):
    dist = DistanceMetric.get_metric(param)
    X = [X_1,X_2]
    return dist.pairwise(X)[0,1]
Example #29
 
test_error_cheb = 1- accuracy_score(y_test, y_pred_test)

#mahalanobis

from sklearn.datasets import make_classification
from sklearn.neighbors import DistanceMetric
x = x_train
y = y_train
xt = x_train
yt = y_train
test_error = []
for K in list(range(1, 99,5)):
        x, y = make_classification()
        xt, yt = make_classification()
        DistanceMetric.get_metric('mahalanobis', V=np.cov(x))
        K_value = K
        neigh = KNeighborsClassifier(n_neighbors =K,algorithm='brute',  metric='mahalanobis', metric_params={'V': np.cov(x)})
        neigh.fit(x,y)
        y_pred_test = neigh.predict(xt)
        # count the misclassified test samples and record the error rate for this K
        misclassified = [i for i in range(len(yt)) if yt[i] != y_pred_test[i]]
        error_rate = len(misclassified) / len(y_pred_test)
        test_error.append(error_rate)
        minpos = test_error.index(min(test_error))
        k_use = (minpos * 5) + 1
  
#optimal k = 4      
Example #30
def euclidean(x, y):
    result = DistanceMetric.get_metric('euclidean')
    return result.pairwise(x, y)
Example #31
def aglo(processed_files, n_clusters=80):
    '''
    This function agglomerates each group of nearby coordinates into a single coordinate, plots the result into KML
    and marks it as Left, Right or Stop
    :param processed_files: List containing the full paths of the three generated text files
    :param n_clusters: Number of clusters; by default set to 80
    :return: None
    '''

    # Call the readFile function to parse the text files into dataframes, stored in a dictionary
    # keyed by the full path of each text file
    dataframes = {}
    for file in processed_files:
        dataframes[file] = readFile(file)

    kml1 = simplekml.Kml(open=1)
    for df in dataframes:
        for row in dataframes[df]:
            kmlPoint(kml1, df, row)
    kml1.save('Output_before_aglo.kml')

    # Perform agglomerative clustering for the three dataframes and store the cluster labels of each
    # clustering run in a dictionary
    model_labels = {}
    for df in dataframes:
        dist = DistanceMetric.get_metric('haversine')
        dist_matrix = dist.pairwise(dataframes[df])
        hc = AgglomerativeClustering(n_clusters=n_clusters,
                                     affinity='precomputed',
                                     linkage='single')
        hc.fit(dist_matrix)
        model_labels[df] = hc.labels_

    # Find the centroid of each cluster for the three sets of agglomerative cluster labels and
    # store them in a dictionary
    centroids = {}

    # For each agglomerative cluster label objects
    for model in model_labels:
        centroid = []
        # Run through each cluster
        for cluster in range(n_clusters):
            coordinates = []
            # Take all the coordinates of the cluster, compute the mean (centroid) coordinate and store it
            latitude = dataframes[model][model_labels[model] == cluster, 0]
            longitude = dataframes[model][model_labels[model] == cluster, 1]
            coordinates.append(np.sum(latitude) / len(latitude))
            coordinates.append(np.sum(longitude) / len(longitude))
            centroid.append(coordinates)

        centroids[model] = centroid

    # Get the centroid list of each track and store it in a separate list
    stops = []
    left = []
    right = []
    for model in centroids:
        if 'stops' in model:
            stops = centroids[model]
        if 'left' in model:
            left = centroids[model]
        if 'right' in model:
            right = centroids[model]

    # If a stop and a left turn are within 100 meters of each other, remove that stop coordinate
    # from the list. The haversine function is used to find the distance between two coordinates.
    # Iterate over a copy so that removing items does not skip elements of the original list.
    for stop_coordinate in list(stops):
        for left_coordinate in left:
            distance = haversine(stop_coordinate, left_coordinate)
            if distance < 100:
                stops.remove(stop_coordinate)
                break

    # If a stop and a right turn are within 100 meters of each other, remove that stop coordinate
    # from the list. The haversine function is used to find the distance between two coordinates.
    # Again iterate over a copy to avoid skipping elements while removing.
    for stop_coordinate in list(stops):
        for right_coordinate in right:
            distance = haversine(stop_coordinate, right_coordinate)
            if distance < 100:
                stops.remove(stop_coordinate)
                break

    # Updating the new stop list into the centroids[stops] dictionary value
    for model in centroids:
        if 'stops' in model:
            centroids[model] = stops

    # Creating the object for KML
    kml = simplekml.Kml(open=1)

    # For each GPS track, plot the coordinates into the KML with the designated labels and save the
    # KML as Output.kml
    for centroid in centroids:
        for coordinates in centroids[centroid]:
            kmlPoint(kml, centroid, coordinates)
    kml.save('Output.kml')
    def create_data_model(self):
        """Stores the data for the problem."""
        data = {}
        # Locations in block units
        data['locations'] = [
            (288, 149), (288, 129), (270, 133), (256, 141), (256, 157), (246, 157),
            (236, 169), (228, 169), (228, 161), (220, 169), (212, 169), (204, 169),
            (196, 169), (188, 169), (196, 161), (188, 145), (172, 145), (164, 145),
            (156, 145), (148, 145), (140, 145), (148, 169), (164, 169), (172, 169),
            (156, 169), (140, 169), (132, 169), (124, 169), (116, 161), (104, 153),
            (104, 161), (104, 169), (90, 165), (80, 157), (64, 157), (64, 165),
            (56, 169), (56, 161), (56, 153), (56, 145), (56, 137), (56, 129),
            (56, 121), (40, 121), (40, 129), (40, 137), (40, 145), (40, 153),
            (40, 161), (40, 169), (32, 169), (32, 161), (32, 153), (32, 145),
            (32, 137), (32, 129), (32, 121), (32, 113), (40, 113), (56, 113),
            (56, 105), (48, 99), (40, 99), (32, 97), (32, 89), (24, 89),
            (16, 97), (16, 109), (8, 109), (8, 97), (8, 89), (8, 81),
            (8, 73), (8, 65), (8, 57), (16, 57), (8, 49), (8, 41),
            (24, 45), (32, 41), (32, 49), (32, 57), (32, 65), (32, 73),
            (32, 81), (40, 83), (40, 73), (40, 63), (40, 51), (44, 43),
            (44, 35), (44, 27), (32, 25), (24, 25), (16, 25), (16, 17),
            (24, 17), (32, 17), (44, 11), (56, 9), (56, 17), (56, 25),
            (56, 33), (56, 41), (64, 41), (72, 41), (72, 49), (56, 49),
            (48, 51), (56, 57), (56, 65), (48, 63), (48, 73), (56, 73),
            (56, 81), (48, 83), (56, 89), (56, 97), (104, 97), (104, 105),
            (104, 113), (104, 121), (104, 129), (104, 137), (104, 145), (116, 145),
            (124, 145), (132, 145), (132, 137), (140, 137), (148, 137), (156, 137),
            (164, 137), (172, 125), (172, 117), (172, 109), (172, 101), (172, 93),
            (172, 85), (180, 85), (180, 77), (180, 69), (180, 61), (180, 53),
            (172, 53), (172, 61), (172, 69), (172, 77), (164, 81), (148, 85),
            (124, 85), (124, 93), (124, 109), (124, 125), (124, 117), (124, 101),
            (104, 89), (104, 81), (104, 73), (104, 65), (104, 49), (104, 41),
            (104, 33), (104, 25), (104, 17), (92, 9), (80, 9), (72, 9),
            (64, 21), (72, 25), (80, 25), (80, 25), (80, 41), (88, 49),
            (104, 57), (124, 69), (124, 77), (132, 81), (140, 65), (132, 61),
            (124, 61), (124, 53), (124, 45), (124, 37), (124, 29), (132, 21),
            (124, 21), (120, 9), (128, 9), (136, 9), (148, 9), (162, 9),
            (156, 25), (172, 21), (180, 21), (180, 29), (172, 29), (172, 37),
            (172, 45), (180, 45), (180, 37), (188, 41), (196, 49), (204, 57),
            (212, 65), (220, 73), (228, 69), (228, 77), (236, 77), (236, 69),
            (236, 61), (228, 61), (228, 53), (236, 53), (236, 45), (228, 45),
            (228, 37), (236, 37), (236, 29), (228, 29), (228, 21), (236, 21),
            (252, 21), (260, 29), (260, 37), (260, 45), (260, 53), (260, 61),
            (260, 69), (260, 77), (276, 77), (276, 69), (276, 61), (276, 53),
            (284, 53), (284, 61), (284, 69), (284, 77), (284, 85), (284, 93),
            (284, 101), (288, 109), (280, 109), (276, 101), (276, 93), (276, 85),
            (268, 97), (260, 109), (252, 101), (260, 93), (260, 85), (236, 85),
            (228, 85), (228, 93), (236, 93), (236, 101), (228, 101), (228, 109),
            (228, 117), (228, 125), (220, 125), (212, 117), (204, 109), (196, 101),
            (188, 93), (180, 93), (180, 101), (180, 109), (180, 117), (180, 125),
            (196, 145), (204, 145), (212, 145), (220, 145), (228, 145), (236, 145),
            (246, 141), (252, 125), (260, 129), (280, 133)
        ]  # yapf: disable

        data['data'] = list(map(lambda x: list(x), data['locations']))
        data['num_vehicles'] = 1
        data['depot'] = 0

        euclidean = DistanceMetric.get_metric('euclidean')
        data['line'] = euclidean.pairwise(data['locations'])
        self.data = {f"circuit_board{len(data['line'])}": data}
Example #33
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.neighbors import DistanceMetric
from sklearn.metrics import accuracy_score

dist = DistanceMetric.get_metric('manhattan')

X = np.asarray([[1, 4, 2], [5, 4, 8], [2, 6, 5], [1, 1, 1], [2, 9, 6]])
y = np.asarray([2, 3, 3, 1, 2])

neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X, y)

X_test = np.asarray([[5, 3, 8]])
print(neigh.predict(X_test))

print('-----------------')

from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=1, metric='l2')
X = [[1, 4, 1], [5, 4, 8], [2, 6, 5], [1, 1, 1], [2, 9, 6]]
Y = [1, 1, 1, 0, 0]
to_pred = [[1, 22, 1]]
model.fit(X, Y)
pred = model.predict(to_pred)
print(pred)

print(' ---------------- ')
print('train data and test data')
# to be modified above!!
prop = np.mean(y_test == 1.0)

# Compute the appropriate threshold
threshold = np.quantile(scores, prop)

# Print the confusion matrix for the thresholded scores
print(confusion_matrix(y_test, scores > threshold))

## Find the neighbor
# It is clear that the local outlier factor algorithm depends a lot on the idea of a nearest neighbor, which in turn depends on the choice of distance metric. So you decide to experiment some more with the hepatitis dataset introduced in the previous lesson. You are given three examples stored in features, whose classes are stored in labels. You will identify the nearest neighbor to the first example (row with index 0) using three different distance metrics, Euclidean, Hamming and Chebyshev, and on that basis choose which distance metric to use. You will import the necessary module as part of the exercise, but pandas and numpy are already available, as are the examples (features) and their labels (labels).

# Import DistanceMetric as dm
from sklearn.neighbors import DistanceMetric as dm

# Find the Euclidean distance between all pairs
dist_eucl = dm.get_metric('euclidean').pairwise(features)

# Find the Hamming distance between all pairs
dist_hamm = dm.get_metric('hamming').pairwise(features)

# Find the Chebyshev distance between all pairs
dist_cheb = dm.get_metric('chebyshev').pairwise(features)
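
# One hedged way to read the nearest neighbor of row 0 off each matrix above (numpy as np and the
# features array from the exercise are assumed; the zero self-distance is masked out first)
for name, D in [('euclidean', dist_eucl), ('hamming', dist_hamm), ('chebyshev', dist_cheb)]:
    row = D[0].copy()
    row[0] = np.inf                     # ignore the distance of row 0 to itself
    print(name, 'nearest neighbor of row 0:', np.argmin(row))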

## Not all metrics agree
# In the previous exercise you saw that not all metrics agree when it comes to identifying nearest neighbors. But does this mean they might disagree on outliers, too? You decide to put this to the test. You use the same data as before, but this time feed it into a local outlier factor outlier detector. The module LocalOutlierFactor has been made available to you as lof, and the data is available as features.

# Instructions
# Detect outliers in features using the euclidean metric.
# Detect outliers in features using the hamming metric.
# Detect outliers in features using the jaccard metric.
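
# A hedged sketch of what this exercise describes, assuming the features array is available;
# LocalOutlierFactor is imported as lof (as the exercise text says), and the metric name is passed
# straight through to the underlying neighbor search.
from sklearn.neighbors import LocalOutlierFactor as lof

for metric in ('euclidean', 'hamming', 'jaccard'):
    detector = lof(metric=metric)                 # n_neighbors may need lowering for very small feature sets
    labels_pred = detector.fit_predict(features)  # -1 marks an outlier, +1 an inlier
    print(metric, labels_pred)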
def check_pickle(metric, kwargs):
    dm = DistanceMetric.get_metric(metric, **kwargs)
    D1 = dm.pairwise(X1)
    dm2 = pickle.loads(pickle.dumps(dm))
    D2 = dm2.pairwise(X1)
    assert_array_almost_equal(D1, D2)
Example #36
 def _calculate_batch_avg_distance_school(self, points):
     earth_radius = 6371.0088
     dist = DistanceMetric.get_metric('haversine')
     distances = dist.pairwise(np.radians(points))
     indexes = np.tril_indices(n=distances.shape[0], k=-1, m=distances.shape[1])
     return earth_radius * np.mean(distances[indexes])
Example #37
def seuclidean(x, y, V):
    result = DistanceMetric.get_metric('seuclidean', V=V)
    return result.pairwise(x, y)
Example #38
def brute_force_neighbors(X, Y, k, metric, **kwargs):
    X, Y = check_array(X), check_array(Y)
    D = DistanceMetric.get_metric(metric, **kwargs).pairwise(Y, X)
    ind = np.argsort(D, axis=1)[:, :k]
    dist = D[np.arange(Y.shape[0])[:, None], ind]
    return dist, ind
 def _distancemetric_factory(X):
     return DistanceMetric.get_metric(name)
Example #40
def chebyshev(x, y):
    result = DistanceMetric.get_metric('chebyshev')
    return result.pairwise(x, y)
def sklearn_haversine(y, x):
    haversine = DistanceMetric.get_metric('haversine')
    latlon = np.hstack((y[:, np.newaxis], x[:, np.newaxis]))
    dists = haversine.pairwise(latlon)
    return 6371*dists #6371 is for distances in kms
Example #42
def compute_diameter(raw_points):
    points = numpy.radians([[p[0], p[1]] for p in raw_points])
    d = DistanceMetric.get_metric('haversine')
    dists = d.pairwise(points).flatten()
    return dists[numpy.argmax(dists)] * 6372795
Example #43
 def __init__(self):
     self.batch_size = 10
     self.model = ResNet50(weights='imagenet',
                           include_top=False, pooling='avg')
     self.dist = DistanceMetric.get_metric('euclidean')
     self.min_max_scaler = MinMaxScaler()
Example #44
def dist(X_1, X_2, param='euclidean'):
    dist = DistanceMetric.get_metric(param)
    X = [X_1, X_2]
    return dist.pairwise(X)[0, 1]
Example #45
def manhattan(x, y):
    result = DistanceMetric.get_metric('manhattan')
    return result.pairwise(x, y)
Example #46
    def fit(self, X, y):
        # check that X and y have the correct shape
        X, y = check_X_y(X, y)
        # store the unique classes of the problem
        self.classes_ = np.unique(y)
        # remember X and y
        self.X_, self.y_ = X, y
        # prepare the tool for computing distances
        self.dm_ = DistanceMetric.get_metric(self.metric)

        # container for the class centroids
        self.centroids_ = []
        # plt.scatter(self.X_[:, 0], self.X_[:, 1], c=y, cmap='bwr')
        # plt.tight_layout()
        # plt.savefig("trzy")
        # for each class
        for cl in self.classes_:
            # select only the instances belonging to this class
            X_class = self.X_[self.y_ == cl]

            # loop
            while True:
                # compute the class centroid
                class_centroid = np.mean(X_class, axis=0)
                # if we are not optimizing, we stop here
                if self.optimize == False:
                    break
                # compute the standard deviation of the class instances
                std = np.std(X_class, axis=0)

                # the farthest instance we still accept (the borderline point)
                self.borderline_ = class_centroid + (self.sigma * std)

                # maximum allowed distance
                accepted_distances = np.squeeze(
                    self.dm_.pairwise(
                        class_centroid.reshape(1, X_class.shape[1]),
                        self.borderline_.reshape(1, X_class.shape[1])))

                # compute the distances of all class objects from the centroid
                distances = np.squeeze(
                    self.dm_.pairwise(
                        class_centroid.reshape(1, X_class.shape[1]), X_class))

                # plt.scatter(class_centroid[0], class_centroid[1], c='black', s=260)
                # plt.savefig("trzy")

                # treat as outliers those instances that lie farther from
                # the centroid than sigma * std
                self.outliers_mask_ = np.array(distances > accepted_distances)
                # end the optimization if there are no outliers
                if np.sum(self.outliers_mask_) == 0:
                    break
                # otherwise, get rid of the outliers
                else:
                    # plt.scatter(X_class[self.outliers_mask_, 0], X_class[self.outliers_mask_, 1], c='gray', s=100)
                    # plt.savefig("trzy")
                    X_class = X_class[self.outliers_mask_ == False]

            # add the computed centroid to the list
            self.centroids_.append(class_centroid)
        # return the classifier
        return self
Example #47
def haversine_distance(p1, p2):
    d = DistanceMetric.get_metric('haversine')
    X = [p1, p2]
    return d.pairwise(X)[0][1]
Example #48
def wminkowski(x, y, p, w):
    result = DistanceMetric.get_metric('wminkowski', p=p, w=w)
    return result.pairwise(x, y)
Example #49
    'Is this the first document?',
]

X = vectorizer.fit_transform(corpus)

print vectorizer.get_feature_names()
print X.toarray()
print "first index:", vectorizer.vocabulary_.get("first")

transformer = TfidfTransformer(smooth_idf=False)
tfidf = transformer.fit_transform(X.toarray())

print tfidf.toarray()
print "idf vector:", transformer.idf_

transformer2 = TfidfVectorizer(smooth_idf=False)
tfidf2 = transformer2.fit_transform(corpus)
print "\n", tfidf2.toarray()

mink_metric = DistanceMetric.get_metric("minkowski")
eucl_metric = DistanceMetric.get_metric("euclidean")

X = [[0, 1, 2], [3, 4, 5]]
eucl_pairs = eucl_metric.pairwise(X)

filename = "eucl_pairs"
print "before\n" + str(eucl_pairs)
save(filename, eucl_pairs)
print "after"
loaded_eucl_pairs = load(filename + ".npy")
print loaded_eucl_pairs
Example #50
import collections
import graphlab as gl
import pandas as pd
import json
from pymongo import MongoClient

from sklearn.neighbors import DistanceMetric
dist = DistanceMetric.get_metric('haversine')


class recommender(object):
    def __init__(self, df_tip, df_biz):
        self.tip = df_tip
        self.biz = df_biz
        self.loc = None

    def build(self, community, city):
        ### Get information of community
        tip_community = self.tip.loc[self.tip['user_id'].isin(community)]

        if city is None:
            # Get a list of business.
            businesses = list(set(tip_community['business_id']))

            # Get information of businesses for biz.
            biz_community = self.biz.loc[self.biz['business_id'].isin(businesses)]

            # Calculate distances from loc to business_id.
            biz_community.loc[:,'dist'] = biz_community[['longitude','latitude']].apply(lambda x: dist.pairwise(self.loc,x[np.newaxis,:])[0][0], axis = 1)

            # Get a list of relevant businesses within a diameter of 10 units of distance.
Example #51
## KNN PREDICTOR ##

# do some lambda magic on text columns

traindata = list(train.apply(lambda x:'%s %s %s' % (x['query'],x['product_title'], x['product_description']),axis=1))
testdata = list(test.apply(lambda x:'%s %s %s' % (x['query'],x['product_title'], x['product_description']),axis=1))

# Fit TFIDF

tfv.fit(traindata)
X = tfv.transform(traindata)
X_test = tfv.transform(testdata)

clf = pipeline.Pipeline([('tSVD',tSVD),('scl',scl),('knn',knn)])
param_grid = {'knn__n_neighbors':[2],'knn__metric':[DistanceMetric.get_metric('manhattan')],'tSVD__n_components':[400]}

model = grid_search.GridSearchCV(estimator = clf, param_grid = param_grid, scoring = kappa_scorer, refit = True, cv = 2, n_jobs = -1)

# Fit Model

model.fit(X, y)
model.best_estimator_.fit(X,y)
trainPred = model.best_estimator_.predict(X_test)

# Averaging predicted relevance values

finalPred = [int(floor((int(stemPred[i])+trainPred[i])*0.5)) for i in range(len(stemPred))]

#print "Kappa Score for Training Data\nStemming+KNN\nScore=%f" %(quadratic_weighted_kappa(y, finalPred))
Example #52
def minkowski(x, y, p):
    result = DistanceMetric.get_metric('minkowski', p=p)
    return result.pairwise(x, y)
def check_cdist_bool(metric, D_true):
    dm = DistanceMetric.get_metric(metric)
    D12 = dm.pairwise(X1_bool, X2_bool)
    assert_array_almost_equal(D12, D_true)
Example #54
def main(args):
    if args.predict is None:
        # We are training a model.
        np.random.seed(args.seed)
        train = Dataset()

        train_data, train_target = makeVectors(train.data, train.target)

        model = Pipeline(steps=[
            #('trans', OneHotEncoder()),
            ('mlp', KNeighborsClassifier(n_neighbors=1, p=2, n_jobs=-1))
        ])
        model.fit(train_data, train_target)
        '''

        print(neigh.predict(np.array([vectorize('reci', 20)])))
        print(neigh.predict_proba(np.array([vectorize('reci', 20)])))


        scaler = MinMaxScaler()
        scaledTrain = scaler.fit_transform(trainVectors[1])
        scaledTest = scaler.transform(testVector)
        mlp = MLPClassifier(activation='relu', hidden_layer_sizes=(200,), max_iter=300)
        mlp.fit(scaledTrain, targets[1])
        print(mlp.classes_)
        probs = mlp.predict_proba(scaledTest)
        weights = np.array([1.469, 3.131])
        for prob in probs:
            print(prob)
            print(np.argmax(prob * weights))


        

        testVector = np.array([vectorize(5, 'naramek', 40), vectorize(1, 'sedmnact', 40), vectorize(6, 'vypravel', 40)]);
        print(testVector)
        
        scaler = MinMaxScaler()
        scaledTrain = scaler.fit_transform(trainVectors[3])
        scaledTest = scaler.transform(testVector)
        
        mlp = MLPClassifier(activation='tanh', hidden_layer_sizes=(150), max_iter=300)
        mlp.fit(scaledTrain, targets[3])
        
        print(mlp.predict_proba(scaledTest))
        print(mlp.predict(scaledTest))
        '''

        # TODO: Train a model on the given dataset and store it in `model`.
        model = model
        # Serialize the model.
        with lzma.open(args.model_path, "wb") as model_file:
            pickle.dump(model, model_file)

    else:
        # Use the model and return test set predictions, as either a Python list or a NumPy array.
        test = Dataset(args.predict)

        with lzma.open(args.model_path, "rb") as model_file:
            model = pickle.load(model_file)

        #ignored = [' ', '\n', '-', ':', ',', '.', '?', '!', '"']
        ignored = [' ', '\n']
        i = 0
        predictions = ''

        defaultWords = []
        words = []
        wordsLong = []
        wasCaps = []
        wasUpper = []
        testLower = test.data.lower()
        prevWord = ''
        while i < len(testLower):
            if testLower[i] in ignored:
                predictions += testLower[i]
                i += 1
            else:
                startIndex = i
                endIndex = i + 1
                while testLower[endIndex] not in ignored:
                    endIndex += 1

                if endIndex == startIndex + 1:
                    i = endIndex
                    predictions += test.data[startIndex:endIndex]
                    continue

                word = testLower[startIndex:endIndex]
                defaultWords.append(word)
                words.append(vectorize(word, 41))
                if (prevWord != ''):
                    wordsLong.append(vectorize(prevWord + ' ' + word, 41))
                else:
                    wordsLong.append(vectorize(word, 41))

                capsC = test.data[startIndex]

                if capsC == word[0]:
                    wasCaps.append(0)
                    wasUpper.append(0)
                else:
                    wasCaps.append(1)
                    if len(word) > 1 and startIndex + 1 < len(
                            test.data) and test.data[startIndex +
                                                     1] != word[1]:
                        wasUpper.append(1)
                    else:
                        wasUpper.append(0)
                i = endIndex

                prevWord = word

        predictions = ''
        dist1, newWords = model['mlp'].kneighbors(np.array(words), 1)
        dist2, newWordsLong = model['mlp'].kneighbors(np.array(wordsLong), 1)

        wordIndex = 0
        distt = DistanceMetric.get_metric('hamming')
        i = 0
        while i < len(testLower):
            if testLower[i] in ignored:
                predictions += testLower[i]
                i += 1
            else:
                startIndex = i
                endIndex = i + 1
                while testLower[endIndex] not in ignored:
                    endIndex += 1

                if endIndex == startIndex + 1:
                    i = endIndex
                    predictions += test.data[startIndex:endIndex]
                    continue

                newWord = model['mlp'].classes_[model['mlp']._y[
                    newWords[wordIndex]]][0]
                newWordLong = model['mlp'].classes_[model['mlp']._y[
                    newWordsLong[wordIndex]]][0]

                dist = dist1[wordIndex][0]
                distd = dist2[wordIndex][0]

                if (newWord != newWordLong and distd == 0):
                    #diff = mymetric(vectorize(newWord, 20), vectorize(newWordLong, 20))
                    #if diff < 2:
                    #print(newWord, 'za', newWordLong)
                    newWord = newWordLong
                    dist = dist2[wordIndex][0]
                '''
                else: 
                    if dist > dist2[wordIndex][0]/1000:

                        dist = dist2[wordIndex][0]
                        newWord = newWordLong

                if defaultWords[wordIndex] == 'odpovedel':
                    print(defaultWords[wordIndex], newWord, newWordLong)
                    print(dist, distd)
                '''

                if dist > 0.1:
                    newWord = defaultWords[wordIndex]

                if wasCaps[wordIndex]:
                    newWord = newWord.capitalize()

                if wasUpper[wordIndex]:
                    newWord = newWord.upper()
                predictions += newWord
                wordIndex += 1
                i = endIndex

        i = 0
        count = 0
        prevWord = ''
        while i < len(predictions):
            if predictions[i] in ignored:
                i += 1
            else:
                startIndex = i
                endIndex = i + 1
                while predictions[endIndex] not in ignored:
                    endIndex += 1

                word = predictions[startIndex:endIndex]

                if word == 'že' and prevWord != 'ale' and prevWord != 'Ale' and predictions[
                        startIndex - 2] != ',' and predictions[
                            startIndex - 2] != 'a' and predictions[startIndex -
                                                                   3] != ' ':
                    predictions = predictions[:startIndex] + 'ze' + predictions[
                        endIndex:]

                if word == 'ze' and prevWord == 'ale':
                    predictions = predictions[:startIndex] + 'že' + predictions[
                        endIndex:]

                if word == 'Že':
                    predictions = predictions[:startIndex] + 'Ze' + predictions[
                        endIndex:]

                if word == 'ně' and predictions[startIndex - 2] == ',':
                    predictions = predictions[:startIndex] + 'ne' + predictions[
                        endIndex:]

                if word == 'mne' and (prevWord == 'ke' or prevWord == 'o'):
                    predictions = predictions[:
                                              startIndex] + 'mně' + predictions[
                                                  endIndex:]
                if word == 'té' and (prevWord.lower() == 'prosím'
                                     or prevWord.lower() == 'jsme'
                                     or prevWord.lower() == 'aby'
                                     or prevWord.lower() == 'abych'
                                     or prevWord.lower() == 'co'
                                     or prevWord.lower() == 'kdo'
                                     or prevWord.lower() == 'který'):
                    predictions = predictions[:startIndex] + 'tě' + predictions[
                        endIndex:]

                if word == 'Té' and (prevWord.lower() == 'prosím'):
                    predictions = predictions[:startIndex] + 'Tě' + predictions[
                        endIndex:]
                i = endIndex

                prevWord = word

        f = open("pred.txt", "w", encoding='utf8')
        f.write(predictions)
        f.close()

        return predictions
Example #55
def mean_distance_to_closest(predicted, event):
    angle = DistanceMetric.get_metric("pyfunc", func=spherical_angle)
    nn = BallTree(event.tracks, leaf_size=5, metric=angle)

    return np.sum([nn.query(predicted[i, :], k=1) for i in xrange(predicted.shape[0])]) / event.tracks.shape[0]
def check_pdist(metric, kwargs, D_true):
    dm = DistanceMetric.get_metric(metric, **kwargs)
    D12 = dm.pairwise(X1)
    assert_array_almost_equal(D12, D_true)
Example #57
plt.ylabel('Euclidean distance')
plt.show()
df

from scipy.spatial import distance
import numpy as np
distance.euclidean([1, 0, 0], [0, 1, 0])
distance.euclidean([20, 25], [25, 22])  #closest : S1 with S2
np.sqrt(((20 - 25)**2 + (25 - 22)**2))  #sqrt(sum((x - y)^2))

distance.euclidean([20, 25], [35, 40])
distance.euclidean([20, 25], [40, 35])
distance.euclidean([35, 40], [40, 35])

from sklearn.neighbors import DistanceMetric
dist = DistanceMetric.get_metric('euclidean')
dist
df.to_numpy()
dist.pairwise(df.to_numpy())

#iris dataset
from sklearn.datasets import load_iris
from sklearn.cluster import AgglomerativeClustering
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage

#Getting the data ready
from pydataset import data
iris = data('iris')
df2 = iris.copy()
Example #58
def mahalanobis(x, y, V):
    result = DistanceMetric.get_metric('mahalanobis', V=V)
    return result.pairwise(x, y)
  [0,0,1,1,0,1,0,1,1,0,1,1,0,1,1,0,0,1,1,1,1,1],
  [1,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0],
  [1,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0],
  [1,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0],
  [1,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0],
  [0,0,1,0,0,1,0,1,0,1,1,0,0,0,0,0,0,1,1,1,1,0],
  [1,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
  ]

AttributeClassifier = joblib.load('Dumps/AttributeClassifierKnowledgeTransfer.pkl')

features = scipy.io.loadmat("./UIUC1/UIUC1_win_feature.mat")
labels = scipy.io.loadmat("./UIUC1/UIUC1_labels.mat")
action_actor = open("./UIUC1/action_actor.txt")

dist = DistanceMetric.get_metric('euclidean')

mapping = [{}]
for line in action_actor:
    line = line.split()
    actionvector = numpy.zeros(14, dtype=numpy.int)
    actionvector[int(line[0])]=1
    mapping.append({'action':int(line[0]),'actionvector':actionvector, 'actor':int(line[1])})

total = len(labels['vlabels'][0])
ConfusionMatrix=numpy.array([[0,0],[0,0]])
NovelClassList=[[0,1],[2,3],[4,5],[6,7],[8,9],[10,11],[12,13]]
for NovelClass in NovelClassList:

    ConfusionMatrix2=numpy.array([[0,0],[0,0]])
def test_pickle_bool_metrics(metric):
    dm = DistanceMetric.get_metric(metric)
    D1 = dm.pairwise(X1_bool)
    dm2 = pickle.loads(pickle.dumps(dm))
    D2 = dm2.pairwise(X1_bool)
    assert_array_almost_equal(D1, D2)