def __calc_distances__(self, v1s, v2s, is_sparse=True):
    """Compute pairwise distance features between two aligned vector collections.

    Args:
        v1s, v2s: sequences of vectors, aligned element-wise. When
            ``is_sparse`` is True each element is a scipy sparse matrix
            (densified via ``.toarray()``); otherwise dense 1-D arrays.
        is_sparse: whether the inputs are scipy sparse vectors.

    Returns:
        np.ndarray of shape (len(v1s), 8). Columns: cosine, cityblock,
        canberra, euclidean, minkowski(p=3), braycurtis distances, then the
        absolute skew difference and absolute kurtosis difference of each pair.
    """
    # Densify once so the sparse and dense paths share one code path
    # (the original duplicated every computation across both branches).
    if is_sparse:
        d1 = [x.toarray().ravel() for x in v1s]
        d2 = [y.toarray().ravel() for y in v2s]
    else:
        d1 = list(v1s)
        d2 = list(v2s)

    def _col(fn):
        # One (n, 1) feature column from a pairwise distance function.
        return np.array([fn(x, y) for x, y in zip(d1, d2)]).reshape((-1, 1))

    dcosine = _col(cosine)
    dcityblock = _col(cityblock)
    dcanberra = _col(canberra)
    deuclidean = _col(euclidean)
    dminkowski = _col(lambda x, y: minkowski(x, y, 3))
    dbraycurtis = _col(braycurtis)

    # Distribution-shape features: |skew diff| and |kurtosis diff| per pair.
    dskew_diff = np.abs(np.array([skew(x) for x in d1]) -
                        np.array([skew(y) for y in d2])).reshape((-1, 1))
    dkur_diff = np.abs(np.array([kurtosis(x) for x in d1]) -
                       np.array([kurtosis(y) for y in d2])).reshape((-1, 1))

    return np.hstack((dcosine, dcityblock, dcanberra, deuclidean,
                      dminkowski, dbraycurtis, dskew_diff, dkur_diff))
# Example #2
# 0
def subsample_points_low(pointcloud1, pointcloud2, num_subsampled_points,
                         rotation_ab, translation_ab):
    """Subsample two point clouds around anchor points, then rigidly
    transform the second cloud.

    Args:
        pointcloud1, pointcloud2: arrays of shape (3, num_points).
        num_subsampled_points: number of points to keep from each cloud.
        rotation_ab: object with an ``apply`` method — presumably
            scipy.spatial.transform.Rotation; TODO confirm.
        translation_ab: translation vector added to the rotated cloud 2.

    Returns:
        Tuple of two (3, num_subsampled_points) clouds; the second one has
        been rotated by ``rotation_ab`` and shifted by ``translation_ab``.
    """
    # (num_points, 3)
    pointcloud1 = pointcloud1.T
    pointcloud2 = pointcloud2.T
    num_points = pointcloud1.shape[0]
    # NOTE: scipy minkowski defaults to p=2, so the lambda is plain Euclidean
    # distance wrapped in a (slow) Python callable.
    nbrs1 = NearestNeighbors(
        n_neighbors=num_subsampled_points,
        algorithm='auto',
        metric=lambda x, y: minkowski(x, y)).fit(pointcloud1)
    nbrs2 = NearestNeighbors(
        n_neighbors=num_subsampled_points,
        algorithm='auto',
        metric=lambda x, y: minkowski(x, y)).fit(pointcloud2)
    # Anchor 1: a random point of cloud 1.
    random_idx1 = np.random.choice(num_points)
    random_p1 = pointcloud1[random_idx1, :]
    # Anchor 2: index of the point of cloud 2 farthest (squared Euclidean)
    # from anchor 1 ...
    distance = np.sum((pointcloud2 - random_p1)**2, axis=-1)
    random_idx2 = np.argmax(distance)
    # NOTE(review): ... but the anchor itself is taken from pointcloud1 even
    # though the distances were measured against pointcloud2 — possibly meant
    # to be pointcloud2[random_idx2, :]; confirm against callers.
    random_p2 = pointcloud1[random_idx2, :]
    idx1 = nbrs1.kneighbors(random_p1.reshape(1, -1),
                            return_distance=False).reshape(
                                (num_subsampled_points, ))
    idx2 = nbrs2.kneighbors(random_p2.reshape(1, -1),
                            return_distance=False).reshape(
                                (num_subsampled_points, ))
    pointcloud1 = pointcloud1[idx1, :]
    pointcloud2 = pointcloud2[idx2, :]
    # Apply the rigid transform to the subsampled second cloud.
    pointcloud2 = rotation_ab.apply(pointcloud2).T + np.expand_dims(
        translation_ab, axis=1)
    return pointcloud1.T, pointcloud2
# Example #3
# 0
def farthest_subsample_points(pointcloud1,
                              pointcloud2,
                              num_subsampled_points=768):
    """Crop two (3, N) point clouds to partially-overlapping subsets.

    Each cloud is reduced to the ``num_subsampled_points`` points nearest to
    a shared random anchor placed far outside the data, and returned with
    shape (3, num_subsampled_points).
    """
    cloud_a = pointcloud1.T  # (N, 3)
    cloud_b = pointcloud2.T
    knn_a = NearestNeighbors(
        n_neighbors=num_subsampled_points,
        algorithm='auto',
        metric=lambda u, v: minkowski(u, v)).fit(cloud_a)
    # Anchor far outside the data so the kept neighbourhood forms one "side".
    anchor = np.random.random(size=(
        1, 3)) + np.array([[500, 500, 500]]) * np.random.choice([1, -1, 1, -1])
    keep_a = knn_a.kneighbors(anchor, return_distance=False).reshape(
        (num_subsampled_points, ))
    knn_b = NearestNeighbors(
        n_neighbors=num_subsampled_points,
        algorithm='auto',
        metric=lambda u, v: minkowski(u, v)).fit(cloud_b)
    # Same anchor for both clouds.  (Translated from the original comment:)
    # this way of removing incomplete parts may not work very well — the two
    # crops can still overlap a lot.
    keep_b = knn_b.kneighbors(anchor, return_distance=False).reshape(
        (num_subsampled_points, ))
    return cloud_a[keep_a, :].T, cloud_b[keep_b, :].T
# Example #4
# 0
def get_nearest_neighbour(x, vectors, p = 2):
    """Find the row of ``vectors`` closest to ``x`` under the Minkowski metric.

    Args:
        x: 1xn vector.
        vectors: qxn matrix of nearest-neighbour candidates.
        p: Minkowski order (2 = Euclidean, the default).

    Returns:
        (index, distance) of the closest candidate; ties keep the earliest
        index.

    Raises:
        InvalidParameterException: if x's dimension does not match the
        candidates' dimension.
    """
    if x.shape[0] != vectors.shape[1]:
        raise exc.InvalidParameterException("[Exception]: vector parameter x must have the same 'n'"\
            "dimensions as in the qxn <vectors> matrix.")

    best_index = 0
    best_distance = minkowski(x, vectors[0], p)
    for idx, candidate_vector in enumerate(vectors):
        candidate_distance = minkowski(x, candidate_vector, p)
        # Strict comparison keeps the first occurrence on ties.
        if candidate_distance < best_distance:
            best_distance = candidate_distance
            best_index = idx
    return (best_index, best_distance)
# Example #5
# 0
def covariance_and_correlation(arr):
    """Grand mean of pairwise Minkowski (p=1 and p=2) distances between rows.

    ``arr`` is reshaped into rows of 7 values; for every ordered row pair the
    L1 and L2 distances are computed, averaged per reference row, and the
    mean over all reference rows is returned as a scalar.
    """
    rows = np.reshape(arr, (-1, 7))
    per_row_means = []
    for row_a in rows:
        pair_features = [
            [float(distance.minkowski(row_a, row_b, 1)),
             float(distance.minkowski(row_a, row_b, 2))]
            for row_b in rows
        ]
        # np.mean flattens the [L1, L2] pairs into one scalar per row_a.
        per_row_means.append(np.mean(pair_features))
    return np.mean(per_row_means, axis=0)
# Example #6
# 0
def kmeans(k, max_itr, x, init_method, distance_measure, power=3):
    """Run Lloyd's k-means on the data matrix ``x``.

    Args:
        k: number of clusters.
        max_itr: maximum number of iterations.
        x: data of shape (n_samples, n_features).
        init_method: 'random' (seeded sampling of rows), 'K++' (delegates to
            ``kpp``), anything else uses the first k rows.
        distance_measure: 'Manhattan', 'Euclidean' or 'Minkowski'.
        power: Minkowski order used when distance_measure == 'Minkowski'.

    Returns:
        (cluster, centroid, wcss): per-sample labels of shape (n, 1), the
        final centroids, and [[total_within_cluster_distance, k]].

    Raises:
        ValueError: for an unknown ``distance_measure`` (previously this
        left ``h`` unbound and crashed later with NameError).
    """
    wcss = []
    centroid = []
    index = 0
    # BUG FIX: the original read the module-level ``X`` instead of the
    # ``x`` parameter here and in the initialisation branches below.
    n = x.shape[0]

    # --- centroid initialisation ---
    if (init_method == 'random'):
        random.seed(3)
        for i in range(k):
            # BUG FIX: random.randint(1, n) is inclusive, so it could return
            # n (IndexError) and never picked row 0; sample valid indices.
            centroid.append(x[random.randint(0, n - 1)])
        centroid = np.array(centroid)
    elif (init_method == 'K++'):
        centroid = kpp(k, x)
    else:
        centroid = x[list(range(k)), :]
    print("\nInitialization method :" + init_method)

    # --- map the distance measure onto a Minkowski order ---
    if (distance_measure == 'Manhattan'):
        h = 1
    elif (distance_measure == 'Euclidean'):
        h = 2
    elif (distance_measure == 'Minkowski'):
        h = power
    else:
        raise ValueError("Unknown distance measure: " + str(distance_measure))
    print("Distance measure :" + distance_measure)

    # Per-sample label column, -1 = unassigned.
    cluster = np.full((n, 1), -1)

    print("K=", k)
    # --- Lloyd iterations: assign, then recompute centroids ---
    for i in range(max_itr):
        old_centroid = np.copy(centroid)
        for j in range(n):
            mini = 99999
            for z in range(k):
                tmp = distance.minkowski(x[j], centroid[z], h)
                if tmp < mini:
                    mini = tmp
                    index = z
            cluster[j] = index

        centroid = reinit(k, np.array(cluster), x, centroid)

        # Converged when no centroid moved between iterations.
        if (old_centroid == centroid).all():
            print("\nConverged at iteartion:", i, "\n")
            break

    # --- within-cluster sum of distances (WCSS-like score) ---
    total = 0
    for r in range(n):
        total = total + distance.minkowski(x[r], centroid[cluster[r]], h)
    wcss.append([total, k])
    return cluster, centroid, wcss
# Example #7
# 0
    def _process_input(self,input_np_array):
        """Present one input vector to the growing SOM.

        Moves the weights of every neuron within the current neighbourhood
        radius of the best-matching unit (BMU) toward the input, accumulates
        residual error on those neurons, and — once the BMU's error exceeds
        ``self.thresh`` — grows new neurons into the BMU's empty grid
        neighbours.  Returns the BMU's grid coordinates.
        """
        bmu = self._get_BMU(input_np_array)
        for neu in self.map_neurons.values():
            # Grid-coordinate string used as this neuron's dict key.
            nhash = str(neu.x_c)+""+str(neu.y_c)

            '''weight adjustment if the neuron is in the neighbourhood of the BMU'''
            # Membership test: Euclidean (Minkowski p=2) grid distance to the BMU.
            if minkowski(bmu.coords().astype(float), neu.coords().astype(float), 2) < self.nr:
                neu.weight_vs = neu.weight_vs + self.lr * (input_np_array-neu.weight_vs)
                # Residual error grows with the weight-space distance to the BMU.
                neu.res_err += minkowski(neu.weight_vs, bmu.weight_vs, 2)
                self.map_neurons[nhash]=neu

        '''growth'''

        if bmu.res_err > self.thresh:
            neu = bmu
            # Dict keys of the four grid neighbours (down/up/left/right).
            down=str(neu.x_c)+str(int(neu.y_c)-1)
            up=str(neu.x_c)+str(int(neu.y_c)+1)
            left=str(int(neu.x_c)-1)+str(neu.y_c)
            right=str(int(neu.x_c)+1)+str(neu.y_c)
            nei_coords = np.array([down, up , left , right ] )
            # Numeric (x, y) pairs matching nei_coords order.
            nei_coordi = np.array([[(neu.x_c),(int(neu.y_c)-1)], [(neu.x_c),(int(neu.y_c)+1)], [(int(neu.x_c)-1),(neu.y_c)], [(int(neu.x_c)+1),int(neu.y_c)]] )
            p =0
            for coord in nei_coords:
                n=None
                try:
                    # Existing neighbour: just spread a fraction of its error.
                    n= self.map_neurons[coord]
                    n.res_err+=self.fd*n.res_err

                except KeyError:
                    # Missing neighbour: grow a new neuron at that position.
                    nwron=neuron(nei_coordi[p][0], nei_coordi[p][1], self.dim)
                    new_weight = 0
                #case a) new node has two consecutive nodes on one of its sides
                #tiroshan and lakmal please implement the code here
                #case b) between two old nodes
                    new_weight_b = self._type_b_weight_init(p,neu)
                    new_weight_a = self._type_a_weight_init(p,neu)
                    new_weight_c = self._type_c_weight_init(p,neu)

                    # Prefer type-b, then type-a, then type-c initialisation.
                    # A candidate is rejected when ``.all() == 0``, i.e. it
                    # contains at least one zero entry; if all three are
                    # rejected, fall back to a constant 0.5 vector.
                    if new_weight_b.all() ==0:
                        if new_weight_a.all() == 0:
                            if new_weight_c.all() == 0:
                            #print "c==0"
                                new_weight = np.ndarray(shape=(self.dim))
                                new_weight.fill(0.5)
                            else:
                                new_weight = new_weight_c
                        else:
                            new_weight = new_weight_a
                    else:
                        new_weight = new_weight_b

                    nwron.weight_vs = new_weight
                    n=nwron
                self.map_neurons[coord]=n
                p+=1
            # Reset the BMU's error to half the growth threshold.
            bmu.res_err=self.thresh/2
            self.map_neurons[str(bmu.x_c)+""+str(bmu.y_c)]=bmu
        return bmu.coords()
# Example #8
# 0
def run(proc_id, return_dict, counter, dataset, test_index, indices_train_examples, algorithm, relevant_only):
    """Worker: distances between one test example and a slice of train examples.

    Writes the resulting distance vector under ``return_dict[proc_id]``.

    Args:
        proc_id: key under which this worker stores its results.
        return_dict: shared dict (e.g. a multiprocessing Manager dict).
        counter: object with an ``increment()`` method for progress tracking.
        dataset: project dataset providing x_test / x_train and the
            TSFresh feature matrices.
        test_index: index of the test example being compared.
        indices_train_examples: train indices handled by this worker.
        algorithm: 'dtw', 'dtw_weighting_nbr_features' or 'feature_based'.
        relevant_only: restrict the comparison to relevant features/attributes.

    Raises:
        ValueError: for an unknown ``algorithm``.
    """
    try:

        results = np.zeros(len(indices_train_examples))

        for array_index, example_index in enumerate(indices_train_examples):

            ###
            # Prepare examples
            ###

            if algorithm == 'feature_based':
                # feature based data is 2d-structured (examples,features)
                test_example = dataset.x_test_TSFresh_features[test_index, :]
                train_example = dataset.x_train_TSFresh_features[example_index, :]
            elif relevant_only:
                test_example = dataset.x_test[test_index]
                test_example, train_example = dataset.reduce_to_relevant(test_example, example_index)
            else:
                test_example = dataset.x_test[test_index]
                train_example = dataset.x_train[example_index]

            ##
            # Execute algorithm
            ##
            if algorithm == 'dtw':
                distance, _ = fastdtw(test_example, train_example, dist=euclidean)

            elif algorithm == 'dtw_weighting_nbr_features':
                distance, _ = fastdtw(test_example, train_example, dist=euclidean)
                # Normalise the DTW distance by the number of features.
                distance = distance / test_example.shape[1]

            elif algorithm == 'feature_based':
                if relevant_only:
                    # Weighted Minkowski: weights from the TSFresh mask.
                    masking = dataset.get_ts_fresh_masking(example_index)
                    weights = masking / (np.sum(masking))
                    distance = minkowski(test_example, train_example, 2, weights)
                    # Adjustment based on feature amount (improved performance)
                    small_num_of_attributes_penalty = (1 / (np.sum(masking)))
                    # if small_num_of_attributes_penalty > 1:
                    #    small_num_of_attributes_penalty = 1
                    distance = distance * small_num_of_attributes_penalty
                else:
                    distance = minkowski(test_example, train_example, 2)

            else:
                raise ValueError('Unkown algorithm:', algorithm)

            results[array_index] = distance
            counter.increment()
        return_dict[proc_id] = results

    except KeyboardInterrupt:
        # Workers exit quietly on Ctrl-C; the parent process handles cleanup.
        pass
# Example #9
# 0
def farthest_subsample_points(pointcloud1, pointcloud2, num_subsampled_points=768):
    """Keep the ``num_subsampled_points`` points of each (3, N) cloud nearest
    to a shared random far-away anchor; returns two (3, M) clouds."""
    pts1 = pointcloud1.T  # (N, 3)
    pts2 = pointcloud2.T
    selector1 = NearestNeighbors(n_neighbors=num_subsampled_points, algorithm='auto',
                                 metric=lambda a, b: minkowski(a, b)).fit(pts1)
    # Anchor placed ~500 units away so the kept points form one "side".
    anchor = np.random.random(size=(1, 3)) + np.array([[500, 500, 500]]) * np.random.choice([1, -1, 1, -1])
    sel1 = selector1.kneighbors(anchor, return_distance=False).reshape((num_subsampled_points,))
    selector2 = NearestNeighbors(n_neighbors=num_subsampled_points, algorithm='auto',
                                 metric=lambda a, b: minkowski(a, b)).fit(pts2)
    # The same anchor is reused for the second cloud.
    sel2 = selector2.kneighbors(anchor, return_distance=False).reshape((num_subsampled_points,))
    return pts1[sel1, :].T, pts2[sel2, :].T
def similarity_metric(spectrum_A,spectrum_B,metric,power_value,p=3):
    """Compare two spectra with the chosen similarity metric.

    Args:
        spectrum_A, spectrum_B: array-likes; flattened before comparison.
        metric: 'cosine_sim', 'euclidean', 'minkowski' or 'cross_correlation'.
        power_value: exponent applied to the metric result (where used).
        p: Minkowski order for metric='minkowski'.

    Returns:
        The similarity value.

    Raises:
        NotImplementedError: for an unrecognised ``metric``.
    """
    spectrum_A = np.array(spectrum_A).flatten()
    spectrum_B = np.array(spectrum_B).flatten()

    if metric == 'cosine_sim':
        return cosine_sim_given_spectra(spectrum_A, spectrum_B, power_value)

    if metric == 'euclidean':
        return euclidean_distance(spectrum_A, spectrum_B, power_value)

    if metric == 'minkowski':
        # Normalise each spectrum to unit p-norm (distance to the origin),
        # then return 1 - Minkowski distance between the normed spectra.
        origin = np.zeros(spectrum_A.size)
        norm_A = minkowski(u=spectrum_A, v=origin, p=p, w=None)
        norm_B = minkowski(u=spectrum_B, v=origin, p=p, w=None)
        return 1. - minkowski(u=spectrum_A / norm_A,
                              v=spectrum_B / norm_B, p=p, w=None)

    if metric == 'cross_correlation':
        # NOTE(review): np.correlate defaults to mode='valid', so for
        # equal-length 1-D inputs each call is a single dot product.
        corr_A_B = np.correlate(spectrum_A, spectrum_B)
        corr_A_A = np.correlate(spectrum_A, spectrum_A)
        corr_B_B = np.correlate(spectrum_B, spectrum_B)
        return np.power(corr_A_B / (corr_A_A * corr_B_B), power_value)

    raise NotImplementedError("The chosen metric is not implemented.")
# Example #11
# 0
def distances(a, b, method='euclidean'):
    """Distance between vectors ``a`` and ``b`` using the named method.

    Supported: 'manhattan'/'euclidean'/'l3' (Minkowski p = 1/2/3),
    'bhat' (-log of the sum of sqrt(a*b)), 'intersection'
    (len(a) / sum of element-wise minima), and 'corr' (1 - correlation).
    Any other method returns 0.
    """
    minkowski_orders = {'manhattan': 1, 'euclidean': 2, 'l3': 3}
    if method in minkowski_orders:
        return distance.minkowski(a, b, minkowski_orders[method])
    if method == 'bhat':
        return -math.log(sum(np.sqrt(a * b)))
    if method == 'intersection':
        return len(a) / (sum(np.minimum(a, b)))
    if method == 'corr':
        return 1.0 - np.correlate(a, b)
    return 0
# Example #12
# 0
def feature3(data):
    """Append word2vec-based distance features to a question-pair DataFrame.

    Expects ``data`` to have 'question1'/'question2' columns; embeds each
    question with ``sent2vec`` into 300-d vectors and adds one column per
    distance measure plus skew/kurtosis of each embedding.  Mutates and
    returns ``data``.
    """
    question1_vectors = np.zeros((data.shape[0], 300))
    error_count = 0  # NOTE(review): never updated or read — confirm intent
    for i, q in tqdm(enumerate(data.question1.values)):
        question1_vectors[i, :] = sent2vec(q)

    question2_vectors  = np.zeros((data.shape[0], 300))
    for i, q in tqdm(enumerate(data.question2.values)):
        question2_vectors[i, :] = sent2vec(q)

    # NaNs are replaced with zeros before every distance computation.
    data['cosine_distance'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                              np.nan_to_num(question2_vectors))]

    data['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                              np.nan_to_num(question2_vectors))]
    data['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                              np.nan_to_num(question2_vectors))]

    data['canberra_distance'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                              np.nan_to_num(question2_vectors))]  
    data['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

    data['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]
    data['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

    # Distribution-shape features of each embedding.
    data['skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)]
    data['skew_q2vec'] = [skew(x) for x in np.nan_to_num(question2_vectors)]
    data['kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(question1_vectors)]
    data['kur_q2vec'] = [kurtosis(x) for x in np.nan_to_num(question2_vectors)]
    return data
def nearest_centroid_clustering(X_train, X_test, y_train, y_test, parameters,
                                evaluation_metrics):
    """Fit sklearn's NearestCentroid with the configured distance and store
    its test-set accuracy under ``evaluation_metrics['accuracy']``.

    Returns the updated ``evaluation_metrics`` dict.
    """
    # modify parameters to call the clustering algorithm with modified ones, this mainly purposes the distance parameter
    modified_parameters = prepare_parameters(parameters)

    if modified_parameters["distance"] == "minkowski" and modified_parameters[
            "minkowski_p"] is not None:
        # Custom callable so the configured Minkowski order is honoured.
        initial_classifier = NearestCentroid(
            metric=lambda x, y: distance.minkowski(
                x, y, modified_parameters["minkowski_p"]))
    else:
        if modified_parameters["distance"] == "mahalanobis":
            initial_classifier = NearestCentroid(metric="mahalanobis",
                                                 metric_params={
                                                     "V": np.cov(X_train)
                                                 })  # TODO: fix
        else:
            initial_classifier = NearestCentroid(
                metric=modified_parameters["distance"])

    classifier = initial_classifier.fit(X_train, y_train)

    y_pred = classifier.predict(X_test)  # NOTE(review): computed but unused

    evaluation_metrics["accuracy"] = classifier.score(X_test, y_test)

    return evaluation_metrics
# Example #14
# 0
def calculate_distance(X, Y, metric='euclidean'):
    """Dispatch to a scipy.spatial.distance function chosen by ``metric``.

    Returns None for an unrecognised metric.  NOTE(review): despite the
    function's name, the cosine branch returns cosine *similarity*, not a
    distance — confirm callers expect that.
    """
    if metric == METRIC_EUCLIDEAN:
        return distance.euclidean(X, Y)
    elif metric == METRIC_JACCARD:
        return distance.jaccard(X, Y)
    elif metric == METRIC_CANBERRA:
        return distance.canberra(X, Y)
    elif metric == METRIC_CHEBYSHEV:
        return distance.chebyshev(X, Y)
    elif metric == METRIC_MINKOWSKI:
        return distance.minkowski(X, Y)
    elif metric == METRIC_WMINKOWSKI:
        # NOTE(review): wminkowski requires p and w arguments, so this call
        # raises TypeError if reached (and wminkowski was removed entirely
        # in SciPy 1.8) — confirm whether this branch is ever exercised.
        return distance.wminkowski(X, Y)
    elif metric == METRIC_BRAYCURTIS:
        return distance.braycurtis(X, Y)
    elif metric == METRIC_HAMMING:
        return distance.hamming(X, Y)
    elif metric == METRIC_MAHALANOBIS:
        # NOTE(review): mahalanobis requires the inverse covariance matrix
        # (VI) as a third argument; this call raises TypeError if reached.
        return distance.mahalanobis(X, Y)
    elif metric == METRIC_MANHATTAN:
        # Hand-rolled L1 distance (equivalent to distance.cityblock).
        return sum(abs(a - b) for a, b in zip(X, Y))

    elif metric == METRIC_COSINE:
        # Cosine similarity (dot / norms) — not 1 - similarity.
        dot_product = np.dot(X, Y)
        norm_a = np.linalg.norm(X)
        norm_b = np.linalg.norm(Y)
        return dot_product / (norm_a * norm_b)
def find_similar_to_team(team_stats, filtered_list, similarity='pearson'):
    """Rank the rows of ``filtered_list`` by similarity to a team profile.

    Args:
        team_stats: (mean_stats, std_stats) pair of pandas Series over
            TEAM_ATTRIBUTES.
        filtered_list: DataFrame containing at least the TEAM_ATTRIBUTES
            columns; mutated in place with the similarity column.
        similarity: 'pearson', 'cosine' or 'minkowski'.

    Returns:
        ``filtered_list`` sorted descending by the similarity column, or
        None for an unrecognised ``similarity``.
    """
    avg_stats = team_stats[0]
    sd_stats = team_stats[1]
    avg_stats['Age'] = 21  # fixed reference age for the comparison profile

    # Inverse-std weights, normalised to sum to 1.
    sd_stats = 1 / sd_stats
    sd_stats = sd_stats / sd_stats.sum()

    avg_stats = avg_stats.values.reshape(-1, len(avg_stats))
    fil_list = filtered_list[TEAM_ATTRIBUTES]
    # Scale each candidate's attributes by the weights.
    fil_list = fil_list.mul(sd_stats, axis=1)

    avg_series = pd.Series(avg_stats.flatten(), index=fil_list.columns)

    if similarity == 'pearson':
        pearson_sim = fil_list.corrwith(avg_series, axis=1)
        filtered_list['pearson'] = pearson_sim
        return filtered_list.sort_values(['pearson'], ascending=False)

    if similarity == 'cosine':
        cos_sim = cosine_similarity(fil_list, avg_stats)
        filtered_list['cosine'] = pd.Series(
            [x for row in cos_sim for x in row], index=filtered_list.index)
        return filtered_list.sort_values(['cosine'], ascending=False)

    if similarity == 'minkowski':
        # NOTE(review): minkowski(p=2) is a *distance*, so the descending
        # sort below puts the least similar rows first — confirm intent.
        minkowski_sim = []
        for index, row in fil_list.iterrows():
            minkowski_sim.append(
                minkowski(row.values, avg_stats.flatten(), p=2))

        filtered_list['minkowski'] = pd.Series(minkowski_sim,
                                               index=filtered_list.index)
        return filtered_list.sort_values(['minkowski'], ascending=False)
# Example #16
# 0
def get_w2v_simi(query, title):
    """Colon-joined string of word2vec distance and shape features for a
    query/title pair.

    Both texts are embedded with ``sent2vec`` (NaNs zeroed); the features
    are seven pairwise distances followed by skew and kurtosis of each
    embedding, formatted with str() and joined by ':'.
    """
    qv = np.nan_to_num(sent2vec(query))
    tv = np.nan_to_num(sent2vec(title))

    features = [
        cosine(qv, tv),
        cityblock(qv, tv),
        jaccard(qv, tv),
        canberra(qv, tv),
        euclidean(qv, tv),
        minkowski(qv, tv),
        braycurtis(qv, tv),
        skew(qv),
        skew(tv),
        kurtosis(qv),
        kurtosis(tv),
    ]
    return ':'.join('{}'.format(value) for value in features)
def Dist(array1, array2, dist):
    """Dispatch to a distance / correlation statistic named by ``dist``.

    'pearsonp'/'pearsonr' and 'spearmanp'/'spearmanr' return the p-value or
    coefficient of the corresponding correlation test; the remaining names
    map directly onto ``scipy.spatial.distance`` functions.  Returns None
    for an unrecognised name.
    """
    direct = {
        'braycurtis': distance.braycurtis,
        'correlation': distance.correlation,
        'mahalanobis': distance.mahalanobis,
        'minkowski': distance.minkowski,
        'seuclidean': distance.seuclidean,
        'sqeuclidean': distance.sqeuclidean,
    }
    if dist in direct:
        # NOTE(review): mahalanobis/seuclidean need an extra argument
        # (VI / V); those branches raise TypeError when hit — same as the
        # original code.
        return direct[dist](array1, array2)
    if dist == 'pearsonp':
        return pearsonr(array1, array2)[1]
    if dist == 'pearsonr':
        return pearsonr(array1, array2)[0]
    if dist == 'spearmanp':
        return spearmanr(array1, array2)[1]
    if dist == 'spearmanr':
        return spearmanr(array1, array2)[0]
    # Unknown name: fall through and return None, matching the original.
# Example #18
# 0
def rmsd(cluster_list, vectors, clu_vectors):
    """Mean squared deviation of the CG mode vectors w.r.t. the AT mode
    vectors, averaged along clusters (not a true RMSD).

    Every vector is nudged off zero if nearly null, rescaled to norm 100,
    and compared with its cluster's CG vector via squared Euclidean distance.
    """
    n_clusters = len(cluster_list)
    n_vectors = len(vectors)
    total = 0
    for c_idx in range(n_clusters):
        cg_vector = clu_vectors[c_idx]
        members = vectors[cluster_list[c_idx]]

        # Guard against a (near-)null CG vector before normalising to 100.
        if np.linalg.norm(cg_vector) < 1e-7:
            cg_vector += np.ones(3) * 1e-6
        cg_vector = (cg_vector / np.linalg.norm(cg_vector)) * 100

        squared_sum = 0
        for member in members:
            # Same guard + scaling for each member vector.
            if np.linalg.norm(member) < 1e-7:
                member += np.ones(3) * 1e-6
            member = (member / np.linalg.norm(member)) * 100
            squared_sum += minkowski(member, cg_vector, p=2) ** 2

        total += float(squared_sum)

    return (total / float(n_clusters * n_vectors)) / 100.0
# Example #19
# 0
def extend_with_features(data):
    """Add fuzzy-matching, WMD and word2vec distance features to a
    question-pair DataFrame (columns 'question1'/'question2') in place.

    Loads the GoogleNews word2vec model twice: once as-is for WMD and once
    L2-normalised for the normalised WMD variant.  Returns the DataFrame.
    """
    stop_words = stopwords.words('english')  # NOTE(review): unused here — confirm
    data['fuzz_qratio'] = data.apply(
        lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])),
        axis=1)
    data['fuzz_WRatio'] = data.apply(
        lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])),
        axis=1)

    model = gensim.models.KeyedVectors.load_word2vec_format(
        google_news_model_path, binary=True)
    data['wmd'] = data.apply(
        lambda x: wmd(model, x['question1'], x['question2']), axis=1)

    # Second model copy with unit-normalised vectors for the normalised WMD.
    norm_model = gensim.models.KeyedVectors.load_word2vec_format(
        google_news_model_path, binary=True)
    norm_model.init_sims(replace=True)
    data['norm_wmd'] = data.apply(
        lambda x: norm_wmd(norm_model, x['question1'], x['question2']), axis=1)

    # 300-d sentence embeddings for each question.
    question1_vectors = np.zeros((data.shape[0], 300))
    for i, q in enumerate(data.question1.values):
        question1_vectors[i, :] = sent2vec(model, q)

    question2_vectors = np.zeros((data.shape[0], 300))
    for i, q in enumerate(data.question2.values):
        question2_vectors[i, :] = sent2vec(model, q)

    # Replace NaNs so the distance functions below stay finite.
    question1_vectors = np.nan_to_num(question1_vectors)
    question2_vectors = np.nan_to_num(question2_vectors)

    data['cosine_distance'] = [
        cosine(x, y) for (x, y) in zip(question1_vectors, question2_vectors)
    ]
    data['cityblock_distance'] = [
        cityblock(x, y) for (x, y) in zip(question1_vectors, question2_vectors)
    ]
    data['jaccard_distance'] = [
        jaccard(x, y) for (x, y) in zip(question1_vectors, question2_vectors)
    ]
    data['canberra_distance'] = [
        canberra(x, y) for (x, y) in zip(question1_vectors, question2_vectors)
    ]
    data['euclidean_distance'] = [
        euclidean(x, y) for (x, y) in zip(question1_vectors, question2_vectors)
    ]
    data['minkowski_distance'] = [
        minkowski(x, y, 3)
        for (x, y) in zip(question1_vectors, question2_vectors)
    ]
    data['braycurtis_distance'] = [
        braycurtis(x, y)
        for (x, y) in zip(question1_vectors, question2_vectors)
    ]

    # Distribution-shape features of each embedding.
    data['skew_q1vec'] = [skew(x) for x in question1_vectors]
    data['skew_q2vec'] = [skew(x) for x in question2_vectors]
    data['kur_q1vec'] = [kurtosis(x) for x in question1_vectors]
    data['kur_q2vec'] = [kurtosis(x) for x in question2_vectors]
    return data
# Example #20
# 0
    def compute_neighbors(self, X_test):
        """Find the k nearest training neighbours of every test point.

        Maps self.metric ('manhattan'/'euclidean'/'minkowski') onto a
        Minkowski power, then brute-forces all test-to-train distances and
        fills self.XTestNeighborsDist / ...Id / ...Y with the k smallest
        distances, their train indices and labels per test row.
        """
        ## Figure out the power p for minkowski distance
        if (self.metric == 'manhattan'):
            self.minkowski_p = 1
        elif (self.metric == 'euclidean'):
            self.minkowski_p = 2
        elif (self.metric == 'minkowski'):
            # Assumes self.minkowski_p was already set elsewhere — TODO confirm.
            pass
        else:
            self.minkowski_p = 2
            print(
                "WARNING: Unknown distance metric: %s specified! Reverting to euclidean"
                % (self.metric))

        ## Compute distance of a test point to all train points
        # Buffers are reused (fully overwritten) for every test point.
        distance_matrix = np.zeros([self.numObs])
        neighbor_id = np.zeros([self.numObs])
        for test_idx in range(self.XTestNumObs):
            for n_idx in range(self.numObs):
                neighbor_id[n_idx] = n_idx
                distance_matrix[n_idx] = minkowski(X_test[test_idx],
                                                   self.X[n_idx],
                                                   self.minkowski_p)

            ## Sort and record distances and neighbors
            # Tuples sort by distance first; ties fall back to neighbour id.
            sorted_distances = sorted(zip(distance_matrix, neighbor_id,
                                          self.y))
            for k_idx in range(self.k):
                self.XTestNeighborsDist[
                    test_idx, k_idx], self.XTestNeighborsId[
                        test_idx, k_idx], self.XTestNeighborsY[
                            test_idx, k_idx] = sorted_distances[k_idx]
def compute_pl_distance(
    pls, median_pl, p, dest_dir, file_names, patient_category
):
    """Per-patient Minkowski distance of persistence landscapes from the median.

    Args:
        pls: array of shape (n_patients, n_hom_dims, landscape_size) —
            presumably 3 homology dimensions given the H_0..H_2 columns
            below; TODO confirm.
        median_pl: array of shape (n_hom_dims, landscape_size).
        p: Minkowski order.
        dest_dir: output path prefix for the CSV files.
        file_names: per-patient identifiers used as row labels.
        patient_category: suffix embedded in the output file names.

    Side effects: writes ``outliers_<category>.csv`` (identifiers of the 140
    largest distances per dimension) and
    ``distance_from_median_pl_<category>.csv``.  Returns None.
    """
    diffs = []
    for pl in range(pls.shape[0]):
        # Loop through each patient
        patient_dist_from_avg = []
        for h_dim in range(pls.shape[1]):
            # Loop through each dimension
            patient_dist_from_avg.append(
                distance.minkowski(
                    pls[pl, h_dim, :].flatten(),
                    median_pl[h_dim, :].flatten(),
                    p,
                )
            )
        diffs.append(patient_dist_from_avg)
    diffs = np.array(diffs)
    # with open(dest_dir + ".npy", "wb") as f:
    #     np.save(f, diffs)
    diffs = pd.DataFrame(diffs, columns=["H_0", "H_1", "H_2"])
    file_names = np.array(file_names)
    outliers = pd.DataFrame()
    # Select patients who are outliers
    for col in diffs.columns:
        outliers[col] = list(
            file_names[np.array(diffs.nlargest(140, columns=col).index)]
        )
    outliers.to_csv(dest_dir + f"outliers_{patient_category}.csv", index=False)
    diffs.index = file_names
    diffs.to_csv(
        dest_dir + f"distance_from_median_pl_{patient_category}.csv",
        index=True,
    )
# Example #22
# 0
    def process_batch(self,batch_np_array, k=10):
        """Train the map on ``batch_np_array`` for up to ``k`` epochs.

        (Python 2 code — note the ``print`` statement below.)  Each input
        row is presented to the map; the winning neuron's quantisation
        score and co-association entries are recorded.  After every epoch
        the neighbourhood radius ``self.nr`` and learning rate ``self.lr``
        decay, and training stops early once the radius drops to 1.
        """
        start_time= time.time()
        for j in range(k):
            self.map_sizes.append(len(self.map_neurons.keys()))
            for i in range(batch_np_array.shape[0]):
                # Progress line: epoch, radius, input index, map size, elapsed.
                sys.stdout.write("iteration %d :"%(j+1))
                sys.stdout.write(" : NR = %d: "%(self.nr))
                sys.stdout.write(" input %d "%(i))
                sys.stdout.write(" map size %d "%(len(self.map_neurons.keys())))
                sys.stdout.write(" time %d \r"%(time.time()-start_time))
                sys.stdout.flush()
                tinp = batch_np_array[i]
                bcoords=self.process_input(tinp)
                bhash=str(bcoords[0])+""+str(bcoords[1])
                winner = self.map_neurons[bhash]

                #here's the tricky part
                # Quantisation error of the winner for this input (Euclidean).
                score= minkowski(winner.weight_vs,tinp,2)#/self.dim
                winner.coassoc_vs[i]= score
                winner.binarycoassoc_vs[i]=1
                #print winner.coassoc_vs
                self.map_neurons[bhash]=winner

            # Decay neighbourhood radius and learning rate after each epoch.
            self.nr=self.nr*(1-self.lr)
            self.lr = self.lr*self.lr_red_coef*(1-3.85/len(self.map_neurons.values()))
            if self.nr <=1 :
                print self.nr
                return

        return
# Example #23
# 0
def findClosest(textVector, genre):
    '''
    Args:
        textVector: vector of data analyzed from a text

        genre: genre of text ("fiction" or "nonfiction")

    Returns:
        a string explaining the three most stylistically similar authors and
        their minkowski distances from the author's style, ordered from least
        distance to greatest

    Raises:
        ValueError: if genre is neither "fiction" nor "nonfiction".
        (Previously an unknown genre crashed later with a NameError because
        the comparison set was never assigned.)
    '''
    THIS_FOLDER = os.path.dirname(os.path.abspath(__file__))
    # pick the exemplary data set for the requested genre (mutually exclusive)
    if genre == "fiction":
        compSetPath = os.path.join(THIS_FOLDER, "data/exemplaryFictionDataSTDD.csv")
    elif genre == "nonfiction":
        compSetPath = os.path.join(THIS_FOLDER, "data/exemplaryNonfictionDataSTDD.csv")
    else:
        raise ValueError("genre must be 'fiction' or 'nonfiction', got %r" % (genre,))
    standardizedCompSet = readVectors(compSetPath)
    stdTextVector = standardizeVector(textVector, genre)
    # generate minkowski distance for each comparison text;
    # vector[0] holds the author name, the rest are the feature values
    distanceTuples = [(round(minkowski(stdTextVector[1:], vector[1:], 2), 2), vector[0])
                      for vector in standardizedCompSet]
    # three closest (distance, author) tuples, least distance first
    distanceTuples.sort()
    (d1, a1), (d2, a2), (d3, a3) = distanceTuples[:3]
    # the old backslash-continuation literal leaked source indentation into
    # the report; implicit concatenation keeps the text clean
    similarityReport = (
        "The authors in our database whose styles are most like yours "
        "are {}, with a difference quotient of {}, followed by {} ({}) and {} ({})."
        .format(a1, d1, a2, d2, a3, d3)
    )
    return similarityReport
    def dataframe_with_bmu_for_each_input_vector_and_distance(
            self, passed_dataframe):
        """Build a dataframe pairing every indicator with every other.

        Each row holds both indicators' BMU grid coordinates and the
        Manhattan (Minkowski p=1) distance between them.  O(n^2) in the
        number of input rows.
        NOTE(review): Python 2 code (print statements, DataFrame.ix).
        NOTE(review): ``columns=[[...]]`` creates a MultiIndex rather than
        flat columns — confirm that is intended.
        """

        temp_dataframe = copy.deepcopy(passed_dataframe)
        print temp_dataframe
        new_ind = pd.DataFrame(columns=[[
            'bmu_indicator1', 'indicator_1', 'bmu_indicator2', 'indicator_2',
            'distance'
        ]])
        print "dataframe size=", temp_dataframe.shape[0]
        for i in range(temp_dataframe.shape[0]):
            print "done for %d indicator" % (i)
            for j in range(temp_dataframe.shape[0]):
                #print "bmu1=",temp_dataframe.ix[i,0],"bmu2",temp_dataframe.ix[j,0]
                # 'node' cells are string representations of coordinate lists
                a = literal_eval(temp_dataframe.ix[i, 'node'])
                b = literal_eval(temp_dataframe.ix[j, 'node'])
                dist = minkowski(np.array(a), np.array(b), 1)
                #print 'dist=',dist
                X = pd.DataFrame(np.array([[
                    a, temp_dataframe.ix[i, 'column'], b,
                    temp_dataframe.ix[j, 'column'], dist
                ]]),
                                 columns=[
                                     'bmu_indicator1', 'indicator_1',
                                     'bmu_indicator2', 'indicator_2',
                                     'distance'
                                 ])
                new_ind = new_ind.append(X, ignore_index=True)
        return new_ind
Example #25
0
def distance_features(data,genismModel):
    """Append pairwise vector-distance and shape features to ``data``.

    Embeds question1/question2 with ``sent2vec`` (empty embeddings are
    replaced by a 300-dim zero vector), then adds one column per distance
    metric plus skewness/kurtosis of each embedding.
    Returns the augmented dataframe and the list of added feature names.
    """
    q1_vecs = np.array([sent2vec(q, genismModel) for q in data.question1])
    q2_vecs = np.array([sent2vec(q, genismModel) for q in data.question2])
    filler = np.zeros(300)
    # degenerate embeddings (size 1) are replaced by the zero vector
    for idx in range(len(q1_vecs)):
        if q1_vecs[idx].size == 1:
            q1_vecs[idx] = filler
    for idx in range(len(q2_vecs)):
        if q2_vecs[idx].size == 1:
            q2_vecs[idx] = filler

    pairs = list(zip(q1_vecs, q2_vecs))
    metrics = [
        ('cosine_distance', cosine),
        ('cityblock_distance', cityblock),
        ('jaccard_distance', jaccard),
        ('canberra_distance', canberra),
        ('euclidean_distance', euclidean),
        ('minkowski_distance', lambda u, v: minkowski(u, v, 3)),
        ('braycurtis_distance', braycurtis),
    ]
    for col, fn in metrics:
        data[col] = [fn(u, v) for u, v in pairs]
    data['skew_q1vec'] = [skew(v) for v in q1_vecs]
    data['skew_q2vec'] = [skew(v) for v in q2_vecs]
    data['kur_q1vec'] = [kurtosis(v) for v in q1_vecs]
    data['kur_q2vec'] = [kurtosis(v) for v in q2_vecs]
    fs_4 = [name for name, _ in metrics] + [
        'skew_q1vec', 'skew_q2vec', 'kur_q1vec', 'kur_q2vec']
    return data, fs_4
def find_distance_score(curr_img, data_matrix, function_val):
    """Score ``curr_img`` against every row of ``data_matrix``.

    function_val selects the measure:
      * "minkowski3" — list of Minkowski(p=3) distances
      * "cosine"     — list of cosine *similarities* (not distances)
      * "euclidean"  — ndarray of Euclidean distances
      * anything else — ndarray of Manhattan distances
    """
    if function_val == "minkowski3":
        return [distance.minkowski(row, curr_img, 3) for row in data_matrix]

    if function_val == "cosine":
        scores = []
        for row in data_matrix:
            numerator = np.dot(curr_img, row)
            denominator = np.linalg.norm(curr_img) * np.linalg.norm(row)
            scores.append(numerator / denominator)
        return scores

    if function_val == "euclidean":
        return np.sqrt(np.sum(np.square(data_matrix - curr_img), axis=1))

    # default: Manhattan (cityblock) distance
    return np.sum(np.absolute(data_matrix - curr_img), axis=1)
Example #27
0
 def sortneighbors(self, x, y, X_train, x_test):
     """Sort training points (and their labels) by distance to a test point.

     x, y    : training vectors and their labels
     X_train : used only for its row width when pre-sizing the output array
     x_test  : the query point
     Returns (x_sorted, y_sorted) in ascending order of ``self.metric``
     distance; unknown metrics leave distances at ``np.empty`` garbage and
     only print an error (behaviour preserved from the original).
     """
     # np.float was removed in NumPy 1.24; the builtin float is equivalent
     x = np.array(x).astype(float)
     x_test = np.array(x_test).astype(float)
     dist = np.empty(len(x))
     for i in range(len(x)):
         if self.metric == 'cosine':
             dist[i] = distance.cosine(x[i], x_test)
         elif self.metric == 'chebyshev':
             dist[i] = distance.chebyshev(x[i], x_test)
         elif self.metric == 'cityblock':
             dist[i] = distance.cityblock(x[i], x_test)
         elif self.metric == 'euclidean':
             dist[i] = distance.euclidean(x[i], x_test)
         elif self.metric == 'minkowski':
             dist[i] = distance.minkowski(x[i], x_test)
         else:
             print(
                 'Error!!! Enter a correct distance function and try again \n'
             )
     # indices of the training points in ascending distance order
     order = np.argsort(dist)
     x_sorted = np.empty(shape=(len(x), len(X_train[1])))
     y_sorted = []
     k = 0
     for i in order:
         x_sorted[k] = x[i]
         y_sorted.append(y[i])
         k = k + 1
     return x_sorted, y_sorted
Example #28
0
def minkowski(x, y, p=3):
    """Minkowski distance of order ``p`` between vectors ``x`` and ``y``.

    Returns NaN instead of raising (e.g. for mismatched vector lengths) so
    bulk feature extraction never aborts.  The original had two redundant
    except clauses both returning NaN; they are merged, and the ``np.NaN``
    alias (removed in NumPy 2.0) is replaced by ``np.nan``.
    """
    try:
        return distance.minkowski(x, y, p)
    except Exception:
        # scipy raises ValueError for shape mismatches; any other failure is
        # likewise mapped to NaN
        return np.nan
Example #29
0
    def get_labels(self, scores, u, yu):
        """Assign label ``yu`` to sample ``u`` and to agreeing neighbours.

        Neighbours are the k nearest samples (Minkowski distance, scipy
        default p=2) that have not been queried yet; a neighbour agrees when
        its score is on the same side of 0.5 as ``yu``.
        Note: ``self.labels`` is mutated in place and also returned.
        """
        labels = self.labels
        labels[u] = yu
        # distance from the queried sample to every sample in X
        dists = [minkowski(self.X[u], sample) for sample in self.X]
        ranked = np.argsort(dists)
        k = self.k
        # ids of the k nearest samples that were never queried
        unqueried = np.setdiff1d(ranked[:k], self.queried_ids)
        agreeing = [idx for idx in unqueried
                    if (yu == 1 and scores[idx] >= 0.5)
                    or (yu == -1 and scores[idx] < 0.5)]
        labels[list(agreeing)] = yu  # k neighbours
        return labels
Example #30
0
def distance(x, y, weights = [], p = 3, method = "euclidean"):
    '''
    Compute the distance between two vectors.

    :param x: X vector
    :param y: Y vector
    :param weights: unused, kept for backward compatibility
    :param p: order of the Minkowski distance
    :param method: one of "euclidean", "minkowski", "cosine", "manhattan",
                   "dice", "jaccard", "hamming", "canbera" (Canberra);
                   anything else falls back to Euclidean with a warning
    :return: The Distance Value
    '''
    # This function shadows scipy.spatial.distance at module scope, so every
    # branch of the original raised AttributeError at runtime; re-import the
    # module locally under a different name.
    from scipy.spatial import distance as _sp

    if method == "euclidean":
        value = _sp.euclidean(x, y)
    elif method == "minkowski":
        value = _sp.minkowski(x, y, p)
    elif method == "cosine":
        value = _sp.cosine(x, y)
    elif method == "manhattan":
        value = _sp.cityblock(x, y)
    elif method == "dice":
        value = _sp.dice(x, y)
    elif method == "jaccard":
        value = _sp.jaccard(x, y)
    elif method == "hamming":
        # was `value == ...` (a no-op comparison); fixed to an assignment
        value = _sp.hamming(x, y)
    elif method == "canbera":
        # was a no-op comparison against chebyshev; the branch name says
        # Canberra, so compute the Canberra distance
        value = _sp.canberra(x, y)
    else:
        print(method, " Not Found! using Euclidean Distance!")
        value = _sp.euclidean(x, y)
    return value
Example #31
0
 def process_ppr(self, n, m, k):
     """Personalised PageRank over the gesture similarity graph.

     n : seed gesture indices (1-based)
     m : number of dominant gestures to report
     k : neighbours kept per node when building the kNN adjacency graph
     Side effects: creates a run-specific output directory, plots the
     dominant gestures, and dumps a JSON summary file.
     """
     # run-specific output dir: <base>/<k>_<m>_<comma-joined seeds>
     self.output_dir = self.output_dir + "/{}_{}_{}".format(
         k, m, ",".join([str(x) for x in n]))
     Path(self.output_dir).mkdir(parents=True, exist_ok=True)
     sim_matrix = self.get_sim_matrix()
     adj_matrix = self.get_knn_nodes(sim_matrix, k)
     adj_matrix_norm = self.normalize(adj_matrix)
     size = adj_matrix_norm.shape[0]
     # u: rank vector, v: restart (teleport) vector seeded uniformly on n
     u_old = np.zeros(size, dtype=float).reshape((-1, 1))
     v = np.zeros(size, dtype=float).reshape((-1, 1))
     for value in n:
         u_old[value - 1] = 1 / len(n)
         v[value - 1] = 1 / len(n)
     A = adj_matrix_norm
     diff = 1
     c = 0.65
     # power iteration: u <- (1-c)*A*u + c*v until the L1 change vanishes
     while diff > 1e-20:
         u_new = ((1 - c) * np.matmul(A, u_old)) + (c * v)
         diff = distance.minkowski(u_new, u_old, 1)
         u_old = u_new
     # top-m nodes by final rank, mapped back to gesture file names
     res = [self.idx_file_map[x] for x in u_new.ravel().argsort()[::-1][:m]]
     self.plot_dominant_gestures(res, k)
     c = {}
     c['user_files'] = [self.idx_file_map[x] for x in n]
     c['dominant_gestures'] = res
     json.dump(c,
               open(self.output_dir + "/{}_{}_dominant.txt".format(k, m),
                    "w"),
               indent='\t')
Example #32
0
def KNN(c, dataSet, k=3, dist_eq="Euclidean", p=3, r_dist=False):
    """Return the k points of ``dataSet`` nearest to ``c``.

    dist_eq selects the metric (Euclidean, Manhattan, Minkowski with order
    ``p``, or Hamming); anything else raises ValueError.  With
    ``r_dist=True`` the matching distances are returned as well.
    Neighbours are kept by replacing the current worst entry, so their
    order follows insertion, not distance.
    """
    neighbors = []
    dist = []
    for sample in dataSet:
        if dist_eq == "Euclidean":
            d = spd.euclidean(c, sample)
        elif dist_eq == "Manhattan":
            d = spd.cityblock(c, sample)
        elif dist_eq == "Minkowski":
            d = spd.minkowski(c, sample, p)
        elif dist_eq == "Hamming":
            d = spd.hamming(c, sample)
        else:
            raise ValueError(
                "Invalid setting for dist_eq=Euclidean, Manhattan, Minkowski, Hamming"
            )
        if len(neighbors) < k:
            # still filling the candidate pool
            neighbors.append(sample)
            dist.append(d)
        else:
            # evict the current farthest candidate if this one is closer
            worst = max(dist)
            if d < worst:
                pos = dist.index(worst)
                neighbors[pos] = sample
                dist[pos] = d
    if r_dist == False:
        return neighbors
    return neighbors, dist
def vectors_features(in_data: pd.DataFrame,
                     sent2vec: Callable[[str], np.array]) -> pd.DataFrame:
    """Append vector-distance features for the two question columns.

    Embeds question1/question2 with ``sent2vec``, then adds one column per
    pairwise metric plus skewness/kurtosis of each embedding and their
    absolute differences.  Mutates and returns ``in_data``.
    """
    assert "question1" in in_data.columns
    assert "question2" in in_data.columns
    v1 = np.array([sent2vec(q) for q in in_data['question1']])
    v2 = np.array([sent2vec(q) for q in in_data['question2']])
    pair_metrics = (
        ('cos', cosine),
        ('jaccard', jaccard),
        ('euclidean', euclidean),
        ('minkowski', minkowski),
        ('cityblock', cityblock),
        ('canberra', canberra),
        ('braycurtis', braycurtis),
    )
    for col, fn in pair_metrics:
        in_data[col] = np.array([fn(a, b) for a, b in zip(v1, v2)])
    in_data['skew_q1'] = np.array([skew(v) for v in v1])
    in_data['skew_q2'] = np.array([skew(v) for v in v2])
    in_data['kur_q1'] = np.array([kurtosis(v) for v in v1])
    in_data['kur_q2'] = np.array([kurtosis(v) for v in v2])
    in_data['skew_diff'] = np.abs(in_data['skew_q1'] - in_data['skew_q2'])
    in_data['kur_diff'] = np.abs(in_data['kur_q1'] - in_data['kur_q2'])
    return in_data
Example #34
0
    def _build_distance_matrix(self, detected_squares, tracked_squares):
        """Distance from every detected square to every tracked square.

        Returns a nested dict: matrix[detected][tracked] = Minkowski
        distance between the two squares' centers with p=128 (for practical
        purposes this approximates the Chebyshev/max-coordinate distance).
        """
        return {
            det: {trk: distance.minkowski(det.center, trk.center, 128)
                  for trk in tracked_squares}
            for det in detected_squares
        }
Example #35
0
 def _grow_map(self,input,k):
     """Record how well the BMU matches ``input`` for training pass ``k``.

     Finds the best-matching unit for the input vector, stores the
     Minkowski(p=2) distance between its weights and the input in the
     per-pass co-association vector, and flags the binary entry.
     """
     coords = self._process_input(input)
     key = str(coords[0]) + "" + str(coords[1])
     bmu = self.map_neurons[key]
     bmu.k_coassoc_vs[k] = minkowski(bmu.weight_vs, input, 2)
     bmu.binarycoassoc_vs[k] = 1
     self.map_neurons[key] = bmu
     return
Example #36
0
    def find_bmu(self, x):
        """Return (key, distance) of the best-matching unit for input ``x``.

        NOTE(review): Python 2 code — dict .values()/.keys() are indexed
        directly and ``print`` is a statement; also relies on .keys() and
        .values() sharing one ordering.
        """
        nodes = np.asarray(self.neurons.values())
        # index of the nearest neuron by Euclidean norm
        mink = np.argmin(np.linalg.norm(x - nodes, axis=1))
        # mink = pairwise_distances_argmin(nodes, np.array([x]))
        try:
            dist =minkowski(self.neurons.values()[mink], x, p = 2)
        except ValueError:
            print 'nan'
        # NOTE(review): if minkowski raised, `dist` is unbound here — confirm

        return self.neurons.keys()[mink], dist   #dist_sqr[mink]
Example #37
0
    def _compute_measure(vals_1, vals_2, method='bhat'):
        """Distance/dissimilarity between two 1-D value lists (symmetric).

        Supported methods:
          manhattan/l1 : Minkowski distance of 1st degree
          euclidean/l2 : Minkowski distance of 2nd degree
          l3           : Minkowski distance of 3rd degree
          bhat         : Bhattacharyya distance
          jeffrey      : Jeffrey's (symmetrised KL) divergence
          js           : Jensen-Shannon distance
          dis_intersect: 1 - intersection (similarity turned dissimilarity)
          dis_corr     : 1 - correlation
        Raises ValueError for any other method.
        """
        if method in ['manhattan', 'l1']:
            return spdistance.minkowski(vals_1, vals_2, 1)
        if method in ['euclidean', 'l2']:
            return spdistance.euclidean(vals_1, vals_2)
        if method == 'l3':
            return spdistance.minkowski(vals_1, vals_2, 3)
        if method == 'bhat':  # bhattacharrya distance
            return -np.log(np.sum(np.sqrt(vals_1 * vals_2)))
        if method == 'jeffrey':  # Jeffrey's divergence
            return (np.sum(vals_1 * np.log(vals_1 / vals_2)) +
                    np.sum(vals_2 * np.log(vals_2 / vals_1)))
        if method == 'js':  # Jensen–Shannon distance
            return np.sqrt(
                np.sum(vals_1 * np.log(2 * vals_1 / (vals_1 + vals_2))) * 0.5 +
                np.sum(vals_2 * np.log(2 * vals_2 / (vals_1 + vals_2))) * 0.5)
        # Correlation and intersection are similarity measures; convert them
        # to dissimilarities by taking 1 - similarity.
        if method == 'dis_intersect':
            return 1.0 - np.sum(np.minimum(vals_1, vals_2)) / np.size(vals_1)
        if method == 'dis_corr':
            return 1.0 - np.correlate(vals_1, vals_2)
        raise ValueError("Unknown method")
Example #38
0
 def find_bmu(self, x):
     """Return (key, distance) of the best-matching unit for input ``x``.

     NOTE(review): Python 2 code — indexed dict views, print statement;
     relies on .keys() and .values() sharing one ordering.
     """
     nodes = np.asarray(self.neurons.values())
     # squared Euclidean distance from x to every neuron's weight vector
     deltas = nodes - x
     dist_sqr = np.sum(deltas**2, axis =1 )
     mink = np.argmin(dist_sqr)
     # mink = pairwise_distances_argmin(nodes, np.array([x]))
     try:
         dist =minkowski(self.neurons.values()[mink], x, p = 2)
     except ValueError:
         print 'nan'
     # NOTE(review): if minkowski raised, `dist` is unbound here — confirm
     return self.neurons.keys()[mink], dist  #dist_sqr[mink]
Example #39
0
def distance(vals_1, vals_2, method='euclidean'):
    """Distance between two 1-D value lists (symmetric).

    Called with pitch distribution values while generating distance
    matrices.  Methods:
      manhattan    : Minkowski distance of 1st degree
      euclidean    : Minkowski distance of 2nd degree
      l3           : Minkowski distance of 3rd degree
      bhat         : Bhattacharyya distance
      intersection : inverse of the intersection similarity
      corr         : 1 - correlation
    Unknown methods return 0.
    """
    # This function shadows the `distance` module from scipy.spatial at
    # module scope, so the original's `distance.euclidean(...)` resolved to
    # the function itself and raised AttributeError; re-import the module
    # locally under another name.
    from scipy.spatial import distance as _spd
    import math

    if (method == 'euclidean'):
        return _spd.euclidean(vals_1, vals_2)

    elif (method == 'manhattan'):
        return _spd.minkowski(vals_1, vals_2, 1)

    elif (method == 'l3'):
        return _spd.minkowski(vals_1, vals_2, 3)

    elif (method == 'bhat'):
        return -math.log(sum(np.sqrt(vals_1 * vals_2)))

    # Since correlation and intersection are actually similarity measures,
    # we take their inverse to be able to use them as distances: maximal
    # similarity gives the minimal value.
    elif (method == 'intersection'):
        return len(vals_1) / (sum(np.minimum(vals_1, vals_2)))

    elif (method == 'corr'):
        return 1.0 - np.correlate(vals_1, vals_2)

    else:
        return 0
Example #40
0
def _find_in_map(gmap,ix_rng_s,ix_rng_e, inp_vec):
    """Nearest neuron to ``inp_vec`` within a slice of the neuron map.

    Scans neurons keyed at positions [ix_rng_s:ix_rng_e] of gmap.keys() and
    returns the one whose weight vector has the smallest Minkowski(p=2)
    distance to the input.
    NOTE(review): Python 2 code — slicing .keys() directly requires it to
    return a list.
    """

    keys= gmap.keys()[ix_rng_s:ix_rng_e]

    # sentinel: max signed 64-bit integer
    minDist=9223372036854775807
    candidate= None
    for neu_key in keys:
        neu = gmap[neu_key]
        cand=minkowski(inp_vec, neu.weight_vs, 2)
        if minDist> cand:
            minDist = cand
            candidate= neu

    return  candidate
Example #41
0
    def _get_BMU(self,input_nparray):
        """Return the best-matching unit (nearest neuron) for the input.

        Uses Jaccard dissimilarity for boolean maps and Minkowski(p=2)
        otherwise.
        NOTE(review): Python 2 code (`itervalues`).
        """

        # sentinel: max signed 64-bit integer
        minDist=9223372036854775807
        candidate= None
        for neu in self.map_neurons.itervalues():

            if self.boolean:
                cand = jaccard(input_nparray, neu.weight_vs)
                if minDist> cand:
                    minDist = cand
                    candidate= neu
            else:
                cand=minkowski(input_nparray, neu.weight_vs, 2)
                if minDist> cand:
                    minDist = cand
                    candidate= neu

        return  candidate
Example #42
0
    def getBMU(self,input_nparray):
        """Return the best-matching unit (nearest neuron) for the input.

        Uses Jaccard dissimilarity for boolean maps and Minkowski(p=2)
        otherwise.
        NOTE(review): Python 2 code (`itervalues`).
        """
        # sentinel: max signed 64-bit integer
        minDist=9223372036854775807
        candidate= None
        for neu in self.map_neurons.itervalues():
            #print "input: "+str(input_nparray)
            #print "neuron: "+str (neu.weight_vs)
            if self.boolean:
                cand = jaccard(input_nparray, neu.weight_vs)
                if minDist> cand:
                    minDist = cand
                    candidate= neu
            else:
                cand=minkowski(input_nparray, neu.weight_vs, 2)
                if minDist> cand:
                #print "mindist:",minDist
                #print "cand:",cand
                    minDist = cand
                    candidate= neu

                #print "candidate'scoords",candidate.coords()
        return  candidate
Example #43
0
def score(data, labels=None, centroids=None, norm=2):
    """Given data and labels or centroids, calculate objective
    the data, and either labels or centroids has to be given, the other is None
    If both labels and centroids are None, an error will be thrown
    If neither labels and centroids are None, they will both be used (although this will typically result
       in a worse score than if leaving one of them as None)
    If one is None, the other isn't, the one that is None will be filled in using the other
    norm is the minkowski norm used for computing distances
    """
    # NOTE(review): `unique`, `array`, `shape` (and possibly `sum`) come from
    # a star import elsewhere in the file — presumably numpy; confirm.
    assert (labels is not None) or (centroids is not None), "At least one of labels and centroids must be not None"
    if centroids is None:
        centroids = getcentroids(data, labels)
    if labels is None:
        labels = getlabels(data, centroids, norm)
    distances = 0
    labelnames = unique(labels)
    # accumulate each point's distance to its own cluster centroid
    for i in range(len(labelnames)):
        datac = data[labels == labelnames[i]]
        distances = distances + sum(array([minkowski(datac[j, :], centroids[i, :], norm)
                                           for j in range(shape(datac)[0])]))
    # mean per-point distance over the whole data set
    return distances / float(len(labels))
Example #44
0
    def parallel_search_bmu(self, input_vector):
        """Find the best-matching unit using n_jobs parallel map slices.

        Splits the neuron map into n_jobs index ranges, searches each slice
        in parallel via ``_find_in_map``, then picks the closest of the
        per-slice winners.
        NOTE(review): Python 2 code — `print r` statement, and the slice
        bounds rely on Python 2 integer division.
        """

        mapsize=len(self.map_neurons.keys())
        indices=[]
        for i in range (self.n_jobs):
            indices.append([i*mapsize/self.n_jobs , (i+1)*mapsize/self.n_jobs-1])

        #res = Parallel(n_jobs=2) (delayed(check_paths) (Path(points), a) for points in b)

        res=Parallel(n_jobs=self.n_jobs)(delayed(_find_in_map)(self.map_neurons,ix_range[0],ix_range[1],input_vector)for ix_range in indices)
        for r in res:
            if r is None:
                print r

        # final reduction: closest candidate across all slices
        minDist=9223372036854775807
        candidate= None
        for neu in res:
            cand=minkowski(input_vector, neu.weight_vs, 2)
            if minDist> cand:
                minDist = cand
                candidate= neu

        return candidate
Example #45
0
	def compute_neighbors (self, X_test) :
		"""Record the k nearest training neighbours for every test point.

		Fills self.XTestNeighborsDist / XTestNeighborsId / XTestNeighborsY
		in place; distances use the Minkowski norm derived from self.metric
		('manhattan' -> p=1, 'euclidean' -> p=2, 'minkowski' -> preset
		self.minkowski_p, anything else falls back to p=2 with a warning).
		"""
		## Figure out the power p for minkowski distance
		if (self.metric == 'manhattan') :
			self.minkowski_p = 1
		elif (self.metric == 'euclidean') :
			self.minkowski_p = 2
		elif (self.metric == 'minkowski') :
			pass
		else :
			self.minkowski_p = 2
			print ("WARNING: Unknown distance metric: %s specified! Reverting to euclidean" %(self.metric))

		## Compute distance of a test point to all train points
		distance_matrix = np.zeros([self.numObs])
		neighbor_id = np.zeros([self.numObs])
		for test_idx in range(self.XTestNumObs) :
			for n_idx in range(self.numObs) :
				neighbor_id[n_idx] = n_idx
				distance_matrix[n_idx] = minkowski(X_test[test_idx], self.X[n_idx], self.minkowski_p)

			## Sort and record distances and neighbors
			# ties on distance fall back to comparing ids, then labels
			sorted_distances = sorted(zip(distance_matrix, neighbor_id, self.y))
			for k_idx in range(self.k) :
				self.XTestNeighborsDist[test_idx, k_idx], self.XTestNeighborsId[test_idx, k_idx], self.XTestNeighborsY[test_idx, k_idx] = sorted_distances[k_idx]
# Pairwise distance and shape features between the two question embedding
# matrices (NaNs zeroed first via np.nan_to_num).
# NOTE(review): Python 2 script (cPickle); `data` and the vector arrays are
# defined earlier in the file.
data['cosine_distance'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

data['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

data['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

data['canberra_distance'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

data['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

data['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

data['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

# per-vector distribution-shape features
data['skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)]
data['skew_q2vec'] = [skew(x) for x in np.nan_to_num(question2_vectors)]
data['kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(question1_vectors)]
data['kur_q2vec'] = [kurtosis(x) for x in np.nan_to_num(question2_vectors)]

# persist the raw embeddings for reuse (protocol -1 = highest available)
# NOTE(review): the file handles from open() are never closed explicitly
cPickle.dump(question1_vectors, open('data/q1_w2v.pkl', 'wb'), -1)
cPickle.dump(question2_vectors, open('data/q2_w2v.pkl', 'wb'), -1)

data.to_csv('data/quora_features.csv', index=False)
Example #47
0
def minkowski_distance(a,b,p):
    """Print and return the order-``p`` Minkowski distance between a and b.

    The original used the Python 2 ``print`` statement (a SyntaxError under
    Python 3) and returned nothing; modernised to the print() function and
    made to return the value — consistent with the sibling
    ``minkowski_distance`` that returns the distance.
    """
    d = distance.minkowski(a, b, p)
    print(d)
    return d
# Notebook-exported script: load a vector table and compare its first two
# rows with Minkowski distances of order 2 and 3.
# In[4]:

import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
from scipy.spatial import distance


# In[9]:

# NOTE(review): hard-coded absolute path to the author's machine
df=pd.read_table('/Users/Li/Google Drive/UIUC_Study/2015Fall/CS412/HW/HW1/data/vectors.txt')


# In[15]:

# first two data rows, skipping the leading (id/label) column
# NOTE(review): DataFrame.ix is deprecated/removed in modern pandas
x = np.array(df.ix[0,1:])
y = np.array(df.ix[1,1:])


# In[19]:

distance.minkowski(x,y,2)


# In[21]:

distance.minkowski(x,y,3)

    ax1.axes.get_yaxis().set_visible(False)
    pl.savefig(pdb1+'vs'+pdb2+'Isomap.png',dpi=250)
    pl.show()
    pl.clf()
    
    print TM_align['seqA']
    print TM_align['matchs']
    print TM_align['alignedList1']
    print 'Aligned positions1:\n', ','.join(i[3:] for i in TM_align['alignedList1'])
    print TM_align['alignedList2']
    print 'Aligned positions2:\n', ','.join(i[3:] for i in TM_align['alignedList2'])

    
    permu1,stdpermu1 = get_permutation(neojamming1, TM_align['alignedList1'])
    permu2,stdpermu2 = get_permutation(neojamming2, TM_align['alignedList2'])
    pl.title('Conformer ensemble distance: %.2f'%minkowski(stdpermu1,stdpermu2,pnorm))
    pl.scatter(stdpermu1,stdpermu2,marker='o',s=55,facecolor='0.6',edgecolor='b')
    pl.xlim(0,len(stdpermu1))
    pl.ylim(0,len(stdpermu1))
    pl.savefig(pdb1+'vs'+pdb2+'.png', bbox_inches='tight',dpi=250)
    pl.show()
    
    
    print 'permu:\n',','.join([str(i) for i in permu1])
    print 'std permu:\n',','.join([str(i) for i in stdpermu1])
    print 'permu:\n',','.join([str(i) for i in permu2])
    print 'std permu:\n',','.join([str(i) for i in stdpermu2])
    
    print 'Conformation ensambles at distance:\t', minkowski(stdpermu1,stdpermu2,pnorm)
    fpermus = open(pdb1+'vs'+pdb2+'.txt','w')
    fpermus.write('%s\n%s\n' % (pdb1,','.join(map(str,stdpermu1))) )
Example #50
0
def wvMkow(a):
    """Euclidean (Minkowski p=2) distance for each (vec1, vec2) pair in ``a``."""
    results = []
    for pair in a:
        first, second = pair[0], pair[1]
        results.append(distance.minkowski(first, second, 2))
    return results
Example #51
0
    def process_input(self,input_np_array):
        """Present one input to the growing SOM and return the BMU's coords.

        1) Find the BMU and bump its hit counter / timestamp.
        2) Pull every neuron within the neighbourhood radius towards the
           input with a Gaussian-weighted update, accumulating residual
           error on each.
        3) If the BMU's residual error exceeds the growth threshold, create
           (or reinforce) its four grid neighbours, initialising new weights
           by the type-b/a/c schemes with a 0.5-filled fallback.
        """
        bmu = self.getBMU(input_np_array)
        bmu.hits += 1
        bmu.time = self.count
        for neu in self.map_neurons.values():
            nhash = str(neu.x_c)+""+str(neu.y_c)
           # print "bmu: "+str(bmu.coords())
            #print "neu: "+str(neu.coords())nhash
            # grid distance between the BMU and this neuron
            dist =  minkowski(bmu.coords().astype(float), neu.coords().astype(float), 2)
            if dist< self.nr:
                '''weight adjustment *np.exp(-1*dist**2/2*self.nr**2)'''
                #neu.weight_vs = neu.weight_vs + self.lr * (input_np_array-neu.weight_vs)
                # Gaussian neighbourhood kernel scales the learning step
                neu.weight_vs = neu.weight_vs + self.lr *np.exp(((dist/self.nr)**2)/(-2))* self.adjustment_gaus(input_np_array,neu.weight_vs)
                err =self.gaussian_error(input_np_array,neu.weight_vs)
                neu.res_err += err#minkowski(neu.weight_vs, bmu.weight_vs, 2)
                self.map_neurons[nhash]=neu




        if bmu.res_err > self.thresh:
            #print bmu.res_err
            # grow the map around the over-stressed BMU
            neu = bmu
            down=str(neu.x_c)+str(int(neu.y_c)-1)
            up=str(neu.x_c)+str(int(neu.y_c)+1)
            left=str(int(neu.x_c)-1)+str(neu.y_c)
            right=str(int(neu.x_c)+1)+str(neu.y_c)
            nei_coords = np.array([down, up , left , right ] )
            nei_coordi = np.array([[(neu.x_c),(int(neu.y_c)-1)], [(neu.x_c),(int(neu.y_c)+1)], [(int(neu.x_c)-1),(neu.y_c)], [(int(neu.x_c)+1),int(neu.y_c)]] )
            p =0
            for coord in nei_coords:
                n=None
                try:
                    # existing neighbour: amplify its residual error
                    n= self.map_neurons[coord]
                    n.res_err+=self.fd*n.res_err

                except KeyError:
                    # missing neighbour: spawn a new neuron at that grid slot
                    nwron=neuron(nei_coordi[p][0], nei_coordi[p][1], self.dim)
                    nwron.time=self.t_time
                    new_weight = 0
                    #case a) new node has two consecutive nodes on one of its sides
                    #tiroshan and lakmal please implement the code here
                    #case b) between two old nodes
                    new_weight_b = self.type_b_weight_init(p,neu)
                    new_weight_a = self.type_a_weight_init(p,neu)
                    new_weight_c = self.type_c_weight_init(p,neu)

                    # prefer type-b, then type-a, then type-c init; fall back
                    # to a 0.5-filled vector when all schemes return zeros
                    if new_weight_b.all() ==0:
                        if new_weight_a.all() == 0:
                            if new_weight_c.all() == 0:
                                #print "c==0"
                                new_weight = np.ndarray(shape=(self.dim))
                                new_weight.fill(0.5)
                            else:
                                new_weight = new_weight_c
                        else:
                            new_weight = new_weight_a
                    else:
                        new_weight = new_weight_b

                    # nwron.weight_vs=np.ndarray(shape=(self.dim))
                    # nwron.weight_vs.fill(0.5)
                    nwron.weight_vs = new_weight
                    n=nwron
                self.map_neurons[coord]=n
                p+=1
            # halve the BMU's stress after growth
            bmu.res_err=self.thresh/2
            self.map_neurons[str(bmu.x_c)+""+str(bmu.y_c)]=bmu
        return bmu.coords()
Example #52
0
def minkowski(xy):
    """Minkowski distances of orders p=2..5 between the two vectors in ``xy``.

    The original used Python 2 tuple-parameter unpacking
    (``def minkowski((x, y))``), which is a SyntaxError in Python 3; the
    pair is now unpacked explicitly.  Callers still pass a single 2-tuple.
    """
    x, y = xy
    return [distance.minkowski(x, y, p) for p in range(2, 6)]
Example #53
0
        hists = []
    for img in imgs:
        hist = [0 for i in range(64)]            
        for i in range(len(img)):
            for j in range(len(img[0])):
                if random.random() > 0.0625: # sample 1/16 pixels for efficiency
                    continue
                b1, b2, b3 = [c/64 for c in img[i][j]]
                idx = b1 * 16 + b2 * 4 + b3
                hist[idx] += 1
        hists.append(np.array(hist))
    selected.append(img_names[0])
    for i in range(1, len(hists)):
        hist1 = hists[i - 1]
        hist2 = hists[i]
        diff= minkowski(hist1, hist2, 1)
        print diff
        if diff > 128000.0/(640 * 480) * (160 * 120): # threshold
            selected.append(img_names[i])
        


# In[63]:

# Copy every selected key frame out of the iframe directory by shelling out
# to `cp` per file.
if not os.path.exists(args.keyframes_dir):
    os.mkdir(args.keyframes_dir)
for img in selected:
    # NOTE(review): the existence check uses args.keyframes_dir but the copy
    # uses a bare keyframes_dir variable — confirm the two refer to the same
    # directory.
    os.system("cp {1}/{0} {2}/{0}".format(img, iframe_dir, keyframes_dir))


# In[64]:
Example #54
0
def getlabels(data, centroids, norm=2):
    """Assign every datapoint to its nearest centroid.

    For each row of `data`, returns the index of the row of `centroids`
    at minimal Minkowski distance of order `norm` (default: Euclidean).
    """
    n_centroids = shape(centroids)[0]
    labels = []
    for row in range(shape(data)[0]):
        # Distance from this datapoint to every centroid.
        dists = [minkowski(data[row, :], centroids[c, :], norm)
                 for c in range(n_centroids)]
        labels.append(argmin(dists))
    return array(labels)
    # NOTE(review): orphaned fragment — the `def` line of the function this
    # body belongs to was lost in extraction, so these statements currently
    # sit as unreachable code after the preceding function's return.
    # Python 2 syntax (print statements). It appears to align two PDB
    # structures, compute residue permutations, plot them against each
    # other, and report a Minkowski distance between conformer ensembles —
    # TODO recover the original signature (pdb1, pdb2, chains, offsets?).
    #HERE structures must have only atoms of selected chain
    # NOTE(review): offsets are passed as (offset2, offset1), reversed
    # relative to (pdb1, pdb2) — verify against the helper's signature.
    TM_align = rcu.TM_aligned_residues(pdb1,pdb2,offset2, offset1)
    
    # Per-structure analysis via the external neoJAMMING helper.
    neojamming1 = neoj.neoJAMMING(pdb1,chain1)
    neojamming2 = neoj.neoJAMMING(pdb2,chain2)
    
    print TM_align['seqA']
    print TM_align['matchs']
    print TM_align['alignedList1']
    print 'Aligned positions1:\n', ','.join(i[3:] for i in TM_align['alignedList1'])
    print TM_align['alignedList2']
    print 'Aligned positions2:\n', ','.join(i[3:] for i in TM_align['alignedList2'])

    # Residue permutations restricted to the aligned positions.
    permu1,stdpermu1 = get_permutation(neojamming1, TM_align['alignedList1'])
    permu2,stdpermu2 = get_permutation(neojamming2, TM_align['alignedList2'])
    # Scatter plot of the two permutations, titled with their Minkowski
    # distance of order PNORM (a constant defined elsewhere in the file).
    pl.title('Conformer ensemble distance: %.2f'%minkowski(stdpermu1,stdpermu2,PNORM))
    pl.scatter(stdpermu1,stdpermu2,marker='o',s=55,facecolor='0.6',edgecolor='b')
    pl.xlim(0,len(stdpermu1))
    pl.ylim(0,len(stdpermu1))
    pl.savefig(pdb1+'vs'+pdb2+'.png', bbox_inches='tight',dpi=250)

    
    
    print 'permu:\n',','.join([str(i) for i in permu1])
    print 'std permu:\n',','.join([str(i) for i in stdpermu1])
    print 'permu:\n',','.join([str(i) for i in permu2])
    print 'std permu:\n',','.join([str(i) for i in stdpermu2])
    
    print 'Conformation ensambles at distance:\t', minkowski(stdpermu1,stdpermu2,PNORM)
    print 'Spearman rank-order correlation coefficient and p-value', spearmanr(permu1,permu2)
    
Example #56
0
 def centroidscore(datapoint, centroids, pheromone, beta):
     """Pheromone-weighted inverse Minkowski distance from `datapoint`
     to every centroid row; higher score means closer/more attractive."""
     dists = []
     for row in range(shape(centroids)[0]):
         dists.append(minkowski(datapoint, centroids[row, :], beta))
     # Element-wise: pheromone / distance for each centroid.
     return pheromone * (1.0 / array(dists))
Example #57
0
def minkowski_distance(a,b,p):
    """Minkowski distance of order `p` between vectors `a` and `b`
    (thin wrapper around scipy's implementation)."""
    dist_value = distance.minkowski(a, b, p)
    return dist_value
Example #58
0
import numpy as np
import pylab as pl
import scipy.spatial.distance as dist
def plotSamples(x, y, z=None):
    """Scatter the (x, y) samples plus two fixed reference points,
    optionally applying the linear transform `z` to everything first,
    on a fixed square [-5, 5] x [-5, 5] frame."""
    marker_pts = np.matrix([[3., -2., 0.], [3., 2., 0.]])
    if z is not None:
        # Transform samples and reference points alike.
        x, y = z * np.matrix([x, y])
        marker_pts = z * marker_pts
    pl.scatter(x, y, s=10)  # the random sample cloud
    pl.scatter(np.array(marker_pts[0]), np.array(marker_pts[1]),
               s=200, marker='*', color='r')  # reference points as red stars
    pl.axhline(linewidth=2, color='g')  # horizontal axis line
    pl.axvline(linewidth=2, color='g')  # vertical axis line
    pl.axis('equal')
    pl.axis([-5, 5, -5, 5])
    pl.show()
# Generate Gaussian-distributed random points.
mean = [0, 0] # mean vector
cov = [[2, 1], [1, 2]] # covariance matrix
x, y = np.random.multivariate_normal(mean, cov, 1000).T
plotSamples(x, y)
covMat = np.matrix(np.cov(x, y)) # sample covariance matrix of x and y
Z = np.linalg.cholesky(covMat).I # whitening transform from the inverse Cholesky factor
plotSamples(x, y, Z)
# Mahalanobis distance from the origin to two fixed points
# (Python 2 print statements; the literals are Chinese UI text).
print '\n到原点的马氏距离分别是'
print dist.mahalanobis([0,0], [3,3], covMat.I), dist.mahalanobis([0,0], [-2,2], covMat.I)
# Euclidean (Minkowski p=2) distance to the origin after the whitening
# transform — intended to match the Mahalanobis distances above.
dots = (Z * np.matrix([[3, -2, 0], [3, 2, 0]])).T
print '\n变换后到原点的欧几里得距离分别是:'
print dist.minkowski([0, 0], np.array(dots[0]), 2), dist.minkowski([0, 0], np.array(dots[1]), 2)
Example #59
0
def mixed_dist(x, y): 
    """Euclidean (Minkowski p=2) distance where the LAST element of each
    vector is treated as a categorical feature.

    If the categorical values differ, the last coordinates are replaced by
    the sentinel pair (10000, 0) so that dimension contributes a difference
    of 10000 (roughly the scale of the other features); if they agree, it
    contributes nothing.

    Fix: the original mutated the caller's sequences in place via
    ``x[-1] = ...``; the inputs are now copied first so callers' data is
    never modified.
    """
    x = list(x)
    y = list(y)
    if x[-1] != y[-1]: 
        x[-1], y[-1] = 10000, 0  # arbitrary, about same scale as other
    else: 
        x[-1], y[-1] = 0, 0
    return minkowski(x, y, 2)