def __calc_distances__(self, v1s, v2s, is_sparse=True):
    """Compute pairwise distance features between two aligned vector lists.

    Parameters
    ----------
    v1s, v2s : sequence
        Aligned collections; element i of ``v1s`` is compared with element i
        of ``v2s``.  When ``is_sparse`` is True each element must expose
        ``toarray()`` (scipy sparse row); otherwise elements are dense 1-D
        arrays.
    is_sparse : bool
        Whether the inputs need densifying first.

    Returns
    -------
    numpy.ndarray of shape (n, 8)
        Columns: cosine, cityblock, canberra, euclidean, minkowski(p=3),
        braycurtis, |skew diff|, |kurtosis diff|.
    """
    # Densify (and flatten to 1-D, as scipy's distance functions require)
    # once up front so a single code path serves both input kinds — the
    # original duplicated every computation across two branches.
    if is_sparse:
        v1s = [x.toarray().ravel() for x in v1s]
        v2s = [x.toarray().ravel() for x in v2s]

    pairs = list(zip(v1s, v2s))

    def col(fn):
        # One (n, 1) feature column per metric.
        return np.array([fn(x, y) for x, y in pairs]).reshape((-1, 1))

    dcosine = col(cosine)
    dcityblock = col(cityblock)
    dcanberra = col(canberra)
    deuclidean = col(euclidean)
    dminkowski = col(lambda x, y: minkowski(x, y, 3))
    dbraycurtis = col(braycurtis)

    dskew_diff = np.abs(np.array([skew(x) for x in v1s]) -
                        np.array([skew(x) for x in v2s])).reshape((-1, 1))
    dkur_diff = np.abs(np.array([kurtosis(x) for x in v1s]) -
                       np.array([kurtosis(x) for x in v2s])).reshape((-1, 1))

    return np.hstack((dcosine, dcityblock, dcanberra, deuclidean,
                      dminkowski, dbraycurtis, dskew_diff, dkur_diff))
def compute_Correlation_matrix(patients_tri, triclusters):
    """Build a (patient x tricluster) similarity matrix.

    For every patient/tricluster pair, slices along each sample and each
    time point of every tricluster member are compared with
    ``1 - canberra`` and the scores are averaged into one cell.
    """
    print(len(patients_tri))
    matrix = []
    for patient in patients_tri:
        row = []
        for tric in triclusters:
            scores = []
            for member in tric.getPatients():
                times = tric.getTimes()
                samples = tric.getSamples()
                # Compare the patient's slice with the member's slice,
                # first per sample, then per time point.
                for sample in samples:
                    scores.append(
                        1 - distance.canberra(
                            patient.getSlice(c=sample)[:len(times)],
                            tric.getSlice(g=member, c=sample)))
                for time in times:
                    scores.append(
                        1 - distance.canberra(
                            patient.getSlice(t=time)[:len(samples)],
                            tric.getSlice(g=member, t=time)))
            row.append(stat.mean(scores))
        matrix.append(row)

    return matrix
 def calculateL2(self, feat1, feat2, c_type='euclidean'):
     """Return the distance between two equal-shaped feature vectors.

     c_type selects the metric: 'cosine', 'euclidean', 'correlation',
     'braycurtis', 'canberra' or 'chebyshev'.  Raises ValueError for an
     unknown metric (the original fell through to ``return s_d`` and hit
     an UnboundLocalError instead).
     """
     assert np.shape(feat1) == np.shape(feat2)
     # len_ is kept only for debugging / weighting experiments; the
     # euclidean branch currently uses a unit weight.
     if config.insight:
         [
             len_,
         ] = np.shape(feat1)
     else:
         _, len_ = np.shape(feat1)
     metrics = {
         "cosine": distance.cosine,
         "euclidean": lambda a, b: distance.euclidean(a, b, w=1),
         "correlation": distance.correlation,
         "braycurtis": distance.braycurtis,
         "canberra": distance.canberra,
         "chebyshev": distance.chebyshev,
     }
     try:
         metric = metrics[c_type]
     except KeyError:
         raise ValueError(f"unknown c_type: {c_type!r}")
     return metric(feat1, feat2)
# Example #4
def distance_features(data, genismModel):
    """Append word2vec distance/moment columns to ``data`` (question pairs).

    Returns the augmented frame together with the list of new column names.
    """
    q1_vecs = np.array([sent2vec(q, genismModel) for q in data.question1])
    q2_vecs = np.array([sent2vec(q, genismModel) for q in data.question2])

    # sent2vec yields a scalar-sized result for empty/unknown sentences;
    # replace those entries with a 300-dim zero vector.
    zero_vec = np.zeros(300)
    for vecs in (q1_vecs, q2_vecs):
        for idx in range(len(vecs)):
            if vecs[idx].size == 1:
                vecs[idx] = zero_vec

    data['cosine_distance'] = [cosine(x, y) for x, y in zip(q1_vecs, q2_vecs)]
    data['cityblock_distance'] = [cityblock(x, y) for x, y in zip(q1_vecs, q2_vecs)]
    data['jaccard_distance'] = [jaccard(x, y) for x, y in zip(q1_vecs, q2_vecs)]
    data['canberra_distance'] = [canberra(x, y) for x, y in zip(q1_vecs, q2_vecs)]
    data['euclidean_distance'] = [euclidean(x, y) for x, y in zip(q1_vecs, q2_vecs)]
    data['minkowski_distance'] = [minkowski(x, y, 3) for x, y in zip(q1_vecs, q2_vecs)]
    data['braycurtis_distance'] = [braycurtis(x, y) for x, y in zip(q1_vecs, q2_vecs)]
    data['skew_q1vec'] = [skew(x) for x in q1_vecs]
    data['skew_q2vec'] = [skew(x) for x in q2_vecs]
    data['kur_q1vec'] = [kurtosis(x) for x in q1_vecs]
    data['kur_q2vec'] = [kurtosis(x) for x in q2_vecs]

    feature_names = ['cosine_distance', 'cityblock_distance', 'jaccard_distance',
                     'canberra_distance', 'euclidean_distance', 'minkowski_distance',
                     'braycurtis_distance', 'skew_q1vec', 'skew_q2vec',
                     'kur_q1vec', 'kur_q2vec']
    return data, feature_names
# Example #5
def feature3(data):
    """Append word2vec distance and moment features for question1/question2.

    Embeds both question columns with ``sent2vec`` and adds one column per
    distance metric plus per-vector skew and kurtosis.
    """
    question1_vectors = np.zeros((data.shape[0], 300))
    for i, q in tqdm(enumerate(data.question1.values)):
        question1_vectors[i, :] = sent2vec(q)

    question2_vectors = np.zeros((data.shape[0], 300))
    for i, q in tqdm(enumerate(data.question2.values)):
        question2_vectors[i, :] = sent2vec(q)

    # Sanitize once instead of re-running nan_to_num for every feature
    # (the original recomputed it 14 times).  Also dropped the unused
    # ``error_count`` local.
    q1 = np.nan_to_num(question1_vectors)
    q2 = np.nan_to_num(question2_vectors)

    data['cosine_distance'] = [cosine(x, y) for x, y in zip(q1, q2)]
    data['cityblock_distance'] = [cityblock(x, y) for x, y in zip(q1, q2)]
    data['jaccard_distance'] = [jaccard(x, y) for x, y in zip(q1, q2)]
    data['canberra_distance'] = [canberra(x, y) for x, y in zip(q1, q2)]
    data['euclidean_distance'] = [euclidean(x, y) for x, y in zip(q1, q2)]
    data['minkowski_distance'] = [minkowski(x, y, 3) for x, y in zip(q1, q2)]
    data['braycurtis_distance'] = [braycurtis(x, y) for x, y in zip(q1, q2)]

    data['skew_q1vec'] = [skew(x) for x in q1]
    data['skew_q2vec'] = [skew(x) for x in q2]
    data['kur_q1vec'] = [kurtosis(x) for x in q1]
    data['kur_q2vec'] = [kurtosis(x) for x in q2]
    return data
def computeDistance(X, Y, method):
    """Return a distance between descriptors X and Y, chosen by substring
    matching on ``method``.

    NOTE(review): matching is substring-based, so order matters — 'chi2'
    and 'chi3' must stay before plain 'chi'.  A ``method`` that matches
    none of the names is passed straight to ``cv2.compareHist``.
    """
    if 'cosine' in method:
        dist = spdistance.cosine(X, Y)
    elif 'dot' in method:
        # 1 - dot product; presumably X and Y are unit-normalized so this
        # behaves like a cosine distance — confirm with callers.
        dist = 1.0 - X.dot(Y)
    elif 'chi2' in method:
        dist = chiSquare2(X, Y)
    elif 'chi3' in method:
        dist = chiSquare3(X, Y)
    elif 'chi' in method:
        dist = chiSquare(X, Y)
    elif 'euclidean' in method:
        dist = cv2.norm(X, Y)
    elif 'canberra' in method:
        dist = spdistance.canberra(X, Y)
    elif 'correl' in method:
        dist = spdistance.correlation(X, Y)
    else:
        # Fallback: let OpenCV interpret ``method`` as a histogram
        # comparison flag.  (Original author was unsure: "does that work?")
        dist = cv2.compareHist(X, Y, method)

    # Correlation / intersection comparisons yield similarities, not
    # distances, so flip them.  The hasattr guards cover both the old
    # (cv2.cv.*) and new (cv2.HISTCMP_*) OpenCV constant layouts.
    if hasattr(cv2, 'cv') and 'cv2.cv.CV_COMP_CORREL' in method:
        dist = 1 - dist
    elif hasattr(cv2, 'HISTCMP_CORREL') and 'cv2.HISTCMP_CORREL' in method:
        dist = 1 - dist
    elif hasattr(cv2, 'cv') and 'cv2.cv.CV_COMP_INTERSECT' in method:
        dist = 1 - dist
    elif hasattr(cv2,
                 'HISTCMP_INTERSECT') and 'cv2.HISTCMP_INTERSECT' in method:
        dist = 1 - dist

    return dist
# Example #7
def extend_with_features(data):
    """Extend the question-pair frame with fuzzy-match, WMD and word2vec
    distance features.

    Loads the Google News word2vec model twice on purpose: once raw (for
    ``wmd``) and once L2-normalised in place via ``init_sims`` (for
    ``norm_wmd``) — ``init_sims(replace=True)`` destroys the raw vectors.
    """
    # Removed the unused ``stop_words`` local from the original.
    data['fuzz_qratio'] = data.apply(
        lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])),
        axis=1)
    data['fuzz_WRatio'] = data.apply(
        lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])),
        axis=1)

    model = gensim.models.KeyedVectors.load_word2vec_format(
        google_news_model_path, binary=True)
    data['wmd'] = data.apply(
        lambda x: wmd(model, x['question1'], x['question2']), axis=1)

    norm_model = gensim.models.KeyedVectors.load_word2vec_format(
        google_news_model_path, binary=True)
    norm_model.init_sims(replace=True)
    data['norm_wmd'] = data.apply(
        lambda x: norm_wmd(norm_model, x['question1'], x['question2']), axis=1)

    question1_vectors = np.zeros((data.shape[0], 300))
    for i, q in enumerate(data.question1.values):
        question1_vectors[i, :] = sent2vec(model, q)

    question2_vectors = np.zeros((data.shape[0], 300))
    for i, q in enumerate(data.question2.values):
        question2_vectors[i, :] = sent2vec(model, q)

    question1_vectors = np.nan_to_num(question1_vectors)
    question2_vectors = np.nan_to_num(question2_vectors)

    # One loop instead of seven near-identical list comprehensions.
    metrics = [
        ('cosine_distance', cosine),
        ('cityblock_distance', cityblock),
        ('jaccard_distance', jaccard),
        ('canberra_distance', canberra),
        ('euclidean_distance', euclidean),
        ('minkowski_distance', lambda x, y: minkowski(x, y, 3)),
        ('braycurtis_distance', braycurtis),
    ]
    for column, metric in metrics:
        data[column] = [metric(x, y)
                        for x, y in zip(question1_vectors, question2_vectors)]

    data['skew_q1vec'] = [skew(x) for x in question1_vectors]
    data['skew_q2vec'] = [skew(x) for x in question2_vectors]
    data['kur_q1vec'] = [kurtosis(x) for x in question1_vectors]
    data['kur_q2vec'] = [kurtosis(x) for x in question2_vectors]
    return data
def vectors_features(in_data: pd.DataFrame,
                     sent2vec: Callable[[str], np.array]) -> pd.DataFrame:
    """Add vector-distance and moment columns for the two question columns."""
    assert "question1" in in_data.columns
    assert "question2" in in_data.columns
    q1 = np.array([sent2vec(text) for text in in_data['question1']])
    q2 = np.array([sent2vec(text) for text in in_data['question2']])

    # Pairwise distance columns, in the original insertion order.
    pairwise = (
        ('cos', cosine),
        ('jaccard', jaccard),
        ('euclidean', euclidean),
        ('minkowski', minkowski),
        ('cityblock', cityblock),
        ('canberra', canberra),
        ('braycurtis', braycurtis),
    )
    for column, metric in pairwise:
        in_data[column] = np.array([metric(a, b) for a, b in zip(q1, q2)])

    # Per-vector moments and their absolute differences.
    in_data['skew_q1'] = np.array([skew(v) for v in q1])
    in_data['skew_q2'] = np.array([skew(v) for v in q2])
    in_data['kur_q1'] = np.array([kurtosis(v) for v in q1])
    in_data['kur_q2'] = np.array([kurtosis(v) for v in q2])
    in_data['skew_diff'] = np.abs(in_data['skew_q1'] - in_data['skew_q2'])
    in_data['kur_diff'] = np.abs(in_data['kur_q1'] - in_data['kur_q2'])
    return in_data
# Example #9
def similarity_function(x, y):
    """ Similarity function for comparing user features.

    This actually really should be implemented in taar.similarity_recommender
    and then imported here for consistency.
    """
    def from_row(row, field, fallback):
        # A None field falls back to the supplied sentinel default.
        value = row[field]
        return fallback if value is None else value

    # Categorical features default to "" and continuous ones to 0 when
    # missing from the Row.
    x_cat = [from_row(x, key, "") for key in CATEGORICAL_FEATURES]
    y_cat = [from_row(y, key, "") for key in CATEGORICAL_FEATURES]
    x_cont = [float(from_row(x, key, 0)) for key in CONTINUOUS_FEATURES]
    y_cont = [float(from_row(y, key, 0)) for key in CONTINUOUS_FEATURES]

    # Larger distances indicate poorer matches.
    hamming_d = distance.hamming(x_cat, y_cat)
    canberra_d = distance.canberra(x_cont, y_cont)

    # Product of the two distances; the small constant keeps a zero
    # categorical distance from wiping out the continuous term.  ``.item()``
    # converts the NumPy scalar to a plain Python type so KDE on Spark 2.2.0
    # does not trip over SPARK-20803.
    return abs((canberra_d + 0.001) * hamming_d).item()
# Example #10
def get_w2v_simi(query, title):
    """Return a ':'-joined string of word2vec distance and moment features
    for a query/title pair."""
    q_vec = np.nan_to_num(sent2vec(query))
    t_vec = np.nan_to_num(sent2vec(title))

    # Order matters: callers split on ':' positionally.
    features = [
        cosine(q_vec, t_vec),
        cityblock(q_vec, t_vec),
        jaccard(q_vec, t_vec),
        canberra(q_vec, t_vec),
        euclidean(q_vec, t_vec),
        minkowski(q_vec, t_vec),
        braycurtis(q_vec, t_vec),
        skew(q_vec),
        skew(t_vec),
        kurtosis(q_vec),
        kurtosis(t_vec),
    ]
    return ':'.join('{}'.format(value) for value in features)
# Example #11
def kmeansClassify(A, means, distType = "euclidean"):
    """Assign each row of ``A`` to its nearest row of ``means``.

    Parameters
    ----------
    A : 2-D array of samples (one per row).
    means : 2-D array of cluster centers (one per row).
    distType : one of 'euclidean', 'cosine', 'canberra', 'manhattan',
        'correlation', 'hamming'.

    Returns
    -------
    (codes, errors) : pair of (n, 1) numpy matrices holding, per sample,
        the index of the closest mean and the distance to it.
    """
    metrics = {
        "euclidean": dist.euclidean,
        "cosine": dist.cosine,
        "canberra": dist.canberra,
        "manhattan": dist.cityblock,
        "correlation": dist.correlation,
        "hamming": dist.hamming,
    }
    metric = metrics[distType]

    codes_errors = []
    for i in range(A.shape[0]):
        # Track [index of best mean, distance to it].  ``sys.maxint`` no
        # longer exists in Python 3, so start from +infinity instead.
        best = [0, float("inf")]
        for j in range(means.shape[0]):
            newd = metric(A[i, :], means[j, :])
            if newd < best[1]:
                best = [j, newd]
        codes_errors.append(best)

    result = np.matrix(codes_errors)
    return (result[:, 0], result[:, 1])  # returns the codes and errors
# Example #12
def calculate_distance(X, Y, metric='euclidean'):
    """Dispatch to the scipy distance selected by ``metric``.

    Returns None for an unrecognised metric (behavior preserved from the
    original).  The METRIC_* constants are defined elsewhere in this module;
    the ``'euclidean'`` default only matches if METRIC_EUCLIDEAN equals
    that string — confirm against the constant definitions.
    """
    if metric == METRIC_EUCLIDEAN:
        return distance.euclidean(X, Y)
    elif metric == METRIC_JACCARD:
        return distance.jaccard(X, Y)
    elif metric == METRIC_CANBERRA:
        return distance.canberra(X, Y)
    elif metric == METRIC_CHEBYSHEV:
        return distance.chebyshev(X, Y)
    elif metric == METRIC_MINKOWSKI:
        return distance.minkowski(X, Y)
    elif metric == METRIC_WMINKOWSKI:
        # NOTE(review): scipy removed ``wminkowski`` in 1.8; on modern scipy
        # this branch raises AttributeError and needs a weighted minkowski.
        return distance.wminkowski(X, Y)
    elif metric == METRIC_BRAYCURTIS:
        return distance.braycurtis(X, Y)
    elif metric == METRIC_HAMMING:
        return distance.hamming(X, Y)
    elif metric == METRIC_MAHALANOBIS:
        # BUG(flagged): mahalanobis requires the inverse covariance matrix
        # as a third argument; as written this call always raises TypeError.
        return distance.mahalanobis(X, Y)
    elif metric == METRIC_MANHATTAN:
        # Same value as the original hand-rolled sum(|a - b|), via scipy.
        return distance.cityblock(X, Y)

    elif metric == METRIC_COSINE:
        # NOTE(review): this returns cosine *similarity*, not a distance —
        # kept as-is to preserve caller-visible behavior.
        dot_product = np.dot(X, Y)
        norm_a = np.linalg.norm(X)
        norm_b = np.linalg.norm(Y)
        return dot_product / (norm_a * norm_b)
def calculate_featureset4(dataframe, q1_vectors, q2_vectors):
    """Add distance and moment feature columns computed from the two
    question-vector arrays.

    ``q1_vectors``/``q2_vectors`` are aligned 2-D arrays (one vector per
    row); the frame gains one column per metric plus per-vector skew and
    kurtosis, and is returned.
    """
    # Sanitize NaNs once instead of recomputing nan_to_num for every
    # feature (the original did it 14 times over the same arrays).
    q1 = np.nan_to_num(q1_vectors)
    q2 = np.nan_to_num(q2_vectors)

    dataframe['cosine_dist'] = [cosine(x, y) for x, y in zip(q1, q2)]
    dataframe['cityblock_dist'] = [cityblock(x, y) for x, y in zip(q1, q2)]
    dataframe['jaccard_dist'] = [jaccard(x, y) for x, y in zip(q1, q2)]
    dataframe['canberra_dist'] = [canberra(x, y) for x, y in zip(q1, q2)]
    dataframe['euclidean_dist'] = [euclidean(x, y) for x, y in zip(q1, q2)]
    dataframe['minkowski_dist'] = [minkowski(x, y, 3) for x, y in zip(q1, q2)]
    dataframe['braycurtis_dist'] = [braycurtis(x, y) for x, y in zip(q1, q2)]
    dataframe['skew_q1'] = [skew(x) for x in q1]
    dataframe['skew_q2'] = [skew(x) for x in q2]
    dataframe['kurtosis_q1'] = [kurtosis(x) for x in q1]
    dataframe['kurtosis_q2'] = [kurtosis(x) for x in q2]
    return dataframe
    def features_similarity(cls, df):
        """Compute distance/moment features for the question vector pairs.

        Results are stored in ``cls.dict_features``; the raw (un-sanitized)
        vectors are returned for reuse by the caller.
        """
        cls.load_model(normed=True)
        question1_vectors, question2_vectors = cls.get_questions_vector(df)
        cls.resetmodel()

        # Sanitize once; the original re-ran nan_to_num for every feature.
        q1 = np.nan_to_num(question1_vectors)
        q2 = np.nan_to_num(question2_vectors)

        # One loop instead of seven near-identical blocks; progress output
        # is preserved verbatim.
        named_metrics = [
            ('Cosine', 'cosine_distance', cosine),
            ('Cityblock', 'cityblock_distance', cityblock),
            ('Jaccard', 'jaccard_distance', jaccard),
            ('Canberra', 'canberra_distance', canberra),
            ('Euclidean', 'euclidean_distance', euclidean),
            ('Minkowski', 'minkowski_distance', lambda x, y: minkowski(x, y, 3)),
            ('Braycurtis', 'braycurtis_distance', braycurtis),
        ]
        for step, (label, column, metric) in enumerate(named_metrics, start=1):
            cls.dict_features[column] = [metric(x, y) for x, y in zip(q1, q2)]
            print(f"{step}/11 {label} Distance finished.")

        cls.dict_features['skew_q1vec'] = [skew(x) for x in q1]
        print("8/11 Skew Q1 Vec finished.")
        cls.dict_features['skew_q2vec'] = [skew(x) for x in q2]
        print("9/11 Skew Q2 Vec finished.")
        cls.dict_features['kur_q1vec'] = [kurtosis(x) for x in q1]
        print("10/11 Kurtosis Q1 Vec finished.")
        cls.dict_features['kur_q2vec'] = [kurtosis(x) for x in q2]
        print("11/11 Kurtosis Q2 Vec finished.")
        return question1_vectors, question2_vectors
# Example #15
 def canberra(self):
     '''
     Canberra distance between two conn matrices.  The matrices are
     vectorized first.  (The previous docstring incorrectly said
     "Euclidean"; the code computes the Canberra distance.)
     '''
     vec1 = self._vectorize(self.conn1)
     vec2 = self._vectorize(self.conn2)
     return distance.canberra(vec1, vec2)
# Example #16
def main():
    """Demo entry point: print the Canberra distance of two fixed vectors."""
    u = [1, 2, 3, 4, 5]
    v = [2, 3, 4, 6, 8]
    print(distance.canberra(u, v))
def canb(m, n):
    """Canberra distance for each aligned pair of sparse rows in ``m`` and
    ``n``, rounded to two decimals."""
    values = [distance.canberra(m[i].toarray(), n[i].toarray())
              for i in range(len(m))]
    rounded = [round(value, 2) for value in values]
    print('canb')
    return rounded
# Example #18
def point_distance(point_a, point_b, type="graph", map="de_dust2"):
    """ Returns the distance between two points using a given method on a given map (if needed)

    Args:
        point_a: A list of floats or ints containing the position of point A
        point_b: A list of floats or ints containing the position of point B
        type: A string that is one of 'euclidean', 'manhattan', 'canberra', 'cosine' or 'graph'. Using 'graph' will use A* to find the shortest path and counts the discrete areas it travels.
        map: A string indicating the map
    """
    valid_maps = (
        "de_dust2",
        "de_cbble",
        "de_inferno",
        "de_mirage",
        "de_nuke",
        "de_overpass",
        "de_train",
        "de_vertigo",
    )
    if map not in valid_maps:
        raise ValueError(
            f'Invalid map name: got {map}, expected one of: "de_dust2", "de_cbble", "de_inferno", "de_mirage", "de_nuke", "de_overpass", "de_train", "de_vertigo"'
        )

    if type == "graph":
        # Shell out to the Go A* helper, which writes the area count to
        # stdout.
        helper_dir = os.path.join(os.path.dirname(__file__), "")
        command = [
            "go", "run", "path_distance.go",
            "-map", map,
            "-start_x", str(point_a[0]),
            "-start_y", str(point_a[1]),
            "-start_z", str(point_a[2]),
            "-end_x", str(point_b[0]),
            "-end_y", str(point_b[1]),
            "-end_z", str(point_b[2]),
        ]
        proc = subprocess.Popen(command, stdout=subprocess.PIPE, cwd=helper_dir)
        return int(proc.stdout.read())

    # Simple vector metrics; an unrecognised type falls through and
    # returns None, matching the original behavior.
    metrics = {
        "euclidean": distance.euclidean,
        "manhattan": distance.cityblock,
        "canberra": distance.canberra,
        "cosine": distance.cosine,
    }
    if type in metrics:
        return metrics[type](point_a, point_b)
# Example #19
def feats_tfidf(row):
    """Compute LSA-vector distance features for one question-pair row.

    Returns an 11-element list: cosine, cityblock, jaccard, canberra,
    euclidean, minkowski(p=3), braycurtis, then skew/kurtosis of each
    question vector.
    """
    que1 = str(row['question1'])
    que2 = str(row['question2'])

    def lsa_vector(text):
        # Project the bag-of-words through the (module-level) LSI model,
        # keeping only the component values.
        bow = dictionary.doc2bow(text.lower().split())
        return [value for (index, value) in lsi[bow]]

    que1_vec = lsa_vector(que1)
    que2_vec = lsa_vector(que2)

    # Truncate the longer vector so both share the same dimensionality.
    common = min(len(que1_vec), len(que2_vec))
    que1_vec = que1_vec[:common]
    que2_vec = que2_vec[:common]

    # cosine can fail (e.g. zero vectors); fall back to maximal distance.
    try:
        lsa_cosine = cosine(que1_vec, que2_vec)
    except:
        lsa_cosine = 1

    lsa_cityblock = cityblock(que1_vec, que2_vec)
    lsa_jaccard = jaccard(que1_vec, que2_vec)
    lsa_canberra = canberra(que1_vec, que2_vec)

    try:
        lsa_euclidean = euclidean(que1_vec, que2_vec)
    except:
        lsa_euclidean = np.nan

    lsa_minkowski = minkowski(que1_vec, que2_vec, 3)
    lsa_braycurtis = braycurtis(que1_vec, que2_vec)

    return [lsa_cosine, lsa_cityblock, lsa_jaccard, lsa_canberra,
            lsa_euclidean, lsa_minkowski, lsa_braycurtis,
            skew(que1_vec), kurtosis(que1_vec),
            skew(que2_vec), kurtosis(que2_vec)]
# Example #20
def canberraDist(h, b, bigram_vectorizer):
    """Canberra distance between the bigram vectors of ``h`` and ``b``,
    scaled down by 1000."""
    vectors = bigram_vectorizer.fit_transform([h, b]).toarray()
    return canberra(vectors[0, :], vectors[1, :]) / 1000
# Example #21
def canberraDist(h,b, bigram_vectorizer):
    """Return the Canberra distance of the two bigram vectors, divided by
    1000."""
    matrix = bigram_vectorizer.fit_transform([h, b]).toarray()
    first, second = matrix[0, :], matrix[1, :]
    return canberra(first, second) / 1000
# Example #22
def calc_nearest_to(nearest):
    """Return the 20 stored pictures closest (by Canberra distance) to
    picture ``nearest``.

    The selected picture is removed from the pool before comparison.
    Sorting and truncation now happen once after scoring: the original
    re-sorted and re-truncated the whole dict on every loop iteration,
    which produces the same final top-20 (a running top-20 equals the
    global top-20) at O(n^2 log n) cost.
    """
    pics = read_json()
    selected = pics.pop(str(nearest))
    scores = {key: canberra(selected, value) for key, value in pics.items()}
    ordered = sorted(scores.items(), key=lambda item: item[1])
    return dict(ordered[:20])
# Example #23
 def pair_coherence(self, word_i, word_j, metric=None):
     """Coherence of a word pair: ``1 - distance`` under the chosen metric,
     falling back to the embedding model's own similarity when ``metric``
     is None or unrecognised."""
     metrics = {
         "correlation": distance.correlation,
         "chebyshev": distance.chebyshev,
         "euclidean": distance.euclidean,
         "canberra": distance.canberra,
     }
     chosen = metrics.get(metric)
     if chosen is not None:
         return 1 - chosen(self.model[word_i], self.model[word_j])
     return self.model.similarity(word_i, word_j)
# Example #24
def similar(q1, q2):
    """Return (euclidean, cosine-distance, angle-in-degrees, canberra,
    correlation) between the sentence vectors of q1 and q2."""
    v1, v2 = sent2vec(q1, q2)
    cos = cosine(v1, v2)
    # Clamp into [-1, 1] before acos, exactly as the original's if/elif
    # did — scipy's cosine *distance* can exceed 1.
    clamped = max(-1, min(1, cos))
    deg = math.degrees(math.acos(clamped))
    return euclidean(v1, v2), cos, deg, canberra(v1, v2), correlation(v1, v2)
def get_distance_features(data, emb):
    """Append one distance column per metric for each (x, y) pair in ``emb``.

    NOTE(review): ``emb`` is iterated once per metric, so it must be a
    re-iterable sequence of vector pairs (a generator would be exhausted
    after the first column) — confirm with callers.
    """
    metric_columns = (
        ('cosine_distance', cosine),
        ('cityblock_distance', cityblock),
        ('jaccard_distance', jaccard),
        ('canberra_distance', canberra),
        ('euclidean_distance', euclidean),
        ('minkowski_distance', lambda a, b: minkowski(a, b, 3)),
        ('braycurtis_distance', braycurtis),
    )
    for column, metric in metric_columns:
        data[column] = pd.Series([metric(a, b) for a, b in emb])
    return data
# Example #26
def canberra_dist(user_predict, adoptable_dogs, images):
    '''
    Score each adoptable dog by Canberra distance to the user's prediction
    vector and return the scores alongside their image files.
    '''
    flat_user = user_predict.flatten()
    sim_score = [distance.canberra(flat_user, dog.flatten())
                 for dog in adoptable_dogs]
    print('Maximum SimScore: ' + str(max(sim_score)))
    return pd.DataFrame({'imgFile': images, 'SimScore': sim_score})
# Example #27
def feature_construct(city,
                      model_name,
                      friends,
                      walk_len=100,
                      walk_times=20,
                      num_features=128):
    '''Construct the per-pair feature file from a node2vec-style embedding.

    Reads the embedding CSV for (city, model_name, walk_len, walk_times,
    num_features), builds user pairs, and appends one row of distance
    features per pair to the corresponding ``.feature`` file.

    Args:
        city: city name used in the dataset paths
        model_name: embedding model identifier, e.g. 20_locid
        friends: friends list (asymmetric) [u1, u2]
        walk_len: walk length
        walk_times: walk times
        num_features: dimension of the embedding vectors
    Returns:
        None; output is written to the ``.feature`` file as a side effect.
    '''

    # Remove any stale feature file so the append-mode writes below start
    # from scratch.
    if os.path.exists('dataset/'+city+'/feature/'+city+'_'+model_name+'_'+\
                      str(int(walk_len))+'_'+str(int(walk_times))+'_'+str(int(num_features))+'.feature'):
        os.remove('dataset/'+city+'/feature/'+city+'_'+model_name+'_'+\
                  str(int(walk_len))+'_'+str(int(walk_times))+'_'+str(int(num_features))+'.feature')

    # Space-separated embedding file; the first line is a header to skip.
    emb = pd.read_csv('dataset/'+city+'/emb/'+city+'_'+model_name+'_'+\
                      str(int(walk_len))+'_'+str(int(walk_times))+'_'+str(int(num_features))+'.emb',\
                      header=None, skiprows=1, sep=' ')

    emb = emb.rename(columns={0: 'uid'})  # first column is the user id
    emb = emb.loc[emb.uid > 0]  # only take users, no loc_type, not necessary

    pair = pair_construct(emb.uid.unique(), friends)

    for i in range(len(pair)):
        u1 = pair.loc[i, 'u1']
        u2 = pair.loc[i, 'u2']
        label = pair.loc[i, 'label']

        # Embedding vector = every column except the uid column.
        u1_vector = emb.loc[emb.uid == u1, range(1, emb.shape[1])]
        u2_vector = emb.loc[emb.uid == u2, range(1, emb.shape[1])]

        # One feature row: ids, label, then eight vector distances.
        i_feature = pd.DataFrame([[
            u1, u2, label,
            cosine(u1_vector, u2_vector),
            euclidean(u1_vector, u2_vector),
            correlation(u1_vector, u2_vector),
            chebyshev(u1_vector, u2_vector),
            braycurtis(u1_vector, u2_vector),
            canberra(u1_vector, u2_vector),
            cityblock(u1_vector, u2_vector),
            sqeuclidean(u1_vector, u2_vector)
        ]])

        # Append-mode write: rows accumulate across loop iterations.
        i_feature.to_csv('dataset/'+city+'/feature/'+city+'_'+model_name+'_'+\
                         str(int(walk_len))+'_'+str(int(walk_times))+'_'+str(int(num_features))+'.feature',\
                         index = False, header = None, mode = 'a')
# Example #28
def plt_compare(one, two):
    """Plot two stored histograms and print their Canberra distance."""
    hists = read_json()
    first = hists[str(one)]
    second = hists[str(two)]
    plt.plot(first, color='red')
    similarity = canberra(first, second)
    print(f'length : {len(second)}')
    print(f'similarity : {similarity}')
    plt.plot(second)
    plt.show()
# Example #29
def calc_ROSA(fea0, fea1):
    """Compute ROSA comparison values between two feature dicts.

    ``fea0`` holds the features of the observation and ``fea1`` the
    features of the forecast. Returns a dict with the R (rainfall),
    O (offset), S (shape) and A (angle) components.
    """
    result = {}

    # R: rainfall ratio between observation and forecast.
    result['r_rate'] = dr(fea0['average_rainfall'], fea1['average_rainfall'])

    # O: centroid offset, forecast minus observation.
    result['x_offset'] = fea1['x_c'] - fea0['x_c']
    result['y_offset'] = fea1['y_c'] - fea0['y_c']

    # S: spread ratios, eccentricity difference and Hu-moment distance.
    result['sigma_x_rate'] = dr(fea0['sigma_x'], fea1['sigma_x'])
    result['sigma_y_rate'] = dr(fea0['sigma_y'], fea1['sigma_y'])
    result['ecc'] = fea1['eccentricity'] - fea0['eccentricity']
    # Canberra distance between Hu-moment vectors.
    # Other distances can be tried and tested.
    result['Hu_dist'] = distance.canberra(fea1['Hu'], fea0['Hu'])

    # A: signed angle between major axes — only meaningful when both
    # shapes are elongated enough (eccentricity above MIN_E).
    if (fea0['eccentricity'] < MIN_E) or (fea1['eccentricity'] < MIN_E):
        result['angle'] = 0.
    else:
        v0 = fea0['major_axis']
        v1 = fea1['major_axis']
        cos_theta = numpy.dot(v0, v1) / (numpy.linalg.norm(v0) * numpy.linalg.norm(v1))
        theta = numpy.arccos(cos_theta) / numpy.pi * 180
        sgn = numpy.sign(numpy.cross(v0, v1))
        result['angle'] = sgn * theta

        # The angle between two straight lines should be
        # bounded between -90 and +90.
        if result['angle'] > 90:
            result['angle'] = result['angle'] - 180
        elif result['angle'] < -90:
            result['angle'] = result['angle'] + 180

    return result
# Prepend the user's query so row 0 of each vectorized matrix is the query
# itself; comparing row 0 against every row then ranks all candidates.
# NOTE(review): `userinput`, `all_questions`, `data`, `QTvectorizer` and
# `ATvectorizer` are defined earlier in the script — presumably text
# vectorizers producing sparse matrices (hence .toarray() below); confirm.
all_questions=[userinput]+all_questions
all_answers=[d['answer'] for d in data]
all_answers=[userinput]+all_answers

QuestionTVectorArray=QTvectorizer.fit_transform(all_questions)
AnswerTVectorArray=ATvectorizer.fit_transform(all_answers)

#print "question cosine similairity-->",cosine_similarity(QuestionTVectorArray[0:1],QuestionTVectorArray)
#print "answer cosine similarity-->",cosine_similarity(AnswerTVectorArray[0:1],AnswerTVectorArray)
# Cosine similarity of the query row (row 0) against every row.
Qcosines=cosine_similarity(QuestionTVectorArray[0:1],QuestionTVectorArray)
Acosines=cosine_similarity(AnswerTVectorArray[0:1],AnswerTVectorArray)

# The scipy.spatial distances below need dense vectors, hence .toarray().
Qbray=[dist.braycurtis(QuestionTVectorArray[0].toarray(),u.toarray()) for u in QuestionTVectorArray]
Abray=[dist.braycurtis(AnswerTVectorArray[0].toarray(),u.toarray()) for u in AnswerTVectorArray]

Qcanberra=[dist.canberra(QuestionTVectorArray[0].toarray(),u.toarray()) for u in QuestionTVectorArray]
Acanberra=[dist.canberra(AnswerTVectorArray[0].toarray(),u.toarray()) for u in AnswerTVectorArray]

Qhamming=[dist.hamming(QuestionTVectorArray[0].toarray(),u.toarray()) for u in QuestionTVectorArray]
Ahamming=[dist.hamming(AnswerTVectorArray[0].toarray(),u.toarray()) for u in AnswerTVectorArray]

Qcorrelation=[dist.correlation(QuestionTVectorArray[0].toarray(),u.toarray()) for u in QuestionTVectorArray]
Acorrelation=[dist.correlation(AnswerTVectorArray[0].toarray(),u.toarray()) for u in AnswerTVectorArray]

Qcityblock=[dist.cityblock(QuestionTVectorArray[0].toarray(),u.toarray()) for u in QuestionTVectorArray]
Acityblock=[dist.cityblock(AnswerTVectorArray[0].toarray(),u.toarray()) for u in AnswerTVectorArray]

# NOTE(review): dice and yule are boolean-vector dissimilarities; applying
# them to real-valued vectors is questionable — confirm this is intended.
Qdice=[dist.dice(QuestionTVectorArray[0].toarray(),u.toarray()) for u in QuestionTVectorArray]
Adice=[dist.dice(AnswerTVectorArray[0].toarray(),u.toarray()) for u in AnswerTVectorArray]

Qyule=[dist.yule(QuestionTVectorArray[0].toarray(),u.toarray()) for u in QuestionTVectorArray]
def pairwise_compare(signature_vectors):
    """Print the Canberra distance between every unordered pair of graphs.

    Parameters
    ----------
    signature_vectors : sequence of 1-D numeric vectors, one per graph.

    Fix: the original used Python 2 ``print`` statements, which are syntax
    errors under Python 3 (the rest of this file already uses f-strings,
    i.e. Python 3). Output text is unchanged.
    """
    print("Pairwise Comparison of graphs ... ")
    for i in range(0, len(signature_vectors)):
        for j in range(i + 1, len(signature_vectors)):
            print("\t Distance between graph", i, "and graph ", j,
                  dis.canberra(signature_vectors[i], signature_vectors[j]))
    question1_vectors[i, :] = sent2vec(q)

# Embed every second question into a 300-dim vector via sent2vec.
# NOTE(review): 300 must match sent2vec's output dimension — confirm
# against the model loaded upstream. `question1_vectors`, `data`,
# `sent2vec` and `tqdm` are defined earlier in the script.
question2_vectors  = np.zeros((data.shape[0], 300))
for i, q in tqdm(enumerate(data.question2.values)):
    question2_vectors[i, :] = sent2vec(q)

# Pairwise distance features between the two question embeddings.
# nan_to_num zeroes out NaNs (e.g. from empty/out-of-vocabulary sentences)
# before each distance is computed.
data['cosine_distance'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

data['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

# NOTE(review): jaccard on real-valued vectors degenerates to a
# mismatch fraction — confirm this is the intended feature.
data['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

data['canberra_distance'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

data['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

# Minkowski with p=3.
data['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

data['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

# Per-vector shape statistics (skewness and kurtosis) as extra features.
data['skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)]
data['skew_q2vec'] = [skew(x) for x in np.nan_to_num(question2_vectors)]
data['kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(question1_vectors)]
data['kur_q2vec'] = [kurtosis(x) for x in np.nan_to_num(question2_vectors)]
# Example #33 (0)
def wvCanb(a):
    """Return the Canberra distance of each vector pair in *a*.

    Each element of *a* is an indexable pair ``(u, v)``; the result is
    the list of ``distance.canberra(u, v)`` values in the same order.
    """
    results = []
    for pair in a:
        results.append(distance.canberra(pair[0], pair[1]))
    return results
def canberra(pair):
    """Canberra distance between the two vectors in *pair*.

    Accepts a single ``(x, y)`` tuple, preserving the call signature of
    the original Python 2 definition ``def canberra((x, y))`` — tuple
    parameter unpacking was removed in Python 3 (PEP 3113), making the
    original a SyntaxError.
    """
    x, y = pair
    return distance.canberra(x, y)