Example #1
def jaccardSimilarity(term1, term2):
	term1Set = set(term1)
	term2Set = set(term2)
	unionSet = list(term1Set.union(term2Set))
	a = [1 if x in term1Set else 0 for x in unionSet]
	b = [1 if x in term2Set else 0 for x in unionSet]  # fixed: originally compared against term1Set twice
	return jaccard(a, b)
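A minimal usage sketch for the function above (the import and the sample words are assumptions for illustration, not part of the original snippet); note that despite its name the function returns SciPy's Jaccard distance, not a similarity:

from scipy.spatial.distance import jaccard

# set("night") and set("nacht") share {'n', 'h', 't'}: 3 of the 7 characters
# in the union, so the reported distance is 1 - 3/7.
print(jaccardSimilarity("night", "nacht"))  # ~0.571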
Example #2
    def take_hits(self, cat_data, mid_dict, N, randomized=False, movies=None):

        N = N + 1  # The most similar element is itself.

        if movies is None:
            movies = mid_dict.keys()

        m = len(movies)
        total = 0
        jac_list = []
        for movid in movies:

            jac = 0  # Jaccard score of the movie
            mov_genre = self.cat_mat[movid]
            if randomized:  # Random test, to verify the significance of results.
                neighbors = random.sample(xrange(1, 3417), N)
            else:
                neighbors = set(self.sim_dict[movid][1:N])
            for neighbor in neighbors:
                n_genre = self.cat_mat[neighbor]
                jac = jac + (1 - jaccard(mov_genre, n_genre))  # similarity

            jac = jac / N  # Avg Jaccard of the movie.
            jac_list.append(jac)  # for the histogram

            total = total + jac

        total = total / m  # Avg Jaccard over all movies.
        return (total, jac_list)
Example #3
def compute_jaccard(x, y, x_min=0.0, x_max=1.0, y_min=0.0, y_max=1.0,
                    warn_uneven=True, limit_tolerance=4, disable_checks=False):
    """Calculate the Jaccard index (Jaccard similarity coefficient).

    The Jaccard coefficient measures similarity between sample sets, and is
    defined as the size of the intersection divided by the size of the union of
    the sample sets. The Jaccard coefficient can be calculated for a subset of
    the provided rasters by using the threshold arguments.

    Min and max values must be provided for both input arrays x and y. The
    method can be used with arrays of any value range, but the defaults
    [0.0, 1.0] are geared towards comparing Zonation rank priority rasters.
    Limits provided are inclusive.

    :param x: ndarray object.
    :param y: ndarray object.
    :param x_min: numeric minimum threshold value for x to be used
                  (default 0.0).
    :param x_max: numeric maximum threshold value for x to be used
                  (default 1.0).
    :param y_min: numeric minimum threshold value for y to be used
                  (default 0.0).
    :param y_max: numeric maximum threshold value for y to be used
                  (default 1.0).
    :param warn_uneven: boolean indicating whether a warning is raised if the
                        compared raster coverages are very (>20x) uneven.
    :param limit_tolerance: integer defining the precision to which the x and
                            y limits are rounded. This helps e.g. with values
                            that are close to 0 but not quite 0 (default: 4,
                            i.e. round(x, 4)).
    :param disable_checks: boolean indicating whether the input limit values
                           are checked against the actual raster values in x
                           and y.

    :return: numeric value in [0, 1].
    """
    if not disable_checks:
        assert x_min >= np.round(np.min(x), limit_tolerance), "Min threshold smaller than computed min of x"
        assert x_max <= np.round(np.max(x), limit_tolerance), "Max threshold greater than computed max of x"
        assert x_min < x_max, "Min threshold for x must be smaller than max threshold"
        assert y_min >= np.round(np.min(y), limit_tolerance), "Min threshold smaller than computed min of y"
        assert y_max <= np.round(np.max(y), limit_tolerance), "Max threshold greater than computed max of y"
        assert y_min < y_max, "Min threshold for y must be smaller than max threshold"

    # Get the values according to the limits provided
    x_bin = (x >= x_min) & (x <= x_max)
    y_bin = (y >= y_min) & (y <= y_max)

    if warn_uneven:
        x_size = np.sum(x_bin)
        y_size = np.sum(y_bin)
        # Sort from smaller to larger
        sizes = np.sort([x_size, y_size])
        if sizes[1] / sizes[0] > 20:
            print("WARNING: The extents of raster values above the "
                  "threshhold differ more than 20-fold: Jaccard coefficient " +
                  "may not be informative.")

    # Compute the Jaccard-Needham dissimilarity between two boolean 1-D arrays
    # and subtract from 1 to get the Jaccard index
    return 1 - jaccard(x_bin.flatten(), y_bin.flatten())
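A minimal call sketch, using invented 2x2 arrays whose values span the default [0.0, 1.0] limits so the built-in range checks pass (numpy and scipy's jaccard are assumed to be imported, as in the example's own body):

import numpy as np

x = np.array([[0.0, 0.8], [1.0, 0.4]])
y = np.array([[0.1, 0.7], [1.0, 0.0]])
# Compare only the top-priority cells (values in [0.7, 1.0]) of both rasters.
# Here the two binary masks coincide, so the returned Jaccard index is 1.0.
print(compute_jaccard(x, y, x_min=0.7, y_min=0.7))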
Example #4
 def weight_cost_func_hashtag_bow_topics(self, s, t):
     a1 = numpy.array(s['hashtag_bow'].todense()).ravel()
     a2 = numpy.array(t['hashtag_bow'].todense()).ravel()
     if not a1.any() or not a2.any():
         d = 1
     else:
         d = jaccard(a1, a2)
     return (0.2 * cosine(s['topics'], t['topics']) +
             0.1 * d)
Example #5
    def assign_edge_weights(cls, g,
                            dist_func,
                            fields_with_weights={'topics': 1}):
        """
        TODO: can be parallelized
        """
        N = g.number_of_edges()
        dists_mat = np.zeros((N, len(fields_with_weights)))

        fields, fields_weight = fields_with_weights.keys(), \
                                fields_with_weights.values()
        for i, (s, t) in enumerate(g.edges_iter()):
            if i % 10000 == 0:
                logger.debug('adding edge cost: {}/{}'.format(i, N))

            for j, f in enumerate(fields):
                if issparse(g.node[s][f]):
                    array1 = np.array(g.node[s][f].todense()).ravel()
                else:
                    array1 = np.array(g.node[s][f])

                if issparse(g.node[t][f]):
                    array2 = np.array(g.node[t][f].todense()).ravel()
                else:
                    array2 = np.array(g.node[t][f])

                # at least one is all-zero
                if not array1.any() or not array2.any():
                    dists_mat[i, j] = 1
                else:
                    if f == 'hashtag_bow':
                        # special treatment to `hashtag_bow`
                        dists_mat[i, j] = jaccard(
                            array1,
                            array2
                        )
                    else:
                        dists_mat[i, j] = dist_func(
                            array1,
                            array2
                        )

                    assert not np.isinf(dists_mat[i, j])

        weight_mat = np.matrix([fields_weight]).T

        dist_mat = np.abs(np.matrix(dists_mat) * weight_mat)

        for i, (s, t) in enumerate(g.edges_iter()):
            g[s][t][cls.EDGE_COST_KEY] = dist_mat[i, 0]
            assert not np.isinf(g[s][t][cls.EDGE_COST_KEY]), \
                (g.node[s]['bow'].nonzero(),
                 g.node[t]['bow'].nonzero())
        
        return g
Example #6
def recommend_movie(movieID):
	similar_movie = {}

	for i in df.columns:
		similar_movie[i] = 0.2*(1.0 - jaccard(df[movieID],df[i])) + 1.0*(1.0 - jaccard(df1[movieID],df1[i]))
	
	sorted_similar_movie = dict(sorted(similar_movie.iteritems(), key=operator.itemgetter(1), reverse=True)[:500])
	
	#movies sorted by mean rating and size
	mean_size = requests.get(url+'movies/mean/all').json()
	ms = pd.DataFrame(mean_size)
	ms_t = ms.T
	ms_t.drop(['movie_id'], axis = 0, inplace=True)
	ms_t.columns = ms['movie_id']

	for i in sorted_similar_movie:
		try:
			sorted_similar_movie[i] += 0.5*(ms_t[i][0] + ms_t[i][1])
		except KeyError:
			continue
Example #7
 def compute_similarity(self, arr1, arr2):
     if self.simfcn == "cosine":
         return self.d_to_sim(cosine(arr1, arr2))
     elif self.simfcn == "pearson":
         return self.d_to_sim(correlation(arr1, arr2))
     elif self.simfcn == "hamming":
         return 1 - hamming(arr1, arr2)
     elif self.simfcn == "jaccard":
         return 1 - jaccard(arr1, arr2)
     else:
         print "Similiarity Function Not Yet Supported"
         exit()
Example #8
def reccomend(movieID):

	movieID = int(movieID)
	#requests genre
	genreV = requests.get(url+'movies/genreV/all').json()

	#create dataframe
	vec = pd.DataFrame(genreV)

	#create genre vector
	vec['genresV'] = vec['genresV'].str.split('|')

	list_movieid = list(vec['movieId'])

	v = vec.T
	v.columns = list_movieid

	v.drop(['movieId'], inplace=True)

	v_json = v.to_dict(orient='records')
	#v_json = v.to_dict(orient='series')
	#d = requests.post('http://10.10.76.125:1000/api/movies', v_json)

	#v_res = d.json()

	df = pd.DataFrame.from_dict(v_json[0], orient='index')
	
	df.drop([20], axis = 1, inplace=True)
	
	df = df.T
	
	df = df.astype(int)
	
	similar_movie = {}

	for i in df.columns:
		similar_movie[i] = 1.0 - jaccard(df[movieID],df[i])
	
	sorted_similar_movie = dict(sorted(similar_movie.iteritems(), key=operator.itemgetter(1), reverse=True)[:500])
	
	#movies sorted by mean rating and size
	mean_size = requests.get(url+'movies/mean/all').json()
	ms = pd.DataFrame(mean_size)
	ms_t = ms.T
	ms_t.drop(['movie_id'], axis = 0, inplace=True)
	ms_t.columns = ms['movie_id']

	for i in sorted_similar_movie:
		try:
			sorted_similar_movie[i] += ms_t[i][0] + ms_t[i][1]
		except KeyError:
			continue
Example #9
def main():
    vectorizer = CountVectorizer(ngram_range=(1,2),max_df=1.0, min_df=0.0)

    matrix = vectorizer.fit_transform(training_set)
    for new_comment in new_comments:
        print '\n####\ncommentaire:{0}\n'.format(new_comment)
        vector = vectorizer.transform([new_comment])

        i = 0
        for vect in matrix:
            score = jaccard(vect.todense(), vector.todense())
            print 'sentence: {0}\tscore: {1}'.format(training_set[i], score)
            i += 1
Example #10
def raster_differences(raster_dataset_1, raster_dataset_2, tolerance=1e-08):
    ''' Compares the values of two rasters given a certain tolerance.

    The default tolerance value is the same as the one used by numpy.allclose.

    @param raster_dataset_1 GDAL dataset
    @param raster_dataset_2 GDAL dataset
    @param tolerance double defining the raster similarity tolerance (see http://docs.scipy.org/doc/numpy/reference/generated/numpy.allclose.html)
    @return differences dict holding information on the potential differences
    '''

    differences = {}

    print("INFO: Extracting bands from the first dataset...")
    band_1 = raster_dataset_1.GetRasterBand(1)
    print("INFO: Extracting bands from the second dataset...")
    band_2 = raster_dataset_2.GetRasterBand(1)

    print("INFO: Extracting band 1 from the first dataset...")
    data_1 = band_1.ReadAsArray(0, 0, raster_dataset_1.RasterXSize,
                                raster_dataset_1.RasterYSize).astype(float)
    print("INFO: Extracting band 1 from the second dataset...")
    data_2 = band_2.ReadAsArray(0, 0, raster_dataset_2.RasterXSize,
                                raster_dataset_2.RasterYSize).astype(float)

    print("INFO: Comparing values...")
    equal = numpy.allclose(data_1, data_2, atol=tolerance)

    if not equal:
        print("WARNING: Raster dataset values not equal at {0} tolerance".format(tolerance))
        diff = data_1 - data_2

        differences['max'] = float(numpy.max(diff))
        differences['min'] = float(numpy.min(diff))
        differences['mean'] = float(numpy.mean(diff))
        differences['std'] = float(numpy.std(diff))
        #differences['quantiles'] = [float(item) for item in mquantiles(diff)]
        print("INFO: Calculating Kendall's tau statistics, this may take a while...")
        tau = kendalltau(data_1, data_2)
        differences['kendall_tau'] = (float(tau[0]), float(tau[1]))

        threshold = 0.99
        print("INFO: Calculating jaccard distance for threshold {0}".format(threshold))
        frac_data_1 = data_1 >= threshold
        frac_data_1 = numpy.reshape(frac_data_1, frac_data_1.size)
        frac_data_2 = data_2 >= threshold
        frac_data_2 = numpy.reshape(frac_data_2, frac_data_2.size)
        # Calculate the jaccard index instead of the distance
        differences['jaccard'] = (threshold, float(1 - jaccard(frac_data_1, frac_data_2)))

    return differences
Example #11
	def sentenceSimilarity(self, sent1, sent2, typeSim, isIDF):
		additionalStop = ['et', 'al', 'e.g.', 'http', 'etc']
		stemmer = PorterStemmer()
		term1 = [stemmer.stem(t) for t in nltk.word_tokenize(sent1.lower()) \
			if t not in stopwords.words('english') and len(t)>1 and t not in additionalStop]
		term2 = [stemmer.stem(t) for t in nltk.word_tokenize(sent2.lower()) \
			if t not in stopwords.words('english') and len(t)>1 and t not in additionalStop]
		allterm = list(set(term1 + term2))
		nl1 = len(term1)
		nl2 = len(term2)
		v1 = []
		v2 = []
		if isIDF:
			# load the global IDF table once, instead of once per term
			idf = pickle.load(open("../data/idf/globalidf.dat", "rb"))
		for t in allterm:
			if isIDF:
				theIdf = idf.get(t)
				if theIdf is not None:
					v1.append(term1.count(t)*theIdf)
					v2.append(term2.count(t)*theIdf)
				else:
					v1.append(0)
					v2.append(0)
			else:
				nt1 = term1.count(t)
				if nt1!=0:
					v1.append(nt1/float(nl1))
				else:
					v1.append(0)
				nt2 = term2.count(t)
				if nt2!=0:
					v2.append(nt2/float(nl2))
				else:
					v2.append(0)
		#jaccard transform
		jv1 = []
		jv2 = []
		for i in range(0, len(v1)):
			if v1[i] > 0:
				jv1.append(1)
			else:
				jv1.append(0)
			if v2[i] > 0:
				jv2.append(1)
			else:
				jv2.append(0)
		if typeSim == 'cosine':
			return distance.cosine(v1, v2)
		else:
			return distance.jaccard(jv1, jv2)
Example #12
def test_jaccard_similarity():
    true = np.random.binomial(n=1, p=.5, size=10).astype('float32')
    predicted = np.round(np.random.random(10))
    refscore = jaccard(true, predicted)
    yt = T.fvector('yt')
    yp = T.fvector('yp')
    f = theano.function([yt, yp], tmetrics.classification.jaccard_similarity(yt, yp), allow_input_downcast=True)
    score = f(true, predicted)
    print 'true'
    print true
    print 'predicted'
    print predicted
    print 'refscore {}'.format(refscore)
    print 'score {}'.format(score)
    assert np.allclose(refscore, score)
Example #13
def compare_arand(f):
    from sklearn import metrics
    from scipy.spatial import distance
    rois = mh.imread(f.replace('rois2', 'rois'))
    rois2 = mh.imread(f)
    rois = (rois.ravel() != 0)
    rois2 = (rois2.ravel() != 0)
    arand = metrics.adjusted_rand_score(rois, rois2)
    # Note that scipy returns the Jaccard Distance, which is 1 - Jaccard Index
    # sklearn does not really implement jaccard, but an interpretation where
    # jaccard is just a synonym for accuracy.

    jaccard = 1. - distance.jaccard(rois, rois2)
    mcc = metrics.matthews_corrcoef(rois, rois2)
    return arand, jaccard, mcc
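The comment above is easy to pin down with a toy pair of boolean arrays (invented here, not part of the original pipeline):

import numpy as np
from scipy.spatial import distance

a = np.array([True, True, False, True])
b = np.array([True, False, False, True])
print(distance.jaccard(a, b))       # Jaccard distance: 1/3
print(1. - distance.jaccard(a, b))  # Jaccard index:    2/3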
Example #14
def assess_corr(df, pb, pairs):
    condensed_corr = []
    condensed_pval = []
    
    for pair in pairs:

        if jaccard(pb[pair[0]], pb[pair[1]]) > 0.2:
            (rho, p) = (np.nan, np.nan)
        else:
            tmp_df = df.loc[list(pair)].copy().dropna(axis=1, how='any').T
            (rho, p) = spearmanr(tmp_df)

        condensed_corr.append(rho)
        condensed_pval.append(p)

    return (condensed_corr, condensed_pval)
Example #15
def test_jaccard_similarity_2D():
    true = np.random.binomial(n=1, p=.5, size=10).astype('float32')
    predicted = np.round(np.random.random(10))
    refscore = np.asarray([jaccard(true, predicted)])
    double = lambda x: np.concatenate([x.reshape((1, len(x))), x.reshape((1, len(x)))])
    true, predicted, refscore = tuple(double(x) for x in [true, predicted, refscore])
    yt = T.fmatrix('yt')
    yp = T.fmatrix('yp')
    f = theano.function([yt, yp], tmetrics.classification.jaccard_similarity(yt, yp), allow_input_downcast=True)
    score = f(true, predicted)
    print 'true'
    print true
    print 'predicted'
    print predicted
    print 'refscore {}'.format(refscore)
    print 'score {}'.format(score)
    assert np.allclose(refscore, score)
Example #16
    def _get_BMU(self, input_nparray):

        minDist = 9223372036854775807  # sys.maxsize used as an "infinity" sentinel
        candidate = None
        for neu in self.map_neurons.itervalues():

            if self.boolean:
                cand = jaccard(input_nparray, neu.weight_vs)
            else:
                cand = minkowski(input_nparray, neu.weight_vs, 2)
            if minDist > cand:
                minDist = cand
                candidate = neu

        return candidate
Example #17
    def similarity(self, article):
        """
        Calculate the similarity between this article
        and another article.
        """
        # Compare the text vectors,
        # and the entity vectors.
        v = self.vectorize()
        v_ = article.vectorize()

        # Linearly combine the similarity values,
        # weighing them according to these coefficients.
        # [text vector, entity vector, publication date]
        coefs = [2, 1, 2]
        sim = 0
        for i, vec in enumerate(v):
            dist = jaccard(v_[i], v[i])

        # Two empty vectors return a Jaccard distance of NaN.
            # Set it to be 1, i.e. consider them completely different
            # (or, put more clearly, they have nothing in common)
            # FYI if jaccard runs on empty vectors, it will throw a warning.
            if isnan(dist):
                dist = 1
            s = 1 - dist
            sim += (coefs[i] * s)

        # Also take publication dates into account.
        ideal_time = 259200 # 3 days, in seconds
        t, t_ = self.created_at, article.created_at

        # Subtract the more recent time from the earlier time.
        time_diff = t - t_ if t > t_ else t_ - t
        time_diff = time_diff.total_seconds()

        # Score is normalized [0, 1], where 1 is within the ideal time,
        # and approaches 0 the longer the difference is from the ideal time.
        time_score = 1 if time_diff < ideal_time else ideal_time/time_diff
        sim += (coefs[2] * time_score)

        # Normalize back to [0, 1].
        return sim/sum(coefs)
Example #18
    def getBMU(self, input_nparray):
        minDist = 9223372036854775807  # sys.maxsize used as an "infinity" sentinel
        candidate = None
        for neu in self.map_neurons.itervalues():
            if self.boolean:
                cand = jaccard(input_nparray, neu.weight_vs)
            else:
                cand = minkowski(input_nparray, neu.weight_vs, 2)
            if minDist > cand:
                minDist = cand
                candidate = neu

        return candidate
Example #19
    def region_query(self, dataset, datapoint):
        """
        Returns a list of new datapoints in datapoint's epsilon-neighbourhood
        :param datapoint:
        :return:
        """
        datapoints = [datapoint]

        for new_datapoint in dataset:

            if new_datapoint != datapoint:

                distance = jaccard(
                    datapoint.data_vector,
                    new_datapoint.data_vector
                )

                if distance <= self.epsilon:
                    datapoints.append(new_datapoint)

        return datapoints
Example #20
def simil_fun_jaccard_4(a, b):
    a_bin = a >= 10**(-4)
    b_bin = b >= 10**(-4)
    return jaccard(a_bin, b_bin)  # fixed: the thresholded vectors were computed but never used
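A toy check of the thresholding above (invented vectors; numpy and the scipy jaccard import are assumed). Binarizing first matters because SciPy's jaccard compares elements for exact inequality, so on the raw floats below it would report 1.0 rather than 0.0:

import numpy as np

a = np.array([0.5, 1e-6, 0.0])
b = np.array([0.4, 0.0, 0.0])
print(simil_fun_jaccard_4(a, b))  # 0.0: both vectors reduce to [True, False, False]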
Example #21
# print(prefix_vec[:5])

tfidf_feat = pd.DataFrame()

tfidf_feat['title_prefix_cosine_distance_tfidf'] = [
    cosine(x, y)
    for (x, y) in zip(np.nan_to_num(prefix_vec), np.nan_to_num(title_vec))
]
print('title_prefix_cosine_distance done')
tfidf_feat['title_prefix_cityblock_distance_tfidf'] = [
    cityblock(x, y)
    for (x, y) in zip(np.nan_to_num(prefix_vec), np.nan_to_num(title_vec))
]
print('title_prefix_cityblock_distance done')
tfidf_feat['title_prefix_jaccard_distance_tfidf'] = [
    jaccard(x, y)
    for (x, y) in zip(np.nan_to_num(prefix_vec), np.nan_to_num(title_vec))
]
print('title_prefix_jaccard_distance done')
tfidf_feat['title_prefix_canberra_distance_tfidf'] = [
    canberra(x, y)
    for (x, y) in zip(np.nan_to_num(prefix_vec), np.nan_to_num(title_vec))
]
print('title_prefix_canberra_distance done')
tfidf_feat['title_prefix_euclidean_distance_tfidf'] = [
    euclidean(x, y)
    for (x, y) in zip(np.nan_to_num(prefix_vec), np.nan_to_num(title_vec))
]
print('title_prefix_euclidean_distance done')
tfidf_feat['title_prefix_minkowski_distance_tfidf'] = [
    minkowski(x, y, 3)
    for (x, y) in zip(np.nan_to_num(prefix_vec), np.nan_to_num(title_vec))
]
Example #22
#!/usr/bin/python
# -*- coding: utf-8 -*-
#[email protected]
"""
举个例子来说电影基数非常庞大
用户看过的电影只占其中非常小的一部分
如果两个用户都没有看过某一部电影(两个都是 0)
并不能说明两者相似
反而言之,如果两个用户都看过某一部电影(序列中都是 1)
则说明用户有很大的相似度。
在这个例子中,序列中等于 1 所占的权重应该远远大于 0 的权重
这就引出下面要说的杰卡德相似系数(Jaccard similarity)
"""
print(__doc__)

import scipy.spatial.distance as dst
# adding or removing matching 0s does not change the similarity
s1 = [1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
s2 = [0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0]
l = len(s1)
print(dst.jaccard(s1, s2))
Example #23
def Jaccard(a, b):  #distance
    return distance.jaccard(a, b)
Example #24
def build_features(data):

    char_model = gensim.models.KeyedVectors.load_word2vec_format(
        '../data/char_w2v.txt')
    word_model = gensim.models.KeyedVectors.load_word2vec_format(
        '../data/word_w2v.txt')
    X = pd.DataFrame()
    X['word_wmd'] = data.apply(
        lambda x: wmd(x['word1'], x['word2'], word_model), axis=1)
    X['char_wmd'] = data.apply(
        lambda x: wmd(x['char1'], x['char2'], char_model), axis=1)
    question1_vectors = np.zeros((data.shape[0], 300))
    for i, q in tqdm(enumerate(data.word1.values)):
        question1_vectors[i, :] = sent2vec(q, word_model)

    question2_vectors = np.zeros((data.shape[0], 300))
    for i, q in tqdm(enumerate(data.word2.values)):
        question2_vectors[i, :] = sent2vec(q, word_model)

    char_question1_vectors = np.zeros((data.shape[0], 300))
    for i, q in tqdm(enumerate(data.char1.values)):
        char_question1_vectors[i, :] = sent2vec(q, char_model)

    char_question2_vectors = np.zeros((data.shape[0], 300))
    for i, q in tqdm(enumerate(data.char2.values)):
        char_question2_vectors[i, :] = sent2vec(q, char_model)
    #
    X['cosine_distance'] = [
        cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                       np.nan_to_num(question2_vectors))
    ]

    X['cityblock_distance'] = [
        cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                          np.nan_to_num(question2_vectors))
    ]

    X['jaccard_distance'] = [
        jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                        np.nan_to_num(question2_vectors))
    ]

    X['canberra_distance'] = [
        canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                         np.nan_to_num(question2_vectors))
    ]

    X['euclidean_distance'] = [
        euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                          np.nan_to_num(question2_vectors))
    ]

    X['minkowski_distance'] = [
        minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                             np.nan_to_num(question2_vectors))
    ]

    X['braycurtis_distance'] = [
        braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                           np.nan_to_num(question2_vectors))
    ]

    X['skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)]
    X['skew_q2vec'] = [skew(x) for x in np.nan_to_num(question2_vectors)]
    X['kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(question1_vectors)]
    X['kur_q2vec'] = [kurtosis(x) for x in np.nan_to_num(question2_vectors)]

    X['char_skew_q1vec'] = [
        skew(x) for x in np.nan_to_num(char_question1_vectors)
    ]
    X['char_skew_q2vec'] = [
        skew(x) for x in np.nan_to_num(char_question2_vectors)
    ]
    X['char_kur_q1vec'] = [
        kurtosis(x) for x in np.nan_to_num(char_question1_vectors)
    ]
    X['char_kur_q2vec'] = [
        kurtosis(x) for x in np.nan_to_num(char_question2_vectors)
    ]

    return X
Example #25
 def jaccard(self):
     a = self.target
     b = self.library
     return distance.jaccard(a, b)
Example #26
def scipyJaccard(u, v):
    return 1 - distance.jaccard(u, v)
Example #27
question2_vectors = np.zeros((train_df.shape[0], 300))
for i, q in tqdm(enumerate(train_df.question2.values)):
    question2_vectors[i, :] = sent2vec(q)

train_df['cosine_distance'] = [
    cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                   np.nan_to_num(question2_vectors))
]

train_df['cityblock_distance'] = [
    cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                      np.nan_to_num(question2_vectors))
]

train_df['jaccard_distance'] = [
    jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                    np.nan_to_num(question2_vectors))
]

train_df['canberra_distance'] = [
    canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                     np.nan_to_num(question2_vectors))
]

train_df['euclidean_distance'] = [
    euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                      np.nan_to_num(question2_vectors))
]

train_df['minkowski_distance'] = [
    minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                         np.nan_to_num(question2_vectors))
]
Example #28
lineNumber = 0
with open(filename) as f:
    fileData = f.read().splitlines()
    for line in fileData[1:]:  #Note, skip first line to remove header.
        ubScores = line.split(",")
        for cellLine in range(95):
            totalUbiquityScore[cellLine] += float(ubScores[cellLine])
            ubMat[cellLine, lineNumber] = float(ubScores[cellLine])
        lineNumber += 1

from scipy.spatial import distance
matJacc = np.zeros((95, 95))

for row in range(95):
    for col in range(95):
        matJacc[row, col] = distance.jaccard(mat[row, :], mat[col, :])

linked = linkage(mat, metric='jaccard')
linked2 = linkage(rnaMat, metric='euclidean')
linked3 = linkage(rnaMat2, metric='euclidean')
linkedEuclideanOfJaccard = linkage(matJacc, metric='euclidean')
#linkedReactionByCellLine = linkage(mat.transpose(), metric='jaccard');

plt.subplot(2, 2, 1)
dendrogram(linked,
           orientation='top',
           distance_sort='descending',
           show_leaf_counts=True)
plt.title(KEPT_OR_CORE)
plt.subplot(2, 2, 2)
dendrogram(linked2,
           orientation='top',
           distance_sort='descending',
           show_leaf_counts=True)
Example #29
savefig(plot=(MD_distance_young_G.sort_values(
    by='Jaccard dist from the median').plot.bar(
        figsize=(10, 10),
        fontsize=7.5,
        title='class: young, Distance by Genes')),
        outfolder=outfolder,
        title='dist_young_Genes.png')

#rxn
# SD dist from median
o_sd_R = MD_distance_old_R.std()
y_sd_R = MD_distance_young_R.std()
print(o_sd_R)
print(y_sd_R)
# dist between class medians
R_class_dist = jaccard(median_old_R, median_young_R)
print(R_class_dist)

# mets
# SD dist from median
o_sd_M = MD_distance_old_M.std()
y_sd_M = MD_distance_young_M.std()
print(o_sd_M)
print(y_sd_M)
# dist between class medians
M_class_dist = jaccard(median_old_M, median_young_M)
print(M_class_dist)

#genes
# SD dist from median
o_sd_G = MD_distance_old_G.std()
Example #30
		desired_organism = sys.argv[5]
	except IndexError:
		desired_organism = None	
	if desired_organism is not None:
		models = [mod for mod in models if model_info[mod.split(sep)[-1].split('.')[0]][4] == desired_organism]
		print ' Predicting for organism : ' + desired_organism
		output_name = input_name + '_' + input_name2 + '_out_binary_sim_' + str(threshold) + '_' + desired_organism[:3] + '.txt'
	else: 	output_name = input_name + '_' + input_name2 + '_out_binary_sim_' + str(threshold) + '.txt'
	print ' Total Number of Classes : ' + str(len(models))
	print ' Using TPR threshold of : ' + str(threshold)
	out_file = open(output_name, 'w')
	querymatrix,smiles = importQuery(input_name)
	prediction_results = performTargetPrediction(models)
	print ' Total Number of Query Molecules file 1 : ' + str(len(querymatrix))
	querymatrix,smiles2 = importQuery(input_name2)
	prediction_results2 = performTargetPrediction(models)
	print ' Total Number of Query Molecules file 2 : ' + str(len(querymatrix))
	sim_output = []
	sim_output2 = []
	for idx in range(prediction_results.shape[1]):
		sim_output.append(rogerstanimoto(prediction_results[:,idx],prediction_results2[:,idx]))
		sim_output2.append(jaccard(prediction_results[:,idx],prediction_results2[:,idx]))
	out_file.write('Compound Pair No.\tSmiles 1\tSmiles 2\tRogers Tanimoto\tJaccard Sim\n')
	for idx, comp1 in enumerate(smiles):
		comp2 = smiles2[idx]
		s = sim_output[idx]
		s2 = sim_output2[idx]
		out_file.write('\t'.join(map(str,[idx,comp1,comp2,1.0-s,1.0-s2])) + '\n')
	print '\n Wrote Results to: ' + output_name
	out_file.close()
Example #31
########################

nlp_feat['title_prefix_cosine_distance_ybb'] = [
    cosine(x, y)
    for (x,
         y) in zip(np.nan_to_num(prefix_vectors), np.nan_to_num(title_vectors))
]
print('title_prefix_cosine_distance done')
nlp_feat['title_prefix_cityblock_distance_ybb'] = [
    cityblock(x, y)
    for (x,
         y) in zip(np.nan_to_num(prefix_vectors), np.nan_to_num(title_vectors))
]
print('title_prefix_cityblock_distance done')
nlp_feat['title_prefix_jaccard_distance_ybb'] = [
    jaccard(x, y)
    for (x,
         y) in zip(np.nan_to_num(prefix_vectors), np.nan_to_num(title_vectors))
]
print('title_prefix_jaccard_distance done')
nlp_feat['title_prefix_canberra_distance_ybb'] = [
    canberra(x, y)
    for (x,
         y) in zip(np.nan_to_num(prefix_vectors), np.nan_to_num(title_vectors))
]
print('title_prefix_canberra_distance done')
nlp_feat['title_prefix_euclidean_distance_ybb'] = [
    euclidean(x, y)
    for (x,
         y) in zip(np.nan_to_num(prefix_vectors), np.nan_to_num(title_vectors))
]
Example #32
def jaccard_distance_index(subtree_a, subtree_b):
    start = min(min(subtree_a), min(subtree_b))
    ind_range = range(start, max(max(subtree_a), max(subtree_b)) + 1)
    A = [1 if i in subtree_a else 0 for i in ind_range]
    B = [1 if i in subtree_b else 0 for i in ind_range]
    return 1 - jaccard(A, B)
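A toy call for the function above (invented node-index sets; the jaccard import from the surrounding module is assumed):

# Indicator vectors are built over indices 1..4; {2, 3} is shared out of
# {1, 2, 3, 4}, so the returned index is 2/4 = 0.5.
print(jaccard_distance_index({1, 2, 3}, {2, 3, 4}))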
Example #33
def jaccard_pixelwise(mask_a, mask_b, threshold=0.5):
    mask_a = (mask_a > threshold).astype(np.uint8)
    mask_b = (mask_b > threshold).astype(np.uint8)
    jac_dist = jaccard(mask_a.flatten(), mask_b.flatten())

    return (1 - jac_dist)
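A toy call for jaccard_pixelwise (invented 2x2 soft masks; numpy and the scipy jaccard import are assumed):

import numpy as np

mask_a = np.array([[0.9, 0.2], [0.7, 0.1]])
mask_b = np.array([[0.8, 0.6], [0.9, 0.0]])
# After thresholding at 0.5 the masks share 2 of 3 positive pixels,
# so the returned pixelwise Jaccard index is 2/3.
print(jaccard_pixelwise(mask_a, mask_b))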
Example #34
e_ab = euclidean(a, b)
e_ac = euclidean(a, c)
e_bc = euclidean(b, c)

# Cosine Distance
c_ab = cosine(a, b)
c_ac = cosine(a, c)
c_bc = cosine(b, c)

# converting to boolean
a = a > 0
b = b > 0
c = c > 0
# Jaccard Distance

j_ab = jaccard(a, b)
j_ac = jaccard(a, c)
j_bc = jaccard(b, c)

# Enter distance comparison below for each pair of vectors:

print('\n\nEuclidean Distance\n ab:', e_ab, 'ac:', e_ac, 'bc:', e_bc)
print('Cosine Distance\n ab:', c_ab, 'ac:', c_ac, 'bc:', c_bc)
print('Jaccard Dissimilarity (vectors should be boolean values)\n ab:', j_ab,
      'ac:', j_ac, 'bc:', j_bc)

print('\n\nThe most appropriate distance is...Cosine Distance')
print(
    '\nThe Cosine distance is best in this scenario because \nif the angle between the two vectors is small, then they \nare closer together and therefore more similar.'
)
Example #35
    def get_dist_preds(self, predictions, metric):
        new_preds = []
        classes = predictions[0].classes
        for j, pred in enumerate(predictions):
            distances = []
            remaining_preds = predictions[:j] + predictions[j + 1:]
            for pred_ in remaining_preds:
                dist_by_class = list([0] * len(classes))
                for k, class_ in enumerate(classes):
                    class_pred_ = pred_.probabilities[k]
                    class_pred = pred.probabilities[k]
                    if metric == 'euclid':
                        dist_by_class[k] = euclidean(class_pred_, class_pred)
                    elif metric == 'cosine':
                        dist_by_class[k] = cosine(class_pred_, class_pred)
                    elif metric == 'jaccard':  # i think this is only for boolean
                        dist_by_class[k] = jaccard(class_pred_, class_pred)
                    elif metric == 'chebyshev':
                        dist_by_class[k] = chebyshev(class_pred_, class_pred)
                    elif metric == 'correlation':
                        dist_by_class[k] = correlation(class_pred_, class_pred)
                    elif metric == 'cityblock':
                        dist_by_class[k] = cityblock(class_pred_, class_pred)
                    elif metric == 'canberra':
                        dist_by_class[k] = canberra(class_pred_, class_pred)
                    elif metric == 'braycurtis':
                        dist_by_class[k] = braycurtis(class_pred_, class_pred)
                    elif metric == 'hamming':  # i think this is only for boolean
                        dist_by_class[k] = hamming(class_pred_, class_pred)
                    elif metric == 'battacharyya':
                        dist_by_class[k] = DistanceMetrics.battacharyya(
                            class_pred_, class_pred, method='continuous')
                distances += [dist_by_class]
                # distances = [[c11,c21,c31], [c12,c22,c32], ..., [c1m,c2m,c3m]] for m models
            # new_preds = [(pred, [c11+...+c1m, ..., c31+...+c3m])]
            new_preds += [(pred, [sum(i) for i in zip(*distances)])
                          ]  # (predictions, [w1, w2, ..., wc]) for c classes

        weights = [tup[1] for tup in new_preds]
        W = [
            sum(i) for i in zip(*weights)
        ]  # total weight for each class: [sum(w1i), sum(w2i), ..., sum(wci)], sum of sums for each model i
        class_weighted_preds = []
        for i, class_ in enumerate(classes):
            class_weights = [w[i] for w in weights]
            class_pred_dist = [(np.array([l[i] for l in tup[0].probabilities]),
                                tup[1][i]) for tup in new_preds]
            if self.sdhw:
                # those with lower distances get higher weight; sort in
                # ascending order of aggregated distances
                preds_ascending_dist = sorted(class_pred_dist,
                                              key=lambda x: x[1])
                # weights is a list of lists containing the weights for the classes of each model
                weights_descending = sorted(class_weights, reverse=True)
                weighted_pred = sum([
                    pred_tup[0] * (weights_descending[k] / W[i])
                    for k, pred_tup in enumerate(preds_ascending_dist)
                ])
            else:
                # those with lower distances have lower weight
                weighted_pred = sum([
                    pred_tup[0] * (pred_tup[1] / W[i])
                    for pred_tup in class_pred_dist
                ])
            class_weighted_preds += [weighted_pred]
        class_weighted_preds_trunc = np.array(
            [[class_weighted_preds[0][i], class_weighted_preds[1][i]]
             for i in range(len(class_weighted_preds[0]))])
        indices_max_proba = class_weighted_preds_trunc.argmax(axis=1)
        classifications = np.array([classes[i] for i in indices_max_proba])
        return classifications
Example #36
    print ' Total Number of Classes : ' + str(len(models))
    print ' Using TPR threshold of : ' + str(threshold)
    output_name = input_name + '_' + input_name2 + '_out_binary_sim_' + str(
        threshold) + '.txt'
    out_file = open(output_name, 'w')
    querymatrix, smiles, ids = importQuery(input_name)
    prediction_results = performTargetPrediction(models)
    print ' Total Number of Query Molecules file 1 : ' + str(len(querymatrix))
    querymatrix, smiles2, ids2 = importQuery(input_name2)
    prediction_results2 = performTargetPrediction(models)
    print ' Total Number of Query Molecules file 2 : ' + str(len(querymatrix))
    sim_output = []
    sim_output2 = []
    for idx in range(prediction_results.shape[1]):
        sim_output.append(
            rogerstanimoto(prediction_results[:, idx],
                           prediction_results2[:, idx]))
        sim_output2.append(
            jaccard(prediction_results[:, idx], prediction_results2[:, idx]))
    out_file.write(
        'Compound Pair No.\tSmiles 1\tSmiles 2\tRogers Tanimoto\tJaccard Sim\n'
    )
    for idx, comp1 in enumerate(ids):
        comp2 = ids2[idx]
        s = sim_output[idx]
        s2 = sim_output2[idx]
        out_file.write(
            '\t'.join(map(str, [idx, comp1, comp2, 1.0 - s, 1.0 - s2])) + '\n')
    print '\n Wrote Results to: ' + output_name
    out_file.close()
Example #37
 def disc_error(self, x):
     ph = self.sigmoid(x.dot(self.w)+self.b)
     hstates = np.random.binomial(1, ph)
     act_hstates = np.random.binomial(1, self.ph)
     return dist.jaccard(hstates, act_hstates)
Example #38
def cal_distance_bool(fps):
    return distance.jaccard(*fps)
Example #39
def jaccard_distance(a,b):
    return distance.jaccard(a,b)
Example #40
 def add_word2vec_features(self,
                           model_path,
                           model_name='w2v',
                           vector_size=300):
     """ word2vec features require a lot of RAM to be computed
     """
     # Load model and compute Word Mover's Distance
     self.w2c_model = gensim.models.KeyedVectors.load_word2vec_format(
         model_path, binary=True)
     self.w2c_model.init_sims(replace=True)
     self.df['{}_norm_wmd'.format(model_name)] = self.df.apply(
         lambda x: self.word_mover_distance(x['question1'], x['question2']),
         axis=1)
     self.w2c_model = gensim.models.KeyedVectors.load_word2vec_format(
         model_path, binary=True)
     self.df['{}_wmd'.format(model_name)] = self.df.apply(
         lambda x: self.word_mover_distance(x['question1'], x['question2']),
         axis=1)
     # Generate vectors from questions
     question1_vectors = np.zeros((self.df.shape[0], vector_size))
     question2_vectors = np.zeros((self.df.shape[0], vector_size))
     j = 0
     for i, row in self.df.iterrows():
         question1_vectors[j, :] = self.text2vec(row[self.q1_column])
         question2_vectors[j, :] = self.text2vec(row[self.q2_column])
         j += 1
     self.w2c_model = None  # Save up some RAM
     # Compute several features using vectors
     self.df['{}_cosine_distance'.format(model_name)] = [
         cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                        np.nan_to_num(question2_vectors))
     ]
     self.df['{}_cityblock_distance'.format(model_name)] = [
         cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                           np.nan_to_num(question2_vectors))
     ]
     self.df['{}_jaccard_distance'.format(model_name)] = [
         jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                         np.nan_to_num(question2_vectors))
     ]
     self.df['{}_canberra_distance'.format(model_name)] = [
         canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                          np.nan_to_num(question2_vectors))
     ]
     self.df['{}_euclidean_distance'.format(model_name)] = [
         euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                           np.nan_to_num(question2_vectors))
     ]
     self.df['{}_minkowski_distance'.format(model_name)] = [
         minkowski(x, y, 3)
         for (x, y) in zip(np.nan_to_num(question1_vectors),
                           np.nan_to_num(question2_vectors))
     ]
     self.df['{}_braycurtis_distance'.format(model_name)] = [
         braycurtis(x, y)
         for (x, y) in zip(np.nan_to_num(question1_vectors),
                           np.nan_to_num(question2_vectors))
     ]
     self.df['{}_skew_q1vec'.format(model_name)] = [
         skew(x) for x in np.nan_to_num(question1_vectors)
     ]
     self.df['{}_skew_q2vec'.format(model_name)] = [
         skew(x) for x in np.nan_to_num(question2_vectors)
     ]
     self.df['{}_kur_q1vec'.format(model_name)] = [
         kurtosis(x) for x in np.nan_to_num(question1_vectors)
     ]
     self.df['{}_kur_q2vec'.format(model_name)] = [
         kurtosis(x) for x in np.nan_to_num(question2_vectors)
     ]
     # Add word vectors as features
     for i in xrange(vector_size):
         self.df['{}_q1vec_{}'.format(model_name, i)] = question1_vectors[:,
                                                                          i]
         self.df['{}_q2vec_{}'.format(model_name, i)] = question2_vectors[:,
                                                                          i]
Example #41
def sim_bin(s_vector,t_vector): return 1 - distance.jaccard(s_vector, t_vector)

# cosine similarity between two weighted vectors
def sim_cosine(s_vector,t_vector): return 1 - distance.cosine(s_vector, t_vector)
Example #42
        return v / np.sqrt((v**2).sum())
    else:
        return np.zeros(300)  #1*300


w2v_q1 = np.array([sent2vec(q, model) for q in data.question1])
w2v_q2 = np.array([sent2vec(q, model) for q in data.question2])

from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis

data['cosine_distance'] = [cosine(x, y)
                           for (x, y) in zip(w2v_q1, w2v_q2)]  #x y 1-D array
data['cityblock_distance'] = [
    cityblock(x, y) for (x, y) in zip(w2v_q1, w2v_q2)
]
data['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(w2v_q1, w2v_q2)]
data['canberra_distance'] = [canberra(x, y) for (x, y) in zip(w2v_q1, w2v_q2)]
data['euclidean_distance'] = [
    euclidean(x, y) for (x, y) in zip(w2v_q1, w2v_q2)
]
data['minkowski_distance'] = [
    minkowski(x, y, 3) for (x, y) in zip(w2v_q1, w2v_q2)
]
data['braycurtis_distance'] = [
    braycurtis(x, y) for (x, y) in zip(w2v_q1, w2v_q2)
]

fs4_1 = [
    'cosine_distance', 'cityblock_distance', 'jaccard_distance',
    'canberra_distance', 'euclidean_distance', 'minkowski_distance',
    'braycurtis_distance'
]
Example #43
 def jaccard(u: np.ndarray, v: np.ndarray) -> float:
     return distance.jaccard(u, v)
Example #44
def jaccard_distance_index(subtree_a, subtree_b):
    start = min(min(subtree_a), min(subtree_b))
    ind_range = range(start, max(max(subtree_a), max(subtree_b)) + 1)
    A = [1 if i in subtree_a else 0 for i in ind_range]
    B = [1 if i in subtree_b else 0 for i in ind_range]
    return 1 - jaccard(A, B)
Example #45
def jaccard_distance(a,b):
    print distance.jaccard(a,b)
Example #46
def generate_feature(data):
    """
        basic feature
    """
    #length of sentence
    data['len_word_s1'] = data.apply(
        lambda x: len(x['sentences'].split("\001")[0].split(" ")), axis=1)
    data['len_word_s2'] = data.apply(
        lambda x: len(x['sentences'].split("\001")[1].split(" ")), axis=1)
    data['len_ratio'] = data.apply(lambda x: len_ratio(x['sentences']), axis=1)
    data['len_char_s1'] = data.apply(
        lambda x: len(''.join(x['sentences'].split("\001")[0].split(" "))),
        axis=1)
    data['len_char_s2'] = data.apply(
        lambda x: len(''.join(x['sentences'].split("\001")[1].split(" "))),
        axis=1)
    """
        fuzzywuzzy feature
    """
    data['fuzz_QRatio'] = data.apply(lambda x: fuzz_QRatio(x['sentences']),
                                     axis=1)
    data['fuzz_WRatio'] = data.apply(lambda x: fuzz_WRatio(x['sentences']),
                                     axis=1)
    data['fuzz_partial_ratio'] = data.apply(
        lambda x: fuzz_partial_ratio(x['sentences']), axis=1)
    data['fuzz_partial_token_set_ratio'] = data.apply(
        lambda x: fuzz_partial_token_set_ratio(x['sentences']), axis=1)
    data['fuzz_partial_token_sort_ratio'] = data.apply(
        lambda x: fuzz_partial_token_sort_ratio(x['sentences']), axis=1)
    data['fuzz_token_set_ratio'] = data.apply(
        lambda x: fuzz_token_set_ratio(x['sentences']), axis=1)
    data['fuzz_token_sort_ratio'] = data.apply(
        lambda x: fuzz_token_sort_ratio(x['sentences']), axis=1)
    """
        word2vec feature
    """
    sent1_vectors = np.zeros((data.shape[0], 300))
    for i, sents in tqdm(enumerate(data.sentences.values)):
        sent = sents.split("\001")[0]
        sent1_vectors[i, :] = sent2vec(sent)

    sent2_vectors = np.zeros((data.shape[0], 300))
    for i, sents in tqdm(enumerate(data.sentences.values)):
        sent = sents.split("\001")[1]
        sent2_vectors[i, :] = sent2vec(sent)
    data['cityblock_distance'] = [
        cityblock(x, y) for (x, y) in zip(np.nan_to_num(sent1_vectors),
                                          np.nan_to_num(sent2_vectors))
    ]
    data['jaccard_distance'] = [
        jaccard(x, y) for (x, y) in zip(np.nan_to_num(sent1_vectors),
                                        np.nan_to_num(sent2_vectors))
    ]
    data['cosine_distance'] = [
        cosine(x, y) for (x, y) in zip(np.nan_to_num(sent1_vectors),
                                       np.nan_to_num(sent2_vectors))
    ]
    data['canberra_distance'] = [
        canberra(x, y) for (x, y) in zip(np.nan_to_num(sent1_vectors),
                                         np.nan_to_num(sent2_vectors))
    ]
    data['euclidean_distance'] = [
        euclidean(x, y) for (x, y) in zip(np.nan_to_num(sent1_vectors),
                                          np.nan_to_num(sent2_vectors))
    ]
    data['braycurtis_distance'] = [
        braycurtis(x, y) for (x, y) in zip(np.nan_to_num(sent1_vectors),
                                           np.nan_to_num(sent2_vectors))
    ]
    data['minkowski_distance'] = [
        minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(sent1_vectors),
                                             np.nan_to_num(sent2_vectors))
    ]
    data['pearson_coff'] = [
        scipy.stats.pearsonr(x, y)[0] for (x, y) in zip(
            np.nan_to_num(sent1_vectors), np.nan_to_num(sent2_vectors))
    ]
    data['spearman_coff'] = [
        scipy.stats.spearmanr(x, y)[0] for (x, y) in zip(
            np.nan_to_num(sent1_vectors), np.nan_to_num(sent2_vectors))
    ]
    data['kendalltau_coff'] = [
        scipy.stats.kendalltau(x, y)[0] for (x, y) in zip(
            np.nan_to_num(sent1_vectors), np.nan_to_num(sent2_vectors))
    ]

    data['polynomial_kernel'] = [
        polynomial_kernel(x.reshape(1, -1), y.reshape(1, -1))[0][0]
        for (x, y) in zip(np.nan_to_num(sent1_vectors),
                          np.nan_to_num(sent2_vectors))
    ]
    data['sigmoid_kernel'] = [
        sigmoid_kernel(x.reshape(-1, 1), y.reshape(-1, 1))[0][0]
        for (x, y) in zip(np.nan_to_num(sent1_vectors),
                          np.nan_to_num(sent2_vectors))
    ]
    data['rbf_kernel'] = [
        rbf_kernel(x.reshape(-1, 1), y.reshape(-1, 1))[0][0] for (x, y) in zip(
            np.nan_to_num(sent1_vectors), np.nan_to_num(sent2_vectors))
    ]
    data['laplacian_kernel'] = [
        laplacian_kernel(x.reshape(-1, 1), y.reshape(-1, 1))[0][0]
        for (x, y) in zip(np.nan_to_num(sent1_vectors),
                          np.nan_to_num(sent2_vectors))
    ]
    data['skew_s1vec'] = [skew(x) for x in np.nan_to_num(sent1_vectors)]
    data['skew_s2vec'] = [skew(x) for x in np.nan_to_num(sent2_vectors)]
    data['kur_s1vec'] = [kurtosis(x) for x in np.nan_to_num(sent1_vectors)]
    data['kur_s2vec'] = [kurtosis(x) for x in np.nan_to_num(sent2_vectors)]
    """
        word2vec feature average and weighted idf
    """
    sent1_vectors = np.zeros((data.shape[0], 300))
    for i, sents in tqdm(enumerate(data.sentences.values)):
        sent = sents.split("\001")[0]
        sent1_vectors[i, :] = sent2vec_ave(sent)

    sent2_vectors = np.zeros((data.shape[0], 300))
    for i, sents in tqdm(enumerate(data.sentences.values)):
        sent = sents.split("\001")[1]
        sent2_vectors[i, :] = sent2vec_ave(sent)

    data['cityblock_distance_ave'] = [
        cityblock(x, y) for (x, y) in zip(np.nan_to_num(sent1_vectors),
                                          np.nan_to_num(sent2_vectors))
    ]
    data['jaccard_distance_ave'] = [
        jaccard(x, y) for (x, y) in zip(np.nan_to_num(sent1_vectors),
                                        np.nan_to_num(sent2_vectors))
    ]
    data['cosine_distance_ave'] = [
        cosine(x, y) for (x, y) in zip(np.nan_to_num(sent1_vectors),
                                       np.nan_to_num(sent2_vectors))
    ]
    data['canberra_distance_ave'] = [
        canberra(x, y) for (x, y) in zip(np.nan_to_num(sent1_vectors),
                                         np.nan_to_num(sent2_vectors))
    ]
    data['euclidean_distance_ave'] = [
        euclidean(x, y) for (x, y) in zip(np.nan_to_num(sent1_vectors),
                                          np.nan_to_num(sent2_vectors))
    ]
    data['braycurtis_distance_ave'] = [
        braycurtis(x, y) for (x, y) in zip(np.nan_to_num(sent1_vectors),
                                           np.nan_to_num(sent2_vectors))
    ]
    data['minkowski_distance_ave'] = [
        minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(sent1_vectors),
                                             np.nan_to_num(sent2_vectors))
    ]
    data['pearson_coff_ave'] = [
        scipy.stats.pearsonr(x, y)[0] for (x, y) in zip(
            np.nan_to_num(sent1_vectors), np.nan_to_num(sent2_vectors))
    ]
    data['spearman_coff_ave'] = [
        scipy.stats.spearmanr(x, y)[0] for (x, y) in zip(
            np.nan_to_num(sent1_vectors), np.nan_to_num(sent2_vectors))
    ]
    data['kendalltau_coff_ave'] = [
        scipy.stats.kendalltau(x, y)[0] for (x, y) in zip(
            np.nan_to_num(sent1_vectors), np.nan_to_num(sent2_vectors))
    ]

    data['polynomial_kernel_ave'] = [
        polynomial_kernel(x.reshape(1, -1), y.reshape(1, -1))[0][0]
        for (x, y) in zip(np.nan_to_num(sent1_vectors),
                          np.nan_to_num(sent2_vectors))
    ]
    data['sigmoid_kernel_ave'] = [
        sigmoid_kernel(x.reshape(-1, 1), y.reshape(-1, 1))[0][0]
        for (x, y) in zip(np.nan_to_num(sent1_vectors),
                          np.nan_to_num(sent2_vectors))
    ]
    data['rbf_kernel_ave'] = [
        rbf_kernel(x.reshape(-1, 1), y.reshape(-1, 1))[0][0] for (x, y) in zip(
            np.nan_to_num(sent1_vectors), np.nan_to_num(sent2_vectors))
    ]
    data['laplacian_kernel_ave'] = [
        laplacian_kernel(x.reshape(-1, 1), y.reshape(-1, 1))[0][0]
        for (x, y) in zip(np.nan_to_num(sent1_vectors),
                          np.nan_to_num(sent2_vectors))
    ]
    data['skew_s1vec_ave'] = [skew(x) for x in np.nan_to_num(sent1_vectors)]
    data['skew_s2vec_ave'] = [skew(x) for x in np.nan_to_num(sent2_vectors)]
    data['kur_s1vec_ave'] = [kurtosis(x) for x in np.nan_to_num(sent1_vectors)]
    data['kur_s2vec_ave'] = [kurtosis(x) for x in np.nan_to_num(sent2_vectors)]
    """
        word2vec feature average and weighted idf
    """
    sent1_vectors = np.zeros((data.shape[0], 300))
    for i, sents in tqdm(enumerate(data.sentences.values)):
        sent = sents.split("\001")[0]
        sent1_vectors[i, :] = sent2vec_ave_idf(sent)

    sent2_vectors = np.zeros((data.shape[0], 300))
    for i, sents in tqdm(enumerate(data.sentences.values)):
        sent = sents.split("\001")[1]
        sent2_vectors[i, :] = sent2vec_ave_idf(sent)

    data['cityblock_distance_ave_idf'] = [
        cityblock(x, y) for (x, y) in zip(np.nan_to_num(sent1_vectors),
                                          np.nan_to_num(sent2_vectors))
    ]
    data['jaccard_distance_ave_idf'] = [
        jaccard(x, y) for (x, y) in zip(np.nan_to_num(sent1_vectors),
                                        np.nan_to_num(sent2_vectors))
    ]
    data['cosine_distance_ave_idf'] = [
        cosine(x, y) for (x, y) in zip(np.nan_to_num(sent1_vectors),
                                       np.nan_to_num(sent2_vectors))
    ]
    data['canberra_distance_ave_idf'] = [
        canberra(x, y) for (x, y) in zip(np.nan_to_num(sent1_vectors),
                                         np.nan_to_num(sent2_vectors))
    ]
    data['euclidean_distance_ave_idf'] = [
        euclidean(x, y) for (x, y) in zip(np.nan_to_num(sent1_vectors),
                                          np.nan_to_num(sent2_vectors))
    ]
    data['braycurtis_distance_ave_idf'] = [
        braycurtis(x, y) for (x, y) in zip(np.nan_to_num(sent1_vectors),
                                           np.nan_to_num(sent2_vectors))
    ]
    data['minkowski_distance_ave_idf'] = [
        minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(sent1_vectors),
                                             np.nan_to_num(sent2_vectors))
    ]
    data['pearson_coff_ave_idf'] = [
        scipy.stats.pearsonr(x, y)[0] for (x, y) in zip(
            np.nan_to_num(sent1_vectors), np.nan_to_num(sent2_vectors))
    ]
    data['spearman_coff_ave_idf'] = [
        scipy.stats.spearmanr(x, y)[0] for (x, y) in zip(
            np.nan_to_num(sent1_vectors), np.nan_to_num(sent2_vectors))
    ]
    data['kendalltau_coff_ave_idf'] = [
        scipy.stats.kendalltau(x, y)[0] for (x, y) in zip(
            np.nan_to_num(sent1_vectors), np.nan_to_num(sent2_vectors))
    ]

    data['polynomial_kernel_ave_idf'] = [
        polynomial_kernel(x.reshape(1, -1), y.reshape(1, -1))[0][0]
        for (x, y) in zip(np.nan_to_num(sent1_vectors),
                          np.nan_to_num(sent2_vectors))
    ]
    data['sigmoid_kernel_ave_idf'] = [
        sigmoid_kernel(x.reshape(-1, 1), y.reshape(-1, 1))[0][0]
        for (x, y) in zip(np.nan_to_num(sent1_vectors),
                          np.nan_to_num(sent2_vectors))
    ]
    data['rbf_kernel_ave_idf'] = [
        rbf_kernel(x.reshape(-1, 1), y.reshape(-1, 1))[0][0] for (x, y) in zip(
            np.nan_to_num(sent1_vectors), np.nan_to_num(sent2_vectors))
    ]
    data['laplacian_kernel_ave_idf'] = [  # fixed: '_idf' suffix was missing and overwrote the earlier column
        laplacian_kernel(x.reshape(-1, 1), y.reshape(-1, 1))[0][0]
        for (x, y) in zip(np.nan_to_num(sent1_vectors),
                          np.nan_to_num(sent2_vectors))
    ]
    data['skew_s1vec_ave_idf'] = [
        skew(x) for x in np.nan_to_num(sent1_vectors)
    ]
    data['skew_s2vec_ave_idf'] = [
        skew(x) for x in np.nan_to_num(sent2_vectors)
    ]
    data['kur_s1vec_ave_idf'] = [
        kurtosis(x) for x in np.nan_to_num(sent1_vectors)
    ]
    data['kur_s2vec_ave_idf'] = [
        kurtosis(x) for x in np.nan_to_num(sent2_vectors)
    ]
    """
        Sequence Features
    """
    data['long_common_sequence'] = data.apply(
        lambda x: long_common_sequence(x['sentences']), axis=1)
    data['long_common_prefix'] = data.apply(
        lambda x: long_common_prefix(x['sentences']), axis=1)
    data['long_common_suffix'] = data.apply(
        lambda x: long_common_suffix(x['sentences']), axis=1)
    data['long_common_substring'] = data.apply(
        lambda x: long_common_substring(x['sentences']), axis=1)
    data['levenshtein_distance'] = data.apply(
        lambda x: levenshtein_distance(x['sentences']), axis=1)

    #other featre
    data['has_no_word'] = data.apply(lambda x: has_no_word(x['sentences']),
                                     axis=1)
    data['bow'] = data.apply(lambda x: bag_of_words(x['sentences']), axis=1)
    data['bow_tfidf'] = data.apply(
        lambda x: bag_of_words_tfidf(x['sentences']), axis=1)
    data['lcs_diff'] = data.apply(lambda x: lcs_diff(x['sentences']), axis=1)
    #N-gramOverlap
    data['1-gramoverlap'] = data.apply(
        lambda x: n_gram_over_lap(x['sentences'], 1), axis=1)
    data['2-gramoverlap'] = data.apply(
        lambda x: n_gram_over_lap(x['sentences'], 2), axis=1)
    data['3-gramoverlap'] = data.apply(
        lambda x: n_gram_over_lap(x['sentences'], 3), axis=1)
    data['2-gramoverlap_char'] = data.apply(
        lambda x: n_gram_over_lap_char(x['sentences'], 2), axis=1)
    data['3-gramoverlap_char'] = data.apply(
        lambda x: n_gram_over_lap_char(x['sentences'], 3), axis=1)
    data['4-gramoverlap_char'] = data.apply(
        lambda x: n_gram_over_lap_char(x['sentences'], 4), axis=1)
    data['5-gramoverlap_char'] = data.apply(
        lambda x: n_gram_over_lap_char(x['sentences'], 5), axis=1)
    return data
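A note on the sklearn pairwise-kernel calls above: these functions expect 2-D arrays of shape (n_samples, n_features), so a single sentence vector must be reshaped to (1, -1) to be treated as one sample. A minimal sketch of the difference, with x and y standing in for two sentence vectors:

import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

x = np.random.rand(300)
y = np.random.rand(300)

# (1, 300): one sample with 300 features -> a 1x1 kernel matrix
k_pair = rbf_kernel(x.reshape(1, -1), y.reshape(1, -1))[0][0]

# (300, 1): 300 one-feature samples -> a 300x300 matrix whose [0][0]
# entry compares only x[0] with y[0]
k_first = rbf_kernel(x.reshape(-1, 1), y.reshape(-1, 1))[0][0]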
Exemple #47
0
def valueCal(q, index_list, data, modelvec, modelvec_all, modelvec_stop,
             length, shape):
    for i in index_list:
        list_con = []
        q1 = str(data.get_value(i, 'question1')).split()
        q2 = str(data.get_value(i, 'question2')).split()
        f1 = getF1_union(q1, q2)
        f2 = getF2_inter(q1, q2)
        f3 = getF3_sum(q1, q2)
        f4_q1 = len(q1)
        f4_q2 = len(q2)
        f4_rate = f4_q1 / f4_q2
        q1 = getVect(q1, modelvec, length, shape)
        q2 = getVect(q2, modelvec, length, shape)
        cos_dis_add = getCosDis_add(q1, q2)
        cos_dis_map = getCosDis_map(q1, q2)
        euc_dis = getEucDis(q1, q2)

        # all and stop

        q1_all = str(data.get_value(i, 'question1_all')).split()
        q2_all = str(data.get_value(i, 'question2_all')).split()
        q1_stop = str(data.get_value(i, 'question1_stop')).split()
        q2_stop = str(data.get_value(i, 'question2_stop')).split()

        f1_all = getF1_union(q1_all, q2_all)
        f2_all = getF2_inter(q1_all, q2_all)
        f3_all = getF3_sum(q1_all, q2_all)
        f4_q1_all = len(q1_all)
        f4_q2_all = len(q2_all)
        f4_rate_all = f4_q1_all / f4_q2_all
        f1_stop = getF1_union(q1_stop, q2_stop)
        f2_stop = getF2_inter(q1_stop, q2_stop)
        f3_stop = getF3_sum(q1_stop, q2_stop)
        f4_q1_stop = len(q1_stop)
        f4_q2_stop = len(q2_stop)
        f4_rate_stop = f4_q1_stop / f4_q2_stop

        q1_all = getVect(q1_all, modelvec_all, length, shape)
        q2_all = getVect(q2_all, modelvec_all, length, shape)
        q1_stop = getVect(q1_stop, modelvec_stop, length, shape)
        q2_stop = getVect(q2_stop, modelvec_stop, length, shape)

        cos_dis_all_add = getCosDis_add(q1_all, q2_all)
        cos_dis_all_map = getCosDis_map(q1_all, q2_all)
        cos_dis_stop_add = getCosDis_add(q1_stop, q2_stop)
        cos_dis_stop_map = getCosDis_map(q1_stop, q2_stop)

        euc_dis_all = getEucDis(q1_all, q2_all)
        euc_dis_stop = getEucDis(q1_stop, q2_stop)

        q1_map = vecmap(q1)
        q2_map = vecmap(q2)
        q1_add = vecadd(q1)
        q2_add = vecadd(q2)
        q_jac_map_dis = distance.jaccard(q1_map, q2_map)
        q_jac_add_dis = distance.jaccard(q1_add, q2_add)
        q_mhd_map_dis = distance.mahalanobis(
            q1_map, q2_map,
            np.linalg.pinv(np.cov(np.vstack((q1_map, q2_map)).T)))
        q_pearson_map = cal_pearson(q1_map, q2_map)
        q_pearson_add = cal_pearson(q1_add, q2_add)
        q_spearmanr_map_t, q_spearmanr_map_p = spearmanr(q1_map, q2_map)
        q_spearmanr_add_t, q_spearmanr_add_p = spearmanr(q1_add, q2_add)
        q_kendalltau_map_t, q_kendalltau_map_p = kendalltau(q1_map, q2_map)
        q_kendalltau_add_t, q_kendalltau_add_p = kendalltau(q1_add, q2_add)

        # all and stop

        q1_all_map = vecmap(q1_all)
        q2_all_map = vecmap(q2_all)
        q1_all_add = vecadd(q1_all)
        q2_all_add = vecadd(q2_all)
        q1_stop_map = vecmap(q1_stop)
        q2_stop_map = vecmap(q2_stop)
        q1_stop_add = vecadd(q1_stop)
        q2_stop_add = vecadd(q2_stop)

        q_all_jac_map_dis = distance.jaccard(q1_all_map, q2_all_map)
        q_all_jac_add_dis = distance.jaccard(q1_all_add, q2_all_add)
        q_stop_jac_map_dis = distance.jaccard(q1_stop_map, q2_stop_map)
        q_stop_jac_add_dis = distance.jaccard(q1_stop_add, q2_stop_add)

        q_all_mhd_map_dis = distance.mahalanobis(
            q1_all_map, q2_all_map,
            np.linalg.pinv(np.cov(np.vstack((q1_all_map, q2_all_map)).T)))
        q_stop_mhd_map_dis = distance.mahalanobis(
            q1_stop_map, q2_stop_map,
            np.linalg.pinv(np.cov(np.vstack((q1_stop_map, q2_stop_map)).T)))

        q_all_pearson_map = cal_pearson(q1_all_map, q2_all_map)
        q_all_pearson_add = cal_pearson(q1_all_add, q2_all_add)
        q_stop_pearson_map = cal_pearson(q1_stop_map, q2_stop_map)
        q_stop_pearson_add = cal_pearson(q1_stop_add, q2_stop_add)

        q_all_spearmanr_map_t, q_all_spearmanr_map_p = spearmanr(
            q1_all_map, q2_all_map)
        q_all_spearmanr_add_t, q_all_spearmanr_add_p = spearmanr(
            q1_all_add, q2_all_add)
        q_stop_spearmanr_map_t, q_stop_spearmanr_map_p = spearmanr(
            q1_stop_map, q2_stop_map)
        q_stop_spearmanr_add_t, q_stop_spearmanr_add_p = spearmanr(
            q1_stop_add, q2_stop_add)

        q_all_kendalltau_map_t, q_all_kendalltau_map_p = kendalltau(
            q1_all_map, q2_all_map)
        q_all_kendalltau_add_t, q_all_kendalltau_add_p = kendalltau(
            q1_all_add, q2_all_add)
        q_stop_kendalltau_map_t, q_stop_kendalltau_map_p = kendalltau(
            q1_stop_map, q2_stop_map)
        q_stop_kendalltau_add_t, q_stop_kendalltau_add_p = kendalltau(
            q1_stop_add, q2_stop_add)

        # list_con.append(q1)
        # list_con.append(q2)

        # all and stop

        # list_con.append(q1_all)
        # list_con.append(q2_all)
        # list_con.append(q1_stop)
        # list_con.append(q2_stop)

        list_con.append(data.get_value(i, 'id'))
        list_con.append(data.get_value(i, 'qid1'))
        list_con.append(data.get_value(i, 'qid2'))
        list_con.append(data.get_value(i, 'question1_all'))
        list_con.append(data.get_value(i, 'question2_all'))
        list_con.append(data.get_value(i, 'question1_stop'))
        list_con.append(data.get_value(i, 'question2_stop'))
        list_con.append(data.get_value(i, 'question1'))
        list_con.append(data.get_value(i, 'question2'))
        list_con.append(data.get_value(i, 'is_duplicate'))

        list_con.append(f1)
        list_con.append(f2)
        list_con.append(f3)
        list_con.append(f4_q1)
        list_con.append(f4_q2)
        list_con.append(f4_rate)
        list_con.append(cos_dis_add)
        list_con.append(cos_dis_map)
        list_con.append(euc_dis)
        list_con.append(q_jac_map_dis)
        list_con.append(q_jac_add_dis)
        list_con.append(q_mhd_map_dis)
        list_con.append(q_pearson_map)
        list_con.append(q_pearson_add)
        list_con.append(q_spearmanr_map_t)
        list_con.append(q_spearmanr_add_t)
        list_con.append(q_spearmanr_map_p)
        list_con.append(q_spearmanr_add_p)
        list_con.append(q_kendalltau_map_p)
        list_con.append(q_kendalltau_add_p)
        list_con.append(q_kendalltau_map_t)
        list_con.append(q_kendalltau_add_t)

        # all and stop

        list_con.append(f1_all)
        list_con.append(f2_all)
        list_con.append(f3_all)
        list_con.append(f4_q1_all)
        list_con.append(f4_q2_all)
        list_con.append(f4_rate_all)
        list_con.append(f1_stop)
        list_con.append(f2_stop)
        list_con.append(f3_stop)
        list_con.append(f4_q1_stop)
        list_con.append(f4_q2_stop)
        list_con.append(f4_rate_stop)

        list_con.append(cos_dis_all_add)
        list_con.append(cos_dis_all_map)
        list_con.append(cos_dis_stop_add)
        list_con.append(cos_dis_stop_map)

        list_con.append(euc_dis_all)
        list_con.append(euc_dis_stop)

        list_con.append(q_all_jac_map_dis)
        list_con.append(q_all_jac_add_dis)
        list_con.append(q_stop_jac_map_dis)
        list_con.append(q_stop_jac_add_dis)

        list_con.append(q_all_mhd_map_dis)
        list_con.append(q_stop_mhd_map_dis)

        list_con.append(q_all_pearson_map)
        list_con.append(q_all_pearson_add)
        list_con.append(q_stop_pearson_map)
        list_con.append(q_stop_pearson_add)

        list_con.append(q_all_spearmanr_map_t)
        list_con.append(q_all_spearmanr_add_t)
        list_con.append(q_stop_spearmanr_map_t)
        list_con.append(q_stop_spearmanr_add_t)

        list_con.append(q_all_spearmanr_map_p)
        list_con.append(q_all_spearmanr_add_p)
        list_con.append(q_stop_spearmanr_map_p)
        list_con.append(q_stop_spearmanr_add_p)

        list_con.append(q_all_kendalltau_map_p)
        list_con.append(q_all_kendalltau_add_p)
        list_con.append(q_stop_kendalltau_map_p)
        list_con.append(q_stop_kendalltau_add_p)

        list_con.append(q_all_kendalltau_map_t)
        list_con.append(q_all_kendalltau_add_t)
        list_con.append(q_stop_kendalltau_map_t)
        list_con.append(q_stop_kendalltau_add_t)

        while 1:
            try:
                q.put(list_con)
                break
            except Exception:
                # retry until the queue accepts the row (e.g. when it is full)
                continue
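The q.put retry loop suggests valueCal is meant to run as a worker feeding a shared queue. A hypothetical driver, assuming q is a multiprocessing.Queue and that data and the model-vector objects from the snippet are already loaded:

from multiprocessing import Process, Queue

q = Queue()
indices = list(data.index)
n_workers = 4
chunks = [indices[i::n_workers] for i in range(n_workers)]  # split the work

workers = [Process(target=valueCal,
                   args=(q, chunk, data, modelvec, modelvec_all,
                         modelvec_stop, length, shape))
           for chunk in chunks]
for w in workers:
    w.start()

# drain the queue before joining, so workers never block on a full queue
rows = [q.get() for _ in range(len(indices))]  # one feature row per index
for w in workers:
    w.join()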
Exemple #48
0
def main():
    print "# KNN Classifier"
    parser = ld.parse_arguments()

    # print the parsed arguments
    print '\t-k = ' + str(parser.k)
    print '\t-d = ' + parser.distance

    stopwords = None
    if parser.stopwords_path:
        stopwords = ld.load_stopwords(parser.stopwords_path)

    voc = load_vocabulary(parser.train_path, stopwords)
    answers = load_answers(parser.train_path)

    train = transform(voc, parser.train_path)
    test = transform(voc, parser.test_path)

    # output file
    out_path = '../results/' + parser.distance + '_' + str(parser.k)
    out_path += '.txt'
    out_file = open(out_path, 'w')

    for point in test:
        neighbors = []
        for i in xrange(len(train)):
            neigh = train[i]
            distance = 0.0

            if parser.distance == 'cosine':
                distance = spd.cosine(neigh, point)
            elif parser.distance == 'jaccard':
                distance = spd.jaccard(neigh, point)
            elif parser.distance == 'euclidean':
                distance = spd.euclidean(neigh, point)
            elif parser.distance == 'dice':
                distance = spd.dice(neigh, point)
            elif parser.distance == 'correlation':
                distance = spd.correlation(neigh, point)
            elif parser.distance == 'manhattan':
                distance = spd.cityblock(neigh, point)
            else:
                print >> stderr, "ERROR! - Invalid distance measure."
                exit()

            tup = (distance, i)
            heapq.heappush(neighbors, tup)

        # keep the k nearest neighbours (smallest distances)
        top_k = heapq.nsmallest(parser.k, neighbors)

        # classify by majority vote among the k neighbours
        classification = np.zeros(2)
        for (_, idi) in top_k:
            classe = answers[idi]
            classification[int(classe)] += 1

        # output the classification
        if classification[0] >= classification[1]:
            print >> out_file, '0'
            print '0'
        else:
            print >> out_file, '1'
            print '1'

    # output the results
    print
    print "# Results saved to file: " + out_path
    out_file.close()
    result.result("../data/imdb_test", out_path)
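A side note on the neighbour search above: pushing every training point onto a heap and then calling nsmallest works, but heapq.nsmallest can consume an iterable directly, keeping at most k candidates in memory. A sketch of the equivalent idiom, using the cosine branch for brevity and keeping the example's Python 2 style:

import heapq

top_k = heapq.nsmallest(
    parser.k,
    ((spd.cosine(train[i], point), i) for i in xrange(len(train))))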
def simil_fun_jaccard_5(a, b):
    # binarize at 1e-5, then compare the binary activation patterns
    a_bin = a >= 10**(-5)
    b_bin = b >= 10**(-5)
    return jaccard(a_bin, b_bin)
Exemple #50
0
error_count = 0

question1_vectors = np.zeros((train_df.shape[0], 300))
for i, q in tqdm(enumerate(train_df.q1_clean.values)):
    question1_vectors[i, :] = sent2vec(q)

question2_vectors  = np.zeros((train_df.shape[0], 300))
for i, q in tqdm(enumerate(train_df.q2_clean.values)):
    question2_vectors[i, :] = sent2vec(q)
     
train_df['cosine_distance2'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

train_df['cityblock_distance2'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

train_df['jaccard_distance2'] = [jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

train_df['canberra_distance2'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

train_df['euclidean_distance2'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

train_df['minkowski_distance2'] = [minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

train_df['braycurtis_distance2'] = [braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

train_df['skew_q1vec2'] = [skew(x) for x in np.nan_to_num(question1_vectors)]
error_count = 0

question1_vectors = np.zeros((data.shape[0], 300))
for i, q in tqdm(enumerate(data.question1.values)):
    question1_vectors[i, :] = sent2vec(q)

question2_vectors  = np.zeros((data.shape[0], 300))
for i, q in tqdm(enumerate(data.question2.values)):
    question2_vectors[i, :] = sent2vec(q)

data['cosine_distance'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

data['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

data['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

data['canberra_distance'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

data['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

data['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

data['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

data['skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)]
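The np.nan_to_num guard in these feature blocks matters because sent2vec can presumably yield NaN components (for example, a sentence with no in-vocabulary words normalizes a zero vector), and SciPy's distances then return NaN for the whole pair. A minimal illustration:

import numpy as np
from scipy.spatial.distance import euclidean

v = np.full(300, np.nan)  # e.g. a sentence with no in-vocabulary words
w = np.random.rand(300)

print(euclidean(v, w))                                # nan
print(euclidean(np.nan_to_num(v), np.nan_to_num(w)))  # finite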
     
# (fragment, apparently continued from inside a per-threshold loop)
    reorder = np.array(bound_clusters['leaves'])

    plotfun.plotAggregateSignal(footprintVecs_norm_bound[reorder], labels=labels[reorder])
    plt.title('threshold %3.2f; %d bound by all' % (threshold, np.sum(np.all(boundMatrix, 0))))
    plt.savefig('%s.threshold_%3.2f.normalizedbyinsertions.bound.pdf' % (outfile, threshold))

    # plot unbound profiles
    footprintVecs_norm_unbound = np.array(
        [np.mean(footprintMats[i, unboundMatrix[i]], 0) *
         np.mean(footprintMats[unboundMatrix]) /
         np.mean(footprintMats[i, unboundMatrix[i]]) for i in range(numSamples)])
    plotfun.plotAggregateSignal(footprintVecs_norm_unbound, labels=labels)
    plt.title('threshold %3.2f; %d unbound' % (threshold, np.sum(np.all(np.logical_not(boundMatrix), 0))))
    plt.savefig('%s.threshold_%3.2f.normalizedbyinsertions.unbound.pdf' % (outfile, threshold))

    # Jaccard distance heat map over the bound/unbound calls
    distance = np.array([[1 - jaccard(boundMatrix[i], boundMatrix[j])
                          for i in range(numSamples)] for j in range(numSamples)])
    plotHeatMap(distance, rowlabels=labels, columnlabels=labels, fontSize=11, cmap='RdGy_r', vmin=0, vmax=1)
    plt.savefig('%s.jaccard.heatmap.threshold_%3.2f.pdf' % (outfile, threshold))

# plot heat maps
for i in range(numSamples):
    fig = plt.figure(figsize=(4, 10))
    sortIndx = np.argsort(strengthMatrix[i])[::-1]
    plot_heatmap_bar(footprintMats[i, sortIndx], strengthMatrix[i, sortIndx],
                     threshold=[np.sum(correlationToIdeal[i]['r'] - correlationToBackground[i]['r'] > j)
                                for j in [0, 0.05, 0.1, 0.15, 0.2]],
                     label=labels[i])
    plt.savefig('%s.footprint.%s.heatmap.pdf' % (outfile, labels[i]))

# plot bound and unbound side by side
threshold = 0.05
fig = plt.figure(figsize=(4, 10))
plot_heatmap_bar(footprintMats[i, np.all(boundMatrix, 0)], strengthMatrix[i, np.all(boundMatrix, 0)])
def jaccardratio(x, y):
    from scipy.spatial.distance import jaccard
    import numpy
    vx = numpy.array(bytearray(x))
    vy = numpy.array(bytearray(y))
    return jaccard(vx, vy)
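A hypothetical usage: the two inputs must decode to byte sequences of equal length (bytearray accepts a str under Python 2, bytes under Python 3), and since byte values are compared for inequality, strings differing in k of n positions score roughly k/n:

print(jaccardratio(b'hello', b'hellp'))  # 1 of 5 positions differs -> 0.2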
Exemple #54
0
def jaccard_similarity(row_video, col_video):
    """
	jaccard_similarity : J(X,Y) = |X∩Y| / |X∪Y|
	input result into dataframe and return that.

	"""

    similarity_result = np.zeros((len(row_video), len(col_video)))

    similarity_result = np.column_stack((row_video, similarity_result))

    temp_col = [np.nan] + col_video
    similarity_result = np.row_stack((temp_col, similarity_result))

    # load the criterion (row) videos' data into a DataFrame
    MyDB.dic_execute('select * from ROUTINE where video_num in ' +
                     str(tuple(row_video)))
    row_video_tuple = list(MyDB.dic_fetchall())

    MyDB.dic_execute('select * from EXERCISE where video_num in ' +
                     str(tuple(row_video)))
    row_video_tuple += list(MyDB.dic_fetchall())

    row_video_df = pd.DataFrame(row_video_tuple)

    # load the compared (column) videos' data into a DataFrame

    MyDB.dic_execute('select * from ROUTINE where video_num in ' +
                     str(tuple(col_video)))
    col_video_tuple = list(MyDB.dic_fetchall())

    MyDB.dic_execute('select * from EXERCISE where video_num in ' +
                     str(tuple(col_video)))
    col_video_tuple += list(MyDB.dic_fetchall())

    col_video_df = pd.DataFrame(col_video_tuple)

    # The EXERCISE and ROUTINE tables have different numbers of columns,
    # so pad both frames to the same shape before computing the Jaccard
    # distance.

    if len(row_video_df.columns) == 7:
        row_video_df.insert(loc=1,
                            column='equipment',
                            value=np.zeros((len(row_video_df), 1)))
        row_video_df.insert(loc=2,
                            column='excer_type',
                            value=np.zeros((len(row_video_df), 1)))
        row_video_df.insert(loc=7,
                            column='trainer',
                            value=np.zeros((len(row_video_df), 1)))

    if len(col_video_df.columns) == 7:
        col_video_df.insert(loc=1,
                            column='equipment',
                            value=np.zeros((len(col_video_df), 1)))
        col_video_df.insert(loc=2,
                            column='excer_type',
                            value=np.zeros((len(col_video_df), 1)))
        col_video_df.insert(loc=7,
                            column='trainer',
                            value=np.zeros((len(col_video_df), 1)))

    # convert every row_video field to a numeric code
    for i in row_video_df.columns.values:
        if i == 'length' or i == 'url' or i == 'video_num':
            continue

        # sex is 'f' or 'm', but 'm' collides with a code already used by
        # the level column, so map both sexes to dedicated codes
        elif i == 'sex':
            for j, v in row_video_df[i].items():
                if v == 'f':
                    row_video_df.loc[j, i] = 51
                elif v == 'm':
                    row_video_df.loc[j, i] = 52
                else:
                    s = convert_cate_num(v)
                    row_video_df.loc[j, i] = s
        else:
            for j, v in row_video_df[i].items():
                s = convert_cate_num(v)
                row_video_df.loc[j, i] = s

    # convert every col_video field to a numeric code
    for i in col_video_df.columns.values:
        if i == 'length' or i == 'url' or i == 'video_num':
            continue

        elif i == 'sex':
            for j, v in col_video_df[i].items():
                if v == 'f':
                    col_video_df.loc[j, i] = 51
                elif v == 'm':
                    col_video_df.loc[j, i] = 52
                else:
                    s = convert_cate_num(v)
                    col_video_df.loc[j, i] = s

        else:
            for j, v in col_video_df[i].items():
                s = convert_cate_num(v)
                col_video_df.loc[j, i] = s

    row_video_df = row_video_df.sort_values(by=['video_num'])
    col_video_df = col_video_df.sort_values(by=['video_num'])

    for i in range(len(row_video)):  # one similarity row per criterion video

        temp_row_value = row_video_df.iloc[i].values

        for j in range(len(col_video)):
            temp_col_value = col_video_df.iloc[j].values
            similarity_result[i + 1, j + 1] = 1 - distance.jaccard(
                temp_row_value, temp_col_value)

    similarity_result = pd.DataFrame(similarity_result[1:, 1:],
                                     index=similarity_result[1:, 0],
                                     columns=similarity_result[0, 1:])

    return similarity_result
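For the categorical code vectors built above, 1 - distance.jaccard is simply the fraction of matching attributes among the positions where at least one video has a nonzero code. A toy check with hypothetical code vectors:

import numpy as np
from scipy.spatial import distance

a = np.array([51, 3, 7, 0])  # e.g. sex, level, equipment, trainer codes
b = np.array([51, 4, 7, 0])
print(1 - distance.jaccard(a, b))  # 2 of 3 nonzero positions agree -> ~0.667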
def rank_jaccard(intent: np.ndarray, selection: np.ndarray) -> float:
    return float(1 - jaccard(intent, selection))
    def _sim(self, v1, v2):
        if v1.norm() == 0 or v2.norm() == 0:
            return 0.0
        return jaccard(v1, v2)
df.head(2)

nltk.download('punkt')

question1_vectors = np.zeros((df.shape[0], 300))

for i, q in enumerate(tqdm_notebook(df.question1.values)):
    question1_vectors[i, :] = sent2vec(q)
    
question2_vectors  = np.zeros((df.shape[0], 300))
for i, q in enumerate(tqdm_notebook(df.question2.values)):
    question2_vectors[i, :] = sent2vec(q)

df['cosine_distance'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df['canberra_distance'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df['skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)]
df['skew_q2vec'] = [skew(x) for x in np.nan_to_num(question2_vectors)]
df['kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(question1_vectors)]
df['kur_q2vec'] = [kurtosis(x) for x in np.nan_to_num(question2_vectors)]

df['is_duplicate'].value_counts()

df.isnull().sum()

df.drop(['question1', 'question2'], axis=1, inplace=True)
df = df[pd.notnull(df['cosine_distance'])]
def get_jaccard_distance(row):
    q1_q2_vectors = get_q1_q2_vectors(row)
    return jaccard(q1_q2_vectors[0], q1_q2_vectors[1])
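Note that SciPy's jaccard is defined for boolean vectors: applied to dense float embeddings, it reduces to the fraction of positions (among those where either vector is nonzero) with unequal values, which is close to 1.0 for almost any pair of real-valued vectors, so jaccard_distance features computed this way carry little signal. A hedged alternative sketch that compares binarized activation patterns instead:

import numpy as np
from scipy.spatial.distance import jaccard

def binary_jaccard_distance(u, v, thresh=0.0):
    # compare which components are active rather than their raw values
    return jaccard(u > thresh, v > thresh)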
Exemple #59
0
def jaccard_similarity_binconv(v0, v1):

    v0 = np.where((v0 > 0), 1, 0)
    v1 = np.where((v1 > 0), 1, 0)

    return 1.0 - spd.jaccard(v0, v1)
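A quick usage sketch: after binarization, the result is the classic Jaccard index over the sets of active positions.

v0 = np.array([0.0, 2.5, 0.0, 1.2])
v1 = np.array([0.0, 0.7, 3.1, 1.2])
print(jaccard_similarity_binconv(v0, v1))  # |{1,3}| / |{1,2,3}| -> ~0.667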
Exemple #60
0
    def jaccard(x):
        # df_array and kwargs are captured from the enclosing scope
        result = np.empty(df_array.shape[0])
        for i, row in enumerate(df_array[:, 0, :]):
            result[i] = distance.jaccard(row, x, **kwargs)
        return result
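This nested jaccard is presumably produced by a factory that captures df_array (shaped (n, 1, d)) and extra keyword arguments for distance.jaccard. A hypothetical sketch of such a wrapper, with made-up names and data:

import numpy as np
from scipy.spatial import distance

def make_jaccard(df_array, **kwargs):
    def jaccard(x):
        # one distance per stored row, measured against the query vector x
        result = np.empty(df_array.shape[0])
        for i, row in enumerate(df_array[:, 0, :]):
            result[i] = distance.jaccard(row, x, **kwargs)
        return result
    return jaccard

dist_to_query = make_jaccard(np.random.rand(10, 1, 8) > 0.5)
print(dist_to_query(np.ones(8, dtype=bool)))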