def jaccardSimilarity(term1, term2):
    """Return the Jaccard distance between two term collections.

    Each collection is reduced to its set of unique terms, the union of the
    two sets is used as the vocabulary, and both collections are encoded as
    binary presence vectors over that vocabulary before calling scipy's
    ``jaccard``.

    :param term1: iterable of hashable terms (e.g. tokens).
    :param term2: iterable of hashable terms.
    :return: Jaccard distance in [0, 1]; 0.0 means identical term sets.
    """
    term1Set = set(term1)
    term2Set = set(term2)
    unionSet = list(term1Set.union(term2Set))
    # Use lists, not ``map`` objects: scipy cannot interpret a lazy
    # Python 3 ``map`` as a vector.
    a = [1 if x in term1Set else 0 for x in unionSet]
    # BUG FIX: the second vector previously tested membership in
    # ``term1Set`` as well, making both vectors identical and the
    # returned distance always 0.
    b = [1 if x in term2Set else 0 for x in unionSet]
    return jaccard(a, b)
def take_hits(self, cat_data, mid_dict, N, randomized=False, movies=None): N = N + 1 # The most similar element is itself. #total_hits = [] if movies is None: movies = mid_dict.keys() m = len(movies) total = 0 jac_list = [] for movid in movies: jac = 0 # jaccard score of movies mov_genre = self.cat_mat[movid] if randomized: # Random test, verify the significance of results. neighbors = random.sample(xrange(1, 3417), N) else: neighbors = set(self.sim_dict[movid][1:N]) for neighbor in neighbors: n_genre = self.cat_mat[neighbor] jac = jac + (1 - jaccard(mov_genre, n_genre)) # similarity jac = jac / N # Avg Jaccard of the movie. jac_list.append(jac) # for histogram total = total + jac #total_hits.append(n) total = total / m # Avg jaccard of movies. return (total, jac_list)
def compute_jaccard(x, y, x_min=0.0, x_max=1.0, y_min=0.0, y_max=1.0,
                    warn_uneven=True, limit_tolerance=4,
                    disable_checks=False):
    """Calculate the Jaccard index (Jaccard similarity coefficient).

    The Jaccard coefficient measures similarity between sample sets, and is
    defined as the size of the intersection divided by the size of the union
    of the sample sets. The Jaccard coefficient can be calculated for a
    subset of rasters provided by using the threshold argument.

    Min and max values must be provided for both RasterLayer objects x and
    y. Method can be used with RasterLayers of any value range, but the
    defaults [0.0, 1.0] are geared towards comparing Zonation rank priority
    rasters. Limits provided are inclusive.

    :param x ndarray object.
    :param y ndarray object.
    :param x_min Numeric minimum threshold value for x to be used
                 (default 0.0).
    :param x_max Numeric maximum threshold value for x to be used
                 (default 1.0).
    :param y_min Numeric minimum threshold value for y to be used
                 (default 0.0).
    :param y_max Numeric maximum threshold value for y to be used
                 (default 1.0).
    :param warn_uneven Boolean indicating whether a warning is raised if the
                       compared raster coverages are very (>20x) uneven.
    :param limit_tolerance integer values that defines to which precision x
                           and y limits are rounded to. This helps e.g. with
                           values that close to 0 but not quite 0
                           (default: 4, i.e. round(x, 4)).
    :param disable_checks boolean indicating if the input limit values are
                          checked against the actual raster values in x
                          and y.
    :return numeric value in [0, 1].
    """
    if not disable_checks:
        assert x_min >= np.round(np.min(x), limit_tolerance), "Min threshold smaller than computed min of x"
        assert x_max <= np.round(np.max(x), limit_tolerance), "Max threshold greater than computed max of x"
        assert x_min < x_max, "Min threshold for x larger to max threshold"
        assert y_min >= np.round(np.min(y), limit_tolerance), "Min threshold smaller than computed min of y"
        assert y_max <= np.round(np.max(y), limit_tolerance), "Max threshold greater than computed max of y"
        assert y_min < y_max, "Min threshold for y larger to max threshold"
    # Get the values according to the limits provided
    x_bin = (x >= x_min) & (x <= x_max)
    y_bin = (y >= y_min) & (y <= y_max)
    if warn_uneven:
        x_size = np.sum(x_bin)
        y_size = np.sum(y_bin)
        # Sort from smaller to larger
        sizes = np.sort([x_size, y_size])
        # BUG FIX: guard the ratio against a zero-sized selection, which
        # previously triggered a NumPy divide-by-zero RuntimeWarning;
        # an empty selection is the most extreme case of unevenness.
        if sizes[0] == 0 or sizes[1] / sizes[0] > 20:
            print("WARNING: The extents of raster values above the "
                  "threshhold differ more than 20-fold: Jaccard coefficient " +
                  "may not be informative.")
    # Compute the Jaccard-Needham dissimilarity between two boolean 1-D
    # arrays and subtract from 1 to get the Jaccard index
    return 1 - jaccard(x_bin.flatten(), y_bin.flatten())
def weight_cost_func_hashtag_bow_topics(self, s, t):
    """Edge cost combining topic cosine distance (weight 0.2) with
    hashtag bag-of-words Jaccard distance (weight 0.1).

    When either hashtag vector is all-zero the Jaccard term falls back
    to the maximum distance 1 (scipy would yield NaN there).

    :param s: node dict with sparse 'hashtag_bow' and dense 'topics'.
    :param t: node dict with the same layout.
    :return: weighted scalar cost.
    """
    bow_s = numpy.array(s['hashtag_bow'].todense()).ravel()
    bow_t = numpy.array(t['hashtag_bow'].todense()).ravel()
    if bow_s.any() and bow_t.any():
        hashtag_dist = jaccard(bow_s, bow_t)
    else:
        hashtag_dist = 1
    topic_dist = cosine(s['topics'], t['topics'])
    return 0.2 * topic_dist + 0.1 * hashtag_dist
def assign_edge_weights(cls, g, dist_func, fields_with_weights={'topics': 1}):
    """Assign an aggregated distance-based cost to every edge of *g*.

    For each edge (s, t) and each field in ``fields_with_weights``, a
    distance between the two node attribute vectors is computed
    (Jaccard for 'hashtag_bow', ``dist_func`` otherwise; 1 when either
    vector is all-zero), then the per-field distances are combined as a
    weighted sum and stored under ``cls.EDGE_COST_KEY``.

    NOTE(review): mutable default argument — harmless here because the
    dict is only read, but a ``None`` default would be safer.
    Uses networkx 1.x APIs (``edges_iter``, ``g.node``).

    TODO: can be parallelized
    """
    N = g.number_of_edges()
    # One row per edge, one column per compared field.
    dists_mat = np.zeros((N, len(fields_with_weights)))
    fields, fields_weight = fields_with_weights.keys(), \
        fields_with_weights.values()
    for i, (s, t) in enumerate(g.edges_iter()):
        if i % 10000 == 0:
            logger.debug('adding edge cost: {}/{}'.format(i, N))
        for j, f in enumerate(fields):
            # Densify sparse node attributes before measuring distance.
            if issparse(g.node[s][f]):
                array1 = np.array(g.node[s][f].todense()).ravel()
            else:
                array1 = np.array(g.node[s][f])
            if issparse(g.node[t][f]):
                array2 = np.array(g.node[t][f].todense()).ravel()
            else:
                array2 = np.array(g.node[t][f])
            # at least one is all-zero
            if not array1.any() or not array2.any():
                dists_mat[i, j] = 1
            else:
                if f == 'hashtag_bow':
                    # special treatment to `hashtag_bow`
                    dists_mat[i, j] = jaccard(
                        array1, array2
                    )
                else:
                    dists_mat[i, j] = dist_func(
                        array1, array2
                    )
                assert not np.isinf(dists_mat[i, j])
    # Weighted sum of the per-field distances for every edge.
    weight_mat = np.matrix([fields_weight]).T
    dist_mat = np.abs(np.matrix(dists_mat) * weight_mat)
    # Write the aggregated cost back onto each edge.
    for i, (s, t) in enumerate(g.edges_iter()):
        g[s][t][cls.EDGE_COST_KEY] = dist_mat[i, 0]
        assert not np.isinf(g[s][t][cls.EDGE_COST_KEY]), \
            (g.node[s]['bow'].nonzero(), g.node[t]['bow'].nonzero())
    return g
def recommend_movie(movieID):
    """Score all movies in the module-level frames ``df``/``df1`` against
    ``movieID`` using a weighted Jaccard similarity, keep the top 500
    and boost each by rating statistics fetched from the REST API.

    Python 2 code (``iteritems``, ``except KeyError, e``).
    NOTE(review): no return statement is visible in this chunk — the
    result dict appears to be consumed elsewhere; confirm.
    """
    similar_movie = {}
    # Weighted combination of two Jaccard *similarities* (1 - distance):
    # 0.2 for the df features, 1.0 for the df1 features.
    for i in df.columns:
        similar_movie[i] = 0.2*(1.0 - jaccard(df[movieID],df[i])) + 1.0*(1.0 - jaccard(df1[movieID],df1[i]))
    sorted_similar_movie = dict(sorted(similar_movie.iteritems(), key=operator.itemgetter(1), reverse=True)[:500])
    #movie sorted with mean ratings and size
    mean_size = requests.get(url+'movies/mean/all').json()
    ms = pd.DataFrame(mean_size)
    ms_t = ms.T
    ms_t.drop(['movie_id'], axis = 0, inplace=True)
    ms_t.columns = ms['movie_id']
    # Boost each candidate by 0.5 * (row 0 + row 1 of the stats table);
    # presumably mean rating and rating count — TODO confirm against the
    # API response.  Movies missing from the table keep their score.
    for i in sorted_similar_movie:
        try:
            sorted_similar_movie[i] += 0.5*(ms_t[i][0] + ms_t[i][1])
        except KeyError, e:
            continue
def compute_similarity(self, arr1, arr2):
    """Similarity of two vectors according to ``self.simfcn``.

    Distance metrics are converted to similarities either through
    ``self.d_to_sim`` (cosine / pearson) or as ``1 - distance``
    (hamming / jaccard).  Unsupported names print an error and exit the
    process (Python 2 print statement).
    """
    if self.simfcn == "cosine":
        return self.d_to_sim(cosine(arr1, arr2))
    elif self.simfcn == "pearson":
        # scipy's ``correlation`` is the Pearson correlation distance.
        return self.d_to_sim(correlation(arr1, arr2))
    elif self.simfcn == "hamming":
        return 1 - hamming(arr1, arr2)
    elif self.simfcn == "jaccard":
        return 1 - jaccard(arr1, arr2)
    else:
        print "Similiarity Function Not Yet Supported"
        exit()
def reccomend(movieID): movieID = int(movieID) #requests genre genreV = requests.get(url+'movies/genreV/all').json() #create dataframe vec = pd.DataFrame(genreV) #create genre vector vec['genresV'] = vec['genresV'].str.split('|') list_movieid = list(vec['movieId']) v = vec.T v.columns = list_movieid v.drop(['movieId'], inplace=True) v_json = v.to_dict(orient='records') #v_json = v.to_dict(orient='series') #d = requests.post('http://10.10.76.125:1000/api/movies', v_json) #v_res = d.json() df = pd.DataFrame.from_dict(v_json[0], orient='index') df.drop([20], axis = 1, inplace=True) df = df.T df = df.astype(int) similar_movie = {} for i in df.columns: similar_movie[i] = 1.0 - jaccard(df[movieID],df[i]) sorted_similar_movie = dict(sorted(similar_movie.iteritems(), key=operator.itemgetter(1), reverse=True)[:500]) #movie sorted with mean ratings and size mean_size = requests.get(url+'movies/mean/all').json() ms = pd.DataFrame(mean_size) ms_t = ms.T ms_t.drop(['movie_id'], axis = 0, inplace=True) ms_t.columns = ms['movie_id'] for i in sorted_similar_movie: try: sorted_similar_movie[i] += ms_t[i][0] + ms_t[i][1] except KeyError, e: continue
def main():
    """Score every new comment against the training set (Python 2 prints).

    Builds a unigram+bigram count matrix over the module-level
    ``training_set`` and prints, for each comment in ``new_comments``,
    the Jaccard distance between its vector and every training sentence
    vector.
    """
    vectorizer = CountVectorizer(ngram_range=(1,2),max_df=1.0, min_df=0.0)
    matrix = vectorizer.fit_transform(training_set)
    for new_comment in new_comments:
        print '\n####\ncommentaire:{0}\n'.format(new_comment)
        vector = vectorizer.transform([new_comment])
        i = 0
        for vect in matrix:
            # NOTE(review): jaccard on raw (non-binary) count vectors —
            # scipy treats any unequal nonzero pair as a mismatch;
            # confirm binarisation is not intended.
            score = jaccard(vect.todense(), vector.todense())
            print 'sentence: {0}"\tscore:{1}'.format(training_set[i], score)
            i += 1
def raster_differences(raster_dataset_1, raster_dataset_2, tolerance=1e-08):
    '''
    Compares the values of two rasters given a certain treshold. The default
    tolerance value is the same as the one used by numpy.allclose.

    @param raster_dataset_1 GDAL dataset
    @param raster_dataset_2 GDAL dataset
    @param tolerance double defining the raster similarity tolerance (see
        http://docs.scipy.org/doc/numpy/reference/generated/numpy.allclose.html)

    @return differences dict holding information on the potential differences
    '''
    differences = {}
    print("INFO: Extracting bands from the first dataset...")
    band_1 = raster_dataset_1.GetRasterBand(1)
    print("INFO: Extracting bands from the second dataset...")
    band_2 = raster_dataset_2.GetRasterBand(1)
    print("INFO: Extracting band 1 from the first dataset...")
    # BUG FIX: ``numpy.float`` was a deprecated alias for the builtin
    # ``float`` and was removed in NumPy 1.24; use the builtin directly.
    data_1 = band_1.ReadAsArray(0, 0, raster_dataset_1.RasterXSize,
                                raster_dataset_1.RasterYSize).astype(float)
    print("INFO: Extracting band 1 from the second dataset...")
    data_2 = band_2.ReadAsArray(0, 0, raster_dataset_2.RasterXSize,
                                raster_dataset_2.RasterYSize).astype(float)
    print("INFO: Comparing values...")
    equal = numpy.allclose(data_1, data_2, atol=tolerance)
    if not equal:
        print("WARNING: Raster dataset values not equal at {0} tolerance".format(tolerance))
        # Summary statistics of the cell-wise difference.
        diff = data_1 - data_2
        differences['max'] = float(numpy.max(diff))
        differences['min'] = float(numpy.min(diff))
        differences['mean'] = float(numpy.mean(diff))
        differences['std'] = float(numpy.std(diff))
        #differences['quantiles'] = [float(item) for item in mquantiles(diff)]
    print("INFO: Calculating Kendall's tau statistics, this may take a while...")
    tau = kendalltau(data_1, data_2)
    differences['kendall_tau'] = (float(tau[0]), float(tau[1]))
    treshold = 0.99
    print("INFO: Calculating jaccard distance for treshold {0}".format(treshold))
    # Binary occupancy masks at the threshold, flattened for scipy.
    frac_data_1 = data_1 >= treshold
    frac_data_1 = numpy.reshape(frac_data_1, frac_data_1.size)
    frac_data_2 = data_2 >= treshold
    frac_data_2 = numpy.reshape(frac_data_2, frac_data_2.size)
    # Calculate jaccard index instead of distance
    differences['jaccard'] = (treshold,
                              float(1 - jaccard(frac_data_1, frac_data_2)))
    return differences
def sentenceSimilarity(self, sent1, sent2, typeSim, isIDF):
    """Distance between two sentences over stemmed, stop-filtered terms.

    Tokenises, stems and stop-filters both sentences, builds aligned
    term-weight vectors (plain TF or TF-IDF from a pickled global IDF
    table) and returns ``distance.cosine`` of the weighted vectors when
    ``typeSim`` is ``'cosine'``, otherwise ``distance.jaccard`` of their
    binarised forms.

    NOTE(review): despite the name, this returns a *distance*
    (0 == identical) — confirm callers expect that.

    :param sent1: first sentence (str).
    :param sent2: second sentence (str).
    :param typeSim: 'cosine' selects cosine; anything else selects jaccard.
    :param isIDF: when True, weight terms by the pickled global IDF table.
    """
    additionalStop = ['et', 'al', 'e.g.', 'http', 'etc']
    stemmer = PorterStemmer()
    # PERF FIX: materialise the stopword list once as a set instead of
    # re-reading the NLTK corpus for every token.
    stopSet = set(stopwords.words('english'))
    term1 = [stemmer.stem(t) for t in nltk.word_tokenize(sent1.lower())
             if t not in stopSet and len(t) > 1 and t not in additionalStop]
    term2 = [stemmer.stem(t) for t in nltk.word_tokenize(sent2.lower())
             if t not in stopSet and len(t) > 1 and t not in additionalStop]
    allterm = list(set(term1 + term2))
    nl1 = len(term1)
    nl2 = len(term2)
    v1 = []
    v2 = []
    # PERF/RESOURCE FIX: load the pickled IDF table once (was re-loaded —
    # and the file handle leaked — on every loop iteration).
    idf = None
    if isIDF:
        with open("../data/idf/globalidf.dat", "rb") as idf_file:
            idf = pickle.load(idf_file)
    for t in allterm:
        if isIDF:
            theIdf = idf.get(t)
            if theIdf != None:
                v1.append(term1.count(t) * theIdf)
                v2.append(term2.count(t) * theIdf)
            else:
                # Terms missing from the IDF table contribute nothing.
                v1.append(0)
                v2.append(0)
        else:
            # Term frequency normalised by sentence length.
            nt1 = term1.count(t)
            if nt1 != 0:
                v1.append(nt1 / float(nl1))
            else:
                v1.append(0)
            nt2 = term2.count(t)
            if nt2 != 0:
                v2.append(nt2 / float(nl2))
            else:
                v2.append(0)
    # jaccard transform: binarise the weighted vectors.
    jv1 = []
    jv2 = []
    for i in range(0, len(v1)):
        if v1[i] > 0:
            jv1.append(1)
        else:
            jv1.append(0)
        if v2[i] > 0:
            jv2.append(1)
        else:
            jv2.append(0)
    if typeSim == 'cosine':
        return distance.cosine(v1, v2)
    else:
        return distance.jaccard(jv1, jv2)
def test_jaccard_similarity():
    """Check tmetrics' Theano ``jaccard_similarity`` against scipy's
    ``jaccard`` on a random binary target and a rounded random
    prediction (Python 2 prints; inputs are nondeterministic).
    """
    true = np.random.binomial(n=1, p=.5, size=10).astype('float32')
    predicted = np.round(np.random.random(10))
    # Reference value from scipy.
    refscore = jaccard(true, predicted)
    yt = T.fvector('yt')
    yp = T.fvector('yp')
    f = theano.function([yt, yp], tmetrics.classification.jaccard_similarity(yt, yp), allow_input_downcast=True)
    score = f(true, predicted)
    print 'true'
    print true
    print 'predicted'
    print predicted
    print 'refscore {}'.format(refscore)
    print 'score {}'.format(score)
    assert np.allclose(refscore, score)
def compare_arand(f):
    """Compare a segmentation mask with its 'rois' counterpart.

    Loads the image at ``f`` (a ``rois2`` path) and the matching ``rois``
    image, binarises both, and returns a tuple of
    (adjusted Rand index, Jaccard index, Matthews correlation coefficient).
    """
    from sklearn import metrics
    from scipy.spatial import distance
    rois = mh.imread(f.replace('rois2', 'rois'))
    rois2 = mh.imread(f)
    # Binarise: any nonzero pixel counts as foreground.
    rois = (rois.ravel() != 0)
    rois2 = (rois2.ravel() != 0)
    arand = metrics.adjusted_rand_score(rois, rois2)
    # Note that scipy returns the Jaccard Distance, which is 1 - Jaccard Index
    # sklearn does not really implement jaccard, but an interpretation where
    # jaccard is just a synonym for accuracy.
    jaccard = 1. - distance.jaccard(rois, rois2)
    mcc = metrics.matthews_corrcoef(rois, rois2)
    return arand, jaccard, mcc
def assess_corr(df, pb, pairs):
    """Spearman correlation for each pair of rows of *df*, gated by the
    Jaccard distance of the matching presence vectors in *pb*.

    Pairs whose presence vectors are too dissimilar (Jaccard distance
    above 0.2) receive NaN for both rho and p-value; for all other pairs
    Spearman's rho/p are computed over the pair's columns after dropping
    any column containing a NaN.

    :param df: DataFrame whose index contains the pair members.
    :param pb: mapping from pair member to a boolean presence vector.
    :param pairs: iterable of index pairs.
    :return: tuple (list of rho values, list of p-values) aligned
        with *pairs*.
    """
    rhos = []
    pvals = []
    for pair in pairs:
        if jaccard(pb[pair[0]], pb[pair[1]]) > 0.2:
            rho, p = np.nan, np.nan
        else:
            subset = df.loc[list(pair)].copy().dropna(axis=1, how='any')
            rho, p = spearmanr(subset.T)
        rhos.append(rho)
        pvals.append(p)
    return (rhos, pvals)
def test_jaccard_similarity_2D():
    """2-D variant of the jaccard_similarity check: duplicates the 1-D
    vectors into two identical rows and compares tmetrics' Theano
    implementation row-wise against scipy's reference value
    (Python 2 prints; inputs are nondeterministic).
    """
    true = np.random.binomial(n=1, p=.5, size=10).astype('float32')
    predicted = np.round(np.random.random(10))
    refscore = np.asarray([jaccard(true, predicted)])
    # Stack each array on itself to build a 2-row matrix fixture.
    double = lambda x: np.concatenate([x.reshape((1, len(x))), x.reshape((1, len(x)))])
    true, predicted, refscore = tuple(double(x) for x in [true, predicted, refscore])
    yt = T.fmatrix('yt')
    yp = T.fmatrix('yp')
    f = theano.function([yt, yp], tmetrics.classification.jaccard_similarity(yt, yp), allow_input_downcast=True)
    score = f(true, predicted)
    print 'true'
    print true
    print 'predicted'
    print predicted
    print 'refscore {}'.format(refscore)
    print 'score {}'.format(score)
    assert np.allclose(refscore, score)
def _get_BMU(self,input_nparray):
    """Return the best-matching unit (closest neuron) for an input vector.

    Uses Jaccard distance when ``self.boolean`` is set, Euclidean
    (Minkowski p=2) otherwise.  Python 2 code (``dict.itervalues``).

    :param input_nparray: input vector compared against each neuron's
        ``weight_vs``.
    :return: the neuron with the smallest distance, or None if the map
        is empty.
    """
    minDist=9223372036854775807  # sentinel: max 64-bit int, acts as +infinity
    candidate= None
    for neu in self.map_neurons.itervalues():
        if self.boolean:
            cand = jaccard(input_nparray, neu.weight_vs)
            if minDist> cand:
                minDist = cand
                candidate= neu
        else:
            cand=minkowski(input_nparray, neu.weight_vs, 2)
            if minDist> cand:
                minDist = cand
                candidate= neu
    return candidate
def similarity(self, article): """ Calculate the similarity between this article and another article. """ # Compare the text vectors, # and the entity vectors. v = self.vectorize() v_ = article.vectorize() # Linearly combine the similarity values, # weighing them according to these coefficients. # [text vector, entity vector, publication date] coefs = [2, 1, 2] sim = 0 for i, vec in enumerate(v): dist = jaccard(v_[i], v[i]) # Two empty vectors returns a jaccard distance of NaN. # Set it to be 1, i.e. consider them completely different # (or, put more clearly, they have nothing in common) # FYI if jaccard runs on empty vectors, it will throw a warning. if isnan(dist): dist = 1 s = 1 - dist sim += (coefs[i] * s) # Also take publication dates into account. ideal_time = 259200 # 3 days, in seconds t, t_ = self.created_at, article.created_at # Subtract the more recent time from the earlier time. time_diff = t - t_ if t > t_ else t_ - t time_diff = time_diff.total_seconds() # Score is normalized [0, 1], where 1 is within the ideal time, # and approaches 0 the longer the difference is from the ideal time. time_score = 1 if time_diff < ideal_time else ideal_time/time_diff sim += (coefs[2] * time_score) # Normalize back to [0, 1]. return sim/sum(coefs)
def getBMU(self,input_nparray):
    """Return the best-matching unit (closest neuron) for an input vector.

    Jaccard distance when ``self.boolean`` is set, Euclidean (Minkowski
    p=2) otherwise.  Python 2 code (``dict.itervalues``).
    NOTE(review): near-duplicate of ``_get_BMU`` — consider consolidating.
    """
    minDist=9223372036854775807  # sentinel: max 64-bit int, acts as +infinity
    candidate= None
    for neu in self.map_neurons.itervalues():
        #print "input: "+str(input_nparray)
        #print "neuron: "+str (neu.weight_vs)
        if self.boolean:
            cand = jaccard(input_nparray, neu.weight_vs)
            if minDist> cand:
                minDist = cand
                candidate= neu
        else:
            cand=minkowski(input_nparray, neu.weight_vs, 2)
            if minDist> cand:
                #print "mindist:",minDist
                #print "cand:",cand
                minDist = cand
                candidate= neu
    #print "candidate'scoords",candidate.coords()
    return candidate
def region_query(self, dataset, datapoint):
    """Collect every point of *dataset* whose Jaccard distance to
    *datapoint* is within ``self.epsilon``.

    The query point itself is always the first element of the result.

    :param dataset: iterable of datapoints exposing ``data_vector``.
    :param datapoint: the query datapoint.
    :return: list of neighbours, starting with *datapoint*.
    """
    neighbourhood = [datapoint]
    neighbourhood.extend(
        candidate
        for candidate in dataset
        if candidate != datapoint
        and jaccard(datapoint.data_vector, candidate.data_vector) <= self.epsilon
    )
    return neighbourhood
def simil_fun_jaccard_4(a, b):
    """Jaccard distance between *a* and *b* after binarising at 1e-4.

    Values >= 10**-4 are treated as present.

    BUG FIX: the binarised vectors were previously computed but ignored —
    the raw inputs were passed to ``jaccard``, so the 1e-4 threshold had
    no effect.

    :param a: numeric ndarray.
    :param b: numeric ndarray of the same shape.
    :return: Jaccard distance in [0, 1] of the thresholded vectors.
    """
    a_bin = a >= 10**(-4)
    b_bin = b >= 10**(-4)
    return jaccard(a_bin, b_bin)
# print(prefix_vec[:5]) tfidf_feat = pd.DataFrame() tfidf_feat['title_prefix_cosine_distance_tfidf'] = [ cosine(x, y) for (x, y) in zip(np.nan_to_num(prefix_vec), np.nan_to_num(title_vec)) ] print('title_prefix_cosine_distance done') tfidf_feat['title_prefix_cityblock_distance_tfidf'] = [ cityblock(x, y) for (x, y) in zip(np.nan_to_num(prefix_vec), np.nan_to_num(title_vec)) ] print('title_prefix_cityblock_distance done') tfidf_feat['title_prefix_jaccard_distance_tfidf'] = [ jaccard(x, y) for (x, y) in zip(np.nan_to_num(prefix_vec), np.nan_to_num(title_vec)) ] print('title_prefix_jaccard_distance done') tfidf_feat['title_prefix_canberra_distance_tfidf'] = [ canberra(x, y) for (x, y) in zip(np.nan_to_num(prefix_vec), np.nan_to_num(title_vec)) ] print('title_prefix_canberra_distance done') tfidf_feat['title_prefix_euclidean_distance_tfidf'] = [ euclidean(x, y) for (x, y) in zip(np.nan_to_num(prefix_vec), np.nan_to_num(title_vec)) ] print('title_prefix_euclidean_distance done') tfidf_feat['title_prefix_minkowski_distance_tfidf'] = [ minkowski(x, y, 3)
#!/usr/bin/python # -*- coding: utf-8 -*- #[email protected] """ 举个例子来说电影基数非常庞大 用户看过的电影只占其中非常小的一部分 如果两个用户都没有看过某一部电影(两个都是 0) 并不能说明两者相似 反而言之,如果两个用户都看过某一部电影(序列中都是 1) 则说明用户有很大的相似度。 在这个例子中,序列中等于 1 所占的权重应该远远大于 0 的权重 这就引出下面要说的杰卡德相似系数(Jaccard similarity) """ print(__doc__) import scipy.spatial.distance as dst #添加减少0不改变相似度 s1 = [1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0] s2 = [0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0] l = len(s1) print(dst.jaccard(s1, s2))
def Jaccard(a, b):
    """Jaccard *distance* (not similarity) between vectors a and b,
    delegated to scipy."""
    return distance.jaccard(a, b)
def build_features(data):
    """Pairwise text-similarity features for word- and char-level inputs.

    Loads word/char word2vec models, computes Word Mover's Distance,
    several sentence-vector distances (cosine, cityblock, jaccard,
    canberra, euclidean, minkowski p=3, braycurtis) and per-vector
    skew/kurtosis statistics.

    NOTE(review): jaccard on continuous 300-d embeddings treats every
    unequal coordinate as a mismatch — confirm that is intended.

    :param data: DataFrame with columns word1/word2/char1/char2.
    :return: DataFrame X of numeric features aligned with ``data``.
    """
    char_model = gensim.models.KeyedVectors.load_word2vec_format(
        '../data/char_w2v.txt')
    word_model = gensim.models.KeyedVectors.load_word2vec_format(
        '../data/word_w2v.txt')
    X = pd.DataFrame()
    # Word Mover's Distance on both tokenisations.
    X['word_wmd'] = data.apply(
        lambda x: wmd(x['word1'], x['word2'], word_model), axis=1)
    X['char_wmd'] = data.apply(
        lambda x: wmd(x['char1'], x['char2'], char_model), axis=1)
    # Sentence embeddings (300-d) for each side and tokenisation.
    question1_vectors = np.zeros((data.shape[0], 300))
    for i, q in tqdm(enumerate(data.word1.values)):
        question1_vectors[i, :] = sent2vec(q, word_model)
    question2_vectors = np.zeros((data.shape[0], 300))
    for i, q in tqdm(enumerate(data.word2.values)):
        question2_vectors[i, :] = sent2vec(q, word_model)
    char_question1_vectors = np.zeros((data.shape[0], 300))
    for i, q in tqdm(enumerate(data.char1.values)):
        char_question1_vectors[i, :] = sent2vec(q, char_model)
    char_question2_vectors = np.zeros((data.shape[0], 300))
    for i, q in tqdm(enumerate(data.char2.values)):
        char_question2_vectors[i, :] = sent2vec(q, char_model)
    #
    # Vector-pair distance features (NaNs zero-filled first).
    X['cosine_distance'] = [
        cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                       np.nan_to_num(question2_vectors))
    ]
    X['cityblock_distance'] = [
        cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                          np.nan_to_num(question2_vectors))
    ]
    X['jaccard_distance'] = [
        jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                        np.nan_to_num(question2_vectors))
    ]
    X['canberra_distance'] = [
        canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                         np.nan_to_num(question2_vectors))
    ]
    X['euclidean_distance'] = [
        euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                          np.nan_to_num(question2_vectors))
    ]
    X['minkowski_distance'] = [
        minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                             np.nan_to_num(question2_vectors))
    ]
    X['braycurtis_distance'] = [
        braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                           np.nan_to_num(question2_vectors))
    ]
    # Per-vector distribution statistics.
    X['skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)]
    X['skew_q2vec'] = [skew(x) for x in np.nan_to_num(question2_vectors)]
    X['kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(question1_vectors)]
    X['kur_q2vec'] = [kurtosis(x) for x in np.nan_to_num(question2_vectors)]
    X['char_skew_q1vec'] = [
        skew(x) for x in np.nan_to_num(char_question1_vectors)
    ]
    X['char_skew_q2vec'] = [
        skew(x) for x in np.nan_to_num(char_question2_vectors)
    ]
    X['char_kur_q1vec'] = [
        kurtosis(x) for x in np.nan_to_num(char_question1_vectors)
    ]
    X['char_kur_q2vec'] = [
        kurtosis(x) for x in np.nan_to_num(char_question2_vectors)
    ]
    return X
def jaccard(self):
    """Jaccard distance between this object's target and library
    vectors (delegated to scipy)."""
    return distance.jaccard(self.target, self.library)
def scipyJaccard(u, v):
    """Jaccard *similarity* of u and v: one minus scipy's Jaccard
    distance."""
    dissimilarity = distance.jaccard(u, v)
    return 1 - dissimilarity
question2_vectors = np.zeros((train_df.shape[0], 300)) for i, q in tqdm(enumerate(train_df.question2.values)): question2_vectors[i, :] = sent2vec(q) train_df['cosine_distance'] = [ cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors)) ] train_df['cityblock_distance'] = [ cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors)) ] train_df['jaccard_distance'] = [ jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors)) ] train_df['canberra_distance'] = [ canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors)) ] train_df['euclidean_distance'] = [ euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors)) ] train_df['minkowski_distance'] = [ minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors),
lineNumber = 0 with open(filename) as f: fileData = f.read().splitlines() for line in fileData[1:]: #Note, skip first line to remove header. ubScores = line.split(",") for cellLine in range(95): totalUbiquityScore[cellLine] += float(ubScores[cellLine]) ubMat[cellLine, lineNumber] = float(ubScores[cellLine]) lineNumber += 1 from scipy.spatial import distance matJacc = np.zeros((95, 95)) for row in range(95): for col in range(95): matJacc[row, col] = distance.jaccard(mat[row, :], mat[col, :]) linked = linkage(mat, metric='jaccard') linked2 = linkage(rnaMat, metric='euclidean') linked3 = linkage(rnaMat2, metric='euclidean') linkedEuclideanOfJaccard = linkage(matJacc, metric='euclidean') #linkedReactionByCellLine = linkage(mat.transpose(), metric='jaccard'); plt.subplot(2, 2, 1) dendrogram(linked, orientation='top', distance_sort='descending', show_leaf_counts=True) plt.title(KEPT_OR_CORE) plt.subplot(2, 2, 2) dendrogram(linked2,
savefig(plot=(MD_distance_young_G.sort_values( by='Jaccard dist from the median').plot.bar( figsize=(10, 10), fontsize=7.5, title='class: young, Distance by Genes')), outfolder=outfolder, title='dist_young_Genes.png') #rxn # SD dist from median o_sd_R = MD_distance_old_R.std() y_sd_R = MD_distance_young_R.std() print(o_sd_R) print(y_sd_R) # dist between class medians R_class_dist = jaccard(median_old_R, median_young_R) print(R_class_dist) # mets # SD dist from median o_sd_M = MD_distance_old_M.std() y_sd_M = MD_distance_young_M.std() print(o_sd_M) print(y_sd_M) # dist between class medians M_class_dist = jaccard(median_old_M, median_young_M) print(M_class_dist) #genes # SD dist from median o_sd_G = MD_distance_old_G.std()
desired_organism = sys.argv[5] except IndexError: desired_organism = None if desired_organism is not None: models = [mod for mod in models if model_info[mod.split(sep)[-1].split('.')[0]][4] == desired_organism] print ' Predicting for organism : ' + desired_organism output_name = input_name + '_' + input_name2 + '_out_binary_sim_' + str(threshold) + '_' + desired_organism[:3] + '.txt' else: output_name = input_name + '_' + input_name2 + '_out_binary_sim_' + str(threshold) + '.txt' print ' Total Number of Classes : ' + str(len(models)) print ' Using TPR threshold of : ' + str(threshold) output_name = input_name + '_' + input_name2 + '_out_binary_sim_' + str(threshold) + '.txt' out_file = open(output_name, 'w') querymatrix,smiles = importQuery(input_name) prediction_results = performTargetPrediction(models) print ' Total Number of Query Molecules file 1 : ' + str(len(querymatrix)) querymatrix,smiles2 = importQuery(input_name2) prediction_results2 = performTargetPrediction(models) print ' Total Number of Query Molecules file 2 : ' + str(len(querymatrix)) sim_output = [] sim_output2 = [] for idx in range(prediction_results.shape[1]): sim_output.append(rogerstanimoto(prediction_results[:,idx],prediction_results2[:,idx])) sim_output2.append(jaccard(prediction_results[:,idx],prediction_results2[:,idx])) out_file.write('Compound Pair No.\tSmiles 1\tSmiles 2\tRogers Tanimoto\tJaccard Sim\n') for idx, comp1 in enumerate(smiles): comp2 = smiles2[idx] s = sim_output[idx] s2 = sim_output2[idx] out_file.write('\t'.join(map(str,[idx,comp1,comp2,1.0-s,1.0-s2])) + '\n') print '\n Wrote Results to: ' + output_name out_file.close()
######################## nlp_feat['title_prefix_cosine_distance_ybb'] = [ cosine(x, y) for (x, y) in zip(np.nan_to_num(prefix_vectors), np.nan_to_num(title_vectors)) ] print('title_prefix_cosine_distance done') nlp_feat['title_prefix_cityblock_distance_ybb'] = [ cityblock(x, y) for (x, y) in zip(np.nan_to_num(prefix_vectors), np.nan_to_num(title_vectors)) ] print('title_prefix_cityblock_distance done') nlp_feat['title_prefix_jaccard_distance_ybb'] = [ jaccard(x, y) for (x, y) in zip(np.nan_to_num(prefix_vectors), np.nan_to_num(title_vectors)) ] print('title_prefix_jaccard_distance done') nlp_feat['title_prefix_canberra_distance_ybb'] = [ canberra(x, y) for (x, y) in zip(np.nan_to_num(prefix_vectors), np.nan_to_num(title_vectors)) ] print('title_prefix_canberra_distance done') nlp_feat['title_prefix_euclidean_distance_ybb'] = [ euclidean(x, y) for (x, y) in zip(np.nan_to_num(prefix_vectors), np.nan_to_num(title_vectors)) ]
def jaccard_distance_index(subtree_a, subtree_b):
    """Jaccard index of two node-index collections over their covering
    range.

    Both subtrees are encoded as binary membership vectors over the
    integer range [min, max] of all indices involved, then scored as
    ``1 - jaccard`` (i.e. similarity, not distance).

    :param subtree_a: non-empty iterable of integer node indices.
    :param subtree_b: non-empty iterable of integer node indices.
    :return: Jaccard index in [0, 1]; 1.0 means identical index sets.
    """
    # PERF FIX: membership tests against plain lists were O(n) per probe;
    # use sets for O(1) lookups.  (Also fixes the 'ind_rage' typo.)
    set_a = set(subtree_a)
    set_b = set(subtree_b)
    start = min(min(set_a), min(set_b))
    ind_range = range(start, max(max(set_a), max(set_b)) + 1)
    A = [1 if i in set_a else 0 for i in ind_range]
    B = [1 if i in set_b else 0 for i in ind_range]
    return 1 - jaccard(A, B)
def jaccard_pixelwise(mask_a, mask_b, threshold=0.5):
    """Pixel-wise Jaccard similarity of two masks binarised at
    *threshold*.

    :param mask_a: numeric ndarray (any shape).
    :param mask_b: numeric ndarray of the same shape.
    :param threshold: values strictly above it count as foreground.
    :return: similarity in [0, 1]; 1.0 means identical binary masks.
    """
    binary_a = (mask_a > threshold).astype(np.uint8).flatten()
    binary_b = (mask_b > threshold).astype(np.uint8).flatten()
    return 1 - jaccard(binary_a, binary_b)
# Pairwise distance comparison of the vectors a, b, c (defined earlier).
# Euclidean Distance
e_ab = euclidean(a, b)
e_ac = euclidean(a, c)
e_bc = euclidean(b, c)
# Cosine Distance
c_ab = cosine(a, b)
c_ac = cosine(a, c)
c_bc = cosine(b, c)
# converting to boolean
a = a > 0
b = b > 0
c = c > 0
# Jaccard Distance
j_ab = jaccard(a, b)
j_ac = jaccard(a, c)
j_bc = jaccard(b, c)
# Enter distance comparison below for each pair of vectors:
print('\n\nEuclidean Distance\n ab:', e_ab, 'ac:', e_ac, 'bc:', e_bc)
print('Cosine Distance\n ab:', c_ab, 'ac:', c_ac, 'bc:', c_bc)
print('Jaccard Dissimilarity (vectors should be boolean values)\n ab:', j_ab,
      'ac:', j_ac, 'bc:', j_bc)
print('\n\nThe most appropriate distance is...Cosine Distance')
print(
    '\nThe Cosine distance is best in this scenario because \nif the angle between the two vectors is small, then they \nare closer together and therefore more similar.'
)
def get_dist_preds(self, predictions, metric):
    """Distance-weighted ensemble of per-model class predictions.

    For every model's prediction, aggregate its per-class distance to
    all other models' predictions under *metric*, then weight each
    model's class probabilities by those aggregated distances
    (``self.sdhw`` selects whether closer models get higher weight) and
    return the per-sample argmax classification.

    NOTE(review): only the first two classes are used when building
    ``class_weighted_preds_trunc`` — confirm this binary truncation is
    intentional for multi-class inputs.

    :param predictions: list of prediction objects exposing ``classes``
        and ``probabilities`` (one probability vector per class).
    :param metric: name of the pairwise distance metric to use.
    :return: np.array of predicted class labels.
    """
    new_preds = []
    classes = predictions[0].classes
    for j, pred in enumerate(predictions):
        distances = []
        # Compare this model against every other model's prediction.
        remaining_preds = predictions[:j] + predictions[j + 1:]
        for pred_ in remaining_preds:
            dist_by_class = list([0] * len(classes))
            for k, class_ in enumerate(classes):
                class_pred_ = pred_.probabilities[k]
                class_pred = pred.probabilities[k]
                if metric == 'euclid':
                    dist_by_class[k] = euclidean(class_pred_, class_pred)
                elif metric == 'cosine':
                    dist_by_class[k] = cosine(class_pred_, class_pred)
                elif metric == 'jaccard':
                    # i think this is only for boolean
                    dist_by_class[k] = jaccard(class_pred_, class_pred)
                elif metric == 'chebyshev':
                    dist_by_class[k] = chebyshev(class_pred_, class_pred)
                elif metric == 'correlation':
                    dist_by_class[k] = correlation(class_pred_, class_pred)
                elif metric == 'cityblock':
                    dist_by_class[k] = cityblock(class_pred_, class_pred)
                elif metric == 'canberra':
                    dist_by_class[k] = canberra(class_pred_, class_pred)
                elif metric == 'braycurtis':
                    dist_by_class[k] = braycurtis(class_pred_, class_pred)
                elif metric == 'hamming':
                    # i think this is only for boolean
                    dist_by_class[k] = hamming(class_pred_, class_pred)
                elif metric == 'battacharyya':
                    dist_by_class[k] = DistanceMetrics.battacharyya(
                        class_pred_, class_pred, method='continuous')
            distances += [dist_by_class]
        # distances = [[c11,c21,c31], [c12,c22,c32], ..., [c1m,c2m,c3m]] for m models
        # new_preds = [(pred, [c11+...+c1m, ..., c31+...+c3m])]
        new_preds += [(pred, [sum(i) for i in zip(*distances)])
                      ]  # (precdictions, [w1, w2, ..., wc]) for c classes
    weights = [tup[1] for tup in new_preds]
    W = [
        sum(i) for i in zip(*weights)
    ]  # total weight for each class: [sum(w1i), sum(w2i), ..., sum(wci)], sum of sums for each model i
    class_weighted_preds = []
    for i, class_ in enumerate(classes):
        class_weights = [w[i] for w in weights]
        class_pred_dist = [(np.array([l[i] for l in tup[0].probabilities]),
                            tup[1][i]) for tup in new_preds]
        if self.sdhw:  # those with lower distances have higher weight
            # sort in ascending order of aggregated distances
            preds_ascending_dist = sorted(class_pred_dist,
                                          key=lambda x: x[1])
            # weights is a list of lists containing the weights for the classes of each model
            weights_descending = sorted(class_weights, reverse=True)
            weighted_pred = sum([
                pred_tup[0] * (weights_descending[k] / W[i])
                for k, pred_tup in enumerate(preds_ascending_dist)
            ])
        else:  # those with lower distances have lower weight
            weighted_pred = sum([
                pred_tup[0] * (pred_tup[1] / W[i])
                for pred_tup in class_pred_dist
            ])
        class_weighted_preds += [weighted_pred]
    # Only the first two classes participate in the final argmax.
    class_weighted_preds_trunc = np.array(
        [[class_weighted_preds[0][i], class_weighted_preds[1][i]]
         for i in range(len(class_weighted_preds[0]))])
    indices_max_proba = class_weighted_preds_trunc.argmax(axis=1)
    classifications = np.array([classes[i] for i in indices_max_proba])
    return classifications
# Report run configuration (Python 2 print statements).
print ' Total Number of Classes : ' + str(len(models))
print ' Using TPR threshold of : ' + str(threshold)
output_name = input_name + '_' + input_name2 + '_out_binary_sim_' + str(
    threshold) + '.txt'
out_file = open(output_name, 'w')
# Predict target-binding profiles for both input files.
querymatrix, smiles, ids = importQuery(input_name)
prediction_results = performTargetPrediction(models)
print ' Total Number of Query Molecules file 1 : ' + str(len(querymatrix))
querymatrix, smiles2, ids2 = importQuery(input_name2)
prediction_results2 = performTargetPrediction(models)
print ' Total Number of Query Molecules file 2 : ' + str(len(querymatrix))
sim_output = []
sim_output2 = []
# Column-wise distances between the two prediction matrices (one column
# per compound pair); written out as similarities (1 - distance) below.
for idx in range(prediction_results.shape[1]):
    sim_output.append(
        rogerstanimoto(prediction_results[:, idx],
                       prediction_results2[:, idx]))
    sim_output2.append(
        jaccard(prediction_results[:, idx], prediction_results2[:, idx]))
out_file.write(
    'Compound Pair No.\tSmiles 1\tSmiles 2\tRogers Tanimoto\tJaccard Sim\n'
)
for idx, comp1 in enumerate(ids):
    comp2 = ids2[idx]
    s = sim_output[idx]
    s2 = sim_output2[idx]
    out_file.write(
        '\t'.join(map(str, [idx, comp1, comp2, 1.0 - s, 1.0 - s2])) + '\n')
print '\n Wrote Results to: ' + output_name
out_file.close()
def disc_error(self, x):
    """Stochastic discrepancy between sampled hidden states.

    Samples binary hidden states from the data-driven activation
    ``sigmoid(x.w + b)`` and from the stored activation ``self.ph``,
    and returns the Jaccard distance between the two samples.

    NOTE: nondeterministic — both states are Bernoulli samples drawn
    from ``np.random``.
    """
    ph = self.sigmoid(x.dot(self.w)+self.b)
    hstates = np.random.binomial(1, ph)
    act_hstates = np.random.binomial(1, self.ph)
    return dist.jaccard(hstates, act_hstates)
def cal_distance_bool(fps):
    """Jaccard distance between a pair of boolean fingerprints.

    :param fps: iterable holding exactly the two vectors to compare.
    :return: scipy's Jaccard distance in [0, 1].
    """
    return distance.jaccard(*fps)
def jaccard_distance(a, b):
    """Thin wrapper around scipy's Jaccard distance for two vectors."""
    return distance.jaccard(a, b)
def add_word2vec_features(self, model_path, model_name='w2v', vector_size=300):
    """Add word2vec-based pair features to ``self.df``.

    Adds two Word Mover's Distance columns (one on length-normalized
    vectors, one on raw vectors), a battery of vector-pair distance and
    moment features, and the raw per-dimension sentence vectors.

    :param model_path: path to a binary word2vec model loadable by gensim.
    :param model_name: prefix for every generated column name.
    :param vector_size: dimensionality of the word vectors (default 300).

    NOTE: word2vec features require a lot of RAM to be computed.
    """
    # Load model and compute Word Mover's Distance.
    # init_sims(replace=True) L2-normalizes the vectors in place, so this
    # first pass produces the *normalized* WMD column.
    self.w2c_model = gensim.models.KeyedVectors.load_word2vec_format(
        model_path, binary=True)
    self.w2c_model.init_sims(replace=True)
    self.df['{}_norm_wmd'.format(model_name)] = self.df.apply(
        lambda x: self.word_mover_distance(x['question1'], x['question2']),
        axis=1)
    # Reload the model to restore the raw (unnormalized) vectors before
    # computing the plain WMD column.
    self.w2c_model = gensim.models.KeyedVectors.load_word2vec_format(
        model_path, binary=True)
    self.df['{}_wmd'.format(model_name)] = self.df.apply(
        lambda x: self.word_mover_distance(x['question1'], x['question2']),
        axis=1)
    # Generate vectors from questions
    question1_vectors = np.zeros((self.df.shape[0], vector_size))
    question2_vectors = np.zeros((self.df.shape[0], vector_size))
    j = 0  # positional row counter — df index labels need not be 0..n-1
    for i, row in self.df.iterrows():
        question1_vectors[j, :] = self.text2vec(row[self.q1_column])
        question2_vectors[j, :] = self.text2vec(row[self.q2_column])
        j += 1
    self.w2c_model = None  # Save up some RAM
    # Compute several features using vectors.
    # nan_to_num guards against sentences that produced no embedding.
    self.df['{}_cosine_distance'.format(model_name)] = [
        cosine(x, y)
        for (x, y) in zip(np.nan_to_num(question1_vectors),
                          np.nan_to_num(question2_vectors))
    ]
    self.df['{}_cityblock_distance'.format(model_name)] = [
        cityblock(x, y)
        for (x, y) in zip(np.nan_to_num(question1_vectors),
                          np.nan_to_num(question2_vectors))
    ]
    self.df['{}_jaccard_distance'.format(model_name)] = [
        jaccard(x, y)
        for (x, y) in zip(np.nan_to_num(question1_vectors),
                          np.nan_to_num(question2_vectors))
    ]
    self.df['{}_canberra_distance'.format(model_name)] = [
        canberra(x, y)
        for (x, y) in zip(np.nan_to_num(question1_vectors),
                          np.nan_to_num(question2_vectors))
    ]
    self.df['{}_euclidean_distance'.format(model_name)] = [
        euclidean(x, y)
        for (x, y) in zip(np.nan_to_num(question1_vectors),
                          np.nan_to_num(question2_vectors))
    ]
    # Minkowski with p=3.
    self.df['{}_minkowski_distance'.format(model_name)] = [
        minkowski(x, y, 3)
        for (x, y) in zip(np.nan_to_num(question1_vectors),
                          np.nan_to_num(question2_vectors))
    ]
    self.df['{}_braycurtis_distance'.format(model_name)] = [
        braycurtis(x, y)
        for (x, y) in zip(np.nan_to_num(question1_vectors),
                          np.nan_to_num(question2_vectors))
    ]
    # Per-vector distribution moments.
    self.df['{}_skew_q1vec'.format(model_name)] = [
        skew(x) for x in np.nan_to_num(question1_vectors)
    ]
    self.df['{}_skew_q2vec'.format(model_name)] = [
        skew(x) for x in np.nan_to_num(question2_vectors)
    ]
    self.df['{}_kur_q1vec'.format(model_name)] = [
        kurtosis(x) for x in np.nan_to_num(question1_vectors)
    ]
    self.df['{}_kur_q2vec'.format(model_name)] = [
        kurtosis(x) for x in np.nan_to_num(question2_vectors)
    ]
    # Add word vectors as features — one column per embedding dimension.
    for i in xrange(vector_size):
        self.df['{}_q1vec_{}'.format(model_name, i)] = question1_vectors[:, i]
        self.df['{}_q2vec_{}'.format(model_name, i)] = question2_vectors[:, i]
def sim_bin(s_vector, t_vector):
    """Jaccard similarity (1 - Jaccard distance) between two binary vectors."""
    jac_dist = distance.jaccard(s_vector, t_vector)
    return 1 - jac_dist


# cosine similarity between two weighted vectors
def sim_cosine(s_vector, t_vector):
    """Cosine similarity (1 - cosine distance) between two weighted vectors."""
    cos_dist = distance.cosine(s_vector, t_vector)
    return 1 - cos_dist
return v / np.sqrt((v**2).sum()) else: return np.zeros(300) #1*300 w2v_q1 = np.array([sent2vec(q, model) for q in data.question1]) w2v_q2 = np.array([sent2vec(q, model) for q in data.question2]) from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis data['cosine_distance'] = [cosine(x, y) for (x, y) in zip(w2v_q1, w2v_q2)] #x y 1-D array data['cityblock_distance'] = [ cityblock(x, y) for (x, y) in zip(w2v_q1, w2v_q2) ] data['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(w2v_q1, w2v_q2)] data['canberra_distance'] = [canberra(x, y) for (x, y) in zip(w2v_q1, w2v_q2)] data['euclidean_distance'] = [ euclidean(x, y) for (x, y) in zip(w2v_q1, w2v_q2) ] data['minkowski_distance'] = [ minkowski(x, y, 3) for (x, y) in zip(w2v_q1, w2v_q2) ] data['braycurtis_distance'] = [ braycurtis(x, y) for (x, y) in zip(w2v_q1, w2v_q2) ] fs4_1 = [ 'cosine_distance', 'cityblock_distance', 'jaccard_distance', 'canberra_distance', 'euclidean_distance', 'minkowski_distance', 'braycurtis_distance'
def jaccard(u: np.ndarray, v: np.ndarray) -> float:
    """Jaccard distance between vectors *u* and *v* (scipy definition)."""
    jaccard_dist = distance.jaccard(u, v)
    return jaccard_dist
def jaccard_distance_index(subtree_a, subtree_b):
    """Jaccard similarity index |A∩B| / |A∪B| of two node-id collections.

    Builds 0/1 indicator vectors over the integer range covering both
    inputs and returns 1 - scipy's Jaccard distance on them.

    Fixes: membership tests now use sets (the original scanned the input
    lists for every index — O(n) per test, quadratic overall) and the
    ``ind_rage`` typo is corrected.

    :param subtree_a: non-empty iterable of integer node ids.
    :param subtree_b: non-empty iterable of integer node ids.
    :raises ValueError: if either input is empty (min/max of empty set).
    """
    set_a = set(subtree_a)
    set_b = set(subtree_b)
    start = min(min(set_a), min(set_b))
    ind_range = range(start, max(max(set_a), max(set_b)) + 1)
    A = [1 if i in set_a else 0 for i in ind_range]
    B = [1 if i in set_b else 0 for i in ind_range]
    return 1 - jaccard(A, B)
def jaccard_distance(a, b):
    """Print and return the Jaccard distance between a and b.

    Fix: the original only printed the value and implicitly returned
    None, so callers could not use the result. The single-argument
    print form below is valid under both Python 2 and Python 3.
    """
    result = distance.jaccard(a, b)
    print(result)
    return result
def _sentence_vectors(data, embed):
    # Embed both sides of each "\001"-separated sentence pair into 300-d
    # vectors with the given embedder; NaNs are zeroed for the distance calls.
    vecs1 = np.zeros((data.shape[0], 300))
    for i, sents in tqdm(enumerate(data.sentences.values)):
        vecs1[i, :] = embed(sents.split("\001")[0])
    vecs2 = np.zeros((data.shape[0], 300))
    for i, sents in tqdm(enumerate(data.sentences.values)):
        vecs2[i, :] = embed(sents.split("\001")[1])
    return np.nan_to_num(vecs1), np.nan_to_num(vecs2)


def _add_vector_pair_features(data, vecs1, vecs2, suffix):
    # Distance, correlation, kernel and moment features between the paired
    # sentence vectors; every column name gets the section suffix appended.
    pairs = list(zip(vecs1, vecs2))
    data['cityblock_distance' + suffix] = [cityblock(x, y) for x, y in pairs]
    data['jaccard_distance' + suffix] = [jaccard(x, y) for x, y in pairs]
    data['cosine_distance' + suffix] = [cosine(x, y) for x, y in pairs]
    data['canberra_distance' + suffix] = [canberra(x, y) for x, y in pairs]
    data['euclidean_distance' + suffix] = [euclidean(x, y) for x, y in pairs]
    data['braycurtis_distance' + suffix] = [braycurtis(x, y) for x, y in pairs]
    data['minkowski_distance' + suffix] = [
        minkowski(x, y, 3) for x, y in pairs
    ]
    data['pearson_coff' + suffix] = [
        scipy.stats.pearsonr(x, y)[0] for x, y in pairs
    ]
    data['spearman_coff' + suffix] = [
        scipy.stats.spearmanr(x, y)[0] for x, y in pairs
    ]
    data['kendalltau_coff' + suffix] = [
        scipy.stats.kendalltau(x, y)[0] for x, y in pairs
    ]
    data['polynomial_kernel' + suffix] = [
        polynomial_kernel(x.reshape(1, -1), y.reshape(1, -1))[0][0]
        for x, y in pairs
    ]
    # NOTE(review): the (-1, 1) reshape below feeds 300 one-dimensional samples
    # to the kernel and keeps only entry [0][0] — preserved verbatim from the
    # original code; confirm it is intentional before "fixing" it.
    data['sigmoid_kernel' + suffix] = [
        sigmoid_kernel(x.reshape(-1, 1), y.reshape(-1, 1))[0][0]
        for x, y in pairs
    ]
    data['rbf_kernel' + suffix] = [
        rbf_kernel(x.reshape(-1, 1), y.reshape(-1, 1))[0][0]
        for x, y in pairs
    ]
    data['laplacian_kernel' + suffix] = [
        laplacian_kernel(x.reshape(-1, 1), y.reshape(-1, 1))[0][0]
        for x, y in pairs
    ]
    data['skew_s1vec' + suffix] = [skew(x) for x in vecs1]
    data['skew_s2vec' + suffix] = [skew(x) for x in vecs2]
    data['kur_s1vec' + suffix] = [kurtosis(x) for x in vecs1]
    data['kur_s2vec' + suffix] = [kurtosis(x) for x in vecs2]


def generate_feature(data):
    """Add all pairwise-sentence features to *data* and return it.

    Feature groups: length stats, fuzzywuzzy ratios, three word2vec
    sections (sent2vec, sent2vec_ave, sent2vec_ave_idf — suffixes '',
    '_ave', '_ave_idf'), common-sequence features, bag-of-words features
    and n-gram overlaps.

    Fix: the original stored the idf-section laplacian kernel under
    'laplacian_kernel_ave', overwriting the '_ave' feature and never
    creating 'laplacian_kernel_ave_idf'; the shared helper now names
    every idf-section column consistently.
    """
    # --- length of sentence ---
    data['len_word_s1'] = data.apply(
        lambda x: len(x['sentences'].split("\001")[0].split(" ")), axis=1)
    data['len_word_s2'] = data.apply(
        lambda x: len(x['sentences'].split("\001")[1].split(" ")), axis=1)
    data['len_ratio'] = data.apply(lambda x: len_ratio(x['sentences']), axis=1)
    data['len_char_s1'] = data.apply(
        lambda x: len(''.join(x['sentences'].split("\001")[0].split(" "))),
        axis=1)
    data['len_char_s2'] = data.apply(
        lambda x: len(''.join(x['sentences'].split("\001")[1].split(" "))),
        axis=1)
    # --- fuzzywuzzy features ---
    data['fuzz_QRatio'] = data.apply(lambda x: fuzz_QRatio(x['sentences']),
                                     axis=1)
    data['fuzz_WRatio'] = data.apply(lambda x: fuzz_WRatio(x['sentences']),
                                     axis=1)
    data['fuzz_partial_ratio'] = data.apply(
        lambda x: fuzz_partial_ratio(x['sentences']), axis=1)
    data['fuzz_partial_token_set_ratio'] = data.apply(
        lambda x: fuzz_partial_token_set_ratio(x['sentences']), axis=1)
    data['fuzz_partial_token_sort_ratio'] = data.apply(
        lambda x: fuzz_partial_token_sort_ratio(x['sentences']), axis=1)
    data['fuzz_token_set_ratio'] = data.apply(
        lambda x: fuzz_token_set_ratio(x['sentences']), axis=1)
    data['fuzz_token_sort_ratio'] = data.apply(
        lambda x: fuzz_token_sort_ratio(x['sentences']), axis=1)
    # --- word2vec features: plain, averaged, idf-weighted average ---
    for suffix, embed in (('', sent2vec), ('_ave', sent2vec_ave),
                          ('_ave_idf', sent2vec_ave_idf)):
        vecs1, vecs2 = _sentence_vectors(data, embed)
        _add_vector_pair_features(data, vecs1, vecs2, suffix)
    # --- sequence features ---
    data['long_common_sequence'] = data.apply(
        lambda x: long_common_sequence(x['sentences']), axis=1)
    data['long_common_prefix'] = data.apply(
        lambda x: long_common_prefix(x['sentences']), axis=1)
    data['long_common_suffix'] = data.apply(
        lambda x: long_common_suffix(x['sentences']), axis=1)
    data['long_common_substring'] = data.apply(
        lambda x: long_common_substring(x['sentences']), axis=1)
    data['levenshtein_distance'] = data.apply(
        lambda x: levenshtein_distance(x['sentences']), axis=1)
    # --- other features ---
    data['has_no_word'] = data.apply(lambda x: has_no_word(x['sentences']),
                                     axis=1)
    data['bow'] = data.apply(lambda x: bag_of_words(x['sentences']), axis=1)
    data['bow_tfidf'] = data.apply(
        lambda x: bag_of_words_tfidf(x['sentences']), axis=1)
    data['lcs_diff'] = data.apply(lambda x: lcs_diff(x['sentences']), axis=1)
    # --- n-gram overlap ---
    data['1-gramoverlap'] = data.apply(
        lambda x: n_gram_over_lap(x['sentences'], 1), axis=1)
    data['2-gramoverlap'] = data.apply(
        lambda x: n_gram_over_lap(x['sentences'], 2), axis=1)
    data['3-gramoverlap'] = data.apply(
        lambda x: n_gram_over_lap(x['sentences'], 3), axis=1)
    data['2-gramoverlap_char'] = data.apply(
        lambda x: n_gram_over_lap_char(x['sentences'], 2), axis=1)
    data['3-gramoverlap_char'] = data.apply(
        lambda x: n_gram_over_lap_char(x['sentences'], 3), axis=1)
    data['4-gramoverlap_char'] = data.apply(
        lambda x: n_gram_over_lap_char(x['sentences'], 4), axis=1)
    data['5-gramoverlap_char'] = data.apply(
        lambda x: n_gram_over_lap_char(x['sentences'], 5), axis=1)
    return data
def valueCal(q, index_list, data, modelvec, modelvec_all, modelvec_stop,
             length, shape):
    """Worker: build a feature row per pair id in index_list and push it to q.

    For each row it computes set-overlap features, cosine/euclidean
    distances, and jaccard / mahalanobis / pearson / spearman / kendall
    statistics for three tokenizations (plain, 'all', 'stop'), then puts
    the assembled list on the multiprocessing queue *q*.

    NOTE(review): the f4_rate divisions are integer divisions under
    Python 2 and raise ZeroDivisionError when the second question is
    empty — confirm callers guarantee non-empty questions.
    """
    for i in index_list:
        list_con = []
        q1 = str(data.get_value(i, 'question1')).split()
        q2 = str(data.get_value(i, 'question2')).split()
        # token-overlap features on the plain tokenization
        f1 = getF1_union(q1, q2)
        f2 = getF2_inter(q1, q2)
        f3 = getF3_sum(q1, q2)
        f4_q1 = len(q1)
        f4_q2 = len(q2)
        f4_rate = f4_q1 / f4_q2
        # q1/q2 are rebound from token lists to embedding matrices here
        q1 = getVect(q1, modelvec, length, shape)
        q2 = getVect(q2, modelvec, length, shape)
        cos_dis_add = getCosDis_add(q1, q2)
        cos_dis_map = getCosDis_map(q1, q2)
        euc_dis = getEucDis(q1, q2)
        # all and stop
        q1_all = str(data.get_value(i, 'question1_all')).split()
        q2_all = str(data.get_value(i, 'question2_all')).split()
        q1_stop = str(data.get_value(i, 'question1_stop')).split()
        q2_stop = str(data.get_value(i, 'question2_stop')).split()
        f1_all = getF1_union(q1_all, q2_all)
        f2_all = getF2_inter(q1_all, q2_all)
        f3_all = getF3_sum(q1_all, q2_all)
        f4_q1_all = len(q1_all)
        f4_q2_all = len(q2_all)
        f4_rate_all = f4_q1_all / f4_q2_all
        f1_stop = getF1_union(q1_stop, q2_stop)
        f2_stop = getF2_inter(q1_stop, q2_stop)
        f3_stop = getF3_sum(q1_stop, q2_stop)
        f4_q1_stop = len(q1_stop)
        f4_q2_stop = len(q2_stop)
        f4_rate_stop = f4_q1_stop / f4_q2_stop
        q1_all = getVect(q1_all, modelvec_all, length, shape)
        q2_all = getVect(q2_all, modelvec_all, length, shape)
        q1_stop = getVect(q1_stop, modelvec_stop, length, shape)
        q2_stop = getVect(q2_stop, modelvec_stop, length, shape)
        cos_dis_all_add = getCosDis_add(q1_all, q2_all)
        cos_dis_all_map = getCosDis_map(q1_all, q2_all)
        cos_dis_stop_add = getCosDis_add(q1_stop, q2_stop)
        cos_dis_stop_map = getCosDis_map(q1_stop, q2_stop)
        euc_dis_all = getEucDis(q1_all, q2_all)
        euc_dis_stop = getEucDis(q1_stop, q2_stop)
        # pooled sentence vectors (element-wise map vs additive pooling)
        q1_map = vecmap(q1)
        q2_map = vecmap(q2)
        q1_add = vecadd(q1)
        q2_add = vecadd(q2)
        q_jac_map_dis = distance.jaccard(q1_map, q2_map)
        q_jac_add_dis = distance.jaccard(q1_add, q2_add)
        # pinv of the 2-sample covariance keeps mahalanobis defined even
        # when the covariance matrix is singular
        q_mhd_map_dis = distance.mahalanobis(
            q1_map, q2_map,
            np.linalg.pinv(np.cov(np.vstack((q1_map, q2_map)).T)))
        q_pearson_map = cal_pearson(q1_map, q2_map)
        q_pearson_add = cal_pearson(q1_add, q2_add)
        q_spearmanr_map_t, q_spearmanr_map_p = spearmanr(q1_map, q2_map)
        q_spearmanr_add_t, q_spearmanr_add_p = spearmanr(q1_add, q2_add)
        q_kendalltau_map_t, q_kendalltau_map_p = kendalltau(q1_map, q2_map)
        q_kendalltau_add_t, q_kendalltau_add_p = kendalltau(q1_add, q2_add)
        # all and stop
        q1_all_map = vecmap(q1_all)
        q2_all_map = vecmap(q2_all)
        q1_all_add = vecadd(q1_all)
        q2_all_add = vecadd(q2_all)
        q1_stop_map = vecmap(q1_stop)
        q2_stop_map = vecmap(q2_stop)
        q1_stop_add = vecadd(q1_stop)
        q2_stop_add = vecadd(q2_stop)
        q_all_jac_map_dis = distance.jaccard(q1_all_map, q2_all_map)
        q_all_jac_add_dis = distance.jaccard(q1_all_add, q2_all_add)
        q_stop_jac_map_dis = distance.jaccard(q1_stop_map, q2_stop_map)
        q_stop_jac_add_dis = distance.jaccard(q1_stop_add, q2_stop_add)
        q_all_mhd_map_dis = distance.mahalanobis(
            q1_all_map, q2_all_map,
            np.linalg.pinv(np.cov(np.vstack((q1_all_map, q2_all_map)).T)))
        q_stop_mhd_map_dis = distance.mahalanobis(
            q1_stop_map, q2_stop_map,
            np.linalg.pinv(np.cov(np.vstack((q1_stop_map, q2_stop_map)).T)))
        q_all_pearson_map = cal_pearson(q1_all_map, q2_all_map)
        q_all_pearson_add = cal_pearson(q1_all_add, q2_all_add)
        q_stop_pearson_map = cal_pearson(q1_stop_map, q2_stop_map)
        q_stop_pearson_add = cal_pearson(q1_stop_add, q2_stop_add)
        q_all_spearmanr_map_t, q_all_spearmanr_map_p = spearmanr(
            q1_all_map, q2_all_map)
        q_all_spearmanr_add_t, q_all_spearmanr_add_p = spearmanr(
            q1_all_add, q2_all_add)
        q_stop_spearmanr_map_t, q_stop_spearmanr_map_p = spearmanr(
            q1_stop_map, q2_stop_map)
        q_stop_spearmanr_add_t, q_stop_spearmanr_add_p = spearmanr(
            q1_stop_add, q2_stop_add)
        q_all_kendalltau_map_t, q_all_kendalltau_map_p = kendalltau(
            q1_all_map, q2_all_map)
        q_all_kendalltau_add_t, q_all_kendalltau_add_p = kendalltau(
            q1_all_add, q2_all_add)
        q_stop_kendalltau_map_t, q_stop_kendalltau_map_p = kendalltau(
            q1_stop_map, q2_stop_map)
        q_stop_kendalltau_add_t, q_stop_kendalltau_add_p = kendalltau(
            q1_stop_add, q2_stop_add)
        # Assemble the output row. The append order below defines the
        # column order consumed downstream — do not reorder.
        # list_con.append(q1)
        # list_con.append(q2)
        # all and stop
        # list_con.append(q1_all)
        # list_con.append(q2_all)
        # list_con.append(q1_stop)
        # list_con.append(q2_stop)
        list_con.append(data.get_value(i, 'id'))
        list_con.append(data.get_value(i, 'qid1'))
        list_con.append(data.get_value(i, 'qid2'))
        list_con.append(data.get_value(i, 'question1_all'))
        list_con.append(data.get_value(i, 'question2_all'))
        list_con.append(data.get_value(i, 'question1_stop'))
        list_con.append(data.get_value(i, 'question2_stop'))
        list_con.append(data.get_value(i, 'question1'))
        list_con.append(data.get_value(i, 'question2'))
        list_con.append(data.get_value(i, 'is_duplicate'))
        list_con.append(f1)
        list_con.append(f2)
        list_con.append(f3)
        list_con.append(f4_q1)
        list_con.append(f4_q2)
        list_con.append(f4_rate)
        list_con.append(cos_dis_add)
        list_con.append(cos_dis_map)
        list_con.append(euc_dis)
        list_con.append(q_jac_map_dis)
        list_con.append(q_jac_add_dis)
        list_con.append(q_mhd_map_dis)
        list_con.append(q_pearson_map)
        list_con.append(q_pearson_add)
        list_con.append(q_spearmanr_map_t)
        list_con.append(q_spearmanr_add_t)
        list_con.append(q_spearmanr_map_p)
        list_con.append(q_spearmanr_add_p)
        list_con.append(q_kendalltau_map_p)
        list_con.append(q_kendalltau_add_p)
        list_con.append(q_kendalltau_map_t)
        list_con.append(q_kendalltau_add_t)
        # all and stop
        list_con.append(f1_all)
        list_con.append(f2_all)
        list_con.append(f3_all)
        list_con.append(f4_q1_all)
        list_con.append(f4_q2_all)
        list_con.append(f4_rate_all)
        list_con.append(f1_stop)
        list_con.append(f2_stop)
        list_con.append(f3_stop)
        list_con.append(f4_q1_stop)
        list_con.append(f4_q2_stop)
        list_con.append(f4_rate_stop)
        list_con.append(cos_dis_all_add)
        list_con.append(cos_dis_all_map)
        list_con.append(cos_dis_stop_add)
        list_con.append(cos_dis_stop_map)
        list_con.append(euc_dis_all)
        list_con.append(euc_dis_stop)
        list_con.append(q_all_jac_map_dis)
        list_con.append(q_all_jac_add_dis)
        list_con.append(q_stop_jac_map_dis)
        list_con.append(q_stop_jac_add_dis)
        list_con.append(q_all_mhd_map_dis)
        list_con.append(q_stop_mhd_map_dis)
        list_con.append(q_all_pearson_map)
        list_con.append(q_all_pearson_add)
        list_con.append(q_stop_pearson_map)
        list_con.append(q_stop_pearson_add)
        list_con.append(q_all_spearmanr_map_t)
        list_con.append(q_all_spearmanr_add_t)
        list_con.append(q_stop_spearmanr_map_t)
        list_con.append(q_stop_spearmanr_add_t)
        list_con.append(q_all_spearmanr_map_p)
        list_con.append(q_all_spearmanr_add_p)
        list_con.append(q_stop_spearmanr_map_p)
        list_con.append(q_stop_spearmanr_add_p)
        list_con.append(q_all_kendalltau_map_p)
        list_con.append(q_all_kendalltau_add_p)
        list_con.append(q_stop_kendalltau_map_p)
        list_con.append(q_stop_kendalltau_add_p)
        list_con.append(q_all_kendalltau_map_t)
        list_con.append(q_all_kendalltau_add_t)
        list_con.append(q_stop_kendalltau_map_t)
        list_con.append(q_stop_kendalltau_add_t)
        # Retry until the queue accepts the row.
        # NOTE(review): the bare except also swallows KeyboardInterrupt and
        # can spin forever if q is permanently broken — consider narrowing
        # to queue.Full.
        while 1:
            try:
                q.put(list_con)
                break
            except:
                continue
def main():
    """KNN classifier driver (Python 2 script).

    Loads vocabulary/answers from the training path, transforms train and
    test sets, classifies each test point by its k nearest neighbors under
    the CLI-selected distance, and writes 0/1 predictions to a results file.
    """
    print "# KNN Classifier"
    parser = ld.parse_arguments()
    # printing args
    print '\t-k = ' + str(parser.k)
    print '\t-d = ' + parser.distance
    stopwords = None
    if parser.stopwords_path:
        stopwords = ld.load_stopwords(parser.stopwords_path)
    voc = load_vocabulary(parser.train_path, stopwords)
    answers = load_answers(parser.train_path)
    train = transform(voc, parser.train_path)
    test = transform(voc, parser.test_path)
    # output file
    out_path = '../results/' + parser.distance + '_' + str(parser.k)
    out_path += '.txt'
    out_file = open(out_path, 'w')
    for point in test:
        # min-heap of (distance, train-index) pairs for this test point
        neighbors = []
        for i in xrange(len(train)):
            neigh = train[i]
            distance = 0.0
            if parser.distance == 'cosine':
                distance = spd.cosine(neigh, point)
            elif parser.distance == 'jaccard':
                distance = spd.jaccard(neigh, point)
            elif parser.distance == 'euclidean':
                distance = spd.euclidean(neigh, point)
            elif parser.distance == 'dice':
                distance = spd.dice(neigh, point)
            elif parser.distance == 'correlation':
                distance = spd.correlation(neigh, point)
            elif parser.distance == 'manhattan':
                distance = spd.cityblock(neigh, point)
            else:
                print >> stderr, "ERRO! - Distância informada inválida."
                exit()
            tup = (distance, i)
            heapq.heappush(neighbors, tup)
        # take the k points with the smallest distances (most similar)
        top_k = heapq.nsmallest(parser.k, neighbors)
        # classifying: majority vote over the neighbors' labels (binary)
        classification = np.zeros(2)
        for (_, idi) in top_k:
            classe = answers[idi]
            classification[int(classe)] += 1
        # outputting classification (ties go to class 0)
        if(classification[0] >= classification[1]):
            print >> out_file, '0'
            print '0'
        else:
            print >> out_file, '1'
            print '1'
    # outputting the results
    print
    print "# Resultados salvos no arquivo: " + out_path
    out_file.close()
    result.result("../data/imdb_test", out_path)
def simil_fun_jaccard_5(a, b):
    """Jaccard distance between *a* and *b* binarized at threshold 1e-5.

    Fix: the original computed ``a_bin``/``b_bin`` but then returned
    ``jaccard(a, b)`` on the raw values, so the 1e-5 threshold was never
    applied.

    :param a: numeric ndarray.
    :param b: numeric ndarray of the same shape.
    :return: scipy Jaccard distance of the boolean masks ``a >= 1e-5``
        and ``b >= 1e-5``.
    """
    a_bin = a >= 10**(-5)
    b_bin = b >= 10**(-5)
    return jaccard(a_bin, b_bin)
# Embed the cleaned question pairs and add vector-distance features to train_df.
# NOTE(review): question1_vectors appears to be allocated before this chunk
# (mirroring the question2_vectors allocation below) — confirm upstream.
error_count = 0
for i, q in tqdm(enumerate(train_df.q1_clean.values)):
    question1_vectors[i, :] = sent2vec(q)
question2_vectors = np.zeros((train_df.shape[0], 300))
for i, q in tqdm(enumerate(train_df.q2_clean.values)):
    question2_vectors[i, :] = sent2vec(q)
# Pairwise distances between the embeddings; nan_to_num zeroes rows where
# the sentence produced no embedding.
train_df['cosine_distance2'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]
train_df['cityblock_distance2'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]
train_df['jaccard_distance2'] = [jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]
train_df['canberra_distance2'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]
train_df['euclidean_distance2'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]
# Minkowski with p=3.
train_df['minkowski_distance2'] = [minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]
train_df['braycurtis_distance2'] = [braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]
train_df['skew_q1vec2'] = [skew(x) for x in np.nan_to_num(question1_vectors)]
# Embed both question columns and add vector-distance features to data.
# NOTE(review): question1_vectors appears to be allocated before this chunk
# (mirroring the question2_vectors allocation below) — confirm upstream.
error_count = 0
for i, q in tqdm(enumerate(data.question1.values)):
    question1_vectors[i, :] = sent2vec(q)
question2_vectors = np.zeros((data.shape[0], 300))
for i, q in tqdm(enumerate(data.question2.values)):
    question2_vectors[i, :] = sent2vec(q)
# Pairwise distances between the embeddings; nan_to_num zeroes rows where
# the sentence produced no embedding.
data['cosine_distance'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]
data['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]
data['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]
data['canberra_distance'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]
data['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]
# Minkowski with p=3.
data['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]
data['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]
data['skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)]
# Plot bound-profile aggregate signal in the dendrogram leaf order.
reorder = np.array(bound_clusters['leaves'])
plotfun.plotAggregateSignal(footprintVecs_norm_bound[reorder], labels=labels[reorder])
plt.title('threshold %3.2f; %d bound by all'%(threshold, np.sum(np.all(boundMatrix, 0))))
plt.savefig('%s.threshold_%3.2f.normalizedbyinsertions.bound.pdf'%(outfile, threshold))
# plot unbound profiles, rescaled so every sample shares the overall
# mean insertion level of the unbound set
footprintVecs_norm_unbound = np.array([np.mean(footprintMats[i, unboundMatrix[i]], 0)* np.mean(footprintMats[unboundMatrix])/np.mean(footprintMats[i, unboundMatrix[i]]) for i in range(numSamples)])
plotfun.plotAggregateSignal(footprintVecs_norm_unbound, labels=labels)
plt.title('threshold %3.2f; %d unbound'%(threshold, np.sum(np.all(np.logical_not(boundMatrix), 0))))
plt.savefig('%s.threshold_%3.2f.normalizedbyinsertions.unbound.pdf'%(outfile, threshold))
# do Jaccard distance heat map — despite the variable name, each entry is
# 1 - jaccard(), i.e. a pairwise *similarity* between bound calls
distance = np.array([[1-jaccard(boundMatrix[i], boundMatrix[j]) for i in range(numSamples)] for j in range(numSamples)])
plotHeatMap(distance, rowlabels=labels, columnlabels=labels, fontSize=11, cmap='RdGy_r', vmin=0, vmax=1)
plt.savefig('%s.jaccard.heatmap.threshold_%3.2f.pdf'%(outfile, threshold))
# plot heat maps, one figure per sample, rows sorted by descending strength
for i in range(numSamples):
    fig = plt.figure(figsize=(4, 10))
    sortIndx = np.argsort(strengthMatrix[i])[::-1]
    plot_heatmap_bar(footprintMats[i, sortIndx], strengthMatrix[i,sortIndx], threshold=[np.sum(correlationToIdeal[i]['r']-correlationToBackground[i]['r'] > j) for j in [0, 0.05, 0.1, 0.15, 0.2]], label=labels[i])
    plt.savefig('%s.footprint.%s.heatmap.pdf'%(outfile, labels[i]))
# plot side by side bound and unbound
# NOTE(review): this reuses `i` left over from the loop above, i.e. only the
# last sample is plotted — confirm that is intentional.
threshold = 0.05
fig = plt.figure(figsize=(4, 10))
plot_heatmap_bar(footprintMats[i, np.all(boundMatrix,0)], strengthMatrix[i,np.all(boundMatrix,0)])
def jaccardratio(x, y):
    """Byte-level Jaccard distance between two equal-length byte strings."""
    from scipy.spatial.distance import jaccard
    import numpy
    bytes_x = numpy.array(bytearray(x))
    bytes_y = numpy.array(bytearray(y))
    return jaccard(bytes_x, bytes_y)
def jaccard_similarity(row_video, col_video):
    """Pairwise Jaccard similarity J(X,Y) = |X∩Y| / |X∪Y| between videos.

    Fetches attribute rows for both video lists from the ROUTINE and
    EXERCISE tables, aligns their schemas, converts every categorical
    value to a number, and returns a DataFrame of 1 - jaccard() values
    indexed by row video_num and keyed by column video_num.
    """
    # result matrix bordered with the video ids: first column = row ids,
    # first row = column ids (stripped off again before returning)
    similarity_result = np.zeros((len(row_video), len(col_video)))
    similarity_result = np.column_stack((row_video, similarity_result))
    temp_col = [np.nan] + col_video
    similarity_result = np.row_stack((temp_col, similarity_result))
    # criteria video's data to dataframe
    MyDB.dic_execute('select * from ROUTINE where video_num in ' +
                     str(tuple(row_video)))
    row_video_tuple = list(MyDB.dic_fetchall())
    MyDB.dic_execute('select * from EXERCISE where video_num in ' +
                     str(tuple(row_video)))
    row_video_tuple += list(MyDB.dic_fetchall())
    row_video_df = pd.DataFrame(row_video_tuple)
    # compared video's data to dataframe
    MyDB.dic_execute('select * from ROUTINE where video_num in ' +
                     str(tuple(col_video)))
    col_video_tuple = list(MyDB.dic_fetchall())
    MyDB.dic_execute('select * from EXERCISE where video_num in ' +
                     str(tuple(col_video)))
    col_video_tuple += list(MyDB.dic_fetchall())
    col_video_df = pd.DataFrame(col_video_tuple)
    # EXERCISE and ROUTINE table have different size of columns.
    # So, make same size of matrix to calculate jaccard distance
    if len(row_video_df.columns) == 7:
        row_video_df.insert(loc=1, column='equipment',
                            value=np.zeros((len(row_video_df), 1)))
        row_video_df.insert(loc=2, column='excer_type',
                            value=np.zeros((len(row_video_df), 1)))
        row_video_df.insert(loc=7, column='trainer',
                            value=np.zeros((len(row_video_df), 1)))
    if len(col_video_df.columns) == 7:
        col_video_df.insert(loc=1, column='equipment',
                            value=np.zeros((len(col_video_df), 1)))
        col_video_df.insert(loc=2, column='excer_type',
                            value=np.zeros((len(col_video_df), 1)))
        col_video_df.insert(loc=7, column='trainer',
                            value=np.zeros((len(col_video_df), 1)))
    # all row_video data convert to number
    for i in row_video_df.columns.values:
        if i == 'length' or i == 'url' or i == 'video_num':
            continue
        # sex = 'f' or 'm', but 'm' is overlapped dataType of level column.
        elif i == 'sex':
            for j, v in row_video_df[i].items():
                if v == 'f':
                    row_video_df.loc[j, i] = 51
                elif v == 'm':
                    row_video_df.loc[j, i] = 52
                else:
                    s = convert_cate_num(v)
                    row_video_df.loc[j, i] = s
        else:
            for j, v in row_video_df[i].items():
                s = convert_cate_num(v)
                row_video_df.loc[j, i] = s
    # all column_video data convert to number
    for i in col_video_df.columns.values:
        if i == 'length' or i == 'url' or i == 'video_num':
            continue
        elif i == 'sex':
            for j, v in col_video_df[i].items():
                if v == 'f':
                    col_video_df.loc[j, i] = 51
                elif v == 'm':
                    col_video_df.loc[j, i] = 52
                else:
                    s = convert_cate_num(v)
                    col_video_df.loc[j, i] = s
        else:
            for j, v in col_video_df[i].items():
                s = convert_cate_num(v)
                col_video_df.loc[j, i] = s
    # align DataFrame row order with the bordered result matrix
    row_video_df = row_video_df.sort_values(by=['video_num'])
    col_video_df = col_video_df.sort_values(by=['video_num'])
    # NOTE(review): `range(len(row_video) - 1)` never fills the last row of
    # the similarity matrix (it stays all-zero) — looks like an off-by-one;
    # confirm before changing.
    for i in range(len(row_video) - 1):
        temp_row_value = row_video_df.iloc[i].values
        for j in range(len(col_video)):
            temp_col_value = col_video_df.iloc[j].values
            similarity_result[i + 1, j + 1] = 1 - distance.jaccard(
                temp_row_value, temp_col_value)
    # strip the id border into the DataFrame index/columns
    similarity_result = pd.DataFrame(similarity_result[1:, 1:],
                                     index=similarity_result[1:, 0],
                                     columns=similarity_result[0, 1:])
    return similarity_result
def rank_jaccard(intent: np.ndarray, selection: np.ndarray) -> float:
    """Jaccard similarity (1 - Jaccard distance) between the two masks."""
    jac_dist = jaccard(intent, selection)
    return float(1 - jac_dist)
def _sim(self, v1, v2):
    """Jaccard value for the two vectors; 0.0 when either has zero norm."""
    either_zero = v1.norm() == 0 or v2.norm() == 0
    if either_zero:
        return 0.0
    return jaccard(v1, v2)
# Notebook-style cell: embed both questions, add vector-distance features,
# then drop the raw text columns and rows whose distances are NaN.
df.head(2)
nltk.download('punkt')
question1_vectors = np.zeros((df.shape[0], 300))
for i, q in enumerate(tqdm_notebook(df.question1.values)):
    question1_vectors[i, :] = sent2vec(q)
question2_vectors = np.zeros((df.shape[0], 300))
for i, q in enumerate(tqdm_notebook(df.question2.values)):
    question2_vectors[i, :] = sent2vec(q)
# Pairwise distances; nan_to_num zeroes rows with no embedding.
df['cosine_distance'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]
df['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]
df['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]
df['canberra_distance'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]
df['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]
# Minkowski with p=3.
df['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]
df['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]
# Per-vector distribution moments.
df['skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)]
df['skew_q2vec'] = [skew(x) for x in np.nan_to_num(question2_vectors)]
df['kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(question1_vectors)]
df['kur_q2vec'] = [kurtosis(x) for x in np.nan_to_num(question2_vectors)]
df['is_duplicate'].value_counts()
df.isnull().sum()
df.drop(['question1', 'question2'], axis=1, inplace=True)
df = df[pd.notnull(df['cosine_distance'])]
def get_jaccard_distance(row):
    """Jaccard distance between the question vectors derived from *row*."""
    vec_pair = get_q1_q2_vectors(row)
    first_vec, second_vec = vec_pair[0], vec_pair[1]
    return jaccard(first_vec, second_vec)
def jaccard_similarity_binconv(v0, v1):
    """Jaccard similarity of the two vectors after binarizing at > 0.

    Fix: the original binarized ``v0`` twice
    (``v1 = np.where((v0 > 0), 1, 0)``), so ``v1`` was ignored and the
    function always returned 1.0 whenever ``v0`` had any positive entry
    pattern compared against itself.

    :param v0: numeric ndarray.
    :param v1: numeric ndarray of the same shape.
    :return: 1.0 - Jaccard distance of the 0/1 masks.
    """
    v0 = np.where((v0 > 0), 1, 0)
    v1 = np.where((v1 > 0), 1, 0)
    return 1.0 - spd.jaccard(v0, v1)
def jaccard(x):
    """Jaccard distance from *x* to each row of the enclosing ``df_array``.

    Uses the surrounding scope's ``df_array`` (rows taken from its first
    second-axis slice) and forwards ``kwargs`` to scipy.
    """
    n_rows = df_array.shape[0]
    result = np.empty(n_rows)
    for idx in range(n_rows):
        result[idx] = distance.jaccard(df_array[idx, 0, :], x, **kwargs)
    return result