import numpy as np
from scipy.spatial.distance import pdist as scipy_pdist, squareform
from scipy.stats import rankdata


def point_dist(X, metric='euclidean', **kwargs):
    """
    Wrapper for the scipy.spatial.distance.pdist function.
    Returns a distance matrix in squareform for an array of (x, y) coordinate pairs.

    :param X: array-like of (x, y) coordinate pairs
    :param metric: passed to scipy_pdist; currently 'euclidean' and 'rank' are supported
    :param kwargs: passed to scipy_pdist
    :return: distance matrix of the given coordinates
    """
    # check X
    _X = list(X)

    # switch metric
    if metric.lower() == 'euclidean':
        # check that every element has exactly an x and a y coordinate
        if any(len(e) != 2 for e in _X):
            raise ValueError(
                'The passed point data does not have an x and y coordinate for each point.'
            )
        # data seems to be ok, return the distance matrix in squareform
        return np.matrix(squareform(scipy_pdist(_X, metric=metric, **kwargs)))

    elif metric.lower() == 'rank':
        # rankdata flattens its input, so reshape the ranks back into squareform
        d = point_dist(X, metric='euclidean')
        return np.matrix(rankdata(d).reshape(d.shape))

    # this metric is not known
    else:
        raise ValueError(
            "The metric '%s' is not known. Use one of: ['euclidean', 'rank']" % str(metric))
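# A minimal usage sketch for point_dist (not part of the original module; the
# coordinates below are made up). With three points forming a 3-4-5 triangle,
# the 'euclidean' metric returns the familiar 3x3 squareform matrix, and the
# 'rank' metric returns the same matrix with the distances replaced by their ranks.
coords = [(0.0, 0.0), (3.0, 4.0), (0.0, 4.0)]

print(point_dist(coords))                  # 3x3 matrix with zeros on the diagonal
print(point_dist(coords, metric='rank'))   # ranked version of the same matrix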
def diss(self, options):
    """Calculate dissimilarity between usage profiles."""

    check_file_exists(options.profile_file)

    # read usage profile for each genome
    genome_ids = []
    profiles = []
    with open(options.profile_file) as f:
        f.readline()  # burn header

        for line in f:
            line_split = line.rstrip().split('\t')

            genome_id = line_split[0]
            profile = [float(v) for v in line_split[1:]]

            genome_ids.append(genome_id)
            profiles.append(profile)

    # calculate dissimilarity between genomes
    d = scipy_pdist(profiles, metric=options.metric)

    fout = open(options.output_file, 'w')
    if not options.full_matrix:
        # write out lower triangle of the condensed dissimilarity matrix,
        # in pairwise fashion
        fout.write('Genome A\tGenome B\tDissimilarity\n')
        condensed_idx = lambda i, j, n: n * j - j * (j + 1) // 2 + i - 1 - j
        for i in range(1, len(genome_ids)):
            for j in range(i):
                fout.write('%s\t%s\t%f\n' % (genome_ids[i],
                                             genome_ids[j],
                                             d[condensed_idx(i, j, len(genome_ids))]))
    else:
        # write out full dissimilarity matrix
        ds = scipy_squareform(d)

        for genome_id in genome_ids:
            fout.write('\t' + genome_id)
        fout.write('\n')

        for i, genome_id in enumerate(genome_ids):
            fout.write(genome_id)
            for j in range(len(genome_ids)):
                fout.write('\t%f' % ds[i, j])
            fout.write('\n')
    fout.close()

    self.logger.info('Dissimilarity values written to: %s' % options.output_file)
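# Side note (not part of the class above): a quick sanity check, with made-up
# profiles, that the condensed_idx lambda used in diss() maps a (row i, col j)
# pair of the full matrix onto the matching entry of the condensed vector
# returned by scipy_pdist.
import numpy as np
from scipy.spatial.distance import pdist as scipy_pdist, squareform as scipy_squareform

toy_profiles = np.random.rand(5, 3)
d = scipy_pdist(toy_profiles, metric='euclidean')
ds = scipy_squareform(d)

condensed_idx = lambda i, j, n: n * j - j * (j + 1) // 2 + i - 1 - j
n = toy_profiles.shape[0]
for i in range(1, n):
    for j in range(i):
        assert d[condensed_idx(i, j, n)] == ds[i, j]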
def test_pdist():
    torch.manual_seed(0)
    np.random.seed(1)
    for n_1 in range(10, 100, 10):
        for n_2 in range(10, 100, 10):
            dim = np.random.randint(1, 10)
            sample_1 = torch.randn(n_1, dim)
            sample_2 = torch.randn(n_2, dim)
            p = 1 + 2 * np.random.rand()  # Use this l_p norm.
            distances = pdist(sample_1, sample_2, norm=p, eps=1e-9).numpy()
            sample_12 = np.vstack((sample_1.numpy(), sample_2.numpy()))
            distances_scipy = squareform(
                scipy_pdist(sample_12, metric='minkowski', p=p))
            print(distances - distances_scipy[:n_1, n_1:])
            assert np.allclose(distances, distances_scipy[:n_1, n_1:],
                               rtol=1e-4, atol=1e-4)
def test_pdist():
    torch.manual_seed(0)
    np.random.seed(1)
    for n_1 in range(10, 100, 10):
        for n_2 in range(10, 100, 10):
            dim = np.random.randint(1, 10)
            sample_1 = torch.randn(n_1, dim)
            sample_2 = torch.randn(n_2, dim)
            p = 1 + 2 * np.random.rand()  # Use this l_p norm.
            distances = pdist(sample_1, sample_2, norm=p, eps=1e-9).numpy()
            sample_12 = np.vstack((sample_1.numpy(), sample_2.numpy()))
            distances_scipy = squareform(
                scipy_pdist(sample_12, metric='minkowski', p=p))
            all_close = np.allclose(distances, distances_scipy[:n_1, n_1:],
                                    rtol=1e-3, atol=1e-3)
            if not all_close:
                diff = distances - distances_scipy[:n_1, n_1:]
                print(diff)
                bad_idxs = np.where(np.abs(diff) > 0.0001)
                print(distances[bad_idxs])
            assert all_close
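# The pdist under test is imported from the package itself and is not shown in
# this file. A minimal sketch of a function that would satisfy the test above
# (broadcasted Minkowski/l_p distance between the rows of two samples; the name
# and the eps handling here are assumptions, not the package's actual code):
import torch

def pdist_sketch(sample_1, sample_2, norm=2.0, eps=1e-9):
    """Return the (n_1, n_2) matrix of l_p distances between rows of the two samples."""
    diff = sample_1.unsqueeze(1) - sample_2.unsqueeze(0)  # (n_1, n_2, dim)
    inner = torch.sum(torch.abs(diff) ** norm, dim=2)     # sum_k |x_ik - y_jk|^p
    return (inner + eps) ** (1.0 / norm)                  # eps guards the fractional root at 0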
def pdist(a, b, metric):
    """Distance between the two vectors a and b under the given scipy metric."""
    return scipy_pdist([a, b], metric=metric)[0]
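# Usage sketch (values made up): the wrapper returns the single pairwise
# distance between the two vectors.
# pdist([0, 0], [3, 4], 'euclidean')   # -> 5.0
# pdist([0, 0], [3, 4], 'cityblock')   # -> 7.0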
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist as scipy_pdist, squareform as squareform_
from sklearn.metrics import pairwise_distances

# seq_global_alignment_pairwise_score and hierarchical_distance_aggregate_score
# are project-specific helpers assumed to be importable from the surrounding package.


def gower_distance(X: pd.DataFrame, agg_func=None, correlation_dist=None,
                   multiprocessing=True, n_jobs=-2, verbose=False):
    """
    Compute the pairwise Gower distance between the rows of a pandas DataFrame
    whose columns hold the features.

    All variables of object type are treated as nominal variables and the
    others as numeric variables. Distance metrics used:
      Nominal variables: Dice distance
        (https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient)
      Numeric variables: Manhattan distance normalized by the range of the variable
        (https://en.wikipedia.org/wiki/Taxicab_geometry)
    """
    individual_variable_dists = []

    if multiprocessing:
        # parallel distances via scikit-learn, converted to condensed form
        pdist = lambda X, metric: squareform_(
            pairwise_distances(X=X, metric=metric, n_jobs=n_jobs,
                               force_all_finite='allow-nan'),
            checks=False)
    else:
        pdist = scipy_pdist  # returns condensed dist matrix

    for column in X.columns:
        feature = X.loc[:, column]
        if verbose:
            print("Gower's dissimilarity: Computing", column,
                  ", dtype:", feature.dtypes, ", shape:", feature.shape)

        if column in ["gene_family_id", "gene_family", "locus_type",
                      "Transcript type", "tag", "GO terms", "Rfams",
                      "Disease association"]:
            if verbose:
                print("Dice distance")
            feature_dist = pdist(feature.str.get_dummies("|"), 'dice')

        elif column in ["miR family", "Family"]:
            if verbose:
                print("Dice distance")
            feature_dist = pdist(feature.str.get_dummies("/"), 'dice')

        elif "sequence" in column:
            if verbose:
                print(f"Global alignment seq score (maxlen={100})")
            # Note: if this doesn't work, modify _pairwise_callable Line 1083
            # X, Y = check_pairwise_arrays(X, Y)
            feature_dist = pdist(feature.values.reshape((X.shape[0], -1)),
                                 seq_global_alignment_pairwise_score)
            feature_dist = 1 - feature_dist  # convert from similarity to dissimilarity

        elif column == "Location":  # LNC locations, e.g. "chr:start-end"
            if verbose:
                print("Location split to Chromosome, start, end")
            location_features = feature.str.split("[:-]", expand=True).filter(items=[0, 1, 2])
            location_features.columns = ["Chromosome", "start", "end"]
            location_features["start"] = location_features["start"].astype(np.float64)
            location_features["end"] = location_features["end"].astype(np.float64)
            # TODO Add bp region length
            feature_dist = gower_distance(
                location_features,
                agg_func=hierarchical_distance_aggregate_score,
                multiprocessing=True)

        elif column == "location":  # GE locations, e.g. "1p36.13"
            if verbose:
                print("Location split to Chromosome, arm, region")
            location_features = feature.str.split("[pq.]", expand=True).filter(items=[0, 1])
            location_features.columns = ["Chromosome", "region"]
            location_features["arm"] = feature.str.extract(r'(?P<arm>[pq])', expand=True)
            location_features["band"] = feature.str.split("[pq.-]", expand=True)[2]
            location_features = location_features[["Chromosome", "arm", "region", "band"]]
            feature_dist = gower_distance(
                location_features,
                agg_func=hierarchical_distance_aggregate_score,
                multiprocessing=True)

        elif feature.dtypes == object:  # TODO use Categorical dtypes later
            if verbose:
                print("Dice distance")
            feature_dist = pdist(pd.get_dummies(feature), 'dice')

        elif feature.dtypes == int:
            if verbose:
                print("Manhattan distance (normalized ptp)")
            feature_dist = scipy_pdist(feature.values.reshape((X.shape[0], -1)), "manhattan") / \
                (np.nanmax(feature.values) - np.nanmin(feature.values))

        elif feature.dtypes == float:
            if verbose:
                print("Euclidean distance (normalized ptp)")
            feature_dist = scipy_pdist(feature.values.reshape((X.shape[0], -1)), "euclidean") / \
                (np.nanmax(feature.values) - np.nanmin(feature.values))

        else:
            raise Exception("Invalid column dtype")

        individual_variable_dists.append(feature_dist)

    if correlation_dist is not None:
        if verbose:
            print("Correlation distance", correlation_dist.shape)
        individual_variable_dists.append(correlation_dist)

    if agg_func is None:
        agg_func = lambda x: np.nanmean(x, axis=0)
    pdists_mean_reduced = agg_func(np.array(individual_variable_dists))

    return pdists_mean_reduced
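# Usage sketch (toy, made-up data; not from the original project) hitting the
# generic object/int/float branches; multiprocessing=False keeps it on scipy_pdist.
toy = pd.DataFrame({
    "locus_type": ["protein-coding|pseudogene", "protein-coding", "pseudogene"],
    "exon_count": [3, 10, 7],
    "gc_content": [0.41, 0.55, 0.47],
})
d = gower_distance(toy, multiprocessing=False, verbose=True)
# d is a condensed distance vector of length n*(n-1)/2; pass it through
# scipy.spatial.distance.squareform to recover the full 3x3 matrix.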
def fix(index):
    # closure over self.index and v from the enclosing scope: pair the index
    # with the euclidean distance between the indexed point and the query vector v
    ep = self.index[index]
    ev = v
    return (index, scipy_pdist([ep, ev], metric="euclidean")[0])