Example #1
def point_dist(X, metric='euclidean', **kwargs):
    """
    Wrapper for the scipy.spatial.distance.pdist function.
    Returns a distance matrix in squareform for an array of (x, y) coordinate pairs.

    :param X: array-like of (x, y) coordinate pairs
    :param metric: will be passed to scipy_pdist.
            Currently, 'euclidean' and 'rank' are implemented; others will follow.
    :param kwargs: will be passed to scipy_pdist
    :return: distance matrix of the given coordinates
    """
    # check X
    _X = list(X)

    # switch metric
    if metric.lower() == 'euclidean':
        # check that all elements have exactly an x and a y coordinate
        if any(len(e) != 2 for e in _X):
            raise ValueError(
                'The passed point data does not have an x and a y coordinate for each point.'
            )

        # data seems to be ok, return the distance matrix in squareform
        return np.matrix(squareform(scipy_pdist(_X, metric=metric, **kwargs)))

    elif metric.lower() == 'rank':
        return np.matrix(rankdata(point_dist(X, metric='euclidean')))

    # this metric is not known
    else:
        raise ValueError(
            "The metric '%s' is not known. Use one of: ['euclidean', 'rank']" %
            str(metric))
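A minimal usage sketch for the wrapper above, assuming the module-level imports the snippet relies on (numpy as np, scipy.spatial.distance.pdist aliased as scipy_pdist, squareform, and scipy.stats.rankdata); the coordinates are made up for illustration.

import numpy as np
from scipy.spatial.distance import pdist as scipy_pdist, squareform
from scipy.stats import rankdata

coords = [(0, 0), (3, 4), (0, 8)]            # three (x, y) points
dm = point_dist(coords)                      # 3x3 symmetric Euclidean distance matrix
# matrix([[0., 5., 8.],
#         [5., 0., 5.],
#         [8., 5., 0.]])
ranks = point_dist(coords, metric='rank')    # rank-transformed distances (rankdata flattens to 1 x 9)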
Example #2
    def diss(self, options):
        """Calculate dissimilarity between usage profiles."""

        check_file_exists(options.profile_file)

        genome_ids = []
        profiles = []
        with open(options.profile_file) as f:
            f.readline()  # burn header

            for line in f:
                line_split = line.rstrip().split('\t')
                genome_id = line_split[0]
                profile = [float(v) for v in line_split[1:]]

                genome_ids.append(genome_id)
                profiles.append(profile)

        # calculate dissimilarity between genomes
        d = scipy_pdist(profiles, metric=options.metric)

        fout = open(options.output_file, 'w')
        if not options.full_matrix:
            # write out lower triangle from the condensed dissimilarity matrix,
            # in pairwise fashion
            fout.write('Genome A\tGenome B\tDissimilarity\n')
            condensed_idx = lambda i, j, n: n * j - j * (j + 1) // 2 + i - 1 - j
            for i in range(1, len(genome_ids)):
                for j in range(i):
                    fout.write('%s\t%s\t%f\n' %
                               (genome_ids[i], genome_ids[j], d[condensed_idx(
                                   i, j, len(genome_ids))]))
        else:
            # write out full dissimilarity matrix
            ds = scipy_squareform(d)
            for genome_id in genome_ids:
                fout.write('\t' + genome_id)
            fout.write('\n')

            for i, genome_id in enumerate(genome_ids):
                fout.write(genome_id)
                for j in range(len(genome_ids)):
                    fout.write('\t%f' % ds[i, j])
                fout.write('\n')

        fout.close()

        self.logger.info('Dissimilarity values written to: %s' %
                         options.output_file)
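The condensed_idx lambda above maps a lower-triangle position (i, j), with i > j, to the corresponding entry of the condensed vector returned by scipy_pdist. A small self-contained check of that mapping against scipy's squareform, on made-up data:

import numpy as np
from scipy.spatial.distance import pdist as scipy_pdist, squareform as scipy_squareform

condensed_idx = lambda i, j, n: n * j - j * (j + 1) // 2 + i - 1 - j

profiles = np.random.rand(5, 3)                # 5 genomes, 3 usage values each
d = scipy_pdist(profiles, metric='euclidean')  # condensed vector of length n*(n-1)/2
ds = scipy_squareform(d)                       # full 5x5 symmetric matrix

n = len(profiles)
for i in range(1, n):
    for j in range(i):
        assert d[condensed_idx(i, j, n)] == ds[i, j]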
Example #3
    def diss(self, options):
        """Calculate dissimilarity between usage profiles."""
        
        check_file_exists(options.profile_file)
        
        genome_ids = []
        profiles = []
        with open(options.profile_file) as f:
            f.readline() # burn header
            
            for line in f:
                line_split = line.rstrip().split('\t')
                genome_id = line_split[0]
                profile = [float(v) for v in line_split[1:]]
                
                genome_ids.append(genome_id)
                profiles.append(profile)
                
        # calculate dissimilarity between genomes
        d = scipy_pdist(profiles, metric=options.metric)

        fout = open(options.output_file, 'w')
        if not options.full_matrix:
            # write out lower triangle from the condensed dissimilarity matrix,
            # in pairwise fashion
            fout.write('Genome A\tGenome B\tDissimilarity\n')
            condensed_idx = lambda i,j,n: n*j - j*(j+1)/2 + i - 1 - j
            for i in xrange(1, len(genome_ids)):
                for j in xrange(i):
                    fout.write('%s\t%s\t%f\n' % (genome_ids[i], genome_ids[j], d[condensed_idx(i, j, len(genome_ids))]))
        else:
            # write out full dissimilarity matrix
            ds = scipy_squareform(d)
            for genome_id in genome_ids:
                fout.write('\t' + genome_id)
            fout.write('\n')
            
            for i, genome_id in enumerate(genome_ids):
                fout.write(genome_id)
                for j in xrange(len(genome_ids)):
                    fout.write('\t%f' % ds[i,j])
                fout.write('\n')
        
        fout.close()
        
        self.logger.info('Dissimilarity values written to: %s' % options.output_file)
Example #4
def test_pdist():
    torch.manual_seed(0)
    np.random.seed(1)
    for n_1 in range(10, 100, 10):
        for n_2 in range(10, 100, 10):
            dim = np.random.randint(1, 10)
            sample_1 = torch.randn(n_1, dim)
            sample_2 = torch.randn(n_2, dim)
            p = 1 + 2 * np.random.rand()  # Use this l_p norm.
            distances = pdist(sample_1, sample_2, norm=p, eps=1e-9).numpy()
            sample_12 = np.vstack((sample_1.numpy(), sample_2.numpy()))
            distances_scipy = squareform(
                scipy_pdist(sample_12, metric='minkowski', p=p))
            print(distances - distances_scipy[:n_1, n_1:])
            assert np.allclose(distances,
                               distances_scipy[:n_1, n_1:],
                               rtol=1e-4,
                               atol=1e-4)
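The slice distances_scipy[:n_1, n_1:] picks the cross-distance block between the two samples out of the stacked squareform matrix. scipy.spatial.distance.cdist computes that block directly, so an equivalent reference value could be built without stacking; a sketch with arbitrary sizes:

import numpy as np
from scipy.spatial.distance import cdist

a = np.random.randn(4, 3)
b = np.random.randn(6, 3)
p = 1.5
block = cdist(a, b, metric='minkowski', p=p)  # shape (4, 6): distances between rows of a and rows of b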
Example #5
def test_pdist():
    torch.manual_seed(0)
    np.random.seed(1)
    for n_1 in range(10, 100, 10):
        for n_2 in range(10, 100, 10):
            dim = np.random.randint(1, 10)
            sample_1 = torch.randn(n_1, dim)
            sample_2 = torch.randn(n_2, dim)
            p = 1 + 2 * np.random.rand()  # Use this l_p norm.
            distances = pdist(sample_1, sample_2, norm=p, eps=1e-9).numpy()
            sample_12 = np.vstack((sample_1.numpy(), sample_2.numpy()))
            distances_scipy = squareform(
                scipy_pdist(sample_12, metric='minkowski', p=p))
            all_close = np.allclose(distances,
                                    distances_scipy[:n_1, n_1:],
                                    rtol=1e-3,
                                    atol=1e-3)
            if not all_close:
                diff = distances - distances_scipy[:n_1, n_1:]
                print(diff)
                bad_idxs = (np.where(np.abs(diff) > 0.0001))
                print(distances[bad_idxs])
            assert all_close
Example #6
def pdist(a, b, metric):
    return scipy_pdist([a, b], metric=metric)[0]
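A quick usage sketch of this two-point wrapper: scipy_pdist on a 2-row input returns a length-1 condensed vector, so indexing [0] yields the scalar distance. It assumes scipy.spatial.distance.pdist is imported as scipy_pdist, as the snippet does.

import numpy as np
from scipy.spatial.distance import pdist as scipy_pdist

d = pdist(np.array([0.0, 0.0]), np.array([3.0, 4.0]), 'euclidean')
# d == 5.0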
Example #7
def pdist(a, b, metric):
    return scipy_pdist([a, b], metric=metric)[0]
def gower_distance(X: pd.DataFrame,
                   agg_func=None,
                   correlation_dist=None,
                   multiprocessing=True,
                   n_jobs=-2,
                   verbose=False):
    """
    This function expects a pandas DataFrame as input, with the features along the columns.
    Based on these features, a distance matrix is returned containing the pairwise Gower
    distance between the rows.
    All variables of object dtype are treated as nominal variables; the others are treated
    as numeric variables.
    Distance metrics used:
    Nominal variables: Dice distance (https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient)
    Numeric variables: Manhattan distance normalized by the range of the variable (https://en.wikipedia.org/wiki/Taxicab_geometry)
    """
    individual_variable_dists = []
    if multiprocessing:
        pdist = lambda X, metric: squareform_(pairwise_distances(
            X=X, metric=metric, n_jobs=n_jobs, force_all_finite='allow-nan'),
                                              checks=False)
    else:
        pdist = scipy_pdist  # returns condensed dist matrix

    for column in X.columns:
        feature = X.loc[:, column]
        print("Gower's dissimilarity: Computing", column, ", dtype:",
              feature.dtypes, ", shape:", feature.shape) if verbose else None

        if column in [
                "gene_family_id", "gene_family", "locus_type",
                "Transcript type", "tag"
        ]:
            print("Dice distance") if verbose else None
            feature_dist = pdist(feature.str.get_dummies("|"), 'dice')

        elif column == "miR family" or column == "Family":
            print("Dice distance") if verbose else None
            feature_dist = pdist(feature.str.get_dummies("/"), 'dice')

        elif column == "GO terms" or column == "Rfams":
            print("Dice distance") if verbose else None
            feature_dist = pdist(feature.str.get_dummies("|"), 'dice')

        elif column == "Disease association":
            print("Dice distance") if verbose else None
            feature_dist = pdist(feature.str.get_dummies("|"), 'dice')

        elif "sequence" in column:
            print(f"Global alignment seq score (maxlen={100})"
                  ) if verbose else None
            # Note: if this doesn't work, modify _pairwise_callable (line 1083): X, Y = check_pairwise_arrays(X, Y)
            feature_dist = pdist(feature.values.reshape((X.shape[0], -1)),
                                 seq_global_alignment_pairwise_score)
            feature_dist = 1 - feature_dist  # Convert from similarity to dissimilarity

        elif column == "Location":  # LNC Locations
            print("Location split to Chromosome, start, end"
                  ) if verbose else None
            location_features = feature.str.split(
                "[:-]", expand=True).filter(items=[0, 1, 2])
            # keep chromosome, start, and end so both casts below succeed
            location_features.columns = ["Chromosome", "start", "end"]
            location_features["start"] = location_features["start"].astype(
                np.float64)
            location_features["end"] = location_features["end"].astype(
                np.float64)  # TODO Add bp region length

            feature_dist = gower_distance(
                location_features,
                agg_func=hierarchical_distance_aggregate_score,
                multiprocessing=True)

        elif column == "location":  # GE Locations
            print("Location split to Chromosome, arm, region"
                  ) if verbose else None
            location_features = feature.str.split(
                "[pq.]", expand=True).filter(items=[0, 1])
            location_features.columns = ["Chromosome", "region"]
            location_features["arm"] = feature.str.extract(r'(?P<arm>[pq])',
                                                           expand=True)
            location_features["band"] = feature.str.split("[pq.-]",
                                                          expand=True)[2]
            location_features = location_features[[
                "Chromosome", "arm", "region", "band"
            ]]  # TODO Add band #
            # print(location_features)
            feature_dist = gower_distance(
                location_features,
                agg_func=hierarchical_distance_aggregate_score,
                multiprocessing=True)

        elif feature.dtypes == object:  # np.object is removed in newer NumPy; TODO use Categorical dtypes later
            print("Dice distance") if verbose else None
            feature_dist = pdist(pd.get_dummies(feature), 'dice')

        elif feature.dtypes == int:
            print("Manhattan distance (normalized ptp)") if verbose else None
            # scipy names the Manhattan/L1 metric "cityblock"
            feature_dist = scipy_pdist(feature.values.reshape((X.shape[0],-1)), "cityblock") / \
                           (np.nanmax(feature.values) - np.nanmin(feature.values))
        elif feature.dtypes == float:
            print("Euclidean distance (normalized ptp)") if verbose else None
            feature_dist = scipy_pdist(feature.values.reshape((X.shape[0],-1)), "euclidean") / \
                           (np.nanmax(feature.values) - np.nanmin(feature.values))
        else:
            raise Exception("Invalid column dtype")

        individual_variable_dists.append(feature_dist)

    if correlation_dist is not None:
        print("Correlation distance",
              correlation_dist.shape) if verbose else None
        individual_variable_dists.append(correlation_dist)

    if agg_func is None:
        agg_func = lambda x: np.nanmean(x, axis=0)

    pdists_mean_reduced = agg_func(np.array(individual_variable_dists))

    return pdists_mean_reduced
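A minimal usage sketch for gower_distance with multiprocessing=False, so only scipy_pdist is needed and none of the special column names or external helpers are hit; the DataFrame below is made up and only exercises the generic object and float branches. It assumes the module-level imports the snippet relies on (pandas as pd, numpy as np, scipy.spatial.distance.pdist as scipy_pdist).

import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist as scipy_pdist, squareform

df = pd.DataFrame({
    "color": ["red", "blue", "red", "green"],  # object dtype -> Dice distance on dummies
    "score": [0.1, 0.9, 0.4, 0.6],             # float dtype -> Euclidean distance normalized by range
})

condensed = gower_distance(df, multiprocessing=False)  # mean of the per-column condensed distances
full = squareform(condensed)                           # 4x4 symmetric Gower dissimilarity matrix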
Example #9
 def fix(index):
     ep = self.index[index]
     ev = v
     return (index, scipy_pdist([ep, ev], metric="euclidean")[0])
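For two points, scipy_pdist returns a single-entry condensed vector, so the [0] above is just the Euclidean distance between ep and ev, i.e. np.linalg.norm(ep - ev). A standalone illustration with made-up points:

import numpy as np
from scipy.spatial.distance import pdist as scipy_pdist

ep = np.array([1.0, 2.0, 2.0])
ev = np.array([4.0, 6.0, 2.0])
assert np.isclose(scipy_pdist([ep, ev], metric="euclidean")[0], np.linalg.norm(ep - ev))  # both 5.0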