Example #1
0
 def normalize_in_logspace(dist, in_log_space=True):
     """Normalise a distribution so that its entries sum to one.

     The normalisation is carried out on log values (subtracting the
     log-sum-exp) and the result is returned in linear space.

     :param dist: array-like of weights; interpreted as log-weights when
         ``in_log_space`` is true, otherwise as linear weights.
     :param in_log_space: whether ``dist`` already holds log values.
     :return: float64 array of normalised (linear-space) probabilities.
     """
     if in_log_space:
         # Only the log-space path emits the debug trace of its input.
         logging.debug('Likelihood before normalization\n{}'.format(dist))
         log_values = np.array(dist, dtype=np.float64)
     else:
         log_values = np.log(dist, dtype=np.float64)
     log_total = logsumexp_scipy(log_values)
     return np.exp(log_values - log_total)
    def one_iteration_fix_k(self):
        """Run one reassignment sweep over all mutations without opening new clusters.

        ``self.mutations`` is shuffled in place, then each mutation is
        redrawn among the existing clusters (the probability vector has
        exactly ``self.n_clusters`` entries). After the sweep the current
        state is appended to ``self.results``.

        :return: a 2-tuple ``(assignments, histograms)`` where
            ``assignments`` holds, for each mutation in the (shuffled) order
            of ``self.mutations``, the 1-based position of its cluster in
            ``self.clusterlist``, and ``histograms`` is the list of
            ``cluster.normed_hist`` for every cluster.
        """

        # In-place shuffle: the visiting order (and the order of the returned
        # assignments) changes on every call.
        random.shuffle(self.mutations)
        for mut in self.mutations:

            # One row per existing cluster; rows left at -inf end up with zero probability.
            loglik = np.ones((self.n_clusters, self.n_samples), dtype=np.float64) * -np.inf
            # Log of the size-based weight for each cluster (0 where a cluster is skipped).
            const_array = np.zeros((self.n_clusters,), dtype=np.float64)

            if len(mut.assigned_to) == 1:
                continue  # don't reassign last mutation

            for cluster_idx, cluster in enumerate(self.clusterlist):
                # if the current point is the only thing in the cluster...
                # This seems to work empirically (as well as theoretically)
                # NOTE(review): given the len(mut.assigned_to) == 1 guard above, this
                # condition looks unreachable here - confirm against the cluster __eq__.
                if len(cluster) == 1 and mut.assigned_to == cluster:
                    continue
                # Weight proportional to the cluster size.
                stay_in_clust_const = len(cluster) / float(self.n_muts - 1 + self.alpha)

                const_array[cluster_idx] = np.log(stay_in_clust_const)

                if mut not in cluster:  # TODO: redefine in
                    loglik[cluster_idx] = self.logsum_of_marginals_per_sample(cluster.normed_hist + mut.loghist)
                else:
                    # For the mutation's own cluster, score against the cluster with the
                    # mutation taken out (presumably `cluster - mut` yields that histogram;
                    # the operator is defined outside this view).
                    loglik[cluster_idx] = self.logsum_of_marginals_per_sample(
                        self.normalize_loghist_with_prior(cluster - mut) + mut.loghist)

            # Collapse the per-sample values into one log-likelihood per cluster.
            loglik = np.sum(loglik, axis=1)  # + const_array

            # Normalise the likelihoods first, then add the size-based weights.
            loglik = loglik - logsumexp_scipy(loglik)
            loglik = loglik + const_array

            # Normalised assignment probabilities in linear space.
            c_lik = np.exp(loglik - logsumexp_scipy(loglik))

            # if np.random.random() < 0.1: print sum(c_lik[:-1])
            # Draw a single cluster index from c_lik.
            new_cluster_idx = np.nonzero(np.random.multinomial(1, c_lik) == 1)[0][0]
            # NOTE(review): c_lik has n_clusters entries, so the drawn index is at most
            # n_clusters - 1 and this branch appears unreachable in the fixed-k sweep.
            if new_cluster_idx == self.n_clusters:  # new cluster
                mut.assigned_to -= mut
                DP_cluster(self, mut)  # create new cluster, lists updated automatically
            else:
                new_cluster = self.clusterlist[new_cluster_idx]
                # Presumably detaches the mutation from its current cluster and attaches
                # it to the drawn one via the cluster's in-place operators - TODO confirm.
                mut.assigned_to -= mut
                new_cluster += mut

        # Map cluster ids to 1-based positions in clusterlist (the 0 from count() is discarded).
        cluster_counter = itertools.count()
        next(cluster_counter)
        real_index = dict([[x.id, next(cluster_counter)] for x in self.clusterlist])
        # Record the state reached at the end of this sweep.
        self.results.assign.append([real_index[x.assigned_to.id] for x in self.mutations])
        self.results.alpha.append(self.alpha)
        self.results.eta.append(self.eta)
        self.results.cluster_loghistograms.append([cluster.normed_hist for cluster in self.clusterlist])
        # Index of the largest entry of each row of every cluster histogram.
        self.results.cluster_positions.append(
            [[np.argmax(x) for x in cluster.normed_hist] for cluster in self.clusterlist])
        self.results.clust_prop.append([len(cluster) / float(self.n_muts) for cluster in self.clusterlist])
        self.results.clust_size.append([len(cluster) for cluster in self.clusterlist])
        self.results.K.append(self.n_clusters)

        return [real_index[x.assigned_to.id] for x in self.mutations], [cluster.normed_hist for cluster in
                                                                        self.clusterlist]
Example #3
0
def DP_prob_k_cond_alpha_N(N, alpha, log_stirling_coef):
    """Return the normalised probabilities of k = 1..N given alpha and N.

    The unnormalised log-weight of ``k`` is
    ``log_stirling_coef[k - 1] + k * log(alpha)``. Terms that do not depend
    on ``k`` (such as ``lgamma(alpha) - lgamma(alpha + N)``) cancel when the
    vector is normalised, so they are not evaluated. This also avoids
    evaluating ``lgamma(N - 1)`` at 0 when ``N == 1``, which previously
    failed or produced NaN.

    :param N: number of entries; probabilities are produced for k = 1..N.
    :param alpha: positive concentration value whose log is taken.
    :param log_stirling_coef: indexable of at least ``N`` log-coefficients;
        entry ``k - 1`` is used for ``k``.
    :return: numpy array of length ``N`` summing to one.
    :raises IndexError: if ``log_stirling_coef`` has fewer than ``N`` entries.
    """
    log_alpha = np.log(alpha)
    loglik = np.array(
        [log_stirling_coef[k - 1] + k * log_alpha for k in range(1, N + 1)],
        dtype=np.float64)

    # Normalise in log space for numerical stability, then exponentiate.
    return np.exp(loglik - logsumexp_scipy(loglik))
Example #4
0
 def make_nd_histogram(hist_array):
     """Convert a stack of histograms into normalised log-histograms.

     A small floor (``1e-40``) is added to every bin so the logarithm stays
     finite, the first bin along the last axis is overwritten with that
     floor, and each 1-D slice along axis 2 is normalised by subtracting its
     log-sum-exp. The input is not modified (the addition makes a copy).

     :param hist_array: array-like with at least three dimensions; it is
         converted to float32.
     :return: float32 array of the same shape holding log values whose
         exponentials sum to one along axis 2.
     """
     conv = 1e-40
     hist = np.asarray(hist_array, dtype=np.float32) + conv
     # A single whole-array assignment replaces the former loop, which repeated
     # this identical statement once per entry of axis 1 without using its index.
     if np.shape(hist)[1] > 0:
         hist[:, :, 0] = conv
     # Normalise every slice along the last axis in log space.
     return np.apply_along_axis(lambda z: z - logsumexp_scipy(z), 2,
                                np.log(hist))
Example #5
0
    def log_conv(x, y):
        """Combine adjacent entries of a padded log-sequence with a 2-element log-kernel.

        ``x`` is padded with two ``-inf`` values in front and one behind;
        for each adjacent pair ``(a, b)`` of the padded sequence the result
        holds ``logsumexp([a + y[0], b + y[1]])``.

        :param x: iterable of log values, or a single scalar log value.
        :param y: indexable with (at least) two log values.
        :return: list with ``len(x) + 2`` entries (3 for a scalar ``x``).
        """
        try:
            padded = [-np.inf, -np.inf] + list(x) + [-np.inf]
        except TypeError:
            # x is not iterable (a scalar): treat it as a one-element sequence.
            padded = [-np.inf, -np.inf, x, -np.inf]

        return [
            logsumexp_scipy([left + y[0], right + y[1]])
            for left, right in zip(padded, padded[1:])
        ]
Example #6
0
 def _load_clusters(self, cluster_info_file):
     """Read per-cluster, per-sample CCF distributions from a tab-separated file.

     The header row is the line starting with ``Patient_ID``. Every other
     non-blank row must provide ``Sample_ID``, ``Cluster_ID``,
     ``postDP_ccf_mean`` and the 101 columns ``postDP_ccf_0.0`` ..
     ``postDP_ccf_1.0``. Each row's CCF values are clipped below at 1e-20
     and normalised to sum to one.

     Side effect: every cluster id for which ``self.low_ccf_check`` returns
     a true value (given the list of that cluster's ``postDP_ccf_mean``
     values) is appended to ``self._removed_clusters``.

     :param cluster_info_file: path of the tab-separated cluster file.
     :return: ``{cluster_id: {sample_id: normalised float64 CCF array}}``.
     :raises ValueError: if a data row is met before the header row.
     """
     logging.debug(
         'Loading clusters from {} file'.format(cluster_info_file))
     cluster_ccf = {}
     means = {}
     # range (not the Python-2-only xrange) so this also runs on Python 3.
     ccf_headers = ['postDP_ccf_' + str(i / 100.0) for i in range(101)]
     header = None
     with open(cluster_info_file, 'r') as reader:
         for line in reader:
             if not line.strip():
                 continue  # tolerate blank lines (e.g. a trailing newline)
             values = line.strip().split('\t')
             if line.startswith('Patient_ID'):
                 header = dict(
                     (item, idx) for idx, item in enumerate(values))
                 continue
             if header is None:
                 raise ValueError(
                     'Data row found before the Patient_ID header in '
                     '{}'.format(cluster_info_file))
             sample_id = values[header['Sample_ID']]
             cluster_id = int(values[header['Cluster_ID']])
             cluster_mean = float(values[header['postDP_ccf_mean']])
             ccf = np.array(
                 [float(values[header[name]]) for name in ccf_headers],
                 dtype=np.float64)
             # Floor the values so the logarithm is finite, then
             # normalise in log space.
             ccf = np.clip(ccf, a_min=1e-20, a_max=None)
             ccf = np.log(ccf, dtype=np.float64)
             ccf = np.exp(ccf - logsumexp_scipy(ccf))
             means.setdefault(cluster_id, []).append(cluster_mean)
             cluster_ccf.setdefault(cluster_id, {})[sample_id] = ccf
     for cluster_id in cluster_ccf:
         # Decide whether the cluster should be removed: clusters flagged by
         # low_ccf_check are recorded in _removed_clusters (per the original
         # note, so they can be left out of the BuildTree algorithm).
         if self.low_ccf_check(means[cluster_id]):
             self._removed_clusters.append(cluster_id)
             logging.debug('Removed cluster {} '.format(cluster_id))
     return cluster_ccf
Example #7
0
    def one_iteration(self, resample=True):
        """Run one reassignment sweep over all mutations, allowing new clusters to open.

        For every mutation a probability vector over the existing clusters
        plus one extra "new cluster" slot is built and a single draw decides
        where the mutation goes. After the sweep the state is appended to
        ``self.results``, a progress token is printed, and (optionally)
        ``self.eta`` and ``self.alpha`` are resampled.

        :param resample: when true, resample ``self.eta`` and ``self.alpha``
            at the end of the sweep.
        :return: None (there is no return statement).
        """
        for mut in self.mutations:
            # NOTE(review): skip_count is never read in this method.
            skip_count = 1
            # Rows 0..n_clusters-1 are existing clusters; the last row is the new-cluster slot.
            loglik = np.ones((self.n_clusters + 1, self.n_samples), dtype=np.float64) * -np.inf
            const_array = np.zeros((self.n_clusters + 1,), dtype=np.float64)

            for cluster_idx, cluster in enumerate(self.clusterlist):
                ## if the current point is the only thing in the cluster...
                # This seems to work empirically (as well as theoretically)
                # Skipping leaves this row at -inf, i.e. zero probability for the
                # mutation's own singleton cluster.
                if len(cluster) == 1 and mut.assigned_to == cluster:
                    # skip_count+=1 #at most 2
                    continue
                # Weight proportional to the cluster size.
                stay_in_clust_const = len(cluster) / float(self.n_muts - 1 + self.alpha)

                const_array[cluster_idx] = np.log(stay_in_clust_const)

                if mut not in cluster:  # TODO: redefine in
                    loglik[cluster_idx] = self.logsum_of_marginals_per_sample(cluster.normed_hist + mut.loghist)
                else:
                    # For the mutation's own cluster, score against the cluster with the
                    # mutation taken out (presumably what `cluster - mut` yields; the
                    # operator is defined outside this view).
                    loglik[cluster_idx] = self.logsum_of_marginals_per_sample(
                        self.normalize_loghist_with_prior(cluster - mut) + mut.loghist)

            # Weight of the new-cluster slot, proportional to alpha.
            open_new_clust_const = self.alpha / float(self.n_muts - 1 + self.alpha)

            # Prior for a new cluster: exp(logprior) minus the element-wise maximum of the
            # existing clusters' exp(normed_hist), clipped to [1e-40, 1] so the log below is finite.
            prior = np.clip(np.exp(self.logprior) - np.exp(
                functools.reduce(lambda x, y: np.maximum(x, y), [z.normed_hist for z in self.clusterlist])), a_min=1e-40,
                            a_max=1.)

            loglik[-1] = self.logsum_of_marginals_per_sample(
                mut.loghist + self.normalize_loghist_with_prior(np.log(prior)))
            const_array[-1] = np.log(open_new_clust_const)

            # c_loglik = np.sum(c_loglik, axis = 1)
            # Collapse the per-sample values into one log-likelihood per slot.
            loglik = np.sum(loglik, axis=1)  # + const_array

            # Normalise the likelihoods first, then add the size/alpha weights.
            loglik = loglik - logsumexp_scipy(loglik)
            loglik = loglik + const_array

            # Normalised assignment probabilities in linear space.
            c_lik = np.exp(loglik - logsumexp_scipy(loglik))

            # Draw a single slot index from c_lik.
            new_cluster_idx = np.nonzero(np.random.multinomial(1, c_lik) == 1)[0][0]
            if new_cluster_idx == self.n_clusters:  # new cluster
                mut.assigned_to -= mut
                DP_cluster(self, mut)  # create new cluster, lists updated automatically
            else:
                new_cluster = self.clusterlist[new_cluster_idx]
                # Presumably detaches the mutation from its current cluster and attaches
                # it to the drawn one via the cluster's in-place operators - TODO confirm.
                mut.assigned_to -= mut
                new_cluster += mut

        # Map cluster ids to 1-based positions in clusterlist (the 0 from count() is discarded).
        cluster_counter = itertools.count()
        next(cluster_counter)
        real_index = dict([[x.id, next(cluster_counter)] for x in self.clusterlist])
        # Record the state reached at the end of this sweep; alpha and eta are
        # stored before any resampling below.
        self.results.assign.append([real_index[x.assigned_to.id] for x in self.mutations])
        self.results.alpha.append(self.alpha)
        self.results.eta.append(self.eta)
        self.results.cluster_loghistograms.append([cluster.normed_hist for cluster in self.clusterlist])
        # Index of the largest entry of each row of every cluster histogram.
        self.results.cluster_positions.append(
            [[np.argmax(x) for x in cluster.normed_hist] for cluster in self.clusterlist])
        self.results.clust_prop.append([len(cluster) / float(self.n_muts) for cluster in self.clusterlist])
        self.results.clust_size.append([len(cluster) for cluster in self.clusterlist])
        self.results.K.append(self.n_clusters)

        # Progress token "<n_clusters>(<alpha rounded to 1 decimal>);".
        # NOTE(review): the trailing comma inside print(...) looks like a leftover from a
        # Python 2 print statement; as a function call it does not suppress the newline.
        print("{}({});".format(self.n_clusters, round(self.alpha, 1)),)
        sys.stdout.flush()

        if resample:
            ##resample alpha
            # eta is drawn from Beta(alpha + 1, n_muts); alpha is then drawn by
            # sample_gamma_cond_N_k (defined outside this view).
            self.eta = stats.beta.rvs(self.alpha + 1, self.n_muts)
            self.alpha = sample_gamma_cond_N_k(self.n_muts, self.n_clusters, self.eta,
                                               self.gamma_prior)  ## Escobar and West 1995
Example #8
0
    def _load_mutations(self, mut_info_file):
        """Read mutations from a tab-separated file and register them on this object.

        The header row is the line starting with ``Patient_ID``. Rows whose
        ``Cluster_Assignment`` is listed in ``self._removed_clusters`` are
        ignored. For every remaining non-blank row the 101 columns
        ``preDP_ccf_0.0`` .. ``preDP_ccf_1.0`` are clipped below at 1e-20
        and normalised to sum to one, a ``SomaticEvents.SomMutation`` is
        built, and it is stored in:

        * ``self._cluster_mutations[cluster_id][mutation_str][sample_id]``
        * ``self._samples_mutations[sample_id]`` (the mutation string)
        * ``self._clusters[cluster_id]`` via ``add_mutation``

        where ``mutation_str`` is ``chromosome:position:ref:alt``.

        :param mut_info_file: path of the tab-separated mutation file.
        :return: None.
        :raises ValueError: if a data row is met before the header row.
        """
        logging.debug('Loading mutations from {} file'.format(mut_info_file))
        # range (not the Python-2-only xrange) so this also runs on Python 3.
        ccf_headers = ['preDP_ccf_' + str(i / 100.0) for i in range(101)]
        header = None
        with open(mut_info_file, 'r') as reader:
            for line in reader:
                if not line.strip():
                    continue  # tolerate blank lines (e.g. a trailing newline)
                values = line.strip().split('\t')
                if line.startswith('Patient_ID'):
                    header = dict(
                        (item, idx) for idx, item in enumerate(values))
                    continue
                if header is None:
                    raise ValueError(
                        'Data row found before the Patient_ID header in '
                        '{}'.format(mut_info_file))

                cluster_id = int(values[header['Cluster_Assignment']])
                if cluster_id in self._removed_clusters:
                    continue  # mutations of removed clusters are not loaded

                chromosome = values[header['Chromosome']]
                position = values[header['Start_position']]
                ref = values[header['Reference_Allele']]
                alt = values[header['Tumor_Seq_Allele']]
                sample_id = values[header['Sample_ID']]

                # Floor the values so the logarithm is finite, then
                # normalise in log space.
                ccf_1d = np.array(
                    [float(values[header[name]]) for name in ccf_headers],
                    dtype=np.float64)
                ccf_1d = np.clip(ccf_1d, a_min=1e-20, a_max=None)
                ccf_1d = np.log(ccf_1d, dtype=np.float64)
                ccf_1d = np.exp(ccf_1d - logsumexp_scipy(ccf_1d))

                var_type = values[header['Variant_Type']]
                mutation_str = ':'.join([chromosome, position, ref, alt])

                # Create the nested containers before building the mutation,
                # as the original code did.
                per_sample = self._cluster_mutations.setdefault(
                    cluster_id, {}).setdefault(mutation_str, {})
                sample_mutations = self._samples_mutations.setdefault(
                    sample_id, [])

                mutation = SomaticEvents.SomMutation(
                    chromosome,
                    position,
                    ref,
                    alt,
                    ccf_1d,
                    ref_cnt=values[header['t_ref_count']],
                    alt_cnt=values[header['t_alt_count']],
                    gene=values[header['Hugo_Symbol']],
                    prot_change=values[header['Protein_change']],
                    mut_category=values[header['Variant_Classification']],
                    from_sample=sample_id,
                    type_=var_type)

                per_sample[sample_id] = mutation
                sample_mutations.append(mutation_str)
                self._clusters[cluster_id].add_mutation(mutation)
                logging.info(
                    'Mutation {} loaded from sample {}'.format(
                        mutation_str, sample_id))
Example #9
0
def logsum_of_marginals_per_sample(loghist):
    """Reduce axis 1 of a log-histogram with log-sum-exp.

    The input is converted to a float32 array and every 1-D slice along
    axis 1 is collapsed to its log-sum-exp.

    :param loghist: array-like with at least two dimensions.
    :return: array with axis 1 removed.
    """
    log_values = np.array(loghist, dtype=np.float32)
    return np.apply_along_axis(logsumexp_scipy, 1, log_values)