Esempio n. 1
0
 def normalize_in_logspace(dist, in_log_space=True):
     if not in_log_space:
         log_dist = np.log(dist, dtype=np.float64)
         return np.exp(log_dist - logsumexp_scipy(log_dist))
     else:
         logging.debug('Likelihood before normalization\n{}'.format(dist))
         log_dist = np.array(dist, dtype=np.float64)
         return np.exp(log_dist - logsumexp_scipy(log_dist))
Esempio n. 2
0
 def add_sample_density(self, sample_id, density, conv=1e-40):
     sample_idx = self._time_points.index(sample_id)
     density = np.asarray(density, dtype=np.float32) + conv
     log_density = np.log(density, dtype=np.float32)
     self._hist[sample_idx] = density
     self._logprior[sample_idx] = log_density - logsumexp_scipy(log_density)
     self._loghist[sample_idx] = log_density - logsumexp_scipy(log_density)
     logging.debug('Added density for cluster {} for sample {}'.format(
         self._identifier, sample_id))
 def _load_clusters(self, cluster_info_file):
     logging.debug('Loading clusters from {} file'.format(cluster_info_file))
     cluster_ccf = {}
     means = {}
     ccf_headers = ['postDP_ccf_' + str(i / 100.0) for i in range(0, 101, 1)]
     with open(cluster_info_file, 'r') as reader:
         for line in reader:
             values = line.strip().split('\t')
             if line.startswith('Patient_ID'):
                 header = dict((item, idx) for idx, item in enumerate(values))
             else:
                 sample_id = values[header['Sample_ID']]
                 cluster_id = int(values[header['Cluster_ID']])
                 cluster_mean = float(values[header['postDP_ccf_mean']])
                 ccf = np.array([float(values[header[i]]) for i in ccf_headers], dtype=np.float64)
                 ccf = np.clip(ccf, a_min=1e-20, a_max=None)
                 ccf = np.log(ccf, dtype=np.float64)
                 ccf = np.exp(ccf - logsumexp_scipy(ccf))
                 if cluster_id not in cluster_ccf:
                     cluster_ccf[cluster_id] = {}
                     means[cluster_id] = []
                 means[cluster_id].append(cluster_mean)
                 cluster_ccf[cluster_id][sample_id] = ccf
     for cluster_id in cluster_ccf:
         # decide whether cluster should be removed
         # if density < 0.1 across all samples add it to remove clusters, to be removed from BuildTree algorithm
         if self.low_ccf_check(means[cluster_id]):
             self._removed_clusters.append(cluster_id)
             logging.debug('Removed cluster {} '.format(cluster_id))
     return cluster_ccf
Esempio n. 4
0
 def _normalize_in_logspace(dist, in_log_space=True):
     logging.debug('Distribution before normalization\n{}'.format(dist))
     if in_log_space:
         log_dist = np.array(dist, dtype=np.float64)
     else:
         logging.debug(
             'Converting distribution to log space before normalization')
         log_dist = np.log(dist, dtype=np.float64)
     return np.exp(log_dist - logsumexp_scipy(log_dist))
    def _load_mutations(self, mut_info_file):
        logging.debug('Loading mutations from {} file'.format(mut_info_file))
        ccf_headers = ['preDP_ccf_' + str(i / 100.0) for i in range(0, 101, 1)]
        with open(mut_info_file, 'r') as reader:
            for line in reader:
                values = line.strip().split('\t')
                if line.startswith('Patient_ID'):
                    header = dict((item, idx) for idx, item in enumerate(values))
                elif values[header['Variant_Type']] != 'CNV':
                    # TODO: for reshuffling need to keep all mutations and clusters
                    cluster_id = int(values[header['Cluster_Assignment']])
                    if cluster_id not in self._removed_clusters:
                        chromosome = values[header['Chromosome']]
                        position = values[header['Start_position']]
                        ref = values[header['Reference_Allele']]
                        alt = values[header['Tumor_Seq_Allele']]
                        sample_id = values[header['Sample_ID']]
                        ccf_1d = [float(values[header[i]]) for i in ccf_headers]
                        ccf_1d = np.clip(np.array(ccf_1d, dtype=np.float64), a_min=1e-20, a_max=None)
                        ccf_1d = np.log(ccf_1d, dtype=np.float64)
                        ccf_1d = np.exp(ccf_1d - logsumexp_scipy(ccf_1d))
                        var_type = values[header['Variant_Type']]
                        mutation_str = ':'.join([chromosome, position, ref, alt])
                        if cluster_id not in self._cluster_mutations:
                            self._cluster_mutations[cluster_id] = {}
                        if mutation_str not in self._cluster_mutations[cluster_id]:
                            self._cluster_mutations[cluster_id][mutation_str] = {}

                        if sample_id not in self._samples_mutations:
                            self._samples_mutations[sample_id] = []
                        t_ref_count = self._get_count(values[header['t_ref_count']])
                        t_alt_count = self._get_count(values[header['t_alt_count']])

                        mutation = SomaticEvents.SomMutation(chromosome, position, ref, alt, ccf_1d,
                                                             ref_cnt=t_ref_count,
                                                             alt_cnt=t_alt_count,
                                                             gene=values[header['Hugo_Symbol']],
                                                             prot_change=values[header['Protein_change']],
                                                             mut_category=values[header['Variant_Classification']],
                                                             from_sample=sample_id,
                                                             type_=var_type)

                        self._cluster_mutations[cluster_id][mutation_str][sample_id] = mutation
                        self._samples_mutations[sample_id].append(mutation_str)
                        self._clusters[cluster_id].add_mutation(mutation)
                        logging.info('Mutation {} loaded from sample {}'.format(mutation_str, sample_id))
Esempio n. 6
0
 def _make_nd_histogram(hist_array, conv=1e-40):
     hist = np.asarray(hist_array, dtype=np.float32) + conv
     return np.apply_along_axis(lambda z: z - logsumexp_scipy(z), 1,
                                np.log(hist))