Esempio n. 1
0
    def estimate_mutation_frequencies(self, region="global"):
        '''
        Estimate mutation frequencies in a particular region for the
        nucleotide alignment ('nuc') and every entry of
        self.seqs.translations.

        Only samples whose 'num_date' attribute lies in
        [self.pivots[0], self.pivots[-1]) are used; for any region other
        than "global" the samples are additionally restricted to those
        whose 'region' attribute equals `region`.

        Currently the global frequencies should be estimated first because
        this defines the set of positions at which frequencies in other
        regions are estimated (a KeyError is raised otherwise).

        Results are stored on the instance:
          self.frequencies[(region, prot)]
          self.frequency_confidence[(region, prot)]
          self.tip_count[region]  -- counts of the last alignment that had
                                     samples; left untouched if none had any

        region -- "global" or a value compared against each sample's
                  attributes['region']
        Returns None.
        '''
        def filter_alignment(aln, region=None, lower_tp=None, upper_tp=None):
            # Every filter narrows the result of the previous one, so the
            # returned records satisfy all requested criteria at once.
            tmp = aln
            if region is not None:
                tmp = [s for s in tmp if s.attributes['region'] == region]
            if lower_tp is not None:
                tmp = [s for s in tmp if s.attributes['num_date'] >= lower_tp]
            if upper_tp is not None:
                tmp = [s for s in tmp if s.attributes['num_date'] < upper_tp]
            return tmp

        def to_alignment(records):
            # Biopython is only needed once there is something to align.
            from Bio.Align import MultipleSeqAlignment
            return MultipleSeqAlignment(records)

        if not hasattr(self.seqs, 'aln'):
            print("Align sequences first")
            return

        # loop over nucleotide sequences and translations and calculate
        # region specific frequencies of mutations above a certain threshold
        # (list() keeps the concatenation valid when items() returns a view)
        alignments = [('nuc', self.seqs.aln)] + list(self.seqs.translations.items())
        for prot, aln in alignments:
            if region == "global":
                records = filter_alignment(aln, lower_tp=self.pivots[0],
                                           upper_tp=self.pivots[-1])
                include_set = []
            else:
                records = filter_alignment(aln, region=region,
                                           lower_tp=self.pivots[0],
                                           upper_tp=self.pivots[-1])
                # Results are stored under (region, prot), so the global
                # entry lives at ('global', prot).
                # NOTE(review): include_set is not forwarded to the
                # estimator here -- confirm whether mutation_frequencies()
                # should receive it.
                include_set = set([pos for (pos, mut) in self.frequencies[('global', prot)]])

            time_points = [x.attributes['num_date'] for x in records]
            if len(time_points) == 0:
                print('no samples in region', region, prot)
                continue

            aln_frequencies = alignment_frequencies(to_alignment(records), time_points,
                                            self.pivots, ws=max(2, len(time_points)//10),
                                            **self.kwargs)
            aln_frequencies.mutation_frequencies(min_freq=0.01)
            self.frequencies[(region, prot)] = aln_frequencies.frequencies
            self.frequency_confidence[(region, prot)] = aln_frequencies.calc_confidence()
            # Recorded inside the loop so that a run in which every
            # alignment is empty does not touch an unbound name.
            self.tip_count[region] = aln_frequencies.counts
Esempio n. 2
0
    def estimate_mutation_frequencies(self, region="global", pivots=24):
        '''
        Estimate mutation frequencies in a particular region for the
        nucleotide alignment ('nuc') and every entry of
        self.seqs.translations.

        Currently the global frequencies should be estimated first because
        this defines the set of positions at which frequencies in other
        regions are estimated (a KeyError is raised otherwise).

        region -- either a string (used both as the name under which results
                  are stored and as the value to match), or a tuple
                  (name, match) where match is a string or a list of strings
                  compared against each sample's attributes['region'].
                  A match of "global" disables the region filter.
        pivots -- forwarded to make_pivots() when self.pivots does not exist
                  yet; ignored otherwise.

        Only samples whose mean 'num_date' lies in
        [self.pivots[0], self.pivots[-1]) are used.

        Results are stored on the instance:
          self.mutation_frequencies[(region_name, prot)]
          self.mutation_frequency_confidence[(region_name, prot)]
          self.mutation_frequency_counts[region_name]
        When an alignment has no samples, the counts entry is set to zeros
        shaped like self.pivots and that alignment is skipped.

        Returns None (also when the region argument has an unsupported
        type, after printing a message).
        '''
        if not hasattr(self.seqs, 'aln'):
            print("Align sequences first")
            return

        def filter_alignment(aln, region=None, lower_tp=None, upper_tp=None):
            # Returns the records passing all requested filters, or None
            # when `region` is neither a string nor a list.
            tmp = aln
            if region is not None:
                if isinstance(region, str):
                    tmp = [s for s in tmp if s.attributes['region'] == region]
                elif isinstance(region, list):
                    tmp = [s for s in tmp if s.attributes['region'] in region]
                else:
                    print("region must be string or list")
                    return None
            if lower_tp is not None:
                tmp = [
                    s for s in tmp
                    if np.mean(s.attributes['num_date']) >= lower_tp
                ]
            if upper_tp is not None:
                tmp = [
                    s for s in tmp
                    if np.mean(s.attributes['num_date']) < upper_tp
                ]
            return tmp

        def to_alignment(records):
            # Biopython is only needed once there is something to align.
            from Bio.Align import MultipleSeqAlignment
            return MultipleSeqAlignment(records)

        if not hasattr(self, 'pivots'):
            tps = np.array([
                np.mean(x.attributes['num_date'])
                for x in self.seqs.seqs.values()
            ])
            self.pivots = make_pivots(pivots, tps)
        else:
            print('estimate_mutation_frequencies: using self.pivots')

        if not hasattr(self, 'mutation_frequencies'):
            self.mutation_frequencies = {}
            self.mutation_frequency_confidence = {}
            self.mutation_frequency_counts = {}

        # split the region argument into a storage name and a match value
        if isinstance(region, str):
            region_name = region
            region_match = region
        elif isinstance(region, tuple):
            region_name = region[0]
            region_match = region[1]
        else:
            print("region must be string or tuple")
            return

        # loop over nucleotide sequences and translations and calculate
        # region specific frequencies of mutations above a certain threshold
        # (list() keeps the concatenation valid when items() returns a view)
        alignments = [('nuc', self.seqs.aln)] + list(self.seqs.translations.items())
        for prot, aln in alignments:
            if region_match == "global":
                records = filter_alignment(aln,
                                           lower_tp=self.pivots[0],
                                           upper_tp=self.pivots[-1])
                include_set = []
            else:
                records = filter_alignment(aln,
                                           region=region_match,
                                           lower_tp=self.pivots[0],
                                           upper_tp=self.pivots[-1])
                if records is None:
                    # unsupported match type: message already printed
                    return
                # NOTE(review): include_set is not forwarded to the
                # estimator here -- confirm whether mutation_frequencies()
                # should receive it.
                include_set = set([
                    pos for (pos, mut) in self.mutation_frequencies[('global',
                                                                     prot)]
                ])

            time_points = [np.mean(x.attributes['num_date']) for x in records]
            if len(time_points) == 0:
                print('no samples in region', region_name, prot)
                self.mutation_frequency_counts[region_name] = np.zeros_like(
                    self.pivots)
                continue

            aln_frequencies = alignment_frequencies(
                to_alignment(records),
                time_points,
                self.pivots,
                ws=max(2, len(time_points) // 10),
                **self.kwargs)
            aln_frequencies.mutation_frequencies(min_freq=0.01)
            self.mutation_frequencies[(region_name,
                                       prot)] = aln_frequencies.frequencies
            self.mutation_frequency_confidence[(
                region_name, prot)] = aln_frequencies.calc_confidence()
            self.mutation_frequency_counts[
                region_name] = aln_frequencies.counts
Esempio n. 3
0
    def estimate_mutation_frequencies(self,
                                      inertia=0.0,
                                      min_freq=0.01,
                                      stiffness=20.0,
                                      pivots=24,
                                      region="global",
                                      include_set=None):
        '''
        Estimate mutation frequencies in a particular region for the
        nucleotide alignment ('nuc') and every entry of
        self.seqs.translations, then pickle all results.

        Currently the global frequencies should be estimated first because
        this defines the set of positions at which frequencies in other
        regions are estimated: positions present in the ('global', prot)
        result are added to the positions requested for that alignment.

        inertia, stiffness -- forwarded to alignment_frequencies()
        min_freq    -- forwarded to its mutation_frequencies() call
        pivots      -- forwarded to make_pivots() when self.pivots does not
                       exist yet; ignored otherwise
        region      -- either a string (storage name and match value), or a
                       tuple (name, match) where match is a string or a list
                       of strings compared against each sample's
                       attributes['region']; the string "global" disables
                       the region filter
        include_set -- optional mapping from alignment name ('nuc' or a
                       translation key) to positions whose frequencies must
                       be estimated; None means no extra positions

        Only samples whose mean 'num_date' lies in
        [self.pivots[0], self.pivots[-1]) are used. Alignments whose
        (region_name, prot) result already exists are skipped.

        Results are stored in self.mutation_frequencies,
        self.mutation_frequency_confidence (both keyed by
        (region_name, prot)) and self.mutation_frequency_counts (keyed by
        region_name), and written to self.output_path + "_mut_freqs.pickle"
        together with the set of sequence names.

        Returns None. Returns early, without writing the pickle, when
        sequences are not aligned or the region argument has an
        unsupported type.
        '''
        if not hasattr(self.seqs, 'aln'):
            self.log.warn("Align sequences first")
            return

        if include_set is None:
            include_set = {}

        def filter_alignment(aln, region=None, lower_tp=None, upper_tp=None):
            # Returns the records passing all requested filters, or None
            # when `region` is neither a string nor a list.
            tmp = aln
            if region is not None:
                if isinstance(region, str):
                    tmp = [s for s in tmp if s.attributes['region'] == region]
                elif isinstance(region, list):
                    tmp = [s for s in tmp if s.attributes['region'] in region]
                else:
                    self.log.warn("region must be string or list")
                    return None
            if lower_tp is not None:
                tmp = [
                    s for s in tmp
                    if np.mean(s.attributes['num_date']) >= lower_tp
                ]
            if upper_tp is not None:
                tmp = [
                    s for s in tmp
                    if np.mean(s.attributes['num_date']) < upper_tp
                ]
            return tmp

        def to_alignment(records):
            # Biopython is only needed once there is something to align.
            from Bio.Align import MultipleSeqAlignment
            return MultipleSeqAlignment(records)

        if not hasattr(self, 'pivots'):
            tps = np.array([
                np.mean(x.attributes['num_date'])
                for x in self.seqs.seqs.values()
            ])
            self.pivots = make_pivots(pivots, tps)

        if not hasattr(self, 'mutation_frequencies'):
            self.restore_mutation_frequencies()

        # split the region argument into a storage name and a match value
        if isinstance(region, str):
            region_name = region
            region_match = region
        elif isinstance(region, tuple):
            region_name = region[0]
            region_match = region[1]
        else:
            self.log.warn("region must be string or tuple")
            return

        # loop over different alignment types
        # (list() keeps the concatenation valid when items() returns a view)
        alignments = [('nuc', self.seqs.aln)] + list(self.seqs.translations.items())
        for prot, aln in alignments:
            if (region_name, prot) in self.mutation_frequencies:
                self.log.notify(
                    "Skipping Frequency Estimation for region \"{}\", protein \"{}\""
                    .format(region_name, prot))
                continue
            self.log.notify(
                "Starting Frequency Estimation for region \"{}\", protein \"{}\""
                .format(region_name, prot))

            # determine set of positions that have to have a frequency calculated
            if prot in include_set:
                tmp_include_set = list(include_set[prot])
            else:
                tmp_include_set = []

            records = filter_alignment(
                aln,
                region=None if region == 'global' else region_match,
                lower_tp=self.pivots[0],
                upper_tp=self.pivots[-1])
            if records is None:
                # unsupported match type: warning already logged
                return

            if ('global', prot) in self.mutation_frequencies:
                tmp_include_set += set([
                    pos for (pos, mut) in self.mutation_frequencies[('global',
                                                                     prot)]
                ])

            time_points = [np.mean(x.attributes['num_date']) for x in records]
            if len(time_points) == 0:
                self.log.notify('no samples in region {} (protein: {})'.format(
                    region_name, prot))
                self.mutation_frequency_counts[region_name] = np.zeros_like(
                    self.pivots)
                continue

            # instantiate alignment frequency
            aln_frequencies = alignment_frequencies(
                to_alignment(records),
                time_points,
                self.pivots,
                ws=max(2, len(time_points) // 10),
                inertia=inertia,
                stiffness=stiffness,
                method='SLSQP')
            if prot == 'nuc':  # if this is a nucleotide alignment, set all non-canonical states to N
                # presumably a numpy character array (boolean-mask
                # assignment below relies on that) -- TODO confirm
                A = aln_frequencies.aln
                # Gaps are compared element-wise like the four bases; the
                # earlier constant test ('A' == '-') was always False and
                # turned every gap into 'N'.
                canonical = ((A == 'A') | (A == 'C') | (A == 'G') |
                             (A == 'T') | (A == '-'))
                A[~canonical] = 'N'

            aln_frequencies.mutation_frequencies(
                min_freq=min_freq,
                include_set=tmp_include_set,
                ignore_char='N' if prot == 'nuc' else 'X')
            self.mutation_frequencies[(region_name,
                                       prot)] = aln_frequencies.frequencies
            self.mutation_frequency_confidence[(
                region_name, prot)] = aln_frequencies.calc_confidence()
            self.mutation_frequency_counts[
                region_name] = aln_frequencies.counts

        self.log.notify("Saving mutation frequencies (pickle)")
        with open(self.output_path + "_mut_freqs.pickle", 'wb') as fh:
            # first object: names of the sequences the results belong to
            pickle.dump(set(self.seqs.seqs.keys()),
                        fh,
                        protocol=pickle.HIGHEST_PROTOCOL)
            # second object: (frequencies, confidence, counts)
            pickle.dump(
                (self.mutation_frequencies, self.mutation_frequency_confidence,
                 self.mutation_frequency_counts),
                fh,
                protocol=pickle.HIGHEST_PROTOCOL)