def collect_data(patient_codes, regions, reference, synnonsyn=True):
    '''
    Loop over regions and produce a dictionary that contains the frequencies,
    syn/nonsyn designations and mutation rates.

    Parameters:
       patient_codes: iterable of patient codes, each passed to Patient.load
       regions:       iterable of region names; "genomewide" selects the whole
                      reference consensus, any other name is looked up in
                      reference.annotation
       reference:     object providing `consensus` and `annotation`
       synnonsyn:     forwarded unchanged to collect_weighted_afs

    Returns:
       dict with keys 'af_by_pat', 'mut_rate', 'syn_by_pat' and 'syn_by_pat_uc',
       each of which is a dict keyed by region name.
    '''
    combined_af_by_pat = {}
    syn_nonsyn_by_pat = {}
    syn_nonsyn_by_pat_unconstrained = {}
    consensus_mutation_rate = {}

    mutation_rates = load_mutation_rates()['mu']
    # Total rate away from each nucleotide: sum over all mutations whose label
    # starts with that nucleotide. Going through .keys() plus item lookup works
    # for a plain dict as well as a pandas Series, on Python 2 and 3 alike.
    total_muts = {nuc: sum(mutation_rates[mut] for mut in mutation_rates.keys()
                           if mut[0] == nuc)
                  for nuc in 'ACGT'}

    patients = []
    for pcode in patient_codes:
        print(pcode)
        patients.append(Patient.load(pcode))

    # The full consensus string does not depend on the region: build it once
    consensus_seq = "".join(reference.consensus)

    for region in regions:
        if region == "genomewide":
            region_seq = consensus_seq
        else:
            region_seq = reference.annotation[region].extract(consensus_seq)

        (combined_af_by_pat[region],
         syn_nonsyn_by_pat[region],
         syn_nonsyn_by_pat_unconstrained[region]) = collect_weighted_afs(
            region, patients, reference, synnonsyn=synnonsyn)

        # Gaps and N have no defined mutation rate and are set to NaN
        consensus_mutation_rate[region] = np.array(
            [total_muts[nuc] if nuc not in ('-', 'N') else np.nan
             for nuc in region_seq])

    return {'af_by_pat': combined_af_by_pat,
            'mut_rate': consensus_mutation_rate,
            'syn_by_pat': syn_nonsyn_by_pat,
            'syn_by_pat_uc': syn_nonsyn_by_pat_unconstrained}
ax.grid(True) plt.tight_layout() plt.ion() plt.show() for ext in ['svg', 'png', 'pdf']: fig.savefig('../figures/figure_S1.' + ext) return ax # Script if __name__ == '__main__': parser = argparse.ArgumentParser(description='Figure S1') parser.add_argument('--threshold', default=0.1, help='diversity threshold') parser.add_argument('--gp120', action='store_true', default=False, help='exclude gp120') args = parser.parse_args() mu = load_mutation_rates(args.threshold, args.gp120) plot_comparison(mu['mu'], mu['muA'], dmulog10=mu['dmulog10'], dmuAlog10=mu['dmuAlog10'])
transform=ax.transAxes, fontsize=fs) ax.grid(True) plt.tight_layout() plt.ion() plt.show() for ext in ['svg', 'png', 'pdf']: fig.savefig('../figures/figure_S1.'+ext) return ax # Script if __name__ == '__main__': parser = argparse.ArgumentParser(description='Figure S1') parser.add_argument('--threshold', default=0.1, help='diversity threshold') parser.add_argument('--gp120', action='store_true', default=False, help='exclude gp120') args = parser.parse_args() mu = load_mutation_rates(args.threshold, args.gp120) plot_comparison(mu['mu'], mu['muA'], dmulog10=mu['dmulog10'], dmuAlog10=mu['dmuAlog10'])
def collect_data(patients, cov_min=100, no_sweeps=False, refname='HXB2'):
    '''Collect data for the fitness cost estimate

    For every patient and every genome position that passes the filters below,
    one record is stored per retained time point.

    Parameters:
       patients:  iterable of patient codes, each passed to Patient.load
       cov_min:   forwarded to Patient.get_allele_frequency_trajectories
       no_sweeps: if True, skip sites at which a non-ancestral nucleotide that
                  changes the amino acid exceeds frequency 0.5 at any retained
                  time point
       refname:   name of the external reference used for the coordinate map

    Returns:
       pandas.DataFrame with one row per record and the columns 'time', 'af'
       (1 - frequency of the patient's initial allele), 'pos', 'pos_ref',
       'protein', 'pcode', 'ancestral', 'S' and 'n_templates'. The frame has
       no columns if no site passes the filters.
    '''
    print('Collect data from patients')

    ref = HIVreference(refname=refname, subtype='any', load_alignment=True)

    data = []
    for pcode in patients:
        print(pcode)

        p = Patient.load(pcode)
        # Series indexed by patient coordinate, holding the reference coordinate
        comap = (pd.DataFrame(p.map_to_external_reference('genomewide', refname=refname)[:, :2],
                              columns=[refname, 'patient'])
                 .set_index('patient', drop=True)
                 .loc[:, refname])

        aft = p.get_allele_frequency_trajectories('genomewide', cov_min=cov_min)
        # presumably aft is (time, allele, position), so that each aft_pos is
        # (allele, time) -- TODO confirm against Patient
        for pos, aft_pos in enumerate(aft.swapaxes(0, 2)):
            fead = p.pos_to_feature[pos]

            # Keep only sites within ONE protein
            # Note: we could drop this, but then we cannot quite classify syn/nonsyn
            if len(fead['protein_codon']) != 1:
                continue

            # Exclude codons with gaps
            # pc is the offset of this site within its codon
            pc = fead['protein_codon'][0][-1]
            cod_anc = ''.join(p.initial_sequence[pos - pc: pos - pc + 3])
            if '-' in cod_anc:
                continue

            # Keep only nonmasked times: a time point is dropped if any of the
            # first four alleles is masked there
            masked_times = aft_pos[:4].mask.any(axis=0)
            if masked_times.all():
                continue
            keep = ~masked_times
            times = p.dsi[keep]
            aft_pos = aft_pos[:, keep]
            n_templates = p.n_templates_dilutions[keep]

            # Get site entropy
            if pos not in comap.index:
                continue
            pos_ref = comap.loc[pos]
            S_pos = ref.entropy[pos_ref]

            # Keep only sites where the ancestral allele and group M agree
            if ref.consensus_indices[pos_ref] != aft_pos[:, 0].argmax():
                continue

            # Filter out sweeps if so specified, only for nonsyn
            if no_sweeps:
                nuc_anc = p.initial_sequence[pos]
                swept = False
                for ia, aft_nuc in enumerate(aft_pos[:4]):
                    if alpha[ia] == nuc_anc or not (aft_nuc > 0.5).any():
                        continue
                    cod_new = cod_anc[:pc] + alpha[ia] + cod_anc[pc + 1:]
                    if translate(cod_anc) != translate(cod_new):
                        # One nonsynonymous sweep is enough to reject the site
                        swept = True
                        break
                if swept:
                    continue

            # Keep only 1 - ancestral allele
            ia = p.initial_indices[pos]
            aft_nuc = 1 - aft_pos[ia]
            protein = fead['protein_codon'][0][0]
            ancestral = alpha[ia]
            for t, af_nuc, n_temp in zip(times, aft_nuc, n_templates):
                data.append({'time': t,
                             'af': af_nuc,
                             'pos': pos,
                             'pos_ref': pos_ref,
                             'protein': protein,
                             'pcode': pcode,
                             'ancestral': ancestral,
                             'S': S_pos,
                             'n_templates': n_temp,
                             })

    return pd.DataFrame(data)
def collect_data(patients, cov_min=100, no_sweeps=False, refname='HXB2'):
    '''Collect data for the fitness cost estimate

    For every patient and every genome position that passes the filters below,
    one record is stored per retained time point.

    Parameters:
       patients:  iterable of patient codes, each passed to Patient.load
       cov_min:   forwarded to Patient.get_allele_frequency_trajectories
       no_sweeps: if True, skip sites at which a non-ancestral nucleotide that
                  changes the amino acid exceeds frequency 0.5 at any retained
                  time point
       refname:   name of the external reference used for the coordinate map

    Returns:
       pandas.DataFrame with one row per record and the columns 'time', 'af'
       (1 - frequency of the patient's initial allele), 'pos', 'pos_ref',
       'protein', 'pcode', 'ancestral', 'S' and 'n_templates'. The frame has
       no columns if no site passes the filters.
    '''
    print('Collect data from patients')

    ref = HIVreference(refname=refname, subtype='any', load_alignment=True)

    data = []
    for pcode in patients:
        print(pcode)

        p = Patient.load(pcode)
        # Series indexed by patient coordinate, holding the reference coordinate
        comap = (pd.DataFrame(
            p.map_to_external_reference('genomewide', refname=refname)[:, :2],
            columns=[refname, 'patient']).set_index('patient', drop=True).loc[:, refname])

        aft = p.get_allele_frequency_trajectories('genomewide', cov_min=cov_min)
        # presumably aft is (time, allele, position), so that each aft_pos is
        # (allele, time) -- TODO confirm against Patient
        for pos, aft_pos in enumerate(aft.swapaxes(0, 2)):
            fead = p.pos_to_feature[pos]

            # Keep only sites within ONE protein
            # Note: we could drop this, but then we cannot quite classify syn/nonsyn
            if len(fead['protein_codon']) != 1:
                continue

            # Exclude codons with gaps
            # pc is the offset of this site within its codon
            pc = fead['protein_codon'][0][-1]
            cod_anc = ''.join(p.initial_sequence[pos - pc:pos - pc + 3])
            if '-' in cod_anc:
                continue

            # Keep only nonmasked times: a time point is dropped if any of the
            # first four alleles is masked there
            masked_times = aft_pos[:4].mask.any(axis=0)
            if masked_times.all():
                continue
            keep = ~masked_times
            times = p.dsi[keep]
            aft_pos = aft_pos[:, keep]
            n_templates = p.n_templates_dilutions[keep]

            # Get site entropy
            if pos not in comap.index:
                continue
            pos_ref = comap.loc[pos]
            S_pos = ref.entropy[pos_ref]

            # Keep only sites where the ancestral allele and group M agree
            if ref.consensus_indices[pos_ref] != aft_pos[:, 0].argmax():
                continue

            # Filter out sweeps if so specified, only for nonsyn
            if no_sweeps:
                nuc_anc = p.initial_sequence[pos]
                swept = False
                for ia, aft_nuc in enumerate(aft_pos[:4]):
                    if alpha[ia] == nuc_anc or not (aft_nuc > 0.5).any():
                        continue
                    cod_new = cod_anc[:pc] + alpha[ia] + cod_anc[pc + 1:]
                    if translate(cod_anc) != translate(cod_new):
                        # One nonsynonymous sweep is enough to reject the site
                        swept = True
                        break
                if swept:
                    continue

            # Keep only 1 - ancestral allele
            ia = p.initial_indices[pos]
            aft_nuc = 1 - aft_pos[ia]
            protein = fead['protein_codon'][0][0]
            ancestral = alpha[ia]
            for t, af_nuc, n_temp in zip(times, aft_nuc, n_templates):
                data.append({
                    'time': t,
                    'af': af_nuc,
                    'pos': pos,
                    'pos_ref': pos_ref,
                    'protein': protein,
                    'pcode': pcode,
                    'ancestral': ancestral,
                    'S': S_pos,
                    'n_templates': n_temp,
                })

    return pd.DataFrame(data)