def fraction_per_site():
    """
    Same as fraction_per_region but only for pol, and with discrimination
    between the 1st, 2nd and 3rd codon position.

    For every codon position the consensus and non-consensus sites of each
    patient are counted and divided by a third of that patient's own region
    length. The mean and standard deviation over patients are printed,
    together with the ratio non_consensus / (consensus + non_consensus).
    Nothing is returned.
    """
    region = "pol"
    patient_names = ["p1", "p2", "p3", "p4", "p5", "p6", "p8", "p9", "p11"]

    for ii, site in enumerate(["first", "second", "third"]):
        consensus = []
        non_consensus = []
        fraction_non_consensus = []

        for patient_name in patient_names:
            patient = Patient.load(patient_name)
            aft = patient.get_allele_frequency_trajectories(region)
            site_mask = get_site_mask(aft, ii + 1)
            nb_consensus = np.sum(
                get_consensus_mask(patient, region, aft)[site_mask], dtype=int)
            nb_non_consensus = np.sum(
                get_non_consensus_mask(patient, region, aft)[site_mask], dtype=int)

            # Normalise with THIS patient's number of sites per codon position.
            # Previously the counts of all patients were divided by the length
            # of whichever patient happened to be loaded last.
            nb_sites = aft.shape[-1] / 3
            consensus.append(nb_consensus / nb_sites)
            non_consensus.append(nb_non_consensus / nb_sites)
            fraction_non_consensus.append(
                nb_non_consensus / (nb_consensus + nb_non_consensus))

        mean_consensus = np.mean(consensus)
        std_consensus = np.std(consensus)
        mean_non_consensus = np.mean(non_consensus)
        std_non_consensus = np.std(non_consensus)
        mean_fraction_non_consensus = np.mean(fraction_non_consensus)
        std_fraction_non_consensus = np.std(fraction_non_consensus)

        print(f"Site {site}:")
        # NOTE(review): the message text is reproduced with the single-space
        # layout visible in this file - confirm the intended line breaks.
        print(
            f" Consensus {round(mean_consensus, 2)} += {round(std_consensus, 3)}"
            f" Non-consensus {round(mean_non_consensus, 2)} += {round(std_non_consensus, 3)}"
            f" Fraction non_consensus {round(mean_fraction_non_consensus,3)} += {round(std_fraction_non_consensus,3)}"
        )
def make_patient_RNA_DNA_tree(pcode, min_DNA_frac = 0.001):
    '''
    make a tree for all RNA/DNA sample of a given patient

    One alignment (fasta) and one tree (newick) are written to data/ for each
    DNA sequence category. The module-level names `region` and `save_as` are
    read from the enclosing module.

    pcode        -- patient code, also the key into patient_to_prefix_p17
    min_DNA_frac -- threshold handed to prune_rare_DNA
    '''
    def _leaf_order(leaf):
        # leaf names are split on '_': field 1 is parsed as an integer, and
        # field 3 (minus its last character) as an integer sorted descending
        fields = leaf.name.split('_')
        return (int(fields[1]), -int(fields[3][:-1]))

    for seq_type in ['clustered_good', 'good', 'hyper', 'suspicious']:
        # DNA sequences of this category, from every file prefix of the patient
        seqs = []
        for outprefix in patient_to_prefix_p17[pcode]:
            dna_fname = 'data/' + outprefix + '_DNA_' + seq_type + save_as
            with myopen(dna_fname) as ifile:
                seqs.extend(list(SeqIO.parse(ifile, 'fasta')))

        # add the haplotype alignment of the patient
        patient = Patient.load(pcode)
        seqs.extend(patient.get_haplotype_alignment(region))

        # prune, then make every sequence id unique by appending its index
        pruned = prune_rare_DNA(seqs, min_DNA_frac)
        for index, haplotype in enumerate(pruned):
            haplotype.id += '_' + str(index)
            haplotype.name = haplotype.id

        base_name = 'data/' + pcode + '_RNA_and_DNA_' + seq_type
        outfname = base_name + '.fasta'
        align(ungap(pruned), outfname)
        tree = infer_tree(outfname, min_DNA_frac=0.0)

        # root on the first leaf (in _leaf_order) whose name starts with 'days'
        dated_leaves = [leaf for leaf in tree.get_terminals()
                        if leaf.name[:4] == 'days']
        dated_leaves.sort(key=_leaf_order)
        tree.root_with_outgroup(dated_leaves[0])
        tree.root.branch_length = 0.01

        # collapse very short internal branches
        for clade in tree.get_nonterminals(order='postorder'):
            if clade.branch_length < 0.001:
                tree.collapse(clade)
        tree.ladderize()
        Phylo.write(tree, base_name + '.nwk', 'newick')
def collect_data(patient_codes, regions, reference, synnonsyn=True):
    '''
    loop over regions and produce a dictionary that contains the frequencies,
    syn/nonsyn designations and mutation rates

    patient_codes -- iterable of patient codes passed to Patient.load
    regions       -- iterable of region names; "genomewide" selects the whole
                     reference consensus
    reference     -- object providing .consensus and .annotation
    synnonsyn     -- forwarded to collect_weighted_afs

    Returns a dict with keys 'af_by_pat', 'mut_rate', 'syn_by_pat' and
    'syn_by_pat_uc', each a dict keyed by region.
    '''
    combined_af_by_pat = {}
    syn_nonsyn_by_pat = {}
    syn_nonsyn_by_pat_unconstrained = {}
    consensus_mutation_rate = {}

    mutation_rates = load_mutation_rates()['mu']
    # total rate away from each nucleotide: sum over all mutations whose
    # first character is that nucleotide. items() works on Python 2 and 3.
    total_muts = {nuc: sum(rate for mut, rate in mutation_rates.items()
                           if mut[0] == nuc)
                  for nuc in 'ACGT'}

    patients = []
    for pcode in patient_codes:
        print(pcode)
        patients.append(Patient.load(pcode))

    for region in regions:
        consensus_seq = "".join(reference.consensus)
        if region == "genomewide":
            region_seq = consensus_seq
        else:
            region_seq = reference.annotation[region].extract(consensus_seq)

        (combined_af_by_pat[region],
         syn_nonsyn_by_pat[region],
         syn_nonsyn_by_pat_unconstrained[region]) = collect_weighted_afs(
            region, patients, reference, synnonsyn=synnonsyn)

        # gaps, N and any other non-ACGT symbol get NaN instead of raising a
        # KeyError
        consensus_mutation_rate[region] = np.array(
            [total_muts.get(nuc, np.nan) for nuc in region_seq])

    return {'af_by_pat': combined_af_by_pat,
            'mut_rate': consensus_mutation_rate,
            'syn_by_pat': syn_nonsyn_by_pat,
            'syn_by_pat_uc': syn_nonsyn_by_pat_unconstrained}
def fraction_per_region():
    """
    Fraction of consensus and non_consensus site computation. This is for
    initial sequence for each patient.
    Fraction consensus + non_consensus does not equal one because some
    regions are excluded due to gaps.

    For env, pol and gag, the consensus and non-consensus sites of each
    patient are counted and divided by that patient's own region length. The
    mean and standard deviation over patients are printed, together with the
    ratio non_consensus / (consensus + non_consensus). Nothing is returned.
    """
    regions = ["env", "pol", "gag"]
    patient_names = ["p1", "p2", "p3", "p4", "p5", "p6", "p8", "p9", "p11"]

    for region in regions:
        consensus = []
        non_consensus = []
        fraction_non_consensus = []

        for patient_name in patient_names:
            patient = Patient.load(patient_name)
            aft = patient.get_allele_frequency_trajectories(region)
            nb_consensus = np.sum(
                get_consensus_mask(patient, region, aft), dtype=int)
            nb_non_consensus = np.sum(
                get_non_consensus_mask(patient, region, aft), dtype=int)

            # Normalise with THIS patient's region length. Previously the
            # counts of all patients were divided by the length of whichever
            # patient happened to be loaded last.
            nb_sites = aft.shape[-1]
            consensus.append(nb_consensus / nb_sites)
            non_consensus.append(nb_non_consensus / nb_sites)
            fraction_non_consensus.append(
                nb_non_consensus / (nb_consensus + nb_non_consensus))

        mean_consensus = np.mean(consensus)
        std_consensus = np.std(consensus)
        mean_non_consensus = np.mean(non_consensus)
        std_non_consensus = np.std(non_consensus)
        mean_fraction_non_consensus = np.mean(fraction_non_consensus)
        std_fraction_non_consensus = np.std(fraction_non_consensus)

        print(f"Region {region}:")
        # NOTE(review): the message text is reproduced with the single-space
        # layout visible in this file - confirm the intended line breaks.
        print(
            f" Consensus {round(mean_consensus, 2)} += {round(std_consensus, 2)}"
            f" Non-consensus {round(mean_non_consensus, 2)} += {round(std_non_consensus, 2)}"
            f" Fraction non_consensus {round(mean_fraction_non_consensus,3)} += {round(std_fraction_non_consensus,3)}"
        )
def get_divergence_cumulative_sum(sample_slice=-3, patient_names=["p1", "p2", "p3", "p4", "p5", "p6", "p8", "p9", "p11"]):
    """
    Returns the pooled pol divergence values of a slice of each patient's
    divergence array, and their cumulative sum (normalized to 1).

    sample_slice  -- negative: keep rows [sample_slice:] of what
                     get_divergence_in_time returns (the default -3 keeps the
                     last 3); otherwise keep rows [:sample_slice]
    patient_names -- patients to pool (the default list is never modified)

    Only one value in every `sampling` (plus the very last one) is returned,
    as there is a lot of data.
    """
    region = "pol"
    sampling = 20

    all_values = []
    for name in patient_names:
        patient = Patient.load(name)
        div2D = get_divergence_in_time(region, patient)
        if sample_slice < 0:
            values = div2D[sample_slice:]
        else:
            values = div2D[:sample_slice]
        # compressed() drops masked entries and flattens; unlike indexing
        # with ~mask it also works when nothing is masked (mask is nomask)
        all_values.extend(np.ma.compressed(values))

    all_values = np.sort(all_values)
    cum_sum = np.cumsum(all_values)
    cum_sum = cum_sum / cum_sum[-1]

    # thin out the curves, always keeping the end point
    values = np.concatenate((all_values[::sampling], np.array([all_values[-1]])))
    cumulative = np.concatenate((cum_sum[::sampling], np.array([cum_sum[-1]])))
    return values, cumulative
def patient_preprocessing(pat_name,Squant, div = False, outliers = True, xcut = 0.0, xcut_up = 0., cov_min=100):
    '''Load patient data, remove outliers and return derived-allele frequencies
    grouped by the site categories in Squant

    Input arguments:
    pat_name - patient name
    Squant - sequence of categories; Squant[j]['ind'] holds the reference
             positions that belong to category j
    div - if True return x*(1-x) of the ancestral frequency instead of 1-x
    outliers - if True drop sites whose value exceeds 0.5 at any time point
    xcut, xcut_up - not used by the current code
    cov_min - passed to get_allele_frequency_trajectories

    Reads the module-level names gen_region and ref.

    Returns:
    xka_q - list with one (time-point, site) array per category
    tt - corresponding time values
    '''
    q = len(Squant)
    print(pat_name)
    PAT = Patient.load(pat_name)
    tt = PAT.times()
    map_to_ref = PAT.map_to_external_reference(gen_region)

    # load mutation frequencies, retain only positions that map_to_ref
    all_freqs = PAT.get_allele_frequency_trajectories(gen_region, cov_min=cov_min)
    freqs = all_freqs[:, :, map_to_ref[:, 2]]

    # boolean flag over the reference marking gp120
    gp120 = np.zeros(len(ref.seq), dtype=bool)
    gp120[[x for x in ref.annotation['gp120']]] = True

    # determine ancestral indices -> argmax in first time point
    anc_index = np.argmax(freqs[0, :4, :], axis=0)
    anc_freq = freqs[:, anc_index, range(anc_index.shape[0])]

    # time-weighted average of the ancestral frequency; kept from the
    # original code although the result is not used further down
    dt_k = np.array([tt[0]] + list(np.diff(tt)))
    xave = np.sum(dt_k * anc_freq.T, axis=1) / np.sum(dt_k * (~anc_freq.mask).T, axis=1)

    # positions where the initial state agrees with the consensus state,
    # the reference is ungapped, and that lie outside gp120
    ref_pos = map_to_ref[:, 0]
    good_positions = (
        (anc_index == ref.get_consensus_indices_in_patient_region(map_to_ref))
        & (ref.get_ungapped()[ref_pos])
        & (~gp120[ref_pos])
    )

    xka_q = []
    for jq in range(q):
        # reference positions are in map_to_ref[:,0]
        idx_PAT = np.in1d(map_to_ref[:, 0], Squant[jq]['ind']) & good_positions
        derived = 1. - anc_freq[:, idx_PAT]
        xka_q.append(derived * anc_freq[:, idx_PAT] if div else derived)

    # Remove outliers from the data
    if outliers:
        xka_q = [x_ka[:, np.all(x_ka <= 0.5, axis=0)] for x_ka in xka_q]

    return xka_q, tt
def load_patient_data(patient_names = 'all', q = 4, timescale = 'years', filepath = None, fromHIVEVO = False):
    '''Load allele frequencies, times, viral load and dilutions per patient

    Input arguments:
    patient_names - list of patient names, or 'all' for p1..p11
    q - number of alleles kept from the frequency array (first q rows)
    timescale - unit passed to Patient.times (HIVEVO branch only)
    filepath - path prefix of the .npy files; they are read from there, or
               written there when fromHIVEVO is True
    fromHIVEVO - if True load from the hivevo package (reads the
                 module-level name err), otherwise load from filepath

    Returns:
    dict mapping each patient name to (tt, freqs, vload, dilutions), plus
    the keys 'Lref' and 'pat_names'. If neither source is specified a
    message is printed and None is returned.
    '''
    if patient_names == 'all':
        patient_names = ['p{}'.format(j + 1) for j in range(11)]

    if fromHIVEVO:
        sys.path.append('/home/vadim/ebio/users/vpuller/HIVEVO/HIVEVO_access')
        from hivevo.patients import Patient
        from hivevo.HIVreference import HIVreference
        ref = HIVreference(load_alignment=False)
        Lref = len(ref.seq)

        data_all = {}
        for pat_name in patient_names:
            PAT = Patient.load(pat_name)
            tt = PAT.times(unit = timescale)
            vload = PAT.n_templates_viral_load
            dilutions = PAT.n_templates_dilutions
            freqs_raw = PAT.get_allele_frequency_trajectories('genomewide', error_rate=err)[:,:q,:]
            map_to_ref = PAT.map_to_external_reference('genomewide')

            # place the patient frequencies on reference coordinates;
            # positions that are not mapped stay masked
            freqs = np.ma.zeros((tt.shape[0], q, Lref))
            freqs.mask = True
            freqs[:,:,map_to_ref[:,0]] = freqs_raw[:,:,map_to_ref[:,1]]
            data_all[pat_name] = (tt, freqs, vload, dilutions)

            if filepath is not None:
                np.save(filepath + '{}_data.npy'.format(pat_name), freqs.data)
                np.save(filepath + '{}_mask.npy'.format(pat_name), freqs.mask)
                np.save(filepath + '{}_tt.npy'.format(pat_name), tt)
                np.save(filepath + '{}_viral_load.npy'.format(pat_name), vload)
                np.save(filepath + '{}_dilutions.npy'.format(pat_name), dilutions.data)
                np.save(filepath + '{}_dilutions_mask.npy'.format(pat_name), dilutions.mask)

        # freqs.shape[2] equals Lref by construction; using Lref also works
        # when the patient list is empty
        data_all['Lref'] = Lref
        data_all['pat_names'] = patient_names
        return data_all

    if filepath is None:
        # previously this path fell through to `return data_all` with
        # data_all unbound
        print('Path to data is not specified')
        return None

    data_all = {}
    Lref = None
    for pat_name in patient_names:
        tt = np.load(filepath + '{}_tt.npy'.format(pat_name))
        data = np.load(filepath + '{}_data.npy'.format(pat_name))
        mask = np.load(filepath + '{}_mask.npy'.format(pat_name))
        freqs = np.ma.masked_array(data, mask = mask)
        vload = np.load(filepath + '{}_viral_load.npy'.format(pat_name))
        dilutions = np.load(filepath + '{}_dilutions.npy'.format(pat_name))
        dilutions_mask = np.load(filepath + '{}_dilutions_mask.npy'.format(pat_name))
        dilutions = np.ma.masked_array(dilutions, mask = dilutions_mask)
        data_all[pat_name] = (tt, freqs, vload, dilutions)
        Lref = freqs.shape[2]
    data_all['Lref'] = Lref
    data_all['pat_names'] = patient_names
    return data_all
def create_all_patient_trajectories(region, patient_names=None):
    """
    Return the concatenated trajectory lists of several patients.

    region        -- region name passed to get_allele_frequency_trajectories
                     and create_trajectory_list
    patient_names -- patients to load; None or an empty sequence selects the
                     default patient set

    Returns one flat list holding the items produced by
    create_trajectory_list for every patient, in patient order.
    """
    # None replaces the former mutable `[]` default; an explicitly passed
    # empty list still selects the default patients, as before.
    if patient_names is None or len(patient_names) == 0:
        patient_names = ["p1", "p2", "p3", "p4", "p5", "p6", "p8", "p9", "p11"]

    trajectories = []
    ref = HIVreference(subtype="any")
    for patient_name in patient_names:
        patient = Patient.load(patient_name)
        aft = patient.get_allele_frequency_trajectories(region)
        # extend in place instead of rebuilding the list for every patient
        trajectories.extend(create_trajectory_list(patient, region, aft, ref))
    return trajectories
def make_divergence_dict(time, consensus=False):
    """
    Creates a dictionary with the divergence in time averaged over patients.
    Format of the dictionary : dict[region][consensus/non_consensus/all][high/low/all/first/second/third]

    time      -- array of time points the patient divergences are interpolated
                 to (compared against patient.dsi)
    consensus -- forwarded to get_mean_divergence_patient

    Each patient only contributes to the time points that lie before its last
    sample; every time point is divided by the number of contributing
    patients.
    """
    regions = ["env", "pol", "gag"]
    patient_names = ["p1", "p2", "p3", "p4", "p5", "p6", "p8", "p9", "p11"]
    fitness_keys = ["low", "high", "all", "first", "second", "third"]
    all_keys = ["all", "first", "second", "third"]
    # which keys exist under each category
    layout = (("consensus", fitness_keys),
              ("non_consensus", fitness_keys),
              ("all", all_keys))

    divergence_dict = {}
    for region in regions:
        # accumulators start at zero for every category/key
        region_dict = {
            "consensus": {key: np.zeros_like(time, dtype=float) for key in fitness_keys},
            "non_consensus": {key: np.zeros_like(time, dtype=float) for key in fitness_keys},
            "all": {key: np.zeros_like(time, dtype=float)
                    for key in fitness_keys if key in all_keys},
        }
        divergence_dict[region] = region_dict
        nb_traj = np.zeros_like(time)

        for patient_name in patient_names:
            patient = Patient.load(patient_name)
            patient_div_dict = get_mean_divergence_patient(patient, region, consensus)

            # time points covered by this patient
            tmp_time = time[time < patient.dsi[-1]]
            nb_points = len(tmp_time)
            nb_traj[:nb_points] += 1

            for category, keys in layout:
                for key in keys:
                    interpolated = np.interp(
                        tmp_time, patient.dsi, patient_div_dict[category][key])
                    patient_div_dict[category][key] = interpolated
                    region_dict[category][key][:nb_points] += interpolated

        # sum over patients -> mean over the contributing patients
        for category in ["consensus", "non_consensus", "all"]:
            for key in region_dict[category]:
                region_dict[category][key] = region_dict[category][key] / nb_traj

    return divergence_dict
def get_mean_sweep_per_year(region, patient_names=["p1", "p2", "p3", "p4", "p5", "p6", "p8", "p9", "p11"], threshold=0.05):
    """
    Return the per-year number of sites whose initial allele falls to or
    below `threshold`, averaged over patients.

    For each patient, the sites where the frequency of the initial allele is
    <= threshold at one time point or more are counted, and the count is
    divided by patient.ysi[-1]. The result is the mean of these rates.
    """
    rates = []
    for patient_name in patient_names:
        patient = Patient.load(patient_name)
        aft = patient.get_allele_frequency_trajectories(region)
        initial_idx = patient.get_initial_indices(region)

        # frequency of the initial allele for every (time, site)
        time_idx = np.arange(aft.shape[0])[:, np.newaxis, np.newaxis]
        site_idx = np.arange(aft.shape[-1])
        aft_initial = aft[time_idx, initial_idx, site_idx][:, 0, :]

        below = aft_initial <= threshold
        sweep_sites = np.where(np.sum(below, axis=0))[0]
        rates.append(sweep_sites.shape[0] / patient.ysi[-1])

    return sum(rates) / len(patient_names)
def make_bootstrap_divergence_dict(nb_bootstrap=10, consensus=False):
    """
    Creates a dictionary with the bootstrapped divergence in time.

    The divergence of every patient is first interpolated to a common time
    vector, stored as dict[region][patient][consensus/non_consensus/all][high/low/all/first/second/third].
    Patients are then resampled nb_bootstrap times and the divergence is
    averaged over each resampled set.
    Turn consensus to True to compute the divergence to consensus sequence
    instead of founder sequence.

    Returns (time, bootstrap_dict), where every tip of bootstrap_dict is a
    dict {"mean": ..., "std": ...} computed over the bootstrap replicates.
    """
    regions = ["env", "pol", "gag"]
    patient_names = ["p1", "p2", "p3", "p4", "p5", "p6", "p8", "p9", "p11"]
    time = np.arange(0, 2001, 40)

    # Generating a dictionary with the divergence for each patient
    # (interpolated to the time vector)
    divergence_dict = {}
    for region in regions:
        divergence_dict[region] = {}
        for patient_name in patient_names:
            patient = Patient.load(patient_name)
            per_patient = get_mean_divergence_patient(patient, region, consensus)
            divergence_dict[region][patient_name] = per_patient
            for category in per_patient.keys():
                for key2 in per_patient[category].keys():
                    per_patient[category][key2] = np.interp(
                        time, patient.dsi, per_patient[category][key2])

    # Bootstrapping the divergence values over patients. Tips of the dict are
    # lists with one divergence vector per bootstrap replicate
    bootstrap_dict = create_div_bootstrap_dict()
    template = patient_names[0]
    for _ in range(nb_bootstrap):
        bootstrap_names = bootstrap_patient_names()
        for region in regions:
            sampled = [divergence_dict[region][name] for name in bootstrap_names]
            for key in divergence_dict[region][template].keys():
                for key2 in divergence_dict[region][template][key].keys():
                    stacked = np.array([entry[key][key2] for entry in sampled])
                    bootstrap_dict[region][key][key2] += [np.mean(stacked, axis=0)]

    # Averaging the bootstrapping
    for region in bootstrap_dict.keys():
        for key in bootstrap_dict[region].keys():
            for key2 in bootstrap_dict[region][key].keys():
                replicates = np.array(bootstrap_dict[region][key][key2])
                bootstrap_dict[region][key][key2] = {
                    "mean": np.mean(replicates, axis=0),
                    "std": np.std(replicates, axis=0),
                }

    return time, bootstrap_dict
def get_sweep_sites_sum(
        region,
        patient_names=["p1", "p2", "p3", "p4", "p5", "p6", "p8", "p9", "p11"],
        nb_sites=2964):
    """
    Returns a 1D vector with the sum of sweep sites over all patients

    region        -- region name passed to the patient and mask helpers
    patient_names -- patients to include (the default list is never modified)
    nb_sites      -- each patient's filtered mask is truncated to this many
                     sites before the masks are stacked; the default 2964 is
                     the value that used to be hard-coded
    """
    # The reference does not depend on the patient: build it once instead of
    # once per loop iteration.
    ref = HIVreference(subtype="any")

    sites = []
    for patient_name in patient_names:
        patient = Patient.load(patient_name)
        aft = patient.get_allele_frequency_trajectories(region)
        sweep_mask = get_sweep_mask(patient, aft, region, threshold_low=0.5)
        reference_mask = trajectory.get_reference_filter(
            patient, region, aft, ref)
        sites.append(list(sweep_mask[reference_mask][:nb_sites]))

    return np.sum(np.array(sites), axis=0, dtype=int)
def divergence_contribution(region, patient_name):
    """
    Plot the normalised cumulative histogram of the divergence values at the
    initial-allele indices, over the last 3 time points of one patient.

    The figure is shown with plt.show(); nothing is returned.
    """
    patient = Patient.load(patient_name)
    aft = patient.get_allele_frequency_trajectories(region)
    initial_idx = patient.get_initial_indices(region)
    full_matrix = divergence.divergence_matrix(patient, region, aft)

    # pick, for every (time, site), the entry at the initial-allele index
    time_idx = np.arange(aft.shape[0]).reshape(-1, 1, 1)
    site_idx = np.arange(aft.shape[-1])
    div = full_matrix[time_idx, initial_idx, site_idx][:, 0, :]

    counts, edges = np.histogram(div[-3:, :], bins=1000)
    left_edges = edges[:-1]
    cumulative = np.cumsum(counts)
    cumulative = cumulative / np.max(cumulative)

    plt.figure()
    plt.plot(left_edges, cumulative, label="hist")
    plt.legend()
    plt.grid()
    plt.show()
def add_RNA_haplotypes(fname, pat, region):
    '''
    loads patient RNA haplotypes and DNA haplotypes and return both in one list

    fname  -- fasta file with the DNA haplotypes
    pat    -- patient identifier; translated through patient_translation when
              it is listed there
    region -- region passed to Patient.get_haplotype_alignment

    Returns the DNA sequences followed by the RNA haplotypes whose read count
    (parsed from the description) is larger than 2. If anything fails, a
    message is printed and None is returned.
    '''
    from hivevo.patients import Patient
    try:
        if pat in patient_translation:
            pcode = patient_translation[pat]
        else:
            pcode = pat
        p = Patient.load(pcode)
        rna_haps = p.get_haplotype_alignment(region)
        with myopen(fname) as ifile:
            dna_haps = list(SeqIO.parse(ifile, 'fasta'))
        dna_haps.extend(hap for hap in rna_haps
                        if get_RNA_read_count(hap.description) > 2)
        return dna_haps
    except Exception:
        # best effort: report and carry on. `Exception` (not a bare except)
        # lets KeyboardInterrupt and SystemExit propagate.
        print(fname,pat, region,"failed")
        return None
def initial_traj_under_threshold(region, patient_name):
    """
    Return the frequency trajectories of the initial allele for the sites
    where this frequency is <= 0.05 at one time point or more.

    Entries flagged False by trajectory.get_depth are masked beforehand.
    The result has one row per time point and one column per selected site.
    """
    threshold_low = 0.05

    patient = Patient.load(patient_name)
    aft = patient.get_allele_frequency_trajectories(region)

    # Masking low depth: repeat the depth flags 6 times along a new leading
    # axis and swap that axis with the next one before combining with the
    # existing mask
    depth = trajectory.get_depth(patient, region)
    depth = np.swapaxes(np.tile(depth, (6, 1, 1)), 0, 1)
    aft.mask = np.logical_or(aft.mask, ~depth)

    # frequency of the initial allele for every (time, site)
    initial_idx = patient.get_initial_indices(region)
    time_idx = np.arange(aft.shape[0]).reshape(-1, 1, 1)
    site_idx = np.arange(aft.shape[-1])
    aft_initial = aft[time_idx, initial_idx, site_idx][:, 0, :]

    below = aft_initial <= threshold_low
    selected = np.where(np.sum(below, axis=0))
    # indexing with the tuple from np.where adds a length-1 axis: drop it
    return aft_initial[:, selected][:, 0, :]
def collect_data(patient_codes, regions, subtype):
    '''
    Collect amino acid frequencies, initial codons and phenotypes by region.

    patient_codes -- patient codes to load; patients that fail to load are
                     reported and skipped
    regions       -- regions to process
    subtype       -- subtype passed to HIVreferenceAminoacid

    Returns a dict with keys 'af_by_pat', 'init_codon' and 'pheno', each a
    dict keyed by region holding the matching output of
    collect_weighted_aa_afs.
    '''
    cov_min = 500
    aa_ref = 'NL4-3'
    combined_af_by_pat = {}
    initial_codons_by_pat = {}
    combined_phenos = {}

    patients = []
    for pcode in patient_codes:
        try:
            p = Patient.load(pcode)
        except Exception:
            # `Exception` (not a bare except) so that KeyboardInterrupt and
            # SystemExit are not swallowed
            print("Can't load patient", pcode)
        else:
            patients.append(p)

    for region in regions:
        reference = HIVreferenceAminoacid(region, refname=aa_ref, subtype = subtype)
        (combined_af_by_pat[region],
         initial_codons_by_pat[region],
         combined_phenos[region]) = collect_weighted_aa_afs(
            region, patients, reference, cov_min=cov_min)

    return {'af_by_pat': combined_af_by_pat,
            'init_codon': initial_codons_by_pat,
            'pheno': combined_phenos}
def plot_drug_resistance_mutation_trajectories(pcode):
    '''
    auxiliary function to check for potential drug resistance evolution in RNA sequences
    only p10 has drug resistance mutations in the last two samples

    For every position listed in drug_muts, plots 1 - frequency of the listed
    first amino acid over time, but only if this exceeds 0.1 at some time
    point. NNRTI and NRTI positions are read from the RT frequencies, PI
    positions from the PR frequencies.
    '''
    plt.figure()
    p = Patient.load(pcode)

    # (protein, drug classes whose mutations lie in that protein)
    targets = (('RT', ['NNRTI', 'NRTI']),
               ('PR', ['PI']))
    for protein, drug_classes in targets:
        aa_freqs = p.get_allele_frequency_trajectories(protein, type='aa')
        for mt in drug_classes:
            for aa1, pos, aa2 in drug_muts[mt]['mutations']:
                # pos is 1-based, the frequency array 0-based
                traj = 1 - aa_freqs[:, alphaal.index(aa1), pos - 1]
                if max(traj) > 0.1:
                    plt.plot(p.dsi, traj, '-o', label=mt + ' ' + str(pos))

    plt.legend(loc=2, ncol=2)
def plot_drug_resistance_mutation_trajectories(pcode):
    """
    auxiliary function to check for potential drug resistance evolution in RNA sequences
    only p10 has drug resistance mutations in the last two samples

    Plots, for every drug_muts position whose non-listed amino acid frequency
    exceeds 0.1 at some time point, that frequency against p.dsi.
    """
    def _plot_classes(aa_freqs, drug_classes):
        # one curve per listed mutation position that departs from aa1
        for mt in drug_classes:
            for aa1, pos, aa2 in drug_muts[mt]["mutations"]:
                traj = 1 - aa_freqs[:, alphaal.index(aa1), pos - 1]
                if max(traj) > 0.1:
                    plt.plot(p.dsi, traj, "-o", label=mt + " " + str(pos))

    plt.figure()
    p = Patient.load(pcode)

    RT = p.get_allele_frequency_trajectories("RT", type="aa")
    _plot_classes(RT, ["NNRTI", "NRTI"])

    PR = p.get_allele_frequency_trajectories("PR", type="aa")
    _plot_classes(PR, ["PI"])

    plt.legend(loc=2, ncol=2)
def collect_data(patients, cov_min=100, refname='HXB2'):
    '''Collect data for the fitness cost estimate

    patients -- iterable of patient codes
    cov_min  -- minimal coverage passed to get_allele_frequency_trajectories
    refname  -- name of the external reference used for the coordinate map

    Returns a pandas DataFrame with one row per (patient, position, derived
    allele, time point) for derived alleles that exceed a frequency of 0.9
    at some time point. Columns: time, af, pos, pos_ref, protein, pcode,
    mut, S, syn, anc_cross.
    '''
    ref = HIVreference(refname=refname, subtype='any', load_alignment=True)

    data = []
    for pcode in patients:
        # print() with one argument behaves the same on Python 2 and 3
        print(pcode)

        p = Patient.load(pcode)
        # patient position -> reference position
        comap = (pd.DataFrame(p.map_to_external_reference('genomewide', refname=refname)[:, :2],
                              columns=[refname, 'patient'])
                   .set_index('patient', drop=True)
                   .loc[:, refname])

        aft = p.get_allele_frequency_trajectories('genomewide', cov_min=cov_min)
        for pos, aft_pos in enumerate(aft.swapaxes(0, 2)):
            fead = p.pos_to_feature[pos]

            # Keep only sites within ONE protein
            # Note: we could drop this, but then we cannot quite classify syn/nonsyn
            if len(fead['protein_codon']) != 1:
                continue

            # Exclude codons with gaps
            pc = fead['protein_codon'][0][-1]
            cod_anc = ''.join(p.initial_sequence[pos - pc: pos - pc + 3])
            if '-' in cod_anc:
                continue

            # Keep only nonmasked times (mask reduction computed once)
            masked_times = aft_pos[:4].mask.any(axis=0)
            if masked_times.all():
                continue
            ind = ~masked_times
            times = p.dsi[ind]
            aft_pos = aft_pos[:, ind]

            # Get site entropy
            if pos not in comap.index:
                continue
            pos_ref = comap.loc[pos]
            S_pos = ref.entropy[pos_ref]

            # Flag (do not filter) whether the ancestral allele and group M agree
            anc_cross = not (ref.consensus_indices[pos_ref] != aft_pos[:, 0].argmax())

            nuc_anc = p.initial_sequence[pos]
            protein = fead['protein_codon'][0][0]
            for ia, aft_nuc in enumerate(aft_pos[:4]):
                # Keep only derived alleles
                if alpha[ia] == nuc_anc:
                    continue

                # Keep only sweeps
                if not (aft_nuc > 0.9).any():
                    continue

                # Annotate with syn/nonsyn alleles
                cod_new = cod_anc[:pc] + alpha[ia] + cod_anc[pc+1:]
                syn = not (translate(cod_anc) != translate(cod_new))

                mut = nuc_anc + '->' + alpha[ia]
                # builtin zip instead of the Python-2-only itertools.izip
                for t, af_nuc in zip(times, aft_nuc):
                    data.append({'time': t,
                                 'af': af_nuc,
                                 'pos': pos,
                                 'pos_ref': pos_ref,
                                 'protein': protein,
                                 'pcode': pcode,
                                 'mut': mut,
                                 'S': S_pos,
                                 'syn': syn,
                                 'anc_cross': anc_cross,
                                 })

    data = pd.DataFrame(data)
    return data
def collect_data(patients, cov_min=100, refname='HXB2', subtype='any', entropy_threshold=0.1, excluded_proteins=[]):
    '''Collect data for the mutation rate estimate

    patients          -- iterable of patient codes
    cov_min           -- minimal coverage passed to
                         get_allele_frequency_trajectories
    refname, subtype  -- passed to HIVreference and stored in every row
    entropy_threshold -- sites with lower reference entropy are dropped
    excluded_proteins -- proteins to skip (only read, never modified)

    Returns a pandas DataFrame with one row per (patient, position,
    synonymous derived allele, nonmasked time point). Columns: time, af,
    pos, refpos, protein, pcode, mut, subtype, refname.
    '''
    print('Collect data from patients')
    ref = HIVreference(refname=refname, load_alignment=True, subtype=subtype)

    data = []
    for pcode in patients:
        print(pcode)

        p = Patient.load(pcode)
        # patient position -> reference position
        comap = (pd.DataFrame(p.map_to_external_reference('genomewide')[:, :2],
                              columns=[refname, 'patient'])
                   .set_index('patient', drop=True)
                   .loc[:, refname])

        aft = p.get_allele_frequency_trajectories('genomewide', cov_min=cov_min)
        times = p.dsi

        for pos, aft_pos in enumerate(aft.swapaxes(0, 2)):
            fead = p.pos_to_feature[pos]

            # Keep only sites within ONE protein
            if len(fead['protein_codon']) != 1:
                continue

            # skip if protein is to be excluded
            protein = fead['protein_codon'][0][0]
            if protein in excluded_proteins:
                continue

            # Exclude codons with gaps
            pc = fead['protein_codon'][0][-1]
            cod_anc = ''.join(p.initial_sequence[pos - pc:pos - pc + 3])
            if '-' in cod_anc:
                continue

            # The next three filters depend on the site only. They used to
            # be re-evaluated for every allele; at least three of the four
            # alleles are derived, so checking once gives the same result.

            # Keep only no RNA structures
            if fead['RNA']:
                continue

            # Keep only sites which are also in the reference
            if pos not in comap.index:
                continue
            refpos = comap.loc[pos]

            # Keep only high-entropy sites
            if ref.entropy[refpos] < entropy_threshold:
                continue

            nuc_anc = p.initial_sequence[pos]
            aa_anc = translate(cod_anc)
            for ia, aft_nuc in enumerate(aft_pos[:4]):
                # Keep only derived alleles
                if alpha[ia] == nuc_anc:
                    continue

                # Keep only synonymous alleles
                cod_new = cod_anc[:pc] + alpha[ia] + cod_anc[pc + 1:]
                if aa_anc != translate(cod_new):
                    continue

                mut = nuc_anc + '->' + alpha[ia]
                # builtin zip instead of the Python-2-only itertools.izip
                for it, (t, af_nuc) in enumerate(zip(times, aft_nuc)):
                    # Keep only nonmasked times
                    if aft_nuc.mask[it]:
                        continue

                    data.append({
                        'time': t,
                        'af': af_nuc,
                        'pos': pos,
                        'refpos': refpos,
                        'protein': protein,
                        'pcode': pcode,
                        'mut': mut,
                        'subtype': subtype,
                        'refname': refname,
                    })

    data = pd.DataFrame(data)
    return data
import numpy as np
import matplotlib.pyplot as plt
import filenames
from hivevo.patients import Patient
from trajectory import create_trajectory_list, create_all_patient_trajectories

# Patient and region used for the single-patient data loaded below
patient_name = "p1"
region = "env"
fontsize = 16

# Allele frequency trajectories of the selected patient in the selected region
patient = Patient.load(patient_name)
aft = patient.get_allele_frequency_trajectories(region)

# Trajectories of the default patient set for the same region
trajectories = create_all_patient_trajectories(region)
def collect_data(patients, cov_min=100, refname='HXB2'):
    '''Collect data for the fitness cost estimate

    patients -- iterable of patient codes
    cov_min  -- minimal coverage passed to get_allele_frequency_trajectories
    refname  -- name of the external reference used for the coordinate map

    Returns a pandas DataFrame with one row per (patient, position, derived
    allele, time point), restricted to sites whose ancestral allele agrees
    with the reference consensus and to derived alleles that exceed a
    frequency of 0.5 at some time point. Columns: time, af, pos, pos_ref,
    protein, pcode, mut, mu, muAbram, S, syn.
    '''
    ref = HIVreference(refname=refname, subtype='any', load_alignment=True)
    mus = load_mutation_rates()
    mu = mus.mu
    muA = mus.muA

    data = []
    for pcode in patients:
        # print() with one argument behaves the same on Python 2 and 3
        print(pcode)

        p = Patient.load(pcode)
        # patient position -> reference position
        comap = (pd.DataFrame(
            p.map_to_external_reference('genomewide', refname=refname)[:, :2],
            columns=[refname, 'patient']).set_index('patient', drop=True).loc[:, refname])

        aft = p.get_allele_frequency_trajectories('genomewide', cov_min=cov_min)
        for pos, aft_pos in enumerate(aft.swapaxes(0, 2)):
            fead = p.pos_to_feature[pos]

            # Keep only sites within ONE protein
            # Note: we could drop this, but then we cannot quite classify syn/nonsyn
            if len(fead['protein_codon']) != 1:
                continue

            # Exclude codons with gaps
            pc = fead['protein_codon'][0][-1]
            cod_anc = ''.join(p.initial_sequence[pos - pc:pos - pc + 3])
            if '-' in cod_anc:
                continue

            # Keep only nonmasked times (mask reduction computed once)
            masked_times = aft_pos[:4].mask.any(axis=0)
            if masked_times.all():
                continue
            ind = ~masked_times
            times = p.dsi[ind]
            aft_pos = aft_pos[:, ind]

            # Get site entropy
            if pos not in comap.index:
                continue
            pos_ref = comap.loc[pos]
            S_pos = ref.entropy[pos_ref]

            # Keep only sites where the ancestral allele and group M agree
            if ref.consensus_indices[pos_ref] != aft_pos[:, 0].argmax():
                continue

            nuc_anc = p.initial_sequence[pos]
            protein = fead['protein_codon'][0][0]
            for ia, aft_nuc in enumerate(aft_pos[:4]):
                # Keep only derived alleles
                if alpha[ia] == nuc_anc:
                    continue

                # Keep only sweeps
                if not (aft_nuc > 0.5).any():
                    continue

                # Annotate with syn/nonsyn alleles
                cod_new = cod_anc[:pc] + alpha[ia] + cod_anc[pc + 1:]
                syn = not (translate(cod_anc) != translate(cod_new))

                mut = nuc_anc + '->' + alpha[ia]
                mu_pos = mu[mut]
                muA_pos = muA[mut]

                # builtin zip instead of the Python-2-only itertools.izip
                for t, af_nuc in zip(times, aft_nuc):
                    data.append({
                        'time': t,
                        'af': af_nuc,
                        'pos': pos,
                        'pos_ref': pos_ref,
                        'protein': protein,
                        'pcode': pcode,
                        'mut': mut,
                        'mu': mu_pos,
                        'muAbram': muA_pos,
                        'S': S_pos,
                        'syn': syn,
                    })

    data = pd.DataFrame(data)
    return data
rna_haps_by_tp = defaultdict(list) for rhap in rna_haps: # sort RNA reads by time points rna_haps_by_tp[int(rhap.name.split('_')[1])].append(rhap) print(sorted(rna_haps_by_tp.keys()), [len(rna_haps_by_tp[k]) for k in sorted(rna_haps_by_tp.keys())], pat.dsi) reads = [] for tp, p in zip(pat.dsi, proportions): reads.extend(sample_from_RNA_tp(rna_haps_by_tp[int(tp)], int(p * n))) return reads if __name__ == "__main__": patients = {pcode: Patient.load(pcode) for pcode in pcodes} import pandas as pd #treatment_dates = pd.read_excel('data/2016-01-08_treatment_start_dates.xlsx') sns.set_style('darkgrid') # lifetimes of cells -- 95% short lived cells, 5% long lived cells cell_pop = [(0.9, 30.), (0.1, 500.)] ### # make read samples from pre treatment RNA reads ### reads = {} sampling_times = [0, 90, 180] # at ART start, 3 month, 6 month for pcode, p in patients.iteritems(): reads[pcode] = [] for t_ART in sampling_times: reads[pcode].append(sample_from_RNA(p, cell_pop, t_ART, n=1000))
dmin = 20 dmin_pad = 100 var_min = 0.1 cov_min = 200 corr_vs_distance = {} bins = np.arange(0,401,40) binc = (bins[:-1]+bins[1:])*0.5 all_dists = [] all_weights = [] for frag in all_fragments: if frag not in ['F'+str(i) for i in xrange(1,7)]: continue dists = [] weights = [] for pcode in patients: p = Patient.load(pcode) aft = p.get_allele_frequency_trajectories(frag) depth = p.get_fragment_depth(pad=False, limit_to_dilution=False) depth_pad = p.get_fragment_depth(pad=True, limit_to_dilution=False) for si, sample in enumerate(p.samples[:-1]): if depth[si][all_fragments.index(frag)]>dmin \ or depth_pad[si][all_fragments.index(frag)]>dmin_pad: try: positions, af2p, cov, af1p = sample.get_pair_frequencies(frag, var_min=var_min) majority_nuc = af1p.argmax(axis=0) if positions is None: continue LD, Dp, p12 = LDfunc(af2p, af1p, cov, cov_min=100) daf = aft[si+1][majority_nuc,positions] - aft[si][majority_nuc,positions] X,Y = np.meshgrid(positions, positions) dp1,dp2 = np.meshgrid(daf, daf)
return evo_rate_dict if __name__ == "__main__": colors = ["C0", "C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9"] regions = ["env", "pol", "gag"] # time = np.arange(0, 3100, 100) # divergence_dict = make_divergence_dict(time) # save_divergence_dict(divergence_dict) patient_names = ["p1", "p2", "p3", "p4", "p5", "p6", "p8", "p9", "p11"] region = "pol" nb_consensus = np.array([0, 0, 0]) for patient_name in patient_names: patient = Patient.load("p1") aft = patient.get_allele_frequency_trajectories(region) mask = get_consensus_mask(patient, region, aft) mask1 = mask[::3] mask2 = mask[1::3] mask3 = mask[2::3] nb_consensus += [np.sum(mask1), np.sum(mask2), np.sum(mask3)] nb_consensus = nb_consensus / len(patient_names) / (aft.shape[-1] / 3) print(nb_consensus) # plt.figure() # for key1 in divergence_dict[region].keys(): # for key2 in divergence_dict[region][key1].keys(): # plt.plot(time, divergence_dict[region][key1][key2], label=f"{key1} {key2}")
def collect_data(patients, cov_min=100, no_sweeps=False, refname='HXB2'):
    '''Collect data for the fitness cost estimate

    patients  -- iterable of patient codes
    cov_min   -- minimal coverage passed to get_allele_frequency_trajectories
    no_sweeps -- if True, drop sites where a nonsynonymous derived allele
                 exceeds a frequency of 0.5 at some time point
    refname   -- name of the external reference used for the coordinate map

    Returns a pandas DataFrame with one row per (patient, position, time
    point) holding 1 - frequency of the ancestral allele. Columns: time, af,
    pos, pos_ref, protein, pcode, ancestral, S, n_templates.
    '''
    print('Collect data from patients')
    ref = HIVreference(refname=refname, subtype='any', load_alignment=True)
    mus = load_mutation_rates()
    mu = mus.mu
    muA = mus.muA

    data = []
    for pcode in patients:
        print(pcode)
        p = Patient.load(pcode)

        # patient position -> reference position
        coordinate_map = p.map_to_external_reference('genomewide', refname=refname)[:, :2]
        comap = pd.DataFrame(coordinate_map, columns=[refname, 'patient'])
        comap = comap.set_index('patient', drop=True).loc[:, refname]

        aft = p.get_allele_frequency_trajectories('genomewide', cov_min=cov_min)
        for pos, aft_pos in enumerate(aft.swapaxes(0, 2)):
            fead = p.pos_to_feature[pos]

            # Keep only sites within ONE protein
            # Note: we could drop this, but then we cannot quite classify syn/nonsyn
            if len(fead['protein_codon']) != 1:
                continue

            # Exclude codons with gaps
            pc = fead['protein_codon'][0][-1]
            cod_anc = ''.join(p.initial_sequence[pos - pc: pos - pc + 3])
            if '-' in cod_anc:
                continue

            # Keep only nonmasked times
            masked_times = aft_pos[:4].mask.any(axis=0)
            if masked_times.all():
                continue
            ind = ~masked_times
            times = p.dsi[ind]
            aft_pos = aft_pos[:, ind]
            n_templates = p.n_templates_dilutions[ind]

            # Get site entropy
            if pos not in comap.index:
                continue
            pos_ref = comap.loc[pos]
            S_pos = ref.entropy[pos_ref]

            # Keep only sites where the ancestral allele and group M agree
            if ref.consensus_indices[pos_ref] != aft_pos[:, 0].argmax():
                continue

            # Filter out sweeps if so specified, only for nonsyn
            if no_sweeps:
                nuc_anc = p.initial_sequence[pos]
                nonsyn_flags = [
                    translate(cod_anc) != translate(cod_anc[:pc] + alpha[ia] + cod_anc[pc+1:])
                    for ia, aft_nuc in enumerate(aft_pos[:4])
                    if (alpha[ia] != nuc_anc) and (aft_nuc > 0.5).any()
                ]
                if any(nonsyn_flags):
                    continue

            # Keep only 1 - ancestral allele
            ia = p.initial_indices[pos]
            derived_freq = 1 - aft_pos[ia]
            for t, af_nuc, n_temp in izip(times, derived_freq, n_templates):
                data.append({'time': t,
                             'af': af_nuc,
                             'pos': pos,
                             'pos_ref': pos_ref,
                             'protein': fead['protein_codon'][0][0],
                             'pcode': pcode,
                             'ancestral': alpha[ia],
                             'S': S_pos,
                             'n_templates': n_temp,
                             })

    data = pd.DataFrame(data)
    return data
# np.linspace is used below; import numpy explicitly so the script does not
# depend on the name `np` being provided from elsewhere.
import numpy as np
from itertools import izip
from hivevo.patients import Patient
from hivevo.samples import all_fragments
from hivevo.af_tools import LD as LDfunc
from util import store_data, load_data, fig_width, fig_fontsize, HIVEVO_colormap
import os
from filenames import get_figure_folder
import matplotlib.pyplot as plt
import seaborn as sns

plt.ion()
sns.set_style('darkgrid')
cols = HIVEVO_colormap()

# Script: histogram of minor allele frequencies in the first entry of the
# genomewide allele frequency trajectories of patient p10.
if __name__ == "__main__":
    p = Patient.load('p10')
    aft = p.get_allele_frequency_trajectories('genomewide')
    # first entry along the leading axis of the trajectories
    af = aft[0]
    consensus_indices = p.get_initial_indices('genomewide')
    # everything that is not the most frequent allele at each position
    minor_af = 1.0 - af.max(axis=0)

    # make a histogram of the minor allele frequencies
    # (50 bins on [0, 1]; bars start at 0.5 rather than 0, presumably so
    # that they remain drawable on the logarithmic y axis)
    plt.figure()
    plt.hist(minor_af, bins=np.linspace(0, 1, 51), bottom=0.5)
    plt.yscale('log')
    plt.xlabel('frequency')
    plt.ylabel('number of minor variants')
    # --> there are two clear peaks: one around 0.35-0.5, the other around 0.1-0.15

    # positions whose minor allele frequency exceeds 5%
    variable_pos = minor_af > 0.05
dmin = 20 dmin_pad = 100 var_min = 0.1 cov_min = 200 corr_vs_distance = {} bins = np.arange(0, 401, 40) binc = (bins[:-1] + bins[1:]) * 0.5 all_dists = [] all_weights = [] for frag in all_fragments: if frag not in ['F' + str(i) for i in xrange(1, 7)]: continue dists = [] weights = [] for pcode in patients: p = Patient.load(pcode) aft = p.get_allele_frequency_trajectories(frag) depth = p.get_fragment_depth(pad=False, limit_to_dilution=False) depth_pad = p.get_fragment_depth(pad=True, limit_to_dilution=False) for si, sample in enumerate(p.samples[:-1]): if depth[si][all_fragments.index(frag)]>dmin \ or depth_pad[si][all_fragments.index(frag)]>dmin_pad: try: positions, af2p, cov, af1p = sample.get_pair_frequencies( frag, var_min=var_min) majority_nuc = af1p.argmax(axis=0) if positions is None: continue LD, Dp, p12 = LDfunc(af2p, af1p, cov, cov_min=100) daf = aft[si + 1][majority_nuc, positions] - aft[si][majority_nuc,
def collect_data(patients, cov_min=100, refname='HXB2', subtype='any', entropy_threshold=0.1,
                 excluded_proteins=()):
    '''Collect data for the mutation rate estimate

    For every patient, walk over all genomewide positions and collect the
    frequencies of derived, synonymous alleles at sites that pass the
    filters below.

    Parameters:
       patients: iterable of patient codes, each loaded with Patient.load
       cov_min: forwarded to Patient.get_allele_frequency_trajectories
       refname: external reference used for the coordinate map and the
                site entropy
       subtype: forwarded to HIVreference and stored in every row
       entropy_threshold: sites with reference entropy below this value
                          are skipped
       excluded_proteins: container of protein names to skip

    Returns:
       pandas.DataFrame with one row per (patient, site, derived allele,
       nonmasked time point) and the keys time, af, pos, refpos, protein,
       pcode, mut, subtype, refname.
    '''
    print('Collect data from patients')
    ref = HIVreference(refname=refname, load_alignment=True, subtype=subtype)

    data = []
    for pcode in patients:
        print(pcode)

        p = Patient.load(pcode)
        # Map patient coordinates onto the SAME reference that provides the
        # entropy. NOTE(review): this assumes the method's own default
        # reference is 'HXB2', so default calls behave as before -- confirm.
        comap = (pd.DataFrame(p.map_to_external_reference('genomewide', refname=refname)[:, :2],
                              columns=[refname, 'patient'])
                   .set_index('patient', drop=True)
                   .loc[:, refname])

        aft = p.get_allele_frequency_trajectories('genomewide', cov_min=cov_min)
        times = p.dsi
        for pos, aft_pos in enumerate(aft.swapaxes(0, 2)):
            fead = p.pos_to_feature[pos]

            # Keep only sites within ONE protein
            if len(fead['protein_codon']) != 1:
                continue

            # skip if protein is to be excluded
            protein = fead['protein_codon'][0][0]
            if protein in excluded_proteins:
                continue

            # Exclude codons with gaps
            pc = fead['protein_codon'][0][-1]
            cod_anc = ''.join(p.initial_sequence[pos - pc: pos - pc + 3])
            if '-' in cod_anc:
                continue

            # The next three filters depend on the site only, not on the
            # allele, so they are evaluated once per site.

            # Keep only no RNA structures
            if fead['RNA']:
                continue

            # Keep only sites which are also in the reference
            if pos not in comap.index:
                continue
            refpos = comap.loc[pos]

            # Keep only high-entropy sites
            if ref.entropy[refpos] < entropy_threshold:
                continue

            nuc_anc = p.initial_sequence[pos]
            for ia, aft_nuc in enumerate(aft_pos[:4]):
                # Keep only derived alleles
                if alpha[ia] == nuc_anc:
                    continue

                # Keep only synonymous alleles
                cod_new = cod_anc[:pc] + alpha[ia] + cod_anc[pc+1:]
                if translate(cod_anc) != translate(cod_new):
                    continue

                mut = nuc_anc+'->'+alpha[ia]

                # zip (not itertools.izip) so the function runs on Python 2 and 3
                for it, (t, af_nuc) in enumerate(zip(times, aft_nuc)):
                    # Keep only nonmasked times
                    if aft_nuc.mask[it]:
                        continue

                    data.append({'time': t,
                                 'af': af_nuc,
                                 'pos': pos,
                                 'refpos': refpos,
                                 'protein': protein,
                                 'pcode': pcode,
                                 'mut': mut,
                                 'subtype': subtype,
                                 'refname': refname,
                                })

    return pd.DataFrame(data)
# np.linspace is used below; import numpy explicitly so the script does not
# depend on the name `np` being provided from elsewhere.
import numpy as np
from itertools import izip
from hivevo.patients import Patient
from hivevo.samples import all_fragments
from hivevo.af_tools import LD as LDfunc
from util import store_data, load_data, fig_width, fig_fontsize, HIVEVO_colormap
import os
from filenames import get_figure_folder
import matplotlib.pyplot as plt
import seaborn as sns

plt.ion()
sns.set_style('darkgrid')
cols = HIVEVO_colormap()

# Script: histogram of minor allele frequencies in the first entry of the
# genomewide allele frequency trajectories of patient p10.
if __name__ == "__main__":
    p = Patient.load('p10')
    aft = p.get_allele_frequency_trajectories('genomewide')
    # first entry along the leading axis of the trajectories
    af = aft[0]
    consensus_indices = p.get_initial_indices('genomewide')
    # everything that is not the most frequent allele at each position
    minor_af = 1.0 - af.max(axis=0)

    # make a histogram of the minor allele frequencies
    # (50 bins on [0, 1]; bars start at 0.5 rather than 0, presumably so
    # that they remain drawable on the logarithmic y axis)
    plt.figure()
    plt.hist(minor_af, bins=np.linspace(0, 1, 51), bottom=0.5)
    plt.yscale('log')
    plt.xlabel('frequency')
    plt.ylabel('number of minor variants')
    # --> there are two clear peaks: one around 0.35-0.5, the other around 0.1-0.15

    # positions whose minor allele frequency exceeds 5%
    variable_pos = minor_af > 0.05
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import seaborn as sns

from hivevo.patients import Patient
from hivevo.samples import all_fragments
from hivevo.sequence import alpha

# Script: plot the frequency trajectories of rising SNPs in region RT1 of
# patient p3 and save the figure as a PDF.
plt.ion()
sns.set_style('darkgrid')

p = Patient.load('p3')
aft = p.get_allele_frequency_trajectories('RT1')

# aft is indexed as [time, nucleotide, position] (see traj below);
# sum of x(1-x) over nucleotides, per time point and position
div = (aft * (1.0 - aft)).sum(axis=1)
# positions at which that sum exceeds 0.1 at some time point
var_pos = div.max(axis=0) > 0.1

plt.figure()
for pos in np.where(var_pos)[0]:
    for ni in range(5):
        traj = aft[:, ni, pos]
        # alleles that start below 50% and exceed 20% at some point
        if traj.max() > 0.2 and traj[0] < 0.5:  # and traj[-1]<0.2:
            plt.plot(p.ysi[~traj.mask], traj[~traj.mask],
                     c=cm.jet(1.0 * pos / aft.shape[-1]),
                     label=str(pos + 1) + alpha[ni], lw=2)
            # Single-argument print(...) parses on Python 2 and 3 and gives
            # the same space-separated output as the old print statement.
            print('%s %s %s' % (pos, alpha[ni], np.round(traj, 2)))

plt.ylabel('SNP frequency')
plt.xlabel('ETI [years]')
plt.legend(loc=2)
# NOTE(review): 'mutatons' looks like a typo for 'mutations'; the file name
# is left untouched because it is the script's output path.
plt.savefig('mutatons_p3_RT1.pdf')
if j == Nit -1: print 'WARNING from amoeba_vp:\n the maximum number of iterations has been reached, max(dx/x) = ',\ np.max(np.abs((xx-xcenter)/xcenter)) # np.max(np.array([LA.norm(x-xx[0,:]) for x in xx[1:,:]])) if return_f: return xx[0,:],ff[0], j else: return xx[0,:] if __name__=="__main__": '''Studying fluctuations of nucleotides with high fitness''' plt.ioff() patient_name = 'p1' gen_region = 'pol' #'gag' #'pol' #'gp41' #'gp120' #'vif' #'RRE' PAT = Patient.load(patient_name) outdir_name = '/ebio/ag-neher/share/users/vpuller/Fabio_data_work/Stratify/' if not os.path.exists(outdir_name): os.makedirs(outdir_name) tt = PAT.times() nt = tt.shape[0] counts = PAT.get_allele_count_trajectories(gen_region) freqs = PAT.get_allele_frequency_trajectories(gen_region).data L = freqs.shape[2] # genetic region length nuc1 = 'A' nuc2 = 'G' j1 = np.where(bases == nuc1)[0][0]
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import seaborn as sns

from hivevo.patients import Patient
from hivevo.samples import all_fragments
from hivevo.sequence import alpha

# Script: plot the frequency trajectories of rising SNPs in region RT1 of
# patient p3 and save the figure as a PDF.
plt.ion()
sns.set_style('darkgrid')

p = Patient.load('p3')
aft = p.get_allele_frequency_trajectories('RT1')

# aft is indexed as [time, nucleotide, position] (see traj below);
# sum of x(1-x) over nucleotides, per time point and position
div = (aft * (1.0 - aft)).sum(axis=1)
# positions at which that sum exceeds 0.1 at some time point
var_pos = div.max(axis=0) > 0.1

plt.figure()
for pos in np.where(var_pos)[0]:
    for ni in range(5):
        traj = aft[:, ni, pos]
        # alleles that start below 50% and exceed 20% at some point
        if traj.max() > 0.2 and traj[0] < 0.5:  # and traj[-1]<0.2:
            plt.plot(p.ysi[~traj.mask], traj[~traj.mask],
                     c=cm.jet(1.0 * pos / aft.shape[-1]),
                     label=str(pos + 1) + alpha[ni], lw=2)
            # Single-argument print(...) parses on Python 2 and 3 and gives
            # the same space-separated output as the old print statement.
            print('%s %s %s' % (pos, alpha[ni], np.round(traj, 2)))

plt.ylabel('SNP frequency')
plt.xlabel('ETI [years]')
plt.legend(loc=2)
# NOTE(review): 'mutatons' looks like a typo for 'mutations'; the file name
# is left untouched because it is the script's output path.
plt.savefig('mutatons_p3_RT1.pdf')