def fraction_per_site():
    """
    Print consensus / non-consensus site statistics for pol, split by 1st, 2nd and 3rd position.

    Same as fraction_per_region but only for the "pol" region and with discrimination between the
    1st, 2nd and 3rd position as selected by get_site_mask (presumably the codon position - confirm).

    For each position the following are printed as mean and standard deviation over the patients:
      - number of consensus sites divided by a third of the region length,
      - number of non-consensus sites divided by a third of the region length,
      - non-consensus sites as a fraction of consensus + non-consensus sites.

    The counts of each patient are normalized by that patient's own region length (last axis of
    the patient's allele frequency trajectories) rather than by the length of whichever patient
    happened to be loaded last.
    """
    region = "pol"
    patient_names = ["p1", "p2", "p3", "p4", "p5", "p6", "p8", "p9", "p11"]
    for ii, site in enumerate(["first", "second", "third"]):
        consensus = []
        non_consensus = []
        fraction_non_consensus = []
        for patient_name in patient_names:
            patient = Patient.load(patient_name)
            aft = patient.get_allele_frequency_trajectories(region)
            site_mask = get_site_mask(aft, ii+1)
            nb_consensus = np.sum(get_consensus_mask(patient, region, aft)[site_mask], dtype=int)
            nb_non_consensus = np.sum(get_non_consensus_mask(patient, region, aft)[site_mask], dtype=int)

            # One third of this patient's region length: sites available at a single position
            nb_sites = aft.shape[-1] / 3
            consensus.append(nb_consensus / nb_sites)
            non_consensus.append(nb_non_consensus / nb_sites)
            fraction_non_consensus.append(nb_non_consensus / (nb_consensus + nb_non_consensus))

        mean_consensus = np.mean(consensus)
        std_consensus = np.std(consensus)
        mean_non_consensus = np.mean(non_consensus)
        std_non_consensus = np.std(non_consensus)
        mean_fraction_non_consensus = np.mean(fraction_non_consensus)
        std_fraction_non_consensus = np.std(fraction_non_consensus)
        print(f"Site {site}:")
        print(f"""   Consensus {round(mean_consensus, 2)} += {round(std_consensus, 3)}   Non-consensus {round(mean_non_consensus, 2)} += {round(std_non_consensus, 3)}   Fraction non_consensus {round(mean_fraction_non_consensus,3)} += {round(std_fraction_non_consensus,3)}""")
Exemple #2
0
def make_patient_RNA_DNA_tree(pcode, min_DNA_frac = 0.001):
    '''
    Make a tree for all RNA/DNA samples of a given patient.

    For each DNA sequence category ('clustered_good', 'good', 'hyper', 'suspicious') the DNA
    sequences of all file prefixes listed for the patient are read, the patient's haplotype
    alignment of the module-level `region` is appended, rare DNA sequences are pruned with
    prune_rare_DNA, and the result is aligned, turned into a tree and written to
    data/<pcode>_RNA_and_DNA_<category>.fasta / .nwk.

    Parameters:
    pcode -- patient code, used for Patient.load and as key into patient_to_prefix_p17
    min_DNA_frac -- threshold forwarded to prune_rare_DNA
    '''
    def _leaf_order(leaf):
        # Leaf names are split on '_': field 1 sorts ascending, field 3 (without its
        # last character) sorts descending.
        fields = leaf.name.split('_')
        return (int(fields[1]), -int(fields[3][:-1]))

    for seq_type in ['clustered_good', 'good', 'hyper', 'suspicious']:
        seqs = []
        for outprefix in patient_to_prefix_p17[pcode]:
            dna_fname = 'data/' + outprefix + '_DNA_' + seq_type + save_as
            with myopen(dna_fname) as ifile:
                seqs.extend(SeqIO.parse(ifile, 'fasta'))
        patient = Patient.load(pcode)
        seqs.extend(patient.get_haplotype_alignment(region))
        seqs_pruned = prune_rare_DNA(seqs, min_DNA_frac)

        # Make sequence names unique by appending the running index
        for hi, hap in enumerate(seqs_pruned):
            hap.id = hap.id + '_' + str(hi)
            hap.name = hap.id

        out_stem = 'data/' + pcode + '_RNA_and_DNA_' + seq_type
        outfname = out_stem + '.fasta'
        align(ungap(seqs_pruned), outfname)
        tree = infer_tree(outfname, min_DNA_frac=0.0)

        # Root on the first leaf (in _leaf_order) whose name starts with 'days'
        day_leaves = [leaf for leaf in tree.get_terminals() if leaf.name[:4] == 'days']
        day_leaves.sort(key=_leaf_order)
        tree.root_with_outgroup(day_leaves[0])
        tree.root.branch_length = 0.01

        # Collapse internal branches shorter than 0.001
        for branch in tree.get_nonterminals(order='postorder'):
            if branch.branch_length < 0.001:
                tree.collapse(branch)
        tree.ladderize()
        Phylo.write(tree, out_stem + '.nwk', 'newick')
Exemple #3
0
def collect_data(patient_codes, regions, reference, synnonsyn=True):
    '''
    Loop over regions and produce a dictionary that contains the frequencies,
    syn/nonsyn designations and mutation rates.

    Parameters:
    patient_codes -- iterable of patient codes passed to Patient.load
    regions -- iterable of region names; "genomewide" uses the full reference consensus,
               any other name is extracted via reference.annotation[region]
    reference -- reference object providing .consensus and .annotation
    synnonsyn -- forwarded to collect_weighted_afs

    Returns:
    dict with keys 'af_by_pat', 'mut_rate', 'syn_by_pat' and 'syn_by_pat_uc', each a
    dict keyed by region. 'mut_rate' holds, per region, an array with the summed rate
    of all mutations away from the consensus nucleotide (NaN for '-' and 'N').
    '''
    combined_af_by_pat = {}
    syn_nonsyn_by_pat = {}
    syn_nonsyn_by_pat_unconstrained = {}
    consensus_mutation_rate = {}
    mutation_rates = load_mutation_rates()['mu']
    # .items() works for dicts on Python 2 and 3 as well as for pandas Series;
    # keys are mutations whose first character is the original nucleotide.
    total_muts = {nuc: sum(rate for mut, rate in mutation_rates.items() if mut[0] == nuc)
                  for nuc in 'ACGT'}

    patients = []
    for pcode in patient_codes:
        print(pcode)
        patients.append(Patient.load(pcode))

    genome_seq = "".join(reference.consensus)
    for region in regions:
        if region == "genomewide":
            region_seq = genome_seq
        else:
            region_seq = reference.annotation[region].extract(genome_seq)
        combined_af_by_pat[region], syn_nonsyn_by_pat[region], syn_nonsyn_by_pat_unconstrained[region] \
            = collect_weighted_afs(region, patients, reference, synnonsyn=synnonsyn)
        consensus_mutation_rate[region] = np.array([total_muts[nuc] if nuc not in ['-', 'N'] else np.nan
                                                    for nuc in region_seq])

    return {'af_by_pat': combined_af_by_pat,
            'mut_rate': consensus_mutation_rate,
            'syn_by_pat': syn_nonsyn_by_pat,
            'syn_by_pat_uc': syn_nonsyn_by_pat_unconstrained}
def fraction_per_region():
    """
    Print the fraction of consensus and non-consensus sites for the env, pol and gag regions.

    The masks come from get_consensus_mask / get_non_consensus_mask (for the initial sequence of
    each patient, according to the original author). For each region the following are printed as
    mean and standard deviation over the patients:
      - number of consensus sites divided by the region length,
      - number of non-consensus sites divided by the region length,
      - non-consensus sites as a fraction of consensus + non-consensus sites.

    Fraction consensus + non_consensus does not equal one because some regions are excluded due to
    gaps. The counts of each patient are normalized by that patient's own region length (last axis
    of the patient's allele frequency trajectories) rather than by the length of whichever patient
    happened to be loaded last.
    """
    regions = ["env", "pol", "gag"]
    patient_names = ["p1", "p2", "p3", "p4", "p5", "p6", "p8", "p9", "p11"]
    for region in regions:
        consensus = []
        non_consensus = []
        fraction_non_consensus = []
        for patient_name in patient_names:
            patient = Patient.load(patient_name)
            aft = patient.get_allele_frequency_trajectories(region)
            nb_consensus = np.sum(get_consensus_mask(patient, region, aft), dtype=int)
            nb_non_consensus = np.sum(get_non_consensus_mask(patient, region, aft), dtype=int)

            nb_sites = aft.shape[-1]  # region length of this patient
            consensus.append(nb_consensus / nb_sites)
            non_consensus.append(nb_non_consensus / nb_sites)
            fraction_non_consensus.append(nb_non_consensus / (nb_consensus + nb_non_consensus))

        mean_consensus = np.mean(consensus)
        std_consensus = np.std(consensus)
        mean_non_consensus = np.mean(non_consensus)
        std_non_consensus = np.std(non_consensus)
        mean_fraction_non_consensus = np.mean(fraction_non_consensus)
        std_fraction_non_consensus = np.std(fraction_non_consensus)
        print(f"Region {region}:")
        print(f"""   Consensus {round(mean_consensus, 2)} += {round(std_consensus, 2)}   Non-consensus {round(mean_non_consensus, 2)} += {round(std_non_consensus, 2)}   Fraction non_consensus {round(mean_fraction_non_consensus,3)} += {round(std_fraction_non_consensus,3)}""")
Exemple #5
0
def get_divergence_cumulative_sum(sample_slice=-3, patient_names=["p1", "p2", "p3", "p4", "p5", "p6", "p8", "p9", "p11"]):
    """
    Pool the pol divergence values of several patients and return them with their cumulative sum.

    For each patient the 2D result of get_divergence_in_time is sliced along its first axis:
    a negative sample_slice keeps the last |sample_slice| rows (default: last 3), otherwise the
    first sample_slice rows are kept. Unmasked values of all patients are pooled and sorted.

    Returns (values, cumulative): the sorted values and their cumulative sum normalized to 1,
    both subsampled to one entry every 20 with the very last entry appended, as there is a lot
    of data.
    """
    region = "pol"
    sampling = 20

    pooled = []
    for name in patient_names:
        patient = Patient.load(name)
        div2D = get_divergence_in_time(region, patient)
        selected = div2D[sample_slice:] if sample_slice < 0 else div2D[:sample_slice]
        flat = selected.flatten()
        # keep only the entries that are not masked
        pooled.extend(flat[~flat.mask])

    sorted_values = np.sort(pooled)
    cum_sum = np.cumsum(sorted_values)
    cum_sum /= cum_sum[-1]

    last_value = np.array([sorted_values[-1]])
    last_cumulative = np.array([cum_sum[-1]])
    values = np.concatenate((sorted_values[::sampling], last_value))
    cumulative = np.concatenate((cum_sum[::sampling], last_cumulative))
    return values, cumulative
def patient_preprocessing(pat_name,Squant, div = False, outliers = True, xcut = 0.0,
                          xcut_up = 0., cov_min=100):
    '''Load patient data, remove outliers and return the non-ancestral
    frequencies by time point and site for every entropy category

    Relies on the module-level names gen_region (region to load) and ref
    (reference providing seq, annotation, consensus indices and gap info).

    Input arguments:
    pat_name - patient name
    Squant - indexable collection of entropy categories; Squant[j]['ind'] holds
             the reference positions belonging to category j
    div - if True return x*(1-x) of the ancestral frequency x instead of 1-x
    outliers - if True drop sites where the returned quantity exceeds 0.5 at
               any time point
    xcut, xcut_up - currently unused, kept for backward compatibility
    cov_min - minimal coverage forwarded to get_allele_frequency_trajectories

    Returns:
    xka_q - list with one array (time point x site) per entropy category
    tt - corresponding time values as returned by Patient.times()
    '''
    print(pat_name)
    PAT = Patient.load(pat_name)
    tt = PAT.times()
    map_to_ref = PAT.map_to_external_reference(gen_region)
    # load mutation frequencies, retain only positions that map_to_ref
    freqs = PAT.get_allele_frequency_trajectories(gen_region, cov_min=cov_min)[:,:,map_to_ref[:,2]]
    gp120 = np.zeros(len(ref.seq), dtype=bool)
    gp120[list(ref.annotation['gp120'])] = True

    # determine ancestral indices -> argmax in first time point
    anc_index = np.argmax(freqs[0,:4,:],axis=0)
    anc_freq = freqs[:,anc_index,np.arange(anc_index.shape[0])]

    # determine positions where the initial state agrees with the consensus states,
    # that are ungapped in the reference and that lie outside of gp120
    good_positions = (anc_index==ref.get_consensus_indices_in_patient_region(map_to_ref))
    good_positions = good_positions&(ref.get_ungapped()[map_to_ref[:,0]])
    good_positions = good_positions&(~gp120[map_to_ref[:,0]])

    xka_q = []
    for jq in range(len(Squant)):
        idx_ref = Squant[jq]['ind']
        # reference positions are in map_to_ref[:,0]
        idx_PAT = np.in1d(map_to_ref[:,0], idx_ref)&good_positions
        anc = anc_freq[:,idx_PAT]
        xka_q.append((1.- anc)*anc if div else (1.- anc))

    # Remove outliers from the data
    if outliers:
        xka_q = [x_ka[:,np.all(x_ka<=0.5, axis=0)] for x_ka in xka_q]

    return xka_q, tt
def load_patient_data(patient_names = 'all',
                      q = 4,
                      timescale = 'years',
                      filepath = None,
                      fromHIVEVO = False):
    '''Load times, allele frequencies, viral load and dilutions of patients

    Input arguments:
    patient_names - list of patient names, or 'all' for p1 ... p11
    q - number of alleles kept from the frequency trajectories
        (only used when fromHIVEVO is True)
    timescale - time unit forwarded to Patient.times (only with fromHIVEVO)
    filepath - prefix of the .npy files; with fromHIVEVO the data are written
               there (if not None), otherwise they are read from there
    fromHIVEVO - if True load the data through the hivevo package and map the
                 frequencies onto reference coordinates

    Returns:
    dictionary with one (tt, freqs, vload, dilutions) tuple per patient name
    plus the entries 'Lref' (size of the last frequency axis) and 'pat_names'

    Raises:
    ValueError if fromHIVEVO is False and no filepath is given
    '''
    if patient_names == 'all':
        patient_names = ['p{}'.format(j+1) for j in range(11)]
    if fromHIVEVO:
        #sys.path.append('/ebio/ag-neher/share/users/vpuller/HIVEVO/HIVEVO_access')
        sys.path.append('/home/vadim/ebio/users/vpuller/HIVEVO/HIVEVO_access')
        from hivevo.patients import Patient
        from hivevo.HIVreference import HIVreference

        ref = HIVreference(load_alignment=False)
        Lref = len(ref.seq)
        data_all = {}
        for pat_name in patient_names:
            PAT = Patient.load(pat_name)
            tt = PAT.times(unit = timescale)
            vload = PAT.n_templates_viral_load
            dilutions = PAT.n_templates_dilutions
            # NOTE(review): err is not defined in this function - presumably a module-level constant
            freqs_raw = PAT.get_allele_frequency_trajectories('genomewide', error_rate=err)[:,:q,:]
            map_to_ref = PAT.map_to_external_reference('genomewide')
            # frequencies in reference coordinates; positions without a mapping stay masked
            freqs = np.ma.zeros((tt.shape[0], q, Lref)); freqs.mask = True
            freqs[:,:,map_to_ref[:,0]] = freqs_raw[:,:,map_to_ref[:,1]]
            data_all[pat_name] = (tt, freqs, vload, dilutions)
            if filepath is not None:
                np.save(filepath + '{}_data.npy'.format(pat_name), freqs.data)
                np.save(filepath + '{}_mask.npy'.format(pat_name), freqs.mask)
                np.save(filepath + '{}_tt.npy'.format(pat_name), tt)
                np.save(filepath + '{}_viral_load.npy'.format(pat_name), vload)
                np.save(filepath + '{}_dilutions.npy'.format(pat_name), dilutions.data)
                np.save(filepath + '{}_dilutions_mask.npy'.format(pat_name), dilutions.mask)
        data_all['Lref'] = Lref
        data_all['pat_names'] = patient_names

    elif filepath is not None:
        data_all = {}
        for pat_name in patient_names:
            tt = np.load(filepath + '{}_tt.npy'.format(pat_name))
            data = np.load(filepath + '{}_data.npy'.format(pat_name))
            mask = np.load(filepath + '{}_mask.npy'.format(pat_name))
            freqs = np.ma.masked_array(data, mask = mask)
            vload = np.load(filepath + '{}_viral_load.npy'.format(pat_name))
            dilutions = np.load(filepath + '{}_dilutions.npy'.format(pat_name))
            dilutions_mask = np.load(filepath + '{}_dilutions_mask.npy'.format(pat_name))
            dilutions = np.ma.masked_array(dilutions, mask = dilutions_mask)
            data_all[pat_name] = (tt, freqs, vload, dilutions)
        data_all['Lref'] = freqs.shape[2]
        data_all['pat_names'] = patient_names
    else:
        # Previously this only printed the message and then failed on the unbound data_all
        raise ValueError('Path to data is not specified')
    return data_all
Exemple #8
0
def create_all_patient_trajectories(region, patient_names=[]):
    """
    Concatenate the trajectory lists of several patients for one region.

    With an empty patient_names list the default patient set (p1-p6, p8, p9, p11) is used.
    For each patient the allele frequency trajectories of the region are loaded and passed,
    together with an HIVreference(subtype="any"), to create_trajectory_list. Returns the
    concatenation of the resulting lists, in patient order.
    """
    if patient_names == []:
        patient_names = ["p1", "p2", "p3", "p4", "p5", "p6", "p8", "p9", "p11"]

    reference = HIVreference(subtype="any")
    trajectories = []
    for name in patient_names:
        patient = Patient.load(name)
        aft = patient.get_allele_frequency_trajectories(region)
        patient_trajectories = create_trajectory_list(patient, region, aft, reference)
        trajectories = trajectories + patient_trajectories

    return trajectories
Exemple #9
0
def make_divergence_dict(time, consensus=False):
    """
    Build a dictionary with the divergence in time averaged over patients.

    Format of the dictionary : dict[region][consensus/non_consensus/all][high/low/all/first/second/third]
    (the "all" category only has the all/first/second/third entries).

    For every patient the divergence returned by get_mean_divergence_patient is interpolated
    from patient.dsi onto the entries of `time` that are smaller than the patient's last dsi
    value and added to the first entries of the running sums. Each sum is finally divided by
    the number of patients that contributed at each time point.

    time -- numpy array of time points; the prefix-based bookkeeping assumes it is sorted
            in increasing order (TODO confirm with callers)
    consensus -- forwarded to get_mean_divergence_patient
    """
    regions = ["env", "pol", "gag"]
    patient_names = ["p1", "p2", "p3", "p4", "p5", "p6", "p8", "p9", "p11"]
    fitness_keys = ["low", "high", "all", "first", "second", "third"]
    all_keys = ["all", "first", "second", "third"]
    categories = ["consensus", "non_consensus"]

    divergence_dict = {}
    for region in regions:
        region_dict = {"consensus": {}, "non_consensus": {}, "all": {}}
        divergence_dict[region] = region_dict

        # Number of patients contributing to each time point
        nb_traj = np.zeros_like(time)
        for key in fitness_keys:
            for category in categories:
                region_dict[category][key] = np.zeros_like(time, dtype=float)
            if key in all_keys:
                region_dict["all"][key] = np.zeros_like(time, dtype=float)

        for patient_name in patient_names:
            patient = Patient.load(patient_name)
            patient_div_dict = get_mean_divergence_patient(patient, region, consensus)

            # Only the time points before the patient's last sample are covered
            tmp_time = time[time < patient.dsi[-1]]
            nb_points = len(tmp_time)
            nb_traj[:nb_points] += 1

            for key in fitness_keys:
                for category in categories:
                    interpolated = np.interp(tmp_time, patient.dsi, patient_div_dict[category][key])
                    patient_div_dict[category][key] = interpolated
                    region_dict[category][key][:nb_points] += interpolated

            for key in all_keys:
                interpolated = np.interp(tmp_time, patient.dsi, patient_div_dict["all"][key])
                patient_div_dict["all"][key] = interpolated
                region_dict["all"][key][:nb_points] += interpolated

        # Turn the sums into means over the contributing patients
        for key1 in ["consensus", "non_consensus", "all"]:
            sub_dict = region_dict[key1]
            for key2, summed in list(sub_dict.items()):
                sub_dict[key2] = summed / nb_traj

    return divergence_dict
Exemple #10
0
def get_mean_sweep_per_year(region, patient_names=["p1", "p2", "p3", "p4", "p5", "p6", "p8", "p9", "p11"], threshold=0.05):
    """
    Return the per-patient rate of sites at which the initial allele drops to low frequency,
    averaged over the patients.

    For each patient, the frequency trajectory of the initial allele is extracted at every site.
    Sites where the sum over time of (frequency <= threshold) is non-zero are counted and the
    count is divided by patient.ysi[-1] (presumably the years since infection of the last
    sample - confirm). The rates are summed and divided by the number of patients.
    """
    rates = []
    for patient_name in patient_names:
        patient = Patient.load(patient_name)
        aft = patient.get_allele_frequency_trajectories(region)

        # Frequency of the initial allele for every time point and site
        initial_idx = patient.get_initial_indices(region)
        time_idx = np.arange(aft.shape[0])[:, np.newaxis, np.newaxis]
        site_idx = np.arange(aft.shape[-1])
        aft_initial = aft[time_idx, initial_idx, site_idx][:, 0, :]

        below_threshold = aft_initial <= threshold
        sweep_sites = np.where(np.sum(below_threshold, axis=0))[0]
        rates.append(sweep_sites.shape[0] / patient.ysi[-1])
    return sum(rates) / len(patient_names)
Exemple #11
0
def make_bootstrap_divergence_dict(nb_bootstrap=10, consensus=False):
    """
    Bootstrap the mean divergence in time over patients.

    The divergence of every patient (get_mean_divergence_patient) is interpolated from patient.dsi
    onto the time vector np.arange(0, 2001, 40). Then, nb_bootstrap times, a set of patient names
    is drawn with bootstrap_patient_names and the divergence is averaged over the drawn patients.

    Turn consensus to True to compute the divergence to consensus sequence instead of founder sequence.

    Returns (time, bootstrap_dict) where
    bootstrap_dict[region][consensus/non_consensus/all][high/low/all/first/second/third]
    is a dict with the "mean" and "std" of the bootstrapped averages at each time point.
    The skeleton of bootstrap_dict comes from create_div_bootstrap_dict.
    """
    regions = ["env", "pol", "gag"]
    patient_names = ["p1", "p2", "p3", "p4", "p5", "p6", "p8", "p9", "p11"]
    time = np.arange(0, 2001, 40)

    # Divergence of each patient, interpolated to the common time vector
    divergence_dict = {}
    for region in regions:
        divergence_dict[region] = {}
        for patient_name in patient_names:
            patient = Patient.load(patient_name)
            patient_div_dict = get_mean_divergence_patient(patient, region, consensus)
            divergence_dict[region][patient_name] = patient_div_dict

            for sub_dict in patient_div_dict.values():
                for key2 in sub_dict:
                    sub_dict[key2] = np.interp(time, patient.dsi, sub_dict[key2])

    # Bootstrapping over patients. Each leaf of bootstrap_dict collects one mean divergence
    # vector per bootstrap round.
    bootstrap_dict = create_div_bootstrap_dict()
    for _ in range(nb_bootstrap):
        bootstrap_names = bootstrap_patient_names()
        for region in regions:
            sampled = [divergence_dict[region][name] for name in bootstrap_names]
            template = divergence_dict[region][patient_names[0]]
            for key in template:
                for key2 in template[key]:
                    stacked = np.array([patient_dict[key][key2] for patient_dict in sampled])
                    bootstrap_dict[region][key][key2] += [np.mean(stacked, axis=0)]

    # Averaging over the bootstrap rounds
    for region_dict in bootstrap_dict.values():
        for sub_dict in region_dict.values():
            for key2 in sub_dict:
                samples = np.array(sub_dict[key2])
                sub_dict[key2] = {"mean": np.mean(samples, axis=0), "std": np.std(samples, axis=0)}

    return time, bootstrap_dict
Exemple #12
0
def get_sweep_sites_sum(
        region,
        patient_names=["p1", "p2", "p3", "p4", "p5", "p6", "p8", "p9", "p11"]):
    """
    Returns a 1D vector with the sum of sweep sites over all patients.

    For each patient the sweep mask (get_sweep_mask with threshold_low=0.5) is restricted to the
    sites selected by trajectory.get_reference_filter and truncated to its first 2964 entries
    (presumably so that the vectors of all patients have the same length - confirm). The
    resulting vectors are summed over patients as integers.
    """
    nb_sites = 2964  # number of leading sites kept for every patient
    # The reference does not depend on the patient, so it is loaded only once
    ref = HIVreference(subtype="any")

    sites = []
    for patient_name in patient_names:
        patient = Patient.load(patient_name)
        aft = patient.get_allele_frequency_trajectories(region)
        sweep_mask = get_sweep_mask(patient, aft, region, threshold_low=0.5)
        reference_mask = trajectory.get_reference_filter(
            patient, region, aft, ref)
        sites.append(list(sweep_mask[reference_mask][:nb_sites]))

    return np.sum(np.array(sites), axis=0, dtype=int)
Exemple #13
0
def divergence_contribution(region, patient_name):
    """
    Plot the normalized cumulative histogram of the divergence values of one patient.

    The divergence matrix of the patient is evaluated at the initial allele of every site, the
    last 3 time points are binned into a 1000-bin histogram and the cumulative counts, divided
    by their maximum, are plotted against the left bin edges. The figure is shown directly.
    """
    patient = Patient.load(patient_name)
    aft = patient.get_allele_frequency_trajectories(region)
    initial_idx = patient.get_initial_indices(region)
    divergence_matrix = divergence.divergence_matrix(patient, region, aft)

    # Divergence of the initial allele for every time point and site
    time_idx = np.arange(aft.shape[0])[:, np.newaxis, np.newaxis]
    site_idx = np.arange(aft.shape[-1])
    div = divergence_matrix[time_idx, initial_idx, site_idx][:, 0, :]

    counts, edges = np.histogram(div[-3:, :], bins=1000)
    left_edges = edges[:-1]
    cumulative = np.cumsum(counts)
    cumulative = cumulative / np.max(cumulative)

    plt.figure()
    plt.plot(left_edges, cumulative, label="hist")
    plt.legend()
    plt.grid()
    plt.show()
def add_RNA_haplotypes(fname, pat, region):
    '''
    Loads patient RNA haplotypes and DNA haplotypes and returns
    both in one list.

    Parameters:
    fname -- fasta file with the DNA haplotypes, opened with myopen
    pat -- patient name; translated through patient_translation when listed there
    region -- region passed to Patient.get_haplotype_alignment

    Returns:
    list with all DNA haplotypes followed by the RNA haplotypes whose read count
    (get_RNA_read_count of the description) is larger than 2, or None when loading
    failed, in which case a message is printed.
    '''
    from hivevo.patients import Patient
    try:
        pcode = patient_translation[pat] if pat in patient_translation else pat
        p = Patient.load(pcode)
        rna_haps = p.get_haplotype_alignment(region)
        with myopen(fname) as ifile:
            dna_haps = list(SeqIO.parse(ifile, 'fasta'))
        dna_haps.extend(hap for hap in rna_haps
                        if get_RNA_read_count(hap.description) > 2)
        return dna_haps
    except Exception:
        # Best effort: report the failure but let KeyboardInterrupt/SystemExit propagate
        print(fname,pat, region,"failed")
        return None
Exemple #15
0
def initial_traj_under_threshold(region, patient_name):
    """
    Return the initial-allele frequency trajectories of the sites that reach low frequency.

    The allele frequency trajectories of the patient are additionally masked where the depth
    array from trajectory.get_depth is False (it is tiled 6 times along a new allele axis to
    match the trajectories). The trajectory of the initial allele is then extracted at every
    site, and the sites for which the sum over time of (frequency <= 0.05) is non-zero are
    returned as a (time point x site) array.
    """
    threshold_low = 0.05

    patient = Patient.load(patient_name)
    aft = patient.get_allele_frequency_trajectories(region)

    # Masking low depth
    depth = trajectory.get_depth(patient, region)
    depth = np.swapaxes(np.tile(depth, (6, 1, 1)), 0, 1)
    aft.mask = np.logical_or(aft.mask, ~depth)

    # Frequency of the initial allele for every time point and site
    initial_idx = patient.get_initial_indices(region)
    time_idx = np.arange(aft.shape[0])[:, np.newaxis, np.newaxis]
    site_idx = np.arange(aft.shape[-1])
    aft_initial = aft[time_idx, initial_idx, site_idx][:, 0, :]

    below_threshold = aft_initial <= threshold_low
    # np.where returns a tuple, which adds an axis of length one that is removed again
    selected = aft_initial[:, np.where(np.sum(below_threshold, axis=0))]
    return selected[:, 0, :]
Exemple #16
0
def collect_data(patient_codes, regions, subtype):
    '''
    Collect amino acid frequencies, initial codons and phenotypes per region.

    Parameters:
    patient_codes -- iterable of patient codes; patients that fail to load are
                     reported on stdout and skipped
    regions -- iterable of region names
    subtype -- subtype forwarded to HIVreferenceAminoacid

    Returns:
    dict with keys 'af_by_pat', 'init_codon' and 'pheno', each a dict keyed by region
    holding the corresponding output of collect_weighted_aa_afs (called with the
    NL4-3 amino acid reference and a minimal coverage of 500).
    '''
    cov_min = 500
    aa_ref = 'NL4-3'
    combined_af_by_pat = {}
    initial_codons_by_pat = {}
    combined_phenos = {}

    patients = []
    for pcode in patient_codes:
        try:
            p = Patient.load(pcode)
        except Exception:
            # Skip patients that cannot be loaded, but do not swallow KeyboardInterrupt/SystemExit
            print("Can't load patient", pcode)
        else:
            patients.append(p)

    for region in regions:
        reference = HIVreferenceAminoacid(region, refname=aa_ref, subtype = subtype)
        combined_af_by_pat[region], initial_codons_by_pat[region], combined_phenos[region] =\
            collect_weighted_aa_afs(region, patients, reference, cov_min=cov_min)

    return {'af_by_pat':combined_af_by_pat, 'init_codon': initial_codons_by_pat, 'pheno':combined_phenos}
def plot_drug_resistance_mutation_trajectories(pcode):
    '''
    Auxiliary function to check for potential drug resistance evolution in RNA sequences
    (according to the original author only p10 has drug resistance mutations in the last
    two samples).

    Opens a new figure and, for every mutation listed in drug_muts for the NNRTI and NRTI
    classes (amino acid frequencies of RT) and the PI class (amino acid frequencies of PR),
    plots one minus the frequency of the first listed amino acid against p.dsi whenever that
    quantity exceeds 0.1 at some time point.
    '''
    plt.figure()
    p = Patient.load(pcode)

    # protein -> drug classes whose mutations are looked up in that protein
    drug_classes_by_protein = (('RT', ['NNRTI', 'NRTI']), ('PR', ['PI']))
    for protein, drug_classes in drug_classes_by_protein:
        aa_freqs = p.get_allele_frequency_trajectories(protein, type='aa')
        for mt in drug_classes:
            for aa1, pos, aa2 in drug_muts[mt]['mutations']:
                # positions in drug_muts are 1-based
                traj = 1 - aa_freqs[:, alphaal.index(aa1), pos - 1]
                if max(traj) > 0.1:
                    plt.plot(p.dsi, traj, '-o', label=mt + ' ' + str(pos))

    plt.legend(loc=2, ncol=2)
def plot_drug_resistance_mutation_trajectories(pcode):
    """
    Auxiliary function to check for potential drug resistance evolution in RNA sequences
    (according to the original author only p10 has drug resistance mutations in the last
    two samples).

    A new figure is opened. The mutations listed in drug_muts are checked in the amino acid
    frequencies of RT (classes NNRTI and NRTI) and of PR (class PI); for each of them, one
    minus the frequency of the first listed amino acid is plotted against p.dsi if it exceeds
    0.1 at some time point.
    """
    def _plot_protein(protein, drug_classes):
        # Plot the trajectories of all listed mutations of one protein
        aa_freqs = p.get_allele_frequency_trajectories(protein, type="aa")
        for mt in drug_classes:
            for aa1, pos, aa2 in drug_muts[mt]["mutations"]:
                # positions in drug_muts are 1-based
                traj = 1 - aa_freqs[:, alphaal.index(aa1), pos - 1]
                if max(traj) > 0.1:
                    plt.plot(p.dsi, traj, "-o", label=mt + " " + str(pos))

    plt.figure()
    p = Patient.load(pcode)
    _plot_protein("RT", ["NNRTI", "NRTI"])
    _plot_protein("PR", ["PI"])
    plt.legend(loc=2, ncol=2)
def collect_data(patients, cov_min=100, refname='HXB2'):
    '''Collect data for the fitness cost estimate

    Walks over all genome positions of every patient and records the frequency
    trajectory of each derived allele that exceeds a frequency of 0.9 at some
    time point. Positions are skipped when they do not lie within exactly one
    protein, when their ancestral codon contains a gap, when all time points
    are masked or when they have no counterpart in the reference.

    Parameters:
    patients - iterable of patient codes passed to Patient.load
    cov_min - minimal coverage forwarded to get_allele_frequency_trajectories
    refname - name of the external reference used for coordinates and entropy

    Returns:
    pandas DataFrame with one row per patient, position, derived allele and
    nonmasked time point, with the columns time, af, pos, pos_ref, protein,
    pcode, mut, S (reference entropy), syn (synonymous) and anc_cross (initial
    majority allele agrees with the reference consensus)
    '''
    ref = HIVreference(refname=refname, subtype='any', load_alignment=True)

    data = []
    for pcode in patients:
        print(pcode)

        p = Patient.load(pcode)
        # Series mapping patient coordinates to reference coordinates
        comap = (pd.DataFrame(p.map_to_external_reference('genomewide', refname=refname)[:, :2],
                              columns=[refname, 'patient'])
                   .set_index('patient', drop=True)
                   .loc[:, refname])

        aft = p.get_allele_frequency_trajectories('genomewide', cov_min=cov_min)
        for pos, aft_pos in enumerate(aft.swapaxes(0, 2)):
            fead = p.pos_to_feature[pos]

            # Keep only sites within ONE protein
            # Note: we could drop this, but then we cannot quite classify syn/nonsyn
            if len(fead['protein_codon']) != 1:
                continue

            # Exclude codons with gaps
            pc = fead['protein_codon'][0][-1]
            cod_anc = ''.join(p.initial_sequence[pos - pc: pos - pc + 3])
            if '-' in cod_anc:
                continue

            # Keep only nonmasked times
            masked_times = aft_pos[:4].mask.any(axis=0)
            if masked_times.all():
                continue
            ind = ~masked_times
            times = p.dsi[ind]
            aft_pos = aft_pos[:, ind]

            # Get site entropy
            if pos not in comap.index:
                continue
            pos_ref = comap.loc[pos]
            S_pos = ref.entropy[pos_ref]

            # Flag sites where the ancestral allele and group M agree
            anc_cross = bool(ref.consensus_indices[pos_ref] == aft_pos[:, 0].argmax())

            for ia, aft_nuc in enumerate(aft_pos[:4]):
                # Keep only derived alleles
                if alpha[ia] == p.initial_sequence[pos]:
                    continue

                # Keep only sweeps
                if not (aft_nuc > 0.9).any():
                    continue

                # Annotate with syn/nonsyn alleles
                cod_new = cod_anc[:pc] + alpha[ia] + cod_anc[pc+1:]
                syn = bool(translate(cod_anc) == translate(cod_new))

                mut = p.initial_sequence[pos]+'->'+alpha[ia]
                for t, af_nuc in zip(times, aft_nuc):
                    datum = {'time': t,
                             'af': af_nuc,
                             'pos': pos,
                             'pos_ref': pos_ref,
                             'protein': fead['protein_codon'][0][0],
                             'pcode': pcode,
                             'mut': mut,
                             'S': S_pos,
                             'syn': syn,
                             'anc_cross': anc_cross,
                            }
                    data.append(datum)

    data = pd.DataFrame(data)

    return data
def collect_data(patients,
                 cov_min=100,
                 refname='HXB2',
                 subtype='any',
                 entropy_threshold=0.1,
                 excluded_proteins=[]):
    '''Collect data for the mutation rate estimate

    Walks over all genome positions of every patient and records the frequency
    trajectory of each derived, synonymous allele. Positions are skipped when
    they do not lie within exactly one protein, when that protein is excluded,
    when the ancestral codon contains a gap, when they are flagged as RNA
    structure, when they have no counterpart in the reference or when their
    reference entropy is below entropy_threshold.

    Parameters:
    patients - iterable of patient codes passed to Patient.load
    cov_min - minimal coverage forwarded to get_allele_frequency_trajectories
    refname - name of the external reference used for coordinates and entropy
    subtype - subtype forwarded to HIVreference
    entropy_threshold - minimal reference entropy of a site to be kept
    excluded_proteins - names of proteins whose sites are skipped

    Returns:
    pandas DataFrame with one row per patient, position, allele and nonmasked
    time point, with the columns time, af, pos, refpos, protein, pcode, mut,
    subtype and refname
    '''
    print('Collect data from patients')

    ref = HIVreference(refname=refname, load_alignment=True, subtype=subtype)

    data = []
    for pcode in patients:
        print(pcode)

        p = Patient.load(pcode)
        # Series mapping patient coordinates to coordinates of the requested reference
        comap = (pd.DataFrame(p.map_to_external_reference('genomewide', refname=refname)[:, :2],
                              columns=[refname, 'patient'
                                       ]).set_index('patient',
                                                    drop=True).loc[:, refname])

        aft = p.get_allele_frequency_trajectories('genomewide',
                                                  cov_min=cov_min)
        times = p.dsi

        for pos, aft_pos in enumerate(aft.swapaxes(0, 2)):
            fead = p.pos_to_feature[pos]

            # Keep only sites within ONE protein
            if len(fead['protein_codon']) != 1:
                continue
            protein = fead['protein_codon'][0][0]
            # skip if protein is to be excluded
            if protein in excluded_proteins:
                continue

            # Exclude codons with gaps
            pc = fead['protein_codon'][0][-1]
            cod_anc = ''.join(p.initial_sequence[pos - pc:pos - pc + 3])
            if '-' in cod_anc:
                continue

            # The following filters only depend on the site, not on the allele

            # Keep only no RNA structures
            if fead['RNA']:
                continue

            # Keep only sites which are also in the reference
            if pos not in comap.index:
                continue
            refpos = comap.loc[pos]

            # Keep only high-entropy sites
            if ref.entropy[refpos] < entropy_threshold:
                continue

            for ia, aft_nuc in enumerate(aft_pos[:4]):
                # Keep only derived alleles
                if alpha[ia] == p.initial_sequence[pos]:
                    continue

                # Keep only synonymous alleles
                cod_new = cod_anc[:pc] + alpha[ia] + cod_anc[pc + 1:]
                if translate(cod_anc) != translate(cod_new):
                    continue

                mut = p.initial_sequence[pos] + '->' + alpha[ia]

                for it, (t, af_nuc) in enumerate(zip(times, aft_nuc)):
                    # Keep only nonmasked times
                    if aft_nuc.mask[it]:
                        continue

                    datum = {
                        'time': t,
                        'af': af_nuc,
                        'pos': pos,
                        'refpos': refpos,
                        'protein': protein,
                        'pcode': pcode,
                        'mut': mut,
                        'subtype': subtype,
                        'refname': refname,
                    }
                    data.append(datum)

    data = pd.DataFrame(data)

    return data
Exemple #21
0
import numpy as np
import matplotlib.pyplot as plt
import filenames
from hivevo.patients import Patient
from trajectory import create_trajectory_list, create_all_patient_trajectories

# Script-level setup: patient and region to analyse, and a font size setting
patient_name = "p1"
region = "env"
fontsize = 16

# Allele frequency trajectories of the selected patient for the selected region
patient = Patient.load(patient_name)
aft = patient.get_allele_frequency_trajectories(region)
# Trajectories for the region, created with the function's default patient list
trajectories = create_all_patient_trajectories(region)
def collect_data(patients, cov_min=100, refname='HXB2'):
    '''Collect data for the fitness cost estimate

    One record is produced for every retained time point of every derived
    allele whose frequency exceeds 0.5 at least once (a "sweep"), at the
    sites that pass all the filters applied in the loop below.

    Parameters:
       patients: iterable of patient codes accepted by Patient.load
       cov_min: passed through to get_allele_frequency_trajectories
       refname: name of the external reference, used both for HIVreference
                and for the patient -> reference coordinate map

    Returns:
       pandas.DataFrame with one row per record and the columns time, af,
       pos, pos_ref, protein, pcode, mut, mu, muAbram, S and syn.
    '''
    ref = HIVreference(refname=refname, subtype='any', load_alignment=True)
    mus = load_mutation_rates()
    mu = mus.mu
    muA = mus.muA

    data = []
    for pcode in patients:
        # Single-argument print with parentheses behaves the same on
        # Python 2 and Python 3.
        print(pcode)

        p = Patient.load(pcode)
        # Series indexed by patient coordinate, giving the reference coordinate
        comap = (pd.DataFrame(
            p.map_to_external_reference('genomewide', refname=refname)[:, :2],
            columns=[refname, 'patient']).set_index('patient',
                                                    drop=True).loc[:, refname])

        aft = p.get_allele_frequency_trajectories('genomewide',
                                                  cov_min=cov_min)
        # After swapping axes 0 and 2 the iteration runs over positions
        for pos, aft_pos in enumerate(aft.swapaxes(0, 2)):
            fead = p.pos_to_feature[pos]

            # Keep only sites within ONE protein
            # Note: we could drop this, but then we cannot quite classify syn/nonsyn
            if len(fead['protein_codon']) != 1:
                continue

            # Exclude codons with gaps
            pc = fead['protein_codon'][0][-1]
            cod_anc = ''.join(p.initial_sequence[pos - pc:pos - pc + 3])
            if '-' in cod_anc:
                continue

            # Keep only nonmasked times: a time point is dropped as soon as
            # any of the first four alleles is masked there
            masked_times = aft_pos[:4].mask.any(axis=0)
            if masked_times.all():
                continue
            ind = ~masked_times
            times = p.dsi[ind]
            aft_pos = aft_pos[:, ind]

            # Get site entropy
            if pos not in comap.index:
                continue
            pos_ref = comap.loc[pos]
            S_pos = ref.entropy[pos_ref]

            # Keep only sites where the ancestral allele and group M agree
            # (column 0 is the first retained time point)
            if ref.consensus_indices[pos_ref] != aft_pos[:, 0].argmax():
                continue

            for ia, aft_nuc in enumerate(aft_pos[:4]):
                # Keep only derived alleles
                if alpha[ia] == p.initial_sequence[pos]:
                    continue

                # Keep only sweeps
                if not (aft_nuc > 0.5).any():
                    continue

                # Annotate with syn/nonsyn alleles
                cod_new = cod_anc[:pc] + alpha[ia] + cod_anc[pc + 1:]
                syn = translate(cod_anc) == translate(cod_new)

                mut = p.initial_sequence[pos] + '->' + alpha[ia]
                mu_pos = mu[mut]
                muA_pos = muA[mut]

                # The builtin zip works on both Python 2 and Python 3
                for t, af_nuc in zip(times, aft_nuc):
                    datum = {
                        'time': t,
                        'af': af_nuc,
                        'pos': pos,
                        'pos_ref': pos_ref,
                        'protein': fead['protein_codon'][0][0],
                        'pcode': pcode,
                        'mut': mut,
                        'mu': mu_pos,
                        'muAbram': muA_pos,
                        'S': S_pos,
                        'syn': syn,
                    }
                    data.append(datum)

    data = pd.DataFrame(data)

    return data
Exemple #23
0
    rna_haps_by_tp = defaultdict(list)
    for rhap in rna_haps:  # sort RNA reads by time points
        rna_haps_by_tp[int(rhap.name.split('_')[1])].append(rhap)

    print(sorted(rna_haps_by_tp.keys()),
          [len(rna_haps_by_tp[k]) for k in sorted(rna_haps_by_tp.keys())],
          pat.dsi)
    reads = []
    for tp, p in zip(pat.dsi, proportions):
        reads.extend(sample_from_RNA_tp(rna_haps_by_tp[int(tp)], int(p * n)))

    return reads


if __name__ == "__main__":
    # Load every patient once, keyed by patient code.
    patients = {pcode: Patient.load(pcode) for pcode in pcodes}
    import pandas as pd
    #treatment_dates = pd.read_excel('data/2016-01-08_treatment_start_dates.xlsx')
    sns.set_style('darkgrid')
    # Cell populations handed to sample_from_RNA; presumably (fraction,
    # lifetime) pairs for short-lived and long-lived cells -- TODO confirm.
    # NOTE(review): the previous comment stated a 95% / 5% split, but the
    # values used here are 0.9 / 0.1 -- confirm which one is intended.
    cell_pop = [(0.9, 30.), (0.1, 500.)]

    ###
    # make read samples from pre treatment RNA reads
    ###
    reads = {}
    sampling_times = [0, 90, 180]  # at ART start, 3 month, 6 month
    # dict.items() exists on both Python 2 and Python 3 (iteritems() is
    # Python 2 only).
    for pcode, p in patients.items():
        reads[pcode] = []
        for t_ART in sampling_times:
            reads[pcode].append(sample_from_RNA(p, cell_pop, t_ART, n=1000))
 dmin = 20
 dmin_pad = 100
 var_min = 0.1
 cov_min = 200
 corr_vs_distance = {}
 bins = np.arange(0,401,40)
 binc = (bins[:-1]+bins[1:])*0.5
 all_dists = []
 all_weights = []
 for frag in all_fragments:
     if frag not in ['F'+str(i) for i in xrange(1,7)]:
         continue
     dists = []
     weights = []
     for pcode in patients:
         p = Patient.load(pcode)
         aft = p.get_allele_frequency_trajectories(frag)
         depth = p.get_fragment_depth(pad=False, limit_to_dilution=False)
         depth_pad = p.get_fragment_depth(pad=True, limit_to_dilution=False)
         for si, sample in enumerate(p.samples[:-1]):
             if depth[si][all_fragments.index(frag)]>dmin \
                 or depth_pad[si][all_fragments.index(frag)]>dmin_pad:
                 try:
                     positions, af2p, cov, af1p = sample.get_pair_frequencies(frag, var_min=var_min)
                     majority_nuc = af1p.argmax(axis=0)
                     if positions is None:
                         continue
                     LD, Dp, p12 =  LDfunc(af2p, af1p, cov, cov_min=100)
                     daf = aft[si+1][majority_nuc,positions] - aft[si][majority_nuc,positions]
                     X,Y = np.meshgrid(positions, positions)
                     dp1,dp2 = np.meshgrid(daf, daf)
    return evo_rate_dict


if __name__ == "__main__":
    colors = ["C0", "C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9"]
    regions = ["env", "pol", "gag"]

    # time = np.arange(0, 3100, 100)
    # divergence_dict = make_divergence_dict(time)
    # save_divergence_dict(divergence_dict)

    patient_names = ["p1", "p2", "p3", "p4", "p5", "p6", "p8", "p9", "p11"]
    region = "pol"
    # Running totals, summed over patients, of consensus sites taken at every
    # third position starting from offset 0, 1 and 2 (1st / 2nd / 3rd site).
    nb_consensus = np.array([0, 0, 0])
    for patient_name in patient_names:
        # Load the patient of the current iteration. This used to be
        # hard-coded to "p1", so the same patient was counted once per name.
        patient = Patient.load(patient_name)
        aft = patient.get_allele_frequency_trajectories(region)
        mask = get_consensus_mask(patient, region, aft)
        mask1 = mask[::3]
        mask2 = mask[1::3]
        mask3 = mask[2::3]
        nb_consensus += [np.sum(mask1), np.sum(mask2), np.sum(mask3)]

    # Average over patients, then normalise by a third of the region length.
    # The length is taken from the last patient loaded in the loop above.
    nb_consensus = nb_consensus / len(patient_names) / (aft.shape[-1] / 3)

    print(nb_consensus)

    # plt.figure()
    # for key1 in divergence_dict[region].keys():
    #     for key2 in divergence_dict[region][key1].keys():
    #         plt.plot(time, divergence_dict[region][key1][key2], label=f"{key1} {key2}")
def collect_data(patients, cov_min=100, no_sweeps=False, refname='HXB2'):
    '''Collect data for the fitness cost estimate

    For every retained site, one record is stored per retained time point,
    holding one minus the frequency of the ancestral allele.

    Parameters:
       patients: iterable of patient codes accepted by Patient.load
       cov_min: passed through to get_allele_frequency_trajectories
       no_sweeps: if True, drop sites where a derived allele that changes
                  the amino acid exceeds a frequency of 0.5 at any time
       refname: name of the external reference, used both for HIVreference
                and for the patient -> reference coordinate map

    Returns:
       pandas.DataFrame with the columns time, af, pos, pos_ref, protein,
       pcode, ancestral, S and n_templates.
    '''
    print('Collect data from patients')

    reference = HIVreference(refname=refname, subtype='any', load_alignment=True)
    rates = load_mutation_rates()
    mu = rates.mu
    muA = rates.muA

    records = []
    for pcode in patients:
        print(pcode)

        patient = Patient.load(pcode)

        # Series indexed by patient coordinate, giving the reference coordinate
        coord_pairs = patient.map_to_external_reference('genomewide', refname=refname)[:, :2]
        coord_table = pd.DataFrame(coord_pairs, columns=[refname, 'patient'])
        pat_to_ref = coord_table.set_index('patient', drop=True).loc[:, refname]

        trajectories = patient.get_allele_frequency_trajectories('genomewide', cov_min=cov_min)
        for pos, site_freqs in enumerate(trajectories.swapaxes(0, 2)):
            feature = patient.pos_to_feature[pos]

            # Sites covered by exactly one protein only; otherwise syn/nonsyn
            # cannot be classified unambiguously
            if len(feature['protein_codon']) != 1:
                continue

            # Skip codons that contain a gap in the initial sequence
            offset = feature['protein_codon'][0][-1]
            codon_start = pos - offset
            anc_codon = ''.join(patient.initial_sequence[codon_start: codon_start + 3])
            if '-' in anc_codon:
                continue

            # Drop the time points at which any of the first four alleles is masked
            masked_times = site_freqs[:4].mask.any(axis=0)
            if masked_times.all():
                continue
            keep = ~masked_times
            times = patient.dsi[keep]
            site_freqs = site_freqs[:, keep]
            n_templates = patient.n_templates_dilutions[keep]

            # Entropy of the matching reference site
            if pos not in pat_to_ref.index:
                continue
            pos_ref = pat_to_ref.loc[pos]
            site_entropy = reference.entropy[pos_ref]

            # The majority allele at the first retained time must match the
            # reference consensus
            if reference.consensus_indices[pos_ref] != site_freqs[:, 0].argmax():
                continue

            # Optionally discard sites carrying a nonsynonymous sweep
            if no_sweeps:
                anc_nuc = patient.initial_sequence[pos]
                nonsyn_sweep = False
                for idx, freqs in enumerate(site_freqs[:4]):
                    if alpha[idx] == anc_nuc:
                        continue
                    if not (freqs > 0.5).any():
                        continue
                    mutated = anc_codon[:offset] + alpha[idx] + anc_codon[offset+1:]
                    if translate(anc_codon) != translate(mutated):
                        nonsyn_sweep = True
                if nonsyn_sweep:
                    continue

            # Store 1 - frequency of the ancestral allele
            anc_index = patient.initial_indices[pos]
            non_anc_freqs = 1 - site_freqs[anc_index]
            for t, freq, n_temp in izip(times, non_anc_freqs, n_templates):
                records.append({'time': t,
                                'af': freq,
                                'pos': pos,
                                'pos_ref': pos_ref,
                                'protein': feature['protein_codon'][0][0],
                                'pcode': pcode,
                                'ancestral': alpha[anc_index],
                                'S': site_entropy,
                                'n_templates': n_temp,
                               })

    return pd.DataFrame(records)
Exemple #27
0
from itertools import izip
from hivevo.patients import Patient
from hivevo.samples import all_fragments
from hivevo.af_tools import LD as LDfunc
from util import store_data, load_data, fig_width, fig_fontsize, HIVEVO_colormap
import os
from filenames import get_figure_folder
import matplotlib.pyplot as plt
import seaborn as sns
# Interactive plotting and global plot style / colour map for the script.
plt.ion()

sns.set_style('darkgrid')
cols = HIVEVO_colormap()

if __name__ == "__main__":
    # Exploratory look at the minor allele frequencies of patient p10.
    p = Patient.load('p10')
    aft = p.get_allele_frequency_trajectories('genomewide')

    # First entry along axis 0 -- presumably the first sample, with shape
    # (nucleotide, position); TODO confirm
    af = aft[0]
    consensus_indices = p.get_initial_indices('genomewide')
    # One minus the largest value along axis 0, i.e. the combined frequency of
    # everything except the most frequent entry at each column
    minor_af = 1.0 - af.max(axis=0)

    # make a histogram of the minor allele frequencies
    # (50 equal-width bins on [0, 1]; bars start at 0.5 instead of 0,
    # presumably so they can be drawn on the log-scaled y axis -- TODO confirm)
    plt.figure()
    plt.hist(minor_af, bins=np.linspace(0, 1, 51), bottom=0.5)
    plt.yscale('log')
    plt.xlabel('frequency')
    plt.ylabel('number of minor variants')

    # --> there are two clear peaks one around 0.35-0.5 , the other around 0.1-0.15
    # Boolean mask of the columns whose minor frequency exceeds 5%
    variable_pos = minor_af > 0.05
Exemple #28
0
 dmin = 20
 dmin_pad = 100
 var_min = 0.1
 cov_min = 200
 corr_vs_distance = {}
 bins = np.arange(0, 401, 40)
 binc = (bins[:-1] + bins[1:]) * 0.5
 all_dists = []
 all_weights = []
 for frag in all_fragments:
     if frag not in ['F' + str(i) for i in xrange(1, 7)]:
         continue
     dists = []
     weights = []
     for pcode in patients:
         p = Patient.load(pcode)
         aft = p.get_allele_frequency_trajectories(frag)
         depth = p.get_fragment_depth(pad=False, limit_to_dilution=False)
         depth_pad = p.get_fragment_depth(pad=True, limit_to_dilution=False)
         for si, sample in enumerate(p.samples[:-1]):
             if depth[si][all_fragments.index(frag)]>dmin \
                 or depth_pad[si][all_fragments.index(frag)]>dmin_pad:
                 try:
                     positions, af2p, cov, af1p = sample.get_pair_frequencies(
                         frag, var_min=var_min)
                     majority_nuc = af1p.argmax(axis=0)
                     if positions is None:
                         continue
                     LD, Dp, p12 = LDfunc(af2p, af1p, cov, cov_min=100)
                     daf = aft[si + 1][majority_nuc,
                                       positions] - aft[si][majority_nuc,
def collect_data(patients, cov_min=100, refname='HXB2', subtype='any',
                 entropy_threshold=0.1, excluded_proteins=()):
    '''Collect data for the mutation rate estimate

    One record is produced for every unmasked time point of every derived,
    synonymous allele at the sites that pass the filters applied below.

    Parameters:
       patients: iterable of patient codes accepted by Patient.load
       cov_min: passed through to get_allele_frequency_trajectories
       refname: name of the external reference, used both for HIVreference
                and for the patient -> reference coordinate map
       subtype: passed through to HIVreference and stored in every record
       entropy_threshold: sites whose reference entropy is below this value
                          are skipped
       excluded_proteins: container of protein names to skip; compared with
                          the first element of the site's protein_codon entry

    Returns:
       pandas.DataFrame with the columns time, af, pos, refpos, protein,
       pcode, mut, subtype and refname.
    '''
    print('Collect data from patients')

    ref = HIVreference(refname=refname, load_alignment=True, subtype=subtype)

    data = []
    for pcode in patients:
        print(pcode)

        p = Patient.load(pcode)
        # Forward refname so the map targets the same reference the column is
        # labelled with (it was previously left at the method's default).
        comap = (pd.DataFrame(p.map_to_external_reference('genomewide', refname=refname)[:, :2],
                              columns=[refname, 'patient'])
                   .set_index('patient', drop=True)
                   .loc[:, refname])

        aft = p.get_allele_frequency_trajectories('genomewide', cov_min=cov_min)
        times = p.dsi

        for pos, aft_pos in enumerate(aft.swapaxes(0, 2)):
            fead = p.pos_to_feature[pos]

            # Keep only sites within ONE protein
            if len(fead['protein_codon']) != 1:
                continue
            # skip if protein is to be excluded
            protein = fead['protein_codon'][0][0]
            if protein in excluded_proteins:
                continue

            # Exclude codons with gaps
            pc = fead['protein_codon'][0][-1]
            cod_anc = ''.join(p.initial_sequence[pos - pc: pos - pc + 3])
            if '-' in cod_anc:
                continue

            # The next three filters depend only on the site, not on the
            # allele, so they are evaluated once per site rather than once
            # per allele.

            # Keep only no RNA structures
            if fead['RNA']:
                continue

            # Keep only sites which are also in the reference
            if pos not in comap.index:
                continue
            refpos = comap.loc[pos]

            # Keep only high-entropy sites
            S_pos = ref.entropy[refpos]
            if S_pos < entropy_threshold:
                continue

            nuc_anc = p.initial_sequence[pos]
            aa_anc = translate(cod_anc)

            for ia, aft_nuc in enumerate(aft_pos[:4]):
                # Keep only derived alleles
                if alpha[ia] == nuc_anc:
                    continue

                # Keep only synonymous alleles
                cod_new = cod_anc[:pc] + alpha[ia] + cod_anc[pc+1:]
                if aa_anc != translate(cod_new):
                    continue

                mut = nuc_anc+'->'+alpha[ia]

                # The builtin zip works on both Python 2 and Python 3
                for it, (t, af_nuc) in enumerate(zip(times, aft_nuc)):
                    # Keep only nonmasked times
                    if aft_nuc.mask[it]:
                        continue

                    datum = {'time': t,
                             'af': af_nuc,
                             'pos': pos,
                             'refpos': refpos,
                             'protein': protein,
                             'pcode': pcode,
                             'mut': mut,
                             'subtype': subtype,
                             'refname': refname,
                            }
                    data.append(datum)

    data = pd.DataFrame(data)

    return data
from itertools import izip
from hivevo.patients import Patient
from hivevo.samples import all_fragments
from hivevo.af_tools import LD as LDfunc
from util import store_data, load_data, fig_width, fig_fontsize, HIVEVO_colormap
import os
from filenames import get_figure_folder
import matplotlib.pyplot as plt
import seaborn as sns
# Interactive plotting and global plot style / colour map for the script.
plt.ion()

sns.set_style('darkgrid')
cols = HIVEVO_colormap()

if __name__=="__main__":
    # Exploratory look at the minor allele frequencies of patient p10.
    p = Patient.load('p10')
    aft = p.get_allele_frequency_trajectories('genomewide')

    # First entry along axis 0 -- presumably the first sample, with shape
    # (nucleotide, position); TODO confirm
    af = aft[0]
    consensus_indices = p.get_initial_indices('genomewide')
    # One minus the largest value along axis 0, i.e. the combined frequency of
    # everything except the most frequent entry at each column
    minor_af = 1.0 - af.max(axis=0)

    # make a histogram of the minor allele frequencies
    # (50 equal-width bins on [0, 1]; bars start at 0.5 instead of 0,
    # presumably so they can be drawn on the log-scaled y axis -- TODO confirm)
    plt.figure()
    plt.hist(minor_af, bins = np.linspace(0,1,51), bottom=0.5)
    plt.yscale('log')
    plt.xlabel('frequency')
    plt.ylabel('number of minor variants')

    # --> there are two clear peaks one around 0.35-0.5 , the other around 0.1-0.15
    # Boolean mask of the columns whose minor frequency exceeds 5%
    variable_pos = minor_af>0.05
Exemple #31
0
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import seaborn as sns
from hivevo.patients import Patient
from hivevo.samples import all_fragments
from hivevo.sequence import alpha
# Plot the trajectories of the SNPs that rise in region RT1 of patient p3.
plt.ion()
sns.set_style('darkgrid')

p = Patient.load('p3')
aft = p.get_allele_frequency_trajectories('RT1')
# Sum of x * (1 - x) over axis 1 (the axis indexed by `ni` below), giving one
# diversity value per time point and position
div = (aft * (1.0 - aft)).sum(axis=1)
# Positions whose diversity exceeds 0.1 at some time point
var_pos = div.max(axis=0) > 0.1

plt.figure()
for pos in np.where(var_pos)[0]:
    for ni in range(5):
        traj = aft[:, ni, pos]
        # Keep trajectories that reach more than 20% at some point and start
        # below 50%
        if traj.max() > 0.2 and traj[0] < 0.5:  #and traj[-1]<0.2:
            # Colour encodes the relative position within the region; the
            # label uses the 1-based position followed by the allele
            plt.plot(p.ysi[~traj.mask],
                     traj[~traj.mask],
                     c=cm.jet(1.0 * pos / aft.shape[-1]),
                     label=str(pos + 1) + alpha[ni],
                     lw=2)
            # Explicit formatting gives the same space-separated output on
            # Python 2 and Python 3 (the print statement is Python 2 only)
            print('%s %s %s' % (pos, alpha[ni], np.round(traj, 2)))

plt.ylabel('SNP frequency')
plt.xlabel('ETI [years]')
plt.legend(loc=2)
# NOTE(review): the file name is spelled 'mutatons'; it is left untouched so
# that existing output paths do not change.
plt.savefig('mutatons_p3_RT1.pdf')
    
    if j == Nit -1:
        print 'WARNING from amoeba_vp:\n    the maximum number of iterations has been reached, max(dx/x) = ',\
        np.max(np.abs((xx-xcenter)/xcenter))
#        np.max(np.array([LA.norm(x-xx[0,:]) for x in xx[1:,:]])) 
    if return_f:
        return xx[0,:],ff[0], j
    else:
        return xx[0,:]
        
if __name__=="__main__":
    '''Studying fluctuations of nucleotides with high fitness'''
    # Non-interactive plotting for this script
    plt.ioff()
    patient_name = 'p1'
    gen_region = 'pol' #'gag' #'pol' #'gp41' #'gp120' #'vif' #'RRE'
    PAT = Patient.load(patient_name)
    
    # Hard-coded output directory; created if it does not exist yet
    outdir_name = '/ebio/ag-neher/share/users/vpuller/Fabio_data_work/Stratify/'
    if not os.path.exists(outdir_name):
        os.makedirs(outdir_name)
        
    # Sampling times of the patient and their number
    tt = PAT.times()
    nt = tt.shape[0]
    counts = PAT.get_allele_count_trajectories(gen_region)
    # `.data` takes the underlying array; if the trajectories are a numpy
    # masked array the mask is discarded here -- TODO confirm
    freqs = PAT.get_allele_frequency_trajectories(gen_region).data
    
    L = freqs.shape[2] # genetic region length
      
    # Pair of nucleotides to compare
    nuc1 = 'A'
    nuc2 = 'G'
    # Index of nuc1 within `bases` (defined outside this block; presumably an
    # array of nucleotide characters -- TODO confirm)
    j1 = np.where(bases == nuc1)[0][0]
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import seaborn as sns
from hivevo.patients import Patient
from hivevo.samples import all_fragments
from hivevo.sequence import alpha
# Plot the trajectories of the SNPs that rise in region RT1 of patient p3.
plt.ion()
sns.set_style('darkgrid')

p = Patient.load('p3')
aft = p.get_allele_frequency_trajectories('RT1')
# Sum of x * (1 - x) over axis 1 (the axis indexed by `ni` below), giving one
# diversity value per time point and position
div = (aft*(1.0-aft)).sum(axis=1)
# Positions whose diversity exceeds 0.1 at some time point
var_pos = div.max(axis=0)>0.1

plt.figure()
for pos in np.where(var_pos)[0]:
    for ni in range(5):
        traj = aft[:,ni,pos]
        # Keep trajectories that reach more than 20% at some point and start
        # below 50%
        if traj.max()>0.2 and traj[0]<0.5 : #and traj[-1]<0.2:
            # Colour encodes the relative position within the region; the
            # label uses the 1-based position followed by the allele
            plt.plot(p.ysi[~traj.mask], traj[~traj.mask], c = cm.jet(1.0*pos/aft.shape[-1]), label = str(pos+1)+alpha[ni], lw=2)
            # Explicit formatting gives the same space-separated output on
            # Python 2 and Python 3 (the print statement is Python 2 only)
            print('%s %s %s' % (pos, alpha[ni], np.round(traj,2)))

plt.ylabel('SNP frequency')
plt.xlabel('ETI [years]')
plt.legend(loc=2)
# NOTE(review): the file name is spelled 'mutatons'; it is left untouched so
# that existing output paths do not change.
plt.savefig('mutatons_p3_RT1.pdf')