Beispiel #1
0
def collect_data(patient_codes, regions, reference, synnonsyn=True):
    '''
    Loop over regions and produce a dictionary that contains the frequencies,
    syn/nonsyn designations and mutation rates.

    Parameters:
        patient_codes: iterable of patient codes; each one is loaded with
            Patient.load.
        regions: iterable of region names. "genomewide" selects the whole
            reference consensus, any other name is looked up in
            reference.annotation and extracted from the consensus.
        reference: object providing `consensus` and `annotation`; it is also
            forwarded to collect_weighted_afs.
        synnonsyn: forwarded unchanged to collect_weighted_afs.

    Returns:
        dict with the keys 'af_by_pat', 'mut_rate', 'syn_by_pat' and
        'syn_by_pat_uc', each holding a dict keyed by region.
    '''
    combined_af_by_pat = {}
    syn_nonsyn_by_pat = {}
    syn_nonsyn_by_pat_unconstrained = {}
    consensus_mutation_rate = {}

    mutation_rates = load_mutation_rates()['mu']
    # For each nucleotide, sum the rates of all mutations whose key starts
    # with that nucleotide. `.items()` is used instead of `.iteritems()`
    # because the latter does not exist on Python 3 dicts.
    total_muts = {nuc: sum(rate for mut, rate in mutation_rates.items() if mut[0] == nuc)
                  for nuc in 'ACGT'}

    patients = []
    for pcode in patient_codes:
        print(pcode)
        patients.append(Patient.load(pcode))

    # The joined consensus does not depend on the region: build it once
    consensus_seq = "".join(reference.consensus)

    for region in regions:
        if region == "genomewide":
            region_seq = consensus_seq
        else:
            region_seq = reference.annotation[region].extract(consensus_seq)

        (combined_af_by_pat[region],
         syn_nonsyn_by_pat[region],
         syn_nonsyn_by_pat_unconstrained[region]) = collect_weighted_afs(
            region, patients, reference, synnonsyn=synnonsyn)

        # Gaps and ambiguous positions have no defined rate -> NaN
        consensus_mutation_rate[region] = np.array(
            [np.nan if nuc in ('-', 'N') else total_muts[nuc] for nuc in region_seq])

    return {'af_by_pat': combined_af_by_pat,
            'mut_rate': consensus_mutation_rate,
            'syn_by_pat': syn_nonsyn_by_pat,
            'syn_by_pat_uc': syn_nonsyn_by_pat_unconstrained}
Beispiel #2
0
    ax.grid(True)

    plt.tight_layout()

    plt.ion()
    plt.show()

    for ext in ['svg', 'png', 'pdf']:
        fig.savefig('../figures/figure_S1.' + ext)

    return ax


# Script
if __name__ == '__main__':

    # Command-line entry point: parse the options, load the mutation rates
    # and plot the comparison of the two rate estimates.
    parser = argparse.ArgumentParser(description='Figure S1')
    # type=float so that a value given on the command line is numeric like
    # the default, instead of being passed on as a string
    parser.add_argument('--threshold',
                        type=float,
                        default=0.1,
                        help='diversity threshold')
    parser.add_argument('--gp120',
                        action='store_true',
                        default=False,
                        help='exclude gp120')
    args = parser.parse_args()

    mu = load_mutation_rates(args.threshold, args.gp120)

    plot_comparison(mu['mu'],
                    mu['muA'],
                    dmulog10=mu['dmulog10'],
                    dmuAlog10=mu['dmuAlog10'])
            transform=ax.transAxes,
            fontsize=fs)
    ax.grid(True)

    plt.tight_layout()

    plt.ion()
    plt.show()

    for ext in ['svg', 'png', 'pdf']:
        fig.savefig('../figures/figure_S1.'+ext)

    return ax



# Script
if __name__ == '__main__':

    # Command-line entry point: parse the options, load the mutation rates
    # and plot the comparison of the two rate estimates.
    parser = argparse.ArgumentParser(description='Figure S1')
    # type=float so that a value given on the command line is numeric like
    # the default, instead of being passed on as a string
    parser.add_argument('--threshold', type=float, default=0.1, help='diversity threshold')
    parser.add_argument('--gp120', action='store_true', default=False, help='exclude gp120')
    args = parser.parse_args()

    mu = load_mutation_rates(args.threshold, args.gp120)

    plot_comparison(mu['mu'],
                    mu['muA'],
                    dmulog10=mu['dmulog10'],
                    dmuAlog10=mu['dmuAlog10'])
def collect_data(patients, cov_min=100, no_sweeps=False, refname='HXB2'):
    '''Collect data for the fitness cost estimate.

    Parameters:
        patients: iterable of patient codes, each loaded with Patient.load.
        cov_min: minimal coverage, forwarded to
            Patient.get_allele_frequency_trajectories.
        no_sweeps: if True, skip sites at which a non-ancestral allele that
            changes the amino acid exceeds a frequency of 0.5 at any retained
            time point.
        refname: name of the external reference used for the coordinate map
            and for the entropy / consensus lookups.

    Returns:
        pandas.DataFrame built from one record per (patient, site, retained
        time point). Each record has the keys 'time', 'af', 'pos', 'pos_ref',
        'protein', 'pcode', 'ancestral', 'S' and 'n_templates', where 'af' is
        one minus the frequency of the ancestral allele.
    '''
    print('Collect data from patients')

    ref = HIVreference(refname=refname, subtype='any', load_alignment=True)

    data = []
    for pcode in patients:
        print(pcode)

        p = Patient.load(pcode)
        # Series indexed by patient coordinate whose values are the
        # corresponding coordinates in the external reference
        comap = (pd.DataFrame(p.map_to_external_reference('genomewide', refname=refname)[:, :2],
                              columns=[refname, 'patient'])
                   .set_index('patient', drop=True)
                   .loc[:, refname])

        aft = p.get_allele_frequency_trajectories('genomewide', cov_min=cov_min)
        for pos, aft_pos in enumerate(aft.swapaxes(0, 2)):
            fead = p.pos_to_feature[pos]

            # Keep only sites within ONE protein
            # Note: we could drop this, but then we cannot quite classify syn/nonsyn
            if len(fead['protein_codon']) != 1:
                continue

            # Exclude codons with gaps
            pc = fead['protein_codon'][0][-1]
            cod_anc = ''.join(p.initial_sequence[pos - pc: pos - pc + 3])
            if '-' in cod_anc:
                continue

            # Keep only nonmasked times
            masked_times = aft_pos[:4].mask.any(axis=0)
            if masked_times.all():
                continue
            ind = ~masked_times
            times = p.dsi[ind]
            aft_pos = aft_pos[:, ind]
            n_templates = p.n_templates_dilutions[ind]

            # Get site entropy
            if pos not in comap.index:
                continue
            pos_ref = comap.loc[pos]
            S_pos = ref.entropy[pos_ref]

            # Keep only sites where the ancestral allele and group M agree
            if ref.consensus_indices[pos_ref] != aft_pos[:, 0].argmax():
                continue

            # Filter out sweeps if so specified, only for nonsyn
            if no_sweeps:
                found = False
                nuc_anc = p.initial_sequence[pos]
                for ia, aft_nuc in enumerate(aft_pos[:4]):
                    if (alpha[ia] != nuc_anc) and (aft_nuc > 0.5).any():
                        cod_new = cod_anc[:pc] + alpha[ia] + cod_anc[pc+1:]
                        if translate(cod_anc) != translate(cod_new):
                            # One nonsynonymous sweep is enough to drop the site
                            found = True
                            break
                if found:
                    continue

            # Keep only 1 - ancestral allele
            ia = p.initial_indices[pos]
            aft_nuc = 1 - aft_pos[ia]
            for t, af_nuc, n_temp in zip(times, aft_nuc, n_templates):
                datum = {'time': t,
                         'af': af_nuc,
                         'pos': pos,
                         'pos_ref': pos_ref,
                         'protein': fead['protein_codon'][0][0],
                         'pcode': pcode,
                         'ancestral': alpha[ia],
                         'S': S_pos,
                         'n_templates': n_temp,
                        }
                data.append(datum)

    data = pd.DataFrame(data)

    return data
def collect_data(patients, cov_min=100, no_sweeps=False, refname='HXB2'):
    '''Collect data for the fitness cost estimate.

    Parameters:
        patients: iterable of patient codes, each loaded with Patient.load.
        cov_min: minimal coverage, forwarded to
            Patient.get_allele_frequency_trajectories.
        no_sweeps: if True, skip sites at which a non-ancestral allele that
            changes the amino acid exceeds a frequency of 0.5 at any retained
            time point.
        refname: name of the external reference used for the coordinate map
            and for the entropy / consensus lookups.

    Returns:
        pandas.DataFrame built from one record per (patient, site, retained
        time point). Each record has the keys 'time', 'af', 'pos', 'pos_ref',
        'protein', 'pcode', 'ancestral', 'S' and 'n_templates', where 'af' is
        one minus the frequency of the ancestral allele.
    '''
    print('Collect data from patients')

    ref = HIVreference(refname=refname, subtype='any', load_alignment=True)

    data = []
    for pcode in patients:
        print(pcode)

        p = Patient.load(pcode)
        # Series indexed by patient coordinate whose values are the
        # corresponding coordinates in the external reference
        comap = (pd.DataFrame(
            p.map_to_external_reference('genomewide', refname=refname)[:, :2],
            columns=[refname, 'patient']).set_index('patient',
                                                    drop=True).loc[:, refname])

        aft = p.get_allele_frequency_trajectories('genomewide',
                                                  cov_min=cov_min)
        for pos, aft_pos in enumerate(aft.swapaxes(0, 2)):
            fead = p.pos_to_feature[pos]

            # Keep only sites within ONE protein
            # Note: we could drop this, but then we cannot quite classify syn/nonsyn
            if len(fead['protein_codon']) != 1:
                continue

            # Exclude codons with gaps
            pc = fead['protein_codon'][0][-1]
            cod_anc = ''.join(p.initial_sequence[pos - pc:pos - pc + 3])
            if '-' in cod_anc:
                continue

            # Keep only nonmasked times
            masked_times = aft_pos[:4].mask.any(axis=0)
            if masked_times.all():
                continue
            ind = ~masked_times
            times = p.dsi[ind]
            aft_pos = aft_pos[:, ind]
            n_templates = p.n_templates_dilutions[ind]

            # Get site entropy
            if pos not in comap.index:
                continue
            pos_ref = comap.loc[pos]
            S_pos = ref.entropy[pos_ref]

            # Keep only sites where the ancestral allele and group M agree
            if ref.consensus_indices[pos_ref] != aft_pos[:, 0].argmax():
                continue

            # Filter out sweeps if so specified, only for nonsyn
            if no_sweeps:
                found = False
                nuc_anc = p.initial_sequence[pos]
                for ia, aft_nuc in enumerate(aft_pos[:4]):
                    if (alpha[ia] != nuc_anc) and (aft_nuc > 0.5).any():
                        cod_new = cod_anc[:pc] + alpha[ia] + cod_anc[pc + 1:]
                        if translate(cod_anc) != translate(cod_new):
                            # One nonsynonymous sweep is enough to drop the site
                            found = True
                            break
                if found:
                    continue

            # Keep only 1 - ancestral allele
            ia = p.initial_indices[pos]
            aft_nuc = 1 - aft_pos[ia]
            for t, af_nuc, n_temp in zip(times, aft_nuc, n_templates):
                datum = {
                    'time': t,
                    'af': af_nuc,
                    'pos': pos,
                    'pos_ref': pos_ref,
                    'protein': fead['protein_codon'][0][0],
                    'pcode': pcode,
                    'ancestral': alpha[ia],
                    'S': S_pos,
                    'n_templates': n_temp,
                }
                data.append(datum)

    data = pd.DataFrame(data)

    return data