Example #1
0
def collect_diverse_sites_aminoacids(patients,
                                     regions,
                                     cov_min=1000,
                                     af_threshold=0.01,
                                     subtype='patient',
                                     refname='HXB2'):
    '''Fraction of sites that are diverse for different quantiles of subtype entropy

    Parameters:
       patients: patient codes, each one is loaded with Patient.load
       regions: regions to analyze
       cov_min: minimal coverage handed to get_allele_frequency_trajectories
       af_threshold: a site counts as diverse if the summed frequency exceeds
                     the largest single frequency by more than this value
       subtype: 'patient' compares every patient to the reference of its own
                subtype (B, C or AE); any other value (e.g. 'any') is handed
                to HIVreferenceAminoacid and used for all patients
       refname: name of the external reference

    Returns:
       pandas DataFrame with one row per patient, region and time point; the
       columns are 'pcode', 'region', 'time' and one 'S<i+1>' per key i of
       the quantiles returned by get_quantiles.
    '''
    def load_reference(region, ref_subtype):
        '''Load a reference and attach the mask of its ungapped positions'''
        ref = HIVreferenceAminoacid(region,
                                    refname=refname,
                                    subtype=ref_subtype)
        ref.good_pos_in_reference = ref.get_ungapped(threshold=0.05)
        return ref

    ps = {pcode: Patient.load(pcode) for pcode in patients}

    # decide the mode once: the argument must not be reused as a loop variable,
    # otherwise the per-patient reference is never selected
    per_patient = (subtype == 'patient')

    diverse_fraction = []
    for region in regions:
        print(region)

        if per_patient:
            refs = {st: load_reference(region, st) for st in ['B', 'C', 'AE']}
        else:
            ref = load_reference(region, subtype)

        for pcode in patients:
            p = ps[pcode]

            if per_patient:
                ref = refs[p['Subtype']]

            aft = p.get_allele_frequency_trajectories(region,
                                                      cov_min=cov_min,
                                                      type='aa')
            # a scalar mask cannot be indexed by position, expand it
            if len(aft.mask.shape) < 2:
                aft.mask = np.zeros_like(aft, dtype=bool)

            # get patient to subtype map and subset entropy vectors
            # column 0 indexes the reference, column 1 the patient frequencies
            patient_to_subtype = p.map_to_external_reference_aminoacids(
                region, refname=refname)
            subtype_entropy = ref.get_entropy_in_patient_region(
                patient_to_subtype)
            entropy_quantiles = get_quantiles(4, subtype_entropy)
            good_ref = ref.good_pos_in_reference[patient_to_subtype[:, 0]]

            # loop over times and calculate the diverse fraction per quantile
            for t, af in izip(p.dsi, aft):
                # good_af is a mask for useful columns
                good_af = (~np.any(
                    af.mask, axis=0)[patient_to_subtype[:, 1]]) & good_ref
                # tmp_af has only columns that are mappable to the reference
                tmp_af = af[:, patient_to_subtype[:, 1]]

                tmp = {}
                for i, Squant in entropy_quantiles.iteritems():
                    # Squant['ind'] is a mask for positions of this quantile
                    af_quant = tmp_af[:, Squant['ind'] * good_af]
                    tmp['S' + str(i + 1)] = np.mean(
                        af_quant.max(axis=0) <
                        af_quant.sum(axis=0) - af_threshold)
                tmp.update({'pcode': pcode, 'region': region, 'time': t})
                diverse_fraction.append(tmp)

    return pd.DataFrame(diverse_fraction)
Example #2
0
def make_tree(region,
              fn_ali,
              fn_tree,
              tmpfile='/tmp/seqs.fasta',
              fasttreebin='FastTree'):
    '''Make tree of minor haplotype variants from all patients + outgroup

    Parameters:
       region: region whose haplotype alignments are collected; the same
               annotation is extracted from the reference as outgroup
       fn_ali: file the muscle alignment is written to
       fn_tree: file the tree is written to; the FastTree output is
                overwritten by the rerooted tree
       tmpfile: scratch FASTA file, removed after each external program
       fasttreebin: FastTree executable

    Raises:
       CalledProcessError (from sp.check_call) if muscle or FastTree exit
       with a nonzero status.
       ValueError if no leaf of the tree contains the outgroup id.
    '''

    # Collect haplotypes from patients
    seqs = []
    patients = ['p1', 'p2', 'p3', 'p5', 'p6', 'p8', 'p9', 'p11']
    for pcode in patients:
        p = Patient.load(pcode)

        for seq in p.get_haplotype_alignment(region):
            seq.id = 'patient_' + pcode + '_' + seq.id
            seq.name = 'patient_' + pcode + '_' + seq.name
            seq.description = 'patient ' + pcode + ', ' + seq.description

            seqs.append(seq)

    # Add reference as an outgroup
    ref = HIVreference(load_alignment=False)
    refseq = ref.annotation[region].extract(ref.seq)
    seqs.append(refseq)

    # Align (Muscle)
    if os.path.isfile(tmpfile):
        os.remove(tmpfile)
    SeqIO.write(seqs, tmpfile, 'fasta')

    try:
        # check_call: a failed alignment must not be parsed as if it succeeded
        sp.check_call([
            'muscle', '-maxiters', '1', '-diags', '-in', tmpfile, '-out',
            fn_ali
        ])
    finally:
        os.remove(tmpfile)

    # Annotate for FastTree (does not accept double labels)
    seqs = []
    for seq in SeqIO.parse(fn_ali, 'fasta'):
        suffix = '_#' + str(len(seqs))
        seq.name = seq.name + suffix
        seq.id = seq.id + suffix
        seqs.append(seq)
    SeqIO.write(seqs, tmpfile, 'fasta')

    # FastTree
    try:
        sp.check_call([fasttreebin, '-nt', '-out', fn_tree, tmpfile])
    finally:
        os.remove(tmpfile)

    # reroot with outgroup
    tree = Phylo.read(fn_tree, 'newick')
    for leaf in tree.get_terminals():
        if refseq.id in leaf.name:
            break
    else:
        # without this the tree would silently be rooted on the last leaf
        raise ValueError('outgroup ' + str(refseq.id) +
                         ' not found among the leaves of ' + str(fn_tree))
    tree.root_with_outgroup(leaf)
    Phylo.write(tree, fn_tree, 'newick')
Example #3
0
def get_toaway_histograms(subtype, Sc=1):
    '''
    calculate allele frequency histograms for each patient and each time points
    separately for sites that agree or disagree with consensus.
    this can be done for a low and high entropy category with the threshold set by Sc

    Parameters:
       subtype: 'patient' loads the reference of each patient's own subtype;
                any other value (e.g. 'any') is handed to HIVreference once
                and used for all patients
       Sc: entropy threshold in bits, sites below it are 'low', others 'high'

    Returns:
       (to_histogram, away_histogram): dicts keyed by (pcode, Sbin) holding a
       dict time -> histogram counts over af_bins. away_histogram covers sites
       whose initial state equals the reference consensus, to_histogram the
       sites where it differs.

    Uses the module level names patients, regions, cov_min and af_bins.
    '''
    def load_reference(ref_subtype):
        '''Load the reference and the mask of its ungapped positions'''
        reference = HIVreference(refname='HXB2', subtype=ref_subtype)
        return reference, reference.get_ungapped(threshold=0.05)

    away_histogram = {(pcode, Sbin): {}
                      for Sbin in ['low', 'high'] for pcode in patients}
    to_histogram = {(pcode, Sbin): {}
                    for Sbin in ['low', 'high'] for pcode in patients}
    # a reference shared by all patients (e.g. 'any' meaning comparison to
    # groupM) can be loaded here
    if subtype != 'patient':
        hxb2, good_pos_in_reference = load_reference(subtype)

    # determine divergence and minor variation at sites that agree with consensus or not
    for pcode in patients:
        try:
            p = Patient.load(pcode)
        except Exception:
            # skip patients that cannot be loaded, but let KeyboardInterrupt
            # and SystemExit propagate
            print("Can't load patient %s" % pcode)
            continue

        print('subtype: %s patient %s' % (subtype, pcode))
        if subtype == 'patient':  # load the ref alignment of the patient's subtype
            hxb2, good_pos_in_reference = load_reference(p['Subtype'])

        for region in regions:
            aft = p.get_allele_frequency_trajectories(region, cov_min=cov_min)

            # get patient to subtype map and subset entropy vectors, convert to bits
            # column 0 indexes the reference, column 2 the patient arrays
            patient_to_subtype = p.map_to_external_reference(
                region, refname='HXB2')
            subtype_entropy = hxb2.get_entropy_in_patient_region(
                patient_to_subtype) / np.log(2.0)
            ancestral = p.get_initial_indices(region)[
                patient_to_subtype[:, 2]]
            consensus = hxb2.get_consensus_indices_in_patient_region(
                patient_to_subtype)
            good_ref = good_pos_in_reference[patient_to_subtype[:, 0]]
            away_sites = ancestral == consensus
            aft_HXB2 = aft[:, :, patient_to_subtype[:, 2]]

            for H, sites in [(away_histogram, away_sites),
                             (to_histogram, ~away_sites)]:
                for Sbin in ['low', 'high']:
                    if Sbin == 'low':
                        ind = sites & (subtype_entropy < Sc) & good_ref
                    else:
                        ind = sites & (subtype_entropy >= Sc) & good_ref

                    # the selection does not depend on time, compute it once
                    positions = np.where(ind)[0]
                    states = ancestral[ind]
                    # NOTE(review): the key has no region component, a later
                    # region replaces the histogram of an earlier one at the
                    # same time point - confirm this is intended
                    for ti, t in enumerate(p.dsi):
                        y, x = np.histogram(
                            aft_HXB2[ti, states, positions].compressed(),
                            bins=af_bins)
                        H[(pcode, Sbin)][t] = y

    return to_histogram, away_histogram
Example #4
0
def collect_data():
    '''Gather CD4 counts, viral load and deep sequencing times

    Returns a dict with the results of get_CD4() and get_VL() under 'CD4' and
    'VL', and under 'deep sequencing' a dict mapping each patient code
    ('p' + number from pnumbers) to the dsi attribute of the loaded patient.
    '''
    cd4 = get_CD4()
    viral_load = get_VL()

    sequencing_times = {}
    for number in pnumbers:
        code = 'p' + str(number)
        sequencing_times[code] = Patient.load(code).dsi

    return {'CD4': cd4, 'VL': viral_load, 'deep sequencing': sequencing_times}
Example #5
0
def get_toaway_histograms_aminoacids(subtype, Sc=1, refname='HXB2'):
    '''Calculate SFS for towards/away from cross-sectional consensus for amino acids

    Calculate allele frequency histograms for each patient and each time points
    separately for sites that agree or disagree with consensus.
    this can be done for a low and high entropy category with the threshold set by Sc

    Parameters:
       subtype: 'patient' compares every patient to the reference of its own
                subtype (B, C or AE); any other value (e.g. 'any') is handed
                to HIVreferenceAminoacid and used for all patients
       Sc: entropy threshold in bits, sites below it are 'low', others 'high'
       refname: name of the external reference

    Returns:
       (to_histogram, away_histogram): dicts keyed by (pcode, Sbin) holding a
       dict time -> histogram counts over af_bins. away_histogram covers sites
       whose initial state equals the reference consensus, to_histogram the
       sites where it differs.

    Uses the module level names patients, regions, cov_min and af_bins.
    '''
    def load_reference(region, ref_subtype):
        '''Load a reference and attach the mask of its ungapped positions'''
        ref = HIVreferenceAminoacid(region, refname=refname, subtype=ref_subtype)
        ref.good_pos_in_reference = ref.get_ungapped(threshold=0.05)
        return ref

    ps = {pcode: Patient.load(pcode) for pcode in patients}

    # decide the mode once: the argument must not be reused as a loop variable,
    # otherwise the per-patient reference is never selected
    per_patient = (subtype == 'patient')

    away_histogram = {(pcode, Sbin): {} for Sbin in ['low', 'high'] for pcode in patients}
    to_histogram = {(pcode, Sbin): {} for Sbin in ['low', 'high'] for pcode in patients}
    for region in regions:

        # a reference shared by all patients can be loaded here, the
        # per-patient case needs one reference per subtype
        if per_patient:
            refs = {st: load_reference(region, st) for st in ['B', 'C', 'AE']}
        else:
            ref = load_reference(region, subtype)

        # determine divergence and minor variation at sites that agree with consensus or not
        for pcode in patients:
            p = ps[pcode]
            print('subtype: %s patient %s' % (subtype, pcode))

            if per_patient:
                ref = refs[p['Subtype']]

            aft = p.get_allele_frequency_trajectories(region, cov_min=cov_min,
                                                      type='aa')

            # get patient to subtype map and subset entropy vectors, convert to bits
            # column 0 indexes the reference, the last column the patient arrays
            patient_to_subtype = p.map_to_external_reference_aminoacids(region, refname=refname)
            subtype_entropy = ref.get_entropy_in_patient_region(patient_to_subtype) / np.log(2.0)
            ancestral = p.get_initial_indices(region, type='aa')[patient_to_subtype[:, -1]]
            consensus = ref.get_consensus_indices_in_patient_region(patient_to_subtype)
            good_ref = ref.good_pos_in_reference[patient_to_subtype[:, 0]]
            away_sites = ancestral == consensus
            aft_ref = aft[:, :, patient_to_subtype[:, -1]]

            # H is the dict to add this to, sites are the consensus/non consensus positions
            for H, sites in [(away_histogram, away_sites), (to_histogram, ~away_sites)]:
                for Sbin in ['low', 'high']:
                    # make a boolean array with the relevant positions == True
                    if Sbin == 'low':
                        ind = sites & (subtype_entropy < Sc) & good_ref
                    else:
                        ind = sites & (subtype_entropy >= Sc) & good_ref

                    # the selection does not depend on time, compute it once
                    positions = np.where(ind)[0]
                    states = ancestral[ind]
                    # NOTE(review): the key has no region component, a later
                    # region replaces the histogram of an earlier one at the
                    # same time point - confirm this is intended
                    for ti, t in enumerate(p.dsi):
                        # for each time point, make an allele frequency histogram
                        y, x = np.histogram(aft_ref[ti, states, positions].compressed(),
                                            bins=af_bins)
                        H[(pcode, Sbin)][t] = y

    return to_histogram, away_histogram
Example #6
0
def collect_correlations(patients,
                         regions,
                         cov_min=1000,
                         subtype='patient',
                         refname='HXB2'):
    '''Correlation of subtype entropy and intra-patient diversity

    Parameters:
       patients: patient codes, each one is loaded with Patient.load
       regions: regions to analyze
       cov_min: minimal coverage handed to get_allele_frequency_trajectories
       subtype: 'patient' compares every patient to the reference of its own
                subtype (B, C or AE); any other value (e.g. 'any') is handed
                to HIVreference and used for all patients
       refname: name of the external reference

    Returns:
       pandas DataFrame with the columns 'pcode', 'region', 'time', 'rho' and
       'pval'. A time point only yields a row if more than half of the mapped
       positions are usable.
    '''
    def load_reference(ref_subtype):
        '''Load a reference and attach the mask of its ungapped positions'''
        ref = HIVreference(refname=refname, subtype=ref_subtype)
        ref.good_pos_in_reference = ref.get_ungapped(threshold=0.05)
        return ref

    # decide the mode once: the argument must not be reused as a loop variable,
    # otherwise the per-patient reference is never selected
    per_patient = (subtype == 'patient')
    if per_patient:
        refs = {st: load_reference(st) for st in ['B', 'C', 'AE']}
    else:
        ref = load_reference(subtype)

    correlations = []
    for pcode in patients:
        p = Patient.load(pcode)

        if per_patient:
            ref = refs[p['Subtype']]

        for region in regions:
            aft = p.get_allele_frequency_trajectories(region, cov_min=cov_min)
            # a scalar mask cannot be indexed by position, expand it
            if len(aft.mask.shape) < 2:
                aft.mask = np.zeros_like(aft, dtype=bool)

            # get patient to subtype map and subset entropy vectors
            # column 0 indexes the reference, column 2 the patient frequencies
            patient_to_subtype = p.map_to_external_reference(region,
                                                             refname=refname)
            subtype_entropy = ref.get_entropy_in_patient_region(
                patient_to_subtype)
            good_ref = ref.good_pos_in_reference[patient_to_subtype[:, 0]]

            # loop over times and calculate the correlation for each value
            for t, af in izip(p.dsi, aft):
                # entropy over all but the last row of the frequency array
                patient_entropy = np.maximum(
                    0, -np.sum(af[:-1] * np.log(1e-10 + af[:-1]),
                               axis=0))[patient_to_subtype[:, 2]]
                # good_af is a mask for useful columns
                good_af = (~np.any(
                    af.mask, axis=0)[patient_to_subtype[:, 2]]) & good_ref
                if good_af.sum() > 0.5 * good_af.shape[0]:
                    rho, pval = spearmanr(patient_entropy[good_af],
                                          subtype_entropy[good_af])
                    correlations.append({
                        'pcode': pcode,
                        'region': region,
                        'time': t,
                        'rho': rho,
                        'pval': pval
                    })

    return pd.DataFrame(correlations)
def collect_data():
    '''Collect CD4, viral load and the deep sequencing times of all patients

    The returned dict has the keys 'CD4' (result of get_CD4), 'VL' (result of
    get_VL) and 'deep sequencing', a dict from patient code to the dsi
    attribute of the patient loaded for every number in pnumbers.
    '''
    collected = {}
    collected['CD4'] = get_CD4()
    collected['VL'] = get_VL()

    collected['deep sequencing'] = dsi_by_patient = {}
    for pcode in ('p' + str(pn) for pn in pnumbers):
        dsi_by_patient[pcode] = Patient.load(pcode).dsi

    return collected
Example #8
0
def collect_ctl_data(patients, regions, ctl_kind='mhci=80'):
    '''Concatenate the CTL epitope tables of all patients

    For every patient code the patient is loaded, its epitope table is
    requested with get_ctl_epitopes(kind=ctl_kind, regions=regions) and a
    'pcode' column holding the patient name is added. The tables are joined
    with pd.concat.
    '''
    def epitopes_of(pcode):
        '''Epitope table of one patient, labelled with the patient name'''
        patient = Patient.load(pcode)
        # Add predicted epitopes
        table = patient.get_ctl_epitopes(kind=ctl_kind, regions=regions)
        table['pcode'] = patient.name
        return table

    return pd.concat([epitopes_of(pcode) for pcode in patients])
Example #9
0
def collect_correlations(patients,
                         regions,
                         cov_min=1000,
                         refname='HXB2',
                         min_dsi=1500):
    '''Correlation of entropy between patients

    Parameters:
       patients: patient codes, each one is loaded with Patient.load
       regions: regions to analyze
       cov_min: minimal coverage handed to get_allele_frequency_trajectories
       refname: name of the external reference used to match positions
       min_dsi: only samples with p.dsi >= min_dsi enter the average
                allele frequencies

    Returns:
       pandas DataFrame with one row per region and pair of patients and the
       columns 'pcode1', 'pcode2', 'pcode', 'region', 'rho', 'distance' and
       'pval'. 'distance' is the fraction of overlapping positions at which
       the initial sequences of the two patients differ.
    '''
    ps = [Patient.load(pcode) for pcode in patients]

    correlations = []
    for region in regions:
        print(region)

        # the quantities of a patient are computed once per region and
        # reused for every pair instead of being recomputed per pair
        previous = []
        for p in ps:
            aft = p.get_allele_frequency_trajectories(region,
                                                      cov_min=cov_min)
            af = aft[p.dsi >= min_dsi].mean(axis=0)
            # entropy over all but the last row of the frequency array
            entropy = np.maximum(
                0, -np.sum(af[:-1] * np.log(1e-10 + af[:-1]), axis=0))
            ptoref = p.map_to_external_reference(region, refname=refname)
            current = {
                'name': p.name,
                'entropy': entropy,
                'ref_positions': ptoref[:, 0],
                # columns 0 and 2 of the map: reference -> patient position
                'ref_to_patient': dict(ptoref[:, ::2]),
                'seq': p.get_initial_sequence(region),
            }

            # pair the patient with all patients that come before it
            for other in previous:
                overlap = np.intersect1d(current['ref_positions'],
                                         other['ref_positions'],
                                         assume_unique=True)
                ind1 = [current['ref_to_patient'][pos] for pos in overlap]
                ind2 = [other['ref_to_patient'][pos] for pos in overlap]

                af_ov = np.array([(current['entropy'][i1],
                                   other['entropy'][i2])
                                  for i1, i2 in zip(ind1, ind2)])
                rho, pval = spearmanr(af_ov[:, 0], af_ov[:, 1])

                seq1_ov = np.array([current['seq'][i1] for i1 in ind1])
                seq2_ov = np.array([other['seq'][i2] for i2 in ind2])
                dist = (seq1_ov != seq2_ov).mean()

                correlations.append({
                    'pcode1': current['name'],
                    'pcode2': other['name'],
                    'pcode': current['name'] + '-' + other['name'],
                    'region': region,
                    'rho': rho,
                    'distance': dist,
                    'pval': pval
                })

            previous.append(current)

    return pd.DataFrame(correlations)
def make_tree(region, fn_ali, fn_tree, tmpfile='/tmp/seqs.fasta', fasttreebin='FastTree'):
    '''Make tree of minor haplotype variants from all patients + outgroup

    Parameters:
       region: region whose haplotype alignments are collected; the same
               annotation is extracted from the reference as outgroup
       fn_ali: file the muscle alignment is written to
       fn_tree: file the tree is written to; the FastTree output is
                overwritten by the rerooted tree
       tmpfile: scratch FASTA file, removed after each external program
       fasttreebin: FastTree executable

    Raises:
       CalledProcessError (from sp.check_call) if muscle or FastTree exit
       with a nonzero status.
       ValueError if no leaf of the tree contains the outgroup id.
    '''

    # Collect haplotypes from patients
    seqs = []
    patients = ['p1', 'p2', 'p3', 'p5', 'p6', 'p8', 'p9', 'p11']
    for pcode in patients:
        p = Patient.load(pcode)

        for seq in p.get_haplotype_alignment(region):
            seq.id = 'patient_'+pcode+'_'+seq.id
            seq.name = 'patient_'+pcode+'_'+seq.name
            seq.description = 'patient '+pcode+', '+seq.description

            seqs.append(seq)

    # Add reference as an outgroup
    ref = HIVreference(load_alignment=False)
    refseq = ref.annotation[region].extract(ref.seq)
    seqs.append(refseq)

    # Align (Muscle)
    if os.path.isfile(tmpfile):
        os.remove(tmpfile)
    SeqIO.write(seqs, tmpfile, 'fasta')

    muscle_cmd = ['muscle', '-maxiters', '1', '-diags', '-in', tmpfile, '-out', fn_ali]
    try:
        # check_call: a failed alignment must not be parsed as if it succeeded
        sp.check_call(muscle_cmd)
    finally:
        os.remove(tmpfile)

    # Annotate for FastTree (does not accept double labels)
    seqs = []
    for seq in SeqIO.parse(fn_ali, 'fasta'):
        label = '_#'+str(len(seqs))
        seq.name = seq.name+label
        seq.id = seq.id+label
        seqs.append(seq)
    SeqIO.write(seqs, tmpfile, 'fasta')

    # FastTree
    try:
        sp.check_call([fasttreebin, '-nt', '-out', fn_tree, tmpfile])
    finally:
        os.remove(tmpfile)

    # reroot with outgroup
    tree = Phylo.read(fn_tree, 'newick')
    for leaf in tree.get_terminals():
        if refseq.id in leaf.name:
            break
    else:
        # without this the tree would silently be rooted on the last leaf
        raise ValueError('outgroup '+str(refseq.id)+' not found among the leaves of '+str(fn_tree))
    tree.root_with_outgroup(leaf)
    Phylo.write(tree, fn_tree, 'newick')
def collect_diverse_sites_aminoacids(patients, regions, cov_min=1000, af_threshold=0.01, subtype='patient', refname='HXB2'):
    '''Fraction of sites that are diverse for different quantiles of subtype entropy

    Parameters:
       patients: patient codes, each one is loaded with Patient.load
       regions: regions to analyze
       cov_min: minimal coverage handed to get_allele_frequency_trajectories
       af_threshold: a site counts as diverse if the summed frequency exceeds
                     the largest single frequency by more than this value
       subtype: 'patient' compares every patient to the reference of its own
                subtype (B, C or AE); any other value (e.g. 'any') is handed
                to HIVreferenceAminoacid and used for all patients
       refname: name of the external reference

    Returns:
       pandas DataFrame with one row per patient, region and time point; the
       columns are 'pcode', 'region', 'time' and one 'S<i+1>' per key i of
       the quantiles returned by get_quantiles.
    '''
    def load_reference(region, ref_subtype):
        '''Load a reference and attach the mask of its ungapped positions'''
        ref = HIVreferenceAminoacid(region, refname=refname, subtype=ref_subtype)
        ref.good_pos_in_reference = ref.get_ungapped(threshold=0.05)
        return ref

    ps = {pcode: Patient.load(pcode) for pcode in patients}

    # decide the mode once: the argument must not be reused as a loop variable,
    # otherwise the per-patient reference is never selected
    per_patient = (subtype == 'patient')

    diverse_fraction = []
    for region in regions:
        print(region)

        if per_patient:
            refs = {st: load_reference(region, st) for st in ['B', 'C', 'AE']}
        else:
            ref = load_reference(region, subtype)

        for pcode in patients:
            p = ps[pcode]

            if per_patient:
                ref = refs[p['Subtype']]

            aft = p.get_allele_frequency_trajectories(region, cov_min=cov_min, type='aa')
            # a scalar mask cannot be indexed by position, expand it
            if len(aft.mask.shape) < 2:
                aft.mask = np.zeros_like(aft, dtype=bool)

            # get patient to subtype map and subset entropy vectors
            # column 0 indexes the reference, column 1 the patient frequencies
            patient_to_subtype = p.map_to_external_reference_aminoacids(region, refname=refname)
            subtype_entropy = ref.get_entropy_in_patient_region(patient_to_subtype)
            entropy_quantiles = get_quantiles(4, subtype_entropy)
            good_ref = ref.good_pos_in_reference[patient_to_subtype[:, 0]]

            # loop over times and calculate the diverse fraction per quantile
            for t, af in izip(p.dsi, aft):
                # good_af is a mask for useful columns
                good_af = (~np.any(af.mask, axis=0)[patient_to_subtype[:, 1]]) & good_ref
                # tmp_af has only columns that are mappable to the reference
                tmp_af = af[:, patient_to_subtype[:, 1]]

                tmp = {'pcode': pcode, 'region': region, 'time': t}
                for i, Squant in entropy_quantiles.iteritems():
                    # Squant['ind'] is a mask for positions of this quantile
                    af_quant = tmp_af[:, Squant['ind']*good_af]
                    tmp['S'+str(i+1)] = np.mean(af_quant.max(axis=0) < af_quant.sum(axis=0) - af_threshold)
                diverse_fraction.append(tmp)

    return pd.DataFrame(diverse_fraction)
def collect_correlations_aminoacids(patients, regions, cov_min=1000, subtype='patient', refname='HXB2'):
    '''Correlation of subtype entropy and intra-patient diversity

    Parameters:
       patients: patient codes, each one is loaded with Patient.load
       regions: regions to analyze
       cov_min: minimal coverage handed to get_allele_frequency_trajectories
       subtype: 'patient' compares every patient to the reference of its own
                subtype (B, C or AE); any other value (e.g. 'any') is handed
                to HIVreferenceAminoacid and used for all patients
       refname: name of the external reference

    Returns:
       pandas DataFrame with the columns 'pcode', 'region', 'time', 'rho' and
       'pval'. A time point only yields a row if more than half of the mapped
       positions are usable.
    '''
    def load_reference(region, ref_subtype):
        '''Load a reference and attach the mask of its ungapped positions'''
        ref = HIVreferenceAminoacid(region, refname=refname, subtype=ref_subtype)
        ref.good_pos_in_reference = ref.get_ungapped(threshold=0.05)
        return ref

    ps = {pcode: Patient.load(pcode) for pcode in patients}

    # decide the mode once: the argument must not be reused as a loop variable,
    # otherwise the per-patient reference is never selected
    per_patient = (subtype == 'patient')

    correlations = []
    for region in regions:
        print(region)

        if per_patient:
            refs = {st: load_reference(region, st) for st in ['B', 'C', 'AE']}
        else:
            ref = load_reference(region, subtype)

        for pcode in patients:
            p = ps[pcode]

            if per_patient:
                ref = refs[p['Subtype']]

            aft = p.get_allele_frequency_trajectories(region, cov_min=cov_min,
                                                      type='aa')
            # a scalar mask cannot be indexed by position, expand it
            if len(aft.mask.shape) < 2:
                aft.mask = np.zeros_like(aft, dtype=bool)

            # get patient to subtype map and subset entropy vectors
            # column 0 indexes the reference, column 1 the patient frequencies
            patient_to_subtype = p.map_to_external_reference_aminoacids(region, refname=refname)
            subtype_entropy = ref.get_entropy_in_patient_region(patient_to_subtype)
            good_ref = ref.good_pos_in_reference[patient_to_subtype[:, 0]]

            # loop over times and calculate the correlation for each value
            for t, af in izip(p.dsi, aft):
                # entropy over all but the last row of the frequency array
                patient_entropy = np.maximum(0, -np.sum(af[:-1]*np.log(1e-10+af[:-1]), axis=0))[patient_to_subtype[:, 1]]
                # good_af is a mask for useful columns
                good_af = (~np.any(af.mask, axis=0)[patient_to_subtype[:, 1]]) & good_ref
                if good_af.sum() > 0.5 * good_af.shape[0]:
                    rho, pval = spearmanr(patient_entropy[good_af], subtype_entropy[good_af])
                    correlations.append({'pcode': pcode,
                                         'region': region,
                                         'time': t,
                                         'rho': rho,
                                         'pval': pval})

    return pd.DataFrame(correlations)
def get_toaway_histograms(subtype, Sc=1):
    '''
    calculate allele frequency histograms for each patient and each time points
    separately for sites that agree or disagree with consensus.
    this can be done for a low and high entropy category with the threshold set by Sc

    Parameters:
       subtype: 'patient' loads the reference of each patient's own subtype;
                any other value (e.g. 'any') is handed to HIVreference once
                and used for all patients
       Sc: entropy threshold in bits, sites below it are 'low', others 'high'

    Returns:
       (to_histogram, away_histogram): dicts keyed by (pcode, Sbin) holding a
       dict time -> histogram counts over af_bins. away_histogram covers sites
       whose initial state equals the reference consensus, to_histogram the
       sites where it differs.

    Uses the module level names patients, regions, cov_min and af_bins.
    '''
    def load_reference(ref_subtype):
        '''Load the reference and the mask of its ungapped positions'''
        reference = HIVreference(refname='HXB2', subtype=ref_subtype)
        return reference, reference.get_ungapped(threshold=0.05)

    away_histogram = {(pcode, Sbin): {} for Sbin in ['low', 'high'] for pcode in patients}
    to_histogram = {(pcode, Sbin): {} for Sbin in ['low', 'high'] for pcode in patients}
    # a reference shared by all patients (e.g. 'any' meaning comparison to
    # groupM) can be loaded here
    if subtype != 'patient':
        hxb2, good_pos_in_reference = load_reference(subtype)

    # determine divergence and minor variation at sites that agree with consensus or not
    for pcode in patients:
        try:
            p = Patient.load(pcode)
        except Exception:
            # skip patients that cannot be loaded, but let KeyboardInterrupt
            # and SystemExit propagate
            print("Can't load patient %s" % pcode)
            continue

        print('subtype: %s patient %s' % (subtype, pcode))
        if subtype == 'patient':  # load the ref alignment of the patient's subtype
            hxb2, good_pos_in_reference = load_reference(p['Subtype'])

        for region in regions:
            aft = p.get_allele_frequency_trajectories(region, cov_min=cov_min)

            # get patient to subtype map and subset entropy vectors, convert to bits
            # column 0 indexes the reference, column 2 the patient arrays
            patient_to_subtype = p.map_to_external_reference(region, refname='HXB2')
            subtype_entropy = hxb2.get_entropy_in_patient_region(patient_to_subtype)/np.log(2.0)
            ancestral = p.get_initial_indices(region)[patient_to_subtype[:, 2]]
            consensus = hxb2.get_consensus_indices_in_patient_region(patient_to_subtype)
            good_ref = good_pos_in_reference[patient_to_subtype[:, 0]]
            away_sites = ancestral == consensus
            aft_HXB2 = aft[:, :, patient_to_subtype[:, 2]]

            for H, sites in [(away_histogram, away_sites), (to_histogram, ~away_sites)]:
                for Sbin in ['low', 'high']:
                    if Sbin == 'low':
                        ind = sites & (subtype_entropy < Sc) & good_ref
                    else:
                        ind = sites & (subtype_entropy >= Sc) & good_ref

                    # the selection does not depend on time, compute it once
                    positions = np.where(ind)[0]
                    states = ancestral[ind]
                    # NOTE(review): the key has no region component, a later
                    # region replaces the histogram of an earlier one at the
                    # same time point - confirm this is intended
                    for ti, t in enumerate(p.dsi):
                        y, x = np.histogram(aft_HXB2[ti, states, positions].compressed(), bins=af_bins)
                        H[(pcode, Sbin)][t] = y

    return to_histogram, away_histogram
def collect_correlations(patients, regions, cov_min=1000, refname='HXB2', min_dsi=1500):
    '''Correlation of entropy between patients

    Parameters:
       patients: patient codes, each one is loaded with Patient.load
       regions: regions to analyze
       cov_min: minimal coverage handed to get_allele_frequency_trajectories
       refname: name of the external reference used to match positions
       min_dsi: only samples with p.dsi >= min_dsi enter the average
                allele frequencies

    Returns:
       pandas DataFrame with one row per region and pair of patients and the
       columns 'pcode1', 'pcode2', 'pcode', 'region', 'rho', 'distance' and
       'pval'. 'distance' is the fraction of overlapping positions at which
       the initial sequences of the two patients differ.
    '''
    def summarize(p, region):
        '''Entropy, reference map and initial sequence of one patient'''
        aft = p.get_allele_frequency_trajectories(region, cov_min=cov_min)
        af = aft[p.dsi >= min_dsi].mean(axis=0)
        # entropy over all but the last row of the frequency array
        en = np.maximum(0, -np.sum(af[:-1]*np.log(1e-10+af[:-1]), axis=0))
        ptoref = p.map_to_external_reference(region, refname=refname)
        # columns 0 and 2 of the map: reference -> patient position
        return p.name, en, ptoref[:, 0], dict(ptoref[:, ::2]), p.get_initial_sequence(region)

    ps = [Patient.load(pcode) for pcode in patients]

    correlations = []
    for region in regions:
        print(region)

        # every patient is summarized once per region and then paired with
        # all patients before it, instead of recomputing the second patient
        # of each pair
        summaries = []
        for p in ps:
            name1, en1, refpos1, ptorefd1, seq1 = summarize(p, region)

            for name2, en2, refpos2, ptorefd2, seq2 in summaries:
                overlap = np.intersect1d(refpos1, refpos2, assume_unique=True)
                af_ov = np.array([(en1[ptorefd1[pos]], en2[ptorefd2[pos]]) for pos in overlap])
                rho, pval = spearmanr(af_ov[:, 0], af_ov[:, 1])

                seq1_ov = np.array([seq1[ptorefd1[pos]] for pos in overlap])
                seq2_ov = np.array([seq2[ptorefd2[pos]] for pos in overlap])
                dist = (seq1_ov != seq2_ov).mean()

                correlations.append({'pcode1': name1,
                                     'pcode2': name2,
                                     'pcode': name1+'-'+name2,
                                     'region': region,
                                     'rho': rho,
                                     'distance': dist,
                                     'pval': pval})

            summaries.append((name1, en1, refpos1, ptorefd1, seq1))

    return pd.DataFrame(correlations)
Example #15
0
def collect_data_LD(patients):
    '''Collect data for LD plot

    For the fragments F1 to F6 in all_fragments, the pair frequencies of all
    sufficiently deep samples of all patients are turned into LD and Dp with
    LDfunc and averaged in bins of the distance between the two positions.
    The same is done for the controls 'PCR1' and 'PCR2' from control_LD.

    Parameters:
       patients: patient codes, each one is loaded with Patient.load

    Returns:
       dict with the binned averages under 'LDrsq' and 'Dp' (keyed by
       fragment or control name), the bin edges and centers under 'bins' and
       'binc', the thresholds used and the list of patients.
    '''
    dmin = 40
    dmin_pad = 200
    var_min = 0.2
    cov_min = 200
    LD_vs_distance = {}
    Dp_vs_distance = {}
    bins = np.arange(0, 401, 40)
    binc = (bins[:-1]+bins[1:])*0.5

    def binned_averages(dists, weights_LD, weights_Dp):
        '''Average LD and Dp per distance bin'''
        yn, xn = np.histogram(dists, bins=bins)
        y_LD, x = np.histogram(dists, weights=weights_LD, bins=bins)
        y_Dp, x = np.histogram(dists, weights=weights_Dp, bins=bins)
        # the small offset avoids division by zero in empty bins
        return y_LD/(1e-10+yn), y_Dp/(1e-10+yn)

    # patients and their fragment depths do not depend on the fragment,
    # load them on first use and keep them for the remaining fragments
    loaded = {}

    def patient_and_depths(pcode):
        '''Patient with its unpadded and padded fragment depths'''
        if pcode not in loaded:
            p = Patient.load(pcode)
            depth = p.get_fragment_depth(pad=False, limit_to_dilution=False)
            depth_pad = p.get_fragment_depth(pad=True, limit_to_dilution=False)
            loaded[pcode] = (p, depth, depth_pad)
        return loaded[pcode]

    fragments = ['F'+str(i) for i in range(1, 7)]
    for frag in all_fragments:
        if frag not in fragments:
            continue
        frag_index = all_fragments.index(frag)
        dists = []
        weights_LD = []
        weights_Dp = []
        for pcode in patients:
            p, depth, depth_pad = patient_and_depths(pcode)

            for si, sample in enumerate(p.samples):

                # check for sufficient depth
                if ((depth[si][frag_index] > dmin) or
                    (depth_pad[si][frag_index] > dmin_pad)):

                    positions, af2p, cov, af1p = sample.get_pair_frequencies(frag, var_min=var_min)

                    if positions is None:
                        continue
                    # NOTE(review): LDfunc gets cov_min=100 while pairs are
                    # filtered with cov_min=200 below - confirm this is intended
                    LD, Dp, p12 = LDfunc(af2p, af1p, cov, cov_min=100)

                    X, Y = np.meshgrid(positions, positions)
                    # zero coverage on the diagonal drops pairs of a position with itself
                    np.fill_diagonal(cov, 0)
                    covered = cov >= cov_min
                    dists.extend(np.abs(X-Y)[covered])
                    weights_LD.extend(LD[covered])
                    weights_Dp.extend(Dp[covered])
                    print (pcode, si, frag,
                           " # of positions:", len(positions),
                           'depth:', depth[si][frag_index])
                else:
                    print (pcode, si, frag, "insufficient depth:",
                           depth[si][frag_index],
                           depth_pad[si][frag_index])

        LD_vs_distance[frag], Dp_vs_distance[frag] = binned_averages(
            dists, weights_LD, weights_Dp)

    for pcr in ['PCR1', 'PCR2']:
        positions, af2p, cov, af1p = control_LD(pcr, var_min=var_min)
        LD, Dp, p12 = LDfunc(af2p, af1p, cov, cov_min=100)

        X, Y = np.meshgrid(positions, positions)
        np.fill_diagonal(cov, 0)
        covered = cov >= cov_min
        dists = np.abs(X-Y)[covered].flatten()
        weights_LD = LD[covered].flatten()
        weights_Dp = Dp[covered].flatten()

        LD_vs_distance[pcr], Dp_vs_distance[pcr] = binned_averages(
            dists, weights_LD, weights_Dp)

    data = {'Dp': Dp_vs_distance,
            'LDrsq': LD_vs_distance,
            'bins': bins, 'binc': binc,
            'var_min': var_min, 'cov_min': cov_min,
            'dmin': dmin, 'dmin_pad': dmin_pad,
            'patients': patients}

    return data
Example #16
0
def collect_to_away_aminoacids(patients, regions, Sbins=[0, 0.1, 0.3, 3], cov_min=1000,
                               refname='HXB2',
                               subtype='patient'):
    '''Collect allele frequencies polarized from cross-sectional consensus for amino acids

    Collect minor variant frequencies, divergences, etc separately for sites that agree or disagree
    with consensus. consensus is either group M consensus (subtype='any') or the subtype of the 
    respective patient (subtype='patient'). In addition, these quantities are stratified by entropy
    '''
    ps = {pcode: Patient.load(pcode) for pcode in patients}

    minor_variants = []
    to_away_divergence = []
    to_away_minor = []
    consensus_distance = {}
    for region in regions:
        print region

        # if subtypes == 'any' meaning comparison to groupM, we can load the reference here
        if subtype == 'any':
            ref = HIVreferenceAminoacid(region, refname=refname, subtype=subtype)
            ref.good_pos_in_reference = ref.get_ungapped(threshold=0.05)
        else:
            refs = {}
            for subtype in ['B', 'C', 'AE']:
                ref = HIVreferenceAminoacid(region, refname=refname, subtype=subtype)
                ref.good_pos_in_reference = ref.get_ungapped(threshold=0.05)
                refs[subtype] = ref

        # determine divergence and minor variation at sites that agree with consensus or not
        for pi, pcode in enumerate(patients):
            p = ps[pcode]
            if subtype == 'patient': # if we take the subtype of the patient, load specific ref alignment here
                ref = refs[p['Subtype']]

            aft = p.get_allele_frequency_trajectories(region, cov_min=cov_min,
                                                      type='aa')

            # get patient to subtype map and subset entropy vectors, convert to bits
            patient_to_subtype = p.map_to_external_reference_aminoacids(region, refname=refname)
            subtype_entropy = ref.get_entropy_in_patient_region(patient_to_subtype) / np.log(2.0)
            ancestral = p.get_initial_indices(region, type='aa')[patient_to_subtype[:, -1]]
            consensus = ref.get_consensus_indices_in_patient_region(patient_to_subtype)
            away_sites = ancestral == consensus
            good_ref = ref.good_pos_in_reference[patient_to_subtype[:, 0]]
            consensus_distance[(pcode, region)] = np.mean(~away_sites)
            print pcode, region, "dist:", 1-away_sites.mean(), "useful_ref:", good_ref.mean()

            # loop over times and calculate the af in entropy bins
            for t, af in izip(p.dsi, aft):
                good_af = (((~np.any(af.mask, axis=0))
                            #&(aft[0].max(axis=0)>0.9)
                            &(af.argmax(axis=0) < af.shape[0] - 2))[patient_to_subtype[:, -1]]) \
                            & good_ref
                # make version of all arrays that contain only unmasked sites and are also ungapped
                clean_af = af[:,patient_to_subtype[:, -1]][:-1, good_af]
                clean_away = away_sites[good_af]
                clean_consensus = consensus[good_af]
                clean_ancestral = ancestral[good_af]
                clean_entropy = subtype_entropy[good_af]
                clean_entropy_bins = [(clean_entropy >= t_lower) & (clean_entropy < t_upper)
                                    for t_lower, t_upper in zip(Sbins[:-1], Sbins[1:])]
                clean_minor = clean_af.sum(axis=0) - clean_af.max(axis=0)
                clean_derived = clean_af.sum(axis=0) - clean_af[clean_ancestral,np.arange(clean_ancestral.shape[0])]
                print pcode, region, t
                
                # for each entropy bin, calculate the average divergence and minor variation
                for sbin, sites in enumerate(clean_entropy_bins):
                    minor_variants.append({'pcode': pcode,
                                           'region': region,
                                           'time': t,
                                           'S_bin': sbin,
                                           'af_away_minor':  np.mean(clean_minor[sites&clean_away]), 
                                           'af_away_derived':np.mean(clean_derived[sites&clean_away]),
                                           'af_to_minor':    np.mean(clean_minor[sites&(~clean_away)]), 
                                           'af_to_derived':  np.mean(clean_derived[sites&(~clean_away)])
                                          })

                # calculate the minor variation at sites were the founder differs from consensus
                # in different allele frequency bins
                clean_reversion = clean_af[clean_consensus,np.arange(clean_consensus.shape[0])]*(~clean_away)
                clean_total_divergence = clean_af.sum(axis=0) - clean_af[clean_ancestral,np.arange(clean_ancestral.shape[0])]
                to_away_divergence.append({'pcode': pcode,
                                           'region': region,
                                           'time': t,
                                           'reversion': np.mean(clean_reversion), 
                                           'divergence': np.mean(clean_total_divergence)
                                          })

                af_thres = [0, 0.05, 0.1, 0.25, 0.5, 0.95, 1.0]
                rev_tmp = clean_af[clean_consensus,np.arange(clean_consensus.shape[0])][~clean_away]
                der_tmp = clean_derived[~clean_away] 
                for ai,(af_lower, af_upper) in enumerate(zip(af_thres[:-1], af_thres[1:])):
                    to_away_minor.append({'pcode': pcode,
                                          'region': region,
                                          'time': t,
                                          'af_bin': ai,
                      'reversion_spectrum': np.mean(rev_tmp*(rev_tmp>=af_lower)*(rev_tmp<af_upper)),
                      'minor_reversion_spectrum': np.mean(der_tmp*(der_tmp>=af_lower)*(der_tmp<af_upper))
                                         })

    return (pd.DataFrame(minor_variants),
            pd.DataFrame(to_away_divergence),
            pd.DataFrame(to_away_minor),
            consensus_distance)
    patients = ['p1', 'p2', 'p3','p5', 'p6', 'p8', 'p9', 'p11']

    rate_or_gof = 0
    window_size = 300
    cov_min = 200

    if not os.path.isfile(fn_data) or params.redo:
        print("Regenerating plot data")
        cats = [{'name': 'total', 'only_substitutions': False},
                {'name': 'substitutions', 'only_substitutions': True},
               ]
        ref = {key: -np.ones((len(patients), 10000), dtype=float) for key in ['total', 'substitutions']}
        evo_rates = {key: {} for key in ref}
        for pi, pcode in enumerate(patients):
            p = Patient.load(pcode)
            to_ref = p.map_to_external_reference('genomewide')

            for cat in cats:
                div_traj = get_divergence_trajectory(p, cov_min=cov_min,
                                                     sequence_type=params.type,
                                                     only_substitutions=cat['only_substitutions'])

                print (pcode, cat['name']+' divergence',
                       zip(np.round(p.ysi),
                           [[np.round(x[x<th].sum()) for th in [.1, .5, 0.95, 1.0]] for x in div_traj]))

                if params.type == 'nuc':
                    min_valid_fraction = 0.95
                else:
                    # Two out of three are masked by design
Example #18
0
def collect_data_richard(patients, regions, syn_degeneracy=2):
    '''Collect data for divergence and diversity'''

    syn_divergence = {reg: {p: [] for p in patients} for reg in regions}
    syn_diversity = {reg: {p: [] for p in patients} for reg in regions}
    nonsyn_divergence = {reg: {p: [] for p in patients} for reg in regions}
    nonsyn_diversity = {reg: {p: [] for p in patients} for reg in regions}
    time_bins = np.array([0, 200, 500, 1000, 1500, 2000, 3000, 5000])

    nbins = 10
    sfs_tmin = 1000
    sfs = {
        'syn': np.zeros(nbins, dtype=float),
        'nonsyn': np.zeros(nbins, dtype='float'),
        'bins': np.linspace(0.01, 0.99, nbins + 1)
    }
    time_binc = 0.5 * (time_bins[1:] + time_bins[:-1])
    cov_min = 100
    for pi, pcode in enumerate(patients):
        try:
            p = Patient.load(pcode)
        except:
            print "Can't load patient", pcode
        else:
            for region in regions:
                for prot in regions[region]:
                    initial_indices = p.get_initial_indices(prot)
                    aft = p.get_allele_frequency_trajectories(prot,
                                                              cov_min=cov_min)
                    gaps = p.get_gaps_by_codon(prot, pad=2, threshold=0.05)
                    syn_mask = p.get_syn_mutations(prot)
                    syn_pos = (syn_mask.sum(axis=0) > 1) * (gaps == False)
                    nonsyn_pos = (syn_mask.sum(axis=0) <= 1) * (
                        p.get_constrained(prot) == False) * (gaps == False)
                    print pcode, prot, syn_pos.sum(), nonsyn_pos.sum()
                    syn_divergence[region][pcode].extend([
                        (t, divergence(af[:, syn_pos],
                                       initial_indices[syn_pos]))
                        for t, af in zip(p.dsi, aft)
                    ])
                    syn_diversity[region][pcode].extend([
                        (t, diversity(af[:, syn_pos]))
                        for t, af in zip(p.dsi, aft)
                    ])
                    nonsyn_divergence[region][pcode].extend([
                        (t,
                         divergence(af[:, nonsyn_pos],
                                    initial_indices[nonsyn_pos]))
                        for t, af in zip(p.dsi, aft)
                    ])
                    nonsyn_diversity[region][pcode].extend([
                        (t, diversity(af[:, nonsyn_pos]))
                        for t, af in zip(p.dsi, aft)
                    ])

                    syn_derived = syn_mask.copy()
                    syn_derived[initial_indices,
                                np.arange(syn_derived.shape[1])] = False
                    for t, af in izip(p.dsi, aft):
                        if t > sfs_tmin:
                            y, x = np.histogram(af[syn_derived].flatten(),
                                                bins=sfs['bins'])
                            sfs['syn'] += y
                    nonsyn_derived = syn_mask == False
                    nonsyn_derived *= (p.get_constrained(prot)
                                       == False) * (gaps == False)
                    nonsyn_derived[initial_indices,
                                   np.arange(syn_derived.shape[1])] = False
                    for t, af in izip(p.dsi, aft):
                        if t > sfs_tmin:
                            y, x = np.histogram(af[nonsyn_derived],
                                                bins=sfs['bins'])
                            sfs['nonsyn'] += y

    for tmp_data in [
            syn_divergence, syn_diversity, nonsyn_diversity, nonsyn_divergence
    ]:
        for region in regions:
            tmp = np.vstack([np.array(tmp_data[region][p]) for p in patients])
            tmp_clean = tmp[-np.isnan(tmp[:, 1]), :]
            y, x = np.histogram(tmp_clean[:, 0],
                                bins=time_bins,
                                weights=tmp_clean[:, 1])
            yn, x = np.histogram(tmp_clean[:, 0], bins=time_bins)
            tmp_data[region] = {
                'avg': y / (1e-10 + yn),
                'bins': time_binc,
                'raw': tmp_data[region]
            }

    data = {
        'syn_diversity': syn_diversity,
        'syn_divergence': syn_divergence,
        'nonsyn_diversity': nonsyn_diversity,
        'nonsyn_divergence': nonsyn_divergence,
        'sfs': sfs
    }

    return data
Example #19
0
def collect_data_fabio(patients, regions, cov_min=100, syn_degeneracy=2):
    '''Collect data for divergence and diversity'''
    import pandas as pd
    from itertools import izip

    # Prepare SFS
    nbins = 10
    sfs_tmin = 1000
    sfs = {
        'syn': np.zeros(nbins, dtype=float),
        'nonsyn': np.zeros(nbins, dtype=float),
        'bins': np.linspace(0.01, 0.99, nbins + 1),
    }

    # Collect into DataFrame
    data = []
    for pi, pcode in enumerate(patients):
        p = Patient.load(pcode)
        for region, prots in regions.iteritems():
            for prot in prots:
                aft = p.get_allele_frequency_trajectories(prot,
                                                          cov_min=cov_min)
                initial_indices = p.get_initial_indices(prot)
                gaps = p.get_gaps_by_codon(prot, pad=2, threshold=0.05)

                # Classify syn/nonsyn POSITIONS
                # NOTE: this is not fully correct because some positions (2-fold
                # degenerate) are both syn and nonsyn, but it's close enough
                syn_mask = p.get_syn_mutations(prot)
                syn_sum = syn_mask.sum(axis=0)
                # NOTE: syn_mask == 0 are substitutions, they make up most
                # of the nonsynonymous signal
                pos = {
                    'syn': (syn_sum >= syn_degeneracy) & (~gaps),
                    'nonsyn':
                    (syn_sum <= 1) & (~p.get_constrained(prot)) & (~gaps),
                }

                print pcode, prot, pos['syn'].sum(), pos['nonsyn'].sum()

                # Divergence/diversity
                for t, af in izip(p.dsi, aft):
                    for mutclass, ind in pos.iteritems():
                        data.append({
                            'pcode':
                            pcode,
                            'time':
                            t,
                            'region':
                            region,
                            'protein':
                            prot,
                            'nsites':
                            ind.sum(),
                            'mutclass':
                            mutclass,
                            'divergence':
                            divergence(af[:, ind], initial_indices[ind]),
                            'diversity':
                            diversity(af[:, ind]),
                        })

                # Site frequency spectrum
                syn_derived = syn_mask.copy()
                syn_derived[initial_indices,
                            np.arange(syn_derived.shape[1])] = False
                nonsyn_derived = (-syn_mask) & (-p.get_constrained(prot)) & (
                    -gaps)
                nonsyn_derived[initial_indices,
                               np.arange(syn_derived.shape[1])] = False

                for t, af in izip(p.dsi, aft):
                    if t < sfs_tmin:
                        continue

                    sfs['syn'] += np.histogram(af[syn_derived],
                                               bins=sfs['bins'])[0]
                    sfs['nonsyn'] += np.histogram(af[nonsyn_derived],
                                                  bins=sfs['bins'])[0]

    data = pd.DataFrame(data)
    data['divergence'] = data['divergence'].astype(float)
    data['diversity'] = data['diversity'].astype(float)
    return {'divdiv': data, 'sfs': sfs}
Example #20
0
            {
                'name': 'total',
                'only_substitutions': False
            },
            {
                'name': 'substitutions',
                'only_substitutions': True
            },
        ]
        ref = {
            key: -np.ones((len(patients), 10000), dtype=float)
            for key in ['total', 'substitutions']
        }
        evo_rates = {key: {} for key in ref}
        for pi, pcode in enumerate(patients):
            p = Patient.load(pcode)
            to_ref = p.map_to_external_reference('genomewide')

            for cat in cats:
                div_traj = get_divergence_trajectory(
                    p,
                    cov_min=cov_min,
                    sequence_type=params.type,
                    only_substitutions=cat['only_substitutions'])

                print(
                    pcode, cat['name'] + ' divergence',
                    zip(np.round(p.ysi), [[
                        np.round(x[x < th].sum())
                        for th in [.1, .5, 0.95, 1.0]
                    ] for x in div_traj]))
Example #21
0
def collect_data_LD(patients):
    '''Collect data for LD plot

    For each fragment F1..F6 the pairwise LD and D' values of all patient
    samples with sufficient depth are averaged in distance bins; the same is
    done for the two PCR controls.

    Parameters:
        patients: patient codes, each passed to Patient.load

    Returns:
        dict with the binned averages ('LDrsq', 'Dp', keyed by fragment or
        control name), the bin edges and centers, and the thresholds used.
    '''
    dmin = 40
    dmin_pad = 200
    var_min = 0.2
    cov_min = 200
    LD_vs_distance = {}
    Dp_vs_distance = {}
    bins = np.arange(0, 401, 40)
    binc = (bins[:-1] + bins[1:]) * 0.5
    valid_fragments = ['F' + str(i) for i in xrange(1, 7)]

    # Load every patient and its depth tables once instead of once per fragment
    loaded = []
    for pcode in patients:
        p = Patient.load(pcode)
        depth = p.get_fragment_depth(pad=False, limit_to_dilution=False)
        depth_pad = p.get_fragment_depth(pad=True, limit_to_dilution=False)
        loaded.append((pcode, p, depth, depth_pad))

    for frag in all_fragments:
        if frag not in valid_fragments:
            continue
        frag_index = all_fragments.index(frag)
        dists = []
        weights_LD = []
        weights_Dp = []
        for pcode, p, depth, depth_pad in loaded:
            for si, sample in enumerate(p.samples):

                # check for sufficient depth
                if ((depth[si][frag_index] > dmin) or
                    (depth_pad[si][frag_index] > dmin_pad)):

                    positions, af2p, cov, af1p = sample.get_pair_frequencies(
                        frag, var_min=var_min)

                    if positions is None:
                        continue
                    # NOTE(review): LDfunc gets cov_min=100 while pairs are
                    # selected with cov_min=200 below; confirm this is intended
                    LD, Dp, p12 = LDfunc(af2p, af1p, cov, cov_min=100)

                    X, Y = np.meshgrid(positions, positions)
                    # zero the diagonal so a site paired with itself never passes the cut
                    np.fill_diagonal(cov, 0)
                    covered = cov >= cov_min
                    dists.extend(np.abs(X - Y)[covered])
                    weights_LD.extend(LD[covered])
                    weights_Dp.extend(Dp[covered])
                    print(pcode, si, frag, " # of positions:", len(positions),
                          'depth:', depth[si][frag_index])
                else:
                    print(pcode, si, frag, "insufficient depth:",
                          depth[si][frag_index],
                          depth_pad[si][frag_index])

        # weighted histogram / count histogram = mean per distance bin;
        # the small offset avoids division by zero in empty bins
        yn, xn = np.histogram(dists, bins=bins)
        y, x = np.histogram(dists, weights=weights_LD, bins=bins)
        LD_vs_distance[frag] = y / (1e-10 + yn)
        y, x = np.histogram(dists, weights=weights_Dp, bins=bins)
        Dp_vs_distance[frag] = y / (1e-10 + yn)

    for pcr in ['PCR1', 'PCR2']:
        positions, af2p, cov, af1p = control_LD(pcr, var_min=var_min)
        LD, Dp, p12 = LDfunc(af2p, af1p, cov, cov_min=100)

        X, Y = np.meshgrid(positions, positions)
        np.fill_diagonal(cov, 0)
        covered = cov >= cov_min
        dists = np.abs(X - Y)[covered].flatten()
        weights_LD = LD[covered].flatten()
        weights_Dp = Dp[covered].flatten()

        yn, xn = np.histogram(dists, bins=bins)
        y, x = np.histogram(dists, weights=weights_LD, bins=bins)
        LD_vs_distance[pcr] = y / (1e-10 + yn)
        y, x = np.histogram(dists, weights=weights_Dp, bins=bins)
        Dp_vs_distance[pcr] = y / (1e-10 + yn)
    data = {
        'Dp': Dp_vs_distance,
        'LDrsq': LD_vs_distance,
        'bins': bins,
        'binc': binc,
        'var_min': var_min,
        'cov_min': cov_min,
        'dmin': dmin,
        # report the threshold actually used rather than a repeated literal
        'dmin_pad': dmin_pad,
        'patients': patients
    }

    return data
Example #22
0
    parser = argparse.ArgumentParser(
        description="make figure for SNP correlations")
    parser.add_argument('--redo', action='store_true', help='recalculate data')
    params = parser.parse_args()

    VERBOSE = 2
    pname = 'p11'
    n_time = 4

    username = os.path.split(os.getenv('HOME'))[-1]
    foldername = get_figure_folder(username, 'controls')
    fn_data = foldername + 'data/'
    fn_data = fn_data + 'allele_frequency_overlap.pickle'

    if not os.path.isfile(fn_data) or params.redo:
        patient = Patient.load(pname)
        samples = patient.samples[n_time]
        data = get_allele_frequency_overlap(sample,
                                            overlaps,
                                            cov_min=cov_min,
                                            VERBOSE=VERBOSE,
                                            qual_min=qual_min)

        estimate_templates_overlaps(sample, data)

        store_data(data, fn_data)
    else:
        data = load_data(fn_data)

    filename = foldername + 'allele_frequency_overlap'
    plot_allele_frequency_overlap(
Example #23
0
def collect_substitution_data(patients, regions, cov_min=100):
    from Bio.Seq import translate
    data = []
    for pi, pcode in enumerate(patients):
        p = Patient.load(pcode)

        for region in regions:
            print p.name, region

            initial_indices = p.get_initial_indices(region)
            aft = p.get_allele_frequency_trajectories(region, cov_min=cov_min)
            if np.isscalar(aft.mask):
                aft.mask = np.zeros_like(aft, bool)

            coomap = p.map_to_external_reference(region)[:, ::2]
            coomapd = {
                'pat_to_subtype': dict(coomap[:, ::-1]),
                'subtype_to_pat': dict(coomap)
            }

            for posdna in xrange(aft.shape[-1]):
                # Get the position in reference coordinates
                if posdna not in coomapd['pat_to_subtype']:
                    pos_sub = -1  #continue
                else:
                    pos_sub = coomapd['pat_to_subtype'][posdna]

                # Get allele frequency trajectory
                aftpos = aft[:, :, posdna]
                ind = -aftpos[:, 0].mask
                if ind.sum() == 0:
                    continue
                aftpos = aftpos[ind]
                timespos = p.dsi[ind]

                # Ancestral allele
                ianc = initial_indices[posdna]
                anc = alpha[ianc]

                # Ignore indels
                if ianc >= 4:
                    continue

                # Check for fixation
                if (aftpos[0, ianc] < 0.7) or np.min(aftpos[:, ianc]) > 0.2:
                    continue

                # Get codon
                ci = posdna // 3
                rf = posdna % 3
                cod_anc = ''.join(alpha[initial_indices[ci * 3:(ci + 1) * 3]])
                if '-' in cod_anc:
                    continue
                aa_anc = translate(cod_anc)

                # Check which allele (if any) is fixing
                for inuc, nuc in enumerate(alpha[:4]):
                    if nuc == anc:
                        continue

                    if aftpos[-1, inuc] < 0.95:
                        continue

                    # NOTE: OK, it's a substitution (max 1 per site)
                    break
                else:
                    continue

                # Assign a time to the substitution
                ist = (aftpos[:, inuc] > 0.5).nonzero()[0][0]
                tsubst = 0.5 * (timespos[ist - 1] + timespos[ist])

                nuc = alpha[inuc]
                mut = anc + '->' + nuc

                # Define transition/transversion
                if frozenset(nuc + anc) in (frozenset('CT'), frozenset('AG')):
                    trclass = 'ts'
                else:
                    trclass = 'tv'

                # Check syn/nonsyn
                cod_nuc = cod_anc[:rf] + nuc + cod_anc[rf + 1:]
                aa_nuc = translate(cod_nuc)
                is_syn = aa_nuc == aa_anc

                datum = {
                    'pcode': p.name,
                    'region': region,
                    'pos_patient': posdna,
                    'pos_ref': pos_sub,
                    'mut': mut,
                    'trclass': trclass,
                    'syn': is_syn,
                    'time': tsubst,
                }

                data.append(datum)

    data = pd.DataFrame(data)
    return data
def collect_data_richard(patients, regions, syn_degeneracy=2):
    '''Collect data for divergence and diversity'''

    syn_divergence = {reg:{p:[] for p in patients} for reg in regions}
    syn_diversity = {reg:{p:[] for p in patients} for reg in regions}
    nonsyn_divergence = {reg:{p:[] for p in patients} for reg in regions}
    nonsyn_diversity = {reg:{p:[] for p in patients} for reg in regions}
    time_bins = np.array([0, 200, 500, 1000, 1500, 2000, 3000, 5000])

    nbins=10
    sfs_tmin=1000
    sfs = {'syn':np.zeros(nbins, dtype=float),
           'nonsyn':np.zeros(nbins, dtype='float'),
           'bins':np.linspace(0.01,0.99,nbins+1)}
    time_binc = 0.5*(time_bins[1:]+time_bins[:-1])
    cov_min = 100
    for pi, pcode in enumerate(patients):
        try:
            p = Patient.load(pcode)
        except:
            print "Can't load patient", pcode
        else:
            for region in regions:
                for prot in regions[region]:
                    initial_indices = p.get_initial_indices(prot)
                    aft = p.get_allele_frequency_trajectories(prot, cov_min=cov_min)
                    gaps = p.get_gaps_by_codon(prot, pad=2, threshold=0.05)
                    syn_mask = p.get_syn_mutations(prot)
                    syn_pos = (syn_mask.sum(axis=0)>1)*(gaps==False)
                    nonsyn_pos = (syn_mask.sum(axis=0)<=1)*(p.get_constrained(prot)==False)*(gaps==False)
                    print pcode, prot, syn_pos.sum(), nonsyn_pos.sum()
                    syn_divergence[region][pcode].extend([(t, divergence(af[:,syn_pos],
                                             initial_indices[syn_pos])) for t,af in zip(p.dsi, aft)])
                    syn_diversity[region][pcode].extend([(t, diversity(af[:,syn_pos]))
                                                        for t,af in zip(p.dsi, aft)])
                    nonsyn_divergence[region][pcode].extend([(t, divergence(af[:,nonsyn_pos],
                                             initial_indices[nonsyn_pos])) for t,af in zip(p.dsi, aft)])
                    nonsyn_diversity[region][pcode].extend([(t, diversity(af[:,nonsyn_pos]))
                                                           for t,af in zip(p.dsi, aft)])

                    syn_derived = syn_mask.copy()
                    syn_derived[initial_indices, np.arange(syn_derived.shape[1])]=False
                    for t,af in izip(p.dsi,aft):
                        if t>sfs_tmin:
                            y,x = np.histogram(af[syn_derived].flatten(), bins=sfs['bins'])
                            sfs['syn']+=y
                    nonsyn_derived = syn_mask==False
                    nonsyn_derived*=(p.get_constrained(prot)==False)*(gaps==False)
                    nonsyn_derived[initial_indices, np.arange(syn_derived.shape[1])]=False
                    for t,af in izip(p.dsi,aft):
                        if t>sfs_tmin:
                            y,x = np.histogram(af[nonsyn_derived], bins=sfs['bins'])
                            sfs['nonsyn']+=y

    for tmp_data in [syn_divergence, syn_diversity, nonsyn_diversity, nonsyn_divergence]:
        for region in regions:
            tmp = np.vstack([np.array(tmp_data[region][p]) for p in patients])
            tmp_clean = tmp[-np.isnan(tmp[:,1]),:]
            y,  x = np.histogram(tmp_clean[:,0],bins = time_bins, weights = tmp_clean[:,1])
            yn, x = np.histogram(tmp_clean[:,0],bins = time_bins)
            tmp_data[region] = {'avg':y/(1e-10+yn), 'bins':time_binc, 'raw':tmp_data[region]}

    data = {'syn_diversity':syn_diversity, 'syn_divergence':syn_divergence,
            'nonsyn_diversity':nonsyn_diversity, 'nonsyn_divergence':nonsyn_divergence,
            'sfs':sfs}

    return data
    # Script body: compute (or load cached) allele frequency overlap data for
    # one sample of one patient and plot it.
    import argparse
    parser = argparse.ArgumentParser(description="make figure for SNP correlations")
    parser.add_argument('--redo', action='store_true', help='recalculate data')
    params = parser.parse_args()

    VERBOSE = 2
    pname = 'p11'
    n_time = 4

    username = os.path.split(os.getenv('HOME'))[-1]
    foldername = get_figure_folder(username, 'controls')
    fn_data = foldername+'data/'
    fn_data = fn_data + 'allele_frequency_overlap.pickle'

    # Recompute only when no cached file exists or --redo was given
    if not os.path.isfile(fn_data) or params.redo:
        patient = Patient.load(pname)
        # Bind the selected sample to `sample`, the name both calls below use
        # (it was previously bound to `samples`, leaving `sample` undefined here).
        sample = patient.samples[n_time]
        # NOTE(review): overlaps, cov_min and qual_min are not defined in this
        # part of the script; presumably module-level names -- confirm.
        data = get_allele_frequency_overlap(sample, overlaps, cov_min=cov_min,
                                            VERBOSE=VERBOSE, qual_min=qual_min)

        estimate_templates_overlaps(sample, data)


        store_data(data, fn_data)
    else:
        data = load_data(fn_data)

    filename = foldername+'allele_frequency_overlap'
    plot_allele_frequency_overlap(data, VERBOSE=VERBOSE,
                                  fig_filename=filename,
                                 )
Example #26
0
from hivevo.hivevo.patients import Patient
from hivevo.hivevo.samples import all_fragments
from hivevo.hivevo.sequence import alpha

from filenames import get_figure_folder
from util import store_data, load_data, fig_width, fig_fontsize, patients, patient_colors, HIVEVO_colormap
plt.ion()
sns.set_style('darkgrid')

username = os.path.split(os.getenv('HOME'))[-1]
foldername = get_figure_folder(username, 'first')


cmap = HIVEVO_colormap()
p = Patient.load('p1')
fig, axs = plt.subplots(2,3, sharey=True, sharex=True)
traj = []
ti = 3
tj = ti+1
for fi, frag in enumerate(all_fragments):
    ax = axs[fi//3,fi%3]
    aft = p.get_allele_frequency_trajectories(frag)
    pos = np.linspace(0,1,aft.shape[-1])
    for pi in xrange(aft.shape[-1]):
        for ni in xrange(5):
            if aft[0,ni,pi]<0.5 and (aft[ti,ni,pi]>0.2 and aft[tj,ni,pi]>0.2):
                traj.append([frag, pi, aft[:,ni,pi]])
    try:
        for ni in xrange(5):
            ind = (aft[ti,ni,:]*(1-aft[ti,ni,:])>0.01)|(aft[tj,ni,:]*(1-aft[tj,ni,:])>0.01)
Example #27
0
def collect_to_away(patients, regions, Sbins=(0, 0.02, 0.08, 0.25, 2), cov_min=1000, subtype='patient'):
    '''Collect allele frequency statistics split by agreement of the founder with the reference consensus.

    Sites where the patient's initial state equals the reference consensus are
    called "away" sites below, all other sites are "to" sites.

    Parameters:
       patients: iterable of patient codes handed to Patient.load. A patient
                 that cannot be loaded is reported on stdout and skipped.
       regions: iterable of region names, processed for every patient.
       Sbins: increasing bin edges for the reference entropy (in bits);
              bin i covers the half-open interval [Sbins[i], Sbins[i+1]).
       cov_min: forwarded to Patient.get_allele_frequency_trajectories.
       subtype: 'any' loads one reference alignment up front, 'patient' loads
                the alignment matching p['Subtype'] for every patient. No
                other value loads a reference, so any other value fails with
                a NameError as soon as a patient is processed.

    Returns:
       (minor_variants, to_away_divergence, to_away_minor, consensus_distance)
       The first three are pandas DataFrames with one row per
       (patient, region, time, entropy bin), per (patient, region, time) and
       per (patient, region, time, allele frequency bin), respectively.
       consensus_distance maps (pcode, region) to the fraction of mapped
       sites at which the initial state differs from the reference consensus.
    '''
    minor_variants = []
    to_away_divergence = []
    to_away_minor = []
    consensus_distance = {}
    # allele frequency bin edges of the reversion spectra (loop invariant)
    af_thres = [0, 0.05, 0.1, 0.25, 0.5, 0.95, 1.0]

    # if subtype == 'any' the same reference is used for all patients,
    # so it is loaded once here
    if subtype == 'any':
        hxb2 = HIVreference(refname='HXB2', subtype=subtype)
        good_pos_in_reference = hxb2.get_ungapped(threshold=0.05)

    # determine divergence and minor variation at sites that agree with consensus or not
    for pcode in patients:
        try:
            p = Patient.load(pcode)
        except Exception:
            # Best effort: skip patients that cannot be loaded, but let
            # KeyboardInterrupt/SystemExit propagate (a bare except would not).
            print(' '.join(map(str, ("Can't load patient", pcode))))
            continue

        # with subtype == 'patient' the reference alignment depends on the patient
        if subtype == 'patient':
            hxb2 = HIVreference(refname='HXB2', subtype=p['Subtype'])
            good_pos_in_reference = hxb2.get_ungapped(threshold=0.05)

        for region in regions:
            # NOTE(review): type='aa' is requested here, yet the filtering
            # below (argmax < 4, first 5 rows) and map_to_external_reference
            # look nucleotide oriented -- confirm this combination is intended.
            aft = p.get_allele_frequency_trajectories(region, cov_min=cov_min, type='aa')

            # get patient to subtype map and subset entropy vectors, convert to bits
            patient_to_subtype = p.map_to_external_reference(region, refname='HXB2')
            # column 0 indexes the reference mask, column 2 indexes patient arrays
            in_reference = patient_to_subtype[:, 0]
            in_patient = patient_to_subtype[:, 2]
            subtype_entropy = hxb2.get_entropy_in_patient_region(patient_to_subtype) / np.log(2.0)
            ancestral = p.get_initial_indices(region)[in_patient]
            consensus = hxb2.get_consensus_indices_in_patient_region(patient_to_subtype)
            away_sites = ancestral == consensus
            good_ref = good_pos_in_reference[in_reference]
            consensus_distance[(pcode, region)] = np.mean(~away_sites)
            print(' '.join(map(str, (pcode, region, "dist:", 1 - away_sites.mean(),
                                     "useful_ref:", good_ref.mean()))))

            # loop over times and calculate the af in entropy bins
            for t, af in zip(p.dsi, aft):
                unmasked = ~np.any(af.mask, axis=0)
                major_in_first_four = af.argmax(axis=0) < 4
                good_af = (unmasked & major_in_first_four)[in_patient] & good_ref

                # versions of all arrays restricted to the sites kept by good_af
                clean_af = af[:, in_patient][:5, good_af]
                clean_away = away_sites[good_af]
                clean_to = ~clean_away
                clean_consensus = consensus[good_af]
                clean_ancestral = ancestral[good_af]
                clean_entropy = subtype_entropy[good_af]
                clean_entropy_bins = [(clean_entropy >= t_lower) & (clean_entropy < t_upper)
                                      for t_lower, t_upper in zip(Sbins[:-1], Sbins[1:])]

                site_index = np.arange(clean_ancestral.shape[0])
                total_af = clean_af.sum(axis=0)
                # frequency of everything except the most frequent state
                clean_minor = total_af - clean_af.max(axis=0)
                # frequency of everything except the initial state
                clean_derived = total_af - clean_af[clean_ancestral, site_index]
                # frequency of the reference consensus state
                consensus_freq = clean_af[clean_consensus, site_index]
                print(' '.join(map(str, (pcode, region, t))))

                # for each entropy bin, calculate the average divergence and minor variation
                for sbin, sites in enumerate(clean_entropy_bins):
                    minor_variants.append({'pcode': pcode, 'region': region, 'time': t, 'S_bin': sbin,
                                           'af_away_minor':   np.mean(clean_minor[sites & clean_away]),
                                           'af_away_derived': np.mean(clean_derived[sites & clean_away]),
                                           'af_to_minor':     np.mean(clean_minor[sites & clean_to]),
                                           'af_to_derived':   np.mean(clean_derived[sites & clean_to])})

                # reversion: consensus frequency, zeroed at sites that already
                # started at consensus; divergence: derived frequency at all sites
                clean_reversion = consensus_freq * clean_to
                to_away_divergence.append({'pcode': pcode, 'region': region, 'time': t,
                                           'reversion': np.mean(clean_reversion),
                                           'divergence': np.mean(clean_derived)})

                # calculate the minor variation at sites where the founder differs
                # from consensus in different allele frequency bins
                rev_tmp = consensus_freq[clean_to]
                der_tmp = clean_derived[clean_to]
                for ai, (af_lower, af_upper) in enumerate(zip(af_thres[:-1], af_thres[1:])):
                    to_away_minor.append({'pcode': pcode, 'region': region, 'time': t, 'af_bin': ai,
                                          'reversion_spectrum': np.mean(rev_tmp * (rev_tmp >= af_lower) * (rev_tmp < af_upper)),
                                          'minor_reversion_spectrum': np.mean(der_tmp * (der_tmp >= af_lower) * (der_tmp < af_upper))})

    return pd.DataFrame(minor_variants), pd.DataFrame(to_away_divergence), pd.DataFrame(to_away_minor), consensus_distance
def collect_data_fabio(patients, regions, cov_min=100, syn_degeneracy=2):
    '''Collect data for divergence and diversity, plus a site frequency spectrum

    Parameters:
       patients: iterable of patient codes handed to Patient.load.
       regions: dict mapping a region name to an iterable of protein names.
       cov_min: forwarded to Patient.get_allele_frequency_trajectories.
       syn_degeneracy: a position counts as synonymous if at least this many
                       entries of its column in the synonymous mask are set.

    Returns:
       dict with two entries:
        - 'divdiv': pandas DataFrame with one row per (patient, protein, time,
          mutation class) and the columns pcode, time, region, protein,
          nsites, mutclass, divergence, diversity. The frame is empty (but
          has these columns) if nothing was collected.
        - 'sfs': dict with the histogram bin edges ('bins') and the counts of
          derived frequencies ('syn', 'nonsyn') summed over all patients,
          proteins and time points with t >= 1000.
    '''
    import pandas as pd

    # Prepare SFS
    nbins = 10
    sfs_tmin = 1000
    sfs = {'syn': np.zeros(nbins, dtype=float),
           'nonsyn': np.zeros(nbins, dtype=float),
           'bins': np.linspace(0.01, 0.99, nbins + 1),
          }

    # Collect into DataFrame
    data = []
    for pcode in patients:
        p = Patient.load(pcode)
        for region, prots in regions.items():
            for prot in prots:
                aft = p.get_allele_frequency_trajectories(prot, cov_min=cov_min)
                initial_indices = p.get_initial_indices(prot)
                gaps = p.get_gaps_by_codon(prot, pad=2, threshold=0.05)

                # Classify syn/nonsyn POSITIONS
                # NOTE: this is not fully correct because some positions (2-fold
                # degenerate) are both syn and nonsyn, but it's close enough
                syn_mask = p.get_syn_mutations(prot)
                syn_sum = syn_mask.sum(axis=0)
                constrained = p.get_constrained(prot)
                # NOTE: syn_mask == 0 are substitutions, they make up most
                # of the nonsynonymous signal
                pos = {'syn': (syn_sum >= syn_degeneracy) & (~gaps),
                       'nonsyn': (syn_sum <= 1) & (~constrained) & (~gaps),
                      }

                print(' '.join(map(str, (pcode, prot, pos['syn'].sum(), pos['nonsyn'].sum()))))

                # Divergence/diversity
                for t, af in zip(p.dsi, aft):
                    for mutclass, ind in pos.items():
                        data.append({'pcode': pcode,
                                     'time': t,
                                     'region': region,
                                     'protein': prot,
                                     'nsites': ind.sum(),
                                     'mutclass': mutclass,
                                     'divergence': divergence(af[:, ind], initial_indices[ind]),
                                     'diversity': diversity(af[:, ind]),
                                    })

                # Site frequency spectrum: only derived states count, so the
                # initial state of every position is excluded from both masks
                site_index = np.arange(syn_mask.shape[1])
                syn_derived = syn_mask.copy()
                syn_derived[initial_indices, site_index] = False
                # Boolean arrays are negated with ~; unary minus on boolean
                # arrays raises a TypeError in NumPy >= 1.13
                nonsyn_derived = (~syn_mask) & (~constrained) & (~gaps)
                nonsyn_derived[initial_indices, site_index] = False

                for t, af in zip(p.dsi, aft):
                    if t < sfs_tmin:
                        continue

                    sfs['syn'] += np.histogram(af[syn_derived], bins=sfs['bins'])[0]
                    sfs['nonsyn'] += np.histogram(af[nonsyn_derived], bins=sfs['bins'])[0]

    if data:
        data = pd.DataFrame(data)
    else:
        # Nothing collected: still provide the expected columns, otherwise the
        # column access below fails with a KeyError
        data = pd.DataFrame(columns=['pcode', 'time', 'region', 'protein',
                                     'nsites', 'mutclass', 'divergence', 'diversity'])
    data['divergence'] = data['divergence'].astype(float)
    data['diversity'] = data['diversity'].astype(float)
    return {'divdiv': data, 'sfs': sfs}