def collect_diverse_sites_aminoacids(patients, regions, cov_min=1000,
                                     af_threshold=0.01, subtype='patient',
                                     refname='HXB2'):
    '''Fraction of sites that are diverse for different quantiles of subtype entropy'''
    ps = {pcode: Patient.load(pcode) for pcode in patients}
    diverse_fraction = []
    for region in regions:
        print region
        if subtype == 'any':
            # note: the reference needs the region argument, as in the branch below
            ref = HIVreferenceAminoacid(region, refname=refname, subtype='any')
            ref.good_pos_in_reference = ref.get_ungapped(threshold=0.05)
        else:
            refs = {}
            # use a separate loop variable so the `subtype` argument is not clobbered
            for st in ['B', 'C', 'AE']:
                ref = HIVreferenceAminoacid(region, refname=refname, subtype=st)
                ref.good_pos_in_reference = ref.get_ungapped(threshold=0.05)
                refs[st] = ref

        for pi, pcode in enumerate(patients):
            p = ps[pcode]
            if subtype == 'patient':
                ref = refs[p['Subtype']]
            aft = p.get_allele_frequency_trajectories(region, cov_min=cov_min, type='aa')
            if len(aft.mask.shape) < 2:
                aft.mask = np.zeros_like(aft, dtype=bool)

            # get patient to subtype map and subset entropy vectors
            patient_to_subtype = p.map_to_external_reference_aminoacids(region, refname=refname)
            subtype_entropy = ref.get_entropy_in_patient_region(patient_to_subtype)
            entropy_quantiles = get_quantiles(4, subtype_entropy)
            good_ref = ref.good_pos_in_reference[patient_to_subtype[:, 0]]

            # loop over times and calculate the fraction of diverse sites per entropy quantile
            for t, af in izip(p.dsi, aft):
                good_af = (~np.any(af.mask, axis=0)[patient_to_subtype[:, 1]]) & good_ref
                tmp_af = af[:, patient_to_subtype[:, 1]]
                # tmp_af has only columns that are mappable to the reference
                # good_af is a mask for useful columns
                # Squant['ind'] is a mask for positions corresponding to an
                # entropy quantile (at mappable positions)
                tmp = {'S' + str(i + 1):
                       np.mean(tmp_af[:, Squant['ind'] * good_af].max(axis=0)
                               < tmp_af[:, Squant['ind'] * good_af].sum(axis=0) - af_threshold)
                       for i, Squant in entropy_quantiles.iteritems()}
                tmp.update({'pcode': pcode, 'region': region, 'time': t})
                diverse_fraction.append(tmp)

    return pd.DataFrame(diverse_fraction)
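# NOTE: get_quantiles() is not defined in this file. The sketch below is an
# assumption about its interface, inferred from the usage above (a dict of
# quantile index -> {'ind': boolean mask over positions}); the actual helper
# in the codebase may differ in detail.
def get_quantiles(q, arr):
    '''Minimal sketch: split positions into q quantiles of arr.'''
    thresholds = [np.percentile(arr, 100.0 * i / q) for i in range(q + 1)]
    thresholds[-1] += 1e-9  # make the top bin include the maximum
    return {i: {'range': (thresholds[i], thresholds[i + 1]),
                'ind': (arr >= thresholds[i]) & (arr < thresholds[i + 1])}
            for i in range(q)}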
def make_tree(region, fn_ali, fn_tree, tmpfile='/tmp/seqs.fasta', fasttreebin='FastTree'):
    '''Make tree of minor haplotype variants from all patients + outgroup'''
    # Collect haplotypes from patients
    seqs = []
    patients = ['p1', 'p2', 'p3', 'p5', 'p6', 'p8', 'p9', 'p11']
    for pcode in patients:
        p = Patient.load(pcode)
        for seq in p.get_haplotype_alignment(region):
            seq.id = 'patient_' + pcode + '_' + seq.id
            seq.name = 'patient_' + pcode + '_' + seq.name
            seq.description = 'patient ' + pcode + ', ' + seq.description
            seqs.append(seq)

    # Add reference as an outgroup
    ref = HIVreference(load_alignment=False)
    refseq = ref.annotation[region].extract(ref.seq)
    seqs.append(refseq)

    # Align (MUSCLE)
    if os.path.isfile(tmpfile):
        os.remove(tmpfile)
    SeqIO.write(seqs, tmpfile, 'fasta')
    try:
        sp.call(['muscle', '-maxiters', '1', '-diags',
                 '-in', tmpfile, '-out', fn_ali])
    finally:
        os.remove(tmpfile)

    # Relabel for FastTree (it does not accept duplicate labels)
    seqs = []
    for seq in SeqIO.parse(fn_ali, 'fasta'):
        seq.name = seq.name + '_#' + str(len(seqs))
        seq.id = seq.id + '_#' + str(len(seqs))
        seqs.append(seq)
    SeqIO.write(seqs, tmpfile, 'fasta')

    # FastTree
    try:
        sp.call([fasttreebin, '-nt', '-out', fn_tree, tmpfile])
    finally:
        os.remove(tmpfile)

    # Reroot with the outgroup
    tree = Phylo.read(fn_tree, 'newick')
    for leaf in tree.get_terminals():
        if refseq.id in leaf.name:
            break
    tree.root_with_outgroup(leaf)
    Phylo.write(tree, fn_tree, 'newick')
def get_toaway_histograms(subtype, Sc=1):
    '''
    Calculate allele frequency histograms for each patient and each time point,
    separately for sites that agree or disagree with consensus. This can be done
    for a low and a high entropy category, with the threshold set by Sc.
    '''
    away_histogram = {(pcode, Sbin): {} for Sbin in ['low', 'high']
                      for pcode in patients}
    to_histogram = {(pcode, Sbin): {} for Sbin in ['low', 'high']
                    for pcode in patients}
    # if subtype == 'any', meaning comparison to group M, we can load the reference here
    if subtype == 'any':
        hxb2 = HIVreference(refname='HXB2', subtype=subtype)
        good_pos_in_reference = hxb2.get_ungapped(threshold=0.05)

    # determine divergence and minor variation at sites that agree with consensus or not
    for pi, pcode in enumerate(patients):
        try:
            p = Patient.load(pcode)
        except:
            print "Can't load patient", pcode
        else:
            print 'subtype:', subtype, "patient", pcode
            if subtype == 'patient':
                # if we take the subtype of the patient, load its specific reference alignment here
                hxb2 = HIVreference(refname='HXB2', subtype=p['Subtype'])
                good_pos_in_reference = hxb2.get_ungapped(threshold=0.05)
            for region in regions:
                aft = p.get_allele_frequency_trajectories(region, cov_min=cov_min)

                # get patient to subtype map, subset entropy vectors, convert to bits
                patient_to_subtype = p.map_to_external_reference(region, refname='HXB2')
                subtype_entropy = hxb2.get_entropy_in_patient_region(patient_to_subtype) / np.log(2.0)
                ancestral = p.get_initial_indices(region)[patient_to_subtype[:, 2]]
                consensus = hxb2.get_consensus_indices_in_patient_region(patient_to_subtype)
                good_ref = good_pos_in_reference[patient_to_subtype[:, 0]]
                away_sites = ancestral == consensus
                aft_HXB2 = aft[:, :, patient_to_subtype[:, 2]]

                for H, sites in [(away_histogram, away_sites),
                                 (to_histogram, ~away_sites)]:
                    for Sbin in ['low', 'high']:
                        if Sbin == 'low':
                            ind = sites & (subtype_entropy < Sc) & good_ref
                        else:
                            ind = sites & (subtype_entropy >= Sc) & good_ref
                        for ti, t in enumerate(p.dsi):
                            y, x = np.histogram(
                                aft_HXB2[ti, ancestral[ind], np.where(ind)[0]].compressed(),
                                bins=af_bins)
                            H[(pcode, Sbin)][t] = y
    return to_histogram, away_histogram
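# Hedged usage sketch for the function above (assumes the module-level
# globals `patients`, `regions`, `cov_min`, and `af_bins` are defined;
# the patient code and entropy bin below are illustrative):
#   to_hist, away_hist = get_toaway_histograms('patient', Sc=1)
#   counts_by_time = to_hist[('p1', 'low')]   # {time in days: histogram counts}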
def collect_data():
    data = {'CD4': get_CD4(), 'VL': get_VL(), 'deep sequencing': {}}
    for pn in pnumbers:
        pcode = 'p' + str(pn)
        p = Patient.load(pcode)
        data['deep sequencing'][pcode] = p.dsi
    return data
def get_toaway_histograms_aminoacids(subtype, Sc=1, refname='HXB2'):
    '''Calculate SFS towards/away from cross-sectional consensus for amino acids

    Calculate allele frequency histograms for each patient and each time point,
    separately for sites that agree or disagree with consensus. This can be done
    for a low and a high entropy category, with the threshold set by Sc.
    '''
    ps = {pcode: Patient.load(pcode) for pcode in patients}
    away_histogram = {(pcode, Sbin): {} for Sbin in ['low', 'high']
                      for pcode in patients}
    to_histogram = {(pcode, Sbin): {} for Sbin in ['low', 'high']
                    for pcode in patients}
    for region in regions:
        # if subtype == 'any', meaning comparison to group M, we can load the reference here
        if subtype == 'any':
            ref = HIVreferenceAminoacid(region, refname=refname, subtype=subtype)
            ref.good_pos_in_reference = ref.get_ungapped(threshold=0.05)
        else:
            refs = {}
            # use a separate loop variable so the `subtype` argument is not clobbered
            for st in ['B', 'C', 'AE']:
                ref = HIVreferenceAminoacid(region, refname=refname, subtype=st)
                ref.good_pos_in_reference = ref.get_ungapped(threshold=0.05)
                refs[st] = ref

        # determine divergence and minor variation at sites that agree with consensus or not
        for pi, pcode in enumerate(patients):
            p = ps[pcode]
            print 'subtype:', subtype, "patient", pcode
            if subtype == 'patient':
                # if we take the subtype of the patient, use its specific reference alignment
                ref = refs[p['Subtype']]
            aft = p.get_allele_frequency_trajectories(region, cov_min=cov_min, type='aa')

            # get patient to subtype map, subset entropy vectors, convert to bits
            patient_to_subtype = p.map_to_external_reference_aminoacids(region, refname=refname)
            subtype_entropy = ref.get_entropy_in_patient_region(patient_to_subtype) / np.log(2.0)
            ancestral = p.get_initial_indices(region, type='aa')[patient_to_subtype[:, -1]]
            consensus = ref.get_consensus_indices_in_patient_region(patient_to_subtype)
            good_ref = ref.good_pos_in_reference[patient_to_subtype[:, 0]]
            away_sites = ancestral == consensus
            aft_ref = aft[:, :, patient_to_subtype[:, -1]]

            # H is the dict to add this to; `sites` marks consensus/non-consensus positions
            for H, sites in [(away_histogram, away_sites),
                             (to_histogram, ~away_sites)]:
                for Sbin in ['low', 'high']:
                    # make a boolean array with the relevant positions == True
                    if Sbin == 'low':
                        ind = sites & (subtype_entropy < Sc) & good_ref
                    else:
                        ind = sites & (subtype_entropy >= Sc) & good_ref
                    for ti, t in enumerate(p.dsi):
                        # for each time point, make an allele frequency histogram
                        y, x = np.histogram(
                            aft_ref[ti, ancestral[ind], np.where(ind)[0]].compressed(),
                            bins=af_bins)
                        H[(pcode, Sbin)][t] = y
    return to_histogram, away_histogram
def collect_correlations(patients, regions, cov_min=1000, subtype='patient', refname='HXB2'):
    '''Correlation of subtype entropy and intra-patient diversity'''
    correlations = []
    if subtype == 'any':
        ref = HIVreference(refname=refname, subtype='any')
        ref.good_pos_in_reference = ref.get_ungapped(threshold=0.05)
    else:
        refs = {}
        # use a separate loop variable so the `subtype` argument is not clobbered
        for st in ['B', 'C', 'AE']:
            ref = HIVreference(refname=refname, subtype=st)
            ref.good_pos_in_reference = ref.get_ungapped(threshold=0.05)
            refs[st] = ref

    for pi, pcode in enumerate(patients):
        p = Patient.load(pcode)
        if subtype == 'patient':
            ref = refs[p['Subtype']]
        for region in regions:
            aft = p.get_allele_frequency_trajectories(region, cov_min=cov_min)
            if len(aft.mask.shape) < 2:
                aft.mask = np.zeros_like(aft, dtype=bool)

            # get patient to subtype map and subset entropy vectors
            patient_to_subtype = p.map_to_external_reference(region, refname=refname)
            subtype_entropy = ref.get_entropy_in_patient_region(patient_to_subtype)
            good_ref = ref.good_pos_in_reference[patient_to_subtype[:, 0]]

            # loop over times and calculate the correlation for each time point
            for t, af in izip(p.dsi, aft):
                patient_entropy = np.maximum(
                    0, -np.sum(af[:-1] * np.log(1e-10 + af[:-1]),
                               axis=0))[patient_to_subtype[:, 2]]
                # good_af is a mask for useful columns
                good_af = (~np.any(af.mask, axis=0)[patient_to_subtype[:, 2]]) & good_ref
                if good_af.sum() > 0.5 * good_af.shape[0]:
                    rho, pval = spearmanr(patient_entropy[good_af],
                                          subtype_entropy[good_af])
                    correlations.append({'pcode': pcode,
                                         'region': region,
                                         'time': t,
                                         'rho': rho,
                                         'pval': pval})
    return pd.DataFrame(correlations)
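# The intra-patient diversity correlated above is the per-site Shannon
# entropy of the allele frequencies, S_i = -sum_a x_ai * log(x_ai), computed
# over nucleotides only (af[:-1] drops the gap row). A standalone sketch of
# the same quantity, for reference:
def site_entropy(af):
    '''Per-site Shannon entropy of an (alleles x positions) frequency array.'''
    return np.maximum(0, -np.sum(af * np.log(1e-10 + af), axis=0))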
def collect_ctl_data(patients, regions, ctl_kind='mhci=80'):
    data_ctl = []
    for pi, pcode in enumerate(patients):
        p = Patient.load(pcode)

        # Add predicted epitopes
        ctl_table = p.get_ctl_epitopes(kind=ctl_kind, regions=regions)
        ctl_table['pcode'] = p.name
        data_ctl.append(ctl_table)

    data_ctl = pd.concat(data_ctl)
    return data_ctl
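# Hedged usage sketch (patient codes and region names are illustrative):
#   data_ctl = collect_ctl_data(['p1', 'p2'], ['gag', 'pol'], ctl_kind='mhci=80')
#   print data_ctl.groupby('pcode').size()   # number of predicted epitopes per patient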
def collect_correlations(patients, regions, cov_min=1000, refname='HXB2', min_dsi=1500):
    '''Correlation of entropy between patients'''
    ps = [Patient.load(pcode) for pcode in patients]
    correlations = []
    for region in regions:
        print region
        for pi, p1 in enumerate(ps):
            aft1 = p1.get_allele_frequency_trajectories(region, cov_min=cov_min)
            # average allele frequencies over late time points, then site entropy
            af1 = aft1[p1.dsi >= min_dsi].mean(axis=0)
            en1 = np.maximum(0, -np.sum(af1[:-1] * np.log(1e-10 + af1[:-1]), axis=0))
            ptoref1 = p1.map_to_external_reference(region, refname=refname)
            ptorefd1 = dict(ptoref1[:, ::2])
            seq1 = p1.get_initial_sequence(region)

            for p2 in ps[:pi]:
                aft2 = p2.get_allele_frequency_trajectories(region, cov_min=cov_min)
                af2 = aft2[p2.dsi >= min_dsi].mean(axis=0)
                en2 = np.maximum(0, -np.sum(af2[:-1] * np.log(1e-10 + af2[:-1]), axis=0))
                ptoref2 = p2.map_to_external_reference(region, refname=refname)
                ptorefd2 = dict(ptoref2[:, ::2])
                seq2 = p2.get_initial_sequence(region)

                # restrict to reference positions covered in both patients
                overlap = np.intersect1d(ptoref1[:, 0], ptoref2[:, 0],
                                         assume_unique=True)
                af_ov = np.array([(en1[ptorefd1[pos]], en2[ptorefd2[pos]])
                                  for pos in overlap])
                rho, pval = spearmanr(af_ov[:, 0], af_ov[:, 1])

                # founder-sequence distance between the two patients
                seq1_ov = np.array([seq1[ptorefd1[pos]] for pos in overlap])
                seq2_ov = np.array([seq2[ptorefd2[pos]] for pos in overlap])
                dist = (seq1_ov != seq2_ov).mean()
                correlations.append({'pcode1': p1.name,
                                     'pcode2': p2.name,
                                     'pcode': p1.name + '-' + p2.name,
                                     'region': region,
                                     'rho': rho,
                                     'distance': dist,
                                     'pval': pval})
    return pd.DataFrame(correlations)
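# NOTE on the coordinate maps above: map_to_external_reference() is assumed
# (based on its usage in this file: column 0 indexes the reference, the last
# column the patient) to return an array whose first and third columns are
# reference and patient positions, so ptoref[:, ::2] yields
# (reference, patient) pairs and dict() turns them into a lookup table.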
def collect_correlations_aminoacids(patients, regions, cov_min=1000, subtype='patient', refname='HXB2'):
    '''Correlation of subtype entropy and intra-patient diversity'''
    ps = {pcode: Patient.load(pcode) for pcode in patients}
    correlations = []
    for region in regions:
        print region
        if subtype == 'any':
            ref = HIVreferenceAminoacid(region, refname=refname, subtype='any')
            ref.good_pos_in_reference = ref.get_ungapped(threshold=0.05)
        else:
            refs = {}
            # use a separate loop variable so the `subtype` argument is not clobbered
            for st in ['B', 'C', 'AE']:
                ref = HIVreferenceAminoacid(region, refname=refname, subtype=st)
                ref.good_pos_in_reference = ref.get_ungapped(threshold=0.05)
                refs[st] = ref

        for pi, pcode in enumerate(patients):
            p = ps[pcode]
            if subtype == 'patient':
                ref = refs[p['Subtype']]
            aft = p.get_allele_frequency_trajectories(region, cov_min=cov_min, type='aa')
            if len(aft.mask.shape) < 2:
                aft.mask = np.zeros_like(aft, dtype=bool)

            # get patient to subtype map and subset entropy vectors
            patient_to_subtype = p.map_to_external_reference_aminoacids(region, refname=refname)
            subtype_entropy = ref.get_entropy_in_patient_region(patient_to_subtype)
            good_ref = ref.good_pos_in_reference[patient_to_subtype[:, 0]]

            # loop over times and calculate the correlation for each time point
            for t, af in izip(p.dsi, aft):
                patient_entropy = np.maximum(
                    0, -np.sum(af[:-1] * np.log(1e-10 + af[:-1]),
                               axis=0))[patient_to_subtype[:, 1]]
                # good_af is a mask for useful columns
                good_af = (~np.any(af.mask, axis=0)[patient_to_subtype[:, 1]]) & good_ref
                if good_af.sum() > 0.5 * good_af.shape[0]:
                    rho, pval = spearmanr(patient_entropy[good_af],
                                          subtype_entropy[good_af])
                    correlations.append({'pcode': pcode, 'region': region,
                                         'time': t, 'rho': rho, 'pval': pval})
    return pd.DataFrame(correlations)
def collect_data_LD(patients):
    '''Collect data for LD plot'''
    dmin = 40
    dmin_pad = 200
    var_min = 0.2
    cov_min = 200
    LD_vs_distance = {}
    Dp_vs_distance = {}
    bins = np.arange(0, 401, 40)
    binc = (bins[:-1] + bins[1:]) * 0.5
    for frag in all_fragments:
        if frag not in ['F' + str(i) for i in xrange(1, 7)]:
            continue
        dists = []
        weights_LD = []
        weights_Dp = []
        for pcode in patients:
            p = Patient.load(pcode)
            depth = p.get_fragment_depth(pad=False, limit_to_dilution=False)
            depth_pad = p.get_fragment_depth(pad=True, limit_to_dilution=False)

            for si, sample in enumerate(p.samples):
                # check for sufficient depth
                if ((depth[si][all_fragments.index(frag)] > dmin) or
                    (depth_pad[si][all_fragments.index(frag)] > dmin_pad)):
                    positions, af2p, cov, af1p = sample.get_pair_frequencies(frag, var_min=var_min)
                    if positions is None:
                        continue
                    LD, Dp, p12 = LDfunc(af2p, af1p, cov, cov_min=100)
                    X, Y = np.meshgrid(positions, positions)
                    np.fill_diagonal(cov, 0)
                    dists.extend(np.abs(X - Y)[cov >= cov_min])
                    weights_LD.extend(LD[cov >= cov_min])
                    weights_Dp.extend(Dp[cov >= cov_min])
                    print (pcode, si, frag, " # of positions:", len(positions),
                           'depth:', depth[si][all_fragments.index(frag)])
                else:
                    print (pcode, si, frag, "insufficient depth:",
                           depth[si][all_fragments.index(frag)],
                           depth_pad[si][all_fragments.index(frag)])

        # bin pairwise distances, weighted by the LD measures, and normalize
        yn, xn = np.histogram(dists, bins=bins)
        y, x = np.histogram(dists, weights=weights_LD, bins=bins)
        LD_vs_distance[frag] = y / (1e-10 + yn)
        y, x = np.histogram(dists, weights=weights_Dp, bins=bins)
        Dp_vs_distance[frag] = y / (1e-10 + yn)

    # controls for the two PCR protocols
    for pcr in ['PCR1', 'PCR2']:
        positions, af2p, cov, af1p = control_LD(pcr, var_min=var_min)
        LD, Dp, p12 = LDfunc(af2p, af1p, cov, cov_min=100)
        X, Y = np.meshgrid(positions, positions)
        np.fill_diagonal(cov, 0)
        dists = np.abs(X - Y)[cov >= cov_min].flatten()
        weights_LD = LD[cov >= cov_min].flatten()
        weights_Dp = Dp[cov >= cov_min].flatten()
        yn, xn = np.histogram(dists, bins=bins)
        y, x = np.histogram(dists, weights=weights_LD, bins=bins)
        LD_vs_distance[pcr] = y / (1e-10 + yn)
        y, x = np.histogram(dists, weights=weights_Dp, bins=bins)
        Dp_vs_distance[pcr] = y / (1e-10 + yn)

    data = {'Dp': Dp_vs_distance, 'LDrsq': LD_vs_distance,
            'bins': bins, 'binc': binc,
            'var_min': var_min, 'cov_min': cov_min,
            'dmin': dmin, 'dmin_pad': dmin_pad,
            'patients': patients}
    return data
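# NOTE: LDfunc() is defined elsewhere. The sketch below shows the standard
# linkage-disequilibrium measures it is assumed to return (r^2 and the
# normalized D'), computed from pair frequencies p12 and single-site
# frequencies p1, p2; the actual implementation may differ in detail.
def ld_measures(p12, p1, p2):
    '''Minimal sketch: r^2 and D' from pair and single-site frequencies.'''
    D = p12 - p1 * p2
    rsq = D**2 / (1e-10 + p1 * (1 - p1) * p2 * (1 - p2))
    Dmax = np.where(D > 0,
                    np.minimum(p1 * (1 - p2), (1 - p1) * p2),
                    np.minimum(p1 * p2, (1 - p1) * (1 - p2)))
    Dp = np.abs(D) / (1e-10 + Dmax)
    return rsq, Dp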
def collect_to_away_aminoacids(patients, regions, Sbins=[0, 0.1, 0.3, 3],
                               cov_min=1000, refname='HXB2', subtype='patient'):
    '''Collect allele frequencies polarized from cross-sectional consensus for amino acids

    Collect minor variant frequencies, divergences, etc., separately for sites
    that agree or disagree with consensus. Consensus is either the group M
    consensus (subtype='any') or the subtype of the respective patient
    (subtype='patient'). In addition, these quantities are stratified by entropy.
    '''
    ps = {pcode: Patient.load(pcode) for pcode in patients}
    minor_variants = []
    to_away_divergence = []
    to_away_minor = []
    consensus_distance = {}
    for region in regions:
        print region
        # if subtype == 'any', meaning comparison to group M, we can load the reference here
        if subtype == 'any':
            ref = HIVreferenceAminoacid(region, refname=refname, subtype=subtype)
            ref.good_pos_in_reference = ref.get_ungapped(threshold=0.05)
        else:
            refs = {}
            # use a separate loop variable so the `subtype` argument is not clobbered
            for st in ['B', 'C', 'AE']:
                ref = HIVreferenceAminoacid(region, refname=refname, subtype=st)
                ref.good_pos_in_reference = ref.get_ungapped(threshold=0.05)
                refs[st] = ref

        # determine divergence and minor variation at sites that agree with consensus or not
        for pi, pcode in enumerate(patients):
            p = ps[pcode]
            if subtype == 'patient':
                # if we take the subtype of the patient, use its specific reference alignment
                ref = refs[p['Subtype']]
            aft = p.get_allele_frequency_trajectories(region, cov_min=cov_min, type='aa')

            # get patient to subtype map, subset entropy vectors, convert to bits
            patient_to_subtype = p.map_to_external_reference_aminoacids(region, refname=refname)
            subtype_entropy = ref.get_entropy_in_patient_region(patient_to_subtype) / np.log(2.0)
            ancestral = p.get_initial_indices(region, type='aa')[patient_to_subtype[:, -1]]
            consensus = ref.get_consensus_indices_in_patient_region(patient_to_subtype)
            away_sites = ancestral == consensus
            good_ref = ref.good_pos_in_reference[patient_to_subtype[:, 0]]
            consensus_distance[(pcode, region)] = np.mean(~away_sites)
            print pcode, region, "dist:", 1 - away_sites.mean(), "useful_ref:", good_ref.mean()

            # loop over times and calculate the af in entropy bins
            for t, af in izip(p.dsi, aft):
                good_af = (((~np.any(af.mask, axis=0))
                            #&(aft[0].max(axis=0)>0.9)
                            & (af.argmax(axis=0) < af.shape[0] - 2))[patient_to_subtype[:, -1]]) \
                          & good_ref

                # make versions of all arrays that contain only unmasked, ungapped sites
                clean_af = af[:, patient_to_subtype[:, -1]][:-1, good_af]
                clean_away = away_sites[good_af]
                clean_consensus = consensus[good_af]
                clean_ancestral = ancestral[good_af]
                clean_entropy = subtype_entropy[good_af]
                clean_entropy_bins = [(clean_entropy >= t_lower) & (clean_entropy < t_upper)
                                      for t_lower, t_upper in zip(Sbins[:-1], Sbins[1:])]
                clean_minor = clean_af.sum(axis=0) - clean_af.max(axis=0)
                clean_derived = clean_af.sum(axis=0) \
                    - clean_af[clean_ancestral, np.arange(clean_ancestral.shape[0])]
                print pcode, region, t

                # for each entropy bin, calculate the average divergence and minor variation
                for sbin, sites in enumerate(clean_entropy_bins):
                    minor_variants.append({'pcode': pcode, 'region': region,
                                           'time': t, 'S_bin': sbin,
                                           'af_away_minor': np.mean(clean_minor[sites & clean_away]),
                                           'af_away_derived': np.mean(clean_derived[sites & clean_away]),
                                           'af_to_minor': np.mean(clean_minor[sites & (~clean_away)]),
                                           'af_to_derived': np.mean(clean_derived[sites & (~clean_away)])})

                # calculate the minor variation at sites where the founder differs
                # from consensus, in different allele frequency bins
                clean_reversion = clean_af[clean_consensus, np.arange(clean_consensus.shape[0])] \
                    * (~clean_away)
                clean_total_divergence = clean_af.sum(axis=0) \
                    - clean_af[clean_ancestral, np.arange(clean_ancestral.shape[0])]
                to_away_divergence.append({'pcode': pcode, 'region': region, 'time': t,
                                           'reversion': np.mean(clean_reversion),
                                           'divergence': np.mean(clean_total_divergence)})

                af_thres = [0, 0.05, 0.1, 0.25, 0.5, 0.95, 1.0]
                rev_tmp = clean_af[clean_consensus, np.arange(clean_consensus.shape[0])][~clean_away]
                der_tmp = clean_derived[~clean_away]
                for ai, (af_lower, af_upper) in enumerate(zip(af_thres[:-1], af_thres[1:])):
                    to_away_minor.append({'pcode': pcode, 'region': region,
                                          'time': t, 'af_bin': ai,
                                          'reversion_spectrum': np.mean(rev_tmp * (rev_tmp >= af_lower) * (rev_tmp < af_upper)),
                                          'minor_reversion_spectrum': np.mean(der_tmp * (der_tmp >= af_lower) * (der_tmp < af_upper))})

    return (pd.DataFrame(minor_variants),
            pd.DataFrame(to_away_divergence),
            pd.DataFrame(to_away_minor),
            consensus_distance)
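# The 'reversion' statistic above is the mean frequency of the subtype-
# consensus amino acid at sites where the founder sequence differed from
# consensus, i.e. how strongly within-patient evolution moves back toward
# consensus; 'divergence' is the total non-founder allele frequency per site.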
patients = ['p1', 'p2', 'p3', 'p5', 'p6', 'p8', 'p9', 'p11']
rate_or_gof = 0
window_size = 300
cov_min = 200

if not os.path.isfile(fn_data) or params.redo:
    print("Regenerating plot data")
    cats = [{'name': 'total', 'only_substitutions': False},
            {'name': 'substitutions', 'only_substitutions': True},
           ]
    ref = {key: -np.ones((len(patients), 10000), dtype=float)
           for key in ['total', 'substitutions']}
    evo_rates = {key: {} for key in ref}
    for pi, pcode in enumerate(patients):
        p = Patient.load(pcode)
        to_ref = p.map_to_external_reference('genomewide')
        for cat in cats:
            div_traj = get_divergence_trajectory(
                p, cov_min=cov_min, sequence_type=params.type,
                only_substitutions=cat['only_substitutions'])
            print (pcode, cat['name'] + ' divergence',
                   zip(np.round(p.ysi),
                       [[np.round(x[x < th].sum()) for th in [.1, .5, 0.95, 1.0]]
                        for x in div_traj]))
            if params.type == 'nuc':
                min_valid_fraction = 0.95
            else:
                # Two out of three are masked by design
def collect_data_richard(patients, regions, syn_degeneracy=2):
    '''Collect data for divergence and diversity'''
    syn_divergence = {reg: {p: [] for p in patients} for reg in regions}
    syn_diversity = {reg: {p: [] for p in patients} for reg in regions}
    nonsyn_divergence = {reg: {p: [] for p in patients} for reg in regions}
    nonsyn_diversity = {reg: {p: [] for p in patients} for reg in regions}
    time_bins = np.array([0, 200, 500, 1000, 1500, 2000, 3000, 5000])
    nbins = 10
    sfs_tmin = 1000
    sfs = {'syn': np.zeros(nbins, dtype=float),
           'nonsyn': np.zeros(nbins, dtype=float),
           'bins': np.linspace(0.01, 0.99, nbins + 1)}
    time_binc = 0.5 * (time_bins[1:] + time_bins[:-1])
    cov_min = 100

    for pi, pcode in enumerate(patients):
        try:
            p = Patient.load(pcode)
        except:
            print "Can't load patient", pcode
        else:
            for region in regions:
                for prot in regions[region]:
                    initial_indices = p.get_initial_indices(prot)
                    aft = p.get_allele_frequency_trajectories(prot, cov_min=cov_min)
                    gaps = p.get_gaps_by_codon(prot, pad=2, threshold=0.05)

                    # classify positions as synonymous or nonsynonymous
                    syn_mask = p.get_syn_mutations(prot)
                    syn_pos = (syn_mask.sum(axis=0) > 1) * (gaps == False)
                    nonsyn_pos = (syn_mask.sum(axis=0) <= 1) \
                        * (p.get_constrained(prot) == False) * (gaps == False)
                    print pcode, prot, syn_pos.sum(), nonsyn_pos.sum()

                    syn_divergence[region][pcode].extend(
                        [(t, divergence(af[:, syn_pos], initial_indices[syn_pos]))
                         for t, af in zip(p.dsi, aft)])
                    syn_diversity[region][pcode].extend(
                        [(t, diversity(af[:, syn_pos])) for t, af in zip(p.dsi, aft)])
                    nonsyn_divergence[region][pcode].extend(
                        [(t, divergence(af[:, nonsyn_pos], initial_indices[nonsyn_pos]))
                         for t, af in zip(p.dsi, aft)])
                    nonsyn_diversity[region][pcode].extend(
                        [(t, diversity(af[:, nonsyn_pos])) for t, af in zip(p.dsi, aft)])

                    # site frequency spectrum of derived synonymous alleles
                    syn_derived = syn_mask.copy()
                    syn_derived[initial_indices, np.arange(syn_derived.shape[1])] = False
                    for t, af in izip(p.dsi, aft):
                        if t > sfs_tmin:
                            y, x = np.histogram(af[syn_derived].flatten(), bins=sfs['bins'])
                            sfs['syn'] += y

                    # ... and of derived nonsynonymous alleles
                    nonsyn_derived = syn_mask == False
                    nonsyn_derived *= (p.get_constrained(prot) == False) * (gaps == False)
                    nonsyn_derived[initial_indices, np.arange(syn_derived.shape[1])] = False
                    for t, af in izip(p.dsi, aft):
                        if t > sfs_tmin:
                            y, x = np.histogram(af[nonsyn_derived], bins=sfs['bins'])
                            sfs['nonsyn'] += y

    # average divergence/diversity in time bins
    for tmp_data in [syn_divergence, syn_diversity,
                     nonsyn_diversity, nonsyn_divergence]:
        for region in regions:
            tmp = np.vstack([np.array(tmp_data[region][p]) for p in patients])
            tmp_clean = tmp[~np.isnan(tmp[:, 1]), :]  # '~' (not '-') to negate the boolean mask
            y, x = np.histogram(tmp_clean[:, 0], bins=time_bins, weights=tmp_clean[:, 1])
            yn, x = np.histogram(tmp_clean[:, 0], bins=time_bins)
            tmp_data[region] = {'avg': y / (1e-10 + yn), 'bins': time_binc,
                                'raw': tmp_data[region]}

    data = {'syn_diversity': syn_diversity, 'syn_divergence': syn_divergence,
            'nonsyn_diversity': nonsyn_diversity, 'nonsyn_divergence': nonsyn_divergence,
            'sfs': sfs}
    return data
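# NOTE: divergence() and diversity() are imported helpers not shown in this
# file. The sketches below are assumptions based on the standard definitions
# (mean non-ancestral allele frequency, and mean per-site heterozygosity);
# the real helpers may handle masked arrays differently.
def divergence(af, initial_indices):
    '''Mean frequency of non-ancestral alleles per site (sketch).'''
    return np.mean(1.0 - af[initial_indices, np.arange(af.shape[1])])

def diversity(af):
    '''Mean per-site heterozygosity 1 - sum_a x_a^2 (sketch).'''
    return np.mean(1.0 - np.sum(af**2, axis=0))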
def collect_data_fabio(patients, regions, cov_min=100, syn_degeneracy=2):
    '''Collect data for divergence and diversity'''
    import pandas as pd
    from itertools import izip

    # Prepare SFS
    nbins = 10
    sfs_tmin = 1000
    sfs = {'syn': np.zeros(nbins, dtype=float),
           'nonsyn': np.zeros(nbins, dtype=float),
           'bins': np.linspace(0.01, 0.99, nbins + 1),
          }

    # Collect into DataFrame
    data = []
    for pi, pcode in enumerate(patients):
        p = Patient.load(pcode)
        for region, prots in regions.iteritems():
            for prot in prots:
                aft = p.get_allele_frequency_trajectories(prot, cov_min=cov_min)
                initial_indices = p.get_initial_indices(prot)
                gaps = p.get_gaps_by_codon(prot, pad=2, threshold=0.05)

                # Classify syn/nonsyn POSITIONS
                # NOTE: this is not fully correct because some positions (2-fold
                # degenerate) are both syn and nonsyn, but it's close enough
                syn_mask = p.get_syn_mutations(prot)
                syn_sum = syn_mask.sum(axis=0)
                # NOTE: syn_mask == 0 are substitutions, they make up most
                # of the nonsynonymous signal
                pos = {'syn': (syn_sum >= syn_degeneracy) & (~gaps),
                       'nonsyn': (syn_sum <= 1) & (~p.get_constrained(prot)) & (~gaps),
                      }
                print pcode, prot, pos['syn'].sum(), pos['nonsyn'].sum()

                # Divergence/diversity
                for t, af in izip(p.dsi, aft):
                    for mutclass, ind in pos.iteritems():
                        data.append({'pcode': pcode,
                                     'time': t,
                                     'region': region,
                                     'protein': prot,
                                     'nsites': ind.sum(),
                                     'mutclass': mutclass,
                                     'divergence': divergence(af[:, ind], initial_indices[ind]),
                                     'diversity': diversity(af[:, ind]),
                                    })

                # Site frequency spectrum
                syn_derived = syn_mask.copy()
                syn_derived[initial_indices, np.arange(syn_derived.shape[1])] = False
                # '~' (not the deprecated '-') to negate boolean arrays
                nonsyn_derived = (~syn_mask) & (~p.get_constrained(prot)) & (~gaps)
                nonsyn_derived[initial_indices, np.arange(syn_derived.shape[1])] = False
                for t, af in izip(p.dsi, aft):
                    if t < sfs_tmin:
                        continue
                    sfs['syn'] += np.histogram(af[syn_derived], bins=sfs['bins'])[0]
                    sfs['nonsyn'] += np.histogram(af[nonsyn_derived], bins=sfs['bins'])[0]

    data = pd.DataFrame(data)
    data['divergence'] = data['divergence'].astype(float)
    data['diversity'] = data['diversity'].astype(float)

    return {'divdiv': data, 'sfs': sfs}
def collect_substitution_data(patients, regions, cov_min=100):
    from Bio.Seq import translate
    data = []
    for pi, pcode in enumerate(patients):
        p = Patient.load(pcode)
        for region in regions:
            print p.name, region
            initial_indices = p.get_initial_indices(region)
            aft = p.get_allele_frequency_trajectories(region, cov_min=cov_min)
            if np.isscalar(aft.mask):
                aft.mask = np.zeros_like(aft, bool)

            coomap = p.map_to_external_reference(region)[:, ::2]
            coomapd = {'pat_to_subtype': dict(coomap[:, ::-1]),
                       'subtype_to_pat': dict(coomap)}

            for posdna in xrange(aft.shape[-1]):
                # Get the position in reference coordinates
                if posdna not in coomapd['pat_to_subtype']:
                    pos_sub = -1
                else:
                    pos_sub = coomapd['pat_to_subtype'][posdna]

                # Get allele frequency trajectory
                aftpos = aft[:, :, posdna]
                ind = ~aftpos[:, 0].mask  # time points with data ('~', not '-')
                if ind.sum() == 0:
                    continue
                aftpos = aftpos[ind]
                timespos = p.dsi[ind]

                # Ancestral allele
                ianc = initial_indices[posdna]
                anc = alpha[ianc]

                # Ignore indels
                if ianc >= 4:
                    continue

                # Skip if the ancestral allele was not dominant initially,
                # or never dropped (i.e. no substitution at this site)
                if (aftpos[0, ianc] < 0.7) or np.min(aftpos[:, ianc]) > 0.2:
                    continue

                # Get codon
                ci = posdna // 3
                rf = posdna % 3
                cod_anc = ''.join(alpha[initial_indices[ci * 3:(ci + 1) * 3]])
                if '-' in cod_anc:
                    continue
                aa_anc = translate(cod_anc)

                # Check which allele (if any) is fixing
                for inuc, nuc in enumerate(alpha[:4]):
                    if nuc == anc:
                        continue
                    if aftpos[-1, inuc] < 0.95:
                        continue
                    # NOTE: OK, it's a substitution (max 1 per site)
                    break
                else:
                    continue

                # Assign a time to the substitution: midpoint between the last
                # time point below 50% and the first above
                ist = (aftpos[:, inuc] > 0.5).nonzero()[0][0]
                tsubst = 0.5 * (timespos[ist - 1] + timespos[ist])

                nuc = alpha[inuc]
                mut = anc + '->' + nuc

                # Define transition/transversion
                if frozenset(nuc + anc) in (frozenset('CT'), frozenset('AG')):
                    trclass = 'ts'
                else:
                    trclass = 'tv'

                # Check syn/nonsyn
                cod_nuc = cod_anc[:rf] + nuc + cod_anc[rf + 1:]
                aa_nuc = translate(cod_nuc)
                is_syn = aa_nuc == aa_anc

                datum = {'pcode': p.name,
                         'region': region,
                         'pos_patient': posdna,
                         'pos_ref': pos_sub,
                         'mut': mut,
                         'trclass': trclass,
                         'syn': is_syn,
                         'time': tsubst,
                        }
                data.append(datum)

    data = pd.DataFrame(data)
    return data
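# Hedged usage sketch: tabulate the substitutions collected above by
# transition/transversion class and synonymity (patients and regions are
# illustrative):
#   subs = collect_substitution_data(['p1', 'p2'], ['gag', 'pol'])
#   print subs.groupby(['trclass', 'syn']).size()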
import argparse
parser = argparse.ArgumentParser(description="make figure for SNP correlations")
parser.add_argument('--redo', action='store_true', help='recalculate data')
params = parser.parse_args()

VERBOSE = 2
pname = 'p11'
n_time = 4

username = os.path.split(os.getenv('HOME'))[-1]
foldername = get_figure_folder(username, 'controls')
fn_data = foldername + 'data/'
fn_data = fn_data + 'allele_frequency_overlap.pickle'

if not os.path.isfile(fn_data) or params.redo:
    patient = Patient.load(pname)
    sample = patient.samples[n_time]  # was `samples`, but `sample` is used below
    data = get_allele_frequency_overlap(sample, overlaps, cov_min=cov_min,
                                        VERBOSE=VERBOSE, qual_min=qual_min)
    estimate_templates_overlaps(sample, data)
    store_data(data, fn_data)
else:
    data = load_data(fn_data)

filename = foldername + 'allele_frequency_overlap'
plot_allele_frequency_overlap(data,
                              VERBOSE=VERBOSE,
                              fig_filename=filename,
                             )
from hivevo.hivevo.patients import Patient
from hivevo.hivevo.samples import all_fragments
from hivevo.hivevo.sequence import alpha
from filenames import get_figure_folder
from util import store_data, load_data, fig_width, fig_fontsize, \
                 patients, patient_colors, HIVEVO_colormap

plt.ion()
sns.set_style('darkgrid')
username = os.path.split(os.getenv('HOME'))[-1]
foldername = get_figure_folder(username, 'first')
cmap = HIVEVO_colormap()

p = Patient.load('p1')
fig, axs = plt.subplots(2, 3, sharey=True, sharex=True)
traj = []
ti = 3
tj = ti + 1
for fi, frag in enumerate(all_fragments):
    ax = axs[fi // 3, fi % 3]
    aft = p.get_allele_frequency_trajectories(frag)
    pos = np.linspace(0, 1, aft.shape[-1])
    for pi in xrange(aft.shape[-1]):
        for ni in xrange(5):
            if aft[0, ni, pi] < 0.5 and (aft[ti, ni, pi] > 0.2 and aft[tj, ni, pi] > 0.2):
                traj.append([frag, pi, aft[:, ni, pi]])
    try:
        for ni in xrange(5):
            ind = (aft[ti, ni, :] * (1 - aft[ti, ni, :]) > 0.01) \
                | (aft[tj, ni, :] * (1 - aft[tj, ni, :]) > 0.01)
def collect_to_away(patients, regions, Sbins=[0, 0.02, 0.08, 0.25, 2],
                    cov_min=1000, subtype='patient'):
    '''Collect allele frequencies polarized from cross-sectional consensus (nucleotides)'''
    minor_variants = []
    to_away_divergence = []
    to_away_minor = []
    consensus_distance = {}
    # if subtype == 'any', meaning comparison to group M, we can load the reference here
    if subtype == 'any':
        hxb2 = HIVreference(refname='HXB2', subtype=subtype)
        good_pos_in_reference = hxb2.get_ungapped(threshold=0.05)

    # determine divergence and minor variation at sites that agree with consensus or not
    for pi, pcode in enumerate(patients):
        try:
            p = Patient.load(pcode)
        except:
            print "Can't load patient", pcode
        else:
            if subtype == 'patient':
                # if we take the subtype of the patient, load its specific reference alignment here
                hxb2 = HIVreference(refname='HXB2', subtype=p['Subtype'])
                good_pos_in_reference = hxb2.get_ungapped(threshold=0.05)
            for region in regions:
                # nucleotide trajectories (the original line passed type='aa',
                # which is inconsistent with the nucleotide indexing below)
                aft = p.get_allele_frequency_trajectories(region, cov_min=cov_min)

                # get patient to subtype map, subset entropy vectors, convert to bits
                patient_to_subtype = p.map_to_external_reference(region, refname='HXB2')
                subtype_entropy = hxb2.get_entropy_in_patient_region(patient_to_subtype) / np.log(2.0)
                ancestral = p.get_initial_indices(region)[patient_to_subtype[:, 2]]
                consensus = hxb2.get_consensus_indices_in_patient_region(patient_to_subtype)
                away_sites = ancestral == consensus
                good_ref = good_pos_in_reference[patient_to_subtype[:, 0]]
                consensus_distance[(pcode, region)] = np.mean(~away_sites)
                print pcode, region, "dist:", 1 - away_sites.mean(), "useful_ref:", good_ref.mean()

                # loop over times and calculate the af in entropy bins
                for t, af in izip(p.dsi, aft):
                    good_af = (((~np.any(af.mask, axis=0))
                                #&(aft[0].max(axis=0)>0.9)
                                & (af.argmax(axis=0) < 4))[patient_to_subtype[:, 2]]) \
                              & good_ref

                    # make versions of all arrays that contain only unmasked, ungapped sites
                    clean_af = af[:, patient_to_subtype[:, 2]][:5, good_af]
                    clean_away = away_sites[good_af]
                    clean_consensus = consensus[good_af]
                    clean_ancestral = ancestral[good_af]
                    clean_entropy = subtype_entropy[good_af]
                    clean_entropy_bins = [(clean_entropy >= t_lower) & (clean_entropy < t_upper)
                                          for t_lower, t_upper in zip(Sbins[:-1], Sbins[1:])]
                    clean_minor = clean_af.sum(axis=0) - clean_af.max(axis=0)
                    clean_derived = clean_af.sum(axis=0) \
                        - clean_af[clean_ancestral, np.arange(clean_ancestral.shape[0])]
                    print pcode, region, t

                    # for each entropy bin, calculate the average divergence and minor variation
                    for sbin, sites in enumerate(clean_entropy_bins):
                        minor_variants.append({'pcode': pcode, 'region': region,
                                               'time': t, 'S_bin': sbin,
                                               'af_away_minor': np.mean(clean_minor[sites & clean_away]),
                                               'af_away_derived': np.mean(clean_derived[sites & clean_away]),
                                               'af_to_minor': np.mean(clean_minor[sites & (~clean_away)]),
                                               'af_to_derived': np.mean(clean_derived[sites & (~clean_away)])})

                    # calculate the minor variation at sites where the founder differs
                    # from consensus, in different allele frequency bins
                    clean_reversion = clean_af[clean_consensus, np.arange(clean_consensus.shape[0])] \
                        * (~clean_away)
                    clean_total_divergence = clean_af.sum(axis=0) \
                        - clean_af[clean_ancestral, np.arange(clean_ancestral.shape[0])]
                    to_away_divergence.append({'pcode': pcode, 'region': region, 'time': t,
                                               'reversion': np.mean(clean_reversion),
                                               'divergence': np.mean(clean_total_divergence)})

                    af_thres = [0, 0.05, 0.1, 0.25, 0.5, 0.95, 1.0]
                    rev_tmp = clean_af[clean_consensus, np.arange(clean_consensus.shape[0])][~clean_away]
                    der_tmp = clean_derived[~clean_away]
                    for ai, (af_lower, af_upper) in enumerate(zip(af_thres[:-1], af_thres[1:])):
                        to_away_minor.append({'pcode': pcode, 'region': region,
                                              'time': t, 'af_bin': ai,
                                              'reversion_spectrum': np.mean(rev_tmp * (rev_tmp >= af_lower) * (rev_tmp < af_upper)),
                                              'minor_reversion_spectrum': np.mean(der_tmp * (der_tmp >= af_lower) * (der_tmp < af_upper))})

    return (pd.DataFrame(minor_variants),
            pd.DataFrame(to_away_divergence),
            pd.DataFrame(to_away_minor),
            consensus_distance)