def loadK31(reg, filepath, fromHIV=False):
    '''
    Loading data for 31 additional patients

    Input arguments:
    reg: name of genetic region (gag or pol)
    filepath: path to directory where the frequency data are to be
        stored/downloaded; it is concatenated directly with the file names,
        so it has to end with a path separator
    fromHIV: download raw data and store them, if True; use stored data, if False

    Output arguments:
    data: dictionary mapping '<patient code>_<sample code>' to a tuple (TI, af),
        where TI is the number of days between the "infect date best" of the
        patient and the sample date, and af holds the allele frequencies of
        the sample (a numpy masked array when read back from disk)
    '''
    info_path = filepath + 'K31_info_{}.txt'.format(reg)
    data = {}
    if fromHIV:
        # hivwholeseq is imported from a hard-coded location, hence the local
        # import after extending the module search path
        sys.path.append("/scicore/home/neher/neher/HIV/hivwholeseq")
        from hivwholeseq.patients.patients import load_patients, Patient
        pats = load_patients(csv=True)
        fmt = "%d/%m/%Y"
        with open(info_path, 'w') as fhandle:
            for pcode, pat in pats.iterrows():
                try:
                    EDI = datetime.strptime(pat["infect date best"], fmt)
                    P = Patient(pat)
                    aft = P.get_allele_frequency_trajectories(reg, cov_min=500)[0]
                    for si, (scode, sample) in enumerate(P.samples.iterrows()):
                        try:
                            date = datetime.strptime(sample["date"], fmt)
                            af = aft[si]
                            TI = date.toordinal() - EDI.toordinal()
                            prefix = filepath + '{}_{}_{}'.format(pcode, scode, reg)
                            # Save the arrays BEFORE registering the sample in
                            # the info file, so that every listed sample is
                            # guaranteed to have its data on disk
                            np.save(prefix + '_data.npy', af.data)
                            np.save(prefix + '_mask.npy', af.mask)
                            fhandle.write('{}\t{}\t{}\n'.format(pcode, scode, TI))
                            data['{}_{}'.format(pcode, scode)] = (TI, af)
                            print(pcode, scode, "WORKED!!!")
                        except Exception as err:
                            # Best effort: report the failing sample and go on
                            print(scode, "didn't work", err)
                except Exception as err:
                    # Best effort: report the failing patient and go on
                    print("skipping patient ", pcode, err)
    else:
        with open(info_path, 'r') as fhandle:
            for line in fhandle:
                words = line.split()
                if not words:
                    # tolerate blank lines in the info file
                    continue
                pat_name = '_'.join(words[:2])
                prefix = filepath + '{}_{}'.format(pat_name, reg)
                af_data = np.load(prefix + '_data.npy')
                af_mask = np.load(prefix + '_mask.npy')
                af = np.ma.masked_array(af_data, mask=af_mask)
                data[pat_name] = (int(words[2]), af)
    return data
def ttest_sliding_window(func_name, cutoff, ws, lstep=10, Ncr=5):
    '''
    Leave-one-patient-out time estimates along a sliding genome window

    Input arguments:
    func_name: diversity measure, passed on to EDI.sliding_window
    cutoff: cutoff value, passed on to EDI.sliding_window
    ws: passed on to EDI.sliding_window (presumably the window size); also
        used to normalize NNk before the comparison with fcr
    lstep: stride with which the windows are subsampled
    Ncr: minimal number of unmasked samples a window needs to be kept

    Output arguments:
    ttk_est: masked array of estimated times, one column per kept window
    dtdx_t0: coefficients of the fit done for the last left-out patient
    ttk, xxk, NNk, jjk: the filtered (and, for xxk/NNk, subsampled) data
    '''
    windows = EDI.sliding_window(data, func_name, ws, cutoff)

    # Samples with times strictly between Tmin and Tmax
    later_than_tmin = windows.ttk > Tmin
    earlier_than_tmax = windows.ttk < Tmax
    keep = np.where(later_than_tmin * earlier_than_tmax)[0]
    ttk = windows.ttk[keep]
    jjk = windows.jjk[keep]
    Npat = np.max(windows.jjk) + 1

    # Every lstep-th window; entries with NNk / ws below fcr are masked
    xxk = windows.xxk[keep, :][:, ::lstep]
    NNk = windows.NNk[keep, :][:, ::lstep]
    xxk = np.ma.masked_where(NNk / ws < fcr, xxk)

    ttk_est = np.ma.zeros(xxk.shape)
    for jpat in xrange(Npat):
        held_out = np.where(jjk == jpat)[0]
        training = np.where(jjk != jpat)[0]
        # Fit on all the other patients, then apply to the left-out one
        _, dtdx_t0 = EDI.EDI_LAD_multisite(ttk[training], xxk[training, :])
        slope = dtdx_t0[0, :]
        intercept = dtdx_t0[1, :]
        ttk_est[held_out, :] = slope * xxk[held_out, :] + intercept

    # Mask entire windows having fewer than Ncr unmasked samples
    msk = np.zeros_like(xxk)
    n_unmasked = np.sum(1 - xxk.mask, axis=0)
    msk[:, np.where(n_unmasked < Ncr)[0]] = 1
    ttk_est = np.ma.masked_where(msk, ttk_est)
    return ttk_est, dtdx_t0, ttk, xxk, NNk, jjk
def TI_from_diversity(DD, j0jL, cutoff, nboot=None, rf=rframe):
    '''
    Estimate the time of infection (TI) from the specified diversity values

    Input arguments:
    DD: list/array of diversity values
    j0jL: tuple specifying the genetic region to use
    cutoff: lower cutoff value, xc
    nboot: number of bootstraps over different patients
        (if None, then no bootstrapping)
    rf: passed on to EDI.window_cutoff

    Output arguments:
    TTest: estimated times of infection
        (with rows corresponding to bootstrap realizations)
    dtdx_t0: slope and intercept values
        (with rows corresponding to bootstrap realizations)
    '''
    CUT = EDI.window_cutoff(data, func_name, region(j0jL), cutoff, rf=rf)
    ttk, xxk, jjk = CUT.realdata(Tmin, Tmax, fcr=fcr, vload_min=vload_min,
                                 dilutions_min=dilutions_min)

    # Without bootstrapping a single fit over all the data is enough
    if nboot is None:
        _, coeffs = EDI.fitmeth_byname(ttk, xxk, method=method)
        estimate = coeffs[0] * DD + coeffs[1]
        return estimate, coeffs

    # Bootstrap: resample the patients with replacement, nboot times
    n_patients = len(CUT.pat_names)
    resampled = np.random.randint(0, high=n_patients,
                                  size=(nboot, n_patients))
    TTest = np.zeros((nboot, len(DD)))
    dtdx_t0 = np.zeros((nboot, 2))
    for jboot, idx_boot in enumerate(resampled):
        picks = [np.where(jjk == j) for j in idx_boot]
        tk = np.ma.concatenate([ttk[sel] for sel in picks])
        xk = np.ma.concatenate([xxk[sel] for sel in picks])
        _, dtdx_t0[jboot, :] = EDI.fitmeth_byname(tk, xk, method=method)
        TTest[jboot, :] = dtdx_t0[jboot, 0] * DD + dtdx_t0[jboot, 1]
    return TTest, dtdx_t0
def ax_traj_xt(ax, rf=None):
    '''
    Plot the per-patient trajectories (xxk against ttk) on the given axes

    Input arguments:
    ax: axes object to draw on
    rf: passed on to EDI.window_cutoff

    Output arguments:
    ax: the same axes object, with one dashed, marked line per patient

    The measure, genome window and cutoff are taken from the module-level
    variables measure, j0jL and cutoff.
    '''
    window = EDI.window_cutoff(data, measure, region(j0jL), cutoff, rf=rf)
    ttk, xxk, jjk = window.realdata(Tmin, Tmax, fcr=fcr, vload_min=vload_min,
                                    dilutions_min=dilutions_min)
    for jpat in xrange(Npat):
        # samples belonging to the current patient
        sel = np.where(jjk == jpat)
        times = ttk[sel]
        values = xxk[sel]
        line_format = '--' + marks1[jpat]
        ax.plot(times, values, line_format, c=cols[jpat], markersize=12)
    return ax
def ttest_region(func_name, j0jL, cutoff, method,
                 return_slope=False, return_all=False, rf=rframe):
    '''
    Leave-one-patient-out time estimates for a single genome region

    Input arguments:
    func_name: diversity measure, passed on to EDI.window_cutoff
    j0jL: specification of the genome region (resolved with region())
    cutoff: cutoff value, passed on to EDI.window_cutoff
    method: name of the fitting method, passed on to EDI.fitmeth_byname
    return_slope: if True (and return_all is False), also return dtdx_t0
    return_all: if True, return ttk_est, ttk, xxk, jjk and dtdx_t0
    rf: passed on to EDI.window_cutoff

    Output arguments:
    ttk_est: times estimated for every sample from the fit obtained
        without the patient the sample belongs to
    ttk: the corresponding times returned by CUT.realdata
    xxk, jjk, dtdx_t0: returned only when requested (see above); dtdx_t0
        has one row of fit coefficients per left-out patient
    '''
    CUT = EDI.window_cutoff(data, func_name, region(j0jL), cutoff, rf=rf)
    ttk, xxk, jjk = CUT.realdata(Tmin, Tmax, fcr=fcr, vload_min=vload_min,
                                 dilutions_min=dilutions_min)

    ttk_est = np.zeros(ttk.shape)
    dtdx_t0 = np.zeros((Npat, 2))
    for jpat in xrange(Npat):
        held_out = np.where(jjk == jpat)[0]
        training = np.where(jjk != jpat)[0]
        # Fit on all the other patients ...
        _, dtdx_t0[jpat, :] = EDI.fitmeth_byname(ttk[training], xxk[training],
                                                 method=method)
        # ... and apply the fit to the left-out patient
        slope, intercept = dtdx_t0[jpat, 0], dtdx_t0[jpat, 1]
        ttk_est[held_out] = slope * xxk[held_out] + intercept

    if return_all:
        return ttk_est, ttk, xxk, jjk, dtdx_t0
    if return_slope:
        return ttk_est, ttk, dtdx_t0
    return ttk_est, ttk
def ROC_curve(func_name, j0jL, cutoff, tcr, ax=None):
    '''
    ROC curve for classifying samples as recent (time < tcr) by thresholding
    the diversity value

    Input arguments:
    func_name: diversity measure, passed on to EDI.window_cutoff
    j0jL: specification of the genome region (resolved with region())
    cutoff: cutoff value, passed on to EDI.window_cutoff
    tcr: time threshold; samples with ttk < tcr are the positives
    ax: optional axes; if given, the curve is plotted on it

    Output arguments:
    MM: array of shape (number of thresholds, 2, 2) holding the row-normalized
        contingency tables [[TP, FN], [FP, TN]], so that MM[:, 0, 0] is the
        true positive rate and MM[:, 1, 0] the false positive rate
    area: area under the curve, summed with the right-rectangle rule

    Every unique value of xxk is used as a threshold xcr; a sample is
    predicted positive when xxk < xcr.
    '''
    def contable(ttk, xxk, tcr, xcr):
        # contingency table [[TP, FN], [FP, TN]] for one threshold xcr
        TP = np.count_nonzero((ttk < tcr) * (xxk < xcr))
        FP = np.count_nonzero((ttk >= tcr) * (xxk < xcr))
        FN = np.count_nonzero((ttk < tcr) * (xxk >= xcr))
        TN = np.count_nonzero((ttk >= tcr) * (xxk >= xcr))
        return np.array([[TP, FN], [FP, TN]])

    CUT = EDI.window_cutoff(data, func_name, region(j0jL), cutoff, rf=rframe)
    ttk, xxk, jjk = CUT.realdata(Tmin, Tmax, fcr=fcr, vload_min=vload_min,
                                 dilutions_min=dilutions_min)
    xxcr = np.sort(np.unique(xxk))
    M = np.array([contable(ttk, xxk, tcr, xcr) for xcr in xxcr])

    # The counts are integers: cast to float so that the rates are not
    # truncated by integer division when true division is not in effect.
    # NOTE: a table row summing to zero (no positives or no negatives in the
    # data) yields nan rates, as before.
    MM = M.astype(float) / np.sum(M, axis=2, keepdims=True)

    false_positive_rate = MM[:, 1, 0]
    true_positive_rate = MM[:, 0, 0]
    if ax is not None:
        ax.plot(false_positive_rate, true_positive_rate)
    area = np.sum(np.diff(false_positive_rate) * true_positive_rate[1:])
    return MM, area
def plot_corrcoeff0(j0jL, measures, cutoffs, filename, rf=rframe):
    '''
    Plot pearson correlation coefficients between times and
    corresponding diversity values

    Input arguments:
    j0jL: tuple of initial and final positions of the genome window
    measures: diversity measures
    cutoffs: low frequency cutoffs (array or any sequence)
    filename: path to the file for saving the figure
    rf: passed on to EDI.window_cutoff

    One panel is drawn per measure, showing r^2 as a function of the cutoff,
    with one curve per patient.
    '''
    cutoffs = np.asarray(cutoffs)
    # squeeze=False always returns a 2D array of axes, so that indexing by
    # panel also works when there is only a single measure
    fig, axes = plt.subplots(1, len(measures),
                             figsize=(H * len(measures), 2 * H),
                             sharey=True, squeeze=False)
    ax = axes[0]
    titls = [leg_byname(name) for name in measures]
    for j, measure in enumerate(measures):
        # rxt[jcut, jpat]: correlation coefficient for a cutoff and a patient
        rxt = np.zeros((cutoffs.shape[0], len(data['pat_names'])))
        for jcut, cut in enumerate(cutoffs):
            CUT = EDI.window_cutoff(data, measure, region(j0jL), cut, rf=rf)
            ttk_all, xxk_all, jjk = CUT.realdata(Tmin, Tmax, fcr=fcr,
                                                 vload_min=vload_min,
                                                 dilutions_min=dilutions_min)
            for jpat in xrange(Npat):
                idx = np.where(jjk == jpat)
                rxt[jcut, jpat] = np.corrcoef(ttk_all[idx], xxk_all[idx])[0, 1]
        for jr, r in enumerate(rxt.T):
            ax[j].plot(cutoffs, r**2, styles[jr])
        ax[j].set_title(titls[j], fontsize=fs1)
        ax[j].tick_params(labelsize=.8 * fs1)
        ax[j].set_xlabel(r'$x_c$', fontsize=fs1)
        ax[j].set_xticks(np.arange(0., .5, .1))
    # The y axis is shared, so legend and y label go on the first panel only
    ax[0].legend(data['pat_names'], fontsize=0.8 * fs1, loc=0)
    ax[0].set_ylabel(r'$r^2$', fontsize=fs1)
    fig.subplots_adjust(hspace=0.1)
    fig.savefig(filename)
    plt.close(fig)
    return None
func_name = funcnames[names.index(measure)] fcr = 0.5 Tmin = 0 Tmax = 9 vload_min = None dilutions_min = None method = 'LAD' rframe = 2 #reference frame; set to None to use all sites fs = 28 H = 8 #loading frequency data datapath = './Frequency_Data/' data = EDI.load_patient_data(patient_names='all', filepath=datapath) Npat = len(data['pat_names']) def region(j0jL): if type(j0jL) is str: # The genome annotations head = ['name', 'x1', 'x2', 'width', 'ri'] annot = [] with open(datapath + 'annotations.txt', 'r') as fhandle: for line in fhandle: l = [ x if j == 0 else int(x) for j, x in enumerate(line.split()) ] annot.append({name: l[j] for j, name in enumerate(head)}) coords = {anno['name']: (anno['x1'], anno['x2']) for anno in annot}
def plot_slope_bootstrap(j0jL, func_name, cutoff, filename, nboot=10**3):
    '''
    Bootstrap plot of slope and intercept values

    Input arguments:
    j0jL: tuple of initial and final positions of the genome window
    func_name: diversity measure
    cutoff: low frequency cutoff
    filename: path to the file for saving the figure
    nboot: number of bootstrap realizations

    Three figures are written: the two marginal histograms (filename), a 2D
    histogram ('<filename without extension>_2d.pdf') and a seaborn joint
    plot ('<filename without extension>_joint.pdf').
    '''
    import os  # local import, used only to split off the file extension

    CUT = EDI.window_cutoff(data, func_name, region(j0jL), cutoff, rf=rframe)
    ttk, xxk, jjk = CUT.realdata(Tmin, Tmax, fcr=fcr, vload_min=vload_min,
                                 dilutions_min=dilutions_min)

    # Resample the patients with replacement and refit for every realization
    dtdx_t0 = np.zeros((nboot, 2))
    jjboot = np.random.randint(0, high=Npat, size=(nboot, Npat))
    for jboot, idx_boot in enumerate(jjboot):
        tk = np.ma.concatenate([ttk[np.where(jjk == j)] for j in idx_boot])
        xk = np.ma.concatenate([xxk[np.where(jjk == j)] for j in idx_boot])
        _, dtdx_t0[jboot, :] = EDI.fitmeth_byname(tk, xk, method=method)

    label_s = 's [years/diversity]'
    label_t0 = r'$t_0$' + '[years]'
    # Strip the extension whatever its length (not just three characters)
    stem = os.path.splitext(filename)[0]

    # Marginal histograms of the two fit coefficients
    fig, ax = plt.subplots(1, 2, figsize=(2 * H, H), sharey=True)
    ax[0].hist(dtdx_t0[:, 0], alpha=0.5)
    ax[1].hist(dtdx_t0[:, 1], alpha=0.5)
    ax[0].set_xlabel(label_s, fontsize=fs)
    ax[0].tick_params(labelsize=.8 * fs)
    ax[1].set_xlabel(label_t0, fontsize=fs)
    ax[1].tick_params(labelsize=.8 * fs)
    fig.savefig(filename)
    plt.close(fig)

    # Joint distribution as a 2D histogram
    fig, ax = plt.subplots(1, 1, figsize=(1.2 * H, H))
    _, _, _, cax = ax.hist2d(dtdx_t0[:, 0], dtdx_t0[:, 1], cmap=plt.cm.Blues)
    ax.set_xlabel(label_s, fontsize=fs)
    ax.set_ylabel(label_t0, fontsize=fs)
    ax.tick_params(labelsize=.8 * fs)
    cbar = fig.colorbar(cax)
    cbar.ax.tick_params(labelsize=.8 * fs)
    fig.tight_layout()
    fig.savefig(stem + '_2d.pdf')
    plt.close(fig)

    # Joint distribution as a kernel density estimate
    # NOTE(review): stat_func, size and shade_lowest look like arguments of
    # older seaborn releases - confirm against the installed version
    sns_plot = sns.jointplot(dtdx_t0[:, 0], dtdx_t0[:, 1], stat_func=None,
                             size=H, kind='kde',
                             joint_kws={'shade_lowest': False})
    sns_plot.set_axis_labels(xlabel=label_s, ylabel=label_t0, fontsize=fs)
    sns_plot.ax_joint.tick_params(labelsize=.8 * fs)
    sns_plot.savefig(stem + '_joint.pdf')
    plt.close(sns_plot.fig)
    return None
# The genome annotations: one whitespace-separated record per line, the first
# field being the name and the remaining ones integers
head = ['name', 'x1', 'x2', 'width', 'ri']
annot = []
with open(datapath + 'annotations.txt', 'r') as fhandle:
    for line in fhandle:
        fields = line.split()
        if not fields:
            # tolerate blank lines in the annotation file
            continue
        fields = [x if j == 0 else int(x) for j, x in enumerate(fields)]
        annot.append({name: fields[j] for j, name in enumerate(head)})
# (x1, x2) coordinates of every annotated feature, by name
coords = {anno['name']: (anno['x1'], anno['x2']) for anno in annot}
feas = ['gag', 'pol', 'env']

#loading frequency data
pnames = 'all'
#pnames = ['p{}'.format(j+1) for j in xrange(11)]
#pnames.remove('p6')
#pnames.remove('p1')
#pnames.remove('p3')
data = EDI.load_patient_data(patient_names=pnames, filepath=datapath)
Npat = len(data['pat_names'])


def leg_byname(funcname):
    '''
    Return the legend text of a diversity measure

    Input arguments:
    funcname: one of 'ambiguous_above', 'hamming_above', 'entropy_above'
        (any other value raises ValueError)
    '''
    legs = ['polymorphic sites', 'diversity', 'site entropy']
    heads = ['ambiguous_above', 'hamming_above', 'entropy_above']
    return legs[heads.index(funcname)]


def region(j0jL):
    '''
    Resolve a genome region specification

    Input arguments:
    j0jL: either the name of an annotated feature (a string, looked up in
        coords) or anything else, which is returned unchanged
    '''
    if isinstance(j0jL, str):
        return coords[j0jL]
    return j0jL
# Output directory for the figures
outdir_name = './plotK31/'
if not os.path.exists(outdir_name):
    os.makedirs(outdir_name)

#Creating figures for the manuscript
measure = 'diversity'
meas = translate_measures(measure)
cutoff1 = 0.002
for reg in ['gag', 'pol']:
    # call form of print: same output for a single argument, and valid
    # whether print is a statement or a function
    print(reg)
    j0jL = TI.region(reg)

    #Loading and processing the training set data (11 patients)
    CUT = EDI.window_cutoff(TI.data, meas, j0jL, cutoff1, rf=rframe)
    ttk, xxk, jjk = CUT.realdata(Tmin, Tmax, fcr=fcr, vload_min=vload_min,
                                 dilutions_min=dilutions_min)
    ttk_est, dtdx = TI.TI_from_diversity(xxk, reg, cutoff1, rf=rframe)

    #Loading and processing the validation dataset data (31 patients)
    K31data = loadK31(reg, './K31_data/{}/'.format(reg), fromHIV=False)
    TT, DD, pats, samples = K31_diversity(K31data, cutoff1, verbose=True)
    TTest, dtdx_t0 = TI.TI_from_diversity(DD, j0jL, cutoff1, rf=rframe)

    # training samples not later than the latest validation sample
    TTmax = np.max(TT)
    jj = np.where(ttk <= TTmax)