예제 #1
0
def loadK31(reg, filepath, fromHIV=False):
    '''
    Load allele frequency data for the 31 additional patients

    Input arguments:
    reg: name of genetic region (gag or pol)
    filepath: path prefix of the directory holding the frequency data; file
        names are appended to it directly, so it must end with a separator
    fromHIV: if True, extract the data through hivwholeseq and store them
        under filepath; if False, read the previously stored files

    Output arguments:
    data: dictionary mapping '<patient code>_<sample code>' to a tuple
        (days between the patient's "infect date best" and the sample date,
        masked array of allele frequencies)
    '''
    data = {}
    info_path = filepath + 'K31_info_{}.txt'.format(reg)
    if fromHIV:
        sys.path.append("/scicore/home/neher/neher/HIV/hivwholeseq")
        from hivwholeseq.patients.patients import load_patients, Patient
        pats = load_patients(csv=True)
        fmt = "%d/%m/%Y"
        # The context manager guarantees the index file is flushed and closed
        # even if something unexpected propagates out of the loop.
        with open(info_path, 'w') as fhandle:
            for pcode, pat in pats.iterrows():
                try:
                    infect_date = datetime.strptime(pat["infect date best"],
                                                    fmt)
                    P = Patient(pat)
                    aft = P.get_allele_frequency_trajectories(
                        reg, cov_min=500)[0]
                    for si, (scode, sample) in enumerate(P.samples.iterrows()):
                        try:
                            date = datetime.strptime(sample["date"], fmt)
                            af = aft[si]
                            TI = date.toordinal() - infect_date.toordinal()
                            stem = filepath + '{}_{}_{}'.format(
                                pcode, scode, reg)
                            np.save(stem + '_data.npy', af.data)
                            np.save(stem + '_mask.npy', af.mask)
                            # Index the sample only after both arrays are on
                            # disk, so the index never points at missing files.
                            fhandle.write('{}\t{}\t{}\n'.format(
                                pcode, scode, TI))
                            data['{}_{}'.format(pcode, scode)] = (TI, af)
                            print(pcode, scode, "WORKED!!!")
                        except Exception as err:
                            # Best effort: a bad sample must not abort the run,
                            # but Ctrl-C / SystemExit are no longer swallowed.
                            print(scode, "didn't work", err)
                except Exception as err:
                    print("skipping patient ", pcode, err)
    else:
        with open(info_path, 'r') as fhandle:
            for line in fhandle:
                words = line.split()
                if not words:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                pat_name = '_'.join(words[:2])
                stem = filepath + '{}_{}'.format(pat_name, reg)
                af_data = np.load(stem + '_data.npy')
                af_mask = np.load(stem + '_mask.npy')
                af = np.ma.masked_array(af_data, mask=af_mask)
                data[pat_name] = (int(words[2]), af)
    return data
예제 #2
0
def ttest_sliding_window(func_name, cutoff, ws, lstep=10, Ncr=5):
    '''
    Leave-one-patient-out time estimates from sliding-window data

    For every patient index the multisite fit is done on all other patients'
    samples and then applied to the held-out patient's samples.

    Input arguments:
    func_name: measure name passed to EDI.sliding_window
    cutoff: cutoff value passed to EDI.sliding_window
    ws: window size; also used to normalize the NNk counts
    lstep: stride used to subsample the window positions (columns)
    Ncr: columns with fewer than Ncr unmasked entries are masked in the output

    Output arguments:
    ttk_est, dtdx_t0 (from the last held-out patient), ttk, xxk, NNk, jjk
    '''
    window = EDI.sliding_window(data, func_name, ws, cutoff)

    # Keep only samples with Tmin < t < Tmax.
    keep = np.where((window.ttk > Tmin) * (window.ttk < Tmax))[0]
    ttk = window.ttk[keep]
    jjk = window.jjk[keep]
    Npat = np.max(window.jjk) + 1
    NNk = window.NNk[keep, :][:, ::lstep]
    xxk = window.xxk[keep, :][:, ::lstep]
    # NOTE(review): under Python 2 without true division, NNk / ws floors
    # if both operands are integers - confirm the dtype of NNk.
    xxk = np.ma.masked_where(NNk / ws < fcr, xxk)

    ttk_est = np.ma.zeros(xxk.shape)
    for jpat in xrange(Npat):
        held_out = np.where(jjk == jpat)[0]
        training = np.where(jjk != jpat)[0]
        _, dtdx_t0 = EDI.EDI_LAD_multisite(ttk[training], xxk[training, :])
        slope, intercept = dtdx_t0[0, :], dtdx_t0[1, :]
        ttk_est[held_out, :] = slope * xxk[held_out, :] + intercept

    # Mask whole columns that have too few usable samples.
    sparse_cols = np.where(np.sum(1 - xxk.mask, axis=0) < Ncr)[0]
    msk = np.zeros_like(xxk)
    msk[:, sparse_cols] = 1
    ttk_est = np.ma.masked_where(msk, ttk_est)
    return ttk_est, dtdx_t0, ttk, xxk, NNk, jjk
예제 #3
0
def TI_from_diversity(DD, j0jL, cutoff, nboot=None, rf=rframe):
    '''
    Estimate the time of infection (TI) from the specified diversity values

    Input arguments:
    DD: list/array of diversity values
    j0jL: specification of the genetic region to use (resolved via region())
    cutoff: lower cutoff value, xc
    nboot: number of bootstraps over different patients
        (if None, then no bootstrapping)
    rf: forwarded to EDI.window_cutoff as its rf argument

    Output arguments:
    TTest: estimated times of infection
        (with rows corresponding to bootstrap realizations)
    dtdx_t0: slope and intercept values
        (with rows corresponding to bootstrap realizations)
    '''
    window = EDI.window_cutoff(data, func_name, region(j0jL), cutoff, rf=rf)
    ttk, xxk, jjk = window.realdata(Tmin, Tmax, fcr=fcr, vload_min=vload_min,
                                    dilutions_min=dilutions_min)

    # Single fit on the full data set.
    if nboot is None:
        _, dtdx_t0 = EDI.fitmeth_byname(ttk, xxk, method=method)
        return dtdx_t0[0] * DD + dtdx_t0[1], dtdx_t0

    # Bootstrap: resample patients with replacement and refit each time.
    Npat = len(window.pat_names)
    jjboot = np.random.randint(0, high=Npat, size=(nboot, Npat))
    TTest = np.zeros((nboot, len(DD)))
    dtdx_t0 = np.zeros((nboot, 2))
    for jboot, resampled in enumerate(jjboot):
        picks = [np.where(jjk == j) for j in resampled]
        tk = np.ma.concatenate([ttk[sel] for sel in picks])
        xk = np.ma.concatenate([xxk[sel] for sel in picks])
        _, dtdx_t0[jboot, :] = EDI.fitmeth_byname(tk, xk, method=method)
        TTest[jboot, :] = dtdx_t0[jboot, 0] * DD + dtdx_t0[jboot, 1]
    return TTest, dtdx_t0
예제 #4
0
 def ax_traj_xt(ax, rf=None):
     '''
     Draw one dashed trajectory per patient (xxk against ttk) on ax

     Input arguments:
     ax: matplotlib axes to draw on
     rf: forwarded to EDI.window_cutoff as its rf argument

     Output arguments:
     ax: the same axes object
     '''
     window = EDI.window_cutoff(data, measure, region(j0jL), cutoff, rf=rf)
     times, values, pat_idx = window.realdata(Tmin, Tmax, fcr=fcr,
                                              vload_min=vload_min,
                                              dilutions_min=dilutions_min)
     for jpat in xrange(Npat):
         sel = np.where(pat_idx == jpat)
         line_style = '--' + marks1[jpat]
         ax.plot(times[sel], values[sel], line_style, c=cols[jpat],
                 markersize=12)
     return ax
예제 #5
0
def ttest_region(func_name, j0jL, cutoff, method,
                 return_slope=False, return_all=False, rf=rframe):
    '''
    Leave-one-patient-out time estimates for a genome region

    For each patient index the fit is done on the samples of all other
    patients and applied to the held-out patient's samples.

    Input arguments:
    func_name: measure name passed to EDI.window_cutoff
    j0jL: specification of the genetic region (resolved via region())
    cutoff: cutoff value passed to EDI.window_cutoff
    method: fitting method name passed to EDI.fitmeth_byname
    return_slope: if True, also return the per-patient fit parameters
    return_all: if True, return everything (takes precedence over
        return_slope)
    rf: forwarded to EDI.window_cutoff as its rf argument

    Output arguments:
    (ttk_est, ttk), or (ttk_est, ttk, dtdx_t0) if return_slope,
    or (ttk_est, ttk, xxk, jjk, dtdx_t0) if return_all
    '''
    window = EDI.window_cutoff(data, func_name, region(j0jL), cutoff, rf=rf)
    ttk, xxk, jjk = window.realdata(Tmin, Tmax, fcr=fcr, vload_min=vload_min,
                                    dilutions_min=dilutions_min)

    ttk_est = np.zeros(ttk.shape)
    dtdx_t0 = np.zeros((Npat, 2))
    for jpat in xrange(Npat):
        held_out = np.where(jjk == jpat)[0]
        training = np.where(jjk != jpat)[0]
        _, dtdx_t0[jpat, :] = EDI.fitmeth_byname(ttk[training], xxk[training],
                                                 method=method)
        slope, intercept = dtdx_t0[jpat, 0], dtdx_t0[jpat, 1]
        ttk_est[held_out] = slope * xxk[held_out] + intercept

    if return_all:
        return ttk_est, ttk, xxk, jjk, dtdx_t0
    if return_slope:
        return ttk_est, ttk, dtdx_t0
    return ttk_est, ttk
예제 #6
0
def ROC_curve(func_name, j0jL, cutoff, tcr, ax=None):
    '''
    ROC curve for calling a sample "early" (time < tcr) when its measure
    value is below a threshold

    Every unique measure value is used in turn as the threshold xcr.

    Input arguments:
    func_name: measure name passed to EDI.window_cutoff
    j0jL: specification of the genetic region (resolved via region())
    cutoff: cutoff value passed to EDI.window_cutoff
    tcr: time threshold separating the two classes
    ax: optional matplotlib axes; if given, the curve is plotted on it

    Output arguments:
    MM: array of row-normalized 2x2 contingency tables, one per threshold,
        laid out as [[TP, FN], [FP, TN]]
    area: sum of diff(false positive rate) * true positive rate, i.e. a
        right-endpoint Riemann sum of the area under the curve
    '''
    def contable(ttk, xxk, tcr, xcr):
        # Contingency table [[TP, FN], [FP, TN]] for one threshold xcr.
        TP = np.count_nonzero((ttk < tcr) * (xxk < xcr))
        FP = np.count_nonzero((ttk >= tcr) * (xxk < xcr))
        FN = np.count_nonzero((ttk < tcr) * (xxk >= xcr))
        TN = np.count_nonzero((ttk >= tcr) * (xxk >= xcr))
        return np.array([[TP, FN], [FP, TN]])

    CUT = EDI.window_cutoff(data, func_name, region(j0jL), cutoff, rf=rframe)
    ttk, xxk, jjk = CUT.realdata(Tmin,
                                 Tmax,
                                 fcr=fcr,
                                 vload_min=vload_min,
                                 dilutions_min=dilutions_min)

    xxcr = np.sort(np.unique(xxk))
    # Cast the integer counts to float so that the normalization below is a
    # true division on every Python version (integer arrays floor-divide
    # under Python 2 without `from __future__ import division`).
    M = np.array([contable(ttk, xxk, tcr, xcr) for xcr in xxcr], dtype=float)
    # Normalize each table row: row 0 by the number of samples with
    # ttk < tcr, row 1 by the number with ttk >= tcr.
    MM = M / np.sum(M, axis=2, keepdims=True)
    if ax is not None:
        # x: false positive rate, y: true positive rate
        ax.plot(MM[:, 1, 0], MM[:, 0, 0])
    return MM, np.sum(np.diff(MM[:, 1, 0]) * MM[1:, 0, 0])
예제 #7
0
def plot_corrcoeff0(j0jL, measures, cutoffs, filename, rf=rframe):
    '''
    Plot squared pearson correlation coefficients between times
    and corresponding diversity values, per patient, as a function of
    the cutoff

    Input arguments:
    j0jL: tuple of initial and final positions of the genome window
    measures: diversity measures (one panel is drawn per measure)
    cutoffs: low frequency cutoffs (any sequence with a length)
    filename: path to the file for saving the figure
    rf: forwarded to EDI.window_cutoff as its rf argument
    '''
    # squeeze=False always returns a 2D array of axes, so indexing by panel
    # also works when only a single measure is given.
    fig, axes = plt.subplots(1, len(measures),
                             figsize=(H * len(measures), 2 * H),
                             sharey=True, squeeze=False)
    ax = axes[0]
    titls = [leg_byname(name) for name in measures]

    for j, measure in enumerate(measures):
        # rxt[jcut, jpat]: correlation coefficient for one cutoff and patient
        rxt = np.zeros((len(cutoffs), len(data['pat_names'])))
        for jcut, cut in enumerate(cutoffs):
            CUT = EDI.window_cutoff(data, measure, region(j0jL), cut, rf=rf)
            ttk_all, xxk_all, jjk = CUT.realdata(Tmin, Tmax, fcr=fcr,
                                                 vload_min=vload_min,
                                                 dilutions_min=dilutions_min)
            for jpat in xrange(Npat):
                idx = np.where(jjk == jpat)
                ttk = ttk_all[idx]
                xxk = xxk_all[idx]
                rxt[jcut, jpat] = np.corrcoef(ttk, xxk)[0, 1]

        # One curve per patient.
        for jr, r in enumerate(rxt.T):
            ax[j].plot(cutoffs, r**2, styles[jr])
        ax[j].set_title(titls[j], fontsize=fs1)
        ax[j].tick_params(labelsize=.8 * fs1)
        ax[j].set_xlabel(r'$x_c$', fontsize=fs1)
        ax[j].set_xticks(np.arange(0., .5, .1))
    ax[0].legend(data['pat_names'], fontsize=0.8 * fs1, loc=0)
    ax[0].set_ylabel(r'$r^2$', fontsize=fs1)
    fig.subplots_adjust(hspace=0.1)
    # Save and close this figure explicitly rather than whatever pyplot
    # currently considers the active one.
    fig.savefig(filename)
    plt.close(fig)
    return None
예제 #8
0
# Look up the function name that corresponds to the selected measure;
# `names` and `funcnames` are used as parallel lists and, like `measure`,
# are defined outside this excerpt.
func_name = funcnames[names.index(measure)]

# Analysis settings.
# NOTE(review): Tmin, Tmax, fcr, vload_min and dilutions_min look like the
# arguments that the analysis functions forward to CUT.realdata(...), and
# `method` like the fit selector given to EDI.fitmeth_byname - confirm
# against the callers in this module.
fcr = 0.5
Tmin = 0
Tmax = 9
vload_min = None
dilutions_min = None
method = 'LAD'
rframe = 2  # reference frame; set to None to use all sites

# Plot settings (presumably the base font size and the base figure size
# used by the plotting functions - confirm).
fs = 28
H = 8

# Load the frequency data for all patients
datapath = './Frequency_Data/'
data = EDI.load_patient_data(patient_names='all', filepath=datapath)
# Number of patients in the loaded data set
Npat = len(data['pat_names'])


def region(j0jL):
    if type(j0jL) is str:
        # The genome annotations
        head = ['name', 'x1', 'x2', 'width', 'ri']
        annot = []
        with open(datapath + 'annotations.txt', 'r') as fhandle:
            for line in fhandle:
                l = [
                    x if j == 0 else int(x) for j, x in enumerate(line.split())
                ]
                annot.append({name: l[j] for j, name in enumerate(head)})
        coords = {anno['name']: (anno['x1'], anno['x2']) for anno in annot}
예제 #9
0
def plot_slope_bootstrap(j0jL, func_name, cutoff, filename, nboot=10**3):
    '''
    Bootstrap plot of slope and intercept values

    Patients are resampled with replacement nboot times and the fit is
    repeated for every resample. Three figures are written:
    filename (marginal histograms), <stem>_2d.pdf (2D histogram) and
    <stem>_joint.pdf (seaborn KDE joint plot), where <stem> is filename
    without its extension.

    Input arguments:
    j0jL: tuple of initial and final positions of the genome window
    func_name: diversity measure
    cutoff: low frequency cutoff
    filename: path to the file for saving the figure
    nboot: number of bootstrap realizations
    '''
    import os  # local import keeps this function self-contained

    CUT = EDI.window_cutoff(data, func_name, region(j0jL), cutoff, rf=rframe)
    ttk, xxk, jjk = CUT.realdata(Tmin,
                                 Tmax,
                                 fcr=fcr,
                                 vload_min=vload_min,
                                 dilutions_min=dilutions_min)

    # dtdx_t0[jboot] = (slope, intercept) of the fit for one resample
    dtdx_t0 = np.zeros((nboot, 2))
    jjboot = np.random.randint(0, high=Npat, size=(nboot, Npat))
    for jboot, idx_boot in enumerate(jjboot):
        tk = np.ma.concatenate([ttk[np.where(jjk == j)] for j in idx_boot])
        xk = np.ma.concatenate([xxk[np.where(jjk == j)] for j in idx_boot])
        ttk_est, dtdx_t0[jboot, :] = EDI.fitmeth_byname(tk, xk, method=method)

    label_s = 's [years/diversity]'
    label_t0 = r'$t_0$' + '[years]'
    # Strip the real extension instead of assuming it is three characters
    # long (filename[:-4] mangled names such as 'fig.jpeg' or 'fig').
    stem = os.path.splitext(filename)[0]

    # Figure 1: marginal histograms of slope and intercept
    fig, ax = plt.subplots(1, 2, figsize=(2 * H, H), sharey=True)
    ax[0].hist(dtdx_t0[:, 0], alpha=0.5)
    ax[1].hist(dtdx_t0[:, 1], alpha=0.5)
    ax[0].set_xlabel(label_s, fontsize=fs)
    ax[0].tick_params(labelsize=.8 * fs)
    ax[1].set_xlabel(label_t0, fontsize=fs)
    ax[1].tick_params(labelsize=.8 * fs)
    fig.savefig(filename)
    plt.close(fig)

    # Figure 2: joint 2D histogram
    fig, ax = plt.subplots(1, 1, figsize=(1.2 * H, H))
    Hist, xedges, yedges, cax = ax.hist2d(dtdx_t0[:, 0], dtdx_t0[:, 1],
                                          cmap=plt.cm.Blues)
    ax.set_xlabel(label_s, fontsize=fs)
    ax.set_ylabel(label_t0, fontsize=fs)
    ax.tick_params(labelsize=.8 * fs)
    cbar = fig.colorbar(cax)
    cbar.ax.tick_params(labelsize=.8 * fs)
    fig.tight_layout()
    fig.savefig(stem + '_2d.pdf')
    plt.close(fig)

    # Figure 3: seaborn KDE joint plot
    sns_plot = sns.jointplot(dtdx_t0[:, 0],
                             dtdx_t0[:, 1],
                             stat_func=None,
                             size=H,
                             kind='kde',
                             joint_kws={'shade_lowest': False})
    sns_plot.set_axis_labels(xlabel=label_s, ylabel=label_t0, fontsize=fs)
    sns_plot.ax_joint.tick_params(labelsize=.8 * fs)
    sns_plot.savefig(stem + '_joint.pdf')
    plt.close(sns_plot.fig)
    return None
예제 #10
0
# Genome annotations: every line of annotations.txt is split on whitespace;
# the first field is kept as a string (the feature name) and all remaining
# fields are converted to int. The fields are then keyed by `head`, so each
# line needs at least five fields.
head = ['name', 'x1', 'x2', 'width', 'ri']
annot = []
with open(datapath + 'annotations.txt', 'r') as fhandle:
    for line in fhandle:
        l = [x if j == 0 else int(x) for j, x in enumerate(line.split())]
        annot.append({name: l[j] for j, name in enumerate(head)})
# Map each feature name to its (x1, x2) coordinate pair
coords = {anno['name']: (anno['x1'], anno['x2']) for anno in annot}
feas = ['gag', 'pol', 'env']

# Load the frequency data; pnames = 'all' selects every patient, the
# commented-out lines below show how to restrict the patient set instead.
pnames = 'all'
#pnames = ['p{}'.format(j+1) for j in xrange(11)]
#pnames.remove('p6')
#pnames.remove('p1')
#pnames.remove('p3')
data = EDI.load_patient_data(patient_names=pnames, filepath=datapath)
# Number of patients in the loaded data set
Npat = len(data['pat_names'])


def leg_byname(funcname):
    '''
    Return the legend label for a diversity measure function name

    Input arguments:
    funcname: one of 'ambiguous_above', 'hamming_above', 'entropy_above'

    Raises ValueError for any other name.
    '''
    heads = ['ambiguous_above', 'hamming_above', 'entropy_above']
    legs = ['polymorphic sites', 'diversity', 'site entropy']
    position = heads.index(funcname)
    return legs[position]


def region(j0jL):
    '''
    Resolve a genome region specification

    A value that is exactly of type str is looked up in the module-level
    coords dictionary; anything else is returned unchanged.
    '''
    if type(j0jL) is not str:
        return j0jL
    return coords[j0jL]
예제 #11
0
    # Make sure the output directory for the figures exists
    outdir_name = './plotK31/'
    if not os.path.exists(outdir_name):
        os.makedirs(outdir_name)

    # Create the figures for the manuscript
    measure = 'diversity'
    # translate_measures is defined outside this excerpt; presumably it maps
    # the readable measure name to the name expected by EDI - confirm.
    meas = translate_measures(measure)
    cutoff1 = 0.002

    for reg in ['gag', 'pol']:
        print reg
        j0jL = TI.region(reg)

        # Load and process the training set data (11 patients)
        CUT = EDI.window_cutoff(TI.data, meas, j0jL, cutoff1, rf=rframe)
        ttk, xxk, jjk = CUT.realdata(Tmin,
                                     Tmax,
                                     fcr=fcr,
                                     vload_min=vload_min,
                                     dilutions_min=dilutions_min)
        # NOTE(review): the region *name* (reg) is passed here, whereas the
        # validation call below passes the resolved j0jL - presumably both
        # forms are accepted by TI.TI_from_diversity; confirm.
        ttk_est, dtdx = TI.TI_from_diversity(xxk, reg, cutoff1, rf=rframe)

        # Load and process the validation data set (31 patients)
        K31data = loadK31(reg, './K31_data/{}/'.format(reg), fromHIV=False)
        TT, DD, pats, samples = K31_diversity(K31data, cutoff1, verbose=True)
        TTest, dtdx_t0 = TI.TI_from_diversity(DD, j0jL, cutoff1, rf=rframe)

        # Select the training samples with ttk no larger than the largest
        # TT value of the validation set
        TTmax = np.max(TT)
        jj = np.where(ttk <= TTmax)