Esempio n. 1
0
def negative_diff(b1, b2, thres=-0.3, verbose=True):
    """Keep the entries for which ``b1 - b2`` falls below ``thres``.

    Args:
        b1: pandas object holding the first set of values.
        b2: pandas object holding the second set of values; it is
            subtracted from ``b1``.
        thres: an entry is selected when ``b1 - b2 < thres``.
        verbose: when true, the number of selected entries is reported via
            ``wzcore.err_print``.

    Returns:
        A tuple ``(b1_selected, b2_selected)`` containing only the selected
        rows of each input.
    """
    delta = b1 - b2
    select = delta < thres
    if verbose:
        wzcore.err_print('Selected %d probes' % select.sum())

    picked_b1 = b1.loc[select,]
    picked_b2 = b2.loc[select,]
    return picked_b1, picked_b2
Esempio n. 2
0
def data_load_samples(samples, probes=None):
    """Load pickled beta tables for one or more cancer types and merge them.

    Each cancer type ``X`` is read from ``<data_home>/X.pkl``. The tables are
    concatenated column-wise; columns sharing a name are averaged.

    Args:
        samples: a cancer-type name or a list of names.
        probes: optional row labels; when given, every table is restricted
            to these rows before merging.

    Returns:
        Tuple ``(betas, cancer_types)`` where ``betas`` is the merged table
        and ``cancer_types`` is a Series mapping each column name to the
        cancer type it was loaded from (the last one when a column name
        occurs under several cancer types).

    Raises:
        ValueError: from ``pd.concat`` when ``samples`` is an empty list.
    """
    if not isinstance(samples, list):
        samples = [samples]

    data_home = '/Users/wandingzhou/projects/hs-tcga/data/2015_03_05_TCGA_450/dat/'
    _betases = []
    # Per-file label Series are collected and concatenated once: the former
    # ``Series.append`` in a loop is quadratic and no longer exists in
    # pandas >= 2.0.
    _labels = []

    wzcore.err_print_sig()
    for cancer_type in samples:
        wzcore.err_print_m(' '+cancer_type)
        dat_fn = cancer_type+'.pkl'
        _betas = pd.read_pickle(data_home+'/'+dat_fn)
        if probes is None:
            _betases.append(_betas)
        else:
            _betases.append(_betas.loc[probes,])
        _labels.append(pd.Series([cancer_type]*_betas.shape[1], index=_betas.columns))

    betas = pd.concat(_betases, axis=1)
    cancer_types = pd.concat(_labels)
    wzcore.err_print_m('\n')

    # some cell line samples belong to multiple cancer types, choose the last cancer
    cancer_types = cancer_types.groupby(level=0).last()
    betas = betas.groupby(level=0, axis=1).mean()
    wzcore.err_print('Loaded %d probes and %d samples' % betas.shape)

    return betas, cancer_types
Esempio n. 3
0
    def mutstat(self, samples, gene, verbose=True):
        """Tell, for every sample, whether ``gene`` has a recorded mutation.

        Each sample name is cut to its first 12 characters before it is
        looked up in ``self.sample2muts``.

        Args:
            samples: iterable of sample names; also used as the index of the
                returned Series.
            gene: gene identifier to look for; must be in ``self.genes``.
            verbose: when true, a summary line is written through
                ``wzcore.err_print``.

        Returns:
            ``None`` (after logging a message) when ``gene`` is not in
            ``self.genes``. Otherwise a ``pd.Series`` indexed by ``samples``
            with ``True``/``False`` for samples found in
            ``self.sample2muts`` and ``np.nan`` for the rest.
        """
        if gene not in self.genes:
            wzcore.err_print('Gene ID not identified %s' % gene)
            return None

        status = []
        n_known = 0
        n_mutated = 0
        for name in samples:
            records = self.sample2muts.get(name[:12])
            if records is None:
                # no mutation information for this sample
                status.append(np.nan)
                continue

            hit = any(g == gene for g, _mt, _mt1 in records)
            status.append(hit)
            n_known += 1
            if hit:
                n_mutated += 1

        if verbose:
            wzcore.err_print('Identified info from %d/%d samples (%d muts).' % (n_known, len(samples), n_mutated))

        return pd.Series(status, index=samples)
Esempio n. 4
0
def data_load_tissue(source):
    """Load the beta table of one named reference data set.

    Depending on ``source`` the table is read from a fixed pickle or
    tab-separated file and, for most sources, its columns are relabelled
    using an accompanying sample sheet.

    Args:
        source: one of ``'ESC'``, ``'PGC'``, ``'Laird'``, ``'Encode'``,
            ``'Bonder'``, ``'Slieker'``, ``'Wong'``, ``'Lawlor1133'``,
            ``'Guintivano'``, ``'Wagner'`` or ``'Reinus'``.

    Returns:
        The loaded (and relabelled) beta table.

    Raises:
        ValueError: if ``source`` is not one of the names above. Previously
            this case crashed with an UnboundLocalError on ``betas``.
    """
    if source == 'ESC':
        # this has H1, H9, H9ESC, HUES6 and ICM
        betas = pd.read_pickle('/Users/wandingzhou/projects/hs-tcga/2016_03_29_TGCT/data/ESC_and_ICM.pkl')

    elif source == 'PGC':
        # this has PGC and AGC
        betas = pd.read_pickle('/Users/wandingzhou/projects/hs-tcga/2016_03_29_TGCT/data/PGC.pkl')

    elif source == 'Laird':
        betas = pd.read_table('/Users/wandingzhou/projects/hs-tcga/data/2015_06_03_AML_normal_sorted/GSE49618_betas.tsv')
        samples = pd.read_table('/Users/wandingzhou/projects/hs-tcga/data/2015_06_03_AML_normal_sorted/samples', index_col='barcode')
        betas.columns = betas.columns.map(lambda x: 'blood_'+samples.loc[x.split('_')[0],'name'])

    elif source == 'Encode':
        betas = pd.read_table('/Users/wandingzhou/projects/hs-tcga/2015_03_18_tumor_purity/GSE40699_ENCODE/GSE40699_betas.tsv')
        samples = pd.read_table('/Users/wandingzhou/projects/hs-tcga/2015_03_18_tumor_purity/GSE40699_ENCODE/samples', index_col='barcode')
        # the sample-sheet key is whatever follows the second underscore
        barcodes = betas.columns.map(lambda x:x.split('_',2)[2])
        betas.columns = samples.loc[barcodes, 'short']+'_'+samples.loc[barcodes, 'cellline']

    elif source == 'Bonder':      # muscle and fat
        betas = pd.read_table('/Users/wandingzhou/projects/hs-tcga/2015_03_18_tumor_purity/Bonder2014_BMCGenomics/GSE61454_severely_obsese/betas.tsv')
        samples = pd.read_table('/Users/wandingzhou/projects/hs-tcga/2015_03_18_tumor_purity/Bonder2014_BMCGenomics/GSE61454_severely_obsese/samples',header=None,index_col=0,names=['barcode','sample'])
        # the sample-sheet key is the part before the first underscore
        barcodes = betas.columns.map(lambda x:x.split('_',1)[0])
        betas.columns = samples.loc[barcodes,'sample']+"_"+barcodes
        betas = betas.loc[:,~betas.columns.map(lambda x:x.startswith('Liver'))] # exclude liver

    elif source == 'Slieker':
        betas = pd.read_table('/Users/wandingzhou/projects/hs-tcga/2015_03_18_tumor_purity/GSE48472_Slieker_2013_EpigeneticsAndChromatin/betas.tsv')
        samples = pd.read_table('/Users/wandingzhou/projects/hs-tcga/2015_03_18_tumor_purity/GSE48472_Slieker_2013_EpigeneticsAndChromatin/samples',header=None,index_col=0,names=['barcode','sample'])
        barcodes = betas.columns.map(lambda x:x.split('_',1)[0])
        betas.columns = samples.loc[barcodes,'sample']+"_"+barcodes

    elif source == 'Wong':
        betas = pd.read_table('/Users/wandingzhou/projects/hs-tcga/data/2015_06_03_AlexWong_skin/AlexWong_skin_betas.tsv')
        # NOTE: the sample sheet is read but not used for the relabelling;
        # the read is kept so existing behaviour is unchanged.
        samples = pd.read_table('/Users/wandingzhou/projects/hs-tcga/data/2015_06_03_AlexWong_skin/samples', index_col='barcode')
        betas.columns = 'skin_'+betas.columns.to_series()

    elif source == 'Lawlor1133':
        betas = pd.read_table('/Users/wandingzhou/projects/hs-tcga/data/2015_06_03_Lawlor_tumor_lungfibroblast/s1133_betas.tsv')
        names = pd.read_table('/Users/wandingzhou/projects/hs-tcga/data/2015_06_03_Lawlor_tumor_lungfibroblast/1133samples.csv.unix', index_col='Complete_Barcode', sep='\t')
        betas.columns = names.loc[betas.columns,'GROUP_NAME']+'_'+betas.columns

    elif source == 'Guintivano':
        betas = pd.read_table('/Users/wandingzhou/projects/hs-tcga/2015_03_18_tumor_purity/GSE41826_Brain/betas.tsv')
        mask_snp_probes(betas)
        names = pd.read_table('/Users/wandingzhou/projects/hs-tcga/2015_03_18_tumor_purity/GSE41826_Brain/samples.csv',index_col='barcode')
        betas.columns = 'brain_'+names.loc[betas.columns,'sample'].map(str)+'_'+betas.columns

    elif source == 'Wagner':
        betas = pd.read_table('/Users/wandingzhou/projects/hs-tcga/2015_03_18_tumor_purity/GSE52025_Wagner_fibroblast/GSE52025_betas.tsv',sep='\t')
        betas.columns = 'fibroblast_'+betas.columns.map(str)

    elif source == 'Reinus':      # blood
        betas = pd.read_table('/Users/wandingzhou/projects/hs-tcga/data/2015_04_10_sorted_cell_population/blood_beta.tsv', sep='\t')
        names = pd.read_table('/Users/wandingzhou/projects/hs-tcga/data/2015_04_10_sorted_cell_population/Sorted_Blood/sample_sheet_IDAT.csv.unix.tsv',index_col='barcode')
        betas.columns = names.loc[betas.columns,'Type']+'-'+betas.columns

    else:
        raise ValueError('Unknown tissue source: %r' % (source,))

    wzcore.err_print("Loaded %d samples" % betas.shape[1])

    return betas
Esempio n. 5
0
def data_load_rnaseq_samples(_cancer_types, genes=None):
    """Load pickled RNA-seq tables for one or more cancer types and merge them.

    Each cancer type ``X`` is read from ``<data_home>/X.pkl`` and the tables
    are concatenated column-wise.

    Args:
        _cancer_types: a cancer-type name or a list of names.
        genes: optional row labels; when given, every table is restricted to
            these rows before merging.

    Returns:
        Tuple ``(rsems, cancer_types)`` where ``rsems`` is the merged table
        and ``cancer_types`` is a Series mapping each column name to the
        cancer type it was loaded from (the last one when a column name
        occurs under several cancer types).

    Raises:
        ValueError: from ``pd.concat`` when ``_cancer_types`` is an empty
            list.
    """
    if not isinstance(_cancer_types, list):
        _cancer_types = [_cancer_types]

    data_home = '/Users/wandingzhou/projects/hs-tcga/data/2015_04_30_TCGA_rnaseq/dat/'
    _rsems = []
    # Per-file label Series are collected and concatenated once: the former
    # ``Series.append`` in a loop is quadratic and no longer exists in
    # pandas >= 2.0.
    _labels = []

    wzcore.err_print_sig()
    for cancer_type in _cancer_types:
        wzcore.err_print_m(' '+cancer_type)
        dat_fn = cancer_type+'.pkl'
        _rsem = pd.read_pickle(data_home+'/'+dat_fn)
        if genes is None:
            _rsems.append(_rsem)
        else:
            _rsems.append(_rsem.loc[genes,])
        _labels.append(pd.Series([cancer_type]*_rsem.shape[1], index=_rsem.columns))

    rsems = pd.concat(_rsems, axis=1)
    cancer_types = pd.concat(_labels)
    wzcore.err_print_m('\n')

    # a column name seen under several cancer types keeps the last one
    cancer_types = cancer_types.groupby(level=0).last()
    wzcore.err_print('Loaded %d genes and %d samples' % rsems.shape)

    return rsems, cancer_types
Esempio n. 6
0
def split_tumor_normal(df):
    """Split the columns of ``df`` by the character at index 13 of their names.

    Columns whose name has ``'0'`` at index 13 are reported as tumor, ``'1'``
    as normal and ``'2'`` as cell line; all remaining columns are only
    counted as "others" in the log message.

    Args:
        df: table whose column names are strings of at least 14 characters.

    Returns:
        Tuple ``(dft, dfn, dfc)`` with the tumor, normal and cell-line
        column subsets.
    """
    def _columns_with_code(code):
        # columns whose 14th character equals ``code``
        return df.loc[:, df.columns.map(lambda name: name[13] == code)]

    dft = _columns_with_code('0')
    dfn = _columns_with_code('1')
    dfc = _columns_with_code('2')

    n_tumor, n_normal, n_cell = dft.shape[1], dfn.shape[1], dfc.shape[1]
    n_other = df.shape[1] - n_tumor - n_normal - n_cell
    wzcore.err_print('Found %d tumor, %d normal, %d cell line and %d others.' % (n_tumor, n_normal, n_cell, n_other))
    return dft, dfn, dfc
Esempio n. 7
0
def nonuniform(df, maxbeta=0.7, minbeta=0.3, maxsupp=0.95):
    """Drop rows whose values are almost all high or almost all low.

    A row is discarded when more than ``maxsupp * n_columns`` of its values
    exceed ``maxbeta``, or when more than that many fall below ``minbeta``.

    Args:
        df: table to filter row-wise.
        maxbeta: values above this count as high.
        minbeta: values below this count as low.
        maxsupp: fraction of the column count used as the cut-off.

    Returns:
        The rows of ``df`` that pass the filter.
    """
    limit = float(df.shape[1]) * maxsupp

    def _row_is_mixed(values):
        n_high = (values > maxbeta).sum()
        n_low = (values < minbeta).sum()
        return bool(n_high <= limit and n_low <= limit)

    keep = df.apply(_row_is_mixed, axis=1)
    dfv = df[keep]
    wzcore.err_print('Selected %d nonuniform probes from %d' % (dfv.shape[0], df.shape[0]))

    return dfv
Esempio n. 8
0
def data_create_tissue_sample():
    """Assemble the beta table for the samples listed in the selection sheet.

    The selection sheet is grouped by its ``source`` column; for every
    source the corresponding table is loaded with ``data_load_tissue`` and
    reduced to the columns whose names appear in the sheet's index.

    Returns:
        Tuple ``(betas, samples)``: the column-wise concatenation of the
        chosen columns (an empty DataFrame when the sheet has no rows) and
        the selection sheet itself.
    """
    samples = pd.read_table('/Users/wandingzhou/projects/hs-tcga/2015_07_14_purity/2015_07_14_sample_select.txt', index_col="sample")
    pieces = []
    # groupby already yields the groups in sorted key order, so the former
    # ``samples.sort('source')`` (an API removed from pandas) is not needed;
    # the group key replaces the positional ``v['source'][0]`` lookup.
    for source, rows in samples.groupby('source'):
        _betas = data_load_tissue(source)
        _betas2 = _betas[_betas.columns.intersection(rows.index)]
        pieces.append(_betas2)
        wzcore.err_print("From %s chose %d samples" % (source, _betas2.shape[1]))

    # concatenate once instead of growing the frame inside the loop
    betas = pd.concat(pieces, axis=1) if pieces else pd.DataFrame()
    wzcore.err_print("Loaded %d samples" % betas.shape[1])

    return betas, samples
Esempio n. 9
0
def take_segment_mean(df, probe2seg, min_support=10, quiet=False):
    """Average values per segment and keep the well-supported segments.

    Args:
        df: DataFrame or Series whose index labels can be looked up in
            ``probe2seg``. The input is not modified.
        probe2seg: Series mapping an index label to its segment.
        min_support: minimum number of non-null entries (counted in the
            first column) a segment needs in order to be kept.
        quiet: when true, the summary message is suppressed.

    Returns:
        DataFrame of per-segment means, restricted to segments with at least
        ``min_support`` supporting entries.
    """
    if isinstance(df, pd.Series):
        work = df.copy().to_frame()
    else:
        work = df.copy()
    work = work.assign(seg=probe2seg.loc[df.index])

    by_segment = work.groupby('seg')
    support = by_segment.count().iloc[:, 0]
    segment_means = by_segment.mean()
    df_seg_mean = segment_means[support >= min_support]

    if not quiet:
        wzcore.err_print('There are %d segments well supported.' % df_seg_mean.shape[0])

    return df_seg_mean
Esempio n. 10
0
    def associate_continuous_outliers(self, outlier_fn, fw=20.0, printn=10):
        """Report and plot the genes with the most extreme balanced scores.

        For every gene in ``self.genes_select`` with a non-null p-value and
        fold change, two scores are computed: ``(pval + foldc*fw) / 2`` (up)
        and ``(pval - foldc*fw) / 2`` (down). The ``printn`` highest-scoring
        genes of each kind are logged and a colour bar of their mutation
        status (``self.mutstat``) is stacked under a bar of the sorted
        ``self.ts`` values.

        Args:
            outlier_fn: passed as ``figfile`` to
                ``wzplotlib.row_stack_layout``.
            fw: weight applied to the fold change in the balanced score.
            printn: maximum number of genes listed per direction.
        """
        import wzplotlib

        # Genes with a missing p-value or fold change cannot be scored.
        scored = [(pval, foldc, g)
                  for pval, foldc, g in zip(self.pvals, self.foldcs, self.genes_select)
                  if (not pd.isnull(pval)) and (not pd.isnull(foldc))]
        balanced_up = sorted((((pval+foldc*fw)/2.0, g, pval, foldc)
                              for pval, foldc, g in scored), reverse=True)
        balanced_dw = sorted((((pval-foldc*fw)/2.0, g, pval, foldc)
                              for pval, foldc, g in scored), reverse=True)

        # sort_values() returns a sorted copy; the in-place Series.sort()
        # used before is not available in current pandas.
        tss = self.ts.sort_values()
        cbs = [wzplotlib.WZCbar(tss, continuous=True)]

        # Slicing the filtered rankings (rather than counting up to
        # len(self.genes_select)) avoids an IndexError when some genes were
        # dropped for null values.
        nshow = max(printn, 0)
        for heading, tag, ranking in (('upside:', ' [up]', balanced_up),
                                      ('\ndownside:', ' [down]', balanced_dw)):
            wzcore.err_print(heading)
            for b, g, pval, foldc in ranking[:nshow]:
                cbs.append(wzplotlib.WZCbar(self.mutstat(tss.index, g, verbose=False)[tss.index], title=g+tag))
                wzcore.err_print('%s\tpval:%1.2f\tfoldc:%1.2f\tbalanced:%1.2f' % (g, pval, foldc, b))

        wzplotlib.row_stack_layout(cbs, figfile=outlier_fn)
Esempio n. 11
0
def filter_by_purity(betas, min_purity=0.8, keep_normal=True):
    """Keep the columns of ``betas`` whose annotated purity exceeds a threshold.

    Purity is taken from the ``ABSOLUTE.purity`` column of the annotation
    table; null entries and entries equal to 1.0 are ignored.

    Args:
        betas: table whose column names are looked up in the annotation
            index.
        min_purity: a column is kept when its purity is strictly greater
            than this value.
        keep_normal: when true (default), columns whose name has ``'1'`` at
            index 13 are kept regardless of purity. This flag used to be
            ignored (such columns were always kept); it is now honoured.

    Returns:
        A copy of the selected columns of ``betas``.
    """
    # NOTE(review): ``error_bad_lines`` was replaced by ``on_bad_lines`` in
    # newer pandas releases; left unchanged here to stay compatible with the
    # pandas version this code was written for.
    Hui_annot = pd.read_csv('/Users/wandingzhou/projects/hs-tcga/data/2015_03_23_Hui_annotation/sampleAnnotSubWB20130619.tsv', index_col=1, error_bad_lines=False, sep='\t')
    # there are two purities, ABSOLUTE.purity and abs.purity, ABSOLUTE.purity has more value (4776 vs 4468)
    # the two values agree on overlap
    # I hereby use ABSOLUTE.purity
    # 4239 samples with purity estimates
    purity_anno = Hui_annot['ABSOLUTE.purity']
    purity_anno = purity_anno[(purity_anno.notnull()) & (purity_anno != 1.0)]

    def _is_selected(name):
        if name in purity_anno and purity_anno[name] > min_purity:
            return True
        return bool(keep_normal and name[13] == '1')

    sample_is_pure = betas.columns.map(_is_selected)
    betasv = betas.loc[:,sample_is_pure].copy()
    wzcore.err_print('Selected %d pure samples (>%1.2f) from %d samples.' % (betasv.shape[1], min_purity, betas.shape[1]))
    return betasv
Esempio n. 12
0
def load_te_and_seqs(
        rmskbed='/Users/wandingzhou/projects/pj-mm/2015-04-23-alu/rmsk.bed.gz',
        load_seq=False,
        tetype=None,
        tetype2=None,
        tetype3=None):
    """Read TE records from a gzipped, tab-separated BED-like file.

    Every line yields a ``TE`` object whose ``chrm``, ``beg``, ``end``,
    ``strand``, ``tetype``, ``tetype2`` and ``tetype3`` come from columns 0-6.
    Records whose chromosome name contains ``'_'`` after the first character
    are skipped, as are records that do not match a requested type filter.

    Args:
        rmskbed: path of the input file, opened with ``wzcore.opengz``.
        load_seq: when true, ``_te_load_seqs`` is called for every kept
            record; an ``IndexError`` raised by it is ignored.
        tetype: if not ``None``, keep only records with this ``tetype``.
        tetype2: if not ``None``, keep only records with this ``tetype2``.
        tetype3: if not ``None``, keep only records with this ``tetype3``.

    Returns:
        Dict mapping ``(chrm, beg, end)`` to the ``TE`` object.
    """
    refgenome = faidx.RefGenome('/Users/wandingzhou/references/hg19/hg19.fa')
    tes = {}
    wzcore.err_print_sig()
    for lineno, raw in enumerate(wzcore.opengz(rmskbed)):
        # progress marker every 100000 lines
        if lineno % 100000 == 0:
            wzcore.err_print_m(' %d' % lineno)

        cols = raw.strip().split('\t')
        te = TE()
        te.chrm = cols[0]
        if te.chrm.find('_') > 0:
            continue
        te.beg = int(cols[1])
        te.end = int(cols[2])
        te.rmskbed = rmskbed
        te.strand = cols[3]
        te.tetype = cols[4]
        te.tetype2 = cols[5]
        te.tetype3 = cols[6]

        # apply whichever type filters were requested
        requested = ((tetype, te.tetype), (tetype2, te.tetype2), (tetype3, te.tetype3))
        if any(want is not None and have != want for want, have in requested):
            continue

        if load_seq:
            try:
                _te_load_seqs(refgenome, te)
            except IndexError:  # TE at chromosome boundaries, ignore
                pass

        tes[(te.chrm, te.beg, te.end)] = te

    wzcore.err_print_m('\n')
    wzcore.err_print('Loaded %d TEs' % len(tes))
    return tes
Esempio n. 13
0
    def associate_continuous_outliers(self, fn, fw=10, printn=10):
        """Log and scatter-plot the top-ranked genes against ``self.ts``.

        Genes are ranked by sorting ``(pval, rho, gene)`` tuples in
        descending order. For each of the first ``printn`` genes a line is
        logged and ``log2`` of the first value returned by
        ``self.expstat(self.ts, gene)`` is plotted against the second.

        Args:
            fn: output path handed to ``plt.savefig``.
            fw: unused by this method.
            printn: maximum number of genes to log and plot.
        """
        import matplotlib.pyplot as plt
        plt.figure(figsize=(10,10))
        plt.subplots_adjust(hspace=1)

        toplist = sorted(zip(self.pvals, self.rhos, self.genes_select), reverse=True)
        shown = toplist[:max(printn, 0)]
        # Keep the original 5x2 grid for up to ten panels and add rows beyond
        # that; a fixed 5x2 grid fails as soon as an eleventh panel is asked
        # for.
        nrows = max(5, (len(shown) + 1) // 2)
        for i, (pval, rho, gene) in enumerate(shown):
            wzcore.err_print('%s\tpval:%1.2f\trho:%1.2f' % (gene, pval, rho))
            plt.subplot(nrows,2,i+1)
            _dgene, _ts = self.expstat(self.ts, gene)
            plt.scatter(np.log2(_dgene), _ts, edgecolor='none', alpha=0.5, s=4)
            plt.xlabel('log2('+gene+' %1.2f)' % rho)

        plt.savefig(fn, bbox_inches='tight')
Esempio n. 14
0
def probe_select_pairwise(betas, v1, v2, mindelta=0.5, upq=0.75, loq=0.25):
    """Pick rows that separate two column groups by a quantile gap.

    Group 1 holds the columns of ``betas`` named in ``v1.index``, group 2
    those named in ``v2.index``. A row is "hypo" when the ``loq`` quantile of
    group 2 exceeds the ``upq`` quantile of group 1 by more than
    ``mindelta``, and "hyper" when the same holds with the groups swapped.

    Args:
        betas: table with one row per probe.
        v1: pandas object whose index names the first group of columns.
        v2: pandas object whose index names the second group of columns.
        mindelta: minimum quantile gap.
        upq: upper quantile.
        loq: lower quantile.

    Returns:
        Tuple ``(hypo_labels, hyper_labels)`` of row-label lists.
    """
    in_group1 = betas.columns.isin(v1.index)
    in_group2 = betas.columns.isin(v2.index)

    hypo, hyper = [], []
    for probe, values in betas.iterrows():
        group1 = values[in_group1]
        group2 = values[in_group2]

        hypo_gap = group2.quantile(loq) - group1.quantile(upq)
        hyper_gap = group1.quantile(loq) - group2.quantile(upq)

        if hypo_gap > mindelta:
            hypo.append(probe)
        if hyper_gap > mindelta:
            hyper.append(probe)

    wzcore.err_print("selected %d hypo and %d hyper." % (len(hypo), len(hyper)))
    return hypo, hyper
Esempio n. 15
0
    def __init__(self):
        """Read the merged MAF file into per-sample mutation lists.

        Each line is split on whitespace. Lines whose third field is
        ``'Silent'`` or ``'RNA'`` are skipped. For the others, the first 12
        characters of the fifth field identify the sample and the tuple
        ``(fields[1], fields[2], fields[3])`` is appended to that sample's
        list in ``self.sample2muts``; ``fields[1]`` is also added to
        ``self.genes``.
        """
        self.sample2muts = {}
        self.genes = set()
        skipped_classes = ('Silent', 'RNA')

        with open('/Users/wandingzhou/projects/hs-tcga/data/2015_06_17_TCGA_mutations/merged_maf') as fh:
            for line in fh:
                fields = line.strip().split()
                if fields[2] in skipped_classes:
                    continue
                sample = fields[4][:12]
                gene = fields[1]
                self.sample2muts.setdefault(sample, []).append((gene, fields[2], fields[3]))
                self.genes.add(gene)

        wzcore.err_print('Loaded %d genes and %d samples' % (len(self.genes), len(self.sample2muts)))
Esempio n. 16
0
def load_cgi_and_seqs():
    """Read the CGI BED file and attach an upper-cased sequence to each record.

    Every line becomes a ``CGI`` object with ``chrm``, ``beg``, ``end`` and
    ``cgitype`` taken from columns 0-3. Its ``seq`` is fetched from the
    reference genome over the range given by the object's ``seq_beg()`` and
    ``seq_end()``.

    Returns:
        List of ``CGI`` objects in file order.
    """
    bed_path = '/Users/wandingzhou/projects/hs-tcga/data/2015_03_24_cpg_island/TakaiJones/takai.jones.strict.bed.gz'

    cgis = []
    refgenome = faidx.RefGenome('/Users/wandingzhou/references/hg19/hg19.fa')
    for record in wzcore.opengz(bed_path):
        cols = record.strip().split('\t')

        cgi = CGI()
        cgi.chrm = cols[0]
        cgi.beg = int(cols[1])
        cgi.end = int(cols[2])
        cgi.cgitype = cols[3]

        seq_beg = cgi.seq_beg()
        seq_end = cgi.seq_end()
        cgi.seq = refgenome.fetch_sequence(cgi.chrm, seq_beg, seq_end).upper()

        cgis.append(cgi)

    wzcore.err_print('Loaded %d CGIs' % len(cgis))
    return cgis
Esempio n. 17
0
def probe_select1(betas, v, upq=0.75, loq=0.25, mindelta=0.1, topn=100):
    """Select rows that separate one column group from all other columns.

    The group consists of the columns of ``betas`` named in ``v.index``. For
    every row two gaps are computed:

    * hyper: group minimum minus the ``upq`` quantile of the other columns;
    * hypo: ``loq`` quantile of the other columns minus the group maximum.

    For each kind, the ``topn`` rows with the largest gap are considered and
    kept when the gap exceeds ``mindelta``.

    Args:
        betas: table with one row per probe.
        v: pandas object whose index names the group's columns.
        upq: upper quantile applied to the other columns.
        loq: lower quantile applied to the other columns.
        mindelta: minimum gap for a row to be selected.
        topn: number of top-ranked rows considered per kind (previously the
            fixed value 100).

    Returns:
        List of the selected row labels (hypo and hyper combined, without
        duplicates, in no particular order).
    """
    inind = betas.columns.isin(v.index)

    hyperrows = []
    hyporows = []
    for i, row in betas.iterrows():
        c = row[inind]
        cb = row[~inind]
        hyperrows.append((c.min()-cb.quantile(upq),i))
        hyporows.append((cb.quantile(loq)-c.max(),i))

    _select1 = [(gap, i) for gap, i in sorted(hyporows, reverse=True)[:topn] if gap > mindelta]
    _select2 = [(gap, i) for gap, i in sorted(hyperrows, reverse=True)[:topn] if gap > mindelta]

    wzcore.err_print("selected %d hypo and %d hyper probes." % (len(_select1), len(_select2)))

    return list({i for _, i in _select1} | {i for _, i in _select2})
Esempio n. 18
0
def polarized(df, upthres=0.7,dwthres=0.3, kind='any', minsupp=1):
    """Select rows that have both high and low values.

    A row qualifies only if at least ``minsupp`` of its values are above
    ``upthres`` and at least ``minsupp`` are below ``dwthres``. With
    ``kind='any'`` that is sufficient; with ``kind='all'`` every value in
    the row must additionally be either above ``upthres`` or below
    ``dwthres``. Any other ``kind`` selects nothing.
    """

    def _isvar(row):
        n_high = (row > upthres).sum()
        n_low = (row < dwthres).sum()
        if n_high < minsupp or n_low < minsupp:
            return False
        if kind == 'any':
            return True
        # 'all': no value may lie between the two thresholds
        return bool(kind == 'all' and n_high + n_low == row.shape[0])

    keep = df.apply(_isvar, axis=1)
    dfv = df[keep]
    wzcore.err_print('Selected %d variable probes from %d samples' % dfv.shape)
    return dfv
Esempio n. 19
0
def get_cph(df):
    """Return the rows of ``df`` whose index label starts with ``'ch'``.

    Args:
        df: pandas object with a string index.

    Returns:
        The subset of ``df`` whose labels start with ``'ch'``.
    """
    # ``.str.startswith`` is the string accessor method; the earlier
    # ``.str(startswith('ch'))`` raised NameError on the undefined
    # ``startswith``.
    df = df.loc[df.index.str.startswith('ch')]

    wzcore.err_print('Get %d CpH probes.' % df.shape[0])
    return df
Esempio n. 20
0
def clean_450k(df, nahow='strong', probe_fn='/Users/wandingzhou/projects/hs-tcga/data/2015_03_05_TCGA_450/450k_probes', verbose=True):
    """Filter a probe table by chromosome, probe-name prefix and missing values.

    Steps, in order:

    1. drop rows whose probe is located on ``chrX`` or ``chrY`` according to
       ``probe_fn``;
    2. keep only rows whose index label starts with ``'cg'``;
    3. drop rows with missing values according to ``nahow``.

    Args:
        df: DataFrame or Series indexed by probe name.
        nahow: ``'strong'`` drops rows containing any missing value,
            ``'weak'`` drops rows in which all values are missing; any other
            value skips this step.
        probe_fn: headerless tab-separated file with the columns chrm, beg,
            end and probe name (the fourth column is used as index).
        verbose: when false, nothing is logged. Previously the three
            "Removing ..." progress messages were printed even then.

    Returns:
        The filtered table.
    """
    probe_loc = pd.read_table(probe_fn,index_col=3,header=None, names=['chrm','beg','end','gene'])

    def _log(message):
        if verbose:
            wzcore.err_print(message)

    if len(df.shape) == 1:
        _log("Before: %d probes" % df.shape[0])
    else:
        _log("Before: %d probes and %d samples" % df.shape)

    # remove X,Y chromosome
    _log('Removing sex chromosomes')
    is_autosomal = ~probe_loc.chrm.isin(['chrX','chrY'])
    df = df[is_autosomal[df.index]]

    _log('Removing cph and rs probes')
    df = df.loc[df.index.str.startswith('cg')]

    # remove NA
    _log('Removing NA probes')
    if nahow == 'strong':
        df = df.dropna(how='any')
    elif nahow == 'weak':
        df = df.dropna(how='all')

    if len(df.shape) == 1:
        _log("After: %d probes after removal" % df.shape[0])
    else:
        _log("After: %d probes and %d samples" % df.shape)

    return df