Example #1
0
def negative_diff(b1, b2, thres=-0.3, verbose=True):
    """Restrict ``b1`` and ``b2`` to the entries where ``b1 - b2 < thres``.

    Args:
        b1, b2: pandas objects supporting subtraction and ``.loc`` selection.
        thres: upper bound (exclusive) on ``b1 - b2`` for an entry to be kept.
        verbose: when true, report the number of selected entries through
            ``wzcore.err_print``.

    Returns:
        A ``(b1_selected, b2_selected)`` tuple.
    """
    is_lower = (b1 - b2) < thres
    if verbose:
        wzcore.err_print('Selected %d probes' % is_lower.sum())

    # Same boolean mask applied to both inputs, b1 first.
    return tuple(values.loc[is_lower,] for values in (b1, b2))
Example #2
0
def data_load_samples(samples, probes=None,
                      data_home='/Users/wandingzhou/projects/hs-tcga/data/2015_03_05_TCGA_450/dat/'):
    """Load pickled beta-value tables for one or more cancer types.

    For every entry of ``samples`` the file ``<data_home>/<entry>.pkl`` is
    read with ``pd.read_pickle`` and the tables are joined column-wise.

    Args:
        samples: a cancer type name, or a list of them (a non-list value is
            wrapped into a one-element list).
        probes: optional row labels; when given, each table is restricted to
            ``.loc[probes,]`` before joining.
        data_home: directory holding the ``<cancer_type>.pkl`` files.

    Returns:
        ``(betas, cancer_types)`` where ``betas`` is the joined table with
        duplicated column labels averaged, and ``cancer_types`` maps each
        column label to a cancer type (the last one seen for duplicates).
    """
    if not isinstance(samples, list):
        samples = [samples]

    _betases = []
    _cancer_types = []

    wzcore.err_print_sig()
    for cancer_type in samples:
        wzcore.err_print_m(' '+cancer_type)
        dat_fn = cancer_type+'.pkl'
        _betas = pd.read_pickle(data_home+'/'+dat_fn)
        if probes is None:
            _betases.append(_betas)
        else:
            _betases.append(_betas.loc[probes,])
        _cancer_types.append(pd.Series([cancer_type]*_betas.shape[1], index=_betas.columns))

    betas = pd.concat(_betases, axis=1)
    # One concat instead of repeated Series.append: append copies the whole
    # series on every call and no longer exists in recent pandas.
    cancer_types = pd.concat(_cancer_types)
    wzcore.err_print_m('\n')

    # some cell line sample belong to multiple cancer type, choose the last cancer
    cancer_types = cancer_types.groupby(level=0).last()
    betas = betas.groupby(level=0, axis=1).mean()
    wzcore.err_print('Loaded %d probes and %d samples' % betas.shape)

    return betas, cancer_types
Example #3
0
    def mutstat(self, samples, gene, verbose=True):
        """Report, for each sample, whether ``gene`` has a recorded mutation.

        Sample names are truncated to their first 12 characters before being
        looked up in ``self.sample2muts``.

        Args:
            samples: iterable of sample names; also used as the result index.
            gene: gene identifier; must be a member of ``self.genes``.
            verbose: when true, print a summary through ``wzcore.err_print``.

        Returns:
            ``None`` (after printing a message) when ``gene`` is not in
            ``self.genes``.  Otherwise a ``pd.Series`` indexed by ``samples``
            holding ``True``/``False`` for samples found in
            ``self.sample2muts`` and ``np.nan`` for the others.
        """
        if gene not in self.genes:
            wzcore.err_print('Gene ID not identified %s' % gene)
            return None

        status = []
        n_known = 0      # samples present in self.sample2muts
        n_mutated = 0    # of those, samples with a record for `gene`
        for name in samples:
            key = name[:12]
            if key not in self.sample2muts:
                status.append(np.nan)
                continue

            hit = any(g == gene for g, _mt, _mt1 in self.sample2muts[key])
            if hit:
                n_mutated += 1
            n_known += 1
            status.append(hit)

        if verbose:
            wzcore.err_print('Identified info from %d/%d samples (%d muts).' % (n_known, len(samples), n_mutated))

        return pd.Series(status, index=samples)
Example #4
0
def data_load_tissue(source):
    """Load the beta-value table of one named tissue / cell data set.

    Each recognised ``source`` reads its table from a fixed path and, for
    most sources, rewrites the column labels using an accompanying sample
    sheet.

    Args:
        source: one of 'ESC', 'PGC', 'Laird', 'Encode', 'Bonder', 'Slieker',
            'Wong', 'Lawlor1133', 'Guintivano', 'Wagner', 'Reinus'.

    Returns:
        The loaded table with relabelled columns.

    Raises:
        ValueError: if ``source`` is not one of the names above (previously
            this fell through every branch and failed on an unbound local).
    """
    if source == 'ESC':
        # this has H1, H9, H9ESC, HUES6 and ICM
        betas = pd.read_pickle('/Users/wandingzhou/projects/hs-tcga/2016_03_29_TGCT/data/ESC_and_ICM.pkl')

    elif source == 'PGC':
        # this has PGC and AGC
        betas = pd.read_pickle('/Users/wandingzhou/projects/hs-tcga/2016_03_29_TGCT/data/PGC.pkl')

    elif source == 'Laird':
        betas = pd.read_table('/Users/wandingzhou/projects/hs-tcga/data/2015_06_03_AML_normal_sorted/GSE49618_betas.tsv')
        samples = pd.read_table('/Users/wandingzhou/projects/hs-tcga/data/2015_06_03_AML_normal_sorted/samples', index_col='barcode')
        betas.columns = betas.columns.map(lambda x: 'blood_'+samples.loc[x.split('_')[0],'name'])

    elif source == 'Encode':
        betas = pd.read_table('/Users/wandingzhou/projects/hs-tcga/2015_03_18_tumor_purity/GSE40699_ENCODE/GSE40699_betas.tsv')
        samples = pd.read_table('/Users/wandingzhou/projects/hs-tcga/2015_03_18_tumor_purity/GSE40699_ENCODE/samples', index_col='barcode')
        betas.columns = samples.loc[betas.columns.map(lambda x:x.split('_',2)[2]), 'short']+'_'+samples.loc[betas.columns.map(lambda x:x.split('_',2)[2]), 'cellline']

    elif source == 'Bonder':      # muscle and fat
        betas = pd.read_table('/Users/wandingzhou/projects/hs-tcga/2015_03_18_tumor_purity/Bonder2014_BMCGenomics/GSE61454_severely_obsese/betas.tsv')
        samples = pd.read_table('/Users/wandingzhou/projects/hs-tcga/2015_03_18_tumor_purity/Bonder2014_BMCGenomics/GSE61454_severely_obsese/samples',header=None,index_col=0,names=['barcode','sample'])
        betas.columns = samples.loc[betas.columns.map(lambda x:x.split('_',1)[0]),'sample']+"_"+betas.columns.map(lambda x:x.split('_',1)[0])
        betas = betas.loc[:,~betas.columns.map(lambda x:x.startswith('Liver'))] # exclude liver

    elif source == 'Slieker':
        betas = pd.read_table('/Users/wandingzhou/projects/hs-tcga/2015_03_18_tumor_purity/GSE48472_Slieker_2013_EpigeneticsAndChromatin/betas.tsv')
        samples = pd.read_table('/Users/wandingzhou/projects/hs-tcga/2015_03_18_tumor_purity/GSE48472_Slieker_2013_EpigeneticsAndChromatin/samples',header=None,index_col=0,names=['barcode','sample'])
        betas.columns = samples.loc[betas.columns.map(lambda x:x.split('_',1)[0]),'sample']+"_"+betas.columns.map(lambda x:x.split('_',1)[0])

    elif source == 'Wong':
        betas = pd.read_table('/Users/wandingzhou/projects/hs-tcga/data/2015_06_03_AlexWong_skin/AlexWong_skin_betas.tsv')
        # NOTE: the sample sheet is read but not used for the relabelling below.
        samples = pd.read_table('/Users/wandingzhou/projects/hs-tcga/data/2015_06_03_AlexWong_skin/samples', index_col='barcode')
        betas.columns = 'skin_'+betas.columns.to_series()

    elif source == 'Lawlor1133':
        betas = pd.read_table('/Users/wandingzhou/projects/hs-tcga/data/2015_06_03_Lawlor_tumor_lungfibroblast/s1133_betas.tsv')
        names = pd.read_table('/Users/wandingzhou/projects/hs-tcga/data/2015_06_03_Lawlor_tumor_lungfibroblast/1133samples.csv.unix', index_col='Complete_Barcode', sep='\t')
        betas.columns = names.loc[betas.columns,'GROUP_NAME']+'_'+betas.columns

    elif source == 'Guintivano':
        betas = pd.read_table('/Users/wandingzhou/projects/hs-tcga/2015_03_18_tumor_purity/GSE41826_Brain/betas.tsv')
        mask_snp_probes(betas)
        names = pd.read_table('/Users/wandingzhou/projects/hs-tcga/2015_03_18_tumor_purity/GSE41826_Brain/samples.csv',index_col='barcode')
        betas.columns = 'brain_'+names.loc[betas.columns,'sample'].map(str)+'_'+betas.columns

    elif source == 'Wagner':
        betas = pd.read_table('/Users/wandingzhou/projects/hs-tcga/2015_03_18_tumor_purity/GSE52025_Wagner_fibroblast/GSE52025_betas.tsv',sep='\t')
        betas.columns = 'fibroblast_'+betas.columns.map(str)

    elif source == 'Reinus':      # blood
        betas = pd.read_table('/Users/wandingzhou/projects/hs-tcga/data/2015_04_10_sorted_cell_population/blood_beta.tsv', sep='\t')
        names = pd.read_table('/Users/wandingzhou/projects/hs-tcga/data/2015_04_10_sorted_cell_population/Sorted_Blood/sample_sheet_IDAT.csv.unix.tsv',index_col='barcode')
        betas.columns = names.loc[betas.columns,'Type']+'-'+betas.columns

    else:
        raise ValueError('Unknown tissue source: %r' % (source,))

    wzcore.err_print("Loaded %d samples" % betas.shape[1])

    return betas
Example #5
0
def data_load_rnaseq_samples(_cancer_types, genes=None,
                             data_home='/Users/wandingzhou/projects/hs-tcga/data/2015_04_30_TCGA_rnaseq/dat/'):
    """Load pickled RNA-seq tables for one or more cancer types.

    For every entry of ``_cancer_types`` the file ``<data_home>/<entry>.pkl``
    is read with ``pd.read_pickle`` and the tables are joined column-wise.

    Args:
        _cancer_types: a cancer type name, or a list of them (a non-list
            value is wrapped into a one-element list).
        genes: optional row labels; when given, each table is restricted to
            ``.loc[genes,]`` before joining.
        data_home: directory holding the ``<cancer_type>.pkl`` files.

    Returns:
        ``(rsems, cancer_types)`` where ``rsems`` is the joined table and
        ``cancer_types`` maps each column label to a cancer type (the last
        one seen when a label occurs more than once).
    """
    if not isinstance(_cancer_types, list):
        _cancer_types = [_cancer_types]

    _rsems = []
    _labels = []

    wzcore.err_print_sig()
    for cancer_type in _cancer_types:
        wzcore.err_print_m(' '+cancer_type)
        dat_fn = cancer_type+'.pkl'
        _rsem = pd.read_pickle(data_home+'/'+dat_fn)
        if genes is None:
            _rsems.append(_rsem)
        else:
            _rsems.append(_rsem.loc[genes,])
        _labels.append(pd.Series([cancer_type]*_rsem.shape[1], index=_rsem.columns))

    rsems = pd.concat(_rsems, axis=1)
    # One concat instead of repeated Series.append: append copies the whole
    # series on every call and no longer exists in recent pandas.
    cancer_types = pd.concat(_labels)
    wzcore.err_print_m('\n')

    cancer_types = cancer_types.groupby(level=0).last()
    wzcore.err_print('Loaded %d genes and %d samples' % rsems.shape)

    return rsems, cancer_types
Example #6
0
def split_tumor_normal(df):
    """Split the columns of ``df`` by the character at position 13 of their name.

    Columns with '0' there are reported as tumor, '1' as normal and '2' as
    cell line; every other column is only counted as "others".

    Returns:
        ``(tumor, normal, cell_line)`` column subsets of ``df``.
    """
    def _columns_with(code):
        return df.loc[:, df.columns.map(lambda name: name[13] == code)]

    dft = _columns_with('0')
    dfn = _columns_with('1')
    dfc = _columns_with('2')

    n_tumor, n_normal, n_cell = dft.shape[1], dfn.shape[1], dfc.shape[1]
    n_other = df.shape[1] - n_tumor - n_normal - n_cell
    wzcore.err_print('Found %d tumor, %d normal, %d cell line and %d others.' % (n_tumor, n_normal, n_cell, n_other))
    return dft, dfn, dfc
Example #7
0
def nonuniform(df, maxbeta=0.7, minbeta=0.3, maxsupp=0.95):
    """Drop rows that are almost uniformly high or almost uniformly low.

    A row is dropped when more than ``maxsupp * n_columns`` of its values
    exceed ``maxbeta``, or more than that many are below ``minbeta``.

    Returns:
        The rows of ``df`` that were not dropped.
    """
    maxsuppn = float(df.shape[1]) * maxsupp

    def _keep(row):
        n_high = (row > maxbeta).sum()
        if n_high > maxsuppn:
            return False
        n_low = (row < minbeta).sum()
        if n_low > maxsuppn:
            return False
        return True

    row_mask = df.apply(_keep, axis=1)
    dfv = df[row_mask]
    wzcore.err_print('Selected %d nonuniform probes from %d' % (dfv.shape[0], df.shape[0]))

    return dfv
Example #8
0
def data_create_tissue_sample():
    """Assemble the beta values of the samples listed in the selection sheet.

    The sheet is grouped by its ``source`` column; for each source the full
    table is loaded with ``data_load_tissue`` and restricted to the columns
    whose label appears in that group's index.

    Returns:
        ``(betas, samples)``: the column-wise join of all selected columns
        and the selection sheet itself.
    """
    samples = pd.read_table('/Users/wandingzhou/projects/hs-tcga/2015_07_14_purity/2015_07_14_sample_select.txt', index_col="sample")

    chosen = []
    # groupby iterates its keys in sorted order already, so the former
    # DataFrame.sort call (removed from pandas) is not needed; the group key
    # is the source name, so no positional lookup into the group is needed.
    for source, members in samples.groupby('source'):
        _betas = data_load_tissue(source)
        _betas2 = _betas[_betas.columns.intersection(members.index)]
        chosen.append(_betas2)
        wzcore.err_print("From %s chose %d samples" % (source, _betas2.shape[1]))

    # Join once at the end instead of re-copying the growing table per source.
    betas = pd.concat(chosen, axis=1) if chosen else pd.DataFrame()
    wzcore.err_print("Loaded %d samples" % betas.shape[1])

    return betas, samples
Example #9
0
def take_segment_mean(df, probe2seg, min_support=10, quiet=False):
    """Average ``df`` within segments and keep the well supported segments.

    Args:
        df: table (or Series, which is converted to a one-column table)
            indexed by probe.
        probe2seg: Series mapping probe labels to segment labels; it is
            looked up with ``df.index``.
        min_support: minimum count a segment needs to be kept.  The count is
            taken from the first data column of the grouped table.
        quiet: when false, report the number of kept segments through
            ``wzcore.err_print``.

    Returns:
        Table of per-segment means, one row per kept segment.
    """
    table = df.copy()
    if isinstance(table, pd.Series):
        table = table.to_frame()
    table['seg'] = probe2seg.loc[df.index]

    by_segment = table.groupby('seg')
    support = by_segment.count().iloc[:, 0]
    enough = support >= min_support
    df_seg_mean = by_segment.mean()[enough]

    if quiet:
        return df_seg_mean

    wzcore.err_print('There are %d segments well supported.' % df_seg_mean.shape[0])
    return df_seg_mean
Example #10
0
    def associate_continuous_outliers(self, outlier_fn, fw=20.0, printn=10):
        """Print and plot the genes with the most extreme balanced scores.

        Every gene with non-null ``pval`` and ``foldc`` gets two scores,
        ``(pval + foldc*fw)/2`` ("up") and ``(pval - foldc*fw)/2`` ("down").
        For each direction the ``printn`` highest-scoring genes are printed
        and a colour bar of their mutation status (``self.mutstat``) is
        stacked under a colour bar of the sorted ``self.ts``.

        Args:
            outlier_fn: passed to ``wzplotlib.row_stack_layout`` as ``figfile``.
            fw: weight applied to the fold change in the balanced score.
            printn: maximum number of genes reported per direction.
        """
        import wzplotlib

        # Genes for which both statistics are available.
        scored = [(pval, foldc, g)
                  for pval, foldc, g in zip(self.pvals, self.foldcs, self.genes_select)
                  if (not pd.isnull(pval)) and (not pd.isnull(foldc))]
        balanced_up = sorted([((pval+foldc*fw)/2.0, g, pval, foldc) for pval, foldc, g in scored],
                             reverse=True)
        balanced_dw = sorted([((pval-foldc*fw)/2.0, g, pval, foldc) for pval, foldc, g in scored],
                             reverse=True)

        # sort_values returns a sorted copy (the in-place Series.sort is gone
        # from pandas).
        tss = self.ts.sort_values()
        cbs = [wzplotlib.WZCbar(tss, continuous=True)]
        topn = max(printn, 0)

        def _report(ranked, suffix):
            # Slice the filtered list itself: it can be shorter than
            # self.genes_select when some genes had null statistics, which
            # used to raise IndexError.
            for b, g, pval, foldc in ranked[:topn]:
                cbs.append(wzplotlib.WZCbar(self.mutstat(tss.index, g, verbose=False)[tss.index], title=g+suffix))
                wzcore.err_print('%s\tpval:%1.2f\tfoldc:%1.2f\tbalanced:%1.2f' % (g, pval, foldc, b))

        wzcore.err_print('upside:')
        _report(balanced_up, ' [up]')

        wzcore.err_print('\ndownside:')
        _report(balanced_dw, ' [down]')

        wzplotlib.row_stack_layout(cbs, figfile=outlier_fn)
Example #11
0
def filter_by_purity(betas, min_purity=0.8, keep_normal=True):
    """Keep the columns of ``betas`` whose annotated purity exceeds ``min_purity``.

    Purity is read from the ``ABSOLUTE.purity`` column of the annotation
    sheet; null values and values equal to 1.0 are discarded before lookup.

    Args:
        betas: table whose column labels are looked up in the annotation.
        min_purity: exclusive lower bound on the purity of kept columns.
        keep_normal: when true, columns whose label has '1' at position 13
            are kept regardless of purity.  (This flag used to be ignored and
            such columns were always kept; the default preserves that.)

    Returns:
        A copy of the selected columns.
    """
    Hui_annot = pd.read_csv('/Users/wandingzhou/projects/hs-tcga/data/2015_03_23_Hui_annotation/sampleAnnotSubWB20130619.tsv', index_col=1, error_bad_lines=False, sep='\t')
    # there are two purities, ABSOLUTE.purity and abs.purity, ABSOLUTE.purity has more value (4776 vs 4468)
    # the two values agree on overlap
    # I hereby use ABSOLUTE.purity
    # 4239 samples with purity estimates
    purity_anno = Hui_annot['ABSOLUTE.purity']
    purity_anno = purity_anno[(purity_anno.notnull()) & (purity_anno != 1.0)]

    def _is_selected(x):
        # `x in purity_anno` tests membership in the Series index.
        return (x in purity_anno and purity_anno[x] > min_purity) or (keep_normal and x[13]=='1')

    sample_is_pure = betas.columns.map(_is_selected)
    betasv = betas.loc[:,sample_is_pure].copy()
    wzcore.err_print('Selected %d pure samples (>%1.2f) from %d samples.' % (betasv.shape[1], min_purity, betas.shape[1]))
    return betasv
Example #12
0
def load_te_and_seqs(
        rmskbed='/Users/wandingzhou/projects/pj-mm/2015-04-23-alu/rmsk.bed.gz',
        load_seq=False,
        tetype=None,
        tetype2=None,
        tetype3=None):
    """Read TE records from a tab-separated file into a dictionary.

    Each line provides chromosome, begin, end, strand and three type fields
    (columns 0-6).  Records whose chromosome name contains '_' after its
    first character are skipped, as are records that do not match a given
    ``tetype`` / ``tetype2`` / ``tetype3`` filter.

    Args:
        rmskbed: path handed to ``wzcore.opengz``.
        load_seq: when true, call ``_te_load_seqs`` for every kept record;
            an ``IndexError`` raised there is ignored.
        tetype, tetype2, tetype3: optional exact-match filters.

    Returns:
        dict mapping ``(chrm, beg, end)`` to the ``TE`` object.
    """
    refgenome = faidx.RefGenome('/Users/wandingzhou/references/hg19/hg19.fa')
    filters = (('tetype', tetype), ('tetype2', tetype2), ('tetype3', tetype3))
    tes = {}

    wzcore.err_print_sig()
    for lineno, raw in enumerate(wzcore.opengz(rmskbed)):
        # progress marker every 100000 lines
        if lineno % 100000 == 0:
            wzcore.err_print_m(' %d' % lineno)

        cols = raw.strip().split('\t')
        te = TE()
        te.chrm = cols[0]
        if te.chrm.find('_') > 0:
            continue

        te.beg = int(cols[1])
        te.end = int(cols[2])
        te.rmskbed = rmskbed
        te.strand = cols[3]
        te.tetype = cols[4]
        te.tetype2 = cols[5]
        te.tetype3 = cols[6]

        if any(wanted is not None and getattr(te, attr) != wanted
               for attr, wanted in filters):
            continue

        if load_seq:
            try:
                _te_load_seqs(refgenome, te)
            except IndexError:  # TE at chromosome boundaries, ignore
                pass

        tes[(te.chrm, te.beg, te.end)] = te

    wzcore.err_print_m('\n')
    wzcore.err_print('Loaded %d TEs' % len(tes))
    return tes
Example #13
0
    def associate_continuous_outliers(self, fn, fw=10, printn=10):
        """Print and scatter-plot the top-ranked genes against ``self.ts``.

        Genes are ranked by sorting ``(pval, rho, gene)`` triples in
        descending order.  For each of the first ``printn`` genes a panel
        shows ``log2`` of the first value returned by ``self.expstat`` against
        the second, and the figure is saved to ``fn``.

        Args:
            fn: output figure path.
            fw: unused; kept for interface compatibility.
            printn: maximum number of genes to print and plot.
        """
        import matplotlib.pyplot as plt
        plt.figure(figsize=(10,10))
        plt.subplots_adjust(hspace=1)

        toplist = sorted(zip(self.pvals, self.rhos, self.genes_select), reverse=True)
        n = max(min(printn, len(toplist)), 0)
        # Two panels per row.  Keep the historical 5-row grid for up to ten
        # panels and grow it beyond that, where a fixed 5x2 grid used to fail.
        nrows = max(5, (n + 1) // 2)
        for i, (pval, rho, gene) in enumerate(toplist[:n]):
            wzcore.err_print('%s\tpval:%1.2f\trho:%1.2f' % (gene, pval, rho))
            plt.subplot(nrows,2,i+1)
            _dgene, _ts = self.expstat(self.ts, gene)
            plt.scatter(np.log2(_dgene), _ts, edgecolor='none', alpha=0.5, s=4)
            plt.xlabel('log2('+gene+' %1.2f)' % rho)

        plt.savefig(fn, bbox_inches='tight')
Example #14
0
def probe_select_pairwise(betas, v1, v2, mindelta=0.5, upq=0.75, loq=0.25):
    """Select rows of ``betas`` that separate two groups of columns.

    Group 1 / group 2 are the columns of ``betas`` found in ``v1.index`` /
    ``v2.index``.  A row is "hypo" when the ``loq`` quantile of group 2
    exceeds the ``upq`` quantile of group 1 by more than ``mindelta``, and
    "hyper" when the same holds with the groups swapped.

    Returns:
        ``(hypo_labels, hyper_labels)`` as two lists of row labels.
    """
    in_group1 = betas.columns.isin(v1.index)
    in_group2 = betas.columns.isin(v2.index)

    hypo, hyper = [], []
    for label, values in betas.iterrows():
        group1 = values[in_group1]
        group2 = values[in_group2]

        hypo_gap = group2.quantile(loq) - group1.quantile(upq)
        if hypo_gap > mindelta:
            hypo.append(label)

        hyper_gap = group1.quantile(loq) - group2.quantile(upq)
        if hyper_gap > mindelta:
            hyper.append(label)

    wzcore.err_print("selected %d hypo and %d hyper." % (len(hypo), len(hyper)))
    return hypo, hyper
Example #15
0
    def __init__(self):
        """Index the merged mutation file by sample and by gene.

        Lines are split on whitespace.  Lines whose third field is 'Silent'
        or 'RNA' are skipped.  For the others, the second field is recorded
        in ``self.genes`` and the tuple ``(field 2, field 3, field 4)``
        (1-based) is appended to ``self.sample2muts`` under the first 12
        characters of the fifth field.
        """
        self.sample2muts = {}
        self.genes = set()

        skipped = ('Silent', 'RNA')
        with open('/Users/wandingzhou/projects/hs-tcga/data/2015_06_17_TCGA_mutations/merged_maf') as fh:
            for line in fh:
                fields = line.strip().split()
                kind = fields[2]
                if kind in skipped:
                    continue
                sample = fields[4][:12]
                gene = fields[1]
                self.sample2muts.setdefault(sample, []).append((gene, kind, fields[3]))
                self.genes.add(gene)

        wzcore.err_print('Loaded %d genes and %d samples' % (len(self.genes), len(self.sample2muts)))
Example #16
0
def load_cgi_and_seqs():
    """Load CGI records and fetch an upper-cased sequence for each.

    Every line of the bed file provides chromosome, begin, end and type
    (columns 0-3).  The sequence is fetched from the reference genome using
    the record's own ``seq_beg()`` / ``seq_end()`` coordinates.

    Returns:
        list of ``CGI`` objects in file order.
    """
    refgenome = faidx.RefGenome('/Users/wandingzhou/references/hg19/hg19.fa')
    bed_path = '/Users/wandingzhou/projects/hs-tcga/data/2015_03_24_cpg_island/TakaiJones/takai.jones.strict.bed.gz'

    cgis = []
    for raw in wzcore.opengz(bed_path):
        cols = raw.strip().split('\t')

        cgi = CGI()
        cgi.chrm = cols[0]
        cgi.beg = int(cols[1])
        cgi.end = int(cols[2])
        cgi.cgitype = cols[3]

        sequence = refgenome.fetch_sequence(cgi.chrm, cgi.seq_beg(), cgi.seq_end())
        cgi.seq = sequence.upper()
        cgis.append(cgi)

    wzcore.err_print('Loaded %d CGIs' % len(cgis))
    return cgis
Example #17
0
def probe_select1(betas, v, upq=0.75, loq=0.25, mindelta=0.1, topn=100):
    """Select rows where one group of columns stands apart from the rest.

    The group is the columns of ``betas`` found in ``v.index``; the
    background is every other column.  Per row:

    * hyper score = group minimum - background ``upq`` quantile
    * hypo score  = background ``loq`` quantile - group maximum

    For each direction the ``topn`` highest-scoring rows with a score above
    ``mindelta`` are selected.

    Args:
        betas: table with one row per probe.
        v: object whose ``index`` lists the group's column labels.
        upq, loq: background quantiles used for the two scores.
        mindelta: exclusive lower bound on a selected score.
        topn: maximum number of rows selected per direction.

    Returns:
        list of the selected row labels (union of both directions, in no
        particular order).
    """
    inind = betas.columns.isin(v.index)

    hyperrows = []
    hyporows = []
    for i, row in betas.iterrows():
        c = row[inind]
        cb = row[~inind]
        hyper_score = c.min() - cb.quantile(upq)
        hypo_score = cb.quantile(loq) - c.max()
        # Filter before ranking: a NaN score fails this comparison and is
        # dropped here, so it can no longer corrupt the sort order below.
        if hyper_score > mindelta:
            hyperrows.append((hyper_score, i))
        if hypo_score > mindelta:
            hyporows.append((hypo_score, i))

    _select1 = sorted(hyporows, reverse=True)[:topn]
    _select2 = sorted(hyperrows, reverse=True)[:topn]

    wzcore.err_print("selected %d hypo and %d hyper probes." % (len(_select1), len(_select2)))

    return list(set([j for i,j in _select1]) | set([j for i,j in _select2]))
Example #18
0
def polarized(df, upthres=0.7,dwthres=0.3, kind='any', minsupp=1):
    """Select rows that have both high and low values.

    A row needs at least ``minsupp`` values above ``upthres`` and at least
    ``minsupp`` values below ``dwthres``.

    * ``kind='any'``: that condition alone is sufficient.
    * ``kind='all'``: additionally every value of the row must be either
      above ``upthres`` or below ``dwthres``.
    * any other ``kind`` selects nothing.

    Returns:
        The selected rows of ``df``.
    """

    def _isvar(row):
        n_high = (row > upthres).sum()
        n_low = (row < dwthres).sum()
        if n_high < minsupp or n_low < minsupp:
            return False
        if kind == 'any':
            return True
        if kind == 'all' and n_high + n_low == row.shape[0]:
            return True
        return False

    keep = df.apply(_isvar, axis=1)
    dfv = df[keep]
    wzcore.err_print('Selected %d variable probes from %d samples' % dfv.shape)
    return dfv
Example #19
0
def get_cph(df):
    """Return the rows of ``df`` whose index label starts with 'ch'.

    The number of rows kept is reported through ``wzcore.err_print``.
    """
    # `.str.startswith` is the vectorised string method; the previous
    # `.str(startswith('ch'))` referenced an undefined name and always raised.
    df = df.loc[df.index.str.startswith('ch')]

    wzcore.err_print('Get %d CpH probes.' % df.shape[0])
    return df
Example #20
0
def clean_450k(df, nahow='strong', probe_fn='/Users/wandingzhou/projects/hs-tcga/data/2015_03_05_TCGA_450/450k_probes', verbose=True):
    """Drop sex-chromosome, non-'cg' and missing-value probes from ``df``.

    Steps, in order:

    1. remove rows whose probe is located on chrX or chrY according to
       ``probe_fn``;
    2. keep only rows whose label starts with 'cg';
    3. drop rows with missing values according to ``nahow``.

    Args:
        df: Series or table indexed by probe label.
        nahow: 'strong' drops rows with any missing value, 'weak' drops rows
            where every value is missing; other values skip this step.
        probe_fn: headerless tab-separated file with columns chromosome,
            begin, end and probe label.
        verbose: when false, nothing is printed (the per-step messages used
            to be printed regardless of this flag).

    Returns:
        The filtered Series / table.
    """
    probe_loc = pd.read_table(probe_fn,index_col=3,header=None, names=['chrm','beg','end','gene'])

    def _log(message):
        if verbose:
            wzcore.err_print(message)

    if len(df.shape) == 1:
        _log("Before: %d probes" % df.shape[0])
    else:
        _log("Before: %d probes and %d samples" % df.shape)

    # remove X,Y chromosome
    _log('Removing sex chromosomes')
    df = df[(~probe_loc.chrm.isin(['chrX','chrY']))[df.index]]
    _log('Removing cph and rs probes')
    df = df.loc[df.index.str.startswith('cg')]

    # remove NA
    _log('Removing NA probes')
    if nahow == 'strong':
        df = df.dropna(how='any')
    if nahow == 'weak':
        df = df.dropna(how='all')

    if len(df.shape) == 1:
        _log("After: %d probes after removal" % df.shape[0])
    else:
        _log("After: %d probes and %d samples" % df.shape)

    return df