def prepare_gct_files_hgic(pids=consts.ALL_PIDS, outdir=None):
    """
    Prepare the GCT files required to perform classification of the hGIC samples.

    One GCT file is written for every combination of sample type
    (hGIC cell culture, hGIC FFPE) and quantification source:
    FPKM units (cufflinks), TPM (salmon) and CPM (STAR).
    Use gene symbols as these are contained in the signatures.

    NOTE: this function does not write a combined (FFPE + cell culture) file;
    the two sample types are always exported separately.

    :param pids: Patient IDs, forwarded to loader.load_by_patient.
    :param outdir: Directory to write the GCT files to. If None, a unique output
        directory is created.
    :return: List of the paths of the GCT files written, in the order they were created.
    """
    if outdir is None:
        outdir = output.unique_output_dir()

    infiles = []

    for typ in ('cell_culture', 'ffpe'):
        for src in ('star', 'salmon', 'star/cufflinks'):
            this_obj = loader.load_by_patient(pids, type=typ, source=src, include_control=False)
            # only the GBM samples are exported
            this_obj.filter_samples(this_obj.meta.type == 'GBM')
            if typ == 'ffpe':
                # restrict to the 'best' versions (there are some duplicates where we tried twice)
                this_obj.filter_by_sample_name(consts.FFPE_RNASEQ_SAMPLES_ALL)
            # re-index from Ensembl gene IDs to gene symbols
            this_dat = reference_genomes.translate_quantification_resolving_duplicates(
                this_obj.data,
                'Ensembl Gene ID',
                'Approved Symbol'
            )
            # the file name prefix for each source comes from SRC_MAP
            fn = os.path.join(outdir, "%s_%s.gct" % (SRC_MAP[src], typ))
            gsea.data_to_gct(this_dat, fn)
            infiles.append(fn)

    return infiles
Example #2
0
def download_from_ftp(dl_paths,
                      outdir=None,
                      host='ftp-trace.ncbi.nlm.nih.gov',
                      user='',
                      passwd=''):
    """
    Download a list of remote files from an FTP server into a new output directory.

    Downloads are best-effort: a failure on one path is logged and the remaining
    paths are still attempted.

    :param dl_paths: List of paths to download. The last non-empty '/'-separated
        component of each path is used as the local file name.
    :param outdir: Name passed to unique_output_dir to create the destination
        directory. If None, this module's __name__ is used.
    :param host: FTP server host name.
    :param user: User name. If empty, an anonymous login is performed.
    :param passwd: Password for the given user.
    :return: None
    """
    if outdir is None:
        outdir = __name__
    outdir = unique_output_dir(outdir)

    # Connect without credentials and authenticate exactly once below.
    # Passing user/passwd to the constructor already logs in, so a second,
    # argument-free login() would replace that session with an anonymous one.
    ftp = ftplib.FTP(host=host)
    try:
        # empty user/passwd are treated by ftplib as an anonymous login
        ftp.login(user=user, passwd=passwd)

        for ff in dl_paths:

            # get download filename: last non-empty path component
            components = [t for t in ff.split('/') if t]
            outfile = os.path.join(outdir, components[-1])
            logger.info("Attempting to download %s to %s", ff, outfile)
            with open(outfile, 'wb') as f:
                try:
                    ftp.retrbinary('RETR %s' % ff, f.write)
                except Exception:
                    logger.exception("Download failed")
    finally:
        # always release the control connection; fall back to a hard close
        # if the server no longer answers a polite QUIT
        try:
            ftp.quit()
        except ftplib.all_errors:
            ftp.close()
Example #3
0
def download_from_manifest(path_to_manifest, outdir=None, legacy=False):
    """
    Fetch every file listed in the supplied manifest.
    :param path_to_manifest: Path to a tab-separated manifest with a header row. The first
    column is used as the file ID and the `filename` column gives the local file name.
    :param outdir: Destination directory. If None, create a unique output folder
    :param legacy: Forwarded unchanged to download_data
    :return: None
    """
    if outdir is None:
        outdir = unique_output_dir("nih_gdc_legacy")
    manifest = pd.read_csv(path_to_manifest, sep='\t', header=0, index_col=0)
    # one download per manifest row; the row label is the file ID
    for file_id, entry in manifest.iterrows():
        destination = os.path.join(outdir, entry.filename)
        download_data(file_id, destination, legacy=legacy)
def prepare_gct_files(outdir=None):
    """
    Write the GCT files needed for classification:
    - Our GBM FFPE and cell culture samples
    - TCGA RNA-Seq cohort
    - Both combined
    In all cases, use FPKM units and gene symbols, as these are used by Wang

    :param outdir: Directory to write to. If None, create a unique output folder
    :return: None
    """
    if outdir is None:
        outdir = unique_output_dir("gct_files_for_wang")

    written = []

    # 1) Our data
    ffpe = rnaseq_data.load_by_patient('all', type='ffpe')
    ffpe_fpkm = ffpe.get_fpkm()
    # label the FFPE columns by reference ID so they are distinguishable after concatenation
    ffpe_fpkm.columns = ['%s_FFPE' % ref_id for ref_id in ffpe.meta.reference_id]

    cc = rnaseq_data.load_by_patient(patient_ids='all')
    # keep only the GBM columns of the cell culture data
    cc_fpkm = cc.get_fpkm().loc[:, cc.meta.type == 'GBM']

    ours = pd.concat((cc_fpkm, ffpe_fpkm), axis=1)
    # switch the index from Ensembl IDs to gene symbols, dropping IDs with no symbol
    symbols = reference_genomes.ensembl_to_gene_symbol(ours.index).dropna()
    ours = ours.loc[symbols.index]
    ours.index = symbols
    ours_fn = os.path.join(outdir, "gbm_ffpe_cc_fpkm.gct")
    gsea.data_to_gct(ours, ours_fn)
    written.append(ours_fn)

    # 2) TCGA (IDH1 WT only)
    tcga, tcga_meta = rnaseq_data.tcga_primary_gbm(units='fpkm')
    tcga = tcga.loc[:, tcga_meta.idh1_status == 'WT']
    tcga_symbols = reference_genomes.ensembl_to_gene_symbol(tcga.index).dropna()
    # keep the first entry for any repeated Ensembl ID
    tcga_symbols = tcga_symbols.loc[~tcga_symbols.index.duplicated()]
    tcga = tcga.loc[tcga_symbols.index]
    tcga.index = tcga_symbols
    tcga_fn = os.path.join(outdir, "tcga_idh1_wt_fpkm.gct")
    gsea.data_to_gct(tcga, tcga_fn)
    written.append(tcga_fn)

    # 3) Combined
    combined = gsea.combine_gct_files(*written)
    combined_fn = os.path.join(outdir, "tcga_idh1_wt_and_gbm_ffpe_cc_fpkm.gct")
    gsea.data_to_gct(combined, combined_fn)
import os

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from load_data import rnaseq_data
from rnaseq import differential_expression, general
from settings import LOCAL_DATA_DIR
from utils import output, setops, excel, ipa, reference_genomes

if __name__ == "__main__":

    outdir = output.unique_output_dir("cross_validate_de_multiple_refs", reuse_empty=True)
    # all n=2 samples and RTK II samples
    pids = ['017', '019', '030', '031', '050', '054']
    cmap = 'RdYlGn_r'

    de_params = {
        'lfc': 1,
        'fdr': 0.01,
        'method': 'GLM'
    }

    subgroups = {
        'RTK I': ['019', '030', '031'],
        'RTK II': ['017', '050', '054'],
    }

    intersecter = lambda x, y: set(x).intersection(y)
Example #6
0
            if not isinstance(the_names, str):
                the_names = ';'.join(the_names)
            c.writerow([
                row[0],
                row[1],
                row[2],
                the_names,
                '.',  # empty score field
                row[3]
            ])


if __name__ == "__main__":

    distance = 5000  # distance from TSS to include

    fn = os.path.join(LOCAL_DATA_DIR, 'reference_genomes', 'human', 'ensembl',
                      'GRCh38.release87', 'gtf',
                      'Homo_sapiens.GRCh38.87.gtf.gz')
    outdir = output.unique_output_dir("chipseq_analysis")

    reg, names = get_gene_tss_from_gtf(fn, distance=distance, sources=SOURCES)
    fn_out = os.path.join(outdir, 'gene_tss_pad_%d.bed' % distance)
    write_bed_file(reg, names, fn_out)

    reg, names = get_transcript_tss_from_gtf(fn,
                                             distance=distance,
                                             sources=SOURCES)
    fn_out = os.path.join(outdir, 'transcript_tss_pad_%d.bed' % distance)
    write_bed_file(reg, names, fn_out)
Example #7
0
    # }

    subgroup_set_colours = {
        'RTK I full': '#0d680f',
        'RTK II full': '#820505',
        'MES full': '#7900ad',
        'RTK I partial': '#6ecc70',
        'RTK II partial': '#d67373',
        'MES partial': '#cc88ea',
        'mixed': '#4C72B0',
        'specific': '#f4e842',
    }

    min_cpm = 1

    outdir = output.unique_output_dir("compare_de_gene_counts_s1",
                                      reuse_empty=True)
    obj = loader.load_by_patient(pids, include_control=False)

    # remove IPSC and rejected 061 samples for good
    idx = ((~obj.meta.index.str.contains('IPSC'))
           &
           (~obj.meta.index.isin(['DURA061_NSC_N1_P5', 'DURA061_NSC_N6_P4'])))
    obj.meta = obj.meta.loc[idx]
    obj.data = obj.data.loc[:, idx]
    obj.batch_id = obj.batch_id.loc[idx]

    # we'll run everything with two different edgeR tests

    methods = ('GLM', 'QLGLM')

    res_1 = {}
Example #8
0
    cg.ax_heatmap.yaxis.label.set_visible(False)
    cg.ax_heatmap.xaxis.label.set_visible(False)
    if show_gene_labels:
        plt.setp(cg.ax_heatmap.yaxis.get_ticklabels(), rotation=0, fontsize=14)
    else:
        cg.ax_heatmap.yaxis.set_ticklabels([])

    return cg


if __name__ == "__main__":

    N_PC = 3
    geneset = consts.NORTHCOTT_GENES
    outdir = unique_output_dir("pca_atcc_lines")

    # it's useful to maintain a list of known upregulated genes
    nano_genes = []
    for grp, arr in consts.NANOSTRING_GENES:
        if grp != 'WNT':
            nano_genes.extend(arr)
    nano_genes.remove('EGFL11')
    nano_genes.append('EYS')

    all_nstring = []
    [all_nstring.extend(t) for _, t in consts.NANOSTRING_GENES]
    all_ncott = []
    [all_ncott.extend(t) for _, t in consts.NORTHCOTT_GENES]

    # load Ncott data (285 non-WNT MB samples)
Example #9
0
import pandas as pd
from utils import output
from settings import OUTPUT_DIR, DATA_DIR
import os
from plotting import venn, clustering
from matplotlib import pyplot as plt

if __name__ == "__main__":
    outdir = output.unique_output_dir("tcga_gbm_analysis", reuse_empty=True)
    # load meta files
    meta_fn = {
        'rnaseq':
        os.path.join(DATA_DIR, 'rnaseq', 'tcga_gbm', 'primary_tumour',
                     'rnaseq.meta.csv'),
        'marr_u133':
        os.path.join(DATA_DIR, 'microarray', 'tcga_gbm', 'primary_tumour',
                     'microarray.meta.ht_hg_u133a.csv'),
        'marr_agilent1':
        os.path.join(DATA_DIR, 'microarray', 'tcga_gbm', 'primary_tumour',
                     'microarray.meta.agilentg4502a_07_1.csv'),
        'marr_agilent2':
        os.path.join(DATA_DIR, 'microarray', 'tcga_gbm', 'primary_tumour',
                     'microarray.meta.agilentg4502a_07_2.csv'),
        'meth_450k':
        os.path.join(DATA_DIR, 'methylation', 'tcga_gbm', 'primary_tumour',
                     'methylation.450k.meta.csv'),
        'meth_27k':
        os.path.join(DATA_DIR, 'methylation', 'tcga_gbm', 'primary_tumour',
                     'methylation.27k.meta.csv'),
    }
Example #10
0
from utils import genomics, output, log
from matplotlib import pyplot as plt
import seaborn as sns

logger = log.get_console_logger(__name__)


def plot_one_hist(dat, ax, *args, **kwargs):
    """
    Draw a histogram of the first column of `dat` expressed as a percentage of each row total.

    :param dat: 2D array; the row-wise sum is used as the denominator.
    :param ax: Object providing a `hist` method (e.g. a matplotlib axis).
    :param args: Extra positional arguments forwarded to `ax.hist`.
    :param kwargs: Extra keyword arguments forwarded to `ax.hist`.
    """
    row_totals = dat.sum(axis=1).astype(float)
    pct_first = dat[:, 0] / row_totals * 100.
    ax.hist(pct_first, *args, **kwargs)


if __name__ == "__main__":

    min_coverage = 10
    outdir = output.unique_output_dir("rrbs_methylation_cpg_islands",
                                      reuse_empty=True)

    cpg_island_tsv = os.path.join(GIT_LFS_DATA_DIR, 'mouse_cpg_island',
                                  'grcm38_cpgisland.tsv')
    cpg_regions = pd.read_csv(cpg_island_tsv, sep='\t', header=0)

    indir = os.path.join(DATA_DIR, 'rrbseq', 'GC-CV-7163', 'trim_galore',
                         'mouse', 'bismark')
    subdir = "GC-CV-7163-{i}_S{i}"
    flist = glob(os.path.join(indir, "*.bismark.cov.gz"))

    chroms = [str(t) for t in range(1, 20)]

    chrom_lengths = genomics.reference_genome_chrom_lengths(tax_id=10090)
    # discard unplaced scaffolds, MT, X, Y
    chrom_lengths = chrom_lengths.loc[chroms]
Example #11
0
from utils import output


def log_cpm(dat, base=2, offset=1.):
    """
    Convert counts to log-transformed counts per million.

    The offset is added before normalising. A 2D input is normalised column by column
    (each column divided by its own total); any other input is divided by its overall sum.

    :param dat: pandas DataFrame or Series of counts.
    :param base: Base of the logarithm.
    :param offset: Value added to every entry before normalisation.
    :return: Object of the same shape as `dat` holding log_base(CPM).
    """
    shifted = dat + offset
    totals = shifted.sum()
    if len(shifted.shape) == 2:
        # align the per-column totals with the columns
        fraction = shifted.divide(totals, axis=1)
    else:
        fraction = shifted.divide(totals)
    per_million = fraction * 1e6
    return np.log(per_million) / np.log(base)


if __name__ == '__main__':
    min_cpm = 0.01

    outdir = output.unique_output_dir("biological_technical_ecdf")

    # all our patient data (cell culture)

    our_patient_obj = loader.load_by_patient('all', source='star')

    # all our patient data (FFPE culture)
    ffpe_samples = [
        'NH15_1661DEF2C',
        'NH15_1877_SP1C',
        'NH15_2101_DEF1A',
        'NH16_270_DEF1Ereplacement',
        'NH16_616DEF1B',
        'NH16_677_SP1A',
        'NH16_1574DEF1A',
        'NH16_1976_DEF1Areplacement',
import os
import pandas as pd

from utils import output, setops, log, genomics
from settings import INTERMEDIATE_DIR
from scripts.hgic_final import two_strategies_combine_de_dmr as tscdd
from scripts.hgic_final import consts
from methylation import dmr

logger = log.get_console_logger()

if __name__ == "__main__":
    outdir = output.unique_output_dir(reuse_empty=True)

    # Set this to True to include reference methylation data
    # This will limit the number of available probes (to 450K)
    include_external_dm_refs = False

    de_params = consts.DE_PARAMS
    dmr_params = consts.DMR_PARAMS

    norm_method_s1 = 'swan'

    pids = consts.PIDS

    if include_external_dm_refs:
        external_ref_names_dm = ['GSE38216']
        external_ref_samples_dm = ['H9 NPC 1', 'H9 NPC 2']
    else:
        external_ref_names_dm = None
        external_ref_samples_dm = None
                            [row.CHR, row.Strand, row.MAPINFO, t[0], t[1]])
    return pd.DataFrame(this_res,
                        columns=['probe_id'] + df.columns.tolist() + anno_cols)


if __name__ == '__main__':
    anno = loader.load_illumina_methylationepic_annotation(split_genes=False)

    # 1. Annotate DMPs and re-export to Excel

    # dmp_fns = glob(os.path.join(GIT_LFS_DATA_DIR, 'mb_dmp', '*.xlsx'))
    dmp_fns = glob(os.path.join(os.path.expanduser('~/temp'), '*.xlsx'))

    print "Found %d relevant input (DMP) files: %s" % (len(dmp_fns),
                                                       ', '.join(dmp_fns))
    outdir = output.unique_output_dir("mb_dmps")
    res = {}

    for fn in dmp_fns:
        base = os.path.splitext(os.path.basename(fn))[0]
        res[base] = {}
        dat = pd.read_excel(fn, sheet_name=None)
        for cmp, df in dat.items():
            res[base][cmp] = annot_one(df, anno)

        # save to Excel
        out_fn = os.path.join(outdir, os.path.basename(fn))
        excel.pandas_to_excel(res[base], out_fn, write_index=False)

    # 2.1 Look for common DMPs
from rnaseq import loader, differential_expression, general, filter
import os
import pandas as pd
from utils import output
from settings import RNASEQ_DIR
import numpy as np
"""
Aim:

Carry out DE analysis on the TCGA RNA-Seq data, all primary tumour samples vs all solid healthy tissue
"""

if __name__ == "__main__":
    de_params = {'lfc': 1, 'fdr': 0.01, 'method': 'QLGLM'}

    outdir = output.unique_output_dir("tcga_de")
    indir_pt = os.path.join(RNASEQ_DIR, 'tcga_gbm', 'primary_tumour')
    indir_hn = os.path.join(RNASEQ_DIR, 'tcga_gbm', 'solid_tissue_normal')

    dat_fn = os.path.join(indir_pt, 'rnaseq.htseq.csv.gz')
    dat_hn_fn = os.path.join(indir_hn, 'rnaseq_normal.htseq.csv.gz')
    meta_fn = os.path.join(indir_pt, 'brennan_s7.csv')

    meta = pd.read_csv(meta_fn, header=0, index_col=0)
    dat_pt = pd.read_csv(dat_fn, header=0, index_col=0)
    dat_hn = pd.read_csv(dat_hn_fn, header=0, index_col=0)

    dat_pt.columns = [t[:12] for t in dat_pt.columns]
    dat_pt = dat_pt.loc[:, ~dat_pt.columns.duplicated()]
    meta = meta.loc[dat_pt.columns]
    meta = meta.loc[~meta.index.duplicated()]
Example #15
0
def compute_median_betas_one_sample(the_dat, probes_by_gene):
    """
    Compute, for each gene, the median over the rows of `the_dat` selected by that gene's probes.

    A gene whose probe lookup raises KeyError is skipped: the gene is recorded as missing
    and all of its probes are added to the set of missing probes.

    :param the_dat: pandas object indexed by probe ID.
    :param probes_by_gene: Mapping from gene to the probe IDs associated with it.
    :return: Tuple (medians keyed by gene, list of missing genes, set of missing probes).
    """
    medians = {}
    genes_not_found = []
    probes_not_found = set()
    for gene, probe_ids in probes_by_gene.items():
        try:
            gene_median = the_dat.loc[probe_ids].median(axis=0)
        except KeyError:
            probes_not_found.update(probe_ids)
            genes_not_found.append(gene)
        else:
            medians[gene] = gene_median
    return medians, genes_not_found, probes_not_found


if __name__ == "__main__":
    outdir = output.unique_output_dir("report_beta_values")

    ######################
    # 1: PRIMARY TUMOUR  #
    ######################

    # in this case, we want the median beta value over all probes that are associated with a given gene
    # we'll exclude those associated with gene body only
    indir = os.path.join(DATA_DIR, 'methylation', 'tcga_gbm', 'primary_tumour')
    meta_fn = os.path.join(indir, 'methylation.450k.meta.csv')
    dat_fn = os.path.join(indir, 'methylation.450k.csv.gz')

    meta = pd.read_csv(meta_fn, header=0, index_col=0)
    dat = pd.read_csv(dat_fn, header=0, index_col=0, skiprows=[1])

    print "Primary tumour (%d samples)" % meta.shape[0]
import os
from utils.output import unique_output_dir
from load_data import methylation_array
from plotting import clustering, pca
from classification import lda
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np

if __name__ == "__main__":
    outdir = unique_output_dir('meth_classification_lda')
    REF_META_SUBGRP_LABEL = 'dna methylation subgroup'

    data, meta = methylation_array.hgic_methylationepic(norm_method='swan')
    # Don't know why, but some probes (~2000) are only present in one OR the other sample
    # Therefore, remove those
    data = data.dropna()

    # add some extra meta information
    meta.loc[:, 'cell_type'] = 'NSC'
    meta.loc[meta.index.str.contains('GBM'), 'cell_type'] = 'GBM'
    meta.loc[:, 'subgroup'] = 'RTK I'
    meta.loc[meta.index.str.contains('024'), 'subgroup'] = 'Unknown'
    meta.loc[meta.index.str.contains('026'), 'subgroup'] = 'Unknown'
    meta.loc[meta.index.str.contains('044'), 'subgroup'] = 'Mesenchymal'
    meta.loc[meta.index.str.contains('GIBCO'), 'subgroup'] = 'NSC'
Example #17
0
    # resolve any duplicates arbitrarily (these should be rare)
    gs = gs.loc[~gs.index.duplicated()]
    df.insert(0, 'Gene Symbol', gs)


def add_fc_direction(df):
    """
    Append a 'Direction' column to `df`, derived from the sign of its `logFC` column.

    Rows with negative logFC are labelled 'down' and rows with positive logFC 'up'.
    Rows where logFC is exactly zero (or missing) are left as NaN.

    :param df: DataFrame with a `logFC` column. It is modified in place.
    :return: None
    """
    # The column holds strings, so create it as object dtype up front rather than
    # relying on the dtype-less default for an empty Series.
    direction = pd.Series(index=df.index, name='Direction', dtype=object)
    direction.loc[df.logFC < 0] = 'down'
    direction.loc[df.logFC > 0] = 'up'
    # insert as the last column
    df.insert(df.shape[1], 'Direction', direction)


if __name__ == '__main__':
    lfc = 1
    fdr = 0.01
    outdir = output.unique_output_dir("paired_rnaseq")
    # RTK II samples
    # pids = ['017', '050', '054', '061']
    # all n=2 samples
    pids = ['018', '044', '049', '050', '052', '054', '061']
    # all samples
    # pids = [t for t in rnaseq_data.PATIENT_LOOKUP_STAR if t != 'GIBCO']
    obj = rnaseq_data.load_by_patient(pids, annotate_by='Ensembl Gene ID')
    # discard unmapped, etc
    obj.data = obj.data.loc[obj.data.index.str.contains('ENSG')]

    dat_filt = filter_by_cpm(obj.data, min_n_samples=2)

    de = {}
    de_up = {}
    de_down = {}
Example #18
0
import os
import collections
import gzip
import numpy as np
from scipy import stats
import re
from settings import DATA_DIR, LOCAL_DATA_DIR, GIT_LFS_DATA_DIR
import pysam
from matplotlib import pyplot as plt
import pandas as pd
import multiprocessing as mp
from utils import log, genomics, output
logger = log.get_console_logger(__name__)

if __name__ == "__main__":
    outdir = output.unique_output_dir("rrbs_enzyme_specificity",
                                      reuse_empty=True)

    basedir = os.path.join(DATA_DIR, 'rrbseq', 'GC-CV-7163')

    indir = os.path.join(basedir, 'trim_galore_mouse/bismark')
    bam_fn = os.path.join(indir, 'GC-CV-7163-6_S6_pe.sorted.bam')
    cov_fn = os.path.join(indir, 'GC-CV-7163-6_S6_bismark.cov.gz')
    s = pysam.AlignmentFile(bam_fn, 'rb')
    chroms = [str(t) for t in range(1, 20)]

    # theoretical (binomial) distribution of inferred methylation by coverage
    Ns = [10, 20, 50, 100]
    cs = ['k', 'r', 'b', 'g']
    ps = [0.1, 0.25, 0.5]
    for p in ps:
        fig = plt.figure()
def log_cpm(dat, base=2, offset=1.):
    """
    Return log-scaled counts per million for a pandas DataFrame or Series of counts.

    `offset` is added to every entry first. For 2D data each column is scaled by its own
    total; otherwise the data are scaled by their overall sum.

    :param dat: pandas DataFrame or Series of counts.
    :param base: Logarithm base.
    :param offset: Constant added before scaling.
    :return: log_base(CPM), same shape as the input.
    """
    dat = dat + offset
    lib_size = dat.sum()
    is_matrix = len(dat.shape) == 2
    # for 2D input the totals are matched against the columns
    scaled = dat.divide(lib_size, axis=1) if is_matrix else dat.divide(lib_size)
    cpm = scaled * 1e6
    return np.log(cpm) / np.log(base)


if __name__ == '__main__':
    pids = ['019', '031', '049', '052']
    min_cpm = 1
    min_cpm_individual = 0.1

    outdir = output.unique_output_dir("james_opc_smartseq2_vs_polya")

    ## 1) STAR CPM estimates

    ss2_obj = loader.load_references('wtchg_p180059', strandedness='u')
    assigned_sum = ss2_obj.data.sum()
    unassigned_sum = ss2_obj.data_unassigned.drop('N_unmapped').sum()

    ss2_pct_assigned = assigned_sum / (assigned_sum + unassigned_sum) * 100.

    print "SmartSeq2 samples % assigned"
    print ss2_pct_assigned

    polya_obj = loader.load_by_patient(pids)

    # restrict to relevant samples for first part of the analysis
Example #20
0
    eps = .1  # offset for log transform

    rna_ff_samples = [
        'NH15_1661DEF2C',
        'NH15_1877_SP1C',
        'NH15_2101_DEF1A',
        'NH16_270_DEF1Ereplacement',
        'NH16_616DEF1B',
        'NH16_677_SP1A',
        'NH16_2063_DEF1Areplacement',
        'NH16_2214DEF1A',
        'NH16_2255DEF1B2',
        'NH16_2806DEF3A1'
    ]

    outdir = output.unique_output_dir("cruk_ffpe_cc_correlation")

    if remove_mt:
        mt_ens = general.get_mitochondrial(9606)

    rna_cc_obj = rnaseq.loader.load_by_patient(pids, source=source, include_control=False)
    rna_ff_obj = rnaseq.loader.load_by_patient(pids, source=source, include_control=False, type='ffpe')

    # filter
    ix = rna_ff_obj.meta.index.isin(rna_ff_samples)
    rna_ff_obj.filter_samples(ix)

    ix = rna_cc_obj.meta.type == 'GBM'
    rna_cc_obj.filter_samples(ix)

    # add NH ID and patient ID to FFPE
Example #21
0
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.stats import rankdata

from load_data import rnaseq_data
from stats import transformations
from utils.output import unique_output_dir
from utils.reference_genomes import ensembl_to_gene_symbol, gene_symbol_to_ensembl

if __name__ == "__main__":
    outdir = unique_output_dir("tom_qpcr", reuse_empty=True)
    ref = 'GIBCO_NSC_P4'

    obj = rnaseq_data.all_hgic_loader(annotate_by="Ensembl Gene ID")
    dat = obj.data.loc[obj.data.index.str.contains('ENSG')]
    dat = dat.loc[:, ~obj.meta.index.str.contains('DURA')]
    # normalised version (by number of aligned reads)
    dat_n = dat.divide(dat.sum(axis=0), axis=1) * 1e6

    # remove any absent / mostly absent genes
    median_count = dat_n.median(axis=1).sort_values()
    keep_idx = median_count.loc[median_count != 0].index

    dat = dat.loc[keep_idx]
    dat_n = dat_n.loc[keep_idx]
    median_count = median_count.loc[keep_idx]

    # remove any genes that are (mostly) absent in NSC
Example #22
0
    # source = 'star'

    # units = 'estimated_counts'
    units = 'tpm'
    # units = 'cpm'
    # units = 'counts'

    # transform = 'vst'
    transform = 'log'

    # remove_mt = True
    remove_mt = False

    pca_add_sample_names = False

    outdir = unique_output_dir("mouse_nsc_pca_cluster", reuse_empty=True)
    n_gene_try = [1000, 2000, 3000,
                  5000][::-1]  # largest first, so we can reuse the MAD array

    if source == 'star':
        load_cls = loader.StarCountLoader
        load_kwargs = {}
    elif source == 'salmon':
        load_cls = loader.SalmonQuantLoader
        load_kwargs = {'units': units}
    else:
        raise ValueError("Unrecognised source %s" % source)

    if units == 'tpm':
        eps = .01
    elif units == 'estimated_counts':
Example #23
0
        if s.mate(rd) not in reads_seen:
            reads_seen.add(rd)
    return len(reads_seen)


if __name__ == "__main__":
    """
    Usage: rrbs_theor_fragment_analysis.py <BAM_FN>
    BAM_FN must be a sorted bam file
    """
    bam_fn = sys.argv[1]
    if not os.path.isfile(bam_fn):
        raise ValueError("Unable to find BAM file %s" % bam_fn)

    bam_dir = os.path.split(os.path.abspath(bam_fn))[0]
    outdir = output.unique_output_dir("rrbs_fragment_analysis",
                                      root_output_dir=bam_dir)

    # fixed output directory for BED regions
    bed_outdir = os.path.join(output.OUTPUT_DIR, "rrbs_theor_fragments")
    if not os.path.exists(bed_outdir):
        logger.info("Created output dir %s", bed_outdir)
        os.makedirs(bed_outdir)

    # same output directory for remaining results
    outfile = re.sub(r'(\.sorted)?\.bam', ".coverage.pkl",
                     os.path.split(bam_fn)[-1])
    outfn = os.path.join(outdir, outfile)
    fcounts_outfn = os.path.join(
        outdir,
        re.sub(r'(\.sorted)?\.bam', '.mspi_fragments.counts',
               os.path.split(bam_fn)[-1]))
Example #24
0
    def get_result(self,
                   sample_id,
                   outdir=None,
                   sample_name=None,
                   run_id=None):
        """
        Retrieve results relating to a sample and save to disk.

        Files are written to `<self.outdir>/<batch>/`, where the batch is read from the
        sample's summary page.

        :param sample_id: ID of the sample, used to build the sample and results URLs.
        :param outdir: Output directory. Only used when self.outdir has not been set yet;
            if both are None a unique output directory is created.
        :param sample_name: If supplied, this overrides the submitted sample name
        :param run_id: If supplied, this is used, otherwise the latest run is automatically determined
        :return: None
        """

        if self.outdir is None:
            if outdir is None:
                self.outdir = unique_output_dir('heidelberg_classifier',
                                                reuse_empty=True)
            else:
                self.outdir = outdir
            print("Data will be downloaded to %s" % self.outdir)

        the_url = self.SAMPLE_URL.format(sid=sample_id)
        resp = self.session.get(the_url)
        soup = BeautifulSoup(resp.content, "html.parser")

        summary = self.get_summary_data(soup=soup)
        batch = summary['batch']
        if sample_name is None:
            sample_name = summary['sample_name'].strip()
        # ensure sample name is a valid identifier
        sample_name = re.sub(r' +', '_', sample_name)
        sample_name = sample_name.replace('/', '-')
        sample_name = sample_name.replace('\\', '-')

        if run_id is None:
            run_id = summary['run_id']
        created_at = summary['created_at']

        logger.info("Sample %s, run ID %d, batch %s", sample_name, run_id,
                    batch)

        # one of three situations:
        # 1) Classifier has not finished any modules. Probably needs restarting.
        # 2) Classification has completed but full report not available. Retrieve classification scores.
        # 3) Full report available. Download all data.

        t = soup.findAll(
            text=re.compile(r'.*Classifier script not finished.*'))
        if len(t) > 0:
            # situation (1)
            # get creation time
            # this is fragile, but easier than trawling through tables!
            # %s (not %d) so that the message formats whether the ID is an int or a string,
            # matching the other log calls in this method
            logger.info(
                "Sample ID %s (%s). Classification script is not finished. Nothing to do.",
                sample_id, sample_name)
            dt = (datetime.datetime.utcnow() - created_at).total_seconds()
            if dt > 18000:
                logger.warning(
                    "Submitted more than 5 hours ago. Consider restarting.")
            return

        # create the output subdir if necessary
        out_subdir = os.path.join(self.outdir, batch)
        if not os.path.isdir(out_subdir):
            try:
                os.makedirs(out_subdir)
            except OSError:
                # include the traceback so the reason for the failure is visible
                logger.exception("Failed to create output directory %s",
                                 out_subdir)

        # try getting the pdf report
        aa = soup.findAll('a')
        aa = [t for t in aa if re.search(r'Download *idat_', t.get_text())]
        if len(aa) == 0:
            logger.error("No download link found")
        else:
            the_url = '/'.join([self.ROOT_URL, aa[0]['href']])

            resp = self.session.get(the_url)
            if resp.status_code == 200:
                # if this works, we know we're in situation (3)
                outfile = os.path.join(self.outdir, batch,
                                       "%s.pdf" % sample_name)
                if os.path.isfile(outfile):
                    # NOTE: the existing file is reported and then overwritten
                    logger.error("File already exists: %s", outfile)
                logger.info("Saving PDF file to %s", outfile)
                with open(outfile, 'wb') as f:
                    f.write(resp.content)

            # download the full analysis results
            the_url = self.ANALYSIS_RESULTS_URL.format(sid=sample_id,
                                                       rid=run_id)
            logger.info("Downloading zipped results file for sample %s",
                        sample_id)
            resp = self.session.get(the_url)

            # only save the payload if the request succeeded, otherwise an error page
            # would be written to disk with a .zip extension
            if resp.status_code == 200:
                outfile = os.path.join(self.outdir, batch,
                                       "%s.zip" % sample_name)
                if os.path.isfile(outfile):
                    # NOTE: the existing file is reported and then overwritten
                    logger.error("File already exists: %s", outfile)
                logger.info("Saving zip file to %s", outfile)
                with open(outfile, 'wb') as f:
                    f.write(resp.content)
            else:
                logger.error(
                    "Failed to download zipped results file for sample %s (HTTP status %s)",
                    sample_id, resp.status_code)

        # situation (2) OR (3)
        # Either way, get the classifier results
        # FIXME: (April 2019) page layout change has broken this part
        try:
            raw_scores = read_table(soup.find(attrs={'id': 'rawScores'}))
            raw_scores = self.save_scores(raw_scores, sample_name, batch,
                                          'raw_scores')
        except Exception:
            logger.exception("Failed to retrieve raw scores.")

        try:
            cal_scores = read_table(
                soup.find(attrs={'id': 'calibratedScores'}))
            cal_scores = self.save_scores(cal_scores, sample_name, batch,
                                          'calibrated_scores')
        except Exception:
            logger.exception("Failed to retrieve calibrated scores.")
Example #25
0
if __name__ == "__main__":
    dmr_params = {
        'd_max': 400,
        'n_min': 6,
        'delta_m_min': 0.4,
        'alpha': 0.01,
        'dmr_test_method': 'mwu_permute',  # 'mwu', 'mwu_permute'
        'test_kwargs': {},
        'n_jobs': mp.cpu_count(),
    }

    me_data_indir = os.path.join(OUTPUT_DIR, 'mb_methylation_data')
    de_results_indir = os.path.join(GIT_LFS_DATA_DIR, 'mb_de_bmi1_chd7')

    outdir = output.unique_output_dir("mb_de_dmr")
    norm_method = 'swan'

    obj = loader.IlluminaHumanMethylationLoader(
        base_dir=me_data_indir,
        meta_fn=os.path.join(me_data_indir, 'sources.csv'),
        norm_method=norm_method,
    )

    # obj = loader.load_by_patient(['3021', 'ICb1299'], norm_method=norm_method, include_control=False)

    # add condition and cell line column to meta
    meta = obj.meta
    # condition = pd.Series({
    #     '3021_1_Scr': 'scramble',
    #     '3021_1_shB': 'shBMI1',
Example #26
0
import os

import pandas as pd

from load_data import rnaseq_data
from scripts.rnaseq import gtf_reader
from utils import reference_genomes
from utils.output import unique_output_dir

if __name__ == '__main__':
    gene_lengths = {
        'PDGFRA': 6576,
        'SLC1A3': 4170,
    }

    OUTDIR = unique_output_dir("jb.marker_levels", reuse_empty=True)

    # GSE73721 (reference astrocytes, oligos, ...)
    obj73721 = rnaseq_data.gse73721(source='star',
                                    annotate_by='Ensembl Gene ID')

    # remove unneeded samples
    to_keep73721 = (obj73721.data.columns.str.contains('yo ctx astro')
                    | obj73721.data.columns.str.contains('Hippocampus astro')
                    | obj73721.data.columns.str.contains('oligo'))

    # GSE61794 (H9-derived NSC x 2)
    obj61794 = rnaseq_data.gse61794(source='star',
                                    annotate_by='Ensembl Gene ID')
    # combining replicates
    rc = obj61794.meta.read_count.sum()
    # apply_qn = False
    dist_metric = 'pearson'
    # dist_metric = 'spearman'
    remove_mt = True
    min_tpm = 1.
    eps = .1  # offset for log transform

    rna_ff_samples = [
        'NH15_1661DEF2C', 'NH15_1877_SP1C', 'NH15_2101_DEF1A',
        'NH16_270_DEF1Ereplacement', 'NH16_616DEF1B', 'NH16_677_SP1A',
        'NH16_2063_DEF1Areplacement', 'NH16_2214DEF1A', 'NH16_2255DEF1B2',
        'NH16_2806DEF3A1'
    ]

    script_name = os.path.splitext(os.path.basename(sys.argv[0]))[0]
    outdir = output.unique_output_dir(script_name)

    if remove_mt:
        mt_ens = general.get_mitochondrial(9606)

    rna_cc_obj = rnaseq.loader.load_by_patient(pids,
                                               source=source,
                                               include_control=False)
    rna_ff_obj = rnaseq.loader.load_by_patient(pids,
                                               source=source,
                                               include_control=False,
                                               type='ffpe')

    # filter
    ix = rna_ff_obj.meta.index.isin(rna_ff_samples)
    rna_ff_obj.filter_samples(ix)
Example #28
0
        return data.subtract(data.mean(axis=0), axis=1).divide(data.std(axis=0), axis=1)
    elif axis == 1:
        return data.subtract(data.mean(axis=1), axis=0).divide(data.std(axis=1), axis=0)
    else:
        raise AttributeError("Axis must be 0 (norm by col) or 1 (norm by row)")


def impute_missing(data, strategy='median'):
    """
    Fill missing values in `data` using an Imputer fitted on a copy of the data.

    :param data: pandas DataFrame containing NaN entries to fill.
    :param strategy: Imputation strategy passed to the Imputer.
    :return: New DataFrame of imputed values carrying the index and columns of `data`.
    """
    imputer = Imputer(missing_values='NaN', strategy=strategy, axis=0)
    # work on a copy so the caller's data are never modified
    filled = imputer.fit_transform(data.copy())
    return pd.DataFrame(filled, index=data.index, columns=data.columns)


if __name__ == "__main__":
    outdir = unique_output_dir("hie_full_cohort_results", reuse_empty=True)

    dat = load_cleaned_data()
    dat.loc[:, 'batch'] = [t[:2] for t in dat.index]
    biomarkers = dat.loc[:, (
        BIOMARKER_PEAK_COLS
        + BIOMARKER_TROUGH_COLS
        + BIOMARKER_PEAK_AGE_COLS
        + BIOMARKER_TROUGH_AGE_COLS
    )]
    outcomes = dat.loc[:, OUTCOME_COL]
    peaks_dat = dat.loc[:, BIOMARKER_PEAK_COLS + BIOMARKER_TROUGH_COLS]
    nvar = peaks_dat.shape[1]
    X = impute_missing(peaks_dat, strategy='median')

    meconium_idx = dat.loc[:, 'Meconium Aspiration'] == 'Y'
                                               row_colours,
                                               fig_kws={'figsize': (5.5, 10)},
                                               vertical=False,
                                               metric=metric)
        fig_dict[ng] = d
    return fig_dict


if __name__ == "__main__":
    norm_method = 'bmiq'
    # norm_method = 'swan'
    n_hipsci = 12
    # qn_method = 'median'
    qn_method = None

    outdir = output.unique_output_dir()
    # load 12 patients iNSC, 4 iPSC
    pids = consts.PIDS

    # we'll list our samples explicitly to avoid results changing in future
    our_samples = [
        'DURA018_NSC_N4_P4',
        'DURA018_NSC_N2_P6',
        'DURA019_NSC_N8C_P2',
        'DURA019_NSC_N5C1_P2',
        'DURA019_FB_P7',
        'DURA019_IPSC_N8C_P13',
        'DURA030_NSC_N16B6_P1',
        'DURA030_NSC_N9_P2',
        'DURA030_FB_P8',
        'DURA030_IPSC_N16B6_P13',
import os
import pandas as pd
from matplotlib import pyplot as plt
from plotting import venn
from settings import GIT_LFS_DATA_DIR
from utils.output import unique_output_dir

if __name__ == '__main__':
    outdir = unique_output_dir("mg_bmdm_venn")
    indir = os.path.join(GIT_LFS_DATA_DIR, 'GSE86573_bowman_de')

    gl261_mg = pd.read_csv(os.path.join(indir, 'gl261_mg_vs_healthy_mg.csv'),
                           header=0,
                           index_col=0)
    gl261_bmdm = pd.read_csv(os.path.join(
        indir, 'gl261_bmdm_vs_healthy_monocyte.csv'),
                             header=0,
                             index_col=0)
    gemm_mg = pd.read_csv(os.path.join(indir, 'gemm_mg_vs_healthy_mg.csv'),
                          header=0,
                          index_col=0)
    gemm_bmdm = pd.read_csv(os.path.join(indir,
                                         'gemm_bmdm_vs_healthy_monocyte.csv'),
                            header=0,
                            index_col=0)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    v, sets, counts = venn.venn_diagram(gl261_mg.index,
                                        gemm_mg.index,
                                        set_labels=("GL261 MG", "GEMM MG"),