def prepare_gct_files_hgic(pids=consts.ALL_PIDS, outdir=None):
    """
    Prepare the GCT files required to perform classification of the hGIC samples.

    One GCT file is written for every combination of sample type
    ('cell_culture', 'ffpe') and quantification source ('star', 'salmon',
    'star/cufflinks'). Original note on units: FPKM (cufflinks), TPM (salmon)
    and CPM (STAR) - the values written are whatever `loader.load_by_patient`
    exposes as `.data` for that source.
    Gene symbols are used as these are contained in the signatures.

    NOTE: no combined (FFPE + cell culture) file is written by this function.

    :param pids: Patient IDs forwarded to `loader.load_by_patient`.
    :param outdir: Directory to write into. If None, a unique output directory is created.
    :return: List of paths to the GCT files written, in the order they were created.
    """
    if outdir is None:
        outdir = output.unique_output_dir()

    infiles = []

    for typ in ('cell_culture', 'ffpe'):
        for src in ('star', 'salmon', 'star/cufflinks'):
            this_obj = loader.load_by_patient(pids, type=typ, source=src, include_control=False)
            # only the tumour (GBM) samples are classified
            this_obj.filter_samples(this_obj.meta.type == 'GBM')
            if typ == 'ffpe':
                # restrict to the 'best' versions (there are some duplicates where we tried twice)
                this_obj.filter_by_sample_name(consts.FFPE_RNASEQ_SAMPLES_ALL)

            # re-index from Ensembl gene IDs to approved gene symbols
            this_dat = reference_genomes.translate_quantification_resolving_duplicates(
                this_obj.data,
                'Ensembl Gene ID',
                'Approved Symbol'
            )

            # SRC_MAP (module level) supplies the filename prefix for each source
            fn = os.path.join(outdir, "%s_%s.gct" % (SRC_MAP[src], typ))
            gsea.data_to_gct(this_dat, fn)
            infiles.append(fn)

    return infiles
def download_from_ftp(dl_paths, outdir=None, host='ftp-trace.ncbi.nlm.nih.gov', user='', passwd=''):
    """
    Download a list of remote files from an FTP server.

    Each file is saved under its own base name inside the output directory.
    Individual download failures are logged and do not stop the remaining
    downloads; any partially written file is removed.

    :param dl_paths: List of paths (on the server) to download
    :param outdir: Passed to `unique_output_dir` to obtain the output directory.
        If None, the module name is used.
    :param host: FTP host to connect to
    :param user: Username. If empty, an anonymous login is performed.
    :param passwd: Password for `user`
    :return: None
    """
    if outdir is None:
        outdir = __name__
    outdir = unique_output_dir(outdir)

    # Connect only. Passing `user` to the constructor would log in there, and a
    # second bare login() would then replace those credentials with anonymous.
    ftp = ftplib.FTP(host=host)
    try:
        # empty user/passwd are treated by ftplib as an anonymous login
        ftp.login(user=user, passwd=passwd)

        for ff in dl_paths:
            # get download filename: last non-empty path component
            parts = [t for t in ff.split('/') if t]
            if not parts:
                logger.error("Skipping download path with no filename component: %r", ff)
                continue
            outfile = os.path.join(outdir, parts[-1])

            logger.info("Attempting to download %s to %s", ff, outfile)
            failed = False
            with open(outfile, 'wb') as f:
                try:
                    ftp.retrbinary('RETR %s' % ff, f.write)
                except Exception:
                    # best effort: report and carry on with the next file
                    logger.exception("Download failed")
                    failed = True

            if failed and os.path.isfile(outfile):
                # don't leave an empty / truncated file that looks like a result
                try:
                    os.remove(outfile)
                except OSError:
                    logger.warning("Unable to remove partial download %s", outfile)
    finally:
        # always release the connection; fall back to a hard close if the
        # polite QUIT cannot be delivered
        try:
            ftp.quit()
        except ftplib.all_errors:
            ftp.close()
def download_from_manifest(path_to_manifest, outdir=None, legacy=False):
    """
    Download all files from the provided manifest.

    The manifest is read as a tab-separated table with a header row. The first
    column is used as the file ID and the `filename` column names the output file.

    :param path_to_manifest: Path to the manifest file
    :param outdir: If None, create a unique output folder
    :param legacy: Forwarded unchanged to `download_data`
    :return: None
    """
    target_dir = unique_output_dir("nih_gdc_legacy") if outdir is None else outdir

    manifest = pd.read_csv(path_to_manifest, sep='\t', header=0, index_col=0)

    # one download per manifest row, keyed by the row's index (file ID)
    for file_id, record in manifest.iterrows():
        destination = os.path.join(target_dir, record.filename)
        download_data(file_id, destination, legacy=legacy)
def prepare_gct_files(outdir=None):
    """
    Prepare the GCT files required to perform classification:
    - Our GBM FFPE and cell culture samples
    - TCGA RNA-Seq cohort
    - Both combined
    In all cases, use FPKM units and gene symbols, as these are used by Wang

    :param outdir: Directory to write into. If None, create a unique output folder.
    :return: List of paths to the GCT files written, in the order listed above
        (our data, TCGA, combined).
    """
    if outdir is None:
        outdir = unique_output_dir("gct_files_for_wang")

    infiles = []

    # 1) Our data
    obj_ffpe = rnaseq_data.load_by_patient('all', type='ffpe')
    dat_ffpe = obj_ffpe.get_fpkm()
    # label FFPE columns by reference ID so they can be told apart from cell culture
    dat_ffpe.columns = ['%s_FFPE' % t for t in obj_ffpe.meta.reference_id]

    obj_cc = rnaseq_data.load_by_patient(patient_ids='all')
    dat_cc = obj_cc.get_fpkm()
    dat_cc = dat_cc.loc[:, obj_cc.meta.type == 'GBM']

    dat_all = pd.concat((dat_cc, dat_ffpe), axis=1)
    # translate Ensembl IDs to gene symbols, dropping genes with no symbol
    idx = reference_genomes.ensembl_to_gene_symbol(dat_all.index).dropna()
    dat_all = dat_all.loc[idx.index]
    dat_all.index = idx

    fn = os.path.join(outdir, "gbm_ffpe_cc_fpkm.gct")
    gsea.data_to_gct(dat_all, fn)
    infiles.append(fn)

    # 2) TCGA (IDH1 WT only)
    tcga_dat, tcga_meta = rnaseq_data.tcga_primary_gbm(units='fpkm')
    tcga_dat = tcga_dat.loc[:, tcga_meta.idh1_status == 'WT']
    idx = reference_genomes.ensembl_to_gene_symbol(tcga_dat.index).dropna()
    # keep only the first occurrence of any repeated Ensembl ID
    idx = idx.loc[~idx.index.duplicated()]
    tcga_dat = tcga_dat.loc[idx.index]
    tcga_dat.index = idx

    fn = os.path.join(outdir, "tcga_idh1_wt_fpkm.gct")
    gsea.data_to_gct(tcga_dat, fn)
    infiles.append(fn)

    # 3) Combined (built from the two files written above)
    dat = gsea.combine_gct_files(*infiles)
    fn = os.path.join(outdir, "tcga_idh1_wt_and_gbm_ffpe_cc_fpkm.gct")
    gsea.data_to_gct(dat, fn)

    # report everything that was written so that callers can pass it on
    outfiles = infiles + [fn]
    return outfiles
import os import numpy as np import pandas as pd import seaborn as sns from matplotlib import pyplot as plt from load_data import rnaseq_data from rnaseq import differential_expression, general from settings import LOCAL_DATA_DIR from utils import output, setops, excel, ipa, reference_genomes if __name__ == "__main__": outdir = output.unique_output_dir("cross_validate_de_multiple_refs", reuse_empty=True) # all n=2 samples and RTK II samples pids = ['017', '019', '030', '031', '050', '054'] cmap = 'RdYlGn_r' de_params = { 'lfc': 1, 'fdr': 0.01, 'method': 'GLM' } subgroups = { 'RTK I': ['019', '030', '031'], 'RTK II': ['017', '050', '054'], } intersecter = lambda x, y: set(x).intersection(y)
if not isinstance(the_names, str): the_names = ';'.join(the_names) c.writerow([ row[0], row[1], row[2], the_names, '.', # empty score field row[3] ]) if __name__ == "__main__": distance = 5000 # distance from TSS to include fn = os.path.join(LOCAL_DATA_DIR, 'reference_genomes', 'human', 'ensembl', 'GRCh38.release87', 'gtf', 'Homo_sapiens.GRCh38.87.gtf.gz') outdir = output.unique_output_dir("chipseq_analysis") reg, names = get_gene_tss_from_gtf(fn, distance=distance, sources=SOURCES) fn_out = os.path.join(outdir, 'gene_tss_pad_%d.bed' % distance) write_bed_file(reg, names, fn_out) reg, names = get_transcript_tss_from_gtf(fn, distance=distance, sources=SOURCES) fn_out = os.path.join(outdir, 'transcript_tss_pad_%d.bed' % distance) write_bed_file(reg, names, fn_out)
# } subgroup_set_colours = { 'RTK I full': '#0d680f', 'RTK II full': '#820505', 'MES full': '#7900ad', 'RTK I partial': '#6ecc70', 'RTK II partial': '#d67373', 'MES partial': '#cc88ea', 'mixed': '#4C72B0', 'specific': '#f4e842', } min_cpm = 1 outdir = output.unique_output_dir("compare_de_gene_counts_s1", reuse_empty=True) obj = loader.load_by_patient(pids, include_control=False) # remove IPSC and rejected 061 samples for good idx = ((~obj.meta.index.str.contains('IPSC')) & (~obj.meta.index.isin(['DURA061_NSC_N1_P5', 'DURA061_NSC_N6_P4']))) obj.meta = obj.meta.loc[idx] obj.data = obj.data.loc[:, idx] obj.batch_id = obj.batch_id.loc[idx] # we'll run everything with two different edgeR tests methods = ('GLM', 'QLGLM') res_1 = {}
cg.ax_heatmap.yaxis.label.set_visible(False) cg.ax_heatmap.xaxis.label.set_visible(False) if show_gene_labels: plt.setp(cg.ax_heatmap.yaxis.get_ticklabels(), rotation=0, fontsize=14) else: cg.ax_heatmap.yaxis.set_ticklabels([]) return cg if __name__ == "__main__": N_PC = 3 geneset = consts.NORTHCOTT_GENES outdir = unique_output_dir("pca_atcc_lines") # it's useful to maintain a list of known upregulated genes nano_genes = [] for grp, arr in consts.NANOSTRING_GENES: if grp != 'WNT': nano_genes.extend(arr) nano_genes.remove('EGFL11') nano_genes.append('EYS') all_nstring = [] [all_nstring.extend(t) for _, t in consts.NANOSTRING_GENES] all_ncott = [] [all_ncott.extend(t) for _, t in consts.NORTHCOTT_GENES] # load Ncott data (285 non-WNT MB samples)
import pandas as pd from utils import output from settings import OUTPUT_DIR, DATA_DIR import os from plotting import venn, clustering from matplotlib import pyplot as plt if __name__ == "__main__": outdir = output.unique_output_dir("tcga_gbm_analysis", reuse_empty=True) # load meta files meta_fn = { 'rnaseq': os.path.join(DATA_DIR, 'rnaseq', 'tcga_gbm', 'primary_tumour', 'rnaseq.meta.csv'), 'marr_u133': os.path.join(DATA_DIR, 'microarray', 'tcga_gbm', 'primary_tumour', 'microarray.meta.ht_hg_u133a.csv'), 'marr_agilent1': os.path.join(DATA_DIR, 'microarray', 'tcga_gbm', 'primary_tumour', 'microarray.meta.agilentg4502a_07_1.csv'), 'marr_agilent2': os.path.join(DATA_DIR, 'microarray', 'tcga_gbm', 'primary_tumour', 'microarray.meta.agilentg4502a_07_2.csv'), 'meth_450k': os.path.join(DATA_DIR, 'methylation', 'tcga_gbm', 'primary_tumour', 'methylation.450k.meta.csv'), 'meth_27k': os.path.join(DATA_DIR, 'methylation', 'tcga_gbm', 'primary_tumour', 'methylation.27k.meta.csv'), }
from utils import genomics, output, log
from matplotlib import pyplot as plt
import seaborn as sns

logger = log.get_console_logger(__name__)


def plot_one_hist(dat, ax, *args, **kwargs):
    """
    Draw a histogram of the first column of `dat` as a percentage of each row's total.

    :param dat: 2D array, indexed as dat[:, 0] and summed along axis 1
    :param ax: Axes-like object providing `hist`
    :param args: Extra positional arguments forwarded to `ax.hist`
    :param kwargs: Extra keyword arguments forwarded to `ax.hist`
    """
    row_totals = dat.sum(axis=1).astype(float)
    mval = dat[:, 0] / row_totals * 100.
    ax.hist(mval, *args, **kwargs)


if __name__ == "__main__":
    min_coverage = 10

    outdir = output.unique_output_dir("rrbs_methylation_cpg_islands", reuse_empty=True)

    # CpG island regions
    cpg_island_tsv = os.path.join(GIT_LFS_DATA_DIR, 'mouse_cpg_island', 'grcm38_cpgisland.tsv')
    cpg_regions = pd.read_csv(cpg_island_tsv, sep='\t', header=0)

    # bismark coverage files
    indir = os.path.join(DATA_DIR, 'rrbseq', 'GC-CV-7163', 'trim_galore', 'mouse', 'bismark')
    subdir = "GC-CV-7163-{i}_S{i}"
    flist = glob(os.path.join(indir, "*.bismark.cov.gz"))

    chroms = [str(t) for t in range(1, 20)]
    chrom_lengths = genomics.reference_genome_chrom_lengths(tax_id=10090)
    # discard unplaced scaffolds, MT, X, Y
    chrom_lengths = chrom_lengths.loc[chroms]
from utils import output def log_cpm(dat, base=2, offset=1.): dat = dat + offset if len(dat.shape) == 2: cpm = dat.divide(dat.sum(), axis=1) * 1e6 else: cpm = dat.divide(dat.sum()) * 1e6 return np.log(cpm) / np.log(base) if __name__ == '__main__': min_cpm = 0.01 outdir = output.unique_output_dir("biological_technical_ecdf") # all our patient data (cell culture) our_patient_obj = loader.load_by_patient('all', source='star') # all our patient data (FFPE culture) ffpe_samples = [ 'NH15_1661DEF2C', 'NH15_1877_SP1C', 'NH15_2101_DEF1A', 'NH16_270_DEF1Ereplacement', 'NH16_616DEF1B', 'NH16_677_SP1A', 'NH16_1574DEF1A', 'NH16_1976_DEF1Areplacement',
import os import pandas as pd from utils import output, setops, log, genomics from settings import INTERMEDIATE_DIR from scripts.hgic_final import two_strategies_combine_de_dmr as tscdd from scripts.hgic_final import consts from methylation import dmr logger = log.get_console_logger() if __name__ == "__main__": outdir = output.unique_output_dir(reuse_empty=True) # Set this to True to include reference methylation data # This will limit the number of available probes (to 450K) include_external_dm_refs = False de_params = consts.DE_PARAMS dmr_params = consts.DMR_PARAMS norm_method_s1 = 'swan' pids = consts.PIDS if include_external_dm_refs: external_ref_names_dm = ['GSE38216'] external_ref_samples_dm = ['H9 NPC 1', 'H9 NPC 2'] else: external_ref_names_dm = None external_ref_samples_dm = None
[row.CHR, row.Strand, row.MAPINFO, t[0], t[1]]) return pd.DataFrame(this_res, columns=['probe_id'] + df.columns.tolist() + anno_cols) if __name__ == '__main__': anno = loader.load_illumina_methylationepic_annotation(split_genes=False) # 1. Annotate DMPs and re-export to Excel # dmp_fns = glob(os.path.join(GIT_LFS_DATA_DIR, 'mb_dmp', '*.xlsx')) dmp_fns = glob(os.path.join(os.path.expanduser('~/temp'), '*.xlsx')) print "Found %d relevant input (DMP) files: %s" % (len(dmp_fns), ', '.join(dmp_fns)) outdir = output.unique_output_dir("mb_dmps") res = {} for fn in dmp_fns: base = os.path.splitext(os.path.basename(fn))[0] res[base] = {} dat = pd.read_excel(fn, sheet_name=None) for cmp, df in dat.items(): res[base][cmp] = annot_one(df, anno) # save to Excel out_fn = os.path.join(outdir, os.path.basename(fn)) excel.pandas_to_excel(res[base], out_fn, write_index=False) # 2.1 Look for common DMPs
from rnaseq import loader, differential_expression, general, filter import os import pandas as pd from utils import output from settings import RNASEQ_DIR import numpy as np """ Aim: Carry out DE analysis on the TCGA RNA-Seq data, all primary tumour samples vs all solid healthy tissue """ if __name__ == "__main__": de_params = {'lfc': 1, 'fdr': 0.01, 'method': 'QLGLM'} outdir = output.unique_output_dir("tcga_de") indir_pt = os.path.join(RNASEQ_DIR, 'tcga_gbm', 'primary_tumour') indir_hn = os.path.join(RNASEQ_DIR, 'tcga_gbm', 'solid_tissue_normal') dat_fn = os.path.join(indir_pt, 'rnaseq.htseq.csv.gz') dat_hn_fn = os.path.join(indir_hn, 'rnaseq_normal.htseq.csv.gz') meta_fn = os.path.join(indir_pt, 'brennan_s7.csv') meta = pd.read_csv(meta_fn, header=0, index_col=0) dat_pt = pd.read_csv(dat_fn, header=0, index_col=0) dat_hn = pd.read_csv(dat_hn_fn, header=0, index_col=0) dat_pt.columns = [t[:12] for t in dat_pt.columns] dat_pt = dat_pt.loc[:, ~dat_pt.columns.duplicated()] meta = meta.loc[dat_pt.columns] meta = meta.loc[~meta.index.duplicated()]
def compute_median_betas_one_sample(the_dat, probes_by_gene):
    """
    Compute the per-gene median over the probes associated with each gene.

    :param the_dat: pandas object indexed by probe ID
    :param probes_by_gene: Dictionary mapping gene -> list of probe IDs
    :return: Tuple (res, missing_genes, missing_probes).
        res: dictionary gene -> median (taken along axis 0) of that gene's probe rows.
        missing_genes: list of genes whose probe lookup raised KeyError.
        missing_probes: set containing every probe listed for those genes.
    """
    missing_probes = set()
    missing_genes = []
    res = {}
    for g, probes in probes_by_gene.items():
        try:
            res[g] = the_dat.loc[probes].median(axis=0)
        except KeyError:
            # the lookup failed: record the gene and all of the probes requested for it
            missing_probes.update(probes)
            missing_genes.append(g)
    return res, missing_genes, missing_probes


if __name__ == "__main__":
    outdir = output.unique_output_dir("report_beta_values")

    ######################
    # 1: PRIMARY TUMOUR  #
    ######################

    # in this case, we want the median beta value over all probes that are associated with a given gene
    # we'll exclude those associated with gene body only
    indir = os.path.join(DATA_DIR, 'methylation', 'tcga_gbm', 'primary_tumour')
    meta_fn = os.path.join(indir, 'methylation.450k.meta.csv')
    dat_fn = os.path.join(indir, 'methylation.450k.csv.gz')
    meta = pd.read_csv(meta_fn, header=0, index_col=0)
    dat = pd.read_csv(dat_fn, header=0, index_col=0, skiprows=[1])

    # parenthesised single-argument form prints identically on Python 2 and 3
    print("Primary tumour (%d samples)" % meta.shape[0])
import os from utils.output import unique_output_dir from load_data import methylation_array from plotting import clustering, pca from classification import lda from sklearn.decomposition import PCA import pandas as pd from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis from sklearn.ensemble import RandomForestClassifier from matplotlib import pyplot as plt import seaborn as sns import numpy as np if __name__ == "__main__": outdir = unique_output_dir('meth_classification_lda') REF_META_SUBGRP_LABEL = 'dna methylation subgroup' data, meta = methylation_array.hgic_methylationepic(norm_method='swan') # Don't know why, but some probes (~2000) are only present in one OR the other sample # Therefore, remove those data = data.dropna() # add some extra meta information meta.loc[:, 'cell_type'] = 'NSC' meta.loc[meta.index.str.contains('GBM'), 'cell_type'] = 'GBM' meta.loc[:, 'subgroup'] = 'RTK I' meta.loc[meta.index.str.contains('024'), 'subgroup'] = 'Unknown' meta.loc[meta.index.str.contains('026'), 'subgroup'] = 'Unknown' meta.loc[meta.index.str.contains('044'), 'subgroup'] = 'Mesenchymal' meta.loc[meta.index.str.contains('GIBCO'), 'subgroup'] = 'NSC'
# resolve any duplicates arbitrarily (these should be rare) gs = gs.loc[~gs.index.duplicated()] df.insert(0, 'Gene Symbol', gs) def add_fc_direction(df): direction = pd.Series(index=df.index, name='Direction') direction.loc[df.logFC < 0] = 'down' direction.loc[df.logFC > 0] = 'up' df.insert(df.shape[1], 'Direction', direction) if __name__ == '__main__': lfc = 1 fdr = 0.01 outdir = output.unique_output_dir("paired_rnaseq") # RTK II samples # pids = ['017', '050', '054', '061'] # all n=2 samples pids = ['018', '044', '049', '050', '052', '054', '061'] # all samples # pids = [t for t in rnaseq_data.PATIENT_LOOKUP_STAR if t != 'GIBCO'] obj = rnaseq_data.load_by_patient(pids, annotate_by='Ensembl Gene ID') # discard unmapped, etc obj.data = obj.data.loc[obj.data.index.str.contains('ENSG')] dat_filt = filter_by_cpm(obj.data, min_n_samples=2) de = {} de_up = {} de_down = {}
import os import collections import gzip import numpy as np from scipy import stats import re from settings import DATA_DIR, LOCAL_DATA_DIR, GIT_LFS_DATA_DIR import pysam from matplotlib import pyplot as plt import pandas as pd import multiprocessing as mp from utils import log, genomics, output logger = log.get_console_logger(__name__) if __name__ == "__main__": outdir = output.unique_output_dir("rrbs_enzyme_specificity", reuse_empty=True) basedir = os.path.join(DATA_DIR, 'rrbseq', 'GC-CV-7163') indir = os.path.join(basedir, 'trim_galore_mouse/bismark') bam_fn = os.path.join(indir, 'GC-CV-7163-6_S6_pe.sorted.bam') cov_fn = os.path.join(indir, 'GC-CV-7163-6_S6_bismark.cov.gz') s = pysam.AlignmentFile(bam_fn, 'rb') chroms = [str(t) for t in range(1, 20)] # theoretical (binomial) distribution of inferred methylation by coverage Ns = [10, 20, 50, 100] cs = ['k', 'r', 'b', 'g'] ps = [0.1, 0.25, 0.5] for p in ps: fig = plt.figure()
def log_cpm(dat, base=2, offset=1.):
    """
    Convert counts to log-transformed counts per million (CPM).

    The offset is added before normalising, so the totals used for
    normalisation include it.

    :param dat: pandas DataFrame or Series of counts. A DataFrame is
        normalised by the result of its `sum()` (one total per column).
    :param base: Base of the logarithm
    :param offset: Value added to every entry before computing CPM
    :return: Object of the same shape as `dat` holding log_base(CPM)
    """
    dat = dat + offset
    if len(dat.shape) == 2:
        # divide each column by its own total
        cpm = dat.divide(dat.sum(), axis=1) * 1e6
    else:
        cpm = dat.divide(dat.sum()) * 1e6
    # change of base: log_b(x) = ln(x) / ln(b)
    return np.log(cpm) / np.log(base)


if __name__ == '__main__':
    pids = ['019', '031', '049', '052']
    min_cpm = 1
    min_cpm_individual = 0.1

    outdir = output.unique_output_dir("james_opc_smartseq2_vs_polya")

    ## 1) STAR CPM estimates

    ss2_obj = loader.load_references('wtchg_p180059', strandedness='u')
    assigned_sum = ss2_obj.data.sum()
    unassigned_sum = ss2_obj.data_unassigned.drop('N_unmapped').sum()

    ss2_pct_assigned = assigned_sum / (assigned_sum + unassigned_sum) * 100.

    # parenthesised single-argument form prints identically on Python 2 and 3
    print("SmartSeq2 samples % assigned")
    print(ss2_pct_assigned)

    polya_obj = loader.load_by_patient(pids)

    # restrict to relevant samples for first part of the analysis
eps = .1 # offset for log transform rna_ff_samples = [ 'NH15_1661DEF2C', 'NH15_1877_SP1C', 'NH15_2101_DEF1A', 'NH16_270_DEF1Ereplacement', 'NH16_616DEF1B', 'NH16_677_SP1A', 'NH16_2063_DEF1Areplacement', 'NH16_2214DEF1A', 'NH16_2255DEF1B2', 'NH16_2806DEF3A1' ] outdir = output.unique_output_dir("cruk_ffpe_cc_correlation") if remove_mt: mt_ens = general.get_mitochondrial(9606) rna_cc_obj = rnaseq.loader.load_by_patient(pids, source=source, include_control=False) rna_ff_obj = rnaseq.loader.load_by_patient(pids, source=source, include_control=False, type='ffpe') # filter ix = rna_ff_obj.meta.index.isin(rna_ff_samples) rna_ff_obj.filter_samples(ix) ix = rna_cc_obj.meta.type == 'GBM' rna_cc_obj.filter_samples(ix) # add NH ID and patient ID to FFPE
import os import numpy as np import pandas as pd from matplotlib import pyplot as plt from scipy.stats import rankdata from load_data import rnaseq_data from stats import transformations from utils.output import unique_output_dir from utils.reference_genomes import ensembl_to_gene_symbol, gene_symbol_to_ensembl if __name__ == "__main__": outdir = unique_output_dir("tom_qpcr", reuse_empty=True) ref = 'GIBCO_NSC_P4' obj = rnaseq_data.all_hgic_loader(annotate_by="Ensembl Gene ID") dat = obj.data.loc[obj.data.index.str.contains('ENSG')] dat = dat.loc[:, ~obj.meta.index.str.contains('DURA')] # normalised version (by number of aligned reads) dat_n = dat.divide(dat.sum(axis=0), axis=1) * 1e6 # remove any absent / mostly absent genes median_count = dat_n.median(axis=1).sort_values() keep_idx = median_count.loc[median_count != 0].index dat = dat.loc[keep_idx] dat_n = dat_n.loc[keep_idx] median_count = median_count.loc[keep_idx] # remove any genes that are (mostly) absent in NSC
# source = 'star' # units = 'estimated_counts' units = 'tpm' # units = 'cpm' # units = 'counts' # transform = 'vst' transform = 'log' # remove_mt = True remove_mt = False pca_add_sample_names = False outdir = unique_output_dir("mouse_nsc_pca_cluster", reuse_empty=True) n_gene_try = [1000, 2000, 3000, 5000][::-1] # largest first, so we can reuse the MAD array if source == 'star': load_cls = loader.StarCountLoader load_kwargs = {} elif source == 'salmon': load_cls = loader.SalmonQuantLoader load_kwargs = {'units': units} else: raise ValueError("Unrecognised source %s" % source) if units == 'tpm': eps = .01 elif units == 'estimated_counts':
if s.mate(rd) not in reads_seen: reads_seen.add(rd) return len(reads_seen) if __name__ == "__main__": """ Usage: rrbs_theor_fragment_analysis.py <BAM_FN> BAM_FN must be a sorted bam file """ bam_fn = sys.argv[1] if not os.path.isfile(bam_fn): raise ValueError("Unable to find BAM file %s" % bam_fn) bam_dir = os.path.split(os.path.abspath(bam_fn))[0] outdir = output.unique_output_dir("rrbs_fragment_analysis", root_output_dir=bam_dir) # fixed output directory for BED regions bed_outdir = os.path.join(output.OUTPUT_DIR, "rrbs_theor_fragments") if not os.path.exists(bed_outdir): logger.info("Created output dir %s", bed_outdir) os.makedirs(bed_outdir) # same output directory for remaining results outfile = re.sub(r'(\.sorted)?\.bam', ".coverage.pkl", os.path.split(bam_fn)[-1]) outfn = os.path.join(outdir, outfile) fcounts_outfn = os.path.join( outdir, re.sub(r'(\.sorted)?\.bam', '.mspi_fragments.counts', os.path.split(bam_fn)[-1]))
def get_result(self, sample_id, outdir=None, sample_name=None, run_id=None):
    """
    Retrieve results relating to a sample and save to disk.

    Files are written to a subdirectory of the output directory named after the
    sample's batch. Depending on how far the classifier has progressed, this
    saves the PDF report and zipped results (if available) and the raw and
    calibrated score tables.

    :param sample_id: ID of the sample, used to build the request URLs
    :param outdir: Output directory. Only used if self.outdir has not already
        been set; if both are None a unique output directory is created.
    :param sample_name: If supplied, this overrides the submitted sample name
    :param run_id: If supplied, this is used, otherwise the latest run is automatically determined
    :return: None
    """
    if self.outdir is None:
        if outdir is None:
            self.outdir = unique_output_dir('heidelberg_classifier', reuse_empty=True)
        else:
            self.outdir = outdir
        # parenthesised single-argument form prints identically on Python 2 and 3
        print("Data will be downloaded to %s" % self.outdir)

    the_url = self.SAMPLE_URL.format(sid=sample_id)
    resp = self.session.get(the_url)
    soup = BeautifulSoup(resp.content, "html.parser")

    summary = self.get_summary_data(soup=soup)
    batch = summary['batch']

    if sample_name is None:
        sample_name = summary['sample_name'].strip()
    # ensure sample name is a valid identifier (it is used to build filenames)
    sample_name = re.sub(r' +', '_', sample_name)
    sample_name = sample_name.replace('/', '-')
    sample_name = sample_name.replace('\\', '-')

    if run_id is None:
        run_id = summary['run_id']
    created_at = summary['created_at']

    # %s rather than %d: the IDs may arrive as strings
    logger.info("Sample %s, run ID %s, batch %s", sample_name, run_id, batch)

    # one of three situations:
    # 1) Classifier has not finished any modules. Probably needs restarting.
    # 2) Classification has completed but full report not available. Retrieve classification scores.
    # 3) Full report available. Download all data.

    t = soup.findAll(text=re.compile(r'.*Classifier script not finished.*'))
    if len(t) > 0:
        # situation (1)
        # get creation time
        # this is fragile, but easier than trawling through tables!
        logger.info(
            "Sample ID %s (%s). Classification script is not finished. Nothing to do.",
            sample_id,
            sample_name
        )
        dt = (datetime.datetime.utcnow() - created_at).total_seconds()
        if dt > 18000:
            logger.warning("Submitted more than 5 hours ago. Consider restarting.")
        return

    # create the output subdir if necessary
    out_subdir = os.path.join(self.outdir, batch)
    if not os.path.isdir(out_subdir):
        try:
            os.makedirs(out_subdir)
        except OSError:
            logger.error("Failed to create output directory %s", out_subdir)

    # try getting the pdf report
    aa = soup.findAll('a')
    aa = [t for t in aa if re.search(r'Download *idat_', t.get_text())]
    if len(aa) == 0:
        logger.error("No download link found")
    else:
        the_url = '/'.join([self.ROOT_URL, aa[0]['href']])
        resp = self.session.get(the_url)
        if resp.status_code == 200:
            # if this works, we know we're in situation (3)
            outfile = os.path.join(self.outdir, batch, "%s.pdf" % sample_name)
            if os.path.isfile(outfile):
                # reported, but the existing file is still overwritten below
                logger.error("File already exists: %s", outfile)
            logger.info("Saving PDF file to %s", outfile)
            with open(outfile, 'wb') as f:
                f.write(resp.content)

            # download the full analysis results
            the_url = self.ANALYSIS_RESULTS_URL.format(sid=sample_id, rid=run_id)
            logger.info("Downloading zipped results file for sample %s", sample_id)
            resp = self.session.get(the_url)
            if resp.status_code == 200:
                outfile = os.path.join(self.outdir, batch, "%s.zip" % sample_name)
                if os.path.isfile(outfile):
                    # reported, but the existing file is still overwritten below
                    logger.error("File already exists: %s", outfile)
                logger.info("Saving zip file to %s", outfile)
                with open(outfile, 'wb') as f:
                    f.write(resp.content)
            else:
                # don't save an error page under a .zip filename
                logger.error(
                    "Failed to download zipped results for sample %s (HTTP status %s)",
                    sample_id,
                    resp.status_code
                )

    # situation (2) OR (3)
    # Either way, get the classifier results
    # FIXME: (April 2019) page layout change has broken this part
    try:
        raw_scores = read_table(soup.find(attrs={'id': 'rawScores'}))
        raw_scores = self.save_scores(raw_scores, sample_name, batch, 'raw_scores')
    except Exception:
        logger.exception("Failed to retrieve raw scores.")

    try:
        cal_scores = read_table(soup.find(attrs={'id': 'calibratedScores'}))
        cal_scores = self.save_scores(cal_scores, sample_name, batch, 'calibrated_scores')
    except Exception:
        logger.exception("Failed to retrieve calibrated scores.")
if __name__ == "__main__": dmr_params = { 'd_max': 400, 'n_min': 6, 'delta_m_min': 0.4, 'alpha': 0.01, 'dmr_test_method': 'mwu_permute', # 'mwu', 'mwu_permute' 'test_kwargs': {}, 'n_jobs': mp.cpu_count(), } me_data_indir = os.path.join(OUTPUT_DIR, 'mb_methylation_data') de_results_indir = os.path.join(GIT_LFS_DATA_DIR, 'mb_de_bmi1_chd7') outdir = output.unique_output_dir("mb_de_dmr") norm_method = 'swan' obj = loader.IlluminaHumanMethylationLoader( base_dir=me_data_indir, meta_fn=os.path.join(me_data_indir, 'sources.csv'), norm_method=norm_method, ) # obj = loader.load_by_patient(['3021', 'ICb1299'], norm_method=norm_method, include_control=False) # add condition and cell line column to meta meta = obj.meta # condition = pd.Series({ # '3021_1_Scr': 'scramble', # '3021_1_shB': 'shBMI1',
import os import pandas as pd from load_data import rnaseq_data from scripts.rnaseq import gtf_reader from utils import reference_genomes from utils.output import unique_output_dir if __name__ == '__main__': gene_lengths = { 'PDGFRA': 6576, 'SLC1A3': 4170, } OUTDIR = unique_output_dir("jb.marker_levels", reuse_empty=True) # GSE73721 (reference astrocytes, oligos, ...) obj73721 = rnaseq_data.gse73721(source='star', annotate_by='Ensembl Gene ID') # remove unneeded samples to_keep73721 = (obj73721.data.columns.str.contains('yo ctx astro') | obj73721.data.columns.str.contains('Hippocampus astro') | obj73721.data.columns.str.contains('oligo')) # GSE61794 (H9-derived NSC x 2) obj61794 = rnaseq_data.gse61794(source='star', annotate_by='Ensembl Gene ID') # combining replicates rc = obj61794.meta.read_count.sum()
# apply_qn = False dist_metric = 'pearson' # dist_metric = 'spearman' remove_mt = True min_tpm = 1. eps = .1 # offset for log transform rna_ff_samples = [ 'NH15_1661DEF2C', 'NH15_1877_SP1C', 'NH15_2101_DEF1A', 'NH16_270_DEF1Ereplacement', 'NH16_616DEF1B', 'NH16_677_SP1A', 'NH16_2063_DEF1Areplacement', 'NH16_2214DEF1A', 'NH16_2255DEF1B2', 'NH16_2806DEF3A1' ] script_name = os.path.splitext(os.path.basename(sys.argv[0]))[0] outdir = output.unique_output_dir(script_name) if remove_mt: mt_ens = general.get_mitochondrial(9606) rna_cc_obj = rnaseq.loader.load_by_patient(pids, source=source, include_control=False) rna_ff_obj = rnaseq.loader.load_by_patient(pids, source=source, include_control=False, type='ffpe') # filter ix = rna_ff_obj.meta.index.isin(rna_ff_samples) rna_ff_obj.filter_samples(ix)
return data.subtract(data.mean(axis=0), axis=1).divide(data.std(axis=0), axis=1) elif axis == 1: return data.subtract(data.mean(axis=1), axis=0).divide(data.std(axis=1), axis=0) else: raise AttributeError("Axis must be 0 (norm by col) or 1 (norm by row)") def impute_missing(data, strategy='median'): X = data.copy() imp = Imputer(missing_values='NaN', strategy=strategy, axis=0) X = imp.fit_transform(X) return pd.DataFrame(X, index=data.index, columns=data.columns) if __name__ == "__main__": outdir = unique_output_dir("hie_full_cohort_results", reuse_empty=True) dat = load_cleaned_data() dat.loc[:, 'batch'] = [t[:2] for t in dat.index] biomarkers = dat.loc[:, ( BIOMARKER_PEAK_COLS + BIOMARKER_TROUGH_COLS + BIOMARKER_PEAK_AGE_COLS + BIOMARKER_TROUGH_AGE_COLS )] outcomes = dat.loc[:, OUTCOME_COL] peaks_dat = dat.loc[:, BIOMARKER_PEAK_COLS + BIOMARKER_TROUGH_COLS] nvar = peaks_dat.shape[1] X = impute_missing(peaks_dat, strategy='median') meconium_idx = dat.loc[:, 'Meconium Aspiration'] == 'Y'
row_colours, fig_kws={'figsize': (5.5, 10)}, vertical=False, metric=metric) fig_dict[ng] = d return fig_dict if __name__ == "__main__": norm_method = 'bmiq' # norm_method = 'swan' n_hipsci = 12 # qn_method = 'median' qn_method = None outdir = output.unique_output_dir() # load 12 patients iNSC, 4 iPSC pids = consts.PIDS # we'll list our samples explicitly to avoid results changing in future our_samples = [ 'DURA018_NSC_N4_P4', 'DURA018_NSC_N2_P6', 'DURA019_NSC_N8C_P2', 'DURA019_NSC_N5C1_P2', 'DURA019_FB_P7', 'DURA019_IPSC_N8C_P13', 'DURA030_NSC_N16B6_P1', 'DURA030_NSC_N9_P2', 'DURA030_FB_P8', 'DURA030_IPSC_N16B6_P13',
import os import pandas as pd from matplotlib import pyplot as plt from plotting import venn from settings import GIT_LFS_DATA_DIR from utils.output import unique_output_dir if __name__ == '__main__': outdir = unique_output_dir("mg_bmdm_venn") indir = os.path.join(GIT_LFS_DATA_DIR, 'GSE86573_bowman_de') gl261_mg = pd.read_csv(os.path.join(indir, 'gl261_mg_vs_healthy_mg.csv'), header=0, index_col=0) gl261_bmdm = pd.read_csv(os.path.join( indir, 'gl261_bmdm_vs_healthy_monocyte.csv'), header=0, index_col=0) gemm_mg = pd.read_csv(os.path.join(indir, 'gemm_mg_vs_healthy_mg.csv'), header=0, index_col=0) gemm_bmdm = pd.read_csv(os.path.join(indir, 'gemm_bmdm_vs_healthy_monocyte.csv'), header=0, index_col=0) fig = plt.figure() ax = fig.add_subplot(111) v, sets, counts = venn.venn_diagram(gl261_mg.index, gemm_mg.index, set_labels=("GL261 MG", "GEMM MG"),