def __init__(self, tax_id=9606, genome_version='GRCh37', gtf_fn=None):
    self.tax_id = tax_id
    self.genome_version = genome_version
    if gtf_fn is None:
        gtf_fn = genomics.get_reference_genome_gtf(tax_id, version=genome_version)
    self.gtf_fn = gtf_fn
    self.db = genomics.GtfAnnotation(gtf_fn)
    self.mdat = None
    self.dmr_res = None
    self.anno = None
    self.de_res = None
    self.dmr_comparison_groups = None
    self.logger = log.get_console_logger(self.__class__.__name__)

    # default plotting parameters
    self.colours = None
    self.markers = None
    self.zorder = None
    self.alpha = None
    self.size = None
    self.fig_kws = {}
    self.m_plot_kws = {}
    self.de_direction_colour = None
    self.dm_direction_colour = None
    self.dm_vmin = self.dm_vmax = None
    self.de_vmin = self.de_vmax = None

    self.set_plot_parameters()
def __init__(self, tax_id=9606, logger=None, force_update=False, go_dir=DEFAULT_GO_DIR, bg_genes=None):
    # gene_converter can be used to enable automatic gene conversion
    self.gene_converter = None
    self.logger = logger or log.get_console_logger(self.__class__.__name__)
    self.tax_id = tax_id
    if not os.path.isdir(go_dir):
        self.logger.warn("Creating master GO directory at %s.", go_dir)
        os.makedirs(go_dir)
    else:
        self.logger.info("Using existing GO directory at %s.", go_dir)
    self.base_dir = go_dir

    # get filenames and parse both GAF and OBO
    self.obo_fn = self.check_and_get_obo(force_update=force_update)
    self.gaf_fn = self.check_and_get_gaf(force_update=force_update)
    self.obo = obo_parser.GODag(self.obo_fn)
    self.gaf = associations.read_ncbi_gene2go(self.gaf_fn, taxids=[self.tax_id])
    self.logger.info("{N:,} annotated genes (tax id {tid})".format(N=len(self.gaf), tid=self.tax_id))

    self.bg_genes = bg_genes
    if self.bg_genes is not None:
        self.set_bg_genes(bg_genes)
def yugene_transform(marray_data, resolve_ties=True):
    """
    Apply the YuGene transform to the supplied data.
    Le Cao, Kim-Anh, Florian Rohart, Leo McHugh, Othmar Korn, and Christine A. Wells.
    "YuGene: A Simple Approach to Scale Gene Expression Data Derived from Different Platforms
    for Integrated Analyses." Genomics 103, no. 4 (April 2014): 239-51.
    doi:10.1016/j.ygeno.2014.03.001.
    Assume the data are supplied with samples in columns and genes in rows.
    :param resolve_ties: If True (default), replace all tied values with the mean. This is
    especially significant at low count values, which are often highly degenerate.
    """
    logger = get_console_logger(__name__)

    res = marray_data.copy()

    # add columnwise offset to ensure all positive values
    colmin = res.min(axis=0)
    neg_warn = False
    for i in np.where(colmin < 0)[0]:
        res.iloc[:, i] -= colmin[i]
        neg_warn = True
    if neg_warn:
        logger.warning("Data contained negative values. Columnwise shift applied to correct this.")

    for t in marray_data.columns:
        col = res.loc[:, t].sort_values(ascending=False)
        cs = col.cumsum()
        s = col.sum()
        # numerical error: the final value in cumsum() may not equal the sum
        if cs[-1] != s:
            cs[cs == cs[-1]] = s
        a = 1 - cs / s
        if resolve_ties:
            # FIXME: this is tediously slow; can definitely improve it!
            # find tied values in the input data
            tied = np.unique(col.loc[col.duplicated()].values)
            if tied.size > 1:
                logger.info("Resolving %d ties in column %s.", tied.size - 1, t)
                for i in tied[tied > 0]:
                    a[col == i] = a[col == i].mean()
            else:
                logger.info("No ties to resolve in column %s.", t)
        res.loc[a.index, t] = a

    # a numerical error in cumsum() may result in some small negative values. Zero these.
    res[res < 0] = 0.
    # colmin = res.min(axis=0)
    # colmin[colmin >= 0] = 0.
    # res = res.subtract(colmin, axis=1)

    return res
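# Minimal usage sketch for yugene_transform. The import path is a placeholder for
# wherever the function is actually defined in this codebase; the DataFrame values
# are made up for illustration.
import pandas as pd

from microarray.process import yugene_transform  # assumed module location

# genes in rows, samples in columns, as the docstring requires
dat = pd.DataFrame(
    [[10., 0.], [5., 5.], [5., 2.], [0., 1.]],
    index=['geneA', 'geneB', 'geneC', 'geneD'],
    columns=['sample1', 'sample2'],
)

yg = yugene_transform(dat)
# each column is rescaled so values lie in [0, 1]; within a column the most highly
# expressed genes receive the largest scores
print(yg)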
def __init__(
        self,
        base_dir=None,
        meta_fn=None,
        samples=None,
        tax_id=9606,
        batch_id=None,
        verbose=True,
        *args,
        **kwargs):
    """
    Base class for loading a dataset.
    :param base_dir: Path to the root input directory. All data must be contained in this
    directory or below it.
    :param meta_fn: Path to the meta file.
    :param samples: If supplied, use this to filter the files loaded.
    :param tax_id: The taxonomy ID (default: 9606, human)
    :param batch_id: Optionally supply a name for this batch, useful when combining batches
    """
    self.base_dir = base_dir
    if not os.path.isdir(self.base_dir):
        raise ValueError("Supplied base_dir %s does not exist or is not a directory." % self.base_dir)

    self.meta_fn = meta_fn
    if self.meta_fn is not None:
        if not os.path.isfile(self.meta_fn):
            raise ValueError("Meta file %s does not exist." % self.meta_fn)

    self.meta_is_linked = None
    self.sample_names = None
    self.samples_to_keep = samples
    self.tax_id = tax_id
    self.batch_id = batch_id
    self.verbose = verbose

    self.logger = log.get_console_logger(self.__class__.__name__)

    self.meta = None
    self.input_files = None
    self.data = None

    self.load_meta()
    self.get_inputs()
    self.load_data()
    self.post_process()
def __init__(
        self,
        data_fn=None,
        meta_fn=None,
        samples=None,
        tax_id=9606,
        batch_id=None,
        verbose=True,
        *args,
        **kwargs):
    """
    Base class for loading a dataset based on a single data file.
    :param data_fn: Path to the file containing all the data.
    :param meta_fn: Path to the meta file.
    :param samples: If supplied, use this to filter the files loaded.
    :param tax_id: The taxonomy ID (default: 9606, human)
    :param batch_id: Optionally supply a name for this batch, useful when combining batches
    """
    self.data_fn = data_fn
    if self.data_fn is None:
        raise ValueError("Must supply a valid path to the data file.")
    elif not os.path.isfile(self.data_fn):
        raise ValueError("Data file %s does not exist." % self.data_fn)

    self.meta_fn = meta_fn
    if self.meta_fn is not None:
        if not os.path.isfile(self.meta_fn):
            raise ValueError("Meta file %s does not exist." % self.meta_fn)

    self.meta_is_linked = None
    self.sample_names = None
    self.samples_to_keep = samples
    self.tax_id = tax_id
    self.batch_id = batch_id
    self.verbose = verbose

    self.logger = log.get_console_logger(self.__class__.__name__)

    self.meta = None
    self.input_files = data_fn
    self.data = None

    self.load_meta()
    self.load_data()
    self.post_process()
def __init__(self, clear_existing=True):
    self.logger = log.get_console_logger(self.__class__.__name__)

    # functional API - the python bindings are incomplete here?
    self.cy = CyRestClient()
    if clear_existing:
        # reset the session (in case something is already loaded)
        self.cy.session.delete()

    # command API - the python bindings are much better
    self.cy_cmd = cyrest.cyclient()

    # collections added to the session
    self.name_to_id = {}
    self.collections = {}
    self.auto_net_name = 1
def check_and_get_file(root_dir, ext, get_func, force_update=False, logger=None):
    """
    Check for a file with a given extension in the supplied dir.
    :param root_dir: Directory in which to look for (and store) the file.
    :param ext: File extension to search for.
    :param get_func: Function handle that, when called with no args, fetches and returns the data.
    :param force_update: If True, don't use existing file but instead force a new get call.
    :return: String, giving the filename of the data
    """
    if logger is None:
        logger = log.get_console_logger("check_and_get_file")
    if not os.path.isdir(root_dir):
        logger.warn("Creating %s directory at %s.", ext, root_dir)
        os.makedirs(root_dir)

    flist = glob.glob(os.path.join(root_dir, "*.%s" % ext))
    files_seen = {}
    for full_fn in flist:
        fn = os.path.split(full_fn)[1]
        try:
            d = datetime.datetime.strptime(fn, "%s.%s" % (datestr_fmt, ext))
        except ValueError:
            logger.warn("Failed to parse version of %s file %s. Skipping.", ext, full_fn)
        else:
            files_seen[d] = full_fn

    if force_update or len(files_seen) == 0:
        fn_out = os.path.join(
            root_dir,
            "%s.%s" % (datetime.date.today().strftime(datestr_fmt), ext)
        )
        dat = get_func()
        with open(fn_out, 'wb') as fout:
            fout.write(dat)
        logger.info("Downloaded new %s file and saved it at %s", ext, fn_out)
    else:
        latest_date = max(files_seen.keys())
        logger.info("Using existing %s file %s.", ext, files_seen[latest_date])
        fn_out = files_seen[latest_date]

    return fn_out
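# Hedged usage sketch for check_and_get_file. fetch_obo, the URL and the '/tmp/go'
# path are illustrative assumptions; check_and_get_file itself is the function above
# and relies on a module-level datestr_fmt for the dated filenames.
import requests

def fetch_obo():
    # hypothetical downloader returning the raw bytes of the file to be cached
    return requests.get("http://purl.obolibrary.org/obo/go/go-basic.obo").content

obo_fn = check_and_get_file("/tmp/go", "obo", fetch_obo)
# the first call downloads and writes e.g. /tmp/go/<today>.obo; subsequent calls
# reuse the most recent dated file unless force_update=True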
import glob
import os
import re

import pandas as pd

from load_data import loader
from rnaseq.general import ensembl_transcript_quant_to_gene
from settings import RNASEQ_DIR
from utils.log import get_console_logger

logger = get_console_logger(__name__)

INDEX_FIELDS = (
    'Approved Symbol',
    'Entrez Gene ID',
    'RefSeq IDs',
    'Ensembl Gene ID'
)


class RnaSeqFileLocations(object):
    def __init__(self, root_dir, alignment_subdir=None, batch_id=None, strandedness='r', tax_id=9606):
        self.root_dir = root_dir
        self.strandedness = strandedness
        self.alignment_subdir = alignment_subdir
        if batch_id is None:
            self.batch_id = os.path.split(self.root_dir)[-1]
        else:
            self.batch_id = batch_id
        self.tax_id = tax_id
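# Hedged usage sketch: the directory path is illustrative; only the constructor
# signature and the batch_id fallback defined above are assumed.
locs = RnaSeqFileLocations('/path/to/rnaseq/wtchg_p12345', strandedness='r')
print(locs.batch_id)  # 'wtchg_p12345', inferred from the root directory name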
import csv
import multiprocessing as mp
import os

import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.cluster import hierarchy as hc

from plotting import clustering
from rnaseq import gsea
from scripts.hgic_final import analyse_xcell_results
from scripts.hgic_final import consts
from settings import HGIC_LOCAL_DIR, GIT_LFS_DATA_DIR
from utils import log, output, reference_genomes

logger = log.get_console_logger()

XCELL_SIGNATURE_FN = os.path.join(GIT_LFS_DATA_DIR, 'xcell', 'ESM3_signatures.xlsx')


def load_ipa_signatures(fn):
    res = {}
    with open(fn, 'rb') as f:
        c = csv.reader(f)
        for row in c:
            res[row[0]] = row[1:]
    return res


def simplify_tcga_names(data):
import gzip
import os
import pickle
import re
import subprocess
import sys

import numpy as np
import pandas as pd
import pybedtools
import pysam
from matplotlib import pyplot as plt

sys.path.append(os.path.dirname(__file__) + '/../../')
from settings import DATA_DIR, LOCAL_DATA_DIR, GIT_LFS_DATA_DIR
from utils import log, genomics, output

logger = log.get_console_logger(__name__)


def get_motif_locations(fa_reader, motif, references):
    for c in references:
        this_ref = fa_reader[c]
        it = re.finditer(motif, this_ref)
        for t in it:
            yield (c, t.start())


def create_cpg_bed(fa_fn, outfn, references=None):
    fa_reader = pysam.FastaFile(fa_fn)
    if references is None:
        references = fa_reader.references
    # get location of every CpG
def __init__(self, loaders, intersection_only=True):
    """
    Class to combine multiple loader objects. Each loader represents a separate batch.
    Inputs can include multiple lane loaders.
    :param loaders: Iterable of loader objects.
    :param intersection_only: If True (default), reduce counts to the indices (e.g. genes)
    that are present in all loaders.
    """
    self.logger = log.get_console_logger(self.__class__.__name__)

    if len(loaders) < 2:
        raise ValueError("Must supply 2 or more loaders to use a MultipleBatchLoader.")

    # we can only claim the meta data is linked here if all loaders have this property
    self.meta_is_linked = True
    for l in loaders:
        if not l.meta_is_linked:
            self.meta_is_linked = False

    # set the batch column name avoiding clashes
    batch_col = 'batch'
    meta_cols = sorted(setops.reduce_union(*[t.meta.columns for t in loaders if t.meta is not None]))
    if batch_col in meta_cols:
        i = 1
        while batch_col in meta_cols:
            batch_col = "batch_%d" % i
            i += 1
    meta_cols += [batch_col]

    # check attributes that must match in all loaders
    if len(set([t.tax_id for t in loaders])) > 1:
        raise AttributeError(
            "The tax_id of the samples differ between loaders: %s"
            % ', '.join([str(t.tax_id) for t in loaders])
        )
    else:
        self.tax_id = loaders[0].tax_id

    if len(set([t.row_indexed for t in loaders])) > 1:
        raise AttributeError("row_indexed bool must be the same in all loaders")
    else:
        self.row_indexed = loaders[0].row_indexed

    extra_df_attributes = {}

    if self.row_indexed:
        row_indexed_dat_arr = {}
    else:
        dat = {}

    meta_values = []
    meta_index = []
    blank_meta_row = dict([(k, None) for k in meta_cols])

    # we may need to append a number to sample names
    sample_appendix = 0
    auto_batch = 1
    meta_auto_idx = 0
    samples_seen = set()

    for l in loaders:
        this_batch = l.batch_id
        if not hasattr(this_batch, '__iter__'):
            if l.batch_id is None:
                this_batch = auto_batch
                auto_batch += 1
            this_batch = pd.Series(this_batch, index=l.meta.index)

        try:
            this_samples = l.input_files.index.tolist()
        except AttributeError:
            # occurs when we are loading a single file
            # FIXME: find a better catch - this is too general
            if hasattr(l, 'input_files'):
                # this occurs if l is a single file loader
                ## FIXME: single file loaders may contain multiple samples
                ## in that case, this doesn't spot name clashes!!
                # FIXME: here's a workaround for now: may not be bulletproof
                this_samples = [l.input_files]
                if len(this_samples) != len(l.meta.index):
                    this_samples = l.meta.index.tolist()
            else:
                # this occurs if l is a batch loader
                # FIXME: may not give us valid sample names?
                this_samples = l.meta.index.tolist()

        # get a copy of the data
        if self.row_indexed:
            this_dat = l.data.copy()
        else:
            this_dat = copy.copy(l.data)

        # get a copy of meta
        if l.meta is not None:
            this_meta = l.meta.copy()

        # resolve any sample clashes in the data (NOT the meta data)
        clash_resolved = False
        new_names = []
        while len(samples_seen.intersection(this_samples)) > 0:
            sample_appendix += 1
            # find the clash
            clashes = samples_seen.intersection(this_samples)
            self.logger.warning(
                "Found sample name clash(es): %s. Modifying names to avoid errors.",
                ', '.join(clashes)
            )
            for c in clashes:
                new_names.append([
                    this_samples[this_samples.index(c)],
                    this_samples[this_samples.index(c)] + "_%d" % sample_appendix
                ])
                this_samples[this_samples.index(c)] += "_%d" % sample_appendix
            clash_resolved = True
        samples_seen.update(this_samples)

        if clash_resolved:
            # relabel metadata if linked
            if l.meta_is_linked:
                # reorder first to be sure it's the same as data
                this_meta = this_meta.loc[this_dat.columns]
                this_meta.index = this_samples

            # relabel the data
            if self.row_indexed:
                this_dat.columns = this_samples
            else:
                for prev, new in new_names:
                    this_dat[new] = this_dat.pop(prev)

            # relabel the batch IDs
            this_batch.index = this_samples

            # relabel any other DF data if present
            for fld in l.extra_df_attributes:
                x = getattr(l, fld)
                x.columns = this_samples

        # data
        if self.row_indexed:
            if isinstance(this_dat.columns, pd.MultiIndex):
                col_list = this_dat.columns.levels[0].tolist()
            else:
                col_list = this_dat.columns.tolist()
            for c in col_list:
                row_indexed_dat_arr[c] = this_dat[[c]]
        else:
            dat.update(this_dat)

        # other df attributes
        for fld in l.extra_df_attributes:
            if fld not in extra_df_attributes:
                extra_df_attributes[fld] = getattr(l, fld).copy()
            else:
                extra_df_attributes[fld] = pd.concat((extra_df_attributes[fld], getattr(l, fld)), axis=1)

        # rebuild meta
        if l.meta is not None:
            for i in this_meta.index:
                this_row = dict(blank_meta_row)
                this_row.update(this_meta.loc[i].to_dict())
                this_row[batch_col] = this_batch[i]
                meta_values.append(this_row)
                if l.meta_is_linked:
                    meta_index.append(i)
                else:
                    meta_index.append(meta_auto_idx)
                    meta_auto_idx += 1
        else:
            for c in this_dat.columns:
                this_row = dict(blank_meta_row)
                this_row[batch_col] = this_batch[c]
                meta_values.append(this_row)
                meta_index.append(meta_auto_idx)
                meta_auto_idx += 1

    self.meta = pd.DataFrame(meta_values, index=meta_index, columns=meta_cols)

    if intersection_only:
        join = 'inner'
    else:
        join = 'outer'

    if self.row_indexed:
        dat = pd.concat(
            [row_indexed_dat_arr[k] for k in self.meta.index],
            axis=1,
            sort=True,
            join=join
        )

    self.data = dat
    self.batch_id = self.meta.loc[:, batch_col]

    self.extra_df_attributes = tuple()
    for fld in extra_df_attributes:
        setattr(self, fld, extra_df_attributes[fld])
        self.extra_df_attributes += (fld,)
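# Hedged usage sketch: loader_batch1 / loader_batch2 stand in for any two loader
# instances built elsewhere in this package; the constructor call matches the
# signature defined above.
combined = MultipleBatchLoader([loader_batch1, loader_batch2], intersection_only=True)
# combined.data holds the column-wise concatenation (inner join on the row index),
# combined.meta gains a batch column and combined.batch_id records each sample's
# batch of origin
print(combined.meta.head())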
from settings import DAVID_WEB_SERVICES_CONFIG
from suds.client import Client
from utils import log

logger = log.get_console_logger('DAVID_web_services')

WSDL_URL = 'https://david-d.ncifcrf.gov/webservice/services/DAVIDWebService?wsdl'
SOAP_ENDPOINT = 'https://david-d.ncifcrf.gov/webservice/services/DAVIDWebService.DAVIDWebServiceHttpSoap11Endpoint/'


class WSDLApi(object):
    def __init__(self, url=WSDL_URL, user=DAVID_WEB_SERVICES_CONFIG['email']):
        self.user = user
        self.url = url
        self.client = None
        self.connect(endpoint=SOAP_ENDPOINT)

    def connect(self, endpoint):
        self.client = Client(self.url)
        self.client.wsdl.services[0].setlocation(endpoint)
        # authenticate user email
        self.client.service.authenticate(self.user)

    def introspection(self):
        """
        Print the service (introspection)
        :return:
        """
        print self.client
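# Hedged usage sketch: assumes DAVID_WEB_SERVICES_CONFIG['email'] in settings is a
# registered DAVID web services address; otherwise the authenticate() call in the
# constructor will fail.
api = WSDLApi()
api.introspection()  # prints the suds service description for the DAVID endpoint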
import gzip
import json
import multiprocessing as mp
import os
from StringIO import StringIO

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from settings import INTERMEDIATE_DIR
from utils import log, output

clogger = log.get_console_logger("rrbs_coverage_analysis")


def parse_one_result(cpg, perms):
    if len(cpg) == 0:
        return
    tab = pd.read_csv(StringIO(cpg), sep='\t', header=0, dtype=int)
    cpg_chr = tab.iloc[0, 0]
    cpg_start = tab.iloc[0, 1]
    cpg_arr = tab.iloc[:, -1].values

    # permutations
    n_bp = len(cpg_arr)
    n_perm = len(perms)
    perm_arr = np.zeros((n_bp, n_perm))
    perm_loc = []
    for j in range(n_perm):
        if len(perms[j]) == 0:
            # no coverage at all - skip
import logging

import pandas as pd

from rnaseq import gsea
from utils.log import get_console_logger
from . import consts

logger = get_console_logger("signature_classifier")


class ssGSEAClassifier(object):
    """
    Basic classifier that uses pre-defined signatures to score samples and assess classification.
    """
    def __init__(self, signature_dict, **ssgsea_kwds):
        """
        :param signature_dict: Dictionary. Keys are the class name, values are iterables of
        genes / probes or any other row index
        :param ssgsea_kwds: Any additional kwargs are passed directly to the ssgsea algorithm.
        """
        self.signatures = signature_dict
        self.ssgsea_kwds = ssgsea_kwds
        # it's useful to maintain a copy of all the signature IDs for validation purposes
        self.all_ids = reduce(lambda x, y: x.union(y), self.signatures.values(), set())

    def score(self, sample_data, check_overlap=True):
        """
        :param sample_data: Pandas Series (single sample) or DataFrame (multiple samples) to be classified.
        :param check_overlap: If True (default), check whether all the signature IDs are in the data. If
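# Hedged usage sketch: the gene symbols and random expression values are made up,
# and the exact return format of score() depends on the (truncated) implementation;
# only the constructor and method signatures above are taken as given.
import numpy as np
import pandas as pd

signatures = {
    'astrocyte': ['GFAP', 'AQP4', 'S100B'],
    'neuron': ['RBFOX3', 'SYN1', 'MAP2'],
}
clf = ssGSEAClassifier(signatures)

expr = pd.DataFrame(
    np.random.rand(6, 3),
    index=['GFAP', 'AQP4', 'S100B', 'RBFOX3', 'SYN1', 'MAP2'],
    columns=['sample_%d' % i for i in range(3)],
)
scores = clf.score(expr)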
import ftplib
import os

import pandas as pd
import requests

from load_data import geo_repo, sra
from settings import RNASEQ_DIR
from utils import output, log

"""
AIM: retrieve metadata (combined SRA and GEO) for a number of GEO datasets in one go
"""

GEO_FTP = "ftp.ncbi.nlm.nih.gov"
GEO_BASE = "/geo/series/{stripped_id}/{full_id}/matrix/{full_id}_series_matrix.txt.gz"

logger = log.get_console_logger("sra_batch_get_metadata")

inputs = [
    ('GSE116124', 'SRP151040'),
    ('GSE97265', 'SRP102810'),
    ('GSE89056', 'SRP091957'),
    ('GSE107654', 'SRP126289'),
    ('GSE97904', 'SRP104149'),
    ('GSE97619', 'SRP103788'),
    ('GSE85839', 'SRP082406'),
    ('GSE53094', 'SRP033569'),
    ('GSE67915', 'SRP057205'),
    # ('GSE62772', 'SRP049340'),  # need to dl two geo files
    ('GSE73211', 'SRP063867'),
]
import collections
import os
import re

import pandas as pd
from matplotlib import pyplot as plt

from plotting import venn
from rnaseq import gsea
from settings import OUTPUT_DIR
from utils import log, setops, output, excel

logger = log.get_console_logger("process_GSEA_results")


if __name__ == "__main__":
    """
    Use the results generated by exporting data with prepare_data then running GSEA
    (see code comments in prepare_data).
    """
    pids = [
        '018', '019', '030', '031',
        '017', '050', '054', '061',
        '026', '052'
    ]
    refs = ['gibco_nsc', 'h9_nsc']

    top_n_pathways = 20

    units = 'tpm'
    fdr = 0.05  # set to None to skip filtering

    indir = os.path.join(OUTPUT_DIR, "gsea_data", units)
    outdir = output.unique_output_dir("gsea_data")

    subgroups = {
        'RTK I': ['018', '019', '030', '031'],
        'RTK II': ['017', '050', '054', '061'],
        'MES': ['026', '052']
    }