def _score_challengeA_bunch(self, filenames, subname):
    """Score one prediction file per network and summarise the results.

    Every file is scored with :meth:`score_challengeA`, using
    ``subname + "_" + <1-based index>`` as second argument. The returned
    mappings are expected to expose the keys ``'aupr'``, ``'auroc'``,
    ``'p_aupr'`` and ``'p_auroc'``.

    :param filenames: the prediction files, one per network.
    :param str subname: prefix passed to :meth:`score_challengeA`.
    :return: a pandas Series holding the overall score, the AUPR and AUROC
        p-value scores (minus the mean of the log10 p-values) and the
        AUPR/AUROC of each network.
    """
    from easydev import Progress

    filenames = list(filenames)
    pb = Progress(len(filenames), 1)
    pb.animate(0)

    results = []
    for i, filename in enumerate(filenames):
        results.append(
            self.score_challengeA(filename, subname + "_" + str(i + 1)))
        pb.animate(i + 1)

    # Each score is built from its own p-values (they used to be swapped).
    aupr_score = -np.mean(np.log10([x['p_aupr'] for x in results]))
    auroc_score = -np.mean(np.log10([x['p_auroc'] for x in results]))
    score = (aupr_score + auroc_score) / 2.

    summary = {
        'Overall Score': score,
        'AUPR score (pval)': aupr_score,
        # Used to be filled with the AUPR score by mistake.
        'AUROC score (pval)': auroc_score,
    }
    for i, res in enumerate(results, start=1):
        summary['AUPR Net %s' % i] = res['aupr']
    for i, res in enumerate(results, start=1):
        summary['AUROC Net %s' % i] = res['auroc']
    return pd.Series(summary)
def fit(self, amp=1, progress=False, n_jobs=-1):
    r"""Fit every candidate distribution and gather the goodness-of-fit scores.

    Each distribution in :attr:`distributions` is handed to
    :meth:`_fit_single_distribution` through a thread-based joblib pool.
    Once all fits are done, :attr:`df_errors` is built with one column per
    criterion:

    - ``sumsquare_error``: sum of the square errors between the data and
      the fitted distribution, i.e.
      :math:`\sum_i \left( Y_i - pdf(X_i) \right)^2`
    - ``aic``, ``bic`` and ``kl_div``: taken from the matching private
      dictionaries.

    :param amp: not used by this method.
    :param bool progress: if True, a new progress bar is stored in
        :attr:`pb`; the flag is also forwarded to each fitting call.
    :param int n_jobs: number of jobs given to the pool.
    """
    import warnings

    # RuntimeWarning is silenced process-wide from here on.
    warnings.filterwarnings("ignore", category=RuntimeWarning)

    if progress:
        self.pb = Progress(len(self.distributions))

    tasks = (
        delayed(self._fit_single_distribution)(name, progress)
        for name in self.distributions
    )
    runner = Parallel(n_jobs=n_jobs, backend="threading")
    runner(tasks)

    criteria = {}
    criteria["sumsquare_error"] = self._fitted_errors
    criteria["aic"] = self._aic
    criteria["bic"] = self._bic
    criteria["kl_div"] = self._kldiv
    self.df_errors = pd.DataFrame(criteria)
def anova_one_drug(self, drug_id, animate=True, output='object'): """Computes ANOVA for a given drug across all features :param str drug_id: a valid drug identifier. :param animate: shows the progress bar :return: a dataframe Calls :meth:`anova_one_drug_one_feature` for each feature. """ # some features can be dropped ?? # drop first and second columns that are made of strings # works under python2 but not python 3. Assume that the 2 first #columns are the sample name and tissue feature # Then, we keep only cases with at least 3 features. # MSI could be used but is not like in original R code. features = self.features.df.copy() # need to skip the FACTOR to keep only features shift = self.features.shift features = features[features.columns[shift:]] # FIXME what about features with less than 3 zeros ? mask = features.sum(axis=0) >= 3 # TODO: MSI, tissues, name must always be kept # selected_features = features[features.columns[mask]] # scan all features for a given drug assert drug_id in self.ic50.df.columns N = len(selected_features.columns) pb = Progress(N, 10) res = {}
def search_from_smile_inchembl(self):
    """Search ChEMBL with the SMILES of each ChemSpider hit of each drug.

    For every drug in :attr:`drug_ids`, each identifier found in
    ``self.results[drug]`` is resolved with :meth:`_cs_get`; the
    ``'smiles'`` field of that entry is then sent to
    ``self.chembl.get_compounds_by_SMILES``.

    Populates two dictionaries keyed by drug:

    - :attr:`results_chemspider`: the last ChemSpider entry retrieved;
    - :attr:`results_chembl`: the list of all ``'compounds'`` returned.

    Answers without a usable ``'compounds'`` field are skipped.
    """
    N = len(self.drug_ids)
    pb = Progress(N)
    self.results_chembl = {}
    self.results_chemspider = {}
    for i, drug in enumerate(self.drug_ids):
        self.results_chembl[drug] = []
        if self.results[drug]:
            for chemspider_id in self.results[drug]:
                chemspider_entry = self._cs_get(chemspider_id)
                self.results_chemspider[drug] = chemspider_entry
                smile = chemspider_entry['smiles']
                # now search in chembl
                res_chembl = self.chembl.get_compounds_by_SMILES(smile)
                try:
                    self.results_chembl[drug].extend(
                        res_chembl['compounds'])
                except (KeyError, TypeError):
                    # Best effort: the answer has no 'compounds' key or is
                    # not subscriptable/iterable. Unlike the former bare
                    # ``except``, KeyboardInterrupt and unrelated errors
                    # are no longer swallowed.
                    pass
        pb.animate(i + 1)
def create_taxonomy_file(self, filename="taxonomy.dat"):
    """Write one text record per taxon of :attr:`df_nodes` into *filename*.

    Each record holds the taxon identifier, its parent identifier, its rank
    and its scientific name (looked up in the rows of :attr:`df_names`
    whose ``key`` is ``'scientific name'``), and is terminated by a ``//``
    line.

    :param str filename: path of the output file (overwritten).
    """
    logger.info("Please wait while creating the output file. "
                "This may take a few minutes")
    from easydev import Progress
    pb = Progress(len(self.df_nodes))

    df_names = self.df_names.query("key == 'scientific name'").copy()

    with open(filename, "w") as fout:
        for count, taxid in enumerate(self.df_nodes.index, start=1):
            row = self.df_nodes.loc[taxid]
            fout.write("ID : {}\n".format(taxid))
            fout.write("PARENT ID : {}\n".format(row.parent))
            fout.write("RANK : {}\n".format(row['rank']))
            fout.write("{:26s}: {}\n".format(
                "SCIENTIFIC NAME", df_names.loc[taxid, "name"]))
            fout.write("//\n")
            pb.animate(count)
def create_html_drugs(self):
    """Create an HTML page for each drug

    Every drug identifier found in the ``DRUG_ID`` column of :attr:`df`
    gets a report built by ``HTMLOneDrug``. Drugs that have rows in the
    significant set receive the matching sub-dataframe; the others receive
    an empty dictionary.
    """
    all_drugs = list(self.df['DRUG_ID'].unique())

    # group the significant associations by drug
    df = self.get_significant_set()
    groups = df.groupby('DRUG_ID')
    significant_drugs = groups.groups

    if self.verbose:
        print("Creating individual HTML pages for each drug")

    pb = Progress(len(all_drugs))
    for i, drug in enumerate(all_drugs):
        # get the subgroup when the drug has significant hits
        if drug in significant_drugs:
            subdf = groups.get_group(drug)
        else:
            subdf = {}

        html = HTMLOneDrug(self, self.df, subdf, drug)
        html.create_report(onweb=False)
        if self.settings.animate:
            pb.animate(i + 1)
    if self.settings.animate:
        print("\n")
def select_random_reads(self, N=None, output_filename="random.fasta"):
    """Select random reads and save in a file

    :param N: either the number of random unique reads to select (int,
        capped to the number of reads), or an explicit selection of
        0-based read positions given as a set or a list.
    :param str output_filename: file in which the selected reads are
        written.
    :return: the set of selected 0-based read positions.
    :raises TypeError: if *N* is not an int, a set or a list (this
        includes the default ``None``).
    """
    import numpy as np

    thisN = len(self)
    if isinstance(N, int):
        if N > thisN:
            N = thisN
        # create random set of reads to pick up
        cherries = list(range(thisN))
        np.random.shuffle(cherries)
        # cast to set for efficient membership tests
        cherries = set(cherries[0:N])
    elif isinstance(N, set):
        cherries = N
    elif isinstance(N, list):
        cherries = set(N)
    else:
        # Fail early, before the output file is created; the selection
        # used to be left undefined in this case.
        raise TypeError(
            "N must be an int, a set or a list; got {}".format(
                type(N).__name__))

    fasta = FastxFile(self.filename)
    pb = Progress(thisN)  # since we scan the entire file
    with open(output_filename, "w") as fh:
        for i, read in enumerate(fasta):
            if i in cherries:
                fh.write(str(read) + "\n")
            pb.animate(i + 1)
    return cherries
def download_accession_from_ncbi(self, accession):
    """Fetch FASTA records from NCBI and save them in the library directory.

    :param accession: one of

        - a list of accession identifiers;
        - the name of an existing file containing whitespace-separated
          accessions;
        - a single accession identifier (any other string).

    Each record is requested through ``self.eutils.EFetch`` and written
    to ``<dbname>/library/<accession>.fa``. When the service returns an
    integer instead of data, the accession is reported and skipped.

    :raises TypeError: if *accession* is neither a list nor a string.
    """
    if isinstance(accession, list):
        # Used to be a no-op, leaving the list of accessions undefined.
        accessions = accession
    elif isinstance(accession, str):
        if os.path.exists(accession):
            with open(accession, "r") as fin:
                accessions = fin.read().split()
        else:
            accessions = [accession]
    else:
        raise TypeError(
            "accession must be a list or a string; got {}".format(
                type(accession).__name__))

    from easydev import Progress
    N = len(accessions)
    pb = Progress(N)
    logger.info("Fetching {} accession fasta files from NCBI".format(N))
    for i, this_accession in enumerate(accessions):
        data = self.eutils.EFetch("nucleotide", rettype="fasta",
                                  id=this_accession, retmode="text")
        if isinstance(data, int):
            msg = "Could not fetch this accession: {}. continue".format(
                this_accession)
            logger.info(msg)
            print(msg)
        else:
            outname = "{}/library/{}.fa".format(self.dbname, this_accession)
            with open(outname, "wb") as fout:
                fout.write(data)
        pb.animate(i + 1)
def _get_G(self, gold):
    """Build the signed matrix G from the gold-standard edge list.

    :param gold: dataframe whose columns ``0`` and ``1`` hold the 1-based
        identifiers of the two genes of each edge.
    :return: a numpy array of shape ``(gold[0].max(), gold[1].max())``.
        For every pair (x, y) with x among the values of column 0 and y
        among the values of columns 0 and 1, the cell ``[x-1, y-1]`` is 1
        when the edge is listed in *gold*, -1 when it is not and x != y;
        all other cells are 0.
    """
    from easydev import Progress

    regulators = list(set(gold[0]))
    targets = list(set(gold[[0, 1]].stack()))

    N, M = gold[0].max(), gold[1].max()

    # A stores indices going from 0 (not 1) to N-1, hence the -1 offsets
    # when i, j are the gene identifiers.
    A = np.zeros((N, M))
    for i, j in gold[[0, 1]].values:
        A[i - 1, j - 1] = 1

    G = np.zeros((N, M))
    pb = Progress(len(regulators), 1)
    for i, x in enumerate(regulators):
        for y in targets:
            if A[x - 1, y - 1] == 1:
                G[x - 1, y - 1] = 1
            elif x != y:
                G[x - 1, y - 1] = -1
        pb.animate(i + 1)
    return G
def diagnostics(self):
    """Return a dictionary with information about the analysis

    Every drug of ``self.ic50.drugIds`` is paired with every feature
    located after the first ``self.features.shift`` entries of
    ``self.features.features``, and the pair is probed with
    :meth:`_get_one_drug_one_feature_data` (``diagnostic_only=True``). A
    pair is feasible when the returned ``status`` is ``True``.

    :return: a dict with the keys ``n_drug``, ``n_combos``,
        ``feasible_tests`` and ``percentage_feasible_tests``. The
        percentage is 0.0 when there is no combination to test.
    """
    n_drugs = len(self.ic50.drugIds)
    n_features = len(self.features.features) - self.features.shift
    n_combos = n_drugs * n_features
    feasible = 0
    pb = Progress(n_drugs, 1)
    for counter, drug in enumerate(self.ic50.drugIds, start=1):
        for feature in self.features.features[self.features.shift:]:
            dd = self._get_one_drug_one_feature_data(drug, feature,
                                                     diagnostic_only=True)
            if dd.status is True:
                feasible += 1
        pb.animate(counter)

    # Guard against a division by zero when nothing can be tested.
    if n_combos:
        percentage = float(feasible) / n_combos * 100
    else:
        percentage = 0.0

    results = {
        'n_drug': n_drugs,
        'n_combos': n_combos,
        'feasible_tests': feasible,
        'percentage_feasible_tests': percentage}
    return results
def filling_chembl_pubchem_using_unichem(self):
    """Try to fill the CHEMSPIDER field of drugs that have no identifier.

    For each drug in :attr:`drug_ids` whose CHEMSPIDER, CHEMBL and PUBCHEM
    fields are all empty in ``self.dd.df``, the drug name is searched with
    :meth:`_cs_find` and the hit(s) are assigned to the CHEMSPIDER field
    of ``self.dd_filled.df``.

    NOTE(review): despite the method name, no UniChem call is visible in
    this body -- confirm the intended scope.
    """
    N = len(self.drug_ids)
    pb = Progress(N)
    for i, this in enumerate(self.drug_ids):
        entry = self.dd.df.loc[this]
        # if no information is provided, we will need to get it
        # from chemspider
        # From the database, when chembl is provided, it is unique
        # same for chemspider and pubchem and CAS
        select = entry[['CHEMSPIDER', 'CHEMBL', 'PUBCHEM']]
        if select.count() == 0:
            name = self.dd.df.loc[this].DRUG_NAME
            results = self._cs_find(name)
            if len(results) == 0:
                # nothing found
                pass
            elif len(results) == 1:
                # NOTE(review): chained ``.loc[...].loc[...] = value`` may
                # write into a temporary copy of the row rather than into
                # the dataframe -- confirm that the value is stored.
                self.dd_filled.df.loc[this].loc['CHEMSPIDER'] = results[0]
            else:
                # non unique: the whole list of hits is assigned
                #chemspider = ",".join([str(x) for x in results])
                self.dd_filled.df.loc[this].loc['CHEMSPIDER'] = results
        pb.animate(i + 1)

    # Search in chemspider systematically
    # NOTE(review): in this second pass ``select`` is not recomputed (it
    # still holds the value left by the last iteration of the loop above),
    # ``entry`` is not used, ``drug`` is not defined anywhere in this
    # function and ``res`` is never read -- this loop looks unfinished.
    for i, this in enumerate(self.drug_ids):
        entry = self.dd.df.loc[this]
        if select.count() == 1:
            res = self._cs_find(drug)
        pb.animate(i + 1)
def find_motif_fasta(self, filename, motif, window=200,
                     local_threshold=None, global_threshold=None):
    """Scan all sequences of a FASTA file and report those passing a threshold.

    Each sequence is scored with :meth:`find_motif_from_sequence`; it is
    kept when the second returned value is greater than or equal to the
    global threshold.

    :param str filename: FASTA file to scan.
    :param motif: motif forwarded to :meth:`find_motif_from_sequence`.
    :param int window: forwarded to :meth:`find_motif_from_sequence`.
    :param local_threshold: forwarded to :meth:`find_motif_from_sequence`.
    :param global_threshold: minimal score required to report a sequence.
        If None, :attr:`global_threshold` is used (the argument used to
        be ignored).
    :return: a dataframe with the columns ``query_name``, ``hit``,
        ``length``, ``start`` and ``end``; ``start`` is always 0 and
        ``end`` and ``length`` are the sequence length.
    """
    from sequana import FastA
    from easydev import Progress

    if global_threshold is None:
        global_threshold = self.global_threshold

    data = FastA(filename)
    pb = Progress(len(data))

    hits = {
        "query_name": [],
        "hit": [],
        "length": [],
        "start": [],
        "end": []
    }
    for i, item in enumerate(data):
        _, S = self.find_motif_from_sequence(
            item.sequence, motif, window=window,
            local_threshold=local_threshold)
        if S >= global_threshold:
            hits['query_name'].append(item.name)
            hits['start'].append(0)
            hits['end'].append(len(item.sequence))
            hits['length'].append(len(item.sequence))
            hits['hit'].append(S)
        pb.animate(i + 1)

    return pd.DataFrame(hits)
def to_kmer_content(self, k=7):
    """Return a Series with kmer count across all reads

    :param int k: (default to 7-mers)
    :return: Pandas Series with index as kmer and values as count,
        sorted by decreasing count.

    Takes about 30 seconds on a million reads.
    """
    # Counter is slow if we apply it on each read, so kmers are gathered
    # in a buffer that is flushed into the counter once it is large.
    import collections
    from sequana.kmer import get_kmer

    counter = collections.Counter()
    pb = Progress(len(self))
    buffer_ = []
    for i, this in enumerate(self):
        buffer_.extend(get_kmer(this['sequence'], k))
        if len(buffer_) > 100000:
            # update() only visits the buffered kmers, whereas ``+=``
            # also rescans the whole counter on every flush.
            counter.update(buffer_)
            buffer_ = []
        # 1-based so that the bar is complete after the last read
        pb.animate(i + 1)
    counter.update(buffer_)

    ts = pd.Series(counter)
    ts.sort_values(inplace=True, ascending=False)
    return ts
def create_html_associations(self):
    """Write one HTML report per significant association.

    The rows come from :meth:`get_significant_set`. A single shared
    ``Association`` report object is re-configured for each row and saved
    as **<association id>.html**.
    """
    print("\nCreating individual HTML pages for each association")
    significant = self.get_significant_set()

    rows = zip(
        significant['DRUG_ID'].values,
        significant['FEATURE'].values,
        significant['ASSOC_ID'].values,
        significant['ANOVA_FEATURE_FDR'].values,
    )
    pb = Progress(len(significant))

    report = Association(self, drug='dummy', feature='dummy', fdr='dummy')
    for done, (drug, feature, assoc, fdr) in enumerate(rows, start=1):
        report.drug = drug
        report.feature = feature
        report._filename = str(assoc) + '.html'
        report.fdr = fdr
        report.assoc_id = assoc
        # the instance is shared, so it must be reset for each row
        report._init_report()
        report.create_report(onweb=False)
        pb.animate(done)
def get_supported_families(self, N=1000, progress=True):
    """Returns the list of supported PANTHER family IDs

    The service returns a limited number of items per request (1000 when
    this was written), selected by a start index: ``startIndex=1`` gives
    the first page, ``startIndex=N+1`` the second one and so on. As of
    20 Feb 2020, there were about 15,000 families.

    This function calls the service as many times as required and returns
    all families in one go.

    :param int N: expected number of items per page.
    :param bool progress: show a progress bar (one step per page).
    :return: the list of all families.
    :raises ValueError: if the first page does not contain exactly *N*
        items.
    """
    params = {'startIndex': 1}
    res = self.http_get("supportedpantherfamilies", params=params)
    results = res['search']['panther_family_subfam_list']['family']
    if len(results) != N:
        msg = "looks like the services changed. Call this function with N={}"
        msg = msg.format(len(results))
        raise ValueError(msg)

    number_of_families = int(res['search']['number_of_families'])
    # Ceiling division: no request is sent past the last page, even when
    # the total is an exact multiple of N.
    n_pages = -(-number_of_families // N)

    pb = None
    if progress:
        # Imported (and animated) only when a progress bar is requested.
        from easydev import Progress
        pb = Progress(n_pages)
        pb.animate(1)

    for page in range(1, n_pages):
        params = {'startIndex': page * N + 1}
        res = self.http_get("supportedpantherfamilies", params=params)
        data = res['search']['panther_family_subfam_list']['family']
        results.extend(data)
        if pb is not None:
            pb.animate(page + 1)
    return results
def get_gis(self, extensions=['fa']):
    """Collect the GI number found in the header of each library FASTA file.

    Files are searched under ``<dbname>/library`` with the glob patterns
    ``**/*<extension>`` and ``**/**/*<extension>``; the list of matches is
    stored in :attr:`filenames`. The first line of each file must start
    with ``>`` and contain ``gi``; the GI is the second ``|``-separated
    field of that line.

    :param extensions: file name endings to look for.
    :return: the list of GI numbers as integers, in the same order as
        :attr:`filenames` (also stored in :attr:`gis`).
    :raises ValueError: if a file does not start with ``>``.
    """
    self.filenames = []
    root = self.dbname
    for pattern in ("%s/library/**/*%s", "%s/library/**/**/*%s"):
        for extension in extensions:
            self.filenames.extend(glob.iglob(pattern % (root, extension)))

    N = len(self.filenames)
    pb = Progress(N)
    gis = []
    for i, filename in enumerate(self.filenames):
        # Only the header is needed; close the file right away instead of
        # leaking one handle per file.
        with open(filename, "r") as data:
            line = data.readline()
        if line.startswith('>'):
            assert "gi" in line, "expected >gi to be found at the beginning"
            gi = line[1:].split("|")[1]
        else:
            raise ValueError(
                "This file %s does not seem to be a FASTA file" % filename)
        gis.append(gi)
        pb.animate(i + 1)
    print()

    gis = [int(x) for x in gis]
    self.gis = gis
    assert len(gis) == len(self.filenames)
    return gis
def filter(self, identifiers_list=[], min_bp=None, max_bp=None,
           progressbar=True, output_filename='filtered.fastq'):
    """Save reads in a new file if they are not in the identifiers_list

    :param identifiers_list: identifiers (first word of the header line)
        of the reads to discard.
    :param int min_bp: ignore reads with length shorter than min_bp
        (default 0)
    :param int max_bp: ignore reads with length above max_bp
        (default 1e9)
    :param bool progressbar: show a progress bar.
    :param str output_filename: name of the output file; it is passed
        through :meth:`_istozip` and the result is compressed with
        :meth:`_gzip` when that call says so.

    NOTE(review): the length is measured on the raw sequence line as
    yielded by ``grouper``; if that line still carries its newline, the
    length is over-estimated by one -- confirm.
    """
    # 7 seconds without identifiers to scan the file
    # on a 750000 reads
    if min_bp is None:
        min_bp = 0
    if max_bp is None:
        max_bp = 1e9

    # set membership is O(1); the list was scanned for every read
    unwanted = set(identifiers_list)

    # make sure we are at the beginning
    self.rewind()

    output_filename, tozip = self._istozip(output_filename)

    with open(output_filename, "w") as fout:
        pb = Progress(self.n_reads)
        buf = []
        filtered = 0
        saved = 0

        for count, lines in enumerate(grouper(self._fileobj)):
            identifier = lines[0].split()[0].decode()
            if identifier in unwanted:
                filtered += 1
            else:
                N = len(lines[1])
                if min_bp <= N <= max_bp:
                    buf.append("{}{}+\n{}".format(
                        lines[0].decode("utf-8"),
                        lines[1].decode("utf-8"),
                        lines[3].decode("utf-8")))
                    saved += 1
                else:
                    filtered += 1
            # flush the buffer from time to time to bound memory usage
            if count % 100000 == 0:
                fout.write("".join(buf))
                buf = []
            if progressbar is True:
                pb.animate(count + 1)
        fout.write("".join(buf))
        if filtered < len(identifiers_list):
            print("\nWARNING: not all identifiers were found in the fastq file to " +
                  "be filtered.")
    logger.info("\n{} reads were filtered out and {} saved in {}".format(
        filtered, saved, output_filename))
    if tozip is True:
        logger.info("Compressing file")
        self._gzip(output_filename)
def _init(self):
    """Reset the fitting results and create a fresh progress bar.

    The six result containers are set to new empty dictionaries, the fit
    counter goes back to 0 and :attr:`pb` is sized on the number of
    distributions.
    """
    containers = (
        "fitted_param",
        "fitted_pdf",
        "_fitted_errors",
        "_aic",
        "_bic",
        "_kldiv",
    )
    for attribute in containers:
        setattr(self, attribute, {})

    # fit progress
    self._fit_i = 0
    self.pb = Progress(len(self.distributions))
def volcano_plot_all_drugs(self):
    """Draw and save one volcano plot per drug.

    The drugs are the unique values of the drug identifier column of
    :attr:`df`. Each figure is saved as **volcano_<drug identifier>.png**
    with a size of 10 by 10 inches.
    """
    drug_ids = list(self.df[self._colname_drugid].unique())
    pb = Progress(len(drug_ids), 1)
    for done, drug_id in enumerate(drug_ids, start=1):
        self.volcano_plot_one_drug(drug_id)
        self.savefig("volcano_%s.png" % drug_id, size_inches=(10, 10))
        pb.animate(done)
def volcano_plot_all_features(self):
    """Draw and save one volcano plot per feature.

    The features are the unique values of the feature column of
    :attr:`df`. Each figure is saved as **volcano_<feature name>.png**
    with a size of 10 by 10 inches.
    """
    feature_names = list(self.df[self._colname_feature].unique())
    print('Creating image for each feature (using all drugs)')
    pb = Progress(len(feature_names), 1)
    for done, name in enumerate(feature_names, start=1):
        self.volcano_plot_one_feature(name)
        self.savefig("volcano_%s.png" % name, size_inches=(10, 10))
        pb.animate(done)
def create_summary_pages(self):
    """Create summary pages

    Once the main analysis is done (:meth:`analyse`), and the company
    packages have been created
    (:meth:`create_data_packages_for_companies`), this method creates a
    summary HTML page (index.html) for the tissues, a similar summary page
    for the tissues of each company, and finally a summary page for the
    companies.

    The final directory tree looks like::

        |-- index.html
        |-- company_packages
        |   |-- index.html
        |   |-- Company1
        |   |   |-- Tissue1
        |   |   |-- Tissue2
        |   |   |-- index.html
        |   |-- Company2
        |   |   |-- Tissue1
        |   |   |-- Tissue2
        |   |   |-- index.html
        |-- tissue_packages
        |   |-- index.html
        |   |-- Tissue1
        |   |-- Tissue2

    A failure on one company is printed and does not stop the others.
    """
    # Step 1: the main directory (tissue_packages)
    print(purple("Creating summary index.html for the tissues"))
    self._create_summary_pages(self.main_directory, verbose=False)

    # Step 2: one summary per company
    print(purple("Creating summary index.html for each company"))
    pb = Progress(len(self.companies))
    for done, company in enumerate(self.companies, start=1):
        try:
            company_path = self.company_directory + os.sep + company
            self._create_summary_pages(company_path, verbose=False,
                                       company=company)
        except Exception as err:
            print(red("Issue with %s. Continue with other companies" % company))
            print(err)
        pb.animate(done)

    # Step 3: an index pointing towards each company
    self._create_main_index()
def test_progressbar():
    """Exercise the two low-level progress bars and the Progress wrapper.

    Both low-level bars are animated over two short steps; the wrapper is
    then checked to use an interval of 1 for 100 items and 2 for 200.
    """
    N = 2

    def _animate_all(bar):
        # two-argument form of animate, with a short pause between steps
        for step in range(0, N):
            time.sleep(.1)
            bar.animate(step + 1, step)

    _animate_all(progressbar.progress_bar(N))
    _animate_all(progressbar.TextProgressBar(N, progressbar.consoleprint))

    hundred = Progress(100)
    hundred.animate(1)
    assert hundred.pb.interval == 1

    two_hundred = Progress(200)
    assert two_hundred.pb.interval == 2
    two_hundred.animate(1)
def check_ipython_notebook():
    """Run every notebook matching ``*ipynb`` in the current directory.

    Each notebook is parsed with ``read(..., 'json')`` and executed with
    ``NotebookRunner``; its file name is printed before it runs.
    """
    notebooks = glob.glob("*ipynb")
    N = len(notebooks)
    pb = Progress(N)
    for i, filename in enumerate(notebooks):
        print(purple(filename))
        # close the file once parsed instead of leaking the handle
        with open(filename) as fh:
            notebook = read(fh, 'json')
        r = NotebookRunner(notebook)
        r.run_notebook()
        pb.animate(i + 1)
def _load_complexes(self, show_progress=True):
    """Download the details of every complex and cache them.

    ``self.webserv.details`` is called for each identifier of the
    ``complexAC`` column of :attr:`df`; the answers are stored by
    identifier in :attr:`_complexes`.

    :param bool show_progress: animate the progress bar while loading.
    """
    from easydev import Progress
    import time

    identifiers = self.df.complexAC
    pb = Progress(len(identifiers))
    details_by_id = {}
    self.logging.info("Loading all details from the IntactComplex database")
    for done, complex_id in enumerate(identifiers, start=1):
        details_by_id[complex_id] = self.webserv.details(complex_id)
        if show_progress:
            pb.animate(done)
    self._complexes = details_by_id
def create_html_features(self):
    """Write one HTML report per feature of the significant set.

    The significant associations are grouped by ``FEATURE`` and each
    group is rendered with ``HTMLOneFeature``.
    """
    significant = self.get_significant_set()
    by_feature = significant.groupby('FEATURE')

    print("\nCreating individual HTML pages for each feature")
    pb = Progress(len(by_feature.indices.keys()))
    for done, name in enumerate(by_feature.indices.keys(), start=1):
        # rows of the significant set that belong to this feature
        report = HTMLOneFeature(self, self.df, by_feature.get_group(name),
                                name)
        report.create_report(onweb=False)
        pb.animate(done)
def plot_pca_vs_max_features(self, step=100, n_components=2, progress=True):
    """Plot the value of each principal component versus ``max_features``.

    :meth:`plot` is called (without showing the figure) for
    ``max_features`` going from 10 up to the number of rows of :attr:`df`
    (excluded) by increments of *step*; the values it returns are then
    drawn in percent, one subplot per component.

    :param int step: increment of ``max_features`` (capped to the number
        of rows).
    :param int n_components: number of components; must be 2, 3 or 4.
    :param bool progress: animate a progress bar.

    .. plot::
        :include-source:

        from sequana.viz.pca import PCA
        from sequana import sequana_data
        import pandas as pd

        data = sequana_data("test_pca.csv")
        df = pd.read_csv(data)
        df = df.set_index("Id")
        p = PCA(df)
        p.plot_pca_vs_max_features()

    """
    assert n_components in [2,3,4]
    N = len(self.df)
    step = N if step > N else step

    # the scan starts at 10 features
    X = range(10, N, step)

    from easydev import Progress
    pb = Progress(len(X))
    Y = []
    for done, max_features in enumerate(X, start=1):
        Y.append(self.plot(n_components=n_components,
                           max_features=max_features, show_plot=False))
        if progress:
            pb.animate(done)

    # one stacked panel per requested component
    labels = ("PC1 (%)", "PC2 (%)", "PC3 (%)", "PC4 (%)")
    for component in range(n_components):
        pylab.subplot(n_components, 1, component + 1)
        pylab.plot(X, [y[component]*100 for y in Y])
        pylab.ylabel(labels[component])
def _load_pathways(self, progress=True):
    """Download and parse all KEGG pathways, then derive the gene sets.

    Populates three attributes:

    - :attr:`pathways`: parsed pathway records keyed by pathway
      identifier without the ``path:`` prefix; each ``NAME`` is reduced
      to the text preceding the first ``" - "``;
    - :attr:`gene_sets`: for each pathway that has a ``GENE`` entry, the
      list of gene names;
    - :attr:`df_pathways`: the records as a dataframe (one row per
      pathway) without the ``ENTRY``, ``REFERENCE`` and ``DBLINKS``
      columns, plus a ``GO`` column extracted from ``DBLINKS``.

    :param bool progress: animate a progress bar while downloading.
    """
    # This is just loading all pathways once for all
    logger.info("loading all pathways from KEGG. may take time the first time")
    self.pathways = {}

    from easydev import Progress
    pb = Progress(len(self.kegg.pathwayIds))
    for done, pathway_id in enumerate(self.kegg.pathwayIds, start=1):
        record = self.kegg.parse(self.kegg.get(pathway_id))
        self.pathways[pathway_id.replace("path:", "")] = record
        if progress:
            pb.animate(done)

    # Some cleanup: keep the first name, without its " - ..." suffix
    for record in self.pathways.values():
        record['NAME'] = record['NAME'][0].split(" - ", 1)[0]

    # save gene sets
    self.gene_sets = {}
    for ID, res in self.pathways.items():
        if "GENE" not in res.keys():
            print("SKIPPED (no genes) {}: {}".format(ID, res['NAME']))
            continue
        # some pathways report genes as id:'gene name; description'
        # (e.g. 'eco'); others report genes as id:'description', in
        # which case the gene id itself is kept
        self.gene_sets[ID] = [
            description.split(';')[0] if ";" in description else geneID
            for geneID, description in res['GENE'].items()
        ]

    # save all pathways info
    self.df_pathways = pd.DataFrame(self.pathways).T
    for column in ("ENTRY", "REFERENCE"):
        del self.df_pathways[column]

    go = []
    for links in self.df_pathways.DBLINKS:
        if isinstance(links, dict) and 'GO' in links.keys():
            go.append(links['GO'])
        else:
            go.append(None)
    self.df_pathways['GO'] = go
    del self.df_pathways["DBLINKS"]
def filter_names_dmp_file(self, filename="names.dmp",
                          output="names_filtered.dmp", taxons=[]):
    """Keep only the lines of a names dump that belong to given lineages.

    The identifiers returned by :meth:`get_family` for each taxon are
    gathered; a line of *filename* is copied to *output* when the integer
    found before its first tab is one of them.

    :param str filename: input file (tab-separated, identifier first).
    :param str output: output file (overwritten).
    :param taxons: taxon identifiers whose families must be kept.
    """
    all_taxons = set()
    pb = Progress(len(taxons))
    for i, taxon in enumerate(taxons):
        all_taxons.update(self.get_family(taxon))
        pb.animate(i + 1)
    print("")

    with open(filename, "r") as fin, open(output, "w") as fout:
        # stream the file line by line instead of loading it entirely
        for line in fin:
            if int(line.split("\t", 1)[0]) in all_taxons:
                fout.write(line)
def select_random_reads(self, N=None, output_filename="random.fastq"):
    """Select random reads and save in a file

    :param N: either the number of random unique reads to select (int,
        capped to the number of reads), or an explicit selection of
        0-based read positions given as a set or a list. You can select
        random reads for R1, and re-use the returned selection as input
        for the R2 (since pairs must be kept)
    :param str output_filename: file in which the selected reads are
        written.
    :return: the set of selected 0-based read positions.
    :raises TypeError: if *N* is not an int, a set or a list (this
        includes the default ``None``).

    If you have a pair of files, the same reads must be selected in R1
    and R2.::

        f1 = FastQ(file1)
        selection = f1.select_random_reads(N=1000)
        f2 = FastQ(file2)
        f2.select_random_reads(selection)

    """
    thisN = len(self)
    if isinstance(N, int):
        if N > thisN:
            N = thisN
        # create random set of reads to pick up
        cherries = list(range(thisN))
        np.random.shuffle(cherries)
        # cast to set for efficient membership tests
        cherries = set(cherries[0:N])
    elif isinstance(N, set):
        cherries = N
    elif isinstance(N, list):
        cherries = set(N)
    else:
        # Fail early, before the output file is created; the selection
        # used to be left undefined in this case.
        raise TypeError(
            "N must be an int, a set or a list; got {}".format(
                type(N).__name__))

    fastq = pysam.FastxFile(self.filename)
    pb = Progress(thisN)  # since we scan the entire file
    with open(output_filename, "w") as fh:
        for i, read in enumerate(fastq):
            if i in cherries:
                fh.write(str(read) + "\n")
            pb.animate(i + 1)
    return cherries
def optimise_elastic_net(self, drug_name, feature_name, N=20, Nalpha=20):
    """Scan a grid of ElasticNet settings for one drug/feature pair.

    The L1 weight takes *N* values between 0 and 1 and alpha takes
    *Nalpha* values between 0 and 5. For each combination the regression
    settings are updated, :meth:`anova_one_drug_one_feature` and
    :meth:`_get_anova_summary` are run, and ``self.data_lm.bic`` is
    recorded.

    :param drug_name: forwarded to :meth:`anova_one_drug_one_feature`.
    :param feature_name: forwarded to :meth:`anova_one_drug_one_feature`.
    :param int N: number of L1 weights (rows of the result).
    :param int Nalpha: number of alpha values (columns of the result).
    :return: an array of shape (N, Nalpha) filled with the recorded
        ``bic`` values. The regression settings are left at the last
        combination tried.
    """
    l1_weights = pylab.linspace(0, 1, N)
    alpha_grid = pylab.linspace(0, 5, Nalpha)
    mses = np.zeros((N, Nalpha))

    pb = Progress(N)
    for row, l1_weight in enumerate(l1_weights):
        for col, alpha in enumerate(alpha_grid):
            self.settings.regression_method = 'ElasticNet'
            self.settings.regression_alpha = alpha
            self.settings.regression_L1_wt = l1_weight
            # both calls are kept although their return values are unused
            self.anova_one_drug_one_feature(drug_name, feature_name)
            self._get_anova_summary(self.data_lm, output='dataframe')
            mses[row, col] = self.data_lm.bic
        pb.animate(row + 1)
    return mses