def _opt_ridge_lasso(self, drug_name, feature_name, method, alphas=None):
    if alphas is None:
        alphas = pylab.linspace(0, 1, 20)

    mses = []
    params = []
    method_buf = self.settings.regression_method
    alpha_buf = self.settings.elastic_net.alpha

    pb = Progress(len(alphas))
    for j, alpha in enumerate(alphas):
        self.settings.regression_method = method
        self.settings.elastic_net.alpha = alpha
        odof = self.anova_one_drug_one_feature(drug_name, feature_name)
        anova = self._get_anova_summary(self.data_lm, output='dataframe')
        #mses.append(anova.ix['Residuals']['Sum Sq'])
        mses.append(anova.ix['tissue']['F value'])
        #mses.append(anova['Sum Sq'].sum())
        pb.animate(j + 1)
        params.append(self.data_lm.params)

    self.settings.regression_method = method_buf
    self.settings.elastic_net.alpha = alpha_buf
    return alphas, mses, params

def _get_G(self, gold):
    from easydev import Progress
    import scipy.sparse

    regulators = list(set(gold[0]))
    targets = list(set(gold[[0, 1]].stack()))
    N, M = gold[0].max(), gold[1].max()

    ## A will store indices going from 0 (not 1) to N-1
    # hence the -1 indices when handling A if i,j are the
    # values of the gene
    A = np.zeros((N, M))
    for row in gold[[0, 1]].values:
        i, j = row
        A[i - 1, j - 1] = 1
    A_sparse = scipy.sparse.csr_matrix(A)

    #N, M = len(regulators), len(targets)
    G = np.zeros((N, M))

    pb = Progress(len(regulators), 1)
    for i, x in enumerate(regulators):
        for j, y in enumerate(targets):
            if A[x - 1, y - 1] == 1:
                G[x - 1, y - 1] = 1
            elif x != y:
                G[x - 1, y - 1] = -1
        pb.animate(i + 1)
    return G

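# Toy, self-contained illustration (not from the original module) of the +1/-1
# encoding built by _get_G above: known regulator->target edges become 1, other
# off-diagonal entries become -1, and self-edges stay at 0. The gold standard is
# a made-up 3-gene example where column 0 is the regulator and column 1 the target.
import numpy as np
import pandas as pd

gold = pd.DataFrame([[1, 2], [2, 3], [3, 1]])   # hypothetical gold standard edges
N, M = gold[0].max(), gold[1].max()
A = np.zeros((N, M))
for i, j in gold[[0, 1]].values:
    A[i - 1, j - 1] = 1                          # adjacency of the gold standard
G = np.where(A == 1, 1, -1)                      # edge -> +1, non-edge -> -1
np.fill_diagonal(G, 0)                           # ignore self-edges, as in x != y above
print(G)
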
def search_from_smile_inchembl(self):
    N = len(self.drug_ids)

    pb = Progress(N)
    self.results_chembl = {}
    self.results_chemspider = {}

    for i in range(0, N):
        drug = self.drug_ids[i]
        self.results_chembl[drug] = []
        if self.results[drug]:
            for chemspider_id in self.results[drug]:
                chemspider_entry = self._cs_get(chemspider_id)
                self.results_chemspider[drug] = chemspider_entry
                smile = chemspider_entry['smiles']
                # now search in chembl
                res_chembl = self.chembl.get_compounds_by_SMILES(smile)
                try:
                    res_chembl['compounds']
                    self.results_chembl[drug].extend(
                        res_chembl['compounds'])
                except:
                    pass
        pb.animate(i + 1)

def filling_chembl_pubchem_using_unichem(self):
    """Fill missing CHEMSPIDER/CHEMBL/PUBCHEM identifiers where possible"""
    N = len(self.drug_ids)
    pb = Progress(N)
    for i, this in enumerate(self.drug_ids):
        entry = self.dd.df.ix[this]
        # if no information is provided, we will need to get it
        # from chemspider

        # From the database, when chembl is provided, it is unique
        # same for chemspider and pubchem and CAS
        select = entry[['CHEMSPIDER', 'CHEMBL', 'PUBCHEM']]
        if select.count() == 0:
            name = self.dd.df.ix[this].DRUG_NAME
            results = self._cs_find(name)
            if len(results) == 0:
                # nothing found
                pass
            elif len(results) == 1:
                self.dd_filled.df.ix[this].loc['CHEMSPIDER'] = results[0]
            else:
                # non unique
                #chemspider = ",".join([str(x) for x in results])
                self.dd_filled.df.ix[this].loc['CHEMSPIDER'] = results
        pb.animate(i + 1)

    # Search in chemspider systematically
    for i, this in enumerate(self.drug_ids):
        entry = self.dd.df.ix[this]
        select = entry[['CHEMSPIDER', 'CHEMBL', 'PUBCHEM']]
        if select.count() == 1:
            res = self._cs_find(entry.DRUG_NAME)
        pb.animate(i + 1)

def create_html_associations(self):
    """Create an HTML page for each significant association

    The name of the output HTML file is **<association id>.html**
    where association id is stored in :attr:`df`.

    """
    print("\nCreating individual HTML pages for each association")
    df = self.get_significant_set()

    drugs = df['DRUG_ID'].values
    features = df['FEATURE'].values
    assocs = df['ASSOC_ID'].values
    fdrs = df['ANOVA_FEATURE_FDR'].values

    N = len(df)
    pb = Progress(N)

    html = Association(self, drug='dummy', feature='dummy', fdr='dummy')

    for i in range(N):
        html.drug = drugs[i]
        html.feature = features[i]
        html._filename = str(assocs[i]) + '.html'
        html.fdr = fdrs[i]
        html.assoc_id = assocs[i]
        html._init_report()  # since we have one shared instance
        html.create_report(onweb=False)
        pb.animate(i + 1)

def find_motif_fasta(self, filename, motif, window=200,
                     local_threshold=None, global_threshold=None):
    from sequana import FastA
    data = FastA(filename)
    N = len(data)

    from easydev import Progress
    pb = Progress(N)
    df = {
        "query_name": [],
        "hit": [],
        "length": [],
        "start": [],
        "end": []
    }
    for i, item in enumerate(data):
        X1, S = self.find_motif_from_sequence(item.sequence, motif,
                                              window=window,
                                              local_threshold=local_threshold)
        if S >= self.global_threshold:
            df['query_name'].append(item.name)
            df['start'].append(0)
            df['end'].append(len(item.sequence))
            df['length'].append(len(item.sequence))
            df['hit'].append(S)
        pb.animate(i + 1)
    df = pd.DataFrame(df)
    return df

def to_kmer_content(self, k=7):
    """Return a Series with kmer count across all reads

    :param int k: (default to 7-mers)
    :return: Pandas Series with index as kmer and values as count.

    Takes about 30 seconds on a million reads.
    """
    # Counter is slow if we apply it on each read.
    # .count is slow as well
    import collections
    from sequana.kmer import get_kmer
    counter = collections.Counter()
    pb = Progress(len(self))
    buffer_ = []
    for i, this in enumerate(self):
        buffer_.extend(list(get_kmer(this['sequence'], k)))
        if len(buffer_) > 100000:
            counter += collections.Counter(buffer_)
            buffer_ = []
        pb.animate(i)
    counter += collections.Counter(buffer_)

    ts = pd.Series(counter)
    ts.sort_values(inplace=True, ascending=False)

    return ts

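# Hedged usage sketch (not from the library): assumes this method is exposed on
# sequana's FastQ reader and that the input file below exists.
from sequana import FastQ

fq = FastQ("sample_R1.fastq.gz")   # hypothetical input file
kmers = fq.to_kmer_content(k=7)    # Series indexed by 7-mer, values are counts
print(kmers.head(10))              # ten most frequent 7-mers
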
def _score_challengeA_bunch(self, filenames, subname):
    from easydev import Progress
    pb = Progress(5, 1)
    pb.animate(0)
    results = []
    for i, filename in enumerate(filenames):
        res = self.score_challengeA(filename, subname + "_" + str(i + 1))
        pb.animate(i + 1)
        results.append(res)

    aupr_score = -np.mean(np.log10([x["p_aupr"] for x in results]))
    auroc_score = -np.mean(np.log10([x["p_auroc"] for x in results]))
    score = (aupr_score + auroc_score) / 2.0

    # pd.TimeSeries was removed from pandas; a plain Series behaves the same here
    df = pd.Series(dtype="float64")
    df["Overall Score"] = score
    df["AUPR score (pval)"] = aupr_score
    df["AUROC score (pval)"] = auroc_score

    for i in range(1, 6):
        df["AUPR Net %s" % i] = results[i - 1]["aupr"]
    for i in range(1, 6):
        df["AUROC Net %s" % i] = results[i - 1]["auroc"]
    return df

def diagnostics(self):
    """Return dataframe with information about the analysis
    """
    n_drugs = len(self.ic50.drugIds)
    n_features = len(self.features.features) - self.features.shift
    n_combos = n_drugs * n_features
    feasible = 0
    pb = Progress(n_drugs, 1)
    counter = 0
    for drug in self.ic50.drugIds:
        for feature in self.features.features[self.features.shift:]:
            dd = self._get_one_drug_one_feature_data(drug, feature,
                                                     diagnostic_only=True)
            if dd.status is True:
                feasible += 1
        counter += 1
        pb.animate(counter)

    results = {
        'n_drug': n_drugs,
        'n_combos': n_combos,
        'feasible_tests': feasible,
        'percentage_feasible_tests': float(feasible) / n_combos * 100}
    return results

def compounds2accession(self, compounds):
    """For each compound, identifies the target and corresponding UniProt
    accession number

    This is not part of ChEMBL API

    ::

        # we recommend to use cache if you use this method regularly
        c = Chembl(cache=True)
        drugs = c.get_approved_drugs()

        # to speed up example
        drugs = drugs[0:20]
        IDs = [x['molecule_chembl_id'] for x in drugs]

        c.compounds2accession(IDs)

    """
    # we jump from compounds to targets through activities
    # Here this is a one to many mapping so we initialise a default
    # dictionary.
    from collections import defaultdict
    compound2target = defaultdict(set)

    filter = "molecule_chembl_id__in={}"
    from easydev import Progress
    pb = Progress(len(compounds))
    for i in range(0, len(compounds)):
        # FIXME could get activities by bunch using
        # ",".join(compounds[i:i+10]) for example
        activities = self.get_activity(filters=filter.format(compounds[i]))
        # get target ChEMBL IDs from activities
        for act in activities:
            compound2target[act['molecule_chembl_id']].add(act['target_chembl_id'])
        pb.animate(i + 1)

    # What we need is to get targets for all targets found in the previous
    # step. For each compound/drug there are hundreds of targets though. And
    # we will call the get_target for each list of hundreds targets. This
    # will take forever. Instead, because there are *only* 12,000 targets,
    # let us download all of them ! This took about 4 minutes on this test but
    # if you use the cache, next time it will be much much quicker. This is
    # not done at the activities level because there are too many entries
    targets = self.get_target(limit=-1)

    # identifies all target chembl id to easily retrieve the entry later on
    target_names = [target['target_chembl_id'] for target in targets]

    # retrieve all uniprot accessions for all targets of each compound
    for compound, targs in compound2target.items():
        accessions = set()
        for target in targs:
            index = target_names.index(target)
            accessions = accessions.union([comp['accession']
                for comp in targets[index]['target_components']])
        compound2target[compound] = accessions

    return compound2target

def select_random_reads(self, N=None, output_filename="random.fasta"):
    """Select random reads and save in a file

    :param int N: number of random unique reads to select
        should provide a number but a list can be used as well.
    :param str output_filename:
    """
    import numpy as np
    thisN = len(self)
    if isinstance(N, int):
        if N > thisN:
            N = thisN
        # create random set of reads to pick up
        cherries = list(range(thisN))
        np.random.shuffle(cherries)
        # cast to set for efficient iteration
        cherries = set(cherries[0:N])
    elif isinstance(N, set):
        cherries = N
    elif isinstance(N, list):
        cherries = set(N)

    fasta = FastxFile(self.filename)
    pb = Progress(thisN)  # since we scan the entire file
    with open(output_filename, "w") as fh:
        for i, read in enumerate(fasta):
            if i in cherries:
                fh.write(read.__str__() + "\n")
            else:
                pass
            pb.animate(i + 1)
    return cherries

def create_html_drugs(self):
    """Create an HTML page for each drug"""
    # group by drugs
    all_drugs = list(self.df['DRUG_ID'].unique())

    df = self.get_significant_set()
    groups = df.groupby('DRUG_ID')
    if self.verbose:
        print("Creating individual HTML pages for each drug")
    N = len(groups.indices.keys())
    N = len(all_drugs)
    pb = Progress(N)
    for i, drug in enumerate(all_drugs):
        # enumerate(groups.indices.keys()):
        # get the indices and therefore subgroup
        if drug in groups.groups.keys():
            subdf = groups.get_group(drug)
        else:
            subdf = {}

        html = HTMLOneDrug(self, self.df, subdf, drug)
        html.create_report(onweb=False)
        if self.settings.animate:
            pb.animate(i + 1)
    if self.settings.animate:
        print("\n")

def get_gis(self, extensions=['fa']):
    self.filenames = []
    root = self.dbname
    for extension in extensions:
        self.filenames.extend(
            list(glob.iglob("%s/library/**/*%s" % (root, extension))))
    for extension in extensions:
        self.filenames.extend(
            list(glob.iglob("%s/library/**/**/*%s" % (root, extension))))
    N = len(self.filenames)
    pb = Progress(N)
    gis = []
    for i, filename in enumerate(self.filenames):
        data = open(filename, "r")
        line = data.readline()
        if line.startswith('>'):
            assert "gi" in line, "expected >gi to be found at the beginning"
            gi = line[1:].split("|")[1]
        else:
            raise ValueError(
                "This file %s does not seem to be a FASTA file" % filename)
        gis.append(gi)
        pb.animate(i + 1)
    print()
    gis = [int(x) for x in gis]
    self.gis = gis
    assert len(gis) == len(self.filenames)
    return gis

def create_html_drugs(self):
    """Create an HTML page for each drug"""
    # group by drugs
    all_drugs = list(self.df['DRUG_ID'].unique())

    df = self.get_significant_set()
    groups = df.groupby('DRUG_ID')
    if self.verbose:
        print("Creating individual HTML pages for each drug")
    N = len(groups.indices.keys())
    N = len(all_drugs)
    pb = Progress(N)
    for i, drug in enumerate(all_drugs):
        # enumerate(groups.indices.keys()):
        # get the indices and therefore subgroup
        if drug in groups.groups.keys():
            subdf = groups.get_group(drug)
        else:
            subdf = {}

        html = HTMLOneDrug(self, self.df, subdf, drug)
        html.create_report(onweb=False)
        if self.settings.animate:
            pb.animate(i + 1)
    if self.settings.animate:
        print("\n")

def download_accession_from_ncbi(self, accession):
    # Input can be a list, a unique string, or the name of a file that
    # contains a 1-column list of accessions to retrieve
    if isinstance(accession, list):
        accessions = accession
    elif isinstance(accession, str):
        if os.path.exists(accession):
            with open(accession, "r") as fin:
                accessions = fin.read().split()
        else:
            accessions = [accession]

    from easydev import Progress
    N = len(accessions)
    pb = Progress(N)
    logger.info("Fetching {} accession fasta files from NCBI".format(N))

    for i, accession in enumerate(accessions):
        data = self.eutils.EFetch("nucleotide", rettype="fasta",
                                  id=accession, retmode="text")
        if isinstance(data, int):
            logger.info(
                "Could not fetch this accession: {}. continue".format(accession))
            print("Could not fetch this accession: {}. continue".format(accession))
        else:
            outname = "{}/library/{}.fa".format(self.dbname, accession)
            with open(outname, "wb") as fout:
                fout.write(data)
        pb.animate(i + 1)

def create_html_associations(self):
    """Create an HTML page for each significant association

    The name of the output HTML file is **<association id>.html**
    where association id is stored in :attr:`df`.

    """
    print("\nCreating individual HTML pages for each association")
    df = self.get_significant_set()

    drugs = df['DRUG_ID'].values
    features = df['FEATURE'].values
    assocs = df['ASSOC_ID'].values
    fdrs = df['ANOVA_FEATURE_FDR'].values

    N = len(df)
    pb = Progress(N)

    html = Association(self, drug='dummy', feature='dummy', fdr='dummy')

    for i in range(N):
        html.drug = drugs[i]
        html.feature = features[i]
        html._filename = str(assocs[i]) + '.html'
        html.fdr = fdrs[i]
        html.assoc_id = assocs[i]
        html._init_report()  # since we have one shared instance
        html.create_report(onweb=False)
        pb.animate(i + 1)

def select_random_reads(self, N=None, output_filename="random.fasta"):
    """Select random reads and save in a file

    :param int N: number of random unique reads to select
        should provide a number but a list can be used as well.
    :param str output_filename:
    """
    import numpy as np
    thisN = len(self)
    if isinstance(N, int):
        if N > thisN:
            N = thisN
        # create random set of reads to pick up
        cherries = list(range(thisN))
        np.random.shuffle(cherries)
        # cast to set for efficient iteration
        cherries = set(cherries[0:N])
    elif isinstance(N, set):
        cherries = N
    elif isinstance(N, list):
        cherries = set(N)

    fasta = FastxFile(self.filename)
    pb = Progress(thisN)  # since we scan the entire file
    with open(output_filename, "w") as fh:
        for i, read in enumerate(fasta):
            if i in cherries:
                fh.write(read.__str__() + "\n")
            else:
                pass
            pb.animate(i + 1)
    return cherries

def process_single_reads(reader, modifiers, filters, n_progress=-1):
    """
    Loop over reads, find adapters, trim reads, apply modifiers and
    output modified reads.

    Return a Statistics object.
    """
    n = 0  # no. of processed reads
    total_bp = 0

    if n_progress != -1:
        try:
            from easydev import Progress
            pb = Progress(n_progress)
            count = 0
        except:
            n_progress = -1

    for read in reader:
        n += 1
        total_bp += len(read.sequence)
        for modifier in modifiers:
            read = modifier(read)
        for filter in filters:
            if filter(read):
                break
        if n_progress != -1:
            count += 1
            pb.animate(count)

    return Statistics(n=n, total_bp1=total_bp, total_bp2=None)

def create_taxonomy_file(self, filename="taxonomy.dat"):
    logger.info("Please wait while creating the output file. "
                "This may take a few minutes")
    from easydev import Progress
    pb = Progress(len(self.df_nodes))
    count = 0
    df_names = self.df_names.query("key == 'scientific name'").copy()
    with open(filename, "w") as fout:
        for taxid in self.df_nodes.index:
            row = self.df_nodes.loc[taxid]
            fout.write("ID : {}\n".format(taxid))
            fout.write("PARENT ID : {}\n".format(row.parent))
            fout.write("RANK : {}\n".format(row['rank']))
            #names = df_names.loc[taxid]
            fout.write("{:26s}: {}\n".format("SCIENTIFIC NAME",
                                             df_names.loc[taxid, "name"]))
            """
            len(names)
            for k, v in zip(names['key'], names['name']):
                if k.upper() in ['SCIENTIFIC NAME', 'SYNONYM']:
                    fout.write("{:26s}: {}\n".format(k.upper(), v))
            except:
                k, v = names['key'], names['name']
                fout.write("{:26s}: {}\n".format(k.upper(), v))
            """
            fout.write("//\n")
            count += 1
            pb.animate(count)

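# For reference, a single record written by create_taxonomy_file() above looks
# like the block below (taxid values are illustrative); records are separated
# by a "//" line, which is exactly the separator parsed by load_records() later.
#
#   ID : 9606
#   PARENT ID : 9605
#   RANK : species
#   SCIENTIFIC NAME           : Homo sapiens
#   //
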
def search_from_smile_inchembl(self):
    N = len(self.drug_ids)

    pb = Progress(N)
    self.results_chembl = {}
    self.results_chemspider = {}

    for i in range(0, N):
        drug = self.drug_ids[i]
        self.results_chembl[drug] = []
        if self.results[drug]:
            for chemspider_id in self.results[drug]:
                chemspider_entry = self._cs_get(chemspider_id)
                self.results_chemspider[drug] = chemspider_entry
                smile = chemspider_entry['smiles']
                # now search in chembl
                res_chembl = self.chembl.get_compounds_by_SMILES(smile)
                try:
                    res_chembl['compounds']
                    self.results_chembl[drug].extend(res_chembl['compounds'])
                except:
                    pass
        pb.animate(i + 1)

def dendogram_coefficients(self, stacked=False, show=True, cmap="terrain"):
    """Shows the coefficient of each optimised model for each drug"""
    drugids = self.drugIds
    from easydev import Progress
    pb = Progress(len(drugids))
    d = {}

    for i, drug_name in enumerate(drugids):
        X, Y = self._get_one_drug_data(drug_name, randomize_Y=False)
        results = self.runCV(drug_name, verbose=False)
        df = pd.DataFrame({'name': X.columns,
                           'weight': results.coefficients})
        df = df.set_index("name").sort_values("weight")
        d[drug_name] = df.copy()
        pb.animate(i + 1)

    # use drugid to keep same order as in the data
    dfall = pd.concat([d[i] for i in drugids], axis=1)
    dfall.columns = drugids

    if show:
        from biokit import heatmap
        h = heatmap.Heatmap(dfall, cmap=cmap)
        h.plot(num=1, colorbar_position="top left")

    if stacked is True:
        dfall = dfall.stack().reset_index()
        dfall.columns = ["feature", "drug", "weight"]
    return dfall

def filter(self, identifiers_list=[], min_bp=None, max_bp=None,
           progressbar=True, output_filename='filtered.fastq'):
    """Save reads in a new file if they are not in the identifier_list

    :param int min_bp: ignore reads with length shorter than min_bp
    :param int max_bp: ignore reads with length above max_bp

    """
    # 7 seconds without identifiers to scan the file
    # on a 750000 reads
    if min_bp is None:
        min_bp = 0
    if max_bp is None:
        max_bp = 1e9
    # make sure we are at the beginning
    self.rewind()

    output_filename, tozip = self._istozip(output_filename)

    with open(output_filename, "w") as fout:
        pb = Progress(self.n_reads)
        buf = ""
        filtered = 0
        saved = 0

        for count, lines in enumerate(grouper(self._fileobj)):
            identifier = lines[0].split()[0]
            if lines[0].split()[0].decode() in identifiers_list:
                filtered += 1
            else:
                N = len(lines[1])
                if N <= max_bp and N >= min_bp:
                    buf += "{}{}+\n{}".format(
                        lines[0].decode("utf-8"),
                        lines[1].decode("utf-8"),
                        lines[3].decode("utf-8"))
                    saved += 1
                else:
                    filtered += 1
            if count % 100000 == 0:
                fout.write(buf)
                buf = ""
                if progressbar is True:
                    pb.animate(count + 1)
        fout.write(buf)
        if filtered < len(identifiers_list):
            print("\nWARNING: not all identifiers were found in the fastq file to "
                  "be filtered.")
    logger.info("\n{} reads were filtered out and {} saved in {}".format(
        filtered, saved, output_filename))

    if tozip is True:
        logger.info("Compressing file")
        self._gzip(output_filename)

def _init(self):
    self.fitted_param = {}
    self.fitted_pdf = {}
    self._fitted_errors = {}
    self._aic = {}
    self._bic = {}
    self._kldiv = {}
    self._fit_i = 0  # fit progress
    self.pb = Progress(len(self.distributions))

def volcano_plot_all_drugs(self):
    """Create a volcano plot for each drug and save in PNG files

    Each filename is set to **volcano_<drug identifier>.png**
    """
    drugs = list(self.df[self._colname_drugid].unique())
    pb = Progress(len(drugs), 1)
    for i, drug in enumerate(drugs):
        self.volcano_plot_one_drug(drug)
        self.savefig("volcano_%s.png" % drug, size_inches=(10, 10))
        pb.animate(i + 1)

def volcano_plot_all_drugs(self):
    """Create a volcano plot for each drug and save in PNG files

    Each filename is set to **volcano_<drug identifier>.png**
    """
    drugs = list(self.df[self._colname_drugid].unique())
    pb = Progress(len(drugs), 1)
    for i, drug in enumerate(drugs):
        self.volcano_plot_one_drug(drug)
        self.savefig("volcano_%s.png" % drug, size_inches=(10, 10))
        pb.animate(i + 1)

def volcano_plot_all_features(self):
    """Create a volcano plot for each feature and save in PNG files

    Each filename is set to **volcano_<feature name>.png**
    """
    features = list(self.df[self._colname_feature].unique())
    print('Creating image for each feature (using all drugs)')
    pb = Progress(len(features), 1)
    for i, feature in enumerate(features):
        self.volcano_plot_one_feature(feature)
        self.savefig("volcano_%s.png" % feature, size_inches=(10, 10))
        pb.animate(i + 1)

def _load_complexes(self, show_progress=True):
    from easydev import Progress
    import time
    pb = Progress(len(self.df.complexAC))
    complexes = {}
    self.logging.info("Loading all details from the IntactComplex database")
    for i, identifier in enumerate(self.df.complexAC):
        res = self.webserv.details(identifier)
        complexes[identifier] = res
        if show_progress:
            pb.animate(i + 1)
    self._complexes = complexes

def check_ipython_notebook():
    notebooks = glob.glob("*ipynb")
    N = len(notebooks)
    pb = Progress(N)
    for i, filename in enumerate(notebooks):
        print(purple(filename))
        notebook = read(open(filename), 'json')
        r = NotebookRunner(notebook)
        r.run_notebook()
        pb.animate(i + 1)

def create_summary_pages(self):
    """Create summary pages

    Once the main analysis is done (:meth:`analyse`), and the company
    packages have been created (:meth:`create_data_packages_for_companies`),
    you can run this method that will create a summary HTML page
    (index.html) for the tissue, and a similar summary HTML page for the
    tissues of each company. Finally, an HTML summary page for the
    companies is also created.

    The final tree directory looks like::

        |-- index.html
        |-- company_packages
        |   |-- index.html
        |   |-- Company1
        |   |   |-- Tissue1
        |   |   |-- Tissue2
        |   |   |-- index.html
        |   |-- Company2
        |   |   |-- Tissue1
        |   |   |-- Tissue2
        |   |   |-- index.html
        |-- tissue_packages
        |   |-- index.html
        |   |-- Tissue1
        |   |-- Tissue2

    """
    # First for the main directory (tissue_packages):
    print(purple("Creating summary index.html for the tissues"))
    self._create_summary_pages(self.main_directory, verbose=False)

    # Then for each company:
    print(purple("Creating summary index.html for each company"))
    pb = Progress(len(self.companies))
    for i, company in enumerate(self.companies):
        try:
            self._create_summary_pages(self.company_directory + os.sep +
                                       company, verbose=False, company=company)
        except Exception as err:
            print(red("Issue with %s. Continue with other companies" % company))
            print(err)
        pb.animate(i + 1)

    # Finally, an index towards each company
    self._create_main_index()

def volcano_plot_all_features(self):
    """Create a volcano plot for each feature and save in PNG files

    Each filename is set to **volcano_<feature name>.png**
    """
    features = list(self.df[self._colname_feature].unique())
    print('Creating image for each feature (using all drugs)')
    pb = Progress(len(features), 1)
    for i, feature in enumerate(features):
        self.volcano_plot_one_feature(feature)
        self.savefig("volcano_%s.png" % feature, size_inches=(10, 10))
        pb.animate(i + 1)

def check_ipython_notebook():
    notebooks = glob.glob("*ipynb")
    N = len(notebooks)
    pb = Progress(N)
    for i, filename in enumerate(notebooks):
        print(purple(filename))
        notebook = read(open(filename), 'json')
        r = NotebookRunner(notebook)
        r.run_notebook()
        pb.animate(i + 1)

def create_html_features(self):
    """Create an HTML page for each significant feature"""
    df = self.get_significant_set()
    groups = df.groupby('FEATURE')
    print("\nCreating individual HTML pages for each feature")
    N = len(groups.indices.keys())
    pb = Progress(N)
    for i, feature in enumerate(groups.indices.keys()):
        # get the indices and therefore subgroup
        subdf = groups.get_group(feature)
        html = HTMLOneFeature(self, self.df, subdf, feature)
        html.create_report(onweb=False)
        pb.animate(i + 1)

def create_html_features(self):
    """Create an HTML page for each significant feature"""
    df = self.get_significant_set()
    groups = df.groupby('FEATURE')
    print("\nCreating individual HTML pages for each feature")
    N = len(groups.indices.keys())
    pb = Progress(N)
    for i, feature in enumerate(groups.indices.keys()):
        # get the indices and therefore subgroup
        subdf = groups.get_group(feature)
        html = HTMLOneFeature(self, self.df, subdf, feature)
        html.create_report(onweb=False)
        pb.animate(i + 1)

def plot_cindex(self, drug_name, alphas, l1_ratio=0.5, n_folds=10,
                hold=False):
    # This is longish (300 seconds with 10 folds and 80 alphas
    # for GDSC v5 data sets).
    from dreamtools.core.cindex import cindex

    CI_train = {}
    CI_test = {}
    for c in range(n_folds):
        CI_train[c] = []
        CI_test[c] = []

    from easydev import Progress
    pb = Progress(len(alphas))

    for i, alpha in enumerate(alphas):
        self.elastic_net(drug_name, alpha=alpha, l1_ratio=l1_ratio,
                         n_folds=n_folds)

        # compute the concordance index for each fold
        for kf in range(n_folds):
            x_train = self.kfold_data['x_train'][kf].values
            y_train = self.kfold_data['y_train'][kf].values

            x_test = self.kfold_data['x_test'][kf].values
            y_test = self.kfold_data['y_test'][kf].values

            x_train_pred = self.en.predict(x_train)
            x_test_pred = self.en.predict(x_test)

            CI_test[kf].append(1 - cindex(x_test_pred, y_test,
                                          [True] * len(y_test)))
            CI_train[kf].append(1 - cindex(x_train_pred, y_train,
                                           [True] * len(y_train)))
        pb.animate(i)

    mu_train = pd.DataFrame(CI_train).transpose().mean()
    sigma_train = pd.DataFrame(CI_train).transpose().std()

    mu_test = pd.DataFrame(CI_test).transpose().mean()
    sigma_test = pd.DataFrame(CI_test).transpose().std()

    best_alpha = alphas[pd.DataFrame(CI_test).mean(axis=1).argmax()]

    pylab.clf()
    pylab.errorbar(pylab.log(alphas), mu_train, yerr=sigma_train, label="train")
    pylab.errorbar(pylab.log(alphas) + .1, mu_test, yerr=sigma_test, label="test")
    pylab.plot(pylab.log(alphas), mu_train, 'ob')
    pylab.plot(pylab.log(alphas) + .1, mu_test, 'or')
    pylab.legend()
    pylab.axvline(pylab.log(best_alpha), lw=2, color="purple")

    return best_alpha

def create_summary_pages(self):
    """Create summary pages

    Once the main analysis is done (:meth:`analyse`), and the company
    packages have been created (:meth:`create_data_packages_for_companies`),
    you can run this method that will create a summary HTML page
    (index.html) for the tissue, and a similar summary HTML page for the
    tissues of each company. Finally, an HTML summary page for the
    companies is also created.

    The final tree directory looks like::

        |-- index.html
        |-- company_packages
        |   |-- index.html
        |   |-- Company1
        |   |   |-- Tissue1
        |   |   |-- Tissue2
        |   |   |-- index.html
        |   |-- Company2
        |   |   |-- Tissue1
        |   |   |-- Tissue2
        |   |   |-- index.html
        |-- tissue_packages
        |   |-- index.html
        |   |-- Tissue1
        |   |-- Tissue2

    """
    # First for the main directory (tissue_packages):
    print(purple("Creating summary index.html for the tissues"))
    self._create_summary_pages(self.main_directory, verbose=False)

    # Then for each company:
    print(purple("Creating summary index.html for each company"))
    pb = Progress(len(self.companies))
    for i, company in enumerate(self.companies):
        try:
            self._create_summary_pages(self.company_directory + os.sep +
                                       company, verbose=False, company=company)
        except Exception as err:
            print(red("Issue with %s. Continue with other companies" % company))
            print(err)
        pb.animate(i + 1)

    # Finally, an index towards each company
    self._create_main_index()

def plot_pca_vs_max_features(self, step=100, n_components=2, progress=True):
    """

    .. plot::
        :include-source:

        from sequana.viz.pca import PCA
        from sequana import sequana_data
        import pandas as pd

        data = sequana_data("test_pca.csv")
        df = pd.read_csv(data)
        df = df.set_index("Id")

        p = PCA(df)
        p.plot_pca_vs_max_features()

    """
    assert n_components in [2, 3, 4]
    N = len(self.df)
    if step > N:
        step = N

    # We start with at least 10 features
    X = range(10, N, step)

    from easydev import Progress
    pb = Progress(len(X))
    Y = []
    for i, x in enumerate(X):
        res = self.plot(n_components=n_components, max_features=x,
                        show_plot=False)
        Y.append(res)
        if progress:
            pb.animate(i + 1)

    sub = n_components
    pylab.subplot(sub, 1, 1)
    pylab.plot(X, [y[0] * 100 for y in Y])
    pylab.ylabel("PC1 (%)")
    pylab.subplot(sub, 1, 2)
    pylab.plot(X, [y[1] * 100 for y in Y])
    pylab.ylabel("PC2 (%)")
    if sub >= 3:
        pylab.subplot(sub, 1, 3)
        pylab.plot(X, [y[2] * 100 for y in Y])
        pylab.ylabel("PC3 (%)")
    if sub >= 4:
        pylab.subplot(sub, 1, 4)
        pylab.plot(X, [y[3] * 100 for y in Y])
        pylab.ylabel("PC4 (%)")

def filter(self, identifiers_list=[], min_bp=None, max_bp=None,
           progressbar=True, output_filename='filtered.fastq',
           remove=True):
    """Filter reads

    :param int min_bp: ignore reads with length shorter than min_bp
    :param int max_bp: ignore reads with length above max_bp

    """
    # 7 seconds without identifiers to scan the file
    # on a 750000 reads
    if min_bp is None:
        min_bp = 0
    if max_bp is None:
        max_bp = 1e9
    # make sure we are at the beginning
    self.rewind()

    output_filename, tozip = self._istozip(output_filename)

    with open(output_filename, "w") as fout:
        pb = Progress(self.n_reads)
        buf = ""
        filtered = 0

        for count, lines in enumerate(grouper(self._fileobj)):
            identifier = lines[0].split()[0]
            # lines are bytes; decode before comparing with the string identifiers
            if lines[0].split()[0].decode() in identifiers_list:
                filtered += 1
            else:
                N = len(lines[1])
                if N <= max_bp and N >= min_bp:
                    buf += "{}{}+\n{}".format(
                        lines[0].decode("utf-8"),
                        lines[1].decode("utf-8"),
                        lines[3].decode("utf-8"))
            if count % 100000 == 0:
                fout.write(buf)
                buf = ""
                if progressbar is True:
                    pb.animate(count + 1)
        fout.write(buf)
        if filtered < len(identifiers_list):
            print("\nWARNING: not all identifiers were found in the fastq file to "
                  "be filtered.")
    if tozip is True:
        self._gzip(output_filename)

def anova_one_drug(self, drug_id, animate=True, output='object'):
    """Computes ANOVA for a given drug across all features

    :param str drug_id: a valid drug identifier.
    :param animate: shows the progress bar
    :return: a dataframe

    Calls :meth:`anova_one_drug_one_feature` for each feature.
    """
    # some features can be dropped ??
    # drop first and second columns that are made of strings
    # works under python2 but not python 3. Assume that the 2 first
    # columns are the sample name and tissue feature
    # Then, we keep only cases with at least 3 features.
    # MSI could be used but is not like in original R code.
    features = self.features.df.copy()
    # need to skip the FACTOR to keep only features
    shift = self.features.shift

    features = features[features.columns[shift:]]
    # FIXME what about features with less than 3 zeros ?
    mask = features.sum(axis=0) >= 3

    # TODO: MSI, tissues, name must always be kept
    selected_features = features[features.columns[mask]]

    # scan all features for a given drug
    assert drug_id in self.ic50.df.columns
    N = len(selected_features.columns)
    pb = Progress(N, 10)
    res = {}

def fit(self, amp=1, progress=False, n_jobs=-1):
    r"""Loop over distributions and find best parameter to fit the data for each

    When a distribution is fitted onto the data, we populate a set of
    dataframes:

        - :attr:`df_errors` : sum of the square errors between the data and
          the fitted distribution i.e., :math:`\sum_i \left( Y_i - pdf(X_i) \right)^2`
        - :attr:`fitted_param` : the parameters that best fit the data
        - :attr:`fitted_pdf` : the PDF generated with the parameters that
          best fit the data

    Indices of the dataframes contains the name of the distribution.

    """
    import warnings

    warnings.filterwarnings("ignore", category=RuntimeWarning)

    if progress:
        self.pb = Progress(len(self.distributions))

    jobs = (
        delayed(self._fit_single_distribution)(dist, progress)
        for dist in self.distributions
    )
    pool = Parallel(n_jobs=n_jobs, backend="threading")
    _ = pool(jobs)
    self.df_errors = pd.DataFrame(
        {
            "sumsquare_error": self._fitted_errors,
            "aic": self._aic,
            "bic": self._bic,
            "kl_div": self._kldiv,
        }
    )

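# Hedged usage sketch (not from the source): assuming fit() belongs to a
# Fitter-like object holding the data and a list of candidate distribution
# names, the dataframes it populates can be inspected afterwards, e.g.:
#
#   f = Fitter(data, distributions=["gamma", "norm", "lognorm"])  # hypothetical setup
#   f.fit(progress=True)
#   f.df_errors.sort_values("sumsquare_error").head()   # distributions ranked by fit error
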
def _load_pathways(self, progress=True):
    # This is just loading all pathways once for all
    logger.info("loading all pathways from KEGG. may take time the first time")
    self.pathways = {}
    from easydev import Progress
    pb = Progress(len(self.kegg.pathwayIds))
    for i, ID in enumerate(self.kegg.pathwayIds):
        self.pathways[ID.replace("path:", "")] = self.kegg.parse(self.kegg.get(ID))
        if progress:
            pb.animate(i + 1)

    # Some cleanup
    for ID in self.pathways.keys():
        name = self.pathways[ID]['NAME'][0]
        self.pathways[ID]['NAME'] = name.split(" - ", 1)[0]

    # save gene sets
    self.gene_sets = {}
    for ID in self.pathways.keys():
        res = self.pathways[ID]
        if "GENE" in res.keys():
            results = []
            # some pathways report genes as a dictionary id:'gene name; description'
            # (e.g. eco); others report genes as a dictionary id:'description'
            for geneID, description in res['GENE'].items():
                if ";" in description:
                    name = description.split(';')[0]
                else:
                    name = geneID
                results.append(name)
            self.gene_sets[ID] = results
        else:
            print("SKIPPED (no genes) {}: {}".format(ID, res['NAME']))

    # save all pathways info
    self.df_pathways = pd.DataFrame(self.pathways).T
    del self.df_pathways["ENTRY"]
    del self.df_pathways["REFERENCE"]
    go = [x['GO'] if isinstance(x, dict) and 'GO' in x.keys() else None
          for x in self.df_pathways.DBLINKS]
    self.df_pathways['GO'] = go
    del self.df_pathways["DBLINKS"]

def filter_names_dmp_file(self, filename="names.dmp",
                          output="names_filtered.dmp", taxons=[]):
    all_taxons = set()
    pb = Progress(len(taxons))
    for i, taxon in enumerate(taxons):
        parents = self.get_family(taxon)
        all_taxons.update(parents)
        pb.animate(i + 1)
    print("")

    with open(filename, "r") as fin:
        with open(output, "w") as fout:
            for line in fin.readlines():
                if int(line.split("\t", 1)[0]) in all_taxons:
                    fout.write(line)

def select_random_reads(self, N=None, output_filename="random.fastq"):
    """Select random reads and save in a file

    :param int N: number of random unique reads to select
        should provide a number but a list can be used as well.
        You can select random reads for R1, and re-use the returned list as
        input for the R2 (since pairs must be kept)
    :param str output_filename:

    If you have a pair of files, the same reads must be selected in
    R1 and R2.::

        f1 = FastQ(file1)
        selection = f1.select_random_reads(N=1000)
        f2 = FastQ(file2)
        f2.select_random_reads(selection)

    """
    thisN = len(self)
    if isinstance(N, int):
        if N > thisN:
            N = thisN
        # create random set of reads to pick up
        cherries = list(range(thisN))
        np.random.shuffle(cherries)
        # cast to set for efficient iteration
        cherries = set(cherries[0:N])
    elif isinstance(N, set):
        cherries = N
    elif isinstance(N, list):
        cherries = set(N)

    fastq = pysam.FastxFile(self.filename)
    pb = Progress(thisN)  # since we scan the entire file
    with open(output_filename, "w") as fh:
        for i, read in enumerate(fastq):
            if i in cherries:
                fh.write(read.__str__() + "\n")
            else:
                pass
            pb.animate(i + 1)
    return cherries

def optimise_elastic_net(self, drug_name, feature_name, N=20, Nalpha=20):
    lwts = pylab.linspace(0, 1, N)
    alphas = pylab.linspace(0, 5, Nalpha)

    mses = np.zeros((N, Nalpha))

    pb = Progress(N)
    for i, lwt in enumerate(lwts):
        for j, alpha in enumerate(alphas):
            self.settings.regression_method = 'ElasticNet'
            self.settings.regression_alpha = alpha
            self.settings.regression_L1_wt = lwt
            odof = self.anova_one_drug_one_feature(drug_name, feature_name)
            anova = self._get_anova_summary(self.data_lm, output='dataframe')
            mses[i, j] = self.data_lm.bic
        pb.animate(i + 1)
    return mses

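# A minimal follow-up sketch (not part of the original code): since
# optimise_elastic_net() returns an N x Nalpha matrix of BIC values, the best
# (L1 weight, alpha) pair can be recovered with numpy alone.
import numpy as np

def best_elastic_net_params(mses, lwts, alphas):
    """Return the (l1_wt, alpha) pair minimising the BIC matrix `mses`."""
    i, j = np.unravel_index(np.argmin(mses), mses.shape)
    return lwts[i], alphas[j]
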
def volcano_plot_all_drugs(self):
    """Create a volcano plot for each drug and save in PNG files

    Each filename is set to **volcano_<drug identifier>.png**
    """
    drugs = list(self.df[self._colname_drugid].unique())
    pb = Progress(len(drugs), 1)
    for i, drug in enumerate(drugs):
        self.volcano_plot_one_drug(drug)
        self.savefig_and_js("volcano_%s.png" % drug, size_inches=(10, 10))
        pb.animate(i + 1)

        # This prevents memory leaks.
        self.current_fig.canvas.mpl_disconnect(self.cid)
        try:
            import mpld3
            mpld3.plugins.clear(self.current_fig)
        except:
            pass

def optimise_elastic_net(self, drug_name, feature_name, N=20, Nalpha=20):
    lwts = pylab.linspace(0, 1, N)
    alphas = pylab.linspace(0, 5, Nalpha)

    mses = np.zeros((N, Nalpha))

    pb = Progress(N)
    for i, lwt in enumerate(lwts):
        for j, alpha in enumerate(alphas):
            self.settings.regression_method = 'ElasticNet'
            self.settings.regression_alpha = alpha
            self.settings.regression_L1_wt = lwt
            odof = self.anova_one_drug_one_feature(drug_name, feature_name)
            anova = self._get_anova_summary(self.data_lm, output='dataframe')
            mses[i, j] = self.data_lm.bic
        pb.animate(i + 1)
    return mses

def volcano_plot_all_drugs(self):
    """Create a volcano plot for each drug and save in PNG files

    Each filename is set to **volcano_<drug identifier>.png**
    """
    drugs = list(self.df[self._colname_drugid].unique())
    pb = Progress(len(drugs), 1)
    for i, drug in enumerate(drugs):
        self.volcano_plot_one_drug(drug)
        self.savefig_and_js("volcano_%s.png" % drug, size_inches=(10, 10))
        pb.animate(i + 1)

        # This prevents memory leaks.
        self.current_fig.canvas.mpl_disconnect(self.cid)
        try:
            import mpld3
            mpld3.plugins.clear(self.current_fig)
        except:
            pass

def search_in_chemspider(self):
    # SB52334 --> SB-52334
    N = len(self.dd)
    pb = Progress(N)

    self.results = {}
    results = []
    for i, index in enumerate(self.dd.df.index):
        drug = self.dd.df.index[i]
        drug_name = self.dd.df.ix[drug].DRUG_NAME
        try:
            res = self.chemspider_find(drug_name)
        except:
            print(index, drug_name)
            res = []
        self.results[drug] = res
        pb.animate(i + 1)
        results.append(res)
    self.dd_filled.df['CHEMSPIDER_SEARCHED'] = results

def load_records(self, overwrite=False):
    """Load a flat file and store records in :attr:`records`

    Since version 0.8.3 we use NCBI that is updated more often than the
    ebi ftp according to their README.

    ftp://ncbi.nlm.nih.gov/pub/taxonomy/
    """
    self.download_taxonomic_file(overwrite=overwrite)
    self.records = {}

    # TODO: check if it exists otherwise, load it ?
    if os.path.exists(self.database) is False:
        self.load()

    with open(self.database) as f:
        data = f.read().strip()

    # This is fast. tried parse package, much slower. cost of progress bar
    # is not important.
    data = data.split("//\n")  # the sep is //\n

    self._child_match = re.compile(r'ID\s+\:\s*(\d+)\s*')
    self._parent_match = re.compile(r'PARENT ID\s+\:\s*(\d+)\s*')
    self._rank_match = re.compile(r'RANK\s+\:\s*([^\n]+)\s*')
    self._name_match = re.compile(r'SCIENTIFIC NAME\s+\:\s*([^\n]+)\s*')

    from easydev import Progress
    pb = Progress(len(data))
    logger.info('Loading all taxon records.')
    for i, record in enumerate(data[0:]):
        dd = {'raw': record}
        dd['id'] = int(self._child_match.search(record).group(1))
        dd['parent'] = int(self._parent_match.search(record).group(1))
        dd['scientific_name'] = self._name_match.search(record).group(1)
        dd['rank'] = self._rank_match.search(record).group(1)
        self.records[dd["id"]] = dd
        if self.verbose:
            pb.animate(i + 1)
    if self.verbose:
        print()

def run_methods(self):
    results = defaultdict(list)
    # We only test the methods common to all converters
    # (The intended use is with a list of converters all
    # having the same methods, but different input files)
    methods = set(self.converters[0].available_methods[:])  # a copy !
    for converter in self.converters[1:]:
        methods &= set(converter.available_methods[:])
    methods = sorted(methods)
    if self.include_dummy:
        methods += ['dummy']

    if self.to_include:
        methods = [x for x in methods if x in self.to_include]
    elif self.to_exclude:
        methods = [x for x in methods if x not in self.to_exclude]

    for method in methods:
        print("\nEvaluating method %s" % method)

        # key: converter.infile
        # value: list of times
        times = defaultdict(list)

        pb = Progress(self.N)
        for i in range(self.N):
            for converter in self.converters:
                with Timer(times[converter.infile]):
                    converter(method=method)
            pb.animate(i + 1)

        # Normalize times so that each converter has comparable times
        mean_time = gmean(np.fromiter(chain(*times.values()), dtype=float))
        # median of ratios to geometric mean (c.f. DESeq normalization)
        scales = {
            conv: np.median(np.asarray(conv_times) / mean_time)
            for conv, conv_times in times.items()
        }
        for (conv, conv_times) in times.items():
            scale = scales[conv]
            results[method].extend(
                [conv_time / scale for conv_time in conv_times])
    self.results = results

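# Toy, standalone illustration (an assumption, not taken from the converter
# code) of the normalisation used in run_methods(): each input file's times are
# scaled by the median of their ratios to the geometric mean of all times, so
# that runs on inputs of very different sizes become comparable.
import numpy as np
from scipy.stats import gmean

times = {"small.bam": [1.0, 1.2, 1.1], "large.bam": [10.0, 11.0, 9.5]}  # made-up timings
mean_time = gmean([t for ts in times.values() for t in ts])
scales = {name: np.median(np.asarray(ts) / mean_time) for name, ts in times.items()}
normalised = {name: [t / scales[name] for t in ts] for name, ts in times.items()}
print(scales)
print(normalised)
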
def volcano_plot_all_features(self):
    """Create a volcano plot for each feature and save in PNG files

    Each filename is set to **volcano_<feature name>.png**
    """
    features = list(self.df[self._colname_feature].unique())
    print('Creating image for each feature (using all drugs)')
    pb = Progress(len(features), 1)
    for i, feature in enumerate(features):
        self.volcano_plot_one_feature(feature)
        self.savefig_and_js("volcano_%s.png" % feature, size_inches=(10, 10))
        pb.animate(i + 1)

        # This prevents memory leaks.
        self.current_fig.canvas.mpl_disconnect(self.cid)
        try:
            import mpld3
            mpld3.plugins.clear(self.current_fig)
        except:
            pass

def load_records(self, overwrite=False):
    """Load a flat file and store records in :attr:`records`
    """
    self._load_flat_file(overwrite=overwrite)
    self.records = {}

    # TODO: check if it exists otherwise, load it ?
    if os.path.exists(self.filename) is False:
        self.load()

    with open(self.filename) as f:
        data = f.read().strip()

    data = data.split("//\n")  # the sep is //\n

    self._child_match = re.compile(r'ID\s+\:\s*(\d+)\s*')
    self._parent_match = re.compile(r'PARENT ID\s+\:\s*(\d+)\s*')
    self._rank_match = re.compile(r'RANK\s+\:\s*([^\n]+)\s*')
    self._name_match = re.compile(r'SCIENTIFIC NAME\s+\:\s*([^\n]+)\s*')

    from easydev import Progress
    pb = Progress(len(data))
    if self.verbose:
        print('Loading all taxon records.')
    for i, record in enumerate(data[0:]):
        # try/except increase comput. time by 5%
        try:
            dico = self._interpret_record(record)
            identifier = int(dico['id'])
            self.records[identifier] = dico
        except Exception as err:
            print(err)
            print('Could not parse the following record. '
                  'Please fill a bug report on http://github.com/biokit')
            print(record)

        if self.verbose:
            pb.animate(i + 1)
    if self.verbose:
        print()

def search_in_chemspider(self):
    # Fill results attribute as a dictionary. Keys being the drug id
    # and values are list of chemspider identifiers
    #
    # SB52334 --> SB-52334
    N = len(self.dd)
    pb = Progress(N)

    self.results = {}
    results = []
    for i, index in enumerate(self.dd.df.index):
        drug = self.dd.df.index[i]
        drug_name = self.dd.df.ix[drug].DRUG_NAME
        try:
            res = self._cs_find(drug_name)
        except:
            print("This drug index (%s) / drug name (%s) was not found" %
                  (index, drug_name))
            res = []
        self.results[drug] = res
        pb.animate(i + 1)
        results.append(res)
    self.dd_filled.df['CHEMSPIDER_SEARCHED'] = results

def process_paired_reads(paired_reader, modifiers1, modifiers2, filters,
                         n_progress=-1):
    """
    Loop over reads, find adapters, trim reads, apply modifiers and
    output modified reads.

    Return a Statistics object.
    """
    n = 0  # no. of processed reads
    total1_bp = 0
    total2_bp = 0

    if n_progress != -1:
        try:
            from easydev import Progress
            pb = Progress(n_progress)
            count = 0
        except:
            n_progress = -1

    for read1, read2 in paired_reader:
        n += 1
        total1_bp += len(read1.sequence)
        total2_bp += len(read2.sequence)
        for modifier in modifiers1:
            read1 = modifier(read1)
        for modifier in modifiers2:
            read2 = modifier(read2)
        for filter in filters:
            # Stop writing as soon as one of the filters was successful.
            if filter(read1, read2):
                break
        if n_progress != -1:
            count += 1
            pb.animate(count)

    return Statistics(n=n, total_bp1=total1_bp, total_bp2=total2_bp)

def check_randomness(self, drug_name, n_folds=10, N=10, show=True,
                     progress=False):
    scores = []
    pb = Progress(N)
    for i in range(N):
        # Fit a model using CV
        inter_results = self.runCV(drug_name, n_folds=n_folds, verbose=False)
        scores.append(inter_results.Rp)
        if progress:
            pb.animate(i + 1)

    random_scores = []
    pb = Progress(N)
    for i in range(N):
        # Fit a model using CV on the randomised response
        inter_results = self.runCV(drug_name, n_folds=n_folds,
                                   randomize_Y=True, verbose=False)
        random_scores.append(inter_results.Rp)
        if progress:
            pb.animate(i + 1)

    from scipy.stats import ttest_ind
    ttest_res = ttest_ind(scores, random_scores)
    results = {"scores": scores,
               "random_scores": random_scores,
               "ttest_pval": ttest_res.pvalue}

    # Compute the log of the Bayes factor to avoid underflow as communicated
    # by M.Menden.
    S = sum([s > r for s, r in zip(scores, random_scores)])
    proba = S / len(scores)
    if proba == 1:
        # Set the maximum instead of infinite
        # bayes_factor = np.inf
        bayes_factor = 1. / (1. / len(scores))
    else:
        bayes_factor = 1. / (1 - proba)
    results['bayes_factor'] = bayes_factor

    if show:
        M = max(max(scores), max(random_scores)) * 1.2
        bins = pylab.linspace(0, M, 40)
        pylab.clf()
        pylab.hist(scores, bins=bins, color="b", alpha=0.5)
        pylab.hist(random_scores, color="r", alpha=0.5, bins=bins)
        pylab.title("ttest=%(ttest_pval).3e, bayes=%(bayes_factor)s" % results)
        pylab.grid(True)

    return results

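# Small self-contained illustration of the Bayes-factor computation in
# check_randomness above (the score values are made up): the reported factor is
# 1 / (1 - proba), where proba is the fraction of CV scores beating their
# randomised counterpart, capped at len(scores) when every score wins.
scores = [0.4, 0.5, 0.6, 0.7]
random_scores = [0.1, 0.2, 0.65, 0.3]
S = sum(s > r for s, r in zip(scores, random_scores))   # 3 wins out of 4
proba = S / len(scores)                                  # 0.75
bayes_factor = 1. / (1 - proba) if proba < 1 else float(len(scores))
print(bayes_factor)                                      # 4.0
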
def test_progressbar():
    N = 2
    p = progressbar.progress_bar(N)
    for i in range(0, N):
        time.sleep(.1)
        p.animate(i + 1, i)

    p = progressbar.TextProgressBar(N, progressbar.consoleprint)
    for i in range(0, N):
        time.sleep(.1)
        p.animate(i + 1, i)

    p = Progress(100)
    p.animate(1)
    assert p.pb.interval == 1

    p = Progress(200)
    assert p.pb.interval == 2
    p.animate(1)

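# Minimal sketch of the easydev Progress pattern used throughout the snippets
# above (assumes easydev is installed): create the bar with the total number of
# iterations, then call animate() with a 1-based counter inside the loop.
import time
from easydev import Progress

N = 5
pb = Progress(N)
for i in range(N):
    time.sleep(0.01)      # placeholder for real work
    pb.animate(i + 1)
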