def create_html_drugs(self): """Create an HTML page for each drug""" # group by drugs all_drugs = list(self.df['DRUG_ID'].unique()) df = self.get_significant_set() groups = df.groupby('DRUG_ID') if self.verbose: print("Creating individual HTML pages for each drug") N = len(groups.indices.keys()) N = len(all_drugs) pb = Progress(N) for i, drug in enumerate(all_drugs): # enumerate(groups.indices.keys()): # get the indices and therefore subgroup if drug in groups.groups.keys(): subdf = groups.get_group(drug) else: subdf = {} html = HTMLOneDrug(self, self.df, subdf, drug) html.create_report(onweb=False) if self.settings.animate: pb.animate(i+1) if self.settings.animate: print("\n")
def _score_challengeA_bunch(self, filenames, subname): from easydev import Progress pb = Progress(5, 1) pb.animate(0) results = [] for i, filename in enumerate(filenames): res = self.score_challengeA(filename, subname + "_" + str(i + 1)) pb.animate(i + 1) results.append(res) aupr_score = -np.mean(np.log10([x["p_auroc"] for x in results])) auroc_score = -np.mean(np.log10([x["p_aupr"] for x in results])) score = (aupr_score + auroc_score) / 2.0 df = pd.TimeSeries() df["Overall Score"] = score df["AUPR score (pval)"] = aupr_score df["AUROC score (pval)"] = aupr_score for i in range(1, 6): df["AUPR Net %s" % i] = results[i - 1]["aupr"] for i in range(1, 6): df["AUROC Net %s" % i] = results[i - 1]["auroc"] return df
def process_single_reads(reader, modifiers, filters, n_progress=-1): """ Loop over reads, find adapters, trim reads, apply modifiers and output modified reads. Return a Statistics object. """ n = 0 # no. of processed reads total_bp = 0 if n_progress != -1: try: from easydev import Progress pb = Progress(n_progress) count = 0 except: n_progress = -1 for read in reader: n += 1 total_bp += len(read.sequence) for modifier in modifiers: read = modifier(read) for filter in filters: if filter(read): break if n_progress != -1: count += 1 pb.animate(count) return Statistics(n=n, total_bp1=total_bp, total_bp2=None)
def filling_chembl_pubchem_using_unichem(self): """ """ N = len(self.drug_ids) pb = Progress(N) for i,this in enumerate(self.drug_ids): entry = self.dd.df.ix[this] # if no information is provided, we will need to get it # from chemspider # From the database, when chembl is provided, it is unique # same for chemspider and pubchem and CAS select = entry[['CHEMSPIDER', 'CHEMBL', 'PUBCHEM']] if select.count() == 0: name = self.dd.df.ix[this].DRUG_NAME results = self._cs_find(name) if len(results) == 0: # nothing found pass elif len(results) == 1: self.dd_filled.df.ix[this].loc['CHEMSPIDER'] = results[0] else: # non unique #chemspider = ",".join([str(x) for x in results]) self.dd_filled.df.ix[this].loc['CHEMSPIDER'] = results pb.animate(i+1) # Search in chemspider systematically for i, this in enumerate(self.drug_ids): entry = self.dd.df.ix[this] if select.count() == 1: res = self._cs_find(drug) pb.animate(i+1)
def _get_G(self, gold): from easydev import Progress import scipy.sparse regulators = list(set(gold[0])) targets = list(set(gold[[0,1]].stack())) N, M = gold[0].max(), gold[1].max() ## A will store indices goind from 0 (not 1) to N-1 # hence the -1 indices when handling A if i,j are the # values of the gene A = np.zeros((N, M)) for row in gold[[0,1]].values: i, j = row A[i-1, j-1] = 1 A_sparse = scipy.sparse.csr_matrix(A) #N, M = len(regulators), len(targets) G = np.zeros((N, M)) pb = Progress(len(regulators), 1) for i, x in enumerate(regulators): for j, y in enumerate(targets): if A[x-1, y-1] == 1: G[x-1, y-1] = 1 elif x != y: G[x-1, y-1] = -1 pb.animate(i+1) return G
def _opt_ridge_lasso(self, drug_name, feature_name, method, alphas=None): if alphas is None: alphas = pylab.linspace(0,1, 20) mses = [] params = [] method_buf = self.settings.regression_method alpha_buf = self.settings.elastic_net.alpha pb = Progress(len(alphas)) for j, alpha in enumerate(alphas): self.settings.regression_method = method self.settings.elastic_net.alpha = alpha odof = self.anova_one_drug_one_feature(drug_name, feature_name) anova = self._get_anova_summary(self.data_lm, output='dataframe') #mses.append(anova.ix['Residuals']['Sum Sq']) mses.append(anova.ix['tissue']['F value']) #mses.append(anova['Sum Sq'].sum()) pb.animate(j+1) params.append(self.data_lm.params) self.settings.regression_method = method_buf self.settings.elastic_net.alpha = alpha_buf return alphas, mses, params
def diagnostics(self): """Return dataframe with information about the analysis """ n_drugs = len(self.ic50.drugIds) n_features = len(self.features.features) - self.features.shift n_combos = n_drugs * n_features feasible = 0 pb = Progress(n_drugs, 1) counter = 0 for drug in self.ic50.drugIds: for feature in self.features.features[self.features.shift:]: dd = self._get_one_drug_one_feature_data(drug, feature, diagnostic_only=True) if dd.status is True: feasible += 1 counter += 1 pb.animate(counter) results = { 'n_drug': n_drugs, 'n_combos': n_combos, 'feasible_tests': feasible, 'percentage_feasible_tests': float(feasible)/n_combos*100} return results
def compounds2accession(self, compounds): """For each compound, identifies the target and corresponding UniProt accession number This is not part of ChEMBL API :: # we recommend to use cache if you use this method regularly c = Chembl(cache=True) drugs = c.get_approved_drugs() # to speed up example drugs = drugs[0:20] IDs = [x['molecule_chembl_id] for x in drugs] c.compounds2accession(IDs) """ # we jump from compounds to targets through activities # Here this is a one to many mapping so we initialise a default # dictionary. from collections import defaultdict compound2target = defaultdict(set) filter = "molecule_chembl_id__in={}" from easydev import Progress pb = Progress(len(compounds)) for i in range(0, len(compounds)): # FIXME could get activities by bunch using # ",".join(compounds[i:i+10) for example activities = self.get_activity(filters=filter.format(compounds[i])) # get target ChEMBL IDs from activities for act in activities: compound2target[act['molecule_chembl_id']].add(act['target_chembl_id']) pb.animate(i+1) # What we need is to get targets for all targets found in the previous # step. For each compound/drug there are hundreds of targets though. And # we will call the get_target for each list of hundreds targets. This # will take forever. Instead, because there are *only* 12,000 targets, # let us download all of them ! This took about 4 minutes on this test but # if you use the cache, next time it will be much much quicker. This is # not down at the activities level because there are too many entries targets = self.get_target(limit=-1) # identifies all target chembl id to easily retrieve the entry later on target_names = [target['target_chembl_id'] for target in targets] # retrieve all uniprot accessions for all targets of each compound for compound, targs in compound2target.items(): accessions = set() for target in targs: index = target_names.index(target) accessions = accessions.union([comp['accession'] for comp in targets[index]['target_components']]) compound2target[compound] = accessions return compound2target
def dendogram_coefficients(self, stacked=False, show=True, cmap="terrain"): """ shows the coefficient of each optimised model for each drug """ drugids = self.drugIds from easydev import Progress pb = Progress(len(drugids)) d = {} for i, drug_name in enumerate(drugids): X, Y = self._get_one_drug_data(drug_name, randomize_Y=False) results = self.runCV(drug_name, verbose=False) df = pd.DataFrame({'name': X.columns, 'weight': results.coefficients}) df = df.set_index("name").sort_values("weight") d[drug_name] = df.copy() pb.animate(i+1) # use drugid to keep same order as in the data dfall = pd.concat([d[i] for i in drugids], axis=1) dfall.columns = drugids if show: from biokit import heatmap h = heatmap.Heatmap(dfall, cmap=cmap) h.plot(num=1,colorbar_position="top left") if stacked is True: dfall = dfall.stack().reset_index() dfall.columns = ["feature", "drug", "weight"] return dfall
def create_html_associations(self): """Create an HTML page for each significant association The name of the output HTML file is **<association id>.html** where association id is stored in :attr:`df`. """ print("\nCreating individual HTML pages for each association") df = self.get_significant_set() drugs = df['DRUG_ID'].values features = df['FEATURE'].values assocs = df['ASSOC_ID'].values fdrs = df['ANOVA_FEATURE_FDR'].values N = len(df) pb = Progress(N) html = Association(self, drug='dummy', feature='dummy', fdr='dummy') for i in range(N): html.drug = drugs[i] html.feature = features[i] html._filename = str(assocs[i]) + '.html' html.fdr = fdrs[i] html.assoc_id = assocs[i] html._init_report() # since we have one shared instance html.create_report(onweb=False) pb.animate(i+1)
def search_from_smile_inchembl(self): N = len(self.drug_ids) pb = Progress(N) self.results_chembl = {} self.results_chemspider = {} for i in range(0, N): drug = self.drug_ids[i] self.results_chembl[drug] = [] if self.results[drug]: for chemspider_id in self.results[drug]: chemspider_entry = self._cs_get(chemspider_id) self.results_chemspider[drug] = chemspider_entry smile = chemspider_entry['smiles'] # now search in chembl res_chembl = self.chembl.get_compounds_by_SMILES(smile) try: res_chembl['compounds'] self.results_chembl[drug].extend(res_chembl['compounds']) except: pass pb.animate(i+1)
def to_kmer_content(self, k=7): """Return a Series with kmer count across all reads :param int k: (default to 7-mers) :return: Pandas Series with index as kmer and values as count. Takes about 30 seconds on a million reads. """ # Counter is slow if we apply it on each read. # .count is slow as well import collections from sequana.kmer import get_kmer counter = collections.Counter() pb = Progress(len(self)) buffer_ = [] for i, this in enumerate(self): buffer_.extend(list(get_kmer(this['sequence'], k))) if len(buffer_) > 100000: counter += collections.Counter(buffer_) buffer_ = [] pb.animate(i) counter += collections.Counter(buffer_) ts = pd.Series(counter) ts.sort_values(inplace=True, ascending=False) return ts
def select_random_reads(self, N=None, output_filename="random.fasta"): """Select random reads and save in a file :param int N: number of random unique reads to select should provide a number but a list can be used as well. :param str output_filename: """ import numpy as np thisN = len(self) if isinstance(N, int): if N > thisN: N = thisN # create random set of reads to pick up cherries = list(range(thisN)) np.random.shuffle(cherries) # cast to set for efficient iteration cherries = set(cherries[0:N]) elif isinstance(N, set): cherries = N elif isinstance(N, list): cherries = set(N) fasta = FastxFile(self.filename) pb = Progress(thisN) # since we scan the entire file with open(output_filename, "w") as fh: for i, read in enumerate(fasta): if i in cherries: fh.write(read.__str__() + "\n") else: pass pb.animate(i+1) return cherries
def volcano_plot_all_drugs(self): """Create a volcano plot for each drug and save in PNG files Each filename is set to **volcano_<drug identifier>.png** """ drugs = list(self.df[self._colname_drugid].unique()) pb = Progress(len(drugs), 1) for i, drug in enumerate(drugs): self.volcano_plot_one_drug(drug) self.savefig("volcano_%s.png" % drug, size_inches=(10, 10)) pb.animate(i+1)
def _load_complexes(self, show_progress=True): from easydev import Progress import time pb = Progress(len(self.df.complexAC)) complexes = {} self.logging.info("Loading all details from the IntactComplex database") for i, identifier in enumerate(self.df.complexAC): res = self.webserv.details(identifier) complexes[identifier] = res if show_progress: pb.animate(i+1) self._complexes = complexes
def create_html_features(self): """Create an HTML page for each significant feature""" df = self.get_significant_set() groups = df.groupby('FEATURE') print("\nCreating individual HTML pages for each feature") N = len(groups.indices.keys()) pb = Progress(N) for i, feature in enumerate(groups.indices.keys()): # get the indices and therefore subgroup subdf = groups.get_group(feature) html = HTMLOneFeature(self, self.df, subdf, feature) html.create_report(onweb=False) pb.animate(i+1)
def volcano_plot_all_features(self): """Create a volcano plot for each feature and save in PNG files Each filename is set to **volcano_<feature name>.png** """ features = list(self.df[self._colname_feature].unique()) print('Creating image for each feature (using all drugs)') pb = Progress(len(features), 1) for i, feature in enumerate(features): self.volcano_plot_one_feature(feature) self.savefig("volcano_%s.png" % feature, size_inches=(10, 10)) pb.animate(i+1)
def check_ipython_notebook(): notebooks = glob.glob("*ipynb") N = len(notebooks) pb = Progress(N) for i,filename in enumerate(notebooks): print(purple(filename)) notebook = read(open(filename), 'json') r = NotebookRunner(notebook) r.run_notebook() pb.animate(i+1)
def check_randomness(self, drug_name, n_folds=10, N=10, show=True, progress=False): scores = [] pb = Progress(N) for i in range(N): # Fit a model using CV inter_results = self.runCV(drug_name, n_folds=n_folds, verbose=False) scores.append(inter_results.Rp) if progress: pb.animate(i+1) random_scores = [] pb = Progress(N) for i in range(N): # Fit a model using CV inter_results = self.runCV(drug_name, n_folds=n_folds, randomize_Y=True, verbose=False) random_scores.append(inter_results.Rp) if progress: pb.animate(i+1) from scipy.stats import ttest_ind ttest_res = ttest_ind(scores, random_scores) results = { "scores": scores, "random_scores": random_scores, "ttest_pval": ttest_res.pvalue} # Compute the log of the Bayes factor to avoid underflow as communicated # by M.Menden. S = sum([s>r for s,r in zip(scores, random_scores)]) proba = S / len(scores) if proba == 1: # Set the maximum instead of infinite # bayes_factor = np.inf bayes_factor = 1. / (1./len(scores)) else: bayes_factor = 1. / (1-proba) results['bayes_factor'] = bayes_factor if show: M = max(max(scores), max(random_scores)) * 1.2 bins = pylab.linspace(0, M, 40) pylab.clf() pylab.hist(scores, bins=bins, color="b", alpha=0.5) pylab.hist(random_scores, color="r", alpha=0.5, bins=bins) pylab.title("ttest=%(ttest_pval).3e, bayes=%(bayes_factor)s" % results) pylab.grid(True) return results
def plot_cindex(self, drug_name, alphas, l1_ratio=0.5, n_folds=10, hold=False): # This is longish (300 seconds with 10 folds and 80 alphas # for GDSC v5 data sets. from dreamtools.core.cindex import cindex CI_train = {} CI_test = {} for c in range(n_folds): CI_train[c] = [] CI_test[c] = [] from easydev import Progress pb = Progress(len(alphas)) for i, alpha in enumerate(alphas): self.elastic_net(drug_name, alpha=alpha, l1_ratio=l1_ratio, n_folds=n_folds) # Look at the first fold only for kf in range(n_folds): x_train = self.kfold_data['x_train'][kf].values y_train = self.kfold_data['y_train'][kf].values x_test = self.kfold_data['x_test'][kf].values y_test = self.kfold_data['y_test'][kf].values x_train_pred = self.en.predict(x_train) x_test_pred = self.en.predict(x_test) CI_test[kf].append(1-cindex(x_test_pred, y_test, [True]*len(y_test))) CI_train[kf].append(1-cindex(x_train_pred, y_train, [True] * len(y_train))) pb.animate(i) mu_train = pd.DataFrame(CI_train).transpose().mean() sigma_train = pd.DataFrame(CI_train).transpose().std() mu_test = pd.DataFrame(CI_test).transpose().mean() sigma_test = pd.DataFrame(CI_test).transpose().std() best_alpha = alphas[pd.DataFrame(CI_test).mean(axis=1).argmax()] pylab.clf() pylab.errorbar(pylab.log(alphas), mu_train, yerr=sigma_train, label="train") pylab.errorbar(pylab.log(alphas)+.1, mu_test, yerr=sigma_test, label="test") pylab.plot(pylab.log(alphas), mu_train, 'ob') pylab.plot(pylab.log(alphas)+.1, mu_train, 'or') pylab.legend() pylab.axvline(pylab.log(best_alpha), lw=2, color="purple") return best_alpha
def create_summary_pages(self): """Create summary pages Once the main analyis is done (:meth:`analyse`), and the company packages have been created (:meth:`create_data_packages_for_companies`), you can run this method that will creade a summary HTML page (index.html) for the tissue, and a similar summary HTML page for the tissues of each company. Finally, an HTML summary page for the companies is also created. The final tree direcorty looks like:: |-- index.html |-- company_packages | |-- index.html | |-- Company1 | | |-- Tissue1 | | |-- Tissue2 | | |-- index.html | |-- Company2 | | |-- Tissue1 | | |-- Tissue2 | | |-- index.html |-- tissue_packages | |-- index.html | |-- Tissue1 | |-- Tissue2 """ # First for the main directory (tissue_packages): print(purple("Creating summary index.html for the tissues")) self._create_summary_pages(self.main_directory, verbose=False) # Then for each companies: print(purple("Creating summary index.html for each company")) pb = Progress(len(self.companies)) for i, company in enumerate(self.companies): try: self._create_summary_pages(self.company_directory + os.sep + company, verbose=False, company=company) except Exception as err: print(red("Issue with %s. Continue with other companies" % company)) print(err) pb.animate(i+1) # Finally, an index towards each company self._create_main_index()
def filter(self, identifiers_list=[], min_bp=None, max_bp=None, progressbar=True, output_filename='filtered.fastq', remove=True): """Filter reads :param int min_bp: ignore reads with length shorter than min_bp :param int max_bp: ignore reads with length above max_bp """ # 7 seconds without identifiers to scan the file # on a 750000 reads if min_bp is None: min_bp = 0 if max_bp is None: max_bp = 1e9 # make sure we are at the beginning self.rewind() output_filename, tozip = self._istozip(output_filename) with open(output_filename, "w") as fout: pb = Progress(self.n_reads) buf = "" filtered = 0 for count, lines in enumerate(grouper(self._fileobj)): identifier = lines[0].split()[0] if lines[0].split()[0] in identifiers_list: filtered += 1 else: N = len(lines[1]) if N <= max_bp and N >= min_bp: buf += "{}{}+\n{}".format( lines[0].decode("utf-8"), lines[1].decode("utf-8"), lines[3].decode("utf-8")) if count % 100000 == 0: fout.write(buf) buf = "" if progressbar is True: pb.animate(count+1) fout.write(buf) if filtered < len(identifiers_list): print("\nWARNING: not all identifiers were found in the fastq file to " + "be filtered.") if tozip is True: self._gzip(output_filename)
def select_random_reads(self, N=None, output_filename="random.fastq"): """Select random reads and save in a file :param int N: number of random unique reads to select should provide a number but a list can be used as well. You can select random reads for R1, and re-use the returned list as input for the R2 (since pairs must be kept) :param str output_filename: If you have a pair of files, the same reads must be selected in R1 and R2.:: f1 = FastQ(file1) selection = f1.select_random_reads(N=1000) f2 = FastQ(file2) f2.select_random_reads(selection) """ thisN = len(self) if isinstance(N, int): if N > thisN: N = thisN # create random set of reads to pick up cherries = list(range(thisN)) np.random.shuffle(cherries) # cast to set for efficient iteration cherries = set(cherries[0:N]) elif isinstance(N, set): cherries = N elif isinstance(N, list): cherries = set(N) fastq = pysam.FastxFile(self.filename) pb = Progress(thisN) # since we scan the entire file with open(output_filename, "w") as fh: for i, read in enumerate(fastq): if i in cherries: fh.write(read.__str__() + "\n") else: pass pb.animate(i+1) return cherries
def volcano_plot_all_drugs(self): """Create a volcano plot for each drug and save in PNG files Each filename is set to **volcano_<drug identifier>.png** """ drugs = list(self.df[self._colname_drugid].unique()) pb = Progress(len(drugs), 1) for i, drug in enumerate(drugs): self.volcano_plot_one_drug(drug) self.savefig_and_js("volcano_%s.png" % drug, size_inches=(10, 10)) pb.animate(i+1) # This prevent memory leak. self.current_fig.canvas.mpl_disconnect(self.cid) try: import mpld3 mpld3.plugins.clear(self.current_fig) except: pass
def optimise_elastic_net(self, drug_name, feature_name, N=20, Nalpha=20): lwts = pylab.linspace(0, 1, N) alphas = pylab.linspace(0, 5, Nalpha) mses = np.zeros((N, Nalpha)) pb = Progress(N) for i, lwt in enumerate(lwts): for j, alpha in enumerate(alphas): self.settings.regression_method = 'ElasticNet' self.settings.regression_alpha = alpha self.settings.regression_L1_wt = lwt odof = self.anova_one_drug_one_feature(drug_name, feature_name) anova = self._get_anova_summary(self.data_lm, output='dataframe') mses[i,j] = self.data_lm.bic pb.animate(i+1) return mses
def volcano_plot_all_features(self): """Create a volcano plot for each feature and save in PNG files Each filename is set to **volcano_<feature name>.png** """ features = list(self.df[self._colname_feature].unique()) print('Creating image for each feature (using all drugs)') pb = Progress(len(features), 1) for i, feature in enumerate(features): self.volcano_plot_one_feature(feature) self.savefig_and_js("volcano_%s.png" % feature, size_inches=(10, 10)) pb.animate(i+1) # This prevent memory leak. self.current_fig.canvas.mpl_disconnect(self.cid) try: import mpld3 mpld3.plugins.clear(self.current_fig) except: pass
def test_progressbar(): N = 2 p = progressbar.progress_bar(N) for i in range(0,N): time.sleep(.1) p.animate(i+1, i) p = progressbar.TextProgressBar(N, progressbar.consoleprint) for i in range(0,N): time.sleep(.1) p.animate(i+1, i) p = Progress(100) p.animate(1) assert p.pb.interval == 1 p = Progress(200) assert p.pb.interval == 2 p.animate(1)
def load_records(self, overwrite=False): """Load a flat file and store records in :attr:`records` """ self._load_flat_file(overwrite=overwrite) self.records = {} # TODO: check if it exists otherwise, load it ? if os.path.exists(self.filename) is False: self.load() with open(self.filename) as f: data = f.read().strip() data = data.split("//\n") # the sep is //\n self._child_match = re.compile('ID\s+\:\s*(\d+)\s*') self._parent_match = re.compile('PARENT ID\s+\:\s*(\d+)\s*') self._rank_match = re.compile('RANK\s+\:\s*([^\n]+)\s*') self._name_match = re.compile('SCIENTIFIC NAME\s+\:\s*([^\n]+)\s*') from easydev import Progress pb = Progress(len(data)) if self.verbose: print('Loading all taxon records.') for i, record in enumerate(data[0:]): # try/except increase comput. time by 5% try: dico = self._interpret_record(record) identifier = int(dico['id']) self.records[identifier] = dico except Exception as err: print(err) print('Could not parse the following record ' + \ 'Please fill bug report on http://github.com/biokit') print(record) if self.verbose: pb.animate(i+1) if self.verbose: print()
def search_in_chemspider(self): # Fill results attribute as a dictionary. Keys being the drug id # and values are list of chemspider identifiers # # SB52334 --> SB-52334 N = len(self.dd) pb = Progress(N) self.results = {} results = [] for i, index in enumerate(self.dd.df.index): drug = self.dd.df.index[i] drug_name = self.dd.df.ix[drug].DRUG_NAME try: res = self._cs_find(drug_name) except: print("This drug index (%s) / drug name (%s) was not found" % (index, drug_name)) res = [] self.results[drug] = res pb.animate(i+1) results.append(res) self.dd_filled.df['CHEMSPIDER_SEARCHED'] = results
def process_paired_reads(paired_reader, modifiers1, modifiers2, filters, n_progress=-1): """ Loop over reads, find adapters, trim reads, apply modifiers and output modified reads. Return a Statistics object. """ n = 0 # no. of processed reads total1_bp = 0 total2_bp = 0 if n_progress != -1: try: from easydev import Progress pb = Progress(n_progress) count = 0 except: n_progress = -1 for read1, read2 in paired_reader: n += 1 total1_bp += len(read1.sequence) total2_bp += len(read2.sequence) for modifier in modifiers1: read1 = modifier(read1) for modifier in modifiers2: read2 = modifier(read2) for filter in filters: # Stop writing as soon as one of the filters was successful. if filter(read1, read2): break if n_progress != -1: count += 1 pb.animate(count) return Statistics(n=n, total_bp1=total1_bp, total_bp2=total2_bp)
def bam_to_mapped_unmapped_fastq(filename, output_directory=None, verbose=True): """Create mapped and unmapped fastq files from a BAM file :context: given a reference, one or two FastQ files are mapped onto the reference to generate a BAM file. This BAM file is a compressed version of a SAM file, which interpretation should be eased within this function. :param filename: input BAM file :param output_directory: where to save the mapped and unmapped files :return: dictionary with number of reads for each file (mapped/unmapped for R1/R2) as well as the mode (paired or not), the number of unpaired reads, and the number of duplicated reads. The unpaired reads should be zero (sanity check) Given a BAM file, create FASTQ with R1/R2 reads mapped and unmapped. In the paired-end case, 4 files are created. Note that this function is efficient in that it does not create intermediate files limiting IO in the process. As compared to standard tools such as bedtools bamtofastq, it is 1.5 to 2X slower but it does create the mapped AND unmapped reads. :Details: Secondary alignment (flag 256) are dropped so as to remove any ambiguous alignments. The output dictionary stores "secondary" key to keep track of the total number of secondary reads that are dropped. If the flag is 256 and the read is unpaired, the key *unpaired* is also incremented. If the flag is not equal to 256, we first reverse complement reads that are tagged as *reverse* in the BAM file. Then, reads that are not paired or not "proper pair" (neither flag 4 nor flag 8) are ignored. If R1 is mapped **or** R2 is mapped then the reads are considered mapped. If both R1 and R2 are unmapped, then reads are unmapped. .. note:: about chimeric alignment: one is the representative and the other is the supplementary. This flag is not used in this function. Note also that chimeric alignment have same QNAME and flag 4 and 8 .. note:: the contamination reported is basde on R1 only. .. todo:: comments are missing since there are not stored in the BAM file. .. note:: the mapped reads may not be synchronized because we include also the chimeric alignment (cf samtools documentation). However, total reads = unmappeds reads + R1 mapped + R2 mapped - supplementary reads (those with flag 2048). """ bam = BAM(filename) # figure out if this is paired or unpaired newname, ext = os.path.splitext(filename) import collections stats = collections.defaultdict(int) stats['R1_unmapped'] = 0 stats['R1_mapped'] = 0 # figure out where to save the file if output_directory is None: pass else: assert isinstance(filename, str) from sequana.snaketools import FileFactory ff = FileFactory(filename) newname = output_directory + os.sep + ff.filenames[0] rt1 = "_R1_" rt2 = "_R2_" R1_mapped = open(newname + "{}.mapped.fastq".format(rt1), "wb") R1_unmapped = open(newname + "{}.unmapped.fastq".format(rt1), "wb") stats['duplicated'] = 0 stats['unpaired'] = 0 unpaired = 0 # if paired, let open other files if bam.is_paired: stats['mode'] = "pe" stats['R2_unmapped'] = 0 stats['R2_mapped'] = 0 R2_mapped = open(newname + "{}.mapped.fastq".format(rt2), "wb") R2_unmapped = open(newname + "{}.unmapped.fastq".format(rt2), "wb") else: stats['mode'] = "se" # loop through the BAM (make sure it is rewinded) bam.reset() if verbose: from easydev import Progress pb = Progress(len(bam)) for i, this in enumerate(bam): if this.flag & 256: # Unmapped reads are in the BAM file but have no valid assigned # position (N.B., they may have an assigned position, but it should be ignored). 
# It's typically the case that a number of reads can't be aligned, due to things # like sequencing errors, imperfect matches between the DNA sequenced and the # reference, random e. coli or other contamination, etc.. # A secondary alignment occurs when a given read could align reasonably well to # more than one place. One of the possible reported alignments is termed "primary" # and the others will be marked as "secondary". stats['secondary'] += 1 if this.is_paired is False: stats['unpaired'] += 1 else: # quick hack if this.is_read1: suffix = b"/1" else: suffix = b"/2" # in pysam, seq is a string and qual a bytes.... if this.is_reverse is True: txt = b"@" + bytes(this.qname, "utf-8") + suffix + b"\n" revcomp = reverse_complement(this.seq) txt += bytes(revcomp, "utf-8") + b"\n" txt += b"+\n" txt += bytes(this.qual[::-1], 'utf-8') + b"\n" else: txt = b"@" + bytes(this.qname, "utf-8") + suffix + b"\n" txt += bytes(this.seq, "utf-8") + b"\n" txt += b"+\n" txt += bytes(this.qual, "utf-8") + b"\n" # Here, we must be careful as to keep the pairs. So if R1 is mapped # but R2 is unmapped (or the inverse), then the pair is mapped if this.is_read1: if this.is_unmapped and this.mate_is_unmapped: R1_unmapped.write(txt) stats['R1_unmapped'] += 1 else: R1_mapped.write(txt) stats['R1_mapped'] += 1 elif this.is_read2: if this.is_unmapped and this.mate_is_unmapped: R2_unmapped.write(txt) stats['R2_unmapped'] += 1 else: R2_mapped.write(txt) stats['R2_mapped'] += 1 else: # This should be a single read #assert self.is_paired is False stats['unpaired'] += 1 if this.is_unmapped: R1_unmapped.write(txt) stats['R1_unmapped'] += 1 else: R1_mapped.write(txt) stats['R1_mapped'] += 1 if this.is_duplicate: stats['duplicated'] += 1 if verbose: pb.animate(i + 1) if bam.is_paired: R2_mapped.close() R2_unmapped.close() logger.info("\nNumber of entries in the BAM: %s" % str(i + 1)) R1_mapped.close() R1_unmapped.close() _x = stats['R1_mapped'] _y = stats['R1_unmapped'] stats["contamination"] = _x / float(_x + _y) * 100 return stats
def create_data_packages_for_companies(self, companies=None): ########################################################## #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# # # # DRUG_DECODE and IC50 inputs must be filtered to keep # # only WEBRELEASE=Y and owner # # # #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# ########################################################## if isinstance(companies, str): companies = [companies] if companies is None: companies = self.companies Ncomp = len(companies) for ii, company in enumerate(companies): print("\n\n========= Analysing company %s out of %s (%s)" % (ii + 1, Ncomp, company)) self.mkdir(company) for gf_filename in sorted(self.gf_filenames): tcga = gf_filename.split("_")[1].split('.')[0] print("---------------- for TCGA %s" % tcga) # Read the results previously computed try: results_df = self.results[tcga].df.copy() except: results_path = "ALL/%s/OUTPUT/results.csv" % tcga print("Downloading results from %s" % results_path) results_df = ANOVAResults(results_path) results = ANOVAResults(results_df) # Get a DrugDecode for that company drug_decode_company = self.drug_decode.df.query( "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company) # Transform into a proper DrugDecode class for safety drug_decode_company = DrugDecode(drug_decode_company) # filter results using the new drug decode drug_ids_in_results = get_drug_id(results.df.DRUG_ID) mask = [ True if x in drug_decode_company.df.index else False for x in drug_ids_in_results ] results.df = results.df.ix[mask] # Just to create an instance with the subset of drug_decode # and correct settings. This is also used to store # the entire input data set. So, we must remove all drugs # not relevant for the analysis of this company an = ANOVA(self.ic50_filename, gf_filename, drug_decode_company) def drug_to_keep(drug): to_keep = get_drug_id(drug) in drug_decode_company.df.index return to_keep an.ic50.df = an.ic50.df.select(drug_to_keep, axis=1) an.settings = ANOVASettings(**self.settings) an.init() an.settings.directory = company + os.sep + tcga an.settings.analysis_type = tcga self.report = ANOVAReport(an, results) self.report.settings.analysis_type = tcga self.report.create_html_main(False) self.report.create_html_manova(False) if self.debug is False: self.report.create_html_features() self.report.create_html_associations() # For now, we just copy all DRUG images from # the analysis made in ALL from easydev import shellcmd, Progress print("\nCopying drug files") drug_ids = results.df.DRUG_ID.unique() pb = Progress(len(drug_ids)) for i, drug_id in enumerate(drug_ids): # copy the HTML filename = "%s.html" % drug_id source = "ALL%s%s%s" % (os.sep, tcga, os.sep) dest = "%s%s%s%s" % (company, os.sep, tcga, os.sep) cmd = "cp %s%s %s" % (source, filename, dest) shellcmd(cmd, verbose=False) #copy the images filename = "volcano_%s.*" % drug_id source = "ALL%s%s%simages%s" % (os.sep, tcga, os.sep, os.sep) dest = "%s%s%s%simages%s" % (company, os.sep, tcga, os.sep, os.sep) cmd = "cp %s%s %s" % (source, filename, dest) shellcmd(cmd, verbose=False) pb.animate(i + 1)
class MultiProcessing(object): """Class to run jobs in an asynchronous manner. You would use this class to run several jobs on a local computer that has several cpus. :: t = MultiProcessing(maxcpu=2) t.add_job(func, func_args) t.run() t.results[0] # contain returned object from the function *func*. .. warning:: the function must be a function, not a method. This is inherent to multiprocess in the multiprocessing module. .. warning:: the order in the results list may not be the same as the list of jobs. see :meth:`run` for details """ def __init__(self, maxcpu=None, verbose=False, progress=True): """ :param maxcpu: default returned by multiprocessing.cpu_count() :param verbose: print the output of each job. Could be very verbose so we advice to keep it False. :param progress: shows the progress """ if maxcpu == None: maxcpu = cpu_count() self.maxcpu = maxcpu self.reset() self.verbose = verbose self.progress = progress def reset(self): """remove joves and results""" self.jobs = [] # a list of processes self.results = Queue() # the results to append def add_job(self, func, *args, **kargs): """add a job in the pool""" if self.verbose: print("Adding jobs in the queue..", ) t = Process(target=func, args=args, kwargs=kargs) self.jobs.append(t) def _cb(self, results): if self.verbose is True: print("callback", results) if self.progress is True: self.pb.animate(len(self.results) + 1) self.results.append(results) def run(self, delay=0.1, verbose=True): """Run all the jobs in the Pool until all have finished. Jobs that have been added to the job list in :meth:`add_job` are now processed in this method by using a Pool. Here, we add all jobs using the apply_async method from multiprocess module. In order to ensure that the jobs are run sequentially in the same order as in :attr:`jobs`, we introduce a delay between 2 calls to apply_async (see http://docs.python.org/2/library/multiprocessing.html) A better way may be t use a Manager but for now, this works. """ from easydev import Progress if self.progress is True: self.pb = Progress(len(self.jobs), 1) self.pb.animate(0) def init_worker(): import signal signal.signal(signal.SIGINT, signal.SIG_IGN) self.results = [] self.pool = Pool(self.maxcpu, init_worker) for process in self.jobs: self.pool.apply_async(process._target, process._args, process._kwargs, callback=self._cb) # ensure the results have same order as jobs # maybe important if you expect the order of the results to # be the same as inut; otherwise set delay to 0 time.sleep(delay) try: while True: time.sleep(1) # check if all processes are finished. # if so, finished. count = len(self.results) if count == len(self.jobs): break except KeyboardInterrupt: print( "\nCaught interruption. " + "Terminating the Pool of processes... ", ) self.pool.terminate() self.pool.join() print("... done") else: # Closing properly the pool self.pool.close() self.pool.join() # Pool cannot be pickled. So, if we want to pickel "MultiProcessing" # class itself, we must desctroy this instance del self.pool self.finished = True
def download_fasta(self, filelist, output_dir=None, from_ena=True): """Download a FASTA (or list of) :param filelist: a name to find on the ENA web server OR the name of an accession number. .. warning:: The filename is named after the accession without .X number If there are several variant .1, .2 the later will be used. This should not happen if the list is properly defined. """ from bioservices import ENA if filelist.endswith(".txt") and os.path.exists(filelist) is False: logger.info( "Downloading list from http://www.ebi.ac.uk/genomes/%s" % filelist) data = urlopen("http://www.ebi.ac.uk/genomes/%s" % filelist).readlines() identifiers = [x.strip().decode() for x in data] elif filelist == "mus_musculus": #19 +x+y chromosomes + 5 mitochondrion # could also add strain C57BL. identifiers = [ "AY172335", "CM000209", "CM000210", "CM000211" "CM000212", "CM000213", "CM000214", "CM000215", "CM000216" "CM000217", "CM000218", "CM000219", "CM000220", "CM000221" "CM000222", "CM000223", "CM000224", "CM000225", "CM000226" "CM000227", "CM000228", "CM000229", "CM000225", "CM000226" "EF108342", "AB042432", "AY675564", "DQ874614" ] elif filelist == "worms": # Caernorhabditis briggsae and elegans identifiers = [ "AC186293", "FR847112", "FR847113", "FR847114", "FR847118", "FR847121", "FR847123", "BX284601", "BX284602", "BX284603", "BX284604", "BX284605", "BX284606" ] elif isinstance(filelist, str) and filelist in self._metadata.keys(): name = self._metadata[filelist][0] logger.info( "Downloading list from http://www.ebi.ac.uk/genomes/%s" % name) data = urlopen("http://www.ebi.ac.uk/genomes/%s" % name).readlines() identifiers = [x.strip().decode() for x in data] elif isinstance(filelist, list): identifiers = filelist[:] elif isinstance(filelist, str): # could be a single identifier or a filename (assuming a single # column) if os.path.exists(filelist): identifiers = [x for x in open(filelist).read().split()] identifiers = [x.strip() for x in identifiers] else: identifiers = [filelist] self._identifiers = identifiers self.results = self.ena_id_to_gi_number(identifiers) # do not use caching things this could be huge data sets. ena = ENA() if output_dir is None: output_dir = "." else: try: os.mkdir(output_dir) except: pass N = len(identifiers) pb = Progress(N) logger.info("Fetching all fasta from ENA") for i, identifier in enumerate(identifiers): filenames = glob.glob(output_dir + os.sep + "ENA_%s*" % identifier) if len(filenames) >= 1: pb.animate(i + 1) # no need to fetch and save the data it looks like... continue # download data from ENA data = ena.get_data(identifier, "fasta") # Split header and Fasta header, others = data.decode().split("\n", 1) # Source of failure: # - list and DB are not synchrone: e.g. some entries may be deleted if "suppressed" in header: continue if ">" not in header: continue # Do not use try/except since when it fails, this is a real issue name = header.strip(">").split(" ")[0] db, id_, acc = name.split("|") try: header = self.switch_header_to_gi(acc) except: logger.error("Failed for this entry:") logger.error(identifier) logger.error(header) logger.error(name) continue # Save to local file # WARNINGS: extension is .fa because kraken-build expects .fa files filename = "%s_%s.fa" % (db, acc.split(".")[0]) if output_dir: filename = output_dir + os.sep + filename with open(filename, "w") as fout: fout.write(header + "\n" + others) pb.animate(i + 1)
def filter(self, identifiers_list=[], min_bp=None, max_bp=None, progressbar=True, output_filename='filtered.fastq'): """Save reads in a new file if there are not in the identifier_list :param int min_bp: ignore reads with length shorter than min_bp :param int max_bp: ignore reads with length above max_bp """ # 7 seconds without identifiers to scan the file # on a 750000 reads if min_bp is None: min_bp = 0 if max_bp is None: max_bp = 1e9 # make sure we are at the beginning self.rewind() output_filename, tozip = self._istozip(output_filename) with open(output_filename, "w") as fout: pb = Progress(self.n_reads) buf = "" filtered = 0 saved = 0 for count, lines in enumerate(grouper(self._fileobj)): identifier = lines[0].split()[0] if lines[0].split()[0].decode() in identifiers_list: filtered += 1 else: #pragma: no cover N = len(lines[1]) if N <= max_bp and N >= min_bp: buf += "{}{}+\n{}".format(lines[0].decode("utf-8"), lines[1].decode("utf-8"), lines[3].decode("utf-8")) saved += 1 else: filtered += 1 if count % 100000 == 0: fout.write(buf) buf = "" if progressbar is True: pb.animate(count + 1) fout.write(buf) if filtered < len(identifiers_list): #pragma: no cover print( "\nWARNING: not all identifiers were found in the fastq file to " + "be filtered.") logger.info("\n{} reads were filtered out and {} saved in {}".format( filtered, saved, output_filename)) if tozip is True: #pragma: no cover logger.info("Compressing file") self._gzip(output_filename)
def _get_data(self, name, params): # keep the number of events we want and original offset max_data = params['limit'] offset = params['offset'] # I noticed that # if offset + limit > total_count, then limit is set to 1000 - offset # Not sure whether it is a bug or intended behaviour but this caused # some issues during the debugging. # So http_get("mechanism?format=json&limit=10000&offset=10") # returns 990 entries and not 1000 as expected. # if a resources is small (e.g. tissue has 655 < 1000 entries) there is # no such issues. # So, the best is to constraint limit to 1000 params['limit'] = 1000 # for the first call # The limit used in all other calls limit = 1000 res = self.http_get("{}".format(name), params=params) self._check_request(res) # get rid of page_meta key/value self.page_meta = res['page_meta'] keys = list(res.keys()) keys.remove('page_meta') names = keys[0] # the parameter name in plural form # keep first chunk of data data = res[names] if max_data == -1: max_data = res['page_meta']['total_count'] elif max_data > res['page_meta']['total_count']: max_data = res['page_meta']['total_count'] N = max_data from easydev import Progress pb = Progress(N) count = 1 while res["page_meta"]['next'] and len(data) < max_data: params['limit'] = limit params['offset'] = limit * count + offset res = self.http_get("{}".format(name), params=params) data += res[names] count += 1 pb.animate(len(data)) self.page_meta = res['page_meta'] if self.page_meta['next']: offset = self.page_meta['offset'] total = self.page_meta['total_count'] - len(data) - int(offset) self.logging.warning( 'More data available ({}). rerun with higher' 'limit and/or offset {}. Check content of page_meta' ' attribute'.format(total, offset)) if len(data) > max_data: return data[0:max_data] else: return data
def get_feature_counts_eukaryotes(self, feature=None, attribute=None): if feature is None: feature = "gene" if attribute is None: attribute = "ID" # just to not loose the original df = self.df.copy() # Name contains the salmon entries read from gffread that uses # transcript_id. From this transcript id, we get the gene (parent) df['Gene'] = [self.trs2genes[x] for x in self.df.Name] #groups = df.groupby('Gene').groups counts_on_genes = df.groupby('Gene').NumReads.sum() ff = self.filename.split("/")[-1] results = f"\nGeneid\tChr\tStart\tEnd\tStrand\tLength\t{ff}" # mouse 25814 gene (feature) # 53715 gene_id (attribute) # 135181 transcript_id (attribute) # 133618 transcript_id from salmon # 135181 entries in transcript fasta (gffread) # gffread extact transcript_id from the gff if present # otherwise, extract geneID or gene_id logger.info("Recreating the feature counts") genes = {} dd = self.gff.df.query("ID in @counts_on_genes.index") dd = dd.set_index("ID") dd = dd.loc[counts_on_genes.index] self.dd = dd types = dd['type'].values starts = dd['start'].values stops = dd['stop'].values strands = dd['strand'].values seqids = dd['seqid'].values from easydev import Progress pb = Progress(len(counts_on_genes)) S = 0 logger.info("Grouping") TPMgroup = df.groupby('Gene').apply(lambda group: group['TPM'].sum()) efflength_null = df.groupby('Gene').apply( lambda group: group['EffectiveLength'].mean()) groups = df.groupby('Gene') for i, name in enumerate(counts_on_genes.index): # Since we use ID, there should be only one hit. we select the first # one to convert to a Series tpm_sum = TPMgroup.loc[name] if tpm_sum == 0: length = efflength_null.loc[name] else: abundances = groups.get_group(name).TPM efflength = groups.get_group(name).EffectiveLength length = sum([x * y for x, y in zip(abundances, efflength) ]) / abundances.sum() S += abundances.sum() # FIXME we keep only types 'gene' to agree with output of # start/bowtie when working on the gene feature. What would happen # to compare salmon wit other type of features ? if types[i] == "gene": start = starts[i] stop = stops[i] seqid = seqids[i] strand = strands[i] NumReads = counts_on_genes.loc[name] length = length name = name.replace("gene:", "") results += f"\n{name}\t{seqid}\t{start}\t{stop}\t{strand}\t{length}\t{NumReads}" else: pass pb.animate(i) return results """
def check_randomness(self, drug_name, kfolds=10, N=10, progress=False, nbins=40, show=True, **kargs): """Compute Bayes factor between NULL model and best model fitted N times :param drug_name: :param kfolds: :param int N: optimise NULL models and real model N times :param progress: :param nbins: :param show: Bayes factor:: S = sum([s>r for s,r in zip(scores, random_scores)]) proba = S / len(scores) bayes_factor = 1. / (1-proba) Interpretation for values of the Bayes factor according to Kass and Raftery (1995). ============================ ================== Interpretation B(1,2) ============================ ================== Very strong support for 1 < 0.0067 Strong support 1 0.0067 to 0.05 Positive support for 1 0.05 to .33 Weak support for 1 0.33 to 1 No support for either model 1 Weak support for 2 1 to 3 Positive support for 2 3 to 20 Strong support for 2 20 to 150 Very strong support for 2 > 150 ============================ ================== references: http://www.socsci.uci.edu/~mdlee/LodewyckxEtAl2009.pdf http://www.aarondefazio.com/adefazio-bayesfactor-guide.pdf """ scores = [] pb = Progress(N) for i in range(N): # Fit a model using CV inter_results = self.runCV(drug_name, kfolds=kfolds, verbose=False, **kargs) scores.append(inter_results.Rp) if progress: pb.animate(i + 1) random_scores = [] pb = Progress(N) for i in range(N): # Fit a model using CV inter_results = self.runCV(drug_name, kfolds=kfolds, randomize_Y=True, verbose=False, **kargs) random_scores.append(inter_results.Rp) if progress: pb.animate(i + 1) from scipy.stats import ttest_ind ttest_res = ttest_ind(scores, random_scores) results = { "scores": scores, "random_scores": random_scores, "ttest_pval": ttest_res.pvalue } # Compute the log of the Bayes factor to avoid underflow as communicated # by M.Menden. S = sum([s > r for s, r in zip(scores, random_scores)]) proba = S / len(scores) if proba == 1: # Set the maximum instead of infinite # bayes_factor = np.inf bayes_factor = 1. / (1. / len(scores)) else: bayes_factor = 1. / (1 - proba) results['bayes_factor'] = bayes_factor M = max(max(scores), max(random_scores)) * 1.2 m = min(min(scores), min(random_scores)) * 1.2 if show: bins = pylab.linspace(m, M, nbins) pylab.clf() pylab.hist(scores, bins=bins, color="b", alpha=0.5) pylab.hist(random_scores, color="r", alpha=0.5, bins=bins) pylab.title("Bayes factor=%(bayes_factor).2f" % results) pylab.grid(True) pylab.xlabel("Coefficient of correlation Rp") pylab.xlabel("#") return results
def anova_all(self, animate=True, drugs=None, multicore=None): """Run all ANOVA tests for all drugs and all features. :param drugs: you may select a subset of drugs :param animate: shows the progress bar :return: an :class:`~gdsctools.anova_results.ANOVAResults` instance with the dataframe stored in an attribute called **df** Calls :meth:`anova_one_drug` for each drug and concatenate all results together. Note that once all data are gathered, :meth:`add_pvalues_correction` is called to fill a new column with FDR corrections. An extra column named "ASSOC_ID" is also added with a unique identifer sorted by ascending FDR. .. note:: A thorough comparison with version v17 gives the same FDR results (difference ~1e-6); Note however that the qvalue results differ by about 0.3% due to different smoothing in R and Python. """ if self.verbose and len(self.individual_anova): print("Reusing some results from the buffer. " "To reset the buffer, call reset_buffer() method") # drop DRUG where number of IC50 (non-null) is below 5 # axis=0 is default but we emphasize that sum is over # column (i.e. drug vv = (self.ic50.df.isnull() == False).sum(axis=0) # FIXME: should be in one_drug_one_feature ?? drug_names = vv.index[vv >= self.settings.minimum_nonna_ic50] # if user provided a list of drugs, use them: if drugs is not None: # todo: check valifity of the drug names drug_names = drugs[:] pb = Progress(len(drug_names), 1) drug_names = list(drug_names) #pylab.shuffle(drug_names) # ? why if animate is True: pb.animate(0) if multicore: # Note that here, we do not use the buffer multicore_analysis(self, drug_names, multicore) else: for i, drug_name in enumerate(drug_names): if drug_name in self.individual_anova.keys(): pass else: res = self.anova_one_drug(drug_name, animate=False, output='dataframe') self.individual_anova[drug_name] = res if animate is True: pb.animate(i + 1) print("\n") if len(self.individual_anova) == 0: return ANOVAResults() df = pd.concat(self.individual_anova, ignore_index=True) if len(df) == 0: return df # sort all data by ANOVA p-values try: df.sort_values('ANOVA_FEATURE_pval', inplace=True) except: df.sort('ANOVA_FEATURE_pval', inplace=True) # all ANOVA have been computed individually for each drug and each # feature. Now, we need to compute the multiple testing corrections if self.settings.pvalue_correction_level is True: df = self.add_pvalues_correction(df) else: pass # insert a unique identifier as first column df.insert(0, 'ASSOC_ID', range(1, len(df) + 1)) self.df = df # order the column names as defined in the __init__ method df = df[self.column_names] df.reset_index(inplace=True, drop=True) return ANOVAResults(df, self.settings)
def anova_one_drug(self, drug_id, animate=True, output='object'): """Computes ANOVA for a given drug across all features :param str drug_id: a valid drug identifier. :param animate: shows the progress bar :return: a dataframe Calls :meth:`anova_one_drug_one_feature` for each feature. """ # drop first and second columns that are made of strings # works under python2 but not python 3. Assume that the 2 first #columns are the sample name and tissue feature # Then, we keep only cases with at least 3 features. # MSI could be used but is not like in original R code. features = self.features.df.copy() # need to skip the FACTOR to keep only features shift = self.features.shift features = features[features.columns[shift:]] # FIXME what about features with less than 3 zeros ? mask = features.sum(axis=0) >= 3 # TODO: MSI, tissues, name must always be kept # selected_features = features[features.columns[mask]] # scan all features for a given drug assert drug_id in self.ic50.df.columns N = len(selected_features.columns) pb = Progress(N, 10) res = {} # for i, feature in enumerate(selected_features.columns): # production True, means we do not want to create a DataFrame # for each call to the anova_one_drug_one_feature function # Instead, we require dictionaries this = self.anova_one_drug_one_feature(drug_id, feature, production=True) if this['ANOVA_FEATURE_pval'] is not None: res[feature] = this if animate is True: pb.animate(i + 1) # if production is False: # df = pid.concat(res, ignore_index=True) df = pd.DataFrame.from_records(res) df = df.T df = ANOVAResults().astype(df) if len(df) == 0: return df # append DRUG_NAME/DRUG_TARGET columns df = self.drug_decode.drug_annotations(df) # TODO: drop rows where ANOVA_FEATURE_PVAL is None if output != 'object': df = self.add_pvalues_correction(df) return df else: df = self.add_pvalues_correction(df) res = ANOVAResults(df, self.settings) res.settings = ANOVASettings(**self.settings) return res
def plot_cindex(self, drug_name, alphas, l1_ratio=0.5, kfolds=10, hold=False): """Tune alpha parameter using concordance index This is longish and performs the following task. For a set of alpha (list), run the elastic net analysis for a given **l1_ratio** with **kfolds**. For each alpha, get the CIndex and find the CINdex for which the errors are minimum. .. warning:: this is a bit longish (300 seconds for 10 folds and 80 alphas) on GDSCv5 data set. """ from dreamtools.core.cindex import cindex CI_train = {} CI_test = {} for c in range(kfolds): CI_train[c] = [] CI_test[c] = [] pb = Progress(len(alphas)) for i, alpha in enumerate(alphas): self.fit(drug_name, alpha=alpha, l1_ratio=l1_ratio, kfolds=kfolds) # Look at the results and store cindex for kf in range(kfolds): x_train = self.kfold_data['x_train'][kf].values y_train = self.kfold_data['y_train'][kf].values x_test = self.kfold_data['x_test'][kf].values y_test = self.kfold_data['y_test'][kf].values x_train_pred = self.en.predict(x_train) x_test_pred = self.en.predict(x_test) CI_test[kf].append(1 - cindex(x_test_pred, y_test, [True] * len(y_test))) CI_train[kf].append(1 - cindex(x_train_pred, y_train, [True] * len(y_train))) pb.animate(i) mu_train = pd.DataFrame(CI_train).transpose().mean() sigma_train = pd.DataFrame(CI_train).transpose().std() mu_test = pd.DataFrame(CI_test).transpose().mean() sigma_test = pd.DataFrame(CI_test).transpose().std() best_alpha = alphas[pd.DataFrame(CI_test).mean(axis=1).argmax()] pylab.clf() pylab.errorbar(pylab.log(alphas), mu_train, yerr=sigma_train, label="train") pylab.errorbar(pylab.log(alphas) + .1, mu_test, yerr=sigma_test, label="test") pylab.plot(pylab.log(alphas), mu_train, 'ob') pylab.plot(pylab.log(alphas) + .1, mu_train, 'or') pylab.legend() pylab.axvline(pylab.log(best_alpha), lw=2, color="purple") return best_alpha
def score_challengeB(self, filenames): # Ideally provide 3 filenames but if only 1 is given, try # to infer the names of the 2 others cor_pheno1 = [] cor_pheno2 = [] pval_pheno1 = [] pval_pheno2 = [] scores = [] from dreamtools.core.rtools import RTools rtool = RTools(verboseR=False) assert len(filenames) == 3, "Must provide 3 files" self.golds = [] self.preds = [] gold_filenames = self.download_goldstandard('B') print("Warning: your 3 submissions should be ordered as B1, B2, B3 files") for tag in [1, 2, 3]: #assumeing data and gs are sorted in the same way !! gold = pd.read_csv(gold_filenames[tag-1], sep='[ \t]', engine='python') self.golds.append(gold) #filename = 'DREAM5_SysGenB%s_your_Predictions.txt' % tag #filename = self._pj([self.classpath, 'data', filename]) filename = filenames[tag-1] pred1 = pd.read_csv(filename, sep='[ \t]', engine='python') self.preds.append(pred1) # correlation gs versus predicted rtool.session.t = pred1.ix[0].values rtool.session.g = gold.ix[0].values rtool.session.run("results = cor.test(t, g, method='spearman', alternative='greater')") T1 = rtool.session.results.copy() rtool.session.t = pred1.ix[1].values rtool.session.g = gold.ix[1].values rtool.session.run("results = cor.test(t, g, method='spearman', alternative='greater')") T2 = rtool.session.results.copy() cor_pheno1.append(T1['estimate']) cor_pheno2.append(T2['estimate']) pval_pheno1.append(T1['p.value']) pval_pheno2.append(T2['p.value']) score = -(np.log(T1['p.value']) + np.log(T2['p.value'])) scores.append(score) self.corp1 = cor_pheno1 self.corp2 = cor_pheno2 self.pval1 = pval_pheno1 self.pval2 = pval_pheno2 self.scores = scores # This part now compute the pvalues using random prediction random_scores = {0:[],1:[],2:[]} from easydev import Progress pb = Progress(self.N_pvalues, interval=1) for ii in range(1, self.N_pvalues): for tag in [0,1,2]: #generate random coordinates coord = random.sample(['RIL%s' % i for i in range(1,31)], 30) coord2 = random.sample(['RIL%s' % i for i in range(1,31)], 30) # Obtaining random scores rtool.session.t = self.preds[tag].ix[0].ix[coord].values rtool.session.g = self.golds[tag].ix[0].values rtool.session.run("results = cor.test(t, g, method='spearman', alternative='greater')") T1 = rtool.session.results.copy() rtool.session.t = self.preds[tag].ix[1].ix[coord2].values rtool.session.g = self.golds[tag].ix[1].values rtool.session.run("results = cor.test(t, g, method='spearman', alternative='greater')") T2 = rtool.session.results.copy() random_scores[tag].append(-(np.log(T1['p.value']) + np.log(T2['p.value']))) pb.animate(ii+1) self.random_scores = random_scores #Obtaining p-values pvals = [sum(self.random_scores[k]>= self.scores[k])/float(self.N_pvalues) for k in [0,1,2]] self.pvals = pvals df = pd.DataFrame({'scores':self.scores, 'correlation_phenotype1':cor_pheno1, 'correlation_phenotype2':cor_pheno2, 'pvalues_phenotype1':pval_pheno1, 'pvalues_phenotype2':pval_pheno2, 'pvalues':self.pvals}) df= df.T df.columns = ['SysGenB1', 'SysGenB2', 'SysGenB3'] return df
def compounds2accession(self, compounds):
    """For each compound, identify the target and the corresponding UniProt
    accession number

    This is not part of the ChEMBL API::

        # we recommend using the cache if you use this method regularly
        c = Chembl(cache=True)
        drugs = c.get_approved_drugs()

        # to speed up the example
        drugs = drugs[0:20]
        IDs = [x['molecule_chembl_id'] for x in drugs]

        c.compounds2accession(IDs)

    """
    # We jump from compounds to targets through the activities.
    # This is a one-to-many mapping, so we initialise a default dictionary.
    from collections import defaultdict
    compound2target = defaultdict(set)

    filter = "molecule_chembl_id__in={}"

    from easydev import Progress
    if isinstance(compounds, list):
        pass
    else:
        compounds = list(compounds)

    pb = Progress(len(compounds))
    for i in range(0, len(compounds)):
        # FIXME: could get activities in bunches using
        # ",".join(compounds[i:i+10]) for example
        activities = self.get_activity(filters=filter.format(compounds[i]))
        # get target ChEMBL IDs from the activities
        for act in activities:
            compound2target[act['molecule_chembl_id']].add(act['target_chembl_id'])
        pb.animate(i + 1)

    # What we need now is the target entries for all targets found in the
    # previous step. For each compound/drug there are hundreds of targets,
    # and calling get_target for each of those lists would take forever.
    # Instead, because there are *only* about 12,000 targets, let us download
    # all of them! This took about 4 minutes on this test but, if you use the
    # cache, next time it will be much quicker. This is not done at the
    # activities level because there are too many entries.
    targets = self.get_target(limit=-1)

    # store all target ChEMBL IDs to easily retrieve the entries later on
    target_names = [target['target_chembl_id'] for target in targets]

    # retrieve all UniProt accessions for all targets of each compound
    for compound, targs in compound2target.items():
        accessions = set()
        for target in targs:
            index = target_names.index(target)
            accessions = accessions.union([comp['accession']
                for comp in targets[index]['target_components']])
        compound2target[compound] = accessions

    return compound2target
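# Hedged sketch (not from the source) of the batching idea mentioned in the
# FIXME above: query activities for several compounds at once by joining their
# ChEMBL IDs in the `molecule_chembl_id__in` filter. The standalone helper and
# the `chembl` argument are placeholders; it reuses the same get_activity
# method shown above.
def _iter_activities_by_batch(chembl, compounds, batch_size=10):
    """Yield activity records, requesting `batch_size` compounds per call."""
    filter_template = "molecule_chembl_id__in={}"
    for start in range(0, len(compounds), batch_size):
        batch = compounds[start:start + batch_size]
        activities = chembl.get_activity(filters=filter_template.format(",".join(batch)))
        for act in activities:
            yield act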
def get_graph(self, go_ids, ontologies=None, progress=True):
    # Here we filter the data to keep only the relevant GO terms as shown in
    # the panther pie chart
    import networkx as nx
    gg = nx.DiGraph()

    if ontologies is None:
        ontologies = ['MF', 'BP', 'CC']
    elif isinstance(ontologies, str):
        ontologies = [ontologies]
    ancestors = [self.ancestors[x] for x in ontologies]

    levels = []
    real_ids = []
    obsolets = []
    from easydev import Progress
    pb = Progress(len(go_ids))
    print('Retrieving info for each significant GO term')
    annotations = {}

    for i, go_id in enumerate(go_ids):
        # Some GO terms may be obsolete or renamed. Looking at other functions
        # may not work simply because the ID has changed.
        info = self.quickgo.get_go_terms(go_id)
        annotations[go_id] = info

        if info[0]['id'] != go_id:
            _id = info[0]['id']
            print('changed {} to {}'.format(go_id, _id))
            annotations[_id] = info
        else:
            _id = go_id
        aspect = info[0]['aspect']
        if info[0]['isObsolete'] is True:
            print("Skipping obsolete GO term: {}".format(go_id))
            obsolets.append(go_id)
            continue
        real_ids.append(_id)

        # now figure out the distance to the main ancestor;
        # we may have to try several ancestors
        for ancestor in ancestors:
            edges = self.quickgo.get_go_paths(_id, ancestor)
            if edges == 400:
                print("Could not retrieve {} to {}".format(_id, ancestor))
                continue
            if edges["numberOfHits"] == 0:
                continue
            if len(edges["results"]) >= 1:
                for path in edges["results"]:
                    for edge in path:
                        gg.add_edge(edge['child'], edge["parent"])
            else:
                print(_id, edges["results"])
        if progress is True:
            pb.animate(i + 1)

    self.obsolets = obsolets
    self.annotations = annotations
    self.graph = gg

    all_paths = {}
    for ancestor in ancestors:
        if ancestor not in gg:
            continue
        paths = nx.shortest_path_length(gg, target=ancestor)
        for obsolet in obsolets:
            paths[obsolet] = 100
        all_paths[ancestor] = paths

    return all_paths
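# Minimal illustration (not from the source) of the bookkeeping used in
# get_graph(): GO edges are stored child -> parent in a DiGraph and the level
# of a term is its shortest-path length to the ontology root. The GO IDs and
# the edge chain below are only illustrative.
import networkx as nx

gg = nx.DiGraph()
gg.add_edge("GO:0006397", "GO:0016071")   # child -> parent
gg.add_edge("GO:0016071", "GO:0008150")   # ... up to the BP root GO:0008150

levels = nx.shortest_path_length(gg, target="GO:0008150")
print(levels["GO:0006397"])   # 2 edges away from the root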
def _get_info(self):
    """Populate the data structures used for plotting.

    Will be called on request."""
    stats = {"A": 0, "C": 0, "G": 0, "T": 0, "N": 0}
    stats["qualities"] = []
    stats["mean_qualities"] = []
    stats["mean_length"] = 0
    stats["sequences"] = []

    minimum = 1e6
    maximum = 0
    # FIXME computing self.N takes time in the constructor; do we need it?
    self.lengths = np.empty(self.N)
    self.gc_list = []
    total_length = 0
    C = defaultdict(int)
    if self.verbose:
        pb = Progress(self.N)

    sequences = []
    mean_qualities = []
    qualities = []
    # could use multiprocessing
    # FastxFile has shown some errors while handling gzip files
    # created with zlib (e.g. from atropos). It is replaced
    # by the Atropos FastqReader for now.
    # fastq = pysam.FastxFile(self.filename)
    with FastqReader(self.filename) as f:
        for i, record in enumerate(f):
            N = len(record.sequence)
            if N == 0:
                raise ValueError("Read {} has a length equal to zero. "
                                 "Clean your FastQ files".format(i))
            self.lengths[i] = N

            # we cannot store the qualities and sequences of all reads, so
            # only the first max_sample reads are stored:
            if i < self.max_sample:
                quality = [ord(x) - 33 for x in record.qualities]
                mean_qualities.append(sum(quality) / N)
                qualities.append(quality)
                sequences.append(record.sequence)

            # store the count of each quality character
            for k in record.qualities:
                C[k] += 1

            GG = record.sequence.count('G')
            CC = record.sequence.count('C')
            self.gc_list.append((GG + CC) / float(N) * 100)

            # not using a Counter or an explicit loop speeds up the code
            stats["A"] += record.sequence.count("A")
            stats["C"] += CC
            stats["G"] += GG
            stats["T"] += record.sequence.count("T")
            stats["N"] += record.sequence.count("N")

            total_length += len(record.sequence)

            if self.verbose:
                pb.animate(i + 1)

    # other data
    self.qualities = qualities
    self.mean_qualities = mean_qualities
    self.minimum = int(self.lengths.min())
    self.maximum = int(self.lengths.max())
    self.sequences = sequences
    self.gc_content = np.mean(self.gc_list)
    stats['mean_length'] = total_length / float(self.N)
    stats['total_bp'] = stats['A'] + stats['C'] + stats['G'] + stats["T"] + stats['N']
    stats['mean_quality'] = sum([(ord(k) - 33) * v for k, v in C.items()]) / stats['total_bp']

    self.stats = stats
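# Small self-contained illustration (not part of the source) of the two
# per-read computations done in _get_info(): Phred quality decoding
# (ASCII code minus 33) and GC content in percent. The read below is made up.
sequence = "ACGTGGGCCN"
qualities = "IIIIHHH###"

phred = [ord(c) - 33 for c in qualities]
mean_quality = sum(phred) / float(len(phred))
gc_percent = (sequence.count("G") + sequence.count("C")) / float(len(sequence)) * 100

print(mean_quality, gc_percent)   # 28.3 70.0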
def fit(self, amp=1, progress=False):
    r"""Loop over distributions and find the best parameters to fit the data for each

    When a distribution is fitted onto the data, we populate a set of
    dataframes:

        - :attr:`df_errors`: sum of the square errors between the data and
          the fitted distribution, i.e.,
          :math:`\sum_i \left( Y_i - pdf(X_i) \right)^2`
        - :attr:`fitted_param`: the parameters that best fit the data
        - :attr:`fitted_pdf`: the PDF generated with the parameters that
          best fit the data

    Indices of the dataframes contain the names of the distributions.
    """
    import warnings
    warnings.filterwarnings("ignore", category=RuntimeWarning)

    from easydev import Progress
    N = len(self.distributions)
    pb = Progress(N)
    for i, distribution in enumerate(self.distributions):
        try:
            # need a subprocess to check the time it takes; if too long, skip it
            dist = eval("scipy.stats." + distribution)

            # TODO: dist.fit may take a while or just hang forever with some
            # distributions. The signal module was tried to catch the case
            # where the fit takes too long, but it did not work, presumably
            # because another try/except is inside the fit function itself.
            # So we use threading with a recipe from stackoverflow instead
            # (see the _timed_run function).
            param = self._timed_run(dist.fit, distribution, args=self._data)

            # assuming the order returned by fit is the same as expected by pdf
            pdf_fitted = dist.pdf(self.x, *param)

            self.fitted_param[distribution] = param[:]
            self.fitted_pdf[distribution] = pdf_fitted

            # calculate the error
            sq_error = pylab.sum((self.fitted_pdf[distribution] - self.y) ** 2)

            # calculate the information criteria
            logLik = np.sum(dist.logpdf(self.x, *param))
            k = len(param[:])
            n = len(self._data)
            aic = 2 * k - 2 * logLik
            bic = n * np.log(sq_error / n) + k * np.log(n)

            # calculate the Kullback-Leibler divergence
            kullback_leibler = kl_div(self.fitted_pdf[distribution], self.y)

            logging.info("Fitted {} distribution with error={}".format(
                distribution, sq_error))

            # store the errors
            self._fitted_errors[distribution] = sq_error
            self._aic[distribution] = aic
            self._bic[distribution] = bic
            self._kldiv[distribution] = kullback_leibler
        except Exception:  # pragma: no cover
            logging.warning(
                "SKIPPED {} distribution (taking more than {} seconds)".format(
                    distribution, self.timeout))
            # if we cannot compute the error, set it to large values
            self._fitted_errors[distribution] = np.inf
            self._aic[distribution] = np.inf
            self._bic[distribution] = np.inf
            self._kldiv[distribution] = np.inf
        if progress:
            pb.animate(i + 1)

    self.df_errors = pd.DataFrame({
        'sumsquare_error': self._fitted_errors,
        'aic': self._aic,
        'bic': self._bic,
        'kl_div': self._kldiv})
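# Worked toy example (not part of the source) of the information criteria used
# in fit(): AIC = 2k - 2 log L and the residual-based BIC variant used above,
# BIC = n * log(SSE / n) + k * log(n). The numbers below are made up.
import numpy as np

logLik, k, n, sq_error = -1523.4, 3, 20000, 0.00012
aic = 2 * k - 2 * logLik
bic = n * np.log(sq_error / n) + k * np.log(n)
print(aic, bic)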
class Fitter(object):
    """Fit a data sample to known distributions

    A naive approach, often used to figure out the underlying distribution
    that could have generated a data set, is to compare the histogram of the
    data with the PDF (probability distribution function) of a known
    distribution (e.g., normal).

    Yet, the parameters of the distribution are not known and there are lots
    of distributions. Therefore, an automatic way to fit many distributions to
    the data would be useful, which is what is implemented here.

    Given a data sample, we use the `fit` method of SciPy to extract the
    parameters of the distribution that best fit the data. We repeat this for
    all available distributions. Finally, we provide a summary so that one can
    see the quality of the fit for those distributions.

    Here is an example where we generate a sample from a gamma distribution.

    ::

        >>> # First, we create a data sample following a Gamma distribution
        >>> from scipy import stats
        >>> data = stats.gamma.rvs(2, loc=1.5, scale=2, size=20000)

        >>> # We then create the Fitter object
        >>> import fitter
        >>> f = fitter.Fitter(data)

        >>> # just a trick to use only 10 distributions instead of 80 to speed up the fitting
        >>> f.distributions = f.distributions[0:10] + ['gamma']

        >>> # fit and plot
        >>> f.fit()
        >>> f.summary()
                sumsquare_error
        gamma          0.000095
        beta           0.000179
        chi            0.012247
        cauchy         0.044443
        anglit         0.051672
        [5 rows x 1 columns]

    Once the data has been fitted, the :meth:`summary` method returns a sorted
    dataframe whose index contains the distribution names.

    Looping over the 80 distributions in SciPy could take some time, so you
    can overwrite the :attr:`distributions` attribute with a subset if you
    want. In order to reload all distributions, call
    :meth:`load_all_distributions`.

    Some distributions do not converge when fitting. There is a timeout of 30
    seconds after which the fitting procedure is cancelled. You can change
    this :attr:`timeout` attribute if needed.

    If the histogram of the data has outliers or very long tails, you may want
    to increase the :attr:`bins` binning or to ignore data below or above a
    certain range. This can be achieved by setting the :attr:`xmin` and
    :attr:`xmax` attributes. If you set xmin, you can come back to the
    original data by setting xmin to None (same for xmax) or just recreate an
    instance.
    """

    def __init__(self, data, xmin=None, xmax=None, bins=100,
                 distributions=None, timeout=30, density=True):
        """.. rubric:: Constructor

        :param list data: a numpy array or a list
        :param float xmin: if None, use the data minimum value, otherwise
            histogram and fits will be cut
        :param float xmax: if None, use the data maximum value, otherwise
            histogram and fits will be cut
        :param int bins: numbers of bins to be used for the cumulative
            histogram. This has an impact on the quality of the fit.
        :param list distributions: give a list of distributions to look at.
            If None, use all scipy distributions that have a fit method. If
            you want to use only one distribution and know its name, you may
            provide a string (e.g. 'gamma'). Finally, you may set it to
            'common' to include only common distributions, which are: cauchy,
            chi2, expon, exponpow, gamma, lognorm, norm, powerlaw, rayleigh,
            uniform.
        :param timeout: max time for a given distribution. If the timeout is
            reached, the distribution is skipped.

        .. versionchanged:: 1.2.1 remove verbose argument, replaced by the
            logging module.
        .. versionchanged:: 1.0.8 increase timeout from 10 to 30 seconds.
        """
        self.timeout = timeout
        # USER input
        self._data = None

        # Issue https://github.com/cokelaer/fitter/issues/22 asked for setting
        # the density to False in the fitting and plotting. I first thought it
        # would be possible, but the fitting is performed using the PDF of
        # scipy so one would still need to normalise the data so that it is
        # comparable. Therefore I do not see any way to do it without density
        # set to True for now.
        self._density = True

        #: list of distributions to test
        self.distributions = distributions
        if self.distributions is None:
            self._load_all_distributions()
        elif self.distributions == "common":
            self.distributions = get_common_distributions()
        elif isinstance(distributions, str):
            self.distributions = [distributions]

        self.bins = bins

        self._alldata = np.array(data)
        if xmin is None:
            self._xmin = self._alldata.min()
        else:
            self._xmin = xmin
        if xmax is None:
            self._xmax = self._alldata.max()
        else:
            self._xmax = xmax

        self._trim_data()
        self._update_data_pdf()

        # Other attributes
        self._init()

    def _init(self):
        self.fitted_param = {}
        self.fitted_pdf = {}
        self._fitted_errors = {}
        self._aic = {}
        self._bic = {}
        self._kldiv = {}
        self._fit_i = 0  # fit progress
        self.pb = Progress(len(self.distributions))

    def _update_data_pdf(self):
        # histogram returns X with N+1 values, so we rearrange the X output
        # into only N bin centres
        self.y, self.x = np.histogram(self._data, bins=self.bins,
                                      density=self._density)
        self.x = [(this + self.x[i + 1]) / 2. for i, this in enumerate(self.x[0:-1])]

    def _trim_data(self):
        self._data = self._alldata[np.logical_and(self._alldata >= self._xmin,
                                                  self._alldata <= self._xmax)]

    def _get_xmin(self):
        return self._xmin

    def _set_xmin(self, value):
        if value is None:
            value = self._alldata.min()
        elif value < self._alldata.min():
            value = self._alldata.min()
        self._xmin = value
        self._trim_data()
        self._update_data_pdf()

    xmin = property(_get_xmin, _set_xmin,
                    doc="consider only data above xmin. reset if None")

    def _get_xmax(self):
        return self._xmax

    def _set_xmax(self, value):
        if value is None:
            value = self._alldata.max()
        elif value > self._alldata.max():
            value = self._alldata.max()
        self._xmax = value
        self._trim_data()
        self._update_data_pdf()

    xmax = property(_get_xmax, _set_xmax,
                    doc="consider only data below xmax. reset if None")

    def _load_all_distributions(self):
        """Replace the :attr:`distributions` attribute with all scipy distributions"""
        self.distributions = get_distributions()

    def hist(self):
        """Draw the normed histogram of the data using :attr:`bins`

        .. plot::

            >>> from scipy import stats
            >>> data = stats.gamma.rvs(2, loc=1.5, scale=2, size=20000)
            >>> # We then create the Fitter object
            >>> import fitter
            >>> fitter.Fitter(data).hist()
        """
        _ = pylab.hist(self._data, bins=self.bins, density=self._density)
        pylab.grid(True)

    def _fit_single_distribution(self, distribution, progress: bool):
        try:
            # need a subprocess to check the time it takes; if too long, skip it
            dist = eval("scipy.stats." + distribution)

            # TODO: dist.fit may take a while or just hang forever with some
            # distributions. The signal module was tried to catch the case
            # where the fit takes too long, but it did not work, presumably
            # because another try/except is inside the fit function itself.
            # So we use threading with a recipe from stackoverflow instead
            # (see the _timed_run method below).
            param = self._timed_run(dist.fit, distribution, args=self._data)

            # assuming the order returned by fit is the same as expected by pdf
            pdf_fitted = dist.pdf(self.x, *param)

            self.fitted_param[distribution] = param[:]
            self.fitted_pdf[distribution] = pdf_fitted

            # calculate the error
            sq_error = pylab.sum((self.fitted_pdf[distribution] - self.y) ** 2)

            # calculate the information criteria
            logLik = np.sum(dist.logpdf(self.x, *param))
            k = len(param[:])
            n = len(self._data)
            aic = 2 * k - 2 * logLik
            bic = n * np.log(sq_error / n) + k * np.log(n)

            # calculate the Kullback-Leibler divergence
            kullback_leibler = kl_div(self.fitted_pdf[distribution], self.y)

            logging.info("Fitted {} distribution with error={}".format(
                distribution, sq_error))

            # store the errors
            self._fitted_errors[distribution] = sq_error
            self._aic[distribution] = aic
            self._bic[distribution] = bic
            self._kldiv[distribution] = kullback_leibler
        except Exception:  # pragma: no cover
            logging.warning("SKIPPED {} distribution (taking more than {} seconds)".format(
                distribution, self.timeout))
            # if we cannot compute the error, set it to large values
            self._fitted_errors[distribution] = np.inf
            self._aic[distribution] = np.inf
            self._bic[distribution] = np.inf
            self._kldiv[distribution] = np.inf
        if progress:
            self._fit_i += 1
            self.pb.animate(self._fit_i)

    def fit(self, amp=1, progress=False, n_jobs=-1):
        r"""Loop over distributions and find the best parameters to fit the data for each

        When a distribution is fitted onto the data, we populate a set of
        dataframes:

            - :attr:`df_errors`: sum of the square errors between the data and
              the fitted distribution, i.e.,
              :math:`\sum_i \left( Y_i - pdf(X_i) \right)^2`
            - :attr:`fitted_param`: the parameters that best fit the data
            - :attr:`fitted_pdf`: the PDF generated with the parameters that
              best fit the data

        Indices of the dataframes contain the names of the distributions.
        """
        import warnings
        warnings.filterwarnings("ignore", category=RuntimeWarning)

        from joblib import Parallel, delayed

        jobs = (delayed(self._fit_single_distribution)(dist, progress)
                for dist in self.distributions)
        pool = Parallel(n_jobs=n_jobs, backend='threading')
        _ = pool(jobs)

        self.df_errors = pd.DataFrame({'sumsquare_error': self._fitted_errors,
                                       'aic': self._aic,
                                       'bic': self._bic,
                                       'kl_div': self._kldiv})

    def plot_pdf(self, names=None, Nbest=5, lw=2, method="sumsquare_error"):
        """Plot the probability density functions of the distributions

        :param str,list names: names can be a single distribution name, a list
            of distribution names, or None, in which case the first Nbest
            distributions are taken (defaults to the best 5)
        """
        assert Nbest > 0
        if Nbest > len(self.distributions):
            Nbest = len(self.distributions)

        if isinstance(names, list):
            for name in names:
                pylab.plot(self.x, self.fitted_pdf[name], lw=lw, label=name)
        elif names:
            pylab.plot(self.x, self.fitted_pdf[names], lw=lw, label=names)
        else:
            try:
                names = self.df_errors.sort_values(by=method).index[0:Nbest]
            except Exception:
                names = self.df_errors.sort(method).index[0:Nbest]

            for name in names:
                if name in self.fitted_pdf.keys():
                    pylab.plot(self.x, self.fitted_pdf[name], lw=lw, label=name)
                else:  # pragma: no cover
                    logger.warning("%s was not fitted. no parameters available" % name)
        pylab.grid(True)
        pylab.legend()

    def get_best(self, method='sumsquare_error'):
        """Return the best fitted distribution and its parameters

        a dictionary with one key (the distribution name) and its parameters
        """
        # sort by the requested metric and take the first entry as the best
        name = self.df_errors.sort_values(method).iloc[0].name
        params = self.fitted_param[name]
        return {name: params}

    def summary(self, Nbest=5, lw=2, plot=True, method="sumsquare_error"):
        """Plot the distribution of the data together with the Nbest distributions"""
        if plot:
            pylab.clf()
            self.hist()
            self.plot_pdf(Nbest=Nbest, lw=lw, method=method)
            pylab.grid(True)

        Nbest = min(Nbest, len(self.distributions))
        try:
            names = self.df_errors.sort_values(by=method).index[0:Nbest]
        except Exception:  # pragma: no cover
            names = self.df_errors.sort(method).index[0:Nbest]
        return self.df_errors.loc[names]

    def _timed_run(self, func, distribution, args=(), kwargs={}, default=None):
        """Spawn a thread, run the given function with the args and kwargs,
        and return the given default value if the timeout is exceeded.

        http://stackoverflow.com/questions/492519/timeout-on-a-python-function-call
        """

        class InterruptableThread(threading.Thread):
            def __init__(self):
                threading.Thread.__init__(self)
                self.result = default
                self.exc_info = (None, None, None)

            def run(self):
                try:
                    self.result = func(args, **kwargs)
                except Exception:  # pragma: no cover
                    self.exc_info = sys.exc_info()

            def suicide(self):  # pragma: no cover
                raise RuntimeError('Stop has been called')

        it = InterruptableThread()
        it.start()
        started_at = datetime.now()
        it.join(self.timeout)
        ended_at = datetime.now()
        diff = ended_at - started_at

        if it.exc_info[0] is not None:  # pragma: no cover ; if there were any exceptions
            a, b, c = it.exc_info
            raise Exception(a, b, c)  # communicate that to the caller

        if it.is_alive():  # pragma: no cover
            it.suicide()
            raise RuntimeError
        else:
            return it.result