Example #1
    def create_html_drugs(self):
        """Create an HTML page for each drug"""
        # group by drugs
        all_drugs = list(self.df['DRUG_ID'].unique())

        df = self.get_significant_set()
        groups = df.groupby('DRUG_ID')
        if self.verbose:
            print("Creating individual HTML pages for each drug")
        N = len(all_drugs)
        pb = Progress(N)
        for i, drug in enumerate(all_drugs):
            # get the indices and therefore subgroup
            if drug in groups.groups.keys():
                subdf = groups.get_group(drug)
            else:
                subdf = {}

            html = HTMLOneDrug(self, self.df, subdf, drug)
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(i+1)
        if self.settings.animate: print("\n")
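Every example in this collection follows the same easydev pattern: create a Progress bar sized to the number of iterations, then call animate() as the loop advances. A minimal, self-contained sketch of that pattern (the sleep is only a placeholder for real work):

from easydev import Progress
import time

items = range(50)
pb = Progress(len(items))
for i, item in enumerate(items):
    time.sleep(0.01)   # placeholder for the actual work on each item
    pb.animate(i + 1)  # progress updates are 1-based in these examples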
Example #2
    def _score_challengeA_bunch(self, filenames, subname):

        from easydev import Progress

        pb = Progress(5, 1)
        pb.animate(0)
        results = []
        for i, filename in enumerate(filenames):
            res = self.score_challengeA(filename, subname + "_" + str(i + 1))
            pb.animate(i + 1)
            results.append(res)

        aupr_score = -np.mean(np.log10([x["p_aupr"] for x in results]))
        auroc_score = -np.mean(np.log10([x["p_auroc"] for x in results]))
        score = (aupr_score + auroc_score) / 2.0

        df = pd.Series(dtype=float)
        df["Overall Score"] = score
        df["AUPR score (pval)"] = aupr_score
        df["AUROC score (pval)"] = aupr_score
        for i in range(1, 6):
            df["AUPR Net %s" % i] = results[i - 1]["aupr"]
        for i in range(1, 6):
            df["AUROC Net %s" % i] = results[i - 1]["auroc"]

        return df
Example #3
def process_single_reads(reader, modifiers, filters, n_progress=-1):
	"""
	Loop over reads, find adapters, trim reads, apply modifiers and
	output modified reads.

	Return a Statistics object.
	"""
	n = 0  # no. of processed reads
	total_bp = 0
	if n_progress != -1:
		try:
			from easydev import Progress
			pb = Progress(n_progress)
			count = 0
		except ImportError:
			n_progress = -1

	for read in reader:
		n += 1
		total_bp += len(read.sequence)
		for modifier in modifiers:
			read = modifier(read)
		for filter in filters:
			if filter(read):
				break
		if n_progress != -1:
			count += 1
			pb.animate(count)

	return Statistics(n=n, total_bp1=total_bp, total_bp2=None)
Example #4
    def filling_chembl_pubchem_using_unichem(self):
        """

        """
        N = len(self.drug_ids)
        pb = Progress(N)
        for i,this in enumerate(self.drug_ids):
            entry = self.dd.df.ix[this]
            # if no information is provided, we will need to get it 
            # from chemspider

            # From the database, when chembl is provided, it is unique
            # same for chemspider and pubchem and CAS
            select = entry[['CHEMSPIDER', 'CHEMBL', 'PUBCHEM']]
            if select.count() == 0:
                name = self.dd.df.ix[this].DRUG_NAME
                results = self._cs_find(name)
                if len(results) == 0:
                    # nothing found
                    pass
                elif len(results) == 1:
                    self.dd_filled.df.ix[this].loc['CHEMSPIDER'] = results[0]
                else:
                    # non unique
                    #chemspider = ",".join([str(x) for x in results])
                    self.dd_filled.df.ix[this].loc['CHEMSPIDER'] = results
            pb.animate(i+1)

        # Search in chemspider systematically
        for i, this in enumerate(self.drug_ids):
            entry = self.dd.df.ix[this]
            select = entry[['CHEMSPIDER', 'CHEMBL', 'PUBCHEM']]
            if select.count() == 1:
                res = self._cs_find(entry.DRUG_NAME)

            pb.animate(i+1)
Example #5
    def _get_G(self, gold):
        from easydev import Progress
        import scipy.sparse
        regulators = list(set(gold[0]))
        targets = list(set(gold[[0,1]].stack()))

        N, M = gold[0].max(), gold[1].max()

        # A will store indices going from 0 (not 1) to N-1, hence
        # the -1 offset when indexing A with the gene values i, j
        A = np.zeros((N, M))
        for row in gold[[0,1]].values:
            i, j = row
            A[i-1, j-1] = 1
        A_sparse = scipy.sparse.csr_matrix(A)

        #N, M = len(regulators), len(targets)
        G = np.zeros((N, M))

        pb = Progress(len(regulators), 1)
        for i, x in enumerate(regulators):
            for j, y in enumerate(targets):
                if A[x-1, y-1] == 1:
                    G[x-1, y-1] = 1
                elif x != y:
                    G[x-1, y-1] = -1
            pb.animate(i+1)
        return G
Example #6
    def _opt_ridge_lasso(self, drug_name, feature_name, method, alphas=None):

        if alphas is None:
            alphas = pylab.linspace(0,1, 20)

        mses = []
        params = []
        method_buf = self.settings.regression_method
        alpha_buf = self.settings.elastic_net.alpha

        pb = Progress(len(alphas))
        for j, alpha in enumerate(alphas):
            self.settings.regression_method = method
            self.settings.elastic_net.alpha = alpha
            odof = self.anova_one_drug_one_feature(drug_name,
                    feature_name)
            anova = self._get_anova_summary(self.data_lm,
                    output='dataframe')
            #mses.append(anova.ix['Residuals']['Sum Sq'])
            mses.append(anova.ix['tissue']['F value'])
            #mses.append(anova['Sum Sq'].sum())
            pb.animate(j+1)
            params.append(self.data_lm.params)
        self.settings.regression_method = method_buf
        self.settings.elastic_net.alpha = alpha_buf
        return alphas, mses, params
Example #7
    def diagnostics(self):
        """Return dataframe with information about the analysis

        """
        n_drugs = len(self.ic50.drugIds)
        n_features = len(self.features.features) - self.features.shift
        n_combos = n_drugs * n_features
        feasible = 0
        pb = Progress(n_drugs, 1)
        counter = 0
        for drug in self.ic50.drugIds:
            for feature in self.features.features[self.features.shift:]:
                dd = self._get_one_drug_one_feature_data(drug, feature,
                        diagnostic_only=True)
                if dd.status is True:
                    feasible += 1
            counter += 1
            pb.animate(counter)

        results = {
                'n_drug': n_drugs,
                'n_combos': n_combos,
                'feasible_tests': feasible,
                'percentage_feasible_tests': float(feasible)/n_combos*100}
        return results
Example #8
    def compounds2accession(self, compounds):
        """For each compound, identifies the target and corresponding UniProt
        accession number

        This is not part of ChEMBL API

        ::

            # we recommend to use cache if you use this method regularly
            c = Chembl(cache=True)
            drugs = c.get_approved_drugs()

            # to speed up example
            drugs = drugs[0:20]
            IDs = [x['molecule_chembl_id'] for x in drugs]

            c.compounds2accession(IDs)

        """
        # we jump from compounds to targets through activities
        # Here this is a one to many mapping so we initialise a default
        # dictionary.
        from collections import defaultdict
        compound2target = defaultdict(set)

        filter = "molecule_chembl_id__in={}"
        from easydev import Progress
        pb = Progress(len(compounds))
        for i in range(0, len(compounds)):
            # FIXME could get activities by bunch using
            # ",".join(compounds[i:i+10]) for example
            activities = self.get_activity(filters=filter.format(compounds[i]))
            # get target ChEMBL IDs from activities
            for act in activities:
                compound2target[act['molecule_chembl_id']].add(act['target_chembl_id'])
            pb.animate(i+1)

        # What we need is to get targets for all targets found in the previous
        # step. For each compound/drug there are hundreds of targets though. And
        # we will call the get_target for each list of hundreds targets. This
        # will take forever. Instead, because there are *only* 12,000 targets,
        # let us download all of them ! This took about 4 minutes on this test but
        # if you use the cache, next time it will be much much quicker. This is
        # not down at the activities level because there are too many entries

        targets = self.get_target(limit=-1)

        # identifies all target chembl id to easily retrieve the entry later on
        target_names = [target['target_chembl_id'] for target in targets]

        # retrieve all uniprot accessions for all targets of each compound
        for compound, targs in compound2target.items():
            accessions = set()
            for target in targs:
                index = target_names.index(target)
                accessions = accessions.union([comp['accession'] 
                    for comp in targets[index]['target_components']])
            compound2target[compound] = accessions
 
        return compound2target
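The FIXME above suggests fetching activities in batches rather than one compound per request. A hedged sketch of that idea, assuming the molecule_chembl_id__in filter accepts a comma-separated list (as its name suggests); the helper name and the batch size of 10 are illustrative only:

    def _get_activities_by_batch(self, compounds, batch_size=10):
        # hypothetical helper: one request per batch instead of one per compound
        from easydev import Progress
        activities = []
        pb = Progress(len(compounds))
        for i in range(0, len(compounds), batch_size):
            batch = compounds[i:i + batch_size]
            filters = "molecule_chembl_id__in={}".format(",".join(batch))
            activities.extend(self.get_activity(filters=filters))
            pb.animate(min(i + batch_size, len(compounds)))
        return activities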
Example #9
    def dendogram_coefficients(self, stacked=False, show=True, cmap="terrain"):
        """

        shows the coefficient of each optimised model for each drug
        """
        drugids = self.drugIds
        from easydev import Progress
        pb = Progress(len(drugids))
        d = {}

        for i, drug_name in enumerate(drugids):
            X, Y = self._get_one_drug_data(drug_name, randomize_Y=False)
            results = self.runCV(drug_name, verbose=False)
            df = pd.DataFrame({'name': X.columns, 'weight': results.coefficients})
            df = df.set_index("name").sort_values("weight")
            d[drug_name] = df.copy()
            pb.animate(i+1)

        # use drugid to keep same order as in the data
        dfall = pd.concat([d[i] for i in drugids], axis=1)
        dfall.columns = drugids

        if show:
            from biokit import heatmap
            h = heatmap.Heatmap(dfall, cmap=cmap)
            h.plot(num=1,colorbar_position="top left")

        if stacked is True:
            dfall = dfall.stack().reset_index()
            dfall.columns = ["feature", "drug", "weight"]
        return dfall
Example #10
    def create_html_associations(self):
        """Create an HTML page for each significant association

        The name of the output HTML file is **<association id>.html**
        where association id is stored in :attr:`df`.

        """
        print("\nCreating individual HTML pages for each association")
        df = self.get_significant_set()

        drugs = df['DRUG_ID'].values
        features = df['FEATURE'].values
        assocs = df['ASSOC_ID'].values
        fdrs = df['ANOVA_FEATURE_FDR'].values

        N = len(df)
        pb = Progress(N)

        html = Association(self, drug='dummy', feature='dummy',  fdr='dummy')

        for i in range(N):
            html.drug = drugs[i]
            html.feature = features[i]
            html._filename = str(assocs[i]) + '.html'
            html.fdr = fdrs[i]
            html.assoc_id = assocs[i]
            html._init_report() # since we have one shared instance
            html.create_report(onweb=False)
            pb.animate(i+1)
Example #11
    def search_from_smile_inchembl(self):

        N = len(self.drug_ids)

        pb = Progress(N)
        self.results_chembl = {}
        self.results_chemspider = {}

        for i in range(0, N):
            drug = self.drug_ids[i]
            self.results_chembl[drug] = []

            if self.results[drug]:
                for chemspider_id in self.results[drug]:
                    chemspider_entry = self._cs_get(chemspider_id)
                    self.results_chemspider[drug] = chemspider_entry
                    smile = chemspider_entry['smiles']
                    # now search in chembl
                    res_chembl = self.chembl.get_compounds_by_SMILES(smile)
                    try:
                        self.results_chembl[drug].extend(res_chembl['compounds'])
                    except KeyError:
                        pass

            pb.animate(i+1)
Example #12
    def to_kmer_content(self, k=7):
        """Return a Series with kmer count across all reads

        :param int k: kmer length (defaults to 7)
        :return: Pandas Series with index as kmer and values as count.

        Takes about 30 seconds on a million reads.
        """
        # Counter is slow if we apply it on each read.
        # .count is slow as well
        import collections
        from sequana.kmer import get_kmer
        counter = collections.Counter()
        pb = Progress(len(self))
        buffer_ = []
        for i, this in enumerate(self):
            buffer_.extend(list(get_kmer(this['sequence'], k)))
            if len(buffer_) > 100000:
                counter += collections.Counter(buffer_)
                buffer_ = []
            pb.animate(i + 1)
        counter += collections.Counter(buffer_)

        ts = pd.Series(counter)
        ts.sort_values(inplace=True, ascending=False)

        return ts
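The buffering above avoids creating one Counter per read, which the comment notes is slow. The same pattern on plain strings, with a naive sliding-window generator standing in for sequana.kmer.get_kmer (a hypothetical re-implementation that only matches the call signature used above):

import collections

def get_kmer(sequence, k):
    # naive sliding window: one kmer per position
    for i in range(len(sequence) - k + 1):
        yield sequence[i:i + k]

counter = collections.Counter()
buffer_ = []
for seq in ["ACGTACGTAC", "ACGTTTTTTT"]:
    buffer_.extend(get_kmer(seq, 4))
    if len(buffer_) > 100000:                 # flush the buffer once in a while
        counter += collections.Counter(buffer_)
        buffer_ = []
counter += collections.Counter(buffer_)       # flush whatever remains
print(counter.most_common(3))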
Example #13
    def select_random_reads(self, N=None, output_filename="random.fasta"):
        """Select random reads and save in a file

        :param int N: number of random unique reads to select.
            You should provide a number, but a list can be used as well.
        :param str output_filename:
        """
        import numpy as np
        thisN = len(self)
        if isinstance(N, int):
            if N > thisN:
                N = thisN
            # create random set of reads to pick up
            cherries = list(range(thisN))
            np.random.shuffle(cherries)
            # cast to set for efficient iteration
            cherries = set(cherries[0:N])
        elif isinstance(N, set):
            cherries = N
        elif isinstance(N, list):
            cherries = set(N)
        fasta = FastxFile(self.filename)
        pb = Progress(thisN) # since we scan the entire file
        with open(output_filename, "w") as fh:
            for i, read in enumerate(fasta):
                if i in cherries:
                    fh.write(read.__str__() + "\n")
                else:
                    pass
                pb.animate(i+1)
        return cherries
Example #14
    def volcano_plot_all_drugs(self):
        """Create a volcano plot for each drug and save in PNG files

        Each filename is set to **volcano_<drug identifier>.png**
        """
        drugs = list(self.df[self._colname_drugid].unique())
        pb = Progress(len(drugs), 1)
        for i, drug in enumerate(drugs):
            self.volcano_plot_one_drug(drug)
            self.savefig("volcano_%s.png" % drug, size_inches=(10, 10))
            pb.animate(i+1)
Example #15
 def _load_complexes(self, show_progress=True):
     from easydev import Progress
     import time
     pb = Progress(len(self.df.complexAC))
     complexes = {}
     self.logging.info("Loading all details from the IntactComplex database")
     for i, identifier in enumerate(self.df.complexAC):
         res = self.webserv.details(identifier)
         complexes[identifier] = res
         if show_progress:
             pb.animate(i+1)
     self._complexes = complexes
Example #16
 def create_html_features(self):
     """Create an HTML page for each significant feature"""
     df = self.get_significant_set()
     groups = df.groupby('FEATURE')
     print("\nCreating individual HTML pages for each feature")
     N = len(groups.indices.keys())
     pb = Progress(N)
     for i, feature in enumerate(groups.indices.keys()):
         # get the indices and therefore subgroup
         subdf = groups.get_group(feature)
         html = HTMLOneFeature(self, self.df, subdf, feature)
         html.create_report(onweb=False)
         pb.animate(i+1)
Example #17
    def volcano_plot_all_features(self):
        """Create a volcano plot for each feature and save in PNG files

        Each filename is set to **volcano_<feature name>.png**
        """
        features = list(self.df[self._colname_feature].unique())
        print('Creating image for each feature (using all drugs)')
        pb = Progress(len(features), 1)
        for i, feature in enumerate(features):
            self.volcano_plot_one_feature(feature)
            self.savefig("volcano_%s.png" % feature,
                    size_inches=(10, 10))
            pb.animate(i+1)
Example #18
def check_ipython_notebook():

    notebooks = glob.glob("*ipynb")
    N = len(notebooks)

    pb = Progress(N)
    for i,filename in enumerate(notebooks):
        print(purple(filename))
        notebook = read(open(filename), 'json')
        r = NotebookRunner(notebook)
        r.run_notebook()
        pb.animate(i+1)
Example #19
    def check_randomness(self, drug_name, n_folds=10, N=10, show=True,
            progress=False):

        scores = []
        pb = Progress(N)
        for i in range(N):
            # Fit a model using CV
            inter_results = self.runCV(drug_name, n_folds=n_folds, verbose=False)
            scores.append(inter_results.Rp)
            if progress: 
                pb.animate(i+1)

        random_scores = []
        pb = Progress(N)
        for i in range(N):
            # Fit a model using CV
            inter_results = self.runCV(drug_name, n_folds=n_folds,
                                randomize_Y=True, verbose=False)
            random_scores.append(inter_results.Rp)
            if progress:
                pb.animate(i+1)

        from scipy.stats import ttest_ind
        ttest_res = ttest_ind(scores, random_scores)
        results = { "scores": scores,
                    "random_scores": random_scores,
                    "ttest_pval": ttest_res.pvalue}

        # Compute the Bayes factor (as communicated by M. Menden). When all
        # scores beat the randomised ones, cap the value instead of
        # returning infinity.
        S = sum([s > r for s, r in zip(scores, random_scores)])
        proba = S / float(len(scores))
        if proba == 1:
            # Set the maximum instead of infinite
            # bayes_factor = np.inf
            bayes_factor = 1. / (1. / len(scores))
        else:
            bayes_factor = 1. / (1 - proba)
        results['bayes_factor'] = bayes_factor

        if show:
            M = max(max(scores), max(random_scores)) * 1.2
            bins = pylab.linspace(0, M, 40)
            pylab.clf()
            pylab.hist(scores, bins=bins, color="b", alpha=0.5)
            pylab.hist(random_scores, color="r", alpha=0.5, bins=bins)
            pylab.title("ttest=%(ttest_pval).3e, bayes=%(bayes_factor)s" % results)
            pylab.grid(True)

        return results
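The Bayes factor above is just the inverse of the fraction of iterations where the randomised model matched or beat the real one. A small worked sketch with made-up scores:

scores = [0.61, 0.58, 0.63, 0.60, 0.59]
random_scores = [0.12, 0.15, 0.65, 0.10, 0.14]

S = sum(s > r for s, r in zip(scores, random_scores))   # 4 wins out of 5
proba = S / float(len(scores))                          # 0.8
bayes_factor = 1. / (1 - proba) if proba < 1 else float(len(scores))
print(bayes_factor)                                     # 5.0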
Example #20
    def plot_cindex(self, drug_name, alphas, l1_ratio=0.5, n_folds=10, hold=False):
        # This is longish (300 seconds with 10 folds and 80 alphas
        # for GDSC v5 data sets).
        from dreamtools.core.cindex import cindex

        CI_train = {}
        CI_test = {}
        for c in range(n_folds):
            CI_train[c] = []
            CI_test[c] = []

        from easydev import Progress
        pb = Progress(len(alphas))

        for i, alpha in enumerate(alphas):
            self.elastic_net(drug_name, alpha=alpha, l1_ratio=l1_ratio,
                             n_folds=n_folds)

            # Look at the first fold only
            for kf in range(n_folds):
                x_train = self.kfold_data['x_train'][kf].values
                y_train = self.kfold_data['y_train'][kf].values

                x_test = self.kfold_data['x_test'][kf].values
                y_test = self.kfold_data['y_test'][kf].values

                x_train_pred = self.en.predict(x_train)
                x_test_pred = self.en.predict(x_test)

                CI_test[kf].append(1-cindex(x_test_pred, y_test, [True]*len(y_test)))
                CI_train[kf].append(1-cindex(x_train_pred, y_train, [True] * len(y_train)))
            pb.animate(i + 1)

        mu_train = pd.DataFrame(CI_train).transpose().mean()
        sigma_train = pd.DataFrame(CI_train).transpose().std()

        mu_test = pd.DataFrame(CI_test).transpose().mean()
        sigma_test = pd.DataFrame(CI_test).transpose().std()

        best_alpha = alphas[pd.DataFrame(CI_test).mean(axis=1).argmax()]

        pylab.clf()
        pylab.errorbar(pylab.log(alphas), mu_train, yerr=sigma_train, label="train")
        pylab.errorbar(pylab.log(alphas)+.1, mu_test, yerr=sigma_test, label="test")
        pylab.plot(pylab.log(alphas), mu_train, 'ob')
        pylab.plot(pylab.log(alphas)+.1, mu_test, 'or')
        pylab.legend()
        pylab.axvline(pylab.log(best_alpha), lw=2, color="purple")

        return best_alpha
Example #21
    def create_summary_pages(self):
        """Create summary pages

        Once the main analysis is done (:meth:`analyse`), and the company
        packages have been created (:meth:`create_data_packages_for_companies`),
        you can run this method to create a summary HTML page
        (index.html) for the tissue, and a similar summary HTML page for the
        tissues of each company. Finally, an HTML summary page for the
        companies is also created.

        The final directory tree looks like::


            |-- index.html
            |-- company_packages
            |   |-- index.html
            |   |-- Company1
            |   |   |-- Tissue1
            |   |   |-- Tissue2
            |   |   |-- index.html
            |   |-- Company2
            |   |   |-- Tissue1
            |   |   |-- Tissue2
            |   |   |-- index.html
            |-- tissue_packages
            |   |-- index.html
            |   |-- Tissue1
            |   |-- Tissue2


        """
        # First for the main directory (tissue_packages):
        print(purple("Creating summary index.html for the tissues"))
        self._create_summary_pages(self.main_directory, verbose=False)

        # Then for each company:
        print(purple("Creating summary index.html for each company"))
        pb = Progress(len(self.companies))
        for i, company in enumerate(self.companies):
            try:
                self._create_summary_pages(self.company_directory + os.sep +
                    company, verbose=False, company=company)
            except Exception as err:
                print(red("Issue with %s. Continue with other companies" % company))
                print(err)
            pb.animate(i+1)

        # Finally, an index towards each company
        self._create_main_index()
Example #22
    def filter(self, identifiers_list=[], min_bp=None, max_bp=None,
        progressbar=True, output_filename='filtered.fastq', remove=True):
        """Filter reads

        :param int min_bp: ignore reads with length shorter than min_bp
        :param int max_bp: ignore reads with length above max_bp

        """
        # about 7 seconds to scan a file of 750,000 reads
        # when no identifiers are given

        if min_bp is None:
            min_bp = 0

        if max_bp is None:
            max_bp = 1e9

        # make sure we are at the beginning
        self.rewind()

        output_filename, tozip = self._istozip(output_filename)

        with open(output_filename, "w") as fout:
            pb = Progress(self.n_reads)
            buf = ""
            filtered = 0

            for count, lines in enumerate(grouper(self._fileobj)):
                identifier = lines[0].split()[0]
                if lines[0].split()[0] in identifiers_list:
                    filtered += 1
                else:
                    N = len(lines[1])
                    if N <= max_bp and N >= min_bp:
                        buf += "{}{}+\n{}".format(
                            lines[0].decode("utf-8"),
                            lines[1].decode("utf-8"),
                            lines[3].decode("utf-8"))
                    if count % 100000 == 0:
                        fout.write(buf)
                        buf = ""
                if progressbar is True:
                    pb.animate(count+1)
            fout.write(buf)
            if filtered < len(identifiers_list):
                print("\nWARNING: not all identifiers were found in the fastq file to " +
                      "be filtered.")
        if tozip is True: self._gzip(output_filename)
Example #23
    def select_random_reads(self, N=None, output_filename="random.fastq"):
        """Select random reads and save in a file

        :param int N: number of random unique reads to select.
            You should provide a number, but a list can be used as well.
            You can select random reads for R1 and re-use the returned list as
            input for R2 (since pairs must be kept).
        :param str output_filename:

        If you have a pair of files, the same reads must be selected in R1 and
        R2.::

            f1 = FastQ(file1)
            selection = f1.select_random_reads(N=1000)
            f2 = FastQ(file2)
            f2.select_random_reads(selection)


        """
        thisN = len(self)
        if isinstance(N, int):
            if N > thisN:
                N = thisN
            # create random set of reads to pick up
            cherries = list(range(thisN))
            np.random.shuffle(cherries)
            # cast to set for efficient iteration
            cherries = set(cherries[0:N])
        elif isinstance(N, set):
            cherries = N
        elif isinstance(N, list):
            cherries = set(N)

        fastq = pysam.FastxFile(self.filename)


        pb = Progress(thisN) # since we scan the entire file
        with open(output_filename, "w") as fh:
            for i, read in enumerate(fastq):
                if i in cherries:
                    fh.write(read.__str__() + "\n")
                else:
                    pass
                pb.animate(i+1)
        return cherries
Example #24
    def volcano_plot_all_drugs(self):
        """Create a volcano plot for each drug and save in PNG files

        Each filename is set to **volcano_<drug identifier>.png**
        """
        drugs = list(self.df[self._colname_drugid].unique())
        pb = Progress(len(drugs), 1)
        for i, drug in enumerate(drugs):
            self.volcano_plot_one_drug(drug)
            self.savefig_and_js("volcano_%s.png" % drug, size_inches=(10, 10))
            pb.animate(i+1)

            # This prevents memory leaks.
            self.current_fig.canvas.mpl_disconnect(self.cid)
            try:
                import mpld3
                mpld3.plugins.clear(self.current_fig)
            except Exception:
                pass
Example #25
    def optimise_elastic_net(self, drug_name, feature_name, N=20, Nalpha=20):
        lwts = pylab.linspace(0, 1, N)
        alphas = pylab.linspace(0, 5, Nalpha)

        mses = np.zeros((N, Nalpha))

        pb = Progress(N)
        for i, lwt in enumerate(lwts):
            for j, alpha in enumerate(alphas):
                self.settings.regression_method = 'ElasticNet'
                self.settings.regression_alpha = alpha
                self.settings.regression_L1_wt = lwt
                odof = self.anova_one_drug_one_feature(drug_name,
                        feature_name)
                anova = self._get_anova_summary(self.data_lm,
                        output='dataframe')
                mses[i,j] = self.data_lm.bic
            pb.animate(i+1)
        return mses
Example #26
    def volcano_plot_all_features(self):
        """Create a volcano plot for each feature and save in PNG files

        Each filename is set to **volcano_<feature name>.png**
        """
        features = list(self.df[self._colname_feature].unique())
        print('Creating image for each feature (using all drugs)')
        pb = Progress(len(features), 1)
        for i, feature in enumerate(features):
            self.volcano_plot_one_feature(feature)
            self.savefig_and_js("volcano_%s.png" % feature, 
                    size_inches=(10, 10))
            pb.animate(i+1)

            # This prevents memory leaks.
            self.current_fig.canvas.mpl_disconnect(self.cid)
            try:
                import mpld3
                mpld3.plugins.clear(self.current_fig)
            except Exception:
                pass
Example #27
def test_progressbar():
    N = 2
    p = progressbar.progress_bar(N)

    for i in range(0,N):
        time.sleep(.1)
        p.animate(i+1, i)


    p = progressbar.TextProgressBar(N, progressbar.consoleprint)
    for i in range(0,N):
        time.sleep(.1)
        p.animate(i+1, i)

    p = Progress(100)
    p.animate(1)
    assert p.pb.interval == 1

    p = Progress(200)
    assert p.pb.interval == 2
    p.animate(1)
Example #28
    def load_records(self, overwrite=False):
        """Load a flat file and store records in :attr:`records`

        """
        self._load_flat_file(overwrite=overwrite)
        self.records = {}

        # TODO: check if it exists otherwise, load it ?
        if os.path.exists(self.filename) is False:
            self.load()
        with open(self.filename) as f:
            data = f.read().strip()

        data = data.split("//\n") # the sep is //\n
        self._child_match = re.compile(r'ID\s+\:\s*(\d+)\s*')
        self._parent_match = re.compile(r'PARENT ID\s+\:\s*(\d+)\s*')
        self._rank_match = re.compile(r'RANK\s+\:\s*([^\n]+)\s*')
        self._name_match = re.compile(r'SCIENTIFIC NAME\s+\:\s*([^\n]+)\s*')

        from easydev import Progress
        pb = Progress(len(data))

        if self.verbose:
            print('Loading all taxon records.')
        for i, record in enumerate(data[0:]):
            # the try/except increases computation time by about 5%
            try:
                dico = self._interpret_record(record)
                identifier = int(dico['id'])
                self.records[identifier] = dico
            except Exception as err:
                print(err)
                print('Could not parse the following record. ' + \
                      'Please file a bug report on http://github.com/biokit')
                print(record)
            if self.verbose:
                pb.animate(i+1)
        if self.verbose:
            print()
Example #29
    def search_in_chemspider(self):
        # Fill results attribute as a dictionary. Keys being the drug id
        # and values are list of chemspider identifiers
        #
        # SB52334 --> SB-52334
        N = len(self.dd)

        pb = Progress(N)
        self.results = {}
        results = []
        for i, index in enumerate(self.dd.df.index):
            drug = self.dd.df.index[i]
            drug_name = self.dd.df.ix[drug].DRUG_NAME
            try:
                res = self._cs_find(drug_name)
            except Exception:
                print("This drug index (%s) / drug name (%s) was not found" %
                        (index, drug_name))
                res = []
            self.results[drug] = res
            pb.animate(i+1)
            results.append(res)
        self.dd_filled.df['CHEMSPIDER_SEARCHED'] = results
Example #30
def process_paired_reads(paired_reader, modifiers1, modifiers2, filters,
                         n_progress=-1):
	"""
	Loop over reads, find adapters, trim reads, apply modifiers and
	output modified reads.

	Return a Statistics object.
	"""
	n = 0  # no. of processed reads
	total1_bp = 0
	total2_bp = 0

	if n_progress != -1:
		try:
			from easydev import Progress
			pb = Progress(n_progress)
			count = 0
		except ImportError:
			n_progress = -1

	for read1, read2 in paired_reader:
		n += 1
		total1_bp += len(read1.sequence)
		total2_bp += len(read2.sequence)
		for modifier in modifiers1:
			read1 = modifier(read1)
		for modifier in modifiers2:
			read2 = modifier(read2)
		for filter in filters:
			# Stop writing as soon as one of the filters was successful.
			if filter(read1, read2):
				break
		if n_progress != -1:
			count += 1
			pb.animate(count)

	return Statistics(n=n, total_bp1=total1_bp, total_bp2=total2_bp)
Example #31
def bam_to_mapped_unmapped_fastq(filename,
                                 output_directory=None,
                                 verbose=True):
    """Create mapped and unmapped fastq files from a BAM file

    :context: given a reference, one or two FastQ files are mapped onto the
        reference to generate a BAM file. This BAM file is a compressed version
        of a SAM file; this function eases its interpretation.

    :param filename: input BAM file
    :param output_directory: where to save the mapped and unmapped files
    :return: dictionary with number of reads for each file (mapped/unmapped for
        R1/R2) as well as the mode (paired or not), the number of unpaired
        reads, and the number of duplicated reads. The unpaired reads should
        be zero (sanity check)

    Given a BAM file, create FASTQ with R1/R2 reads mapped and unmapped.
    In the paired-end case, 4 files are created.

    Note that this function is efficient in that it does not create
    intermediate files, limiting IO in the process. Compared to standard tools
    such as bedtools bamtofastq, it is 1.5 to 2X slower, but it creates both
    the mapped AND unmapped reads.

    :Details: Secondary alignment (flag 256) are dropped so as to remove any
        ambiguous alignments. The output dictionary stores "secondary" key to
        keep track of the total number of secondary reads that are dropped. If
        the flag is 256 and the read is unpaired, the key *unpaired* is also
        incremented.

        If the flag is not equal to 256, we first reverse complement reads that
        are tagged as *reverse* in the BAM file. Then, reads that are not paired or
        not "proper pair" (neither flag 4 nor flag 8) are ignored.

        If R1 is mapped **or** R2 is mapped then the reads are considered mapped. If
        both R1 and R2 are unmapped, then reads are unmapped.

    .. note:: about chimeric alignment: one is the representative and the other is
        the supplementary. This flag is not used in this function. Note also that
        chimeric alignment have same QNAME and flag 4 and 8

    .. note:: the contamination reported is based on R1 only.

    .. todo:: comments are missing since they are not stored in the BAM file.


    .. note:: the mapped reads may not be synchronized because we also include
        the chimeric alignments (cf samtools documentation). However,
        total reads = unmapped reads + R1 mapped + R2 mapped - supplementary
        reads (those with flag 2048).
    """
    bam = BAM(filename)
    # figure out if this is paired or unpaired

    newname, ext = os.path.splitext(filename)

    import collections
    stats = collections.defaultdict(int)
    stats['R1_unmapped'] = 0
    stats['R1_mapped'] = 0

    # figure out where to save the file
    if output_directory is None:
        pass
    else:
        assert isinstance(filename, str)
        from sequana.snaketools import FileFactory
        ff = FileFactory(filename)
        newname = output_directory + os.sep + ff.filenames[0]

    rt1 = "_R1_"
    rt2 = "_R2_"

    R1_mapped = open(newname + "{}.mapped.fastq".format(rt1), "wb")
    R1_unmapped = open(newname + "{}.unmapped.fastq".format(rt1), "wb")
    stats['duplicated'] = 0
    stats['unpaired'] = 0

    unpaired = 0

    # if paired, let open other files
    if bam.is_paired:
        stats['mode'] = "pe"
        stats['R2_unmapped'] = 0
        stats['R2_mapped'] = 0
        R2_mapped = open(newname + "{}.mapped.fastq".format(rt2), "wb")
        R2_unmapped = open(newname + "{}.unmapped.fastq".format(rt2), "wb")
    else:
        stats['mode'] = "se"

    # loop through the BAM (make sure it is rewinded)
    bam.reset()

    if verbose:
        from easydev import Progress
        pb = Progress(len(bam))

    for i, this in enumerate(bam):
        if this.flag & 256:
            # A secondary alignment occurs when a given read could align
            # reasonably well to more than one place. One of the possible
            # reported alignments is termed "primary" and the others are
            # marked as "secondary"; secondary alignments (flag 256) are
            # dropped here to remove ambiguous alignments.
            # (Unmapped reads, by contrast, stay in the BAM file but have no
            # valid assigned position; they typically arise from sequencing
            # errors, imperfect matches with the reference, contamination, etc.)
            stats['secondary'] += 1
            if this.is_paired is False:
                stats['unpaired'] += 1
        else:

            # quick hack
            if this.is_read1:
                suffix = b"/1"
            else:
                suffix = b"/2"

            # in pysam, seq is a string and qual a bytes....
            if this.is_reverse is True:
                txt = b"@" + bytes(this.qname, "utf-8") + suffix + b"\n"
                revcomp = reverse_complement(this.seq)
                txt += bytes(revcomp, "utf-8") + b"\n"
                txt += b"+\n"
                txt += bytes(this.qual[::-1], 'utf-8') + b"\n"
            else:
                txt = b"@" + bytes(this.qname, "utf-8") + suffix + b"\n"
                txt += bytes(this.seq, "utf-8") + b"\n"
                txt += b"+\n"
                txt += bytes(this.qual, "utf-8") + b"\n"

            # Here, we must be careful as to keep the pairs. So if R1 is mapped
            # but R2 is unmapped (or the inverse), then the pair is mapped
            if this.is_read1:
                if this.is_unmapped and this.mate_is_unmapped:
                    R1_unmapped.write(txt)
                    stats['R1_unmapped'] += 1
                else:
                    R1_mapped.write(txt)
                    stats['R1_mapped'] += 1
            elif this.is_read2:
                if this.is_unmapped and this.mate_is_unmapped:
                    R2_unmapped.write(txt)
                    stats['R2_unmapped'] += 1
                else:
                    R2_mapped.write(txt)
                    stats['R2_mapped'] += 1
            else:
                # This should be a single read
                #assert self.is_paired is False
                stats['unpaired'] += 1
                if this.is_unmapped:
                    R1_unmapped.write(txt)
                    stats['R1_unmapped'] += 1
                else:
                    R1_mapped.write(txt)
                    stats['R1_mapped'] += 1

            if this.is_duplicate:
                stats['duplicated'] += 1

        if verbose:
            pb.animate(i + 1)

    if bam.is_paired:
        R2_mapped.close()
        R2_unmapped.close()

    logger.info("\nNumber of entries in the BAM: %s" % str(i + 1))

    R1_mapped.close()
    R1_unmapped.close()

    _x = stats['R1_mapped']
    _y = stats['R1_unmapped']
    stats["contamination"] = _x / float(_x + _y) * 100

    return stats
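A minimal usage sketch, assuming a local BAM file called mapped.bam (a hypothetical path); the returned dictionary holds the counters described in the docstring:

stats = bam_to_mapped_unmapped_fastq("mapped.bam", output_directory=".")
print(stats["mode"])                              # "pe" or "se"
print(stats["R1_mapped"], stats["R1_unmapped"])   # read counts for R1
print(stats["contamination"])                     # % of R1 reads that mapped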
Example #32
    def create_data_packages_for_companies(self, companies=None):
        ##########################################################
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        #                                                        #
        # DRUG_DECODE and IC50 inputs must be filtered to keep   #
        # only WEBRELEASE=Y and owner                            #
        #                                                        #
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        ##########################################################
        if isinstance(companies, str):
            companies = [companies]

        if companies is None:
            companies = self.companies

        Ncomp = len(companies)
        for ii, company in enumerate(companies):
            print("\n\n========= Analysing company %s out of %s (%s)" %
                  (ii + 1, Ncomp, company))
            self.mkdir(company)
            for gf_filename in sorted(self.gf_filenames):
                tcga = gf_filename.split("_")[1].split('.')[0]
                print("---------------- for TCGA %s" % tcga)

                # Read the results previously computed
                try:
                    results_df = self.results[tcga].df.copy()
                except:
                    results_path = "ALL/%s/OUTPUT/results.csv" % tcga
                    print("Downloading results from %s" % results_path)
                    results_df = ANOVAResults(results_path)

                results = ANOVAResults(results_df)

                # Get a DrugDecode for that company
                drug_decode_company = self.drug_decode.df.query(
                    "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)
                # Transform into a proper DrugDecode class for safety
                drug_decode_company = DrugDecode(drug_decode_company)

                # filter results using the new drug decode
                drug_ids_in_results = get_drug_id(results.df.DRUG_ID)

                mask = [
                    True if x in drug_decode_company.df.index else False
                    for x in drug_ids_in_results
                ]

                results.df = results.df.ix[mask]

                # Just to create an instance with the subset of drug_decode
                # and correct settings. This is also used to store
                # the entire input data set. So, we must remove all drugs
                # not relevant for the analysis of this company
                an = ANOVA(self.ic50_filename, gf_filename,
                           drug_decode_company)

                def drug_to_keep(drug):
                    to_keep = get_drug_id(drug) in drug_decode_company.df.index
                    return to_keep

                an.ic50.df = an.ic50.df.select(drug_to_keep, axis=1)

                an.settings = ANOVASettings(**self.settings)
                an.init()
                an.settings.directory = company + os.sep + tcga
                an.settings.analysis_type = tcga
                self.report = ANOVAReport(an, results)
                self.report.settings.analysis_type = tcga
                self.report.create_html_main(False)
                self.report.create_html_manova(False)

                if self.debug is False:
                    self.report.create_html_features()
                    self.report.create_html_associations()

                    # For now, we just copy all DRUG images from
                    # the analysis made in ALL
                    from easydev import shellcmd, Progress
                    print("\nCopying drug files")
                    drug_ids = results.df.DRUG_ID.unique()
                    pb = Progress(len(drug_ids))
                    for i, drug_id in enumerate(drug_ids):
                        # copy the HTML
                        filename = "%s.html" % drug_id
                        source = "ALL%s%s%s" % (os.sep, tcga, os.sep)
                        dest = "%s%s%s%s" % (company, os.sep, tcga, os.sep)
                        cmd = "cp %s%s %s" % (source, filename, dest)
                        shellcmd(cmd, verbose=False)
                        #copy the images
                        filename = "volcano_%s.*" % drug_id
                        source = "ALL%s%s%simages%s" % (os.sep, tcga, os.sep,
                                                        os.sep)
                        dest = "%s%s%s%simages%s" % (company, os.sep, tcga,
                                                     os.sep, os.sep)
                        cmd = "cp %s%s %s" % (source, filename, dest)
                        shellcmd(cmd, verbose=False)
                        pb.animate(i + 1)
Example #33
class MultiProcessing(object):
    """Class to run jobs in an asynchronous manner.

    You would use this class to run several jobs on a local computer that has
    several cpus.


    ::

        t = MultiProcessing(maxcpu=2)
        t.add_job(func, func_args)
        t.run()
        t.results[0] # contain returned object from the function *func*.


    .. warning:: the function must be a function, not a method. This is inherent
        to multiprocess in the multiprocessing module.

    .. warning:: the order in the results list may not be the same as the
        list of jobs. see :meth:`run` for details


    """
    def __init__(self, maxcpu=None, verbose=False, progress=True):
        """

        :param maxcpu: default returned by multiprocessing.cpu_count()
        :param verbose: print the output of each job. Could be very verbose
            so we advise keeping it False.
        :param progress: shows the progress


        """
        if maxcpu == None:
            maxcpu = cpu_count()

        self.maxcpu = maxcpu
        self.reset()
        self.verbose = verbose
        self.progress = progress

    def reset(self):
        """remove joves and results"""
        self.jobs = []  # a list of processes
        self.results = Queue()  # the results to append

    def add_job(self, func, *args, **kargs):
        """add a job in the pool"""
        if self.verbose:
            print("Adding jobs in the queue..", )
        t = Process(target=func, args=args, kwargs=kargs)
        self.jobs.append(t)

    def _cb(self, results):
        if self.verbose is True:
            print("callback", results)
        if self.progress is True:
            self.pb.animate(len(self.results) + 1)
        self.results.append(results)

    def run(self, delay=0.1, verbose=True):
        """Run all the jobs in the Pool until all have finished.

        Jobs that have been added to the job list in :meth:`add_job`
        are now processed in this method by using a Pool. Here, we add
        all jobs using the apply_async method from multiprocess module.

        In order to ensure that the jobs are run sequentially in the same
        order as in :attr:`jobs`, we introduce a delay between 2 calls
        to apply_async (see http://docs.python.org/2/library/multiprocessing.html)

        A better way may be to use a Manager but for now, this works.

        """
        from easydev import Progress
        if self.progress is True:
            self.pb = Progress(len(self.jobs), 1)
            self.pb.animate(0)

        def init_worker():
            import signal
            signal.signal(signal.SIGINT, signal.SIG_IGN)

        self.results = []
        self.pool = Pool(self.maxcpu, init_worker)

        for process in self.jobs:
            self.pool.apply_async(process._target,
                                  process._args,
                                  process._kwargs,
                                  callback=self._cb)

            # ensure the results have the same order as the jobs
            # maybe important if you expect the order of the results to
            # be the same as the input; otherwise set delay to 0
            time.sleep(delay)

        try:
            while True:
                time.sleep(1)
                # check if all processes are finished.
                # if so, finished.
                count = len(self.results)
                if count == len(self.jobs):
                    break

        except KeyboardInterrupt:
            print(
                "\nCaught interruption. " +
                "Terminating the Pool of processes... ", )
            self.pool.terminate()
            self.pool.join()
            print("... done")
        else:
            # Closing properly the pool
            self.pool.close()
            self.pool.join()

        # Pool cannot be pickled. So, if we want to pickle the "MultiProcessing"
        # class itself, we must destroy this instance
        del self.pool

        self.finished = True
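A short usage sketch mirroring the class docstring; the worker must be a plain module-level function, not a method (func below is a hypothetical example):

from math import factorial

def func(n):
    # executed in a separate process; must be a function, not a method
    return factorial(n)

t = MultiProcessing(maxcpu=2)
for n in [5, 10, 20, 30]:
    t.add_job(func, n)
t.run()
print(t.results)   # order may differ from the order in which jobs were added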
Example #34
    def download_fasta(self, filelist, output_dir=None, from_ena=True):
        """Download a FASTA (or list of)

        :param filelist: a name to find on the ENA web server OR the
            name of an accession number.

        .. warning:: The file is named after the accession without the .X
            version number. If there are several variants (.1, .2), the latest
            one is used. This should not happen if the list is properly defined.
        """
        from bioservices import ENA
        if filelist.endswith(".txt") and os.path.exists(filelist) is False:
            logger.info(
                "Downloading list from http://www.ebi.ac.uk/genomes/%s" %
                filelist)
            data = urlopen("http://www.ebi.ac.uk/genomes/%s" %
                           filelist).readlines()
            identifiers = [x.strip().decode() for x in data]
        elif filelist == "mus_musculus":  #19 +x+y chromosomes + 5 mitochondrion
            # could also add strain C57BL.
            identifiers = [
                "AY172335", "CM000209", "CM000210", "CM000211"
                "CM000212", "CM000213", "CM000214", "CM000215", "CM000216"
                "CM000217", "CM000218", "CM000219", "CM000220", "CM000221"
                "CM000222", "CM000223", "CM000224", "CM000225", "CM000226"
                "CM000227", "CM000228", "CM000229", "CM000225", "CM000226"
                "EF108342", "AB042432", "AY675564", "DQ874614"
            ]
        elif filelist == "worms":  # Caernorhabditis briggsae and elegans
            identifiers = [
                "AC186293", "FR847112", "FR847113", "FR847114", "FR847118",
                "FR847121", "FR847123", "BX284601", "BX284602", "BX284603",
                "BX284604", "BX284605", "BX284606"
            ]
        elif isinstance(filelist, str) and filelist in self._metadata.keys():
            name = self._metadata[filelist][0]
            logger.info(
                "Downloading list from http://www.ebi.ac.uk/genomes/%s" % name)
            data = urlopen("http://www.ebi.ac.uk/genomes/%s" %
                           name).readlines()
            identifiers = [x.strip().decode() for x in data]
        elif isinstance(filelist, list):
            identifiers = filelist[:]
        elif isinstance(filelist, str):
            # could be a single identifier or a filename (assuming a single
            # column)
            if os.path.exists(filelist):
                identifiers = [x for x in open(filelist).read().split()]
                identifiers = [x.strip() for x in identifiers]
            else:
                identifiers = [filelist]
        self._identifiers = identifiers

        self.results = self.ena_id_to_gi_number(identifiers)

        # do not use caching since these could be huge data sets.
        ena = ENA()

        if output_dir is None:
            output_dir = "."
        else:
            try:
                os.mkdir(output_dir)
            except:
                pass

        N = len(identifiers)
        pb = Progress(N)
        logger.info("Fetching all fasta from ENA")
        for i, identifier in enumerate(identifiers):
            filenames = glob.glob(output_dir + os.sep + "ENA_%s*" % identifier)

            if len(filenames) >= 1:
                pb.animate(i + 1)
                # no need to fetch and save the data it looks like...
                continue

            # download data from ENA
            data = ena.get_data(identifier, "fasta")

            # Split header and Fasta
            header, others = data.decode().split("\n", 1)

            # Source of failure:
            # - list and DB are not synchronised: e.g. some entries may be deleted
            if "suppressed" in header:
                continue
            if ">" not in header:
                continue

            # Do not use try/except since when it fails, this is a real issue
            name = header.strip(">").split(" ")[0]
            db, id_, acc = name.split("|")

            try:
                header = self.switch_header_to_gi(acc)
            except:
                logger.error("Failed for this entry:")
                logger.error(identifier)
                logger.error(header)
                logger.error(name)
                continue

            # Save to local file
            # WARNINGS: extension is .fa because kraken-build expects .fa files
            filename = "%s_%s.fa" % (db, acc.split(".")[0])
            if output_dir:
                filename = output_dir + os.sep + filename

            with open(filename, "w") as fout:
                fout.write(header + "\n" + others)
            pb.animate(i + 1)
Example #35
    def filter(self,
               identifiers_list=[],
               min_bp=None,
               max_bp=None,
               progressbar=True,
               output_filename='filtered.fastq'):
        """Save reads in a new file if there are not in the identifier_list

        :param int min_bp: ignore reads with length shorter than min_bp
        :param int max_bp: ignore reads with length above max_bp

        """
        # about 7 seconds to scan a file of 750,000 reads
        # when no identifiers are given

        if min_bp is None:
            min_bp = 0

        if max_bp is None:
            max_bp = 1e9

        # make sure we are at the beginning
        self.rewind()

        output_filename, tozip = self._istozip(output_filename)

        with open(output_filename, "w") as fout:
            pb = Progress(self.n_reads)
            buf = ""
            filtered = 0
            saved = 0

            for count, lines in enumerate(grouper(self._fileobj)):
                identifier = lines[0].split()[0]
                if lines[0].split()[0].decode() in identifiers_list:
                    filtered += 1
                else:  #pragma: no cover
                    N = len(lines[1])
                    if N <= max_bp and N >= min_bp:
                        buf += "{}{}+\n{}".format(lines[0].decode("utf-8"),
                                                  lines[1].decode("utf-8"),
                                                  lines[3].decode("utf-8"))
                        saved += 1
                    else:
                        filtered += 1
                    if count % 100000 == 0:
                        fout.write(buf)
                        buf = ""
                if progressbar is True:
                    pb.animate(count + 1)
            fout.write(buf)
            if filtered < len(identifiers_list):  #pragma: no cover
                print(
                    "\nWARNING: not all identifiers were found in the fastq file to "
                    + "be filtered.")
        logger.info("\n{} reads were filtered out and {} saved in {}".format(
            filtered, saved, output_filename))

        if tozip is True:  #pragma: no cover
            logger.info("Compressing file")
            self._gzip(output_filename)
Example #36
    def _get_data(self, name, params):

        # keep the number of events we want and original offset
        max_data = params['limit']
        offset = params['offset']

        # I noticed that
        # if offset + limit > total_count, then limit is set to 1000 - offset
        # Not sure whether it is a bug or intended behaviour but this caused
        # some issues during the debugging.

        # So http_get("mechanism?format=json&limit=10000&offset=10")
        # returns 990 entries and not 1000 as expected.

        # if a resources is small (e.g. tissue has 655 < 1000 entries) there is
        # no such issues.

        # So, the best is to constraint limit to 1000
        params['limit'] = 1000  # for the first call

        # The limit used in all other calls
        limit = 1000

        res = self.http_get("{}".format(name), params=params)
        self._check_request(res)

        # get rid of page_meta key/value
        self.page_meta = res['page_meta']
        keys = list(res.keys())
        keys.remove('page_meta')
        names = keys[0]  # the parameter name in plural form

        # keep first chunk of data
        data = res[names]

        if max_data == -1:
            max_data = res['page_meta']['total_count']
        elif max_data > res['page_meta']['total_count']:
            max_data = res['page_meta']['total_count']

        N = max_data
        from easydev import Progress
        pb = Progress(N)
        count = 1

        while res["page_meta"]['next'] and len(data) < max_data:
            params['limit'] = limit
            params['offset'] = limit * count + offset
            res = self.http_get("{}".format(name), params=params)
            data += res[names]
            count += 1
            pb.animate(len(data))
            self.page_meta = res['page_meta']

        if self.page_meta['next']:
            offset = self.page_meta['offset']
            total = self.page_meta['total_count'] - len(data) - int(offset)
            self.logging.warning(
                'More data available ({}). Rerun with a higher '
                'limit and/or offset {}. Check the content of the '
                'page_meta attribute'.format(total, offset))

        if len(data) > max_data:
            return data[0:max_data]
        else:
            return data
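
# Standalone sketch of the pagination pattern used by _get_data above: fetch
# a REST resource 1000 records at a time until max_data records, or the end
# of the resource, is reached. The http_get callable and the page_meta
# structure are assumptions modelled on the method above, not a documented API.
def fetch_all(http_get, name, max_data=-1, offset=0):
    params = {"format": "json", "limit": 1000, "offset": offset}
    res = http_get(name, params=params)
    key = [k for k in res if k != "page_meta"][0]  # resource name in plural form
    data = res[key]
    total = res["page_meta"]["total_count"]
    if max_data == -1 or max_data > total:
        max_data = total
    count = 1
    while res["page_meta"]["next"] and len(data) < max_data:
        params["offset"] = offset + 1000 * count
        res = http_get(name, params=params)
        data += res[key]
        count += 1
    return data[:max_data]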
Ejemplo n.º 37
0
    def get_feature_counts_eukaryotes(self, feature=None, attribute=None):

        if feature is None:
            feature = "gene"

        if attribute is None:
            attribute = "ID"

        # work on a copy so that we do not lose the original
        df = self.df.copy()

        # Name contains the salmon entries created by gffread, which uses
        # transcript_id. From each transcript id, we retrieve the gene (parent).
        df['Gene'] = [self.trs2genes[x] for x in self.df.Name]

        #groups = df.groupby('Gene').groups
        counts_on_genes = df.groupby('Gene').NumReads.sum()

        ff = self.filename.split("/")[-1]
        results = f"\nGeneid\tChr\tStart\tEnd\tStrand\tLength\t{ff}"

        # mouse 25814 gene (feature)
        #       53715 gene_id (attribute)
        #      135181 transcript_id (attribute)
        #      133618 transcript_id from salmon
        #      135181 entries in transcript fasta (gffread)

        # gffread extracts transcript_id from the GFF if present;
        # otherwise, it extracts geneID or gene_id
        logger.info("Recreating the feature counts")

        genes = {}

        dd = self.gff.df.query("ID in @counts_on_genes.index")
        dd = dd.set_index("ID")
        dd = dd.loc[counts_on_genes.index]
        self.dd = dd

        types = dd['type'].values
        starts = dd['start'].values
        stops = dd['stop'].values
        strands = dd['strand'].values
        seqids = dd['seqid'].values

        from easydev import Progress
        pb = Progress(len(counts_on_genes))

        S = 0

        logger.info("Grouping")
        TPMgroup = df.groupby('Gene').apply(lambda group: group['TPM'].sum())
        efflength_null = df.groupby('Gene').apply(
            lambda group: group['EffectiveLength'].mean())

        groups = df.groupby('Gene')
        for i, name in enumerate(counts_on_genes.index):
            # Since we use ID, there should be only one hit. We select the
            # first one to convert it to a Series.

            tpm_sum = TPMgroup.loc[name]
            if tpm_sum == 0:
                length = efflength_null.loc[name]
            else:
                abundances = groups.get_group(name).TPM
                efflength = groups.get_group(name).EffectiveLength
                length = sum([x * y for x, y in zip(abundances, efflength)
                              ]) / abundances.sum()
                S += abundances.sum()

            # FIXME we keep only the 'gene' type to agree with the output of
            # STAR/bowtie when working on the gene feature. What would happen
            # if salmon were compared with other types of features?
            if types[i] == "gene":
                start = starts[i]
                stop = stops[i]
                seqid = seqids[i]
                strand = strands[i]
                NumReads = counts_on_genes.loc[name]
                name = name.replace("gene:", "")
                results += f"\n{name}\t{seqid}\t{start}\t{stop}\t{strand}\t{length}\t{NumReads}"
            else:
                pass
            pb.animate(i + 1)
        return results
        """
Ejemplo n.º 38
0
    def check_randomness(self,
                         drug_name,
                         kfolds=10,
                         N=10,
                         progress=False,
                         nbins=40,
                         show=True,
                         **kargs):
        """Compute Bayes factor between NULL model and best model fitted N times


        :param drug_name:
        :param kfolds:
        :param int N: optimise NULL models and real model N times
        :param progress:
        :param nbins:
        :param show:

        Bayes factor::

            S = sum([s>r for s,r in zip(scores, random_scores)])
            proba = S / len(scores)
            bayes_factor = 1. / (1-proba)

        Interpretation for values of the Bayes factor according to Kass 
        and Raftery (1995).

        ============================  ==================
            Interpretation                  B(1,2)
        ============================  ==================
           Very strong support for 1   < 0.0067
                    Strong support 1   0.0067 to 0.05
              Positive support for 1   0.05 to .33
                  Weak support for 1   0.33 to 1
         No support for either model   1
                  Weak support for 2   1 to 3
              Positive support for 2   3 to 20
                Strong support for 2   20 to 150
           Very strong support for 2   > 150
        ============================  ==================

        references: http://www.socsci.uci.edu/~mdlee/LodewyckxEtAl2009.pdf
            http://www.aarondefazio.com/adefazio-bayesfactor-guide.pdf
        """
        scores = []
        pb = Progress(N)
        for i in range(N):
            # Fit a model using CV
            inter_results = self.runCV(drug_name,
                                       kfolds=kfolds,
                                       verbose=False,
                                       **kargs)
            scores.append(inter_results.Rp)
            if progress:
                pb.animate(i + 1)

        random_scores = []
        pb = Progress(N)
        for i in range(N):
            # Fit a model using CV
            inter_results = self.runCV(drug_name,
                                       kfolds=kfolds,
                                       randomize_Y=True,
                                       verbose=False,
                                       **kargs)
            random_scores.append(inter_results.Rp)
            if progress:
                pb.animate(i + 1)

        from scipy.stats import ttest_ind
        ttest_res = ttest_ind(scores, random_scores)
        results = {
            "scores": scores,
            "random_scores": random_scores,
            "ttest_pval": ttest_res.pvalue
        }

        # Compute the log of the Bayes factor to avoid underflow as communicated
        # by M.Menden.
        S = sum([s > r for s, r in zip(scores, random_scores)])
        proba = S / len(scores)
        if proba == 1:
            # Set the maximum instead of infinite
            # bayes_factor = np.inf
            bayes_factor = 1. / (1. / len(scores))
        else:
            bayes_factor = 1. / (1 - proba)
        results['bayes_factor'] = bayes_factor

        M = max(max(scores), max(random_scores)) * 1.2
        m = min(min(scores), min(random_scores)) * 1.2
        if show:
            bins = pylab.linspace(m, M, nbins)
            pylab.clf()
            pylab.hist(scores, bins=bins, color="b", alpha=0.5)
            pylab.hist(random_scores, color="r", alpha=0.5, bins=bins)
            pylab.title("Bayes factor=%(bayes_factor).2f" % results)
            pylab.grid(True)
            pylab.xlabel("Coefficient of correlation Rp")
            pylab.xlabel("#")

        return results
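
# Tiny worked example of the Bayes factor described in the docstring above
# (the score values are made up for illustration):
scores = [0.4, 0.5, 0.6, 0.7]          # correlations of the real model
random_scores = [0.1, 0.6, 0.2, 0.3]   # correlations of the NULL model
S = sum(s > r for s, r in zip(scores, random_scores))  # 3 wins out of 4
proba = S / len(scores)                                # 0.75
bayes_factor = 1. / (1 - proba)                        # 4.0 -> positive support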
Ejemplo n.º 39
0
    def anova_all(self, animate=True, drugs=None, multicore=None):
        """Run all ANOVA tests for all drugs and all features.

        :param drugs: you may select a subset of drugs
        :param animate: shows the progress bar
        :return: an :class:`~gdsctools.anova_results.ANOVAResults`
            instance with the dataframe
            stored in an attribute called **df**

        Calls :meth:`anova_one_drug` for each drug and concatenate all
        results together. Note that once all data are gathered,
        :meth:`add_pvalues_correction` is called to fill a new column
        with FDR corrections.

        An extra column named "ASSOC_ID" is also added with
        a unique identifier sorted by ascending FDR.

        .. note:: A thorough comparison with version v17 gives the same FDR
            results (difference ~1e-6); Note however that the qvalue results
            differ by about 0.3% due to different smoothing in R and Python.
        """
        if self.verbose and len(self.individual_anova):
            print("Reusing some results from the buffer. "
                  "To reset the buffer, call reset_buffer() method")
        # drop drugs where the number of non-null IC50 values is below the
        # required minimum (settings.minimum_nonna_ic50).
        # axis=0 is the default but we emphasize that the sum is over
        # columns (i.e., drugs)
        vv = self.ic50.df.notnull().sum(axis=0)
        # FIXME: should be in one_drug_one_feature ??
        drug_names = vv.index[vv >= self.settings.minimum_nonna_ic50]

        # if user provided a list of drugs, use them:
        if drugs is not None:
            # todo: check validity of the drug names
            drug_names = drugs[:]

        pb = Progress(len(drug_names), 1)
        drug_names = list(drug_names)
        #pylab.shuffle(drug_names) # ? why

        if animate is True:
            pb.animate(0)

        if multicore:
            # Note that here, we do not use the buffer
            multicore_analysis(self, drug_names, multicore)
        else:

            for i, drug_name in enumerate(drug_names):
                if drug_name in self.individual_anova.keys():
                    pass
                else:
                    res = self.anova_one_drug(drug_name,
                                              animate=False,
                                              output='dataframe')
                    self.individual_anova[drug_name] = res
                if animate is True:
                    pb.animate(i + 1)
        print("\n")
        if len(self.individual_anova) == 0:
            return ANOVAResults()

        df = pd.concat(self.individual_anova, ignore_index=True)

        if len(df) == 0:
            return df
        # sort all data by ANOVA p-values
        try:
            df.sort_values('ANOVA_FEATURE_pval', inplace=True)
        except:
            df.sort('ANOVA_FEATURE_pval', inplace=True)

        # all ANOVA have been computed individually for each drug and each
        # feature. Now, we need to compute the multiple testing corrections
        if self.settings.pvalue_correction_level is True:
            df = self.add_pvalues_correction(df)
        else:
            pass

        # insert a unique identifier as first column
        df.insert(0, 'ASSOC_ID', range(1, len(df) + 1))

        self.df = df
        # order the column names as defined in the __init__ method
        df = df[self.column_names]
        df.reset_index(inplace=True, drop=True)

        return ANOVAResults(df, self.settings)
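
# Minimal sketch of the kind of FDR correction that add_pvalues_correction
# refers to (Benjamini-Hochberg), using statsmodels. The actual gdsctools
# implementation may differ in detail; this only illustrates the concept.
import numpy as np
from statsmodels.stats.multitest import multipletests

pvalues = np.array([0.001, 0.01, 0.03, 0.2, 0.7])
reject, fdr, _, _ = multipletests(pvalues, method="fdr_bh")
print(fdr)  # BH-adjusted p-values, same order as the input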
Ejemplo n.º 40
0
    def anova_one_drug(self, drug_id, animate=True, output='object'):
        """Computes ANOVA for a given drug across all features

        :param str drug_id: a valid drug identifier.
        :param animate: shows the progress bar
        :return: a dataframe

        Calls :meth:`anova_one_drug_one_feature` for each feature.
        """
        # Drop the first and second columns, which are made of strings (this
        # works under Python 2 but not Python 3). We assume that the first two
        # columns are the sample name and the tissue feature.
        # Then, we keep only cases with at least 3 features.
        # MSI could be used but is not like in original R code.
        features = self.features.df.copy()
        # need to skip the FACTOR to keep only features
        shift = self.features.shift

        features = features[features.columns[shift:]]
        # FIXME what about features with less than 3 zeros ?
        mask = features.sum(axis=0) >= 3

        # TODO: MSI, tissues, name must always be kept
        #
        selected_features = features[features.columns[mask]]

        # scan all features for a given drug
        assert drug_id in self.ic50.df.columns
        N = len(selected_features.columns)
        pb = Progress(N, 10)
        res = {}
        #
        for i, feature in enumerate(selected_features.columns):
            # production True, means we do not want to create a DataFrame
            # for each call to the anova_one_drug_one_feature function
            # Instead, we require dictionaries
            this = self.anova_one_drug_one_feature(drug_id,
                                                   feature,
                                                   production=True)
            if this['ANOVA_FEATURE_pval'] is not None:
                res[feature] = this
            if animate is True:
                pb.animate(i + 1)

        # if production is False:
        # df = pd.concat(res, ignore_index=True)
        df = pd.DataFrame.from_records(res)
        df = df.T

        df = ANOVAResults().astype(df)
        if len(df) == 0:
            return df

        # append DRUG_NAME/DRUG_TARGET columns
        df = self.drug_decode.drug_annotations(df)

        # TODO: drop rows where ANOVA_FEATURE_PVAL is None
        if output != 'object':
            df = self.add_pvalues_correction(df)
            return df
        else:
            df = self.add_pvalues_correction(df)
            res = ANOVAResults(df, self.settings)
            res.settings = ANOVASettings(**self.settings)
            return res
Ejemplo n.º 41
0
    def plot_cindex(self,
                    drug_name,
                    alphas,
                    l1_ratio=0.5,
                    kfolds=10,
                    hold=False):
        """Tune alpha parameter using concordance index


        For each alpha in the provided list, run the elastic net analysis
        with the given **l1_ratio** and **kfolds**, compute the concordance
        index (CIndex) for each fold, and select the alpha with the best
        mean test CIndex.

        .. warning:: this is slow (about 300 seconds for 10 folds
            and 80 alphas on the GDSCv5 data set).
        """
        from dreamtools.core.cindex import cindex

        CI_train = {}
        CI_test = {}
        for c in range(kfolds):
            CI_train[c] = []
            CI_test[c] = []

        pb = Progress(len(alphas))

        for i, alpha in enumerate(alphas):
            self.fit(drug_name, alpha=alpha, l1_ratio=l1_ratio, kfolds=kfolds)

            # Look at the results and store cindex
            for kf in range(kfolds):
                x_train = self.kfold_data['x_train'][kf].values
                y_train = self.kfold_data['y_train'][kf].values

                x_test = self.kfold_data['x_test'][kf].values
                y_test = self.kfold_data['y_test'][kf].values

                x_train_pred = self.en.predict(x_train)
                x_test_pred = self.en.predict(x_test)

                CI_test[kf].append(1 - cindex(x_test_pred, y_test, [True] *
                                              len(y_test)))
                CI_train[kf].append(1 - cindex(x_train_pred, y_train, [True] *
                                               len(y_train)))
            pb.animate(i + 1)

        mu_train = pd.DataFrame(CI_train).transpose().mean()
        sigma_train = pd.DataFrame(CI_train).transpose().std()

        mu_test = pd.DataFrame(CI_test).transpose().mean()
        sigma_test = pd.DataFrame(CI_test).transpose().std()

        best_alpha = alphas[pd.DataFrame(CI_test).mean(axis=1).argmax()]

        pylab.clf()
        pylab.errorbar(pylab.log(alphas),
                       mu_train,
                       yerr=sigma_train,
                       label="train")
        pylab.errorbar(pylab.log(alphas) + .1,
                       mu_test,
                       yerr=sigma_test,
                       label="test")
        pylab.plot(pylab.log(alphas), mu_train, 'ob')
        pylab.plot(pylab.log(alphas) + .1, mu_test, 'or')
        pylab.legend()
        pylab.axvline(pylab.log(best_alpha), lw=2, color="purple")

        return best_alpha
Ejemplo n.º 42
0
    def score_challengeB(self, filenames):
        # Ideally provide 3 filenames but if only 1 is given, try
        # to infer the names of the 2 others
        cor_pheno1 = []
        cor_pheno2 = []
        pval_pheno1 = []
        pval_pheno2 = []
        scores = []
        from dreamtools.core.rtools import RTools
        rtool = RTools(verboseR=False)

        assert len(filenames) == 3, "Must provide 3 files"

        self.golds = []
        self.preds = []
        gold_filenames = self.download_goldstandard('B')
        print("Warning: your 3 submissions should be ordered as B1, B2, B3 files")

        for tag in [1, 2, 3]:
            # assuming data and gold standard are sorted in the same way!
            gold = pd.read_csv(gold_filenames[tag-1], sep='[ \t]', 
                    engine='python')
            self.golds.append(gold)

            #filename = 'DREAM5_SysGenB%s_your_Predictions.txt' % tag
            #filename = self._pj([self.classpath, 'data', filename])
            filename = filenames[tag-1]
            pred1 = pd.read_csv(filename, sep='[ \t]', engine='python')
            self.preds.append(pred1)

            # correlation gs versus predicted
            rtool.session.t = pred1.ix[0].values
            rtool.session.g = gold.ix[0].values
            rtool.session.run("results = cor.test(t, g, method='spearman', alternative='greater')")
            T1 = rtool.session.results.copy()

            rtool.session.t = pred1.ix[1].values
            rtool.session.g = gold.ix[1].values
            rtool.session.run("results = cor.test(t, g, method='spearman', alternative='greater')")
            T2 = rtool.session.results.copy()
            cor_pheno1.append(T1['estimate'])
            cor_pheno2.append(T2['estimate'])
            pval_pheno1.append(T1['p.value'])
            pval_pheno2.append(T2['p.value'])

            score = -(np.log(T1['p.value']) + np.log(T2['p.value']))
            scores.append(score)

        self.corp1 = cor_pheno1
        self.corp2 = cor_pheno2
        self.pval1 = pval_pheno1
        self.pval2 = pval_pheno2
        self.scores = scores

        # This part now compute the pvalues using random prediction
        random_scores = {0:[],1:[],2:[]}

        from easydev import Progress
        pb = Progress(self.N_pvalues, interval=1)

        for ii in range(1, self.N_pvalues):
            for tag in [0,1,2]:
                #generate random coordinates
                coord = random.sample(['RIL%s' % i for i in range(1,31)], 30)
                coord2 = random.sample(['RIL%s' % i for i in range(1,31)], 30)

                # Obtaining random scores
                rtool.session.t = self.preds[tag].ix[0].ix[coord].values
                rtool.session.g = self.golds[tag].ix[0].values
                rtool.session.run("results = cor.test(t, g, method='spearman', alternative='greater')")
                T1 = rtool.session.results.copy()
                rtool.session.t = self.preds[tag].ix[1].ix[coord2].values
                rtool.session.g = self.golds[tag].ix[1].values
                rtool.session.run("results = cor.test(t, g, method='spearman', alternative='greater')")
                T2 = rtool.session.results.copy()

                random_scores[tag].append(-(np.log(T1['p.value']) + np.log(T2['p.value'])))
            pb.animate(ii+1)
        self.random_scores = random_scores
        #Obtaining p-values
        pvals = [sum(x >= self.scores[k] for x in self.random_scores[k]) / float(self.N_pvalues)
                 for k in [0, 1, 2]]
        self.pvals = pvals

        df = pd.DataFrame({'scores':self.scores,
            'correlation_phenotype1':cor_pheno1,
            'correlation_phenotype2':cor_pheno2,
            'pvalues_phenotype1':pval_pheno1,
            'pvalues_phenotype2':pval_pheno2,
            'pvalues':self.pvals})
        df = df.T
        df.columns = ['SysGenB1', 'SysGenB2', 'SysGenB3']
        return df
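
# Illustration of the empirical p-value computed above: the fraction of
# random scores that reach or exceed the observed score (made-up values).
import numpy as np

random_scores = np.array([1.2, 3.4, 0.8, 2.2, 5.0])
observed = 3.0
pval = (random_scores >= observed).sum() / float(len(random_scores))
print(pval)  # 0.4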
Ejemplo n.º 43
0
    def compounds2accession(self, compounds):
        """For each compound, identifies the target and corresponding UniProt
        accession number

        This is not part of the ChEMBL API.

        ::

            # we recommend to use cache if you use this method regularly
            c = Chembl(cache=True)
            drugs = c.get_approved_drugs()

            # to speed up example
            drugs = drugs[0:20]
            IDs = [x['molecule_chembl_id'] for x in drugs]

            c.compounds2accession(IDs)

        """
        # we jump from compounds to targets through activities
        # Here this is a one to many mapping so we initialise a default
        # dictionary.
        from collections import defaultdict
        compound2target = defaultdict(set)

        filter = "molecule_chembl_id__in={}"
        from easydev import Progress

        if isinstance(compounds, list):
            pass
        else:
            compounds = list(compounds)

        pb = Progress(len(compounds))
        for i in range(0, len(compounds)):
            # FIXME could fetch activities in batches using e.g.
            # ",".join(compounds[i:i+10])
            activities = self.get_activity(filters=filter.format(compounds[i]))
            # get target ChEMBL IDs from activities
            for act in activities:
                compound2target[act['molecule_chembl_id']].add(
                    act['target_chembl_id'])
            pb.animate(i + 1)

        # We now need the details of every target found in the previous step.
        # Each compound/drug may have hundreds of targets, and calling
        # get_target for each of those lists would take forever. Instead,
        # because there are *only* about 12,000 targets, we download all of
        # them. This took about 4 minutes in this test, but with the cache
        # enabled subsequent runs are much quicker. We do not do this at the
        # activities level because there are too many entries.
        targets = self.get_target(limit=-1)

        # identifies all target chembl id to easily retrieve the entry later on
        target_names = [target['target_chembl_id'] for target in targets]

        # retrieve all uniprot accessions for all targets of each compound
        for compound, targs in compound2target.items():
            accessions = set()
            for target in targs:
                index = target_names.index(target)
                accessions = accessions.union([
                    comp['accession']
                    for comp in targets[index]['target_components']
                ])
            compound2target[compound] = accessions

        return compound2target
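
# Tiny illustration of the one-to-many mapping built above with a
# defaultdict(set); the identifiers below are arbitrary placeholders.
from collections import defaultdict

compound2target = defaultdict(set)
for compound, target in [("CMP_A", "T1"), ("CMP_A", "T2"), ("CMP_B", "T1")]:
    compound2target[compound].add(target)
print(dict(compound2target))  # {'CMP_A': {'T1', 'T2'}, 'CMP_B': {'T1'}} (set order may vary)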
Ejemplo n.º 44
0
    def get_graph(self, go_ids, ontologies=None, progress=True):
        # Here we filter the data to keep only the relevant go terms as shown in
        # panther pie chart
        import networkx as nx
        gg = nx.DiGraph()

        #assert ontology in ['MF', 'BP', 'CC']
        if ontologies is None:
            ontologies = ['MF', 'BP', 'CC']
        elif isinstance(ontologies, str):
            ontologies = [ontologies]
        ancestors = [self.ancestors[x] for x in ontologies]

        levels = []
        real_ids = []
        obsolets = []
        from easydev import Progress
        pb = Progress(len(go_ids))
        print('Retrieving info for each significant GO term')
        annotations = {}

        for i, go_id in enumerate(go_ids):

            # Some GO terms may be obsolete or renamed. Other functions may
            # fail simply because the ID has changed.
            info = self.quickgo.get_go_terms(go_id)
            annotations[go_id] = info

            if info[0]['id'] != go_id:
                _id = info[0]['id']
                print('changed {} to {}'.format(go_id, _id))
                annotations[_id] = info
            else:
                _id = go_id
            aspect = info[0]['aspect']
            if info[0]['isObsolete'] is True:
                print("Skipping obsole go terms: {}".format(go_id))
                obsolets.append(go_id)
                continue
            real_ids.append(_id)

            # now figure out the distance to main ancestor
            # we can try several times
            #if _id != self.ancestors[ontology]:
            for ancestor in ancestors:

                edges = self.quickgo.get_go_paths(_id, ancestor)
                if edges == 400:
                    print("Could not retrieve {} to {}".format(_id, ancestor))
                    continue
                if edges["numberOfHits"] == 0:
                    continue
                if len(edges["results"]) >= 1:
                    for path in edges["results"]:
                        for edge in path:
                            gg.add_edge(edge['child'], edge["parent"])
                else:
                    print(_id, edges["results"])
            if progress is True:
                pb.animate(i + 1)

        self.obsolets = obsolets
        self.annotations = annotations
        self.graph = gg
        all_paths = {}
        for ancestor in ancestors:
            if ancestor not in gg:
                continue
            paths = nx.shortest_path_length(gg, target=ancestor)
            for obsolet in obsolets:
                paths[obsolet] = 100
            all_paths[ancestor] = paths

        return all_paths
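
# Minimal illustration of the shortest_path_length call used above on a toy
# GO-like DAG (edges point from child to parent, as in the code above):
import networkx as nx

gg = nx.DiGraph()
gg.add_edge("GO:child", "GO:mid")
gg.add_edge("GO:mid", "GO:root")
paths = nx.shortest_path_length(gg, target="GO:root")
print(paths)  # e.g. {'GO:child': 2, 'GO:mid': 1, 'GO:root': 0}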
Ejemplo n.º 45
0
    def _get_info(self):
        """Populates the data structures for plotting.

        Will be called on request"""

        stats = {"A":0, "C":0, "G":0, "T":0, "N":0}
        stats["qualities"] = []
        stats["mean_qualities"] = []
        stats["mean_length"] = 0
        stats["sequences"] = []

        minimum = 1e6
        maximum = 0
        # FIXME computing self.N takes time in the constructor
        # do we need it?
        self.lengths = np.empty(self.N)
        self.gc_list = []
        total_length = 0
        C = defaultdict(int)
        if self.verbose:
            pb = Progress(self.N)

        sequences = []
        mean_qualities = []
        qualities = []
        # could use multiprocessing
        # FastxFile has shown some errors while handling gzip files
        # created with zlib (e.g. from atropos). This is now replaced
        # by the Atropos FastqReader for now.
        #fastq = pysam.FastxFile(self.filename)

        with FastqReader(self.filename) as f:
            for i, record in enumerate(f):
                N = len(record.sequence)
                if N == 0:
                    raise ValueError("Read {} has a length equal to zero. Clean your FastQ files".format(i))
                self.lengths[i] = N

                # we cannot store the qualities and sequences of all reads, so
                # only the first max_sample reads are stored:
                if i < self.max_sample:
                    quality = [ord(x) -33 for x in record.qualities]
                    mean_qualities.append(sum(quality) / N)
                    qualities.append(quality)
                    sequences.append(record.sequence)

                # store count of all qualities
                for k in record.qualities:
                    C[k] += 1

                GG = record.sequence.count('G') 
                CC = record.sequence.count('C')
                self.gc_list.append((GG+CC)/float(N)*100)

                # not using a Counter or an extra loop speeds up the code
                stats["A"] += record.sequence.count("A")
                stats["C"] += CC
                stats["G"] += GG
                stats["T"] += record.sequence.count("T")
                stats["N"] += record.sequence.count("N")

                total_length += len(record.sequence)

                if self.verbose:
                    pb.animate(i+1)

        # other data
        self.qualities = qualities
        self.mean_qualities = mean_qualities
        self.minimum = int(self.lengths.min())
        self.maximum = int(self.lengths.max())
        self.sequences = sequences
        self.gc_content = np.mean(self.gc_list)
        stats['mean_length'] = total_length / float(self.N)
        stats['total_bp'] = stats['A'] + stats['C'] + stats['G'] + stats["T"] + stats['N']
        stats['mean_quality'] = sum([(ord(k) -33)*v for k,v in C.items()]) / stats['total_bp']

        self.stats = stats
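
# Quick illustration of the Phred+33 conversion used above: each quality
# character encodes a score of ord(char) - 33.
qualities = "IIIFF#"
scores = [ord(c) - 33 for c in qualities]
print(scores)  # [40, 40, 40, 37, 37, 2]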
Ejemplo n.º 46
0
    def fit(self, amp=1, progress=False):
        r"""Loop over distributions and find best parameter to fit the data for each

        When a distribution is fitted onto the data, we populate a set of
        dataframes:

            - :attr:`df_errors`  :sum of the square errors between the data and the fitted
              distribution i.e., :math:`\sum_i \left( Y_i - pdf(X_i) \right)^2`
            - :attr:`fitted_param` : the parameters that best fit the data
            - :attr:`fitted_pdf` : the PDF generated with the parameters that best fit the data

        Indices of the dataframes contains the name of the distribution.

        """
        import warnings
        warnings.filterwarnings("ignore", category=RuntimeWarning)

        from easydev import Progress
        N = len(self.distributions)
        pb = Progress(N)
        for i, distribution in enumerate(self.distributions):
            try:
                # need a subprocess to check time it takes. If too long, skip it
                dist = eval("scipy.stats." + distribution)

                # TODO here, dist.fit may take a while or just hang forever
                # with some distributions. So, I thought of using the signal module
                # to interrupt the fit when it takes too long. That did not work,
                # presumably because another try/exception is inside the
                # fit function, so I used threading with a recipe from stackoverflow
                # See timed_run function above
                param = self._timed_run(dist.fit,
                                        distribution,
                                        args=self._data)

                # with signal, this does not work, maybe because another exception
                # is caught. We assume the parameter order returned by fit matches pdf.
                pdf_fitted = dist.pdf(self.x, *param)

                self.fitted_param[distribution] = param[:]
                self.fitted_pdf[distribution] = pdf_fitted

                # calculate error
                sq_error = pylab.sum(
                    (self.fitted_pdf[distribution] - self.y)**2)

                # calculate information criteria
                logLik = np.sum(dist.logpdf(self.x, *param))
                k = len(param[:])
                n = len(self._data)
                aic = 2 * k - 2 * logLik
                bic = n * np.log(sq_error / n) + k * np.log(n)

                # calculate the Kullback-Leibler divergence
                kullback_leibler = kl_div(self.fitted_pdf[distribution],
                                          self.y)

                logging.info("Fitted {} distribution with error={})".format(
                    distribution, sq_error))

                # compute some errors now
                self._fitted_errors[distribution] = sq_error
                self._aic[distribution] = aic
                self._bic[distribution] = bic
                self._kldiv[distribution] = kullback_leibler
            except Exception as err:  #pragma: no cover
                logging.warning(
                    "SKIPPED {} distribution (taking more than {} seconds)".
                    format(distribution, self.timeout))
                # if we cannot compute the error, set it to large values
                self._fitted_errors[distribution] = np.inf
                self._aic[distribution] = np.inf
                self._bic[distribution] = np.inf
                self._kldiv[distribution] = np.inf
            if progress:
                pb.animate(i + 1)

        self.df_errors = pd.DataFrame({
            'sumsquare_error': self._fitted_errors,
            'aic': self._aic,
            'bic': self._bic,
            'kl_div': self._kldiv
        })
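
# Numeric sketch of the information criteria computed in the loop above
# (illustrative values only, not real fitting output):
import numpy as np

logLik = -1234.5   # sum of the log-pdf of the data under the fitted parameters
k = 3              # number of fitted parameters
n = 20000          # number of data points
sq_error = 1e-4    # sum of squared errors between data PDF and fitted PDF
aic = 2 * k - 2 * logLik                        # 2475.0
bic = n * np.log(sq_error / n) + k * np.log(n)
print(aic, bic)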
Ejemplo n.º 47
0
class Fitter(object):
    """Fit a data sample to known distributions

    A naive approach, often used to figure out the underlying distribution that
    could have generated a data set, is to compare the histogram of the data with
    the PDF (probability density function) of a known distribution (e.g., normal).

    Yet, the parameters of the distribution are not known and there are lots of
    distributions. Therefore, an automatic way to fit many distributions to the data
    would be useful, which is what is implemented here.

    Given a data sample, we use the `fit` method of SciPy to extract the parameters
    of that distribution that best fit the data. We repeat this for all available distributions.
    Finally, we provide a summary so that one can see the quality of the fit for all those distributions.

    Here is an example where we generate a sample from a gamma distribution.

    ::

        >>> # First, we create a data sample following a Gamma distribution
        >>> from scipy import stats
        >>> data = stats.gamma.rvs(2, loc=1.5, scale=2, size=20000)

        >>> # We then create the Fitter object
        >>> import fitter
        >>> f = fitter.Fitter(data)

        >>> # just a trick to use only 10 distributions instead of 80 to speed up the fitting
        >>> f.distributions = f.distributions[0:10] + ['gamma']

        >>> # fit and plot
        >>> f.fit()
        >>> f.summary()
                sumsquare_error
        gamma          0.000095
        beta           0.000179
        chi            0.012247
        cauchy         0.044443
        anglit         0.051672
        [5 rows x 1 columns]

    Once the data has been fitted, the :meth:`summary` method returns a dataframe sorted by
    goodness of fit (best distributions first).

    Looping over the 80 distributions in SciPy can take some time, so you can overwrite the
    :attr:`distributions` attribute with a subset if you want. In order to reload all distributions,
    call :meth:`load_all_distributions`.

    Some distributions do not converge when fitting. There is a timeout of 30 seconds after which
    the fitting procedure is cancelled. You can change this :attr:`timeout` attribute if needed.

    If the histogram of the data has outliers or very long tails, you may want to increase the
    number of :attr:`bins` or to ignore data below or above a certain range. This can be achieved
    by setting the :attr:`xmin` and :attr:`xmax` attributes. If you set xmin, you can come back to
    the original data by setting xmin to None (same for xmax) or just recreate an instance.
    """

    def __init__(self, data, xmin=None, xmax=None, bins=100,
                 distributions=None, timeout=30,
                 density=True):
        """.. rubric:: Constructor

        :param list data: a numpy array or a list
        :param float xmin: if None, use the data minimum value, otherwise histogram and
            fits will be cut
        :param float xmax: if None, use the data maximum value, otherwise histogram and
            fits will be cut
        :param int bins: numbers of bins to be used for the cumulative histogram. This has
            an impact on the quality of the fit.
        :param list distributions: give a list of distributions to look at. If None, use
            all scipy distributions that have a fit method. If you want to use
            only one distribution and know its name, you may provide a string (e.g.
            'gamma'). Finally, you may set it to 'common' to include only common
            distributions, which are: cauchy, chi2, expon, exponpow, gamma,
            lognorm, norm, powerlaw, rayleigh, uniform.
        :param timeout: max time for a given distribution. If timeout is
            reached, the distribution is skipped.

        .. versionchanged:: 1.2.1 remove verbose argument, replaced by the logging module.
        .. versionchanged:: 1.0.8 increase timeout from 10 to 30 seconds.
        """
        self.timeout = timeout
        # USER input
        self._data = None

        # Issue https://github.com/cokelaer/fitter/issues/22 asked for setting
        # the density to False in the fitting and plotting. I first thought it
        # would be possible, but the fitting is performed using the PDF of scipy
        # so one would still need to normalise the data so that it is
        # comparable. Therefore I do not see any way to do it without using
        # density set to True for now.
        self._density = True

        #: list of distributions to test
        self.distributions = distributions
        if self.distributions is None:
            self._load_all_distributions()
        elif self.distributions == "common":
            self.distributions = get_common_distributions()
        elif isinstance(distributions, str):
            self.distributions = [distributions]

        self.bins = bins

        self._alldata = np.array(data)
        if xmin is None:
            self._xmin = self._alldata.min()
        else:
            self._xmin = xmin
        if xmax is None:
            self._xmax = self._alldata.max()
        else:
            self._xmax = xmax

        self._trim_data()
        self._update_data_pdf()

        # Other attributes
        self._init()

    def _init(self):
        self.fitted_param = {}
        self.fitted_pdf = {}
        self._fitted_errors = {}
        self._aic = {}
        self._bic = {}
        self._kldiv = {}
        self._fit_i = 0  # fit progress
        self.pb = Progress(len(self.distributions))

    def _update_data_pdf(self):
        # np.histogram returns N+1 bin edges, so we convert them into N bin centres
        self.y, self.x = np.histogram(
            self._data, bins=self.bins, density=self._density)
        self.x = [(this + self.x[i + 1]) / 2.
                  for i, this in enumerate(self.x[0:-1])]

    def _trim_data(self):
        self._data = self._alldata[np.logical_and(
            self._alldata >= self._xmin, self._alldata <= self._xmax)]

    def _get_xmin(self):
        return self._xmin

    def _set_xmin(self, value):
        if value is None:
            value = self._alldata.min()
        elif value < self._alldata.min():
            value = self._alldata.min()
        self._xmin = value
        self._trim_data()
        self._update_data_pdf()

    xmin = property(_get_xmin, _set_xmin,
                    doc="consider only data above xmin. reset if None")

    def _get_xmax(self):
        return self._xmax

    def _set_xmax(self, value):
        if value is None:
            value = self._alldata.max()
        elif value > self._alldata.max():
            value = self._alldata.max()
        self._xmax = value
        self._trim_data()
        self._update_data_pdf()

    xmax = property(_get_xmax, _set_xmax,
                    doc="consider only data below xmax. reset if None ")

    def _load_all_distributions(self):
        """Replace the :attr:`distributions` attribute with all scipy distributions"""
        self.distributions = get_distributions()

    def hist(self):
        """Draw normed histogram of the data using :attr:`bins`


        .. plot::

            >>> from scipy import stats
            >>> data = stats.gamma.rvs(2, loc=1.5, scale=2, size=20000)
            >>> # We then create the Fitter object
            >>> import fitter
            >>> fitter.Fitter(data).hist()

        """
        _ = pylab.hist(self._data, bins=self.bins, density=self._density)
        pylab.grid(True)

    def _fit_single_distribution(self, distribution, progress: bool):
        try:
            # need a subprocess to check time it takes. If too long, skip it
            dist = eval("scipy.stats." + distribution)

            # TODO here, dist.fit may take a while or just hang forever
            # with some distributions. So, I thought to use signal module
            # to catch the error when signal takes too long. It did not work
            # presumably because another try/exception is inside the
            # fit function, so I used threading with a recipe from stackoverflow
            # See timed_run function above
            param = self._timed_run(dist.fit, distribution, args=self._data)

            # with signal, this does not work, maybe because another exception
            # is caught. We assume the parameter order returned by fit matches pdf.
            pdf_fitted = dist.pdf(self.x, *param)

            self.fitted_param[distribution] = param[:]
            self.fitted_pdf[distribution] = pdf_fitted

            # calculate error
            sq_error = pylab.sum(
                (self.fitted_pdf[distribution] - self.y) ** 2)

            # calculate information criteria
            logLik = np.sum(dist.logpdf(self.x, *param))
            k = len(param[:])
            n = len(self._data)
            aic = 2 * k - 2 * logLik
            bic = n * np.log(sq_error / n) + k * np.log(n)

            # calculate the Kullback-Leibler divergence
            kullback_leibler = kl_div(
                self.fitted_pdf[distribution], self.y)

            logging.info("Fitted {} distribution with error={})".format(
                distribution, sq_error))

            # compute some errors now
            self._fitted_errors[distribution] = sq_error
            self._aic[distribution] = aic
            self._bic[distribution] = bic
            self._kldiv[distribution] = kullback_leibler
        except Exception:  # pragma: no cover
            logging.warning("SKIPPED {} distribution (taking more than {} seconds)".format(distribution,
                                                                                           self.timeout))
            # if we cannot compute the error, set it to large values
            self._fitted_errors[distribution] = np.inf
            self._aic[distribution] = np.inf
            self._bic[distribution] = np.inf
            self._kldiv[distribution] = np.inf
        if progress:
            self._fit_i += 1
            self.pb.animate(self._fit_i)

    def fit(self, amp=1, progress=False, n_jobs=-1):
        r"""Loop over distributions and find best parameter to fit the data for each

        When a distribution is fitted onto the data, we populate a set of
        dataframes:

            - :attr:`df_errors`  :sum of the square errors between the data and the fitted
              distribution i.e., :math:`\sum_i \left( Y_i - pdf(X_i) \right)^2`
            - :attr:`fitted_param` : the parameters that best fit the data
            - :attr:`fitted_pdf` : the PDF generated with the parameters that best fit the data

        Indices of the dataframes contains the name of the distribution.

        """
        import warnings
        warnings.filterwarnings("ignore", category=RuntimeWarning)

        from joblib import Parallel, delayed

        jobs = (delayed(self._fit_single_distribution)(dist, progress)
                for dist in self.distributions)
        pool = Parallel(n_jobs=n_jobs, backend='threading')
        _ = pool(jobs)
        self.df_errors = pd.DataFrame({'sumsquare_error': self._fitted_errors,
                                       'aic': self._aic,
                                       'bic': self._bic,
                                       'kl_div': self._kldiv})

    def plot_pdf(self, names=None, Nbest=5, lw=2, method="sumsquare_error"):
        """Plots Probability density functions of the distributions

        :param str,list names: names can be a single distribution name, or a list
            of distribution names, or kept as None, in which case, the first Nbest
            distribution will be taken (default to best 5)


        """
        assert Nbest > 0
        if Nbest > len(self.distributions):
            Nbest = len(self.distributions)

        if isinstance(names, list):
            for name in names:
                pylab.plot(self.x, self.fitted_pdf[name], lw=lw, label=name)
        elif names:
            pylab.plot(self.x, self.fitted_pdf[names], lw=lw, label=names)
        else:
            try:
                names = self.df_errors.sort_values(
                    by=method).index[0:Nbest]
            except Exception:
                names = self.df_errors.sort(method).index[0:Nbest]

            for name in names:
                if name in self.fitted_pdf.keys():
                    pylab.plot(
                        self.x, self.fitted_pdf[name], lw=lw, label=name)
                else:  # pragma: no cover
                    logger.warning("%s was not fitted. no parameters available" % name)
        pylab.grid(True)
        pylab.legend()

    def get_best(self, method='sumsquare_error'):
        """Return best fitted distribution and its parameters

        a dictionary with one key (the distribution name) and its parameters

        """
        # self.df_errors is sorted by the chosen method, so we take the first row as the best
        name = self.df_errors.sort_values(method).iloc[0].name
        params = self.fitted_param[name]
        return {name: params}

    def summary(self, Nbest=5, lw=2, plot=True, method="sumsquare_error"):
        """Plots the distribution of the data and Nbest distribution

        """
        if plot:
            pylab.clf()
            self.hist()
            self.plot_pdf(Nbest=Nbest, lw=lw, method=method)
            pylab.grid(True)

        Nbest = min(Nbest, len(self.distributions))
        try:
            names = self.df_errors.sort_values(
                by=method).index[0:Nbest]
        except:  # pragma: no cover
            names = self.df_errors.sort(method).index[0:Nbest]
        return self.df_errors.loc[names]

    def _timed_run(self, func, distribution, args=(), kwargs={}, default=None):
        """This function will spawn a thread and run the given function
        using the args, kwargs and return the given default value if the
        timeout is exceeded.

        http://stackoverflow.com/questions/492519/timeout-on-a-python-function-call
        """

        class InterruptableThread(threading.Thread):
            def __init__(self):
                threading.Thread.__init__(self)
                self.result = default
                self.exc_info = (None, None, None)

            def run(self):
                try:
                    self.result = func(args, **kwargs)
                except Exception as err:  # pragma: no cover
                    self.exc_info = sys.exc_info()

            def suicide(self):  # pragma: no cover
                raise RuntimeError('Stop has been called')

        it = InterruptableThread()
        it.start()
        started_at = datetime.now()
        it.join(self.timeout)
        ended_at = datetime.now()
        diff = ended_at - started_at

        if it.exc_info[0] is not None:  # pragma: no cover ;  if there were any exceptions
            a, b, c = it.exc_info
            raise Exception(a, b, c)  # communicate that to caller

        if it.is_alive():  # pragma: no cover
            it.suicide()
            raise RuntimeError
        else:
            return it.result
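
# Hedged usage sketch of the Fitter class above, assuming the class and its
# module-level dependencies (Progress, kl_div, get_distributions, ...) are
# importable from the surrounding module:
if __name__ == "__main__":
    from scipy import stats
    data = stats.gamma.rvs(2, loc=1.5, scale=2, size=20000)
    f = Fitter(data, distributions="common", timeout=30)
    f.fit()
    print(f.summary(Nbest=3, plot=False))  # three best fits by sumsquare_error
    print(f.get_best())                    # e.g. {'gamma': (shape, loc, scale)}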