Example no. 1
    def _opt_ridge_lasso(self, drug_name, feature_name, method, alphas=None):

        if alphas is None:
            alphas = pylab.linspace(0, 1, 20)

        mses = []
        params = []
        method_buf = self.settings.regression_method
        alpha_buf = self.settings.elastic_net.alpha

        pb = Progress(len(alphas))
        for j, alpha in enumerate(alphas):
            self.settings.regression_method = method
            self.settings.elastic_net.alpha = alpha
            odof = self.anova_one_drug_one_feature(drug_name,
                    feature_name)
            anova = self._get_anova_summary(self.data_lm,
                    output='dataframe')
            #mses.append(anova.loc['Residuals', 'Sum Sq'])  # alternative metric
            mses.append(anova.loc['tissue', 'F value'])
            #mses.append(anova['Sum Sq'].sum())  # alternative metric
            pb.animate(j+1)
            params.append(self.data_lm.params)
        self.settings.regression_method = method_buf
        self.settings.elastic_net.alpha = alpha_buf
        return alphas, mses, params
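All the snippets in this collection share the same easydev.Progress pattern: build the bar with the total number of iterations, then call animate(i + 1) inside the loop so the bar reaches 100% on the last pass. A minimal standalone sketch, assuming only that easydev is installed:

    from easydev import Progress
    import time

    N = 50
    pb = Progress(N)          # total number of iterations
    for i in range(N):
        time.sleep(0.01)      # placeholder for the real work
        pb.animate(i + 1)     # 1-based so the bar ends at 100%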
Example no. 2
    def _get_G(self, gold):
        from easydev import Progress
        import scipy.sparse
        regulators = list(set(gold[0]))
        targets = list(set(gold[[0,1]].stack()))

        N, M = gold[0].max(), gold[1].max()

        # A stores 0-based indices (not 1-based), hence the -1 shifts
        # below, where i, j are the 1-based gene identifiers
        A = np.zeros((N, M))
        for row in gold[[0,1]].values:
            i, j = row
            A[i-1, j-1] = 1
        A_sparse = scipy.sparse.csr_matrix(A)

        #N, M = len(regulators), len(targets)
        G = np.zeros((N, M))

        pb = Progress(len(regulators), 1)
        for i, x in enumerate(regulators):
            for j, y in enumerate(targets):
                if A[x-1, y-1] == 1:
                    G[x-1, y-1] = 1
                elif x != y:
                    G[x-1, y-1] = -1
            pb.animate(i+1)
        return G
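To make the -1 index shift above concrete, here is a toy sketch with a hypothetical two-column gold standard (gene 1 regulates genes 2 and 3):

    import numpy as np
    import pandas as pd

    gold = pd.DataFrame([[1, 2], [1, 3]])   # column 0: regulator, column 1: target
    N, M = gold[0].max(), gold[1].max()
    A = np.zeros((N, M))
    for i, j in gold[[0, 1]].values:
        A[i - 1, j - 1] = 1                  # 1-based gene ids -> 0-based indices
    print(A)                                 # [[0. 1. 1.]]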
Example no. 3
    def search_from_smile_inchembl(self):

        N = len(self.drug_ids)

        pb = Progress(N)
        self.results_chembl = {}
        self.results_chemspider = {}

        for i in range(0, N):
            drug = self.drug_ids[i]
            self.results_chembl[drug] = []

            if self.results[drug]:
                for chemspider_id in self.results[drug]:
                    chemspider_entry = self._cs_get(chemspider_id)
                    self.results_chemspider[drug] = chemspider_entry
                    smile = chemspider_entry['smiles']
                    # now search in chembl
                    res_chembl = self.chembl.get_compounds_by_SMILES(smile)
                    try:
                        self.results_chembl[drug].extend(
                            res_chembl['compounds'])
                    except KeyError:
                        pass

            pb.animate(i + 1)
Example no. 4
    def filling_chembl_pubchem_using_unichem(self):
        """

        """
        N = len(self.drug_ids)
        pb = Progress(N)
        for i,this in enumerate(self.drug_ids):
            entry = self.dd.df.loc[this]
            # if no information is provided, we will need to get it 
            # from chemspider

            # From the database, when chembl is provided, it is unique
            # same for chemspider and pubchem and CAS
            select = entry[['CHEMSPIDER', 'CHEMBL', 'PUBCHEM']]
            if select.count() == 0:
                name = self.dd.df.loc[this].DRUG_NAME
                results = self._cs_find(name)
                if len(results) == 0:
                    # nothing found
                    pass
                elif len(results) == 1:
                    self.dd_filled.df.loc[this, 'CHEMSPIDER'] = results[0]
                else:
                    # non unique: store the full list
                    self.dd_filled.df.loc[this, 'CHEMSPIDER'] = results
            pb.animate(i+1)

        # Search in chemspider systematically
        for i, this in enumerate(self.drug_ids):
            entry = self.dd.df.loc[this]
            # select and drug were undefined here; recompute them per entry
            select = entry[['CHEMSPIDER', 'CHEMBL', 'PUBCHEM']]
            if select.count() == 1:
                res = self._cs_find(entry.DRUG_NAME)

            pb.animate(i+1)
Example no. 5
    def create_html_associations(self):
        """Create an HTML page for each significant association

        The name of the output HTML file is **<association id>.html**
        where association id is stored in :attr:`df`.

        """
        print("\nCreating individual HTML pages for each association")
        df = self.get_significant_set()

        drugs = df['DRUG_ID'].values
        features = df['FEATURE'].values
        assocs = df['ASSOC_ID'].values
        fdrs = df['ANOVA_FEATURE_FDR'].values

        N = len(df)
        pb = Progress(N)

        html = Association(self, drug='dummy', feature='dummy', fdr='dummy')

        for i in range(N):
            html.drug = drugs[i]
            html.feature = features[i]
            html._filename = str(assocs[i]) + '.html'
            html.fdr = fdrs[i]
            html.assoc_id = assocs[i]
            html._init_report() # since we have one shared instance
            html.create_report(onweb=False)
            pb.animate(i+1)
Example no. 6
 def find_motif_fasta(self, filename, motif, window=200,
         local_threshold=None, global_threshold=None):
     from sequana import FastA
     data = FastA(filename)
     N = len(data)
     # fall back to the instance-level threshold when none is given
     if global_threshold is None:
         global_threshold = self.global_threshold
     from easydev import Progress
     pb = Progress(N)
     df = {
         "query_name": [],
         "hit": [],
         "length": [],
         "start": [],
         "end": []
     }
     for i, item in enumerate(data):
         X1, S = self.find_motif_from_sequence(item.sequence, motif,
                     window=window, local_threshold=local_threshold
                     )
         if S >= global_threshold:
             df['query_name'].append(item.name)
             df['start'].append(0)
             df['end'].append(len(item.sequence))
             df['length'].append(len(item.sequence))
             df['hit'].append(S)
         pb.animate(i+1)
     df = pd.DataFrame(df)
     return df
Example no. 7
    def to_kmer_content(self, k=7):
        """Return a Series with kmer count across all reads

        :param int k: kmer length (defaults to 7)
        :return: Pandas Series with index as kmer and values as count.

        Takes about 30 seconds on a million reads.
        """
        # Counter is slow if we apply it on each read.
        # .count is slow as well
        import collections
        from sequana.kmer import get_kmer
        counter = collections.Counter()
        pb = Progress(len(self))
        buffer_ = []
        for i, this in enumerate(self):
            buffer_.extend(list(get_kmer(this['sequence'], k)))
            if len(buffer_) > 100000:
                counter += collections.Counter(buffer_)
                buffer_ = []
            pb.animate(i + 1)
        counter += collections.Counter(buffer_)

        ts = pd.Series(counter)
        ts.sort_values(inplace=True, ascending=False)

        return ts
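The buffered Counter trick above is independent of sequana; a standalone sketch of the same pattern, with a stand-in for sequana.kmer.get_kmer:

    import collections

    def get_kmer(sequence, k):
        # stand-in for sequana.kmer.get_kmer
        for i in range(len(sequence) - k + 1):
            yield sequence[i:i + k]

    counter = collections.Counter()
    buffer_ = []
    for sequence in ["ACGTACGT", "ACGTTTTT"]:
        buffer_.extend(get_kmer(sequence, 3))
        if len(buffer_) > 100000:   # flush periodically; Counter updates are the slow part
            counter += collections.Counter(buffer_)
            buffer_ = []
    counter += collections.Counter(buffer_)
    print(counter.most_common(3))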
Example no. 8
    def _score_challengeA_bunch(self, filenames, subname):

        from easydev import Progress

        pb = Progress(5, 1)
        pb.animate(0)
        results = []
        for i, filename in enumerate(filenames):
            res = self.score_challengeA(filename, subname + "_" + str(i + 1))
            pb.animate(i + 1)
            results.append(res)

        # note: the p_aupr/p_auroc keys were swapped in the original
        aupr_score = -np.mean(np.log10([x["p_aupr"] for x in results]))
        auroc_score = -np.mean(np.log10([x["p_auroc"] for x in results]))
        score = (aupr_score + auroc_score) / 2.0

        df = pd.Series(dtype=object)  # pd.TimeSeries was removed from pandas
        df["Overall Score"] = score
        df["AUPR score (pval)"] = aupr_score
        df["AUROC score (pval)"] = auroc_score
        for i in range(1, 6):
            df["AUPR Net %s" % i] = results[i - 1]["aupr"]
        for i in range(1, 6):
            df["AUROC Net %s" % i] = results[i - 1]["auroc"]

        return df
Example no. 9
    def diagnostics(self):
        """Return dataframe with information about the analysis

        """
        n_drugs = len(self.ic50.drugIds)
        n_features = len(self.features.features) - self.features.shift
        n_combos = n_drugs * n_features
        feasible = 0
        pb = Progress(n_drugs, 1)
        counter = 0
        for drug in self.ic50.drugIds:
            for feature in self.features.features[self.features.shift:]:
                dd = self._get_one_drug_one_feature_data(drug, feature,
                        diagnostic_only=True)
                if dd.status is True:
                    feasible += 1
            counter += 1
            pb.animate(counter)

        results = {
                'n_drug': n_drugs,
                'n_combos': n_combos,
                'feasible_tests': feasible,
                'percentage_feasible_tests': float(feasible)/n_combos*100}
        return results
Example no. 10
    def compounds2accession(self, compounds):
        """For each compound, identifies the target and corresponding UniProt
        accession number

        This is not part of ChEMBL API

        ::

            # we recommend using the cache if you call this method regularly
            c = Chembl(cache=True)
            drugs = c.get_approved_drugs()

            # to speed up example
            drugs = drugs[0:20]
            IDs = [x['molecule_chembl_id'] for x in drugs]

            c.compounds2accession(IDs)

        """
        # we jump from compounds to targets through activities
        # Here this is a one to many mapping so we initialise a default
        # dictionary.
        from collections import defaultdict
        compound2target = defaultdict(set)

        filter = "molecule_chembl_id__in={}"
        from easydev import Progress
        pb = Progress(len(compounds))
        for i in range(0, len(compounds)):
            # FIXME activities could be fetched in batches using, e.g.,
            # ",".join(compounds[i:i+10])
            activities = self.get_activity(filters=filter.format(compounds[i]))
            # get target ChEMBL IDs from activities
            for act in activities:
                compound2target[act['molecule_chembl_id']].add(act['target_chembl_id'])
            pb.animate(i+1)

        # What we need is to get targets for all targets found in the previous
        # step. For each compound/drug there are hundreds of targets though. And
        # we will call the get_target for each list of hundreds targets. This
        # will take forever. Instead, because there are *only* 12,000 targets,
        # let us download all of them! This took about 4 minutes on this test,
        # but with the cache enabled the next run is much quicker. This is not
        # done at the activities level because there are too many entries.

        targets = self.get_target(limit=-1)

        # identifies all target chembl id to easily retrieve the entry later on
        target_names = [target['target_chembl_id'] for target in targets]

        # retrieve all uniprot accessions for all targets of each compound
        for compound, targs in compound2target.items():
            accessions = set()
            for target in targs:
                index = target_names.index(target)
                accessions = accessions.union([comp['accession'] 
                    for comp in targets[index]['target_components']])
            compound2target[compound] = accessions
 
        return compound2target
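The FIXME above hints at batching the activity queries. A sketch of that variant, assuming get_activity accepts the same __in filter with comma-separated identifiers (the usual ChEMBL filter syntax, not verified here):

    batch = 10
    n_batches = (len(compounds) + batch - 1) // batch
    pb = Progress(n_batches)
    for n, i in enumerate(range(0, len(compounds), batch)):
        ids = ",".join(compounds[i:i + batch])
        activities = self.get_activity(filters=filter.format(ids))
        for act in activities:
            compound2target[act['molecule_chembl_id']].add(act['target_chembl_id'])
        pb.animate(n + 1)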
Example no. 11
    def select_random_reads(self, N=None, output_filename="random.fasta"):
        """Select random reads and save in a file

        :param int N: number of random unique reads to select. A set or
            list of read indices can also be provided.
        :param str output_filename: output file name
        """
        import numpy as np
        thisN = len(self)
        if isinstance(N, int):
            if N > thisN:
                N = thisN
            # create random set of reads to pick up
            cherries = list(range(thisN))
            np.random.shuffle(cherries)
            # cast to set for efficient iteration
            cherries = set(cherries[0:N])
        elif isinstance(N, set):
            cherries = N
        elif isinstance(N, list):
            cherries = set(N)
        fasta = FastxFile(self.filename)
        pb = Progress(thisN) # since we scan the entire file
        with open(output_filename, "w") as fh:
            for i, read in enumerate(fasta):
                if i in cherries:
                    fh.write(str(read) + "\n")
                pb.animate(i+1)
        return cherries
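Because the selected indices are returned, the same reads can be extracted from a mate file, as the FASTQ variant later in this collection documents in its docstring. A short usage sketch (file names hypothetical):

    f1 = FastQ("sample_R1.fastq")
    selection = f1.select_random_reads(N=1000, output_filename="random_R1.fastq")
    f2 = FastQ("sample_R2.fastq")
    f2.select_random_reads(selection, output_filename="random_R2.fastq")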
Example no. 12
    def create_html_drugs(self):
        """Create an HTML page for each drug"""
        # group by drugs
        all_drugs = list(self.df['DRUG_ID'].unique())

        df = self.get_significant_set()
        groups = df.groupby('DRUG_ID')
        if self.verbose:
            print("Creating individual HTML pages for each drug")
        N = len(all_drugs)
        pb = Progress(N)
        for i, drug in enumerate(all_drugs):
            # enumerate(groups.indices.keys()):
            # get the indices and therefore subgroup
            if drug in groups.groups.keys():
                subdf = groups.get_group(drug)
            else:
                subdf = {}

            html = HTMLOneDrug(self, self.df, subdf, drug)
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(i+1)
        if self.settings.animate: print("\n")
Example no. 13
    def _get_G(self, gold):
        from easydev import Progress
        import scipy.sparse
        regulators = list(set(gold[0]))
        targets = list(set(gold[[0,1]].stack()))

        N, M = gold[0].max(), gold[1].max()

        # A stores 0-based indices (not 1-based), hence the -1 shifts
        # below, where i, j are the 1-based gene identifiers
        A = np.zeros((N, M))
        for row in gold[[0,1]].values:
            i, j = row
            A[i-1, j-1] = 1
        A_sparse = scipy.sparse.csr_matrix(A)

        #N, M = len(regulators), len(targets)
        G = np.zeros((N, M))

        pb = Progress(len(regulators), 1)
        for i, x in enumerate(regulators):
            for j, y in enumerate(targets):
                if A[x-1, y-1] == 1:
                    G[x-1, y-1] = 1
                elif x != y:
                    G[x-1, y-1] = -1
            pb.animate(i+1)
        return G
Example no. 14
    def get_gis(self, extensions=['fa']):
        self.filenames = []
        root = self.dbname
        for extension in extensions:
            self.filenames.extend(
                list(glob.iglob("%s/library/**/*%s" % (root, extension))))
        for extension in extensions:
            self.filenames.extend(
                list(glob.iglob("%s/library/**/**/*%s" % (root, extension))))

        N = len(self.filenames)
        pb = Progress(N)
        gis = []
        for i, filename in enumerate(self.filenames):
            with open(filename, "r") as data:
                line = data.readline()
            if line.startswith('>'):
                assert "gi" in line, "expected >gi to be found at the beginning"
                gi = line[1:].split("|")[1]
            else:
                raise ValueError(
                    "This file %s does not seem to be a FASTA file" % filename)
            gis.append(gi)
            pb.animate(i + 1)
        print()
        gis = [int(x) for x in gis]
        self.gis = gis

        assert len(gis) == len(self.filenames)
        return gis
Example no. 15
    def create_html_drugs(self):
        """Create an HTML page for each drug"""
        # group by drugs
        all_drugs = list(self.df['DRUG_ID'].unique())

        df = self.get_significant_set()
        groups = df.groupby('DRUG_ID')
        if self.verbose:
            print("Creating individual HTML pages for each drug")
        N = len(all_drugs)
        pb = Progress(N)
        for i, drug in enumerate(all_drugs):
            # enumerate(groups.indices.keys()):
            # get the indices and therefore subgroup
            if drug in groups.groups.keys():
                subdf = groups.get_group(drug)
            else:
                subdf = {}

            html = HTMLOneDrug(self, self.df, subdf, drug)
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(i + 1)
        if self.settings.animate: print("\n")
Example no. 16
    def download_accession_from_ncbi(self, accession):
        # accession can be a list, a single string, or the name of a file
        # containing the accessions to retrieve (one per line)
        if isinstance(accession, list):
            accessions = accession
        elif isinstance(accession, str):
            if os.path.exists(accession):
                with open(accession, "r") as fin:
                    accessions = fin.read().split()
            else:
                accessions = [accession]

        from easydev import Progress
        N = len(accessions)
        pb = Progress(N)
        logger.info("Fetching {} accession fasta files from NCBI".format(N))
        for i, accession in enumerate(accessions):
            data = self.eutils.EFetch("nucleotide",
                                      rettype="fasta",
                                      id=accession,
                                      retmode="text")
            if isinstance(data, int):
                logger.info(
                    "Could not fetch this accession: {}. continue".format(
                        accession))
            else:
                outname = "{}/library/{}.fa".format(self.dbname, accession)
                with open(outname, "wb") as fout:
                    fout.write(data)
            pb.animate(i + 1)
Example no. 17
    def create_html_associations(self):
        """Create an HTML page for each significant association

        The name of the output HTML file is **<association id>.html**
        where association id is stored in :attr:`df`.

        """
        print("\nCreating individual HTML pages for each association")
        df = self.get_significant_set()

        drugs = df['DRUG_ID'].values
        features = df['FEATURE'].values
        assocs = df['ASSOC_ID'].values
        fdrs = df['ANOVA_FEATURE_FDR'].values

        N = len(df)
        pb = Progress(N)

        html = Association(self, drug='dummy', feature='dummy', fdr='dummy')

        for i in range(N):
            html.drug = drugs[i]
            html.feature = features[i]
            html._filename = str(assocs[i]) + '.html'
            html.fdr = fdrs[i]
            html.assoc_id = assocs[i]
            html._init_report()  # since we have one shared instance
            html.create_report(onweb=False)
            pb.animate(i + 1)
Example no. 18
    def select_random_reads(self, N=None, output_filename="random.fasta"):
        """Select random reads and save in a file

        :param int N: number of random unique reads to select. A set or
            list of read indices can also be provided.
        :param str output_filename: output file name
        """
        import numpy as np
        thisN = len(self)
        if isinstance(N, int):
            if N > thisN:
                N = thisN
            # create random set of reads to pick up
            cherries = list(range(thisN))
            np.random.shuffle(cherries)
            # cast to set for efficient iteration
            cherries = set(cherries[0:N])
        elif isinstance(N, set):
            cherries = N
        elif isinstance(N, list):
            cherries = set(N)
        fasta = FastxFile(self.filename)
        pb = Progress(thisN)  # since we scan the entire file
        with open(output_filename, "w") as fh:
            for i, read in enumerate(fasta):
                if i in cherries:
                    fh.write(str(read) + "\n")
                pb.animate(i + 1)
        return cherries
Example no. 19
def process_single_reads(reader, modifiers, filters, n_progress=-1):
	"""
	Loop over reads, find adapters, trim reads, apply modifiers and
	output modified reads.

	Return a Statistics object.
	"""
	n = 0  # no. of processed reads
	total_bp = 0
	if n_progress != -1:
		try:
			from easydev import Progress
			pb = Progress(n_progress)
			count = 0
		except ImportError:
			n_progress = -1

	for read in reader:
		n += 1
		total_bp += len(read.sequence)
		for modifier in modifiers:
			read = modifier(read)
		for filter in filters:
			if filter(read):
				break
		if n_progress != -1:
			count += 1
			pb.animate(count)

	return Statistics(n=n, total_bp1=total_bp, total_bp2=None)
Example no. 20
    def create_taxonomy_file(self, filename="taxonomy.dat"):
        logger.info("Please wait while creating the output file. "
                    "This may take a few minutes")
        from easydev import Progress
        pb = Progress(len(self.df_nodes))
        count = 0
        df_names = self.df_names.query("key == 'scientific name'").copy()
        with open(filename, "w") as fout:

            for taxid in self.df_nodes.index:
                row = self.df_nodes.loc[taxid]
                fout.write("ID                        : {}\n".format(taxid))
                fout.write("PARENT ID                 : {}\n".format(
                    row.parent))
                fout.write("RANK                      : {}\n".format(
                    row['rank']))

                fout.write("{:26s}: {}\n".format("SCIENTIFIC NAME",
                                                 df_names.loc[taxid, "name"]))
                """    len(names)
                    for k,v in zip(names['key'], names['name']):
                        if k.upper() in ['SCIENTIFIC NAME', 'SYNONYM']:
                            fout.write("{:26s}: {}\n".format(k.upper(), v))
                except:
                    k, v = names['key'], names['name']
                    fout.write("{:26s}: {}\n".format(k.upper(), v))
                """
                fout.write("//\n")
                count += 1
                pb.animate(count)
Example no. 21
    def search_from_smile_inchembl(self):

        N = len(self.drug_ids)

        pb = Progress(N)
        self.results_chembl = {}
        self.results_chemspider = {}

        for i in range(0, N):
            drug = self.drug_ids[i]
            self.results_chembl[drug] = []

            if self.results[drug]:
                for chemspider_id in self.results[drug]:
                    chemspider_entry = self._cs_get(chemspider_id)
                    self.results_chemspider[drug] = chemspider_entry
                    smile = chemspider_entry['smiles']
                    # now search in chembl
                    res_chembl = self.chembl.get_compounds_by_SMILES(smile)
                    try:
                        self.results_chembl[drug].extend(res_chembl['compounds'])
                    except KeyError:
                        pass

            pb.animate(i+1)
Example no. 22
    def dendogram_coefficients(self, stacked=False, show=True, cmap="terrain"):
        """

        shows the coefficient of each optimised model for each drug
        """
        drugids = self.drugIds
        from easydev import Progress
        pb = Progress(len(drugids))
        d = {}

        for i, drug_name in enumerate(drugids):
            X, Y = self._get_one_drug_data(drug_name, randomize_Y=False)
            results = self.runCV(drug_name, verbose=False)
            df = pd.DataFrame({'name': X.columns, 'weight': results.coefficients})
            df = df.set_index("name").sort_values("weight")
            d[drug_name] = df.copy()
            pb.animate(i+1)

        # use drugid to keep same order as in the data
        dfall = pd.concat([d[i] for i in drugids], axis=1)
        dfall.columns = drugids

        if show:
            from biokit import heatmap
            h = heatmap.Heatmap(dfall, cmap=cmap)
            h.plot(num=1, colorbar_position="top left")

        if stacked is True:
            dfall = dfall.stack().reset_index()
            dfall.columns = ["feature", "drug", "weight"]
        return dfall
Example no. 23
    def filter(self, identifiers_list=[], min_bp=None, max_bp=None,
        progressbar=True, output_filename='filtered.fastq'):
        """Save reads in a new file if there are not in the identifier_list

        :param int min_bp: ignore reads with length shorter than min_bp
        :param int max_bp: ignore reads with length above max_bp

        """
        # 7 seconds without identifiers to scan the file
        # on a 750000 reads

        if min_bp is None:
            min_bp = 0

        if max_bp is None:
            max_bp = 1e9

        # make sure we are at the beginning
        self.rewind()

        output_filename, tozip = self._istozip(output_filename)

        with open(output_filename, "w") as fout:
            pb = Progress(self.n_reads)
            buf = ""
            filtered = 0
            saved = 0 

            for count, lines in enumerate(grouper(self._fileobj)):
                identifier = lines[0].split()[0].decode()
                if identifier in identifiers_list:
                    filtered += 1
                else:
                    N = len(lines[1])
                    if N <= max_bp and N >= min_bp:
                        buf += "{}{}+\n{}".format(
                            lines[0].decode("utf-8"),
                            lines[1].decode("utf-8"),
                            lines[3].decode("utf-8"))
                        saved += 1
                    else:
                        filtered += 1
                    if count % 100000 == 0:
                        fout.write(buf)
                        buf = ""
                if progressbar is True:
                    pb.animate(count+1)
            fout.write(buf)
            if filtered < len(identifiers_list):
                print("\nWARNING: not all identifiers were found in the fastq file to " +
                      "be filtered.")
        logger.info("\n{} reads were filtered out and {} saved in {}".format(
            filtered, saved, output_filename))

        if tozip is True: 
            logger.info("Compressing file")
            self._gzip(output_filename)
Example no. 24
 def _init(self):
     self.fitted_param = {}
     self.fitted_pdf = {}
     self._fitted_errors = {}
     self._aic = {}
     self._bic = {}
     self._kldiv = {}
     self._fit_i = 0  # fit progress
     self.pb = Progress(len(self.distributions))
Example no. 25
    def volcano_plot_all_drugs(self):
        """Create a volcano plot for each drug and save in PNG files

        Each filename is set to **volcano_<drug identifier>.png**
        """
        drugs = list(self.df[self._colname_drugid].unique())
        pb = Progress(len(drugs), 1)
        for i, drug in enumerate(drugs):
            self.volcano_plot_one_drug(drug)
            self.savefig("volcano_%s.png" % drug, size_inches=(10, 10))
            pb.animate(i + 1)
Example no. 26
    def volcano_plot_all_drugs(self):
        """Create a volcano plot for each drug and save in PNG files

        Each filename is set to **volcano_<drug identifier>.png**
        """
        drugs = list(self.df[self._colname_drugid].unique())
        pb = Progress(len(drugs), 1)
        for i, drug in enumerate(drugs):
            self.volcano_plot_one_drug(drug)
            self.savefig("volcano_%s.png" % drug, size_inches=(10, 10))
            pb.animate(i+1)
Example no. 27
    def volcano_plot_all_features(self):
        """Create a volcano plot for each feature and save in PNG files

        Each filename is set to **volcano_<feature name>.png**
        """
        features = list(self.df[self._colname_feature].unique())
        print('Creating image for each feature (using all drugs)')
        pb = Progress(len(features), 1)
        for i, feature in enumerate(features):
            self.volcano_plot_one_feature(feature)
            self.savefig("volcano_%s.png" % feature, size_inches=(10, 10))
            pb.animate(i + 1)
Example no. 28
 def _load_complexes(self, show_progress=True):
     from easydev import Progress
     pb = Progress(len(self.df.complexAC))
     complexes = {}
     self.logging.info("Loading all details from the IntactComplex database")
     for i, identifier in enumerate(self.df.complexAC):
         res = self.webserv.details(identifier)
         complexes[identifier] = res
         if show_progress:
             pb.animate(i+1)
     self._complexes = complexes
Example no. 29
def check_ipython_notebook():

    notebooks = glob.glob("*ipynb")
    N = len(notebooks)

    pb = Progress(N)
    for i, filename in enumerate(notebooks):
        print(purple(filename))
        with open(filename) as fh:
            notebook = read(fh, 'json')
        r = NotebookRunner(notebook)
        r.run_notebook()
        pb.animate(i + 1)
Example no. 30
    def create_summary_pages(self):
        """Create summary pages

        Once the main analysis is done (:meth:`analyse`), and the company
        packages have been created (:meth:`create_data_packages_for_companies`),
        you can run this method to create a summary HTML page
        (index.html) for the tissue, and a similar summary HTML page for the
        tissues of each company. Finally, an HTML summary page for the
        companies is also created.

        The final directory tree looks like::


            |-- index.html
            |-- company_packages
            |   |-- index.html
            |   |-- Company1
            |   |   |-- Tissue1
            |   |   |-- Tissue2
            |   |   |-- index.html
            |   |-- Company2
            |   |   |-- Tissue1
            |   |   |-- Tissue2
            |   |   |-- index.html
            |-- tissue_packages
            |   |-- index.html
            |   |-- Tissue1
            |   |-- Tissue2


        """
        # First for the main directory (tissue_packages):
        print(purple("Creating summary index.html for the tissues"))
        self._create_summary_pages(self.main_directory, verbose=False)

        # Then for each company:
        print(purple("Creating summary index.html for each company"))
        pb = Progress(len(self.companies))
        for i, company in enumerate(self.companies):
            try:
                self._create_summary_pages(self.company_directory + os.sep +
                                           company,
                                           verbose=False,
                                           company=company)
            except Exception as err:
                print(
                    red("Issue with %s. Continue with other companies" %
                        company))
                print(err)
            pb.animate(i + 1)

        # Finally, an index towards each company
        self._create_main_index()
Example no. 31
    def volcano_plot_all_features(self):
        """Create a volcano plot for each feature and save in PNG files

        Each filename is set to **volcano_<feature name>.png**
        """
        features = list(self.df[self._colname_feature].unique())
        print('Creating image for each feature (using all drugs)')
        pb = Progress(len(features), 1)
        for i, feature in enumerate(features):
            self.volcano_plot_one_feature(feature)
            self.savefig("volcano_%s.png" % feature,
                    size_inches=(10, 10))
            pb.animate(i+1)
Example no. 32
def check_ipython_notebook():

    notebooks = glob.glob("*ipynb")
    N = len(notebooks)

    pb = Progress(N)
    for i, filename in enumerate(notebooks):
        print(purple(filename))
        with open(filename) as fh:
            notebook = read(fh, 'json')
        r = NotebookRunner(notebook)
        r.run_notebook()
        pb.animate(i+1)
Example no. 33
 def create_html_features(self):
     """Create an HTML page for each significant feature"""
     df = self.get_significant_set()
     groups = df.groupby('FEATURE')
     print("\nCreating individual HTML pages for each feature")
     N = len(groups.indices.keys())
     pb = Progress(N)
     for i, feature in enumerate(groups.indices.keys()):
         # get the indices and therefore subgroup
         subdf = groups.get_group(feature)
         html = HTMLOneFeature(self, self.df, subdf, feature)
         html.create_report(onweb=False)
         pb.animate(i + 1)
Example no. 34
 def create_html_features(self):
     """Create an HTML page for each significant feature"""
     df = self.get_significant_set()
     groups = df.groupby('FEATURE')
     print("\nCreating individual HTML pages for each feature")
     N = len(groups.indices.keys())
     pb = Progress(N)
     for i, feature in enumerate(groups.indices.keys()):
         # get the indices and therefore subgroup
         subdf = groups.get_group(feature)
         html = HTMLOneFeature(self, self.df, subdf, feature)
         html.create_report(onweb=False)
         pb.animate(i+1)
Example no. 35
    def plot_cindex(self, drug_name, alphas, l1_ratio=0.5, n_folds=10, hold=False):
        # This is longish (300 seconds with 10 folds and 80 alphas
        # for the GDSC v5 data sets).
        from dreamtools.core.cindex import cindex

        CI_train = {}
        CI_test = {}
        for c in range(n_folds):
            CI_train[c] = []
            CI_test[c] = []

        from easydev import Progress
        pb = Progress(len(alphas))

        for i, alpha in enumerate(alphas):
            self.elastic_net(drug_name, alpha=alpha, l1_ratio=l1_ratio,
                             n_folds=n_folds)

            # Evaluate each fold
            for kf in range(n_folds):
                x_train = self.kfold_data['x_train'][kf].values
                y_train = self.kfold_data['y_train'][kf].values

                x_test = self.kfold_data['x_test'][kf].values
                y_test = self.kfold_data['y_test'][kf].values

                x_train_pred = self.en.predict(x_train)
                x_test_pred = self.en.predict(x_test)

                CI_test[kf].append(1-cindex(x_test_pred, y_test, [True]*len(y_test)))
                CI_train[kf].append(1-cindex(x_train_pred, y_train, [True] * len(y_train)))
            pb.animate(i + 1)

        mu_train = pd.DataFrame(CI_train).transpose().mean()
        sigma_train = pd.DataFrame(CI_train).transpose().std()

        mu_test = pd.DataFrame(CI_test).transpose().mean()
        sigma_test = pd.DataFrame(CI_test).transpose().std()

        best_alpha = alphas[pd.DataFrame(CI_test).mean(axis=1).argmax()]

        pylab.clf()
        pylab.errorbar(pylab.log(alphas), mu_train, yerr=sigma_train, label="train")
        pylab.errorbar(pylab.log(alphas)+.1, mu_test, yerr=sigma_test, label="test")
        pylab.plot(pylab.log(alphas), mu_train, 'ob')
        pylab.plot(pylab.log(alphas)+.1, mu_test, 'or')
        pylab.legend()
        pylab.axvline(pylab.log(best_alpha), lw=2, color="purple")

        return best_alpha
Example no. 36
    def create_summary_pages(self):
        """Create summary pages

        Once the main analysis is done (:meth:`analyse`), and the company
        packages have been created (:meth:`create_data_packages_for_companies`),
        you can run this method to create a summary HTML page
        (index.html) for the tissue, and a similar summary HTML page for the
        tissues of each company. Finally, an HTML summary page for the
        companies is also created.

        The final directory tree looks like::


            |-- index.html
            |-- company_packages
            |   |-- index.html
            |   |-- Company1
            |   |   |-- Tissue1
            |   |   |-- Tissue2
            |   |   |-- index.html
            |   |-- Company2
            |   |   |-- Tissue1
            |   |   |-- Tissue2
            |   |   |-- index.html
            |-- tissue_packages
            |   |-- index.html
            |   |-- Tissue1
            |   |-- Tissue2


        """
        # First for the main directory (tissue_packages):
        print(purple("Creating summary index.html for the tissues"))
        self._create_summary_pages(self.main_directory, verbose=False)

        # Then for each company:
        print(purple("Creating summary index.html for each company"))
        pb = Progress(len(self.companies))
        for i, company in enumerate(self.companies):
            try:
                self._create_summary_pages(self.company_directory + os.sep +
                    company, verbose=False, company=company)
            except Exception as err:
                print(red("Issue with %s. Continue with other companies" % company))
                print(err)
            pb.animate(i+1)

        # Finally, an index towards each company
        self._create_main_index()
Example no. 37
    def plot_pca_vs_max_features(self, step=100, n_components=2,
            progress=True):
        """

        .. plot::
            :include-source:

            from sequana.viz.pca import PCA
            from sequana import sequana_data
            import pandas as pd

            data = sequana_data("test_pca.csv")
            df = pd.read_csv(data)
            df = df.set_index("Id")

            p = PCA(df)
            p.plot_pca_vs_max_features()

        """
        assert n_components in [2,3,4]
        N = len(self.df)
        if step > N:
            step = N

        # We start with at least 10 features
        X = range(10, N, step)
        from easydev import Progress
        pb = Progress(len(X))
        Y = []
        for i, x in enumerate(X):
            res = self.plot(n_components=n_components, max_features=x, show_plot=False)
            Y.append(res)
            if progress: pb.animate(i+1)

        sub = n_components
        pylab.subplot(sub,1,1)
        pylab.plot(X, [y[0]*100 for y in Y])
        pylab.ylabel("PC1 (%)")
        pylab.subplot(sub,1,2)
        pylab.plot(X, [y[1]*100 for y in Y])
        pylab.ylabel("PC2 (%)")
        if sub >= 3:
            pylab.subplot(sub,1,3)
            pylab.plot(X, [y[2]*100 for y in Y])
            pylab.ylabel("PC3 (%)")
        if sub >= 4:
            pylab.subplot(sub,1,4)
            pylab.plot(X, [y[3]*100 for y in Y])
            pylab.ylabel("PC4 (%)")
Example no. 38
    def filter(self, identifiers_list=[], min_bp=None, max_bp=None,
        progressbar=True, output_filename='filtered.fastq', remove=True):
        """Filter reads

        :param int min_bp: ignore reads with length shorter than min_bp
        :param int max_bp: ignore reads with length above max_bp

        """
        # 7 seconds without identifiers to scan the file
        # on a 750000 reads

        if min_bp is None:
            min_bp = 0

        if max_bp is None:
            max_bp = 1e9

        # make sure we are at the beginning
        self.rewind()

        output_filename, tozip = self._istozip(output_filename)

        with open(output_filename, "w") as fout:
            pb = Progress(self.n_reads)
            buf = ""
            filtered = 0

            for count, lines in enumerate(grouper(self._fileobj)):
                identifier = lines[0].split()[0].decode()
                if identifier in identifiers_list:
                    filtered += 1
                else:
                    N = len(lines[1])
                    if N <= max_bp and N >= min_bp:
                        buf += "{}{}+\n{}".format(
                            lines[0].decode("utf-8"),
                            lines[1].decode("utf-8"),
                            lines[3].decode("utf-8"))
                    if count % 100000 == 0:
                        fout.write(buf)
                        buf = ""
                if progressbar is True:
                    pb.animate(count+1)
            fout.write(buf)
            if filtered < len(identifiers_list):
                print("\nWARNING: not all identifiers were found in the fastq file to " +
                      "be filtered.")
        if tozip is True: self._gzip(output_filename)
Example no. 39
    def anova_one_drug(self, drug_id, animate=True, output='object'):
        """Computes ANOVA for a given drug across all features

        :param str drug_id: a valid drug identifier.
        :param animate: shows the progress bar
        :return: a dataframe

        Calls :meth:`anova_one_drug_one_feature` for each feature.
        """
        # Some features may be dropped.

        # Drop the first and second columns, which contain strings (this
        # works under Python 2 but not Python 3). Assume the first two
        # columns are the sample name and tissue feature.
        # Then, we keep only cases with at least 3 features.
        # MSI could be used but is not, unlike in the original R code.
        features = self.features.df.copy()
        # need to skip the FACTOR to keep only features
        shift = self.features.shift

        features = features[features.columns[shift:]]
        # FIXME what about features with less than 3 zeros ?
        mask = features.sum(axis=0) >= 3

        # TODO: MSI, tissues, name must always be kept
        #
        selected_features = features[features.columns[mask]]

        # scan all features for a given drug
        assert drug_id in self.ic50.df.columns
        N = len(selected_features.columns)
        pb = Progress(N, 10)
        res = {}
Example no. 40
    def fit(self, amp=1, progress=False, n_jobs=-1):
        r"""Loop over distributions and find best parameter to fit the data for each

        When a distribution is fitted onto the data, we populate a set of
        dataframes:

            - :attr:`df_errors`: sum of the square errors between the data and the fitted
              distribution, i.e., :math:`\sum_i \left( Y_i - pdf(X_i) \right)^2`
            - :attr:`fitted_param` : the parameters that best fit the data
            - :attr:`fitted_pdf` : the PDF generated with the parameters that best fit the data

        Indices of the dataframes contains the name of the distribution.

        """
        import warnings

        warnings.filterwarnings("ignore", category=RuntimeWarning)

        if progress:
            self.pb = Progress(len(self.distributions))

        jobs = (
            delayed(self._fit_single_distribution)(dist, progress)
            for dist in self.distributions
        )
        pool = Parallel(n_jobs=n_jobs, backend="threading")
        _ = pool(jobs)
        self.df_errors = pd.DataFrame(
            {
                "sumsquare_error": self._fitted_errors,
                "aic": self._aic,
                "bic": self._bic,
                "kl_div": self._kldiv,
            }
        )
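The sumsquare_error column follows the docstring formula. A standalone sketch of that computation for a single candidate distribution (scipy.stats.norm, with Y taken as a histogram density on bin centers X):

    import numpy as np
    from scipy import stats

    data = stats.norm.rvs(loc=2.0, scale=1.5, size=10000, random_state=0)
    Y, edges = np.histogram(data, bins=100, density=True)
    X = (edges[:-1] + edges[1:]) / 2        # bin centers

    params = stats.norm.fit(data)           # best-fit loc and scale
    pdf = stats.norm.pdf(X, *params)
    sumsquare_error = np.sum((Y - pdf) ** 2)
    print(sumsquare_error)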
Example no. 41
    def _load_pathways(self, progress=True):
        # This is just loading all pathways once for all
        logger.info(
            "loading all pathways from KEGG. may take time the first time")
        self.pathways = {}
        from easydev import Progress
        pb = Progress(len(self.kegg.pathwayIds))
        for i, ID in enumerate(self.kegg.pathwayIds):
            self.pathways[ID.replace("path:",
                                     "")] = self.kegg.parse(self.kegg.get(ID))
            if progress:
                pb.animate(i + 1)

        # Some cleanup
        for ID in self.pathways.keys():
            name = self.pathways[ID]['NAME'][0]
            self.pathways[ID]['NAME'] = name.split(" - ", 1)[0]

        # save gene sets
        self.gene_sets = {}
        for ID in self.pathways.keys():
            res = self.pathways[ID]
            if "GENE" in res.keys():
                results = []
                # some pathways report genes as a dictionary id: 'gene name; description'
                # (e.g. eco); others as a dictionary id: 'description'
                for geneID, description in res['GENE'].items():
                    if ";" in description:
                        name = description.split(';')[0]
                    else:
                        name = geneID
                    results.append(name)

                self.gene_sets[ID] = results
            else:
                print("SKIPPED (no genes) {}: {}".format(ID, res['NAME']))

        # save all pathways info
        self.df_pathways = pd.DataFrame(self.pathways).T
        del self.df_pathways["ENTRY"]
        del self.df_pathways["REFERENCE"]
        go = [
            x['GO'] if isinstance(x, dict) and 'GO' in x.keys() else None
            for x in self.df_pathways.DBLINKS
        ]
        self.df_pathways['GO'] = go
        del self.df_pathways["DBLINKS"]
Example no. 42
    def filter_names_dmp_file(self,
                              filename="names.dmp",
                              output="names_filtered.dmp",
                              taxons=[]):

        all_taxons = set()
        pb = Progress(len(taxons))
        for i, taxon in enumerate(taxons):
            parents = self.get_family(taxon)
            all_taxons.update(parents)
            pb.animate(i + 1)
        print("")
        with open(filename, "r") as fin:
            with open(output, "w") as fout:
                for line in fin.readlines():
                    if int(line.split("\t", 1)[0]) in all_taxons:
                        fout.write(line)
Example no. 43
    def select_random_reads(self, N=None, output_filename="random.fastq"):
        """Select random reads and save in a file

        :param int N: number of random unique reads to select. A set or
            list of read indices can also be provided.
            You can select random reads for R1, and re-use the returned list as
            input for R2 (since pairs must be kept).
        :param str output_filename: output file name

        If you have a pair of files, the same reads must be selected in R1 and
        R2.::

            f1 = FastQ(file1)
            selection = f1.select_random_reads(N=1000)
            f2 = FastQ(file2)
            f2.select_random_reads(selection)


        """
        thisN = len(self)
        if isinstance(N, int):
            if N > thisN:
                N = thisN
            # create random set of reads to pick up
            cherries = list(range(thisN))
            np.random.shuffle(cherries)
            # cast to set for efficient iteration
            cherries = set(cherries[0:N])
        elif isinstance(N, set):
            cherries = N
        elif isinstance(N, list):
            cherries = set(N)

        fastq = pysam.FastxFile(self.filename)

        pb = Progress(thisN) # since we scan the entire file
        with open(output_filename, "w") as fh:
            for i, read in enumerate(fastq):
                if i in cherries:
                    fh.write(str(read) + "\n")
                pb.animate(i+1)
        return cherries
Example no. 44
    def optimise_elastic_net(self, drug_name, feature_name, N=20, Nalpha=20):
        lwts = pylab.linspace(0, 1, N)
        alphas = pylab.linspace(0, 5, Nalpha)

        mses = np.zeros((N, Nalpha))

        pb = Progress(N)
        for i, lwt in enumerate(lwts):
            for j, alpha in enumerate(alphas):
                self.settings.regression_method = 'ElasticNet'
                self.settings.regression_alpha = alpha
                self.settings.regression_L1_wt = lwt
                odof = self.anova_one_drug_one_feature(drug_name, feature_name)
                anova = self._get_anova_summary(self.data_lm,
                                                output='dataframe')
                mses[i, j] = self.data_lm.bic
            pb.animate(i + 1)
        return mses
Example no. 45
    def volcano_plot_all_drugs(self):
        """Create a volcano plot for each drug and save in PNG files

        Each filename is set to **volcano_<drug identifier>.png**
        """
        drugs = list(self.df[self._colname_drugid].unique())
        pb = Progress(len(drugs), 1)
        for i, drug in enumerate(drugs):
            self.volcano_plot_one_drug(drug)
            self.savefig_and_js("volcano_%s.png" % drug, size_inches=(10, 10))
            pb.animate(i + 1)

            # This prevents a memory leak.
            self.current_fig.canvas.mpl_disconnect(self.cid)
            try:
                import mpld3
                mpld3.plugins.clear(self.current_fig)
            except ImportError:
                pass
Example no. 46
    def optimise_elastic_net(self, drug_name, feature_name, N=20, Nalpha=20):
        lwts = pylab.linspace(0, 1, N)
        alphas = pylab.linspace(0, 5, Nalpha)

        mses = np.zeros((N, Nalpha))

        pb = Progress(N)
        for i, lwt in enumerate(lwts):
            for j, alpha in enumerate(alphas):
                self.settings.regression_method = 'ElasticNet'
                self.settings.regression_alpha = alpha
                self.settings.regression_L1_wt = lwt
                odof = self.anova_one_drug_one_feature(drug_name,
                        feature_name)
                anova = self._get_anova_summary(self.data_lm,
                        output='dataframe')
                mses[i, j] = self.data_lm.bic
            pb.animate(i+1)
        return mses
Example no. 47
    def volcano_plot_all_drugs(self):
        """Create a volcano plot for each drug and save in PNG files

        Each filename is set to **volcano_<drug identifier>.png**
        """
        drugs = list(self.df[self._colname_drugid].unique())
        pb = Progress(len(drugs), 1)
        for i, drug in enumerate(drugs):
            self.volcano_plot_one_drug(drug)
            self.savefig_and_js("volcano_%s.png" % drug, size_inches=(10, 10))
            pb.animate(i+1)

            # This prevents a memory leak.
            self.current_fig.canvas.mpl_disconnect(self.cid)
            try:
                import mpld3
                mpld3.plugins.clear(self.current_fig)
            except ImportError:
                pass
Example no. 48
    def search_in_chemspider(self):
        #SB52334 --> SB-52334
        N = len(self.dd)

        pb = Progress(N)
        self.results = {}
        results = []
        for i, index in enumerate(self.dd.df.index):
            drug = self.dd.df.index[i]
            drug_name = self.dd.df.loc[drug].DRUG_NAME
            try:
                res = self.chemspider_find(drug_name)
            except Exception:
                print(index, drug_name)
                res = []
            self.results[drug] = res
            pb.animate(i + 1)
            results.append(res)
        self.dd_filled.df['CHEMSPIDER_SEARCHED'] = results
Example no. 49
    def load_records(self, overwrite=False):
        """Load a flat file and store records in :attr:`records`

        Since version 0.8.3 we use NCBI that is updated more often than the ebi
        ftp according to their README.

        ftp://ncbi.nlm.nih.gov/pub/taxonomy/

        """
        self.download_taxonomic_file(overwrite=overwrite)
        self.records = {}

        # TODO: check if it exists otherwise, load it ?
        if os.path.exists(self.database) is False:
            self.load()

        with open(self.database) as f:
            data = f.read().strip()

        # This is fast; the parse package was tried and found much slower.
        # The cost of the progress bar is negligible.
        data = data.split("//\n")  # the sep is //\n
        self._child_match = re.compile(r'ID\s+\:\s*(\d+)\s*')
        self._parent_match = re.compile(r'PARENT ID\s+\:\s*(\d+)\s*')
        self._rank_match = re.compile(r'RANK\s+\:\s*([^\n]+)\s*')
        self._name_match = re.compile(r'SCIENTIFIC NAME\s+\:\s*([^\n]+)\s*')

        from easydev import Progress
        pb = Progress(len(data))

        logger.info('Loading all taxon records.')
        for i, record in enumerate(data):
            dd = {'raw': record}
            dd['id'] = int(self._child_match.search(record).group(1))
            dd['parent'] = int(self._parent_match.search(record).group(1))
            dd['scientific_name'] = self._name_match.search(record).group(1)
            dd['rank'] = self._rank_match.search(record).group(1)
            self.records[dd["id"]] = dd
            if self.verbose:
                pb.animate(i + 1)
        if self.verbose:
            print()
Example no. 50
    def run_methods(self):
        results = defaultdict(list)
        # We only test the methods common to all converters
        # (The intended use is with a list of converters all
        # having the same methods, but different input files)
        methods = set(self.converters[0].available_methods[:])  # a copy !
        for converter in self.converters[1:]:
            methods &= set(converter.available_methods[:])
        methods = sorted(methods)

        if self.include_dummy:
            methods += ['dummy']

        if self.to_include:
            methods = [x for x in methods if x in self.to_include]
        elif self.to_exclude:
            methods = [x for x in methods if x not in self.to_exclude]

        for method in methods:
            print("\nEvaluating method %s" % method)
            # key: converter.infile
            # value: list of times
            times = defaultdict(list)
            pb = Progress(self.N)
            for i in range(self.N):
                for converter in self.converters:
                    with Timer(times[converter.infile]):
                        converter(method=method)
                pb.animate(i + 1)
            # Normalize times so that each converter has comparable times
            mean_time = gmean(np.fromiter(chain(*times.values()), dtype=float))
            # median of ratios to geometric mean (c.f. DESeq normalization)
            scales = {
                conv: np.median(np.asarray(conv_times) / mean_time)
                for conv, conv_times in times.items()
            }
            for (conv, conv_times) in times.items():
                scale = scales[conv]
                results[method].extend(
                    [conv_time / scale for conv_time in conv_times])
        self.results = results
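The normalization at the end is a DESeq-like median-of-ratios: each converter's times are divided by the median ratio of its times to the geometric mean of all times. A toy sketch:

    import numpy as np
    from itertools import chain
    from scipy.stats import gmean

    times = {"a.fastq": [2.0, 2.2, 2.1], "b.fastq": [4.0, 4.4, 4.2]}
    mean_time = gmean(np.fromiter(chain(*times.values()), dtype=float))
    scales = {f: np.median(np.asarray(t) / mean_time) for f, t in times.items()}
    normalized = {f: [x / scales[f] for x in t] for f, t in times.items()}
    print(scales)        # per-file scale factors
    print(normalized)    # times on a comparable scale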
Example no. 51
    def volcano_plot_all_features(self):
        """Create a volcano plot for each feature and save in PNG files

        Each filename is set to **volcano_<feature name>.png**
        """
        features = list(self.df[self._colname_feature].unique())
        print('Creating image for each feature (using all drugs)')
        pb = Progress(len(features), 1)
        for i, feature in enumerate(features):
            self.volcano_plot_one_feature(feature)
            self.savefig_and_js("volcano_%s.png" % feature, 
                    size_inches=(10, 10))
            pb.animate(i+1)

            # This prevents a memory leak.
            self.current_fig.canvas.mpl_disconnect(self.cid)
            try:
                import mpld3
                mpld3.plugins.clear(self.current_fig)
            except ImportError:
                pass
Example no. 52
    def load_records(self, overwrite=False):
        """Load a flat file and store records in :attr:`records`

        """
        self._load_flat_file(overwrite=overwrite)
        self.records = {}

        # TODO: check if it exists otherwise, load it ?
        if os.path.exists(self.filename) is False:
            self.load()
        with open(self.filename) as f:
            data = f.read().strip()

        data = data.split("//\n") # the sep is //\n
        self._child_match = re.compile(r'ID\s+\:\s*(\d+)\s*')
        self._parent_match = re.compile(r'PARENT ID\s+\:\s*(\d+)\s*')
        self._rank_match = re.compile(r'RANK\s+\:\s*([^\n]+)\s*')
        self._name_match = re.compile(r'SCIENTIFIC NAME\s+\:\s*([^\n]+)\s*')

        from easydev import Progress
        pb = Progress(len(data))

        if self.verbose:
            print('Loading all taxon records.')
        for i, record in enumerate(data):
            # try/except increase comput. time by 5%
            try:
                dico = self._interpret_record(record)
                identifier = int(dico['id'])
                self.records[identifier] = dico
            except Exception as err:
                print(err)
                print('Could not parse the following record. '  + \
                      'Please file a bug report on http://github.com/biokit')
                print(record)
            if self.verbose:
                pb.animate(i+1)
        if self.verbose:
            print()
Example no. 53
    def search_in_chemspider(self):
        # Fill the results attribute as a dictionary: keys are the drug ids
        # and values are lists of chemspider identifiers
        #
        # SB52334 --> SB-52334
        N = len(self.dd)

        pb = Progress(N)
        self.results = {}
        results = []
        for i, index in enumerate(self.dd.df.index):
            drug = self.dd.df.index[i]
            drug_name = self.dd.df.loc[drug].DRUG_NAME
            try:
                res = self._cs_find(drug_name)
            except Exception:
                print("This drug index (%s) / drug name (%s) was not found" %
                        (index, drug_name))
                res = []
            self.results[drug] = res
            pb.animate(i+1)
            results.append(res)
        self.dd_filled.df['CHEMSPIDER_SEARCHED'] = results
Example no. 54
def process_paired_reads(paired_reader, modifiers1, modifiers2, filters,
                         n_progress=-1):
	"""
	Loop over reads, find adapters, trim reads, apply modifiers and
	output modified reads.

	Return a Statistics object.
	"""
	n = 0  # no. of processed reads
	total1_bp = 0
	total2_bp = 0

	if n_progress != -1:
		try:
			from easydev import Progress
			pb = Progress(n_progress)
			count = 0
		except ImportError:
			n_progress = -1

	for read1, read2 in paired_reader:
		n += 1
		total1_bp += len(read1.sequence)
		total2_bp += len(read2.sequence)
		for modifier in modifiers1:
			read1 = modifier(read1)
		for modifier in modifiers2:
			read2 = modifier(read2)
		for filter in filters:
			# Stop writing as soon as one of the filters was successful.
			if filter(read1, read2):
				break
		if n_progress != -1:
			count += 1
			pb.animate(count)

	return Statistics(n=n, total_bp1=total1_bp, total_bp2=total2_bp)
Example no. 55
    def check_randomness(self, drug_name, n_folds=10, N=10, show=True,
            progress=False):

        scores = []
        pb = Progress(N)
        for i in range(N):
            # Fit a model using CV
            inter_results = self.runCV(drug_name, n_folds=n_folds, verbose=False)
            scores.append(inter_results.Rp)
            if progress: 
                pb.animate(i+1)

        random_scores = []
        pb = Progress(N)
        for i in range(N):
            # Fit a model using CV
            inter_results = self.runCV(drug_name, n_folds=n_folds,
                                randomize_Y=True, verbose=False)
            random_scores.append(inter_results.Rp)
            if progress:
                pb.animate(i+1)

        from scipy.stats import ttest_ind
        ttest_res = ttest_ind(scores, random_scores)
        results = { "scores": scores,
                    "random_scores": random_scores,
                    "ttest_pval": ttest_res.pvalue}

        # Compute the log of the Bayes factor to avoid underflow as communicated
        # by M.Menden.
        S = sum([s > r for s, r in zip(scores, random_scores)])
        proba = S / len(scores)
        if proba == 1:
            # set a maximum instead of infinite (np.inf);
            # 1. / (1. / len(scores)) is simply len(scores)
            bayes_factor = float(len(scores))
        else:
            bayes_factor = 1. / (1 - proba)
        results['bayes_factor'] = bayes_factor

        if show:
            M = max(max(scores), max(random_scores)) * 1.2
            bins = pylab.linspace(0, M, 40)
            pylab.clf()
            pylab.hist(scores, bins=bins, color="b", alpha=0.5)
            pylab.hist(random_scores, color="r", alpha=0.5, bins=bins)
            pylab.title("ttest=%(ttest_pval).3e, bayes=%(bayes_factor)s" % results)
            pylab.grid(True)

        return results
Example no. 56
def test_progressbar():
    N = 2
    p = progressbar.progress_bar(N)

    for i in range(0, N):
        time.sleep(.1)
        p.animate(i+1, i)


    p = progressbar.TextProgressBar(N, progressbar.consoleprint)
    for i in range(0, N):
        time.sleep(.1)
        p.animate(i+1, i)

    p = Progress(100)
    p.animate(1)
    assert p.pb.interval == 1

    p = Progress(200)
    assert p.pb.interval == 2
    p.animate(1)