Ejemplo n.º 1
0
    def _score_challengeA_bunch(self, filenames, subname):
        """Score a set of challenge A files and summarise the results.

        Every file is scored with :meth:`score_challengeA`; the per-file
        results are then combined into a single summary.

        :param filenames: iterable of files to score. The summary below
            reads the first five results (networks 1 to 5).
        :param str subname: prefix passed to :meth:`score_challengeA`; the
            1-based position of the file is appended after an underscore.
        :return: a pandas Series holding the overall score, the AUPR and
            AUROC scores (minus the mean log10 of the p-values) and the
            AUPR/AUROC values of networks 1 to 5.
        """
        from easydev import Progress

        # The progress bar is sized for the five networks reported below.
        pb = Progress(5, 1)
        pb.animate(0)
        results = []
        for i, filename in enumerate(filenames):
            res = self.score_challengeA(filename, subname + "_" + str(i + 1))
            pb.animate(i + 1)
            results.append(res)

        # Each score must be derived from its own p-values (the keys used to
        # be swapped between the two variables).
        aupr_score = -np.mean(np.log10([x['p_aupr'] for x in results]))
        auroc_score = -np.mean(np.log10([x['p_auroc'] for x in results]))
        score = (aupr_score + auroc_score) / 2.

        df = pd.Series()
        df['Overall Score'] = score
        df['AUPR score (pval)'] = aupr_score
        # Previously filled with aupr_score by mistake.
        df['AUROC score (pval)'] = auroc_score

        for i in range(1, 6):
            df['AUPR Net %s' % i] = results[i - 1]['aupr']

        for i in range(1, 6):
            df['AUROC Net %s' % i] = results[i - 1]['auroc']

        return df
Ejemplo n.º 2
0
    def fit(self, amp=1, progress=False, n_jobs=-1):
        r"""Fit every candidate distribution to the data.

        Each name in :attr:`distributions` is handed to
        :meth:`_fit_single_distribution` through a joblib thread pool. Once
        all fits are done, :attr:`df_errors` is rebuilt from the error
        containers, one column per criterion:

            - ``sumsquare_error``: sum of the square errors between the data
              and the fitted distribution, i.e.
              :math:`\sum_i \left( Y_i - pdf(X_i) \right)^2`
            - ``aic``, ``bic`` and ``kl_div``

        :attr:`fitted_param` and :attr:`fitted_pdf` are presumably filled by
        :meth:`_fit_single_distribution` (not visible here).

        :param amp: not used by this method.
        :param bool progress: if True, create a progress bar in ``self.pb``;
            the flag is also forwarded to each single fit.
        :param int n_jobs: number of workers given to joblib.

        Note that RuntimeWarning is silenced for the whole process.
        """
        import warnings

        warnings.filterwarnings("ignore", category=RuntimeWarning)

        if progress:
            self.pb = Progress(len(self.distributions))

        runner = Parallel(n_jobs=n_jobs, backend="threading")
        # Return values are not needed: each fit stores its own results.
        runner(
            delayed(self._fit_single_distribution)(name, progress)
            for name in self.distributions
        )

        criteria = {}
        criteria["sumsquare_error"] = self._fitted_errors
        criteria["aic"] = self._aic
        criteria["bic"] = self._bic
        criteria["kl_div"] = self._kldiv
        self.df_errors = pd.DataFrame(criteria)
Ejemplo n.º 3
0
    def anova_one_drug(self, drug_id, animate=True, output='object'):
        """Compute the ANOVA for a given drug across all features.

        :param str drug_id: a valid drug identifier; it must be a column of
            ``self.ic50.df``.
        :param animate: show the progress bar
        :param output: output format selector; its handling is not visible
            in this excerpt.
        :return: a dataframe (per the original documentation; the return
            statement is not visible in this excerpt)

        Calls :meth:`anova_one_drug_one_feature` for each feature.
        """
        # some features can be dropped ??

        # Drop the leading columns that are not genomic features (the
        # original note says the first two columns hold the sample name and
        # the tissue, and that dropping them by position worked under
        # Python 2 but not Python 3).
        # Then, keep only the features whose column sum is at least 3.
        # MSI could be used but is not, like in the original R code.
        features = self.features.df.copy()
        # need to skip the FACTOR columns to keep only features
        shift = self.features.shift

        features = features[features.columns[shift:]]
        # FIXME what about features with less than 3 zeros ?
        mask = features.sum(axis=0) >= 3

        # TODO: MSI, tissues, name must always be kept
        #
        selected_features = features[features.columns[mask]]

        # scan all features for a given drug
        assert drug_id in self.ic50.df.columns
        N = len(selected_features.columns)
        # progress bar over the selected features, with an interval of 10
        pb = Progress(N, 10)
        res = {}
Ejemplo n.º 4
0
    def search_from_smile_inchembl(self):
        """Search ChEMBL with the SMILES of each drug's ChemSpider entries.

        For every drug in ``self.drug_ids``, each ChemSpider identifier
        stored in ``self.results[drug]`` is resolved with :meth:`_cs_get`
        and its ``smiles`` field is sent to ChEMBL. Two attributes are
        (re)built:

        - ``results_chemspider``: drug -> last ChemSpider entry retrieved
        - ``results_chembl``: drug -> list of compounds returned by ChEMBL
          (empty when nothing was found)
        """
        N = len(self.drug_ids)

        pb = Progress(N)
        self.results_chembl = {}
        self.results_chemspider = {}

        for i, drug in enumerate(self.drug_ids):
            self.results_chembl[drug] = []

            if self.results[drug]:
                for chemspider_id in self.results[drug]:
                    chemspider_entry = self._cs_get(chemspider_id)
                    self.results_chemspider[drug] = chemspider_entry
                    smile = chemspider_entry['smiles']
                    # now search in chembl
                    res_chembl = self.chembl.get_compounds_by_SMILES(smile)
                    try:
                        self.results_chembl[drug].extend(
                            res_chembl['compounds'])
                    except (KeyError, TypeError):
                        # Best effort: an answer without a usable
                        # 'compounds' entry (missing key, or a non-mapping
                        # such as an error code) is skipped. Any other
                        # error is no longer silently swallowed.
                        pass

            pb.animate(i + 1)
Ejemplo n.º 5
0
    def create_taxonomy_file(self, filename="taxonomy.dat"):
        """Write one flat-file record per taxon of ``df_nodes``.

        Each record holds the taxon identifier, its parent, its rank and its
        scientific name (looked up in ``df_names``), and is terminated by a
        ``//`` line.

        :param str filename: path of the output file (overwritten).
        """
        logger.info("Please wait while creating the output file. "
                    "This may take a few minutes")
        from easydev import Progress
        pb = Progress(len(self.df_nodes))
        # keep only the rows that carry a scientific name
        df_names = self.df_names.query("key == 'scientific name'").copy()
        with open(filename, "w") as fout:
            emit = fout.write
            for count, taxid in enumerate(self.df_nodes.index, start=1):
                row = self.df_nodes.loc[taxid]
                emit("ID                        : {}\n".format(taxid))
                emit("PARENT ID                 : {}\n".format(row.parent))
                emit("RANK                      : {}\n".format(row['rank']))
                scientific_name = df_names.loc[taxid, "name"]
                emit("{:26s}: {}\n".format("SCIENTIFIC NAME",
                                           scientific_name))
                emit("//\n")
                pb.animate(count)
Ejemplo n.º 6
0
    def create_html_drugs(self):
        """Create an HTML page for each drug.

        Every drug found in ``self.df`` gets a page. Drugs that belong to
        the significant set receive their sub-dataframe, the others an
        empty dictionary.
        """
        all_drugs = list(self.df['DRUG_ID'].unique())

        # significant associations, grouped by drug
        significant = self.get_significant_set()
        groups = significant.groupby('DRUG_ID')
        if self.verbose:
            print("Creating individual HTML pages for each drug")
        pb = Progress(len(all_drugs))
        for rank, drug in enumerate(all_drugs, start=1):
            subdf = groups.get_group(drug) if drug in groups.groups else {}

            page = HTMLOneDrug(self, self.df, subdf, drug)
            page.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(rank)
        if self.settings.animate:
            print("\n")
Ejemplo n.º 7
0
    def select_random_reads(self, N=None, output_filename="random.fasta"):
        """Select random reads and save them in a file.

        :param N: number of random unique reads to select (capped to the
            number of reads), or a set/list/tuple of 0-based read positions
            to extract.
        :param str output_filename: path of the output file (overwritten).
        :return: the set of selected read positions.
        :raises TypeError: if N is neither an integer nor a collection of
            positions (this includes the default ``None``).
        """
        import numbers

        import numpy as np
        thisN = len(self)
        # numbers.Integral also accepts numpy integer types
        if isinstance(N, numbers.Integral):
            if N > thisN:
                N = thisN
            # create random set of reads to pick up
            cherries = list(range(thisN))
            np.random.shuffle(cherries)
            # cast to set for efficient membership tests
            cherries = set(cherries[0:N])
        elif isinstance(N, (set, frozenset)):
            cherries = N
        elif isinstance(N, (list, tuple)):
            cherries = set(N)
        else:
            # Fail early with a clear message rather than with an unbound
            # local variable once the file is being scanned.
            raise TypeError(
                "N must be an integer or a set/list of read positions; "
                "got {!r}".format(N))
        fasta = FastxFile(self.filename)
        pb = Progress(thisN)  # since we scan the entire file
        with open(output_filename, "w") as fh:
            for i, read in enumerate(fasta):
                if i in cherries:
                    fh.write(str(read) + "\n")
                pb.animate(i + 1)
        return cherries
Ejemplo n.º 8
0
    def download_accession_from_ncbi(self, accession):
        """Fetch FASTA files from NCBI for one or several accessions.

        :param accession: a list of accessions, a single accession string,
            or the name of an existing file holding whitespace-separated
            accessions.
        :raises TypeError: if accession is neither a list nor a string.

        Each record is saved in ``<dbname>/library/<accession>.fa``.
        Accessions that cannot be fetched are reported and skipped.
        """
        if isinstance(accession, list):
            # a list used to fall through and leave `accessions` undefined
            accessions = accession
        elif isinstance(accession, str):
            if os.path.exists(accession):
                with open(accession, "r") as fin:
                    accessions = fin.read().split()
            else:
                accessions = [accession]
        else:
            raise TypeError(
                "accession must be a list, an accession string or a "
                "filename; got {!r}".format(accession))

        from easydev import Progress
        N = len(accessions)
        pb = Progress(N)
        logger.info("Fetching {} accession fasta files from NCBI".format(N))
        for i, this_accession in enumerate(accessions):
            data = self.eutils.EFetch("nucleotide",
                                      rettype="fasta",
                                      id=this_accession,
                                      retmode="text")
            # an integer answer is treated as a failed request
            if isinstance(data, int):
                msg = "Could not fetch this accession: {}. continue".format(
                    this_accession)
                logger.info(msg)
                print(msg)
            else:
                outname = "{}/library/{}.fa".format(self.dbname,
                                                    this_accession)
                with open(outname, "wb") as fout:
                    fout.write(data)
            pb.animate(i + 1)
Ejemplo n.º 9
0
    def _get_G(self, gold):
        """Build the signed matrix G from the gold standard edges.

        :param gold: dataframe whose columns 0 and 1 hold 1-based gene
            identifiers (presumably regulator and target of each edge).
        :return: array of shape ``(gold[0].max(), gold[1].max())``. For each
            (regulator, target) pair the entry is 1 when the edge is in
            gold, -1 when it is not and the two genes differ, 0 otherwise.
        """
        from easydev import Progress
        regulators = list(set(gold[0]))
        targets = list(set(gold[[0,1]].stack()))

        N, M = gold[0].max(), gold[1].max()

        # A stores indices going from 0 (not 1) to N-1, hence the -1 when
        # indexing A with gene identifiers. The unused sparse copy of A
        # that used to be built here has been removed.
        A = np.zeros((N, M))
        for row in gold[[0,1]].values:
            i, j = row
            A[i-1, j-1] = 1

        G = np.zeros((N, M))

        pb = Progress(len(regulators), 1)
        for i, x in enumerate(regulators):
            for y in targets:
                if A[x-1, y-1] == 1:
                    G[x-1, y-1] = 1
                elif x != y:
                    G[x-1, y-1] = -1
            pb.animate(i+1)
        return G
Ejemplo n.º 10
0
    def diagnostics(self):
        """Return a dictionary with information about the analysis.

        Every drug/feature combination is checked with
        :meth:`_get_one_drug_one_feature_data` in diagnostic mode; a
        combination is feasible when the returned ``status`` is True.

        :return: dictionary with the keys ``n_drug``, ``n_combos``,
            ``feasible_tests`` and ``percentage_feasible_tests`` (0 when
            there is no combination at all).
        """
        n_drugs = len(self.ic50.drugIds)
        n_features = len(self.features.features) - self.features.shift
        n_combos = n_drugs * n_features
        feasible = 0
        pb = Progress(n_drugs, 1)
        for counter, drug in enumerate(self.ic50.drugIds, start=1):
            for feature in self.features.features[self.features.shift:]:
                dd = self._get_one_drug_one_feature_data(drug, feature,
                        diagnostic_only=True)
                if dd.status is True:
                    feasible += 1
            pb.animate(counter)

        # avoid a ZeroDivisionError when there is nothing to test
        if n_combos:
            percentage = float(feasible) / n_combos * 100
        else:
            percentage = 0.0

        results = {
                'n_drug': n_drugs,
                'n_combos': n_combos,
                'feasible_tests': feasible,
                'percentage_feasible_tests': percentage}
        return results
Ejemplo n.º 11
0
    def filling_chembl_pubchem_using_unichem(self):
        """Fill missing ChemSpider identifiers using the drug names.

        First pass: for each drug without any CHEMSPIDER, CHEMBL or PUBCHEM
        identifier in ``self.dd.df``, ChemSpider is searched by drug name
        and the hit(s) are stored in the CHEMSPIDER column of
        ``self.dd_filled.df`` (a single value when unique, the full list of
        hits otherwise).

        Second pass: ChemSpider is queried for drugs with exactly one of
        the three identifiers; the answer is currently discarded.
        """
        columns = ['CHEMSPIDER', 'CHEMBL', 'PUBCHEM']
        N = len(self.drug_ids)
        pb = Progress(N)
        for i, this in enumerate(self.drug_ids):
            entry = self.dd.df.loc[this]
            # if no information is provided, we will need to get it
            # from chemspider

            # From the database, when chembl is provided, it is unique
            # same for chemspider and pubchem and CAS
            select = entry[columns]
            if select.count() == 0:
                results = self._cs_find(entry.DRUG_NAME)
                # Write with a single indexer: the former chained
                # ``.loc[this].loc[...] = value`` assigned to a temporary
                # copy and never updated the dataframe.
                if len(results) == 1:
                    self.dd_filled.df.at[this, 'CHEMSPIDER'] = results[0]
                elif len(results) > 1:
                    # non unique: keep all the hits
                    self.dd_filled.df.at[this, 'CHEMSPIDER'] = results
            pb.animate(i + 1)

        # Search in chemspider systematically
        for i, this in enumerate(self.drug_ids):
            entry = self.dd.df.loc[this]
            # `select` must be recomputed for each drug; it used to be the
            # stale value left over by the first loop.
            select = entry[columns]
            if select.count() == 1:
                # NOTE(review): the original call used an undefined name
                # (`drug`); the drug name is assumed to be the intended
                # query, as in the first pass. The result is not used yet.
                self._cs_find(entry.DRUG_NAME)

            pb.animate(i + 1)
Ejemplo n.º 12
0
 def find_motif_fasta(self, filename, motif, window=200,
         local_threshold=None, global_threshold=None):
     """Scan all sequences of a FASTA file for a motif.

     Each sequence is scored with :meth:`find_motif_from_sequence`; the
     sequences whose score reaches the global threshold are reported.

     :param str filename: FASTA file to scan.
     :param motif: motif forwarded to :meth:`find_motif_from_sequence`.
     :param int window: forwarded to :meth:`find_motif_from_sequence`.
     :param local_threshold: forwarded to
         :meth:`find_motif_from_sequence`.
     :param global_threshold: minimum score required to report a sequence.
         Defaults to ``self.global_threshold`` when None.
     :return: dataframe with the columns query_name, hit, length, start
         and end (one row per reported sequence).
     """
     from sequana import FastA
     from easydev import Progress

     # Honour the argument: it used to be silently ignored in favour of
     # the instance attribute.
     if global_threshold is None:
         global_threshold = self.global_threshold

     data = FastA(filename)
     pb = Progress(len(data))
     df = {
         "query_name": [],
         "hit": [],
         "length": [],
         "start": [],
         "end": []
     }
     for i, item in enumerate(data):
         _, S = self.find_motif_from_sequence(item.sequence, motif,
                     window=window, local_threshold=local_threshold
                     )
         if S >= global_threshold:
             # the whole sequence is reported as the hit region
             df['query_name'].append(item.name)
             df['start'].append(0)
             df['end'].append(len(item.sequence))
             df['length'].append(len(item.sequence))
             df['hit'].append(S)
         pb.animate(i+1)
     df = pd.DataFrame(df)
     return df
Ejemplo n.º 13
0
    def to_kmer_content(self, k=7):
        """Return a Series with kmer count across all reads

        :param int k: (default to 7-mers)
        :return: Pandas Series with index as kmer and values as count,
            sorted by decreasing count.

        Takes about 30 seconds on a million reads.
        """
        # Counter is slow if we apply it on each read (.count is slow as
        # well), so kmers are buffered and counted in large batches.
        import collections
        from sequana.kmer import get_kmer
        counter = collections.Counter()
        pb = Progress(len(self))
        buffer_ = []
        for i, this in enumerate(self):
            buffer_.extend(get_kmer(this['sequence'], k))
            if len(buffer_) > 100000:
                # update() counts in place, without a temporary Counter
                counter.update(buffer_)
                buffer_ = []
            # i + 1 so that the bar reaches 100% on the last read
            pb.animate(i + 1)
        counter.update(buffer_)

        ts = pd.Series(counter)
        ts.sort_values(inplace=True, ascending=False)

        return ts
Ejemplo n.º 14
0
    def create_html_associations(self):
        """Create an HTML page for each significant association

        The name of the output HTML file is **<association id>.html**
        where association id is stored in :attr:`df`.

        A single :class:`Association` instance is created and re-initialised
        for every row of the significant set.
        """
        print("\nCreating individual HTML pages for each association")
        significant = self.get_significant_set()

        rows = zip(significant['DRUG_ID'].values,
                   significant['FEATURE'].values,
                   significant['ASSOC_ID'].values,
                   significant['ANOVA_FEATURE_FDR'].values)

        pb = Progress(len(significant))

        html = Association(self, drug='dummy', feature='dummy', fdr='dummy')

        for done, (drug, feature, assoc, fdr) in enumerate(rows, start=1):
            html.drug = drug
            html.feature = feature
            html._filename = str(assoc) + '.html'
            html.fdr = fdr
            html.assoc_id = assoc
            # the instance is shared, so the report must be reset each time
            html._init_report()
            html.create_report(onweb=False)
            pb.animate(done)
Ejemplo n.º 15
0
    def get_supported_families(self, N=1000, progress=True):
        """Returns the list of supported PANTHER family IDs

        The service returns a limited number of items per request (N,
        1000 at the time of writing), selected with a start index.
        As of 20 Feb 2020, there was about 15,000 families.

        This function simplifies your life by calling the service as many
        times as required. Therefore it returns all families in one go.

        :param int N: number of items the service returns per request.
        :param bool progress: show a progress bar.
        :return: list of families.
        :raises ValueError: if the first request does not return exactly N
            items (the page size of the service probably changed).
        """
        import math

        from easydev import Progress
        params = {'startIndex': 1}
        res = self.http_get("supportedpantherfamilies", params=params)
        results = res['search']['panther_family_subfam_list']['family']
        if len(results) != N:
            msg = "looks like the services changed. Call this function with N={}"
            msg = msg.format(len(results))
            raise ValueError(msg)

        number_of_families = res['search']['number_of_families']
        # Total number of pages, the first one included. Rounding up avoids
        # an extra request past the last family when the total is an exact
        # multiple of N.
        n_pages = int(math.ceil(number_of_families / float(N)))
        pb = Progress(n_pages)
        if progress:
            pb.animate(1)
        for i in range(1, n_pages):
            params = {'startIndex': i * N + 1}
            res = self.http_get("supportedpantherfamilies", params=params)
            data = res['search']['panther_family_subfam_list']['family']
            results.extend(data)
            if progress:
                pb.animate(i + 1)
        return results
Ejemplo n.º 16
0
    def get_gis(self, extensions=['fa']):
        """Collect the GI number of every FASTA file of the library.

        Files are searched two and three levels below ``<dbname>/library``
        for each extension. The first line of each file must be a FASTA
        header whose second ``|``-separated field is the GI number.

        :param extensions: file name suffixes to look for (the default list
            is never modified).
        :return: list of GI numbers (int), in the order of
            ``self.filenames``.
        :raises ValueError: if a file does not start with ``>``.

        Sets ``self.filenames`` and ``self.gis``.
        """
        self.filenames = []
        root = self.dbname
        for pattern in ("%s/library/**/*%s", "%s/library/**/**/*%s"):
            for extension in extensions:
                self.filenames.extend(
                    glob.iglob(pattern % (root, extension)))

        N = len(self.filenames)
        pb = Progress(N)
        gis = []
        for i, filename in enumerate(self.filenames):
            # only the header is needed; the file is closed right away
            with open(filename, "r") as data:
                line = data.readline()
            if line.startswith('>'):
                assert "gi" in line, "expected >gi to be found at the beginning"
                gi = line[1:].split("|")[1]
            else:
                raise ValueError(
                    "This file %s does not seem to be a FASTA file" % filename)
            gis.append(gi)
            pb.animate(i + 1)
        print()
        gis = [int(x) for x in gis]
        self.gis = gis

        assert len(gis) == len(self.filenames)
        return gis
Ejemplo n.º 17
0
    def filter(self, identifiers_list=[], min_bp=None, max_bp=None,
        progressbar=True, output_filename='filtered.fastq'):
        """Save reads in a new file if they are not in the identifiers_list

        :param identifiers_list: identifiers of the reads to discard (the
            default list is never modified).
        :param int min_bp: ignore reads with length shorter than min_bp
        :param int max_bp: ignore reads with length above max_bp
        :param bool progressbar: show a progress bar.
        :param str output_filename: name of the output file; it is
            compressed at the end when :meth:`_istozip` requests it.

        """
        # 7 seconds without identifiers to scan the file
        # on a 750000 reads

        if min_bp is None:
            min_bp = 0

        if max_bp is None:
            max_bp = 1e9

        # constant-time membership test for each read
        discarded = set(identifiers_list)

        # make sure we are at the beginning
        self.rewind()

        output_filename, tozip = self._istozip(output_filename)

        with open(output_filename, "w") as fout:
            pb = Progress(self.n_reads)
            # records are accumulated in a list and joined when flushed,
            # which avoids repeated string concatenation
            buf = []
            filtered = 0
            saved = 0

            for count, lines in enumerate(grouper(self._fileobj)):
                identifier = lines[0].split()[0].decode()
                if identifier in discarded:
                    filtered += 1
                else:
                    N = len(lines[1])
                    if min_bp <= N <= max_bp:
                        buf.append("{}{}+\n{}".format(
                            lines[0].decode("utf-8"),
                            lines[1].decode("utf-8"),
                            lines[3].decode("utf-8")))
                        saved += 1
                    else:
                        filtered += 1
                    if count % 100000 == 0:
                        fout.write("".join(buf))
                        buf = []
                if progressbar is True:
                    pb.animate(count+1)
            fout.write("".join(buf))
            if filtered < len(identifiers_list):
                print("\nWARNING: not all identifiers were found in the fastq file to " +
                      "be filtered.")
        logger.info("\n{} reads were filtered out and {} saved in {}".format(
            filtered, saved, output_filename))

        if tozip is True:
            logger.info("Compressing file")
            self._gzip(output_filename)
Ejemplo n.º 18
0
 def _init(self):
     """Reset the fit results and the progress state.

     Every result container starts as an empty dictionary, the fit
     counter goes back to zero and a new progress bar sized on the number
     of distributions is created.
     """
     containers = ("fitted_param", "fitted_pdf", "_fitted_errors",
                   "_aic", "_bic", "_kldiv")
     for attribute in containers:
         # a distinct dictionary for each attribute
         setattr(self, attribute, {})
     self._fit_i = 0  # fit progress
     self.pb = Progress(len(self.distributions))
Ejemplo n.º 19
0
    def volcano_plot_all_drugs(self):
        """Create a volcano plot for each drug and save in PNG files

        Each filename is set to **volcano_<drug identifier>.png**
        """
        unique_drugs = list(self.df[self._colname_drugid].unique())
        pb = Progress(len(unique_drugs), 1)
        for done, drug in enumerate(unique_drugs, start=1):
            self.volcano_plot_one_drug(drug)
            filename = "volcano_%s.png" % drug
            self.savefig(filename, size_inches=(10, 10))
            pb.animate(done)
Ejemplo n.º 20
0
    def volcano_plot_all_features(self):
        """Create a volcano plot for each feature and save in PNG files

        Each filename is set to **volcano_<feature name>.png**
        """
        unique_features = list(self.df[self._colname_feature].unique())
        print('Creating image for each feature (using all drugs)')
        pb = Progress(len(unique_features), 1)
        for done, feature in enumerate(unique_features, start=1):
            self.volcano_plot_one_feature(feature)
            filename = "volcano_%s.png" % feature
            self.savefig(filename, size_inches=(10, 10))
            pb.animate(done)
Ejemplo n.º 21
0
    def create_summary_pages(self):
        """Create summary pages

        Once the main analysis is done (:meth:`analyse`), and the company
        packages have been created
        (:meth:`create_data_packages_for_companies`), you can run this
        method that will create a summary HTML page (index.html) for the
        tissues, and a similar summary HTML page for the tissues of each
        company. Finally, an HTML summary page for the companies is also
        created.

        The final directory tree looks like::


            |-- index.html
            |-- company_packages
            |   |-- index.html
            |   |-- Company1
            |   |   |-- Tissue1
            |   |   |-- Tissue2
            |   |   |-- index.html
            |   |-- Company2
            |   |   |-- Tissue1
            |   |   |-- Tissue2
            |   |   |-- index.html
            |-- tissue_packages
            |   |-- index.html
            |   |-- Tissue1
            |   |-- Tissue2


        A failure on one company is printed and does not stop the others.
        """
        # Step 1: the main directory (tissue_packages)
        print(purple("Creating summary index.html for the tissues"))
        self._create_summary_pages(self.main_directory, verbose=False)

        # Step 2: one summary per company
        print(purple("Creating summary index.html for each company"))
        pb = Progress(len(self.companies))
        for done, company in enumerate(self.companies, start=1):
            try:
                self._create_summary_pages(
                    self.company_directory + os.sep + company,
                    verbose=False,
                    company=company)
            except Exception as err:
                message = "Issue with %s. Continue with other companies" % company
                print(red(message))
                print(err)
            pb.animate(done)

        # Step 3: an index pointing to each company
        self._create_main_index()
Ejemplo n.º 22
0
def test_progressbar():
    """Exercise the progress bar classes and the Progress interval."""
    N = 2

    def drive(bar):
        # animate the bar once per step, with a short pause in between
        for step in range(N):
            time.sleep(.1)
            bar.animate(step + 1, step)

    drive(progressbar.progress_bar(N))
    drive(progressbar.TextProgressBar(N, progressbar.consoleprint))

    # 100 iterations give an interval of 1
    p = Progress(100)
    p.animate(1)
    assert p.pb.interval == 1

    # 200 iterations give an interval of 2
    p = Progress(200)
    assert p.pb.interval == 2
    p.animate(1)
Ejemplo n.º 23
0
def check_ipython_notebook():
    """Run every notebook found in the current directory.

    Each file matching ``*ipynb`` is read as JSON and executed with
    :class:`NotebookRunner`; its name is printed before it runs.
    """
    notebooks = glob.glob("*ipynb")
    N = len(notebooks)

    pb = Progress(N)
    for i, filename in enumerate(notebooks):
        print(purple(filename))
        # close the file as soon as the notebook is parsed
        with open(filename) as fh:
            notebook = read(fh, 'json')
        r = NotebookRunner(notebook)
        r.run_notebook()
        pb.animate(i + 1)
Ejemplo n.º 24
0
 def _load_complexes(self, show_progress=True):
     """Fetch the details of every complex and cache them.

     :param bool show_progress: animate a progress bar while loading.

     The details of each identifier in ``self.df.complexAC`` are requested
     from the web service and stored in ``self._complexes``, keyed by
     identifier.
     """
     from easydev import Progress
     identifiers = self.df.complexAC
     pb = Progress(len(identifiers))
     details = {}
     self.logging.info("Loading all details from the IntactComplex database")
     for done, identifier in enumerate(identifiers, start=1):
         details[identifier] = self.webserv.details(identifier)
         if show_progress:
             pb.animate(done)
     self._complexes = details
Ejemplo n.º 25
0
 def create_html_features(self):
     """Create an HTML page for each significant feature"""
     significant = self.get_significant_set()
     groups = significant.groupby('FEATURE')
     print("\nCreating individual HTML pages for each feature")
     pb = Progress(len(groups.indices))
     for done, feature in enumerate(groups.indices, start=1):
         # rows of the significant set that belong to this feature
         subdf = groups.get_group(feature)
         page = HTMLOneFeature(self, self.df, subdf, feature)
         page.create_report(onweb=False)
         pb.animate(done)
Ejemplo n.º 26
0
    def plot_pca_vs_max_features(self, step=100, n_components=2,
            progress=True):
        """Plot the output of :meth:`plot` against the number of features.

        :meth:`plot` is called (without showing the figure) with
        ``max_features`` going from 10 up to the length of the dataframe;
        one panel per component then shows the returned values, times 100.

        :param int step: increment of ``max_features`` (capped to the
            length of the dataframe).
        :param int n_components: number of components (2, 3 or 4).
        :param bool progress: show a progress bar.

        .. plot::
            :include-source:

            from sequana.viz.pca import PCA
            from sequana import sequana_data
            import pandas as pd

            data = sequana_data("test_pca.csv")
            df = pd.read_csv(data)
            df = df.set_index("Id")

            p = PCA(df)
            p.plot_pca_vs_max_features()

        """
        assert n_components in [2,3,4]
        N = len(self.df)
        step = min(step, N)

        # max_features values to scan, starting at 10
        X = range(10, N, step)
        from easydev import Progress
        pb = Progress(len(X))
        Y = []
        for done, max_features in enumerate(X, start=1):
            Y.append(self.plot(n_components=n_components,
                               max_features=max_features, show_plot=False))
            if progress:
                pb.animate(done)

        # one stacked panel per component
        for rank in range(1, n_components + 1):
            pylab.subplot(n_components, 1, rank)
            pylab.plot(X, [y[rank - 1]*100 for y in Y])
            pylab.ylabel("PC%d (%%)" % rank)
Ejemplo n.º 27
0
    def _load_pathways(self, progress=True):
        """Download and parse all KEGG pathways once for all.

        :param bool progress: show a progress bar while downloading.

        Populates three attributes:

        - ``pathways``: parsed pathway keyed by identifier (without the
          ``path:`` prefix); NAME is reduced to the text before `` - ``.
        - ``gene_sets``: gene names for each pathway that has a GENE entry.
        - ``df_pathways``: one row per pathway, without the ENTRY,
          REFERENCE and DBLINKS columns, plus a GO column taken from
          DBLINKS.
        """
        logger.info(
            "loading all pathways from KEGG. may take time the first time")
        self.pathways = {}
        from easydev import Progress
        pb = Progress(len(self.kegg.pathwayIds))
        for done, ID in enumerate(self.kegg.pathwayIds, start=1):
            parsed = self.kegg.parse(self.kegg.get(ID))
            self.pathways[ID.replace("path:", "")] = parsed
            if progress:
                pb.animate(done)

        # Some cleanup: keep only the part of the name before " - "
        for pathway in self.pathways.values():
            pathway['NAME'] = pathway['NAME'][0].split(" - ", 1)[0]

        # save gene sets
        self.gene_sets = {}
        for ID, res in self.pathways.items():
            if "GENE" not in res.keys():
                print("SKIPPED (no genes) {}: {}".format(ID, res['NAME']))
                continue
            # some pathways report genes as id:'gene name; description'
            # (e.g. eco), others as id:'description'; the identifier is
            # used when no name is available
            self.gene_sets[ID] = [
                description.split(';')[0] if ";" in description else geneID
                for geneID, description in res['GENE'].items()
            ]

        # save all pathways info
        self.df_pathways = pd.DataFrame(self.pathways).T
        for column in ("ENTRY", "REFERENCE"):
            del self.df_pathways[column]
        go = []
        for links in self.df_pathways.DBLINKS:
            if isinstance(links, dict) and 'GO' in links.keys():
                go.append(links['GO'])
            else:
                go.append(None)
        self.df_pathways['GO'] = go
        del self.df_pathways["DBLINKS"]
Ejemplo n.º 28
0
    def filter_names_dmp_file(self,
                              filename="names.dmp",
                              output="names_filtered.dmp",
                              taxons=[]):
        """Keep the lines of a names dump that belong to given lineages.

        The taxons returned by :meth:`get_family` for each requested taxon
        are collected; a line of the input file is then copied to the
        output when its first tab-separated field (an integer) is one of
        them.

        :param str filename: input file.
        :param str output: output file (overwritten).
        :param taxons: taxon identifiers of interest (the default list is
            never modified).
        """
        all_taxons = set()
        pb = Progress(len(taxons))
        for i, taxon in enumerate(taxons):
            parents = self.get_family(taxon)
            all_taxons.update(parents)
            pb.animate(i + 1)
        print("")
        with open(filename, "r") as fin, open(output, "w") as fout:
            # stream the file instead of loading it entirely in memory
            for line in fin:
                if int(line.split("\t", 1)[0]) in all_taxons:
                    fout.write(line)
Ejemplo n.º 29
0
    def select_random_reads(self, N=None, output_filename="random.fastq"):
        """Select random reads and save in a file

        :param N: number of random unique reads to select (capped to the
            number of reads), or a set/list/tuple of 0-based read positions.
            You can select random reads for R1, and re-use the returned
            set as input for the R2 (since pairs must be kept)
        :param str output_filename: path of the output file (overwritten).
        :return: the set of selected read positions.
        :raises TypeError: if N is neither an integer nor a collection of
            positions (this includes the default ``None``).

        If you have a pair of files, the same reads must be selected in R1
        and R2.::

            f1 = FastQ(file1)
            selection = f1.select_random_reads(N=1000)
            f2 = FastQ(file2)
            f2.select_random_reads(selection)


        """
        import numbers

        thisN = len(self)
        # numbers.Integral also accepts numpy integer types
        if isinstance(N, numbers.Integral):
            if N > thisN:
                N = thisN
            # create random set of reads to pick up
            cherries = list(range(thisN))
            np.random.shuffle(cherries)
            # cast to set for efficient membership tests
            cherries = set(cherries[0:N])
        elif isinstance(N, (set, frozenset)):
            cherries = N
        elif isinstance(N, (list, tuple)):
            cherries = set(N)
        else:
            # Fail early with a clear message rather than with an unbound
            # local variable once the file is being scanned.
            raise TypeError(
                "N must be an integer or a set/list of read positions; "
                "got {!r}".format(N))

        pb = Progress(thisN) # since we scan the entire file
        # both handles are closed on exit, even if an error occurs
        with pysam.FastxFile(self.filename) as fastq, \
                open(output_filename, "w") as fh:
            for i, read in enumerate(fastq):
                if i in cherries:
                    fh.write(str(read) + "\n")
                pb.animate(i+1)
        return cherries
Ejemplo n.º 30
0
    def optimise_elastic_net(self, drug_name, feature_name, N=20, Nalpha=20):
        """Scan ElasticNet settings for one drug/feature association.

        The L1 weight takes N values in [0, 1] and alpha takes Nalpha
        values in [0, 5]. For each pair, the regression settings are
        updated and the association is analysed again.

        :param drug_name: drug given to :meth:`anova_one_drug_one_feature`.
        :param feature_name: feature given to
            :meth:`anova_one_drug_one_feature`.
        :param int N: number of L1 weights to scan.
        :param int Nalpha: number of alpha values to scan.
        :return: array of shape (N, Nalpha) filled with ``self.data_lm.bic``
            for each (L1 weight, alpha) pair.

        The regression settings are left at the last values scanned.
        """
        l1_weights = pylab.linspace(0, 1, N)
        alpha_values = pylab.linspace(0, 5, Nalpha)

        mses = np.zeros((N, Nalpha))

        pb = Progress(N)
        for row, l1_weight in enumerate(l1_weights):
            for col, alpha in enumerate(alpha_values):
                self.settings.regression_method = 'ElasticNet'
                self.settings.regression_alpha = alpha
                self.settings.regression_L1_wt = l1_weight
                # both calls are kept as in the original; their return
                # values are not used here
                self.anova_one_drug_one_feature(drug_name, feature_name)
                self._get_anova_summary(self.data_lm, output='dataframe')
                mses[row, col] = self.data_lm.bic
            pb.animate(row + 1)
        return mses