Esempi in Python per AttrDict, esempi in Python per easydev.AttrDict

Esempio n. 1

0

Mostra file

File: volcano.py Progetto: howard-lightfoot/gdsctools

    def __init__(self, data, sep="\t", settings=None):
        """.. rubric:: Constructor

        :param data: an :class:`~gdsctools.anova.ANOVAResults` instance
            or a dataframe with the proper columns names (see below)
        :param sep: not used by this constructor; kept in the signature so
            that existing callers passing it keep working
        :param settings: an instance of
            :class:`~gdsctools.settings.ANOVASettings` (anything that can be
            expanded with ``**`` into an ``AttrDict``). If None, a default
            ``ANOVASettings`` is created.

        Expected column names to be found if a filename is provided::

            ANOVA_FEATURE_pval
            ANOVA_FEATURE_FDR
            FEATURE_delta_MEAN_IC50
            FEATURE_IC50_effect_size
            N_FEATURE_pos
            N_FEATURE_neg
            FEATURE
            DRUG_ID

        If the plotting is too slow, you can use the :meth:`selector` to prune
        the results (most of the data are noise and overlap on the middle
        bottom area of the plot with little information).

        """
        # Work on a copy since we may change the data.
        try:
            # an ANOVAResults contains a df attribute
            self.df = data.df.copy()
        except AttributeError:
            # no ``df`` attribute: probably a dataframe already
            self.df = data.copy()

        if settings is None:
            from gdsctools.settings import ANOVASettings
            self.settings = ANOVASettings()
        else:
            # re-wrap the input so that values are reachable as attributes
            self.settings = AttrDict(**settings)

        self.figtools = Savefig()
        self.figtools.directory = self.settings.directory

        drug_column = self._colname_drugid
        feature_column = self._colname_feature

        self.drugs = set(self.df[drug_column])
        self.features = set(self.df[feature_column])

        # intensive calls made once for all
        self.groups_by_drugs = self.df.groupby(drug_column).groups
        self.groups_by_features = self.df.groupby(feature_column).groups

Esempio n. 2

0

Mostra file

File: rnadiff.py Progetto: sequana/sequana

    def import_tables(self):
        """Load one ``RNADiffTable`` per file listed in ``self.files``.

        Each table is stored under the file stem with the ``_degs_DESeq2``
        marker removed and dashes turned into underscores, so that the name
        can be used as an attribute of the returned ``AttrDict``.

        :return: an ``AttrDict`` mapping comparison names to their tables
        """
        from easydev import AttrDict

        tables = {}
        for compa in self.files:
            label = compa.stem.replace("_degs_DESeq2", "")
            label = label.replace("-", "_")
            tables[label] = RNADiffTable(
                compa,
                alpha=self._alpha,
                log2_fc=self._log2_fc,
                condition=self.condition,
            )

        return AttrDict(**tables)

Esempio n. 3

0

Mostra file

File: fastq.py Progetto: yuelianghaoyuana/biokit

    def create_random_data(self, N, min_length=10, max_length=40):
        """Replace the current entries with ``N`` random reads.

        Read lengths are drawn from a normal distribution centred on the
        middle of ``[min_length, max_length]`` with a standard deviation of
        one tenth of that range; bases are drawn from ``ACGT`` and quality
        characters from the first 90 items of ``self._quality_character``.

        :param int N: number of entries to generate
        :param min_length: lower value used to set the length distribution
        :param max_length: upper value used to set the length distribution
        """
        import numpy as np
        alphabet = 'ACGT'
        self.clear()
        for index in range(0, N):
            centre = (max_length + min_length) / 2.
            spread = (max_length - min_length) / 10.
            length = int(np.random.normal(centre, spread))

            # draw every base first, then every quality character
            bases = ''.join(
                alphabet[np.random.randint(4)] for _ in range(0, length))
            qualities = ''.join(
                self._quality_character[np.random.randint(90)]
                for _ in range(0, length))

            read = {
                'identifier': str(index),
                'sequence': bases,
                'quality': qualities,
            }
            self.entries.append(AttrDict(**read))

Esempio n. 4

0

Mostra file

File: databases.py Progetto: naveen584/sequana

    def accession_to_info(self, ids):
        """Fetch summary information for one or several accessions.

        :param str ids: one accession, or several separated by commas
        :return: a dictionary keyed by the accessions found in ``ids``
            (stripped of surrounding spaces); each value is an ``AttrDict``
            with the keys ``taxid``, ``accession``, ``identifier``, ``gi``
            and ``comment``
        :raises IndexError: if an expected item (Extra, Title, TaxId, Gi) is
            missing from a record, or if more records than accessions are
            returned
        """
        def first_text(items, name):
            # text of the first item whose @Name matches
            return [x for x in items if x['@Name'] == name][0]['#text']

        res = self.eutils.EFetch(db="nuccore",
                                 id=ids,
                                 rettype="docsum",
                                 retmode="dict")

        res = res['eSummaryResult']['DocSum']

        # if one id provided, it will be a dict, otherwise a list of dicts
        try:
            res[0]
        except (KeyError, IndexError):
            res = [res]

        # now we can loop over all identifiers
        records = {}
        accessions = [x.strip() for x in ids.split(',')]

        for i, entry in enumerate(res):
            items = entry['Item']

            identifier = first_text(items, "Extra")
            if "||" in identifier:
                # strip content after ||
                identifier = identifier.split("||")[0]

            record = {
                "taxid": first_text(items, "TaxId"),
                'accession': entry['Id'],
                "identifier": identifier,
                'gi': first_text(items, "Gi"),
                'comment': first_text(items, "Title")
            }

            # records are assumed to come back in the order of the request
            records[accessions[i]] = AttrDict(**record)
        return records

Esempio n. 5

0

Mostra file

File: fastq.py Progetto: yuelianghaoyuana/biokit

    def _parse_data(self, data):
        """Parse FASTQ-formatted text line by line.

        The parser cycles through three modes stored in
        ``self._parsing_mode``: ``identifier`` (a line starting with ``@``),
        ``sequence`` (one or more lines, ended by a line starting with
        ``+``) and ``quality`` (as many lines as the sequence had). Each
        complete record is appended to ``self.entries`` as an ``AttrDict``.

        :param str data: the text to parse
        :raises ValueError: if an identifier line does not start with ``@``
        """
        self._init()
        for i, line in enumerate(data.split("\n")):
            # skip blank lines
            if not line:
                continue
            # have we finished parsing the sequence ?
            # if + is found in the new line, then the answer is yes
            if line.startswith("+") and self._parsing_mode == 'sequence':
                self._parsing_mode = 'quality'
                continue

            # assume identifier is on 1 line only
            if self._parsing_mode == 'identifier':
                if line.startswith("@"):
                    self.identifier = FASTQIdentifier(line)  # check
                    self.identifier = self.identifier.identifier
                    self._parsing_mode = 'sequence'
                else:
                    raise ValueError(
                        "Expected @ at the beginning of the line %s" % i)
            elif self._parsing_mode == 'sequence':
                self.sequence += line
                self.nolines += 1
            elif self._parsing_mode == 'quality':
                self.quality += line
                # A quality line may start with @, like an identifier.
                # The number of sequence lines read tells how many quality
                # lines to expect, so count them down instead.
                self.nolines -= 1
                if self.nolines == 0:
                    # clean sequence and quality strings
                    self.sequence = self.sequence.replace("\n", "")
                    self.quality = self.quality.replace("\n", "")
                    entry = AttrDict(**self.to_dict())
                    try:
                        self.entries.append(entry)
                    except AttributeError:
                        # No ``entries`` container: we assume that this is
                        # the SingleFASTQ class (one record only) rather
                        # than the FASTQ class, so stop here.
                        break
                    self._init()

Esempio n. 6

0

Mostra file

File: adapters.py Progetto: pythseq/sequana

    def _to_read(self, this):
        """Convert a read-like object into an ``AttrDict``.

        :param this: an object with ``sequence`` and ``comment`` attributes
            and either a ``name`` (pysam convention) or an ``identifier``
            (this class convention) attribute. A ``comment`` set to None is
            replaced by an empty string on ``this`` itself.
        :return: an ``AttrDict`` with the keys ``sequence``, ``comment`` and
            ``identifier``
        """
        from easydev import AttrDict
        d = AttrDict()
        d.sequence = this.sequence
        if this.comment is None:
            this.comment = ""

        d.comment = this.comment
        try:
            # pysam format
            d.identifier = this.name
        except AttributeError:
            # this class convention
            d.identifier = this.identifier
        return d

Esempio n. 7

0

Mostra file

    def plot(self, model_parameters=None, **kwargs):
        """Plot the models, optionally from user-provided parameters.

        :param list model_parameters: a list of dictionaries, one per model,
            each with the keys ``mu``, ``sigma`` and ``pi``. When empty or
            None, the results already stored are plotted as they are.
        :param kwargs: forwarded to the parent ``plot`` method
        :return: whatever the parent ``plot`` method returns

        Example:
            model_parameters=[{"mu": 5, "sigma": 0.5, "pi": 1}]
        """
        if model_parameters:
            # Overwrite the stored results with the user parameters
            self.k = len(model_parameters)
            self.results = fitted = AttrDict()

            mus = []
            for model in model_parameters:
                mus.append(model["mu"])
            fitted.mus = mus

            sigmas = []
            for model in model_parameters:
                sigmas.append(model["sigma"])
            fitted.sigmas = sigmas

            pis = []
            for model in model_parameters:
                pis.append(model["pi"])
            fitted.pis = pis

            # flat layout: mu1, sigma1, pi1, mu2, ...
            flat = []
            for model in model_parameters:
                flat.extend((model["mu"], model["sigma"], model["pi"]))
            fitted.x = flat

        return super(EM, self).plot(**kwargs)

Esempio n. 8

0

Mostra file

    def estimate(self, guess=None, k=2):
        """Estimate the mixture parameters with an EM loop.

        The loop runs until ``self.max_iter`` iterations are done, or stops
        early (with ``self.status`` set to False and details stored in
        ``self.debug``) if the responsibilities or the weights no longer sum
        up as expected. The outcome is stored in ``self.results`` (an
        ``AttrDict`` with ``x``, ``nfev``, ``success``, ``mus``, ``sigmas``,
        ``pis``, ``log_likelihood``, ``AIC``, ``AICc`` and ``BIC``); the
        responsibilities are stored in ``self.gamma`` and the successive
        values of the means in ``self.mus``.

        :param list guess: a list to provide the initial guess. Order is mu1,
            sigma1, pi1, mu2, ... If None, ``self.get_guess()`` is used.
        :param int k: number of models to be used.
        """
        self.k = k
        # Initial guess of parameters and initializations
        if guess is None:
            # estimate the mu/sigma/pis from the data
            guess = self.get_guess()

        # Force floats: an all-integer guess would otherwise create integer
        # arrays and silently truncate every update made below.
        mu = np.array(guess[0::3], dtype=float)
        sig = np.array(guess[1::3], dtype=float)
        pi_ = np.array(guess[2::3], dtype=float)
        n_models = len(pi_)

        gamma = np.zeros((n_models, int(self.size)))
        N_ = np.zeros(n_models)
        p_new = guess

        # EM loop
        counter = 0
        converged = False

        self.mus = []

        while not converged:
            # Compute the responsibility func. and new parameters.
            # NOTE: unstable if self.model.pdf is made of zeros (division
            # by zero in the normalisation below).
            for j in range(0, self.k):
                gamma[j, :] = pi_[j] * ss.norm.pdf(self.data, mu[j], sig[j])
                gamma[j, :] /= (self.model.pdf(self.data,
                                               p_new,
                                               normalise=False))
                N_[j] = gamma[j].sum()
                mu[j] = np.sum(gamma[j] * self.data) / N_[j]
                sig[j] = pylab.sqrt(
                    np.sum(gamma[j] * (self.data - mu[j])**2) / N_[j])
                pi_[j] = N_[j] / self.size

            # parameters used for this iteration; kept if the checks fail
            self.results = {'x': p_new, 'nfev': counter, 'success': converged}

            p_new = []
            for this in range(self.k):
                p_new.extend([mu[this], sig[this], pi_[this]])

            # Sanity checks done with plain comparisons rather than assert
            # statements, which are removed when Python runs with -O.
            # A NaN makes the comparison False, hence is reported as well.
            self.status = True
            mass_ok = abs(N_.sum() - self.size) / self.size < 1e-6
            weights_ok = abs(pi_.sum() - 1) < 1e-6
            if not (mass_ok and weights_ok):
                print("issue arised at iteration %s" % counter)
                self.debug = {'N': N_, 'pis': pi_}
                self.status = False
                break

            # mu is updated in place: store a snapshot, not the array itself
            self.mus.append(mu.copy())

            # Convergence check
            counter += 1
            converged = counter >= self.max_iter

        self.gamma = gamma

        if self.status is True:
            self.results = {'x': p_new, 'nfev': counter, 'success': converged}

        self.results = AttrDict(**self.results)
        self.results.mus = self.results.x[0::3]
        self.results.sigmas = self.results.x[1::3]
        self.results.pis = self.results.x[2::3]

        log_likelihood = self.model.log_likelihood(self.results.x, self.data)

        self.results.log_likelihood = log_likelihood
        self.results.AIC = criteria.AIC(log_likelihood, self.k, logL=True)
        self.results.AICc = criteria.AICc(log_likelihood,
                                          self.k,
                                          self.data.size,
                                          logL=True)
        self.results.BIC = criteria.BIC(log_likelihood,
                                        self.k,
                                        self.data.size,
                                        logL=True)

Esempio n. 9

0

Mostra file

    def __init__(self,
                 RCMD='R',
                 max_len=10000,
                 use_dict=None,
                 host='localhost',
                 user=None,
                 ssh='ssh',
                 return_err=True,
                 dump_stdout=False,
                 verbose=False,
                 options=('--quiet', '--no-save', '--no-restore')):
        '''
        RCMD: The name of a R interpreter, path information should be included
            if it is not in the system search path.
        max_len: stored on the instance as "max_len"; how it is used is not
            visible in this method.
        use_dict: A R named list will be returned as a Python dictionary if
            "use_dict" is True, or a list of tuples (name, value) if "use_dict"
            is False. If "use_dict" is None, the return value will be a
            dictionary if there is no replicated names, or a list if replicated
            names found.
        host: The computer name (or IP) on which the R interpreter is
            installed. The value "localhost" means that R locates on the the
            localhost computer. On POSIX systems (including Cygwin environment
            on Windows), it is possible to use R on a remote computer if the
            command "ssh" works. To do that, the user needs to set this value,
            and perhaps the parameter "user".
        user: The user name on the remote computer. This value needs to be set
            only if the user name on the remote computer is different from the
            local user. In interactive environment, the password can be input
            by the user if prompted. If running in a program, the user needs to
            be able to login without typing password!
        ssh: The program to login to remote computer.
        return_err: redirect stderr to stdout
        dump_stdout:
            prints output from R directly to sys.stdout, useful for long running
            routines which print progress during execution.
        verbose: stored on the instance as "verbose"; how it is used is not
            visible in this method.
        options: command line arguments appended to the R command, unless
            they are already part of it.
        '''
        # use self.__dict__.update to register variables since __setattr__ is
        # used to set variables for R.  tried to define __setattr in the class,
        # and change it to __setattr__ for instances at the end of __init__,
        # but it seems failed.
        # -- maybe this only failed in Python2.5? as warned at
        # http://wiki.python.org/moin/NewClassVsClassicClass:
        # "Warning: In 2.5, magic names (typically those with a double
        # underscore (DunderAlias) at both ends of the name) may look at the
        # class rather than the instance even for old-style classes."
        self.__dict__.update({
            'prog': None,
            'Rfun': self.__class__.__Rfun,
            'Rexecutable': RCMD,
            'max_len': max_len,
            'use_dict': use_dict,
            'verbose': verbose,
            'dump_stdout': dump_stdout,
            'localhost': host == 'localhost',
            'newline': '\r\n' if sys.platform == 'win32' else '\n',
            'sendAll':
            sendAll  # keep a reference to the global function "sendAll" which will be used by __del__
        })

        # Build the command line; for a remote host the result is
        # [ssh, -l<user>, host, RCMD, options...]
        RCMD = [RCMD]
        if not self.localhost:
            RCMD.insert(0, host)
            if user:
                RCMD.insert(0, '-l%s' % user)
            RCMD.insert(0, ssh)

        for arg in options:
            if arg not in RCMD:
                RCMD.append(arg)

        # Where STARTUPINFO is available, ask for a hidden window
        if _has_subp and hasattr(subprocess, 'STARTUPINFO'):
            info = subprocess.STARTUPINFO()
            try:
                if hasattr(subprocess, '_subprocess'):
                    info.dwFlags |= subprocess._subprocess.STARTF_USESHOWWINDOW
                    info.wShowWindow = subprocess._subprocess.SW_HIDE
                else:
                    info.dwFlags |= subprocess.STARTF_USESHOWWINDOW
                    info.wShowWindow = subprocess.SW_HIDE
            except AttributeError:
                # the flags are not available: go without startup info
                info = None
        else:
            info = None

        # create stderr to replace None for py2exe:
        # http://www.py2exe.org/index.cgi/Py2ExeSubprocessInteractions
        if sys.platform != 'win32':
            childstderr = None
        else:
            if hasattr(sys.stderr, 'fileno'):
                childstderr = sys.stderr
            elif hasattr(sys.stderr, '_file') and hasattr(
                    sys.stderr._file, 'fileno'):
                childstderr = sys.stderr._file
            else:  # Give up and point child stderr at nul
                # open() rather than file(): the latter is gone in Python 3
                childstderr = open('nul', 'a')

        from easydev import AttrDict
        self.__dict__['subprocess_args'] = AttrDict(
            **{
                'RCMD': RCMD,
                'PIPE': PIPE,
                # and/or kept on purpose: falls back on childstderr when
                # return_err is false or when _STDOUT itself is falsy
                'stderr': return_err and _STDOUT or childstderr,
                'info': info
            })

        self.reconnect()

Esempio n. 10

0

Mostra file

File: __init__.py Progetto: sinamomken/bioconvert

    'newick': ["newick", "nw", "nhx", "nwk"],  # phylo
    'nexus': ["nexus", "nx", "nex", "nxs"],  # phylo
    'paf': ['paf'],  # assembly
    'phylip': ['phy', 'ph', 'phylip'],  # phylo
    'phyloxml': ['phyloxml', 'xml'],  # phylo
    'sam': ["sam"],  # alignement
    'sra': ["sra"],  # sra format
    'stockholm': ['sto', 'sth', 'stockholm'],  # alignment
    'vcf': ['vcf'],  # variant
    'twobit': ['2bit'],  # sequence
    'tsv': ["tsv"],
    'yaml': ['yaml', 'YAML'],  # misc
    'maf': ["maf"]  # !! this is MIRA format, not mutation alignment format
}

extensions = AttrDict(**extensions)

# nexml   *.xml
# phyloxml    *.xml
"""
ace     *.ace   1.47    No  1.52    Reads the contig sequences from an ACE assembly file. Uses Bio.Sequencing.Ace internally   
clustal     *.aln   1.43    1.43    No  The alignment format of Clustal X and
Clustal W.  CLUSTAL format is recognised by the word CLUSTAL at the beginning of
the file.

fastq-solexa    *.fq, *.fastq   1.50    1.50    1.52    FASTQ files are a bit
like FASTA files but also include sequencing qualities. In
Biopython,"fastq-solexa" refers to the original Solexa/Illumina style FASTQ
files which encode Solexa qualities using an ASCII offset of 64. See also what
we call the "fastq-illumina" format.    There is no standard file extension for
a FASTQ file, but .fq and .fastq, are commonly used. There are different FASTQ

Esempio n. 11

0

Mostra file

File: anova.py Progetto: Donnyvdm/gdsctools

    def _get_one_drug_one_feature_data(self,
                                       drug_id,
                                       feature_name,
                                       diagnostic_only=False):
        """Gather the data and statistics for one drug and one feature.

        :param drug_id: key of the drug in the pre-computed dictionaries
            (``ic50_dict``, ``tissue_dict``, ``msi_dict``, ``media_dict``)
        :param feature_name: name of a column of ``self.features.df``
        :param bool diagnostic_only: if True, return right after the
            validity tests, without computing the statistics
        :return: a dictionary with relevant information. There is also
            a test to see if the data can be analysed or not. This is
            stored as a boolean value with key called *status*. When the
            status is False, the statistics (means, standard deviations,
            effect sizes, t-test) are not computed.
        """
        # dictionary structure to hold results (can set values as attributes)
        dd = AttrDict()

        # Select the IC50 of the given drug. The dataframe was transformed
        # once for all into a dictionary (NA dropped as well), which is 2-3
        # times faster than selecting from the dataframe with dropna (and a
        # boolean mask was 30% slower still). This line now takes no time.
        dd.Y = self.ic50_dict[drug_id]['Y']

        # an alias to the indices
        indices = self.ic50_dict[drug_id]['indices']
        dd.indices = indices

        # Select only relevant tissues/msi/features.
        # Indexing the raw values is 5-6 times faster than using
        # self.features.df.loc[indices, feature_name]; the creation of
        # masked_features was taking 99% of the time in this function and
        # now takes about 50%.
        real_indices = self.ic50_dict[drug_id]['real_indices']
        dd.masked_features = np.nan_to_num(
            self.features.df[feature_name].values[real_indices])

        dd.masked_tissue = self.tissue_dict[drug_id]
        if self.features.found_msi:
            dd.masked_msi = self.msi_dict[drug_id]
            dd.positive_msi = dd.masked_msi.values.sum()
            dd.negative_msi = len(dd.masked_msi) - dd.positive_msi

        if self.settings.include_media_factor:
            dd.masked_media = self.media_dict[drug_id]

        # compute length of pos/neg features
        dd.positive_feature = dd.masked_features.sum()
        dd.negative_feature = len(dd.masked_features) - dd.positive_feature

        # Some validity tests to run the analysis or not
        feature_threshold = self.settings.feature_factor_threshold
        msi_threshold = self.settings.MSI_factor_threshold

        # A: MSI factor included; enough pos/neg cases for feature and MSI.
        # NOTE(review): the MSI counts only exist if found_msi is true;
        # presumably include_MSI_factor is never set without it.
        A = (self.settings.include_MSI_factor and
             dd.positive_feature >= feature_threshold and
             dd.negative_feature >= feature_threshold and
             dd.negative_msi >= msi_threshold and
             dd.positive_msi >= msi_threshold)

        # B: MSI factor excluded; enough pos/neg cases for the feature
        B = ((not self.settings.include_MSI_factor) and
             dd.positive_feature >= feature_threshold and
             dd.negative_feature >= feature_threshold)

        # We could of course use the mean() and std() functions from pandas or
        # numpy. We could also use the glass and cohens functions from the
        # stats module but the following code is much faster because it
        # factorises the computations of mean and variance
        dd.positives = dd.Y[dd.masked_features == 1]
        dd.negatives = dd.Y[dd.masked_features == 0]
        dd.Npos = len(dd.positives)
        dd.Nneg = len(dd.negatives)

        # additional information
        dd.feature_name = feature_name
        dd.drug_id = drug_id
        dd.drug_target = self.drug_decode.get_target(drug_id)
        dd.drug_name = self.drug_decode.get_name(drug_id)

        # Keep "== False" here: "is False" does not give the same results
        # in test_anova.py, presumably because A and B may be numpy
        # booleans, which are not the False singleton.
        if (A == False) and (B == False):  # noqa: E712
            dd.status = False
            return dd
        dd.status = True

        if diagnostic_only is True:
            return dd

        # compute mean and std of pos and neg sets; using mean() takes 15us
        # and using the already computed sum and N takes 5us
        pos_sum = dd.positives.sum()
        neg_sum = dd.negatives.sum()
        dd.pos_IC50_mean = pos_sum / dd.Npos
        dd.neg_IC50_mean = neg_sum / dd.Nneg
        dd.delta_mean_IC50 = dd.pos_IC50_mean - dd.neg_IC50_mean

        # Standard deviations with N-1 in the denominator (ddof=1, to agree
        # with the R convention), built from the sums computed above.
        # Nov 2016. Den may be close to zero but slightly negative
        den = (dd.positives**2).sum() - pos_sum**2 / dd.Npos
        dd.pos_IC50_std = np.sqrt(max(0, den) / (dd.Npos - 1.))

        den = (dd.negatives**2).sum() - neg_sum**2 / dd.Nneg
        dd.neg_IC50_std = np.sqrt(max(0, den) / (dd.Nneg - 1.))

        # Compute Cohens and Glass effect size. Since underlying code
        # has lots in common, we do not use the modules but add
        # the code here below
        md = np.abs(dd.delta_mean_IC50)

        dd.pos_glass = md / dd.pos_IC50_std
        dd.neg_glass = md / dd.neg_IC50_std

        # pooled variance of the two sets
        csd = ((dd.Npos - 1.) * dd.pos_IC50_std**2 +
               (dd.Nneg - 1.) * dd.neg_IC50_std**2)
        csd /= dd.Npos + dd.Nneg - 2.  # make sure this is float
        if csd > 0:
            dd.effectsize_ic50 = md / np.sqrt(csd)
        else:
            print("Unexpected negative effect size for %s %s. Set to zero. " %
                  (drug_id, feature_name))
            dd.effectsize_ic50 = 0

        # Note that equal_var is a user parameter and affects
        # results. The ANOVA_results.txt obtained from SFTP
        # have different values meaning that the equal.var param
        # was set to False. Note that pvalue is stored at index 1
        dd.ttest = self._get_ttest(dd.negatives, dd.positives)
        return dd

Esempio n. 12

0

Mostra file

def test_cutadapt_options():
    """Check which combinations of cutadapt options are accepted.

    Valid combinations must go through ``check_options`` without error;
    invalid ones must be rejected, either with an exception or with an
    exit request (``SystemExit``, which is what argparse raises on a
    bad value).
    """

    def make_options(**overrides):
        # default set of options, updated with the values under test
        values = {
            "cutadapt_adapter_choice": None,
            "cutadapt_design_file": None,
            "cutadapt_fwd": None,
            "cutadapt_rev": None,
            "skip_cutadapt": False,
        }
        values.update(overrides)
        return AttrDict(**values)

    def assert_rejected(func, *args):
        # The failure is raised outside of the try block: with a bare
        # "except" around "assert False", the check could never fail.
        try:
            func(*args)
        except (Exception, SystemExit):
            return
        raise AssertionError(
            "%s%r should have been rejected"
            % (getattr(func, "__name__", func), args))

    p = argparse.ArgumentParser()
    so = CutadaptOptions()
    so.add_options(p)

    # test the adapter choice
    for this in ["universal", "PCRFree", "none"]:
        so.check_options(make_options(cutadapt_adapter_choice=this))

    # test for a valid design and adapter choice
    so.check_options(make_options(
        cutadapt_adapter_choice="TruSeq",
        cutadapt_design_file=sequana_data("test_expdesign_Hm2.csv")))

    # test for a valid design but wrong adapter choice
    assert_rejected(so.check_options, make_options(
        cutadapt_adapter_choice="Nextera",
        cutadapt_design_file=sequana_data("test_expdesign_Hm2.csv")))

    # wrong combo (missing adapter choice)
    assert_rejected(so.check_options, make_options(
        cutadapt_design_file=sequana_data("test_expdesign_Hm2.csv")))

    # wrong quality (negative value)
    assert_rejected(p.parse_args, ["--cutadapt-quality", "-1"])
    p.parse_args(["--cutadapt-quality", "10"])

    # test for a valid design and adapter choice but also fwd/rev provided
    # whereas, we cannot do anything with this combo
    assert_rejected(so.check_options, make_options(
        cutadapt_adapter_choice="TruSeq",
        cutadapt_design_file=sequana_data("test_expdesign_Hm2.csv"),
        cutadapt_fwd="ACGT",  # dummy values
        cutadapt_rev="CGTA"))  # dummy values

    # forward and reverse adapter files only: valid
    so.check_options(make_options(
        cutadapt_fwd=sequana_data("TruSeqCD_DNA_fwd.fa"),
        cutadapt_rev=sequana_data("TruSeqCD_DNA_rev.fa")))

Esempio n. 13

0

Mostra file

File: anova.py Progetto: CancerRxGene/gdsctools

    def _get_one_drug_one_feature_data(self, drug_id, feature_name, diagnostic_only=False):
        """Gather the data and statistics for one drug and one feature.

        :param drug_id: key of the drug in the pre-computed dictionaries
            (``ic50_dict``, ``tissue_dict``, ``msi_dict``, ``media_dict``)
        :param feature_name: name of a column of ``self.features.df``
        :param bool diagnostic_only: if True, return right after the
            validity tests, without computing the statistics
        :return: a dictionary with relevant information. There is also
            a test to see if the data can be analysed or not. This is
            stored as a boolean value with key called *status*. When the
            status is False, the statistics (means, standard deviations,
            effect sizes, t-test) are not computed.
        """
        # dictionary structure to hold results (can set values as attributes)
        dd = AttrDict()

        # Select the IC50 of the given drug. The dataframe was transformed
        # once for all into a dictionary (NA dropped as well), which is 2-3
        # times faster than selecting from the dataframe with dropna (and a
        # boolean mask was 30% slower still). This line now takes no time.
        dd.Y = self.ic50_dict[drug_id]["Y"]

        # an alias to the indices
        indices = self.ic50_dict[drug_id]["indices"]
        dd.indices = indices

        # Select only relevant tissues/msi/features.
        # Indexing the raw values is 5-6 times faster than using
        # self.features.df.loc[indices, feature_name]; the creation of
        # masked_features was taking 99% of the time in this function and
        # now takes about 50%.
        real_indices = self.ic50_dict[drug_id]["real_indices"]
        dd.masked_features = self.features.df[feature_name].values[real_indices]

        dd.masked_tissue = self.tissue_dict[drug_id]
        if self.features.found_msi:
            dd.masked_msi = self.msi_dict[drug_id]
            dd.positive_msi = dd.masked_msi.values.sum()
            dd.negative_msi = len(dd.masked_msi) - dd.positive_msi

        if self.settings.include_media_factor:
            dd.masked_media = self.media_dict[drug_id]

        # compute length of pos/neg features
        dd.positive_feature = dd.masked_features.sum()
        dd.negative_feature = len(dd.masked_features) - dd.positive_feature

        # Some validity tests to run the analysis or not
        feature_threshold = self.settings.feature_factor_threshold
        msi_threshold = self.settings.MSI_factor_threshold

        # A: MSI factor included; enough pos/neg cases for feature and MSI.
        # NOTE(review): the MSI counts only exist if found_msi is true;
        # presumably include_MSI_factor is never set without it.
        A = (
            self.settings.include_MSI_factor
            and dd.positive_feature >= feature_threshold
            and dd.negative_feature >= feature_threshold
            and dd.negative_msi >= msi_threshold
            and dd.positive_msi >= msi_threshold
        )

        # B: MSI factor excluded; enough pos/neg cases for the feature
        B = (
            (not self.settings.include_MSI_factor)
            and dd.positive_feature >= feature_threshold
            and dd.negative_feature >= feature_threshold
        )

        # We could of course use the mean() and std() functions from pandas or
        # numpy. We could also use the glass and cohens functions from the
        # stats module but the following code is much faster because it
        # factorises the computations of mean and variance
        dd.positives = dd.Y[dd.masked_features == 1]
        dd.negatives = dd.Y[dd.masked_features == 0]
        dd.Npos = len(dd.positives)
        dd.Nneg = len(dd.negatives)

        # additional information
        dd.feature_name = feature_name
        dd.drug_id = drug_id
        dd.drug_target = self.drug_decode.get_target(drug_id)
        dd.drug_name = self.drug_decode.get_name(drug_id)

        # Keep "== False" here: "is False" does not give the same results
        # in test_anova.py, presumably because A and B may be numpy
        # booleans, which are not the False singleton.
        if (A == False) and (B == False):  # noqa: E712
            dd.status = False
            return dd
        dd.status = True

        if diagnostic_only is True:
            return dd

        # compute mean and std of pos and neg sets; using mean() takes 15us
        # and using the already computed sum and N takes 5us
        pos_sum = dd.positives.sum()
        neg_sum = dd.negatives.sum()
        dd.pos_IC50_mean = pos_sum / dd.Npos
        dd.neg_IC50_mean = neg_sum / dd.Nneg
        dd.delta_mean_IC50 = dd.pos_IC50_mean - dd.neg_IC50_mean

        # Standard deviations with N-1 in the denominator (ddof=1, to agree
        # with the R convention), built from the sums computed above.
        # Nov 2016. Den may be close to zero but slightly negative
        den = (dd.positives ** 2).sum() - pos_sum ** 2 / dd.Npos
        dd.pos_IC50_std = np.sqrt(max(0, den) / (dd.Npos - 1.0))

        den = (dd.negatives ** 2).sum() - neg_sum ** 2 / dd.Nneg
        dd.neg_IC50_std = np.sqrt(max(0, den) / (dd.Nneg - 1.0))

        # Compute Cohens and Glass effect size. Since underlying code
        # has lots in common, we do not use the modules but add
        # the code here below
        md = np.abs(dd.delta_mean_IC50)

        dd.pos_glass = md / dd.pos_IC50_std
        dd.neg_glass = md / dd.neg_IC50_std

        # pooled variance of the two sets
        csd = (dd.Npos - 1.0) * dd.pos_IC50_std ** 2 + (dd.Nneg - 1.0) * dd.neg_IC50_std ** 2
        csd /= dd.Npos + dd.Nneg - 2.0  # make sure this is float
        if csd > 0:
            dd.effectsize_ic50 = md / np.sqrt(csd)
        else:
            print("Unexpected negative effect size for %s %s. Set to zero. " % (drug_id, feature_name))
            dd.effectsize_ic50 = 0

        # Note that equal_var is a user parameter and affects
        # results. The ANOVA_results.txt obtained from SFTP
        # have different values meaning that the equal.var param
        # was set to False. Note that pvalue is stored at index 1
        dd.ttest = self._get_ttest(dd.negatives, dd.positives)
        return dd