def __init__(self, data, sep="\t", settings=None):
    """.. rubric:: Constructor

    :param data: an :class:`~gdsctools.anova.ANOVAResults` instance or
        a dataframe with the proper columns names (see below)
    :param sep: accepted for backward compatibility; it is not used by
        this constructor
    :param settings: an instance of
        :class:`~gdsctools.settings.ANOVASettings`. If None, a default
        ANOVASettings instance is created.

    Expected column names to be found if a filename is provided::

        ANOVA_FEATURE_pval
        ANOVA_FEATURE_FDR
        FEATURE_delta_MEAN_IC50
        FEATURE_IC50_effect_size
        N_FEATURE_pos
        N_FEATURE_pos
        FEATURE
        DRUG_ID

    If the plotting is too slow, you can use the :meth:`selector` to
    prune the results (most of the data are noise and overlap on the
    middle bottom area of the plot with little information).
    """
    # Work on a copy since the data may be changed afterwards.
    try:
        # an ANOVAResults contains a df attribute
        self.df = data.df.copy()
    except AttributeError:
        # no usable df attribute: assume the input is itself a dataframe
        self.df = data.copy()

    # this is redundant could reuse the input ??
    if settings is None:
        from gdsctools.settings import ANOVASettings
        self.settings = ANOVASettings()
    else:
        self.settings = AttrDict(**settings)

    self.figtools = Savefig()
    self.figtools.directory = self.settings.directory

    self.drugs = set(self.df[self._colname_drugid])
    self.features = set(self.df[self._colname_feature])

    # intensive calls made once for all
    self.groups_by_drugs = self.df.groupby(self._colname_drugid).groups
    self.groups_by_features = self.df.groupby(self._colname_feature).groups
def import_tables(self):
    """Build one RNADiffTable per file in ``self.files``.

    Each table is stored under a key derived from the file stem, with the
    ``_degs_DESeq2`` suffix removed and dashes replaced by underscores.

    :return: an AttrDict mapping each key to its RNADiffTable
    """
    from easydev import AttrDict

    tables = {}
    for compa in self.files:
        key = compa.stem.replace("_degs_DESeq2", "")
        key = key.replace("-", "_")
        tables[key] = RNADiffTable(
            compa,
            alpha=self._alpha,
            log2_fc=self._log2_fc,
            condition=self.condition,
            # gff=self.annotation.annotation,
        )
    return AttrDict(**tables)
def create_random_data(self, N, min_length=10, max_length=40):
    """Replace the current entries with N randomly generated records.

    The length of each record is drawn from a normal distribution centred
    on the middle of [min_length, max_length] with a standard deviation of
    one tenth of that range, then truncated to an integer.

    :param N: number of entries to create
    :param min_length: lower value used to set the length distribution
    :param max_length: upper value used to set the length distribution
    """
    import numpy as np

    alphabet = 'ACGT'
    self.clear()
    for index in range(N):
        center = (max_length + min_length) / 2.
        spread = (max_length - min_length) / 10.
        length = int(np.random.normal(center, spread))

        # the sequence is drawn first, then the quality string
        bases = ''.join(
            alphabet[np.random.randint(4)] for _ in range(length))
        qualities = ''.join(
            self._quality_character[np.random.randint(90)]
            for _ in range(length))

        self.entries.append(
            AttrDict(identifier=str(index), sequence=bases,
                     quality=qualities))
def accession_to_info(self, ids):
    """Return summary information for one or several accession numbers.

    :param ids: a comma-separated string of accession numbers, or a
        list/tuple of accession strings
    :return: a dictionary keyed by the (stripped) accessions as provided
        by the user. Each value is an AttrDict with the keys ``taxid``,
        ``accession``, ``identifier``, ``gi`` and ``comment``.
    :raises IndexError: if an expected item (Extra, Title, TaxId, Gi) is
        missing from a record, or if more records than requested
        accessions are returned
    """
    # The service call and the split below both need a single string.
    if not isinstance(ids, str):
        ids = ",".join(ids)

    res = self.eutils.EFetch(db="nuccore", id=ids, rettype="docsum",
                             retmode="dict")
    res = res['eSummaryResult']['DocSum']

    # if one id provided, it will be a dict, otherwise a list of dicts
    if isinstance(res, dict):
        res = [res]

    def first_text(items, name):
        # text of the first item whose @Name matches
        return [x for x in items if x['@Name'] == name][0]['#text']

    # now we can loop over all identifiers
    records = {}
    accessions = [x.strip() for x in ids.split(',')]
    for i, entry in enumerate(res):
        items = entry['Item']

        identifier = first_text(items, "Extra")
        if "||" in identifier:
            # strip content after ||
            identifier = identifier.split("||")[0]

        record = {
            "taxid": first_text(items, "TaxId"),
            'accession': entry['Id'],
            "identifier": identifier,
            'gi': first_text(items, "Gi"),
            'comment': first_text(items, "Title"),
        }
        records[accessions[i]] = AttrDict(**record)
    return records
def _parse_data(self, data):
    """Parse FASTQ-formatted text line by line.

    The parser cycles through three modes stored in ``_parsing_mode``:
    ``identifier`` (one line starting with ``@``), ``sequence`` (one or
    more lines, up to a line starting with ``+``) and ``quality`` (as many
    lines as the sequence had). Each completed record is converted with
    :meth:`to_dict` and appended to ``self.entries`` when that attribute
    exists.

    :param data: the FASTQ content as a single string
    :raises ValueError: if an identifier line does not start with ``@``
    """
    self._init()
    for i, line in enumerate(data.split("\n")):
        # skip blankline
        if not line:
            continue

        # have we finished to parse the sequence ?
        # if + is found in the new line, then the answer is yes
        if line.startswith("+") and self._parsing_mode == 'sequence':
            self._parsing_mode = 'quality'
            continue

        # assume identifier is on 1 line only
        if self._parsing_mode == 'identifier':
            if not line.startswith("@"):
                raise ValueError(
                    "Expected @ at the beginning of the line %s" % i)
            self.identifier = FASTQIdentifier(line).identifier
            self._parsing_mode = 'sequence'
        elif self._parsing_mode == 'sequence':
            self.sequence += line
            self.nolines += 1
        elif self._parsing_mode == 'quality':
            self.quality += line
            # here the line may start with @, which is confusing with the
            # identifier; however, from the sequence we know how many
            # lines are expected
            self.nolines -= 1
            if self.nolines == 0:
                # clean sequence and quality strings
                self.sequence = self.sequence.replace("\n", "")
                self.quality = self.quality.replace("\n", "")
                entry = AttrDict(**self.to_dict())
                try:
                    self.entries.append(entry)
                except AttributeError:
                    # not very good design: no ``entries`` attribute means
                    # this is the SingleFASTQ class (otherwise, the FASTQ
                    # class), so stop after the first record and keep the
                    # parsed fields on the instance
                    break
                # presumably resets the parser state for the next record
                self._init()
def _to_read(self, this):
    """Convert a read-like object into an AttrDict.

    :param this: an object with ``sequence`` and ``comment`` attributes
        and either a ``name`` (pysam format) or an ``identifier``
        attribute (this class convention)
    :return: an AttrDict with ``sequence``, ``comment`` and
        ``identifier`` entries

    .. note:: when ``this.comment`` is None it is replaced by an empty
        string on the input object itself.
    """
    from easydev import AttrDict

    d = AttrDict()
    d.sequence = this.sequence
    if this.comment is None:
        this.comment = ""
    d.comment = this.comment
    try:
        # pysam format
        d.identifier = this.name
    except AttributeError:
        # this class convention
        d.identifier = this.identifier
    return d
def plot(self, model_parameters=None, **kwargs):
    """Plot the models, optionally from user-provided parameters.

    :param model_parameters: a list of dictionaries, one per model, each
        with the keys ``mu``, ``sigma`` and ``pi``. When it is empty or
        None, the parent plot method is called directly with the current
        state.
    :param kwargs: forwarded to the parent plot method
    :return: whatever the parent plot method returns

    Example: model_parameters=[{"mu": 5, "sigma": 0.5, "pi": 1}]
    """
    if model_parameters:
        def column(key):
            # values of one parameter across all models
            return [params[key] for params in model_parameters]

        # Overwrite the fitted state with the user-provided parameters.
        self.k = len(model_parameters)
        self.results = AttrDict()
        self.results.mus = column("mu")
        self.results.sigmas = column("sigma")
        self.results.pis = column("pi")

        # flat list ordered as mu1, sigma1, pi1, mu2, ...
        flattened = []
        for params in model_parameters:
            for key in ("mu", "sigma", "pi"):
                flattened.append(params[key])
        self.results.x = flattened

    return super(EM, self).plot(**kwargs)
def estimate(self, guess=None, k=2):
    """Estimate the parameters of a Gaussian mixture with an EM loop.

    :param list guess: a list to provide the initial guess. Order is
        mu1, sigma1, pi1, mu2, ... If None, :meth:`get_guess` is used.
    :param int k: number of models to be used.

    The loop runs until ``self.max_iter`` iterations are done or until a
    sanity check fails (the responsibilities must sum to the data size and
    the weights to 1). The outcome is stored in ``self.results`` (x, nfev,
    success, mus, sigmas, pis, log_likelihood, AIC, AICc, BIC);
    ``self.status`` tells whether the sanity checks passed and
    ``self.mus`` keeps the means obtained at each iteration.
    """
    self.k = k

    # Initial guess of parameters and initializations
    if guess is None:
        # estimate the mu/sigma/pis from the data
        guess = self.get_guess()

    # Force a float dtype: the arrays are updated in place, so an integer
    # guess would otherwise truncate every new estimate.
    mu = np.array(guess[0::3], dtype=float)
    sig = np.array(guess[1::3], dtype=float)
    pi_ = np.array(guess[2::3], dtype=float)
    n_models = len(pi_)

    gamma = np.zeros((n_models, int(self.size)))
    N_ = np.zeros(n_models)
    p_new = guess

    # EM loop
    counter = 0
    converged = False
    self.mus = []
    while not converged:
        # Compute the responsibility func. and new parameters.
        # NOTE: unstable if self.model.pdf is made of zeros.
        for j in range(self.k):
            gamma[j, :] = pi_[j] * ss.norm.pdf(self.data, mu[j], sig[j])
            gamma[j, :] /= self.model.pdf(self.data, p_new,
                                          normalise=False)
            N_[j] = gamma[j].sum()
            mu[j] = np.sum(gamma[j] * self.data) / N_[j]
            sig[j] = np.sqrt(
                np.sum(gamma[j] * (self.data - mu[j])**2) / N_[j])
            pi_[j] = N_[j] / self.size

        # parameters of the previous iteration, reported if a check fails
        self.results = {'x': p_new, 'nfev': counter, 'success': converged}

        p_new = []
        for this in range(self.k):
            p_new.extend([mu[this], sig[this], pi_[this]])

        # Explicit checks (instead of assert) so that they are still
        # performed when Python runs with -O.
        self.status = True
        sizes_ok = abs(N_.sum() - self.size) / self.size < 1e-6
        weights_ok = abs(pi_.sum() - 1) < 1e-6
        if not (sizes_ok and weights_ok):
            print("issue arised at iteration %s" % counter)
            self.debug = {'N': N_, 'pis': pi_}
            self.status = False
            break

        # mu is modified in place: store a snapshot, not a reference
        self.mus.append(mu.copy())

        # Convergence check
        counter += 1
        converged = counter >= self.max_iter

    self.gamma = gamma

    if self.status is True:
        self.results = {'x': p_new, 'nfev': counter, 'success': converged}

    self.results = AttrDict(**self.results)
    self.results.mus = self.results.x[0::3]
    self.results.sigmas = self.results.x[1::3]
    self.results.pis = self.results.x[2::3]

    log_likelihood = self.model.log_likelihood(self.results.x, self.data)
    self.results.log_likelihood = log_likelihood
    self.results.AIC = criteria.AIC(log_likelihood, self.k, logL=True)
    self.results.AICc = criteria.AICc(log_likelihood, self.k,
                                      self.data.size, logL=True)
    self.results.BIC = criteria.BIC(log_likelihood, self.k,
                                    self.data.size, logL=True)
def __init__(self, RCMD='R', max_len=10000, use_dict=None, host='localhost',
             user=None, ssh='ssh', return_err=True, dump_stdout=False,
             verbose=False,
             options=('--quiet', '--no-save', '--no-restore')):
    '''
    RCMD: The name of a R interpreter, path information should be included
        if it is not in the system search path.
    max_len: stored on the instance as "max_len".
    use_dict: A R named list will be returned as a Python dictionary if
        "use_dict" is True, or a list of tuples (name, value) if "use_dict"
        is False. If "use_dict" is None, the return value will be a
        dictionary if there is no replicated names, or a list if replicated
        names found.
    host: The computer name (or IP) on which the R interpreter is
        installed. The value "localhost" means that R locates on the
        localhost computer. On POSIX systems (including Cygwin environment
        on Windows), it is possible to use R on a remote computer if the
        command "ssh" works. To do that, the user needs to set this value,
        and perhaps the parameter "user".
    user: The user name on the remote computer. This value needs to be set
        only if the user name on the remote computer is different from the
        local user. In interactive environment, the password can be input
        by the user if prompted. If running in a program, the user needs to
        be able to login without typing password!
    ssh: The program to login to remote computer.
    return_err: redirect stderr to stdout
    dump_stdout: prints output from R directly to sys.stdout, useful for
        long running routines which print progress during execution.
    verbose: stored on the instance as "verbose".
    options: command line arguments appended to the R command unless they
        are already part of it.
    '''
    # use self.__dict__.update to register variables since __setattr__ is
    # used to set variables for R.  tried to define __setattr in the class,
    # and change it to __setattr__ for instances at the end of __init__,
    # but it seems failed.
    # -- maybe this only failed in Python2.5? as warned at
    # http://wiki.python.org/moin/NewClassVsClassicClass:
    # "Warning: In 2.5, magic names (typically those with a double
    # underscore (DunderAlias) at both ends of the name) may look at the
    # class rather than the instance even for old-style classes."
    self.__dict__.update({
        'prog': None,
        'Rfun': self.__class__.__Rfun,
        'Rexecutable': RCMD,
        'max_len': max_len,
        'use_dict': use_dict,
        'verbose': verbose,
        'dump_stdout': dump_stdout,
        'localhost': host == 'localhost',
        'newline': '\r\n' if sys.platform == 'win32' else '\n',
        # keep a reference to the global function "sendAll" which will be
        # used by __del__
        'sendAll': sendAll
    })

    # Build the command line: [ssh, -luser, host,] R, options...
    RCMD = [RCMD]
    if not self.localhost:
        RCMD.insert(0, host)
        if user:
            RCMD.insert(0, '-l%s' % user)
        RCMD.insert(0, ssh)
    for arg in options:
        if arg not in RCMD:
            RCMD.append(arg)

    # STARTUPINFO only exists on some platforms; when available it is used
    # to request a hidden window for the child process.
    if _has_subp and hasattr(subprocess, 'STARTUPINFO'):
        info = subprocess.STARTUPINFO()
        try:
            if hasattr(subprocess, '_subprocess'):
                info.dwFlags |= subprocess._subprocess.STARTF_USESHOWWINDOW
                info.wShowWindow = subprocess._subprocess.SW_HIDE
            else:
                info.dwFlags |= subprocess.STARTF_USESHOWWINDOW
                info.wShowWindow = subprocess.SW_HIDE
        except Exception:
            # best effort only: fall back to the default startup settings
            info = None
    else:
        info = None

    # create stderr to replace None for py2exe:
    # http://www.py2exe.org/index.cgi/Py2ExeSubprocessInteractions
    if sys.platform != 'win32':
        childstderr = None
    else:
        if hasattr(sys.stderr, 'fileno'):
            childstderr = sys.stderr
        elif hasattr(sys.stderr, '_file') and hasattr(
                sys.stderr._file, 'fileno'):
            childstderr = sys.stderr._file
        else:
            # Give up and point child stderr at nul. open() is used since
            # the file() builtin does not exist in Python 3.
            childstderr = open('nul', 'a')

    from easydev import AttrDict
    self.__dict__['subprocess_args'] = AttrDict(
        **{
            'RCMD': RCMD,
            'PIPE': PIPE,
            # and/or form kept on purpose: it falls back to childstderr
            # if _STDOUT happens to be falsy
            'stderr': return_err and _STDOUT or childstderr,
            'info': info
        })
    self.reconnect()
'newick': ["newick", "nw", "nhx", "nwk"], # phylo 'nexus': ["nexus", "nx", "nex", "nxs"], # phylo 'paf': ['paf'], # assembly 'phylip': ['phy', 'ph', 'phylip'], # phylo 'phyloxml': ['phyloxml', 'xml'], # phylo 'sam': ["sam"], # alignement 'sra': ["sra"], # sra format 'stockholm': ['sto', 'sth', 'stockholm'], # alignment 'vcf': ['vcf'], # variant 'twobit': ['2bit'], # sequence 'tsv': ["tsv"], 'yaml': ['yaml', 'YAML'], # misc 'maf': ["maf"] # !! this is MIRA format, not mutation alignment format } extensions = AttrDict(**extensions) # nexml *.xml # phyloxml *.xml """ ace *.ace 1.47 No 1.52 Reads the contig sequences from an ACE assembly file. Uses Bio.Sequencing.Ace internally clustal *.aln 1.43 1.43 No The alignment format of Clustal X and Clustal W. CLUSTAL format is recognised by the word CLUSTAL at the beginning of the file. fastq-solexa *.fq, *.fastq 1.50 1.50 1.52 FASTQ files are a bit like FASTA files but also include sequencing qualities. In Biopython,"fastq-solexa" refers to the original Solexa/Illumina style FASTQ files which encode Solexa qualities using an ASCII offset of 64. See also what we call the "fastq-illumina" format. There is no standard file extension for a FASTQ file, but .fq and .fastq, are commonly used. There are different FASTQ
def _get_one_drug_one_feature_data(self, drug_id, feature_name,
                                   diagnostic_only=False):
    """Gather the data and statistics for one drug and one feature.

    :param drug_id: key of the drug in the pre-computed dictionaries
    :param feature_name: name of a column of the features dataframe
    :param diagnostic_only: if True, return as soon as the *status* is
        known to be True, without computing means, standard deviations,
        effect sizes and the t-test.
    :return: a dictionary (AttrDict) with relevant information. There is
        also a test to see if the data can be analysed or not. This is
        stored as a boolean value with key called *status*.
    """
    # dictionary structure to hold results (can set values as attributes)
    dd = AttrDict()

    # The IC50 values (NA dropped) are read from a dictionary built once
    # for all: this is 2-3 times faster than selecting from the dataframe
    # (with dropna or with a mask).
    ic50 = self.ic50_dict[drug_id]
    dd.Y = ic50['Y']
    dd.indices = ic50['indices']

    # select only relevant tissues/msi/features
    # Indexing the raw values is 5-6 times faster than using
    # self.features.df.loc[indices, feature_name].values
    real_indices = ic50['real_indices']
    dd.masked_features = np.nan_to_num(
        self.features.df[feature_name].values[real_indices])
    dd.masked_tissue = self.tissue_dict[drug_id]
    if self.features.found_msi:
        dd.masked_msi = self.msi_dict[drug_id]
        dd.positive_msi = dd.masked_msi.values.sum()
        dd.negative_msi = len(dd.masked_msi) - dd.positive_msi

    if self.settings.include_media_factor:
        dd.masked_media = self.media_dict[drug_id]

    # compute length of pos/neg features and MSI
    dd.positive_feature = dd.masked_features.sum()
    dd.negative_feature = len(dd.masked_features) - dd.positive_feature

    # Some validity tests to run the analysis or not. The comparisons may
    # return numpy booleans, which are not the False singleton: rely on
    # truthiness, never on "is False".
    feature_threshold = self.settings.feature_factor_threshold
    msi_threshold = self.settings.MSI_factor_threshold
    valid = dd.positive_feature >= feature_threshold and \
        dd.negative_feature >= feature_threshold
    if self.settings.include_MSI_factor:
        valid = valid and \
            dd.negative_msi >= msi_threshold and \
            dd.positive_msi >= msi_threshold

    # We could of course use the mean() and std() functions from pandas or
    # numpy. We could also use the glass and cohens functions from the
    # stats module but the following code is much faster because it
    # factorises the computations of mean and variance
    dd.positives = dd.Y[dd.masked_features == 1]
    dd.negatives = dd.Y[dd.masked_features == 0]
    dd.Npos = len(dd.positives)
    dd.Nneg = len(dd.negatives)

    # additional information
    dd.feature_name = feature_name
    dd.drug_id = drug_id
    dd.drug_target = self.drug_decode.get_target(drug_id)
    dd.drug_name = self.drug_decode.get_name(drug_id)

    if not valid:
        dd.status = False
        return dd
    dd.status = True

    if diagnostic_only is True:
        return dd

    # compute mean and std of pos and neg sets; using mean() takes 15us
    # and using the already computed sum and N takes 5us
    pos_sum = dd.positives.sum()
    neg_sum = dd.negatives.sum()
    dd.pos_IC50_mean = pos_sum / dd.Npos
    dd.neg_IC50_mean = neg_sum / dd.Nneg
    dd.delta_mean_IC50 = dd.pos_IC50_mean - dd.neg_IC50_mean

    # Sample standard deviation (N - 1 denominator, to agree with the R
    # convention) computed from the sums.
    # Nov 2016. Den may be close to zero but slightly negative
    den = (dd.positives**2).sum() - pos_sum**2 / dd.Npos
    dd.pos_IC50_std = np.sqrt(max(0, den) / (dd.Npos - 1.))
    den = (dd.negatives**2).sum() - neg_sum**2 / dd.Nneg
    dd.neg_IC50_std = np.sqrt(max(0, den) / (dd.Nneg - 1.))

    # Compute Cohens and Glass effect size. Since underlying code
    # has lots in common, we do not use the modules but add
    # the code here below
    md = np.abs(dd.pos_IC50_mean - dd.neg_IC50_mean)
    dd.pos_glass = md / dd.pos_IC50_std
    dd.neg_glass = md / dd.neg_IC50_std

    csd = (dd.Npos - 1.) * dd.pos_IC50_std**2 + \
        (dd.Nneg - 1.) * dd.neg_IC50_std**2
    csd /= dd.Npos + dd.Nneg - 2.  # make sure this is float
    if csd > 0:
        dd.effectsize_ic50 = md / np.sqrt(csd)
    else:
        print("Unexpected negative effect size for %s %s. Set to zero. "
              % (drug_id, feature_name))
        dd.effectsize_ic50 = 0

    # Note that equal_var is a user parameter and affects
    # results. The ANOVA_results.txt obtained from SFTP
    # have different values meaning that the equal.var param
    # was set to False. Note that pvalue is stored at index 1
    dd.ttest = self._get_ttest(dd.negatives, dd.positives)
    return dd
def test_cutadapt_options():
    """Check valid and invalid combinations of the cutadapt options."""
    p = argparse.ArgumentParser()
    so = CutadaptOptions()
    so.add_options(p)

    design = sequana_data("test_expdesign_Hm2.csv")

    def make_options(choice=None, design_file=None, fwd=None, rev=None):
        # namespace-like object, as check_options expects
        return AttrDict(**{
            "cutadapt_adapter_choice": choice,
            "cutadapt_design_file": design_file,
            "cutadapt_fwd": fwd,
            "cutadapt_rev": rev,
            "skip_cutadapt": False,
        })

    def assert_rejected(func, *args):
        # The failure is raised outside the try block so that it cannot be
        # swallowed by the except clause. SystemExit is listed explicitly
        # since it does not derive from Exception.
        try:
            func(*args)
        except (Exception, SystemExit):
            return
        raise AssertionError("call was expected to fail but succeeded")

    # test the adapter choice
    for this in ["universal", "PCRFree", "none"]:
        so.check_options(make_options(choice=this))

    # test for a valid design and adapter choice
    so.check_options(make_options(choice="TruSeq", design_file=design))

    # test for a valid design but wrong adapter choice
    assert_rejected(so.check_options,
                    make_options(choice="Nextera", design_file=design))

    # wrong combo (missing adapter choice)
    assert_rejected(so.check_options, make_options(design_file=design))

    # wrong quality value
    assert_rejected(p.parse_args, ["--cutadapt-quality", "-1"])
    p.parse_args(["--cutadapt-quality", "10"])

    # test for a valid design and adapter choice but also fwd/rev provided
    # whereas, we cannot do anything with this combo
    assert_rejected(
        so.check_options,
        make_options(choice="TruSeq", design_file=design,
                     fwd="ACGT", rev="CGTA"))  # dummy values

    # fwd/rev provided as files, without design or adapter choice
    so.check_options(
        make_options(fwd=sequana_data("TruSeqCD_DNA_fwd.fa"),
                     rev=sequana_data("TruSeqCD_DNA_rev.fa")))
def _get_one_drug_one_feature_data(self, drug_id, feature_name, diagnostic_only=False):
    """Gather the data and statistics for one drug and one feature.

    :param drug_id: key of the drug in the pre-computed dictionaries
    :param feature_name: name of a column of the features dataframe
    :param diagnostic_only: if True, return as soon as the *status* is
        known to be True, without computing means, standard deviations,
        effect sizes and the t-test.
    :return: a dictionary (AttrDict) with relevant information. There is
        also a test to see if the data can be analysed or not. This is
        stored as a boolean value with key called *status*.
    """
    # dictionary structure to hold results (can set values as attributes)
    dd = AttrDict()

    # The IC50 values (NA dropped) are read from a dictionary built once
    # for all: this is 2-3 times faster than selecting from the dataframe
    # (with dropna or with a mask).
    ic50 = self.ic50_dict[drug_id]
    dd.Y = ic50["Y"]
    dd.indices = ic50["indices"]

    # select only relevant tissues/msi/features
    # Indexing the raw values is 5-6 times faster than using
    # self.features.df.loc[indices, feature_name].values
    real_indices = ic50["real_indices"]
    dd.masked_features = self.features.df[feature_name].values[real_indices]
    dd.masked_tissue = self.tissue_dict[drug_id]
    if self.features.found_msi:
        dd.masked_msi = self.msi_dict[drug_id]
        dd.positive_msi = dd.masked_msi.values.sum()
        dd.negative_msi = len(dd.masked_msi) - dd.positive_msi

    if self.settings.include_media_factor:
        dd.masked_media = self.media_dict[drug_id]

    # compute length of pos/neg features and MSI
    dd.positive_feature = dd.masked_features.sum()
    dd.negative_feature = len(dd.masked_features) - dd.positive_feature

    # Some validity tests to run the analysis or not. The comparisons may
    # return numpy booleans, which are not the False singleton: rely on
    # truthiness, never on "is False".
    feature_threshold = self.settings.feature_factor_threshold
    msi_threshold = self.settings.MSI_factor_threshold
    valid = (
        dd.positive_feature >= feature_threshold
        and dd.negative_feature >= feature_threshold
    )
    if self.settings.include_MSI_factor:
        valid = (
            valid
            and dd.negative_msi >= msi_threshold
            and dd.positive_msi >= msi_threshold
        )

    # We could of course use the mean() and std() functions from pandas or
    # numpy. We could also use the glass and cohens functions from the
    # stats module but the following code is much faster because it
    # factorises the computations of mean and variance
    dd.positives = dd.Y[dd.masked_features == 1]
    dd.negatives = dd.Y[dd.masked_features == 0]
    dd.Npos = len(dd.positives)
    dd.Nneg = len(dd.negatives)

    # additional information
    dd.feature_name = feature_name
    dd.drug_id = drug_id
    dd.drug_target = self.drug_decode.get_target(drug_id)
    dd.drug_name = self.drug_decode.get_name(drug_id)

    if not valid:
        dd.status = False
        return dd
    dd.status = True

    if diagnostic_only is True:
        return dd

    # compute mean and std of pos and neg sets; using mean() takes 15us
    # and using the already computed sum and N takes 5us
    pos_sum = dd.positives.sum()
    neg_sum = dd.negatives.sum()
    dd.pos_IC50_mean = pos_sum / dd.Npos
    dd.neg_IC50_mean = neg_sum / dd.Nneg
    dd.delta_mean_IC50 = dd.pos_IC50_mean - dd.neg_IC50_mean

    # Sample standard deviation (N - 1 denominator, to agree with the R
    # convention) computed from the sums.
    # Nov 2016. Den may be close to zero but slightly negative
    den = (dd.positives ** 2).sum() - pos_sum ** 2 / dd.Npos
    dd.pos_IC50_std = np.sqrt(max(0, den) / (dd.Npos - 1.0))
    den = (dd.negatives ** 2).sum() - neg_sum ** 2 / dd.Nneg
    dd.neg_IC50_std = np.sqrt(max(0, den) / (dd.Nneg - 1.0))

    # Compute Cohens and Glass effect size. Since underlying code
    # has lots in common, we do not use the modules but add
    # the code here below
    md = np.abs(dd.pos_IC50_mean - dd.neg_IC50_mean)
    dd.pos_glass = md / dd.pos_IC50_std
    dd.neg_glass = md / dd.neg_IC50_std

    csd = (dd.Npos - 1.0) * dd.pos_IC50_std ** 2 + (dd.Nneg - 1.0) * dd.neg_IC50_std ** 2
    csd /= dd.Npos + dd.Nneg - 2.0  # make sure this is float
    if csd > 0:
        dd.effectsize_ic50 = md / np.sqrt(csd)
    else:
        print("Unexpected negative effect size for %s %s. Set to zero. " % (drug_id, feature_name))
        dd.effectsize_ic50 = 0

    # Note that equal_var is a user parameter and affects
    # results. The ANOVA_results.txt obtained from SFTP
    # have different values meaning that the equal.var param
    # was set to False. Note that pvalue is stored at index 1
    dd.ttest = self._get_ttest(dd.negatives, dd.positives)
    return dd