def opt_federov(self, design_formula, trials, data, max_iterations = 1000000, nullify = 0):
    """Pick a D-optimal design with AlgDesign::optFederov on centre-coded data.

    Each tunable parameter gets a coding formula mapping its raw levels onto
    a centred/scaled `<name>e` column via rsm::coded.data; optFederov then
    selects `trials` rows from the coded candidate set.
    """
    info("Starting \"optFederov\" run")
    info("Using Search Space:")
    info(str(self.utils.str(data)))
    # One centre-and-scale encoding formula per tunable parameter.
    coding = {
        "{0}e".format(name): Formula(
            "{0}e ~ ({0} - {1}) / {1}".format(
                name, (self.parameter_ranges[name][1] - 1.0) / 2.0))
        for name in self.parameter_ranges
    }
    info("Encoding formulas: " + str(self.utils.str(ListVector(coding))))
    info("Data Dimensions: " + str(self.base.dim(data)))
    coded_data = self.rsm.coded_data(data, formulas=ListVector(coding))
    info("Coded data: " + str(self.utils.str(coded_data)))
    return self.algdesign.optFederov(
        frml=Formula(design_formula),
        data=coded_data,
        nTrials=trials,
        nullify=nullify,
        nRepeats=10,
        maxIteration=max_iterations)
def from_r(cls, lv: ro.ListVector):
    """Build this result object from the R ListVector of a BGLR "FIXED" ETA term."""
    cls._check_expected_model(lv, "FIXED")
    # Gather constructor arguments first, then build in one call.
    fields = dict(
        name=cls._get_name(lv),
        p=get_int(lv.rx2("p")),
        b=get_float_array(lv.rx2("b")),
        varB=get_float(lv.rx2("varB")),
        SD_b=get_float_array(lv.rx2("SD.b")),
    )
    return cls(**fields)
def export_smpl_split_to_r(smpls):
    """Convert (train, test) index splits into a pair of R ListVectors.

    Each split's indices are shifted by +1 because R is 1-based.
    Returns (all_train, all_test), one IntVector entry per split.
    """
    n_splits = len(smpls)
    train_r = ListVector.from_length(n_splits)
    test_r = ListVector.from_length(n_splits)
    for pos, (train_idx, test_idx) in enumerate(smpls):
        # shift python's 0-based indices to R's 1-based convention
        train_r[pos] = IntVector(train_idx + 1)
        test_r[pos] = IntVector(test_idx + 1)
    return train_r, test_r
def decode_data(self, data):
    """Map centre-coded parameter columns back onto rounded integer levels.

    Builds one inverse formula per parameter (`name ~ round((name_e * h) + h)`)
    and applies them with rsm::coded.data.
    """
    inverse = {}
    for name in self.parameter_ranges:
        half_span = (self.parameter_ranges[name][1] - 1.0) / 2.0
        inverse["{0}".format(name)] = Formula(
            "{0} ~ round(({0}e * {1}) + {1})".format(name, half_span))
    info("Encoding formulas: " + str(self.base.summary_default(ListVector(inverse))))
    info("Data Dimensions: " + str(self.base.dim(data)))
    return self.rsm.coded_data(data, formulas=ListVector(inverse))
def create_initial_sample(n_obs, dim, type='lhs', lower_bound=None, upper_bound=None):
    """Create an initial sample via random (uniform) or latin hypercube sampling.

    Args:
        n_obs: number of observations
        dim: number of dimensions
        type: type of sampling strategy (Default value = 'lhs')
        lower_bound: lower bounds of the sample, list of size dim (Default value = 0)
        upper_bound: upper bounds of the sample, list of size dim (Default value = 1)

    Returns:
        numpy array of shape (n_obs x dim)
    """
    bounds_low = [0] * dim if lower_bound is None else lower_bound
    bounds_high = [1] * dim if upper_bound is None else upper_bound
    # NOTE(review): bounds are passed through IntVector, so non-integer
    # bounds would be coerced -- confirm callers only ever pass integers.
    pcontrol = ListVector({
        'init_sample.type': type,
        'init_sample.lower': IntVector(bounds_low),
        'init_sample.upper': IntVector(bounds_high),
    })
    return np.array(flacco.createInitialSample(n_obs, dim, pcontrol))
def _translate_control(control):
    """Transforms a python dict to a valid R object.

    List values are converted to a typed R vector when homogeneous
    (bool/int/float/str); heterogeneous lists are stored as-is.

    Args:
        control: python dict

    Returns:
        R object of type ListVector
    """
    ctrl = {}
    for key, lst in control.items():
        if isinstance(lst, list):
            # Bug fix: bool must be tested BEFORE int -- bool is a subclass
            # of int in Python, so the previous int-first ordering turned
            # boolean lists into IntVector instead of BoolVector.
            if all(isinstance(n, bool) for n in lst):
                entry = BoolVector(control[key])
            elif all(isinstance(n, int) for n in lst):
                entry = IntVector(control[key])
            elif all(isinstance(n, float) for n in lst):
                entry = FloatVector(control[key])
            elif all(isinstance(n, str) for n in lst):
                entry = StrVector(control[key])
            else:
                entry = None
            if entry is not None:
                ctrl[key] = entry
            else:
                # heterogeneous list: let rpy2 handle it directly
                ctrl[key] = lst
        # NOTE(review): non-list values are silently dropped from the
        # result -- confirm this is intentional.
    return ListVector(ctrl)
def from_r(cls, lv: ro.ListVector):
    """Build this result object from the R ListVector of a BGLR "BL" ETA term."""
    cls._check_expected_model(lv, "BL")

    # small extraction helpers keep the constructor call readable
    def _float(key):
        return get_float(lv.rx2(key))

    def _array(key):
        return get_float_array(lv.rx2(key))

    return cls(
        name=cls._get_name(lv),
        minAbsBeta=_float("minAbsBeta"),
        p=get_int(lv.rx2("p")),
        MSx=_float("MSx"),
        R2=_float("R2"),
        lambda_=_float("lambda"),
        type=get_str(lv.rx2("type")),
        shape=_float("shape"),
        rate=_float("rate"),
        b=_array("b"),
        tau2=_array("tau2"),
        SD_b=_array("SD.b"),
    )
def create_roast_scorer(gene_sets='c2.cp.kegg', id_type='entrez', grouping='by_substance', q_value_cutoff=0.1, na_action='fill_0', cache=True, cache_signatures=False):
    """Build a scoring function comparing disease vs compound signatures with limma::roast.

    Only cache signatures when doing permutations, otherwise it will only slow it down.

    Args:
        gene_sets: name of the gene-set collection loaded through `db`
        id_type: gene identifier type used when loading the collection
        grouping: grouping strategy forwarded to `scoring_function`
        q_value_cutoff: disease gene sets with 'fdr_q-val' above this are dropped
        na_action: how `combine_gsea_results` handles missing values
        cache: cache roast results (disease side)
        cache_signatures: additionally cache compound (signature) roast results
    """
    importr('limma')
    importr('Biobase')
    # R named list: gene-set name -> character vector of member gene ids.
    gene_sets_r = ListVector({
        gene_set.name: StrVector(list(gene_set.genes))
        for gene_set in db.load(gene_sets=gene_sets, id_type=id_type).gene_sets
    })

    def set_gene_set_collection():
        # Publish the collection into R's global environment under its name.
        globalenv[gene_sets] = gene_sets_r

    def roast_score(disease: ExpressionWithControls, compound: ExpressionWithControls):
        # Need >= 2 samples per group, otherwise in-group variance is undefined.
        if len(compound.cases.columns) < 2 or len(
                compound.controls.columns) < 2:
            print(
                f'Skipping {compound} not enough degrees of freedom (no way to compute in-group variance)'
            )
            return None
        if cache:
            multiprocess_cache_manager.respawn_cache_if_needed()
        try:
            disease_gene_sets = roast(disease, gene_sets=gene_sets, use_cache=cache)
            # Keep only disease gene sets passing the FDR cutoff.
            disease_gene_sets.drop(disease_gene_sets[
                disease_gene_sets['fdr_q-val'] > q_value_cutoff].index,
                inplace=True)
            signature_gene_sets = roast(compound, gene_sets=gene_sets,
                                        use_cache=cache and cache_signatures)
            joined = combine_gsea_results(disease_gene_sets,
                                          signature_gene_sets, na_action)
            # Occasionally (about 1% of calls) nudge R's garbage collector.
            if randint(0, 100) == 1:
                r('gc()')
            return joined.score.mean()
        except RRuntimeError as e:
            # best-effort: log the R error and treat this pair as unscored
            print(e)
            return None

    return scoring_function(roast_score, input=ExpressionWithControls,
                            grouping=grouping,
                            before_batch=set_gene_set_collection)
def StrListVector(strList):
    """Convert input to a StrVector, or a ListVector recursively.

    Returns R NULL for anything empty or without a length (None, '',
    empty containers, non-str scalars).
    """
    # Explicit emptiness check instead of the previous bare
    # `except:` around an `assert` -- asserts vanish under `python -O`
    # and the bare except masked unrelated errors.
    try:
        if len(strList) == 0:
            return NULL
    except TypeError:  # no len(): None, numbers, other non-sequences
        return NULL
    if isinstance(strList, ListVector):  # already a ListVector
        return ListVector(strList)
    elif isinstance(strList, StrVector):  # already a StrVector
        return StrVector(strList)
    elif isinstance(strList, str):  # str scalar, so wrap in a StrVector
        return StrVector([strList])
    elif any(types.is_list_like(s) for s in strList):  # not the deepest list
        # recurse into each element; (None, value) keeps entries unnamed
        return ListVector([(None, StrListVector(s)) for s in strList])
    else:
        return StrVector(list(strList))  # deepest list(-like) of str types
def from_r(cls, lv: ro.ListVector):
    """Build this result object from the R ListVector of a BGLR "BRR" ETA term."""
    cls._check_expected_model(lv, "BRR")

    def _float(key):
        return get_float(lv.rx2(key))

    def _array(key):
        return get_float_array(lv.rx2(key))

    return cls(
        name=cls._get_name(lv),
        p=get_int(lv.rx2("p")),
        df0=_float("df0"),
        R2=_float("R2"),
        MSx=_float("MSx"),
        S0=_float("S0"),
        b=_array("b"),
        varB=_float("varB"),
        SD_b=_array("SD.b"),
        SD_varB=_float("SD.varB"),
    )
def test_import_intercell_network(self):
    """Python wrapper must match the R OmnipathR result for the same params."""
    from rpy2.robjects import ListVector

    interactions_params = {"resources": "CellPhoneDB"}
    transmitter_params = {"categories": "ligand"}
    receiver_params = {"categories": "receptor"}

    def to_r(params):
        # R side takes named lists built from (key, value) pairs
        return ListVector(list(params.items()))

    expected = self.omnipathr.import_intercell_network(
        interactions_param=to_r(interactions_params),
        transmitter_param=to_r(transmitter_params),
        receiver_param=to_r(receiver_params),
    )
    actual = op.interactions.import_intercell_network(
        interactions_params=interactions_params,
        transmitter_params=transmitter_params,
        receiver_params=receiver_params,
    )
    _assert_dataframes_equal(expected, actual)
def dict_to_named_list(dct):
    """Convert a mapping-like object into an R named list; pass others through."""
    if not isinstance(dct, (dict, Parameter, pd.core.series.Series)):
        return dct
    converted = dict(dct.items())
    # convert numbers to builtin types before conversion (see rpy2 #548)
    for key, val in converted.items():
        if isinstance(val, numbers.Integral):
            converted[key] = int(val)
        elif isinstance(val, numbers.Number):
            converted[key] = float(val)
    return ListVector(converted)
def get_R_theta(pi, c, Gamma, A, b, Sigma):
    """Return an R-compatible named list built from numpy arrays.

    2-D arrays are transposed and 3-D arrays have their leading (component)
    axis moved last before handing them to R.
    """
    numpy2ri.activate()
    theta_r = ListVector(dict(
        pi=pi,
        c=c.T,
        Gamma=Gamma.transpose((1, 2, 0)),
        A=A.transpose((1, 2, 0)),
        b=b.T,
        Sigma=Sigma.transpose((1, 2, 0)),
    ))
    numpy2ri.deactivate()
    return theta_r
def _convert_python_to_R(data: typing.Union[dict, pd.DataFrame]):
    """Convert a python object to an R object brms can handle.

    * python dict      -> R list
    * python dataframe -> R dataframe

    Raises:
        ValueError: for any other input type.
    """
    converter = default_converter + pandas2ri.converter + numpy2ri.converter
    with localconverter(converter) as cv:
        if isinstance(data, dict):
            return ListVector(data)
        if isinstance(data, pd.DataFrame):
            return DataFrame(data)
        raise ValueError("Data should be either a pandas dataframe or a dictionary")
def dict_to_named_list(dct):
    """Convert a mapping-like object into an R named list; pass others through."""
    if not isinstance(dct, (dict, Parameter, pd.core.series.Series)):
        return dct
    items = {key: val for key, val in dct.items()}
    # convert numbers, numpy arrays and pandas dataframes to builtin
    # types before conversion (see rpy2 #548)
    converter = default_converter + pandas2ri.converter + numpy2ri.converter
    with conversion.localconverter(converter):
        for key in items:
            items[key] = conversion.py2rpy(items[key])
        return ListVector(items)
def train(self, omics_dataset: Dict, save: bool = False):
    """Trains the model on multi-omics data.

    Parameters
    ----------
    omics_dataset: dict
        Multi-omics dataset, keyed by datatype
    save: bool, default False
        Whether to save the results

    Returns
    -------
    The fitted iClusterPlus model.
    (Fix: the previous `-> None` annotation was wrong -- the model is returned.)
    """
    iClusterPlus = importr("iClusterPlus")
    base = importr("base")  # loaded for its R-session side effects
    # Fix: isinstance instead of `type(...) != dict` -- also accepts dict
    # subclasses such as OrderedDict, and is the idiomatic type check.
    if not isinstance(omics_dataset, dict):
        raise ValueError("omics_data is supposed to be of type dict")
    X = list(omics_dataset.values())
    self._datatypes = list(omics_dataset.keys())
    n_datasets = len(X)
    n_samples = X[0].shape[0]
    feature_counts = [x.shape[1] for x in X]
    if n_datasets > 4:
        raise ValueError(
            "iClusterPlus allows a maximum of only 4 mulit-omics datasets")
    self._initialize_run(n_samples, n_datasets, feature_counts)
    # iClusterPlus takes fixed positional data slots; unused ones stay NULL.
    data = ListVector.from_length(6)
    for idx in range(6):
        data[idx] = NULL
    for idx, omics_data in enumerate(X):
        data[idx] = omics_data
    fitted_model = iClusterPlus.iClusterPlus(
        data[0], data[1], data[2], data[3],
        self._types, self._K, self._alpha, self._lambda_reg,
        maxiter=self._max_iter, eps=self._epsilon)
    if save:
        self.save(results=fitted_model)
    return fitted_model
def run_mimp(mutation_source: str, site_type_name: str, model: str = None, enzyme_type='kinase') -> DataFrame:
    """Run MIMP for given source of mutations and given site type.

    Args:
        mutation_source: name of mutation source
        site_type_name: name of site type
        model:
            name of the model or path to custom .mimp file,
            if not specified, an automatically generated, custom,
            site-based model will be used.
        enzyme_type:
            is the enzyme that modifies the site a kinase?
            if not use "catch-all" strategy: train MIMP as if there was just
            one site-specific enzyme - just because we do not have information
            about enzyme-site specificity for enzymes other than kinases (yet!)

    Returns:
        MIMP results converted to a pandas DataFrame; an empty DataFrame
        when MIMP returns NULL.
    """
    site_type = SiteType.query.filter_by(name=site_type_name).one()
    if not model:
        # fall back to a site-based model generated (or cached) on demand
        model = get_or_create_model_path(site_type, enzyme_type)
    mimp = load_mimp()
    sequences, disorder, mutations, sites = prepare_active_driver_data(
        mutation_source, site_type_name)
    # Build "wt-residue + position + mut-residue" labels, one per mutation row.
    mutations = mutations.assign(mutation=Series(
        m.wt_residue + str(m.position) + m.mut_residue
        for m in mutations.itertuples(index=False)).values)
    sites.position = to_numeric(sites.position)
    sequences = ListVector(sequences)
    modified_residues = site_type.find_modified_residues()
    # 'model.data' contains a dot, so it must be passed via **{} syntax.
    mimp_result = mimp.site_mimp(
        mutations[['gene', 'mutation']], sequences,
        site_type=site_type_name, sites=sites[['gene', 'position']],
        residues_groups=residues_groups(site_type, modified_residues),
        **{'model.data': model})
    if mimp_result is NULL:
        return DataFrame()
    return pandas2ri.ri2py(mimp_result)
def ro(self): """Expose a view as RObject, to be manipulated in R environment""" # Convert to R vector of correct data type if isinstance(self.iloc, dict): out = ListVector([(None, PyR(v).ro) for v in self.iloc]) if types.is_float_dtype(self.iloc): out = FloatVector(self.iloc.reshape(-1, order='F')) elif types.is_integer_dtype(self.iloc): out = IntVector(self.iloc.reshape(-1, order='F')) else: out = StrVector(self.iloc.reshape(-1, order='F')) if len(self.dim) > 1: # reshape to R Array if has non-trivial dim out = ro.r.array(out, dim=IntVector(self.dim)) # Collect R object name attributes if hasattr(self, 'rownames'): out.rownames = StrVector(self.rownames) if hasattr(self, 'colnames'): out.colnames = StrVector(self.colnames) if hasattr(self, 'names'): out.names = ListVector(self.names) if isinstance( self.names, ListVector) else StrVector(self.names) return out
def from_r(self, lv: ro.ListVector) -> "BGLRResult":
    """Dispatch an R BGLR ETA ListVector to the matching result class."""
    dispatch: "Dict[str, Type[BGLRResult]]" = {
        "FIXED": FixedResult,
        "BRR": BRRResult,
        "BL": BLResult,
        "BayesA": BayesAResult,
        "BayesB": BayesBResult,
        "BayesC": BayesCResult,
        "RKHS": RKHSResult,
    }
    model = get_str(lv.rx2("model"))
    target = dispatch.get(model)
    if target is None:
        raise ValueError(
            f"Model {model} does not correspond to one of the "
            "known BGLR models.")
    return target.from_r(lv)
def collection_to_R(collection, trim_to, min=5, max=500, name=None):
    """Export a gene-set collection into R's global environment.

    Gene sets are optionally restricted to `trim_to` gene ids and filtered
    by size (strictly between `min` and `max` genes). The filtered dict
    (name -> StrVector of genes) is also returned.
    """
    if not name:
        name = collection.name
    gene_ids = trim_to
    # limma::cameraPR goes crazy without this subset;
    # limma::mroast seems to work fine (and be more aware of the
    # limitted statistical support)
    source_sets = (
        collection.subset(gene_ids).gene_sets
        if gene_ids else collection.gene_sets
    )
    filtered = {
        gs.name: StrVector(list(gs.genes))
        for gs in source_sets
        if max > len(gs.genes) > min
    }
    globalenv[name] = ListVector(filtered)
    return filtered
def __getitem__(self, args):
    """Returns copy of subset of data object from slice or index args.

    For dict-backed data, returns the stored item directly; otherwise
    performs R-like per-dimension indexing on the underlying array and
    returns a new PyR wrapping the subset together with surviving names.
    """
    try:
        if isinstance(self.iloc, dict):  # return item of dict
            if isinstance(args, int):
                # resolve an integer key first through names, falling back
                # to the dict's own key order
                try:
                    args = list(self.names).index(args)
                except:  # NOTE(review): bare except hides real errors here
                    args = list(self.iloc.keys()).index(args)
            return self.iloc[args]
        # replace any str labels in args with its index in self.names
        if isinstance(args, tuple) and self.names is not None:
            args = tuple(self.index(a, i) for i, a in enumerate(args))
        # extract corresponding subset of names
        if self.names:
            names_ = deepcopy(self.names)
            names = ListVector(names_)
            for i in range(len(self.names)):
                if isinstance(names_[i], StrVector):
                    s = np.array(names_[i])[args[i]]
                    # re-wrap a scalar string so StrVector gets a sequence
                    names[i] = StrVector([s] if isinstance(s, str) else s)
        else:
            names = NULL
        # finally extract by looping over each dim; enables R-like indexing
        out = deepcopy(self.iloc)
        for i, arg in enumerate(args):
            # index one dimension at a time, keeping the rest unsliced
            a = [slice(None)] * len(args)
            a[i] = arg
            dims = len(out.shape)
            out = out[tuple(a)]
            if self.verbose:
                print(i, out.shape, dims, tuple(a))
            if len(out.shape) < dims:  # if this dimension is flattened out
                names = names[:i] + names[(i + 1):]
        return PyR(out, names=names)
    except:
        # NOTE(review): this bare except swallows the original traceback and
        # re-raises a generic Exception -- consider `raise ... from e`.
        raise Exception(f"getitem: {args}")
def train(self, omics_dataset: Dict, save: bool = False, **kwargs):
    """Trains the model on multi-omics data.

    Parameters
    ----------
    omics_dataset: dict
        Multi-omics dataset, keyed by datatype
    save: bool, default False
        Whether to save the results

    Returns
    -------
    The fitted iCluster model.
    """
    iClusterPlus = importr("iClusterPlus")
    base = importr("base")  # loaded for its R-session side effects
    # Fix: isinstance instead of `type(...) != dict` -- also accepts dict
    # subclasses such as OrderedDict, and is the idiomatic type check.
    if not isinstance(omics_dataset, dict):
        raise ValueError("omics_data is supposed to be of type dict")
    X = list(omics_dataset.values())
    self._datatypes = list(omics_dataset.keys())
    n_datasets = len(X)
    n_samples = X[0].shape[0]
    feature_counts = [x.shape[1] for x in X]
    self._initialize_run(n_samples, n_datasets, feature_counts)
    # One R list slot per dataset, filled in datatype order.
    data = ListVector.from_length(n_datasets)
    for idx, omics_data in enumerate(X):
        data[idx] = omics_data
    fitted_model = iClusterPlus.iCluster(
        data, self._num_subtypes, self._lambda_reg,
        scalar=False, max_iter=self.max_iter, epsilon=self.epsilon)
    if save:
        self.save(results=fitted_model)
    return fitted_model
def gllim(self,sigma_type,gamma_type,T=None,Y=None, Lw=0,in_theta=r('NULL')):
    """Fit a GLLiM model via the R xLLiM package.

    Args:
        sigma_type: "full" or "iso" -- constraint applied to Sigma
        gamma_type: "full" or "iso" -- constraint applied to Gamma
        T: responses (numpy, samples x dims); defaults to self.responses
        Y: covariates (numpy, samples x dims); defaults to self.covariates
        Lw: latent dimension forwarded to xLLiM's gllim
        in_theta: optional initial theta as an R list.
            NOTE(review): the default `r('NULL')` is evaluated once at
            function-definition time and calls into R -- confirm intended.

    Returns:
        Tuple of numpy arrays (pi, c, Gamma, A, b, Sigma), with R's
        trailing component axis moved to the front for 3-D quantities.
    """
    constraints = {"full":"","iso":"i"}
    c_S = constraints[sigma_type]
    c_G = constraints[gamma_type]
    dic_cst = {"Sigma":c_S}
    if c_G:
        dic_cst["Gammat"] = c_G
    if in_theta:
        # an explicit initial theta replaces the stored r_init
        in_r = r('NULL')
        print(np.array(in_theta.rx('c')[0]).shape)
    else:
        in_r = self.r_init
    if T is None:
        T = self.responses
    else:
        # transposed before conversion -- presumably xLLiM expects
        # dims x samples; confirm against self.responses' layout
        T = numpy2ri.numpy2ri(T.T)
    if Y is None:
        Y = self.covariates
    else:
        Y = numpy2ri.numpy2ri(Y.T)
    mod = self.xLLiM.gllim(T, Y, self.K, in_r=in_r, maxiter=self.maxiter,
                           Lw=Lw, cstr=ListVector(dic_cst),
                           in_theta=in_theta, verb=1)
    self.model = mod
    # Unpack the R result back into numpy, undoing R's axis conventions.
    return np.array(mod.rx('pi')[0]),\
           np.array(mod.rx('c')[0]).T,\
           np.array(mod.rx('Gamma')[0]).transpose((2,0,1)),\
           np.array(mod.rx('A')[0]).transpose((2,0,1)),\
           np.array(mod.rx("b")[0]).T,\
           np.array(mod.rx('Sigma')[0]).transpose((2,0,1))
def dictToList(obj):
    """Wrap a python mapping as an R named list (rpy2 ListVector)."""
    from rpy2.robjects import ListVector
    return ListVector(obj)
def call_gsoa(request):
    """Run a GSOA analysis for one queued request and e-mail the report.

    Captures all R console output into a local buffer (included in failure
    e-mails), runs GSOA::GSOA_ProcessFiles, renders an RMarkdown report,
    and always restores R's default console sinks afterwards.

    Args:
        request: task dict with at least 'dataFilePath' plus the optional
            GSOA parameters and an 'email' address.

    Returns:
        "no data" when no usable data file path was supplied; otherwise None.
    """
    # data from task tiger
    print("request: {}".format(request))
    local_buffer = []
    try:
        gsoa = importr('GSOA')
        #flex_dashboard = importr('')
        # strip bookkeeping fields
        # NOTE(review): `args` is built but never used afterwards -- confirm
        # whether it was meant to be passed to GSOA_ProcessFiles.
        args = request.copy()
        for field in NECESSARY_FIELDS:
            args.pop(field)
        # bail out early when no usable data file path was provided
        if len(str(request.get('dataFilePath'))) < 2:
            return "no data"
        outFilePath = "/data/{}_{}.txt".format(
            request.get('email', 'results_txt').replace('.com', '').strip(),
            request.get('dataFilePath').split(".")[0])
        print("email: {}".format(request.get('email', 'results_txt')))
        #redirect everything from R into the python console (local buffer)
        rinterface.set_writeconsole_warnerror(
            lambda line: local_buffer.append(line))
        rinterface.set_writeconsole_regular(
            lambda line: local_buffer.append(line))
        result = gsoa.GSOA_ProcessFiles(
            dataFilePath=request.get('dataFilePath', ''),
            classFilePath=request.get('classFilePath', ''),
            gmtFilePath=request.get('gmtFilePath', ''),
            outFilePath=outFilePath,
            numCores=multiprocessing.cpu_count(),
            numRandomIterations=request.get('numRandomIterations', ''),
            classificationAlgorithm=request.get('classificationAlgorithm', ''),
            numCrossValidationFolds=request.get('numCrossValidationFolds', ''),
            removePercentLowestExpr=request.get('removePercentLowestExpr', ''),
            removePercentLowestVar=request.get('removePercentLowestVar', ''))
        print("Writing RMarkdown")
        # NOTE(review): `outFilePath_html` is computed but the render call
        # rebuilds the same path inline -- confirm one of them is redundant.
        outFilePath_html = outFilePath.replace('txt', 'html')
        rmarkdown.render(
            '/app/GSOA_Report.Rmd',
            output_file=outFilePath.replace('txt', 'html'),
            params=ListVector({
                'data1': outFilePath,
                'alg': request.get('classificationAlgorithm', 'svm'),
                'class': request.get('classFilePath', ''),
                'crossval': request.get('numCrossValidationFolds', ''),
                'data_files': request.get('dataFilePath', ''),
                'genesets': request.get('gmtFilePath', ''),
                #'hallmarks':
                'iterations': request.get('numRandomIterations', ''),
                'lowexpress': request.get('removePercentLowestExpr', ''),
                #'results_hallmark' :
                'var': request.get('removePercentLowestVar', '')
            }))
        email_report(request.get('email'), outFilePath)
    except Exception as e:
        # e-mail the failure together with everything R printed so far
        email_error(request.get('email'), e, local_buffer)
    finally:
        # always restore R console output to its default sinks
        rinterface.set_writeconsole_warnerror(rinterface.consolePrint)
        rinterface.set_writeconsole_regular(rinterface.consolePrint)
def from_r(cls, lv: ro.ListVector):
    """Build this result object from the R ListVector of a BGLR "RKHS" ETA term."""
    cls._check_expected_model(lv, "RKHS")

    def _float(key):
        return get_float(lv.rx2(key))

    def _array(key):
        return get_float_array(lv.rx2(key))

    # K's pseudo-inverse is precomputed once and stored alongside K.
    kernel = _array("K")
    return cls(
        name=cls._get_name(lv),
        K=kernel,
        K_inv=np.linalg.pinv(kernel),
        V=_array("V"),
        d=_array("d"),
        tolD=_float("tolD"),
        levelsU=get_int(lv.rx2("levelsU")),
        df0=_float("df0"),
        R2=_float("R2"),
        S0=_float("S0"),
        u=_array("u"),
        varU=_float("varU"),
        uStar=_array("uStar"),
        SD_u=_array("SD.u"),
        SD_varU=_float("SD.varU"),
    )
def from_r(cls, lv: ro.ListVector):
    """Build this result object from the R ListVector of a BGLR "BayesC" ETA term."""
    cls._check_expected_model(lv, "BayesC")

    def _float(key):
        return get_float(lv.rx2(key))

    def _array(key):
        return get_float_array(lv.rx2(key))

    return cls(
        name=cls._get_name(lv),
        p=get_int(lv.rx2("p")),
        MSx=_float("MSx"),
        R2=_float("R2"),
        df0=_float("df0"),
        probIn=_float("probIn"),
        counts=_float("counts"),
        countsIn=_float("countsIn"),
        countsOut=_float("countsOut"),
        S0=_float("S0"),
        b=_array("b"),
        d=_array("d"),
        varB=_array("varB"),
        SD_b=_array("SD.b"),
        SD_varB=_array("SD.varB"),
        SD_probIn=_float("SD.probIn"),
    )
def __init__(self, params):
    """Set up the GPR-based search: load R packages, derive parameter
    ranges from the Orio axis definitions, and read tuner arguments.

    Args:
        params: Orio search parameter dict; must contain "axis_names"
            and "axis_val_ranges".
    """
    # R packages used throughout the search
    self.base = importr("base")
    self.utils = importr("utils")
    self.stats = importr("stats")
    self.algdesign = importr("AlgDesign")
    self.car = importr("car")
    self.rsm = importr("rsm")
    self.dplyr = importr("dplyr")
    self.quantreg = importr("quantreg")
    self.dicekrig = importr("DiceKriging")
    self.diced = importr("DiceDesign")
    #numpy.random.seed(11221)
    #self.base.set_seed(11221)
    self.complete_design_data = None
    self.complete_search_space = None
    # default budget; may be overridden by __readAlgoArgs below
    self.total_runs = 20
    orio.main.tuner.search.search.Search.__init__(self, params)
    self.name = "GPR"
    # Each parameter is mapped to the index range [0, number_of_levels).
    self.parameter_ranges = {}
    for i in range(len(self.params["axis_val_ranges"])):
        self.parameter_ranges[self.params["axis_names"][i]] = [
            0, len(self.params["axis_val_ranges"][i])
        ]
    info("Parameters: " + str(self.parameter_ranges))
    # Actual axis values behind each index, keyed by parameter name.
    self.parameter_values = {}
    for i in range(len(self.params["axis_val_ranges"])):
        self.parameter_values[self.params["axis_names"]
                              [i]] = self.params["axis_val_ranges"][i]
    # NOTE(review): self.axis_val_ranges / self.axis_names are presumably
    # populated by the Search base __init__ above -- confirm.
    info("Parameter Real Ranges: " + str(self.axis_val_ranges))
    info("Parameter Range Values: " + str(self.parameter_values))
    # R named list: axis name -> IntVector of its values.
    self.range_matrix = {}
    for i in range(len(self.axis_names)):
        self.range_matrix[self.axis_names[i]] = IntVector(
            self.axis_val_ranges[i])
    self.range_matrix = ListVector(self.range_matrix)
    info("DataFrame Ranges: " +
         str(self.base.summary_default(self.range_matrix)))
    # initial design: one experiment per parameter plus two
    self.starting_sample = int(round(len(self.params["axis_names"]) + 2))
    self.steps = 22
    self.extra_experiments = int(round(len(self.params["axis_names"]) * 1))
    self.testing_set_size = 300000
    self.failure_multiplier = 100
    self.__readAlgoArgs()
    self.experiment_data = None
    self.best_points_complete = None
    if self.time_limit <= 0 and self.total_runs <= 0:
        err(('%s search requires search time limit or ' +
             'total number of search runs to be defined') %
            self.__class__.__name__)
    # local sqlite database collecting per-run summaries
    self.run_summary_database = dataset.connect("sqlite:///" +
                                                'run_summary.db')
    self.summary = self.run_summary_database["dlmt_run_summary"]
    info("Starting sample: " +
         str(self.starting_sample))
    info("GPR steps: " + str(self.steps))
    info("Experiments added per step: " + str(self.extra_experiments))
    info("Initial Testing Set Size: " + str(self.testing_set_size))
    info("Constraints: " + str(self.constraint))
def _get_name(lv: ro.ListVector) -> str:
    """Extract an ETA term's name, stripping the leading "ETA_" prefix."""
    prefix = "ETA_"
    raw_name = get_str(lv.rx2("Name"))
    return raw_name[len(prefix):]
def _check_expected_model(lv: ro.ListVector, model: str):
    """Raise ValueError unless lv's "model" field equals the expected model name."""
    this_model = get_str(lv.rx2("model"))
    if this_model == model:
        return
    raise ValueError(
        f"Expected to get {model}, but got results for {this_model}.")