def predict_ratios_with_SD(predictors, measures, nruns, outDir):
    '''
    Predict every ratio listed in `predictors` together with its SD, then
    display the table and save it as 'predicted_ratios'.

    Parameters
    predictors: df, index are ratios, columns are ['model', 'errormodel', 'features']
    measures: df, measured MDVs, rows are MDVs, columns are ['mean', 'sd']
    nruns: int, # of runs for Monte Carlo simulation
    outDir: str, output directory
    '''

    predRatiosSDs = pd.DataFrame(columns = ['predicted', 'sd'])
    for ratio, [baseModel, errorModel, features] in predictors.iterrows():

        # measured MDVs used as features by this ratio's model
        subMeasures = measures.loc[features, :]

        if subMeasures['sd'].sum() == 0.0:
            # all measurement SDs are zero: pass the error model instead of SDs
            predRatio, predSD = predict_ratio_with_SD(baseModel, subMeasures['mean'],
                                                      errorModel = errorModel)
        else:
            # measurement SDs available: pass them along with the number of runs
            predRatio, predSD = predict_ratio_with_SD(baseModel, subMeasures['mean'],
                                                      SDs = subMeasures['sd'], nruns = nruns)

        # np.asscalar was removed from NumPy (1.23); ndarray.item() is its
        # replacement and np.asarray also accepts values that are already scalars
        predRatiosSDs.loc[ratio, :] = [np.asarray(predRatio).item(),
                                       np.asarray(predSD).item()]

    display_estimated_ratios_or_fluxes('ratio', predRatiosSDs)

    save_data(predRatiosSDs, 'predicted_ratios', outDir, True, True)
def generate_random_fluxes_in_parallel(S, revs, fluxCons, ratioCons, bndCons, nsims, njobs,
                                       outDir):
    '''
    Sample random net flux distributions under the given constraints, plot and
    save them, and convert them to total fluxes.

    Parameters
    S: df, stoichiometric matrix, balanced metabolites in rows, net reactions fluxes in columns
    revs: ser, reaction reversibility
    fluxCons: ser, flux value constraints (may be None)
    ratioCons: df, ratio range constraints, columns are ['greater_than_zero', 'smaller_than_zero'],
        e.g. a < v1/v2 < b will be transformed into v1 - a*v2 > 0 and v1 - b*v2 < 0
    bndCons: df, flux range constraints, columns are ['lb', 'ub']
    nsims: int, # of flux distributions to simulate
    njobs: int, # of jobs run in parallel
    outDir: str, output directory

    Returns
    totalFluxDistribs: df, total fluxes distributions, columns are fluxes, rows are runs
    '''

    hasFluxCons = fluxCons is not None

    # report the null space dimension of S against the number of fixed fluxes
    nullDim = null_space(S).shape[1]
    nFixed = fluxCons.shape[0] if hasFluxCons else 0
    display_DOF(nullDim, nFixed)

    netFluxDistribs, netFluxBnds = flux_sampler(S, revs, fluxCons, ratioCons, bndCons,
                                                nsims, njobs)
    plot_random_flux_distributions(netFluxDistribs, netFluxBnds, outDir)
    save_data(netFluxDistribs, 'random_fluxes', outDir, False, True)

    # scale by the mean of the nonzero fixed fluxes, or by 1 when nothing is fixed
    if hasFluxCons:
        scaler = fluxCons[fluxCons != 0].mean()
    else:
        scaler = 1

    return generate_total_flux_from_net_flux(netFluxDistribs, revs, scaler)
def simulate_ratios_MDVs_in_parallel(simEMUs, symRatios, symAs, symBs, subMDVsAll, fluxDistribs,
                                     quantile, njobs, outDir):
    '''
    Split the flux distributions into chunks, run `simulator` on each chunk in
    a process pool, combine and filter the results, then save them as
    'ratios_MDVs'.

    Parameters
    simEMUs: lst, of which the MDVs will be simulated
    symRatios: df, index is ratio, columns are ['args', 'symbol']
    symAs: dict, key is size, value is like [[symbol variables of A], symbol matrix A,
        [column EMUs of A]]
    symBs: dict, key is size, value is like [[symbol variables of B], symbol matrix B,
        [column EMUs of B]]
    subMDVsAll: dict of dict, like {tracer: {substrate EMU: MDV}}
    fluxDistribs: df, fluxes distributions, columns are fluxes, rows are runs
    quantile: float, simulated values in the quantile interval
        (i.e. [0.5 - quantile/2, 0.5 + quantile/2]) are retained
    njobs: int, # of jobs run in parallel
    outDir: str, output directory

    Returns
    ratiosMDVsAll: df, combined ratiosMDVs, of which the index is flux distribution NO,
        columns are flux ratios and MDVs
    '''

    nrows = fluxDistribs.shape[0]
    length = int(np.ceil(nrows / njobs))

    # Build only chunks that actually contain rows, so no worker is handed an
    # empty slice when njobs is larger than needed. With no rows at all the
    # (empty) input is still dispatched once, as the previous code did.
    if length > 0:
        fluxDistribChunks = [fluxDistribs[start:start + length]
                             for start in range(0, nrows, length)]
    else:
        fluxDistribChunks = [fluxDistribs]

    # the context manager releases the workers even if submission fails
    with Pool(processes=njobs) as pool:
        pending = [
            pool.apply_async(func=simulator,
                             args=(simEMUs, symRatios, symAs, symBs, subMDVsAll, chunk))
            for chunk in fluxDistribChunks
        ]

        # wait for every job before collecting; get() re-raises worker errors
        pool.close()
        pool.join()

        ratiosMDVs = [res.get() for res in pending]

    ratiosMDVsAll = pd.concat(ratiosMDVs, ignore_index=True)
    ratiosMDVsAll = filter_ratios(symRatios.index, ratiosMDVsAll, quantile)

    save_data(ratiosMDVsAll, 'ratios_MDVs', outDir, False, True)

    # hand back the frame the docstring has always promised
    return ratiosMDVsAll
def estimate_fluxes_with_SD(S, AeqConsAll, beqConsAll, bndCons, nruns, outDir):
    '''
    Estimate fluxes once per set of equality constraints, convert them to net
    fluxes, summarise each flux by its mean and SD over the runs, then display
    and save the summary as 'estimated_fluxes'.

    Parameters
    S: df, stoichiometric matrix, balanced metabolites in rows, total reactions in columns
    AeqConsAll: lst of df, A of equality constraints
    beqConsAll: lst of ser, b of equality constraints
    bndCons: df, boundary constraints of flux
    nruns: int, # of runs for Monte Carlo simulation
    outDir: str, output directory

    Returns
    estNetFluxesSDs: df, estimated net fluxes, rows are rxns, columns are ['estimated', 'sd']
    '''

    # report the null space dimension of S against the number of equality constraints
    neqns = null_space(S).shape[1]
    ncons = AeqConsAll[0].shape[0]
    display_DOF(neqns, ncons)

    # one row per run, filled in the order of the constraint sets
    estTotalFluxes = pd.DataFrame(index=np.arange(nruns), columns=S.columns)
    for i, (AeqCons, beqCons) in enumerate(zip(AeqConsAll, beqConsAll)):
        estFluxes = estimate_fluxes(S, AeqCons, beqCons, bndCons)
        estTotalFluxes.loc[i, :] = estFluxes

    estNetFluxes = calculate_net_flux_from_total_flux(estTotalFluxes)

    # column-wise statistics over the runs
    estNetFluxesSDs = pd.DataFrame({
        'estimated': estNetFluxes.mean(axis=0),
        'sd': estNetFluxes.std(axis=0)
    })

    display_estimated_ratios_or_fluxes('flux', estNetFluxesSDs)

    save_data(estNetFluxesSDs, 'estimated_fluxes', outDir, True, True)

    # the documented result was previously computed but never returned
    return estNetFluxesSDs
def select_ratios(EMUs, EAMs, symAs, symBs, subMDVsAll, fluxDistrib, outDir, exNodes=None,
                  thold1=1e12, thold2=1e-3):
    '''
    Select flux ratios at EMUs that have more than one nonzero input in their
    EAM column. For every tracer the input MDVs are simulated; when the matrix
    of input MDVs has an empty null space, each input divided by the sum of all
    inputs is taken as a ratio. Duplicated ratios are dropped, the rest are
    named r1, r2, ... and saved as 'selected_ratios'.

    Parameters
    EMUs: lst, of which the MDVs will be simulated
    EAMs: dict, EMU adjacency matrix (EAM) of different size, like {size: EAM}.
        NOTE: the cells of EAM are symbols
    symAs: dict, key is size, value is like [[symbol variables of A], symbol matrix A,
        [column EMUs of A]]
    symBs: dict, key is size, value is like [[symbol variables of B], symbol matrix B,
        [column EMUs of B]]
    subMDVsAll: dict of dict, like {tracer: {substrate EMU: MDV}}
    fluxDistrib: ser, flux distribution
    outDir: str, output directory
    exNodes: lst or None, node metabolites excluded for ratio selection
        (None excludes nothing)
    thold1: float, threshold to calculate the null space, the greater threshold,
        the easier to get non-empty null space (higher DOF)
    thold2: float, distance threshold, under which column MDVs will be considered equal
        (only consumed by find_independent_columns, which is currently not called)

    Returns
    selRatiosAll: df, selected ratios, index is ratio, columns are ['args', 'symbol']

    Raises
    ValueError: if no ratio is selected
    '''

    def find_independent_columns(data, thold):
        '''
        Parameters
        data: df, independent columns of which will be found
        thold: float, distance threshold, under which columns will be considered equal

        Returns
        indCols: lst, independent column names
        '''

        labels = AgglomerativeClustering(n_clusters = None,
                                         distance_threshold = thold).fit_predict(data.values.T)

        labelMapping = {}
        for label, col in zip(labels, data.columns):
            labelMapping.setdefault(label, []).append(col)

        # keep only the columns that are alone in their cluster
        indCols = [cols[0] for cols in labelMapping.values() if len(cols) == 1]

        return indCols

    # None instead of a shared mutable [] default
    excluded = () if exNodes is None else exNodes

    # EMU ID = metabolite name followed by its atom numbers
    emuPattern = re.compile(r'^(\w+?)(\d+)$')

    lamAs = lambdify_matrix(symAs)
    lamBs = lambdify_matrix(symBs)

    selected = []
    for subMDVs in subMDVsAll.values():

        simMDVsAll = simulate_MDVs(EMUs, lamAs, lamBs, subMDVs, fluxDistrib, 2)
        # simulated MDVs take precedence over substrate MDVs on lookup
        knownMDVs = ChainMap(simMDVsAll, subMDVs)

        for EMU in EMUs:

            metab, atomNOs = emuPattern.match(EMU).groups()

            # nonzero cells of this EMU's column in the EAM of matching size
            eamColumn = EAMs[len(atomNOs)][EMU]
            inputInfo = eamColumn[eamColumn != 0]

            if metab in excluded or inputInfo.shape[0] <= 1:
                continue

            # one column per input; comma-separated precursors are convolved together
            inputMat = np.array([
                reduce(conv, [knownMDVs[preEMU] for preEMU in preEMUs.split(',')])
                for preEMUs in inputInfo.index
            ]).T
            inputMat = pd.DataFrame(inputMat, columns = inputInfo.index)

            rcond = np.finfo(np.float64).eps * max(inputMat.shape) * thold1
            DOF = null_space(inputMat.values, rcond = rcond).shape[1]

            if DOF == 0:
                selPreEMUs = inputInfo.index.tolist()
            else:
                # NOTE: find_independent_columns(inputMat, thold = thold2) is
                # disabled here, so such EMUs contribute no ratio.
                selPreEMUs = []

            if not selPreEMUs:
                continue

            selRatios = pd.DataFrame()
            selRatios['symbol'] = inputInfo[selPreEMUs] / inputInfo.sum()
            selRatios['args'] = selRatios['symbol'].apply(
                lambda r: list(map(str, r.free_symbols)))
            # literal replacement; do not rely on the version-dependent regex default
            selRatios['formula'] = (inputInfo[selPreEMUs].index.str.replace(',', '+', regex = False)
                                    + '_' + inputInfo.name)

            selected.append(selRatios)

    if not selected:
        raise ValueError('no ratio selected, simulation terminated.')

    # single concat instead of growing a frame inside the loops
    selRatiosAll = pd.concat(selected)
    selRatiosAll.drop_duplicates(subset = ['symbol'], inplace = True)
    selRatiosAll.index = ['r' + str(i) for i in range(1, selRatiosAll.shape[0] + 1)]

    save_data(selRatiosAll[['formula', 'symbol']], 'selected_ratios', outDir, True, True)

    return selRatiosAll[['args', 'symbol']]
def model_selector(ratio, Xtrain, Xtest, Ytrain, Ytest, methods, outDir, error=False, nfolds=5):
    '''
    Tune and evaluate every method for one ratio, saving each best model plus
    the MAE, R2 and predicted-vs-true results under '<outDir>/<ratio>'. With
    error=True an extra model is tuned per method on the squared test-set
    prediction errors and saved as '<method>_error'.

    Parameters
    ratio: str: ratio ID
    Xtrain: df, feature matrix for training
    Xtest: df, feature matrix for testing
    Ytrain: ser, target for training
    Ytest: ser, target for testing
    methods: lst, ML methods to test
    outDir: str, output directory
    error: bool, whether to train a error model
    nfolds: int, cross validation folds
    '''

    ratioDir = r'%s/%s' % (outDir, ratio)
    os.makedirs(ratioDir, exist_ok=True)

    print('\nratio ' + ratio)

    # per-method result holders
    resultCols = pd.MultiIndex.from_product([methods, ['predicted', 'true']])
    predRess = pd.DataFrame(columns=resultCols)
    MAEs = pd.Series(index=methods)
    R2s = MAEs.copy()

    for method in methods:

        print('\ntuning %s ...' % method)

        tuned = evaluate_model(Xtrain, Xtest, Ytrain, Ytest, method, nfolds=nfolds)
        bestModel, bestParams, predRes, MAE, R2 = tuned

        predRess.loc[:, idx[method, :]] = predRes.values
        MAEs[method] = MAE
        R2s[method] = R2

        display_best_params(bestParams)
        save_model(method, bestModel, ratioDir)

        if not error:
            continue

        # error model: same method, fitted to squared test-set residuals
        squaredErrors = (bestModel.predict(Xtest.values) - Ytest)**2
        tunedError = tune_model(Xtest, squaredErrors, method, nfolds=nfolds)
        save_model(method + '_error', tunedError[0], ratioDir)

    plot_MAE(ratio, MAEs, ratioDir)
    save_data(MAEs, 'MAE', ratioDir, True, False)

    plot_predicted_vs_true(ratio, predRess, R2s, ratioDir)
    save_data(predRess, 'predicted_vs_true', ratioDir, False, True)
    save_data(R2s, 'R2', ratioDir, True, False)