def predict_LME(model_path, outdir, X, y, dv, partition_name, model_name=''):
    """Load a pickled LME model, evaluate it, and write the evaluation files.

    The model is unpickled from ``model_path``. A text summary is always
    written to ``<outdir>/<prefix>eval_<partition_name>.txt`` and echoed via
    ``stderr``. When ``args.mode`` is ``None`` or ``'response'``, predictions
    for ``X`` and per-item squared errors against ``y[dv]`` are also written to
    ``<prefix>preds_<partition_name>.txt`` and
    ``<prefix>losses_mse_<partition_name>.txt``, and MSE/MAE are appended to
    the summary. ``<prefix>`` is empty when ``model_name`` is ``''`` and
    ``model_name + '_'`` otherwise.

    NOTE(review): ``m``, ``formula`` and ``args`` are neither parameters nor
    assigned here; they are read from an enclosing (presumably module) scope
    and must be defined before this function is called.
    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source. The closing rule and the eval-file write are
    assumed to run regardless of ``args.mode`` -- confirm.

    Args:
        model_path: Path of the pickled model object.
        outdir: Directory that receives the output files.
        X: Input passed to ``lme.predict``.
        y: Container indexed by ``dv`` to obtain the observed response.
        dv: Key of the dependent variable in ``y``.
        partition_name: Partition label used in file names and the summary.
        model_name: Optional file-name prefix.
    """

    def _out_path(template):
        # Evaluated lazily, at the point where each file is opened.
        prefix = '' if model_name == '' else model_name + '_'
        return outdir + template % (prefix, partition_name)

    stderr('Retrieving saved model %s...\n' % m)
    with open(model_path, 'rb') as m_file:
        lme = pickle.load(m_file)

    summary = ''.join([
        '=' * 50 + '\n',
        'LME regression\n\n',
        'Model name: %s\n\n' % m,
        'Formula:\n',
        ' ' + formula + '\n',
        str(lme.summary()) + '\n',
    ])

    if args.mode in [None, 'response']:
        lme_preds = lme.predict(X)
        with open(_out_path('/%spreds_%s.txt'), 'w') as p_file:
            p_file.writelines(
                str(lme_preds[k]) + '\n' for k in range(len(lme_preds))
            )

        # Per-item squared error.
        losses = np.array(y[dv] - lme_preds) ** 2
        with open(_out_path('/%slosses_mse_%s.txt'), 'w') as p_file:
            p_file.writelines(
                str(losses[k]) + '\n' for k in range(len(losses))
            )

        lme_mse = mse(y[dv], lme_preds)
        lme_mae = mae(y[dv], lme_preds)
        summary += ''.join([
            'Loss (%s set):\n' % partition_name,
            ' MSE: %.4f\n' % lme_mse,
            ' MAE: %.4f\n' % lme_mae,
        ])

    summary += '=' * 50 + '\n'
    with open(_out_path('/%seval_%s.txt'), 'w') as f_out:
        f_out.write(summary)
    stderr(summary)
else: basenames_to_pool = sorted( list( set(basenames_to_pool).intersection( set(basenames_to_pool_cur)))) exps_outdirs.append(p.outdir) else: comparison_sets = {None: cdr_models} if not args.pool: for s in comparison_sets: model_set = comparison_sets[s] if len(model_set) > 1: if s is not None: stderr( 'Comparing models within ablation set "%s"...\n' % s) for i in range(len(model_set)): m1 = model_set[i] p.set_model(m1) for j in range(i + 1, len(model_set)): m2 = model_set[j] is_nested = nested(m1, m2) if is_nested or not args.ablation: if is_nested: if m1.count('!') > m2.count('!'): a_model = m1 b_model = m2 else: a_model = m2
# Two-step analysis setup: keep only CDR/DTSR models, then for each model
# locate its convolved-input table and, if the file exists, load it together
# with the model's lmer-style formula string.
# NOTE(review): nesting reconstructed from a whitespace-collapsed source.
models = [name for name in models if name.startswith(('CDR', 'DTSR'))]

partitions = get_partition_list(args.partition)
partition_str = '-'.join(partitions)
conv_file = '/X_conv_' + partition_str + '.csv'

for m in models:
    dir_path = p.outdir + '/' + m
    # With args.ablated_models the table is read from the model's own
    # directory; otherwise from the directory named by the part of the model
    # name before the first '!'.
    conv_dir = (
        dir_path if args.ablated_models
        else p.outdir + '/' + m.split('!')[0]
    )
    data_path = conv_dir + conv_file

    stderr('Two-step analysis using data file %s\n' % data_path)

    if os.path.exists(data_path):
        p.set_model(m)
        f = Formula(p['formula'])
        model_form = f.to_lmer_formula_string(
            z=args.zscore,
            correlated=not args.uncorrelated
        ).replace('-', '_')
        # A '|' in the formula string marks the model as LME.
        is_lme = '|' in model_form

        df = pd.read_csv(data_path, sep=' ', skipinitialspace=True)
        # Cast object-typed columns to str.
        for c in df.columns:
            if df[c].dtype.name == 'object':
                df[c] = df[c].astype(str)
plot_y_inches=p['plot_y_inches'] if y_inches is None else y_inches, ylim=args.ylim, cmap=p['cmap'] if cmap is None else cmap, dpi=args.dpi, legend=legend, xlab=args.xlab, ylab=args.ylab, use_line_markers=args.markers, transparent_background=args.transparent_background, dump_source=args.dump_source) for m in models: p.set_model(m) stderr('Retrieving saved model %s...\n' % m) cdr_model = load_cdr(p.outdir + '/' + m) kwargs = { 'plot_n_time_units': p['plot_n_time_units'] if n_time_units is None else n_time_units, 'plot_n_time_points': p['plot_n_time_points'] if resolution is None else resolution, 'plot_x_inches': p['plot_x_inches'] if x_inches is None else x_inches, 'plot_y_inches': p['plot_y_inches'] if y_inches is None else y_inches, 'cmap': p['cmap'] if cmap is None else cmap, 'dpi':
# NOTE(review): nesting reconstructed from a whitespace-collapsed source; this
# append presumably closes a preceding loop whose header is not visible here.
evaluation_set_paths.append((X_paths, y_paths))

# For every evaluation set and every CDR model: drop invalid responses for the
# model's dependent variable, load the saved model, and convolve the inputs.
for d, evaluation_set in enumerate(evaluation_sets):
    (
        X,
        y,
        select,
        X_response_aligned_predictor_names,
        X_response_aligned_predictors,
        X_2d_predictor_names,
        X_2d_predictors,
    ) = evaluation_set
    partition_str = evaluation_set_names[d]

    for m in cdr_models:
        formula = p.models[m]['formula']
        # The dependent variable is whatever precedes the first '~'.
        dv = formula.strip().partition('~')[0].strip()

        y_valid, select_y_valid = filter_invalid_responses(y, dv)
        # Apply the same selection to the response-aligned predictors, if any.
        X_response_aligned_predictors_valid = (
            None if X_response_aligned_predictors is None
            else X_response_aligned_predictors[select_y_valid]
        )

        stderr('Retrieving saved model %s...\n' % m)
        cdr_model = load_cdr(p.outdir + '/' + m)

        X_conv, X_conv_summary = cdr_model.convolve_inputs(
            X,
            y_valid,
            X_response_aligned_predictor_names=X_response_aligned_predictor_names,
            X_response_aligned_predictors=X_response_aligned_predictors_valid,
            X_2d_predictor_names=X_2d_predictor_names,
            X_2d_predictors=X_2d_predictors,
            scaled=not args.unscaled,
            n_samples=args.nsamples,
            algorithm=args.algorithm,
            standardize_response=args.standardize_response
        )
if not p.use_gpu_if_available: os.environ['CUDA_VISIBLE_DEVICES'] = '-1' models = filter_models(p.model_list, args.models) run_baseline = False run_cdr = False for m in models: if not run_baseline and m.startswith('LM') or m.startswith('GAM'): run_baseline = True elif not run_cdr and (m.startswith('CDR') or m.startswith('DTSR')): run_cdr = True if not (run_baseline or run_cdr): stderr('No models to run. Exiting...\n') exit() cdr_formula_list = [ Formula(p.models[m]['formula']) for m in models if (m.startswith('CDR') or m.startswith('DTSR')) ] cdr_formula_name_list = [ m for m in p.model_list if (m.startswith('CDR') or m.startswith('DTSR')) ] all_rangf = [v for x in cdr_formula_list for v in x.rangf] partitions = get_partition_list(args.partition) X_paths, y_paths = paths_from_partition_cliarg(partitions, p) X, y = read_data(X_paths, y_paths,
# Evaluate a saved GAM baseline: rewrite the formula, load the pickled model,
# write predictions and per-item squared errors, and compute MSE/MAE.
# NOTE(review): nesting reconstructed from a whitespace-collapsed source.
dv = formula.strip().split('~')[0].strip()

## For some reason, GAM can't predict using custom functions, so we have to translate them
# Raw strings keep the regex escape `\(` from being read as an (invalid)
# string-literal escape; the compiled patterns are unchanged.
# NOTE(review): the '.' after z/c is unescaped and matches any character, not
# only a literal dot -- confirm whether that is intended.
z_term = re.compile(r'z.\((.*)\)')
c_term = re.compile(r'c.\((.*)\)')
# Rewrite each whitespace-separated token: z.(x) -> scale(x) and
# c.(x) -> scale(x, scale=FALSE). str.split() with no argument already yields
# stripped, non-empty tokens.
formula = ' '.join(
    c_term.sub(r'scale(\1, scale=FALSE)', z_term.sub(r'scale(\1)', t))
    for t in formula.split()
)

stderr('Retrieving saved model %s...\n' % m)
# NOTE: pickle.load can execute arbitrary code; only load trusted model files.
with open(p.outdir + '/' + m + '/m.obj', 'rb') as m_file:
    gam = pickle.load(m_file)

gam_preds = gam.predict(X_baseline)
with open(p.outdir + '/' + m + '/preds_%s.txt' % partition_str, 'w') as p_file:
    for i in range(len(gam_preds)):
        p_file.write(str(gam_preds[i]) + '\n')

# Per-item squared error against the observed response.
losses = np.array(y[dv] - gam_preds)**2
with open(p.outdir + '/' + m + '/losses_mse_%s.txt' % partition_str, 'w') as p_file:
    for i in range(len(losses)):
        p_file.write(str(losses[i]) + '\n')

gam_mse = mse(y[dv], gam_preds)
gam_mae = mae(y[dv], gam_preds)
# Compare each fitted CDR model's IRFs against the gold IRFs of the synthetic
# data object stored two directory levels above p.X_train.
# NOTE(review): nesting reconstructed from a whitespace-collapsed source.
models = filter_models(p.model_list, args.models, cdr_only=True)

synth_path = os.path.dirname(os.path.dirname(p.X_train)) + '/d.obj'
if not os.path.exists(synth_path):
    # The message has a %s placeholder; interpolate the path so the error
    # names the missing file instead of printing a literal "%s".
    raise ValueError('Path to synth data %s does not exist. Check to make sure that model is fitted to synthetic data and that paths are correct in the config file.' % synth_path)

# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(synth_path, 'rb') as f:
    d = pickle.load(f)


def gold_irf_lambda(x):
    """Return the synthetic data object's IRF evaluated at ``x`` (``coefs=True``)."""
    return d.irf(x, coefs=True)


for m in models:
    p.set_model(m)
    formula = p.models[m]['formula']

    stderr('Retrieving saved model %s...\n' % m)
    cdr_model = load_cdr(p.outdir + '/' + m)

    stderr('Computing RMSD...\n')
    rmsd = cdr_model.irf_rmsd(
        gold_irf_lambda,
        summed=args.summed,
        n_time_units=args.ntimeunits,
        n_time_points=args.resolution,
        algorithm=args.algorithm
    )

    summary = '=' * 50 + '\n'
    summary += 'CDR regression\n\n'
    summary += 'Model name: %s\n\n' % m
type=str, default=None, help='String to prepend to output file.') args, unknown = argparser.parse_known_args() for path in args.paths: p = Config(path) if not p.use_gpu_if_available: os.environ['CUDA_VISIBLE_DEVICES'] = '-1' models = filter_models(p.model_list, args.models, cdr_only=True) for m in models: stderr('Retrieving saved model %s...\n' % m) cdr_model = load_cdr(p.outdir + '/' + m) summary = cdr_model.summary(random=args.random, level=args.level, n_samples=args.nsamples, integral_n_time_units=args.timeunits) if args.prefix: outname = p.outdir + '/' + m + '/' + args.prefix + '_summary.txt' else: outname = p.outdir + '/' + m + '/summary.txt' stderr('Saving summary to %s' % outname) with open(outname, 'w') as f: f.write(summary)
model_cur = np.concatenate(model_cur, axis=0) baseline_cur = np.concatenate(baseline_cur, axis=0) else: model_cur = np.array(model_errors[0]) baseline_cur = np.array(baseline_errors[0]) select = np.logical_and(np.isfinite(np.array(model_cur)), np.isfinite(np.array(baseline_cur))) diff = float(len(model_cur) - select.sum()) p_value, base_diff, diffs = permutation_test(baseline_cur[select], model_cur[select], n_iter=10000, n_tails=args.tails, mode=args.metric, nested=True) stderr('\n') out_path = args.outdir + '/%s_PT.txt' % args.name with open(out_path, 'w') as f: stderr('Saving output to %s...\n' % out_path) summary = '=' * 50 + '\n' summary += 'Model comparison:\n' summary += ' %s\n' % ';'.join(args.baseline_error_paths) summary += ' vs\n' summary += ' %s\n' % ';'.join(args.model_error_paths) if diff > 0: summary += '%d NaN rows filtered out (out of %d)\n' % ( diff, len(model_cur)) summary += 'Metric: %s\n' % args.metric summary += 'Difference: %.4f\n' % base_diff summary += 'p: %.4e%s\n' % (