def cost_function(params, *args):
    """
    Evaluate the objective that is minimised during pfactor fitting.

    The score is the RMS term returned by ``calculate_rms`` for the
    predicted versus experimental values, plus ``harmonic_score``.

    :param params: list of estimated pfactors
    :param args: six positional values, in this order: dexp, time points,
        assignments, the value handed to ``harmonic_score``, kint, and
        weights (``None`` for an unweighted RMS)
    :return: cost score (float)
    """
    observed, times, assignments, harmonic_k, kint, weights = args

    # The prediction does not depend on the weights, so compute it once.
    predicted = calculate_dpred(np.array(params), times, kint, assignments)
    n_assignments = len(assignments)

    if weights is None:
        rms = calculate_rms(predicted, observed, n_assignments)
    else:
        rms = calculate_rms(predicted, observed, n_assignments, weights)

    return float(rms + harmonic_score(params, harmonic_k))
def predict_dexp(pfact, time_points, assignments, kint=None):
    """
    Calculates predicted dexp from pfactors, time_points, assignments and
    kint values.

    :param pfact: array of pfactors.
    :param time_points: array of time points.
    :param assignments: array of assignment arrays.
    :param kint: array of kint values. Optional only to stay compatible
        with existing three-argument callers; pass it to get the
        ``calculate_dpred(pfact, time_points, kint, assignments)`` call
        shape used by the rest of this file.
    :return: whatever ``calculate_dpred`` returns (documented upstream as a
        numpy array of dexp values).
    """
    if kint is None:
        # Legacy call shape, kept unchanged for callers that do not supply
        # kint. NOTE(review): every other call site in this file passes
        # kint as the third argument -- confirm calculate_dpred really
        # accepts three arguments, otherwise callers must provide kint.
        return calculate_dpred(pfact, time_points, assignments)
    return calculate_dpred(pfact, time_points, kint, assignments)
def loo_crossval(dexp, time_points, ass, lam, pH, temp, seq, res1, resn):
    """
    Perform leave-one-out cross-validation at a fixed value of lambda.

    All protection factors are initialized to ln(P)=1 (except prolines, for
    which ln(P)=-1). A minimization is applied using a specific
    penalization term lambda. The cross-validation error is evaluated on
    every train and test dataset generated by leaving out one time point
    at a time.

    For each left-out index ``k`` the fit is written by ``run`` to files
    with the stub ``CVout.rm<k>``; these files are read back and then
    deleted.

    :param dexp: experimental values, split by ``L1OUT_dataset``.
    :param time_points: experiment time points; one fold per time point.
    :param ass: assignments, forwarded to ``run`` and ``calculate_dpred``.
    :param lam: penalization term, forwarded to ``run`` as
        ``harmonic_term``.
    :param pH: pH forwarded to the kint calculation and to ``run``.
    :param temp: temperature forwarded to the kint calculation and to
        ``run``.
    :param seq: sequence forwarded to the kint calculation and to ``run``.
    :param res1: first residue index forwarded with ``seq``.
    :param resn: last residue index forwarded with ``seq``.
    :return: tuple ``(train_error, test_error)``, each averaged over the
        number of time points.
    """
    # kint is required to predict the held-out time point. Derive it from
    # the same inputs run() uses so train and test predictions agree.
    kint, _ = calculate_kint_for_sequence(res1, resn, seq, temp, pH)

    cv_train = 0
    cv_test = 0
    for k in range(len(time_points)):
        out_file = "CVout.rm%s" % k
        dexp_train, times_train, dexp_test, times_test = L1OUT_dataset(
            dexp, time_points, k)

        run(base_dir=os.getcwd(), dexp=dexp_train, assignments=ass,
            pfact=None, random_steps=None, time_points=times_train,
            harmonic_term=lam, output_file=out_file, tolerance=1e-10,
            weights=None, pH=pH, temperature=temp, seq=seq, res1=res1,
            resn=resn)

        pfact = read_pfact(out_file + '.pfact')
        dpred_test = calculate_dpred(pfact, times_test, kint, ass)

        # Mean squared deviation per peptide on the held-out time point.
        cost_test = [
            1 / len(pred) * np.sum((pred - exp)**2)
            for pred, exp in zip(dpred_test, dexp_test)
        ]

        # ndmin=2 keeps a single-row .diff file two-dimensional so the
        # column sum below can still be indexed.
        diff = np.loadtxt(out_file + '.diff', ndmin=2)
        cv_train += sum(diff)[1]  # second column, summed over peptides
        cv_test += sum(cost_test)  # summed over peptides

        # Remove the per-fold output files written by run().
        for suffix in (".Dpred", ".diff", ".pfact"):
            os.remove(out_file + suffix)

    return cv_train / len(time_points), cv_test / len(time_points)
def run(base_dir, dexp, assignments, pfact, random_steps, time_points,
        harmonic_term, output_file, tolerance, weights, pH, temperature,
        seq, res1, resn):
    """
    Fit pfactors to the experimental data and write the results to disk.

    The initial guess is taken from ``pfact`` when given, otherwise from a
    random search (when ``random_steps`` is truthy), otherwise from a
    default array of 1 / -1 values. The fit result is written with
    ``write_pfact``, ``write_dpred`` and ``write_diff``, and the final cost
    is printed with and without the harmonic term.

    :param base_dir: base directory for all input files (not used by this
        function's body).
    :param dexp: experimental dexp values, forwarded to the search, the fit
        and ``write_diff``.
    :param assignments: iterable of assignment records; elements ``[1]``
        and ``[2]`` of each record are read as the first and last residue.
    :param pfact: file containing pfactor values, or a falsy value to
        generate the initial guess here.
    :param random_steps: number of steps for random search; falsy to skip
        the search.
    :param time_points: a list of experiment time points.
    :param harmonic_term: term to be used for harmonic cost scoring.
    :param output_file: stub for all output files.
    :param tolerance: tolerance value for minimisation convergence.
    :param weights: weights forwarded to the search, the fit and the cost
        function; may be ``None``.
    :param pH: pH forwarded to ``calculate_kint_for_sequence``.
    :param temperature: temperature forwarded to
        ``calculate_kint_for_sequence``.
    :param seq: sequence forwarded to ``calculate_kint_for_sequence``.
    :param res1: first residue index forwarded with ``seq``.
    :param resn: last residue index forwarded with ``seq``.
    :return: None
    """
    # Residues covered by at least one assignment. The first residue of a
    # record is skipped unless it lies below everything collected so far.
    pfactor_filter = set()
    for ass in assignments:
        pfactor_filter.update(range(int(ass[1] + 1), int(ass[2]) + 1))
        if ass[1] < min(pfactor_filter):
            pfactor_filter.add(ass[1])

    kint, prolines = calculate_kint_for_sequence(res1, resn, seq,
                                                 temperature, pH)

    if pfact:
        init_array = read_pfact(pfact)
    elif random_steps:
        rand_output = do_random_search(kint, random_steps, pfactor_filter,
                                       dexp, time_points, assignments,
                                       harmonic_term, prolines, weights,
                                       seed=None)
        # The search result is keyed by score; start from the lowest one.
        init_array = rand_output[min(rand_output)]
    else:
        init_array = [
            1 if ii not in prolines or ii == 0 or ii + 1 in pfactor_filter
            else -1
            for ii in range(max(pfactor_filter))
        ]

    # Non-negative entries are free within (1e-5, 20); an entry of exactly
    # -1 is pinned at -1; any other negative entry is pinned at 0.
    bounds = [
        (0.00001, 20) if x >= 0 else (-1, -1) if x == -1 else (0, 0)
        for x in init_array
    ]

    pfit = fit_pfact(init_array, dexp, time_points, assignments,
                     harmonic_term, kint, bounds, tolerance, weights)

    write_pfact(pfit.x, output_file)
    dpred = calculate_dpred(pfit.x, time_points, kint, assignments)
    write_dpred(output_file, dpred, time_points)
    write_diff(output_file, dpred, dexp)

    final_score = cost_function(pfit.x, dexp, time_points, assignments,
                                harmonic_term, kint, weights)
    print('Final value of cost function w harm term: {}'.format(final_score))

    final_score = cost_function(pfit.x, dexp, time_points, assignments,
                                0.0, kint, weights)
    print('Final value of cost function w/o harm term: {}'.format(
        final_score))
config['pfact'] = read_pfact(opts.pfact) if opts.times: config['times'] = read_time_points(opts.times) if opts.seq: config['sequence'] = read_seq(opts.seq) config['res1'] = 1 config['resn'] = len(read_seq(opts.seq)) # Optional arguments if opts.out: config['output'] = opts.out else: config['output'] = None pfact = config['pfact'] assignments = read_assignments(config['assignments']) assignment_set = set() for ass in assignments: for x in range(int(ass[1]), int(ass[2]) + 1): assignment_set.add(x) kint, prolines = calculate_kint_for_sequence(config['res1'], config['resn'], config['sequence'], config['temperature'], config['pH']) dpred = calculate_dpred(pfact, config['times'], kint, assignments) write_dpred(config['output'], dpred, config['times'])