def select_top_solutions(out_file, n):
    """Rank all solutions by their cost and store the best ``n`` percent.

    Every file in the current working directory whose name contains both
    '.diff' and ``out_file`` is read, and the average of its second column is
    taken as the cost of that solution. The ranking (file name and cost,
    lowest cost first) is written to 'diff.list'. The protection factors of
    the best ``n`` percent of the solutions are read from the matching
    '.pfact' files and written to 'all.sp', one solution per line.

    :param out_file: substring that the names of the files to rank must contain.
    :param n: percentage of the ranked solutions to store in 'all.sp'.
    """
    diff_files = []
    diff_value = []
    for name in os.listdir():
        if '.diff' in name and out_file in name:
            # pandas opens and closes the file on its own; no separate handle
            # is needed (the previous open() call was never closed).
            diff = pd.read_csv(name, header=None, sep=r'\s+')
            diff_files.append(name)
            diff_value.append(np.average(diff[1]))

    sorted_files = pd.DataFrame(zip(
        diff_files, diff_value)).sort_values(1).reset_index(drop=True)
    np.savetxt('diff.list', sorted_files.values, fmt='%s %5.10f')
    log.info("Cost function of all solutions stored in file diff.list")

    # Clamp at zero so that a negative percentage selects nothing instead of
    # being interpreted as a slice counted from the end.
    n_top = max(0, int(sorted_files.shape[0] * n / 100))
    solutions = [
        list(read_pfact(name.replace('.diff', '.pfact')))
        for name in sorted_files[0].iloc[:n_top]
    ]

    with open("all.sp", "w+") as f:
        for solution in solutions:
            if solution:
                fields = ("{: <12}".format(round(value, 5))
                          for value in solution)
                f.write(" ".join(fields) + "\n")

    # Report how many solutions were actually written, not the raw fraction.
    log.info("Top %s solutions stored in all.sp" % str(len(solutions)))
def loo_crossval(dexp, time_points, ass, lam, pH, temp, seq, res1, resn):
    """
    This function performs leave-one-out cross-validation at a fixed value
    of lambda.

    All protection factors are initialized to ln(P)=1 (except prolines, for
    which ln(P)=-1). A minimization is applied using a specific penalization
    term lambda. The cross validation error is evaluated on every train and
    test datasets generated by leaving out one time point at a time.

    :param dexp: experimental data, split by L1OUT_dataset into train/test.
    :param time_points: experimental time points; one is left out per round.
    :param ass: peptide assignments, forwarded to run and calculate_dpred.
    :param lam: penalization (harmonic) term used in the minimization.
    :param pH: pH forwarded to the intrinsic rate calculation.
    :param temp: temperature forwarded to the intrinsic rate calculation.
    :param seq: sequence forwarded to the intrinsic rate calculation.
    :param res1: first residue forwarded to the intrinsic rate calculation.
    :param resn: last residue forwarded to the intrinsic rate calculation.
    :return: tuple (train error, test error), each averaged over the
        left-out time points.
    """
    # The intrinsic rates are needed to predict the left-out data; compute
    # them from the same arguments that run() uses for the fit. They do not
    # depend on the left-out time point, so this is done once.
    kint, _ = calculate_kint_for_sequence(res1, resn, seq, temp, pH)

    cv_train = 0
    cv_test = 0
    for k in range(len(time_points)):
        out_file = "CVout.rm%s" % str(k)
        dexp_train, times_train, dexp_test, times_test = L1OUT_dataset(
            dexp, time_points, k)

        run(base_dir=os.getcwd(),
            dexp=dexp_train,
            assignments=ass,
            pfact=None,
            random_steps=None,
            time_points=times_train,
            harmonic_term=lam,
            output_file=out_file,
            tolerance=1e-10,
            weights=None,
            pH=pH,
            temperature=temp,
            seq=seq,
            res1=res1,
            resn=resn)

        pfact = read_pfact(out_file + '.pfact')
        dpred_test = calculate_dpred(pfact, times_test, kint, ass)
        cost_test = [
            1 / len(pred) * np.sum((pred - exp)**2)
            for pred, exp in zip(dpred_test, dexp_test)
        ]

        # ndmin=2 keeps the column indexing valid when the .diff file holds
        # a single row (np.loadtxt would otherwise return a 1-D array).
        train_diff = np.loadtxt(out_file + '.diff', ndmin=2)
        cv_train += train_diff[:, 1].sum()  # summed over peptides
        cv_test += sum(cost_test)  # sum over peptides

        # Remove the temporary files written by run() for this round.
        os.remove(out_file + ".Dpred")
        os.remove(out_file + ".diff")
        os.remove(out_file + ".pfact")

    return cv_train / len(time_points), cv_test / len(time_points)
def get_dcalc(assignments_file, kint_file, pfact_file, time_points_file,
              fragment_number):
    """Build the table of calculated values for one fragment.

    The four input files are read with read_assignments, read_kint,
    read_pfact and read_time_points. The fragment is selected with the
    1-based ``fragment_number``; entries 1 and 2 of its assignment row are
    used as the bounds of the slice taken from the kint and pfact arrays.

    :return: array with one row per time point; the first column holds the
        time point itself, the remaining columns hold
        ``1 - exp(-k * 60 * time / p)`` for every value in the slice.
    """
    fragments = read_assignments(assignments_file)
    rates = read_kint(kint_file, -1)
    factors = read_pfact(pfact_file)
    times = read_time_points(time_points_file)

    fragment = np.array([fragments[int(fragment_number) - 1]])[0]
    first, last = fragment[1], fragment[2]

    table = np.zeros((len(times), last - first + 1))

    # The slices are the same for every time point.
    k_frag = rates[first:last]
    p_frag = factors[first:last]

    for row, t in enumerate(times):
        uptake = 1.0 - np.exp(-k_frag * 60 * t / p_frag)
        # Prepend the time point so that each row is self-describing.
        table[row] = np.insert(uptake, 0, t)
    return table
def predict_isotopic_envelope(ass_file, seq_file, temperature, pH, lnp_file,
                              times_file, pep, charge_state, exchange,
                              out_file, pi0_file=''):
    """Predict the isotopic envelope of one peptide at every time point.

    For each time point ``i`` a file '<out_file>.<i>.isot' is written. It
    starts with a '# <sequence slice>' header, followed by one tab separated
    line per envelope entry: index, mass, intensity as percentage of the
    total, and intensity as percentage of the highest peak. The last line is
    written without a trailing newline.

    :param ass_file: assignments file, read with read_assignments.
    :param seq_file: sequence file, read with read_seq.
    :param temperature: temperature, converted to float.
    :param pH: pH, converted to float.
    :param lnp_file: protection factor file, read with read_pfact.
    :param times_file: time points file, read with read_time_points.
    :param pep: 1-based index of the peptide in the assignments.
    :param charge_state: charge state of the peptide.
    :param exchange: 'f' to use calculate_kint_for_sequence and a computed
        fully protonated envelope; 'b' to use calculate_kback_for_sequence
        and the starting envelope stored in ``pi0_file`` (presumably forward
        and back exchange, respectively).
    :param out_file: stub of the output file names.
    :param pi0_file: file with the starting envelope; read only when
        ``exchange`` is 'b'.
    :raises ValueError: if ``exchange`` is neither 'f' nor 'b'.
    """
    # Fail early with a clear message; an unknown mode would otherwise only
    # surface later as an unbound local variable.
    if exchange not in ('f', 'b'):
        raise ValueError(
            "exchange must be 'f' or 'b', got %r" % (exchange,))
    forward = exchange == 'f'

    seq = read_seq(seq_file)
    times = read_time_points(times_file)

    # Select residues involving the selected peptide
    ass = read_assignments(ass_file)
    peptide = ass[int(pep) - 1]
    start_res = peptide[1]
    end_res = peptide[2]

    # Upload kint and lnP values
    if forward:
        rates, _ = calculate_kint_for_sequence(1, len(seq), seq,
                                               float(temperature), float(pH))
    else:
        rates, _ = calculate_kback_for_sequence(1, len(seq), seq,
                                                float(temperature), float(pH))
    kint = rates[start_res:end_res]
    lnP = read_pfact(lnp_file)[start_res:end_res]

    # Calculate fully protonated isotopic envelope
    if forward:
        pi0 = fully_protonated_envelope(seq[start_res:end_res + 1],
                                        z=charge_state)
        mass = list(pi0.keys())
        fr0 = list(pi0.values())
        # NOTE(review): kint has already been restricted to the peptide
        # above, so slicing it again with the same residue numbers looks
        # unintended -- behaviour kept unchanged, please confirm.
        n_sites = len(kint[start_res:end_res + 1])
        # Pad the envelope with empty peaks.
        while len(mass) <= 2 * n_sites:
            mass.append(
                (mass[-1] + 1.00627 * int(charge_state)) / charge_state)
            fr0.append(0)
        print(mass, fr0)
        envelope_at = centered_isotopic_envelope
    else:
        pi0 = pd.read_csv(pi0_file, skiprows=1, header=None, sep=r'\s+')
        mass = list(pi0[1])
        u_fr0 = list(pi0[2])
        fr0 = centered_isotopic_envelope(0, kint, lnP, u_fr0)
        envelope_at = back_centered_isotopic_envelope

    # Calculate isotopic envelopes at different times
    for i, time in enumerate(times):
        raw = envelope_at(time, kint, lnP, fr0)
        total = sum(raw)  # computed once instead of once per peak
        f1 = [value / total * 100 for value in raw]
        peak = max(f1, default=0)  # default only matters for an empty envelope

        with open("%s.%s.isot" % (out_file, str(i)), 'w+') as f:
            f.write('# ' + seq[start_res:end_res] + '\n')
            f.write('\n'.join(
                '%d\t%5.5f\t%5.2f\t%5.2f' % (j, mass[j], value,
                                             value / peak * 100)
                for j, value in enumerate(f1)))
def run(base_dir, dexp, assignments, pfact, random_steps, time_points,
        harmonic_term, output_file, tolerance, weights, pH, temperature, seq,
        res1, resn):
    """
    Fit protection factors and write the results to the output files.

    The initial guess is read from ``pfact`` when given; otherwise it comes
    from a random search (when ``random_steps`` is set) or from a default
    array of 1 / -1 values built from the prolines and the covered residues.
    The fitted values, the predictions and the differences are written with
    write_pfact, write_dpred and write_diff, and the final cost function is
    printed with and without the harmonic term.

    :param base_dir: base directory for all input files (not used inside
        this function).
    :param dexp: experimental values, forwarded to the fit and the scoring.
    :param assignments: assignments of kints to dexp values; entries 1 and 2
        of each row are used as first and last residue.
    :param pfact: file containing pfactor values; if falsy, an initial guess
        is generated instead.
    :param random_steps: number of steps for random search; if falsy, the
        default initial guess is used.
    :param time_points: a list of experiment time points.
    :param harmonic_term: term to be used for harmonic cost scoring.
    :param output_file: stub for all output files.
    :param tolerance: tolerance value for minimisation convergence.
    :param weights: weights forwarded to the random search, the fit and the
        cost function.
    :param pH: pH forwarded to calculate_kint_for_sequence.
    :param temperature: temperature forwarded to calculate_kint_for_sequence.
    :param seq: sequence forwarded to calculate_kint_for_sequence.
    :param res1: first residue forwarded to calculate_kint_for_sequence.
    :param resn: last residue forwarded to calculate_kint_for_sequence.
    :return:
    """
    # Residues for which a protection factor is fitted: every residue of a
    # peptide except its first one, unless that first residue lies below
    # everything collected so far.
    pfactor_filter = set()
    for ass in assignments:
        pfactor_filter.update(range(int(ass[1] + 1), int(ass[2]) + 1))
        if ass[1] < min(pfactor_filter):
            pfactor_filter.add(ass[1])

    kint, prolines = calculate_kint_for_sequence(res1, resn, seq,
                                                 temperature, pH)

    if pfact:
        init_array = read_pfact(pfact)
    elif random_steps:
        rand_output = do_random_search(kint,
                                       random_steps,
                                       pfactor_filter,
                                       dexp,
                                       time_points,
                                       assignments,
                                       harmonic_term,
                                       prolines,
                                       weights,
                                       seed=None)
        # The random search results are keyed by score; start from the best.
        min_score = min(rand_output.keys())
        init_array = rand_output[min_score]
    else:
        init_array = [
            1 if ii not in prolines or ii == 0 or ii + 1 in pfactor_filter
            else -1
            for ii in range(max(pfactor_filter))
        ]

    # Non-negative entries are free to vary; -1 and any other negative value
    # are pinned by giving them identical lower and upper bounds.
    bounds = [(0.00001, 20) if x >= 0 else (-1, -1) if x == -1 else (0, 0)
              for x in init_array]

    pfit = fit_pfact(init_array, dexp, time_points, assignments,
                     harmonic_term, kint, bounds, tolerance, weights)

    write_pfact(pfit.x, output_file)

    dpred = calculate_dpred(pfit.x, time_points, kint, assignments)
    write_dpred(output_file, dpred, time_points)
    write_diff(output_file, dpred, dexp)

    final_score = cost_function(pfit.x, dexp, time_points, assignments,
                                harmonic_term, kint, weights)
    print('Final value of cost function w harm term: {}'.format(final_score))
    final_score = cost_function(pfit.x, dexp, time_points, assignments, 0.0,
                                kint, weights)
    print('Final value of cost function w/o harm term: {}'.format(final_score))
# Compulsory arguments if opts.base: config['base'] = opts.base else: config['base'] = os.getcwd() if opts.ass: config['assignments'] = opts.ass if opts.temp: config['temperature'] = float(opts.temp) if opts.pH: config['pH'] = float(opts.pH) if opts.pfact: config['pfact'] = read_pfact(opts.pfact) if opts.times: config['times'] = read_time_points(opts.times) if opts.seq: config['sequence'] = read_seq(opts.seq) config['res1'] = 1 config['resn'] = len(read_seq(opts.seq)) # Optional arguments if opts.out: config['output'] = opts.out else: config['output'] = None pfact = config['pfact'] assignments = read_assignments(config['assignments'])