def clean_up_data(data, f, AA_DICT, settings, filters):
    """Filter and reorder peptide rows of *data* in place.

    Rows are dropped when they have too few nonzero peaks, when the
    theoretical and measured masses disagree beyond the configured
    tolerance, when the (mod-adjusted) sequence is empty, or when the
    sequence raises KeyError (unknown residue/mod). Surviving rows are
    rewritten into the column order:
    Protein ID, file, charge, RT, Peptide, protein name, mass, mod,
    isotopes, abundances.

    Parameters:
        data: list of rows (lists) to filter/reshape in place.
        f: file identifier stored as the first column of kept rows.
        AA_DICT: amino-acid composition table passed to IC.peptide.
        settings: dict; "Heavy Element" selects the labeled element.
        filters: dict of numeric cutoffs used for rejection.
    """
    filter_lt_target = 0
    filter_weight_agreement = 0
    filter_empty_data = 0
    # iterate backwards so deletions do not shift indices still to visit
    for r in reversed(range(len(data))):
        # must do first or the weight agreement will kill the peptide
        data[r][4] = analyze_mods(data[r][7], data[r][4])
        # this checks that there are enough peaks and that the theoretical
        # and measured m/z match. if not, something is wrong and the row
        # is deleted
        try:
            # build the peptide once and reuse it for MW and Composition
            pep = IC.peptide(data[r][4], settings["Heavy Element"], AA_DICT)
            theory_wt = pep.MW()
            if theory_wt > filters["mass cutoff"]:
                target = filters["Peaks included if over mass cutoff"]
            else:
                target = filters["Peaks included if under mass cutoff"]
            x = [float(y) for y in data[r][-2].split()]
            # returns values as zero if not present. remove issues.
            x = [a for a in x if a != 0]
            # evaluate each rejection condition exactly once and reuse
            too_few_peaks = len(x) < target
            bad_weight = (abs(theory_wt - float(data[r][6]))
                          > filters["Weight Agreement"])
            empty_seq = data[r][4] == ''
            # debugging counters
            if too_few_peaks:
                filter_lt_target += 1
            if bad_weight:
                print(data[r][1], data[r][4], "formula:", pep.Composition(),
                      "theory_wt:", theory_wt, "observed:", data[r][6])
                filter_weight_agreement += 1
            if empty_seq:
                filter_empty_data += 1
            if too_few_peaks or bad_weight or empty_seq:
                # deals with insufficient peaks, bad weight agreement,
                # and bad mods
                del data[r]
            else:
                # Protein ID, file, charge, RT, Peptide, protein name,
                # mass, mod, isotopes, abundances before this point
                data[r] = [f, data[r][0], data[r][5], data[r][4],
                           data[r][6], data[r][2], data[r][3],
                           data[r][-3], data[r][-2], data[r][-1]]
                # if all is good set the data as good and proceed
        except KeyError:
            # bad sequences are removed
            del data[r]
    print("clean_up_data() filters:", "filter_lt_target:", filter_lt_target,
          "filter_weight_agreement:", filter_weight_agreement,
          "filter_empty_data:", filter_empty_data)
def emass(inqueue, outlist, error_list, AA_DICT, settings, filters):
    """Worker: compute theoretical isotope envelopes for queued sequences.

    For each sequence pulled from *inqueue*, sweeps label enrichment over
    the configured x-range, collects normalized abundance and m/z patterns,
    runs them through the fitter, and appends the result to *outlist*.
    Sequences whose M0 drifts during the sweep go to *error_list*.

    NOTE(review): a second `emass` with a different signature appears later
    in this file and shadows this definition — confirm which is intended.
    """
    from queue import Empty  # multiprocessing queues raise queue.Empty

    while True:
        # get_nowait() + Empty fixes the empty()/get() race: with several
        # workers, empty() can report False and then get() block forever
        # after a sibling drains the queue.
        try:
            sequence = inqueue.get_nowait()
        except Empty:
            break
        label = settings["Heavy Element"]
        pep = IC.peptide(sequence, settings["Heavy Element"], AA_DICT)
        if pep.MW() > filters["mass cutoff"]:
            size = int(filters["Peaks included if over mass cutoff"])
        else:
            size = int(filters["Peaks included if under mass cutoff"])
        # this turns the potential labeling sites into Xs so the program
        # knows what they are
        fm = parser(pep.Composition(True))
        tmp = []
        seq_data = []
        mz_data = []
        working = True
        # this is used here and in the fitter. easier to give it its own
        # variable
        x_range = np.arange(0.0, settings["Maximum Theoretical Percent"],
                            settings["Step Size for Labeling"])
        for x_pct in x_range:  # increase x here
            # need to reassign since it is a tuple. consider changing to
            # dictionary or class in the future.
            master_isotope['X'] = [
                isotope(master_isotope[label][0].mass,
                        master_isotope[label][0].abundance - x_pct),
                isotope(master_isotope[label][1].mass,
                        master_isotope[label][1].abundance + x_pct)]
            # does the calculations and saves the data
            result = [isotope(0, 1)]
            result = calculate(tmp, result, fm, limit, charge)
            mz, pre_norm = print_pattern(result, digits)
            if x_pct == 0:
                true_m0 = mz[0]
            elif round(mz[0], 2) != round(true_m0, 2):
                working = False
                break
            # need to normalize the abundance data (it comes out normalized
            # with the highest peak as 1; this makes the sum of the relative
            # abundances in the number of relevant peaks 1)
            norm = normalize(pre_norm[:size])
            seq_data.append(norm)
            mz_data.append(mz[:size])
        if working:
            # save relevant data
            graph = [sequence, pep.Composition(False), pep.MW(), pep.get_n()]
            # run through fitter. Do not use elif here: it needs to check
            # both so it will do both if both are selected
            if settings["Use Abundance"]:
                graph = fitter(graph, seq_data, size, x_range, settings,
                               filters, True)
            if settings["Use neutromer spacing"]:
                graph = fitter(graph, mz_data, size, x_range, settings,
                               filters, False)
            outlist.append(graph)  # add the data to the multiprocessing list
        else:
            error_list.append([sequence, x_pct])
        inqueue.task_done()
def emass(inqueue, seq_dict, outlist, error_list, AA_DICT, settings, filters):
    """Worker: build baseline + labeled isotope envelopes for queued sequences.

    For each sequence pulled from *inqueue*, computes the unlabeled
    (natural-abundance) envelope once, then for every enrichment value in
    seq_dict[sequence] computes the labeled envelope and appends a row of
    [sequence, formula, MW, n, x_pct, <baseline...>, <labeled...>] to
    *outlist*. Enrichments whose M0 drifts from the baseline go to
    *error_list*.
    """
    from queue import Empty  # multiprocessing queues raise queue.Empty

    # loop-invariant settings/filters reads hoisted out of the work loop
    big_size = int(filters["Peaks included if over mass cutoff"])
    small_size = int(filters["Peaks included if under mass cutoff"])
    label = settings["Heavy Element"]
    while True:
        # get_nowait() + Empty fixes the empty()/get() race: with several
        # workers, empty() can report False and then get() block forever
        # after a sibling drains the queue.
        try:
            sequence = inqueue.get_nowait()
        except Empty:
            break
        pep = IC.peptide(sequence, settings["Heavy Element"], AA_DICT)
        size = big_size if pep.MW() > filters["mass cutoff"] else small_size
        # this turns the potential labeling sites into Xs so the program
        # knows what they are
        fm = parser(pep.Composition(True))
        # baseline: X carries the element's natural isotope abundances
        master_isotope['X'] = [
            isotope(master_isotope[label][0].mass,
                    master_isotope[label][0].abundance),
            isotope(master_isotope[label][1].mass,
                    master_isotope[label][1].abundance)]
        tmp = []
        result = [isotope(0, 1)]
        result = calculate(tmp, result, fm, limit, charge)
        mz, pre_norm = print_pattern(result, digits)
        true_m0 = mz[0]
        if settings["Use Abundance"]:
            # normalize so the sum over the relevant peaks is 1
            base_abund = normalize(pre_norm[:size])
            base_abund = sizing(base_abund, size, small_size, big_size)
        if settings["Use neutromer spacing"]:
            # spacings are offsets of each neutromer peak from M0
            base_mz = [mz[i] - mz[0] for i in range(1, size)]
            base_mz = sizing(base_mz, size, small_size, big_size)
        working = True
        for x_pct in seq_dict[sequence]:
            final = [sequence, pep.Composition(False), pep.MW(),
                     pep.get_n(), x_pct]
            if x_pct == 0:
                # substitute a small nonzero enrichment for the calculation;
                # the reported x_pct in `final` stays 0
                x_pct = filters["Zero Labeling Check"]
            # increase x here. need to reassign since it is a tuple;
            # consider changing to dictionary or class in the future.
            master_isotope['X'] = [
                isotope(master_isotope[label][0].mass,
                        master_isotope[label][0].abundance - x_pct),
                isotope(master_isotope[label][1].mass,
                        master_isotope[label][1].abundance + x_pct)]
            # does the calculations and saves the data
            result = [isotope(0, 1)]
            result = calculate(tmp, result, fm, limit, charge)
            mz, pre_norm = print_pattern(result, digits)
            if round(mz[0], 2) != round(true_m0, 2):
                # NOTE(review): `working` is never reset, so one mismatch
                # routes every later enrichment to error_list as well —
                # confirm this is intended (the other emass breaks instead).
                working = False
            if working:
                if settings["Use Abundance"]:
                    final.extend(base_abund)
                    # normalized so the sum of relative abundances over the
                    # relevant peaks is 1
                    abund = normalize(pre_norm[:size])
                    abund = sizing(abund, size, small_size, big_size)
                    final.extend(abund)
                if settings["Use neutromer spacing"]:
                    final.extend(base_mz)
                    spacing = [mz[i] - mz[0] for i in range(1, size)]
                    spacing = sizing(spacing, size, small_size, big_size)
                    final.extend(spacing)
                outlist.append(final)  # add to the multiprocessing list
            else:
                error_list.append([sequence, x_pct])
        inqueue.task_done()