def __init__(self, enrichment_path, out_path, settings_path):
    settings.load(settings_path)
    self.settings_path = settings_path
    self.enrichment_path = Path(enrichment_path)

    aa_label_df = pd.read_csv(settings.aa_label_path, sep='\t')
    aa_label_df.set_index('study_type', inplace=True)
    self.aa_labeling_dict = aa_label_df.loc[settings.study_type, ].to_dict()

    if self.enrichment_path.suffix == '.tsv':
        self._enrichment_df = pd.read_csv(
            filepath_or_buffer=str(self.enrichment_path), sep='\t')
    elif self.enrichment_path.suffix == '.csv':
        self._enrichment_df = pd.read_csv(
            filepath_or_buffer=str(self.enrichment_path), sep=',')

    if settings.recognize_available_cores is True:
        self._n_processors = mp.cpu_count()
    else:
        self._n_processors = settings.n_processors
    #$ Windows/Python multiprocessing breaks if too many cores are used.
    #$ very niche case, but still relevant
    if self._n_processors > 60:
        self._n_processors = 60
    self._mp_pool = mp.Pool(self._n_processors)

    self.out_path = out_path
    self.model = None
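# The core-capping pattern above repeats in several __init__ methods below. A
# minimal sketch of a helper (hypothetical name, not part of DeuteRater) that
# captures it: on Windows, multiprocessing pools with more than roughly 60
# workers can fail due to the OS handle limit, so the count is clamped.
import multiprocessing as mp


def _pool_size(recognize_available_cores, n_processors, cap=60):
    """Pick a worker count, honoring the settings and the Windows handle limit."""
    n = mp.cpu_count() if recognize_available_cores else n_processors
    return min(n, cap)


# usage sketch:
# self._n_processors = _pool_size(settings.recognize_available_cores,
#                                 settings.n_processors)
# self._mp_pool = mp.Pool(self._n_processors)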
def __init__(self, model_path, out_path, settings_path):
    settings.load(settings_path)
    self.settings_path = settings_path
    itertuple_renamer = copy(protein_itertuple_renamer)
    self.error = ""

    #$ get the number of cores we're using for multiprocessing
    if settings.recognize_available_cores is True:
        self._n_processors = mp.cpu_count()
    else:
        self._n_processors = settings.n_processors
    #$ Windows/Python multiprocessing breaks if too many cores are used.
    #$ very niche case, but still relevant
    if self._n_processors > 60:
        self._n_processors = 60
    self._mp_pool = mp.Pool(self._n_processors)

    if model_path[-4:] == ".tsv":
        self.model = pd.read_csv(model_path, sep='\t')
    elif model_path[-4:] == ".csv":
        self.model = pd.read_csv(model_path)
    else:
        #$ should never trigger unless we are fiddling with the gui
        raise ValueError("invalid file extension")
    self.model.rename(columns=itertuple_renamer, inplace=True)
    self.out_path = out_path
def _mp_prepare(settings_path, args, aa_labeling_dict=None):
    settings.load(settings_path)
    # file_path, time, enrichment = args
    file_path, time, enrichment, sample_group, biological_replicate = args
    df = pd.read_csv(filepath_or_buffer=file_path, sep='\t')
    if "mzs_list" in df.columns:
        df.drop(inplace=True, columns=[
            "mzs_list", "intensities_list", "rt_list", "baseline_list"
        ])
    df = TheoryPreparer._apply_filters(df)
    if aa_labeling_dict:
        #$ don't include an else for either if statement: no need to calculate
        #$ if the column exists, and we don't want to add the column if we
        #$ can't calculate it, since checking for it is an error check for
        #$ later steps
        if literature_n_name not in df.columns:
            if aa_labeling_dict != "":
                df = df.apply(TheoryPreparer._calculate_literature_n,
                              axis=1, args=(aa_labeling_dict, ))
    df['time'] = time
    df['enrichment'] = enrichment
    df["sample_group"] = sample_group
    df["bio_rep"] = biological_replicate
    return df
def __init__(self, settings_path, id_path, mzml_path, out_path):
    '''
    Parameters
    ----------
    id_path : str
        The name of the file containing the identifications. This data will
        likely have been taken from an unlabeled run.
    mzml_path : str
        The name of the file containing mass spectrometry data in the mzML
        format, labeled or not.
    out_path : str
        The name of the file the extracted data will be written to.
    settings_path : str
        The name of the file containing the settings for this instance of the
        extractor, which may contain the settings for the rest of Deuterater
        as well. This file *should* be in ``.yaml`` format. For additional
        information, see the settings file's documentation.
    '''
    self.settings_path = settings_path
    settings.load(self.settings_path)

    self.id_path = id_path
    self.mzml_path = mzml_path
    self.out_path = out_path

    self.ids = {}
    self._id_chunks = ()
    self._mzml_native_id_bounds = []
    self.model = pd.DataFrame()

    try:
        if settings.recognize_available_cores is True:
            self._n_processors = mp.cpu_count()
        else:
            self._n_processors = settings.n_processors
        #$ Windows/Python multiprocessing breaks if too many cores are used.
        #$ very niche case, but still relevant
        if self._n_processors > 60:
            self._n_processors = 60
        self._chunk_size = settings.chunk_size
        self._chunking_threshold = mul(settings.chunking_method_threshold,
                                       settings.chunk_size)
        self._id_rt_unit = settings.id_file_rt_unit
        self._trim_ids = settings.trim_ids_to_mzml_bounds
        self._rt_window = settings.time_window
        self._mp_pool = mp.Pool(self._n_processors)

        if not os.path.exists(self.out_path):
            open(self.out_path, 'w').close()
        # TODO: this needs to be fixed; it doesn't catch files that are
        #       already open in another program
        if not os.access(self.out_path, os.W_OK):
            raise PermissionError('Output path not writeable.')
    except Exception as e:
        print(e)
        traceback.print_tb(e.__traceback__)
        raise
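# A minimal sketch (not DeuteRater's actual check, helper name hypothetical)
# of one way to address the TODO above: attempting to open the file for
# appending surfaces files that are locked by another program (e.g. a
# spreadsheet holding the output open on Windows), which os.access() does not.
def _is_writeable(path):
    """Return True if `path` can be opened for writing right now."""
    try:
        with open(path, 'a'):
            pass
        return True
    except (PermissionError, OSError):
        return False


# usage sketch:
# if not _is_writeable(self.out_path):
#     raise PermissionError('Output path not writeable.')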
def __init__(self, model_path, out_path, graph_folder, settings_path):
    settings.load(settings_path)
    if model_path[-4:] == ".tsv":
        self.model = pd.read_csv(model_path, sep='\t')
    elif model_path[-4:] == ".csv":
        self.model = pd.read_csv(model_path)
    else:
        #$ should never trigger unless we are fiddling with the gui
        raise ValueError("invalid file extension")
    self.out_path = out_path
    self.rate_model = None
    self.datapoint_model = None  # CQ
    self.graph_folder = graph_folder
    self.settings_path = settings_path

    #$ get the number of cores we're using for multiprocessing
    if settings.recognize_available_cores is True:
        self._n_processors = mp.cpu_count()
    else:
        self._n_processors = settings.n_processors
    #$ Windows/Python multiprocessing breaks if too many cores are used.
    #$ very niche case, but still relevant
    if self._n_processors > 60:
        self._n_processors = 60
    self._mp_pool = mp.Pool(self._n_processors)
def extract(settings_path, mzml_path, index_to_ID, chunk):
    '''Extract data from the mzml according to the identification information
    '''
    # A rough outline of how the logic flows:
    # EXTRACT:
    #     for each scan in the reader
    #         in ms level 1
    #             for each id in the window
    #                 isotope extraction specific logic

    # Turn the warnings off so that they don't mess up the tqdm progress bar
    warnings.filterwarnings("ignore")

    # Load the settings into the settings module. This needs to be done in
    # each process due to how python handles multiprocessing
    settings.load(settings_path)

    # Open a file pointer to the mzml file
    mzml_fp = pymzml.run.Reader(path_or_file=mzml_path,
                                build_index_from_scratch=True)
    mzml_bounds = dml.get_bounds(mzml_fp, index_to_ID)

    # Check for an empty id file chunk
    if len(chunk) <= 0:
        warnings.warn(
            EmptyIdChunkWarning('There are no identifications in this chunk'))

    # Set the high and low retention time bounds, based on the chunk of the
    # identification file
    lo_rt_bound = chunk.at[0, 'rt'] - settings.time_window
    if lo_rt_bound < 0:
        lo_rt_bound = 0
    hi_rt_bound = chunk.at[len(chunk) - 1, 'rt'] + settings.time_window

    # Search for the scans at the high and low retention time bounds
    lo_spec_idx = dml.retention_time_search(mzml_fp, index_to_ID, lo_rt_bound)
    hi_spec_idx = dml.retention_time_search(mzml_fp, index_to_ID, hi_rt_bound)
    ## if mzml_fp[index_to_ID[hi_spec_idx]] > hi_rt_bound:
    ##     hi_spec_idx = hi_spec_idx - 1

    # Logic block for handling out-of-bounds indices
    if lo_spec_idx != -1 and hi_spec_idx != -1:
        # Do nothing if both indices are in bounds
        pass
    elif lo_spec_idx == -1 and hi_spec_idx != -1:
        # If just the higher index is found, assign the lowest index in the
        # mzml to 'lo_spec_idx'
        lo_spec_idx = mzml_bounds['idx_min']
    elif lo_spec_idx != -1 and hi_spec_idx == -1:
        # If just the lower index is found, assign the highest index in the
        # mzml to 'hi_spec_idx'
        hi_spec_idx = mzml_bounds['idx_max']
    elif lo_rt_bound < mzml_bounds['rt_min'] < \
            mzml_bounds['rt_max'] < hi_rt_bound:
        # If neither index is found but the time span covered by the chunk of
        # the ID file encompasses that of the mzml, assign 'lo_spec_idx' and
        # 'hi_spec_idx' the minimum and maximum index values given by the
        # mzml file
        lo_spec_idx = mzml_bounds['idx_min']
        hi_spec_idx = mzml_bounds['idx_max']
    else:
        # Otherwise, there is no intersection between the ID file and the mzml
        # in terms of retention time and no analysis can be made
        return -1

    ids = []  # initialize the list of identifications

    # TODO: redefine this column as ionmass?
    chunk['mass'] = chunk['mz'] * chunk['z']

    # Instantiate all of the identifications in the chunk
    for row in chunk.itertuples(index=True):
        ids.append(
            ID(
                rt=row.rt,
                mz=row.mz,
                mass=row.mass,
                z=row.z,
                n_isos=row.n_isos,
                # cf=row.cf
            ))

    # Iterate through all of the relevant spectra in the mzml
    for spectrum_index in dmt.inclusive_range(lo_spec_idx, hi_spec_idx):
        # apply the index_to_ID map in order to access the correct spectrum
        native_id = index_to_ID[spectrum_index]
        try:
            # try to access this spectrum
            spectrum = mzml_fp[native_id]
            spec_rt = spectrum.scan_time_in_minutes()
            spec_mzs = spectrum.mz
            spec_abs = spectrum.i
        except Exception:  # TODO: use a more specific Exception
            # catch the exception and move on if the spectrum is not found
            continue

        # only deal with the desired ms_level
        if spectrum.ms_level != settings.ms_level:
            continue

        # determine id indices of peak searches;
        # adding and subtracting the floating point error tolerance allows us
        # to include the extremes of the range
        local_window_min = \
            spec_rt - (settings.time_window)  # + settings.fpe_tolerance)
        local_window_max = \
            spec_rt + (settings.time_window)  # + settings.fpe_tolerance)
        try:
            lo_slice_index = \
                min(chunk[chunk['rt'] > local_window_min].axes[0].tolist())
            hi_slice_index = \
                max(chunk[chunk['rt'] < local_window_max].axes[0].tolist())
        except ValueError:
            # no identifications fall inside this scan's retention time window
            continue

        # iterate through relevant ids
        for id in ids[dmt.inclusive_slice(lo_slice_index, hi_slice_index)]:
            charge = id.z
            # instantiate an envelope
            envelope = Envelope(peaks=[],
                                rt=spec_rt,
                                n_lookback=settings.peak_lookback,
                                n_lookahead=settings.peak_lookahead)
            lo_baseline_lookback = None
            hi_baseline_lookback = None
            lo_baseline_lookahead = None
            hi_baseline_lookahead = None

            peak_range_start = 0 - settings.peak_lookback
            peak_range_end = id.n_isos + settings.peak_lookahead

            # Iterate through all of the peaks we want to look for
            for peak_num in range(peak_range_start, peak_range_end):
                # define the mz to search for in the spectrum
                search_mz = id.mz + (peak_num * NEUTRON / charge)
                # define the ppm error tolerance
                reach = settings.ppm_window / 1_000_000.0 * search_mz
                # find the index of the nearest data point in that spectrum's
                # mz array
                index = dmt.find_nearest_index(spec_mzs, search_mz)

                if peak_num == 0:
                    # set the bounds for defining the baseline
                    lo_baseline_lookback = dmt.find_nearest_index(
                        spec_mzs, id.mz - settings.baseline_lookback)
                    hi_baseline_lookback = index
                    lo_baseline_lookahead = index
                    hi_baseline_lookahead = dmt.find_nearest_index(
                        spec_mzs, id.mz + settings.baseline_lookback)

                # TODO: Do I need to speed this up by removing typecheck?
                # TODO: Expand this to only one paren/bracket per line?
                if abs(spec_mzs[index] - search_mz) < reach:
                    # If the value at that index is within the reach
                    envelope.append_peak(
                        Peak(mz=spec_mzs[index],
                             abundance=spec_abs[index],
                             i=peak_num))
                else:
                    if 0 <= peak_num < id.n_isos:
                        # set the envelope's validity flag to false if no peak
                        # is found, then move on to the next identification
                        envelope.is_valid = False
                    envelope.append_peak(
                        Peak(
                            mz=search_mz,
                            # TODO: it might be better to set this to NA
                            abundance=0,
                            i=peak_num))
                # TODO: Do I need to speed this up by removing typecheck?

            # If all of the peaks have been found, add the envelope to the
            # identification (after determining the baseline)
            # NOTE: baseline was originally defined as the median abundance of
            # the 100 mz units preceding the m0 peak
            # CQ: Changing baseline to be the MAD of 100 m/z datapoints ahead
            #     and behind the m0 peak.
            # Adapted from Marginean, I.; Tang, K.; Smith, R. D.; Kelly, R.;
            # Picoelectrospray Ionization Mass Spectrometry Using Narrow-Bore
            # Chemically Etched Emitters, ASMS, 2013
            def mad(values):
                m = median(values)
                return median([abs(a - m) for a in values])

            lookback_baseline = [
                l for l in spec_abs[dmt.inclusive_slice(
                    lo_baseline_lookback, hi_baseline_lookback)] if l != 0
            ][-100:]
            lookahead_baseline = [
                l for l in spec_abs[dmt.inclusive_slice(
                    lo_baseline_lookahead, hi_baseline_lookahead)] if l != 0
            ][1:101]

            normal_distribution_scale_factor = 1.4826
            envelope.baseline = normal_distribution_scale_factor * mad(
                lookback_baseline + lookahead_baseline)

            id.append_envelope(envelope)

    mzml_fp.close()

    for id in ids:
        id.aggregate_envelopes()

    # TODO: Better variable naming here. obs? I can do better
    # TODO: is there a better way to initialize this?
    # TODO: add lookback columns?
    # Initialize the dataframe to send back to the main process
    peak_out = pd.DataFrame(index=chunk.index.values,
                            columns=[
                                'mzs', 'abundances', 'lookback_mzs',
                                'lookback_abundances', 'lookahead_mzs',
                                'lookahead_abundances', 'rt_min', 'rt_max',
                                'baseline_signal', 'mads', 'mzs_list',
                                'intensities_list', 'rt_list',
                                'baseline_list', 'num_scans_combined',
                                'mzml_path'
                            ])

    # Populate valid rows
    for row in peak_out.itertuples():
        i = row.Index
        id = ids[i]
        if id.condensed_envelope:
            mzs, abundances = id.condensed_envelope.to_obs()
            lb_mzs, lb_abundances = id.condensed_envelope.lb_obs()
            la_mzs, la_abundances = id.condensed_envelope.la_obs()
            peak_out.at[i, 'mzs'] = mzs
            peak_out.at[i, 'abundances'] = abundances
            peak_out.at[i, 'rt_min'] = id.rt_min
            peak_out.at[i, 'rt_max'] = id.rt_max
            peak_out.at[i, 'baseline_signal'] = id.condensed_envelope.baseline
            peak_out.at[i, 'lookback_mzs'] = lb_mzs
            peak_out.at[i, 'lookback_abundances'] = lb_abundances
            peak_out.at[i, 'lookahead_mzs'] = la_mzs
            peak_out.at[i, 'lookahead_abundances'] = la_abundances
            peak_out.at[i, 'mads'] = str(id.mads)
            peak_out.at[i, 'num_scans_combined'] = len(id._envelopes)
            peak_out.at[i, 'mzml_path'] = mzml_path

    results = chunk.join(peak_out)
    return results
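# A small, self-contained sketch of the MAD-based baseline used above, on toy
# data. The 1.4826 factor rescales the median absolute deviation so that it
# estimates the standard deviation of normally distributed noise, which makes
# it robust to the occasional real peak landing in the baseline window.
from statistics import median


def mad(values):
    m = median(values)
    return median([abs(a - m) for a in values])


noise = [3.0, 5.0, 4.0, 6.0, 5.0, 4.0, 100.0]  # one spike barely moves the MAD
baseline = 1.4826 * mad(noise)
print(baseline)  # 1.4826 * 1.0 = 1.4826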
def run_rate_workflow(self):
    #$ will need some settings
    settings.load(rate_settings_file)

    #$ first we need to check which steps are checked
    worklist = self.check_table_checklist()
    #$ only proceed if we have a valid worklist
    if worklist == []:
        QtWidgets.QMessageBox.information(
            self, "Error", ("No options were checked. Please check steps "
                            "to be performed and try again"))
        return
    elif type(worklist) == str:
        QtWidgets.QMessageBox.information(self, "Error", worklist)
        return

    #$ second we need an output folder and need to check it for existing
    #$ output
    QtWidgets.QMessageBox.information(self, "Info",
                                      "Please select a folder for output")
    output_folder = QtWidgets.QFileDialog.getExistingDirectory(
        self, "Select an Output Folder", self.file_loc,
        QtWidgets.QFileDialog.ShowDirsOnly)
    if output_folder == "":
        return
    #$ change the location we start asking for things at;
    #$ don't change after this since all output is going in here
    self.file_loc = output_folder
    # MainGuiObject._make_folder(output_folder)

    #$ we don't care if we overwrite rate_settings.yaml, but we should check
    #$ whether the user wants to use the settings already in the folder
    if os.path.exists(os.path.join(output_folder, "rate_settings.yaml")):
        comp_result = settings.compare(
            rate_settings_file,
            os.path.join(output_folder, "rate_settings.yaml"))
        if comp_result != "MATCH":
            if comp_result == "Mismatched Keys":
                qBox = QtWidgets.QMessageBox(self)
                qBox.setWindowTitle("Question")
                question = ("A settings file already exists in this output "
                            "folder. Would you like to use those settings, "
                            "or overwrite them?")
                qBox.setText(question)
                qBox.setIcon(QtWidgets.QMessageBox.Question)
                qBox.setStandardButtons(QtWidgets.QMessageBox.Yes |
                                        QtWidgets.QMessageBox.No |
                                        QtWidgets.QMessageBox.Cancel)
                yButton = qBox.button(QtWidgets.QMessageBox.Yes)
                yButton.setText("Use Settings")
                nButton = qBox.button(QtWidgets.QMessageBox.No)
                nButton.setText("Overwrite")
                response = qBox.exec_()
                if response == QtWidgets.QMessageBox.Yes:
                    settings.load(
                        os.path.join(output_folder, "rate_settings.yaml"))
                    settings.freeze(rate_settings_file)
                elif response == QtWidgets.QMessageBox.No:
                    if self.check_file_removal(
                            [os.path.join(output_folder,
                                          "rate_settings.yaml")],
                            ask_permission=False):
                        settings.freeze(
                            os.path.join(output_folder,
                                         "rate_settings.yaml"))
                    else:
                        return
                else:
                    return
        else:
            #$ no point asking if we can delete the file if it is the same
            #$ anyway. we still want to overwrite as part of checking
            #$ permissions; may not overwrite later
            if self.check_file_removal(
                    [os.path.join(output_folder, "rate_settings.yaml")],
                    ask_permission=False):
                settings.freeze(
                    os.path.join(output_folder, "rate_settings.yaml"))
            else:
                return
    else:
        settings.freeze(os.path.join(output_folder, "rate_settings.yaml"))

    #$ then we need to check if the output files already exist; if so, warn
    #$ the user
    no_extract_list = [w for w in worklist if w != "Extract"]
    outputs_to_check = []
    #$ deal with the extra detail file added for rate calculation
    if "Rate Calculation" in no_extract_list:
        outputs_to_check.append(os.path.join(output_folder, extra_rate_file))
    for worklist_step in no_extract_list:
        step_object_dict[worklist_step].complete_filename(self.file_loc)
        outputs_to_check.append(
            step_object_dict[worklist_step].full_filename)
    #$ this should only be empty for an extract-only run, but that may occur
    if outputs_to_check != []:
        proceed = self.check_file_removal(outputs_to_check)
        if not proceed:
            return

    #$ now we need to get input and do the work. each step can only occur
    #$ once and they occur in order,
    #$ so we will write them in order
    # TODO$ see if we can compress the code and make sure it is readable
    previous_output_file = ""
    extracted_files = []
    make_table_in_order = True
    for analysis_step in worklist:
        if analysis_step == "Extract":
            #$ no if-check needed for this one; if extract is here it is the
            #$ start
            id_file = self.collect_single_file("ID", "Extract",
                                               "CSV (*.csv)")
            if id_file == "":
                return
            #$ always check if it is good since it is first
            infile_is_good = self.check_input(
                step_object_dict[analysis_step], id_file)
            if not infile_is_good:
                return
            #$ infile_is_good is just a check for blanks in the input file
            data_is_good = self.check_extractor_input(
                id_file, required_data_extractor_data, autofill_columns)
            if not data_is_good:
                return
            mzml_files = self.collect_multiple_files(
                "Centroided Data", analysis_step, "mzML (*.mzML)")
            if mzml_files == []:
                return

            mzml_filenames = [
                os.path.basename(filename) for filename in mzml_files
            ]
            extracted_files = [
                filename.replace(".mzML", ".tsv")
                for filename in mzml_filenames
            ]
            extracted_files = [
                os.path.join(output_folder, filename)
                for filename in extracted_files
            ]
            extracted_intermediate_files = extracted_files
            needed_files = list(
                set(extracted_files + extracted_intermediate_files))
            proceed = self.check_file_removal(needed_files)
            if not proceed:
                return

            #$ need to run the table if necessary. taken from the
            #$ "Provide Time and Enrichment" elif
            if "Provide Time and Enrichment" in worklist:
                previous_output_file = step_object_dict[
                    "Provide Time and Enrichment"].full_filename
                self.get_data_table = TimeEnrichmentWindow(
                    self, extracted_files, previous_output_file)
                self.get_data_table.exec_()
                #$ don't make the table twice
                make_table_in_order = False
                #$ now that the table is done we need to confirm the user
                #$ hit the proceed button on the table (same check as in
                #$ elif analysis_step == "Theory Generation")
                if not os.path.exists(previous_output_file):
                    return

            #$ modified from the extract-dir argument from the command line
            for m in tqdm(range(len(mzml_files)),
                          total=len(mzml_files),
                          desc="Extracting mzml files: "):
                extractor = Extractor(
                    id_path=os.path.join(self.file_loc, id_file),
                    mzml_path=mzml_files[m],
                    out_path=extracted_intermediate_files[m],
                    settings_path=rate_settings_file,
                )
                extractor.load()
                extractor.run()
                extractor.write()
                #$ need to delete classes when they're done or they may
                #$ linger in RAM
                del extractor

        elif analysis_step == "Provide Time and Enrichment" and \
                make_table_in_order:
            #$ if coming right after a list
            if extracted_files == []:
                extracted_files = self.collect_multiple_files(
                    "Extracted Data", "Provide Time and Enrichment",
                    "TSV (*.tsv)")
                if extracted_files == []:
                    return
                #$ ensure the input files are good;
                #$ we only need to deal with this if the user just selected
                #$ the files
                for e_file in extracted_files:
                    infile_is_good = self.check_input(
                        step_object_dict[analysis_step], e_file)
                    if not infile_is_good:
                        return

            #$ now that we have the extracted files we can make a table;
            #$ the table will handle the output
            previous_output_file = step_object_dict[
                analysis_step].full_filename
            self.get_data_table = TimeEnrichmentWindow(
                self, extracted_files, previous_output_file)
            self.get_data_table.exec_()

        elif analysis_step == "Theory Generation":
            #$ since the files are in the table we can just read that in
            if previous_output_file == "":
                previous_output_file = self.collect_single_file(
                    "time and enrichment", analysis_step,
                    "spreadsheet (*.csv *.tsv)")
                if previous_output_file == "":
                    return
                infile_is_good = self.check_input(
                    step_object_dict[analysis_step], previous_output_file)
                if not infile_is_good:
                    return
            #$ the else deals with a failed write from the previous table;
            #$ we don't need an error message, just return
            elif not os.path.exists(previous_output_file):
                return
            #$ final check to see if all of the files in the input table
            #$ still exist. we don't want to error out in the middle of
            #$ multiprocessing
            final_proceed = self.check_files_from_files(
                previous_output_file, 0)
            if not final_proceed:
                return

            theorist = TheoryPreparer(
                enrichment_path=previous_output_file,
                out_path=step_object_dict[analysis_step].full_filename,
                settings_path=rate_settings_file)
            theorist.prepare()
            theorist.write()
            del theorist
            previous_output_file = step_object_dict[
                analysis_step].full_filename

        elif analysis_step == "Fraction New Calculation":
            if previous_output_file == "":
                previous_output_file = self.collect_single_file(
                    "theoretical output", analysis_step,
                    "spreadsheet (*.csv *.tsv)")
                if previous_output_file == "":
                    return
                infile_is_good = self.check_input(
                    step_object_dict[analysis_step], previous_output_file)
                if not infile_is_good:
                    return
            #$ not sure why this would happen but we'll put it here
            #$ to avoid future errors
            elif not os.path.exists(previous_output_file):
                return

            fnewcalc = FractionNewCalculator(
                model_path=previous_output_file,
                out_path=step_object_dict[analysis_step].full_filename,
                settings_path=rate_settings_file)
            fnewcalc.generate()
            if fnewcalc.error != "":
                QtWidgets.QMessageBox.information(self, "Error",
                                                  fnewcalc.error)
                return
            fnewcalc.write()
            del fnewcalc
            previous_output_file = step_object_dict[
                analysis_step].full_filename

        elif analysis_step == "Rate Calculation":
            if previous_output_file == "":
                previous_output_file = self.collect_single_file(
                    "fraction new", analysis_step,
                    "spreadsheet (*.csv *.tsv)")
                if previous_output_file == "":
                    return
            #$ need to ensure that we have the proper columns, which varies
            #$ by setting
            needed_columns = [
                settings.peptide_analyte_id_column,
                settings.peptide_analyte_name_column, "sample_group"
            ]
            if settings.use_abundance != "No":
                needed_columns.extend(
                    ["abund_fn", "frac_new_abunds_std_dev"])
            if settings.use_neutromer_spacing:
                needed_columns.extend(["nsfn", "frac_new_mzs_std_dev"])
            if settings.use_abundance != "No" and \
                    settings.use_neutromer_spacing:
                needed_columns.extend(["cfn", "frac_new_combined_std_dev"])
            step_object_dict[
                analysis_step].required_columns = needed_columns
            infile_is_good = self.check_input(
                step_object_dict[analysis_step], previous_output_file)
            if not infile_is_good:
                return

            #$ need to get a graph folder and ensure it exists;
            #$ don't worry about overwriting files
            GraphFolder = os.path.join(self.file_loc, "Graph_Folder")
            MainGuiObject._make_folder(GraphFolder)
            ratecalc = RateCalculator(
                model_path=previous_output_file,
                out_path=step_object_dict[analysis_step].full_filename,
                graph_folder=GraphFolder,
                settings_path=rate_settings_file)
            ratecalc.calculate()
            ratecalc.write()
            del ratecalc

    QtWidgets.QMessageBox.information(self, "Success",
                                      "Analysis completed successfully")
def _mp_prepare(df, settings_path):
    settings.load(settings_path)
    #$ can start with itertuples; if need be we can swap to apply
    for row in df.itertuples(index=True):
        #$ do any initial filtering to save time calculating
        if row.n_value < settings.min_allowed_n_values:
            df = FractionNewCalculator._error_method(
                df, row, "N value is less than {}".format(
                    settings.min_allowed_n_values))
            continue
        if len(row.Sequence) < settings.min_aa_sequence_length:
            df = FractionNewCalculator._error_method(
                df, row, "Fewer than {} amino acids".format(
                    settings.min_aa_sequence_length))
            continue

        #$ if the user chooses 0 as enrichment it will cause a divide-by-zero
        #$ error later, so we will use the settings to force it;
        #$ the result should be close to 0 change anyway
        if row.enrichment != 0.0:
            use_enrich = row.enrichment
        else:
            use_enrich = settings.enrichement_of_zero

        #$ not currently considering adducts other than H+ in peptide data
        # try:
        #     num_h, parsed_cf = parse_cf(row.cf_w_adduct)
        # except:
        num_h, parsed_cf = parse_cf(row.cf)

        _, enriched_results = emass(parsed_cf=parsed_cf,
                                    n_list=[0, row.n_value],
                                    n_H=num_h,
                                    low_pct=0,
                                    high_pct=use_enrich,
                                    num_peaks=int(row.num_peaks),
                                    testing=False)
        e_mzs, e_abunds = enriched_results

        if settings.use_abundance != "No":
            FractionNewCalculator._prepare_row(df, row, "abunds", e_abunds)
            normalized_empirical_abunds = \
                FractionNewCalculator._normalize_abundances(
                    row.abundances[1:-1].split(", "))
            df.at[row.Index, 'normalized_empirical_abundances'] = \
                normalized_empirical_abunds
            theory_abund_deltas = FractionNewCalculator._calculate_deltas(
                e_abunds.loc[1][1:], e_abunds.loc[0][1:])
            empirical_abund_deltas = FractionNewCalculator._calculate_deltas(
                [float(x) for x in normalized_empirical_abunds.split(", ")],
                e_abunds.loc[0][1:])
            theory_abund_deltas, empirical_abund_deltas, removed_peaks = \
                FractionNewCalculator._trim_abunds(theory_abund_deltas,
                                                   empirical_abund_deltas)
            df.at[row.Index, "low_labeling_peaks"] = removed_peaks
            # TODO: Should this be included?
            # df.at[row.Index, 'delta_I'] = np.std(empirical_abund_deltas)

            #$ we don't need to break if we only have one or zero peaks;
            #$ combined and spacing are still fine, we just shouldn't do the
            #$ other calculations here
            if len(theory_abund_deltas) < 2:
                df.at[row.Index, "abund_fn"] = \
                    f"Insufficient peaks with theory above " \
                    f"{settings.minimum_abund_change}"
                all_frac_new_abunds = []
            else:
                all_frac_new_abunds = \
                    FractionNewCalculator._calculate_fractions(
                        empirical_abund_deltas, theory_abund_deltas)
                df = FractionNewCalculator.final_calculations(
                    df, row, "abunds", all_frac_new_abunds, "abund_fn",
                    theory_abund_deltas[0])

        if settings.use_neutromer_spacing:
            FractionNewCalculator._prepare_row(df, row, "mzs", e_mzs)
            theory_mz_deltas = FractionNewCalculator._calculate_deltas(
                FractionNewCalculator._mz_deltas(e_mzs.loc[1][1:]),
                FractionNewCalculator._mz_deltas(e_mzs.loc[0][1:]))
            #$ need to trim the parentheses from the row.mzs string and also
            #$ turn it into floats to do math on it
            observed_mzs = [float(x) for x in row.mzs[1:-1].split(", ")]
            observed_neutral_mass = [y * int(row.z) for y in observed_mzs]
            df.at[row.Index, "observed_neutral_masses"] = \
                ", ".join([str(x) for x in observed_neutral_mass])
            empirical_mz_deltas = FractionNewCalculator._calculate_deltas(
                FractionNewCalculator._mz_deltas(observed_neutral_mass),
                FractionNewCalculator._mz_deltas(e_mzs.loc[0][1:]))
            all_frac_new_mzs = FractionNewCalculator._calculate_fractions(
                empirical_mz_deltas, theory_mz_deltas)
            df = FractionNewCalculator.final_calculations(
                df, row, "mzs", all_frac_new_mzs, "nsfn")

        if settings.use_neutromer_spacing and settings.use_abundance != "No":
            all_frac_new_combined = all_frac_new_abunds + all_frac_new_mzs
            df = FractionNewCalculator.final_calculations(
                df, row, "combined", all_frac_new_combined, "cfn")
    return df
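# A toy illustration of the delta-ratio arithmetic that _calculate_deltas and
# _calculate_fractions appear to implement above (a hedged sketch; the real
# helpers live in FractionNewCalculator and are not shown here): for each
# isotope peak, fraction new is the observed change from the unlabeled theory
# divided by the change predicted at the target enrichment.
unlabeled_theory = [0.60, 0.25, 0.10]   # hypothetical relative abundances
enriched_theory = [0.40, 0.30, 0.20]    # theory at the target enrichment
observed = [0.50, 0.275, 0.15]          # empirical, normalized abundances

theory_deltas = [e - u for e, u in zip(enriched_theory, unlabeled_theory)]
empirical_deltas = [o - u for o, u in zip(observed, unlabeled_theory)]
fractions = [emp / theo for emp, theo in zip(empirical_deltas, theory_deltas)]
print(fractions)  # [0.5, 0.5, 0.5] -> this peptide looks ~50% newly synthesized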
def __init__(self, parent=None, current_setting_file=None):
    super(Rate_Setting_Menu, self).__init__(parent)
    settings.load(current_setting_file)
    self.current_setting_file = current_setting_file
    #$ this is needed to slim things down a bit
    self.setWindowTitle("Rate Settings Menu")
    self.setupUi(self)
    self.fill_study_type_combobox()
    self.all_settings = [
        setting_string_info(self.recognize_available_cores,
                            "recognize_available_cores",
                            settings.recognize_available_cores, True),
        setting_numerical_info(self.default_cores, "n_processors",
                               settings.n_processors, True),
        setting_string_info(self.study_type_combobox, "study_type",
                            settings.study_type, False),
        setting_string_info(self.rt_unit, "id_file_rt_unit",
                            settings.id_file_rt_unit, False),
        setting_numerical_info(self.time_window, "time_window",
                               settings.time_window, False),
        setting_numerical_info(self.ppm_error, "ppm_window",
                               settings.ppm_window, True),
        setting_string_info(self.use_abundance, "use_abundance",
                            settings.use_abundance, False),
        setting_string_info(self.use_neutromer_spacing,
                            "use_neutromer_spacing",
                            settings.use_neutromer_spacing, True),
        setting_numerical_info(self.minimum_nonzero_points,
                               "minimum_nonzero_points",
                               settings.minimum_nonzero_points, True),
        setting_string_info(self.roll_up_option, "roll_up_rate_calc",
                            settings.roll_up_rate_calc, True),
        setting_string_info(self.asymptope_type, "asymptote",
                            settings.asymptote, False),
        setting_numerical_info(self.fixed_asymptote_value,
                               "fixed_asymptote_value",
                               settings.fixed_asymptote_value, False),
        setting_numerical_info(self.proliferation_adjustment,
                               "proliferation_adjustment",
                               settings.proliferation_adjustment, False),
        setting_string_info(self.bias_selection_option, "bias_calculation",
                            settings.bias_calculation, False),
        setting_numerical_info(self.abund_manual_bias,
                               "abundance_manual_bias",
                               settings.abundance_manual_bias, False),
        setting_numerical_info(self.spacing_manual_bias,
                               "spacing_manual_bias",
                               settings.spacing_manual_bias, False),
        setting_numerical_info(self.combined_manual_bias,
                               "combined_manual_bias",
                               settings.combined_manual_bias, False),
        setting_numerical_info(self.min_allowed_m0_change,
                               "min_allowed_abund_max_delta",
                               settings.min_allowed_abund_max_delta, False),
        setting_numerical_info(self.min_sequence_length,
                               "min_aa_sequence_length",
                               settings.min_aa_sequence_length, True),
        setting_numerical_info(self.min_n_value, "min_allowed_n_values",
                               settings.min_allowed_n_values, True),
        setting_numerical_info(self.ms_level, "ms_level",
                               settings.ms_level, True),
        setting_string_info(self.verbose_rate, "verbose_rate",
                            settings.verbose_rate, True),
        setting_string_info(self.graph_save_file_type, "rate_output_format",
                            settings.rate_output_format, False)
    ]
    for setting_object in self.all_settings:
        setting_object.set_object_value()
    self.SaveButton.clicked.connect(self.save_settings)
    self.ExitButton.clicked.connect(self.close)
def _mp_function(data_tuple, settings_path, fn_col, fn_std_dev, calc_type,
                 manual_bias, std_dev_filter, graph_folder, rate_eq,
                 max_time, p0):
    w.filterwarnings("error")
    settings.load(settings_path)
    pd.options.mode.chained_assignment = None

    id_values, group = data_tuple[0], data_tuple[1]
    id_name = id_values[0]
    sample_group_name = id_values[1]
    common_name = group[settings.peptide_analyte_name_column].iloc[0]

    #$ drop error strings. could do this earlier for more speed, but this is
    #$ clearer and allows errors that affect only one calculation type
    group = RateCalculator._error_trimmer(group, [fn_col, fn_std_dev])
    #$ the copy is just to avoid a SettingWithCopy warning in a few
    #$ operations. if it causes problems, remove it and suppress the warning
    group = group[group[fn_std_dev] < std_dev_filter].copy()
    if len(group) == 0:
        result = RateCalculator._make_error_message(
            "No Isotope Envelopes Agree", "", id_name, common_name,
            sample_group_name, calc_type, 0, 0, 0, 0)
        return result, group

    # offset all values by a certain amount (instrument bias)
    if settings.bias_calculation == "calculated":
        bias = RateCalculator._calc_bias(group, fn_col)
        group[fn_col] = group[fn_col] - bias
    elif settings.bias_calculation == "manual":
        #$ user-designated bias
        group[fn_col] = group[fn_col] - manual_bias

    xs = np.concatenate(([0], group['time'].to_numpy()))
    ys = np.concatenate(([settings.y_intercept_of_fit],
                         group[fn_col].to_numpy()))
    if settings.roll_up_rate_calc:
        xs, ys, devs = RateCalculator._roll(xs, ys)
    else:
        devs = np.concatenate(([settings.error_of_zero],
                               group[fn_std_dev].to_numpy()))

    # Get the number of unique time points, and continue only if there are
    # enough
    num_unique_times = len(set(group['time']))
    unique_length = len(set(group[settings.unique_sequence_column]))
    num_measurements = len(group.index)
    # TODO$ this is not ideal but it is a good first attempt
    num_files = len(set(group["mzml_path"]))
    # TODO: Handle Num Bio Reps in Stuff
    # I think this fixes the issues with technical replicates.
    #$ need to use astype(str) on all, or issues result if someone uses
    #$ numbers for group names or replicate names
    num_bio_reps = len(set(group["time"].astype(str) +
                           group["sample_group"].astype(str) +
                           group["bio_rep"].astype(str)))
    if num_unique_times < settings.minimum_nonzero_points:
        result = RateCalculator._make_error_message(
            "Insufficient Timepoints", "", id_name, common_name,
            sample_group_name, calc_type, num_measurements,
            num_unique_times, unique_length, num_files)
        return result, group

    # perform the fit
    try:
        #$ DO NOT use std dev as the sigma because it creates influential
        #$ outliers; don't use sigma unless we have a different approach
        popt, pcov = curve_fit(f=rate_eq, xdata=xs, ydata=ys, p0=p0)

        # pull results of fit into variables
        rate = popt[0]
        asymptote = \
            popt[1] if len(popt) > 1 else settings.fixed_asymptote_value
        # TODO$ the CI uses degrees of freedom = n - k, where n is the number
        #$ of points and k is the number of parameters estimated, including
        #$ the intercept in linear regression. if the asymptote is fixed,
        #$ k = 1; otherwise k = 2 (the intercept is set by the equation,
        #$ not the data).
        #$ we are not counting charge states and different peptides as
        #$ unique measurements.
        #$ despite the claim in the documentation, according to a statistics
        #$ consultation and every site I checked, np.sqrt(np.diag(pcov))[0]
        #$ is the standard error, not the std dev, so don't divide by the
        #$ square root of n
        confint = \
            t.ppf(.975, num_files - len(popt)) * \
            np.sqrt(np.diag(pcov))[0]
        y_predicted = dur.simple(xs, rate, asymptote,
                                 settings.proliferation_adjustment)
        r_2 = dur.calculate_r2(ys, y_predicted)

        result = {
            'analyte_id': id_name,
            'analyte_name': common_name,
            'group_name': sample_group_name,
            '{} rate'.format(calc_type): rate,
            '{} asymptote'.format(calc_type): asymptote,
            '{} std_error'.format(calc_type): np.sqrt(np.diag(pcov))[0],
            '{} 95pct_confidence'.format(calc_type): confint,
            '{} half life'.format(calc_type): RateCalculator._halflife(rate),
            '{} R2'.format(calc_type): r_2,
            '{} files observed in'.format(calc_type): num_files,
            '{} num_measurements'.format(calc_type): num_measurements,
            '{} num_time_points'.format(calc_type): num_unique_times,
            '{} uniques'.format(calc_type): unique_length,
            '{} exceptions'.format(calc_type): "",
            #$ 'calculation_type': calc_type
        }

        #$ if there is an asymptote we need to provide it
        graph_name = "{}_{}_{}".format(id_name, sample_group_name, fn_col)
        graph_title = "{}_{}_{}\nk={}, a={}".format(
            common_name, sample_group_name, fn_col,
            result[f'{calc_type} rate'], 1.0)
        if settings.roll_up_rate_calc:
            graph_rate(graph_name, xs, ys, rate, asymptote, confint, rate_eq,
                       graph_folder, max_time, settings.asymptote, devs,
                       title=graph_title)
        else:
            graph_rate(graph_name, xs, ys, rate, asymptote, confint, rate_eq,
                       graph_folder, max_time, settings.asymptote,
                       title=graph_title)
    except Exception as c:
        #$ "we have a guess but are unsure" warning
        if type(c).__name__ == "OptimizeWarning":
            current_exception = \
                'OptimizeWarning: optimal fit could not be found'
        #$ couldn't find the minimum
        elif type(c).__name__ == "RuntimeError":
            current_exception = 'fit could not be found'
        else:
            #$ will stop here so we don't need to consider it further
            raise c
        result = RateCalculator._make_error_message(
            "value could not be determined", current_exception, id_name,
            common_name, sample_group_name, calc_type, num_measurements,
            num_unique_times, unique_length, num_files)
    return result, group
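# A minimal, self-contained sketch of the fit-and-confidence-interval logic
# above, using synthetic data and a simple one-pool turnover equation. The
# equation and parameter values are assumptions for illustration only;
# DeuteRater's own rate equations are supplied via the `rate_eq` argument
# (from the module imported as `dur`).
import numpy as np
from scipy.optimize import curve_fit
from scipy.stats import t


def one_pool(x, k, asymptote=1.0):
    """Fraction new as a function of time for a single turnover rate k."""
    return asymptote * (1 - np.exp(-k * x))


xs = np.array([0.0, 1.0, 3.0, 7.0, 14.0])                      # days
ys = one_pool(xs, 0.2) + np.array([0, .02, -.03, .01, -.01])   # noisy points

popt, pcov = curve_fit(one_pool, xs, ys, p0=[0.1])  # fit only k, asymptote fixed
rate = popt[0]
std_error = np.sqrt(np.diag(pcov))[0]
# 95% CI half-width with n - k degrees of freedom (here n = 5 points, k = 1)
confint = t.ppf(.975, len(xs) - len(popt)) * std_error
print(rate, confint)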