Example #1
    def __init__(self, enrichment_path, out_path, settings_path):
        settings.load(settings_path)
        self.settings_path = settings_path
        self.enrichment_path = Path(enrichment_path)
        aa_label_df = pd.read_csv(settings.aa_label_path, sep='\t')
        aa_label_df.set_index('study_type', inplace=True)
        self.aa_labeling_dict = aa_label_df.loc[
            settings.study_type, ].to_dict()

        if self.enrichment_path.suffix == '.tsv':
            self._enrichment_df = pd.read_csv(str(self.enrichment_path),
                                              sep='\t')
        elif self.enrichment_path.suffix == '.csv':
            self._enrichment_df = pd.read_csv(str(self.enrichment_path),
                                              sep=',')
        if settings.recognize_available_cores is True:
            self._n_processors = mp.cpu_count()
        else:
            self._n_processors = settings.n_processors
        #$breaks Windows/Python interactions if too many cores are used;
        #$a very niche case, but still relevant
        if self._n_processors > 60:
            self._n_processors = 60
        self._mp_pool = mp.Pool(self._n_processors)
        self.out_path = out_path
        self.model = None
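
The core-capping logic above recurs in every constructor in these examples. A standalone sketch of the same pattern, using only the standard library; the cap exists because Windows multiprocessing cannot wait on more than roughly 60 worker handles:

import multiprocessing as mp

def pick_n_processors(recognize_available_cores, configured_n, cap=60):
    # Choose a worker count; Windows multiprocessing cannot wait on
    # more than ~60 worker processes, hence the cap.
    n = mp.cpu_count() if recognize_available_cores else configured_n
    return min(n, cap)

if __name__ == "__main__":
    print(pick_n_processors(True, 4))
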

Example #2

    def __init__(self, model_path, out_path, settings_path):
        settings.load(settings_path)
        self.settings_path = settings_path
        itertuple_renamer = copy(protein_itertuple_renamer)

        self.error = ""

        #$get the number of cores we're using for multiprocessing
        if settings.recognize_available_cores is True:
            self._n_processors = mp.cpu_count()
        else:
            self._n_processors = settings.n_processors
        #$breaks Windows/Python interactions if too many cores are used;
        #$a very niche case, but still relevant
        if self._n_processors > 60:
            self._n_processors = 60
        self._mp_pool = mp.Pool(self._n_processors)

        if model_path[-4:] == ".tsv":
            self.model = pd.read_csv(model_path, sep='\t')
        elif model_path[-4:] == ".csv":
            self.model = pd.read_csv(model_path)
        else:  # $should never trigger unless we are fiddling with the gui
            raise ValueError("invalid file extension")

        self.model.rename(columns=itertuple_renamer, inplace=True)
        self.out_path = out_path
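
The extension check above slices the last four characters of the path. A small sketch of the same dispatch using pathlib suffixes instead; a minimal illustration, not the project's API:

from pathlib import Path

import pandas as pd

def read_model(model_path):
    # Load a .tsv or .csv model table, mirroring the branch above.
    suffix = Path(model_path).suffix
    if suffix == ".tsv":
        return pd.read_csv(model_path, sep="\t")
    if suffix == ".csv":
        return pd.read_csv(model_path)
    raise ValueError("invalid file extension")
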
Example #3
 def _mp_prepare(settings_path, args, aa_labeling_dict=None):
     settings.load(settings_path)
     #file_path, time, enrichment = args
     file_path, time, enrichment, sample_group, biological_replicate = args
     df = pd.read_csv(filepath_or_buffer=file_path, sep='\t')
     if "mzs_list" in df.columns:
         df.drop(inplace=True,
                 columns=[
                     "mzs_list", "intensities_list", "rt_list",
                     "baseline_list"
                 ])
     df = TheoryPreparer._apply_filters(df)
     if aa_labeling_dict:
         #$don't include an else for either if statement: no need to calculate
         #$if the column exists, and we don't want to add the column when we
         #$can't calculate it, since later steps check for its presence
         if literature_n_name not in df.columns:
             if aa_labeling_dict != "":
                 df = df.apply(TheoryPreparer._calculate_literature_n,
                               axis=1,
                               args=(aa_labeling_dict, ))
     df['time'] = time
     df['enrichment'] = enrichment
     df["sample_group"] = sample_group
     df["bio_rep"] = biological_replicate
     return df
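
A sketch of how a worker like _mp_prepare can be fanned out over a pool, with the settings path bound via functools.partial; the stub worker and argument tuples below are hypothetical stand-ins, since each child process must reload the settings itself:

import multiprocessing as mp
from functools import partial

def _worker(settings_path, args):
    # Stand-in for TheoryPreparer._mp_prepare; the real worker calls
    # settings.load(settings_path) because each process starts fresh.
    file_path, time, enrichment, sample_group, bio_rep = args
    return (file_path, time, enrichment, sample_group, bio_rep)

if __name__ == "__main__":
    jobs = [("a.tsv", 1.0, 0.05, "g1", "r1"),
            ("b.tsv", 2.0, 0.05, "g1", "r2")]
    with mp.Pool(2) as pool:
        results = pool.map(partial(_worker, "settings.yaml"), jobs)
    print(results)
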
Example #4
    def __init__(self, settings_path, id_path, mzml_path, out_path):
        '''
        Parameters
        ----------
        id_path : str
            The name of the file containing the identifications. This data
            will likely have been taken from an unlabeled run.
        mzml_path : str
            The name of the file containing mass spectrometry data in the
            mzml format, labeled or not.
        out_path : str
            The name of the file the extracted results will be written to.
        settings_path : str
            The name of the file containing the settings for this instance
            of the extractor, which may contain the settings for the rest
            of Deuterater as well. This file *should* be in ``.yaml`` format.

            For additional information, see the settings file's documentation.

        '''
        self.settings_path = settings_path
        settings.load(self.settings_path)

        self.id_path = id_path
        self.mzml_path = mzml_path
        self.out_path = out_path

        self.ids = {}
        self._id_chunks = ()
        self._mzml_native_id_bounds = []
        self.model = pd.DataFrame()

        try:
            if settings.recognize_available_cores is True:
                self._n_processors = mp.cpu_count()
            else:
                self._n_processors = settings.n_processors
            #$breaks Windows/Python interactions if too many cores are used;
            #$a very niche case, but still relevant
            if self._n_processors > 60:
                self._n_processors = 60
            self._chunk_size = settings.chunk_size
            self._chunking_threshold = mul(settings.chunking_method_threshold,
                                           settings.chunk_size)
            self._id_rt_unit = settings.id_file_rt_unit
            self._trim_ids = settings.trim_ids_to_mzml_bounds
            self._rt_window = settings.time_window
            self._mp_pool = mp.Pool(self._n_processors)

            if not os.path.exists(self.out_path):
                open(self.out_path, 'w').close()
            # TODO: this needs to be fixed; it doesn't catch files already open elsewhere
            if not os.access(self.out_path, os.W_OK):
                raise PermissionError('Output path not writeable.')

        except Exception as e:
            print(e)
            traceback.print_tb(e.__traceback__)
            raise
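
The TODO above notes that os.access does not catch files already opened by another program (a common failure on Windows when the output is open in Excel). A sketch of a stricter check that simply tries to open the file for appending:

def ensure_writable(out_path):
    # Create the file if absent, then prove it can be opened for
    # writing; a file locked by another process raises OSError here.
    try:
        with open(out_path, "a"):
            pass
    except OSError as e:
        raise PermissionError("Output path not writeable: {}".format(e)) from e
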
Example #5
 def __init__(self, model_path, out_path, graph_folder, settings_path):
     settings.load(settings_path)
     if model_path[-4:] == ".tsv":
         self.model = pd.read_csv(model_path, sep='\t')
     elif model_path[-4:] == ".csv":
         self.model = pd.read_csv(model_path)
     else: #$should never trigger unless we are fiddling with the gui
         raise ValueError("invalid file extension")
     self.out_path = out_path
     self.rate_model = None
     self.datapoint_model = None  # CQ
     self.graph_folder = graph_folder
     self.settings_path = settings_path
     #$get the number of cores we're using for multiprocessing
     if settings.recognize_available_cores is True:
         self._n_processors = mp.cpu_count()
     else:
         self._n_processors = settings.n_processors

     #$breaks Windows/Python interactions if too many cores are used;
     #$a very niche case, but still relevant
     if self._n_processors > 60:
         self._n_processors = 60
     self._mp_pool = mp.Pool(self._n_processors)
Example #6
def extract(settings_path, mzml_path, index_to_ID, chunk):
    '''Extract data from the mzml according to the identification information
    '''
    # A rough outline of how the logic flows.
    # EXTRACT:
    # for each scan in the reader
    #   in ms level 1
    #   for each id in the window
    #       isotope extraction specific logic

    # Turn the warnings off so that it doesn't mess up the tqdm progress bar
    warnings.filterwarnings("ignore")

    # Load the settings into the settings module. This needs to be done in each
    #   process due to how python handles multiprocessing
    settings.load(settings_path)

    # Open a file pointer to the mzml file
    mzml_fp = pymzml.run.Reader(path_or_file=mzml_path,
                                build_index_from_scratch=True)
    mzml_bounds = dml.get_bounds(mzml_fp, index_to_ID)

    # Check for an empty id file chunk
    if len(chunk) <= 0:
        warnings.warn(
            EmptyIdChunkWarning('There are no identifications in this chunk'))

    # Set the high and low retention time bounds, based on the chunk of the
    #   identification file
    lo_rt_bound = chunk.at[0, 'rt'] - settings.time_window
    if lo_rt_bound < 0:
        lo_rt_bound = 0
    hi_rt_bound = chunk.at[len(chunk) - 1, 'rt'] + settings.time_window

    # Search for the scans at the high and low retention time bounds
    lo_spec_idx = dml.retention_time_search(mzml_fp, index_to_ID, lo_rt_bound)
    hi_spec_idx = dml.retention_time_search(mzml_fp, index_to_ID, hi_rt_bound)
    ##    if mzml_fp[index_to_ID[hi_spec_idx]] > hi_rt_bound:
    ##        hi_spec_idx = hi_spec_idx - 1

    # Logic block for handling out-of-bounds indices
    if lo_spec_idx != -1 and hi_spec_idx != -1:
        # Do nothing if both indices are in bounds
        pass
    elif lo_spec_idx == -1 and hi_spec_idx != -1:
        # If just the higher index is found, assign the lowest index in
        #   the mzml to 'lo_spec_idx'
        lo_spec_idx = mzml_bounds['idx_min']
    elif lo_spec_idx != -1 and hi_spec_idx == -1:
        # If just the lower index is found, assign the highest index in
        #   the mzml to 'hi_spec_idx'
        hi_spec_idx = mzml_bounds['idx_max']
    elif lo_rt_bound < mzml_bounds['rt_min'] < \
            mzml_bounds['rt_max'] < hi_rt_bound:
        # If neither index is found but the time span covered by the chunk of
        #   the ID file encompasses that of the mzml, assign 'lo_spec_idx' and
        #   'hi_spec_idx' the minimum and maximum index values given by the
        #   mzml file
        lo_spec_idx = mzml_bounds['idx_min']
        hi_spec_idx = mzml_bounds['idx_max']
    else:
        # Otherwise, there is no intersection between the ID file and the mzml
        #   in terms of retention time and no analysis can be made
        return -1

    ids = []  # initialize the list of identifications

    # TODO: redefine this column as ionmass?
    chunk['mass'] = chunk['mz'] * chunk['z']

    # Instantiate all of the identifications in the chunk
    for row in chunk.itertuples(index=True):
        ids.append(
            ID(
                rt=row.rt,
                mz=row.mz,
                mass=row.mass,
                z=row.z,
                n_isos=row.n_isos,
                #cf=row.cf
            ))

    # Iterate through all of the relevant spectra in the mzml
    for spectrum_index in dmt.inclusive_range(lo_spec_idx, hi_spec_idx):
        # apply the index_to_ID map in order to access the correct spectrum
        native_id = index_to_ID[spectrum_index]
        try:
            # try to access this spectrum
            spectrum = mzml_fp[native_id]
            spec_rt = spectrum.scan_time_in_minutes()
            spec_mzs = spectrum.mz
            spec_abs = spectrum.i
        except Exception:
            # TODO: use a more specific Exception
            # catch the exception and move on if the spectrum is not found
            continue

        # only deal with desired ms_level
        if spectrum.ms_level != settings.ms_level:
            continue

        # determine the id indices for the peak searches
        # adding and subtracting a floating point error tolerance would let us
        #   include the extremes of the range (currently disabled below)
        local_window_min = \
            spec_rt - (settings.time_window)  # + settings.fpe_tolerance)
        local_window_max = \
            spec_rt + (settings.time_window)  # + settings.fpe_tolerance)
        try:
            lo_slice_index = \
                min(chunk[chunk['rt'] > local_window_min].axes[0].tolist())
            hi_slice_index = \
                max(chunk[chunk['rt'] < local_window_max].axes[0].tolist())
        except ValueError:
            # min()/max() raise ValueError when no rows fall in the window
            continue

        # iterate through relevant ids
        for id in ids[dmt.inclusive_slice(lo_slice_index, hi_slice_index)]:
            charge = id.z
            # instantiate an envelope
            envelope = Envelope(peaks=[],
                                rt=spec_rt,
                                n_lookback=settings.peak_lookback,
                                n_lookahead=settings.peak_lookahead)

            lo_baseline_lookback = None
            hi_baseline_lookback = None
            lo_baseline_lookahead = None
            hi_baseline_lookahead = None

            peak_range_start = 0 - settings.peak_lookback
            peak_range_end = id.n_isos + settings.peak_lookahead

            # Iterate through all of the peaks we want to look for
            for peak_num in range(peak_range_start, peak_range_end):
                # define the mz to search for in the spectrum
                search_mz = id.mz + (peak_num * NEUTRON / charge)
                # define the ppm error tolerance
                reach = settings.ppm_window / 1_000_000.0 * search_mz
                # find the index of the nearest data point in that spectrum's
                #   mz array
                index = dmt.find_nearest_index(spec_mzs, search_mz)

                if peak_num == 0:
                    # set the bounds for defining the baseline
                    lo_baseline_lookback = dmt.find_nearest_index(
                        spec_mzs, id.mz - settings.baseline_lookback)
                    hi_baseline_lookback = index
                    lo_baseline_lookahead = index
                    hi_baseline_lookahead = dmt.find_nearest_index(
                        spec_mzs, id.mz + settings.baseline_lookback)

                # TODO: Do I need to speed this up by removing typecheck?
                # TODO: Expand this to only one paren/bracket per line?
                if abs(spec_mzs[index] - search_mz) < reach:
                    # If the value at that index is within the reach
                    envelope.append_peak(
                        Peak(mz=spec_mzs[index],
                             abundance=spec_abs[index],
                             i=peak_num))
                else:
                    if 0 <= peak_num < id.n_isos:
                        # set the envelopes validity flag to false if no peak
                        #   is found, then move on to the next identification
                        envelope.is_valid = False
                    envelope.append_peak(
                        Peak(
                            mz=search_mz,
                            # TODO: it might be better to set this to NA
                            abundance=0,
                            i=peak_num))

            # TODO: Do I need to speed this up by removing typecheck?
            # If all of the peaks have been found, add it to the
            #   identification (after determining the baseline)
            # NOTE: baseline is defined as the median abundance of the 100
            #   mz units preceding the m0 peak

            # CQ: Changing baseline to be the MAD of 100 m/z datapoints ahead and behind m0 peak.
            # Adapted from Marginean, I; Tang, K; Smith, RD.; Kelly, R; Picoelectrospray Ionization Mass Spectrometry
            #   Using Narrow-Bore Chemically Etched Emitters, ASMS, 2013

            def mad(values):
                m = median(values)
                return median([abs(a - m) for a in values])

            lookback_baseline = [
                l for l in spec_abs[dmt.inclusive_slice(
                    lo_baseline_lookback, hi_baseline_lookback)] if l != 0
            ][-100:]
            lookahead_baseline = [
                l for l in spec_abs[dmt.inclusive_slice(
                    lo_baseline_lookahead, hi_baseline_lookahead)] if l != 0
            ][1:101]

            normal_distribution_scale_factor = 1.4826
            envelope.baseline = normal_distribution_scale_factor * mad(
                lookback_baseline + lookahead_baseline)

            id.append_envelope(envelope)
    mzml_fp.close()

    for id in ids:
        id.aggregate_envelopes()

    # TODO: Better variable naming here. obs? I can do better
    # TODO: is there better way to initialize this?
    # TODO: add lookback columns?

    # Initialize the dataframe to send back to the main process
    peak_out = pd.DataFrame(index=chunk.index.values,
                            columns=[
                                'mzs', 'abundances', 'lookback_mzs',
                                'lookback_abundances', 'lookahead_mzs',
                                'lookahead_abundances', 'rt_min', 'rt_max',
                                'baseline_signal', "mads", 'mzs_list',
                                'intensities_list', "rt_list", "baseline_list",
                                'num_scans_combined', 'mzml_path'
                            ])

    # Populate valid rows.
    for row in peak_out.itertuples():
        i = row.Index
        id = ids[i]

        if id.condensed_envelope:
            mzs, abundances = id.condensed_envelope.to_obs()
            lb_mzs, lb_abundances = id.condensed_envelope.lb_obs()
            la_mzs, la_abundances = id.condensed_envelope.la_obs()
            peak_out.at[i, 'mzs'] = mzs
            peak_out.at[i, 'abundances'] = abundances
            peak_out.at[i, 'rt_min'] = id.rt_min
            peak_out.at[i, 'rt_max'] = id.rt_max
            peak_out.at[i, 'baseline_signal'] = id.condensed_envelope.baseline
            peak_out.at[i, 'lookback_mzs'] = lb_mzs
            peak_out.at[i, 'lookback_abundances'] = lb_abundances
            peak_out.at[i, 'lookahead_mzs'] = la_mzs
            peak_out.at[i, 'lookahead_abundances'] = la_abundances
            peak_out.at[i, 'mads'] = str(id.mads)
            peak_out.at[i, 'num_scans_combined'] = len(id._envelopes)
        peak_out.at[i, 'mzml_path'] = mzml_path

    results = chunk.join(peak_out)

    return results
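
The baseline above is a scaled median absolute deviation over the non-zero intensities around the m0 peak. A self-contained sketch of the statistic; the 1.4826 factor makes the MAD a consistent estimator of a normal distribution's standard deviation:

from statistics import median

def scaled_mad(values, scale=1.4826):
    # Median absolute deviation, scaled to estimate sigma under normality.
    m = median(values)
    return scale * median(abs(v - m) for v in values)

print(scaled_mad([1.0, 2.0, 2.5, 3.0, 100.0]))  # robust to the outlier
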
Example #7
    def run_rate_workflow(self):
        # $will need some settings
        settings.load(rate_settings_file)

        # $first we need to check which steps are checked
        worklist = self.check_table_checklist()
        # $only proceed if we have a non-empty worklist
        if worklist == []:
            QtWidgets.QMessageBox.information(
                self, "Error",
                ("No options were "
                 "checked. Please check steps to performed and try again"))
            return
        elif isinstance(worklist, str):
            QtWidgets.QMessageBox.information(self, "Error", worklist)
            return
        # $second we need to get an output folder and check it for existing output
        QtWidgets.QMessageBox.information(self, "Info",
                                          ("Please select folder "
                                           "for output"))
        output_folder = QtWidgets.QFileDialog.getExistingDirectory(
            self, "Select an Output Folder", self.file_loc,
            QtWidgets.QFileDialog.ShowDirsOnly)
        if output_folder == "": return
        # $change the location we start asking for things at;
        # $don't change it again since all output is going in here
        self.file_loc = output_folder
        # MainGuiObject._make_folder(output_folder)

        #$we don't care about overwriting rate_settings.yaml, but should check whether the user wants to use the settings already in the folder
        if os.path.exists(os.path.join(output_folder, "rate_settings.yaml")):
            comp_result = settings.compare(
                rate_settings_file,
                os.path.join(output_folder, "rate_settings.yaml"))
            if comp_result != "MATCH":
                if comp_result == "Mismatched Keys":
                    qBox = QtWidgets.QMessageBox(self)
                    qBox.setWindowTitle("Question")
                    question = "A settings file already exists in this output folder. Would you like to use those settings,or overwrite them?"
                    qBox.setText(question)
                    qBox.setIcon(QtWidgets.QMessageBox.Question)
                    qBox.setStandardButtons(QtWidgets.QMessageBox.Yes
                                            | QtWidgets.QMessageBox.No
                                            | QtWidgets.QMessageBox.Cancel)

                    yButton = qBox.button(QtWidgets.QMessageBox.Yes)
                    yButton.setText("Use Settings")
                    nButton = qBox.button(QtWidgets.QMessageBox.No)
                    nButton.setText("Overwrite")
                    response = qBox.exec_()
                    if response == QtWidgets.QMessageBox.Yes:
                        settings.load(
                            os.path.join(output_folder, "rate_settings.yaml"))
                        settings.freeze(rate_settings_file)
                    elif response == QtWidgets.QMessageBox.No:
                        if self.check_file_removal([
                                os.path.join(output_folder,
                                             "rate_settings.yaml")
                        ],
                                                   ask_permission=False):
                            settings.freeze(
                                os.path.join(output_folder,
                                             "rate_settings.yaml"))
                        else:
                            return
                    else:
                        return
                else:
                    #$no point asking if we can delete the file if it is the same anyway.  we still want to overwrite it now to check permissions,
                    #$since we may not overwrite it later
                    if self.check_file_removal(
                        [os.path.join(output_folder, "rate_settings.yaml")],
                            ask_permission=False):
                        settings.freeze(
                            os.path.join(output_folder, "rate_settings.yaml"))
                    else:
                        return
        else:
            settings.freeze(os.path.join(output_folder, "rate_settings.yaml"))

        # $then we need to check if the output files exist; if so, warn the user
        no_extract_list = [w for w in worklist if w != "Extract"]
        outputs_to_check = []
        #$deal with the extra detail file added for rate calculation
        if "Rate Calculation" in no_extract_list:
            outputs_to_check.append(
                os.path.join(output_folder, extra_rate_file))

        for worklist_step in no_extract_list:
            step_object_dict[worklist_step].complete_filename(self.file_loc)
            outputs_to_check.append(
                step_object_dict[worklist_step].full_filename)
        # $this should only be empty for an extract-only run, but that may occur
        if outputs_to_check != []:
            proceed = self.check_file_removal(outputs_to_check)
            if not proceed:
                return

        # $now we need to get input and do the work. each step can only occur
        # $once and they occur in order. so we will write them in order
        # todo$ see if we can compress the code and make sure it is readable
        previous_output_file = ""
        extracted_files = []
        make_table_in_order = True
        for analysis_step in worklist:
            if analysis_step == "Extract":
                # $no if for this one, if extract is here it is the start
                id_file = self.collect_single_file("ID", "Extract",
                                                   "CSV (*.csv)")
                if id_file == "": return
                # $always check if is good since it is first
                infile_is_good = self.check_input(
                    step_object_dict[analysis_step], id_file)
                if not infile_is_good: return
                #$infile_is_good is just a check for blanks in the input file
                data_is_good = self.check_extractor_input(
                    id_file, required_data_extractor_data, autofill_columns)
                if not data_is_good: return
                mzml_files = self.collect_multiple_files(
                    "Centroided Data", analysis_step, "mzML (*.mzML)")
                if mzml_files == []: return

                mzml_filenames = [
                    os.path.basename(filename) for filename in mzml_files
                ]
                extracted_files = [
                    filename.replace(".mzML", ".tsv")
                    for filename in mzml_filenames
                ]
                extracted_files = [
                    os.path.join(output_folder, filename)
                    for filename in extracted_files
                ]
                extracted_intermediate_files = extracted_files

                needed_files = list(
                    set(extracted_files + extracted_intermediate_files))
                proceed = self.check_file_removal(needed_files)
                if not proceed: return
                # $need to run the table if necessary. taken from the
                # $"Provide Time and Enrichment" elif
                if "Provide Time and Enrichment" in worklist:
                    previous_output_file = step_object_dict[
                        "Provide Time and Enrichment"].full_filename
                    self.get_data_table = TimeEnrichmentWindow(
                        self, extracted_files, previous_output_file)
                    self.get_data_table.exec_()
                    # $don't make the table twice
                    make_table_in_order = False
                    # $now that the table is done we need to confirm the user
                    # $hit the proceed button on the table (same check as in
                    # $elif analysis_step == "Theory Generation" )
                    if not os.path.exists(previous_output_file): return
                # $ modified from the extract-dir argument from the command line
                for m in tqdm(range(len(mzml_files)),
                              total=len(mzml_files),
                              desc="Extracting mzml files: "):
                    extractor = Extractor(
                        id_path=os.path.join(self.file_loc, id_file),
                        mzml_path=mzml_files[m],
                        out_path=extracted_intermediate_files[m],
                        settings_path=rate_settings_file,
                    )
                    extractor.load()
                    extractor.run()
                    extractor.write()
                    #$need to delete classes when they're done or they may linger in RAM
                    del extractor
            elif analysis_step == "Provide Time and Enrichment" and make_table_in_order:
                # $if coming right after a list
                if extracted_files == []:
                    extracted_files = self.collect_multiple_files(
                        "Extracted Data", "Provide Time and Enrichment",
                        "TSV (*.tsv)")
                    if extracted_files == []: return
                    # $ensure the input files are good. only needed
                    # $if the user just selected them
                    for e_file in extracted_files:
                        infile_is_good = self.check_input(
                            step_object_dict[analysis_step], e_file)
                        if not infile_is_good: return

                # $ now that we have the extracted files we can make a table
                # $the table will handle the output
                previous_output_file = step_object_dict[
                    analysis_step].full_filename
                self.get_data_table = TimeEnrichmentWindow(
                    self, extracted_files, previous_output_file)
                self.get_data_table.exec_()
            elif analysis_step == "Theory Generation":
                # $since the files are in the table can just read that in
                if previous_output_file == "":
                    previous_output_file = self.collect_single_file(
                        "time and enrichment", analysis_step,
                        "spreadsheet (*.csv *.tsv)")
                    if previous_output_file == "": return
                    infile_is_good = self.check_input(
                        step_object_dict[analysis_step], previous_output_file)
                    if not infile_is_good: return
                # $else is to deal with a failed write from the previous table
                # $ don't need an error message just return
                elif not os.path.exists(previous_output_file):
                    return

                # $final check to see if all of the files in the input table
                # $still exist.  don't want to error out in the middle of
                # $multiprocessing
                final_proceed = self.check_files_from_files(
                    previous_output_file, 0)
                if not final_proceed: return

                theorist = TheoryPreparer(
                    enrichment_path=previous_output_file,
                    out_path=step_object_dict[analysis_step].full_filename,
                    settings_path=rate_settings_file)
                theorist.prepare()
                theorist.write()
                del theorist
                previous_output_file = step_object_dict[
                    analysis_step].full_filename
            elif analysis_step == "Fraction New Calculation":
                if previous_output_file == "":
                    previous_output_file = self.collect_single_file(
                        "theoretical output", analysis_step,
                        "spreadsheet (*.csv *.tsv)")
                    if previous_output_file == "": return

                    infile_is_good = self.check_input(
                        step_object_dict[analysis_step], previous_output_file)
                    if not infile_is_good: return
                # $not sure why this would happen, but we'll put it here
                # $to avoid future errors
                elif not os.path.exists(previous_output_file):
                    return
                fnewcalc = FractionNewCalculator(
                    model_path=previous_output_file,
                    out_path=step_object_dict[analysis_step].full_filename,
                    settings_path=rate_settings_file)
                fnewcalc.generate()
                if fnewcalc.error != "":
                    QtWidgets.QMessageBox.information(self, "Error",
                                                      fnewcalc.error)
                    return
                fnewcalc.write()
                del fnewcalc
                previous_output_file = step_object_dict[
                    analysis_step].full_filename
            elif analysis_step == "Rate Calculation":
                if previous_output_file == "":
                    previous_output_file = self.collect_single_file(
                        "fraction new", analysis_step,
                        "spreadsheet (*.csv *.tsv)")
                    if previous_output_file == "": return
                    # $need to ensure that we have the proper columns, which
                    # $vary by settings
                    needed_columns = [
                        settings.peptide_analyte_id_column,
                        settings.peptide_analyte_name_column, "sample_group"
                    ]

                    if settings.use_abundance != "No":
                        needed_columns.extend(
                            ["abund_fn", "frac_new_abunds_std_dev"])
                    if settings.use_neutromer_spacing:
                        needed_columns.extend(["nsfn", "frac_new_mzs_std_dev"])
                    if settings.use_abundance != "No" and settings.use_neutromer_spacing:
                        needed_columns.extend(
                            ["cfn", "frac_new_combined_std_dev"])
                    step_object_dict[
                        analysis_step].required_columns = needed_columns
                    infile_is_good = self.check_input(
                        step_object_dict[analysis_step], previous_output_file)
                    if not infile_is_good: return

                # $need to get a graph folder and ensure it exists
                # $don't worry about overwriting files
                GraphFolder = os.path.join(self.file_loc, "Graph_Folder")
                MainGuiObject._make_folder(GraphFolder)
                ratecalc = RateCalculator(
                    model_path=previous_output_file,
                    out_path=step_object_dict[analysis_step].full_filename,
                    graph_folder=GraphFolder,
                    settings_path=rate_settings_file)
                ratecalc.calculate()
                ratecalc.write()
                del ratecalc
        QtWidgets.QMessageBox.information(self, "Success",
                                          "Analysis completed successfully")

Example #8

    def _mp_prepare(df, settings_path):
        settings.load(settings_path)
        #$can start with itertuples.  if need be can swap to apply
        for row in df.itertuples(index=True):
            #$Do any initial filtering to save time calculating
            if row.n_value < settings.min_allowed_n_values:
                df = FractionNewCalculator._error_method(
                    df, row, "N value is less than {}".format(
                        settings.min_allowed_n_values))
                continue
            if len(row.Sequence) < settings.min_aa_sequence_length:
                df = FractionNewCalculator._error_method(
                    df, row, "Fewer than {} amino acids".format(
                        settings.min_aa_sequence_length))
                continue

            #$if the user chooses 0 as the enrichment it will cause a divide by
            #$zero error later, so we substitute a small value from the settings;
            #$the result should be close to zero change anyway
            if row.enrichment != 0.0:
                use_enrich = row.enrichment
            else:
                use_enrich = settings.enrichement_of_zero

            #$not currently considering adducts other than H+ in peptide data
            #try:
            #    num_h, parsed_cf = parse_cf(row.cf_w_adduct)
            #except:
            num_h, parsed_cf = parse_cf(row.cf)

            _, enriched_results = emass(parsed_cf=parsed_cf,
                                        n_list=[0, row.n_value],
                                        n_H=num_h,
                                        low_pct=0,
                                        high_pct=use_enrich,
                                        num_peaks=int(row.num_peaks),
                                        testing=False)
            e_mzs, e_abunds = enriched_results

            if settings.use_abundance != "No":
                FractionNewCalculator._prepare_row(df, row, "abunds", e_abunds)
                normalized_empirical_abunds = \
                    FractionNewCalculator._normalize_abundances(
                        row.abundances[1:-1].split(", "))
                df.at[row.Index, 'normalized_empirical_abundances'] = \
                    normalized_empirical_abunds

                theory_abund_deltas = FractionNewCalculator._calculate_deltas(
                    e_abunds.loc[1][1:], e_abunds.loc[0][1:])
                empirical_abund_deltas = FractionNewCalculator._calculate_deltas(
                    [
                        float(x)
                        for x in normalized_empirical_abunds.split(", ")
                    ], e_abunds.loc[0][1:])
                theory_abund_deltas, empirical_abund_deltas, removed_peaks = \
                    FractionNewCalculator._trim_abunds(theory_abund_deltas, empirical_abund_deltas)

                df.at[row.Index, "low_labeling_peaks"] = removed_peaks

                # TODO: Should this be included?
                # df.at[row.Index, 'delta_I'] = np.std(empirical_abund_deltas)

                #$don't need to break if we only have one or zero peaks;
                #$combined and spacing are still fine, we just shouldn't do
                #$the other abundance calculations here
                if len(theory_abund_deltas) < 2:
                    df.at[row.Index, "abund_fn"] = \
                        f"Insufficient peaks with theory above {settings.minimum_abund_change}"
                    all_frac_new_abunds = []
                else:
                    all_frac_new_abunds = FractionNewCalculator._calculate_fractions(
                        empirical_abund_deltas, theory_abund_deltas)
                    df = FractionNewCalculator.final_calculations(
                        df, row, "abunds", all_frac_new_abunds, "abund_fn",
                        theory_abund_deltas[0])

            if settings.use_neutromer_spacing:
                FractionNewCalculator._prepare_row(df, row, "mzs", e_mzs)
                theory_mz_deltas = FractionNewCalculator._calculate_deltas(
                    FractionNewCalculator._mz_deltas(e_mzs.loc[1][1:]),
                    FractionNewCalculator._mz_deltas(e_mzs.loc[0][1:]))
                #$need to trim the parentheses from the row.mzs string
                #$and turn it into floats to do math on it
                observed_mzs = [float(x) for x in row.mzs[1:-1].split(", ")]
                observed_neutral_mass = [y * int(row.z) for y in observed_mzs]
                df.at[row.Index, "observed_neutral_masses"] = \
                    ", ".join([str(x) for x in observed_neutral_mass])
                empirical_mz_deltas = FractionNewCalculator._calculate_deltas(
                    FractionNewCalculator._mz_deltas(observed_neutral_mass),
                    FractionNewCalculator._mz_deltas(e_mzs.loc[0][1:]))
                all_frac_new_mzs = FractionNewCalculator._calculate_fractions(
                    empirical_mz_deltas, theory_mz_deltas)
                df = FractionNewCalculator.final_calculations(
                    df, row, "mzs", all_frac_new_mzs, "nsfn")

            if settings.use_neutromer_spacing and settings.use_abundance != "No":
                all_frac_new_combined = all_frac_new_abunds + all_frac_new_mzs
                df = FractionNewCalculator.final_calculations(
                    df, row, "combined", all_frac_new_combined, "cfn")
        return df
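
The fraction-new arithmetic above divides each empirical delta by the matching theoretical delta at full labeling. A toy sketch of that ratio; the values are hypothetical and the helper is not DeuteRater's:

def calculate_fractions(empirical_deltas, theory_deltas):
    # Element-wise fraction new: observed change divided by the
    # theoretical change expected at complete labeling.
    return [e / t for e, t in zip(empirical_deltas, theory_deltas)]

print(calculate_fractions([0.02, -0.015], [0.04, -0.03]))  # -> [0.5, 0.5]
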
Example #9
 def __init__(self, parent=None, current_setting_file=None):
     super(Rate_Setting_Menu, self).__init__(parent)
     settings.load(current_setting_file)
     self.current_setting_file = current_setting_file
     #$this is needed to slim things down a bit
     self.setWindowTitle("Rate Settings Menu")
     self.setupUi(self)
     
     self.fill_study_type_combobox()
     self.all_settings = [
         setting_string_info(self.recognize_available_cores, "recognize_available_cores",
                              settings.recognize_available_cores, True),
         setting_numerical_info(self.default_cores, "n_processors",
                                settings.n_processors, True),
         setting_string_info(self.study_type_combobox, "study_type",
                             settings.study_type, False),
         setting_string_info(self.rt_unit, "id_file_rt_unit",
                             settings.id_file_rt_unit, False),
         setting_numerical_info(self.time_window, "time_window",
                                 settings.time_window, False),
         setting_numerical_info(self.ppm_error, "ppm_window",
                                 settings.ppm_window, True),
         setting_string_info(self.use_abundance, "use_abundance",
                             settings.use_abundance, False),
         setting_string_info(self.use_neutromer_spacing, "use_neutromer_spacing",
                             settings.use_neutromer_spacing, True),
         setting_numerical_info(self.minimum_nonzero_points, "minimum_nonzero_points",
                                settings.minimum_nonzero_points, True),
         setting_string_info(self.roll_up_option,"roll_up_rate_calc",
                              settings.roll_up_rate_calc, True),
         setting_string_info(self.asymptope_type, "asymptote", 
                              settings.asymptote, False),
         setting_numerical_info(self.fixed_asymptote_value, "fixed_asymptote_value",
                                settings.fixed_asymptote_value, False),
         setting_numerical_info(self.proliferation_adjustment, "proliferation_adjustment",
                                settings.proliferation_adjustment, False),
         setting_string_info(self.bias_selection_option, "bias_calculation",
                              settings.bias_calculation, False),
         setting_numerical_info(self.abund_manual_bias, "abundance_manual_bias",
                                settings.abundance_manual_bias, False),
         setting_numerical_info(self.spacing_manual_bias, "spacing_manual_bias",
                                settings.spacing_manual_bias, False),
         setting_numerical_info(self.combined_manual_bias, "combined_manual_bias",
                                settings.combined_manual_bias, False),
         setting_numerical_info(self.min_allowed_m0_change, "min_allowed_abund_max_delta",
                                settings.min_allowed_abund_max_delta, False),
         setting_numerical_info(self.min_sequence_length, "min_aa_sequence_length",
                                settings.min_aa_sequence_length, True),
         setting_numerical_info(self.min_n_value, "min_allowed_n_values", 
                                settings.min_allowed_n_values, True),
         setting_numerical_info(self.ms_level, "ms_level",
                                settings.ms_level, True),
         setting_string_info(self.verbose_rate, "verbose_rate",
                             settings.verbose_rate, True),
         setting_string_info(self.graph_save_file_type, "rate_output_format",
                             settings.rate_output_format, False)
         ]
     for setting_object in self.all_settings:
         setting_object.set_object_value()
     self.SaveButton.clicked.connect(self.save_settings)
     self.ExitButton.clicked.connect(self.close)
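
A minimal sketch of the wrapper pattern the constructor above relies on: each entry pairs a widget with a settings key so the whole menu can be filled from one flat list. The real setting_string_info/setting_numerical_info classes are not shown in these examples, so this illustration is an assumption built around a duck-typed widget:

class SettingInfo:
    # Pairs a GUI widget with a settings key, as in all_settings above;
    # the meaning of the trailing boolean flag is not shown in these
    # examples, so it is stored without interpretation.
    def __init__(self, widget, key, current_value, flag):
        self.widget = widget
        self.key = key
        self.current_value = current_value
        self.flag = flag

    def set_object_value(self):
        # A real implementation would push current_value into the
        # widget, e.g. via setText() or setValue().
        self.widget.setText(str(self.current_value))
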
Example #10
    def _mp_function(data_tuple, settings_path, fn_col, 
                     fn_std_dev, calc_type, manual_bias, std_dev_filter, graph_folder,
                     rate_eq, max_time, p0):
        w.filterwarnings("error")
        settings.load(settings_path)
        pd.options.mode.chained_assignment = None
        
        id_values, group = data_tuple[0], data_tuple[1]
        id_name = id_values[0]
        sample_group_name = id_values[1]
        common_name = group[settings.peptide_analyte_name_column].iloc[0]
        #$drop error string could do earlier for more speed, but this is
        #$clearer and allows errors that affect only one calculation type
        group = RateCalculator._error_trimmer(
            group, [fn_col, fn_std_dev])
        #$the copy is just to avoid a SettingWithCopyWarning in a few
        #$operations.  if it causes problems, remove it and suppress the warning
        group = group[group[fn_std_dev] < std_dev_filter].copy()
        
        if len(group) == 0:
            result = RateCalculator._make_error_message(
                "No Isotope Envelopes Agree","", id_name, common_name,
                sample_group_name, calc_type, 0, 0, 0, 0)
            return result, group
    
        #offset all values by a certain amount (instrument bias)
        if settings.bias_calculation == "calculated":
            bias = RateCalculator._calc_bias(group, fn_col)
            group[fn_col] = group[fn_col] - bias
        elif settings.bias_calculation == "manual": #$ user designated bias
            group[fn_col] = group[fn_col] - manual_bias
        
        xs = np.concatenate(([0], group['time'].to_numpy()))

        ys = np.concatenate(([settings.y_intercept_of_fit],
                             group[fn_col].to_numpy()))

        if settings.roll_up_rate_calc:
            xs, ys, devs = RateCalculator._roll(xs, ys)
        else:
            devs = np.concatenate((
                [settings.error_of_zero],
                group[fn_std_dev].to_numpy()))
            
        # Get the number of unique time points, and continue if not enough
        num_unique_times = len(set(group['time']))

        unique_length = len(set(group[settings.unique_sequence_column]))
        num_measurements = len(group.index)
        
        #TODO$ this is not ideal but this is a good first attempt
        num_files = len(set(group["mzml_path"]))
        
        #TODO: Handle Num Bio Reps in Stuff
        
        # I think this fixes the issues with technical replicates.
        #$need to use astype(str) on all of them, or numeric group or replicate
        #$names cause issues
        num_bio_reps = len(set(group["time"].astype(str) + group["sample_group"].astype(str) + group["bio_rep"].astype(str)))
        
        if num_unique_times < settings.minimum_nonzero_points:
            result = RateCalculator._make_error_message(
                "Insufficient Timepoints","", id_name, common_name, sample_group_name,
                calc_type, num_measurements, num_unique_times, unique_length,
                num_files)
            return result, group
        # perform fit
        try:
            #$DO NOT use std dev as the Sigma because it creates influential outliers
            #$don't use sigma unless we have a different error estimate
            popt, pcov = curve_fit(
                f=rate_eq, xdata=xs, ydata=ys,
                p0=p0)

            # pull results of fit into variables
            rate = popt[0]
            asymptote = \
                popt[1] if len(popt) > 1 else settings.fixed_asymptote_value
            #TODO$ ci uses degrees of freedom = n-k, where n is the number of points and k is the number of parameters estimated,
            #$including the intercept in linear regression.  if the asymptote is fixed k=1, otherwise k=2 (the intercept is fit by equation, not data)
            #$not counting charge states and different peptides as unique measurements.
            #$despite the claim in the documentation, according to statistics consultation and every site I checked, np.sqrt(np.diag(pcov))[0]
            #$is standard error, not std dev, so don't divide by sqrt of n
            
            confint = \
                t.ppf(.975, num_files - len(popt)) * \
                np.sqrt(np.diag(pcov))[0]
                
            y_predicted = dur.simple(xs, rate, asymptote, settings.proliferation_adjustment)
            r_2 = dur.calculate_r2(ys, y_predicted)
            
            result = {
                'analyte_id': id_name,
                'analyte_name': common_name,
                'group_name': sample_group_name,
                '{} rate'.format(calc_type) : rate,
                '{} asymptote'.format(calc_type) : asymptote,
                '{} std_error'.format(calc_type): np.sqrt(np.diag(pcov))[0],
                '{} 95pct_confidence'.format(calc_type): confint,
                '{} half life'.format(calc_type): RateCalculator._halflife(rate),
                '{} R2'.format(calc_type): r_2,
                "{} files observed in".format(calc_type): num_files,
                '{} num_measurements'.format(calc_type):
                    num_measurements,
                '{} num_time_points'.format(calc_type):
                    num_unique_times,
                '{} uniques'.format(calc_type): unique_length,
                '{} exceptions'.format(calc_type): "",
                #$'calculation_type': calc_type
            }
            #$ if there is an asymptote need to provide it
            graph_name = "{}_{}_{}".format(id_name, sample_group_name,
                                           fn_col)
            graph_title = "{}_{}_{}\nk={}, a={}".format(
                common_name, sample_group_name, fn_col,
                result[f'{calc_type} rate'], 1.0)
            
            if settings.roll_up_rate_calc:
                graph_rate(graph_name, xs, ys, rate, asymptote, confint,
                           rate_eq, graph_folder, max_time,
                           settings.asymptote, devs, title=graph_title)
            else:
                graph_rate(graph_name, xs, ys, rate, asymptote, confint,
                           rate_eq, graph_folder, max_time,
                           settings.asymptote, title=graph_title)
        except Exception as c:
            #$"we have a guess but are unsure" warning
            if type(c).__name__ == "OptimizeWarning":
                current_exception = \
                    'OptimizeWarning: optimal fit could not be found'
            #$couldn't find the minimum
            elif type(c).__name__ == "RuntimeError":
                current_exception = \
                    'fit could not be found'
            else:
                raise c #$will stop here so don't need to consider further
            result = RateCalculator._make_error_message(
                "value could not be determined", current_exception, id_name,
                common_name, sample_group_name, calc_type,num_measurements,
                num_unique_times, unique_length, num_files)
        return result, group
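
A self-contained sketch of the fit-and-confidence-interval logic above on synthetic data; the rate equation here is a simple exponential rise standing in for the project's rate_eq, and the numbers are made up:

import numpy as np
from scipy.optimize import curve_fit
from scipy.stats import t

def rate_eq(x, k, a):
    # Simple turnover model: rises from 0 toward asymptote a at rate k.
    return a * (1 - np.exp(-k * x))

xs = np.array([0.0, 1.0, 2.0, 4.0, 8.0])
ys = np.array([0.0, 0.18, 0.33, 0.52, 0.71])

popt, pcov = curve_fit(rate_eq, xs, ys, p0=[0.1, 1.0])
std_err = np.sqrt(np.diag(pcov))[0]  # standard error of the rate
# 95% CI with df = points - fitted parameters, matching the comment above
confint = t.ppf(0.975, len(xs) - len(popt)) * std_err
print(popt[0], confint)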