def protein_FDR_assign_scoring_by_variant_count(proteome, psms, peptide_to_protein_mapping):
    """Accumulate PSM scores onto the proteins their peptides map to.

    Every protein in ``proteome.protein_list`` first has ``score`` and
    ``number_precursors`` reset to 0.  Each PSM annotation is then stripped
    via ``ming_psm_library.strip_sequence`` (with ``I`` replaced by ``L``)
    and looked up in ``peptide_to_protein_mapping``.

    A PSM is skipped when its peptide maps to more than one protein, or when
    the mapped proteins carry more than one distinct ``gene_name``.  For the
    remaining PSMs, each mapped protein whose name has at least two
    characters gets the PSM score added to ``score`` and
    ``number_precursors`` incremented by one.

    The proteome is modified in place; nothing is returned.
    """
    # Reset the running totals before any PSM is counted.
    for entry in proteome.protein_list:
        entry.score = 0
        entry.number_precursors = 0

    for psm in psms:
        annotation = psm.annotation
        psm_score = psm.score
        peptide_key = ming_psm_library.strip_sequence(annotation).replace("I", "L")
        mapped_proteins = peptide_to_protein_mapping[peptide_key]

        # Peptides shared between several proteins do not contribute.
        if len(mapped_proteins) > 1:
            continue

        distinct_genes = {
            proteome.protein_map[name].gene_name for name in mapped_proteins
        }
        if len(distinct_genes) > 1:
            print(peptide_key, len(distinct_genes), "Skipping because not unique")
            continue

        for protein_name in mapped_proteins:
            if len(protein_name) >= 2:
                proteome.get_protein(protein_name).score += psm_score
                proteome.get_protein(protein_name).number_precursors += 1
def determine_b_y_breaks_total(peaks, max_charge, tolerance, peptide, SNR=2.0):
    """Count the distinct break numbers implied by the matched b/y ions.

    When ``SNR`` is greater than 1.0 the peaks are first passed through
    ``filter_peaks_noise_or_window(peaks, SNR, 100, 20)``.  The peaks are
    then matched against the peptide's "b" and "y" ions with
    ``map_ions_to_peak``; the keys of the returned mapping are split on
    ``":"`` and read as ``type:number:charge``.

    A "b" ion contributes its ion number as the break number; a "y" ion
    contributes ``peptide_length - ion_number + 1``; any other ion type
    contributes -1.

    Returns the number of distinct break numbers collected.
    """
    if SNR > 1.0:
        peaks = filter_peaks_noise_or_window(peaks, SNR, 100, 20)

    matched_ions = map_ions_to_peak(peaks, max_charge, tolerance, peptide, ["b", "y"])
    ion_labels = matched_ions.keys()
    residue_count = len(ming_psm_library.strip_sequence(peptide))

    break_numbers = set()
    for label in ion_labels:
        fields = label.split(":")
        series = fields[0]
        position = int(fields[1])
        int(fields[2])  # the charge field is converted but not used further

        if series == "b":
            break_numbers.add(position)
        elif series == "y":
            break_numbers.add(residue_count - position + 1)
        else:
            break_numbers.add(-1)

    return len(break_numbers)
def determine_b_y_breaks_total(peaks, max_charge, tolerance, peptide, SNR=2.0):
    """Return how many distinct break numbers the matched b/y ions cover.

    Peaks are run through ``filter_peaks_noise_or_window(peaks, SNR, 100, 20)``
    only when ``SNR`` exceeds 1.0.  ``map_ions_to_peak`` is then asked for
    the "b" and "y" ions; each key of its result is split on ``":"`` into an
    ion type, an ion number and a charge.

    Break number per ion type:
      * "b" -> the ion number
      * "y" -> stripped peptide length - ion number + 1
      * anything else -> -1
    """
    if SNR > 1.0:
        peaks = filter_peaks_noise_or_window(peaks, SNR, 100, 20)

    ion_hits = map_ions_to_peak(peaks, max_charge, tolerance, peptide, ["b", "y"])
    ion_names = ion_hits.keys()
    sequence_length = len(ming_psm_library.strip_sequence(peptide))

    def break_number_for(ion_name):
        # Translate one "type:number:charge" key into its break number.
        pieces = ion_name.split(":")
        ion_kind = pieces[0]
        ordinal = int(pieces[1])
        int(pieces[2])  # third field (charge) is converted but otherwise unused
        if ion_kind == "y":
            return sequence_length - ordinal + 1
        if ion_kind == "b":
            return ordinal
        return -1

    return len({break_number_for(name) for name in ion_names})
def get_tsv_line(self, output_mgf_filename=""):
    """Format this object as a single tab-separated line of 26 fields.

    ``output_mgf_filename`` becomes the first column.  The last columns
    include the stripped peptide length and the ratio of
    ``number_of_b_y_breaks`` to that length.

    Raises ZeroDivisionError when the stripped peptide is empty, because the
    ratio divides by the peptide length.
    """
    peptide_length = len(ming_psm_library.strip_sequence(self.peptide))
    break_fraction = float(self.number_of_b_y_breaks) / float(peptide_length)

    row_format = "%s\t%s\t%s\t%s\t%d\t%d\t%d\t%f\t%s\t%s\t%f\t%d\t%f\t%d\t%d\t%d\t%d\t%d\t%d\t%f\t%f\t%d\t%f\t%s\t%d\t%d"
    row_values = (
        # Where the spectrum lives and where it came from.
        output_mgf_filename,
        self.filename,
        self.originalfile_filename,
        self.originalfile_scan,
        self.index,
        self.scan,
        # Precursor and identification.
        self.charge,
        self.mz,
        self.peptide,
        self.protein,
        self.collision_energy,
        # Peak-level statistics.
        self.annotated_peaks,
        self.explained_intensity,
        self.signal_peaks,
        self.number_of_peaks_within_1_percent_of_max,
        self.number_of_peaks_within_5_percent_of_max,
        len(self.peaks),
        self.annotated_ions,
        self.number_of_b_y_breaks,
        # Scores and derived values.
        self.score,
        self.variant_score,
        peptide_length,
        break_fraction,
        # Provenance.
        self.proteosafe_task,
        self.num_spectra,
        self.spectrum_ranking,
    )
    return row_format % row_values
def create_library_spectrum(all_spectra, consensus_selection_method, score_cutoff_by_length, variant_to_score, library_candidates_output_dict, filter_peaks=False):
    """Build one library spectrum dict from the candidate spectra of a peptide.

    The annotation of the first entry in ``all_spectra`` determines the
    stripped peptide length, which selects the score cutoff from
    ``score_cutoff_by_length``.  Spectra scoring below that cutoff (minus a
    0.01 tolerance) are dropped.  The ``"peaks"`` value of every remaining
    spectrum is JSON-decoded in place, so the caller's dicts are modified.

    A representative is chosen only when ``consensus_selection_method`` is
    ``"MostSimilar_Combination_Score"``.  For any other value the
    representative stays ``None`` and the later subscripting raises
    ``TypeError``.

    The surviving candidates are passed to
    ``summarize_candidate_library_spectra`` together with
    ``library_candidates_output_dict``.  The representative is deep-copied
    before use.  When ``filter_peaks`` equals ``True`` its peaks are passed
    through ``ming_spectrum_library.filter_to_top_peaks(peaks, 100)``.

    Returns a dict with the representative's peaks, charge, annotation, mz,
    protein, score, source filename/scan, the number of candidates
    considered, the representative's ranking (count of candidates scoring at
    least as high), the variant score looked up under
    ``"<annotation>.<charge>"`` and the ``proteosafe_task`` (empty string
    when absent).
    """
    chosen = None

    annotation = all_spectra[0]["annotation"]
    peptide_length = len(ming_psm_library.strip_sequence(annotation))
    # The small delta absorbs floating point error in the stored cutoffs.
    min_score = score_cutoff_by_length[peptide_length] - 0.01

    candidates = [
        candidate for candidate in all_spectra
        if not candidate["score"] < min_score
    ]

    print("DEBUG", annotation, len(all_spectra), len(candidates), min_score)

    # Peaks arrive JSON-encoded; decode them on the candidate dicts themselves.
    for candidate in candidates:
        candidate["peaks"] = json.loads(candidate["peaks"])

    if consensus_selection_method == "MostSimilar_Combination_Score":
        chosen = choose_representative_spectrum_most_similary_combination_score(candidates)

    summarize_candidate_library_spectra(candidates, library_candidates_output_dict)

    # Work on a private copy so later edits do not touch the candidate.
    chosen = copy.deepcopy(chosen)

    # Kept as an equality test so only values equal to True enable filtering.
    if filter_peaks == True:  # noqa: E712
        chosen["peaks"] = ming_spectrum_library.filter_to_top_peaks(chosen["peaks"], 100)

    chosen_score = chosen["score"]
    ranking = sum(1 for candidate in candidates if candidate["score"] >= chosen_score)

    library_spectrum = {
        "peaks": chosen["peaks"],
        "charge": chosen["charge"],
        "annotation": chosen["annotation"],
        "mz": chosen["mz"],
        "protein": chosen["protein"],
        "score": chosen["score"],
        "spectra_to_consider": len(candidates),
        "ranking": ranking,
        "originalspectrum_filename": chosen["filename"],
        "originalspectrum_scan": chosen["scan"],
        "variant_score": variant_to_score[chosen["annotation"] + "." + str(chosen["charge"])],
        "proteosafe_task": chosen["proteosafe_task"] if "proteosafe_task" in chosen else "",
    }
    return library_spectrum
def get_tsv_line(self, output_mgf_filename=""):
    """Return this object's attributes as one tab-separated row.

    The row has 26 columns, starting with ``output_mgf_filename``.  Two
    columns are computed here: the length of the stripped peptide and
    ``number_of_b_y_breaks`` divided by that length.

    A peptide that strips to an empty string makes the division raise
    ZeroDivisionError.
    """
    stripped_length = len(ming_psm_library.strip_sequence(self.peptide))
    breaks_ratio = float(self.number_of_b_y_breaks) / float(stripped_length)

    columns = [
        output_mgf_filename,
        self.filename,
        self.originalfile_filename,
        self.originalfile_scan,
        self.index,
        self.scan,
        self.charge,
        self.mz,
        self.peptide,
        self.protein,
        self.collision_energy,
        self.annotated_peaks,
        self.explained_intensity,
        self.signal_peaks,
        self.number_of_peaks_within_1_percent_of_max,
        self.number_of_peaks_within_5_percent_of_max,
        len(self.peaks),
        self.annotated_ions,
        self.number_of_b_y_breaks,
        self.score,
        self.variant_score,
        stripped_length,
        breaks_ratio,
        self.proteosafe_task,
        self.num_spectra,
        self.spectrum_ranking,
    ]

    # One conversion specifier per entry in ``columns``, in the same order.
    template = "%s\t%s\t%s\t%s\t%d\t%d\t%d\t%f\t%s\t%s\t%f\t%d\t%f\t%d\t%d\t%d\t%d\t%d\t%d\t%f\t%f\t%d\t%f\t%s\t%d\t%d"
    return template % tuple(columns)
def process_ambiguity(psm_list, mangled_mapping, library_scans_to_identification, cutoff_dict):
    """Report, per library spectrum, how many distinct peptides pass the cutoff.

    PSMs are grouped by ``psm.filename + ":" + psm.scan``.  That same string
    is used to look up the library identification in
    ``library_scans_to_identification``.  Within a group, a PSM counts when
    its score is at least the cutoff found in
    ``cutoff_dict[proteosafe_task][str(stripped_peptide_length)]``.  When no
    such cutoff exists the PSM is compared against 100000 instead.

    Parameters:
        psm_list: iterable of PSM objects exposing ``filename``, ``scan``
            (both concatenated with ``":"``), ``annotation`` and ``score``.
        mangled_mapping: not used by this function; kept so existing callers
            keep working.
        library_scans_to_identification: mapping from ``"filename:scan"`` to
            a dict with the keys ``proteosafe_task``, ``peptide``,
            ``charge``, ``filename`` and ``spectrumscan``.
        cutoff_dict: mapping of task -> {peptide length as str -> cutoff}.

    Returns:
        A ``defaultdict(list)`` of parallel columns, one row per spectrum:
        ``ambiguity_category`` (always ``"N/A"``), ``library_peptide``,
        ``library_peptide_stripped``, ``library_charge``,
        ``observed_annotations`` (the number of distinct stripped sequences
        that passed), ``library_filename``, ``library_scan`` and
        ``alternative_peptide``.  The last one is ``"N/A"`` unless exactly
        two stripped sequences passed.  In that case it is the first
        modified annotation recorded for the sequence that is not the
        library peptide, or for the first sequence when the library peptide
        matches neither.

    Raises:
        KeyError: when a spectrum key or one of the expected identification
            keys is missing.
    """
    # Fallback cutoff for task/length combinations without a configured value.
    no_cutoff_score = 100000
    ambiguity_category = "N/A"

    results = defaultdict(list)

    # Group the PSMs by the spectrum they were matched against.
    psms_by_spectrum = defaultdict(list)
    for psm in psm_list:
        psms_by_spectrum[psm.filename + ":" + psm.scan].append(psm)

    for spectrum_key, spectrum_psms in psms_by_spectrum.items():
        # The grouping key is exactly the key used by the identification map.
        library_entry = library_scans_to_identification[spectrum_key]
        proteosafe_task = library_entry["proteosafe_task"]

        # The task is fixed for the whole group, so resolve its cutoffs once.
        task_cutoffs = cutoff_dict[proteosafe_task] if proteosafe_task in cutoff_dict else {}

        passing_sequences = set()
        variants_by_sequence = defaultdict(list)
        for psm in spectrum_psms:
            annotation = psm.annotation
            stripped = ming_psm_library.strip_sequence(
                ming_psm_library.remove_charges_from_annotation(annotation))
            length_key = str(len(stripped))
            cutoff_score = task_cutoffs[length_key] if length_key in task_cutoffs else no_cutoff_score

            if psm.score >= cutoff_score:
                passing_sequences.add(stripped)
                variants_by_sequence[stripped].append(annotation)

        library_peptide = library_entry["peptide"]
        library_charge = library_entry["charge"]
        library_peptide_stripped = ming_psm_library.strip_sequence(
            ming_psm_library.remove_charges_from_annotation(library_peptide))
        library_filename = library_entry["filename"]
        library_scan = library_entry["spectrumscan"]

        alternative_peptide = "N/A"
        if len(passing_sequences) == 2:
            first_sequence, second_sequence = passing_sequences
            if library_peptide_stripped == first_sequence:
                alternative_peptide = variants_by_sequence[second_sequence][0]
            else:
                alternative_peptide = variants_by_sequence[first_sequence][0]

        results["ambiguity_category"].append(ambiguity_category)
        results["library_peptide"].append(library_peptide)
        results["library_peptide_stripped"].append(library_peptide_stripped)
        results["library_charge"].append(library_charge)
        results["observed_annotations"].append(len(passing_sequences))
        results["library_filename"].append(library_filename)
        results["library_scan"].append(library_scan)
        results["alternative_peptide"].append(alternative_peptide)

    return results