コード例 #1
0
def protein_FDR_assign_scoring_by_variant_count(proteome, psms, peptide_to_protein_mapping):
    """Accumulate PSM scores onto the protein each peptide maps to uniquely.

    Every protein in ``proteome.protein_list`` first has ``score`` and
    ``number_precursors`` reset to 0. Each PSM annotation is then stripped
    with ``ming_psm_library.strip_sequence``, every "I" is replaced by "L",
    and the result is looked up in ``peptide_to_protein_mapping``. PSMs whose
    peptide maps to more than one protein are skipped; otherwise the PSM
    score is added to the protein and its precursor count is incremented.

    Args:
        proteome: object exposing ``protein_list`` and ``get_protein(name)``.
        psms: iterable of objects with ``annotation`` and ``score``.
        peptide_to_protein_mapping: mapping from the stripped, I->L
            normalised peptide sequence to a collection of protein names.

    Returns:
        None. The protein objects are updated in place.
    """
    # Reset the accumulators so repeated calls do not stack scores.
    for protein in proteome.protein_list:
        protein.score = 0
        protein.number_precursors = 0

    for psm in psms:
        annotation = psm.annotation
        score = psm.score
        stripped_sequence = ming_psm_library.strip_sequence(annotation).replace("I", "L")
        protein_list = peptide_to_protein_mapping[stripped_sequence]

        # Peptides shared by several proteins contribute to none of them.
        # NOTE: a gene-level uniqueness check used to follow this `continue`
        # but could never execute, so it has been removed.
        if len(protein_list) > 1:
            continue

        for protein_name in protein_list:
            # Names shorter than two characters are ignored
            # (presumably empty/placeholder entries -- TODO confirm).
            if len(protein_name) < 2:
                continue
            matched_protein = proteome.get_protein(protein_name)
            matched_protein.score += score
            matched_protein.number_precursors += 1
コード例 #2
0
def determine_b_y_breaks_total(peaks, max_charge, tolerance, peptide, SNR=2.0):
    """Count the distinct break indices covered by matched b and y ions.

    Args:
        peaks: peak list handed to ``filter_peaks_noise_or_window`` and
            ``map_ions_to_peak``.
        max_charge: forwarded to ``map_ions_to_peak``.
        tolerance: forwarded to ``map_ions_to_peak``.
        peptide: peptide annotation; its length is measured after
            ``ming_psm_library.strip_sequence``.
        SNR: when greater than 1.0, the peaks are first passed through
            ``filter_peaks_noise_or_window(peaks, SNR, 100, 20)``.

    Returns:
        int: number of distinct break indices among the matched ions.
    """
    if SNR > 1.0:
        peaks = filter_peaks_noise_or_window(peaks, SNR, 100, 20)

    ions_to_peaks_mapping = map_ions_to_peak(peaks, max_charge, tolerance, peptide, ["b", "y"])
    peptide_length = len(ming_psm_library.strip_sequence(peptide))

    # A set keeps each break index once, whichever ion series reported it.
    break_positions = set()
    for ion in ions_to_peaks_mapping:
        # Keys are split on ":"; field 0 is the ion type, field 1 its number.
        ion_fields = ion.split(":")
        ion_type = ion_fields[0]
        ion_number = int(ion_fields[1])

        if ion_type == "b":
            break_positions.add(ion_number)
        elif ion_type == "y":
            # NOTE(review): y_n is mapped to length - n + 1; confirm this is
            # the intended alignment with the b-ion numbering.
            break_positions.add(peptide_length - ion_number + 1)
        else:
            # Any other ion type is collapsed onto the sentinel index -1.
            break_positions.add(-1)

    return len(break_positions)
コード例 #3
0
def determine_b_y_breaks_total(peaks, max_charge, tolerance, peptide, SNR=2.0):
    """Return how many distinct break indices the matched b/y ions cover.

    Args:
        peaks: peak list handed to ``filter_peaks_noise_or_window`` and
            ``map_ions_to_peak``.
        max_charge: forwarded to ``map_ions_to_peak``.
        tolerance: forwarded to ``map_ions_to_peak``.
        peptide: peptide annotation; its length is measured after
            ``ming_psm_library.strip_sequence``.
        SNR: when greater than 1.0, the peaks are first passed through
            ``filter_peaks_noise_or_window(peaks, SNR, 100, 20)``.

    Returns:
        int: number of distinct break indices among the matched ions.
    """
    if SNR > 1.0:
        peaks = filter_peaks_noise_or_window(peaks, SNR, 100, 20)

    ions_to_peaks_mapping = map_ions_to_peak(peaks, max_charge, tolerance, peptide, ["b", "y"])
    peptide_length = len(ming_psm_library.strip_sequence(peptide))

    # Collect into a set so each break index is counted once.
    break_positions = set()
    for ion in ions_to_peaks_mapping:
        # Keys are split on ":"; field 0 is the ion type, field 1 its number.
        ion_fields = ion.split(":")
        ion_type = ion_fields[0]
        ion_number = int(ion_fields[1])

        if ion_type == "b":
            break_positions.add(ion_number)
        elif ion_type == "y":
            # NOTE(review): y_n is mapped to length - n + 1; confirm this is
            # the intended alignment with the b-ion numbering.
            break_positions.add(peptide_length - ion_number + 1)
        else:
            # Any other ion type is collapsed onto the sentinel index -1.
            break_positions.add(-1)

    return len(break_positions)
コード例 #4
0
 def get_tsv_line(self, output_mgf_filename=""):
     """Render this spectrum as a single tab-separated line of 26 columns.

     Args:
         output_mgf_filename: value written to the first column.

     Returns:
         str: the formatted line, without a trailing newline.

     Raises:
         ZeroDivisionError: if the stripped peptide sequence is empty.
     """
     stripped_length = len(ming_psm_library.strip_sequence(self.peptide))
     # Fraction of the stripped peptide length covered by b/y breaks.
     break_fraction = float(self.number_of_b_y_breaks) / float(stripped_length)

     row_template = "%s\t%s\t%s\t%s\t%d\t%d\t%d\t%f\t%s\t%s\t%f\t%d\t%f\t%d\t%d\t%d\t%d\t%d\t%d\t%f\t%f\t%d\t%f\t%s\t%d\t%d"
     row_values = (
         output_mgf_filename,
         self.filename,
         self.originalfile_filename,
         self.originalfile_scan,
         self.index,
         self.scan,
         self.charge,
         self.mz,
         self.peptide,
         self.protein,
         self.collision_energy,
         self.annotated_peaks,
         self.explained_intensity,
         self.signal_peaks,
         self.number_of_peaks_within_1_percent_of_max,
         self.number_of_peaks_within_5_percent_of_max,
         len(self.peaks),
         self.annotated_ions,
         self.number_of_b_y_breaks,
         self.score,
         self.variant_score,
         stripped_length,
         break_fraction,
         self.proteosafe_task,
         self.num_spectra,
         self.spectrum_ranking,
     )
     return row_template % row_values
コード例 #5
0
def create_library_spectrum(all_spectra,
                            consensus_selection_method,
                            score_cutoff_by_length,
                            variant_to_score,
                            library_candidates_output_dict,
                            filter_peaks=False):
    """Select a representative spectrum and describe it as a library entry.

    Spectra scoring below the length-specific cutoff are discarded, the
    remaining ones have their JSON-encoded peaks decoded, a representative
    is chosen with the requested consensus method, and the candidates are
    summarised into ``library_candidates_output_dict``.

    Side effect: the ``"peaks"`` entry of every spectrum that passes the
    cutoff is replaced in place by its decoded JSON value and is not
    re-encoded afterwards.

    Args:
        all_spectra: non-empty sequence of spectrum dicts; the first one
            supplies the annotation used to look up the score cutoff.
        consensus_selection_method: only "MostSimilar_Combination_Score"
            selects a representative.
        score_cutoff_by_length: mapping from stripped peptide length to the
            minimum score.
        variant_to_score: mapping from "<annotation>.<charge>" to the
            variant score.
        library_candidates_output_dict: passed through to
            ``summarize_candidate_library_spectra``.
        filter_peaks: when equal to True, the representative's peaks are
            reduced with ``ming_spectrum_library.filter_to_top_peaks(..., 100)``.

    Returns:
        dict: library spectrum with keys "peaks", "charge", "annotation",
        "mz", "protein", "score", "spectra_to_consider", "ranking",
        "originalspectrum_filename", "originalspectrum_scan",
        "variant_score" and "proteosafe_task".

    Raises:
        ValueError: if no representative spectrum was selected, either
            because the consensus method is unsupported or because the
            selector returned None.
    """
    sequence = all_spectra[0]["annotation"]
    stripped_sequence = ming_psm_library.strip_sequence(sequence)
    # The small delta absorbs floating point error at the cutoff boundary.
    score_cutoff = score_cutoff_by_length[len(stripped_sequence)] - 0.01

    # Written as "not <" so the selection matches the original skip
    # condition exactly, including for scores that do not order normally.
    spectra_to_consider = [
        spectrum for spectrum in all_spectra
        if not (spectrum["score"] < score_cutoff)
    ]

    print("DEBUG", sequence, len(all_spectra), len(spectra_to_consider),
          score_cutoff)

    # Decode all the spectrum peaks (mutates the input dicts).
    for spectrum in spectra_to_consider:
        spectrum["peaks"] = json.loads(spectrum["peaks"])

    representative_spectrum = None
    if consensus_selection_method == "MostSimilar_Combination_Score":
        representative_spectrum = choose_representative_spectrum_most_similary_combination_score(
            spectra_to_consider)

    # Summarizing
    summarize_candidate_library_spectra(spectra_to_consider,
                                        library_candidates_output_dict)

    # Fail with a clear message instead of an opaque "NoneType is not
    # subscriptable" error further down.
    if representative_spectrum is None:
        raise ValueError(
            "No representative spectrum selected for %s using consensus "
            "selection method %r" % (sequence, consensus_selection_method))

    # Work on a copy so peak filtering does not alter the candidate spectra.
    representative_spectrum = copy.deepcopy(representative_spectrum)

    # Filtering out noise in library spectra.
    # The explicit comparison is kept on purpose: only True (or a value
    # equal to it) enables filtering, exactly as before.
    if filter_peaks == True:
        representative_spectrum[
            "peaks"] = ming_spectrum_library.filter_to_top_peaks(
                representative_spectrum["peaks"], 100)

    # Rank = number of considered spectra scoring at least as high as the
    # representative.
    representative_score = representative_spectrum["score"]
    representative_ranking = sum(
        1 for spectrum in spectra_to_consider
        if spectrum["score"] >= representative_score)

    # Creating library spectra
    variant_key = representative_spectrum["annotation"] + "." + str(
        representative_spectrum["charge"])
    library_spectrum = {
        "peaks": representative_spectrum["peaks"],
        "charge": representative_spectrum["charge"],
        "annotation": representative_spectrum["annotation"],
        "mz": representative_spectrum["mz"],
        "protein": representative_spectrum["protein"],
        "score": representative_spectrum["score"],
        "spectra_to_consider": len(spectra_to_consider),
        "ranking": representative_ranking,
        "originalspectrum_filename": representative_spectrum["filename"],
        "originalspectrum_scan": representative_spectrum["scan"],
        "variant_score": variant_to_score[variant_key],
    }
    # The task id is optional on the spectrum; default to an empty string.
    if "proteosafe_task" in representative_spectrum:
        library_spectrum["proteosafe_task"] = representative_spectrum[
            "proteosafe_task"]
    else:
        library_spectrum["proteosafe_task"] = ""

    return library_spectrum
コード例 #6
0
 def get_tsv_line(self, output_mgf_filename=""):
     """Format this spectrum's fields as one tab-separated row (26 columns).

     Args:
         output_mgf_filename: value written to the first column.

     Returns:
         str: the formatted row, without a trailing newline.

     Raises:
         ZeroDivisionError: if the stripped peptide sequence is empty.
     """
     peptide_residue_count = len(ming_psm_library.strip_sequence(self.peptide))
     # Share of the stripped peptide length covered by b/y breaks.
     breaks_per_residue = float(self.number_of_b_y_breaks) / float(peptide_residue_count)

     line_format = "%s\t%s\t%s\t%s\t%d\t%d\t%d\t%f\t%s\t%s\t%f\t%d\t%f\t%d\t%d\t%d\t%d\t%d\t%d\t%f\t%f\t%d\t%f\t%s\t%d\t%d"
     columns = [
         output_mgf_filename,
         self.filename,
         self.originalfile_filename,
         self.originalfile_scan,
         self.index,
         self.scan,
         self.charge,
         self.mz,
         self.peptide,
         self.protein,
         self.collision_energy,
         self.annotated_peaks,
         self.explained_intensity,
         self.signal_peaks,
         self.number_of_peaks_within_1_percent_of_max,
         self.number_of_peaks_within_5_percent_of_max,
         len(self.peaks),
         self.annotated_ions,
         self.number_of_b_y_breaks,
         self.score,
         self.variant_score,
         peptide_residue_count,
         breaks_per_residue,
         self.proteosafe_task,
         self.num_spectra,
         self.spectrum_ranking,
     ]
     return line_format % tuple(columns)
コード例 #7
0
def process_ambiguity(psm_list, mangled_mapping, library_scans_to_identification, cutoff_dict):
    """Report, per library spectrum, how many distinct peptides pass its cutoff.

    PSMs are grouped by "<filename>:<scan>". Within each group, a PSM counts
    as observed when its score reaches the cutoff found in
    ``cutoff_dict[proteosafe_task][str(stripped_length)]``; when no such
    cutoff exists the threshold is 100000. When exactly two distinct
    stripped sequences are observed, the one that is not the library
    peptide is reported as the alternative peptide.

    Args:
        psm_list: iterable of PSM objects with ``filename``, ``scan``
            (both strings, as they are concatenated), ``annotation`` and
            ``score``.
        mangled_mapping: unused; kept for interface compatibility.
        library_scans_to_identification: mapping from "<filename>:<scan>"
            to a dict with "proteosafe_task", "peptide", "charge",
            "filename" and "spectrumscan".
        cutoff_dict: mapping task -> {length as string -> cutoff score}.

    Returns:
        defaultdict(list): column name -> list of values, one entry per
        spectrum group. Columns are "ambiguity_category",
        "library_peptide", "library_peptide_stripped", "library_charge",
        "observed_annotations" (count of distinct stripped sequences),
        "library_filename", "library_scan" and "alternative_peptide".
    """
    output_results_dict = defaultdict(list)

    # Grouping by scan and filename
    psm_by_filename_scan = defaultdict(list)
    for psm in psm_list:
        psm_by_filename_scan[psm.filename + ":" + psm.scan].append(psm)

    for key, spectrum_psms in psm_by_filename_scan.items():
        # The grouping key already is "<filename>:<scan>", so it doubles as
        # the lookup key for the library identification.
        library_identification_object = library_scans_to_identification[key]
        proteosafe_task = library_identification_object["proteosafe_task"]

        # Stripped sequence -> annotations that passed the cutoff.
        sequence_to_variant_map = defaultdict(list)

        for psm in spectrum_psms:
            annotation = psm.annotation
            stripped_annotation = ming_psm_library.strip_sequence(
                ming_psm_library.remove_charges_from_annotation(annotation))

            # Cutoffs are keyed by the peptide length rendered as a string.
            peptide_length = str(len(stripped_annotation))
            # Without a matching cutoff, the threshold stays at 100000.
            cutoff_score = 100000
            if proteosafe_task in cutoff_dict:
                task_cutoffs = cutoff_dict[proteosafe_task]
                if peptide_length in task_cutoffs:
                    cutoff_score = task_cutoffs[peptide_length]

            if psm.score >= cutoff_score:
                sequence_to_variant_map[stripped_annotation].append(annotation)

        library_peptide = library_identification_object["peptide"]
        library_peptide_stripped = ming_psm_library.strip_sequence(
            ming_psm_library.remove_charges_from_annotation(library_peptide))
        library_charge = library_identification_object["charge"]
        library_filename = library_identification_object["filename"]
        library_scan = library_identification_object["spectrumscan"]
        observed_count = len(sequence_to_variant_map)

        # Categorisation is currently disabled, so the category is always "N/A".
        ambiguity_category = "N/A"
        alternative_peptide = "N/A"

        if observed_count == 2:
            # Sort so the choice below is reproducible across runs; plain
            # set iteration order varies with string hash randomisation.
            first_sequence, second_sequence = sorted(sequence_to_variant_map)

            if library_peptide_stripped == first_sequence:
                alternative_peptide = sequence_to_variant_map[second_sequence][0]
            else:
                alternative_peptide = sequence_to_variant_map[first_sequence][0]

        output_results_dict["ambiguity_category"].append(ambiguity_category)
        output_results_dict["library_peptide"].append(library_peptide)
        output_results_dict["library_peptide_stripped"].append(library_peptide_stripped)
        output_results_dict["library_charge"].append(library_charge)
        output_results_dict["observed_annotations"].append(observed_count)
        output_results_dict["library_filename"].append(library_filename)
        output_results_dict["library_scan"].append(library_scan)
        output_results_dict["alternative_peptide"].append(alternative_peptide)

    return output_results_dict