def find_matches_in_dataset(dataset_id, input_spectrum_collection):
    """Search every input spectrum against one dataset's clustered MGF.

    Parameters:
        dataset_id: MassIVE/GNPS dataset accession; used to build the path
            <PATH_TO_DATASET_UPLOADS>/<dataset_id>/clustered/<dataset_id>_specs_ms.mgf
        input_spectrum_collection: loaded SpectrumCollection of query spectra.

    Returns a list of match objects whose ``filename`` is rewritten to the
    dataset-relative path of the clustered MGF; empty list when the dataset
    has no clustered MGF on disk.
    """
    dataset_match_list = []
    path_to_clustered_mgf = os.path.join(PATH_TO_DATASET_UPLOADS, dataset_id,
                                         "clustered",
                                         dataset_id + "_specs_ms.mgf")
    relative_user_path_to_clustered = os.path.join(
        dataset_id, "clustered", dataset_id + "_specs_ms.mgf")

    # Dataset was never clustered -- nothing to search against.
    if not ming_fileio_library.is_path_present(path_to_clustered_mgf):
        return dataset_match_list

    dataset_clustered_spectra = ming_spectrum_library.SpectrumCollection(
        path_to_clustered_mgf)
    dataset_clustered_spectra.load_from_file()

    for myspectrum in input_spectrum_collection.spectrum_list:
        # Constants preserved from original: 1.0 Da precursor tolerance,
        # 1.0 Da fragment tolerance, >= 6 matched peaks, cosine >= 0.7, top 1.
        match_list = dataset_clustered_spectra.search_spectrum(
            myspectrum, 1.0, 1.0, 6, 0.7, 1)
        for match in match_list:
            # Report paths relative to the dataset uploads root.
            match.filename = relative_user_path_to_clustered
        dataset_match_list += match_list

    # BUGFIX: was a Python 2 print statement, a SyntaxError under Python 3
    # and inconsistent with the print() calls used elsewhere in this file.
    print("Dataset matches: " + str(len(dataset_match_list)))
    return dataset_match_list
def filtering_out_high_scoring_decoys(input_decoy_psms, input_target_psms, target_filename, other_filename):
    """Drop top-scoring decoy PSMs that look like real (target) spectra.

    For each of the 200 best-scoring decoy PSMs, if a target PSM exists with
    the same annotation+charge, the decoy spectrum is compared to the best
    target spectrum by annotated-ion cosine; decoys with cosine >= 0.7 are
    removed.  All remaining decoys pass through unchanged.

    Parameters:
        input_decoy_psms: decoy PSM objects (need .sorting_value(), .annotation,
            .charge, .score, .scan).
        input_target_psms: target PSM objects (same attributes).
        target_filename: mzXML file containing the target spectra.
        other_filename: mzXML file containing the decoy spectra.

    Returns the filtered list of decoy PSMs.
    """
    # Best decoys first, so the top-200 slice below covers the highest scores.
    input_decoy_psms = sorted(input_decoy_psms, key=lambda psm: psm.sorting_value(), reverse=True)
    print(target_filename, other_filename)
    target_collection = ming_spectrum_library.SpectrumCollection(target_filename)
    target_collection.load_from_mzXML(drop_ms1=True)
    decoy_collection = ming_spectrum_library.SpectrumCollection(other_filename)
    decoy_collection.load_from_mzXML(drop_ms1=True)

    # Best-scoring target PSM per "annotation.charge" key.
    top_scoring_precursor_target = {}
    for psm in input_target_psms:
        annotation = psm.annotation
        charge = psm.charge
        key = annotation + "." + str(charge)
        top_psm = psm
        if key in top_scoring_precursor_target:
            top_psm = top_scoring_precursor_target[key]
        else:
            top_scoring_precursor_target[key] = psm
        if psm.score > top_psm.score:
            top_scoring_precursor_target[key] = psm

    output_decoys_list = []
    # Only the 200 highest-scoring decoys are spectrum-compared (hardcoded cap).
    for psm in input_decoy_psms[:200]:
        annotation = psm.annotation
        charge = psm.charge
        key = annotation + "." + str(charge)
        if key in top_scoring_precursor_target:
            print(key, psm.score, psm.scan)
            print(annotation, annotation[:-2])
            decoy_spectrum = decoy_collection.scandict[int(psm.scan)]
            target_spectrum = target_collection.scandict[int(top_scoring_precursor_target[key].scan)]
            # 0.1 Da fragment tolerance; charge capped at 3 for ion annotation.
            cosine_score = spectrum_alignment.score_alignment_annotated_ion_peaks(decoy_spectrum.peaks, target_spectrum.peaks, 0, 0, 0.1, annotation, annotation, min(3, charge))
            print(cosine_score)
            # Keep the decoy only if it does NOT resemble the target spectrum.
            if cosine_score < 0.7:
                output_decoys_list.append(psm)
        else:
            # No target with this precursor -- decoy kept unconditionally.
            output_decoys_list.append(psm)
    # Everything past the top 200 is kept as-is.
    for psm in input_decoy_psms[200:]:
        output_decoys_list.append(psm)
    return output_decoys_list
def main():
    """Filter peptide library spectra to proteins passing protein-level FDR,
    rewrite each spectrum's protein from the peptide->protein map, and save
    the library as .mgf/.tsv/.sptxt.

    argv: [1] input_folder  [2] protein FDR file  [3] peptide->protein mapping
          [4] output mgf folder  [5] output tsv folder  [6] output sptxt folder
          [7] output filename prefix
    """
    input_folder = sys.argv[1]
    input_protein_fdr_filename = sys.argv[2]
    input_peptide_protein_mapping_filename = sys.argv[3]
    precursor_to_protein_map = load_precursor_to_protein_mapping(
        input_peptide_protein_mapping_filename)
    included_proteins = proteins_to_include(input_protein_fdr_filename)
    output_mgf_folder = sys.argv[4]
    output_tsv_folder = sys.argv[5]
    output_sptxt_folder = sys.argv[6]
    output_filename_prefix = sys.argv[7]

    input_files = ming_fileio_library.list_files_in_dir(input_folder)
    all_library_spectra = []
    for input_filename in input_files:
        temp_spectra = ming_spectrum_library.load_mgf_peptide_library(
            input_filename)
        print("loaded ", len(temp_spectra), "from", input_filename)
        for spectrum in temp_spectra:
            peptide = spectrum.peptide
            protein = spectrum.protein
            # All proteins mapped to this peptide; keep the spectrum only if
            # at least one of them passed protein-level FDR.
            new_proteins_set = set(
                precursor_to_protein_map[peptide].split(";"))
            if len(new_proteins_set.intersection(included_proteins)) == 0:
                continue
            # Decoy spectra keep their CREATION_FALSE_PROTEIN label.
            if protein != "CREATION_FALSE_PROTEIN":
                spectrum.protein = precursor_to_protein_map[peptide]
            all_library_spectra.append(spectrum)

    library_spectrum_collection_split = ming_spectrum_library.SpectrumCollection(
        "library spectra")
    library_spectrum_collection_split.spectrum_list = all_library_spectra

    output_tsv_filename = os.path.join(output_tsv_folder,
                                       output_filename_prefix + ".tsv")
    output_mgf_filename = os.path.join(output_mgf_folder,
                                       output_filename_prefix + ".mgf")
    # BUGFIX: the .sptxt path was joined against output_mgf_folder, leaving
    # argv[6] (output_sptxt_folder) unused and dropping the sptxt in the
    # wrong directory.
    output_sptxt_filename = os.path.join(output_sptxt_folder,
                                         output_filename_prefix + ".sptxt")

    library_spectrum_collection_split.save_to_mgf(
        open(output_mgf_filename, "w"))
    library_spectrum_collection_split.save_to_tsv(
        open(output_tsv_filename, "w"), output_mgf_filename)
    library_spectrum_collection_split.save_to_sptxt(
        open(output_sptxt_filename, "w"))
def finding_matches_in_public_data(input_spectra_filename, all_datasets):
    """Search an input spectrum file against the clustered spectra of all
    GNPS-titled public datasets.

    Parameters:
        input_spectra_filename: path to the query spectrum file.
        all_datasets: iterable of dataset dicts with "title" and "dataset" keys.

    Returns a dict: dataset_id -> {"matches": [match, ...]}.
    """
    all_matches_to_datasets_map = {}
    input_spectrum_collection = ming_spectrum_library.SpectrumCollection(
        input_spectra_filename)
    input_spectrum_collection.load_from_file()

    total_matches = 0
    search_parameters = []
    for dataset in all_datasets:
        # Only datasets whose title mentions GNPS are considered.
        if dataset["title"].upper().find("GNPS") == -1:
            continue
        dataset_id = dataset["dataset"]
        search_parameters.append({
            "dataset_id": dataset_id,
            "input_spectrum_collection": input_spectrum_collection
        })

    #Doing search serial
    search_results = []
    for search_param in search_parameters:
        # Early exit: stop after the first dataset that yields any matches.
        if total_matches > 0:
            continue
        dataset_matches = find_matches_in_dataset_wrapper(search_param)
        search_results.append(dataset_matches)
        total_matches += len(dataset_matches)
        # BUGFIX: was a Python 2 print statement (SyntaxError on Python 3).
        print("SEARCHING " + str(search_param))

    print("datasets to consider: " + str(len(search_parameters)))

    #Parallel
    #search_results = ming_parallel_library.run_parallel_job(find_matches_in_dataset_wrapper, search_parameters, 10)

    #formatting output
    for i in range(len(search_results)):
        dataset_matches = search_results[i]
        dataset_id = search_parameters[i]["dataset_id"]
        # BUGFIX: was a Python 2 print statement (SyntaxError on Python 3).
        print("outputting: " + str(search_parameters[i]))
        all_matches_to_datasets_map[dataset_id] = {"matches": dataset_matches}

    return all_matches_to_datasets_map
def find_matches_in_file(input_spectrum_collection, dataset_filepath, relative_dataset_filepath, match_parameters, top_k=1):
    """Search all input spectra against one repository spectrum file.

    Parameters:
        input_spectrum_collection: loaded SpectrumCollection of query spectra.
        dataset_filepath: absolute path to the repository spectrum file.
        relative_dataset_filepath: path stamped into each returned match.
        match_parameters: dict with FILTER_WINDOW, FILTER_PRECURSOR,
            PM_TOLERANCE, FRAGMENT_TOLERANCE, MIN_MATCHED_PEAKS, MIN_COSINE,
            ANALOG_SEARCH keys.
        top_k: number of top hits to keep per query spectrum.

    Returns a list of match dicts (possibly empty on missing/unreadable files).
    """
    dataset_match_list = []
    if not ming_fileio_library.is_path_present(dataset_filepath):
        print("Cant find", dataset_filepath)
        return dataset_match_list

    dataset_query_spectra = ming_spectrum_library.SpectrumCollection(
        dataset_filepath)
    try:
        dataset_query_spectra.load_from_file()
    # FIX: was a bare `except:`, which also swallowed KeyboardInterrupt and
    # SystemExit; unreadable files still yield an empty result.
    except Exception:
        return dataset_match_list

    # Optional peak filtering of the repository spectra (applied once).
    for repo_spectrum in dataset_query_spectra.spectrum_list:
        if match_parameters["FILTER_WINDOW"]:
            repo_spectrum.window_filter_peaks(50, 6)
        if match_parameters["FILTER_PRECURSOR"]:
            repo_spectrum.filter_precursor_peaks()

    for myspectrum in input_spectrum_collection.spectrum_list:
        # NOTE: query spectra are re-filtered once per file this function is
        # called with -- the filters are assumed idempotent.
        if match_parameters["FILTER_WINDOW"]:
            myspectrum.window_filter_peaks(50, 6)
        if match_parameters["FILTER_PRECURSOR"]:
            myspectrum.filter_precursor_peaks()
        try:
            match_list = dataset_query_spectra.search_spectrum(
                myspectrum,
                match_parameters["PM_TOLERANCE"],
                match_parameters["FRAGMENT_TOLERANCE"],
                match_parameters["MIN_MATCHED_PEAKS"],
                match_parameters["MIN_COSINE"],
                analog_search=match_parameters["ANALOG_SEARCH"],
                top_k=top_k)
            for match in match_list:
                match["filename"] = relative_dataset_filepath
            dataset_match_list += match_list
        # FIX: narrowed from a bare `except:`; a failed search for one
        # spectrum is still best-effort and does not abort the file.
        except Exception:
            print("Error in Matching")

    print("Dataset matches: " + str(len(dataset_match_list)))
    return dataset_match_list
def find_matches_in_dataset(dataset_id, input_spectrum_collection, identification_map):
    """Search input spectra against every peak file of a dataset and return
    match dicts enriched with blank-flagging and compound identifications.

    Parameters:
        dataset_id: dataset accession under PATH_TO_DATASET_UPLOADS.
        input_spectrum_collection: loaded SpectrumCollection of query spectra.
        identification_map: scan -> {"identification", "spectrum_id"} for
            spectra already identified in this dataset.

    Returns a list of plain dicts (one per match).
    """
    dataset_match_list = []
    path_to_peak_collection = os.path.join(PATH_TO_DATASET_UPLOADS, dataset_id, "peak")
    peak_files = ming_fileio_library.list_files_in_dir(path_to_peak_collection)
    for input_file in peak_files:
        print(input_file)
        relative_user_path_to_file = os.path.relpath(input_file, PATH_TO_DATASET_UPLOADS)
        reference_spectra = ming_spectrum_library.SpectrumCollection(
            input_file)
        reference_spectra.load_from_mzXML(drop_ms1=True)
        # Heuristic: filenames containing "blank" mark blank runs.
        is_blank = 0
        if input_file.find("blank") != -1:
            is_blank = 1
        for myspectrum in input_spectrum_collection.spectrum_list:
            # 1.0 Da precursor/fragment tolerance, >= 4 matched peaks,
            # cosine >= 0.7, top 1 hit.
            match_list = reference_spectra.search_spectrum(
                myspectrum, 1.0, 1.0, 4, 0.7, 1)
            for match in match_list:
                match_obj = {}
                match_obj["filename"] = relative_user_path_to_file
                match_obj["scan"] = match.scan
                match_obj["score"] = match.score
                match_obj["query_filename"] = match.query_filename
                match_obj["query_scan"] = match.query_scan
                match_obj["ppm_error"] = match.ppm_error
                match_obj["is_blank"] = is_blank
                match_obj["dataset_id"] = dataset_id

                #compound identification
                if match.scan in identification_map:
                    match_obj["identification"] = identification_map[
                        match.scan]["identification"]
                    match_obj["spectrum_id"] = identification_map[
                        match.scan]["spectrum_id"]
                else:
                    match_obj["identification"] = ""
                    match_obj["spectrum_id"] = ""

                dataset_match_list.append(match_obj)
    return dataset_match_list
def get_spectrum_collection_from_param_obj(param_obj):
    """Build a one-spectrum SpectrumCollection from proteosafe parameters.

    Parameters:
        param_obj: parsed params dict; "precursor_mz" and "spectrum_string"
            are single-element lists, the latter holding "mass intensity"
            pairs separated by newlines.

    Returns a SpectrumCollection containing a single Spectrum whose peaks
    are sorted by mass.
    """
    precursor_mz = float(param_obj["precursor_mz"][0])
    spectrum_string = param_obj["spectrum_string"][0]
    peaks_lines = spectrum_string.split("\n")
    peak_list = []
    for peak_line in peaks_lines:
        splits = peak_line.split()
        # ROBUSTNESS FIX: blank lines (e.g. a trailing newline in the form
        # field) or malformed lines used to raise IndexError; skip them.
        if len(splits) < 2:
            continue
        mass = float(splits[0])
        intensity = float(splits[1])
        peak_list.append([mass, intensity])
    peak_list = sorted(peak_list, key=lambda peak: peak[0])
    # Constants preserved: scan 1, index 0, charge 1, ms level 2.
    spectrum_obj = ming_spectrum_library.Spectrum("search_spectrum.mgf", 1, 0,
                                                  peak_list, precursor_mz, 1, 2)
    spectrum_collection = ming_spectrum_library.SpectrumCollection("search_spectrum.mgf")
    spectrum_collection.spectrum_list = [spectrum_obj]
    return spectrum_collection
def main():
    """Convert an intermediate JSON spectrum dump into .mgf/.tsv/.sptxt
    peptide library files.

    argv: [1] intermediate JSON file  [2] output tsv folder
          [3] output mgf folder  [4] output sptxt folder
    """
    input_intermediate_file = sys.argv[1]
    output_tsv_folder = sys.argv[2]
    output_mgf_folder = sys.argv[3]
    output_sptxt_folder = sys.argv[4]

    # BUGFIX: removed a dead line that listed files from an undefined
    # `input_intermediate_folder` variable (NameError); its result was
    # never used.

    library_spectrum_collection = ming_spectrum_library.SpectrumCollection("library spectra")

    all_json_spectra_list = json.load(open(input_intermediate_file))
    print("Loaded", input_intermediate_file, len(all_json_spectra_list))

    # BUGFIX: loop iterated an undefined `list_of_library_spectra`;
    # it should iterate the JSON list loaded above.
    for library_spectrum in all_json_spectra_list:
        lib_spec = ming_spectrum_library.PeptideLibrarySpectrum(
            "", 0, 0, library_spectrum["peaks"], library_spectrum["mz"],
            library_spectrum["charge"], library_spectrum["annotation"],
            library_spectrum["protein"])
        # Optional fields -- copy only when present in the JSON record.
        if "score" in library_spectrum:
            lib_spec.score = library_spectrum["score"]
        if "variant_score" in library_spectrum:
            lib_spec.variant_score = library_spectrum["variant_score"]
        if "spectra_to_consider" in library_spectrum:
            lib_spec.num_spectra = library_spectrum["spectra_to_consider"]
        if "ranking" in library_spectrum:
            lib_spec.spectrum_ranking = library_spectrum["ranking"]
        if "proteosafe_task" in library_spectrum:
            lib_spec.proteosafe_task = library_spectrum["proteosafe_task"]
        if "originalspectrum_filename" in library_spectrum:
            lib_spec.originalfile_filename = library_spectrum["originalspectrum_filename"]
        if "originalspectrum_scan" in library_spectrum:
            lib_spec.originalfile_scan = str(library_spectrum["originalspectrum_scan"])
        library_spectrum_collection.spectrum_list.append(lib_spec)

    basename_no_ext = os.path.splitext(os.path.basename(input_intermediate_file))[0]
    output_mgf_filename = os.path.join(output_mgf_folder, basename_no_ext + ".mgf")
    # BUGFIX: tsv and sptxt paths were joined against `output_tsv_filename`
    # (used before assignment / a file path, not a folder); use the folders
    # read from argv instead.
    output_tsv_filename = os.path.join(output_tsv_folder, basename_no_ext + ".tsv")
    output_sptxt_filename = os.path.join(output_sptxt_folder, basename_no_ext + ".sptxt")

    # BUGFIX: saves referenced an undefined `library_spectrum_collection_split`.
    library_spectrum_collection.save_to_mgf(open(output_mgf_filename, "w"))
    library_spectrum_collection.save_to_tsv(open(output_tsv_filename, "w"),
                                            output_mgf_filename)
    try:
        library_spectrum_collection.save_to_sptxt(open(output_sptxt_filename, "w"))
    except Exception:
        # Best-effort sptxt output; log the failure but do not abort.
        traceback.print_exc(file=sys.stdout)
        print("MEH")
def main():
    """Rewrite each library spectrum's protein from the peptide->protein map
    and save the combined library as .tsv and .mgf.

    argv: [1] input folder of library MGFs  [2] protein FDR file (read into
          argv slot but not otherwise used here)  [3] peptide->protein mapping
          [4] output mgf folder  [5] output tsv folder  [6] filename prefix
    """
    input_folder = sys.argv[1]
    input_protein_fdr_filename = sys.argv[2]
    input_peptide_protein_mapping_filename = sys.argv[3]
    precursor_to_protein_map = load_precursor_to_protein_mapping(
        input_peptide_protein_mapping_filename)
    output_mgf_folder = sys.argv[4]
    output_tsv_folder = sys.argv[5]
    output_filename_prefix = sys.argv[6]
    input_files = ming_fileio_library.list_files_in_dir(input_folder)
    all_library_spectra = []
    for input_filename in input_files:
        temp_spectra = ming_spectrum_library.load_mgf_peptide_library(
            input_filename)
        print("loaded ", len(temp_spectra), "from", input_filename)
        for spectrum in temp_spectra:
            peptide = spectrum.peptide
            protein = spectrum.protein
            # Decoy spectra keep their CREATION_FALSE_PROTEIN label untouched.
            if protein == "CREATION_FALSE_PROTEIN":
                continue
            spectrum.protein = precursor_to_protein_map[peptide]
        # NOTE(review): the whole file's spectrum list is appended here, so
        # the `continue` above only skips the protein rewrite -- decoy
        # spectra are still included in the output.  A sibling script in
        # this file appends per-spectrum instead; confirm which behavior is
        # intended before changing.
        all_library_spectra += temp_spectra
    library_spectrum_collection_split = ming_spectrum_library.SpectrumCollection(
        "library spectra")
    library_spectrum_collection_split.spectrum_list = all_library_spectra
    output_tsv_filename = os.path.join(output_tsv_folder,
                                       output_filename_prefix + ".tsv")
    output_mgf_filename = os.path.join(output_mgf_folder,
                                       output_filename_prefix + ".mgf")
    library_spectrum_collection_split.save_to_mgf(
        open(output_mgf_filename, "w"))
    library_spectrum_collection_split.save_to_tsv(
        open(output_tsv_filename, "w"), output_mgf_filename)
def finding_matches_in_public_data(input_spectra_filename, all_datasets, identification_dict):
    """Search an input spectrum file against every dataset's peak files.

    Parameters:
        input_spectra_filename: path to the query spectrum file.
        all_datasets: iterable of dataset dicts with a "dataset" key.
        identification_dict: per-dataset identification map passed through to
            the per-dataset search.

    Returns a flat list of all match dicts across datasets.
    """
    input_spectrum_collection = ming_spectrum_library.SpectrumCollection(
        input_spectra_filename)
    input_spectrum_collection.load_from_file()

    total_matches = 0
    search_parameters = []
    for dataset in all_datasets:
        dataset_id = dataset["dataset"]
        search_parameters.append({
            "dataset_id": dataset_id,
            "input_spectrum_collection": input_spectrum_collection,
            "identification_dict": identification_dict
        })

    #Doing search serial
    search_results = []
    for search_param in search_parameters:
        #if total_matches > 0:
        #    continue
        print("SEARCHING " + str(search_param))
        dataset_matches = find_matches_in_dataset_wrapper(search_param)
        search_results.append(dataset_matches)
        total_matches += len(dataset_matches)

    print("datasets to consider: " + str(len(search_parameters)))

    #Parallel
    #search_results = ming_parallel_library.run_parallel_job(find_matches_in_dataset_wrapper, search_parameters, 10)

    #formatting output
    all_matches = []
    for i in range(len(search_results)):
        dataset_matches = search_results[i]
        # BUGFIX: was a Python 2 print statement, a SyntaxError under
        # Python 3 and inconsistent with the print() calls above.
        print("outputting: " + str(search_parameters[i]))
        all_matches += dataset_matches

    return all_matches
def get_file_stats(input_filename):
    """Return one metadata dict per MS2 spectrum in a file.

    Each dict holds: fragmentation method, source file basename, collision
    energy, and scan number.  An unreadable file logs a message and yields
    an empty list.
    """
    output_list = []
    spectrum_collection = ming_spectrum_library.SpectrumCollection(
        input_filename)
    try:
        spectrum_collection.load_from_file(drop_ms1=True)
    except KeyboardInterrupt:
        raise
    # FIX: was a bare `except:`, which also swallowed SystemExit; load
    # failures remain best-effort (empty spectrum list, empty result).
    except Exception:
        print("Cannot load", input_filename)
    for spectrum in spectrum_collection.spectrum_list:
        output_dict = {}
        # NOTE: "fragmenation_method" mirrors the (misspelled) attribute name
        # exposed by ming_spectrum_library -- do not "correct" it here.
        output_dict["fragmentation"] = spectrum.fragmenation_method
        output_dict["filename"] = os.path.basename(input_filename)
        output_dict["collision_energy"] = spectrum.collision_energy
        output_dict["scan"] = spectrum.scan
        output_list.append(output_dict)
    return output_list
import sys
import getopt
import argparse

import ming_spectrum_library

# Command-line interface for the spectrum filtering script.
parser = argparse.ArgumentParser(description='Filter Spectra')
parser.add_argument('input_mgf_filename', help='input_mgf_filename')
parser.add_argument('output_mgf_filename', help='output_mgf_filename')
parser.add_argument('output_filtered_mgf_filename', help='output_filtered_mgf_filename')
parser.add_argument('--FILTER_PRECURSOR_WINDOW', help='0', default=None)
parser.add_argument('--WINDOW_FILTER', help='0', default=None)
args = parser.parse_args()

# Load the spectra
spectrum_collection = ming_spectrum_library.SpectrumCollection(
    args.input_mgf_filename)
spectrum_collection.load_from_mgf()

# Making sure to renumber: sort by scan so downstream renumbering is stable.
spectrum_list = sorted(spectrum_collection.spectrum_list,
                       key=lambda spectrum: int(spectrum.scan))

included_scans = set()
spectrum_dict = {}
for spectrum in spectrum_list:
    spectrum_dict[int(spectrum.scan)] = spectrum
    included_scans.add(int(spectrum.scan))

# ROBUSTNESS FIX: max() of an empty set raises ValueError; an empty input
# MGF now yields max_scan = 0 instead of crashing.
max_scan = max(included_scans) if included_scans else 0

output_spectrum_list = []
def main():
    """Build the per-cluster info summary TSV from a consensus feature
    quantification table, an MGF of cluster spectra, and optional group
    metadata.

    Output columns include precursor/parent mass, charge, retention times,
    per-group abundances (GNPSGROUP:* and G1..G6), attribute membership,
    and a GNPS linkout per cluster.
    """
    parser = argparse.ArgumentParser(
        description='Creating Clustering Info Summary')
    parser.add_argument('params_xml', help='params_xml')
    parser.add_argument('consensus_feature_file',
                        help='Consensus Quantification File')
    parser.add_argument('metadata_folder', help='metadata metadata_folder')
    parser.add_argument('mgf_filename', help='mgf_filename')
    parser.add_argument('output_clusterinfo_summary', help='output file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.params_xml))
    task_id = param_obj["task"][0]

    group_to_files_mapping = defaultdict(list)
    attributes_to_groups_mapping = defaultdict(set)
    # Group/attribute mappings are loaded only when exactly one metadata
    # file is present.
    metadata_files = glob.glob(os.path.join(args.metadata_folder, "*"))
    if len(metadata_files) == 1:
        group_to_files_mapping, attributes_to_groups_mapping = load_group_attribute_mappings(
            metadata_files[0])

    # Workflow parameters with defaults when absent from params.xml.
    ROW_NORMALIZATION = "None"
    try:
        ROW_NORMALIZATION = param_obj["QUANT_FILE_NORM"][0]
    except:
        ROW_NORMALIZATION = "None"

    GROUP_COUNT_AGGREGATE_METHOD = "Mean"
    try:
        GROUP_COUNT_AGGREGATE_METHOD = param_obj[
            "GROUP_COUNT_AGGREGATE_METHOD"][0]
    except:
        GROUP_COUNT_AGGREGATE_METHOD = "Mean"

    quantification_df = pd.read_csv(args.consensus_feature_file)
    quantification_list = quantification_df.to_dict(orient="records")
    input_filenames, input_filename_headers = determine_input_files(
        quantification_list[0].keys())

    ### Filling in Quantification table if it is missing values
    for quantification_object in quantification_list:
        ###Handling empty quantification
        for filename in input_filename_headers:
            try:
                # len() only succeeds for string cells; empty string -> 0.
                if len(quantification_object[filename]) == 0:
                    #print(filename, quantification_object[filename], quantification_object["row ID"])
                    quantification_object[filename] = 0
            except:
                # Numeric cells raise TypeError on len(); leave them as-is.
                x = 1

    print("Number of Features", len(quantification_list))

    #Doing row sum normalization
    if ROW_NORMALIZATION == "RowSum":
        print("ROW SUM NORM")
        for filename_header in input_filename_headers:
            file_quants = [
                float(quantification_object[filename_header])
                for quantification_object in quantification_list
            ]
            summed_file_quants = sum(file_quants)
            #Handling zero column
            if summed_file_quants > 0:
                for quantification_object in quantification_list:
                    # Normalize each column to a total of 1,000,000.
                    quantification_object[filename_header] = float(
                        quantification_object[filename_header]) / sum(
                            file_quants) * 1000000

    """Loading MS2 Spectra"""
    mgf_collection = ming_spectrum_library.SpectrumCollection(
        args.mgf_filename)
    mgf_collection.load_from_file()

    clusters_list = []
    for quantification_object in quantification_list:
        cluster_obj = {}
        cluster_obj["cluster index"] = str(int(
            quantification_object["row ID"]))
        cluster_obj["precursor mass"] = "{0:.4f}".format(
            float(quantification_object["row m/z"]))
        cluster_obj["RTConsensus"] = "{0:.4f}".format(
            float(quantification_object["row retention time"]))

        all_charges = []

        """Checking about the charge of this cluster"""
        # Charge comes from the MGF spectrum for this cluster; missing
        # scans or unparsable charges fall back to 0.
        try:
            spectrum_object = mgf_collection.scandict[int(
                cluster_obj["cluster index"])]
            charge = int(spectrum_object.charge)
        except:
            charge = 0

        """Checking if this spectrum has no peaks"""
        # try:
        #     spectrum_object = mgf_collection.scandict[int(cluster_obj["cluster index"])]
        #
        # except:
        #     continue

        all_files = [
            os.path.basename(filename) for filename in input_filename_headers
            if float(quantification_object[filename]) > 0
        ]
        abundance_per_file = [(os.path.basename(filename),
                               float(quantification_object[filename]))
                              for filename in input_filename_headers]
        all_abundances = [
            float(quantification_object[filename])
            for filename in input_filename_headers
        ]

        # Parent (singly-protonated) mass from m/z and charge; with unknown
        # charge the m/z is reported unchanged.
        if charge != 0:
            cluster_obj["parent mass"] = "{0:.4f}".format(
                float(quantification_object["row m/z"]) * charge - charge + 1)
        else:
            cluster_obj["parent mass"] = "{0:.4f}".format(
                float(quantification_object["row m/z"]))
        cluster_obj["precursor charge"] = charge

        # NOTE(review): `all_retention_times` is never defined, so this try
        # always fails and RTMean/RTStdErr fall back to the consensus RT.
        try:
            cluster_obj["RTMean"] = statistics.mean(all_retention_times)
            cluster_obj["RTStdErr"] = statistics.stdev(all_retention_times)
        except:
            cluster_obj["RTMean"] = cluster_obj["RTConsensus"]
            cluster_obj["RTStdErr"] = 0

        cluster_obj[
            "GNPSLinkout_Cluster"] = 'https://gnps.ucsd.edu/ProteoSAFe/result.jsp?task=%s&view=view_all_clusters_withID&show=true#{"main.cluster index_lowerinput":"%s","main.cluster index_upperinput":"%s"}' % (
                task_id, quantification_object["row ID"],
                quantification_object["row ID"])

        cluster_obj["sum(precursor intensity)"] = sum(all_abundances)
        cluster_obj["SumPeakIntensity"] = sum(all_abundances)
        cluster_obj["number of spectra"] = len(all_files)
        cluster_obj["UniqueFileSourcesCount"] = len(all_files)

        group_abundances = determine_group_abundances(
            group_to_files_mapping,
            abundance_per_file,
            operation=GROUP_COUNT_AGGREGATE_METHOD)

        # Custom groups get a GNPSGROUP: prefix; G1..G6 are emitted bare.
        default_groups = ["G1", "G2", "G3", "G4", "G5", "G6"]
        for group in group_to_files_mapping:
            group_header = "GNPSGROUP:" + group
            if group in default_groups:
                continue
            cluster_obj[group_header] = group_abundances[group]

        for group in default_groups:
            cluster_obj[group] = group_abundances[group]

        #Writing attributes
        for attribute in attributes_to_groups_mapping:
            groups_to_include = []
            for group in attributes_to_groups_mapping[attribute]:
                if group_abundances[group] > 0.0:
                    groups_to_include.append(group)
            if len(groups_to_include) == 0:
                cluster_obj[attribute] = ""
            else:
                cluster_obj[attribute] = ",".join(groups_to_include)

        """ Enriching the cluster info with adduct collapsing information """
        enrich_adduct_annotations(cluster_obj, quantification_object)

        clusters_list.append(cluster_obj)

    pd.DataFrame(clusters_list).to_csv(args.output_clusterinfo_summary,
                                       sep="\t",
                                       index=False)
def main():
    """Select consensus library spectra for this partition node, record
    candidate provenance, and write the node's library as JSON plus
    .mgf/.tsv/.sptxt.

    argv: [1] params.xml  [2] partition params JSON  [3] filtered peptide list
          [4] length/score cutoff file  [5] provenance JSON
          [6] merged library spectra folder  [7] output library JSON folder
          [8] output candidate TSV folder  [9] output tsv folder
          [10] output mgf folder  [11] output sptxt folder
    """
    param_filename = sys.argv[1]
    choose_consensus_params_filename = sys.argv[2]
    filtered_peptide_list_filename = sys.argv[3]
    length_score_cutoff_filename = sys.argv[4]
    provenance_json_filename = sys.argv[5]
    merged_library_spectra_folder = sys.argv[6]
    output_library_json_folder = sys.argv[7]
    output_candidate_spectra_tsv_folder = sys.argv[8]

    filtered_peptide_set = load_filtered_peptide_set(
        filtered_peptide_list_filename)
    score_cutoff_by_length = load_score_cutoff_by_length(
        filtered_peptide_list_filename)
    variant_to_score = load_variant_to_score(filtered_peptide_list_filename)

    #Deciding on how to create consensus
    params_obj = ming_proteosafe_library.parse_xml_file(open(param_filename))
    parallel_params = json.loads(open(choose_consensus_params_filename).read())
    # NOTE: "total_paritions" (sic) is the actual key written by the upstream
    # tool -- keep the misspelling.
    total_node_count = parallel_params["total_paritions"]
    my_node_number = parallel_params["node_partition"]
    consensus_selection_method = params_obj["ConsensusChoice"][0]

    #output dict for listing all candidates
    library_candidates_output_dict = defaultdict(list)

    #determine filenames
    merged_library_filename, my_position_for_file, total_nodes_for_file = determine_filenames_to_load(
        my_node_number, total_node_count, merged_library_spectra_folder)
    print(merged_library_filename, my_position_for_file, total_nodes_for_file)

    library_spectra = []
    input_spectrum_file_handle = open(merged_library_filename)
    line_count = 0
    # One JSON array of spectra per line; each node processes every
    # total_nodes_for_file-th line (round-robin partitioning).
    for line in input_spectrum_file_handle:
        line_count += 1
        if line_count % total_nodes_for_file != my_position_for_file:
            #print("Should Skip")
            continue
        else:
            print("NOT SKIP")
        all_spectra = json.loads(line)
        if len(all_spectra) == 0:
            continue
        # Key format matches the filtered peptide set: "annotation.charge".
        annotation = all_spectra[0]["annotation"] + "." + str(
            all_spectra[0]["charge"])
        print(annotation, len(all_spectra))
        if not annotation in filtered_peptide_set:
            continue
        library_spectrum = create_library_spectrum(
            all_spectra, consensus_selection_method, score_cutoff_by_length,
            variant_to_score, library_candidates_output_dict)
        library_spectra.append(library_spectrum)

    json.dump(
        library_spectra,
        open(
            os.path.join(output_library_json_folder,
                         str(my_node_number) + ".json"), "w"))

    #Provenance Records
    provenance_records = json.loads(open(provenance_json_filename).read())

    #Modifying the output candidate file
    # Parallel columns: one augment_task/extract_task entry per candidate row.
    for i in range(len(library_candidates_output_dict["filename"])):
        proteosafe_task = library_candidates_output_dict["proteosafe_task"][i]
        if proteosafe_task in provenance_records["search_task_to_augment"]:
            library_candidates_output_dict["augment_task"].append(
                provenance_records["search_task_to_augment"][proteosafe_task])
        else:
            library_candidates_output_dict["augment_task"].append("")
        if proteosafe_task in provenance_records["search_task_to_extraction"]:
            library_candidates_output_dict["extract_task"].append(
                provenance_records["search_task_to_extraction"]
                [proteosafe_task])
        else:
            library_candidates_output_dict["extract_task"].append("")

    #Outputting
    output_candidate_spectra_tsv_filename = os.path.join(
        output_candidate_spectra_tsv_folder, str(my_node_number) + ".tsv")
    ming_fileio_library.write_dictionary_table_data(
        library_candidates_output_dict, output_candidate_spectra_tsv_filename)

    """Converted Output"""
    output_tsv_folder = sys.argv[9]
    output_mgf_folder = sys.argv[10]
    output_sptxt_folder = sys.argv[11]

    library_spectrum_collection = ming_spectrum_library.SpectrumCollection(
        "library spectra")

    for library_spectrum in library_spectra:
        lib_spec = ming_spectrum_library.PeptideLibrarySpectrum(
            "", 0, 0, library_spectrum["peaks"], library_spectrum["mz"],
            library_spectrum["charge"], library_spectrum["annotation"],
            library_spectrum["protein"])
        # Optional fields -- copied only when present in the record.
        if "score" in library_spectrum:
            lib_spec.score = library_spectrum["score"]
        if "variant_score" in library_spectrum:
            lib_spec.variant_score = library_spectrum["variant_score"]
        if "spectra_to_consider" in library_spectrum:
            lib_spec.num_spectra = library_spectrum["spectra_to_consider"]
        if "ranking" in library_spectrum:
            lib_spec.spectrum_ranking = library_spectrum["ranking"]
        if "proteosafe_task" in library_spectrum:
            lib_spec.proteosafe_task = library_spectrum["proteosafe_task"]
        if "originalspectrum_filename" in library_spectrum:
            lib_spec.originalfile_filename = library_spectrum[
                "originalspectrum_filename"]
        if "originalspectrum_scan" in library_spectrum:
            lib_spec.originalfile_scan = str(
                library_spectrum["originalspectrum_scan"])
        library_spectrum_collection.spectrum_list.append(lib_spec)

    output_mgf_filename = os.path.join(output_mgf_folder,
                                       str(my_node_number) + ".mgf")
    output_tsv_filename = os.path.join(output_tsv_folder,
                                       str(my_node_number) + ".tsv")
    output_sptxt_filename = os.path.join(output_sptxt_folder,
                                         str(my_node_number) + ".sptxt")

    library_spectrum_collection.save_to_mgf(open(output_mgf_filename, "w"))
    library_spectrum_collection.save_to_tsv(open(output_tsv_filename, "w"),
                                            output_mgf_filename)
    # sptxt output is best-effort.
    try:
        library_spectrum_collection.save_to_sptxt(
            open(output_sptxt_filename, "w"))
    except:
        traceback.print_exc(file=sys.stdout)
        print("MEH")
# Filter out clusters observed in files assigned to group G6.
# NOTE(review): `args`, `clusterinfo_df`, and `input_specs_ms` are defined
# earlier in this script, outside this excerpt.
files_to_filter = []
for line in open(args.groupsfilename):
    # Groups file format: "GROUP_<name>=<file1>;<file2>;..."
    if line.split("=")[0] == "GROUP_G6":
        files_to_filter = line.split("=")[1].split(";")
# Strip trailing whitespace/newlines from each filename.
files_to_filter = [filename.rstrip() for filename in files_to_filter]
print("files_to_filter", files_to_filter)

# Cluster indices that appear in any G6 file are excluded below.
filtered_clusterinfo_df = clusterinfo_df[clusterinfo_df["#Filename"].isin(
    files_to_filter)]
clusters_to_filter = set(list(filtered_clusterinfo_df["#ClusterIdx"]))
print("clusters_to_filter", clusters_to_filter)

#Loading the spectra
spectrum_collection = ming_spectrum_library.SpectrumCollection(input_specs_ms)
spectrum_collection.load_from_mgf()

filtered_spectrum_list = []
for spectrum in spectrum_collection.spectrum_list:
    if spectrum is None:
        continue
    else:
        scan = spectrum.scan
        if scan in clusters_to_filter:
            continue
        else:
            filtered_spectrum_list.append(spectrum)

#Renumbering to make sure empty ones are still there
included_scans = set()
def calculated_ambiguity(parameter_map, peak_tolerance):
    """Score how distinguishable competing peptide annotations are per scan.

    Parameters:
        parameter_map: dict with "filename" (spectrum file to load) and
            "scan_mapping" (scan -> list of candidate annotations).
        peak_tolerance: Da tolerance for theoretical/observed peak matching.

    Returns scan -> score summary dict.  Sentinel values: all -1 when only
    one candidate remains after collapsing (unambiguous); all 10 (ratio -1)
    when more than two candidates remain (too ambiguous to compare pairwise).
    """
    filename = parameter_map["filename"]
    scan_mapping = parameter_map["scan_mapping"]
    spectrum_collection = ming_spectrum_library.SpectrumCollection(filename)
    spectrum_collection.load_from_file()
    return_ambiguity_mapping = defaultdict(lambda: {})
    for scan in scan_mapping:
        spectrum_obj = spectrum_collection.scandict[int(scan)]
        #Lets determine if the strings are actually ambiguous
        ambiguous_list = ming_ambiguity_library.collapse_ambiguous_from_annotations_list(
            scan_mapping[scan])
        #print(ambiguous_list)
        if len(ambiguous_list) == 1:
            # Single candidate: not ambiguous; emit -1 sentinels.
            score_summary = {}
            score_summary["ambiguity_total_score"] = -1
            score_summary["first_unique_count"] = -1
            score_summary["second_unique_count"] = -1
            score_summary["first_unique_intensity"] = -1
            score_summary["second_unique_intensity"] = -1
            score_summary["first_second_unique_ratio"] = -1
            return_ambiguity_mapping[scan] = score_summary
            continue
        if len(ambiguous_list) > 2:
            # More than two candidates: flagged maximally ambiguous (10s).
            score_summary = {}
            score_summary["ambiguity_total_score"] = 10
            score_summary["first_unique_count"] = 10
            score_summary["second_unique_count"] = 10
            score_summary["first_unique_intensity"] = 10
            score_summary["second_unique_intensity"] = 10
            score_summary["first_second_unique_ratio"] = -1
            return_ambiguity_mapping[scan] = score_summary
            continue

        # Exactly two candidates: extract b/y-annotated peaks for each.
        peptide_to_extracted_peaks_mapping = {}
        for peptide in ambiguous_list:
            theoreteical_peaks = ming_psm_library.create_theoretical_peak_map(
                peptide, ["b", "y"])
            original_peaks = spectrum_obj.peaks
            extracted_peaks = extract_annotated_peaks(theoreteical_peaks,
                                                      original_peaks,
                                                      peak_tolerance)
            peptide_to_extracted_peaks_mapping[peptide] = extracted_peaks
            #print("Original:\t%d\tExtracted:\t%d" % (len(original_peaks), len(extracted_peaks)))
            #print(original_peaks)
            #print(extracted_peaks)
            #print(theoreteical_peaks)

        #Checkout overlap of stuff
        first_peaks = peptide_to_extracted_peaks_mapping[list(
            peptide_to_extracted_peaks_mapping.keys())[0]]
        second_peaks = peptide_to_extracted_peaks_mapping[list(
            peptide_to_extracted_peaks_mapping.keys())[1]]

        # Align the two annotated-peak sets against each other.
        total_score, reported_alignments = spectrum_alignment.score_alignment(
            first_peaks, second_peaks, spectrum_obj.mz, spectrum_obj.mz,
            peak_tolerance)

        first_total = len(first_peaks)
        second_total = len(second_peaks)
        intersection_total = len(reported_alignments)
        # Peaks explained by only one of the two candidates.
        first_unique_count = first_total - intersection_total
        second_unique_count = second_total - intersection_total

        #Calculating the explained intensity in each of these
        peaks_1_normed = spectrum_alignment.sqrt_normalize_spectrum(
            spectrum_alignment.convert_to_peaks(first_peaks))
        peaks_2_normed = spectrum_alignment.sqrt_normalize_spectrum(
            spectrum_alignment.convert_to_peaks(second_peaks))

        first_aligned_index = []
        second_aligned_index = []
        for alignment in reported_alignments:
            first_aligned_index.append(alignment.peak1)
            second_aligned_index.append(alignment.peak2)

        #intensity values of the unaligned (candidate-unique) peaks
        first_unique = []
        second_unique = []
        for i in range(len(peaks_1_normed)):
            if not i in first_aligned_index:
                first_unique.append(peaks_1_normed[i][1])
        for i in range(len(peaks_2_normed)):
            if not i in second_aligned_index:
                second_unique.append(peaks_2_normed[i][1])

        # Sum of squared normalized intensities of unique peaks.
        first_unique_intensity = sum(i[0] * i[1]
                                     for i in zip(first_unique, first_unique))
        second_unique_intensity = sum(
            i[0] * i[1] for i in zip(second_unique, second_unique))

        first_second_unique_ratio = 0
        try:
            first_second_unique_ratio = min(
                first_unique_intensity, second_unique_intensity) / max(
                    first_unique_intensity, second_unique_intensity)
        except KeyboardInterrupt:
            raise
        except:
            # Division by zero (no unique intensity) -> capped sentinel.
            first_second_unique_ratio = 10

        if first_second_unique_ratio > 10:
            first_second_unique_ratio = 10

        #print(reported_alignments)
        #print(peaks_1_normed)
        #print("FirstCount\t%d\tSecondCount\t%d\tFirstInt\t%f\tSecondInt\t%f" % (first_unique_count, second_unique_count, first_unique_intensity, second_unique_intensity))

        score_summary = {}
        score_summary["ambiguity_total_score"] = total_score
        score_summary["first_unique_count"] = first_unique_count
        score_summary["second_unique_count"] = second_unique_count
        score_summary["first_unique_intensity"] = first_unique_intensity
        score_summary["second_unique_intensity"] = second_unique_intensity
        score_summary["first_second_unique_ratio"] = first_second_unique_ratio
        return_ambiguity_mapping[scan] = score_summary
    return return_ambiguity_mapping
def extract_psms_from_filename(filename, psms_list, snr_threshold,
                               minimum_explained_intensity, min_signal_peaks,
                               min_number_of_peaks_within_1_percent_of_max,
                               min_number_of_annotated_ions, max_ppm_error):
    """Load spectra for a list of PSMs, compute quality metrics, filter by
    the given thresholds, and return dicts describing the surviving spectra.

    Parameters:
        filename: path relative to the proteosafe user uploads directory.
        psms_list: PSM objects (need .scan, .decoy, .charge, .score,
            .extra_metadata, .get_annotation_without_charge()).
        snr_threshold: if > 0.9, noise peaks below this SNR are removed.
        minimum_explained_intensity, min_signal_peaks,
        min_number_of_peaks_within_1_percent_of_max,
        min_number_of_annotated_ions, max_ppm_error: filter thresholds.

    Returns a list of output spectrum dicts (peaks JSON-encoded).
    """
    full_path = os.path.join(
        ming_proteosafe_library.PROTEOSAFE_USER_UPLOADS_DIR,
        filename,
    )
    print("loading ", full_path)
    spectrum_collection = ming_spectrum_library.SpectrumCollection(full_path)
    spectrum_collection.load_from_file(drop_ms1=True)
    spectrum_list = []
    for psm in psms_list:
        scan = psm.scan
        # Decoy PSMs are labeled with the sentinel false protein.
        protein = "PROTEIN"
        if psm.decoy == 1:
            protein = "CREATION_FALSE_PROTEIN"
        loaded_spectrum = spectrum_collection.scandict[scan]
        loaded_spectrum.filter_precursor_peaks()

        # Quality metrics used for filtering below.
        number_of_signal_peaks = loaded_spectrum.get_number_of_signal_peaks(
            SNR_Threshold=3)
        number_of_peaks_within_1_percent_of_max = loaded_spectrum.get_number_of_peaks_within_percent_of_max(
            percent=1.0)
        number_of_peaks_within_5_percent_of_max = loaded_spectrum.get_number_of_peaks_within_percent_of_max(
            percent=5.0)
        annotated_peak_count = ming_psm_library.calculated_number_annotated_peaks(
            loaded_spectrum.peaks, loaded_spectrum.charge,
            psm.get_annotation_without_charge(), 0.1)
        explained_intensity = ming_psm_library.calculated_explained_intensity(
            loaded_spectrum.peaks, loaded_spectrum.charge,
            psm.get_annotation_without_charge(), 0.1)
        number_of_ions_annotated_above_SNR = ming_spectrum_library.calculated_number_unique_ions_annotated_in_signal(
            loaded_spectrum.peaks, min(loaded_spectrum.charge, 3),
            psm.get_annotation_without_charge(), 0.1, SNR=3.0)

        # Precursor mass accuracy relative to the theoretical peptide m/z.
        theoretical_mz = ming_psm_library.calculate_theoretical_peptide_mass(
            psm.get_annotation_without_charge(), psm.charge)
        mass_difference = abs(theoretical_mz - loaded_spectrum.mz)
        ppm_error = (mass_difference / theoretical_mz) * 1000000
        parent_mass_error = mass_difference * psm.charge

        # Optional noise filtering of the spectrum that gets saved.
        if snr_threshold > 0.9:
            loaded_spectrum.filter_noise_peaks(snr_threshold)

        output_spectrum_dict = {}
        output_spectrum_dict["filename"] = filename
        output_spectrum_dict["protein"] = protein
        output_spectrum_dict["scan"] = loaded_spectrum.scan
        output_spectrum_dict["peaks"] = json.dumps(loaded_spectrum.peaks)
        output_spectrum_dict["mz"] = loaded_spectrum.mz
        output_spectrum_dict["charge"] = psm.charge
        output_spectrum_dict["score"] = psm.score
        output_spectrum_dict["kl_score"] = float(
            psm.extra_metadata["kl_strict"])
        output_spectrum_dict["annotation"] = psm.get_annotation_without_charge(
        )
        output_spectrum_dict[
            "collision_energy"] = loaded_spectrum.collision_energy
        output_spectrum_dict[
            "precursor_intensity"] = loaded_spectrum.precursor_intensity
        output_spectrum_dict["signal_peaks"] = number_of_signal_peaks
        output_spectrum_dict[
            "number_of_peaks_within_1_percent_of_max"] = number_of_peaks_within_1_percent_of_max
        output_spectrum_dict[
            "number_of_peaks_within_5_percent_of_max"] = number_of_peaks_within_5_percent_of_max
        output_spectrum_dict["annotated_peak_count"] = annotated_peak_count
        output_spectrum_dict[
            "number_of_ions_annotated_above_SNR"] = number_of_ions_annotated_above_SNR
        output_spectrum_dict["explained_intensity"] = explained_intensity
        output_spectrum_dict["ppm_error"] = ppm_error
        output_spectrum_dict["parent_mass_error"] = parent_mass_error

        if "proteosafe_task" in psm.extra_metadata:
            output_spectrum_dict["proteosafe_task"] = psm.extra_metadata[
                "proteosafe_task"]
        else:
            output_spectrum_dict["proteosafe_task"] = ""

        #TODO FILTER OUT SPECTRA HERE
        # Threshold filters -- each rejects the PSM outright.
        if output_spectrum_dict["signal_peaks"] < min_signal_peaks:
            continue
        if output_spectrum_dict[
                "number_of_peaks_within_1_percent_of_max"] < min_number_of_peaks_within_1_percent_of_max:
            continue
        if output_spectrum_dict[
                "explained_intensity"] < minimum_explained_intensity:
            continue
        if output_spectrum_dict[
                "number_of_ions_annotated_above_SNR"] < min_number_of_annotated_ions:
            continue
        if ppm_error > max_ppm_error:
            continue

        spectrum_list.append(output_spectrum_dict)
    return spectrum_list