def calc_cos_scores(spectra, masses, output_path='cos_score_data', tolerance=0.02):
    """Compute an all-vs-all score matrix for ``spectra`` and dump it to a file.

    Every off-diagonal entry is the first element of the value returned by
    ``spectrum_alignment.score_alignment`` for the two spectra and their
    corresponding masses; diagonal entries are fixed at ``1.0`` without
    calling the scorer.

    The matrix is also written to ``output_path`` as nested bracketed lists,
    e.g. ``[[1.0, 0.5], [0.5, 1.0]]``.

    Args:
        spectra: Sequence of spectra, passed through to ``score_alignment``.
        masses: Sequence of masses, one per spectrum (same length as
            ``spectra``).
        output_path: File the textual matrix is written to. Defaults to the
            historical hard-coded name ``'cos_score_data'``.
        tolerance: Tolerance forwarded to ``score_alignment``. Defaults to the
            historical hard-coded value ``0.02``.

    Returns:
        A list of rows (lists) holding the scores.

    Raises:
        IndexError: If ``spectra`` and ``masses`` differ in length.
    """
    if len(spectra) != len(masses):
        raise IndexError('spectra and masses must be lists of the same length')

    cosScores = []
    # The context manager guarantees the file is closed even when the scorer
    # raises part-way through the matrix.
    with open(output_path, 'w') as cosScoreTxt:
        cosScoreTxt.write('[')
        for i, rSpec in enumerate(spectra):
            row = []
            for j, cSpec in enumerate(spectra):
                if i == j:
                    row.append(1.0)
                else:
                    row.append(
                        spectrum_alignment.score_alignment(
                            rSpec, cSpec, masses[i], masses[j], tolerance)[0])
            if i > 0:
                cosScoreTxt.write(', ')
            cosScoreTxt.write('[' + ', '.join(str(x) for x in row) + ']')
            cosScores.append(row)
        cosScoreTxt.write(']')
    return cosScores
def process_spectra_similarity(metadata, spectral_data, path):
    """Score every unordered pair of spectra and write the results to ``path``.

    The output starts with a header line followed by a blank line, then one
    line per pair in the form ``<id1> <id2> <score>``. Each pair is scored
    once (never against itself, never in both orders), in the iteration order
    of ``metadata``.

    Args:
        metadata: Mapping from spectrum ID to a list of tuples. The parent
            mass is read from ``metadata[id][0][1]`` and converted to float.
        spectral_data: Mapping from spectrum ID to the spectrum passed to
            ``score_alignment``.
        path: Destination file; it is overwritten.
    """
    keys = list(metadata.keys())
    header = "The format is in ID, ID, similarity score" + "\n"

    # Using a context manager so the file is flushed and closed even if
    # scoring fails midway.
    with open(path, "w") as fileout:
        fileout.write(header + "\n")

        # Walking the upper triangle visits each unordered pair exactly once,
        # in the same order the original "seen reversed pairs" list produced,
        # without the quadratic membership checks.
        for i, x in enumerate(keys):
            for y in keys[i + 1:]:
                # The value is a list of tuples
                # Index 0 is the first tuple in the list (containing parent mass info)
                # Index 1 of the tuple is the parent mass value
                parent_mass1 = float(metadata[x][0][1])
                parent_mass2 = float(metadata[y][0][1])
                score, reported_alignments = score_alignment(
                    spectral_data[x], spectral_data[y],
                    parent_mass1, parent_mass2, 0.3)
                fileout.write(x + " " + y + " " + str(score) + "\n")
def choose_representative_spectrum_most_similary_combination_score(
        spectrum_list, minimum_spectra_for_combination=4):
    """Pick the spectrum whose summed similarity to the others is highest.

    Each spectrum is a dict identified by ``filename`` + ":" + ``scan``. Its
    similarity to every other spectrum (entries with an equal identifier are
    skipped) is taken from its ``"score_dict"`` cache when available and
    otherwise computed with ``spectrum_alignment.score_alignment``. The sum of
    those scores is divided by the length of ``spectrum_list``.

    When the list holds fewer than ``minimum_spectra_for_combination``
    spectra, that value is additionally multiplied by the spectrum's
    ``"explained_intensity"`` and ``"number_of_ions_annotated_above_SNR"``.

    Side effect: each spectrum's ``"score_dict"`` is replaced by a dict of the
    scores used in this call.

    Returns:
        The winning spectrum dict, or ``None`` for an empty list.
    """
    def _identifier(entry):
        return entry["filename"] + ":" + str(entry["scan"])

    winner = None
    winning_score = -1000.0
    spectrum_count = len(spectrum_list)

    for candidate in spectrum_list:
        candidate_id = _identifier(candidate)
        cached_scores = candidate["score_dict"] if "score_dict" in candidate else {}
        refreshed_scores = {}
        pair_scores = []

        for other in spectrum_list:
            other_id = _identifier(other)
            if other_id == candidate_id:
                continue

            if other_id in cached_scores:
                pair_score = cached_scores[other_id]
            else:
                pair_score, _alignments = spectrum_alignment.score_alignment(
                    candidate["peaks"], other["peaks"],
                    candidate["mz"], other["mz"], 0.1)

            refreshed_scores[other_id] = pair_score
            pair_scores.append(pair_score)

        # Note: the divisor is the full list length, self included.
        mean_score = sum(pair_scores) / float(spectrum_count)

        if spectrum_count < minimum_spectra_for_combination:
            intensity_weight = candidate["explained_intensity"]
            ion_weight = candidate["number_of_ions_annotated_above_SNR"]
            mean_score = mean_score * intensity_weight * ion_weight

        if mean_score > winning_score:
            winning_score = mean_score
            winner = candidate

        # Persist the scores gathered for this candidate
        candidate["score_dict"] = refreshed_scores

    return winner
def cosine_spectrum(self, other_spectrum, peak_tolerance):
    """Score this spectrum against ``other_spectrum``.

    Delegates to ``spectrum_alignment.score_alignment``, passing each
    spectrum's peaks and ``mz * charge`` as its mass, the given
    ``peak_tolerance``, and this spectrum's charge as the final positional
    argument (presumably the maximum charge to consider -- confirm against
    ``score_alignment``'s signature).

    Returns:
        A tuple ``(score, number_of_reported_alignments)``.
    """
    own_mass = self.mz * self.charge
    other_mass = other_spectrum.mz * other_spectrum.charge
    score, alignments = spectrum_alignment.score_alignment(
        self.peaks,
        other_spectrum.peaks,
        own_mass,
        other_mass,
        peak_tolerance,
        self.charge,
    )
    return score, len(alignments)
def create_masst_network(spectra_matches_df, output_graphml, output_image=None):
    """Build a spectral network around MASST matches and write it to disk.

    Steps, all driven by remote web services:

    1. Download the MassIVE dataset listing and, for every distinct
       ``dataset_id`` in ``spectra_matches_df``, the network pairs of that
       dataset's first continuous-identification job.
    2. For each network edge whose ``Node1`` is one of the matched
       ``cluster_scan`` values, record the ``Node2`` cluster as a node
       (with identification fields when they can be looked up).
    3. Fetch the peaks of every recorded node (plus a hard-coded query
       spectrum) from the metabolomics USI service.
    4. Score all pairs of spectra with ``spectrum_alignment.score_alignment``
       and connect those with a score of at least 0.7 and at least 5 reported
       alignments.
    5. Apply ``molecular_network_filtering_library.filter_top_k(G, 10)``,
       draw the graph, write GraphML, and optionally save a PNG.

    Args:
        spectra_matches_df: DataFrame with at least ``dataset_id`` and
            ``cluster_scan`` columns.
        output_graphml: Path the GraphML file is written to.
        output_image: Optional path for a PNG rendering of the graph.

    Raises:
        IndexError: If a ``dataset_id`` is absent from the dataset listing
            (behaviour inherited from the original implementation).
    """
    # Loading all Datasets Information
    dataset_matches = list(set(spectra_matches_df["dataset_id"]))
    all_datasets = requests.get(
        "https://massive.ucsd.edu/ProteoSAFe/datasets_json.jsp#%7B%22query%22%3A%7B%7D%2C%22table_sort_history%22%3A%22createdMillis_dsc%22%7D"
    ).json()["datasets"]

    all_node_usi_list = []

    # Source MASST USI
    output_dict = {}
    output_dict[
        "usi"] = "mzspec:GNPS:TASK-c6b2797224f34d819d20dd7af622bc6b-spectra/:scan:1"
    output_dict["dataset"] = "QUERY"
    output_dict["scan"] = 1
    all_node_usi_list.append(output_dict)

    # Fields copied from an identification record onto a node, in order
    identification_fields = ("Compound_Name", "Smiles", "INCHI", "MQScore",
                             "SpectrumID")

    # Getting all the MASST data
    for dataset in dataset_matches:
        filtered_dataset = [
            current_dataset for current_dataset in all_datasets
            if current_dataset["dataset"] == dataset
        ]
        dataset_task = filtered_dataset[0]["task"]
        continuous_id = requests.get(
            "http://gnps.ucsd.edu/ProteoSAFe/ContinuousIDServlet?task={}".
            format(dataset_task)).json()
        continuous_task = continuous_id["jobs"][0]["task"]

        network_url = "https://gnps.ucsd.edu/ProteoSAFe/result_json.jsp?task={}&view=clusters_network_pairs".format(
            continuous_task)
        data = requests.get(network_url).json()['blockData']
        network_df = pd.DataFrame(data)

        dataset_spectra_matches = spectra_matches_df[
            spectra_matches_df["dataset_id"] == dataset]
        clusters_matched = list(set(dataset_spectra_matches["cluster_scan"]))

        # Grabbing identification information (best effort).
        # Reset per dataset: previously a failed fetch left the variable
        # holding the PREVIOUS dataset's table (or undefined), so nodes could
        # be annotated with identifications from the wrong dataset.
        dataset_identifications_df = None
        try:
            dataset_identifications = ming_gnps_library.get_dataset_current_continuous_identifications(
                dataset_task)
            dataset_identifications_df = pd.DataFrame(dataset_identifications)
        except Exception:
            # Identifications are optional; continue without them.
            dataset_identifications_df = None

        network_df["Node1"] = network_df["Node1"].astype(int)
        filtered_edges = network_df[network_df["Node1"].isin(clusters_matched)]

        for edge in filtered_edges.to_dict(orient="records"):
            cluster = edge["Node2"]
            usi = "mzspec:GNPS:TASK-{}-speccontinuous/speccontinuous-00000.mgf:scan:{}".format(
                continuous_task, cluster)

            output_dict = {}
            output_dict["usi"] = usi
            output_dict["dataset"] = filtered_dataset[0]["dataset"]
            output_dict["scan"] = cluster

            if dataset_identifications_df is not None:
                try:
                    filtered_identifications_df = dataset_identifications_df[
                        dataset_identifications_df["#Scan#"] == cluster]
                    identification_dict = filtered_identifications_df.to_dict(
                        orient="records")[0]
                    for field in identification_fields:
                        output_dict[field] = identification_dict[field]
                except (KeyError, IndexError):
                    # No identification row for this cluster, or a column is
                    # missing: leave the remaining fields unset.
                    pass

            all_node_usi_list.append(output_dict)

    # Now we will load up all the spectra and do stuff with it
    from ming_spectrum_library import Spectrum
    import spectrum_alignment

    all_spectra_list = []
    for usi_dict in all_node_usi_list:
        usi = usi_dict["usi"]
        display_information = "{}:{}".format(usi_dict["dataset"],
                                             usi_dict["scan"])

        url = "https://metabolomics-usi.ucsd.edu/json/?usi={}".format(usi)
        spectrum_json = requests.get(url).json()
        spectrum = Spectrum("", display_information, display_information,
                            spectrum_json["peaks"],
                            spectrum_json["precursor_mz"], 1, 2)
        spectrum.dataset = usi_dict["dataset"]
        spectrum.usi = usi_dict["usi"]
        for field in identification_fields:
            setattr(spectrum, field, usi_dict.get(field, "N/A"))
        all_spectra_list.append(spectrum)

    min_score = 0.7
    min_matched_peaks = 5

    # Attributes copied from each spectrum onto its graph node
    node_attributes = ("mz", "dataset", "Compound_Name", "Smiles", "INCHI",
                       "MQScore")

    # Let's create a network now
    G = nx.Graph()
    from tqdm import tqdm
    for i, spectrum1 in tqdm(enumerate(all_spectra_list)):
        # Only pair with earlier spectra so each unordered pair is scored once
        for spectrum2 in all_spectra_list[:i]:
            if spectrum1.usi == spectrum2.usi:
                continue

            # Doing a network here
            total_score, reported_alignments = spectrum_alignment.score_alignment(
                spectrum1.peaks,
                spectrum2.peaks,
                spectrum1.mz,
                spectrum2.mz,
                0.5,
                max_charge_consideration=1)

            if total_score < min_score:
                continue
            if len(reported_alignments) < min_matched_peaks:
                continue

            G.add_edge(spectrum1.scan,
                       spectrum2.scan,
                       cosine_score=total_score,
                       matched_peaks=len(reported_alignments))

            # Adding Node Attributes
            for node_spectrum in (spectrum1, spectrum2):
                for attribute in node_attributes:
                    G.nodes[node_spectrum.scan][attribute] = getattr(
                        node_spectrum, attribute)

    import matplotlib.pyplot as plt
    import molecular_network_filtering_library

    molecular_network_filtering_library.filter_top_k(G, 10)

    nx.draw(G, with_labels=True, font_weight='bold')
    nx.write_graphml(G, output_graphml)

    if output_image is not None:
        plt.savefig(output_image, format="PNG")
def _ambiguity_summary(total_score, first_count, second_count,
                       first_intensity, second_intensity, ratio):
    """Assemble one per-scan ambiguity record with a fixed key order."""
    score_summary = {}
    score_summary["ambiguity_total_score"] = total_score
    score_summary["first_unique_count"] = first_count
    score_summary["second_unique_count"] = second_count
    score_summary["first_unique_intensity"] = first_intensity
    score_summary["second_unique_intensity"] = second_intensity
    score_summary["first_second_unique_ratio"] = ratio
    return score_summary


def _unaligned_squared_intensity(normed_peaks, aligned_indices):
    """Sum the squared intensity (element 1) of peaks not in ``aligned_indices``."""
    aligned = set(aligned_indices)
    return sum(peak[1] * peak[1]
               for index, peak in enumerate(normed_peaks)
               if index not in aligned)


def calculated_ambiguity(parameter_map, peak_tolerance):
    """Summarise, per scan, how distinguishable two candidate annotations are.

    For every scan in ``parameter_map["scan_mapping"]`` the annotations are
    collapsed with
    ``ming_ambiguity_library.collapse_ambiguous_from_annotations_list``:

    * exactly one candidate -> every field of the record is ``-1``;
    * more than two candidates -> every field is ``10`` except
      ``first_second_unique_ratio`` which is ``-1``;
    * otherwise the b/y theoretical peaks of the candidates are extracted
      from the spectrum, the two extracted peak lists are aligned with
      ``spectrum_alignment.score_alignment``, and the record holds the
      alignment score, the number of unaligned peaks on each side, the summed
      squared sqrt-normalised intensity of those unaligned peaks, and the
      min/max ratio of the two intensities (``10`` when both are zero).

    Args:
        parameter_map: Dict with ``"filename"`` (spectrum file to load) and
            ``"scan_mapping"`` (scan -> annotations).
        peak_tolerance: Tolerance used for both peak extraction and alignment.

    Returns:
        A ``defaultdict`` mapping each scan to its record dict.

    Raises:
        KeyError: If a scan is missing from the loaded spectrum collection.
        IndexError: If fewer than two distinct candidates remain after the
            length checks (behaviour inherited from the original).
    """
    filename = parameter_map["filename"]
    scan_mapping = parameter_map["scan_mapping"]

    spectrum_collection = ming_spectrum_library.SpectrumCollection(filename)
    spectrum_collection.load_from_file()

    # ``dict`` as the factory is equivalent to ``lambda: {}`` but keeps the
    # returned mapping picklable.
    return_ambiguity_mapping = defaultdict(dict)

    for scan in scan_mapping:
        spectrum_obj = spectrum_collection.scandict[int(scan)]

        #Lets determine if the strings are actually ambiguous
        ambiguous_list = ming_ambiguity_library.collapse_ambiguous_from_annotations_list(
            scan_mapping[scan])

        if len(ambiguous_list) == 1:
            return_ambiguity_mapping[scan] = _ambiguity_summary(
                -1, -1, -1, -1, -1, -1)
            continue

        if len(ambiguous_list) > 2:
            return_ambiguity_mapping[scan] = _ambiguity_summary(
                10, 10, 10, 10, 10, -1)
            continue

        peptide_to_extracted_peaks_mapping = {}
        for peptide in ambiguous_list:
            theoreteical_peaks = ming_psm_library.create_theoretical_peak_map(
                peptide, ["b", "y"])
            original_peaks = spectrum_obj.peaks
            extracted_peaks = extract_annotated_peaks(theoreteical_peaks,
                                                      original_peaks,
                                                      peak_tolerance)
            peptide_to_extracted_peaks_mapping[peptide] = extracted_peaks

        #Checkout overlap of stuff
        extracted_peak_lists = list(peptide_to_extracted_peaks_mapping.values())
        first_peaks = extracted_peak_lists[0]
        second_peaks = extracted_peak_lists[1]
        total_score, reported_alignments = spectrum_alignment.score_alignment(
            first_peaks, second_peaks, spectrum_obj.mz, spectrum_obj.mz,
            peak_tolerance)

        intersection_total = len(reported_alignments)
        first_unique_count = len(first_peaks) - intersection_total
        second_unique_count = len(second_peaks) - intersection_total

        #Calculating the explained intensity in each of these
        peaks_1_normed = spectrum_alignment.sqrt_normalize_spectrum(
            spectrum_alignment.convert_to_peaks(first_peaks))
        peaks_2_normed = spectrum_alignment.sqrt_normalize_spectrum(
            spectrum_alignment.convert_to_peaks(second_peaks))

        first_unique_intensity = _unaligned_squared_intensity(
            peaks_1_normed,
            (alignment.peak1 for alignment in reported_alignments))
        second_unique_intensity = _unaligned_squared_intensity(
            peaks_2_normed,
            (alignment.peak2 for alignment in reported_alignments))

        try:
            first_second_unique_ratio = min(
                first_unique_intensity, second_unique_intensity) / max(
                    first_unique_intensity, second_unique_intensity)
        except ZeroDivisionError:
            # Both sides have no unique intensity: use the cap value.
            first_second_unique_ratio = 10

        if first_second_unique_ratio > 10:
            first_second_unique_ratio = 10

        return_ambiguity_mapping[scan] = _ambiguity_summary(
            total_score, first_unique_count, second_unique_count,
            first_unique_intensity, second_unique_intensity,
            first_second_unique_ratio)

    return return_ambiguity_mapping
def cosine_spectrum(self, other_spectrum, peak_tolerance):
    """Return the alignment score between this spectrum and ``other_spectrum``.

    Calls ``spectrum_alignment.score_alignment`` with both spectra's peaks and
    ``mz`` values plus ``peak_tolerance``; only the score (first element of
    the returned pair) is handed back, the reported alignments are discarded.
    """
    score, _alignments = spectrum_alignment.score_alignment(
        self.peaks,
        other_spectrum.peaks,
        self.mz,
        other_spectrum.mz,
        peak_tolerance,
    )
    return score
def cosine_spectrum(self, other_spectrum, peak_tolerance):
    """Score ``self`` against ``other_spectrum`` and return only the score.

    The peaks and ``mz`` of both spectra, together with ``peak_tolerance``,
    are forwarded to ``spectrum_alignment.score_alignment``. That call yields
    a ``(score, alignments)`` pair; the alignments are not used here.
    """
    own_peaks = self.peaks
    their_peaks = other_spectrum.peaks
    own_mz = self.mz
    their_mz = other_spectrum.mz
    outcome = spectrum_alignment.score_alignment(
        own_peaks, their_peaks, own_mz, their_mz, peak_tolerance)
    similarity, _unused_alignments = outcome
    return similarity