#Shared imports for the functions below. These snippets come from several
#workflow scripts; classes such as PSM, ClusterNode, NetworkPair, Feature,
#LC_Feature, ConsensusFeature, and ClusterLibraryIdentification are defined
#elsewhere in the repository.
import sys
import os
import math
import json
import csv
import argparse
from collections import defaultdict

import networkx as nx
from pyteomics import mass

import ming_fileio_library
import ming_proteosafe_library
import ming_protein_library
import ming_gnps_library
import molecular_network_library


def main():
    paramxml_filename = sys.argv[1]
    psms_input_file = sys.argv[2]
    kl_input_file = sys.argv[3]
    output_psms_file = sys.argv[4]

    parameters_obj = ming_proteosafe_library.parse_xml_file(open(paramxml_filename))

    row_count, kl_data = ming_fileio_library.parse_table_with_headers(kl_input_file)

    kl_dict = {}
    for i in range(row_count):
        filename = os.path.basename(kl_data["Filename"][i])
        scan = kl_data["Scan"][i]
        kl_strict = kl_data["KL Strict"][i]
        kl_unstrict = kl_data["KL"][i]
        interpeak_intensity = kl_data["Interpeak intensity"][i]
        key = filename + ":" + str(scan)
        kl_dict[key] = {"kl_strict": kl_strict, "kl_unstrict": kl_unstrict, "kl_interpeak": interpeak_intensity}

    #Since we don't support more fields in the psm object, we're going to read this file in again as a tsv file and add the columns as necessary
    psm_rows, psm_table_data = ming_fileio_library.parse_table_with_headers(psms_input_file)
    psm_table_data["kl_strict"] = []
    psm_table_data["kl_unstrict"] = []
    psm_table_data["kl_interpeak"] = []

    for i in range(psm_rows):
        key = psm_table_data["filename"][i] + ":" + psm_table_data["scan"][i]
        if key in kl_dict:
            psm_table_data["kl_strict"].append(kl_dict[key]["kl_strict"])
            psm_table_data["kl_unstrict"].append(kl_dict[key]["kl_unstrict"])
            psm_table_data["kl_interpeak"].append(kl_dict[key]["kl_interpeak"])
        else:
            psm_table_data["kl_strict"].append(-1)
            psm_table_data["kl_unstrict"].append(-1)
            psm_table_data["kl_interpeak"].append(-1)

    #Change C to C+57
    #if "cysteine_protease.cysteine" in parameters_obj:
    #    if parameters_obj["cysteine_protease.cysteine"][0] == "c57":
    #        #Lets replace all the cysteines
    #        for i in range(psm_rows):
    #            psm_table_data["sequence"][i] = psm_table_data["sequence"][i].replace("C", "C+57")

    ming_fileio_library.write_dictionary_table_data(psm_table_data, output_psms_file)
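#Hypothetical invocation of the entry point above (the script name is invented
#for illustration; the four positional arguments mirror the sys.argv reads):
#  python add_kl_metrics_to_psms.py params.xml input_psms.tsv kl_metrics.tsv output_psms.tsv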
def load_clustersummary(self, clustersummaryfilename):
    row_count, table_data = ming_fileio_library.parse_table_with_headers(clustersummaryfilename)
    for i in range(row_count):
        cluster_index = table_data["cluster index"][i]
        mz = table_data["precursor mass"][i]
        charge = table_data["precursor charge"][i]
        parentmass = table_data["parent mass"][i]
        number_of_spectra = table_data["number of spectra"][i]
        all_files = table_data["AllFiles"][i]
        componentindex = -1
        if "componentindex" in table_data:
            componentindex = table_data["componentindex"][i]

        cluster_node = ClusterNode(mz, charge, cluster_index, number_of_spectra, componentindex)
        cluster_node.all_files_string = all_files
        self.nodes.append(cluster_node)
        self.index_to_node_map[cluster_index] = cluster_node

        #Populating each node with its constituent clustered spectra
        constituent_spectra = cluster_node.all_files_string.split("###")
        cluster_node.constituent_spectra = constituent_spectra
def main():
    parallel_json = json.loads(open(sys.argv[1]).read())
    params_filename = sys.argv[2]
    task_id_file = sys.argv[3]
    output_peptide_folder = sys.argv[4]
    output_psm_folder = sys.argv[5]
    #output_summary = sys.argv[5]

    params_dict = ming_proteosafe_library.parse_xml_file(open(params_filename))
    source_tasks_text = params_dict["tasks_to_consolidate"][0]

    row_count, task_file_table = ming_fileio_library.parse_table_with_headers(task_id_file)

    my_node = parallel_json["node_partition"]
    total_node = parallel_json["total_paritions"]

    output_summary = os.path.join(sys.argv[6], str(my_node))

    if len(source_tasks_text) > 0:
        source_tasks_list = json.loads(source_tasks_text)
        source_tasks_list += task_file_table["TASKID"]
        source_tasks_list.sort()
        source_tasks_list = source_tasks_list[my_node::total_node]
        grab_all_results(source_tasks_list, output_peptide_folder, output_psm_folder, output_summary, params_dict)
    else:
        open(output_summary, "w").write("None")
def parse_MSGF_tsvfile(filename):
    rows, table_data = ming_fileio_library.parse_table_with_headers(filename)

    scan_header = "Scan#"
    peptide_header = "Peptide"
    protein_header = "Protein"
    score_header = "P-value"
    filename_header = "#SpecFile"
    charge_header = "Charge"
    ppm_error_header = "PMError(ppm)"
    da_pm_error_header = "PMError(Da)"
    precursor_header = "Precursor"
    fragmethod_header = "FragMethod"

    parse_da_error = False
    if not ppm_error_header in table_data:
        parse_da_error = True

    decoy_indicator = "REV_"

    psm_list = []
    for i in range(rows):
        scan = table_data[scan_header][i]
        peptide = table_data[peptide_header][i]
        protein = table_data[protein_header][i]
        score = -math.log10(float(table_data[score_header][i]))
        #print table_data[score_header][i] + "\t" + str(score)
        filename = table_data[filename_header][i]
        charge = int(table_data[charge_header][i])
        frag_method = table_data[fragmethod_header][i]
        if parse_da_error:
            ppm_error = float(table_data[da_pm_error_header][i]) / float(table_data[precursor_header][i]) * 1000000
        else:
            ppm_error = float(table_data[ppm_error_header][i])

        decoy = 0
        #Stripping peptide dots
        if peptide[1] == "." and peptide[-2] == ".":
            peptide = peptide[2:-2]
        if protein.find(decoy_indicator) != -1:
            decoy = 1

        #Adding charge state to peptide name
        peptide += "." + str(charge)
        new_psm = PSM(filename, scan, peptide, score, decoy, protein, charge, frag_method=frag_method)
        new_psm.ppm_error = ppm_error
        psm_list.append(new_psm)

    return psm_list
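#A small illustrative helper (not part of the source): split the parsed PSMs
#into target and decoy lists, e.g. ahead of FDR estimation. The .decoy
#attribute name is an assumption based on the PSM constructor arguments above.
def split_targets_decoys(psm_list):
    targets = [psm for psm in psm_list if psm.decoy == 0]
    decoys = [psm for psm in psm_list if psm.decoy == 1]
    return targets, decoys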
def parse_variant_file(filename):
    rows, table_data = ming_fileio_library.parse_table_with_headers(filename)
    psm_list = []
    for i in range(rows):
        filename = table_data["filename"][i]
        scan = int(table_data["scan"][i])
        score = float(table_data["score"][i])
        decoy = int(table_data["decoy"][i])
        variant_sequence = table_data["variant_sequence"][i]
        charge = 0
        if "charge" in table_data:
            charge = int(table_data["charge"][i])
        else:
            charge = int(variant_sequence.split(".")[-1])
        protein = "NONE"
        if "unmangled_name" in table_data:
            filename = table_data["unmangled_name"][i]
        new_psm = PSM(filename, scan, variant_sequence, score, decoy, protein, charge)
        psm_list.append(new_psm)
    return psm_list
def load_filename_to_coordinate_mapping(metadata_file):
    filename_map = {}
    line_counts, table_data = ming_fileio_library.parse_table_with_headers(metadata_file)

    if not ("COORDINATE_X" in table_data and "COORDINATE_Y" in table_data and "COORDINATE_Z" in table_data):
        print("COORDINATE_X, COORDINATE_Y, COORDINATE_Z not present in metadata file for ili")
        exit(1)

    for i in range(line_counts):
        filename = table_data["filename"][i].rstrip()
        x = table_data["COORDINATE_X"][i].rstrip()
        y = table_data["COORDINATE_Y"][i].rstrip()
        z = table_data["COORDINATE_Z"][i].rstrip()
        radius = "0.25"
        if "COORDINATE_radius" in table_data:
            radius = table_data["COORDINATE_radius"][i].rstrip()
        if len(x) < 1:
            continue
        coordinate_object = {}
        coordinate_object["x"] = x
        coordinate_object["y"] = y
        coordinate_object["z"] = z
        coordinate_object["radius"] = radius
        filename_map[filename] = coordinate_object

    return filename_map
def create_bucket_from_clusterinfo(cluster_info_filename, param_filename, clusterinfosummary_filename, output_filename, metadata_mapping):
    output_file = open(output_filename, "w")
    line_counts, table_data = ming_fileio_library.parse_table_with_headers(cluster_info_filename)
    param_object = ming_proteosafe_library.parse_xml_file(open(param_filename, "r"))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_object)

    clusters_in_network = set()
    for row in csv.DictReader(open(clusterinfosummary_filename), delimiter='\t'):
        clusters_in_network.add(row["cluster index"])

    cluster_index_to_file_map = {}

    clusters_map = {}
    all_files = {}
    for i in range(line_counts):
        cluster_number = table_data["#ClusterIdx"][i]
        if not (cluster_number in clusters_in_network):
            continue

        if not (cluster_number in clusters_map):
            clusters_map[cluster_number] = []
            cluster_index_to_file_map[cluster_number] = {}
            #Adding all file names to mapping
            for mangled_name in mangled_mapping.keys():
                cluster_index_to_file_map[cluster_number][mangled_name] = 0.0

        mangled_filename_only = os.path.basename(table_data["#Filename"][i])
        cluster_index_to_file_map[cluster_number][mangled_filename_only] += max(float(table_data["#PrecIntensity"][i]), 1.0)

        spectrum_info = {"filename": table_data["#Filename"][i], "intensity": table_data["#PrecIntensity"][i]}
        all_files[table_data["#Filename"][i]] = 1
        clusters_map[cluster_number].append(spectrum_info)

    output_header_list = []
    output_header_list.append("#OTU ID")
    for header in mangled_mapping.keys():
        if header.find("spec") == -1:
            continue
        if os.path.basename(mangled_mapping[header]) in metadata_mapping:
            output_header_list.append(metadata_mapping[os.path.basename(mangled_mapping[header])])
        else:
            output_header_list.append(ming_fileio_library.get_filename_without_extension(os.path.basename(mangled_mapping[header])))

    output_file.write("\t".join(output_header_list) + "\n")

    for cluster_idx in cluster_index_to_file_map:
        line_output_list = []
        line_output_list.append(str(cluster_idx))
        for header in mangled_mapping.keys():
            if header.find("spec") == -1:
                continue
            line_output_list.append(str(cluster_index_to_file_map[cluster_idx][header]))
        output_file.write("\t".join(line_output_list) + "\n")

    output_file.close()
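#The bucket table written above is tab-separated, with one row per cluster and
#one summed-intensity column per input file, in the style of an OTU/BIOM table.
#A toy example (file names and values invented; 0.0 means the file contributed
#no spectra to that cluster):
#  #OTU ID    sampleA    sampleB
#  1          1500.0     0.0
#  2          0.0        873.2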
def add_library_search_results_to_graph(G, library_search_filename):
    row_count, table_data = ming_fileio_library.parse_table_with_headers(library_search_filename)

    for i in range(row_count):
        cluster_index = table_data["#Scan#"][i]
        if cluster_index in G.node:
            G.node[cluster_index]["Adduct"] = str(table_data["Adduct"][i].encode('ascii', 'ignore'))
            G.node[cluster_index]["Compound_Name"] = str(''.join([j if ord(j) < 128 else ' ' for j in table_data["Compound_Name"][i]]).replace("\\", "\\\\"))
            G.node[cluster_index]["Adduct"] = str(table_data["Adduct"][i])
            G.node[cluster_index]["INCHI"] = str(''.join([j if ord(j) < 128 else ' ' for j in table_data["INCHI"][i]]).replace("\\", "\\\\"))
            G.node[cluster_index]["Smiles"] = str(''.join([j if ord(j) < 128 else ' ' for j in table_data["Smiles"][i]]).replace("\\", "\\\\"))
            G.node[cluster_index]["MQScore"] = str(table_data["MQScore"][i])
            G.node[cluster_index]["MassDiff"] = str(table_data["MassDiff"][i])
            G.node[cluster_index]["MZErrorPPM"] = str(table_data["MZErrorPPM"][i])
            G.node[cluster_index]["SharedPeaks"] = str(table_data["SharedPeaks"][i])
            G.node[cluster_index]["tags"] = str(''.join([j if ord(j) < 128 else ' ' for j in table_data["tags"][i]]).replace("\\", "\\\\"))
            G.node[cluster_index]["Library_Class"] = str(table_data["Library_Class"][i])
            G.node[cluster_index]["Instrument"] = str(table_data["Instrument"][i])
            G.node[cluster_index]["IonMode"] = str(table_data["IonMode"][i])
            G.node[cluster_index]["Ion_Source"] = str(table_data["Ion_Source"][i])
            G.node[cluster_index]["PI"] = str(table_data["PI"][i])
            G.node[cluster_index]["Data_Collector"] = str(table_data["Data_Collector"][i])
            G.node[cluster_index]["Compound_Source"] = str(table_data["Compound_Source"][i])
            G.node[cluster_index]["SpectrumID"] = str(table_data["SpectrumID"][i])
            G.node[cluster_index]["GNPSLibraryURL"] = "http://gnps.ucsd.edu/ProteoSAFe/gnpslibraryspectrum.jsp?SpectrumID=" + table_data["SpectrumID"][i]
def parse_input_consensus_feature(tsv_file):
    rows, table_data = ming_fileio_library.parse_table_with_headers(tsv_file)
    headers = table_data.keys()
    print(headers)

    #Finding all file names
    data_filenames = []
    for header in headers:
        if header.find("_MZ") != -1:
            data_filenames.append(header[:-3])

    consensus_features = []
    for i in range(rows):
        file_feature_map = {}
        for filename in data_filenames:
            intensity_key = filename
            mz_key = filename + "_MZ"
            rt_key = filename + "_RT"
            intensity_value = float(table_data[intensity_key][i])
            mz_value = float(table_data[mz_key][i])
            rt_value = float(table_data[rt_key][i])
            file_feature = LC_Feature(filename, mz_value, rt_value, intensity_value)
            file_feature_map[filename] = file_feature
        consensus_feature = ConsensusFeature(table_data["#FeatureID"][i], file_feature_map)
        consensus_features.append(consensus_feature)

    return consensus_features
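#Usage sketch, under stated assumptions: ConsensusFeature is assumed to keep
#the file_feature_map passed to its constructor, and LC_Feature to expose an
#.intensity attribute; neither is confirmed by the source.
def mean_consensus_intensity(consensus_feature):
    intensities = [feature.intensity for feature in consensus_feature.file_feature_map.values()]
    if len(intensities) == 0:
        return 0.0
    return sum(intensities) / float(len(intensities))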
def main():
    input_param = ming_proteosafe_library.parse_xml_file(open(sys.argv[1]))
    input_folder = sys.argv[2]
    output_file = sys.argv[3]
    scratch_folder = sys.argv[4]
    path_to_executable = sys.argv[5]
    path_to_isotopes_table = sys.argv[6]

    #parent_mass_tolerance = input_param[]
    parent_mass_tolerance = 0.05

    all_input_file_paths = ming_fileio_library.list_files_in_dir(input_folder)
    output_kl_intermediates = []
    for input_file in all_input_file_paths:
        output_kl_file = os.path.join(scratch_folder, os.path.basename(input_file) + ".kl")
        cmd = path_to_executable + " --input " + input_file + " --output_summary " + output_kl_file + " " + "--peak_tolerance " + str(parent_mass_tolerance) + " --isotope_file " + path_to_isotopes_table + " >/dev/null 2>&1 "
        print(cmd)
        os.system(cmd)
        #subprocess.call([cmd])
        output_kl_intermediates.append(output_kl_file)

    combined_table = defaultdict(list)
    for output_kl_file in output_kl_intermediates:
        row_count, table_data = ming_fileio_library.parse_table_with_headers(output_kl_file)
        for key in table_data:
            combined_table[key] += table_data[key]

    ming_fileio_library.write_dictionary_table_data(combined_table, output_file)
def create_bucket_from_clusterinfo(cluster_info_filename, param_filename, clusterinfosummary_filename, output_filename):
    param_object = ming_proteosafe_library.parse_xml_file(open(param_filename, "r"))
    output_file = open(output_filename, "w")

    if param_object["CREATE_CLUSTER_BUCKETS"][0] != "1":
        output_file.write("No Output")
        return

    test_network = molecular_network_library.MolecularNetwork()
    test_network.load_clustersummary(clusterinfosummary_filename)

    line_counts, table_data = ming_fileio_library.parse_table_with_headers(cluster_info_filename)
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_object)

    cluster_index_to_file_map = {}

    clusters_map = {}
    all_files = {}
    for i in range(line_counts):
        cluster_number = table_data["#ClusterIdx"][i]
        if test_network.get_cluster_index(cluster_number) == None:
            continue

        if not (cluster_number in clusters_map):
            clusters_map[cluster_number] = []
            cluster_index_to_file_map[cluster_number] = {}
            #Adding all file names to mapping
            for mangled_name in mangled_mapping.keys():
                cluster_index_to_file_map[cluster_number][mangled_name] = 0.0

        #print table_data["#Filename"][i].split("/")[1]
        mangled_filename_only = os.path.basename(table_data["#Filename"][i])
        cluster_index_to_file_map[cluster_number][mangled_filename_only] += float(table_data["#PrecIntensity"][i])

        spectrum_info = {"filename": table_data["#Filename"][i], "intensity": table_data["#PrecIntensity"][i]}
        all_files[table_data["#Filename"][i]] = 1
        clusters_map[cluster_number].append(spectrum_info)

    output_header = "#OTU ID\t"
    for header in mangled_mapping.keys():
        output_header += os.path.basename(mangled_mapping[header]) + "\t"
    output_file.write(output_header + "\n")

    for cluster_idx in cluster_index_to_file_map:
        line_string = str(cluster_idx) + "\t"
        for header in mangled_mapping.keys():
            line_string += str(cluster_index_to_file_map[cluster_idx][header]) + "\t"
        #print line_string
        output_file.write(line_string + "\n")
def main():
    params = ming_proteosafe_library.parse_xml_file(open(sys.argv[1]))
    proteome = ming_protein_library.parse_fasta_proteome_file(sys.argv[2])
    row_count, table_data = ming_fileio_library.parse_table_with_headers(sys.argv[3])
    decoy_marker = sys.argv[5]

    add_decoy_to_results(table_data, row_count, decoy_marker)
    psm_results = add_fdr_to_results(table_data, row_count)

    output_table = defaultdict(list)

    #Performing filters
    filter_type = params["filter.filter"][0]
    if filter_type == "FDR":
        fdr_threshold = float(params["FDR.FDR"][0])
        for psm in psm_results:
            if psm["QValue"] < fdr_threshold:
                for key in psm:
                    output_table[key].append(psm[key])
    if filter_type == "PepFDR":
        fdr_threshold = float(params["PepFDR.PepFDR"][0])
        for psm in psm_results:
            if psm["PepQValue"] < fdr_threshold and psm["QValue"] < fdr_threshold:
                for key in psm:
                    output_table[key].append(psm[key])
    if filter_type == "FPR":
        print("FPR filtering is not implemented; no PSMs are output for this filter type")

    ming_fileio_library.write_dictionary_table_data(output_table, sys.argv[4])
def add_library_search_results_to_graph(G, library_search_filename, annotation_prefix=""):
    row_count, table_data = ming_fileio_library.parse_table_with_headers(library_search_filename)

    for i in range(row_count):
        cluster_index = table_data["#Scan#"][i]
        if cluster_index in G.node:
            G.node[cluster_index][annotation_prefix + "Adduct"] = str(table_data["Adduct"][i].encode('ascii', 'ignore'))
            G.node[cluster_index][annotation_prefix + "Compound_Name"] = str(''.join([j if ord(j) < 128 else ' ' for j in table_data["Compound_Name"][i]]).replace("\\", "\\\\"))
            G.node[cluster_index][annotation_prefix + "Adduct"] = str(table_data["Adduct"][i])
            G.node[cluster_index][annotation_prefix + "INCHI"] = str(''.join([j if ord(j) < 128 else ' ' for j in table_data["INCHI"][i]]).replace("\\", "\\\\"))
            G.node[cluster_index][annotation_prefix + "Smiles"] = str(''.join([j if ord(j) < 128 else ' ' for j in table_data["Smiles"][i]]).replace("\\", "\\\\"))
            G.node[cluster_index][annotation_prefix + "MQScore"] = str(table_data["MQScore"][i])
            G.node[cluster_index][annotation_prefix + "MassDiff"] = str(table_data["MassDiff"][i])
            G.node[cluster_index][annotation_prefix + "MZErrorPPM"] = str(table_data["MZErrorPPM"][i])
            G.node[cluster_index][annotation_prefix + "SharedPeaks"] = str(table_data["SharedPeaks"][i])
            G.node[cluster_index][annotation_prefix + "tags"] = str(''.join([j if ord(j) < 128 else ' ' for j in table_data["tags"][i]]).replace("\\", "\\\\"))
            G.node[cluster_index][annotation_prefix + "Library_Class"] = str(table_data["Library_Class"][i])
            G.node[cluster_index][annotation_prefix + "Instrument"] = str(table_data["Instrument"][i])
            G.node[cluster_index][annotation_prefix + "IonMode"] = str(table_data["IonMode"][i])
            G.node[cluster_index][annotation_prefix + "Ion_Source"] = str(table_data["Ion_Source"][i])
            G.node[cluster_index][annotation_prefix + "PI"] = str(table_data["PI"][i])
            G.node[cluster_index][annotation_prefix + "Data_Collector"] = str(table_data["Data_Collector"][i])
            G.node[cluster_index][annotation_prefix + "Compound_Source"] = str(table_data["Compound_Source"][i])
            G.node[cluster_index][annotation_prefix + "SpectrumID"] = str(table_data["SpectrumID"][i])
            G.node[cluster_index][annotation_prefix + "GNPSLibraryURL"] = "http://gnps.ucsd.edu/ProteoSAFe/gnpslibraryspectrum.jsp?SpectrumID=" + table_data["SpectrumID"][i]
def load_features_table(input_filename):
    feature_list = []
    line_counts, table_data = ming_fileio_library.parse_table_with_headers(input_filename)
    for i in range(line_counts):
        feature = Feature(float(table_data["#rt"][i]), float(table_data["mz"][i]), float(table_data["intensity"][i]))
        feature_list.append(feature)
    return feature_list
def main():
    input_results_filename = sys.argv[1]
    input_peptide_list_filename = sys.argv[2]

    products_to_rt_map = parse_identification_file(input_results_filename)
    line_counts, table_data = ming_fileio_library.parse_table_with_headers(input_peptide_list_filename)
    all_peptides = table_data["Peptides"]

    full_peptides_to_rt = map_products_to_peptide_rt(products_to_rt_map, all_peptides)
    partitioned_peptide_list = partition_peptides_random(full_peptides_to_rt, 3)
    #partitioned_peptide_list = partition_peptides_number_products(full_peptides_to_rt, 3)

    print("Total Products: " + str(len(products_to_rt_map)))

    total_detectable_products = 0
    for peptide_list in partitioned_peptide_list:
        number_products_detectable = count_number_of_acquireable_products(peptide_list, full_peptides_to_rt)
        #print number_products_detectable
        total_detectable_products += number_products_detectable

    print("Total Products Detectable: " + str(total_detectable_products))

    for peptide_list in partitioned_peptide_list:
        print("Partition=================")
        for peptide in peptide_list:
            print(peptide)
def create_ili_output_from_clusterinfo(cluster_info_filename, param_filename, clusterinfosummary_filename, filename_coordinate_mapping, output_filename):
    output_file = open(output_filename, "w")

    test_network = molecular_network_library.MolecularNetwork()
    test_network.load_clustersummary(clusterinfosummary_filename)

    line_counts, table_data = ming_fileio_library.parse_table_with_headers(cluster_info_filename)
    param_object = ming_proteosafe_library.parse_xml_file(open(param_filename, "r"))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_object)

    cluster_index_to_file_map = {}

    clusters_map = {}
    all_files = {}
    for i in range(line_counts):
        cluster_number = table_data["#ClusterIdx"][i]
        if test_network.get_cluster_index(cluster_number) == None:
            continue

        if not (cluster_number in clusters_map):
            clusters_map[cluster_number] = []
            cluster_index_to_file_map[cluster_number] = {}
            #Adding all file names to mapping
            for mangled_name in mangled_mapping.keys():
                cluster_index_to_file_map[cluster_number][mangled_name] = 0.0

        #print table_data["#Filename"][i].split("/")[1]
        mangled_filename_only = os.path.basename(table_data["#Filename"][i])
        cluster_index_to_file_map[cluster_number][mangled_filename_only] += float(table_data["#PrecIntensity"][i])

        spectrum_info = {"filename": table_data["#Filename"][i], "intensity": table_data["#PrecIntensity"][i]}
        all_files[table_data["#Filename"][i]] = 1
        clusters_map[cluster_number].append(spectrum_info)

    all_headers = ["filename", "X", "Y", "Z", "radius"]
    for cluster_idx in cluster_index_to_file_map:
        all_headers.append(cluster_idx)

    #writing header
    output_file.write(",".join(all_headers) + "\n")

    for sample_name in mangled_mapping:
        if sample_name.find("spec") == -1:
            continue
        real_filename = mangled_mapping[sample_name]
        if not os.path.basename(real_filename) in filename_coordinate_mapping:
            continue
        line_output = [real_filename]
        coordinate_object = filename_coordinate_mapping[os.path.basename(real_filename)]
        line_output.append(coordinate_object["x"])
        line_output.append(coordinate_object["y"])
        line_output.append(coordinate_object["z"])
        line_output.append(coordinate_object["radius"])
        print(line_output, coordinate_object)
        for cluster_idx in cluster_index_to_file_map:
            line_output.append(str(cluster_index_to_file_map[cluster_idx][sample_name]))
        output_file.write(",".join(line_output) + "\n")

    output_file.close()
def parse_MSGFPlus_tsvfile(filename):
    rows, table_data = ming_fileio_library.parse_table_with_headers(filename)

    scan_header = "ScanNum"
    peptide_header = "Peptide"
    protein_header = "Protein"
    score_header = "EValue"
    filename_header = "#SpecFile"
    charge_header = "Charge"
    ppm_error_header = "PrecursorError(ppm)"
    da_pm_error_header = "PrecursorError(Da)"
    precursor_header = "Precursor"
    frag_method_header = "FragMethod"

    parse_da_error = False
    if not ppm_error_header in table_data:
        parse_da_error = True

    decoy_indicator = "XXX_"

    psm_list = []
    for i in range(rows):
        scan = table_data[scan_header][i]
        peptide = table_data[peptide_header][i]
        protein = table_data[protein_header][i]
        score = -math.log10(float(table_data[score_header][i]))
        #print table_data[score_header][i] + "\t" + str(score)
        filename = table_data[filename_header][i]
        charge = int(table_data[charge_header][i])
        frag_method = table_data[frag_method_header][i]
        if parse_da_error:
            ppm_error = float(table_data[da_pm_error_header][i]) / float(table_data[precursor_header][i]) * 1000000
        else:
            ppm_error = float(table_data[ppm_error_header][i])

        decoy = 0
        #Stripping peptide dots
        if peptide[1] == "." and peptide[-2] == ".":
            peptide = peptide[2:-2]
        if protein.find(decoy_indicator) != -1:
            decoy = 1

        #Adding charge state to peptide name
        peptide += "." + str(charge)
        new_psm = PSM(filename, scan, peptide, score, decoy, protein, charge)
        new_psm.ppm_error = ppm_error
        new_psm.frag_method = frag_method
        psm_list.append(new_psm)

    return psm_list
def load_masses(input_filename):
    masses_list = []
    line_counts, table_data = ming_fileio_library.parse_table_with_headers(input_filename)
    for i in range(line_counts):
        masses_list.append([table_data["Resolaveability"][i], table_data["Peptide"][i], float(table_data["m/z"][i])])

    #Sort the peptide entries by m/z
    sorted_peptide_mass_list = sorted(masses_list, key=lambda pep_obj: pep_obj[2])

    return sorted_peptide_mass_list
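#A minimal sketch (not from the source) of querying the sorted list with bisect:
#pull every [resolvability, peptide, m/z] entry whose m/z falls within a ppm
#window of a query value. The helper name and default tolerance are invented.
import bisect

def peptides_near_mz(sorted_peptide_mass_list, query_mz, ppm_tolerance=10.0):
    #Convert the relative ppm window into an absolute m/z window
    delta = query_mz * ppm_tolerance / 1e6
    all_mzs = [entry[2] for entry in sorted_peptide_mass_list]
    left = bisect.bisect_left(all_mzs, query_mz - delta)
    right = bisect.bisect_right(all_mzs, query_mz + delta)
    return sorted_peptide_mass_list[left:right]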
def load_variant_to_score(filtered_filename):
    row_count, table_data = ming_fileio_library.parse_table_with_headers(filtered_filename)
    variant_to_score = {}
    for i in range(row_count):
        variant = table_data["variant_sequence"][i]
        score = float(table_data["score"][i])
        variant_to_score[variant] = score
    return variant_to_score
def create_ili_output_from_clusterinfo(cluster_info_filename, param_filename, clusterinfosummary_filename, filename_coordinate_mapping, output_filename):
    output_file = open(output_filename, "w")
    line_counts, table_data = ming_fileio_library.parse_table_with_headers(cluster_info_filename)
    param_object = ming_proteosafe_library.parse_xml_file(open(param_filename, "r"))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_object)

    cluster_index_to_file_map = {}

    clusters_map = {}
    all_files = {}
    for i in range(line_counts):
        cluster_number = table_data["#ClusterIdx"][i]
        if not (cluster_number in clusters_map):
            clusters_map[cluster_number] = []
            cluster_index_to_file_map[cluster_number] = {}
            #Adding all file names to mapping
            for mangled_name in mangled_mapping.keys():
                cluster_index_to_file_map[cluster_number][mangled_name] = 0.0

        #print table_data["#Filename"][i].split("/")[1]
        mangled_filename_only = os.path.basename(table_data["#Filename"][i])
        cluster_index_to_file_map[cluster_number][mangled_filename_only] += float(table_data["#PrecIntensity"][i])

        spectrum_info = {"filename": table_data["#Filename"][i], "intensity": table_data["#PrecIntensity"][i]}
        all_files[table_data["#Filename"][i]] = 1
        clusters_map[cluster_number].append(spectrum_info)

    all_headers = ["filename", "X", "Y", "Z", "radius"]
    for cluster_idx in cluster_index_to_file_map:
        all_headers.append(cluster_idx)

    #writing header
    output_file.write(",".join(all_headers) + "\n")

    for sample_name in mangled_mapping:
        if sample_name.find("spec") == -1:
            continue
        real_filename = mangled_mapping[sample_name]
        if not os.path.basename(real_filename) in filename_coordinate_mapping:
            continue
        line_output = [real_filename]
        coordinate_object = filename_coordinate_mapping[os.path.basename(real_filename)]
        line_output.append(coordinate_object["x"])
        line_output.append(coordinate_object["y"])
        line_output.append(coordinate_object["z"])
        line_output.append(coordinate_object["radius"])
        print(line_output, coordinate_object)
        for cluster_idx in cluster_index_to_file_map:
            line_output.append(str(cluster_index_to_file_map[cluster_idx][sample_name]))
        output_file.write(",".join(line_output) + "\n")

    output_file.close()
def load_precursor_to_protein_mapping(input_filename):
    row_count, table_data = ming_fileio_library.parse_table_with_headers(input_filename)
    precursor_to_protein_map = {}
    for i in range(row_count):
        precursor_string = table_data["original_peptide"][i]
        protein_string = table_data["proteins_mapped"][i]
        precursor_to_protein_map[precursor_string] = protein_string
    return precursor_to_protein_map
def load_score_cutoff_by_length(filtered_filename):
    row_count, table_data = ming_fileio_library.parse_table_with_headers(filtered_filename)
    score_cutoff_by_length = defaultdict(lambda: 10000)
    for i in range(row_count):
        length = int(table_data["length"][i])
        score = float(table_data["score"][i])
        score_cutoff_by_length[length] = min(score, score_cutoff_by_length[length])
    return score_cutoff_by_length
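#Illustrative application of the cutoffs above (not in the source): keep a PSM
#only if it meets the score floor recorded for its sequence length. Unseen
#lengths keep the defaultdict's 10000 floor and so reject everything. The
#.score/.sequence attribute names are assumptions.
def filter_psms_by_length_cutoff(psm_list, score_cutoff_by_length):
    passing = []
    for psm in psm_list:
        if psm.score >= score_cutoff_by_length[len(psm.sequence)]:
            passing.append(psm)
    return passing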
def get_scan_mapping_for_collision_method(path_to_original_results):
    mapping_dict = {}
    row_count, table_data = ming_fileio_library.parse_table_with_headers(path_to_original_results)
    print(path_to_original_results)
    scan_header = "Scan#"
    if not scan_header in table_data:
        scan_header = "ScanNum"
    for i in range(row_count):
        key = table_data["#SpecFile"][i] + "_" + table_data[scan_header][i]
        mapping_dict[key] = table_data["FragMethod"][i]
    return mapping_dict
def main():
    input_intermediate_folder = sys.argv[1]
    output_filename = sys.argv[2]

    all_protein_stats = {}

    #Merging the output of each parallel partition into a single table
    all_intermediate_files = ming_fileio_library.list_files_in_dir(input_intermediate_folder)
    output_map = defaultdict(list)
    for parallel_output_filename in all_intermediate_files:
        row_count, table_data = ming_fileio_library.parse_table_with_headers(parallel_output_filename)
        for key in table_data:
            output_map[key] += table_data[key]

    ming_fileio_library.write_dictionary_table_data(output_map, output_filename)
def main():
    input_folder_path = sys.argv[1]
    output_tsv = sys.argv[2]

    files = ming_fileio_library.list_files_in_dir(input_folder_path)

    merged_dict = defaultdict(list)
    for input_file in files:
        print("loading", input_file)
        row_count, table_data = ming_fileio_library.parse_table_with_headers(input_file)
        for key in table_data:
            merged_dict[key] += table_data[key]

    ming_fileio_library.write_dictionary_table_data(merged_dict, output_tsv)
def proteins_to_include(input_filename):
    row_count, table_data = ming_fileio_library.parse_table_with_headers(input_filename)
    protein_set = set()
    for i in range(row_count):
        fdr = float(table_data["fdr"][i])
        protein = table_data["protein"][i]
        #Parse the count as an integer; the original called len() on the string
        #value, which counted its digits rather than reading the number itself
        number_of_non_overlapping_sequences = int(table_data["number_of_non_overlapping_sequences"][i])
        if fdr <= 0.01 and number_of_non_overlapping_sequences > 1:
            protein_set.add(protein)
    return protein_set
def load_gnps_librarysearch(self, identification_filename):
    row_count, table_data = ming_fileio_library.parse_table_with_headers(identification_filename)

    for i in range(row_count):
        compound_name = table_data["Compound_Name"][i]
        smiles = table_data["Smiles"][i]
        inchi = table_data["INCHI"][i]
        SpectrumID = table_data["SpectrumID"][i]
        score = table_data["MQScore"][i]
        scan = table_data["#Scan#"][i]

        identification = ClusterLibraryIdentification(SpectrumID, compound_name, smiles, inchi, score, scan)
        self.identifications.append(identification)

        #Finding the cluster
        if scan in self.index_to_node_map:
            self.index_to_node_map[scan].library_identifications.append(identification)
def main():
    input_folder_path = sys.argv[1]
    param_xml_filename = sys.argv[2]
    output_tsv = sys.argv[3]

    files = ming_fileio_library.list_files_in_dir(input_folder_path)
    params_obj = ming_proteosafe_library.parse_xml_file(open(param_xml_filename))

    top_k = 1
    try:
        top_k = int(params_obj["TOP_K_RESULTS"][0])
    except:
        top_k = 1

    #merged_dict = defaultdict(list)
    merged_results = []
    for input_file in files:
        print("loading", input_file)
        row_count, table_data = ming_fileio_library.parse_table_with_headers(input_file)
        for i in range(row_count):
            result_dict = {}
            for key in table_data:
                result_dict[key] = table_data[key][i]
            merged_results.append(result_dict)

    results_per_spectrum = defaultdict(list)
    for result_obj in merged_results:
        spectrum_unique_key = result_obj["SpectrumFile"] + "___" + result_obj["#Scan#"]
        results_per_spectrum[spectrum_unique_key].append(result_obj)

    output_results = []
    for spectrum_unique_key in results_per_spectrum:
        sorted_results = sorted(results_per_spectrum[spectrum_unique_key], key=lambda spectrum_obj: float(spectrum_obj["MQScore"]), reverse=True)
        filtered_results = sorted_results[:top_k]
        output_results += filtered_results

    output_dict = defaultdict(list)
    for result_obj in output_results:
        for key in result_obj:
            output_dict[key].append(result_obj[key])

    ming_fileio_library.write_dictionary_table_data(output_dict, output_tsv)
def load_metadata_mapping(metadata_folder):
    file_name_to_sample_id_mapping = {}
    all_files = ming_fileio_library.list_files_in_dir(metadata_folder)
    if len(all_files) != 1:
        return {}
    row_count, table_data = ming_fileio_library.parse_table_with_headers(all_files[0])
    for i in range(row_count):
        filename = table_data["filename"][i]
        sample_id = table_data["#SampleID"][i]
        file_name_to_sample_id_mapping[filename] = sample_id
    return file_name_to_sample_id_mapping
def parse_psm_file(filename, load_extra_metadata=False):
    rows, table_data = ming_fileio_library.parse_table_with_headers(filename)

    known_headers = ["filename", "scan", "score", "decoy", "sequence", "charge", "ppm_error", "unmangled_name", "FDR", "collision_energy", "FragMethod"]
    extra_metadata_headers = set(table_data.keys()).difference(set(known_headers))

    psm_list = []
    for i in range(rows):
        filename = table_data["filename"][i]
        scan = int(table_data["scan"][i])
        score = float(table_data["score"][i])
        decoy = int(table_data["decoy"][i])
        variant_sequence = table_data["sequence"][i]
        charge = int(table_data["charge"][i])
        ppm_error = float(table_data["ppm_error"][i])
        fdr = float(table_data["FDR"][i])

        fragmentation_method = "N/A"
        if "FragMethod" in table_data:
            fragmentation_method = table_data["FragMethod"][i]

        collision_energy = 0.0
        if "collision_energy" in table_data:
            collision_energy = float(table_data["collision_energy"][i])

        protein = "NONE"
        if "unmangled_name" in table_data:
            filename = table_data["unmangled_name"][i]

        new_psm = PSM(filename, scan, variant_sequence, score, decoy, protein, charge)
        new_psm.ppm_error = ppm_error
        new_psm.fdr = fdr
        new_psm.frag_method = fragmentation_method
        new_psm.collision_energy = collision_energy

        if load_extra_metadata:
            extra_metadata = {}
            for header in extra_metadata_headers:
                extra_metadata[header] = table_data[header][i]
            new_psm.extra_metadata = extra_metadata

        psm_list.append(new_psm)

    return psm_list
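#Short usage sketch: parse_psm_file sets .fdr on every PSM, so a simple FDR
#filter can be layered on top. The helper itself is illustrative, not from the
#source.
def filter_psms_by_fdr(psm_list, max_fdr=0.01):
    return [psm for psm in psm_list if psm.fdr <= max_fdr]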
def main():
    input_filename = sys.argv[1]
    ppm_tolerance = float(sys.argv[2])

    line_counts, table_data = ming_fileio_library.parse_table_with_headers(input_filename)

    all_sub_peptides = []
    for i in range(line_counts):
        #print table_data["Peptides"][i]
        peptide = table_data["Peptides"][i]
        all_sub_peptides.append(peptide)
        for length in range(10):
            #substrings = find_all_substring_of_length(peptide, length + 4)
            substrings = [peptide[:length + 4], peptide[length + 4:]]
            #print peptide + "\t" + str(substrings)
            all_sub_peptides += substrings

    #print len(all_sub_peptides)
    all_sub_peptides = list(set(all_sub_peptides))
    #print len(all_sub_peptides)

    peptide_mass_map = {}
    for peptide in all_sub_peptides:
        peptide_key = peptide + ".2"
        peptide_mass = mass.calculate_mass(sequence=peptide, ion_type='M', charge=2)
        peptide_mass_map[peptide_key] = peptide_mass

        peptide_key = peptide + ".3"
        peptide_mass = mass.calculate_mass(sequence=peptide, ion_type='M', charge=3)
        peptide_mass_map[peptide_key] = peptide_mass

        peptide_key = peptide + ".4"
        peptide_mass = mass.calculate_mass(sequence=peptide, ion_type='M', charge=4)
        peptide_mass_map[peptide_key] = peptide_mass

        #print peptide + "\t" + "2" + "\t" + str(mass.calculate_mass(sequence=peptide, ion_type='M', charge=2))
        #print peptide + "\t" + "3" + "\t" + str(mass.calculate_mass(sequence=peptide, ion_type='M', charge=3))
        #print peptide + "\t" + "4" + "\t" + str(mass.calculate_mass(sequence=peptide, ion_type='M', charge=4))

    #Determine uniqueness
    find_resolveable_peptides(peptide_mass_map, ppm_tolerance)
def load_group_attribute_mappings(metadata_filename):
    row_count, table_data = ming_fileio_library.parse_table_with_headers(metadata_filename)
    filename_header = "filename"
    attributes_to_groups_mapping = defaultdict(set)
    group_to_files_mapping = defaultdict(list)

    for key in table_data:
        all_group_names = []
        if key.find("ATTRIBUTE_") != -1:
            #Determine unique values in this column
            for i in range(row_count):
                filename = table_data[filename_header][i].rstrip()
                if len(filename) > 2:
                    group_to_files_mapping[table_data[key][i]].append(filename)
                    attributes_to_groups_mapping[key].add(table_data[key][i])

    return group_to_files_mapping, attributes_to_groups_mapping
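#Usage sketch (purely illustrative, helper name invented): report how many
#files fall into each group discovered for every ATTRIBUTE_ column.
def summarize_attribute_groups(metadata_filename):
    group_to_files, attributes_to_groups = load_group_attribute_mappings(metadata_filename)
    for attribute in attributes_to_groups:
        for group in attributes_to_groups[attribute]:
            print(attribute, group, len(group_to_files[group]))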
def main():
    input_folder = sys.argv[1]
    input_tsvfile = sys.argv[2]
    output_tsvfile = sys.argv[3]

    allowed_passthrough_extensions = []
    extension_conversion_mapping = {}

    for i in range(4, len(sys.argv)):
        print(i)
        conversion_parameter = sys.argv[i]
        print(conversion_parameter)
        from_extension = conversion_parameter.split(":")[0]
        to_extension = conversion_parameter.split(":")[1]
        extension_conversion_mapping[from_extension] = to_extension
        if from_extension == to_extension:
            allowed_passthrough_extensions.append(from_extension)

    file_renaming_reverse_mapping = {}
    all_input_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder) if os.path.isfile(os.path.join(input_folder, f))]
    for input_file in all_input_files:
        input_extension = os.path.splitext(input_file)[1][1:]
        if input_extension in extension_conversion_mapping:
            renamed = os.path.splitext(os.path.basename(input_file))[0] + "." + extension_conversion_mapping[input_extension]
            file_renaming_reverse_mapping[renamed] = os.path.basename(input_file)

    row_count, table_data = ming_fileio_library.parse_table_with_headers(input_tsvfile)
    for header in table_data:
        for i in range(row_count):
            for find_to_replace in file_renaming_reverse_mapping:
                table_data[header][i] = table_data[header][i].replace(find_to_replace, file_renaming_reverse_mapping[find_to_replace])

    ming_fileio_library.write_dictionary_table_data(table_data, output_tsvfile)
def load_identification_file_as_map(input_results_filename):
    print("Loading", input_results_filename)
    row_count, table_data = ming_fileio_library.parse_table_with_headers(input_results_filename)
    identification_map = {}
    for i in range(row_count):
        scan_number = int(table_data["#Scan#"][i])
        identification = table_data["Compound_Name"][i]
        spectrum_id = table_data["SpectrumID"][i]
        identification_dict = {}
        identification_dict["identification"] = identification
        identification_dict["spectrum_id"] = spectrum_id
        identification_map[scan_number] = identification_dict
    return identification_map
def parse_msplit_file(filename, load_extra_metadata=False):
    rows, table_data = ming_fileio_library.parse_table_with_headers(filename)

    known_headers = ["filename", "scan", "score", "decoy", "sequence", "charge", "ppm_error", "unmangled_name", "FDR", "collision_energy", "FragMethod"]
    extra_metadata_headers = set(table_data.keys()).difference(set(known_headers))

    psm_list = []
    for i in range(rows):
        filename = table_data["internalFilename"][i]
        scan = int(table_data["Scan#"][i])
        score = table_data["cosine(M,A)"][i]
        decoy = 0
        variant_sequence = table_data["Annotation"][i]
        charge = table_data["Charge"][i]
        protein = "NONE"

        new_psm = PSM(filename, scan, variant_sequence, score, decoy, protein, charge)
        psm_list.append(new_psm)

    return psm_list
def main():
    input_filename = sys.argv[1]
    line_counts, table_data = ming_fileio_library.parse_table_with_headers(input_filename)

    all_sub_peptides = []
    for i in range(line_counts):
        #print table_data["Peptides"][i]
        for length in range(10):
            peptide = table_data["Peptides"][i]
            substrings = find_all_substring_of_length(peptide, length + 4)
            #print peptide + "\t" + str(substrings)
            all_sub_peptides += substrings

    #print len(all_sub_peptides)
    all_sub_peptides = list(set(all_sub_peptides))
    #print len(all_sub_peptides)

    for peptide in all_sub_peptides:
        print(peptide + "\t" + "2" + "\t" + str(mass.calculate_mass(sequence=peptide, ion_type='M', charge=2)))
        print(peptide + "\t" + "3" + "\t" + str(mass.calculate_mass(sequence=peptide, ion_type='M', charge=3)))
        print(peptide + "\t" + "4" + "\t" + str(mass.calculate_mass(sequence=peptide, ion_type='M', charge=4)))
def load_pairsinfo(self, pairs_filename):
    row_count, table_data = ming_fileio_library.parse_table_with_headers(pairs_filename)

    if "CLUSTERID1" in table_data:
        for i in range(row_count):
            node1 = table_data["CLUSTERID1"][i]
            node2 = table_data["CLUSTERID2"][i]
            cosine = table_data["Cosine"][i]
            deltamz = table_data["DeltaMZ"][i]

            pair = NetworkPair(node1, node2, cosine, deltamz)
            self.pairs.append(pair)
    else:
        row_count, table_data = ming_fileio_library.parse_table_without_headers(pairs_filename)
        for i in range(row_count):
            node1 = table_data[0][i]
            node2 = table_data[1][i]
            cosine = table_data[4][i]
            deltamz = table_data[2][i]

            pair = NetworkPair(node1, node2, cosine, deltamz)
            self.pairs.append(pair)

    #Make the pairs consistent, specifically adding an adjacency list
    for pair in self.pairs:
        node1 = pair.node1
        node2 = pair.node2

        if not (node1 in self.index_to_neighbors):
            self.index_to_neighbors[node1] = []
        if not (node2 in self.index_to_neighbors):
            self.index_to_neighbors[node2] = []

        self.index_to_neighbors[node1].append(node2)
        self.index_to_neighbors[node2].append(node1)
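#Illustrative accessor for the adjacency list built above. A method with this
#name is called elsewhere in these scripts, but this particular body is an
#assumption, not the source implementation.
def get_node_neighbors(self, node_index):
    return self.index_to_neighbors.get(node_index, [])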
def main():
    input_result_filename = sys.argv[1]
    output_result_filename = sys.argv[2]

    spectrum_id_cache = {}

    input_rows, input_table = ming_fileio_library.parse_table_with_headers(input_result_filename)
    output_table = defaultdict(list)

    output_headers = ["SpectrumID", "Compound_Name", "Ion_Source", "Instrument", "Compound_Source", "PI", "Data_Collector", "Adduct"]
    output_headers += ["Precursor_MZ", "ExactMass", "Charge", "CAS_Number", "Pubmed_ID", "Smiles", "INCHI", "INCHI_AUX", "Library_Class"]
    output_headers += ["IonMode", "UpdateWorkflowName", "LibraryQualityString", "#Scan#", "SpectrumFile", "MQScore", "Organism"]
    output_headers += ["TIC_Query", "RT_Query", "MZErrorPPM", "SharedPeaks", "MassDiff", "LibMZ", "SpecMZ", "SpecCharge"]

    for header in output_headers:
        output_table[header] = []

    number_hits_per_query = defaultdict(lambda: 0)
    for i in range(input_rows):
        number_hits_per_query[input_table["FileScanUniqueID"][i]] += 1

    for i in range(input_rows):
        spectrum_id = input_table["LibrarySpectrumID"][i]
        score = input_table["MQScore"][i]
        filename = input_table["SpectrumFile"][i]
        libfilename = input_table["LibraryName"][i]
        scan = input_table["#Scan#"][i]
        TIC_Query = input_table["UnstrictEvelopeScore"][i]
        RT_Query = input_table["p-value"][i]
        SpecCharge = input_table["Charge"][i]
        SpecMZ = input_table["SpecMZ"][i]
        MZErrorPPM = input_table["mzErrorPPM"][i]
        SharedPeaks = input_table["LibSearchSharedPeaks"][i]
        MassDiff = input_table["ParentMassDiff"][i]

        print(spectrum_id)

        gnps_library_spectrum = None
        try:
            gnps_library_spectrum = None
            if spectrum_id in spectrum_id_cache:
                gnps_library_spectrum = spectrum_id_cache[spectrum_id]
            else:
                gnps_library_spectrum = ming_gnps_library.get_library_spectrum(spectrum_id)
                spectrum_id_cache[spectrum_id] = gnps_library_spectrum
        except KeyboardInterrupt:
            raise
        except:
            continue

        gnps_library_spectrum["annotations"] = sorted(gnps_library_spectrum["annotations"], key=lambda annotation: annotation["create_time"], reverse=True)

        output_table["SpectrumID"].append(spectrum_id)
        output_table["Compound_Name"].append(gnps_library_spectrum["annotations"][0]["Compound_Name"].replace("\t", ""))
        output_table["Ion_Source"].append(gnps_library_spectrum["annotations"][0]["Ion_Source"].replace("\t", ""))
        output_table["Instrument"].append(gnps_library_spectrum["annotations"][0]["Instrument"].replace("\t", ""))
        output_table["Compound_Source"].append(gnps_library_spectrum["annotations"][0]["Compound_Source"].replace("\t", ""))
        output_table["PI"].append(gnps_library_spectrum["annotations"][0]["PI"].replace("\t", ""))
        output_table["Data_Collector"].append(gnps_library_spectrum["annotations"][0]["Data_Collector"].replace("\t", ""))
        output_table["Adduct"].append(gnps_library_spectrum["annotations"][0]["Adduct"].replace("\t", ""))
        output_table["Precursor_MZ"].append(gnps_library_spectrum["annotations"][0]["Precursor_MZ"].replace("\t", ""))
        output_table["ExactMass"].append(gnps_library_spectrum["annotations"][0]["ExactMass"].replace("\t", ""))
        output_table["Charge"].append(gnps_library_spectrum["annotations"][0]["Charge"].replace("\t", ""))
        output_table["CAS_Number"].append(gnps_library_spectrum["annotations"][0]["CAS_Number"].replace("\t", ""))
        output_table["Pubmed_ID"].append(gnps_library_spectrum["annotations"][0]["Pubmed_ID"].replace("\t", ""))
        output_table["Smiles"].append(gnps_library_spectrum["annotations"][0]["Smiles"].replace("\t", ""))
        output_table["INCHI"].append(gnps_library_spectrum["annotations"][0]["INCHI"].replace("\t", ""))
        output_table["INCHI_AUX"].append(gnps_library_spectrum["annotations"][0]["INCHI_AUX"].replace("\t", ""))
        output_table["Library_Class"].append(gnps_library_spectrum["annotations"][0]["Library_Class"].replace("\t", ""))
        output_table["IonMode"].append(gnps_library_spectrum["annotations"][0]["Ion_Mode"].replace("\t", ""))

        if gnps_library_spectrum["annotations"][0]["Library_Class"] == "1":
            output_table["UpdateWorkflowName"].append("UPDATE-SINGLE-ANNOTATED-GOLD")
            output_table["LibraryQualityString"].append("Gold")
        elif gnps_library_spectrum["annotations"][0]["Library_Class"] == "2":
            output_table["UpdateWorkflowName"].append("UPDATE-SINGLE-ANNOTATED-SILVER")
            output_table["LibraryQualityString"].append("Silver")
        elif gnps_library_spectrum["annotations"][0]["Library_Class"] == "3":
            output_table["UpdateWorkflowName"].append("UPDATE-SINGLE-ANNOTATED-BRONZE")
            output_table["LibraryQualityString"].append("Bronze")
        elif gnps_library_spectrum["annotations"][0]["Library_Class"] == "4":
            output_table["UpdateWorkflowName"].append("UPDATE-SINGLE-ANNOTATED-BRONZE")
            output_table["LibraryQualityString"].append("Insilico")
        elif gnps_library_spectrum["annotations"][0]["Library_Class"] == "5":
            output_table["UpdateWorkflowName"].append("UPDATE-SINGLE-ANNOTATED-BRONZE")
            output_table["LibraryQualityString"].append("Insilico")
        elif gnps_library_spectrum["annotations"][0]["Library_Class"] == "10":
            output_table["UpdateWorkflowName"].append("UPDATE-SINGLE-ANNOTATED-BRONZE")
            output_table["LibraryQualityString"].append("Challenge")
        else:
            print("Unrecognized Library_Class", gnps_library_spectrum["annotations"][0]["Library_Class"])
            #Append placeholders so all output columns stay the same length
            output_table["UpdateWorkflowName"].append("N/A")
            output_table["LibraryQualityString"].append("N/A")

        output_table["#Scan#"].append(scan)
        output_table["SpectrumFile"].append(filename)
        output_table["LibraryName"].append(libfilename)
        output_table["MQScore"].append(score)
        output_table["Organism"].append(gnps_library_spectrum["spectruminfo"]["library_membership"])
        output_table["TIC_Query"].append(TIC_Query)
        output_table["RT_Query"].append(RT_Query)
        output_table["MZErrorPPM"].append(MZErrorPPM)
        output_table["SharedPeaks"].append(SharedPeaks)
        output_table["MassDiff"].append(MassDiff)
        output_table["LibMZ"].append(gnps_library_spectrum["annotations"][0]["Precursor_MZ"])
        output_table["SpecMZ"].append(SpecMZ)
        output_table["SpecCharge"].append(SpecCharge)
        output_table["FileScanUniqueID"].append(input_table["FileScanUniqueID"][i])
        output_table["NumberHits"].append(number_hits_per_query[input_table["FileScanUniqueID"][i]])

        tag_list = [(tag["tag_desc"] + "[" + tag["tag_type"] + "]") for tag in gnps_library_spectrum["spectrum_tags"]]
        tag_string = "||".join(tag_list).replace("\t", "")
        output_table["tags"].append(tag_string)

    ming_fileio_library.write_dictionary_table_data(output_table, output_result_filename)
def loading_network(filename, hasHeaders=False):
    node1_list = []
    node2_list = []
    mass_difference = []
    property1 = []
    cosine_score = []
    explained_intensity = []
    edge_annotation = []

    if hasHeaders == True:
        row_count, table_data = ming_fileio_library.parse_table_with_headers(filename)
        if row_count == -1:
            return nx.MultiGraph()
        node1_list = table_data["CLUSTERID1"]
        node2_list = table_data["CLUSTERID2"]
        mass_difference = table_data["DeltaMZ"]
        property1 = table_data["MEH"]
        cosine_score = None
        if "Cosine" in table_data:
            cosine_score = table_data["Cosine"]
        if "COSINE" in table_data:
            cosine_score = table_data["COSINE"]
        explained_intensity = table_data["OtherScore"]

        if len(property1) != len(node1_list):
            property1 = node1_list
        if len(explained_intensity) != len(node1_list):
            explained_intensity = node1_list

        if "EdgeAnnotation" in table_data:
            edge_annotation = table_data["EdgeAnnotation"]
        else:
            edge_annotation = [" "] * len(node1_list)
    else:
        row_count, table_data = ming_fileio_library.parse_table_without_headers(filename)
        if row_count == -1:
            return nx.MultiGraph()
        node1_list = table_data[0]
        node2_list = table_data[1]
        mass_difference = table_data[2]
        property1 = table_data[3]
        cosine_score = table_data[4]
        explained_intensity = table_data[5]
        edge_annotation = [" "] * len(node1_list)

    edge_property_map = {}
    edge_object_list = []

    intermediate_graph_nodes = set()
    intermediate_edges_to_add = []

    for i in range(row_count):
        edge_object = {}
        edge_object["node1"] = node1_list[i]
        edge_object["node2"] = node2_list[i]
        edge_object["mass_difference"] = mass_difference[i]
        edge_object["property1"] = property1[i]
        edge_object["cosine_score"] = float(cosine_score[i])
        edge_object["explained_intensity"] = float(explained_intensity[i])
        edge_object["component"] = -1
        edge_object["EdgeType"] = "Cosine"
        edge_object["EdgeAnnotation"] = edge_annotation[i].rstrip()
        edge_object["EdgeScore"] = float(cosine_score[i])

        edge_key = node1_list[i] + "-" + node2_list[i]
        edge_property_map[edge_key] = edge_object

        intermediate_graph_nodes.add(edge_object["node1"])
        intermediate_graph_nodes.add(edge_object["node2"])
        intermediate_edges_to_add.append((edge_object["node1"], edge_object["node2"], edge_object))

    G = nx.MultiGraph()
    G.add_nodes_from(intermediate_graph_nodes)
    G.add_edges_from(intermediate_edges_to_add)

    return G
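#Follow-on sketch (not from the source): once loading_network returns the
#MultiGraph, networkx can tag each node with a connected-component index,
#analogous to the per-edge "component" field initialized to -1 above. Uses the
#same pre-2.0 networkx G.node API as the rest of these scripts.
def assign_component_indices(G):
    for component_index, component in enumerate(nx.connected_components(G)):
        for node in component:
            G.node[node]["component"] = component_index
    return G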
def main():
    paramxml_input_filename = sys.argv[1]
    parallel_param_filename = sys.argv[2]
    input_spectra_folder = sys.argv[3]
    library_search_results_filename = sys.argv[4]
    output_matches_filename = sys.argv[5]

    params_obj = ming_proteosafe_library.parse_xml_file(open(paramxml_input_filename))

    output_map = {"specs_filename": [], "specs_scan": [], "dataset_filename": [], "dataset_scan": [], "score": [], "dataset_id": [], "dataset_title": [], "dataset_neighbors": [], "Compound_Name": [], "SpectrumID": []}

    try:
        if params_obj["FIND_MATCHES_IN_PUBLIC_DATA"][0] != "1":
            ming_fileio_library.write_dictionary_table_data(output_map, output_matches_filename)
            exit(0)
    except:
        ming_fileio_library.write_dictionary_table_data(output_map, output_matches_filename)
        exit(0)

    #If we are doing parallel
    params_map = json.loads(open(parallel_param_filename).read())
    partition_total = params_map["total_paritions"]
    partition_of_node = params_map["node_partition"]

    dataset_dict = params_map["dataset_dict"]
    all_datasets = params_map["all_datasets"]
    #all_datasets = all_datasets[partition_of_node::partition_total]

    all_matches = finding_matches_in_public_data(os.path.join(input_spectra_folder, "specs_ms.mgf"), all_datasets)

    #Lets parse the search results and then populate this thing with search results
    library_search_result_count, library_search_data = ming_fileio_library.parse_table_with_headers(library_search_results_filename)
    scan_to_library_map = {}
    for i in range(library_search_result_count):
        scan = library_search_data["Scan"][i]
        scan_to_library_map[scan] = {"Compound_Name": library_search_data["Compound_Name"][i], "SpectrumID": library_search_data["SpectrumID"][i]}

    for dataset in all_matches:
        #For each dataset, lets try to find the clustering information
        if len(all_matches[dataset]["matches"]) == 0:
            continue

        most_recent_molecular_networking_job = ming_gnps_library.get_most_recent_continuous_networking_of_dataset(dataset_dict[dataset]["task"])
        molecular_network = get_molecular_network_obj(most_recent_molecular_networking_job)

        for match in all_matches[dataset]["matches"]:
            output_map['specs_filename'].append("specs_ms.mgf")
            output_map['specs_scan'].append(match.query_scan)
            output_map['dataset_id'].append(dataset_dict[dataset]["dataset"])
            output_map['dataset_title'].append(dataset_dict[dataset]["title"])
            output_map['dataset_filename'].append(match.filename)
            output_map['dataset_scan'].append(match.scan)
            output_map['score'].append(match.score)

            #List the library identifications
            if str(match.query_scan) in scan_to_library_map:
                output_map['Compound_Name'].append(scan_to_library_map[str(match.query_scan)]["Compound_Name"])
                output_map['SpectrumID'].append(scan_to_library_map[str(match.query_scan)]["SpectrumID"])
            else:
                output_map['Compound_Name'].append("")
                output_map['SpectrumID'].append("")

            #Lets find all the analogs available
            if molecular_network is not None:
                neighbors_in_dataset = molecular_network.get_node_neighbors(match.scan)
                output_map['dataset_neighbors'].append(len(neighbors_in_dataset))
            else:
                output_map['dataset_neighbors'].append(0)

    ming_fileio_library.write_dictionary_table_data(output_map, output_matches_filename)
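#The node_partition/total_paritions fields drive a simple round-robin split of
#work across parallel nodes (the same slicing idiom appears commented out
#above, and actively in other scripts in this workflow). A minimal sketch of
#that idiom with hypothetical dataset IDs:
all_items = ["MSV000001", "MSV000002", "MSV000003", "MSV000004", "MSV000005"]
total_partitions = 2
for node in range(total_partitions):
    #Node k takes items k, k + total, k + 2*total, ...
    print(node, all_items[node::total_partitions])
#0 ['MSV000001', 'MSV000003', 'MSV000005']
#1 ['MSV000002', 'MSV000004']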
def main():
    parser = argparse.ArgumentParser(description='Creating Clustering Info Summary')
    parser.add_argument('proteosafe_parameters', help='proteosafe_parameters')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_metadata_file', help='output_metadata_file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.proteosafe_parameters))
    mangled_file_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_obj)

    default_group_mapping = defaultdict(list)
    file_to_group_mapping = {}

    #Mangled filename prefixes determine the default group assignment
    prefix_to_group = {"specone-": "G1", "spectwo-": "G2", "specthree-": "G3", "specfour-": "G4", "specfive-": "G5", "specsix-": "G6"}
    for mangled_name in mangled_file_mapping:
        for prefix, group_name in prefix_to_group.items():
            if mangled_name.find(prefix) != -1:
                default_group_mapping[group_name].append(mangled_file_mapping[mangled_name])
                file_to_group_mapping[os.path.basename(mangled_file_mapping[mangled_name])] = group_name

    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder)

    row_count = 0
    table_data = defaultdict(list)
    if len(metadata_files_in_folder) == 1:
        row_count, table_data = ming_fileio_library.parse_table_with_headers(metadata_files_in_folder[0])

    for i in range(row_count):
        filename = table_data["filename"][i]
        if len(filename) < 2:
            continue

        #Strip a single pair of surrounding double quotes, if present
        if filename[0] == "\"":
            filename = filename[1:]
        if filename[-1] == "\"":
            filename = filename[:-1]
        table_data["filename"][i] = filename

        basename_filename = os.path.basename(filename)
        group_name = "NoDefaultGroup"
        if basename_filename in file_to_group_mapping:
            group_name = file_to_group_mapping[basename_filename]
        table_data["ATTRIBUTE_DefaultGroup"].append(group_name)

    #Files in the sample set without a metadata row still get a default-group entry
    for input_filename in file_to_group_mapping:
        if input_filename in table_data["filename"]:
            continue
        for key in table_data:
            if key != "ATTRIBUTE_DefaultGroup" and key != "filename":
                table_data[key].append("N/A")
        table_data["ATTRIBUTE_DefaultGroup"].append(file_to_group_mapping[input_filename])
        table_data["filename"].append(input_filename)

    ming_fileio_library.write_dictionary_table_data(table_data, args.output_metadata_file)
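#A minimal sketch of the quote-stripping applied to the "filename" column
#above: metadata exported from spreadsheet tools often wraps values in double
#quotes, and exactly one leading/trailing quote is removed. The helper name is
#hypothetical, introduced here for illustration only.
def strip_surrounding_quotes(value):
    if len(value) >= 2 and value[0] == "\"":
        value = value[1:]
    if len(value) >= 1 and value[-1] == "\"":
        value = value[:-1]
    return value

print(strip_surrounding_quotes("\"sample_A.mzXML\""))  #sample_A.mzXML
print(strip_surrounding_quotes("sample_B.mzXML"))      #sample_B.mzXML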
def add_clusterinfo_summary_to_graph(G, cluster_info_summary_filename):
    row_count, table_data = ming_fileio_library.parse_table_with_headers(cluster_info_summary_filename)

    #Setting default metadata for nodes in network
    default_listed_columns = [("precursor mass", "float"),
                              ("charge", "int"),
                              ("parent mass", "float"),
                              ("number of spectra", "int"),
                              ("cluster index", "int"),
                              ("sum(precursor intensity)", "float"),
                              ("RTMean", "float"),
                              ("AllGroups", "string"),
                              ("DefaultGroups", "string"),
                              ("RTConsensus", "float"),
                              ("UniqueFileSources", "string")]

    optional_listed_columns = [("Correlated Features Group ID", "string"),
                               ("Annotated Adduct Features ID", "string"),
                               ("Best Ion", "string"),
                               ("neutral M mass", "float"),
                               ("MS2 Verification Comment", "string"),
                               ("ProteoSAFeClusterLink", "string"),
                               ("GNPSLinkout_Cluster", "string"),
                               ("GNPSLinkout_Network", "string"),
                               ("componentindex", "string")]

    group_columns = ["G1", "G2", "G3", "G4", "G5", "G6"]

    for i in range(row_count):
        cluster_index = table_data["cluster index"][i]

        if cluster_index in G.node:
            for key_name, type_name in default_listed_columns:
                try:
                    if type_name == "float":
                        G.node[cluster_index][key_name] = float(table_data[key_name][i])
                    elif type_name == "int":
                        G.node[cluster_index][key_name] = int(table_data[key_name][i])
                    elif type_name == "string":
                        G.node[cluster_index][key_name] = str(table_data[key_name][i])
                except:
                    if type_name == "float":
                        G.node[cluster_index][key_name] = 0.0
                    elif type_name == "int":
                        G.node[cluster_index][key_name] = 0
                    elif type_name == "string":
                        G.node[cluster_index][key_name] = "N/A"

            for group_name in group_columns:
                try:
                    G.node[cluster_index][group_name] = float(table_data[group_name][i])
                except:
                    G.node[cluster_index][group_name] = 0.0

            #Looking for all the groups
            for header in table_data:
                if header.find("GNPSGROUP") != -1:
                    try:
                        G.node[cluster_index][header] = int(table_data[header][i])
                    except:
                        try:
                            G.node[cluster_index][header] = float(table_data[header][i])
                        except:
                            G.node[cluster_index][header] = -1

            #Looking for all Attributes
            for header in table_data:
                if header.find("ATTRIBUTE_") != -1:
                    try:
                        G.node[cluster_index][header] = table_data[header][i]
                    except:
                        G.node[cluster_index][header] = ""

            #Looking for optional columns
            for key_name, type_name in optional_listed_columns:
                if key_name in table_data:
                    try:
                        if type_name == "float":
                            G.node[cluster_index][key_name] = float(table_data[key_name][i])
                        elif type_name == "int":
                            G.node[cluster_index][key_name] = int(table_data[key_name][i])
                        elif type_name == "string":
                            G.node[cluster_index][key_name] = str(table_data[key_name][i])
                    except:
                        if type_name == "float":
                            G.node[cluster_index][key_name] = 0.0
                        elif type_name == "int":
                            G.node[cluster_index][key_name] = 0
                        elif type_name == "string":
                            G.node[cluster_index][key_name] = "N/A"
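#The per-column try/except blocks above all follow the same pattern: coerce
#the table value to a declared type, falling back to a type-specific default
#on failure. A minimal sketch of that pattern as a standalone helper; the
#function name and its defaults are illustrative, not part of the workflow.
def coerce_value(raw_value, type_name):
    defaults = {"float": 0.0, "int": 0, "string": "N/A"}
    casts = {"float": float, "int": int, "string": str}
    try:
        return casts[type_name](raw_value)
    except (ValueError, TypeError):
        return defaults[type_name]

print(coerce_value("12.5", "float"))  #12.5
print(coerce_value("", "int"))        #0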
def main():
    parser = argparse.ArgumentParser(description='Group Mapping from input, defaults and metadata file')
    parser.add_argument('proteosafe_parameters', help='proteosafe_parameters')
    parser.add_argument('groupmapping_folder', help='groupmapping_folder')
    parser.add_argument('attributemapping_folder', help='attributemapping_folder')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_groupmapping_file', help='output_groupmapping_file')
    parser.add_argument('output_attributemapping_file', help='output_attributemapping_file')
    parser.add_argument('inputspectrafolder', help='inputspectrafolder')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.proteosafe_parameters))
    mangled_file_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_obj)
    reverse_file_mangling = ming_proteosafe_library.get_reverse_mangled_file_mapping(param_obj)

    file_path_prefix = args.inputspectrafolder

    output_group_file = open(args.output_groupmapping_file, "w")
    output_attribute_file = open(args.output_attributemapping_file, "w")

    """Writing Default Grouping to output file"""
    default_groupings = {'G1': [], 'G2': [], 'G3': [], 'G4': [], 'G5': [], 'G6': []}
    for mangled_name in mangled_file_mapping.keys():
        if mangled_name.find("spec-") != -1:
            default_groupings['G1'].append(mangled_name.rstrip())
        if mangled_name.find("spectwo-") != -1:
            default_groupings['G2'].append(mangled_name.rstrip())
        if mangled_name.find("specthree-") != -1:
            default_groupings['G3'].append(mangled_name.rstrip())
        if mangled_name.find("specfour-") != -1:
            default_groupings['G4'].append(mangled_name.rstrip())
        if mangled_name.find("specfive-") != -1:
            default_groupings['G5'].append(mangled_name.rstrip())
        if mangled_name.find("specsix-") != -1:
            default_groupings['G6'].append(mangled_name.rstrip())

    for default_group_key in default_groupings.keys():
        default_group_string = "GROUP_" + default_group_key + "="
        for mangled_name in default_groupings[default_group_key]:
            default_group_string += os.path.join(file_path_prefix, mangled_name) + ";"
        if len(default_groupings[default_group_key]) > 0:
            #Trim the trailing semicolon
            default_group_string = default_group_string[:-1]
        output_group_file.write(default_group_string + "\n")

    """Determining whether to use the group mapping file or the metadata file"""
    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder)
    groupmapping_files_in_folder = ming_fileio_library.list_files_in_dir(args.groupmapping_folder)
    attributemapping_files_in_folder = ming_fileio_library.list_files_in_dir(args.attributemapping_folder)

    if len(metadata_files_in_folder) > 1:
        print("Too many metadata files inputted")
        exit(1)

    if len(metadata_files_in_folder) == 1:
        #Using metadata file
        row_count, table_data = ming_fileio_library.parse_table_with_headers(metadata_files_in_folder[0])
        if not "filename" in table_data:
            print("Missing 'filename' header in metadata file. Please specify the file name that goes along with each piece of metadata with the header: filename")
            exit(1)

        attributes_to_groups_mapping = defaultdict(set)
        group_to_files_mapping = defaultdict(list)
        for i in range(row_count):
            filename = table_data["filename"][i]
            basename_filename = os.path.basename(filename).rstrip()
            if basename_filename in reverse_file_mangling:
                mangled_name = reverse_file_mangling[basename_filename]
                for key in table_data:
                    if key.find("ATTRIBUTE_") != -1:
                        group_name = table_data[key][i]
                        if len(group_name) < 1:
                            continue
                        group_to_files_mapping[group_name].append(os.path.join(file_path_prefix, mangled_name))
                        attributes_to_groups_mapping[key.replace("ATTRIBUTE_", "")].add(group_name)
            else:
                #Filename is not part of the sample set
                continue

        for group_name in group_to_files_mapping:
            group_string = "GROUP_" + group_name + "=" + ";".join(group_to_files_mapping[group_name])
            output_group_file.write(group_string + "\n")

        for attribute_name in attributes_to_groups_mapping:
            attribute_string = attribute_name + "=" + ";".join(list(attributes_to_groups_mapping[attribute_name]))
            output_attribute_file.write(attribute_string + "\n")

        exit(0)

    """Falling back on old group mapping file"""
    if len(groupmapping_files_in_folder) > 1 or len(attributemapping_files_in_folder) > 1:
        print("Too many group/attribute mappings inputted")
        exit(1)

    if len(groupmapping_files_in_folder) == 1:
        for line in open(groupmapping_files_in_folder[0], errors='ignore'):
            splits = line.rstrip().split("=")
            if len(splits) < 2:
                continue
            group_name = splits[0]
            group_files = []
            for filename in splits[1].split(";"):
                if os.path.basename(filename) in reverse_file_mangling:
                    mangled_name = reverse_file_mangling[os.path.basename(filename)]
                    group_files.append(os.path.join(file_path_prefix, mangled_name))
            group_string = group_name + "=" + ";".join(group_files)
            output_group_file.write(group_string + "\n")

    if len(attributemapping_files_in_folder) == 1:
        for line in open(attributemapping_files_in_folder[0]):
            output_attribute_file.write(line)
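#The group-mapping lines written above follow the format
#GROUP_<name>=<path1>;<path2>;... A minimal sketch that parses such a line
#back into a (group, files) pair; parse_group_line and the input string are
#hypothetical, introduced here for illustration only.
def parse_group_line(line):
    name, _, files = line.rstrip().partition("=")
    return name, [f for f in files.split(";") if f]

print(parse_group_line("GROUP_G1=spec/a.mzXML;spec/b.mzXML"))
#('GROUP_G1', ['spec/a.mzXML', 'spec/b.mzXML'])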
def main():
    input_result_filename = sys.argv[1]
    output_result_filename = sys.argv[2]

    input_rows, input_table = ming_fileio_library.parse_table_with_headers(input_result_filename)

    output_table = defaultdict(list)
    output_headers = ["SpectrumID", "Compound_Name", "Ion_Source", "Instrument", "Compound_Source", "PI", "Data_Collector", "Adduct"]
    output_headers += ["Precursor_MZ", "ExactMass", "Charge", "CAS_Number", "Pubmed_ID", "Smiles", "INCHI", "INCHI_AUX", "Library_Class"]
    output_headers += ["IonMode", "UpdateWorkflowName", "LibraryQualityString", "#Scan#", "SpectrumFile", "MQScore", "Organism"]
    output_headers += ["TIC_Query", "RT_Query", "MZErrorPPM", "SharedPeaks", "MassDiff", "LibMZ", "SpecMZ", "SpecCharge"]

    for header in output_headers:
        output_table[header] = []

    for i in range(input_rows):
        spectrum_id = input_table["LibrarySpectrumID"][i]
        score = input_table["MQScore"][i]
        filename = input_table["SpectrumFile"][i]
        libfilename = input_table["LibraryName"][i]
        scan = input_table["#Scan#"][i]
        TIC_Query = input_table["UnstrictEvelopeScore"][i]
        RT_Query = input_table["p-value"][i]
        SpecCharge = input_table["Charge"][i]
        SpecMZ = input_table["SpecMZ"][i]
        MZErrorPPM = input_table["mzErrorPPM"][i]
        SharedPeaks = input_table["LibSearchSharedPeaks"][i]
        MassDiff = input_table["ParentMassDiff"][i]

        print(spectrum_id)
        gnps_library_spectrum = None
        try:
            gnps_library_spectrum = ming_gnps_library.get_library_spectrum(spectrum_id)
        except KeyboardInterrupt:
            raise
        except:
            continue

        output_table["SpectrumID"].append(spectrum_id)
        output_table["Compound_Name"].append(gnps_library_spectrum["annotations"][0]["Compound_Name"])
        output_table["Ion_Source"].append(gnps_library_spectrum["annotations"][0]["Ion_Source"])
        output_table["Instrument"].append(gnps_library_spectrum["annotations"][0]["Instrument"])
        output_table["Compound_Source"].append(gnps_library_spectrum["annotations"][0]["Compound_Source"])
        output_table["PI"].append(gnps_library_spectrum["annotations"][0]["PI"])
        output_table["Data_Collector"].append(gnps_library_spectrum["annotations"][0]["Data_Collector"])
        output_table["Adduct"].append(gnps_library_spectrum["annotations"][0]["Adduct"])
        output_table["Precursor_MZ"].append(gnps_library_spectrum["annotations"][0]["Precursor_MZ"])
        output_table["ExactMass"].append(gnps_library_spectrum["annotations"][0]["ExactMass"])
        output_table["Charge"].append(gnps_library_spectrum["annotations"][0]["Charge"])
        output_table["CAS_Number"].append(gnps_library_spectrum["annotations"][0]["CAS_Number"])
        output_table["Pubmed_ID"].append(gnps_library_spectrum["annotations"][0]["Pubmed_ID"])
        output_table["Smiles"].append(gnps_library_spectrum["annotations"][0]["Smiles"])
        output_table["INCHI"].append(gnps_library_spectrum["annotations"][0]["INCHI"])
        output_table["INCHI_AUX"].append(gnps_library_spectrum["annotations"][0]["INCHI_AUX"])
        output_table["Library_Class"].append(gnps_library_spectrum["annotations"][0]["Library_Class"])
        output_table["IonMode"].append(gnps_library_spectrum["annotations"][0]["Ion_Mode"])

        library_class = gnps_library_spectrum["annotations"][0]["Library_Class"]
        if library_class == "1":
            output_table["UpdateWorkflowName"].append("UPDATE-SINGLE-ANNOTATED-GOLD")
            output_table["LibraryQualityString"].append("Gold")
        elif library_class == "2":
            output_table["UpdateWorkflowName"].append("UPDATE-SINGLE-ANNOTATED-SILVER")
            output_table["LibraryQualityString"].append("Silver")
        elif library_class == "3":
            output_table["UpdateWorkflowName"].append("UPDATE-SINGLE-ANNOTATED-BRONZE")
            output_table["LibraryQualityString"].append("Bronze")
        elif library_class == "4":
            output_table["UpdateWorkflowName"].append("UPDATE-SINGLE-ANNOTATED-BRONZE")
            output_table["LibraryQualityString"].append("Insilico")
        elif library_class == "5":
            output_table["UpdateWorkflowName"].append("UPDATE-SINGLE-ANNOTATED-BRONZE")
            output_table["LibraryQualityString"].append("Insilico")
        elif library_class == "10":
            output_table["UpdateWorkflowName"].append("UPDATE-SINGLE-ANNOTATED-BRONZE")
            output_table["LibraryQualityString"].append("Challenge")
        else:
            #Fallback so the output columns stay aligned for unexpected class values
            output_table["UpdateWorkflowName"].append("UPDATE-SINGLE-ANNOTATED-BRONZE")
            output_table["LibraryQualityString"].append("Unknown")

        output_table["#Scan#"].append(scan)
        output_table["SpectrumFile"].append(filename)
        output_table["LibraryName"].append(libfilename)
        output_table["MQScore"].append(score)
        output_table["Organism"].append(gnps_library_spectrum["spectruminfo"]["library_membership"])
        output_table["TIC_Query"].append(TIC_Query)
        output_table["RT_Query"].append(RT_Query)
        output_table["MZErrorPPM"].append(MZErrorPPM)
        output_table["SharedPeaks"].append(SharedPeaks)
        output_table["MassDiff"].append(MassDiff)
        output_table["LibMZ"].append(gnps_library_spectrum["annotations"][0]["Precursor_MZ"])
        output_table["SpecMZ"].append(SpecMZ)
        output_table["SpecCharge"].append(SpecCharge)

        tag_string = ""
        for tag in gnps_library_spectrum["spectrum_tags"]:
            tag_string += tag["tag_desc"].replace("\t", "") + "||"
        if len(tag_string) > 3:
            #Trim the trailing "||"
            tag_string = tag_string[:-2]
        output_table["tags"].append(tag_string)

    ming_fileio_library.write_dictionary_table_data(output_table, output_result_filename)