def simple_presence_of_merged_spectra_processing(input_integrals_filename, output_clusterinfo_filename, mangled_mapping):
    """Pivot a merged-integrals CSV into a long-format cluster-info table.

    Each data row of the input becomes one output row per retention-time
    column, with columns: filename, abundance, scan_number, RT.

    Args:
        input_integrals_filename: CSV whose first column holds the sample
            name (header "RTS:") and remaining columns hold abundances.
        output_clusterinfo_filename: destination for the pivoted table.
        mangled_mapping: ProteoSAFe mangled->original filename mapping.
    """
    # Re-key the mangled mapping without file extensions.
    # NOTE(review): this mapping is never read afterwards — kept for parity
    # with the original; confirm whether it can be dropped.
    extension_stripped_mangled_mapping = {}
    for key in mangled_mapping:
        without_ext = ming_fileio_library.get_filename_without_extension(key)
        extension_stripped_mangled_mapping[without_ext] = mangled_mapping[key]

    # First CSV column is the sample-name column; the rest are RT headers.
    header_order = open(input_integrals_filename).readline().rstrip().split(",")[1:]

    table_list = ming_fileio_library.parse_table_with_headers_object_list(input_integrals_filename, delimiter=",")

    # Removing other header information (first two parsed rows are metadata)
    table_list = table_list[2:]

    output_dict = defaultdict(list)

    for result_object in table_list:
        # Was a bare except; narrowed to the missing-column case only.
        try:
            sample_name = result_object["RTS:"]
        except KeyError:
            sample_name = "unknown"

        # One output row per RT column, scan numbers starting at 1.
        for scan_number, header in enumerate(header_order, start=1):
            abundance = result_object[header]
            output_dict["filename"].append(sample_name)
            output_dict["abundance"].append(abundance)
            output_dict["scan_number"].append(scan_number)
            output_dict["RT"].append(header)

    ming_fileio_library.write_dictionary_table_data(output_dict, output_clusterinfo_filename)
def main():
    """Filter PSM results by FDR / peptide-level FDR and write the survivors."""
    params = ming_proteosafe_library.parse_xml_file(open(sys.argv[1]))
    proteome = ming_protein_library.parse_fasta_proteome_file(sys.argv[2])
    row_count, table_data = ming_fileio_library.parse_table_with_headers(sys.argv[3])
    decoy_marker = sys.argv[5]

    add_decoy_to_results(table_data, row_count, decoy_marker)
    psm_results = add_fdr_to_results(table_data, row_count)

    output_table = defaultdict(list)

    def _keep(psm):
        # Copy every field of a passing PSM into the column-oriented output.
        for field in psm:
            output_table[field].append(psm[field])

    # Performing filters
    filter_type = params["filter.filter"][0]
    if filter_type == "FDR":
        threshold = float(params["FDR.FDR"][0])
        for psm in psm_results:
            if psm["QValue"] < threshold:
                _keep(psm)
    if filter_type == "PepFDR":
        threshold = float(params["PepFDR.PepFDR"][0])
        for psm in psm_results:
            if psm["PepQValue"] < threshold and psm["QValue"] < threshold:
                _keep(psm)
    if filter_type == "FPR":
        print("Lets do nothing, don't know what this is")

    ming_fileio_library.write_dictionary_table_data(output_table, sys.argv[4])
def main():
    """Run the KL summary executable over every file in a folder and merge results.

    argv: [1] param XML, [2] input folder, [3] output table, [4] scratch dir,
    [5] executable path, [6] isotopes table path.
    """
    # Local import: subprocess replaces the old os.system shell string.
    import subprocess

    input_param = ming_proteosafe_library.parse_xml_file(open(sys.argv[1]))
    input_folder = sys.argv[2]
    output_file = sys.argv[3]
    scratch_folder = sys.argv[4]
    path_to_executable = sys.argv[5]
    path_to_isotopes_table = sys.argv[6]

    # TODO: read this from input_param once the parameter name is decided.
    parent_mass_tolerance = 0.05

    all_input_file_paths = ming_fileio_library.list_files_in_dir(input_folder)

    output_kl_intermediates = []
    for input_file in all_input_file_paths:
        output_kl_file = os.path.join(scratch_folder, os.path.basename(input_file) + ".kl")
        # Argv list instead of a concatenated shell string: the previous
        # os.system call broke on paths containing spaces/metacharacters.
        cmd = [
            path_to_executable,
            "--input", input_file,
            "--output_summary", output_kl_file,
            "--peak_tolerance", str(parent_mass_tolerance),
            "--isotope_file", path_to_isotopes_table,
        ]
        print(cmd)
        # Output was previously discarded via ">/dev/null 2>&1".
        subprocess.call(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        output_kl_intermediates.append(output_kl_file)

    # Column-wise concatenation of all per-file summaries.
    combined_table = defaultdict(list)
    for output_kl_file in output_kl_intermediates:
        row_count, table_data = ming_fileio_library.parse_table_with_headers(output_kl_file)
        for key in table_data:
            combined_table[key] += table_data[key]

    ming_fileio_library.write_dictionary_table_data(combined_table, output_file)
def main():
    """Attach KL-divergence metrics (by filename:scan) to a PSM table."""
    paramxml_filename = sys.argv[1]
    psms_input_file = sys.argv[2]
    kl_input_file = sys.argv[3]
    output_psms_file = sys.argv[4]

    parameters_obj = ming_proteosafe_library.parse_xml_file(open(paramxml_filename))

    kl_rows, kl_data = ming_fileio_library.parse_table_with_headers(kl_input_file)

    # Index KL metrics by "<basename>:<scan>" for O(1) lookup per PSM.
    kl_dict = {}
    for idx in range(kl_rows):
        lookup_key = os.path.basename(kl_data["Filename"][idx]) + ":" + str(kl_data["Scan"][idx])
        kl_dict[lookup_key] = {
            "kl_strict": kl_data["KL Strict"][idx],
            "kl_unstrict": kl_data["KL"][idx],
            "kl_interpeak": kl_data["Interpeak intensity"][idx],
        }

    # The PSM file is re-read as a plain table so extra columns can be added.
    psm_rows, psm_table_data = ming_fileio_library.parse_table_with_headers(psms_input_file)

    psm_table_data["kl_strict"] = []
    psm_table_data["kl_unstrict"] = []
    psm_table_data["kl_interpeak"] = []
    # -1 marks PSMs with no matching KL entry.
    absent = {"kl_strict": -1, "kl_unstrict": -1, "kl_interpeak": -1}
    for idx in range(psm_rows):
        lookup_key = psm_table_data["filename"][idx] + ":" + psm_table_data["scan"][idx]
        metrics = kl_dict.get(lookup_key, absent)
        psm_table_data["kl_strict"].append(metrics["kl_strict"])
        psm_table_data["kl_unstrict"].append(metrics["kl_unstrict"])
        psm_table_data["kl_interpeak"].append(metrics["kl_interpeak"])

    # (A historical C -> C+57 sequence rewrite was here; it remains disabled.)

    ming_fileio_library.write_dictionary_table_data(psm_table_data, output_psms_file)
def grab_all_results(task_list, output_peptide_directory, output_psm_directory, output_summary_filename):
    """Fetch one summary per task and write them as a combined table."""
    summary_dictionary = defaultdict(list)
    for task in task_list:
        single_result = grab_single_result(task, output_peptide_directory, output_psm_directory)
        for field in single_result:
            summary_dictionary[field].append(single_result[field])
    ming_fileio_library.write_dictionary_table_data(summary_dictionary, output_summary_filename)
def main():
    """Concatenate every table in a folder (column-wise) into one TSV."""
    source_folder = sys.argv[1]
    output_tsv = sys.argv[2]

    merged_dict = defaultdict(list)
    for table_filename in ming_fileio_library.list_files_in_dir(source_folder):
        print("loading", table_filename)
        _, table_data = ming_fileio_library.parse_table_with_headers(table_filename)
        for column in table_data:
            merged_dict[column].extend(table_data[column])

    ming_fileio_library.write_dictionary_table_data(merged_dict, output_tsv)
def main():
    """Merge all intermediate partition tables in a folder into one file.

    argv: [1] folder of intermediate tables, [2] output filename.
    """
    input_intermediate_folder = sys.argv[1]
    output_filename = sys.argv[2]

    # (Removed dead local `all_protein_stats` that was never used.)

    # Creating a command line for each partition
    all_intermediate_files = ming_fileio_library.list_files_in_dir(input_intermediate_folder)

    # Column-wise concatenation of every intermediate table.
    output_map = defaultdict(list)
    for parallel_output_filename in all_intermediate_files:
        row_count, table_data = ming_fileio_library.parse_table_with_headers(parallel_output_filename)
        for key in table_data:
            output_map[key] += table_data[key]

    ming_fileio_library.write_dictionary_table_data(output_map, output_filename)
def main():
    """Keep only the top-K results (by MQScore) per spectrum across partitions.

    argv: [1] folder of partition result tables, [2] param XML, [3] output TSV.
    """
    input_folder_path = sys.argv[1]
    param_xml_filename = sys.argv[2]
    output_tsv = sys.argv[3]

    files = ming_fileio_library.list_files_in_dir(input_folder_path)
    params_obj = ming_proteosafe_library.parse_xml_file(open(param_xml_filename))

    top_k = 1
    # Was a bare except; narrowed to absent/malformed parameter cases.
    try:
        top_k = int(params_obj["TOP_K_RESULTS"][0])
    except (KeyError, IndexError, ValueError, TypeError):
        top_k = 1

    # Load every row of every partition file as one dict per result.
    merged_results = []
    for input_file in files:
        print("loading", input_file)
        row_count, table_data = ming_fileio_library.parse_table_with_headers(input_file)
        for i in range(row_count):
            merged_results.append({key: table_data[key][i] for key in table_data})

    # Group results by (source file, scan).
    results_per_spectrum = defaultdict(list)
    for result_obj in merged_results:
        spectrum_unique_key = result_obj["SpectrumFile"] + "___" + result_obj["#Scan#"]
        results_per_spectrum[spectrum_unique_key].append(result_obj)

    # Keep the top_k highest-scoring results for each spectrum.
    output_results = []
    for spectrum_unique_key in results_per_spectrum:
        sorted_results = sorted(results_per_spectrum[spectrum_unique_key], key=lambda spectrum_obj: float(spectrum_obj["MQScore"]), reverse=True)
        output_results += sorted_results[:top_k]

    # Re-pivot to column-oriented form for the writer.
    output_dict = defaultdict(list)
    for result_obj in output_results:
        for key in result_obj:
            output_dict[key].append(result_obj[key])

    ming_fileio_library.write_dictionary_table_data(output_dict, output_tsv)
def main():
    """Merge all tables found in a folder into a single output TSV."""
    folder_to_merge = sys.argv[1]
    destination_tsv = sys.argv[2]

    combined = defaultdict(list)
    all_tables = ming_fileio_library.list_files_in_dir(folder_to_merge)
    for table_path in all_tables:
        print("loading", table_path)
        _, parsed = ming_fileio_library.parse_table_with_headers(table_path)
        # Append each column onto the running merged column of the same name.
        for column_name in parsed:
            combined[column_name] += parsed[column_name]

    ming_fileio_library.write_dictionary_table_data(combined, destination_tsv)
def main():
    """Rewrite converted filenames inside a TSV back to their original names.

    argv[4:] carry "from_ext:to_ext" conversion specs; any file in the input
    folder whose extension was converted gets its post-conversion name mapped
    back to the original basename in every cell of the table.
    """
    input_folder = sys.argv[1]
    input_tsvfile = sys.argv[2]
    output_tsvfile = sys.argv[3]

    allowed_passthrough_extensions = []
    extension_conversion_mapping = {}
    for arg_index in range(4, len(sys.argv)):
        print(arg_index)
        conversion_parameter = sys.argv[arg_index]
        print(conversion_parameter)
        pieces = conversion_parameter.split(":")
        from_extension = pieces[0]
        to_extension = pieces[1]
        extension_conversion_mapping[from_extension] = to_extension
        # Identity conversions mean the file passes through unrenamed.
        if from_extension == to_extension:
            allowed_passthrough_extensions.append(from_extension)

    # Map each post-conversion filename back to its original basename.
    file_renaming_reverse_mapping = {}
    for entry in os.listdir(input_folder):
        candidate_path = os.path.join(input_folder, entry)
        if not os.path.isfile(candidate_path):
            continue
        input_extension = os.path.splitext(candidate_path)[1][1:]
        if input_extension in extension_conversion_mapping:
            stem = os.path.splitext(os.path.basename(candidate_path))[0]
            renamed = stem + "." + extension_conversion_mapping[input_extension]
            file_renaming_reverse_mapping[renamed] = os.path.basename(candidate_path)

    row_count, table_data = ming_fileio_library.parse_table_with_headers(input_tsvfile)
    # Substring-replace every renamed filename in every cell of the table.
    for header in table_data:
        for row_index in range(row_count):
            for renamed_name, original_name in file_renaming_reverse_mapping.items():
                table_data[header][row_index] = table_data[header][row_index].replace(renamed_name, original_name)

    ming_fileio_library.write_dictionary_table_data(table_data, output_tsvfile)
def main():
    """Merge all intermediate partition tables in a folder into one file.

    argv: [1] folder of intermediate tables, [2] output filename.
    """
    input_intermediate_folder = sys.argv[1]
    output_filename = sys.argv[2]

    # (Removed dead local `all_protein_stats` that was never used.)

    # Creating a command line for each partition
    all_intermediate_files = ming_fileio_library.list_files_in_dir(input_intermediate_folder)

    # Column-wise concatenation of every intermediate table.
    output_map = defaultdict(list)
    for parallel_output_filename in all_intermediate_files:
        row_count, table_data = ming_fileio_library.parse_table_with_headers(parallel_output_filename)
        for key in table_data:
            output_map[key] += table_data[key]

    ming_fileio_library.write_dictionary_table_data(output_map, output_filename)
def main():
    """Dispatch public-data spectrum matching (clustered or raw) per params.

    argv: [1] param XML, [2] parallel-partition JSON, [3] matches output,
    [4] unique-files output, [5] all-matches output.
    """
    paramxml_input_filename = sys.argv[1]
    parallel_param_filename = sys.argv[2]
    output_matches_filename = sys.argv[3]
    output_filename_unique_files = sys.argv[4]
    output_filename_all_matches = sys.argv[5]

    params_obj = ming_proteosafe_library.parse_xml_file(open(paramxml_input_filename))

    # Empty output skeleton written when matching is disabled or misconfigured.
    output_map = {"specs_filename" : [],"specs_scan" : [], "dataset_filename" : [], "dataset_scan" : [], "score" : [], "dataset_id" : [], "dataset_title" : [], "dataset_description" : [], "matchedpeaks" : [], "mzerror" : []}

    match_parameters = get_parameters(params_obj)

    # Best-effort gate: if the flag is off OR missing/malformed (bare except),
    # emit the empty table and exit successfully.
    # NOTE(review): bare except also swallows unrelated errors — confirm intended.
    try:
        if params_obj["FIND_MATCHES_IN_PUBLIC_DATA"][0] != "1":
            ming_fileio_library.write_dictionary_table_data(output_map, output_matches_filename)
            exit(0)
    except:
        ming_fileio_library.write_dictionary_table_data(output_map, output_matches_filename)
        exit(0)

    #If we are doing parallel
    partition_total = 1
    partition_of_node = 0
    params_map = json.loads(open(parallel_param_filename).read())
    # NOTE: "total_paritions" spelling matches the producer of this JSON.
    partition_total = params_map["total_paritions"]
    partition_of_node = params_map["node_partition"]
    dataset_dict = params_map["dataset_dict"]
    all_datasets = params_map["all_datasets"]

    # Optional flag: search raw (unclustered) data instead of clustered.
    SEARCH_RAW = False
    try:
        if params_obj["SEARCH_RAW"][0] == "1":
            SEARCH_RAW = True
    except:
        print("Param Not Found", "SEARCH_RAW")

    """Matchign Clustered Data"""
    if SEARCH_RAW:
        match_unclustered(match_parameters, get_spectrum_collection_from_param_obj(params_obj), dataset_dict, all_datasets, output_matches_filename, output_filename_unique_files, output_filename_all_matches)
    else:
        match_clustered(match_parameters, get_spectrum_collection_from_param_obj(params_obj), dataset_dict, all_datasets, output_matches_filename, output_filename_unique_files, output_filename_all_matches)
def name_demangle_filenames(input_file, output_file, path_to_param, old_filename_header, new_filename_header):
    """Replace ProteoSAFe-mangled filenames in a table column with originals.

    Writes the demangled values into new_filename_header; passing the same
    header for old and new rewrites the column in place.
    """
    row_count, table_data = ming_fileio_library.parse_table_with_headers(input_file)
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        ming_proteosafe_library.parse_xml_file(open(path_to_param)))

    # A single column assignment covers both the in-place (old == new) and
    # the new-column case: the old column is fully read before reassignment.
    table_data[new_filename_header] = [
        mangled_mapping[table_data[old_filename_header][i]] for i in range(row_count)
    ]

    ming_fileio_library.write_dictionary_table_data(table_data, output_file)
def main():
    """Run ambiguity processing over MSGF+ results and write the output table.

    argv: [1] MSGF+ TSV results, [2] param XML, [3] library identifications,
    [4] cutoff-scores JSON, [5] output folder.
    """
    input_file_of_tsv_results = sys.argv[1]
    input_params_xml_filename = sys.argv[2]
    input_library_identifications_filename = sys.argv[3]
    input_cutoff_scores = sys.argv[4]
    output_folder = sys.argv[5]

    # Output keeps the input's basename, placed in the output folder.
    output_filename = os.path.join(output_folder, os.path.basename(input_file_of_tsv_results))

    params_object = ming_proteosafe_library.parse_xml_file(open(input_params_xml_filename))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)
    library_scans_to_identification = library_scans_to_identification_info(input_library_identifications_filename)
    cutoff_dict = json.loads(open(input_cutoff_scores).read())

    psm_list = ming_psm_library.parse_MSGFPlus_tsvfile(input_file_of_tsv_results)
    output_results_dict = process_ambiguity(psm_list, mangled_mapping, library_scans_to_identification, cutoff_dict)
    ming_fileio_library.write_dictionary_table_data(output_results_dict, output_filename)
def main():
    """Row-wise merge of all intermediate tables, capped at ~10M rows."""
    input_intermediate_folder = sys.argv[1]
    output_file = sys.argv[2]

    output_dict = defaultdict(list)
    total_rows = 0
    input_filenames = ming_fileio_library.list_files_in_dir(input_intermediate_folder)
    for input_filename in input_filenames:
        # Stop accumulating once the row cap is exceeded (remaining files
        # were skipped one-by-one in the original; net effect is identical).
        if total_rows > 10000000:
            break
        row_count, table_data = ming_fileio_library.parse_table_with_headers(input_filename)
        total_rows += row_count
        for i in range(row_count):
            for key in table_data:
                output_dict[key].append(table_data[key][i])

    ming_fileio_library.write_dictionary_table_data(output_dict, output_file)
def name_demangle_filenames_and_instrument_collision(input_file, output_file, path_to_param, path_to_original_results, old_filename_header, new_filename_header):
    """Demangle a filename column and ensure a FragMethod (collision) column.

    If the table lacks "FragMethod", it is filled by looking up
    "<filename>_<scan>" in a mapping built from the original results;
    unmatched rows get "NO_COLLISION". Then mangled filenames in
    old_filename_header are rewritten into new_filename_header (in place
    when the two headers are equal).
    """
    row_count, table_data = ming_fileio_library.parse_table_with_headers(
        input_file, skip_incomplete_lines=True)
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        ming_proteosafe_library.parse_xml_file(open(path_to_param)))

    if not "FragMethod" in table_data:
        print("Demangling", path_to_original_results, input_file)
        collision_mapping = get_scan_mapping_for_collision_method(
            path_to_original_results)

        #Adding collision column
        table_data["FragMethod"] = []
        # Debug: column lengths should both equal row_count.
        print(len(table_data["filename"]), len(table_data["scan"]))
        for i in range(row_count):
            # Lookup key matches the format produced by the collision mapper.
            key = table_data["filename"][i] + "_" + table_data["scan"][i]
            if key in collision_mapping:
                table_data["FragMethod"].append(collision_mapping[key])
            else:
                table_data["FragMethod"].append("NO_COLLISION")

    if old_filename_header == new_filename_header:
        # In-place rewrite of the existing column.
        for i in range(row_count):
            mangled_name = table_data[old_filename_header][i]
            unmangled_name = mangled_mapping[mangled_name]
            table_data[new_filename_header][i] = unmangled_name
    else:
        # Build the demangled values as a brand-new column.
        table_data[new_filename_header] = []
        for i in range(row_count):
            mangled_name = table_data[old_filename_header][i]
            unmangled_name = mangled_mapping[mangled_name]
            table_data[new_filename_header].append(unmangled_name)

    ming_fileio_library.write_dictionary_table_data(table_data, output_file)
def main():
    """Keep rows with FDR <= 0.01 and add a 'modified_sequence' column."""
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]

    row_count, table_data = ming_fileio_library.parse_table_with_headers(input_filename)
    output_dict = defaultdict(list)
    max_fdr = 0.01

    for row in range(row_count):
        if float(table_data["FDR"][row]) > max_fdr:
            continue
        peptide = table_data["sequence"][row]
        for column in table_data:
            output_dict[column].append(table_data[column][row])
        # Drop the last two characters — presumably a ".N" charge suffix;
        # TODO confirm against the producer of this table.
        output_dict["modified_sequence"].append(peptide[:-2])

    ming_fileio_library.write_dictionary_table_data(output_dict, output_filename)
def main():
    """Match query spectra against reference datasets and write all matches.

    argv: [1] param XML, [2] parallel-partition JSON, [3] spectra folder
    (expects specs_ms.mgf inside), [4] library search results, [5] output.
    """
    paramxml_input_filename = sys.argv[1]
    parallel_param_filename = sys.argv[2]
    input_spectra_folder = sys.argv[3]
    library_search_results_filename = sys.argv[4]
    output_matches_filename = sys.argv[5]

    params_obj = ming_proteosafe_library.parse_xml_file(
        open(paramxml_input_filename))

    # Best-effort gate: disabled or missing/malformed flag (bare except)
    # writes an empty placeholder table and exits successfully.
    # NOTE(review): bare except also swallows unrelated errors — confirm intended.
    try:
        if params_obj["MATCH_REFERENCE_DATASETS"][0] != "1":
            output_map = {"EMPTY": []}
            ming_fileio_library.write_dictionary_table_data(
                output_map, output_matches_filename)
            exit(0)
    except:
        output_map = {"EMPTY": []}
        ming_fileio_library.write_dictionary_table_data(
            output_map, output_matches_filename)
        exit(0)

    #Loading a dict of identifications
    identifications_map = load_identification_file_as_map(
        library_search_results_filename)

    #If we are doing parallel
    partition_total = 1
    partition_of_node = 0
    params_map = json.loads(open(parallel_param_filename).read())
    # NOTE: "total_paritions" spelling matches the producer of this JSON.
    partition_total = params_map["total_paritions"]
    partition_of_node = params_map["node_partition"]
    all_datasets = params_map["all_datasets"]

    all_matches = finding_matches_in_public_data(
        os.path.join(input_spectra_folder, "specs_ms.mgf"), all_datasets,
        identifications_map)

    # Pivot the list of match objects into a column-oriented table.
    output_map = defaultdict(list)
    for match in all_matches:
        for key in match:
            output_map[key].append(match[key])

    ming_fileio_library.write_dictionary_table_data(output_map, output_matches_filename)
def main():
    """Match query spectra against public datasets, annotating each match with
    library identifications and molecular-network neighbor counts.

    argv: [1] param XML, [2] parallel-partition JSON, [3] spectra folder
    (expects specs_ms.mgf inside), [4] library search results, [5] output.
    """
    paramxml_input_filename = sys.argv[1]
    parallel_param_filename = sys.argv[2]
    input_spectra_folder = sys.argv[3]
    library_search_results_filename = sys.argv[4]
    output_matches_filename = sys.argv[5]

    params_obj = ming_proteosafe_library.parse_xml_file(open(paramxml_input_filename))

    # Output skeleton; also written empty when matching is disabled.
    output_map = {"specs_filename" : [],"specs_scan" : [], "dataset_filename" : [], "dataset_scan" : [], "score" : [], "dataset_id" : [], "dataset_title" : [], "dataset_neighbors" : [], "Compound_Name" : [], "SpectrumID" : []}

    # Best-effort gate: flag off OR missing/malformed (bare except) writes the
    # empty table and exits successfully.
    # NOTE(review): bare except also swallows unrelated errors — confirm intended.
    try:
        if params_obj["FIND_MATCHES_IN_PUBLIC_DATA"][0] != "1":
            ming_fileio_library.write_dictionary_table_data(output_map, output_matches_filename)
            exit(0)
    except:
        ming_fileio_library.write_dictionary_table_data(output_map, output_matches_filename)
        exit(0)

    #If we are doing parallel
    partition_total = 1
    partition_of_node = 0
    params_map = json.loads(open(parallel_param_filename).read())
    # NOTE: "total_paritions" spelling matches the producer of this JSON.
    partition_total = params_map["total_paritions"]
    partition_of_node = params_map["node_partition"]
    dataset_dict = params_map["dataset_dict"]
    all_datasets = params_map["all_datasets"]
    #print(len(all_datasets))
    #print(partition_of_node)
    #print(partition_total)
    #all_datasets = all_datasets[partition_of_node::partition_total]

    all_matches = finding_matches_in_public_data(os.path.join(input_spectra_folder, "specs_ms.mgf"), all_datasets)

    #Lets parse the search results and then populate this thing with search results
    library_search_result_count, library_search_data = ming_fileio_library.parse_table_with_headers(library_search_results_filename)
    # Map query scan -> library identification (compound name + spectrum id).
    scan_to_library_map = {}
    for i in range(library_search_result_count):
        scan = library_search_data["Scan"][i]
        scan_to_library_map[scan] = {"Compound_Name" : library_search_data["Compound_Name"][i], "SpectrumID" : library_search_data["SpectrumID"][i]}

    for dataset in all_matches:
        #For each dataset, lets try to find the clustering information
        if len(all_matches[dataset]["matches"]) == 0:
            continue
        # Molecular network from the dataset's most recent continuous
        # networking job; used below for neighbor (analog) counts.
        most_recent_molecular_networking_job = ming_gnps_library.get_most_recent_continuous_networking_of_dataset(dataset_dict[dataset]["task"])
        molecular_network = get_molecular_network_obj(most_recent_molecular_networking_job)
        for match in all_matches[dataset]["matches"]:
            output_map['specs_filename'].append("specs_ms.mgf")
            output_map['specs_scan'].append(match.query_scan)
            output_map['dataset_id'].append(dataset_dict[dataset]["dataset"])
            output_map['dataset_title'].append(dataset_dict[dataset]["title"])
            output_map['dataset_filename'].append(match.filename)
            output_map['dataset_scan'].append(match.scan)
            output_map['score'].append(match.score)
            #List the library identifications
            if str(match.query_scan) in scan_to_library_map:
                output_map['Compound_Name'].append(scan_to_library_map[str(match.query_scan)]["Compound_Name"])
                output_map['SpectrumID'].append(scan_to_library_map[str(match.query_scan)]["SpectrumID"])
            else:
                output_map['Compound_Name'].append("")
                output_map['SpectrumID'].append("")
            #Lets find all the analogs available
            if molecular_network != None:
                neighbors_in_dataset = molecular_network.get_node_neighbors(match.scan)
                output_map['dataset_neighbors'].append(len(neighbors_in_dataset))
            else:
                output_map['dataset_neighbors'].append(0)

    ming_fileio_library.write_dictionary_table_data(output_map, output_matches_filename)
def main():
    """Build consensus library spectra for this partition node.

    Merges per-precursor spectrum groups from an existing library file and a
    new library file (both sorted by "annotation.charge" precursor key, one
    JSON list per line) using a two-pointer merge-join, filters each group,
    and writes: consensus spectra JSON, all-spectra JSON lines, and a
    candidates TSV — each named "<node_number>.<ext>".
    """
    param_filename = sys.argv[1]
    choose_consensus_params_filename = sys.argv[2]
    filtered_peptide_list_filename = sys.argv[3]
    length_score_cutoff_filename = sys.argv[4]
    existing_library_spectra_folder = sys.argv[5]
    new_library_spectra_folder = sys.argv[6]
    output_library_json_folder = sys.argv[7]
    output_library_all_spectra_json_folder = sys.argv[8]
    output_candidate_spectra_tsv_folder = sys.argv[9]

    filtered_peptide_set = load_filtered_peptide_set(
        filtered_peptide_list_filename)
    score_cutoff_by_length = load_score_cutoff_by_length(
        filtered_peptide_list_filename)
    variant_to_score = load_variant_to_score(filtered_peptide_list_filename)

    #Deciding on how to create consensus
    params_obj = ming_proteosafe_library.parse_xml_file(open(param_filename))
    parallel_params = json.loads(open(choose_consensus_params_filename).read())
    # NOTE: "total_paritions" spelling matches the producer of this JSON.
    total_node_count = parallel_params["total_paritions"]
    my_node_number = parallel_params["node_partition"]
    consensus_selection_method = params_obj["ConsensusChoice"][0]

    #output dict for listing all candidates
    library_candidates_output_dict = defaultdict(list)

    #determine filenames
    existing_library_filename, new_library_filename = determine_filenames_to_load(
        my_node_number, params_obj, existing_library_spectra_folder,
        new_library_spectra_folder)
    output_library_all_spectra_json_filename = os.path.join(
        output_library_all_spectra_json_folder, str(my_node_number) + ".json")
    output_library_all_spectra_json_file_handle = open(
        output_library_all_spectra_json_filename, "w")
    print(existing_library_filename, new_library_filename,
          output_library_all_spectra_json_folder,
          output_library_all_spectra_json_filename)
    print(len(existing_library_filename), existing_library_filename)
    print(len(new_library_filename), new_library_filename)

    library_spectra = []
    # Caps applied when filtering each precursor's spectrum group.
    top_scoring_to_keep = 100
    top_per_dataset = 20

    #If we are starting from scratch, so no existing library
    if len(existing_library_filename) == 0 and len(new_library_filename) == 0:
        print("no files to load")
        exit(0)

    # Case 1: only new spectra — no merge needed, process line by line.
    if len(existing_library_filename) == 0 and len(new_library_filename) != 0:
        print("New Only")
        input_spectrum_file_handle = open(new_library_filename)
        for line in input_spectrum_file_handle:
            # Each line is a JSON list of spectra sharing one precursor.
            all_spectra = json.loads(line)
            if len(all_spectra) == 0:
                continue
            #Filter to only top K scoring psms
            #all_spectra = sorted(all_spectra, key=lambda spectrum: spectrum["score"], reverse=True)
            #all_spectra = all_spectra[:top_scoring_to_keep]
            """Filtering intelligently"""
            all_spectra = filter_out_spectra_to_top(all_spectra, top_scoring_to_keep, top_per_dataset)
            # Precursor key is "annotation.charge".
            annotation = all_spectra[0]["annotation"] + "." + str(
                all_spectra[0]["charge"])
            print(annotation, len(all_spectra))
            # Every group is logged to the all-spectra file; only peptides in
            # the filtered set go on to consensus creation.
            if not annotation in filtered_peptide_set:
                output_library_all_spectra_json_file_handle.write(
                    json.dumps(all_spectra))
                output_library_all_spectra_json_file_handle.write("\n")
                continue
            output_library_all_spectra_json_file_handle.write(
                json.dumps(all_spectra))
            output_library_all_spectra_json_file_handle.write("\n")
            #Filter out spectra that do not pass minimum score by length
            #TODO
            library_spectrum = create_library_spectrum(
                all_spectra, consensus_selection_method, score_cutoff_by_length,
                variant_to_score, library_candidates_output_dict)
            library_spectra.append(library_spectrum)

    # Case 2: both files present — merge-join the two sorted streams.
    if len(existing_library_filename) != 0 and len(new_library_filename) != 0:
        print("New and Old")
        #load both files and iterate through them
        new_library_file_handle = open(new_library_filename)
        existing_library_file_handle = open(existing_library_filename)

        new_library_current_spectra_string = new_library_file_handle.readline()
        existing_library_current_spectra_string = existing_library_file_handle.readline()

        new_library_current_spectra = []
        existing_library_current_spectra = []
        new_library_precursor = ""
        existing_library_precursor = ""

        # parse_* flags: current line not yet consumed into a spectrum group.
        # *_ended flags: that stream is exhausted.
        parse_new_spectra = True
        parse_existing_spectra = True
        new_spectra_ended = False
        existing_spectra_ended = False

        #new_library_current_spectra = json.loads(new_library_file_handle.readline())
        #existing_library_current_spectra = json.loads(existing_library_file_handle.readline())
        #new_library_precursor = new_library_current_spectra[0]["annotation"] + "." + str(new_library_current_spectra[0]["charge"])
        #existing_library_precursor = existing_library_current_spectra[0]["annotation"] + "." + str(existing_library_current_spectra[0]["charge"])
        #print(new_library_precursor, existing_library_precursor)

        while True:
            print(len(existing_library_current_spectra_string),
                  len(new_library_current_spectra_string))
            # An empty readline() result means end-of-file for that stream.
            if len(new_library_current_spectra_string) == 0:
                new_spectra_ended = True
                parse_new_spectra = False
            if len(existing_library_current_spectra_string) == 0:
                existing_spectra_ended = True
                parse_existing_spectra = False
            if existing_spectra_ended == True and new_spectra_ended == True:
                break

            # Decode whichever side advanced last iteration.
            if parse_new_spectra == True:
                new_library_current_spectra = json.loads(
                    new_library_current_spectra_string)
                new_library_precursor = new_library_current_spectra[0][
                    "annotation"] + "." + str(
                        new_library_current_spectra[0]["charge"])
            if parse_existing_spectra == True:
                existing_library_current_spectra = json.loads(
                    existing_library_current_spectra_string)
                existing_library_precursor = existing_library_current_spectra[
                    0]["annotation"] + "." + str(
                        existing_library_current_spectra[0]["charge"])

            if new_library_precursor == existing_library_precursor:
                # Same precursor in both streams: pool and advance both.
                print("FOUND BOTH")
                all_spectra = []
                all_spectra += new_library_current_spectra
                all_spectra += existing_library_current_spectra

                #Filter to only top K scoring psms
                #all_spectra = sorted(all_spectra, key=lambda spectrum: spectrum["score"], reverse=True)
                #all_spectra = all_spectra[:top_scoring_to_keep]
                """Filtering intelligently"""
                all_spectra = filter_out_spectra_to_top(
                    all_spectra, top_scoring_to_keep, top_per_dataset)

                annotation = all_spectra[0]["annotation"] + "." + str(
                    all_spectra[0]["charge"])
                print(annotation, len(all_spectra))

                #Get new spectra
                new_library_current_spectra_string = new_library_file_handle.readline()
                existing_library_current_spectra_string = existing_library_file_handle.readline()
                parse_new_spectra = True
                parse_existing_spectra = True

                #Determing library
                if not annotation in filtered_peptide_set:
                    output_library_all_spectra_json_file_handle.write(
                        json.dumps(all_spectra))
                    output_library_all_spectra_json_file_handle.write("\n")
                    continue
                output_library_all_spectra_json_file_handle.write(
                    json.dumps(all_spectra))
                output_library_all_spectra_json_file_handle.write("\n")
                library_spectrum = create_library_spectrum(
                    all_spectra, consensus_selection_method,
                    score_cutoff_by_length, variant_to_score,
                    library_candidates_output_dict)
                library_spectra.append(library_spectrum)
            elif (new_library_precursor < existing_library_precursor and
                  new_spectra_ended == False) or existing_spectra_ended == True:
                # New-side precursor sorts first (or existing side exhausted):
                # consume only the new side.
                print("FOUND NEW", existing_spectra_ended, new_spectra_ended)
                all_spectra = new_library_current_spectra

                #Filter to only top K scoring psms
                #all_spectra = sorted(all_spectra, key=lambda spectrum: spectrum["score"], reverse=True)
                #all_spectra = all_spectra[:top_scoring_to_keep]
                """Filtering intelligently"""
                all_spectra = filter_out_spectra_to_top(
                    all_spectra, top_scoring_to_keep, top_per_dataset)

                annotation = all_spectra[0]["annotation"] + "." + str(
                    all_spectra[0]["charge"])
                print(annotation, len(all_spectra))

                #Get new spectra
                new_library_current_spectra_string = new_library_file_handle.readline()
                parse_new_spectra = True

                #Determing library
                if not annotation in filtered_peptide_set:
                    output_library_all_spectra_json_file_handle.write(
                        json.dumps(all_spectra))
                    output_library_all_spectra_json_file_handle.write("\n")
                    continue
                output_library_all_spectra_json_file_handle.write(
                    json.dumps(all_spectra))
                output_library_all_spectra_json_file_handle.write("\n")
                library_spectrum = create_library_spectrum(
                    all_spectra, consensus_selection_method,
                    score_cutoff_by_length, variant_to_score,
                    library_candidates_output_dict)
                library_spectra.append(library_spectrum)
            elif (new_library_precursor > existing_library_precursor and
                  existing_spectra_ended == False) or new_spectra_ended == True:
                # Existing-side precursor sorts first (or new side exhausted):
                # consume only the existing side.
                print("FOUND EXISTING")
                all_spectra = existing_library_current_spectra

                #Filter to only top K scoring psms
                #all_spectra = sorted(all_spectra, key=lambda spectrum: spectrum["score"], reverse=True)
                #all_spectra = all_spectra[:top_scoring_to_keep]
                """Filtering intelligently"""
                all_spectra = filter_out_spectra_to_top(
                    all_spectra, top_scoring_to_keep, top_per_dataset)

                annotation = all_spectra[0]["annotation"] + "." + str(
                    all_spectra[0]["charge"])
                print(annotation, len(all_spectra))

                #Get new spectra
                existing_library_current_spectra_string = existing_library_file_handle.readline()
                parse_existing_spectra = True

                #Determing library
                if not annotation in filtered_peptide_set:
                    output_library_all_spectra_json_file_handle.write(
                        json.dumps(all_spectra))
                    output_library_all_spectra_json_file_handle.write("\n")
                    continue
                output_library_all_spectra_json_file_handle.write(
                    json.dumps(all_spectra))
                output_library_all_spectra_json_file_handle.write("\n")
                library_spectrum = create_library_spectrum(
                    all_spectra, consensus_selection_method,
                    score_cutoff_by_length, variant_to_score,
                    library_candidates_output_dict)
                library_spectra.append(library_spectrum)
            else:
                # Should be unreachable if both inputs are sorted.
                print("Problem with Ordering")

    json.dump(
        library_spectra,
        open(
            os.path.join(output_library_json_folder,
                         str(my_node_number) + ".json"), "w"))
    output_library_all_spectra_json_file_handle.close()

    #Outputting
    output_candidate_spectra_tsv_filename = os.path.join(
        output_candidate_spectra_tsv_folder, str(my_node_number) + ".tsv")
    ming_fileio_library.write_dictionary_table_data(
        library_candidates_output_dict, output_candidate_spectra_tsv_filename)
def main():
    """Enrich library-search hits with full GNPS library-spectrum metadata.

    argv[1]: input search-result table (TSV)
    argv[2]: output enriched table (TSV)

    Each row's LibrarySpectrumID is resolved against GNPS (memoized in
    spectrum_id_cache so every distinct ID is fetched at most once) and the
    most recently created annotation of that spectrum is merged into the row.
    """
    input_result_filename = sys.argv[1]
    output_result_filename = sys.argv[2]

    # SpectrumID -> fetched library spectrum; avoids re-fetching duplicates.
    spectrum_id_cache = {}

    input_rows, input_table = ming_fileio_library.parse_table_with_headers(input_result_filename)

    output_table = defaultdict(list)
    output_headers = ["SpectrumID", "Compound_Name", "Ion_Source", "Instrument", "Compound_Source", "PI", "Data_Collector", "Adduct"]
    output_headers += ["Precursor_MZ", "ExactMass", "Charge", "CAS_Number", "Pubmed_ID", "Smiles", "INCHI", "INCHI_AUX", "Library_Class"]
    output_headers += ["IonMode", "UpdateWorkflowName", "LibraryQualityString", "#Scan#", "SpectrumFile", "MQScore", "Organism"]
    output_headers += ["TIC_Query", "RT_Query", "MZErrorPPM", "SharedPeaks", "MassDiff", "LibMZ", "SpecMZ", "SpecCharge"]
    # Pre-create the columns so the output has them even if no row survives.
    for header in output_headers:
        output_table[header] = []

    # Number of hits each query spectrum (FileScanUniqueID) received.
    number_hits_per_query = defaultdict(lambda: 0)
    for i in range(input_rows):
        number_hits_per_query[input_table["FileScanUniqueID"][i]] += 1

    # Library_Class code -> (UpdateWorkflowName, LibraryQualityString).
    library_class_mapping = {
        "1": ("UPDATE-SINGLE-ANNOTATED-GOLD", "Gold"),
        "2": ("UPDATE-SINGLE-ANNOTATED-SILVER", "Silver"),
        "3": ("UPDATE-SINGLE-ANNOTATED-BRONZE", "Bronze"),
        "4": ("UPDATE-SINGLE-ANNOTATED-BRONZE", "Insilico"),
        "5": ("UPDATE-SINGLE-ANNOTATED-BRONZE", "Insilico"),
        "10": ("UPDATE-SINGLE-ANNOTATED-BRONZE", "Challenge"),
    }

    # Annotation fields copied (tab-stripped) into identically named columns.
    annotation_passthrough_fields = ["Compound_Name", "Ion_Source", "Instrument",
                                     "Compound_Source", "PI", "Data_Collector", "Adduct",
                                     "Precursor_MZ", "ExactMass", "Charge", "CAS_Number",
                                     "Pubmed_ID", "Smiles", "INCHI", "INCHI_AUX", "Library_Class"]

    for i in range(input_rows):
        spectrum_id = input_table["LibrarySpectrumID"][i]
        score = input_table["MQScore"][i]
        filename = input_table["SpectrumFile"][i]
        libfilename = input_table["LibraryName"][i]
        scan = input_table["#Scan#"][i]
        TIC_Query = input_table["UnstrictEvelopeScore"][i]
        RT_Query = input_table["p-value"][i]
        SpecCharge = input_table["Charge"][i]
        SpecMZ = input_table["SpecMZ"][i]
        MZErrorPPM = input_table["mzErrorPPM"][i]
        SharedPeaks = input_table["LibSearchSharedPeaks"][i]
        MassDiff = input_table["ParentMassDiff"][i]
        print(spectrum_id)

        try:
            if spectrum_id in spectrum_id_cache:
                gnps_library_spectrum = spectrum_id_cache[spectrum_id]
            else:
                gnps_library_spectrum = ming_gnps_library.get_library_spectrum(spectrum_id)
                spectrum_id_cache[spectrum_id] = gnps_library_spectrum
        except KeyboardInterrupt:
            raise
        except:
            # Best effort: a hit whose library spectrum cannot be fetched is dropped.
            continue

        # Use the most recently created annotation for this spectrum.
        gnps_library_spectrum["annotations"] = sorted(gnps_library_spectrum["annotations"], key=lambda annotation: annotation["create_time"], reverse=True)
        top_annotation = gnps_library_spectrum["annotations"][0]

        output_table["SpectrumID"].append(spectrum_id)
        for field in annotation_passthrough_fields:
            # Strip tabs so embedded tabs cannot corrupt the TSV output.
            output_table[field].append(top_annotation[field].replace("\t", ""))
        output_table["IonMode"].append(top_annotation["Ion_Mode"].replace("\t", ""))

        if top_annotation["Library_Class"] in library_class_mapping:
            update_workflow_name, library_quality = library_class_mapping[top_annotation["Library_Class"]]
        else:
            # BUG FIX: an unknown class previously appended nothing to these two
            # columns, leaving them shorter than the rest and misaligning every
            # subsequent row. Append placeholders to keep the table rectangular.
            print("BULLLSHIT", top_annotation["Library_Class"])
            update_workflow_name, library_quality = "", ""
        output_table["UpdateWorkflowName"].append(update_workflow_name)
        output_table["LibraryQualityString"].append(library_quality)

        output_table["#Scan#"].append(scan)
        output_table["SpectrumFile"].append(filename)
        output_table["LibraryName"].append(libfilename)
        output_table["MQScore"].append(score)
        output_table["Organism"].append(gnps_library_spectrum["spectruminfo"]["library_membership"])
        output_table["TIC_Query"].append(TIC_Query)
        output_table["RT_Query"].append(RT_Query)
        output_table["MZErrorPPM"].append(MZErrorPPM)
        output_table["SharedPeaks"].append(SharedPeaks)
        output_table["MassDiff"].append(MassDiff)
        output_table["LibMZ"].append(top_annotation["Precursor_MZ"])
        output_table["SpecMZ"].append(SpecMZ)
        output_table["SpecCharge"].append(SpecCharge)
        output_table["FileScanUniqueID"].append(input_table["FileScanUniqueID"][i])
        output_table["NumberHits"].append(number_hits_per_query[input_table["FileScanUniqueID"][i]])

        # "desc[type]" entries joined with "||", tab-stripped.
        tag_list = [(tag["tag_desc"] + "[" + tag["tag_type"] + "]") for tag in gnps_library_spectrum["spectrum_tags"]]
        output_table["tags"].append("||".join(tag_list).replace("\t", ""))

    ming_fileio_library.write_dictionary_table_data(output_table, output_result_filename)
def main():
    """Match query spectra (specs_ms.mgf) against public datasets and write a match table.

    argv[1]: ProteoSAFe param XML
    argv[2]: parallel-partition param JSON
    argv[3]: folder containing specs_ms.mgf
    argv[4]: library search results table
    argv[5]: output matches table
    """
    paramxml_input_filename = sys.argv[1]
    parallel_param_filename = sys.argv[2]
    input_spectra_folder = sys.argv[3]
    library_search_results_filename = sys.argv[4]
    output_matches_filename = sys.argv[5]
    params_obj = ming_proteosafe_library.parse_xml_file(
        open(paramxml_input_filename))
    # Output columns; kept as lists so an early exit still writes a valid header-only table.
    output_map = {
        "specs_filename": [],
        "specs_scan": [],
        "dataset_filename": [],
        "dataset_scan": [],
        "score": [],
        "dataset_id": [],
        "dataset_title": [],
        "dataset_neighbors": [],
        "Compound_Name": [],
        "SpectrumID": []
    }
    # If public-data matching is disabled (or the parameter is absent entirely —
    # the bare except covers the missing-key case), emit the empty table and stop.
    try:
        if params_obj["FIND_MATCHES_IN_PUBLIC_DATA"][0] != "1":
            ming_fileio_library.write_dictionary_table_data(
                output_map, output_matches_filename)
            exit(0)
    except:
        ming_fileio_library.write_dictionary_table_data(
            output_map, output_matches_filename)
        exit(0)
    #If we are doing parallel
    partition_total = 1
    partition_of_node = 0
    params_map = json.loads(open(parallel_param_filename).read())
    partition_total = params_map["total_paritions"]   # NOTE: key is misspelled upstream ("paritions")
    partition_of_node = params_map["node_partition"]
    dataset_dict = params_map["dataset_dict"]
    all_datasets = params_map["all_datasets"]
    #print(len(all_datasets))
    #print(partition_of_node)
    #print(partition_total)
    #all_datasets = all_datasets[partition_of_node::partition_total]
    # NOTE(review): partitioning is read but the slice above is commented out,
    # so every node currently searches all datasets — confirm this is intended.
    all_matches = finding_matches_in_public_data(
        os.path.join(input_spectra_folder, "specs_ms.mgf"), all_datasets)
    #Lets parse the search results and then populate this thing with search results
    library_search_result_count, library_search_data = ming_fileio_library.parse_table_with_headers(
        library_search_results_filename)
    # Map query scan (string) -> its library identification, for annotation below.
    scan_to_library_map = {}
    for i in range(library_search_result_count):
        scan = library_search_data["Scan"][i]
        scan_to_library_map[scan] = {
            "Compound_Name": library_search_data["Compound_Name"][i],
            "SpectrumID": library_search_data["SpectrumID"][i]
        }
    for dataset in all_matches:
        #For each dataset, lets try to find the clustering information
        if len(all_matches[dataset]["matches"]) == 0:
            continue
        # Fetch the dataset's most recent continuous-networking job to count neighbors.
        most_recent_molecular_networking_job = ming_gnps_library.get_most_recent_continuous_networking_of_dataset(
            dataset_dict[dataset]["task"])
        molecular_network = get_molecular_network_obj(
            most_recent_molecular_networking_job)
        for match in all_matches[dataset]["matches"]:
            output_map['specs_filename'].append("specs_ms.mgf")
            output_map['specs_scan'].append(match.query_scan)
            output_map['dataset_id'].append(dataset_dict[dataset]["dataset"])
            output_map['dataset_title'].append(dataset_dict[dataset]["title"])
            output_map['dataset_filename'].append(match.filename)
            output_map['dataset_scan'].append(match.scan)
            output_map['score'].append(match.score)
            #List the library identifications
            if str(match.query_scan) in scan_to_library_map:
                output_map['Compound_Name'].append(scan_to_library_map[str(
                    match.query_scan)]["Compound_Name"])
                output_map['SpectrumID'].append(scan_to_library_map[str(
                    match.query_scan)]["SpectrumID"])
            else:
                output_map['Compound_Name'].append("")
                output_map['SpectrumID'].append("")
            #Lets find all the analogs available
            if molecular_network != None:
                neighbors_in_dataset = molecular_network.get_node_neighbors(
                    match.scan)
                output_map['dataset_neighbors'].append(
                    len(neighbors_in_dataset))
            else:
                # No network available for this dataset: report zero neighbors.
                output_map['dataset_neighbors'].append(0)
    ming_fileio_library.write_dictionary_table_data(output_map,
                                                    output_matches_filename)
def write_summary(self, output_filename): ming_fileio_library.write_dictionary_table_data(self.produce_protein_dict(), output_filename)
def main():
    """Attach an ATTRIBUTE_DefaultGroup column to the uploaded metadata table.

    Group membership (G1..G6) is derived from the mangled upload prefix of each
    spectrum file; files with no metadata row get a synthetic row of "N/A"s.
    """
    parser = argparse.ArgumentParser(
        description='Creating Clustering Info Summary')
    parser.add_argument('proteosafe_parameters', help='proteosafe_parameters')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_metadata_file', help='output_metadata_file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(
        open(args.proteosafe_parameters))
    mangled_file_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        param_obj)

    # Mangled-name prefix -> default group label.
    prefix_groups = [("specone-", "G1"), ("spectwo-", "G2"),
                     ("specthree-", "G3"), ("specfour-", "G4"),
                     ("specfive-", "G5"), ("specsix-", "G6")]
    default_group_mapping = defaultdict(list)
    file_to_group_mapping = {}
    for mangled_name in mangled_file_mapping:
        real_name = mangled_file_mapping[mangled_name]
        for prefix, group in prefix_groups:
            if mangled_name.find(prefix) != -1:
                default_group_mapping[group].append(real_name)
                file_to_group_mapping[os.path.basename(real_name)] = group

    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(
        args.metadata_folder)
    row_count = 0
    table_data = defaultdict(list)
    # Only a single metadata file is supported; otherwise start from an empty table.
    if len(metadata_files_in_folder) == 1:
        row_count, table_data = ming_fileio_library.parse_table_with_headers(
            metadata_files_in_folder[0])

    print(table_data)
    for key in table_data:
        print(key, len(table_data[key]))

    for i in range(row_count):
        print(i)
        filename = table_data["filename"][i]
        if len(filename) < 2:
            continue
        print(filename, filename[0], filename[-1])
        # Strip a single pair of surrounding double quotes, if present.
        if filename.startswith("\""):
            filename = filename[1:]
        if filename.endswith("\""):
            filename = filename[:-1]
        table_data["filename"][i] = filename
        group_name = file_to_group_mapping.get(
            os.path.basename(filename), "NoDefaultGroup")
        table_data["ATTRIBUTE_DefaultGroup"].append(group_name)

    # Add synthetic rows for grouped files missing from the metadata table.
    for input_filename in file_to_group_mapping:
        if input_filename in table_data["filename"]:
            continue
        for key in table_data:
            if key != "ATTRIBUTE_DefaultGroup" and key != "filename":
                table_data[key].append("N/A")
        table_data["ATTRIBUTE_DefaultGroup"].append(
            file_to_group_mapping[input_filename])
        table_data["filename"].append(input_filename)

    ming_fileio_library.write_dictionary_table_data(table_data,
                                                    args.output_metadata_file)
def main():
    """Build consensus library spectra for this node's partition and export them.

    argv[1..8]: param XML, parallel params JSON, filtered peptide list,
    length-score cutoff file, provenance JSON, merged-spectra folder,
    output JSON folder, output candidate TSV folder.
    argv[9..11]: output TSV / MGF / SPTXT folders for the converted library.
    """
    param_filename = sys.argv[1]
    choose_consensus_params_filename = sys.argv[2]
    filtered_peptide_list_filename = sys.argv[3]
    length_score_cutoff_filename = sys.argv[4]
    provenance_json_filename = sys.argv[5]
    merged_library_spectra_folder = sys.argv[6]
    output_library_json_folder = sys.argv[7]
    output_candidate_spectra_tsv_folder = sys.argv[8]
    # All three loaders read the same filtered peptide list file.
    filtered_peptide_set = load_filtered_peptide_set(
        filtered_peptide_list_filename)
    score_cutoff_by_length = load_score_cutoff_by_length(
        filtered_peptide_list_filename)
    variant_to_score = load_variant_to_score(filtered_peptide_list_filename)
    #Deciding on how to create consensus
    params_obj = ming_proteosafe_library.parse_xml_file(open(param_filename))
    parallel_params = json.loads(open(choose_consensus_params_filename).read())
    total_node_count = parallel_params["total_paritions"]  # NOTE: key misspelled upstream
    my_node_number = parallel_params["node_partition"]
    consensus_selection_method = params_obj["ConsensusChoice"][0]
    #output dict for listing all candidates
    library_candidates_output_dict = defaultdict(list)
    #determine filenames
    merged_library_filename, my_position_for_file, total_nodes_for_file = determine_filenames_to_load(
        my_node_number, total_node_count, merged_library_spectra_folder)
    print(merged_library_filename, my_position_for_file, total_nodes_for_file)
    library_spectra = []
    input_spectrum_file_handle = open(merged_library_filename)
    line_count = 0
    # The merged file is JSON-lines: each line is a list of spectra for one
    # peptide+charge. Lines are distributed round-robin across nodes.
    for line in input_spectrum_file_handle:
        line_count += 1
        if line_count % total_nodes_for_file != my_position_for_file:
            #print("Should Skip")
            continue
        else:
            print("NOT SKIP")
        all_spectra = json.loads(line)
        if len(all_spectra) == 0:
            continue
        # Peptide key is "<annotation>.<charge>".
        annotation = all_spectra[0]["annotation"] + "." + str(
            all_spectra[0]["charge"])
        print(annotation, len(all_spectra))
        # Only peptides that survived upstream filtering are kept.
        if not annotation in filtered_peptide_set:
            continue
        library_spectrum = create_library_spectrum(
            all_spectra, consensus_selection_method, score_cutoff_by_length,
            variant_to_score, library_candidates_output_dict)
        library_spectra.append(library_spectrum)
    json.dump(
        library_spectra,
        open(
            os.path.join(output_library_json_folder,
                         str(my_node_number) + ".json"), "w"))
    #Provenance Records
    provenance_records = json.loads(open(provenance_json_filename).read())
    #Modifying the output candidate file
    # Per candidate row, look up the augment/extract task for its search task;
    # blank when no provenance record exists.
    for i in range(len(library_candidates_output_dict["filename"])):
        proteosafe_task = library_candidates_output_dict["proteosafe_task"][i]
        if proteosafe_task in provenance_records["search_task_to_augment"]:
            library_candidates_output_dict["augment_task"].append(
                provenance_records["search_task_to_augment"][proteosafe_task])
        else:
            library_candidates_output_dict["augment_task"].append("")
        if proteosafe_task in provenance_records["search_task_to_extraction"]:
            library_candidates_output_dict["extract_task"].append(
                provenance_records["search_task_to_extraction"]
                [proteosafe_task])
        else:
            library_candidates_output_dict["extract_task"].append("")
    #Outputting
    output_candidate_spectra_tsv_filename = os.path.join(
        output_candidate_spectra_tsv_folder, str(my_node_number) + ".tsv")
    ming_fileio_library.write_dictionary_table_data(
        library_candidates_output_dict, output_candidate_spectra_tsv_filename)
    """Converted Output"""
    output_tsv_folder = sys.argv[9]
    output_mgf_folder = sys.argv[10]
    output_sptxt_folder = sys.argv[11]
    library_spectrum_collection = ming_spectrum_library.SpectrumCollection(
        "library spectra")
    # Wrap each consensus dict in a PeptideLibrarySpectrum; optional fields are
    # copied only when present in the dict.
    for library_spectrum in library_spectra:
        lib_spec = ming_spectrum_library.PeptideLibrarySpectrum(
            "", 0, 0, library_spectrum["peaks"], library_spectrum["mz"],
            library_spectrum["charge"], library_spectrum["annotation"],
            library_spectrum["protein"])
        if "score" in library_spectrum:
            lib_spec.score = library_spectrum["score"]
        if "variant_score" in library_spectrum:
            lib_spec.variant_score = library_spectrum["variant_score"]
        if "spectra_to_consider" in library_spectrum:
            lib_spec.num_spectra = library_spectrum["spectra_to_consider"]
        if "ranking" in library_spectrum:
            lib_spec.spectrum_ranking = library_spectrum["ranking"]
        if "proteosafe_task" in library_spectrum:
            lib_spec.proteosafe_task = library_spectrum["proteosafe_task"]
        if "originalspectrum_filename" in library_spectrum:
            lib_spec.originalfile_filename = library_spectrum[
                "originalspectrum_filename"]
        if "originalspectrum_scan" in library_spectrum:
            lib_spec.originalfile_scan = str(
                library_spectrum["originalspectrum_scan"])
        library_spectrum_collection.spectrum_list.append(lib_spec)
    output_mgf_filename = os.path.join(output_mgf_folder,
                                       str(my_node_number) + ".mgf")
    output_tsv_filename = os.path.join(output_tsv_folder,
                                       str(my_node_number) + ".tsv")
    output_sptxt_filename = os.path.join(output_sptxt_folder,
                                         str(my_node_number) + ".sptxt")
    library_spectrum_collection.save_to_mgf(open(output_mgf_filename, "w"))
    library_spectrum_collection.save_to_tsv(open(output_tsv_filename, "w"),
                                            output_mgf_filename)
    # SPTXT export is best-effort: failures are logged but do not abort the job.
    try:
        library_spectrum_collection.save_to_sptxt(
            open(output_sptxt_filename, "w"))
    except:
        traceback.print_exc(file=sys.stdout)
        print("MEH")
def main():
    """Merge this node's spectrum JSON shards, group by peptide+charge, and
    write (a) a JSON-lines file of spectrum groups and (b) a peptide TSV with
    the best score per peptide.
    """
    input_json = json.loads(open(sys.argv[1]).read())
    input_intermediate_folder = sys.argv[2]
    output_folder = sys.argv[3]
    output_peptide_list_folder = sys.argv[4]

    my_node = input_json["node_partition"]
    output_filename = os.path.join(output_folder, str(my_node) + ".json")
    output_file = open(output_filename, "w")
    number_of_spectra = 0

    shard_filenames = sorted(
        ming_fileio_library.list_files_in_dir(input_intermediate_folder))
    all_spectra = []
    for json_filename in shard_filenames:
        # Shard names look like <a>_<b>_<bin>.json; only load our bin.
        shard_basename = os.path.basename(json_filename).split(".")[0]
        if int(shard_basename.split("_")[2]) != my_node:
            continue
        print("Loading", json_filename)
        spectrum_list = json.load(open(json_filename))
        all_spectra += spectrum_list
        print("Total Spectra", len(spectrum_list), len(all_spectra))

    print("Creating hash")
    peptide_dict = defaultdict(list)
    for spectrum in all_spectra:
        peptide_key = spectrum["annotation"] + "." + str(spectrum["charge"])
        peptide_dict[peptide_key].append(spectrum)

    print("writing out strings")
    # One JSON line per peptide key, in sorted key order.
    for peptide_key in sorted(peptide_dict.keys()):
        output_file.write(json.dumps(peptide_dict[peptide_key]))
        output_file.write("\n")
    output_file.close()

    #Write out all the peptides into a file
    output_peptide_dict = defaultdict(list)
    for annotation_key, grouped_spectra in peptide_dict.items():
        #max score per peptide (with the same -10 floor the original accumulator used)
        best_score = max([-10] + [s["score"] for s in grouped_spectra])
        output_peptide_dict["score"].append(best_score)
        output_peptide_dict["annotation_key"].append(annotation_key)
        output_peptide_dict["annotation"].append(
            grouped_spectra[0]["annotation"])
        output_peptide_dict["charge"].append(grouped_spectra[0]["charge"])
        output_peptide_dict["protein"].append(grouped_spectra[0]["protein"])

    #writing out file
    output_peptide_filename = os.path.join(output_peptide_list_folder,
                                           str(my_node) + ".tsv")
    ming_fileio_library.write_dictionary_table_data(output_peptide_dict,
                                                    output_peptide_filename)
def main():
    """Enrich library-search hits with GNPS library-spectrum metadata (uncached variant).

    argv[1]: input search-result table (TSV)
    argv[2]: output enriched table (TSV)
    """
    input_result_filename = sys.argv[1]
    output_result_filename = sys.argv[2]

    input_rows, input_table = ming_fileio_library.parse_table_with_headers(input_result_filename)

    output_table = defaultdict(list)
    output_headers = ["SpectrumID", "Compound_Name", "Ion_Source", "Instrument", "Compound_Source", "PI", "Data_Collector", "Adduct"]
    output_headers += ["Precursor_MZ", "ExactMass", "Charge", "CAS_Number", "Pubmed_ID", "Smiles", "INCHI", "INCHI_AUX", "Library_Class"]
    output_headers += ["IonMode", "UpdateWorkflowName", "LibraryQualityString", "#Scan#", "SpectrumFile", "MQScore", "Organism"]
    output_headers += ["TIC_Query", "RT_Query", "MZErrorPPM", "SharedPeaks", "MassDiff", "LibMZ", "SpecMZ", "SpecCharge"]
    # Pre-create the columns so the output has them even if no row survives.
    for header in output_headers:
        output_table[header] = []

    # Library_Class code -> (UpdateWorkflowName, LibraryQualityString).
    # BUG FIX: class "5" was missing here although the sibling resolver maps it
    # to Insilico, so class-5 rows previously corrupted the table (see below).
    library_class_mapping = {
        "1": ("UPDATE-SINGLE-ANNOTATED-GOLD", "Gold"),
        "2": ("UPDATE-SINGLE-ANNOTATED-SILVER", "Silver"),
        "3": ("UPDATE-SINGLE-ANNOTATED-BRONZE", "Bronze"),
        "4": ("UPDATE-SINGLE-ANNOTATED-BRONZE", "Insilico"),
        "5": ("UPDATE-SINGLE-ANNOTATED-BRONZE", "Insilico"),
        "10": ("UPDATE-SINGLE-ANNOTATED-BRONZE", "Challenge"),
    }

    # Annotation fields copied verbatim into identically named columns.
    annotation_passthrough_fields = ["Compound_Name", "Ion_Source", "Instrument",
                                     "Compound_Source", "PI", "Data_Collector", "Adduct",
                                     "Precursor_MZ", "ExactMass", "Charge", "CAS_Number",
                                     "Pubmed_ID", "Smiles", "INCHI", "INCHI_AUX", "Library_Class"]

    for i in range(input_rows):
        spectrum_id = input_table["LibrarySpectrumID"][i]
        score = input_table["MQScore"][i]
        filename = input_table["SpectrumFile"][i]
        libfilename = input_table["LibraryName"][i]
        scan = input_table["#Scan#"][i]
        TIC_Query = input_table["UnstrictEvelopeScore"][i]
        RT_Query = input_table["p-value"][i]
        SpecCharge = input_table["Charge"][i]
        SpecMZ = input_table["SpecMZ"][i]
        MZErrorPPM = input_table["mzErrorPPM"][i]
        SharedPeaks = input_table["LibSearchSharedPeaks"][i]
        MassDiff = input_table["ParentMassDiff"][i]
        print(spectrum_id)

        try:
            gnps_library_spectrum = ming_gnps_library.get_library_spectrum(spectrum_id)
        except KeyboardInterrupt:
            raise
        except:
            # Best effort: a hit whose library spectrum cannot be fetched is dropped.
            continue

        top_annotation = gnps_library_spectrum["annotations"][0]

        output_table["SpectrumID"].append(spectrum_id)
        for field in annotation_passthrough_fields:
            output_table[field].append(top_annotation[field])
        output_table["IonMode"].append(top_annotation["Ion_Mode"])

        if top_annotation["Library_Class"] in library_class_mapping:
            update_workflow_name, library_quality = library_class_mapping[top_annotation["Library_Class"]]
        else:
            # BUG FIX: unrecognized classes previously appended nothing to these
            # two columns, leaving them shorter than the others and misaligning
            # every subsequent row. Append placeholders to keep it rectangular.
            print("Unknown Library_Class", top_annotation["Library_Class"])
            update_workflow_name, library_quality = "", ""
        output_table["UpdateWorkflowName"].append(update_workflow_name)
        output_table["LibraryQualityString"].append(library_quality)

        output_table["#Scan#"].append(scan)
        output_table["SpectrumFile"].append(filename)
        output_table["LibraryName"].append(libfilename)
        output_table["MQScore"].append(score)
        output_table["Organism"].append(gnps_library_spectrum["spectruminfo"]["library_membership"])
        output_table["TIC_Query"].append(TIC_Query)
        output_table["RT_Query"].append(RT_Query)
        output_table["MZErrorPPM"].append(MZErrorPPM)
        output_table["SharedPeaks"].append(SharedPeaks)
        output_table["MassDiff"].append(MassDiff)
        output_table["LibMZ"].append(top_annotation["Precursor_MZ"])
        output_table["SpecMZ"].append(SpecMZ)
        output_table["SpecCharge"].append(SpecCharge)

        # BUG FIX: the manual "+= desc + '||'" / strip-last-two-chars loop left a
        # trailing "||" whenever the joined string was 3 characters or fewer;
        # a straight join has no separator to trim.
        tag_descs = [tag["tag_desc"].replace("\t", "") for tag in gnps_library_spectrum["spectrum_tags"]]
        output_table["tags"].append("||".join(tag_descs))

    ming_fileio_library.write_dictionary_table_data(output_table, output_result_filename)
def match_clustered(match_parameters, spectrum_collection, dataset_dict,
                    all_datasets, output_matches_filename,
                    output_filename_unique_files, output_filename_all_matches):
    """Match clustered spectra against public datasets and write three tables:
    unique source files, all file-level matches, and a per-dataset summary
    keeping only the best (highest-cosine) match.
    """
    all_matches = finding_matches_in_public_data(spectrum_collection,
                                                 all_datasets,
                                                 match_parameters)
    """Resolving to File Level"""
    # dataset accession -> number of distinct files matched in it.
    dataset_files_count = defaultdict(lambda: 0)
    output_source_list = []
    output_match_list = []
    # Checked once up front; passed through so per-match tracing can decide
    # whether to enrich with metadata.
    MetaDataServerStatus = trace_to_single_file.test_metadata_server()
    for dataset in all_matches:
        for match_object in all_matches[dataset]["matches"]:
            dataset_accession = dataset_dict[dataset]["dataset"]
            dataset_scan = match_object["scan"]
            # Trace each matched cluster scan back to the underlying raw files.
            current_filelist, current_match_list = trace_to_single_file.trace_filename_filesystem(
                all_datasets,
                dataset_accession,
                dataset_scan,
                enrichmetadata=MetaDataServerStatus)
            output_source_list += current_filelist
            output_match_list += current_match_list
    # Deduplicate source files on (accession, filename), counting per dataset.
    seen_files = set()
    output_unique_source_list = []
    for output_file_object in output_source_list:
        dataset_accession = output_file_object["dataset_id"]
        dataset_filename = output_file_object["filename"]
        key = dataset_accession + ":" + dataset_filename
        if key in seen_files:
            continue
        dataset_files_count[dataset_accession] += 1
        seen_files.add(key)
        output_unique_source_list.append(output_file_object)
    ming_fileio_library.write_list_dict_table_data(
        output_unique_source_list, output_filename_unique_files)
    ming_fileio_library.write_list_dict_table_data(
        output_match_list, output_filename_all_matches)
    """ Summary """
    output_map = {
        "specs_filename": [],
        "specs_scan": [],
        "dataset_filename": [],
        "dataset_scan": [],
        "score": [],
        "dataset_id": [],
        "dataset_title": [],
        "dataset_description": [],
        "dataset_organisms": [],
        "matchedpeaks": [],
        "mzerror": [],
        "files_count": []
    }
    for dataset in all_matches:
        #For each dataset, lets try to find the clustering information
        if len(all_matches[dataset]["matches"]) == 0:
            continue
        match_object = None
        #If it is more than one match, we need to consolidate
        # Keep the single match with the highest cosine score.
        if len(all_matches[dataset]["matches"]) > 1:
            sorted_match_list = sorted(
                all_matches[dataset]["matches"],
                key=lambda match: float(match["cosine"]),
                reverse=True)
            match_object = sorted_match_list[0]
        else:
            match_object = all_matches[dataset]["matches"][0]
        output_map['specs_filename'].append("specs_ms.mgf")
        output_map['specs_scan'].append(match_object["queryscan"])
        output_map['dataset_id'].append(dataset_dict[dataset]["dataset"])
        output_map['dataset_title'].append(dataset_dict[dataset]["title"])
        # Flatten whitespace so the description cannot break the TSV row.
        output_map['dataset_description'].append(
            dataset_dict[dataset]["description"].replace("\n", "").replace(
                "\t", "").replace("\r", ""))
        # Species list arrives HTML-separated; collapse separators to "!".
        output_map['dataset_organisms'].append(
            dataset_dict[dataset]["species"].replace(
                "<hr class='separator'\/>", "!"))
        output_map['dataset_filename'].append(match_object["filename"])
        output_map['dataset_scan'].append(match_object["scan"])
        output_map['score'].append(match_object["cosine"])
        output_map['matchedpeaks'].append(match_object["matchedpeaks"])
        output_map['mzerror'].append(match_object["mzerror"])
        output_map['files_count'].append(dataset_files_count[dataset])
    ming_fileio_library.write_dictionary_table_data(output_map,
                                                    output_matches_filename)
def main():
    """Add the ATTRIBUTE_DefaultGroup column (G1..G6 by upload prefix) to the
    metadata table, creating "N/A" rows for grouped files that have none.
    """
    arg_parser = argparse.ArgumentParser(description='Creating Clustering Info Summary')
    arg_parser.add_argument('proteosafe_parameters', help='proteosafe_parameters')
    arg_parser.add_argument('metadata_folder', help='metadata_folder')
    arg_parser.add_argument('output_metadata_file', help='output_metadata_file')
    args = arg_parser.parse_args()

    params = ming_proteosafe_library.parse_xml_file(open(args.proteosafe_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params)

    # Upload-prefix -> group label.
    group_by_prefix = {"specone-": "G1", "spectwo-": "G2", "specthree-": "G3",
                       "specfour-": "G4", "specfive-": "G5", "specsix-": "G6"}
    default_group_mapping = defaultdict(list)
    file_to_group_mapping = {}
    for mangled_name in mangled_mapping:
        real_path = mangled_mapping[mangled_name]
        for prefix in group_by_prefix:
            if prefix in mangled_name:
                group_label = group_by_prefix[prefix]
                default_group_mapping[group_label].append(real_path)
                file_to_group_mapping[os.path.basename(real_path)] = group_label

    metadata_files = ming_fileio_library.list_files_in_dir(args.metadata_folder)
    row_count = 0
    table_data = defaultdict(list)
    # Exactly one metadata file is expected; otherwise proceed with an empty table.
    if len(metadata_files) == 1:
        row_count, table_data = ming_fileio_library.parse_table_with_headers(metadata_files[0])

    print(table_data)
    for column in table_data:
        print(column, len(table_data[column]))

    for row_index in range(row_count):
        print(row_index)
        filename = table_data["filename"][row_index]
        if len(filename) < 2:
            continue
        print(filename, filename[0], filename[-1])
        # Drop one leading/trailing double quote if present.
        if filename[0] == "\"":
            filename = filename[1:]
        if filename[-1] == "\"":
            filename = filename[:-1]
        table_data["filename"][row_index] = filename
        basename_filename = os.path.basename(filename)
        if basename_filename in file_to_group_mapping:
            group_label = file_to_group_mapping[basename_filename]
        else:
            group_label = "NoDefaultGroup"
        table_data["ATTRIBUTE_DefaultGroup"].append(group_label)

    # Files with a group but no metadata row get a synthetic "N/A" row.
    for input_filename in file_to_group_mapping:
        if input_filename in table_data["filename"]:
            continue
        for column in table_data:
            if column != "ATTRIBUTE_DefaultGroup" and column != "filename":
                table_data[column].append("N/A")
        table_data["ATTRIBUTE_DefaultGroup"].append(file_to_group_mapping[input_filename])
        table_data["filename"].append(input_filename)

    ming_fileio_library.write_dictionary_table_data(table_data, args.output_metadata_file)
def main():
    """Filter synthetic PSMs, attach KL metrics, and write target/decoy tables.

    argv: [1] parameter XML, [2] input PSM tsv, [3] KL metrics tsv,
          [4] output target PSM file (rewritten in place with extra columns),
          [5] output decoy PSM file.
    """
    print(sys.argv)
    paramxml_filename = sys.argv[1]
    psms_input_file = sys.argv[2]
    kl_input_file = sys.argv[3]
    output_psms_file = sys.argv[4]
    output_decoy_psms_file = sys.argv[5]

    parameters_obj = ming_proteosafe_library.parse_xml_file(open(paramxml_filename))
    target_filename_list, decoy_filename_list = determine_set_of_target_and_decoy_spectrum_files(parameters_obj)

    input_psm_set = ming_psm_library.PSMset("input psms")
    input_psm_set.load_PSM_tsvfile(psms_input_file, load_extra_metadata=True)

    # Build the decoy set from the loaded PSMs, then filter the target set.
    decoy_psm_set = ming_psm_library.PSMset("decoy psms")
    decoy_psm_set.psms = input_psm_set.synthetic_psms_by_length_decoy_set(
        target_filename_list, decoy_filename_list)
    print("GETTING ALL SYNETHTIC with 0% FDR")
    input_psm_set.filter_synthetic_psms_by_length(target_filename_list, decoy_filename_list)

    # Index KL metrics by "<basename>:<scan>" for joining onto the PSM table.
    row_count, kl_data = ming_fileio_library.parse_table_with_headers(kl_input_file)
    kl_dict = {}
    for i in range(row_count):
        filename = os.path.basename(kl_data["Filename"][i])
        key = filename + ":" + str(kl_data["Scan"][i])
        kl_dict[key] = {
            "kl_strict": kl_data["KL Strict"][i],
            "kl_unstrict": kl_data["KL"][i],
            "kl_interpeak": kl_data["Interpeak intensity"][i],
        }

    # BUGFIX: use context managers so both output handles are always closed;
    # the original never closed the decoy file handle at all and closed the
    # target handle without try/finally protection.
    with open(output_psms_file, "w") as output_file:
        input_psm_set.write_output(output_file, write_extra_metadata=True)
    with open(output_decoy_psms_file, "w") as decoy_output_file:
        decoy_psm_set.write_output(decoy_output_file, write_extra_metadata=True)

    # Since we don't support more fields in the psm object, we're going to read
    # this file in again as a tsv file and add the columns as necessary.
    psm_rows, psm_table_data = ming_fileio_library.parse_table_with_headers(output_psms_file)
    new_columns = [
        "kl_strict", "kl_unstrict", "kl_interpeak",
        "ambiguity_total_score", "first_second_unique_ratio",
        "first_unique_count", "first_unique_intensity", "numberpsms",
        "second_unique_count", "second_unique_intensity",
        "spectrum_unique_key", "modified_sequence",
    ]
    for column in new_columns:
        psm_table_data[column] = []

    for i in range(psm_rows):
        key = psm_table_data["filename"][i] + ":" + psm_table_data["scan"][i]
        if key in kl_dict:
            psm_table_data["kl_strict"].append(kl_dict[key]["kl_strict"])
            psm_table_data["kl_unstrict"].append(kl_dict[key]["kl_unstrict"])
            psm_table_data["kl_interpeak"].append(kl_dict[key]["kl_interpeak"])
        else:
            # No KL record for this spectrum; -1 marks the metric as missing.
            psm_table_data["kl_strict"].append(-1)
            psm_table_data["kl_unstrict"].append(-1)
            psm_table_data["kl_interpeak"].append(-1)
        # Writing the ambiguity stuff, but just assuming no ambiguity.
        psm_table_data["ambiguity_total_score"].append("-1")
        psm_table_data["first_second_unique_ratio"].append("-1")
        psm_table_data["first_unique_count"].append("-1")
        psm_table_data["first_unique_intensity"].append("-1")
        psm_table_data["numberpsms"].append(1)
        psm_table_data["second_unique_count"].append("-1")
        psm_table_data["second_unique_intensity"].append("-1")
        psm_table_data["spectrum_unique_key"].append(key)
        # Strip the trailing two characters of the sequence string
        # (presumably a flanking-residue suffix like ".X") — TODO confirm
        # the upstream sequence format.
        psm_table_data["modified_sequence"].append(psm_table_data["sequence"][i][:-2])

    ming_fileio_library.write_dictionary_table_data(psm_table_data, output_psms_file)
def main():
    """Annotate a PSM table with per-spectrum ambiguity metrics.

    argv: [1] input PSM tsv, [2] folder of spectrum files, [3] output PSM tsv.
    Spectra matched by more than one peptide are re-scored with
    calculated_ambiguity(); all other rows get -1 placeholders.
    """
    psms_input_file = sys.argv[1]
    input_spectrum_folder = sys.argv[2]
    output_psms_file = sys.argv[3]

    psms_row, psm_table = ming_fileio_library.parse_table_with_headers(psms_input_file)
    peak_tolerance = 0.1

    # Count PSMs per spectrum to determine which ones have possible bad ambiguity.
    spectrum_to_number_psms_dict = defaultdict(int)
    psm_table["spectrum_unique_key"] = []
    for i in range(psms_row):
        key = psm_table["filename"][i] + ":" + psm_table["scan"][i]
        psm_table["spectrum_unique_key"].append(key)
        spectrum_to_number_psms_dict[key] += 1

    # Collect candidate sequences for every multi-PSM spectrum.
    psm_table["numberpsms"] = []
    spectra_to_reconsider = defaultdict(lambda: defaultdict(list))
    for i in range(psms_row):
        filename = psm_table["filename"][i]
        scan = psm_table["scan"][i]
        number_of_psms_per_spectrum = spectrum_to_number_psms_dict[filename + ":" + scan]
        psm_table["numberpsms"].append(number_of_psms_per_spectrum)
        if number_of_psms_per_spectrum > 1:
            # Strip the trailing two characters of the sequence string
            # (presumably a flanking-residue suffix) — TODO confirm format.
            spectra_to_reconsider[filename][scan].append(psm_table["sequence"][i][:-2])

    # Score ambiguity per spectrum file, keyed back to "<filename>:<scan>".
    spectrum_to_ambiguity_mapping = {}
    for filename in spectra_to_reconsider:
        parameter_object = {
            "filename": os.path.join(input_spectrum_folder, filename),
            "scan_mapping": spectra_to_reconsider[filename],
        }
        print(parameter_object)
        scan_ambiguity_mapping = calculated_ambiguity(parameter_object, peak_tolerance)
        for scan_key in scan_ambiguity_mapping:
            full_spectrum_key = "%s:%s" % (filename, scan_key)
            spectrum_to_ambiguity_mapping[full_spectrum_key] = scan_ambiguity_mapping[scan_key]

    # One shared field list replaces six copy-pasted append pairs; insertion
    # order matches the original so output column order is unchanged.
    ambiguity_fields = [
        "ambiguity_total_score",
        "first_unique_count", "second_unique_count",
        "first_unique_intensity", "second_unique_intensity",
        "first_second_unique_ratio",
    ]
    for field in ambiguity_fields:
        psm_table[field] = []
    for i in range(psms_row):
        key = psm_table["filename"][i] + ":" + psm_table["scan"][i]
        if key in spectrum_to_ambiguity_mapping:
            ambiguity = spectrum_to_ambiguity_mapping[key]
            for field in ambiguity_fields:
                psm_table[field].append(ambiguity[field])
        else:
            # -1 marks rows whose spectrum was not re-scored.
            for field in ambiguity_fields:
                psm_table[field].append(-1)

    ming_fileio_library.write_dictionary_table_data(psm_table, output_psms_file)
def main():
    """Enrich spectral-library search results with library spectrum metadata.

    argv: [1] input results tsv, [2] output tsv.  For every hit, the library
    spectrum is fetched from GNPS (with an in-memory cache), its most recent
    annotation is flattened into the output row, and molecule-explorer
    dataset/file counts are attached.  Hits whose library spectrum cannot be
    fetched are skipped (deliberate best-effort).
    """
    input_result_filename = sys.argv[1]
    output_result_filename = sys.argv[2]

    spectrum_id_cache = {}
    input_rows, input_table = ming_fileio_library.parse_table_with_headers(input_result_filename)

    output_table = defaultdict(list)
    output_headers = ["SpectrumID", "Compound_Name", "Ion_Source", "Instrument", "Compound_Source", "PI", "Data_Collector", "Adduct"]
    output_headers += ["Precursor_MZ", "ExactMass", "Charge", "CAS_Number", "Pubmed_ID", "Smiles", "INCHI", "INCHI_AUX", "Library_Class"]
    output_headers += ["IonMode", "UpdateWorkflowName", "LibraryQualityString", "#Scan#", "SpectrumFile", "MQScore", "Organism"]
    output_headers += ["TIC_Query", "RT_Query", "MZErrorPPM", "SharedPeaks", "MassDiff", "LibMZ", "SpecMZ", "SpecCharge"]
    output_headers += ["MoleculeExplorerDatasets", "MoleculeExplorerFiles"]
    for header in output_headers:
        output_table[header] = []

    # Number of hits per query spectrum, reported in the NumberHits column.
    number_hits_per_query = defaultdict(int)
    for i in range(input_rows):
        number_hits_per_query[input_table["FileScanUniqueID"][i]] += 1

    molecule_explorer_df = pd.DataFrame(ming_gnps_library.get_molecule_explorer_dataset_data())

    # Library_Class code -> (UpdateWorkflowName, LibraryQualityString).
    library_class_mapping = {
        "1": ("UPDATE-SINGLE-ANNOTATED-GOLD", "Gold"),
        "2": ("UPDATE-SINGLE-ANNOTATED-SILVER", "Silver"),
        "3": ("UPDATE-SINGLE-ANNOTATED-BRONZE", "Bronze"),
        "4": ("UPDATE-SINGLE-ANNOTATED-BRONZE", "Insilico"),
        "5": ("UPDATE-SINGLE-ANNOTATED-BRONZE", "Insilico"),
        "10": ("UPDATE-SINGLE-ANNOTATED-BRONZE", "Challenge"),
    }

    # Output column name -> field of the library annotation it is copied from.
    annotation_columns = [
        ("Compound_Name", "Compound_Name"), ("Ion_Source", "Ion_Source"),
        ("Instrument", "Instrument"), ("Compound_Source", "Compound_Source"),
        ("PI", "PI"), ("Data_Collector", "Data_Collector"),
        ("Adduct", "Adduct"), ("Precursor_MZ", "Precursor_MZ"),
        ("ExactMass", "ExactMass"), ("Charge", "Charge"),
        ("CAS_Number", "CAS_Number"), ("Pubmed_ID", "Pubmed_ID"),
        ("Smiles", "Smiles"), ("INCHI", "INCHI"),
        ("INCHI_AUX", "INCHI_AUX"), ("Library_Class", "Library_Class"),
        ("IonMode", "Ion_Mode"),
    ]

    for i in range(input_rows):
        spectrum_id = input_table["LibrarySpectrumID"][i]
        score = input_table["MQScore"][i]
        filename = input_table["SpectrumFile"][i]
        libfilename = input_table["LibraryName"][i]
        scan = input_table["#Scan#"][i]
        # NOTE(review): TIC/RT are sourced from oddly named columns
        # ("UnstrictEvelopeScore", "p-value"); preserved as-is — confirm upstream.
        TIC_Query = input_table["UnstrictEvelopeScore"][i]
        RT_Query = input_table["p-value"][i]
        SpecCharge = input_table["Charge"][i]
        SpecMZ = input_table["SpecMZ"][i]
        MZErrorPPM = input_table["mzErrorPPM"][i]
        SharedPeaks = input_table["LibSearchSharedPeaks"][i]
        MassDiff = input_table["ParentMassDiff"][i]
        print(spectrum_id)

        # Fetch the library spectrum with caching; skip the hit on any failure.
        try:
            if spectrum_id in spectrum_id_cache:
                gnps_library_spectrum = spectrum_id_cache[spectrum_id]
            else:
                gnps_library_spectrum = ming_gnps_library.get_library_spectrum(spectrum_id)
                spectrum_id_cache[spectrum_id] = gnps_library_spectrum
        except KeyboardInterrupt:
            raise
        except:
            # Deliberate best-effort: unfetchable spectra are skipped.
            continue

        # Use the most recently created annotation.
        gnps_library_spectrum["annotations"] = sorted(
            gnps_library_spectrum["annotations"],
            key=lambda annotation: annotation["create_time"],
            reverse=True)
        annotation = gnps_library_spectrum["annotations"][0]

        output_table["SpectrumID"].append(spectrum_id)
        for output_column, annotation_field in annotation_columns:
            # Tabs are stripped so values stay single-cell in tsv output.
            output_table[output_column].append(annotation[annotation_field].replace("\t", ""))

        library_class = annotation["Library_Class"]
        if library_class in library_class_mapping:
            workflow_name, quality_string = library_class_mapping[library_class]
        else:
            print("BULLLSHIT", library_class)
            # BUGFIX: the original appended nothing for unknown classes,
            # leaving these two columns shorter than the rest (ragged table).
            workflow_name, quality_string = "", ""
        output_table["UpdateWorkflowName"].append(workflow_name)
        output_table["LibraryQualityString"].append(quality_string)

        output_table["#Scan#"].append(scan)
        output_table["SpectrumFile"].append(filename)
        output_table["LibraryName"].append(libfilename)
        output_table["MQScore"].append(score)
        output_table["Organism"].append(gnps_library_spectrum["spectruminfo"]["library_membership"])
        output_table["TIC_Query"].append(TIC_Query)
        output_table["RT_Query"].append(RT_Query)
        output_table["MZErrorPPM"].append(MZErrorPPM)
        output_table["SharedPeaks"].append(SharedPeaks)
        output_table["MassDiff"].append(MassDiff)
        output_table["LibMZ"].append(annotation["Precursor_MZ"])
        output_table["SpecMZ"].append(SpecMZ)
        output_table["SpecCharge"].append(SpecCharge)
        output_table["FileScanUniqueID"].append(input_table["FileScanUniqueID"][i])
        output_table["NumberHits"].append(number_hits_per_query[input_table["FileScanUniqueID"][i]])

        tag_list = [(tag["tag_desc"] + "[" + tag["tag_type"] + "]") for tag in gnps_library_spectrum["spectrum_tags"]]
        output_table["tags"].append("||".join(tag_list).replace("\t", ""))

        # Getting molecule explorer information; counts default to 0 unless
        # exactly one row matches the compound name.
        compound_name = annotation["Compound_Name"].replace("\t", "")
        compound_filtered_df = molecule_explorer_df[molecule_explorer_df["compound_name"] == compound_name]
        if len(compound_filtered_df) == 1:
            record = compound_filtered_df.to_dict(orient="records")[0]
            output_table["MoleculeExplorerDatasets"].append(record["number_datasets"])
            output_table["MoleculeExplorerFiles"].append(record["number_files"])
        else:
            output_table["MoleculeExplorerDatasets"].append(0)
            output_table["MoleculeExplorerFiles"].append(0)

    ming_fileio_library.write_dictionary_table_data(output_table, output_result_filename)
def main():
    """Walk a chain of 'augment' tasks on proteomics2.ucsd.edu and record provenance.

    Starting from the task id in the parameter XML (argv[1]), repeatedly
    fetches task information, discovers the extraction task and the previous
    augment task from the task's file listing, and accumulates:
      - search task -> augment/extraction task mappings (JSON, argv[2])
      - one row per search task (tsv, argv[3])
      - one row per augment step (tsv, argv[4])
      - one row per spectrum file found in each search task (tsv, argv[5])
    The walk stops when no plausible previous augment task id is found.
    """
    params_obj = ming_proteosafe_library.parse_xml_file(open(sys.argv[1]))
    augment_task_id = params_obj["task"][0]
    all_tasks_output_dict = defaultdict(list)
    all_augments_output_dict = defaultdict(list)
    all_spectrum_files_output_dict = defaultdict(list)
    search_task_to_augment = {}
    search_task_to_extraction = {}
    all_search_tasks = set()
    process_tree = True
    while process_tree:
        print("AUGMENT", augment_task_id, len(augment_task_id))
        augment_task_information = ming_proteosafe_library.get_task_information(
            "proteomics2.ucsd.edu", augment_task_id)
        extract_task_id = ""
        previous_augment_task_id = ""
        # Recover task ids from known marker filenames: stripping the root
        # folder then taking the new root isolates the second path component,
        # presumably the owning task id — TODO confirm path layout.
        for filename in augment_task_information["files"]:
            if filename.find("unfiltered_peptide_list") != -1:
                previous_augment_task_id = ming_fileio_library.get_root_folder(
                    filename.replace(
                        ming_fileio_library.get_root_folder(filename) + "/", ""))
            if filename.find("extracted_spectra_peptides_merged") != -1:
                extract_task_id = ming_fileio_library.get_root_folder(
                    filename.replace(
                        ming_fileio_library.get_root_folder(filename) + "/", ""))
        previous_augment_task_id = previous_augment_task_id.strip()
        # A real task id is long; anything shorter means the chain has ended.
        if len(previous_augment_task_id) < 10:
            process_tree = False
        print(previous_augment_task_id, extract_task_id)
        all_augments_output_dict["augment_task"].append(augment_task_id)
        all_augments_output_dict["extract_task"].append(extract_task_id)
        all_augments_output_dict["precursor_count"].append(0)
        all_augments_output_dict["timestamp"].append(
            augment_task_information["createtime"])
        #Processing extract task_id
        # NOTE(review): extract_task_info is fetched but never used below.
        extract_task_info = ming_proteosafe_library.get_task_information(
            "proteomics2.ucsd.edu", extract_task_id)
        extract_task_parameters = ming_proteosafe_library.get_task_parameters(
            "proteomics2.ucsd.edu", extract_task_id)
        tasks_to_extract = json.loads(
            extract_task_parameters["tasks_to_consolidate"][0])
        for task in tasks_to_extract:
            search_task_to_augment[task] = augment_task_id
            search_task_to_extraction[task] = extract_task_id
            all_tasks_output_dict["search_task_id"].append(task)
            all_tasks_output_dict["extract_task_id"].append(extract_task_id)
            all_tasks_output_dict["augment_task_id"].append(augment_task_id)
            all_search_tasks.add(task)
        print(extract_task_parameters["task_file"][0])
        # The [2:-1] slice presumably strips surrounding characters from the
        # stored task-file path (e.g. quoting/prefix) — TODO confirm format.
        path_to_task_file = os.path.join(
            "/data/ccms-data/uploads",
            extract_task_parameters["task_file"][0][2:-1])
        if os.path.isfile(path_to_task_file):
            print("SEARCH FILE", path_to_task_file)
            try:
                row_count, table_data = ming_fileio_library.parse_table_with_headers(
                    path_to_task_file)
                print("Rows", row_count)
                for i in range(row_count):
                    search_task_id = table_data["TASKID"][i]
                    print(i, search_task_id)
                    search_task_to_augment[search_task_id] = augment_task_id
                    search_task_to_extraction[search_task_id] = extract_task_id
                    all_tasks_output_dict["search_task_id"].append(
                        search_task_id)
                    all_tasks_output_dict["extract_task_id"].append(
                        extract_task_id)
                    all_tasks_output_dict["augment_task_id"].append(
                        augment_task_id)
                    all_search_tasks.add(search_task_id)
            except:
                raise
                # NOTE(review): unreachable — the `raise` above always exits
                # this except block, so the continue never runs.
                continue
        # Move one step up the augment chain.
        augment_task_id = previous_augment_task_id
    print(len(all_search_tasks))
    # Second pass: fetch per-search-task descriptions and spectrum file lists.
    for i in range(len(all_tasks_output_dict["search_task_id"])):
        search_task = all_tasks_output_dict["search_task_id"][i]
        try:
            print(search_task)
            task_information = ming_proteosafe_library.get_task_information(
                "proteomics2.ucsd.edu", search_task)
            all_tasks_output_dict["search_description"].append(
                task_information["description"])
            for filename in task_information["files"]:
                if filename.find(".mzXML") != -1 or filename.find(
                        ".mzML") != -1:
                    all_spectrum_files_output_dict["spectrum_filename"].append(
                        filename)
                    all_spectrum_files_output_dict["search_task"].append(
                        search_task)
                    all_spectrum_files_output_dict[
                        "search_description"].append(
                            task_information["description"])
        except KeyboardInterrupt:
            raise
        except:
            # Best-effort: keep column lengths aligned even when the task
            # information cannot be fetched.
            all_tasks_output_dict["search_description"].append("")
            print("error", search_task)
            continue
    provenace_structure = {}
    provenace_structure["search_task_to_augment"] = search_task_to_augment
    provenace_structure[
        "search_task_to_extraction"] = search_task_to_extraction
    open(sys.argv[2], "w").write(json.dumps(provenace_structure, indent=4))
    ming_fileio_library.write_dictionary_table_data(all_tasks_output_dict,
                                                    sys.argv[3])
    ming_fileio_library.write_dictionary_table_data(all_augments_output_dict,
                                                    sys.argv[4])
    ming_fileio_library.write_dictionary_table_data(
        all_spectrum_files_output_dict, sys.argv[5])
def main():
    """Summarize reference-dataset matches per query spectrum.

    argv: [1] parameter XML, [2] all-matches tsv, [3] output summary tsv.
    When reference-dataset matching is disabled (or the flag is absent or
    malformed), an empty summary is written and the script exits.
    """
    paramxml_input_filename = sys.argv[1]
    all_matches_filename = sys.argv[2]
    summary_filename = sys.argv[3]

    params_obj = ming_proteosafe_library.parse_xml_file(open(paramxml_input_filename))
    # Bail out with an empty table when matching is off; the bare except is a
    # deliberate best-effort for a missing/malformed parameter.
    try:
        if params_obj["MATCH_REFERENCE_DATASETS"][0] != "1":
            ming_fileio_library.write_dictionary_table_data({}, summary_filename)
            exit(0)
    except:
        ming_fileio_library.write_dictionary_table_data({}, summary_filename)
        exit(0)

    # Dataset metadata lookup is optional; fall back to an empty mapping.
    # (Also fixed: the original initialized dataset_dict to a *list* first.)
    try:
        dataset_dict = ming_proteosafe_library.get_all_dataset_dict()
    except:
        dataset_dict = {}

    row_count, table_data = ming_fileio_library.parse_table_with_headers(all_matches_filename)
    matches_list = []
    for i in range(row_count):
        matches_list.append({key: table_data[key][i] for key in table_data})

    # Group matches by "<query_filename>:<query_scan>".
    matches_by_scan = defaultdict(list)
    for match in matches_list:
        query_spectrum_key = match["query_filename"] + ":" + match["query_scan"]
        matches_by_scan[query_spectrum_key].append(match)

    output_dict = defaultdict(list)
    for spectrum_key in matches_by_scan:
        contains_blank = 0
        datasets_contained = []
        compound_identifications = []
        spectrum_ids = []
        all_scores = []
        for match in matches_by_scan[spectrum_key]:
            if match["is_blank"] == "1":
                contains_blank = 1
            datasets_contained.append(match["dataset_id"])
            compound_identifications.append(match["identification"])
            spectrum_ids.append(match["spectrum_id"])
            all_scores.append(match["score"])

        datasets_contained = list(set(datasets_contained))
        compound_identifications = list(set(compound_identifications))
        spectrum_ids = list(set(spectrum_ids))

        dataset_descriptions = []
        for dataset_id in datasets_contained:
            # ROBUSTNESS FIX: the original indexed dataset_dict directly and
            # crashed with a KeyError whenever the dataset lookup had failed
            # above (dataset_dict == {}) or a dataset id was unknown; now an
            # empty title is emitted instead.
            dataset_descriptions.append(dataset_dict.get(dataset_id, {}).get("title", "").strip())

        first_match = matches_by_scan[spectrum_key][0]
        output_dict["query_scan"].append(first_match["query_scan"])
        output_dict["query_filename"].append(first_match["query_filename"])
        output_dict["dataset_list"].append("!".join(datasets_contained))
        output_dict["dataset_descriptions"].append("!".join(dataset_descriptions))
        output_dict["contains_blank"].append(contains_blank)
        output_dict["identification"].append("!".join(compound_identifications))
        output_dict["spectrum_id"].append("!".join(spectrum_ids))
        # NOTE(review): scores come from the tsv as strings, so max() is
        # lexicographic here — confirm whether numeric max was intended.
        output_dict["best_score"].append(max(all_scores))

    for key in output_dict:
        print(key, len(output_dict[key]))
    ming_fileio_library.write_dictionary_table_data(output_dict, summary_filename)