def grab_single_result(task_id, output_peptide_directory, output_psm_directory):
    """Summarize the PSM/peptide results of a single ProteoSAFe search task.

    Queries proteomics2.ucsd.edu for the task's status and, depending on
    whether second-pass ("multipass") output exists, delegates to the
    appropriate result-grabbing helper.

    Args:
        task_id: ProteoSAFe task identifier to summarize.
        output_peptide_directory: directory where per-task peptide output is written.
        output_psm_directory: directory where per-task PSM output is written.

    Returns:
        dict with at least "task_id", "number_psms", "number_peptides".
        Counts stay 0 when the task FAILED or produced no usable results.
    """
    return_dict = {}
    return_dict["number_psms"] = 0
    return_dict["number_peptides"] = 0
    return_dict["task_id"] = task_id

    task_info = ming_proteosafe_library.get_task_information(
        "proteomics2.ucsd.edu", task_id)
    user = task_info["user"]

    # Failed tasks contribute nothing; report the zero-count summary.
    if task_info["status"] == "FAILED":
        return return_dict

    # Check whether this task has the second-pass peptide output; if not,
    # fall back to plain MSGFDB-style result extraction.
    path_to_secondpass_peptides_files_list = ming_proteosafe_library.get_proteosafe_result_file_path(
        task_id, user, "updated_eval_psms_with_kl_with_ambiguity")

    if len(path_to_secondpass_peptides_files_list) == 0:
        return grab_results_from_MSGFDB(
            task_id, user, output_peptide_directory, output_psm_directory)

    if len(path_to_secondpass_peptides_files_list) == 1:
        return grab_results_from_multipass(
            task_id, user, output_peptide_directory, output_psm_directory)

    # BUG FIX: the original fell off the end (implicit None) when more than
    # one second-pass file was found; return the zero-count summary so
    # callers always receive a dict.
    return return_dict
def main():
    """Walk an "augment" task chain backwards and emit provenance tables.

    Command-line contract (unchanged):
        argv[1]: ProteoSAFe params XML naming the newest augment task.
        argv[2]: output JSON mapping search tasks to augment/extract tasks.
        argv[3]: output table of all search tasks.
        argv[4]: output table of all augment tasks.
        argv[5]: output table of all spectrum files seen in search tasks.
    """
    # BUG FIX: the params file was opened without being closed.
    with open(sys.argv[1]) as params_file:
        params_obj = ming_proteosafe_library.parse_xml_file(params_file)
    augment_task_id = params_obj["task"][0]

    all_tasks_output_dict = defaultdict(list)
    all_augments_output_dict = defaultdict(list)
    all_spectrum_files_output_dict = defaultdict(list)

    search_task_to_augment = {}
    search_task_to_extraction = {}
    all_search_tasks = set()

    process_tree = True
    while process_tree:
        print("AUGMENT", augment_task_id, len(augment_task_id))
        augment_task_information = ming_proteosafe_library.get_task_information(
            "proteomics2.ucsd.edu", augment_task_id)

        extract_task_id = ""
        previous_augment_task_id = ""
        # Each augment task's file listing encodes its parent augment task and
        # its extraction task as the root folder of well-known result files.
        for filename in augment_task_information["files"]:
            if filename.find("unfiltered_peptide_list") != -1:
                previous_augment_task_id = ming_fileio_library.get_root_folder(
                    filename.replace(
                        ming_fileio_library.get_root_folder(filename) + "/", ""))
            if filename.find("extracted_spectra_peptides_merged") != -1:
                extract_task_id = ming_fileio_library.get_root_folder(
                    filename.replace(
                        ming_fileio_library.get_root_folder(filename) + "/", ""))

        previous_augment_task_id = previous_augment_task_id.strip()
        # A parent id shorter than 10 chars is not a real task id — the chain
        # ends here, but we still process this (oldest) augment task below.
        if len(previous_augment_task_id) < 10:
            process_tree = False

        print(previous_augment_task_id, extract_task_id)

        all_augments_output_dict["augment_task"].append(augment_task_id)
        all_augments_output_dict["extract_task"].append(extract_task_id)
        all_augments_output_dict["precursor_count"].append(0)
        all_augments_output_dict["timestamp"].append(
            augment_task_information["createtime"])

        # Processing extract task_id: its parameters name the search tasks
        # whose results were consolidated.
        extract_task_info = ming_proteosafe_library.get_task_information(
            "proteomics2.ucsd.edu", extract_task_id)
        extract_task_parameters = ming_proteosafe_library.get_task_parameters(
            "proteomics2.ucsd.edu", extract_task_id)
        tasks_to_extract = json.loads(
            extract_task_parameters["tasks_to_consolidate"][0])

        for task in tasks_to_extract:
            search_task_to_augment[task] = augment_task_id
            search_task_to_extraction[task] = extract_task_id
            all_tasks_output_dict["search_task_id"].append(task)
            all_tasks_output_dict["extract_task_id"].append(extract_task_id)
            all_tasks_output_dict["augment_task_id"].append(augment_task_id)
            all_search_tasks.add(task)

        # Additional search tasks may be listed in an uploaded task table.
        print(extract_task_parameters["task_file"][0])
        path_to_task_file = os.path.join(
            "/data/ccms-data/uploads",
            extract_task_parameters["task_file"][0][2:-1])
        if os.path.isfile(path_to_task_file):
            print("SEARCH FILE", path_to_task_file)
            # BUG FIX: the original wrapped this in `except: raise` followed by
            # an unreachable `continue`; the bare except only re-raised, so the
            # wrapper was a no-op and has been removed.
            row_count, table_data = ming_fileio_library.parse_table_with_headers(
                path_to_task_file)
            print("Rows", row_count)
            for i in range(row_count):
                search_task_id = table_data["TASKID"][i]
                print(i, search_task_id)
                search_task_to_augment[search_task_id] = augment_task_id
                search_task_to_extraction[search_task_id] = extract_task_id
                all_tasks_output_dict["search_task_id"].append(search_task_id)
                all_tasks_output_dict["extract_task_id"].append(extract_task_id)
                all_tasks_output_dict["augment_task_id"].append(augment_task_id)
                all_search_tasks.add(search_task_id)

        # Climb one step up the chain.
        augment_task_id = previous_augment_task_id

    print(len(all_search_tasks))

    # Enrich each search task with its description and spectrum files.
    for i in range(len(all_tasks_output_dict["search_task_id"])):
        search_task = all_tasks_output_dict["search_task_id"][i]
        try:
            print(search_task)
            task_information = ming_proteosafe_library.get_task_information(
                "proteomics2.ucsd.edu", search_task)
            all_tasks_output_dict["search_description"].append(
                task_information["description"])
            for filename in task_information["files"]:
                if filename.find(".mzXML") != -1 or filename.find(".mzML") != -1:
                    all_spectrum_files_output_dict["spectrum_filename"].append(
                        filename)
                    all_spectrum_files_output_dict["search_task"].append(
                        search_task)
                    all_spectrum_files_output_dict["search_description"].append(
                        task_information["description"])
        except KeyboardInterrupt:
            raise
        except Exception:
            # Best effort: keep the table columns aligned even when a task
            # lookup fails, and move on to the next search task.
            all_tasks_output_dict["search_description"].append("")
            print("error", search_task)
            continue

    provenance_structure = {}
    provenance_structure["search_task_to_augment"] = search_task_to_augment
    provenance_structure["search_task_to_extraction"] = search_task_to_extraction
    # BUG FIX: write via a context manager so the file is flushed and closed.
    with open(sys.argv[2], "w") as output_json_file:
        output_json_file.write(json.dumps(provenance_structure, indent=4))

    ming_fileio_library.write_dictionary_table_data(all_tasks_output_dict,
                                                    sys.argv[3])
    ming_fileio_library.write_dictionary_table_data(all_augments_output_dict,
                                                    sys.argv[4])
    ming_fileio_library.write_dictionary_table_data(
        all_spectrum_files_output_dict, sys.argv[5])
def trace_filename_filesystem(all_datasets, dataset_accession, dataset_scan,
                              enrichmetadata=False):
    """Trace a dataset cluster scan back to its original raw spectrum files.

    For the dataset matching ``dataset_accession``, finds the most recent
    continuous-networking job, reads its clustering membership table off the
    local filesystem, and collects the raw files (and file/scan pairs) that
    contributed to cluster ``dataset_scan``.

    Args:
        all_datasets: iterable of dataset dicts with "dataset" and "task" keys.
        dataset_accession: dataset accession to match.
        dataset_scan: cluster index to trace (compared as a string).
        enrichmetadata: when True, look up ReDU metadata per source file
            (best effort — failures are reported and leave metadata empty).

    Returns:
        (output_file_list, output_match_list) — per-file entries and
        per-spectrum match entries, both lists of dicts.
    """
    output_file_list = []
    output_match_list = []

    def _make_entry(source_path, filescan=None):
        # Build one output record; key order matches the original code.
        entry = {}
        entry["dataset_id"] = dataset_accession
        entry["cluster_scan"] = dataset_scan
        entry["filename"] = source_path
        if filescan is not None:
            entry["filescan"] = filescan
        entry["metadata"] = ""
        entry["basefilename"] = os.path.basename(source_path)
        if enrichmetadata:
            try:
                metadata_list = get_metadata_information_per_filename(
                    source_path)
                entry["metadata"] = "|".join(metadata_list)
            except Exception:
                # BUG FIX: was a bare `except:` that also swallowed
                # KeyboardInterrupt/SystemExit.
                print("ReDU is down")
        return entry

    def _fix_spectrum_folder(entries):
        # Datasets were renamed from <dataset>/spectrum to <dataset>/ccms_peak;
        # rewrite paths so the spectrum files can still be found.
        for file_dict in entries:
            splits = file_dict["filename"].split("/")
            if len(splits) > 1:  # guard: paths without a folder are left as-is
                splits[1] = splits[1].replace("spectrum", "ccms_peak")
                file_dict["filename"] = "/".join(splits)

    for dataset_object in all_datasets:
        if dataset_object["dataset"] != dataset_accession:
            continue

        networking_job = ming_gnps_library.get_most_recent_continuous_networking_of_dataset(
            dataset_object["task"])
        if networking_job is None:
            continue

        networking_task_info = ming_proteosafe_library.get_task_information(
            "gnps.ucsd.edu", networking_job["task"])
        task_user = networking_task_info["user"]

        clustering_path = os.path.join(
            "/data/ccms-data/tasks", task_user, networking_job["task"],
            "allclustered_spectra_info_withpath")
        clustering_files = ming_fileio_library.list_files_in_dir(
            clustering_path)
        # Expect exactly one membership table; anything else is unusable.
        if len(clustering_files) != 1:
            continue

        clustering_membership_list = ming_fileio_library.parse_table_with_headers_object_list(
            clustering_files[0])
        acceptable_raw_spectra = [
            spectrum for spectrum in clustering_membership_list
            if spectrum["cluster index"] == str(dataset_scan)
        ]

        for raw_spectrum in acceptable_raw_spectra:
            output_match_list.append(
                _make_entry(raw_spectrum["Original_Path"],
                            filescan=raw_spectrum["ScanNumber"]))
        print(len(acceptable_raw_spectra))

        unique_files = list(set(
            spectrum["Original_Path"] for spectrum in acceptable_raw_spectra))
        print(len(unique_files))
        for source_file in unique_files:
            output_file_list.append(_make_entry(source_file))

    _fix_spectrum_folder(output_file_list)
    _fix_spectrum_folder(output_match_list)

    return output_file_list, output_match_list