def main():
    """Write PSM- and peptide-level tables for a two-pass MSGF+ search.

    Command-line arguments (``sys.argv``):
        1   first-pass MSGF+ results TSV
        2   second-pass MSGF+ results TSV
        3   FASTA database filename (part of the positional layout; unused)
        4   second-pass proteins filename (part of the layout; unused)
        5   output: first-pass peptide list
        6   output: second-pass peptide list
        7   output: first-pass PSMs
        8   output: PSMs with updated e-values
        9   output: "original high FDR" PSMs
        10  output: "updated high FDR" PSMs
    """
    first_pass_results_filename = sys.argv[1]
    second_pass_results_filename = sys.argv[2]
    output_first_pass_peptides = sys.argv[5]
    output_second_pass_peptides = sys.argv[6]
    output_psms_first_pass = sys.argv[7]
    output_psms_updated_evalues = sys.argv[8]
    output_original_high_FDR_psms = sys.argv[9]
    output_updated_high_FDR_psms = sys.argv[10]

    def load_deduplicated_psms(results_filename):
        # Every section below starts from a freshly loaded, de-duplicated
        # set because the FDR filters are applied to the set itself.
        psm_set = ming_psm_library.PSMset(results_filename)
        psm_set.load_MSGF_Plus_tsvfile(results_filename)
        psm_set.remove_duplicated_rows()
        return psm_set

    # Low FDR: original and updated e-values.
    psm_list_first_pass = load_deduplicated_psms(first_pass_results_filename)
    psm_list_first_pass.filter_to_fdr_by_length(0.01)
    with open(output_psms_first_pass, "w") as output_file:
        psm_list_first_pass.write_output(output_file)
    update_psm_set_with_second_pass_psms(first_pass_results_filename,
                                         second_pass_results_filename,
                                         output_psms_updated_evalues)

    # High FDR, kept for reporting purposes.
    # NOTE(review): the cutoff here (0.01) is the same as the low-FDR output
    # above even though this file is labelled "high FDR"; confirm whether a
    # looser threshold was intended.
    psm_list_first_pass = load_deduplicated_psms(first_pass_results_filename)
    psm_list_first_pass.filter_to_fdr_by_length(0.01)
    with open(output_original_high_FDR_psms, "w") as output_file:
        psm_list_first_pass.write_output(output_file)
    update_psm_set_with_second_pass_psms(first_pass_results_filename,
                                         second_pass_results_filename,
                                         output_updated_high_FDR_psms, 0.05)

    # Precursor level: unique peptides for each pass.
    psm_list_first_pass = load_deduplicated_psms(first_pass_results_filename)
    full_peptides_list_first_pass = (
        library_creation.create_library_unique_peptides_filtered(
            [psm_list_first_pass], filter_by_length=True))
    with open(output_first_pass_peptides, "w") as output_file:
        full_peptides_list_first_pass.write_output(output_file)

    psm_list_second_pass = load_deduplicated_psms(
        second_pass_results_filename)
    full_peptides_list_second_pass = (
        library_creation.create_library_unique_peptides_filtered(
            [psm_list_second_pass], filter_by_length=True))
    with open(output_second_pass_peptides, "w") as output_file:
        full_peptides_list_second_pass.write_output(output_file)
def grab_results_from_task(task_id, user, output_peptide_directory,
                           output_psm_directory, params_obj,
                           folder_for_results):
    """Collect one task's PSMs, filter them, and pickle PSM and peptide sets.

    Args:
        task_id: task identifier; also used to name the output files.
        user: user name passed to the result-path lookups.
        output_peptide_directory: directory for ``<task_id>.peptides``.
        output_psm_directory: directory for ``<task_id>.psms``.
        params_obj: parameters forwarded to ``filter_psms_with_params``.
        folder_for_results: result folder holding the task's PSM file.

    Returns:
        dict with keys "task_id", "number_psms" and "number_peptides". Both
        counts stay 0 unless the lookup returns exactly one PSM file.
    """
    return_dict = {
        "number_psms": 0,
        "number_peptides": 0,
        "task_id": task_id,
    }

    # Locate the PSM file for this task.
    path_to_psm_files_list = (
        ming_proteosafe_library.get_proteosafe_result_file_path(
            task_id, user, folder_for_results))

    if len(path_to_psm_files_list) == 1:
        output_psm_path = os.path.join(output_psm_directory,
                                       task_id + ".psms")

        path_to_param_file = (
            ming_proteosafe_library.get_proteosafe_result_file_path(
                task_id, user, "params")[0])

        # These are the original MSGF+ results, which include the
        # fragmentation method. Looked up once and reused for the log line.
        merged_result_paths = (
            ming_proteosafe_library.get_proteosafe_result_file_path(
                task_id, user, "mergedResult"))
        print(task_id, user, merged_result_paths)
        path_to_merged_results = merged_result_paths[0]

        print(path_to_psm_files_list[0] + " to " + output_psm_path)
        name_demangle_filenames_and_instrument_collision(
            path_to_psm_files_list[0], output_psm_path, path_to_param_file,
            path_to_merged_results, "filename", "filename")

        # Build the PSM set from the demangled file written just above.
        psm_set = ming_psm_library.PSMset("task results")
        psm_set.load_PSM_tsvfile(output_psm_path, True)
        print("PSM Count", len(psm_set.psms))
        psm_set.psms = filter_psms_with_params(params_obj, psm_set.psms)

        # Tag each PSM with the task it came from.
        for psm in psm_set.psms:
            psm.extra_metadata["proteosafe_task"] = task_id

        print("PSM Count Filtered", len(psm_set.psms))
        psm_set.filter_to_fdr_by_length(0.05)

        # The demangled TSV at output_psm_path is replaced by the pickle.
        with open(output_psm_path, 'wb') as output_pickle:
            pickle.dump(psm_set, output_pickle, pickle.HIGHEST_PROTOCOL)

        output_peptide_path = os.path.join(output_peptide_directory,
                                           task_id + ".peptides")
        peptide_variant_set = save_psms_as_peptides(psm_set,
                                                    output_peptide_path,
                                                    0.05)

        return_dict["number_psms"] = len(psm_set.psms)
        return_dict["number_peptides"] = len(
            peptide_variant_set.peptide_list)

    return return_dict
def get_first_pass_variant_set(first_pass_results_filename):
    """Load first-pass MSGF+ PSMs and return their unique peptide set.

    The PSMs are filtered with ``filter_to_fdr_by_length(0.05)`` before the
    peptide set is built. PSM and variant counts are printed along the way.

    Args:
        first_pass_results_filename: path to the first-pass MSGF+ TSV.

    Returns:
        The object returned by
        ``library_creation.create_library_unique_peptides_filtered``.
    """
    psm_list_first_pass = ming_psm_library.PSMset(first_pass_results_filename)
    psm_list_first_pass.load_MSGF_Plus_tsvfile(first_pass_results_filename)
    psm_list_first_pass.filter_to_fdr_by_length(0.05)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("First Pass PSMs: " + str(len(psm_list_first_pass)))

    full_peptides_list_first_pass = (
        library_creation.create_library_unique_peptides_filtered(
            [psm_list_first_pass], filter_by_length=True))

    print("First Pass Variants: " + str(len(full_peptides_list_first_pass)))

    return full_peptides_list_first_pass
def save_psms_as_peptides(psm_set, output_peptide_path, fdr):
    """Pickle the best PSM of every unique peptide found in ``psm_set``.

    Args:
        psm_set: PSM set to collapse into unique peptides.
        output_peptide_path: destination of the pickled PSM set.
        fdr: threshold forwarded to
            ``create_library_unique_peptides_filtered``.

    Returns:
        The peptide variant set built from ``psm_set``.
    """
    peptide_variant_set = (
        library_creation.create_library_unique_peptides_filtered(
            [psm_set], fdr, filter_by_length=True))

    # Collect one representative PSM per peptide in a separate set rather
    # than rebinding the caller's argument name.
    best_psm_set = ming_psm_library.PSMset("task results")
    for peptide in peptide_variant_set.peptide_list:
        best_psm_set.psms.append(peptide.get_best_psm())

    with open(output_peptide_path, 'wb') as output_pickle:
        pickle.dump(best_psm_set, output_pickle, pickle.HIGHEST_PROTOCOL)

    return peptide_variant_set
def update_psm_set_with_second_pass_psms(first_pass_psms, second_pass_psms,
                                         output_psms, FDR=0.05):
    """Update first-pass PSMs from the second pass, filter, and write them.

    Both result files are loaded and de-duplicated, the pair is handed to
    ``update_evalues_first_second_pass``, and the first-pass set is then
    filtered with ``filter_to_fdr_by_length(FDR)`` and written out with its
    extra metadata.

    Args:
        first_pass_psms: path to the first-pass MSGF+ TSV.
        second_pass_psms: path to the second-pass MSGF+ TSV.
        output_psms: path of the TSV to write.
        FDR: threshold passed to ``filter_to_fdr_by_length``.
    """
    print("Loading second pass PSMs", second_pass_psms)
    psm_list_second_pass = ming_psm_library.PSMset(second_pass_psms)
    psm_list_second_pass.load_MSGF_Plus_tsvfile(second_pass_psms)
    psm_list_second_pass.remove_duplicated_rows()

    print("Loading first pass PSMs", first_pass_psms)
    psm_list_first_pass = ming_psm_library.PSMset(first_pass_psms)
    psm_list_first_pass.load_MSGF_Plus_tsvfile(first_pass_psms)
    psm_list_first_pass.remove_duplicated_rows()

    update_evalues_first_second_pass(psm_list_first_pass,
                                     psm_list_second_pass)

    psm_list_first_pass.filter_to_fdr_by_length(FDR)

    # Write the results; the context manager closes the file even if
    # write_output raises.
    with open(output_psms, "w") as output_file:
        psm_list_first_pass.write_output(output_file,
                                         write_extra_metadata=True)
def main():
    """Write the stripped peptide list and the full variant table.

    Command-line arguments (``sys.argv``):
        1  MSGF+ search results TSV
        2  output: one stripped peptide sequence per line (0.01 threshold)
        3  output: all peptide variants built with a 1.0 threshold
    """
    input_searchresults_filename = sys.argv[1]
    output_peptide_list = sys.argv[2]
    output_peptide_list_with_decoy_filename = sys.argv[3]

    psm_list = ming_psm_library.PSMset(input_searchresults_filename)
    psm_list.load_MSGF_Plus_tsvfile(input_searchresults_filename)
    psm_list.filter_to_fdr_by_length(0.01)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(len(psm_list))

    full_peptides_list = (
        library_creation.create_library_unique_peptides_filtered(
            [psm_list], fdr=0.01, filter_by_length=True))

    # De-duplicate the stripped sequences; output order is whatever the set
    # yields, as before.
    all_peptides = list(set(
        peptide.get_stripped_sequence()
        for peptide in full_peptides_list.peptide_list))

    with open(output_peptide_list, "w") as output_file:
        for peptide in all_peptides:
            output_file.write(peptide + "\n")

    # Reload the PSMs, keep all variants, and write them out with the decoys
    # still present.
    print("GIVING US FULL RESULT SET")
    psm_list = ming_psm_library.PSMset(input_searchresults_filename)
    psm_list.load_MSGF_Plus_tsvfile(input_searchresults_filename)
    full_peptides_list = (
        library_creation.create_library_unique_peptides_filtered(
            [psm_list], 1.0))

    with open(output_peptide_list_with_decoy_filename, "w") as decoy_file:
        decoy_file.write(
            ming_psm_library.PeptideVariant.output_header() + "\n")
        for peptide in full_peptides_list.peptide_list:
            decoy_file.write(str(peptide) + "\n")
def main():
    """Write a per-protein table of target and decoy peptide counts.

    Command-line arguments (``sys.argv``):
        1  FASTA proteome file
        2  MSGF+ search results TSV
        3  output: tab-separated table with columns protein, decoy_count,
           target_count, total_count, length
    """
    input_fasta_filename = sys.argv[1]
    input_searchresults_filename = sys.argv[2]
    output_proteins_as_list = sys.argv[3]

    proteome = ming_protein_library.parse_fasta_proteome_file(
        input_fasta_filename)

    psm_list = ming_psm_library.PSMset(input_searchresults_filename)
    psm_list.load_MSGF_Plus_tsvfile(input_searchresults_filename)
    full_peptides_list = (
        library_creation.create_library_unique_peptides_filtered(
            [psm_list], 0.01, filter_by_length=True))

    target_peptide_strings = []
    decoy_peptide_strings = []
    for peptide_obj in full_peptides_list.peptide_list:
        peptide_to_search = peptide_obj.get_stripped_sequence()
        if peptide_obj.is_decoy():
            # Decoy sequences are reversed before being looked up in the
            # proteome.
            decoy_peptide_strings.append(peptide_to_search[::-1])
        else:
            target_peptide_strings.append(peptide_to_search)

    protein_coverage_of_targets = (
        proteome.get_proteins_with_number_of_peptides_covered_map(
            target_peptide_strings))
    protein_coverage_of_decoys = (
        proteome.get_proteins_with_number_of_peptides_covered_map(
            decoy_peptide_strings))

    with open(output_proteins_as_list, "w") as output_file:
        output_file.write(
            "protein\tdecoy_count\ttarget_count\ttotal_count\tlength\n")
        # Rows follow the target map; the decoy map is indexed with the same
        # protein keys.
        for protein in protein_coverage_of_targets:
            decoy_count = protein_coverage_of_decoys[protein]
            target_count = protein_coverage_of_targets[protein]
            columns = [
                protein,
                str(decoy_count),
                str(target_count),
                str(target_count + decoy_count),
                str(len(proteome.protein_map[protein].sequence)),
            ]
            output_file.write("\t".join(columns) + "\n")
def main():
    """Merge this node's share of pickled PSM sets into one PSM file.

    Command-line arguments (``sys.argv``):
        1  JSON file with "node_partition" and "total_paritions"
        2  parameter XML file
        3  folder containing the pickled PSM sets
        4  output folder; the result is written to ``<node>.psms`` there
    """
    with open(sys.argv[1]) as parallel_file:
        parallel_json = json.load(parallel_file)
    params_filename = sys.argv[2]
    input_folder_of_results = sys.argv[3]
    output_folder = sys.argv[4]

    my_node = parallel_json["node_partition"]
    # The key spelling ("paritions") is what the input JSON uses.
    total_node = parallel_json["total_paritions"]

    all_input_files = ming_fileio_library.list_files_in_dir(
        input_folder_of_results)
    all_input_files.sort()

    # TODO We will have to read parameters and see if we need to eliminate
    # some PSMs, with PSM FDR filter, KL Filter, ambiguity score filter,
    # unique intensity filter.
    # The parameters are parsed but not consulted yet.
    with open(params_filename) as params_file:
        params_obj = ming_proteosafe_library.parse_xml_file(params_file)

    # Every total_node-th file, starting at this node's index, belongs to
    # this node.
    all_input_files = all_input_files[my_node::total_node]

    current_working_psm_set = ming_psm_library.PSMset("Ming")

    for total_file_count, input_file in enumerate(all_input_files, 1):
        # Assume these are variant files. Each is treated like a PSM file and
        # all of them are combined into one new set.
        print(input_file, total_file_count, "of", len(all_input_files))
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only run this on inputs produced by trusted pipeline steps.
        with open(input_file, 'rb') as input_pickle:
            temp_psm_set = pickle.load(input_pickle)
        print("Loaded", len(temp_psm_set.psms))

        for psm in temp_psm_set.psms:
            current_working_psm_set.psms.append(psm)

    # Save the combined PSMs.
    output_filename = os.path.join(output_folder, str(my_node) + ".psms")
    with open(output_filename, "w") as output_file:
        current_working_psm_set.write_output(output_file, True)
def grab_results_from_multipass(task_id, user, output_peptide_directory,
                                output_psm_directory):
    """Copy one multipass task's PSMs and write its unique peptide list.

    Args:
        task_id: task identifier; also used to name the output files.
        user: user name passed to the result-path lookups.
        output_peptide_directory: directory for ``<task_id>.peptides``.
        output_psm_directory: directory for ``<task_id>.psms``.

    Returns:
        dict with keys "task_id", "number_psms" and "number_peptides". Both
        counts stay 0 unless the lookup returns exactly one PSM file.
    """
    return_dict = {
        "number_psms": 0,
        "number_peptides": 0,
        "task_id": task_id,
    }

    # Locate the PSM file for this task.
    path_to_psm_files_list = (
        ming_proteosafe_library.get_proteosafe_result_file_path(
            task_id, user, "updated_eval_psms_with_kl_with_ambiguity"))

    if len(path_to_psm_files_list) == 1:
        output_psm_path = os.path.join(output_psm_directory,
                                       task_id + ".psms")

        path_to_param_file = (
            ming_proteosafe_library.get_proteosafe_result_file_path(
                task_id, user, "params")[0])

        # Looked up once and reused for the log line.
        merged_result_paths = (
            ming_proteosafe_library.get_proteosafe_result_file_path(
                task_id, user, "mergedResult"))
        print(merged_result_paths)
        path_to_merged_results = merged_result_paths[0]

        print(path_to_psm_files_list[0] + " to " + output_psm_path)
        name_demangle_filenames_and_instrument_collision(
            path_to_psm_files_list[0], output_psm_path, path_to_param_file,
            path_to_merged_results, "filename", "filename")

        # Generate the peptide list from the demangled PSM file.
        psm_set = ming_psm_library.PSMset("task results")
        psm_set.load_PSM_tsvfile(output_psm_path)

        output_peptide_path = os.path.join(output_peptide_directory,
                                           task_id + ".peptides")
        peptide_variant_set = (
            library_creation.create_library_unique_peptides_filtered(
                [psm_set], 0.01))
        with open(output_peptide_path, "w") as output_file:
            peptide_variant_set.write_output(output_file)

        return_dict["number_psms"] = len(psm_set.psms)
        return_dict["number_peptides"] = len(
            peptide_variant_set.peptide_list)

    return return_dict
def main():
    """Filter PSMs for one spectrum file and write the surviving set.

    Command-line arguments (``sys.argv``):
        1  parameter XML file
        2  spectrum file currently being processed
        3  folder containing all spectrum files
        4  MSGF+ PSM results TSV
        5  folder with collision-energy metadata
        6  output: filtered PSM file
    """
    paramxml_filename = sys.argv[1]
    input_spectrum_filename = sys.argv[2]
    input_spectrum_all = sys.argv[3]
    psms_input_file = sys.argv[4]
    input_collision_energy_folder = sys.argv[5]
    output_psms_file = sys.argv[6]

    with open(paramxml_filename) as param_file:
        parameters_obj = ming_proteosafe_library.parse_xml_file(param_file)

    scan_metadata_maps = load_collision_energy_mapping(
        input_collision_energy_folder)

    target_filename_list, decoy_filename_list = (
        determine_set_of_target_and_decoy_spectrum_files(parameters_obj))

    input_psm_set = ming_psm_library.PSMset("input psms")
    input_psm_set.load_MSGF_Plus_tsvfile(psms_input_file)

    # Filtering on collision energy.
    print("Size Before Filtering", len(input_psm_set.psms))
    filter_psms_to_acceptable_metadata(input_psm_set, scan_metadata_maps,
                                       parameters_obj)
    print("Size After CE Filtering", len(input_psm_set.psms))

    # Filtering to the current file.
    current_file_psms = get_psms_to_current_file(input_psm_set,
                                                 input_spectrum_filename)
    target_file_psms = get_psms_to_target_file(input_psm_set,
                                               target_filename_list)

    print(len(current_file_psms), len(target_file_psms))

    output_decoys_list = []
    if os.path.basename(input_spectrum_filename) in target_filename_list:
        # Target file: no decoy filtering, keep the target PSMs as they are.
        print("Target")
        output_decoys_list = target_file_psms
    else:
        # Find the top scoring hit for each precursor.
        blacklisted_decoy_peptides = json.loads(
            parameters_obj["blacklisted_decoy_peptides_json"][0])
        current_file_psms = filtering_out_blacklisted_decoys(
            current_file_psms, blacklisted_decoy_peptides)
        output_decoys_list = filtering_out_high_scoring_decoys(
            current_file_psms, target_file_psms,
            os.path.join(input_spectrum_all, target_filename_list[0]),
            input_spectrum_filename)

    # NOTE(review): the original layout does not show whether this step
    # belonged to the else-branch only; it is applied to both branches here.
    # Confirm against the upstream script.
    output_decoys_list = filtering_redundant_identifications_per_scan(
        output_decoys_list)

    input_psm_set.psms = output_decoys_list
    with open(output_psms_file, "w") as output_file:
        input_psm_set.write_output(output_file)
def main():
    """Write the proteins returned by ``get_proteins_covered_by_k_peptides``.

    Command-line arguments (``sys.argv``):
        1  FASTA proteome file
        2  MSGF+ search results TSV
        3  output: JSON list of protein names (the argument is called
           ``output_fasta_filename`` but JSON is what gets written)
        4  output: plain-text protein list with a "Protein" header line

    The process exits with status 0 when done.
    """
    input_fasta_filename = sys.argv[1]
    input_searchresults_filename = sys.argv[2]
    output_fasta_filename = sys.argv[3]
    output_proteins_as_list = sys.argv[4]

    proteome = ming_protein_library.parse_fasta_proteome_file(
        input_fasta_filename)

    psm_list = ming_psm_library.PSMset(input_searchresults_filename)
    psm_list.load_MSGF_Plus_tsvfile(input_searchresults_filename)
    psm_list.filter_to_fdr_by_length(0.01)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(len(psm_list))

    full_peptides_list = (
        library_creation.create_library_unique_peptides_filtered(
            [psm_list], fdr=0.01, filter_by_length=True))

    all_peptide_strings = [
        peptide_obj.get_stripped_sequence()
        for peptide_obj in full_peptides_list.peptide_list
    ]

    all_proteins = proteome.get_proteins_covered_by_k_peptides(
        all_peptide_strings, 2, True)
    all_protein_names = [protein.protein for protein in all_proteins]

    with open(output_fasta_filename, "w") as output_protein_file:
        output_protein_file.write(json.dumps(all_protein_names))

    # Output the list of proteins, one per line.
    with open(output_proteins_as_list, "w") as output_protein_list_file:
        output_protein_list_file.write("Protein\n")
        for protein in all_protein_names:
            output_protein_list_file.write(protein + "\n")

    sys.exit(0)
def create_library_merged_psm_list_separate_fdr_peptide_length(
        psm_set, fdr=0.01):
    """Build a peptide variant set, applying the FDR filter per length.

    PSMs are bucketed by the length of their stripped sequence. Each bucket
    is turned into its own peptide variant set and filtered with
    ``filter_to_fdr(fdr)``, and the filtered sets are merged into one.

    Args:
        psm_set: object whose ``psms`` attribute is iterated.
        fdr: threshold passed to ``filter_to_fdr`` for every length bucket.

    Returns:
        The combined ``PeptideVariantSet``.
    """
    combined_variants = ming_psm_library.PeptideVariantSet("Combined")

    # Bucket the PSMs by stripped-sequence length.
    psms_by_length = {}
    for current_psm in psm_set.psms:
        sequence_length = len(current_psm.get_stripped_sequence())
        bucket = psms_by_length.get(sequence_length)
        if bucket is None:
            bucket = ming_psm_library.PSMset(
                "length" + str(sequence_length))
            psms_by_length[sequence_length] = bucket
        bucket.psms.append(current_psm)

    # Run the FDR filter on each length separately, then merge.
    for length_bucket in psms_by_length.values():
        length_variants = ming_psm_library.PeptideVariantSet("Test")
        length_variants.add_psms_set(length_bucket)
        length_variants.filter_to_fdr(fdr)
        combined_variants.add_variant_set(length_variants)

    return combined_variants
def main():
    """Filter MSGF+ PSMs and append KL columns to the written table.

    Command-line arguments (``sys.argv``):
        1  parameter XML file (parsed, not otherwise used here)
        2  MSGF+ PSM results TSV
        3  KL table with columns "Filename", "Scan", "KL Strict", "KL" and
           "Interpeak intensity"
        4  output: PSM table with kl_strict, kl_unstrict and kl_interpeak
           columns added

    PSMs without a matching KL row get -1 in all three columns.
    """
    paramxml_filename = sys.argv[1]
    psms_input_file = sys.argv[2]
    kl_input_file = sys.argv[3]
    output_psms_file = sys.argv[4]

    with open(paramxml_filename) as param_file:
        parameters_obj = ming_proteosafe_library.parse_xml_file(param_file)

    input_psm_set = ming_psm_library.PSMset("input psms")
    input_psm_set.load_MSGF_Plus_tsvfile(psms_input_file)
    input_psm_set.remove_redundant_psms()
    input_psm_set.filter_to_fdr_by_length(0.01)

    # Index the KL rows by "<basename>:<scan>".
    row_count, kl_data = ming_fileio_library.parse_table_with_headers(
        kl_input_file)
    kl_dict = {}
    for i in range(row_count):
        filename = os.path.basename(kl_data["Filename"][i])
        key = filename + ":" + str(kl_data["Scan"][i])
        kl_dict[key] = {
            "kl_strict": kl_data["KL Strict"][i],
            "kl_unstrict": kl_data["KL"][i],
            "kl_interpeak": kl_data["Interpeak intensity"][i],
        }

    with open(output_psms_file, "w") as output_file:
        input_psm_set.write_output(output_file)

    # Since the psm object does not support more fields, read the file in
    # again as a TSV table and add the columns there.
    psm_rows, psm_table_data = ming_fileio_library.parse_table_with_headers(
        output_psms_file)

    kl_columns = ("kl_strict", "kl_unstrict", "kl_interpeak")
    for column in kl_columns:
        psm_table_data[column] = []

    for i in range(psm_rows):
        # NOTE(review): the PSM filename is used as written, while the KL
        # keys use the basename; confirm both tables agree on this.
        key = psm_table_data["filename"][i] + ":" + psm_table_data["scan"][i]
        kl_values = kl_dict.get(key)
        for column in kl_columns:
            psm_table_data[column].append(
                -1 if kl_values is None else kl_values[column])

    ming_fileio_library.write_dictionary_table_data(psm_table_data,
                                                    output_psms_file)
def main():
    """Write synthetic target and decoy PSMs, then add KL/ambiguity columns.

    Command-line arguments (``sys.argv``):
        1  parameter XML file
        2  PSM TSV (loaded with extra metadata)
        3  KL table with columns "Filename", "Scan", "KL Strict", "KL" and
           "Interpeak intensity"
        4  output: filtered PSM table with the extra columns appended
        5  output: decoy PSM table

    PSMs without a matching KL row get -1 in the KL columns. The ambiguity
    columns are filled with placeholder values (no ambiguity assumed).
    """
    print(sys.argv)

    paramxml_filename = sys.argv[1]
    psms_input_file = sys.argv[2]
    kl_input_file = sys.argv[3]
    output_psms_file = sys.argv[4]
    output_decoy_psms_file = sys.argv[5]

    with open(paramxml_filename) as param_file:
        parameters_obj = ming_proteosafe_library.parse_xml_file(param_file)

    target_filename_list, decoy_filename_list = (
        determine_set_of_target_and_decoy_spectrum_files(parameters_obj))

    input_psm_set = ming_psm_library.PSMset("input psms")
    input_psm_set.load_PSM_tsvfile(psms_input_file, load_extra_metadata=True)

    # The decoy set is taken before the input set is filtered.
    decoy_psm_set = ming_psm_library.PSMset("decoy psms")
    decoy_psm_set.psms = input_psm_set.synthetic_psms_by_length_decoy_set(
        target_filename_list, decoy_filename_list)

    print("GETTING ALL SYNETHTIC with 0% FDR")
    input_psm_set.filter_synthetic_psms_by_length(target_filename_list,
                                                  decoy_filename_list)

    # Index the KL rows by "<basename>:<scan>".
    row_count, kl_data = ming_fileio_library.parse_table_with_headers(
        kl_input_file)
    kl_dict = {}
    for i in range(row_count):
        filename = os.path.basename(kl_data["Filename"][i])
        key = filename + ":" + str(kl_data["Scan"][i])
        kl_dict[key] = {
            "kl_strict": kl_data["KL Strict"][i],
            "kl_unstrict": kl_data["KL"][i],
            "kl_interpeak": kl_data["Interpeak intensity"][i],
        }

    with open(output_psms_file, "w") as output_file:
        input_psm_set.write_output(output_file, write_extra_metadata=True)
    with open(output_decoy_psms_file, "w") as decoy_output_file:
        decoy_psm_set.write_output(decoy_output_file,
                                   write_extra_metadata=True)

    # Since the psm object does not support more fields, read the file in
    # again as a TSV table and add the columns there.
    psm_rows, psm_table_data = ming_fileio_library.parse_table_with_headers(
        output_psms_file)

    kl_columns = ("kl_strict", "kl_unstrict", "kl_interpeak")
    placeholder_columns = (
        "ambiguity_total_score",
        "first_second_unique_ratio",
        "first_unique_count",
        "first_unique_intensity",
        "second_unique_count",
        "second_unique_intensity",
    )
    for column in kl_columns + placeholder_columns + (
            "numberpsms", "spectrum_unique_key", "modified_sequence"):
        psm_table_data[column] = []

    for i in range(psm_rows):
        key = psm_table_data["filename"][i] + ":" + psm_table_data["scan"][i]
        kl_values = kl_dict.get(key)
        for column in kl_columns:
            psm_table_data[column].append(
                -1 if kl_values is None else kl_values[column])

        # Write the ambiguity columns, assuming no ambiguity.
        for column in placeholder_columns:
            psm_table_data[column].append("-1")
        psm_table_data["numberpsms"].append(1)
        psm_table_data["spectrum_unique_key"].append(key)
        # The last two characters of the sequence are dropped.
        psm_table_data["modified_sequence"].append(
            psm_table_data["sequence"][i][:-2])

    ming_fileio_library.write_dictionary_table_data(psm_table_data,
                                                    output_psms_file)
def get_second_pass_psms(second_pass_results_filename):
    """Load the second-pass MSGF+ TSV into a new PSM set and return it.

    Args:
        second_pass_results_filename: path to the MSGF+ results TSV; also
            passed to the ``PSMset`` constructor.

    Returns:
        The populated ``ming_psm_library.PSMset``.
    """
    loaded_psms = ming_psm_library.PSMset(second_pass_results_filename)
    loaded_psms.load_MSGF_Plus_tsvfile(second_pass_results_filename)
    return loaded_psms
def main():
    """Extract spectra for the PSMs and spread them over JSON partitions.

    Command-line arguments (``sys.argv``):
        1  parameter XML file
        2  PSM TSV (loaded with extra metadata)
        3  folder for the partition files
        4  number of partition files to write

    Each spectrum goes to the partition selected by the SHA-1 hash of its
    "annotation" value, modulo the partition count. Every partition file is
    a JSON array named ``<input basename>_partition_<i>.json``.
    """
    input_paramxml = sys.argv[1]
    input_tsv_filename = sys.argv[2]
    intermediate_output_folder = sys.argv[3]
    output_file_bins = int(sys.argv[4])

    with open(input_paramxml) as param_file:
        params_obj = ming_proteosafe_library.parse_xml_file(param_file)

    snr_threshold = get_snr_filter(params_obj)

    # Filtering criteria; these defaults apply unless every parameter below
    # can be read and parsed.
    minimum_explained_intensity = 0.0
    min_number_of_peaks_within_1_percent_of_max = 0.0
    min_signal_peaks = 0.0
    min_number_of_annotated_ions = 0.0
    max_kl_strict_score = 50
    max_ppm_error = 100000000

    try:
        parsed_filters = (
            float(params_obj["min_explained_intensity"][0]),
            float(params_obj["min_number_of_peaks_within_1_percent_of_max"][0]),
            float(params_obj["min_signal_peaks"][0]),
            float(params_obj["min_number_of_annotated_ions"][0]),
            float(params_obj["kl_strict_max"][0]),
            float(params_obj["max_ppm_error"][0]),
        )
    except Exception:
        # Best effort: any missing or malformed parameter leaves ALL filters
        # at their defaults. The original reset only some of them, so
        # min_number_of_annotated_ions could stay at its parsed value.
        print("exception")
    else:
        (minimum_explained_intensity,
         min_number_of_peaks_within_1_percent_of_max,
         min_signal_peaks,
         min_number_of_annotated_ions,
         max_kl_strict_score,
         max_ppm_error) = parsed_filters
        # A value of 0 falls back to the default cutoff.
        # NOTE(review): max_kl_strict_score is not passed on below.
        if max_kl_strict_score == 0:
            max_kl_strict_score = 50

    # Load the PSMs and group them by source filename.
    psm_set = ming_psm_library.PSMset("")
    psm_set.load_PSM_tsvfile(input_tsv_filename, load_extra_metadata=True)

    filename_to_psm_dict = group_psms_by_filename(psm_set)

    # All output files are binned from here on.
    output_filename_prefix = os.path.join(
        intermediate_output_folder,
        ming_fileio_library.get_filename_without_extension(
            os.path.basename(input_tsv_filename)) + "_partition_")

    output_files = {}
    output_files_number_spectra = {}
    try:
        for i in range(output_file_bins):
            output_filename = output_filename_prefix + str(i) + ".json"
            output_file = open(output_filename, "w")
            output_files[i] = output_file
            output_file.write("[")
            output_files_number_spectra[i] = 0

        for filename in filename_to_psm_dict:
            extracted_spectra = extract_psms_from_filename(
                filename, filename_to_psm_dict[filename], snr_threshold,
                minimum_explained_intensity, min_signal_peaks,
                min_number_of_peaks_within_1_percent_of_max,
                min_number_of_annotated_ions, max_ppm_error)
            for spectrum in extracted_spectra:
                annotation_digest = hashlib.sha1(
                    spectrum["annotation"].encode('utf-8')).hexdigest()
                hashed_index = int(annotation_digest, 16) % output_file_bins
                # Every entry after the first gets a leading comma so the
                # file stays a valid JSON array.
                if output_files_number_spectra[hashed_index] == 0:
                    output_files[hashed_index].write(
                        json.dumps(spectrum) + "\n")
                else:
                    output_files[hashed_index].write(
                        "," + json.dumps(spectrum) + "\n")
                output_files_number_spectra[hashed_index] += 1

        for i in range(output_file_bins):
            output_files[i].write("]")
    finally:
        # Close every partition file that was opened, even on failure.
        for output_file in output_files.values():
            output_file.close()