# Imports assumed by the excerpts below (not shown in the original source)
import argparse
import json
import os
import uuid

import pandas as pd
from scipy.stats import mannwhitneyu

import ming_fileio_library
import ming_parallel_library
import ming_proteosafe_library
import metadata_permanova_prioritizer


def finding_matches_in_public_data(input_spectrum_collection, all_datasets, match_parameters):
    all_matches_to_datasets_map = {}

    # Build one search job per GNPS dataset
    dataset_search_parameters = []
    for dataset in all_datasets:
        if dataset["title"].upper().find("GNPS") == -1:
            continue
        dataset_id = dataset["dataset"]
        dataset_search_parameters.append({
            "dataset_id": dataset_id,
            "input_spectrum_collection": input_spectrum_collection,
            "match_parameters": match_parameters
        })

    print("datasets to consider: " + str(len(dataset_search_parameters)))

    # Parallel search across datasets
    search_results = ming_parallel_library.run_parallel_job(
        find_matches_in_dataset_wrapper, dataset_search_parameters, 50)

    # Formatting output; results come back in the same order as the job list
    for i in range(len(search_results)):
        dataset_matches = search_results[i]
        dataset_id = dataset_search_parameters[i]["dataset_id"]
        all_matches_to_datasets_map[dataset_id] = {"matches": dataset_matches}

    return all_matches_to_datasets_map
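# find_matches_in_dataset_wrapper is invoked above but not defined in this
# excerpt. A minimal sketch of the shape such a wrapper could take, assuming
# a hypothetical per-dataset helper find_matches_in_dataset:
def find_matches_in_dataset_wrapper(search_parameters):
    # Unpack the job dict built in finding_matches_in_public_data
    dataset_id = search_parameters["dataset_id"]
    input_spectrum_collection = search_parameters["input_spectrum_collection"]
    match_parameters = search_parameters["match_parameters"]
    try:
        # find_matches_in_dataset (hypothetical) performs the actual search
        return find_matches_in_dataset(dataset_id, input_spectrum_collection, match_parameters)
    except KeyboardInterrupt:
        raise
    except Exception:
        # One failing dataset should not take down the whole parallel job
        return []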
def main():
    parser = argparse.ArgumentParser(description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectra folder')
    parser.add_argument('json_parameters', help='parallelism json parameters')
    parser.add_argument('workflow_parameters', help='proteosafe xml parameters')
    parser.add_argument('library_folder', help='library folder')
    parser.add_argument('result_folder', help='output folder for results')
    parser.add_argument('convert_binary', help='path to the convert binary')
    parser.add_argument('librarysearch_binary', help='path to the librarysearch binary')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    parallel_json = json.loads(open(args.json_parameters).read())

    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)
    library_files = ming_fileio_library.list_files_in_dir(args.library_folder)
    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)
    spectra_files.sort()

    print(spectra_files)
    # Keep only the slice of files assigned to this node
    # (the "total_paritions" key is spelled this way in the upstream JSON)
    spectra_files = spectra_files[parallel_json["node_partition"]::parallel_json["total_paritions"]]
    print(spectra_files)

    temp_folder = "temp"
    try:
        os.mkdir(temp_folder)
    except FileExistsError:
        print("folder error")

    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except FileExistsError:
        print("folder error")

    # Batch the spectrum files, five per search job
    list_of_spectrumfiles = chunks(spectra_files, 5)
    parameter_list = []
    for spectrum_files_chunk in list_of_spectrumfiles:
        param_dict = {}
        param_dict["spectra_files"] = spectrum_files_chunk
        param_dict["temp_folder"] = temp_folder
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args
        param_dict["params_object"] = params_object
        param_dict["library_files"] = library_files
        parameter_list.append(param_dict)

    #for param_dict in parameter_list:
    #    search_wrapper(param_dict)
    print("Parallel to execute", len(parameter_list))
    ming_parallel_library.run_parallel_job(search_wrapper, parameter_list, 5)

    """Merging Files and adding full path"""
    all_result_files = ming_fileio_library.list_files_in_dir(tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        result_list = ming_fileio_library.parse_table_with_headers_object_list(input_file)
        full_result_list += result_list

    # Map each mangled filename back to its original path
    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["SpectrumFile"])
        full_path = mangled_mapping[mangled_name]
        result_object["full_CCMS_path"] = full_path

    ming_fileio_library.write_list_dict_table_data(
        full_result_list,
        os.path.join(args.result_folder, str(uuid.uuid4()) + ".tsv"))
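# Two helpers invoked above, chunks and search_wrapper, are not defined in
# this excerpt. Sketches of plausible implementations follow; the
# search_wrapper body in particular is a guess, since the actual command-line
# interfaces of convert_binary and librarysearch_binary are not shown.
def chunks(input_list, chunk_size):
    # Yield successive fixed-size slices of input_list
    for i in range(0, len(input_list), chunk_size):
        yield input_list[i:i + chunk_size]


def search_wrapper(param_dict):
    # Hypothetical worker: searches each spectrum in the chunk against each
    # library file and writes a TSV into tempresults_folder for the merge step.
    args = param_dict["args"]
    for spectrum_file in param_dict["spectra_files"]:
        for library_file in param_dict["library_files"]:
            output_path = os.path.join(param_dict["tempresults_folder"], str(uuid.uuid4()) + ".tsv")
            # Assumed invocation; the real binary interface is not in the source
            cmd = "{} {} {} {}".format(args.librarysearch_binary, spectrum_file, library_file, output_path)
            os.system(cmd)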
def calculate_statistics(input_quant_filename,
                         input_metadata_file,
                         input_summary_file,
                         output_summary_folder,
                         output_plots_folder=None,
                         metadata_column=None,
                         condition_first=None,
                         condition_second=None,
                         metadata_facet_column=None,
                         run_stats=True,
                         PARALLELISM=8,
                         libraryidentifications_df=None):
    ## Loading feature table
    features_df = pd.read_csv(input_quant_filename, sep=",")
    metadata_df = pd.read_csv(input_metadata_file, sep="\t")
    metadata_df["filename"] = metadata_df["filename"].apply(lambda x: x.rstrip())

    ## Determining if we can even do anything
    print(len(features_df), len(features_df.columns), len(features_df) * len(features_df.columns))
    if len(features_df) * len(features_df.columns) > 10000000:
        print("Feature Table Too Big To Generate")
        return

    # Removing peak area from columns
    feature_information_df = features_df[["row ID", "row retention time", "row m/z"]]
    features_df.index = features_df["row ID"]
    metabolite_id_list = list(features_df["row ID"])
    headers_to_keep = [header for header in features_df.columns if "Peak area" in header]
    features_df = features_df[headers_to_keep]
    column_mapping = {headers: headers.replace(" Peak area", "").rstrip() for headers in features_df.columns}
    features_df = features_df.rename(columns=column_mapping)

    # Transpose
    features_df = features_df.T

    # Merging with Metadata
    features_df["filename"] = features_df.index
    features_df = features_df.merge(metadata_df, how="inner", on="filename")

    # Format long version for later plotting
    long_form_df = pd.melt(features_df,
                           id_vars=metadata_df.columns,
                           value_vars=metabolite_id_list)
    long_form_df = long_form_df.rename(columns={
        "variable": "featureid",
        "value": "featurearea"
    })

    # Adding in feature information
    feature_information_df = feature_information_df.rename(columns={
        "row ID": "featureid",
        "row retention time": "featurert",
        "row m/z": "featuremz"
    })
    long_form_df = long_form_df.merge(feature_information_df, how="left", on="featureid")

    # Adding Library Search Information
    try:
        long_form_df = long_form_df.merge(libraryidentifications_df,
                                          how="left",
                                          left_on="featureid",
                                          right_on="#Scan#")
        long_form_df = long_form_df.drop(columns=["#Scan#"])
    except Exception:
        pass

    long_form_df.to_csv(os.path.join(output_summary_folder, "data_long.csv"), index=False)

    # Trying to add in summary to proteosafe output
    try:
        file_summary_df = pd.read_csv(input_summary_file, sep="\t")
        file_summary_df["filename"] = file_summary_df["full_CCMS_path"].apply(lambda x: os.path.basename(x))
        enriched_long_df = long_form_df.merge(file_summary_df, how="left", on="filename")
        columns_to_keep = list(long_form_df.columns)
        columns_to_keep.append("full_CCMS_path")
        enriched_long_df = enriched_long_df[columns_to_keep]
    except Exception:
        enriched_long_df = long_form_df

    # Visualization in ProteoSAFe
    enriched_long_df.to_csv(os.path.join(output_summary_folder, "data_long_visualize.tsv"), sep="\t", index=False)

    global GLOBAL_DF
    GLOBAL_DF = long_form_df

    if not run_stats:
        return

    param_candidates = []

    # If we do not select a column, we don't calculate stats or do any plots
    if metadata_column in features_df:
        output_boxplot_list = []
        columns_to_consider = metadata_permanova_prioritizer.permanova_validation(input_metadata_file)  # Ignore
        columns_to_consider = [metadata_column]  # HACK TO MAKE FASTER
        if len(columns_to_consider) > 0:
            columns_to_consider = columns_to_consider[:5]

        for column_to_consider in columns_to_consider:
            # Loop through all metabolites, and create plots
            if output_plots_folder is not None:
                for metabolite_id in metabolite_id_list:
                    output_filename = os.path.join(output_plots_folder, "{}_{}.png".format(column_to_consider, metabolite_id))

                    input_params = {}
                    input_params["metadata_column"] = column_to_consider
                    input_params["output_filename"] = output_filename
                    input_params["variable_value"] = metabolite_id
                    param_candidates.append(input_params)

                    output_dict = {}
                    output_dict["metadata_column"] = column_to_consider
                    output_dict["boxplotimg"] = os.path.basename(output_filename)
                    output_dict["scan"] = metabolite_id
                    output_boxplot_list.append(output_dict)

        metadata_all_columns_summary_df = pd.DataFrame(output_boxplot_list)
        metadata_all_columns_summary_df.to_csv(os.path.join(output_summary_folder, "all_columns.tsv"), sep="\t", index=False)

    # Plotting on a specific column
    if metadata_column not in features_df:
        pass
    elif condition_first is None or condition_second is None:
        pass
    elif condition_first == "None" or condition_second == "None":
        pass
    else:
        output_stats_list = []
        features_df = features_df[features_df[metadata_column].isin([condition_first, condition_second])]
        data_first_df = features_df[features_df[metadata_column] == condition_first]
        data_second_df = features_df[features_df[metadata_column] == condition_second]

        for metabolite_id in metabolite_id_list:
            try:
                stat, pvalue = mannwhitneyu(data_first_df[metabolite_id], data_second_df[metabolite_id])
            except KeyboardInterrupt:
                raise
            except Exception:
                continue

            # Note: this branch assumes output_plots_folder is set
            output_filename = os.path.join(output_plots_folder, "chosen_{}_{}.png".format(metadata_column, metabolite_id))

            input_params = {}
            input_params["metadata_column"] = metadata_column
            input_params["output_filename"] = output_filename
            input_params["variable_value"] = metabolite_id
            input_params["metadata_facet"] = metadata_facet_column
            input_params["metadata_conditions"] = condition_first + ";" + condition_second
            param_candidates.append(input_params)

            output_stats_dict = {}
            output_stats_dict["metadata_column"] = metadata_column
            output_stats_dict["condition_first"] = condition_first
            output_stats_dict["condition_second"] = condition_second
            output_stats_dict["stat"] = stat
            output_stats_dict["pvalue"] = pvalue
            output_stats_dict["boxplotimg"] = os.path.basename(output_filename)
            output_stats_dict["scan"] = metabolite_id
            output_stats_list.append(output_stats_dict)

        metadata_columns_summary_df = pd.DataFrame(output_stats_list)
        metadata_columns_summary_df.to_csv(os.path.join(output_summary_folder, "chosen_columns.tsv"), sep="\t", index=False)

    print("Calculate Plots", len(param_candidates))
    ming_parallel_library.run_parallel_job(plot_box, param_candidates, PARALLELISM, backend="multiprocessing")
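# plot_box consumes the param dicts built above but is not included in this
# excerpt. A minimal sketch, assuming seaborn/matplotlib are available and
# reading feature areas from the module-level GLOBAL_DF set earlier; it only
# handles the common fields, not metadata_facet or metadata_conditions.
import matplotlib
matplotlib.use("Agg")  # headless backend, safe for multiprocessing workers
import matplotlib.pyplot as plt
import seaborn as sns


def plot_box(input_params):
    # Select the long-form rows for this one feature
    plot_df = GLOBAL_DF[GLOBAL_DF["featureid"] == input_params["variable_value"]]
    sns.boxplot(x=input_params["metadata_column"], y="featurearea", data=plot_df)
    plt.savefig(input_params["output_filename"])
    plt.close()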
def main():
    parser = argparse.ArgumentParser(description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectra folder')
    parser.add_argument('workflow_parameters', help='proteosafe xml parameters')
    parser.add_argument('result_file', help='output result file')
    parser.add_argument('msaccess_binary', help='path to the msaccess binary')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)
    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)
    spectra_files.sort()

    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except FileExistsError:
        print("folder error")

    parameter_list = []
    for spectrum_file in spectra_files:
        param_dict = {}
        param_dict["spectrum_file"] = spectrum_file
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args
        parameter_list.append(param_dict)

    #for param_dict in parameter_list:
    #    search_wrapper(param_dict)
    print("Parallel to execute", len(parameter_list))
    ming_parallel_library.run_parallel_job(summary_wrapper, parameter_list, 10)

    """Merging Files and adding full path"""
    all_result_files = ming_fileio_library.list_files_in_dir(tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        try:
            result_list = ming_fileio_library.parse_table_with_headers_object_list(input_file)
            for result in result_list:
                output_dict = {}
                output_dict["Filename"] = result["Filename"]
                output_dict["Vendor"] = result["Vendor"]
                output_dict["Model"] = result["Model"]
                output_dict["MS1s"] = result["MS1s"]
                output_dict["MS2s"] = result["MS2s"]
                full_result_list.append(output_dict)
        except Exception:
            print("Error", input_file)

    # Map each mangled filename back to its original path
    used_files = set()
    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["Filename"])
        full_path = mangled_mapping[mangled_name]
        result_object["full_CCMS_path"] = full_path
        result_object["CCMS_filename"] = os.path.basename(full_path)
        used_files.add(full_path)

    # Add placeholder rows for input files that produced no summary output
    for mangled_name in spectra_files:
        full_path = mangled_mapping[os.path.basename(mangled_name)]
        if full_path in used_files:
            continue
        output_dict = {}
        output_dict["full_CCMS_path"] = full_path
        output_dict["CCMS_filename"] = os.path.basename(full_path)
        full_result_list.append(output_dict)

    pd.DataFrame(full_result_list).to_csv(args.result_file, sep="\t", index=False)
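# summary_wrapper is referenced above but not defined in this excerpt. A
# minimal sketch of what each worker plausibly does, shelling out to the
# msaccess binary; the exact msaccess flags used here are an assumption,
# not confirmed by the source.
def summary_wrapper(param_dict):
    args = param_dict["args"]
    # Assumed invocation: request a run summary and write it into
    # tempresults_folder so the merge step above can pick it up.
    cmd = "{} {} -x run_summary -o {}".format(
        args.msaccess_binary, param_dict["spectrum_file"], param_dict["tempresults_folder"])
    os.system(cmd)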
def main():
    parser = argparse.ArgumentParser(description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectra folder')
    parser.add_argument('workflow_parameters', help='proteosafe xml parameters')
    parser.add_argument('result_file', help='output result file')
    parser.add_argument('msaccess_binary', help='path to the msaccess binary')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)
    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)
    spectra_files.sort()

    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except FileExistsError:
        print("folder error")

    parameter_list = []
    for spectrum_file in spectra_files:
        param_dict = {}
        param_dict["spectrum_file"] = spectrum_file
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args
        parameter_list.append(param_dict)

    #for param_dict in parameter_list:
    #    search_wrapper(param_dict)
    print("Parallel to execute", len(parameter_list))
    ming_parallel_library.run_parallel_job(summary_wrapper, parameter_list, 10)

    """Merging Files and adding full path"""
    all_result_files = ming_fileio_library.list_files_in_dir(tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        try:
            result_list = ming_fileio_library.parse_table_with_headers_object_list(input_file)
            for result in result_list:
                output_dict = {}
                output_dict["Filename"] = result["Filename"]
                output_dict["Vendor"] = result["Vendor"]
                output_dict["Model"] = result["Model"]
                output_dict["MS1s"] = result["MS1s"]
                output_dict["MS2s"] = result["MS2s"]
                full_result_list.append(output_dict)
        except Exception:
            print("Error", input_file)

    # Map each mangled filename back to its original path
    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["Filename"])
        full_path = mangled_mapping[mangled_name]
        result_object["full_CCMS_path"] = full_path

    ming_fileio_library.write_list_dict_table_data(full_result_list, args.result_file)
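# This variant writes the merged table with
# ming_fileio_library.write_list_dict_table_data rather than pandas; an
# equivalent final step using pandas, mirroring the previous main(), would be:
# pd.DataFrame(full_result_list).to_csv(args.result_file, sep="\t", index=False)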