import os
import json
import uuid
import argparse

import pandas as pd
from scipy.stats import mannwhitneyu

import ming_fileio_library
import ming_parallel_library
import ming_proteosafe_library
import metadata_permanova_prioritizer

# Shared with the plotting workers spawned in calculate_statistics below
GLOBAL_DF = None


def finding_matches_in_public_data(input_spectrum_collection, all_datasets,
                                   match_parameters):
    all_matches_to_datasets_map = {}

    dataset_search_parameters = []
    # Only consider GNPS datasets
    for dataset in all_datasets:
        if "GNPS" not in dataset["title"].upper():
            continue
        dataset_id = dataset["dataset"]
        dataset_search_parameters.append({
            "dataset_id": dataset_id,
            "input_spectrum_collection": input_spectrum_collection,
            "match_parameters": match_parameters
        })

    print("datasets to consider: " + str(len(dataset_search_parameters)))

    # Search each dataset in parallel, up to 50 at a time
    search_results = ming_parallel_library.run_parallel_job(
        find_matches_in_dataset_wrapper, dataset_search_parameters, 50)

    # Format output: results come back in input order, so zip them
    # with the parameters they were generated from
    for parameters, dataset_matches in zip(dataset_search_parameters,
                                           search_results):
        dataset_id = parameters["dataset_id"]
        all_matches_to_datasets_map[dataset_id] = {"matches": dataset_matches}

    return all_matches_to_datasets_map
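

# The fan-out above goes through ming_parallel_library.run_parallel_job,
# which is not part of this listing. A minimal stand-in with the same call
# shape -- assuming it maps the worker over the parameter dicts with a pool
# and returns results in input order -- could look like this:
from concurrent.futures import ThreadPoolExecutor

def run_parallel_job_sketch(worker, parameter_list, max_workers, backend=None):
    """Hypothetical sketch: apply worker to each parameter dict in order."""
    # executor.map preserves input order, which callers rely on when
    # zipping results back to their parameters
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        return list(executor.map(worker, parameter_list))
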
def main():
    parser = argparse.ArgumentParser(
        description='Running library search parallel')
    parser.add_argument('spectra_folder', help='input spectra folder')
    parser.add_argument('json_parameters',
                        help='json file with parallelism parameters')
    parser.add_argument('workflow_parameters',
                        help='proteosafe workflow parameters xml')
    parser.add_argument('library_folder', help='input library folder')
    parser.add_argument('result_folder', help='output folder for results')
    parser.add_argument('convert_binary', help='path to the convert binary')
    parser.add_argument('librarysearch_binary',
                        help='path to the librarysearch binary')
    parser.add_argument('--parallelism',
                        default=1,
                        type=int,
                        help='number of parallel jobs')
    args = parser.parse_args()

    with open(args.json_parameters) as json_file:
        parallel_json = json.load(json_file)

    with open(args.workflow_parameters) as params_file:
        params_object = ming_proteosafe_library.parse_xml_file(params_file)
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        params_object)
    library_files = ming_fileio_library.list_files_in_dir(args.library_folder)
    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)

    spectra_files.sort()

    # Keep only this node's share of the files; "total_paritions" (sic)
    # matches the key spelling in the upstream json parameters
    print(spectra_files)
    spectra_files = spectra_files[
        parallel_json["node_partition"]::parallel_json["total_paritions"]]
    print(spectra_files)

    temp_folder = "temp"
    try:
        os.mkdir(temp_folder)
    except:
        print("folder error")

    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except:
        print("folder error")

    # Each parallel job handles a chunk of five spectrum files
    list_of_spectrumfiles = chunks(spectra_files, 5)
    parameter_list = []
    for spectrum_files_chunk in list_of_spectrumfiles:
        param_dict = {}
        param_dict["spectra_files"] = spectrum_files_chunk
        param_dict["temp_folder"] = temp_folder
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args
        param_dict["params_object"] = params_object
        param_dict["library_files"] = library_files

        parameter_list.append(param_dict)

    # Serial fallback for debugging:
    # for param_dict in parameter_list:
    #     search_wrapper(param_dict)
    print("Parallel to execute", len(parameter_list))
    ming_parallel_library.run_parallel_job(search_wrapper, parameter_list, 5)
    """Merging Files and adding full path"""
    all_result_files = ming_fileio_library.list_files_in_dir(
        tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        result_list = ming_fileio_library.parse_table_with_headers_object_list(
            input_file)
        full_result_list += result_list

    # Map mangled names back to full CCMS paths
    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["SpectrumFile"])
        full_path = mangled_mapping[mangled_name]
        result_object["full_CCMS_path"] = full_path

    ming_fileio_library.write_list_dict_table_data(
        full_result_list,
        os.path.join(args.result_folder,
                     str(uuid.uuid4()) + ".tsv"))
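

# The chunks helper used above is not shown in this listing. A minimal
# sketch of the fixed-size chunker it appears to be (an assumption, not
# the original definition):
def chunks_sketch(items, chunk_size):
    """Split a list into consecutive chunks of at most chunk_size items."""
    return [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]
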
def calculate_statistics(input_quant_filename,
                         input_metadata_file,
                         input_summary_file,
                         output_summary_folder,
                         output_plots_folder=None,
                         metadata_column=None,
                         condition_first=None,
                         condition_second=None,
                         metadata_facet_column=None,
                         run_stats=True,
                         PARALLELISM=8,
                         libraryidentifications_df=None):
    ## Loading feature and metadata tables
    features_df = pd.read_csv(input_quant_filename, sep=",")
    metadata_df = pd.read_csv(input_metadata_file, sep="\t")
    metadata_df["filename"] = metadata_df["filename"].str.rstrip()

    ## Bail out if the feature table is too large to process
    print(len(features_df), len(features_df.columns),
          len(features_df) * len(features_df.columns))
    if len(features_df) * len(features_df.columns) > 10000000:
        print("Feature Table Too Big To Generate")
        return

    # Pull out feature metadata, then keep only the per-file "Peak area"
    # columns and strip that suffix from the column names
    feature_information_df = features_df[[
        "row ID", "row retention time", "row m/z"
    ]]
    features_df.index = features_df["row ID"]
    metabolite_id_list = list(features_df["row ID"])
    headers_to_keep = [
        header for header in features_df.columns if "Peak area" in header
    ]
    features_df = features_df[headers_to_keep]
    column_mapping = {
        headers: headers.replace(" Peak area", "").rstrip()
        for headers in features_df.columns
    }
    features_df = features_df.rename(columns=column_mapping)

    # Transpose
    features_df = features_df.T

    # Merging with Metadata
    features_df["filename"] = features_df.index
    features_df = features_df.merge(metadata_df, how="inner", on="filename")

    # Format Long version for later plotting
    long_form_df = pd.melt(features_df,
                           id_vars=metadata_df.columns,
                           value_vars=metabolite_id_list)
    long_form_df = long_form_df.rename(columns={
        "variable": "featureid",
        "value": "featurearea"
    })

    # Adding in feature information
    feature_information_df = feature_information_df.rename(
        columns={
            "row ID": "featureid",
            "row retention time": "featurert",
            "row m/z": "featuremz"
        })
    long_form_df = long_form_df.merge(feature_information_df,
                                      how="left",
                                      on="featureid")

    # Adding library search information, when identifications were provided
    if libraryidentifications_df is not None:
        long_form_df = long_form_df.merge(libraryidentifications_df,
                                          how="left",
                                          left_on="featureid",
                                          right_on="#Scan#")
        long_form_df = long_form_df.drop(columns=["#Scan#"])

    long_form_df.to_csv(os.path.join(output_summary_folder, "data_long.csv"),
                        index=False)
    # Trying to add in summary to proteosafe output
    try:
        file_summary_df = pd.read_csv(input_summary_file, sep="\t")
        file_summary_df["filename"] = file_summary_df["full_CCMS_path"].apply(
            lambda x: os.path.basename(x))
        enriched_long_df = long_form_df.merge(file_summary_df,
                                              how="left",
                                              on="filename")
        columns_to_keep = list(long_form_df.columns)
        columns_to_keep.append("full_CCMS_path")
        enriched_long_df = enriched_long_df[columns_to_keep]
    except Exception:
        enriched_long_df = long_form_df

    # Visualization in ProteoSAFe
    enriched_long_df.to_csv(os.path.join(output_summary_folder,
                                         "data_long_visualize.tsv"),
                            sep="\t",
                            index=False)

    # Share the long-form table with the plotting workers via a module global
    global GLOBAL_DF
    GLOBAL_DF = long_form_df

    if not run_stats:
        return

    param_candidates = []
    # If no metadata column was selected, skip stats and plots entirely
    if metadata_column in features_df:
        output_boxplot_list = []

        # The permanova-validated column list is computed but then
        # deliberately overridden with the single user-selected column
        columns_to_consider = metadata_permanova_prioritizer.permanova_validation(
            input_metadata_file)
        columns_to_consider = [metadata_column]

        # HACK TO MAKE FASTER: consider at most five columns
        columns_to_consider = columns_to_consider[:5]

        for column_to_consider in columns_to_consider:
            # Loop through all metabolites, and create plots
            if output_plots_folder is not None:
                for metabolite_id in metabolite_id_list:
                    output_filename = os.path.join(
                        output_plots_folder,
                        "{}_{}.png".format(column_to_consider, metabolite_id))

                    input_params = {}
                    input_params["metadata_column"] = column_to_consider
                    input_params["output_filename"] = output_filename
                    input_params["variable_value"] = metabolite_id

                    param_candidates.append(input_params)

                    output_dict = {}
                    output_dict["metadata_column"] = column_to_consider
                    output_dict["boxplotimg"] = os.path.basename(
                        output_filename)
                    output_dict["scan"] = metabolite_id

                    output_boxplot_list.append(output_dict)

        metadata_all_columns_summary_df = pd.DataFrame(output_boxplot_list)
        metadata_all_columns_summary_df.to_csv(os.path.join(
            output_summary_folder, "all_columns.tsv"),
                                               sep="\t",
                                               index=False)

    # plotting on a specific column
    if metadata_column not in features_df:
        pass
    elif condition_first is None or condition_second is None:
        pass
    elif condition_first == "None" or condition_second == "None":
        pass
    else:
        output_stats_list = []

        features_df = features_df[features_df[metadata_column].isin(
            [condition_first, condition_second])]

        data_first_df = features_df[features_df[metadata_column] ==
                                    condition_first]
        data_second_df = features_df[features_df[metadata_column] ==
                                     condition_second]

        for metabolite_id in metabolite_id_list:
            try:
                stat, pvalue = mannwhitneyu(data_first_df[metabolite_id],
                                            data_second_df[metabolite_id])
            except KeyboardInterrupt:
                raise
            except Exception:
                # Skip features where the test cannot be computed
                continue

            # NOTE: this branch assumes output_plots_folder is set
            output_filename = os.path.join(
                output_plots_folder,
                "chosen_{}_{}.png".format(metadata_column, metabolite_id))

            input_params = {}
            input_params["metadata_column"] = metadata_column
            input_params["output_filename"] = output_filename
            input_params["variable_value"] = metabolite_id
            input_params["metadata_facet"] = metadata_facet_column
            input_params["metadata_conditions"] = (condition_first + ";" +
                                                   condition_second)

            param_candidates.append(input_params)

            output_stats_dict = {}
            output_stats_dict["metadata_column"] = metadata_column
            output_stats_dict["condition_first"] = condition_first
            output_stats_dict["condition_second"] = condition_second
            output_stats_dict["stat"] = stat
            output_stats_dict["pvalue"] = pvalue
            output_stats_dict["boxplotimg"] = os.path.basename(output_filename)
            output_stats_dict["scan"] = metabolite_id

            output_stats_list.append(output_stats_dict)

        metadata_columns_summary_df = pd.DataFrame(output_stats_list)
        metadata_columns_summary_df.to_csv(os.path.join(
            output_summary_folder, "chosen_columns.tsv"),
                                           sep="\t",
                                           index=False)

    print("Calculate Plots", len(param_candidates))
    ming_parallel_library.run_parallel_job(plot_box,
                                           param_candidates,
                                           PARALLELISM,
                                           backend="multiprocessing")
def main():
    parser = argparse.ArgumentParser(description='Running msaccess summaries in parallel')
    parser.add_argument('spectra_folder', help='input spectra folder')
    parser.add_argument('workflow_parameters', help='proteosafe workflow parameters xml')
    parser.add_argument('result_file', help='output result file')
    parser.add_argument('msaccess_binary', help='path to the msaccess binary')
    parser.add_argument('--parallelism', default=1, type=int, help='number of parallel jobs')
    args = parser.parse_args()


    with open(args.workflow_parameters) as params_file:
        params_object = ming_proteosafe_library.parse_xml_file(params_file)
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)
    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)

    spectra_files.sort()


    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except:
        print("folder error")

    parameter_list = []
    for spectrum_file in spectra_files:
        param_dict = {}
        param_dict["spectrum_file"] = spectrum_file
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args

        parameter_list.append(param_dict)

    # Serial fallback for debugging:
    # for param_dict in parameter_list:
    #     summary_wrapper(param_dict)
    print("Parallel to execute", len(parameter_list))
    ming_parallel_library.run_parallel_job(summary_wrapper, parameter_list, 10)


    """Merging Files and adding full path"""
    all_result_files = ming_fileio_library.list_files_in_dir(tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        try:
            result_list = ming_fileio_library.parse_table_with_headers_object_list(input_file)
            for result in result_list:
                output_dict = {}
                output_dict["Filename"] = result["Filename"]
                output_dict["Vendor"] = result["Vendor"]
                output_dict["Model"] = result["Model"]
                output_dict["MS1s"] = result["MS1s"]
                output_dict["MS2s"] = result["MS2s"]
                full_result_list.append(output_dict)
        except Exception:
            print("Error", input_file)

    # Map mangled names back to full CCMS paths, tracking which inputs
    # actually produced results
    used_files = set()
    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["Filename"])
        full_path = mangled_mapping[mangled_name]
        result_object["full_CCMS_path"] = full_path
        result_object["CCMS_filename"] = os.path.basename(full_path)
        used_files.add(full_path)

    # Add placeholder rows for input files that produced no results
    for mangled_name in spectra_files:
        full_path = mangled_mapping[os.path.basename(mangled_name)]
        if full_path in used_files:
            continue

        output_dict = {}
        output_dict["full_CCMS_path"] = full_path
        output_dict["CCMS_filename"] = os.path.basename(full_path)
        full_result_list.append(output_dict)

    pd.DataFrame(full_result_list).to_csv(args.result_file, sep="\t", index=False)
def main():
    parser = argparse.ArgumentParser(description='Running msaccess summaries in parallel')
    parser.add_argument('spectra_folder', help='input spectra folder')
    parser.add_argument('workflow_parameters', help='proteosafe workflow parameters xml')
    parser.add_argument('result_file', help='output result file')
    parser.add_argument('msaccess_binary', help='path to the msaccess binary')
    parser.add_argument('--parallelism', default=1, type=int, help='number of parallel jobs')
    args = parser.parse_args()


    with open(args.workflow_parameters) as params_file:
        params_object = ming_proteosafe_library.parse_xml_file(params_file)
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)
    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)

    spectra_files.sort()


    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except:
        print("folder error")

    parameter_list = []
    for spectrum_file in spectra_files:
        param_dict = {}
        param_dict["spectrum_file"] = spectrum_file
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args

        parameter_list.append(param_dict)

    # Serial fallback for debugging:
    # for param_dict in parameter_list:
    #     summary_wrapper(param_dict)
    print("Parallel to execute", len(parameter_list))
    ming_parallel_library.run_parallel_job(summary_wrapper, parameter_list, 10)


    """Merging Files and adding full path"""
    all_result_files = ming_fileio_library.list_files_in_dir(tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        try:
            result_list = ming_fileio_library.parse_table_with_headers_object_list(input_file)
            for result in result_list:
                output_dict = {}
                output_dict["Filename"] = result["Filename"]
                output_dict["Vendor"] = result["Vendor"]
                output_dict["Model"] = result["Model"]
                output_dict["MS1s"] = result["MS1s"]
                output_dict["MS2s"] = result["MS2s"]
                full_result_list.append(output_dict)
        except Exception:
            print("Error", input_file)

    # Map mangled names back to full CCMS paths
    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["Filename"])
        full_path = mangled_mapping[mangled_name]
        result_object["full_CCMS_path"] = full_path

    ming_fileio_library.write_list_dict_table_data(full_result_list, args.result_file)
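

# Both msaccess examples above dispatch summary_wrapper, which is not part
# of this listing. A rough sketch, assuming it shells out to the msaccess
# binary to dump per-run metadata into the temp results folder -- the exact
# msaccess flags below are illustrative guesses, not verified against its CLI:
import subprocess

def summary_wrapper_sketch(param_dict):
    """Hypothetical sketch: run msaccess on one spectrum file."""
    spectrum_file = param_dict["spectrum_file"]
    output_folder = param_dict["tempresults_folder"]
    cmd = [param_dict["args"].msaccess_binary, spectrum_file,
           "-x", "run_summary", "-o", output_folder]
    subprocess.call(cmd)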