def test_permanova_selection():
    """Check PERMANOVA column selection against two reference metadata tables."""
    import metadata_permanova_prioritizer

    # First reference table must yield exactly four candidate columns.
    selected = metadata_permanova_prioritizer.permanova_validation(
        "reference_data/permanova/metadata_table-00000.txt")
    assert len(selected) == 4

    # The Kelly dataset must surface the BDI group attribute.
    selected = metadata_permanova_prioritizer.permanova_validation(
        "reference_data/permanova/kelly_metadata.txt")
    print(selected)
    assert "ATTRIBUTE_bdi_group" in selected
Example #2
0
def test_metadata_test():
    """Smoke test: PERMANOVA column parsing runs on the test metadata table."""
    import metadata_permanova_prioritizer

    selected_columns = metadata_permanova_prioritizer.permanova_validation(
        "reference_data/test_metadata_permanova_parse.tsv")

    print(selected_columns)
Example #3
0
def calculate_statistics(input_quant_filename,
                         input_metadata_file,
                         input_summary_file,
                         output_summary_folder,
                         output_plots_folder=None,
                         metadata_column=None,
                         condition_first=None,
                         condition_second=None,
                         metadata_facet_column=None,
                         run_stats=True,
                         PARALLELISM=8,
                         libraryidentifications_df=None):
    """Reshape a feature table to long form, write summary tables, and optionally run stats/plots.

    Reads an MZmine2-style quantification table (CSV with "row ID",
    "row retention time", "row m/z" and per-file "... Peak area" columns)
    and a tab-separated metadata table keyed by "filename". Writes
    ``data_long.csv`` and ``data_long_visualize.tsv`` into
    ``output_summary_folder``. When ``metadata_column`` and both conditions
    are provided, runs a Mann-Whitney U test per feature and schedules box
    plots that are rendered in parallel via ``plot_box``.

    Parameters:
        input_quant_filename: path to the comma-separated quantification table.
        input_metadata_file: path to the tab-separated metadata table.
        input_summary_file: tab-separated ProteoSAFe file summary used for
            best-effort enrichment (errors tolerated).
        output_summary_folder: destination folder for summary tables.
        output_plots_folder: destination folder for box-plot images; when
            None, no per-metabolite plots are generated.
        metadata_column: metadata column to group by for stats/plots.
        condition_first, condition_second: the two group values to compare;
            stats are skipped unless both are set (and not the string "None").
        metadata_facet_column: optional facet column forwarded to plot jobs.
        run_stats: when False, only the long-form tables are written.
        PARALLELISM: worker count for the parallel plotting jobs.
        libraryidentifications_df: optional library-search DataFrame keyed by
            "#Scan#"; merged in best-effort (None is tolerated).

    Returns:
        None. Results are written to disk; module-level GLOBAL_DF is set to
        the long-form table as a side effect.
    """
    ## Loading feature table
    features_df = pd.read_csv(input_quant_filename, sep=",")
    metadata_df = pd.read_csv(input_metadata_file, sep="\t")
    # Trailing whitespace in filenames breaks the merge below.
    metadata_df["filename"] = metadata_df["filename"].apply(
        lambda x: x.rstrip())

    ## Determining if we can even do anything
    print(len(features_df), len(features_df.columns),
          len(features_df) * len(features_df.columns))
    # Bail out on very large tables; melting them would blow up memory.
    if len(features_df) * len(features_df.columns) > 10000000:
        print("Feature Table Too Big To Generate")
        return

    # removing peak area from columns
    feature_information_df = features_df[[
        "row ID", "row retention time", "row m/z"
    ]]
    features_df.index = features_df["row ID"]
    metabolite_id_list = list(features_df["row ID"])
    headers_to_keep = [
        header for header in features_df.columns if "Peak area" in header
    ]
    features_df = features_df[headers_to_keep]
    # Strip the " Peak area" suffix so columns match metadata filenames.
    column_mapping = {
        headers: headers.replace(" Peak area", "").rstrip()
        for headers in features_df.columns
    }
    features_df = features_df.rename(columns=column_mapping)

    # Transpose so rows are files and columns are feature IDs
    features_df = features_df.T

    # Merging with Metadata
    features_df["filename"] = features_df.index
    features_df = features_df.merge(metadata_df, how="inner", on="filename")

    # Format Long version for later plotting
    long_form_df = pd.melt(features_df,
                           id_vars=metadata_df.columns,
                           value_vars=metabolite_id_list)
    long_form_df = long_form_df.rename(columns={
        "variable": "featureid",
        "value": "featurearea"
    })

    # Adding in feature information (retention time, m/z)
    feature_information_df = feature_information_df.rename(
        columns={
            "row ID": "featureid",
            "row retention time": "featurert",
            "row m/z": "featuremz"
        })
    long_form_df = long_form_df.merge(feature_information_df,
                                      how="left",
                                      on="featureid")

    # Adding Library Search Information (best effort; the df may be None
    # or lack the expected "#Scan#" column)
    try:
        long_form_df = long_form_df.merge(libraryidentifications_df,
                                          how="left",
                                          left_on="featureid",
                                          right_on="#Scan#")
        long_form_df = long_form_df.drop(columns=["#Scan#"])
    except Exception:
        pass

    long_form_df.to_csv(os.path.join(output_summary_folder, "data_long.csv"),
                        index=False)
    # Trying to add in summary to proteosafe output (best effort; the
    # summary file may be missing or malformed)
    try:
        file_summary_df = pd.read_csv(input_summary_file, sep="\t")
        file_summary_df["filename"] = file_summary_df["full_CCMS_path"].apply(
            lambda x: os.path.basename(x))
        enriched_long_df = long_form_df.merge(file_summary_df,
                                              how="left",
                                              on="filename")
        columns_to_keep = list(long_form_df.columns)
        columns_to_keep.append("full_CCMS_path")
        enriched_long_df = enriched_long_df[columns_to_keep]
    except Exception:
        enriched_long_df = long_form_df

    # Visualization in ProteoSAFe
    enriched_long_df.to_csv(os.path.join(output_summary_folder,
                                         "data_long_visualize.tsv"),
                            sep="\t",
                            index=False)

    # Expose the long-form table to the parallel plotting workers.
    global GLOBAL_DF
    GLOBAL_DF = long_form_df

    if not run_stats:
        return

    param_candidates = []
    # If we do not select a column, we don't calculate stats or do any plots
    if metadata_column in features_df:
        output_boxplot_list = []

        columns_to_consider = metadata_permanova_prioritizer.permanova_validation(
            input_metadata_file)  # Ignore
        # Only the explicitly selected column is actually plotted.
        columns_to_consider = [metadata_column]

        # HACK TO MAKE FASTER: cap the number of columns considered
        if len(columns_to_consider) > 0:
            columns_to_consider = columns_to_consider[:5]

        for column_to_consider in columns_to_consider:
            # Loop through all metabolites, and create plots
            if output_plots_folder is not None:
                for metabolite_id in metabolite_id_list:
                    output_filename = os.path.join(
                        output_plots_folder,
                        "{}_{}.png".format(column_to_consider, metabolite_id))

                    input_params = {}
                    input_params["metadata_column"] = column_to_consider
                    input_params["output_filename"] = output_filename
                    input_params["variable_value"] = metabolite_id

                    param_candidates.append(input_params)

                    output_dict = {}
                    output_dict["metadata_column"] = column_to_consider
                    output_dict["boxplotimg"] = os.path.basename(
                        output_filename)
                    output_dict["scan"] = metabolite_id

                    output_boxplot_list.append(output_dict)

        metadata_all_columns_summary_df = pd.DataFrame(output_boxplot_list)
        metadata_all_columns_summary_df.to_csv(os.path.join(
            output_summary_folder, "all_columns.tsv"),
                                               sep="\t",
                                               index=False)

    # plotting on a specific column, only when both conditions are usable
    if not metadata_column in features_df:
        pass
    elif condition_first is None or condition_second is None:
        pass
    elif condition_first == "None" or condition_second == "None":
        pass
    else:
        output_stats_list = []

        # Restrict to the two groups being compared.
        features_df = features_df[features_df[metadata_column].isin(
            [condition_first, condition_second])]

        data_first_df = features_df[features_df[metadata_column] ==
                                    condition_first]
        data_second_df = features_df[features_df[metadata_column] ==
                                     condition_second]

        for metabolite_id in metabolite_id_list:
            # Skip features the test cannot handle (e.g. identical values,
            # missing columns); KeyboardInterrupt still propagates since it
            # is not an Exception subclass.
            try:
                stat, pvalue = mannwhitneyu(data_first_df[metabolite_id],
                                            data_second_df[metabolite_id])
            except Exception:
                continue

            output_filename = os.path.join(
                output_plots_folder,
                "chosen_{}_{}.png".format(metadata_column, metabolite_id))

            input_params = {}
            input_params["metadata_column"] = metadata_column
            input_params["output_filename"] = output_filename
            input_params["variable_value"] = metabolite_id
            input_params["metadata_facet"] = metadata_facet_column
            input_params[
                "metadata_conditions"] = condition_first + ";" + condition_second

            param_candidates.append(input_params)

            output_stats_dict = {}
            output_stats_dict["metadata_column"] = metadata_column
            output_stats_dict["condition_first"] = condition_first
            output_stats_dict["condition_second"] = condition_second
            output_stats_dict["stat"] = stat
            output_stats_dict["pvalue"] = pvalue
            output_stats_dict["boxplotimg"] = os.path.basename(output_filename)
            output_stats_dict["scan"] = metabolite_id

            output_stats_list.append(output_stats_dict)

        metadata_columns_summary_df = pd.DataFrame(output_stats_list)
        metadata_columns_summary_df.to_csv(os.path.join(
            output_summary_folder, "chosen_columns.tsv"),
                                           sep="\t",
                                           index=False)

    print("Calculate Plots", len(param_candidates))
    ming_parallel_library.run_parallel_job(plot_box,
                                           param_candidates,
                                           PARALLELISM,
                                           backend="multiprocessing")
Example #4
0
def main():
    """CLI entry point: prepare Qiime2 inputs and run a beta-diversity workflow.

    Reads an MZmine2 quantification table and an optional metadata file (or a
    directory containing at most one), fills in rows for any quantified
    filenames missing from the metadata, writes Qiime2 metadata and manifest
    files, then shells out to Qiime2 for feature-table import, beta diversity,
    PCoA, emperor plots, biplots, and per-column PERMANOVA tests.
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('input_metadata_filename',
                        help='input_metadata_filename')
    parser.add_argument('input_quantification_table',
                        help='input_quantification_table')
    parser.add_argument('output_folder', help='output_folder')
    parser.add_argument("conda_activate_bin")
    parser.add_argument("conda_environment")
    parser.add_argument('--distance_metric',
                        default="cosine",
                        help='Enter Distance Metric')

    args = parser.parse_args()

    output_metadata_filename = os.path.join(args.output_folder,
                                            "qiime2_metadata.tsv")
    output_manifest_filename = os.path.join(args.output_folder,
                                            "qiime2_manifest.tsv")

    df_quantification = pd.read_csv(args.input_quantification_table, sep=",")
    """Reading Metadata Filename and filling in empty entries"""
    # A path shorter than two characters is treated as "no metadata supplied".
    if len(args.input_metadata_filename) < 2:
        df_metadata = pd.DataFrame([{"filename": "placeholder"}])
    elif os.path.isfile(args.input_metadata_filename):
        df_metadata = pd.read_csv(args.input_metadata_filename, sep="\t")
    else:
        # It is a directory; accept at most one metadata file inside it.
        metadata_files = glob.glob(
            os.path.join(args.input_metadata_filename, "*"))
        if len(metadata_files) > 1:
            print("Enter only a single metadata file")
            exit(1)
        elif len(metadata_files) == 0:
            df_metadata = pd.DataFrame([{"filename": "placeholder"}])
        else:
            df_metadata = pd.read_csv(metadata_files[0], sep="\t")

    if "sample_name" not in df_metadata:
        df_metadata["sample_name"] = df_metadata["filename"]
    """Checking if the set of filenames are fully covered, if not then we'll provide a place holder"""
    all_quantification_filenames = [
        key.replace("Peak area", "").rstrip()
        for key in df_quantification.keys() if "Peak area" in key
    ]
    metadata_filenames = []
    try:
        metadata_filenames = list(df_metadata["filename"])
    except Exception:
        # Best effort: fall back to the empty list when "filename" is absent.
        pass

    metadata_object_list = df_metadata.to_dict(orient="records")
    for quantification_filename in all_quantification_filenames:
        if not quantification_filename in metadata_filenames:
            print(quantification_filename, "not found")
            metadata_object = {}
            metadata_object["filename"] = quantification_filename
            metadata_object["sample_name"] = quantification_filename
            metadata_object_list.append(metadata_object)
    """Adding in missing filenames into the metadata"""
    new_output_metadata = pd.DataFrame(metadata_object_list)

    #Removing protected headers
    new_output_metadata = new_output_metadata.drop(
        columns=["feature", "#SampleID"], errors="ignore")

    # Qiime2 expects sample_name as the first column.
    output_columns = list(new_output_metadata.keys())
    output_columns.remove("sample_name")
    output_columns.insert(0, "sample_name")

    new_output_metadata.to_csv(output_metadata_filename,
                               index=False,
                               sep="\t",
                               columns=output_columns,
                               na_rep="NaN")
    """Outputting Manifest Filename"""
    manifest_df = pd.DataFrame()
    manifest_df["sample_name"] = new_output_metadata["sample_name"]
    manifest_df["filepath"] = new_output_metadata["filename"]
    manifest_df.to_csv(output_manifest_filename, index=False, sep=",")

    #Running Qiime2
    local_qza_table = os.path.join(args.output_folder, "qiime2_table.qza")
    local_qza_relative_table = os.path.join(args.output_folder,
                                            "qiime2_relative_table.qza")
    local_qza_distance = os.path.join(args.output_folder,
                                      "qiime2_distance.qza")
    local_qza_pcoa = os.path.join(args.output_folder, "qiime2_pcoa.qza")
    local_qzv_emperor = os.path.join(args.output_folder, "qiime2_emperor.qzv")
    local_qza_biplot = os.path.join(args.output_folder, "qiime2_biplot.qza")
    local_qzv_biplot_emperor = os.path.join(args.output_folder,
                                            "qiime2_biplot_emperor.qzv")

    all_cmd = []
    all_cmd.append("LC_ALL=en_US && export LC_ALL && source {} {} && \
        qiime metabolomics import-mzmine2 \
        --p-manifest {} \
        --p-quantificationtable {} \
        --o-feature-table {}".format(args.conda_activate_bin,
                                     args.conda_environment,
                                     output_manifest_filename,
                                     args.input_quantification_table,
                                     local_qza_table))

    all_cmd.append("LC_ALL=en_US && export LC_ALL && source {} {} && \
        qiime diversity beta \
        --i-table {} \
        --p-metric {} \
        --o-distance-matrix {}".format(args.conda_activate_bin,
                                       args.conda_environment, local_qza_table,
                                       args.distance_metric,
                                       local_qza_distance))

    all_cmd.append("LC_ALL=en_US && export LC_ALL && source {} {} && \
        qiime diversity pcoa \
        --i-distance-matrix {} \
        --o-pcoa {}".format(args.conda_activate_bin, args.conda_environment,
                            local_qza_distance, local_qza_pcoa))

    all_cmd.append("LC_ALL=en_US && export LC_ALL && source {} {} && \
        qiime emperor plot \
        --i-pcoa {} \
        --m-metadata-file {} \
        --o-visualization {} \
        --p-ignore-missing-samples".format(args.conda_activate_bin,
                                           args.conda_environment,
                                           local_qza_pcoa,
                                           output_metadata_filename,
                                           local_qzv_emperor))

    #Biplotting
    all_cmd.append("LC_ALL=en_US && export LC_ALL && source {} {} && \
        qiime feature-table relative-frequency \
        --i-table {} \
        --o-relative-frequency-table  {}".format(args.conda_activate_bin,
                                                 args.conda_environment,
                                                 local_qza_table,
                                                 local_qza_relative_table))

    all_cmd.append("LC_ALL=en_US && export LC_ALL && source {} {} && \
        qiime diversity pcoa-biplot \
        --i-pcoa {} \
        --i-features {} \
        --o-biplot {}".format(args.conda_activate_bin, args.conda_environment,
                              local_qza_pcoa, local_qza_relative_table,
                              local_qza_biplot))

    all_cmd.append("LC_ALL=en_US && export LC_ALL && source {} {} && \
        qiime emperor biplot \
        --i-biplot {} \
        --m-sample-metadata-file {} \
        --p-number-of-features 10 \
        --o-visualization {} \
        --p-ignore-missing-samples".format(args.conda_activate_bin,
                                           args.conda_environment,
                                           local_qza_biplot,
                                           output_metadata_filename,
                                           local_qzv_biplot_emperor))

    # Running Permanova on each column the prioritizer selects
    import metadata_permanova_prioritizer
    import pathvalidate

    selected_columns = metadata_permanova_prioritizer.permanova_validation(
        output_metadata_filename)
    for column in selected_columns:
        print(column)
        output_qiime2_permanova_qzv = os.path.join(
            args.output_folder, "permanova_{}.qzv".format(column))
        # Column names may contain characters illegal in file paths.
        output_qiime2_permanova_qzv = pathvalidate.sanitize_filepath(
            output_qiime2_permanova_qzv)

        cmd = "LC_ALL=en_US && export LC_ALL && source {} {} && \
        qiime diversity beta-group-significance \
        --i-distance-matrix {} \
        --m-metadata-file {} \
        --m-metadata-column \"{}\" \
        --p-pairwise \
        --o-visualization {}".format(args.conda_activate_bin,
                                     args.conda_environment,
                                     local_qza_distance,
                                     output_metadata_filename, column,
                                     output_qiime2_permanova_qzv)

        all_cmd.append(cmd)

    # NOTE(review): os.system runs these through a shell with interpolated
    # arguments (paths, metadata column names); inputs are assumed trusted.
    for cmd in all_cmd:
        print(cmd)
        os.system(cmd)