from typing import List, Optional
import logging

# build_parser, build_logger, get_cazy_db_session, get_fasta_file_paths and
# build_protein_dataframe are project-local helpers, imported from elsewhere
# in the package


def main(argv: Optional[List[str]] = None,
         logger: Optional[logging.Logger] = None):
    """Coordinate preparation for the script, and terminate the programme when finished."""
    # programme preparation

    # build cmd-line arguments parser
    # if an argument list was not passed in, parse the command line (sys.argv)
    if argv is None:
        parser = build_parser()
        args = parser.parse_args()
    else:
        parser = build_parser(argv)
        args = parser.parse_args()

    # build logger
    # Note: log file is only created if specified at cmd-line
    if logger is None:
        logger = build_logger("get_ncbi_genomes", args)
    logger.info("Run initated")

    # open session to local CAZy database (this can be curated by using the
    # cazy_webscraper, available at https://github.com/HobnobMancer/cazy_webscraper)
    session = get_cazy_db_session(args, logger)

    # retrieve paths to FASTA files
    fasta_files = get_fasta_file_paths(args, logger)

    # create dataset per FASTA file
    for fasta in fasta_files:
        build_protein_dataframe(fasta, session, args, logger)

    logger.info("Finished creating a dataset for each input FASTA file.\n"
                "Terminating Progamme.")
Example 2

import logging

import numpy as np
import pandas as pd

from sklearn.metrics import confusion_matrix, fbeta_score, precision_score, recall_score
from tqdm import tqdm

# build_logger is a project-local helper

def calculate_class_stats_by_testsets(
    ground_truth_df,
    prediction_df,
    time_stamp,
    args,
    class_list=["GH", "GT", "PL", "CE", "AA", "CBM"],
):
    """Calculate statistical parameters for evaluating performance of CAZy clas prediction for each
    test set and each prediction tool.

    :param predictions_df: df of predicted CAZy family annotations from all prediction tools
    :param ground_truths_df: df of CAZy annotations of proteins
    :param time_stamp: str
    :param args: cmd-line args parser
    :param class_list: list of CAZy class names

    Return predictions_df including calculated CAZy class statistics.
    """
    specific_logger = build_logger(
        args.output,
        "cazy_class_confusion_matrix_errors_per_testset.log",
    )
    logger = logging.getLogger(__name__)

    # get the names of the genomic accessions; one accession = one test set
    all_genomic_accessions = set(ground_truth_df["Genomic_accession"])

    # each row: [Genomic_accession, Prediction_tool, CAZy_class, Statistic_parameter, Stat_value]
    class_stats_data = []

    for tool in ["dbCAN", "HMMER", "Hotpep", "DIAMOND", "CUPP", "eCAMI"]:
        # retrieve the rows of interest for the current prediction tool
        tool_ground_truths = ground_truth_df.loc[ground_truth_df["Prediction_tool"] == tool]
        tool_predictions = prediction_df.loc[prediction_df["Prediction_tool"] == tool]

        # one accession is one test set
        for accession in tqdm(
                all_genomic_accessions,
                desc=f"Evaluate Class prediction by test set for {tool}:",
        ):
            testset_tool_ground_truths = tool_ground_truths.loc[
                tool_ground_truths["Genomic_accession"] == accession]
            testset_tool_predictions = tool_predictions.loc[
                tool_predictions["Genomic_accession"] == accession]

            # build empty dataframes to store all results EXCEPT true negative non-CAZymes
            tp_fp_fn_ground_truths = pd.DataFrame(columns=list(testset_tool_ground_truths.columns))
            tp_fp_fn_predictions = pd.DataFrame(columns=list(testset_tool_predictions.columns))

            # exclude true negative non-CAZyme predictions
            for index in range(len(testset_tool_predictions["Genomic_accession"])):
                # retrieve only the CAZy class 0/1 annotations
                y_true = list(testset_tool_ground_truths.iloc[index][class_list])
                y_pred = list(testset_tool_predictions.iloc[index][class_list])

                if (1 not in y_true) and (1 not in y_pred):
                    # if y_true and y_pred are all 0s, this is a true negative non-CAZyme prediction
                    # do not include true negative non-CAZyme predictions
                    continue

                else:
                    # retain the row (DataFrame.append was removed in pandas 2.0,
                    # so use pd.concat instead)
                    tp_fp_fn_ground_truths = pd.concat(
                        [tp_fp_fn_ground_truths, testset_tool_ground_truths.iloc[[index]]])
                    tp_fp_fn_predictions = pd.concat(
                        [tp_fp_fn_predictions, testset_tool_predictions.iloc[[index]]])

            for cazy_class in class_list:
                y_true = list(tp_fp_fn_ground_truths[cazy_class])
                y_pred = list(tp_fp_fn_predictions[cazy_class])

                # check if the CAZy class was included in predictions and ground truths
                # if not exclude the class from the evaluation
                if (1 not in y_true) and (1 not in y_pred):
                    # do not include in statistics
                    logger.warning(
                        f"{cazy_class} not predicted by {tool} in {accession} and not in known "
                        f"annotations\nExcluding {cazy_class} from evaluation by setting all "
                        "stats results as NaN")
                    specific_logger.warning(
                        f"{cazy_class} not predicted by {tool} in {accession} and not in known "
                        f"annotations\nExcluding {cazy_class} from evaluation by setting all "
                        "stats results as NaN")

                    # exclude the class: record every statistic as NaN
                    for stat in ("Specificity", "Recall", "Precision", "Fbeta_score", "Accuracy"):
                        class_stats_data.append([accession, tool, cazy_class, stat, np.nan])

                    continue

                recall = recall_score(y_true, y_pred)
                class_stats_data.append(
                    [accession, tool, cazy_class, "Recall", recall])

                precision = precision_score(y_true, y_pred)
                class_stats_data.append(
                    [accession, tool, cazy_class, "Precision", precision])

                fbeta = fbeta_score(y_true, y_pred, beta=args.beta)
                class_stats_data.append(
                    [accession, tool, cazy_class, "Fbeta_score", fbeta])

                cm = confusion_matrix(y_true, y_pred)
                try:
                    tn = cm[0][0]
                    fn = cm[1][0]
                    tp = cm[1][1]
                    fp = cm[0][1]

                    accuracy = (tp + tn) / (tp + fp + fn + tn)
                    class_stats_data.append(
                        [accession, tool, cazy_class, "Accuracy", accuracy])

                except IndexError as e:
                    class_stats_data.append(
                        [accession, tool, cazy_class, "Accuracy", np.nan])
                    logger.warning(
                        f"Error raised when creating confusion matrix for protein {accession}, "
                        f"{tool}, {cazy_class}, when calculating accuracy.\nError raised:\n{e}"
                    )
                    specific_logger.warning(
                        f"Prediction Tool: {tool}\tClass: {cazy_class}\tAccession: {accession}\t"
                        f"Stat: Specificity\ny_true: {y_true}\ny_pred: {y_pred}\n"
                    )

                try:
                    tn = cm[0][0]
                    fp = cm[0][1]
                    specificity = tn / (tn + fp)

                    class_stats_data.append(
                        [accession, tool, cazy_class, "Specificity", specificity])

                except IndexError as e:
                    class_stats_data.append(
                        [accession, tool, cazy_class, "Specificity", np.nan])
                    logger.warning(
                        f"Error raised when creating confusion matrix for protein {accession}, "
                        f"{tool}, {cazy_class}, when calculating specificity.\nError raised:\n{e}"
                    )
                    specific_logger.warning(
                        f"Prediction Tool: {tool}\tClass: {cazy_class}\tAccession: {accession}\t"
                        f"Stat: Specificity\ny_true: {y_true}\ny_pred: {y_pred}\n"
                    )

    # build dataframe using class_stats_data
    class_stats_df = pd.DataFrame(
        class_stats_data,
        columns=[
            "Genomic_accession",
            "Prediction_tool",
            "CAZy_class",
            "Statistic_parameter",
            "Statistic_value",
        ],
    )
    class_stats_df.to_csv(f"class_stats_per_test_set_{time_stamp}.csv")

    return
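
A note on the IndexError handling above: sklearn's confusion_matrix only returns a
full 2x2 matrix when both labels appear in the data; if y_true and y_pred contain a
single label the matrix is 1x1, so indexing cm[1][1] raises IndexError. A minimal
demonstration, including the labels argument that forces a 2x2 matrix:

from sklearn.metrics import confusion_matrix

y_true = [1, 1, 1]
y_pred = [1, 1, 1]

cm = confusion_matrix(y_true, y_pred)
print(cm.shape)  # (1, 1): only one label present, so cm[1][1] raises IndexError

# labels=[0, 1] pads the matrix to the full 2x2 layout
tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
print(tn, fp, fn, tp)  # 0 0 0 3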
Example 3

(Uses the same imports as Example 2.)
def calculate_class_stats(
    ground_truth_df,
    prediction_df,
    time_stamp,
    args,
    class_list=["GH", "GT", "PL", "CE", "AA", "CBM"],
):
    """Calculate the adjusted rand index and rand index of CAZy class annotation predictions.

    :param predictions_df: df of predicted CAZy family annotations from all prediction tools
    :param ground_truths_df: df of CAZy annotations of proteins
    :param time_stamp: str
    :param args: cmd-line parser
    :param class_list: list of CAZy class names

    Return predictions_df including calculated CAZy class statistics.
    """
    logger = logging.getLogger(__name__)
    specific_logger = build_logger(args.output,
                                   "cazy_class_confusion_matrix_errors.log")

    across_all_test_sets_data = []

    # evaluate performance across all test sets and all proteins
    for tool in ["dbCAN", "HMMER", "Hotpep", "DIAMOND", "CUPP", "eCAMI"]:
        # retrieve the rows of interest from the predictions and ground truths dataframes
        tool_class_ground_truths = ground_truth_df.loc[ground_truth_df["Prediction_tool"] == tool]
        tool_class_predictions = prediction_df.loc[prediction_df["Prediction_tool"] == tool]

        # build empty dataframes to store all predictions EXCEPT true negative non-CAZymes
        tp_fp_fn_ground_truths = pd.DataFrame(columns=list(tool_class_ground_truths.columns))
        tp_fp_fn_predictions = pd.DataFrame(columns=list(tool_class_predictions.columns))

        for index in range(len(tool_class_ground_truths["Prediction_tool"])):
            # retrieve only the CAZy class 0/1 annotations
            y_true = list(tool_class_ground_truths.iloc[index][class_list])
            y_pred = list(tool_class_predictions.iloc[index][class_list])

            if (1 not in y_true) and (1 not in y_pred):
                # if y_true and y_pred are all 0s, this is a true negative non-CAZyme prediction
                # do not include true negative non-CAZyme predictions
                continue
            else:
                # add the TP, FP or FN result to the dataframes
                # (DataFrame.append was removed in pandas 2.0, so use pd.concat instead)
                tp_fp_fn_ground_truths = pd.concat(
                    [tp_fp_fn_ground_truths, tool_class_ground_truths.iloc[[index]]])
                tp_fp_fn_predictions = pd.concat(
                    [tp_fp_fn_predictions, tool_class_predictions.iloc[[index]]])

        # calculate statistics
        for cazy_class in tqdm(class_list,
                               desc=f"Calc CAZy class stats for {tool}"):
            data = [tool, cazy_class]

            y_true = list(tp_fp_fn_ground_truths[cazy_class])
            y_pred = list(tp_fp_fn_predictions[cazy_class])

            # check if the CAZy class was included in predictions and ground truths
            # if not exclude the class from the evaluation
            if (1 not in y_true) and (1 not in y_pred):
                # exclude the class by setting all statistics to NaN
                logger.warning(
                    f"{cazy_class} not predicted by {tool} and not in known annotations\n"
                    f"Excluding {cazy_class} from evaluation by setting all stats results as NaN")
                specific_logger.warning(
                    f"{cazy_class} not predicted by {tool} and not in known annotations\n"
                    f"Excluding {cazy_class} from evaluation by setting all stats results as NaN")

                # recall, precision, fbeta, specificity and accuracy are all NaN
                data += [np.nan] * 5
                across_all_test_sets_data.append(data)
                continue

            recall = recall_score(y_true, y_pred)
            data.append(recall)

            precision = precision_score(y_true, y_pred)
            data.append(precision)

            fbeta = fbeta_score(y_true, y_pred, beta=args.beta)
            data.append(fbeta)

            cm = confusion_matrix(y_true, y_pred)
            try:
                tn = cm[0][0]
                fn = cm[1][0]
                tp = cm[1][1]
                fp = cm[0][1]

            except IndexError:
                # a full 2x2 confusion matrix cannot be built when only one label
                # is present in y_true and y_pred
                data.append(np.nan)  # specificity
                data.append(np.nan)  # accuracy
                across_all_test_sets_data.append(data)

                specific_logger.warning(
                    f"Prediction Tool: {tool}\tCAZy class: {cazy_class}\n"
                    f"y_true: {y_true}\ny_pred: {y_pred}\n")

                continue

            specificity = tn / (tn + fp)
            data.append(specificity)

            accuracy = (tp + tn) / (tp + fp + fn + tn)
            data.append(accuracy)

            across_all_test_sets_data.append(data)

    # build dataframe using across_all_test_sets_data
    class_stats_df = pd.DataFrame(
        across_all_test_sets_data,
        columns=[
            "Prediction_tool",
            "CAZy_class",
            "Recall",
            "Precision",
            "Fbeta_score",
            "Specificity",
            "Accuracy",
        ],
    )
    class_stats_df.to_csv(f"class_stats_across_all_test_sets_{time_stamp}.csv")

    return
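
The per-class scores above rely on sklearn's fbeta_score with a user-configurable
beta (args.beta). A short, self-contained illustration of how beta shifts the
balance between precision and recall:

from sklearn.metrics import fbeta_score

y_true = [0, 1, 1, 1]
y_pred = [0, 1, 0, 1]  # precision = 1.0, recall = 2/3

print(fbeta_score(y_true, y_pred, beta=1))    # 0.8    (ordinary F1 score)
print(fbeta_score(y_true, y_pred, beta=2))    # ~0.714 (recall weighted more heavily)
print(fbeta_score(y_true, y_pred, beta=0.5))  # ~0.909 (precision weighted more heavily)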
Example 4
def test_build_logger_v_true(logger_output):
    """Tests building of logger"""
    build_logger(None, "test_logger")
Example 5
def test_build_logger_v_false(logger_output):
    """Tests building of logger"""
    build_logger(logger_output, "test_logger.log")
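
Both tests rely on a logger_output fixture defined elsewhere in the test suite. A
minimal sketch of what such a fixture might look like (hypothetical, built on
pytest's tmp_path):

import pytest

@pytest.fixture
def logger_output(tmp_path):
    # hypothetical fixture: a temporary directory for test log files
    output_dir = tmp_path / "test_logs"
    output_dir.mkdir()
    return output_dir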