Example #1
def paralogs_analyses_methods(config_file, expert_config_file, paranome_table,
                              anchors_table, correction_table, anchorpoints,
                              multiplicons, segments, list_elements,
                              multiplicon_pairs):
    # INPUT
    config = fcConf.Configuration(config_file, expert_config_file)
    logging.basicConfig(format='%(levelname)s\t%(message)s',
                        level=config.get_logging_level(),
                        stream=sys.stdout)

    paranome = config.get_paranome()
    colinearity = config.get_colinearity()
    extra_paralogs_analyses_methods = config.get_extra_paralogs_analyses_methods()

    if paranome and not colinearity:
        # Only exp-log mixture model by default
        exp_log_mixture(config_file, expert_config_file, paranome_table,
                        correction_table)
        if extra_paralogs_analyses_methods:
            logging.info(f"\n")
            # Lognormal mixture model on paranome
            lognormal_mixture(config_file, expert_config_file, paranome_table,
                              anchors_table, correction_table)

    if colinearity and not paranome:
        # Only anchor clustering by default
        cluster_anchor_ks(config_file, expert_config_file, correction_table,
                          anchorpoints, multiplicons, segments, list_elements,
                          anchors_table, multiplicon_pairs)
        if extra_paralogs_analyses_methods:
            logging.info(f"\n")
            # Lognormal mixture model on anchors
            lognormal_mixture(config_file, expert_config_file, paranome_table,
                              anchors_table, correction_table)

    if colinearity and paranome:
        # Only anchor clustering by default
        cluster_anchor_ks(config_file, expert_config_file, correction_table,
                          anchorpoints, multiplicons, segments, list_elements,
                          anchors_table, multiplicon_pairs)
        if extra_paralogs_analyses_methods:
            logging.info(f"\n")
            # Exp-log mixture model on paranome
            exp_log_mixture(config_file, expert_config_file, paranome_table,
                            correction_table)
            logging.info(f"\n")
            # Lognormal mixture model on both
            lognormal_mixture(config_file, expert_config_file, paranome_table,
                              anchors_table, correction_table)
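
A minimal usage sketch for this dispatcher. Every value below is hypothetical (in practice they are filled in by the ksrates command line or Nextflow pipeline), with the i-ADHoRe file locations mirroring the defaults shown in Example #6:

paralogs_analyses_methods(
    config_file="config_elaeis.txt",          # hypothetical file names throughout
    expert_config_file="config_expert.txt",
    paranome_table="paralog_distributions/wgd_elaeis/elaeis.ks.tsv",
    anchors_table="paralog_distributions/wgd_elaeis/elaeis.ks_anchors.tsv",
    correction_table="rate_adjustment/elaeis/adjustment_table_elaeis.tsv",
    anchorpoints="paralog_distributions/wgd_elaeis/elaeis_i-adhore/anchorpoints.txt",
    multiplicons="paralog_distributions/wgd_elaeis/elaeis_i-adhore/multiplicons.txt",
    segments="paralog_distributions/wgd_elaeis/elaeis_i-adhore/segments.txt",
    list_elements="paralog_distributions/wgd_elaeis/elaeis_i-adhore/list_elements.txt",
    multiplicon_pairs="paralog_distributions/wgd_elaeis/elaeis_i-adhore/multiplicon_pairs.txt")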
Example #2
def wgd_orthologs(config_file, expert_config_file, species_one, species_two,
                  n_threads):
    # INPUT
    species_pair = sorted([species_one, species_two], key=str.casefold)
    species1, species2 = species_pair[0], species_pair[1]  # sorted!

    config = fcConf.Configuration(config_file, expert_config_file)
    init_logging(
        f"Ortholog wgd analysis for species pair [{species1} - {species2}]",
        config.get_logging_level())

    # Get parameters and FASTA files from configuration file
    latin_names = config.get_latin_names()

    fasta_names_dict = config.get_fasta_dict()
    species1_fasta_file = config.get_fasta_name(fasta_names_dict, species1)
    fcCheck.check_inputfile(species1_fasta_file, "FASTA file")
    fcCheck.check_IDs(species1_fasta_file, latin_names[species1])

    species2_fasta_file = config.get_fasta_name(fasta_names_dict, species2)
    fcCheck.check_inputfile(species2_fasta_file, "FASTA file")
    fcCheck.check_IDs(species2_fasta_file, latin_names[species2])

    # Creating folder for output files of wgd ortholog pipeline.
    # Note: in Nextflow mode multiple wgdOrtholog processes run in parallel, so two of them may
    # check for this folder almost simultaneously; exist_ok=True prevents a slower process from
    # raising an error when a faster one has already created the folder in the meanwhile.
    ortholog_dists_dir = os.path.join("ortholog_distributions", "")
    if not os.path.exists(ortholog_dists_dir):
        logging.info(f"Creating directory {ortholog_dists_dir}")
        os.makedirs(ortholog_dists_dir, exist_ok=True)

    # -----------------------------------------------------------------------------

    # ESTIMATING ORTHOLOG Ks VALUES
    logging.info("Running wgd ortholog Ks pipeline...")
    fc_wgd.ks_orthologs(species1,
                        species2,
                        species1_fasta_file,
                        species2_fasta_file,
                        base_dir=ortholog_dists_dir,
                        n_threads=n_threads)

    logging.info(datetime.datetime.today().ctime())
    logging.info("Done")
Example #3
def correct(config_file, expert_config_file, trios_file):
    # INPUT
    config = fcConf.Configuration(config_file, expert_config_file)
    init_logging("Rate-adjustment of ortholog Ks distributions",
                 config.get_logging_level())
    logging.info("Loading parameters and input files")

    # Get parameters from configfile
    species_of_interest = config.get_species()
    latin_names = config.get_latin_names()
    db_path = config.get_ortho_db()

    default_path_trios_file = os.path.join(
        "rate_adjustment", f"{species_of_interest}",
        f"ortholog_trios_{species_of_interest}.tsv")
    trios_file = fcCheck.get_argument_path(trios_file, default_path_trios_file,
                                           "Trios TSV file")
    if trios_file == "":
        logging.error(
            f"Trios TSV file not found at default position [{default_path_trios_file}]."
        )
        logging.error("Exiting")
        sys.exit(1)

    try:
        with open(db_path, "r") as f:
            db = pd.read_csv(f, sep="\t", index_col=0)
    except Exception:
        logging.error(f"Ortholog peak database [{db_path}] not found or empty\n"
                      "-> rate-adjustment will be skipped.")
        sys.exit(0)

    # Getting the statistical measure for how to determine the representative value of an ortholog distribution
    peak_stats = config.get_peak_stats()  # default is mode (other option: median)

    # Getting the choice on how to deal with the presence of multiple adjustments for the same divergent pair
    # due to the use of multiple trios/outgroups during adjustment
    consensus_peak_for_multiple_outgroups = config.get_consensus_peak_for_multiple_outgroups()

    # -------------------------------------------------------------------

    all_trios_correction_array = []  # will contain the adjusted peak for each trio
    all_pairs_array = []  # will contain the adjusted peak for each divergent pair after getting a consensus
    # from multiple outspecies (both "best OC" and "multiple outspecies" strategies)
    sisters_per_node = {}  # keys = nodes, values = list of sisters

    # FILLING IN THE DATAFRAME FOR ALL TRIOS
    # It lists the adjustment results for each outgroup that has been used
    logging.info("")
    logging.info(
        f"Performing rate-adjustment of each divergent pair by using one or more outgroups:"
    )
    with open(trios_file, "r") as f:
        trios = pd.read_csv(f, sep="\t")

    for __, row in trios.iterrows():
        node = row['Node']
        species, sister, out = row['Focal_Species'], row['Sister_Species'], row['Out_Species']
        latinSpecies, latinSister, latinOut = latin_names[species], latin_names[sister], latin_names[out]
        logging.info(
            f" - Adjusting the peak for [{latinSpecies}] and [{latinSister}] with outspecies [{latinOut}]"
        )

        species_sister = "_".join(
            sorted([latinSpecies, latinSister],
                   key=str.casefold))  # e.g. A.filiculoides_S.cucullata
        species_out = "_".join(
            sorted([latinSpecies, latinOut], key=str.casefold))
        sister_out = "_".join(sorted([latinSister, latinOut],
                                     key=str.casefold))

        if species_sister in db.index and species_out in db.index and sister_out in db.index:
            rate_species, rate_species_sd, rate_sister, rate_sister_sd = fcCorrect.decompose_ortholog_ks(
                db, species_sister, species_out, sister_out, peak_stats)
            correct_peak, correct_sd = fcCorrect.compute_corrected_ks_species_sister(
                rate_species, rate_species_sd)
            # OC_segment is a measure of better/worse outgroup choices for the decomposition into branch-specific Ks contributions; see documentation.
            OC_segment = db.loc[species_out]['Mode'] - rate_species

            orig_mode = db.loc[species_sister]['Mode']
            orig_mode_sd = db.loc[species_sister]['Mode_SD']

            all_trios_correction_array.append([
                node, latinSpecies, latinSister, latinOut,
                round(correct_peak, 6),
                round(correct_sd, 6),
                round(orig_mode, 6),
                round(orig_mode_sd, 6),
                round(rate_species, 6),
                round(rate_sister, 6),
                round(OC_segment, 6)
            ])

            if node not in sisters_per_node:
                sisters_per_node[node] = []
            if latinSister not in sisters_per_node[node]:
                sisters_per_node[node].append(latinSister)

        else:  # missing ortholog data
            logging.warning(
                f"Couldn't process trio [{latinSpecies}, {latinSister}, {latinOut}]:"
            )
            if species_sister not in db.index:
                logging.warning(
                    f" - [{species_sister}] not in ortholog peak database.")
            if species_out not in db.index:
                logging.warning(
                    f" - [{species_out}] not in ortholog peak database.")
            if sister_out not in db.index:
                logging.warning(
                    f" - [{sister_out}] not in ortholog peak database.")

    # Generating file with adjustment data for each trio.
    all_trios_correction_df = DataFrame.from_records(
        all_trios_correction_array,
        columns=[
            "Node", "Focal_Species", "Sister_Species", "Out_Species",
            "Adjusted_Mode", "Adjusted_Mode_SD", "Original_Mode",
            "Original_Mode_SD", "Ks_Focal", "Ks_Sister", "Ks_Out"
        ])
    with open(os.path.join("rate_adjustment", f"{species_of_interest}",
                           fcCorrect._ADJUSTMENT_TABLE_ALL.format(species_of_interest)),
              "w+") as outfile:
        outfile.write(all_trios_correction_df.to_csv(sep="\t", index=False))
    logging.info(
        f"Rate-adjustment results for each trio saved in TSV format [{fcCorrect._ADJUSTMENT_TABLE_ALL.format(species_of_interest)}]"
    )
    logging.info("")

    # FILLING IN THE DATAFRAME FOR ALL ORTHOLOG PAIRS WITH FOCAL SPECIES
    # (ALL DIVERGENCE EVENTS OF FOCAL SPECIES)
    if len(sisters_per_node) != 0:
        logging.info(
            f"Finding a consensus value in case multiple outgroups have been used to adjust a divergent pair [strategy: {consensus_peak_for_multiple_outgroups}]"
        )

    # Get the headers for the file with data to plot the tree
    df_tested_outgroups_per_sister = all_trios_correction_df.loc[
        all_trios_correction_df['Node'] == 1]
    number_of_lines = len(df_tested_outgroups_per_sister.index)

    for node in sisters_per_node:
        # Select all rows whose "Node" field equals the current node (0, 1, 2, ...)
        node_df = all_trios_correction_df.loc[all_trios_correction_df['Node'] == node]

        for sister in sisters_per_node[node]:
            # FIRST STRATEGY: taking the MEAN among adjusted peaks from all outgroups
            # TODO: take the median too?
            peak_list = node_df.loc[node_df['Sister_Species'] == sister,
                                    ['Adjusted_Mode']]
            sd_list = node_df.loc[node_df['Sister_Species'] == sister,
                                  ['Adjusted_Mode_SD']]
            sd_list = sd_list["Adjusted_Mode_SD"].values.tolist()

            peak_mean = float(peak_list.mean())
            # Computing the st.dev of the mean peak following error-propagation rules:
            # square root of the sum of the squared SDs, divided by the number of estimates
            sd_err_prop = 0
            for sd in sd_list:
                sd_err_prop += pow(sd, 2)
            sd_err_prop = sqrt(sd_err_prop) / len(sd_list)
            # Getting the mean Ks_Focal and mean Ks_Sister out of the adjustments (when multiple trios/outgroups are used)
            rate_species_list = node_df.loc[node_df['Sister_Species'] == sister, ['Ks_Focal']]
            rate_species_mean = float(rate_species_list.mean())
            rate_sister_list = node_df.loc[node_df['Sister_Species'] == sister, ['Ks_Sister']]
            rate_sister_mean = float(rate_sister_list.mean())

            # SECOND STRATEGY: taking only the adjusted peak from the "BEST" outgroup (lowest OC value)
            # TODO: use DataFrame.at?
            oc_list = node_df.loc[node_df['Sister_Species'] == sister, ['Ks_Out']]
            oc_best_value = float(oc_list.min())
            peak_best_oc = node_df.loc[node_df['Ks_Out'] == oc_best_value, ['Adjusted_Mode']]
            peak_best_oc_float = float(peak_best_oc.mean())  # .mean() collapses the one-row selection to a float
            sd_best_oc = node_df.loc[node_df['Ks_Out'] == oc_best_value, ['Adjusted_Mode_SD']]
            sd_best_oc_float = float(sd_best_oc.mean())
            # Getting the Ks_Focal and Ks_Sister associated with the adjustment from the best outgroup
            rate_species_best_out = node_df.loc[node_df['Ks_Out'] == oc_best_value, ['Ks_Focal']]
            rate_species_best_out_float = float(rate_species_best_out.mean())
            rate_sister_best_out = node_df.loc[node_df['Ks_Out'] == oc_best_value, ['Ks_Sister']]
            rate_sister_best_out_float = float(rate_sister_best_out.mean())

            species_sister = "_".join(
                sorted([latin_names[species_of_interest], sister],
                       key=str.casefold))

            orig_mode = db.loc[species_sister]['Mode']
            orig_mode_sd = db.loc[species_sister]['Mode_SD']

            all_pairs_array.append([
                node, latin_names[species_of_interest], sister,
                round(peak_mean, 6),
                round(sd_err_prop, 6),
                round(rate_species_mean, 6),
                round(rate_sister_mean, 6),
                round(peak_best_oc_float, 6),
                round(sd_best_oc_float, 6),
                round(rate_species_best_out_float, 6),
                round(rate_sister_best_out_float, 6),
                round(orig_mode, 6),
                round(orig_mode_sd, 6)
            ])

    # Generating file with adjustment data for each divergent pair,
    # namely after obtaining a consensus value for the results coming from using different outspecies on the same divergent pair.
    all_pairs_df = DataFrame.from_records(
        all_pairs_array,
        columns=[
            "Node", "Focal_Species", "Sister_Species", "Adjusted_Mode_Mean",
            "Adjusted_Mode_Mean_SD", "Ks_Focal_Mean", "Ks_Sister_Mean",
            "Adjusted_Mode_Best", "Adjusted_Mode_Best_SD", "Ks_Focal_Best",
            "Ks_Sister_Best", "Original_Mode", "Original_Mode_SD"
        ])
    with open(os.path.join("rate_adjustment", f"{species_of_interest}",
                           fcCorrect._ADJUSTMENT_TABLE.format(species_of_interest)),
              "w+") as outfile:
        outfile.write(all_pairs_df.to_csv(sep="\t", index=False))
    logging.info(
        f"Rate-adjustment results as consensus values saved in TSV format [{fcCorrect._ADJUSTMENT_TABLE.format(species_of_interest)}]"
    )

    logging.info("")
    logging.info("All done")
Example #4
def wgd_paralogs(config_file, expert_config_file, n_threads):
    # INPUT
    # Get parameters and FASTA files from configuration file
    config = fcConf.Configuration(config_file, expert_config_file)
    species = config.get_species()
    init_logging(f"Paralog wgd analysis for species [{species}]",
                 config.get_logging_level())

    latin_names = config.get_latin_names()
    max_gene_family_size = config.get_max_gene_family_size()
    paranome = config.get_paranome()
    colinearity = config.get_colinearity()

    if not paranome and not colinearity:
        logging.error(
            'At least one of the "paranome" or "collinearity" parameters in the configuration file needs to be set to "yes".'
        )
        logging.error("Exiting.")
        sys.exit(1)

    logging.info("Checking if sequence data files exist and if sequence IDs are compatible with wgd pipeline...")
    # Will exit if FASTA or GFF files are missing or empty or if GFF feature/attribute are missing
    trigger_exit = False

    if colinearity:  # if colinearity analysis is required, load related parameters
        gff = config.get_gff(species)
        if fcCheck.check_file_nonexistent_or_empty(gff, "GFF file"):
            trigger_exit = True

        gff_feature = config.get_feature()
        gff_gene_attribute = config.get_attribute()
        if gff_feature == "":
            logging.error(
                "No GFF attribute provided in configuration file. Will exit.")
            trigger_exit = True
        if gff_gene_attribute == "":
            logging.error(
                "No GFF feature provided in configuration file. Will exit.")
            trigger_exit = True

    # Checking if FASTA file exists and if sequence IDs are compatible with wgd pipeline (paml)
    fasta_names_dict = config.get_fasta_dict()
    species_fasta_file = config.get_fasta_name(fasta_names_dict, species)
    if fcCheck.check_file_nonexistent_or_empty(
            species_fasta_file, "FASTA file"):  # if missing/empty
        trigger_exit = True
    else:  # If FASTA file exists, check for ID compatibility
        if colinearity:
            fcCheck.check_IDs(species_fasta_file, latin_names[species], gff)
        else:
            fcCheck.check_IDs(species_fasta_file, latin_names[species])

    if trigger_exit:
        logging.error(
            "Please add the missing information to the configuration file and rerun the analysis. Exiting."
        )
        sys.exit(1)
    logging.info("Completed")

    # Creating folder for output files of wgd paralog pipeline
    paralog_dists_dir = os.path.join("paralog_distributions", "")
    if not os.path.isdir(paralog_dists_dir):
        logging.info(f"Creating directory [{paralog_dists_dir}]")
        os.makedirs(paralog_dists_dir)

    # -----------------------------------------------------------------------------

    # ESTIMATING PARANOME Ks VALUES
    logging.info("Running wgd paralog Ks pipeline...")
    fc_wgd.ks_paralogs(species,
                       species_fasta_file,
                       max_gene_family_size=max_gene_family_size,
                       base_dir=paralog_dists_dir,
                       n_threads=n_threads)

    # EXTRACTING COLINEARITY/SYNTENY ANCHOR PAIRS Ks VALUES
    if colinearity:
        logging.info('---')
        logging.info("Running wgd colinearity Ks pipeline...")
        fc_wgd.ks_colinearity(species,
                              gff,
                              base_dir=paralog_dists_dir,
                              gff_feature=gff_feature,
                              gff_gene_attribute=gff_gene_attribute,
                              n_threads=n_threads)

    logging.info(datetime.datetime.today().ctime())
    logging.info("Done")
Example #5
def exp_log_mixture(config_file, expert_config_file, paralog_tsv_file,
                    correction_table_file):
    # INPUT
    config = fcConf.Configuration(config_file, expert_config_file)
    init_logging("Exponential-Lognormal mixture model on Ks paranome",
                 config.get_logging_level())
    logging.info("Loading parameters and input files")

    # GET PARAMETERS and INPUT FILES
    species = config.get_species()
    latin_names = config.get_latin_names()
    latinSpecies = latin_names[species]
    # Escape spaces so that the name is rendered correctly in figure titles
    species_escape_whitespace = latinSpecies.replace(' ', r'\ ')
    # NOTE: only for this script, the max accepted Ks value is constrained to 5,
    # instead of being taken from configfile, due to the fact that the code uses
    # a built-in "buffer" lognormal to cover the area of high Ks values (4-5 Ks).
    max_ks_para = config.get_max_ks_para()
    bin_width = config.get_bin_width_para()
    bin_list = fcPlot.get_bins(max_ks_para, bin_width)
    x_max_lim = config.get_x_max_lim()
    y_max_lim = config.get_y_lim()
    color_list = config.get_color_list()
    plot_correction_arrows = config.plot_correction_arrows()
    paranome_analysis = config.get_paranome()
    # Getting the statistical measure for how to determine the representative value of an ortholog distribution
    peak_stats = config.get_peak_stats()  # default is mode (other option: median)
    # Getting the choice on how to deal with the presence of multiple adjustments for the same divergent pair
    consensus_peak_for_multiple_outgroups = config.get_consensus_peak_for_multiple_outgroups()

    # Parameters used during the mixture modeling
    max_ks_EM = config.get_max_ks_for_mixture_model(max_ks_para)  # upper Ks limit considered for the mixture model fitting
    max_EM_iterations = config.get_max_EM_iterations()  # default 300
    num_EM_initializations = config.get_num_EM_initializations()  # how many times the fitting with N given components
    # is initialized (in the random method and in the "peak + random" method); default 10
    max_num_comp = config.get_max_mixture_model_components()  # max number of components used in the fitting
    # with random components (exp + buffer lognormal + lognormal)
    min_num_comp = 2  # there are always at least two components (the exponential and the buffer lognormal)

    logging.info(f" - maximum EM iterations: {max_EM_iterations}")
    logging.info(f" - number of EM initializations: {num_EM_initializations}")
    logging.info(f" - maximum number of components: {max_num_comp}")
    if max_ks_EM != max_ks_para:
        logging.info(
            f" - Ks range considered for the mixture modeling: up to {max_ks_EM} Ks."
        )
    logging.info("")

    if paranome_analysis:
        default_path_paralog_tsv_file = os.path.join("paralog_distributions",
                                                     f"wgd_{species}",
                                                     f"{species}.ks.tsv")
        paralog_tsv_file = fcCheck.get_argument_path(
            paralog_tsv_file, default_path_paralog_tsv_file,
            "Paralog Ks TSV file")
        if paralog_tsv_file == "":
            logging.error(
                f"Paralog Ks TSV file not found at default position [{default_path_paralog_tsv_file}]."
            )
            logging.error("Exiting")
            sys.exit(1)
    else:
        logging.error(
            "Mixture modeling is not performed since paranome analysis is not required in configuration file"
        )
        logging.error("Exiting")
        sys.exit(0)  # exit code 0 because no actual errors were thrown

    ks_data, ks_weights = fc_extract_ks_list.ks_list_from_tsv(
        paralog_tsv_file, max_ks_para, "paralogs")

    # Get adjustment results TSV file
    # If the adjustment table file is (still) missing, its path will be an empty string (""), but the script will not exit
    default_path_correction_table_file = os.path.join(
        "rate_adjustment", f"{species}",
        f"{_ADJUSTMENT_TABLE.format(species)}")
    correction_table_file = fcCheck.get_argument_path(
        correction_table_file, default_path_correction_table_file,
        "Rate-adjustment table file")
    if correction_table_file == "":
        logging.warning(
            "Rate-adjustment data are not available yet, only Ks paranome distribution will be plotted."
        )
        correction_table = None
        correction_table_available = False
    else:
        with open(correction_table_file, "r") as f:
            correction_table = read_csv(f, sep="\t")
            correction_table_available = True

    # Creating folder for secondary output files
    # ("subfolder" is a constant defined at module level in the original script)
    output = os.path.join(subfolder)
    if not os.path.isdir(os.path.join("rate_adjustment", species, output)):
        logging.info(
            f"Creating directory [rate_adjustment/{species}/{output}]")
        logging.info("")
        os.makedirs(os.path.join("rate_adjustment", species, output))

    # Generating figures for the mixture models
    fig_peaks, ax_peaks_ks, ax_peaks_logks, ax_peaks2_ks, ax_peaks2_logks, sup_peaks = fcEM.generate_peak_model_figure(
        species_escape_whitespace, x_max_lim)
    fig_random, axes_random, sup_random = fcEM.generate_random_model_figure(
        species_escape_whitespace, min_num_comp, max_num_comp, x_max_lim)
    fig_best_model, ax_best_ks = fcEM.generate_best_model_figure(
        latinSpecies, x_max_lim, y_max_lim, correction_table_available,
        plot_correction_arrows)

    # Generating a proxy dataset for the weighted Ks paranome (deconvoluting the histogram)
    deconvoluted_data = fcEM.deconvolute_data(paralog_tsv_file, max_ks_EM,
                                              "paralogs")
    # Log-transformation of Ks paranome
    ks_data_log, ks_weights_log = fcEM.logtransformation(
        paralog_tsv_file, max_ks_EM)

    bic_dict, parameters_list = {}, {}  # will contain BIC scores and parameters of all models
    all_models_init_parameters = {}  # will contain the initial parameters of all models (for plotting purposes)
    all_models_fitted_parameters = {}  # will contain the fitted parameters of all models (for plotting purposes)
    parameter_table = []  # will contain parameters of every model iteration for the tabular output text file

    # -----------------------------------------------------------------------------

    with open(
            os.path.join("rate_adjustment", f"{species}", subfolder,
                         f"elmm_{species}_parameters.txt"), "w+") as outfile:

        # Performing EM algorithm multiple times with different types of initializations

        logging.info(
            "Performing EM algorithm with initialization from Ks paranome data"
        )
        model_id = 1
        ax_peaks_ks.set_title(f"Model {model_id}\n")
        # Plotting background Ks paranome histograms for the figures (original and log-transformed)
        fcEM.plot_histograms_mixture(ax_peaks_ks, ax_peaks_logks, ks_data,
                                     ks_weights, ks_data_log, ks_weights_log,
                                     bin_list, bin_width, y_max_lim)

        # Initializing component parameters and plotting them
        init_lambd, init_means, init_stdevs, init_weights, reduced_gaussians = fcEM.init_parameters_from_data(
            ax_peaks_ks, species, ks_data_log, ks_weights_log, ks_data,
            ks_weights, species_escape_whitespace, output, max_ks_EM)
        all_models_init_parameters[model_id] = [
            init_means, init_stdevs, init_lambd, init_weights
        ]
        fcEM.plot_init_comp(ax_peaks_ks, ax_peaks_logks, init_means,
                            init_stdevs, init_lambd, init_weights)
        num_comp = len(init_means) + 1

        # Performing EM algorithm, computing the BIC and plotting the fitted components
        bic_peaks, means_peaks, stdevs_peaks, lambd_peaks, weights_peaks = fcEM.em(
            num_comp,
            max_EM_iterations,
            deconvoluted_data,
            init_lambd,
            init_means,
            init_stdevs,
            init_weights,
            model_id,
            num_EM_initializations,
            max_num_comp,
            parameter_table,
            outfile,
            reduced_gaussians_flag=reduced_gaussians,
            EM_data=True)
        all_models_fitted_parameters[model_id] = [
            means_peaks, stdevs_peaks, lambd_peaks, weights_peaks
        ]
        bic_dict[model_id] = bic_peaks
        fcEM.plot_fitted_comp(ax_peaks_ks, ax_peaks_logks, means_peaks,
                              stdevs_peaks, lambd_peaks, weights_peaks,
                              x_max_lim, peak_stats,
                              correction_table_available,
                              plot_correction_arrows)

        # -----------------------------------------------------------------

        logging.info(
            "Performing EM algorithm with initialization from Ks paranome data plus a random lognormal component"
        )
        model_id = 2
        ax_peaks2_ks.set_title(f"Model {model_id}")
        # Plotting background Ks paranome histograms for the figures (original and log-transformed)
        fcEM.plot_histograms_mixture(ax_peaks2_ks, ax_peaks2_logks, ks_data,
                                     ks_weights, ks_data_log, ks_weights_log,
                                     bin_list, bin_width, y_max_lim)

        # Adding a random lognormal to the components initialized from Ks data
        # It is done multiple times and only the best result is retained
        bic_from_same_num_comp = []
        start_parameters, final_parameters = [], []
        for i in range(num_EM_initializations):
            if len(init_means) > 4:
                # Limiting the total number of lognormals to 5: 1 buffer lognormal, plus a maximum of
                # 3 "peak" lognormals from the data, plus the 1 random lognormal that is going to be added;
                # if there are already more than 4 lognormals, some of the peak lognormals are removed,
                # while the buffer lognormal is always kept
                updated_means = list(random.choice(init_means[:-1], size=4, replace=False)) + [init_means[-1]]
                updated_stdevs = list(random.choice(init_stdevs[:-1], size=4, replace=False)) + [init_stdevs[-1]]
                reduced_gaussians = True
            else:
                updated_means, updated_stdevs = init_means.copy(), init_stdevs.copy()
                reduced_gaussians = False

            updated_means.append(round(random.choice(arange(-0.5, 1, 0.1)), 1))
            updated_stdevs.append(
                round(random.choice(arange(0.3, 0.9, 0.1)), 1))
            num_comp = len(updated_means) + 1
            updated_weights = [1 / num_comp] * num_comp
            start_parameters.append(
                [updated_means, updated_stdevs, init_lambd, updated_weights])

            # Performing EM algorithm and computing the BIC
            bic_peaks, means_peaks, stdevs_peaks, lambd_peaks, weights_peaks = fcEM.em(
                num_comp,
                max_EM_iterations,
                deconvoluted_data,
                init_lambd,
                updated_means,
                updated_stdevs,
                updated_weights,
                model_id,
                num_EM_initializations,
                max_num_comp,
                parameter_table,
                outfile,
                reduced_gaussians_flag=reduced_gaussians,
                model_iteration=i + 1,
                EM_data_random=True)
            bic_from_same_num_comp.append(bic_peaks)
            final_parameters.append(
                [means_peaks, stdevs_peaks, lambd_peaks, weights_peaks])

        # Get the initial and fitted parameters of the best model and plot them; get its BIC score
        updated_means, updated_stdevs, init_lambd, updated_weights = start_parameters[
            argmin(bic_from_same_num_comp)]
        all_models_init_parameters[model_id] = [
            updated_means, updated_stdevs, init_lambd, updated_weights
        ]
        fcEM.plot_init_comp(ax_peaks2_ks, ax_peaks2_logks, updated_means,
                            updated_stdevs, init_lambd, updated_weights)

        final_means, final_stdevs, final_lambd, final_weights = final_parameters[
            argmin(bic_from_same_num_comp)]
        all_models_fitted_parameters[model_id] = [
            final_means, final_stdevs, final_lambd, final_weights
        ]
        fcEM.plot_fitted_comp(ax_peaks2_ks, ax_peaks2_logks, final_means,
                              final_stdevs, final_lambd, final_weights,
                              x_max_lim, peak_stats,
                              correction_table_available,
                              plot_correction_arrows)

        bic_dict[model_id] = min(bic_from_same_num_comp)
        parameters_list[model_id] = final_parameters[argmin(
            bic_from_same_num_comp)]

        plt.close()
        fig_peaks.savefig(os.path.join(
            "rate_adjustment", f"{species}", output,
            f"elmm_{species}_models_data_driven.pdf"),
                          bbox_inches="tight",
                          bbox_extra_artists=(sup_peaks, ),
                          format="pdf")
        logging.info(
            f"Saving PDF figure of mixture models [{species}/{output}/elmm_{species}_models_data_driven.pdf]"
        )
        logging.info("")

        # -----------------------------------------------------------------------------

        logging.info(
            "Performing EM algorithm with (almost) random initialization")

        num_comp_list = arange(min_num_comp, max_num_comp + 1)  # e.g. arange(2, 6) gives 2, 3, 4, 5
        axes_ids = num_comp_list - min_num_comp  # e.g. 0, 1, 2, 3
        model_ids = num_comp_list - (min_num_comp - 3)  # e.g. 3, 4, 5, 6; models 1 and 2 come from
        # the data-driven method, while from model 3 onwards they come from the random method
        for num_comp, ax_id, model_id in zip(num_comp_list, axes_ids,
                                             model_ids):
            logging.info(f" - using {num_comp} components")
            ax_rand_ks, ax_rand_logks = axes_random[ax_id][0], axes_random[ax_id][1]
            # Plotting background Ks paranome histograms for the figures (original and log-transformed)
            fcEM.plot_histograms_mixture(ax_rand_ks, ax_rand_logks, ks_data,
                                         ks_weights, ks_data_log,
                                         ks_weights_log, bin_list, bin_width,
                                         y_max_lim)

            bic_from_same_num_comp = []
            start_parameters, final_parameters = [], []
            for i in range(num_EM_initializations):
                # Initializing parameters for the given number of components
                init_means, init_stdevs, init_lambd, init_weights = fcEM.init_parameters_randomly(
                    model_id, num_comp, ax_rand_ks, max_ks_EM)
                start_parameters.append(
                    [init_means, init_stdevs, init_lambd, init_weights])

                # Performing EM algorithm and computing the BIC
                bic_random, means_random, stdevs_random, lambd_random, weights_random = fcEM.em(
                    num_comp,
                    max_EM_iterations,
                    deconvoluted_data,
                    init_lambd,
                    init_means,
                    init_stdevs,
                    init_weights,
                    model_id,
                    num_EM_initializations,
                    max_num_comp,
                    parameter_table,
                    outfile,
                    model_iteration=i + 1,
                    EM_random=True)
                bic_from_same_num_comp.append(bic_random)
                final_parameters.append([
                    means_random, stdevs_random, lambd_random, weights_random
                ])

            # Get the initial and fitted parameters of the best result for the given number of components and plot them; get its BIC score
            init_means, init_stdevs, init_lambd, init_weights = start_parameters[
                argmin(bic_from_same_num_comp)]
            all_models_init_parameters[model_id] = [
                init_means, init_stdevs, init_lambd, init_weights
            ]
            fcEM.plot_init_comp(ax_rand_ks, ax_rand_logks, init_means,
                                init_stdevs, init_lambd, init_weights)

            final_means, final_stdevs, final_lambd, final_weights = final_parameters[
                argmin(bic_from_same_num_comp)]
            all_models_fitted_parameters[model_id] = [
                final_means, final_stdevs, final_lambd, final_weights
            ]
            fcEM.plot_fitted_comp(ax_rand_ks, ax_rand_logks, final_means,
                                  final_stdevs, final_lambd, final_weights,
                                  x_max_lim, peak_stats,
                                  correction_table_available,
                                  plot_correction_arrows)

            bic_dict[model_id] = min(bic_from_same_num_comp)
            parameters_list[model_id] = final_parameters[argmin(
                bic_from_same_num_comp)]

        plt.close()
        fig_random.savefig(os.path.join("rate_adjustment", f"{species}",
                                        output,
                                        f"elmm_{species}_models_random.pdf"),
                           bbox_inches="tight",
                           bbox_extra_artists=(sup_random, ),
                           format="pdf")
        logging.info(
            f"Saving PDF figure of mixture models [{species}/{output}/elmm_{species}_models_random.pdf]"
        )
        logging.info("")

        # Generating tabular text file with all model parameters
        fcEM.make_parameter_table_file(parameter_table, species)

        # BIC evaluation of all models (from data, from data with random lognormal and from random components)
        logging.info("Models are evaluated according to their BIC score.")
        # Get best model by lowest BIC score and plot it; print comparison with the other models
        best_model_id = fcEM.eval_best_model(bic_dict, outfile)
        fcEM.plot_best_model(
            fig_best_model, ax_best_ks, species, ks_data, ks_weights, bin_list,
            bin_width, x_max_lim, y_max_lim, best_model_id,
            all_models_init_parameters, all_models_fitted_parameters,
            correction_table, correction_table_available,
            consensus_peak_for_multiple_outgroups, peak_stats, color_list,
            plot_correction_arrows, deconvoluted_data, max_ks_EM)
    logging.info(
        f"Saving PDF figure of best mixture model [mixed_{species}_elmm.pdf]")
    logging.info("")
    logging.info("All done")
Example #6
def cluster_anchor_ks(config_file, expert_config_file, correction_table_file, path_anchorpoints_txt, path_multiplicons_txt, path_segments_txt, path_list_elements_txt, path_ks_anchor_file, path_multiplicon_pair_txt):
    config = fcConf.Configuration(config_file, expert_config_file)
    init_logging(f"Clustering anchorpoints Ks values to reconstruct recent WGD events", config.get_logging_level())
    logging.info("Loading parameters and input files")

    colinearity_analysis = config.get_colinearity()
    if not colinearity_analysis:
        logging.warning("Colinearity non required in configuration file; the anchor Ks clustering will be skipped.")
        sys.exit(0) # exit code 0 because no actual errors were thrown

    species = config.get_species()
    latin_names = config.get_latin_names()
    latin_name = latin_names[species].replace(' ', r'\ ') # escape spaces to have the correct format in titles
    max_ks_para = config.get_max_ks_para()
    bin_width = config.get_bin_width_para()
    bin_list = fcPlot.get_bins(max_ks_para, bin_width)
    x_max_lim = config.get_x_max_lim()
    y_max_lim = config.get_y_lim()
    color_list = config.get_color_list()
    plot_correction_arrows = config.plot_correction_arrows()

    max_EM_iterations = config.get_max_EM_iterations() # default 300
    num_EM_initializations = config.get_num_EM_initializations() # how many times the fitting with N given components is initialized 
    logging.info(f" - maximum EM iterations: {max_EM_iterations}")
    logging.info(f" - number of EM initializations: {num_EM_initializations}")

    # Getting the statistical measure for how to determine the representative value of an ortholog distribution
    peak_stats = config.get_peak_stats() # default is mode (other option, median)

    # Getting the choice on how to deal with the presence of multiple adjustments for the same divergent pair
    consensus_peak_for_multiple_outgroups = config.get_consensus_peak_for_multiple_outgroups()

    # Checking user-defined path and / or default path for each required input file
    default_path_correction_table_file = os.path.join("rate_adjustment", f"{species}", f"{_ADJUSTMENT_TABLE.format(species)}")
    correction_table_file = fcCheck.get_argument_path(correction_table_file, default_path_correction_table_file, "Rate-adjustment table file")
    if correction_table_file == "":
        logging.warning("Rate-adjustment data are not available yet, only anchor pair distribution will be plotted.")
        correction_table_available = False
    else:
        with open(correction_table_file, "r") as f:
            correction_table = read_csv(f, sep="\t")
            correction_table_available = True

    default_path_anchorpoints_txt = os.path.join("paralog_distributions", f"wgd_{species}", f"{species}_i-adhore", "anchorpoints.txt")
    path_anchorpoints_txt = fcCheck.get_argument_path(path_anchorpoints_txt, default_path_anchorpoints_txt, "anchorpoints.txt file")
    if path_anchorpoints_txt == "":
        logging.error(f"anchorpoints.txt file not found at default position [{default_path_anchorpoints_txt}].")

    default_path_multiplicons_txt = os.path.join("paralog_distributions", f"wgd_{species}", f"{species}_i-adhore", "multiplicons.txt")
    path_multiplicons_txt = fcCheck.get_argument_path(path_multiplicons_txt, default_path_multiplicons_txt, "multiplicons.txt file")
    if path_multiplicons_txt == "":
        logging.error(f"multiplicons.txt file not found at default position [{default_path_anchorpoints_txt}].")

    default_path_segments_txt = os.path.join("paralog_distributions", f"wgd_{species}", f"{species}_i-adhore", "segments.txt")
    path_segments_txt = fcCheck.get_argument_path(path_segments_txt, default_path_segments_txt, "segments.txt file")
    if path_segments_txt == "":
        logging.error(f"segments.txt file not found at default position [{default_path_segments_txt}].")

    default_path_multiplicon_pair_txt = os.path.join("paralog_distributions", f"wgd_{species}", f"{species}_i-adhore", "multiplicon_pairs.txt")
    path_multiplicon_pair_txt = fcCheck.get_argument_path(path_multiplicon_pair_txt, default_path_multiplicon_pair_txt, "multiplicon_pairs.txt file")
    if path_multiplicon_pair_txt == "":
        logging.error(f"multiplicon_pairs.txt file not found at default position [{default_path_multiplicon_pair_txt}].")

    default_path_list_elements_txt = os.path.join("paralog_distributions", f"wgd_{species}", f"{species}_i-adhore", "list_elements.txt")
    path_list_elements_txt = fcCheck.get_argument_path(path_list_elements_txt, default_path_list_elements_txt, "list_elements.txt file")
    if path_list_elements_txt == "":
        logging.error(f"list_elements.txt file not found at default position [{default_path_list_elements_txt}].")

    default_path_ks_anchor_file = os.path.join("paralog_distributions", f"wgd_{species}", f"{species}.ks_anchors.tsv")
    path_ks_anchor_file = fcCheck.get_argument_path(path_ks_anchor_file, default_path_ks_anchor_file, f"{species}.ks_anchors.tsv file")
    if path_ks_anchor_file == "":
        logging.error(f"{species}.ks_anchors.tsv file not found at default position [{default_path_ks_anchor_file}].")

    if path_anchorpoints_txt == "" or path_multiplicons_txt == "" or path_segments_txt == "" or path_multiplicon_pair_txt == "" or path_list_elements_txt == "" or path_ks_anchor_file == "":
        logging.error("Exiting")
        sys.exit(1)

    # Creating folder for secondary output files
    # ("subfolder" is a constant defined at module level in the original script)
    output = os.path.join(subfolder)
    if not os.path.isdir(os.path.join("rate_adjustment", species, output)):
        logging.info(f"Creating directory [rate_adjustment/{species}/{output}]")
        os.makedirs(os.path.join("rate_adjustment", species, output))

    # Parsing I-ADHoRe output files to get information about multiplicons, multiplicon levels, segments, anchorpoints, anchor pairs and Ks values. 
    segments_per_multip = fcCluster.parse_segments_file(path_segments_txt)

    segments_from_gene = fcCluster.parse_list_elements(path_list_elements_txt)

    ks_anchors = fcCluster.parse_ks_anchors_tsv_file(path_ks_anchor_file)

    multipl_per_level, level_of_each_multipl, max_level, level_list, level_list_filtered = fcCluster.parse_multiplicons_file(path_multiplicons_txt)

    anchors_per_multipl, levels_of_each_anchor = fcCluster.parse_multiplicon_pairs_file(path_multiplicon_pair_txt, level_of_each_multipl)

    anchorpoints_per_multipl, multipl_per_anchorpoint, levels_of_anchorpoints = fcCluster.parse_anchorpoints_file(path_anchorpoints_txt, level_of_each_multipl)

    anchor_ks_list, anchors_weights = fc_extract_ks_list.ks_list_from_tsv(path_ks_anchor_file, max_ks_para, "anchor pairs") # Get complete anchor Ks list to be plotted in the background

    # -----------------------------------------------------------------------------

    # Getting non-redundant segment pairs based on group of anchorpoints
    # Filtering away segment pairs whose anchorpoints list is a subset of another segment pair Ks list
    logging.info("")
    logging.info(f"Obtaining a non-redundant list of anchorpoints Ks values")
    logging.info("")

    anchorpoints_ks_per_segment_pair = {} # Ks values per segment pair
    anchorpoints_per_segment_pair = {} # anchorpoint names per segment pair

    for multipl_id in level_of_each_multipl: # for each multiplicon
        anchorpoints_list = anchorpoints_per_multipl[multipl_id]
        segments_current_multip = segments_per_multip[multipl_id]
        for anchorpoints in anchorpoints_list:

            # Process only anchorpoints with acceptable Ks value
            if anchorpoints in ks_anchors:
                ks = ks_anchors[anchorpoints]
                if ks != "" and 0.05 <= float(ks) <= max_ks_para:
                    anchorpoint1, anchorpoint2 = anchorpoints[0], anchorpoints[1]
                    # Get the two segments where the two anchorpoints are placed in the current multiplicon
                    all_segments_anchorpoint1 = segments_from_gene[anchorpoint1]
                    all_segments_anchorpoint2 = segments_from_gene[anchorpoint2]
                    anchorpoint1_segment = list(set(all_segments_anchorpoint1) & set(segments_current_multip))[0]
                    anchorpoint2_segment = list(set(all_segments_anchorpoint2) & set(segments_current_multip))[0]
                    current_segment_pair_sorted = tuple(sorted([anchorpoint1_segment, anchorpoint2_segment]))
                    
                    if current_segment_pair_sorted not in anchorpoints_per_segment_pair:
                        anchorpoints_per_segment_pair[current_segment_pair_sorted] = [anchorpoints]
                    else:
                        anchorpoints_per_segment_pair[current_segment_pair_sorted].append(anchorpoints)  


    other_segment_pairs_dict = {} # links to each other the segment pairs that share anchorpoints
    # {key = segment pair; value = {key = other segment pair in which the anchorpoint is present; value = shared anchorpoints}}
    # e.g. { (3, 5): { (1287, 1288): [shared anchorpoints] } }
    for segment_pair in anchorpoints_per_segment_pair:
        for anchorpoint in anchorpoints_per_segment_pair[segment_pair]:
            anchorpoint1, anchorpoint2 = anchorpoint[0], anchorpoint[1]
            # Get all multiplicons in which the current anchorpoint is found
            multipl_of_current_anchorpoint = multipl_per_anchorpoint[anchorpoint]

            for multipl_id in multipl_of_current_anchorpoint:
                # Let's take the segment pair that this anchorpoint lies on
                segments_current_multip = segments_per_multip[multipl_id]
                all_segments_anchorpoint1 = segments_from_gene[anchorpoint1]
                all_segments_anchorpoint2 = segments_from_gene[anchorpoint2]
                anchorpoint1_segment = list(set(all_segments_anchorpoint1) & set(segments_current_multip))[0]
                anchorpoint2_segment = list(set(all_segments_anchorpoint2) & set(segments_current_multip))[0]
                current_segment_pair_sorted = tuple(sorted([anchorpoint1_segment, anchorpoint2_segment]))

                if segment_pair not in other_segment_pairs_dict:
                    other_segment_pairs_dict[segment_pair] = {}

                if current_segment_pair_sorted != segment_pair: # if the anchorpoint is also in another segment pair
                    if current_segment_pair_sorted not in other_segment_pairs_dict[segment_pair]:
                        other_segment_pairs_dict[segment_pair][current_segment_pair_sorted] = [anchorpoint]
                    else:
                        other_segment_pairs_dict[segment_pair][current_segment_pair_sorted].append(anchorpoint)

    clean_segment_pair_dictionary = copy.deepcopy(other_segment_pairs_dict)
    for segment_pair in other_segment_pairs_dict: # For each segment_pair that shares anchor points with other segments
        anchorpoints_current_segment_pair = anchorpoints_per_segment_pair[segment_pair]
        for other_segment_pair in other_segment_pairs_dict[segment_pair]:
            anchorpoints_other_segment_pair = anchorpoints_per_segment_pair[other_segment_pair]
            shortest_anchorpoints_list = min([anchorpoints_current_segment_pair, anchorpoints_other_segment_pair], key=len)
            intersection_current_segment_other_segment = list(set(anchorpoints_current_segment_pair) & set(anchorpoints_other_segment_pair))
            
            # If one list is a subset of the other, remove the shorter list because it is redundant;
            # the shorter list is also removed (still considered redundant) if its intersection with
            # the longer list covers at least one third of its anchorpoints
            if set(intersection_current_segment_other_segment) == set(shortest_anchorpoints_list) or len(intersection_current_segment_other_segment) >= 1/3 * len(shortest_anchorpoints_list):
                if shortest_anchorpoints_list == anchorpoints_current_segment_pair: # the shortest is the current segment pair
                    if segment_pair in clean_segment_pair_dictionary:
                        del clean_segment_pair_dictionary[segment_pair]
                elif shortest_anchorpoints_list == anchorpoints_other_segment_pair: # the shortest is the other segment pair
                    if other_segment_pair in clean_segment_pair_dictionary:
                        del clean_segment_pair_dictionary[other_segment_pair]


    # Getting the Ks lists of the non redundant segment pairs
    nonred_segment_pairs_segment_based = {}
    nonred_segment_pairs_segment_based_no_outliers = {}

    for segment_pair in clean_segment_pair_dictionary:
        anchorpoints_list_current_segment = anchorpoints_per_segment_pair[segment_pair]
        anchorpoints_ks_list_current_segment = []
        for anchorpoint in anchorpoints_list_current_segment:
            ks = ks_anchors[anchorpoint]
            if ks != "" and 0.05 <= float(ks) <= max_ks_para:
                anchorpoints_ks_list_current_segment.append(float(ks))
        if len(anchorpoints_ks_list_current_segment) > 0: # if there is at least one Ks in the list, add the segment pair to the dictionaries
            # Updating the dictionary that takes into account also outliers
            nonred_segment_pairs_segment_based[segment_pair] = anchorpoints_ks_list_current_segment
            # Updating the dictionary that filters away the outliers by using MAD (median absolute deviation)
            if len(anchorpoints_ks_list_current_segment) <= 5: # if up to 5 Ks, too short for reliable median and MAD statistic
                nonred_segment_pairs_segment_based_no_outliers[segment_pair] = anchorpoints_ks_list_current_segment
            else: # if 6 or more: remove outliers
                median_ks = median(anchorpoints_ks_list_current_segment)
                mad = stats.median_absolute_deviation(anchorpoints_ks_list_current_segment)
                lower_bound, upper_bound = median_ks - mad, median_ks + mad
                ks_list_without_outliers = []
                for ks in anchorpoints_ks_list_current_segment:
                    if lower_bound <= ks <= upper_bound:
                        ks_list_without_outliers.append(ks)
                if len(ks_list_without_outliers) > 0:
                    nonred_segment_pairs_segment_based_no_outliers[segment_pair] = ks_list_without_outliers
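
    # Side note (illustrative): scipy's median_absolute_deviation applies a
    # default normal-consistency scale factor (~1.4826), so the scaled MAD
    # approximates the standard deviation for normally distributed data and the
    # filter above keeps Ks values roughly within one standard deviation of the
    # segment-pair median.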

    # -----------------------------------------------------------------------------

    # Taking median of each non-redundant Ks list per segment pair

    # There are two options for the Ks list source from which the medians are computed: with or without outliers
    ###chosen_segment_ks_list = nonred_segment_pairs_segment_based (OPTION ONE)
    chosen_segment_ks_list = nonred_segment_pairs_segment_based_no_outliers  # (OPTION TWO)

    all_segment_pairs_ks_median, all_segment_pairs_ks_median_dict = [], {}
    for segment_pair in chosen_segment_ks_list:
        median_ks = median(chosen_segment_ks_list[segment_pair])
        all_segment_pairs_ks_median.append([segment_pair, median_ks])
        all_segment_pairs_ks_median_dict[segment_pair] = median_ks

    # Preparing the list of medians to be given as input to the clustering function
    all_medians_list = []
    for segment_median in all_segment_pairs_ks_median:
        median_value = segment_median[1]
        all_medians_list.append(median_value)
    all_medians_list = array(all_medians_list) # must be a numpy array

    # -----------------------------------------------------------------------------

    # Choosing how many clusters to use, namely as many as the number of WGD events that explains the maximum multiplicon level
    # E.g. if the highest multiplicon level is 8, it is explained by 3 WGDs, then we set the number of clusters as 3
    # We don't consider the combination with WGTs
    num_wgd_per_level = {2: 1, 3: 2, 4: 2, 5: 3, 6: 3, 7: 3, 8: 3, 9: 4, 10: 4, 11: 4, 12: 4, 13: 4, 14: 4, 15: 4, 16: 4, 17: 4, 18: 4, 19: 4}

    if max_level in num_wgd_per_level:
        n_clusters = num_wgd_per_level[max_level]
    else: 
        n_clusters = 4

    logging.info(f"Highest multiplicon level in i-ADHoRe output files: {max_level}")
    logging.info(f"Number of WGDs inferred to explain the highest level: {n_clusters}")
    logging.info("")

    # -----------------------------------------------------------------------------

    # FIRST round of clustering (with all medians)

    # Clustering with GaussianMixtureModel, k-means or lognormal mixture modeling
    clustering_method = "Gaussian mixture modeling" # other options: "k-means" or "lognormal mixture modeling" 
    logging.info(f"Performing a first Ks clustering round with {n_clusters} clusters using {clustering_method}")
    if clustering_method == "Gaussian mixture modeling":
        gmm_clustered_medians = fcCluster.gmm(all_medians_list, n_clusters, max_EM_iterations, num_EM_initializations)
    elif clustering_method == "k-means":
        kmeans_clustered_medians = fcCluster.kmeans(all_medians_list, n_clusters)
    #elif clustering_method == "lognormal mixture modeling":
        # Temporary not available because pomegranate is not on midas:
        #lognormal_clustered_medians = fcCluster.lognormalmm(all_medians_list, n_clusters)

    # TODO: Decide whether to ignore Ks outliers in the segment pair Ks lists or decide instead to consider the complete Ks lists:
    ###chosen_nonred_segment_pair_ks_list = nonred_segment_pairs_segment_based
    chosen_nonred_segment_pair_ks_list = nonred_segment_pairs_segment_based_no_outliers

    # Get the clusters of medians and the resulting clusters of Ks
    cluster_of_ks, medians_per_cluster, segments_medians_per_cluster, cluster_color_letter_list = fcCluster.get_clusters_of_ks(clustered_medians, all_medians_list, all_segment_pairs_ks_median, chosen_nonred_segment_pair_ks_list, "first")


    logging.info(f"Saving the distribution of clustered medians [{subfolder}/{fcCluster._ANCHOR_CLUSTERS_MEDIANS.format(species)}]")
    # Plot the clusters of medians according to the used method
    fcCluster.plot_clusters_of_medians(medians_per_cluster, cluster_color_letter_list, x_max_lim, bin_list, species, latin_name, output)


    # Generate the plot for the mixed distribution with clusters of Ks
    fig_corr_first, ax_corr_first = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim,
                                    y_max_lim, "corrected", correction_table_available, plot_correction_arrows,
                                    paranome_data=False, colinearity_data=True)
    fig_uncorr_first, ax_uncorr_first = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim,
                                    y_max_lim, "un-corrected", correction_table_available, plot_correction_arrows,
                                    paranome_data=False, colinearity_data=True)

    # Plot the original complete anchor distribution in the background
    fcPlot.plot_histogram_for_anchor_clustering(ax_corr_first, anchor_ks_list, anchors_weights, bin_list, y_max_lim)
    fcPlot.plot_histogram_for_anchor_clustering(ax_uncorr_first, anchor_ks_list, anchors_weights, bin_list, y_max_lim)

    # Plot the clusters of anchor Ks and on top of them their KDEs
    clusters_sorted_by_median, cluster_color_letter_list = fcCluster.plot_clusters(ax_corr_first, cluster_of_ks, bin_width, max_ks_para, peak_stats, correction_table_available, plot_correction_arrows)
    fcCluster.plot_clusters(ax_uncorr_first, cluster_of_ks, bin_width, max_ks_para, peak_stats, correction_table_available, plot_correction_arrows)

    # Plotting the ortholog peaks coming from the adjustment, if available
    if correction_table_available:
        logging.info("Plotting divergence lines")
        fcPlot.plot_divergences(correction_table, peak_stats, consensus_peak_for_multiple_outgroups, ax_uncorr_first, ax_corr_first,
                                color_list, plot_correction_arrows)

    # -----------------------------------------------------------------------------

    # Removing clusters that are poorly populated (Ks content <= 10%), very old (cluster median >= 3 Ks) or too horizontally spread (IQR > 1.1 Ks)
    logging.info("")
    logging.info(f"Filtering away Ks clusters with unclear signal (poor Ks content, old Ks age or flat peak)...")
    clean_clusters_of_ks = fcCluster.filter_degenerated_clusters(cluster_of_ks, clusters_sorted_by_median, cluster_color_letter_list)

    # -----------------------------------------------------------------------------

    # SECOND round of clustering (only of medians coming from the good clusters)
    updated_n_clusters = len(clean_clusters_of_ks)
    if updated_n_clusters == n_clusters:
        logging.info("All clusters were retained")

        # Saving the first plot under the final name (dropping the "unfiltered" label)
        logging.info(f"Saving mixed Ks plot with anchor Ks clusters [{fcCluster._ANCHOR_CLUSTERS_FILTERED.format(species)}]")
        fcCluster.save_anchor_cluster_plot(fig_corr_first, fig_uncorr_first, ax_corr_first, ax_uncorr_first, species, latin_names, correction_table_available, cluster_of_ks, output, "second")
        logging.info("")

    elif updated_n_clusters == 0:
        logging.info("No clusters are left after filtering.")
        logging.info("")

    else:  # if one or more clusters were removed
        logging.info(f"Saving mixed Ks plot with unfiltered anchor Ks clusters [{fcCluster._ANCHOR_CLUSTERS_UNFILTERED.format(species)}]")
        fcCluster.save_anchor_cluster_plot(fig_corr_first, fig_uncorr_first, ax_corr_first, ax_uncorr_first, species, latin_names, correction_table_available, cluster_of_ks, output, "first")
        logging.info("")


        clean_medians_list = []
        clean_segment_medians_list = []

        for cluster in cluster_of_ks:
            if cluster in clean_clusters_of_ks:  # i.e. the cluster was not filtered out as degenerate
                clean_medians_list.extend(medians_per_cluster[cluster])
                clean_segment_medians_list.extend(segments_medians_per_cluster[cluster])
        clean_medians_list = array(clean_medians_list)
        
        # Clustering with GaussianMixtureModel, k-means or lognormal mixture modeling
        logging.info("")
        logging.info(f"Performing a second Ks clustering round with {updated_n_clusters} clusters through {clustering_method} for the remaining Ks values")
        if clustering_method == "Gaussian mixture modeling":
            gmm_clustered_medians2 = fcCluster.gmm(clean_medians_list, updated_n_clusters, max_EM_iterations, num_EM_initializations)
        elif clustering_method == "k-means":
            kmeans_clustered_medians2 = fcCluster.kmeans(clean_medians_list, updated_n_clusters)
        #elif clustering_method == "lognormal mixture modeling":
            # Temporary not available because pomegranate is not on midas:
            #lognormal_clustered_medians2 = fcCluster.lognormalmm(clean_medians_list, updated_n_clusters)

        filtered_cluster_of_ks, __, __, filtered_cluster_color_list = fcCluster.get_clusters_of_ks(gmm_clustered_medians2, clean_medians_list, clean_segment_medians_list, chosen_nonred_segment_pair_ks_list, "second")

        # Generate the plot for the mixed distribution with clusters of Ks
        fig_corr_second, ax_corr_second = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim,
                                     y_max_lim, "corrected", correction_table_available, plot_correction_arrows,
                                     paranome_data=False, colinearity_data=True)
        fig_uncorr_second, ax_uncorr_second = fcPlot.generate_mixed_plot_figure(latin_names.get(species), x_max_lim,
                                     y_max_lim, "un-corrected", correction_table_available, plot_correction_arrows,
                                     paranome_data=False, colinearity_data=True)
        
        # Plot the original complete anchor distribution in the background
        fcPlot.plot_histogram_for_anchor_clustering(ax_corr_second, anchor_ks_list, anchors_weights, bin_list, y_max_lim)
        fcPlot.plot_histogram_for_anchor_clustering(ax_uncorr_second, anchor_ks_list, anchors_weights, bin_list, y_max_lim)

        # Plot the clusters of anchor Ks and on top of them their KDEs
        fcCluster.plot_clusters(ax_corr_second, filtered_cluster_of_ks, bin_width, max_ks_para, peak_stats, correction_table_available, plot_correction_arrows)
        fcCluster.plot_clusters(ax_uncorr_second, filtered_cluster_of_ks, bin_width, max_ks_para, peak_stats, correction_table_available, plot_correction_arrows)

        # Plotting the ortholog peaks coming from the adjustment, if available
        if correction_table_available:
            logging.info("Plotting divergence lines")
            fcPlot.plot_divergences(correction_table, peak_stats, consensus_peak_for_multiple_outgroups, ax_uncorr_second,
                                    ax_corr_second, color_list, plot_correction_arrows)

        logging.info(f"Saving mixed Ks plot with filtered anchor Ks clusters [{fcCluster._ANCHOR_CLUSTERS_FILTERED.format(species)}]")
        logging.info("")
        fcCluster.save_anchor_cluster_plot(fig_corr_second, fig_uncorr_second, ax_corr_second, ax_uncorr_second, species, latin_names, correction_table_available, filtered_cluster_of_ks, output, "second")

    logging.info("All done")
Example #7
0
def plot_tree_rates(config_file, expert_config_file, correction_table_file,
                    nextflow_flag):
    # INPUT
    config = fcConf.Configuration(config_file, expert_config_file)
    init_logging(
        "Generating PDF of input tree with branch length equal to Ks distances",
        config.get_logging_level())
    logging.info("Loading parameters and input files")

    # GET PARAMETERS and INPUT FILES
    species = config.get_species()
    peak_db_path = config.get_ortho_db()
    newick_tree = config.get_newick_tree()  # as Tree object by ete3
    latin_names = config.get_latin_names()

    # Get correction results TSV file
    default_path_correction_table_file = os.path.join(
        "rate_adjustment", f"{species}",
        f"{_ADJUSTMENT_TABLE.format(species)}")
    correction_table_file = fcCheck.get_argument_path(
        correction_table_file, default_path_correction_table_file,
        "Rate-adjustment table file")
    if correction_table_file == "":  # it means that the correction_table is not present or available yet
        logging.warning(
            f"Rate-adjustment data not available yet: PDF figure of phylogenetic tree not generated."
        )
        logging.info(f"Exiting")
        sys.exit(1)  # exit 1 because the adjustment_table data is required
    else:
        with open(correction_table_file, "r") as f:
            correction_table = pandas.read_csv(f, sep="\t")
            species_in_correction_table = fcTree.counts_expected_line_number_in_correction_table(
                species, newick_tree, latin_names)
            missing_required_rates = set(species_in_correction_table) - set(correction_table["Sister_Species"])

            if correction_table.shape[0] == 0:
                logging.warning(
                    f"Rate-adjustment data not available yet: PDF figure of phylogenetic tree not generated."
                )
                logging.info(f"Exiting")
                sys.exit(
                    1)  # exit 1 because the adjustment_table data is required
            elif len(missing_required_rates) != 0:
                # A complete adjustment table is strictly required for building the tree,
                # because all the branch-specific Ks contributions it contains are needed.
                # Branch contributions from outside the table may additionally be required to fill in all the branch lengths,
                # but their absence is tolerated.
                logging.warning(
                    f"The branch-specific Ks contributions between {latin_names[species]} and the following species are missing from the rate-adjustment table. Please compute them before building the tree:"
                )
                for name in sorted(missing_required_rates):
                    logging.warning(f" - {name}")
                logging.warning(f"Exiting")
                sys.exit(
                    1)  # exit 1 because the adjustment_table data is required

    # Getting the choice on how to deal with the presence of multiple corrections for the same divergent pair
    # due to the use of multiple trios/outgroup during correction
    # Available options:
    #  - 'mean among outgroups': taking the average of the corrected peaks
    #  - 'best outgroup': taking the corrected peak coming from the best outgroup, which is the one with smallest OC segment
    consensus_peak_for_multiple_outgroups = config.get_consensus_peak_for_multiple_outgroups(
    )
    # Getting the statistical measure for how to determine the representative value of an ortholog distribution
    peak_stats = config.get_peak_stats(
    )  # default is mode (other option, median)

    # Loading the ortholog peak database for the tree picture: it can contain data to compute all the branch lengths.
    try:
        with open(peak_db_path, "r") as f:
            ortholog_db = pandas.read_csv(f, sep="\t", index_col=0)
            if ortholog_db.shape[0] == 0:
                logging.warning(
                    f"Ortholog peak database is present by doesn't contain any data, will be ignored when plotting the tree"
                )
                ortholog_db = pandas.DataFrame()
    except Exception:
        logging.warning(
            f"Ortholog peak database [{peak_db_path}] not found or empty, will be ignored when plotting the tree"
        )
        ortholog_db = pandas.DataFrame(
        )  # since there is no ortholog database, just assign an empty dataframe to this variable

    logging.info("")
    fcTree.plotting_tree(species, latin_names, newick_tree, correction_table,
                         consensus_peak_for_multiple_outgroups, ortholog_db,
                         peak_stats, nextflow_flag)
    logging.info("")
    logging.info(
        f"Saved PDF tree figure [{fcTree._TREE_BRANCH_DISTANCES.format(species)}]"
    )
    logging.info("")
    logging.info("All done")
Example #8
0
def setup_correction(config_file, expert_config_file, nextflow_flag):
    config = fcConf.Configuration(config_file, expert_config_file)

    init_logging("Setting up the analysis from configuration file",
                 config.get_logging_level())
    logging.info("Loading parameters and input files")

    # Check configfile
    species_of_interest = config.get_species()
    original_tree = config.get_newick_tree()
    fcTree.check_integrity_newick_tree(original_tree)
    tree = fcTree.reorder_tree_leaves(
        original_tree, species_of_interest)  # focal species is the top leaf
    latin_names = config.get_latin_names()
    divergence_colors = config.get_color_list()
    paranome = config.get_paranome()
    colinearity = config.get_colinearity()

    if not paranome and not colinearity:
        logging.error(
            'At least one of the "paranome" or "collinearity" parameters in the configuration file needs to be set to "yes".'
        )
        logging.error("Exiting.")
        sys.exit(1)

    db_path = config.get_ortho_db()
    ks_list_db_path = config.get_ks_db()
    max_num_outspecies = config.get_max_num_outspecies()
    try:
        max_num_outspecies = int(max_num_outspecies)
    except Exception:
        pass

    # Checking if the IDs in FASTA (and GFF) files are likely to raise an error when using the PAML package.
    # For example, PAML 4.4 gives a "Node" KeyError if an ID is longer than 50 characters,
    # contains special characters, or contains two consecutive spaces.
    logging.info(
        f"Checking if sequence data files exist and if sequence IDs are compatible with wgd pipeline..."
    )
    trigger_exit = False  # will trigger exit if FASTA or GFF files are missing or empty

    if colinearity:  # If a GFF is required, check existence and content
        gff = config.get_gff(species_of_interest)
        if fcCheck.check_file_nonexistent_or_empty(
                gff, "GFF file"):  # if missing/empty
            trigger_exit = True

    fasta_dict = config.get_fasta_dict()
    all_species = []
    for leaf in tree.get_leaves():
        all_species.append(leaf.name)
    for species in all_species:
        # Check existence and content of FASTA file
        fasta = config.get_fasta_name(fasta_dict, species)
        if fcCheck.check_file_nonexistent_or_empty(
                fasta, "FASTA file"):  # if missing/empty
            trigger_exit = True
            continue
        # Check the IDs in FASTA file
        if species == species_of_interest and colinearity:  # Warn about both FASTA and GFF files
            fcCheck.check_IDs(fasta, latin_names[species], gff)
        else:  # Warn only about FASTA file
            fcCheck.check_IDs(fasta, latin_names[species])

    if trigger_exit:
        logging.error(
            "Please add the missing information to the configuration file and rerun the analysis. Exiting."
        )
        sys.exit(1)
    logging.info("Completed")

    # Creating folders for correction output files
    if not os.path.exists('rate_adjustment'):
        os.mkdir('rate_adjustment')
    if not os.path.exists(
            os.path.join("rate_adjustment", f"{species_of_interest}")):
        os.mkdir(os.path.join("rate_adjustment", f"{species_of_interest}"))
        logging.info(
            f"Creating output folder [rate_adjustment/{species_of_interest}]")

    # -----------------------------------------------------------------------------

    # 1) FINDING SISTER AND OUTGROUP SPECIES FOR EACH NODE OF INTEREST

    logging.info("")
    logging.info(
        f"Extracting ortholog trios for rate-adjustment [ortholog_trios_{species_of_interest}.tsv]"
    )

    if isinstance(max_num_outspecies, int):
        logging.info(
            f"- Each divergent species pair will be rate-adjusted with at maximum {max_num_outspecies} "
            f"trios by using the closest outspecies")
        logging.info(
            f"  (as required in configuration file field 'maximum_number_outspecies')"
        )
    else:
        logging.info(
            f"- Each divergent species pair will be rate-adjusted by using all the possible outspecies "
            f"found in the tree.")

    # get tree node object of the focal species
    species_of_interest_node = fcTree.get_species_node(species_of_interest,
                                                       tree)
    # get the list of ancestors (as tree node objects) in the lineage that leads to the focal species
    sp_history = fcTree.get_species_history(species_of_interest_node)
    # Checking if the focal species has at least one outgroup in the provided tree
    if len(sp_history) - 2 == 0:
        logging.error("")
        logging.error(
            f"Species [{species_of_interest}] has no outgroup in the provided Newick tree "
            f"and the rate-adjustment can't be performed.")
        logging.error(
            f"Please add at least one outgroup species or change the focal species."
        )
        sys.exit(1)

    # Obtaining the numeric labels for internal nodes relevant in the species analysis
    fcTree.labeling_internal_nodes(species_of_interest_node)
    # If the amount of colors provided for the divergence lines in the config file
    # is insufficient for the number of divergence nodes in the tree, exit
    num_required_colors = sp_history[-2].name
    if len(divergence_colors) < num_required_colors:
        logging.error("")
        logging.error(
            f'Configuration file field "divergence_colors" is missing {num_required_colors - len(divergence_colors)} color(s) '
            +
            f"out of {num_required_colors} required for the analysis on focal species [{species_of_interest}]"
        )
        logging.error("Please add the missing color(s) and rerun the analysis")
        logging.error("Exiting.")
        sys.exit(1)

    trios_array = []  # list of trios
    outfile_drawing_path = os.path.join("rate_adjustment",
                                        f"{species_of_interest}",
                                        f"tree_{species_of_interest}.txt")
    with open(outfile_drawing_path, "w+") as outfile_drawing:
        outfile_drawing.write(f"Focal species: {species_of_interest}\n\n")

        node = 0
        while node < len(sp_history) - 2:
            # the name label to be shown in the ASCII tree will start from 1 and not from 0
            outfile_drawing.write(f"Node {node+1}:\n")
            currentnode = sp_history[node]

            # GETTING SISTER SPECIES
            sisters = fcTree.get_sister_species_of_a_node(currentnode)
            outfile_drawing.write(
                f"Sister species:      {', '.join(sisters)}\n")

            # GETTING OUTSPECIES
            outspecies = fcTree.get_outspecies_of_a_node(
                currentnode, max_num_outspecies)
            outfile_drawing.write(
                f"Outgroup species:    {', '.join(outspecies)}\n\n")

            # APPENDING TRIOS (a trio is composed of focal species, sister species and outgroup species)
            for s in sisters:
                for o in outspecies:
                    trios_array.append([node + 1, species_of_interest, s, o])
            node += 1

        print_tree = tree.get_ascii(attributes=["name"], show_internal=True)
        outfile_drawing.write(f"{print_tree}\n\n")

    logging.info(f"- Total number of trios: {len(trios_array)}")
    logging.info("")

    # Generate trios DataFrame from trios array
    trios_df = DataFrame.from_records(
        trios_array,
        columns=["Node", "Focal_Species", "Sister_Species", "Out_Species"])
    outfile_trios_path = os.path.join(
        "rate_adjustment", f"{species_of_interest}",
        f"ortholog_trios_{species_of_interest}.tsv")
    with open(outfile_trios_path, "w+") as outfile:
        outfile.write(trios_df.to_csv(sep="\t", index=False))

    # -----------------------------------------------------------------------------

    # 2) FINDING UNKNOWN PAIRS FOR wgd ORTHOLOG RUNS
    logging.info(
        f"Extracting species pairs for wgd ortholog pipeline [ortholog_pairs_{species_of_interest}.tsv]"
    )
    if isinstance(max_num_outspecies, int):
        logging.info(
            f"- Only species pairs required by the selected trios will be considered."
        )

    # Collect only the species pairs that are necessary for the trios
    species_pairs = []
    for trio in trios_array:
        combos = [[trio[1], trio[2]], [trio[1], trio[3]], [trio[2], trio[3]]]
        for pair in combos:
            if pair not in species_pairs:
                species_pairs.append(pair)

    species_pairs = [sorted(x, key=str.casefold) for x in species_pairs]
    species_pairs_unknown = []

    # Getting possible extra species pairs whose ortholog data needs to be computed in order to
    # be able to fill in all branch lengths in the tree figure
    missing_pairs_with_latin_names, missing_pairs = fcTree.find_missing_pairs_for_tree_rates(
        tree, species_of_interest_node, sp_history, latin_names)
    # Adding these species pairs to the list obtained from the trios
    species_pairs.extend(missing_pairs)

    tags_list = []
    missing_latin_names = []
    flag_exit = False
    for pair in species_pairs:
        sp1, sp2 = pair[0], pair[1]
        try:
            latinSp1 = latin_names[sp1]
        except KeyError:
            if sp1 not in missing_latin_names:
                missing_latin_names.append(sp1)
                logging.error(
                    f"Latin name for [{sp1}] not found in the configuration file"
                )
                flag_exit = True
        try:
            latinSp2 = latin_names[sp2]
        except KeyError:
            if sp2 not in missing_latin_names:
                missing_latin_names.append(sp2)
                logging.error(
                    f"Latin name for [{sp2}] not found in the configuration file"
                )
                flag_exit = True
        try:
            latin_tag = "_".join(sorted([latinSp1, latinSp2],
                                        key=str.casefold))
            tags_list.append([latin_tag, [sp1, sp2]])
        except Exception:
            pass
    if flag_exit:  # exits if one or more scientific names are missing
        logging.error("Exiting")
        sys.exit(1)

    try:
        with open(db_path, "r") as f:
            db = pandas.read_csv(f, sep="\t", index_col=0)
            for tags in tags_list:
                if tags[0] not in db.index:
                    species_pairs_unknown.append([tags[1][0], tags[1][1]])
            no_db_file = False
    except Exception:
        no_db_file = True

    try:
        with open(ks_list_db_path, "r") as f:
            ks_list_db = pandas.read_csv(f, sep="\t", index_col=0)
            # When imported from csv format, all the Ks lists in the df are read as
            # plain text (strings) and must be converted back to value lists
            ks_list_db.loc[:, 'Ks_Values'] = ks_list_db.loc[:,
                                                            'Ks_Values'].apply(
                                                                literal_eval)
            for tags in tags_list:
                if tags[0] not in ks_list_db.index:
                    if [tags[1][0], tags[1][1]] not in species_pairs_unknown:
                        species_pairs_unknown.append([tags[1][0], tags[1][1]])
        no_ks_list_db_file = False
    except Exception:
        no_ks_list_db_file = True

    if no_db_file and not no_ks_list_db_file:
        logging.info(
            "Ortholog peak database empty or not found at the path provided in the configuration file."
        )
        logging.info(
            "All of the species pairs will be used to compute their ortholog distribution and its peak."
        )
        species_pairs_unknown = species_pairs.copy()
    elif not no_db_file and no_ks_list_db_file:
        logging.info(
            "Ortholog Ks list database empty or not found at the path provided in the configuration file."
        )
        logging.info(
            "All of the species pairs will be used to compute their ortholog distribution and its peak."
        )
        species_pairs_unknown = species_pairs.copy()
    elif no_db_file or no_ks_list_db_file:
        logging.info(
            "Ortholog databases empty or not found at the paths provided in the configuration file."
        )
        logging.info(
            "All of the species pairs will be used to compute their ortholog distribution and its peak."
        )
        species_pairs_unknown = species_pairs.copy()
    else:  # False and False
        pass

    logging.info(
        f"- Total number of species pairs not in database(s): {len(species_pairs_unknown)}"
    )
    logging.info("")

    species_pairs_unknown_df = DataFrame.from_records(
        species_pairs_unknown, columns=["Species1", "Species2"])
    outfile_pairs_path = os.path.join(
        "rate_adjustment", f"{species_of_interest}",
        f"ortholog_pairs_{species_of_interest}.tsv")
    with open(outfile_pairs_path, "w+") as outfile_pairs:
        outfile_pairs.write(
            species_pairs_unknown_df.to_csv(sep="\t", index=False))

    # -----------------------------------------------------------------------------

    # 3) PLOTTING THE ORIGINAL UN-CORRECTED TREE in PDF FORMAT
    logging.info(
        f"Plotting input phylogenetic tree [{fcTree._TREE.format(species_of_interest)}]"
    )
    logging.info("")
    fcTree.plot_uncorrected_phylogeny(tree, species_of_interest, latin_names,
                                      sp_history)

    # -----------------------------------------------------------------------------

    # 4) IF OUTSIDE NEXTFLOW PIPELINE:
    # Generating an output file containing the list of wgd runs that needs to be done by wgd_orthologs.py
    # By default it is assumed no Nextflow and the list of commands will be created
    if not nextflow_flag:
        logging.info(
            f"Generating the list of wgd paralog and ortholog runs to be manually executed "
            f"[wgd_runs_{species_of_interest}.txt]")
        logging.info("")
        with open(os.path.join(f"wgd_runs_{species_of_interest}.txt"),
                  "w+") as wgd_runs:
            default_threads = 1
            wgd_runs.write(
                f"# Note: the number of threads can be increased depending on the number of cores [default: {default_threads}]\n\n"
            )
            wgd_runs.write(
                f"ksrates paralogs-ks config_{species_of_interest}.txt --n-threads={default_threads}\n"
            )
            for pair in species_pairs_unknown:
                sp1, sp2 = pair[0], pair[1]
                wgd_runs.write(
                    f"ksrates orthologs-ks config_{species_of_interest}.txt {sp1} {sp2} "
                    f"--n-threads={default_threads}\n")

    logging.info("All done")
Example #9
0
def compute_peaks(config_file, expert_config_file, ortholog_pairs_file):
    # INPUT
    config = fcConf.Configuration(config_file, expert_config_file)
    init_logging("Computing ortholog distribution peaks with related error",
                 config.get_logging_level())
    logging.info("Loading parameters and input files")

    # Get parameters and input files
    species = config.get_species()
    tree = config.get_newick_tree()
    latin_names = config.get_latin_names()
    max_ks_ortho = config.get_max_ks_ortho()
    n_iter = config.get_num_iteration()
    bin_width_ortho = config.get_bin_width_ortho()
    x_lim_ortho = config.get_x_lim_ortho()

    default_path_ortholog_pairs_file = os.path.join(
        "rate_adjustment", f"{species}", f"ortholog_pairs_{species}.tsv")
    ortholog_pairs_file = fcCheck.get_argument_path(
        ortholog_pairs_file, default_path_ortholog_pairs_file,
        "Ortholog pairs file")
    if ortholog_pairs_file == "":
        logging.error(
            f"Ortholog pairs file not found at default position [{default_path_ortholog_pairs_file}]."
        )
        logging.error("Exiting")
        sys.exit(1)
    with open(ortholog_pairs_file, "r") as f:
        header = f.readline()  # skip the header line
        ortholog_pairs = f.readlines()

    db_path = config.get_ortho_db()
    ks_list_db_path = config.get_ks_db()

    try:
        with open(db_path, "r") as f:
            db = pandas.read_csv(f, sep="\t", index_col=0)
    except Exception:
        logging.warning(
            f"Ortholog peak database [{db_path}] not found or empty: a new one is now generated."
        )

        db = DataFrame()
        with open(db_path, "w+") as outfile:
            outfile.write('\tSpecies1\tSpecies2\tMode\tMode_SD\n')

    try:
        with open(ks_list_db_path, "r") as f:
            ks_list_db = pandas.read_csv(f, sep="\t", index_col=0)
            # When imported from csv format, all the Ks lists in the df are read as
            # plain text (strings) and must be converted back to value lists
            ks_list_db.loc[:, 'Ks_Values'] = ks_list_db.loc[:,
                                                            'Ks_Values'].apply(
                                                                literal_eval)
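            # e.g. the string "[0.3, 0.35, 0.41]" becomes the list [0.3, 0.35, 0.41]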
    except Exception:
        logging.warning(
            f"Ortholog Ks list database [{ks_list_db_path}] not found or empty: a new one is now generated."
        )

        ks_list_db = DataFrame()
        with open(ks_list_db_path, "w+") as outfile:
            outfile.write('\tSpecies1\tSpecies2\tKs_Values\n')

    logging.info("")

    # -----------------------------------------------------------------------------

    if len(ortholog_pairs) == 0:
        logging.info(
            f"There are no ortholog species pairs listed in [{ortholog_pairs_file}]."
        )
        logging.info(f"Nothing to do.")
        sys.exit(
            0
        )  # exit code 0 because no actual errors were thrown (the pair list
        # is empty if such ortholog data were already obtained in previous runs)

    # Checking that the ortholog pairs file has the correct format and valid species names
    format_error, species_name_error = False, False
    for pair_string in ortholog_pairs:
        # File parsing supports a tab separator (inserted by the script that generates the file, init.py),
        # but also accepts a single space in case the user wrote the file manually.
        if len(pair_string.rstrip().split("\t")) == 2:
            pair = pair_string.rstrip().split("\t")
        elif len(pair_string.rstrip().split(" ")) == 2:
            pair = pair_string.rstrip().split(" ")
        else:
            format_error = True
            continue
        # good format, now let's check if the species names are valid
        if pair[0] not in tree.get_leaf_names(
        ) or pair[1] not in tree.get_leaf_names():
            species_name_error = True

    if format_error:
        logging.error(
            f"Format error in [{ortholog_pairs_file}]. Please check that each line contains two species names separated by a tabular space or by a single space."
        )
    if species_name_error:
        logging.error(
            f"One or more invalid (misspelled?) species names are listed in [{ortholog_pairs_file}]"
        )
    if format_error or species_name_error:
        logging.error(f"Exiting")
        sys.exit(1)

    # CHECKING FOR WHICH SPECIES PAIRS THE ORTHOLOG PEAK IS MISSING OR THE KS LIST IS MISSING

    list_of_compute_peak_results = []  # collects a failure flag for each peak computation
    for pair_string in ortholog_pairs:
        if len(pair_string.rstrip().split("\t")) == 2:  # e.g. "sp1\tsp2" -> ['sp1', 'sp2']
            pair = pair_string.rstrip().split("\t")
        elif len(pair_string.rstrip().split(" ")) == 2:
            pair = pair_string.rstrip().split(" ")
        pair.sort(key=str.lower)
        sp1, sp2 = pair[0], pair[1]
        latin_name_list = sorted([latin_names[sp1], latin_names[sp2]],
                                 key=str.casefold)
        latinSp1, latinSp2 = latin_name_list[0], latin_name_list[1]

        # Checking if there are temporary folders in the wgd folder of the two species (if so, their peak in the DB or the extracted Ks list could be incomplete)
        if os.path.isdir(
                os.path.join("ortholog_distributions", f"wgd_{sp1}_{sp2}",
                             f"{sp1}_{sp2}.ks_tmp")) or os.path.isdir(
                                 os.path.join("ortholog_distributions",
                                              f"wgd_{sp1}_{sp2}",
                                              f"{sp1}_{sp2}.blast_tmp")):
            logging.warning(
                f"One or more temporary folders have been found in [ortholog_distributions/wgd_{sp1}_{sp2}]: the ortholog data may be incomplete!"
            )
            logging.warning(
                "It is advised to delete the temporary folders and the file that was being created (.blast.tsv or .ks.tsv) and re-compute the data."
            )
            logging.warning("")

        if f"{latinSp1}_{latinSp2}" not in db.index:  # flag for presence/absence of species pair in the ortholog peak database
            flag_not_in_peak_db = True
        else:
            flag_not_in_peak_db = False

        if f"{latinSp1}_{latinSp2}" not in ks_list_db.index:  # same but for ks list database
            flag_not_in_ks_db = True
        else:
            flag_not_in_ks_db = False

        if flag_not_in_peak_db is False and flag_not_in_ks_db is False:
            logging.info(
                f"{latinSp1} and {latinSp2}: data already present in ortholog database"
            )

        if flag_not_in_peak_db or flag_not_in_ks_db:  # if the pair is missing in the ortholog peak or ks databases
            compute_peak_failed_flag = fcPeak.estimate_peak(
                sp1,
                sp2,
                latinSp1,
                latinSp2,
                max_ks_ortho,
                n_iter,
                x_lim_ortho,
                bin_width_ortho,
                ks_list_db_path,
                db_path,
                flag_not_in_peak_db=flag_not_in_peak_db,
                flag_not_in_ks_db=flag_not_in_ks_db)
            list_of_compute_peak_results.append(compute_peak_failed_flag)
        logging.info("")

    if True in list_of_compute_peak_results:
        logging.warning(
            f"Number of failed peak computations: {list_of_compute_peak_results.count(True)}"
        )
        logging.warning("")
    logging.info("All done")
Example #10
0
def plot_paralogs_distr(config_file, expert_config_file, correction_table_file,
                        paralog_tsv_file, anchors_ks_tsv_file):
    # INPUT
    config = fcConf.Configuration(config_file, expert_config_file)
    init_logging("Generating mixed paralog and ortholog distributions",
                 config.get_logging_level())
    logging.info("Loading parameters and input files")

    # GET PARAMETERS and INPUT FILES
    species = config.get_species()
    latin_names = config.get_latin_names()
    # Get analysis type
    paranome_analysis = config.get_paranome()
    colinearity_analysis = config.get_colinearity()
    if not colinearity_analysis and not paranome_analysis:
        logging.error(
            'At least one of the "paranome" or "collinearity" parameters in the configuration file needs to be set to "yes".'
        )
        logging.error("Exiting.")
        sys.exit(1)

    # Get paralog and anchors TSV files
    # If a Ks file is not required, it will be equal to "None"
    # If the required input Ks file (paranome or anchor pairs or both) is missing, its path will be equal to an empty string ("") and the script will exit
    if paranome_analysis:
        default_path_paralog_tsv_file = os.path.join("paralog_distributions",
                                                     f"wgd_{species}",
                                                     f"{species}.ks.tsv")
        paralog_tsv_file = fcCheck.get_argument_path(
            paralog_tsv_file, default_path_paralog_tsv_file,
            "Paralog Ks TSV file")
        if paralog_tsv_file == "":
            logging.error(
                f"Paralog Ks TSV file not found at default position [{default_path_paralog_tsv_file}]."
            )
    if colinearity_analysis:
        default_path_anchors_tsv_file = os.path.join(
            "paralog_distributions", f"wgd_{species}",
            f"{species}.ks_anchors.tsv")
        anchors_ks_tsv_file = fcCheck.get_argument_path(
            anchors_ks_tsv_file, default_path_anchors_tsv_file,
            "Anchor pair Ks TSV file")
        if anchors_ks_tsv_file == "":
            logging.error(
                f"Anchor pair Ks TSV file not found at default position [{default_path_anchors_tsv_file}]."
            )
    if paralog_tsv_file == "" or anchors_ks_tsv_file == "":
        logging.error("Exiting")
        sys.exit(1)

    # Creating folders for output files
    output_folder = os.path.join("rate_adjustment", f"{species}")
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
        logging.info(f"Generating output folder [{output_folder}]")

    # Get correction results TSV file
    # If correction_table is (still) missing, it will be equal to empty string (""), but the script will not exit
    default_path_correction_table_file = os.path.join(
        output_folder, f"{_ADJUSTMENT_TABLE.format(species)}")
    correction_table_file = fcCheck.get_argument_path(
        correction_table_file, default_path_correction_table_file,
        "Rate-adjustment table file")
    if correction_table_file == "":  # it means that the correction_table is not present or available yet
        logging.warning(
            "Rate-adjustment data are not available yet, only paralog distribution will be plotted."
        )
        correction_table_available = False
    else:
        with open(correction_table_file, "r") as f:
            correction_table = pandas.read_csv(f, sep="\t")
            if correction_table.shape[0] == 0:
                logging.warning(
                    f"Rate-adjustment table file is present but doesn't contain any data: rate-adjusted divergences will not be plotted."
                )
                correction_table_available = False
            else:
                correction_table_available = True

    # Getting the choice on how to deal with the presence of multiple corrections for the same divergent pair
    # due to the use of multiple trios/outgroup during correction
    # Available options:
    #  - 'mean among outgroups': taking the average of the corrected peaks
    #  - 'best outgroup': taking the corrected peak coming from the best outgroup, which is the one with smallest OC segment
    consensus_peak_for_multiple_outgroups = config.get_consensus_peak_for_multiple_outgroups(
    )

    # Get other parameters
    max_ks_para = config.get_max_ks_para()
    bin_width_para = config.get_bin_width_para()
    bin_list = fcPlot.get_bins(max_ks_para, bin_width_para)
    x_max_lim = config.get_x_max_lim()
    y_lim = config.get_y_lim()  # by default it's "None"
    color_list = config.get_color_list(
    )  # colors for vertical lines depicting speciation events
    kde_bandwidth_modifier = config.get_kde_bandwidth_modifier(
    )  # get the modifier to adjust KDE fitting
    plot_correction_arrows = config.plot_correction_arrows()
    peak_stats = config.get_peak_stats(
    )  # default is mode (other option, median)

    # -----------------------------------------------------------------------------

    # GENERATING MIXED DISTRIBUTION PLOT
    logging.info("")
    analysis_type = 'both'
    if not colinearity_analysis:
        analysis_type = 'paranome'
        logging.info(
            f"Plotting paranome Ks distribution for species [{species}]")
    elif not paranome_analysis:
        analysis_type = 'colinearity'
        logging.info(
            f"Plotting anchor pair Ks distribution for species [{species}]")
    else:
        logging.info(
            f"Plotting paranome and anchor pairs Ks distributions for species [{species}]"
        )

    # PLOTTING THE BACKGROUND PARALOG AND/OR ANCHOR DISTRIBUTION
    fig_uncorr, ax_uncorr = fcPlot.generate_mixed_plot_figure(
        latin_names.get(species),
        x_max_lim,
        y_lim,
        "un-corrected",
        correction_table_available,
        plot_correction_arrows,
        paranome_data=paranome_analysis,
        colinearity_data=colinearity_analysis)
    fig_corr, ax_corr = fcPlot.generate_mixed_plot_figure(
        latin_names.get(species),
        x_max_lim,
        y_lim,
        "corrected",
        correction_table_available,
        plot_correction_arrows,
        paranome_data=paranome_analysis,
        colinearity_data=colinearity_analysis)

    if paranome_analysis:
        paranome_list, paranome_weights = fc_extract_ks_list.ks_list_from_tsv(
            paralog_tsv_file, max_ks_para, "paralogs")
        hist_paranome = fcPlot.plot_histogram("Whole-paranome",
                                              ax_uncorr,
                                              paranome_list,
                                              bin_list,
                                              bin_width_para,
                                              max_ks_para,
                                              kde_bandwidth_modifier,
                                              weight_list=paranome_weights)
        fcPlot.plot_histogram("Whole-paranome",
                              ax_corr,
                              paranome_list,
                              bin_list,
                              bin_width_para,
                              max_ks_para,
                              kde_bandwidth_modifier,
                              weight_list=paranome_weights)

    if colinearity_analysis:
        anchors_list, anchors_weights = fc_extract_ks_list.ks_list_from_tsv(
            anchors_ks_tsv_file, max_ks_para, "anchor pairs")
        if len(anchors_list) == 0:
            logging.warning(
                f"No anchor pairs found! Maybe check your (gene) IDs between "
                f"the [{species}.ks_anchors.tsv] file and the [{species}.ks.tsv] files."
            )
        # For now use an unweighted histogram (note that the anchor-pair weight list is still passed below)
        hist_anchors = fcPlot.plot_histogram(
            "Anchor pairs",
            ax_uncorr,
            anchors_list,
            bin_list,
            bin_width_para,
            max_ks_para,
            kde_bandwidth_modifier,
            color=fcPlot.COLOR_ANCHOR_HISTOGRAM,
            weight_list=anchors_weights)
        fcPlot.plot_histogram("Anchor pairs",
                              ax_corr,
                              anchors_list,
                              bin_list,
                              bin_width_para,
                              max_ks_para,
                              kde_bandwidth_modifier,
                              color=fcPlot.COLOR_ANCHOR_HISTOGRAM,
                              weight_list=anchors_weights)

    # Setting plot height based on tallest histogram (if paranome analysis is on, it will come from that distribution)
    if y_lim is None:
        if paranome_analysis:
            fcPlot.set_mixed_plot_height(ax_uncorr, y_lim, hist_paranome)
            fcPlot.set_mixed_plot_height(ax_corr, y_lim, hist_paranome)
        else:
            fcPlot.set_mixed_plot_height(ax_uncorr, y_lim, hist_anchors)
            fcPlot.set_mixed_plot_height(ax_corr, y_lim, hist_anchors)

    # PLOTTING THE ORTHOLOG DIVERGENCE LINES on the paralog distribution
    if correction_table_available:
        logging.info("Plotting ortholog divergence lines in the mixed plot")
        fcPlot.plot_divergences(correction_table, peak_stats,
                                consensus_peak_for_multiple_outgroups,
                                ax_uncorr, ax_corr, color_list,
                                plot_correction_arrows)

    logging.info("")
    logging.info(
        f"Saving PDF figures of mixed plots [{fcPlot._MIXED_ADJUSTED_PLOT_FILENAME.format(species)}, {fcPlot._MIXED_UNADJUSTED_PLOT_FILENAME.format(species)}]"
    )
    fcPlot.save_mixed_plot(fig_corr,
                           fig_uncorr,
                           ax_corr,
                           ax_uncorr,
                           species,
                           correction_table_available,
                           paranome=paranome_analysis,
                           colinearity=colinearity_analysis)

    logging.info("")
    logging.info("All done")
Example #11
0
def plot_orthologs_distr(config_file, expert_config_file, trios_file):
    # INPUT
    config = fcConf.Configuration(config_file, expert_config_file)
    init_logging("Plotting ortholog distributions for all ortholog trios", config.get_logging_level())
    logging.info("Loading parameters and input files")

    # Get parameters from configuration file
    species_of_interest = config.get_species()
    latin_names = config.get_latin_names()
    max_ks_ortho = config.get_max_ks_ortho()
    bin_width_ortho = config.get_bin_width_ortho()
    bin_list_ortho = fcPlot.get_bins(max_ks_ortho, bin_width_ortho)
    x_lim = config.get_x_lim_ortho()

    # Get input file listing the trios
    default_path_trios_file = os.path.join("rate_adjustment", f"{species_of_interest}", f"ortholog_trios_{species_of_interest}.tsv")
    trios_file = fcCheck.get_argument_path(trios_file, default_path_trios_file, "Trios TSV file")
    if trios_file == "":
        logging.error(f"Trios TSV file not found at default position [{default_path_trios_file}]")
        logging.error("Exiting")
        sys.exit(1)
    with open(trios_file, 'r') as f1:
        trios = pandas.read_csv(f1, sep="\t")

    # Get the ortholog Ks list database (to plot histograms; mandatory input file)
    ks_list_db_path = config.get_ks_db()
    fcCheck.check_inputfile(ks_list_db_path, "Ortholog Ks list database")
    with open(ks_list_db_path, 'r') as f2:
        ks_list_db = pandas.read_csv(f2, sep="\t", index_col=0)

    # Get the ortholog peak database (to plot distribution mode and median; not mandatory)
    db_path = config.get_ortho_db()
    no_peak_db = False
    try:
        with open(db_path, 'r') as f3:
            db = pandas.read_csv(f3, sep="\t", index_col=0)
    except Exception:
        no_peak_db = True
        logging.warning(f"Ortholog Ks peak database empty or not found at the path provided in the config file: distribution peaks will not be shown")

    # -----------------------------------------------------------------------------

    # GENERATING PDF FIGURE with ortholog distributions FOR EACH TRIO
    outgroups_per_divergent_pair_dict = {}
    missing_pairs_ks_list, missing_pairs_peaks = [], []

    for __, row in trios.iterrows():
        species, sister, out = row['Focal_Species'], row['Sister_Species'], row['Out_Species']
        # Generate dictionary of divergent pairs linked with their outgroups
        divergent_pair_key = f"{species}_{sister}"
        outgroups_per_divergent_pair_dict.setdefault(divergent_pair_key, []).append(out)


    # PLOTTING THE DISTRIBUTIONS
    for divergent_pair in outgroups_per_divergent_pair_dict:

        species, sister = divergent_pair.split("_")[0], divergent_pair.split("_")[1]
        latinSpecies, latinSister = latin_names[species], latin_names[sister]
        # Tags (sorted names, e.g. A.filiculoides_S.cucullata)
        species_sister = "_".join(sorted([latinSpecies, latinSister], key=str.casefold))

        out_list = outgroups_per_divergent_pair_dict[divergent_pair]
        available_trios, unavailable_trios = [], []
        for out in out_list:  # Check if all data are available for this trio
            latinOut = latin_names[out]
            species_out = "_".join(sorted([latinSpecies, latinOut], key=str.casefold))
            sister_out = "_".join(sorted([latinSister, latinOut], key=str.casefold))

            available_data = True
            for pair in [species_sister, species_out, sister_out]:
                if pair not in list(ks_list_db.index):
                    available_data = False
                    if pair not in missing_pairs_ks_list:
                        missing_pairs_ks_list.append(pair)
                if not no_peak_db:  # If ortholog Ks peak database is available
                    if pair not in list(db.index):
                        available_data = False
                        if pair not in missing_pairs_peaks:
                            missing_pairs_peaks.append(pair)

            if available_data:
                available_trios.append(out)
            else:
                unavailable_trios.append(out)

        if len(available_trios) == 0:
            logging.info("")
            logging.info(f"Plotting ortholog Ks distributions for species pair [{latinSpecies} - {latinSister}]")
            logging.warning(f"- Skipping all outspecies: not enough ortholog data available (PDF figure not generated)")
            continue

        with PdfPages(os.path.join("rate_adjustment", f"{species_of_interest}", f"orthologs_{divergent_pair}.pdf")) as pdf:
            logging.info("")
            logging.info(f"Plotting ortholog Ks distributions for species pair [{latinSpecies} - {latinSister}]")

            # SPECIES - SISTER
            ks_list_species_sister = literal_eval(ks_list_db.at[species_sister, 'Ks_Values'])
            # Getting 20 KDE curves through bootstrap
            bootstrap_kde_species_sister = fcPeak.bootstrap_KDE(ks_list_species_sister, 20, x_lim, bin_width_ortho)

            for out in unavailable_trios:
                latinOut = latin_names[out]
                logging.warning(f"- Skipping outspecies [{latinOut}]: not enough ortholog data available")

            for out in available_trios:
                latinOut = latin_names[out]
                logging.info(f"- Using outspecies [{latinOut}]:")
                fig, axes = fcPlot.generate_orthologs_figure(latinSpecies, latinSister, latinOut, x_lim)

                # tags, e.g. A.filiculoides_S.cucullata
                species_out = "_".join(sorted([latinSpecies, latinOut], key=str.casefold))
                sister_out = "_".join(sorted([latinSister, latinOut], key=str.casefold))

                # SPECIES - SISTER
                # Plotting Ks lists and their KDE lines
                logging.info(f"  Plotting data for the two sister species [{latinSpecies} - {latinSister}]")
                fcPlot.plot_orthologs_histogram_kdes(ks_list_species_sister, bin_list_ortho, axes[0],
                                                            bootstrap_kde_species_sister)

                # SPECIES - OUTGROUP
                ks_list = literal_eval(ks_list_db.at[species_out, 'Ks_Values'])
                # Getting 20 KDE curves through bootstrap
                logging.info(f"  Plotting data for focal species and outspecies [{latinSpecies} - {latinOut}]")
                bootstrap_kde = fcPeak.bootstrap_KDE(ks_list, 20, x_lim, bin_width_ortho)
                # Plotting Ks lists and their KDE lines
                fcPlot.plot_orthologs_histogram_kdes(ks_list, bin_list_ortho, axes[1], bootstrap_kde)

                # SISTER - OUTGROUP
                ks_list = literal_eval(ks_list_db.at[sister_out, 'Ks_Values'])
                # Getting 20 KDE curves through bootstrap
                logging.info(f"  Plotting data for sister species and outspecies [{latinSister} - {latinOut}]")
                bootstrap_kde = fcPeak.bootstrap_KDE(ks_list, 20, x_lim, bin_width_ortho)
                # Plotting Ks lists and their KDE lines
                fcPlot.plot_orthologs_histogram_kdes(ks_list, bin_list_ortho, axes[2], bootstrap_kde)

                # Plotting estimated mode of the orthologs distributions as vertical lines
                y_upper_lim = axes[0].get_ylim()[1] 
                if not no_peak_db:  # If ortholog Ks peak database is available
                    fcPlot.plot_orthologs_peak_lines(db, species_sister, axes[0], y_upper_lim)
                    fcPlot.plot_orthologs_peak_lines(db, species_out, axes[1], y_upper_lim)
                    fcPlot.plot_orthologs_peak_lines(db, sister_out, axes[2], y_upper_lim)

                pdf.savefig(fig, transparent=True, bbox_extra_artists=(fig._suptitle,), bbox_inches='tight')
                plt.close()
        logging.info(f"- Saving PDF figure [orthologs_{divergent_pair}.pdf]")

    # Report if species are missing from any of the two ortholog databases
    if len(missing_pairs_ks_list) != 0 or len(missing_pairs_peaks) != 0:
        logging.warning("")
        logging.warning("The species pairs listed below are not (yet) available in the ortholog databases")
        logging.warning("The trios involving such species pairs have not been plotted")
        logging.warning("")

        missing_in_both_dbs = list((set(missing_pairs_peaks) & set(missing_pairs_ks_list)))
        if len(missing_in_both_dbs) != 0:
            logging.warning("Species pairs not yet available in both Ks peak and Ks list ortholog databases:")
            for pair in sorted(missing_in_both_dbs):
                logging.warning(f"  {pair.split('_')[0]} - {pair.split('_')[1]}")
            logging.warning("")

        missing_pairs_peaks = list(set(missing_pairs_peaks) - set(missing_in_both_dbs))
        if len(missing_pairs_peaks) != 0:
            logging.warning("Species pairs not yet available in the ortholog Ks peak database:")
            for pair in sorted(missing_pairs_peaks):
                logging.warning(f"  {pair.split('_')[0]} - {pair.split('_')[1]}")
            logging.warning("")

        missing_pairs_ks_list = list(set(missing_pairs_ks_list) - set(missing_in_both_dbs))
        if len(missing_pairs_ks_list) != 0:
            logging.warning("Species pairs not yet available in the ortholog Ks list database:")
            for pair in sorted(missing_pairs_ks_list):
                logging.warning(f"  {pair.split('_')[0]} - {pair.split('_')[1]}")
            logging.warning("")

        logging.warning("Please compute their ortholog Ks data and/or add the ortholog data to the databases,")
        logging.warning("then rerun this step.")

    logging.info("")
    logging.info("All done")