Example #1
        "--tip-attributes",
        required=True,
        help="tab-delimited file describing tip attributes at all timepoints")
    parser.add_argument("--timepoint", help="current timepoint", required=True)
    parser.add_argument(
        "--attribute-names",
        nargs="+",
        help="names of attributes for tips to export to node data JSON",
        required=True)
    parser.add_argument("--output",
                        help="JSON file with fitness information by node",
                        required=True)
    args = parser.parse_args()

    # Load tip attributes
    tips = pd.read_csv(args.tip_attributes,
                       sep="\t",
                       parse_dates=["timepoint"])

    timepoint = pd.to_datetime(args.timepoint)
    data = tips.loc[tips["timepoint"] == timepoint, ["strain"] +
                    args.attribute_names].to_dict(orient="records")
    fitnesses = {}
    for record in data:
        fitnesses[record["strain"]] = {}
        for attribute in args.attribute_names:
            fitnesses[record["strain"]][attribute] = record[attribute]

    # Write out the node annotations.
    write_json({"nodes": fitnesses}, args.output)
    forecasts = pd.read_csv(args.forecasts, sep="\t")
    if args.sequence_attribute_name not in forecasts.columns:
        print("ERROR: missing sequence column '%s' in forecasts file '%s'" % (args.sequence_attribute_name, args.forecasts), file=sys.stderr)
        sys.exit(1)

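    # Build mappings from strain name to future tip sequence and to projected frequency.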
    future_tip_sequence_by_name = dict(forecasts.loc[:, ["strain", args.sequence_attribute_name]].values)
    future_tip_frequency_by_name = dict(forecasts.loc[:, ["strain", "projected_frequency"]].values)

    # Convert future tip sequences to byte arrays once for pairwise comparisons.
    for tip_name, sequence in future_tip_sequence_by_name.items():
        future_tip_sequence_by_name[tip_name] = np.frombuffer(sequence.encode(), dtype="S1")

    # Calculate weighted distances between given tips and forecasts and store in
    # node data JSON format.
    distances = {}
    for tip_name, tip_sequence in tip_sequence_by_name.items():
        current_tip_sequence_array = np.frombuffer(tip_sequence.encode(), dtype="S1")
        weighted_distance_to_future = 0.0

        for future_tip_name, future_sequence in future_tip_sequence_by_name.items():
            distance = (current_tip_sequence_array != future_sequence).sum()
            weighted_distance_to_future += future_tip_frequency_by_name[future_tip_name] * distance

        distances[tip_name] = {args.distance_attribute_name: weighted_distance_to_future}

    # Export distances to JSON.
    write_json({"nodes": distances}, args.output)
    for node in tree.find_clades(order="postorder"):
        if node.is_terminal():
            if node.name in titer_count_by_strain:
                node_data[node.name] = {
                    args.attribute_name: titer_count_by_strain[node.name]
                }
        elif args.include_internal_nodes:
            node_data[node.name] = {
                args.attribute_name:
                sum([
                    node_data[child.name][args.attribute_name]
                    for child in node.clades if child.name in node_data
                ])
            }

    # Assign categorical counts. These ranges are hardcoded for now.
    if args.use_categorical_ranges:
        for node in tree.find_clades():
            if node.name in node_data:
                node_data[node.name][
                    args.attribute_name] = get_categorical_range_for_count(
                        node_data[node.name][args.attribute_name])
            else:
                # Get the categorical value for zero, if no counts are assigned to this node.
                node_data[node.name] = {
                    args.attribute_name: get_categorical_range_for_count(0)
                }

    # Save titers per strain in node data format.
    write_json({"nodes": node_data}, args.output)
        # Determine the total time that elapsed between the current and past timepoint.
        delta_time = kde_frequencies.pivots[-1] - kde_frequencies.pivots[-(args.delta_pivots + 1)]

        # Calculate the change in frequency over time elapsed for each clade.
        delta_frequency_by_clade = {}
        for clade, current_frequency in current_clade_frequencies.items():
            # If the clade was not observed at the previous timepoint, its
            # previous frequency defaults to zero.
            delta_frequency_by_clade[clade] = (current_frequency - previous_clade_frequencies.get(clade, 0.0)) / delta_time

        # Assign clade delta frequencies to all corresponding tips and internal nodes.
        delta_frequency = {}
        for node in tree.find_clades(terminal=True):
            delta_frequency[node.name] = {
                "delta_frequency": delta_frequency_by_clade.get(clades_by_node[node.name], 0.0)
            }
    else:
        frequencies = frequencies_json

        # Determine the total time that elapsed between the current and past timepoint.
        delta_time = frequencies["pivots"][-1] - frequencies["pivots"][-(args.delta_pivots + 1)]

        delta_frequency = {}
        for node in tree.find_clades(terminal=True):
            delta_frequency[node.name] = {
                "delta_frequency": (frequencies[node.name]["global"][-1] - frequencies[node.name]["global"][-(args.delta_pivots + 1)]) / delta_time
            }

    # Write out the node annotations.
    write_json({"nodes": delta_frequency}, args.output)
Example #5
import argparse
import json
import sys

from augur.utils import write_json

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--ancestral-sequences", required=True, help="node data JSON of nucleotide mutations including observed and inferred by TreeTime")
    parser.add_argument("--reference-node-name", required=True, help="name of the node whose sequence flags the desired haplotype")
    parser.add_argument("--attribute-name", default="haplotype_status", help="name of attribute for haplotype status")
    parser.add_argument("--output", required=True, help="node data JSON with annotated haplotype status based on the given reference node's sequence")

    args = parser.parse_args()

    with open(args.ancestral_sequences, "r") as fh:
        sequences = json.load(fh)

    if args.reference_node_name not in sequences["nodes"]:
        print("ERROR: Could not find the requested reference node named '%s' in the given ancestral sequences." % args.reference_node_name, file=sys.stderr)
        sys.exit(1)

    haplotype_sequence = sequences["nodes"][args.reference_node_name]["sequence"]
    haplotype_status = {"nodes": {}}

    for node in sequences["nodes"]:
        if sequences["nodes"][node]["sequence"] == haplotype_sequence:
            status = "haplotype matches %s" % args.reference_node_name
        else:
            status = "haplotype does not match %s" % args.reference_node_name

        haplotype_status["nodes"][node] = {args.attribute_name: status}

    write_json(haplotype_status, args.output)
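A sketch of a typical invocation, assuming the script is saved as assign_haplotype_status.py (a hypothetical name) and that nt_muts.json is the ancestral sequence node data produced by TreeTime via augur ancestral:

python3 assign_haplotype_status.py \
    --ancestral-sequences nt_muts.json \
    --reference-node-name NODE_0000123 \
    --output haplotype_status.json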
Example #6
            ]],
                                     columns=[
                                         "MCC", "accuracy", "threshold",
                                         "embedding", "TN", "FN", "TP", "FP",
                                         "roc_fpr", "roc_tpr", "roc_thresholds"
                                     ]).round(3)
            values_df.to_csv(args.output_metadata)

        embedding_df.to_csv(args.output_outliers, index=False)

        if args.output_json is not None:
            embedding_df.index = embedding_df["strain"]
            embedding_dict = embedding_df[[
                "X_scores", "predicted_outlier_status", "mds_label"
            ]].transpose().to_dict()
            write_json({"nodes": embedding_dict}, args.output_json)

    if args.output_main_figure is not None:

        plt.title("Local Outlier Factor (LOF)")
        if args.find_outlier:
            predicted_outliers = embedding_df[
                "predicted_outlier_status"].values.tolist()
            confusion_matrix_values = []
            for status in predicted_outliers:
                # Not an outlier.
                if status == 1:
                    confusion_matrix_values.append('#0000FF')
                # Outlier.
                elif status == -1:
                    confusion_matrix_values.append('#FF6600')
Example #7
"""Create node json (augur) from pandas dataframe.
"""
import argparse
import pandas as pd
from augur.utils import write_json

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--table", help="table to make node data from")
    parser.add_argument("--separator", default=",", help="separator between columns in the given tables")
    parser.add_argument("--node_name", default="nodes", help="what to name the node value in the auspice json")
    parser.add_argument("--output", help="json file")

    args = parser.parse_args()

    if args.output is not None:
        embedding_dict = pd.read_csv(
            args.table,
            sep=args.separator,
            index_col=0
        ).transpose().to_dict()
        write_json({args.node_name: embedding_dict}, args.output)
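A sketch invocation with hypothetical file names; note that the first column of the table becomes the index (index_col=0), so it should hold the strain names that key the node data:

python3 table_to_node_data.py \
    --table embeddings.csv \
    --separator "," \
    --output embeddings.json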
Example #8
    # Load tree.
    tree = Bio.Phylo.read(args.tree, "newick")

    # Load sequences.
    alignments = load_alignments(args.alignment, args.gene_names)

    # Concatenate translated sequences into a single sequence indexed by sample name.
    is_node_terminal = {
        node.name: node.is_terminal()
        for node in tree.find_clades()
    }

    translations = {}
    for gene in args.gene_names:
        alignment = alignments[gene]

        for record in alignment:
            if is_node_terminal[record.name] or args.include_internal_nodes:
                # Initialize new samples by name with an empty string.
                if record.name not in translations:
                    translations[record.name] = {args.attribute_name: ""}

                # Append the current gene's amino acid sequence to the current
                # string for this sample.
                translations[record.name][args.attribute_name] += str(
                    record.seq)

    # Write out the node annotations.
    write_json({"nodes": translations}, args.output)
Example #9
                mutations_to_number[clades[node.name]["clade_membership"]] = clade_number
                clade_number += 1

        for node in tree.find_clades():
            clade = clades[node.name]["clade_membership"]
            clades[node.name]["clade_membership"] = "Clade %i" % mutations_to_number[clade]
    # elif args.use_hash_ids:
    #     # Assign abbreviated SHA hashes based on concatenated mutations.
    #     for node_name in clades.keys():
    #         if clades[node_name]["clade_membership"] != "root":
    #             clades[node_name]["clade_membership"] = hashlib.sha256(clades[node_name]["clade_membership"].encode()).hexdigest()[:MAX_HASH_LENGTH]

    # Write out the node annotations.
    write_json({"nodes": clades}, args.output)

    # Output the optional tip-to-clade table, if requested.
    if args.output_tip_clade_table:
        records = []
        for tip in tree.find_clades(terminal=True):
            # Note the tip's own clade assignment which may be distinct from its
            # parent's.
            depth = 0
            records.append(
                [tip.name, clades[tip.name]["clade_membership"], depth])

            parent = tip.parent
            depth += 1
            while True:
                records.append(
Example #10
            max_df.where(max_df["method"] == args.command).dropna(
                subset=['distance_threshold'])[["distance_threshold"
                                                ]].values.tolist()[0][0]))

    if clusterer is not None:
        clusterer_default = hdbscan.HDBSCAN()
        clusterer.fit(embedding_df)
        clusterer_default.fit(embedding_df)
        embedding_df[f"{args.command}_label"] = clusterer.labels_.astype(str)
        embedding_df[
            f"{args.command}_label_default"] = clusterer_default.labels_.astype(
                str)

    if args.output_node_data is not None:
        embedding_dict = embedding_df.transpose().to_dict()
        write_json({"nodes": embedding_dict}, args.output_node_data)

    if args.output_dataframe is not None:
        embedding_df.to_csv(args.output_dataframe, index_label="strain")

    if args.output_figure:
        plot_data = {
            "x": embedding[:, 0],
            "y": embedding[:, 1],
        }

        if clusterer is not None:
            plot_data["cluster"] = clusterer.labels_.astype(str)
        else:
            plot_data["cluster"] = "0"
        distance_map = read_distance_map(distance_map_file)
        distance_map_names.append(distance_map.get("name", distance_map_file))

        for current_sample in current_samples:
            if current_sample not in distances_by_node:
                distances_by_node[current_sample] = {}

            if attribute not in distances_by_node[current_sample]:
                distances_by_node[current_sample][attribute] = {}

            for past_sample in past_samples:
                # Only compare against samples collected before the current sample.
                if date_by_sample[past_sample] < date_by_sample[current_sample]:
                    comparisons += 1
                    distances_by_node[current_sample][attribute][
                        past_sample] = get_distance_between_nodes(
                            sequences_by_node_and_gene[past_sample],
                            sequences_by_node_and_gene[current_sample],
                            distance_map)

    print("Calculated %i comparisons" % comparisons)
    # Prepare params for export.
    params = {
        "attribute": args.attribute_name,
        "map_name": distance_map_names,
        "years_back_to_compare": args.years_back_to_compare
    }

    # Export distances to JSON.
    write_json({"params": params, "nodes": distances_by_node}, args.output)

import argparse

import pandas as pd
from augur.utils import write_json


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Create node data for assigned pangolin lineages",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--pangolineages",
                        type=str,
                        required=True,
                        help="pangolineages.csv")
    parser.add_argument("--node_data_outfile",
                        type=str,
                        help="pangolineages.json")
    parser.add_argument(
        "--attribute_name",
        default="pango_lineage_local",
        help="attribute name for pangolin lineage annotations in the output JSON")
    args = parser.parse_args()

    pangolineages = pd.read_csv(args.pangolineages)

    node_data = {
        "nodes": {
            row['taxon']: {
                args.attribute_name: row['lineage']
            }
            for idx, row in pangolineages.iterrows()
        }
    }

    write_json(node_data, args.node_data_outfile)
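A sketch invocation, assuming the script is saved as pangolineages_to_node_data.py (a hypothetical name) and that pangolineages.csv is standard pangolin output with taxon and lineage columns:

python3 pangolineages_to_node_data.py \
    --pangolineages pangolineages.csv \
    --node_data_outfile pangolineages.json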
Example #13
                distances_by_node[current_sample][
                    args.attribute_name][past_sample] = np.around(
                        get_titer_distance_between_nodes(
                            tree, tips_by_sample[past_sample],
                            tips_by_sample[current_sample],
                            args.model_attribute_name), 4)

                comparisons += 1
                if comparisons % 10000 == 0:
                    print("Completed",
                          comparisons,
                          "comparisons, with last distance of",
                          distances_by_node[current_sample][
                              args.attribute_name][past_sample],
                          flush=True)

    print("Calculated %i comparisons" % comparisons)
    # Prepare params for export.
    params = {
        "attribute": args.attribute_name,
        "years_back_to_compare": args.years_back_to_compare
    }

    # Export distances to JSON.
    write_json({"params": params, "nodes": distances_by_node}, args.output, indent=None)
    today = pd.to_datetime(datetime.date.today())
    meta["_days_since_submission"] = (today -
                                      meta[args.submission_date_field]).dt.days

    # Create bins to use for day intervals.
    bins = args.date_bins

    # Bins need to start with zero.
    if 0 not in bins:
        bins.insert(0, 0)

    # The last bin needs to include the maximum possible value.
    bins.append(np.inf)

    # Build a list of bin labels.
    bin_labels = args.date_bin_labels
    bin_labels.append(args.upper_bin_label)

    # Bin sequences by relevant submission delay intervals.
    meta["_day_bins"] = pd.cut(meta["_days_since_submission"],
                               bins=bins,
                               labels=bin_labels,
                               include_lowest=True)

    # Create node data annotations of recency per strain.
    recency_by_strain = meta["_day_bins"].to_dict()
    for strain, recency in recency_by_strain.items():
        node_data['nodes'][strain] = {args.output_field_name: recency}

    write_json(node_data, args.output)
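To make the binning concrete, a minimal standalone sketch of the same pd.cut call with hypothetical bins and labels: submission delays of 3, 10, and 45 days land in the first, second, and upper catch-all bins, respectively.

import numpy as np
import pandas as pd

days = pd.Series([3, 10, 45])
bins = [0, 7, 30, np.inf]
labels = ["last week", "last month", "older"]

# include_lowest=True keeps a 0-day delay inside the first bin.
print(pd.cut(days, bins=bins, labels=labels, include_lowest=True).tolist())
# ['last week', 'last month', 'older']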
    frequencies = pd.read_csv(args.frequencies_table, sep="\t")

    # Filter samples to those with nonzero frequencies at the current timepoint.
    nonzero_frequencies = frequencies[
        frequencies["%s_frequency" % args.frequency_method] > 0].copy()

    # Merge extant sample frequencies with metadata containing fitnesses.
    nonzero_metadata = nonzero_frequencies.merge(metadata, on="strain")

    # Normalize fitness by maximum fitness.
    nonzero_metadata["normalized_fitness"] = nonzero_metadata[
        "fitness"] / nonzero_metadata["fitness"].max()

    # Prepare dictionary of normalized fitnesses by sample.
    normalized_fitness = {
        strain: {
            "normalized_fitness": fitness
        }
        for strain, fitness in
        nonzero_metadata.loc[:, ["strain", "normalized_fitness"]].values
    }

    print("Raw fitness: %.2f +/- %.2f" % (nonzero_metadata["fitness"].mean(),
                                          nonzero_metadata["fitness"].std()))
    print("Normalized fitness: %.2f +/- %.2f" %
          (nonzero_metadata["normalized_fitness"].mean(),
           nonzero_metadata["normalized_fitness"].std()))

    # Save normalized fitness as a node data JSON.
    write_json({"nodes": normalized_fitness}, args.output)
Example #16
        merged_scaled_df["distance"] = distance

        if args.output_metadata is not None:
            merged_scaled_df.to_csv(args.output_metadata)

        classifier_threshold = np.mean(distance) + np.std(distance)
        estimated_outlier_status = np.where(distance < classifier_threshold,
                                            -1, 1)

        distance_df = pd.DataFrame()
        distance_df["distance_" + str(args.method)] = estimated_outlier_status

        #distance_df["distance_" + str(args.method)] = distance
        distance_df.index = merged_scaled_df["strain"]
        distance_dict = distance_df.transpose().to_dict()
        write_json({"nodes": distance_dict}, args.output_distance_metric)

        if args.output_boxplot is not None:
            sns_plot = sns.catplot(x="clade_membership",
                                   y="distance",
                                   kind="box",
                                   data=merged_scaled_df,
                                   height=4,
                                   aspect=2)
            sns_plot.savefig(args.output_boxplot)

    if args.output_figure is not None:
        if args.metadata is not None:
            from matplotlib.lines import Line2D
            domain = args.domain
            range_ = args.colors