"--tip-attributes", required=True, help="tab-delimited file describing tip attributes at all timepoints") parser.add_argument("--timepoint", help="current timepoint", required=True) parser.add_argument( "--attribute-names", nargs="+", help="names of attributes for tips to export to node data JSON", required=True) parser.add_argument("--output", help="JSON file with fitness information by node", required=True) args = parser.parse_args() # Load tip attributes tips = pd.read_csv(args.tip_attributes, sep="\t", parse_dates=["timepoint"]) timepoint = pd.to_datetime(args.timepoint) data = tips.loc[tips["timepoint"] == timepoint, ["strain"] + args.attribute_names].to_dict(orient="records") fitnesses = {} for record in data: fitnesses[record["strain"]] = {} for attribute in args.attribute_names: fitnesses[record["strain"]][attribute] = record[attribute] # Write out the node annotations. write_json({"nodes": fitnesses}, args.output)
# Load the forecasts table and confirm that the requested sequence column
# exists before doing any work.
forecasts = pd.read_csv(args.forecasts, sep="\t")
sequence_column_is_present = args.sequence_attribute_name in forecasts.columns
if not sequence_column_is_present:
    print(
        "ERROR: missing sequence column '%s' in forecasts file '%s'" % (args.sequence_attribute_name, args.forecasts),
        file=sys.stderr
    )
    sys.exit(1)

# Index projected frequencies and raw sequences of future tips by strain name.
future_tip_frequency_by_name = dict(
    forecasts.loc[:, ["strain", "projected_frequency"]].values
)
raw_future_sequence_by_name = dict(
    forecasts.loc[:, ["strain", args.sequence_attribute_name]].values
)

# Convert each future tip sequence to a per-character array once, so the
# pairwise comparisons below do not repeat the conversion.
future_tip_sequence_by_name = {
    future_name: np.frombuffer(raw_sequence.encode(), dtype="S1")
    for future_name, raw_sequence in raw_future_sequence_by_name.items()
}

# For each given tip, sum the number of mismatched positions against every
# future tip, weighting each count by that future tip's projected frequency.
# Results are stored in node data JSON format.
distances = {}
for tip_name, tip_sequence in tip_sequence_by_name.items():
    tip_array = np.frombuffer(tip_sequence.encode(), dtype="S1")

    weighted_distance_to_future = 0.0
    for future_name, future_array in future_tip_sequence_by_name.items():
        mismatches = (tip_array != future_array).sum()
        weighted_distance_to_future += future_tip_frequency_by_name[future_name] * mismatches

    distances[tip_name] = {
        args.distance_attribute_name: weighted_distance_to_future
    }

# Export distances to JSON.
write_json({"nodes": distances}, args.output)
attribute = args.attribute_name

# Visit children before their parents so that, by the time an internal node
# is reached, the counts of its children have already been recorded.
for node in tree.find_clades(order="postorder"):
    if node.is_terminal():
        # Tips are only annotated when a titer count exists for them.
        if node.name in titer_count_by_strain:
            node_data[node.name] = {attribute: titer_count_by_strain[node.name]}
    elif args.include_internal_nodes:
        # Internal nodes get the total of whatever counts their direct
        # children carry; children without an entry are ignored.
        child_counts = (
            node_data[child.name][attribute]
            for child in node.clades
            if child.name in node_data
        )
        node_data[node.name] = {attribute: sum(child_counts)}

# Assign categorical counts. These ranges are hardcoded for now.
if args.use_categorical_ranges:
    for node in tree.find_clades():
        # Nodes without any recorded count are treated as having a count of
        # zero, so every node in the tree ends up with a categorical value.
        annotation = node_data.setdefault(node.name, {attribute: 0})
        annotation[attribute] = get_categorical_range_for_count(annotation[attribute])

# Save titers per strain in node data format.
write_json({"nodes": node_data}, args.output)
# Determine the total time that elapsed between the current and past timepoint. delta_time = kde_frequencies.pivots[-1] - kde_frequencies.pivots[-(args.delta_pivots + 1)] # Calculate the change in frequency over time elapsed for each clade. delta_frequency_by_clade = {} for clade, current_frequency in current_clade_frequencies.items(): # If the current clade was not observed in the previous timepoint, it # will have a zero frequency. delta_frequency_by_clade[clade] = (current_frequency - previous_clade_frequencies.get(clade, 0.0)) / delta_time # Assign clade delta frequencies to all corresponding tips and internal nodes. delta_frequency = {} for node in tree.find_clades(terminal=True): delta_frequency[node.name] = { "delta_frequency": delta_frequency_by_clade.get(clades_by_node[node.name], 0.0) } else: frequencies = frequencies_json # Determine the total time that elapsed between the current and past timepoint. delta_time = frequencies["pivots"][-1] - frequencies["pivots"][-(args.delta_pivots + 1)] delta_frequency = {} for node in tree.find_clades(terminal=True): delta_frequency[node.name] = { "delta_frequency": (frequencies[node.name]["global"][-1] - frequencies[node.name]["global"][-(args.delta_pivots + 1)]) / delta_time } # Write out the node annotations. write_json({"nodes": delta_frequency}, args.output)
if __name__ == "__main__":
    # Command line interface.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--ancestral-sequences",
        required=True,
        help="node data JSON of nucleotide mutations including observed and inferred by TreeTime"
    )
    parser.add_argument(
        "--reference-node-name",
        required=True,
        help="name of the node whose sequence flags the desired haplotype"
    )
    parser.add_argument(
        "--attribute-name",
        default="haplotype_status",
        help="name of attribute for haplotype status"
    )
    parser.add_argument(
        "--output",
        required=True,
        help="node data JSON with annotated haplotype status based on the given reference node's sequence"
    )
    args = parser.parse_args()

    # Load the per-node sequences.
    with open(args.ancestral_sequences, "r") as fh:
        sequences = json.load(fh)

    sequence_annotations = sequences["nodes"]

    # The reference node must exist, since its sequence defines the haplotype.
    if args.reference_node_name not in sequence_annotations:
        print(
            "ERROR: Could not find the requested reference node named '%s' in the given ancestral sequences." % args.reference_node_name,
            file=sys.stderr
        )
        sys.exit(1)

    haplotype_sequence = sequence_annotations[args.reference_node_name]["sequence"]

    # Both possible labels depend only on the reference node name, so build
    # them once instead of once per node.
    matching_status = "haplotype matches %s" % args.reference_node_name
    mismatching_status = "haplotype does not match %s" % args.reference_node_name

    # Label every node by whether its full sequence equals the reference
    # node's sequence.
    haplotype_status = {
        "nodes": {
            node: {
                args.attribute_name: (
                    matching_status
                    if annotation["sequence"] == haplotype_sequence
                    else mismatching_status
                )
            }
            for node, annotation in sequence_annotations.items()
        }
    }

    write_json(haplotype_status, args.output)
]], columns=[ "MCC", "accuracy", "threshold", "embedding", "TN", "FN", "TP", "FP", "roc_fpr", "roc_tpr", "roc_thresholds" ]).round(3) values_df.to_csv(args.output_metadata) embedding_df.to_csv(args.output_outliers, index=False) if args.output_json is not None: embedding_df.index = embedding_df["strain"] embedding_dict = embedding_df[[ "X_scores", "predicted_outlier_status", "mds_label" ]].transpose().to_dict() write_json({"nodes": embedding_dict}, args.output_json) if args.output_main_figure is not None: plt.title("Local Outlier Factor (LOF)") if args.find_outlier: predicted_outliers = embedding_df[ "predicted_outlier_status"].values.tolist() confusion_matrix_values = [] for i in range(len(predicted)): #Not Outlier if predicted_outliers[i] == 1: confusion_matrix_values.append('#0000FF') #Outlier elif predicted_outliers[i] == -1: confusion_matrix_values.append('#FF6600')
"""Create node json (augur) from pandas dataframe. """ import argparse import pandas as pd from augur.utils import write_json if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--table", help="table to make node data from") parser.add_argument("--separator", default=",", help="separator between columns in the given tables") parser.add_argument("--node_name", default="nodes", help="what to name the node value in the auspice json") parser.add_argument("--output", help="json file") args = parser.parse_args() if args.output is not None: embedding_dict = pd.read_csv( args.table, sep=args.separator, index_col=0 ).transpose().to_dict() write_json({args.node_name: embedding_dict}, args.output)
# Load tree.
tree = Bio.Phylo.read(args.tree, "newick")

# Load sequences.
alignments = load_alignments(args.alignment, args.gene_names)

# Record, for every node name in the tree, whether that node is a tip.
is_node_terminal = {
    node.name: node.is_terminal()
    for node in tree.find_clades()
}

# Concatenate translated sequences into a single sequence indexed by sample
# name. Genes are appended in the order given on the command line.
translations = {}
for gene in args.gene_names:
    for record in alignments[gene]:
        # Internal nodes are skipped unless they were explicitly requested.
        keep_record = is_node_terminal[record.name] or args.include_internal_nodes
        if not keep_record:
            continue

        # New samples start from an empty string; existing samples get the
        # current gene's amino acid sequence appended to what is already
        # there.
        annotation = translations.setdefault(record.name, {args.attribute_name: ""})
        annotation[args.attribute_name] += str(record.seq)

# Write out the node annotations.
write_json({"nodes": translations}, args.output)
mutations_to_number[clades[node.name] ["clade_membership"]] = clade_number clade_number += 1 for node in tree.find_clades(): clades[node.name][ "clade_membership"] = "Clade %i" % mutations_to_number[clades[ node.name]["clade_membership"]] # elif args.use_hash_ids: # # Assign abbreviated SHA hashes based on concatenated mutations. # for node_name in clades.keys(): # if clades[node_name]["clade_membership"] != "root": # clades[node_name]["clade_membership"] = hashlib.sha256(clades[node_name]["clade_membership"].encode()).hexdigest()[:MAX_HASH_LENGTH] # Write out the node annotations. write_json({"nodes": clades}, args.output) # Output the optional tip-to-clade table, if requested. if args.output_tip_clade_table: records = [] for tip in tree.find_clades(terminal=True): # Note the tip's own clade assignment which may be distinct from its # parent's. depth = 0 records.append( [tip.name, clades[tip.name]["clade_membership"], depth]) parent = tip.parent depth += 1 while True: records.append(
max_df.where(max_df["method"] == args.command).dropna( subset=['distance_threshold'])[["distance_threshold" ]].values.tolist()[0][0])) if clusterer is not None: clusterer_default = hdbscan.HDBSCAN() clusterer.fit(embedding_df) clusterer_default.fit(embedding_df) embedding_df[f"{args.command}_label"] = clusterer.labels_.astype(str) embedding_df[ f"{args.command}_label_default"] = clusterer_default.labels_.astype( str) if args.output_node_data is not None: embedding_dict = embedding_df.transpose().to_dict() write_json({"nodes": embedding_dict}, args.output_node_data) if args.output_dataframe is not None: embedding_df.to_csv(args.output_dataframe, index_label="strain") if args.output_figure: plot_data = { "x": embedding[:, 0], "y": embedding[:, 1], } if clusterer is not None: plot_data["cluster"] = clusterer.labels_.astype(str) else: plot_data["cluster"] = "0"
distance_map = read_distance_map(distance_map_file) distance_map_names.append(distance_map.get("name", distance_map_file)) for current_sample in current_samples: if not current_sample in distances_by_node: distances_by_node[current_sample] = {} if not attribute in distances_by_node[current_sample]: distances_by_node[current_sample][attribute] = {} for past_sample in past_samples: # The past is in the past. comparisons += 1 if date_by_sample[past_sample] < date_by_sample[current_sample]: distances_by_node[current_sample][attribute][ past_sample] = get_distance_between_nodes( sequences_by_node_and_gene[past_sample], sequences_by_node_and_gene[current_sample], distance_map) print("Calculated %i comparisons" % comparisons) # Prepare params for export. params = { "attribute": args.attribute_name, "map_name": distance_map_names, "years_back_to_compare": args.years_back_to_compare } # Export distances to JSON. write_json({"params": params, "nodes": distances_by_node}, args.output)
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Create node data for assigned pangolin lineages",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("--pangolineages", type=str, required=True, help="pangolineages.csv")
    # The output path is passed unconditionally to `write_json` below, so it
    # is declared as required to report its absence as a usage error.
    parser.add_argument("--node_data_outfile", type=str, required=True, help="pangolineages.json")
    parser.add_argument(
        "--attribute_name",
        default="pango_lineage_local",
        help="attribute name for pangolin lineage annotations in the output JSON"
    )
    args = parser.parse_args()

    pangolineages = pd.read_csv(args.pangolineages)

    # Map each taxon to its lineage under the requested attribute name. Only
    # the "taxon" and "lineage" columns are read, so iterate over those two
    # columns directly instead of building a Series for every row.
    node_data = {
        "nodes": {
            taxon: {args.attribute_name: lineage}
            for taxon, lineage in zip(pangolineages["taxon"], pangolineages["lineage"])
        }
    }

    write_json(node_data, args.node_data_outfile)
distances_by_node[current_sample][ args.attribute_name][past_sample] = np.around( get_titer_distance_between_nodes( tree, tips_by_sample[past_sample], tips_by_sample[current_sample], args.model_attribute_name), 4) comparisons += 1 if comparisons % 10000 == 0: print("Completed", comparisons, "comparisons, with last distance of", distances_by_node[current_sample][ args.attribute_name][past_sample], flush=True) print("Calculated %i comparisons" % comparisons) # Prepare params for export. params = { "attribute": args.attribute_name, "years_back_to_compare": args.years_back_to_compare } # Export distances to JSON. write_json({ "params": params, "nodes": distances_by_node }, args.output, indent=None)
# Count the whole days between each record's submission date and today.
today = pd.to_datetime(datetime.date.today())
meta["_days_since_submission"] = (today - meta[args.submission_date_field]).dt.days

# Create bins to use for day intervals. Work on a copy so that the parsed
# command line arguments (and any list used as their default) are left
# untouched.
bins = list(args.date_bins)

# Bins need to start with zero.
if 0 not in bins:
    bins.insert(0, 0)

# The last bin needs to include the maximum possible value.
bins.append(np.inf)

# Build a list of bin labels, again without modifying the parsed arguments.
# The upper bin label names the open-ended interval added above.
bin_labels = list(args.date_bin_labels) + [args.upper_bin_label]

# Bin sequences by relevant submission delay intervals.
meta["_day_bins"] = pd.cut(
    meta["_days_since_submission"],
    bins=bins,
    labels=bin_labels,
    include_lowest=True
)

# Create node data annotations of recency per strain. The keys come from the
# index of `meta` (presumably strain names; confirm where `meta` is loaded).
recency_by_strain = meta["_day_bins"].to_dict()
for strain, recency in recency_by_strain.items():
    node_data['nodes'][strain] = {args.output_field_name: recency}

write_json(node_data, args.output)
frequencies = pd.read_csv(args.frequencies_table, sep="\t")

# Filter samples to those with nonzero frequencies at the current timepoint.
frequency_column = "%s_frequency" % args.frequency_method
has_nonzero_frequency = frequencies[frequency_column] > 0
nonzero_frequencies = frequencies[has_nonzero_frequency].copy()

# Merge the remaining sample frequencies with metadata containing fitnesses.
nonzero_metadata = nonzero_frequencies.merge(metadata, on="strain")

# Normalize fitness by maximum fitness.
maximum_fitness = nonzero_metadata["fitness"].max()
nonzero_metadata["normalized_fitness"] = nonzero_metadata["fitness"] / maximum_fitness

# Prepare dictionary of normalized fitnesses by sample.
strain_and_fitness_pairs = nonzero_metadata.loc[:, ["strain", "normalized_fitness"]].values
normalized_fitness = {}
for strain, fitness in strain_and_fitness_pairs:
    normalized_fitness[strain] = {"normalized_fitness": fitness}

# Report the mean and standard deviation before and after normalization.
raw_fitness = nonzero_metadata["fitness"]
print("Raw fitness: %.2f +/- %.2f" % (raw_fitness.mean(), raw_fitness.std()))

scaled_fitness = nonzero_metadata["normalized_fitness"]
print("Normalized fitness: %.2f +/- %.2f" % (scaled_fitness.mean(), scaled_fitness.std()))

# Save normalized fitness as a node data JSON.
write_json({"nodes": normalized_fitness}, args.output)
merged_scaled_df["distance"] = distance if args.output_metadata is not None: merged_scaled_df.to_csv(args.output_metadata) classifier_threshold = (np.mean(distance) + (1 * np.std(distance))) estimated_outlier_status = np.where(distance < classifier_threshold, -1, 1) distance_df = pd.DataFrame() distance_df["distance_" + str(args.method)] = estimated_outlier_status #distance_df["distance_" + str(args.method)] = distance distance_df.index = merged_scaled_df["strain"] distance_dict = distance_df.transpose().to_dict() write_json({"nodes": distance_dict}, args.output_distance_metric) if args.output_boxplot is not None: sns_plot = sns.catplot(x="clade_membership", y="distance", kind="box", data=merged_scaled_df, height=4, aspect=2) sns_plot.savefig(args.output_boxplot) if args.output_figure is not None: if args.metadata is not None: from matplotlib.lines import Line2D domain = args.domain range_ = args.colors