parser.add_argument(
    "--branch-lengths",
    help="JSON with branch lengths and internal node dates estimated by TreeTime",
    required=True)
parser.add_argument(
    "--min-date",
    help="minimum date for sequences to emit",
    required=True)
parser.add_argument(
    "--output",
    nargs="+",
    help="filtered amino acid sequences (one per input)",
    required=True)
args = parser.parse_args()

# Alignments and outputs are paired positionally below. zip() stops at the
# shorter list, so a count mismatch would silently skip inputs or leave
# requested outputs unwritten; fail loudly instead.
if len(args.alignment) != len(args.output):
    parser.error(
        "expected one --output per alignment, got %d alignment(s) and %d output(s)"
        % (len(args.alignment), len(args.output)))

# Get min date.
min_date = pd.to_datetime(args.min_date)

# Load branch lengths.
node_data = read_node_data(args.branch_lengths)

# Write alignments to file.
for alignment_file, output_file in zip(args.alignment, args.output):
    alignments = AlignIO.read(alignment_file, "fasta")

    # Keep only the records whose annotated date is on or after the minimum
    # date. Every record id must be present in the node data.
    new_alignments = [
        alignment
        for alignment in alignments
        if pd.to_datetime(node_data["nodes"][alignment.id]["date"]) >= min_date
    ]

    SeqIO.write(new_alignments, output_file, "fasta")
default_tuned_values.append(embedding_parameters)

# Parameter set for an embedding that takes a precomputed dissimilarity matrix.
embedding_parameters = dict(
    dissimilarity="precomputed",
    n_components=2,
    n_jobs=1,
    n_init=2,
)
default_tuned_values.append(embedding_parameters)

# Parameter set for a 10-component decomposition using the full SVD solver.
embedding_parameters = dict(n_components=10, svd_solver="full")
default_tuned_values.append(embedding_parameters)

# Build a table of clade labels for every sequence that has node data.
node_data = read_node_data(args.clades)
_clade_records = []
for sequence_name in sequence_names:
    if sequence_name not in node_data["nodes"]:
        continue
    _clade_records.append({
        "strain": sequence_name,
        "clade_membership": node_data["nodes"][sequence_name][args.column_metadata],
    })
clade_annotations = pd.DataFrame(_clade_records)

# Restrict the annotations to strains that appear in the distance matrix.
strains_df = pd.DataFrame(
    distance_matrix.index.values.tolist(),
    columns=["strain"],
)
clade_annotations = clade_annotations.merge(strains_df, on="strain")

# Label the columns like the rows, then select the rows of the distance
# matrix that have no clade annotation.
distance_matrix.columns = distance_matrix.index
_has_annotation = distance_matrix.index.isin(clade_annotations["strain"])
indices_to_drop = distance_matrix[~_has_annotation]
# Start from the parameters passed in by Snakemake, skipping missing (NA) values.
method_parameters = {}
for key, value in snakemake.params.method_parameters.items():
    if pd.isna(value):
        continue
    method_parameters[key] = value

# Method-specific defaults are applied on top and override same-named entries.
method_parameters.update(DEFAULT_PARAMETERS_BY_METHOD[method])

# Coerce each parameter that has a registered type to that type.
for parameter, parameter_value in method_parameters.items():
    if parameter not in TYPE_BY_PARAMETER:
        continue
    method_parameters[parameter] = TYPE_BY_PARAMETER[parameter](parameter_value)

print(method_parameters)

# Load clade annotations for every node whose name does not start with "NODE".
clades = read_node_data(snakemake.input.clades)["nodes"]
clades = pd.DataFrame([
    {
        "strain": strain,
        "clade_membership": values["clade_membership"],
    }
    for strain, values in clades.items()
    if not strain.startswith("NODE")
])
strains = clades["strain"].values

if method == "pca":
    # Load alignment.
    input_matrix = get_PCA_feature_matrix(snakemake.input.alignment, strains)
    is_distance_matrix = False
else:
    # Load distance matrix.
    input_matrix = pd.read_csv(snakemake.input.distance_matrix, index_col=0)
# Identify maximum frequency per sample. Entries of the frequencies JSON that
# are not samples ("pivots", "generated_by", and "count*" keys) are skipped.
max_frequency_per_sample = {}
for sample, sample_frequencies in frequencies.items():
    if sample in ["pivots", "generated_by"] or sample.startswith("count"):
        continue
    max_frequency_per_sample[sample] = float(max(sample_frequencies["frequencies"]))

# The last pivot is treated as the current timepoint.
current_timepoint = frequencies["pivots"][-1]

# Load distances and keep only the per-node section.
with open(args.distances, "r") as fh:
    distances = json.load(fh)

distances = distances["nodes"]

# Load date annotations and index the numeric date by node name.
date_annotations = read_node_data(args.date_annotations)
date_by_node_name = {
    node: annotations["numdate"]
    for node, annotations in date_annotations["nodes"].items()
}

# No-op string literal kept from the original script; it looks like an example
# of a per-strain record.
"""
"A/Acre/15093/2010": { "ep": 9, "ne": 8, "rb": 3 },
"""

# Report whether a waning effect was requested.
if args.years_to_wane is None:
    print("No waning effect")
else:
    print("Waning effect with max years of %i" % args.years_to_wane)
formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("--node-data", required=True, help="JSON with ancestral reconstruction") parser.add_argument("--metadata", required=True, help="JSON with ancestral reconstruction") parser.add_argument("--tree", required=True, help="newick tree") parser.add_argument("--output", required=True, help="figure file") args = parser.parse_args() T = Phylo.read(args.tree, 'newick') metadata, columns = read_metadata(args.metadata) dates = get_numerical_dates(metadata, fmt='%Y-%m-%d') node_data = read_node_data(args.node_data, args.tree) tips = {} for n in T.get_terminals(): if type(dates[n.name]) == list: continue tips[n.name] = {'numdate': dates[n.name], 'mutations': []} path = T.root.get_path(target=n) for c in path: tips[n.name]['mutations'].extend([ x for x in node_data['nodes'][c.name]['muts'] if not (x[0] in ['N', '-'] or x[-1] in ['N', '-']) ]) tmrca = np.linspace(2019.7, np.min([x['numdate'] for x in tips.values()]) - 0.001,
default=3, help="minimum tips per polytomy to be consider as a cluster") parser.add_argument( "--output", required=True, help= "tab-delimited file with strain, cluster id, and group value for each strain" ) args = parser.parse_args() tree = read_tree(args.tree) tree.collapse_all(lambda c: c.branch_length < 1e-5) metadata, columns = read_metadata(args.metadata) muts = read_node_data(args.mutations) attribute_name = args.attribute_name group_by = args.group_by polytomies = [] for node in tree.find_clades(terminal=False): if node == tree.root: continue count_by_group = Counter() polytomy_sequence_id = None for child in node.clades: if child.is_terminal() and child.name: child_muts_data = muts["nodes"].get(child.name, {}) any_muts = (len(child_muts_data.get("muts", [])) > 0) if not any_muts:
import argparse

import matplotlib.pyplot as plt
import numpy as np
from augur.utils import read_node_data
from Bio import Phylo

parser = argparse.ArgumentParser(description="Analyze TMRCA.")
# --tree and --node_data are read unconditionally below, so require them here
# instead of failing later on a None path.
parser.add_argument("--tree", required=True, help="tree file")
parser.add_argument("--node_data", required=True, help="node_data file")
parser.add_argument("--titers", help="titer_model file")
parser.add_argument("--output", help="output prefix")
args = parser.parse_args()

T = Phylo.read(args.tree, 'newick')

# Titer annotations are optional; load them with the node data when given.
of = [args.node_data, args.titers] if args.titers else [args.node_data]
node_data = read_node_data(of)

T.root.up = None

# Postorder visits children before their parent, so each child's ntips and
# tree_length are already set when the parent sums them.
for n in T.find_clades(order='postorder'):
    n.numdate = node_data["nodes"][n.name]["numdate"]
    if args.titers:
        n.cTiter = node_data["nodes"][n.name]["cTiter"]
        n.dTiter = node_data["nodes"][n.name]["dTiter"]

    # A clade without a branch length (typically the root) has branch_length
    # None; count it as zero so the sums below do not raise TypeError.
    branch_length = 0.0 if n.branch_length is None else n.branch_length

    if n.is_terminal():
        n.ntips = 1
        n.tree_length = branch_length
        if args.titers:
            n.antigenic_length = n.dTiter
    else:
        n.ntips = np.sum([c.ntips for c in n])
        n.tree_length = branch_length + np.sum([c.tree_length for c in n])
'2011-9':{'startTime':2011, 'endTime':2019, 'time': [], 'lineages': []}} origKeys = list(sampSets.keys()) for key in origKeys: realKey = key params = sampSets[realKey] startTime = params['startTime'] endTime = params['endTime'] time = params['time'] num_lineages = params['lineages'] T = Phylo.read(treefile, 'newick') node_data = read_node_data([branchfile]) node_data, node_attrs, node_data_names, metadata_names = parse_node_data_and_metadata(T, [branchfile], metadatafile) rate = node_data['clock']['rate'] for node in T.find_clades(order='postorder'): data = node_data['nodes'][node.name] node.date = data['date'] node.num_date = data['numdate'] raw_data = node_attrs[node.name] node.region = raw_data['region'] if 'region' in raw_data else '' node.branch_length = data['branch_length']/rate #set parents to avoid excess tree-traversal for node in T.find_clades(order='preorder'): for child in node: child.parent = node
last_past_datetime = last_pivot_datetime - pd.DateOffset( years=args.years_back_to_compare) # Find the pivot indices that correspond to the current and past pivots. current_pivot_indices = np.array([ pd.to_datetime(float_to_datestring(pivot)) > last_current_datetime for pivot in pivots ]) past_pivot_indices = np.array([ ((pd.to_datetime(float_to_datestring(pivot)) >= last_past_datetime) & (pd.to_datetime(float_to_datestring(pivot)) <= last_current_datetime)) for pivot in pivots ]) # Load date and titer model annotations and annotate tree with them. annotations = read_node_data([args.date_annotations, args.model]) for node in tree.find_clades(): node.attr = annotations["nodes"][node.name] node.attr["num_date"] = node.attr["numdate"] # Identify samples to compare including those in the current timepoint # (pivot) and those in previous timepoints. current_samples = [] past_samples = [] date_by_sample = {} tips_by_sample = {} for tip in tree.find_clades(terminal=True): # Samples with nonzero frequencies in the last timepoint are current # samples. Those with one or more nonzero frequencies in the search # window of the past timepoints are past samples. frequencies[tip.name]["frequencies"] = np.array(
parser.add_argument('--seqs-in', help="input sequences") parser.add_argument('--meta-in', help="input meta file") parser.add_argument('--clades', help="clades JSON file") parser.add_argument('--meta-out', help="output metadata just with subgenogroup added") parser.add_argument('--seqs-out', help="output sequences just with subgenogroup added") args = parser.parse_args() orig_meta = args.meta_in #"results/metadata-ages.tsv" clade_info = args.clades #"results/clades_vp1.json" seqs = args.seqs_in #"results/aligned_vp1.fasta" meta = pd.read_csv(orig_meta, sep='\t', index_col=False) clade_node = read_node_data(clade_info) clade_node = clade_node["nodes"] record_dict = SeqIO.to_dict(SeqIO.parse(seqs, "fasta")) to_exclude = [] for i, row in meta.iterrows(): #only do this if the strain is in the clades if row.strain in clade_node.keys() and row.strain in record_dict.keys( ): meta.loc[meta.strain == row.strain, 'subgenogroup'] = clade_node[ row.strain]['clade_membership'] if clade_node[row.strain]['clade_membership'] == 'unassigned': meta.loc[meta.strain == row.strain, 'subgenogroup'] = "" else:
#cladefile = "results/clades_genome.json" #metadatafile = "results/metadata-ages.tsv" treefile = args.tree branchfile = args.branch_lengths cladefile = args.clades metadatafile = args.meta print("treefile", treefile) print("branchfile", branchfile) print("cladefile", cladefile) print("metafile", metadatafile) #T = Phylo.read(treefile, 'newick') #node_data = read_node_data([branchfile, cladefile]) node_data = read_node_data([cladefile]) sampSets = { '2014-5': { 'startTime': 2014, 'endTime': 2015, 'time': [], 'lineages': [] }, '2016-7': { 'startTime': 2016, 'endTime': 2017, 'time': [], 'lineages': [] }, '2018-9': {
required=True, help="tab-delimited file collecting all given node data") parser.add_argument( "--include-internal-nodes", action="store_true", help="include data associated with internal nodes in the output table") args = parser.parse_args() # Load tree. tree = Bio.Phylo.read(args.tree, "newick") # Load metadata for samples. metadata = pd.read_csv(args.metadata, sep="\t") # Load one or more node data JSONs into a single dictionary indexed by node name. node_data = read_node_data(args.jsons) # Convert node data into a data frame. # Data are initially loaded with one column per node. # Transposition converts the table to the expected one row per node format. df = pd.DataFrame(node_data["nodes"]).T.rename_axis("strain").reset_index() # Annotate node data with per sample metadata. df = df.merge(metadata, on="strain", suffixes=["", "_metadata"]) # Remove excluded fields if they are in the data frame. df = df.drop(columns=[ field for field in args.excluded_fields if field in df.columns ]) # Annotate the tip/internal status of each node using the tree.
args = parser.parse_args() if args.output_distance_metric is None and args.output_boxplot is not None: print("You must create the distance metric to create the boxplot", file=sys.stderr) sys.exit(1) if args.metadata is None and args.output_boxplot is not None: print("You must have metadata to create the boxplot", file=sys.stderr) sys.exit(1) embedding_1_df = pd.read_csv(args.embeddings[0]) embedding_2_df = pd.read_csv(args.embeddings[1]) if args.metadata is not None: node_data = read_node_data(args.metadata) metadata_df = clade_annotations = pd.DataFrame([{ "strain": strain, "clade_membership": annotations["clade_membership"] } for strain, annotations in node_data["nodes"].items()]) embedding_1_df = metadata_df.merge(embedding_1_df, on="strain") embedding_2_df = metadata_df.merge(embedding_2_df, on="strain") #procrustes analysis on the embeddings a = np.array([ list(a) for a in zip(embedding_1_df[args.columns[0]].values.tolist(), embedding_1_df[args.columns[1]].values.tolist()) ]) b = np.array([