Exemple #1
0
    parser.add_argument(
        "--branch-lengths",
        required=True,
        help="JSON with branch lengths and internal node dates estimated by TreeTime",
    )
    parser.add_argument(
        "--min-date",
        required=True,
        help="minimum date for sequences to emit",
    )
    parser.add_argument(
        "--output",
        nargs="+",
        required=True,
        help="filtered amino acid sequences (one per input)",
    )
    args = parser.parse_args()

    # Parse the cutoff once; every record's date is compared against it.
    min_date = pd.to_datetime(args.min_date)

    # Node data JSON that supplies a "date" entry per node name.
    node_data = read_node_data(args.branch_lengths)

    # Inputs and outputs are paired by position. For each pair, keep only the
    # records whose node date is on or after the cutoff and write them out.
    for alignment_file, output_file in zip(args.alignment, args.output):
        alignments = AlignIO.read(alignment_file, "fasta")
        new_alignments = [
            record for record in alignments
            if pd.to_datetime(node_data["nodes"][record.id]["date"]) >= min_date
        ]

        SeqIO.write(new_alignments, output_file, "fasta")
Exemple #2
0
    # Register the parameter set built just above this point.
    default_tuned_values.append(embedding_parameters)

    # Next parameter set: uses a precomputed dissimilarity and 2 components.
    # NOTE(review): these keys look like MDS-style settings -- confirm against
    # the code that consumes default_tuned_values.
    embedding_parameters = {
        "dissimilarity": "precomputed",
        "n_components": 2,
        "n_jobs": 1,
        "n_init": 2,
    }
    default_tuned_values.append(embedding_parameters)

    # Next parameter set: 10 components with a "full" SVD solver.
    # NOTE(review): presumably PCA settings -- confirm against the consumer.
    embedding_parameters = {"n_components": 10, "svd_solver": "full"}
    default_tuned_values.append(embedding_parameters)

    # creating dataframe of clade information

    # One row per sequence name that has an entry in the node data; the value
    # stored as "clade_membership" is read from the node data field named by
    # args.column_metadata.
    node_data = read_node_data(args.clades)
    clade_annotations = pd.DataFrame([{
        "strain":
        sequence_name,
        "clade_membership":
        node_data["nodes"][sequence_name][args.column_metadata]
    } for sequence_name in sequence_names
                                      if sequence_name in node_data["nodes"]])

    # Restrict the annotations to strains that appear in the distance matrix
    # index (merge on "strain" with pandas' default join).
    strains_df = pd.DataFrame(distance_matrix.index.values.tolist(),
                              columns=["strain"])
    clade_annotations = clade_annotations.merge(strains_df, on="strain")

    # Label the columns with the same values as the row index.
    distance_matrix.columns = distance_matrix.index
    # Rows of the distance matrix whose index value has no clade annotation.
    indices_to_drop = distance_matrix[~distance_matrix.index.
                                      isin(clade_annotations["strain"])]
Exemple #3
0
# Keep only the method parameters that were actually supplied (non-NA values).
method_parameters = {}
for key, value in snakemake.params.method_parameters.items():
    if pd.isna(value):
        continue
    method_parameters[key] = value

# Overlay the per-method defaults; on a key collision the default value wins.
method_parameters.update(DEFAULT_PARAMETERS_BY_METHOD[method])

# Coerce every parameter that has a registered type; leave the rest untouched.
method_parameters = {
    parameter: (TYPE_BY_PARAMETER[parameter](parameter_value)
                if parameter in TYPE_BY_PARAMETER else parameter_value)
    for parameter, parameter_value in method_parameters.items()
}

print(method_parameters)

# Load clade annotations, skipping nodes whose name starts with "NODE".
node_annotations = read_node_data(snakemake.input.clades)["nodes"]
clades = pd.DataFrame([
    {"strain": name, "clade_membership": annotation["clade_membership"]}
    for name, annotation in node_annotations.items()
    if not name.startswith("NODE")
])
strains = clades["strain"].values

if method == "pca":
    # Load alignment.
    input_matrix = get_PCA_feature_matrix(snakemake.input.alignment, strains)
    is_distance_matrix = False
else:
    # Load distance matrix.
    input_matrix = pd.read_csv(snakemake.input.distance_matrix, index_col=0)
Exemple #4
0
    # Identify maximum frequency per sample. Keys that are not samples
    # ("pivots", "generated_by", and anything starting with "count") are
    # skipped.
    non_sample_keys = ("pivots", "generated_by")
    max_frequency_per_sample = {}
    for sample, sample_frequencies in frequencies.items():
        if sample in non_sample_keys or sample.startswith("count"):
            continue
        max_frequency_per_sample[sample] = float(
            max(sample_frequencies["frequencies"]))

    # The last pivot is treated as the current timepoint.
    current_timepoint = frequencies["pivots"][-1]

    # Load distances and keep only the per-node section.
    with open(args.distances, "r") as fh:
        distances = json.load(fh)["nodes"]

    # Load date annotations and index each node's numeric date by node name.
    date_annotations = read_node_data(args.date_annotations)
    date_by_node_name = {
        node: annotations["numdate"]
        for node, annotations in date_annotations["nodes"].items()
    }

    # Example of a single per-node entry:
    #   "A/Acre/15093/2010": {
    #    "ep": 9,
    #    "ne": 8,
    #    "rb": 3
    #   },
    if args.years_to_wane is None:
        print("No waning effect")
    else:
        print("Waning effect with max years of %i" % args.years_to_wane)
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--node-data",
                        required=True,
                        help="JSON with ancestral reconstruction")
    # The help text here used to repeat the --node-data description; this
    # argument is passed to read_metadata() and used for per-strain dates.
    parser.add_argument("--metadata",
                        required=True,
                        help="metadata file with sampling dates per strain")
    parser.add_argument("--tree", required=True, help="newick tree")
    parser.add_argument("--output", required=True, help="figure file")
    args = parser.parse_args()

    T = Phylo.read(args.tree, 'newick')

    metadata, columns = read_metadata(args.metadata)
    dates = get_numerical_dates(metadata, fmt='%Y-%m-%d')
    node_data = read_node_data(args.node_data, args.tree)

    # For every tip with a single date value, gather the mutations found on
    # the branches along the path from the root down to that tip.
    tips = {}
    for n in T.get_terminals():
        # A list-valued date is skipped.
        # NOTE(review): presumably a [min, max] range for an ambiguous date --
        # confirm against get_numerical_dates.
        if isinstance(dates[n.name], list):
            continue
        tips[n.name] = {'numdate': dates[n.name], 'mutations': []}
        path = T.root.get_path(target=n)
        for c in path:
            # Drop mutations whose first or last character is 'N' or '-'.
            tips[n.name]['mutations'].extend(
                x for x in node_data['nodes'][c.name]['muts']
                if x[0] not in ('N', '-') and x[-1] not in ('N', '-'))

    tmrca = np.linspace(2019.7,
                        np.min([x['numdate'] for x in tips.values()]) - 0.001,
Exemple #6
0
        default=3,
        help="minimum tips per polytomy to be consider as a cluster")
    # Destination for the per-strain cluster table.
    parser.add_argument(
        "--output",
        required=True,
        help=
        "tab-delimited file with strain, cluster id, and group value for each strain"
    )

    args = parser.parse_args()

    # Load the tree and collapse every clade whose branch length is below
    # 1e-5, so that near-zero-length branches are merged into their parent.
    # NOTE(review): assumes read_tree returns a Bio.Phylo tree -- confirm.
    tree = read_tree(args.tree)
    tree.collapse_all(lambda c: c.branch_length < 1e-5)

    # Per-strain metadata and per-node mutation data used further below.
    metadata, columns = read_metadata(args.metadata)
    muts = read_node_data(args.mutations)
    attribute_name = args.attribute_name
    group_by = args.group_by

    polytomies = []
    for node in tree.find_clades(terminal=False):
        if node == tree.root:
            continue

        count_by_group = Counter()
        polytomy_sequence_id = None
        for child in node.clades:
            if child.is_terminal() and child.name:
                child_muts_data = muts["nodes"].get(child.name, {})
                any_muts = (len(child_muts_data.get("muts", [])) > 0)
                if not any_muts:
import numpy as np
import matplotlib.pyplot as plt
from augur.utils import read_node_data
import argparse
from Bio import Phylo

# Command-line interface: every argument is optional as far as argparse is
# concerned; --titers switches on the titer-related annotations below.
parser = argparse.ArgumentParser(description="Analyze TMRCA.")
parser.add_argument("--tree", help="tree file")
parser.add_argument("--node_data", help="node_data file")
parser.add_argument("--titers", help="titer_model file")
parser.add_argument("--output", help="output prefix")
args = parser.parse_args()

T = Phylo.read(args.tree, 'newick')
# Node data files to load: the titer model file is included only when given.
of = [args.node_data, args.titers] if args.titers else [args.node_data]
node_data = read_node_data(of)

# The root has no parent.
T.root.up = None
# Postorder traversal: children are visited before their parent, so the
# per-child ntips / tree_length values exist when the parent sums them.
for n in T.find_clades(order='postorder'):
    n.numdate = node_data["nodes"][n.name]["numdate"]
    if args.titers:
        n.cTiter = node_data["nodes"][n.name]["cTiter"]
        n.dTiter = node_data["nodes"][n.name]["dTiter"]
    if n.is_terminal():
        # A tip counts as one tip and contributes only its own branch.
        n.ntips = 1
        n.tree_length = n.branch_length
        if args.titers:
            n.antigenic_length = n.dTiter
    else:
        # Internal node: accumulate tip counts and branch lengths from the
        # direct children, plus this node's own branch.
        n.ntips = np.sum([c.ntips for c in n])
        n.tree_length = n.branch_length + np.sum([c.tree_length for c in n])
                '2011-9':{'startTime':2011, 'endTime':2019, 'time': [], 'lineages': []}}

    # Snapshot the keys up front so the loop iterates a fixed list.
    origKeys = list(sampSets.keys())

    for key in origKeys:
        realKey = key

        params = sampSets[realKey]

        startTime = params['startTime']
        endTime = params['endTime']
        time = params['time']
        num_lineages = params['lineages']

        # A fresh tree is read for every sample set; its nodes are annotated
        # and their branch lengths rescaled below.
        T = Phylo.read(treefile, 'newick')
        # A separate read_node_data([branchfile]) call used to precede this
        # line, but its result was overwritten here without being used, so
        # the redundant file read was dropped.
        node_data, node_attrs, node_data_names, metadata_names = parse_node_data_and_metadata(T, [branchfile], metadatafile)
        rate = node_data['clock']['rate']

        # Copy dates and region onto each node, and divide the stored branch
        # length by the clock rate.
        for node in T.find_clades(order='postorder'):
            data = node_data['nodes'][node.name]
            node.date = data['date']
            node.num_date = data['numdate']
            raw_data = node_attrs[node.name]
            node.region = raw_data['region'] if 'region' in raw_data else ''
            node.branch_length = data['branch_length']/rate

        #set parents to avoid excess tree-traversal
        for node in T.find_clades(order='preorder'):
            for child in node:
                child.parent = node
Exemple #9
0
    # Start of the "past" comparison window, counted back from the last pivot.
    last_past_datetime = last_pivot_datetime - pd.DateOffset(
        years=args.years_back_to_compare)

    # Convert each pivot to a datetime once; both masks below reuse the
    # result instead of re-parsing the same pivot up to three times.
    pivot_datetimes = [
        pd.to_datetime(float_to_datestring(pivot)) for pivot in pivots
    ]

    # Find the pivot indices that correspond to the current and past pivots.
    # Current: strictly after last_current_datetime.
    current_pivot_indices = np.array([
        pivot_datetime > last_current_datetime
        for pivot_datetime in pivot_datetimes
    ])
    # Past: within [last_past_datetime, last_current_datetime], inclusive.
    past_pivot_indices = np.array([
        ((pivot_datetime >= last_past_datetime) &
         (pivot_datetime <= last_current_datetime))
        for pivot_datetime in pivot_datetimes
    ])

    # Load date and titer model annotations and annotate tree with them.
    annotations = read_node_data([args.date_annotations, args.model])
    for node in tree.find_clades():
        node.attr = annotations["nodes"][node.name]
        # Expose the numeric date under the "num_date" key as well.
        node.attr["num_date"] = node.attr["numdate"]

    # Identify samples to compare including those in the current timepoint
    # (pivot) and those in previous timepoints.
    current_samples = []
    past_samples = []
    date_by_sample = {}
    tips_by_sample = {}
    for tip in tree.find_clades(terminal=True):
        # Samples with nonzero frequencies in the last timepoint are current
        # samples. Those with one or more nonzero frequencies in the search
        # window of the past timepoints are past samples.
        frequencies[tip.name]["frequencies"] = np.array(
Exemple #10
0
    # Inputs: sequences, metadata table, and clades JSON. Outputs: metadata
    # and sequences with the subgenogroup added.
    parser.add_argument('--seqs-in', help="input sequences")
    parser.add_argument('--meta-in', help="input meta file")
    parser.add_argument('--clades', help="clades JSON file")
    parser.add_argument('--meta-out',
                        help="output metadata just with subgenogroup added")
    parser.add_argument('--seqs-out',
                        help="output sequences just with subgenogroup added")
    args = parser.parse_args()

    # The trailing comments show example paths for each input.
    orig_meta = args.meta_in  #"results/metadata-ages.tsv"
    clade_info = args.clades  #"results/clades_vp1.json"
    seqs = args.seqs_in  #"results/aligned_vp1.fasta"

    # Tab-separated metadata; index_col=False keeps the first column as data
    # rather than using it as the index.
    meta = pd.read_csv(orig_meta, sep='\t', index_col=False)

    # Keep only the per-node section of the clades JSON.
    clade_node = read_node_data(clade_info)
    clade_node = clade_node["nodes"]

    # Sequence records keyed by record id for lookups by strain name.
    record_dict = SeqIO.to_dict(SeqIO.parse(seqs, "fasta"))

    to_exclude = []

    for i, row in meta.iterrows():
        #only do this if the strain is in the clades
        if row.strain in clade_node.keys() and row.strain in record_dict.keys(
        ):
            meta.loc[meta.strain == row.strain, 'subgenogroup'] = clade_node[
                row.strain]['clade_membership']
            if clade_node[row.strain]['clade_membership'] == 'unassigned':
                meta.loc[meta.strain == row.strain, 'subgenogroup'] = ""
            else:
    #cladefile = "results/clades_genome.json"
    #metadatafile = "results/metadata-ages.tsv"

    # Input paths taken from the command line.
    treefile = args.tree
    branchfile = args.branch_lengths
    cladefile = args.clades
    metadatafile = args.meta

    # Echo the resolved input paths.
    print("treefile", treefile)
    print("branchfile", branchfile)
    print("cladefile", cladefile)
    print("metafile", metadatafile)

    #T = Phylo.read(treefile, 'newick')
    #node_data = read_node_data([branchfile, cladefile])
    # Only the clades file is loaded here; the tree and the branch lengths
    # are not read at this point (see the commented-out lines above).
    node_data = read_node_data([cladefile])

    sampSets = {
        '2014-5': {
            'startTime': 2014,
            'endTime': 2015,
            'time': [],
            'lineages': []
        },
        '2016-7': {
            'startTime': 2016,
            'endTime': 2017,
            'time': [],
            'lineages': []
        },
        '2018-9': {
Exemple #12
0
        required=True,
        help="tab-delimited file collecting all given node data")
    # Opt-in flag (defaults to False when omitted).
    parser.add_argument(
        "--include-internal-nodes",
        action="store_true",
        help="include data associated with internal nodes in the output table")
    args = parser.parse_args()

    # Load tree.
    tree = Bio.Phylo.read(args.tree, "newick")

    # Load metadata for samples.
    metadata = pd.read_csv(args.metadata, sep="\t")

    # Load one or more node data JSONs into a single dictionary indexed by node name.
    node_data = read_node_data(args.jsons)

    # Convert node data into a data frame.
    # Data are initially loaded with one column per node.
    # Transposition converts the table to the expected one row per node format.
    # The node name index is then exposed as a regular "strain" column.
    df = pd.DataFrame(node_data["nodes"]).T.rename_axis("strain").reset_index()

    # Annotate node data with per sample metadata.
    # Columns present in both tables keep their name for the node data copy
    # and get a "_metadata" suffix for the metadata copy.
    # NOTE(review): merge uses pandas' default inner join, so nodes without a
    # metadata row are dropped here -- confirm this is intended when
    # --include-internal-nodes is set.
    df = df.merge(metadata, on="strain", suffixes=["", "_metadata"])

    # Remove excluded fields if they are in the data frame.
    df = df.drop(columns=[
        field for field in args.excluded_fields if field in df.columns
    ])

    # Annotate the tip/internal status of each node using the tree.
Exemple #13
0
    args = parser.parse_args()

    # A boxplot can only be produced when both the distance metric output and
    # the metadata are available; bail out early otherwise.
    if args.output_boxplot is not None:
        if args.output_distance_metric is None:
            print("You must create the distance metric to create the boxplot",
                  file=sys.stderr)
            sys.exit(1)

        if args.metadata is None:
            print("You must have metadata to create the boxplot", file=sys.stderr)
            sys.exit(1)

    embedding_1_df = pd.read_csv(args.embeddings[0])
    embedding_2_df = pd.read_csv(args.embeddings[1])

    if args.metadata is not None:
        # Build a strain -> clade membership table from the node data and
        # attach it to both embeddings.
        node_data = read_node_data(args.metadata)
        clade_rows = []
        for strain, annotations in node_data["nodes"].items():
            clade_rows.append({
                "strain": strain,
                "clade_membership": annotations["clade_membership"],
            })
        metadata_df = clade_annotations = pd.DataFrame(clade_rows)
        embedding_1_df = metadata_df.merge(embedding_1_df, on="strain")
        embedding_2_df = metadata_df.merge(embedding_2_df, on="strain")

    #procrustes analysis on the embeddings
    # First matrix: one [x, y] row per strain from the two selected columns
    # of the first embedding.
    first_x = embedding_1_df[args.columns[0]].values.tolist()
    first_y = embedding_1_df[args.columns[1]].values.tolist()
    a = np.array([[x, y] for x, y in zip(first_x, first_y)])
    b = np.array([