Exemple #1
0
 def test_read_metadata_bad_query(self, tmpdir):
     """A query referencing a column absent from the metadata exits the program."""
     path = str(tmpdir / "metadata.tsv")
     rows = (
         "strain\tlocation\tquality",
         "c_good\tcolorado\tgood",
         "n_bad\tnevada\tbad",
     )
     with open(path, "w") as handle:
         handle.write("\n".join(rows))
     # "badcol" does not exist in the header, so read_metadata should abort.
     with pytest.raises(SystemExit):
         utils.read_metadata(path, query='badcol=="goodval"')
Exemple #2
0
 def test_read_metadata(self, tmpdir):
     """read_metadata accepts unique strains but rejects duplicated ones."""
     # Three distinct strains: parsing must succeed.
     path = write_metadata(tmpdir, (("strain", "location", "quality"),
                                    ("SEQ_1", "colorado", "good"),
                                    ("SEQ_2", "colorado", "bad"),
                                    ("SEQ_3", "nevada", "good")))
     utils.read_metadata(path, as_data_frame=True)
     # Repeating SEQ_1 must raise a ValueError naming the duplicate.
     path = write_metadata(tmpdir, (("strain", "location", "quality"),
                                    ("SEQ_1", "colorado", "good"),
                                    ("SEQ_1", "colorado", "bad"),
                                    ("SEQ_3", "nevada", "good")))
     with pytest.raises(ValueError) as excinfo:
         utils.read_metadata(path, as_data_frame=True)
     assert str(excinfo.value) == "Duplicated strain in metadata: SEQ_1"
Exemple #3
0
def parse_metadata(segments, metadata_files, date_format="%Y-%m-%d"):
    """Read one metadata file per segment and annotate each record with
    numeric date fields and, when present, a normalized age.

    Parameters
    ----------
    segments : iterable
        Segment names, zipped positionally with ``metadata_files``.
    metadata_files : iterable of str
        Paths to metadata files readable by ``read_metadata``.
    date_format : str
        strptime-style date format forwarded to ``get_numerical_dates``.

    Returns
    -------
    dict
        Mapping of segment -> per-strain metadata dicts, each augmented
        with 'num_date', 'year', 'month' (0-based) and, if an 'age'
        field exists, an age normalized to years or the string 'unknown'.
    """
    metadata = {}
    for segment, fname in zip(segments, metadata_files):
        tmp_meta, columns = read_metadata(fname)

        numerical_dates = get_numerical_dates(tmp_meta, fmt=date_format)
        for x in tmp_meta:
            record = tmp_meta[x]
            # Bare `except:` replaced with `except Exception` so that
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            try:
                record['num_date'] = np.mean(numerical_dates[x])
            except Exception:
                # Fall back to a year parsed from the last four characters
                # of the raw date string, pinned to mid-year.
                try:
                    record['num_date'] = int(record['date'][-4:]) + 0.5
                except Exception:
                    # No usable date at all: mark missing and skip the
                    # derived year/month/age fields for this record.
                    record['num_date'] = np.nan
                    continue
            record['year'] = int(record['num_date'])
            record['month'] = int((record['num_date'] % 1) * 12)
            if 'age' in record:
                age_str = record['age']
                # 'NNy' -> integer years; 'NNm' -> fractional years.
                # The empty-string guard avoids an IndexError on ''.
                if age_str and age_str[-1] == 'y':
                    record['age'] = int(age_str[:-1])
                elif age_str and age_str[-1] == 'm':
                    # BUG FIX: the original compared the whole value to 'm'
                    # (record['age'] == 'm'), so month-suffixed ages were
                    # never converted and always became 'unknown'.
                    record['age'] = float(age_str[:-1]) / 12.0
                else:
                    record['age'] = 'unknown'

        metadata[segment] = tmp_meta
    return metadata
Exemple #4
0
 def test_filter_on_query_good(self, tmpdir, sequences):
     """Basic filter_on_query test"""
     path = write_metadata(tmpdir, (("strain","location","quality"),
                                    ("SEQ_1","colorado","good"),
                                    ("SEQ_2","colorado","bad"),
                                    ("SEQ_3","nevada","good")))
     meta_df, _ = read_metadata(path, as_data_frame=True)
     # Only the two rows whose quality column is "good" should survive.
     kept = augur.filter.filter_by_query(meta_df, 'quality=="good"')
     assert sorted(kept) == ["SEQ_1", "SEQ_3"]
Exemple #5
0
 def test_read_metadata_with_good_query(self, tmpdir):
     """A valid pandas-style query restricts parsing to matching rows."""
     path = str(tmpdir / "metadata.tsv")
     rows = (
         "strain\tlocation\tquality",
         "c_good\tcolorado\tgood",
         "c_bad\tcolorado\tbad",
         "n_good\tnevada\tgood",
     )
     with open(path, "w") as handle:
         handle.write("\n".join(rows))
     # Only c_good satisfies both clauses of the query.
     records, _ = utils.read_metadata(
         path, query='quality=="good" & location=="colorado"')
     assert len(records) == 1
     assert "c_good" in records
def parse_metadata(segments, metadata_files):
    """Load per-segment metadata and attach numeric date fields.

    For every (segment, file) pair, reads the metadata, sets each
    record's 'num_date' to the mean of its possible numerical dates,
    and derives an integer 'year' and a 1-based 'month' from it.
    Returns a dict mapping segment -> per-strain metadata dicts.
    """
    metadata = {}
    for segment, fname in zip(segments, metadata_files):
        records, columns = read_metadata(fname)

        numerical_dates = get_numerical_dates(records, fmt='%Y-%m-%d')
        for strain in records:
            entry = records[strain]
            entry['num_date'] = np.mean(numerical_dates[strain])
            entry['year'] = int(entry['num_date'])

            # Month values start at January == 1 so they compare directly
            # with datetime objects.
            entry['month'] = int((entry['num_date'] % 1) * 12) + 1
        metadata[segment] = records
    return metadata
Exemple #7
0
def parse_metadata(segments, metadata_files):
    """Load per-segment metadata, attach numeric date fields, and
    normalize the required 'age' field to years.

    Parameters
    ----------
    segments : iterable
        Segment names, zipped positionally with ``metadata_files``.
    metadata_files : iterable of str
        Paths to metadata files readable by ``read_metadata``.

    Returns
    -------
    dict
        Mapping of segment -> per-strain metadata dicts, each augmented
        with 'num_date', 'year', 'month' (0-based), and 'age' converted
        to years ('NNy' -> int, 'NNm' -> float) or 'unknown'.
    """
    metadata = {}
    for segment, fname in zip(segments, metadata_files):
        tmp_meta, columns = read_metadata(fname)

        numerical_dates = get_numerical_dates(tmp_meta, fmt='%Y-%m-%d')
        for x in tmp_meta:
            record = tmp_meta[x]
            record['num_date'] = np.mean(numerical_dates[x])
            record['year'] = int(record['num_date'])
            record['month'] = int((record['num_date'] % 1) * 12)
            age_str = record['age']
            # The empty-string guard avoids an IndexError on ''.
            if age_str and age_str[-1] == 'y':
                record['age'] = int(age_str[:-1])
            elif age_str and age_str[-1] == 'm':
                # BUG FIX: the original compared the whole value to 'm'
                # (tmp_meta[x]['age'] == 'm'), so month-suffixed ages were
                # never converted and always fell through to 'unknown'.
                record['age'] = float(age_str[:-1]) / 12.0
            else:
                record['age'] = 'unknown'

        metadata[segment] = tmp_meta
    return metadata
Exemple #8
0
                        required=True,
                        help="Output of stats for every sequence")
    parser.add_argument(
        "--output-flagged",
        type=str,
        required=True,
        help="Output of sequences flagged for exclusion with specific reasons")
    parser.add_argument("--output-exclusion-list",
                        type=str,
                        required=True,
                        help="Output to-be-reviewed addition to exclude.txt")
    args = parser.parse_args()

    # load entire alignment and the alignment of focal sequences (upper case -- probably not necessary)
    ref = SeqIO.read(args.reference, 'genbank').seq
    metadata, _ = read_metadata(args.metadata)

    diagnostics = analyze_divergence(args.alignment,
                                     metadata,
                                     ref,
                                     mask_5p=args.mask_from_beginning,
                                     mask_3p=args.mask_from_end)
    snp_cutoff = 25
    no_data_cutoff = 3000
    flagged_sequences = []
    # output diagnostics for each sequence, ordered by divergence
    with open(args.output_diagnostics, 'w') as diag:
        diag.write('\t'.join([
            'strain', 'divergence', 'excess divergence', '#Ns', '#gaps',
            'clusters', 'gaps', 'all_snps', 'gap_list'
        ]) + '\n')
Exemple #9
0
        type=int,
        default=3,
        help="minimum tips per polytomy to be consider as a cluster")
    parser.add_argument(
        "--output",
        required=True,
        help=
        "tab-delimited file with strain, cluster id, and group value for each strain"
    )

    args = parser.parse_args()

    tree = read_tree(args.tree)
    tree.collapse_all(lambda c: c.branch_length < 1e-5)

    metadata, columns = read_metadata(args.metadata)
    muts = read_node_data(args.mutations)
    attribute_name = args.attribute_name
    group_by = args.group_by

    polytomies = []
    for node in tree.find_clades(terminal=False):
        if node == tree.root:
            continue

        count_by_group = Counter()
        polytomy_sequence_id = None
        for child in node.clades:
            if child.is_terminal() and child.name:
                child_muts_data = muts["nodes"].get(child.name, {})
                any_muts = (len(child_muts_data.get("muts", [])) > 0)
    args = parser.parse_args()
    return args

if __name__ == '__main__':
    args = parse_args()
    try:
        assert(len(args.metadata)==len(args.origins))
        assert(len(args.origins)>1)
    except AssertionError:
        print("Error. Please check your inputs - there must be the same number of metadata files as origins provided, and there must be more than one of each!")
        sys.exit(2)

    # READ IN METADATA FILES
    metadata = []
    for (origin, fname) in zip(args.origins, args.metadata):
        data, columns = read_metadata(fname)
        metadata.append({'origin': origin, "fname": fname, 'data': data, 'columns': columns, 'strains': {s for s in data.keys()}})

    # SUMMARISE INPUT METADATA
    print(f"Parsed {len(metadata)} metadata TSVs")
    for m in metadata:
        print(f"\t{m['origin']} ({m['fname']}): {len(m['data'].keys())} strains x {len(m['columns'])} columns")

    # BUILD UP COLUMN NAMES FROM MULTIPLE INPUTS TO PRESERVE ORDER
    combined_columns = []
    for m in metadata:
        combined_columns.extend([c for c in m['columns'] if c not in combined_columns])
    combined_columns.extend(list(args.origins))

    # ADD IN VALUES ONE BY ONE, OVERWRITING AS NECESSARY
    combined_data = metadata[0]['data']
Exemple #11
0
from augur.utils import read_metadata
import sys
import json

# Columns copied from the private metadata sheet onto each master row.
fields_to_add = ['coverage', 'date_seq', 'lab']

if __name__ == "__main__":
    # argv[1]: master metadata TSV; argv[2]: private metadata TSV holding
    # the extra fields listed in `fields_to_add`.
    ms_dict, ms_columns = read_metadata(sys.argv[1])
    private_dict, _ = read_metadata(sys.argv[2])

    # Header row: the original columns followed by the appended fields.
    # (The original script also created a dead `data = {'nodes': {}}`
    # that was immediately shadowed by the loop variable; removed.)
    print("\t".join(ms_columns + fields_to_add))

    for strain, record in ms_dict.items():
        line = [record[f] for f in ms_columns]
        for key in fields_to_add:
            # A strain missing from the private sheet, or missing one of
            # the fields, gets an empty cell instead of aborting the merge.
            try:
                line.append(str(private_dict[strain][key]))
            except KeyError:
                line.append("")
        print("\t".join(line))
Exemple #12
0
#!/usr/bin/env python
# coding: utf-8
import argparse
from augur.utils import read_metadata, read_tree, write_json

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--metadata", help="tab-delimited metadata")
    parser.add_argument("--tree", help="Newick tree with internal node names")
    parser.add_argument(
        "--output", help="node data JSON with clade membership annotations")

    args = parser.parse_args()

    metadata, metadata_fields = read_metadata(args.metadata)
    tree = read_tree(args.tree)

    # Look for clades for which all children have the same host. To do this,
    # make a postorder traversal of the tree such that each internal node gets
    # marked with the host of its children if all children have the same host.
    # Otherwise, the internal node is marked with a host of `None` to note that
    # its children were sampled from multiple hosts.
    for node in tree.find_clades(order="postorder"):
        if node.is_terminal():
            node.host = metadata[node.name]["host"]
        else:
            # Find all unique hosts of this node's children.
            hosts = list({child.host for child in node.clades})
            if len(hosts) == 1 and hosts[0] is not None:
                node.host = hosts[0]
            else: