def test_read_metadata_bad_query(self, tmpdir):
    meta_fn = str(tmpdir / "metadata.tsv")
    meta_lines = [
        "strain\tlocation\tquality",
        "c_good\tcolorado\tgood",
        "n_bad\tnevada\tbad",
    ]
    with open(meta_fn, "w") as fh:
        fh.write("\n".join(meta_lines))
    with pytest.raises(SystemExit):
        utils.read_metadata(meta_fn, query='badcol=="goodval"')
def test_read_metadata(self, tmpdir):
    meta_fn = write_metadata(tmpdir, (("strain", "location", "quality"),
                                      ("SEQ_1", "colorado", "good"),
                                      ("SEQ_2", "colorado", "bad"),
                                      ("SEQ_3", "nevada", "good")))
    utils.read_metadata(meta_fn, as_data_frame=True)

    # duplicates SEQ_1 raises ValueError
    meta_fn = write_metadata(tmpdir, (("strain", "location", "quality"),
                                      ("SEQ_1", "colorado", "good"),
                                      ("SEQ_1", "colorado", "bad"),
                                      ("SEQ_3", "nevada", "good")))
    with pytest.raises(ValueError) as e_info:
        utils.read_metadata(meta_fn, as_data_frame=True)
    assert str(e_info.value) == "Duplicated strain in metadata: SEQ_1"
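# The tests above and below rely on a write_metadata() helper that is not shown
# in this excerpt. A minimal sketch of what such a helper could look like
# follows; the exact signature and the "metadata.tsv" file name are assumptions
# made for illustration, not the project's actual implementation.
def write_metadata(tmpdir, metadata):
    """Write rows of metadata (an iterable of tuples, header first) to a
    tab-delimited file inside tmpdir and return its path as a string."""
    fn = str(tmpdir / "metadata.tsv")
    with open(fn, "w") as fh:
        fh.write("\n".join("\t".join(row) for row in metadata))
    return fn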
def parse_metadata(segments, metadata_files, date_format="%Y-%m-%d"):
    metadata = {}
    for segment, fname in zip(segments, metadata_files):
        tmp_meta, columns = read_metadata(fname)
        numerical_dates = get_numerical_dates(tmp_meta, fmt=date_format)
        for x in tmp_meta:
            # Use the mean numerical date; fall back to the year (mid-year) if
            # the full date cannot be parsed, and to NaN if even that fails.
            try:
                tmp_meta[x]['num_date'] = np.mean(numerical_dates[x])
            except Exception:
                try:
                    tmp_meta[x]['num_date'] = int(tmp_meta[x]['date'][-4:]) + 0.5
                except Exception:
                    tmp_meta[x]['num_date'] = np.nan
                    continue

            tmp_meta[x]['year'] = int(tmp_meta[x]['num_date'])
            tmp_meta[x]['month'] = int((tmp_meta[x]['num_date'] % 1) * 12)

            if 'age' in tmp_meta[x]:
                # Normalize ages given in years ("34y") or months ("6m") to years.
                age_str = tmp_meta[x]['age']
                if age_str[-1] == 'y':
                    tmp_meta[x]['age'] = int(age_str[:-1])
                elif age_str[-1] == 'm':
                    tmp_meta[x]['age'] = float(age_str[:-1]) / 12.0
                else:
                    tmp_meta[x]['age'] = 'unknown'

        metadata[segment] = tmp_meta
    return metadata
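# Illustrative only: a hedged sketch of how parse_metadata() might be called
# for a two-segment build. The segment names and file paths are assumptions
# made for the example.
segments = ['ha', 'na']
metadata_files = ['results/metadata_ha.tsv', 'results/metadata_na.tsv']
metadata_by_segment = parse_metadata(segments, metadata_files)
for segment, records in metadata_by_segment.items():
    print(f"{segment}: {len(records)} strains")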
def test_filter_on_query_good(self, tmpdir, sequences):
    """Basic filter_on_query test"""
    meta_fn = write_metadata(tmpdir, (("strain", "location", "quality"),
                                      ("SEQ_1", "colorado", "good"),
                                      ("SEQ_2", "colorado", "bad"),
                                      ("SEQ_3", "nevada", "good")))
    metadata, columns = read_metadata(meta_fn, as_data_frame=True)
    filtered = augur.filter.filter_by_query(metadata, 'quality=="good"')
    assert sorted(filtered) == ["SEQ_1", "SEQ_3"]
def test_read_metadata_with_good_query(self, tmpdir):
    meta_fn = str(tmpdir / "metadata.tsv")
    meta_lines = [
        "strain\tlocation\tquality",
        "c_good\tcolorado\tgood",
        "c_bad\tcolorado\tbad",
        "n_good\tnevada\tgood",
    ]
    with open(meta_fn, "w") as fh:
        fh.write("\n".join(meta_lines))
    meta_dict, _ = utils.read_metadata(
        meta_fn, query='quality=="good" & location=="colorado"')
    assert len(meta_dict) == 1
    assert "c_good" in meta_dict
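# A minimal sketch of how a metadata query like the one above can be applied
# with pandas. The helper name (apply_metadata_query) and the assumption that
# metadata is held in a DataFrame indexed by strain are illustrative only, not
# a description of read_metadata's internals.
import pandas as pd

def apply_metadata_query(metadata_df, query):
    """Return the subset of a metadata DataFrame matching a pandas query string."""
    return metadata_df.query(query)

metadata_df = pd.DataFrame(
    {"strain": ["c_good", "c_bad", "n_good"],
     "location": ["colorado", "colorado", "nevada"],
     "quality": ["good", "bad", "good"]}
).set_index("strain")
subset = apply_metadata_query(metadata_df, 'quality=="good" & location=="colorado"')
assert list(subset.index) == ["c_good"]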
def parse_metadata(segments, metadata_files):
    metadata = {}
    for segment, fname in zip(segments, metadata_files):
        tmp_meta, columns = read_metadata(fname)
        numerical_dates = get_numerical_dates(tmp_meta, fmt='%Y-%m-%d')
        for x in tmp_meta:
            tmp_meta[x]['num_date'] = np.mean(numerical_dates[x])
            tmp_meta[x]['year'] = int(tmp_meta[x]['num_date'])
            # Extract month values starting at January == 1 for comparison with
            # datetime objects.
            tmp_meta[x]['month'] = int((tmp_meta[x]['num_date'] % 1) * 12) + 1
        metadata[segment] = tmp_meta
    return metadata
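# Why np.mean: get_numerical_dates() can return more than one numeric date per
# strain (for example, the bounds of an ambiguous date), and the mean collapses
# that to a single point estimate. The exact return format is an assumption
# about the helper; the arithmetic below only illustrates the idea.
import numpy as np

ambiguous_date_bounds = [2020.0, 2020.9973]   # hypothetical bounds for an ambiguous 2020 date
num_date = np.mean(ambiguous_date_bounds)     # midpoint, ~2020.5
year = int(num_date)                          # 2020
month = int((num_date % 1) * 12) + 1          # month indexed from January == 1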
def parse_metadata(segments, metadata_files):
    metadata = {}
    for segment, fname in zip(segments, metadata_files):
        tmp_meta, columns = read_metadata(fname)
        numerical_dates = get_numerical_dates(tmp_meta, fmt='%Y-%m-%d')
        for x in tmp_meta:
            tmp_meta[x]['num_date'] = np.mean(numerical_dates[x])
            tmp_meta[x]['year'] = int(tmp_meta[x]['num_date'])
            tmp_meta[x]['month'] = int((tmp_meta[x]['num_date'] % 1) * 12)

            # Normalize ages given in years ("34y") or months ("6m") to years.
            age_str = tmp_meta[x]['age']
            if age_str[-1] == 'y':
                tmp_meta[x]['age'] = int(age_str[:-1])
            elif age_str[-1] == 'm':
                tmp_meta[x]['age'] = float(age_str[:-1]) / 12.0
            else:
                tmp_meta[x]['age'] = 'unknown'
        metadata[segment] = tmp_meta
    return metadata
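# A hedged sketch of the age normalization above pulled out into a standalone
# helper, which makes the "y" = years / "m" = months convention easier to test
# in isolation. The function name is an assumption introduced for illustration.
def normalize_age(age_str):
    """Convert an age string like '34y' or '6m' to a number of years."""
    if age_str and age_str[-1] == 'y':
        return int(age_str[:-1])
    elif age_str and age_str[-1] == 'm':
        return float(age_str[:-1]) / 12.0
    return 'unknown'

assert normalize_age('34y') == 34
assert normalize_age('6m') == 0.5
assert normalize_age('n/a') == 'unknown'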
    required=True,
    help="Output of stats for every sequence")
parser.add_argument(
    "--output-flagged", type=str, required=True,
    help="Output of sequences flagged for exclusion with specific reasons")
parser.add_argument(
    "--output-exclusion-list", type=str, required=True,
    help="Output to-be-reviewed addition to exclude.txt")
args = parser.parse_args()

# load entire alignment and the alignment of focal sequences (upper case -- probably not necessary)
ref = SeqIO.read(args.reference, 'genbank').seq
metadata, _ = read_metadata(args.metadata)
diagnostics = analyze_divergence(args.alignment, metadata, ref,
                                 mask_5p=args.mask_from_beginning,
                                 mask_3p=args.mask_from_end)
snp_cutoff = 25
no_data_cutoff = 3000
flagged_sequences = []

# output diagnostics for each sequence, ordered by divergence
with open(args.output_diagnostics, 'w') as diag:
    diag.write('\t'.join(['strain', 'divergence', 'excess divergence',
                          '#Ns', '#gaps', 'clusters', 'gaps',
                          'all_snps', 'gap_list']) + '\n')
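    # A hedged sketch of how the rest of this block might proceed: write one
    # row per sequence sorted by divergence, and flag outliers using the
    # cutoffs defined above. The keys on each diagnostics entry ("divergence",
    # "Ns", "gaps", ...) are assumptions made for illustration, not the
    # script's verified output format.
    for strain, d in sorted(diagnostics.items(),
                            key=lambda item: item[1].get("divergence", 0),
                            reverse=True):
        row = [strain] + [str(d.get(k, "")) for k in
                          ("divergence", "excess_divergence", "Ns", "gaps",
                           "clusters", "gap_ranges", "all_snps", "gap_list")]
        diag.write('\t'.join(row) + '\n')
        if d.get("divergence", 0) > snp_cutoff or d.get("Ns", 0) > no_data_cutoff:
            flagged_sequences.append(strain)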
    type=int, default=3,
    help="minimum tips per polytomy to be considered as a cluster")
parser.add_argument(
    "--output", required=True,
    help="tab-delimited file with strain, cluster id, and group value for each strain")
args = parser.parse_args()

tree = read_tree(args.tree)
# Collapse effectively zero-length branches into polytomies.
tree.collapse_all(lambda c: c.branch_length < 1e-5)
metadata, columns = read_metadata(args.metadata)
muts = read_node_data(args.mutations)
attribute_name = args.attribute_name
group_by = args.group_by

polytomies = []
for node in tree.find_clades(terminal=False):
    if node == tree.root:
        continue
    count_by_group = Counter()
    polytomy_sequence_id = None
    for child in node.clades:
        if child.is_terminal() and child.name:
            child_muts_data = muts["nodes"].get(child.name, {})
            any_muts = (len(child_muts_data.get("muts", [])) > 0)
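# A hedged sketch of what the end of this script could look like, based only on
# the --output help text above ("strain, cluster id, and group value for each
# strain"). The structure of the polytomies list (one iterable of strain names
# per cluster) is an assumption made for illustration.
with open(args.output, "w") as out:
    out.write("strain\tcluster\t" + group_by + "\n")
    for cluster_id, strains in enumerate(polytomies):
        for strain in strains:
            group_value = metadata.get(strain, {}).get(group_by, "")
            out.write(f"{strain}\t{cluster_id}\t{group_value}\n")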
    args = parser.parse_args()
    return args

if __name__ == '__main__':
    args = parse_args()
    try:
        assert len(args.metadata) == len(args.origins)
        assert len(args.origins) > 1
    except AssertionError:
        print("Error. Please check your inputs - there must be the same number of metadata files as origins provided, and there must be more than one of each!")
        sys.exit(2)

    # READ IN METADATA FILES
    metadata = []
    for (origin, fname) in zip(args.origins, args.metadata):
        data, columns = read_metadata(fname)
        metadata.append({'origin': origin, 'fname': fname, 'data': data,
                         'columns': columns, 'strains': {s for s in data.keys()}})

    # SUMMARISE INPUT METADATA
    print(f"Parsed {len(metadata)} metadata TSVs")
    for m in metadata:
        print(f"\t{m['origin']} ({m['fname']}): {len(m['data'].keys())} strains x {len(m['columns'])} columns")

    # BUILD UP COLUMN NAMES FROM MULTIPLE INPUTS TO PRESERVE ORDER
    combined_columns = []
    for m in metadata:
        combined_columns.extend([c for c in m['columns'] if c not in combined_columns])
    combined_columns.extend(list(args.origins))

    # ADD IN VALUES ONE BY ONE, OVERWRITING AS NECESSARY
    combined_data = metadata[0]['data']
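    # A hedged sketch of how the merge could continue from here: later metadata
    # files overwrite values from earlier ones, and one extra column per origin
    # records which inputs contained each strain. This is an assumed
    # continuation for illustration, not the script's verified behavior.
    for m in metadata[1:]:
        for strain, row in m['data'].items():
            combined_data.setdefault(strain, {}).update(row)
    for m in metadata:
        for strain in combined_data:
            combined_data[strain][m['origin']] = "yes" if strain in m['strains'] else "no"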
from augur.utils import read_metadata
import sys
import json

fields_to_add = ['coverage', 'date_seq', 'lab']

if __name__ == "__main__":
    data = {'nodes': {}}

    ms_dict, ms_columns = read_metadata(sys.argv[1])
    private_dict, _ = read_metadata(sys.argv[2])

    print("\t".join(ms_columns + fields_to_add))
    for strain, data in ms_dict.items():
        line = [data[f] for f in ms_columns]
        for key in fields_to_add:
            try:
                line.append(str(private_dict[strain][key]))
            except KeyError:
                line.append("")
        print("\t".join(line))
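# Example invocation for the script above: merge extra fields from a private
# metadata TSV into the main metadata and print the result as TSV. The script
# and file names here are assumptions for the example.
#
#   python add_private_fields.py metadata.tsv private_metadata.tsv > metadata_with_private.tsv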
#!/usr/bin/env python
# coding: utf-8
import argparse

from augur.utils import read_metadata, read_tree, write_json

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--metadata", help="tab-delimited metadata")
    parser.add_argument("--tree", help="Newick tree with internal node names")
    parser.add_argument("--output", help="node data JSON with clade membership annotations")
    args = parser.parse_args()

    metadata, metadata_fields = read_metadata(args.metadata)
    tree = read_tree(args.tree)

    # Look for clades for which all children have the same host. To do this,
    # make a postorder traversal of the tree such that each internal node gets
    # marked with the host of its children if all children have the same host.
    # Otherwise, the internal node is marked with a host of `None` to note that
    # its children were sampled from multiple hosts.
    for node in tree.find_clades(order="postorder"):
        if node.is_terminal():
            node.host = metadata[node.name]["host"]
        else:
            # Find all unique hosts of this node's children.
            hosts = list({child.host for child in node.clades})
            if len(hosts) == 1 and hosts[0] is not None:
                node.host = hosts[0]
            else:
                node.host = None
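    # A hedged sketch of how the script might finish: record each node's host
    # assignment and write it out as a node data JSON with write_json(), which
    # is imported above. The output key name ("host") is an assumption made for
    # illustration, not the script's verified schema.
    node_data = {"nodes": {}}
    for node in tree.find_clades():
        if node.name and node.host is not None:
            node_data["nodes"][node.name] = {"host": node.host}
    write_json(node_data, args.output)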