import pickle

# Aliases inferred from usage below: aio/nio are assumed to be Bio.AlignIO
# and Bio.Phylo.NewickIO, and cfg is the project's config module providing
# dataPath().
from Bio import AlignIO as aio
from Bio.Phylo import NewickIO as nio


def get_fam(rfid):
    '''Get a family including tree and sequence information from an Rfam
    data dump stored in data/rfam.

    inputs:
      rfid: rfam family id.

    outputs:
      ali:  a biopython alignment
      tree: a biopython tree from a newick file.
      info: information parsed from the original stockholm file.
    '''
    with open(cfg.dataPath('rfam/family_alis/{0}.fa'.format(rfid))) as fali:
        ali = next(aio.parse(fali, 'fasta'))
    # Pickles must be opened in binary mode under Python 3.
    with open(cfg.dataPath('rfam/family_metas/{0}.pickle'.format(rfid)), 'rb') as fmeta:
        info = pickle.load(fmeta)
    fname = cfg.dataPath('rfam/Rfam.seed_tree/{0}.seed_tree'.format(rfid))
    with open(fname) as ftree:
        tree = next(nio.parse(ftree))
    return ali, tree, info
import json
from csv import DictReader

from Bio import SeqIO
from Bio.Phylo import NewickIO


def test_phylotree(self):
    sample_names = [s["sample_name"] for s in self.common_inputs["samples"]]
    res = self.run_miniwdl()
    outputs = res["outputs"]
    self.assertCountEqual(outputs.keys(), [
        "phylotree.clustermap_png",
        "phylotree.clustermap_svg",
        "phylotree.ncbi_metadata_json",
        "phylotree.phylotree_newick",
        "phylotree.ska_distances",
        "phylotree.variants",
    ])

    # Every sample name and accession id should appear as a node in the tree.
    with open(outputs["phylotree.phylotree_newick"]) as f:
        tree = next(NewickIO.parse(f))
    nodes = [
        n.name
        for n in tree.get_terminals() + tree.get_nonterminals()
        if n.name
    ]
    self.assertCountEqual(nodes, sample_names + self.accession_ids)

    # The ska distances should cover every unordered pair of identifiers.
    identifiers = sorted(sample_names + self.accession_ids)
    with open(outputs["phylotree.ska_distances"]) as f:
        pairs = [
            sorted([r["Sample 1"], r["Sample 2"]])
            for r in DictReader(f, delimiter="\t")
        ]
    expected = [[a, b] for a in identifiers for b in identifiers if a < b]
    self.assertCountEqual(pairs, expected)

    with open(outputs["phylotree.variants"]) as f:
        self.assertCountEqual(identifiers,
                              [r.id for r in SeqIO.parse(f, "fasta")])

    with open(outputs["phylotree.ncbi_metadata_json"]) as f:
        self.assertEqual(json.load(f), {
            "NC_012532.1": {
                "name": "Zika virus, complete genome",
                "country": "Uganda",
            },
            "NC_035889.1": {
                "name": "Zika virus isolate ZIKV/H. sapiens/Brazil/Natal/2015, complete genome",
                "country": "Brazil: Rio Grande do Norte, Natal",
                "collection_date": "2015",
            },
        })

    # Each name is expected to appear exactly twice in the SVG.
    with open(outputs["phylotree.clustermap_svg"]) as f:
        full_text = f.read()
    for name in sample_names + self.accession_ids:
        self.assertEqual(full_text.count(name), 2, name)
from io import StringIO

from Bio.Phylo import NewickIO


def readOneTree(stream):
    """Reads a Newick-formatted tree, permitting lines with comments
    denoted by a leading '#'."""
    tree_string = ""
    for line in stream.readlines():
        line = line.strip()
        # Skip comment lines, and also blank lines, which would otherwise
        # raise an IndexError on line[0].
        if line and not line.startswith('#'):
            tree_string += line
    trees = NewickIO.parse(StringIO(tree_string))
    return next(trees)
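# Usage sketch for readOneTree() above; the comment line and the tree
# literal are hypothetical.
handle = StringIO("# an example comment line\n((A:1,B:1):1,C:2);\n")
print(sorted(leaf.name for leaf in readOneTree(handle).get_terminals()))
# ['A', 'B', 'C']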
from typing import Iterable

from Bio.Phylo import NewickIO


def main(newick: str, output_newick: str, samples: Iterable[Sample]):
    sample_name_by_workflow_run_id = {
        str(s["workflow_run_id"]): s["sample_name"] for s in samples
    }
    with open(newick) as i, open(output_newick, "w") as o:
        tree = next(NewickIO.parse(i))
        # Rename nodes whose names are workflow run ids; leave others as-is.
        for node in tree.find_clades(order="level"):
            node.name = sample_name_by_workflow_run_id.get(node.name, node.name)
        NewickIO.write([tree], o)
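# Example invocation of main() above. Sample is assumed to be a dict-like
# type keyed by "workflow_run_id" and "sample_name", as the lookups imply;
# the file names are hypothetical.
example_samples = [
    {"workflow_run_id": 101, "sample_name": "patient_a"},
    {"workflow_run_id": 102, "sample_name": "patient_b"},
]
# main("phylotree.nwk", "phylotree_renamed.nwk", example_samples)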
from Bio import AlignIO
from Bio.Phylo import NewickIO

# FASTA_EXTENSIONS, NEWICK_EXTENSIONS, FASTA, MIN_COUNT, NUM_OF_VIRIONS,
# get_count() and build_phylogenetic_tree() are defined elsewhere in this
# module.


def read_fasta_or_newick_and_return_tree(path, nwk_path=None, patt=None):
    global NUM_OF_VIRIONS
    tree = None  # avoid a NameError when neither extension matches
    if any(path.name.endswith(x) for x in FASTA_EXTENSIONS):
        seqs = AlignIO.read(path, FASTA)
        # Drop low-abundance records (mutates the alignment's private
        # _records list, as in the original).
        seqs._records = [x for x in seqs if get_count(x, patt) > MIN_COUNT]
        NUM_OF_VIRIONS = int(sum(get_count(x, patt) for x in seqs))
        if len(seqs) <= 2:
            return None
        tree = build_phylogenetic_tree(seqs)
        if nwk_path is not None and tree is not None:
            # NewickIO.write() expects a handle, not a path.
            with open(nwk_path, "w") as out:
                NewickIO.write([tree], out)
    elif any(path.name.endswith(x) for x in NEWICK_EXTENSIONS):
        with open(path) as handle:
            tree = next(NewickIO.parse(handle))
        # Root the tree if necessary
        if not tree.rooted:
            tree.root_at_midpoint()
    return tree
    n.name = 'n{}'.format(i)  # (fragment: tail of a loop naming internal nodes)
tree_ids = [_.name for _ in tree]

df = pd.read_csv(params.tab, sep='\t', index_col=0)
df = df.loc[df.index.isin(tree_ids), :]
cdf = df[['country', 'host']].groupby(['country']).count().to_dict()['host']
for c, n in cdf.items():
    print(c, n)

c2ids = defaultdict(set)
for t in tree:
    if t.name in df.index:
        c2ids[df.loc[t.name, 'country']].add(t.name)

# Keep at most params.threshold tips per country, subsampling at random.
to_keep = set()
for c, ids in c2ids.items():
    if not pd.isna(c):
        if len(ids) <= params.threshold:
            to_keep |= ids
        else:
            # np is numpy; pd.np was removed in pandas 1.0.
            to_keep |= set(np.random.choice(list(ids),
                                            size=params.threshold,
                                            replace=False))

tree = remove_certain_leaves(tree, lambda _: _.name not in to_keep)

features = [DATE, DATE_CI]
nwk = tree.write(format_root_node=True, features=features, format=3)
write(NewickIO.parse(StringIO(nwk)), params.out_tree, 'nexus')

# Rewrite ete3's NHX comments as Nexus-style annotations.
with open(params.out_tree, 'r') as f:
    nexus_str = f.read().replace('&&NHX:', '&')
for feature in features:
    nexus_str = nexus_str.replace(':{}='.format(feature),
                                  ',{}='.format(feature))
with open(params.out_tree, 'w') as f:
    f.write(nexus_str)
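# Standalone illustration of the two replacements above on a hypothetical
# NHX-annotated newick fragment (assuming DATE == 'date' and
# DATE_CI == 'date_CI'):
nhx = 'A:1.0[&&NHX:date=2020:date_CI=2019|2021]'
nexus = nhx.replace('&&NHX:', '&')
for feature in ('date', 'date_CI'):
    nexus = nexus.replace(':{}='.format(feature), ',{}='.format(feature))
print(nexus)  # A:1.0[&date=2020,date_CI=2019|2021]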
if not os.path.isfile(fname):
    raise IOError("# Error: file {} does not exist".format(fname))
with open(fname, 'r') as inf:
    # Read a FASTA file
    (headers, seqs) = biofile.readFASTA(inf)

# Read tree
tree_fname = os.path.expanduser(options.tree_in_fname)
if not os.path.isfile(tree_fname):
    raise IOError("# Error: file {} does not exist".format(tree_fname))
tree_string = ""
with open(tree_fname, 'r') as inf:
    for line in inf.readlines():
        line = line.strip()
        # Skip comment lines, and also blank lines, which would otherwise
        # raise an IndexError on line[0].
        if line and not line.startswith('#'):
            tree_string += line
trees = NewickIO.parse(StringIO(tree_string))
tree = next(trees)

# Read mapping file
map_fname = os.path.expanduser(options.mapping_in_fname)
if not os.path.isfile(map_fname):
    raise IOError("# Error: file {} does not exist".format(map_fname))
with open(map_fname, 'r') as inf:
    map_table = util.readTable(inf, header=True)

# Create mapping from old to updated species names
mapping_dict = dict(zip(map_table['species'], map_table['updated.species']))

# Update the FASTA headers
#new_headers = []
#new_seqs = []
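# A hedged sketch of the header-update step the commented stubs above point
# to: substitute each species name with its updated form wherever it occurs
# in a header. The loop body is an assumption; the original code was elided.
new_headers = []
for h in headers:
    for old, new in mapping_dict.items():
        h = h.replace(old, new)
    new_headers.append(h)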