Exemple #1
0
def get_fam(rfid):
    """Get a family including tree and sequence information
    from an Rfam data dump stored in data/rfam.

    The three per-family files are located through ``cfg.dataPath`` and
    every handle is closed before the function returns.

    inputs:
      rfid:   rfam family id, substituted into the three file names.

    outputs:
      ali:    a biopython alignment (first alignment in the FASTA file).
      tree:   a biopython tree from a newick file (first tree in the file).
      info:   information parsed from the original stockholm file,
              as unpickled from the family's metadata file.

    raises:
      StopIteration: propagated from ``next()`` when the alignment or
                     tree parser yields nothing.
    """
    meta_path = cfg.dataPath('rfam/family_metas/{0}.pickle'.format(rfid))
    ali_path = cfg.dataPath('rfam/family_alis/{0}.fa'.format(rfid))
    tree_path = cfg.dataPath('rfam/Rfam.seed_tree/{0}.seed_tree'.format(rfid))

    # Pickles are binary data: 'rb' is required on Python 3 and harmless on
    # Python 2.
    # NOTE(review): pickle.load can execute arbitrary code; this is only safe
    # as long as the data dump is trusted.
    with open(meta_path, 'rb') as fmeta, open(ali_path) as fali:
        # The builtin next() works on both Python 2 and 3, unlike .next().
        ali = next(aio.parse(fali, 'fasta'))
        info = pickle.load(fmeta)

    with open(tree_path) as ftree:
        tree = next(nio.parse(ftree))

    return ali, tree, info
    def test_phylotree(self):
        """Run the workflow via ``run_miniwdl`` and check every phylotree output.

        Checks, in order: the set of output keys, the named nodes of the
        Newick tree, the sample pairs in the SKA distance table, the record
        ids of the variants FASTA, the NCBI metadata JSON, and that each
        identifier occurs exactly twice in the clustermap SVG text.
        """
        sample_names = []
        for sample in self.common_inputs["samples"]:
            sample_names.append(sample["sample_name"])

        outputs = self.run_miniwdl()["outputs"]

        expected_keys = [
            "phylotree.clustermap_png",
            "phylotree.clustermap_svg",
            "phylotree.ncbi_metadata_json",
            "phylotree.phylotree_newick",
            "phylotree.ska_distances",
            "phylotree.variants",
        ]
        self.assertCountEqual(outputs.keys(), expected_keys)

        # Every named clade (leaf or internal) must be a sample or accession.
        with open(outputs["phylotree.phylotree_newick"]) as newick_file:
            tree = next(NewickIO.parse(newick_file))
            clades = tree.get_terminals() + tree.get_nonterminals()
            named_nodes = [clade.name for clade in clades if clade.name]
            self.assertCountEqual(named_nodes,
                                  sample_names + self.accession_ids)

        identifiers = sorted(sample_names + self.accession_ids)

        # The distance table must hold each unordered pair of identifiers.
        with open(outputs["phylotree.ska_distances"]) as distances_file:
            observed_pairs = []
            for row in DictReader(distances_file, delimiter="\t"):
                observed_pairs.append(
                    sorted([row["Sample 1"], row["Sample 2"]]))
            expected_pairs = []
            for first in identifiers:
                for second in identifiers:
                    if first < second:
                        expected_pairs.append([first, second])
            self.assertCountEqual(observed_pairs, expected_pairs)

        with open(outputs["phylotree.variants"]) as variants_file:
            record_ids = [rec.id for rec in SeqIO.parse(variants_file, "fasta")]
            self.assertCountEqual(identifiers, record_ids)

        expected_metadata = {
            "NC_012532.1": {
                "name": "Zika virus, complete genome",
                "country": "Uganda",
            },
            "NC_035889.1": {
                "name":
                "Zika virus isolate ZIKV/H. sapiens/Brazil/Natal/2015, complete genome",
                "country": "Brazil: Rio Grande do Norte, Natal",
                "collection_date": "2015",
            },
        }
        with open(outputs["phylotree.ncbi_metadata_json"]) as metadata_file:
            self.assertEqual(json.load(metadata_file), expected_metadata)

        with open(outputs["phylotree.clustermap_svg"]) as svg_file:
            svg_lines = svg_file.readlines()
            full_text = "\n".join(svg_lines)
            for name in sample_names + self.accession_ids:
                occurrences = full_text.count(name)
                self.assertEqual(occurrences, 2, name)
Exemple #3
0
def readOneTree(stream):
	"""Reads a Newick-formatted tree, permitting lines with comments denoted by leading '#'.

	Each line is stripped of surrounding whitespace. Blank lines and lines
	whose first non-whitespace character is '#' are dropped; the remaining
	lines are concatenated and handed to NewickIO.parse.

	Args:
		stream: object providing readlines() that returns text lines.

	Returns:
		The first tree yielded by NewickIO.parse.

	Raises:
		StopIteration: propagated from next() if the parser yields no tree.
	"""
	pieces = []
	for line in stream.readlines():
		text = line.strip()
		# Indexing text[0] raised IndexError on blank lines (e.g. a trailing
		# newline); startswith() is safe on empty strings, and blank lines
		# carry no tree data, so they are skipped.
		if not text or text.startswith('#'):
			continue
		pieces.append(text)
	trees = NewickIO.parse(StringIO("".join(pieces)))
	return next(trees)
Exemple #4
0
def readOneTree(stream):
	"""Reads a Newick-formatted tree, permitting lines with comments denoted by leading '#'.

	Each line is stripped of surrounding whitespace. Blank lines and lines
	whose first non-whitespace character is '#' are dropped; the remaining
	lines are concatenated and handed to NewickIO.parse.

	Args:
		stream: object providing readlines() that returns text lines.

	Returns:
		The first tree yielded by NewickIO.parse.

	Raises:
		StopIteration: propagated from next() if the parser yields no tree.
	"""
	kept = []
	for raw_line in stream.readlines():
		stripped = raw_line.strip()
		# An empty stripped line has no [0] element (the old check raised
		# IndexError there), so test for emptiness before the comment marker.
		if stripped and not stripped.startswith('#'):
			kept.append(stripped)
	trees = NewickIO.parse(StringIO("".join(kept)))
	return next(trees)
def main(newick: str, output_newick: str, samples: Iterable[Sample]):
    """Rename tree nodes from workflow run ids to sample names.

    Reads the first tree from the file at ``newick``, replaces the name of
    every clade whose name matches the stringified ``workflow_run_id`` of
    one of ``samples`` with that sample's ``sample_name``, and writes the
    tree to ``output_newick``. Clades with no matching sample keep their
    current name.
    """
    # Keys are stringified so they compare equal to the names read from the
    # tree file.
    names_by_run_id = {}
    for sample in samples:
        run_id = str(sample["workflow_run_id"])
        names_by_run_id[run_id] = sample["sample_name"]

    with open(newick) as source, open(output_newick, "w") as destination:
        parsed_trees = NewickIO.parse(source)
        tree = next(parsed_trees)
        for clade in tree.find_clades(order="level"):
            current = clade.name
            clade.name = names_by_run_id.get(current, current)
        NewickIO.write([tree], destination)
Exemple #6
0
def read_fasta_or_newick_and_return_tree(path, nwk_path=None, patt=None):
    """Build a tree from a FASTA alignment, or load one from a Newick file.

    The branch taken depends on the suffix of ``path.name``:

    * FASTA_EXTENSIONS: the alignment is read, records whose
      ``get_count(record, patt)`` is not above MIN_COUNT are dropped, the
      module-level NUM_OF_VIRIONS is set to the summed count of the kept
      records, and a tree is built with ``build_phylogenetic_tree``. If
      ``nwk_path`` is given, the built tree is also written there.
    * NEWICK_EXTENSIONS: the first tree in the file is parsed.

    The resulting tree is rooted at its midpoint when it is not already
    rooted.

    Args:
        path: object exposing ``.name``; also passed to AlignIO.read /
            NewickIO.parse.
        nwk_path: optional destination handed to NewickIO.write.
        patt: forwarded unchanged to ``get_count``.

    Returns:
        The tree, or None when at most two sequences remain after filtering
        or when ``build_phylogenetic_tree`` returns None.

    Raises:
        ValueError: if ``path.name`` has neither a FASTA nor a Newick
            extension (previously an UnboundLocalError).
    """
    global NUM_OF_VIRIONS
    file_name = path.name

    if file_name.endswith(tuple(FASTA_EXTENSIONS)):
        seqs = AlignIO.read(path, FASTA)
        seqs._records = [x for x in seqs if get_count(x, patt) > MIN_COUNT]
        NUM_OF_VIRIONS = int(sum(get_count(x, patt) for x in seqs))

        if len(seqs) <= 2:
            return None
        tree = build_phylogenetic_tree(seqs)
        # Tree building may yield None; bail out before touching tree.rooted.
        if tree is None:
            return None
        if nwk_path is not None:
            NewickIO.write([tree], nwk_path)
    elif file_name.endswith(tuple(NEWICK_EXTENSIONS)):
        # The builtin next() works on both Python 2 and 3, unlike .next().
        tree = next(NewickIO.parse(path))
    else:
        raise ValueError(
            "Unsupported file extension for {!r}: expected a FASTA or "
            "Newick file".format(file_name))

    # Root the tree if necessary
    if not tree.rooted:
        tree.root_at_midpoint()

    return tree
Exemple #7
0
def get_fam(rfid):
    """Get a family including tree and sequence information
    from an Rfam data dump stored in data/rfam.

    The three per-family files are located through ``cfg.dataPath`` and
    every handle is closed before the function returns.

    inputs:
      rfid:   rfam family id, substituted into the three file names.

    outputs:
      ali:    a biopython alignment (first alignment in the FASTA file).
      tree:   a biopython tree from a newick file (first tree in the file).
      info:   information parsed from the original stockholm file,
              as unpickled from the family's metadata file.

    raises:
      StopIteration: propagated from ``next()`` when the alignment or
                     tree parser yields nothing.
    """
    meta_path = cfg.dataPath('rfam/family_metas/{0}.pickle'.format(rfid))
    ali_path = cfg.dataPath('rfam/family_alis/{0}.fa'.format(rfid))
    tree_path = cfg.dataPath('rfam/Rfam.seed_tree/{0}.seed_tree'.format(rfid))

    # Pickles are binary data: 'rb' is required on Python 3 and harmless on
    # Python 2.
    # NOTE(review): pickle.load can execute arbitrary code; this is only safe
    # as long as the data dump is trusted.
    with open(meta_path, 'rb') as fmeta, open(ali_path) as fali:
        # The builtin next() works on both Python 2 and 3, unlike .next().
        ali = next(aio.parse(fali, 'fasta'))
        info = pickle.load(fmeta)

    with open(tree_path) as ftree:
        tree = next(nio.parse(ftree))

    return ali, tree, info
Exemple #8
0
            n.name = 'n{}'.format(i)
    tree_ids = [_.name for _ in tree]
    df = pd.read_csv(params.tab, sep='\t', index_col=0)
    df = df.loc[df.index.isin(tree_ids), :]
    cdf = df[['country', 'host']].groupby(['country']).count().to_dict()['host']
    for c, n in cdf.items():
        print(c, n)

    c2ids = defaultdict(set)
    for t in tree:
        if t.name in df.index:
            c2ids[df.loc[t.name, 'country']].add(t.name)
    to_keep = set()
    for c, ids in c2ids.items():
        if not pd.isna(c):
            if len(ids) <= params.threshold:
                to_keep |= ids
            else:
                to_keep |= set(pd.np.random.choice(list(ids), size=params.threshold, replace=False))

    tree = remove_certain_leaves(tree, lambda _: _.name not in to_keep)
    features = [DATE, DATE_CI]
    nwk = tree.write(format_root_node=True, features=features, format=3)
    write(NewickIO.parse(StringIO(nwk)), params.out_tree, 'nexus')
    with open(params.out_tree, 'r') as f:
        nexus_str = f.read().replace('&&NHX:', '&')
    for feature in features:
        nexus_str = nexus_str.replace(':{}='.format(feature), ',{}='.format(feature))
    with open(params.out_tree, 'w') as f:
        f.write(nexus_str)
Exemple #9
0
    if not os.path.isfile(fname):
        raise IOError("# Error: file {} does not exist".format(fname))
    with open(fname, 'r') as inf:
        # Read a FASTA file?
        (headers, seqs) = biofile.readFASTA(inf)
    # Read tree
    tree_fname = os.path.expanduser(options.tree_in_fname)
    if not os.path.isfile(tree_fname):
        raise IOError("# Error: file {} does not exist".format(tree_fname))
    tree_string = ""
    with open(tree_fname, 'r') as inf:
        lines = inf.readlines()
        for line in lines:
            if not line.strip()[0] == '#':
                tree_string += line.strip()
    trees = NewickIO.parse(StringIO(tree_string))
    tree = next(trees)
    # Read mapping file
    map_fname = os.path.expanduser(options.mapping_in_fname)
    if not os.path.isfile(map_fname):
        raise IOError("# Error: file {} does not exist".format(map_fname))
    with open(map_fname, 'r') as inf:
        map_table = util.readTable(inf, header=True)

    # Create mapping
    mapping_dict = dict(zip(map_table['species'],
                            map_table['updated.species']))

    # Update the FASTA headers
    #new_headers = []
    #new_seqs = []