def test_read_write(tmpdir):
    """Round-trip a Newick fixture: read, write, and re-read must agree."""
    fixture = os.path.join(
        os.path.dirname(__file__), 'fixtures', 'tree-glottolog-newick.txt')
    trees = read(fixture)
    sizes = [len(t.descendants) for t in trees]
    # The bookkeeping family has 391 languages
    assert sizes[0] == 391
    out = str(tmpdir.join('test.txt'))
    write(trees, out)
    assert os.path.exists(out)
    assert [len(t.descendants) for t in read(out)] == sizes
def test_read_write(tmpdir):
    """read() and write() must round-trip the Glottolog fixture tree."""
    path = os.path.join(
        os.path.dirname(__file__), 'fixtures', 'tree-glottolog-newick.txt')
    parsed = read(path)
    counts = [len(t.descendants) for t in parsed]
    # The bookkeeping family has 391 languages
    assert counts[0] == 391
    target = str(tmpdir.join('test.txt'))
    write(parsed, target)
    assert os.path.exists(target)
    reread = read(target)
    assert [len(t.descendants) for t in reread] == counts
def test_read_write(self):
    """Round-trip the Glottolog fixture through read() and write().

    Fixes two defects of the original:
    * ``mktemp()`` is deprecated and racy — another process can create
      the file between name generation and use.  ``tempfile.mkstemp()``
      creates the file atomically instead.
    * the temp file leaked when any assertion failed, because
      ``os.remove`` was only reached on success; cleanup now happens in
      a ``finally`` block.
    """
    import tempfile
    trees = read(os.path.join(
        os.path.dirname(__file__), 'fixtures', 'tree-glottolog-newick.txt'))
    descs = [len(tree.descendants) for tree in trees]
    # The bookkeeping family has 391 languages
    self.assertEqual(descs[0], 391)
    fd, tmp = tempfile.mkstemp()
    os.close(fd)  # write() reopens the path itself; keep no dangling fd
    try:
        write(trees, tmp)
        assert os.path.exists(tmp)
        self.assertEqual(
            [len(tree.descendants) for tree in read(tmp)], descs)
    finally:
        # Remove the temp file even when an assertion above fails.
        if os.path.exists(tmp):
            os.remove(tmp)
def test_read_write(tmpdir):
    """Writing parsed trees and reading them back must preserve structure."""
    fixture = (
        pathlib.Path(__file__).parent / 'fixtures' / 'tree-glottolog-newick.txt')
    trees = read(fixture)
    child_counts = [len(t.descendants) for t in trees]
    # The bookkeeping family has 391 languages
    assert child_counts[0] == 391
    out_path = str(tmpdir.join('test.txt'))
    write(trees, out_path)
    assert pathlib.Path(out_path).exists()
    assert [len(t.descendants) for t in read(out_path)] == child_counts
def test_read_write(tmp_path):
    """Round-trip via tmp_path; also checks bracketed annotations survive."""
    src = pathlib.Path(__file__).parent / 'fixtures' / 'tree-glottolog-newick.txt'
    trees = read(src)
    # Node names in this fixture carry bracketed Glottolog annotations.
    assert '[' in trees[0].descendants[0].name
    shape = [len(t.descendants) for t in trees]
    # The bookkeeping family has 391 languages
    assert shape[0] == 391
    dest = tmp_path / 'test.txt'
    write(trees, dest)
    assert dest.exists()
    assert [len(t.descendants) for t in read(dest)] == shape
def parse_newick_file(filename: str, digraph=True):
    """
    Parses a newick file and returns the networkx graph.

    :param filename: str; full path of the to be parsed file
    :param digraph: Bool; build a directed graph when True
    :return: nx.DiGraph() when digraph is True, otherwise nx.Graph()
    """
    tree = newick.read(filename)
    if digraph:
        graph_newick = nx.DiGraph()
    else:
        # BUG FIX: the original assigned the *class* ``nx.Graph`` (missing
        # parentheses) instead of an instance, so the undirected branch
        # failed on the very first ``add_node`` call.
        graph_newick = nx.Graph()
    none_counter = 1
    # Adding root node
    graph_newick.add_node(tree[0], child_position=0)
    # Breadth-style expansion: take the first pending node, give unnamed
    # nodes a synthetic name, add it (plus its edge) to the graph, then
    # queue its descendants.
    while tree:
        tree_node = tree[0]
        tree_node, none_counter = rename_none_node(tree_node, none_counter)
        graph_newick, descendants, none_counter = add_newick_node_and_edge(
            graph_newick, tree_node, none_counter)
        tree += descendants
        tree.remove(tree_node)
    return graph_newick
def test_get_glottolog_newick(tmppath, mocker):
    """get_glottolog_data must locate the Newick file under the user data dir."""
    tree_text = '(B [abcd1234],C [abcd1234])A [abcd1234];'
    tmppath.joinpath('glottolog-2.5.newick').write_text(tree_text, encoding='utf8')
    # Point the configuration's user_data_dir at our temporary directory.
    mocker.patch(
        'beastling.configuration.user_data_dir',
        new=mocker.Mock(return_value=str(tmppath)))
    parsed = newick.read(str(get_glottolog_data('newick', '2.5')))
    assert parsed[0].name == 'A [abcd1234]'
def test_get_glottolog_newick(self):
    """get_glottolog_data must find the Newick file in the user data dir."""
    target = self.tmp.joinpath('glottolog-2.5.newick')
    with target.open('w', encoding='utf8') as fp:
        fp.write('(B [abcd1234],C [abcd1234])A [abcd1234];')
    # Redirect the configuration's data directory to the temp dir.
    patcher = patch(
        'beastling.configuration.user_data_dir',
        new=Mock(return_value=self.tmp.as_posix()))
    with patcher:
        trees = newick.read(get_glottolog_data('newick', '2.5'))
        self.assertEqual(trees[0].name, 'A [abcd1234]')
def test_get_glottolog_newick(self):
    """A fabricated Glottolog Newick file is found via get_glottolog_data."""
    newick_text = '(B [abcd1234],C [abcd1234])A [abcd1234];'
    with self.tmp.joinpath('glottolog-2.5.newick').open('w', encoding='utf8') as fp:
        fp.write(newick_text)
    with patch('beastling.configuration.user_data_dir',
               new=Mock(return_value=self.tmp.as_posix())):
        root = newick.read(get_glottolog_data('newick', '2.5'))[0]
        self.assertEqual(root.name, 'A [abcd1234]')
def trees_from_file(fname, encoding='utf8', strip_comments=False, **kw):
    """
    Read a Newick formatted file and convert every tree in it.

    :param fname: file path.
    :param encoding: text encoding of the file.
    :param strip_comments: Flag signaling whether to strip comments enclosed \
    in square brackets.
    :param kw: Keyword arguments are passed through to `Node.read`.
    :return: list of `PhyloTree` instances.
    """
    parsed = newick.read(fname, encoding, strip_comments, **kw)
    return list(map(newick_node_to_tree, parsed))
def tree_probs_from_file(fname, encoding='utf8', strip_comments=False, **kw):
    """
    Read a Newick file and return the probability expression of each tree.

    Every parsed tree is converted to a `PhyloTree` first; the resulting
    expressions are symbolic in `a`.

    :param fname: file path.
    :param encoding: text encoding of the file.
    :param strip_comments: Flag signaling whether to strip comments enclosed \
    in square brackets.
    :param kw: Keyword arguments are passed through to `Node.read`.
    :return: A list of symbolic expressions depending on `a`.
    """
    parsed = newick.read(fname, encoding, strip_comments, **kw)
    return [prob_tree(PhyloTree.newick_node_to_tree(node)) for node in parsed]
def main():
    """Extract the subtree under node 'n441' from each dataset tree, keep
    only the longest-branch leaf per species-pattern name, drop emptied
    internal nodes, and dump the result to '<file>_required_tree.txt'."""
    filename = ['OG0000002_tree.txt', ]
    for file in filename:
        # Read dataset
        trees = read('./dataset/{}'.format(file))
        node_needed = trees[0].get_node('n441').get_leaves()
        all_node = trees[0].get_leaves()  # NOTE(review): unused
        # inverse prune: keep only the leaves below n441, drop the rest
        trees[0].prune(node_needed, inverse=True)
        # Rename nodes
        trees[0].visit(clean_node)
        # Get leaf nodes of the pruned tree
        leaves = trees[0].get_leaves()
        # Record, per species-pattern name, the longest branch length seen.
        unique = {}
        for node in leaves:
            regex = r"[a-z]_[a-zA-Z]+_[a-zA-Z1-9]"
            if re.search(regex, node.name):
                node_new_name = re.search(regex, str(node.name)).group(0)
                if node_new_name in unique:
                    unique[node_new_name] = max(unique[node_new_name], node.length)
                else:
                    unique[node_new_name] = node.length
        # Prune to just keep the longest unique nodes
        for node in leaves:
            regex = r"[a-z]_[a-zA-Z]+_[a-zA-Z1-9]"
            if re.search(regex, node.name):
                node_new_name = re.search(regex, str(node.name)).group(0)
                if node.length != unique[node_new_name]:
                    # NOTE(review): the other call site below passes a *list*
                    # to prune_by_names; a bare string works only if the
                    # library accepts it -- verify.
                    trees[0].prune_by_names(node.name)
        # Remove the C. elegans reference sequence
        c_elegans_remove = ['c_elegans_ref_protein_PAR-1']
        trees[0].prune_by_names(c_elegans_remove)
        # Repeatedly remove internal nodes (named n<digits>) that have lost
        # all children, until a full postorder pass makes no change.
        # (walk() returns a generator, so the while-condition is always
        # truthy; termination relies solely on the break below.)
        while trees[0].walk(mode='postorder'):
            atleast_once = True
            for n in trees[0].walk(mode='postorder'):
                regex = r"^n[1-9]+"
                if n.ancestor and len(n.descendants) == 0 and re.search(regex, n.name):
                    trees[0].prune_by_names(n.name)
                    atleast_once = False
            if atleast_once == True:
                break
        # Dump the pruned tree
        with open('{}_required_tree.txt'.format(file), 'w') as fobj:
            dump(trees, fobj)
def parse_tree(fname):
    """Parse the first tree of a Newick file into a `Tree` rooted at id 1."""
    newick_trees = read(fname)
    result = Tree()
    result.create_node("root", 1)
    # Reserve ids 2 .. 2*n_leaves + 2 for the remaining nodes.
    slots = 2 * len(newick_trees[0].get_leaves()) + 1
    st_ids = list(range(2, slots + 2))
    parse_newick(newick_trees[0].descendants, result, 1, st_ids)
    return result
def run(self, dispatcher: CollectingDispatcher,
        tracker: Tracker,
        domain: Dict[Text, Any]) -> List[Dict[Text, Any]]:
    """Rasa action: answer (in Hindi) with languages related to the
    'language' entity extracted from the latest user message, using the
    language tree in data/tree.txt and the WALS language table."""
    trees = read("data/tree.txt", strip_comments=True)
    entities = list(tracker.get_latest_entity_values("language"))
    data_path = os.path.join("data", "cldf-datasets-wals-014143f",
                             "cldf", "languages.csv")
    wals_data = pd.read_csv(data_path)
    if len(entities) > 0:
        query_lang = entities.pop()
        # Translate the (Hindi) entity to English, then normalise it.
        query_lang_en = translator.translate(text=query_lang, lang_tgt='en')
        query_lang_en = query_lang_en.strip()
        query_lang_en = query_lang_en.lower()
        if len(query_lang_en.split(' ')) > 1:
            # Multi-word names: title-case each word and keep the variant
            # that actually occurs in the WALS "Name" column.
            # NOTE(review): IndexError if no variant matches -- verify.
            f = [x.capitalize() for x in query_lang_en.split(' ')]
            query_lang_en = list(
                set(f).intersection(set(wals_data["Name"])))[0]
        print(query_lang_en)
        # Collect every leaf whose name contains the query language.
        matched_leaves = []
        for i, node in enumerate(trees):
            s = node.get_leaves()
            r = [k for k in s if query_lang_en in k.name]
            matched_leaves.extend(r)
            # print(r,node,i)
        if len(matched_leaves) > 0:
            for i in matched_leaves:
                # Related languages = immediate cousins in the tree,
                # translated back to Hindi for the reply.
                anc = get_immediate_cousins(i)
                out_text = ','.join(anc)
                out_text = translator.translate(text=out_text, lang_tgt='hi')
                # print(out_text)
                dispatcher.utter_message(text="मिलती जुलती भाषा " + out_text)
        else:
            dispatcher.utter_message(text='क्षमा करें, मुझे समझ नहीं आया')
def sequence(trees, mp="", ml=""):
    """
    Add sequence for each node in a tree.

    @param trees: passed straight to ``newick.read``; the parsed result is a
        dictionary {id : [parent, name, offspring, support, length, level,
        sequence]}.  NOTE(review): the original docstring described *trees*
        itself as that dictionary -- presumably it is a Newick filename;
        verify against callers.
    @param mp: string, a string refer to a filename of MSA file (for MP
        method only), only contain aligned protein sequences for terminal
        nodes
    @param ml: string, a string refer to a filename of MSA file (for ML
        method only), contains aligned protein sequences both for all
        terminal nodes and internal nodes
    @return: dictionary, in the form of {id : [parent, name, offspring,
        support, length, level, sequence]} with the sequence slot filled in
        where a match was found
    """
    tree = newick.read(trees)
    mp_sequence = fasta.read(mp)
    ml_sequence = fasta.read(ml)
    if mp_sequence:
        # Terminal nodes (tips) are records with an empty offspring slot.
        tips = [v[1] for k, v in tree.items() if not v[2]]
        taxa = mp_sequence.keys()
        # Attach sequences only when every tip is present in the MSA.
        if not set(tips).difference(set(taxa)):
            for k, v in tree.items():
                if v[1] in tips:
                    v[-1] = mp_sequence[v[1]]
                    tree[k] = v
    elif ml_sequence:
        s = ml_sequence
        # Relabel the root's support slot to "N1" so it matches the ids
        # used for internal nodes in the ML alignment.
        support = [v for v in tree.values() if v[1] == "root"][0]
        ids = [k for k, v in tree.items() if v[1] == "root"][0]
        support[3] = "N1"
        tree[ids] = support
        for k, v in tree.items():
            # Internal nodes are looked up by support label (v[3]), tips
            # by name (v[1]).
            if v[3] in s:
                v[-1] = s[v[3]]
                tree[k] = v
            elif v[1] in s:
                v[-1] = s[v[1]]
                tree[k] = v
    return tree
def classifications_from_newick(string, label_pattern=GLOTTOLOG_NODE_LABEL):
    """
    Parse Glottolog-style classifications from Newick trees.

    :param string: argument handed straight to ``newick.read``.
    :param label_pattern: compiled regex with a required ``glottocode``
        named group and optional ``name``/``isocode`` groups, matched
        against every node label.
    :return: triple ``(classifications, nodemap, label2name)`` --
        classifications maps glottocodes (and isocodes, when present) to
        the node's ancestor chain as (name, glottocode) pairs; nodemap
        maps glottocodes to tree nodes; label2name maps raw labels to
        (name, glottocode).
    """
    label2name = {}

    def parse_label(label):
        # Split the label via the pattern's named groups, stripping
        # whitespace and normalising empty groups to ''.
        match = {
            k: v.strip() if v else ''
            for k, v in label_pattern.match(label).groupdict().items()
        }
        assert match['glottocode']
        label2name[label] = (
            match.get('name', '').strip().replace("\\'", "'"),
            match['glottocode'])
        return match

    def get_classification(node):
        ancestor = node.ancestor
        if not ancestor:
            # Node is root of some family
            return [label2name[node.name]]
        # Collect ancestors bottom-up, then reverse to get root-first order.
        res = []
        while ancestor:
            res.append(label2name[ancestor.name])
            ancestor = ancestor.ancestor
        return list(reversed(res))

    classifications, nodemap = {}, {}
    # Walk the tree and build the classifications dictionary
    trees = newick.read(string)
    for tree in trees:
        for node in tree.walk():
            label = parse_label(node.name)
            classification = get_classification(node)
            classifications[label['glottocode']] = classification
            if label.get('isocode'):
                classifications[label['isocode']] = classification
            nodemap[label['glottocode']] = node
    return classifications, nodemap, label2name
def load_glottolog_data(self):
    """
    Loads the Glottolog classification information from the appropriate
    newick file, parses it and stores the required datastructures in
    self.classifications, plus geographic metadata in
    self.glotto_macroareas and self.locations.
    """
    # Don't load if the analysis doesn't use it
    if not self.check_glottolog_required():
        return
    # Don't load if we already have - can this really happen?
    if self.glottolog_loaded:
        return
    self.glottolog_loaded = True

    label2name = {}
    glottocode2node = {}

    def parse_label(label):
        # Extract (name, glottocode, isocode) from a Glottolog node label,
        # caching (name, glottocode) by raw label for later ancestor lookups.
        match = GLOTTOLOG_NODE_LABEL.match(label)
        label2name[label] = (
            match.group('name').strip().replace("\\'", "'"),
            match.group('glottocode'))
        return (
            match.group('name').strip(),
            match.group('glottocode'),
            match.group('isocode'))

    def get_classification(node):
        # Ancestor chain of a node, root-first.
        res = []
        ancestor = node.ancestor
        while ancestor:
            res.append(label2name[ancestor.name])
            ancestor = ancestor.ancestor
        return list(reversed(res))

    # Walk the tree and build the classifications dictionary
    glottolog_trees = newick.read(
        get_glottolog_data('newick', self.glottolog_release))
    for tree in glottolog_trees:
        for node in tree.walk():
            name, glottocode, isocode = parse_label(node.name)
            classification = get_classification(node)
            self.classifications[glottocode] = classification
            if isocode:
                self.classifications[isocode] = classification
            glottocode2node[glottocode] = node

    # Load geographic metadata
    for t in reader(
            get_glottolog_data('geo', self.glottolog_release),
            namedtuples=True):
        if t.macroarea:
            self.glotto_macroareas[t.glottocode] = t.macroarea
            for isocode in t.isocodes.split():
                self.glotto_macroareas[isocode] = t.macroarea
        if self.location_data:
            continue  # Use user-supplied data instead
        if t.latitude and t.longitude:
            latlon = (float(t.latitude), float(t.longitude))
            self.locations[t.glottocode] = latlon
            for isocode in t.isocodes.split():
                self.locations[isocode] = latlon
    if self.location_data:
        return

    # Second pass of geographic data to handle dialects, which inherit
    # their parent language's location
    for t in reader(
            get_glottolog_data('geo', self.glottolog_release),
            namedtuples=True):
        if t.level == "dialect":
            failed = False
            node = glottocode2node[t.glottocode]
            # Climb the tree until an ancestor with a known location is found.
            ancestor = node.ancestor
            while label2name[ancestor.name][1] not in self.locations:
                if not ancestor.ancestor:
                    # We've hit the root without finding an ancestral node
                    # with location data!
                    failed = True
                    break
                else:
                    ancestor = ancestor.ancestor
            if failed:
                continue
            latlon = self.locations[label2name[ancestor.name][1]]
            self.locations[t.glottocode] = latlon
            for isocode in t.isocodes.split():
                self.locations[isocode] = latlon
# NOTE(review): this is Python 2 code -- ``from StringIO import StringIO``
# does not exist on Python 3.
import newick
from Bio import SeqIO
from StringIO import StringIO


def find_rev(t, dnas):
    # For every alignment column of node t's sequence, collect the tuples
    # (first_node, last_node, column_index, state_at_first_node) reported
    # by t.find_rev().
    # NOTE(review): assumes nodes expose a ``.u`` identifier usable as a
    # key into ``dnas`` -- confirm against the newick wrapper in use.
    r = []
    for i in range(len(dnas[t.u])):
        r += [(p[0], p[-1], i, dnas[p[0].u][i]) for p in t.find_rev(dnas, i)]
    return r


if __name__ == '__main__':
    with open('data/data.dat') as f:
        nw = f.readline()
        nw.split()  # NOTE(review): return value discarded -- this is a no-op
        tree = newick.read(StringIO(nw))
        fst = f.read()
    fst = StringIO(fst)
    # NOTE(review): SeqIO.parse returns an iterator of records; unpacking
    # it into exactly two names only succeeds for exactly two records --
    # verify this is the intent.
    dnas, _ = SeqIO.parse(fst, 'fasta')
    nodes = tree.nodes()
    for node in nodes:
        revs = find_rev(node, dnas)
        for fc, dest, pos, mid in revs:
            # Report: source id, destination id, 1-based column, and the
            # state path source -> intermediate -> destination.
            print("%s %s %d %s->%s->%s" % (
                fc.u, dest.u, pos + 1,
                dnas[node.u][pos], mid, dnas[dest.u][pos]))
            # A reversion means the start and end states agree.
            assert(dnas[node.u][pos] == dnas[dest.u][pos])
# sort nodes according to branch length lengths=[node.length for node in obj] pair_ind=np.triu_indices(len(touch_ind), 1) for k in range(len(pair_ind[0])): i,j=pair_ind[0][k],pair_ind[1][k] ll=[lengths[touch_ind[i]],lengths[touch_ind[j]]] tot=sum(ll) if tot<threshold: ind=ll.index(min(ll)) removed=True if ind==0: del obj[touch_ind[i]] else: del obj[touch_ind[j]] break def get_names(obj): if type(obj)==newick.Node: print obj.name def get_lengths(obj): if type(obj)==newick.Node: print obj.length if __name__ == "__main__": threshold = 0.001 tree = newick.read(sys.argv[1]) map_tree(tree,prune,threshold) map_tree(tree,prune,threshold) map_tree(tree,get_names) newick.write(tree,sys.argv[1]+'_pruned')
def read(newick_filename):
    """Parse a Newick file and wrap its first tree as a `Node`."""
    parsed_trees = newick.read(newick_filename)
    return Node.build_from_newick_object(parsed_trees[0])
def load_glottolog_data(self):
    """
    Loads the Glottolog classification information from the appropriate
    newick file, parses it and stores the required datastructures in
    self.classifications, plus geographic metadata in
    self.glotto_macroareas and self.locations.
    """
    # Don't load if the analysis doesn't use it
    if not self.check_glottolog_required():
        return
    # Don't load if we already have - can this really happen?
    if self.glottolog_loaded:
        return
    self.glottolog_loaded = True

    label2name = {}
    glottocode2node = {}

    def parse_label(label):
        # Extract (name, glottocode, isocode) from a Glottolog node label,
        # caching (name, glottocode) by raw label for later ancestor lookups.
        match = GLOTTOLOG_NODE_LABEL.match(label)
        label2name[label] = (match.group('name').strip().replace(
            "\\'", "'"), match.group('glottocode'))
        return (match.group('name').strip(), match.group('glottocode'),
                match.group('isocode'))

    def get_classification(node):
        # Ancestor chain of a node, root-first.
        res = []
        ancestor = node.ancestor
        while ancestor:
            res.append(label2name[ancestor.name])
            ancestor = ancestor.ancestor
        return list(reversed(res))

    # Walk the tree and build the classifications dictionary
    glottolog_trees = newick.read(
        get_glottolog_data('newick', self.glottolog_release))
    for tree in glottolog_trees:
        for node in tree.walk():
            name, glottocode, isocode = parse_label(node.name)
            classification = get_classification(node)
            self.classifications[glottocode] = classification
            if isocode:
                self.classifications[isocode] = classification
            glottocode2node[glottocode] = node

    # Load geographic metadata
    for t in reader(get_glottolog_data('geo', self.glottolog_release),
                    namedtuples=True):
        if t.macroarea:
            self.glotto_macroareas[t.glottocode] = t.macroarea
            for isocode in t.isocodes.split():
                self.glotto_macroareas[isocode] = t.macroarea
        if self.location_data:
            continue  # Use user-supplied data instead
        if t.latitude and t.longitude:
            latlon = (float(t.latitude), float(t.longitude))
            self.locations[t.glottocode] = latlon
            for isocode in t.isocodes.split():
                self.locations[isocode] = latlon
    if self.location_data:
        return

    # Second pass of geographic data to handle dialects, which inherit
    # their parent language's location
    for t in reader(get_glottolog_data('geo', self.glottolog_release),
                    namedtuples=True):
        if t.level == "dialect":
            failed = False
            node = glottocode2node[t.glottocode]
            # Climb the tree until an ancestor with a known location is found.
            ancestor = node.ancestor
            while label2name[ancestor.name][1] not in self.locations:
                if not ancestor.ancestor:
                    # We've hit the root without finding an ancestral node
                    # with location data!
                    failed = True
                    break
                else:
                    ancestor = ancestor.ancestor
            if failed:
                continue
            latlon = self.locations[label2name[ancestor.name][1]]
            self.locations[t.glottocode] = latlon
            for isocode in t.isocodes.split():
                self.locations[isocode] = latlon
def load_from_file(file_name: str):
    """Parse the given Newick file and return the result of `read`."""
    parsed = read(file_name)
    return parsed