def run(grammar, depth, size=1000000):
    """Grow a random derivation tree from *grammar* until it holds *size* nodes.

    Each round expands a random subset of the expandable leaves (bounded by
    *depth*); when no such leaves remain it falls back to the grammar's skip
    productions, and when neither is available it stops early.  Finally every
    still-open leaf is grounded with a terminal production.
    """
    assert isinstance(grammar, CFG)
    assert isinstance(depth, int)
    assert isinstance(size, int)

    skip_prods = grammar.skip_prods()
    tree = Tree(grammar.start, grammar.indent, grammar.dedent)

    def _pick(nodes):
        # Expand roughly up to half of the candidates once there are more
        # than five of them; otherwise expand exactly one.
        count = randint(1, int(len(nodes) / 2)) if len(nodes) > 5 else 1
        return [nodes[i] for i in sample(range(len(nodes)), count)]

    while tree.size < size:
        expandable = tree.leaves(depth)
        if expandable:
            for node in _pick(expandable):
                tree.insert(node, choice(grammar.symbol_prods(node.symbol)))
            continue
        skip_nodes = tree.skips
        if not (skip_prods and skip_nodes):
            break  # nothing left to expand at all
        for node in _pick(skip_nodes):
            # Only skip productions that actually mention this symbol on
            # their right-hand side are valid replacements.
            candidates = [p for p in skip_prods[node.symbol] if node.symbol in p.rhs]
            tree.insert(node, choice(candidates))

    # Ground every remaining open leaf so the derivation is complete.
    for node in tree.leaves()[:]:
        tree.insert(node, choice(grammar.symbol_ground(node.symbol)))
    return tree
def convert_penn_to_knaf_with_numtokens(tree_str, term_ids, lemma_for_termid, off_t=0, off_nt=0, off_edge=0):
    """Convert a Penn-style bracketed tree into a KafNafParserPy Ctree.

    Each leaf of *tree_str* is formatted "NUM#token"; NUM indexes into
    *term_ids* to recover the term identifier for that leaf.

    Parameters:
        tree_str          -- Penn bracketed tree string.
        term_ids          -- list of term ids, indexed by the leaf number.
        lemma_for_termid  -- dict mapping term id -> lemma (for edge comments).
        off_t/off_nt/off_edge -- id offsets so several sentences share one
                                 numbering space.

    Returns (Ctree, cnt_t, cnt_nt, cnt_edge), the counters being the updated
    offsets after this sentence.
    """
    # visit_node() fills these module-level accumulators while walking the tree.
    global list_t, list_nt, list_edge, cnt_t, cnt_nt, cnt_edge
    list_t = []
    list_nt = []
    list_edge = []
    cnt_t = off_t
    cnt_nt = off_nt
    cnt_edge = off_edge

    this_tree = Tree(tree_str)

    # Replace every leaf "num#token" with the term id it points to.
    for num, num_token_and_token in enumerate(this_tree.leaves()):
        p = num_token_and_token.find('#')
        num_token = int(num_token_and_token[:p])
        position = this_tree.leaf_treeposition(num)
        this_tree[position] = term_ids[num_token]

    ## Create the ROOT (disabled: the NAF tree layer needs no extra root)
    create_extra_root = False
    nt_id = None
    if create_extra_root:
        nt_id = 'nter' + str(cnt_nt)
        cnt_nt += 1
        list_nt.append((nt_id, 'ROOT'))
    visit_node(this_tree, nt_id)

    this_tree = Ctree()
    nonter_heads = set()

    # Non-terminals: a trailing "=H" on a label marks the head child.
    labels_for_nt = {}
    for nt_id, label in list_nt:
        if len(label) >= 2 and label[-1] == 'H' and label[-2] == '=':
            nonter_heads.add(nt_id)
            label = label[:-2]
        nt_obj = Cnonterminal()
        nt_obj.set_id(nt_id)
        nt_obj.set_label(label)
        labels_for_nt[nt_id] = label
        this_tree.append_element(nt_obj)

    ## Terminals
    lemma_for_ter = {}
    for ter_id, span_ids in list_t:
        ter_obj = Cterminal()
        ter_obj.set_id(ter_id)
        this_span = Cspan()
        # FIX: the original rebound the *term_ids* parameter here, clobbering
        # it on every iteration; use a distinct local name.
        span_term_ids = span_ids.split(' ')
        this_span.create_from_ids(span_term_ids)
        ter_obj.set_span(this_span)
        # Lemma of the LAST target in the span; .get(...) keeps this
        # consistent with convert_penn_to_kaf and avoids a KeyError when the
        # term layer is missing a lemma.
        lemma_for_ter[ter_id] = lemma_for_termid.get(span_term_ids[-1], 'unknown')
        this_tree.append_element(ter_obj)

    ## Edges
    # (edges are stored as (edge_id, node_from, node_to) — child -> parent)
    for edge_id, node_from, node_to in list_edge:
        edge_obj = Cedge()
        edge_obj.set_id(edge_id)
        edge_obj.set_from(node_from)
        edge_obj.set_to(node_to)
        # Human-readable comment: "<edge id> <parent label> <- <child label>".
        label_to = labels_for_nt.get(node_to)        # parent: a non-terminal
        label_from = labels_for_nt.get(node_from)    # child: non-terminal or terminal
        if label_from is None:
            label_from = lemma_for_ter.get(node_from, 'kk')
        comment = ' ' + (edge_id) + ' ' + (label_to) + ' <- ' + (label_from) + ' '
        comment = comment.replace('--', '-')  # '--' is illegal inside XML comments
        if node_from in nonter_heads:
            edge_obj.set_as_head()
        edge_obj.set_comment(comment)
        this_tree.append_element(edge_obj)

    return this_tree, cnt_t, cnt_nt, cnt_edge
def convert_penn_to_kaf_with_numtokens(tree_str, term_ids, logging, lemma_for_termid, off_t=0, off_nt=0, off_edge=0):
    """Convert a Penn-style bracketed tree into a KAF <tree> XML element.

    Each leaf of *tree_str* is formatted "NUM#token"; NUM indexes into
    *term_ids* to recover the term identifier for that leaf.

    Parameters:
        tree_str          -- Penn bracketed tree string.
        term_ids          -- list of term ids, indexed by the leaf number.
        logging           -- logger used for debug tracing.
        lemma_for_termid  -- dict mapping term id -> lemma (for edge comments).
        off_t/off_nt/off_edge -- id offsets so several sentences share one
                                 numbering space.

    Returns (tree_element, cnt_t, cnt_nt, cnt_edge), the counters being the
    updated offsets after this sentence.
    """
    # visit_node() fills these module-level accumulators while walking the tree.
    global list_t, list_nt, list_edge, cnt_t, cnt_nt, cnt_edge
    list_t = []
    list_nt = []
    list_edge = []
    cnt_t = off_t
    cnt_nt = off_nt
    cnt_edge = off_edge

    this_tree = Tree(tree_str)
    logging.debug('\n' + str(this_tree))  ##It has been already encoded using UTF8

    # Replace every leaf "num#token" with the term id it points to.
    for num, num_token_and_token in enumerate(this_tree.leaves()):
        p = num_token_and_token.find('#')
        num_token = int(num_token_and_token[:p])
        position = this_tree.leaf_treeposition(num)
        token_id = term_ids[num_token]
        this_tree[position] = token_id
        # FIX: .get(token_id, '') — the original called .encode() on the
        # result of a bare .get(), which is None (AttributeError) whenever the
        # term layer has no lemma for this id.
        logging.debug('Matching ' + num_token_and_token + ' with term id=' + token_id + ' according to KAF lemma=' + str(lemma_for_termid.get(token_id, '').encode('utf-8')))

    ## Create the ROOT (disabled: the KAF tree layer needs no extra root)
    create_extra_root = False
    nt_id = None
    if create_extra_root:
        nt_id = 'nter' + str(cnt_nt)
        cnt_nt += 1
        list_nt.append((nt_id, 'ROOT'))
    visit_node(this_tree, nt_id)

    root = etree.Element('tree')
    nonter_heads = set()

    # Non-terminals: a trailing "=H" on a label marks the head child.
    labels_for_nt = {}
    for nt_id, label in list_nt:
        if len(label) >= 2 and label[-1] == 'H' and label[-2] == '=':
            nonter_heads.add(nt_id)
            label = label[:-2]
        ele = etree.Element('nt', attrib={'id': nt_id, 'label': label})
        labels_for_nt[nt_id] = label
        root.append(ele)

    ## Terminals
    lemma_for_ter = {}
    for ter_id, span_ids in list_t:
        ele = etree.Element('t', attrib={'id': ter_id})
        span = etree.Element('span')
        ele.append(span)
        for termid in span_ids.split(' '):
            target = etree.Element('target', attrib={'id': termid})
            span.append(target)
        # Lemma of the LAST target in the span; .get(...) keeps this
        # consistent with convert_penn_to_kaf and avoids a KeyError when the
        # term layer is missing a lemma.
        lemma_for_ter[ter_id] = lemma_for_termid.get(termid, 'unknown')
        root.append(ele)

    ## Edges
    # (edges are stored as (edge_id, node_from, node_to) — child -> parent)
    for edge_id, node_from, node_to in list_edge:
        ele = etree.Element('edge', attrib={'id': edge_id, 'from': node_from, 'to': node_to})
        # Human-readable comment: "<edge id> <parent label> <- <child label>".
        label_to = labels_for_nt.get(node_to)        # parent: a non-terminal
        label_from = labels_for_nt.get(node_from)    # child: non-terminal or terminal
        if label_from is None:
            label_from = lemma_for_ter.get(node_from, 'kk')
        comment = ' ' + (edge_id) + ' ' + (label_to) + ' <- ' + (label_from) + ' '
        comment = comment.replace('--', '-')  # '--' is illegal inside XML comments
        if node_from in nonter_heads:
            ele.set('head', 'yes')
        root.append(etree.Comment(comment))
        root.append(ele)

    return root, cnt_t, cnt_nt, cnt_edge
from tree import Tree

# Tree tests complete

# Build a small tree rooted at node 0:
#        0
#       / \
#    a(4)  b(5)
#    /  \
# c(6)  d(7)
#         |
#        e(8)
t = Tree(3)
print(t)
a = t.addNode(0, 4)
b = t.addNode(0, 5)
c = t.addNode(a, 6)
d = t.addNode(a, 7)
e = t.addNode(d, 8)
print(t)

# Walk the root-to-node paths and print each visited node's data.
print("Path to e:")
p = t.pathToNode(e)
for node in p:
    print("\t", t.getData(node))

print("Path to c")
p = t.pathToNode(c)
for node in p:
    print("\t", t.getData(node))

# Finally dump the data stored at every leaf.
print("Leaves:")
l = t.leaves()
for leaf in l:
    print(t.getData(leaf))
def convert_penn_to_kaf(tree_str, term_ids, logging, lemma_for_termid, off_t, off_nt, off_edge):
    """Convert a Penn-style bracketed tree into a KAF <tree> XML element.

    Unlike the *_with_numtokens variants, the leaves here are plain tokens and
    leaf number *num* maps directly to term_ids[num].

    Parameters:
        tree_str          -- Penn bracketed tree string.
        term_ids          -- list of term ids, indexed by leaf position.
        logging           -- logger used for debug tracing.
        lemma_for_termid  -- dict mapping term id -> lemma (for edge comments).
        off_t/off_nt/off_edge -- id offsets so several sentences share one
                                 numbering space.

    Returns (tree_element, cnt_t, cnt_nt, cnt_edge), the counters being the
    updated offsets after this sentence.
    """
    # visit_node() fills these module-level accumulators while walking the tree.
    global list_t, list_nt, list_edge, cnt_t, cnt_nt, cnt_edge
    list_t = []
    list_nt = []
    list_edge = []
    cnt_t = off_t
    cnt_nt = off_nt
    cnt_edge = off_edge

    this_tree = Tree(tree_str)
    logging.debug("\n" + str(this_tree))

    # Replace every leaf token with the term id at the same position.
    for num, token in enumerate(this_tree.leaves()):
        position = this_tree.leaf_treeposition(num)
        token_id = term_ids[num]
        this_tree[position] = token_id
        # FIX: .get(token_id, "") — the original called .encode() on the
        # result of a bare .get(), which is None (AttributeError) whenever the
        # term layer has no lemma for this id.
        logging.debug(
            "Matching " + token + " with term id=" + token_id
            + " which according to KAF lemma="
            + str(lemma_for_termid.get(token_id, "").encode("utf-8"))
        )

    ##Creat the ROOT (disabled: the KAF tree layer needs no extra root)
    create_extra_root = False
    nt_id = None
    if create_extra_root:
        nt_id = "nter" + str(cnt_nt)
        cnt_nt += 1
        list_nt.append((nt_id, "ROOT"))
    visit_node(this_tree, nt_id)

    root = etree.Element("tree")
    nonter_heads = set()

    # Non-terminals: a trailing "=H" on a label marks the head child.
    labels_for_nt = {}
    for nt_id, label in list_nt:
        if len(label) >= 2 and label[-1] == "H" and label[-2] == "=":
            nonter_heads.add(nt_id)
            label = label[:-2]
        ele = etree.Element("nt", attrib={"id": nt_id, "label": label})
        labels_for_nt[nt_id] = label
        root.append(ele)

    ## Terminals
    lemma_for_ter = {}
    for ter_id, span_ids in list_t:
        ele = etree.Element("t", attrib={"id": ter_id})
        span = etree.Element("span")
        ele.append(span)
        for termid in span_ids.split(" "):
            target = etree.Element("target", attrib={"id": termid})
            span.append(target)
        # Lemma of the LAST target in the span (last value of the loop above).
        lemma_for_ter[ter_id] = lemma_for_termid.get(termid, "unknown")
        root.append(ele)

    ##Edges
    # (edges are stored as (edge_id, node_from, node_to) — child -> parent)
    for edge_id, node_from, node_to in list_edge:
        ele = etree.Element("edge", attrib={"id": edge_id, "from": node_from, "to": node_to})
        # Human-readable comment: "<edge id> <parent label> <- <child label>".
        label_to = labels_for_nt.get(node_to)        # parent: a non-terminal
        label_from = labels_for_nt.get(node_from)    # child: non-terminal or terminal
        if label_from is None:
            label_from = lemma_for_ter.get(node_from, "kk")
        comment = " " + (edge_id) + " " + (label_to) + " <- " + (label_from) + " "
        # FIX: both sibling converters strip "--", which is illegal inside XML
        # comments (lxml's etree.Comment rejects it); this one was missing it.
        comment = comment.replace("--", "-")
        if node_from in nonter_heads:
            ele.set("head", "yes")
        root.append(etree.Comment(comment))
        root.append(ele)

    return root, cnt_t, cnt_nt, cnt_edge
def build_haplexD(trainingHap, trainingExp, testHap, k, n_clusters):
    """Build a haplotype/expression tree and classify the test haplotypes.

    Parameters:
        trainingHap -- training haplotype data used to build the tree.
        trainingExp -- matching expression data.
        testHap     -- haplotypes to classify.
        k           -- number of neighbours/partitions passed to classify().
        n_clusters  -- number of clusters passed to classify().

    Returns the predictions from classify() using the chi2 metric.
    """
    t = Tree(trainingHap, trainingExp)
    nodes = t.leaves()
    # FIX: the original wrote classify(nodes, k, metric="chi2", testHap,
    # n_clusters), which is a SyntaxError (positional args after a keyword
    # arg); keyword arguments must come last.
    # NOTE(review): assumes classify's positional order is
    # (nodes, k, testHap, n_clusters) — confirm against its definition.
    predicted = classify(nodes, k, testHap, n_clusters, metric="chi2")
    return predicted