def parse2tree(ifn, ldlim=None, shuffle=False):
    """Build, annotate and print a parse tree for each sentence in a file.

    The file is split on the literal marker ``'<sentence '``.  For every chunk
    after the preamble, the text between ``<parse>`` and ``</parse>`` is split
    on whitespace and turned into a ``networkx.DiGraph`` whose nodes are
    ``(token_index, label)`` tuples.  Whenever the current parent already has
    more than one child, an intermediate ``(str(index) + "b", 'NODE')`` node is
    inserted so that later children hang off that new node.  The graph is then
    passed through ``treeStress`` and ``tree2grid`` (both defined outside this
    block) and printed.

    Args:
        ifn: Path of the parsed-text file to read.
        ldlim: If truthy, stop once this many sentences have been processed.
        shuffle: If true, visit the sentences in random order.

    Returns:
        None.  All output goes to stdout.

    Raises:
        IndexError: If a sentence chunk has no ``<parse>`` section, or if a
            token closes more brackets than are currently open.
    """
    # Local import, as in the original: networkx is only needed here.
    import networkx as nx

    # Context manager so the handle is closed even if reading fails.
    with open(ifn) as f:
        text = f.read()

    # Drop the preamble (everything before the first '<sentence ') *before*
    # shuffling.  Shuffling the full list and then skipping index 0 could
    # discard a real sentence and try to parse the preamble instead.
    sents = text.split('<sentence ')[1:]
    if shuffle:
        import random
        random.shuffle(sents)

    for sentnum, sentence in enumerate(sents, 1):
        if ldlim and sentnum > ldlim:
            break

        G = nx.DiGraph()
        parse = sentence.split('<parse>')[1].split('</parse>')[0]
        # Single-argument print(...) behaves the same as the Python 2
        # print statement and is also valid Python 3.
        print(parse)

        pstack = []        # open tags, innermost last
        wordnodes = set()  # leaf nodes; a set gives O(1) membership below

        for pnumi, p in enumerate(parse.split()):
            pnop = p.replace('(', '').replace(')', '')
            # Tokens for which pytxt.noPunc returns something falsy are
            # skipped entirely (including any ')' they carry).
            if not pytxt.noPunc(pnop):
                continue
            pnode = (pnumi, pnop)

            ## lay first stone
            if not pstack:
                pstack.append(pnode)
                continue

            ## make sure maximally binary
            # NOTE(review): G.edge / G.node look like the networkx 1.x
            # accessors -- confirm against the installed networkx version.
            if len(G.edge):
                edges_already = sorted(G.edge[pstack[-1]].keys())
                if len(edges_already) > 1:
                    print(edges_already)
                    newnode = (str(pnumi) + "b", 'NODE')
                    G.add_edge(pstack[-1], newnode,
                               type='real', prom=None, weight=0)
                    # Keep the first child in place; move the rest under
                    # the new intermediate node.
                    for e in edges_already[1:]:
                        G.remove_edge(pstack[-1], e)
                        G.add_edge(newnode, e,
                                   type='real', prom=None, weight=0)
                    # The intermediate node replaces the parent on the stack.
                    pstack.pop()
                    pstack.append(newnode)

            G.add_edge(pstack[-1], pnode, weight=0, type='real', prom=None)

            if p.startswith('('):
                # is tag
                pstack.append(pnode)
            else:
                # is word: record it and close one tag per ')'
                wordnodes.add(pnode)
                for _ in range(p.count(')')):
                    pstack.pop()

        for node in G.nodes():
            if node in wordnodes:
                G.node[node]['type'] = 'word'
                G.node[node]['color'] = 'green'
            else:
                G.node[node]['type'] = 'nonword'

        G = treeStress(G)
        G = tree2grid(G)
        print(G)
def parse2tree(ifn, ldlim=None, shuffle=False):
    """Build, annotate and print a parse tree for each sentence in a file.

    The file is split on the literal marker ``'<sentence '``.  For every chunk
    after the preamble, the text between ``<parse>`` and ``</parse>`` is split
    on whitespace and turned into a ``networkx.DiGraph`` whose nodes are
    ``(token_index, label)`` tuples.  Whenever the current parent already has
    more than one child, an intermediate ``(str(index) + "b", 'NODE')`` node is
    inserted so that later children hang off that new node.  The graph is then
    passed through ``treeStress`` and ``tree2grid`` (both defined outside this
    block) and printed.

    Args:
        ifn: Path of the parsed-text file to read.
        ldlim: If truthy, stop once this many sentences have been processed.
        shuffle: If true, visit the sentences in random order.

    Returns:
        None.  All output goes to stdout.

    Raises:
        IndexError: If a sentence chunk has no ``<parse>`` section, or if a
            token closes more brackets than are currently open.
    """
    # Local import, as in the original: networkx is only needed here.
    import networkx as nx

    # Context manager so the handle is closed even if reading fails.
    with open(ifn) as f:
        text = f.read()

    # Drop the preamble (everything before the first '<sentence ') *before*
    # shuffling.  Shuffling the full list and then skipping index 0 could
    # discard a real sentence and try to parse the preamble instead.
    sents = text.split('<sentence ')[1:]
    if shuffle:
        import random
        random.shuffle(sents)

    for sentnum, sentence in enumerate(sents, 1):
        if ldlim and sentnum > ldlim:
            break

        G = nx.DiGraph()
        parse = sentence.split('<parse>')[1].split('</parse>')[0]
        # Single-argument print(...) behaves the same as the Python 2
        # print statement and is also valid Python 3.
        print(parse)

        pstack = []        # open tags, innermost last
        wordnodes = set()  # leaf nodes; a set gives O(1) membership below

        for pnumi, p in enumerate(parse.split()):
            pnop = p.replace('(', '').replace(')', '')
            # Tokens for which pytxt.noPunc returns something falsy are
            # skipped entirely (including any ')' they carry).
            if not pytxt.noPunc(pnop):
                continue
            pnode = (pnumi, pnop)

            ## lay first stone
            if not pstack:
                pstack.append(pnode)
                continue

            ## make sure maximally binary
            # NOTE(review): G.edge / G.node look like the networkx 1.x
            # accessors -- confirm against the installed networkx version.
            if len(G.edge):
                edges_already = sorted(G.edge[pstack[-1]].keys())
                if len(edges_already) > 1:
                    print(edges_already)
                    newnode = (str(pnumi) + "b", 'NODE')
                    G.add_edge(pstack[-1], newnode,
                               type='real', prom=None, weight=0)
                    # Keep the first child in place; move the rest under
                    # the new intermediate node.
                    for e in edges_already[1:]:
                        G.remove_edge(pstack[-1], e)
                        G.add_edge(newnode, e,
                                   type='real', prom=None, weight=0)
                    # The intermediate node replaces the parent on the stack.
                    pstack.pop()
                    pstack.append(newnode)

            G.add_edge(pstack[-1], pnode, weight=0, type='real', prom=None)

            if p.startswith('('):
                # is tag
                pstack.append(pnode)
            else:
                # is word: record it and close one tag per ')'
                wordnodes.add(pnode)
                for _ in range(p.count(')')):
                    pstack.pop()

        for node in G.nodes():
            if node in wordnodes:
                G.node[node]['type'] = 'word'
                G.node[node]['color'] = 'green'
            else:
                G.node[node]['type'] = 'nonword'

        G = treeStress(G)
        G = tree2grid(G)
        print(G)
def parse2phrase(ifn, phrasetype='NP', embedlimit=2, shuffle=False,
                 ldlim=None):
    """Collect the words of each ``phrasetype`` constituent in a parsed file.

    The file is split on the literal marker ``'<sentence '``.  For every chunk
    after the preamble, the bracketed text between ``<parse>`` and
    ``</parse>`` is scanned token by token.  A token equal to ``phrasetype``
    starts a fresh phrase; words are added to it while the current embedding
    depth is at most ``embedlimit``.  The phrase is stored when the last
    bracket closed by a word closes a ``phrasetype`` tag and that word closes
    no more than ``embedlimit`` brackets.

    Args:
        ifn: Path of the parsed-text file to read.
        phrasetype: Tag label of the constituents to extract.
        embedlimit: Maximum embedding depth at which words are still
            collected; also the maximum number of brackets the closing word
            may close for the phrase to be stored.
        shuffle: If true, visit the sentences in random order.
        ldlim: If truthy, stop once this many sentences have been processed.

    Returns:
        A list of phrases.  Each phrase is a list of dicts with the keys
        ``'word'`` (lower-cased), ``'paren_num'``, ``'word_num'``,
        ``'word_sentlen'`` and ``'paren_sentlen'``.

    Raises:
        IndexError: If a sentence chunk has no ``<parse>`` section.
    """
    # Context manager so the handle is closed even if reading fails.
    with open(ifn) as f:
        text = f.read()

    # Drop the preamble (everything before the first '<sentence ') *before*
    # shuffling.  Shuffling the full list and then skipping index 0 could
    # discard a real sentence and try to parse the preamble instead.
    sents = text.split('<sentence ')[1:]
    if shuffle:
        import random
        random.shuffle(sents)

    phrases = []
    # Deliberately initialised once, outside the sentence loop, as in the
    # original: an unfinished phrase carries over into the next sentence.
    phrase = []

    for sentnum, sentence in enumerate(sents, 1):
        if ldlim and sentnum > ldlim:
            break

        parse = sentence.split('<parse>')[1].split('</parse>')[0]
        # Number of '<word>' elements in the chunk.
        sentlen_word = sentence.count('<word>')
        pdat = parse.split()
        sentlen_paren = len(pdat)

        wordi = 0       # words seen so far (skipped tokens excluded)
        pstack = []     # open tags as (token_index, label), innermost last
        embedlevel = 0  # depth inside the current phrase; None = not collecting

        for pnumi, p in enumerate(pdat):
            pnop = p.replace('(', '').replace(')', '')
            # Tokens for which pytxt.noPunc returns something falsy are
            # skipped entirely (including any ')' they carry).
            if not pytxt.noPunc(pnop):
                continue

            if pnop == phrasetype:
                embedlevel = 0
                phrase = []

            if p.startswith('('):
                # is tag
                pstack.append((pnumi, pnop))
                if embedlevel is not None:
                    embedlevel += 1
                continue

            # is word
            word = p.replace(')', '').lower()
            wordi += 1
            if embedlevel is not None and embedlevel <= embedlimit:
                phrase.append({
                    'word': word,
                    'paren_num': pnumi + 1,
                    # NOTE(review): wordi is already incremented, so the
                    # first word gets word_num 2 -- looks off by one, but
                    # kept unchanged for existing callers.
                    'word_num': wordi + 1,
                    'word_sentlen': sentlen_word,
                    'paren_sentlen': sentlen_paren,
                })

            ## go through tags in stack according to the number of tags
            ## which closed
            num_closing_paren = p.count(')')
            for i in range(num_closing_paren):
                if embedlevel is not None:
                    embedlevel -= 1
                if not pstack:
                    # Nothing left to close; never consult a stale tag.
                    continue
                pt = pstack.pop()
                if (i == num_closing_paren - 1
                        and num_closing_paren <= embedlimit
                        and pt[1] == phrasetype):
                    phrases.append(phrase)
                    phrase = []
                    embedlevel = None

    return phrases
def parse2phrase(ifn, phrasetype='NP', embedlimit=2, shuffle=False,
                 ldlim=None):
    """Collect the words of each ``phrasetype`` constituent in a parsed file.

    The file is split on the literal marker ``'<sentence '``.  For every chunk
    after the preamble, the bracketed text between ``<parse>`` and
    ``</parse>`` is scanned token by token.  A token equal to ``phrasetype``
    starts a fresh phrase; words are added to it while the current embedding
    depth is at most ``embedlimit``.  The phrase is stored when the last
    bracket closed by a word closes a ``phrasetype`` tag and that word closes
    no more than ``embedlimit`` brackets.

    Args:
        ifn: Path of the parsed-text file to read.
        phrasetype: Tag label of the constituents to extract.
        embedlimit: Maximum embedding depth at which words are still
            collected; also the maximum number of brackets the closing word
            may close for the phrase to be stored.
        shuffle: If true, visit the sentences in random order.
        ldlim: If truthy, stop once this many sentences have been processed.

    Returns:
        A list of phrases.  Each phrase is a list of dicts with the keys
        ``'word'`` (lower-cased), ``'paren_num'``, ``'word_num'``,
        ``'word_sentlen'`` and ``'paren_sentlen'``.

    Raises:
        IndexError: If a sentence chunk has no ``<parse>`` section.
    """
    # Context manager so the handle is closed even if reading fails.
    with open(ifn) as f:
        text = f.read()

    # Drop the preamble (everything before the first '<sentence ') *before*
    # shuffling.  Shuffling the full list and then skipping index 0 could
    # discard a real sentence and try to parse the preamble instead.
    sents = text.split('<sentence ')[1:]
    if shuffle:
        import random
        random.shuffle(sents)

    phrases = []
    # Deliberately initialised once, outside the sentence loop, as in the
    # original: an unfinished phrase carries over into the next sentence.
    phrase = []

    for sentnum, sentence in enumerate(sents, 1):
        if ldlim and sentnum > ldlim:
            break

        parse = sentence.split('<parse>')[1].split('</parse>')[0]
        # Number of '<word>' elements in the chunk.
        sentlen_word = sentence.count('<word>')
        pdat = parse.split()
        sentlen_paren = len(pdat)

        wordi = 0       # words seen so far (skipped tokens excluded)
        pstack = []     # open tags as (token_index, label), innermost last
        embedlevel = 0  # depth inside the current phrase; None = not collecting

        for pnumi, p in enumerate(pdat):
            pnop = p.replace('(', '').replace(')', '')
            # Tokens for which pytxt.noPunc returns something falsy are
            # skipped entirely (including any ')' they carry).
            if not pytxt.noPunc(pnop):
                continue

            if pnop == phrasetype:
                embedlevel = 0
                phrase = []

            if p.startswith('('):
                # is tag
                pstack.append((pnumi, pnop))
                if embedlevel is not None:
                    embedlevel += 1
                continue

            # is word
            word = p.replace(')', '').lower()
            wordi += 1
            if embedlevel is not None and embedlevel <= embedlimit:
                phrase.append({
                    'word': word,
                    'paren_num': pnumi + 1,
                    # NOTE(review): wordi is already incremented, so the
                    # first word gets word_num 2 -- looks off by one, but
                    # kept unchanged for existing callers.
                    'word_num': wordi + 1,
                    'word_sentlen': sentlen_word,
                    'paren_sentlen': sentlen_paren,
                })

            ## go through tags in stack according to the number of tags
            ## which closed
            num_closing_paren = p.count(')')
            for i in range(num_closing_paren):
                if embedlevel is not None:
                    embedlevel -= 1
                if not pstack:
                    # Nothing left to close; never consult a stale tag.
                    continue
                pt = pstack.pop()
                if (i == num_closing_paren - 1
                        and num_closing_paren <= embedlimit
                        and pt[1] == phrasetype):
                    phrases.append(phrase)
                    phrase = []
                    embedlevel = None

    return phrases