Example #1
0
def parse2tree(ifn,ldlim=None,shuffle=False):
	"""Print a binarised parse graph for every sentence of a parsed file.

	The file is split on the literal ``<sentence `` marker. For each sentence
	chunk the text between ``<parse>`` and ``</parse>`` is printed and its
	whitespace-separated tokens are turned into a ``networkx.DiGraph``:

	* nodes are ``(token_index, label)`` tuples, where ``label`` is the token
	  with all parentheses removed;
	* a token starting with ``(`` is treated as a tag and pushed on a stack,
	  any other token is a word and pops one stack entry per ``)`` it carries;
	* before attaching a new child, if the node on top of the stack already
	  has more than one child, all but the first (in sorted order) are moved
	  under a synthetic ``(str(token_index)+"b", 'NODE')`` node, which then
	  replaces the stack top (the "maximally binary" step).

	Every node gets a ``type`` attribute (``'word'`` / ``'nonword'``); word
	nodes also get ``color='green'``. The graph is then passed through
	``treeStress`` and ``tree2grid`` (both defined elsewhere) and the result
	is printed.

	NOTE(review): the input looks like CoreNLP-style XML with bracketed
	parses -- confirm against the producer of these files.

	Args:
		ifn: path of the file to read.
		ldlim: maximum number of sentences to process; falsy means no limit.
		shuffle: if true, process the sentences in random order.

	Returns:
		None. All output goes to stdout.

	Raises:
		IndexError: a sentence chunk has no ``<parse>`` element, or a word
			carries more ``)`` than there are tags on the stack.
		KeyError: the node on top of the stack is not in the graph (can
			happen on malformed parses that start a second root).
	"""
	# 'with' guarantees the handle is closed even if read() fails
	with open(ifn) as f:
		t=f.read()

	# The chunk before the first '<sentence ' marker is preamble, not a
	# sentence. Drop it *before* shuffling: shuffling first and slicing
	# afterwards would discard a random sentence and could process the preamble.
	sents=t.split('<sentence ')[1:]

	if shuffle:
		import random
		random.shuffle(sents)

	import networkx as nx

	for sentnum,sentence in enumerate(sents,1):
		if ldlim and sentnum>ldlim: break
		parse=sentence.split('<parse>')[1].split('</parse>')[0]

		# print(x) with a single argument behaves the same on Python 2 and 3
		print(parse)

		G=nx.DiGraph()
		pstack=[]
		wordnodes=set()
		for pnumi,p in enumerate(parse.split()):
			pnop=p.replace('(','').replace(')','')
			# skip tokens for which pytxt.noPunc() returns something falsy
			# (presumably punctuation-only tokens -- helper not visible here)
			if not pytxt.noPunc(pnop): continue
			pnode=(pnumi,pnop)

			## lay first stone
			if not pstack:
				pstack.append(pnode)
				continue

			## make sure maximally binary
			# the graph has no nodes until the first edge has been added;
			# G[n] / len(G) are valid in both networkx 1.x and 2.x+
			if len(G):
				edges_already=sorted(G[pstack[-1]])

				if len(edges_already)>1:
					print(edges_already)

					newnode=(str(pnumi)+"b",'NODE')
					G.add_edge(pstack[-1],newnode,type='real',prom=None,weight=0)
					for e in edges_already[1:]:
						G.remove_edge(pstack[-1],e)
						G.add_edge(newnode,e,type='real',prom=None,weight=0)
					# the synthetic node takes over as current parent
					pstack.pop()
					pstack.append(newnode)

			G.add_edge(pstack[-1],pnode,weight=0,type='real',prom=None)

			if p.startswith('('):		# is tag
				pstack.append(pnode)
			else:						# is word
				wordnodes.add(pnode)
				## one stack entry is closed per closing paren on the word
				for _ in range(p.count(')')):
					pstack.pop()

		# add_node() on an existing node only updates its attributes;
		# list(G) snapshots the nodes so the graph is not iterated while touched
		for node in list(G):
			if node in wordnodes:
				G.add_node(node,type='word',color='green')
			else:
				G.add_node(node,type='nonword')

		G=treeStress(G)
		G=tree2grid(G)
		print(G)

	return None
Example #2
0
def parse2tree(ifn, ldlim=None, shuffle=False):
    """Print a binarised parse graph for every sentence of a parsed file.

    The file is split on the literal ``<sentence `` marker. For each sentence
    chunk the text between ``<parse>`` and ``</parse>`` is printed and its
    whitespace-separated tokens are turned into a ``networkx.DiGraph``:

    * nodes are ``(token_index, label)`` tuples, where ``label`` is the token
      with all parentheses removed;
    * a token starting with ``(`` is treated as a tag and pushed on a stack,
      any other token is a word and pops one stack entry per ``)`` it carries;
    * before attaching a new child, if the node on top of the stack already
      has more than one child, all but the first (in sorted order) are moved
      under a synthetic ``(str(token_index) + "b", 'NODE')`` node, which then
      replaces the stack top (the "maximally binary" step).

    Every node gets a ``type`` attribute (``'word'`` / ``'nonword'``); word
    nodes also get ``color='green'``. The graph is then passed through
    ``treeStress`` and ``tree2grid`` (both defined elsewhere) and the result
    is printed.

    NOTE(review): the input looks like CoreNLP-style XML with bracketed
    parses -- confirm against the producer of these files.

    Args:
        ifn: path of the file to read.
        ldlim: maximum number of sentences to process; falsy means no limit.
        shuffle: if true, process the sentences in random order.

    Returns:
        None. All output goes to stdout.

    Raises:
        IndexError: a sentence chunk has no ``<parse>`` element, or a word
            carries more ``)`` than there are tags on the stack.
        KeyError: the node on top of the stack is not in the graph (can
            happen on malformed parses that start a second root).
    """
    # 'with' guarantees the handle is closed even if read() fails
    with open(ifn) as f:
        t = f.read()

    # The chunk before the first '<sentence ' marker is preamble, not a
    # sentence. Drop it *before* shuffling: shuffling first and slicing
    # afterwards would discard a random sentence and could process the preamble.
    sents = t.split('<sentence ')[1:]

    if shuffle:
        import random
        random.shuffle(sents)

    import networkx as nx

    for sentnum, sentence in enumerate(sents, 1):
        if ldlim and sentnum > ldlim: break
        parse = sentence.split('<parse>')[1].split('</parse>')[0]

        # print(x) with a single argument behaves the same on Python 2 and 3
        print(parse)

        G = nx.DiGraph()
        pstack = []
        wordnodes = set()
        for pnumi, p in enumerate(parse.split()):
            pnop = p.replace('(', '').replace(')', '')
            # skip tokens for which pytxt.noPunc() returns something falsy
            # (presumably punctuation-only tokens -- helper not visible here)
            if not pytxt.noPunc(pnop): continue
            pnode = (pnumi, pnop)

            ## lay first stone
            if not pstack:
                pstack.append(pnode)
                continue

            ## make sure maximally binary
            # the graph has no nodes until the first edge has been added;
            # G[n] / len(G) are valid in both networkx 1.x and 2.x+
            if len(G):
                edges_already = sorted(G[pstack[-1]])

                if len(edges_already) > 1:
                    print(edges_already)

                    newnode = (str(pnumi) + "b", 'NODE')
                    G.add_edge(pstack[-1],
                               newnode,
                               type='real',
                               prom=None,
                               weight=0)
                    for e in edges_already[1:]:
                        G.remove_edge(pstack[-1], e)
                        G.add_edge(newnode,
                                   e,
                                   type='real',
                                   prom=None,
                                   weight=0)
                    # the synthetic node takes over as current parent
                    pstack.pop()
                    pstack.append(newnode)

            G.add_edge(pstack[-1], pnode, weight=0, type='real', prom=None)

            if p.startswith('('):  # is tag
                pstack.append(pnode)
            else:  # is word
                wordnodes.add(pnode)
                ## one stack entry is closed per closing paren on the word
                for _ in range(p.count(')')):
                    pstack.pop()

        # add_node() on an existing node only updates its attributes;
        # list(G) snapshots the nodes so the graph is not iterated while touched
        for node in list(G):
            if node in wordnodes:
                G.add_node(node, type='word', color='green')
            else:
                G.add_node(node, type='nonword')

        G = treeStress(G)
        G = tree2grid(G)
        print(G)

    return None
Example #3
0
def parse2phrase(ifn,phrasetype='NP',embedlimit=2,shuffle=False,ldlim=None):
	"""Collect the words of shallow ``phrasetype`` phrases from a parsed file.

	The file is split on the literal ``<sentence `` marker. For each sentence
	chunk the text between ``<parse>`` and ``</parse>`` is tokenised on
	whitespace; a token starting with ``(`` is treated as a tag, any other
	token as a word followed by zero or more ``)``.

	Whenever a token's label (the token with parentheses removed) equals
	``phrasetype`` a new phrase is started and the depth counter is reset.
	Tags increase the depth, closing parens decrease it, and a word is added
	to the current phrase while the depth is at most ``embedlimit``. The
	phrase is stored when the last ``)`` on a word closes a ``phrasetype`` tag
	and that word carries at most ``embedlimit`` closing parens; collection is
	then switched off until the next ``phrasetype`` token.

	Args:
		ifn: path of the file to read.
		phrasetype: tag label that opens a phrase.
		embedlimit: maximum depth at which words are collected, and maximum
			number of closing parens allowed on the phrase-final word.
		shuffle: if true, process the sentences in random order.
		ldlim: maximum number of sentences to process; falsy means no limit.

	Returns:
		A list of phrases. Each phrase is a list of dicts with the keys
		``word`` (lower-cased), ``paren_num`` (1-based token position in the
		parse), ``word_num``, ``word_sentlen`` (number of ``<word>`` elements
		in the sentence chunk) and ``paren_sentlen`` (number of parse tokens).

	Raises:
		IndexError: a sentence chunk has no ``<parse>`` element.
	"""
	# 'with' guarantees the handle is closed even if read() fails
	with open(ifn) as f:
		t=f.read()

	# The chunk before the first '<sentence ' marker is preamble, not a
	# sentence. Drop it *before* shuffling: shuffling first and slicing
	# afterwards would discard a random sentence and could process the preamble.
	sents=t.split('<sentence ')[1:]

	if shuffle:
		import random
		random.shuffle(sents)

	phrases=[]
	phrase=[]
	for sentnum,sentence in enumerate(sents,1):
		if ldlim and sentnum>ldlim: break
		parse=sentence.split('<parse>')[1].split('</parse>')[0]

		# one <word> element per token of the sentence
		sentlen_word=sentence.count('<word>')
		pdat=parse.split()
		sentlen_paren=len(pdat)
		wordi=0
		pstack=[]
		# depth below the current phrasetype tag; None = not collecting
		embedlevel=0

		for pnumi,p in enumerate(pdat):
			pnop=p.replace('(','').replace(')','')
			# skip tokens for which pytxt.noPunc() returns something falsy
			# (presumably punctuation-only tokens -- helper not visible here)
			if not pytxt.noPunc(pnop): continue
			if pnop==phrasetype:
				embedlevel=0
				phrase=[]

			if p.startswith('('):		# is tag
				pstack.append((pnumi,pnop))
				if embedlevel is not None: embedlevel+=1
				continue

			# is word
			word=p.replace(')','').lower()
			wordi+=1

			if embedlevel is not None and embedlevel<=embedlimit:
				# NOTE(review): wordi was already incremented above, so
				# 'word_num' starts at 2 for the first word -- kept as-is
				# because callers may rely on it; confirm intent.
				phrase.append({'word':word, 'paren_num':pnumi+1, 'word_num':wordi+1, 'word_sentlen':sentlen_word, 'paren_sentlen':sentlen_paren})

			## go through tags in stack according to the number of tags which closed
			num_closing_paren=p.count(')')
			# pt = tag closed by the *last* paren; None when the stack had
			# nothing left to close (previously a stale or unbound value)
			pt=None
			for _ in range(num_closing_paren):
				if embedlevel is not None: embedlevel-=1
				pt=pstack.pop() if pstack else None

			if pt is not None and num_closing_paren<=embedlimit and pt[1]==phrasetype:
				phrases.append(phrase)
				phrase=[]
				embedlevel=None

	return phrases
Example #4
0
def parse2phrase(ifn,
                 phrasetype='NP',
                 embedlimit=2,
                 shuffle=False,
                 ldlim=None):
    """Collect the words of shallow ``phrasetype`` phrases from a parsed file.

    The file is split on the literal ``<sentence `` marker. For each sentence
    chunk the text between ``<parse>`` and ``</parse>`` is tokenised on
    whitespace; a token starting with ``(`` is treated as a tag, any other
    token as a word followed by zero or more ``)``.

    Whenever a token's label (the token with parentheses removed) equals
    ``phrasetype`` a new phrase is started and the depth counter is reset.
    Tags increase the depth, closing parens decrease it, and a word is added
    to the current phrase while the depth is at most ``embedlimit``. The
    phrase is stored when the last ``)`` on a word closes a ``phrasetype`` tag
    and that word carries at most ``embedlimit`` closing parens; collection is
    then switched off until the next ``phrasetype`` token.

    Args:
        ifn: path of the file to read.
        phrasetype: tag label that opens a phrase.
        embedlimit: maximum depth at which words are collected, and maximum
            number of closing parens allowed on the phrase-final word.
        shuffle: if true, process the sentences in random order.
        ldlim: maximum number of sentences to process; falsy means no limit.

    Returns:
        A list of phrases. Each phrase is a list of dicts with the keys
        ``word`` (lower-cased), ``paren_num`` (1-based token position in the
        parse), ``word_num``, ``word_sentlen`` (number of ``<word>`` elements
        in the sentence chunk) and ``paren_sentlen`` (number of parse tokens).

    Raises:
        IndexError: a sentence chunk has no ``<parse>`` element.
    """
    # 'with' guarantees the handle is closed even if read() fails
    with open(ifn) as f:
        t = f.read()

    # The chunk before the first '<sentence ' marker is preamble, not a
    # sentence. Drop it *before* shuffling: shuffling first and slicing
    # afterwards would discard a random sentence and could process the preamble.
    sents = t.split('<sentence ')[1:]

    if shuffle:
        import random
        random.shuffle(sents)

    phrases = []
    phrase = []
    for sentnum, sentence in enumerate(sents, 1):
        if ldlim and sentnum > ldlim: break
        parse = sentence.split('<parse>')[1].split('</parse>')[0]

        # one <word> element per token of the sentence
        sentlen_word = sentence.count('<word>')
        pdat = parse.split()
        sentlen_paren = len(pdat)
        wordi = 0
        pstack = []
        # depth below the current phrasetype tag; None = not collecting
        embedlevel = 0

        for pnumi, p in enumerate(pdat):
            pnop = p.replace('(', '').replace(')', '')
            # skip tokens for which pytxt.noPunc() returns something falsy
            # (presumably punctuation-only tokens -- helper not visible here)
            if not pytxt.noPunc(pnop): continue
            if pnop == phrasetype:
                embedlevel = 0
                phrase = []

            if p.startswith('('):  # is tag
                pstack.append((pnumi, pnop))
                if embedlevel is not None: embedlevel += 1
                continue

            # is word
            word = p.replace(')', '').lower()
            wordi += 1

            if embedlevel is not None and embedlevel <= embedlimit:
                # NOTE(review): wordi was already incremented above, so
                # 'word_num' starts at 2 for the first word -- kept as-is
                # because callers may rely on it; confirm intent.
                phrase.append({
                    'word': word,
                    'paren_num': pnumi + 1,
                    'word_num': wordi + 1,
                    'word_sentlen': sentlen_word,
                    'paren_sentlen': sentlen_paren
                })

            ## go through tags in stack according to the number of tags which closed
            num_closing_paren = p.count(')')
            # pt = tag closed by the *last* paren; None when the stack had
            # nothing left to close (previously a stale or unbound value)
            pt = None
            for _ in range(num_closing_paren):
                if embedlevel is not None: embedlevel -= 1
                pt = pstack.pop() if pstack else None

            if (pt is not None and num_closing_paren <= embedlimit
                    and pt[1] == phrasetype):
                phrases.append(phrase)
                phrase = []
                embedlevel = None

    return phrases