Example #1
0
sentfolder='/Lab/Projects/sentence/sents'
sentparafolder='/Lab/Projects/sentence/sentparas'

import os, pytxt

# Rewrite each one-sentence-per-line file as one sentence per paragraph
# (a blank line between sentences).
for sentfn in os.listdir(sentfolder):
	sentfnfn=os.path.join(sentfolder,sentfn)
	f=open(sentfnfn)
	t=f.read()
	f.close()

	# appending '\n' to each sentence and then joining on '\n' double-spaces them
	x=[sentence+'\n' for sentence in t.split('\n')]

	ofn=os.path.join(sentparafolder,sentfn)
	pytxt.write(ofn,'\n'.join(x),toprint=True)
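pytxt is a local helper module, not a published package. A minimal sketch of the write() these examples appear to assume — a string is written as-is, a list of rows is tab-joined, and toprint echoes the output — is below; this is an inference from usage, not the real module:

def write(fn, data, toprint=False):
	# hypothetical stand-in for pytxt.write (assumed behavior):
	# tab-join a list of rows into lines; write a string unchanged
	if not isinstance(data, basestring):
		data = '\n'.join('\t'.join(row) for row in data)
	f = open(fn, 'w')
	f.write(data)
	f.close()
	if toprint:
		print data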
Example #2
0
	exit()  # debug stop left in: everything below is skipped

	# stats is assumed to map cadence length -> list of row-dicts
	# with 'pattern' and 'oe' (observed/expected) fields
	import rpyd2
	for i in stats:
		r=rpyd2.RpyD2([d for d in stats[i]])
		r.plot(fn='cadences-length-'+str(i).zfill(4)+'.',
			x='pattern',
			y='oe',
			title='Cadences of '+str(i)+' syllables in length, measured for their observed/expected appearance at ends of sentences',
			boxplot=True,group='pattern',col='pattern',point=True,smooth=False,flip=True)

	exit()  # second debug stop: the tallies below never run

	# write overall pattern counts, most frequent first
	o=[]
	for p,count in sorted(profile_stats.items(),key=lambda lx: -lx[1]):
		o+=[ [str(p),str(count)] ]

	import pytxt
	pytxt.write('cadence-stats.txt',o,toprint=True)

	# write the 1000 most frequent patterns for each cadence length
	for i in profile_eg:
		o=[]
		lines=1000
		line=0
		for p,count in sorted(profile_eg[i].items(),key=lambda lx: -lx[1]):
			line+=1
			o+=[ [str(p),str(count)] ]
			if line>=lines: break

		pytxt.write('cadence-egs.'+str(i).zfill(2)+'.txt',o,toprint=True)
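The snippet assumes stats, profile_stats, and profile_eg were built earlier. Judging from the plot arguments and the writes, stats[i] is a list of {'pattern': ..., 'oe': ...} row-dicts, profile_stats maps a cadence pattern to its count, and profile_eg maps a cadence length to such a counter. A hypothetical sketch of how the two counters might be filled (cadences is an assumed list of pattern strings, one token per syllable):

profile_stats={}
profile_eg={}
for pattern in cadences:  # hypothetical input: cadence pattern strings
	n=len(pattern.split())  # cadence length in syllables
	profile_stats[pattern]=profile_stats.get(pattern,0)+1
	bylen=profile_eg.setdefault(n,{})
	bylen[pattern]=bylen.get(pattern,0)+1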
Example #3
0
                    if embedlevel < 0 or embedlevel > embedlimit:
                        break
                else:
                    print x
                    if x.isalpha() and x == x.upper(): continue  # skip uppercase characters (parse-tag letters)
                    pstr += x
            exit()  # debug stop left in: the phrase bookkeeping below is skipped

            pstr = pstr.replace('  ', ' ').strip()
            if not pstr: continue
            if ' ' not in pstr: continue  # one-word phrases are not interesting

            # tally the phrase; this restores the commented-out count that
            # the np-stats output at the bottom depends on
            pdict[pstr] = pdict.get(pstr, 0) + 1

            try:
                print pstr, sentstr.index(pstr)
            except ValueError:  # phrase not found verbatim in the sentence
                print "!!" * 10
                print sentstr
                print pstr
                print "!!" * 10

exit()  # debug stop: the summary below never runs
o = ''
for k, v in sorted(pdict.items(), key=lambda x: -x[1]):
    o += str(k) + '\t' + str(v) + '\n'
import pytxt
pytxt.write('np-stats.txt', o, toprint=True)
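A side note on the try/except above: str.index raises ValueError when the substring is missing, which is what the banner branch catches. str.find returns -1 instead, so the same check can be written without the exception:

pos = sentstr.find(pstr)
if pos == -1:
    print "!!" * 10
    print sentstr
    print pstr
    print "!!" * 10
else:
    print pstr, pos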
Example #4
0
            x='pattern',
            y='oe',
            title='Cadences of ' + str(i) +
            ' syllables in length, measured for their observed/expected appearance at ends of sentences',
            boxplot=True,
            group='pattern',
            col='pattern',
            point=True,
            smooth=False,
            flip=True)

    exit()  # debug stop: the tallies below never run

    # write overall pattern counts, most frequent first
    o = []
    for p, count in sorted(profile_stats.items(), key=lambda lx: -lx[1]):
        o += [[str(p), str(count)]]

    import pytxt
    pytxt.write('cadence-stats.txt', o, toprint=True)

    # write the 1000 most frequent patterns for each cadence length
    for i in profile_eg:
        o = []
        lines = 1000
        line = 0
        for p, count in sorted(profile_eg[i].items(), key=lambda lx: -lx[1]):
            line += 1
            o += [[str(p), str(count)]]
            if line >= lines: break

        pytxt.write('cadence-egs.' + str(i).zfill(2) + '.txt', o, toprint=True)
Example #5
0
import os, random, rpyd2, pytxt

def parse2lines(fn):
	#ifn=sys.argv[1]
	ifn=fn or '/Lab/Projects/sentence/parsed/middlemarch.txt.xml'
	ofn=os.path.basename(ifn)
	f=open(ifn)
	t=str(f.read())
	f.close()
	# everything before the first <sentence> is XML preamble; drop it
	# before shuffling so a real sentence isn't lost instead
	sents=t.split('<sentence ')[1:]

	ldlim=100
	for nn in range(30,31):
		ld=[]
		dl={}
		o=[]
		sentnum=0
		random.shuffle(sents)
		print nn, "?"
		for sentence in sents:
			tokens=[]
			for token in sentence.split('<word>')[1:]:
				token=token.split('</word>')[0]
				tokens+=[token]

			if len(tokens)!=nn: continue

			# skip sentences whose tokens don't decode cleanly
			try:
				[unicode(bb) for bb in tokens]
			except UnicodeDecodeError:
				continue

			sentnum+=1
			if ldlim and sentnum>ldlim: break
			parse=sentence.split('<parse>')[1].split('</parse>')[0]
			pdat=parse.split()
			wordi=0
			y=4
			o+=[['sent'+str(sentnum).zfill(3)," ".join(tokens)]]

			for p in pdat:
				try:
					w=tokens[wordi]
				except IndexError:
					continue
				pnop=p.replace('(','').replace(')','')

				if pnop==w:
					wordi+=1
					if wordi>=len(tokens): break  # the sentence-final token is never recorded

					d={}
					d['wordnum']=wordi
					d['depth']=y
					d['sentnum']=str(sentnum).zfill(3)

					key='sent'+str(sentnum).zfill(3)
					dl.setdefault(key,[]).append(y)

					ld.append(d)

				# y falls on every opening bracket and rises on closes,
				# so more deeply embedded words get lower y values
				y+=p.count(')')
				y-=p.count('(')

		if not ld: continue
		if ldlim and sentnum<ldlim: continue  # require a full sample
		r1=rpyd2.RpyD2(dl)
		r2=rpyd2.RpyD2(ld)
		pytxt.write('sentkey.'+ofn+'.'+str(nn).zfill(3)+'.txt',o,toprint=True)
		#r2.plot(x='wordnum',y='depth',col='sentnum',group='sentnum',line=True,point=False)
		#r1.corrgram()
		r1.kclust(cor=True)
		r1.hclust(cor=True)
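To make the bracket arithmetic concrete, here is the same depth walk over a small hand-made parse string (the sentence and parse are invented for illustration; the format is the Penn-style bracketing found between the <parse> tags):

tokens=['The','cat','sat','down']
parse='(ROOT (S (NP (DT The) (NN cat)) (VP (VBD sat) (ADVP (RB down)))))'
y=4
wordi=0
for p in parse.split():
	pnop=p.replace('(','').replace(')','')
	if wordi<len(tokens) and pnop==tokens[wordi]:
		print tokens[wordi], y
		wordi+=1
	y+=p.count(')')
	y-=p.count('(')

This prints 0 for 'The', 'cat', and 'sat' (each four brackets deep) and -1 for the more deeply embedded 'down': y works out to 4 minus the word's current bracket depth.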
Example #6
0
					print x
					if x.isalpha() and x==x.upper(): continue  # skip uppercase characters (parse-tag letters)
					pstr+=x
			exit()  # debug stop left in: the phrase bookkeeping below is skipped

			pstr=pstr.replace('  ',' ').strip()
			if not pstr: continue
			if ' ' not in pstr: continue  # one-word phrases are not interesting

			# tally the phrase; this restores the commented-out count that
			# the np-stats output at the bottom depends on
			pdict[pstr]=pdict.get(pstr,0)+1

			try:
				print pstr,sentstr.index(pstr)
			except ValueError:  # phrase not found verbatim in the sentence
				print "!!"*10
				print sentstr
				print pstr
				print "!!"*10

exit()  # debug stop: the summary below never runs
o=''
for k,v in sorted(pdict.items(),key=lambda x: -x[1]):
	o+=str(k)+'\t'+str(v)+'\n'
import pytxt
pytxt.write('np-stats.txt',o,toprint=True)
Example #7
0
import os, random, rpyd2, pytxt

def parse2lines(fn):
    #ifn=sys.argv[1]
    ifn = fn or '/Lab/Projects/sentence/parsed/middlemarch.txt.xml'
    ofn = os.path.basename(ifn)
    f = open(ifn)
    t = str(f.read())
    f.close()
    # everything before the first <sentence> is XML preamble; drop it
    # before shuffling so a real sentence isn't lost instead
    sents = t.split('<sentence ')[1:]

    ldlim = 100
    for nn in range(30, 31):
        ld = []
        dl = {}
        o = []
        sentnum = 0
        random.shuffle(sents)
        print nn, "?"
        for sentence in sents:
            tokens = []
            for token in sentence.split('<word>')[1:]:
                token = token.split('</word>')[0]
                tokens += [token]

            if len(tokens) != nn: continue

            # skip sentences whose tokens don't decode cleanly
            try:
                [unicode(bb) for bb in tokens]
            except UnicodeDecodeError:
                continue

            sentnum += 1
            if ldlim and sentnum > ldlim: break
            parse = sentence.split('<parse>')[1].split('</parse>')[0]
            pdat = parse.split()
            wordi = 0
            y = 4
            o += [['sent' + str(sentnum).zfill(3), " ".join(tokens)]]

            for p in pdat:
                try:
                    w = tokens[wordi]
                except IndexError:
                    continue
                pnop = p.replace('(', '').replace(')', '')

                if pnop == w:
                    wordi += 1
                    if wordi >= len(tokens): break  # the sentence-final token is never recorded

                    d = {}
                    d['wordnum'] = wordi
                    d['depth'] = y
                    d['sentnum'] = str(sentnum).zfill(3)

                    key = 'sent' + str(sentnum).zfill(3)
                    dl.setdefault(key, []).append(y)

                    ld.append(d)

                # y falls on every opening bracket and rises on closes,
                # so more deeply embedded words get lower y values
                y += p.count(')')
                y -= p.count('(')

        if not ld: continue
        if ldlim and sentnum < ldlim: continue  # require a full sample
        r1 = rpyd2.RpyD2(dl)
        r2 = rpyd2.RpyD2(ld)
        pytxt.write('sentkey.' + ofn + '.' +
                    str(nn).zfill(3) + '.txt',
                    o,
                    toprint=True)
        #r2.plot(x='wordnum',y='depth',col='sentnum',group='sentnum',line=True,point=False)
        #r1.corrgram()
        r1.kclust(cor=True)
        r1.hclust(cor=True)