Esempio n. 1
0
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Script: compute the average sentence length (tokens per sentence) of every
# ".txt" file in the corpus directory and draw the values as a bar chart.

# `os` was used below without ever being imported in this script.
import os

import matplotlib.pyplot as plt

# NOTE(review): `tdm` is not imported anywhere in the visible code. It is
# presumably the project's text-mining helper module -- confirm and add the
# matching import, otherwise the calls below raise NameError.

# Directory that is scanned for corpus files.
dir = 'Corpus'

# Parallel lists: titles[i] is the bar label for sentenceLength[i].
titles = []
sentenceLength = []


# os.listdir() returns names in arbitrary order; sorting keeps the bars in
# the same (alphabetical) order on every run.
for file in sorted( os.listdir( dir ) ):
    # Accept only names that really end in ".txt". The previous unanchored
    # regex also matched names such as "notes.txt.bak".
    if not file.endswith( '.txt' ):
        continue

    fullPath = os.path.join( dir , file )
    print( 'Analysing ' +  fullPath + '...' )
    tokens = tdm.numberOfTokens( fullPath )
    sentences = tdm.numberOfSentences( fullPath )

    # A file in which no sentences are counted would divide by zero;
    # record an average length of 0 for it instead of crashing.
    sentenceLength.append( tokens / sentences if sentences else 0 )

    # Bar label: the file name with only its trailing ".txt" removed.
    title = file[ : -len( '.txt' ) ]
    titles.append( title )


# One bar per corpus file, bar height = tokens per sentence.
fig = plt.figure( figsize=( 12 , 5 ) )
ax = plt.axes()

ax.bar( titles , sentenceLength , width = 0.6 , alpha = 0.5 , color = '#03017a')
Esempio n. 2
0
import re
import os
from os.path import join
import dtdpTdm as dtdp

# Script: write one CSV row per ".txt" file in the corpus directory with the
# file's token, sentence and syllable counts as reported by `dtdp`.

# Directory that is scanned for corpus files.
dir = 'Corpus'

# The with-block closes data.csv even when one of the analysis calls raises;
# the former open()/close() pair leaked the handle in that case.
with open('data.csv', 'w') as out:
    # Header row.
    out.write('title,tokens,sentences,syllables\n')

    # os.listdir() returns names in arbitrary order; sorting makes the row
    # order reproducible between runs.
    for file in sorted(os.listdir(dir)):
        # Accept only names that really end in ".txt". The previous
        # unanchored regex also matched names such as "notes.txt.bak".
        if not file.endswith('.txt'):
            continue

        fileName = join(dir, file)
        print("Analysing " + fileName + " ...")
        print("Calculating number of tokens")
        tokens = dtdp.numberOfTokens(fileName)
        print("Calculating number of sentences")
        sentences = dtdp.numberOfSentences(fileName)
        print("Calculating number of syllables")
        syllables = dtdp.numberOfSyllables(fileName)

        # Write the row only after all three counts exist, so a failure
        # part-way through a file cannot leave a truncated row behind.
        # NOTE(review): fields are written unquoted, so a path containing
        # a comma would shift the columns of its row.
        row = [fileName, str(tokens), str(sentences), str(syllables)]
        out.write(','.join(row) + '\n')