import os
import re
from os.path import join

import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# NOTE(review): the original script called `tdm.numberOfTokens` and
# `tdm.numberOfSentences` without ever importing `tdm`. The companion script
# in this file imports the helper module as `dtdpTdm`, which exposes the same
# two functions, so it is bound to the `tdm` name here. Confirm this is the
# intended module.
import dtdpTdm as tdm

# Plot the average sentence length (tokens per sentence) of every text file
# in the corpus directory as a bar chart, one bar per file.

dir = 'Corpus'
titles = []
sentenceLength = []

# Sorted so that the bars appear in a reproducible (alphabetical) order;
# os.listdir() makes no ordering guarantee.
for file in sorted(os.listdir(dir)):
    # Test the actual extension: an unanchored '[.]txt' search would also
    # pick up names such as 'notes.txt.bak'.
    if not file.endswith('.txt'):
        continue

    fullPath = join(dir, file)
    print('Analysing ' + fullPath + '...')

    tokens = tdm.numberOfTokens(fullPath)
    sentences = tdm.numberOfSentences(fullPath)

    # A file without any sentence has no meaningful average; skip it rather
    # than crash with ZeroDivisionError. titles and sentenceLength are only
    # appended together, so they stay aligned.
    if sentences == 0:
        print('Skipping ' + fullPath + ': no sentences found')
        continue

    sentenceLength.append(tokens / sentences)

    # Remove only the trailing extension, not every '.txt' inside the name.
    title = file[:-len('.txt')]
    titles.append(title)

fig = plt.figure(figsize=(12, 5))
ax = plt.axes()
ax.bar(titles, sentenceLength, width=0.6, alpha=0.5, color='#03017a')
import csv
import os
import re
from os.path import join

import dtdpTdm as dtdp

# Write one CSV row per text file in the corpus directory, holding the file's
# path followed by its token, sentence and syllable counts as reported by the
# dtdpTdm helper module.

dir = 'Corpus'

# The context manager closes data.csv even if one of the helper calls raises.
# newline handling is left at the default and the writer emits '\n', so row
# endings are the same as those produced by the original out.write('\n').
with open('data.csv', 'w') as out:
    # csv.writer quotes fields when needed, so a path that contains a comma
    # or a quote can no longer break the column layout.
    writer = csv.writer(out, lineterminator='\n')

    # Header row. Note that the 'title' column holds the joined path
    # (directory + file name), exactly as before.
    writer.writerow(['title', 'tokens', 'sentences', 'syllables'])

    # Sorted for a reproducible row order; os.listdir() guarantees none.
    for file in sorted(os.listdir(dir)):
        # Test the actual extension: an unanchored '[.]txt' search would
        # also pick up names such as 'notes.txt.bak'.
        if not file.endswith('.txt'):
            continue

        fileName = join(dir, file)
        print("Analysing " + fileName + " ...")

        print("Calculating number of tokens")
        tokens = dtdp.numberOfTokens(fileName)

        print("Calculating number of sentences")
        sentences = dtdp.numberOfSentences(fileName)

        print("Calculating number of syllables")
        syllables = dtdp.numberOfSyllables(fileName)

        # The row is written only once all three counts are available, so a
        # failure part-way through never leaves a truncated line behind.
        writer.writerow([fileName, tokens, sentences, syllables])