def dataset_stats(dataset):
    """
    Print and plot statistics for a given dataset.

    A histogram is plotted with the document length distribution of the data.
    """
    print '> Reading data..', dataset
    corpus_path = '../data/' + dataset
    (documents, labels) = data.read_files(corpus_path)
    file_names = data.get_file_names(corpus_path)
    lengths = []
    empty = 0
    for i, d in enumerate(documents):
        # Record each document's length in tokens, flagging empty documents
        d = preprocess.tokenize_tokens(d)
        lengths.append(len(d))
        if len(d) == 0:
            print file_names[i], 'is empty'
            empty += 1
    lengths = numpy.array(lengths)
    print '# documents:', len(documents)
    print '# empty documents:', empty
    print '# words:', sum(lengths)
    print 'length avg:', lengths.mean()
    print 'length stddev:', lengths.std()
    print
    print 'document lengths (sorted):', sorted(lengths)
    plotter.histogram(lengths, '# tokens', '# documents', '', bins=80)

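# A minimal usage sketch (an assumption about how this helper is invoked,
# not part of the original module); the 'tasa/TASA900_text' corpus path is
# the one referenced in plot_sentence_lengths below:
#
#     dataset_stats('tasa/TASA900_text')
#
# This prints token-count statistics for the corpus and renders the
# document length histogram via plotter.histogram().
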
def test_case(report=None):
    if report is None:
        report = load_report('../data/air/html2/test/a04h0001.html')
    case = ReportCase(report)
    # Print the report's section/subsection hierarchy
    for section in report.sections:
        print section.title
        for subsec in section.sections:
            print '  ', subsec.title
    print
    #~ print case.description
    #~ print
    #~ print case.solution
    print
    import preprocess
    print len(preprocess.tokenize_tokens(case.solution))
    print len(preprocess.tokenize_tokens(case.description))

def store_document_lengths(dataset):
    path = '../data/' + dataset
    docs, labels = read_files(path)
    lengths = []
    for doc in docs:
        doc = preprocess.tokenize_tokens(doc)
        lengths.append(len(doc))
    print lengths
    pickle_to_file(lengths, 'output/document-lengths/' + dataset)

def store_sentence_lengths(dataset):
    path = '../data/' + dataset
    docs, labels = read_files(path)
    lengths = []
    for doc in docs:
        # Split each document into sentences, then count tokens per sentence
        sentences = preprocess.tokenize_sentences(doc)
        for s in sentences:
            tokens = preprocess.tokenize_tokens(s)
            lengths.append(len(tokens))
    print lengths
    pickle_to_file(lengths, 'output/sentence-lengths/' + dataset)

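# A hedged sketch of reading the stored lengths back for quick summary
# statistics. pickle_from_file is assumed to be the counterpart of the
# pickle_to_file helper used above (it appears as data.pickle_from_file
# in plot_sentence_lengths below):
#
#     lengths = numpy.array(pickle_from_file('output/document-lengths/' + dataset))
#     print lengths.mean(), lengths.std()
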
def plot_sentence_lengths(datafile=None):
    """
    Plot a histogram of sentence lengths within a given dataset.

    If *datafile* is given, pre-computed sentence lengths are unpickled from
    that file; otherwise the lengths are computed from the TASA900 corpus
    and pickled for later reuse.
    """
    if datafile is None:
        import preprocess
        print '> reading data..'
        path = '../data/tasa/TASA900_text'
        texts, labels = data.read_files(path)
        sentence_lengths = []
        print '> counting lengths..'
        for text in texts:
            sentences = preprocess.tokenize_sentences(text)
            for sentence in sentences:
                tokens = preprocess.tokenize_tokens(sentence)
                sentence_lengths.append(len(tokens))
        data.pickle_to_file(sentence_lengths, 'output/tasa_sentence_lengths.pkl')
    else:
        sentence_lengths = data.pickle_from_file(datafile)
    plotter.histogram(sentence_lengths, 'sentence length (tokens)', '# sentences', bins=70)

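if __name__ == '__main__':
    # Example invocation (an assumption about how this script is run, not
    # part of the original module): reuse the pickle produced by an earlier
    # run if present, otherwise recompute from the TASA900 corpus.
    import os
    pkl = 'output/tasa_sentence_lengths.pkl'
    if os.path.exists(pkl):
        plot_sentence_lengths(datafile=pkl)
    else:
        plot_sentence_lengths()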