Example #1
import numpy

import data        # project-local corpus I/O helpers
import preprocess  # project-local tokenizers
import plotter     # project-local plotting helpers

def dataset_stats(dataset):
    """
    Print and plot statistics for a given dataset.
    A histogram is plotted with the document length distribution of the data.
    """
    print('> Reading data..', dataset)
    corpus_path = '../data/' + dataset
    documents, labels = data.read_files(corpus_path)
    file_names = data.get_file_names(corpus_path)
    lengths = []
    empty = 0
    for i, d in enumerate(documents):
        d = preprocess.tokenize_tokens(d)
        lengths.append(len(d))
        if len(d) == 0:
            print(file_names[i], 'is empty')
            empty += 1
    lengths = numpy.array(lengths)
    print('# documents:', len(documents))
    print('# empty documents:', empty)
    print('# words:', sum(lengths))
    print('length avg:', lengths.mean())
    print('length stddev:', lengths.std())
    print()
    print('document lengths (sorted):', sorted(lengths))
    plotter.histogram(lengths, '# tokens', '# documents', '', bins=80)
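
The plotter.histogram helper is project-local and not shown in these examples. A minimal sketch of what it might look like, assuming it is a thin wrapper over matplotlib; the name and signature are inferred from the calls above, not confirmed:

# Hypothetical sketch of plotter.histogram; assumes a thin matplotlib
# wrapper with the signature inferred from the calls in these examples.
import matplotlib.pyplot as plt

def histogram(values, xlabel, ylabel, title='', bins=10):
    plt.hist(values, bins=bins)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.show()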
Example #2
import preprocess

def test_case(report=None):
    # load_report and ReportCase are defined in the surrounding module.
    if report is None:
        report = load_report('../data/air/html2/test/a04h0001.html')
    case = ReportCase(report)
    # Print the report's section hierarchy.
    for section in report.sections:
        print(section.title)
        for subsec in section.sections:
            print('   ', subsec.title)
    print()
    # print(case.description)
    # print(case.solution)
    print()
    # Compare token counts of the case's solution and description.
    print(len(preprocess.tokenize_tokens(case.solution)))
    print(len(preprocess.tokenize_tokens(case.description)))
Example #3
def store_document_lengths(dataset):
    """Tokenize each document and pickle the list of document lengths."""
    # read_files and pickle_to_file are module-level helpers here.
    path = '../data/' + dataset
    docs, labels = read_files(path)
    lengths = []
    for doc in docs:
        doc = preprocess.tokenize_tokens(doc)
        lengths.append(len(doc))
    print(lengths)
    pickle_to_file(lengths, 'output/document-lengths/' + dataset)
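
pickle_to_file, and its counterpart pickle_from_file used in the last example, are also not shown. A minimal sketch using only the standard library, assuming they are plain pickle wrappers; the project's versions may differ:

# Hypothetical pickle helpers; assumptions, not the project's code.
import pickle

def pickle_to_file(obj, path):
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

def pickle_from_file(path):
    with open(path, 'rb') as f:
        return pickle.load(f)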
Example #4
def store_sentence_lengths(dataset):
    """Split documents into sentences and pickle the sentence lengths."""
    path = '../data/' + dataset
    docs, labels = read_files(path)
    lengths = []
    for doc in docs:
        # Count tokens per sentence rather than per document.
        sentences = preprocess.tokenize_sentences(doc)
        for s in sentences:
            tokens = preprocess.tokenize_tokens(s)
            lengths.append(len(tokens))
    print(lengths)
    pickle_to_file(lengths, 'output/sentence-lengths/' + dataset)
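
The preprocess tokenizers themselves are not included in these examples either. Plausible stand-ins built on NLTK; this is an assumption for illustration, not the project's actual implementation:

# Hypothetical stand-ins for the preprocess tokenizers, using NLTK.
# Requires the 'punkt' models: nltk.download('punkt')
import nltk

def tokenize_sentences(text):
    return nltk.sent_tokenize(text)

def tokenize_tokens(text):
    return nltk.word_tokenize(text)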
Example #5
import preprocess

def plot_sentence_lengths(datafile=None):
    """
    Plot a histogram of sentence lengths for a given dataset.
    """
    if datafile is None:
        print('> reading data..')
        path = '../data/tasa/TASA900_text'
        texts, labels = data.read_files(path)
        sentence_lengths = []
        print('> counting lengths..')
        for text in texts:
            sentences = preprocess.tokenize_sentences(text)
            for sentence in sentences:
                tokens = preprocess.tokenize_tokens(sentence)
                sentence_lengths.append(len(tokens))
        # Cache the lengths so later runs can skip tokenization.
        data.pickle_to_file(sentence_lengths, 'output/tasa_sentence_lengths.pkl')
    else:
        sentence_lengths = data.pickle_from_file(datafile)
    plotter.histogram(sentence_lengths, 'sentence length (tokens)', '# sentences', bins=70)
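
Typical usage, per the paths hard-coded above:

# First call tokenizes the TASA900 texts and caches the lengths.
plot_sentence_lengths()
# Later calls reuse the cached pickle instead of re-tokenizing.
plot_sentence_lengths('output/tasa_sentence_lengths.pkl')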