Beispiel #1
0
    def test_arias_model(self):
        # Run the Arias extractor (cutoff_percent=60, window=2) over the
        # shared big_html_doc fixture.
        extracted = Arias(60, 2).analyze(big_html_doc)

        # Build the expected text by hand: the text of blocks 1, 2 and 3 of
        # the blockified document, joined by single spaces.
        doc_blocks = Blockifier.blockify(big_html_doc)
        expected_parts = []
        for idx in (1, 2, 3):
            expected_parts.append(doc_blocks[idx].text)
        expected = ' '.join(expected_parts)

        self.assertEqual(expected, extracted)
Beispiel #2
0
    def test_arias_model(self):
        # Extractor under test: Arias with a cutoff percent of 60 and a
        # window of 2, applied to the big_html_doc fixture.
        model = Arias(60, 2)
        model_output = model.analyze(big_html_doc)

        # The reference content is the text of blocks 1 through 3 of the
        # blockified document, separated by single spaces.
        all_blocks = Blockifier.blockify(big_html_doc)
        reference = ' '.join(all_blocks[i].text for i in range(1, 4))

        self.assertEqual(reference, model_output)
Beispiel #3
0
#! /usr/bin/env python

# Run a particular algorithm on the entire set of documents

from dragnet import Arias as technique

import os
# Walk documents/<site>/<document>, run the extractor on every document and
# write the result to the mirrored path output/<site>/<document>.
for site in os.listdir('documents'):
    sitepath = os.path.join('documents', site)
    # Stray regular files directly under documents/ are not sites; calling
    # os.listdir() on them would raise OSError, so skip them.
    if not os.path.isdir(sitepath):
        continue

    # Make sure the output directory exists. It only depends on the site,
    # so create it once per site rather than once per document.
    outdir = os.path.join('output', site)
    try:
        os.makedirs(outdir)
    except OSError:
        # An already-existing directory is fine; anything else (permission
        # denied, a regular file in the way, ...) is a real error and must
        # surface here instead of as a confusing failure on open() below.
        if not os.path.isdir(outdir):
            raise

    for document in os.listdir(sitepath):
        inpath = os.path.join(sitepath, document)
        # Sub-directories cannot be opened as documents; skip them.
        if not os.path.isfile(inpath):
            continue
        outpath = os.path.join(outdir, document)

        # The parenthesised single-argument form prints the same text under
        # both the print statement and the print function.
        print('Working on %s' % inpath)

        # Read in and analyze before opening the output file, so a failing
        # analysis does not leave behind a truncated or empty output file.
        with open(inpath) as inf:
            result = technique.analyze(inf.read(), inpath)

        # Write out
        with open(outpath, 'w+') as outf:
            outf.write(result)