Beispiel #1
0
    def test_parses_clubweb09(self):
        """Parse the ClueWeb09 sample dump and check the first two documents.

        Checks the offset reported by ``warc.tell()`` before reading and after
        each of the first two records, each record's URI, the start and end of
        its raw content, and finally the total number of documents in the dump.
        """
        # Keep the dump open for the whole test (the parser is still being
        # consumed by the final ``list(warc)``) and close it deterministically
        # afterwards instead of leaking the file handle.
        with open(fixtures.dumppath('ClueWeb09_English_Sample')) as dump:
            warc = WARCParser(dump)
            self.assertEqual(0, warc.tell())

            d1 = next(warc)
            self.assertEqual(21894, warc.tell())  # TODO: check 21894
            self.assertEqual('http://www.smartwebby.com/DreamweaverTemplates/templates/business_general_template59.asp', d1.uri)
            self.assertTrue(d1.raw.startswith('<!DOCTYPE HTML PUBLIC'))
            self.assertTrue(d1.raw.endswith('<!-- InstanceEnd --></html>'))

            d2 = next(warc)
            self.assertEqual(43359, warc.tell())  # TODO: check 43359
            self.assertEqual('http://www.smartwebby.com/DreamweaverTemplates/templates/business_telecom_template71.asp', d2.uri)
            self.assertTrue(d2.raw.startswith('<!DOCTYPE HTML PUBLIC'))
            self.assertTrue(d2.raw.endswith('<!-- InstanceEnd --></html>'))

            # Total of 100 docs, but we already iterated over 2.
            self.assertEqual(100, len(list(warc)) + 2)
Beispiel #2
0
    def setUpClass(klass):
        """Prepare the spec, client and docset for an integration test class.

        Creates ``klass.spec`` and ``klass.fqclient`` from the class name,
        builds and saves a docset from every dump named in ``klass.dumps``,
        then indexes and/or ranks depending on ``klass.index`` and
        ``klass.rank``. Ranking uses ``klass.niter`` iterations when that
        attribute exists, otherwise 2.
        """
        # Nothing is set up for the class literally named IntegrationTestCase
        # (presumably the shared base class -- confirm against its definition).
        if klass.__name__ == 'IntegrationTestCase':
            return

        spec = Spec(klass.__name__)
        klass.spec = spec
        klass.fqclient = FreequeryClient(spec)

        # Docset: assigned before clean_up() runs, then filled from the dumps.
        klass.docset = Docset(spec.docset_name)
        klass.clean_up()
        for name in klass.dumps:
            klass.docset.add_dump(name, dumppath(name))
        klass.docset.save()

        # Indexing is optional per test class.
        if klass.index:
            klass.fqclient.index()

        # Ranking is optional too; link parsing always precedes it.
        if not klass.rank:
            return
        klass.fqclient.linkparse()
        klass.fqclient.rank(niter=getattr(klass, 'niter', 2))
Beispiel #3
0
import os, unittest, StringIO
from freequery.graph.links import LinkFile, LinkFileOutputStream
from freequery.formats.warc import WARCParser
from freequery.test import fixtures

# Raw bytes of the "small1-links" fixture, read once when this module is
# imported. The test cases below parse this content and compare written
# output against it.
with open(fixtures.dumppath("small1-links"), "rb") as lf:
    small1_links = lf.read()


class TestLinkFile(unittest.TestCase):
    """Tests for reading a link file with ``LinkFile``."""

    def test_parses_file1(self):
        """Parsing the small1 link fixture yields every document's links.

        The ``uri -> link_uris`` mapping read from the link file must equal
        the mapping built from ``fixtures.dumpdocs("small1")``.
        """
        # LinkFile is handed the fixture as a list of lines, line endings kept.
        linkfile = LinkFile(small1_links.splitlines(True))
        doclinks = dict((doc.uri, list(doc.link_uris)) for doc in linkfile)
        exp_doclinks = dict(
            (uri, list(doc.link_uris))
            for (uri, doc) in fixtures.dumpdocs("small1").items()
        )
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(exp_doclinks, doclinks)


class TestLinkFileOutputStream(unittest.TestCase):
    """Tests for writing a link file with ``LinkFileOutputStream``."""

    def test_writes_file1(self):
        """Writing the small1 documents reproduces the small1 link fixture."""
        out = StringIO.StringIO()
        writer = LinkFileOutputStream(out)
        docs = fixtures.dumpdocs("small1")
        # Documents are added in this fixed order; the resulting output is
        # compared to the fixture content verbatim.
        for uri in ("http://example.com/",
                    "http://example.com/about",
                    "http://example.com/contact"):
            writer.add(uri, docs[uri].link_uris)
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(small1_links, out.getvalue())