Example #1
0
 def test_writes_file1(self):
     """Round-trip two fixture records through WARCWriter and WARCParser.

     Writes ``fixtures.example`` and ``fixtures.apple`` to an in-memory
     buffer, then parses the buffer back and checks that the records come
     out equal to the originals and in the same order.
     """
     out = StringIO.StringIO()
     writer = WARCWriter(out)
     writer.write(fixtures.example)
     writer.write(fixtures.apple)
     # Rewind so the parser starts reading at the first written record.
     out.seek(0)
     parser = WARCParser(out)
     # assertEqual is the supported spelling; assertEquals is a deprecated
     # alias that newer unittest versions no longer provide.
     self.assertEqual(fixtures.example, parser.next())
     self.assertEqual(fixtures.apple, parser.next())
Example #2
0
 def add_dump(self, dumpname, dump):
     """
     Index every document of a dump file and push the dump to DDFS.

     For each document yielded by the parser, ``self.index`` gets an entry
     keyed by the document's ``uri`` holding ``(dumpname, start, length)``,
     where ``start`` is the parser position before the document and
     ``length`` is how far the position advanced while reading it.
     A later document with the same uri overwrites the earlier entry.

     After indexing, the file at path ``dump`` is pushed to DDFS under
     ``self.ddfs_tag`` with the name ``dumpname`` and the docset is
     flagged as dirty.
     """
     offset = 0
     with open(dump, 'rb') as handle:
         parser = WARCParser(handle)
         for record in parser:
             # The position reported after a record is also where the
             # next record begins.
             next_offset = parser.tell()
             length = next_offset - offset
             self.index[record.uri] = (dumpname, offset, length)
             offset = next_offset
     uploads = [(dump, dumpname)]
     self.ddfs.push(self.ddfs_tag, uploads)
     self.dirty = True
Example #3
0
    def test_parses_clubweb09(self):
        """Parse the ClueWeb09 English sample dump and check the first records.

        Verifies the parser position before and after each of the first two
        documents, their URIs, the start and end of their raw content, and
        that the dump holds 100 documents in total.
        """
        path = fixtures.dumppath('ClueWeb09_English_Sample')
        # Open via a context manager so the handle is closed even when an
        # assertion fails; the original left the file open.
        # NOTE(review): opened in the default (text) mode as before; the
        # asserted offsets presumably are byte positions -- confirm whether
        # 'rb' is more appropriate.
        with open(path) as dump_file:
            warc = WARCParser(dump_file)
            self.assertEqual(0, warc.tell())

            d1 = warc.next()
            self.assertEqual(21894, warc.tell())  # TODO: check 21894
            self.assertEqual('http://www.smartwebby.com/DreamweaverTemplates/templates/business_general_template59.asp', d1.uri)
            self.assertTrue(d1.raw.startswith('<!DOCTYPE HTML PUBLIC'))
            self.assertTrue(d1.raw.endswith('<!-- InstanceEnd --></html>'))

            d2 = warc.next()
            self.assertEqual(43359, warc.tell())  # TODO: check 43359
            self.assertEqual('http://www.smartwebby.com/DreamweaverTemplates/templates/business_telecom_template71.asp', d2.uri)
            self.assertTrue(d2.raw.startswith('<!DOCTYPE HTML PUBLIC'))
            self.assertTrue(d2.raw.endswith('<!-- InstanceEnd --></html>'))

            # Total of 100 docs, but we already iterated over 2.
            # Must run while the file is still open: the parser reads lazily.
            self.assertEqual(100, len(list(warc)) + 2)