def test_writes_file1(self): out = StringIO.StringIO() writer = WARCWriter(out) writer.write(fixtures.example) writer.write(fixtures.apple) out.seek(0) parser = WARCParser(out) self.assertEquals(fixtures.example, parser.next()) self.assertEquals(fixtures.apple, parser.next())
def add_dump(self, dumpname, dump): """ Adds a dump to this docset and indexes its documents by position, uploading the dump to DDFS with the tag for this docset. """ # index positions startpos = 0 endpos = None with open(dump, 'rb') as f: dociter = WARCParser(f) for doc in dociter: endpos = dociter.tell() self.index[doc.uri] = (dumpname, startpos, endpos - startpos) startpos = endpos self.ddfs.push(self.ddfs_tag, [(dump, dumpname)]) self.dirty = True
def test_parses_clubweb09(self): warc = WARCParser(open(fixtures.dumppath('ClueWeb09_English_Sample'))) self.assertEquals(0, warc.tell()) d1 = warc.next() self.assertEquals(21894, warc.tell()) # TODO: check 21894 self.assertEquals('http://www.smartwebby.com/DreamweaverTemplates/templates/business_general_template59.asp', d1.uri) self.assertTrue(d1.raw.startswith('<!DOCTYPE HTML PUBLIC')) self.assertTrue(d1.raw.endswith('<!-- InstanceEnd --></html>')) d2 = warc.next() self.assertEquals(43359, warc.tell()) # TODO: check 43359 self.assertEquals('http://www.smartwebby.com/DreamweaverTemplates/templates/business_telecom_template71.asp', d2.uri) self.assertTrue(d2.raw.startswith('<!DOCTYPE HTML PUBLIC')) self.assertTrue(d2.raw.endswith('<!-- InstanceEnd --></html>')) # Total of 100 docs, but we already iterated over 2. self.assertEquals(100, len(list(warc)) + 2)