def write_warc(self, resources=None, dumpfile=None):
    """Write a WARC dump file.

    WARC support is not part of ResourceSync v1.0 (Z39.99 2014) but is left
    in this library for experimentation.

    One WARC record is written for every entry in ``resources``.

    Arguments:
        resources: iterable of resource objects; ``uri``, ``lastmod`` and
            ``path`` are read from each one.
        dumpfile: path of the WARC file to create; its size is read back
            from disk once writing has finished.

    Raises:
        DumpError: if the ``warc`` library cannot be imported.
    """
    # Load library late as we want to be able to run rest of code
    # without this installed
    try:
        from warc import WARCFile, WARCHeader, WARCRecord
    except ImportError:
        # Only a failed import is translated into DumpError; anything else
        # (e.g. KeyboardInterrupt) propagates untouched.
        raise DumpError("Failed to load WARC library")
    wf = WARCFile(dumpfile, mode="w", compress=self.compress)
    try:
        # Add all files in the resources
        for resource in resources:
            wh = WARCHeader({})
            wh.url = resource.uri
            wh.ip_address = None
            wh.date = resource.lastmod
            wh.content_type = 'text/plain'
            wh.result_code = 200
            # NOTE(review): the checksum is a fixed literal, not derived
            # from the resource -- confirm this is intended.
            wh.checksum = 'aabbcc'
            wh.location = self.archive_path(resource.path)
            # NOTE(review): the payload handed over is resource.path itself.
            wf.write_record(WARCRecord(header=wh, payload=resource.path))
    finally:
        # Close even when a record fails to write so the handle is not leaked.
        wf.close()
    warcsize = os.path.getsize(dumpfile)
    self.logging.info("Wrote WARC file dump %s with size %d bytes",
                      dumpfile, warcsize)
def parse_warc_content(buf):
    """Yield ``(url, body)`` for the first WARC record held in ``buf``.

    Only the first record is read. Nothing is yielded unless the first 15
    characters of its payload start with ``HTTP/`` and end with ``200 OK``
    and the payload contains a blank line (``\\r\\n\\r\\n``). The yielded
    body is everything after that blank line.
    """
    archive = WARCFile(fileobj=StringIO(buf))
    first_record = archive.read_record()
    raw = first_record.payload.read()

    # 15 characters is exactly the length of "HTTP/1.1 200 OK".
    status_line = raw[:15]
    if not (status_line.startswith('HTTP/') and status_line.endswith('200 OK')):
        return

    header_end = raw.find('\r\n\r\n')
    if header_end == -1:
        return

    # Skip the four separator characters themselves.
    yield first_record.url, raw[header_end + 4:]
def read_warc(self, key):
    """Yield the 'response' records of a gzip-compressed WARC file on S3.

    ``key`` is appended to the ``s3://aws-publicdatasets/`` prefix and the
    resulting location is opened through ``self.s3``. The 'match' counter
    is incremented once for each record yielded.
    """
    s3_url = 's3://aws-publicdatasets/{key}'.format(key=key)
    with self.s3.open(s3_url, 'rb') as stream:
        archive = WARCFile(fileobj=stream, compress='gzip')
        for rec in archive.reader:
            # Anything that is not a response record is skipped silently.
            if rec.type != 'response':
                continue
            self.increment_counter(self.__class__.__name__, 'match', 1)
            yield rec
def read_warc(self, key):
    """Yield every record of a gzipped WARC file stored on S3.

    ``key`` is appended to the ``s3://commoncrawl/`` prefix and the
    resulting location is opened through ``self.s3``. The stream is
    decompressed with ``gzip.GzipFile`` before being handed to ``WARCFile``.
    The 'match' counter is incremented once for each record yielded.
    """
    s3_url = 's3://commoncrawl/{key}'.format(key=key)
    with self.s3.open(s3_url, 'rb') as raw_stream:
        archive = WARCFile(fileobj=gzip.GzipFile(fileobj=raw_stream))
        for rec in archive:
            # No record-type filter is applied: every record is counted
            # and passed on to the caller.
            self.increment_counter(self.__class__.__name__, 'match', 1)
            yield rec
def write_warc(self, resources=None, dumpfile=None):
    """Write a WARC dump file.

    WARC support is not part of ResourceSync v1.0 (Z39.99 2014) but is left
    in this library for experimentation.

    One WARC record is written for every entry in ``resources``.

    Arguments:
        resources: iterable of resource objects; ``uri``, ``lastmod`` and
            ``path`` are read from each one.
        dumpfile: path of the WARC file to create; its size is read back
            from disk once writing has finished.

    Raises:
        DumpError: if the ``warc`` library cannot be imported.
    """
    # Load library late as we want to be able to run rest of code
    # without this installed
    try:
        from warc import WARCFile, WARCHeader, WARCRecord
    except ImportError:
        # Only a failed import is translated into DumpError; anything else
        # (e.g. KeyboardInterrupt) propagates untouched.
        raise DumpError("Failed to load WARC library")
    wf = WARCFile(dumpfile, mode="w", compress=self.compress)
    try:
        # Add all files in the resources
        for resource in resources:
            wh = WARCHeader({})
            wh.url = resource.uri
            wh.ip_address = None
            wh.date = resource.lastmod
            wh.content_type = 'text/plain'
            wh.result_code = 200
            # NOTE(review): the checksum is a fixed literal, not derived
            # from the resource -- confirm this is intended.
            wh.checksum = 'aabbcc'
            wh.location = self.archive_path(resource.path)
            # NOTE(review): the payload handed over is resource.path itself.
            wf.write_record(WARCRecord(header=wh, payload=resource.path))
    finally:
        # Close even when a record fails to write so the handle is not leaked.
        wf.close()
    warcsize = os.path.getsize(dumpfile)
    self.logging.info("Wrote WARC file dump %s with size %d bytes",
                      dumpfile, warcsize)
def write_warc(self, inventory=None, dumpfile=None):
    """Write a WARC dump file.

    One WARC record is written for every entry in ``inventory``, then a
    one-line summary with the size of the resulting file is printed.

    Arguments:
        inventory: iterable of resource objects; ``uri``, ``lastmod`` and
            ``file`` are read from each one.
        dumpfile: path of the WARC file to create; its size is read back
            from disk once writing has finished.

    Raises:
        DumpError: if the ``warc`` library cannot be imported.
    """
    # Load library late as we want to be able to run rest of code
    # without this installed
    try:
        from warc import WARCFile, WARCHeader, WARCRecord
    except ImportError:
        # Only a failed import is translated into DumpError; anything else
        # (e.g. KeyboardInterrupt) propagates untouched.
        raise DumpError("Failed to load WARC library")
    wf = WARCFile(dumpfile, mode="w", compress=self.compress)
    try:
        # Add all files in the inventory
        for resource in inventory:
            wh = WARCHeader({})
            wh.url = resource.uri
            wh.ip_address = None
            wh.date = resource.lastmod
            wh.content_type = 'text/plain'
            wh.result_code = 200
            # NOTE(review): checksum and location are fixed literals, not
            # derived from the resource -- confirm this is intended.
            wh.checksum = 'aabbcc'
            wh.location = 'loc'
            wf.write_record(WARCRecord(header=wh, payload=resource.file))
    finally:
        # Close even when a record fails to write so the handle is not leaked.
        wf.close()
    warcsize = os.path.getsize(dumpfile)
    # Single parenthesised argument: prints the same text under both the
    # Python 2 print statement and the Python 3 print function.
    print("Wrote WARC file dump %s with size %d bytes" % (dumpfile, warcsize))