def main(argv):
    """Copy records from stdin or the named archives through ``process``.

    Every record is passed to ``process`` together with the record that came
    before it in the same archive (``None`` for the first one), the output
    file opened from ``options.output`` and the parsed options.

    Raises RuntimeError when --strip-404s is used without --decode_http.
    Returns 0.
    """
    options, input_files = parser.parse_args(args=argv[1:])
    if options.strip_404s and not options.decode_http:
        raise RuntimeError("--strip-404s requires --decode_http")

    def pump(archive, sink):
        # Feed one archive to process(), closing it whatever happens.
        try:
            earlier = None
            for current in archive:
                process(current, earlier, sink, options)
                earlier = current
        finally:
            archive.close()

    with open(options.output, "wb") as out:
        if input_files:
            for name in input_files:
                pump(WarcRecord.open_archive(name, gzip="auto", mode="rb"),
                     out)
        else:
            pump(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None,
                                         mode="rb"),
                 out)
    return 0
def process(self, infn, outfn, delete=False): """Process a WARC at a given infn, producing plain text via Tika where suitable, and writing a new WARC file to outfn.""" # These are objects of type RecordStream (or a subclass), unlike with # the IA library inwf = WarcRecord.open_archive(infn, mode='rb') outf = open(outfn, 'wb') self._openfiles.add(outfn) # try: # fcntl.lockf(inwf.file_handle, fcntl.LOCK_EX | fcntl.LOCK_NB) # fcntl.lockf(outf, fcntl.LOCK_EX | fcntl.LOCK_NB) # # Get locks on both files # except IOError: # print ("Unable to get file locks processing", infn, "so will " # "try later") # return False print "Processing", infn for record in inwf: try: if record.type == WarcRecord.WARCINFO: self.add_description_to_warcinfo(record) elif (record.type == WarcRecord.RESPONSE or record.type == WarcRecord.RESOURCE): if record.get_header('WARC-Segment-Number'): raise WarcTikaException("Segmented response/resource " "record. Not processing.") else: record = self.generate_new_record(record) # If 'metadata', 'request', 'revisit', 'continuation', # 'conversion' or something exotic, we can't do anything more # interesting than immediately re-writing it to the new file newrecord = WarcRecord(headers=record.headers, content=record.content) except Exception as e: print ("Warning: WARCTikaProcessor.process() failed on "+ record.url+": "+str(e.message)+ "\n\tWriting old record to new WARC.") traceback.print_exc() newrecord = record finally: newrecord.write_to(outf, gzip=outfn.endswith('.gz')) print "****Finished file. Tika status codes:", self.tikacodes.items() self.tikacodes = defaultdict(int) inwf.close() outf.close() self._openfiles.remove(outfn) # Check that the file has written correctly - for an excess of caution validrc = os.system("warcvalid "+outfn) if validrc: print "New file", outfn, "appears not to be valid. Deleting it." os.unlink(outfn) if delete and not validrc: print "Deleting", infn os.unlink(infn) return True
def dump_payload_from_file(filename, offset=None, length=None,
                           output_filename="/tmp/warc_dump"):
    """Open the archive at ``filename`` and dump a payload from it.

    ``filename``, ``offset`` and ``length`` are forwarded to
    ``WarcRecord.open_archive`` (with ``gzip="auto"``). ``output_filename``
    is kept for interface compatibility; this function does not use it.

    Returns whatever ``dump_payload_from_stream`` returns for the stream.
    """
    # The archive is opened exactly once and closed by ``closing``; the
    # earlier extra handle (never closed) and debug prints are gone.
    with closing(WarcRecord.open_archive(filename=filename, gzip="auto",
                                         offset=offset,
                                         length=length)) as fh:
        return dump_payload_from_stream(fh)
def write_warcinfo_record(self, warc):
    """Writes the initial warcinfo record.

    Arguments:
    warc -- writable file object of the WARC; it is flushed after the write.

    The record body lists software, hostname and ip, plus the description
    when one is set.
    """
    headers = [
        (WarcRecord.TYPE, WarcRecord.WARCINFO),
        # WARC-Date is a UTC timestamp, so start from utcnow() like the
        # other record builders do.
        # NOTE(review): assumes warc_datetime_str does no timezone
        # conversion of its own - confirm.
        (WarcRecord.DATE, warc_datetime_str(datetime.utcnow())),
        (WarcRecord.ID, "<urn:uuid:%s>" % uuid.uuid1()),
    ]
    fields = [
        "software=%s" % self.software,
        "hostname=%s" % self.hostname,
        "ip=%s" % self.ip,
    ]
    if self.description is not None:
        fields.append("description=%s" % self.description)
    data = "\n".join(fields)
    record = WarcRecord(headers=headers,
                        content=("application/warc-fields", data))
    record.write_to(warc, gzip=self.gzip)
    warc.flush()
def __init__(self, url_or_io, bytes_range=None):
    """Open a WARC archive from a URL string, an IterContentAsFile or a stream.

    ``bytes_range`` is only used for the URL case, where it is handed to
    ``response_as_file``.
    """
    if isinstance(url_or_io, str):
        source = response_as_file(url_or_io, bytes_range)
    elif isinstance(url_or_io, IterContentAsFile):
        source = url_or_io
    else:
        source = stream_as_file("upload.warc.gz", url_or_io)
    self.archive = WarcRecord.open_archive(file_handle=source)

    # Fresh bookkeeping state.
    self.path_types = {}
    self.files = {}
    self.errors = []
    self.offset = 0
    self.buffer = []
def main(argv):
    """Unpack the records of stdin or of each named archive.

    Files are unpacked into ``options.output`` (created when missing) or the
    current directory. An index log is written per input: to
    ``options.log_file`` when given, otherwise to stdout for stdin input or
    to ``<output_dir>/<basename>.index.txt`` for a named file. A
    StandardError while handling a named file is reported on stderr and the
    next file is tried.
    """
    (options, args) = parser.parse_args(args=argv[1:])

    if options.output:
        if not os.path.exists(options.output):
            os.makedirs(options.output)
        output_dir = options.output
    else:
        output_dir = os.getcwd()

    collisions = 0

    if len(args) < 1:
        log_file = (sys.stdout if not options.log_file
                    else open(options.log_file, 'wb'))
        try:
            log_headers(log_file)
            with closing(WarcRecord.open_archive(file_handle=sys.stdin,
                                                 gzip=None)) as fh:
                collisions += unpack_records('<stdin>', fh, output_dir,
                                             options.default_name, log_file,
                                             options.wayback)
        finally:
            # Only close a log we opened ourselves, never stdout.
            if log_file is not sys.stdout:
                log_file.close()
    else:
        for filename in args:
            if options.log_file:
                log_path = options.log_file
            else:
                log_path = os.path.join(
                    output_dir, os.path.basename(filename) + '.index.txt')
            log_file = open(log_path, 'wb')
            try:
                log_headers(log_file)
                try:
                    with closing(ArchiveRecord.open_archive(
                            filename=filename, gzip="auto")) as fh:
                        collisions += unpack_records(
                            filename, fh, output_dir, options.default_name,
                            log_file, options.wayback)
                except StandardError as e:
                    print >> sys.stderr, "exception in handling", filename, e
            finally:
                # Close the per-file log so handles do not pile up.
                log_file.close()
def main(argv): (options, input_files) = parser.parse_args(args=argv[1:]) out = sys.stdout if len(input_files) < 1: parser.error("no imput warc file(s)") total = 0 # print '#WARC filename offset warc-type warc-subject-uri warc-record-id content-type content-length' for name in expand_files(input_files): fh = WarcRecord.open_archive(name, gzip="auto") for (offset, record, errors) in fh.read_records(limit=None): if record: print name, offset, record.type, record.url, record.id, record.content_type, record.content_length total += record.content_length elif errors: pass # ignore else: pass # no errors at tail fh.close() print total return 0
def build_from_warcs(self, warcs):
    """Collect outlinks and crawled URIs from the given WARC files.

    For metadata records every ``outlink: `` line maps its target to the
    record URL in ``self.inverted_index``. For response records with HTTP
    status 200 and an ``http`` URL, a ``(url, content-type, date,
    content_length)`` tuple is appended to ``self.crawled_uris``.
    """
    for warc in warcs:
        fh = WarcRecord.open_archive(warc, gzip="auto")
        try:
            for (offset, record, errors) in fh.read_records(limit=None):
                if not record:
                    # Parse errors and the end-of-file entry are ignored.
                    continue
                if record.type == WarcRecord.METADATA:
                    for line in StringIO(record.content[1]):
                        if not line.startswith("outlink: "):
                            continue
                        fields = line.strip().split()
                        # An "outlink: " line without a target has nothing
                        # to index; skip it rather than raise IndexError.
                        if len(fields) > 1:
                            self.inverted_index[fields[1]] = record.url
                if record.type == WarcRecord.RESPONSE:
                    f = FileHTTPResponse(record.content_file)
                    f.begin()
                    if f.status == 200 and record.url.startswith("http"):
                        self.crawled_uris.append(
                            (record.url, f.getheader("content-type"),
                             record.date, record.content_length))
        finally:
            fh.close()
def __init__(self, url_or_io, bytes_range=None):
    """Set up the reader over a URL string, an IterContentAsFile or a stream.

    A string is fetched through ``response_as_file`` (honouring
    ``bytes_range``); an ``IterContentAsFile`` is used directly; anything
    else is wrapped with ``stream_as_file`` under the name
    "upload.warc.gz".
    """
    if isinstance(url_or_io, str):
        handle = response_as_file(url_or_io, bytes_range)
    else:
        if isinstance(url_or_io, IterContentAsFile):
            handle = url_or_io
        else:
            handle = stream_as_file("upload.warc.gz", url_or_io)
    self.archive = WarcRecord.open_archive(file_handle=handle)

    self.path_types, self.files = {}, {}
    self.errors, self.buffer = [], []
    self.offset = 0
def main(argv):
    """Run ``process`` over every record of stdin or the named archives.

    Output goes to the file named by ``options.output``. When
    ``options.json_hrefs_file`` is set, a set of hrefs is passed to
    ``process`` and afterwards written, sorted and newline-separated, to
    that file through bz2.

    Raises RuntimeError when --strip-404s or --json-hrefs-file is used
    without --decode_http. Returns 0.
    """
    options, input_files = parser.parse_args(args=argv[1:])

    if options.strip_404s and not options.decode_http:
        raise RuntimeError("--strip-404s requires --decode_http")
    if options.json_hrefs_file and not options.decode_http:
        raise RuntimeError("--json-hrefs-file requires --decode_http")

    found_hrefs = set() if options.json_hrefs_file else None

    def drain(stream, sink):
        # One archive: each record sees its predecessor; always close.
        try:
            before = None
            for rec in stream:
                process(rec, before, sink, options, found_hrefs)
                before = rec
        finally:
            stream.close()

    with open(options.output, "wb") as out:
        if not input_files:
            drain(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None,
                                          mode="rb"), out)
        else:
            for name in input_files:
                drain(WarcRecord.open_archive(name, gzip="auto", mode="rb"),
                      out)

    if found_hrefs is not None:
        href_file = bz2.BZ2File(options.json_hrefs_file, "wb")
        try:
            href_file.write("\n".join(sorted(found_hrefs)) + "\n")
        finally:
            href_file.close()
    return 0
def run(self): path = self.path idx_file = "%s.idx" % path records = None if os.path.exists(idx_file) and os.path.getmtime( idx_file) >= os.path.getmtime(path): print "Loading " + path + " from cache" self.status = "loading-cache" with open(idx_file, "rb") as f: def update_progress(): self.bytes_read = f.tell() f_pr = IOWithProgress(f, update_progress) data = cPickle.load(f_pr) self.bytes_read = self.bytes_total if "version" in data and data["version"] == 1: records = data["records"] if not records: self.status = "indexing" self.bytes_total = os.path.getsize(self.path) print "Loading " + path records = OrderedDict() warc = WarcRecord.open_archive(path, gzip="auto") for (offset, record, errors) in warc.read_records(limit=None): if self.cancel: raise Exception("Loading " + path + " canceled") if record and re.sub( r"[^a-z;=/]+", "", record.type) == WarcRecord.RESPONSE and re.sub( r"[^a-z;=/]+", "", record.content[0]) == ResponseMessage.CONTENT_TYPE: http_response = parse_http_response(record) records[canonicalize_url(record.url)] = { "offset": offset, "code": http_response[0], "type": http_response[1] } self.bytes_read = offset warc.close() with open(idx_file, "wb") as f: cPickle.dump({"version": 1, "records": records}, f) if self.cancel: raise Exception("Loading " + path + " canceled") print "Indexed " + path + ". Found " + str(len(records)) + " URLs" self.status = "indexed" self.records = records
def main(argv):
    """Pass every record of stdin or of the named archives to ``process``.

    ``process`` receives the record, ``sys.stdout`` and the parsed options.
    Returns 0.
    """
    (options, input_files) = parser.parse_args(args=argv[1:])
    out = sys.stdout

    if len(input_files) < 1:
        streams = [WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)]
    else:
        # Opened lazily, one at a time, by the generator below.
        streams = (WarcRecord.open_archive(name, gzip="auto")
                   for name in input_files)

    for fh in streams:
        try:
            for record in fh:
                process(record, out, options)
        finally:
            # Close the stream even when processing a record fails.
            fh.close()

    return 0
def create_metadata_record_bytes(
        url='http://example.com/',
        content_type='image/png',
        date='2016-08-03T10:49:41Z',
        content=b'',
        include_block_digest=True):
    """Build WARC metadata record bits.

    Returns the gzip-compressed bytes of one hand-assembled WARC/1.0
    metadata record. ``url``, ``content_type`` and ``date`` are text and are
    UTF-8 encoded into the headers; ``content`` is the raw record block.
    With ``include_block_digest`` a base32 SHA-1 of the content is added as
    the block digest header.
    """
    headers = {
        WarcRecord.TYPE: WarcRecord.METADATA,
        WarcRecord.URL: url.encode('utf-8'),
        WarcRecord.CONTENT_TYPE: content_type.encode('utf-8'),
        WarcRecord.DATE: date.encode('utf-8')
    }
    if include_block_digest:
        hasher = hashlib.sha1(content)
        block_digest = base64.b32encode(hasher.digest())
        headers[WarcRecord.BLOCK_DIGEST] = b'sha1:' + block_digest

    # XXX - I wish I could use WarcRecord. Current implementation of
    # WarcRecord.write_to() ignores Warc-Block-Digest passed and writes out
    # hex-encoded SHA256 calculated from the content. The record is
    # therefore serialised by hand (the unreachable WarcRecord-based branch
    # was dropped).
    out = io.BytesIO()
    z = GzipFile(fileobj=out, mode='wb')
    z.write(b'WARC/1.0\r\n')
    for k, v in headers.items():
        z.write(b''.join((k, b': ', v, b'\r\n')))
    z.write('Content-Length: {}\r\n'.format(len(content)).encode('ascii'))
    z.write(b'\r\n')
    z.write(content)
    z.write(b'\r\n\r\n')
    # Kept from the original sequence so the emitted stream is unchanged.
    z.flush()
    z.close()
    return out.getvalue()
def _load_warc_info(self):
    """Return the warcinfo record found at the start of the WARC file.

    Rewinds ``self._warc_file_read`` and reads its first entry.

    Raises ValueError when the first entry is missing or is not a warcinfo
    record.
    """
    self._warc_file_read.seek(0)
    wrs = WarcRecord.open_archive(file_handle=self._warc_file_read,
                                  gzip="record")
    # read_records() yields (offset, record, errors) tuples lazily, so the
    # result can be neither truth-tested nor indexed like a list of records.
    for (_offset, record, _errors) in wrs.read_records(limit=1):
        if record and record.type == WarcRecord.WARCINFO:
            return record
        break
    raise ValueError("WARC info not found")
def write_record(self, headers, mime, data):
    """Writes a WARC record.

    Arguments:
    headers -- Array of WARC headers.
    mime -- MIME type of the data.
    data -- the data block.

    A WARC name is taken from the pool, the record is written to that file
    and flushed, and the name goes back to the pool unless the file has
    reached its maximum size.
    """
    record = WarcRecord(headers=headers, content=(mime, data))
    logger.debug("Getting WARC: %s", str(self.warcs.keys()))
    name = self.pool.get()
    logger.debug("Writing to: %s", name)

    target = self.warcs[name]
    record.write_to(target, gzip=self.gzip)
    target.flush()

    if self.warc_reached_max_size(name):
        return
    logger.debug("%s undersized; adding back to the pool.", name)
    self.pool.put(name)
def readRecord(filename, offset):
    """
    Read the single record stored at ``offset`` in ``filename``.

    Raises StopIteration when nothing can be read at that position.

    :type filename: str
    :type offset: int
    :rtype : WarcRecord
    """
    w = WarcRecord.open_archive(filename, offset=offset)
    try:
        # Entries are (offset, record, errors); the builtin next() works on
        # Python 2.6+ and Python 3, unlike the .next() method.
        return next(w.read_records(limit=1))[1]
    finally:
        # Close the archive even when reading fails.
        w.close()
def loadWarcFileRecords(name): """ Generator function for records from the file 'name' """ f = WarcRecord.open_archive(name, gzip="auto") for (offset, r, err) in f.read_records(limit=None): if err: print "warc errors at %s:%d" % (name, offset or 0) for e in err: print '\t', e if r: yield (r, offset) f.close()
def find_record(self, url):
    """Return the first HTTP response record stored for ``url``, or None.

    The WARC file is rewound and scanned from the start on every call.
    """
    self._warc_file_read.seek(0)
    stream = WarcRecord.open_archive(file_handle=self._warc_file_read,
                                     gzip="record")
    for (_offset, candidate, _errors) in stream.read_records(limit=None):
        if not candidate:
            continue
        is_http_response = (
            candidate.type == WarcRecord.RESPONSE
            and candidate.content[0] == ResponseMessage.CONTENT_TYPE)
        if is_http_response and candidate.url == url:
            return candidate
    return None
def _init_file(self):
    """Write the leading warcinfo record of the file via ``write_record``.

    The headers carry a random record id, the current UTC date, the base
    name of ``self._file_name`` and the main URL.
    """
    headers = []
    headers.append((WarcRecord.TYPE, WarcRecord.WARCINFO))
    headers.append((WarcRecord.ID, WarcRecord.random_warc_uuid()))
    headers.append((WarcRecord.DATE,
                    warc.warc_datetime_str(datetime.utcnow())))
    headers.append((WarcRecord.FILENAME,
                    os.path.basename(self._file_name)))
    headers.append((Warc.MAIN_URL, self._main_url))

    field_lines = (
        "software: bardo",
        "format: WARC File Format 1.0",
        "conformsTo: " + CONFORMS_TO,
        "robots: unknown",
    )
    body = ("application/warc-fields", "\r\n".join(field_lines))

    self.write_record(WarcRecord(headers=headers, content=body))
def warc_record_for_uri(self, uri):
    """Yield what is stored for ``uri`` in each indexed archive.

    For every archive whose index contains ``uri`` the archive is opened,
    positioned at the indexed offset and one ``read_records`` entry is
    yielded. When no archive produced anything, a single None is yielded.
    """
    found = False
    for (path, uris) in self.indices.iteritems():
        if uri in uris:
            offset = uris[uri]["offset"]
            warc = WarcRecord.open_archive(path, gzip="auto")
            try:
                warc.seek(offset)
                for record in warc.read_records(limit=1, offsets=offset):
                    found = True
                    yield record
            finally:
                # Also runs when the consumer stops after the first item,
                # so the archive handle is not leaked.
                warc.close()
    if not found:
        yield None
def main(argv):
    """Pass every record of stdin or of the named archives to ``process``.

    ``process`` receives the record, the byte-oriented stdout and the
    parsed options. Returns 0.
    """
    (options, input_files) = parser.parse_args(args=argv[1:])

    # Python 3 exposes the underlying byte streams as ``.buffer``; on
    # Python 2 the streams themselves already carry bytes.
    out = getattr(sys.stdout, "buffer", sys.stdout)

    if len(input_files) < 1:
        # Archive data is binary, so stdin needs the same treatment as
        # stdout.
        stdin_bytes = getattr(sys.stdin, "buffer", sys.stdin)
        fh = WarcRecord.open_archive(file_handle=stdin_bytes, gzip=None)
        try:
            for record in fh:
                process(record, out, options)
        finally:
            fh.close()
    else:
        for name in expand_files(input_files):
            fh = WarcRecord.open_archive(name, gzip="auto")
            try:
                for record in fh:
                    process(record, out, options)
            finally:
                # Close the archive even when processing a record fails.
                fh.close()

    return 0
def main(argv):
    """Dump stdin, or each named archive, through ``dump_archive``.

    Stdin is dumped under the name "-" with offsets disabled. Returns 0.
    """
    (options, input_files) = parser.parse_args(args=argv[1:])

    if len(input_files) < 1:
        fh = WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)
        try:
            dump_archive(fh, name="-", offsets=False)
        finally:
            fh.close()
    else:
        for name in input_files:
            fh = ArchiveRecord.open_archive(name, gzip="auto")
            try:
                dump_archive(fh, name)
            finally:
                # Close the archive even when dumping fails.
                fh.close()

    return 0
def doc_from_warc(infn, gzip='auto'):
    """Generator to process a WARC at a given infn.

    Yields ``(url, mimetype, body, httpcode, charset)`` for every HTTP
    response record and every resource/conversion record. Metadata,
    warcinfo and request records are skipped silently; other record types
    are reported on stderr and skipped. An exception while handling one
    record is logged to stderr and the next record is tried.
    """
    # These are objects of type RecordStream (or a subclass), unlike with
    # the IA library
    inwf = WarcRecord.open_archive(infn, mode='rb', gzip=gzip)
    sys.stderr.write("Processing "+str(infn)+"\n")
    try:
        for record in inwf:
            # print "\nStarting record: "+str(record.url)
            try:
                if record.get_header('WARC-Segment-Number'):
                    raise Exception("Segmented response/resource record "
                                    "for "+record.url+". Not processing.")
                # We can process resource records (and conversion records,
                # which we assume are all of resource type (contain a
                # document rather than an HTTP transaction with nested
                # document). This may be unsafe, but conversion records are
                # almost unknown in the wild. The only ones we'll be
                # handling here are those output from WarcTika, which are
                # in that format.
                # TODO: generalise this.
                # We also handle HTTP response records.
                if (record.type == WarcRecord.RESPONSE
                        and record.url.startswith('http')):
                    httpcode, mimetype, charset, body = \
                        parse_http_response_charset(record)
                elif (record.type == WarcRecord.RESOURCE
                      or record.type == WarcRecord.CONVERSION):
                    mimetype, body = record.content
                    httpcode = 200  # "Success" for stored content
                    charset = None  # Not recorded
                # If 'metadata', 'request', 'revisit', 'continuation',
                # or something exotic, we can't do anything interesting
                elif (record.type == WarcRecord.METADATA
                      or record.type == WarcRecord.WARCINFO
                      or record.type == WarcRecord.REQUEST):
                    continue
                else:
                    sys.stderr.write("Can't handle "+str(record.type)+", "
                                     + str(record.url)+"\n")
                    # Nothing was extracted for this record: skip it rather
                    # than yield the previous record's values (or hit an
                    # unbound name on the first record).
                    continue
                yield (record.url, mimetype, body, httpcode, charset)
            except Exception:
                # General catch to avoid multiprocessing taking down the
                # whole job for one bogus record. %-formatting keeps the
                # handler itself safe when the record has no URL.
                sys.stderr.write("\n\n***** Uncaught exception reading %s"
                                 " from file %s:\n" % (record.url, infn))
                traceback.print_exc()
                sys.stderr.write("Continuing.\n\n\n")
    finally:
        # Also runs when the consumer abandons the generator early.
        inwf.close()
def run(self): path = self.path idx_file = "%s.idx" % path records = None if os.path.exists(idx_file) and os.path.getmtime(idx_file) >= os.path.getmtime(path): print "Loading " + path + " from cache" self.status = "loading-cache" with open(idx_file, "rb") as f: def update_progress(): self.bytes_read = f.tell() f_pr = IOWithProgress(f, update_progress) data = cPickle.load(f_pr) self.bytes_read = self.bytes_total if "version" in data and data["version"] == 1: records = data["records"] if not records: self.status = "indexing" self.bytes_total = os.path.getsize(self.path) print "Loading " + path records = OrderedDict() warc = WarcRecord.open_archive(path, gzip="auto") for (offset, record, errors) in warc.read_records(limit=None): if self.cancel: raise Exception("Loading " + path + " canceled") if record and re.sub(r"[^a-z;=/]+", "", record.type) == WarcRecord.RESPONSE and re.sub(r"[^a-z;=/]+", "", record.content[0]) == ResponseMessage.CONTENT_TYPE: http_response = parse_http_response(record) records[canonicalize_url(record.url)] = { "offset":offset, "code":http_response[0], "type":http_response[1] } self.bytes_read = offset warc.close() with open(idx_file, "wb") as f: cPickle.dump({ "version": 1, "records": records }, f) if self.cancel: raise Exception("Loading " + path + " canceled") print "Indexed "+path+". Found "+str(len(records))+" URLs" self.status = "indexed" self.records = records
def main(argv):
    """Run ``dump_archive`` over stdin or over each named archive.

    Without file arguments stdin is dumped as "-" with ``offsets=False``.
    Returns 0.
    """
    (options, input_files) = parser.parse_args(args=argv[1:])

    if input_files:
        for name in input_files:
            fh = ArchiveRecord.open_archive(name, gzip="auto")
            try:
                dump_archive(fh, name)
            finally:
                # Release the archive even if dumping raises.
                fh.close()
        return 0

    stdin_archive = WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)
    try:
        dump_archive(stdin_archive, name="-", offsets=False)
    finally:
        stdin_archive.close()
    return 0
def warcinfo_record(warc_filename):
    """Return warcinfo WarcRecord.
    Required to write in the beginning of a WARC file.
    """
    warc_date = warc_datetime_str(datetime.utcnow())
    field_lines = (
        "format: WARC File Format 1.0",
        "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf"
    )
    metadata = "\r\n".join(field_lines)

    headers = [
        (WarcRecord.TYPE, WarcRecord.WARCINFO),
        (WarcRecord.CONTENT_TYPE, b'application/warc-fields'),
        # The record id is derived from the body text plus the date.
        (WarcRecord.ID, warc_uuid(metadata + warc_date)),
        (WarcRecord.DATE, warc_date),
        (WarcRecord.FILENAME, warc_filename),
    ]
    body = (b'application/warc-fields', metadata + "\r\n")
    return WarcRecord(headers=headers, content=body, version=b"WARC/1.0")
def process_file(self, filename):
    """Dispatch every record of ``filename`` to its type-specific handler.

    Response, request and resource records go to ``_process_response``,
    ``_process_request`` and ``_process_resource``; other record types are
    ignored.

    Raises WarcException when an entry could not be decoded.
    """
    f = WarcRecord.open_archive(filename, gzip="auto")
    try:
        for (offset, record, errors) in f.read_records(limit=None):
            if record:
                if record.type == "response":
                    self._process_response(record)
                elif record.type == "request":
                    self._process_request(record)
                elif record.type == "resource":
                    self._process_resource(record)
            elif errors:
                # Call form of raise (valid on Python 2 and 3); the
                # one-tuple keeps the formatting safe for any error value.
                raise WarcException("Cannot decode WARC: %s" % (errors,))
        # NOTE(review): reset once the whole file was read - confirm this
        # is not meant to happen per record.
        self.current_request = None
    finally:
        # Close the archive on the error path as well.
        f.close()
def main(argv):
    """Dump a record from stdin, or from a named archive into a zip file.

    Without arguments the record is read from stdin and passed to
    ``dump_record``. Otherwise the first argument is the archive to read
    and the second the zip file to create; ``dump_record`` then receives
    the archive and the open ZipFile. Returns 0.
    """
    (options, args) = parser.parse_args(args=argv[1:])

    if len(args) < 1:
        # dump the first record on stdin
        with closing(WarcRecord.open_archive(file_handle=sys.stdin,
                                             gzip=None)) as fh:
            dump_record(fh)
    else:
        if len(args) < 2:
            # Report a usage error instead of failing with IndexError.
            parser.error("expected a warc filename followed by a zip filename")
        filename = args[0]
        zipfilename = args[1]
        with ZipFile(zipfilename, "w") as outzip:
            with closing(ArchiveRecord.open_archive(filename=filename,
                                                    gzip="auto")) as fh:
                dump_record(fh, outzip)

    return 0
def main(argv):
    """Entry point: dump from stdin, or from an archive into a new zip.

    With no positional arguments ``dump_record`` is called on the archive
    read from stdin. With arguments, the first names the archive and the
    second the zip file that is created and handed to ``dump_record``.
    Returns 0.
    """
    (options, args) = parser.parse_args(args=argv[1:])

    if not args:
        # dump the first record on stdin
        with closing(WarcRecord.open_archive(file_handle=sys.stdin,
                                             gzip=None)) as fh:
            dump_record(fh)
        return 0

    if len(args) < 2:
        # A missing zip name is a usage error, not an IndexError.
        parser.error("expected a warc filename followed by a zip filename")

    filename, zipfilename = args[0], args[1]
    with ZipFile(zipfilename, "w") as outzip:
        with closing(
                ArchiveRecord.open_archive(filename=filename,
                                           gzip="auto")) as fh:
            dump_record(fh, outzip)
    return 0
def build_from_warcs(self, warcs):
    """Fill ``self.inverted_index`` and ``self.crawled_uris`` from WARCs.

    Metadata records contribute ``outlink target -> record URL`` entries;
    response records with HTTP status 200 and an ``http`` URL contribute
    ``(url, content-type, date, content_length)`` tuples.
    """
    prefix = "outlink: "
    for warc in warcs:
        fh = WarcRecord.open_archive(warc, gzip="auto")
        try:
            for (offset, record, errors) in fh.read_records(limit=None):
                # Entries without a record (errors, end of file) are skipped.
                if record:
                    if record.type == WarcRecord.METADATA:
                        for line in StringIO(record.content[1]):
                            if line.startswith(prefix):
                                pieces = line.strip().split()
                                # Guard against a bare "outlink: " line,
                                # which has no target to index.
                                if len(pieces) < 2:
                                    continue
                                self.inverted_index[pieces[1]] = record.url
                    if record.type == WarcRecord.RESPONSE:
                        f = FileHTTPResponse(record.content_file)
                        f.begin()
                        if f.status == 200 and record.url.startswith("http"):
                            self.crawled_uris.append((record.url, f.getheader("content-type"), record.date, record.content_length))
        finally:
            fh.close()
def read_record(path, num_pages=10):
    """Extract text and URL from the first HTTP responses of a WARC.

    At most ``num_pages`` response records are used. Returns
    ``(documents, urls)``: two parallel lists holding the text produced by
    ``extract_text`` and the record's WARC-Target-URI ("" when absent).
    """
    warcr = WarcRecord.open_archive(path, gzip='auto')
    documents = []
    urls = []
    try:
        for record in warcr:
            if len(documents) >= num_pages:
                break
            if record.type == b'response' and record.content[
                    0] == b'application/http; msgtype=response':
                url = ""
                for (h, v) in record.headers:
                    if h == b'WARC-Target-URI':
                        url = str(v, errors="ignore")
                urls.append(url)
                documents.append(
                    extract_text(str(record.content[1], errors="ignore")))
    finally:
        # The archive used to stay open after the scan; release it here.
        warcr.close()
    return documents, urls
def main(argv):
    """Dump one record from stdin or from a named archive.

    With a filename, an optional second argument gives the offset to seek
    to before dumping (0 when omitted). Returns 0.
    """
    (options, args) = parser.parse_args(args=argv[1:])
    out = sys.stdout

    if not args:
        # dump the first record on stdin
        with closing(WarcRecord.open_archive(file_handle=sys.stdin,
                                             gzip=None)) as stdin_archive:
            dump_record(stdin_archive)
        return 0

    # dump a record from the filename, with optional offset
    filename = args[0]
    offset = int(args[1]) if len(args) > 1 else 0
    with closing(ArchiveRecord.open_archive(filename=filename,
                                            gzip="auto")) as archive:
        archive.seek(offset)
        dump_record(archive)
    return 0
def tweet_warc_record(tweet_json):
    """Parse Tweet JSON and return WarcRecord.

    Returns None for deleted tweets (no 'user' key) and, after logging the
    error, when the JSON cannot be parsed or the status URL cannot be built.
    """
    try:
        tweet = json.loads(tweet_json)
        # skip deleted tweet
        deleted = 'user' not in tweet
        url = None
        if not deleted:
            url = "https://twitter.com/%s/status/%s" % (
                tweet['user']['screen_name'], tweet['id'])
    except Exception:
        logging.error('error in tweet_warc_record', exc_info=1)
        return None
    if deleted:
        return None

    millis = float(tweet['timestamp_ms'])
    warc_date = warc_datetime_str(datetime.utcfromtimestamp(millis / 1000.0))
    headers = [
        (WarcRecord.TYPE, WarcRecord.RESOURCE),
        (WarcRecord.CONTENT_TYPE, b'application/json'),
        (WarcRecord.ID, warc_uuid(url + warc_date)),
        (WarcRecord.URL, url),
        (WarcRecord.DATE, warc_date),
    ]
    body = (b'application/json', tweet_json + "\r\n")
    return WarcRecord(headers=headers, content=body, version=b"WARC/1.0")
r.seed(1818118181) # Arbitrary content = [] rejects = defaultdict(int) #Load all the objects into memory first try: with open(picklefn, "rb") as fh: print "Unpickling selected sample." content = pickle.load(fh) except IOError: print "Pickled file does not appear to exist. Loading content." for fn in os.listdir(dirname): if not fn.endswith('.warc.gz'): continue wf = WarcRecord.open_archive(dirname+'/'+fn, mode='rb') try: print fn for record in wf: if not record.type in [WarcRecord.RESPONSE, WarcRecord.RESOURCE, WarcRecord.CONVERSION]: continue if (record.type == WarcRecord.RESPONSE and record.url.startswith('http')): ccode, cmime, cbody = parse_http_response(record) if ccode not in successcodes: continue else: ccode = None cmime = record.content[0]
r.seed(1818118181) # Arbitrary content = [] rejects = defaultdict(int) #Load all the objects into memory first try: with open(picklefn, "rb") as fh: print "Unpickling selected sample." content = pickle.load(fh) except IOError: print "Pickled file does not appear to exist. Loading content." for fn in os.listdir(dirname): if not fn.endswith('.warc.gz'): continue wf = WarcRecord.open_archive(dirname + '/' + fn, mode='rb') try: print fn for record in wf: if not record.type in [ WarcRecord.RESPONSE, WarcRecord.RESOURCE, WarcRecord.CONVERSION ]: continue if (record.type == WarcRecord.RESPONSE and record.url.startswith('http')): ccode, cmime, cbody = parse_http_response(record) if ccode not in successcodes: continue else: ccode = None
webbase_header = "==P=>>>>=i===<<<<=T===>=A===<=!Junghoo!==>" content = "" headers = [("WARC-Filename", filename), ("WARC-Type", "response")] finished_headers = False first_line = fh.readline() assert first_line.startswith(webbase_header) for line in fh: if line.startswith(webbase_header): yield headers, ("text/html", content) content = "" else: if finished_headers: content += line elif "" == line.strip(): finished_headers = True else: add_header(headers, line) i = 0 warc_out = open("out.warc.gz", "w") for headers, content in get_wb_record("2pages"): print i i += 1 # print headers # print content record = WarcRecord(headers=headers, content=content) record.write_to(warc_out, gzip=True) record.dump() print "_" * 80
def _reply_finished(self):
    """Handle the end of the network reply and archive it as a WARC record.

    The reply's signals are disconnected first. Without a valid HTTP status
    code the buffered data is dropped and ``finished`` is emitted from a
    zero-delay timer. Otherwise a status line, the raw headers and the
    buffered body are joined into an HTTP response message, written to the
    manager's current WARC, and this object is initialised from the new
    record.
    """
    reply = self._network_reply
    reply.readyRead.disconnect(self._reply_ready_read)
    reply.finished.disconnect(self._reply_finished)
    reply.error.disconnect(self._reply_error)

    status_code = reply.attribute(QNetworkRequest.HttpStatusCodeAttribute)
    if not status_code.isValid():
        self._temp_data.close()
        self._temp_data = None
        self._network_reply = None
        QTimer.singleShot(0, lambda: self.finished.emit())
        return

    # Whitespace characters inside header values are replaced by blanks.
    headers = dict()
    for raw_name in reply.rawHeaderList():
        raw_value = str(reply.rawHeader(raw_name))
        headers[str(raw_name)] = re.sub(r"\s", " ", raw_value)
    elements = [name + ": " + value
                for name, value in headers.iteritems()]
    elements.append("")

    url = qstring_to_str(reply.url().toString())
    status_msg = reply.attribute(QNetworkRequest.HttpReasonPhraseAttribute)
    assert (status_msg.isValid())
    self._temp_data.seek(0)

    # XXX: we can't get HTTP version from Qt webkit, assumes 1.1
    h_status = ("HTTP/1.1 " + str(status_code.toString()) + " "
                + str(status_msg.toString()))
    content_data = (h_status + "\r\n"
                    + "\r\n".join(elements) + "\r\n"
                    + self._temp_data.read())
    content = (ResponseMessage.CONTENT_TYPE, content_data)

    wr = warc.make_response(WarcRecord.random_warc_uuid(),
                            warc.warc_datetime_str(datetime.utcnow()),
                            url, content, None)
    self._temp_data.close()
    self._temp_data = None
    self.manager().current_warc.write_record(wr)
    self._init_from_warc_record(wr)
    self._network_reply = None
def main(argv):
    """Convert the given ARC files into WARC records.

    Records are written to ``options.output`` (opened for writing; gzip is
    forced on when the name ends in '.gz') or to stdout. An ARC 'filedesc'
    record becomes a warcinfo record followed by a metadata record carrying
    the raw ARC record; every other record becomes a response record.

    Exits through ``parser.error`` when no input file is given. Returns 0.
    """
    (options, input_files) = parser.parse_args(args=argv[1:])

    # Validate first, so a usage error cannot truncate an existing output.
    if len(input_files) < 1:
        parser.error("no input warc file(s)")

    out = sys.stdout
    if options.output:
        out = open(options.output, 'wb')
        if options.output.endswith('.gz'):
            options.gzip = True

    try:
        for name in input_files:
            fh = ArcRecord.open_archive(name, gzip="auto")
            try:
                warcinfo_id = None
                for record in fh:
                    version = "WARC/1.0"

                    warc_id = make_warc_uuid(record.url+record.date)
                    headers = [
                        (WarcRecord.ID, warc_id),
                    ]
                    if record.date:
                        date = datetime.datetime.strptime(record.date,
                                                          '%Y%m%d%H%M%S')
                        headers.append((WarcRecord.DATE,
                                        warc_datetime_str(date)))

                    if record.type == 'filedesc':
                        # Later records point back at this id.
                        warcinfo_id = warc_id

                        warcinfo_headers = list(headers)
                        warcinfo_headers.append((WarcRecord.FILENAME,
                                                 record.url[11:]))
                        warcinfo_headers.append((WarcRecord.TYPE,
                                                 WarcRecord.WARCINFO))

                        warcinfo_content = ('application/warc-fields',
                                            'software: hanzo.arc2warc\r\n')

                        warcrecord = WarcRecord(headers=warcinfo_headers,
                                                content=warcinfo_content,
                                                version=version)
                        warcrecord.write_to(out, gzip=options.gzip)

                        warc_id = make_warc_uuid(
                            record.url+record.date+"-meta")
                        warcmeta_headers = [
                            (WarcRecord.TYPE, WarcRecord.METADATA),
                            (WarcRecord.CONCURRENT_TO, warcinfo_id),
                            (WarcRecord.ID, warc_id),
                            (WarcRecord.URL, record.url),
                            (WarcRecord.DATE, warcrecord.date),
                            (WarcRecord.WARCINFO_ID, warcinfo_id),
                        ]
                        warcmeta_content = ('application/arc', record.raw())

                        warcrecord = WarcRecord(headers=warcmeta_headers,
                                                content=warcmeta_content,
                                                version=version)
                        warcrecord.write_to(out, gzip=options.gzip)
                    else:
                        content_type, content = record.content
                        if record.url.startswith('http'):
                            # don't promote content-types for http urls,
                            # they contain headers in the body.
                            content_type = "application/http;msgtype=response"

                        headers.extend([
                            (WarcRecord.TYPE, WarcRecord.RESPONSE),
                            (WarcRecord.URL, record.url),
                            (WarcRecord.WARCINFO_ID, warcinfo_id),
                        ])

                        warcrecord = WarcRecord(
                            headers=headers,
                            content=(content_type, content),
                            version=version)
                        warcrecord.write_to(out, gzip=options.gzip)
            finally:
                # Close the input even when a record fails to convert.
                fh.close()
    finally:
        # Flush and close an output file we opened; leave stdout alone.
        if out is not sys.stdout:
            out.close()

    return 0
uuidsexcluded = set() exclist = parse_exc_args(args.pattern) # In theory this could be agnostic as to whether the stream is compressed or # not. In practice, the gzip guessing code reads the stream for marker bytes # and then attempts to rewind, which fails for stdin unless an elaborate # stream wrapping class is set up. gzi = 'auto' if args.gzipped_input: gzi = 'record' elif args.plain_input: gzi = False if args.in_filename is None: inwf = WarcRecord.open_archive(file_handle=sys.stdin, mode='rb', gzip=gzi) else: inwf = WarcRecord.open_archive(filename=args.in_filename, mode='rb', gzip=gzi) ##### #MAIN ##### outf = sys.stdout if args.out_filename is not None: outf = open(args.out_filename, 'wb') for record in inwf: # How many matches constitutes failure? write = len(exclist)
def _reply_finished(self):
    """Finish the network reply: store it as a WARC response record.

    After disconnecting the reply's signals, a reply without a valid HTTP
    status code is discarded (``finished`` is emitted via a zero-delay
    timer). For a valid reply the status line, headers and buffered body
    are assembled, written through the manager's current WARC, and used to
    initialise this object.
    """
    self._network_reply.readyRead.disconnect(self._reply_ready_read)
    self._network_reply.finished.disconnect(self._reply_finished)
    self._network_reply.error.disconnect(self._reply_error)

    status_attr = QNetworkRequest.HttpStatusCodeAttribute
    status_code = self._network_reply.attribute(status_attr)
    if not status_code.isValid():
        self._temp_data.close()
        self._temp_data = None
        self._network_reply = None
        QTimer.singleShot(0, lambda: self.finished.emit())
        return

    headers = dict()
    for header in self._network_reply.rawHeaderList():
        # Whitespace characters in the value are replaced by blanks.
        headers[str(header)] = re.sub(
            r"\s", " ", str(self._network_reply.rawHeader(header)))

    elements = []
    for name, value in headers.iteritems():
        elements.append(name + ": " + value)
    elements.append("")

    url = qstring_to_str(self._network_reply.url().toString())
    reason_attr = QNetworkRequest.HttpReasonPhraseAttribute
    status_msg = self._network_reply.attribute(reason_attr)
    assert(status_msg.isValid())
    self._temp_data.seek(0)

    # XXX: we can't get HTTP version from Qt webkit, assumes 1.1
    h_status = "HTTP/1.1 " + str(status_code.toString()) + " " \
        + str(status_msg.toString())
    # ``elements`` always holds at least the trailing "" entry, so joining
    # the status line with it gives the same text as prefixing it.
    head = "\r\n".join([h_status] + elements)
    content_data = head + "\r\n" + self._temp_data.read()
    content = (ResponseMessage.CONTENT_TYPE, content_data)

    record_id = WarcRecord.random_warc_uuid()
    record_date = warc.warc_datetime_str(datetime.utcnow())
    wr = warc.make_response(record_id, record_date, url, content, None)

    self._temp_data.close()
    self._temp_data = None
    self.manager().current_warc.write_record(wr)
    self._init_from_warc_record(wr)
    self._network_reply = None
def __init__(self, warc):
    """Open the given WARC for reading and load its records.

    Stores the path, the opened archive (``self.fh``) and an empty
    ``Tree``, then calls ``_get_records``.
    """
    self.warc = warc
    logger.debug("Mounting %s", self.warc)
    self.fh = WarcRecord.open_archive(warc, gzip="auto", mode="rb")
    self.tree = Tree()
    self._get_records()
def main(argv):
    """Convert the given ARC files into WARC records, appending to the output.

    Records go to ``options.output`` (opened in append mode; gzip is forced
    on when the name ends in '.gz') or to stdout. An ARC 'filedesc' record
    yields a warcinfo record plus a metadata record with the raw ARC
    record; any other record yields a response record.

    Exits through ``parser.error`` when no input file is given. Returns 0.
    """
    (options, input_files) = parser.parse_args(args=argv[1:])

    # Check the arguments before creating or opening the output file.
    if len(input_files) < 1:
        parser.error("no input warc file(s)")

    out = sys.stdout
    if options.output:
        out = open(options.output, 'ab')
        if options.output.endswith('.gz'):
            options.gzip = True

    try:
        for name in input_files:
            fh = ArcRecord.open_archive(name, gzip="auto")
            try:
                warcinfo_id = None
                for record in fh:
                    version = "WARC/1.0"

                    warc_id = make_warc_uuid(record.url + record.date)
                    headers = [
                        (WarcRecord.ID, warc_id),
                    ]
                    if record.date:
                        date = datetime.datetime.strptime(record.date,
                                                          '%Y%m%d%H%M%S')
                        headers.append((WarcRecord.DATE,
                                        warc_datetime_str(date)))

                    if record.type != 'filedesc':
                        content_type, content = record.content
                        if record.url.startswith('http'):
                            # don't promote content-types for http urls,
                            # they contain headers in the body.
                            content_type = "application/http;msgtype=response"

                        headers.extend([
                            (WarcRecord.TYPE, WarcRecord.RESPONSE),
                            (WarcRecord.URL, record.url),
                            (WarcRecord.WARCINFO_ID, warcinfo_id),
                        ])

                        warcrecord = WarcRecord(
                            headers=headers,
                            content=(content_type, content),
                            version=version)
                        warcrecord.write_to(out, gzip=options.gzip)
                        continue

                    # 'filedesc': remember the id so that later records can
                    # refer back to this warcinfo record.
                    warcinfo_id = warc_id

                    warcinfo_headers = list(headers)
                    warcinfo_headers.append((WarcRecord.FILENAME,
                                             record.url[11:]))
                    warcinfo_headers.append((WarcRecord.TYPE,
                                             WarcRecord.WARCINFO))

                    warcinfo_content = ('application/warc-fields',
                                        'software: hanzo.arc2warc\r\n')

                    warcrecord = WarcRecord(headers=warcinfo_headers,
                                            content=warcinfo_content,
                                            version=version)
                    warcrecord.write_to(out, gzip=options.gzip)

                    warc_id = make_warc_uuid(
                        record.url + record.date + "-meta")
                    warcmeta_headers = [
                        (WarcRecord.TYPE, WarcRecord.METADATA),
                        (WarcRecord.CONCURRENT_TO, warcinfo_id),
                        (WarcRecord.ID, warc_id),
                        (WarcRecord.URL, record.url),
                        (WarcRecord.DATE, warcrecord.date),
                        (WarcRecord.WARCINFO_ID, warcinfo_id),
                    ]
                    warcmeta_content = ('application/arc', record.raw())

                    warcrecord = WarcRecord(headers=warcmeta_headers,
                                            content=warcmeta_content,
                                            version=version)
                    warcrecord.write_to(out, gzip=options.gzip)
            finally:
                # Close the input even when a record fails to convert.
                fh.close()
    finally:
        # Flush and close an output file we opened; leave stdout alone.
        if out is not sys.stdout:
            out.close()

    return 0