def main(argv):
    """Convert the records of ARC archives into WARC records on stdout.

    Every record read from each input archive is re-emitted as a WARC/1.0
    record of type "response". The record ID is a UUID built from the first
    32 hex digits of the SHA-1 of the record's URL concatenated with its
    date, so the same URL/date pair always maps to the same ID.

    Args:
        argv: Full argument vector; argv[0] is skipped and the remainder is
            handed to the module-level ``parser``.

    Returns:
        0 once every input archive has been processed.
    """
    (options, input_files) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if not input_files:
        # NOTE(review): presumably parser.error() terminates the process
        # (optparse/argparse behaviour) -- confirm against the parser used.
        parser.error("no input warc file(s)")

    for name in input_files:
        fh = ArcRecord.open_archive(name, gzip="auto")
        try:
            for record in fh:
                url = record.url
                date = record.date

                # NOTE(review): the concatenation raises TypeError if either
                # value is None -- confirm ArcRecord always supplies both.
                digest = hashlib.sha1(url + date).hexdigest()
                headers = [
                    (WarcRecord.TYPE, "response"),
                    (WarcRecord.ID, "<urn:uuid:%s>" % uuid.UUID(digest[0:32])),
                ]
                # URL and date headers are only emitted when non-empty.
                if url:
                    headers.append((WarcRecord.URL, url))
                if date:
                    headers.append((WarcRecord.DATE, date))

                warcrecord = WarcRecord(
                    headers=headers,
                    content=record.content,
                    version="WARC/1.0",
                )
                warcrecord.write_to(out, gzip=options.gzip)
        finally:
            # Release the archive even when a record fails to convert.
            fh.close()

    return 0
def main(argv):
    """Dump the given archives, then bundle the generated files into a zip.

    With no file arguments a WARC archive is read from standard input and
    dumped under the name "-" with offsets disabled; otherwise each named
    archive is opened, dumped and closed in turn. Afterwards every file
    below the "html" directory, plus "fulltext.html" and "index.html", is
    written to "dump.zip" in the current working directory.

    NOTE(review): presumably those files are produced by dump_archive() --
    confirm; this function only reads them.

    Args:
        argv: Full argument vector; argv[0] is skipped and the remainder is
            handed to the module-level ``parser``.

    Returns:
        0 on completion.
    """
    (options, input_files) = parser.parse_args(args=argv[1:])

    if not input_files:
        dump_archive(
            WarcRecord.open_archive(file_handle=sys.stdin, gzip=None),
            name="-",
            offsets=False,
        )
    else:
        for name in input_files:
            fh = ArchiveRecord.open_archive(name, gzip="auto")
            try:
                dump_archive(fh, name)
            finally:
                # Release the archive even when dumping fails part-way.
                fh.close()

    tf = zipfile.ZipFile("dump.zip", "w")
    try:
        for dirname, _subdirs, files in os.walk("html"):
            for filename in files:
                tf.write(os.path.join(dirname, filename))
        tf.write("fulltext.html")
        tf.write("index.html")
    finally:
        # Always close the zip so the handle is not leaked when one of the
        # expected files is missing.
        tf.close()

    return 0
def main(argv):
    """Copy WARC records from the inputs to standard output.

    With no file arguments records are read from standard input; otherwise
    each named archive is opened in turn. Every record is re-written to
    stdout using the ``gzip`` setting taken from the parsed options.

    Args:
        argv: Full argument vector; argv[0] is skipped and the remainder is
            handed to the module-level ``parser``.

    Returns:
        0 once all records have been written.
    """
    (options, input_files) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if not input_files:
        # The stdin-backed archive is intentionally left open, as before.
        fh = WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)
        for record in fh:
            record.write_to(out, gzip=options.gzip)
    else:
        for name in input_files:
            fh = WarcRecord.open_archive(name, gzip="auto")
            try:
                for record in fh:
                    record.write_to(out, gzip=options.gzip)
            finally:
                # Release the archive even when writing a record fails.
                fh.close()

    return 0
def main(argv):
    """Dump the contents of the given archives via dump_archive().

    With no file arguments a WARC archive is read from standard input and
    dumped under the name "-" with offsets disabled; otherwise each named
    archive is opened, dumped and closed in turn.

    Args:
        argv: Full argument vector; argv[0] is skipped and the remainder is
            handed to the module-level ``parser``.

    Returns:
        0 on completion.
    """
    (options, input_files) = parser.parse_args(args=argv[1:])

    if not input_files:
        dump_archive(
            WarcRecord.open_archive(file_handle=sys.stdin, gzip=None),
            name="-",
            offsets=False,
        )
    else:
        for name in input_files:
            fh = ArchiveRecord.open_archive(name, gzip="auto")
            try:
                dump_archive(fh, name)
            finally:
                # Release the archive even when dumping fails part-way.
                fh.close()

    return 0
def main(argv):
    """Dump one record from standard input or from a named archive.

    Without positional arguments the record is taken from a WARC archive
    opened on standard input. Otherwise the first argument is the archive
    filename and the optional second argument is an integer offset that is
    passed to ``seek`` before the record is dumped (0 when omitted).

    Args:
        argv: Full argument vector; argv[0] is skipped and the remainder is
            handed to the module-level ``parser``.

    Returns:
        0 on completion.
    """
    (options, args) = parser.parse_args(args=argv[1:])

    out = sys.stdout  # bound as in the original; not otherwise used here

    if len(args) == 0:
        # No filename given: read from stdin.
        stdin_archive = WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)
        with closing(stdin_archive) as archive:
            dump_record(archive)
        return 0

    # Filename given, optionally followed by the offset to seek to first.
    filename = args[0]
    offset = int(args[1]) if len(args) > 1 else 0

    with closing(ArchiveRecord.open_archive(filename=filename, gzip="auto")) as archive:
        archive.seek(offset)
        dump_record(archive)

    return 0