def main(argv): (options, input_files) = parser.parse_args(args=argv[1:]) out = sys.stdout if len(input_files) < 1: parser.error("no pattern") pattern, input_files = input_files[0], input_files[1:] invert = options.invert out = sys.stdout pattern = re.compile(pattern) if not input_files: fh = ArchiveRecord.open_archive(file_handle=sys.stdin, gzip=None) filter_archive(fh, options, pattern, out) else: for name in input_files: fh = ArchiveRecord.open_archive(name, gzip="auto") filter_archive(fh, options, pattern,out) fh.close() return 0
def main(argv): (options, input_files) = parser.parse_args(args=argv[1:]) out = sys.stdout if len(input_files) < 1: parser.error("no imput warc file(s)") print "filename\toffset\twarc-type\twarc-subject-uri\twarc-record-id\tcontent-type\tcontent-length" for name in input_files: fh = ArchiveRecord.open_archive(name, gzip="auto") for (offset, record, errors) in fh.read_records(limit=None): if record: print("%s\t%s\t%s\t%s\t%s\t%s\t%s" % (name, offset, record.type, record.url, record.id, record.content_type, record.content_length)) elif errors: pass # ignore else: pass # no errors at tail fh.close() return 0
def main(argv): (options, input_files) = parser.parse_args(args=argv[1:]) out = sys.stdout if len(input_files) < 1: dump_archive(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None), name="-",offsets=False) else: for name in input_files: fh = ArchiveRecord.open_archive(name, gzip="auto") dump_archive(fh,name) fh.close() tf = zipfile.ZipFile("dump.zip", "w") for dirname, subdirs, files in os.walk("html"): for filename in files: tf.write(os.path.join(dirname, filename)) tf.write("fulltext.html") tf.write("index.html") tf.close() return 0
def make_cdx(self):
    print ' CDX ' + self.format #print header

    fh = ArchiveRecord.open_archive(self.file, gzip="auto", mode="r")
    for (offset, record, errors) in fh.read_records(limit=None, offsets=True):
        self.offset = offset

        if record:
            ### precalculated data that is used multiple times
            self.headers, self.content = self.parse_headers_and_content(record)
            self.mime_type = self.get_mime_type(record, use_precalculated_value=False)
            self.response_code = self.get_response_code(record, use_precalculated_value=False)
            self.meta_tags = self.parse_meta_tags(record)

            s = ''
            for field in self.format.split():
                if field not in self.field_map:
                    sys.exit('Unknown field: ' + field)

                endpoint = self.field_map[field].replace(' ', '_')
                response = getattr(self, 'get_' + endpoint)(record)
                s += response + ' '
            print s.rstrip()
            #record.dump()
        elif errors:
            pass  # ignore
        else:
            pass  # tail
    fh.close()
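# Illustration of the dispatch pattern above, using a hypothetical field_map
# subset (the real mapping lives on the class and is not shown in this
# excerpt): each one-letter CDX field maps to a descriptive name, and the
# value comes from the matching get_<name>() method looked up via getattr().
field_map = {'a': 'original url', 'm': 'mime type'}  # hypothetical subset
endpoint = field_map['a'].replace(' ', '_')          # -> 'original_url'
# getattr(self, 'get_' + endpoint)(record) then resolves to
# self.get_original_url(record)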
def main(argv): (options, input_files) = parser.parse_args(args=argv[1:]) out = sys.stdout if len(input_files) < 1: dump_archive(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None), name="-",offsets=False) else: for name in input_files: fh = ArchiveRecord.open_archive(name, gzip="auto") dump_archive(fh,name) fh.close() return 0
def main(argv): (options, input_files) = parser.parse_args(args=argv[1:]) out = sys.stdout if len(input_files) < 1: parser.error("no imput warc file(s)") print "<html>" print '<head><style type="text/css" media="all">table{width:100%;border-collapse:collapse;}td,th{border:solid 1px black;padding:0.1em;word-wrap:break-word}</style></head>' print "<body>" for name in input_files: fh = ArchiveRecord.open_archive(name, gzip="auto") print "<h1>"+name+"</h1>" print '<table>' print '<tr><th>warc-subject-uri</th><th>content-type</th><th>content-length</th></tr>' for (offset, record, errors) in fh.read_records(limit=None): if record: fname = record.id fname = fname.strip('<>') fnameu = fname[9:]+".html" urlu = textwrap.fill(record.url, 60) print("<tr><td><a href=\"html/%s\" id=\"%s\">%s</a></td><td>%s</td><td>%s</td></tr>" % (fnameu, record.url, urlu, record.content_type, record.content_length)) elif errors: pass # ignore else: pass # no errors at tail fh.close() print "</table>" print "</body>" print "</html>" return 0
def main(argv): (options, input_files) = parser.parse_args(args=argv[1:]) out = sys.stdout if len(input_files) < 1: parser.error("no imput warc file(s)") correct=False fh=None try: for name in input_files: fh = ArchiveRecord.open_archive(name, gzip="auto") for (offset, record, errors) in fh.read_records(limit=None): if errors: # print "warc errors at %s:%d"%(name, offset) break elif record is None and not errors : correct=True except StandardError, e: correct=False
def main(argv):
    (options, args) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if len(args) < 1:
        # dump the first record on stdin
        with closing(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)) as fh:
            dump_record(fh)
    else:
        # dump a record from the filename, with optional offset
        filename = args[0]
        if len(args) > 1:
            offset = int(args[1])
        else:
            offset = 0

        with closing(ArchiveRecord.open_archive(filename=filename, gzip="auto")) as fh:
            fh.seek(offset)
            dump_record(fh)

    return 0
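# Hedged usage sketch, not part of the original source: the first positional
# argument is an archive filename and the optional second one is a byte
# offset, so dumping the record at offset 0 of a hypothetical file could look
# like this ("warcdump" is just a placeholder program name).
main(["warcdump", "example.warc.gz", "0"])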
def make_cdx(self): if isinstance(self.out_file, basestring): self.out_file = open(self.out_file, 'wb') self.out_file.write(' CDX ' + self.format + '\n') #print header if not self.all_records: #filter cdx lines if --all-records isn't specified allowed_record_types = set(['response', 'revisit']) disallowed_content_types = set(['text/dns']) stats = { 'num_records_processed': 0, 'num_records_included': 0, 'num_records_filtered': 0, } fh = ArchiveRecord.open_archive(self.file, gzip="auto", mode="r") for (offset, record, errors) in fh.read_records(limit=None, offsets=True): self.offset = offset if record: stats['num_records_processed'] += 1 if self.screenshot_mode: if record.type != 'metadata': continue elif not self.all_records and (record.type not in allowed_record_types or record.content_type in disallowed_content_types): continue ### arc files from the live web proxy can have a negative content length and a missing payload ### check the content_length from the arc header, not the computed payload size returned by record.content_length content_length_str = record.get_header(record.CONTENT_LENGTH) if content_length_str is not None and int(content_length_str) < 0: continue self.surt = self.get_massaged_url(record, use_precalculated_value=False) if self.should_exclude(self.surt): stats['num_records_filtered'] += 1 continue ### precalculated data that is used multiple times self.headers, self.content = self.parse_headers_and_content(record) self.mime_type = self.get_mime_type(record, use_precalculated_value=False) self.response_code = self.get_response_code(record, use_precalculated_value=False) self.meta_tags = self.parse_meta_tags(record) s = u'' for field in self.format.split(): if not field in self.field_map: raise ParseError('Unknown field: ' + field) endpoint = self.field_map[field].replace(' ', '_') response = getattr(self, 'get_' + endpoint)(record) #print self.offset #print record.compressed_record_size #print record.content_length #print record.headers #print len(self.content) #print repr(record.content[1]) #print endpoint #print repr(response) s += response + ' ' self.out_file.write(s.rstrip().encode('utf-8')+'\n') #record.dump() stats['num_records_included'] += 1 elif errors: raise ParseError(str(errors)) else: pass # tail fh.close() if self.stats_file is not None: f = open(self.stats_file, 'w') json.dump(stats, f, indent=4) f.close()
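# Minimal reader sketch, not part of the class above: assuming the writer was
# pointed at a hypothetical "output.cdx", the first line is the " CDX <format>"
# header and each following line holds one space-delimited record in that
# field order.
with open("output.cdx") as f:
    fields = f.readline().split()[1:]   # drop the literal "CDX" token
    for line in f:
        row = dict(zip(fields, line.rstrip('\n').split(' ')))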