def main(argv): (options, input_files) = parser.parse_args(args=argv[1:]) out = sys.stdout if len(input_files) < 1: parser.error("no imput warc file(s)") total = 0 # print '#WARC filename offset warc-type warc-subject-uri warc-record-id content-type content-length' for name in expand_files(input_files): fh = WarcRecord.open_archive(name, gzip="auto") for (offset, record, errors) in fh.read_records(limit=None): if record: print name, offset, record.type, record.url, record.id, record.content_type, record.content_length total += record.content_length elif errors: pass # ignore else: pass # no errors at tail fh.close() print total return 0
def main(argv):
    (options, input_files) = parser.parse_args(args=argv[1:])

    try:  # python3
        out = sys.stdout.buffer
    except AttributeError:  # python2
        out = sys.stdout

    if len(input_files) < 1:
        fh = WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)
        for record in fh:
            process(record, out, options)
    else:
        for name in expand_files(input_files):
            fh = WarcRecord.open_archive(name, gzip="auto")
            for record in fh:
                process(record, out, options)
            fh.close()

    return 0
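
# The dump loop above hands each record to a process() callback that is not
# part of this excerpt.  The sketch below is one plausible implementation
# (an assumption, not the project's actual code): it writes the record type,
# target URL and headers to the byte-oriented output stream.
def process(record, out, options):
    """Write a short summary of one WARC record to out."""
    out.write(record.type + b' ' + (record.url or b'-') + b'\n')
    for name, value in record.headers:
        out.write(name + b': ' + value + b'\n')
    out.write(b'\n')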
def main(argv):
    (options, input_files) = parser.parse_args(args=argv[1:])

    # prepare regular expressions
    link_ignore_expressions = prepare_link_ignore_re(options.ignore_links)

    print "parsing WARC archives"
    all_urls = []
    for filename in expand_files(input_files):
        print "WARC: " + filename
        link_cache_filename = filename + '.urls'
        if options.persist_links and os.path.exists(link_cache_filename):
            # reuse the pickled url list from a previous run
            url_fh = open(link_cache_filename, 'r')
            urls = pickle.load(url_fh)
            url_fh.close()
            all_urls += urls
        else:
            urls = []
            fh = WarcRecord.open_archive(filename, gzip="auto")
            for record in fh:  # :type record: ArchiveRecord
                if not record.is_response():
                    continue
                urls.append({
                    'url': record.url,
                    'content-type': record.content_type
                })
            # urls.sort(cmp=url_cmp)
            if options.persist_links:
                url_fh = open(link_cache_filename, 'w+')
                pickle.dump(urls, url_fh)
                url_fh.close()
            fh.close()
            all_urls += urls

    if options.dump_links is not None:
        f = open(options.dump_links, 'w+')
        all_urls.sort()
        for url in all_urls:
            # skip ignorable links
            skip_addition = False
            for expression in link_ignore_expressions:
                if expression.match(url['url']):
                    skip_addition = True
                    break
            if not skip_addition:
                f.write(url['url'])
                f.write('\n')
        f.close()

    if options.web_start is not False:
        urltree = UrlTree()
        for url in all_urls:
            # skip links filtered via regex
            skip_addition = False
            for expression in link_ignore_expressions:
                if expression.match(url['url']):
                    skip_addition = True
                    break
            # skip links filtered by the content-type filters
            if options.content_type:
                if not url['content-type'].startswith(options.content_type):
                    skip_addition = True
            if options.content_type_not:
                if url['content-type'].startswith(options.content_type_not):
                    skip_addition = True
            if not skip_addition:
                urltree.add_url(url['url'])
        print "Total urls: " + str(urltree.childcount)
        webserver.run(urltree)
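
# prepare_link_ignore_re() and UrlTree are defined elsewhere in this project.
# As an illustration only (an assumption about the helper, not its actual
# code), prepare_link_ignore_re() is taken to compile each pattern from
# options.ignore_links into a regular expression usable with .match() above.
import re

def prepare_link_ignore_re(ignore_links):
    """Compile the ignore patterns; returns a list of regex objects."""
    return [re.compile(pattern) for pattern in (ignore_links or [])]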