class CDXToolkitWARCWriter:
    def __init__(self, prefix, subprefix, info, size=1000000000, gzip=True, warc_version=None):
        self.prefix = prefix
        self.subprefix = subprefix
        self.info = info
        self.size = size
        self.gzip = gzip
        self.warc_version = warc_version
        self.segment = 0
        self.writer = None

    def write_record(self, *args, **kwargs):
        if self.writer is None:
            if self.warc_version is None:
                # opportunity to intuit warc version here
                self.warc_version = '1.0'
            if self.warc_version != '1.0':
                LOGGER.error('WARC versions other than 1.0 are not correctly supported yet')
                # ...because fake_wb_warc always generates 1.0
                # should we also check the warcinfo record to make sure it's got a matching warc_version inside?
            self._start_new_warc()

        self.writer.write_record(*args, **kwargs)

        fsize = os.fstat(self.fd.fileno()).st_size
        if fsize > self.size:
            self.fd.close()
            self.writer = None
            self.segment += 1

    def _unique_warc_filename(self):
        while True:
            name = self.prefix + '-'
            if self.subprefix is not None:
                name += self.subprefix + '-'
            name += '{:06d}'.format(self.segment) + '.extracted.warc'
            if self.gzip:
                name += '.gz'
            if os.path.exists(name):
                self.segment += 1
            else:
                break
        return name

    def _start_new_warc(self):
        self.filename = self._unique_warc_filename()
        self.fd = open(self.filename, 'wb')
        LOGGER.info('opening new warc file %s', self.filename)
        self.writer = WARCWriter(self.fd, gzip=self.gzip, warc_version=self.warc_version)
        warcinfo = self.writer.create_warcinfo_record(self.filename, self.info)
        self.writer.write_record(warcinfo)
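# Hedged usage sketch (not part of the original source): exercise the
# CDXToolkitWARCWriter above with one synthetic response record. Everything
# except CDXToolkitWARCWriter itself is plain warcio API; the prefix,
# warcinfo fields, and payload are made-up values, and the module-level
# LOGGER and os imports the class relies on are assumed to exist.
from io import BytesIO

from warcio.statusandheaders import StatusAndHeaders
from warcio.warcwriter import WARCWriter

def demo_segmented_write():
    info = {'software': 'cdx-toolkit-demo'}  # hypothetical warcinfo fields
    writer = CDXToolkitWARCWriter('EXAMPLE', None, info, size=1000000)
    http_headers = StatusAndHeaders('200 OK', [('Content-Type', 'text/html')],
                                    protocol='HTTP/1.1')
    # WARCWriter(None) is the record-factory pattern used elsewhere in this file
    record = WARCWriter(None).create_warc_record(
        'http://example.com/', 'response',
        payload=BytesIO(b'<html></html>'), http_headers=http_headers)
    writer.write_record(record)  # lazily opens EXAMPLE-000000.extracted.warc.gz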
def process_item(self, item, spider):
    writer = WARCWriter(self.output, gzip=True)
    headers_list = item['headers']
    http_headers = StatusAndHeaders('200 OK', headers_list, protocol='HTTP/1.0')
    record = writer.create_warc_record(item['url'], 'response',
                                       payload=BytesIO(item['content']),
                                       http_headers=http_headers)
    writer.write_record(record)
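# A hedged sketch of the plumbing assumed around process_item above (not from
# the original source): a Scrapy pipeline would open self.output once per
# crawl rather than per item, so all records append to one growing WARC file.
# The class name and 'output.warc.gz' filename are hypothetical.
class WarcFilePipeline:
    def open_spider(self, spider):
        self.output = open('output.warc.gz', 'wb')

    def close_spider(self, spider):
        self.output.close()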
def construct_warcio_record(url, warcheader, httpheader, content_bytes):
    # payload will be parsed for http headers
    payload = httpheader.rstrip(b'\r\n') + b'\r\n\r\n' + content_bytes

    warc_headers_dict = {}
    if warcheader:
        for header in warcheader.split(b'\r\n')[1:]:  # skip the initial WARC/1 line
            k, v = header.split(b':', 1)
            warc_headers_dict[k] = v.strip()

    writer = WARCWriter(None)
    return writer.create_warc_record(url, 'response',
                                     payload=BytesIO(payload),
                                     warc_headers_dict=warc_headers_dict)
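# Hedged example call for construct_warcio_record (all inputs are made up,
# and the header bytes are assumed to be plain ASCII): rebuild a response
# record from raw WARC and HTTP header bytes plus a body, as they might have
# been captured off the wire.
warcheader = b'WARC/1.0\r\nWARC-Date: 2023-01-01T00:00:00Z'
httpheader = b'HTTP/1.1 200 OK\r\nContent-Type: text/plain'
record = construct_warcio_record('http://example.com/', warcheader,
                                 httpheader, b'hello, warc')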
def filter_warc_stream(ids, warc_in, warc_out):
    writer = WARCWriter(warc_out, gzip=True)
    output, total, errors = 0, 0, 0
    for record in ArchiveIterator(warc_in):
        id_ = get_record_id(record)
        if id_ in ids:
            output += 1
            try:
                writer.write_record(record)
            except Exception as e:
                logging.error(f'failed to write record: {e}')
                errors += 1
        total += 1
        if total % 10000 == 0:
            logging.info(f'processed {total} records, output {output}, '
                         f'{errors} errors')
    print(f'Done, processed {total} records, output {output}, '
          f'{errors} errors')
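# A minimal driver sketch for filter_warc_stream (assumed, not in the original
# source): read the IDs to keep from a text file named on the command line,
# then filter a WARC stream from stdin to stdout as binary streams.
import sys

def main():
    with open(sys.argv[1]) as f:
        ids = set(line.strip() for line in f)
    filter_warc_stream(ids, sys.stdin.buffer, sys.stdout.buffer)

if __name__ == '__main__':
    main()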
class CDXToolkitWARCWriter:
    def __init__(self, prefix, subprefix, info, size=1000000000, gzip=True):
        self.prefix = prefix
        self.subprefix = subprefix
        self.info = info
        self.size = size
        self.gzip = gzip
        self.segment = 0
        self.writer = None

    def write_record(self, *args, **kwargs):
        if self.writer is None:
            self._start_new_warc()

        self.writer.write_record(*args, **kwargs)

        fsize = os.fstat(self.fd.fileno()).st_size
        if fsize > self.size:
            self.fd.close()
            self.writer = None
            self.segment += 1

    def _unique_warc_filename(self):
        while True:
            name = self.prefix + '-'
            if self.subprefix is not None:
                name += self.subprefix + '-'
            name += '{:06d}'.format(self.segment) + '.extracted.warc'
            if self.gzip:
                name += '.gz'
            if os.path.exists(name):
                self.segment += 1
            else:
                break
        return name

    def _start_new_warc(self):
        self.filename = self._unique_warc_filename()
        self.fd = open(self.filename, 'wb')
        LOGGER.info('opening new warc file %s', self.filename)
        self.writer = WARCWriter(self.fd, gzip=self.gzip)
        warcinfo = self.writer.create_warcinfo_record(self.filename, self.info)
        self.writer.write_record(warcinfo)
def fake_wb_warc(url, wb_url, resp, capture):
    '''
    Given a playback from a wayback, fake up a warc response record
    '''
    status_code = resp.status_code
    status_reason = resp.reason

    if str(status_code) != capture['status']:
        url = capture['url']
        timestamp = capture['timestamp']
        if status_code == 200 and capture['status'] == '-':
            LOGGER.warning('revisit record vivified by wayback for %s %s', url, timestamp)
        elif status_code == 200 and capture['status'].startswith('3'):
            LOGGER.warning('redirect capture came back 200, same-surt same-timestamp capture? %s %s', url, timestamp)
        elif status_code == 302 and capture['status'].startswith('3'):
            # this is OK, wayback always sends a temporary redir
            status_code = int(capture['status'])
            if status_code != resp.status_code and status_code in http_status_text:
                status_reason = http_status_text[status_code]
        else:  # pragma: no cover
            LOGGER.warning('surprised that status code is now=%d orig=%s %s %s',
                           status_code, capture['status'], url, timestamp)

    http_headers = []
    http_date = None
    for k, v in resp.headers.items():
        kl = k.lower()
        if kl.startswith('x-archive-orig-date'):
            http_date = v
        if kl.startswith('x-archive-orig-'):
            k = k[len('x-archive-orig-'):]
            http_headers.append((k, v))
        elif kl == 'content-type':
            http_headers.append(('Content-Type', v))
        elif kl == 'location':
            v = wb_redir_to_original(v)
            http_headers.append((k, v))
        else:
            if not kl.startswith('x-archive-'):
                k = 'X-Archive-' + k
            http_headers.append((k, v))

    statusline = '{} {}'.format(status_code, status_reason)
    http_headers = StatusAndHeaders(statusline, headers=http_headers, protocol='HTTP/1.1')

    warc_headers_dict = {
        'WARC-Source-URI': wb_url,
        'WARC-Creation-Date': datetime_to_iso_date(datetime.datetime.now()),
    }
    if http_date:
        warc_headers_dict['WARC-Date'] = datetime_to_iso_date(http_date_to_datetime(http_date))

    content_bytes = resp.content

    writer = WARCWriter(None)  # needs warc_version here?
    return writer.create_warc_record(url, 'response',
                                     payload=BytesIO(content_bytes),
                                     http_headers=http_headers,
                                     warc_headers_dict=warc_headers_dict)
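# Hedged illustration of the inputs fake_wb_warc expects (field names follow
# the code above; the values and URL are made up): 'capture' is one CDX index
# row and 'resp' would be the requests response from replaying it through the
# wayback. The network call is left commented out.
capture = {
    'url': 'http://example.com/',
    'timestamp': '20230101000000',
    'status': '200',
}
wb_url = 'https://web.archive.org/web/20230101000000id_/http://example.com/'
# resp = requests.get(wb_url)
# record = fake_wb_warc(capture['url'], wb_url, resp, capture)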
        args.polls = args.attachments = args.members = True

    with Mkchdir(args.group, sanitize=False):
        log_file_handler = logging.FileHandler('archive.log')
        log_file_handler.setFormatter(log_formatter)
        root_logger.addHandler(log_file_handler)

        if args.warc:
            try:
                from warcio import WARCWriter
            except ImportError:
                logging.error('WARC output requires the warcio package to be installed.')
                exit(1)
            fhwarc = open('data.warc.gz', 'ab')
            warc_writer = WARCWriter(fhwarc)
            warcmeta = warc_writer.create_warcinfo_record(fhwarc.name, WARC_META_PARAMS)
            warc_writer.write_record(warcmeta)
            yga.set_warc_writer(warc_writer)

        if args.email:
            with Mkchdir('email'):
                archive_email(yga, message_subset=args.ids, start=args.start, stop=args.stop)
        if args.files:
            with Mkchdir('files'):
                archive_files(yga)
        if args.photos:
class Harvester:
    def __init__(self, name, default_delay=3600, n_thread=None):
        file = open(f"{name}.warc.gz", "wb")
        self.writer = WARCWriter(file)
        self.lock = RLock()
        self.scheduler = scheduler(time.time)
        self.executor = ThreadPoolExecutor(n_thread)
        self.default_delay = default_delay
        self.delay_adjusters = {}  # Constant delay
        self.last = {}

    def visit(self, url):
        ts0 = time.time()
        resp = requests.get(url, headers=HEADERS, stream=True)
        ts1 = time.time()
        headers_list = resp.raw.headers.items()
        http_headers = StatusAndHeaders('200 OK', headers_list, protocol='HTTP/1.0')
        record = self.writer.create_warc_record(resp.url, 'response',
                                                payload=BytesIO(resp.content),
                                                http_headers=http_headers)
        self.lock.acquire()
        self.writer.write_record(record)
        self.lock.release()
        res = html_to_counters(resp.content)
        return res, (ts1 + ts0) / 2

    def foo(self, target, website, lower_limit, upper_limit):
        try:
            sr1, ts1 = self.visit(website)
            delay_adjuster = self.delay_adjusters[website]
            delay_adjuster.add_case(ts1, sr1)
            estimate = delay_adjuster.get_delay()
            new_delay = max(lower_limit, min(upper_limit, estimate))
            self.scheduler.enterabs(
                ts1 + new_delay, 1, self.executor.submit,
                (self.foo, target, website, lower_limit, upper_limit))
            if logging.root.level <= logging.INFO:
                delay = None
                sim = None
                ts0, sr0 = self.last.get(website, (None, None))
                self.last[website] = (ts1, sr1)
                if ts0 is not None and sr0 is not None:
                    sim = sr0.similarity(sr1)
                    delay = ts1 - ts0
                logging.info(
                    f"website={website}, delay={delay}, sim={sim}, "
                    f"estimate={estimate}, new_delay={new_delay}")
        except Exception as e:
            logging.exception(e)
            logging.exception(f"website={website}")

    def harvest(self, websites, target, delay_adjuster=None):
        logging.info(
            f"Starting harvest with {len(websites)} websites and delay adjuster "
            f"{str(delay_adjuster)}, target={target}")
        start_time = time.time() + 10
        diff = self.default_delay / len(websites)  # Distribute initial harvests equally
        for ws in websites:
            if delay_adjuster is None:
                self.delay_adjusters[ws] = ConstantDelayAdjuster(self.default_delay)
            else:
                self.delay_adjusters[ws] = delay_adjuster(self.default_delay, target)
            start_time += diff
            self.scheduler.enterabs(start_time, 1, self.executor.submit,
                                    (self.foo, target, ws, 60, 86400))
        while True:
            try:
                self.scheduler.run()
            except Exception as e:
                logging.exception(e)
                logging.exception(f"Exception occurred: {e}, stopping harvest...")
                return
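# Hedged usage sketch for the Harvester above (the site list and parameters
# are made up): start a harvest with the default constant-delay scheduling.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    harvester = Harvester('demo-crawl', default_delay=3600, n_thread=4)
    harvester.harvest(['https://example.com/'], target=0.9)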
        args.polls = args.attachments = args.members = True

    with Mkchdir(args.group):
        log_file_handler = logging.FileHandler('archive.log')
        log_file_handler.setFormatter(log_formatter)
        root_logger.addHandler(log_file_handler)

        if args.warc:
            try:
                from warcio import WARCWriter
            except ImportError:
                logging.error('WARC output requires the warcio package to be installed.')
                exit(1)
            fhwarc = open('data.warc.gz', 'ab')
            warc_writer = WARCWriter(fhwarc)
            yga.set_warc_writer(warc_writer)

        if args.email:
            with Mkchdir('email'):
                archive_email(yga)
        if args.files:
            with Mkchdir('files'):
                archive_files(yga)
        if args.photos:
            with Mkchdir('photos'):
                archive_photos(yga)
        if args.database:
            with Mkchdir('databases'):
                archive_db(yga)
        if args.links:
    def handle_download_name(self, user, coll_name, warc_name, url):
        #username = request.query.getunicode('user')
        #warc_name = request.query.getunicode('doi')
        # some clients use collection rather than coll_name so we must check for both
        #coll_name = request.query.getunicode('collection')
        #user = self._get_wasapi_user()
        #self.access.assert_is_curr_user(user)
        #colls = None
        #if coll_name:
        #    collection = user.get_collection_by_name(coll_name)
        #    if collection:
        #        colls = [collection]
        #    else:
        #        self._raise_error(404, 'no_such_collection')
        #else:
        #    colls = user.get_collections()
        #files = []
        user_name = user
        user = self.user_manager.get_user(user)
        collection = user.get_collection_by_name(coll_name)
        if not collection:
            self._raise_error(404, 'no_such_collection')

        self.access.assert_can_write_coll(collection)

        # collection['uid'] = coll
        collection.load()

        Stats(self.redis).incr_download(collection)

        download_path = self.get_origin() + "/api/v1/download/{}/".format(user_name)
        warc_name_broke = warc_name.replace("/", "\\/")
        # note: the next line overwrites the escaped value assigned above
        warc_name_broke = warc_name.replace("10.25354/", "")
        local_storage = LocalFileStorage(self.redis)
        landingpage = template(
            'webrecorder/templates/landingpage.html',
            title=coll_name,
            warc_file='https://projects.zo.uni-heidelberg.de/webarchive/warc/10.25354/'
                      + warc_name_broke + '.warc',
            url=url)
        try:
            os.makedirs(os.path.join(os.environ['STORAGE_REPLAY'], 'lp', '10.25354'))
            print("Directory '%s' created" % os.path.isfile(
                os.path.join(os.environ['STORAGE_REPLAY'], 'lp', '10.25354')))
        except FileExistsError:
            print("Directory '%s' already created!" %
                  os.path.join(os.environ['STORAGE_REPLAY'], 'lp', '10.25354'))
        except FileNotFoundError:
            print("Directory '%s' No such file or directory!" %
                  os.path.join(os.environ['STORAGE_REPLAY'], 'lp', '10.25354'))
        try:
            os.makedirs(os.path.join(os.environ['STORAGE_REPLAY'], 'warc', '10.25354'))
            print("Directory '%s' created" % os.path.isfile(
                os.path.join(os.environ['STORAGE_REPLAY'], 'warc', '10.25354')))
        except FileExistsError:
            print("Directory '%s' already created!" %
                  os.path.join(os.environ['STORAGE_REPLAY'], 'warc', '10.25354'))
        except FileNotFoundError:
            print("Directory '%s' No such file or directory!" %
                  os.path.join(os.environ['STORAGE_REPLAY'], 'warc', '10.25354'))
        try:
            f = open(os.path.join(os.environ['STORAGE_REPLAY'], 'lp', '10.25354',
                                  warc_name_broke) + ".html", 'w')
            f.write(landingpage)
            f.close()
        except FileExistsError:
            print(os.path.join(os.environ['STORAGE_REPLAY'], 'lp', '10.25354',
                               warc_name_broke) + ".html exists")
        except FileNotFoundError:
            print(os.path.join(os.environ['STORAGE_REPLAY'], 'lp', '10.25354',
                               warc_name_broke) + ".html doesn't exist")

        commit_storage = collection.get_storage()

        for recording in collection.get_recordings():
            is_committed = recording.is_fully_committed()
            is_open = not is_committed and recording.get_pending_count() > 0
            storage = commit_storage if is_committed else local_storage
            try:
                f = open(os.path.join(os.environ['STORAGE_REPLAY'], 'warc', '10.25354',
                                      warc_name_broke) + ".warc", 'wb')
                writer = WARCWriter(f, gzip=True)
                for name, path in recording.iter_all_files(include_index=False):
                    local_download = download_path.format(user=user.name,
                                                          coll=collection.name,
                                                          filename=name)
                    warc_key = collection.get_warc_key()
                    warc_path = self.redis.hget(warc_key, name)
                    if 'http://nginx:6090' in warc_path:
                        warc_path = warc_path.replace('http://nginx:6090', '')
                    if 'https://nginx:6090' in warc_path:
                        warc_path = warc_path.replace('https://nginx:6090', '')
                    if not warc_path:
                        self._raise_error(404, 'file_not_found')
                    with open(warc_path, 'rb') as stream:
                        for record in ArchiveIterator(stream):
                            writer.write_record(record)
                f.close()
            except FileExistsError:
                print(os.path.join(os.environ['STORAGE_REPLAY'], 'warc', '10.25354',
                                   warc_name_broke) + ".warc exists")
            except FileNotFoundError:
                print(os.path.join(os.environ['STORAGE_REPLAY'], 'warc', '10.25354',
                                   warc_name_broke) + ".warc doesn't exist")
def sample_warc_stream(ratio, warc_in, warc_out, options):
    if options.language != ANY_LANGUAGE:
        from langdetect import DetectorFactory, detect_langs
        DetectorFactory.seed = options.seed  # Make langdetect deterministic
    writer = WARCWriter(warc_out, gzip=True)
    responses, total, errors, empties, notlang = 0, 0, 0, 0, 0
    for total, record in enumerate(ArchiveIterator(warc_in), start=1):
        if total % 10000 == 0:
            print(f'sample_warc_responses.py: processed {total} records, '
                  f'{responses} responses, {errors} errors, {empties} empty, '
                  f'{notlang} not in target language.',
                  file=sys.stderr)
        if record.rec_type != 'response':
            continue
        responses += 1
        id_ = get_id(record)
        if random.random() > ratio:
            continue
        if options.language != ANY_LANGUAGE:
            # Workaround for https://github.com/webrecorder/warcio/issues/114
            payload_copy = BytesIO(record.raw_stream.read())
            record = copy_warc_record(record, payload_copy)
            content = record.content_stream().read()
            # force length recalculation to work around issues with
            # header encoding changes causing content-length mismatch
            # (related: https://github.com/webrecorder/warcio/issues/104)
            record.length = None
            payload_copy.seek(0)
            try:
                text_content = trafilatura.extract(content)
            except Exception as e:
                logging.error(f'failed extract for {id_}: {e}')
                errors += 1
                continue
            if not text_content:
                empties += 1
                continue
            try:
                langs = detect_langs(text_content)
            except Exception as e:
                logging.error(f'failed langdetect for {id_}: {e}')
                errors += 1
                continue
            target_lang = [l for l in langs if l.lang == options.language]
            target_lang = None if not target_lang else target_lang[0]
            if target_lang is None or target_lang.prob < options.lang_prob:
                notlang += 1
                continue
        try:
            writer.write_record(record)
        except Exception as e:
            logging.error(f'failed to write record {id_}: {e}')
            errors += 1
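# Minimal driver sketch (assumed, not in the original source): sample 1% of
# response records from stdin to stdout without language filtering. 'options'
# only needs the attributes the function reads above; ANY_LANGUAGE is assumed
# to be the module-level constant the function already references.
import sys
from types import SimpleNamespace

def main():
    options = SimpleNamespace(language=ANY_LANGUAGE, seed=0, lang_prob=0.5)
    sample_warc_stream(0.01, sys.stdin.buffer, sys.stdout.buffer, options)

if __name__ == '__main__':
    main()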
def main():
    args = parse_arguments()

    # Setup logging
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)
    log_format = {
        'fmt': '%(asctime)s %(levelname)s %(name)s %(message)s',
        'datefmt': '%Y-%m-%d %H:%M:%S.%f %Z'
    }
    log_formatter = CustomFormatter(**log_format)

    if args.verbose:
        log_level = logging.DEBUG
    elif args.quiet:
        log_level = logging.ERROR
    else:
        log_level = logging.INFO

    if args.colour:
        try:
            import coloredlogs
        except ImportError as e:
            sys.exit("Error: Coloured logging output requires the 'coloredlogs' package to be installed.")
        coloredlogs.install(level=log_level, **log_format)
    else:
        log_stdout_handler = logging.StreamHandler(sys.stdout)
        log_stdout_handler.setLevel(log_level)
        log_stdout_handler.setFormatter(log_formatter)
        root_logger.addHandler(log_stdout_handler)

    cookie_jar = init_cookie_jar(args.cookie_file, args.cookie_t, args.cookie_y, args.cookie_e)

    headers = {}
    if args.user_agent:
        headers['User-Agent'] = args.user_agent

    yga = YahooGroupsAPI(args.group, cookie_jar, headers, min_delay=args.delay)

    # Default to all unique content. This includes topics and raw email,
    # but not the full email download since that would duplicate html emails we get through topics.
    if not (args.email or args.files or args.photos or args.database or args.links or
            args.calendar or args.about or args.polls or args.attachments or
            args.members or args.topics or args.raw):
        args.files = args.photos = args.database = args.links = args.calendar = args.about = \
            args.polls = args.attachments = args.members = args.topics = args.raw = True

    with Mkchdir(args.group, sanitize=False):
        log_file_handler = logging.FileHandler('archive.log', 'a', 'utf-8')
        log_file_handler.setFormatter(log_formatter)
        root_logger.addHandler(log_file_handler)

        if args.warc:
            try:
                from warcio import WARCWriter
                fhwarc = open('data.warc.gz', 'ab')
                warc_writer = WARCWriter(fhwarc)
                warcmeta = warc_writer.create_warcinfo_record(fhwarc.name, WARC_META_PARAMS)
                warc_writer.write_record(warcmeta)
                yga.set_warc_writer(warc_writer)
            except ImportError:
                logging.error('WARC output requires the warcio package to be installed.')
                exit(1)

        if args.overwrite:
            hacky_vars['file'] = True

        if args.email:
            with Mkchdir('email'):
                archive_email(yga, message_subset=args.ids, start=args.start, stop=args.stop,
                              noAttachments=args.noattachments)
        if args.files:
            with Mkchdir('files'):
                archive_files(yga)
        if args.photos:
            with Mkchdir('photos'):
                archive_photos(yga)
        if args.topics:
            with Mkchdir('topics'):
                archive_topics(yga, noAttachments=args.noattachments)
        if args.raw:
            with Mkchdir('email'):
                archive_email(yga, message_subset=args.ids, start=args.start, stop=args.stop,
                              skipHTML=True)
        if args.database:
            with Mkchdir('databases'):
                archive_db(yga)
        if args.links:
            with Mkchdir('links'):
                archive_links(yga)
        if args.about:
            with Mkchdir('about'):
                archive_about(yga)
        if args.polls:
            with Mkchdir('polls'):
                archive_polls(yga)
        if args.attachments:
            with Mkchdir('attachments'):
                archive_attachments(yga)
        if args.members:
            with Mkchdir('members'):
                archive_members(yga)
        if args.calendar:
            with Mkchdir('calendar'):
                archive_calendar(yga)

        if args.warc:
            fhwarc.close()