def bios_from_wet_url(url, verbose=False):
    try:
        time0 = time.time()
        log("TRYING " + url)
        r = requests.get(url, stream=True)
        assert r.status_code == 200, f"*** Got status code {r.status_code} != 200"
        if verbose:
            print(f"Status code {r.status_code} for {url}")
        it = ArchiveIterator(fileobj=r.raw)
        next(it)  # skip the leading warcinfo record of the WET file
        ans = dedup_exact(
            [bio
             for record in it
             for bio in extract_bios_from_page(
                 record.content_stream().read().decode()[:MAX_PAGE_LEN],
                 record.rec_headers.get_header('WARC-Target-URI'))])
        log(f"DONE {url} {time.time()-time0:.1f} seconds")
        return ans
    except Exception as e:
        print(f"*** Exception in {url}:", file=sys.stderr)
        print(f"*** {e}", file=sys.stderr)
        print("***", file=sys.stderr)
        print("", file=sys.stderr)
        return None
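# A minimal, self-contained sketch of the same WET-reading pattern, with the
# project-specific helpers (log, dedup_exact, extract_bios_from_page) stripped
# out; the URL argument would be a Common Crawl WET path.
import requests
from warcio.archiveiterator import ArchiveIterator

def iter_wet_pages(wet_url):
    r = requests.get(wet_url, stream=True)
    it = ArchiveIterator(fileobj=r.raw)
    next(it)  # the first record of a WET file is warcinfo; skip it
    for record in it:
        page_url = record.rec_headers.get_header('WARC-Target-URI')
        text = record.content_stream().read().decode(errors='replace')
        yield page_url, text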
def get_record(self, url):
    reqv_resp_pair = self.url_index.get(url)
    if reqv_resp_pair is not None:
        self._stream.seek(reqv_resp_pair[0][0])
        reqv = next(iter(ArchiveIterator(self._stream)))
        self._stream.seek(reqv_resp_pair[1][0])
        resp = next(iter(ArchiveIterator(self._stream)))
        return reqv, resp
    else:
        raise KeyError('The request or response is missing from the archive'
                       ' for URL: {0}'.format(url))
def deduplicate(self):
    self._log.log('Start deduplication process.')
    iaData = {}  # dict of (payload digest, URL) => IA response|None
    with open(self.warc_source, 'rb') as s:
        for record in ArchiveIterator(s):
            if record.rec_headers.get_header('WARC-Type') == 'response':
                iaData[(record.rec_headers.get_header('WARC-Payload-Digest'),
                        record.rec_headers.get_header('WARC-Target-URI'))] = None
    self.fetch_from_ia(iaData)
    with open(self.warc_source, 'rb') as s, \
         open(self.warc_target, 'wb') as t:
        writer = WARCWriter(filebuf=t, gzip=self.warc_target.endswith('.gz'))
        for record in ArchiveIterator(s):
            url = record.rec_headers.get_header('WARC-Target-URI')
            record_id = record.rec_headers.get_header('WARC-Record-ID')
            self._log.log('Processing record {}.'.format(record_id))
            if url is not None and url.startswith('<'):
                url = re.search('^<(.+)>$', url).group(1)
                self._log.log('Replacing URL in record {} with {}.'
                              .format(record_id, url))
                record.rec_headers.replace_header('WARC-Target-URI', url)
            if record.rec_headers.get_header('WARC-Type') == 'response':
                self._log.log('Deduplicating record {}.'.format(record_id))
                key = (record.rec_headers.get_header('WARC-Payload-Digest'),
                       record.rec_headers.get_header('WARC-Target-URI'))
                assert key in iaData
                if iaData[key]:
                    self._log.log('Record {} is a duplicate from {}.'
                                  .format(record_id, iaData[key]))
                    writer.write_record(
                        self.response_to_revisit(writer, record, iaData[key])
                    )
                else:
                    if iaData[key] is False:
                        self._log.log('Record {} could not be deduplicated.'
                                      .format(record_id))
                    else:
                        self._log.log('Record {} is not a duplicate.'
                                      .format(record_id))
                    self.register_response(record)
                    writer.write_record(record)
            elif record.rec_headers.get_header('WARC-Type') == 'warcinfo':
                self._log.set_warcinfo(record.rec_headers.get_header('WARC-Record-ID'))
                record.rec_headers.replace_header('WARC-Filename', self.warc_target)
                writer.write_record(record)
            else:
                writer.write_record(record)
        self._log.log('Writing log to WARC.')
        writer.write_record(self._log.create_record(writer))
def run(url, outPath, timeLimit, agent, filetypes, warcfilename, wait):
    cmd = ""
    if timeLimit:
        cmd += "timeout {} ".format(timeLimit)
    waitoption = ""
    if wait is not None:
        waitoption = "--wait " + wait
    agentoption = ""
    if agent is not None:
        agentoption = "--user-agent \"" + agent + "\""
    filetypesoption = ""
    if filetypes is not None:
        filetypesoption = "-A \"" + filetypes + "\""
    warcoption = ""
    warcfilebasename = None
    if warcfilename is not None:
        # Only derive the base name when a WARC file name was actually given
        warcfilebasename = warcfilename[0:warcfilename.find(".warc.gz")]
        warcoption = "--warc-file \"" + warcfilebasename + "\""
        if check_wget_compression("wget --help | grep 'no-warc-compression'"):
            warcoption += " --no-warc-compression"
    cmd += "wget --mirror {WAIT} {FILETYPES} -q {URL} -P {DOWNLOAD_PATH} {AGENT} {WARC}".format(
        WAIT=waitoption, FILETYPES=filetypesoption, URL=url,
        DOWNLOAD_PATH=outPath, AGENT=agentoption, WARC=warcoption)
    # print("cmd", cmd)
    try:
        system_check(cmd)
    except subprocess.CalledProcessError:
        sys.stderr.write(
            "Warning: Some files could not be downloaded with wget\n")
    # Recompress the uncompressed WARC written by wget into a .warc.gz
    with open(warcfilebasename + ".warc", 'rb') as f_in:
        with open(warcfilebasename + ".warc.gz", 'wb') as f_out:
            writer = WARCWriter(f_out, gzip=True)
            for record in ArchiveIterator(f_in):
                writer.write_record(record)
    system_check("rm {WARC}".format(WARC=warcfilebasename + ".warc"))
def decode(x, record_attribute):
    html_pages_array = []
    _, payload = x
    # The payload lost its leading "WARC/1.0" marker when the archive was
    # split, so re-attach it before parsing.
    wholeTextFile = "WARC/1.0 " + ''.join(payload)
    from io import BytesIO
    from warcio.archiveiterator import ArchiveIterator
    from bs4 import BeautifulSoup
    stream = BytesIO(wholeTextFile.encode('utf-8'))
    try:
        for record in ArchiveIterator(stream):
            # only response records carry html pages
            if record.rec_type == 'response':
                # check that the response has http headers
                if record.http_headers is not None:
                    # get the requested header, e.g. WARC-Record-ID
                    record_id = record.rec_headers.get_header(record_attribute)
                    # clean up the HTML using BeautifulSoup and keep only
                    # the visible text nodes
                    html = record.content_stream().read()
                    soup = BeautifulSoup(html, "html5lib")
                    data = soup.findAll(text=True)
                    result = filter(visible, data)
                    result2 = ' '.join(result)
                    result2 = ' '.join(result2.split())
                    # Build up the resulting list.
                    html_pages_array.append((record_id, result2))
    except Exception:
        print("Something went wrong with the archive entry")
    return html_pages_array
def warcToText(url):
    # request the url/warc.gz file
    resp = requests.get(url, stream=True)
    fail = 0
    succeed = 0
    html_content_types = (
        'text/html',
        'text/html; charset=UTF-8',
        'text/html; charset=utf-8',
        'text/html; charset=ISO-8859-1',
        'charset=iso-8859-1',
    )
    # iterate through the archive
    for record in ArchiveIterator(resp.raw, arc2warc=True):
        # only response records carry html pages
        if record.rec_type == 'response':
            # check that the response has http headers
            if record.http_headers is not None:
                # only handle records with one of the expected content types
                if record.http_headers.get_header('Content-Type') in html_content_types:
                    try:
                        html = record.content_stream().read()
                        # from html to plain text
                        html_parse = html5lib.parseFragment(html)
                        s = ''.join(html_parse.itertext())
                        print(s)
                        succeed += 1
                    except Exception:
                        fail += 1
                        continue
    print('fail: %s' % fail)
    print('succeed: %s' % succeed)
def handleOneSegment(warc_file_path, site_list, is_fake=1):
    connection = pymysql.connect(host='localhost', port=8889, user='******',
                                 password='******', db='gingko',
                                 cursorclass=pymysql.cursors.DictCursor)
    with open(warc_file_path, 'rb') as f:
        for record in ArchiveIterator(f):
            if record.rec_type == 'response':
                headers = record.http_headers.headers
                content_type = ""
                for h in headers:
                    if h[0] == 'Content-Type':
                        content_type = h[1]
                        break
                if not content_type.startswith("text/html"):
                    continue
                html = record.content_stream().read().decode("cp437")
                rec_headers = record.rec_headers.headers
                for h in rec_headers:
                    if h[0] == 'WARC-Target-URI':
                        if h[1].startswith("http://") or h[1].startswith("https://"):
                            site = matchSite(site_list, h[1])
                            if site:
                                storeInSQL(connection, site, headers,
                                           rec_headers, is_fake, html)
    connection.close()
def download_row(row, language, prefix):
    """
    This method gets the filtered WARC result.

    row: row object, as returned by the CC index. Contains filename, length, offset
    language: language currently used, not strictly necessary but useful for the data structure
    prefix: prefix of the S3 dataset, usually https://commoncrawl.s3.amazonaws.com/
    """
    url = prefix + row.filename
    # Strip infrequent trailing characters; the CC data sometimes
    # (but not always) contains a comma.
    url = url.replace(',', '').replace('}', '')
    offset = int(row.offset.replace(',', ''))
    end = offset + int(row.length.replace(',', ''))  # end = offset + length
    headers = {"Range": "bytes={}-{}".format(offset, end)}
    try:
        # Download the WARC slice for this specific record
        resp = requests.get(url, headers=headers, stream=True)
        for record in ArchiveIterator(resp.raw, arc2warc=True):
            try:
                item = FilteredItem(Item(record.content_stream().read()))
                if not item.filter_out:
                    if item.to_detect == '':
                        raise Exception('Empty item %s %s %s'
                                        % (item.filter_out, item.to_detect, url))
                    return language, item.to_detect
            except Exception as e:
                logging.debug('Skipping record %s, got exception %s', record, e)
    except Exception as e:
        logging.info('Warc Error: %s', e)
    return
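# The Range-header technique above reduces to a short, self-contained sketch:
# given a (filename, offset, length) triple from the Common Crawl index, fetch
# just that one record. The helper name and default prefix are illustrative.
import requests
from warcio.archiveiterator import ArchiveIterator

def fetch_single_record(filename, offset, length,
                        prefix='https://commoncrawl.s3.amazonaws.com/'):
    # A gzipped WARC member can be sliced out by byte range and parsed alone.
    byte_range = 'bytes={}-{}'.format(offset, offset + length - 1)
    resp = requests.get(prefix + filename,
                        headers={'Range': byte_range}, stream=True)
    return next(iter(ArchiveIterator(resp.raw)))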
def extract_record(cmd):
    with open(cmd.filename, 'rb') as fh:
        fh.seek(int(cmd.offset))
        it = iter(ArchiveIterator(fh))
        record = next(it)

        try:
            stdout_raw = sys.stdout.buffer
        except AttributeError:
            stdout_raw = sys.stdout

        if cmd.payload:
            stream = record.content_stream()
            buf = stream.read(65536)
            while buf:
                stdout_raw.write(buf)
                buf = stream.read(65536)
        else:
            stdout_raw.write(record.rec_headers.to_bytes())
            if record.http_headers:
                stdout_raw.write(record.http_headers.to_bytes())
            if not cmd.headers:
                buf = record.raw_stream.read(65536)
                while buf:
                    stdout_raw.write(buf)
                    buf = record.raw_stream.read(65536)
def test_log ():
    logger = Logger ()
    with NamedTemporaryFile() as fd:
        with WarcHandler (fd, logger) as handler:
            warclogger = WarcHandlerConsumer (handler)
            logger.connect (warclogger)
            golden = []

            assert handler.log.tell () == 0
            golden.append (logger.info (foo=1, bar='baz', encoding='äöü⇔ΓΨ'))
            assert handler.log.tell () != 0

            handler.maxLogSize = 0
            golden.append (logger.info (bar=1, baz='baz'))
            # should flush the log
            assert handler.log.tell () == 0

        fd.seek (0)
        for it in ArchiveIterator (fd):
            headers = it.rec_headers
            assert headers['warc-type'] == 'metadata'
            assert 'warc-target-uri' not in headers
            assert headers['x-crocoite-type'] == 'log'
            assert headers['content-type'] == f'application/json; charset={handler.logEncoding}'

            while True:
                l = it.raw_stream.readline ()
                if not l:
                    break
                data = json.loads (l.strip ())
                assert data == golden.pop (0)
def download_url(self, url):
    text = None
    reqv_resp_pair = self._internal_url_index.get(url)
    if reqv_resp_pair is not None:
        # Only need the offset of the response part
        offset = reqv_resp_pair[1][0]
        # Can not be cached as we also want to write it out to the new archive!
        self._stream.seek(offset)
        record = next(iter(ArchiveIterator(self._stream,
                                           check_digests=self._check_digest)))
        data = record.content_stream().read()
        assert len(data) > 0
        enc = record.rec_headers.get_header('WARC-X-Detected-Encoding', 'UTF-8')
        text = data.decode(enc, 'ignore')
    else:
        self._logger.log('CRITICAL', url, 'URL not found in WARC!', sep='\t')
    return text
def read_memento(self, murl=None):
    """
    This function is for reading memento content.

    Parameters:
        murl (str): URI-M

    Returns:
        (str): Content on success and None on failure
    """
    mpath = self.lookup_memento(murl)
    response = Utils.get_murl_info(murl, self.__thandle)
    if mpath:
        if self.__constants.WARC_EXT in mpath:
            try:
                with open(mpath, 'rb') as stream:
                    for record in ArchiveIterator(stream):
                        if record.rec_type == 'response':
                            content_length = int(record.rec_headers.get_header('Content-Length'))
                            if self.__config.debug:
                                sys.stdout.write(str(murl["uri"]) + " Content Size: " +
                                                 str(content_length) + "\n")
                            # Discard suspiciously small captures: pre-2009
                            # mementos under 1 kB and post-2020 mementos
                            # under 100 kB are treated as failures.
                            if (int(response["timestamp"]) < 20090101000000 and content_length < 1000) or \
                               (int(response["timestamp"]) > 20200101000000 and content_length < 100000):
                                return None
                            else:
                                return record.content_stream().read()
            except Exception as e:
                sys.stderr.write("Memento Read Error: " + str(e) + "\n")
        elif ".html" in mpath:
            try:
                with open(mpath, "r") as stream:
                    return stream.read()
            except Exception as e:
                sys.stderr.write("Memento Read Error: " + str(e) + "\n")
    return None
def test_different_payload(writer):
    """ Duplicate URL, but different payload """
    records = []
    for i in range (2):
        httpHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True)
        warcHeaders = {}
        record = writer.create_warc_record ('http://example.com/', 'request',
                payload=BytesIO(b'foobar'),
                warc_headers_dict=warcHeaders, http_headers=httpHeaders)
        records.append (record)

        httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
        record = writer.create_warc_record ('http://example.com/', 'response',
                payload=BytesIO(f'data{i}'.encode ('utf8')),
                warc_headers_dict=warcHeaders, http_headers=httpHeaders)
        records.append (record)

    for r in records:
        writer.write_record (r)

    output = NamedTemporaryFile()
    mergeWarc ([writer.out.name], output)

    output.seek(0)
    recordsEqual (makeGolden (writer, records), ArchiveIterator (output))
def test_unmodified(writer):
    """ Single request/response pair, no revisits """
    records = []

    httpHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True)
    warcHeaders = {}
    record = writer.create_warc_record ('http://example.com/', 'request',
            payload=BytesIO(b'foobar'),
            warc_headers_dict=warcHeaders, http_headers=httpHeaders)
    records.append (record)

    httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
    record = writer.create_warc_record ('http://example.com/', 'response',
            payload=BytesIO(b'data'),
            warc_headers_dict=warcHeaders, http_headers=httpHeaders)
    records.append (record)

    for r in records:
        writer.write_record (r)

    output = NamedTemporaryFile()
    mergeWarc ([writer.out.name], output)

    output.seek(0)
    recordsEqual (makeGolden (writer, records), ArchiveIterator (output))
def test_resp_revisit_same_url(writer):
    """ Duplicate record for the same URL, creates a revisit """
    records = []
    for i in range (2):
        httpHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True)
        warcHeaders = {}
        record = writer.create_warc_record ('http://example.com/', 'request',
                payload=BytesIO(b'foobar'),
                warc_headers_dict=warcHeaders, http_headers=httpHeaders)
        records.append (record)

        httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
        record = writer.create_warc_record ('http://example.com/', 'response',
                payload=BytesIO(b'data'),
                warc_headers_dict=warcHeaders, http_headers=httpHeaders)
        records.append (record)

    for r in records:
        writer.write_record (r)

    dup = records.pop ()
    ref = records[1]
    records.append (makeRevisit (writer, ref, dup))

    output = NamedTemporaryFile()
    mergeWarc ([writer.out.name], output)

    output.seek(0)
    recordsEqual (makeGolden (writer, records), ArchiveIterator (output))
def process_warc_archive(warc_path):
    sentry_sdk.init(
        "https://[email protected]/1409316")
    start = time.time()
    processed_records = 0
    ignored_records = 0
    match_stats = dict()

    fs = s3fs.S3FileSystem(anon=True)
    warc_file = fs.open('commoncrawl/%s' % warc_path, 'rb')

    for i, record in enumerate(ArchiveIterator(warc_file, arc2warc=True)):
        _should_process_record, data = should_process_record(record)
        if not _should_process_record:
            ignored_records += 1
            continue
        headers, body = data
        process_record(warc_path, record, headers, body, match_stats)
        processed_records += 1

        if i % REPORT_STATUS_EVERY == 0:
            report_status(i, start, processed_records, ignored_records, warc_path)

    spent = time.time() - start
    return warc_path, spent, processed_records, ignored_records, match_stats
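# The same anonymous S3 read, stripped of the surrounding bookkeeping; a
# minimal sketch using s3fs, with a hypothetical Common Crawl WARC path.
import s3fs
from warcio.archiveiterator import ArchiveIterator

fs = s3fs.S3FileSystem(anon=True)
with fs.open('commoncrawl/crawl-data/.../example.warc.gz', 'rb') as f:  # hypothetical path
    for record in ArchiveIterator(f):
        if record.rec_type == 'response':
            body = record.content_stream().read()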
def _read_first_response(self, filename):
    with open(get_test_file(filename), 'rb') as fh:
        for record in ArchiveIterator(fh):
            if record.rec_type == 'response':
                return record.content_stream().read()
            else:
                # drain non-response records before moving on
                record.content_stream().read()
def process_warcs(self, id_, iterator):
    s3pattern = re.compile('^s3://([^/]+)/(.+)')
    base_dir = os.path.abspath(os.path.dirname(__file__))

    # S3 client (not thread-safe, initialize outside parallelized loop)
    no_sign_request = botocore.client.Config(
        signature_version=botocore.UNSIGNED)
    s3client = boto3.client('s3', config=no_sign_request)

    for uri in iterator:
        self.warc_input_processed.add(1)
        if uri.startswith('s3://'):
            self.get_logger().info('Reading from S3 {}'.format(uri))
            s3match = s3pattern.match(uri)
            if s3match is None:
                self.get_logger().error("Invalid S3 URI: " + uri)
                continue
            bucketname = s3match.group(1)
            path = s3match.group(2)
            warctemp = TemporaryFile(mode='w+b', dir=self.args.local_temp_dir)
            try:
                s3client.download_fileobj(bucketname, path, warctemp)
            except botocore.client.ClientError as exception:
                self.get_logger().error('Failed to download {}: {}'.format(
                    uri, exception))
                self.warc_input_failed.add(1)
                warctemp.close()
                continue
            warctemp.seek(0)
            stream = warctemp
        elif uri.startswith('hdfs://'):
            self.get_logger().error("HDFS input not implemented: " + uri)
            continue
        else:
            self.get_logger().info('Reading local stream {}'.format(uri))
            if uri.startswith('file:'):
                uri = uri[5:]
            uri = os.path.join(base_dir, uri)
            try:
                stream = open(uri, 'rb')
            except IOError as exception:
                self.get_logger().error('Failed to open {}: {}'.format(
                    uri, exception))
                self.warc_input_failed.add(1)
                continue

        no_parse = (not self.warc_parse_http_header)
        try:
            for record in ArchiveIterator(stream, no_record_parse=no_parse):
                for res in self.process_record(record):
                    yield res
                self.records_processed.add(1)
        except ArchiveLoadFailed as exception:
            self.warc_input_failed.add(1)
            self.get_logger().error('Invalid WARC: {} - {}'.format(
                uri, exception))
        finally:
            stream.close()
def process(self, item):
    filename_in = '%(item_dir)s/%(warc_file_base)s.warc.gz' % item
    filename_out = '%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz' % item
    with open(filename_in, 'rb') as file_in:
        with open(filename_out, 'wb') as file_out:
            writer = WARCWriter(filebuf=file_out, gzip=True)
            for record in ArchiveIterator(file_in):
                if record.rec_headers.get_header('WARC-Type') == 'response':
                    record_url = record.rec_headers.get_header('WARC-Target-URI')
                    record_digest = record.rec_headers.get_header('WARC-Payload-Digest')
                    ia_record = self.ia_available(record_url, record_digest)
                    # print(ia_record)
                    if not ia_record:
                        writer.write_record(record)
                    else:
                        print('Found duplicate, writing revisit record.')
                        writer.write_record(
                            self.revisit_record(writer, record, ia_record))
                else:
                    writer.write_record(record)
def test_capture_to_temp_file_append(self):
    full_path = os.path.join(self.temp_dir, 'example.warc.gz')

    url = 'http://localhost:{0}/get?foo=bar'.format(self.port)
    with capture_http(full_path):
        res = requests.get(url)

    with capture_http(full_path):
        res = requests.get(url)

    with open(full_path, 'rb') as stream:
        ai = ArchiveIterator(stream)

        # response
        response = next(ai)
        assert response.rec_type == 'response'
        assert response.rec_headers['WARC-Target-URI'] == url

        # request
        request = next(ai)
        assert request.rec_type == 'request'
        assert request.rec_headers['WARC-Target-URI'] == url

        # response
        response = next(ai)
        assert response.rec_type == 'response'
        assert response.rec_headers['WARC-Target-URI'] == url

        # request
        request = next(ai)
        assert request.rec_type == 'request'
        assert request.rec_headers['WARC-Target-URI'] == url

    os.remove(full_path)
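# capture_http, as exercised above, also has an import-order requirement worth
# noting: it must be imported before requests so warcio can patch http.client.
# A minimal sketch, with a hypothetical output file name:
from warcio.capture_http import capture_http
import requests  # must be imported after capture_http

with capture_http('example.warc.gz'):
    requests.get('http://example.com/')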
def test_unseekable(self):
    """ Test iterator on unseekable 3-record uncompressed WARC input """
    proc = subprocess.Popen(
        ['cat', get_test_file('example-iana.org-chunked.warc')],
        stdout=subprocess.PIPE)

    with closing(ArchiveIterator(proc.stdout)) as a:
        for record in a:
            assert record.rec_type == 'warcinfo'
            assert a.get_record_offset() == 0
            break

        record = next(a)
        assert record.rec_type == 'response'
        assert a.get_record_offset() == 405

        for record in a:
            assert record.rec_type == 'request'
            assert a.get_record_offset() == 8379
            break

        with pytest.raises(StopIteration):
            record = next(a)

        assert a.record == None
        assert a.reader == None
        assert a.read_to_end() == None

    proc.stdout.close()
    proc.wait()
def process(filename_in, filename_out):
    starttime = datetime.now()
    dedupemiss = 0
    dedupehit = 0
    with open(filename_in, 'rb') as file_in:
        with open(filename_out, 'wb') as file_out:
            writer = WARCWriter(filebuf=file_out, gzip=True)
            for record in ArchiveIterator(file_in):
                if record.rec_headers.get_header('WARC-Type') == 'response':
                    record_url = record.rec_headers.get_header('WARC-Target-URI')
                    record_digest = record.rec_headers.get_header('WARC-Payload-Digest')
                    ia_record = ia_available(record_url, record_digest)
                    if not ia_record:
                        writer.write_record(record)
                        dedupemiss += 1
                    else:
                        print('Found duplicate, writing revisit record.')
                        writer.write_record(
                            revisit_record(writer, record, ia_record))
                        dedupehit += 1
                else:
                    writer.write_record(record)
    print(str(dedupehit) + " Hits")
    print(str(dedupemiss) + " Misses")
    print("took " + str(datetime.now() - starttime) + " to execute")
def test_post_stream(self):
    warc_writer = BufferWARCWriter(gzip=False)

    def nop_filter(request, response, warc_writer):
        assert request
        assert response
        return request, response

    postbuff = BytesIO(b'somedatatopost')

    url = 'http://localhost:{0}/post'.format(self.port)
    with capture_http(warc_writer, nop_filter):
        res = requests.post(url, data=postbuff)

    # response
    ai = ArchiveIterator(warc_writer.get_stream())
    response = next(ai)
    assert response.rec_type == 'response'
    assert response.rec_headers['WARC-Target-URI'] == url
    assert response.rec_headers['WARC-IP-Address'] == '127.0.0.1'

    assert res.json() == json.loads(response.content_stream().read().decode('utf-8'))

    # request
    request = next(ai)
    assert request.rec_type == 'request'
    assert request.rec_headers['WARC-Target-URI'] == url
    assert request.rec_headers['WARC-IP-Address'] == '127.0.0.1'

    data = request.content_stream().read().decode('utf-8')
    assert data == 'somedatatopost'
def test_unseekable_gz(self):
    """ Test iterator on unseekable 3-record gzipped WARC input """
    proc = subprocess.Popen(
        ['cat', get_test_file('example-resource.warc.gz')],
        stdout=subprocess.PIPE)

    with closing(ArchiveIterator(proc.stdout)) as a:
        for record in a:
            assert record.rec_type == 'warcinfo'
            assert a.get_record_offset() == 0
            break

        record = next(a)
        assert record.rec_type == 'warcinfo'
        assert a.get_record_offset() == 361

        for record in a:
            assert record.rec_type == 'resource'
            assert a.get_record_offset() == 802
            break

        with pytest.raises(StopIteration):
            record = next(a)

        assert a.record == None
        assert a.reader == None
        assert a.read_to_end() == None

    proc.stdout.close()
    proc.wait()
def process(self, item):
    digests = {}
    input_filename = '%(item_dir)s/%(warc_file_base)s.warc.gz' % item
    output_filename = '%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz' % item
    with open(input_filename, 'rb') as f_in, \
            open(output_filename, 'wb') as f_out:
        writer = WARCWriter(filebuf=f_out, gzip=True)
        for record in ArchiveIterator(f_in):
            url = record.rec_headers.get_header('WARC-Target-URI')
            if url is not None and url.startswith('<'):
                url = re.search('^<(.+)>$', url).group(1)
                record.rec_headers.replace_header('WARC-Target-URI', url)
            if record.rec_headers.get_header('WARC-Type') == 'response':
                digest = record.rec_headers.get_header('WARC-Payload-Digest')
                if digest in digests:
                    writer.write_record(
                        self._record_response_to_revisit(writer, record,
                                                         digests[digest])
                    )
                else:
                    digests[digest] = (
                        record.rec_headers.get_header('WARC-Record-ID'),
                        record.rec_headers.get_header('WARC-Date'),
                        record.rec_headers.get_header('WARC-Target-URI')
                    )
                    writer.write_record(record)
            elif record.rec_headers.get_header('WARC-Type') == 'warcinfo':
                record.rec_headers.replace_header('WARC-Filename', output_filename)
                writer.write_record(record)
            else:
                writer.write_record(record)
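# The revisit_record()/_record_response_to_revisit() helpers in the dedup
# snippets above are project-specific, but warcio's writer can build a revisit
# record directly; a minimal sketch with illustrative example values:
from warcio.warcwriter import BufferWARCWriter

writer = BufferWARCWriter(gzip=True)
revisit = writer.create_revisit_record(
    'http://example.com/',                    # URI of the duplicate capture
    'sha1:2WAXX5NUWNNCS2BDKCO5OVDQBJVNKIVV',  # payload digest (example value)
    refers_to_uri='http://example.com/',      # URI of the original capture
    refers_to_date='2020-01-01T00:00:00Z')    # date of the original capture
writer.write_record(revisit)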
def _find_first_by_type(self, filename, match_type, **params):
    with open(get_test_file(filename), 'rb') as fh:
        with closing(ArchiveIterator(fh, **params)) as a:
            for record in a:
                if record.rec_type == match_type:
                    yield record
                    break
def read_warc_archive(self, archive_path):
    with open(archive_path, 'rb') as stream:
        for record in ArchiveIterator(stream):
            if record.rec_type == 'response':
                try:
                    parser = BeautifulSoup(record.content_stream().read(),
                                           features="html.parser")
                except Exception:
                    continue

                links = parser.find_all("a")
                if links:
                    for link in links:
                        href = link.attrs.get("href")
                        if href is not None:
                            if self.domain in href and href.startswith("http"):
                                path = urlparse(href).path
                                domain_link = self.proper_domain + path
                                self.data.append({
                                    '{0}_link'.format(self.domain_name): domain_link,
                                    'reference_link': record.rec_headers.get_header(
                                        'WARC-Target-URI'),
                                    'warc_date': dateutil.parser.parse(
                                        record.rec_headers.get_header('WARC-Date'))
                                })
def test_iterator(self):
    """ Test iterator semantics on 3-record WARC """
    with open(get_test_file('example-iana.org-chunked.warc'), 'rb') as fh:
        with closing(ArchiveIterator(fh)) as a:
            for record in a:
                assert record.rec_type == 'warcinfo'
                assert a.get_record_offset() == 0
                assert record.digest_checker.passed is None
                assert len(record.digest_checker.problems) == 0
                break

            record = next(a)
            assert record.rec_type == 'response'
            assert a.get_record_offset() == 405
            assert record.digest_checker.passed is None
            assert len(record.digest_checker.problems) == 0

            for record in a:
                assert record.rec_type == 'request'
                assert a.get_record_offset() == 8379
                assert record.digest_checker.passed is None
                assert len(record.digest_checker.problems) == 0
                break

            with pytest.raises(StopIteration):
                record = next(a)

            assert a.record == None
            assert a.reader == None
            assert a.read_to_end() == None
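# The offset bookkeeping checked by the tests above is what makes it possible
# to build a simple record index; a minimal sketch over a hypothetical file,
# using get_record_offset() the same way the tests do:
from warcio.archiveiterator import ArchiveIterator

index = []
with open('example.warc.gz', 'rb') as fh:
    it = ArchiveIterator(fh)
    for record in it:
        url = record.rec_headers.get_header('WARC-Target-URI')
        index.append((url, it.get_record_offset()))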
def __process_warc_gz_file(self, path_name):
    """
    Iterates all transactions in one WARC file and for each transaction tries
    to extract an article object. Afterwards, each article is checked against
    the filter criteria and if all are passed, the function
    on_valid_article_extracted is invoked with the article object.
    :param path_name:
    :return:
    """
    counter_article_total = 0
    counter_article_passed = 0
    counter_article_discarded = 0
    start_time = time.time()

    def read_record(record):
        record.read_stream = record.raw_stream.read()
        return record

    def delete_raw_stream(record):
        del record.raw_stream
        return record

    with open(path_name, 'rb') as stream:
        warc_records = list(map(read_record, ArchiveIterator(stream)))
    self.__logger.info('Extracting {} records'.format(len(warc_records)))
    warc_records = list(map(delete_raw_stream, warc_records))
    self.__callback_on_article_extracted(warc_records)
def proxy():
    iapath = request.args['iapath']
    rng = request.args['range']
    item, fn = iapath.split('/')
    meta = requests.get(f"https://archive.org/metadata/{item}").json()
    r = requests.get(f"https://{meta['d1']}{meta['dir']}/{fn}",
                     headers={"Range": f"bytes={rng}"}, stream=True)
    for record in ArchiveIterator(r.raw, arc2warc=True):
        if record.rec_type == 'response':
            def stream():
                body = record.content_stream()
                buf = body.read(8192)
                while buf:
                    yield buf
                    buf = body.read(8192)
                r.close()
            return Response(
                stream(),
                mimetype=record.http_headers.get_header('Content-Type'))
    return abort(404)
def parse_uploaded(self, stream, expected_size):
    """Parse WARC archive.

    :param stream: file object
    :param int expected_size: expected WARC archive size

    :returns: list of recordings (indices)
    :rtype: list
    """
    arciterator = ArchiveIterator(stream,
                                  no_record_parse=True,
                                  verify_http=True,
                                  block_size=BLOCK_SIZE)
    infos = []

    last_indexinfo = None
    indexinfo = None
    is_first = True
    remote_archives = None

    for record in arciterator:
        warcinfo = None
        if record.rec_type == 'warcinfo':
            try:
                warcinfo = self.parse_warcinfo(record)
            except Exception as e:
                print('Error Parsing WARCINFO')
                traceback.print_exc()

        elif remote_archives is not None:
            source_uri = record.rec_headers.get('WARC-Source-URI')
            if source_uri:
                if self.wam_loader:
                    res = self.wam_loader.find_archive_for_url(source_uri)
                    if res:
                        remote_archives.add(res[2])

        arciterator.read_to_end(record)

        if last_indexinfo:
            last_indexinfo['offset'] = arciterator.member_info[0]
            last_indexinfo = None

        if warcinfo and 'json-metadata' in warcinfo:
            self.add_index_info(infos, indexinfo, arciterator.member_info[0])

            indexinfo = warcinfo.get('json-metadata')
            indexinfo['offset'] = None

            if 'title' not in indexinfo:
                indexinfo['title'] = 'Uploaded Recording'

            if 'type' not in indexinfo:
                indexinfo['type'] = 'recording'

            indexinfo['ra'] = set()
            remote_archives = indexinfo['ra']

            last_indexinfo = indexinfo
        elif is_first:
            indexinfo = {'type': 'recording',
                         'title': 'Uploaded Recording',
                         'offset': 0,
                        }

        if is_first and warcinfo and 'software' in warcinfo:
            indexinfo['warcinfo:software'] = warcinfo['software']
            indexinfo['warcinfo:datetime'] = record.rec_headers.get('WARC-Date')

        is_first = False

    if indexinfo:
        self.add_index_info(infos, indexinfo, stream.tell())

    # if anything is left over, likely due to a WARC error, consume the remainder
    if stream.tell() < expected_size:
        while True:
            buff = stream.read(8192)
            if not buff:
                break

    return infos