def _parse_line(self, line=""):
    """Parses single line of a CDX file and returns selected and derived
    attributes in a namedtuple."""
    segs = line.strip().split(" ")
    if len(segs) > 3:
        url = urlparse(segs[2])
        dom = tldextract.extract(segs[2])
        Segments = namedtuple(
            "Segments", "scheme, host, domain, tld, surt, uri, time, mime")
        return Segments(url.scheme, url.netloc, surt(dom.registered_domain),
                        surt(dom.suffix), surt(segs[0]), segs[0], segs[1],
                        segs[3])
def test_surt_return_type(burl):
    """surt.surt() returns the same type of string object (i.e. returns a
    unicode string for unicode string input, and bytes for bytes).

    Note this behavior may change in future versions. This test is for
    testing compatibility until that happens.
    """
    assert isinstance(burl, bytes)
    b = surt.surt(burl)
    assert type(b) is type(burl)

    uurl = burl.decode('ascii')
    u = surt.surt(uurl)
    assert type(u) is type(uurl)
def timestamp_simhash(redis_db, url, timestamp):
    """Get stored simhash data from Redis for URL and timestamp
    """
    try:
        if url and timestamp:
            results = redis_db.hget(surt(url), timestamp)
            if results:
                return {'simhash': results}
            results = redis_db.hget(surt(url), timestamp[:4])
            if results:
                return {'status': 'error', 'message': 'NO_CAPTURES'}
        return {'status': 'error', 'message': 'CAPTURE_NOT_FOUND'}
    except RedisError as exc:
        logging.error('error loading simhash data for url %s timestamp %s (%s)',
                      url, timestamp, exc)
def getCDXJLinesWithURIR(urir, indexPath=ipwbConfig.getIPWBReplayIndexPath()):
    s = surt.surt(urir, path_strip_trailing_slash_unless_empty=False)

    cdxjLinesWithURIR = []

    cdxjLineIndex = getCDXJLine_binarySearch(s, indexPath, True, True)  # get i

    if cdxjLineIndex is None:
        return []

    cdxjLines = []
    with open(indexPath, 'r') as f:
        cdxjLines = f.read().split('\n')
        baseCDXJLine = cdxjLines[cdxjLineIndex]  # via binsearch

        cdxjLinesWithURIR.append(baseCDXJLine)

    # Get lines before pivot that match surt
    sI = cdxjLineIndex - 1
    while sI >= 0:
        if cdxjLines[sI].split(' ')[0] == s:
            cdxjLinesWithURIR.append(cdxjLines[sI])
        sI -= 1
    # Get lines after pivot that match surt
    sI = cdxjLineIndex + 1
    while sI < len(cdxjLines):
        if cdxjLines[sI].split(' ')[0] == s:
            cdxjLinesWithURIR.append(cdxjLines[sI])
        sI += 1
    return cdxjLinesWithURIR
def showTimeMap(urir, format):
    urir = getCompleteURI(urir)
    s = surt.surt(urir, path_strip_trailing_slash_unless_empty=False)
    indexPath = ipwbUtils.getIPWBReplayIndexPath()

    cdxjLinesWithURIR = getCDXJLinesWithURIR(urir, indexPath)
    tmContentType = ''

    hostAndPort = ipwbUtils.getIPWBReplayConfig()

    tgURI = 'http://{0}:{1}/timegate/{2}'.format(
        hostAndPort[0], hostAndPort[1], urir)

    tm = ''  # Initialize for usage beyond below conditionals
    if format == 'link':
        tm = generateLinkTimeMapFromCDXJLines(
            cdxjLinesWithURIR, s, request.url, tgURI)
        tmContentType = 'application/link-format'
    elif format == 'cdxj':
        tm = generateCDXJTimeMapFromCDXJLines(
            cdxjLinesWithURIR, s, request.url, tgURI)
        tmContentType = 'application/cdxj+ors'

    resp = Response(tm)
    resp.headers['Content-Type'] = tmContentType

    return resp
def verify_block(urim):
    surim = surt.surt(urim)
    status = "FAILED"
    blkct = 0
    t2_b = timeit.default_timer()
    for blk in blkfs:
        blkct += 1
        rec = lookup_in_block(surim, blk)
        if not rec:
            continue
        t3_b = timeit.default_timer()
        t2_diff = t3_b - t2_b
        mfp = generate_current(urim)
        t4_b = timeit.default_timer()
        t3_diff = t4_b - t3_b
        mf = json.load(open(mfp))
        if rec["hash"] == mf["hash"]:
            status = "VERIFIED"
        t4_diff = timeit.default_timer() - t4_b
        break
    return {
        "status": status,
        "blkct": blkct,
        "lookupt": t2_diff,
        "gent": t3_diff,
        "verift": t4_diff
    }
def __init__(self, runres, qrel, qtitles):
    with codecs.open(qtitles, "r", encoding='utf-8', errors='ignore') as f:
        # with open(qtitles, 'rb') as f:
        csv_reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in csv_reader:
            # for utf8_row in csv_reader:
            #     row = [x.decode('utf8') for x in utf8_row]
            if len(row) < 2:
                continue
            self.qid_title_dict[row[0]] = row[1]
        f.close()

    with codecs.open(qrel, "r", encoding='utf-8', errors='ignore') as f:
        # with codecs.open(qrel, "rb") as f:
        csv_reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in csv_reader:
            # for utf8_row in csv_reader:
            # only select relevant document-query pairs
            # row = [x.decode('utf8') for x in utf8_row]
            # if row[3] == '1':
            #     self.doc_query_pairs.append((row[2], row[0]))
            # if int(row[1]) <= 100:
            if len(row) < 5:
                continue
            self.doc_query_pairs.append((surt(row[4]), row[0]))
            # TODO: for now one-to-one query-url pair
            # self.doc_query_dict[surt(row[4])] = row[0]
        f.close()

    self._runRes = runres
def get_pseudo_rel_qd_bing(self, top_k):
    """
    query-doc relevance grade
    query_id \t rank \t title \t description \t url \t query_id (tab delimiter)
    :param top_k: top k documents
    :return:
    """
    d = []
    q = []
    with codecs.open(self._runRes, "r", encoding='utf-8', errors='ignore') as f:
        # with open(self._runRes, "rb") as f:
        next(f)  # skip the first line
        csv_reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in csv_reader:
            # for utf8_row in csv_reader:
            #     row = [x.decode('utf8') for x in utf8_row]
            if len(row) <= 1:
                print(row)
                continue
            if int(row[1]) <= top_k:
                # urls from Bing are not normalized yet
                d.append(surt(row[4]))
                # get Bing rank as label
                # qid - rank
                q.append([row[0], row[1]])
        f.close()
    return d, q
def resolveMemento(urir, datetime):
    """ Request a URI-R at a supplied datetime from the CDXJ """
    urir = getCompleteURI(urir)

    if ipwbUtils.isLocalHosty(urir):
        urir = urir.split('/', 4)[4]
    s = surt.surt(urir, path_strip_trailing_slash_unless_empty=False)
    indexPath = ipwbUtils.getIPWBReplayIndexPath()

    print('Getting CDXJ Lines with the URI-R {0} from {1}'
          .format(urir, indexPath))
    cdxjLinesWithURIR = getCDXJLinesWithURIR(urir, indexPath)

    closestLine = getCDXJLineClosestTo(datetime, cdxjLinesWithURIR)

    if closestLine is None:
        msg = '<h1>ERROR 404</h1>'
        msg += 'No capture found for {0} at {1}.'.format(urir, datetime)

        return Response(msg, status=404)

    uri = unsurt(closestLine.split(' ')[0])
    newDatetime = closestLine.split(' ')[1]

    linkHeader = getLinkHeaderAbbreviatedTimeMap(urir, newDatetime)

    return (newDatetime, linkHeader, uri)
def memento_data_for_url(request, url, qs=None, hash=None):
    from perma.models import Link  # noqa
    try:
        canonicalized = surt.surt(url)
    except ValueError:
        return {}
    mementos = [{
        'uri': memento_url(request, link),
        'datetime': link.creation_timestamp,
    } for link in Link.objects.visible_to_memento().filter(
        submitted_url_surt=canonicalized).order_by('creation_timestamp')]
    if not mementos:
        return {}
    return {
        'self': request.build_absolute_uri(),
        'original_uri': url,
        'timegate_uri': timegate_url(request, url),
        'timemap_uri': {
            'json_format': timemap_url(request, url, 'json'),
            'link_format': timemap_url(request, url, 'link'),
            'html_format': timemap_url(request, url, 'html'),
        },
        'mementos': {
            'first': mementos[0],
            'last': mementos[-1],
            'list': mementos,
        }
    }
def show_timemap(urir, format):
    urir = compile_target_uri(urir, request.query_string)
    s = surt.surt(urir, path_strip_trailing_slash_unless_empty=False)
    index_path = ipwb_utils.get_ipwb_replay_index_path()

    cdxj_lines_with_urir = get_cdxj_lines_with_urir(urir, index_path)
    tm_content_type = ''

    host_and_port = ipwb_utils.get_ipwb_replay_config()

    tg_uri = f'http://{host_and_port[0]}:{host_and_port[1]}/timegate/{urir}'

    tm = ''  # Initialize for usage beyond below conditionals
    if format == 'link':
        tm = generate_link_timemap_from_cdxj_lines(cdxj_lines_with_urir, s,
                                                   request.url, tg_uri)
        tm_content_type = 'application/link-format'
    elif format == 'cdxj':
        tm = generate_cdxj_timemap_from_cdxj_lines(cdxj_lines_with_urir, s,
                                                   request.url, tg_uri)
        tm_content_type = 'application/cdxj+ors'

    resp = Response(tm)
    resp.headers['Content-Type'] = tm_content_type

    return resp
def canonicalize(url, surt_ordered=True):
    """
    Canonicalize url and convert to surt
    If not in surt ordered mode, convert back to url form
    as surt conversion is currently part of canonicalization

    >>> canonicalize('http://example.com/path/file.html', surt_ordered=True)
    'com,example)/path/file.html'

    >>> canonicalize('http://example.com/path/file.html', surt_ordered=False)
    'example.com/path/file.html'

    >>> canonicalize('urn:some:id')
    'urn:some:id'
    """
    try:
        key = surt.surt(url)
    except Exception as e:  # pragma: no cover
        # doesn't happen with surt from 0.3b
        # urn is already canonical, so just use as-is
        if url.startswith('urn:'):
            return url

        raise UrlCanonicalizeException('Invalid Url: ' + url)

    # if not surt, unsurt the surt to get canonicalized non-surt url
    if not surt_ordered:
        key = unsurt(key)

    return key
def _assemble_entry(self, recorded_url, records):
    if recorded_url.response_recorder:
        if recorded_url.response_recorder.payload_digest.name == "sha1":
            sha1base32 = base64.b32encode(
                recorded_url.response_recorder.payload_digest.digest()
            ).decode("utf-8")
        else:
            self.logger.warn(
                "digest type is %s but big captures table is indexed "
                "by sha1",
                recorded_url.response_recorder.payload_digest.name)
    else:
        digest = hashlib.new("sha1", records[0].content[1])
        sha1base32 = base64.b32encode(digest.digest()).decode("utf-8")

    if (recorded_url.warcprox_meta
            and "captures-bucket" in recorded_url.warcprox_meta):
        bucket = recorded_url.warcprox_meta["captures-bucket"]
    else:
        bucket = "__unspecified__"

    canon_surt = surt.surt(recorded_url.url.decode("utf-8"),
                           trailing_comma=True, host_massage=False,
                           with_scheme=True)

    entry = {
        # id only specified for rethinkdb partitioning
        "id": "{} {}".format(
            canon_surt[:20], records[0].id.decode("utf-8")[10:-1]),
        "abbr_canon_surt": canon_surt[:150],
        "canon_surt": canon_surt,
        "timestamp": recorded_url.timestamp.replace(
            tzinfo=rethinkstuff.UTC),
        "url": recorded_url.url.decode("utf-8"),
        "offset": records[0].offset,
        "filename": os.path.basename(records[0].warc_filename),
        "warc_type": records[0].type.decode("utf-8"),
        "warc_id": records[0].id.decode("utf-8"),
        "sha1base32": sha1base32,
        "content_type": recorded_url.mimetype,
        "response_code": recorded_url.status,
        "http_method": recorded_url.method,
        "bucket": bucket,
        "record_length": records[0].length,  # compressed (or not) length of
                                             # warc record including record
                                             # headers
        "wire_bytes": recorded_url.size,  # count of bytes transferred over
                                          # the wire, including http headers
                                          # if any
    }

    if recorded_url.warcprox_meta:
        if "dedup-ok" in recorded_url.warcprox_meta:
            entry["dedup_ok"] = recorded_url.warcprox_meta["dedup-ok"]
        if "captures-table-extra-fields" in recorded_url.warcprox_meta:
            extras = recorded_url.warcprox_meta["captures-table-extra-fields"]
            for extra_field in extras:
                entry[extra_field] = extras[extra_field]

    return entry
def test_media_capture_in_iframes(self):
    settings.ENABLE_AV_CAPTURE = True
    target_folder = self.org_user.root_folder
    obj = self.successful_post(self.list_url,
                               data={
                                   'url': self.server_url + "/test_media_outer.html",
                                   'folder': target_folder.pk,
                               },
                               user=self.org_user)

    # verify that all images in src and srcset were found and captured
    expected_captures = (
        # test_media_a.html
        "test.wav", "test2.wav",
        # test_media_b.html
        "test.mp4", "test2.mp4",
        # test_media_c.html
        "test.swf", "test2.swf", "test3.swf",
        "test1.jpg", "test2.png", "test_fallback.jpg",
        "wide1.png", "wide2.png", "narrow.png"
    )
    failures = []
    for expected_capture in expected_captures:
        try:
            cdxline = CDXLine.objects.get(
                urlkey=surt(self.server_url + "/" + expected_capture),
                link_id=obj['guid'])
            if cdxline.parsed['status'] != '200':
                failures.append("%s returned HTTP status %s." % (
                    expected_capture, cdxline.parsed['status']))
        except CDXLine.DoesNotExist:
            failures.append("%s not captured." % expected_capture)
    self.assertFalse(bool(failures),
                     "Failures in fetching media from iframes: %s" % failures)
def handle_results(redis_db, timestamps_to_fetch, url, snapshots_per_page,
                   page=None):
    """Utility method used by `year_simhash`
    """
    available_simhashes = []
    if page:
        number_of_pages = ceil(len(timestamps_to_fetch) / snapshots_per_page)
        if page > number_of_pages:
            page = number_of_pages
        if number_of_pages > 0:
            timestamps_to_fetch = \
                timestamps_to_fetch[(page - 1) * snapshots_per_page:(page * snapshots_per_page)]
        else:
            number_of_pages = 1
    try:
        results = redis_db.hmget(surt(url), timestamps_to_fetch)
        for i, simhash in enumerate(results):
            available_simhashes.append([str(timestamps_to_fetch[i]), simhash])
        if page:
            available_simhashes.insert(0, ["pages", number_of_pages])
        return available_simhashes
    except RedisError as exc:
        logging.error('cannot handle results for url %s page %d (%s)',
                      url, page, exc)
def getCDXJLinesWithURIR(urir, indexPath):
    """ Get all CDXJ records corresponding to a URI-R """
    if not indexPath:
        indexPath = ipwbUtils.getIPWBReplayIndexPath()

    indexPath = getIndexFileFullPath(indexPath)

    print('Getting CDXJ Lines with {0} in {1}'.format(urir, indexPath))
    s = surt.surt(urir, path_strip_trailing_slash_unless_empty=False)

    cdxjLinesWithURIR = []

    cdxjLineIndex = getCDXJLine_binarySearch(s, indexPath, True, True)  # get i

    if cdxjLineIndex is None:
        return []

    cdxjLines = []
    with open(indexPath, 'r') as f:
        cdxjLines = f.read().split('\n')
        baseCDXJLine = cdxjLines[cdxjLineIndex]  # via binsearch

        cdxjLinesWithURIR.append(baseCDXJLine)

    # Get lines before pivot that match surt
    sI = cdxjLineIndex - 1
    while sI >= 0:
        if cdxjLines[sI].split(' ')[0] == s:
            cdxjLinesWithURIR.append(cdxjLines[sI])
        sI -= 1
    # Get lines after pivot that match surt
    sI = cdxjLineIndex + 1
    while sI < len(cdxjLines):
        if cdxjLines[sI].split(' ')[0] == s:
            cdxjLinesWithURIR.append(cdxjLines[sI])
        sI += 1
    return cdxjLinesWithURIR
def showMementosForURIRs(urir):
    urir = getCompleteURI(urir)
    if ipwbConfig.isLocalHosty(urir):
        urir = urir.split('/', 4)[4]
    s = surt.surt(urir, path_strip_trailing_slash_unless_empty=False)
    indexPath = ipwbConfig.getIPWBReplayIndexPath()

    print('Getting CDXJ Lines with the URI-R {0} from {1}'.format(
        urir, indexPath))
    cdxjLinesWithURIR = getCDXJLinesWithURIR(urir, indexPath)

    if len(cdxjLinesWithURIR) == 1:
        fields = cdxjLinesWithURIR[0].split(' ', 2)
        redirectURI = '/{1}/{0}'.format(unsurt(fields[0]), fields[1])

        return redirect(redirectURI, code=302)

    msg = ''
    if cdxjLinesWithURIR:
        msg += '<p>{0} capture(s) available:</p><ul>'.format(
            len(cdxjLinesWithURIR))

        for line in cdxjLinesWithURIR:
            fields = line.split(' ', 2)
            dt14 = fields[1]
            dtrfc1123 = ipwbConfig.datetimeToRFC1123(fields[1])
            msg += ('<li><a href="/{1}/{0}">{0} at {2}</a></li>'.format(
                unsurt(fields[0]), dt14, dtrfc1123))
        msg += '</ul>'

    return Response(msg)
def generate_surt(self, url):
    if self.RE_NONCHARS.search(url):
        logger.warn("Questionable characters found in URL [%s]" % url)
        return None

    surtVal = surt.surt(url)

    #### WA: ensure SURT has scheme of original URL ------------
    # line_scheme = RE_SCHEME.match(line)  # would allow http and https (and any others)
    line_scheme = 'http://'  # for wayback, all schemes need to be only http
    surt_scheme = self.RE_SCHEME.match(surtVal)

    if line_scheme and not surt_scheme:
        if re.match(r'\(', surtVal):
            # surtVal = line_scheme.group(0) + surtVal
            surtVal = line_scheme + surtVal
            logger.debug("Added scheme [%s] to surt [%s]" % (line_scheme, surtVal))
        else:
            # surtVal = line_scheme.group(0) + '(' + surtVal
            surtVal = line_scheme + '(' + surtVal
            # logger.debug("Added scheme [%s] and ( to surt [%s]" % (line_scheme, surtVal))

    surtVal = re.sub(r'\)/$', ',', surtVal)

    return surtVal
def regenerate_urlkeys(urlkey_prefix='file'):
    """ Rewrite CDXLine urlkeys using the current version of the surt library. """
    from perma.models import CDXLine
    from surt import surt

    target_cdxlines = CDXLine.objects.all()
    if urlkey_prefix:
        target_cdxlines = target_cdxlines.filter(
            urlkey__startswith=urlkey_prefix)

    for i, cdxline in enumerate(target_cdxlines):
        if not (i % 1000):
            print "%s records done -- next is %s." % (i, cdxline.link_id)
        new_surt = surt(cdxline.parsed['url'])
        if new_surt != cdxline.urlkey:
            try:
                cdxline.raw = cdxline.raw.replace(cdxline.urlkey, new_surt, 1)
            except UnicodeDecodeError:
                print "Skipping unicode for %s" % cdxline.link_id
                continue
            cdxline.urlkey = new_surt
            cdxline.save()
def _dsub(self, url):
    ext = tldextract.extract(url)
    reg_dom = surt(ext.registered_domain)
    if reg_dom[0].isalpha() and ")/" in reg_dom:
        subdom_len = 0
        if ext.subdomain:
            subdom_len = ext.subdomain.count(".") + 1
        return f"{reg_dom}{subdom_len}"
def load_cdx(self, query):
    """
        This function accepts a standard CDX request, except with a GUID instead of date,
        and returns a standard CDX 11 response.
    """
    guid = query.params['guid']
    url = query.url

    # We'll first check the key-value store to see if we cached the lookup for this guid on a previous request.
    # This will be common, since each playback triggers lots of requests for the same .warc file.
    cache_key = guid + '-surts'
    url_key = guid + '-url'
    surt_lookup = django_cache.get(cache_key)
    url = url or django_cache.get(url_key)
    if surt_lookup and url:
        surt_lookup = json.loads(surt_lookup)

    else:
        # nothing in cache; find requested link in database
        try:
            link = Link.objects.select_related().get(pk=guid)
        except Link.DoesNotExist:
            return []

        # cache url, which may be blank if this is the first request
        if not url:
            url = link.submitted_url
            django_cache.set(url_key, url, timeout=60*60)

        # get warc file
        for asset in link.assets.all():
            if '.warc' in asset.warc_capture:
                warc_path = os.path.join(asset.base_storage_path, asset.warc_capture)
                break
        else:
            return []  # no .warc file -- do something to handle this?

        # now we have to get an index of all the URLs in this .warc file
        # first try fetching it from a .cdx file on disk
        cdx_path = warc_path.replace('.gz', '').replace('.warc', '.cdx')
        if not default_storage.exists(cdx_path):
            # there isn't a .cdx file on disk either -- let's create it
            with default_storage.open(warc_path, 'rb') as warc_file, default_storage.open(cdx_path, 'wb') as cdx_file:
                write_cdx_index(cdx_file, warc_file, warc_path, sort=True)

        # now load the URL index from disk and stick it in the cache
        cdx_lines = (line.strip() for line in default_storage.open(cdx_path, 'rb'))
        surt_lookup = dict((key, list(val)) for key, val in
                           groupby(cdx_lines, key=lambda line: line.split(' ', 1)[0]))
        django_cache.set(cache_key, json.dumps(surt_lookup), timeout=60*60)

    # find cdx lines for url
    sorted_url = surt(url)
    if sorted_url in surt_lookup:
        return (str(i) for i in surt_lookup[sorted_url])

    # didn't find requested url in this archive
    return []
def _dsub(self, url):
    ext = tldextract.extract(url)
    urlseg = urlparse("http://" + url)
    reg_dom = surt(ext.registered_domain)
    if reg_dom[0].isalpha() and ")/" in reg_dom:
        subdom_len = 0
        if ext.subdomain:
            subdom_len = ext.subdomain.count(".") + 1
        return "{0}{1}".format(reg_dom, subdom_len)
def run(self, url, year, created):
    """Run Celery Task.
    """
    self.job_id = self.request.id
    self.url = url_fix(url)
    time_started = datetime.now()
    self._log.info('Start calculating simhashes.')
    self.download_errors = 0
    if not self.url:
        self._log.error('did not give url parameter')
        return {'status': 'error', 'info': 'URL is required.'}
    if not year:
        self._log.error('did not give year parameter')
        return {'status': 'error', 'info': 'Year is required.'}
    # fetch captures
    self.update_state(
        state='PENDING',
        meta={'info': 'Fetching %s captures for year %s' % (url, year)})
    resp = self.fetch_cdx(url, year)
    if resp.get('status') == 'error':
        return resp
    captures = resp.get('captures')
    total = len(captures)
    self.seen = dict()
    # calculate simhashes in parallel
    i = 0
    final_results = {}
    for res in self.tpool.map(self.get_calc, captures):
        if not res:
            continue
        (timestamp, simhash) = res
        if simhash:
            final_results[timestamp] = simhash
        if i % 10 == 0:
            self.update_state(state='PENDING', meta={
                'info': 'Processed %d out of %d captures.' % (i, total)
            })
        i += 1

    self._log.info('%d final results for %s and year %s.',
                   len(final_results), self.url, year)
    if final_results:
        try:
            urlkey = surt(self.url)
            self.redis.hmset(urlkey, final_results)
            self.redis.expire(urlkey, self.simhash_expire)
        except RedisError as exc:
            self._log.error('cannot write simhashes to Redis for URL %s',
                            self.url, exc_info=1)

    duration = (datetime.now() - time_started).seconds
    self._log.info('Simhash calculation finished in %.2fsec.', duration)
    return {'duration': str(duration)}
def showTimeMap(urir, format):
    s = surt.surt(urir, path_strip_trailing_slash_unless_empty=False)
    indexPath = ipwbConfig.getIPWBReplayIndexPath()

    cdxjLinesWithURIR = getCDXJLinesWithURIR(urir, indexPath)

    tm = generateTimeMapFromCDXJLines(cdxjLinesWithURIR, s, request.url)
    return Response(tm)
def save_to_redis(self, ts, data):
    try:
        urlkey = surt(self.url)
        self._log.info('save simhash to Redis for timestamp %s urlkey %s',
                       ts, urlkey)
        self.redis_db.hset(urlkey, ts,
                           base64.b64encode(str(data).encode('ascii')))
    except RedisError as exc:
        self._log.error('cannot save simhash to Redis (%s)', exc)
def normalize_url(url, base_url):
    absolute = urljoin(base_url, url)

    # Use SURT to do most normalization, but don't return in SURT format.
    result = surt(absolute, reverse_ipaddr=False, surt=False, with_scheme=True)

    # Use HTTPS for all web URLs. Don't translate other schemes (e.g. FTP).
    if result.startswith('http:'):
        result = f'https:{result[5:]}'

    return result
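# Hedged usage sketch (added for illustration; not part of the original
# sources). It assumes normalize_url() above is in scope together with
# `from urllib.parse import urljoin` and `from surt import surt`. The call
# resolves the relative href against its page URL, canonicalizes it via
# surt(..., surt=False), and upgrades http to https; the exact canonical form
# depends on the installed surt version.
example = normalize_url('../about', 'http://EXAMPLE.com/docs/page.html')
print(example)  # expected to be an absolute https URL, e.g. 'https://example.com/about'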
def __call__(self, url):
    try:
        key = surt.surt(
            url, trailing_comma=True, host_massage=False, with_scheme=True)
        # logging.debug('%s -> %s', url, key)
        return key
    except Exception as e:
        raise pywb.utils.canonicalize.UrlCanonicalizeException(
            'Invalid Url: ' + url)
def _update_ds(self, count, entry):
    """Update data structure after processing a line from the CDX"""
    try:
        suburis = generate_suburis(surt(entry),
                                   max_host_segments=self.max_host_segments,
                                   max_path_segments=self.max_path_segments)
        for s in suburis:
            self._update_record("suburi", s, count)
        # self._update_record("time", entry.time[0:6], entry.surt)
        # self._update_record("mediatype", entry.mime, entry.surt)
    except:
        print("Something went wrong while processing " + entry)
def cdx(request):
    """
        This function handles WARC lookups by our warc server (running in warc_server).
        It accepts a standard CDX request, except with a GUID instead of date, and
        returns a standard CDX 11 response.
        If there's no warc for the requested GUID, or the requested URL isn't stored
        in that WARC, it returns a 404.
    """
    # find requested link and url
    try:
        link = Link.objects.select_related().get(pk=request.POST.get('guid'))
    except Link.DoesNotExist:
        print "COULDN'T FIND LINK"
        raise Http404

    url = request.POST.get('url', link.submitted_url)

    # get warc file
    for asset in link.assets.all():
        if '.warc' in asset.warc_capture:
            warc_path = os.path.join(settings.GENERATED_ASSETS_STORAGE,
                                     asset.base_storage_path,
                                     asset.warc_capture)
            break
    else:
        if settings.USE_WARC_ARCHIVE:
            print "COULDN'T FIND WARC"
            raise Http404  # no .warc file -- do something to handle this
        else:
            warc_path = os.path.join(settings.GENERATED_ASSETS_STORAGE,
                                     asset.base_storage_path,
                                     "archive.warc.gz")

    # get cdx file
    cdx_path = warc_path.replace('.gz', '').replace('.warc', '.cdx')
    try:
        cdx_file = open(cdx_path, 'rb')
    except IOError:
        # if we can't find the CDX file associated with this WARC, create it
        cdx_lines = StringIO.StringIO()
        cdx_writer.CDX_Writer(warc_path, cdx_lines).make_cdx()
        cdx_lines = cdx_lines.getvalue().split("\n")
        with open(cdx_path, 'wb') as cdx_file:
            cdx_file.write("\n".join(sorted(cdx_lines)))
        cdx_file = open(cdx_path, 'rb')

    # find cdx lines for url
    sorted_url = surt.surt(url)
    out = ""
    for line in cdx_file:
        if line.startswith(sorted_url + " "):
            out += line
        elif out:
            # file may contain multiple matching lines in a row; we want to return all of them
            # if we've already found one or more matching lines, and now they're no longer matching, we're done
            break
    if out:
        return HttpResponse(out, content_type="text/plain")

    print "COULDN'T FIND URL"
    raise Http404  # didn't find requested url in .cdx file
def _dpth(self, url):
    ext = tldextract.extract(url)
    urlseg = urlparse(url)
    reg_dom = surt(ext.registered_domain)
    if reg_dom[0].isalpha() and ")/" in reg_dom:
        subdom_len = path_len = 0
        if ext.subdomain:
            subdom_len = ext.subdomain.count(".") + 1
        if urlseg.path:
            path_len = urlseg.path.strip("\n\r/").count("/") + 1
        return f"{reg_dom}{subdom_len}/{path_len}"
def _dpth(self, url): ext = tldextract.extract(url) urlseg = urlparse("http://" + url) reg_dom = surt(ext.registered_domain) if reg_dom[0].isalpha() and ")/" in reg_dom: subdom_len = path_len = 0 if ext.subdomain: subdom_len = ext.subdomain.count(".") + 1 if urlseg.path: path_len = urlseg.path.strip("\n\r/").count("/") + 1 return "{0}{1}/{2}".format(reg_dom, subdom_len, path_len)
def getLinkHeaderAbbreviatedTimeMap(urir, pivotDatetime):
    s = surt.surt(urir, path_strip_trailing_slash_unless_empty=False)
    indexPath = ipwbUtils.getIPWBReplayIndexPath()
    cdxjLinesWithURIR = getCDXJLinesWithURIR(urir, indexPath)

    hostAndPort = ipwbUtils.getIPWBReplayConfig()
    tgURI = 'http://{0}:{1}/timegate/{2}'.format(
        hostAndPort[0], hostAndPort[1], urir)

    tmURI = 'http://{0}:{1}/timemap/link/{2}'.format(
        hostAndPort[0], hostAndPort[1], urir)
    tm = generateLinkTimeMapFromCDXJLines(cdxjLinesWithURIR, s, tmURI, tgURI)

    # Fix base TM relation when viewing abbrev version in Link resp
    tm = tm.replace('rel="self timemap"', 'rel="timemap"')

    # Only one memento in TimeMap
    if 'rel="first last memento"' in tm:
        return tm.replace('\n', ' ').strip()

    tmLines = tm.split('\n')
    for idx, line in enumerate(tmLines):
        if len(re.findall('rel=.*memento"', line)) == 0:
            continue  # Not a memento
        if pivotDatetime in line:
            addBothNextAndPrev = False
            if idx > 0 and idx < len(tmLines) - 1:
                addBothNextAndPrev = True

            if addBothNextAndPrev or idx == 0:
                tmLines[idx + 1] = \
                    tmLines[idx + 1].replace('memento"', 'next memento"')
            if addBothNextAndPrev or idx == len(tmLines) - 1:
                tmLines[idx - 1] = \
                    tmLines[idx - 1].replace('memento"', 'prev memento"')

            break

    # Remove all mementos in abbrev TM that are not:
    #   first, last, prev, next, or pivot
    for idx, line in enumerate(tmLines):
        if len(re.findall('rel=.*memento"', line)) == 0:
            continue  # Not a memento
        if pivotDatetime in line:
            continue

        if len(re.findall('rel=.*(next|prev|first|last)', line)) == 0:
            tmLines[idx] = ''

    return ' '.join(filter(None, tmLines))
def test_should_capture_all_srcset_images(self):
    target_folder = self.org_user.root_folder
    obj = self.successful_post(self.list_url,
                               data={
                                   'url': self.server_url + "/test_media_outer.html",
                                   'folder': target_folder.pk,
                               },
                               user=self.org_user)

    # verify that all images in src and srcset were found and captured
    expected_captures = ("test1.jpg", "test2.png", "test_fallback.jpg",
                         "wide1.png", "wide2.png", "narrow.png")
    for expected_capture in expected_captures:
        self.assertEqual('200', CDXLine.objects.get(
            urlkey=surt(self.server_url + "/" + expected_capture),
            link_id=obj['guid']).parsed['status'])
def test_should_capture_nested_audio_file(self):
    settings.ENABLE_AV_CAPTURE = True
    target_folder = self.org_user.root_folder
    obj = self.successful_post(self.list_url,
                               data={
                                   'url': self.server_url + "/test_wav_outer.html",
                                   'folder': target_folder.pk,
                               },
                               user=self.org_user)

    # verify that embedded /test.* files in iframe were found and captured
    expected_captures = ("test.wav", "test2.wav", "test.mp4", "test2.mp4",
                         "test.swf", "test2.swf", "test3.swf")
    for expected_capture in expected_captures:
        self.assertEqual('200', CDXLine.objects.get(
            urlkey=surt(self.server_url + "/" + expected_capture),
            link_id=obj['guid']).parsed['status'])
def get_massaged_url(self, record, use_precalculated_value=True):
    if use_precalculated_value:
        return self.surt

    if 'warcinfo' == record.type:
        return self.get_original_url(record)
    else:
        url = record.url
        if self.screenshot_mode:
            url = 'http://web.archive.org/screenshot/' + url
        try:
            return surt(url)
        except:
            return self.get_original_url(record)
def cdx(request):
    """
        This function handles WARC lookups by our warc server (running in warc_server).
        It accepts a standard CDX request, except with a GUID instead of date, and
        returns a standard CDX 11 response.
        If there's no warc for the requested GUID, or the requested URL isn't stored
        in that WARC, it returns a 404.
    """
    # find requested link and url
    try:
        link = Link.objects.select_related().get(pk=request.GET.get('guid'))
    except Link.DoesNotExist:
        print "COULDN'T FIND LINK"
        raise Http404

    url = request.GET.get('url', link.submitted_url)

    # get warc file
    for asset in link.assets.all():
        if '.warc' in asset.warc_capture:
            warc_path = os.path.join(asset.base_storage_path, asset.warc_capture)
            break
    else:
        print "COULDN'T FIND WARC"
        raise Http404  # no .warc file -- do something to handle this?

    # get cdx file
    cdx_path = warc_path.replace('.gz', '').replace('.warc', '.cdx')
    if not default_storage.exists(cdx_path):
        # if we can't find the CDX file associated with this WARC, create it
        with default_storage.open(warc_path, 'rb') as warc_file, default_storage.open(cdx_path, 'wb') as cdx_file:
            write_cdx_index(cdx_file, warc_file, warc_path, sort=True)
    cdx_lines = default_storage.open(cdx_path, 'rb')

    # find cdx lines for url
    sorted_url = surt.surt(url)
    out = ""
    for line in cdx_lines:
        if line.startswith(sorted_url + " "):
            out += line
        elif out:
            # file may contain multiple matching lines in a row; we want to return all of them
            # if we've already found one or more matching lines, and now they're no longer matching, we're done
            break
    if out:
        return HttpResponse(out, content_type="text/plain")

    print "COULDN'T FIND URL"
    raise Http404  # didn't find requested url in .cdx file
def _dini(self, url):
    ext = tldextract.extract(url)
    urlseg = urlparse("http://" + url)
    reg_dom = surt(ext.registered_domain)
    if reg_dom[0].isalpha() and ")/" in reg_dom:
        subdom_len = path_len = query_len = 0
        path_init = urlseg.path.strip("\n\r/")[:1]
        if ext.subdomain:
            subdom_len = ext.subdomain.count(".") + 1
        if urlseg.path:
            path_len = urlseg.path.strip("\n\r/").count("/") + 1
        if urlseg.query:
            query_len = urlseg.query.strip("?&").count("&") + 1
        if not path_init.isalnum():
            path_init = "-"
        return "{0}{1}/{2}/{3}/{4}".format(reg_dom, subdom_len, path_len,
                                           query_len, path_init)
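# Hedged worked example (added for illustration; not part of the original
# sources). Assuming surt("example.com") yields "com,example)/", a capture
# with host blog.example.com, path /a/b/c.html and query x=1&y=2 would be
# keyed by the methods above roughly as:
#   _dsub -> "com,example)/1"        (registered-domain SURT + subdomain depth)
#   _dpth -> "com,example)/1/3"      (... + path depth)
#   _dini -> "com,example)/1/3/2/a"  (... + query-param count + path initial)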
def load_cdx(self, url, params={}, parsed_cdx=True, **kwvalues):
    # canonicalize to surt (canonicalization is part of surt conversion)
    try:
        key = surt.surt(url)
    except Exception as e:
        raise wbexceptions.BadUrlException('Bad Request Url: ' + url)

    # if not surt, unsurt the surt to get canonicalized non-surt url
    if not self.surt_ordered:
        key = utils.unsurt(key)

    match_func = binsearch.iter_exact

    params.update(**kwvalues)
    params['output'] = 'raw' if parsed_cdx else 'text'

    return cdxserve.cdx_serve(key, params, self.sources, match_func)
def canonicalize(url, surt_ordered=True):
    """
    Canonicalize url and convert to surt
    If not in surt ordered mode, convert back to url form
    as surt conversion is currently part of canonicalization

    >>> canonicalize('http://example.com/path/file.html', surt_ordered=True)
    'com,example)/path/file.html'

    >>> canonicalize('http://example.com/path/file.html', surt_ordered=False)
    'example.com/path/file.html'
    """
    try:
        key = surt.surt(url)
    except Exception as e:
        raise UrlCanonicalizeException('Invalid Url: ' + url)

    # if not surt, unsurt the surt to get canonicalized non-surt url
    if not surt_ordered:
        key = unsurt(key)

    return key
def getBing(url, outputArray, indexOfOutputArray, verbose=False, **kwargs):
    apiKey = []
    try:
        apiKey_env = os.getenv('CD_Bing_key')
        if apiKey_env is not None:
            logging.debug('cdGetBing: Bing api key detected in environment variable, overwrite local config values.')
            apiKey = apiKey_env
        else:
            fileConfig = open(os.path.dirname(__file__) + "/../config", "r")
            config = fileConfig.read()
            fileConfig.close()

            apiKey = json.loads(config)
            apiKey = apiKey['BingAPIKey']
    except:
        logging.debug('cdGetBing: ', sys.exc_info())
        return ''

    if len(apiKey) == 0:
        logging.info('cdGetBing: apiKey empty')
        return ''
    elif apiKey == 'YourBingSearchAPIKey':
        logging.info('cdGetBing.py: please set Bing search api key in config')
        return ''

    api_key = apiKey
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux i686 (x86_64); rv:2.0b4pre) Gecko/20100812 Minefield/4.0b4pre',
        'Ocp-Apim-Subscription-Key': api_key
    }
    base_url = 'https://api.cognitive.microsoft.com/bing/v5.0/search?q='

    parsedUrl = urllib.parse.urlparse(url)
    if len(parsedUrl.scheme) < 1:
        url = 'http://' + url

    searchUrl = url[7:]
    converted_url = quote(url, safe='')
    url = base_url + converted_url + '&count=10'

    auth = HTTPBasicAuth(api_key, api_key)
    response = requests.get(url, headers=headers)
    json_result = response.json()
    # print json_result

    result = ''
    canonical_search_url = surt(searchUrl)
    for category in json_result:
        if category == 'webPages':
            for page in json_result[category]['value']:
                result_url = surt(page['displayUrl'])
                if result_url == canonical_search_url:
                    result = page['dateLastCrawled']
                    break
        elif category == 'images':
            for page in json_result[category]['value']:
                result_url = surt(page['contentUrl'])
                if result_url == canonical_search_url:
                    result = page['datePublished']
                    break
        elif category == 'news':
            for page in json_result[category]['value']:
                result_url = surt(page['url'])
                if result_url == canonical_search_url:
                    result = page['datePublished']
                    break
        elif category == 'videos':
            for page in json_result[category]['value']:
                result_url = surt(page['hostPageDisplayUrl'])
                if result_url == canonical_search_url:
                    result = page['datePublished']
                    break

        if result != '':
            break

    outputArray[indexOfOutputArray] = result
    kwargs['displayArray'][indexOfOutputArray] = result

    logging.debug('Done Bing')

    return result
def apply_filters(self, wbrequest, matcher):
    """Parse the GUID and find the CDXLine in the DB"""

    guid = matcher.group(1)
    cache_key = guid + '-cdx'
    cached_cdx = django_cache.get(cache_key)
    redirect_matcher = re.compile(r' 30[1-7] ')

    if cached_cdx is None or not wbrequest.wb_url:
        with close_database_connection():
            try:
                # This will filter out links that have user_deleted=True
                link = Link.objects.get(guid=guid)
            except Link.DoesNotExist:
                raise_not_found(wbrequest.wb_url)

            if not wbrequest.wb_url:
                # This is a bare request to /warc/1234-5678/ -- return so we can
                # send a forward to submitted_url in PermaGUIDHandler.
                wbrequest.custom_params['guid'] = guid
                wbrequest.custom_params['url'] = link.submitted_url
                return

            # Legacy archives didn't generate CDXLines during
            # capture so generate them on demand if not found, unless
            # A: the warc capture hasn't been generated OR
            # B: we know other cdx lines have already been generated
            #    and the requested line is simply missing
            lines = list(link.cdx_lines.all())
            if not lines:
                # TEMP: remove after all legacy warcs have been exported
                if not default_storage.exists(link.warc_storage_file()):
                    link.export_warc()

                lines = CDXLine.objects.create_all_from_link(link)

            # build a lookup of all cdx lines for this link indexed by urlkey, like:
            # cached_cdx = {'urlkey1':['raw1','raw2'], 'urlkey2':['raw3','raw4']}
            cached_cdx = defaultdict(list)
            for line in lines:
                cached_cdx[line.urlkey].append(str(line.raw))

            # remove any redirects if we also have a non-redirect capture for the same URL, to prevent redirect loops
            for urlkey, lines in cached_cdx.iteritems():
                if len(lines) > 1:
                    lines_without_redirects = [line for line in lines if not redirect_matcher.search(line)]
                    if lines_without_redirects:
                        cached_cdx[urlkey] = lines_without_redirects

            django_cache.set(cache_key, cached_cdx)

    urlkey = surt(wbrequest.wb_url.url)
    cdx_lines = cached_cdx.get(urlkey)
    if not cdx_lines:
        raise_not_found(wbrequest.wb_url)

    # Store the line for use in PermaCDXSource
    # so we don't need to hit the DB again
    wbrequest.custom_params['lines'] = cdx_lines
    wbrequest.custom_params['guid'] = guid

    # Adds the Memento-Datetime header
    # Normally this is done in MementoReqMixin#_parse_extra
    # but we need the GUID to make the DB query and that
    # isn't parsed from the url until this point
    wbrequest.wb_url.set_replay_timestamp(CDXLine(raw=cdx_lines[0]).timestamp)
def show_uri(path, datetime=None):
    global IPFS_API

    daemonAddress = '{0}:{1}'.format(IPFSAPI_HOST, IPFSAPI_PORT)
    if not ipwbUtils.isDaemonAlive(daemonAddress):
        errStr = ('IPFS daemon not running. '
                  'Start it using $ ipfs daemon on the command-line '
                  ' or from the <a href="/">'
                  'IPWB replay homepage</a>.')
        return Response(errStr, status=503)

    path = getCompleteURI(path)
    cdxjLine = ''
    try:
        surtedURI = surt.surt(
            path, path_strip_trailing_slash_unless_empty=False)
        indexPath = ipwbUtils.getIPWBReplayIndexPath()

        searchString = surtedURI
        if datetime is not None:
            searchString = surtedURI + ' ' + datetime

        cdxjLine = getCDXJLine_binarySearch(searchString, indexPath)
    except Exception as e:
        print(sys.exc_info()[0])
        respString = ('{0} not found :(' +
                      ' <a href="http://{1}:{2}">Go home</a>').format(
            path, IPWBREPLAY_HOST, IPWBREPLAY_PORT)
        return Response(respString)

    if cdxjLine is None:  # Resource not found in archives
        return generateNoMementosInterface(path, datetime)

    cdxjParts = cdxjLine.split(" ", 2)
    jObj = json.loads(cdxjParts[2])
    datetime = cdxjParts[1]

    digests = jObj['locator'].split('/')

    class HashNotFoundError(Exception):
        pass

    payload = None
    header = None
    try:
        def handler(signum, frame):
            raise HashNotFoundError()

        if os.name != 'nt':  # Bug #310
            signal.signal(signal.SIGALRM, handler)
            signal.alarm(10)

        payload = IPFS_API.cat(digests[-1])
        header = IPFS_API.cat(digests[-2])

        if os.name != 'nt':  # Bug #310
            signal.alarm(0)
    except ipfsapi.exceptions.TimeoutError:
        print("{0} not found at {1}".format(cdxjParts[0], digests[-1]))
        respString = ('{0} not found in IPFS :(' +
                      ' <a href="http://{1}:{2}">Go home</a>').format(
            path, IPWBREPLAY_HOST, IPWBREPLAY_PORT)
        return Response(respString)
    except TypeError as e:
        print('A type error occurred')
        print(e)
        abort(500)
    except HTTPError as e:
        print("Fetching from the IPFS failed")
        print(e)
        abort(503)
    except HashNotFoundError:
        if payload is None:
            print("Hashes not found:\n\t{0}\n\t{1}".format(
                digests[-1], digests[-2]))
            abort(404)
        else:
            # payload found but not header, fabricate header
            print("HTTP header not found, fabricating for resp replay")
            header = ''
    except Exception as e:
        print('Unknown exception occurred while fetching from ipfs.')
        print(e)
        abort(500)

    if 'encryption_method' in jObj:
        keyString = None
        while keyString is None:
            if 'encryption_key' in jObj:
                keyString = jObj['encryption_key']
            else:
                askForKey = ('Enter a path for file',
                             ' containing decryption key: \n> ')
                keyString = raw_input(askForKey)

        paddedEncryptionKey = pad(keyString, AES.block_size)
        key = base64.b64encode(paddedEncryptionKey)

        nonce = b64decode(jObj['encryption_nonce'])
        cipher = AES.new(key, AES.MODE_CTR, nonce=nonce)
        header = cipher.decrypt(base64.b64decode(header))
        payload = cipher.decrypt(base64.b64decode(payload))

    hLines = header.decode() \
        .replace('\r', '') \
        .replace('\n\t', '\t') \
        .replace('\n ', ' ') \
        .split('\n')
    hLines.pop(0)

    status = 200
    if 'status_code' in jObj:
        status = jObj['status_code']

    resp = Response(payload, status=status)

    for idx, hLine in enumerate(hLines):
        k, v = hLine.split(':', 1)

        if k.lower() == 'transfer-encoding' and \
                re.search(r'\bchunked\b', v, re.I):
            try:
                unchunkedPayload = extractResponseFromChunkedData(payload)
            except Exception as e:
                print('Error while dechunking')
                print(sys.exc_info()[0])
                continue  # Data may have not actually been chunked
            resp.set_data(unchunkedPayload)

        if k.lower() not in ["content-type", "content-encoding", "location"]:
            k = "X-Archive-Orig-" + k

        resp.headers[k] = v.strip()

    # Add ipwb header for additional SW logic
    newPayload = resp.get_data()

    lineJSON = cdxjLine.split(' ', 2)[2]
    mime = json.loads(lineJSON)['mime_type']

    if 'text/html' in mime:
        ipwbjsinject = """<script src="/ipwbassets/webui.js"></script>
                       <script>injectIPWBJS()</script>"""

        newPayload = newPayload.decode('utf-8').replace(
            '</html>', ipwbjsinject + '</html>')

        resp.set_data(newPayload)

    resp.headers['Memento-Datetime'] = ipwbUtils.digits14ToRFC1123(datetime)

    if header is None:
        resp.headers['X-Headers-Generated-By'] = 'InterPlanetary Wayback'

    # Get TimeMap for Link response header
    # respWithLinkHeader = getLinkHeaderAbbreviatedTimeMap(path, datetime)
    # resp.headers['Link'] = respWithLinkHeader.replace('\n', ' ')

    if status[0] == '3' and isUri(resp.headers.get('Location')):
        # Bad assumption that the URI-M will contain \d14 but works for now.
        uriBeforeURIR = request.url[:re.search(r'/\d{14}/', request.url).end()]
        newURIM = uriBeforeURIR + resp.headers['Location']
        resp.headers['Location'] = newURIM

    return resp
def test_surt():
    # These tests are from WaybackURLKeyMakerTest.java

    assert surt.surt(None) == '-'
    assert surt.surt('') == '-'
    assert surt.surt("filedesc:foo.arc.gz") == 'filedesc:foo.arc.gz'
    assert surt.surt("filedesc:/foo.arc.gz") == 'filedesc:/foo.arc.gz'
    assert surt.surt("filedesc://foo.arc.gz") == 'filedesc://foo.arc.gz'
    assert surt.surt("warcinfo:foo.warc.gz") == 'warcinfo:foo.warc.gz'
    assert surt.surt("dns:alexa.com") == 'dns:alexa.com'
    assert surt.surt("dns:archive.org") == 'dns:archive.org'

    assert surt.surt("http://www.archive.org/") == 'org,archive)/'
    assert surt.surt("http://archive.org/") == 'org,archive)/'
    assert surt.surt("http://archive.org/goo/") == 'org,archive)/goo'
    assert surt.surt("http://archive.org/goo/?") == 'org,archive)/goo'
    assert surt.surt("http://archive.org/goo/?b&a") == 'org,archive)/goo?a&b'
    assert surt.surt("http://archive.org/goo/?a=2&b&a=1") == 'org,archive)/goo?a=1&a=2&b'

    # trailing comma mode
    assert surt.surt("http://archive.org/goo/?a=2&b&a=1", trailing_comma=True) == 'org,archive,)/goo?a=1&a=2&b'
    assert surt.surt("dns:archive.org", trailing_comma=True) == 'dns:archive.org'
    assert surt.surt("warcinfo:foo.warc.gz", trailing_comma=True) == 'warcinfo:foo.warc.gz'

    # PHP session id:
    assert surt.surt("http://archive.org/index.php?PHPSESSID=0123456789abcdefghijklemopqrstuv&action=profile;u=4221") == 'org,archive)/index.php?action=profile;u=4221'

    # WHOIS url:
    assert surt.surt("whois://whois.isoc.org.il/shaveh.co.il") == 'il,org,isoc,whois)/shaveh.co.il'

    # Yahoo web bug. See https://github.com/internetarchive/surt/issues/1
    assert surt.surt('http://visit.webhosting.yahoo.com/visit.gif?&r=http%3A//web.archive.org/web/20090517140029/http%3A//anthonystewarthead.electric-chi.com/&b=Netscape%205.0%20%28Windows%3B%20en-US%29&s=1366x768&o=Win32&c=24&j=true&v=1.2') == 'com,yahoo,webhosting,visit)/visit.gif?&b=netscape%205.0%20(windows;%20en-us)&c=24&j=true&o=win32&r=http://web.archive.org/web/20090517140029/http://anthonystewarthead.electric-chi.com/&s=1366x768&v=1.2'

    # Simple customization:
    assert surt.surt("http://www.example.com/", canonicalizer=lambda x, **opts: x) == 'com,example,www)/'
    assert surt.surt("mailto:[email protected]") == 'mailto:[email protected]'
    assert surt.surt("http://www.example.com/", with_scheme=True) == 'http://(com,example)/'
    assert surt.surt("http://www.example.com/", with_scheme=True, host_massage=True) == 'http://(com,example)/'
    assert surt.surt("http://www.example.com/", with_scheme=False) == 'com,example)/'
    assert surt.surt("http://www.example.com/", with_scheme=True, trailing_comma=True) == 'http://(com,example,)/'
    assert surt.surt("https://www.example.com/", with_scheme=True, trailing_comma=True) == 'https://(com,example,)/'
    assert surt.surt("ftp://www.example.com/", with_scheme=False, trailing_comma=True) == 'com,example,)/'
    assert surt.surt("ftp://www.example.com/", with_scheme=False, trailing_comma=False) == 'com,example)/'
    assert surt.surt("ftp://www.example.com/", with_scheme=True, trailing_comma=True) == 'ftp://(com,example,)/'
    assert surt.surt("http://www.example.com/", with_scheme=True, host_massage=False) == 'http://(com,example,www)/'
    assert surt.surt("http://www.example.com/", with_scheme=False, host_massage=False) == 'com,example,www)/'
    assert surt.surt("http://www.example.com/", with_scheme=True, trailing_comma=True, host_massage=False) == 'http://(com,example,www,)/'
    assert surt.surt("https://www.example.com/", with_scheme=True, trailing_comma=True, host_massage=False) == 'https://(com,example,www,)/'
    assert surt.surt("ftp://www.example.com/", with_scheme=True, trailing_comma=True, host_massage=False) == 'ftp://(com,example,www,)/'

    assert surt.surt("mailto:[email protected]", with_scheme=True) == 'mailto:[email protected]'
    assert surt.surt("mailto:[email protected]", trailing_comma=True) == 'mailto:[email protected]'
    assert surt.surt("mailto:[email protected]", with_scheme=True, trailing_comma=True) == 'mailto:[email protected]'
    assert surt.surt("dns:archive.org", with_scheme=True) == 'dns:archive.org'
    assert surt.surt("dns:archive.org", trailing_comma=True) == 'dns:archive.org'
    assert surt.surt("dns:archive.org", with_scheme=True, trailing_comma=True) == 'dns:archive.org'
    assert surt.surt("whois://whois.isoc.org.il/shaveh.co.il", with_scheme=True) == 'whois://(il,org,isoc,whois)/shaveh.co.il'
    assert surt.surt("whois://whois.isoc.org.il/shaveh.co.il", trailing_comma=True) == 'il,org,isoc,whois,)/shaveh.co.il'
    assert surt.surt("whois://whois.isoc.org.il/shaveh.co.il", trailing_comma=True, with_scheme=True) == 'whois://(il,org,isoc,whois,)/shaveh.co.il'
    assert surt.surt("warcinfo:foo.warc.gz", trailing_comma=True) == 'warcinfo:foo.warc.gz'
    assert surt.surt("warcinfo:foo.warc.gz", with_scheme=True) == 'warcinfo:foo.warc.gz'
    assert surt.surt("warcinfo:foo.warc.gz", with_scheme=True, trailing_comma=True) == 'warcinfo:foo.warc.gz'
def apply_filters(self, wbrequest, matcher):
    """Parse the GUID and find the CDXLine in the DB"""

    guid = matcher.group(1)
    cache_key = Link.get_cdx_cache_key(guid)
    cached_cdx = django_cache.get(cache_key)
    redirect_matcher = re.compile(r' 30[1-7] ')

    if cached_cdx is None or not wbrequest.wb_url:
        with opbeat_trace('cdx-cache-miss'), close_database_connection():
            try:
                # This will filter out links that have user_deleted=True
                link = Link.objects.get(guid=guid)
            except Link.DoesNotExist:
                raise_not_found(wbrequest.wb_url)

            if not wbrequest.wb_url:
                # This is a bare request to /warc/1234-5678/ -- return so we can
                # send a forward to submitted_url in PermaGUIDHandler.
                wbrequest.custom_params['guid'] = guid
                wbrequest.custom_params['url'] = link.safe_url
                return

            # Legacy archives didn't generate CDXLines during
            # capture so generate them on demand if not found, unless
            # A: the warc capture hasn't been generated OR
            # B: we know other cdx lines have already been generated
            #    and the requested line is simply missing
            lines = CDXLine.objects.filter(link_id=link.guid)
            if not lines:
                lines = CDXLine.objects.create_all_from_link(link)

            # build a lookup of all cdx lines for this link indexed by urlkey, like:
            # cached_cdx = {'urlkey1':['raw1','raw2'], 'urlkey2':['raw3','raw4']}
            cached_cdx = defaultdict(list)
            for line in lines:
                cached_cdx[line.urlkey].append(str(line.raw))

            # remove any redirects if we also have a non-redirect capture for the same URL, to prevent redirect loops
            for urlkey, lines in cached_cdx.iteritems():
                if len(lines) > 1:
                    lines_without_redirects = [line for line in lines if not redirect_matcher.search(line)]
                    if lines_without_redirects:
                        cached_cdx[urlkey] = lines_without_redirects

            # record whether link is private so we can enforce permissions
            cached_cdx['is_private'] = link.is_private

            django_cache.set(cache_key, cached_cdx)

    # enforce permissions
    if cached_cdx.get('is_private'):
        # if user is allowed to access this private link, they will have a cookie like GUID=<token>,
        # which can be validated with link.validate_access_token()
        cookie = Cookie.SimpleCookie(wbrequest.env.get('HTTP_COOKIE')).get(guid)
        if not cookie:
            raise CustomTemplateException(status='400 Bad Request',
                                          template_path='archive/missing-cookie.html',
                                          template_kwargs={
                                              'content_host': settings.WARC_HOST,
                                          })
        if not Link(pk=guid).validate_access_token(cookie.value, 3600):
            raise_not_found(wbrequest.wb_url)

    # check whether archive contains the requested URL
    urlkey = surt(wbrequest.wb_url.url)
    cdx_lines = cached_cdx.get(urlkey)
    if not cdx_lines:
        raise_not_found(wbrequest.wb_url)

    # Store the line for use in PermaCDXSource
    # so we don't need to hit the DB again
    wbrequest.custom_params['lines'] = cdx_lines
    wbrequest.custom_params['guid'] = guid

    # Adds the Memento-Datetime header
    # Normally this is done in MementoReqMixin#_parse_extra
    # but we need the GUID to make the DB query and that
    # isn't parsed from the url until this point
    wbrequest.wb_url.set_replay_timestamp(CDXLine(raw=cdx_lines[0]).timestamp)
def urlkey(self, url):
    """compute urlkey from `url`."""
    return surt(url, canonicalizer=self.canonicalize)
def __init__(self, file, out_file=sys.stdout, format="N b a m s k r M S V g",
             use_full_path=False, file_prefix=None, all_records=False,
             screenshot_mode=False, exclude_list=None, stats_file=None):

    self.field_map = {'M': 'AIF meta tags',
                      'N': 'massaged url',
                      'S': 'compressed record size',
                      'V': 'compressed arc file offset',
                      'a': 'original url',
                      'b': 'date',
                      'g': 'file name',
                      'k': 'new style checksum',
                      'm': 'mime type',
                      'r': 'redirect',
                      's': 'response code',
                      }

    self.file = file
    self.out_file = out_file
    self.format = format
    self.all_records = all_records
    self.screenshot_mode = screenshot_mode
    self.crlf_pattern = re.compile('\r?\n\r?\n')
    self.response_pattern = re.compile('^application/http;\s*msgtype=response$', re.I)

    # similar to what the wayback uses:
    self.fake_build_version = "archive-commons.0.0.1-SNAPSHOT-20120112102659-python"

    # these fields are set for each record in the warc
    self.offset = 0
    self.surt = None
    self.mime_type = None
    self.headers = None
    self.content = None
    self.meta_tags = None
    self.response_code = None

    # Large html files cause lxml to segfault
    # problematic file was 154MB, we'll stop at 5MB
    self.lxml_parse_limit = 5 * 1024 * 1024

    if use_full_path:
        self.warc_path = os.path.abspath(file)
    elif file_prefix:
        self.warc_path = os.path.join(file_prefix, file)
    else:
        self.warc_path = file

    if exclude_list:
        if not os.path.exists(exclude_list):
            raise IOError, "Exclude file not found"
        self.excludes = []
        f = open(exclude_list, 'r')
        for line in f:
            if '' == line.strip():
                continue
            url = line.split()[0]
            self.excludes.append(surt(url))
    else:
        self.excludes = None

    if stats_file:
        if os.path.exists(stats_file):
            raise IOError, "Stats file already exists"
        self.stats_file = stats_file
    else:
        self.stats_file = None
def brozzler_list_captures():
    '''
    Handy utility for looking up entries in the rethinkdb "captures" table by
    url or sha1.
    '''
    import surt
    import rethinkdb

    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    _add_rethinkdb_options(arg_parser)
    _add_common_options(arg_parser)
    arg_parser.add_argument(
        'url_or_sha1', metavar='URL_or_SHA1',
        help='url or sha1 to look up in captures table')

    args = arg_parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    r = rethinkstuff.Rethinker(
        args.rethinkdb_servers.split(','), args.rethinkdb_db)

    class Jsonner(json.JSONEncoder):
        def default(self, o):
            if isinstance(o, datetime.datetime):
                return o.isoformat()
            return json.JSONEncoder.default(self, o)

    if args.url_or_sha1[:5] == 'sha1:':
        raise Exception('not implemented')
        # def find_response_by_digest(self, algo, raw_digest, bucket="__unspecified__"):
        #     if algo != "sha1":
        #         raise Exception(
        #                 "digest type is %s but big captures table is indexed by "
        #                 "sha1" % algo)
        #     sha1base32 = base64.b32encode(raw_digest).decode("utf-8")
        #     results_iter = self.r.table(self.table).get_all([sha1base32, "response", bucket], index="sha1_warc_type").run()
        #     results = list(results_iter)
        #     if len(results) > 0:
        #         if len(results) > 1:
        #             self.logger.debug("expected 0 or 1 but found %s results for sha1base32=%s bucket=%s (will use first result)", len(results), sha1base32, bucket)
        #         result = results[0]
        #     else:
        #         result = None
        #     self.logger.debug("returning %s for sha1base32=%s bucket=%s",
        #                       result, sha1base32, bucket)
        #     return result
    else:
        key = surt.surt(
            args.url_or_sha1, trailing_comma=True, host_massage=False,
            with_scheme=True)
        reql = r.table('captures').between(
            [key[:150], rethinkdb.minval],
            [key[:150] + '!', rethinkdb.maxval],
            index='abbr_canon_surt_timestamp')
        reql = reql.order_by(index='abbr_canon_surt_timestamp')
        reql = reql.filter(
            lambda capture: (capture['canon_surt'] >= key)
                            & (capture['canon_surt'] <= key))
        logging.debug('rethinkdb query: %s', reql)
        results = reql.run()
        for result in results:
            print(json.dumps(result, cls=Jsonner, indent=2))
def urlkey(self, url):
    """compute urlkey from `url`."""
    return surt(url, **dict(self.canonicalizer_options))