def get_hand_picked_titles(filename_part):
    """Return the hand-picked article titles listed for one test document.

    Reads ``<filename_part>_wikis.txt`` from the hard-coded wikiArticles
    directory. On every non-blank line the first whitespace-separated field is
    treated as a URL; the text after its last ``/`` is percent-unquoted,
    decoded as UTF-8 and has underscores turned into spaces.

    Returns a list of titles in file order (duplicates are kept).
    """
    titles = []
    with open('/homes/rps10/rps10Docs/Individual_Project/TermExtraction/hand-tagged-tests/wikiArticles/' + filename_part + '_wikis.txt') as wiki_file:
        for raw_line in wiki_file:
            fields = raw_line.split()
            if not fields:
                # blank / whitespace-only line
                continue
            encoded_title = fields[0].split('/')[-1]
            decoded_title = urllib2.unquote(encoded_title).decode('utf8')
            titles.append(decoded_title.replace('_', ' '))
    return titles
def get_hand_picked_titles(filename_part):
    """Load the manually chosen Wikipedia titles for ``filename_part``.

    Each non-empty line of the ``_wikis.txt`` file contributes one title: the
    last path segment of the line's first token, percent-unquoted, decoded
    from UTF-8, with ``_`` replaced by a space.

    Returns the titles as a list, in the order they appear in the file.
    """
    wiki_titles = []
    with open(
            '/homes/rps10/rps10Docs/Individual_Project/TermExtraction/hand-tagged-tests/wikiArticles/'
            + filename_part + '_wikis.txt') as listing:
        for entry in listing:
            tokens = entry.split()
            if tokens:
                # Only the part after the final slash names the article.
                slug = tokens[0].rsplit('/', 1)[-1]
                readable = urllib2.unquote(slug).decode('utf8')
                wiki_titles.append(readable.replace('_', ' '))
    return wiki_titles
def compare_wiki_results(filename_part, retrieved_wiki_titles):
    """Print which hand-picked titles are missing from the retrieved ones.

    Parses ``<filename_part>_wikis.txt`` the same way as the title loader
    (first token of each non-blank line, last URL segment, percent-unquoted,
    UTF-8 decoded, underscores to spaces) while echoing every intermediate
    value for debugging. Finally prints the full list of hand-picked titles
    and the set of those absent from ``retrieved_wiki_titles``.

    Nothing is returned; all output goes to stdout.
    """
    expected_titles = []
    with open('/homes/rps10/rps10Docs/Individual_Project/TermExtraction/hand-tagged-tests/wikiArticles/' + filename_part + '_wikis.txt') as wiki_file:
        for raw_line in wiki_file:
            print(raw_line)
            fields = raw_line.split()
            if not fields:
                continue
            link = fields[0]
            print(repr(link))
            slug = link.split('/')[-1]
            print(repr(slug))
            decoded = urllib2.unquote(slug).decode('utf8')
            print(repr(decoded))
            readable = decoded.replace('_', ' ')
            print(repr(readable))
            expected_titles.append(readable)
    print(expected_titles)
    print("Titles not retrieved: ")
    print(set(expected_titles) - set(retrieved_wiki_titles))
def compare_wiki_results(filename_part, retrieved_wiki_titles):
    """Report hand-picked titles that the retrieval step did not return.

    Walks the ``_wikis.txt`` file for ``filename_part``, printing each line
    and every stage of turning its URL into a readable title, then prints the
    collected titles and the ones not present in ``retrieved_wiki_titles``.

    Output only; the function returns ``None``.
    """
    wanted = []
    with open(
            '/homes/rps10/rps10Docs/Individual_Project/TermExtraction/hand-tagged-tests/wikiArticles/'
            + filename_part + '_wikis.txt') as listing:
        for entry in listing:
            print(entry)
            tokens = entry.split()
            if tokens:
                address = tokens[0]
                print(repr(address))
                last_segment = address.rsplit('/', 1)[-1]
                print(repr(last_segment))
                unescaped = urllib2.unquote(last_segment).decode('utf8')
                print(repr(unescaped))
                spaced = unescaped.replace('_', ' ')
                print(repr(spaced))
                wanted.append(spaced)
    print(wanted)
    print("Titles not retrieved: ")
    missing = set(wanted).difference(set(retrieved_wiki_titles))
    print(missing)
def _handle_request(self, env, start_response):
    """Rewrite an upload-style URL path to a Swift object path and serve it.

    The request path is tried, in order, against regexes for regular uploads,
    timeline, math and score renderings, ``/monitoring/<what>`` and
    single-segment "root" files. On a container match the request is
    re-pointed at ``/v1/<account>/<container>/<object>`` and
    ``self._app_call(env)`` is invoked; the status reported by
    ``self._get_status_int()`` then selects the response.

    :param env: WSGI environ. The code relies on assignments made through
        ``req`` being visible to ``self._app_call(env)``.
    :param start_response: WSGI start_response callable.
    :returns: the result of calling the selected response object with
        ``(env, start_response)``.
    """
    req = swob.Request(env)

    # Double (or triple, etc.) slashes in the URL should be ignored;
    # collapse them. fixes T34864
    req.path_info = re.sub(r'/{2,}', '/', req.path_info)

    # Keep a copy of the original request so we can ask the scalers for it
    reqorig = swob.Request(req.environ.copy())

    # Containers have 5 components: project, language, repo, zone, and shard.
    # If there's no zone in the URL, the zone is assumed to be 'public' (for b/c).
    # Shard is optional (and configurable), and is only used for large containers.
    #
    # Projects are wikipedia, wikinews, etc.
    # Languages are en, de, fr, commons, etc.
    # Repos are local, timeline, etc.
    # Zones are public, thumb, temp, etc.
    # Shard is extracted from "hash paths" in the URL and is 2 hex digits.
    #
    # These attributes are mapped to container names in the form of either:
    # (a) proj-lang-repo-zone (if not sharded)
    # (b) proj-lang-repo-zone.shard (if sharded)
    # (c) global-data-repo-zone (if not sharded)
    # (d) global-data-repo-zone.shard (if sharded)
    #
    # Rewrite wiki-global URLs of these forms:
    # (a) http://upload.wikimedia.org/math/<relpath>
    #         => http://msfe/v1/AUTH_<hash>/global-data-math-render/<relpath>
    # (b) http://upload.wikimedia.org/<proj>/<lang>/math/<relpath> (legacy)
    #         => http://msfe/v1/AUTH_<hash>/global-data-math-render/<relpath>
    #
    # Rewrite wiki-relative URLs of these forms:
    # (a) http://upload.wikimedia.org/<proj>/<lang>/<relpath>
    #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-public/<relpath>
    # (b) http://upload.wikimedia.org/<proj>/<lang>/archive/<relpath>
    #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-public/archive/<relpath>
    # (c) http://upload.wikimedia.org/<proj>/<lang>/thumb/<relpath>
    #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-thumb/<relpath>
    # (d) http://upload.wikimedia.org/<proj>/<lang>/thumb/archive/<relpath>
    #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-thumb/archive/<relpath>
    # (e) http://upload.wikimedia.org/<proj>/<lang>/thumb/temp/<relpath>
    #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-thumb/temp/<relpath>
    # (f) http://upload.wikimedia.org/<proj>/<lang>/transcoded/<relpath>
    #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-transcoded/<relpath>
    # (g) http://upload.wikimedia.org/<proj>/<lang>/timeline/<relpath>
    #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-timeline-render/<relpath>

    # regular uploads
    match = re.match(
        (r'^/(?P<proj>[^/]+)/(?P<lang>[^/]+)/'
         r'((?P<zone>transcoded|thumb)/)?'
         r'(?P<path>((temp|archive)/)?[0-9a-f]/(?P<shard>[0-9a-f]{2})/.+)$'
         ), req.path)
    if match:
        proj = match.group('proj')
        lang = match.group('lang')
        repo = 'local'  # the upload repo name is "local"
        # Get the repo zone (if not provided that means "public")
        zone = (match.group('zone') if match.group('zone') else 'public')
        # Get the object path relative to the zone (and thus container)
        obj = match.group('path')  # e.g. "archive/a/ab/..."
        shard = match.group('shard')

    # Each later pattern is only tried when every earlier one failed, so
    # proj/lang/repo/zone/obj/shard are bound by at most one branch.

    # timeline renderings
    if match is None:
        # /wikipedia/en/timeline/a876297c277d80dfd826e1f23dbfea3f.png
        match = re.match(
            r'^/(?P<proj>[^/]+)/(?P<lang>[^/]+)/(?P<repo>timeline)/(?P<path>.+)$',
            req.path)
        if match:
            proj = match.group('proj')  # wikipedia
            lang = match.group('lang')  # en
            repo = match.group('repo')  # timeline
            zone = 'render'
            obj = match.group(
                'path')  # a876297c277d80dfd826e1f23dbfea3f.png
            shard = ''

    # math renderings
    if match is None:
        # /math/c/9/f/c9f2055dadfb49853eff822a453d9ceb.png
        # /wikipedia/en/math/c/9/f/c9f2055dadfb49853eff822a453d9ceb.png (legacy)
        match = re.match(
            (r'^(/(?P<proj>[^/]+)/(?P<lang>[^/]+))?/(?P<repo>math)/'
             r'(?P<path>(?P<shard1>[0-9a-f])/(?P<shard2>[0-9a-f])/.+)$'),
            req.path)
        if match:
            # The optional proj/lang captured from a legacy URL are ignored;
            # math always maps to the global-data container.
            proj = 'global'
            lang = 'data'
            repo = match.group('repo')  # math
            zone = 'render'
            obj = match.group(
                'path')  # c/9/f/c9f2055dadfb49853eff822a453d9ceb.png
            shard = match.group('shard1') + match.group('shard2')  # c9

    # score renderings
    if match is None:
        # /score/j/q/jqn99bwy8777srpv45hxjoiu24f0636/jqn99bwy.png
        # /score/override-midi/8/i/8i9pzt87wtpy45lpz1rox8wusjkt7ki.ogg
        match = re.match(r'^/(?P<repo>score)/(?P<path>.+)$', req.path)
        if match:
            proj = 'global'
            lang = 'data'
            repo = match.group('repo')  # score
            zone = 'render'
            obj = match.group(
                'path')  # j/q/jqn99bwy8777srpv45hxjoiu24f0636/jqn99bwy.png
            shard = ''

    # monitoring endpoints: answered here and returned immediately
    if match is None:
        match = re.match(r'^/monitoring/(?P<what>.+)$', req.path)
        if match:
            what = match.group('what')
            if what == 'frontend':
                # Answered locally without calling self._app_call.
                headers = {'Content-Type': 'application/octet-stream'}
                resp = swob.Response(headers=headers, body="OK\n")
            elif what == 'backend':
                req.host = '127.0.0.1:%s' % self.bind_port
                req.path_info = "/v1/%s/monitoring/backend" % self.account

                app_iter = self._app_call(env)
                status = self._get_status_int()
                headers = self._response_headers

                resp = swob.Response(status=status, headers=headers, app_iter=app_iter)
            else:
                resp = swob.HTTPNotFound('Monitoring type not found "%s"' % (req.path))
            return resp(env, start_response)

    if match is None:
        match = re.match(r'^/(?P<path>[^/]+)?$', req.path)
        # /index.html /favicon.ico /robots.txt etc.
        # serve from a default "root" container
        if match:
            path = match.group('path')
            if not path:
                # bare "/" request
                path = 'index.html'

            req.host = '127.0.0.1:%s' % self.bind_port
            req.path_info = "/v1/%s/root/%s" % (self.account, path)

            app_iter = self._app_call(env)
            status = self._get_status_int()
            headers = self._response_headers

            resp = swob.Response(status=status, headers=headers, app_iter=app_iter)
            return resp(env, start_response)

    # Internally rewrite the URL based on the regex it matched...
    if match:
        # Get the per-project "conceptual" container name, e.g. "<proj><lang><repo><zone>"
        container = "%s-%s-%s-%s" % (proj, lang, repo, zone)
        # Add 2-digit shard to the container if it is supposed to be sharded.
        # We may thus have an "actual" container name like "<proj><lang><repo><zone>.<shard>"
        # NOTE(review): shard is '' for timeline/score matches, so a listed
        # container of that kind would get a bare "." suffix - presumably
        # such containers are never listed; confirm.
        if container in self.shard_container_list:
            container += ".%s" % shard

        # Save a url with just the account name in it.
        req.path_info = "/v1/%s" % (self.account)
        port = self.bind_port
        req.host = '127.0.0.1:%s' % port
        url = req.url[:]
        # Create a path to our object's name.
        req.path_info = "/v1/%s/%s/%s" % (self.account, container, urllib2.unquote(obj))
        # self.logger.warn("new path is %s" % req.path_info)

        # do_start_response just remembers what it got called with,
        # because our 404 handler will generate a different response.
        app_iter = self._app_call(env)
        status = self._get_status_int()
        headers = self._response_headers

        if status == 404:
            # only send thumbs to the 404 handler; just return a 404 for everything else.
            if repo == 'local' and zone == 'thumb':
                resp = self.handle404(reqorig, url, container, obj)
                return resp(env, start_response)
            else:
                resp = swob.HTTPNotFound('File not found: %s' % req.path)
                return resp(env, start_response)
        else:
            if zone == 'thumb':
                # "headers" is iterated as (key, value) pairs.
                for key, value in headers:
                    if key == 'X-Delete-At' and self.thumbnail_update_expiry_headers:
                        # Update expiry header asynchronously
                        eventlet.spawn(self.update_expiry, env)
                        break

            # Return the response verbatim
            return swob.Response(status=status, headers=headers,
                                 app_iter=app_iter)(env, start_response)
    else:
        resp = swob.HTTPNotFound('Regexp failed to match URI: "%s"' % (req.path))
        return resp(env, start_response)
def __call__(self, env, start_response):
    """WSGI entry point: map upload-style GET/HEAD paths onto Swift objects.

    Requests that are not GET/HEAD, whose path starts with ``/auth``, or whose
    path contains ``/AUTH_`` followed by 32-36 hex-or-dash characters are
    handed to ``self.app`` unchanged.

    Containers have 4 components: project, language, zone, and shard. Shard
    is optional (and configurable). With no zone in the URL the zone is
    'public'. Container names follow proj-lang-local-zone (or
    proj-lang-local-zone.shard). Projects are wikipedia, wikinews, etc.;
    languages are en, de, fr, commons, etc.; zones are public, thumb and
    temp; shards are the 2 hex digits taken from the URL.

    Rewritten URL forms (source, temp, and thumbnail files):
      (a) http://upload.wikimedia.org/<proj>/<lang>/.*
            => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-public/.*
      (b) http://upload.wikimedia.org/<proj>/<lang>/archive/.*
            => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-public/archive/.*
      (c) http://upload.wikimedia.org/<proj>/<lang>/thumb/.*
            => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-thumb/.*
      (d) http://upload.wikimedia.org/<proj>/<lang>/thumb/archive/.*
            => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-thumb/archive/.*
      (e) http://upload.wikimedia.org/<proj>/<lang>/thumb/temp/.*
            => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-thumb/temp/.*
      (f) http://upload.wikimedia.org/<proj>/<lang>/temp/.*
            => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-temp/.*

    Returns the result of calling the chosen response (or ``self.app``) with
    the WSGI arguments.
    """
    req = webob.Request(env)

    # End-users should only do GET/HEAD, nothing else needs a rewrite.
    if req.method not in ('GET', 'HEAD'):
        return self.app(env, start_response)

    # Collapse double (or triple, etc.) slashes, one pass at a time, until
    # none are left. fixes bug 32864
    while '//' in req.path_info:
        req.path_info = req.path_info.replace('//', '/')

    # A path that already carries AUTH is presumed good. #07. fixes bug 33620
    already_authed = re.search('/AUTH_[0-9a-fA-F-]{32,36}', req.path)
    if req.path.startswith('/auth') or already_authed:
        return self.app(env, start_response)

    # Untouched copy of the request, handed to the 404 handler later.
    pristine = req.copy()

    parsed = re.match(r'^/(?P<proj>[^/]+)/(?P<lang>[^/]+)/((?P<zone>thumb|temp)/)?(?P<path>((temp|archive)/)?[0-9a-f]/(?P<shard>[0-9a-f]{2})/.+)$', req.path)
    if not parsed:
        failure = webob.exc.HTTPBadRequest('Regexp failed: "%s"' % (req.path))  #11
        return failure(env, start_response)

    # A missing zone means "public".
    zone = parsed.group('zone') or 'public'
    # Object path relative to the zone (and thus container), e.g. "archive/a/ab/..."
    obj = parsed.group('path')
    # Per-project "conceptual" container name.  #02/#03
    container = "%s-%s-local-%s" % (parsed.group('proj'), parsed.group('lang'), zone)

    # Append the 2-digit shard when this container is configured as sharded.
    sharding = self.shard_containers
    if sharding == 'all' or (sharding == 'some' and container in self.shard_container_list):
        container += ".%s" % parsed.group('shard')

    # Capture a URL that holds only the account, before adding the object.
    req.path_info = "/v1/%s" % (self.account)
    req.host = '127.0.0.1:%s' % self.bind_port
    account_url = req.url[:]

    # Now point the request at the object itself.
    req.path_info = "/v1/%s/%s/%s" % (self.account, container, urllib2.unquote(obj))

    # do_start_response just remembers what it got called with, because the
    # 404 handler will generate a different response.
    recorder = ObjectController()
    app_iter = self.app(env, recorder.do_start_response)  #01
    status = int(recorder.response_args[0].split()[0])
    headers = dict(recorder.response_args[1])

    if 200 <= status < 300 or status == 304:
        # We have it! Return it as usual, minus any 'etag' header.
        headers.pop('etag', None)
        found = webob.Response(status=status, headers=headers, app_iter=app_iter)
        return found(env, start_response)  #01a
    if status == 404:  #4
        fallback = self.handle404(pristine, account_url, container, obj)
        return fallback(env, start_response)
    if status == 401:
        # An invalid or expired Storage URL produces this error.
        denied = webob.exc.HTTPUnauthorized('Token may have timed out')  #05
        return denied(env, start_response)
    unknown = webob.exc.HTTPNotImplemented('Unknown Status: %s' % (status))  #10
    return unknown(env, start_response)
def __call__(self, env, start_response):
    """WSGI entry point: map upload-style GET/HEAD paths onto Swift objects.

    Requests that are not GET/HEAD, whose path starts with ``/auth``, or whose
    path contains ``/AUTH_`` followed by 32-36 hex-or-dash characters are
    handed to ``self.app`` unchanged.

    Containers have 4 components: project, language, zone, and shard. Shard
    is optional (and configurable). With no zone in the URL the zone is
    'public'. Container names follow proj-lang-local-zone (or
    proj-lang-local-zone.shard). Projects are wikipedia, wikinews, etc.;
    languages are en, de, fr, commons, etc.; zones are public, thumb and
    temp; shards are the 2 hex digits taken from the URL.

    Rewritten URL forms (source, temp, and thumbnail files):
      (a) http://upload.wikimedia.org/<proj>/<lang>/.*
            => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-public/.*
      (b) http://upload.wikimedia.org/<proj>/<lang>/archive/.*
            => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-public/archive/.*
      (c) http://upload.wikimedia.org/<proj>/<lang>/thumb/.*
            => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-thumb/.*
      (d) http://upload.wikimedia.org/<proj>/<lang>/thumb/archive/.*
            => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-thumb/archive/.*
      (e) http://upload.wikimedia.org/<proj>/<lang>/thumb/temp/.*
            => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-thumb/temp/.*
      (f) http://upload.wikimedia.org/<proj>/<lang>/temp/.*
            => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-temp/.*

    Returns the result of calling the chosen response (or ``self.app``) with
    the WSGI arguments.
    """
    req = webob.Request(env)

    # End-users should only do GET/HEAD, nothing else needs a rewrite.
    if req.method not in ('GET', 'HEAD'):
        return self.app(env, start_response)

    # Collapse double (or triple, etc.) slashes, one pass at a time, until
    # none are left. fixes bug 32864
    while '//' in req.path_info:
        req.path_info = req.path_info.replace('//', '/')

    # A path that already carries AUTH is presumed good. #07. fixes bug 33620
    auth_hit = re.search('/AUTH_[0-9a-fA-F-]{32,36}', req.path)
    if req.path.startswith('/auth') or auth_hit:
        return self.app(env, start_response)

    # Untouched copy of the request, handed to the 404 handler later.
    original_request = req.copy()

    url_parts = re.match(
        r'^/(?P<proj>[^/]+)/(?P<lang>[^/]+)/((?P<zone>thumb|temp)/)?(?P<path>((temp|archive)/)?[0-9a-f]/(?P<shard>[0-9a-f]{2})/.+)$',
        req.path)
    if not url_parts:
        bad_request = webob.exc.HTTPBadRequest('Regexp failed: "%s"' % (req.path))  #11
        return bad_request(env, start_response)

    # A missing zone means "public".
    zone = url_parts.group('zone') or 'public'
    # Object path relative to the zone (and thus container), e.g. "archive/a/ab/..."
    obj = url_parts.group('path')
    # Per-project "conceptual" container name.  #02/#03
    container = "%s-%s-local-%s" % (
        url_parts.group('proj'), url_parts.group('lang'), zone)

    # Append the 2-digit shard when this container is configured as sharded.
    shard_mode = self.shard_containers
    if shard_mode == 'all' or (
            shard_mode == 'some' and container in self.shard_container_list):
        container += ".%s" % url_parts.group('shard')

    # Capture a URL that holds only the account, before adding the object.
    req.path_info = "/v1/%s" % (self.account)
    req.host = '127.0.0.1:%s' % self.bind_port
    account_url = req.url[:]

    # Now point the request at the object itself.
    req.path_info = "/v1/%s/%s/%s" % (self.account, container,
                                      urllib2.unquote(obj))

    # do_start_response just remembers what it got called with, because the
    # 404 handler will generate a different response.
    recorder = ObjectController()
    app_iter = self.app(env, recorder.do_start_response)  #01
    status = int(recorder.response_args[0].split()[0])
    headers = dict(recorder.response_args[1])

    if 200 <= status < 300 or status == 304:
        # We have it! Return it as usual, minus any 'etag' header.
        headers.pop('etag', None)
        served = webob.Response(status=status, headers=headers,
                                app_iter=app_iter)
        return served(env, start_response)  #01a
    if status == 404:  #4
        regenerated = self.handle404(original_request, account_url, container, obj)
        return regenerated(env, start_response)
    if status == 401:
        # An invalid or expired Storage URL produces this error.
        unauthorized = webob.exc.HTTPUnauthorized(
            'Token may have timed out')  #05
        return unauthorized(env, start_response)
    not_implemented = webob.exc.HTTPNotImplemented('Unknown Status: %s' % (status))  #10
    return not_implemented(env, start_response)
def handle_request(self, env, start_response):
    """Rewrite an upload-style URL path to a Swift object path and serve it.

    The request path is tried, in order, against regexes for regular uploads,
    timeline, math and score renderings and ``/monitoring/<what>``. On a
    container match the request is re-pointed at
    ``/v1/<account>/<container>/<object>`` and ``self._app_call(env)`` is
    invoked; the status reported by ``self._get_status_int()`` then selects
    the response (2xx/304 pass-through, 404, 401, anything else -> 501).

    :param env: WSGI environ. The code relies on assignments made through
        ``req`` being visible to ``self._app_call(env)``.
    :param start_response: WSGI start_response callable.
    :returns: the result of calling the selected response object with
        ``(env, start_response)``.
    """
    req = webob.Request(env)

    # Double (or triple, etc.) slashes in the URL should be ignored; collapse them. fixes bug 32864
    req.path_info = re.sub(r'/{2,}', '/', req.path_info)

    # Keep a copy of the original request so we can ask the scalers for it
    reqorig = req.copy()

    # Containers have 5 components: project, language, repo, zone, and shard.
    # If there's no zone in the URL, the zone is assumed to be 'public' (for b/c).
    # Shard is optional (and configurable), and is only used for large containers.
    #
    # Projects are wikipedia, wikinews, etc.
    # Languages are en, de, fr, commons, etc.
    # Repos are local, timeline, etc.
    # Zones are public, thumb, temp, etc.
    # Shard is extracted from "hash paths" in the URL and is 2 hex digits.
    #
    # These attributes are mapped to container names in the form of either:
    # (a) proj-lang-repo-zone (if not sharded)
    # (b) proj-lang-repo-zone.shard (if sharded)
    # (c) global-data-repo-zone (if not sharded)
    # (d) global-data-repo-zone.shard (if sharded)
    #
    # Rewrite wiki-global URLs of these forms:
    # (a) http://upload.wikimedia.org/math/.*
    #         => http://msfe/v1/AUTH_<hash>/global-data-math-render/.*
    #
    # Rewrite wiki-relative URLs of these forms:
    # (a) http://upload.wikimedia.org/<proj>/<lang>/<relpath>
    #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-public/<relpath>
    # (b) http://upload.wikimedia.org/<proj>/<lang>/archive/<relpath>
    #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-public/archive/<relpath>
    # (c) http://upload.wikimedia.org/<proj>/<lang>/thumb/<relpath>
    #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-thumb/<relpath>
    # (d) http://upload.wikimedia.org/<proj>/<lang>/thumb/archive/<relpath>
    #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-thumb/archive/<relpath>
    # (e) http://upload.wikimedia.org/<proj>/<lang>/thumb/temp/<relpath>
    #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-thumb/temp/<relpath>
    # (f) http://upload.wikimedia.org/<proj>/<lang>/temp/<relpath>
    #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-temp/<relpath>
    # (g) http://upload.wikimedia.org/<proj>/<lang>/transcoded/<relpath>
    #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-transcoded/<relpath>
    # (h) http://upload.wikimedia.org/<proj>/<lang>/timeline/<relpath>
    #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-timeline-render/<relpath>

    # regular uploads
    match = re.match(
        r'^/(?P<proj>[^/]+)/(?P<lang>[^/]+)/((?P<zone>transcoded|thumb|temp)/)?(?P<path>((temp|archive)/)?[0-9a-f]/(?P<shard>[0-9a-f]{2})/.+)$',
        req.path)
    if match:
        proj = match.group('proj')
        lang = match.group('lang')
        repo = 'local'  # the upload repo name is "local"
        # Get the repo zone (if not provided that means "public")
        zone = (match.group('zone') if match.group('zone') else 'public')
        # Get the object path relative to the zone (and thus container)
        obj = match.group('path')  # e.g. "archive/a/ab/..."
        shard = match.group('shard')

    # Each later pattern is only tried when every earlier one failed, so
    # proj/lang/repo/zone/obj/shard are bound by at most one branch.

    # timeline renderings
    if match is None:
        # /wikipedia/en/timeline/a876297c277d80dfd826e1f23dbfea3f.png
        match = re.match(
            r'^/(?P<proj>[^/]+)/(?P<lang>[^/]+)/(?P<repo>timeline)/(?P<path>.+)$',
            req.path)
        if match:
            proj = match.group('proj')  # wikipedia
            lang = match.group('lang')  # en
            repo = match.group('repo')  # timeline
            zone = 'render'
            obj = match.group(
                'path')  # a876297c277d80dfd826e1f23dbfea3f.png
            shard = ''

    # math renderings
    if match is None:
        # /math/c/9/f/c9f2055dadfb49853eff822a453d9ceb.png
        match = re.match(
            r'^/(?P<repo>math)/(?P<path>(?P<shard1>[0-9a-f])/(?P<shard2>[0-9a-f])/.+)$',
            req.path)
        if match:
            proj = 'global'
            lang = 'data'
            repo = match.group('repo')  # math
            zone = 'render'
            obj = match.group(
                'path')  # c/9/f/c9f2055dadfb49853eff822a453d9ceb.png
            shard = match.group('shard1') + match.group('shard2')  # c9

    # score renderings
    if match is None:
        # /score/j/q/jqn99bwy8777srpv45hxjoiu24f0636/jqn99bwy.png
        # /score/override-midi/8/i/8i9pzt87wtpy45lpz1rox8wusjkt7ki.ogg
        match = re.match(r'^/(?P<repo>score)/(?P<path>.+)$', req.path)
        if match:
            proj = 'global'
            lang = 'data'
            repo = match.group('repo')  # score
            zone = 'render'
            obj = match.group(
                'path')  # j/q/jqn99bwy8777srpv45hxjoiu24f0636/jqn99bwy.png
            shard = ''

    # monitoring endpoints: answered here and returned immediately
    if match is None:
        match = re.match(r'^/monitoring/(?P<what>.+)$', req.path)
        if match:
            what = match.group('what')
            if what == 'frontend':
                # Answered locally without calling self._app_call.
                headers = {'Content-Type': 'application/octet-stream'}
                resp = webob.Response(headers=headers, body="OK\n")
            elif what == 'backend':
                req.host = '127.0.0.1:%s' % self.bind_port
                req.path_info = "/v1/%s/monitoring/backend" % self.account

                app_iter = self._app_call(env)
                status = self._get_status_int()
                headers = self._response_headers

                resp = webob.Response(status=status, headers=headers,
                                      app_iter=app_iter)
            else:
                resp = webob.exc.HTTPNotFound(
                    'Monitoring type not found "%s"' % (req.path))
            return resp(env, start_response)

    # Internally rewrite the URL based on the regex it matched...
    if match:
        # Get the per-project "conceptual" container name, e.g. "<proj><lang><repo><zone>"
        container = "%s-%s-%s-%s" % (proj, lang, repo, zone) #02/#03
        # Add 2-digit shard to the container if it is supposed to be sharded.
        # We may thus have an "actual" container name like "<proj><lang><repo><zone>.<shard>"
        # NOTE(review): shard is '' for timeline/score matches, so with
        # shard_containers == 'all' those containers get a bare "." suffix -
        # confirm this is intended.
        if ( (self.shard_containers == 'all') or \
             ((self.shard_containers == 'some') and (container in self.shard_container_list)) ):
            container += ".%s" % shard

        # Save a url with just the account name in it.
        req.path_info = "/v1/%s" % (self.account)
        port = self.bind_port
        req.host = '127.0.0.1:%s' % port
        url = req.url[:]
        # Create a path to our object's name.
        req.path_info = "/v1/%s/%s/%s" % (self.account, container, urllib2.unquote(obj))
        #self.logger.warn("new path is %s" % req.path_info)

        # do_start_response just remembers what it got called with,
        # because our 404 handler will generate a different response.
        app_iter = self._app_call(env) #01
        status = self._get_status_int()
        headers = self._response_headers

        if 200 <= status < 300 or status == 304:
            # We have it! Just return it as usual.
            #headers['X-Swift-Proxy']= `headers`
            return webob.Response(status=status, headers=headers,
                                  app_iter=app_iter)(env, start_response) #01a
        elif status == 404: #4
            # only send thumbs to the 404 handler; just return a 404 for everything else.
            if repo == 'local' and zone == 'thumb':
                resp = self.handle404(reqorig, url, container, obj)
                return resp(env, start_response)
            else:
                resp = webob.exc.HTTPNotFound('File not found: %s' % req.path)
                return resp(env, start_response)
        elif status == 401:
            # if the Storage URL is invalid or has expired we'll get this error.
            resp = webob.exc.HTTPUnauthorized(
                'Token may have timed out') #05
            return resp(env, start_response)
        else:
            resp = webob.exc.HTTPNotImplemented('Unknown Status: %s' % (status)) #10
            return resp(env, start_response)
    else:
        resp = webob.exc.HTTPNotFound('Regexp failed to match URI: "%s"' % (req.path)) #11
        return resp(env, start_response)
def handle_request(self, env, start_response):
    """Rewrite an upload-style URL path to a Swift object path and serve it.

    The request path is tried, in order, against regexes for regular uploads,
    timeline, math and score renderings, ``/monitoring/<what>`` and
    single-segment "root" files. On a container match the request is
    re-pointed at ``/v1/<account>/<container>/<object>`` and
    ``self._app_call(env)`` is invoked; the status reported by
    ``self._get_status_int()`` then selects the response (2xx/304
    pass-through, 404, 401, anything else -> 501).

    :param env: WSGI environ. The code relies on assignments made through
        ``req`` being visible to ``self._app_call(env)``.
    :param start_response: WSGI start_response callable.
    :returns: the result of calling the selected response object with
        ``(env, start_response)``.
    """
    req = webob.Request(env)

    # Double (or triple, etc.) slashes in the URL should be ignored; collapse them. fixes T34864
    req.path_info = re.sub(r'/{2,}', '/', req.path_info)

    # Keep a copy of the original request so we can ask the scalers for it
    reqorig = req.copy()

    # Containers have 5 components: project, language, repo, zone, and shard.
    # If there's no zone in the URL, the zone is assumed to be 'public' (for b/c).
    # Shard is optional (and configurable), and is only used for large containers.
    #
    # Projects are wikipedia, wikinews, etc.
    # Languages are en, de, fr, commons, etc.
    # Repos are local, timeline, etc.
    # Zones are public, thumb, temp, etc.
    # Shard is extracted from "hash paths" in the URL and is 2 hex digits.
    #
    # These attributes are mapped to container names in the form of either:
    # (a) proj-lang-repo-zone (if not sharded)
    # (b) proj-lang-repo-zone.shard (if sharded)
    # (c) global-data-repo-zone (if not sharded)
    # (d) global-data-repo-zone.shard (if sharded)
    #
    # Rewrite wiki-global URLs of these forms:
    # (a) http://upload.wikimedia.org/math/<relpath>
    #         => http://msfe/v1/AUTH_<hash>/global-data-math-render/<relpath>
    # (b) http://upload.wikimedia.org/<proj>/<lang>/math/<relpath> (legacy)
    #         => http://msfe/v1/AUTH_<hash>/global-data-math-render/<relpath>
    #
    # Rewrite wiki-relative URLs of these forms:
    # (a) http://upload.wikimedia.org/<proj>/<lang>/<relpath>
    #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-public/<relpath>
    # (b) http://upload.wikimedia.org/<proj>/<lang>/archive/<relpath>
    #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-public/archive/<relpath>
    # (c) http://upload.wikimedia.org/<proj>/<lang>/thumb/<relpath>
    #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-thumb/<relpath>
    # (d) http://upload.wikimedia.org/<proj>/<lang>/thumb/archive/<relpath>
    #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-thumb/archive/<relpath>
    # (e) http://upload.wikimedia.org/<proj>/<lang>/thumb/temp/<relpath>
    #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-thumb/temp/<relpath>
    # (f) http://upload.wikimedia.org/<proj>/<lang>/transcoded/<relpath>
    #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-transcoded/<relpath>
    # (g) http://upload.wikimedia.org/<proj>/<lang>/timeline/<relpath>
    #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-timeline-render/<relpath>

    # regular uploads
    match = re.match(r'^/(?P<proj>[^/]+)/(?P<lang>[^/]+)/((?P<zone>transcoded|thumb)/)?(?P<path>((temp|archive)/)?[0-9a-f]/(?P<shard>[0-9a-f]{2})/.+)$', req.path)
    if match:
        proj = match.group('proj')
        lang = match.group('lang')
        repo = 'local'  # the upload repo name is "local"
        # Get the repo zone (if not provided that means "public")
        zone = (match.group('zone') if match.group('zone') else 'public')
        # Get the object path relative to the zone (and thus container)
        obj = match.group('path')  # e.g. "archive/a/ab/..."
        shard = match.group('shard')

    # Each later pattern is only tried when every earlier one failed, so
    # proj/lang/repo/zone/obj/shard are bound by at most one branch.

    # timeline renderings
    if match is None:
        # /wikipedia/en/timeline/a876297c277d80dfd826e1f23dbfea3f.png
        match = re.match(r'^/(?P<proj>[^/]+)/(?P<lang>[^/]+)/(?P<repo>timeline)/(?P<path>.+)$', req.path)
        if match:
            proj = match.group('proj')  # wikipedia
            lang = match.group('lang')  # en
            repo = match.group('repo')  # timeline
            zone = 'render'
            obj = match.group('path')  # a876297c277d80dfd826e1f23dbfea3f.png
            shard = ''

    # math renderings
    if match is None:
        # /math/c/9/f/c9f2055dadfb49853eff822a453d9ceb.png
        # /wikipedia/en/math/c/9/f/c9f2055dadfb49853eff822a453d9ceb.png (legacy)
        match = re.match(r'^(/(?P<proj>[^/]+)/(?P<lang>[^/]+))?/(?P<repo>math)/(?P<path>(?P<shard1>[0-9a-f])/(?P<shard2>[0-9a-f])/.+)$', req.path)
        if match:
            # The optional proj/lang captured from a legacy URL are ignored;
            # math always maps to the global-data container.
            proj = 'global'
            lang = 'data'
            repo = match.group('repo')  # math
            zone = 'render'
            obj = match.group('path')  # c/9/f/c9f2055dadfb49853eff822a453d9ceb.png
            shard = match.group('shard1') + match.group('shard2')  # c9

    # score renderings
    if match is None:
        # /score/j/q/jqn99bwy8777srpv45hxjoiu24f0636/jqn99bwy.png
        # /score/override-midi/8/i/8i9pzt87wtpy45lpz1rox8wusjkt7ki.ogg
        match = re.match(r'^/(?P<repo>score)/(?P<path>.+)$', req.path)
        if match:
            proj = 'global'
            lang = 'data'
            repo = match.group('repo')  # score
            zone = 'render'
            obj = match.group('path')  # j/q/jqn99bwy8777srpv45hxjoiu24f0636/jqn99bwy.png
            shard = ''

    # monitoring endpoints: answered here and returned immediately
    if match is None:
        match = re.match(r'^/monitoring/(?P<what>.+)$', req.path)
        if match:
            what = match.group('what')
            if what == 'frontend':
                # Answered locally without calling self._app_call.
                headers = {'Content-Type': 'application/octet-stream'}
                resp = webob.Response(headers=headers, body="OK\n")
            elif what == 'backend':
                req.host = '127.0.0.1:%s' % self.bind_port
                req.path_info = "/v1/%s/monitoring/backend" % self.account

                app_iter = self._app_call(env)
                status = self._get_status_int()
                headers = self._response_headers

                resp = webob.Response(status=status, headers=headers, app_iter=app_iter)
            else:
                resp = webob.exc.HTTPNotFound('Monitoring type not found "%s"' % (req.path))
            return resp(env, start_response)

    if match is None:
        match = re.match(r'^/(?P<path>[^/]+)?$', req.path)
        # /index.html /favicon.ico /robots.txt etc.
        # serve from a default "root" container
        if match:
            path = match.group('path')
            if not path:
                # bare "/" request
                path = 'index.html'

            req.host = '127.0.0.1:%s' % self.bind_port
            req.path_info = "/v1/%s/root/%s" % (self.account, path)

            app_iter = self._app_call(env)
            status = self._get_status_int()
            headers = self._response_headers

            resp = webob.Response(status=status, headers=headers, app_iter=app_iter)
            return resp(env, start_response)

    # Internally rewrite the URL based on the regex it matched...
    if match:
        # Get the per-project "conceptual" container name, e.g. "<proj><lang><repo><zone>"
        container = "%s-%s-%s-%s" % (proj, lang, repo, zone)
        # Add 2-digit shard to the container if it is supposed to be sharded.
        # We may thus have an "actual" container name like "<proj><lang><repo><zone>.<shard>"
        # NOTE(review): shard is '' for timeline/score matches, so a listed
        # container of that kind would get a bare "." suffix - presumably
        # such containers are never listed; confirm.
        if container in self.shard_container_list:
            container += ".%s" % shard

        # Save a url with just the account name in it.
        req.path_info = "/v1/%s" % (self.account)
        port = self.bind_port
        req.host = '127.0.0.1:%s' % port
        url = req.url[:]
        # Create a path to our object's name.
        req.path_info = "/v1/%s/%s/%s" % (self.account, container, urllib2.unquote(obj))
        #self.logger.warn("new path is %s" % req.path_info)

        # do_start_response just remembers what it got called with,
        # because our 404 handler will generate a different response.
        app_iter = self._app_call(env)
        status = self._get_status_int()
        headers = self._response_headers

        if 200 <= status < 300 or status == 304:
            # We have it! Just return it as usual.
            # If the object has an expiry header, bump its value
            # "headers" is a list of tuples
            for key, value in headers:
                if key == 'X-Delete-At':
                    # Update expiry header asynchronously
                    eventlet.spawn(self.update_expiry, env)
                    break

            return webob.Response(status=status, headers=headers,
                                  app_iter=app_iter)(env, start_response)
        elif status == 404:
            # only send thumbs to the 404 handler; just return a 404 for everything else.
            if repo == 'local' and zone == 'thumb':
                resp = self.handle404(reqorig, url, container, obj)
                return resp(env, start_response)
            else:
                resp = webob.exc.HTTPNotFound('File not found: %s' % req.path)
                return resp(env, start_response)
        elif status == 401:
            # if the Storage URL is invalid or has expired we'll get this error.
            resp = webob.exc.HTTPUnauthorized('Token may have timed out')
            return resp(env, start_response)
        else:
            resp = webob.exc.HTTPNotImplemented('Unknown Status: %s' % (status))
            return resp(env, start_response)
    else:
        resp = webob.exc.HTTPNotFound('Regexp failed to match URI: "%s"' % (req.path))
        return resp(env, start_response)
def __call__(self, env, start_response):
    """WSGI entry point: rewrite an upload-style URL to a Swift object path.

    PUT requests, and requests whose path starts with ``/auth`` or contains
    ``AUTH``, are passed to the wrapped app unchanged.  Any other path is
    expected to look like one of::

        /<site>/<lang>/<object>
        /<site>/<lang>/thumb/<object>

    e.g. ``/wikipedia/commons/a/aa/000_Finlanda_harta.PNG``.  The request is
    re-pointed at ``/v1/<account>/<site>-<lang>[-thumb]/<unquoted object>``
    on ``127.0.0.1:<bind_port>`` and forwarded to the wrapped app.

    Result by backend status:

    * 2xx or 304 -- the backend response is relayed, minus a header named
      exactly ``etag`` if one is present.
    * 404 -- whatever ``self.handle404`` returns.
    * 401 -- 401 "Token may have timed out".
    * anything else -- 501 "Unknown Status: <status>".

    A path that does not match the expected form yields 400; a matching path
    with an empty object name yields 403 (no container listings).

    :param env: WSGI environ dict; it is modified through the webob request
        wrapper before being handed to the wrapped app.
    :param start_response: WSGI ``start_response`` callable.
    :returns: the WSGI response iterable.
    """
    # NOTE: a surrounding try/except was commented out while debugging so
    # that failures show where they happened.
    req = webob.Request(env)

    # PUT requests never need rewriting.
    if req.method == 'PUT':
        return self.app(env, start_response)

    # If it already has AUTH, presume that it's good.  #07
    if req.path.startswith('/auth') or 'AUTH' in req.path:
        return self.app(env, start_response)

    # Keep a copy of the original request so we can ask the scalers for it.
    reqorig = req.copy()

    # Match these two URL forms (source files and thumbnails):
    #   http://upload.wikimedia.org/<site>/<lang>/.*
    #   http://upload.wikimedia.org/<site>/<lang>/thumb/.*
    match = re.match(r'/(.*?)/(.*?)/(.*)', req.path)
    if not match:
        resp = webob.exc.HTTPBadRequest('Regexp failed: "%s"' % (req.path))  #11
        return resp(env, start_response)

    # Our target URL is as follows (example):
    # https://alsted.wikimedia.org:8080/v1/AUTH_6790933748e741268babd69804c6298b/wikipedia-en/2/25/Machinesmith.png
    container = "%s-%s" % (match.group(1), match.group(2))  #02
    obj = match.group(3)

    # Thumbnails live in their own "<site>-<lang>-thumb" container.
    if obj.startswith("thumb/"):  #03
        container += "-thumb"
        obj = obj[len("thumb/"):]

    if not obj:
        # Don't let them list the container (it's CRAZY huge).  #08
        resp = webob.exc.HTTPForbidden('No container listing')
        return resp(env, start_response)

    # Capture a URL with just the account name in it (passed to handle404)
    # before pointing the request at the object itself.
    req.path_info = "/v1/%s" % (self.account)
    req.host = '127.0.0.1:%s' % self.bind_port
    url = req.url[:]

    # Create a path to our object's name.
    req.path_info = "/v1/%s/%s/%s" % (self.account, container,
                                      urllib2.unquote(obj))

    # do_start_response just remembers what it got called with,
    # because our 404 handler will generate a different response.
    controller = ObjectController()
    app_iter = self.app(env, controller.do_start_response)  #01
    status = int(controller.response_args[0].split()[0])
    headers = dict(controller.response_args[1])

    if 200 <= status < 300 or status == 304:
        # We have it!  Just return it as usual.
        # NOTE(review): this only matches a lower-case 'etag' key; confirm
        # whether other spellings of the header should be dropped as well.
        if 'etag' in headers:
            del headers['etag']
        return webob.Response(status=status, headers=headers,
                              app_iter=app_iter)(env, start_response)  #01a

    # Every remaining path answers with a response of our own, so the
    # backend's body is never relayed.  The WSGI spec requires close() to be
    # called on a response iterable that provides it; without this the
    # backend response is leaked.
    close = getattr(app_iter, 'close', None)
    if close is not None:
        close()

    if status == 404:  #4
        resp = self.handle404(reqorig, url, container, obj)
    elif status == 401:
        # If the Storage URL is invalid or has expired we'll get this error.
        resp = webob.exc.HTTPUnauthorized('Token may have timed out')  #05
    else:
        resp = webob.exc.HTTPNotImplemented('Unknown Status: %s' % (status))  #10
    return resp(env, start_response)
def __call__(self, env, start_response):
    """Handle one WSGI request, mapping upload URLs onto Swift objects.

    Requests that are PUTs, or whose path starts with ``/auth`` or contains
    ``AUTH``, go straight to the wrapped app.  Otherwise the path must have
    the form ``/<site>/<lang>/<object>`` (optionally with ``thumb/`` leading
    the object part); it is rewritten to
    ``/v1/<account>/<site>-<lang>[-thumb]/<unquoted object>`` with the host
    set to ``127.0.0.1:<bind_port>`` and then sent to the wrapped app.

    Responses:

    * backend 2xx / 304 -- relayed as-is except that a header keyed exactly
      ``etag`` is removed;
    * backend 404 -- the result of ``self.handle404``;
    * backend 401 -- 401 "Token may have timed out";
    * any other backend status -- 501 "Unknown Status: <status>";
    * empty object name -- 403 "No container listing";
    * path not matching the expected form -- 400.

    :param env: WSGI environ dict; altered via the webob request wrapper
        before the wrapped app is called.
    :param start_response: WSGI ``start_response`` callable.
    :returns: the WSGI response iterable.
    """
    # NOTE: the enclosing try was commented out while debugging so you can
    # see where stuff happened.
    req = webob.Request(env)

    # PUT requests never need rewriting.
    if req.method == 'PUT':
        return self.app(env, start_response)

    # If it already has AUTH, presume that it's good.  #07
    if req.path.startswith('/auth') or 'AUTH' in req.path:
        return self.app(env, start_response)

    # Keep a copy of the original request so we can ask the scalers for it.
    reqorig = req.copy()

    # Match these two URL forms (source files and thumbnails):
    #   http://upload.wikimedia.org/<site>/<lang>/.*
    #   http://upload.wikimedia.org/<site>/<lang>/thumb/.*
    # Example:
    #   http://upload.wikimedia.org/wikipedia/commons/a/aa/000_Finlanda_harta.PNG
    #   http://upload.wikimedia.org/wikipedia/commons/thumb/a/aa/000_Finlanda_harta.PNG/75px-000_Finlanda_harta.PNG
    match = re.match(r'/(.*?)/(.*?)/(.*)', req.path)
    if match:
        # Container is "<site>-<lang>", e.g. "wikipedia-en".  #02
        container = "%s-%s" % (match.group(1), match.group(2))
        obj = match.group(3)

        # Include the thumb in the container.  #03
        if obj.startswith("thumb/"):
            container += "-thumb"
            obj = obj[len("thumb/"):]

        if not obj:
            # Don't let them list the container (it's CRAZY huge).  #08
            resp = webob.exc.HTTPForbidden('No container listing')
            return resp(env, start_response)

        # Save a url with just the account name in it.
        req.path_info = "/v1/%s" % (self.account)
        port = self.bind_port
        req.host = '127.0.0.1:%s' % port
        url = req.url[:]

        # Create a path to our object's name.
        req.path_info = "/v1/%s/%s/%s" % (self.account, container,
                                          urllib2.unquote(obj))

        # do_start_response just remembers what it got called with,
        # because our 404 handler will generate a different response.
        controller = ObjectController()
        app_iter = self.app(env, controller.do_start_response)  #01
        status = int(controller.response_args[0].split()[0])
        headers = dict(controller.response_args[1])

        if 200 <= status < 300 or status == 304:
            # We have it!  Just return it as usual.
            # NOTE(review): only a lower-case 'etag' key is removed here;
            # confirm that is the intended spelling.
            if 'etag' in headers:
                del headers['etag']
            return webob.Response(status=status, headers=headers,
                                  app_iter=app_iter)(env, start_response)  #01a

        # From here on the backend body is discarded in favour of our own
        # response.  Close it first: the WSGI spec says an iterable's
        # close() must be called when the request is done with it, and
        # skipping that leaks the backend response.
        if hasattr(app_iter, 'close'):
            app_iter.close()

        if status == 404:  #4
            resp = self.handle404(reqorig, url, container, obj)
            return resp(env, start_response)
        elif status == 401:
            # If the Storage URL is invalid or has expired we'll get this
            # error.
            resp = webob.exc.HTTPUnauthorized('Token may have timed out')  #05
            return resp(env, start_response)
        else:
            resp = webob.exc.HTTPNotImplemented('Unknown Status: %s' % (status))  #10
            return resp(env, start_response)
    else:
        resp = webob.exc.HTTPBadRequest('Regexp failed: "%s"' % (req.path))  #11
        return resp(env, start_response)