def get_hand_picked_titles(filename_part):
    """Return the article titles listed in a hand-tagged wiki URL file.

    Reads ``<filename_part>_wikis.txt`` from the hand-tagged test directory.
    On every line that is not blank, the first whitespace-separated token is
    treated as a URL: its final '/'-separated segment is percent-decoded,
    decoded from UTF-8, and its underscores are replaced by spaces.

    :param filename_part: prefix of the ``_wikis.txt`` file to read.
    :returns: list of decoded titles, in file order.
    """
    wiki_dir = '/homes/rps10/rps10Docs/Individual_Project/TermExtraction/hand-tagged-tests/wikiArticles/'
    titles = []
    with open(wiki_dir + filename_part + '_wikis.txt') as wiki_file:
        for raw_line in wiki_file:
            tokens = raw_line.split()
            if not tokens:
                # Whitespace-only lines carry no URL.
                continue
            last_segment = tokens[0].rsplit('/', 1)[-1]
            decoded = urllib2.unquote(last_segment).decode('utf8')
            titles.append(decoded.replace('_', ' '))
    return titles
def get_hand_picked_titles(filename_part):
    """Collect the hand-picked article titles for one test document.

    The URLs are read from ``<filename_part>_wikis.txt`` in the hand-tagged
    test directory. Only the first whitespace-separated token of each
    non-blank line is used; the text after its last '/' is percent-decoded,
    decoded from UTF-8, and has '_' replaced with ' '.

    :param filename_part: prefix of the ``_wikis.txt`` file to read.
    :returns: list of titles in the order they appear in the file.
    """
    source_path = (
        '/homes/rps10/rps10Docs/Individual_Project/TermExtraction/hand-tagged-tests/wikiArticles/'
        + filename_part + '_wikis.txt')
    collected = []
    with open(source_path) as handle:
        for entry in handle:
            fields = entry.split()
            if fields:
                slug = fields[0].rsplit('/', 1)[-1]
                text = urllib2.unquote(slug).decode('utf8')
                collected.append(text.replace('_', ' '))
    return collected
def compare_wiki_results(filename_part,retrieved_wiki_titles):
    hand_picked_titles = []
    with open('/homes/rps10/rps10Docs/Individual_Project/TermExtraction/hand-tagged-tests/wikiArticles/' + filename_part + '_wikis.txt') as f:
        for line in f:
            print line
            if line.split():
                url = line.split()[0]
                print repr(url)
                url_title = url.split('/')[-1]
                print repr(url_title)
                unquoted_url_title = urllib2.unquote(url_title).decode('utf8')
                print repr(unquoted_url_title)
                title = ' '.join(unquoted_url_title.split('_'))
                print repr(title)
                hand_picked_titles.append(title)
    print hand_picked_titles
    print "Titles not retrieved: "
    print set(hand_picked_titles).difference(set(retrieved_wiki_titles))
def compare_wiki_results(filename_part, retrieved_wiki_titles):
    hand_picked_titles = []
    with open(
            '/homes/rps10/rps10Docs/Individual_Project/TermExtraction/hand-tagged-tests/wikiArticles/'
            + filename_part + '_wikis.txt') as f:
        for line in f:
            print line
            if line.split():
                url = line.split()[0]
                print repr(url)
                url_title = url.split('/')[-1]
                print repr(url_title)
                unquoted_url_title = urllib2.unquote(url_title).decode('utf8')
                print repr(unquoted_url_title)
                title = ' '.join(unquoted_url_title.split('_'))
                print repr(title)
                hand_picked_titles.append(title)
    print hand_picked_titles
    print "Titles not retrieved: "
    print set(hand_picked_titles).difference(set(retrieved_wiki_titles))
Exemple #5
0
    def _handle_request(self, env, start_response):
        """Map a public upload URL onto an internal container/object path.

        The request path is matched, in order, against regexes for regular
        uploads, timeline renderings, math renderings, score renderings,
        monitoring probes and single-segment "root" files. On a match the
        request's host and path_info are rewritten and the request is passed
        on through self._app_call(env); the status and headers are then read
        back from self._get_status_int() and self._response_headers.

        :param env: request environment, wrapped in a swob.Request. The code
            relies on changes made through that wrapper being visible to
            self._app_call(env).
        :param start_response: callable forwarded to whichever response
            object is finally invoked.
        :returns: the result of calling the selected response object with
            (env, start_response); a 404 response if no regex matches.
        """
        req = swob.Request(env)

        # Double (or triple, etc.) slashes in the URL should be ignored;
        # collapse them. fixes T34864
        req.path_info = re.sub(r'/{2,}', '/', req.path_info)

        # Keep a copy of the original request so we can ask the scalers for it
        reqorig = swob.Request(req.environ.copy())

        # Containers have 5 components: project, language, repo, zone, and shard.
        # If there's no zone in the URL, the zone is assumed to be 'public' (for b/c).
        # Shard is optional (and configurable), and is only used for large containers.
        #
        # Projects are wikipedia, wikinews, etc.
        # Languages are en, de, fr, commons, etc.
        # Repos are local, timeline, etc.
        # Zones are public, thumb, temp, etc.
        # Shard is extracted from "hash paths" in the URL and is 2 hex digits.
        #
        # These attributes are mapped to container names in the form of either:
        # (a) proj-lang-repo-zone (if not sharded)
        # (b) proj-lang-repo-zone.shard (if sharded)
        # (c) global-data-repo-zone (if not sharded)
        # (d) global-data-repo-zone.shard (if sharded)
        #
        # Rewrite wiki-global URLs of these forms:
        # (a) http://upload.wikimedia.org/math/<relpath>
        #         => http://msfe/v1/AUTH_<hash>/global-data-math-render/<relpath>
        # (b) http://upload.wikimedia.org/<proj>/<lang>/math/<relpath> (legacy)
        #         => http://msfe/v1/AUTH_<hash>/global-data-math-render/<relpath>
        #
        # Rewrite wiki-relative URLs of these forms:
        # (a) http://upload.wikimedia.org/<proj>/<lang>/<relpath>
        #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-public/<relpath>
        # (b) http://upload.wikimedia.org/<proj>/<lang>/archive/<relpath>
        #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-public/archive/<relpath>
        # (c) http://upload.wikimedia.org/<proj>/<lang>/thumb/<relpath>
        #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-thumb/<relpath>
        # (d) http://upload.wikimedia.org/<proj>/<lang>/thumb/archive/<relpath>
        #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-thumb/archive/<relpath>
        # (e) http://upload.wikimedia.org/<proj>/<lang>/thumb/temp/<relpath>
        #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-thumb/temp/<relpath>
        # (f) http://upload.wikimedia.org/<proj>/<lang>/transcoded/<relpath>
        #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-transcoded/<relpath>
        # (g) http://upload.wikimedia.org/<proj>/<lang>/timeline/<relpath>
        #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-timeline-render/<relpath>

        # regular uploads
        match = re.match(
            (r'^/(?P<proj>[^/]+)/(?P<lang>[^/]+)/'
             r'((?P<zone>transcoded|thumb)/)?'
             r'(?P<path>((temp|archive)/)?[0-9a-f]/(?P<shard>[0-9a-f]{2})/.+)$'
             ), req.path)
        if match:
            proj = match.group('proj')
            lang = match.group('lang')
            repo = 'local'  # the upload repo name is "local"
            # Get the repo zone (if not provided that means "public")
            zone = (match.group('zone') if match.group('zone') else 'public')
            # Get the object path relative to the zone (and thus container)
            obj = match.group('path')  # e.g. "archive/a/ab/..."
            shard = match.group('shard')

        # Each following pattern is only tried while nothing has matched yet.

        # timeline renderings
        if match is None:
            # /wikipedia/en/timeline/a876297c277d80dfd826e1f23dbfea3f.png
            match = re.match(
                r'^/(?P<proj>[^/]+)/(?P<lang>[^/]+)/(?P<repo>timeline)/(?P<path>.+)$',
                req.path)
            if match:
                proj = match.group('proj')  # wikipedia
                lang = match.group('lang')  # en
                repo = match.group('repo')  # timeline
                zone = 'render'
                obj = match.group(
                    'path')  # a876297c277d80dfd826e1f23dbfea3f.png
                shard = ''

        # math renderings
        if match is None:
            # /math/c/9/f/c9f2055dadfb49853eff822a453d9ceb.png
            # /wikipedia/en/math/c/9/f/c9f2055dadfb49853eff822a453d9ceb.png (legacy)
            match = re.match(
                (r'^(/(?P<proj>[^/]+)/(?P<lang>[^/]+))?/(?P<repo>math)/'
                 r'(?P<path>(?P<shard1>[0-9a-f])/(?P<shard2>[0-9a-f])/.+)$'),
                req.path)

            if match:
                # Any proj/lang captured from a legacy URL is ignored; math
                # always maps to the global-data container.
                proj = 'global'
                lang = 'data'
                repo = match.group('repo')  # math
                zone = 'render'
                obj = match.group(
                    'path')  # c/9/f/c9f2055dadfb49853eff822a453d9ceb.png
                shard = match.group('shard1') + match.group('shard2')  # c9

        # score renderings
        if match is None:
            # /score/j/q/jqn99bwy8777srpv45hxjoiu24f0636/jqn99bwy.png
            # /score/override-midi/8/i/8i9pzt87wtpy45lpz1rox8wusjkt7ki.ogg
            match = re.match(r'^/(?P<repo>score)/(?P<path>.+)$', req.path)
            if match:
                proj = 'global'
                lang = 'data'
                repo = match.group('repo')  # score
                zone = 'render'
                obj = match.group(
                    'path')  # j/q/jqn99bwy8777srpv45hxjoiu24f0636/jqn99bwy.png
                shard = ''

        # monitoring probes: every branch below returns directly, so these
        # requests never reach the container rewrite further down.
        if match is None:
            match = re.match(r'^/monitoring/(?P<what>.+)$', req.path)
            if match:
                what = match.group('what')
                if what == 'frontend':
                    # Answered locally without calling self._app_call.
                    headers = {'Content-Type': 'application/octet-stream'}
                    resp = swob.Response(headers=headers, body="OK\n")
                elif what == 'backend':
                    # Forwarded to the account's monitoring/backend path.
                    req.host = '127.0.0.1:%s' % self.bind_port
                    req.path_info = "/v1/%s/monitoring/backend" % self.account

                    app_iter = self._app_call(env)
                    status = self._get_status_int()
                    headers = self._response_headers

                    resp = swob.Response(status=status,
                                         headers=headers,
                                         app_iter=app_iter)
                else:
                    resp = swob.HTTPNotFound('Monitoring type not found "%s"' %
                                             (req.path))
                return resp(env, start_response)

        if match is None:
            match = re.match(r'^/(?P<path>[^/]+)?$', req.path)
            # /index.html /favicon.ico /robots.txt etc.
            # serve from a default "root" container
            if match:
                path = match.group('path')
                if not path:
                    # A bare "/" is served as index.html.
                    path = 'index.html'

                req.host = '127.0.0.1:%s' % self.bind_port
                req.path_info = "/v1/%s/root/%s" % (self.account, path)

                app_iter = self._app_call(env)
                status = self._get_status_int()
                headers = self._response_headers

                resp = swob.Response(status=status,
                                     headers=headers,
                                     app_iter=app_iter)
                return resp(env, start_response)

        # Internally rewrite the URL based on the regex it matched...
        if match:
            # Get the per-project "conceptual" container name, e.g. "<proj><lang><repo><zone>"
            container = "%s-%s-%s-%s" % (proj, lang, repo, zone)
            # Add 2-digit shard to the container if it is supposed to be sharded.
            # We may thus have an "actual" container name like "<proj><lang><repo><zone>.<shard>"
            if container in self.shard_container_list:
                container += ".%s" % shard

            # Save a url with just the account name in it.
            req.path_info = "/v1/%s" % (self.account)
            port = self.bind_port
            req.host = '127.0.0.1:%s' % port
            # Snapshot taken while path_info is still account-only; it is
            # handed to handle404 below.
            url = req.url[:]
            # Create a path to our object's name.
            req.path_info = "/v1/%s/%s/%s" % (self.account, container,
                                              urllib2.unquote(obj))
            # self.logger.warn("new path is %s" % req.path_info)

            # do_start_response just remembers what it got called with,
            # because our 404 handler will generate a different response.
            # NOTE(review): no do_start_response is referenced in this method;
            # status and headers are read back from self after _app_call.
            app_iter = self._app_call(env)
            status = self._get_status_int()
            headers = self._response_headers

            if status == 404:
                # only send thumbs to the 404 handler; just return a 404 for everything else.
                if repo == 'local' and zone == 'thumb':
                    resp = self.handle404(reqorig, url, container, obj)
                    return resp(env, start_response)
                else:
                    resp = swob.HTTPNotFound('File not found: %s' % req.path)
                    return resp(env, start_response)
            else:
                if zone == 'thumb':
                    # headers is iterated as (key, value) pairs here.
                    for key, value in headers:
                        if key == 'X-Delete-At' and self.thumbnail_update_expiry_headers:
                            # Update expiry header asynchronously
                            eventlet.spawn(self.update_expiry, env)
                            break

                # Return the response verbatim
                return swob.Response(status=status,
                                     headers=headers,
                                     app_iter=app_iter)(env, start_response)
        else:
            resp = swob.HTTPNotFound('Regexp failed to match URI: "%s"' %
                                     (req.path))
            return resp(env, start_response)
Exemple #6
0
    def __call__(self, env, start_response):
        """Rewrite GET/HEAD upload URLs to internal container/object paths.

        Requests with any other method, paths starting with '/auth', and
        paths that already contain an AUTH_<hex> component are handed to
        self.app unchanged. Otherwise the path is matched against the upload
        regex, the request's host and path_info are rewritten, and self.app
        is called with a recording start_response so that the status can be
        inspected before a response is returned.

        :param env: request environment, wrapped in a webob.Request. The code
            relies on changes made through that wrapper being visible to
            self.app(env, ...).
        :param start_response: callable forwarded to self.app or to the
            response object that is finally invoked.
        :returns: the result of self.app(...) for pass-through requests,
            otherwise the result of calling the selected response object
            with (env, start_response).
        """
      #try: commented-out while debugging so you can see where stuff happened.
        req = webob.Request(env)
        # End-users should only do GET/HEAD, nothing else needs a rewrite
        if req.method != 'GET' and req.method != 'HEAD':
            return self.app(env, start_response)

        # Double (or triple, etc.) slashes in the URL should be ignored; collapse them. fixes bug 32864
        # Repeats until a pass leaves the path unchanged, so runs of three or
        # more slashes are fully collapsed.
        while(req.path_info != req.path_info.replace('//', '/')):
            req.path_info = req.path_info.replace('//', '/')

        # If it already has AUTH, presume that it's good. #07. fixes bug 33620
        hasauth = re.search('/AUTH_[0-9a-fA-F-]{32,36}', req.path)
        if req.path.startswith('/auth') or hasauth:
            return self.app(env, start_response)

        # keep a copy of the original request so we can ask the scalers for it
        reqorig = req.copy()

        # Containers have 4 components: project, language, zone, and shard.
        # Shard is optional (and configurable).  If there's no zone in the URL,
        # the zone is 'public'.  Project, language, and zone are turned into containers
        # with the pattern proj-lang-local-zone (or proj-lang-local-zone.shard).
        # Projects are wikipedia, wikinews, etc.
        # Languages are en, de, fr, commons, etc.
        # Zones are public, thumb, and temp.
        # Shards are stolen from the URL and are 2 digits of hex.
        # Examples:
        # Rewrite URLs of these forms (source, temp, and thumbnail files):
        # (a) http://upload.wikimedia.org/<proj>/<lang>/.*
        #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-public/.*
        # (b) http://upload.wikimedia.org/<proj>/<lang>/archive/.*
        #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-public/archive/.*
        # (c) http://upload.wikimedia.org/<proj>/<lang>/thumb/.*
        #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-thumb/.*
        # (d) http://upload.wikimedia.org/<proj>/<lang>/thumb/archive/.*
        #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-thumb/archive/.*
        # (e) http://upload.wikimedia.org/<proj>/<lang>/thumb/temp/.*
        #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-thumb/temp/.*
        # (f) http://upload.wikimedia.org/<proj>/<lang>/temp/.*
        #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-temp/.*
        match = re.match(r'^/(?P<proj>[^/]+)/(?P<lang>[^/]+)/((?P<zone>thumb|temp)/)?(?P<path>((temp|archive)/)?[0-9a-f]/(?P<shard>[0-9a-f]{2})/.+)$', req.path)
        if match:
            # Get the repo zone (if not provided that means "public")
            zone = (match.group('zone') if match.group('zone') else 'public')
            # Get the object path relative to the zone (and thus container)
            obj = match.group('path') # e.g. "archive/a/ab/..."

            # Get the per-project "conceptual" container name, e.g. "<proj><lang><repo><zone>"
            container = "%s-%s-local-%s" % (match.group('proj'), match.group('lang'), zone) #02/#03
            # Add 2-digit shard to the container if it is supposed to be sharded.
            # We may thus have an "actual" container name like "<proj><lang><repo><zone>.<shard>"
            if ( (self.shard_containers == 'all') or \
                 ((self.shard_containers == 'some') and (container in self.shard_container_list)) ):
                container += ".%s" % match.group('shard')

            # Save a url with just the account name in it.
            req.path_info = "/v1/%s" % (self.account)
            port = self.bind_port
            req.host = '127.0.0.1:%s' % port
            # Snapshot taken while path_info is still account-only; it is
            # handed to handle404 below.
            url = req.url[:]
            # Create a path to our object's name.
            req.path_info = "/v1/%s/%s/%s" % (self.account, container, urllib2.unquote(obj))
            #self.logger.warn("new path is %s" % req.path_info)

            controller = ObjectController()
            # do_start_response just remembers what it got called with,
            # because our 404 handler will generate a different response.
            app_iter = self.app(env, controller.do_start_response) #01
            # response_args[0] is parsed as a status line whose first
            # whitespace-separated token is the numeric code.
            status = int(controller.response_args[0].split()[0])
            headers = dict(controller.response_args[1])

            if 200 <= status < 300 or status == 304:
                # We have it! Just return it as usual.
                #headers['X-Swift-Proxy']= `headers`
                # Only a key spelled exactly 'etag' (lowercase) is removed.
                if 'etag' in headers: del headers['etag']
                return webob.Response(status=status, headers=headers,
                        app_iter=app_iter)(env, start_response) #01a
            elif status == 404: #4
                resp = self.handle404(reqorig, url, container, obj)
                return resp(env, start_response)
            elif status == 401:
                # if the Storage URL is invalid or has expired we'll get this error.
                resp = webob.exc.HTTPUnauthorized('Token may have timed out') #05
                return resp(env, start_response)
            else:
                # Any other status is reported to the client as Not Implemented.
                resp = webob.exc.HTTPNotImplemented('Unknown Status: %s' % (status)) #10
                return resp(env, start_response)
        else:
            resp = webob.exc.HTTPBadRequest('Regexp failed: "%s"' % (req.path)) #11
            return resp(env, start_response)
    def __call__(self, env, start_response):
        """Rewrite GET/HEAD upload URLs to internal container/object paths.

        Requests with any other method, paths starting with '/auth', and
        paths that already contain an AUTH_<hex> component are handed to
        self.app unchanged. Otherwise the path is matched against the upload
        regex, the request's host and path_info are rewritten, and self.app
        is called with a recording start_response so that the status can be
        inspected before a response is returned.

        :param env: request environment, wrapped in a webob.Request. The code
            relies on changes made through that wrapper being visible to
            self.app(env, ...).
        :param start_response: callable forwarded to self.app or to the
            response object that is finally invoked.
        :returns: the result of self.app(...) for pass-through requests,
            otherwise the result of calling the selected response object
            with (env, start_response).
        """
        #try: commented-out while debugging so you can see where stuff happened.
        req = webob.Request(env)
        # End-users should only do GET/HEAD, nothing else needs a rewrite
        if req.method != 'GET' and req.method != 'HEAD':
            return self.app(env, start_response)

        # Double (or triple, etc.) slashes in the URL should be ignored; collapse them. fixes bug 32864
        # Repeats until a pass leaves the path unchanged, so runs of three or
        # more slashes are fully collapsed.
        while (req.path_info != req.path_info.replace('//', '/')):
            req.path_info = req.path_info.replace('//', '/')

        # If it already has AUTH, presume that it's good. #07. fixes bug 33620
        hasauth = re.search('/AUTH_[0-9a-fA-F-]{32,36}', req.path)
        if req.path.startswith('/auth') or hasauth:
            return self.app(env, start_response)

        # keep a copy of the original request so we can ask the scalers for it
        reqorig = req.copy()

        # Containers have 4 components: project, language, zone, and shard.
        # Shard is optional (and configurable).  If there's no zone in the URL,
        # the zone is 'public'.  Project, language, and zone are turned into containers
        # with the pattern proj-lang-local-zone (or proj-lang-local-zone.shard).
        # Projects are wikipedia, wikinews, etc.
        # Languages are en, de, fr, commons, etc.
        # Zones are public, thumb, and temp.
        # Shards are stolen from the URL and are 2 digits of hex.
        # Examples:
        # Rewrite URLs of these forms (source, temp, and thumbnail files):
        # (a) http://upload.wikimedia.org/<proj>/<lang>/.*
        #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-public/.*
        # (b) http://upload.wikimedia.org/<proj>/<lang>/archive/.*
        #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-public/archive/.*
        # (c) http://upload.wikimedia.org/<proj>/<lang>/thumb/.*
        #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-thumb/.*
        # (d) http://upload.wikimedia.org/<proj>/<lang>/thumb/archive/.*
        #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-thumb/archive/.*
        # (e) http://upload.wikimedia.org/<proj>/<lang>/thumb/temp/.*
        #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-thumb/temp/.*
        # (f) http://upload.wikimedia.org/<proj>/<lang>/temp/.*
        #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-temp/.*
        match = re.match(
            r'^/(?P<proj>[^/]+)/(?P<lang>[^/]+)/((?P<zone>thumb|temp)/)?(?P<path>((temp|archive)/)?[0-9a-f]/(?P<shard>[0-9a-f]{2})/.+)$',
            req.path)
        if match:
            # Get the repo zone (if not provided that means "public")
            zone = (match.group('zone') if match.group('zone') else 'public')
            # Get the object path relative to the zone (and thus container)
            obj = match.group('path')  # e.g. "archive/a/ab/..."

            # Get the per-project "conceptual" container name, e.g. "<proj><lang><repo><zone>"
            container = "%s-%s-local-%s" % (
                match.group('proj'), match.group('lang'), zone)  #02/#03
            # Add 2-digit shard to the container if it is supposed to be sharded.
            # We may thus have an "actual" container name like "<proj><lang><repo><zone>.<shard>"
            if ( (self.shard_containers == 'all') or \
                 ((self.shard_containers == 'some') and (container in self.shard_container_list)) ):
                container += ".%s" % match.group('shard')

            # Save a url with just the account name in it.
            req.path_info = "/v1/%s" % (self.account)
            port = self.bind_port
            req.host = '127.0.0.1:%s' % port
            # Snapshot taken while path_info is still account-only; it is
            # handed to handle404 below.
            url = req.url[:]
            # Create a path to our object's name.
            req.path_info = "/v1/%s/%s/%s" % (self.account, container,
                                              urllib2.unquote(obj))
            #self.logger.warn("new path is %s" % req.path_info)

            controller = ObjectController()
            # do_start_response just remembers what it got called with,
            # because our 404 handler will generate a different response.
            app_iter = self.app(env, controller.do_start_response)  #01
            # response_args[0] is parsed as a status line whose first
            # whitespace-separated token is the numeric code.
            status = int(controller.response_args[0].split()[0])
            headers = dict(controller.response_args[1])

            if 200 <= status < 300 or status == 304:
                # We have it! Just return it as usual.
                #headers['X-Swift-Proxy']= `headers`
                # Only a key spelled exactly 'etag' (lowercase) is removed.
                if 'etag' in headers: del headers['etag']
                return webob.Response(status=status,
                                      headers=headers,
                                      app_iter=app_iter)(env,
                                                         start_response)  #01a
            elif status == 404:  #4
                resp = self.handle404(reqorig, url, container, obj)
                return resp(env, start_response)
            elif status == 401:
                # if the Storage URL is invalid or has expired we'll get this error.
                resp = webob.exc.HTTPUnauthorized(
                    'Token may have timed out')  #05
                return resp(env, start_response)
            else:
                # Any other status is reported to the client as Not Implemented.
                resp = webob.exc.HTTPNotImplemented('Unknown Status: %s' %
                                                    (status))  #10
                return resp(env, start_response)
        else:
            resp = webob.exc.HTTPBadRequest('Regexp failed: "%s"' %
                                            (req.path))  #11
            return resp(env, start_response)
Exemple #8
0
    def handle_request(self, env, start_response):
        """Map a public upload URL onto an internal container/object path.

        The request path is matched, in order, against regexes for regular
        uploads, timeline renderings, math renderings, score renderings and
        monitoring probes. On a match the request's host and path_info are
        rewritten and the request is passed on through self._app_call(env);
        the status and headers are then read back from
        self._get_status_int() and self._response_headers.

        :param env: request environment, wrapped in a webob.Request. The code
            relies on changes made through that wrapper being visible to
            self._app_call(env).
        :param start_response: callable forwarded to whichever response
            object is finally invoked.
        :returns: the result of calling the selected response object with
            (env, start_response); a 404 response if no regex matches.
        """
        req = webob.Request(env)

        # Double (or triple, etc.) slashes in the URL should be ignored; collapse them. fixes bug 32864
        req.path_info = re.sub(r'/{2,}', '/', req.path_info)

        # Keep a copy of the original request so we can ask the scalers for it
        reqorig = req.copy()

        # Containers have 5 components: project, language, repo, zone, and shard.
        # If there's no zone in the URL, the zone is assumed to be 'public' (for b/c).
        # Shard is optional (and configurable), and is only used for large containers.
        #
        # Projects are wikipedia, wikinews, etc.
        # Languages are en, de, fr, commons, etc.
        # Repos are local, timeline, etc.
        # Zones are public, thumb, temp, etc.
        # Shard is extracted from "hash paths" in the URL and is 2 hex digits.
        #
        # These attributes are mapped to container names in the form of either:
        # (a) proj-lang-repo-zone (if not sharded)
        # (b) proj-lang-repo-zone.shard (if sharded)
        # (c) global-data-repo-zone (if not sharded)
        # (d) global-data-repo-zone.shard (if sharded)
        #
        # Rewrite wiki-global URLs of these forms:
        # (a) http://upload.wikimedia.org/math/.*
        #         => http://msfe/v1/AUTH_<hash>/global-data-math-render/.*
        #
        # Rewrite wiki-relative URLs of these forms:
        # (a) http://upload.wikimedia.org/<proj>/<lang>/<relpath>
        #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-public/<relpath>
        # (b) http://upload.wikimedia.org/<proj>/<lang>/archive/<relpath>
        #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-public/archive/<relpath>
        # (c) http://upload.wikimedia.org/<proj>/<lang>/thumb/<relpath>
        #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-thumb/<relpath>
        # (d) http://upload.wikimedia.org/<proj>/<lang>/thumb/archive/<relpath>
        #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-thumb/archive/<relpath>
        # (e) http://upload.wikimedia.org/<proj>/<lang>/thumb/temp/<relpath>
        #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-thumb/temp/<relpath>
        # (f) http://upload.wikimedia.org/<proj>/<lang>/temp/<relpath>
        #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-temp/<relpath>
        # (g) http://upload.wikimedia.org/<proj>/<lang>/transcoded/<relpath>
        #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-transcoded/<relpath>
        # (h) http://upload.wikimedia.org/<proj>/<lang>/timeline/<relpath>
        #         => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-timeline-render/<relpath>

        # regular uploads
        match = re.match(
            r'^/(?P<proj>[^/]+)/(?P<lang>[^/]+)/((?P<zone>transcoded|thumb|temp)/)?(?P<path>((temp|archive)/)?[0-9a-f]/(?P<shard>[0-9a-f]{2})/.+)$',
            req.path)
        if match:
            proj = match.group('proj')
            lang = match.group('lang')
            repo = 'local'  # the upload repo name is "local"
            # Get the repo zone (if not provided that means "public")
            zone = (match.group('zone') if match.group('zone') else 'public')
            # Get the object path relative to the zone (and thus container)
            obj = match.group('path')  # e.g. "archive/a/ab/..."
            shard = match.group('shard')

        # Each following pattern is only tried while nothing has matched yet.

        # timeline renderings
        if match is None:
            # /wikipedia/en/timeline/a876297c277d80dfd826e1f23dbfea3f.png
            match = re.match(
                r'^/(?P<proj>[^/]+)/(?P<lang>[^/]+)/(?P<repo>timeline)/(?P<path>.+)$',
                req.path)
            if match:
                proj = match.group('proj')  # wikipedia
                lang = match.group('lang')  # en
                repo = match.group('repo')  # timeline
                zone = 'render'
                obj = match.group(
                    'path')  # a876297c277d80dfd826e1f23dbfea3f.png
                shard = ''

        # math renderings
        if match is None:
            # /math/c/9/f/c9f2055dadfb49853eff822a453d9ceb.png
            match = re.match(
                r'^/(?P<repo>math)/(?P<path>(?P<shard1>[0-9a-f])/(?P<shard2>[0-9a-f])/.+)$',
                req.path)
            if match:
                proj = 'global'
                lang = 'data'
                repo = match.group('repo')  # math
                zone = 'render'
                obj = match.group(
                    'path')  # c/9/f/c9f2055dadfb49853eff822a453d9ceb.png
                shard = match.group('shard1') + match.group('shard2')  # c9

        # score renderings
        if match is None:
            # /score/j/q/jqn99bwy8777srpv45hxjoiu24f0636/jqn99bwy.png
            # /score/override-midi/8/i/8i9pzt87wtpy45lpz1rox8wusjkt7ki.ogg
            match = re.match(r'^/(?P<repo>score)/(?P<path>.+)$', req.path)
            if match:
                proj = 'global'
                lang = 'data'
                repo = match.group('repo')  # score
                zone = 'render'
                obj = match.group(
                    'path')  # j/q/jqn99bwy8777srpv45hxjoiu24f0636/jqn99bwy.png
                shard = ''

        # monitoring probes: every branch below returns directly, so these
        # requests never reach the container rewrite further down.
        if match is None:
            match = re.match(r'^/monitoring/(?P<what>.+)$', req.path)
            if match:
                what = match.group('what')
                if what == 'frontend':
                    # Answered locally without calling self._app_call.
                    headers = {'Content-Type': 'application/octet-stream'}
                    resp = webob.Response(headers=headers, body="OK\n")
                elif what == 'backend':
                    # Forwarded to the account's monitoring/backend path.
                    req.host = '127.0.0.1:%s' % self.bind_port
                    req.path_info = "/v1/%s/monitoring/backend" % self.account

                    app_iter = self._app_call(env)
                    status = self._get_status_int()
                    headers = self._response_headers

                    resp = webob.Response(status=status,
                                          headers=headers,
                                          app_iter=app_iter)
                else:
                    resp = webob.exc.HTTPNotFound(
                        'Monitoring type not found "%s"' % (req.path))
                return resp(env, start_response)

        # Internally rewrite the URL based on the regex it matched...
        if match:
            # Get the per-project "conceptual" container name, e.g. "<proj><lang><repo><zone>"
            container = "%s-%s-%s-%s" % (proj, lang, repo, zone)  #02/#03
            # Add 2-digit shard to the container if it is supposed to be sharded.
            # We may thus have an "actual" container name like "<proj><lang><repo><zone>.<shard>"
            if ( (self.shard_containers == 'all') or \
                 ((self.shard_containers == 'some') and (container in self.shard_container_list)) ):
                container += ".%s" % shard

            # Save a url with just the account name in it.
            req.path_info = "/v1/%s" % (self.account)
            port = self.bind_port
            req.host = '127.0.0.1:%s' % port
            # Snapshot taken while path_info is still account-only; it is
            # handed to handle404 below.
            url = req.url[:]
            # Create a path to our object's name.
            req.path_info = "/v1/%s/%s/%s" % (self.account, container,
                                              urllib2.unquote(obj))
            #self.logger.warn("new path is %s" % req.path_info)

            # do_start_response just remembers what it got called with,
            # because our 404 handler will generate a different response.
            # NOTE(review): no do_start_response is referenced in this method;
            # status and headers are read back from self after _app_call.
            app_iter = self._app_call(env)  #01
            status = self._get_status_int()
            headers = self._response_headers

            if 200 <= status < 300 or status == 304:
                # We have it! Just return it as usual.
                #headers['X-Swift-Proxy']= `headers`
                return webob.Response(status=status,
                                      headers=headers,
                                      app_iter=app_iter)(env,
                                                         start_response)  #01a
            elif status == 404:  #4
                # only send thumbs to the 404 handler; just return a 404 for everything else.
                if repo == 'local' and zone == 'thumb':
                    resp = self.handle404(reqorig, url, container, obj)
                    return resp(env, start_response)
                else:
                    resp = webob.exc.HTTPNotFound('File not found: %s' %
                                                  req.path)
                    return resp(env, start_response)
            elif status == 401:
                # if the Storage URL is invalid or has expired we'll get this error.
                resp = webob.exc.HTTPUnauthorized(
                    'Token may have timed out')  #05
                return resp(env, start_response)
            else:
                # Any other status is reported to the client as Not Implemented.
                resp = webob.exc.HTTPNotImplemented('Unknown Status: %s' %
                                                    (status))  #10
                return resp(env, start_response)
        else:
            resp = webob.exc.HTTPNotFound('Regexp failed to match URI: "%s"' %
                                          (req.path))  #11
            return resp(env, start_response)
    def handle_request(self, env, start_response):
        """Rewrite a public upload URL to a container/object path and serve it.

        The request path is tried against the known URL layouts in order:
        regular uploads, timeline renderings, math renderings, score
        renderings, monitoring probes and finally root-level files.
        Monitoring and root-level requests are answered immediately. For the
        other layouts the path is rewritten to
        "/v1/<account>/<container>/<object>" and handed to self._app_call;
        the status reported afterwards selects the response.

        Returns whatever the selected response object returns when it is
        called with (env, start_response).
        """
        req = webob.Request(env)

        # Collapse any run of two or more slashes into a single one (T34864).
        req.path_info = re.sub(r'/{2,}', '/', req.path_info)

        # Untouched copy of the request; passed to handle404 for missing thumbs.
        reqorig = req.copy()

        # A container name is built from up to 5 parts: project, language,
        # repo, zone and shard.
        #   - project: wikipedia, wikinews, ...
        #   - language: en, de, fr, commons, ...
        #   - repo: local, timeline, ...
        #   - zone: public, thumb, temp, ...; a URL without a zone means
        #     'public' (for b/c)
        #   - shard: 2 hex digits taken from the "hash path" part of the URL;
        #     optional, configurable, and only used for large containers
        #
        # Resulting container names:
        #   proj-lang-repo-zone           (not sharded)
        #   proj-lang-repo-zone.shard     (sharded)
        #   global-data-repo-zone         (not sharded)
        #   global-data-repo-zone.shard   (sharded)
        #
        # Wiki-global URL rewrites:
        #   http://upload.wikimedia.org/math/<relpath>
        #       => http://msfe/v1/AUTH_<hash>/global-data-math-render/<relpath>
        #   http://upload.wikimedia.org/<proj>/<lang>/math/<relpath> (legacy)
        #       => http://msfe/v1/AUTH_<hash>/global-data-math-render/<relpath>
        #
        # Wiki-relative URL rewrites:
        #   http://upload.wikimedia.org/<proj>/<lang>/<relpath>
        #       => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-public/<relpath>
        #   http://upload.wikimedia.org/<proj>/<lang>/archive/<relpath>
        #       => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-public/archive/<relpath>
        #   http://upload.wikimedia.org/<proj>/<lang>/thumb/<relpath>
        #       => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-thumb/<relpath>
        #   http://upload.wikimedia.org/<proj>/<lang>/thumb/archive/<relpath>
        #       => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-thumb/archive/<relpath>
        #   http://upload.wikimedia.org/<proj>/<lang>/thumb/temp/<relpath>
        #       => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-thumb/temp/<relpath>
        #   http://upload.wikimedia.org/<proj>/<lang>/transcoded/<relpath>
        #       => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-local-transcoded/<relpath>
        #   http://upload.wikimedia.org/<proj>/<lang>/timeline/<relpath>
        #       => http://msfe/v1/AUTH_<hash>/<proj>-<lang>-timeline-render/<relpath>

        # The path is not modified while the patterns below are being tried.
        uri = req.path

        # Regular uploads; the upload repo is always named "local".
        match = re.match(r'^/(?P<proj>[^/]+)/(?P<lang>[^/]+)/((?P<zone>transcoded|thumb)/)?(?P<path>((temp|archive)/)?[0-9a-f]/(?P<shard>[0-9a-f]{2})/.+)$', uri)
        if match:
            proj, lang = match.group('proj', 'lang')
            repo = 'local'
            # No zone in the URL means "public".
            zone = match.group('zone') or 'public'
            # Object path relative to the zone, e.g. "archive/a/ab/..."
            obj, shard = match.group('path', 'shard')

        # Timeline renderings, e.g.
        # /wikipedia/en/timeline/a876297c277d80dfd826e1f23dbfea3f.png
        if match is None:
            match = re.match(r'^/(?P<proj>[^/]+)/(?P<lang>[^/]+)/(?P<repo>timeline)/(?P<path>.+)$', uri)
            if match:
                # e.g. wikipedia, en, timeline, a876297c...png
                proj, lang, repo, obj = match.group('proj', 'lang', 'repo', 'path')
                zone = 'render'
                shard = ''

        # Math renderings, e.g.
        # /math/c/9/f/c9f2055dadfb49853eff822a453d9ceb.png
        # /wikipedia/en/math/c/9/f/c9f2055dadfb49853eff822a453d9ceb.png (legacy)
        if match is None:
            match = re.match(r'^(/(?P<proj>[^/]+)/(?P<lang>[^/]+))?/(?P<repo>math)/(?P<path>(?P<shard1>[0-9a-f])/(?P<shard2>[0-9a-f])/.+)$', uri)
            if match:
                proj, lang = 'global', 'data'
                # e.g. math, c/9/f/c9f2055dadfb49853eff822a453d9ceb.png
                repo, obj = match.group('repo', 'path')
                zone = 'render'
                # The first two single-digit directories form the shard: c9
                shard = ''.join(match.group('shard1', 'shard2'))

        # Score renderings, e.g.
        # /score/j/q/jqn99bwy8777srpv45hxjoiu24f0636/jqn99bwy.png
        # /score/override-midi/8/i/8i9pzt87wtpy45lpz1rox8wusjkt7ki.ogg
        if match is None:
            match = re.match(r'^/(?P<repo>score)/(?P<path>.+)$', uri)
            if match:
                proj, lang = 'global', 'data'
                repo, obj = match.group('repo', 'path')
                zone = 'render'
                shard = ''

        # Monitoring probes are answered here and never reach the rewrite below.
        if match is None:
            match = re.match(r'^/monitoring/(?P<what>.+)$', uri)
            if match:
                probe = match.group('what')
                if probe == 'frontend':
                    resp = webob.Response(
                        headers={'Content-Type': 'application/octet-stream'},
                        body="OK\n")
                elif probe == 'backend':
                    req.host = '127.0.0.1:%s' % self.bind_port
                    req.path_info = "/v1/%s/monitoring/backend" % self.account

                    app_iter = self._app_call(env)
                    resp = webob.Response(status=self._get_status_int(),
                                          headers=self._response_headers,
                                          app_iter=app_iter)
                else:
                    resp = webob.exc.HTTPNotFound('Monitoring type not found "%s"' % (req.path))
                return resp(env, start_response)

        # /index.html /favicon.ico /robots.txt etc. are served from a default
        # "root" container; a bare "/" maps to index.html.
        if match is None:
            match = re.match(r'^/(?P<path>[^/]+)?$', uri)
            if match:
                root_object = match.group('path') or 'index.html'

                req.host = '127.0.0.1:%s' % self.bind_port
                req.path_info = "/v1/%s/root/%s" % (self.account, root_object)

                app_iter = self._app_call(env)
                resp = webob.Response(status=self._get_status_int(),
                                      headers=self._response_headers,
                                      app_iter=app_iter)
                return resp(env, start_response)

        # Nothing matched at all.
        if not match:
            resp = webob.exc.HTTPNotFound('Regexp failed to match URI: "%s"' % (req.path))
            return resp(env, start_response)

        # Internally rewrite the URL based on the pattern it matched.
        # "Conceptual" per-project container name: <proj>-<lang>-<repo>-<zone>
        container = "%s-%s-%s-%s" % (proj, lang, repo, zone)
        # Containers listed as sharded get ".<shard>" appended, which gives
        # the "actual" container name.
        if container in self.shard_container_list:
            container += ".%s" % shard

        # First build a URL that stops at the account name and remember it...
        req.path_info = "/v1/%s" % (self.account)
        req.host = '127.0.0.1:%s' % self.bind_port
        url = req.url[:]
        # ...then point the request at the object itself.
        req.path_info = "/v1/%s/%s/%s" % (self.account, container, urllib2.unquote(obj))
        #self.logger.warn("new path is %s" % req.path_info)

        app_iter = self._app_call(env)
        status = self._get_status_int()
        headers = self._response_headers

        if 200 <= status < 300 or status == 304:
            # We have it! "headers" is a list of tuples; when the object
            # carries an expiry header, self.update_expiry is spawned via
            # eventlet so the bump happens asynchronously.
            if any(name == 'X-Delete-At' for name, _value in headers):
                eventlet.spawn(self.update_expiry, env)

            found = webob.Response(status=status, headers=headers,
                                   app_iter=app_iter)
            return found(env, start_response)

        if status == 404:
            # Only thumbs go to the 404 handler; everything else is a plain 404.
            if repo == 'local' and zone == 'thumb':
                resp = self.handle404(reqorig, url, container, obj)
            else:
                resp = webob.exc.HTTPNotFound('File not found: %s' % req.path)
            return resp(env, start_response)

        if status == 401:
            # An invalid or expired Storage URL produces this error.
            resp = webob.exc.HTTPUnauthorized('Token may have timed out')
            return resp(env, start_response)

        resp = webob.exc.HTTPNotImplemented('Unknown Status: %s' % (status))
        return resp(env, start_response)
Exemple #10
0
    def __call__(self, env, start_response):
        """WSGI entry point: map /<site>/<lang>/... URLs onto container objects.

        PUT requests, and requests whose path starts with "/auth" or contains
        "AUTH", are passed to the wrapped app unchanged. Any other path is
        matched against "/<site>/<lang>/<rest>" and rewritten to
        "/v1/<account>/<site>-<lang>[-thumb]/<object>" before the wrapped app
        is called; the status it reports selects the response returned here.
        """
        #try: commented-out while debugging so you can see where stuff happened.
        req = webob.Request(env)

        # PUT requests never need rewriting.
        if req.method == 'PUT':
            return self.app(env, start_response)

        # A path that already carries AUTH is presumed to be good. #07
        incoming = req.path
        if incoming.startswith('/auth') or 'AUTH' in incoming:
            return self.app(env, start_response)

        # Keep a copy of the original request so we can ask the scalers for it.
        reqorig = req.copy()

        # Two URL forms are handled (source files and thumbnails):
        #   http://upload.wikimedia.org/<site>/<lang>/.*
        #   http://upload.wikimedia.org/<site>/<lang>/thumb/.*
        # for example:
        #   http://upload.wikimedia.org/wikipedia/commons/a/aa/000_Finlanda_harta.PNG
        #   http://upload.wikimedia.org/wikipedia/commons/thumb/a/aa/000_Finlanda_harta.PNG/75px-000_Finlanda_harta.PNG
        match = re.match(r'/(.*?)/(.*?)/(.*)', incoming)
        if not match:
            resp = webob.exc.HTTPBadRequest('Regexp failed: "%s"' %
                                            (req.path))  #11
            return resp(env, start_response)

        # The target URL looks like this (example):
        # https://alsted.wikimedia.org:8080/v1/AUTH_6790933748e741268babd69804c6298b/wikipedia-en/2/25/Machinesmith.png
        site, lang, obj = match.groups()
        container = "%s-%s" % (site, lang)  #02
        # Thumbnails live in their own "-thumb" container.
        if obj.startswith("thumb/"):  #03
            container += "-thumb"
            obj = obj[len("thumb/"):]

        if not obj:
            # don't let them list the container (it's CRAZY huge) #08
            resp = webob.exc.HTTPForbidden('No container listing')
            return resp(env, start_response)

        # Remember a URL that stops at the account name...
        req.path_info = "/v1/%s" % (self.account)
        req.host = '127.0.0.1:%s' % self.bind_port
        url = req.url[:]
        # ...then point the request at the object itself.
        req.path_info = "/v1/%s/%s/%s" % (self.account, container,
                                          urllib2.unquote(obj))

        # The controller's do_start_response only records what it was called
        # with, because the 404 handler generates a different response.
        controller = ObjectController()
        app_iter = self.app(env, controller.do_start_response)  #01
        status = int(controller.response_args[0].split()[0])
        headers = dict(controller.response_args[1])

        if 200 <= status < 300 or status == 304:
            # We have it! Return it as usual, minus any 'etag' entry.
            #headers['X-Swift-Proxy']= `headers`
            headers.pop('etag', None)
            found = webob.Response(status=status,
                                   headers=headers,
                                   app_iter=app_iter)
            return found(env, start_response)  #01a

        if status == 404:  #4
            resp = self.handle404(reqorig, url, container, obj)
            return resp(env, start_response)

        if status == 401:
            # An invalid or expired Storage URL produces this error.
            resp = webob.exc.HTTPUnauthorized(
                'Token may have timed out')  #05
            return resp(env, start_response)

        resp = webob.exc.HTTPNotImplemented('Unknown Status: %s' %
                                            (status))  #10
        return resp(env, start_response)
    def __call__(self, env, start_response):
        """Serve an upload URL by rewriting it to an account/container/object path.

        Requests using PUT, or whose path starts with "/auth" or contains
        "AUTH", are delegated to the wrapped app as they are. Otherwise the
        path must look like "/<site>/<lang>/<rest>"; it is rewritten to
        "/v1/<account>/<site>-<lang>[-thumb]/<object>", the wrapped app is
        called, and the response is chosen from the status it reported.
        """
        #try: commented-out while debugging so you can see where stuff happened.
        req = webob.Request(env)

        # PUT requests never need rewriting.
        if req.method == 'PUT':
            return self.app(env, start_response)

        # if it already has AUTH, presume that it's good. #07
        requested = req.path
        if requested.startswith('/auth') or 'AUTH' in requested:
            return self.app(env, start_response)

        # Copy of the original request, kept so we can ask the scalers for it.
        reqorig = req.copy()

        # Handled URL forms (source files and thumbnails):
        #   http://upload.wikimedia.org/<site>/<lang>/.*
        #   http://upload.wikimedia.org/<site>/<lang>/thumb/.*
        # e.g.
        #   http://upload.wikimedia.org/wikipedia/commons/a/aa/000_Finlanda_harta.PNG
        #   http://upload.wikimedia.org/wikipedia/commons/thumb/a/aa/000_Finlanda_harta.PNG/75px-000_Finlanda_harta.PNG
        parts = re.match(r'/(.*?)/(.*?)/(.*)', requested)
        if parts is None:
            resp = webob.exc.HTTPBadRequest('Regexp failed: "%s"' % (req.path)) #11
            return resp(env, start_response)

        # Example of the target URL:
        # https://alsted.wikimedia.org:8080/v1/AUTH_6790933748e741268babd69804c6298b/wikipedia-en/2/25/Machinesmith.png
        container = "%s-%s" % (parts.group(1), parts.group(2)) #02
        obj = parts.group(3)
        thumb_prefix = "thumb/"
        if obj.startswith(thumb_prefix): #03
            # Thumbnails are stored in a separate "-thumb" container.
            container += "-thumb"
            obj = obj[len(thumb_prefix):]

        if not obj:
            # don't let them list the container (it's CRAZY huge) #08
            resp = webob.exc.HTTPForbidden('No container listing')
            return resp(env, start_response)

        # Save a URL that contains just the account name.
        req.path_info = "/v1/%s" % (self.account)
        req.host = '127.0.0.1:%s' % self.bind_port
        url = req.url[:]
        # Now build the path to the object itself.
        req.path_info = "/v1/%s/%s/%s" % (self.account, container, urllib2.unquote(obj))

        # do_start_response just records its arguments, because the 404
        # handler will generate a different response.
        controller = ObjectController()
        app_iter = self.app(env, controller.do_start_response) #01
        status = int(controller.response_args[0].split()[0])
        headers = dict(controller.response_args[1])

        # Pick the response from the reported status; every branch is then
        # invoked the same way.
        if 200 <= status < 300 or status == 304:
            # We have it! Return it as usual, without any 'etag' entry.
            #headers['X-Swift-Proxy']= `headers`
            headers.pop('etag', None)
            resp = webob.Response(status=status, headers=headers,
                                  app_iter=app_iter) #01a
        elif status == 404: #4
            resp = self.handle404(reqorig, url, container, obj)
        elif status == 401:
            # if the Storage URL is invalid or has expired we'll get this error.
            resp = webob.exc.HTTPUnauthorized('Token may have timed out') #05
        else:
            resp = webob.exc.HTTPNotImplemented('Unknown Status: %s' % (status)) #10
        return resp(env, start_response)