    def is_url_allowed(self, url, syntax=GYM2008):
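        """Return True if this ruleset allows the given URL, False otherwise.

        The scheme and host portion of the URL are ignored; only the path
        (plus params, query, and fragment) is matched against the rules.
        "/robots.txt" itself is always allowed. When syntax is GYM2008,
        the wildcard characters "*" and "$" in rule paths are honored.
        """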
        allowed = True

        # Schemes and host names are not part of the robots.txt protocol,
        # so I ignore them. It is the caller's responsibility to make
        # sure they match.
        _, _, path, parameters, query, fragment = urllib_urlparse(url)
        url = urllib_urlunparse(("", "", path, parameters, query, fragment))

        url = _unquote_path(url)
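        # e.g. (illustrative URL only) "http://example.com/foo/bar.html?x=1"
        # is reduced to "/foo/bar.html?x=1" before being matched against
        # the rules.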

        if url == '/robots.txt':
            return True

        # Defensive check: a ruleset with no rules disallows nothing.
        if not self.rules:
            return allowed

        done = False
        i = 0
        while not done:
            rule_type, path = self.rules[i]

            if (syntax == GYM2008) and ("*" in path or path.endswith("$")):
                # GYM2008-specific syntax applies here
                # http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40360
                if path.endswith("$"):
                    appendix = "$"
                    path = path[:-1]
                else:
                    appendix = ""
                # Multiple wildcard characters mean the same as one wildcard, so they can be
                # condensed into one. If I don't do this, I run the risk of creating a
                # pathological regex.
                # ref: https://bitbucket.org/philip_semanchuk/robotexclusionrulesparser/issues/1
                path = re.sub(r'\*+', '*', path)
                parts = path.split("*")
                pattern = ".*".join([re.escape(p) for p in parts]) + appendix
                if re.match(pattern, url):
                    # Ding!
                    done = True
                    allowed = (rule_type == self.ALLOW)
            else:
                # Wildcards are either not present or are taken literally.
                if url.startswith(path):
                    # Ding!
                    done = True
                    allowed = (rule_type == self.ALLOW)
                    # A blank path means "nothing", so that effectively
                    # negates the value above.
                    # e.g. "Disallow:   " means allow everything
                    if not path:
                        allowed = not allowed

            i += 1
            if i == len(self.rules):
                done = True

        return allowed
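    # A minimal usage sketch (hypothetical example; assumes a ruleset whose
    # only rule came from "Disallow: /private*.html$" for this user agent):
    #
    #   ruleset.is_url_allowed("http://example.com/private/report.html")
    #   # -> False under the default GYM2008 syntax
    #   ruleset.is_url_allowed("http://example.com/public/index.html")
    #   # -> True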