コード例 #1
0
ファイル: test.py プロジェクト: meschansky/psl
    def test_custom_psl(self):
        source = """
invalid
*.invalid
!test.invalid
"""
        psl = PublicSuffixList(source.splitlines())

        self.assertEqual(psl.suffix("example.invalid"), None)
        self.assertEqual(psl.suffix("test.invalid"), "test.invalid")
        self.assertEqual(psl.suffix("some.test.invalid"), "test.invalid")
        self.assertEqual(psl.suffix("aaa.bbb.ccc.invalid"), "bbb.ccc.invalid")

        self.assertEqual(psl.publicsuffix("example.invalid"), "example.invalid")
        self.assertEqual(psl.publicsuffix("test.invalid"), "invalid")
コード例 #2
0
ファイル: test.py プロジェクト: mzpqnxow/psl
    def test_custom_psl(self):
        source = """
invalid
*.invalid
!test.invalid
"""
        psl = PublicSuffixList(source.splitlines())

        self.assertEqual(psl.suffix("example.invalid"), None)
        self.assertEqual(psl.suffix("test.invalid"), "test.invalid")
        self.assertEqual(psl.suffix("some.test.invalid"), "test.invalid")
        self.assertEqual(psl.suffix("aaa.bbb.ccc.invalid"), "bbb.ccc.invalid")

        self.assertEqual(psl.publicsuffix("example.invalid"),
                         "example.invalid")
        self.assertEqual(psl.publicsuffix("test.invalid"), "invalid")
コード例 #3
0
ファイル: test.py プロジェクト: mzpqnxow/psl
    def test_deny_unknown(self):
        source = """
known
"""
        psl = PublicSuffixList(source.splitlines(), accept_unknown=False)

        self.assertEqual(psl.suffix("www.example.unknowntld"), None)
コード例 #4
0
ファイル: test.py プロジェクト: meschansky/psl
    def test_deny_unknown(self):
        source = """
known
"""
        psl = PublicSuffixList(source.splitlines(), accept_unknown=False)

        self.assertEqual(psl.suffix("www.example.unknowntld"), None)
コード例 #5
0
ファイル: WebEye.py プロジェクト: 10467106/WebEye
    def get_whois(self, name):
        try:
            domain = urlparse.urlparse(self.target).netloc

            # if domain is ip,stop querying domain.
            result1 = re.search("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", domain)
            if result1:
                return
            # remove port
            result2 = re.search("\:\d{1,5}$", domain)
            if result2:
                domain = domain.split(":")[0]

            # get domain's ip
            try:
                ip = socket.gethostbyname(domain)
                self.cms_list.add("IP:" + ip)
            except Exception, e:
                # print e
                pass


#            if re.match("^www\.",domain):
#                 domain = domain.strip("www.")
            psl = PublicSuffixList()
            domain = psl.suffix(domain)
            who = pythonwhois.get_whois(domain)

            # get whois
            if who["contacts"]["registrant"]["name"] is not None:
                self.cms_list.add(
                    "Domain_User:"******"contacts"]["registrant"]["name"].encode("utf8"))
            if who["contacts"]["registrant"]["email"] is not None:
                self.cms_list.add(
                    "Domain_Email:" +
                    who["contacts"]["registrant"]["email"].encode("utf8"))
            if who["contacts"]["registrant"]["phone"] is not None:
                self.cms_list.add(
                    "Domain_Phone:" +
                    who["contacts"]["registrant"]["phone"].encode("utf8"))
            if who["registrar"] is not None:
                self.cms_list.add("Domain_Registrar:" +
                                  who["registrar"][0].encode("utf8"))
            if who["nameservers"] is not None:
                name_servers = []
                for i in who["nameservers"]:
                    name_servers.append(i.encode('UTF8'))
                self.cms_list.add("Domai_name_servers:" +
                                  str(name_servers).encode("utf8"))
class Master(BaseApplication):
    """Master application: owns the Flask/SQLAlchemy setup and URL workers.

    ``init`` builds the Flask app, the database handle, the gevent handler
    pool, the compiled product-URL patterns and the public suffix list;
    ``start`` fills the pool with workers that drain the URL queue.
    """

    # Stays None until init() has built the Flask app.
    __flask = None

    @property
    def flask(self):
        """Return the Flask app created by ``init``.

        Raises:
            ApplicationInitError: if ``init`` has not been called yet.
        """
        if not self.__flask:
            raise ApplicationInitError(
                "Cannot obtain server instance before init")
        return self.__flask

    @property
    def db(self):
        """Return the SQLAlchemy handle created by ``init``."""
        # NOTE(review): unlike ``flask`` there is no pre-init guard here, so
        # reading this before init() raises AttributeError -- confirm that
        # this is intended.
        return self.__db

    def init(self, config=None):
        """Build the Flask app, database, worker pool and URL rule set.

        Args:
            config: optional configuration handed to ``load_config_obj``;
                when None, the existing ``self.config`` is used unchanged.
        """
        if config is not None:
            self.config = load_config_obj(config)

        app = Flask('product_identifier')
        app.config.from_object(self.config)
        self.__flask = app
        self.__db = SQLAlchemy(self.__flask)
        Migrate(self.__flask, self.db)

        self.handler_pool = gevent.pool.Pool(
            self.config.MASTER_HANDLER_POOL_SIZE)

        ruleset_path = os.path.join(self.config.DATA_DIR, "ruleset.json")
        with open(ruleset_path, "r") as f:
            rule_set = json.load(f)
        # Only the regex values are used; the rule names are ignored.
        self.product_patterns = [
            re.compile(pattern) for pattern in rule_set["rules"].values()]

        self.__psl = PublicSuffixList()

    def start(self):
        """Spawn ``MASTER_HANDLER_POOL_SIZE`` URL-processing workers."""

        def handleURL():
            """Loop forever: pop a queued URL, store it, and index it."""
            from product_identifier.models import URL
            db = Master.instance().db
            self.flask.logger.debug("JOB STARTED")
            while True:
                try:
                    # TODO: susceptible to concurrency problems
                    in_url = self.scripts.pop_zset(keys=[URLS_TO_PROCESS_SET])
                    if not in_url:
                        # no results, sleep
                        gevent.sleep(1)
                        continue

                    if self.redis.sismember(PROCESSED_URLS_SET, in_url):
                        self.flask.logger.debug(
                            "SKIPPING: {}".format(in_url))
                        continue

                    self.flask.logger.debug("PROCESSING: {}".format(in_url))
                    uri = furl(in_url)
                    # NOTE(review): the lookup can apparently return None
                    # (e.g. for a bare public suffix), which would then be
                    # used as a Redis key below -- confirm desired handling.
                    domain = self.__psl.suffix(uri.host)

                    try:
                        p = URL()
                        p.domain = domain
                        p.url = in_url
                        p.is_product = self.is_product_url(in_url)
                        db.session.add(p)
                        db.session.commit()
                    except Exception:
                        # A failed write is recorded, not fatal: the URL is
                        # still marked as processed below.
                        db.session.rollback()
                        error = traceback.format_exc()
                        self.flask.logger.error("DB_ERROR: {}".format(error))
                        self.redis.sadd(DB_ERRORED_URL_SET, in_url)

                    self.redis.sadd(PROCESSED_URLS_SET, in_url)

                    self.redis.rpush(domain, in_url)
                    domain_added = self.redis.sadd(DOMAINS_SET, domain)
                    if domain_added:
                        self.flask.logger.info(
                            "ADDED DOMAIN: {}".format(domain))
                except Exception:
                    # Deliberately not a bare ``except``: GreenletExit,
                    # KeyboardInterrupt and SystemExit derive from
                    # BaseException and must be able to end this loop,
                    # otherwise the worker can never be killed.
                    error = traceback.format_exc()
                    self.flask.logger.error("ERROR: {}".format(error))

        for _ in range(self.config.MASTER_HANDLER_POOL_SIZE):
            self.handler_pool.spawn(handleURL)

    def is_product_url(self, url):
        """Return True if ``url`` matches any pattern in ``product_patterns``.

        Patterns are applied with ``match``, so each one is anchored at the
        start of the URL.
        """
        return any(pat.match(url) for pat in self.product_patterns)
コード例 #7
0
ファイル: test.py プロジェクト: mzpqnxow/psl
class TestPSL(unittest.TestCase):
    """Checks for PublicSuffixList, its compat wrappers and custom lists."""

    def setUp(self):
        # Each test starts from a list constructed with no arguments.
        self.psl = PublicSuffixList()

    def test_typesafe(self):
        # The result type must follow the input type for both lookups.
        native_host = "www.example.co.jp"
        text_host = u("www.example.co.jp")
        checks = (
            (self.psl.suffix, native_host, "example.co.jp"),
            (self.psl.suffix, text_host, u("example.co.jp")),
            (self.psl.publicsuffix, native_host, "co.jp"),
            (self.psl.publicsuffix, text_host, u("co.jp")),
        )
        for lookup, host, sample in checks:
            self.assertEqual(lookup(host).__class__, sample.__class__)

    def test_uppercase(self):
        # Mixed-case input is expected to come back lower-cased.
        mixed = "wWw.eXaMpLe.cO.Jp"
        self.assertEqual(self.psl.suffix(mixed), "example.co.jp")
        self.assertEqual(self.psl.publicsuffix(mixed), "co.jp")

    def test_invaliddomain(self):
        # Empty labels, leading/trailing dots and "" give None everywhere.
        malformed = ("www..invalid", ".example.com", "example.com.", "")

        for name in malformed:
            self.assertEqual(self.psl.suffix(name), None)

        for name in malformed:
            self.assertEqual(self.psl.publicsuffix(name), None)

    def test_idn(self):
        # A Unicode TLD is handled in its Unicode form.
        tld = u("香港")
        host = u("www.example.") + tld
        self.assertEqual(self.psl.suffix(host), u("example.") + tld)
        self.assertEqual(self.psl.publicsuffix(host), tld)

    def test_punycoded(self):
        # The same TLD, passed through encode_idn first, works as well.
        tld = encode_idn(u("香港"))
        host = u("www.example.") + tld
        self.assertEqual(self.psl.suffix(host), u("example.") + tld)
        self.assertEqual(self.psl.publicsuffix(host), tld)

    def test_suffix_deny_public(self):
        # Names that are themselves public suffixes have no private suffix.
        for public_name in ("com", "co.jp", "example.nagoya.jp"):
            self.assertEqual(self.psl.suffix(public_name), None)

    def test_unknown(self):
        # By default an unlisted TLD is treated as a public suffix.
        nested = "www.example.unknowntld"
        bare = "unknowntld"

        self.assertEqual(self.psl.suffix(nested), "example.unknowntld")
        self.assertEqual(self.psl.suffix(bare), None)

        self.assertEqual(self.psl.publicsuffix(nested), "unknowntld")
        self.assertEqual(self.psl.publicsuffix(bare), "unknowntld")

    def test_deny_unknown(self):
        # With accept_unknown=False an unlisted TLD yields no suffix.
        rules = """
known
"""
        strict = PublicSuffixList(rules.splitlines(), accept_unknown=False)

        result = strict.suffix("www.example.unknowntld")
        self.assertEqual(result, None)

    def test_custom_psl(self):
        # A hand-written rule list must drive both lookups.
        rules = """
invalid
*.invalid
!test.invalid
"""
        custom = PublicSuffixList(rules.splitlines())

        private_expectations = (
            ("example.invalid", None),
            ("test.invalid", "test.invalid"),
            ("some.test.invalid", "test.invalid"),
            ("aaa.bbb.ccc.invalid", "bbb.ccc.invalid"),
        )
        for domain, expected in private_expectations:
            self.assertEqual(custom.suffix(domain), expected)

        public_expectations = (
            ("example.invalid", "example.invalid"),
            ("test.invalid", "invalid"),
        )
        for domain, expected in public_expectations:
            self.assertEqual(custom.publicsuffix(domain), expected)

    def test_publicsuffix(self):
        # publicsuffix() for a listed TLD and for an unlisted bare label.
        expectations = (
            ("www.example.com", "com"),
            ("unknowntld", "unknowntld"),
        )
        for domain, expected in expectations:
            self.assertEqual(self.psl.publicsuffix(domain), expected)

    def test_wildcard(self):
        # Behaviour of both lookups one and two labels below nagoya.jp.
        wild = "example.nagoya.jp"
        child = "test.example.nagoya.jp"

        self.assertEqual(self.psl.suffix(child), child)
        self.assertEqual(self.psl.suffix(wild), None)
        self.assertEqual(self.psl.publicsuffix(wild), wild)
        self.assertEqual(self.psl.publicsuffix(child), wild)

    def test_checkpublicsuffix_script(self):
        # Replay every checkPublicSuffix(...) line of test_psl.txt, which
        # sits next to this file, against suffix().
        pattern = re.compile(
            r"^checkPublicSuffix\(('[^']+'), (null|'[^']+')\);")
        fixture = os.path.join(os.path.dirname(__file__), "test_psl.txt")

        with open(fixture, "rb") as handle:
            # Numbering counts every line, including the skipped ones.
            for lineno, raw in enumerate(handle, 1):
                found = pattern.match(raw.decode("utf-8"))
                if found is None:
                    continue

                quoted_arg, quoted_res = found.groups()
                domain = quoted_arg.strip("'")
                if quoted_res == "null":
                    expected = None
                else:
                    expected = quoted_res.strip("'")

                self.assertEqual(
                    self.psl.suffix(domain), expected,
                    "in line {0}: {1}".format(lineno, raw.strip()))

    def test_typeerror(self):
        # Non-string arguments must be rejected with TypeError.
        with self.assertRaises(TypeError):
            self.psl.suffix(None)
        with self.assertRaises(TypeError):
            self.psl.suffix(1)
        if b("") != "":
            # python3
            with self.assertRaises(TypeError):
                self.psl.suffix(b("www.example.com"))

    def test_compatclass(self):
        # The compat wrapper reports "" where there is no private suffix.
        from publicsuffixlist.compat import PublicSuffixList
        compat = PublicSuffixList()

        expectations = (
            ("test.example.com", "example.com"),
            ("com", ""),
            ("", ""),
        )
        for domain, expected in expectations:
            self.assertEqual(compat.get_public_suffix(domain), expected)

    def test_unsafecompatclass(self):
        # The unsafe wrapper hands back a bare "com" unchanged.
        from publicsuffixlist.compat import UnsafePublicSuffixList
        unsafe = UnsafePublicSuffixList()

        expectations = (
            ("test.example.com", "example.com"),
            ("com", "com"),
            ("", ""),
        )
        for domain, expected in expectations:
            self.assertEqual(unsafe.get_public_suffix(domain), expected)

    def test_toomanylabels(self):
        # A name with a million extra leading labels must still resolve.
        deep = "a." * 1000000 + "example.com"

        self.assertEqual(self.psl.publicsuffix(deep), "com")
        self.assertEqual(self.psl.privatesuffix(deep), "example.com")

    def test_flatstring(self):
        # The rule source may be one newline-separated text string.
        flat = PublicSuffixList(u("com\nnet\n"))
        self.assertEqual(flat.publicsuffix("example.com"), "com")

    def test_flatbytestring(self):
        # ... or one newline-separated byte string.
        flat = PublicSuffixList(b("com\nnet\n"))
        self.assertEqual(flat.publicsuffix("example.com"), "com")

    def test_privateparts(self):
        # privateparts() returns the labels with the private suffix last.
        parts = self.psl.privateparts("aaa.www.example.com")
        self.assertEqual(parts, ("aaa", "www", "example.com"))

    def test_noprivateparts(self):
        parts = self.psl.privateparts("com")
        self.assertEqual(parts, None)  # no private part

    def test_reconstructparts(self):
        # Joining the parts with dots gives back the original name.
        host = "aaa.www.example.com"
        rebuilt = ".".join(self.psl.privateparts(host))
        self.assertEqual(rebuilt, "aaa.www.example.com")

    def test_subdomain(self):
        # depth counts labels kept in front of the private suffix; the last
        # entry is None because the name has no sufficient depth.
        host = "aaa.www.example.com"
        by_depth = (
            "example.com",
            "www.example.com",
            "aaa.www.example.com",
            None,
        )
        for depth, expected in enumerate(by_depth):
            self.assertEqual(self.psl.subdomain(host, depth=depth), expected)
コード例 #8
0
ファイル: master.py プロジェクト: mozilla/product_identifier
class Master(BaseApplication):
    """Master application: owns the Flask/SQLAlchemy setup and URL workers.

    ``init`` builds the Flask app, the database handle, the gevent handler
    pool, the compiled product-URL patterns and the public suffix list;
    ``start`` fills the pool with workers that drain the URL queue.
    """

    # Stays None until init() has built the Flask app.
    __flask = None

    @property
    def flask(self):
        """Return the Flask app created by ``init``.

        Raises:
            ApplicationInitError: if ``init`` has not been called yet.
        """
        if not self.__flask:
            raise ApplicationInitError("Cannot obtain server instance before init")
        return self.__flask

    @property
    def db(self):
        """Return the SQLAlchemy handle created by ``init``."""
        # NOTE(review): unlike ``flask`` there is no pre-init guard here, so
        # reading this before init() raises AttributeError -- confirm that
        # this is intended.
        return self.__db

    def init(self, config=None):
        """Build the Flask app, database, worker pool and URL rule set.

        Args:
            config: optional configuration handed to ``load_config_obj``;
                when None, the existing ``self.config`` is used unchanged.
        """
        if config is not None:
            self.config = load_config_obj(config)

        app = Flask('product_identifier')
        app.config.from_object(self.config)
        self.__flask = app
        self.__db = SQLAlchemy(self.__flask)
        Migrate(self.__flask, self.db)

        self.handler_pool = gevent.pool.Pool(self.config.MASTER_HANDLER_POOL_SIZE)

        with open(os.path.join(self.config.DATA_DIR, "ruleset.json"), "r") as f:
            rule_set = json.load(f)
        # Only the regex values are used; the rule names are ignored.
        self.product_patterns = [re.compile(pattern) for pattern in rule_set["rules"].values()]

        self.__psl = PublicSuffixList()

    def start(self):
        """Spawn ``MASTER_HANDLER_POOL_SIZE`` URL-processing workers."""

        def handleURL():
            """Loop forever: pop a queued URL, store it, and index it."""
            from product_identifier.models import URL
            db = Master.instance().db
            self.flask.logger.debug("JOB STARTED")
            while True:
                try:
                    # TODO: susceptible to concurrency problems
                    in_url = self.scripts.pop_zset(keys=[URLS_TO_PROCESS_SET])
                    if not in_url:
                        # no results, sleep
                        gevent.sleep(1)
                        continue

                    if self.redis.sismember(PROCESSED_URLS_SET, in_url):
                        self.flask.logger.debug("SKIPPING: {}".format(in_url))
                        continue

                    self.flask.logger.debug("PROCESSING: {}".format(in_url))
                    uri = furl(in_url)
                    # NOTE(review): the lookup can apparently return None
                    # (e.g. for a bare public suffix), which would then be
                    # used as a Redis key below -- confirm desired handling.
                    domain = self.__psl.suffix(uri.host)

                    try:
                        p = URL()
                        p.domain = domain
                        p.url = in_url
                        p.is_product = self.is_product_url(in_url)
                        db.session.add(p)
                        db.session.commit()
                    except Exception:
                        # A failed write is recorded, not fatal: the URL is
                        # still marked as processed below.
                        db.session.rollback()
                        error = traceback.format_exc()
                        self.flask.logger.error("DB_ERROR: {}".format(error))
                        self.redis.sadd(DB_ERRORED_URL_SET, in_url)

                    self.redis.sadd(PROCESSED_URLS_SET, in_url)

                    self.redis.rpush(domain, in_url)
                    domain_added = self.redis.sadd(DOMAINS_SET, domain)
                    if domain_added:
                        self.flask.logger.info("ADDED DOMAIN: {}".format(domain))
                except Exception:
                    # Deliberately not a bare ``except``: GreenletExit,
                    # KeyboardInterrupt and SystemExit derive from
                    # BaseException and must be able to end this loop,
                    # otherwise the worker can never be killed.
                    error = traceback.format_exc()
                    self.flask.logger.error("ERROR: {}".format(error))

        for _ in range(self.config.MASTER_HANDLER_POOL_SIZE):
            self.handler_pool.spawn(handleURL)

    def is_product_url(self, url):
        """Return True if ``url`` matches any pattern in ``product_patterns``.

        Patterns are applied with ``match``, so each one is anchored at the
        start of the URL.
        """
        return any(pat.match(url) for pat in self.product_patterns)
コード例 #9
0
ファイル: test.py プロジェクト: meschansky/psl
class TestPSL(unittest.TestCase):
    """Checks for PublicSuffixList, its compat wrappers and custom lists."""

    def setUp(self):
        # Each test starts from a list constructed with no arguments.
        self.psl = PublicSuffixList()

    def test_typesafe(self):
        # The result type must follow the input type for both lookups.
        private, public = self.psl.suffix, self.psl.publicsuffix

        native_private = private("www.example.co.jp")
        self.assertEqual(native_private.__class__, "example.co.jp".__class__)
        text_private = private(u("www.example.co.jp"))
        self.assertEqual(text_private.__class__, u("example.co.jp").__class__)

        native_public = public("www.example.co.jp")
        self.assertEqual(native_public.__class__, "co.jp".__class__)
        text_public = public(u("www.example.co.jp"))
        self.assertEqual(text_public.__class__, u("co.jp").__class__)

    def test_uppercase(self):
        # Mixed-case input is expected to come back lower-cased.
        shouted = "wWw.eXaMpLe.cO.Jp"
        self.assertEqual(self.psl.suffix(shouted), "example.co.jp")
        self.assertEqual(self.psl.publicsuffix(shouted), "co.jp")

    def test_invaliddomain(self):
        # Empty labels, leading/trailing dots and "" give None everywhere.
        broken_names = ("www..invalid", ".example.com", "example.com.", "")

        for lookup in (self.psl.suffix, self.psl.publicsuffix):
            for candidate in broken_names:
                self.assertEqual(lookup(candidate), None)

    def test_idn(self):
        # A Unicode TLD is handled in its Unicode form.
        tld = u("香港")
        full_name = u("www.example.") + tld
        self.assertEqual(self.psl.suffix(full_name), u("example.") + tld)
        self.assertEqual(self.psl.publicsuffix(full_name), tld)

    def test_punycoded(self):
        # The same TLD, passed through encode_idn first, works as well.
        tld = encode_idn(u("香港"))
        full_name = u("www.example.") + tld
        self.assertEqual(self.psl.suffix(full_name), u("example.") + tld)
        self.assertEqual(self.psl.publicsuffix(full_name), tld)

    def test_suffix_deny_public(self):
        # Names that are themselves public suffixes have no private suffix.
        for public_name in ("com", "co.jp", "example.nagoya.jp"):
            self.assertEqual(self.psl.suffix(public_name), None)

    def test_unknown(self):
        # By default an unlisted TLD is treated as a public suffix.
        private, public = self.psl.suffix, self.psl.publicsuffix

        self.assertEqual(private("www.example.unknowntld"), "example.unknowntld")
        self.assertEqual(private("unknowntld"), None)

        self.assertEqual(public("www.example.unknowntld"), "unknowntld")
        self.assertEqual(public("unknowntld"), "unknowntld")

    def test_deny_unknown(self):
        # With accept_unknown=False an unlisted TLD yields no suffix.
        listing = """
known
"""
        rule_lines = listing.splitlines()
        strict = PublicSuffixList(rule_lines, accept_unknown=False)

        self.assertEqual(strict.suffix("www.example.unknowntld"), None)

    def test_custom_psl(self):
        # A hand-written rule list must drive both lookups.
        rule_text = """
invalid
*.invalid
!test.invalid
"""
        custom = PublicSuffixList(rule_text.splitlines())
        private = custom.suffix
        public = custom.publicsuffix

        self.assertEqual(private("example.invalid"), None)
        self.assertEqual(private("test.invalid"), "test.invalid")
        self.assertEqual(private("some.test.invalid"), "test.invalid")
        self.assertEqual(private("aaa.bbb.ccc.invalid"), "bbb.ccc.invalid")

        self.assertEqual(public("example.invalid"), "example.invalid")
        self.assertEqual(public("test.invalid"), "invalid")

    def test_publicsuffix(self):
        # publicsuffix() for a listed TLD and for an unlisted bare label.
        public = self.psl.publicsuffix
        self.assertEqual(public("www.example.com"), "com")
        self.assertEqual(public("unknowntld"), "unknowntld")

    def test_wildcard(self):
        # Behaviour of both lookups one and two labels below nagoya.jp.
        parent = "example.nagoya.jp"
        leaf = "test.example.nagoya.jp"

        self.assertEqual(self.psl.suffix(leaf), leaf)
        self.assertEqual(self.psl.suffix(parent), None)
        self.assertEqual(self.psl.publicsuffix(parent), parent)
        self.assertEqual(self.psl.publicsuffix(leaf), parent)

    def test_checkpublicsuffix_script(self):
        # Replay every checkPublicSuffix(...) line of test_psl.txt, which
        # sits next to this file, against suffix().
        call_re = re.compile(r"^checkPublicSuffix\(('[^']+'), (null|'[^']+')\);")
        script_path = os.path.join(os.path.dirname(__file__), "test_psl.txt")

        with open(script_path, "rb") as script:
            # Numbering counts every line, including the skipped ones.
            for number, raw_line in enumerate(script, 1):
                hit = call_re.match(raw_line.decode("utf-8"))
                if hit is None:
                    continue

                domain = hit.group(1).strip("'")
                outcome = hit.group(2)
                expected = None
                if outcome != "null":
                    expected = outcome.strip("'")

                failure_note = "in line {0}: {1}".format(number, raw_line.strip())
                self.assertEqual(self.psl.suffix(domain), expected, failure_note)

    def test_typeerror(self):
        # Non-string arguments must be rejected with TypeError.
        with self.assertRaises(TypeError):
            self.psl.suffix(None)
        with self.assertRaises(TypeError):
            self.psl.suffix(1)
        if b("") != "":
            # python3
            with self.assertRaises(TypeError):
                self.psl.suffix(b("www.example.com"))

    def test_compatclass(self):
        # The compat wrapper reports "" where there is no private suffix.
        from publicsuffixlist.compat import PublicSuffixList
        wrapper = PublicSuffixList()
        lookup = wrapper.get_public_suffix

        self.assertEqual(lookup("test.example.com"), "example.com")
        self.assertEqual(lookup("com"), "")
        self.assertEqual(lookup(""), "")

    def test_unsafecompatclass(self):
        # The unsafe wrapper hands back a bare "com" unchanged.
        from publicsuffixlist.compat import UnsafePublicSuffixList
        wrapper = UnsafePublicSuffixList()
        lookup = wrapper.get_public_suffix

        self.assertEqual(lookup("test.example.com"), "example.com")
        self.assertEqual(lookup("com"), "com")
        self.assertEqual(lookup(""), "")

    def test_toomanylabels(self):
        # A name with a million extra leading labels must still resolve.
        huge_name = "a." * 1000000 + "example.com"

        self.assertEqual(self.psl.publicsuffix(huge_name), "com")
        self.assertEqual(self.psl.privatesuffix(huge_name), "example.com")

    def test_flatstring(self):
        # The rule source may be one newline-separated text string.
        from_text = PublicSuffixList(u("com\nnet\n"))
        self.assertEqual(from_text.publicsuffix("example.com"), "com")

    def test_flatbytestring(self):
        # ... or one newline-separated byte string.
        from_bytes = PublicSuffixList(b("com\nnet\n"))
        self.assertEqual(from_bytes.publicsuffix("example.com"), "com")