def test_custom_psl(self):
    """Check suffix()/publicsuffix() against a small hand-written rule list.

    The list holds a plain rule (``invalid``), a wildcard rule
    (``*.invalid``) and an exception rule (``!test.invalid``).
    """
    # One rule per line: splitlines() must hand the parser three separate
    # rules, not a single line containing all of them.
    source = """
invalid
*.invalid
!test.invalid
"""
    psl = PublicSuffixList(source.splitlines())

    self.assertEqual(psl.suffix("example.invalid"), None)
    self.assertEqual(psl.suffix("test.invalid"), "test.invalid")
    self.assertEqual(psl.suffix("some.test.invalid"), "test.invalid")
    self.assertEqual(psl.suffix("aaa.bbb.ccc.invalid"), "bbb.ccc.invalid")

    self.assertEqual(psl.publicsuffix("example.invalid"), "example.invalid")
    self.assertEqual(psl.publicsuffix("test.invalid"), "invalid")
def test_deny_unknown(self):
    """With ``accept_unknown=False`` a TLD absent from the list has no suffix."""
    # Rule list written one rule per line, as splitlines() expects.
    source = """
known
"""
    psl = PublicSuffixList(source.splitlines(), accept_unknown=False)

    self.assertEqual(psl.suffix("www.example.unknowntld"), None)
def get_whois(self, name):
    """Collect the target's IP address and WHOIS registration details.

    The host is read from ``self.target``; each finding is added to
    ``self.cms_list`` as a ``"<Label>:<value>"`` string.  Nothing is
    returned.  Targets whose host is a bare IPv4 address are skipped.

    Args:
        name: Not used by this method.
    """
    # NOTE(review): the handler of the original outer ``try`` is not visible
    # in this file excerpt; a best-effort ``except Exception`` is assumed so
    # that a failed lookup does not abort the caller -- confirm.
    try:
        domain = urlparse.urlparse(self.target).netloc

        # Drop a trailing ":port" before inspecting the host itself.
        if re.search(r":\d{1,5}$", domain):
            domain = domain.rsplit(":", 1)[0]

        # A bare IPv4 address has no registrable domain to query.  The
        # pattern is anchored so that hostnames which merely contain a
        # dotted quad (e.g. "10.0.0.1.example.com") are still processed.
        if re.match(r"\d{1,3}(?:\.\d{1,3}){3}$", domain):
            return

        # Resolving the host is best-effort; a failure must not stop the
        # WHOIS lookup below.
        try:
            ip = socket.gethostbyname(domain)
            self.cms_list.add("IP:" + ip)
        except (socket.error, UnicodeError):
            pass

        # Reduce the host to its registrable domain; suffix() yields None
        # when there is none (e.g. the host is itself a public suffix).
        registered = PublicSuffixList().suffix(domain)
        if registered is None:
            return

        who = pythonwhois.get_whois(registered)

        # The registrant record may be missing; treat it as empty so the
        # registrar and name-server data are still collected.
        registrant = (who.get("contacts") or {}).get("registrant") or {}
        for label, field in (("Domain_User:", "name"),
                             ("Domain_Email:", "email"),
                             ("Domain_Phone:", "phone")):
            value = registrant.get(field)
            if value is not None:
                self.cms_list.add(label + value.encode("utf8"))

        registrar = who.get("registrar")
        if registrar:
            self.cms_list.add(
                "Domain_Registrar:" + registrar[0].encode("utf8"))

        nameservers = who.get("nameservers")
        if nameservers is not None:
            name_servers = [ns.encode("UTF8") for ns in nameservers]
            # NOTE(review): the label spelling "Domai_" is kept unchanged
            # because consumers of cms_list may match on it.
            self.cms_list.add(
                "Domai_name_servers:" + str(name_servers).encode("utf8"))
    except Exception:
        return
class Master(BaseApplication):
    """Master process: classifies queued URLs and records them per domain.

    ``init`` builds the Flask app, the SQLAlchemy handle, the gevent worker
    pool and the compiled product-URL patterns; ``start`` fills the pool
    with workers that drain the redis URL queue.

    NOTE(review): ``self.config``, ``self.redis``, ``self.scripts`` and
    ``Master.instance()`` are not defined in this class -- presumably they
    come from ``BaseApplication``; confirm.
    """

    __flask = None
    __db = None

    @property
    def flask(self):
        """The Flask app created by ``init``.

        Raises:
            ApplicationInitError: if ``init`` has not been called yet.
        """
        if not self.__flask:
            raise ApplicationInitError(
                "Cannot obtain server instance before init")
        return self.__flask

    @property
    def db(self):
        """The SQLAlchemy handle created by ``init``.

        Raises:
            ApplicationInitError: if ``init`` has not been called yet.
        """
        if self.__db is None:
            raise ApplicationInitError(
                "Cannot obtain database instance before init")
        return self.__db

    def init(self, config=None):
        """Create the Flask app, database handle, worker pool and rule set.

        Args:
            config: Optional configuration passed to ``load_config_obj``;
                when ``None`` the existing ``self.config`` is used.
        """
        if config is not None:
            self.config = load_config_obj(config)

        app = Flask('product_identifier')
        app.config.from_object(self.config)
        self.__flask = app
        self.__db = SQLAlchemy(self.__flask)
        Migrate(self.__flask, self.db)

        self.handler_pool = gevent.pool.Pool(
            self.config.MASTER_HANDLER_POOL_SIZE)

        rules_path = os.path.join(self.config.DATA_DIR, "ruleset.json")
        with open(rules_path, "r") as f:
            rule_set = json.load(f)

        # Only the patterns are needed; the rule names are ignored.
        self.product_patterns = [
            re.compile(pattern) for pattern in rule_set["rules"].values()]

        self.__psl = PublicSuffixList()

    def start(self):
        """Spawn ``MASTER_HANDLER_POOL_SIZE`` URL-processing workers."""

        def handleURL():
            """Worker loop: pop a URL, store it, and file it under its domain."""
            from product_identifier.models import URL

            db = Master.instance().db
            self.flask.logger.debug("JOB STARTED")

            while True:
                # ``except Exception`` (not a bare ``except``) so that
                # gevent's GreenletExit still terminates the worker when
                # the pool is killed.
                try:
                    # TODO: susceptible to concurrency problems
                    in_url = self.scripts.pop_zset(keys=[URLS_TO_PROCESS_SET])
                    if not in_url:
                        # no results, sleep
                        gevent.sleep(1)
                        continue

                    if self.redis.sismember(PROCESSED_URLS_SET, in_url):
                        self.flask.logger.debug(
                            "SKIPPING: {}".format(in_url))
                        continue

                    self.flask.logger.debug("PROCESSING: {}".format(in_url))
                    uri = furl(in_url)
                    domain = self.__psl.suffix(uri.host)

                    try:
                        p = URL()
                        p.domain = domain
                        p.url = in_url
                        p.is_product = self.is_product_url(in_url)
                        db.session.add(p)
                        db.session.commit()
                    except Exception:
                        db.session.rollback()
                        error = traceback.format_exc()
                        self.flask.logger.error(
                            "DB_ERROR: {}".format(error))
                        self.redis.sadd(DB_ERRORED_URL_SET, in_url)

                    self.redis.sadd(PROCESSED_URLS_SET, in_url)

                    # suffix() returns None when the host has no registrable
                    # domain; do not use None as a redis key / set member.
                    if domain is None:
                        self.flask.logger.warning(
                            "NO DOMAIN: {}".format(in_url))
                        continue

                    self.redis.rpush(domain, in_url)
                    domain_added = self.redis.sadd(DOMAINS_SET, domain)
                    if domain_added:
                        self.flask.logger.info(
                            "ADDED DOMAIN: {}".format(domain))
                except Exception:
                    error = traceback.format_exc()
                    self.flask.logger.error("ERROR: {}".format(error))

        for _ in range(self.config.MASTER_HANDLER_POOL_SIZE):
            self.handler_pool.spawn(handleURL)

    def is_product_url(self, url):
        """Return True if any compiled product pattern matches ``url``."""
        return any(pat.match(url) for pat in self.product_patterns)
class TestPSL(unittest.TestCase):
    """Tests for ``PublicSuffixList``: suffix(), publicsuffix() and helpers."""

    def setUp(self):
        self.psl = PublicSuffixList()

    def test_typesafe(self):
        """The result has the same string type as the argument."""
        self.assertEqual(type(self.psl.suffix("www.example.co.jp")),
                         type("example.co.jp"))
        self.assertEqual(type(self.psl.suffix(u("www.example.co.jp"))),
                         type(u("example.co.jp")))

        self.assertEqual(type(self.psl.publicsuffix("www.example.co.jp")),
                         type("co.jp"))
        self.assertEqual(type(self.psl.publicsuffix(u("www.example.co.jp"))),
                         type(u("co.jp")))

    def test_uppercase(self):
        """Mixed-case input yields lower-case results."""
        self.assertEqual(self.psl.suffix("wWw.eXaMpLe.cO.Jp"), "example.co.jp")
        self.assertEqual(self.psl.publicsuffix("wWw.eXaMpLe.cO.Jp"), "co.jp")

    def test_invaliddomain(self):
        """Empty labels, leading/trailing dots and "" all yield None."""
        for bad in ("www..invalid", ".example.com", "example.com.", ""):
            self.assertEqual(self.psl.suffix(bad), None)

        for bad in ("www..invalid", ".example.com", "example.com.", ""):
            self.assertEqual(self.psl.publicsuffix(bad), None)

    def test_idn(self):
        """A unicode TLD is handled like any other TLD."""
        tld = u("香港")
        self.assertEqual(self.psl.suffix(u("www.example.") + tld),
                         u("example.") + tld)
        self.assertEqual(self.psl.publicsuffix(u("www.example.") + tld), tld)

    def test_punycoded(self):
        """The ``encode_idn`` form of a unicode TLD is handled as well."""
        tld = encode_idn(u("香港"))
        self.assertEqual(self.psl.suffix(u("www.example.") + tld),
                         u("example.") + tld)
        self.assertEqual(self.psl.publicsuffix(u("www.example.") + tld), tld)

    def test_suffix_deny_public(self):
        """suffix() of a name that is itself a public suffix is None."""
        self.assertEqual(self.psl.suffix("com"), None)
        self.assertEqual(self.psl.suffix("co.jp"), None)
        self.assertEqual(self.psl.suffix("example.nagoya.jp"), None)

    def test_unknown(self):
        """By default a TLD missing from the list is treated as a suffix."""
        self.assertEqual(self.psl.suffix("www.example.unknowntld"),
                         "example.unknowntld")
        self.assertEqual(self.psl.suffix("unknowntld"), None)

        self.assertEqual(self.psl.publicsuffix("www.example.unknowntld"),
                         "unknowntld")
        self.assertEqual(self.psl.publicsuffix("unknowntld"), "unknowntld")

    def test_deny_unknown(self):
        """With ``accept_unknown=False`` an unlisted TLD has no suffix."""
        # Rule list written one rule per line, as splitlines() expects.
        source = """
known
"""
        psl = PublicSuffixList(source.splitlines(), accept_unknown=False)

        self.assertEqual(psl.suffix("www.example.unknowntld"), None)

    def test_custom_psl(self):
        """Plain, wildcard and exception rules from a hand-written list."""
        # One rule per line: splitlines() must hand the parser three
        # separate rules, not a single line containing all of them.
        source = """
invalid
*.invalid
!test.invalid
"""
        psl = PublicSuffixList(source.splitlines())

        self.assertEqual(psl.suffix("example.invalid"), None)
        self.assertEqual(psl.suffix("test.invalid"), "test.invalid")
        self.assertEqual(psl.suffix("some.test.invalid"), "test.invalid")
        self.assertEqual(psl.suffix("aaa.bbb.ccc.invalid"), "bbb.ccc.invalid")

        self.assertEqual(psl.publicsuffix("example.invalid"), "example.invalid")
        self.assertEqual(psl.publicsuffix("test.invalid"), "invalid")

    def test_publicsuffix(self):
        self.assertEqual(self.psl.publicsuffix("www.example.com"), "com")
        self.assertEqual(self.psl.publicsuffix("unknowntld"), "unknowntld")

    def test_wildcard(self):
        """Expected results for names under ``nagoya.jp``."""
        self.assertEqual(self.psl.suffix("test.example.nagoya.jp"),
                         "test.example.nagoya.jp")
        self.assertEqual(self.psl.suffix("example.nagoya.jp"), None)
        self.assertEqual(self.psl.publicsuffix("example.nagoya.jp"),
                         "example.nagoya.jp")
        self.assertEqual(self.psl.publicsuffix("test.example.nagoya.jp"),
                         "example.nagoya.jp")

    def test_checkpublicsuffix_script(self):
        """Replay every ``checkPublicSuffix(...)`` line of test_psl.txt."""
        regex = re.compile(r"^checkPublicSuffix\(('[^']+'), (null|'[^']+')\);")
        path = os.path.join(os.path.dirname(__file__), "test_psl.txt")

        with open(path, "rb") as f:
            for ln, line in enumerate(f, 1):
                text = line.decode("utf-8")
                m = regex.match(text)
                if not m:
                    # Not a checkPublicSuffix(...) line.
                    continue

                arg = m.group(1).strip("'")
                if m.group(2) == "null":
                    res = None
                else:
                    res = m.group(2).strip("'")

                self.assertEqual(
                    self.psl.suffix(arg), res,
                    "in line {0}: {1}".format(ln, text.strip()))

    def test_typeerror(self):
        """Non-string arguments raise TypeError."""
        self.assertRaises(TypeError, lambda: self.psl.suffix(None))
        self.assertRaises(TypeError, lambda: self.psl.suffix(1))
        if b("") != "":
            # python3: bytes are a distinct type and are rejected too
            self.assertRaises(TypeError,
                              lambda: self.psl.suffix(b("www.example.com")))

    def test_compatclass(self):
        from publicsuffixlist.compat import PublicSuffixList
        psl = PublicSuffixList()

        self.assertEqual(psl.get_public_suffix("test.example.com"),
                         "example.com")
        self.assertEqual(psl.get_public_suffix("com"), "")
        self.assertEqual(psl.get_public_suffix(""), "")

    def test_unsafecompatclass(self):
        from publicsuffixlist.compat import UnsafePublicSuffixList
        psl = UnsafePublicSuffixList()

        self.assertEqual(psl.get_public_suffix("test.example.com"),
                         "example.com")
        self.assertEqual(psl.get_public_suffix("com"), "com")
        self.assertEqual(psl.get_public_suffix(""), "")

    def test_toomanylabels(self):
        """A name with a million labels is still processed."""
        d = "a." * 1000000 + "example.com"

        self.assertEqual(self.psl.publicsuffix(d), "com")
        self.assertEqual(self.psl.privatesuffix(d), "example.com")

    def test_flatstring(self):
        """The rule list may be passed as a single text string."""
        psl = PublicSuffixList(u("com\nnet\n"))
        self.assertEqual(psl.publicsuffix("example.com"), "com")

    def test_flatbytestring(self):
        """The rule list may be passed as a single byte string."""
        psl = PublicSuffixList(b("com\nnet\n"))
        self.assertEqual(psl.publicsuffix("example.com"), "com")

    def test_privateparts(self):
        psl = self.psl
        self.assertEqual(psl.privateparts("aaa.www.example.com"),
                         ("aaa", "www", "example.com"))

    def test_noprivateparts(self):
        psl = self.psl
        self.assertEqual(psl.privateparts("com"), None)  # no private part

    def test_reconstructparts(self):
        """Joining privateparts() with dots gives back the input name."""
        psl = self.psl
        self.assertEqual(".".join(psl.privateparts("aaa.www.example.com")),
                         "aaa.www.example.com")

    def test_subdomain(self):
        """subdomain() keeps ``depth`` labels in front of the private suffix."""
        psl = self.psl
        self.assertEqual(psl.subdomain("aaa.www.example.com", depth=0),
                         "example.com")
        self.assertEqual(psl.subdomain("aaa.www.example.com", depth=1),
                         "www.example.com")
        self.assertEqual(psl.subdomain("aaa.www.example.com", depth=2),
                         "aaa.www.example.com")
        # no sufficient depth
        self.assertEqual(psl.subdomain("aaa.www.example.com", depth=3), None)
class Master(BaseApplication):
    """Master process: classifies queued URLs and records them per domain.

    ``init`` builds the Flask app, the SQLAlchemy handle, the gevent worker
    pool and the compiled product-URL patterns; ``start`` fills the pool
    with workers that drain the redis URL queue.

    NOTE(review): ``self.config``, ``self.redis``, ``self.scripts`` and
    ``Master.instance()`` are not defined in this class -- presumably they
    come from ``BaseApplication``; confirm.
    """

    __flask = None
    __db = None

    @property
    def flask(self):
        """The Flask app created by ``init``.

        Raises:
            ApplicationInitError: if ``init`` has not been called yet.
        """
        if not self.__flask:
            raise ApplicationInitError("Cannot obtain server instance before init")
        return self.__flask

    @property
    def db(self):
        """The SQLAlchemy handle created by ``init``.

        Raises:
            ApplicationInitError: if ``init`` has not been called yet.
        """
        if self.__db is None:
            raise ApplicationInitError("Cannot obtain database instance before init")
        return self.__db

    def init(self, config=None):
        """Create the Flask app, database handle, worker pool and rule set.

        Args:
            config: Optional configuration passed to ``load_config_obj``;
                when ``None`` the existing ``self.config`` is used.
        """
        if config is not None:
            self.config = load_config_obj(config)

        app = Flask('product_identifier')
        app.config.from_object(self.config)
        self.__flask = app
        self.__db = SQLAlchemy(self.__flask)
        Migrate(self.__flask, self.db)

        self.handler_pool = gevent.pool.Pool(self.config.MASTER_HANDLER_POOL_SIZE)

        rules_path = os.path.join(self.config.DATA_DIR, "ruleset.json")
        with open(rules_path, "r") as f:
            rule_set = json.load(f)

        # Only the patterns are needed; the rule names are ignored.
        self.product_patterns = [
            re.compile(pattern) for pattern in rule_set["rules"].values()]

        self.__psl = PublicSuffixList()

    def start(self):
        """Spawn ``MASTER_HANDLER_POOL_SIZE`` URL-processing workers."""

        def handleURL():
            """Worker loop: pop a URL, store it, and file it under its domain."""
            from product_identifier.models import URL

            db = Master.instance().db
            self.flask.logger.debug("JOB STARTED")

            while True:
                # ``except Exception`` (not a bare ``except``) so that
                # gevent's GreenletExit still terminates the worker when
                # the pool is killed.
                try:
                    # TODO: susceptible to concurrency problems
                    in_url = self.scripts.pop_zset(keys=[URLS_TO_PROCESS_SET])
                    if not in_url:
                        # no results, sleep
                        gevent.sleep(1)
                        continue

                    if self.redis.sismember(PROCESSED_URLS_SET, in_url):
                        self.flask.logger.debug("SKIPPING: {}".format(in_url))
                        continue

                    self.flask.logger.debug("PROCESSING: {}".format(in_url))
                    uri = furl(in_url)
                    domain = self.__psl.suffix(uri.host)

                    try:
                        p = URL()
                        p.domain = domain
                        p.url = in_url
                        p.is_product = self.is_product_url(in_url)
                        db.session.add(p)
                        db.session.commit()
                    except Exception:
                        db.session.rollback()
                        error = traceback.format_exc()
                        self.flask.logger.error("DB_ERROR: {}".format(error))
                        self.redis.sadd(DB_ERRORED_URL_SET, in_url)

                    self.redis.sadd(PROCESSED_URLS_SET, in_url)

                    # suffix() returns None when the host has no registrable
                    # domain; do not use None as a redis key / set member.
                    if domain is None:
                        self.flask.logger.warning("NO DOMAIN: {}".format(in_url))
                        continue

                    self.redis.rpush(domain, in_url)
                    domain_added = self.redis.sadd(DOMAINS_SET, domain)
                    if domain_added:
                        self.flask.logger.info("ADDED DOMAIN: {}".format(domain))
                except Exception:
                    error = traceback.format_exc()
                    self.flask.logger.error("ERROR: {}".format(error))

        for _ in range(self.config.MASTER_HANDLER_POOL_SIZE):
            self.handler_pool.spawn(handleURL)

    def is_product_url(self, url):
        """Return True if any compiled product pattern matches ``url``."""
        return any(pat.match(url) for pat in self.product_patterns)
class TestPSL(unittest.TestCase):
    """Tests for ``PublicSuffixList``: suffix(), publicsuffix() and helpers."""

    def setUp(self):
        self.psl = PublicSuffixList()

    def test_typesafe(self):
        """The result has the same string type as the argument."""
        self.assertEqual(type(self.psl.suffix("www.example.co.jp")),
                         type("example.co.jp"))
        self.assertEqual(type(self.psl.suffix(u("www.example.co.jp"))),
                         type(u("example.co.jp")))

        self.assertEqual(type(self.psl.publicsuffix("www.example.co.jp")),
                         type("co.jp"))
        self.assertEqual(type(self.psl.publicsuffix(u("www.example.co.jp"))),
                         type(u("co.jp")))

    def test_uppercase(self):
        """Mixed-case input yields lower-case results."""
        self.assertEqual(self.psl.suffix("wWw.eXaMpLe.cO.Jp"), "example.co.jp")
        self.assertEqual(self.psl.publicsuffix("wWw.eXaMpLe.cO.Jp"), "co.jp")

    def test_invaliddomain(self):
        """Empty labels, leading/trailing dots and "" all yield None."""
        for bad in ("www..invalid", ".example.com", "example.com.", ""):
            self.assertEqual(self.psl.suffix(bad), None)

        for bad in ("www..invalid", ".example.com", "example.com.", ""):
            self.assertEqual(self.psl.publicsuffix(bad), None)

    def test_idn(self):
        """A unicode TLD is handled like any other TLD."""
        tld = u("香港")
        self.assertEqual(self.psl.suffix(u("www.example.") + tld),
                         u("example.") + tld)
        self.assertEqual(self.psl.publicsuffix(u("www.example.") + tld), tld)

    def test_punycoded(self):
        """The ``encode_idn`` form of a unicode TLD is handled as well."""
        tld = encode_idn(u("香港"))
        self.assertEqual(self.psl.suffix(u("www.example.") + tld),
                         u("example.") + tld)
        self.assertEqual(self.psl.publicsuffix(u("www.example.") + tld), tld)

    def test_suffix_deny_public(self):
        """suffix() of a name that is itself a public suffix is None."""
        self.assertEqual(self.psl.suffix("com"), None)
        self.assertEqual(self.psl.suffix("co.jp"), None)
        self.assertEqual(self.psl.suffix("example.nagoya.jp"), None)

    def test_unknown(self):
        """By default a TLD missing from the list is treated as a suffix."""
        self.assertEqual(self.psl.suffix("www.example.unknowntld"),
                         "example.unknowntld")
        self.assertEqual(self.psl.suffix("unknowntld"), None)

        self.assertEqual(self.psl.publicsuffix("www.example.unknowntld"),
                         "unknowntld")
        self.assertEqual(self.psl.publicsuffix("unknowntld"), "unknowntld")

    def test_deny_unknown(self):
        """With ``accept_unknown=False`` an unlisted TLD has no suffix."""
        # Rule list written one rule per line, as splitlines() expects.
        source = """
known
"""
        psl = PublicSuffixList(source.splitlines(), accept_unknown=False)

        self.assertEqual(psl.suffix("www.example.unknowntld"), None)

    def test_custom_psl(self):
        """Plain, wildcard and exception rules from a hand-written list."""
        # One rule per line: splitlines() must hand the parser three
        # separate rules, not a single line containing all of them.
        source = """
invalid
*.invalid
!test.invalid
"""
        psl = PublicSuffixList(source.splitlines())

        self.assertEqual(psl.suffix("example.invalid"), None)
        self.assertEqual(psl.suffix("test.invalid"), "test.invalid")
        self.assertEqual(psl.suffix("some.test.invalid"), "test.invalid")
        self.assertEqual(psl.suffix("aaa.bbb.ccc.invalid"), "bbb.ccc.invalid")

        self.assertEqual(psl.publicsuffix("example.invalid"), "example.invalid")
        self.assertEqual(psl.publicsuffix("test.invalid"), "invalid")

    def test_publicsuffix(self):
        self.assertEqual(self.psl.publicsuffix("www.example.com"), "com")
        self.assertEqual(self.psl.publicsuffix("unknowntld"), "unknowntld")

    def test_wildcard(self):
        """Expected results for names under ``nagoya.jp``."""
        self.assertEqual(self.psl.suffix("test.example.nagoya.jp"),
                         "test.example.nagoya.jp")
        self.assertEqual(self.psl.suffix("example.nagoya.jp"), None)
        self.assertEqual(self.psl.publicsuffix("example.nagoya.jp"),
                         "example.nagoya.jp")
        self.assertEqual(self.psl.publicsuffix("test.example.nagoya.jp"),
                         "example.nagoya.jp")

    def test_checkpublicsuffix_script(self):
        """Replay every ``checkPublicSuffix(...)`` line of test_psl.txt."""
        regex = re.compile(r"^checkPublicSuffix\(('[^']+'), (null|'[^']+')\);")
        path = os.path.join(os.path.dirname(__file__), "test_psl.txt")

        with open(path, "rb") as f:
            for ln, line in enumerate(f, 1):
                text = line.decode("utf-8")
                m = regex.match(text)
                if not m:
                    # Not a checkPublicSuffix(...) line.
                    continue

                arg = m.group(1).strip("'")
                if m.group(2) == "null":
                    res = None
                else:
                    res = m.group(2).strip("'")

                self.assertEqual(
                    self.psl.suffix(arg), res,
                    "in line {0}: {1}".format(ln, text.strip()))

    def test_typeerror(self):
        """Non-string arguments raise TypeError."""
        self.assertRaises(TypeError, lambda: self.psl.suffix(None))
        self.assertRaises(TypeError, lambda: self.psl.suffix(1))
        if b("") != "":
            # python3: bytes are a distinct type and are rejected too
            self.assertRaises(TypeError,
                              lambda: self.psl.suffix(b("www.example.com")))

    def test_compatclass(self):
        from publicsuffixlist.compat import PublicSuffixList
        psl = PublicSuffixList()

        self.assertEqual(psl.get_public_suffix("test.example.com"),
                         "example.com")
        self.assertEqual(psl.get_public_suffix("com"), "")
        self.assertEqual(psl.get_public_suffix(""), "")

    def test_unsafecompatclass(self):
        from publicsuffixlist.compat import UnsafePublicSuffixList
        psl = UnsafePublicSuffixList()

        self.assertEqual(psl.get_public_suffix("test.example.com"),
                         "example.com")
        self.assertEqual(psl.get_public_suffix("com"), "com")
        self.assertEqual(psl.get_public_suffix(""), "")

    def test_toomanylabels(self):
        """A name with a million labels is still processed."""
        d = "a." * 1000000 + "example.com"

        self.assertEqual(self.psl.publicsuffix(d), "com")
        self.assertEqual(self.psl.privatesuffix(d), "example.com")

    def test_flatstring(self):
        """The rule list may be passed as a single text string."""
        psl = PublicSuffixList(u("com\nnet\n"))
        self.assertEqual(psl.publicsuffix("example.com"), "com")

    def test_flatbytestring(self):
        """The rule list may be passed as a single byte string."""
        psl = PublicSuffixList(b("com\nnet\n"))
        self.assertEqual(psl.publicsuffix("example.com"), "com")