def __init__(self, tid=0):
    """Initialise the thread object and attach its helper instances.

    tid -- identifier stored on the instance as ``_tid`` (defaults to 0).

    After the ``threading.Thread`` initialiser has run, the instance gets:
      _tid  -- the value passed as ``tid``
      _tld  -- a freshly constructed ``TLD`` object
      _ssdb -- a ``MySSDB`` object built from SSDBHOST and SSDBPORT
    """
    # Run the Thread initialiser first so the object is a valid thread
    # before any instance attributes are set.
    threading.Thread.__init__(self)

    self._tid = tid
    self._tld = TLD()
    self._ssdb = MySSDB(SSDBHOST, SSDBPORT)
class DomainProcessor(threading.Thread): _tid = 0 _ssdb = None ThreadStartTime = time.time() ThreadCanExit = False ThreadTimeOut = 60 * 2 DomainsProcessed = 0 DomainsStored = 0 _UserAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.110 Safari/537.36' _tld = None # Top Level Domain parser form Zeander _BlacklistFileModifyTime = 0 _BlacklistFileName = 'blacklist.txt' TLDUserPartRepeatMax = 200 * 2 TLDUserPartCacheMaxLen = 2000 _TLDUserPartCache = {} InternalBlackListMaxLen = 2000 ExternalBlacklistMaxLen = 1000 _InternalBlacklist = [] _ExternalBlacklist = [] def __init__(self, tid = 0): threading.Thread.__init__(self) self._tid = tid self._tld = TLD() self._ssdb = MySSDB(SSDBHOST, SSDBPORT) def run(self): while True: domain = HTTPSQSQueue.get(DOMAINQUEUE02).lower() if '' == domain and None == time.sleep(1): continue if self.isDomainInBlacklist(domain): C.Info('Domain in black list: %s' % domain, C.DEBUG) continue self.f**k(domain) self.refreshBlacklist(domain) self.monitor() def f**k(self, domain): global DomainsProcessed, DomainsStored DomainsProcessed = DomainsProcessed + 1 if not self._ssdb.isDomainInDB('hdm', domain): self._ssdb.setHItem('hdm', MD5(domain), domain) HTTPSQSQueue.put(DOMAINQUEUE01, domain) DomainsStored = DomainsStored + 1 def monitor(self): if os.path.exists('debug.dump'):self.dump() def refreshBlacklist(self, domain): self.refreshInternalBlacklist(domain) self.refreshExternalBlacklist() def isDomainInBlacklist(self, domain): return self.isDomainInExternalBlacklist(domain) or self.isDomainInInternalBlacklist(domain) def isDomainInInternalBlacklist(self, domain): domain_user_part = self._tld.getTLD(domain)[0] if not '' == domain_user_part: for black in self._InternalBlacklist: if domain_user_part == black: return True return False def isDomainInExternalBlacklist(self, domain): for black in self._ExternalBlacklist: if domain.endswith(black): return True return False def refreshInternalBlacklist(self, 
domain): ''' www.chinaz.com.cn -> chinaz -> CacheDictionary ''' black = self._tld.getTLD(domain)[0] if '' == black : return if self._TLDUserPartCache.has_key(black): self._TLDUserPartCache[black] = self._TLDUserPartCache[black] + 1 if self._TLDUserPartCache[black] > self.TLDUserPartRepeatMax/2 and not black in self._InternalBlacklist: self._InternalBlacklist.append(black) else: self._TLDUserPartCache[black] = 1 if len(self._TLDUserPartCache) > self.TLDUserPartCacheMaxLen: tempList = sorted(self._TLDUserPartCache, key=self._TLDUserPartCache.get) for i in range(0, len(tempList)/2): self._TLDUserPartCache.pop(tempList[i]) tempList = None if len(self._InternalBlacklist) > self.InternalBlackListMaxLen: self.saveInternalBlacklist() def refreshExternalBlacklist(self): if (random.randrange(1,11) % 3 == 0) :return if not os.path.exists(self._BlacklistFileName):return BlacklistFileModifyTime = os.stat(self._BlacklistFileName).st_mtime if self._BlacklistFileModifyTime == BlacklistFileModifyTime: return try: f = open(self._BlacklistFileName) lines = f.readlines() blacklists = [] for line in lines: domain = line.strip().replace('\n', '').replace('\r', '') if len(domain) > 0:blacklists.append(domain) self._ExternalBlacklist = blacklists[:] blacklists = None self._BlacklistFileModifyTime = BlacklistFileModifyTime C.Info('Get %2d domains in blacklist' % len(self._ExternalBlacklist), C.INFO) except Exception, e: C.Info(str(e), C.ERROR) finally: