Example #1
0
 def __init__(self, tid = 0):
     '''
     Initialise the thread object and attach its per-instance helpers.

     tid -- value stored as self._tid (presumably the worker's number;
            defaults to 0).
     '''
     threading.Thread.__init__(self)
     self._tid = tid

     # Helpers are built in this order: TLD first, then the SSDB client.
     tld_helper = TLD()
     self._tld = tld_helper
     ssdb_client = MySSDB(SSDBHOST, SSDBPORT)
     self._ssdb = ssdb_client
Example #2
0
class DomainProcessor(threading.Thread):
    '''
    Worker thread that reads domain names from DOMAINQUEUE02, skips the ones
    matched by its external/internal blacklists, stores unseen domains in
    SSDB and forwards them to DOMAINQUEUE01.

    The names below are class attributes.  The scalar ones act as defaults;
    the mutable containers are shared by every instance unless an instance
    rebinds them.
    '''
    # Defaults only: __init__ overwrites both on every instance.
    _tid                     = 0
    _ssdb                    = None

    # Evaluated once, when the class body is executed -- not per thread.
    ThreadStartTime          = time.time()
    # Neither of the next two is referenced in the visible part of the file.
    ThreadCanExit            = False
    ThreadTimeOut            = 60 * 2   # presumably seconds -- TODO confirm
    
    # NOTE(review): the worker method declares `global DomainsProcessed,
    # DomainsStored`, so it updates module-level names, not these attributes.
    DomainsProcessed         = 0
    DomainsStored            = 0
    
    # Not referenced in the visible part of the file.
    _UserAgent               = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.110 Safari/537.36'

    _tld                     = None   # Top Level Domain parser from Zeander; set in __init__
    # mtime of the blacklist file at the last successful load; used to skip
    # re-reading an unchanged file.
    _BlacklistFileModifyTime = 0
    # Relative path, so it is resolved against the current working directory.
    _BlacklistFileName       = 'blacklist.txt'

    # A user part is added to the internal blacklist once its counter exceeds
    # HALF of this value (see refreshInternalBlacklist).
    TLDUserPartRepeatMax     = 200 * 2

    # When the counter cache grows past this many keys, the half with the
    # lowest counts is evicted.
    TLDUserPartCacheMaxLen   = 2000
    # user part -> times seen.  Mutated in place, hence shared class-wide.
    _TLDUserPartCache        = {}

    # Exceeding this length triggers saveInternalBlacklist().
    InternalBlackListMaxLen  = 2000
    # Not referenced in the visible part of the file.
    ExternalBlacklistMaxLen  = 1000
    # Appended to in place, hence shared class-wide.
    _InternalBlacklist       = []
    # Rebound on the instance by refreshExternalBlacklist after each reload,
    # so it becomes per-instance after the first successful load.
    _ExternalBlacklist       = []

    def __init__(self, tid = 0):
        '''
        Initialise the thread object and attach its per-instance helpers.

        tid -- value stored as self._tid (presumably the worker's number;
               defaults to 0).
        '''
        threading.Thread.__init__(self)
        self._tid = tid

        # Helpers are built in this order: TLD parser, then the SSDB client.
        tld_parser = TLD()
        self._tld = tld_parser
        ssdb_client = MySSDB(SSDBHOST, SSDBPORT)
        self._ssdb = ssdb_client

    def run(self):
        '''
        Thread main loop: consume domains from DOMAINQUEUE02 forever.

        Every non-empty domain is lower-cased and dropped if a blacklist
        matches it; otherwise it is handed to processDomain(), after which
        the blacklists are refreshed and the debug-dump trigger is checked.
        This method never returns.
        '''
        while True:
            fetched = HTTPSQSQueue.get(DOMAINQUEUE02)
            # A falsy result is treated like an empty queue instead of
            # failing on .lower().
            # NOTE(review): confirm what get() returns when nothing is queued.
            domain = fetched.lower() if fetched else ''
            if '' == domain:
                # Nothing to do: wait a second before polling again.
                time.sleep(1)
                continue
            if self.isDomainInBlacklist(domain):
                C.Info('Domain in black list: %s' % domain, C.DEBUG)
                continue
            self.processDomain(domain)
            self.refreshBlacklist(domain)
            self.monitor()


    def processDomain(self, domain):
        '''
        Count one processed domain and store it if SSDB does not know it yet.

        A domain that is not yet in the 'hdm' hash is written there under
        MD5(domain) and pushed onto DOMAINQUEUE01.

        domain -- lower-cased domain name taken from the queue.

        The counters are module-level globals (not the class attributes of
        the same name) and are incremented without any locking.
        '''
        global DomainsProcessed, DomainsStored
        DomainsProcessed += 1
        if self._ssdb.isDomainInDB('hdm', domain):
            return
        self._ssdb.setHItem('hdm', MD5(domain), domain)
        HTTPSQSQueue.put(DOMAINQUEUE01, domain)
        DomainsStored += 1

    def monitor(self):
        if os.path.exists('debug.dump'):self.dump()
    def refreshBlacklist(self, domain):
        '''
        Update both blacklists after a domain has been handled.

        domain -- the domain just processed; only the internal refresh uses it.
        '''
        # Internal list first (fed by the domain), then the file-based one.
        self.refreshInternalBlacklist(domain)
        self.refreshExternalBlacklist()
    def isDomainInBlacklist(self, domain):
        return self.isDomainInExternalBlacklist(domain) or self.isDomainInInternalBlacklist(domain)
    def isDomainInInternalBlacklist(self, domain):
        domain_user_part = self._tld.getTLD(domain)[0]
        if not '' == domain_user_part:
            for black in self._InternalBlacklist:
                if domain_user_part == black: return True
        return False
    def isDomainInExternalBlacklist(self, domain):
        for black in self._ExternalBlacklist:
            if domain.endswith(black): return True
        return False

    def refreshInternalBlacklist(self, domain):
        '''
        www.chinaz.com.cn -> chinaz -> CacheDictionary
        '''
        black = self._tld.getTLD(domain)[0]
        if '' == black : return
        if self._TLDUserPartCache.has_key(black):
            self._TLDUserPartCache[black] = self._TLDUserPartCache[black] + 1
            if self._TLDUserPartCache[black] > self.TLDUserPartRepeatMax/2 and not black in self._InternalBlacklist:
                self._InternalBlacklist.append(black)
        else:
            self._TLDUserPartCache[black] = 1

        if len(self._TLDUserPartCache) > self.TLDUserPartCacheMaxLen:
            tempList = sorted(self._TLDUserPartCache, key=self._TLDUserPartCache.get)
            for i in range(0, len(tempList)/2):
                self._TLDUserPartCache.pop(tempList[i])
            tempList = None

        if len(self._InternalBlacklist) > self.InternalBlackListMaxLen:
            self.saveInternalBlacklist()
    
    def refreshExternalBlacklist(self):
        if     (random.randrange(1,11) % 3 == 0)      :return
        if not os.path.exists(self._BlacklistFileName):return
        BlacklistFileModifyTime = os.stat(self._BlacklistFileName).st_mtime
        if self._BlacklistFileModifyTime == BlacklistFileModifyTime:
            return
        try:
            f          = open(self._BlacklistFileName)
            lines      = f.readlines()
            blacklists = []
            for line in lines:
                domain = line.strip().replace('\n', '').replace('\r', '')
                if len(domain) > 0:blacklists.append(domain)
            self._ExternalBlacklist = blacklists[:]
            blacklists = None
            self._BlacklistFileModifyTime = BlacklistFileModifyTime
            C.Info('Get %2d domains in blacklist' % len(self._ExternalBlacklist), C.INFO)
        except Exception, e:
            C.Info(str(e), C.ERROR)
        finally: