def f**k(self, domain): global DomainsProcessed, DomainsStored DomainsProcessed = DomainsProcessed + 1 if not self._ssdb.isDomainInDB('hdm', domain): self._ssdb.setHItem('hdm', MD5(domain), domain) HTTPSQSQueue.put(DOMAINQUEUE01, domain) DomainsStored = DomainsStored + 1
def checkDomainsQueue(self):
    """Keep both HTTPSQS queues below their ~10k soft capacity."""
    self._QueueUnRead01 = self.getQueueUnRead(DOMAINQUEUE01)
    self._QueueUnRead02 = self.getQueueUnRead(DOMAINQUEUE02)
    if self._QueueUnRead01 > 10000:
        # Queue 01 overflowing: spill its backlog to disk cache files.
        C.Info('HTTPSQSQueue %s will be full, waiting for cache!!!!!!!!!!!!!' % DOMAINQUEUE01, C.ALERT)
        self.cacheHTTPSQSQueue(DOMAINQUEUE01)
        C.Info('HTTPSQSQueue cached', C.ALERT)
    if self._QueueUnRead02 > 10000:
        # Queue 02 overflowing: salvage 100 entries into queue 01, drop the rest.
        C.Info('HTTPSQSQueue %s will be full, waiting for reset!!!!!!!!!!!!!' % DOMAINQUEUE02, C.ALERT)
        for _ in range(100):
            HTTPSQSQueue.put(DOMAINQUEUE01, HTTPSQSQueue.get(DOMAINQUEUE02))
        HTTPSQSQueue.reset(DOMAINQUEUE02)
def fuckDomain(self, originalDomain):
    """Crawl http://originalDomain and enqueue each new domain found on it.

    A domain is enqueued to DOMAINQUEUE02 at most once per page, and never
    when it is empty or equal to the page's own domain.

    Fixes vs original: removed the dead trailing ``newDomains = []``
    (rebinding a local just before returning has no effect) and replaced
    the O(n) list membership scan with an O(1) set lookup.
    """
    time1 = time.time()
    newDomains = []
    seen = set()  # O(1) dedup instead of scanning the list each time
    hc = self.getHTMLContentFromUrl('http://' + originalDomain)
    urls = self.parseUrlsFromHTMLContent(hc)
    for url in urls:
        domain = self.parseDomainFromUrl(url)
        if domain not in seen and domain != originalDomain and domain != '':
            HTTPSQSQueue.put(DOMAINQUEUE02, domain)
            newDomains.append(domain)
            seen.add(domain)
    C.Info('(%2d) get %3d new domains from %s in %.fs' % (self._tid, len(newDomains), originalDomain, time.time()-time1), C.DEBUG)
def feed(self, qName, cFile): ''' Feed HTTPSQSQueue with domains from cached file ''' if not os.path.exists(cFile): print 'File does not exists!' return f = open(cFile) lines = f.readlines() count = 0 for line in lines: HTTPSQSQueue.put(qName,line.strip()) count = count + 1 f.close() print HTTPSQSQueue.status(qName)
def getQueueUnRead(self, qName): qStatus = HTTPSQSQueue.status(qName).replace('\n', '') UnRead = 0 try: UnRead = int(json.loads(qStatus).pop('unread')) except Exception, e: C.Info(str(e), C.ERROR)
def run(self):
    """Worker loop: pull domains from queue 02, filter, store, and monitor."""
    while True:
        domain = HTTPSQSQueue.get(DOMAINQUEUE02).lower()
        if domain == '':
            # Queue drained - back off briefly before polling again.
            # (Original expressed this as `'' == domain and None == time.sleep(1)`.)
            time.sleep(1)
            continue
        if self.isDomainInBlacklist(domain):
            C.Info('Domain in black list: %s' % domain, C.DEBUG)
            continue
        self.f**k(domain)
        self.refreshBlacklist(domain)
        self.monitor()
def f**k(self): while True: originalDomain = HTTPSQSQueue.get(DOMAINQUEUE01) if originalDomain == '': time.sleep(5) continue self.fuckDomain(originalDomain) self.ThreadRefreshTime = time.time() self.TotalProcessed = self.TotalProcessed + 1 info = '*****(%2d) processed %d domains in %.fs' % (self._tid, self.TotalProcessed, self.ThreadRefreshTime-self.ThreadStartTime) C.Info(info, C.DEBUG) if self.ThreadCanExit == True: break
def cacheHTTPSQSQueue(self,qName): while self.getQueueUnRead(qName) > 1000: domains = [] for i in range(0, 100): domains.append(HTTPSQSQueue.get(DOMAINQUEUE01)) try: cacheFileName = qName + '-' + time.strftime('%Y%m%d%H%M%S') + '.qc' f = open(cacheFileName, 'a+') try: f.write('\n'.join(domains)) except: f.close() except: pass
class Monitor(threading.Thread):
    """Background thread that logs aggregate crawl throughput every 15s."""
    # Wall-clock start used as the baseline for the speed calculation.
    ThreadStartTime = time.time()
    TotalProcessed = 0

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        while True:
            self.ShowStatus()
            time.sleep(15)

    def ShowStatus(self):
        # Sum the per-spider counters; spiders update them concurrently but
        # approximate reads are fine for a progress log.
        self.TotalProcessed = 0
        for spider in DomainSpidders:
            self.TotalProcessed = self.TotalProcessed + spider.TotalProcessed
        TimeUsed = time.time() - self.ThreadStartTime
        # Fix: 'Totoal' typo in the log message corrected to 'Total'.
        info = 'Total Domains:%d, Time used:%.fm, Speed:%.f/m' % (self.TotalProcessed, TimeUsed/60, float(self.TotalProcessed * 60/TimeUsed))
        C.Info(info, C.INFO)


if __name__ == '__main__':
    reload(sys)
    sys.setdefaultencoding("UTF-8")
    # argv[1] = number of spider threads (required); argv[2] = optional seed domain.
    if len(sys.argv) < 2:
        sys.exit()
    if len(sys.argv) > 2:
        HTTPSQSQueue.put(DOMAINQUEUE01, sys.argv[2])
    for i in range(0, int(sys.argv[1])):
        DomainSpidders.append(DomainSpidder(i))
    for digger in DomainSpidders:
        digger.start()
    Monitor().start()
def reset(self, qName): ''' reset a HTTPSQSQueue ''' HTTPSQSQueue.reset(qName) print HTTPSQSQueue.status(qName)