Esempio n. 1
0
 def f**k(self, domain):
     global DomainsProcessed, DomainsStored
     DomainsProcessed  = DomainsProcessed + 1
     if not self._ssdb.isDomainInDB('hdm', domain):
         self._ssdb.setHItem('hdm', MD5(domain), domain)
         HTTPSQSQueue.put(DOMAINQUEUE01, domain)
         DomainsStored = DomainsStored    + 1
Esempio n. 2
0
 def checkDomainsQueue(self):
     '''
     Relieve DOMAINQUEUE01 / DOMAINQUEUE02 when their unread count exceeds 10000.

     The unread counts are stored on self._QueueUnRead01 / self._QueueUnRead02.
     DOMAINQUEUE01 is relieved through cacheHTTPSQSQueue(); for DOMAINQUEUE02,
     100 items are moved over to DOMAINQUEUE01 and the queue is then reset.
     '''
     fullMark = 10000
     self._QueueUnRead01 = self.getQueueUnRead(DOMAINQUEUE01)
     self._QueueUnRead02 = self.getQueueUnRead(DOMAINQUEUE02)

     if self._QueueUnRead01 > fullMark:
         C.Info('HTTPSQSQueue %s will be full, waiting for cache!!!!!!!!!!!!!' % DOMAINQUEUE01, C.ALERT)
         self.cacheHTTPSQSQueue(DOMAINQUEUE01)
         C.Info('HTTPSQSQueue cached', C.ALERT)

     if self._QueueUnRead02 > fullMark:
         C.Info('HTTPSQSQueue %s will be full, waiting for reset!!!!!!!!!!!!!' % DOMAINQUEUE02, C.ALERT)
         # Carry 100 items over to the first queue before the second one is reset
         for moved in range(100):
             item = HTTPSQSQueue.get(DOMAINQUEUE02)
             HTTPSQSQueue.put(DOMAINQUEUE01, item)
         HTTPSQSQueue.reset(DOMAINQUEUE02)
Esempio n. 3
0
 def fuckDomain(self, originalDomain):
     '''
     Fetch one domain's front page and queue every new domain it links to.

     The page at 'http://' + originalDomain is downloaded, its URLs are
     extracted, and each distinct, non-empty domain other than
     originalDomain is put on DOMAINQUEUE02. A debug line with the number
     of new domains and the elapsed time is logged at the end.

     originalDomain: host name to crawl, without the scheme.
     '''
     startedAt = time.time()
     # Only membership tests and the final count are needed, so a set
     # replaces the list and avoids a linear scan for every URL.
     newDomains = set()
     hc = self.getHTMLContentFromUrl('http://' + originalDomain)
     for url in self.parseUrlsFromHTMLContent(hc):
         domain = self.parseDomainFromUrl(url)
         if domain == '' or domain == originalDomain or domain in newDomains:
             continue
         HTTPSQSQueue.put(DOMAINQUEUE02, domain)
         newDomains.add(domain)
     C.Info('(%2d) get %3d new domains from %s in %.fs' % (self._tid, len(newDomains), originalDomain, time.time()-startedAt), C.DEBUG)
Esempio n. 4
0
 def feed(self, qName, cFile):
     '''
     Feed HTTPSQSQueue with domains from cached file

     qName: name of the queue to fill.
     cFile: path of a text file holding one domain per line.

     If cFile does not exist a message is printed and nothing is queued.
     Otherwise every non-blank line (stripped) is put on qName and the
     queue status is printed afterwards.
     '''
     # Single-argument print in parentheses produces the same output on
     # Python 2 and Python 3.
     if not os.path.exists(cFile):
         print('File does not exists!')
         return
     # 'with' closes the file even when a put() fails part-way through.
     with open(cFile) as f:
         # Iterate lazily instead of loading the whole file via readlines().
         for line in f:
             domain = line.strip()
             # Blank lines carry no domain, so they are not enqueued.
             if domain:
                 HTTPSQSQueue.put(qName, domain)
     print(HTTPSQSQueue.status(qName))
Esempio n. 5
0
 def getQueueUnRead(self, qName):
     '''
     Return the 'unread' count reported by HTTPSQSQueue.status() for qName.

     The status text (newlines removed) is parsed as JSON and its 'unread'
     entry converted to int. If parsing or conversion fails, the error is
     logged through C.Info and 0 is returned.

     qName: name of the queue to inspect.
     '''
     qStatus = HTTPSQSQueue.status(qName).replace('\n', '')
     UnRead  = 0
     try:
         UnRead = int(json.loads(qStatus)['unread'])
     except Exception as e:
         # Best effort: an unparsable status is reported and counted as 0.
         C.Info(str(e), C.ERROR)
     # Callers compare the result with a threshold, so the count has to be
     # handed back (the value was previously computed but never returned).
     return UnRead
Esempio n. 6
0
 def run(self):
     '''
     Endless worker loop over DOMAINQUEUE02.

     Each item is lower-cased. An empty item makes the loop sleep one second
     and poll again. Blacklisted domains are only logged; every other domain
     is processed, passed to refreshBlacklist() and followed by monitor().
     '''
     while True:
         domain = HTTPSQSQueue.get(DOMAINQUEUE02).lower()
         if domain == '':
             # Nothing to work on: wait a second before asking again
             time.sleep(1)
             continue
         if self.isDomainInBlacklist(domain):
             C.Info('Domain in black list: %s' % domain, C.DEBUG)
         else:
             self.f**k(domain)
             self.refreshBlacklist(domain)
             self.monitor()
Esempio n. 7
0
 def f**k(self):
     while True:
         originalDomain = HTTPSQSQueue.get(DOMAINQUEUE01)
         if originalDomain == '':
             time.sleep(5)
             continue
         self.fuckDomain(originalDomain)
         self.ThreadRefreshTime = time.time()
         self.TotalProcessed  = self.TotalProcessed + 1
         info = '*****(%2d) processed %d domains in %.fs' % (self._tid, self.TotalProcessed, self.ThreadRefreshTime-self.ThreadStartTime)
         C.Info(info, C.DEBUG)
         if self.ThreadCanExit == True: break
Esempio n. 8
0
 def cacheHTTPSQSQueue(self,qName):
     '''
     Move items from qName into cache files until at most 1000 are unread.

     While getQueueUnRead(qName) is above 1000, items are taken off the
     queue 100 at a time and appended, one per line, to a file named
     '<qName>-<YYYYmmddHHMMSS>.qc' (relative path).

     qName: name of the queue to drain.
     '''
     while self.getQueueUnRead(qName) > 1000:
         # Read from the queue that was asked for; this used to be
         # hard-coded to DOMAINQUEUE01 regardless of qName.
         batch = [HTTPSQSQueue.get(qName) for _ in range(100)]
         # Drop empty results so no blank lines end up in the cache file.
         domains = [d for d in batch if d]
         if domains:
             cacheFileName = qName + '-' + time.strftime('%Y%m%d%H%M%S') + '.qc'
             try:
                 # 'with' closes the file on success as well as on failure.
                 with open(cacheFileName, 'a+') as f:
                     # Trailing newline keeps a later append to the same
                     # file from gluing two domains onto one line.
                     f.write('\n'.join(domains) + '\n')
             except (IOError, OSError) as e:
                 # The batch is already off the queue, so a failed write
                 # loses it: report the error instead of hiding it.
                 C.Info(str(e), C.ERROR)
Esempio n. 9
0
class Monitor(threading.Thread):
    '''
    Thread that logs the combined progress of all DomainSpidders every
    15 seconds.
    '''
    # Reference time for the statistics; evaluated once, when the class
    # body is executed.
    ThreadStartTime  = time.time()
    # Sum of TotalProcessed over DomainSpidders as of the last ShowStatus().
    TotalProcessed   = 0

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        '''Log the status, sleep 15 seconds, repeat forever.'''
        while True:
            self.ShowStatus()
            time.sleep(15)

    def ShowStatus(self):
        '''
        Add up TotalProcessed of every spider and log the total, the
        minutes elapsed since ThreadStartTime and the per-minute rate.
        '''
        self.TotalProcessed = sum(spider.TotalProcessed for spider in DomainSpidders)
        elapsed = time.time() - self.ThreadStartTime
        minutesUsed = elapsed/60
        perMinute = float(self.TotalProcessed * 60/elapsed)
        C.Info('Totoal Domains:%d, Time used:%.fm, Speed:%.f/m' % (self.TotalProcessed, minutesUsed, perMinute), C.INFO)

if __name__ == '__main__':
    # Command line: <script> <spider count> [seed domain]
    # Python 2 only: reload(sys) makes sys.setdefaultencoding callable again.
    reload(sys)
    sys.setdefaultencoding("UTF-8")

    args = sys.argv[1:]
    if not args:
        # The spider count is mandatory; exit quietly without it
        sys.exit()
    if len(args) > 1:
        # Optional seed domain goes straight onto the first queue
        HTTPSQSQueue.put(DOMAINQUEUE01, args[1])

    spiderCount = int(args[0])
    for tid in range(spiderCount):
        DomainSpidders.append(DomainSpidder(tid))
    for spider in DomainSpidders:
        spider.start()
    Monitor().start()
Esempio n. 10
0
 def reset(self, qName):
     '''
     Reset the HTTPSQSQueue named qName and print its status afterwards.
     '''
     HTTPSQSQueue.reset(qName)
     status = HTTPSQSQueue.status(qName)
     # Single-argument print in parentheses prints the same text as the
     # statement form.
     print(status)