# Standard-library imports used by this class. The project-local helpers
# consoleNg, WebAnalyzer and SearchLinks are assumed to be imported elsewhere
# in this module.
import os
import re
import threading
from time import sleep
from urlparse import urlparse


class DCrawl(consoleNg.Console):

    def __init__(self, threads=2, reject=[], store=False, proxy=None, cookie=None):
        self.FormAnalysis = WebAnalyzer(forms=True)
        self.running = False
        self.stopped = False
        self.CFG_threads = threads
        self.runningThreads = 0
        self.CFG_store = store
        # Extensions that are never fetched, on top of the caller-supplied list.
        self.CFG_reject = reject + ["jpg", "gif", "png", "zip", "exe", "doc",
                                    "swf", "rar", "pdf"]
        self.totalUrls = 0
        self.doneUrls = 0
        self.POOL = []
        self.Done = {}
        self.CFG_proxy = proxy
        self.urlOKS = []
        if cookie:
            self.CFG_cookie = cookie
        else:
            self.CFG_cookie = None
        self.threads_list = []
        self.Semaphore = threading.BoundedSemaphore(value=self.CFG_threads)
        self.Semaphore_Mutex = threading.BoundedSemaphore(value=1)
        self.reReject = []
        self.reNeeded = None
        consoleNg.Console.__init__(self, "dcrawl> ")
        self.Handler = None
        self.Requests = []

    def addReject(self, regex):
        '''For paths that should not be visited.'''
        self.reReject.append(re.compile(regex, re.I))

    def getRejects(self):
        return self.reReject

    def addNeeded(self, regex):
        '''Regexp that a URL must match in order to be visited.'''
        if regex:
            self.reNeeded = re.compile(regex, re.I)
        else:
            self.reNeeded = None

    def reset(self):
        self.running = False
        self.stopped = True
        self.Handler.join()
        self.totalUrls = 0
        self.doneUrls = 0
        self.POOL = []
        self.Done = {}
        self.Requests = []
        self.FormAnalysis = WebAnalyzer(forms=True)

    def continueWith(self, url):
        '''Logic that decides whether a URL gets visited or not.'''
        if url not in self.Done:
            for i in self.reReject:
                if i.findall(url):
                    return False
            if self.reNeeded and not self.reNeeded.findall(url):
                print url
                return False
            return True
        return False

    def __setattr__(self, item, value):
        '''Used only by the console version.'''
        if item == "CFG_threads":
            if self.running:
                raise Exception, "Crawler running!!! Wait for it to finish"
            self.__dict__[item] = value
            self.Semaphore = threading.BoundedSemaphore(value=self.CFG_threads)
            return
        self.__dict__[item] = value

    def __getattr__(self, item):
        '''Used only by the console version.'''
        if item == "CFG_threads":
            return self.threads
        else:
            raise AttributeError(item)

    def status(self):
        return self.doneUrls, self.totalUrls

    def stop(self):
        self.running = False
        self.stopped = True
        if self.Handler:
            self.Handler.join()

    def restart(self):
        self.running = True
        self.stopped = False
        self.Launch()

    def append(self, url):
        '''Appends a URL to the output pool and starts the crawler if it is stopped.'''
        if self.continueWith(url):
            self.Semaphore_Mutex.acquire()
            self.totalUrls += 1
            self.POOL.append(url)
            self.Done[url] = True
            if url[-1] != "/":
                tmp = url.split("/")
                url = "/".join(tmp[:-1]) + "/"
            self.Semaphore_Mutex.release()
            if not self.running and not self.stopped:
                self.running = True
                self.Launch()

    def postFetch(self, str, url):
        '''Feeds a fetched page into the analyzer, or stores everything to disk.'''
        if str:
            if self.CFG_store:
                _, dom, path, pre, vars, _ = urlparse(url)
                if path[-1] == '/' and not pre and not vars:
                    path = [dom] + path.split('/')
                    file = "index.html"
                else:
                    path = path.split('/')
                    file = path[-1]
                    if pre:
                        file += ";" + pre
                    if vars:
                        file += "?" + vars
                    path = [dom] + path[:-1]
                path = [i for i in path if i]
                dirs = ""
                for i in path:
                    dirs = os.path.join(dirs, i)
                    try:
                        os.makedirs(dirs)
                        print "Making dir: ", dirs
                    except:
                        pass
                finalpath = os.path.join(dirs, file)
                f = open(finalpath, "w")
                f.write(str)
                print "writing: ", finalpath
                f.close()
            else:
                self.FormAnalysis.appendPage(str, url)

    def getForms(self):
        '''deprecated'''
        dic = self.FormAnalysis.formSummary()
        for i, j in dic.items():
            print j, i

    def getDynPages(self):
        '''deprecated'''
        for i in self.FormAnalysis.getDynPages():
            print i

    def getInfo(self):
        '''deprecated'''
        print self.FormAnalysis.infoSummary()

    def wait(self):
        while self.running:
            sleep(1)

    def getAllPools(self):
        res = self.POOL
        self.POOL = []
        return res

    def Launch(self):
        self.Handler = threading.Thread(target=self.threadHandler, kwargs={})
        self.Handler.start()

    def threadHandler(self):
        while self.running:
            urls = self.getAllPools()
            if not urls:
                self.running = False
                break
            for url in urls:
                if not self.running:
                    break
                path = urlparse(url)[2]
                if "." in path:
                    ext = urlparse(url)[2].split(".")[-1]
                else:
                    ext = ""
                if ext in self.CFG_reject:
                    self.postFetch(None, url)
                    self.doneUrls += 1
                    continue
                self.Semaphore.acquire()
                th = threading.Thread(target=self.crawl, kwargs={"url": url})
                th.start()
                self.threads_list.append(th)
                # Reap worker threads that have already finished.
                temp = []
                for thr in self.threads_list:
                    if not thr.isAlive():
                        temp.append(thr)
                for i in temp:
                    i.join()
                    self.threads_list.remove(i)
        if not self.running:
            for i in self.threads_list:
                i.join()
            self.threads_list = []

    def crawl(self, url):
        try:
            sl = SearchLinks(url)
            sl.setProxy(self.CFG_proxy)
            sl.addCookie(self.CFG_cookie)
            for j in sl:
                self.append(j)
            if sl.status != "Ok":
                print "failed on", url
            else:
                self.Semaphore_Mutex.acquire()
                self.Requests.append(sl.getRequest())
                self.Semaphore_Mutex.release()
                self.urlOKS.append(url)
                #self.CFG_cookie=sl.getCookie()
                self.postFetch(sl.getResponse(), url)
        except Exception, er:
            print "Crawler: ", er
        self.Semaphore_Mutex.acquire()
        self.doneUrls += 1
        self.Semaphore_Mutex.release()
        self.Semaphore.release()
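# A minimal usage sketch, not part of the original module: it assumes the
# project-local consoleNg, WebAnalyzer and SearchLinks helpers are importable,
# and the seed URL, reject pattern and function name below are placeholders
# chosen for illustration only.
def example_crawl():
    crawler = DCrawl(threads=4, store=False)
    crawler.addReject(r"logout")                   # never follow logout links
    crawler.addNeeded(r"^http://example\.com/")    # stay on the seed host
    crawler.append("http://example.com/")          # seeding starts the handler thread
    crawler.wait()                                 # block until the pool is drained
    done, total = crawler.status()
    print("crawled %d of %d urls" % (done, total))
    crawler.stop()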
    def run(self):
        while True:
            try:
                services = self.sounder.get()
                if not services:
                    self._save_service_config()
                else:
                    # Poll each enabled service once its configured interval has elapsed.
                    for service in services:
                        time_interval = pendulum.now().subtract(
                            days=service.day_interval, hours=service.hour_interval)
                        if time_interval >= pendulum.parse(service.last_checked):
                            # RUN SERVICE
                            if service == 'openphish':
                                from openphish import OpenPhish
                                for finding in OpenPhish().get():
                                    self.spotted.save(url=finding, source=service)
                                    self.publisher.post(finding)
                            if service == 'phishingdatabase':
                                from phishingdatabase import PhishingDatabase
                                for finding in PhishingDatabase().get(today=True):
                                    self.spotted.save(url=finding, source=service)
                                    self.publisher.post(finding)
                            if service == 'phishtank':
                                from phishtank import PhishTank
                                for finding in PhishTank().get():
                                    self.spotted.save(
                                        url=finding['url'],
                                        source=service,
                                        ipv4_address=finding['ip'],
                                        country=finding['country'],
                                        registrar=finding['registrar'])
                                    self.publisher.post(finding['url'])
                            if service == 'twitter':
                                count = 0
                                last_id = self.sounder.get(service=service)['last_id']
                                if not last_id:
                                    last_id = None
                                from twitterscraper import TwitterScraper
                                for finding in TwitterScraper().get(since_id=last_id):
                                    # Remember the newest tweet id so the next run resumes from it.
                                    if not last_id and count == 0:
                                        self.sounder.save(service=service,
                                                          last_id=finding['id'])
                                    count += 1
                                    self.spotted.save(
                                        tweet_extracted_urls=finding['extracted_urls'],
                                        tweet_urls=finding['urls'],
                                        tweet_hash_tags=finding['tags'],
                                        tweet_text=finding['text'],
                                        tweet_id=finding['id'],
                                        source=service)
                                    self.publisher.post(finding['extracted_urls'])
                            if service == 'urlscan':
                                from urlscan import UrlScan
                                for finding in UrlScan().get():
                                    self.spotted.save(
                                        url=finding['url'],
                                        parsed_ur=finding['parsed_url'],
                                        ipv4_address=finding['ip'],
                                        country=finding['country'],
                                        domain=finding['domain'],
                                        source=finding['source'])
                                    self.publisher.post(finding['url'])
                            if service == 'webanalyzer':
                                from webanalyzer import WebAnalyzer
                                for finding in WebAnalyzer().get():
                                    self.spotted.save(url=finding, source=service)
                                    self.publisher.post(finding)
                            if service == 'whoisds':
                                from whoisds import WhoisDs
                                for finding in WhoisDs().get():
                                    self.spotted.save(url=finding, source=service)
                                    self.publisher.post(finding)
            except:
                print('ERROR: Error when calling spotter.run: {}'.format(
                    sys.exc_info()[0]))
                pass
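# A standalone sketch of the interval check used in run() above; it is an
# illustration, not part of the original class. The function name is
# hypothetical, only the pendulum calls mirror the code shown: a service is
# due when "now minus its configured interval" is at or past its last run.
import pendulum

def service_is_due(day_interval, hour_interval, last_checked):
    """Return True when at least the configured interval has passed since last_checked."""
    threshold = pendulum.now().subtract(days=day_interval, hours=hour_interval)
    return threshold >= pendulum.parse(last_checked)

if __name__ == "__main__":
    # A service with a 1-day interval that was last checked two days ago is due.
    two_days_ago = pendulum.now().subtract(days=2).to_iso8601_string()
    print(service_is_due(1, 0, two_days_ago))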