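The DCrawl excerpt below leaves out its module-level imports. Judging from the calls it makes, the standard-library part is likely the following (consoleNg, WebAnalyzer and SearchLinks are project-specific modules that are not shown here):

# Standard-library imports the DCrawl excerpt appears to rely on (Python 2).
import os
import re
import threading
from time import sleep
from urlparse import urlparse
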
class DCrawl(consoleNg.Console):
    def __init__(self,
                 threads=2,
                 reject=[],
                 store=False,
                 proxy=None,
                 cookie=None):

        self.FormAnalysis = WebAnalyzer(forms=True)
        self.running = False
        self.stopped = False

        self.CFG_threads = threads
        self.runningThreads = 0
        self.CFG_store = store
        self.CFG_reject = reject + [
            "jpg", "gif", "png", "zip", "exe", "doc", "swf", "rar", "pdf"
        ]

        self.totalUrls = 0
        self.doneUrls = 0
        self.POOL = []
        self.Done = {}
        self.CFG_proxy = proxy

        self.urlOKS = []

        if cookie:
            self.CFG_cookie = cookie
        else:
            self.CFG_cookie = None

        self.threads_list = []

        self.Semaphore = threading.BoundedSemaphore(value=self.CFG_threads)
        self.Semaphore_Mutex = threading.BoundedSemaphore(value=1)

        self.reReject = []
        self.reNeeded = None

        consoleNg.Console.__init__(self, "dcrawl> ")
        self.Handler = None

        self.Requests = []

    def addReject(self, regex):
        '''Regexes for paths that should not be visited'''
        self.reReject.append(re.compile(regex, re.I))

    def getRejects(self):
        return self.reReject

    def addNeeded(self, regex):
        '''Regexp that a URL must match in order to be visited'''
        if regex:
            self.reNeeded = re.compile(regex, re.I)
        else:
            self.reNeeded = None

    def reset(self):
        self.running = False
        self.stopped = True
        self.Handler.join()
        self.totalUrls = 0
        self.doneUrls = 0
        self.POOL = []
        self.Done = {}
        self.Requests = []
        self.FormAnalysis = WebAnalyzer(forms=True)

    def continueWith(self, url):
        '''Logic that decides whether a URL gets visited'''
        if url not in self.Done:
            for i in self.reReject:
                if i.findall(url):
                    return False
            if self.reNeeded and not self.reNeeded.findall(url):
                print url
                return False
            return True
        return False

    def __setattr__(self, item, value):
        '''Used only by the console version'''
        if item == "CFG_threads":
            if self.running:
                raise Exception, "Crawler running!!! Wait to finish"
            self.__dict__[item] = value
            self.Semaphore = threading.BoundedSemaphore(value=self.CFG_threads)
            return

        self.__dict__[item] = value

    def __getattr__(self, item):
        '''Used only by the console version'''
        if item == "CFG_threads":
            return self.threads
        else:
            raise AttributeError(item)

    def status(self):
        return self.doneUrls, self.totalUrls

    def stop(self):
        self.running = False
        self.stopped = True
        if self.Handler:
            self.Handler.join()

    def restart(self):
        self.running = True
        self.stopped = False
        self.Launch()

    def append(self, url):
        '''Appends a URL to the output pool and starts the crawler if it is stopped'''
        if self.continueWith(url):
            self.Semaphore_Mutex.acquire()
            self.totalUrls += 1
            self.POOL.append(url)
            self.Done[url] = True
            if url[-1] != "/":
                tmp = url.split("/")
                url = "/".join(tmp[:-1]) + "/"
            self.Semaphore_Mutex.release()
            if not self.running and not self.stopped:
                self.running = True
                self.Launch()

    def postFetch(self, data, url):
        '''Feeds a fetched page to the form analyzer, or stores it to disk when CFG_store is set'''
        if data:
            if self.CFG_store:
                # Map the URL onto a local directory tree: <domain>/<path components>/<file>
                _, dom, path, params, query, _ = urlparse(url)
                if path[-1] == '/' and not params and not query:
                    path = [dom] + path.split('/')
                    filename = "index.html"
                else:
                    path = path.split('/')
                    filename = path[-1]
                    if params:
                        filename += ";" + params
                    if query:
                        filename += "?" + query
                    path = [dom] + path[:-1]

                path = [i for i in path if i]

                dirs = ""
                for i in path:
                    dirs = os.path.join(dirs, i)

                try:
                    os.makedirs(dirs)
                    print "Making dir: ", dirs
                except OSError:
                    pass  # directory already exists

                finalpath = os.path.join(dirs, filename)

                f = open(finalpath, "w")
                f.write(data)
                print "writing: ", finalpath
                f.close()
            else:
                self.FormAnalysis.appendPage(data, url)

    def getForms(self):
        '''deprecated'''
        dic = self.FormAnalysis.formSummary()

        for i, j in dic.items():
            print j, i

    def getDynPages(self):
        '''deprecated'''
        for i in self.FormAnalysis.getDynPages():
            print i

    def getInfo(self):
        '''deprecated'''
        print self.FormAnalysis.infoSummary()

    def wait(self):
        while self.running:
            sleep(1)

    def getAllPools(self):
        res = self.POOL
        self.POOL = []
        return res

    def Launch(self):
        self.Handler = threading.Thread(target=self.threadHandler, kwargs={})
        self.Handler.start()

    def threadHandler(self):
        while self.running:
            urls = self.getAllPools()
            if not urls:
                self.running = False
                break

            for url in urls:
                if not self.running:
                    break
                path = urlparse(url)[2]
                if "." in path:
                    ext = urlparse(url)[2].split(".")[-1]
                else:
                    ext = ""

                if ext in self.CFG_reject:
                    self.postFetch(None, url)
                    self.doneUrls += 1
                    continue

                self.Semaphore.acquire()

                th = threading.Thread(target=self.crawl, kwargs={"url": url})
                th.start()
                self.threads_list.append(th)

            temp = []
            for thr in self.threads_list:
                if not thr.isAlive():
                    temp.append(thr)

            for i in temp:
                i.join()
                self.threads_list.remove(i)

            if not self.running:
                for i in self.threads_list:
                    i.join()
                self.threads_list = []

    def crawl(self, url):
        try:
            sl = SearchLinks(url)
            sl.setProxy(self.CFG_proxy)
            sl.addCookie(self.CFG_cookie)
            for j in sl:
                self.append(j)
            if sl.status != "Ok":
                print "fallo en", url
            else:
                self.Semaphore_Mutex.acquire()
                self.Requests.append(sl.getRequest())
                self.Semaphore_Mutex.release()
                self.urlOKS.append(url)
                #self.CFG_cookie=sl.getCookie()
                self.postFetch(sl.getResponse(), url)
        except Exception, er:
            print "Crawler: ", er

        self.Semaphore_Mutex.acquire()
        self.doneUrls += 1
        self.Semaphore_Mutex.release()
        self.Semaphore.release()
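
A minimal usage sketch for the class above, assuming its project-specific dependencies (consoleNg, WebAnalyzer, SearchLinks) are importable; the seed URL and the regexes are illustrative only:

# Hypothetical driver for the DCrawl example (Python 2).
crawler = DCrawl(threads=4, store=False)
crawler.addReject(r"/logout")                 # skip logout links
crawler.addNeeded(r"^http://example\.com/")   # only follow URLs on this host
crawler.append("http://example.com/")         # seeds the pool and launches the handler thread
crawler.wait()                                # block until the pool drains
done, total = crawler.status()
print "crawled", done, "of", total, "urls"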
Example #6
 def run(self):
     while True:
         try:
             services = self.sounder.get()
             if not services:
                 self._save_service_config()
             else:
                 for service in services:
                     time_interval = pendulum.now().subtract(
                         days=service.day_interval,
                         hours=service.hour_interval)
                     if time_interval >= pendulum.parse(
                             service.last_checked):
                         # RUN SERVICE
                         if service == 'openphish':
                             from openphish import OpenPhish
                             for finding in OpenPhish().get():
                                 self.spotted.save(url=finding,
                                                   source=service)
                                 self.publisher.post(finding)
                         if service == 'phishingdatabase':
                             from phishingdatabase import PhishingDatabase
                             for finding in PhishingDatabase().get(
                                     today=True):
                                 self.spotted.save(url=finding,
                                                   source=service)
                                 self.publisher.post(finding)
                         if service == 'phishtank':
                             from phishtank import PhishTank
                             for finding in PhishTank().get():
                                 self.spotted.save(
                                     url=finding['url'],
                                     source=service,
                                     ipv4_address=finding['ip'],
                                     country=finding['country'],
                                     registrar=finding['registrar'])
                                 self.publisher.post(finding['url'])
                         if service == 'twitter':
                             count = 0
                             last_id = self.sounder.get(
                                 service=service)['last_id']
                             if not last_id:
                                 last_id = None
                             from twitterscraper import TwitterScraper
                             for finding in TwitterScraper().get(
                                     since_id=last_id):
                                 if not last_id and count == 0:
                                     self.sounder.save(
                                         service=service,
                                         last_id=finding['id'])
                                     count += 1
                                 self.spotted.save(
                                     tweet_extracted_urls=finding[
                                         'extracted_urls'],
                                     tweet_urls=finding['urls'],
                                     tweet_hash_tags=finding['tags'],
                                     tweet_text=finding['text'],
                                     tweet_id=finding['id'],
                                     source=service)
                                 self.publisher.post(
                                     finding['extracted_urls'])
                         if service == 'urlscan':
                             from urlscan import UrlScan
                             for finding in UrlScan().get():
                                 self.spotted.save(
                                     url=finding['url'],
                                      parsed_url=finding['parsed_url'],
                                     ipv4_address=finding['ip'],
                                     country=finding['country'],
                                     domain=finding['domain'],
                                     source=finding['source'])
                                 self.publisher.post(finding['url'])
                         if service == 'webanalyzer':
                             from webanalyzer import WebAnalyzer
                             for finding in WebAnalyzer().get():
                                 self.spotted.save(url=finding,
                                                   source=service)
                                 self.publisher.post(finding)
                         if service == 'whoisds':
                             from whoisds import WhoisDs
                             for finding in WhoisDs().get():
                                 self.spotted.save(url=finding,
                                                   source=service)
                                 self.publisher.post(finding)
         except:
             print('ERROR: Error when calling spotter.run: {}'.format(
                 sys.exc_info()[0]))
             pass
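
For reference, the pendulum-based freshness check used above can be reduced to a standalone sketch; the timestamp and the two-day interval below are made up:

# A service is due again once its last check is older than the configured interval.
import pendulum

last_checked = "2020-01-01T00:00:00+00:00"            # illustrative value
threshold = pendulum.now().subtract(days=2, hours=0)  # now minus the interval
if threshold >= pendulum.parse(last_checked):
    print("service is due for another run")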