Example #1
import os
import re
import threading
from time import sleep
from urlparse import urlparse

import consoleNg
# WebAnalyzer and SearchLinks come from project-local modules whose
# import paths are not shown in this listing.

class DCrawl(consoleNg.Console):
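    '''Threaded web crawler driven through an interactive console (consoleNg).'''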
    def __init__(self,
                 threads=2,
                 reject=None,
                 store=False,
                 proxy=None,
                 cookie=None):

        self.FormAnalysis = WebAnalyzer(forms=True)
        self.running = False
        self.stopped = False

        self.CFG_threads = threads
        self.runningThreads = 0
        self.CFG_store = store
        # static/binary extensions are never crawled
        self.CFG_reject = (reject or []) + [
            "jpg", "gif", "png", "zip", "exe", "doc", "swf", "rar", "pdf"
        ]

        self.totalUrls = 0
        self.doneUrls = 0
        self.POOL = []
        self.Done = {}
        self.CFG_proxy = proxy

        self.urlOKS = []

        self.CFG_cookie = cookie or None

        self.threads_list = []

        # Semaphore caps the number of concurrent crawl threads;
        # Semaphore_Mutex is a one-slot semaphore used as a mutex around shared state.
        self.Semaphore = threading.BoundedSemaphore(value=self.CFG_threads)
        self.Semaphore_Mutex = threading.BoundedSemaphore(value=1)

        self.reReject = []
        self.reNeeded = None

        consoleNg.Console.__init__(self, "dcrawl> ")
        self.Handler = None

        self.Requests = []

    def addReject(self, regex):
        '''Register a regex for paths that should not be visited'''
        self.reReject.append(re.compile(regex, re.I))

    def getRejects(self):
        return self.reReject

    def addNeeded(self, regex):
        '''Regex that a URL must match before it is visited'''
        if regex:
            self.reNeeded = re.compile(regex, re.I)
        else:
            self.reNeeded = None

    def reset(self):
        self.running = False
        self.stopped = True
        if self.Handler:
            self.Handler.join()
        self.totalUrls = 0
        self.doneUrls = 0
        self.POOL = []
        self.Done = {}
        self.Requests = []
        self.FormAnalysis = WebAnalyzer(forms=True)

    def continueWith(self, url):
        '''Decide whether or not a URL should be visited'''
        if url not in self.Done:
            for i in self.reReject:
                if i.findall(url):
                    return False
            if self.reNeeded and not self.reNeeded.findall(url):
                print url  # report URLs skipped by the "needed" filter
                return False
            return True
        return False

    def __setattr__(self, item, value):
        '''Only used by the console front-end'''
        if item == "CFG_threads":
            if self.running:
                raise Exception("Crawler running, wait for it to finish")
            self.__dict__[item] = value
            self.Semaphore = threading.BoundedSemaphore(value=self.CFG_threads)
            return

        self.__dict__[item] = value

    def __getattr__(self, item):
        '''Only used by the console front-end'''
        # __getattr__ fires only for names missing from __dict__, so alias
        # the console-facing name "threads" to the CFG_threads setting
        if item == "threads":
            return self.CFG_threads
        else:
            raise AttributeError(item)

    def status(self):
        return self.doneUrls, self.totalUrls

    def stop(self):
        self.running = False
        self.stopped = True
        if self.Handler:
            self.Handler.join()

    def restart(self):
        self.running = True
        self.stopped = False
        self.Launch()

    def append(self, url):
        '''Append a URL to the outgoing pool and start the crawler if it is idle'''
        if self.continueWith(url):
            self.Semaphore_Mutex.acquire()
            self.totalUrls += 1
            self.POOL.append(url)
            self.Done[url] = True
            if url[-1] != "/":
                tmp = url.split("/")
                url = "/".join(tmp[:-1]) + "/"
            self.Semaphore_Mutex.release()
            if not self.running and not self.stopped:
                self.running = True
                self.Launch()

    def postFetch(self, data, url):
        '''Add fetched pages to the final pool, or store everything to disk'''
        if data:
            if self.CFG_store:
                _, dom, path, params, query, _ = urlparse(url)
                if not path:
                    path = "/"  # bare host URL: treat it as the site root
                if path[-1] == '/' and not params and not query:
                    path = [dom] + path.split('/')
                    fname = "index.html"
                else:
                    path = path.split('/')
                    fname = path[-1]
                    if params:
                        fname += ";" + params
                    if query:
                        fname += "?" + query
                    path = [dom] + path[:-1]

                path = [i for i in path if i]

                dirs = ""
                for i in path:
                    dirs = os.path.join(dirs, i)

                try:
                    os.makedirs(dirs)
                    print "Making dir: ", dirs
                except OSError:
                    pass  # directory already exists

                finalpath = os.path.join(dirs, fname)

                with open(finalpath, "w") as f:
                    f.write(data)
                print "writing: ", finalpath
            else:
                self.FormAnalysis.appendPage(data, url)

    def getForms(self):
        '''deprecated'''
        dic = self.FormAnalysis.formSummary()

        for i, j in dic.items():
            print j, i

    def getDynPages(self):
        '''deprecated'''
        for i in self.FormAnalysis.getDynPages():
            print i

    def getInfo(self):
        '''deprecated'''
        print self.FormAnalysis.infoSummary()

    def wait(self):
        while self.running:
            sleep(1)

    def getAllPools(self):
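        '''Drain and return the pending URL pool.'''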
        res = self.POOL
        self.POOL = []
        return res

    def Launch(self):
        self.Handler = threading.Thread(target=self.threadHandler, kwargs={})
        self.Handler.start()

    def threadHandler(self):
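        '''Dispatcher loop: drain the URL pool, spawn one crawl thread per
        URL (bounded by Semaphore) and reap the threads that finished.'''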
        while self.running:
            urls = self.getAllPools()
            if not urls:
                self.running = False
                break

            for url in urls:
                if not self.running:
                    break
                path = urlparse(url)[2]
                if "." in path:
                    ext = path.split(".")[-1]
                else:
                    ext = ""

                if ext in self.CFG_reject:
                    self.postFetch(None, url)
                    self.doneUrls += 1
                    continue

                self.Semaphore.acquire()

                th = threading.Thread(target=self.crawl, kwargs={"url": url})
                th.start()
                self.threads_list.append(th)

            temp = []
            for thr in self.threads_list:
                if not thr.isAlive():
                    temp.append(thr)

            for i in temp:
                i.join()
                self.threads_list.remove(i)

            if not self.running:
                for i in self.threads_list:
                    i.join()
                self.threads_list = []

    def crawl(self, url):
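        '''Fetch one URL through SearchLinks, enqueue every link it finds
        and record the request/response on success.'''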
        try:
            sl = SearchLinks(url)
            sl.setProxy(self.CFG_proxy)
            sl.addCookie(self.CFG_cookie)
            for j in sl:
                self.append(j)
            if sl.status != "Ok":
                print "fallo en", url
            else:
                self.Semaphore_Mutex.acquire()
                self.Requests.append(sl.getRequest())
                self.Semaphore_Mutex.release()
                self.urlOKS.append(url)
                #self.CFG_cookie=sl.getCookie()
                self.postFetch(sl.getResponse(), url)
        except Exception, er:
            print "Crawler: ", er

        self.Semaphore_Mutex.acquire()
        self.doneUrls += 1
        self.Semaphore_Mutex.release()
        self.Semaphore.release()
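
A minimal usage sketch, assuming the project-local consoleNg, WebAnalyzer and SearchLinks modules are importable; the seed URL and the reject pattern are placeholders:

if __name__ == "__main__":
    crawler = DCrawl(threads=4, store=False)
    crawler.addReject(r"logout")           # skip session-killing links
    crawler.append("http://example.com/")  # seeding the pool also starts the crawler
    crawler.wait()                         # block until the pool drains
    done, total = crawler.status()
    print "crawled %d of %d urls" % (done, total)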