Code example #1
File: crawler.py  Project: Deceptio-Solutions/OSINT
from collections import deque
from urllib2 import URLError  # Python 2; on Python 3 use urllib.error

from selenium import webdriver
from pyfaup.faup import Faup  # pyfaup URL parser

# Project-local helpers; import paths assumed, modules not shown in this listing
from redisrecord import RedisRecord
from filters import Filters


class Crawler(object):
    def __init__(self, driver, lock, first_url, db_int=1):  # db_int is currently unused
        self.driver = driver
        self.driver.implicitly_wait(10)        # wait up to 10 s when locating elements
        self.driver.set_page_load_timeout(30)  # give up on page loads after 30 s
        self.r = RedisRecord()
        self.lock = lock
        self.queue = deque([first_url])        # BFS frontier, seeded with the start URL
        self.dbs = [1, 2]                      # DB 1: crawled/queued URLs, DB 2: filtered URLs

    def init(self):
        self.r.init(self.dbs)
        url = self.queue.popleft()
        self.parser(url)  # parser() fetches the page itself, so no separate driver.get() is needed

    def parser(self, url):
        self.r.switchDB(1)
        if not self.r.get(url):  # skip URLs that were already crawled
            self.driver.get(url)
            elem_links = self.driver.find_elements_by_tag_name('a')
            self.lock.acquire()
            try:
                self.sort([link.get_attribute("href") for link in elem_links], url)
            finally:
                self.lock.release()
            self.r.switchDB(1)
            self.r.put(url, url)  # mark the URL as crawled

    def navigation(self):
        while self.queue:
            url = self.queue.popleft()
            try:
                self.r.switchDB(1)
                self.parser(url)
            except URLError:
                print url  # log the URL that failed to load
            except IOError as e:
                self.r.switchDB(2)
                print "I/O error({0}): {1}".format(e.errno, e.strerror)
                self.r.switchDB(1)
            except Exception:  # skip links that fail for any other reason
                continue
        try:
            self.driver.quit()
            print "Finished crawling site " + url
        except URLError:
            # The driver died mid-shutdown: restart Firefox and re-record
            # the last URL under the shared lock.
            self.driver = webdriver.Firefox()
            self.lock.acquire()
            try:
                self.r.switchDB(1)
                self.r.put(url, url)
            finally:
                self.lock.release()

    def sort(self, elem_links, url):
        fex = Faup()
        f = Filters()
        f.load()
        self.r.switchDB(1)
        try:
            for link in elem_links:
                new_url = link
                # Reset the filter flags for every link, so one filtered link
                # does not block all the links that follow it.
                extend = True
                domainfilter = True
                schemefilter = True
                self.r.switchDB(2)
                if new_url and not self.r.get(new_url):  # not already filtered out
                    self.r.switchDB(1)
                    if not self.r.get(new_url):  # not already crawled or queued
                        fex.decode(new_url)
                        domain = fex.get_host()
                        if f.isfilteredscheme(fex.get_scheme()):
                            self.r.switchDB(2)
                            self.r.put(new_url, new_url)
                            schemefilter = False
                        if f.isfiltereddomains(domain):
                            self.r.switchDB(2)
                            self.r.put(new_url, new_url)
                            domainfilter = False
                        if f.isfilteredextention(fex.get_resource_path()):
                            extend = False
                            self.r.switchDB(2)
                            self.r.put(new_url, new_url)

                        if extend and domainfilter and schemefilter:
                            # The link passed every filter: queue it for crawling
                            self.r.switchDB(1)
                            self.r.rpush('crawl', new_url)
                            self.queue.append(new_url)
        except TypeError:
            print "TypeError"