Example #1
# NOTE: this example targets Python 2 (urllib2, robotparser, dict.iteritems).
# The Indexer class is assumed to be defined elsewhere in the project.
import re
import robotparser
import urllib2
from collections import Counter
from urlparse import urljoin

from bs4 import BeautifulSoup


class Crawler(object):
    def __init__(self):
        self.visited_url = set()
        self.root_url = None
        self.indexer = Indexer()

    def pass_robot_txt(self, url):
        # Consult the site's robots.txt before fetching a page.
        robot = robotparser.RobotFileParser()
        robot.set_url(urljoin(self.root_url, '/robots.txt'))
        robot.read()

        return robot.can_fetch('*', url)

    def define_root_url(self, url):
        self.root_url = url

    def add_included_suburls(self, soup):
        # Collect site-relative links found on the page and resolve them
        # against the root URL.
        urls = set()

        refs = soup.findAll('a')

        for ref in refs:
            try:
                href = ref['href']
            except KeyError:
                print("Anchor tag has no href attribute")
                continue

            # Skip empty or single-character hrefs such as '/' or '#'.
            if len(href) < 2:
                continue

            # Skip absolute and protocol-relative links.
            if '//' in href:
                continue

            # Keep only site-relative links.
            if href[0] != '/':
                continue

            if self.root_url in href:
                urls.add(href)
            else:
                urls.add(self.root_url + href)

        return urls

    def get_pair_word_and_count(self, soup):
        """Return a Counter of the visible words on the page."""
        def visible(element):
            if element.parent.name in [
                    'head', 'script', 'style', '[document]'
            ]:
                return False

            if re.match('<!--.*-->', str(element)):  # skip HTML comments
                return False

            if element == '\n':
                return False

            return True

        data = soup.findAll(text=True)

        visible_text = filter(visible, data)
        words = list()

        for text in visible_text:
            result = re.findall(r'[0-9a-z]+', text.lower())

            for res in result:
                words.append(res)

        self.indexer.add_words(set(words))

        return Counter(words)

    def visit(self, url, width, depth):
        """Fetch and index `url`, then follow at most `width` links per page,
        descending at most `depth` levels."""

        if depth < 0:
            return

        if not self.pass_robot_txt(url):
            raise Exception("robots.txt disallows fetching this URL")

        current_url = url
        self.indexer.add_url(current_url)

        depth = depth - 1

        try:
            html = urllib2.urlopen(url).read()
        except Exception:
            print("Can't open this URL: " + url)
            return

        soup = BeautifulSoup(html, 'html.parser')

        urls = self.add_included_suburls(soup)

        for suburl in urls:
            if suburl in self.visited_url:
                continue

            # Stop following links on this page once the width budget is spent.
            if width == 0:
                break

            self.visited_url.add(suburl)
            width = width - 1
            self.visit(suburl, width, depth)

        words = self.get_pair_word_and_count(soup).iteritems()

        self.indexer.create_index(words, current_url)

    def run(self, url, width, depth):
        self.define_root_url(url)
        self.visit(url, width, depth)
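
The crawler relies on an Indexer class that is not part of this listing. Below is a minimal sketch of how the class above might be driven: the stub Indexer is an assumption, with only its method names (add_url, add_words, create_index) taken from the calls made by the crawler; its internals are purely illustrative.

# Hypothetical driver (Python 2). The stub Indexer is an assumption; it only
# records what the crawler hands it, so the example can run without the
# project's real Indexer implementation.
from collections import defaultdict


class Indexer(object):
    def __init__(self):
        self.urls = []                 # every URL visited by the crawler
        self.words = set()             # vocabulary seen so far
        self.index = defaultdict(set)  # word -> set of URLs containing it

    def add_url(self, url):
        self.urls.append(url)

    def add_words(self, words):
        self.words.update(words)

    def create_index(self, word_counts, url):
        # word_counts is an iterable of (word, count) pairs for one page.
        for word, _count in word_counts:
            self.index[word].add(url)


if __name__ == '__main__':
    crawler = Crawler()
    # Follow at most 3 links per page and descend at most 2 levels.
    crawler.run('http://example.com', width=3, depth=2)
    print("%d pages indexed" % len(crawler.indexer.urls))

A real indexer would presumably persist these structures; the stub only keeps them in memory for the sake of the example.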