Beispiel #1
0
    def run(self, max_depth):
        """Drain ``self.pending``, scraping and persisting each queued page.

        Every entry popped from the left of ``self.pending`` is handed to
        ``self.scraper.run``; the returned title and links are copied onto
        the entry and stored as a ``Page``. The scraped links are passed to
        ``self.enqueue`` for as long as the crawl level counter stays below
        ``max_depth``.

        Args:
            max_depth: Links are only enqueued while the crawl level counter
                is strictly lower than this value.
        """
        level = 0
        while len(self.pending):
            entry = self.pending.popleft()

            # Scrape the page and copy the results onto the queue entry.
            scraped = self.scraper.run(entry)
            entry['title'] = scraped['title']
            entry['links'] = scraped['links']

            # Store the page; the parent is only set when the entry has one.
            record = Page(
                title=entry['title'],
                url=entry['link'],
                search_term=self.search_term,
                links=entry['links'],
                created_at=datetime.now(),
                updated_at=datetime.now(),
            )
            if entry['parent'] is not None:
                record.parent = entry['parent']
            record.save()

            # Identity comparison (``is``), exactly as the bookkeeping relies
            # on: a different parent object marks the start of a new group.
            parent_changed = entry['parent'] is not self.curr_parent
            if parent_changed or self.curr_parent is None:
                # NOTE(review): link_parent is only refreshed here; when this
                # branch is skipped the value from an earlier iteration is
                # reused for enqueue() below -- confirm this is intended.
                link_parent = record.id
                if parent_changed:
                    level += 1
                    self.curr_parent = entry['parent']

            logger.info('crawl level: %d, max depth: %d, queue size: %d',
                        level, max_depth, len(self.pending))
            logger.info('crawling page id %s, url: %s', str(record.id),
                        entry['link'])

            # Schedule the scraped links for the next round of crawling.
            if level < max_depth:
                self.enqueue(scraped['links'], link_parent)
 def get_state(self):
     """Return the state named ``self.page`` for the driver's current URL.

     The ``Page`` stored for ``self.driver.current_url`` is looked up first.
     If none exists, the URL is loaded in the driver, the state captured by
     ``state_builder.get_current_state`` is named ``self.page`` and saved,
     and a new ``Page`` is saved with that state as both its default state
     and its only entry in ``states``.

     The page's states are then searched for one whose ``name`` equals
     ``self.page``. If none matches, a fresh state is captured, named,
     saved, appended to ``page.states`` and returned.

     Returns:
         The matching existing state, or the newly created one.
     """
     url = self.driver.current_url
     page = Page.objects(url=url).first()
     if page is None:
         # First visit to this URL: capture a default state and create the page.
         self.driver.get(url)
         default_state = state_builder.get_current_state(self.driver)
         default_state.name = self.page
         default_state.save()
         page = Page(url=url,
                     default_state=default_state,
                     states=[default_state])
         page.name = self.page
         page.save()
     for state in page.states:
         if state.name == self.page:
             # Parenthesised single-argument form prints the same text under
             # both the Python 2 print statement and the print function.
             print("Found state %s" % state.name)
             return state
     print("State not found, creating new state")
     new_state = state_builder.get_current_state(self.driver)
     # Assign the name BEFORE saving so it is written with the state, the same
     # order used for default_state above. Previously the name was set after
     # save(), so this state's own save() call ran before the name existed.
     new_state.name = self.page
     new_state.save()
     page.states.append(new_state)
     page.save()
     return new_state
Beispiel #3
0
 def test_page(self):
     """Save a Page built from stored State documents and check it has states."""
     first_state = State.objects().first()
     some_states = State.objects[:5]
     google_page = Page(
         url="http://www.google.com/",
         default_state=first_state,
         states=some_states,
     )
     google_page.save()
     # The saved page must carry at least one state.
     state_count = len(google_page.states)
     assert state_count > 0