def run(self, max_depth):
    """Breadth-first crawl of the pages queued in ``self.pending``.

    Each queued ``page`` dict is scraped for its title and outgoing
    links, persisted as a ``Page`` document, and — while the current
    crawl level is below ``max_depth`` — its links are enqueued for the
    next level of crawling.

    :param max_depth: crawl level at which links stop being enqueued
    """
    depth = 0
    while len(self.pending) > 0:
        page = self.pending.popleft()
        # get links for the current page
        page_data = self.scraper.run(page)
        page['title'] = page_data['title']
        page['links'] = page_data['links']
        # persist to db
        # NOTE(review): the document URL is read from page['link']
        # (singular) while scraped links are stored under page['links'] —
        # presumably 'link' is set by the enqueue step; confirm the key.
        instance = Page(title=page['title'], url=page['link'],
                        search_term=self.search_term, links=page['links'],
                        created_at=datetime.now(), updated_at=datetime.now())
        if page['parent'] is not None:
            instance.parent = page['parent']
        instance.save()
        # Detect a transition to a different parent, i.e. the start of a
        # new crawl level.
        # NOTE(review): identity comparison (`is not`) between parent
        # values — this assumes parents are shared object references;
        # verify `!=` is not what was intended here.
        if self.curr_parent is None or page[
                'parent'] is not self.curr_parent:
            parent_id = self.curr_parent
            if parent_id is None:
                parent_id = instance.id
            if page['parent'] is not self.curr_parent:
                # New parent encountered: descend one level and make the
                # freshly-saved page the parent for the links we enqueue.
                depth += 1
                parent_id = instance.id
            self.curr_parent = page['parent']
        logger.info('crawl level: %d, max depth: %d, queue size: %d',
                    depth, max_depth, len(self.pending))
        logger.info('crawling page id %s, url: %s',
                    str(instance.id), page['link'])
        # queue up the links for another level of crawling
        if depth < max_depth:
            self.enqueue(page_data['links'], parent_id)
def get_state(self):
    """Return the State named ``self.page`` for the driver's current URL.

    Looks up the ``Page`` document for the browser's current URL,
    creating it (with a freshly captured default state) on first visit.
    Returns the stored state whose name matches ``self.page``; if none
    exists, captures, persists, and returns a new one.
    """
    url = self.driver.current_url
    page = Page.objects(url=url).first()
    if page is None:
        # First visit: capture the page's default state and persist
        # both the state and the page that references it.
        # NOTE(review): driver is already at `url`; this re-navigation
        # presumably forces a clean load — confirm it is intentional.
        self.driver.get(url)
        default_state = state_builder.get_current_state(self.driver)
        default_state.name = self.page
        default_state.save()
        page = Page(url=url, default_state=default_state,
                    states=[default_state])
        page.name = self.page
        page.save()
    for state in page.states:
        if state.name == self.page:
            print("Found state %s" % state.name)
            return state
    print("State not found, creating new state")
    new_state = state_builder.get_current_state(self.driver)
    # BUG FIX: assign the name *before* saving so it is persisted on the
    # state document (previously the state was saved nameless, unlike
    # the default-state path above which names it first).
    new_state.name = self.page
    new_state.save()
    page.states.append(new_state)
    page.save()
    return new_state
def test_page(self):
    """Persist a Page built from already-stored states and check that
    the saved document still carries a non-empty state list."""
    first_state = State.objects().first()
    sample_states = State.objects[:5]
    saved_page = Page(
        url="http://www.google.com/",
        default_state=first_state,
        states=sample_states,
    )
    saved_page.save()
    assert len(saved_page.states) > 0