def calculate_depth(link=None, link_db=None):
    """Derive the depth of ``link`` from related URLs already stored in ``link_db``.

    Every URL produced by ``URLProcessor.generate_url_hierarchy(link)`` that
    ``link_db`` reports as known contributes a candidate equal to its stored
    value plus one; the largest candidate is returned.

    Args:
        link: URL whose depth is wanted.
        link_db: Object providing ``is_in_base(url)`` and ``get_details(url)``.
            Element 2 of the details record is converted with ``int()``
            (presumably the stored depth -- confirm against the link DB schema).

    Returns:
        The largest ``stored value + 1`` among the known hierarchy URLs, never
        less than 0; 0 when none of the hierarchy URLs is known.

    Raises:
        ValueError: If any step fails, e.g. missing arguments or a details
            record that has no usable element 2.
    """
    try:
        candidates = [
            int(link_db.get_details(url)[2]) + 1
            for url in URLProcessor.generate_url_hierarchy(link)
            if link_db.is_in_base(url)
        ]
    except Exception:
        # Only ordinary errors are translated. A bare ``except`` would also
        # convert KeyboardInterrupt/SystemExit into ValueError and hide them.
        raise ValueError('Invalid parameters.')
    # Seeding with 0 keeps the result non-negative and covers "nothing known".
    return max([0] + candidates)
def add_links(self, links, priority, depth=0, source_url=""):
    """Validate ``links`` and store the new, acceptable ones in the link DB.

    Each link is passed through ``URLProcessor.validate(link, source_url)``.
    The validated link is stored with ``self.link_db.add_link`` only when
    ``self._evaluate_link`` accepts it, ``self.link_db`` does not already
    contain it, and its computed depth does not exceed ``self.max_url_depth``.

    Args:
        links: Sized iterable of raw links (``len()`` is taken for logging).
        priority: Value handed to ``add_link`` for every stored link.
        depth: Not read by this method while ``IgnoreDepthPolicy`` is the
            active depth policy; kept for the alternative policies.
        source_url: Passed to ``URLProcessor.validate`` together with each link.

    Returns:
        None. Progress is reported through ``self.logger.debug``.
    """
    added = 0
    self.logger.debug('Trying to add %d links' % len(links))
    for link in links:
        _link = URLProcessor.validate(link, source_url)
        if not self._evaluate_link(_link) or self.link_db.is_in_base(_link):
            continue
        # Active depth policy. Alternatives kept for reference:
        #   SimpleCrawlingDepthPolicy.calculate_depth(link, source_url, depth)
        #   RealDepthCrawlingDepthPolicy.calculate_depth(link, self.link_db)
        _depth = IgnoreDepthPolicy.calculate_depth()
        if _depth <= self.max_url_depth:
            self.link_db.add_link(_link, priority, _depth)
            # Logged after the insert so a failing add_link is never reported
            # as added, and with the real priority (the depth used to be
            # printed under the "priority" label).
            self.logger.debug("Added:%s with priority %s" % (_link, priority))
            added += 1
    self.logger.debug("Added %d new links into DB." % added)
def add_links(self, links, priority, depth=0, source_url=""):
    """Validate ``links``, store the new ones and record where they came from.

    Each link is passed through ``URLProcessor.validate(link, source_url)``.
    A validated link is stored with ``self.link_db.add_link`` only when
    ``self._evaluate_link`` accepts it, ``self.link_db`` does not already
    contain it, and its computed depth does not exceed ``self.max_url_depth``.
    When ``source_url`` is non-empty, ``self.link_db.points(source_url, link)``
    is called as well. A failure while storing one link is logged (and echoed
    to stdout) and does not stop the remaining links from being processed.

    Args:
        links: Sized iterable of raw links (``len()`` is taken for logging).
        priority: Value handed to ``add_link`` for every stored link.
        depth: Not read by this method while ``IgnoreDepthPolicy`` is the
            active depth policy; kept for the alternative policies.
        source_url: Passed to ``URLProcessor.validate`` with each link and,
            when truthy, to ``self.link_db.points``.

    Returns:
        None. Progress and errors are reported through ``self.logger``.
    """
    added = 0
    self.logger.debug('Trying to add %d links' % len(links))
    for link in links:
        _link = URLProcessor.validate(link, source_url)
        if not self._evaluate_link(_link) or self.link_db.is_in_base(_link):
            continue
        # Active depth policy. Alternatives kept for reference:
        #   SimpleCrawlingDepthPolicy.calculate_depth(link, source_url, depth)
        #   RealDepthCrawlingDepthPolicy.calculate_depth(link, self.link_db)
        _depth = IgnoreDepthPolicy.calculate_depth()
        if _depth <= self.max_url_depth:
            try:
                self.link_db.add_link(_link, priority, _depth)
                if source_url:
                    self.link_db.points(source_url, _link)
            except Exception as e:
                # str(e) instead of e.message: the attribute is deprecated
                # since Python 2.6 and missing on Python 3, where reading it
                # would raise inside this handler and abort the whole batch.
                message = "Add links error:" + str(_link) + "Message:" + str(e)
                self.logger.error(message)
                # Single-argument call form prints the same text on Python 2
                # and Python 3.
                print(message)
            else:
                self.logger.debug("Added:%s with priority %s" % (_link, priority))
                added += 1
    self.logger.debug("Added %d new links into DB." % added)
def calculate_depth(link=None, source_url=None, depth=None):
    """Return the depth of ``link`` relative to the page it was found on.

    Args:
        link: URL that was discovered.
        source_url: URL of the page on which ``link`` was found.
        depth: Depth of ``source_url``; anything ``int()`` accepts. It is only
            converted when the hosts are reported as identical.

    Returns:
        ``int(depth) + 1`` when ``URLProcessor.identical_hosts(link, source_url)``
        is truthy, otherwise 0.

    Raises:
        ValueError: If the host comparison or the depth conversion fails.
    """
    try:
        if URLProcessor.identical_hosts(link, source_url):
            return int(depth) + 1
        return 0
    except Exception:
        # Only ordinary errors are translated. A bare ``except`` would also
        # convert KeyboardInterrupt/SystemExit into ValueError and hide them.
        raise ValueError('Invalid parameters.')