Beispiel #1
0
 def calculate_depth(link=None, link_db=None):
     """Return the crawl depth of ``link`` derived from URLs already stored.

     Each URL yielded by ``URLProcessor.generate_url_hierarchy(link)`` that
     ``link_db`` already contains contributes its stored value plus one; the
     largest contribution is returned. If none of those URLs is known, the
     depth is 0.

     Args:
         link: URL whose depth should be computed.
         link_db: Link store exposing ``is_in_base(url)`` and
             ``get_details(url)``.

     Returns:
         int: The computed depth, never less than 0.

     Raises:
         ValueError: If any step of the computation fails (for example an
             unusable ``link`` or ``link_db``).
     """
     try:
         hierarchy = URLProcessor.generate_url_hierarchy(link)
         # Index 2 of the details record is read as the stored depth
         # (presumably -- confirm against the link_db schema).
         candidates = [
             int(link_db.get_details(url)[2]) + 1
             for url in hierarchy
             if link_db.is_in_base(url)
         ]
     except Exception:
         # Not a bare ``except``: KeyboardInterrupt and SystemExit must keep
         # propagating instead of being reported as bad parameters.
         raise ValueError('Invalid parameters.')
     # Seeding with 0 keeps the result at 0 when nothing in the hierarchy is
     # known and guarantees it never drops below 0.
     return max([0] + candidates)
Beispiel #2
0
 def add_links(self, links, priority, depth=0, source_url=""):
     """Validate ``links`` and store the new, acceptable ones in the link DB.

     Each link is normalised with ``URLProcessor.validate(link, source_url)``.
     It is stored only if ``self._evaluate_link`` accepts it, it is not yet
     in ``self.link_db``, and its policy depth does not exceed
     ``self.max_url_depth``.

     Args:
         links: Sized collection of raw links (``len(links)`` is taken).
         priority: Priority stored with every added link.
         depth: Depth of the page the links came from. Not used by the
             active ``IgnoreDepthPolicy``; only the disabled
             ``SimpleCrawlingDepthPolicy`` would consume it.
         source_url: URL of the page the links were found on; passed to
             ``URLProcessor.validate``.

     Returns:
         None. The number of added links is only reported via debug logging.
     """
     added = 0
     self.logger.debug('Trying to add %d links' % len(links))
     for link in links:
         _link = URLProcessor.validate(link, source_url)
         # Skip rejected links first; the DB lookup only runs for accepted ones.
         if not self._evaluate_link(_link) or self.link_db.is_in_base(_link):
             continue
         # Depth policy is swappable. Disabled alternatives:
         #   SimpleCrawlingDepthPolicy.calculate_depth(link, source_url, depth)
         #   RealDepthCrawlingDepthPolicy.calculate_depth(link, self.link_db)
         _depth = IgnoreDepthPolicy.calculate_depth()
         if _depth <= self.max_url_depth:
             self.link_db.add_link(_link, priority, _depth)
             # Logged after the insert, and reporting the priority itself:
             # the message used to print the depth under the "priority" label.
             self.logger.debug("Added:%s with priority %s" % (_link, priority))
             added += 1
     self.logger.debug("Added %d new links into DB." % added)
Beispiel #3
0
 def add_links(self, links, priority, depth=0, source_url=""):
     """Validate ``links``, store the new ones and record who links to them.

     Each link is normalised with ``URLProcessor.validate(link, source_url)``.
     It is stored only if ``self._evaluate_link`` accepts it, it is not yet
     in ``self.link_db``, and its policy depth does not exceed
     ``self.max_url_depth``. When ``source_url`` is non-empty,
     ``self.link_db.points(source_url, _link)`` is called as well. A failure
     while storing one link is logged and printed, then processing continues
     with the next link.

     Args:
         links: Sized collection of raw links (``len(links)`` is taken).
         priority: Priority stored with every added link.
         depth: Depth of the page the links came from. Not used by the
             active ``IgnoreDepthPolicy``; only the disabled
             ``SimpleCrawlingDepthPolicy`` would consume it.
         source_url: URL of the page the links were found on.

     Returns:
         None. The number of added links is only reported via debug logging.
     """
     added = 0
     self.logger.debug('Trying to add %d links' % len(links))
     for link in links:
         _link = URLProcessor.validate(link, source_url)
         # Skip rejected links first; the DB lookup only runs for accepted ones.
         if not self._evaluate_link(_link) or self.link_db.is_in_base(_link):
             continue
         # Depth policy is swappable. Disabled alternatives:
         #   SimpleCrawlingDepthPolicy.calculate_depth(link, source_url, depth)
         #   RealDepthCrawlingDepthPolicy.calculate_depth(link, self.link_db)
         _depth = IgnoreDepthPolicy.calculate_depth()
         if _depth <= self.max_url_depth:
             try:
                 self.link_db.add_link(_link, priority, _depth)
                 if source_url:
                     self.link_db.points(source_url, _link)
             except Exception as e:
                 # str(e) instead of e.message: the attribute is deprecated
                 # (PEP 352), empty for multi-argument exceptions and missing
                 # on Python 3, where it would raise inside this handler.
                 error_text = "Add links error:" + str(_link) + "Message:" + str(e)
                 self.logger.error(error_text)
                 # Same text is mirrored to stdout; a parenthesised single
                 # argument prints identically under the print statement.
                 print(error_text)
             else:
                 # Only count links whose insert (and back-reference) succeeded.
                 self.logger.debug("Added:%s with priority %s" % (_link, priority))
                 added += 1
     self.logger.debug("Added %d new links into DB." % added)
Beispiel #4
0
 def calculate_depth(link=None, source_url=None, depth=None):
     """Return the depth of ``link`` relative to the page it was found on.

     Args:
         link: URL whose depth should be computed.
         source_url: URL of the page on which ``link`` was found.
         depth: Depth of ``source_url``; anything ``int()`` accepts.

     Returns:
         int: ``int(depth) + 1`` when
         ``URLProcessor.identical_hosts(link, source_url)`` is truthy,
         otherwise 0.

     Raises:
         ValueError: If the host comparison or the ``int(depth)`` conversion
             fails.
     """
     try:
         # ``depth`` is only converted when the hosts match, so an unusable
         # depth is tolerated for links whose host check is falsy.
         if URLProcessor.identical_hosts(link, source_url):
             return int(depth) + 1
         return 0
     except Exception:
         # Not a bare ``except``: KeyboardInterrupt and SystemExit must keep
         # propagating instead of being reported as bad parameters.
         raise ValueError('Invalid parameters.')