Exemple #1
0
 def get_url_timestamp(self, url):
     """
     Look up the value recorded for <url> in the visit history.

     The per-site record is fetched from self._visited_sites using the
     key produced by get_netlock(url); the result is that record's
     visited_urls entry for <url> (the timestamp of the last visit).

     Returns -1 when there is no (truthy) record for the site or when
     the URL has no entry in that record.
     """
     site_record = self._visited_sites.get(get_netlock(url))
     return site_record.visited_urls.get(url, -1) if site_record else -1
Exemple #2
0
 def is_visited(self, url):
     """
     Check whether visit history contains <url>

     Returns the value stored for <url> in the site's visited_urls
     mapping (presumably the visit timestamp -- confirm against
     SiteVisitInfo), or None when either the site or the URL is unknown.
     Callers are expected to test the result for truthiness.
     """
     # dict.get() replaces the has_key() + index pair: has_key() no longer
     # exists on Python 3 dicts, and the sibling methods already rely on
     # get() / "in" for the same lookup.
     site_visit_info = self._visited_sites.get(get_netlock(url))
     if not site_visit_info:
         # Unknown site, or an empty/falsy record: nothing was visited.
         return None
     return site_visit_info.visited_urls.get(url)
Exemple #3
0
 def set_visited(self, url, weight = 0, timestamp = None ):
     """
     Record <url> in the page visit history.

     url       -- visited page URL; its get_netlock() value selects the
                  per-site record in self._visited_sites
     weight    -- weight forwarded to the site record (default 0)
     timestamp -- visit time; any falsy value (None, 0) is replaced by
                  the current time.time()

     A SiteVisitInfo record is created on the first visit to a site.
     """
     visit_time = timestamp or time.time()
     netloc = get_netlock(url)
     sites = self._visited_sites
     if netloc in sites:
         record = sites[netloc]
     else:
         # First URL seen for this site: register a fresh record.
         record = SiteVisitInfo()
         sites[netloc] = record
     record.submit_url(url, weight, visit_time)
Exemple #4
0
 def _max_avg_weight(self, link_queue):
     """
     Returns netloc of the site, which has the highest average weight.
     """
     weights = {}
     for w,l in link_queue._queue:
         netloc = get_netlock(l)
         sum_w, count = weights.get(netloc, (0, 0))
         weights[netloc] = sum_w + w, count + 1
     max_aw = None
     best = None
     t = None
     try:
         for t in weights.items():
             nl, (sum_w, count) = t
             if sum_w / count > max_aw:
                 max_aw = sum_w / count
                 best = nl
     except Exception:
         print 'fail'
     return best
Exemple #5
0
 def _main_proc(self):
     """
     Main procedure, one iteration.

     Main stage (self._stage_ttl >= 0):
       * ask the result filter for one result matching the current site
         (any result is accepted while no current site is set);
       * record the page as visited and distribute its filtered links
         between the current-site queue and the future-site queue;
       * hand links from the current-site queue to the page loader while
         it reports free slots.

     Stage switch (self._stage_ttl < 0):
       * call purge_tasks on the page loader, ranker and result filter;
       * pick the future site with the highest average weight, make it
         the current site and move its links into the current-site queue;
       * reset self._stage_ttl from self._site_crawl_quota.

     Always finishes by calling self._continue_work(1).
     """
     if self._stage_ttl >= 0:
         # --- main stage: fill the queues ---
         def matches_current_site(task):
             return get_netlock(task.user_data) == self._current_site or self._current_site == ''

         result = self._result_filter.get_result(matches_current_site)
         if result:
             page_url, page_weight, links = result
             if not self._current_site:
                 # No site selected yet: adopt the site of this page.
                 self._current_site = get_netlock(page_url)
             self._visit_history.set_visited(page_url, page_weight)
             for link, link_weight in links:
                 link_netloc = get_netlock(link)
                 if link_netloc == self._current_site:
                     if not self._visit_history.is_visited(link):
                         self._current_site_queue.put((link_weight, link), True)
                 else:
                     # Other site: skip the link if that site is already
                     # present in the visit history.
                     if link_netloc != self._current_site and self._visit_history.is_visited_site(link):
                         continue
                     self._future_site_queue.put((link_weight, link), True)

         # --- main stage: assign page loader tasks ---
         while self._page_loader.have_slots() and not self._current_site_queue.is_empty():
             self._page_loader.add_task(PageLoaderTask(self._current_site_queue.get()))

     if self._stage_ttl < 0:
         # --- switching stages ---
         def off_current_site(task):
             return get_netlock(task.user_data) != self._current_site

         # NOTE(review): this predicate compares user_data itself with the
         # current site, while the two below compare get_netlock(user_data);
         # confirm what the page loader tasks carry in user_data.
         self._page_loader.purge_tasks(lambda task: task.user_data != self._current_site)
         self._ranker.purge_tasks(off_current_site)
         self._result_filter.purge_tasks(off_current_site)

         next_site = self._max_avg_weight(self._future_site_queue)
         if next_site:
             self._current_site = next_site
             # The value returned by filter() becomes the new current-site
             # queue content.
             self._current_site_queue._queue = self._future_site_queue.filter(
                 lambda item: get_netlock(item[1]) != next_site)
         self._stage_ttl = self._site_crawl_quota

     self._continue_work(1)
Exemple #6
0
 def is_visited_site(self, url):
     """
     Check whether the site of <url> is present in the visit history.

     Returns the record stored in self._visited_sites under
     get_netlock(url), or None when the site has no record; intended to
     be tested for truthiness.
     """
     netloc = get_netlock(url)
     return self._visited_sites.get(netloc)