'''
Old urllib-based Downloader, kept for reference; superseded by the urllib2
version below (which adds a timeout, request headers, and error handling).

class Downloader(object):
    def __init__(self, num_thread, status):
        self._num_threads = num_thread
        self._status = status
        self._download_workers = ThreadPool(num_thread)

    def queue_download_task(self, html_task, callback):
        """assign the task (function, parameter, and callback) to the workers (thread pool)"""
        self._download_workers.queue_task(self.download_page, html_task, callback)

    def start(self):
        self._download_workers.start()

    def stop(self):
        self._download_workers.stop()

    def len(self):
        return self._download_workers.get_queue_count()

    def download_page(self, html_task, callback):
        try:
            # download the page (urllib, no timeout, no request headers)
            network_object = urllib.urlopen(html_task._url)
            html_task._data = network_object.read()

            # pull html data, fill the info into the html model
            html_task._id = self._status.get_new_id()
            html_task._crawled_time = time.time()
            html_task._return_code = network_object.getcode()
            html_task._data_size = len(html_task._data)
            network_object.close()

            # fill information into the status model
            self._status._recent_url.add(html_task)
            self._status._download_times += 1
            self._status._download_size += html_task._data_size
        except Exception as e:
            raise e
        finally:
            # note: the callback fires even when the download failed
            callback(html_task)
'''
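# The ThreadPool class used throughout this file is defined elsewhere in the
# project. The sketch below is a hypothetical minimal version, assuming only
# the interface the crawler relies on (queue_task / start / stop /
# get_queue_count) and the call shape func(arg, callback) used above.

import threading
import Queue


class ThreadPool(object):
    """Hypothetical minimal pool: N worker threads draining a task queue."""

    def __init__(self, num_threads):
        self._tasks = Queue.Queue()
        self._running = False
        self._workers = [threading.Thread(target=self._work)
                         for _ in range(num_threads)]
        for worker in self._workers:
            worker.daemon = True

    def queue_task(self, func, arg, callback):
        # matches the call sites above: the worker invokes func(arg, callback)
        self._tasks.put((func, arg, callback))

    def start(self):
        self._running = True
        for worker in self._workers:
            worker.start()

    def stop(self):
        self._running = False

    def get_queue_count(self):
        return self._tasks.qsize()

    def _work(self):
        while self._running:
            try:
                func, arg, callback = self._tasks.get(timeout=1)
            except Queue.Empty:
                continue
            func(arg, callback)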
class Parser(object):
    def __init__(self, num_thread, status):
        self._num_threads = num_thread
        self._parse_workers = ThreadPool(num_thread)
        self._parsing_depth = 0
        self._parsing_id = 0
        self._status = status
        self._cgihandler = CGIHandler()
        self._nestlevelhandler = NestLevelHandler()
        self._schemehandler = SchemeHandler()
        self._filetypehandler = FileTypeHandler()
        self._bookmarkhandler = BookMarkHandler()
        self._urlextender = URLExtender()
        self._omitindex = OmitIndex()
        self._config = Configuration()

    def queue_parse_task(self, html_task, callback):
        """assign the task (function, parameter, and callback) to the workers (thread pool)"""
        self._parse_workers.queue_task(self.parse_page, html_task, callback)

    def start(self):
        self._parse_workers.start()

    def stop(self):
        self._parse_workers.stop()

    def len(self):
        return self._parse_workers.get_queue_count()

    def parse_page(self, html_task, callback):
        links = []
        fmt = formatter.NullFormatter()
        htmlparser = LinksExtractor(fmt)
        self._parsing_depth = html_task._depth
        self._parsing_id = html_task._id
        try:
            htmlparser.feed(html_task._data)
            htmlparser.close()
            links = htmlparser.get_links()
        except Exception:
            # malformed page data: skip this page
            return

        for link in links:
            html_task_child = Html(link)
            self._status._parse_times += 1

            # run each filtering strategy to decide whether this link may be downloaded
            if not self._schemehandler.SchemeChecker(html_task_child):
                # ignore links with an unsupported scheme
                self._status._scheme_type += 1
                continue
            if self._bookmarkhandler.BookMarkChecker(html_task_child):
                # ignore bookmark (fragment-only) links
                self._status._bookmark += 1
                continue
            if self._cgihandler.FindCGI(html_task_child):
                # ignore links containing CGI parameters
                self._status._cgi += 1
                continue
            if self._nestlevelhandler.checknestlevel(html_task_child, self._config._parser_nlv):
                # ignore links nested too deeply
                self._status._nestlv += 1
                continue
            if not self._filetypehandler.FileTypeChecker(html_task_child):
                # ignore links to unsupported file types
                self._status._file_type += 1
                continue

            self._omitindex.Omit(html_task)
            if html_task_child._scheme == "" and html_task_child._hostname is None:
                # relative link: resolve it against the parent page's URL
                self._urlextender.ExtendURL(html_task_child, html_task)

            html_task_child._depth = self._parsing_depth + 1
            html_task_child._parent = self._parsing_id
            html_task_child._parent_url = html_task._url  # for testing
            callback(html_task_child)
        del html_task
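# LinksExtractor is defined in another module of this project. A plausible
# minimal sketch, assuming it is an htmllib.HTMLParser subclass (which matches
# the NullFormatter constructor argument used in parse_page above) that
# collects the href of every <a> tag:

import htmllib


class LinksExtractor(htmllib.HTMLParser):
    """Hypothetical sketch: collect the href attribute of every <a> tag."""

    def __init__(self, fmt):
        htmllib.HTMLParser.__init__(self, fmt)
        self._links = []

    def start_a(self, attrs):
        # called by htmllib for each opening <a> tag;
        # attrs is a list of (name, value) pairs
        for name, value in attrs:
            if name == "href":
                self._links.append(value)

    def get_links(self):
        return self._links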
class Downloader(object):
    def __init__(self, num_thread, status):
        self._num_threads = num_thread
        self._status = status
        self._download_workers = ThreadPool(num_thread)

    def queue_download_task(self, html_task, callback):
        """assign the task (function, parameter, and callback) to the workers (thread pool)"""
        self._download_workers.queue_task(self.download_page, html_task, callback)

    def start(self):
        self._download_workers.start()

    def stop(self):
        self._download_workers.stop()

    def len(self):
        return self._download_workers.get_queue_count()

    def download_page(self, html_task, callback):
        # Python 3.3 equivalent:
        #   req = urllib.request.Request(html_task._url)
        #   html_task._data = urllib.request.urlopen(req).read()
        try:
            timeout = 2
            socket.setdefaulttimeout(timeout)

            # decode the url, then set the Referer and User-Agent headers
            url = urllib2.unquote(html_task._url)
            req = urllib2.Request(url)
            req.add_header("Referer", "http://www.poly.edu/")
            req.add_header("User-agent", "Mozilla/5.0")

            # download the page
            network_object = urllib2.urlopen(req, None, timeout)
            html_task._data = network_object.read()

            # pull html data, fill the info into the html model
            html_task._id = self._status.get_new_id()
            html_task._crawled_time = time.time()
            html_task._return_code = network_object.getcode()
            html_task._data_size = len(html_task._data)
            network_object.close()

            # fill information into the status model
            self._status._recent_url.add(html_task)
            self._status._download_times += 1
            self._status._download_size += html_task._data_size
            callback(html_task)
        # HTTPError must be caught before any urllib2.URLError handler,
        # since HTTPError is a subclass of URLError
        except urllib2.HTTPError as e:
            # count 404s; other HTTP errors are silently dropped
            if e.code == 404:
                self._status._404 += 1
            return
        except socket.error:
            self._status._socket_timeout += 1
            return
        except Exception:
            return
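# A sketch of how Downloader and Parser are presumably wired together: each
# downloaded page is handed to the parser pool, and every child link the
# parser accepts is queued for download again. The Status class and the Html
# task constructor are assumptions inferred from the constructors above.

if __name__ == "__main__":
    status = Status()                    # assumed status-model class
    downloader = Downloader(4, status)
    parser = Parser(2, status)

    def on_downloaded(html_task):
        # download finished: hand the raw HTML to the parser pool
        parser.queue_parse_task(html_task, on_parsed)

    def on_parsed(html_task_child):
        # parser accepted a child link: queue it for download
        downloader.queue_download_task(html_task_child, on_downloaded)

    seed = Html("http://www.poly.edu/")  # assumed Html task constructor
    seed._depth = 0
    downloader.queue_download_task(seed, on_downloaded)
    downloader.start()
    parser.start()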