Example #1
import time
import urllib  # Python 2; the commented-out urllib.request calls below are the 3.x equivalent

from threadpool import ThreadPool  # project-local module (import path assumed)


class Downloader(object):

    def __init__(self, num_thread, status):
        self._num_threads = num_thread
        self._status = status
        self._download_workers = ThreadPool(num_thread)

    def queue_download_task(self, html_task, callback):
        """Assign a task (function, parameter, and callback) to the worker thread pool."""
        self._download_workers.queue_task(self.download_page, html_task, callback)


    def start(self):
        self._download_workers.start()

    def stop(self):
        self._download_workers.stop()

    def len(self):
        return self._download_workers.get_queue_count()

    def download_page(self, html_task, callback):
        # req = urllib.request.Request(html_task._url)        # Python 3.3
        # data = urllib.request.urlopen(req)                  # Python 3.3
        # html_task._data = data.read()  # .decode('utf-8')   # Python 3.3
        try:
            # Download the page.
            # socket.setdefaulttimeout(3)
            network_object = urllib.urlopen(html_task._url)
            html_task._data = network_object.read()
            html_task._return_code = network_object.getcode()  # read the code before close()
            network_object.close()

            # Pull the HTML data and fill the info into the html model.
            html_task._id = self._status.get_new_id()
            html_task._crawled_time = time.time()
            html_task._data_size = len(html_task._data)

            # Fill information into the status model.
            self._status._recent_url.add(html_task)
            self._status._download_times += 1
            self._status._download_size += html_task._data_size
        except Exception:
            # Log().debug("download_page failed")
            raise
        finally:
            callback(html_task)
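Both examples drive the same four-method ThreadPool interface: queue_task, start, stop, and get_queue_count. That class is project-local and not shown; the sketch below is one minimal way it could look on Python 2's Queue and threading modules. Everything beyond those four method names is an assumption, not the project's actual implementation.

import threading
import Queue  # Python 2 stdlib; named `queue` in Python 3

class ThreadPool(object):
    """Minimal sketch of the pool interface assumed above; internals are hypothetical."""

    def __init__(self, num_threads):
        self._tasks = Queue.Queue()
        self._threads = [threading.Thread(target=self._work) for _ in range(num_threads)]
        self._running = False

    def queue_task(self, func, arg, callback):
        # Store the triple exactly as Downloader/Parser pass it in.
        self._tasks.put((func, arg, callback))

    def start(self):
        self._running = True
        for t in self._threads:
            t.setDaemon(True)
            t.start()

    def stop(self):
        self._running = False

    def get_queue_count(self):
        return self._tasks.qsize()

    def _work(self):
        while self._running:
            try:
                func, arg, callback = self._tasks.get(timeout=1)
            except Queue.Empty:
                continue
            func(arg, callback)  # e.g. download_page(html_task, callback)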
Example #2
class Parser(object):

    def __init__(self, num_thread, status):
        self._num_threads = num_thread
        self._parse_workers = ThreadPool(num_thread)
        self._parsing_depth = 0
        self._parsing_id = 0
        self._status = status
        # Strategy handlers and configuration are project-local classes.
        self._cgihandler = CGIHandler()
        self._nestlevelhandler = NestLevelHandler()
        self._schemehandler = SchemeHandler()
        self._filetypehandler = FileTypeHandler()
        self._bookmarkhandler = BookMarkHandler()
        self._urlextender = URLExtender()
        self._omitindex = OmitIndex()
        self._config = Configuration()

    def queue_parse_task(self, html_task, callback):
        """Assign a task (function, parameter, and callback) to the worker thread pool."""
        self._parse_workers.queue_task(self.parse_page, html_task, callback)

    def start(self):
        self._parse_workers.start()

    def stop(self):
        self._parse_workers.stop()

    def len(self):
        return self._parse_workers.get_queue_count()
    
    def parse_page(self, html_task, callback):

        links = []
        fmt = formatter.NullFormatter()  # renamed from `format`, which shadows the builtin
        htmlparser = LinksExtractor(fmt)
        self._parsing_depth = html_task._depth
        self._parsing_id = html_task._id
        try:
            htmlparser.feed(html_task._data)
            htmlparser.close()
            links = htmlparser.get_links()
        except Exception as e:
            # Malformed HTML: skip this page.
            # Log().debug(e)
            return


        for link in links:
            html_task_child = Html(link)

            self._status._parse_times += 1

            # Run every filtering strategy to decide whether this link should be downloaded.
            if not self._schemehandler.SchemeChecker(html_task_child):
                # Ignore links with an unsupported scheme.
                self._status._scheme_type += 1
                continue
            if self._bookmarkhandler.BookMarkChecker(html_task_child):
                # Ignore in-page bookmark (fragment) links.
                self._status._bookmark += 1
                continue
            if self._cgihandler.FindCGI(html_task_child):
                # Ignore links that contain CGI parameters.
                self._status._cgi += 1
                continue
            if self._nestlevelhandler.checknestlevel(html_task_child, self._config._parser_nlv):
                # Ignore links nested too deeply.
                self._status._nestlv += 1
                continue
            if not self._filetypehandler.FileTypeChecker(html_task_child):
                # Ignore links whose file type should not be crawled.
                self._status._file_type += 1
                continue
            
            self._omitindex.Omit(html_task)

            # Relative link: extend it against the parent page's URL.
            if html_task_child._scheme == "" and html_task_child._hostname is None:
                self._urlextender.ExtendURL(html_task_child, html_task)

            html_task_child._depth = self._parsing_depth + 1
            html_task_child._parent = self._parsing_id

            # For testing.
            html_task_child._parent_url = html_task._url
            callback(html_task_child)

        del html_task
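parse_page depends on a LinksExtractor built on Python 2's htmllib and formatter modules (both removed in Python 3). Its definition is not part of this listing; given the calls above (construction with a NullFormatter, feed, close, get_links), a minimal version along the usual htmllib pattern could look like this sketch, where everything except those call names is assumed:

import htmllib

class LinksExtractor(htmllib.HTMLParser):
    """Collects href targets from <a> tags; a sketch, not the project's actual class."""

    def __init__(self, fmt):
        htmllib.HTMLParser.__init__(self, fmt)
        self._links = []

    def start_a(self, attrs):
        # attrs is a list of (name, value) pairs from the <a> tag.
        for name, value in attrs:
            if name == "href":
                self._links.append(value)

    def get_links(self):
        return self._links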
# The alternate Downloader below (urllib2-based, with a timeout and request
# headers; it needs `import socket` and `import urllib2`) is disabled by
# wrapping it in a string literal:
'''
class Downloader(object):
    def __init__(self, num_thread, status):
        self._num_threads = num_thread
        self._status = status
        self._download_workers = ThreadPool(num_thread)

    def queue_download_task(self, html_task, callback):
        """assign the tasks(function, parameter, and callback) to the workers(thread pool)"""
        self._download_workers.queue_task(self.download_page, html_task, callback)

    def start(self):
        self._download_workers.start()

    def stop(self):
        self._download_workers.stop()

    def len(self):
        return self._download_workers.get_queue_count()

    def download_page(self, html_task, callback):
        # req = urllib.request.Request(html_task._url)        # Python 3.3
        # data = urllib.request.urlopen(req)                  # Python 3.3
        # html_task._data = data.read()  # .decode('utf-8')   # Python 3.3
        try:
            timeout = 2
            socket.setdefaulttimeout(timeout)

            # Decode the URL and build the request.
            url = urllib2.unquote(html_task._url)
            req = urllib2.Request(url)

            # Set Referer and User-Agent headers so the request looks like a browser.
            req.add_header("Referer", "http://www.poly.edu/")
            req.add_header("User-agent", "Mozilla/5.0")

            network_object = urllib2.urlopen(req, None, timeout)
            html_task._data = network_object.read()
            html_task._return_code = network_object.getcode()  # read the code before close()
            network_object.close()

            # Pull the HTML data and fill the info into the html model.
            html_task._id = self._status.get_new_id()
            html_task._crawled_time = time.time()
            html_task._data_size = len(html_task._data)

            # Fill information into the status model.
            self._status._recent_url.add(html_task)
            self._status._download_times += 1
            self._status._download_size += html_task._data_size

            callback(html_task)
        except urllib2.HTTPError as e:
            # e.g. "url error: url=..., code=404, reason=Not Found"
            if e.code == 404:
                self._status._404 += 1
            return
        except socket.error:
            # Socket timed out.
            self._status._socket_timeout += 1
            return
        except Exception as e:
            # Log().debug("download_page failed")
            # print e
            return
'''
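For context, here is one way the two classes could be wired together into a crawl loop. The Html and Status classes (and the crawl entry point itself) are assumed from the attribute accesses above; this is a sketch of the callback cycle, not code from the project:

import time

def crawl(seed_url):
    status = Status()                  # assumed project class
    downloader = Downloader(4, status)
    parser = Parser(4, status)

    # Each downloaded page is handed to the parser; each link the
    # parser extracts is handed back to the downloader.
    def on_downloaded(html_task):
        parser.queue_parse_task(html_task, on_parsed)

    def on_parsed(html_task_child):
        downloader.queue_download_task(html_task_child, on_downloaded)

    downloader.queue_download_task(Html(seed_url), on_downloaded)
    downloader.start()
    parser.start()

    # Naive drain check: both queues empty. A momentarily empty queue
    # while a worker is mid-task would end this loop early.
    while downloader.len() or parser.len():
        time.sleep(1)

    downloader.stop()
    parser.stop()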