Example #1
# Python 2 code. `parse_robot` (a robots.txt parser module) and
# `check_response` (a helper that maps HTTP status codes to messages)
# are project-local and assumed to be importable.
from urlparse import urlparse, urljoin
from urllib2 import Request, urlopen, URLError, HTTPError
from BeautifulSoup import BeautifulSoup as soup  # BeautifulSoup 3-style API

import parse_robot


def get_page(url):
    try:
        parse_dict = urlparse(url)
        if parse_dict.scheme:
            # Build the robots.txt URL for the requested host.
            base_url = parse_dict.scheme + '://' + parse_dict.netloc
            robot_url = urljoin(base_url, '/robots.txt')

            # Check robots.txt before fetching the page.
            parse_robot.user_agent = 'jobcrawler 1.1'
            robot_parse = parse_robot.RobotFileParserLookalike()
            robot_parse.set_url(robot_url)
            robot_parse.read()

            if not robot_parse.can_fetch('jobcrawler 1.1', url):
                print "This seed page cannot be crawled based on robots.txt"
                return soup(''), ''
            else:
                try:
                    # Fetch the page with an explicit User-Agent header.
                    useragent = 'jobcrawler 1.1'
                    request = Request(url)
                    request.add_header('User-Agent', useragent)
                    response = urlopen(request)

                    # Only parse HTML responses; skip everything else.
                    if response.info().type not in ['text/html']:
                        return soup(''), ''
                    the_page = response.read()
                    return soup(the_page), url
                # HTTPError is a subclass of URLError, so it must be caught first.
                except HTTPError as _400_to_500:
                    print "The server could not fulfill the request"
                    print 'Error code:', check_response(_400_to_500.code)
                except URLError as connection_error:
                    print "Failed to reach server"
                    print 'Reason:', connection_error.reason

        # Missing scheme or a handled fetch error: return an empty result.
        return soup(''), ''

    except URLError:
        print 'FAILED TO REACH SERVER::' + url
        return soup(''), ''
    except Exception:
        print 'Check url again: ' + url
        return soup(''), ''


# We will write separate modules that fetch URLs and insert them into the models.

#print get_page('http://www.facebook.com/recover/initiate')      
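
A minimal caller sketch for this first version, assuming the imports above are available; the seed URL and the link-extraction step are illustrative and not part of the original crawler:

def crawl_seed(seed_url):
    # get_page() returns (empty soup, '') when the URL is blocked or invalid.
    page, fetched_url = get_page(seed_url)
    if not fetched_url:
        return []
    # Collect outgoing links from the fetched page (BeautifulSoup 3 API).
    return [a['href'] for a in page.findAll('a', href=True)]

# print crawl_seed('http://example.com/')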
            
Example #2
# Python 2 code. `parse_robot`, `check_response` and the pymongo database
# handle `db` are project-local objects assumed to exist at import time.
import socket
import sys
import traceback
from urlparse import urlparse, urljoin
from urllib import quote
from urllib2 import Request, urlopen, URLError, HTTPError
from httplib import BadStatusLine
from bs4 import BeautifulSoup as soup
from bson.objectid import ObjectId

import parse_robot


def get_page(url):
    try:
        parse_dict = urlparse(url)
        if parse_dict.scheme:
            # Build the robots.txt URL for the requested host.
            base_url = parse_dict.scheme + '://' + parse_dict.netloc
            robot_url = urljoin(base_url, '/robots.txt')

            # Check robots.txt before fetching the page.
            parse_robot.user_agent = 'jooble 1.1(http://about.me/jooble)'
            robot_parse = parse_robot.RobotFileParserLookalike()
            robot_parse.set_url(robot_url)
            robot_parse.read()

            if not robot_parse.can_fetch('jooble1.1 http://about.me/jooble', url):
                print "This seed page cannot be crawled based on robots.txt"
                # Count URLs that robots.txt excluded from the crawl.
                db.crawler_web_statistic.update({"_id": ObjectId("517dc20440ade61b20becb7d")},
                                                {"$inc": {"Number_of_excluded_urls": 1}}, safe=True)
                return soup('', 'lxml'), ''
            else:
                # Count URLs that robots.txt allowed us to crawl.
                db.crawler_web_statistic.update({"_id": ObjectId("517dc20440ade61b20becb7d")},
                                                {"$inc": {"Number_of_robotstxt_request": 1}}, safe=True)

                # Fetch the page with an explicit User-Agent header.
                useragent = 'jobcrawler 1.1 http://about.me/jooble'
                request = Request(url)
                request.add_header('User-Agent', useragent)

                # quote(url, safe="%/:=&?~#+!$,;'@()*[]") does not work reliably,
                # so it is only used as a fallback when the raw URL fails.
                try:
                    response = urlopen(request)
                except URLError:
                    response = urlopen(quote(url, safe="%/:=&?~#+!$,;'@()*[]"))

                # Count every HTTP request that was made.
                db.crawler_web_statistic.update({"_id": ObjectId("517dc20440ade61b20becb7d")},
                                                {"$inc": {"Number_of_Http_Request": 1}}, safe=True)

                # Only parse HTML responses; skip everything else.
                if response.info().type not in ['text/html']:
                    return soup('', 'lxml'), ''
                the_page = response.read()

                # Count successful fetches.
                db.crawler_http_status_errors.update({"_id": ObjectId("5180bfa440ade62017d1120c")},
                                                     {"$inc": {"ok_200": 1}}, safe=True)
                print 'ok'
                return soup(the_page, 'lxml'), url

        # Missing scheme: return an empty result.
        return soup('', 'lxml'), ''

    # HTTPError is a subclass of URLError, so it must be caught first.
    except HTTPError as _400_to_500:
        print "The server could not fulfill the request"
        print 'Error code:', check_response(_400_to_500.code), _400_to_500.reason
        db.crawler_http_status_errors.update({"_id": ObjectId("5180bfa440ade62017d1120c")},
                                             {"$inc": {"from_400_500": 1}}, safe=True)
        db.crawler_error_log.insert({'error_type': str(sys.exc_info()), 'from_module': str(__file__)})
        return soup('', 'lxml'), ''

    except URLError:
        print 'FAILED TO REACH SERVER::' + url
        db.crawler_http_status_errors.update({"_id": ObjectId("5180bfa440ade62017d1120c")},
                                             {"$inc": {"Url_errors": 1}}, safe=True)
        db.crawler_error_log.insert({'error_type': str(sys.exc_info()), 'from_module': str(__file__)})
        return soup('', 'lxml'), ''

    except BadStatusLine:
        print "BadStatusLine...................Status code is unknown"
        db.crawler_error_log.insert({'error_type': str(sys.exc_info()), 'from_module': str(__file__)})
        db.crawler_http_status_errors.update({"_id": ObjectId("5180bfa440ade62017d1120c")},
                                             {"$inc": {"Bad_status_line": 1}}, safe=True)
        return soup('', 'lxml'), ''

    except socket.timeout:
        print "SocketTimeout...................Fail"
        db.crawler_http_status_errors.update({"_id": ObjectId("5180bfa440ade62017d1120c")},
                                             {"$inc": {"Socket_time_out": 1}}, safe=True)
        return soup('', 'lxml'), ''

    except Exception:
        print 'Check url again: ' + url
        traceback.print_exc()
        db.crawler_http_status_errors.update({"_id": ObjectId("5180bfa440ade62017d1120c")},
                                             {"$inc": {"Ungrouped": 1}}, safe=True)
        db.crawler_error_log.insert({'error_type': str(sys.exc_info()), 'from_module': str(__file__)})
        return soup('', 'lxml'), ''
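
This second version assumes a module-level pymongo handle `db` and two pre-seeded statistics documents whose ObjectIds are hard-coded above. A minimal setup sketch, assuming pymongo 2.x (matching the safe=True / insert() calls) and a local MongoDB; the database name 'jobcrawler' and the zeroed counter fields are illustrative:

from pymongo import MongoClient
from bson.objectid import ObjectId

db = MongoClient()['jobcrawler']  # assumed module-level handle used as `db` above

# Seed the counter documents once so the $inc updates have a target.
db.crawler_web_statistic.insert({
    '_id': ObjectId('517dc20440ade61b20becb7d'),
    'Number_of_excluded_urls': 0,
    'Number_of_robotstxt_request': 0,
    'Number_of_Http_Request': 0,
})
db.crawler_http_status_errors.insert({
    '_id': ObjectId('5180bfa440ade62017d1120c'),
    'ok_200': 0,
    'Url_errors': 0,
    'from_400_500': 0,
    'Bad_status_line': 0,
    'Socket_time_out': 0,
    'Ungrouped': 0,
})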