Example #1
class RobotsMiddleware(BaseMiddleware):

    def __init__(self, *args, **kwargs):
        self.cache = RobotsCache(*args, **kwargs)
        self.visited = collections.defaultdict(dict)

    def check_disallow(self, url, agent):
        if not self.cache.allowed(url, agent):
            raise RobotsDisallowedError

    def check_crawl_delay(self, url, agent):
        delay = self.cache.delay(url, agent)
        if delay is None:
            return
        now = datetime.datetime.utcnow()
        host = urlparse.urlparse(url).hostname
        try:
            last_visit = self.visited[agent][host]
            if (now - last_visit).seconds < delay:
                raise RobotsThrottledError
        except KeyError:
            pass
        self.visited[agent][host] = now

    def before_send(self, request, *args, **kwargs):
        url = request.url
        agent = request.headers.get('User-Agent')
        self.check_disallow(url, agent)
        self.check_crawl_delay(url, agent)
Example #2
	def download_pages_in_queue(self, queue):		
		current_page_url = queue.get()
		
		robot = RobotsCache()
		if (robot.allowed(current_page_url, "*")):

			print current_page_url
			if len(current_page_url) < 10: return	
			current_page_html = download_page_by_url(current_page_url)			
			bs = BeautifulSoup(current_page_html, "html.parser")

			links = bs.find_all('a', href=True)
			post_links = [link['href'] for link in links]
			
			for post_link in post_links:
				if len(post_link) < 10: continue
				if str(post_link).find('http') != 0:
					post_link = str(self.start_url) + str(post_link)
				queue.put(post_link)
			self.sites_num = self.sites_num + 1		

			page = Pages(url = current_page_url, parsed_text = get_text_from_html(current_page_html), is_indexed = False)
			page.save()
		else:
			print "Page can't be indexed because of the rules in ROBOTS.TXT"	
Example #3
    def __FetchRobotFileInfo__(self, url, robotDictForDomains, timeStamp):
        domainName = self.__GetComSubdomainOfUrl__(url)
        robotUrl = ""

        if robotDictForDomains.has_key(domainName) == False:
            robotUrl = self.__GetRobotUrlForUrl__(domainName)
            cache = RobotsCache()
            try:
                timeStamp[domainName] = datetime.datetime.now()
                robotFileObj = cache.fetch(robotUrl)
                doesUrlExistOnline = self.__DoesUrlExistOnline__(robotUrl)
            except:
                doesUrlExistOnline = False
                robotDictForDomains[domainName] = (doesUrlExistOnline, object)

            if doesUrlExistOnline == True:
                robotDictForDomains[domainName] = (doesUrlExistOnline,
                                                   robotFileObj)
            else:
                robotDictForDomains[domainName] = (doesUrlExistOnline, object)

        doesUrlExistOnline = robotDictForDomains[domainName][0]
        robotFileObj = robotDictForDomains[domainName][1]
        # print "heyyy",robotUrl, doesUrlExistOnline, robotFileObj, robotDictForDomains
        return doesUrlExistOnline, robotFileObj, robotDictForDomains, timeStamp, domainName
Example #4
    def download_pages_in_queue(self, queue):
        current_page_url = queue.get()

        robot = RobotsCache()
        if (robot.allowed(current_page_url, "*")):

            print current_page_url
            if len(current_page_url) < 10: return
            current_page_html = download_page_by_url(current_page_url)
            bs = BeautifulSoup(current_page_html, "html.parser")

            links = bs.find_all('a', href=True)
            post_links = [link['href'] for link in links]

            for post_link in post_links:
                if len(post_link) < 10: continue
                if str(post_link).find('http') != 0:
                    post_link = str(self.start_url) + str(post_link)
                queue.put(post_link)
            self.sites_num = self.sites_num + 1

            page = Pages(url=current_page_url,
                         parsed_text=get_text_from_html(current_page_html),
                         is_indexed=False)
            page.save()
        else:
            print "Page can't be indexed because of the rules in ROBOTS.TXT"
Example #5
    def __init__(self, url, config={}, proxies={},
                 auth=None, ua=DEFAULT_HODOR_UA,
                 pagination_max_limit=DEFAULT_HODOR_MAX_PAGES,
                 crawl_delay=DEFAULT_CRAWL_DELAY,
                 ssl_verify=False,
                 trim_values=True,
                 robots=True,
                 reppy_capacity=100):

        self.content = None
        self.url = url
        self.domain = self._get_domain()
        self.proxies = proxies
        self.auth = auth
        self.ua = ua
        self.trim_values = trim_values
        self.ssl_verify = ssl_verify
        self.config = {}
        self.extra_config = {}

        self.robots = RobotsCache(capacity=reppy_capacity) if robots else None

        self._pages = []
        self._page_count = 0
        self._pagination_max_limit = pagination_max_limit
        self.crawl_delay = self._crawl_delay(crawl_delay)

        for k, v in config.items():
            if k.startswith("_"):
                self.extra_config[k.lstrip("_")] = v
            else:
                self.config[k] = v
Example #6
 def __init__(self, store, *args, **kwargs):
     RobotsCache.__init__(self, *args, **kwargs)
     self._store = store
     self._cache = NoSQLDict(dbtype=self._store["engine"],
                             param={'host': self._store['host'],
                                    'port': self._store['port'],
                                    'db': self._store['db']['robot']})
Example #7
    def __init__(self,
                 file,
                 ua,
                 check=True,
                 output="output.csv"
                 ):  #setting output to false disables file output
        if check:  #only setup robot checker if robot checking is enabled
            self.ua = ua  #user agent
            self.robo = RobotsCache(capacity=100)

        #check var disables or enables robots.txt checking
        #recommended to keep default True value
        self.check = check
        self.req = requests  #request obj for parsing url

        self.output = output  #where to output file

        self.data = []  #init array of grabbed sites

        self.configarr = []  #empty array of all configs

        if type(file) is list:
            self.configarr = file
        else:
            self.configarr.append(file)
Example #8
 def testRobot3(self):
     robots = RobotsCache()
     rules = robots.fetch("http://www.realwire.com/")
     crawl_delay = rules.delay("idiot")
     print("delay is:", crawl_delay)
     for i in range(1, 1000):
         print(rules.allowed("http://api.google.com/search/", agent="idiot"))
Example #9
 def testRobot3(self):
     robots = RobotsCache()
     rules = robots.fetch("http://www.realwire.com/")
     crawl_delay = rules.delay("idiot")
     print("delay is:", crawl_delay)
     for i in range(1, 1000):
         print(rules.allowed("http://api.google.com/search/",
                             agent="idiot"))
Example #10
 def __init__(self, store, *args, **kwargs):
     RobotsCache.__init__(self, *args, **kwargs)
     self._store = store
     self._cache = NoSQLDict(dbtype=self._store["engine"],
                             param={
                                 'host': self._store['host'],
                                 'port': self._store['port'],
                                 'db': self._store['db']['robot']
                             })
Example #11
 def robot_pass(self,page):
     """
     Accepts page [object]
     Creates instance of RobotsCache (from reppy)
     Passes URL of page as string into robots.allowed method
     Returns True or False
     """
     robots = RobotsCache()
     return robots.allowed(page.get_url(), '*')
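A minimal usage sketch of the pattern the docstring above describes; the SimplePage class and its get_url() accessor are a hypothetical stand-in for the example's page object, not part of reppy:

from reppy.cache import RobotsCache

class SimplePage:
    """Hypothetical page object exposing get_url(), as robot_pass() expects."""
    def __init__(self, url):
        self._url = url

    def get_url(self):
        return self._url

robots = RobotsCache()
page = SimplePage("https://example.com/some/post")
print(robots.allowed(page.get_url(), '*'))  # True if robots.txt lets agent '*' fetch it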
Example #12
def get_scanner_mock(request_limit):
    robots_cache = RobotsCache()
    robots_cache.fetch = MagicMock(return_value=robots_cache)
    robots_cache.allowed = MagicMock(return_value=True)
    robots_validator = RobotsValidator(agent='*')
    robots_validator.robots = robots_cache
    scanner = UrlScanner(request_limit)
    scanner.url_fetcher = get_url_fetcher_mock(request_limit)
    scanner.robots_validator = robots_validator
    return scanner
Example #13
	def robot_rules(_url_scheme, _url_netloc):  # return a robots.txt rules object
		_domain = urlunparse((_url_scheme, _url_netloc, '', '', '', ''))
		robots = RobotsCache()
		try:
			rules = robots.fetch(_domain, timeout=5)
		except Exception as exc:
			print('Failed to fetch robots.txt {},{}'.format(_url_scheme, _url_netloc))
			print(exc)
			return None
		return rules
Example #14
	def check_for_robot_access(self, page):
		self.f.write('--- checking for robots %s\n' %page)
		robots = RobotsCache()
		try:
			if robots.allowed(page+'robots.txt', 'my-agent'):
				print 'robots allowed'
				self.f.write('robots allowed. \n')
				return True
		except ServerError, r:
			print 'error ', r
			return False
Example #15
 def check_for_robot_access(self, page):
     self.f.write('--- checking for robots %s\n' % page)
     robots = RobotsCache()
     try:
         if robots.allowed(page + 'robots.txt', 'my-agent'):
             print 'robots allowed'
             self.f.write('robots allowed. \n')
             return True
     except ServerError, r:
         print 'error ', r
         return False
Example #16
 def robot_rules(_url_scheme, _url_netloc):  # return a robots.txt rules object
     _domain = urlunparse((_url_scheme, _url_netloc, '', '', '', ''))
     robots = RobotsCache()
     try:
         rules = robots.fetch(_domain, timeout=5)
     except Exception as exc:
         print('Failed to fetch robots.txt {},{}'.format(
             _url_scheme, _url_netloc))
         print(exc)
         return None
     return rules
Example #17
 def get_text_by_base_url(self):
     robots = RobotsCache(capacity=100)
     if not robots.allowed(self.base_url, "python-requests"):
         return ["Crawling this site is not allowed by robots.txt"]
     text_list = []
     for slug in self.__get_links_by_url_depth():
         sleep(0.5)
         text_list.append(
             remove_emoji(
                 remove_url(self.__get_text_by_url(self.base_url +
                                                   slug))).strip())
     return text_list
Example #18
def confirm_robots_txt(target_url, max_capacity):
    '''confirm that target url is allowed to crawl

    :type target_url: str
    :param target_url: URL the agent wants to crawl
    :type max_capacity: int
    :param max_capacity: capacity of the robots.txt cache
    :rtype: bool
    :return: whether it is possible to scrape
    '''
    robots = RobotsCache(max_capacity)
    return robots.allowed(target_url, 'python program')
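A short usage sketch of confirm_robots_txt() above; the URL is illustrative and the function's module-level imports are assumed to be in place:

if confirm_robots_txt('https://example.com/articles/', max_capacity=100):
    print('allowed to crawl this page')
else:
    print('crawling is disallowed by robots.txt')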
Example #19
    def run(self):
        global terminator
        pattern='(http://)(\w*\.)+\w+(/\w*)*'
        #Initialize RobotsCache object
        robots=RobotsCache()
        while 1:
            if terminator:
                break
            cur_raw_tweet=raw_tweets.get(True)
            curtweet=json.loads(cur_raw_tweet)
            if DEBUG:
                print "Got an item from raw_tweets", current_thread().getName()

            # Check if twitter has rate limited you by sending a blank tweet
            if u'text' in curtweet.keys():
                text=curtweet[u'text']
            else:
                print "Rate limited by twitter. Continuing"
                continue

            #Get text and check if it has links using regex.
            link=re.search(pattern,text)
            if link:
                if DEBUG:
                    print "match"
                flink=link.group()

                #Check if crawling is allowed
                try:
                    if robots.allowed(flink,'tweetbot'):
                        soup=BeautifulSoup(urllib2.urlopen(flink),"lxml")

                        #Check if page has title
                        if soup.title:
                            curtweet[u'linkTitle']=soup.title.string
                except reppy.ReppyException:
                    print "Error fetching robots.txt. Continuing"
                    continue
                except urllib2.URLError:
                    print "Bad Url. Report to the developer. Continuing"
                    continue
                except urllib2.HTTPError:
                    print "Error Fetching Web Page. Continuing"
                    continue

            else:
                if DEBUG:
                    print "not match"

            processed_tweets.put(json.dumps(curtweet),True)
            if DEBUG:
                print "Put on processed queue. ProcessedSize", processed_tweets.qsize()
Example #20
    def __init__(self, file, ua, check=True, output="output.csv"):
        if check:  #only setup robot checker if robot checking is enabled
            self.ua = ua  #user agent
            self.robo = RobotsCache(capacity=0)

        #check disables or enables robots.txt checking
        #recommended to keep default True value
        self.check = check
        self.req = requests

        if os.path.exists(file):
            with open(file) as f:
                self.config = json.load(f)  #opens and parses json file
Example #21
    def __init__(self, robots_url=None):
        if robots_url:
            robots = RobotsCache()
            self._rules = robots.fetch(robots_url)
            self.is_use_robots = True
        else:
            self.is_use_robots = False

        self._url_norm = UrlNorm()
        self.counter = 0
        self.urls = dict()
        self.connections = defaultdict(set)
        self._lock = RLock()
Example #22
    def __init__(self, robots_url=None):
        if robots_url:
            robots = RobotsCache()
            self._rules = robots.fetch(robots_url)
            self.is_use_robots = True
        else:
            self.is_use_robots = False

        self._url_norm = UrlNorm()
        self.counter = 0
        self.urls = dict()
        self.connections = defaultdict(set)
        self._lock = RLock()
Example #23
	def robot_rules(_url_scheme, _url_netloc):  # return a robots.txt rules object
		#_parsed_url = urlparse(_url)
		_domain = urlunparse((_url_scheme, _url_netloc, '', '', '', ''))
		robots = RobotsCache()
		try:
			#print('DOMAIN: {}'.format(_domain))
			rules = robots.fetch(_domain)
		except Exception as exc:
			print('Failed to fetch robots.txt')
			print(_url_scheme, _url_netloc)
			print(exc)
			return None
		return rules
Example #24
    def get_robot_agent(root_domain: str, protocol="http") -> Rules:
        if root_domain.startswith("http"):
            root_domain = LinkChecker.get_root_domain(root_domain)[4]
        versions = ["http://", "https://", "http://www.", "https://www."]
        suffix = "/robots.txt"
        current = ""
        found = False
        for version in versions:
            temp_link = version + root_domain + suffix
            try:
                status_code, content_type = LinkChecker.get_response(temp_link)
                if status_code == ResponseCode.LinkOK:
                    current = temp_link
                    found = True
                    break
                else:
                    raise ConnectionError
            except:
                pass
        if found:
            try:
                robots = RobotsCache()
                req = robots.session.get(current)
                ttl = max(robots.min_ttl,
                          Utility.get_ttl(req.headers, robots.default_ttl))
                # And now parse the thing and return it
                return parser.Rules(current, req.status_code, req.content,
                                    time.time() + ttl)

                # rules = robots.fetch(current)
                # return rules
            except:
                return None
        else:
            return None
Example #25
def get_all_links(domain, path, maxSize):
    #response = requests.get(domain+path, headers={'User-Agent': 'Mozilla/5.0'})
    driver = webdriver.PhantomJS()
    driver.get(domain + path)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    links = []
    rp = RobotsCache(10000)
    for div in soup.findAll('div'):
        for link in div.findAll('a', href=True):
            #print(link.get('href'))
            if (rrobots(domain, link.get('href'), rp)):
                regex = re.compile(
                    r'^(?:http|ftp)s?://'  # http:// or https://
                    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  #domain...
                    r'localhost|'  #localhost...
                    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
                    r'(?::\d+)?'  # optional port
                    r'(?:/?|[/?]\S+)$',
                    re.IGNORECASE)
                if re.match(regex, domain + link.get('href')) is not None:
                    if (len(link.get('href')) > 0):
                        if ((link.get('href')[0] >= 'a'
                             and link.get('href')[0] <= 'z')
                                or (link.get('href')[0] >= '1'
                                    and link.get('href')[0] <= '9')):
                            links.append('/' + link.get('href'))
                        else:
                            links.append(link.get('href'))
    return links
Example #26
 def __init__(self):
     self.DOMAIN = Settings.START_LINK.split('/')[2]
     self.ROBOTS_LINK = Settings.START_LINK + 'robots.txt'
     self.COUNT_URLS = 200000
     self.THREADS = 4
     self.RAND_NUM = 5
     self.TIMEOUT = 30
     self.AGENT = ''
     self.DATABASE = 'mongodb://*****:*****@'  # connection string masked in the source
     # NOTE: the assignments that originally followed the masked connection
     # string (including self.DB_NAME and the name of the XPath-selector dict
     # opened below) are missing from this listing.
     self.XPATH = {  # hypothetical attribute name; the original was lost with the mask
         u'description': u'//meta[@name="description"]/@content',
         u'keywords': u'//meta[@name="keywords"]/@content',
         u'robots': u'//meta[@name="robots"]/@content',
         u'canonical': u'//link[@rel="canonical"]/@href',
         u'h1': u'//h1//text()',
         u'h2': u'//h2//text()',
         u'h3': u'//h3//text()',
         u'text': u'''//body//*[not(self::script or self::a or self::h1 or
         self::h2 or self::h3)]/text()[normalize-space()]''',
         u'script': u'//script//text()',
         u'p': u'//p//text()',
         u'anchors': u'//a//text()',
         u'alt': u'//img/@alt',
         u'title2': u'//@title'
         }
     self.robots = RobotsCache()
     self.rules = self.robots.cache(self.ROBOTS_LINK)
     self.client = MongoClient(self.DATABASE)
     self.db = self.client[self.DB_NAME]
Example #27
    def __init__(self):
        self.agent = "jerry's crawler"
        self.robots = RobotsCache()
        self.pool = None
        self.cookieJar = cookielib.CookieJar()

        timeout = 60
        socket.setdefaulttimeout(timeout)
Example #28
    def __init__(self, crawler):
        if not crawler.settings.getbool('ROBOTSTXT_OBEY'):
            raise NotConfigured

        self.completeblacklist = crawler.settings.get('ROBOTSTXT_BLACKLIST',
                                                      ())
        self.blacklist = []
        self.generalblacklist = crawler.settings.get('GENERAL_BLACKLIST', ())
        self.hasblacklist = False
        self.whitelist = crawler.settings.get('ROBOTSTXT_WHITELIST', ())
        self.crawler = crawler
        self._useragent = crawler.settings.get('USER_AGENT')
        self._parsers = {}
        self._spider_netlocs = set()
        self.robots = RobotsCache()

        self.stoprepetitionsrearg = re.compile(ur'.*?\&(.*?\&)\1{1,}.*')
        self.stoprepetitionsreslash = re.compile(ur'.*?\/(.*?\/)\1{1,}.*')
Example #29
def crawl_Pages(Seed):
    r = RobotsCache()
    robots_url=urljoin(Seed,'/robots.txt')  
    x = r.fetch(robots_url)
    unvisited=[Seed]
    visited=[]
    cnt=0
    delay=5
    while unvisited:
        page=unvisited.pop(0)
        
        hdr={'User-Agent':'*'}
        try:
            req = urllib2.Request(page, headers=hdr)         
            pagecontent=urllib2.urlopen(req)            
            if page not in visited:
                time.sleep(delay)
                s=pagecontent.read()  
                if (ishtmlcontent(pagecontent)):          
                    soup=BeautifulSoup(s)
                    links=soup.findAll('a',href=True)     
                    for l in links:
                        if (isurlvalid(l['href'])):
                            u1=urljoin(page,l['href'])
                            unvisited.append(u1)
                    if x.allowed(page,'*'):
                        visited.append(page)
                        cnt=cnt+1
                        print cnt
                        print 'Crawled:'+page
                        visited=remove_duplicates(visited)
                else:
                    if(page.endswith(".pdf")):
                        visited.append(page)
                        cnt=cnt+1
                        print 'Crawled:'+page
                        visited=remove_duplicates(visited)
            if(len(visited)==100):
                    unvisited=[]
        except Exception, err:
            print Exception, err
            continue
Example #30
	def __init__(self, url="file:///Users/tharak/Dropbox/code/Python/webcrawler/mock_website/example.org/index.html", useragent="User Agent", outdir="out", max_depth=1000, debug=0):
		self.url = url					
		self.useragent = useragent		
		self.siteMap = {self.url:""}	
		self.outdir=outdir.rstrip("/")+"/"	
		self.depth = 0					
		self.MaxDepth = max_depth		
		self.crawled=Set([])			
		self.debug=debug				
		self.domains=Set([urlparse(self.url).netloc.lower()])
		self.robots = RobotsCache()
Example #31
    def __init__(self, base_url, forum_codes, archive_location, user_agent,
                 worker_count):
        archiver_logger.info('Archiver initialized.')
        self.base_url = base_url
        self.archive_base_url = urljoin(self.base_url,
                                        ScraperConfig.ARCHIVE_SUBURL)
        self.forum_codes = forum_codes
        self.archive_location = archive_location
        self.user_agent = user_agent
        self.robot_parser = RobotsCache()
        self.scraper_timer = None
        self.shutdown_event = threading.Event()
        self.delay_time = 1

        self.workers = []
        self.worker_count = worker_count

        self.pages_need_visiting = Queue()
        self.pages_need_analysis_counter = RachetingCounter()
        self.pages_visited_lock = threading.Lock()
        self.pages_visited = []
        self.page_re_filters = []
Example #32
	def setup_method(self, _):
		"""Configure the app."""
		self.url = "http://aetfiws.ovh"
		self.code1 = test_data.CODE1
		self.code2 = test_data.CODE2
		self.code3 = test_data.CODE3
		self.parser = parsers.ExtractData()
		self.parser_encoding = parsers.ExtractEncoding()
		self.STOPWORDS = {'fr':('mot', 'pour', 'de')}
		self.BADWORDS = {'fr': ('pipe', 'xxx')}
		self.is_title = True
		self.title = 'letter'
		self.headers = {'status': '200 OK', 'content-type': 'text/html; charset=utf-8', 'vary': 'X-PJAX, Accept-Encoding'}
		self.reqrobots = RobotsCache(capacity=100)
Example #33
 def allowed(self, url):
     surl = urlparse(url)
     rurl = surl.scheme + '://' + surl.hostname + '/robots.txt'
     if rurl in self.__robot:
         if not self.__robot[rurl].expired:
             return self.__robot[rurl].allowed(url, UA)
     try:
         r = RobotsCache().fetch(rurl)
     except:
         return False
     else:
         self.__robot[rurl] = r
         # add a rule object
         return self.__robot[rurl].allowed(url, UA)
Example #34
    def __init__(self, crawler):
        if not crawler.settings.getbool('ROBOTSTXT_OBEY'):
            raise NotConfigured

        self.completeblacklist = crawler.settings.get('ROBOTSTXT_BLACKLIST', ())
        self.blacklist = []
        self.generalblacklist = crawler.settings.get('GENERAL_BLACKLIST', ())
        self.hasblacklist = False
        self.whitelist = crawler.settings.get('ROBOTSTXT_WHITELIST', ())
        self.crawler = crawler
        self._useragent = crawler.settings.get('USER_AGENT')
        self._parsers = {}
        self._spider_netlocs = set()
        self.robots = RobotsCache()
        
        self.stoprepetitionsrearg = re.compile(ur'.*?\&(.*?\&)\1{1,}.*')
        self.stoprepetitionsreslash = re.compile(ur'.*?\/(.*?\/)\1{1,}.*')
Example #35
 def __init__(self, args):
     if args.statsd and not re.match(REGEX_STATSD_HOST, args.statsd):
         raise Exception("Invalid statsd host provided")
     self.statsd_host = self.statsd_port = None
     self.statsd_disabled = True
     if args.statsd:
         self.statsd_host, self.statsd_port = args.statsd.rsplit(":", 1)
         self.statsd_port = int(self.statsd_port)
         self.statsd_disabled = False
     self.statsd_connection = statsd.Connection(host=self.statsd_host,
         port=self.statsd_port, sample_rate=0.5,
         disabled=self.statsd_disabled)
     self.statsd = statsd.Client("hydra_worker", self.statsd_connection)
     self.statsd_timers = {}
     self.statsd_counter = self.statsd.get_client(class_=statsd.Counter)
     
     self.get_info = self.time(self.get_info)
     utils.find_urls = self.time(utils.find_urls)
     self.get_jobs = self.time(self.get_jobs)
     self.done_jobs = self.time(self.done_jobs)
     self.get_tasks = self.time(self.get_tasks)
     
     self.args = args
     self.threads = []
     self.working = True
     self.break_now = False
     self.jobs = {}
     self.job_lock = threading.Lock()
     self.broker_lock = threading.Lock()
     self.insert_queue = queue()
     self.print_queue = queue()
     self.fill_queue = queue()
     self.robots = RobotsCache()
     self.worker_id = args.worker_id or uuid.uuid1().hex
     self.socket = context.socket(zmq.REQ)
     self.socket.connect(args.broker_address)
     self.get_info()
     self.headers = {"Accept-encoding": "gzip", "User-agent": self.info["u"]}
     if args.db_override:
         self.info["d"] += " host=%s" % args.db_override
     self.database = database.Database(self.info["d"])
Example #36
class RobotsValidator(object):
    """ Validates urls via robots.txt file """
    def __init__(self, agent):
        self._agent = agent
        self.robots = RobotsCache()

    def get_allowed_from(self, child_urls):
        """
        :param child_urls: List of child urls to check robots.txt on
        :return: A list of allowed child urls to crawl
        """
        allowed = []
        domains = list(set('{0}'.format(get_domain(url)) for url in child_urls))
        domain_to_children = {domain: filter(lambda u: get_domain(u) == domain, child_urls) for domain in domains}
        for domain in domain_to_children:
            try:
                rules = self.robots.fetch(domain)
                for url in domain_to_children[domain]:
                    if rules.allowed(url, self._agent):
                        allowed.append(url)
            except:
                allowed.extend(domain_to_children[domain])
        return allowed
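A minimal sketch of how the RobotsValidator above might be driven; the child URLs are illustrative and get_domain() is whatever helper the surrounding module already imports:

validator = RobotsValidator(agent='*')
child_urls = [
    'https://example.com/articles/1',
    'https://example.com/articles/2',
    'https://other.example.org/about',
]
for url in validator.get_allowed_from(child_urls):
    print('allowed:', url)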
Example #37
class Mole:
    """ fetch web page based on robots.txt """

    def __init__(self):
        self.agent = "jerry's crawler"
        self.robots = RobotsCache()
        self.pool = None
        self.cookieJar = cookielib.CookieJar()

        timeout = 60
        socket.setdefaulttimeout(timeout)

    def fetch(self, uri):
        # timeout in seconds
        if self.robots.allowed(uri, self.agent):
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cookieJar))
            req = urllib2.Request(uri)
            req.add_header('User-Agent', self.agent)
            response = opener.open(req)
            if response.code == 200:
                return response.read()

        return None

    def filter_punctuation(self, tokens):
        non_punct = re.compile('.*[A-Za-z0-9].*')
        return [w for w in tokens if non_punct.match(w)]

    def get_sitexml_robots(self, url):
        robot_url = '/'.join([url, 'robots.txt'])
        content = self.fetch(robot_url)
        lines = content.split('\n')
        site = []
        for line in lines:
            line = line.lower()
            index = line.find("sitemap")
            if index < 0 :
                continue
            m = re.search('sitemap\s*:\s*(\S+)',line[index:])
            site.append(m.group(1))

        return site

    def is_within_days(self, d, days=1):
        ago = date.today() - timedelta(days)
        return ago <= d

    def read_sitemap_file(self, mapfile):
        content = self.fetch(mapfile)

        if content is None:
            return None

        if mapfile.endswith('.gz'):
            d = zlib.decompressobj(16+zlib.MAX_WBITS)
            content = d.decompress(content)

        return content

    def create_thread_pool(self, size=10):
        self.pool = WorkerPool(size)

    def page2tokens(self, content):
        return nltk.word_tokenize(nltk.clean_html(content))
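A brief usage sketch of the Mole class above; the URL is illustrative:

mole = Mole()
html = mole.fetch('http://example.com/')  # returns None when robots.txt disallows the agent
sitemaps = mole.get_sitexml_robots('http://example.com')  # sitemap URLs advertised in robots.txt
print(html is not None, sitemaps)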
Example #38
class Worker(object):
    def __init__(self, args):
        if args.statsd and not re.match(REGEX_STATSD_HOST, args.statsd):
            raise Exception("Invalid statsd host provided")
        self.statsd_host = self.statsd_port = None
        self.statsd_disabled = True
        if args.statsd:
            self.statsd_host, self.statsd_port = args.statsd.rsplit(":", 1)
            self.statsd_port = int(self.statsd_port)
            self.statsd_disabled = False
        self.statsd_connection = statsd.Connection(host=self.statsd_host,
            port=self.statsd_port, sample_rate=0.5,
            disabled=self.statsd_disabled)
        self.statsd = statsd.Client("hydra_worker", self.statsd_connection)
        self.statsd_timers = {}
        self.statsd_counter = self.statsd.get_client(class_=statsd.Counter)
        
        self.get_info = self.time(self.get_info)
        utils.find_urls = self.time(utils.find_urls)
        self.get_jobs = self.time(self.get_jobs)
        self.done_jobs = self.time(self.done_jobs)
        self.get_tasks = self.time(self.get_tasks)
        
        self.args = args
        self.threads = []
        self.working = True
        self.break_now = False
        self.jobs = {}
        self.job_lock = threading.Lock()
        self.broker_lock = threading.Lock()
        self.insert_queue = queue()
        self.print_queue = queue()
        self.fill_queue = queue()
        self.robots = RobotsCache()
        self.worker_id = args.worker_id or uuid.uuid1().hex
        self.socket = context.socket(zmq.REQ)
        self.socket.connect(args.broker_address)
        self.get_info()
        self.headers = {"Accept-encoding": "gzip", "User-agent": self.info["u"]}
        if args.db_override:
            self.info["d"] += " host=%s" % args.db_override
        self.database = database.Database(self.info["d"])
            
    def get_timer(self):
        return self.statsd.get_client(class_=statsd.Timer)
    
    def time(self, function):
        def wrapper(*args, **kwargs):
            with self.get_timer().time(function.__name__):
                return function(*args, **kwargs)
        return wrapper
    
    def start(self):
        threaded_jobs = []
        threaded_jobs.append(self.do_heartbeat)
        threaded_jobs.append(self.insert_urls)
        threaded_jobs.append(self.printer)
        threaded_jobs.append(self.fill_jobs)
        for job in threaded_jobs:
            new_thread = threading.Thread(target=job)
            new_thread.daemon = True
            new_thread.start()
            self.threads.append(new_thread)
        self.do_jobs()
    
    def get_info(self):
        self.socket.send(make_message(BYTE_HELLO, self.worker_id))
        response = check_message(self.socket.recv(), BYTE_HELLO)
        self.info = response[1]
    
    def get_jobs(self, count = 1):
        self.print_queue.put("broker getting jobs")
        jobs = {}
        
        first_loop = True
        while self.working and len(jobs) < count:
            if not first_loop:
                time.sleep(2)
            first_loop = False
            count_needed = count-len(jobs)-1
            self.broker_lock.acquire()
            self.socket.send(make_message(BYTE_GET_JOB, self.worker_id, 
                count_needed))
            response = check_message(self.socket.recv(), BYTE_GET_JOB)
            self.broker_lock.release()
            for job, tasks in response[1]:
                job = job.decode("utf8")
                tasks = [[https, task.decode("utf8")] for https, task in tasks]
                jobs[job] = tasks
        
        for job, tasks in jobs.items():
            try:
                robots = self.robots.fetch("http://%s" % job, timeout=5)
                sleep = robots.delay(self.info["n"])
            except Exception as e:
                robots = sleep = None
                self.print_queue.put("failed to get robots")
            sleep = sleep or self.info["s"]
            self.jobs[job] = {"tasks": tasks, "robots": robots, "sleep": sleep,
                "timestamp": 0.0}
            self.insert_queue.put([job, False, job])
        task_count = 0
        for _, tasks in jobs.items():
            task_count += len(tasks)
        self.print_queue.put("broker got %d jobs (%d tasks)" % (count,
            task_count))
    
    def done_jobs(self, *jobs):
        self.broker_lock.acquire()
        self.socket.send(make_message(BYTE_JOB_DONE, *jobs))
        self.socket.recv()
        self.broker_lock.release()
        self.job_lock.acquire()
        for job in jobs:
            self.jobs.pop(job)
        self.job_lock.release()
    
    def get_tasks(self, job):
        if not job in self.jobs:
            return 0
        new_urls = self.database.get_urls(job)
        self.print_queue.put("got tasks for %s (%d)" % (job, len(new_urls)))
        new_urls = [[https, task.decode("utf8")] for https, task in new_urls]
        new_allowed_urls = []
        for new_https, new_url in new_urls:
            task_scheme = "%s%s" % (utils.get_scheme(new_https), new_url)
            if not self.jobs[job]["robots"] or self.jobs[job]["robots"].allowed(
                    task_scheme, self.info["n"]):
                new_allowed_urls.append([new_https, new_url])
            else:
                self.database.timestamp(new_url)
        self.jobs[job]["tasks"] += new_urls
        return len(new_urls)
    
    def fill_jobs(self):
        while self.working:
            job = self.fill_queue.get(True)
            if self.get_tasks(job) == 0 and not self.jobs[job]["tasks"]:
                self.print_queue.put("removing job %s" % job)
                self.done_jobs(job)
                self.get_jobs()
    
    def yield_tasks(self):
        while self.working:
            tasks = []
            for job in list(self.jobs):
                if not self.jobs[job]["tasks"]:
                    self.fill_queue.put(job)
                    continue
                after_delay = self.jobs[job]["timestamp"]+self.jobs[job]["sleep"]
                time_since = after_delay-time.time()
                if not time_since > 0 or self.jobs[job]["timestamp"] == 0.0:
                    https, task = self.jobs[job]["tasks"].pop(0)
                    if not len(self.jobs[job]["tasks"]) > 0:
                        self.fill_queue.put(job)
                    task_scheme = "%s%s" % (utils.get_scheme(https), task)
                    tasks.append(task_scheme)
                    self.jobs[job]["timestamp"] = time.time()
            if tasks:
                yield tasks
    
    def printer(self):
        while self.working:
            line = self.print_queue.get(True)
            print datetime.datetime.now(), line
    
    def insert_urls(self):
        while self.working:
            self.database.start_transaction()
            urls = []
            hostnames = set([])
            for insert in xrange(self.info["b"]):
                try:
                    to_insert = self.insert_queue.get(True, 4)
                except Empty:
                    break
                if to_insert[0] and to_insert[2]:
                    urls.append(to_insert)
                    hostnames.add((to_insert[2],))
                self.insert_queue.task_done()
            if urls and hostnames:
                self.database.insert(urls, hostnames)
            self.database.stop_transaction()
    
    def do_jobs(self):
        self.get_jobs(self.args.jobs or 2)
        time_before_yield = 0.0
        for tasks in self.yield_tasks():
            if not time_before_yield == 0.0:
                self.print_queue.put("got yield tasks, took %f seconds" % (
                    time.time()-time_before_yield))
            time_before_get = time.time()
            
            get_requests = dict([(grequests.get(task, timeout=5), task)
                for task in tasks])
            grequests.map(get_requests.keys(), stream=True)
            self.print_queue.put("got responses, took %f seconds" % (
                time.time()-time_before_get))
            found_count = 0.0
            get_successful = []
            get_failed = []
            get_wrong_type = []
            get_responses = {}
            time_before_process = time.time()
            for request in list(get_requests):
                original_url = get_requests[request].split("://", 1)[-1]
                response = request.response
                if not response or not response.status_code < 400:
                    get_failed.append(original_url)
                elif not response.headers.get("content-type", "").startswith(
                        "text/"):
                    get_wrong_type.append(original_url)
                else:
                    get_responses[response] = original_url
            self.print_queue.put("finished processing, took %f seconds" % (
                time.time()-time_before_process))
            time_before_responses = time.time()
            gevent.joinall([gevent.spawn(getattr, response, "text") for response
                in get_responses])
            self.print_queue.put("got second responses, took %f seconds" % (
                time.time()-time_before_responses))
            time_before_second_process = time.time()
            for response in list(get_responses):
                original_url = get_responses[response]
                try:
                    text = response.text
                except:
                    get_failed.append(original_url)
                    continue
                if not text:
                    get_failed.append(original_url)
                else:
                    actual_url = response.url
                    get_successful.append(original_url)
                    found_urls = utils.find_urls(response.text, actual_url)
                    found_count += len(found_urls)
                    for url in found_urls:
                        url_parts = utils.process_url(url)
                        if url_parts:
                            self.statsd_counter.increment("url_found")
                            self.insert_queue.put(url_parts)
            
            self.print_queue.put("finished second processing, took %f seconds" %
                (time.time()-time_before_second_process))
            time_taken = time.time()-time_before_get
            stats = "tried %d" % len(tasks)
            stats += ", success %d" % len(get_successful)
            stats += ", fail %d" % len(get_failed)
            stats += ", wrong %d" % len(get_wrong_type)
            stats += ", took %f seconds" % time_taken
            if get_successful:
                stats += ", found %d" % found_count
                stats += ", %f/site" % (found_count/len(get_successful))
                stats += ", %f/second" % (found_count/time_taken)
            self.print_queue.put(stats)
            for url in get_successful:
                self.database.timestamp(url)
            for url in get_failed:
                self.database.timestamp(url, 1)
            for url in get_wrong_type:
                self.database.timestamp(url, 2)
            time_before_join = time.time()
            self.insert_queue.join()
            self.print_queue.put("finished insert queue join, took %f seconds" %
                (time.time()-time_before_join))
            time_before_yield = time.time()
    
    def do_heartbeat(self):
        while self.working:
            time.sleep(self.info["h"])
            request = make_message(BYTE_HEARTBEAT, self.worker_id,
                *list(self.jobs))
            
            self.broker_lock.acquire()
            self.socket.send(request)
            response = check_message(self.socket.recv(), BYTE_HEARTBEAT)
            self.broker_lock.release()
            if len(response) > 1 and response[1] == BYTE_GET_JOB:
                for bad_job in response[2:]:
                    self.job_lock.acquire()
                    assert bad_job in self.jobs
                    del self.jobs[bad_job]
                    self.job_lock.release()
                    self.get_jobs()
Example #39
    http://qiita.com/rusarusa/items/d7f014ba80d6fe7a3e07
・Download images from the web in bulk with Python
    http://www.dyesac.com/pythonでweb上の画像をまとめてダウンロード/
・Image crawler
    http://qiita.com/komakomako/items/dd380f980e56e70fa321

Targets:
・https://reverb.com/jp/marketplace/electric-guitars
・https://www.yahoo.co.jp
"""

# (1) Decide which URL to crawl
target_url = "https://www.yahoo.co.jp"

# (2) Create the instance used to read robots.txt
robots = RobotsCache(100)

# (3) If robots.txt allows crawling this URL, proceed with the steps below
if robots.allowed(target_url, 'python program'):
    # (4) Create a PhantomJS instance so that JavaScript-generated content can also be crawled
    driver = webdriver.PhantomJS()
    # (5) Pass the target URL to the driver's GET method to obtain the DOM
    driver.get(target_url)
    # <selenium.webdriver.phantomjs.webdriver.WebDriver (session="b140b9a0-74d3-11e7-b434-8b9f5b309f17")>
    # type(driver)
    # <class 'selenium.webdriver.phantomjs.webdriver.WebDriver'>

    # (6) Encode the DOM obtained above as UTF-8 and keep the page content as bytes
    html = driver.page_source.encode('utf-8')
    # type(html)
    # <class 'bytes'>
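    # (7) A plausible continuation (an assumption, not part of the original
    #     snippet): parse the DOM just captured and collect image URLs, in the
    #     spirit of the image-crawler articles referenced above.
    from bs4 import BeautifulSoup  # assumed import; not shown in this truncated example
    soup = BeautifulSoup(html, 'html.parser')
    image_urls = [img['src'] for img in soup.find_all('img', src=True)]
    print(len(image_urls), 'images found on', target_url)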
Example #40
"""
"""

# Imports
import json
import time
import requests
import urlparse

doi_url = 'http://dx.doi.org/'

# Get crawl-delay parameter from robots.txt
from reppy.cache import RobotsCache
robots = RobotsCache()
doi_delay = robots.delay(doi_url, '*')

def doi_to_csl(doi):
    """ Fetch CSL-formatted reference by DOI. """

    # Build URL
    url = urlparse.urljoin(doi_url, doi)
    
    # Send request
    req = requests.get(
        url, 
        headers={
            'accept' : 'application/citeproc+json'
        }
    )

    # Wait for crawl-delay
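    # The original example is truncated after the comment above; a plausible
    # completion (an assumption) honors the crawl delay reported by robots.txt
    # and returns the CSL JSON from the response.
    if doi_delay:
        time.sleep(doi_delay)
    return req.json()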
Example #41
 def _fetch_sitemap_from_url(self, url):
     robots = RobotsCache()
     try:
         return robots.fetch(url, timeout=1.5).sitemaps
     except:
         return []
Example #42
 def setUp(self):
     self.robots = RobotsCache()
Example #43
from reppy.cache import RobotsCache

agent = 'spoderman'
sandcrawler = RobotsCache(timeout=2)

def is_allowed(url):
    try:
        return sandcrawler.allowed(url, agent)
    except:
        return False

def crawl_delay(url):
    try:
        delay = sandcrawler.delay(url, agent)
        print('Crawl delay for', url, delay)
        return delay if delay else 1
    except:
        return 1

Example #44
class EZWS:
    """
	SELF:

	config json config file
	ua     user agent
	robo   robotcache obj
	link   current link
	urlp   url parse object for current link
	soup   current html page soup obj
	req    requests obj
	raw    raw html from req.get()
	check  check for robot files, keep true
	output name of output csv file
	"""
    def __init__(self,
                 file,
                 ua,
                 check=True,
                 output="output.csv"
                 ):  #setting output to false disables file output
        if check:  #only setup robot checker if robot checking is enabled
            self.ua = ua  #user agent
            self.robo = RobotsCache(capacity=100)

        #check var disables or enables robots.txt checking
        #recommended to keep default True value
        self.check = check
        self.req = requests  #request obj for parsing url

        self.output = output  #where to output file

        self.data = []  #init array of grabbed sites

        self.configarr = []  #empty array of all configs

        if type(file) is list:
            self.configarr = file
        else:
            self.configarr.append(file)

    def allowed(self, url):  #checks if url is ok to download
        if self.check:
            if self.robo.allowed(url, self.ua):  #checks robot file
                return True
            else:
                print(url, "is not allowed")  #notify user if url isn't allowed
                return False
        else:
            return True  #if robot checking is off, return true regardless

    @property  #when url is called, return it
    def url(self):
        if hasattr(self, "link"):  #handles whether self has link attribute
            return self.link
        else:
            return ""  #if not return empty string

    @url.setter  #when url is set, parse it
    def url(self, url):
        self.link = url
        self.urlp = urlparse(url)

    def download(self, url):
        if self.allowed(url):
            self.raw = self.req.get(url).content
            self.soup = BeautifulSoup(self.raw,
                                      "html.parser")  #loads html into soup obj

    def xpath(self, html, xp):  #takes html and returns data from xpath
        tree = lxmlhtml.fromstring(html)  #generates tree
        return tree.xpath(xp)  #returns data from tree

    def select(self, html,
               obj):  #determines whether to grab using css or xpath
        if "xpath" in obj:  #if xpath
            items = self.xpath(html.getText(),
                               obj["xpath"])  #return xpath selector arr
        else:  #css
            items = html.select(obj["css"])  #return a css selector arr

        if self.config["header"]:  #if theres a header keep data to one column
            items = items[:1]

        if "css" in obj:  #if data is css attribute(s) from element
            row = []
            for item in items:
                cont = [
                ]  #arr for storing attribs from each css selected element
                if type(
                        obj["contents"]
                ) is str:  #if contents is a string, put it into an array
                    obj["contents"] = [obj["contents"]]

                for content in obj["contents"]:
                    if content:  #if not empty, get the element from tag
                        cont.append(item[content])
                    else:  #if empty, get the text from tag
                        cont.append(item.text)
                row += cont  #append attribs to attrib array
            return row  #return all the attribs (css)
        else:
            return items  #return xpath

    def clear(self):
        self.data = []

    def load(self, index):
        tmp = self.configarr[index]

        if type(tmp) is dict:  #if file is json obj, load it
            self.config = tmp
        else:  #assume it is a file and load it
            if os.path.exists(tmp):
                with open(tmp) as f:
                    self.config = json.load(f)  #opens and parses json file

    def grab(self, index=None):
        if index == None:  #using grab() with no params will grab all configs passed
            for i in range(len(self.configarr)):
                self.grab(i)  #grab "i" config file
        else:
            self.load(index)  #get current file obj
            if self.output:  #only create simplecsv obj if file outputting is on
                sc = simplecsv(self.output,
                               mode="w+")  #using w+ mode to remove old output
                if self.config["header"]:
                    sc.writerow(
                        self.config["header"])  #add header from config to csv

            for link in self.config["links"]:  #loop through links
                samelinks = []  #empty list of links for now
                if type(link["url"]) is str:
                    samelinks.append(
                        link["url"]
                    )  #if url is a single str not array append it to an array
                else:  #assume it is an array
                    samelinks = link["url"]

                for samelink in samelinks:  #passing "url" an array of urls will do the same params on all the links
                    if self.allowed(samelink):  #check if url is allowed
                        self.download(samelink)  #if so download it
                        for divs in self.soup.select(link["container"]):
                            add = []
                            for get in link[
                                    "grab"]:  #grabs each element from inside each div
                                add += self.select(divs, get)

                            self.data += add  #update internal data
                            if self.output:
                                sc.writerow(
                                    add
                                )  #only write to disk if file output is on
            if self.output:
                sc.close()  #only close "sc" if file output is on
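A minimal driving sketch for the EZWS class above; the config dict mirrors the keys grab() reads ("header", "links", "url", "container", "grab"), and the site, selectors, and user agent are illustrative:

config = {
    "header": ["title"],
    "links": [{
        "url": "https://example.com/articles",
        "container": "div.article",
        "grab": [{"css": "a.title", "contents": ""}],  # empty "contents" grabs the tag's text
    }],
}
scraper = EZWS(config, ua="example-bot/1.0", check=True, output="output.csv")
scraper.grab()       # downloads each allowed page and writes rows to output.csv
print(scraper.data)  # the grabbed values are also kept in memory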
Example #45
class Hodor(object):
    def __init__(self, url, config={}, proxies={},
                 auth=None, ua=DEFAULT_HODOR_UA,
                 pagination_max_limit=DEFAULT_HODOR_MAX_PAGES,
                 crawl_delay=DEFAULT_CRAWL_DELAY,
                 ssl_verify=False,
                 trim_values=True,
                 robots=True,
                 reppy_capacity=100):

        self.content = None
        self.url = url
        self.domain = self._get_domain()
        self.proxies = proxies
        self.auth = auth
        self.ua = ua
        self.trim_values = trim_values
        self.ssl_verify = ssl_verify
        self.config = {}
        self.extra_config = {}

        self.robots = RobotsCache(capacity=reppy_capacity) if robots else None

        self._pages = []
        self._page_count = 0
        self._pagination_max_limit = pagination_max_limit
        self.crawl_delay = self._crawl_delay(crawl_delay)

        for k, v in config.items():
            if k.startswith("_"):
                self.extra_config[k.lstrip("_")] = v
            else:
                self.config[k] = v

    def _get_domain(self):
        parsed_uri = urlparse(self.url)
        return '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)

    def _crawl_delay(self, crawl_delay):
        if self.robots not in EMPTY_VALUES:
            expiry, robots = self.robots.fetch('{}robots.txt'.format(self.domain))
            delay = robots.agent(self.ua).delay
            try:
                crawl_delay = max(filter(partial(is_not, None),
                                         [delay, crawl_delay]))
            except ConnectionException:
                pass
        return crawl_delay

    def _fetch(self, url):
        '''Does the requests fetching and stores result in self.content'''

        if self.robots in EMPTY_VALUES or self.robots.allowed(url, self.ua):
            session = requests.session()
            headers = {'User-Agent': self.ua}
            if len(self.proxies) > 0:
                session.proxies = self.proxies
            if self.auth:
                r = session.get(url, headers=headers, auth=self.auth, verify=self.ssl_verify)
            else:
                r = session.get(url, headers=headers, verify=self.ssl_verify)
            self.content = r.content

        return self.content

    @staticmethod
    def _get_value(content, rule):
        '''Returns result for a specific xpath'''
        try:
            tree = html.fromstring(content)
        except TypeError:
            tree = None

        post_processing = rule.get('transform', lambda data: data)

        data = ""
        if tree not in EMPTY_VALUES:
            if 'xpath' in rule:
                data = tree.xpath(rule['xpath'])
            elif 'css' in rule:
                data = [node.text_content() for node in tree.cssselect(rule['css'])]

            many = rule.get('many', True)
            if not many:
                if len(data) == 0:
                    data = None
                else:
                    data = post_processing(data[0])
            else:
                data = [post_processing(d) for d in data]

        return data

    @staticmethod
    def _group_data(data, groups, config):
        del_fields = []
        for dest, group_fields in groups.items():
            if '__all__' in group_fields or group_fields == '__all__':
                group_fields = [rule for rule in config.keys() if not rule.startswith('_')]
                del_fields.extend(group_fields)

            gdata = []
            for field in group_fields:
                gdata.append(data[field])

            data[dest] = []
            for gd in zip(*gdata):
                d = {}
                for i, field in enumerate(group_fields):
                    d[field] = gd[i]
                data[dest].append(d)

        if len(del_fields) == 0:
            del_fields = [field for field_set in groups.values() for field in field_set]

        for field in del_fields:
            if field in data:
                del data[field]

    def _package_pages(self):
        self._data = {}
        if len(self._pages) == 1:
            self._data = self._pages[0]
        else:
            self._data = {key: [] for key in self._pages[0].keys()}
            for page in self._pages:
                for k, v in page.items():
                    if hasattr(v, '__iter__'):
                        self._data[k].extend(v)
                    else:
                        self._data[k].append(v)
        return self._data

    @classmethod
    def _parse(cls, content, config={}, extra_config={}, trim_values=True):
        '''Parses the content based on the config set'''
        if len(config) == 0:
            _data = {'content': content}
        else:
            _data = {}

            try:
                str_class = basestring
            except NameError:
                str_class = str

            for key, rule in config.items():
                value = cls._get_value(content, rule)
                if trim_values and value not in EMPTY_VALUES:
                    if 'many' in rule and rule['many']:
                        value = [v.strip() if isinstance(v, str_class) else v for v in value]
                    else:
                        value = value.strip() if isinstance(value, str_class) else value
                _data[key] = value

        paginate_by = extra_config.get('paginate_by')
        if paginate_by:
            paginate_by = cls._get_value(content, paginate_by)

        groups = extra_config.get('groups', {})
        if groups:
            cls._group_data(_data, groups, config)
        return _data, paginate_by

    def _get(self, url):
        self._fetch(url)
        data, paginate_by = self._parse(self.content, self.config, self.extra_config, self.trim_values)

        if paginate_by not in EMPTY_VALUES:
            paginate_by = urljoin(self.domain, paginate_by)

        return data, paginate_by

    def get(self, url=None):
        url = url if url else self.url
        self._data, paginate_by = self._get(url)

        self._pages.append(self._data)
        self._page_count += 1

        if paginate_by and self._page_count < self._pagination_max_limit:
            time.sleep(self.crawl_delay)
            self.get(paginate_by)

        self._package_pages()
        return self._data

    @property
    def data(self):
        if not hasattr(self, '_data'):
            self.get()
        return self._data
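A brief usage sketch of the Hodor class above; the URL and the rules are illustrative, and the DEFAULT_* constants come from whatever module defines them:

rules = {
    "title": {"xpath": "//title/text()", "many": False},
    "headings": {"css": "h2", "many": True},
}
scraper = Hodor("https://example.com/", config=rules, robots=True)
print(scraper.data)  # fetches the page (robots.txt permitting), applies the rules, and returns the results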
Example #46
 def __init__(self,db_name):
     """Initialises the crawler with the name of the database"""
     self.con=sqlite.connect(db_name)
     self.stemmer = nltk.stem.porter.PorterStemmer()
     self.headers = { "User-Agent" : "Faizan Bhat's Web Crawler" }
     self.robots = RobotsCache()
Example #47
class Crawler:

    def __init__(self,db_name):
        """Initialises the crawler with the name of the database"""
        self.con=sqlite.connect(db_name)
        self.stemmer = nltk.stem.porter.PorterStemmer()
        self.headers = { "User-Agent" : "Faizan Bhat's Web Crawler" }
        self.robots = RobotsCache()
        
    def __del__(self):
        self.con.close()

    def db_commit(self):
        self.con.commit()

    def get_entry_id(self,table,field,value,create_new=True):
        """Auxiliary function for getting an entry id and adding it if it is not present"""
        
        # Construct query
        cur = self.con.execute("select rowid from %s where %s='%s'" % (table,field,value))
        # Fetch
        res = cur.fetchone()
        # If not found
        if res==None:
            cur=self.con.execute("insert into %s (%s) values ('%s')" % (table,field,value))
            return cur.lastrowid
        else:
            return res[0]

    def add_to_index(self,url,soup):
        """Indexes an individual page"""
        if self.is_indexed(url): return
        print 'Indexing ' + url
        
        # Get text from soup
        text = self.get_text_only(soup)
        # Separate words
        words = self.separate_words(text)
        # Stem the list of words
        words = map(self.stem_word, words)
        # Get the url ID 
        url_id = self.get_entry_id('urllist','url',url)
        
        # Link each word to this url
        for i in range(len(words)):
            word = words[i]
            if word in ignore_words: continue
            word_id=self.get_entry_id('wordlist','word',word)
            self.con.execute('insert into wordlocation(urlid,wordid,location) values (%d,%d,%d)' % (url_id,word_id,i))
        

    def get_text_only(self,soup):
        """Extracts the text from an HTML page (without tags)"""
        v=soup.string
        if v==None:
            c=soup.contents
            result_text=''
            for t in c:
                sub_text = self.get_text_only(t)
                result_text = result_text + sub_text+'\n'
            return result_text
        else:
            return v.strip()

    def separate_words(self,text):
        """Splits the text on any run of non-alphanumeric (non-word) characters"""
        splitter = re.compile(r'\W+')
        return [s.lower() for s in splitter.split(text) if s!='']

    def stem_word(self,word):
        """Uses NLTK porter stemming algorithm to stem a word"""
        return self.stemmer.stem(word)
    
    def is_indexed(self,url):
        """Return True if url is already indexed"""
        u=self.con.execute \
        ("select rowid from urllist where url='%s'" % url).fetchone()
        if u!=None:
            # Check if it has been crawled
            v=self.con.execute(
            'select * from wordlocation where urlid=%d' % u[0]).fetchone()
            if v!=None: return True
        return False
            

    def add_link_ref(self,url_from,url_to,link_text):
        """Adds a link between two pages"""
        words = self.separate_words(link_text)
        from_id = self.get_entry_id('urllist','url',url_from)
        to_id=self.get_entry_id('urllist','url',url_to)
        if from_id == to_id: return
        cur = self.con.execute("insert into link(fromid,toid) values (%d,%d)" % (from_id,to_id))
        link_id = cur.lastrowid
        for word in words:
            if word in ignore_words: continue
            word_id = self.get_entry_id('wordlist','word',word)
            self.con.execute("insert into linkwords(linkid,wordid) values (%d,%d)" % (link_id,word_id))

    def crawl(self,pages,depth=2):
        """Does a breadth first search on a given list of pages and indexes as we go"""
        for i in range(depth):
            print "Depth = " + str(i)
            newpages=set()
            for page in pages:
                if not self.robots.allowed(page,"*"): 
                    print "%s disallows robots. Moving on." %page
                    continue
                try:
                    req = urllib2.Request(page, None, self.headers)
                    c=urllib2.urlopen(req)
                except:
                    print "Could not open %s" %page
                    continue
                soup = BeautifulSoup(c.read())
                self.add_to_index(page,soup)

                links=soup('a')
                
                for link in links:
                    if ('href' in dict(link.attrs)):
                        url=urljoin(page,link['href'])
                        if url.find("'")!=-1:continue
                        url=url.split('#')[0]
                        if url[0:4]=='http':
                            if not self.is_indexed(url):
                                newpages.add(url)
                        link_text=self.get_text_only(link)
                        self.add_link_ref(page,url,link_text)
                    self.db_commit()
                
            pages=newpages

    def create_index_tables(self):
        """Creates the database tables"""
        self.con.execute('create table urllist(url)')
        self.con.execute('create table wordlist(word)')
        self.con.execute('create table wordlocation(urlid,wordid,location)')
        self.con.execute('create table link(fromid integer, toid integer)')
        self.con.execute('create table linkwords(wordid, linkid)')
        self.con.execute('create index wordidx on wordlist(word)')
        self.con.execute('create index urlidx on urllist(url)')
        self.con.execute('create index wordurlidx on wordlocation(wordid)')
        self.con.execute('create index urltoidx on link(toid)')
        self.con.execute('create index urlfromidx on link(fromid)')
        self.db_commit()
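
A short usage sketch for the crawler above; the database filename and seed URL are placeholders, and the snippet assumes the Crawler class, its imports, and the ignore_words set are already defined in the same module:

# hypothetical driver -- database name and seed URL are placeholders
crawler = Crawler('searchindex.db')
crawler.create_index_tables()            # only needed on the first run (tables must not already exist)
crawler.crawl(['http://example.com/'])   # breadth-first crawl, depth=2 by default
crawler.db_commit()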
Example #48
0
class WebCrawler():
	""" Web crawler class crawls a specific website
	"""
	def __init__(self, url="file:///Users/tharak/Dropbox/code/Python/webcrawler/mock_website/example.org/index.html", useragent="User Agent", outdir="out", max_depth=1000, debug=0):
		self.url = url					
		self.useragent = useragent		
		self.siteMap = {self.url:""}	
		self.outdir=outdir.rstrip("/")+"/"	
		self.depth = 0					
		self.MaxDepth = max_depth		
		self.crawled=Set([])			
		self.debug=debug				
		self.domains=Set([urlparse(self.url).netloc.lower()])
		self.robots = RobotsCache()
			
		
	def __crawl_site(self, url_key=""):
		"""Recursively crawls the url passed and populates the sitemap datastructure
		"""
		#Do not continue crawling if we are at maximum allowed depth
		if self.depth > self.MaxDepth: 	
			return
		
		
		if url_key=="":    				
			url=self.url				
		else:
			url=url_key
			
		#Check the site's robot.txt to figure the list of allowed locs	
		#Do not check robots.txt if the file is located locally
		if "http" in urlparse(url).scheme:  
			if not self.robots.allowed(url, self.useragent):
				if(self.debug > 0): 
					print "Page disallowed in robots.txt %s"%(url)
				return
			
		if(self.debug > 0): 
			print "Now crawling: %s"%(url)
		
		url_list=[]
		
		#When we cycle through the siteMap datastructure we convert to a url_list
		#Otherwise, the interpreter complains that dictionary is constantly changing
		
		for key in self.siteMap:		
		 	url_list.append(key)		 
		
		for key in url_list:	
			#Fetch the URLs in the webpage and append to siteMap for URLs that have not yet been crawled. 		
			if self.siteMap[key] == "":
				urls =self.__extract_url(url)
				self.siteMap[key] = urls

				for url_key in urls:
					#If the URL has already been crawled or has a # tag, dont crawl it.	
					if (self.debug > 1): 
						print "url_key: %s, crawled: %s"%(url_key,self.crawled)
					if url_key in self.crawled:
						continue
					if "#" in url_key:
						continue
					
					#We do not want to crawl external domains. 
					parsed = urlparse(url_key)
					
					if (self.debug > 1): 
						print parsed.netloc
					
					#If netloc is empty or is the main domain then the page is part of local domain and needs to be crawled.
					if parsed.netloc.lower() in self.domains:		    
						
						if (self.debug > 1): 
							print "\ndepth=%s,URL=%s\n"%(self.depth, url_key)
						self.siteMap[url_key] = ""  
						self.crawled.add(url_key)   
						self.depth = self.depth+1   
						self.__crawl_site(url_key)	
						self.depth = self.depth-1	
			

	def __print_siteMap(self):
		"""Prints the siteMap datastructure in an XML like format
		"""
		#Dump Sitemap to an XML file
		try:                                
			fd = open(self.outdir+"site.xml", "w") 
			try:                           
				fd.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
				fd.write("<WEBSITE>\n")
				for key in self.siteMap:
					fd.write("\t<WEBPAGE>\n")
					fd.write("\t\t<ADDRESS>\"%s\"</ADDRESS>\n"%(key))
					for loc in self.siteMap[key]:
						fd.write("\t\t<LINK>\"%s\"</LINK>\n"%(loc))
					fd.write("\t</WEBPAGE>\n")
				fd.write("</WEBSITE>\n")
			finally:                        
				fd.close()                    			  
		except IOError:                     
			pass    
		#Dump siteMap to a json file
		import json
		with open(self.outdir+'site.json', 'w') as fp:
			json.dump(self.siteMap, fp, indent=4)    
    
		
					
	def get_siteMap(self):
		"""Initiates the crawler and populates the siteMap
		"""
		from os import makedirs
		from shutil	import rmtree 

		rmtree(self.outdir, ignore_errors=True)
		makedirs(self.outdir)

		self.__crawl_site()
		self.__print_siteMap()
		return self.siteMap

	def __extract_url(self, url): 
		"""Extracts the links in the input URL
		"""
		
		import urllib2
		from urllister import URLLister
		from sgmllib import SGMLParseError
		
		req = urllib2.Request(url, headers={'User-Agent' : self.useragent}) 
		try:
			usock = urllib2.urlopen(req)
			parser = URLLister(url)
		
			try:
				parser.feed(usock.read())
				parser.close()
			except Exception as exception:
				if (self.debug > 0): 
					print "sgmllib: Unable to parse web page.\n sgmllib: Raised exception %s"%(type(exception).__name__)
					fd = open(self.outdir+"%s.err"%type(exception).__name__, "a")
					fd.write( "%s\n"%(url))	
					fd.close()
				pass
			usock.close()
			return parser.urls
		except (KeyboardInterrupt, SystemExit):
			raise
		except Exception as exception:
			if (self.debug > 0): 
				print "urllib2: Page does not exist or Malformed web address.\n sgmllib: Raised exception %s"%(type(exception).__name__) 
				fd = open(self.outdir+"%s.err"%type(exception).__name__, "a")
				fd.write( "%s\n"%(url))	
				fd.close()
			return []
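
A similar driver for the WebCrawler above; every keyword argument is taken from the constructor signature shown, the values themselves are placeholders, and note that get_siteMap() deletes and recreates the output directory before writing site.xml and site.json:

# hypothetical driver -- URL and output directory are placeholders
crawler = WebCrawler(url="http://example.org/",
                     useragent="sitemap-bot",
                     outdir="out",
                     max_depth=100,
                     debug=1)
site_map = crawler.get_siteMap()   # also writes out/site.xml and out/site.json
print(len(site_map))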
Example #49
0
class RobotsTxtMiddleware(object):
    DOWNLOAD_PRIORITY = 1000

    def __init__(self, crawler):
        if not crawler.settings.getbool('ROBOTSTXT_OBEY'):
            raise NotConfigured

        self.completeblacklist = crawler.settings.get('ROBOTSTXT_BLACKLIST',
                                                      ())
        self.blacklist = []
        self.generalblacklist = crawler.settings.get('GENERAL_BLACKLIST', ())
        self.hasblacklist = False
        self.whitelist = crawler.settings.get('ROBOTSTXT_WHITELIST', ())
        self.crawler = crawler
        self._useragent = crawler.settings.get('USER_AGENT')
        self._parsers = {}
        self._spider_netlocs = set()
        self.robots = RobotsCache()

        self.stoprepetitionsrearg = re.compile(ur'.*?\&(.*?\&)\1{1,}.*')
        self.stoprepetitionsreslash = re.compile(ur'.*?\/(.*?\/)\1{1,}.*')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        useragent = self._useragent
        if not self.hasblacklist:
            self.hasblacklist = True
            if ('http://' + spider.domain
                ) in self.completeblacklist and self.completeblacklist[
                    'http://' + spider.domain] != None:
                self.blacklist = [
                    el.lower()
                    for el in self.completeblacklist['http://' + spider.domain]
                ]
                log.msg(format="Got blacklist from DB for domain",
                        level=log.DEBUG,
                        request=request)
            else:
                log.msg(format="Didn't get a blacklist from DB for domain",
                        level=log.DEBUG,
                        request=request)
            self.blacklist.extend([el.lower() for el in self.generalblacklist])
        #Check for silly repeating arguments
        if self.stoprepetitionsrearg.match(
                request.url) != None or self.stoprepetitionsreslash.match(
                    request.url) != None:
            log.msg(format="URL is suspicious: %(request)s",
                    level=log.DEBUG,
                    request=request)
            raise IgnoreRequest
        #Blacklist overrides whitelist and robots
        if any(bl in request.url.lower() for bl in self.blacklist):
            log.msg(format="Forbidden by blacklist: %(request)s",
                    level=log.DEBUG,
                    request=request)
            raise IgnoreRequest
        if not any(wl in request.url for wl in
                   self.whitelist) and self.robots and not self.robots.allowed(
                       request.url, useragent):
            log.msg(format="Forbidden by robots.txt: %(request)s",
                    level=log.DEBUG,
                    request=request)
            raise IgnoreRequest
Example #50
0
class RobotsTxtMiddleware(object):
    DOWNLOAD_PRIORITY = 1000

    def __init__(self, crawler):
        if not crawler.settings.getbool('ROBOTSTXT_OBEY'):
            raise NotConfigured

        self.completeblacklist = crawler.settings.get('ROBOTSTXT_BLACKLIST', ())
        self.blacklist = []
        self.generalblacklist = crawler.settings.get('GENERAL_BLACKLIST', ())
        self.hasblacklist = False
        self.whitelist = crawler.settings.get('ROBOTSTXT_WHITELIST', ())
        self.crawler = crawler
        self._useragent = crawler.settings.get('USER_AGENT')
        self._parsers = {}
        self._spider_netlocs = set()
        self.robots = RobotsCache()
        
        self.stoprepetitionsrearg = re.compile(ur'.*?\&(.*?\&)\1{1,}.*')
        self.stoprepetitionsreslash = re.compile(ur'.*?\/(.*?\/)\1{1,}.*')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        useragent = self._useragent
        if not self.hasblacklist:
            self.hasblacklist = True
            if ('http://' + spider.domain) in self.completeblacklist and self.completeblacklist['http://' + spider.domain] != None:
                self.blacklist = [el.lower() for el in self.completeblacklist['http://' + spider.domain]]
                log.msg(format="Got blacklist from DB for domain",
                    level=log.DEBUG, request=request)
            else:
                log.msg(format="Didn't get a blacklist from DB for domain",
                    level=log.DEBUG, request=request)
            self.blacklist.extend([el.lower() for el in self.generalblacklist])
        #Check for silly repeating arguments
        if self.stoprepetitionsrearg.match(request.url) != None or self.stoprepetitionsreslash.match(request.url) != None:
            log.msg(format="URL is suspicious: %(request)s",
                    level=log.DEBUG, request=request)
            raise IgnoreRequest
        #Blacklist overrides whitelist and robots
        if any(bl in request.url.lower() for bl in self.blacklist):
            log.msg(format="Forbidden by blacklist: %(request)s",
                    level=log.DEBUG, request=request)
            raise IgnoreRequest
        if not any(wl in request.url for wl in self.whitelist) and self.robots and not self.robots.allowed(request.url, useragent):
            log.msg(format="Forbidden by robots.txt: %(request)s",
                    level=log.DEBUG, request=request)
            raise IgnoreRequest
Example #51
0
#! /usr/bin/env python

from __future__ import print_function

from contextlib import contextmanager
import time

from reppy.cache import RobotsCache
from reppy.parser import Rules

content = '''
User-agent: '*'
Allow: /
'''

cache = RobotsCache()
cache.add(Rules('http://example.com/', 200, content, float('inf')))


@contextmanager
def timer(count):
    '''Time this block.'''
    start = time.time()
    try:
        yield count
    finally:
        duration = time.time() - start
        print('Total: %s' % duration)
        print('  Avg: %s' % (duration / count))
        print(' Rate: %s' % (count / duration))
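
The benchmark body itself is missing from this excerpt; a plausible completion, assuming the rules pre-seeded above keep allowed() away from the network, might look like this:

# assumed benchmark body -- the iteration count is arbitrary
with timer(100000) as count:
    for _ in range(count):
        cache.allowed('http://example.com/page', 'rogerbot')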
Example #52
0
 def _set_robot_rule(self):
     """
     Set the robots.txt rules
     """
     self.rules = RobotsCache().fetch(self.url)
Example #53
0
class TestCache(unittest.TestCase):
    def setUp(self):
        self.robots = RobotsCache()

    def test_404(self):
        '''When we get a 404, assume free range'''
        with asis.Server('tests/asis/test_404', port=8080):
            self.assertEqual(self.robots.allowed(
                'http://localhost:8080/foo', 'rogerbot'), True)

    def test_caching(self):
        '''We should be able to cache results'''
        with asis.Server('tests/asis/test_caching', port=8080):
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_context_manager(self):
        '''When using as a context manager, it should clear afterwards'''
        with asis.Server('tests/asis/test_context_manager', port=8080):
            with self.robots:
                self.assertEqual(
                    self.robots.find('http://localhost:8080/foo'), None)
                self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
                self.assertNotEqual(
                    self.robots.find('http://localhost:8080/foo'), None)
            # And now, we should have it no longer cached
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_expires(self):
        '''Should be able to recognize expired rules'''
        with asis.Server('tests/asis/test_expires', port=8080):
            old_ttl = self.robots.min_ttl
            self.robots.min_ttl = 0
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo', True), None)
            # Now, it shouldn't be cached, so when we find it again, it should
            # be missing (or at least, requiring a refetch)
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo', False), None)
            self.robots.min_ttl = old_ttl

    def test_clear(self):
        '''Should be able to explicitly clear rules'''
        with asis.Server('tests/asis/test_clear', port=8080):
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            # Now if we clear the rules, we should not find it
            self.robots.clear()
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_fetch(self):
        '''Ensure that 'fetch' doesn't cache'''
        with asis.Server('tests/asis/test_fetch', port=8080):
            self.assertNotEqual(
                self.robots.fetch('http://localhost:8080/foo'), None)
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_cache(self):
        '''Ensure we can ask it to cache a result'''
        with asis.Server('tests/asis/test_cache', port=8080):
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            self.assertNotEqual(
                self.robots.cache('http://localhost:8080/foo'), None)
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_add(self):
        '''We should be able to add rules that we get'''
        with asis.Server('tests/asis/test_add', port=8080):
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            self.robots.add(self.robots.fetch(
                'http://localhost:8080/foo'))
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_server_error(self):
        '''Make sure we can catch server errors'''
        self.assertRaises(ServerError, self.robots.allowed,
            'http://localhost:8080/foo', 'rogerbot')

    def test_disallowed(self):
        '''Check the disallowed interface'''
        with asis.Server('tests/asis/test_disallowed', port=8080):
            self.assertFalse(self.robots.disallowed(
                'http://localhost:8080/foo', 'rogerbot'))
            urls = [
                'http://localhost:8080/foo',
                'http://localhost:8080/bar'
            ]
            self.assertEqual(self.robots.allowed(urls, 'rogerbot'), urls)
            self.assertEqual(self.robots.disallowed(urls, 'rogerbot'), [])

    def test_delay(self):
        '''Check the delay interface'''
        with asis.Server('tests/asis/test_delay', port=8080):
            self.assertEqual(self.robots.delay(
                'http://localhost:8080/foo', 'rogerbot'), 5)

    def test_sitemaps(self):
        '''Check the sitemaps interface'''
        with asis.Server('tests/asis/test_sitemaps', port=8080):
            self.assertEqual(
                self.robots.sitemaps('http://localhost:8080/foo'), [
                    'http://localhost:8080/a',
                    'http://localhost:8080/b',
                    'http://localhost:8080/c'
                ])
Example #54
0
 def __init__(self, agent):
     self._agent = agent
     self.robots = RobotsCache()
Example #55
0
class TestCache(unittest.TestCase):
    def setUp(self):
        self.robots = RobotsCache()

    def test_404(self):
        '''When we get a 404, assume free range'''
        with asis.Server('tests/asis/test_404', port=8080):
            self.assertEqual(
                self.robots.allowed('http://localhost:8080/foo', 'rogerbot'),
                True)

    def test_caching(self):
        '''We should be able to cache results'''
        with asis.Server('tests/asis/test_caching', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'),
                                None)

    def test_context_manager(self):
        '''When using as a context manager, it should clear afterwards'''
        with asis.Server('tests/asis/test_context_manager', port=8080):
            with self.robots:
                self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                                 None)
                self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
                self.assertNotEqual(
                    self.robots.find('http://localhost:8080/foo'), None)
            # And now, we should have it no longer cached
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)

    def test_expires(self):
        '''Should be able to recognize expired rules'''
        with asis.Server('tests/asis/test_expires', port=8080):
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo', True), None)
            # Now, it shouldn't be cached, so when we find it again, it should
            # be missing (or at least, requiring a refetch)
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo', False), None)

    def test_clear(self):
        '''Should be able to explicitly clear rules'''
        with asis.Server('tests/asis/test_clear', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'),
                                None)
            # Now if we clear the rules, we should not find it
            self.robots.clear()
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)

    def test_fetch(self):
        '''Ensure that 'fetch' doesn't cache'''
        with asis.Server('tests/asis/test_fetch', port=8080):
            self.assertNotEqual(self.robots.fetch('http://localhost:8080/foo'),
                                None)
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)

    def test_cache(self):
        '''Ensure we can ask it to cache a result'''
        with asis.Server('tests/asis/test_cache', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)
            self.assertNotEqual(self.robots.cache('http://localhost:8080/foo'),
                                None)
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'),
                                None)

    def test_add(self):
        '''We should be able to add rules that we get'''
        with asis.Server('tests/asis/test_add', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)
            self.robots.add(self.robots.fetch('http://localhost:8080/foo'))
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'),
                                None)

    def test_server_error(self):
        '''Make sure we can catch server errors'''
        self.assertRaises(ServerError, self.robots.allowed,
                          'http://localhost:8080/foo', 'rogerbot')

    def test_disallowed(self):
        '''Check the disallowed interface'''
        with asis.Server('tests/asis/test_disallowed', port=8080):
            self.assertFalse(
                self.robots.disallowed('http://localhost:8080/foo',
                                       'rogerbot'))
            urls = ['http://localhost:8080/foo', 'http://localhost:8080/bar']
            self.assertEqual(self.robots.allowed(urls, 'rogerbot'), urls)
            self.assertEqual(self.robots.disallowed(urls, 'rogerbot'), [])

    def test_delay(self):
        '''Check the delay interface'''
        with asis.Server('tests/asis/test_delay', port=8080):
            self.assertEqual(
                self.robots.delay('http://localhost:8080/foo', 'rogerbot'), 5)

    def test_sitemaps(self):
        '''Check the sitemaps interface'''
        with asis.Server('tests/asis/test_sitemaps', port=8080):
            self.assertEqual(
                self.robots.sitemaps('http://localhost:8080/foo'), [
                    'http://localhost:8080/a', 'http://localhost:8080/b',
                    'http://localhost:8080/c'
                ])
Example #56
0
 def setUp(self):
     self.robots = RobotsCache()
Example #57
0
import sqlite3
import urllib
import time
from bs4 import BeautifulSoup
from reppy.cache import RobotsCache
from reppy.robots import Robots

#################################################
default_crawl_delay = 5

# caching robots.txt files for fast access
robots_cache = RobotsCache(capacity=200)

# db commit rate
commit_rate = 1
current_r = 0

#################################################

db_location = 'content.db'
conn = sqlite3.connect(db_location)
cur = conn.cursor()

#################################################
#################################################
# populate url_frontier

url_frontier = set()

cur.execute("SELECT `url_link` FROM `crawled_urls` WHERE `is_scraped` = 0") 
Example #58
0
 def __init__(self, *args, **kwargs):
     self.cache = RobotsCache(*args, **kwargs)
     self.visited = collections.defaultdict(dict)
Example #59
0
class EZWS:
    robo = RobotsCache(capacity=100, cache_policy=ReraiseExceptionPolicy(0))
    data: List[str] = []
    """
	SELF:

	config json config file
	ua     user agent
	robo   robotcache obj
	soup   current html page soup obj
	raw    raw html from req.get()
	check  check for robot files, keep true
	output name of output csv file
	"""
    def __init__(self,
                 file: Union[str, Dict],
                 ua: str = "",
                 check: bool = True,
                 output: str = "output.csv") -> None:
        self.ua = ua

        self.check = check

        #setting output to false disables file output
        self.output = output

        self.configarr = _listify(file)

    def allowed(self, url: str) -> bool:
        if not self.check:
            return True

        try:
            if self.robo.allowed(url, self.ua):
                return True
            print(url, "is not allowed")

        except ConnectionException:
            print(url, "seems to be down")

        return False

    def download(self, url: str) -> Optional[Any]:
        if not self.allowed(url):
            return None

        self.raw = requests.get(url).content

        return BeautifulSoup(self.raw, "html.parser")

    def xpath(self, html: str, xp: str) -> List[Any]:
        return cast(List[Any], lxmlhtml.fromstring(html).xpath(xp))

    def select(self, html: Any, json: Dict) -> List[str]:
        xpath = json.get("xpath", "")
        css = json.get("css", "")

        if xpath:
            found = self.xpath(html.getText(), xpath)

            return [found[0]] if self.config["header"] else found

        #assume css was passed
        found = html.select(css)
        if self.config["header"]:
            found = [found[0]]

        completed = []
        for item in found:
            output = []

            contents = _listify(json["contents"])

            for content in contents:
                if content and item.has_attr(content):
                    output.append(item[content])

                else:
                    output.append(item.text)

            completed += output

        return completed

    def clear(self) -> None:
        self.data = []

    def load(self, index: int) -> None:
        config = self.configarr[index]

        if isinstance(config, Dict):
            self.config = config

        else:
            if os.path.exists(config):
                with open(config) as f:
                    self.config = json.load(f)

        return None

    def grab(self, index: Optional[int] = None) -> None:
        if index is None:
            #using grab() with no params will grab all configs passed
            for i in range(len(self.configarr)):
                self.grab(i)

            return None

        self.load(index)
        if self.output:
            sc = simplecsv(self.output, mode="w+")
            if self.config["header"]:
                sc.writerow(self.config["header"])

        for json in self.config["links"]:
            for link in chain(
                    *[explode(link) for link in _listify(json["urls"])]):
                if not self.allowed(link):
                    return None

                soup = self.download(link)
                if not soup:
                    print("could not download file")
                    return None

                for divs in soup.select(json["container"]):
                    data = []
                    for grab in json["grab"]:
                        data += self.select(divs, grab)

                    self.data += data
                    if self.output:
                        sc.writerow(data)

        if self.output:
            sc.close()
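
Finally, a minimal usage sketch for EZWS; the config dict is hypothetical but only uses keys the class actually reads ("header", "links", "urls", "container", "grab", "css", "contents"), the target URL is a placeholder, and explode() is assumed to pass plain URLs through unchanged:

# hypothetical config -- its shape is inferred from how grab()/select() read it
config = {
    "header": ["title"],
    "links": [{
        "urls": "http://example.com/archive",        # placeholder URL
        "container": "div.article",                  # placeholder CSS container
        "grab": [{"css": "h1 a", "contents": "href"}],
    }],
}

scraper = EZWS(config, ua="ezws-example", output="output.csv")
scraper.grab()          # checks robots.txt, downloads, and writes output.csv
print(scraper.data)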