def testRobot3(self):
     robots = RobotsCache()
     rules = robots.fetch("http://www.realwire.com/")
     crawl_delay = rules.delay("idiot")
     print("delay is:", crawl_delay)
     for i in range(1, 1000):
         print(rules.allowed("http://api.google.com/search/", agent="idiot"))
    def __FetchRobotFileInfo__(self, url, robotDictForDomains, timeStamp):
        domainName = self.__GetComSubdomainOfUrl__(url)
        robotUrl = ""

        if domainName not in robotDictForDomains:
            robotUrl = self.__GetRobotUrlForUrl__(domainName)
            cache = RobotsCache()
            try:
                timeStamp[domainName] = datetime.datetime.now()
                robotFileObj = cache.fetch(robotUrl)
                doesUrlExistOnline = self.__DoesUrlExistOnline__(robotUrl)
            except:
                doesUrlExistOnline = False
                robotDictForDomains[domainName] = (doesUrlExistOnline, object)

            if doesUrlExistOnline:
                robotDictForDomains[domainName] = (doesUrlExistOnline,
                                                   robotFileObj)
            else:
                robotDictForDomains[domainName] = (doesUrlExistOnline, object)

        doesUrlExistOnline = robotDictForDomains[domainName][0]
        robotFileObj = robotDictForDomains[domainName][1]
        # print "heyyy",robotUrl, doesUrlExistOnline, robotFileObj, robotDictForDomains
        return doesUrlExistOnline, robotFileObj, robotDictForDomains, timeStamp, domainName
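For readers skimming this example: the method above memoises one robots.txt fetch per domain and records whether the file was reachable. A minimal standalone sketch of the same idea, assuming the old reppy RobotsCache API used throughout these examples (the module-level names and the robots.txt URL construction are illustrative, not the original helpers):

import datetime
from reppy.cache import RobotsCache

_cache = RobotsCache()
_robots_by_domain = {}   # domain -> (reachable, rules or None)
_fetched_at = {}         # domain -> timestamp of the last fetch attempt

def robots_for(domain):
    """Fetch and memoise the robots.txt rules for a domain."""
    if domain not in _robots_by_domain:
        _fetched_at[domain] = datetime.datetime.now()
        try:
            # Illustrative robots.txt URL; the original builds it via a helper.
            rules = _cache.fetch(domain.rstrip('/') + '/robots.txt')
            _robots_by_domain[domain] = (True, rules)
        except Exception:
            _robots_by_domain[domain] = (False, None)
    return _robots_by_domain[domain]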
Example No. 3
 def testRobot3(self):
     robots = RobotsCache()
     rules = robots.fetch("http://www.realwire.com/")
     crawl_delay = rules.delay("idiot")
     print("delay is:", crawl_delay)
     for i in range(1, 1000):
         print(rules.allowed("http://api.google.com/search/",
                             agent="idiot"))
Example No. 4
def get_scanner_mock(request_limit):
    robots_cache = RobotsCache()
    robots_cache.fetch = MagicMock(return_value=robots_cache)
    robots_cache.allowed = MagicMock(return_value=True)
    robots_validator = RobotsValidator(agent='*')
    robots_validator.robots = robots_cache
    scanner = UrlScanner(request_limit)
    scanner.url_fetcher = get_url_fetcher_mock(request_limit)
    scanner.robots_validator = robots_validator
    return scanner
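The stubbing above works because MagicMock(return_value=robots_cache) makes fetch() hand back the cache object itself, so the validator's later rules.allowed(...) call lands on the stubbed allowed. A self-contained sketch of just that wiring, using only unittest.mock (names and URLs are illustrative):

from unittest.mock import MagicMock

robots_cache = MagicMock()
robots_cache.fetch = MagicMock(return_value=robots_cache)
robots_cache.allowed = MagicMock(return_value=True)

rules = robots_cache.fetch("http://example.com/robots.txt")
assert rules is robots_cache                                  # fetch returns the cache itself
assert rules.allowed("http://example.com/page", "*") is True  # allowed is stubbed to True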
	def robot_rules(_url_scheme, _url_netloc):  # return a robots rules object
		_domain = urlunparse((_url_scheme, _url_netloc, '', '', '', ''))
		robots = RobotsCache()
		try:
			rules = robots.fetch(_domain, timeout=5)
		except Exception as exc:
			print('FAIL to fetch robots.txt {},{}'.format(_url_scheme, _url_netloc))
			print(exc)
			return None
		return rules
 def robot_rules(_url_scheme, _url_netloc):  # return a robots rules object
     _domain = urlunparse((_url_scheme, _url_netloc, '', '', '', ''))
     robots = RobotsCache()
     try:
         rules = robots.fetch(_domain, timeout=5)
     except Exception as exc:
         print('FAIL to fetch robots.txt {},{}'.format(
             _url_scheme, _url_netloc))
         print(exc)
         return None
     return rules
Example No. 7
	def robot_rules(_url_scheme, _url_netloc):  # return a robots rules object
		#_parsed_url = urlparse(_url)
		_domain = urlunparse((_url_scheme, _url_netloc, '', '', '', ''))
		robots = RobotsCache()
		try:
			#print('DOMAIN: {}'.format(_domain))
			rules = robots.fetch(_domain)
		except Exception as exc:
			print('FAIL to fetch robots.txt')
			print(_url_scheme, _url_netloc)
			print(exc)
			return None
		return rules
Example No. 8
    def __init__(self, robots_url=None):
        if robots_url:
            robots = RobotsCache()
            self._rules = robots.fetch(robots_url)
            self.is_use_robots = True
        else:
            self.is_use_robots = False

        self._url_norm = UrlNorm()
        self.counter = 0
        self.urls = dict()
        self.connections = defaultdict(set)
        self._lock = RLock()
Example No. 9
    def __init__(self, robots_url=None):
        if robots_url:
            robots = RobotsCache()
            self._rules = robots.fetch(robots_url)
            self.is_use_robots = True
        else:
            self.is_use_robots = False

        self._url_norm = UrlNorm()
        self.counter = 0
        self.urls = dict()
        self.connections = defaultdict(set)
        self._lock = RLock()
Example No. 10
def crawl_Pages(Seed):
    r = RobotsCache()
    robots_url = urljoin(Seed, '/robots.txt')
    x = r.fetch(robots_url)
    unvisited = [Seed]
    visited = []
    cnt = 0
    delay = 5
    while unvisited:
        page = unvisited.pop(0)

        hdr = {'User-Agent': '*'}
        try:
            req = urllib2.Request(page, headers=hdr)
            pagecontent = urllib2.urlopen(req)
            if page not in visited:
                time.sleep(delay)
                s = pagecontent.read()
                if ishtmlcontent(pagecontent):
                    soup = BeautifulSoup(s)
                    links = soup.findAll('a', href=True)
                    for l in links:
                        if isurlvalid(l['href']):
                            u1 = urljoin(page, l['href'])
                            unvisited.append(u1)
                    if x.allowed(page, '*'):
                        visited.append(page)
                        cnt = cnt + 1
                        print cnt
                        print 'Crawled:' + page
                        visited = remove_duplicates(visited)
                else:
                    if page.endswith(".pdf"):
                        visited.append(page)
                        cnt = cnt + 1
                        print 'Crawled:' + page
                        visited = remove_duplicates(visited)
            if len(visited) == 100:
                unvisited = []
        except Exception, err:
            print Exception, err
            continue
Example No. 11
class RobotsValidator(object):
    """ Validates urls via robots.txt file """
    def __init__(self, agent):
        self._agent = agent
        self.robots = RobotsCache()

    def get_allowed_from(self, child_urls):
        """
        :param child_urls: List of child urls to check robots.txt on
        :return: A list of allowed child urls to crawl
        """
        allowed = []
        domains = list(set('{0}'.format(get_domain(url)) for url in child_urls))
        domain_to_children = {domain: filter(lambda u: get_domain(u) == domain, child_urls) for domain in domains}
        for domain in domain_to_children:
            try:
                rules = self.robots.fetch(domain)
                for url in domain_to_children[domain]:
                    if rules.allowed(url, self._agent):
                        allowed.append(url)
            except:
                allowed.extend(domain_to_children[domain])
        return allowed
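A rough standalone rendering of the same filter, grouping URLs by domain and falling back to "allow" when robots.txt cannot be fetched, exactly as the except branch above does (the import paths and helper name are assumptions for illustration):

from collections import defaultdict
from urllib.parse import urlparse
from reppy.cache import RobotsCache

def allowed_only(child_urls, agent='*'):
    """Return the subset of child_urls that robots.txt permits for the given agent."""
    cache = RobotsCache()
    by_domain = defaultdict(list)
    for url in child_urls:
        parsed = urlparse(url)
        by_domain['{}://{}'.format(parsed.scheme, parsed.netloc)].append(url)
    allowed = []
    for domain, children in by_domain.items():
        try:
            rules = cache.fetch(domain)
            allowed.extend(u for u in children if rules.allowed(u, agent))
        except Exception:
            allowed.extend(children)  # be permissive when the robots.txt fetch fails
    return allowed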
Example No. 12
class TestCache(unittest.TestCase):
    def setUp(self):
        self.robots = RobotsCache()

    def test_404(self):
        '''When we get a 404, assume free range'''
        with asis.Server('tests/asis/test_404', port=8080):
            self.assertEqual(self.robots.allowed(
                'http://localhost:8080/foo', 'rogerbot'), True)

    def test_caching(self):
        '''We should be able to cache results'''
        with asis.Server('tests/asis/test_caching', port=8080):
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_context_manager(self):
        '''When using as a context manager, it should clear afterwards'''
        with asis.Server('tests/asis/test_context_manager', port=8080):
            with self.robots:
                self.assertEqual(
                    self.robots.find('http://localhost:8080/foo'), None)
                self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
                self.assertNotEqual(
                    self.robots.find('http://localhost:8080/foo'), None)
            # And now, we should have it no longer cached
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_expires(self):
        '''Should be able to recognize expired rules'''
        with asis.Server('tests/asis/test_expires', port=8080):
            old_ttl = self.robots.min_ttl
            self.robots.min_ttl = 0
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo', True), None)
            # Now, it shouldn't be cached, so when we find it again, it should
            # be missing (or at least, requiring a refetch)
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo', False), None)
            self.robots.min_ttl = old_ttl

    def test_clear(self):
        '''Should be able to explicitly clear rules'''
        with asis.Server('tests/asis/test_clear', port=8080):
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            # Now if we clear the rules, we should not find it
            self.robots.clear()
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_fetch(self):
        '''Ensure that 'fetch' doesn't cache'''
        with asis.Server('tests/asis/test_fetch', port=8080):
            self.assertNotEqual(
                self.robots.fetch('http://localhost:8080/foo'), None)
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_cache(self):
        '''Ensure we can ask it to cache a result'''
        with asis.Server('tests/asis/test_cache', port=8080):
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            self.assertNotEqual(
                self.robots.cache('http://localhost:8080/foo'), None)
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_add(self):
        '''We should be able to add rules that we get'''
        with asis.Server('tests/asis/test_add', port=8080):
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            self.robots.add(self.robots.fetch(
                'http://localhost:8080/foo'))
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_server_error(self):
        '''Make sure we can catch server errors'''
        self.assertRaises(ServerError, self.robots.allowed,
            'http://localhost:8080/foo', 'rogerbot')

    def test_disallowed(self):
        '''Check the disallowed interface'''
        with asis.Server('tests/asis/test_disallowed', port=8080):
            self.assertFalse(self.robots.disallowed(
                'http://localhost:8080/foo', 'rogerbot'))
            urls = [
                'http://localhost:8080/foo',
                'http://localhost:8080/bar'
            ]
            self.assertEqual(self.robots.allowed(urls, 'rogerbot'), urls)
            self.assertEqual(self.robots.disallowed(urls, 'rogerbot'), [])

    def test_delay(self):
        '''Check the delay interface'''
        with asis.Server('tests/asis/test_delay', port=8080):
            self.assertEqual(self.robots.delay(
                'http://localhost:8080/foo', 'rogerbot'), 5)

    def test_sitemaps(self):
        '''Check the sitemaps interface'''
        with asis.Server('tests/asis/test_sitemaps', port=8080):
            self.assertEqual(
                self.robots.sitemaps('http://localhost:8080/foo'), [
                    'http://localhost:8080/a',
                    'http://localhost:8080/b',
                    'http://localhost:8080/c'
                ])
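Outside the asis test harness, the caching and context-manager behaviour exercised above boils down to the following sketch (the URL is a placeholder, the site must be reachable, and the same RobotsCache API as in the tests is assumed):

from reppy.cache import RobotsCache

robots = RobotsCache()
with robots:
    robots.allowed('http://example.com/foo', 'rogerbot')      # fetches and caches the rules
    assert robots.find('http://example.com/foo') is not None  # now present in the cache
assert robots.find('http://example.com/foo') is None          # cleared on exiting the block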
Example No. 13
class Worker(object):
    def __init__(self, args):
        if args.statsd and not re.match(REGEX_STATSD_HOST, args.statsd):
            raise Exception("Invalid statsd host provided")
        self.statsd_host = self.statsd_port = None
        self.statsd_disabled = True
        if args.statsd:
            self.statsd_host, self.statsd_port = args.statsd.rsplit(":", 1)
            self.statsd_port = int(self.statsd_port)
            self.statsd_disabled = False
        self.statsd_connection = statsd.Connection(host=self.statsd_host,
            port=self.statsd_port, sample_rate=0.5,
            disabled=self.statsd_disabled)
        self.statsd = statsd.Client("hydra_worker", self.statsd_connection)
        self.statsd_timers = {}
        self.statsd_counter = self.statsd.get_client(class_=statsd.Counter)
        
        self.get_info = self.time(self.get_info)
        utils.find_urls = self.time(utils.find_urls)
        self.get_jobs = self.time(self.get_jobs)
        self.done_jobs = self.time(self.done_jobs)
        self.get_tasks = self.time(self.get_tasks)
        
        self.args = args
        self.threads = []
        self.working = True
        self.break_now = False
        self.jobs = {}
        self.job_lock = threading.Lock()
        self.broker_lock = threading.Lock()
        self.insert_queue = queue()
        self.print_queue = queue()
        self.fill_queue = queue()
        self.robots = RobotsCache()
        self.worker_id = args.worker_id or uuid.uuid1().hex
        self.socket = context.socket(zmq.REQ)
        self.socket.connect(args.broker_address)
        self.get_info()
        self.headers = {"Accept-encoding": "gzip", "User-agent": self.info["u"]}
        if args.db_override:
            self.info["d"] += " host=%s" % args.db_override
        self.database = database.Database(self.info["d"])
            
    def get_timer(self):
        return self.statsd.get_client(class_=statsd.Timer)
    
    def time(self, function):
        def wrapper(*args, **kwargs):
            with self.get_timer().time(function.__name__):
                return function(*args, **kwargs)
        return wrapper
    
    def start(self):
        threaded_jobs = []
        threaded_jobs.append(self.do_heartbeat)
        threaded_jobs.append(self.insert_urls)
        threaded_jobs.append(self.printer)
        threaded_jobs.append(self.fill_jobs)
        for job in threaded_jobs:
            new_thread = threading.Thread(target=job)
            new_thread.daemon = True
            new_thread.start()
            self.threads.append(new_thread)
        self.do_jobs()
    
    def get_info(self):
        self.socket.send(make_message(BYTE_HELLO, self.worker_id))
        response = check_message(self.socket.recv(), BYTE_HELLO)
        self.info = response[1]
    
    def get_jobs(self, count = 1):
        self.print_queue.put("broker getting jobs")
        jobs = {}
        
        first_loop = True
        while self.working and len(jobs) < count:
            if not first_loop:
                time.sleep(2)
            first_loop = False
            count_needed = count-len(jobs)-1
            self.broker_lock.acquire()
            self.socket.send(make_message(BYTE_GET_JOB, self.worker_id, 
                count_needed))
            response = check_message(self.socket.recv(), BYTE_GET_JOB)
            self.broker_lock.release()
            for job, tasks in response[1]:
                job = job.decode("utf8")
                tasks = [[https, task.decode("utf8")] for https, task in tasks]
                jobs[job] = tasks
        
        for job, tasks in jobs.items():
            try:
                robots = self.robots.fetch("http://%s" % job, timeout=5)
                sleep = robots.delay(self.info["n"])
            except Exception as e:
                robots = sleep = None
                self.print_queue.put("failed to get robots")
            sleep = sleep or self.info["s"]
            self.jobs[job] = {"tasks": tasks, "robots": robots, "sleep": sleep,
                "timestamp": 0.0}
            self.insert_queue.put([job, False, job])
        task_count = 0
        for _, tasks in jobs.items():
            task_count += len(tasks)
        self.print_queue.put("broker got %d jobs (%d tasks)" % (count,
            task_count))
    
    def done_jobs(self, *jobs):
        self.broker_lock.acquire()
        self.socket.send(make_message(BYTE_JOB_DONE, *jobs))
        self.socket.recv()
        self.broker_lock.release()
        self.job_lock.acquire()
        for job in jobs:
            self.jobs.pop(job)
        self.job_lock.release()
    
    def get_tasks(self, job):
        if not job in self.jobs:
            return 0
        new_urls = self.database.get_urls(job)
        self.print_queue.put("got tasks for %s (%d)" % (job, len(new_urls)))
        new_urls = [[https, task.decode("utf8")] for https, task in new_urls]
        new_allowed_urls = []
        for new_https, new_url in new_urls:
            task_scheme = "%s%s" % (utils.get_scheme(new_https), new_url)
            if not self.jobs[job]["robots"] or self.jobs[job]["robots"].allowed(
                    task_scheme, self.info["n"]):
                new_allowed_urls.append([new_https, new_url])
            else:
                self.database.timestamp(new_url)
        self.jobs[job]["tasks"] += new_urls
        return len(new_urls)
    
    def fill_jobs(self):
        while self.working:
            job = self.fill_queue.get(True)
            if self.get_tasks(job) == 0 and not self.jobs[job]["tasks"]:
                self.print_queue.put("removing job %s" % job)
                self.done_jobs(job)
                self.get_jobs()
    
    def yield_tasks(self):
        while self.working:
            tasks = []
            for job in list(self.jobs):
                if not self.jobs[job]["tasks"]:
                    self.fill_queue.put(job)
                    continue
                after_delay = self.jobs[job]["timestamp"]+self.jobs[job]["sleep"]
                time_since = after_delay-time.time()
                if not time_since > 0 or self.jobs[job]["timestamp"] == 0.0:
                    https, task = self.jobs[job]["tasks"].pop(0)
                    if not len(self.jobs[job]["tasks"]) > 0:
                        self.fill_queue.put(job)
                    task_scheme = "%s%s" % (utils.get_scheme(https), task)
                    tasks.append(task_scheme)
                    self.jobs[job]["timestamp"] = time.time()
            if tasks:
                yield tasks
    
    def printer(self):
        while self.working:
            line = self.print_queue.get(True)
            print datetime.datetime.now(), line
    
    def insert_urls(self):
        while self.working:
            self.database.start_transaction()
            urls = []
            hostnames = set([])
            for insert in xrange(self.info["b"]):
                try:
                    to_insert = self.insert_queue.get(True, 4)
                except Empty:
                    break
                if to_insert[0] and to_insert[2]:
                    urls.append(to_insert)
                    hostnames.add((to_insert[2],))
                self.insert_queue.task_done()
            if urls and hostnames:
                self.database.insert(urls, hostnames)
            self.database.stop_transaction()
    
    def do_jobs(self):
        self.get_jobs(self.args.jobs or 2)
        time_before_yield = 0.0
        for tasks in self.yield_tasks():
            if not time_before_yield == 0.0:
                self.print_queue.put("got yield tasks, took %f seconds" % (
                    time.time()-time_before_yield))
            time_before_get = time.time()
            
            get_requests = dict([(grequests.get(task, timeout=5), task)
                for task in tasks])
            grequests.map(get_requests.keys(), stream=True)
            self.print_queue.put("got responses, took %f seconds" % (
                time.time()-time_before_get))
            found_count = 0.0
            get_successful = []
            get_failed = []
            get_wrong_type = []
            get_responses = {}
            time_before_process = time.time()
            for request in list(get_requests):
                original_url = get_requests[request].split("://", 1)[-1]
                response = request.response
                if not response or not response.status_code < 400:
                    get_failed.append(original_url)
                elif not response.headers.get("content-type", "").startswith(
                        "text/"):
                    get_wrong_type.append(original_url)
                else:
                    get_responses[response] = original_url
            self.print_queue.put("finished processing, took %f seconds" % (
                time.time()-time_before_process))
            time_before_responses = time.time()
            gevent.joinall([gevent.spawn(getattr, response, "text") for response
                in get_responses])
            self.print_queue.put("got second responses, took %f seconds" % (
                time.time()-time_before_responses))
            time_before_second_process = time.time()
            for response in list(get_responses):
                original_url = get_responses[response]
                try:
                    text = response.text
                except:
                    get_failed.append(original_url)
                    continue
                if not text:
                    get_failed.append(original_url)
                else:
                    actual_url = response.url
                    get_successful.append(original_url)
                    found_urls = utils.find_urls(response.text, actual_url)
                    found_count += len(found_urls)
                    for url in found_urls:
                        url_parts = utils.process_url(url)
                        if url_parts:
                            self.statsd_counter.increment("url_found")
                            self.insert_queue.put(url_parts)
            
            self.print_queue.put("finished second processing, took %f seconds" %
                (time.time()-time_before_second_process))
            time_taken = time.time()-time_before_get
            stats = "tried %d" % len(tasks)
            stats += ", success %d" % len(get_successful)
            stats += ", fail %d" % len(get_failed)
            stats += ", wrong %d" % len(get_wrong_type)
            stats += ", took %f seconds" % time_taken
            if get_successful:
                stats += ", found %d" % found_count
                stats += ", %f/site" % (found_count/len(get_successful))
                stats += ", %f/second" % (found_count/time_taken)
            self.print_queue.put(stats)
            for url in get_successful:
                self.database.timestamp(url)
            for url in get_failed:
                self.database.timestamp(url, 1)
            for url in get_wrong_type:
                self.database.timestamp(url, 2)
            time_before_join = time.time()
            self.insert_queue.join()
            self.print_queue.put("finished insert queue join, took %f seconds" %
                (time.time()-time_before_join))
            time_before_yield = time.time()
    
    def do_heartbeat(self):
        while self.working:
            time.sleep(self.info["h"])
            request = make_message(BYTE_HEARTBEAT, self.worker_id,
                *list(self.jobs))
            
            self.broker_lock.acquire()
            self.socket.send(request)
            response = check_message(self.socket.recv(), BYTE_HEARTBEAT)
            self.broker_lock.release()
            if len(response) > 1 and response[1] == BYTE_GET_JOB:
                for bad_job in response[2:]:
                    self.job_lock.acquire()
                    assert bad_job in self.jobs
                    del self.jobs[bad_job]
                    self.job_lock.release()
                    self.get_jobs()
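Pulled out of get_jobs above, the per-domain politeness logic is simply: ask robots.txt for a crawl delay and fall back to a configured default when it is missing or unfetchable. A hedged standalone sketch (agent string and default value are placeholders):

from reppy.cache import RobotsCache

def polite_delay(domain, agent='hydra', default=2.0):
    """Return the robots.txt crawl-delay for domain, or default if absent or unreachable."""
    cache = RobotsCache()
    try:
        rules = cache.fetch('http://%s' % domain, timeout=5)
        delay = rules.delay(agent)
    except Exception:
        delay = None
    return delay or default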
Example No. 14
#################################################
#################################################
# fetch content of url

while len(url_frontier) != 0:
    # pop any random url
    url = url_frontier.pop()
    
    try:        
        print("\n---------------------------------------------------------")
        print("Crawling:", url)
        print("---------------------------------------------------------")


        # get crawl delay
        r = robots_cache.fetch(Robots.robots_url(url))[1]

        # check whether crawling this url is allowed; if not, skip it
        if not robots_cache.allowed(url, '*'):
            print("This URL is restricted to be crawled.")
            continue

        # insert this link to database
        cur.execute("INSERT OR IGNORE INTO crawled_urls (url_link) values(?)", (url,))

        # if it's allowed to crawl, get the crawl delay
        crawl_delay = r.agent("*").delay

        if crawl_delay is not None:
            time.sleep(crawl_delay)
        else:
Example No. 15
class Hodor(object):
    def __init__(self, url, config={}, proxies={},
                 auth=None, ua=DEFAULT_HODOR_UA,
                 pagination_max_limit=DEFAULT_HODOR_MAX_PAGES,
                 crawl_delay=DEFAULT_CRAWL_DELAY,
                 ssl_verify=False,
                 trim_values=True,
                 robots=True,
                 reppy_capacity=100):

        self.content = None
        self.url = url
        self.domain = self._get_domain()
        self.proxies = proxies
        self.auth = auth
        self.ua = ua
        self.trim_values = trim_values
        self.ssl_verify = ssl_verify
        self.config = {}
        self.extra_config = {}

        self.robots = RobotsCache(capacity=reppy_capacity) if robots else None

        self._pages = []
        self._page_count = 0
        self._pagination_max_limit = pagination_max_limit
        self.crawl_delay = self._crawl_delay(crawl_delay)

        for k, v in config.items():
            if k.startswith("_"):
                self.extra_config[k.lstrip("_")] = v
            else:
                self.config[k] = v

    def _get_domain(self):
        parsed_uri = urlparse(self.url)
        return '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)

    def _crawl_delay(self, crawl_delay):
        if self.robots not in EMPTY_VALUES:
            expiry, robots = self.robots.fetch('{}robots.txt'.format(self.domain))
            delay = robots.agent(self.ua).delay
            try:
                crawl_delay = max(filter(partial(is_not, None),
                                         [delay, crawl_delay]))
            except ConnectionException:
                pass
        return crawl_delay

    def _fetch(self, url):
        '''Does the requests fetching and stores result in self.content'''

        if self.robots in EMPTY_VALUES or self.robots.allowed(url, self.ua):
            session = requests.session()
            headers = {'User-Agent': self.ua}
            if len(self.proxies) > 0:
                session.proxies = self.proxies
            if self.auth:
                r = session.get(url, headers=headers, auth=self.auth, verify=self.ssl_verify)
            else:
                r = session.get(url, headers=headers, verify=self.ssl_verify)
            self.content = r.content

        return self.content

    @staticmethod
    def _get_value(content, rule):
        '''Returns result for a specific xpath'''
        try:
            tree = html.fromstring(content)
        except TypeError:
            tree = None

        post_processing = rule.get('transform', lambda data: data)

        data = ""
        if tree not in EMPTY_VALUES:
            if 'xpath' in rule:
                data = tree.xpath(rule['xpath'])
            elif 'css' in rule:
                data = [node.text_content() for node in tree.cssselect(rule['css'])]

            many = rule.get('many', True)
            if not many:
                if len(data) == 0:
                    data = None
                else:
                    data = post_processing(data[0])
            else:
                data = [post_processing(d) for d in data]

        return data

    @staticmethod
    def _group_data(data, groups, config):
        del_fields = []
        for dest, group_fields in groups.items():
            if '__all__' in group_fields or group_fields == '__all__':
                group_fields = [rule for rule in config.keys() if not rule.startswith('_')]
                del_fields.extend(group_fields)

            gdata = []
            for field in group_fields:
                gdata.append(data[field])

            data[dest] = []
            for gd in zip(*gdata):
                d = {}
                for i, field in enumerate(group_fields):
                    d[field] = gd[i]
                data[dest].append(d)

        if len(del_fields) == 0:
            del_fields = [field for field_set in groups.values() for field in field_set]

        for field in del_fields:
            if field in data:
                del data[field]

    def _package_pages(self):
        self._data = {}
        if len(self._pages) == 1:
            self._data = self._pages[0]
        else:
            self._data = {key: [] for key in self._pages[0].keys()}
            for page in self._pages:
                for k, v in page.items():
                    if hasattr(v, '__iter__'):
                        self._data[k].extend(v)
                    else:
                        self._data[k].append(v)
        return self._data

    @classmethod
    def _parse(cls, content, config={}, extra_config={}, trim_values=True):
        '''Parses the content based on the config set'''
        if len(config) == 0:
            _data = {'content': content}
        else:
            _data = {}

            try:
                str_class = basestring
            except NameError:
                str_class = str

            for key, rule in config.items():
                value = cls._get_value(content, rule)
                if trim_values and value not in EMPTY_VALUES:
                    if 'many' in rule and rule['many']:
                        value = [v.strip() if isinstance(v, str_class) else v for v in value]
                    else:
                        value = value.strip() if isinstance(value, str_class) else value
                _data[key] = value

        paginate_by = extra_config.get('paginate_by')
        if paginate_by:
            paginate_by = cls._get_value(content, paginate_by)

        groups = extra_config.get('groups', {})
        if groups:
            cls._group_data(_data, groups, config)
        return _data, paginate_by

    def _get(self, url):
        self._fetch(url)
        data, paginate_by = self._parse(self.content, self.config, self.extra_config, self.trim_values)

        if paginate_by not in EMPTY_VALUES:
            paginate_by = urljoin(self.domain, paginate_by)

        return data, paginate_by

    def get(self, url=None):
        url = url if url else self.url
        self._data, paginate_by = self._get(url)

        self._pages.append(self._data)
        self._page_count += 1

        if paginate_by and self._page_count < self._pagination_max_limit:
            time.sleep(self.crawl_delay)
            self.get(paginate_by)

        self._package_pages()
        return self._data

    @property
    def data(self):
        if not hasattr(self, '_data'):
            self.get()
        return self._data
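A hedged usage sketch for the Hodor class above; the URL and xpath rule are placeholders, and the module-level constants and imports from Hodor's original file (DEFAULT_HODOR_UA, EMPTY_VALUES, requests, lxml's html, RobotsCache) are assumed to be available:

# Hypothetical scrape: collect article titles from a listing page.
config = {
    'titles': {'xpath': '//h2/a/text()', 'many': True},
}
scraper = Hodor('http://example.com/articles', config=config, robots=True)
print(scraper.data['titles'])   # .data triggers get(), which fetches, parses and paginates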
Example No. 16
 def _fetch_sitemap_from_url(self, url):
     robots = RobotsCache()
     try:
         return robots.fetch(url, timeout=1.5).sitemaps
     except:
         return []
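The cache also exposes a sitemaps(url) convenience, used in the TestCache examples elsewhere on this page, so an equivalent lookup with the same fall-back-to-empty behaviour might be sketched as (URL handling is illustrative):

from reppy.cache import RobotsCache

def sitemaps_for(url):
    """Return the sitemap URLs advertised by the site's robots.txt, or [] on any failure."""
    robots = RobotsCache()
    try:
        return robots.sitemaps(url)
    except Exception:
        return []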
Example No. 17
class TestCache(unittest.TestCase):
    def setUp(self):
        self.robots = RobotsCache()

    def test_404(self):
        '''When we get a 404, assume free range'''
        with asis.Server('tests/asis/test_404', port=8080):
            self.assertEqual(
                self.robots.allowed('http://localhost:8080/foo', 'rogerbot'),
                True)

    def test_caching(self):
        '''We should be able to cache results'''
        with asis.Server('tests/asis/test_caching', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'),
                                None)

    def test_context_manager(self):
        '''When using as a context manager, it should clear afterwards'''
        with asis.Server('tests/asis/test_context_manager', port=8080):
            with self.robots:
                self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                                 None)
                self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
                self.assertNotEqual(
                    self.robots.find('http://localhost:8080/foo'), None)
            # And now, we should have it no longer cached
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)

    def test_expires(self):
        '''Should be able to recognize expired rules'''
        with asis.Server('tests/asis/test_expires', port=8080):
            old_ttl = self.robots.min_ttl
            self.robots.min_ttl = 0
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo',
                                 fetch_if_missing=True), None)
            # If we ignore the TTL, it should still be there.
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo',
                                 fetch_if_missing=False,
                                 honor_ttl=False), None)
            # However, if we honor the TTL, it should be missing in the cache.
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo',
                                 fetch_if_missing=False), None)
            self.robots.min_ttl = old_ttl

    def test_clear(self):
        '''Should be able to explicitly clear rules'''
        with asis.Server('tests/asis/test_clear', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'),
                                None)
            # Now if we clear the rules, we should not find it
            self.robots.clear()
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)

    def test_fetch(self):
        '''Ensure that 'fetch' doesn't cache'''
        with asis.Server('tests/asis/test_fetch', port=8080):
            self.assertNotEqual(self.robots.fetch('http://localhost:8080/foo'),
                                None)
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)

    def test_cache(self):
        '''Ensure we can ask it to cache a result'''
        with asis.Server('tests/asis/test_cache', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)
            self.assertNotEqual(self.robots.cache('http://localhost:8080/foo'),
                                None)
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'),
                                None)

    def test_add(self):
        '''We should be able to add rules that we get'''
        with asis.Server('tests/asis/test_add', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)
            self.robots.add(self.robots.fetch('http://localhost:8080/foo'))
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'),
                                None)

    def test_server_error(self):
        '''Make sure we can catch server errors'''
        with mock.patch.object(self.robots.session,
                               'get',
                               side_effect=TypeError):
            self.assertRaises(ServerError, self.robots.allowed,
                              'http://localhost:8080/foo', 'rogerbot')

    def test_disallowed(self):
        '''Check the disallowed interface'''
        with asis.Server('tests/asis/test_disallowed', port=8080):
            self.assertFalse(
                self.robots.disallowed('http://localhost:8080/foo',
                                       'rogerbot'))
            urls = ['http://localhost:8080/foo', 'http://localhost:8080/bar']
            self.assertEqual(self.robots.allowed(urls, 'rogerbot'), urls)
            self.assertEqual(self.robots.disallowed(urls, 'rogerbot'), [])

    def test_delay(self):
        '''Check the delay interface'''
        with asis.Server('tests/asis/test_delay', port=8080):
            self.assertEqual(
                self.robots.delay('http://localhost:8080/foo', 'rogerbot'), 5)

    def test_sitemaps(self):
        '''Check the sitemaps interface'''
        with asis.Server('tests/asis/test_sitemaps', port=8080):
            self.assertEqual(
                self.robots.sitemaps('http://localhost:8080/foo'), [
                    'http://localhost:8080/a', 'http://localhost:8080/b',
                    'http://localhost:8080/c'
                ])

    def test_dns_exception(self):
        '''Raises an exception if url does not resolve.'''
        self.assertRaises(ConnectionException, self.robots.allowed,
                          'http://does-not-resolve', 'rogerbot')

    def test_malformed_url(self):
        '''Raises an exception if the url is malformed.'''
        self.assertRaises(MalformedUrl, self.robots.allowed, 'hhttp://moz.com',
                          'rogerbot')

    def test_ssl_exception(self):
        '''Raises an exception if there is an ssl error.'''
        with asis.Server('tests/asis/test_ssl_exception', port=8080):
            self.assertRaises(SSLException, self.robots.allowed,
                              'https://localhost:8080', 'rogerbot')

    def test_excessive_redirects(self):
        '''Raises an exception if there are too many redirects.'''
        with asis.Server('tests/asis/test_excessive_redirects', port=8080):
            self.assertRaises(ExcessiveRedirects, self.robots.allowed,
                              'http://localhost:8080/one', 'rogerbot')

    def test_bad_status_codes(self):
        '''Raises an exception if there is a 5xx status code.'''
        with asis.Server('tests/asis/test_bad_status_codes', port=8080):
            self.assertRaises(BadStatusCode, self.robots.allowed,
                              'http://localhost:8080', 'rogerbot')
Example No. 18
class TestCache(unittest.TestCase):
    def setUp(self):
        self.robots = RobotsCache()

    def test_404(self):
        '''When we get a 404, assume free range'''
        with asis.Server('tests/asis/test_404', port=8080):
            self.assertEqual(
                self.robots.allowed('http://localhost:8080/foo', 'rogerbot'),
                True)

    def test_caching(self):
        '''We should be able to cache results'''
        with asis.Server('tests/asis/test_caching', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'),
                                None)

    def test_context_manager(self):
        '''When using as a context manager, it should clear afterwards'''
        with asis.Server('tests/asis/test_context_manager', port=8080):
            with self.robots:
                self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                                 None)
                self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
                self.assertNotEqual(
                    self.robots.find('http://localhost:8080/foo'), None)
            # And now, we should have it no longer cached
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)

    def test_expires(self):
        '''Should be able to recognize expired rules'''
        with asis.Server('tests/asis/test_expires', port=8080):
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo', True), None)
            # Now, it shouldn't be cached, so when we find it again, it should
            # be missing (or at least, requiring a refetch)
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo', False), None)

    def test_clear(self):
        '''Should be able to explicitly clear rules'''
        with asis.Server('tests/asis/test_clear', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'),
                                None)
            # Now if we clear the rules, we should not find it
            self.robots.clear()
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)

    def test_fetch(self):
        '''Ensure that 'fetch' doesn't cache'''
        with asis.Server('tests/asis/test_fetch', port=8080):
            self.assertNotEqual(self.robots.fetch('http://localhost:8080/foo'),
                                None)
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)

    def test_cache(self):
        '''Ensure we can ask it to cache a result'''
        with asis.Server('tests/asis/test_cache', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)
            self.assertNotEqual(self.robots.cache('http://localhost:8080/foo'),
                                None)
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'),
                                None)

    def test_add(self):
        '''We should be able to add rules that we get'''
        with asis.Server('tests/asis/test_add', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)
            self.robots.add(self.robots.fetch('http://localhost:8080/foo'))
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'),
                                None)

    def test_server_error(self):
        '''Make sure we can catch server errors'''
        self.assertRaises(ServerError, self.robots.allowed,
                          'http://localhost:8080/foo', 'rogerbot')

    def test_disallowed(self):
        '''Check the disallowed interface'''
        with asis.Server('tests/asis/test_disallowed', port=8080):
            self.assertFalse(
                self.robots.disallowed('http://localhost:8080/foo',
                                       'rogerbot'))
            urls = ['http://localhost:8080/foo', 'http://localhost:8080/bar']
            self.assertEqual(self.robots.allowed(urls, 'rogerbot'), urls)
            self.assertEqual(self.robots.disallowed(urls, 'rogerbot'), [])

    def test_delay(self):
        '''Check the delay interface'''
        with asis.Server('tests/asis/test_delay', port=8080):
            self.assertEqual(
                self.robots.delay('http://localhost:8080/foo', 'rogerbot'), 5)

    def test_sitemaps(self):
        '''Check the sitemaps interface'''
        with asis.Server('tests/asis/test_sitemaps', port=8080):
            self.assertEqual(
                self.robots.sitemaps('http://localhost:8080/foo'), [
                    'http://localhost:8080/a', 'http://localhost:8080/b',
                    'http://localhost:8080/c'
                ])