Example no. 1
	def download_pages_in_queue(self, queue):		
		current_page_url = queue.get()
		
		robot = RobotsCache()
		if (robot.allowed(current_page_url, "*")):

			print current_page_url
			if len(current_page_url) < 10: return	
			current_page_html = download_page_by_url(current_page_url)			
			bs = BeautifulSoup(current_page_html, "html.parser")

			links = bs.find_all('a', href=True)
			post_links = [link['href'] for link in links]
			
			for post_link in post_links:
				if len(post_link) < 10: continue
				if str(post_link).find('http') != 0:
					post_link = str(self.start_url) + str(post_link)
				queue.put(post_link)
			self.sites_num = self.sites_num + 1		

			page = Pages(url = current_page_url, parsed_text = get_text_from_html(current_page_html), is_indexed = False)
			page.save()
		else:
			print "Page can't be indexed because of the rules in ROBOTS.TXT"	
Example no. 2
    def download_pages_in_queue(self, queue):
        current_page_url = queue.get()

        robot = RobotsCache()
        if (robot.allowed(current_page_url, "*")):

            print current_page_url
            if len(current_page_url) < 10: return
            current_page_html = download_page_by_url(current_page_url)
            bs = BeautifulSoup(current_page_html, "html.parser")

            links = bs.find_all('a', href=True)
            post_links = [link['href'] for link in links]

            for post_link in post_links:
                if len(post_link) < 10: continue
                if str(post_link).find('http') != 0:
                    post_link = str(self.start_url) + str(post_link)
                queue.put(post_link)
            self.sites_num = self.sites_num + 1

            page = Pages(url=current_page_url,
                         parsed_text=get_text_from_html(current_page_html),
                         is_indexed=False)
            page.save()
        else:
            print "Page can't be indexed because of the rules in ROBOTS.TXT"
Example no. 3
class RobotsMiddleware(BaseMiddleware):

    def __init__(self, *args, **kwargs):
        self.cache = RobotsCache(*args, **kwargs)
        self.visited = collections.defaultdict(dict)

    def check_disallow(self, url, agent):
        if not self.cache.allowed(url, agent):
            raise RobotsDisallowedError

    def check_crawl_delay(self, url, agent):
        delay = self.cache.delay(url, agent)
        if delay is None:
            return
        now = datetime.datetime.utcnow()
        host = urlparse.urlparse(url).hostname
        try:
            last_visit = self.visited[agent][host]
            if (now - last_visit).seconds < delay:
                raise RobotsThrottledError
        except KeyError:
            pass
        self.visited[agent][host] = now

    def before_send(self, request, *args, **kwargs):
        url = request.url
        agent = request.headers.get('User-Agent')
        self.check_disallow(url, agent)
        self.check_crawl_delay(url, agent)
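
This middleware combines two RobotsCache calls: allowed() for permission and delay() for the per-host Crawl-delay. The same throttling idea without the middleware plumbing, as a sketch (the per-host bookkeeping mirrors check_crawl_delay above; the helper name is made up):

import datetime
from reppy.cache import RobotsCache
try:
    from urllib.parse import urlparse      # Python 3
except ImportError:
    from urlparse import urlparse          # Python 2, as in the example

cache = RobotsCache()
last_visit = {}                            # hostname -> datetime of the last request

def seconds_to_wait(url, agent):
    """How long to sleep before requesting url, honouring Crawl-delay."""
    delay = cache.delay(url, agent)        # None when robots.txt declares no delay
    if delay is None:
        return 0
    host = urlparse(url).hostname
    now = datetime.datetime.utcnow()
    last = last_visit.get(host)
    last_visit[host] = now
    if last is None:
        return 0
    return max(0.0, delay - (now - last).total_seconds())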
Example no. 4
 def robot_pass(self,page):
     """
     Accepts page [object]
     Creates instance of RobotsCache (from reppy)
     Passes URL of page as string into robots.allowed method
     Returns True or False
     """
     robots = RobotsCache()
     return robots.allowed(page.get_url(), '*')
def get_scanner_mock(request_limit):
    robots_cache = RobotsCache()
    robots_cache.fetch = MagicMock(return_value=robots_cache)
    robots_cache.allowed = MagicMock(return_value=True)
    robots_validator = RobotsValidator(agent='*')
    robots_validator.robots = robots_cache
    scanner = UrlScanner(request_limit)
    scanner.url_fetcher = get_url_fetcher_mock(request_limit)
    scanner.robots_validator = robots_validator
    return scanner
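
get_scanner_mock stubs fetch and allowed so the scanner can be tested without network access. The same idea in isolation, using unittest.mock and only the RobotsCache methods shown above (RobotsValidator and UrlScanner are project-specific and omitted):

from unittest.mock import MagicMock        # or `from mock import MagicMock` on Python 2
from reppy.cache import RobotsCache

def make_permissive_robots():
    """A RobotsCache whose checks always pass, for offline unit tests."""
    robots = RobotsCache()
    robots.fetch = MagicMock(return_value=robots)    # never hits the network
    robots.allowed = MagicMock(return_value=True)    # treats every URL as crawlable
    return robots

robots = make_permissive_robots()
assert robots.allowed("http://example.com/page", "*")
robots.allowed.assert_called_once_with("http://example.com/page", "*")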
Example no. 6
 def check_for_robot_access(self, page):
     self.f.write('--- checking for robots %s\n' % page)
     robots = RobotsCache()
     try:
         if robots.allowed(page + 'robots.txt', 'my-agent'):
             print 'robots allowed'
             self.f.write('robots allowed. \n')
             return True
     except ServerError, r:
         print 'error ', r
         return False
	def check_for_robot_access(self, page):
		self.f.write('--- checking for robots %s\n' %page)
		robots = RobotsCache()
		try:
			if robots.allowed(page+'robots.txt', 'my-agent'):
				print 'robots allowed'
				self.f.write('robots allowed. \n')
				return True
		except ServerError, r:
			print 'error ', r
			return False
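
Both copies of check_for_robot_access append 'robots.txt' to the page URL before calling allowed(), and silently return None when the page is disallowed. reppy resolves the robots.txt location itself, so the usual call passes the page URL directly; a corrected sketch (the ServerError import path is an assumption based on the test examples further down this page):

from reppy.cache import RobotsCache
from reppy.exceptions import ServerError    # assumed location of the exception

def check_for_robot_access(page, agent='my-agent', log=None):
    """True if page may be crawled, False on disallow or on robots.txt errors."""
    robots = RobotsCache()
    try:
        allowed = robots.allowed(page, agent)    # pass the page URL, not page + 'robots.txt'
    except ServerError as err:
        if log:
            log.write('error fetching robots.txt: %s\n' % err)
        return False
    if log:
        log.write('robots %s for %s\n' % ('allowed' if allowed else 'disallowed', page))
    return allowed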
Example no. 8
 def get_text_by_base_url(self):
     robots = RobotsCache(capacity=100)
     if not robots.allowed(self.base_url, "python-requests"):
         return ["Crawling this site is not allowed by robots.txt"]
     text_list = []
     for slug in self.__get_links_by_url_depth():
         sleep(0.5)
         text_list.append(
             remove_emoji(
                 remove_url(self.__get_text_by_url(self.base_url +
                                                   slug))).strip())
     return text_list
def confirm_robots_txt(target_url, max_capacity):
    '''Confirm that the target URL is allowed to be crawled.

    :type target_url: str
    :param target_url: URL the agent wants to crawl
    :type max_capacity: int
    :param max_capacity: cache capacity passed to RobotsCache
    :rtype: bool
    :return: whether it is possible to scrape
    '''
    robots = RobotsCache(max_capacity)
    return robots.allowed(target_url, 'python program')
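
Hypothetical usage of confirm_robots_txt; the URL is invented and max_capacity is simply forwarded to RobotsCache:

target = "https://example.com/products"             # hypothetical target URL
if confirm_robots_txt(target, max_capacity=100):
    print("allowed to crawl", target)
else:
    print("blocked by robots.txt", target)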
Example no. 10
    def run(self):
        global terminator
        pattern='(http://)(\w*\.)+\w+(/\w*)*'
        #Initialize RobotsCache object
        robots=RobotsCache()
        while 1:
            if terminator:
                break
            cur_raw_tweet=raw_tweets.get(True)
            curtweet=json.loads(cur_raw_tweet)
            if DEBUG:
                print "Got an item from raw_tweets", current_thread().getName()

            # Check if Twitter has rate limited you by sending a blank tweet
            if u'text' in curtweet.keys():
                text=curtweet[u'text']
            else:
                print "Rate limited by twitter. Continuing"
                continue

            #Get text and check if it has links using regex.
            link=re.search(pattern,text)
            if link:
                if DEBUG:
                    print "match"
                flink=link.group()

                #Check if crawling is allowed
                try:
                    if robots.allowed(flink,'tweetbot'):
                        soup=BeautifulSoup(urllib2.urlopen(flink),"lxml")

                        #Check if page has title
                        if soup.title:
                            curtweet[u'linkTitle']=soup.title.string
                except reppy.ReppyException:
                    print "Error fetching robots.txt. Continuing"
                    continue
                except urllib2.URLError:
                    print "Bad Url. Report to the developer. Continuing"
                    continue
                except urllib2.HTTPError:
                    print "Error Fetching Web Page. Continuing"
                    continue

            else:
                if DEBUG:
                    print "not match"

            processed_tweets.put(json.dumps(curtweet),True)
            if DEBUG:
                print "Put on processed queue. ProcessedSize", processed_tweets.qsize()
Example no. 11
class EZWS:
    """
	SELF:

	config json config file
	ua     user agent
	robo   robotcache obj
	link   current link
	urlp   url parse object for current link
	soup   current html page soup obj
	req    requests obj
	raw    raw html from req.get()
	check  check for robot files, keep true
	output name of output csv file
	"""
    def __init__(self,
                 file,
                 ua,
                 check=True,
                 output="output.csv"
                 ):  #setting output to false disables file output
        if check:  #only setup robot checker if robot checking is enabled
            self.ua = ua  #user agent
            self.robo = RobotsCache(capacity=100)

        #check var disables or enables robots.txt checking
        #recommended to keep default True value
        self.check = check
        self.req = requests  #request obj for parsing url

        self.output = output  #where to output file

        self.data = []  #init array of grabbed sites

        self.configarr = []  #empty array of all configs

        if type(file) is list:
            self.configarr = file
        else:
            self.configarr.append(file)

    def allowed(self, url):  #checks if url is ok to download
        if self.check:
            if self.robo.allowed(url, self.ua):  #checks robot file
                return True
            else:
                print(url, "is not allowed")  #notify user if url isnt allowed
                return False
        else:
            return True  #if robot checking is off, return true regardless

    @property  #when url is called, return it
    def url(self):
        if hasattr(self, "link"):  #handles whether self has link attribute
            return self.link
        else:
            return ""  #if not return empty string

    @url.setter  #when url is set, parse it
    def url(self, url):
        self.link = url
        self.urlp = urlparse(url)

    def download(self, url):
        if self.allowed(url):
            self.raw = self.req.get(url).content
            self.soup = BeautifulSoup(self.raw,
                                      "html.parser")  #loads html into soup obj

    def xpath(self, html, xp):  #takes html and returns data from xpath
        tree = lxmlhtml.fromstring(html)  #generates tree
        return tree.xpath(xp)  #returns data from tree

    def select(self, html,
               obj):  #determines whether to grab using css or xpath
        if "xpath" in obj:  #if xpath
            items = self.xpath(html.getText(),
                               obj["xpath"])  #return xpath selector arr
        else:  #css
            items = html.select(obj["css"])  #return a css selector arr

        if self.config["header"]:  #if theres a header keep data to one column
            items = items[:1]

        if "css" in obj:  #if data is css attribute(s) from element
            row = []
            for item in items:
                cont = [
                ]  #arr for storing attribs from each css selected element
                if type(
                        obj["contents"]
                ) is str:  #if contents is a string, put it into an array
                    obj["contents"] = [obj["contents"]]

                for content in obj["contents"]:
                    if content:  #if not empty, get the element from tag
                        cont.append(item[content])
                    else:  #if empty, get the text from tag
                        cont.append(item.text)
                row += cont  #append attribs to attrib array
            return row  #return all the attribs (css)
        else:
            return items  #return xpath

    def clear(self):
        self.data = []

    def load(self, index):
        tmp = self.configarr[index]

        if type(tmp) is dict:  #if file is json obj, load it
            self.config = tmp
        else:  #assume it is a file and load it
            if os.path.exists(tmp):
                with open(tmp) as f:
                    self.config = json.load(f)  #opens and parses json file

    def grab(self, index=None):
        if index == None:  #using grab() with no params will grab all configs passed
            for i in range(len(self.configarr)):
                self.grab(i)  #grab "i" config file
        else:
            self.load(index)  #get current file obj
            if self.output:  #only create simplecsv obj if file outputting is on
                sc = simplecsv(self.output,
                               mode="w+")  #using w+ mode to remove old output
                if self.config["header"]:
                    sc.writerow(
                        self.config["header"])  #add header from config to csv

            for link in self.config["links"]:  #loop through links
                samelinks = []  #empty list of links for now
                if type(link["url"]) is str:
                    samelinks.append(
                        link["url"]
                    )  #if url is a single str not array append it to an array
                else:  #assume it is an array
                    samelinks = link["url"]

                for samelink in samelinks:  #passing "url" an array of urls will do the same params on all the links
                    if self.allowed(samelink):  #check if url is allowed
                        self.download(samelink)  #if so download it
                        for divs in self.soup.select(link["container"]):
                            add = []
                            for get in link[
                                    "grab"]:  #grabs each element from inside each div
                                add += self.select(divs, get)

                            self.data += add  #update internal data
                            if self.output:
                                sc.writerow(
                                    add
                                )  #only write to disk if file output is on
            if self.output:
                sc.close()  #only close "sc" if file output is on
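
A hypothetical config and call for EZWS, inferred from the keys grab() and select() actually read (header, links, url, container, grab, css/xpath, contents); the site URL and selectors are invented:

config = {
    "header": ["title", "link"],       # CSV header row; a truthy header also trims each selector to its first match
    "links": [{
        "url": "https://example.com/articles",        # may also be a list of URLs
        "container": "div.article",                    # one row per matching element
        "grab": [
            {"css": "a.title", "contents": ""},        # empty string -> element text
            {"css": "a.title", "contents": "href"},    # attribute name -> attribute value
        ],
    }],
}

scraper = EZWS(config, ua="my-scraper/1.0", output="output.csv")
scraper.grab()          # checks robots.txt per URL, downloads, writes output.csv
print(scraper.data)     # flat list of every grabbed value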
Example no. 12
class EZWS:
    """
	SELF:

	config json config file
	ua     user agent
	txt    path to current robot file
	robo   robotcache obj
	link   current link
	urlp   url parse object for current link
	soup   current html page soup obj
	req    requests obj
	raw    raw html from req.get()
	check  check for robot files, keep true
	output name of output csv file
	"""
    def __init__(self, file, ua, check=True, output="output.csv"):
        if check:  #only setup robot checker if robot checking is enabled
            self.ua = ua  #user agent
            self.robo = RobotsCache(capacity=0)

        #check disables or enables robots.txt checking
        #recommended to keep default True value
        self.check = check
        self.req = requests

        if os.path.exists(file):
            with open(file) as f:
                self.config = json.load(f)  #opens and parses json file

    def allowed(self, url):  #checks if url is ok to download
        if self.check:
            if self.robo.allowed(url, self.ua):  #checks robot file
                return True
            else:
                print(url, "is not allowed")  #notify user if url isnt allowed
                return False
        else:
            return True  #if robot checking is off, return true regardless

    @property  #when url is called, return it
    def url(self):
        if hasattr(self, "link"):  #handles whether self has link attribute
            return self.link
        else:
            return ""  #if not return empty string

    @url.setter  #when url is set, parse it
    def url(self, url):
        self.link = url
        self.urlp = urlparse(url)

    def download(self, url):
        if self.allowed(url):
            self.raw = self.req.get(url).content
            self.soup = BeautifulSoup(self.raw,
                                      "html.parser")  #loads html into soup obj

    def grab(self):
        sc = simplecsv("output.csv",
                       mode="w+")  #using w+ mode to remove old output
        sc.writerow(self.config["header"])  #add header from config to csv

        for link in self.config["links"]:  #loop through links
            if self.allowed(link["url"]):  #check if url is allowed
                self.download(link["url"])  #if so download it
                for divs in self.soup.select(link["container"]):
                    row = []
                    for get in link[
                            "grab"]:  #grabs each element from inside each div
                        item = divs.select(get["css"])[0]
                        if get["content"]:  #if not empty, get the element from tag
                            row.append(item[get["content"]])
                        else:  #if empty, get the text from tag
                            row.append(item.text)
                    sc.writerow(row)
        sc.close()
Example no. 13
- Image crawler
    http://qiita.com/komakomako/items/dd380f980e56e70fa321

Targets:
- https://reverb.com/jp/marketplace/electric-guitars
- https://www.yahoo.co.jp
"""

# (1) Decide which URL to crawl
target_url = "https://www.yahoo.co.jp"

# (2) Create the instance used to read robots.txt
robots = RobotsCache(100)

# (3) If robots.txt grants permission to crawl, proceed with the steps below
if robots.allowed(target_url, 'python program'):
    # (4) Create a PhantomJS instance so that JavaScript-generated content can be crawled as well
    driver = webdriver.PhantomJS()
    # (5) Pass the target URL to the instance's GET-request method to obtain the DOM
    driver.get(target_url)
    # <selenium.webdriver.phantomjs.webdriver.WebDriver (session="b140b9a0-74d3-11e7-b434-8b9f5b309f17")>
    # type(driver)
    # <class 'selenium.webdriver.phantomjs.webdriver.WebDriver'>

    # (6) Encode the DOM obtained above as UTF-8 and keep the page content as bytes
    html = driver.page_source.encode('utf-8')
    # type(html)
    # <class 'bytes'>

    # html = requests.get(target_url)
    # < Response [200]>
Example no. 14
while len(url_frontier) != 0:
    # pop any random url
    url = url_frontier.pop()
    
    try:        
        print("\n---------------------------------------------------------")
        print("Crawling:", url)
        print("---------------------------------------------------------")


        # get crawl delay
        r = robots_cache.fetch(Robots.robots_url(url))[1]

        # check if it's allowed to crawl this URL; if not, skip it
        if not robots_cache.allowed(url, '*'):
            print("This URL is restricted to be crawled.")
            continue

        # insert this link to database
        cur.execute("INSERT OR IGNORE INTO crawled_urls (url_link) values(?)", (url,))

        # if it's allowed to crawl, get the crawl delay
        crawl_delay = r.agent("*").delay

        if crawl_delay is not None:
            time.sleep(crawl_delay)
        else:
            time.sleep(default_crawl_delay)
            
        #################################################
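
This loop mixes Robots.robots_url() from newer reppy with a cache whose fetch() returns an (expiry, rules) pair, the same shape example no. 15 unpacks. With the older RobotsCache interface used elsewhere on this page, the delay step reduces to a sketch like this (default_crawl_delay stands in for the script's constant):

import time
from reppy.cache import RobotsCache

robots_cache = RobotsCache()
default_crawl_delay = 2        # seconds; assumption standing in for the script's value

def polite_sleep(url, agent='*'):
    """Sleep for the site's Crawl-delay, or the default when none is declared."""
    delay = robots_cache.delay(url, agent)
    time.sleep(delay if delay is not None else default_crawl_delay)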
Example no. 15
class Hodor(object):
    def __init__(self, url, config={}, proxies={},
                 auth=None, ua=DEFAULT_HODOR_UA,
                 pagination_max_limit=DEFAULT_HODOR_MAX_PAGES,
                 crawl_delay=DEFAULT_CRAWL_DELAY,
                 ssl_verify=False,
                 trim_values=True,
                 robots=True,
                 reppy_capacity=100):

        self.content = None
        self.url = url
        self.domain = self._get_domain()
        self.proxies = proxies
        self.auth = auth
        self.ua = ua
        self.trim_values = trim_values
        self.ssl_verify = ssl_verify
        self.config = {}
        self.extra_config = {}

        self.robots = RobotsCache(capacity=reppy_capacity) if robots else None

        self._pages = []
        self._page_count = 0
        self._pagination_max_limit = pagination_max_limit
        self.crawl_delay = self._crawl_delay(crawl_delay)

        for k, v in config.items():
            if k.startswith("_"):
                self.extra_config[k.lstrip("_")] = v
            else:
                self.config[k] = v

    def _get_domain(self):
        parsed_uri = urlparse(self.url)
        return '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)

    def _crawl_delay(self, crawl_delay):
        if self.robots not in EMPTY_VALUES:
            expiry, robots = self.robots.fetch('{}robots.txt'.format(self.domain))
            delay = robots.agent(self.ua).delay
            try:
                crawl_delay = max(filter(partial(is_not, None),
                                         [delay, crawl_delay]))
            except ConnectionException:
                pass
        return crawl_delay

    def _fetch(self, url):
        '''Does the requests fetching and stores result in self.content'''

        if self.robots in EMPTY_VALUES or self.robots.allowed(url, self.ua):
            session = requests.session()
            headers = {'User-Agent': self.ua}
            if len(self.proxies) > 0:
                session.proxies = self.proxies
            if self.auth:
                r = session.get(url, headers=headers, auth=self.auth, verify=self.ssl_verify)
            else:
                r = session.get(url, headers=headers, verify=self.ssl_verify)
            self.content = r.content

        return self.content

    @staticmethod
    def _get_value(content, rule):
        '''Returns result for a specific xpath'''
        try:
            tree = html.fromstring(content)
        except TypeError:
            tree = None

        post_processing = rule.get('transform', lambda data: data)

        data = ""
        if tree not in EMPTY_VALUES:
            if 'xpath' in rule:
                data = tree.xpath(rule['xpath'])
            elif 'css' in rule:
                data = [node.text_content() for node in tree.cssselect(rule['css'])]

            many = rule.get('many', True)
            if not many:
                if len(data) == 0:
                    data = None
                else:
                    data = post_processing(data[0])
            else:
                data = [post_processing(d) for d in data]

        return data

    @staticmethod
    def _group_data(data, groups, config):
        del_fields = []
        for dest, group_fields in groups.items():
            if '__all__' in group_fields or group_fields == '__all__':
                group_fields = [rule for rule in config.keys() if not rule.startswith('_')]
                del_fields.extend(group_fields)

            gdata = []
            for field in group_fields:
                gdata.append(data[field])

            data[dest] = []
            for gd in zip(*gdata):
                d = {}
                for i, field in enumerate(group_fields):
                    d[field] = gd[i]
                data[dest].append(d)

        if len(del_fields) == 0:
            del_fields = [field for field_set in groups.values() for field in field_set]

        for field in del_fields:
            if field in data:
                del data[field]

    def _package_pages(self):
        self._data = {}
        if len(self._pages) == 1:
            self._data = self._pages[0]
        else:
            self._data = {key: [] for key in self._pages[0].keys()}
            for page in self._pages:
                for k, v in page.items():
                    if hasattr(v, '__iter__'):
                        self._data[k].extend(v)
                    else:
                        self._data[k].append(v)
        return self._data

    @classmethod
    def _parse(cls, content, config={}, extra_config={}, trim_values=True):
        '''Parses the content based on the config set'''
        if len(config) == 0:
            _data = {'content': content}
        else:
            _data = {}

            try:
                str_class = basestring
            except NameError:
                str_class = str

            for key, rule in config.items():
                value = cls._get_value(content, rule)
                if trim_values and value not in EMPTY_VALUES:
                    if 'many' in rule and rule['many']:
                        value = [v.strip() if isinstance(v, str_class) else v for v in value]
                    else:
                        value = value.strip() if isinstance(value, str_class) else value
                _data[key] = value

        paginate_by = extra_config.get('paginate_by')
        if paginate_by:
            paginate_by = cls._get_value(content, paginate_by)

        groups = extra_config.get('groups', {})
        if groups:
            cls._group_data(_data, groups, config)
        return _data, paginate_by

    def _get(self, url):
        self._fetch(url)
        data, paginate_by = self._parse(self.content, self.config, self.extra_config, self.trim_values)

        if paginate_by not in EMPTY_VALUES:
            paginate_by = urljoin(self.domain, paginate_by)

        return data, paginate_by

    def get(self, url=None):
        url = url if url else self.url
        self._data, paginate_by = self._get(url)

        self._pages.append(self._data)
        self._page_count += 1

        if paginate_by and self._page_count < self._pagination_max_limit:
            time.sleep(self.crawl_delay)
            self.get(paginate_by)

        self._package_pages()
        return self._data

    @property
    def data(self):
        if not hasattr(self, '_data'):
            self.get()
        return self._data
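
Hypothetical usage of Hodor, based only on the constructor, get() and data shown above; the URL, selectors and transform are invented, and keys prefixed with an underscore land in extra_config:

config = {
    "titles": {"xpath": "//h2/a/text()", "many": True},
    "prices": {"css": "span.price", "many": True,
               "transform": lambda s: s.replace("$", "")},
    "_groups": {"items": ["titles", "prices"]},             # grouped into data["items"]
    "_paginate_by": {"xpath": "//a[@rel='next']/@href", "many": False},
}

scraper = Hodor("https://example.com/catalog", config=config,
                ua="my-scraper/1.0", crawl_delay=2, robots=True)
data = scraper.data     # fetches (robots permitting), follows pagination, groups fields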
Example no. 16
class Crawler:

    def __init__(self,db_name):
        """Initialises the crawler with the name of the database"""
        self.con=sqlite.connect(db_name)
        self.stemmer = nltk.stem.porter.PorterStemmer()
        self.headers = { "User-Agent" : "Faizan Bhat's Web Crawler" }
        self.robots = RobotsCache()
        
    def __del__(self):
        self.con.close()

    def db_commit(self):
        self.con.commit()

    def get_entry_id(self,table,field,value,create_new=True):
        """Auxiliary function for getting an entry id and adding it if it is not present"""
        
        # Construct query
        cur = self.con.execute("select rowid from %s where %s='%s'" % (table,field,value))
        # Fetch
        res = cur.fetchone()
        # If not found
        if res==None:
            cur=self.con.execute("insert into %s (%s) values ('%s')" % (table,field,value))
            return cur.lastrowid
        else:
            return res[0]

    def add_to_index(self,url,soup):
        """Indexes an individual page"""
        if self.is_indexed(url): return
        print 'Indexing ' + url
        
        # Get text from soup
        text = self.get_text_only(soup)
        # Separate words
        words = self.separate_words(text)
        # Stem the list of words
        words = map(self.stem_word, words)
        # Get the url ID 
        url_id = self.get_entry_id('urllist','url',url)
        
        # Link each word to this url
        for i in range(len(words)):
            word = words[i]
            if word in ignore_words: continue
            word_id=self.get_entry_id('wordlist','word',word)
            self.con.execute('insert into wordlocation(urlid,wordid,location) values (%d,%d,%d)' % (url_id,word_id,i))
        

    def get_text_only(self,soup):
        """Extracts the text from an HTML page (without tags)"""
        v=soup.string
        if v==None:
            c=soup.contents
            result_text=''
            for t in c:
                sub_text = self.get_text_only(t)
                result_text = result_text + sub_text+'\n'
            return result_text
        else:
            return v.strip()

    def separate_words(self,text):
        """Separates the words by any non-whitespace characters"""
        splitter = re.compile('\\W*')
        return [s.lower() for s in splitter.split(text) if s!='']

    def stem_word(self,word):
        """Uses NLTK porter stemming algorithm to stem a word"""
        return self.stemmer.stem(word)
    
    def is_indexed(self,url):
        """Return True if url is already indexed"""
        u=self.con.execute \
        ("select rowid from urllist where url='%s'" % url).fetchone()
        if u!=None:
            # Check if it has been crawled
            v=self.con.execute(
            'select * from wordlocation where urlid=%d' % u[0]).fetchone()
            if v!=None: return True
        return False
            

    def add_link_ref(self,url_from,url_to,link_text):
        """Adds a link between two pages"""
        words = self.separate_words(link_text)
        from_id = self.get_entry_id('urllist','url',url_from)
        to_id = self.get_entry_id('urllist','url',url_to)
        if from_id == to_id: return
        cur = self.con.execute("insert into link(fromid,toid) values (%d,%d)" % (from_id,to_id))
        link_id = cur.lastrowid
        for word in words:
            if word in ignore_words: continue
            word_id = self.get_entry_id('wordlist','word',word)
            self.con.execute("insert into linkwords(linkid,wordid) values (%d,%d)" % (link_id,word_id))

    def crawl(self,pages,depth=2):
        """Does a breadth first search on a given list of pages and indexes as we go"""
        for i in range(depth):
            print "Depth = " + str(i)
            newpages=set()
            for page in pages:
                if not self.robots.allowed(page,"*"): 
                    print "%s disallows robots. Moving on." %page
                    continue
                try:
                    req = urllib2.Request(page, None, self.headers)
                    c=urllib2.urlopen(req)
                except:
                    print "Could not open %s" %page
                    continue
                soup = BeautifulSoup(c.read())
                self.add_to_index(page,soup)

                links=soup('a')
                
                for link in links:
                    if ('href' in dict(link.attrs)):
                        url=urljoin(page,link['href'])
                        if url.find("'")!=-1:continue
                        url=url.split('#')[0]
                        if url[0:4] == 'http':
                            if not self.is_indexed(url):
                                newpages.add(url)
                        link_text=self.get_text_only(link)
                        self.add_link_ref(page,url,link_text)
                    self.db_commit()
                
                pages=newpages

    def create_index_tables(self):
        """Creates the database tables"""
        self.con.execute('create table urllist(url)')
        self.con.execute('create table wordlist(word)')
        self.con.execute('create table wordlocation(urlid,wordid,location)')
        self.con.execute('create table link(fromid integer, toid integer)')
        self.con.execute('create table linkwords(wordid, linkid)')
        self.con.execute('create index wordidx on wordlist(word)')
        self.con.execute('create index urlidx on urllist(url)')
        self.con.execute('create index wordurlidx on wordlocation(wordid)')
        self.con.execute('create index urltoidx on link(toid)')
        self.con.execute('create index urlfromidx on link(fromid)')
        self.db_commit()
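
Hypothetical end-to-end usage of the Crawler above; the database name and seed URL are made up, and create_index_tables() is only needed on the first run:

crawler = Crawler('searchindex.db')                 # opens/creates the SQLite database
crawler.create_index_tables()                       # first run only: build tables and indexes
crawler.crawl(['http://example.com/'], depth=2)     # robots.txt is checked for every page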
Example no. 17
class RobotsTxtMiddleware(object):
    DOWNLOAD_PRIORITY = 1000

    def __init__(self, crawler):
        if not crawler.settings.getbool('ROBOTSTXT_OBEY'):
            raise NotConfigured

        self.completeblacklist = crawler.settings.get('ROBOTSTXT_BLACKLIST',
                                                      ())
        self.blacklist = []
        self.generalblacklist = crawler.settings.get('GENERAL_BLACKLIST', ())
        self.hasblacklist = False
        self.whitelist = crawler.settings.get('ROBOTSTXT_WHITELIST', ())
        self.crawler = crawler
        self._useragent = crawler.settings.get('USER_AGENT')
        self._parsers = {}
        self._spider_netlocs = set()
        self.robots = RobotsCache()

        self.stoprepetitionsrearg = re.compile(ur'.*?\&(.*?\&)\1{1,}.*')
        self.stoprepetitionsreslash = re.compile(ur'.*?\/(.*?\/)\1{1,}.*')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        useragent = self._useragent
        if not self.hasblacklist:
            self.hasblacklist = True
            if ('http://' + spider.domain
                ) in self.completeblacklist and self.completeblacklist[
                    'http://' + spider.domain] != None:
                self.blacklist = [
                    el.lower()
                    for el in self.completeblacklist['http://' + spider.domain]
                ]
                log.msg(format="Got blacklist from DB for domain",
                        level=log.DEBUG,
                        request=request)
            else:
                log.msg(format="Didn't get a blacklist from DB for domain",
                        level=log.DEBUG,
                        request=request)
            self.blacklist.extend([el.lower() for el in self.generalblacklist])
        #Check for silly repeating arguments
        if self.stoprepetitionsrearg.match(
                request.url) != None or self.stoprepetitionsreslash.match(
                    request.url) != None:
            log.msg(format="URL is suspicious: %(request)s",
                    level=log.DEBUG,
                    request=request)
            raise IgnoreRequest
        #Blacklist overrides whitelist and robots
        if any(bl in request.url.lower() for bl in self.blacklist):
            log.msg(format="Forbidden by blacklist: %(request)s",
                    level=log.DEBUG,
                    request=request)
            raise IgnoreRequest
        if not any(wl in request.url for wl in
                   self.whitelist) and self.robots and not self.robots.allowed(
                       request.url, useragent):
            log.msg(format="Forbidden by robots.txt: %(request)s",
                    level=log.DEBUG,
                    request=request)
            raise IgnoreRequest
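
A hypothetical settings.py fragment for wiring up this middleware: ROBOTSTXT_OBEY, USER_AGENT and DOWNLOADER_MIDDLEWARES are standard Scrapy settings, while ROBOTSTXT_BLACKLIST, GENERAL_BLACKLIST and ROBOTSTXT_WHITELIST are the custom ones this class reads; the module path and values are made up:

ROBOTSTXT_OBEY = True                       # the middleware raises NotConfigured otherwise
USER_AGENT = 'mybot/1.0'

ROBOTSTXT_BLACKLIST = {'http://example.com': ['/private/', '/tmp/']}
GENERAL_BLACKLIST = ['logout', 'sessionid=']
ROBOTSTXT_WHITELIST = ('/always-ok/',)

DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RobotsTxtMiddleware': 100,   # hypothetical module path
}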
Example no. 18
class TestCache(unittest.TestCase):
    def setUp(self):
        self.robots = RobotsCache()

    def test_404(self):
        '''When we get a 404, assume free range'''
        with asis.Server('tests/asis/test_404', port=8080):
            self.assertEqual(
                self.robots.allowed('http://localhost:8080/foo', 'rogerbot'),
                True)

    def test_caching(self):
        '''We should be able to cache results'''
        with asis.Server('tests/asis/test_caching', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'),
                                None)

    def test_context_manager(self):
        '''When using as a context manager, it should clear afterwards'''
        with asis.Server('tests/asis/test_context_manager', port=8080):
            with self.robots:
                self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                                 None)
                self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
                self.assertNotEqual(
                    self.robots.find('http://localhost:8080/foo'), None)
            # And now, we should have it no longer cached
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)

    def test_expires(self):
        '''Should be able to recognize expired rules'''
        with asis.Server('tests/asis/test_expires', port=8080):
            old_ttl = self.robots.min_ttl
            self.robots.min_ttl = 0
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo',
                                 fetch_if_missing=True), None)
            # If we ignore the TTL, it should still be there.
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo',
                                 fetch_if_missing=False,
                                 honor_ttl=False), None)
            # However, if we honor the TTL, it should be missing in the cache.
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo',
                                 fetch_if_missing=False), None)
            self.robots.min_ttl = old_ttl

    def test_clear(self):
        '''Should be able to explicitly clear rules'''
        with asis.Server('tests/asis/test_clear', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'),
                                None)
            # Now if we clear the rules, we should not find it
            self.robots.clear()
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)

    def test_fetch(self):
        '''Ensure that 'fetch' doesn't cache'''
        with asis.Server('tests/asis/test_fetch', port=8080):
            self.assertNotEqual(self.robots.fetch('http://localhost:8080/foo'),
                                None)
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)

    def test_cache(self):
        '''Ensure we can ask it to cache a result'''
        with asis.Server('tests/asis/test_cache', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)
            self.assertNotEqual(self.robots.cache('http://localhost:8080/foo'),
                                None)
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'),
                                None)

    def test_add(self):
        '''We should be able to add rules that we get'''
        with asis.Server('tests/asis/test_add', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)
            self.robots.add(self.robots.fetch('http://localhost:8080/foo'))
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'),
                                None)

    def test_server_error(self):
        '''Make sure we can catch server errors'''
        with mock.patch.object(self.robots.session,
                               'get',
                               side_effect=TypeError):
            self.assertRaises(ServerError, self.robots.allowed,
                              'http://localhost:8080/foo', 'rogerbot')

    def test_disallowed(self):
        '''Check the disallowed interface'''
        with asis.Server('tests/asis/test_disallowed', port=8080):
            self.assertFalse(
                self.robots.disallowed('http://localhost:8080/foo',
                                       'rogerbot'))
            urls = ['http://localhost:8080/foo', 'http://localhost:8080/bar']
            self.assertEqual(self.robots.allowed(urls, 'rogerbot'), urls)
            self.assertEqual(self.robots.disallowed(urls, 'rogerbot'), [])

    def test_delay(self):
        '''Check the delay interface'''
        with asis.Server('tests/asis/test_delay', port=8080):
            self.assertEqual(
                self.robots.delay('http://localhost:8080/foo', 'rogerbot'), 5)

    def test_sitemaps(self):
        '''Check the sitemaps interface'''
        with asis.Server('tests/asis/test_sitemaps', port=8080):
            self.assertEqual(
                self.robots.sitemaps('http://localhost:8080/foo'), [
                    'http://localhost:8080/a', 'http://localhost:8080/b',
                    'http://localhost:8080/c'
                ])

    def test_dns_exception(self):
        '''Raises an exception if url does not resolve.'''
        self.assertRaises(ConnectionException, self.robots.allowed,
                          'http://does-not-resolve', 'rogerbot')

    def test_malformed_url(self):
        '''Raises an exception if the url is malformed.'''
        self.assertRaises(MalformedUrl, self.robots.allowed, 'hhttp://moz.com',
                          'rogerbot')

    def test_ssl_exception(self):
        '''Raises an exception if there is an ssl error.'''
        with asis.Server('tests/asis/test_ssl_exception', port=8080):
            self.assertRaises(SSLException, self.robots.allowed,
                              'https://localhost:8080', 'rogerbot')

    def test_excessive_redirects(self):
        '''Raises an exception if there are too many redirects.'''
        with asis.Server('tests/asis/test_excessive_redirects', port=8080):
            self.assertRaises(ExcessiveRedirects, self.robots.allowed,
                              'http://localhost:8080/one', 'rogerbot')

    def test_bad_status_codes(self):
        '''Raises an exception if there is a 5xx status code.'''
        with asis.Server('tests/asis/test_bad_status_codes', port=8080):
            self.assertRaises(BadStatusCode, self.robots.allowed,
                              'http://localhost:8080', 'rogerbot')
Example no. 19
class TestCache(unittest.TestCase):
    def setUp(self):
        self.robots = RobotsCache()

    def test_404(self):
        '''When we get a 404, assume free range'''
        with asis.Server('tests/asis/test_404', port=8080):
            self.assertEqual(self.robots.allowed(
                'http://localhost:8080/foo', 'rogerbot'), True)

    def test_caching(self):
        '''We should be able to cache results'''
        with asis.Server('tests/asis/test_caching', port=8080):
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_context_manager(self):
        '''When using as a context manager, it should clear afterwards'''
        with asis.Server('tests/asis/test_context_manager', port=8080):
            with self.robots:
                self.assertEqual(
                    self.robots.find('http://localhost:8080/foo'), None)
                self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
                self.assertNotEqual(
                    self.robots.find('http://localhost:8080/foo'), None)
            # And now, we should have it no longer cached
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_expires(self):
        '''Should be able to recognize expired rules'''
        with asis.Server('tests/asis/test_expires', port=8080):
            old_ttl = self.robots.min_ttl
            self.robots.min_ttl = 0
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo', True), None)
            # Now, it shouldn't be cached, so when we find it again, it should
            # be missing (or at least, requiring a refetch)
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo', False), None)
            self.robots.min_ttl = old_ttl

    def test_clear(self):
        '''Should be able to explicitly clear rules'''
        with asis.Server('tests/asis/test_clear', port=8080):
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            # Now if we clear the rules, we should not find it
            self.robots.clear()
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_fetch(self):
        '''Ensure that 'fetch' doesn't cache'''
        with asis.Server('tests/asis/test_fetch', port=8080):
            self.assertNotEqual(
                self.robots.fetch('http://localhost:8080/foo'), None)
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_cache(self):
        '''Ensure we can ask it to cache a result'''
        with asis.Server('tests/asis/test_cache', port=8080):
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            self.assertNotEqual(
                self.robots.cache('http://localhost:8080/foo'), None)
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_add(self):
        '''We should be able to add rules that we get'''
        with asis.Server('tests/asis/test_add', port=8080):
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            self.robots.add(self.robots.fetch(
                'http://localhost:8080/foo'))
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_server_error(self):
        '''Make sure we can catch server errors'''
        self.assertRaises(ServerError, self.robots.allowed,
            'http://localhost:8080/foo', 'rogerbot')

    def test_disallowed(self):
        '''Check the disallowed interface'''
        with asis.Server('tests/asis/test_disallowed', port=8080):
            self.assertFalse(self.robots.disallowed(
                'http://localhost:8080/foo', 'rogerbot'))
            urls = [
                'http://localhost:8080/foo',
                'http://localhost:8080/bar'
            ]
            self.assertEqual(self.robots.allowed(urls, 'rogerbot'), urls)
            self.assertEqual(self.robots.disallowed(urls, 'rogerbot'), [])

    def test_delay(self):
        '''Check the delay interface'''
        with asis.Server('tests/asis/test_delay', port=8080):
            self.assertEqual(self.robots.delay(
                'http://localhost:8080/foo', 'rogerbot'), 5)

    def test_sitemaps(self):
        '''Check the sitemaps interface'''
        with asis.Server('tests/asis/test_sitemaps', port=8080):
            self.assertEqual(
                self.robots.sitemaps('http://localhost:8080/foo'), [
                    'http://localhost:8080/a',
                    'http://localhost:8080/b',
                    'http://localhost:8080/c'
                ])
Example no. 20
class Archiver(object):
    ARCHIVE_SUBFORUM_SUBURL_TEMPLATE = 'index.php/f-{forum_code}.html'
    ARCHIVE_SUBFORUM_SUBURL_RE_TEMPLATE = 'index.php/f-{forum_code}[^(.html)]?.html'
    ARCHIVE_THREAD_SUBURL_RE = 'index.php/t-[^(.html)]*.html'
    ARCHIVE_CSS_RE = '[^(.css)]*.css'

    def __init__(self, base_url, forum_codes, archive_location, user_agent,
                 worker_count):
        archiver_logger.info('Archiver initialized.')
        self.base_url = base_url
        self.archive_base_url = urljoin(self.base_url,
                                        ScraperConfig.ARCHIVE_SUBURL)
        self.forum_codes = forum_codes
        self.archive_location = archive_location
        self.user_agent = user_agent
        self.robot_parser = RobotsCache()
        self.scraper_timer = None
        self.shutdown_event = threading.Event()
        self.delay_time = 1

        self.workers = []
        self.worker_count = worker_count

        self.pages_need_visiting = Queue()
        self.pages_need_analysis_counter = RachetingCounter()
        self.pages_visited_lock = threading.Lock()
        self.pages_visited = []
        self.page_re_filters = []

    def setup(self):
        archiver_logger.info('Beginning Archiver setup.')
        success = True

        archiver_logger.info('Building page filters.')
        # Build regular expression filters for pages to attempt to crawl.
        archive_base_url = self.archive_base_url

        # Build regular expression for sub-forums we're interested in.
        for forum_code in self.forum_codes:
            regex = urljoin(
                archive_base_url,
                self.ARCHIVE_SUBFORUM_SUBURL_RE_TEMPLATE.format(
                    forum_code=forum_code))
            self.page_re_filters.append(re.compile(regex))

        # Add a regular expression for thread pages.
        thread_regex = urljoin(archive_base_url, self.ARCHIVE_THREAD_SUBURL_RE)
        self.page_re_filters.append(re.compile(thread_regex))

        # Finally add a regular expression to grab the archive CSS.
        css_regex = urljoin(archive_base_url, self.ARCHIVE_CSS_RE)
        self.page_re_filters.append(re.compile(css_regex))

        archiver_logger.info('Adding seed pages.')
        for fc in self.forum_codes:
            subforum_url = urljoin(
                self.archive_base_url,
                self.ARCHIVE_SUBFORUM_SUBURL_TEMPLATE.format(forum_code=fc))
            self.pages_need_visiting.put(subforum_url)
            self.pages_need_analysis_counter.increment()
            archiver_logger.info(
                'Archiver seeded with page {}.'.format(subforum_url))

        archiver_logger.info('Checking archive location...')
        # Setup archive location.
        base_path, new_archive = os.path.split(self.archive_location)
        if not os.path.exists(base_path) or not os.path.isdir(base_path):
            success = False
            archiver_logger.error(
                'Base path {} does not exist or is not a directory! Aborting!'.format(base_path))
            return success
        elif (os.path.exists(self.archive_location)
              and (not os.path.isdir(self.archive_location)
                   or os.listdir(self.archive_location))):
            success = False
            archiver_logger.error(
                'Archive location {} is either not a directory or is not empty! Aborting!'
                ''.format(self.archive_location))
            return success
        elif not os.path.exists(self.archive_location):
            archiver_logger.info('Creating archive directory {}.'.format(
                self.archive_location))
            try:
                os.mkdir(self.archive_location)
            except OSError:
                success = False
                archiver_logger.exception(
                    'Faulted attempting to create archive directory! Aborting!'
                )
                return success
        else:
            archiver_logger.info(
                'Empty archive directory {} exists. Proceeding...'.format(
                    self.archive_location))

        # Attempt to retrieve robots.txt information about target site.
        if not self.robot_parser.allowed(self.base_url, self.user_agent):
            success = False
            archiver_logger.error('Not allowed to scrape {}! Aborting!'.format(
                self.base_url))
            return success
        else:
            archiver_logger.info(
                'Successfully polled {} for robots.txt, can scrape.'.format(
                    self.base_url))

        # Get crawl delay and build scraper timer.
        delay_time = self.robot_parser.delay(self.base_url, self.user_agent)
        if delay_time:
            archiver_logger.info(
                'Site crawl-delay: {} seconds.'.format(delay_time))

        else:
            delay_time = ScraperConfig.DEFAULT_CRAWL_DELAY
            archiver_logger.info(
                'No crawl delay for this site. Using default crawl delay of {} seconds.'
                ''.format(delay_time))
        archiver_logger.info('Initializing Scraper timer.')
        self.scraper_timer = ScraperTimer(delay_time)
        self.delay_time = delay_time
        if success:
            archiver_logger.info('Archiver setup success!')
        else:
            archiver_logger.error('Archiver setup failure! Check logs!')
        archiver_logger.info('Building workers...')
        for i in xrange(self.worker_count):
            archiver_logger.info('Adding worker {}.'.format(i + 1))
            worker = ArchiverWorker(
                self.shutdown_event, self.user_agent, self.robot_parser,
                self.scraper_timer, self.pages_need_visiting,
                self.pages_visited, self.pages_visited_lock,
                self.page_re_filters, self.pages_need_analysis_counter,
                self.archive_location)
            worker.daemon = True
            self.workers.append(worker)
        return success

    def run(self):
        archiver_logger.info('Starting workers...')
        [worker.start() for worker in self.workers]
        while not self.pages_need_analysis_counter.empty():
            time.sleep(0.1)
        archiver_logger.info(
            'Finished archiving all possible pages. Shutting down.')
        archiver_logger.info('Waiting for threads to finish up.')
        self.shutdown_event.set()
        self.scraper_timer.wait()
        return True

    def teardown(self):
        if not self.shutdown_event.is_set():
            self.shutdown_event.set()
        return True
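
Hypothetical driver code for Archiver; the forum URL, codes and paths are invented, and ScraperConfig is assumed to supply the ARCHIVE_SUBURL and DEFAULT_CRAWL_DELAY values the class relies on:

archiver = Archiver(base_url='http://forum.example.com/',
                    forum_codes=['12', '34'],
                    archive_location='/tmp/forum-archive',
                    user_agent='archive-bot/1.0',
                    worker_count=4)
try:
    if archiver.setup():        # checks robots.txt and Crawl-delay before starting
        archiver.run()
finally:
    archiver.teardown()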
Example no. 21
class TestCache(unittest.TestCase):
    def setUp(self):
        self.robots = RobotsCache()

    def test_404(self):
        '''When we get a 404, assume free range'''
        with asis.Server('tests/asis/test_404', port=8080):
            self.assertEqual(
                self.robots.allowed('http://localhost:8080/foo', 'rogerbot'),
                True)

    def test_caching(self):
        '''We should be able to cache results'''
        with asis.Server('tests/asis/test_caching', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'),
                                None)

    def test_context_manager(self):
        '''When using as a context manager, it should clear afterwards'''
        with asis.Server('tests/asis/test_context_manager', port=8080):
            with self.robots:
                self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                                 None)
                self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
                self.assertNotEqual(
                    self.robots.find('http://localhost:8080/foo'), None)
            # And now, we should have it no longer cached
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)

    def test_expires(self):
        '''Should be able to recognize expired rules'''
        with asis.Server('tests/asis/test_expires', port=8080):
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo', True), None)
            # Now, it shouldn't be cached, so when we find it again, it should
            # be missing (or at least, requiring a refetch)
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo', False), None)

    def test_clear(self):
        '''Should be able to explicitly clear rules'''
        with asis.Server('tests/asis/test_clear', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'),
                                None)
            # Now if we clear the rules, we should not find it
            self.robots.clear()
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)

    def test_fetch(self):
        '''Ensure that 'fetch' doesn't cache'''
        with asis.Server('tests/asis/test_fetch', port=8080):
            self.assertNotEqual(self.robots.fetch('http://localhost:8080/foo'),
                                None)
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)

    def test_cache(self):
        '''Ensure we can ask it to cache a result'''
        with asis.Server('tests/asis/test_cache', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)
            self.assertNotEqual(self.robots.cache('http://localhost:8080/foo'),
                                None)
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'),
                                None)

    def test_add(self):
        '''We should be able to add rules that we get'''
        with asis.Server('tests/asis/test_add', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)
            self.robots.add(self.robots.fetch('http://localhost:8080/foo'))
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'),
                                None)

    def test_server_error(self):
        '''Make sure we can catch server errors'''
        self.assertRaises(ServerError, self.robots.allowed,
                          'http://localhost:8080/foo', 'rogerbot')

    def test_disallowed(self):
        '''Check the disallowed interface'''
        with asis.Server('tests/asis/test_disallowed', port=8080):
            self.assertFalse(
                self.robots.disallowed('http://localhost:8080/foo',
                                       'rogerbot'))
            urls = ['http://localhost:8080/foo', 'http://localhost:8080/bar']
            self.assertEqual(self.robots.allowed(urls, 'rogerbot'), urls)
            self.assertEqual(self.robots.disallowed(urls, 'rogerbot'), [])

    def test_delay(self):
        '''Check the delay interface'''
        with asis.Server('tests/asis/test_delay', port=8080):
            self.assertEqual(
                self.robots.delay('http://localhost:8080/foo', 'rogerbot'), 5)

    def test_sitemaps(self):
        '''Check the sitemaps interface'''
        with asis.Server('tests/asis/test_sitemaps', port=8080):
            self.assertEqual(
                self.robots.sitemaps('http://localhost:8080/foo'), [
                    'http://localhost:8080/a', 'http://localhost:8080/b',
                    'http://localhost:8080/c'
                ])
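The tests above exercise most of RobotsCache's public surface. As a quick reference, a condensed sketch of the same calls against a hypothetical host (example.com is illustrative; the cache fetches and stores each host's robots.txt on demand):

from reppy.cache import RobotsCache

robots = RobotsCache()
robots.allowed('http://example.com/foo', 'rogerbot')     # permission check; fetches rules on demand
robots.disallowed('http://example.com/foo', 'rogerbot')  # inverse of allowed()
robots.delay('http://example.com/foo', 'rogerbot')       # Crawl-delay value, or None if unset
robots.sitemaps('http://example.com/foo')                # urls listed in Sitemap: directives
robots.fetch('http://example.com/foo')                   # fetch rules without caching them
robots.cache('http://example.com/foo')                   # fetch and cache explicitly
robots.clear()                                           # drop all cached rules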
class RobotsTxtMiddleware(object):
    DOWNLOAD_PRIORITY = 1000

    def __init__(self, crawler):
        if not crawler.settings.getbool('ROBOTSTXT_OBEY'):
            raise NotConfigured

        self.completeblacklist = crawler.settings.get('ROBOTSTXT_BLACKLIST', ())
        self.blacklist = []
        self.generalblacklist = crawler.settings.get('GENERAL_BLACKLIST', ())
        self.hasblacklist = False
        self.whitelist = crawler.settings.get('ROBOTSTXT_WHITELIST', ())
        self.crawler = crawler
        self._useragent = crawler.settings.get('USER_AGENT')
        self._parsers = {}
        self._spider_netlocs = set()
        self.robots = RobotsCache()
        
        self.stoprepetitionsrearg = re.compile(ur'.*?\&(.*?\&)\1{1,}.*')
        self.stoprepetitionsreslash = re.compile(ur'.*?\/(.*?\/)\1{1,}.*')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        useragent = self._useragent
        if not self.hasblacklist:
            self.hasblacklist = True
            if ('http://' + spider.domain) in self.completeblacklist and self.completeblacklist['http://' + spider.domain] != None:
                self.blacklist = [el.lower() for el in self.completeblacklist['http://' + spider.domain]]
                log.msg(format="Got blacklist from DB for domain",
                    level=log.DEBUG, request=request)
            else:
                log.msg(format="Didn't get a blacklist from DB for domain",
                    level=log.DEBUG, request=request)
            self.blacklist.extend([el.lower() for el in self.generalblacklist])
        # Check for suspiciously repeating query arguments or path segments
        if self.stoprepetitionsrearg.match(request.url) != None or self.stoprepetitionsreslash.match(request.url) != None:
            log.msg(format="URL is suspicious: %(request)s",
                    level=log.DEBUG, request=request)
            raise IgnoreRequest
        # Blacklist overrides both the whitelist and robots.txt
        if any(bl in request.url.lower() for bl in self.blacklist):
            log.msg(format="Forbidden by blacklist: %(request)s",
                    level=log.DEBUG, request=request)
            raise IgnoreRequest
        if not any(wl in request.url for wl in self.whitelist) and self.robots and not self.robots.allowed(request.url, useragent):
            log.msg(format="Forbidden by robots.txt: %(request)s",
                    level=log.DEBUG, request=request)
            raise IgnoreRequest
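A minimal settings sketch for wiring up this middleware, assuming it lives at a hypothetical myproject.middlewares path. ROBOTSTXT_BLACKLIST, GENERAL_BLACKLIST and ROBOTSTXT_WHITELIST are the custom keys read above; ROBOTSTXT_OBEY, USER_AGENT and DOWNLOADER_MIDDLEWARES are standard Scrapy settings, and all values are illustrative:

# settings.py -- illustrative values; 'myproject.middlewares' is a hypothetical module path
ROBOTSTXT_OBEY = True          # without this the middleware raises NotConfigured
USER_AGENT = 'mybot (+http://www.example.com/bot)'

# Custom keys consumed by RobotsTxtMiddleware above
ROBOTSTXT_BLACKLIST = {'http://example.com': ['/private/', 'action=logout']}
GENERAL_BLACKLIST = ['sessionid=', 'print=1']
ROBOTSTXT_WHITELIST = ['/always-allowed/']

DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RobotsTxtMiddleware': 1000,
    # disable the built-in robots.txt middleware (its import path varies by Scrapy version)
    'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': None,
}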
Esempio n. 23
0
import time
from contextlib import contextmanager

from reppy.cache import RobotsCache
from reppy.parser import Rules

content = '''
User-agent: *
Allow: /
'''

cache = RobotsCache()
cache.add(Rules('http://example.com/', 200, content, float('inf')))


@contextmanager
def timer(count):
    '''Time this block.'''
    start = time.time()
    try:
        yield count
    finally:
        duration = time.time() - start
        print('Total: %s' % duration)
        print('  Avg: %s' % (duration / count))
        print(' Rate: %s' % (count / duration))


with timer(100000) as count:
    for _ in range(count):
        cache.allowed('http://example.com/page', 'agent')
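The benchmark only measures the in-memory path: the cache is pre-seeded with a Rules object whose expiration is float('inf'), so allowed() never needs a network fetch. A minimal sketch of the same trick used to keep a unit test offline (hostname, rules and agent are illustrative):

from reppy.cache import RobotsCache
from reppy.parser import Rules

rules_txt = '''
User-agent: *
Disallow: /private
'''

cache = RobotsCache()
# Pin never-expiring rules for the host so no network request is made
cache.add(Rules('http://example.com/', 200, rules_txt, float('inf')))

assert cache.allowed('http://example.com/public/page', 'mybot')
assert not cache.allowed('http://example.com/private/page', 'mybot')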
Esempio n. 24
0
class WebConnexion(object):
    """Manage the web connexion with the page to crawl."""
    def __init__(self):
        self.reqrobots = RobotsCache()
        self.parser_encoding = parsers.ExtractEncoding()

    def get_code(self, url):
        """Get source code of given url.

		:param url: url of webpage
		:type url: str
		:return: new url (after possible redirection), source code, nofollow flag, encoding score and urls to delete

		"""
        nofollow, url = connexion.is_nofollow(url)
        result = self.send_request(url)
        if not isinstance(result, requests.models.Response):
            return None, result, None, None, url
        else:
            request = result
            del result
            allowed = self.check_robots_perm(url)
            if request.status_code == requests.codes.ok and request.headers.get(
                    'Content-Type', '').startswith('text/html') and allowed:
                # Search encoding of webpage:
                request.encoding, score = self.search_encoding(
                    request.headers, request.text)
                new_url, code = self.duplicate_content(
                    request, url)  # new_url is clean and maybe without params
                all_urls = connexion.all_urls(
                    request)  # List of urls to delete
                if new_url in all_urls:  # new_url must not be deleted
                    all_urls.remove(new_url)
                return new_url, code, nofollow, score, all_urls
            else:
                tell('Webpage infos: status code=' + str(request.status_code) + ', Content-Type=' + \
                 request.headers.get('Content-Type', '') + ', robots perm=' + str(allowed), severity=0)
                # All redirections urls, the first and the last:
                all_urls = connexion.all_urls(request)
                all_urls.append(request.url)
                all_urls.append(url)
                return None, 'ignore', None, None, remove_duplicates(all_urls)

    def send_request(self, url):
        try:
            request = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
        except requests.packages.urllib3.exceptions.ReadTimeoutError:
            tell('Read timeout error (urllib3): ' + url, 3)
            return None
        except requests.exceptions.Timeout:
            tell('Timeout error: ' + url, 4)
            return None
        except requests.exceptions.RequestException as error:
            tell('Connexion failed: {}, {}'.format(str(error), url), 5)
            if connexion.no_connexion():
                return 'no connexion'
            else:
                return None
        else:
            return request

    def search_encoding(self, headers, code):
        """Searche encoding of webpage in source code.

		If an encoding is found in source code, score is 1, but if not
		score is 0 and encoding is utf-8.

		:param headers: hearders of requests
		:type headers: dict
		:param code: source code
		:type code: str
		:return: encoding of webpage and it score

		"""
        # Search in headers:
        headers = str(headers).lower()
        charset = headers.find('charset')
        end_charset = headers.find('\'', charset)
        if charset != -1 and end_charset != -1:
            return headers[charset + 8:end_charset], 1
        else:
            # Search in source code:
            self.parser_encoding.feed(code)
            if self.parser_encoding.encoding != '':
                return self.parser_encoding.encoding, 1
            else:
                tell('No encoding', 9, severity=0)
                return 'utf-8', 0

    def check_robots_perm(self, url):
        """Check robots.txt for permission.

		:param url: webpage url
		:type url: str
		:return: True if the url may be crawled

		"""
        try:
            allowed = self.reqrobots.allowed(url, USER_AGENT)
        except ServerError as error:
            tell('Error robots.txt (reppy): ' + str(error) + ' ' + url, 6)
            allowed = True
        except requests.exceptions.Timeout:
            tell('Error robots.txt (timeout): ' + url)
            allowed = True
        except requests.exceptions.RequestException as error:
            tell('Error robots.txt (requests): ' + str(error) + ' ' + url, 7)
            allowed = True
        except Exception as error:
            tell('Unknown robots.txt error: ' + str(error) + ' ' + url, 8)
            allowed = True
        return allowed

    def duplicate_content(self, request1, url):
        """Avoid param duplicate.

		Compare source codes with params and whitout.
		Return url whitout params if it's the same content.

		:param request: request
		:type request: requests.models.Response
		:return: url, source code

		"""
        url1 = clean_link(request1.url)
        if url1 is None:
            return url, request1.text
        infos_url = urlparse(url1)
        if infos_url.query != '':
            new_url = infos_url.scheme + '://' + infos_url.netloc + infos_url.path
            request2 = self.send_request(new_url)
            if not isinstance(request2, requests.models.Response):
                return url1, request1.text
            request2.encoding = self.search_encoding(request2.headers,
                                                     request2.text)[0]
            url2 = clean_link(request2.url)
            if url2 is None:
                return url1, request1.text
            if connexion.duplicate_content(request1.text, request2.text):
                tell("Same content: " + url1 + " and " + url2)  # Tests
                return url2, request2.text
            else:
                return url1, request1.text
        else:
            return url1, request1.text
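check_robots_perm above only asks for permission; reppy's cache also exposes the per-host Crawl-delay (see test_delay earlier). A hedged sketch combining both checks before a request, with the same fail-open behaviour as check_robots_perm (function and variable names are illustrative):

import time
from reppy.cache import RobotsCache

robots = RobotsCache()

def robots_check(url, agent):
    """Return (allowed, delay_in_seconds); fail open on robots.txt errors."""
    try:
        allowed = robots.allowed(url, agent)
        delay = robots.delay(url, agent) or 0
    except Exception as error:
        print('robots.txt error, crawling anyway:', error)
        allowed, delay = True, 0
    return allowed, delay

allowed, delay = robots_check('http://example.com/page', 'my-crawler')
if allowed:
    time.sleep(delay)   # honour Crawl-delay before sending the request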
Esempio n. 25
0
class WebCrawler():
	""" Web crawler class crawls a specific website
	"""
	def __init__(self, url="file:///Users/tharak/Dropbox/code/Python/webcrawler/mock_website/example.org/index.html", useragent="User Agent", outdir="out", max_depth=1000, debug=0):
		self.url = url					
		self.useragent = useragent		
		self.siteMap = {self.url:""}	
		self.outdir=outdir.rstrip("/")+"/"	
		self.depth = 0					
		self.MaxDepth = max_depth		
		self.crawled=Set([])			
		self.debug=debug				
		self.domains=Set([urlparse(self.url).netloc.lower()])
		self.robots = RobotsCache()
			
		
	def __crawl_site(self, url_key=""):
		"""Recursively crawls the url passed and populates the sitemap datastructure
		"""
		#Do not continue crawling if we are at maximum allowed depth
		if self.depth > self.MaxDepth: 	
			return
		
		
		if url_key=="":    				
			url=self.url				
		else:
			url=url_key
			
		#Check the site's robots.txt to see whether this url may be crawled
		#Do not check robots.txt if the file is located locally
		if "http" in urlparse(url).scheme:  
			if not self.robots.allowed(url, self.useragent):
				if(self.debug > 0): 
					print "Page disallowed in robots.txt %s"%(url)
				return
			
		if(self.debug > 0): 
			print "Now crawling: %s"%(url)
		
		url_list=[]
		
		#Iterate over a snapshot (url_list) of siteMap's keys; otherwise Python
		#complains that the dictionary changed size during iteration
		
		for key in self.siteMap:		
		 	url_list.append(key)		 
		
		for key in url_list:	
			#Fetch the URLs in the webpage and append to siteMap for URLs that have not yet been crawled. 		
			if self.siteMap[key] == "":
				urls =self.__extract_url(url)
				self.siteMap[key] = urls

				for url_key in urls:
					#If the URL has already been crawled or contains a fragment (#), don't crawl it.
					if (self.debug > 1): 
						print "url_key: %s, crawled: %s"%(url_key,self.crawled)
					if url_key in self.crawled:
						continue
					if "#" in url_key:
						continue
					
					#We do not want to crawl external domains. 
					parsed = urlparse(url_key)
					
					if (self.debug > 1): 
						print parsed.netloc
					
					#If netloc is empty or is the main domain then the page is part of local domain and needs to be crawled.
					if parsed.netloc.lower() in self.domains:		    
						
						if (self.debug > 1): 
							print "\ndepth=%s,URL=%s\n"%(self.depth, url_key)
						self.siteMap[url_key] = ""  
						self.crawled.add(url_key)   
						self.depth = self.depth+1   
						self.__crawl_site(url_key)	
						self.depth = self.depth-1	
			

	def __print_siteMap(self):
		"""Prints the siteMap datastructure in an XML like format
		"""
		#Dump Sitemap to an XML file
		try:                                
			fd = open(self.outdir+"site.xml", "w") 
			try:                           
				fd.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
				fd.write("<WEBSITE>\n")
				for key in self.siteMap:
					fd.write("\t<WEBPAGE>\n")
					fd.write("\t\t<ADDRESS>\"%s\"</ADDRESS>\n"%(key))
					for loc in self.siteMap[key]:
						fd.write("\t\t<LINK>\"%s\"</LINK>\n"%(loc))
					fd.write("\t</WEBPAGE>\n")
				fd.write("</WEBSITE>\n")
			finally:                        
				fd.close()                    			  
		except IOError:                     
			pass    
		#Dump siteMap to a json file
		import json
		with open(self.outdir+'site.json', 'w') as fp:
			json.dump(self.siteMap, fp, indent=4)    
    
		
					
	def get_siteMap(self):
		"""Initiates the crawler and populates the siteMap
		"""
		from os import makedirs
		from shutil	import rmtree 

		rmtree(self.outdir)
		makedirs(self.outdir)

		self.__crawl_site()
		self.__print_siteMap()
		return self.siteMap

	def __extract_url(self, url): 
		"""Extracts the links in the input URL
		"""
		
		import urllib2
		from urllister import URLLister
		from sgmllib import SGMLParseError
		
		req = urllib2.Request(url, headers={'User-Agent' : self.useragent}) 
		try:
			usock = urllib2.urlopen(req)
			parser = URLLister(url)
		
			try:
				parser.feed(usock.read())
				parser.close()
			except Exception as exception:
				if (self.debug > 0): 
					print "sgmllib: Unable to parse web page.\n sgmllib: Raised exception %s"%(type(exception).__name__)
					fd = open(self.outdir+"%s.err"%type(exception).__name__, "a")
					fd.write( "%s\n"%(url))	
					fd.close()
				pass
			usock.close()
			return parser.urls
		except (KeyboardInterrupt, SystemExit):
			raise
		except Exception as exception:
			if (self.debug > 0): 
				print "urllib2: Page does not exist or Malformed web address.\n sgmllib: Raised exception %s"%(type(exception).__name__) 
				fd = open(self.outdir+"%s.err"%type(exception).__name__, "a")
				fd.write( "%s\n"%(url))	
				fd.close()
			return []
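A usage sketch for the class above (all values are illustrative); note that get_siteMap() starts by calling rmtree() on the output directory, so outdir must already exist:

crawler = WebCrawler(url="http://example.org/",
                     useragent="my-crawler",
                     outdir="out",        # must already exist: get_siteMap() rmtree()s it first
                     max_depth=5,
                     debug=1)
site_map = crawler.get_siteMap()          # also writes out/site.xml and out/site.json
print(len(site_map), "urls in the site map")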
Esempio n. 26
0
class Mole:
    """ fetch web page based on robots.txt """

    def __init__(self):
        self.agent = "jerry's crawler"
        self.robots = RobotsCache()
        self.pool = None
        self.cookieJar = cookielib.CookieJar()

        timeout = 60  # socket timeout in seconds
        socket.setdefaulttimeout(timeout)

    def fetch(self, uri):
        if self.robots.allowed(uri, self.agent):
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cookieJar))
            req = urllib2.Request(uri)
            req.add_header('User-Agent', self.agent)
            response = opener.open(req)
            if response.code == 200:
                return response.read()

        return None

    def filter_punctuation(self, tokens):
        non_punct = re.compile('.*[A-Za-z0-9].*')
        return [w for w in tokens if non_punct.match(w)]

    def get_sitexml_robots(self, url):
        robot_url = '/'.join([url, 'robots.txt'])
        content = self.fetch(robot_url)
        site = []
        if content is None:
            return site
        for line in content.split('\n'):
            line = line.lower()
            index = line.find("sitemap")
            if index < 0:
                continue
            m = re.search(r'sitemap\s*:\s*(\S+)', line[index:])
            if m is not None:
                site.append(m.group(1))

        return site

    def is_within_days(self, d, days=1):
        ago = date.today() - timedelta(days)
        return ago <= d

    def read_sitemap_file(self, mapfile):
        content = self.fetch(mapfile)

        if content is None:
            return None

        if mapfile.endswith('.gz'):
            d = zlib.decompressobj(16+zlib.MAX_WBITS)
            content = d.decompress(content)

        return content

    def create_thread_pool(self, size=10):
        self.pool = WorkerPool(size)

    def page2tokens(self, content):
        return nltk.word_tokenize(nltk.clean_html(content))