Code example #1
import urlparse

from scrapy import signals
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider
from scrapy.xlib.pydispatch import dispatcher
from w3lib.html import remove_tags

# DbTool is a project-local persistence helper; its definition is not
# part of this example (a sketch of the assumed interface follows below).


class NewsSpider(BaseSpider):
    name = "newyorker"
    allowed_domains = ["newyorker.com"]
    start_urls = [
        "http://www.newyorker.com",
    ]

    def __init__(self):
        super(NewsSpider, self).__init__()
        # Commit the index to disk once the crawl finishes.
        dispatcher.connect(self.on_spider_closed, signals.spider_closed)
        self.db = DbTool('search_engine.sql')

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        curr_url = response.url

        # Index the text content of the whole page body.
        txt = hxs.select('//body')
        if txt:
            txt = remove_tags(txt.extract()[0])
            self.db.add_to_index(curr_url, txt)
            # for word in self.db.separate_words(txt): print word

        # Follow only links that point at .html pages.
        urls = hxs.select('//a[contains(@href,".html")]/@href')
        if urls:
            for url in urls.extract():
                if "'" in url:  # skip URLs that would break the SQL layer
                    continue
                url = url.split('#')[0]  # drop any fragment identifier
                if not url.startswith('http'):
                    # base_url was never defined in the original; resolving
                    # relative links against the current page fixes that.
                    url = urlparse.urljoin(curr_url, url)
                # Stay on www.newyorker.com (netloc with any port stripped).
                if urlparse.urlsplit(url)[1].split(':')[0].startswith('www.newyorker.com'):
                    # Note: the URL itself serves as the stored link text.
                    link_text = remove_tags(url)
                    self.db.add_link_ref(curr_url, url, link_text)
                    yield Request(url, self.parse)

    def on_spider_closed(self):
        self.db.commit()
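
DbTool itself never appears in the example above. Judging from the calls the spider makes (a constructor taking the 'search_engine.sql' filename, plus add_to_index, add_link_ref, separate_words, and commit), it is a thin wrapper around a local database. A minimal SQLite-backed sketch of that assumed interface might look like the following; the table layout and tokenizer here are guesses, not the original implementation:

import re
import sqlite3


class DbTool(object):
    """Hypothetical sketch of the persistence helper the spider calls."""

    def __init__(self, dbname):
        self.con = sqlite3.connect(dbname)
        self.con.execute('CREATE TABLE IF NOT EXISTS pages '
                         '(url TEXT, word TEXT)')
        self.con.execute('CREATE TABLE IF NOT EXISTS links '
                         '(from_url TEXT, to_url TEXT, link_text TEXT)')

    def separate_words(self, txt):
        # Naive tokenizer: split on runs of non-alphanumeric characters.
        return [w.lower() for w in re.split(r'\W+', txt) if w]

    def add_to_index(self, url, txt):
        for word in self.separate_words(txt):
            self.con.execute('INSERT INTO pages VALUES (?, ?)', (url, word))

    def add_link_ref(self, from_url, to_url, link_text):
        self.con.execute('INSERT INTO links VALUES (?, ?, ?)',
                         (from_url, to_url, link_text))

    def commit(self):
        self.con.commit()

With a DbTool like this on the Python path, the spider runs the usual way (scrapy crawl newyorker), and the collected index is committed to disk when the spider_closed signal fires.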