def test_extraction_using_single_values(self):
        """Test the extractor's behaviour among different situations"""

        lx = SgmlLinkExtractor(allow="sample")
        self.assertEqual(
            [link for link in lx.extract_links(self.response)],
            [
                Link(url="http://example.com/sample1.html", text=u""),
                Link(url="http://example.com/sample2.html", text=u"sample 2"),
                Link(url="http://example.com/sample3.html", text=u"sample 3 text"),
            ],
        )

        lx = SgmlLinkExtractor(allow="sample", deny="3")
        self.assertEqual(
            [link for link in lx.extract_links(self.response)],
            [
                Link(url="http://example.com/sample1.html", text=u""),
                Link(url="http://example.com/sample2.html", text=u"sample 2"),
            ],
        )

        lx = SgmlLinkExtractor(allow_domains="google.com")
        self.assertEqual(
            [link for link in lx.extract_links(self.response)], [Link(url="http://www.google.com/something", text=u"")]
        )

        lx = SgmlLinkExtractor(deny_domains="example.com")
        self.assertEqual(
            [link for link in lx.extract_links(self.response)], [Link(url="http://www.google.com/something", text=u"")]
        )
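These tests exercise a shared self.response fixture built in setUp(), which is not included in this listing. A minimal sketch of what that fixture is assumed to look like, with the URLs and link texts inferred from the assertions above and below (the real test suite loads a sample HTML file and typically imports HtmlResponse from scrapy.http, Link from scrapy.link and SgmlLinkExtractor from scrapy.contrib.linkextractors.sgml):

    def setUp(self):
        # approximate fixture; link order and texts are inferred from the expected results
        body = """<html><body>
        <div id="subwrapper">
          <a href="sample1.html"></a>
          <a href="sample2.html">sample 2</a>
        </div>
        <img src="sample2.jpg"/>
        <a href="sample3.html">sample 3 text</a>
        <a href="http://www.google.com/something"></a>
        <a href="innertag.html">inner tag</a>
        </body></html>"""
        self.response = HtmlResponse("http://example.com/index.html", body=body)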
    def test_tags(self):
        html = """<html><area href="sample1.html"></area><a href="sample2.html">sample 2</a><img src="sample2.jpg"/></html>"""
        response = HtmlResponse("http://example.com/index.html", body=html)

        lx = SgmlLinkExtractor(tags=None)
        self.assertEqual(lx.extract_links(response), [])

        lx = SgmlLinkExtractor()
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
        ])

        lx = SgmlLinkExtractor(tags="area")
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.com/sample1.html', text=u''),
        ])

        lx = SgmlLinkExtractor(tags="a")
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
        ])

        lx = SgmlLinkExtractor(tags=("a","img"), attrs=("href", "src"), deny_extensions=())
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample2.jpg', text=u''),
        ])
    def test_attrs(self):
        lx = SgmlLinkExtractor(attrs="href")
        self.assertEqual(lx.extract_links(self.response), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://www.google.com/something', text=u''),
            Link(url='http://example.com/innertag.html', text=u'inner tag'),
        ])

        lx = SgmlLinkExtractor(attrs=("href","src"), tags=("a","area","img"), deny_extensions=())
        self.assertEqual(lx.extract_links(self.response), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample2.jpg', text=u''),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://www.google.com/something', text=u''),
            Link(url='http://example.com/innertag.html', text=u'inner tag'),
        ])

        lx = SgmlLinkExtractor(attrs=None)
        self.assertEqual(lx.extract_links(self.response), [])

        html = """<html><area href="sample1.html"></area><a ref="sample2.html">sample text 2</a></html>"""
        response = HtmlResponse("http://example.com/index.html", body=html)
        lx = SgmlLinkExtractor(attrs=("href"))
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.com/sample1.html', text=u''),
        ])
Example #6
    def parse(self, response):
        print "IN PARSE!"
        # inspect_response(response,self)

        links = SgmlLinkExtractor(
            allow=('https://www.coursera.org/course/\w+'),
        )
        print "SIZE:", len(links.extract_links(response))
        for link in links.extract_links(response):
            # print link
            yield Request(link.url, callback=self.parse_item)
    def test_deny_extensions(self):
        html = """<a href="page.html">asd</a> and <a href="photo.jpg">"""
        response = HtmlResponse("http://example.org/", body=html)
        lx = SgmlLinkExtractor()
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.org/page.html', text=u'asd'),
        ])

        lx = SgmlLinkExtractor(deny_extensions="jpg")
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.org/page.html', text=u'asd'),
        ])
Example #8
 def parse_start_url(self, response):
     if not hasattr(response, 'encoding'):
         setattr(response, 'encoding', 'text/html;charset=UTF-8')
     target_le = SgmlLinkExtractor(
         allow=r'/cn/products/products_detail.asp\?Catalog_id=\w+')
     links = target_le.extract_links(response)
     if links:
         return [Request(url=link.url, cookies=self.forged_cookie, callback=self.parse_item) 
             for link in links]
     else:
         general_le = SgmlLinkExtractor(
                     allow=())
         return [Request(url=link.url, cookies=self.forged_cookie)
                 for link in general_le.extract_links(response)]
 def test_urls_type(self):
     '''Test that the resulting urls are regular strings and not unicode objects'''
     lx = SgmlLinkExtractor()
     self.assertTrue(
         all(
             isinstance(link.url, str)
             for link in lx.extract_links(self.response)))
Example #10
 def parseL2(self, response):
     # forums - links to lists and to threads
     s2 = SgmlLinkExtractor(restrict_xpaths=['//table[@class="forums-list"]/tr/td/a'])
     Links = s2.extract_links(response)
     for l in Links:
         yield Request(l.url, callback=self.parseL3)
     self.scrapeTheadURL(response)    
Example #11
 def parse(self, response):
     # title page
     hxs = HtmlXPathSelector(response)
     s1 = SgmlLinkExtractor(restrict_xpaths=['//a[@class="title"]'])
     Links = s1.extract_links(response)       
     for l in Links:
         yield Request(l.url, callback=self.parseL2)
Example #13
class LinkScraper:
    """A scraper to find all URLs in a page """

    def __init__(self):
        self._link_extractor = SgmlLinkExtractor()

    def parse(self, response):
        """Scrape a spider's HttpRequest.Response for links"""

        # sanity check
        if self._link_extractor is None:
            self._link_extractor = SgmlLinkExtractor()

        # use scrapy SgmlLinkExtractor to extract links
        try:
            links = self._link_extractor.extract_links(response)
        except SGMLParseError as e:
            # Page was poorly formatted, oh well
            _linkscraper_logger.error('Exception encountered when link extracting page')
            return []

        # add these links to our Url item
        urls = list()
        for link in links:
            url = ScrapedUrl()
            url['url'] = link.url
            url['domain'] = UrlUtility.get_domain(link.url)
            url['last_visited'] = datetime(1, 1, 1)
            if url not in urls:
                urls.append(url)

        return urls
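The LinkScraper snippet above leans on several names defined elsewhere in its project. A minimal, hedged sketch of stand-ins that would let it run (ScrapedUrl, UrlUtility and the logger name are assumptions, not the original implementations):

import logging
from datetime import datetime
from sgmllib import SGMLParseError  # raised by the SGML-based extractor on bad markup
from urlparse import urlparse

from scrapy.item import Item, Field

_linkscraper_logger = logging.getLogger(__name__)

class ScrapedUrl(Item):
    # minimal item carrying the fields populated above
    url = Field()
    domain = Field()
    last_visited = Field()

class UrlUtility(object):
    @staticmethod
    def get_domain(url):
        # hypothetical helper: return the hostname portion of a URL
        return urlparse(url).hostname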
    def parse_hospital_active_doctor(self, response):
        """ This function parses a sample response. Some contracts are mingled
        with this docstring.

        @url http://www.haodf.com/hospital/DE4roiYGYZwXhYmS30yF9V0wc/DE4rO-XCoLU0Jq1rbc1P6dS2aO/daifu.htm
        @returns items 14 14
        @returns requests 20 100
        @scrapes _name hospital specialty title reply2wCount
        """
        hxs = HtmlXPathSelector(response)

        city = response.meta['city']
        area = response.meta['area']
        print "$$$ current city: %s area: %s" % (city[0], area[0])

        #Sample
        #http://www.haodf.com/hospital/DE4roiYGYZwXhYmS30yF9V0wc/DE4rO-XCoLUE-578VWVmvC3uh7/daifu.htm

        linkExtractor = SgmlLinkExtractor(allow=(r"/hospital/\S+/\S+/daifu.htm",), unique=True)
        links = linkExtractor.extract_links(response)
        for link in links:
            request = Request(link.url, callback=self.parse_hospital_active_doctor)
            request.meta['city'] = response.meta['city']
            request.meta["area"] = response.meta['area']
            yield request

        hospital = hxs.select("/html/body/div[3]/div/a[3]/text()").extract()[0]
        print hospital
        specialty = hxs.select("//div[@class='subnav']/a/text()").re(r'(\S+)\s+(\S+)')[0]
        print specialty

        docLinks = hxs.select("//table[@id='doc_list_index']/tr[descendant::td[contains(@class, 'tda')]]")
        #docLinks = hxs.select("//table[@id='doc_list_index']/tr")

        for doc in docLinks:
            l = XPathItemLoader(ActiveDoctorItem(), doc)

            docNames = doc.select("./td[@class='tda']/li/a[contains(@href, 'http://www.haodf.com/doctor/')]/text()").extract()

            if len(docNames) != 0:
                print docNames[0]

            l.add_xpath('_name', "./td[@class='tda']/li/a[contains(@href, 'http://www.haodf.com/doctor/')]/text()")
            l.add_value('specialty', specialty)
            l.add_value('hospital', hospital)
            l.add_value('city', response.meta['city'])
            l.add_value('area', response.meta['area'])

            title = doc.select("./td[@class='tda']/li/text()").re('\S+')

            if len(title) == 1:
                l.add_value('title', title[0])

            l.add_xpath('count_ReplyInTwoWeeks', u"./td[@class='td_hf']/div[contains(text(), '近2周回复咨询')]/span/text()")
            l.add_xpath('count_ReplyTotal', u"./td[@class='td_hf']/div[contains(text(), '总共回复')]/span/text()")
            l.add_xpath('count_Calls', u"./td[@class='td_hf']/div[contains(text(), '已接听电话咨询')]/span/text()")
            ret = l.load_item()
            #print ret

            yield ret
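parse_hospital_active_doctor loads fields into an ActiveDoctorItem defined elsewhere in the project; a minimal sketch of an item with the fields used above (an assumption, not the original definition):

from scrapy.item import Item, Field

class ActiveDoctorItem(Item):
    _name = Field()
    specialty = Field()
    hospital = Field()
    city = Field()
    area = Field()
    title = Field()
    count_ReplyInTwoWeeks = Field()
    count_ReplyTotal = Field()
    count_Calls = Field()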
 def test_encoded_url_in_restricted_xpath(self):
     body = """<html><body><div><a href="?page=2">BinB</a></body></html>"""
     response = HtmlResponse("http://known.fm/AC%2FDC/", body=body, encoding='utf8')
     lx = SgmlLinkExtractor(restrict_xpaths="//div")
     self.assertEqual(lx.extract_links(response), [
         Link(url='http://known.fm/AC%2FDC/?page=2', text=u'BinB', fragment='', nofollow=False),
     ])
Example #18
 def parse(self, response):
     metadata = response.meta['userdata']
     # handle the regular categories
     link_extractor = SgmlLinkExtractor(restrict_xpaths=('//div[@class="linksList"]//a'))
     links = link_extractor.extract_links(response)
     for link in links:
         m = copy.deepcopy(metadata)
         url = link.url
         cat_title = link.text
         cat_name = cat_title.lower()
         m['tags_mapping']['category-0'] = [{'title': cat_title, 'name': cat_name}]
         gender = cm.guess_gender(cat_name)
         if gender:
             m['gender'] = [gender]
         yield Request(url=url, callback=self.parse_cat, errback=self.onerr, meta={'userdata': m})
     # handle the region-specific extras
     region = metadata['region']
     if region == 'jp':
         extra_urls = [
             'http://www.paulsmith.co.jp/shop/gifts/products',
             'http://www.paulsmith.co.jp/shop/reserve/products',
             'http://www.paulsmith.co.jp/shop/sales/products',
             'http://www.paulsmith.co.jp/shop/paulsmithcollection/products'
         ]
         for url in extra_urls:
             m = copy.deepcopy(metadata)
             yield Request(url=url, callback=self.parse_cat, errback=self.onerr, meta={'userdata': m})
     else:
         extra_urls = [
             'http://www.paulsmith.co.uk/%s-en/shop/valentines-day-gifts/valentines-day-gifts-for-her' % region,
             'http://www.paulsmith.co.uk/%s-en/shop/valentines-day-gifts/valentines-day-gifts-for-him' % region,
         ]
         for url in extra_urls:
             m = copy.deepcopy(metadata)
             yield Request(url=url, callback=self.parse_cat, errback=self.onerr, meta={'userdata': m})
Example #19
 def parse_session_hash(self, response):
     extractor = SgmlLinkExtractor(
         allow=r'/w/valikko\.jsp', tags='frame', attrs=('src', ))
     link = extractor.extract_links(response)[0]
     query = urlparse.urlparse(link.url).query
     params = urlparse.parse_qs(query)
     return params['MD5avain'][0]
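parse_session_hash reads a single query parameter off the first frame's src URL; a small sketch of that last step with a made-up URL and parameter value:

import urlparse  # Python 2 standard library, as used in the spider above

link_url = 'http://example.fi/w/valikko.jsp?MD5avain=abc123&kieli=fi'  # assumed sample value
query = urlparse.urlparse(link_url).query   # 'MD5avain=abc123&kieli=fi'
params = urlparse.parse_qs(query)           # {'MD5avain': ['abc123'], 'kieli': ['fi']}
assert params['MD5avain'][0] == 'abc123'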
 def parseThread(self, response):
     print('inside a thread')
     hxs = HtmlXPathSelector(response)  
     filename = "xxx"+response.url.split("/")[-2][1:]
     with open(filename, 'a') as f:
         for entry in hxs.select('//div[contains(@class,"forums-thread")]'):
             msgID=     entry.select('span/@id').extract()[0]        
             msgDate=   entry.select('h4/text()').extract()[0].encode('ascii','ignore').replace('\n','')
             msgText=' '.join(entry.select('span/text()').extract()).encode('ascii','ignore').replace('\n','')
             try:
                 mgAuthor=  entry.select('h3/span/a/text()').extract()[0].encode('ascii','ignore').replace('\n','')
             except:
                 mgAuthor='none'
             try:
                 msgTitle=  entry.select('h3/strong/text()').extract()[0].encode('ascii','ignore').replace('\n','')                
             except:
                 msgTitle="none"
             f.write('msgID:'+msgID+'\n')
             f.write('msgTitle:'+msgTitle+'\n')
             f.write('mgAuthor:'+mgAuthor+'\n')
             f.write('msgDate:'+msgDate+'\n')
             f.write('msgText:'+msgText+'\n\n')
     s = SgmlLinkExtractor(restrict_xpaths=['//li[contains(@class, "next")]'])
     Links = s.extract_links(response)
     if len(Links) > 0:
         print 'going to the next page'
         yield Request(Links[0].url, callback=self.parseThread)
class FollowAllSpider(BaseSpider):

    name = 'followall'

    def __init__(self, **kw):
        super(FollowAllSpider, self).__init__(**kw)
        url = kw.get('url') or kw.get('domain') or 'http://scrapinghub.com/'
        if not url.startswith('http://') and not url.startswith('https://'):
            url = 'http://%s/' % url
        self.url = url
        self.allowed_domains = [urlparse(url).hostname.lstrip('www.')]
        self.link_extractor = SgmlLinkExtractor()
        self.cookies_seen = set()

    def start_requests(self):
        return [Request(self.url, callback=self.parse)]

    def parse(self, response):
        """Parse a PageItem and all requests to follow

        @url http://www.scrapinghub.com/
        @returns items 1 1
        @returns requests 1
        @scrapes url title foo
        """
        self.log("I am at : "+ response.url)
        page = self._get_item(response)
        r = [page]
        r.extend(self._extract_requests(response))
        return r

    def _get_item(self, response):
        item = Page(url=response.url, size=str(len(response.body)),
            referer=response.request.headers.get('Referer'))
        self._set_title(item, response)
        self._set_new_cookies(item, response)
        return item

    def _extract_requests(self, response):
        r = []
        if isinstance(response, HtmlResponse):
            links = self.link_extractor.extract_links(response)
            r.extend(Request(x.url, callback=self.parse) for x in links)
        return r

    def _set_title(self, page, response):
        if isinstance(response, HtmlResponse):
            title = HtmlXPathSelector(response).select("//title/text()").extract()
            if title:
                page['title'] = title[0]

    def _set_new_cookies(self, page, response):
        cookies = []
        for cookie in [x.split(';', 1)[0] for x in response.headers.getlist('Set-Cookie')]:
            if cookie not in self.cookies_seen:
                self.cookies_seen.add(cookie)
                cookies.append(cookie)
        if cookies:
            page['newcookies'] = cookies
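FollowAllSpider stores its results in a Page item defined elsewhere; a minimal sketch with just the fields the spider populates (an assumption, not the original project's item):

from scrapy.item import Item, Field

class Page(Item):
    url = Field()
    size = Field()
    referer = Field()
    title = Field()
    newcookies = Field()

With the item in place, the spider would typically be launched with something like scrapy crawl followall -a url=example.com, since the url/domain keyword arguments are picked up in __init__.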
 def test_base_url_with_restrict_xpaths(self):
     html = """<html><head><title>Page title<title><base href="http://otherdomain.com/base/" />
     <body><p><a href="item/12.html">Item 12</a></p>
     </body></html>"""
     response = HtmlResponse("http://example.org/somepage/index.html", body=html)
     lx = SgmlLinkExtractor(restrict_xpaths="//p")
     self.assertEqual(lx.extract_links(response),
                      [Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')])
 def test_restrict_xpaths_concat_in_handle_data(self):
     """html entities cause SGMLParser to call handle_data hook twice"""
     body = """<html><body><div><a href="/foo">&gt;\xbe\xa9&lt;\xb6\xab</a></body></html>"""
     response = HtmlResponse("http://example.org", body=body, encoding='gb18030')
     lx = SgmlLinkExtractor(restrict_xpaths="//div")
     self.assertEqual(lx.extract_links(response),
                      [Link(url='http://example.org/foo', text=u'>\u4eac<\u4e1c',
                            fragment='', nofollow=False)])
Example #25
 def parse_cat(self, response):
     # first handle the products on this page
     link_extractor = SgmlLinkExtractor(restrict_xpaths=('//div[@class="category-products"]//h2//a'))
     links = link_extractor.extract_links(response)
     metadata = response.meta['userdata']
     for link in links:
         m = copy.deepcopy(metadata)
         url = link.url
         yield Request(url=url, callback=self.parse_details, errback=self.onerr, meta={'userdata': m})
     # then handle pagination
     link_extractor = SgmlLinkExtractor(restrict_xpaths=('//li[@class="next"]//a'))
     links = link_extractor.extract_links(response)
     if links:
         next_page = links[0]
         next_page_url = next_page.url
         m = copy.deepcopy(metadata)
         yield Request(url=next_page_url, callback=self.parse_cat, errback=self.onerr, meta={'userdata': m})
Example #26
 def parse(self, response):
     link_extractor = SgmlLinkExtractor(restrict_xpaths=('//div[@id="banners"]'))
     links = link_extractor.extract_links(response)
     metadata = response.meta['userdata']
     for link in links:
         m = copy.deepcopy(metadata)
         url = link.url
         yield Request(url=url, callback=self.parse_cat, errback=self.onerr, meta={'userdata': m})
Example #28
 def extractLinks(self, response, **extra): 
     """ 
     抽取链接
     """
     link_extractor = SgmlLinkExtractor(**extra)
     links = link_extractor.extract_links(response)
     log.msg('Links extracted from %s: %s' % (response.url, len(links)), level=log.DEBUG)
     return links
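extractLinks simply forwards its keyword arguments to SgmlLinkExtractor, so callers can narrow extraction per call. A hedged usage sketch (the XPath, callback and Request usage are illustrative, not from the original spider):

 def parse(self, response):
     # only follow links inside the product list, delegating to extractLinks above
     for link in self.extractLinks(response, restrict_xpaths=('//div[@class="product-list"]//a',)):
         yield Request(link.url, callback=self.parse_item)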
Example #30
 def parseL3(self, response):
     # like model specific
     self.scrapeTheadURL(response)
     
     # multipage
     s = SgmlLinkExtractor(restrict_xpaths=['//li[contains(@class, "next")]'])
     Links = s.extract_links(response)
     if len(Links) > 0:
         yield Request(Links[0].url, callback=self.parseL3)
 def test_restrict_xpaths(self):
     lx = SgmlLinkExtractor(restrict_xpaths=('//div[@id="subwrapper"]',))
     self.assertEqual(
         [link for link in lx.extract_links(self.response)],
         [
             Link(url="http://example.com/sample1.html", text=u""),
             Link(url="http://example.com/sample2.html", text=u"sample 2"),
         ],
     )
Example #32
    def extract_links(self, response, **extra):  # {{{
        """ Extract links from response
        extra - passed to SgmlLinkExtractor
        """

        link_extractor = SgmlLinkExtractor(**extra)
        links = link_extractor.extract_links(response)

        return links
Example #33
class BaseSiteSpider(CrawlSpider):
    
    def __init__(self, **kw):
        super(BaseSiteSpider, self).__init__(**kw)
        url = kw.get('url') or kw.get('domain')
        if not url.startswith('http://') and not url.startswith('https://'):
            url = 'http://%s/' % url
        self.url = url
        self.process = kw.get('process')
        self.deny = [re.compile(x) for x in kw.get('deny', [])]
        self.allowed_domains = [urlparse(url).hostname.lstrip('www.')]
        self.link_extractor = SgmlLinkExtractor()
        #self.cookies_seen = set()
        
    def clean_up(self):
        pass
        
    def setup(self):
        pass
    
    def start_requests(self):
        return [Request(self.url, callback=self.parse)]
    
    def parse(self, response):
        page = self._get_item(response)
        r = [page]
        r.extend(self._extract_requests(response))
        return r
    
    def _should_follow(self, url):
        for pattern in self.deny:
            if pattern.search(url) is not None:
                return False
        return True
    
    def _set_title(self, page, response):
        if isinstance(response, HtmlResponse):
            title = Selector(response).xpath("//title/text()").extract()
            if title:
                page['title'] = title[0]

    def _set_new_cookies(self, page, response):
        cookies = []
        for cookie in [x.split(';', 1)[0] for x in response.headers.getlist('Set-Cookie')]:
            if cookie not in self.cookies_seen:
                self.cookies_seen.add(cookie)
                cookies.append(cookie)
        if cookies:
            page['newcookies'] = cookies
            
    def _extract_requests(self, response):
        r = []
        if isinstance(response, HtmlResponse):
            links = (x for x in self.link_extractor.extract_links(response) if self._should_follow(x.url))
            r.extend(Request(x.url, callback=self.parse) for x in links)
        return r
 def test_link_nofollow(self):
     html = """
     <a href="page.html?action=print" rel="nofollow">Printer-friendly page</a>
     <a href="about.html">About us</a>
     """
     response = HtmlResponse("http://example.org/page.html", body=html)
     lx = SgmlLinkExtractor()
     self.assertEqual([link for link in lx.extract_links(response)],
         [ Link(url='http://example.org/page.html?action=print', text=u'Printer-friendly page', nofollow=True),
           Link(url='http://example.org/about.html', text=u'About us', nofollow=False) ])
Example #35
 def parse_type(self, response):
     link_extractor = SgmlLinkExtractor(
         restrict_xpaths=('//div[@class="inner-nav-content"]//a'))
     links = link_extractor.extract_links(response)
     if links:
         results = self.parse_cat(response)
     else:
         results = self.parse_list(response)
     for result in results:
         yield result
Example #37
 def parse(self, response):  # changed to parse to crawl all home page
     lx = SgmlLinkExtractor()
     urls = lx.extract_links(response)
     noworder = 0
     for oneurl in urls:
         noworder += 1
         yield scrapy.Request(
             oneurl.url,
             callback=lambda response, crawllevel=1, order=noworder, loopstr='':
                 self.parse_text(response, crawllevel, order, loopstr))
Example #39
 def parse(self, response):
     metadata = response.meta['userdata']
     m = metadata
     link_extractor = SgmlLinkExtractor(
         restrict_xpaths=('//div[@class="shared_header"]//li/a'))
     links = link_extractor.extract_links(response)
     enter_url = links[0].url
     yield Request(url=enter_url,
                   callback=self.parse_type,
                   errback=self.onerr,
                   meta={'userdata': m})
    def parse(self, response):
        """ This function parses a sample response. Some contracts are mingled
        with this docstring.

        @url http://www.chunyuyisheng.com/clinics/1/doctors
        @returns items 0 0
        @returns requests 500 100000
        """

        hxs = HtmlXPathSelector(response)

        listlinkExtractor = SgmlLinkExtractor(allow=(r"/clinics/\d+/doctors(|\?page=\d+)",), unique=True)
        list_links = listlinkExtractor.extract_links(response)
        for link in list_links:
            yield Request(link.url, callback=self.parse)


        docdetail_linkExtractor = SgmlLinkExtractor(allow=(r"/doctor/clinic_web_\w+$",), unique=True)
        docdetail_links = docdetail_linkExtractor.extract_links(response)
        for link in docdetail_links:
            yield Request(link.url, callback=self.parse_doctor_detail)
Example #41
 def parse_list(self, response):
     link_extractor = SgmlLinkExtractor(
         restrict_xpaths=('//div[@class="product_grid"]//a'))
     links = link_extractor.extract_links(response)
     metadata = response.meta['userdata']
     for link in links:
         m = copy.deepcopy(metadata)
         url = link.url
         yield Request(url=url,
                       callback=self.parse_details,
                       errback=self.onerr,
                       meta={'userdata': m})
Example #43
    def parse_faculty_detail(self, response):
        """ This function parses a sample response. Some contracts are mingled
        with this docstring.

        @url http://www.haodf.com/faculty/DE4rO-XCoLU0Jq1rbc1P6dS2aO.htm
        @returns items 21 21
        @returns requests 3 3
        @scrapes _name specialty title shortDesc
        """
        hxs = HtmlXPathSelector(response)

        linkExtractor = SgmlLinkExtractor(
            allow=(r"/faculty/\S+/menzhen.htm\?orderby", ), unique=True)
        links = linkExtractor.extract_links(response)
        for link in links:
            yield Request(link.url, callback=self.parse_faculty_detail)

        specialty = hxs.select(
            "/html/body/div[3]/div/div[2]/div/a[3]/text()").extract()
        hospital = hxs.select(
            "/html/body/div[3]/div/div[2]/div/a[2]/text()").extract()

        docLinks = hxs.select(
            "//table[@id='doc_list_index']/tr[descendant::td[contains(@class, 'tda')]]"
        )
        #docLinks = hxs.select("//table[@id='doc_list_index']/tr")

        for doc in docLinks:
            l = XPathItemLoader(DoctorItem(), doc)

            docNames = doc.select(
                "./td[@class='tda']/li/a[contains(@href, 'http://www.haodf.com/doctor/')]/text()"
            ).extract()

            if len(docNames) != 0:
                print docNames[0]

            l.add_xpath(
                '_name',
                "./td[@class='tda']/li/a[contains(@href, 'http://www.haodf.com/doctor/')]/text()"
            )
            l.add_value('specialty', specialty)
            l.add_value('hospital', hospital)
            l.add_xpath('title', "./td[@class='tda']/li/p[1]/text()")
            l.add_xpath('acadamicDegree', "./td[@class='tda']/li/p[2]/text()")
            l.add_xpath('shortDesc', "./td[@class='tdb']/text()")
            #clinic time todo

            ret = l.load_item()
            #print ret

            yield ret
Example #44
    def parse(self, response):
        if not self.book:
            log.msg("小说不在数据库中,下面创建小说")
            log.msg("获取小说标题和作者")
            hxs = Selector(response)
            join = Join("")
            _book = hxs.xpath(self.xpath_book).extract()
            if self.author:
                _author = hxs.xpath(self.author).extract()
                author = join(_author)
            else:
                author = ""
            book = join(_book) or None
            log.msg("插入小说到数据库中")
            novel = Novel(name=book,
                          author=author,
                          spider_class=self.config,
                          start_url=self.url,
                          interval=10,
                          alias=slugify(book))
            if self.category:
                category = Category.objects.get(self.category)
                novel.category = category
            else:
                category = Category.objects.all()
                novel.category = category[0]
            novel.save()
            log.msg("插入小说成功")
            self.book = novel

        if self.chapter_list:
            log.msg(u"开始获取章节列表")
            _sgml = SgmlLinkExtractor(restrict_xpaths=self.chapter_list)
            links = _sgml.extract_links(response)
            links = [
                Link(url=l.url, text=l.text, regex=self.regex) for l in links
            ]
            log.msg(u"成功获取章节列表")
            if self.regex:
                log.msg("对章节列表进行排序")
                links = sorted(links, key=get_cid)
                log.msg("章节列表排序完成")
            for n, link in enumerate(links, start=1):
                _q = Collection.objects.filter(
                    url_hash=hashlib.sha1(link.url).hexdigest())
                if not _q:
                    yield Request(url=link.url,
                                  callback=self._parse,
                                  meta=dict(link=link, num=n))
        else:
            log.msg(u"没有获取到章节列表的XPATH,请修改配置文件")
            return
Example #45
 def parse_testfile(self, response):
     lx = SgmlLinkExtractor()
     urls = lx.extract_links(response)
     readed = 0
     notreaded = 0
     for oneurl in urls:
         handle = OpenMD5File(oneurl.url, 'rb')
         if handle == False:
             notreaded += 1
         else:
             readed += 1
             handle.close()
     print readed, notreaded
 def parse(self, response):
     print('inside a thread')
     hxs = HtmlXPathSelector(response)  
     filename_ =    response.url.split("/")[-2][1:]
     filename=      os.path.abspath(databasePath+ "\data\%s" % filename_)
     dumpFilePath = os.path.abspath(databasePath+ "\dump\%s" % filename_)
     try:
         a = response.meta['page']
     except KeyError:
         a=0
         os.mkdir(dumpFilePath)
         with open(filename, 'a') as f:
             #header
             forumTitle=hxs.select('//div[@class="module forums"]/h2/text()').extract()[0].encode('ascii','ignore').replace('\n','')
             extraInfo=hxs.select('//div[@class="module forums discussion tid"]/h4/text()').extract()[0].encode('ascii','ignore').replace('\n','')
             f.write("title:"+forumTitle+"\n")
             f.write("extraInfo:"+extraInfo+"\n")
             f.write(response.url+"\n")
             f.write(filename+"\n")
             f.write(dumpFilePath+"\n\n")
             
     with open(dumpFilePath+ "\\" +str(a)+'.html', 'a') as fd:
         fd.write(response.body)
         
     with open(filename, 'a') as f:
         for entry in hxs.select('//div[contains(@class,"forums-thread")]'):
             msgID=     entry.select('span/@id').extract()[0]        
             msgDate=   entry.select('h4/text()').extract()[0].encode('ascii','ignore').replace('\n','')
             msgText=' '.join(entry.select('span/text()').extract()).encode('ascii','ignore').replace('\n','')
             try:
                 mgAuthor=  entry.select('h3/span/a/text()').extract()[0].encode('ascii','ignore').replace('\n','')
             except:
                 mgAuthor='none'
             try:
                 msgTitle=  entry.select('h3/strong/text()').extract()[0].encode('ascii','ignore').replace('\n','')                
             except:
                 msgTitle="none"
             f.write('msgID:'+msgID+'\n')
             f.write('msgTitle:'+msgTitle+'\n')
             f.write('mgAuthor:'+mgAuthor+'\n')
             f.write('msgDate:'+msgDate+'\n')
             f.write('msgText:'+msgText+'\n\n')
     s = SgmlLinkExtractor(restrict_xpaths=['//li[contains(@class, "next")]'])
     Links = s.extract_links(response)
     if len(Links) > 0:
         print 'going to the next page'
         r = Request(googc+Links[0].url, callback=self.parse)
         r.meta['page']=a+1;
         yield r
Example #50
class StockHeXun(Spider):

    name = "hexun"
    allowed_domains = ["stock.hexun.com"]
    child_link = re.compile(ur'http://stock.hexun.com/\d+-\d+-\d+/\d+.html')
    start_urls = [
        "http://stock.hexun.com/"]



    def __init__(self ,*arg, **kw):
        super(StockHeXun , self).__init__(*arg , **kw)
        self.link_extract = SgmlLinkExtractor()

    def parse(self, response):
        if response.url in self.start_urls:
            links = self.link_extract.extract_links(response)
            for x in links:
                match = self.child_link.match(x.url)
                if match:
                    if x.url not in BLOOM_FILTER:
                        BLOOM_FILTER.add(x.url)
                    yield Request(x.url, callback=self.parse)
        else:
            item = WenkrItem()
            title = response.xpath('//div[@id="artibodyTitle"]/h1/text()').extract()
            if len(title) == 0:
                title = response.xpath('//head/title//text()').extract()[0]
            else:
                title = title[0]
            item['title'] = title
            item['content'] = self.make_content(
                response.xpath('//div[@id="artibody"]//*//text()').extract())
            item['category'] = '股票'  # "stocks"
            author = response.xpath('//span[@id="author_baidu"]/font/text()').extract()
            if len(author) > 0 :
                item['author'] = response.xpath('//span[@id="author_baidu"]/font/text()').extract()[0]
            else:
                item['author'] = u'和讯'  # HeXun, the site name
            item['tags'] = response.xpath('//meta[@name="keywords"]/@content').extract()[0]
            if len(item['content']) > 80 :
                yield item

    def make_content(self, ps):
        content = []
        for p in ps:
            if len(p) > 5 :
                content.append('  %s' % p)
        return '\r\n'.join(content)
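StockHeXun references a module-level BLOOM_FILTER and a WenkrItem that are defined elsewhere; minimal stand-ins that make the snippet runnable (a plain set is used in place of an actual Bloom filter, and the item fields are inferred from the assignments above):

from scrapy.item import Item, Field

# plain-set stand-in for the deduplication structure used in parse()
BLOOM_FILTER = set()

class WenkrItem(Item):
    title = Field()
    content = Field()
    category = Field()
    author = Field()
    tags = Field()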
Example #51
    def parse_leagues(self, response):

        sx = SgmlLinkExtractor(allow=[
            r'http://www.sportingbet.com/sports-football/'
            '[A-Za-z0-9-]+/1-102-\d+.html'
        ])

        league_links = sx.extract_links(response)

        # Remove unwanted links; linkFilter returns True for links to drop
        league_links = [
            link for link in league_links
            if not linkFilter(self.name, link.url)
        ]

        eventClassIdList = []
        # Extract eventClassId from the link.url with regex
        for link in league_links:
            matches = re.findall(
                r'http://www.sportingbet.com/sports-football/'
                '[A-Za-z0-9-]+/1-102-(\d+?).html', link.url)
            if matches:
                eventClassIdList.append(matches[0])

        base_url = 'http://www.sportingbet.com/services/CouponTemplate.mvc/GetCoupon'
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded',
            'Referer':
            'http://www.sportingbet.com/sports-football/0-102-410.html',
            'X-Requested-With': 'XMLHttpRequest',
            'Host': 'www.sportingbet.com',
        }
        # cookies =response.meta['cookies']
        for id in eventClassIdList:
            # Build GETstr
            GETstr = '?couponAction=EVENTCLASSCOUPON&'
            GETstr += 'sportIds=102&'
            GETstr += 'marketTypeId=&'
            GETstr += 'eventId=&'
            GETstr += 'bookId=&'
            GETstr += 'eventClassId=' + str(id) + '&'
            GETstr += 'sportId=102&'
            GETstr += 'eventTimeGroup=ETG_NextFewHours_0_0'
            # make req

            yield Request(url=base_url + GETstr,
                          headers=headers,
                          meta={'eventClassId': str(id)},
                          callback=self.pre_parse_Data)
Example #52
  def parse_brands(self,response):

    lx = SgmlLinkExtractor(restrict_xpaths=('//td[@valign="top"]'), allow=('\S+\.com'), unique=True)
    links = lx.extract_links(response)
    brands_all = set(sorted(link.text for link in links))

    self.log(u'Extracted {} brands.'.format(len(brands_all)), scrapy.log.DEBUG)

    """Traverse through all the pages to get all products"""
    """brands_alphabets = ['A','B','C','D','E','F','G','H','I',
                        'J','K','L','M','N','O','P','Q','R',
                        'S','T','U','V','W','X','Y','Z']"""
    brands_alphabets = ['A']
    for alpha in brands_alphabets:
      yield Request(self.url_view_items + str(alpha), callback=self.items_list)
    def test_process_value(self):
        """Test restrict_xpaths with encodings"""
        html = """
        <a href="javascript:goToPage('../other/page.html','photo','width=600,height=540,scrollbars'); return false">Link text</a>
        <a href="/about.html">About us</a>
        """
        response = HtmlResponse("http://example.org/somepage/index.html", body=html, encoding="windows-1252")

        def process_value(value):
            m = re.search("javascript:goToPage\('(.*?)'", value)
            if m:
                return m.group(1)

        lx = SgmlLinkExtractor(process_value=process_value)
        self.assertEqual(lx.extract_links(response), [Link(url="http://example.org/other/page.html", text="Link text")])
Example #54
    def parse_image_list_page(self, response):
        hxs = HtmlXPathSelector(response)
        selector = SgmlLinkExtractor(allow=('photos/\d+'), restrict_xpaths="//div[@class='gallery-list-wrapper page-block']", unique=True)
        next_page_link = SgmlLinkExtractor(allow=('shop/\d+/photos(\?pg=\d+)*'), restrict_xpaths="//a[@class='NextPage']", unique=True)
        # Prepare cookies
        cookies = {}
        if 'Set-Cookie' in response.headers:
            for eq in response.headers['Set-Cookie'].split(';'):
                # split on the first '=' only, so values containing '=' survive
                k, v = eq.strip().split('=', 1)
                cookies[k] = v

        requests = []
        # follow next-page
        for link in next_page_link.extract_links(response):
            req = Request(link.url, cookies=cookies, callback=self.parse_image_list_page)
            requests.append(req)

        # follow image link
        for link in selector.extract_links(response):
            req = Request(link.url, cookies=cookies, callback=self.extract_image)
            requests.append(req)

        for req in requests:
            yield req
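The cookie preparation in parse_image_list_page splits the raw Set-Cookie header by hand; a sketch of the same step using the Python 2 standard-library cookie parser instead (the header value is a made-up example):

from Cookie import SimpleCookie

raw = 'sessionid=abc123; Path=/; HttpOnly'  # assumed example Set-Cookie value
jar = SimpleCookie()
jar.load(raw)
cookies = dict((name, morsel.value) for name, morsel in jar.items())
# cookies == {'sessionid': 'abc123'}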