def test_tags(self):
        html = """<html><area href="sample1.html"></area><a href="sample2.html">sample 2</a><img src="sample2.jpg"/></html>"""
        response = HtmlResponse("http://example.com/index.html", body=html)

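        # tags=None disables tag scanning entirely, so no links come back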
        lx = SgmlLinkExtractor(tags=None)
        self.assertEqual(lx.extract_links(response), [])

        lx = SgmlLinkExtractor()
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
        ])

        lx = SgmlLinkExtractor(tags="area")
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.com/sample1.html', text=u''),
        ])

        lx = SgmlLinkExtractor(tags="a")
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
        ])

        lx = SgmlLinkExtractor(tags=("a","img"), attrs=("href", "src"), deny_extensions=())
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample2.jpg', text=u''),
        ])
 def test_encoded_url_in_restricted_xpath(self):
     body = """<html><body><div><a href="?page=2">BinB</a></body></html>"""
     response = HtmlResponse("http://known.fm/AC%2FDC/", body=body, encoding='utf8')
     lx = SgmlLinkExtractor(restrict_xpaths="//div")
     self.assertEqual(lx.extract_links(response), [
         Link(url='http://known.fm/AC%2FDC/?page=2', text=u'BinB', fragment='', nofollow=False),
     ])
Example #3
 def parse_session_hash(self, response):
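      # assumes a module-level "import urlparse" (Python 2 stdlib)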
     extractor = SgmlLinkExtractor(
         allow=r'/w/valikko\.jsp', tags='frame', attrs=('src', ))
     link = extractor.extract_links(response)[0]
     query = urlparse.urlparse(link.url).query
     params = urlparse.parse_qs(query)
     return params['MD5avain'][0]
 def test_deny_extensions(self):
     html = """<a href="page.html">asd</a> and <a href="photo.jpg">"""
     response = HtmlResponse("http://example.org/", body=html)
     lx = SgmlLinkExtractor()
     self.assertEqual(lx.extract_links(response), [
         Link(url='http://example.org/page.html', text=u'asd'),
     ])
 def __init__(self, allow = (), deny = (), allow_domains = (), deny_domains = (), restrict_xpaths = (),
              tags = ('a', 'area'), attrs = ('href',), canonicalize = True, unique = True, process_value = None, check_url = True):
      # store the extra check_url flag before delegating to the base extractor
     self.check_url = check_url
     
     SgmlLinkExtractor.__init__(self, allow = allow, deny = deny, allow_domains = allow_domains, deny_domains = deny_domains, restrict_xpaths = restrict_xpaths,
                                tags = tags, attrs = attrs, canonicalize = canonicalize, unique = unique, process_value = process_value)
    def parse_hospital_active_doctor(self, response):
        """ This function parses a sample response. Some contracts are mingled
        with this docstring.

        @url http://www.haodf.com/hospital/DE4roiYGYZwXhYmS30yF9V0wc/DE4rO-XCoLU0Jq1rbc1P6dS2aO/daifu.htm
        @returns items 14 14
        @returns requests 20 100
        @scrapes _name hospital specialty title reply2wCount
        """
        hxs = HtmlXPathSelector(response)

        city = response.meta['city']
        area = response.meta['area']
        print "$$$ current city: %s area: %s" % (city[0], area[0])

        #Sample
        #http://www.haodf.com/hospital/DE4roiYGYZwXhYmS30yF9V0wc/DE4rO-XCoLUE-578VWVmvC3uh7/daifu.htm

        linkExtractor = SgmlLinkExtractor(allow=(r"/hospital/\S+/\S+/daifu.htm",), unique=True)
        links = linkExtractor.extract_links(response)
        for link in links:
            request = Request(link.url, callback=self.parse_hospital_active_doctor)
            request.meta['city'] = response.meta['city']
            request.meta["area"] = response.meta['area']
            yield request

        hospital = hxs.select("/html/body/div[3]/div/a[3]/text()").extract()[0]
        print hospital
        specialty = hxs.select("//div[@class='subnav']/a/text()").re(r'(\S+)\s+(\S+)')[0]
        print specialty

        docLinks = hxs.select("//table[@id='doc_list_index']/tr[descendant::td[contains(@class, 'tda')]]")
        #docLinks = hxs.select("//table[@id='doc_list_index']/tr")

        for doc in docLinks:
            l = XPathItemLoader(ActiveDoctorItem(), doc)

            docNames = doc.select("./td[@class='tda']/li/a[contains(@href, 'http://www.haodf.com/doctor/')]/text()").extract()

            if len(docNames) != 0:
                print docNames[0]

            l.add_xpath('_name', "./td[@class='tda']/li/a[contains(@href, 'http://www.haodf.com/doctor/')]/text()")
            l.add_value('specialty', specialty)
            l.add_value('hospital', hospital)
            l.add_value('city', response.meta['city'])
            l.add_value('area', response.meta['area'])

            title = doc.select("./td[@class='tda']/li/text()").re(r'\S+')

            if len(title) == 1:
                l.add_value('title', title[0])

            l.add_xpath('count_ReplyInTwoWeeks', u"./td[@class='td_hf']/div[contains(text(), '近2周回复咨询')]/span/text()")
            l.add_xpath('count_ReplyTotal', u"./td[@class='td_hf']/div[contains(text(), '总共回复')]/span/text()")
            l.add_xpath('count_Calls', u"./td[@class='td_hf']/div[contains(text(), '已接听电话咨询')]/span/text()")
            ret = l.load_item()
            #print ret

            yield ret
Example #7
 def parseL2(self, response):
      # forums - links to lists and to threads
     s2 = SgmlLinkExtractor(restrict_xpaths=['//table[@class="forums-list"]/tr/td/a'])
     Links = s2.extract_links(response)
     for l in Links:
         yield Request(l.url, callback=self.parseL3)
     self.scrapeTheadURL(response)    
Example #8
 def parse(self, response):
     # title page
     hxs = HtmlXPathSelector(response)
     s1 = SgmlLinkExtractor(restrict_xpaths=['//a[@class="title"]'])
     Links = s1.extract_links(response)       
     for l in Links:
         yield Request(l.url, callback=self.parseL2)
 def parseThread(self, response):
     print('inside a thread')
     hxs = HtmlXPathSelector(response)  
     filename = "xxx"+response.url.split("/")[-2][1:]
     with open(filename, 'a') as f:
         for entry in hxs.select('//div[contains(@class,"forums-thread")]'):
             msgID=     entry.select('span/@id').extract()[0]        
             msgDate=   entry.select('h4/text()').extract()[0].encode('ascii','ignore').replace('\n','')
             msgText=' '.join(entry.select('span/text()').extract()).encode('ascii','ignore').replace('\n','')
             try:
                 mgAuthor=  entry.select('h3/span/a/text()').extract()[0].encode('ascii','ignore').replace('\n','')
              except IndexError:  # no author element
                 mgAuthor='none'
             try:
                 msgTitle=  entry.select('h3/strong/text()').extract()[0].encode('ascii','ignore').replace('\n','')                
              except IndexError:  # no title element
                 msgTitle="none"
             f.write('msgID:'+msgID+'\n')
             f.write('msgTitle:'+msgTitle+'\n')
             f.write('mgAuthor:'+mgAuthor+'\n')
             f.write('msgDate:'+msgDate+'\n')
             f.write('msgText:'+msgText+'\n\n')
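      # follow the "next" pagination link, if any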
     s = SgmlLinkExtractor(restrict_xpaths=['//li[contains(@class, "next")]'])
     Links = s.extract_links(response)
     if len(Links) > 0:
         print 'going to the next page'
         yield Request(Links[0].url, callback=self.parseThread)
    def test_attrs(self):
        lx = self.extractor_cls(attrs="href")
        self.assertEqual(lx.extract_links(self.response), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://www.google.com/something', text=u''),
            Link(url='http://example.com/innertag.html', text=u'inner tag'),
        ])

        lx = self.extractor_cls(attrs=("href","src"), tags=("a","area","img"), deny_extensions=())
        self.assertEqual(lx.extract_links(self.response), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample2.jpg', text=u''),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://www.google.com/something', text=u''),
            Link(url='http://example.com/innertag.html', text=u'inner tag'),
        ])

        lx = self.extractor_cls(attrs=None)
        self.assertEqual(lx.extract_links(self.response), [])

        html = """<html><area href="sample1.html"></area><a ref="sample2.html">sample text 2</a></html>"""
        response = HtmlResponse("http://example.com/index.html", body=html)
        lx = SgmlLinkExtractor(attrs=("href"))
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.com/sample1.html', text=u''),
        ])
 def test_restrict_xpaths_concat_in_handle_data(self):
     """html entities cause SGMLParser to call handle_data hook twice"""
     body = """<html><body><div><a href="/foo">&gt;\xbe\xa9&lt;\xb6\xab</a></body></html>"""
     response = HtmlResponse("http://example.org", body=body, encoding='gb18030')
     lx = SgmlLinkExtractor(restrict_xpaths="//div")
     self.assertEqual(lx.extract_links(response),
                      [Link(url='http://example.org/foo', text=u'>\u4eac<\u4e1c',
                            fragment='', nofollow=False)])
 def test_base_url_with_restrict_xpaths(self):
     html = """<html><head><title>Page title<title><base href="http://otherdomain.com/base/" />
     <body><p><a href="item/12.html">Item 12</a></p>
     </body></html>"""
     response = HtmlResponse("http://example.org/somepage/index.html", body=html)
     lx = SgmlLinkExtractor(restrict_xpaths="//p")
     self.assertEqual(lx.extract_links(response),
                      [Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')])
Example #13
 def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(), 
              tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None,
              deny_extensions=None, seen_urls=()):
     SgmlLinkExtractor.__init__(self,allow=allow, deny=deny, allow_domains=allow_domains, deny_domains=deny_domains, restrict_xpaths=restrict_xpaths, 
              tags=tags, attrs=attrs, canonicalize=canonicalize, unique=unique, process_value=process_value,
              deny_extensions=deny_extensions)
     
      self.seen_urls = {}  # the base class does not define seen_urls; initialize before populating
      for l in seen_urls:
          self.seen_urls[l] = True
 def test_restrict_xpaths(self):
     lx = SgmlLinkExtractor(restrict_xpaths=('//div[@id="subwrapper"]',))
     self.assertEqual(
         [link for link in lx.extract_links(self.response)],
         [
             Link(url="http://example.com/sample1.html", text=u""),
             Link(url="http://example.com/sample2.html", text=u"sample 2"),
         ],
     )
Example #15
    def extract_links(self, response, **extra):  # {{{
        """ Extract links from response
        extra - passed to SgmlLinkExtractor
        """

        link_extractor = SgmlLinkExtractor(**extra)
        links = link_extractor.extract_links(response)

        return links
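For comparison, a minimal sketch of the same thin wrapper against the lxml-based LinkExtractor that later Scrapy releases ship as the replacement for SgmlLinkExtractor (import path as in current Scrapy; the keyword arguments largely overlap, and the method name here is just illustrative):

    def extract_links_modern(self, response, **extra):
        """Same delegation pattern, minus the deprecated SGML parser."""
        from scrapy.linkextractors import LinkExtractor

        return LinkExtractor(**extra).extract_links(response)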
Example #16
 def parseL3(self, response):
     # like model specific
     self.scrapeTheadURL(response)
     
     # multipage
     s = SgmlLinkExtractor(restrict_xpaths=['//li[contains(@class, "next")]'])
     Links = s.extract_links(response)
     if len(Links) > 0:
         yield Request(Links[0].url, callback=self.parseL3)
Example #17
    def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(), 
                 tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None,
                 ignore_set=None):

        self.ignore_set = ignore_set if ignore_set is not None else set()

        SgmlLinkExtractor.__init__(self, allow=allow, deny=deny,
                allow_domains=allow_domains, deny_domains=deny_domains,
                restrict_xpaths=restrict_xpaths, tags=tags, attrs=attrs,
                canonicalize=canonicalize, unique=unique, process_value=process_value)
 def test_link_nofollow(self):
     html = """
     <a href="page.html?action=print" rel="nofollow">Printer-friendly page</a>
     <a href="about.html">About us</a>
     """
     response = HtmlResponse("http://example.org/page.html", body=html)
     lx = SgmlLinkExtractor()
     self.assertEqual([link for link in lx.extract_links(response)],
         [ Link(url='http://example.org/page.html?action=print', text=u'Printer-friendly page', nofollow=True),
           Link(url='http://example.org/about.html', text=u'About us', nofollow=False) ])
Example #19
    def parse(self, response):
        print "IN PARSE!"
        # inspect_response(response,self)

        links = SgmlLinkExtractor(
                allow=(r'https://www.coursera.org/course/\w+',),
            )
        extracted = links.extract_links(response)
        print "SIZE:", len(extracted)
        for link in extracted:
            # print link
            yield Request(link.url, callback=self.parse_item)
Example #20
 def parse(self, response):  # changed to parse to crawl all home page
     lx = SgmlLinkExtractor()
     urls = lx.extract_links(response)
     noworder = 0
     for oneurl in urls:
         noworder += 1
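          # default arguments freeze the per-request values; a plain closure would late-bind noworder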
         yield scrapy.Request(
             oneurl.url,
             callback=lambda response, crawllevel=1, order=noworder, loopstr="": self.parse_text(
                 response, crawllevel, order, loopstr
             ),
         )
Example #21
 def parse_testfile(self, response):
     lx = SgmlLinkExtractor()
     urls = lx.extract_links(response)
     readed = 0
     notreaded = 0
     for oneurl in urls:
         handle = OpenMD5File(oneurl.url, "rb")
          if handle is False:
             notreaded += 1
         else:
             readed += 1
             handle.close()
     print readed, notreaded
    def test_extraction_using_single_values(self):
        """Test the extractor's behaviour among different situations"""

        lx = SgmlLinkExtractor(allow="sample")
        self.assertEqual(
            [link for link in lx.extract_links(self.response)],
            [
                Link(url="http://example.com/sample1.html", text=u""),
                Link(url="http://example.com/sample2.html", text=u"sample 2"),
                Link(url="http://example.com/sample3.html", text=u"sample 3 text"),
            ],
        )

        lx = SgmlLinkExtractor(allow="sample", deny="3")
        self.assertEqual(
            [link for link in lx.extract_links(self.response)],
            [
                Link(url="http://example.com/sample1.html", text=u""),
                Link(url="http://example.com/sample2.html", text=u"sample 2"),
            ],
        )

        lx = SgmlLinkExtractor(allow_domains="google.com")
        self.assertEqual(
            [link for link in lx.extract_links(self.response)], [Link(url="http://www.google.com/something", text=u"")]
        )

        lx = SgmlLinkExtractor(deny_domains="example.com")
        self.assertEqual(
            [link for link in lx.extract_links(self.response)], [Link(url="http://www.google.com/something", text=u"")]
        )
Example #24
 def parse_start_url(self, response):
     if not hasattr(response, 'encoding'):
          setattr(response, 'encoding', 'utf-8')
     target_le = SgmlLinkExtractor(
          allow=r'/cn/products/products_detail\.asp\?Catalog_id=\w+')
     links = target_le.extract_links(response)
     if links:
         return [Request(url=link.url, cookies=self.forged_cookie, callback=self.parse_item) 
             for link in links]
     else:
         general_le = SgmlLinkExtractor(
                     allow=())
         return [Request(url=link.url, cookies=self.forged_cookie)
                 for link in general_le.extract_links(response)]
 def parse(self, response):
     print('inside a thread')
     hxs = HtmlXPathSelector(response)  
      filename_ = response.url.split("/")[-2][1:]
      filename = os.path.abspath(os.path.join(databasePath, "data", filename_))
      dumpFilePath = os.path.abspath(os.path.join(databasePath, "dump", filename_))
     try:
         a = response.meta['page']
     except KeyError:
         a=0
         os.mkdir(dumpFilePath)
         with open(filename, 'a') as f:
             #header
             forumTitle=hxs.select('//div[@class="module forums"]/h2/text()').extract()[0].encode('ascii','ignore').replace('\n','')
             extraInfo=hxs.select('//div[@class="module forums discussion tid"]/h4/text()').extract()[0].encode('ascii','ignore').replace('\n','')
             f.write("title:"+forumTitle+"\n")
             f.write("extraInfo:"+extraInfo+"\n")
             f.write(response.url+"\n")
             f.write(filename+"\n")
             f.write(dumpFilePath+"\n\n")
             
      with open(os.path.join(dumpFilePath, str(a) + '.html'), 'a') as fd:
         fd.write(response.body)
         
     with open(filename, 'a') as f:
         for entry in hxs.select('//div[contains(@class,"forums-thread")]'):
             msgID=     entry.select('span/@id').extract()[0]        
             msgDate=   entry.select('h4/text()').extract()[0].encode('ascii','ignore').replace('\n','')
             msgText=' '.join(entry.select('span/text()').extract()).encode('ascii','ignore').replace('\n','')
             try:
                 mgAuthor=  entry.select('h3/span/a/text()').extract()[0].encode('ascii','ignore').replace('\n','')
              except IndexError:  # no author element
                 mgAuthor='none'
             try:
                 msgTitle=  entry.select('h3/strong/text()').extract()[0].encode('ascii','ignore').replace('\n','')                
              except IndexError:  # no title element
                 msgTitle="none"
             f.write('msgID:'+msgID+'\n')
             f.write('msgTitle:'+msgTitle+'\n')
             f.write('mgAuthor:'+mgAuthor+'\n')
             f.write('msgDate:'+msgDate+'\n')
             f.write('msgText:'+msgText+'\n\n')
     s = SgmlLinkExtractor(restrict_xpaths=['//li[contains(@class, "next")]'])
     Links = s.extract_links(response)
     if len(Links) > 0:
         print 'going to the next page'
         r = Request(googc+Links[0].url, callback=self.parse)
          r.meta['page'] = a + 1
         yield r
Example #26
  def parse_brands(self,response):

    lx = SgmlLinkExtractor(restrict_xpaths=('//td[@valign="top"]',), allow=(r'\S+\.com',), unique=True)
    links = lx.extract_links(response)
    brands_all = sorted(set(link.text for link in links))

    self.log(u'Extracted {} brands.'.format(len(brands_all)), scrapy.log.DEBUG)

    """Traverse through all the pages to get all products"""
    """brands_alphabets = ['A','B','C','D','E','F','G','H','I',
                        'J','K','L','M','N','O','P','Q','R',
                        'S','T','U','V','W','X','Y','Z']"""
    brands_alphabets = ['A']
    for alpha in brands_alphabets:
      yield Request(self.url_view_items + str(alpha), callback=self.items_list)
    def test_restrict_xpaths_encoding(self):
        """Test restrict_xpaths with encodings"""
        html = """<html><head><title>Page title<title>
        <body><p><a href="item/12.html">Item 12</a></p>
        <div class='links'>
        <p><a href="/about.html">About us\xa3</a></p>
        </div>
        <div>
        <p><a href="/nofollow.html">This shouldn't be followed</a></p>
        </div>
        </body></html>"""
        response = HtmlResponse("http://example.org/somepage/index.html", body=html, encoding="windows-1252")

        lx = SgmlLinkExtractor(restrict_xpaths="//div[@class='links']")
        self.assertEqual(lx.extract_links(response), [Link(url="http://example.org/about.html", text=u"About us\xa3")])
    def test_process_value(self):
        """Test restrict_xpaths with encodings"""
        html = """
        <a href="javascript:goToPage('../other/page.html','photo','width=600,height=540,scrollbars'); return false">Link text</a>
        <a href="/about.html">About us</a>
        """
        response = HtmlResponse("http://example.org/somepage/index.html", body=html, encoding="windows-1252")

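        # process_value receives each raw attribute value; returning None drops the link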
        def process_value(value):
            m = re.search(r"javascript:goToPage\('(.*?)'", value)
            if m:
                return m.group(1)

        lx = SgmlLinkExtractor(process_value=process_value)
        self.assertEqual(lx.extract_links(response), [Link(url="http://example.org/other/page.html", text="Link text")])
Example #29
    def crawl_all(self, response):
        print 'Crawling all...'

        # Get list of decks
        self.deck_links = SgmlLinkExtractor(allow = r'/sets/\d+').extract_links(response)

        return self.parse_deck_links(None)
Example #30
 def parsePage(self, response):
     hxs = HtmlXPathSelector(response)
     
     item = response.meta['item']
     emails = collectAllEmail(hxs.extract())
     if len(emails) > 0:
         item['email'] = emails[0]
         yield item
        
      # allow_domains expects bare domain names, not full URLs; assumes "import urlparse" is available
      extractor = SgmlLinkExtractor(allow_domains=urlparse.urlparse(response.url).netloc)
     
     for entry in extractor.extract_links(response):
         if entry.url is not None:
             req = Request(entry.url, callback=self.parsePage)
             req.meta['item'] = item
             yield req