Python SgmlLinkExtractor Examples, scrapy.contrib.linkextractors.sgml.SgmlLinkExtractor Python Examples

Example #1

0

Show file

File: test_contrib_linkextractors.py Project: 0xfab/scrapy

    def test_tags(self):
        html = """<html><area href="sample1.html"></area><a href="sample2.html">sample 2</a><img src="sample2.jpg"/></html>"""
        response = HtmlResponse("http://example.com/index.html", body=html)

        lx = SgmlLinkExtractor(tags=None)
        self.assertEqual(lx.extract_links(response), [])

        lx = SgmlLinkExtractor()
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
        ])

        lx = SgmlLinkExtractor(tags="area")
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.com/sample1.html', text=u''),
        ])

        lx = SgmlLinkExtractor(tags="a")
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
        ])

        lx = SgmlLinkExtractor(tags=("a","img"), attrs=("href", "src"), deny_extensions=())
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample2.jpg', text=u''),
        ])

Example #2

0

Show file

File: test_contrib_linkextractors.py Project: Quebec-Python/web-scraping-101

 def test_encoded_url_in_restricted_xpath(self):
     body = """<html><body><div><a href="?page=2">BinB</a></body></html>"""
     response = HtmlResponse("http://known.fm/AC%2FDC/", body=body, encoding='utf8')
     lx = SgmlLinkExtractor(restrict_xpaths="//div")
     self.assertEqual(lx.extract_links(response), [
         Link(url='http://known.fm/AC%2FDC/?page=2', text=u'BinB', fragment='', nofollow=False),
     ])

Example #3

0

Show file

File: oodi.py Project: jpvanhal/cloudsizzle

 def parse_session_hash(self, response):
     extractor = SgmlLinkExtractor(
         allow=r'/w/valikko\.jsp', tags='frame', attrs=('src', ))
     link = extractor.extract_links(response)[0]
     query = urlparse.urlparse(link.url).query
     params = urlparse.parse_qs(query)
     return params['MD5avain'][0]

Example #4

0

Show file

File: test_contrib_linkextractors.py Project: Quebec-Python/web-scraping-101

 def test_deny_extensions(self):
     html = """<a href="page.html">asd</a> and <a href="photo.jpg">"""
     response = HtmlResponse("http://example.org/", body=html)
     lx = SgmlLinkExtractor()
     self.assertEqual(lx.extract_links(response), [
         Link(url='http://example.org/page.html', text=u'asd'),
     ])

Example #5

0

Show file

File: linkextractor.py Project: qpwang/CareerTalkCrawler

 def __init__(self, allow = (), deny = (), allow_domains = (), deny_domains = (), restrict_xpaths = (),
              tags = ('a', 'area'), attrs = ('href'), canonicalize = True, unique = True, process_value = None, check_url = True):
     #Add check_url parameter
     self.check_url = check_url
     
     SgmlLinkExtractor.__init__(self, allow = allow, deny = deny, allow_domains = allow_domains, deny_domains = deny_domains, restrict_xpaths = restrict_xpaths,
                                tags = tags, attrs = attrs, canonicalize = canonicalize, unique = unique, process_value = process_value)

Example #6

0

Show file

File: active_doctor_spider.py Project: lee670523/gitPython

    def parse_hospital_active_doctor(self, response):
        """ This function parses a sample response. Some contracts are mingled
        with this docstring.

        @url http://www.haodf.com/hospital/DE4roiYGYZwXhYmS30yF9V0wc/DE4rO-XCoLU0Jq1rbc1P6dS2aO/daifu.htm
        @returns items 14 14
        @returns requests 20 100
        @scrapes _name hospital specialty title reply2wCount
        """
        hxs = HtmlXPathSelector(response)

        city = response.meta['city']
        area = response.meta['area']
        print "$$$ current city: %s area: %s" % (city[0], area[0])

        #Sample
        #http://www.haodf.com/hospital/DE4roiYGYZwXhYmS30yF9V0wc/DE4rO-XCoLUE-578VWVmvC3uh7/daifu.htm

        linkExtractor = SgmlLinkExtractor(allow=(r"/hospital/\S+/\S+/daifu.htm",), unique=True)
        links = linkExtractor.extract_links(response)
        for link in links:
            request = Request(link.url, callback=self.parse_hospital_active_doctor)
            request.meta['city'] = response.meta['city']
            request.meta["area"] = response.meta['area']
            yield request

        hospital = hxs.select("/html/body/div[3]/div/a[3]/text()").extract()[0]
        print hospital
        specialty = hxs.select("//div[@class='subnav']/a/text()").re(r'(\S+)\s+(\S+)')[0]
        print specialty

        docLinks = hxs.select("//table[@id='doc_list_index']/tr[descendant::td[contains(@class, 'tda')]]")
        #docLinks = hxs.select("//table[@id='doc_list_index']/tr")

        for doc in docLinks:
            l = XPathItemLoader(ActiveDoctorItem(), doc)

            docNames = doc.select("./td[@class='tda']/li/a[contains(@href, 'http://www.haodf.com/doctor/')]/text()").extract()

            if len(docNames) != 0:
                print docNames[0]

            l.add_xpath('_name', "./td[@class='tda']/li/a[contains(@href, 'http://www.haodf.com/doctor/')]/text()")
            l.add_value('specialty', specialty)
            l.add_value('hospital', hospital)
            l.add_value('city', response.meta['city'])
            l.add_value('area', response.meta['area'])

            title = doc.select("./td[@class='tda']/li/text()").re('\S+')

            if len(title) == 1:
                l.add_value('title', title[0])

            l.add_xpath('count_ReplyInTwoWeeks', u"./td[@class='td_hf']/div[contains(text(), '近2周回复咨询')]/span/text()")
            l.add_xpath('count_ReplyTotal', u"./td[@class='td_hf']/div[contains(text(), '总共回复')]/span/text()")
            l.add_xpath('count_Calls', u"./td[@class='td_hf']/div[contains(text(), '已接听电话咨询')]/span/text()")
            ret = l.load_item()
            #print ret

            yield ret

Example #7

0

Show file

File: getThreads.py Project: vasusvodorosus/LDA-ermunds

 def parseL2(self, response):
     # forums - liks to lists and to threads
     s2 = SgmlLinkExtractor(restrict_xpaths=['//table[@class="forums-list"]/tr/td/a'])
     Links = s2.extract_links(response)
     for l in Links:
         yield Request(l.url, callback=self.parseL3)
     self.scrapeTheadURL(response)

Example #8

0

Show file

File: getThreads.py Project: vasusvodorosus/LDA-ermunds

 def parse(self, response):
     # title page
     hxs = HtmlXPathSelector(response)
     s1 = SgmlLinkExtractor(restrict_xpaths=['//a[@class="title"]'])
     Links = s1.extract_links(response)       
     for l in Links:
         yield Request(l.url, callback=self.parseL2)

Example #9

0

Show file

File: combination_spider.py Project: vasusvodorosus/LDA-ermunds

 def parseThread(self, response):
     print('inside a thread')
     hxs = HtmlXPathSelector(response)  
     filename = "xxx"+response.url.split("/")[-2][1:]
     with open(filename, 'a') as f:
         for entry in hxs.select('//div[contains(@class,"forums-thread")]'):
             msgID=     entry.select('span/@id').extract()[0]        
             msgDate=   entry.select('h4/text()').extract()[0].encode('ascii','ignore').replace('\n','')
             msgText=' '.join(entry.select('span/text()').extract()).encode('ascii','ignore').replace('\n','')
             try:
                 mgAuthor=  entry.select('h3/span/a/text()').extract()[0].encode('ascii','ignore').replace('\n','')
             except:
                 mgAuthor='none'
             try:
                 msgTitle=  entry.select('h3/strong/text()').extract()[0].encode('ascii','ignore').replace('\n','')                
             except:
                 msgTitle="none"
             f.write('msgID:'+msgID+'\n')
             f.write('msgTitle:'+msgTitle+'\n')
             f.write('mgAuthor:'+mgAuthor+'\n')
             f.write('msgDate:'+msgDate+'\n')
             f.write('msgText:'+msgText+'\n\n')
     s = SgmlLinkExtractor(restrict_xpaths=['//li[contains(@class, "next")]'])
     Links = s.extract_links(response)
     if len(Links) > 0:
         print 'going to the next page'
         yield Request(Links[0].url, callback=self.parseThread)

Example #10

0

Show file

File: test_contrib_linkextractors.py Project: 505555998/scrapy

    def test_attrs(self):
        lx = self.extractor_cls(attrs="href")
        self.assertEqual(lx.extract_links(self.response), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://www.google.com/something', text=u''),
            Link(url='http://example.com/innertag.html', text=u'inner tag'),
        ])

        lx = self.extractor_cls(attrs=("href","src"), tags=("a","area","img"), deny_extensions=())
        self.assertEqual(lx.extract_links(self.response), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample2.jpg', text=u''),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://www.google.com/something', text=u''),
            Link(url='http://example.com/innertag.html', text=u'inner tag'),
        ])

        lx = self.extractor_cls(attrs=None)
        self.assertEqual(lx.extract_links(self.response), [])

        html = """<html><area href="sample1.html"></area><a ref="sample2.html">sample text 2</a></html>"""
        response = HtmlResponse("http://example.com/index.html", body=html)
        lx = SgmlLinkExtractor(attrs=("href"))
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.com/sample1.html', text=u''),
        ])

Example #11

0

Show file

File: test_contrib_linkextractors.py Project: Quebec-Python/web-scraping-101

 def test_restrict_xpaths_concat_in_handle_data(self):
     """html entities cause SGMLParser to call handle_data hook twice"""
     body = """<html><body><div><a href="/foo">&gt;\xbe\xa9&lt;\xb6\xab</a></body></html>"""
     response = HtmlResponse("http://example.org", body=body, encoding='gb18030')
     lx = SgmlLinkExtractor(restrict_xpaths="//div")
     self.assertEqual(lx.extract_links(response),
                      [Link(url='http://example.org/foo', text=u'>\u4eac<\u4e1c',
                            fragment='', nofollow=False)])

Example #12

0

Show file

File: test_contrib_linkextractors.py Project: Quebec-Python/web-scraping-101

 def test_base_url_with_restrict_xpaths(self):
     html = """<html><head><title>Page title<title><base href="http://otherdomain.com/base/" />
     <body><p><a href="item/12.html">Item 12</a></p>
     </body></html>"""
     response = HtmlResponse("http://example.org/somepage/index.html", body=html)
     lx = SgmlLinkExtractor(restrict_xpaths="//p")
     self.assertEqual(lx.extract_links(response),
                      [Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')])

Example #13

0

Show file

File: q3_spider.py Project: yerihyo/cs6200f13

 def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(), 
              tags=('a', 'area'), attrs=('href'), canonicalize=True, unique=True, process_value=None,
              deny_extensions=None, seen_urls=[]):
     SgmlLinkExtractor.__init__(self,allow=allow, deny=deny, allow_domains=allow_domains, deny_domains=deny_domains, restrict_xpaths=restrict_xpaths, 
              tags=tags, attrs=attrs, canonicalize=canonicalize, unique=unique, process_value=process_value,
              deny_extensions=deny_extensions)
     
     for l in seen_urls: self.seen_urls[l]=True

Example #14

0

Show file

File: test_contrib_linkextractors.py Project: serkanh/scrapy

 def test_restrict_xpaths(self):
     lx = SgmlLinkExtractor(restrict_xpaths=('//div[@id="subwrapper"]',))
     self.assertEqual(
         [link for link in lx.extract_links(self.response)],
         [
             Link(url="http://example.com/sample1.html", text=u""),
             Link(url="http://example.com/sample2.html", text=u"sample 2"),
         ],
     )

Example #15

0

Show file

File: getThreads.py Project: vasusvodorosus/LDA-ermunds

 def parseL3(self, response):
     # like model specific
     self.scrapeTheadURL(response)
     
     # multipage
     s = SgmlLinkExtractor(restrict_xpaths=['//li[contains(@class, "next")]'])
     Links = s.extract_links(response)
     if len(Links) > 0:
         yield Request(Links[0].url, callback=self.parseL3)

Example #16

0

Show file

File: spider.py Project: chappyhome/qidian

    def extract_links(self, response, **extra):  # {{{
        """ Extract links from response
        extra - passed to SgmlLinkExtractor
        """

        link_extractor = SgmlLinkExtractor(**extra)
        links = link_extractor.extract_links(response)

        return links

Example #17

0

Show file

File: linkextractors.py Project: shahin/hippolyte

    def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(), 
                 tags=('a', 'area'), attrs=('href'), canonicalize=True, unique=True, process_value=None,
                 ignore_set=set()):

        self.ignore_set = ignore_set

        SgmlLinkExtractor.__init__(self, allow=allow, deny=deny,
                allow_domains=allow_domains, deny_domains=deny_domains,
                restrict_xpaths=restrict_xpaths, tags=tags, attrs=attrs,
                canonicalize=canonicalize, unique=unique, process_value=process_value)

Example #18

0

Show file

File: test_contrib_linkextractors.py Project: Abioy/scrapy

 def test_link_nofollow(self):
     html = """
     <a href="page.html?action=print" rel="nofollow">Printer-friendly page</a>
     <a href="about.html">About us</a>
     """
     response = HtmlResponse("http://example.org/page.html", body=html)
     lx = SgmlLinkExtractor()
     self.assertEqual([link for link in lx.extract_links(response)],
         [ Link(url='http://example.org/page.html?action=print', text=u'Printer-friendly page', nofollow=True),
           Link(url='http://example.org/about.html', text=u'About us', nofollow=False) ])

Example #19

0

Show file

File: dmoz_spider.py Project: jduro/esoteric

    def parse(self, response):
        print "IN PARSE!"
        # inspect_response(response,self)

        links=SgmlLinkExtractor(
                allow=('https://www.coursera.org/course/\w+'),
            )
        print "TAMANHO:",len(links.extract_links(response))
        for link in links.extract_links(response):
            # print link
            yield Request(link.url,callback=self.parse_item)

Example #20

0

Show file

File: SauroDownSplider.py Project: RaymonSHan/Sauro

 def parse(self, response):  # changed to parse to crawl all home page
     lx = SgmlLinkExtractor()
     urls = lx.extract_links(response)
     noworder = 0
     for oneurl in urls:
         noworder += 1
         yield scrapy.Request(
             oneurl.url,
             callback=lambda response, crawllevel=1, order=noworder, loopstr="": self.parse_text(
                 response, crawllevel, order, loopstr
             ),
         )

Example #21

0

Show file

File: SauroDownSplider.py Project: RaymonSHan/Sauro

 def parse_testfile(self, response):
     lx = SgmlLinkExtractor()
     urls = lx.extract_links(response)
     readed = 0
     notreaded = 0
     for oneurl in urls:
         handle = OpenMD5File(oneurl.url, "rb")
         if handle == False:
             notreaded += 1
         else:
             readed += 1
             handle.close()
     print readed, notreaded

Example #22

0

Show file

File: abnova.py Project: fanshaorui/phdgogo

 def parse_start_url(self, response):
     if not hasattr(response, 'encoding'):
         setattr(response, 'encoding', 'text/html;charset=UTF-8')
     target_le = SgmlLinkExtractor(
         allow=r'/cn/products/products_detail.asp\?Catalog_id=\w+')
     links = target_le.extract_links(response)
     if links:
         return [Request(url=link.url, cookies=self.forged_cookie, callback=self.parse_item) 
             for link in links]
     else:
         general_le = SgmlLinkExtractor(
                     allow=())
         return [Request(url=link.url, cookies=self.forged_cookie)
                 for link in general_le.extract_links(response)]

Example #23

0

Show file

File: test_contrib_linkextractors.py Project: Quebec-Python/web-scraping-101

    def test_extraction_using_single_values(self):
        '''Test the extractor's behaviour among different situations'''

        lx = SgmlLinkExtractor(allow='sample')
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        ])

        lx = SgmlLinkExtractor(allow='sample', deny='3')
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
        ])

        lx = SgmlLinkExtractor(allow_domains='google.com')
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://www.google.com/something', text=u''),
        ])

        lx = SgmlLinkExtractor(deny_domains='example.com')
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://www.google.com/something', text=u''),
        ])

Example #24

0

Show file

File: test_contrib_linkextractors.py Project: serkanh/scrapy

    def test_extraction_using_single_values(self):
        """Test the extractor's behaviour among different situations"""

        lx = SgmlLinkExtractor(allow="sample")
        self.assertEqual(
            [link for link in lx.extract_links(self.response)],
            [
                Link(url="http://example.com/sample1.html", text=u""),
                Link(url="http://example.com/sample2.html", text=u"sample 2"),
                Link(url="http://example.com/sample3.html", text=u"sample 3 text"),
            ],
        )

        lx = SgmlLinkExtractor(allow="sample", deny="3")
        self.assertEqual(
            [link for link in lx.extract_links(self.response)],
            [
                Link(url="http://example.com/sample1.html", text=u""),
                Link(url="http://example.com/sample2.html", text=u"sample 2"),
            ],
        )

        lx = SgmlLinkExtractor(allow_domains="google.com")
        self.assertEqual(
            [link for link in lx.extract_links(self.response)], [Link(url="http://www.google.com/something", text=u"")]
        )

        lx = SgmlLinkExtractor(deny_domains="example.com")
        self.assertEqual(
            [link for link in lx.extract_links(self.response)], [Link(url="http://www.google.com/something", text=u"")]
        )

Example #25

0

Show file

File: processThreads.py Project: vasusvodorosus/LDA-ermunds

 def parse(self, response):
     print('inside a thread')
     hxs = HtmlXPathSelector(response)  
     filename_ =    response.url.split("/")[-2][1:]
     filename=      os.path.abspath(databasePath+ "\data\%s" % filename_)
     dumpFilePath = os.path.abspath(databasePath+ "\dump\%s" % filename_)
     try:
         a = response.meta['page']
     except KeyError:
         a=0
         os.mkdir(dumpFilePath)
         with open(filename, 'a') as f:
             #header
             forumTitle=hxs.select('//div[@class="module forums"]/h2/text()').extract()[0].encode('ascii','ignore').replace('\n','')
             extraInfo=hxs.select('//div[@class="module forums discussion tid"]/h4/text()').extract()[0].encode('ascii','ignore').replace('\n','')
             f.write("title:"+forumTitle+"\n")
             f.write("extraInfo:"+extraInfo+"\n")
             f.write(response.url+"\n")
             f.write(filename+"\n")
             f.write(dumpFilePath+"\n\n")
             
     with open(dumpFilePath+ "\\" +str(a)+'.html', 'a') as fd:
         fd.write(response.body)
         
     with open(filename, 'a') as f:
         for entry in hxs.select('//div[contains(@class,"forums-thread")]'):
             msgID=     entry.select('span/@id').extract()[0]        
             msgDate=   entry.select('h4/text()').extract()[0].encode('ascii','ignore').replace('\n','')
             msgText=' '.join(entry.select('span/text()').extract()).encode('ascii','ignore').replace('\n','')
             try:
                 mgAuthor=  entry.select('h3/span/a/text()').extract()[0].encode('ascii','ignore').replace('\n','')
             except:
                 mgAuthor='none'
             try:
                 msgTitle=  entry.select('h3/strong/text()').extract()[0].encode('ascii','ignore').replace('\n','')                
             except:
                 msgTitle="none"
             f.write('msgID:'+msgID+'\n')
             f.write('msgTitle:'+msgTitle+'\n')
             f.write('mgAuthor:'+mgAuthor+'\n')
             f.write('msgDate:'+msgDate+'\n')
             f.write('msgText:'+msgText+'\n\n')
     s = SgmlLinkExtractor(restrict_xpaths=['//li[contains(@class, "next")]'])
     Links = s.extract_links(response)
     if len(Links) > 0:
         print 'going to the next page'
         r = Request(googc+Links[0].url, callback=self.parse)
         r.meta['page']=a+1;
         yield r

Example #26

0

Show file

File: test_contrib_linkextractors.py Project: serkanh/scrapy

    def test_restrict_xpaths_encoding(self):
        """Test restrict_xpaths with encodings"""
        html = """<html><head><title>Page title<title>
        <body><p><a href="item/12.html">Item 12</a></p>
        <div class='links'>
        <p><a href="/about.html">About us\xa3</a></p>
        </div>
        <div>
        <p><a href="/nofollow.html">This shouldn't be followed</a></p>
        </div>
        </body></html>"""
        response = HtmlResponse("http://example.org/somepage/index.html", body=html, encoding="windows-1252")

        lx = SgmlLinkExtractor(restrict_xpaths="//div[@class='links']")
        self.assertEqual(lx.extract_links(response), [Link(url="http://example.org/about.html", text=u"About us\xa3")])

Example #27

0

Show file

File: test_contrib_linkextractors.py Project: serkanh/scrapy

    def test_process_value(self):
        """Test restrict_xpaths with encodings"""
        html = """
        <a href="javascript:goToPage('../other/page.html','photo','width=600,height=540,scrollbars'); return false">Link text</a>
        <a href="/about.html">About us</a>
        """
        response = HtmlResponse("http://example.org/somepage/index.html", body=html, encoding="windows-1252")

        def process_value(value):
            m = re.search("javascript:goToPage\('(.*?)'", value)
            if m:
                return m.group(1)

        lx = SgmlLinkExtractor(process_value=process_value)
        self.assertEqual(lx.extract_links(response), [Link(url="http://example.org/other/page.html", text="Link text")])

Example #28

0

Show file

File: evitamins.py Project: zahinazher/scrapyprojects

  def parse_brands(self,response):

    lx = SgmlLinkExtractor(restrict_xpaths=('//td[@valign="top"]'), allow=('\S+\.com'), unique=True)
    links = lx.extract_links(response)
    brands_all = set(sorted(link.text for link in links))

    self.log(u'Extracted {} brands.'.format(len(brands_all)), scrapy.log.DEBUG)

    """Traverse through all the pages to get all products"""
    """brands_alphabets = ['A','B','C','D','E','F','G','H','I',
                        'J','K','L','M','N','O','P','Q','R',
                        'S','T','U','V','W','X','Y','Z']"""
    brands_alphabets = ['A']
    for alpha in brands_alphabets:
      yield Request(self.url_view_items + str(alpha), callback=self.items_list)

Example #29

0

Show file

    def crawl_all(self, response):
        print 'Crawling all...'

        # Get list of decks
        self.deck_links = SgmlLinkExtractor(allow = r'/sets/\d+').extract_links(response)

        return self.parse_deck_links(None)

Example #30

0

Show file

File: itunes_spider.py Project: arthurnn/podcast

 def parsePage(self, response):
     hxs = HtmlXPathSelector(response)
     
     item = response.meta['item']
     emails = collectAllEmail(hxs.extract())
     if len(emails) > 0:
         item['email'] = emails[0]
         yield item
        
     extractor = SgmlLinkExtractor(allow_domains=response.url)
     
     for entry in extractor.extract_links(response):
         if entry.url is not None:
             req = Request(entry.url, callback=self.parsePage)
             req.meta['item'] = item
             yield req

Example #31

0

Show file

class HideMyAssSpider(CrawlSpider):
    name = 'hidemyass'
    start_urls = ['http://hidemyass.com/proxy-list/']
    allowed_domains = ['hidemyass.com']

    rules = (Rule(SgmlLinkExtractor(restrict_xpaths=(
        '//div[@id="container"]//div[@id="pagination"]/ul/div/li[@class="nextpageactive"]/a'
    )),
                  callback='parse',
                  follow=True), )

    def parse(self, response):
        self.log('No item received for %s' % response.url)

        for elem in super(HideMyAssSpider, self).parse(response):
            yield elem

        hxs = HtmlXPathSelector(response)
        links = hxs.select('//tr[@class="altshade"]')

        for link in links:
            ipaddress_parts = link.select('td[2]/span')

            style_text = ipaddress_parts.select('style/text()').extract()
            style_text = style_text[0].split('\n')
            display_none = [
                style[1:style.index('{')] for style in style_text
                if 'none' in style
            ]
            display_inline = [
                style[1:style.index('{')] for style in style_text
                if 'inline' in style
            ]
            display_none = set(display_none)
            display_inline = set(display_inline)

            ipaddress = []

            for ipaddress_part in ipaddress_parts.select('span|div|text()'):
                tag_class = tag_style = tag_name = None
                try:
                    tag_class = ipaddress_part.select('@class').extract()
                except TypeError:
                    # Workaround bug in lxml.etree: Argument 'element' has incorrect type (expected lxml.etree._Element, got _ElementStringResult)
                    pass

                try:
                    tag_style = ipaddress_part.select('@style').extract()
                except TypeError:
                    # Workaround bug in lxml.etree: Argument 'element' has incorrect type (expected lxml.etree._Element, got _ElementStringResult)
                    pass

                try:
                    tag_name = ipaddress_part.select("name()")
                except TypeError:
                    # Workaround bug in lxml.etree: Argument 'element' has incorrect type (expected lxml.etree._Element, got _ElementStringResult)
                    pass

                if tag_name:
                    tag_text = ipaddress_part.select('text()').extract()
                else:
                    tag_text = ipaddress_part.extract()

                if tag_style and 'none' in tag_style[0]:
                    continue
                if tag_class and tag_class[0] in display_none:
                    continue

                if isinstance(tag_text, list):
                    tag_text = ''.join(tag_text)

                tag_texts = tag_text.split('.')
                for tag_text in tag_texts:
                    tag_text = tag_text.strip()
                    if not tag_text.isdigit():
                        continue
                    ipaddress.append(tag_text)

            ipaddress = '.'.join(ipaddress)

            loader = WebsiteLoader(selector=link)
            loader.add_value('ipaddress', ipaddress)
            loader.add_xpath('port', 'td[3]/text()')
            loader.add_xpath('country', 'td[4]/span/text()')
            loader.add_xpath('_type', 'td[7]/text()')
            loader.add_xpath('anonimity', 'td[8]/text()')
            loader.add_value('url', response.url)

            item = loader.load_item()

            yield item

Example #32

0

Show file

File: kompas_crawler.py Project: wiwiek-ci/ina-news-crawler

class KompasCrawler(NewsBaseCrawler):
    # Identifier
    name = 'kompas'
    source = 'kompas.com'

    # Debug
    debug = False

    # Rules
    allowed_domains = ['kompas.com']
    start_urls = [
        'http://www1.kompas.com/newsindex/secidx/1/nasional/',
    ]
    rules = (Rule(SgmlLinkExtractor(allow=('/read/', ), unique=True),
                  follow=True,
                  callback='parse_item'), )

    # XPath
    xpath_title = '//div[@class="judul_artikel2011"]'
    xpath_subtitle = '//div[@class="font11 c_orange_kompas2011 pb_5 pt_5"]'
    xpath_category = '//div[@class="menu_kompas"]/ul/li/a[@class="selected"]'
    xpath_author = '//none'
    xpath_published_at = '(//div[@class="font11 c_abu03_kompas2011 pb_3"]/span[@class="c_abu01_kompas2011"])[last()]'
    xpath_place = '//div[@class="isi_berita2011 pt_5"]/p/strong'
    xpath_content = '//p'

    # Overriden methods
    def parse_date(self, date_str):
        split_str = date_str.split(' ')
        year = split_str[3]

        if split_str[2] == 'Januari':
            month = '01'
        elif split_str[2] == 'Pebruari' or split_str[2] == 'Februari':
            month = '02'
        elif split_str[2] == 'Maret':
            month = '03'
        elif split_str[2] == 'April':
            month = '04'
        elif split_str[2] == 'Mei':
            month = '05'
        elif split_str[2] == 'Juni':
            month = '06'
        elif split_str[2] == 'Juli':
            month = '07'
        elif split_str[2] == 'Agustus':
            month = '08'
        elif split_str[2] == 'September':
            month = '09'
        elif split_str[2] == 'Oktober':
            month = '10'
        elif split_str[2] == 'November' or split_str[2] == 'Nopember':
            month = '11'
        elif split_str[2] == 'Desember':
            month = '12'
        else:
            month = '01'

        if split_str[1] < 10:
            day = split_str[1]
        else:
            day = "0" + split_str[1]

        time = split_str[5]
        return "%s-%s-%s %s:00" % (year, month, day, time)

    def parse_place(self, place_str):
        split_str = place_str.split(',')
        return split_str[0]

    def normalize_category(self, category_str):
        if category_str.lower() in ('nasional', 'regional', 'megapolitan'):
            return self.CATEGORY_NATIONAL
        elif category_str.lower() in ('internasional'):
            return self.CATEGORY_INTERNATIONAL
        elif category_str.lower() in ('bisniskeuangan'):
            return self.CATEGORY_ECONOMY
        elif category_str.lower() in ('olahraga'):
            return self.CATEGORY_SPORTS
        elif category_str.lower() in ('sains'):
            return self.CATEGORY_SCITECH
        elif category_str.lower() in ('travel', 'oase', 'edukasi'):
            return self.CATEGORY_HUMANIORA
        else:
            return self.CATEGORY_OTHERS

Example #33

0

Show file

File: example.py Project: jaeindia/Vulnerability-Scanner

class ExampleSpider(CrawlSpider):
    name = 'example.com'
    start_urls = ['https://app1.com/users/home.php']
    item_urls = []
    
   # urlpool = []

    # 'log' and 'pwd' are names of the username and password fields
    # depends on each website, you'll have to change those fields properly
    # one may use loginform lib https://github.com/scrapy/loginform to make it easier
    # when handling multiple credentials from multiple sites. 

    rules = (
        Rule(SgmlLinkExtractor(allow=r'-\w+.html$'),
             callback='parse_page', follow=True),
    )

    def init_request(self):
        return [Request(url='https://app1.com/users/home.php', callback=self.login)]

    def login(self, response):
        print "hell"
        return FormRequest.from_response(response, formdata={'username': '******', 'password': '******'}, callback=self.after_login)


    def after_login(self, response):
        # check login succeed before going on
        if "ERROR: Invalid username" in response.body:
            self.log("Login failed", level=log.ERROR)
            return

        # continue scraping with authenticated session...
        else:
            self.log("Login succeed!", level=log.DEBUG)
            print "Logging in"
            print  "asdsdsd  " + response.url
            return Request(url='https://app1.com/users/home.php',
                           callback=self.parse_page)
            self.initialized()


    # example of crawling all other urls in the site with the same
    # authenticated session.
    def isURLinPool(self, url):

        for t  in self.item_urls:
            if (t.find("?") != -1):
                t = t[:t.find("?")]
            if (url.find("?") != -1):
                url = url[:url.find("?")]    
            if url.lower() == t.lower():
                return False
        return True

    def parse_page(self, response):
        """ Scrape useful stuff from page, and spawn new requests
        """
        hxs = HtmlXPathSelector(response)
        # i = CrawlerItem()
        # find all the link in the <a href> tag
        input_box =  hxs.select('//input/@name').extract()
        print "Scraping the URL " + response.url
        for inputs in input_box:
            print "The input boxes with name " + inputs

        links = hxs.select('//a/@href').extract()
        input_box =  hxs.select('//input/@src').extract()
        
       # print "Scraping the URL " + response.url
        for inputs in input_box:
            print "The input boxes with src " + inputs
        print "\n"

        # Yield a new request for each link we found
        # #this may lead to infinite crawling...
        ur1 = ""
        for link in links:
            url  = "https://app1.com"+link
            if(link.find(":") != -1):
               continue 
            if(self.isURLinPool(url)):
               print "THIS IS A LINK " + link
#              yield Request(url= "https://app1.com"+link, callback=self.parse_page)
               

            #only process external/full link
               link = url
               ur1 = link
               if link.find("http") > -1:
                  print "Before Sending it for parse " + link
                  yield Request(url=link, callback=self.parse_page)
        self.item_urls.append(response.url)        
        item = LinkItem()
        item["title"] = hxs.select('//title/text()').extract()[0]
        item["url"] = response.url
    #    self.item_urls["title"] = hxs.select('//title/text()').extract()[0]
     #   self.item_urls["url"] = response.url
        yield self.collect_item(item)

    def collect_item(self, item):
        return item

Example #34

0

Show file

class ApprenticeSpider(CrawlSpider):
    name = 'apprentice'
    start_urls = START_URLS

    rules = (
        Rule(SgmlLinkExtractor(unique=True),
             callback='parse_item',
             follow=False),  #should it be unique 
    )
    crawled_urls = []
    print 'Loading critic...'
    print "path", os.path.join(DIRNAME, '../../../../data/classifier.pickle')
    supervisor = read_classifier(
        os.path.join(DIRNAME, '../../../../data/classifier.pickle'))
    print 'Critic loaded...'

    apprentice = NBClassifier()

    def parse_item(self, response):
        """
        crawling the webpage and extracts the url.
        Once the crawling is done, evaluate the page content and enter this function again to train the apprentice
        """
        ApprenticeSpider.crawled_urls.append(response.url)
        # if response.meta.has_key ('train_flag') and response.meta ['train_flag']: #entering the train mode
        # print "training the apprentice"
        #html to words
        words = html2words(response.body)

        probs = ApprenticeSpider.supervisor.predict(words)

        interestness = probs['pos']

        #use the score to train the apprentice using the surrouding (word, offset) pairs
        # print "word_offset_pairs = ", response.meta ['word_offset_pairs']
        # print "interestness of %s = %f" %(response.url, interestness)
        if response.meta.has_key('word_offset_pairs'):  #
            ApprenticeSpider.apprentice.train([
                (response.meta['word_offset_pairs'],
                 interestness > 0.5 and "pos" or "neg")
            ])

        item = UrlItem()
        item['url'] = response.url
        item['interestness'] = interestness
        yield item

        # else:
        # print "fetching the urls"
        url_infos = scrape_url_and_words(response.body, response.url, level=3)

        for url_info in url_infos:
            url, word_offset_pairs = url_info
            if url in ApprenticeSpider.crawled_urls:  #already crawled, skip it
                continue

            prediction = ApprenticeSpider.apprentice.predict(word_offset_pairs)
            if prediction.has_key('pos'):
                potential_interestness = prediction[
                    'pos']  #get the potential interest of the url
            else:
                potential_interestness = 0  #neg is 1

            # print "pi of %s is %f" %(potential_interestness, potential_interestness)

            priority = int(
                potential_interestness * 10**3
            )  #converting the priority to int to accord with the PriorityQueue spec

            req = Request(
                url,
                priority=
                priority,  #after the request is done, run parse_item to train the apprentice
                callback=self.parse_item
            )  #this line is extremely important, or items harvest rate drops drastically

            req.meta[
                'word_offset_pairs'] = word_offset_pairs  #passing additional data to request
            # req.meta ['train_flag'] = True #we only do training, nothing else
            yield req

Example #35

0

Show file

File: test_contrib_linkextractors.py Project: reprior123/TraderSoftwareRP

    def test_matches(self):
        url1 = 'http://lotsofstuff.com/stuff1/index'
        url2 = 'http://evenmorestuff.com/uglystuff/index'

        lx = SgmlLinkExtractor(allow=(r'stuff1', ))
        self.assertEqual(lx.matches(url1), True)
        self.assertEqual(lx.matches(url2), False)

        lx = SgmlLinkExtractor(deny=(r'uglystuff', ))
        self.assertEqual(lx.matches(url1), True)
        self.assertEqual(lx.matches(url2), False)

        lx = SgmlLinkExtractor(allow_domains=('evenmorestuff.com', ))
        self.assertEqual(lx.matches(url1), False)
        self.assertEqual(lx.matches(url2), True)

        lx = SgmlLinkExtractor(deny_domains=('lotsofstuff.com', ))
        self.assertEqual(lx.matches(url1), False)
        self.assertEqual(lx.matches(url2), True)

        lx = SgmlLinkExtractor(allow=('blah1',), deny=('blah2',),
                               allow_domains=('blah1.com',),
                               deny_domains=('blah2.com',))
        self.assertEqual(lx.matches('http://blah1.com/blah1'), True)
        self.assertEqual(lx.matches('http://blah1.com/blah2'), False)
        self.assertEqual(lx.matches('http://blah2.com/blah1'), False)
        self.assertEqual(lx.matches('http://blah2.com/blah2'), False)

Example #36

0

Show file

class ItunesSpider(CrawlSpider):
    name = 'itunes'
    allowed_domains = ['apple.com']
    start_urls = ['https://itunes.apple.com/us/genre/ios-games/id6014?mt=8']

    rules = (
        #Rule(SgmlLinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
        Rule(SgmlLinkExtractor(allow=r'us/app/.+'),
             callback='parse_item',
             follow=True), )

    xmlstring = '<?xml version="1.0" encoding="UTF-8"?><root>'
    xmlTemplate = """
        <item>
          <image>%(image)s</image>
          <stars>%(stars)s</stars>
          <title>%(title)s</title>
          <category>%(category)s</category>
          <desc>%(desc)s</desc>
          <link>%(link)s</link>
        </item>
        """

    def parse_item(self, response):
        image = stars = title = category = desc = link = ''
        hxs = HtmlXPathSelector(response)
        image = hxs.select(
            "//div[@id='left-stack']/div/a/div/img/@src").extract()[0]
        title = hxs.select("//div[@id='title']/div/h1/text()").extract()[0]
        category = hxs.select("//li[@class='genre']/a/text()").extract()[0]
        desc = hxs.select("//div[@class='product-review']/p").extract()[0]
        data = {
            'image': image,
            'stars': stars,
            'title': title,
            'category': category,
            'desc': desc,
            'link': link
        }
        self.xmlstring += self.xmlTemplate % data

        # http://www.suchkultur.de/blog/suchmaschinen/crawler/web-scraping-mit-dem-scrapy-framework/
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select("//div[@id='selectedcontent']/div/ul/li")
        i = 0
        #image = stars = title = category = desc = link = ''
        for site in sites:
            i += 1
            #image = site.select("div/div/div/a[contains(@class,'thumbnail')]/img/@src").extract()[0]
            #stars = ''#site.select("div/div/div/div[contains(@class,'ratings')]/@title").extract()
            #title = site.select("div/div/div/a[contains(@class,'title')]/text()").extract()[0]
            #category = site.select("div/div/span[contains(@class,'attribution')]/div/a/text()").extract()[0]
            #desc = site.select("div/div/p[contains(@class,'snippet-content')]/text()").extract()[0]
            link = site.select("a/@href").extract()[0]
            if i == 1:
                yield Request(link, callback=self.parse_item)
        # print image,stars,title,category,desc,link,'\n'
        #print link
        # print "\n"
        #data = {'image':image,'stars':stars,'title':title,'category':category,'desc':desc,'link':link}
        #self.xmlstring += self.xmlTemplate%data

        self.xmlstring += "<size>" + str(i) + "</size></root>"
        filename = 'itunes.xml'
        f = open(filename, 'w')
        f.write(self.xmlstring)

Example #37

0

Show file

File: test_contrib_linkextractors.py Project: reprior123/TraderSoftwareRP

    def test_extraction(self):
        '''Test the extractor's behaviour among different situations'''

        lx = SgmlLinkExtractor()
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://www.google.com/something', text=u''),
        ])

        lx = SgmlLinkExtractor(allow=('sample', ))
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        ])

        lx = SgmlLinkExtractor(allow=('sample', ), unique=False)
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 repetition'),
        ])

        lx = SgmlLinkExtractor(allow=('sample', ))
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        ])

        lx = SgmlLinkExtractor(allow=('sample', ), deny=('3', ))
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
        ])

        lx = SgmlLinkExtractor(allow_domains=('google.com', ))
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://www.google.com/something', text=u''),
        ])

Example #38

0

Show file

class AvitoSpider(CrawlSpider):
    ORIGIN_ID = 1
    MAX_PAGE = 20 
    _last_page = {}
    name = 'avito'
    allowed_domains = ['avito.ru']
    custom_settings = {'DOWNLOAD_DELAY': 3}
    localities = {
                    'gelendzhik':LocalityMapper.GELENDZHIK,
                    'anapa': LocalityMapper.ANAPA,
                    'novorossiysk':LocalityMapper.NOVOROSSIYSK,
                    'temryuk':LocalityMapper.TEMRYUK,
                    'abrau-dyurso': LocalityMapper.ABRAUDYURSO,
                    'anapskaya': LocalityMapper.ANAPSKAYA,
                    'arhipo-osipovka': LocalityMapper.ARHIPOOSIPOVKA,
                    'ahtanizovskaya': LocalityMapper.AHTANIZOVSKAYA,
                    'verhnebakanskiy': LocalityMapper.VERHNEBAKANSKIY,
                    'vinogradnyy': LocalityMapper.VINOGRADNYY,
                    'vityazevo': LocalityMapper.VITYAZEVO,
                    'vyshesteblievskaya': LocalityMapper.VYSHESTEBLIEVSKAYA,
                    'gayduk': LocalityMapper.GAYDUK,
                    'glebovka': LocalityMapper.GLEBOVSKOE,
                    'golubitskaya': LocalityMapper.GOLUBITSKAYA,
                    'gostagaevskaya': LocalityMapper.GOSTAGAEVSKAYA,
                    'kurchanskaya': LocalityMapper.KURCHANSKAYA,
                    'kabardinka': LocalityMapper.KABARDINKA,
                    'divnomorskoe': LocalityMapper.DIVNOMORSKOE,
                    'dzhiginka': LocalityMapper.DZHIGINKA,
                    'myshako': LocalityMapper.MYSHAKO,
                    'natuhaevskaya': LocalityMapper.NATUHAEVSKAYA,
                    'raevskaya': LocalityMapper.RAEVSKAYA,
                    'yurovka': LocalityMapper.YUROVKA,
                    'tsibanobalka': LocalityMapper.TSYBANOBALKA,
                    'taman': LocalityMapper.TAMAN,
                    'supseh': LocalityMapper.SUPSEH,
                    'krasnodarskiy_kray_strelka': LocalityMapper.STRELKA,
                    'starotitarovskaya': LocalityMapper.STAROTITAROVSKAYA,
                    'sennoy': LocalityMapper.SENNOY,
                  }  
    rules = (
        Rule(SgmlLinkExtractor(restrict_xpaths=('//div[@class="pagination__nav clearfix"]/a',)), follow=True, process_request='process_request_filter', callback='process_response_filter'),
        Rule (SgmlLinkExtractor(restrict_xpaths=('//a[@class="description-title-link"]',), process_value=process_value), callback='parse_item'),
    )   
       
    def start_requests(self):
        template = "https://www.avito.ru/%s/%s/prodam?user=1&view=list"
        com_template = "https://www.avito.ru/%s/kommercheskaya_nedvizhimost/prodam/%s/za_vse?user=1&view=list"
        urls = []
        
        types = ['kvartiry', 'komnaty', 'doma_dachi_kottedzhi', 'zemelnye_uchastki', 'garazhi_i_mashinomesta', ]
        com_types = ['magazin', 'gostinicy', 'drugoe', 'proizvodstvo', 'sklad', 'ofis']
        for l in self.localities.iterkeys():            
            for t in types:
                urls.append(template % (l, t))                
            for com_type in com_types:
                urls.append(com_template % (l, com_type)) 
        for url in urls:            
            yield Request(url, self.parse)
    
    def parse_item(self, response):
        item = AvitoItem()
        fields_parser = AvitoFleldsParser(Selector(response), url=response.url, data={'localities': self.localities})
        fields_parser.populate_item(item)        
        item.print_item()
        return item

    def process_response_filter(self, response):
        print response.url
        dates = Selector(response).xpath('//span[@class="date"]/text()')
        for date in dates:
            txt = date.extract()
            key = ur'вчера|сегодня'
            matches = re.search(key, txt, re.I | re.U)
            if not matches:
                page_num = self.get_page_num(response.url)
                if page_num:
                    self.set_last_page(response.url, int(page_num))                               
        return []        
    
    def set_last_page(self, url, value):
        path = urlparse(url).path
        self._last_page[path] = value
        
    def get_last_page(self, url):
        path = urlparse(url).path
        return self._last_page.get(path, self.MAX_PAGE) 

    def get_page_num(self, url):        
            qs = parse_qs(urlparse(url).query)
            if 'p' in qs:
                return int(qs['p'][0])
            return 0
        
    def process_request_filter(self, request):              
        if self.get_page_num(request.url) > self.get_last_page(request.url):            
            return None
        return request

Example #39

0

Show file

class RingspanncorpSpider(CrawlSpider):
    name = "ringspanncorp"
    allowed_domains = ["ringspanncorp.com"]
    items = []

    start_urls = ['http://www.ringspanncorp.com/en/products/overview']

    rules = (
        Rule(
            SgmlLinkExtractor(),
            callback='parse_lol',
            follow=True
        ),
    )

    def parse_lol(self, response):
        if response.xpath('//div[@id="product-view"]'):
            return self.parse_product(response)

    def parse_product(self, response):
        hxs = HtmlXPathSelector(response)

        data = pandas.read_csv("mro/spiders/csv_data/Ringspanncorp/ringspanncorp.csv", sep=',')
        catalog = list(data.catalog_number)
        ids = list(data.id)
        description = list(data.description)
        key1 = list(data.key1)
        key2 = list(data.key2)

        catalog_key1 = dict(zip(catalog, key1))
        catalog_key2 = dict(zip(catalog, key2))
        catalog_description = dict(zip(catalog, description))
        catalog_id = dict(zip(catalog, ids))

        key1_ids = dict(zip(key1, ids))
        key2_ids = dict(zip(key2, ids))
        key1_catalog = dict(zip(key1, catalog))
        key2_catalog = dict(zip(key2, catalog))
        key1_description = dict(zip(key1, description))
        key2_description = dict(zip(key2, description))

        for catalog_n in catalog:
            key = catalog_key1[catalog_n]
            name = ' ' + str(key)
            if name in response.xpath('//h1').extract_first():
                if catalog_n not in self.items:
                    item = UniversalItem()
                    item['ids'] = catalog_id[catalog_n]
                    item['catalog_number'] = catalog_n
                    key_digits = catalog_key1[catalog_n].re('(\d+)')
                    self.items.append(catalog_n)
                    url = response.xpath('//a[@class="cad link_grey"]/@href')
                    yield Request(url=url, meta={'item': item, 'key': key_digits}, callback='cad_page')

        for catalog_n in catalog:
            key = catalog_key2[catalog_n]
            name = ' ' + str(key)
            name2 = str(key) + ' '
            if name in response.xpath('//h1').extract_first() or name2 in response.xpath('//h1').extract_first():
                if catalog_n not in self.items:
                    item = UniversalItem()
                    item['ids'] = catalog_id[catalog_n]
                    item['catalog_number'] = catalog_n
                    key_digits = catalog_key1[catalog_n].re('(\d+)')
                    self.items.append(catalog_n)
                    url = response.xpath('//a[@class="cad link_grey"]/@href')
                    yield Request(url=url, meta={'item': item, 'key': key_digits}, callback='cad_page')

    def cad_page(self, response):
        item = response.meta['item']
        key_digits = response.meta['key_digits']
        data = pandas.read_csv("mro/spiders/csv_data/Ringspanncorp/ringspanncorp.csv", sep=',')
        catalog = list(data.catalog_number)
        ids = list(data.id)
        description = list(data.description)
        key1 = list(data.key1)
        key2 = list(data.key2)

        catalog_key1 = dict(zip(catalog, key1))
        catalog_key2 = dict(zip(catalog, key2))
        catalog_description = dict(zip(catalog, description))
        catalog_id = dict(zip(catalog, ids))

        key1_ids = dict(zip(key1, ids))
        key2_ids = dict(zip(key2, ids))
        key1_catalog = dict(zip(key1, catalog))
        key2_catalog = dict(zip(key2, catalog))
        key1_description = dict(zip(key1, description))
        key2_description = dict(zip(key2, description))

        for catalog_n in catalog:
            key = catalog_key1[catalog_n]
            name = ' ' + str(key)
            if name in response.xpath('//h1').extract_first():
                if catalog_n not in self.items:
                    item = UniversalItem()
                    item['ids'] = catalog_id[catalog_n]
                    item['catalog_number'] = catalog_n
                    self.items.append(catalog_n)
                    url = response.xpath('//a[@class="cad link_grey"]/@href')
                    yield Request(url=url, callback='request_cad')

        for catalog_n in catalog:
            key = catalog_key2[catalog_n]
            name = ' ' + str(key)
            name2 = str(key) + ' '
            if name in response.xpath('//h1').extract_first() or name2 in response.xpath('//h1').extract_first():
                if catalog_n not in self.items:
                    item = UniversalItem()
                    item['ids'] = catalog_id[catalog_n]
                    item['catalog_number'] = catalog_n
                    self.items.append(catalog_n)
                    url = response.xpath('//a[@class="cad link_grey"]/@href')
                    yield Request(url=url, callback='request_cad')

Example #40

0

Show file

File: wiki_spider.py Project: pug/pug

class WikiSpider(CrawlSpider):
    """Crawls wikipedia starting at the seed page. 

    Rate limited by obeying robots.txt (see settings.py), autothrottle, and default delay of 1.1

    $ cd nlp/wikiscrapy
    $ scrapy crawl wiki -o wikipedia_erdos.json -t json

    >>> import subprocess
    >>> subprocess.check_output('scrapy', 'crawl', 'wiki', stderr=subprocess.STDOUT)

    """

    verbosity = 1
    name = 'wiki'
    download_delay = 1.1
    allowed_domains = ['en.wikipedia.org',
                       'en.wiktionary.org']  # , 'en.m.wikipedia.org']
    start_urls = ['''https://en.wikipedia.org/wiki/Paul_Erd%C5%91s''']
    rules = [
        Rule(SgmlLinkExtractor(allow=['/wiki/.+']),
             follow=True,
             process_links='filter_links',
             callback='parse_response'),
        #Rule(SgmlLinkExtractor(allow=['/wiki/.*']), 'parse_response')]
    ]

    def __init__(self, start_urls=None, *args, **kwargs):
        self.start_urls = ['''https://en.wikipedia.org/wiki/Paul_Erd%C5%91s''']
        if start_urls:
            if isinstance(start_urls, basestring):
                self.start_urls = [start_urls]
            else:
                self.start_urls = list(start_urls)
        super(WikiSpider, self).__init__(*args, **kwargs)

    def clean_list(self, l):
        ans = ['']
        for item in l:
            # TODO: regex to see if it's a number of the form 1.2.3 before creating a new line item
            # and use the section number as a key or value in a dictionary
            stripped = item.strip()
            if stripped:
                ans[-1] += stripped + ' '
            if item.endswith('\n'):
                ans[-1] = ans[-1].strip()
                ans += ['']
        return ans

    def filter_links(self, links):
        filtered_list = []
        for link in links:
            if not RE_WIKIPEDIA_SPECIAL.match(link.url):
                filtered_list += [link]
        if self.verbosity > 1:
            print '-' * 20 + ' LINKS ' + '-' * 20
            print '\n'.join(link.url for link in filtered_list)
        # sleep(1.1)
        if self.verbosity > 1:
            print '-' * 20 + '-------' + '-' * 20
        return filtered_list

    def parse_response(self, response):
        # TODO:
        #   1. check for error pages and slowdown or halt crawling
        #   2. throttle based on robots.txt
        #   3. save to database (so that json doesn't have to be loaded manually)
        #   4. use django Models rather than scrapy.Item model
        #   5. incorporate into a django app (or make it a django app configurable through a web interface)
        #   6. incrementally build occurrence matrix rather than saving raw data to django/postgres db
        if self.verbosity > 1:
            print '=' * 20 + ' PARSE ' + '=' * 20
        sel = Selector(response)
        a = WikiItem()
        a['url'] = response.url
        a['title'] = ' '.join(
            sel.xpath("//h1[@id='firstHeading']//text()").extract())
        a['toc'] = ' '.join(
            self.clean_list(
                sel.xpath("//div[@id='toc']//ul//text()").extract()))
        a['text'] = ' '.join(
            sel.xpath('//div[@id="mw-content-text"]//text()').extract())
        a['modified'] = clean_wiki_datetime(
            sel.xpath('//li[@id="footer-info-lastmod"]/text()').re(
                r'([0-9]+\s*\w*)'))
        a['crawled'] = datetime.now()
        a['count'] = dict(Counter(get_words(a['text'])))
        if self.verbosity > 1:
            print '=' * 20 + '=======' + '=' * 20
        yield a

Example #41

0

Show file

File: zzk_spider.py Project: yanjlee/stock

class ZzkSpider(CrawlSpider):
    name = "zzk"
    allowed_domains = [
        "39.net",
    ]
    start_urls = [
        "http://jbk.39.net/bw_t2",  #zhengzhuang home page
    ]
    rules = (Rule(SgmlLinkExtractor(allow=(r'http://jbk.39.net/*')),
                  follow=True,
                  callback="parse_item"), )

    def parse_item(self, response):
        hxs = Selector(response)
        div_xpath = hxs.xpath(
            '//section[contains(@class, "main wrap")]/div[contains(@class, "list_con clearfix")]'
        )
        symptoms_tag = hxs.xpath(
            '//section[contains(@class, "main wrap")]/header[contains(@class, "list_tit")]/h1/cite[contains(@class, "bg2")]/text()'
        ).extract()
        sheet_name = hxs.xpath(
            '//section[contains(@class, "main wrap")]/header[contains(@class, "list_tit")]//div[contains(@id, "list_nav")]//li[contains(@class, "h")]/text()'
        ).extract()
        #如果标签不是症状，或没有标签则返回
        if (cmp(str(symptoms_tag), "[u'\u75c7\u72b6']") == 0) and (cmp(
                str(sheet_name), "[u'\u7efc\u8ff0']") == 0):
            item = ZzkItem()
            item["url"] = response.url
            symptoms_name = hxs.xpath(
                '//section[contains(@class, "main wrap")]/header[contains(@class, "list_tit")]/h1/b/text()'
            ).extract()
            item["symptoms_name"] = symptoms_name
            symptoms_description = hxs.xpath(
                '//section[contains(@class, "main wrap")]/div[contains(@class, "list_con clearfix")]//dd[contains(@id, "intro")]/p[contains(@class, "sort2")]/text()'
            ).extract()
            item["symptoms_description"] = symptoms_description

            disease_info_list_path = div_xpath.xpath(
                '//div[contains(@class, "item")]//table[contains(@class, "dis")]/tr'
            )
            if not disease_info_list_path:
                return

            disease_item_list = []
            for disease_info in disease_info_list_path:
                class_name = disease_info.xpath('.//@class').extract()

                if cmp(str(class_name), "[u'name']") == 0:
                    disease_info_xpath = disease_info.xpath('./td')
                    disease_list = []
                    disease_url = disease_info.xpath(
                        './td[contains(@class, "name")]/a/@href').extract()

                    for disease in disease_info_xpath:
                        disease_name = disease.xpath('./a/@title').extract()
                        disease_list.append(disease_name)

                    disease_item = DiseaseItem()
                    disease_item["disease_name"] = disease_list[0]
                    disease_item["symptoms"] = disease_list[1]
                    disease_item["department"] = disease_list[2]
                    disease_item["disease_url"] = disease_url
                    disease_item_list.append(disease_item)

            item["disease"] = disease_item_list

            return item

Example #42

0

Show file

class IfengSpider(CrawlSpider):
    name = "stock_ifeng"
    szIndexUrl = [
        "http://api.finance.ifeng.com/index.php/akdaily/?code=sh000001&tpye=fq",
        "http://api.finance.ifeng.com/index.php/akdaily/?code=sz399001&tpye=fq"
    ]
    allowed_domains = [
        "bestgo.com",
        "api.finance.ifeng.com",
        "app.finance.ifeng.com",
    ]
    start_urls = []

    day_str = []
    for day_span in range(1, 19, 5):
        day_str.append(
            (date.today() - timedelta(days=day_span)).strftime('%Y%m%d'))
    for page_id in range(1, 32):
        for day in day_str:
            url = "http://www.bestgo.com/fund/SH/%s/1,%d.html" % (day, page_id)
            start_urls.append(url)
            url = "http://www.bestgo.com/fund/%s/1,%d.html" % (day, page_id)
            start_urls.append(url)
    for idx in [1, 2, 3, 5, 7]:
        url = "http://app.finance.ifeng.com/list/all_stock_cate.php?s=%d" % idx
        start_urls.append(url)

    rules = [
        Rule(SgmlLinkExtractor(
            allow=(r'http://www.bestgo.com/fund/.*\d+/1.\d+\.html')),
             follow=True,
             callback="parse_item"),
        Rule(SgmlLinkExtractor(allow=(
            r'http://app.finance.ifeng.com/list/stock_cate.php\?c=\d+$')),
             follow=True,
             callback="parse_cate"),
    ]

    def parse_item(self, response):
        for url in self.szIndexUrl:
            yield scrapy.Request(url, self.parse_api)
        hxs = Selector(response)
        stocks = hxs.xpath('//div[@class="grid-view"]/table/tbody/tr')
        self.log(str(len(stocks)))
        for stock in stocks:
            stock_id = stock.xpath('./td/a/span/text()')
            if len(stock_id) < 1:
                continue
            stock_code = 'sz'
            if 'SH' in response.url:
                stock_code = 'sh'
            stock_code = stock_code + stock_id[0].extract()
            self.log('[' + response.url + "]:[" + stock_code + ']',
                     level=scrapy.log.DEBUG)
            api_url = "http://api.finance.ifeng.com/index.php/akdaily/?code=%s&tpye=fq" % (
                stock_code)
            yield scrapy.Request(api_url, self.parse_api)

    def parse_api(self, response):
        stock_code = None
        params = response.url.split('?')[-1].split('&')
        for param in params:
            [k, v] = param.split('=')
            if k == 'code':
                stock_code = v

        if stock_code is None:
            self.log("no stock_id found in [" + response.url + "]",
                     scrapy.log.CRITICAL)

        data = json.loads(response.body)
        records = data["record"]
        for record in records:
            self.log("stock history:" + stock_code + ":" + ",".join(record))
        item = StockItem()
        item["code"] = stock_code
        item["records"] = records
        item['cate'] = 'ifeng'
        return item

    def parse_cate(self, response):
        stock_cate = None
        params = response.url.split('?')[-1].split('&')
        for param in params:
            [k, v] = param.split('=')
            if k == 'c':
                stock_cate = v
        stock_code_list = []
        hxs = Selector(response)
        stock_codes = hxs.xpath('//table/tr/td[1]/a').extract()
        self.log("[cate_code]" + str(len(stock_codes)))
        for stock_code in stock_codes:
            items = stock_code.split('/')
            if len(items) < 6:
                continue
            code = items[6]
            self.log(code)
            stock_code_list.append(code)
        item = StockItem()
        item['code'] = stock_cate
        item['records'] = stock_code_list
        item['cate'] = "cate"
        return item

Example #43

0

Show file

class CraigslistSpider(CrawlSpider):
    name = "craigslist"
    allowed_domains = ["craigslist.org"]
    start_urls = [
        "http://lascruces.craigslist.org",
        "http://lascruces.craigslist.org/cas/",
        "http://lascruces.craigslist.org/cas/index100.html",
        "http://lascruces.craigslist.org/cas/index200.html",
        "http://lascruces.craigslist.org/cas/index300.html",
        "http://lascruces.craigslist.org/cas/index400.html",
        "http://lascruces.craigslist.org/cas/index500.html",
        "http://lascruces.craigslist.org/cas/index600.html",
        "http://lascruces.craigslist.org/cas/index700.html",
        "http://lascruces.craigslist.org/cas/index800.html",
        "http://lascruces.craigslist.org/cas/index900.html",
        "http://lascruces.craigslist.org/cas/index1000.html",
        "http://lascruces.craigslist.org/cas/index1100.html",
        "http://lascruces.craigslist.org/cas/index1200.html",
        "http://lascruces.craigslist.org/cas/index1300.html",
        "http://lascruces.craigslist.org/cas/index1400.html",
        "http://lascruces.craigslist.org/cas/index1500.html",
        "http://lascruces.craigslist.org/cas/index1600.html",
        "http://lascruces.craigslist.org/cas/index1700.html",
        "http://lascruces.craigslist.org/cas/index1800.html",
        "http://lascruces.craigslist.org/cas/index1900.html",
        "http://lascruces.craigslist.org/cas/index2000.html",
        "http://lascruces.craigslist.org/cas/index2100.html",
        "http://lascruces.craigslist.org/cas/index2200.html",
        "http://lascruces.craigslist.org/cas/index2300.html",
        "http://lascruces.craigslist.org/cas/index2400.html",
        "http://lascruces.craigslist.org/cas/index2500.html",
        "http://lascruces.craigslist.org/cas/index2600.html",
        "http://lascruces.craigslist.org/cas/index2700.html",
        "http://lascruces.craigslist.org/cas/index2800.html",
        "http://lascruces.craigslist.org/cas/index2900.html",
        "http://lascruces.craigslist.org/cas/index3000.html",
        "http://lascruces.craigslist.org/cas/index3100.html",
        "http://lascruces.craigslist.org/cas/index3200.html",
        "http://lascruces.craigslist.org/cas/index3300.html",
        "http://lascruces.craigslist.org/cas/index3400.html",
        "http://lascruces.craigslist.org/cas/index3500.html",
        "http://lascruces.craigslist.org/cas/index3600.html",
        "http://lascruces.craigslist.org/cas/index3700.html",
        "http://lascruces.craigslist.org/cas/index3800.html",
        "http://lascruces.craigslist.org/cas/index3900.html",
        "http://lascruces.craigslist.org/cas/index4000.html",
        "http://lascruces.craigslist.org/cas/index4100.html",
        "http://lascruces.craigslist.org/cas/index4200.html",
        "http://lascruces.craigslist.org/cas/index4300.html",
        "http://lascruces.craigslist.org/cas/index4400.html",
        "http://lascruces.craigslist.org/cas/index4500.html",
        "http://lascruces.craigslist.org/cas/index4600.html",
        "http://lascruces.craigslist.org/cas/index4700.html",
        "http://lascruces.craigslist.org/cas/index4800.html",
        "http://lascruces.craigslist.org/cas/index4900.html",
        "http://lascruces.craigslist.org/cas/index5000.html",
        "http://lascruces.craigslist.org/cas/index5100.html",
        "http://lascruces.craigslist.org/cas/index5200.html",
        "http://lascruces.craigslist.org/cas/index5300.html",
        "http://lascruces.craigslist.org/cas/index5400.html",
        "http://lascruces.craigslist.org/cas/index5500.html",
        "http://lascruces.craigslist.org/cas/index5600.html",
        "http://lascruces.craigslist.org/cas/index5700.html",
        "http://lascruces.craigslist.org/cas/index5800.html",
        "http://lascruces.craigslist.org/cas/index5900.html",
        "http://lascruces.craigslist.org/cas/index6000.html",
        "http://lascruces.craigslist.org/cas/index6100.html",
        "http://lascruces.craigslist.org/cas/index6200.html",
        "http://lascruces.craigslist.org/cas/index6300.html",
        "http://lascruces.craigslist.org/cas/index6400.html",
        "http://lascruces.craigslist.org/cas/index6500.html",
        "http://lascruces.craigslist.org/cas/index6600.html",
        "http://lascruces.craigslist.org/cas/index6700.html",
        "http://lascruces.craigslist.org/cas/index6800.html",
        "http://lascruces.craigslist.org/cas/index6900.html",
        "http://lascruces.craigslist.org/cas/index7000.html",
        "http://lascruces.craigslist.org/cas/index7100.html",
        "http://lascruces.craigslist.org/cas/index7200.html",
        "http://lascruces.craigslist.org/cas/index7300.html",
        "http://lascruces.craigslist.org/cas/index7400.html",
        "http://lascruces.craigslist.org/cas/index7500.html",
        "http://lascruces.craigslist.org/cas/index7600.html",
        "http://lascruces.craigslist.org/cas/index7700.html",
        "http://lascruces.craigslist.org/cas/index7800.html",
        "http://lascruces.craigslist.org/cas/index7900.html",
        "http://lascruces.craigslist.org/cas/index8000.html",
        "http://lascruces.craigslist.org/cas/index8100.html",
        "http://lascruces.craigslist.org/cas/index8200.html",
        "http://lascruces.craigslist.org/cas/index8300.html",
        "http://lascruces.craigslist.org/cas/index8400.html",
        "http://lascruces.craigslist.org/cas/index8500.html",
        "http://lascruces.craigslist.org/cas/index8600.html",
        "http://lascruces.craigslist.org/cas/index8700.html",
        "http://lascruces.craigslist.org/cas/index8800.html",
        "http://lascruces.craigslist.org/cas/index8900.html",
        "http://lascruces.craigslist.org/cas/index9000.html",
        "http://lascruces.craigslist.org/cas/index9100.html",
        "http://lascruces.craigslist.org/cas/index9200.html",
        "http://lascruces.craigslist.org/cas/index9300.html",
        "http://lascruces.craigslist.org/cas/index9400.html",
        "http://lascruces.craigslist.org/cas/index9500.html",
        "http://lascruces.craigslist.org/cas/index9600.html",
        "http://lascruces.craigslist.org/cas/index9700.html",
        "http://lascruces.craigslist.org/cas/index9800.html",
        "http://lascruces.craigslist.org/cas/index9900.html"
    ]
    rules = (Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//a')),
                  callback="parse",
                  follow=True), )

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//span[@class='pl']")
        date_info = hxs.select(
            "//h4[@class='ban']/span[@class='bantext']/text()")
        items = []
        file_to = open("things.txt", "a")
        file_to.write(response.body)
        for titles in titles:
            item = CraigslistSampleItem()
            item["title"] = titles.select("a/text()").extract()
            item["link"] = titles.select("a/@href").extract()
            item["date"] = date_info.extract()
            items.append(item)
        return items

Example #44

0

Show file

File: ms.py Project: luckyfish86/pub-owl-collector

class MSSpider(CrawlSpider):
    name = "ms"
    allowed_domains = []
    f = open('/home/wyp/pub-owl-collector/ms_url.json', mode='r')
    d = json.loads(f.read())
    #print d
    #start_urls = [conf['link'] for conf in d]
    f.close()
    start_urls = [
        "http://academic.research.microsoft.com/Detail?entitytype=3&searchtype=2&id=69"
    ]
    rules = (
        Rule(SgmlLinkExtractor(allow=('Publication/')), callback='parse_item'),
        Rule(SgmlLinkExtractor(restrict_xpaths=('//a[text()="Next"]')),
             follow=True),
    )

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        item = PapersItem()
        item['title'] = hxs.select(
            '//span[@id="ctl00_MainContent_PaperItem_title" and @class="title-span"]/text()'
        ).extract()
        item['fulltext'] = hxs.select(
            '//*[@id="ctl00_MainContent_PaperItem_downLoadList_ctl00_HyperLink2"]/@href'
        ).extract()
        item['description'] = [
            ''.join(
                hxs.select(
                    '//*[@id="ctl00_MainContent_PaperItem_snippet"]/text()').
                extract())
        ]
        item['conference'] = hxs.select(
            '//*[@id="ctl00_MainContent_PaperItem_HLConference"]/text()'
        ).extract()
        patt2 = re.compile(r'^\W+(.*)$')
        item['year'] = [
            re.search(patt2, (hxs.select(
                '//*[@id="ctl00_MainContent_PaperItem_YearConference"]/text()'
            ).extract())[0]).group(1)
        ]
        nodes = hxs.select(
            '//div[@id="ctl00_MainContent_PaperItem_divPaper"]/div/a[@class="author-name-tooltip"]'
        )
        patt = re.compile('\/(\d+)\/')
        opener = urllib2.build_opener()
        author = []
        affiliation = []
        for node in nodes:
            author.append(node.select('text()').extract()[0])
            authorid = re.search(patt,
                                 node.select('@href').extract()[0]).group(1)
            f = opener.open(
                "http://academic.research.microsoft.com/io.ashx?authorID=%s" %
                authorid).read()
            d = json.loads(f)
            affiliation.append(d['Affiliation']
                               and d['Affiliation']['FullName'])
        item['author'] = author
        item['affiliation'] = affiliation

        return item

Example #45

0

Show file

File: test_contrib_linkextractors.py Project: reprior123/TraderSoftwareRP

 def test_urls_type(self):
     '''Test that the resulting urls are regular strings and not a unicode objects'''
     lx = SgmlLinkExtractor()
     self.assertTrue(all(isinstance(link.url, str) for link in lx.extract_links(self.response)))

Example #46

0

Show file

File: SampleSpider.py Project: RH-xu/depression

class GroupSpider(CrawlSpider):
    name = "Group222"
    allowed_domains = ["douban.com"]
    start_urls = [
        "http://www.douban.com/group/explore?tag=%E8%B4%AD%E7%89%A9",
        "http://www.douban.com/group/explore?tag=%E7%94%9F%E6%B4%BB",
        "http://www.douban.com/group/explore?tag=%E7%A4%BE%E4%BC%9A",
        "http://www.douban.com/group/explore?tag=%E8%89%BA%E6%9C%AF",
        "http://www.douban.com/group/explore?tag=%E5%AD%A6%E6%9C%AF",
        "http://www.douban.com/group/explore?tag=%E6%83%85%E6%84%9F",
        "http://www.douban.com/group/explore?tag=%E9%97%B2%E8%81%8A",
        "http://www.douban.com/group/explore?tag=%E5%85%B4%E8%B6%A3"
    ]

    rules = [
        Rule(SgmlLinkExtractor(allow=('/group/[^/]+/$', )),
             callback='parse_group_home_page',
             process_request='add_cookie'),
        #   Rule(SgmlLinkExtractor(allow=('/group/[^/]+/discussion\?start\=(\d{1,4})$', )), callback='parse_group_topic_list', process_request='add_cookie'),
        Rule(SgmlLinkExtractor(allow=('/group/explore\?tag', )),
             follow=True,
             process_request='add_cookie'),
    ]

    def __get_id_from_group_url(self, url):
        m = re.search("^http://www.douban.com/group/([^/]+)/$", url)
        if (m):
            return m.group(1)
        else:
            return 0

    def add_cookie(self, request):
        request.replace(cookies=[])
        return request

    def parse_group_topic_list(self, response):
        self.log("Fetch group topic list page: %s" % response.url)
        pass

    def parse_group_home_page(self, response):

        self.log("Fetch group home page: %s" % response.url)

        hxs = HtmlXPathSelector(response)
        item = DoubanItem()

        #get group name
        item['groupName'] = hxs.select('//h1/text()').re("^\s+(.*)\s+$")[0]

        #get group id
        item['groupURL'] = response.url
        groupid = self.__get_id_from_group_url(response.url)

        #get group members number
        members_url = "http://www.douban.com/group/%s/members" % groupid
        members_text = hxs.select('//a[contains(@href, "%s")]/text()' %
                                  members_url).re("\((\d+)\)")
        item['totalNumber'] = members_text[0]

        #get relative groups
        item['RelativeGroups'] = []
        groups = hxs.select('//div[contains(@class, "group-list-item")]')
        for group in groups:
            url = group.select(
                'div[contains(@class, "title")]/a/@href').extract()[0]
            item['RelativeGroups'].append(url)
        #item['RelativeGroups'] = ','.join(relative_groups)
        return item

Example #47

0

Show file

File: test_contrib_linkextractors.py Project: reprior123/TraderSoftwareRP

    def test_extraction_using_single_values(self):
        '''Test the extractor's behaviour among different situations'''

        lx = SgmlLinkExtractor(allow='sample')
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        ])

        lx = SgmlLinkExtractor(allow='sample', deny='3')
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
        ])

        lx = SgmlLinkExtractor(allow_domains='google.com')
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://www.google.com/something', text=u''),
        ])

        lx = SgmlLinkExtractor(deny_domains='example.com')
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://www.google.com/something', text=u''),
        ])

Example #48

0

Show file

File: __init__.py Project: forkkit/Site-grabber

class GrabberSpider(CrawlSpider):
    name = "grabber"
    allowed_domains = []

    # Let's think how to pass here url
    start_urls = []

    rules = [
        Rule(SgmlLinkExtractor(), callback='parse_item', follow=True),
        Rule(SgmlLinkExtractor(allow=[r'.*\.css'],
                               deny_extensions=[],
                               tags=[
                                   'link',
                               ],
                               attrs=[
                                   'href',
                               ]),
             callback='parse_css_item',
             follow=False),
    ]

    def check_local_domain_uniqueness(self, local_domain):
        q = self.dbsession.query(WebSite)\
            .filter(WebSite.local_domain == local_domain)
        check = q.first()
        if check and check.original_url not in self.allowed_domains:
            return False
        return True

    def __init__(self, *args, **kw):

        # get extra parameters of scraper launch cmd
        SCRAPED_DOMAIN = kw.pop('SCRAPED_DOMAIN', None)
        START_URL = kw.pop('START_URL', None)
        LOCAL_DOMAIN = kw.pop('LOCAL_DOMAIN', None)

        if START_URL:
            self.start_urls = [START_URL]

        if SCRAPED_DOMAIN:
            if SCRAPED_DOMAIN.strip('.')[0] == 'www':
                self.allowed_domains = [
                    SCRAPED_DOMAIN,
                    SCRAPED_DOMAIN.replace('www.', '')
                ]
            else:
                self.allowed_domains = [
                    SCRAPED_DOMAIN, 'www.%s' % SCRAPED_DOMAIN
                ]

        if LOCAL_DOMAIN is None:
            raise GrabberSpiderError('No local_url is specified for job')

        super(GrabberSpider, self).__init__(*args, **kw)
        log.msg('Init SQL alchemy engine', level=log.DEBUG)
        engine = engine_from_config(WEB_APP_SETTINGS, 'sqlalchemy.')
        conn = engine.connect()
        self.dbsession = Session(bind=conn)

        # patch orm objects to use this local session object

        Base.metadata.create_all(engine)  # while use creating DB here

        if not self.check_local_domain_uniqueness(LOCAL_DOMAIN):
            raise GrabberSpiderError('%s is already used in db')

        q = self.dbsession.query(WebSite)\
            .filter(WebSite.original_url == SCRAPED_DOMAIN)
        website = q.first()
        if website is None:
            website = WebSite(original_url=SCRAPED_DOMAIN,
                              local_domain=LOCAL_DOMAIN)
            self.dbsession.add(website)
            self.dbsession.commit()
        self.website = website
        '''
            Check directory for media and create it if it does not exist
        '''
        media_dir = WEB_APP_SETTINGS.get('downloaded.path')
        if media_dir:
            if not os.path.exists(media_dir):
                os.mkdir(media_dir)
        else:
            raise Exception(
                'Directory for downloaded media is not specified in settings')
        '''
            Check id downloaded media url is in application settings
        '''
        if WEB_APP_SETTINGS.get('downloaded.url') is None:
            raise Exception('URL for downloaded media is not specified')

    def prepare_link(self, url, current_url):
        '''
            Make proper uri from given url
        '''
        if not current_url.endswith('/'):
            current_url += '/'

        # ignore javascript links
        for s in ['javascript:', 'mailto:', '#']:
            if url.startswith(s):
                return None

        if url.find('http://') != -1 or url.find('https://') != -1:
            # in case we have complete http or https protocol uri
            url = url.replace('http://', '').replace('https://', '')
            url_domain = url.split('/')[0]
            # Scrape just this site urls
            if url_domain == self.website.original_url:
                return url
            else:
                return None  # don't scrape external links

        if url.startswith('/'):
            # we get absolute url
            return "http://%s%s" % (self.website.original_url, url)
        else:
            return "%s%s" % (current_url, url)

    def parse_item(self, response):

        log.msg('I\'m here: %s' % response.url, level=log.DEBUG)
        return self.handle_page(response)

    def parse_css_item(self, response):

        log.msg('I\'m here: %s' % response.url, level=log.DEBUG)
        return self.handle_page(response, css=True)

    def _get_path(self, url):
        path = url.replace('http://', '')
        path = path.split('/')
        path[0] = ''
        path = '/'.join(path)
        return urllib.unquote_plus(path)

    def handle_page(self, response, css=False):
        path = self._get_path(response.url)
        content = response.body.decode(response.encoding)
        item = WebPageItem(uri=path,
                           content=content,
                           css=css,
                           response=response)
        return item

Example #49

0

Show file

File: appstore.py Project: secpersu/appcrawl

class AppstoreSpider(CrawlSpider):
    name = 'appstore'
    allowed_domains = ['itunes.apple.com']
    start_urls = [
        'http://itunes.apple.com/us/genre/ios-books/id6018?mt=8',
        'http://itunes.apple.com/us/genre/ios-business/id6000?mt=8',
        'http://itunes.apple.com/us/genre/ios-catalogs/id6022?mt=8',
        'http://itunes.apple.com/us/genre/ios-education/id6017?mt=8',
        'http://itunes.apple.com/us/genre/ios-entertainment/id6016?mt=8',
        'http://itunes.apple.com/us/genre/ios-finance/id6015?mt=8',
        'http://itunes.apple.com/us/genre/ios-food-drink/id6023?mt=8',
        'http://itunes.apple.com/us/genre/ios-games/id6014?mt=8',
        'http://itunes.apple.com/us/genre/ios-health-fitness/id6013?mt=8',
        'http://itunes.apple.com/us/genre/ios-lifestyle/id6012?mt=8',
        'http://itunes.apple.com/us/genre/ios-medical/id6020?mt=8',
        'http://itunes.apple.com/us/genre/ios-music/id6011?mt=8',
        'http://itunes.apple.com/us/genre/ios-navigation/id6010?mt=8',
        'http://itunes.apple.com/us/genre/ios-news/id6009?mt=8',
        'http://itunes.apple.com/us/genre/ios-newsstand/id6021?mt=8',
        'http://itunes.apple.com/us/genre/ios-photo-video/id6008?mt=8',
        'http://itunes.apple.com/us/genre/ios-productivity/id6007?mt=8',
        'http://itunes.apple.com/us/genre/ios-reference/id6006?mt=8',
        'http://itunes.apple.com/us/genre/ios-social-networking/id6005?mt=8',
        'http://itunes.apple.com/us/genre/ios-sports/id6004?mt=8',
        'http://itunes.apple.com/us/genre/ios-travel/id6003?mt=8',
        'http://itunes.apple.com/us/genre/ios-utilities/id6002?mt=8',
        'http://itunes.apple.com/us/genre/ios-weather/id6001?mt=8',
    ]

    rules = (
        Rule(SgmlLinkExtractor(allow='letter=[\w\*]+'),
             follow=True,
             callback="parse_applist"),
        Rule(SgmlLinkExtractor(allow='letter=[\w\*]+\&page=[\d]+'),
             follow=True,
             callback="parse_applist"),
    )

    def parse(self, response):
        r = list(CrawlSpider.parse(self, response))
        return r + list(self.parse_applist(response))

    def parse_applist(self, response):  #parse_applist
        hxs = HtmlXPathSelector(response)
        category = hxs.select('//title/text()').extract()[0].split(
            '-')[0].strip()
        idx = 0
        for url, name in zip(
                hxs.select('//div[contains(@class,"column")]/ul/li/a/@href'
                           ).extract(),
                hxs.select('//div[contains(@class,"column")]/ul/li/a/text()').
                extract()):
            if not '/app/' in url:
                continue
            i = AppItem()
            i['name'] = name
            i['url'] = url
            i['id'] = url.split('/')[-1].split('?')[0]
            i['category'] = category
            i['last_update'] = datetime.date.today().isoformat()
            i['store'] = 'appstore'
            idx += 1
            yield i

    def parse_app(self, response):  #parse_app
        hxs = HtmlXPathSelector(response)
        i = AppStoreItem()
        i['name'] = hxs.select('//div/div/h1/text()').extract()[0]
        i['url'] = response.url
        i['id'] = response.url.split('/')[-1].split('?')[0]
        attrs = hxs.select('//div[@id="content"]')
        i['description'] = "\n".join(
            attrs.select('//div[@class="product-review"]/p/text()').extract())
        i['artwork'] = attrs.select(
            '//div[@class="lockup product application"]/a/div/img/@src'
        ).extract()
        i['price'] = attrs.select('//div[@class="price"]/text()').extract()[0]
        i['release_date'] = attrs.select(
            '//li[@class="release-date"]/text()').extract()[0]
        release_date, version, size, languages, seller, copyright = tuple(
            attrs.select('//li/text()').extract())[
                0:6]  #hugely unsafe but that's how we roll
        i['release_date'] = release_date
        i['version'] = version
        i['size'] = size
        i['languages'] = languages
        i['seller'] = seller
        seller_link = hxs.select('//div[@class="app-links"]/a/@href').extract()
        if len(seller_link) > 1:
            i['seller_link'] = seller_link[0]
        else:
            i['seller_link'] = ''
        i['copyright'] = copyright
        i['rating'] = attrs.select(
            '//a[@href="http://itunes.apple.com/WebObjects/MZStore.woa/wa/appRatings"]/text()'
        ).extract()[0]
        try:
            requirements = attrs.select(
                '//div[@class="lockup product application"]/p/text()').extract(
                )[0]
        except:
            requirements = ''
        i['requirements'] = requirements
        i['reviews'] = ''  #todo
        i['screenshots'] = "|".join(
            hxs.select(
                '//div[@class="swoosh lockup-container application large screenshots"]//img/@src'
            ).extract())
        i['is_iphone'] = 'iPhone' in requirements
        i['is_ipad'] = 'iPad' in requirements
        i['is_ipod'] = 'iPod' in requirements
        i['last_update'] = datetime.date.today().isoformat()
        i['store'] = 'appstore'
        yield i

Example #50

0

Show file

File: documents.py Project: tadaskrisciunas/manoseimas.lt

class LegalActsSpider(ManoSeimasSpider):
    name = 'legal-acts'
    allowed_domains = ['lrs.lt']

    # p_drus - document type
    # p_kalb_id - language
    # p_rus - order by
    # p_gal - document status
    start_urls = [
        # # Current versions
        # ('http://www3.lrs.lt/pls/inter3/dokpaieska.rezult_l?'
        #  'p_drus=102&p_kalb_id=1&p_rus=1&p_gal=33'),
        # Legislation
        ('http://www3.lrs.lt/pls/inter3/dokpaieska.rezult_l?'
         'p_drus=1&p_kalb_id=1&p_rus=1&p_gal='),
        # # Law drafts
        # ('http://www3.lrs.lt/pls/inter3/dokpaieska.rezult_l?'
        #  'p_drus=2&p_kalb_id=1&p_rus=1&p_gal='),
        # Constitution
        ('http://www3.lrs.lt/pls/inter3/dokpaieska.rezult_l?'
         'p_drus=8&p_kalb_id=1&p_rus=1&p_gal='),

        #'http://www3.lrs.lt/pls/inter3/dokpaieska.rezult_l?p_nr=&p_nuo=2010%2006%2001&p_iki=&p_org=&p_drus=2&p_kalb_id=1&p_title=&p_text=&p_pub=&p_met=&p_lnr=&p_denr=&p_es=0&p_rus=1&p_tid=&p_tkid=&p_t=0&p_tr1=2&p_tr2=2&p_gal=',
    ]

    rules = (
        Rule(SgmlLinkExtractor(allow=r'dokpaieska.rezult_l\?')),
        Rule(SgmlLinkExtractor(allow=r'dokpaieska.susije_l\?p_id=-?\d+$'),
             'parse_related_documents'),
        Rule(
            SgmlLinkExtractor(allow=r'dokpaieska.showdoc_l\?p_id=-?\d+.*',
                              deny=r'p_daug=[1-9]'), 'parse_document'),
    )

    pipelines = (pipelines.ManoseimasPipeline, )

    def _fix_name_case(self, act):
        name = act.get_output_value('name')
        for idx, words in (
            (-2, (u'ĮSTATYMO PROJEKTAS')),
            (-1, (u'ĮSTATYMAS')),
        ):
            a, b = split_by_words(name, idx)
            if b in words:
                act.replace_value('name', '%s %s' % (a, b.lower()))
                return

    def _parse_law_act(self, response, hxs, base=False):
        """
        Extracts basic document information and returns law act loader.

        Parameters:

        base
            Return only base information about document. This is used, when
            filling some information bits to a law act from several law act
            documents.

        """
        lang = hxs.select('tr[1]/td[4]/b/text()').extract()[0].strip().lower()

        if lang not in (u'lietuvių', u'rusų', u'anglų', u'ispanų'):
            self.error(response, 'Unknown language: %s' % lang)

        if lang != u'lietuvių':
            return None

        act = Loader(self, response, LegalAct(), hxs, required=REQUIRED_FIELDS)

        act.add_xpath('_id', 'tr[1]/td[2]/b/text()')

        source = self._get_source(response.url, 'p_id')

        if not act.get_output_value('_id'):
            act.replace_value('_id', u'NONUMBER-%s' % source['id'])

        if base:
            return act

        act.add_xpath('name', 'caption/text()')
        act.add_xpath('kind', 'tr[1]/td[1]/b/text()')
        act.add_xpath('number', 'tr[1]/td[2]/b/text()')
        act.add_xpath('date', 'tr[1]/td[3]/b/text()')

        act.add_value('source', source)

        self._fix_name_case(act)

        return act

    def _involved_parts(self, response, hxs, act):
        involved_string = hxs.select('tr[3]/td[1]/b/text()').extract()
        involved_string = ' '.join(involved_string)
        if not involved_string:
            return None

        m = DOCUMENT_INVOLVED_PARTS.match(involved_string)
        if not m:
            return None

        involved = Loader(self,
                          response,
                          DocumentInvolved(),
                          hxs,
                          required=(
                              'date',
                              'how',
                              'institution',
                          ))
        involved.add_value('date', m.group(1))
        involved.add_value('how', m.group(2).lower())
        institution = m.group(3)
        if ',' in institution:
            # TODO: move this to utility function, same code is also used
            # in manoseimas/scrapy/spiders/mps.py:171
            spl = institution.replace(u'Švietimo, mokslo',
                                      u'Švietimo%2c mokslo')
            spl = map(lambda x: urllib.unquote(x.strip()), spl.split(','))
            spl = filter(None, spl)
            if len(spl) == 2:
                person, institution = spl
            else:
                person, group, institution = spl
                spl = group.strip().split()
                group_types = (u'komitetas', u'grupė', u'frakcija',
                               u'komisija')
                if spl[-1].lower() in group_types:
                    group_type = spl[-1].lower()
                elif spl[0].lower() in group_types:
                    group_type = spl[0].lower()
                else:
                    group_type = None

                if group_type:
                    involved.add_value('group', group)
                    involved.add_value('group_type', group_type)
                else:
                    self.error(response, 'Not committee: %s' % group)
            involved.add_value('person', person)
        involved.add_value('institution', institution)
        act.add_value('involved', dict(involved.load_item()))

    def _extract_html_as_attachment(self, response, loader, xpath, name):
        text = HtmlXPathSelector(response).select(xpath).extract()
        text = clean_html('\n'.join(text))
        body = text.encode('utf-8')
        loader.add_value('_attachments', [(name, body, 'text/html')])

    def _get_legislation_links(self, response, hxs):
        for link in hxs.select('tr[4]/td/a'):
            text = get_first(link, 'text()')
            if text == u'Susiję dokumentai':
                url = get_absolute_url(response, get_first(link, '@href'))
                yield Request(url, callback=self.parse_related_documents)

    def _legislation(self, response, hxs):
        act = self._parse_law_act(response, hxs)
        if not act:
            raise StopIteration

        self._involved_parts(response, hxs, act)

        self._extract_html_as_attachment(
            response, act, "/html/body/*[name()='div' or name()='pre']",
            'original_version.html')

        act.reset_required(*(REQUIRED_FIELDS + ('_attachments', )))
        yield act.load_item()

        for request in self._get_legislation_links(response, hxs):
            yield request

    def _current_edition(self, response, hxs):
        # Do not collect documents, if they are not currently valid.
        valid_edition = hxs.select('tr[4]/td[1]/a[2]/font/b/text()')
        if (valid_edition and valid_edition.extract()[0]
                == u'Galiojanti aktuali redakcija'):
            raise StopIteration

        act = self._parse_law_act(response, hxs, base=True)

        if act:
            self._extract_html_as_attachment(
                response, act, "/html/body/*[name()='div' or name()='pre']",
                'updated_version.html')
            act.reset_required('_id', '_attachments')
            yield act.load_item()

    def parse_document(self, response):
        # Some thimes lrs.lt returns empty page...
        if not response.body:
            return

        xpath = '/html/body/table[2]'
        hxs = HtmlXPathSelector(response).select(xpath)[0]

        # Get document kind
        kind = hxs.select('tr[1]/td[1]/b/text()').extract()[0].strip().lower()

        if kind in (u'konstitucija', u'įstatymas', u'įstatymo projektas',
                    u'kodeksas'):
            items = self._legislation(response, hxs)
        elif kind == u'aktuali redakcija':
            items = self._current_edition(response, hxs)
        else:
            items = []

        for item in items:
            yield item

    def parse_related_documents(self, response):
        xpath = '/html/body/div/table/tr[3]/td/table/tr/td/table/tr'
        hxs = HtmlXPathSelector(response).select(xpath)
        act = Loader(self, response, LegalAct(), hxs, required=('_id', ))
        act.add_xpath('_id', 'td[2]/b/text()')

        if not act.get_output_value('_id'):
            p_id = unicode(self._get_query_attr(response.url, 'p_id'))
            act.replace_value('_id', u'NONUMBER-%s' % p_id)

        relations = defaultdict(list)
        xpath = '/html/body/div/table/tr[3]/td/table/tr/td/align/table/tr'
        for row in HtmlXPathSelector(response).select(xpath):
            docid = get_all(row, 'td[4]/span//text()')
            rel_type = row.select('td[6]/span/text()')
            if rel_type:
                rel_type = rel_type.extract()[0].strip().lower()

            if rel_type in (u'pakeistas dokumentas',
                            u'ankstesnė dokumento redakcija'):
                relations['amends'].append(docid)

            elif rel_type == u'priimtas dokumentas':
                relations['adopts'].append(docid)

            elif rel_type == u'ryšys su taikymą nusakančiu dokumentu':
                relations['defines_applicability'].append(docid)

            elif rel_type == u'ryšys su galiojimą nusakančiu dokumentu':
                relations['defines_validity'].append(docid)

            elif rel_type == u'negalioja de jure':
                relations['defines_as_no_longer_valid'].append(docid)

            elif rel_type == u'kitas projekto variantas':
                relations['new_draft_version'].append(docid)

            elif rel_type == u'kitas projekto variantas':
                relations['new_draft_version'].append(docid)

            elif rel_type == u'ryšys su ratifikavimo dokumentu':
                relations['ratification'].append(docid)

        if relations:
            act.add_value('relations', dict(relations))
            yield act.load_item()

    def _find_related_law(self, db, doc):
        keyword = u' įstatymo '
        if 'name' not in doc or keyword not in doc['name']:
            return False

        name = doc['name'].split(keyword, 2)[0] + u' įstatymas'
        rs = db.view('scrapy/by_name', key=name, include_docs=True)
        if len(rs) > 0:
            doc.setdefault('relations', {})['law'] = [rs.rows[0]['id']]
            return True

    def _set_type(self, db, doc):
        if 'kind' not in doc:
            return False

        if (doc['kind'] in (u'įstatymas', u'konstitucija')
                and not doc.get('relations')):
            doc['type'] = u'įstatymas'
            return True

        elif doc['kind'] == u'įstatymas':
            doc['type'] = u'įstatymo pataisa'
            return True

    def post_process(self, db, started):
        #for row in db['legalact'].view('scrapy/by_update_time',
        #                               startkey=started, include_docs=True):
        for row in db['legalact'].view('_all_docs', include_docs=True):
            doc = row.doc

            changed = False
            for fn in (self._set_type, self._find_related_law):
                changed = fn(db['legalact'], doc) or changed

            if changed:
                db['legalact'][doc['_id']] = doc

Example #51

0

Show file

class FollowAllSpider(Spider):

    name = 'followall'

    def __init__(self, **kw):
        super(FollowAllSpider, self).__init__(**kw)
        url = kw.get('url') or kw.get('domain') or 'http://scrapinghub.com/'
        if not url.startswith('http://') and not url.startswith('https://'):
            url = 'http://%s/' % url
        self.url = url
        self.allowed_domains = [urlparse(url).hostname.lstrip('www.')]
        self.link_extractor = SgmlLinkExtractor()
        self.cookies_seen = set()

    def start_requests(self):
        return [Request(self.url, callback=self.parse)]

    def parse(self, response):
        """Parse a PageItem and all requests to follow

        @url http://www.scrapinghub.com/
        @returns items 1 1
        @returns requests 1
        @scrapes url title foo
        """
        page = self._get_item(response)
        r = [page]
        r.extend(self._extract_requests(response))
        return r

    def _get_item(self, response):
        item = Page(url=response.url,
                    size=str(len(response.body)),
                    referer=response.request.headers.get('Referer'))
        self._set_title(item, response)
        self._set_new_cookies(item, response)
        return item

    def _extract_requests(self, response):
        r = []
        if isinstance(response, HtmlResponse):
            links = self.link_extractor.extract_links(response)
            r.extend(Request(x.url, callback=self.parse) for x in links)
        return r

    def _set_title(self, page, response):
        if isinstance(response, HtmlResponse):
            title = Selector(response).xpath("//title/text()").extract()
            if title:
                page['title'] = title[0]

    def _set_new_cookies(self, page, response):
        cookies = []
        for cookie in [
                x.split(';', 1)[0]
                for x in response.headers.getlist('Set-Cookie')
        ]:
            if cookie not in self.cookies_seen:
                self.cookies_seen.add(cookie)
                cookies.append(cookie)
        if cookies:
            page['newcookies'] = cookies

Example #52

0

Show file

class IndeedSpider(CrawlSpider):
  name = "indeed"
  allowed_domains = ["indeed.com"]
  pages = 4
  url_template = "http://www.indeed.com/jobs?q=%s&l=Chicago&start=%s"
  start_urls = []

  rules = (


        Rule(SgmlLinkExtractor(restrict_xpaths=("//div[@class='row ' or @class='row lastRow']/h2/a/@href"))),
      Rule(SgmlLinkExtractor(allow=('http://www.indeed.com/jobs',),deny=('/my/mysearches', '/preferences', '/advanced_search','/my/myjobs')), callback='parse_item', follow=False),

      )

  #Initialize the start_urls
  job_queries = []
  with open('job_queries.cfg', 'r') as f:
    for line in f:
      job_queries.append(line.strip())


  # Build out the start_urls to scrape
  for job_query in job_queries:
    for page in range(1,pages):
      full_url = url_template % (job_query, str(page*10))
      start_urls.append(full_url)


  '''
  def __init__(self, *args, **kwargs):
    # Get the search queries for the jobs from the job_queries.cfg file
    # Config file must have 1 query per line

    super(IndeedSpider, self).__init__(*args, **kwargs)

  '''

  def get_job_description(self, html, item):

    summary_string = item['summary'][0][1:-5]
    root = lxml.html.document_fromstring(html)
    target_element = None


    # For some reason the summary will not match the lxml extracted text, figure out why
    # This solution is hacky


    # Get only the first sentence
    # Indeed cobbles together multiple sentences from the job posting
    summary_string = summary_string.split(".",1)[0]

    summary_start_list = summary_string.split(" ")[:3]
    summary_start = " ".join(summary_start_list)



    counter  = 0
    # Find the element that contains the initial words in the summary string
    for element in root.iter():
      counter += 1
      if element.text:
        if (summary_start in element.text):
          target_element = element
          print 'YES. element.txt'
          break
      elif element.tail:
        if (summary_start in element.tail):
          target_element = element
          print 'YES element.tail'
          break



    generation_count = 0

    target_ancestor = None

    job_posting_min_length = 400
    job_posting_max_length = 10000


    # Find the best parent element that contains the entire job description without the extra html
    if target_element is not None:
      if target_element.text_content() > job_posting_min_length:
        target_ancestor = target_element
      elif 'shiftgig' in item['source_url']:
        target_ancestor = target_element
      else:
        for ancestor in target_element.iterancestors():
          generation_count += 1

          ancestor_text = ancestor.text_content()

          target_ancestor = ancestor

          # The loop will pre-maturely break once the ancestor elements has minimum threshold of characters
          if len(ancestor_text) > job_posting_min_length:
            break


    return target_ancestor.text_content()

  def parse_next_site(self, response):



    item = response.request.meta['item']
    item['source_url'] = response.url
    item['crawl_timestamp'] =  time.strftime('%Y-%m-%d %H:%M:%S')



    job_description = self.get_job_description(response.body, item)
    item['full_description'] = job_description



    return item


  def parse_item(self, response):
    '''
    import pdb
    pdb.set_trace()
    '''


    self.log('\n Crawling  %s\n' % response.url)
    hxs = HtmlXPathSelector(response)
    sites = hxs.select("//div[@class='row ' or @class='row lastRow']")
    #sites = hxs.select("//div[@class='row ']")
    items = []

    #Skip top two sponsored ads
    for site in sites[:-2]:
      item = IndeedItem(company='none')

      item['job_title'] = site.select('h2/a/@title').extract()
      link_url= site.select('h2/a/@href').extract()
      item['link_url'] = link_url
      item['crawl_url'] = response.url
      item['location'] = site.select("span[@itemprop='jobLocation']/span[@class='location']/span[@itemprop='addressLocality']/text()").extract()
      # Not all entries have a company
      company_name = site.select("span[@class='company']/span[@itemprop='name']/text()").extract()
      if company_name == []:
        item['company'] = [u'']
      else:
        item['company'] = company_name

      item['summary'] =site.select("table/tr/td/div/span[@class='summary']/text()").extract()
      #item['source'] = site.select("table/tr/td/span[@class='source']/text()").extract()
      item['found_date'] =site.select("table/tr/td/span[@class='date']/text()").extract()
      #item['source_url'] = self.get_source(link_url)


      if len(item['link_url']):
        request = Request("http://www.indeed.com" + item['link_url'][0], callback=self.parse_next_site)
        request.meta['item'] = item

        yield request


    return

Example #53

0

Show file

File: DealExtreme.py Project: PranavSonar/scrapers

class DealExtreme(CrawlSpider):
    name = "dealextreme"
    allowed_domains = ["dx.com"]
    start_urls = [url.strip() for url in urllist]
    rules = (
        Rule(SgmlLinkExtractor(
            allow=(), restrict_xpaths=("//div[@class='pagenumber']", )),
             follow=True),
        Rule(SgmlLinkExtractor(allow=(),
                               restrict_xpaths=("//p[@class='title']", )),
             callback="parse_product",
             follow=True),
    )

    def parse(self, response):

        sel = Selector(response)
        # LinkExtraction
        # script = sel.xpath("//script[contains(text(),'productAttrs: [')]").extract()[0]
        # script = re.findall(r'productAttrs(.*)',script)
        # links = re.findall(r'Url":(.*?)",',script[0])
        # for link in links:
        # link = "http://www.dx.com/p/"+link.replace('"',"")
        # row = [link]
        # mywriter.writerow(row)

        #ProductExtraction
        pname = 'Impertus ' + sel.xpath("//h1/span/text()").extract()[0]
        metainfo = "Buy " + pname + " Online in India at LiveYourSport.com| Free Shipping and Massive Discounts"
        images = sel.xpath(
            "//ul[@class='product-small-images']//img/@src").extract()
        price = sel.xpath("//span[@id='price']/text()").extract()[0]
        price = float(price) * 115 / 100 * 4
        prince = str(price)
        category = (
            sel.xpath('//div[@class="position"]/a[last()-1]/text()').extract()
            [0] + '/' +
            sel.xpath('//div[@class="position"]/a[last()]/text()').extract()[0]
        )
        description = [
            "DISCLAIMER: LiveYourSport.com does not take responsibility for any support claims and technical troubleshooting."
            +
            "This product is not valid for any technical support, warranty after purchase or protected by our after sales services."
            +
            "We only offer protection against delivery damages and manufacturing defects claimed within 10 days of delivery of the product."
        ]
        description = description.append(
            sel.xpath("//div[@id='overview']").extract() +
            sel.xpath("//div[id='specification']").extract())

        sku = sel.xpath("//span[@id='sku']/text()").extract()[0] + 'DXMDCHN'
        row = [
            "Product", "", pname + '*', "Impertus", price * 140 / 100,
            price * 140 / 100, price, description, sku, 'DEALEXTREME',
            category, pname, '12-19 Working Days', 100, 'N', -270, metainfo,
            metainfo, metainfo, 'Y', 'By Product', 1, 2, 3, 4, 5
        ]
        for image in images:
            image = image.replace("//img", "img")
            row.append(image)
        mywriter.writerow(row)

Example #54

0

Show file

File: compuindia_spider.py Project: ashish2/spidy

class CompuindiaSpider(CrawlSpider):
	name = 'compuindia'
	allowed_domains = ['compuindia.com']
	start_urls = ['http://www.compuindia.com']
	#~start_urls = ['http://www.compuindia.com/touch-pc.html']
	#~start_urls = ['http://www.compuindia.com/touch-pc/new-inspiron-242.html']
	
	urlList = []
	
	# DONE
	# If you are writing a process_value, then you will 
	# have to return the links you want. You can not leave it
	# without returning.
	# process_value
	# NOT USED FTM
	def pv(value):
		"""This function takes values links from Rules, and can process those links in any manner"""
		# Removing a link having "deals.html" in it, as we dont want that link to be crawled
		if re.search('.*deals\.html', value):
			return None
		else:
			return value
	
	# NOT USED FTM
	def pl(value):
		print sys._getframe().f_code.co_name
		print 'value'
		print type(value)
		print value
		print 'length: ' + str( len(value))
		sys.exit('X')
	
	def parse_item(response):
		#~self.log('Hi, this is an item page! %s' % response.url)
		#~print 'function'
		#~print sys._getframe().f_code.co_name
		#~print 'response'
		#~print response
		#~print type(response)
		#~print "response.url"
		#~print response.url
		
		hxs = HtmlXPathSelector(response)
		
		# ATM, all these item values are coming in a List type with just the 0th key
		item = CompuindiaItem()
		
		item['sourceurl'] = [ response.url ]
		
		#~item['code'] = hxs.select('//td[@class="data"]/text()')[0].extract() # Code: Unicode
		item['code'] = hxs.select('//td[@class="data"]/text()')[0].extract().encode('utf-8') # Code: String
		
		item['price'] = hxs.select('//span[@class="price"]/text()')[0].extract().encode('utf-8')
		
		# left
		item['color'] = [None]
		# Try to do matching with class="last odd"
		#~item['color'] = hxs.select('//tbody/tr[@class="last odd"]') 
		
		item['name'] = hxs.select("//div[@class='product-name']/h1/text()").extract()[0]
		
		#~item['features'] =  hxs.select('//ul[@class="config_listing_pd_page"]/li').extract()
		item['features'] =  hxs.select('//ul[@class="config_listing_pd_page"]/li/text()').extract()
		
		#~item['specs'] = hxs.select('//div[@class="box-collateral box-additional"]').extract()
		item['specs'] = hxs.select('//div[@class="box-collateral box-additional"]').extract()[0].encode('utf-8')
		
		#~item['description'] = hxs.select('//div[@class="box-collateral box-description"]').extract()
		item['description'] = hxs.select('//div[@class="box-collateral box-description"]').extract()[0].encode('utf-8')
		
		item['moreDescription'] = [None]
		
		#~item['additionalInfo'] = hxs.select('//div[@id="additional"]').extract()
		item['additionalInfo'] = hxs.select('//div[@id="additional"]').extract()[0].encode('utf-8')
		
		item['relatedProducts'] = [None] # FTM
		
		#IMAGES
		main_img = []
		image_urls = []
		main_img = hxs.select("//p[@class='product-image']/a/@href").extract()
		img_urls = hxs.select("//div[@class='more-views']/ul/li/a/@href").extract()
		
		item['image_urls'] =  list( set( main_img + img_urls ) )
		#IMAGES-
		
		#~print 'item'
		#~print item
		#~
		#~sys.exit('S')
		
		return item
		
		
	rules = (
		# Extract links matching 'category.php' (but not matching 'subsection.php')
		# and follow links from them (since no callback means follow=True by default).
		
		### Now 3rd Page
		# Extracting the actual data and images from Product Page
		#~Rule( SgmlLinkExtractor( restrict_xpaths=[ "//div[@class='product-name']" ] ), callback=parse_item, follow=False ), 
		
		#~Rule( SgmlLinkExtractor( restrict_xpaths=[ "//div[@class='product-view']" ], deny = [".*tata-photon.*",  ] ), callback=parse_item, follow=True ), 
		# FIN THIS
		#~Rule( SgmlLinkExtractor( restrict_xpaths=[ "//div[@class='product-view']" ], deny = [".*tata-photon.*",  ] ), callback = parse_item, follow=False ), 
		#~Rule( SgmlLinkExtractor( restrict_xpaths=[ "//div[@class='col-main']/div[@class='product-view']" ], deny = [".*tata-photon.*",  ] ), callback = parse_item, follow=False ), 
		
		
		### Now 2nd Page
		#~Rule( SgmlLinkExtractor( restrict_xpaths = "//h4[@class='product-name']", process_value= pv2 ), follow=False ),
		#~Rule( SgmlLinkExtractor( restrict_xpaths = "//div[@class='category-products']", process_value= pvN ), follow=False ),
		#~Rule( SgmlLinkExtractor( restrict_xpaths = "//div[@class='category-products']", process_value= pvN ), follow=False ),
		#~Rule( SgmlLinkExtractor( restrict_xpaths="//h4[@class='product-name']", process_value= pvn ), follow=True ),
		# This time i dont need to do process_links
		#~Rule( SgmlLinkExtractor( restrict_xpaths="//h4[@class='product-name']" ), process_links=pl, follow=True ),
		
		# ===PAGE 2: DONE===
		# THESE 2, put it as, list in "restrict_xpaths"
		# IF:
		# THIS
		#~Rule( SgmlLinkExtractor( restrict_xpaths="//h4[@class='product-name']" ), follow=True ),
		# THIS
		#~Rule( SgmlLinkExtractor( restrict_xpaths="//a[@class='next i-next']" ), process_links=pl, follow=True ),
		# ELSE:
		#~Rule( SgmlLinkExtractor( restrict_xpaths=[ "//h4[@class='product-name']", "//a[@class='next i-next']", ] ), process_links=pl, follow=True ),
		#~Rule( SgmlLinkExtractor( restrict_xpaths=[ "//h4[@class='product-name']", "//a[@class='next i-next']", ] ), process_links=pl, follow=False ),
		#~Rule( SgmlLinkExtractor( restrict_xpaths=[ "//h4[@class='product-name']", "//a[@class='next i-next']", ] ), follow=True ),
		
		# FIN THIS
		#PAGE2: NEXT LINKS
		Rule( SgmlLinkExtractor( restrict_xpaths=[ "//a[@class='next i-next']", ] ), follow=True ),
		
		# FIN THIS
		# PAGE2: PRODUCT LINKS
		#~Rule( SgmlLinkExtractor( restrict_xpaths=[ "//h4[@class='product-name']" ] ), follow = True ),
		Rule( SgmlLinkExtractor( restrict_xpaths=[ "//div[@class='category-products']" ], deny = [".*dir=.*", ".*order=.*", ],  ), callback=parse_item , follow = False ),
		#~Rule( SgmlLinkExtractor( restrict_xpaths=[ "//div[@class='category-products']/ul[@class='products-grid']" ] ), follow = True ),
		
		# Not working
		#~Rule( SgmlLinkExtractor( restrict_xpaths="//a[starts-with(., 'next')]" ), process_links=pl, follow=True ),
		
		# ===PAGE 1: DONE===
		# CompuIndia.com
		### 1st Page
		# Awesome!!!
		# Now links are getting extracted from first page,
		# now go to next page, add another rule & get the item links, with another rule,
		# then third page which is actual product page, extract real data from there
		#~Rule( SgmlLinkExtractor( deny = ( 'deals\.html', ), restrict_xpaths = "//div[@class='parentMenu']", process_value= pv ), follow=True ),
		
		#~Rule( SgmlLinkExtractor( deny= re.compile('.*deals\.html', re.I), restrict_xpaths = "//div[@class='parentMenu']", process_value= pv ), follow=True ),
		# Since i m putting process_value, i have to do a return value and return None, wherever necessary, otherwise the parser stops following links
		# as they haven't been returned
		# THIS
		#~Rule( SgmlLinkExtractor( restrict_xpaths = "//div[@class='parentMenu']", process_value= pv ) , follow=True ),
		# FIN THIS
		Rule( SgmlLinkExtractor( restrict_xpaths = "//div[@class='parentMenu']", deny = [".*deals\.htm.*"], ) , follow=True ),
		#~Rule( SgmlLinkExtractor( restrict_xpaths = "//div[@class='parentMenu']", deny = [".*deals\.htm.*"], process_value= pv ) , follow=True ),
		
		#~Rule( SgmlLinkExtractor(process_value=pv) , follow=False  ),
		
		#~Rule( SgmlLinkExtractor( deny = 'deals\.html', restrict_xpaths = "//div[@class='menu']", process_value= pv ), follow=True ),
		#~Rule( SgmlLinkExtractor( deny = 'deals\.html', restrict_xpaths = "//div[@id='custommenu']", process_value= pv ), follow=True ),
		
		
		# Y is this following?
		#~Rule( SgmlLinkExtractor( restrict_xpaths = "//div[@id='custommenu']", process_value= pv ), follow=False ),
		# Working too.
		#~Rule( SgmlLinkExtractor( process_value= pv ), follow=False ),
		
		#~Rule( SgmlLinkExtractor( restrict_xpaths = "//div[@class='parentMenu']" ) , callback = "chk_urls" ),
		#~Rule( SgmlLinkExtractor( restrict_xpaths = "//div[@class='custommenu']" ) , process_links = "chk_urls" ),
		
		#~Rule( SgmlLinkExtractor( restrict_xpaths = ["//div[@class='parentMenu']/a/@href", ] ) , follow=True, process_request = "chk_urls" ),
		
		#~Rule( SgmlLinkExtractor( ) , process_request = "chk_urls" ),
		
		# Extract links matching 'item.php' and parse them with the spider's method parse_item
		#~Rule( SgmlLinkExtractor( allow=('item\.php', ) ), callback='parse_item' ),
	)
	
	
	
	
	#~def parse_start_url(self, response):
		#~print "parse_start_url"
		#~print 'response'
		#~print response
		#~
		#~pass
		
	
	#~def parse_item(self, response):
		#~print "response"
		#~print response
		#~
		#~item = CompuindiaItem()
		#~item['url'] = response.url
		#~
		#~return item
		#~print "self.urlList"
		#~print self.urlList
		#~pass
		#~
		#~return

Example #55

0

Show file

File: Spider.py Project: xiaoq4Git/Scrapy

class doubanSpider(CrawlSpider):
    name = "doubanmovie"
    allowed_domains = ["movie.douban.com"]
    start_urls = ["http://movie.douban.com/top250"]
    rules = [
        Rule(
            SgmlLinkExtractor(
                allow=(r'http://movie.douban.com/top250\?start=\d+.*'))),
        Rule(SgmlLinkExtractor(allow=(r'http://movie.douban.com/subject/\d+')),
             callback="parse_item"),
    ]

    def parse_item(self, response):
        sel = Selector(response)
        movie_name = sel.select(
            "//div[@id='content']/h1/span[1]/text()").extract()
        movie_director = sel.select(
            "//*[@id='info']/span[1]/span[2]/a/text()").extract()
        movie_writer = sel.select(
            "//*[@id='info']/span[2]/span[2]/a/text()").extract()
        movie_score = sel.xpath(
            "//*[@id='interest_sectl']/div/div[2]/strong/text()").extract()
        movie_classification = sel.xpath(
            "//span[@property='v:genre']/text()").extract()
        movie_description_paths = sel.select("//*[@id='link-report']")

        movie_description = []
        for movie_description_path in movie_description_paths:
            movie_description = movie_description_path.select(
                ".//*[@property='v:summary']/text()").extract()

        movie_roles_paths = sel.select("//*[@id='info']/span[3]/span[2]")
        movie_roles = []
        for movie_roles_path in movie_roles_paths:
            movie_roles = movie_roles_path.select(
                ".//*[@rel='v:starring']/text()").extract()

        movie_detail = sel.select("//*[@id='info']").extract()

        item = WorkItem()
        item['movie_name'] = ''.join(movie_name).strip().replace(
            ',', ';').replace('\'', '\\\'').replace('\"',
                                                    '\\\"').replace(':', ';')
        item['movie_director'] = movie_director[0].strip().replace(
            ',', ';').replace('\'', '\\\'').replace('\"', '\\\"').replace(
                ':', ';') if len(movie_director) > 0 else ''
        item['movie_score'] = movie_score[0].strip().replace(',', ';').replace(
            '\'', '\\\'').replace('\"', '\\\"').replace(
                ':', ';') if len(movie_director) > 0 else ''
        item['movie_classification'] = movie_classification[0].strip().replace(
            ',', ';').replace('\'', '\\\'').replace('\"', '\\\"').replace(
                ':', ';') if len(movie_director) > 0 else ''
        item['movie_description'] = movie_description[0].strip().replace(
            ',', ';').replace('\'', '\\\'').replace('\"', '\\\"').replace(
                ':', ';') if len(movie_description) > 0 else ''
        item['movie_writer'] = ';'.join(movie_writer).strip().replace(
            ',', ';').replace('\'', '\\\'').replace('\"',
                                                    '\\\"').replace(':', ';')
        item['movie_roles'] = ';'.join(movie_roles).strip().replace(
            ',', ';').replace('\'', '\\\'').replace('\"',
                                                    '\\\"').replace(':', ';')

        movie_detail_str = ''.join(movie_detail).strip()

        movie_language_str = ".*语言:</span>(.+?)<span.*".decode("utf8")
        movie_date_str = ".*上映日期:</span> <span property=\"v:initialReleaseDate\" content=\"(\S+?)\">(\S+?)</span>.*".decode(
            "utf8")
        movie_long_str = ".*片长:</span> <span property=\"v:runtime\" content=\"(\d+).*".decode(
            "utf8")

        pattern_language = re.compile(movie_language_str, re.S)
        pattern_date = re.compile(movie_date_str, re.S)
        pattern_long = re.compile(movie_long_str, re.S)

        movie_language = re.search(pattern_language, movie_detail_str)
        movie_date = re.search(pattern_date, movie_detail_str)
        movie_long = re.search(pattern_long, movie_detail_str)

        item['movie_language'] = ""
        if movie_language:
            item['movie_language'] = movie_language.group(1).replace(
                '<br>',
                '').strip().replace(',', ';').replace('\'', '\\\'').replace(
                    '\"', '\\\"').replace(':', ';')

        item['movie_date'] = ""
        if movie_date:
            item['movie_date'] = movie_date.group(1).strip().replace(
                ',', ';').replace('\'',
                                  '\\\'').replace('\"',
                                                  '\\\"').replace(':', ';')

        item['movie_long'] = ""
        if movie_long:
            item['movie_long'] = movie_long.group(1)

        yield item

Example #56

0

Show file

class SpiderSpider(CrawlSpider):
    count = 0
    name = "pcconnection_tv"

    dic = set()

    allowed_domains = init_allowed_domains

    start_urls = init_start_urls

    rules = (
        #only extract links here
        Rule(SgmlLinkExtractor(allow=allowed_url), callback="parse"),
    )

    @property
    def sleep_time(self):
        return random.random() * MAX_SLEEP_TIME

    def parse(self, response):
        '''
        extract
        title
        content
        url
        '''
        print '>'*50
        print 'response url: ', response.url
        hxs = HtmlXPathSelector(response)
        print '>>>> repsonse.url: ', response.url
        #get urls
        content_urls = hxs.select(content_url_format).extract()

        list_urls = hxs.select(list_url_format).extract()
        list_urls = [ up.urljoin(response.url, url) for url in list_urls]
        content_urls = [ up.urljoin(response.url, url) for url in content_urls]
        
        print "@" * 60 
        time.sleep(self.sleep_time)
        self.start_urls.extend(list_urls)

        for url in list_urls:
            yield Request(url, self.parse)

        content_re = re.compile(r'http://www[.]pcconnection[.]com/.*cac=Result')
        for url in content_urls:
            if content_re.match(url):
                if len(self.dic) > 600:
                    self.start_urls = []
                    raise CloseSpider('reach pages limit, end the spider.')

                self.count += 1
                self.dic.add( hash(url))
                #extract data
                item = SpiderItem()
                item['url'] = url
                item['kind'] = self.name
                yield item
            else:
                print "!!!!!!! not match content url:"
                print url

Example #57

0

Show file

File: processThreads.py Project: vasusvodorosus/LDA-ermunds

    def parse(self, response):
        print('inside a thread')
        hxs = HtmlXPathSelector(response)
        filename_ = response.url.split("/")[-2][1:]
        filename = os.path.abspath(databasePath + "\data\%s" % filename_)
        dumpFilePath = os.path.abspath(databasePath + "\dump\%s" % filename_)
        try:
            a = response.meta['page']
        except KeyError:
            a = 0
            os.mkdir(dumpFilePath)
            with open(filename, 'a') as f:
                #header
                forumTitle = hxs.select(
                    '//div[@class="module forums"]/h2/text()').extract(
                    )[0].encode('ascii', 'ignore').replace('\n', '')
                extraInfo = hxs.select(
                    '//div[@class="module forums discussion tid"]/h4/text()'
                ).extract()[0].encode('ascii', 'ignore').replace('\n', '')
                f.write("title:" + forumTitle + "\n")
                f.write("extraInfo:" + extraInfo + "\n")
                f.write(response.url + "\n")
                f.write(filename + "\n")
                f.write(dumpFilePath + "\n\n")

        with open(dumpFilePath + "\\" + str(a) + '.html', 'a') as fd:
            fd.write(response.body)

        with open(filename, 'a') as f:
            for entry in hxs.select('//div[contains(@class,"forums-thread")]'):
                msgID = entry.select('span/@id').extract()[0]
                msgDate = entry.select('h4/text()').extract()[0].encode(
                    'ascii', 'ignore').replace('\n', '')
                msgText = ' '.join(
                    entry.select('span/text()').extract()).encode(
                        'ascii', 'ignore').replace('\n', '')
                try:
                    mgAuthor = entry.select(
                        'h3/span/a/text()').extract()[0].encode(
                            'ascii', 'ignore').replace('\n', '')
                except:
                    mgAuthor = 'none'
                try:
                    msgTitle = entry.select(
                        'h3/strong/text()').extract()[0].encode(
                            'ascii', 'ignore').replace('\n', '')
                except:
                    msgTitle = "none"
                f.write('msgID:' + msgID + '\n')
                f.write('msgTitle:' + msgTitle + '\n')
                f.write('mgAuthor:' + mgAuthor + '\n')
                f.write('msgDate:' + msgDate + '\n')
                f.write('msgText:' + msgText + '\n\n')
        s = SgmlLinkExtractor(
            restrict_xpaths=['//li[contains(@class, "next")]'])
        Links = s.extract_links(response)
        if len(Links) > 0:
            print 'going to the next page'
            r = Request(googc + Links[0].url, callback=self.parse)
            r.meta['page'] = a + 1
            yield r

Example #58

0

Show file

class HuoBiaoSpider(CrawlSpider):
    name = 'huo_biao'
    allowed_domains = ['www.huobiao.cn']
    start_urls = ['http://www.huobiao.cn/search/word/%E7%89%A9%E4%B8%9A/']

    rules = (Rule(SgmlLinkExtractor(allow=('/detail?id=\w+', )),
                  callback='parse_page',
                  follow=True), )
    headers = {
        "Accept":
        "application/json, text/plain, */*",
        "Accept-Encoding":
        "gzip, deflate",
        "Accept-Language":
        "zh-CN,zh;q=0.9",
        "Connection":
        "keep-alive",
        "Referer":
        "http://www.huobiao.cn/",
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    }

    # 重写了爬虫类的方法, 实现了自定义请求, 运行成功后会调用callback回调函数
    def start_requests(self):
        return [
            Request("http://www.huobiao.cn/do_login",
                    meta={'cookiejar': 1},
                    callback=self.post_login)
        ]

        # FormRequeset出问题了
    def post_login(self, response):
        # 登陆成功后, 会调用after_login回调函数
        return [
            FormRequest.from_response(
                response,  # "http://www.zhihu.com/login",
                meta={'cookiejar': response.meta['cookiejar']},
                headers=self.headers,  # 注意此处的headers
                formdata={
                    'phone': '1341953*****',
                    'password': '******',
                    'checkout': 'on'
                },
                callback=self.after_login,
            )
        ]

    def after_login(self, response):
        for url in self.start_urls:
            yield self.make_requests_from_url(url)

    def parse_page(self, response):
        problem = Selector(response)
        item = HuobiaoItem()
        item['title'] = problem.xpath('//span[@class="name"]/text()').extract()
        item['url'] = response.url
        item['name'] = problem.xpath('//span[@class="name"]/text()').extract()

        item['name']
        item['title'] = problem.xpath(
            '//h2[@class="zm-item-title zm-editable-content"]/text()').extract(
            )
        item['description'] = problem.xpath(
            '//div[@class="zm-editable-content"]/text()').extract()
        item['answer'] = problem.xpath(
            '//div[@class=" zm-editable-content clearfix"]/text()').extract()
        return item

Example #59

0

Show file

File: UnderArmour (2).py Project: PranavSonar/scrapers

class UnderArmour(CrawlSpider):
  name = "underarmour1"
  allowed_domains = ["underarmour.com"]
  start_urls = [#"https://www.underarmour.com/en-us/mens/footwear/basketball-shoes",]
                #"https://www.underarmour.com/en-us/mens/apparel/tops/hoodies"]
                "https://www.underarmour.com/en-us/outlet/mens/tops"]
  
    
  
 # rules = (Rule (SgmlLinkExtractor(allow=(),restrict_xpaths=('//div[@class="grid-content"]',))
  #  , callback="parse_items", follow= True),)
  rules = (Rule (SgmlLinkExtractor(allow=(),restrict_xpaths=('//div[@class="next"]',)), follow= True),
   Rule (SgmlLinkExtractor(allow=(),restrict_xpaths=('//div[@class="bottom-section"]',))
    , callback="parse_items", follow= True),)
  
  csvfile = None
  printHeader = True
  def to_csv(self, item):
    if self.printHeader: 
      self.csvfile = open('UnderArmour.csv','w')
    if self.csvfile:
      strWrite = ''
      #headers
      if self.printHeader: 
        strWrite +='Item Type,Product Name,Brand Name,Price,Retail Price,Sale Price,Product Description,Product Code/SKU,'
        strWrite +='Category,Option Set,Product Availability,Current Stock Level,Free Shipping,Sort Order, Meta Description,Page Title, Product Image Description - 1,Product Image Is Thumbnail - 1,'
        strWrite +='Track Inventory,Product Image Sort - 1,Product Image Sort - 2,Product Image Sort - 3,Product Image Sort - 4,Product Image Sort-5,'
        strWrite +='Product Image File - 1,Product Image File - 2,Product Image File - 3,Product Image File - 4,Product Image File - 5 , \n'
        self.printHeader = False

      #print basic product data
      strWrite += 'Product,'+item["Product_Name"]+ ',' + item["Brand_Name"] + ','
      strWrite += item["Price"] + ','+ item["Retail_Price"] +  ',' + item ["Sale_Price"] + ','
      strWrite += ';'.join(item["Product_Description"]).replace(',',';').replace('\n',"").replace("</div>","").replace("<h2>","").replace("</h2>","") + ',' + item["Product_Code"] +  ','  

      #for Images
      
      
      strWrite += item["Category"] + ',' + ';'.join(item["Option_Set"]) + ',' + item["Product_Availability"] +','
      strWrite += item["Current_Stock"] + ',' + item["Free_Shipping"] + ',' + item["Sort_Order"] + ',' + item['MetaDescription'] + ',' + item['TitleTag'] + ','
      strWrite += item["Product_Image_Description_1"] + ',' + item["Product_Image_Is_Thumbnail_1"] + ',' + item["Track_Inventory"] + ','
      strWrite += item["Product_Image_Sort_1"] + ',' + item["Product_Image_Sort_2"] + ',' + item["Product_Image_Sort_3"] + ','
      strWrite += item["Product_Image_Sort_4"] + ',' + item["Product_Image_Sort_5"] + ','
      #strWrite += ','.join(item["Product_Image_File1"]) +  +'\n'

      strWrite += ';'.join(item["Product_Image_File1"]) + ','

      strWrite += '\n'

      #print variant
      for sizes in item['variants']:
        strWrite += 'SKU,[S]Size= US ' + sizes + ',,,,,,' + item["id1"]+ "-" + sizes + item["color"]+',,,,,,,,,,,,,,,,\n'

      self.csvfile.write(strWrite.encode('utf8'))

      
#--BASIC PRODUCT DATA STARTS HERE--
  def parse_items(self,response):
  #def parse(self,response):
    sel = Selector(response)
    item = BigCItem()
    
    
    item ["Item_Type"] = "Product"
    #Product Name    
    color = sel.xpath("//span[@class='current-color-selection']/span[2]/text()")
    pname = sel.xpath("//h1[@itemprop='name']/text()")
    item ["Product_Name"] =  pname.extract()[0] + " " + color.extract()[0]+"*"    
    item["MetaDescription"] = "Get your hands on the " + pname.extract()[0] + " " + color.extract()[0] + ". Buy it Online in India at LiveYourSport.com| Free Shipping and Massive Discounts"
    item["TitleTag"] = "Buy the " +  pname.extract()[0] + " " + color.extract()[0] + " Online in India at LiveYourSport.com| Free Shipping and Massive Discounts"

    #Pricing
    mrp = response.xpath("//span[@class='buypanel_productprice--orig']/text()")    
    if mrp:
      item ["Retail_Price"] =  str(float(re.findall(r'\d+\.?\d+|\d+',sel.xpath("//span[@class='buypanel_productprice--orig']/text()").extract()[0])[0]) * 65*131/100+700)
      item ["Sale_Price"] = str(float(re.findall(r'\d+\.?\d+|\d+',sel.xpath("//span[@class='buypanel_productprice-value sale-price']/text()").extract()[0])[0]) * 65*131/100+700)
      item ["Price"] =  str(float(re.findall(r'\d+\.?\d+|\d+',sel.xpath("//span[@class='buypanel_productprice--orig']/text()").extract()[0])[0]) * 65*131/100+700)
    else:
      item ["Retail_Price"] = str(float(re.findall(r'\d+\.?\d+|\d+',sel.xpath("//span[@class='buypanel_productprice-value']/text()").extract()[0])[0]) * 65*131/100+700)
      item ["Price"] = str(float(re.findall(r'\d+\.?\d+|\d+',sel.xpath("//span[@class='buypanel_productprice-value']/text()").extract()[0])[0]) * 65*131/100+700)
      item ["Sale_Price"] = ""
        
    #brandName
    item ["Brand_Name"] = "Under Armour"
    #Product Code Extraction
    id = response.xpath("//meta[@property='og:url']/@content").extract()
    for url in id:
      item ["Product_Code"] = url.split('id')[-1] + color.extract()[0]
      item["id1"] = url.split('id')[-1]
      item["color"] = color.extract()[0]
    
    #Product Description 
    #Product Description 
    desc1 = sel.xpath("//span[@itemprop='description']/text()")
    desc2 = sel.xpath("//div[@class='buypanel_productdescription is-collapsed']/ul")       
    item["Product_Description"] = desc1.extract() + desc2.extract()
    
    #ImageFile
    item["Product_Image_File1"] = [x.replace("a248.e.akamai.net/f/248/9086/10h/","").split('?')[0] for x in sel.xpath("//div[@class='buypanel_productcaroitem--mobile']/img/@src").extract()]

    #CATEGORY
    #cat= sel.xpath("//h1[@itemprop='name']/text()")
    #if  

    
    item["Category"] = "Shoes/Men's Shoes/Basketball Shoes; Team Sports/Basketball/Basketball Shoes"
#Other Constants
    item["Option_Set"] = sel.xpath("//h1[@itemprop='name']/text()").extract()    
    item["Product_Availability"] = "12-17 Working Days"
    item["Current_Stock"] = "100"
    item["Free_Shipping"] = "N"
    item["Sort_Order"] = "-300" 
    item["Product_Image_Description_1"] = "Buy " + sel.xpath("//h1[@itemprop='name']/text()").extract()[0] + " Online in India at LiveYourSport.com| Free Shipping and Massive Discounts"
    item["Product_Image_Is_Thumbnail_1"] = "Y"
    item["Track_Inventory"] = "By Product"      
    item["Product_Image_Sort_1"] = "1"
    item["Product_Image_Sort_2"] = "2"
    item["Product_Image_Sort_3"] = "3"
    item["Product_Image_Sort_4"] = "4"
    item["Product_Image_Sort_5"] = "5"
    

    #---Sizes/Variants Start Here----
    item["variants"] = {}
    script                    = response.xpath("//script[contains(.,'JSON.parse')]").extract()[0]
    scriptJson                = script.split('JSON.parse(')[3].split('),')[0]
    scriptJsonDict            = json.loads(scriptJson)
    for x in scriptJsonDict['MATERIALS']:
      item["variants"][x['CODE']]                                     = {}
      item["variants"][x['CODE']]["sizes"]                            = {}
      item["variants"][x['CODE']]["colorCode"]                        = x['COLOR']['PRIMARY']['CODE']
      item["variants"][x['CODE']]["color"]                            = x['COLOR']['PRIMARY']['NAME']
      item["variants"][x['CODE']]["colorRGB"]                         = x['COLOR']['PRIMARY']['RGB']
      item["variants"][x['CODE']]["Retail_Price"]                     = x['PRICE']['ORIG']['MAX']
      item["variants"][x['CODE']]["Sale_Price"]                       = x['PRICE']['CURRENT']['MAX']
      item["variants"][x['CODE']]["image"]                            = []
      for c in x['ASSETS']:
        item["variants"][x['CODE']]["image"].append('https://origin-d4.scene7.com/is/image/Underarmour/'+c['NAME']+'?scl=2')
      
      for c in x['SIZES']:
        item["variants"][x['CODE']]["sizes"][c['CODE']]               = {}
        item["variants"][x['CODE']]["sizes"][c['CODE']]['inventory']  = c['INVENTORY']
        item["variants"][x['CODE']]["sizes"][c['CODE']]['name']       = c['NAME']
      
    self.to_csv(item);

    return item

Example #60

0

Show file

File: test_contrib_linkextractors.py Project: LucasNoga/Python

 def test_restrict_xpaths(self):
     lx = SgmlLinkExtractor(restrict_xpaths=('//div[@id="subwrapper"]', ))
     self.assertEqual([link for link in lx.extract_links(self.response)], [
         Link(url='http://example.com/sample1.html', text=u''),
         Link(url='http://example.com/sample2.html', text=u'sample 2'),
     ])