Example 1
    def parse_film(self, response):
        # Requires: from scrapy.selector import Selector.
        # The partially filled item is handed over from the previous callback via meta.
        item = response.meta['item']
        film_rank = Selector(response=response).xpath(
            '/html/body/div[2]/div/div[1]/div[2]/div/ul/li[1]/p/text()'
        ).extract_first()
        # The view-count lookup (//*[@id="resource_views"]) is left disabled;
        # a constant placeholder is stored instead.
        film_viewcount = '0'
        film_class = Selector(response=response).xpath(
            '/html/body/div[2]/div/div[1]/div[1]/div[2]/div[2]/div/img/@src'
        ).extract_first()
        film_cover = Selector(response=response).xpath(
            '/html/body/div[2]/div/div[1]/div[1]/div[2]/div[1]/div[1]/a/img/@src'
        ).extract_first()

        # Keep only the text after the first colon and trim surrounding whitespace.
        index = film_rank.index(':')
        film_rank = film_rank[index + 1:].strip()
        print(f'film_rank: {film_rank}')
        print(f'film_class: {film_class}')
        print(f'film_cover: {film_cover}')
        print(f'film_viewcount: {film_viewcount}')
        item['film_rank'] = film_rank
        item['film_class'] = film_class
        item['film_cover'] = film_cover
        item['film_viewcount'] = film_viewcount

        print(f"in parse_film , item: {item}")
        return item
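parse_film reads the half-built item from response.meta, so the callback that schedules the detail-page request has to attach it. A minimal sketch of that hand-off follows; the FilmItem class, its film_url field, and the link XPath are assumptions used only to make the sketch self-contained, not part of the original spider.

    def parse(self, response):
        # Hypothetical list-page callback: build a FilmItem per detail link and
        # pass it to parse_film through request.meta (assumes `import scrapy`).
        for href in response.xpath('//a/@href').extract():
            item = FilmItem()
            item['film_url'] = response.urljoin(href)
            yield scrapy.Request(
                url=item['film_url'],
                meta={'item': item},
                callback=self.parse_film,
            )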
Example 2
    def parse(self, response):
        # Requires: import re, import scrapy, from scrapy.selector import Selector,
        # and the project's IpspiderItem (fields: ip, port).
        try:
            for div in response.xpath(
                    "//div[@id='list']/table[@class='table']/tbody/tr/td[@class='ip']"):
                # The IP cell is split across several child tags, so the raw HTML
                # is scanned piece by piece and the digits, dots and separator are
                # re-assembled into "ip:port".
                tag_str = re.findall(r'>(\S+)</|>\W+<', str(div.extract()))
                if tag_str:
                    ip_port = ""
                    for temp_str in tag_str:
                        if not temp_str:
                            # An empty group means the second alternative matched,
                            # i.e. the ip/port separator.
                            temp_str = ":"
                        target_str = re.search(r'[^<>/]\d+|\d+\.|\.|\d+|:', temp_str)
                        if target_str:
                            ip_port += target_str.group()
                    print(ip_port)
                    str_list = ip_port.strip().split(":")
                    ip = str_list[0]
                    port = str_list[1]

                    item = IpspiderItem(ip=ip, port=port)
                    yield item

            # Find the pagination link whose page number is greater than the
            # current page and request it.
            current_page = response.xpath(".//span[@class='current']/text()")
            if current_page:
                current_page = current_page.extract()[0]

            nextPage_list = Selector(response=response).re(r'<a href="(\S*)">\d+</a>')
            index = None
            for temp in nextPage_list:
                page_num = re.search(r'\d+', str(temp))
                if page_num and int(page_num.group()) > int(current_page):
                    index = nextPage_list.index(temp)
                    break

            if index is not None:
                next_page = nextPage_list[index]
                print(next_page)
                yield scrapy.Request(url="http://www.goubanjia.com/free/" +
                                     str(next_page),
                                     callback=self.parse)

        except Exception as e:
            print(e)
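parse() constructs IpspiderItem from the project's items module. A definition consistent with the keyword arguments used above might look like the sketch below; only the two field names are taken from the code, the rest is an assumption.

import scrapy

class IpspiderItem(scrapy.Item):
    # Fields inferred from IpspiderItem(ip=ip, port=port) in parse() above.
    ip = scrapy.Field()
    port = scrapy.Field()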
Example 3
 def xt(cls, response):
     # Requires: from scrapy.selector import Selector.
     # Collect {title, html_url, pdf_url} records from the list items
     # matched by cls.LI_XPATH.
     docs = []
     raw_docs = response.xpath(cls.LI_XPATH)
     for raw_doc in raw_docs:
         html_url, pdf_url = "", ""
         urls = raw_doc.css("a").xpath("@href").extract()
         for url in urls:
             if url.endswith(".pdf"):
                 pdf_url = url
             elif url.endswith(".html"):
                 html_url = url
         # Re-parse the list item and take the text of its first link as the title,
         # keeping only the part before the first "/".
         title = Selector(text=raw_doc.extract()).xpath("//a[1]/text()").extract()[0]
         title = title[: title.index("/")].strip()
         docs.append({"title": title, "html_url": html_url, "pdf_url": pdf_url})
     return docs
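xt reads cls.LI_XPATH and is passed the response explicitly, so it is evidently meant to be a classmethod on a page-parser class that declares that XPath. The sketch below shows one way the call site could look; the class name, the LI_XPATH value, and the @classmethod decorator are assumptions, not taken from the source project.

import scrapy
from scrapy.selector import Selector

class DocListPage:
    # Hypothetical XPath for the list items; the real value lives in the source project.
    LI_XPATH = "//ul[@class='doc-list']/li"

    @classmethod
    def xt(cls, response):
        # Body as in Example 3 above.
        ...

# Inside a spider callback:
#     docs = DocListPage.xt(response)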