Code Example #1
File: wechat.py  Project: boyunli/article-generator
    def parse(self):
        # Categories: fashion, gossip, travel, health
        categorys = [9, 4, 11, 2]
        purl = 'http://weixin.sogou.com/pcindex/pc/pc_{category}/{page}.html'
        for category in categorys:
            urls = [
                purl.format(page=page, category=category)
                for page in range(1, 5)
            ]
            urls.insert(
                0,
                'http://weixin.sogou.com/pcindex/pc/pc_{category}/pc_{category}.html'
                .format(category=category))
            for url in urls:
                resp = rget(url)
                if not resp: continue
                html = etree.HTML(resp.content)
                hrefs = html.xpath(
                    '//ul[@id="pc_0_0"]//li/div[@class="txt-box"]/h3/a/@href')
                if not hrefs:
                    hrefs = html.xpath('//li/div[@class="img-box"]/a/@href')
                if not hrefs:
                    # Debugging aid: drop into pdb when neither XPath matches.
                    import pdb
                    pdb.set_trace()
                logger.debug("\033[92m Start crawling: {} \033[0m".format(url))
                details = []
                for href in hrefs:
                    try:
                        item = self._extract(href, url)
                        if not item: continue
                        details.append(item)
                    except IndexError:
                        # Failures like this are most likely network-related; the failed
                        # href should be pushed onto a queue to be re-crawled later.
                        continue
                NewsPipeline().save(details)
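
All of these examples rely on an `rget` helper that is never shown on this page. Judging from how it is used (it returns something falsy on failure and exposes `.content`), it is presumably a thin wrapper around `requests.get`. A minimal sketch, assuming the real helper also sets headers and retries; the retry count, timeout, and User-Agent below are illustrative, not taken from the project:

import time
import requests

def rget(url, retries=3, timeout=10):
    """Hypothetical stand-in for the project's rget: GET with retries,
    returning the Response on success and None on failure."""
    headers = {'User-Agent': 'Mozilla/5.0'}
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            if resp.status_code == 200:
                return resp
        except requests.RequestException:
            pass
        time.sleep(1 + attempt)  # simple backoff between attempts
    return None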
Code Example #2
    def parse(self):
        resp = rget('http://meilibaobao.com/artlist-217.html')
        html = etree.HTML(resp.content)

        try:
            last_page = ''.join(
                html.xpath('//td[@class="pagernum"]/a[last()]/text()'))
            last_page = int(last_page)
        except ValueError:
            # The page count could not be parsed; fall back to a known upper bound.
            last_page = 180

        pages = self._construct_pages(last_page + 1)
        details = []
        for page_url in pages:
            resp = rget(page_url)
            if not resp: continue
            html = etree.HTML(resp.content)

            divs = html.xpath('//div[@id="columns"]/div')
            for dd in divs:
                try:
                    href = ''.join(dd.xpath('./div[@class="pic"]/a/@href'))
                    item = self._extract(href, page_url)
                    if not item: continue
                    details.append(item)
                except IndexError:
                    # Failures like this are most likely network-related; the failed
                    # href should be pushed onto a queue to be re-crawled later.
                    continue
            NewsPipeline().save(details)
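
Every example carries the same comment about writing failed hrefs to a queue for later re-crawling, but none of them implements it. A minimal sketch of what such a queue could look like, using only the standard library; the class name and file path are made up for illustration:

import json
from collections import deque

class RetryQueue:
    """Hypothetical queue for hrefs that failed to parse, persisted to disk
    so a later run can pick them up again."""

    def __init__(self, path='failed_hrefs.json'):
        self.path = path
        self.queue = deque()

    def push(self, href, reason=''):
        self.queue.append({'href': href, 'reason': reason})

    def pop(self):
        return self.queue.popleft() if self.queue else None

    def dump(self):
        # Write pending entries out so they survive a restart.
        with open(self.path, 'w') as fh:
            json.dump(list(self.queue), fh, ensure_ascii=False)

With a shared RetryQueue instance, a call like `retry_queue.push(href, 'IndexError')` would then replace the bare `continue` in the except branches above.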
Code Example #3
    def parse(self):
        resp = rget(self.site_url)
        html = etree.HTML(resp.content)
        try:
            total_url = ''.join(
                html.xpath('//div[@id="pager"]/a[@class="last"]/@href'))
            pages = parse.parse_qs(parse.urlsplit(total_url).query)['page'][0]
        except (KeyError, IndexError):
            # 'page' parameter missing from the pager link; fall back to a default.
            pages = 12
        urls = self._construct_page_url(int(pages) + 1)

        details = []
        for page_url in urls:
            resp = rget(page_url)
            if not resp: continue
            html = etree.HTML(resp.content)

            hrefs = html.xpath('//div[@class="art_cat_box"]/table//a/@href')
            for href in hrefs:
                try:
                    item = self._extract(href, page_url)
                    if not item: continue
                    details.append(item)
                except IndexError:
                    # Failures like this are most likely network-related; the failed
                    # href should be pushed onto a queue to be re-crawled later.
                    continue
            NewsPipeline().save(details)
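
The `parse.parse_qs(parse.urlsplit(total_url).query)['page'][0]` expression above is plain `urllib.parse` usage: split the URL, parse its query string into a dict of lists, and take the first value of the `page` parameter. A small standalone illustration (the URL is made up):

from urllib import parse

total_url = 'http://example.com/articles?cat=3&page=12'
query = parse.parse_qs(parse.urlsplit(total_url).query)
print(query)             # {'cat': ['3'], 'page': ['12']}
print(query['page'][0])  # '12'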
Code Example #4
    def parse(self):
        resp = rget('http://www.milanstand.com/article-zixun-1/')
        html = etree.HTML(resp.content)

        try:
            last_page = html.xpath(
                '//p[@class="nx"]/following-sibling::p/a/@href')[0]
            last_page = int(last_page.split('-')[-1][:-1])
        except IndexError:
            last_page = 55

        pages = self._construct_pages(last_page + 1)
        details = []
        for page_url in pages:
            resp = rget(page_url)
            if not resp: continue
            html = etree.HTML(resp.content)

            divs = html.xpath(
                '//div[@class="box_3"]/table/tr//div[contains(text(), "包")]')
            for dd in divs:
                try:
                    href = ''.join(dd.xpath('./a/@href'))
                    href = urljoin(self.site_url, href)
                    item = self._extract(href, page_url)
                    if not item: continue
                    details.append(item)
                except IndexError:
                    # Failures like this are most likely network-related; the failed
                    # href should be pushed onto a queue to be re-crawled later.
                    continue
            NewsPipeline().save(details)
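
`_construct_pages` is not shown; each spider presumably defines its own version. For the milanstand spider above, the entry URL and the way the last page number is parsed both point at an `article-zixun-<n>/` pattern, so a plausible reconstruction of the method looks like this (still an assumption, not the project's actual code):

    def _construct_pages(self, last_page):
        # Hypothetical: one list-page URL per page number, following the
        # 'article-zixun-{n}/' pattern seen in the entry URL above.
        return [
            urljoin(self.site_url, 'article-zixun-{}/'.format(page))
            for page in range(1, last_page)
        ]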
Code Example #5
    def parse(self):
        pages = [urljoin(self.site_url, 'list_{}.html'.format(page)) for page in range(1, 494)]
        details = []
        for page_url in pages:
            resp = rget(page_url)
            if not resp: continue
            html = etree.HTML(resp.content)

            hrefs = html.xpath('//div[@class="newlist"]//h6//a[2]/@href')
            for href in hrefs:
                try:
                    item = self._extract(href, page_url)
                    if not item: continue
                    details.append(item)
                except IndexError:
                    # Failures like this are most likely network-related; the failed
                    # href should be pushed onto a queue to be re-crawled later.
                    continue
            NewsPipeline().save(details)
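
Example #5 builds its page list directly with `urljoin`, and several other examples use it to absolutize article hrefs. The resolution rules are worth keeping in mind (the base URL below is made up):

from urllib.parse import urljoin

site_url = 'http://example.com/news/'
print(urljoin(site_url, 'list_2.html'))         # http://example.com/news/list_2.html
print(urljoin(site_url, '/arc/123.html'))       # http://example.com/arc/123.html
print(urljoin(site_url, 'http://other.com/x'))  # absolute URLs pass through unchanged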
Code Example #6
File: sohu.py  Project: boyunli/article-generator
    def parse(self):
        url = 'http://m.sohu.com/ch/23/'
        resp = rget(url)
        html = etree.HTML(resp.content)
        hrefs = html.xpath('//div[@class="swiper-wrapper"]/div/a/@href') + \
            html.xpath('//ul[@class="feed-list-area"]//li/a/@href')
        if not hrefs:
            # No links found (the page may not have loaded fully); retry.
            return self.parse()
        details = []
        for href in hrefs:
            if href.startswith('http'): continue
            time.sleep(1)
            try:
                href = urljoin(self.url, href)
                logger.debug("\033[92m Start crawling: {} \033[0m".format(href))
                item = self._extract(href, url)
                if not item: continue
                details.append(item)
            except IndexError:
                # Failures like this are most likely network-related; the failed
                # href should be pushed onto a queue to be re-crawled later.
                continue
        NewsPipeline().save(details)
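
`NewsPipeline().save(details)` closes every example, but the pipeline itself is not shown. A minimal sketch, assuming it deduplicates on the article URL and persists each item; the sqlite backend and field names are assumptions, not the project's actual implementation:

import sqlite3

class NewsPipeline:
    """Hypothetical pipeline: store extracted items, skipping duplicates by URL."""

    def __init__(self, db_path='news.db'):
        self.conn = sqlite3.connect(db_path)
        self.conn.execute(
            'CREATE TABLE IF NOT EXISTS news '
            '(url TEXT PRIMARY KEY, title TEXT, content TEXT)')

    def save(self, details):
        for item in details:
            self.conn.execute(
                'INSERT OR IGNORE INTO news (url, title, content) VALUES (?, ?, ?)',
                (item.get('url'), item.get('title'), item.get('content')))
        self.conn.commit()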
Code Example #7
File: whmb.py  Project: boyunli/article-generator
    def parse(self):
        pages_url = [self.site_url]
        pages_url += [
            urljoin(self.site_url, 'newsp{}.html'.format(page))
            for page in range(2, 20)
        ]
        for page_url in pages_url:
            details = []
            resp = rget(page_url)
            if not resp: continue
            html = etree.HTML(resp.content)
            hrefs = html.xpath('//div[@id="brand"]/table//tr/td/h3/a/@href')
            for href in hrefs:
                href = urljoin(self.site_url, href)
                try:
                    item = self._extract(href, page_url)
                    if not item: continue
                    details.append(item)
                except IndexError:
                    # Failures like this are most likely network-related; the failed
                    # href should be pushed onto a queue to be re-crawled later.
                    continue
            NewsPipeline().save(details)
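
`self._extract(href, page_url)` is the per-article parser each spider implements; none of them is shown here. From the call sites it fetches the article page, returns a dict of extracted fields (or something falsy when the page is unusable), and can raise IndexError when an expected element is missing. A generic, hypothetical skeleton; the XPath expressions and field names are placeholders:

    def _extract(self, href, referer):
        # Hypothetical per-article extractor: fetch href and pull out title/body.
        resp = rget(href)
        if not resp:
            return None
        html = etree.HTML(resp.content)
        # [0] raises IndexError when the element is missing, which the callers
        # above catch and treat as a candidate for re-crawling.
        title = html.xpath('//h1/text()')[0].strip()
        paragraphs = html.xpath('//div[@class="content"]//p/text()')
        if not paragraphs:
            return None
        return {'url': href, 'referer': referer,
                'title': title, 'content': '\n'.join(paragraphs)}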
Code Example #8
    def parse(self):
        url = 'https://www.toutiao.com/ch/news_fashion/'
        chrome_options = get_chrome_options()
        driver = webdriver.Chrome(options=chrome_options)
        driver.get(url)
        html = etree.HTML(driver.page_source)
        driver.quit()  # release the browser once the rendered HTML is captured
        hrefs = html.xpath('//a[@class="link title"]/@href')
        if not hrefs:
            # No links found (the feed may not have rendered yet); retry.
            return self.parse()
        logger.debug("\033[92m Start crawling: {} \033[0m".format(url))
        details = []
        for href in hrefs:
            if href.startswith('http'): continue
            time.sleep(1)
            try:
                href = urljoin(self.url, href)
                item = self._extract(href, url)
                if not item: continue
                details.append(item)
            except IndexError:
                # Failures like this are most likely network-related; the failed
                # href should be pushed onto a queue to be re-crawled later.
                continue
        NewsPipeline().save(details)
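
`get_chrome_options()` is not shown either; for a crawler driving Selenium it is almost certainly a headless-Chrome configuration. A minimal sketch using standard Selenium options; the specific arguments are common choices, not confirmed from the project:

from selenium import webdriver

def get_chrome_options():
    # Hypothetical options factory for a headless crawling browser.
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--window-size=1280,1024')
    return options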
Code Example #9
File: wbzj.py  Project: boyunli/article-generator
    def parse(self):
        resp = rget(self.site_url)
        html = etree.HTML(resp.content)
        typeHrefs = html.xpath('//div[@class="sub_nav"]/div[@class="wrapper"]/ul/li//a/@href')

        for url in typeHrefs:
            pages = [urljoin(url, 'p{}.html'.format(page)) for page in range(1, 5)]
            pages[0] = url
            details = []
            for page_url in pages:
                resp = rget(page_url)
                if not resp: continue
                html = etree.HTML(resp.content)

                hrefs = set(html.xpath('//dl[position()<last()]//a/@href|//a/@href'))
                for href in hrefs:
                    try:
                        item = self._extract(href, page_url)
                        if not item: continue
                        details.append(item)
                    except IndexError:
                        # Failures like this are most likely network-related; the failed
                        # href should be pushed onto a queue to be re-crawled later.
                        continue
                NewsPipeline().save(details)