コード例 #1
0
 def second_requests(self, detail_urls):
     """Fetch every detail page and yield a link/title/datetime item.

     Pages come in three layouts.  The first two (title, datetime) XPath
     pairs are tried in order; if neither yields a title, the last pair is
     applied unconditionally — exactly matching the original if/else cascade.
     """
     # (title_xpath, datetime_xpath) candidates, most common layout first.
     layouts = [
         ('/html/body/div[2]/div[2]/div[1]/div[1]/h1/text()',
          '/html/body/div[2]/div[2]/div[1]/div[1]/div[1]/span[1]/text()'),
         ('/html/body/div[2]/div[3]/div[1]/div[2]/h1/text()',
          '/html/body/div[2]/div[3]/div[1]/div[2]/div[1]/span[1]/text()'),
     ]
     fallback = ('/html/body/div[4]/div/div[2]/div[1]/h1/text()',
                 '//*[@id="time"]/text()')
     for detail_url in detail_urls:
         page = etree.HTML(Req(detail_url).get_select().content)
         item = {'link': detail_url}
         for title_xp, time_xp in layouts:
             item['title'] = xpath_out(page.xpath(title_xp))
             if item['title']:
                 item['datetime'] = xpath_out(page.xpath(time_xp))
                 break
         else:
             # Neither known layout produced a title: use the last-resort
             # pair, even if the title still comes back empty.
             item['title'] = xpath_out(page.xpath(fallback[0]))
             item['datetime'] = xpath_out(page.xpath(fallback[1]))
         yield item
コード例 #2
0
    def first_requests(self):
        """Scrape the front page and yield de-duplicated title/link items.

        The page has three tout sections (left column, large headline,
        column feed) whose extraction logic was byte-identical, so the three
        loops are collapsed into one.  Fix: the original named its seen-list
        ``set``, shadowing the builtin; it is now ``seen``.
        """
        response = Req(self.url, proxy=True).get_select()
        selector = etree.HTML(response.text)
        seen = []  # links already yielded, in first-seen order
        touts = (
            selector.xpath('//div[@class="column text-align-left visible-desktop visible-mobile last-column"]/div[@class="column-tout   "]')
            + selector.xpath('//div[@class="column large-headline"]/div[@class="column-tout   "]')
            + selector.xpath('//div[@class="column column-feed"]/div[@class="column-tout   "]')
        )
        for part in touts:
            item = dict()
            # NOTE(review): .strip() raises if the anchor has no text node —
            # assumes every tout anchor carries text; confirm against the page.
            item['title'] = xpath_out(part.xpath('div[1]/a/text()')).strip()
            item['link'] = "http://www.fortune.com/" + xpath_out(part.xpath('div[1]/a/@href'))
            if item['link'] not in seen:
                seen.append(item['link'])
                yield item
コード例 #3
0
    def first_requests(self):
        """Scrape the front page and yield title/link items from two layouts.

        Each layout pairs a container XPath with the anchor path inside it.
        Fix: ``xpath_out`` can return ``None`` when the anchor has no text
        node, which made the original ``.strip()`` raise AttributeError and
        kill the whole generator — such entries are now skipped instead.
        """
        response = Req(self.url, proxy=True).get_select()
        selector = etree.HTML(response.text)
        # (container XPath, anchor XPath relative to the container)
        layouts = [
            ('//div[@class="column text-align-left visible-desktop visible-mobile last-column"]/div[@class="column-tout  "]',
             'div[@class="column-tout-info "]/div/div/a'),
            ('//div[@class="column text-align-left visible-desktop"]/div[@class="column-tout  "]',
             'div[@class="column-tout-info "]/div/div[1]/a'),
        ]
        for container_xp, anchor_xp in layouts:
            for part in selector.xpath(container_xp):
                title = xpath_out(part.xpath(anchor_xp + '/text()'))
                if title is None:
                    continue  # no headline text in this tout; skip it
                item = dict()
                item['title'] = title.strip()
                item['link'] = "http://time.com" + xpath_out(
                    part.xpath(anchor_xp + '/@href'))
                yield item
コード例 #4
0
    def first_requests(self):
        """Yield title/link items from both media-list variants on the page.

        Entries without a title text node are skipped.
        """
        selector = etree.HTML(Req(self.url, proxy=True).get_select().text)
        plain = selector.xpath(
            '//div[@class="module__content"]/ul[@class="media-list"]/li')
        fixed_height = selector.xpath(
            '//div[@class="module__content"]/ul[@class="media-list media-list--fixed-height"]/li'
        )
        # Both list variants share the same inner markup, so one loop covers
        # them; plain entries are processed first, as before.
        for entry in plain + fixed_height:
            title = xpath_out(
                entry.xpath('div/div[@class="media__content"]/h3/a/text()'))
            if title is not None:
                item = dict()
                item['title'] = title.strip()
                item['link'] = xpath_out(
                    entry.xpath('div/div[@class="media__content"]/h3/a/@href'))
                yield item
コード例 #5
0
    def third_requests(self, url):
        """Fetch an article body through the async content API and yield one
        link/title/datetime item.

        Fixes over the original:
        * ``eval()`` on the (remote, attacker-influencable) response body is
          replaced by an equivalent ``unicode_escape`` decode — the same
          result for the escape sequences the API emits, with no code
          execution and no breakage on embedded quotes.
        * A non-matching regex no longer raises AttributeError on
          ``.group``; the failure is logged and nothing is yielded.
        """
        content_api = 'https://openapi.inews.qq.com/getQQNewsNormalContent'   # Tencent async news-content endpoint
        pattern_id = re.compile(r'.*?/([\s\w]*)$', re.S)
        headers = {
            'referer': url,
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3610.2 Safari/537.36'
        }
        params = {
            'id': re.match(pattern_id, url).group(1),
            'chlid': 'news_rss',
            'refer': 'mobilewwwqqcom',
            'otype': 'jsonp',
            'ext_data': 'all',
            'srcfrom': 'newsapp',
            'callback': 'getNewsContentOnlyOutput'
        }
        response = Req(url=content_api, headers=headers, params=params).get_select()
        # Decode \uXXXX (etc.) escape sequences.  Equivalent to the old
        # eval("'" + body + "'") on ASCII content, but safe.
        data = response.content.decode('unicode_escape')

        pattern_item = re.compile(r'.*?"title":"(.*?)",.*?"pubtime":"(.*?)",.*?$', re.S)
        matched = re.match(pattern_item, data)
        if matched is None:
            # No title/pubtime fields in the payload; log and bail out.
            Logger().setLogger(tc.log_path, 2, "Get B2 class detail page info failed, title is None")
            return
        item = dict()
        item['link'] = url
        item['title'], item['datetime'] = matched.group(1, 2)
        yield item
コード例 #6
0
    def first_requests(self):
        """Yield link/title/hot items from the two ranking tables.

        ``hot`` is converted from a raw count to units of 10k, rounded to two
        decimal places.  The first <tr> of each table is a header row and is
        removed before iterating.
        """
        response = Req(self.url).get_select()
        selector = etree.HTML(response.text)
        table_selectors = (
            'body > div.area.areabg1 > div:nth-child(2) > div > div.tabContents.active > table > tr',
            'body > div.area.areabg1 > div:nth-child(6) > div > div:nth-child(3) > table > tr',
        )
        for css in table_selectors:
            rows = selector.cssselect(css)
            rows.pop(0)  # drop the header row
            for row in rows:
                item = dict()
                item['link'] = row.xpath('td[1]/a/@href')[0]
                item['title'] = row.xpath('td[1]/a/text()')[0]
                raw_hot = row.xpath('td[2]/text()')[0]
                item['hot'] = round(int(raw_hot) / 10000, 2)
                yield item
コード例 #7
0
    def first_requests(self):
        """Yield title/link items from the focus-news list, de-duplicated by URL.

        Fix: the original appended lxml Element objects to ``url_set`` and
        tested membership on them — every <a> node is a distinct object, so
        the intended de-dup never rejected anything.  Duplicates are now
        detected on the extracted href.  Items whose title is missing or
        fails ``self.re_title`` are dropped, as before.
        """
        response = Req(self.url).get_select()
        response.encoding = 'utf-8'   # Important!!
        selector = etree.HTML(response.text)
        hrefs = selector.xpath('//*[@id="focusListNews"]//a')
        seen_links = []
        for href in hrefs:
            item = dict()
            item['title'] = xpath_out(href.xpath('text()'))
            item['link'] = xpath_out(href.xpath('@href'))

            if item['link'] in seen_links:
                continue  # duplicate URL
            seen_links.append(item['link'])

            if item['title'] is not None and re.match(self.re_title, item['title']):
                yield item
コード例 #8
0
    def first_requests(self):
        """Yield a title/link item per anchor in the #tab-news-01 list.

        Anchors without a text node (title is None) are skipped.
        """
        page = etree.HTML(Req(self.url).get_select().text)
        for anchor in page.xpath('//*[@id="tab-news-01"]/ul/li/a'):
            item = {
                'title': xpath_out(anchor.xpath('text()')),
                'link': xpath_out(anchor.xpath('@href')),
            }
            if item['title'] is not None:
                yield item
コード例 #9
0
 def first_requests(self):
     """Yield hot-topic items (title, link, hot in 10k units) from the JSON feed.

     Fix: the original called ``.replace("&", "&")`` — a no-op that was
     evidently meant to unescape HTML-encoded ampersands in the topic URL;
     it now replaces ``&amp;`` with ``&``.
     """
     response = Req(self.url).get_select()
     topics = json.loads(response.content.decode('utf8'))['data']['bang_topic']['topic_list']
     for topic in topics:
         item = dict()
         item['title'] = topic['topic_name']
         # Un-escape the HTML entity in the URL (was a no-op replace).
         item['link'] = topic['topic_url'].replace("&amp;", "&")
         item['hot'] = round(int(topic['discuss_num']) / 10000, 2)
         yield item
コード例 #10
0
    def second_requests(self):    # second pass: request the detail pages
        """Yield detail-page items for the A-class and B-class URL lists.

        Fixes over the original:
        * bare ``except:`` narrowed to ``except Exception`` so that
          KeyboardInterrupt/SystemExit are no longer swallowed;
        * ``self.third_requests(url)`` is a generator — the bare call
          silently discarded every fully-JS-rendered B-class article; it is
          now delegated with ``yield from``.
        """
        for url in self.A_urls:   # A-class URLs can be requested directly
            try:
                selector = etree.HTML(Req(url).get_select().content)
                item = dict()
                item['link'] = url
                item['title'] = selector.xpath('//*[@id="Main-Article-QQ"]/div/div[1]/div[1]/div[1]/h1/text()')
                item['datetime'] = selector.xpath('//*[@id="Main-Article-QQ"]/div/div[1]/div[1]/div[1]/div/div[1]/span[3]/text()')
                if item['title'] != [] and item['datetime'] != []:
                    item['title'] = item['title'][0]
                    item['datetime'] = item['datetime'][0]
                elif item['title'] != [] and item['datetime'] == []:
                    # Datetime sometimes sits in span[2] instead of span[3].
                    item['title'] = item['title'][0]
                    item['datetime'] = selector.xpath('//*[@id="Main-Article-QQ"]/div/div[1]/div[1]/div[1]/div/div[1]/span[2]/text()')[0]
                else:
                    # Alternate page layout: both fields under div[2].
                    item['title'] = selector.xpath('//*[@id="Main-Article-QQ"]/div[2]/div[1]/div[2]/div[1]/h1/text()')[0]
                    item['datetime'] = selector.xpath('//*[@id="Main-Article-QQ"]/div[2]/div[1]/div[2]/div[1]/div/div[1]/span[3]/text()')[0]
                yield item
            except Exception:
                Logger().setLogger(tc.log_path, 2, "Failed to get A class detail page info,url is" + url)

        for url in self.B_urls:    # B-class
            try:
                response = Req(url).get_select()
                selector = etree.HTML(response.text)
                data = selector.xpath('/html/head/script[5]/text()')
                if data:        # B-class pages only partially JS-rendered
                    item = dict()
                    data = json.loads(data[0].strip()[14:])
                    item['link'] = url
                    item['title'] = data['title']
                    item['datetime'] = data['pubtime']
                    yield item
                else:     # B-class pages rendered entirely by JS
                    # third_requests is a generator: delegate so its items
                    # are actually produced (the bare call discarded them).
                    yield from self.third_requests(url)
            except Exception:
                Logger().setLogger(tc.log_path, 2, "Get B class detail page info failed,url is" + url)

        print("Second Time Requests Finished")
コード例 #11
0
    def second_requests(self, urls):
        """Yield title/link items for the 'A' and 'B' detail-page URL lists.

        Fix: the A-branch called ``.text`` on the ``css_out`` result without
        the None check the B-branch already had, so any A page missing the
        headline node crashed the whole generator.  Both branches now guard
        identically and skip such pages.
        """
        for url in urls['A']:
            response = Req(url=url, proxy=True).get_select()
            if response is not None:
                selector = etree.HTML(response.content)
                headline = css_out(selector.cssselect('.vxp-media__body h1'))
                if headline is not None:
                    item = dict()
                    item['title'] = headline.text
                    item['link'] = response.url
                    yield item

        for url in urls['B']:
            response = Req(url=url, proxy=True).get_select()
            if response is not None:
                selector = etree.HTML(response.content)
                headline = css_out(selector.cssselect('.story-body__h1'))
                if headline is not None:
                    item = dict()
                    item['title'] = headline.text
                    item['link'] = response.url
                    yield item
コード例 #12
0
    def first_requests(self):
        """Yield a link/title item for every entry under #newloadmore.

        Yields nothing when the request returned no response.
        """
        response = Req(self.url).get_select()
        if response is None:
            return
        selector = etree.HTML(response.text)
        for entry in selector.xpath('//div[@id="newloadmore"]/div'):
            yield {
                'link': xpath_out(entry.xpath('a/@href')),
                'title': xpath_out(entry.xpath('a/div[1]/text()')),
            }
コード例 #13
0
    def first_requests(self):
        """Yield title/link/datetime items from the JSONP top-news feed.

        The [10:-2] slice strips the response's wrapper (presumably the JSONP
        callback — confirm against the endpoint) before json.loads.  The
        feed timestamp has its last 6 characters dropped (presumably a
        timezone suffix) and is reformatted to '%Y-%m-%d %H:%M:%S'.
        """
        self.params['top_time'] = str(datetime.date.today()).replace("-", "")
        response = Req(url=self.url, params=self.params).get_select()

        payload = json.loads(response.content[10:-2].decode('utf8'))
        for news in payload['data']:
            parsed = time.strptime(news['time'][:-6], '%a, %d %b %Y %H:%M:%S')
            item = dict()
            item['title'] = news['title']
            item['link'] = news['url']
            item['datetime'] = time.strftime('%Y-%m-%d %H:%M:%S', parsed)
            yield item
コード例 #14
0
    def first_requests(self):
        """Yield title/link/datetime items for every firehose-list article.

        The visible timestamp contains an '@' (e.g. '... @05:30PM') that is
        removed so strptime can parse it with 'on %A %B %d, %Y %I:%M%p'; the
        result is normalised to datetime's default string form.  The href is
        scheme-less in the markup, so 'https://' is prepended.
        """
        response = Req(self.url).get_select()
        selector = etree.HTML(response.text)
        for article in selector.xpath('//div[@id="firehoselist"]/article'):
            item = dict()
            item['title'] = xpath_out(article.xpath('header/h2/span[1]/a/text()'))
            item['link'] = xpath_out(article.xpath('header/h2/span[1]/a/@href'))
            raw_stamp = xpath_out(
                article.xpath('header/div[@class="details"]/span[2]/time/text()'))
            stamp = raw_stamp.replace("@", "")
            item['datetime'] = str(datetime.strptime(stamp, 'on %A %B %d, %Y %I:%M%p'))
            item['link'] = "https://" + item['link']
            yield item
コード例 #15
0
    def first_requests(self):
        """Yield title/link items from the economist-today teaser list."""
        selector = etree.HTML(Req(url=self.url, proxy=True).get_select().text)
        teasers = selector.xpath(
            '//section[@class="layout-economist-today"]/div//div[@class="teaser__text"]'
        )
        for teaser in teasers:
            yield {
                'title': xpath_out(teaser.xpath('h3/a/span/text()')),
                # Hrefs are site-relative; prefix the domain.
                'link': "https://www.economist.com" + xpath_out(
                    teaser.xpath('h3/a/@href')),
            }
コード例 #16
0
    def first_requests(self):
        """Collect and return the detail-page URLs from both news sections.

        Fixes over the original: a leftover debugging ``print`` of the whole
        URL list was removed, and the element-by-element copy loops were
        replaced with ``list.extend``.
        """
        response = Req(self.url).get_select()
        selector = etree.HTML(response.content)
        detail_urls = []
        detail_urls.extend(selector.xpath('/html/body/div[3]/div[2]/div/div[1]/div[2]/div[2]/div[1]/dl/dt/a/@href'))
        detail_urls.extend(selector.xpath('//div[@class="secNewsBlock"]/div[@class="secNewsList"]//p/a/@href'))
        return detail_urls
コード例 #17
0
    def first_requests(self):
        """Yield items from the explore gallery, then from the home sidebar.

        Gallery items carry link/title/image/intro; sidebar items carry
        link/title/hot (extracted with ``self.hot_pattern``).  Items with a
        falsy title are dropped in both passes.
        """
        explore = etree.HTML(Req(self.url_explore).get_select().text)
        for entry in explore.xpath('//*[@id="gallery_main_frame"]/div[@class="item"]'):
            item = dict()
            item['link'] = xpath_out(
                entry.xpath('div[@class="bd"]/div/div[@class="title"]/a/@href'))
            item['title'] = xpath_out(
                entry.xpath('div[@class="bd"]/div/div[@class="title"]/a/text()'))
            # The image URL is embedded in the inline style attribute; the
            # [21:-1] slice strips the surrounding CSS text (presumably
            # 'background-image:url(...)' — confirm against the page).
            item['image'] = str(xpath_out(
                entry.xpath('div[@class="bd"]/div[@class="pic"]/a/@style')))[21:-1]
            item['intro'] = xpath_out(entry.xpath('div[@class="bd"]/div/p/a/text()'))
            if item['title']:
                yield item

        sidebar = etree.HTML(Req(self.url_home).get_select().text)
        for entry in sidebar.xpath('//*[@id="anony-sns"]/div/div[2]/div[2]/ul/div/ul/li'):
            item = dict()
            item['link'] = xpath_out(entry.xpath('a/@href'))
            item['title'] = xpath_out(entry.xpath('a/text()'))
            hot_match = re.match(self.hot_pattern,
                                 str(xpath_out(entry.xpath('span/text()'))))
            # Keep group(1) when the pattern matches, None otherwise.
            item['hot'] = hot_match.group(1) if hot_match is not None else None
            if item['title']:
                yield item
コード例 #18
0
    def first_requests(self):
        """Return the detail-page hrefs gathered from both index sections."""
        response = Req(self.url).get_select()
        selector = etree.HTML(response.content)
        section_paths = (
            '/html/body/div[2]/div[5]/div/div[1]/div[2]/div[2]/dl//a/@href',
            '/html/body/div[2]/div[5]/div/div[1]/div[3]/div[3]/div//a/@href',
        )
        detail_urls = []
        for path in section_paths:
            detail_urls.extend(selector.xpath(path))
        return detail_urls
コード例 #19
0
    def second_requests(self, detail_urls):
        """Fetch each detail URL and yield a link/title/datetime item.

        Items whose title could not be extracted are dropped.
        """
        for detail_url in detail_urls:
            page = etree.HTML(Req(detail_url).get_select().text)
            item = dict()
            item['link'] = detail_url
            item['title'] = xpath_out(page.xpath(
                '//*[@id="article-container"]/div[2]/div[1]/div[1]/div[1]/h1/text()'
            ))
            item['datetime'] = xpath_out(page.xpath('//*[@id="news-time"]/text()'))
            if item['title'] is not None:
                yield item
コード例 #20
0
 def first_requests(self):
     """Yield link/title items for every archive-listing entry.

     Fix: the original piped the href through
     ``.encode('utf8').decode('utf8')`` — a guaranteed no-op on a str — which
     has been removed.  Hrefs are scheme-less in the markup, so the domain is
     prepended.
     """
     response = Req(self.url).get_select()
     selector = etree.HTML(response.text)
     part = selector.xpath(
         '//div[@class="archive-listing-component"]/div[1]/ul/li')
     for article in part:
         item = dict()
         item['link'] = "https://www.wired.com" + xpath_out(
             article.xpath('div[@class="archive-item-component__info"]/a/@href'))
         item['title'] = xpath_out(
             article.xpath('div[@class="archive-item-component__info"]/a/h2/text()'))
         yield item
コード例 #21
0
    def first_requests(self):
        """Yield title/link items from the #hpart2L block, de-duplicated by URL.

        Fix: the original appended lxml Element objects to ``url_set`` — every
        <a> node is a distinct object, so the intended de-dup never rejected
        anything.  Duplicates are now detected on the extracted href.  Items
        whose title is missing or fails ``self.re_title`` are dropped, as
        before.
        """
        response = Req(self.url).get_select()
        selector = etree.HTML(response.content)
        hrefs = selector.xpath('//*[@id="hpart2L"]//a')
        seen_links = []
        for href in hrefs:
            item = dict()
            item['title'] = xpath_out(href.xpath('text()'))
            item['link'] = xpath_out(href.xpath('@href'))

            if item['link'] in seen_links:
                continue  # duplicate URL
            seen_links.append(item['link'])

            if item['title'] is not None and re.match(self.re_title, item['title']):
                yield item
コード例 #22
0
    def first_requests(self):
        """Yield title/link items from both focus-news sub-lists."""
        selector = etree.HTML(Req(self.url).get_select().text)
        headline_rows = selector.xpath(
            '//div[@class="focus-news-box"]/div[@class="news"]/p')
        list_rows = selector.xpath(
            '//div[@class="focus-news-box"]/div[3]/div/ul/li')
        # Both sub-lists use the same <a title=... href=...> markup, so one
        # loop covers them; headline rows come first, as before.
        for row in headline_rows + list_rows:
            yield {
                'title': xpath_out(row.xpath('a/@title')),
                'link': xpath_out(row.xpath('a/@href')),
            }
コード例 #23
0
    def first_requests(self):
        """Yield link/title items from the two <dl> news sections.

        Yields nothing when the request returned no response.
        """
        response = Req(self.url).get_select()
        if response is None:
            return
        selector = etree.HTML(response.text)
        section_paths = (
            '/html/body/div[5]/div[3]/div[2]/dl',
            '/html/body/div[9]/div[2]/div/dl',
        )
        # Both sections share the dd/h3/a markup.
        for path in section_paths:
            for entry in selector.xpath(path):
                yield {
                    'link': xpath_out(entry.xpath('dd/h3/a/@href')),
                    'title': xpath_out(entry.xpath('dd/h3/a/text()')),
                }
コード例 #24
0
    def first_requests(self):
        """Yield title/link/datetime items from the teaser-list articles.

        Fix: removed a no-op ``.encode('utf8').decode('utf8')`` round-trip on
        the datetime string.  The ISO-8601 'Z' timestamp is parsed with
        ``datetime.strptime`` and normalised to datetime's default string
        form; hrefs are site-relative, so the domain is prepended.
        """
        response = Req(url=self.url, proxy=True).get_select()
        selector = etree.HTML(response.text)
        articles = selector.xpath('//div[@class="teaser-list"]/article')

        for article in articles:
            item = dict()
            item['title'] = xpath_out(
                article.xpath('a/div[2]/h3/span[2]/text()'))
            item['link'] = "https://www.economist.com" + xpath_out(
                article.xpath('a/@href'))
            stamp = xpath_out(article.xpath(
                'a/div[2]/div[@class="teaser__datetime"]/time/@datetime'))
            item['datetime'] = str(datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%SZ'))
            yield item
コード例 #25
0
    def first_requests(self):
        """Return the detail-page hrefs from both index columns.

        Second-column links matching ``self.pattern_img`` are filtered out.
        """
        selector = etree.HTML(Req(self.url).get_select().content)
        first_col = selector.xpath(
            '/html/body/div[4]/div[2]/div[1]/div/div[2]/div/div/div[1]//a/@href'
        )
        second_col = selector.xpath(
            '/html/body/div[4]/div[2]/div[1]/div/div[2]/div/div/div[2]/div//a/@href'
        )
        detail_urls = list(first_col)
        detail_urls += [u for u in second_col
                        if not re.match(self.pattern_img, u)]
        return detail_urls
コード例 #26
0
    def first_requests(self):
        """Yield title/link items from the article and list-entry blocks.

        Fix: removed two leftover debugging ``print`` calls that dumped the
        raw lxml element lists to stdout on every crawl.
        """
        response = Req(self.url, proxy=True).get_select()
        selector = etree.HTML(response.text)
        partA = selector.xpath('//div[@class="css-1h4m9oq"]/article')
        partB = selector.xpath('//div[@class="css-1h4m9oq"]/ul/li')

        # Articles and plain list entries use different inner markup.
        for part in partA:
            item = dict()
            item['title'] = xpath_out(part.xpath('a[2]/div/h2/text()'))
            item['link'] = xpath_out(part.xpath('a[2]/@href'))
            yield item

        for part in partB:
            item = dict()
            item['title'] = xpath_out(part.xpath('a/div/h1/text()'))
            item['link'] = xpath_out(part.xpath('a/@href'))
            yield item
コード例 #27
0
    def first_requests(self):
        """Yield title/link items from the site-content articles and list rows."""
        selector = etree.HTML(Req(self.url, proxy=True).get_select().text)
        articles = selector.xpath(
            '//main[@id="site-content"]/div/div[2]/div[2]/div[1]/div/article')
        list_rows = selector.xpath(
            '//main[@id="site-content"]/div/div[2]/div[2]/div[1]/div/ul/li')

        # Articles and plain list rows use different inner markup.
        for article in articles:
            yield {
                'title': xpath_out(article.xpath('a[2]/div/h2/text()')),
                'link': xpath_out(article.xpath('a[2]/@href')),
            }

        for row in list_rows:
            yield {
                'title': xpath_out(row.xpath('a/div/h1/text()')),
                'link': xpath_out(row.xpath('a/@href')),
            }
コード例 #28
0
 def first_requests(self):   # first pass: hit the home page to collect detail-page URLs
     """Classify every #tab-news-01 link into self.A_urls / self.B_urls.

     Fixes over the original: the bare ``except:`` (which even swallowed
     KeyboardInterrupt/SystemExit) is narrowed to ``except Exception``, and
     the triple-nested ul -> li -> a/@href loops are collapsed into a single
     XPath query that yields the same hrefs in document order.
     """
     selector = etree.HTML(Req(self.url).get_select().content)
     try:
         for href in selector.xpath('//*[@id="tab-news-01"]/ul/li/a/@href'):
             # Classify each URL as A-class or B-class; anything else is dropped.
             if re.match(self.pattern_a, href):
                 self.A_urls.append(href)
             elif re.match(self.pattern_b, href):
                 self.B_urls.append(href)
         print("First Time Requests Succeed")
     except Exception:
         Logger().setLogger(tc.log_path, 4, "Failed to get detail_page_urls")
コード例 #29
0
    def first_requests(self):
        """Yield title/link items from the hero and marquee partials of every
        configured URL."""
        for url in self.urls:
            page = etree.HTML(Req(url, proxy=True).get_select().text)
            hero = page.xpath('//div[@class="partial hero"]/article')
            marquee = page.xpath('//div[@class="partial marquee"]/article')
            # Both partials share the div/h3/a markup, so one loop covers
            # them; hero articles are processed first, as before.
            for article in hero + marquee:
                item = dict()
                item['title'] = xpath_out(article.xpath('div/h3/a/text()')).strip()
                item['link'] = "https://time.com" + xpath_out(article.xpath('div/h3/a/@href'))
                yield item
コード例 #30
0
    def first_requests(self):
        """Yield hot-list items (title, link, hot score) from the HotList page.

        Items with hot <= 150 additionally get ``home = False``.  Entries
        without a title are logged and skipped.

        Fix: the failure-log line concatenated a str with the item dict,
        which itself raised TypeError and killed the generator — the dict is
        now formatted via ``str()``.
        """
        response = Req(url=self.url, cookies=self.cookies).get_select()
        selector = etree.HTML(response.text)
        sections = selector.xpath('//*[@class="HotList-list"]//section')

        for section in sections:
            item = dict()
            item['title'] = xpath_out(section.xpath('div[2]/a/h2/text()'))
            item['link'] = xpath_out(section.xpath('div[2]/a/@href'))
            # The hot text ends in a 3-character unit suffix that is sliced
            # off before the float conversion (presumably "万热度" — confirm).
            item['hot'] = float(xpath_out(section.xpath('div[2]/div/text()'))[:-3])

            if item['title'] is not None:
                if item['hot'] <= 150:
                    item['home'] = False
                yield item
            else:
                Logger().setLogger(zh.log_path, 2, "Item's title is None, item is " + str(item))