Ejemplo n.º 1
0
    def parse_detail(self, response):
        """Build a ``CrawlerAffairItem`` from an article detail page.

        Publish time, title, body paragraphs and keyword links are read
        with XPath and passed through the ``process_*`` helpers before
        being stored on the item.

        :param response: response of the detail page being parsed.
        :return: the populated ``CrawlerAffairItem``.
        """
        selector = Selector(response)
        item = CrawlerAffairItem()
        crawled_at = str(int(time.time()))

        # Only the first matching text node is used; None when nothing matched.
        time_nodes = selector.xpath(
            '//*[@id="pubtime_baidu"]/text()').extract()
        publish_time = time_nodes[0] if time_nodes else None

        headline = selector.xpath(
            '//h1[@class="articleTitle"]/text()').extract()
        paragraphs = selector.xpath(
            '//*[@id="articleBody"]/p/text()').extract()
        keywords = selector.xpath(
            '//*[@id="articleKeywords"]/a/text()').extract()

        item["spider_time"] = crawled_at
        item["publish_time"] = process_time(publish_time)
        item["title"] = process_title(headline)
        item["label"] = process_label(keywords)
        item["content"] = process_content(paragraphs)
        item["url"] = selector.response.url.strip()

        return item
Ejemplo n.º 2
0
    def parse_detail(self, response):
        """Load the detail page in ``self.detail_browser`` and build an item.

        The page is opened in the browser, and after a short fixed pause
        the publish time (``content`` attribute of a ``<meta>`` tag), the
        title and the article paragraphs are read from the rendered DOM.
        No labels are collected here.

        :param response: response whose ``url`` is opened in the browser.
        :return: the populated ``CrawlerAffairItem``.
        """
        item = CrawlerAffairItem()
        crawled_at = str(int(time.time()))

        browser = self.detail_browser
        browser.get(response.url)
        # Fixed pause after navigation before the DOM is queried.
        time.sleep(0.5)

        publish_time = browser.find_element_by_xpath(
            '/html/head/meta[contains(@name, "apub:time")]'
        ).get_attribute("content")

        headline_nodes = browser.find_elements_by_xpath(
            '//div[@class="qq_conent clearfix"]/div[@class="LEFT"]/h1')
        title = [node.text for node in headline_nodes]

        paragraph_nodes = browser.find_elements_by_xpath(
            '//div[@class="content-article"]/p')
        contents = [node.text for node in paragraph_nodes]

        labels = []

        item["spider_time"] = crawled_at
        item["publish_time"] = process_time(publish_time)
        item["title"] = process_title(title)
        item["content"] = process_content(contents)
        item["label"] = process_label(labels)
        item["url"] = response.url.strip()

        return item
Ejemplo n.º 3
0
    def parse_detail(self, response):
        """Build a ``CrawlerAffairItem`` from an article detail page.

        Publish time, title and body paragraphs are read with XPath and
        passed through the ``process_*`` helpers. No labels are collected
        here, so ``process_label`` receives an empty list.

        :param response: response of the detail page being parsed.
        :return: the populated ``CrawlerAffairItem``.
        """
        selector = Selector(response)
        item = CrawlerAffairItem()
        crawled_at = str(int(time.time()))

        # Only the first matching text node is used; None when nothing matched.
        time_nodes = selector.xpath(
            '//div[@class="big_img"]/div[@class="more"]/text()').extract()
        publish_time = time_nodes[0] if time_nodes else None

        headline = selector.xpath(
            '//div[@class="big_img"]/h1/text()').extract()
        paragraphs = selector.xpath('//*[@id="content"]/p/text()').extract()
        keywords = []

        item["spider_time"] = crawled_at
        item["publish_time"] = process_time(publish_time)
        item["title"] = process_title(headline)
        item["label"] = process_label(keywords)
        item["content"] = process_content(paragraphs)
        item["url"] = selector.response.url.strip()

        return item
Ejemplo n.º 4
0
    def parse_detail(self, response):
        """Build a ``CrawlerAffairItem`` from an article detail page.

        Publish time, title, body paragraphs and tag links are read with
        XPath and passed through the ``process_*`` helpers before being
        stored on the item.

        :param response: response of the detail page being parsed.
        :return: the populated ``CrawlerAffairItem``.
        """
        selector = Selector(response)
        item = CrawlerAffairItem()
        crawled_at = str(int(time.time()))

        # Only the first matching text node is used; None when nothing matched.
        time_nodes = selector.xpath(
            '//div[@class="article-info"]/p/span[@class="article-timestamp ml10"]/text()'
        ).extract()
        publish_time = time_nodes[0] if time_nodes else None

        headline = selector.xpath(
            '//div[@class="article-title"]/h1/text()').extract()
        paragraphs = selector.xpath(
            '//div[@class="article-content"]/p/text()').extract()
        tags = selector.xpath(
            '//div[@class="fl ml10 article-tags"]/a/text()').extract()

        item["spider_time"] = crawled_at
        item["publish_time"] = process_time(publish_time)
        item["title"] = process_title(headline)
        item["label"] = process_label(tags)
        item["content"] = process_content(paragraphs)
        item["url"] = selector.response.url.strip()

        return item
Ejemplo n.º 5
0
    def parse_detail(self, response):
        """Load the detail page in ``self.sub_browser`` and build an item.

        The page is opened in the browser, and after a fixed pause the
        publish time, title and article body are read from the rendered
        DOM. No labels are collected here.

        Exceptions raised by the browser lookups propagate to the caller
        unchanged.

        :param response: response whose ``url`` is opened in the browser.
        :return: the populated ``CrawlerAffairItem``.
        """
        news_item = CrawlerAffairItem()
        spider_time = str(int(time.time()))
        self.sub_browser.get(response.url)
        # Fixed pause after navigation before the DOM is queried.
        time.sleep(3)

        # No try/except here: lookup errors are deliberately left to
        # propagate to the caller as they are.
        publish_time = self.sub_browser.find_element_by_xpath(
            '//div[@class="tit"]/h2/b').text
        title_elements = self.sub_browser.find_elements_by_xpath(
            '//div[@class="tit"]/h3')
        title = [t.text for t in title_elements]

        contents_element = self.sub_browser.find_elements_by_xpath(
            '//div[@class="viewcontent"]')
        contents = [content.text for content in contents_element]
        labels = []

        news_item["spider_time"] = spider_time
        news_item["publish_time"] = process_time(publish_time)
        news_item["title"] = process_title(title)
        news_item["content"] = process_content(contents)
        news_item["label"] = process_label(labels)
        news_item['url'] = response.url.strip()

        return news_item
Ejemplo n.º 6
0
    def parse_detail(self, response):
        """Build a ``CrawlerAffairItem`` from an article detail page.

        Publish time, title and body text are read with XPath. Labels are
        taken from the first text node of the ``news_keyword`` block: the
        part after the last ``>>`` is split on commas. When that block is
        absent the label list stays empty.

        :param response: response of the detail page being parsed.
        :return: the populated ``CrawlerAffairItem``.
        """
        sel = Selector(response)

        news_item = CrawlerAffairItem()
        spider_time = str(int(time.time()))

        # NOTE(review): the full list of text nodes is handed to process_time
        # here (no first-element selection) - confirm the helper accepts a list.
        publish_time = sel.xpath(
            '//div[@class="newscontent"]/div[@class="news_about"]/p/text()'
        ).extract()
        title = sel.xpath('//div[@class="newscontent"]/h1/text()').extract()
        contents = sel.xpath('//div[@class="news_txt"]/text()').extract()

        # Evaluate the keyword XPath once; extract_first() gives None when
        # nothing matched, in which case the item carries no labels.
        keyword_text = sel.xpath(
            '//div[@class="news_keyword"]/text()').extract_first()
        labels = []
        if keyword_text is not None:
            # Keep only what follows the last '>>' and split it on commas.
            labels = keyword_text.split('>>')[-1].split(',')

        news_item["spider_time"] = spider_time
        news_item["publish_time"] = process_time(publish_time)
        news_item["title"] = process_title(title)
        news_item["label"] = process_label(labels)
        news_item["content"] = process_content(contents)
        news_item['url'] = sel.response.url.strip()

        return news_item
Ejemplo n.º 7
0
    def parse_detail(self, response):
        """Load the detail page in ``self.sub_browser`` and build an item.

        The page is opened in the browser, and after a short fixed pause
        the publish time, title and article paragraphs are read from the
        rendered DOM. No labels are collected here.

        :param response: response whose ``url`` is opened in the browser.
        :return: the populated ``CrawlerAffairItem``.
        """
        news_item = CrawlerAffairItem()

        spider_time = str(int(time.time()))
        self.sub_browser.get(response.url)
        # Fixed pause after navigation before the DOM is queried.
        time.sleep(0.5)

        publish_time_elements = self.sub_browser.find_elements_by_xpath(
            '//div[@class="inner-content"]/div[@class="show_time"]/div/div[2]')
        # The loop variable must not be called ``time``: that name would
        # shadow the ``time`` module used above, and on Python 2 (where
        # list-comprehension variables leak into the function scope) it
        # would turn ``time`` into a local and break ``time.time()``.
        # NOTE(review): a list of texts is handed to process_time here -
        # confirm the helper accepts a list.
        publish_time = [element.text for element in publish_time_elements]

        title_elements = self.sub_browser.find_elements_by_xpath(
            '//div[@class="inner-content"]/div[@class="show_title"]')
        title = [t.text for t in title_elements]

        contents_elements = self.sub_browser.find_elements_by_xpath(
            '//div[@class="inner-content"]/div[@class="show_content"]/p')
        contents = [c.text for c in contents_elements]
        labels = []

        news_item["spider_time"] = spider_time
        news_item["publish_time"] = process_time(publish_time)
        news_item["title"] = process_title(title)
        news_item["label"] = process_label(labels)
        news_item["content"] = process_content(contents)
        # The URL is read straight from the response; no Selector is built
        # because every field comes from the browser DOM.
        news_item['url'] = response.url.strip()

        return news_item
Ejemplo n.º 8
0
    def parse_detail(self, response):
        """Load the detail page in ``self.sub_browser`` and build an item.

        After navigation the method waits (1 second timeout) for the
        ``content_detail`` element to become clickable. The publish time
        is then read from the text of that same element, the title from
        the ``title_bar`` block, and the body from several XPath
        locations whose texts are concatenated in a fixed order. No
        labels are collected here.

        :param response: response whose ``url`` is opened in the browser.
        :return: the populated ``CrawlerAffairItem``.
        """
        item = CrawlerAffairItem()
        crawled_at = str(int(time.time()))

        browser = self.sub_browser
        browser.get(response.url)
        detail_locator = (
            By.XPATH, '//div[@id="text_block"]/div[@id="content_detail"]')
        WebDriverWait(browser, 1).until(
            EC.element_to_be_clickable(detail_locator))

        publish_time = browser.find_element_by_xpath(
            '//div[@id="text_block"]/div[@id="content_detail"]').text
        title = [
            node.text
            for node in browser.find_elements_by_xpath(
                '//div[@id="text_block"]/div[@class="title_bar"]')
        ]

        # Body text is gathered from each location in this exact order.
        # NOTE(review): the bare content_detail container is queried too,
        # so paragraph text may be collected more than once - presumably
        # a fallback for pages without <p> children; confirm.
        body_xpaths = (
            '//*[@id="content_detail"]/p',
            '//*[@id="zoom"]/p',
            '//*[@id="content_detail"]/table/tbody/tr/td/p',
            '//*[@id="content_detail"]/font',
            '//*[@id="content_detail"]',
            '//*[@id="content_detail"]/table/tbody/tr/td[@class="detail"]/p',
        )
        contents = []
        for body_xpath in body_xpaths:
            nodes = browser.find_elements_by_xpath(body_xpath)
            contents.extend([node.text for node in nodes])

        labels = []

        item["spider_time"] = crawled_at
        item["publish_time"] = process_time(publish_time)
        item["title"] = process_title(title)
        item["label"] = process_label(labels)
        item["content"] = process_content(contents)
        item["url"] = response.url.strip()

        return item
Ejemplo n.º 9
0
    def parse_detail(self, response):
        """Load the detail page in ``self.detail_browser`` and build an item.

        After navigation the method waits (1 second timeout) for the date
        span in the top bar to become clickable. Publish time, title and
        body are each gathered from more than one XPath location; the
        publish-time texts are joined into a single string, the title and
        body texts are kept as lists. No labels are collected here.

        :param response: response whose ``url`` is opened in the browser.
        :return: the populated ``CrawlerAffairItem``.
        """
        item = CrawlerAffairItem()
        crawled_at = str(int(time.time()))

        browser = self.detail_browser

        def texts_at(*xpaths):
            # Texts of all elements matched by each XPath, in the given order.
            gathered = []
            for xpath in xpaths:
                nodes = browser.find_elements_by_xpath(xpath)
                gathered.extend([node.text for node in nodes])
            return gathered

        browser.get(response.url)
        date_locator = (
            By.XPATH,
            '//div[@id="top_bar"]/div/div[@class="date-source"]/span[@class="date"]'
        )
        WebDriverWait(browser, 1).until(
            EC.element_to_be_clickable(date_locator))

        publish_time = ''.join(texts_at(
            '//div[@id="top_bar"]/div/div[@class="date-source"]/span[@class="date"]',
            '//div[@class="page-info"]/span[@class="time-source"]',
        ))

        title = texts_at(
            '//h1[@class="main-title"]',
            '//div[@class="page-header"]/h1',
        )

        contents = texts_at(
            '//div[@id="artibody"]/p',
            '//div[@id="artibody"]/div[@class="detail_txt"]',
            '//div[@id="article"]/p',
        )
        labels = []

        item["spider_time"] = crawled_at
        item["publish_time"] = process_time(publish_time)
        item["title"] = process_title(title)
        item["content"] = process_content(contents)
        item["label"] = process_label(labels)
        item["url"] = response.url.strip()

        return item
Ejemplo n.º 10
0
    def parse_detail(self, response):
        """Build a ``CrawlerAffairItem`` from an article detail page.

        The publish time is the first matching text node (``None`` when
        nothing matches); title and body paragraphs are read as lists.
        No labels are collected here.

        :param response: response of the detail page being parsed.
        :return: the populated ``CrawlerAffairItem``.
        """
        selector = Selector(response)
        item = CrawlerAffairItem()
        crawled_at = str(int(time.time()))

        publish_time = selector.xpath(
            '//div[@class ="box01"]/div[@class="fl"]/text()'
        ).extract_first()
        headline = selector.xpath(
            '//div[@class="clearfix w1000_320 text_title"]/h1/text()'
        ).extract()
        paragraphs = selector.xpath('//*[@id="rwb_zw"]/p/text()').extract()
        keywords = []

        item["spider_time"] = crawled_at
        item["publish_time"] = process_time(publish_time)
        item["title"] = process_title(headline)
        item["label"] = process_label(keywords)
        item["content"] = process_content(paragraphs)
        item["url"] = selector.response.url.strip()

        return item
Ejemplo n.º 11
0
    def parse_detail(self, response):
        """Build a ``CrawlerAffairItem`` from an article detail page.

        Publish time, title and body paragraphs are read with XPath as
        lists of text nodes and passed through the ``process_*`` helpers.
        No labels are collected here.

        :param response: response of the detail page being parsed.
        :return: the populated ``CrawlerAffairItem``.
        """
        selector = Selector(response)
        item = CrawlerAffairItem()
        crawled_at = str(int(time.time()))

        # All matching text nodes are kept; process_time receives the list.
        time_nodes = selector.xpath(
            '//div[@class="function"]/span[@class="info"]/i/text()'
        ).extract()
        headline = selector.xpath('//div[@class="cnt_bd"]/h1/text()').extract()
        paragraphs = selector.xpath('//div[@class="cnt_bd"]/p/text()').extract()
        keywords = []

        item["spider_time"] = crawled_at
        item["publish_time"] = process_time(time_nodes)
        item["title"] = process_title(headline)
        item["label"] = process_label(keywords)
        item["content"] = process_content(paragraphs)
        item["url"] = selector.response.url.strip()

        return item
Ejemplo n.º 12
0
    def parse_detail(self, response):
        """Build a ``CrawlerAffairItem`` from an article detail page.

        The publish time is the first matching text node (``None`` when
        nothing matches); title, body paragraphs and keyword links are
        read as lists and passed through the ``process_*`` helpers.

        :param response: response of the detail page being parsed.
        :return: the populated ``CrawlerAffairItem``.
        """
        selector = Selector(response)
        item = CrawlerAffairItem()
        crawled_at = str(int(time.time()))

        publish_time = selector.xpath(
            '//div[@class="title_area"]/div[@class="info1"]/text()'
        ).extract_first()
        headline = selector.xpath(
            '//div[@class="title_area"]/h1/text()').extract()
        paragraphs = selector.xpath(
            '//div[@class="content_area"]/p/text()').extract()
        keywords = selector.xpath(
            '//ul[@id="searchkeywords"]/li/a/text()').extract()

        item["spider_time"] = crawled_at
        item["publish_time"] = process_time(publish_time)
        item["title"] = process_title(headline)
        item["label"] = process_label(keywords)
        item["content"] = process_content(paragraphs)
        item["url"] = selector.response.url.strip()

        return item
Ejemplo n.º 13
0
    def parse_detail(self, response):
        """Build a ``CrawlerAffairItem`` from an article detail page.

        The publish time is the first matching text node (``None`` when
        nothing matches). The title comes from the ``<h1>`` headline when
        present, otherwise from the ``pages-title`` block. Body text is
        gathered from three XPath locations, concatenated in order. No
        labels are collected here.

        :param response: response of the detail page being parsed.
        :return: the populated ``CrawlerAffairItem``.
        """
        selector = Selector(response)
        item = CrawlerAffairItem()
        crawled_at = str(int(time.time()))

        publish_time = selector.xpath(
            '//div[@class="pages-date"]/text()').extract_first()

        # Prefer the <h1> text; fall back to the "pages-title" text. The
        # result is an empty list when neither location matches.
        headline_h1 = selector.xpath(
            '//div[@class="article oneColumn pub_border"]/h1/text()'
        ).extract()
        headline_alt = selector.xpath(
            '//div[@class="pages-title"]/text()').extract()
        title = headline_h1 or headline_alt

        # Body text from each location, appended in this exact order.
        body_xpaths = (
            '//div[@class="pages_content"]/p/text()',
            '//*[@id="UCAP-CONTENT"]/p/span/span/text()',
            '//div[@class="pages_content"]/p/span/text()',
        )
        contents = []
        for body_xpath in body_xpaths:
            contents.extend(selector.xpath(body_xpath).extract())

        labels = []

        item["spider_time"] = crawled_at
        item["publish_time"] = process_time(publish_time)
        item["title"] = process_title(title)
        item["label"] = process_label(labels)
        item["content"] = process_content(contents)
        item["url"] = response.url.strip()

        return item