Example #1
    def parse(self, response, **kwargs):
        """
        :param response: crawler response of the article url
        :return: parsed doc pushed to elastic
        """
        hxs = Selector(response)
        item = NewsItem()

        item["link"] = response.request.url
        item["lang"] = "tr"
        item["source"] = "hurriyet"
        date_time = hxs.xpath(
            "/html/body/article/div[12]/div/section[1]/header/div[1]/div[2]/div[2]/span[2]/time"
        ).extract()
        author = hxs.xpath(
            "/html/body/article/div[12]/div/section[1]/header/section[1]/div[1]/div/div[2]/a[1]/h6"
        ).extract()
        title = hxs.xpath(
            "/html/body/article/div[12]/div/section[1]/header/div[2]/div/h1"
        ).extract()
        intro = hxs.xpath(
            "/html/body/article/div[12]/div/section[3]/div/h2").extract()
        new_content = hxs.xpath(
            "/html/body/article/div[12]/div/section[3]/div/div[4]").extract()
        new_content = ' '.join(new_content)

        # Processing outputs
        item["intro"] = ' '.join(intro)
        item["title"] = ' '.join(title)
        item["content"] = re.sub(r'\s{2,}', ' ', new_content)

        item["date_time"] = " ".join(date_time)
        item["author"] = " ".join(author)
        return item
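
Every snippet on this page fills in a NewsItem, but the class itself is never shown. A minimal sketch of what it plausibly looks like for Examples #1 and #4, with field names taken from the code above (the definition itself is an assumption):

import scrapy

# Hypothetical NewsItem definition; only the field names are taken
# from the examples on this page.
class NewsItem(scrapy.Item):
    link = scrapy.Field()
    lang = scrapy.Field()
    source = scrapy.Field()
    category = scrapy.Field()
    author = scrapy.Field()
    title = scrapy.Field()
    intro = scrapy.Field()
    content = scrapy.Field()
    date_time = scrapy.Field()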
Example #2
    def parse_item(self, response):
        r = response
        # inspect_response(response, self)

        title = r.xpath("//div[@class='qq_article']//h1/text()").extract()
        source = r.xpath("//div[@class='qq_article']//span[@class='a_source']/text()").extract()
        if title:
            title = title[0]
        if source:
            source = source[0]
        # require a well-formed article: both title and source must be present
        if not title or not source:
            redis_conn.hset(redis_invalid_url_key, response.url, 0)
            return
        content = ''.join(r.xpath('//div[@id="Cnt-Main-Article-QQ"]/p/text()').extract())
        raw_time = r.xpath("//div[@class='qq_article']//span[@class='a_time']/text()").extract_first('')
        re_result = re.findall(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}", raw_time)
        if re_result:
            ts = time.mktime(time.strptime(re_result[0], '%Y-%m-%d %H:%M'))
        else:
            ts = 0
        url = r.url
        new_news = NewsItem(
            title=title,
            content=content,
            source=source,
            published=ts,
            url=url
        )
        return new_news
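
Example #2 assumes a module-level redis_conn plus a redis_invalid_url_key under which malformed article URLs are recorded. A minimal sketch of that setup with redis-py (connection parameters and key name are assumptions):

import redis

# Hypothetical setup for the names used in Example #2.
redis_conn = redis.StrictRedis(host='localhost', port=6379, db=0)
redis_invalid_url_key = 'news:invalid_urls'  # hash mapping url -> 0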
Example #3
    def parse(self, response):
        for row in response.xpath(
                "//table[@class='table table-small'][1]/tbody/tr"):
            loader = ItemLoader(item=NewsItem(), selector=row)
            loader.add_xpath("news", 'td[2]/a/text()')

            yield loader.load_item()
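
Example #3 routes extraction through ItemLoader, so per-field cleanup can live in processors rather than in the callback. A sketch of how a loader subclass could normalize the news field (these processors are an assumption, not part of the original spider):

from itemloaders.processors import MapCompose, TakeFirst
from scrapy.loader import ItemLoader

class NewsLoader(ItemLoader):
    # strip whitespace from every extracted value and keep the first one
    default_input_processor = MapCompose(str.strip)
    default_output_processor = TakeFirst()

Substituting NewsLoader for ItemLoader in the snippet above would then yield one clean string per row instead of a list.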
Example #4
    def parse(self, response, **kwargs):
        """
        :param response: crawler response of the article url
        :return: parsed doc pushed to elastic
        """
        hxs = Selector(response)
        item = NewsItem()

        item["link"] = response.request.url
        item["lang"] = "tr"
        item["source"] = "kizlarsoruyor"
        item["category"] = " ".join(hxs.xpath("//a[@class='no-posting tgec']/text()").extract())
        date_time = hxs.xpath("//span[@class='posted-on']/text()").extract()

        author = hxs.xpath("//span[@class='name']/text()").extract()
        if author:
            item["author"] = " ".join(author)
        else:
            author = hxs.xpath("//a[@class='username profile-hover']/text()").extract()
            item["author"] = " ".join(author)
        title = hxs.xpath("//h1/text()").extract()

        new_content = hxs.xpath("//div[@class='detail-body']/text()").extract()
        if new_content:
            new_content = ' '.join(new_content)
        else:
            new_content = hxs.xpath("//div[@class='article-body post-body clearfix']/text()").extract()
            new_content = ' '.join(new_content)

        item["title"] = ' '.join(title)
        item["content"] = re.sub(r'\s{2,}', ' ', new_content)

        item["date_time"] = " ".join(date_time)

        return item
Example #5
def makeNewsItem(url, docId, title, createTime, content, source):
    item = NewsItem()
    item['url'] = url
    item['docId'] = docId
    item['title'] = title
    item['createTime'] = createTime
    item['content'] = content
    item['source'] = source
    return item
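
Example #5 is a plain constructor helper; a call site simply passes through fields parsed elsewhere. Everything in this sketch is illustrative:

# Hypothetical usage of makeNewsItem; real values would come from a
# parsed API payload or a response callback.
item = makeNewsItem(
    url='http://example.com/news/1',
    docId='DOC001',
    title='Example headline',
    createTime='2017-08-18 17:15:50',
    content='Example body text.',
    source='Example Wire',
)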
Example #6
    def parse_item(self, response):
        '''
        Parse the data from the website:
        create a new NewsItem and fill it with the values extracted by the crawler.
        '''
        # create new article from our defined item
        article = NewsItem()

        # parse the data from the website
        article['title'] = response.css("meta[property='og:title']::attr(content)").extract()[0]
        date = response.css("meta[property='article:published_time']::attr(content)").extract()[0]
        article['date'] = self.transform_date(date)
        found_article = response.css('div.bbtext p::text').extract()
        article['article'] = self.transform_article(found_article)
        article['keywords'] = response.css('meta[name=keywords]::attr(content)').extract()[0]
        article['server'] = 'idnes.cz'

        return article
Example #7
    def parse_item(self, response):
        '''
        Parse the data from the website:
        create a new NewsItem and fill it with the values extracted by the crawler.
        '''
        # create new article from our defined item
        article = NewsItem()

        # parse the data from the website
        article['title'] = response.css('h1::text').extract()[0]
        article['date'] = self.current_date
        found_article = response.css('#contentArticleBox p::text').extract()
        article['article'] = self.transform_article(found_article)
        article['keywords'] = response.css(
            'meta[name=keywords]::attr(content)').extract()[0]
        article['server'] = 'novinky.cz'

        return article
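
Examples #6, #7, #9 and #11 delegate to transform_date and transform_article helpers that this page never shows. A sketch of what they plausibly do, inferred from how their inputs and outputs are used (class name, date format and cleanup rules are all assumptions):

from datetime import datetime

class NewsSpiderHelpersMixin:
    # Hypothetical helpers; the real spiders define their own versions.

    def transform_date(self, date):
        # normalize an ISO-8601 timestamp such as
        # '2019-05-01T12:30:00+02:00' into a plain date string
        return datetime.fromisoformat(date).strftime('%Y-%m-%d')

    def transform_article(self, found_article):
        # join the extracted <p> fragments and collapse runs of whitespace
        return ' '.join(' '.join(found_article).split())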
Example #8
    def parse_item(self, response):

        if str(response.url) not in self.OLD_URLS:
            self.logger.info("Scraping: %s", response.url)

            hxs = Selector(response)

            item = NewsItem()

            #item['_id'] = NewsSpider.k
            item['url'] = response.url
            item['source'] = self.MY_SETTINGS["source"]

            item['title'] = None
            for title_path in self.CONT_PATHS["title"]:
                item['title'] = item['title'] or hxs.xpath(
                    title_path).extract()

            item['date'] = None
            for date_path in self.CONT_PATHS["date"]:
                item['date'] = item['date'] or hxs.xpath(date_path).extract()

            div = None
            for div_path in self.CONT_PATHS["text"]:
                div = div or hxs.xpath(div_path)

            text = re.sub(r'\s+', ' ',
                          ' '.join(div.extract())).strip().replace("\"", "'")

            # Final item entry
            tmp = ' '.join(item['title']).encode('ascii', 'ignore').decode('ascii')
            tmp = tmp.replace("\\", "")
            item['title'] = tmp

            tmp = ' '.join(item['date']).encode('ascii', 'ignore').decode('ascii')
            tmp = ' '.join(tmp.split())
            item['date'] = get_date(tmp)

            item['content'] = text
            item['company'] = self.MY_SETTINGS["company"]
            item['isClean'] = False

            self.URLS_FILE.write(str(response.url) + '\n')
            yield item
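
Example #8 is driven entirely by configuration: MY_SETTINGS names the source and company, while CONT_PATHS lists fallback XPaths per field, tried in order until one matches. A sketch of the expected shape (all paths and values are illustrative):

# Hypothetical configuration for the spider in Example #8.
MY_SETTINGS = {
    'source': 'example-news',
    'company': 'ExampleCorp',
}
CONT_PATHS = {
    'title': ['//h1/text()', "//meta[@property='og:title']/@content"],
    'date': ["//time/@datetime", "//span[@class='date']/text()"],
    'text': ["//div[@class='article-body']", "//div[@id='content']"],
}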
Example #9
    def parse_item(self, response):
        '''
        Parse the data from the website:
        create a new NewsItem and fill it with the values extracted by the crawler.
        '''
        # create new article from our defined item
        article = NewsItem()
        self.logger.debug(response.url)
        # parse the data from the website
        article['title'] = response.css('section.article-header h1::text').extract()[0]
        date = response.css('div.time::text').extract()[0]
        article['date'] = self.transform_date(date)
        found_article = response.css('section.article-content p::text').extract()
        article['article'] = self.transform_article(found_article)
        keywords = response.css('section.article-tags a::text').extract()
        article['keywords'] = self.transform_keywords(keywords)
        article['server'] = 'parlamentnilisty.cz'

        return article
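
Example #9 additionally runs the extracted tag list through transform_keywords. A plausible one-line helper, matching the single keywords string the other examples store (again an assumption):

    def transform_keywords(self, keywords):
        # hypothetical helper: join the tag strings into one
        # comma-separated keywords field
        return ', '.join(k.strip() for k in keywords)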
Example #10
    def parse_item(self, response):
        # inspect_response(response, self)
        r = response
        title = r.xpath('/html/head/title/text()').extract()[0].strip()
        source = r.xpath("//a[@id='ne_article_source']/text()").extract()[0].strip()
        content = "".join(r.xpath("//div[@id='endText']/p/text()").extract()).strip()
        raw_time = r.xpath("//div[@class='post_time_source']/text()").extract()[0]
        re_result = re.findall(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}", raw_time)
        if re_result:
            ts = time.mktime(time.strptime(re_result[0], '%Y-%m-%d %H:%M:%S'))
        else:
            ts = 0
        url = r.url
        new_news = NewsItem(
            title=title,
            content=content,
            source=source,
            published=ts,
            url=url
        )
        return new_news
Example #11
    def parse_item(self, response):
        '''
        Parse the data from the website:
        create a new NewsItem and fill it with the values extracted by the crawler.
        '''
        # create new article from our defined item
        article = NewsItem()

        # parse the data from the website
        article['title'] = response.css(
            'div.titulek-clanku h1::text').extract()[0]
        date = response.css(
            "meta[property='article:published_time']::attr(content)"
        ).extract()[0]
        article['date'] = self.transform_date(date)
        found_article = response.css('div.clanek-telo p::text').extract()
        article['article'] = self.transform_article(found_article)
        article['keywords'] = response.css(
            'meta[name=keywords]::attr(content)').extract()[0]
        article['server'] = 'aktualne.cz'

        return article
Example #12
def genaNewsItem(dicN):
    # keep only the content we need
    dicSample = {
        "url": "http:xxxxx",
        "docId": 'xxxxx',
        "title": "广州今年还有3条地铁开工 周边房价地图奉上",
        "createTime": "2017-08-18 17:15:50",
        "content":
        "根据地铁官方公布,18、22、11号线将有望于年内全面开工,其中18、22号线涉及南沙与中心区的联系,11号线为广州首条“市区环线”。这个图集将为大家带来地铁房价地图以及简要的规划利好分析。(数据来源:中原研究发展部)",
        "source": ""
    }

    # keep only the fields we need,
    # following the format shown by dicSample
    item = NewsItem()
    info = dicN['info']
    item['url'] = dicN['url']
    item['docId'] = dicN['docId']
    item['title'] = info['setname']
    item['createTime'] = info['lmodify']
    item['content'] = dicN['content']
    item['source'] = info['source']

    return item
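
A call site for Example #12 would hand in a dict shaped like the one below; the nested info keys (setname, lmodify, source) match the fields the function reads, while the values are illustrative:

# Hypothetical payload in the shape genaNewsItem expects.
dicN = {
    'url': 'http://example.com/doc/1',
    'docId': 'DOC001',
    'content': 'Example body text.',
    'info': {
        'setname': 'Example headline',
        'lmodify': '2017-08-18 17:15:50',
        'source': 'Example Wire',
    },
}
item = genaNewsItem(dicN)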