Example #1
    def parse(self, response, **kwargs):
        """
        :param response: crawler response of the article url
        :return: parsed doc pushed to elastic
        """
        hxs = Selector(response)
        item = NewsItem()

        item["link"] = response.request.url
        item["lang"] = "tr"
        item["source"] = "hurriyet"
        date_time = hxs.xpath(
            "/html/body/article/div[12]/div/section[1]/header/div[1]/div[2]/div[2]/span[2]/time"
        ).extract()
        author = hxs.xpath(
            "/html/body/article/div[12]/div/section[1]/header/section[1]/div[1]/div/div[2]/a[1]/h6"
        ).extract()
        title = hxs.xpath(
            "/html/body/article/div[12]/div/section[1]/header/div[2]/div/h1"
        ).extract()
        intro = hxs.xpath(
            "/html/body/article/div[12]/div/section[3]/div/h2").extract()
        new_content = hxs.xpath(
            "/html/body/article/div[12]/div/section[3]/div/div[4]").extract()
        new_content = ' '.join(new_content)

        # Processing outputs
        item["intro"] = ' '.join(intro)
        item["title"] = ' '.join(title)
        item["content"] = re.sub(r'\s{2,}', ' ', new_content)

        item["date_time"] = " ".join(date_time)
        item["author"] = " ".join(author)
        return item
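
Every snippet on this page fills in a NewsItem, but the class itself is never shown. A minimal sketch of what it plausibly looks like for Examples #1 and #4, with field names taken from the code above (the definition itself is an assumption):

import scrapy

# Hypothetical NewsItem definition; only the field names are taken
# from the examples on this page.
class NewsItem(scrapy.Item):
    link = scrapy.Field()
    lang = scrapy.Field()
    source = scrapy.Field()
    category = scrapy.Field()
    author = scrapy.Field()
    title = scrapy.Field()
    intro = scrapy.Field()
    content = scrapy.Field()
    date_time = scrapy.Field()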
Example #2
    def parse_item(self, response):
        r = response
        # inspect_response(response, self)

        title = r.xpath("//div[@class='qq_article']//h1/text()").extract()
        source = r.xpath("//div[@class='qq_article']//span[@class='a_source']/text()").extract()
        if title:
            title = title[0]
        if source:
            source = source[0]
        # require a well-formed article: both title and source must be present
        if not title or not source:
            redis_conn.hset(redis_invalid_url_key, response.url, 0)
            return
        content = ''.join(r.xpath('//div[@id="Cnt-Main-Article-QQ"]/p/text()').extract())
        raw_time = r.xpath("//div[@class='qq_article']//span[@class='a_time']/text()").extract_first('')
        re_result = re.findall(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}", raw_time)
        if re_result:
            ts = time.mktime(time.strptime(re_result[0], '%Y-%m-%d %H:%M'))
        else:
            ts = 0
        url = r.url
        new_news = NewsItem(
            title=title,
            content=content,
            source=source,
            published=ts,
            url=url
        )
        return new_news
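
Example #2 assumes a module-level redis_conn plus a redis_invalid_url_key under which malformed article URLs are recorded. A minimal sketch of that setup with redis-py (connection parameters and key name are assumptions):

import redis

# Hypothetical setup for the names used in Example #2.
redis_conn = redis.StrictRedis(host='localhost', port=6379, db=0)
redis_invalid_url_key = 'news:invalid_urls'  # hash mapping url -> 0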
Example #3
    def parse(self, response):
        for row in response.xpath(
                "//table[@class='table table-small'][1]/tbody/tr"):
            loader = ItemLoader(item=NewsItem(), selector=row)
            loader.add_xpath("news", 'td[2]/a/text()')

            yield loader.load_item()
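
Example #3 routes extraction through ItemLoader, so per-field cleanup can live in processors rather than in the callback. A sketch of how a loader subclass could normalize the news field (these processors are an assumption, not part of the original spider):

from itemloaders.processors import MapCompose, TakeFirst
from scrapy.loader import ItemLoader

class NewsLoader(ItemLoader):
    # strip whitespace from every extracted value and keep the first one
    default_input_processor = MapCompose(str.strip)
    default_output_processor = TakeFirst()

Substituting NewsLoader for ItemLoader in the snippet above would then yield one clean string per row instead of a list.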
Example #4
    def parse(self, response, **kwargs):
        """
        :param response: crawler response of the article url
        :return: parsed doc pushed to elastic
        """
        hxs = Selector(response)
        item = NewsItem()

        item["link"] = response.request.url
        item["lang"] = "tr"
        item["source"] = "kizlarsoruyor"
        item["category"] = " ".join(hxs.xpath("//a[@class='no-posting tgec']/text()").extract())
        date_time = hxs.xpath("//span[@class='posted-on']/text()").extract()

        author = hxs.xpath("//span[@class='name']/text()").extract()
        if author:
            item["author"] = " ".join(author)
        else:
            author = hxs.xpath("//a[@class='username profile-hover']/text()").extract()
            item["author"] = " ".join(author)
        title = hxs.xpath("//h1/text()").extract()

        new_content = hxs.xpath("//div[@class='detail-body']/text()").extract()
        if new_content:
            new_content = ' '.join(new_content)
        else:
            new_content = hxs.xpath("//div[@class='article-body post-body clearfix']/text()").extract()
            new_content = ' '.join(new_content)

        item["title"] = ' '.join(title)
        item["content"] = re.sub(r'\s{2,}', ' ', new_content)

        item["date_time"] = " ".join(date_time)

        return item
Example #5
def makeNewsItem(url, docId, title, createTime, content, source):
    item = NewsItem()
    item['url'] = url
    item['docId'] = docId
    item['title'] = title
    item['createTime'] = createTime
    item['content'] = content
    item['source'] = source
    return item
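
Example #5 is a plain constructor helper; a call site simply passes through fields parsed elsewhere. Everything in this sketch is illustrative:

# Hypothetical usage of makeNewsItem; real values would come from a
# parsed API payload or a response callback.
item = makeNewsItem(
    url='http://example.com/news/1',
    docId='DOC001',
    title='Example headline',
    createTime='2017-08-18 17:15:50',
    content='Example body text.',
    source='Example Wire',
)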
Example #6
    def parse_item(self, response):
        '''
        Parse the data from the website:
        create a new NewsItem and fill it with the values extracted by the crawler.
        '''
        # create new article from our defined item
        article = NewsItem()

        # parse the data from the website
        article['title'] = response.css("meta[property='og:title']::attr(content)").extract()[0]
        date = response.css("meta[property='article:published_time']::attr(content)").extract()[0]
        article['date'] = self.transform_date(date)
        found_article = response.css('div.bbtext p::text').extract()
        article['article'] = self.transform_article(found_article)
        article['keywords'] = response.css('meta[name=keywords]::attr(content)').extract()[0]
        article['server'] = 'idnes.cz'

        return article
Example #7
    def parse_item(self, response):
        '''
        Parse the data from the website:
        create a new NewsItem and fill it with the values extracted by the crawler.
        '''
        # create new article from our defined item
        article = NewsItem()

        # parse the data from the website
        article['title'] = response.css('h1::text').extract()[0]
        article['date'] = self.current_date
        found_article = response.css('#contentArticleBox p::text').extract()
        article['article'] = self.transform_article(found_article)
        article['keywords'] = response.css(
            'meta[name=keywords]::attr(content)').extract()[0]
        article['server'] = 'novinky.cz'

        return article
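
Examples #6, #7, #9 and #11 delegate to transform_date and transform_article helpers that this page never shows. A sketch of what they plausibly do, inferred from how their inputs and outputs are used (class name, date format and cleanup rules are all assumptions):

from datetime import datetime

class NewsSpiderHelpersMixin:
    # Hypothetical helpers; the real spiders define their own versions.

    def transform_date(self, date):
        # normalize an ISO-8601 timestamp such as
        # '2019-05-01T12:30:00+02:00' into a plain date string
        return datetime.fromisoformat(date).strftime('%Y-%m-%d')

    def transform_article(self, found_article):
        # join the extracted <p> fragments and collapse runs of whitespace
        return ' '.join(' '.join(found_article).split())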
Example #8
    def parse_item(self, response):

        if str(response.url) not in self.OLD_URLS:
            self.logger.info("Scraping: %s", response.url)

            hxs = Selector(response)

            item = NewsItem()

            #item['_id'] = NewsSpider.k
            item['url'] = response.url
            item['source'] = self.MY_SETTINGS["source"]

            item['title'] = None
            for title_path in self.CONT_PATHS["title"]:
                item['title'] = item['title'] or hxs.xpath(
                    title_path).extract()

            item['date'] = None
            for date_path in self.CONT_PATHS["date"]:
                item['date'] = item['date'] or hxs.xpath(date_path).extract()

            div = None
            for div_path in self.CONT_PATHS["text"]:
                div = div or hxs.xpath(div_path)

            text = re.sub(r'\s+', ' ',
                          ' '.join(div.extract())).strip().replace("\"", "'")

            # Final item entry
            tmp = ' '.join(item['title']).encode('ascii', 'ignore').decode('ascii')
            tmp = tmp.replace("\\", "")
            item['title'] = tmp

            tmp = ' '.join(item['date']).encode('ascii', 'ignore').decode('ascii')
            tmp = ' '.join(tmp.split())
            item['date'] = get_date(tmp)

            item['content'] = text
            item['company'] = self.MY_SETTINGS["company"]
            item['isClean'] = False

            self.URLS_FILE.write(str(response.url) + '\n')
            yield item
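
Example #8 is driven entirely by configuration: MY_SETTINGS names the source and company, while CONT_PATHS lists fallback XPaths per field, tried in order until one matches. A sketch of the expected shape (all paths and values are illustrative):

# Hypothetical configuration for the spider in Example #8.
MY_SETTINGS = {
    'source': 'example-news',
    'company': 'ExampleCorp',
}
CONT_PATHS = {
    'title': ['//h1/text()', "//meta[@property='og:title']/@content"],
    'date': ["//time/@datetime", "//span[@class='date']/text()"],
    'text': ["//div[@class='article-body']", "//div[@id='content']"],
}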
Example #9
    def parse_item(self, response):
        '''
        Parse the data from the website:
        create a new NewsItem and fill it with the values extracted by the crawler.
        '''
        # create new article from our defined item
        article = NewsItem()
        self.logger.debug(response.url)
        # parse the data from the website
        article['title'] = response.css('section.article-header h1::text').extract()[0]
        date = response.css('div.time::text').extract()[0]
        article['date'] = self.transform_date(date)
        found_article = response.css('section.article-content p::text').extract()
        article['article'] = self.transform_article(found_article)
        keywords = response.css('section.article-tags a::text').extract()
        article['keywords'] = self.transform_keywords(keywords)
        article['server'] = 'parlamentnilisty.cz'

        return article
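
Example #9 additionally runs the extracted tag list through transform_keywords. A plausible one-line helper, matching the single keywords string the other examples store (again an assumption):

    def transform_keywords(self, keywords):
        # hypothetical helper: join the tag strings into one
        # comma-separated keywords field
        return ', '.join(k.strip() for k in keywords)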
Example #10
    def parse_item(self, response):
        # inspect_response(response, self)
        r = response
        title = r.xpath('/html/head/title/text()').extract()[0].strip()
        source = r.xpath("//a[@id='ne_article_source']/text()").extract()[0].strip()
        content = "".join(r.xpath("//div[@id='endText']/p/text()").extract()).strip()
        raw_time = r.xpath("//div[@class='post_time_source']/text()").extract()[0]
        re_result = re.findall(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}", raw_time)
        if re_result:
            ts = time.mktime(time.strptime(re_result[0], '%Y-%m-%d %H:%M:%S'))
        else:
            ts = 0
        url = r.url
        new_news = NewsItem(
            title=title,
            content=content,
            source=source,
            published=ts,
            url=url
        )
        return new_news
Example #11
    def parse_item(self, response):
        '''
        Parse the data from the website:
        create a new NewsItem and fill it with the values extracted by the crawler.
        '''
        # create new article from our defined item
        article = NewsItem()

        # parse the data from the website
        article['title'] = response.css(
            'div.titulek-clanku h1::text').extract()[0]
        date = response.css(
            "meta[property='article:published_time']::attr(content)"
        ).extract()[0]
        article['date'] = self.transform_date(date)
        found_article = response.css('div.clanek-telo p::text').extract()
        article['article'] = self.transform_article(found_article)
        article['keywords'] = response.css(
            'meta[name=keywords]::attr(content)').extract()[0]
        article['server'] = 'aktualne.cz'

        return article
Example #12
def genaNewsItem(dicN):
    # keep only the content we need
    dicSample = {
        "url": "http:xxxxx",
        "docId": 'xxxxx',
        "title": "广州今年还有3条地铁开工 周边房价地图奉上",
        "createTime": "2017-08-18 17:15:50",
        "content":
        "根据地铁官方公布,18、22、11号线将有望于年内全面开工,其中18、22号线涉及南沙与中心区的联系,11号线为广州首条“市区环线”。这个图集将为大家带来地铁房价地图以及简要的规划利好分析。(数据来源:中原研究发展部)",
        "source": ""
    }

    # keep only the fields we need,
    # following the format shown by dicSample
    item = NewsItem()
    info = dicN['info']
    item['url'] = dicN['url']
    item['docId'] = dicN['docId']
    item['title'] = info['setname']
    item['createTime'] = info['lmodify']
    item['content'] = dicN['content']
    item['source'] = info['source']

    return item
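
A call site for Example #12 would hand in a dict shaped like the one below; the nested info keys (setname, lmodify, source) match the fields the function reads, while the values are illustrative:

# Hypothetical payload in the shape genaNewsItem expects.
dicN = {
    'url': 'http://example.com/doc/1',
    'docId': 'DOC001',
    'content': 'Example body text.',
    'info': {
        'setname': 'Example headline',
        'lmodify': '2017-08-18 17:15:50',
        'source': 'Example Wire',
    },
}
item = genaNewsItem(dicN)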