Example #1
0
    def parse_page(response):
        """Build an ArticleItem from an article page on techcrunch.com.

        :param response: page response exposing ``url``, ``css()`` and ``xpath()``.
        :return: a populated ArticleItem. Fields this page does not provide are
            set to None, except ``author_id`` and ``views_count`` which are 0.
        """
        domain = 'http://techcrunch.com'
        now_date = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
        published_ts = response.xpath('//meta[@name="timestamp"]/@content').extract_first()
        # NOTE(review): -7 is presumably the site's UTC offset in hours expected
        # by datetime_str_to_utc -- confirm against that helper.
        published_ts = datetime_str_to_utc(published_ts, -7)
        byline = response.css('.byline')

        item = ArticleItem()
        item['url'] = response.url
        # extract_first() yields a single string (or None) like every other
        # text field here; extract() would store a list in the title.
        item['title'] = response.css('.tweet-title::text').extract_first()
        # TODO filter out script and iframe ?
        item['content'] = ''.join(response.css('.article-entry').xpath('./*').extract())
        item['summary'] = None
        item['published_ts'] = published_ts
        item['created_ts'] = now_date
        item['updated_ts'] = now_date
        item['time_str'] = None
        item['author_name'] = byline.xpath('.//a/text()').extract_first()
        # The byline href is resolved against the site root.
        item['author_link'] = urljoin(domain, byline.xpath('.//a/@href').extract_first())
        item['author_avatar'] = None
        item['tags'] = ','.join(response.xpath('//meta[@name="category"]/@content').extract())
        item['site_unique_id'] = response.css('.social-share-list').xpath('@data-post-id').extract_first()
        item['author_id'] = 0
        item['author_email'] = None
        item['author_phone'] = None
        item['author_role'] = None
        item['cover_real_url'] = None
        item['source_type'] = None
        item['views_count'] = 0
        item['cover'] = None
        return item
Example #2
0
    def parse_page(response):
        """Extract an ArticleItem from an article page response.

        The publication timestamp is the text of ``#post-date`` with
        `` 00:00:00`` appended; the body is the concatenated child nodes of
        ``.rich_media_content``.

        :param response: page response exposing ``url``, ``css()`` and ``xpath()``.
        :return: a populated ArticleItem. Fields this page does not provide are
            set to None, except ``author_id`` and ``views_count`` which are 0.
        """
        stamp = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
        post_date = response.css('#post-date::text').extract_first()
        published = post_date + ' 00:00:00'

        # Field/value pairs in the order they are written to the item.
        values = (
            ('url', response.url),
            ('title', response.xpath('//title/text()').extract_first()),
            ('content', ''.join(
                response.css('.rich_media_content').xpath('./*').extract())),
            ('summary', None),
            ('published_ts', published),
            ('created_ts', stamp),
            ('updated_ts', stamp),
            ('time_str', None),
            ('author_name',
             response.css('.profile_nickname::text').extract_first()),
            ('author_link',
             response.css('.profile_meta_value::text').extract_first()),
            ('author_avatar', None),
            ('tags', None),
            ('site_unique_id', None),
            ('author_id', 0),
            ('author_email', None),
            ('author_phone', None),
            ('author_role', None),
            ('cover_real_url', None),
            ('source_type', None),
            ('views_count', 0),
            ('cover', None),
        )

        article = ArticleItem()
        for field, value in values:
            article[field] = value
        return article
Example #3
0
    def parse_page(response):
        """Extract an ArticleItem from an article page response.

        The publication time comes from the ``bt:pubDate`` meta tag: its first
        19 characters are used as the date-time text (``T`` replaced by a
        space) and characters 19-21 are parsed as an integer offset, both of
        which are handed to ``datetime_str_to_utc``.

        :param response: page response exposing ``url``, ``css()`` and ``xpath()``.
        :return: a populated ArticleItem. Fields this page does not provide are
            set to None, except ``author_id`` and ``views_count`` which are 0.
        """
        stamp = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
        raw_published = response.xpath('//meta[@property="bt:pubDate"]/@content').extract_first()
        local_time = raw_published[:19].replace('T', ' ')
        utc_offset = int(raw_published[19:22])
        published_utc = datetime_str_to_utc(local_time, utc_offset)
        shortlink = response.xpath('//link[@rel="shortlink"]/@href').extract_first()

        # Field/value pairs in the order they are written to the item.
        values = (
            ('url', response.url),
            ('title', response.css('h1.article-title::text').extract_first()),
            ('content', ''.join(response.css('.article-content').xpath('./*').extract())),
            ('summary', None),
            ('published_ts', published_utc),
            ('created_ts', stamp),
            ('updated_ts', stamp),
            ('time_str', None),
            ('author_name', response.xpath('//meta[@property="bt:author"]/@content').extract_first()),
            ('author_link', response.css('.article-byline a.author').xpath('@href').extract_first()),
            ('author_avatar', None),
            ('tags', ','.join(response.css('.article-tags a::text').extract())),
            # Drops the first 13 characters of the shortlink href --
            # presumably a fixed URL prefix; confirm against real pages.
            ('site_unique_id', shortlink[13:]),
            ('author_id', 0),
            ('author_email', None),
            ('author_phone', None),
            ('author_role', None),
            ('cover_real_url', None),
            ('source_type', None),
            ('views_count', 0),
            ('cover', response.css('.article-media-header img').xpath('@src').extract_first()),
        )

        article = ArticleItem()
        for field, value in values:
            article[field] = value
        return article
Example #4
0
    def parse_page(self, response):
        """Build an ArticleItem from the JSON embedded in a 36kr.com page.

        The page carries its data as JSON in the ``data-props`` attribute of
        the first ``.js-react-on-rails-component`` element; the article lives
        under ``data.post`` in that document.

        :param response: page response exposing ``url`` and ``css()``.
        :return: an ArticleItem filled from the embedded post data.
        """
        site_root = 'http://36kr.com'
        props = response.css('.js-react-on-rails-component').xpath(
            '@data-props').extract()
        post = json.loads(props[0])['data']['post']

        stamp = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')

        # Field/value pairs in the order they are written to the item.
        values = (
            ('url', response.url),
            ('title', post['title']),
            ('content', post['display_content']),
            ('summary', post['summary']),
            ('published_ts', self.datetime_str_to_utc(post['published_at'])),
            ('created_ts', stamp),
            ('updated_ts', stamp),
            ('time_str', ''),
            ('author_name', post['author']['display_name']),
            # The author's path is resolved against the site root.
            ('author_link', urljoin(site_root, post['author']['domain_path'])),
            ('author_avatar', post['author']['avatar']),
            ('tags', ','.join(post['display_tag_list'])),
            ('site_unique_id', post['url_code']),
            ('author_id', post['author']['id']),
            # Optional author details default to an empty string.
            ('author_email', post['author'].get('email', "")),
            ('author_phone', post['author'].get('phone', "")),
            ('author_role', post['author'].get('role', "")),
            ('cover_real_url', post.get('cover_real_url')),
            ('source_type', post['source_type']),
            ('views_count', post.get('views_count', 0)),
            ('cover', post['cover']),
        )

        article = ArticleItem()
        for field, value in values:
            article[field] = value
        return article
Example #5
0
    def parse_page(response):
        """Build an ArticleItem from an article page on www.huxiu.com.

        :param response: page response exposing ``url``, ``selector`` and ``css()``.
        :return: a populated ArticleItem, or an empty one when the page has no
            ``#article_content`` element.
        """
        selector = response.selector
        site_root = 'http://www.huxiu.com'
        article = ArticleItem()

        body = selector.css('#article_content').extract_first()
        if body is None:
            # No article body on the page: return the untouched, empty item.
            return article

        stamp = datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')

        page_title = selector.xpath('//title/text()').extract_first()
        description = selector.xpath(
            '//meta[@name="description"]/@content').extract_first()
        # ':00' is appended to the page's time text before conversion.
        # NOTE(review): 8 is presumably the UTC offset in hours -- confirm
        # against datetime_str_to_utc.
        published_utc = datetime_str_to_utc(
            selector.css('.article-time::text').extract_first() + ':00', 8)
        writer_name = response.css('.box-author-info').css(
            '.author-name a::text').extract_first()
        writer_link = urljoin(
            site_root,
            response.css('.box-author-info').css(
                '.author-name a::attr(href)').extract_first())
        writer_avatar = selector.css('.box-author-info').css(
            '.author-face img::attr(src)').extract_first()
        tag_text = ','.join(
            selector.css('.tag-box').xpath(
                ".//li[@class='transition']/text()").extract())
        page_id = basename(response.url)
        # Only links under <site_root>/member yield an author id (the link's
        # basename without extension); anything else gets 0.
        if writer_link.startswith(urljoin(site_root, '/member')):
            writer_id = splitext(basename(writer_link))[0]
        else:
            writer_id = 0
        writer_role = selector.css('.box-author-info').css(
            '.icon-team-auth::attr(title)').extract_first()

        # Field/value pairs in the order they are written to the item.
        values = (
            ('url', response.url),
            ('title', page_title),
            ('content', body),
            ('summary', description),
            ('published_ts', published_utc),
            ('created_ts', stamp),
            ('updated_ts', stamp),
            ('time_str', None),
            ('author_name', writer_name),
            ('author_link', writer_link),
            ('author_avatar', writer_avatar),
            ('tags', tag_text),
            ('site_unique_id', page_id),
            ('author_id', writer_id),
            ('author_email', None),
            ('author_phone', None),
            ('author_role', writer_role),
            ('cover_real_url', None),
            ('source_type', None),
            ('views_count', None),
            ('cover', None),
        )
        for field, value in values:
            article[field] = value
        return article
Example #6
0
    def parse_page(response):
        """Build an ArticleItem from an article page response.

        The publication time comes from the ``article:published_time`` meta
        tag: its first 19 characters are used as the date-time text (``T``
        replaced by a space) and characters 19-21 are parsed as an integer
        offset, both of which are handed to ``datetime_str_to_utc``.

        :param response: page response exposing ``url``, ``css()`` and ``xpath()``.
        :return: a populated ArticleItem. ``author_email`` is None when the
            page has no ``mailto:`` contact link.
        """
        now_date = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
        published_ts = response.xpath(
            '//meta[@property="article:published_time"]/@content'
        ).extract_first()
        time_str = published_ts[:19].replace('T', ' ')
        timezone = int(published_ts[19:22])
        published_ts = datetime_str_to_utc(time_str, timezone)

        contact_href = response.css('.post-author-contact').xpath(
            '@href').extract_first()
        # The contact link is optional: a missing element yields None, which
        # must not reach the string test. Only a real "mailto:" href carries
        # an address, so the prefix is stripped in that case alone.
        mailto_prefix = 'mailto:'
        if contact_href and contact_href.startswith(mailto_prefix):
            email = contact_href[len(mailto_prefix):]
        else:
            email = None

        item = ArticleItem()
        item['url'] = response.url
        item['title'] = response.xpath('//title/text()').extract_first()
        item['content'] = ''.join(
            response.css('.post-body').xpath('./*').extract())
        item['summary'] = response.xpath(
            '//meta[@property="og:description"]/@content').extract_first()
        item['published_ts'] = published_ts
        item['created_ts'] = now_date
        item['updated_ts'] = now_date
        item['time_str'] = None
        item['author_name'] = response.xpath(
            '//meta[@name="author"]/@content').extract_first()
        item['author_link'] = response.xpath(
            '//meta[@property="article:author"]/@content').extract_first()
        item['author_avatar'] = None
        item['tags'] = None
        # Drops the first 25 characters of the shortlink href -- presumably a
        # fixed URL prefix; confirm against real pages.
        item['site_unique_id'] = response.xpath(
            '//link[@rel="shortlink"]/@href').extract_first()[25:]
        item['author_id'] = 0
        item['author_email'] = email
        item['author_phone'] = None
        item['author_role'] = None
        item['cover_real_url'] = None
        item['source_type'] = None
        item['views_count'] = 0
        item['cover'] = response.css('.post-featuredImage img').xpath(
            '@data-src').extract_first()
        return item
Example #7
0
def parse_html(response, url=None):
    """Build an ArticleItem from an article page, keeping English posts only.

    TODO lang="ja"

    :param response: page response exposing ``url``, ``css()`` and ``xpath()``.
    :param url: URL to record on the item; ``response.url`` is used when None.
    :return: a populated ArticleItem, or an empty one when the page has no
        ``article:published_time`` meta tag, no ``lang`` attribute, or a
        language other than ``en``.
    """
    item = ArticleItem()
    now_date = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
    published_ts = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
    if published_ts is None:
        return item
    time_str = published_ts[:19].replace('T', ' ')
    # NOTE(review): a fixed offset is passed to datetime_str_to_utc; any
    # offset present in the meta value itself is ignored -- confirm intent.
    timezone = -7
    published_ts = datetime_str_to_utc(time_str, timezone)

    # NOTE(review): the XPath begins with '//', so it may match lang
    # attributes anywhere in the document, not only inside
    # .postArticle--full -- confirm intent before narrowing it.
    lang = response.css('.postArticle--full').xpath('//@lang').extract_first()
    # A page without any lang attribute is treated like a non-English page
    # instead of failing on None.lower().
    source_type = lang.lower() if lang else None
    if source_type != 'en':
        return item

    item['url'] = url if url is not None else response.url
    item['title'] = response.xpath('//title/text()').extract_first()
    item['content'] = response.css('.postArticle-content').extract_first()
    item['summary'] = None
    item['published_ts'] = published_ts
    item['created_ts'] = now_date
    item['updated_ts'] = now_date
    item['time_str'] = None
    item['author_name'] = response.xpath('//meta[@name="author"]/@content').extract_first()
    item['author_link'] = response.xpath('//meta[@property="article:author"]/@content').extract_first()
    item['author_avatar'] = None
    item['tags'] = None
    item['site_unique_id'] = None
    item['author_id'] = 0
    item['author_email'] = None
    item['author_phone'] = None
    item['author_role'] = None
    item['cover_real_url'] = None
    item['source_type'] = source_type
    item['views_count'] = 0
    item['cover'] = None
    return item