def parse_page(response):
    """
    Build an ArticleItem from a TechCrunch article page.

    :param response: page response; must support ``css``/``xpath``
        selection and expose ``url``.
    :return: ArticleItem populated from the page. Fields the page does not
        provide are set to ``None`` (``0`` for ``author_id`` and
        ``views_count``).
    """
    domain = 'http://techcrunch.com'
    now_date = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')

    published_ts = response.xpath(
        '//meta[@name="timestamp"]/@content').extract_first()
    # NOTE(review): -7 is handed to datetime_str_to_utc as the offset of the
    # page timestamp -- presumably hours from UTC; confirm with the helper.
    published_ts = datetime_str_to_utc(published_ts, -7)

    # The byline block is queried for both the author name and link.
    byline = response.css('.byline')

    item = ArticleItem()
    item['url'] = response.url
    # extract_first() stores one string (or None) instead of the list that
    # extract() returns, so the title is a scalar like every other field.
    item['title'] = response.css('.tweet-title::text').extract_first()
    # TODO filter out script and iframe ?
    item['content'] = ''.join(
        response.css('.article-entry').xpath('./*').extract())
    item['summary'] = None
    item['published_ts'] = published_ts
    item['created_ts'] = now_date
    item['updated_ts'] = now_date
    item['time_str'] = None
    item['author_name'] = byline.xpath('.//a/text()').extract_first()
    # The author href is resolved against the site root.
    item['author_link'] = urljoin(
        domain, byline.xpath('.//a/@href').extract_first())
    item['author_avatar'] = None
    item['tags'] = ','.join(
        response.xpath('//meta[@name="category"]/@content').extract())
    item['site_unique_id'] = response.css('.social-share-list').xpath(
        '@data-post-id').extract_first()
    item['author_id'] = 0
    item['author_email'] = None
    item['author_phone'] = None
    item['author_role'] = None
    item['cover_real_url'] = None
    item['source_type'] = None
    item['views_count'] = 0
    item['cover'] = None
    return item
def parse_page(response):
    """
    Build an ArticleItem from a page that exposes ``#post-date`` and
    ``.rich_media_content``.

    :param response: page response; must support ``css``/``xpath``
        selection and expose ``url``.
    :return: populated ArticleItem, or an empty ArticleItem when the page
        has no ``#post-date`` text (previously this raised ``TypeError``).
    """
    item = ArticleItem()

    post_date = response.css('#post-date::text').extract_first()
    if post_date is None:
        # Without a publication date there is nothing to concatenate the
        # time suffix onto; hand back the empty item instead of crashing.
        return item

    now_date = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
    # The page value gets a midnight time appended to form a full timestamp.
    published_ts = post_date + ' 00:00:00'

    item['url'] = response.url
    item['title'] = response.xpath('//title/text()').extract_first()
    item['content'] = ''.join(
        response.css('.rich_media_content').xpath('./*').extract())
    item['summary'] = None
    item['published_ts'] = published_ts
    item['created_ts'] = now_date
    item['updated_ts'] = now_date
    item['time_str'] = None
    item['author_name'] = response.css(
        '.profile_nickname::text').extract_first()
    item['author_link'] = response.css(
        '.profile_meta_value::text').extract_first()
    item['author_avatar'] = None
    item['tags'] = None
    item['site_unique_id'] = None
    item['author_id'] = 0
    item['author_email'] = None
    item['author_phone'] = None
    item['author_role'] = None
    item['cover_real_url'] = None
    item['source_type'] = None
    item['views_count'] = 0
    item['cover'] = None
    return item
def parse_page(response):
    """
    Build an ArticleItem from a page that carries ``bt:pubDate`` and
    ``bt:author`` meta tags.

    :param response: page response; must support ``css``/``xpath``
        selection and expose ``url``.
    :return: populated ArticleItem, or an empty ArticleItem when the
        ``bt:pubDate`` meta tag is missing (previously this raised
        ``TypeError``).
    """
    item = ArticleItem()

    published_raw = response.xpath(
        '//meta[@property="bt:pubDate"]/@content').extract_first()
    if published_raw is None:
        # Nothing to slice a timestamp out of; return the empty item.
        return item

    now_date = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')

    # The first 19 characters are the date and time with a 'T' separator;
    # the next three are parsed as an integer and passed on as the timezone
    # argument (presumably a signed hour offset such as "+08" -- confirm).
    time_str = published_raw[:19].replace('T', ' ')
    timezone = int(published_raw[19:22])
    published_ts = datetime_str_to_utc(time_str, timezone)

    short_url = response.xpath(
        '//link[@rel="shortlink"]/@href').extract_first()

    item['url'] = response.url
    item['title'] = response.css('h1.article-title::text').extract_first()
    item['content'] = ''.join(
        response.css('.article-content').xpath('./*').extract())
    item['summary'] = None
    item['published_ts'] = published_ts
    item['created_ts'] = now_date
    item['updated_ts'] = now_date
    item['time_str'] = None
    item['author_name'] = response.xpath(
        '//meta[@property="bt:author"]/@content').extract_first()
    item['author_link'] = response.css('.article-byline a.author').xpath(
        '@href').extract_first()
    item['author_avatar'] = None
    item['tags'] = ','.join(response.css('.article-tags a::text').extract())
    # Drops the first 13 characters of the shortlink (presumably a fixed URL
    # prefix -- confirm). A page without a shortlink gets no id rather than
    # raising on the slice.
    item['site_unique_id'] = (
        short_url[13:] if short_url is not None else None)
    item['author_id'] = 0
    item['author_email'] = None
    item['author_phone'] = None
    item['author_role'] = None
    item['cover_real_url'] = None
    item['source_type'] = None
    item['views_count'] = 0
    item['cover'] = response.css('.article-media-header img').xpath(
        '@src').extract_first()
    return item
def parse_page(self, response):
    """
    Build an ArticleItem from the JSON props embedded in a 36kr page.

    The article data is read from the ``data-props`` attribute of the first
    ``.js-react-on-rails-component`` element, under ``data -> post``.

    :param response: page response; must support ``css`` selection and
        expose ``url``.
    :return: ArticleItem populated from the embedded post data.
    :raises IndexError: when the page has no such component.
    :raises KeyError: when a required key is absent from the post data.
    """
    domain = 'http://36kr.com'

    raw_props = response.css('.js-react-on-rails-component').xpath(
        '@data-props').extract()
    post = json.loads(raw_props[0])['data']['post']

    stamp = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')

    item = ArticleItem()

    # Article-level fields, in the order they are stored on the item.
    article_fields = (
        ('url', response.url),
        ('title', post['title']),
        ('content', post['display_content']),
        ('summary', post['summary']),
        ('published_ts', self.datetime_str_to_utc(post['published_at'])),
        ('created_ts', stamp),
        ('updated_ts', stamp),
        ('time_str', ''),
    )
    for field, value in article_fields:
        item[field] = value

    # Author details live in a nested mapping; the optional ones fall back
    # to an empty string.
    author = post['author']
    remaining_fields = (
        ('author_name', author['display_name']),
        ('author_link', urljoin(domain, author['domain_path'])),
        ('author_avatar', author['avatar']),
        ('tags', ','.join(post['display_tag_list'])),
        ('site_unique_id', post['url_code']),
        ('author_id', author['id']),
        ('author_email', author.get('email', "")),
        ('author_phone', author.get('phone', "")),
        ('author_role', author.get('role', "")),
        ('cover_real_url', post.get('cover_real_url')),
        ('source_type', post['source_type']),
        ('views_count', post.get('views_count', 0)),
        ('cover', post['cover']),
    )
    for field, value in remaining_fields:
        item[field] = value

    return item
def parse_page(response):
    """
    Build an ArticleItem from a Huxiu article page.

    :param response: page response; must expose ``selector``, ``css`` and
        ``url``.
    :return: populated ArticleItem, or an empty ArticleItem when the page
        has no ``#article_content`` element or no ``.article-time`` text
        (the latter previously raised ``TypeError``).
    """
    sel = response.selector
    domain = 'http://www.huxiu.com'
    item = ArticleItem()

    content = sel.css('#article_content').extract_first()
    if content is None:
        return item

    article_time = sel.css('.article-time::text').extract_first()
    if article_time is None:
        # No timestamp text to extend with seconds; return the empty item
        # rather than failing on None + str.
        return item

    now_date = datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')

    # All author details come from the same info box.
    author_box = sel.css('.box-author-info')

    item['url'] = response.url
    item['title'] = sel.xpath('//title/text()').extract_first()
    item['content'] = content
    item['summary'] = sel.xpath(
        '//meta[@name="description"]/@content').extract_first()
    # ':00' is appended to the page text before conversion.
    # NOTE(review): 8 is passed as the offset -- presumably UTC+8; confirm
    # against datetime_str_to_utc.
    item['published_ts'] = datetime_str_to_utc(article_time + ':00', 8)
    item['created_ts'] = now_date
    item['updated_ts'] = now_date
    item['time_str'] = None
    item['author_name'] = author_box.css(
        '.author-name a::text').extract_first()
    item['author_link'] = urljoin(
        domain,
        author_box.css('.author-name a::attr(href)').extract_first())
    item['author_avatar'] = author_box.css(
        '.author-face img::attr(src)').extract_first()
    item['tags'] = ','.join(
        sel.css('.tag-box').xpath(
            ".//li[@class='transition']/text()").extract())
    item['site_unique_id'] = basename(response.url)

    # Author links under /member yield the id as the link's file name
    # without its extension; any other link gets the placeholder id 0.
    if item['author_link'].startswith(urljoin(domain, '/member')):
        author_id = splitext(basename(item['author_link']))[0]
    else:
        author_id = 0
    item['author_id'] = author_id

    item['author_email'] = None
    item['author_phone'] = None
    item['author_role'] = author_box.css(
        '.icon-team-auth::attr(title)').extract_first()
    item['cover_real_url'] = None
    item['source_type'] = None
    item['views_count'] = None
    item['cover'] = None
    return item
def parse_page(response):
    """
    Build an ArticleItem from a page with ``article:published_time`` meta
    data and a ``.post-body`` content container.

    :param response: page response; must support ``css``/``xpath``
        selection and expose ``url``.
    :return: populated ArticleItem, or an empty ArticleItem when the
        ``article:published_time`` meta tag is missing (previously this
        raised ``TypeError``).
    """
    item = ArticleItem()

    published_raw = response.xpath(
        '//meta[@property="article:published_time"]/@content'
    ).extract_first()
    if published_raw is None:
        # Nothing to slice a timestamp out of; return the empty item.
        return item

    now_date = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')

    # The first 19 characters are the date and time with a 'T' separator;
    # the next three are parsed as an integer and passed on as the timezone
    # argument (presumably a signed hour offset such as "+08" -- confirm).
    time_str = published_raw[:19].replace('T', ' ')
    timezone = int(published_raw[19:22])
    published_ts = datetime_str_to_utc(time_str, timezone)

    # The contact link is optional. Only a real "mailto:" href produces an
    # address; a missing link used to raise TypeError on the `in` test.
    mailto_prefix = 'mailto:'
    contact_href = response.css('.post-author-contact').xpath(
        '@href').extract_first()
    if contact_href is not None and contact_href.startswith(mailto_prefix):
        email = contact_href[len(mailto_prefix):]
    else:
        email = None

    short_url = response.xpath(
        '//link[@rel="shortlink"]/@href').extract_first()

    item['url'] = response.url
    item['title'] = response.xpath('//title/text()').extract_first()
    item['content'] = ''.join(
        response.css('.post-body').xpath('./*').extract())
    item['summary'] = response.xpath(
        '//meta[@property="og:description"]/@content').extract_first()
    item['published_ts'] = published_ts
    item['created_ts'] = now_date
    item['updated_ts'] = now_date
    item['time_str'] = None
    item['author_name'] = response.xpath(
        '//meta[@name="author"]/@content').extract_first()
    item['author_link'] = response.xpath(
        '//meta[@property="article:author"]/@content').extract_first()
    item['author_avatar'] = None
    item['tags'] = None
    # Drops the first 25 characters of the shortlink (presumably a fixed URL
    # prefix -- confirm). A page without a shortlink gets no id rather than
    # raising on the slice.
    item['site_unique_id'] = (
        short_url[25:] if short_url is not None else None)
    item['author_id'] = 0
    item['author_email'] = email
    item['author_phone'] = None
    item['author_role'] = None
    item['cover_real_url'] = None
    item['source_type'] = None
    item['views_count'] = 0
    item['cover'] = response.css('.post-featuredImage img').xpath(
        '@data-src').extract_first()
    return item
def parse_html(response, url=None):
    """
    Build an ArticleItem from an article page, keeping English pages only.

    TODO lang="ja"

    :param response: page response; must support ``css``/``xpath``
        selection and expose ``url``.
    :param url: URL to record on the item; ``response.url`` is used when
        this is ``None``.
    :return: populated ArticleItem, or an empty ArticleItem when the page
        has no ``article:published_time`` meta tag or its language is not
        ``en`` (a page with no ``lang`` attribute counts as not English).
    """
    item = ArticleItem()
    now_date = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')

    published_ts = response.xpath(
        '//meta[@property="article:published_time"]/@content'
    ).extract_first()
    if published_ts is None:
        return item

    # Only the first 19 characters (date and time) are used; anything after
    # them in the meta value is ignored.
    time_str = published_ts[:19].replace('T', ' ')
    # NOTE(review): the offset is hard-coded to -7 instead of being read
    # from the meta value -- confirm this matches the site's timestamps.
    timezone = -7
    published_ts = datetime_str_to_utc(time_str, timezone)

    # NOTE(review): '//@lang' is an absolute XPath, so it matches the first
    # lang attribute anywhere in the document, not only inside
    # .postArticle--full -- confirm whether './/@lang' was intended.
    lang = response.css('.postArticle--full').xpath(
        '//@lang').extract_first()
    # A page without any lang attribute used to raise AttributeError on
    # .lower(); it is now treated like any other non-English page.
    source_type = lang.lower() if lang is not None else None
    if source_type != 'en':
        return item

    item['url'] = url if url is not None else response.url
    item['title'] = response.xpath('//title/text()').extract_first()
    item['content'] = response.css('.postArticle-content').extract_first()
    item['summary'] = None
    item['published_ts'] = published_ts
    item['created_ts'] = now_date
    item['updated_ts'] = now_date
    item['time_str'] = None
    item['author_name'] = response.xpath(
        '//meta[@name="author"]/@content').extract_first()
    item['author_link'] = response.xpath(
        '//meta[@property="article:author"]/@content').extract_first()
    item['author_avatar'] = None
    item['tags'] = None
    item['site_unique_id'] = None
    item['author_id'] = 0
    item['author_email'] = None
    item['author_phone'] = None
    item['author_role'] = None
    item['cover_real_url'] = None
    item['source_type'] = source_type
    item['views_count'] = 0
    item['cover'] = None
    return item