def parse(self, response):
    domain = 'http://36kr.com'
    # parse homepage for update
    # TODO it doesn't cover every new article in homepage
    if response.url == domain:
        lists = re.findall(r'http://36kr\.com/p/\d+\.html', response.body)
        for link in lists:
            if Article.check_exists(link) is False:
                yield scrapy.Request(link, callback=self.parse_page)
    # parse list
    if 'b_url_code' in response.url:
        lists = json.loads(response.body_as_unicode())
        for i, page in enumerate(lists['data']['feed_posts']):
            yield scrapy.Request(domain + '/p/' + str(page['url_code']) + '.html',
                                 meta={'data': page},
                                 callback=self.parse_page_from_list)
            if (i + 1) == len(lists['data']['feed_posts']):
                self.logger.info('end of list'.center(100, '-'))
                self.current_num += 1
                if self.current_num + 1 < self.max_article_page:
                    yield scrapy.Request(self.list_entry + str(page['url_code']))
    # parse a single page
    if re.compile(domain + r'/p/\d+\.html$').match(response.url):
        yield self.parse_page(response)
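# The parse() methods in these snippets all gate new requests on
# Article.check_exists(link), whose implementation is not shown here. Below is
# a minimal sketch of what it might look like, assuming a peewee-style model
# with a unique `url` column -- the ORM, database file, and field names are
# assumptions, not confirmed by this code.
from peewee import Model, CharField, SqliteDatabase

db = SqliteDatabase('articles.db')  # assumed storage backend

class Article(Model):
    url = CharField(unique=True)    # assumed dedup key

    class Meta:
        database = db

    @classmethod
    def check_exists(cls, link):
        # True if an article with this URL has already been stored
        return cls.select().where(cls.url == link).exists()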
def parse(self, response):
    # parse homepage for update
    domain = 'http://techcrunch.com'
    if response.url == domain:
        lists = self.parse_article_links(response)
        for link in lists:
            if Article.check_exists(link) is False:
                yield scrapy.Request(link, callback=self.parse_page)
    # parse multiple pages
    page = re.compile('^' + domain + r'/page/(\d+)/').match(response.url)
    if page is not None:
        self.current_num = int(page.group(1))
        lists = self.parse_article_links(response)
        for link in lists:
            yield scrapy.Request(link, callback=self.parse_page)
        self.logger.info('[page] %s', response.url)
        # request next page
        if self.current_num < self.max_article_page:
            self.current_num += 1
            yield scrapy.Request(domain + '/page/' + str(self.current_num) + '/')
    # parse a specific page
    # if re.compile(r'.*/page/\d+/.*').match(response.url) is not None:
    #     lists = response.css('.post-title').xpath('.//a/@href').extract()
    #     for link in lists:
    #         yield scrapy.Request(link, callback=self.parse_page)
    # parse a single article
    if re.compile(r'.*\d{4}/\d{2}/\d{2}.*').match(response.url) is not None:
        item = self.parse_page(response)
        yield item
def parse(self, response):
    # parse homepage for update
    domain = 'http://thenextweb.com'
    if response.url == domain:
        lists = self.parse_article_homepage(response)
        for link in lists:
            if Article.check_exists(link) is False:
                yield scrapy.Request(link, callback=self.parse_page)
    # parse multiple pages
    if self.enable_multi_page:
        page = re.compile(r'.*page=(\d+)').match(response.url)
        if page is not None:
            self.current_num = int(page.group(1))
            lists = self.parse_article_links(response)
            for link in lists:
                yield scrapy.Request(link, callback=self.parse_page)
            self.logger.info('[page] %s', response.url)
            # request next page
            if self.current_num < self.max_article_page:
                self.current_num += 1
                yield scrapy.Request(self.list_entry + 'page=' + str(self.current_num))
    # parse a specific page
    else:
        if re.compile('^' + domain + r'/latest/page/(\d+)/').match(response.url) is not None:
            lists = self.parse_article_links(response)
            for link in lists:
                yield scrapy.Request(link, callback=self.parse_page)
    # parse a single article
    if re.compile(r'.*\d{4}/\d{2}/\d{2}.*').match(response.url) is not None:
        item = self.parse_page(response)
        yield item
def parse(self, response):
    # parse homepage for update
    domain = 'http://www.iheima.com'
    if response.url == domain:
        lists = self.parse_article_links(response)
        for link in lists:
            if Article.check_exists(link) is False:
                yield scrapy.Request(link, callback=self.parse_page)
    # parse multiple pages
    page = re.compile('^' + domain + r'/\?page=(\d*)&.*').match(response.url)
    if page is not None:
        self.current_num = int(page.group(1))
        lists = self.parse_article_links(response)
        for link in lists:
            yield scrapy.Request(link, callback=self.parse_page)
        self.logger.info('[page %d] %s', self.current_num, response.url)
        # request next page
        if self.current_num < self.max_article_page:
            self.current_num += 1
            yield scrapy.Request(domain + '/?page=' + str(self.current_num) + '&category=全部')
    # parse a single article
    if re.compile(r'.*\d*\.shtml').match(response.url) is not None:
        item = self.parse_page(response)
        yield item
def parse(self, response):
    # parse homepage for update
    domain = 'https://medium.com'
    # find tech link from homepage
    if response.url == domain:
        lists = self.get_article_links(response)
        for link in lists:
            if Article.check_exists(link) is False:
                yield scrapy.Request(link, callback=self.parse_page)
    # parse a single article
    if re.compile(r'.*medium.com/@.*/.*').match(response.url) is not None:
        item = self.parse_page(response)
        yield item
def parse(self, response):
    url = response.url
    articles = get_list_of_curpage(response)
    if url == 'http://www.pedaily.cn':
        for link in articles:
            if Article.check_exists(link) is False:
                yield Request(link, callback=parse_page)
    else:
        for link in articles:
            yield Request(link, callback=parse_page)
        last_page = self.total_page
        if self.current_page <= last_page:
            self.current_page += 1
            cur_url = 'http://www.pedaily.cn/top/handlers/Handler.ashx?action=newslist-all&p=' + \
                      str(self.current_page) + '&url=http://www.pedaily.cn/top/newslist.aspx?c=all'
            self.logger.info('[page %d] %s', self.current_page, cur_url)
            yield Request(cur_url)
def update_table():
    """
    init or update domain table
    :return:
    """
    data = [
        {"domain": '36kr.com', "spider_name": '36kr'},
        {"domain": 'huxiu.com', "spider_name": 'huxiu'},
        {"domain": 'iheima.com', "spider_name": 'iheima'},
        {"domain": 'medium.com', "spider_name": 'medium'},
        {"domain": 'pedaily.cn', "spider_name": 'pedaily'},
        {"domain": 'techcrunch.com', "spider_name": 'techcrunch'},
        {"domain": 'thenextweb.com', "spider_name": 'thenextweb'},
        {"domain": 'tmtpost.com', "spider_name": 'tmt'},
        {"domain": 'venturebeat.com', "spider_name": 'venturebeat'},
    ]
    for spider in data:
        article_num = Article.count(spider["domain"])
        Domain.create(article_num=article_num, **spider)
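# update_table() relies on a Domain model with a create() constructor and an
# Article.count(domain) aggregate. A hedged sketch of the Domain side, again
# assuming peewee as in the Article sketch above -- field types and the
# database file are assumptions, not confirmed by these snippets.
from peewee import Model, CharField, IntegerField, SqliteDatabase

db = SqliteDatabase('articles.db')  # assumed storage backend

class Domain(Model):
    domain = CharField(unique=True)        # e.g. '36kr.com'
    spider_name = CharField()              # e.g. '36kr'
    article_num = IntegerField(default=0)  # filled in by update_table()

    class Meta:
        database = db

# Article.count(domain) is assumed to be a classmethod along the lines of
#     return Article.select().where(Article.domain == domain).count()
# which presumes Article also stores a domain column not shown above.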
def parse(self, response):
    url = urlparse(response.url)
    updated_everyday = self.get_latest_articles(response)
    if self.enabled_crontab:
        for link in updated_everyday:
            link = urljoin(response.url, link)
            if Article.check_exists(link) is False:
                yield Request(link, callback=self.parse_page)
    else:
        latest_link = max(updated_everyday)
        latest_link = urljoin(response.url, latest_link)
        latest_aid = basename(latest_link)
        int_aid = int(latest_aid)
        end_aid = 0
        while int_aid > end_aid:
            # TODO optimize unparsed url
            next_url = '/'.join([url.scheme + ':/', url.netloc, 'article', str(int_aid)])
            int_aid -= 1
            yield Request(next_url, callback=self.parse_page)
def parse(self, response):
    url = response.url
    home_articles = self.get_home_articles(response)
    if self.enabled_crontab:
        for link in home_articles:
            link = urljoin(url, link)
            if Article.check_exists(link) is False:
                yield Request(link, callback=self.parse_page)
    else:
        if "lists/get_index_list" in response.url:
            offset = response.url.find("offset=") + len("offset=")
            self.current_offset = int(response.url[offset:])
            lists = self.parse_article_links(response)
            for link in lists:
                yield Request(link, callback=self.parse_page)
            self.logger.info('[page] %s', response.url)
            # request next page
            if self.current_offset < self.max_offset:
                self.current_offset += self.limit
                yield Request(self.list_entry + 'offset=' + str(self.current_offset))
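# The offset extraction above slices the URL right after "offset=" and assumes
# offset is the last query parameter. A more robust variant -- only a sketch,
# not part of the original spider -- parses the query string explicitly:
from urlparse import urlparse, parse_qs  # Python 2; use urllib.parse on Python 3

def extract_offset(url, default=0):
    # return the integer value of the `offset` query parameter, if present
    params = parse_qs(urlparse(url).query)
    return int(params.get('offset', [default])[0])

# e.g. self.current_offset = extract_offset(response.url) would replace the
# two lines that locate and slice "offset=" above.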