Example #1
    def parse(self, response):
        domain = 'http://36kr.com'

        # parse homepage for update
        # TODO it doesn't cover every new article in homepage
        if response.url == domain:
            lists = re.findall(r'http:\/\/36kr\.com\/p\/\d+\.html', response.body)
            for link in lists:
                if Article.check_exists(link) is False:
                    yield scrapy.Request(link, callback=self.parse_page)

        # parse list
        if 'b_url_code' in response.url:
            lists = json.loads(response.body_as_unicode())
            for i, page in enumerate(lists['data']['feed_posts']):
                yield scrapy.Request(domain + '/p/' + str(page['url_code']) + '.html',
                                     meta={'data': page},
                                     callback=self.parse_page_from_list)
                if (i + 1) == len(lists['data']['feed_posts']):
                    print('end of list'.center(100, '-'))
                    self.current_num += 1
                    if self.current_num + 1 < self.max_article_page:
                        yield scrapy.Request(self.list_entry + str(page['url_code']))

        # parse a single page
        if re.compile(domain + '\/p\/\d+\.html$').match(response.url):
            yield self.parse_page(response)
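
The spiders in these examples call Article.check_exists(link) before requesting a homepage link so that already-stored articles are skipped. The model layer is not part of the snippets; a minimal sketch of such a helper, assuming a peewee-style Article model with a unique url column (an assumption, not the project's actual code):

import peewee

db = peewee.SqliteDatabase('articles.db')  # hypothetical database handle


class Article(peewee.Model):
    url = peewee.CharField(unique=True)

    class Meta:
        database = db

    @classmethod
    def check_exists(cls, url):
        # True if an article with this URL has already been stored
        return cls.select().where(cls.url == url).exists()
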
Example #2
    def parse(self, response):
        # parse homepage for update
        domain = 'http://techcrunch.com'
        if response.url == domain:
            lists = self.parse_article_links(response)
            for link in lists:
                if Article.check_exists(link) is False:
                    yield scrapy.Request(link, callback=self.parse_page)

        # parse multiple pages
        page = re.compile('^' + domain + '\/page\/(\d+)\/').match(response.url)
        if page is not None:
            self.current_num = int(page.group(1))
            lists = self.parse_article_links(response)
            for link in lists:
                yield scrapy.Request(link, callback=self.parse_page)
            self.logger.info('[page] %s', response.url)
            # request next page
            if self.current_num < self.max_article_page:
                self.current_num += 1
                yield scrapy.Request(domain + '/page/' + str(self.current_num) + '/')

        # parse a specific page
        # if re.compile('.*\/page\/\d+\/.*').match(response.url) is not None:
        #     lists = response.css('.post-title').xpath('.//a/@href').extract()
        #     for link in lists:
        #         yield scrapy.Request(link, callback=self.parse_page)

        # parse a single article
        if re.compile('.*\d{4}\/\d{2}\/\d{2}.*').match(response.url) is not None:
            item = self.parse_page(response)
            yield item
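
The parse_article_links helper is not shown in this snippet, but the commented-out selector above suggests what it does; a plausible sketch (hypothetical class name and selector, based on that comment):

import scrapy


class TechcrunchSpider(scrapy.Spider):  # hypothetical class name
    name = 'techcrunch'

    def parse_article_links(self, response):
        # collect article URLs from the post titles on a listing page
        return response.css('.post-title').xpath('.//a/@href').extract()
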
Example #3
    def parse(self, response):
        # parse homepage for update
        domain = 'http://thenextweb.com'
        if response.url == domain:
            lists = self.parse_article_homepage(response)
            for link in lists:
                if Article.check_exists(link) is False:
                    yield scrapy.Request(link, callback=self.parse_page)

        # parse multiple pages
        if self.enable_multi_page:
            page = re.compile('.*page=(\d+)').match(response.url)
            if page is not None:
                self.current_num = int(page.group(1))
                lists = self.parse_article_links(response)
                for link in lists:
                    yield scrapy.Request(link, callback=self.parse_page)
                self.logger.info('[page] %s', response.url)
                # request next page
                if self.current_num < self.max_article_page:
                    self.current_num += 1
                    yield scrapy.Request(self.list_entry + 'page=' + str(self.current_num))
        # parse a specific page
        else:
            if re.compile('^' + domain + '/latest/page/(\d+)/').match(response.url) is not None:
                lists = self.parse_article_links(response)
                for link in lists:
                    yield scrapy.Request(link, callback=self.parse_page)

        # parse a single article
        if re.compile('.*\d{4}\/\d{2}\/\d{2}.*').match(response.url) is not None:
            item = self.parse_page(response)
            yield item
Example #4
    def parse(self, response):
        # parse homepage for update
        domain = 'http://www.iheima.com'
        if response.url == domain:
            lists = self.parse_article_links(response)
            for link in lists:
                if Article.check_exists(link) is False:
                    yield scrapy.Request(link, callback=self.parse_page)

        # parse multiple pages
        page = re.compile('^' + domain + '/\?page=(\d*)&.*').match(response.url)
        if page is not None:
            self.current_num = int(page.group(1))
            lists = self.parse_article_links(response)
            for link in lists:
                yield scrapy.Request(link, callback=self.parse_page)
            self.logger.info('[page %d] %s', self.current_num, response.url)
            # request next page
            if self.current_num < self.max_article_page:
                self.current_num += 1
                yield scrapy.Request(domain + '/?page=' + str(self.current_num) + '&category=全部')

        # parse a single article
        if re.compile('.*\d*\.shtml').match(response.url) is not None:
            item = self.parse_page(response)
            yield item
Example #5
    def parse(self, response):
        # parse homepage for update
        domain = 'https://medium.com'
        # find the tech links on the homepage
        if response.url == domain:
            lists = self.get_article_links(response)
            for link in lists:
                if Article.check_exists(link) is False:
                    yield scrapy.Request(link, callback=self.parse_page)

        # parse a single article
        if re.compile('.*medium.com\/@.*\/.*').match(response.url) is not None:
            item = self.parse_page(response)
            yield item
Example #6
    def parse(self, response):
        url = response.url
        articles = get_list_of_curpage(response)
        if url == 'http://www.pedaily.cn':
            for link in articles:
                if Article.check_exists(link) is False:
                    yield Request(link, callback=parse_page)
        else:
            # total_page does not change while iterating, so read it once
            last_page = self.total_page
            for link in articles:
                yield Request(link, callback=parse_page)
            if self.current_page <= last_page:
                self.current_page += 1
                cur_url = 'http://www.pedaily.cn/top/handlers/Handler.ashx?action=newslist-all&p=' + \
                          str(self.current_page) + '&url=http://www.pedaily.cn/top/newslist.aspx?c=all'
                self.logger.info('[page %d] %s', self.current_page, cur_url)
                yield Request(cur_url)
Example #7
def update_table():
    """
    init or update domain table
    :return:
    """
    data = [
        {
            "domain": '36kr.com',
            "spider_name": '36kr'
        },
        {
            "domain": 'huxiu.com',
            "spider_name": 'huxiu'
        },
        {
            "domain": 'iheima.com',
            "spider_name": 'iheima'
        },
        {
            "domain": 'medium.com',
            "spider_name": 'medium'
        },
        {
            "domain": 'pedaily.cn',
            "spider_name": 'pedaily'
        },
        {
            "domain": 'techcrunch.com',
            "spider_name": 'techcrunch'
        },
        {
            "domain": 'thenextweb.com',
            "spider_name": 'thenextweb'
        },
        {
            "domain": 'tmtpost.com',
            "spider_name": 'tmt'
        },
        {
            "domain": 'venturebeat.com',
            "spider_name": 'venturebeat'
        },
    ]
    for spider in data:
        article_num = Article.count(spider["domain"])
        Domain.create(article_num=article_num, **spider)
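
update_table assumes a Domain model with domain, spider_name and article_num fields (the names come from the create() call) plus an Article.count(domain) aggregate, neither of which is shown here. Extending the peewee sketch above, the table could look roughly like this (field types are guesses):

class Domain(peewee.Model):
    domain = peewee.CharField(unique=True)
    spider_name = peewee.CharField()
    article_num = peewee.IntegerField(default=0)

    class Meta:
        database = db

Each spider_name presumably matches a spider's name attribute, so a single source would be crawled with something like scrapy crawl 36kr or scrapy crawl techcrunch.
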
Example #8
    def parse(self, response):
        url = urlparse(response.url)
        updated_everyday = self.get_latest_articles(response)
        if self.enabled_crontab:
            for link in updated_everyday:
                link = urljoin(response.url, link)
                if Article.check_exists(link) is False:
                    yield Request(link, callback=self.parse_page)
        else:
            latest_link = max(updated_everyday)
            latest_link = urljoin(response.url, latest_link)
            latest_aid = basename(latest_link)
            int_aid = int(latest_aid)
            end_aid = 0
            while int_aid > end_aid:
                # TODO optimize unparsed url
                next_url = '/'.join([url.scheme + ':/', url.netloc, 'article', str(int_aid)])
                int_aid -= 1
                yield Request(next_url, callback=self.parse_page)
Example #9
    def parse(self, response):
        url = response.url
        home_articles = self.get_home_articles(response)
        if self.enabled_crontab:
            for link in home_articles:
                link = urljoin(url, link)
                if Article.check_exists(link) is False:
                    yield Request(link, callback=self.parse_page)
        else:
            if "lists/get_index_list" in response.url:
                offset = response.url.find("offset=") + 7
                self.current_offset = int(response.url[offset:])
                lists = self.parse_article_links(response)
                for link in lists:
                    yield Request(link, callback=self.parse_page)
                self.logger.info('[page] %s', response.url)
                # request next page
                if self.current_offset < self.max_offset:
                    self.current_offset += self.limit
                    yield Request(self.list_entry + 'offset=' + str(self.current_offset))
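
The offset is recovered above by slicing the URL string seven characters past 'offset='. A slightly more robust variant (a sketch, not code from the original project) would parse the query string instead:

from urlparse import urlparse, parse_qs  # Python 2 stdlib, matching these snippets


def get_offset(url, default=0):
    # read the offset query parameter, e.g. .../lists/get_index_list?offset=20
    query = parse_qs(urlparse(url).query)
    return int(query.get('offset', [default])[0])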