Example #1
def blogs():
    blogs = Blogs()
    page = request.args.get('page')
    pagesize = request.args.get('pagesize')
    print page
    print pagesize
    # request.args values are always strings, so the original type(page) == int
    # check could never match; validate the parameter explicitly instead, and
    # drop the duplicate getBlobsBySite call whose result was discarded.
    if page and page.isdigit():
        page = int(page)

    data = blogs.getBlobsBySite(request.args.get('type'))
    return render_template('list.html', data=data)
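
Example #2 below shows getBlobsBySite also accepting page and pagesize keyword arguments, so a paged variant of this view could look like the following sketch (the view name and the default values are illustrative, not taken from the project):

def blogs_paged():
    blogs = Blogs()
    page = request.args.get('page')
    pagesize = request.args.get('pagesize')
    # fall back to assumed defaults when the query parameters are missing or non-numeric
    page = int(page) if page and page.isdigit() else 1
    pagesize = int(pagesize) if pagesize and pagesize.isdigit() else 20
    data = blogs.getBlobsBySite(request.args.get('type'), page=page, pagesize=pagesize)
    return render_template('list.html', data=data)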
Example #2
def getBlogs():
    site = request.args.get('site')
    page_str = request.args.get('page')
    pagesize_str = request.args.get('pagesize')
    try:
        page = int(page_str) if page_str is not None else 1
        pagesize = int(pagesize_str) if pagesize_str is not None else 20
        count = Blogs.getCountBySite(site)
        res = getSuccessResult(count, page, pagesize)
        data = Blogs.getBlobsBySite(site, page=page, pagesize=pagesize)
        res['data'] = []
        for msg in data:
            res['data'].append(msg.to_dict())
    except Exception as e:
        traceback.print_exc()
        res = BASE_ERROR_RES.copy()
        res['message'] = str(e)
    # return the assembled envelope in both the success and the error case
    return res
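
getSuccessResult and BASE_ERROR_RES are referenced above but not defined in these examples. A minimal sketch of what such helpers could look like, assuming a plain dict envelope with pagination metadata (all field names are assumptions):

BASE_ERROR_RES = {'code': -1, 'message': ''}  # assumed error envelope

def getSuccessResult(count, page, pagesize):
    # assumed success envelope carrying pagination metadata alongside the data
    return {
        'code': 0,
        'message': 'ok',
        'count': count,
        'page': page,
        'pagesize': pagesize,
    }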
Example #3
    def parse_csdn(self, response):
        contentlist = response.xpath('//*[@class="blog_list_wrap"]/dl')
        self.logger.info(u'----------- page %s, %d items -----------' %
                         (response.meta['page'], len(contentlist)))
        for item in contentlist:
            page_url = item.xpath('dd/h3/a/@href').extract_first()
            # md5 of the post URL is used as a stable id for de-duplication
            id = hashlib.md5(page_url).hexdigest()
            title = item.xpath('dd/h3/a/text()').extract_first()
            if self.blogs.isExistById(id):
                self.logger.info('id:%s is exist!' % id)
                continue
            summary = item.xpath(
                'dd/div[@class="blog_list_c"]/text()').extract_first()
            dateStr = item.xpath(
                'dd/div[@class="blog_list_b clearfix"]/div[@class="blog_list_b_r fr"]/label/text()'
            ).extract_first()
            label = ','.join(
                item.xpath(
                    'dd/div[@class="blog_list_b clearfix"]/div[@class="blog_list_b_l fl"]/span/a//text()'
                ).extract())
            blog_url = item.xpath('dt/a[2]/@href').extract_first()
            author = item.xpath('dt/a[2]/text()').extract_first()
            avatar = item.xpath('dt/a/img/@src').extract_first()
            pv = item.xpath(
                'dd/div[@class="blog_list_b clearfix"]/div[@class="blog_list_b_r fr"]/span/em/text()'
            ).extract_first()
            positive = item.xpath(
                'dd/div[@class="blog_list_b_b"]/span/em/text()').extract_first()
            blogs = Blogs()
            blogs.id = id
            blogs.site = self.site
            blogs.title = title
            blogs.label = label
            blogs.author = author
            blogs.summary = summary
            blogs.page_url = page_url
            blogs.blog_url = blog_url
            blogs.avatar = avatar
            blogs.pv = pv
            blogs.positive = positive
            blogs.publish_time = parseDateString(dateStr)
            now = datetime.datetime.now()
            blogs.date_update = now.strftime('%Y-%m-%d %H:%M:%S')

            yield ModelItem.getInstance(blogs)
        # follow the pagination link whose anchor text is "下一页" ("next page")
        next_page_url = response.xpath(
            u'//div[@class="page_nav"]/a[text()="下一页"]/@href').extract_first()
        if next_page_url:
            pageNo = re.search(r'(\d+)', next_page_url).group(1)
            yield Request(
                'http://blog.csdn.net%s' % next_page_url,
                meta={
                    'type': 'list',
                    'page': pageNo
                },
                dont_filter=True,
                headers={
                    'User-Agent':
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
                })
        else:
            # no next-page link: log the final page number of the CSDN latest-blogs listing
            self.logger.info(u'---------- CSDN latest blogs, %s pages in total ----------' %
                             (response.meta['page']))
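
parseDateString is used above to turn the scraped date text into a datetime but is not shown in these examples. A minimal sketch, assuming absolute timestamp strings; the formats the real helper handles are not visible here:

import datetime

def parseDateString(dateStr):
    # try a few common timestamp layouts and return a datetime, or None
    if not dateStr:
        return None
    dateStr = dateStr.strip()
    for fmt in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M', '%Y-%m-%d'):
        try:
            return datetime.datetime.strptime(dateStr, fmt)
        except ValueError:
            continue
    return None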
Example #4
    def __init__(self, *args, **kwargs):
        super(CsdnSplider, self).__init__(*args, **kwargs)
        self.commonParser = CommonParser()
        self.blogs = Blogs()
        self.site = 'csdn'
Example #5
class CsdnSplider(CrawlSpider):
    name = 'blogs.csdn'
    custom_settings = {
        'RETRY_TIMES': 50,
        'ITEM_PIPELINES': {
            'CuteScrapy.pipelines.MysqlORMPipeline': 300
        },
        'DOWNLOADER_MIDDLEWARES': {
            # 'CuteScrapy.middlewares.RandomProxyMiddleware': 800,
            'CuteScrapy.middlewares.UserAgentMiddleware': 600
        },
        'DOWNLOAD_TIMEOUT': 120,
        'CONCURRENT_REQUESTS': 5,
        'REACTOR_THREADPOOL_MAXSIZE': 10
    }

    def __init__(self, *args, **kwargs):
        super(CsdnSplider, self).__init__(*args, **kwargs)
        self.commonParser = CommonParser()
        self.blogs = Blogs()
        self.site = 'csdn'

    def start_requests(self):
        yield Request(
            "http://blog.csdn.net/?&page=1",
            meta={
                'type': 'list',
                'page': 1
            },
            dont_filter=True,
            headers={
                'User-Agent':
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
            })

    def parse(self, response):
        if response.meta['type'] == 'list':
            for item in self.parse_csdn(response):
                yield item

    def parse_csdn(self, response):
        contentlist = response.xpath('//*[@class="blog_list_wrap"]/dl')
        self.logger.info(u'----------- page %s, %d items -----------' %
                         (response.meta['page'], len(contentlist)))
        for item in contentlist:
            page_url = item.xpath('dd/h3/a/@href').extract_first()
            # md5 of the post URL is used as a stable id for de-duplication
            id = hashlib.md5(page_url).hexdigest()
            title = item.xpath('dd/h3/a/text()').extract_first()
            if self.blogs.isExistById(id):
                self.logger.info('id:%s is exist!' % id)
                continue
            summary = item.xpath(
                'dd/div[@class="blog_list_c"]/text()').extract_first()
            dateStr = item.xpath(
                'dd/div[@class="blog_list_b clearfix"]/div[@class="blog_list_b_r fr"]/label/text()'
            ).extract_first()
            label = ','.join(
                item.xpath(
                    'dd/div[@class="blog_list_b clearfix"]/div[@class="blog_list_b_l fl"]/span/a//text()'
                ).extract())
            blog_url = item.xpath('dt/a[2]/@href').extract_first()
            author = item.xpath('dt/a[2]/text()').extract_first()
            avatar = item.xpath('dt/a/img/@src').extract_first()
            pv = item.xpath(
                'dd/div[@class="blog_list_b clearfix"]/div[@class="blog_list_b_r fr"]/span/em/text()'
            ).extract_first()
            positive = item.xpath(
                'dd/div[@class="blog_list_b_b"]/span/em/text()').extract_first()
            blogs = Blogs()
            blogs.id = id
            blogs.site = self.site
            blogs.title = title
            blogs.label = label
            blogs.author = author
            blogs.summary = summary
            blogs.page_url = page_url
            blogs.blog_url = blog_url
            blogs.avatar = avatar
            blogs.pv = pv
            blogs.positive = positive
            blogs.publish_time = parseDateString(dateStr)
            now = datetime.datetime.now()
            blogs.date_update = now.strftime('%Y-%m-%d %H:%M:%S')

            yield ModelItem.getInstance(blogs)
        # follow the pagination link whose anchor text is "下一页" ("next page")
        next_page_url = response.xpath(
            u'//div[@class="page_nav"]/a[text()="下一页"]/@href').extract_first()
        if next_page_url:
            pageNo = re.search(r'(\d+)', next_page_url).group(1)
            yield Request(
                'http://blog.csdn.net%s' % next_page_url,
                meta={
                    'type': 'list',
                    'page': pageNo
                },
                dont_filter=True,
                headers={
                    'User-Agent':
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
                })
        else:
            # no next-page link: log the final page number of the CSDN latest-blogs listing
            self.logger.info(u'---------- CSDN latest blogs, %s pages in total ----------' %
                             (response.meta['page']))
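
The custom_settings above enable CuteScrapy.middlewares.UserAgentMiddleware, which is not included in these examples. A minimal sketch of such a downloader middleware, assuming it simply rotates the User-Agent header per request (the agent pool is an assumption; only the first entry appears in the source):

import random

class UserAgentMiddleware(object):
    # hypothetical pool; the real project may load its own list
    USER_AGENTS = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    ]

    def process_request(self, request, spider):
        # set a random User-Agent before the request is downloaded
        request.headers['User-Agent'] = random.choice(self.USER_AGENTS)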
Example #6
def home():
    blogs = Blogs()
    data = blogs.getAll()
    return render_template('list.html', data=data)
Example #7
import scrapy

from CuteScrapy.model.blogs import Blogs


class ModelItem(scrapy.Item):
    model = scrapy.Field()

    @classmethod
    def getInstance(cls, model):
        modelItem = cls()
        modelItem['model'] = model
        return modelItem


if __name__ == '__main__':
    print ModelItem.getInstance(Blogs())
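
ModelItem only wraps an ORM model; persisting it is left to the MysqlORMPipeline named in the spider settings, which is not shown in these examples. A minimal sketch of such a pipeline, assuming SQLAlchemy and a placeholder connection string:

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

class MysqlORMPipeline(object):
    def __init__(self):
        # placeholder DSN; the real project presumably reads this from its settings
        engine = create_engine('mysql://user:password@localhost/cutescrapy')
        self.Session = sessionmaker(bind=engine)

    def process_item(self, item, spider):
        session = self.Session()
        try:
            # insert or update the wrapped Blogs row by its primary key
            session.merge(item['model'])
            session.commit()
        finally:
            session.close()
        return item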
Example #8
    def parse_blogs(self, response):
        for item in response.xpath('//*[@id="post_list"]/div'):
            page_url = item.xpath('div[@class="post_item_body"]/h3/a/@href').extract_first()
            # md5 of the post URL is used as a stable id for de-duplication
            id = hashlib.md5(page_url).hexdigest()
            title = item.xpath('div[@class="post_item_body"]/h3/a/text()').extract_first()
            if self.blogs.isExistById(id):
                self.logger.info('id:%s is exist!' % id)
                continue
            author = item.xpath('div/div[@class="post_item_foot"]/a/text()').extract_first()
            avatar = item.xpath('div[@class="post_item_body"]/p/a/img/@src').extract_first()
            blog_url = item.xpath('div/div[@class="post_item_foot"]/a/@href').extract_first()
            summary = self.commonParser.trim(''.join(item.xpath('div[@class="post_item_body"]/p/text()').extract()))
            dateStr = self.commonParser.trim(''.join(item.xpath('div[@class="post_item_body"]/div/text()').extract()))
            pv = item.xpath('div[2]/div/span[2]/a/text()').extract_first().strip()
            cv = item.xpath('div[2]/div/span[1]/a/text()').extract_first().strip()
            positive = item.xpath('div[1]/div[1]/span/text()').extract_first()
            # keep only the numeric part of the view/comment counters
            pv = re.findall(r'\d+', pv)[0]
            cv = re.findall(r'\d+', cv)[0]

            blogs = Blogs()
            blogs.id = id
            blogs.site = self.site
            blogs.title = title
            blogs.label = None
            blogs.author = author
            blogs.summary = summary
            blogs.content = None
            blogs.avatar = avatar
            blogs.page_url = page_url
            blogs.blog_url = blog_url
            blogs.pv = pv
            blogs.cv = cv
            blogs.positive = positive
            blogs.publish_time = parseDateString(dateStr)
            blogs.date_update = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            yield ModelItem.getInstance(blogs)
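
CommonParser.trim is used above to clean the extracted summary and date text but is not defined in these examples. A minimal sketch, assuming it just collapses whitespace:

import re

class CommonParser(object):
    def trim(self, text):
        # collapse runs of whitespace and strip the ends; an assumption about the real helper
        if text is None:
            return ''
        return re.sub(r'\s+', ' ', text).strip()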
Example #9
class BlogsSplider(CrawlSpider):
    name = 'blogs.cnblogs'
    custom_settings = {
        'RETRY_TIMES': 50,
        'ITEM_PIPELINES': {
            'CuteScrapy.pipelines.MysqlORMPipeline': 300,
            # 'CuteScrapy.pipelines.JsonWriterPipeline': 350,
        },
        'DOWNLOAD_TIMEOUT': 120,
        'CONCURRENT_REQUESTS': 5,
        'REACTOR_THREADPOOL_MAXSIZE': 10
    }

    def __init__(self, *args, **kwargs):
        super(BlogsSplider, self).__init__(*args, **kwargs)
        self.commonParser = CommonParser()
        self.blogs = Blogs()
        self.site = 'cnblogs'

    def start_requests(self):
        for i in range(1, 20):
            yield Request(
                "http://www.cnblogs.com/sitehome/p/%d" % (i),
                meta={'type': 'list'},
                dont_filter=True
            )

    def parse(self, response):
        if response.status == 200:
            if response.meta['type'] == 'list':
                for item in self.parse_blogs(response):
                    yield item

    def parse_blogs(self, response):
        for item in response.xpath('//*[@id="post_list"]/div'):
            page_url = item.xpath('div[@class="post_item_body"]/h3/a/@href').extract_first()
            # md5 of the post URL is used as a stable id for de-duplication
            id = hashlib.md5(page_url).hexdigest()
            title = item.xpath('div[@class="post_item_body"]/h3/a/text()').extract_first()
            if self.blogs.isExistById(id):
                self.logger.info('id:%s is exist!' % id)
                continue
            author = item.xpath('div/div[@class="post_item_foot"]/a/text()').extract_first()
            avatar = item.xpath('div[@class="post_item_body"]/p/a/img/@src').extract_first()
            blog_url = item.xpath('div/div[@class="post_item_foot"]/a/@href').extract_first()
            summary = self.commonParser.trim(''.join(item.xpath('div[@class="post_item_body"]/p/text()').extract()))
            dateStr = self.commonParser.trim(''.join(item.xpath('div[@class="post_item_body"]/div/text()').extract()))
            pv = item.xpath('div[2]/div/span[2]/a/text()').extract_first().strip()
            cv = item.xpath('div[2]/div/span[1]/a/text()').extract_first().strip()
            positive = item.xpath('div[1]/div[1]/span/text()').extract_first()
            # keep only the numeric part of the view/comment counters
            pv = re.findall(r'\d+', pv)[0]
            cv = re.findall(r'\d+', cv)[0]

            blogs = Blogs()
            blogs.id = id
            blogs.site = self.site
            blogs.title = title
            blogs.label = None
            blogs.author = author
            blogs.summary = summary
            blogs.content = None
            blogs.avatar = avatar
            blogs.page_url = page_url
            blogs.blog_url = blog_url
            blogs.pv = pv
            blogs.cv = cv
            blogs.positive = positive
            blogs.publish_time = parseDateString(dateStr)
            blogs.date_update = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            yield ModelItem.getInstance(blogs)
Example #10
    def get(self):
        data = Blogs().getBlogs()
        return HttpResponse.success(data)
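
HttpResponse.success is not part of these examples; a minimal sketch of a wrapper like it, assuming it builds the same kind of code/message/data envelope as the helpers in Example #2 (field names are assumptions):

class HttpResponse(object):
    @staticmethod
    def success(data):
        # assumed response envelope; the real helper may differ
        return {'code': 0, 'message': 'ok', 'data': data}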