Example #1
    def parse_blogcomment(self, response):
        blogcommentitem = BlogCommentItem()
        comment_list = json.loads(response.body)['list']

        for comment in comment_list:
            blogcommentitem['blogcommentID'] = comment['CommentId']
            blogcommentitem['userID'] = comment['UserName']

            blogcommentitem['link'] = response.meta['link']
            blogcommentitem['blogID'] = comment['ArticleId']
            blogcommentitem['authorID'] = response.meta['author']
            blogcommentitem['time'] = comment['PostTime']

            blogcommentitem['content'] = comment['Content']

            if re.findall(r'\[reply\]', comment['Content']):
                blogcommentitem['commenttoID'] = re.findall(
                    r'\[reply\](.*)\[/reply\]', comment['Content'])[0]
            else:
                blogcommentitem['commenttoID'] = response.meta['author']

            yield blogcommentitem

            dbcheck = DB_mysql()
            if dbcheck.check(blogcommentitem['userID']):
                authorItem = AuthorItem()
                authorItem['userID'] = blogcommentitem['userID']
                authorItem[
                    'link'] = 'http://my.csdn.net/' + blogcommentitem['userID']
                authorItem['blog_crawl'] = 0
                authorItem['user_crawl'] = 0
                yield authorItem
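
The snippet assumes BlogCommentItem and AuthorItem are declared in the project's items.py, which is not part of the excerpt. A minimal sketch consistent with the fields used above (an assumption, not the project's actual file) would be:

# items.py -- hypothetical sketch; only the field names used by the spider
# above are known for certain
import scrapy

class BlogCommentItem(scrapy.Item):
    blogcommentID = scrapy.Field()
    userID = scrapy.Field()
    link = scrapy.Field()
    blogID = scrapy.Field()
    authorID = scrapy.Field()
    time = scrapy.Field()
    content = scrapy.Field()
    commenttoID = scrapy.Field()

class AuthorItem(scrapy.Item):
    userID = scrapy.Field()
    link = scrapy.Field()
    blog_crawl = scrapy.Field()   # newly discovered authors start at 0
    user_crawl = scrapy.Field()   # newly discovered authors start at 0
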
Example #2
    def parse(self, response):
        separator = re.compile('((\\n|\\t|\\r|( ))+( )*)+')
        useritem = UserItem()
        detail_str = ''
        focus_str = ''
        befoucs_str = ''

        dbupdate = DB_mysql()
        dbupdate.update(response.url.split('/')[-1], 2)

        useritem['userID'] = response.url.split('/')[-1]
        useritem['link'] = response.url
        useritem['nickname'] = response.xpath(
            '//dt[@class="person-nick-name"]/span/text()').extract()[0]

        for detail in response.xpath(
                '//dd[@class="person-detail"]/text()').extract()[:-1]:
            detail_str = detail_str + separator.sub('', detail) + ','
        useritem['detail'] = detail_str[:-1]

        useritem['sign'] = response.xpath(
            '//dd[@class="person-sign"]/text()').extract()[0]
        #useritem['scores_label'] =
        useritem['number_focus'] = int(
            response.xpath(
                '//div[@class="focus"]/div[1]/span/text()').extract()[0])
        useritem['number_befoucs'] = int(
            response.xpath('//div[@class="focus beFocus"]/div[1]/span/text()').
            extract()[0])

        for focus_user in response.xpath(
                '//div[@class="focus"]/div[2]//@href').extract():
            focus_str = focus_str + focus_user + ','
        useritem['focus_userID'] = focus_str[:-1]

        for befocus_user in response.xpath(
                '//div[@class="focus beFocus"]/div[2]//@href').extract():
            befoucs_str = befoucs_str + befocus_user + ','
        useritem['befocus_userID'] = befoucs_str[:-1]

        yield useritem
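
The separator pattern compiled at the top of parse() matches any run of newlines, tabs, carriage returns and spaces; sub('', ...) strips such runs from the profile details here, while other spiders in this set use sub(' ', ...) to collapse them to a single space. A small standalone check of that behaviour (input strings invented for illustration):

import re

separator = re.compile('((\\n|\\t|\\r|( ))+( )*)+')

print(repr(separator.sub('', '\n\t  Beijing  \r\n')))   # -> 'Beijing'
print(repr(separator.sub(' ', ' a \n\n b ')))           # -> ' a b '
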
Example #3
class Userspider(scrapy.Spider):
    name = 'user_spider'
    allowed_domains = ['my.csdn.net']
    start_urls = []

    sql = 'select userID from author where user_crawl = 0'
    dbquery = DB_mysql()
    result = dbquery.query(sql)
    for userID in result:
        start_urls.append('http://my.csdn.net/' + userID[0])

    def parse(self, response):
        separator = re.compile('((\\n|\\t|\\r|( ))+( )*)+')
        useritem = UserItem()
        detail_str = ''
        focus_str = ''
        befoucs_str = ''

        dbupdate = DB_mysql()
        dbupdate.update(response.url.split('/')[-1], 2)

        useritem['userID'] = response.url.split('/')[-1]
        useritem['link'] = response.url
        useritem['nickname'] = response.xpath(
            '//dt[@class="person-nick-name"]/span/text()').extract()[0]

        for detail in response.xpath(
                '//dd[@class="person-detail"]/text()').extract()[:-1]:
            detail_str = detail_str + separator.sub('', detail) + ','
        useritem['detail'] = detail_str[:-1]

        useritem['sign'] = response.xpath(
            '//dd[@class="person-sign"]/text()').extract()[0]
        #useritem['scores_label'] =
        useritem['number_focus'] = int(
            response.xpath(
                '//div[@class="focus"]/div[1]/span/text()').extract()[0])
        useritem['number_befoucs'] = int(
            response.xpath('//div[@class="focus beFocus"]/div[1]/span/text()').
            extract()[0])

        for focus_user in response.xpath(
                '//div[@class="focus"]/div[2]//@href').extract():
            focus_str = focus_str + focus_user + ','
        useritem['focus_userID'] = focus_str[:-1]

        for befocus_user in response.xpath(
                '//div[@class="focus beFocus"]/div[2]//@href').extract():
            befoucs_str = befoucs_str + befocus_user + ','
        useritem['befocus_userID'] = befoucs_str[:-1]

        yield useritem
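
Userspider fills start_urls at class-definition time by querying the author table through DB_mysql, and parse() later calls update() and check() on the same helper. DB_mysql itself is not shown in any of these examples; one plausible shape for it, assuming pymysql and the db_csdn schema referenced elsewhere, is sketched below. The connection parameters and the meaning of the numeric flag are assumptions.

# hypothetical DB_mysql helper -- not the project's real implementation
import pymysql

class DB_mysql(object):
    def __init__(self):
        self.conn = pymysql.connect(host='localhost', user='root',
                                    password='', db='db_csdn',
                                    charset='utf8')

    def query(self, sql):
        # used to seed start_urls, e.g. "select userID from author where ..."
        with self.conn.cursor() as cursor:
            cursor.execute(sql)
            return cursor.fetchall()

    def check(self, userID):
        # True when the user is not yet in the author table, i.e. the spider
        # should emit a new AuthorItem for it
        with self.conn.cursor() as cursor:
            cursor.execute('select 1 from author where userID = %s', (userID,))
            return cursor.fetchone() is None

    def update(self, userID, which):
        # mark an author as visited; the spiders pass 1 from blog pages and 2
        # from profile pages, so map those to blog_crawl/user_crawl (assumed)
        column = 'blog_crawl' if which == 1 else 'user_crawl'
        with self.conn.cursor() as cursor:
            cursor.execute(
                'update author set {} = 1 where userID = %s'.format(column),
                (userID,))
        self.conn.commit()
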
Example #4
    def parse(self, response):
        # Update the author's crawl status in MySQL, then re-dispatch the same
        # URL so parse_bloglist can walk the article list on this page.
        dbupdate = DB_mysql()
        dbupdate.update(response.url.split('/')[-1], 1)
        yield Request(response.url, callback=self.parse_bloglist)
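
Whether the re-issued request for response.url survives depends on Scrapy's duplicate filter. If it is ever dropped, a defensive variant (not in the original code) is to exempt that single request from the filter:

    def parse(self, response):
        dbupdate = DB_mysql()
        dbupdate.update(response.url.split('/')[-1], 1)
        # dont_filter=True bypasses the default RFPDupeFilter for this request
        yield Request(response.url,
                      callback=self.parse_bloglist,
                      dont_filter=True)
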
Example #5
class Blogspider(scrapy.Spider):
    name = 'blog_spider'
    allowed_domains = ['blog.csdn.net']
    start_urls = []

    sql = 'select userID from author where blog_crawl = 0'
    dbquery = DB_mysql()
    result = dbquery.query(sql)
    for userID in result:
        start_urls.append('http://blog.csdn.net/' + userID[0])

    def parse(self, response):
        dbupdate = DB_mysql()
        dbupdate.update(response.url.split('/')[-1], 1)
        yield Request(response.url, callback=self.parse_bloglist)

    def parse_bloglist(self, response):
        for url in response.xpath(
                '//span[@class="link_title"]/descendant::a/@href').extract():
            blog_url = 'http://blog.csdn.net' + url
            #print blog_url
            yield Request(blog_url, callback=self.parse_blog)
        if response.xpath('//div[@class="pagelist"]/a/text()'):
            if response.xpath('//div[@class="pagelist"]/a/text()').extract(
            )[-2] == u'下一页':
                nextpage_url = 'http://blog.csdn.net' + response.xpath(
                    '//div[@class="pagelist"]/a/@href').extract()[-2]
                # print nextpage_url
                yield Request(nextpage_url, callback=self.parse)

    def parse_blog(self, response):
        blogitem = BlogItem()
        separator = re.compile('((\\n|\\t|\\r|( ))+( )*)+')
        category_str = ' '
        content_str = ''
        count = 0

        blogitem['blogID'] = response.url.split('/')[-1]
        blogitem['userID'] = response.url.split('/')[-4]
        blogitem['link'] = response.url
        blogitem['title'] = separator.sub(
            ' ',
            response.xpath(
                '//div[@class="article_title"]//a/text()').extract()[-1])
        blogitem['time'] = response.xpath(
            '//span[@class="link_postdate"]/text()').extract()[0]
        blogitem['number_read'] = int(
            re.sub(
                r'\D', "",
                response.xpath('//span[@class="link_view"]/text()').extract()
                [0]))
        blogitem['number_comment'] = int(
            re.sub(
                r'\D', "",
                response.xpath(
                    '//span[@class="link_comments"]/text()').extract()[1]))
        blogitem['number_ding'] = int(
            re.sub(
                r'\D', "",
                response.xpath('//dl[@id="btnDigg"]/dd/text()').extract()[0]))
        blogitem['number_cai'] = int(
            re.sub(
                r'\D', "",
                response.xpath('//dl[@id="btnBury"]/dd/text()').extract()[0]))
        blogitem['article_type'] = response.xpath(
            '//div[@class="article_title"]/span/@class').extract()[0].split(
                '_')[-1]

        if response.xpath('//span [@class="link_categories"]/a//text()'):
            for category in response.xpath(
                    '//span [@class="link_categories"]/a//text()').extract():
                category_str = category_str + category + '/'
        blogitem['category'] = category_str[:-1]

        content_list = response.xpath(
            '//div[@id="article_content"]//text()').extract()
        for content in content_list:
            content_str = content_str + content
        blogitem['content'] = content_str
        #print content_str.encode('GBK', 'ignore')

        blogitem['content_xml'] = response.xpath(
            '//div[@id="article_content"]').extract()[0]

        yield blogitem

        if int(
                re.sub(
                    r'\D', "",
                    response.xpath('//span[@class="link_comments"]/text()').
                    extract()[1])) > 0:
            comment_url = 'http://blog.csdn.net/' + response.url.split(
                '/')[-4] + '/comment/list/' + response.url.split('/')[-1]
            #print comment_url
            yield Request(comment_url,
                          callback=self.parse_blogcomment,
                          meta={
                              'author': response.url.split('/')[-4],
                              'link': response.url
                          })

            if response.xpath('//pre').extract():
                for code in response.xpath('//pre'):
                    codeitem = CodeItem()
                    count = count + 1
                    codeitem['codeID'] = response.url.split('/')[
                        -4] + response.url.split('/')[-1] + '_' + str(count)
                    codeitem['userID'] = response.url.split('/')[-4]
                    codeitem['link'] = response.url
                    if code.xpath('code/@class'):
                        codeitem['language'] = code.xpath('@class').extract(
                        )[0] + ' ' + code.xpath('code/@class').extract()[0]
                    elif code.xpath('@class'):
                        codeitem['language'] = code.xpath(
                            '@class').extract()[0]
                    else:
                        codeitem['language'] = ''
                    codeitem['code'] = ''.join(
                        code.xpath('descendant::text()').extract())
                    yield codeitem

    def parse_blogcomment(self, response):
        blogcommentitem = BlogCommentItem()
        comment_list = json.loads(response.body)['list']

        for comment in comment_list:
            blogcommentitem['blogcommentID'] = comment['CommentId']
            blogcommentitem['userID'] = comment['UserName']

            blogcommentitem['link'] = response.meta['link']
            blogcommentitem['blogID'] = comment['ArticleId']
            blogcommentitem['authorID'] = response.meta['author']
            blogcommentitem['time'] = comment['PostTime']

            blogcommentitem['content'] = comment['Content']

            if re.findall(r'\[reply\]', comment['Content']):
                blogcommentitem['commenttoID'] = re.findall(
                    r'\[reply\](.*)\[/reply\]', comment['Content'])[0]
            else:
                blogcommentitem['commenttoID'] = response.meta['author']

            yield blogcommentitem

            dbcheck = DB_mysql()
            if dbcheck.check(blogcommentitem['userID']):
                authorItem = AuthorItem()
                authorItem['userID'] = blogcommentitem['userID']
                authorItem[
                    'link'] = 'http://my.csdn.net/' + blogcommentitem['userID']
                authorItem['blog_crawl'] = 0
                authorItem['user_crawl'] = 0
                yield authorItem
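
The AuthorItems yielded above only close the loop if an item pipeline writes them back into the author table that seeds the blog and user spiders. That pipeline is not included in these examples; a minimal sketch of what it could look like, again assuming pymysql and the db_csdn.author schema, is:

# pipelines.py -- hypothetical sketch, not the project's actual pipeline
import pymysql

class AuthorPipeline(object):
    def open_spider(self, spider):
        self.conn = pymysql.connect(host='localhost', user='root',
                                    password='', db='db_csdn',
                                    charset='utf8')

    def close_spider(self, spider):
        self.conn.close()

    def process_item(self, item, spider):
        # only AuthorItem carries both crawl flags; pass every other item on
        if 'blog_crawl' in item.fields and 'user_crawl' in item.fields:
            with self.conn.cursor() as cursor:
                cursor.execute(
                    'insert ignore into author '
                    '(userID, link, blog_crawl, user_crawl) '
                    'values (%s, %s, %s, %s)',
                    (item['userID'], item['link'],
                     item['blog_crawl'], item['user_crawl']))
            self.conn.commit()
        return item
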
Example #6
class BBSspider(scrapy.Spider):
    name = 'bbs_spider'
    #download_delay = 1.75
    allowed_domains = ['bbs.csdn.net']
    start_urls = []

    sql = 'SELECT link FROM db_csdn.bbs_topic order by bbstopicID desc;'
    dbquery = DB_mysql()
    result = dbquery.query(sql)
    for bbs_link in result:
        start_urls.append(bbs_link[0])
        start_urls.append(bbs_link[0] + '/closed')

    def parse(self, response):
        post_list = response.xpath('//tr')
        for post in post_list[1:-1]:
            title = post.xpath('td[1]/a/text()').extract()[0]
            point = int(post.xpath('td[2]/text()').extract()[0])
            number_reply = int(post.xpath('td[4]/text()').extract()[0])
            update_time = post.xpath('td[5]//span/text()').extract()[0]

            yield Request('http://bbs.csdn.net' +
                          post.xpath('td[1]/a/@href').extract()[0],
                          callback=self.parse_bbs,
                          meta={
                              'title': title,
                              'point': point,
                              'number_reply': number_reply,
                              'update_time': update_time
                          })

        if response.xpath('//a[@class="next"]/@href'):
            nextpage_url = 'http://bbs.csdn.net' + response.xpath(
                '//a[@class="next"]/@href').extract()[0]
            yield Request(nextpage_url, callback=self.parse)

    def parse_bbs(self, response):
        separator = re.compile('((\\n|\\t|\\r|( ))+( )*)+')
        start_floor = 0
        if not re.search('page', response.url):
            start_floor = 2
            bbspostitem = BBSPostItem()
            tag_str = ' '
            bbspostitem['bbspostID'] = response.url.split('/')[-1]

            bbspostitem['userID'] = response.xpath(
                '//div[@class="detailed"]/table[1]//dd[@class="username"]/a/text()'
            ).extract()[0]

            dbcheck = DB_mysql()
            if dbcheck.check(bbspostitem['userID']):
                authorItem = AuthorItem()
                authorItem['userID'] = bbspostitem['userID']
                authorItem[
                    'link'] = 'http://my.csdn.net/' + bbspostitem['userID']
                authorItem['blog_crawl'] = 0
                authorItem['user_crawl'] = 0
                yield authorItem

            bbspostitem['link'] = response.url
            bbspostitem['title'] = response.meta['title']

            if response.xpath(
                    '//div[@class="detailed"]/table[1]//div[@class="tag"]/span/a/text()'
            ):
                for tag in response.xpath(
                        '//div[@class="detailed"]/table[1]//div[@class="tag"]/span/a/text()'
                ).extract():
                    tag_str = tag_str + tag + '/'
            bbspostitem['tag'] = tag_str[:-1]

            bbspostitem['point'] = response.meta['point']
            bbspostitem['number_reply'] = response.meta['number_reply']
            bbspostitem['number_ding'] = int(
                re.sub(
                    r'\D', "",
                    response.xpath(
                        '//div[@class="detailed"]/table[1]//div[@class="fr"]/a[@class="red digg"]/text()'
                    ).extract()[0]))
            bbspostitem['number_cai'] = int(
                re.sub(
                    r'\D', "",
                    response.xpath(
                        '//div[@class="detailed"]/table[1]//div[@class="fr"]/a[@class="bury"]/text()'
                    ).extract()[0]))
            bbspostitem['time'] = re.findall(
                r'\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d',
                separator.sub(
                    ' ', ' '.join(
                        response.xpath(
                            '//div[@class="detailed"]/table[1]//span[@class="time"]/text()'
                        ).extract())))[0]
            bbspostitem['update_time'] = response.meta['update_time']
            bbspostitem['content'] = separator.sub(
                ' ', ''.join(
                    response.xpath(
                        '//div[@class="detailed"]/table[1]//div[@class="post_body"]/text()|//div[@class="detailed"]/table[1]//div[@class="post_body"]/a/text()'
                    ).extract()))
            bbspostitem['content_xml'] = response.xpath(
                '//div[@class="detailed"]/table[1]//div[@class="post_body"]'
            ).extract()[0]

            yield bbspostitem

            if response.xpath(
                    '//div[@class="detailed"]/table[1]//pre').extract():
                count = 0
                for code in response.xpath(
                        '//div[@class="detailed"]/table[1]//pre'):
                    codeitem = CodeItem()
                    codeitem['codeID'] = bbspostitem['userID'] + bbspostitem[
                        'bbspostID'] + '_' + str(count)
                    codeitem['userID'] = bbspostitem['userID']
                    codeitem['link'] = bbspostitem['link']
                    if code.xpath('@class').extract():
                        codeitem['language'] = code.xpath(
                            '@class').extract()[0]
                    else:
                        codeitem['language'] = ''
                    # join the text nodes so the field holds a string, not a list
                    codeitem['code'] = ''.join(code.xpath('text()').extract())
                    yield codeitem
                    count = count + 1

        for reply in response.xpath(
                '//div[@class="detailed"]//table')[start_floor:]:
            bbsreplyitem = BBSReplyItem()
            bbsreplyitem['bbsreplyID'] = reply.xpath('@id').extract()[0].split(
                '-')[-1]
            bbsreplyitem['userID'] = reply.xpath(
                'descendant::dd[@class="username"]/a/text()').extract()[0]

            dbcheck = DB_mysql()
            if dbcheck.check(bbsreplyitem['userID']):
                authorItem = AuthorItem()
                authorItem['userID'] = bbsreplyitem['userID']
                authorItem[
                    'link'] = 'http://my.csdn.net/' + bbsreplyitem['userID']
                authorItem['blog_crawl'] = 0
                authorItem['user_crawl'] = 0
                yield authorItem

            bbsreplyitem['link'] = response.url + reply.xpath(
                'descendant::span[@class="fr"]/a/@href').extract()[0]

            if re.search('page', response.url):
                bbsreplyitem['replytoID'] = response.meta['replytoID']
            else:
                bbsreplyitem['replytoID'] = bbspostitem['userID']

            bbsreplyitem['score'] = int(
                re.sub(
                    r'\D', "",
                    reply.xpath(
                        'descendant::span[@class="fr"]/text()').extract()[1]))

            if reply.xpath(
                    'descendant::div[@class="fr"]/a[@class="red digg"]'):
                bbsreplyitem['number_ding'] = int(
                    re.sub(
                        r'\D', "",
                        reply.xpath(
                            'descendant::div[@class="fr"]/a[@class="red digg"]/text()'
                        ).extract()[0]))
            else:
                bbsreplyitem['number_ding'] = 0

            if reply.xpath('descendant::div[@class="fr"]/a[@class="bury"]'):
                bbsreplyitem['number_cai'] = int(
                    re.sub(
                        r'\D', "",
                        reply.xpath(
                            'descendant::div[@class="fr"]/a[@class="bury"]/text()'
                        ).extract()[0]))
            else:
                bbsreplyitem['number_cai'] = 0

            bbsreplyitem['time'] = re.findall(
                r'\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d',
                separator.sub(
                    ' ', ' '.join(
                        reply.xpath('descendant::span[@class="time"]/text()').
                        extract())))[0]
            bbsreplyitem['content'] = separator.sub(
                ' ', ''.join(
                    reply.xpath(
                        'descendant::div[@class="post_body"]/text()|descendant::div[@class="post_body"]/a/text()'
                    ).extract()))
            bbsreplyitem['content_xml'] = reply.xpath(
                'descendant::div[@class="post_body"]').extract()[0]

            yield bbsreplyitem

            if reply.xpath('descendant::pre').extract():
                count = 0
                for code in reply.xpath('descendant::pre'):
                    codeitem = CodeItem()
                    codeitem['codeID'] = bbsreplyitem['userID'] + bbsreplyitem[
                        'bbsreplyID'] + '_' + str(count)
                    codeitem['userID'] = bbsreplyitem['userID']
                    codeitem['link'] = bbsreplyitem['link']
                    if code.xpath('@class').extract():
                        codeitem['language'] = code.xpath(
                            '@class').extract()[0]
                    else:
                        codeitem['language'] = ''
                    codeitem['code'] = code.xpath('text()').extract()[0]
                    yield codeitem
                    count = count + 1

        if response.xpath('//a[@class="next"]/@href'):
            nextpage_url = 'http://bbs.csdn.net' + response.xpath(
                '//a[@class="next"]/@href').extract()[0]
            # bbspostitem is only built on the first page; on later pages the
            # topic author's ID arrives via meta, so pass it along again
            if re.search('page', response.url):
                replyto_id = response.meta['replytoID']
            else:
                replyto_id = bbspostitem['userID']
            yield Request(nextpage_url,
                          callback=self.parse_bbs,
                          meta={'replytoID': replyto_id})
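
CodeItem is filled in the same way by the blog, bbs and ask spiders. Its definition is not part of the excerpt; a sketch that matches the fields used (hypothetical, derived only from the assignments above) would be:

# hypothetical sketch of the shared code-snippet item
import scrapy

class CodeItem(scrapy.Item):
    codeID = scrapy.Field()    # built as "<userID><postID>_<n>" above
    userID = scrapy.Field()
    link = scrapy.Field()
    language = scrapy.Field()  # taken from the pre/code class attribute
    code = scrapy.Field()
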
Example #7
    def parse_bbs(self, response):
        separator = re.compile('((\\n|\\t|\\r|( ))+( )*)+')
        start_floor = 0
        if not re.search('page', response.url):
            start_floor = 2
            bbspostitem = BBSPostItem()
            tag_str = ' '
            bbspostitem['bbspostID'] = response.url.split('/')[-1]

            bbspostitem['userID'] = response.xpath(
                '//div[@class="detailed"]/table[1]//dd[@class="username"]/a/text()'
            ).extract()[0]

            dbcheck = DB_mysql()
            if dbcheck.check(bbspostitem['userID']):
                authorItem = AuthorItem()
                authorItem['userID'] = bbspostitem['userID']
                authorItem[
                    'link'] = 'http://my.csdn.net/' + bbspostitem['userID']
                authorItem['blog_crawl'] = 0
                authorItem['user_crawl'] = 0
                yield authorItem

            bbspostitem['link'] = response.url
            bbspostitem['title'] = response.meta['title']

            if response.xpath(
                    '//div[@class="detailed"]/table[1]//div[@class="tag"]/span/a/text()'
            ):
                for tag in response.xpath(
                        '//div[@class="detailed"]/table[1]//div[@class="tag"]/span/a/text()'
                ).extract():
                    tag_str = tag_str + tag + '/'
            bbspostitem['tag'] = tag_str[:-1]

            bbspostitem['point'] = response.meta['point']
            bbspostitem['number_reply'] = response.meta['number_reply']
            bbspostitem['number_ding'] = int(
                re.sub(
                    r'\D', "",
                    response.xpath(
                        '//div[@class="detailed"]/table[1]//div[@class="fr"]/a[@class="red digg"]/text()'
                    ).extract()[0]))
            bbspostitem['number_cai'] = int(
                re.sub(
                    r'\D', "",
                    response.xpath(
                        '//div[@class="detailed"]/table[1]//div[@class="fr"]/a[@class="bury"]/text()'
                    ).extract()[0]))
            bbspostitem['time'] = re.findall(
                r'\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d',
                separator.sub(
                    ' ', ' '.join(
                        response.xpath(
                            '//div[@class="detailed"]/table[1]//span[@class="time"]/text()'
                        ).extract())))[0]
            bbspostitem['update_time'] = response.meta['update_time']
            bbspostitem['content'] = separator.sub(
                ' ', ''.join(
                    response.xpath(
                        '//div[@class="detailed"]/table[1]//div[@class="post_body"]/text()|//div[@class="detailed"]/table[1]//div[@class="post_body"]/a/text()'
                    ).extract()))
            bbspostitem['content_xml'] = response.xpath(
                '//div[@class="detailed"]/table[1]//div[@class="post_body"]'
            ).extract()[0]

            yield bbspostitem

            if response.xpath(
                    '//div[@class="detailed"]/table[1]//pre').extract():
                count = 0
                for code in response.xpath(
                        '//div[@class="detailed"]/table[1]//pre'):
                    codeitem = CodeItem()
                    codeitem['codeID'] = bbspostitem['userID'] + bbspostitem[
                        'bbspostID'] + '_' + str(count)
                    codeitem['userID'] = bbspostitem['userID']
                    codeitem['link'] = bbspostitem['link']
                    if code.xpath('@class').extract():
                        codeitem['language'] = code.xpath(
                            '@class').extract()[0]
                    else:
                        codeitem['language'] = ''
                    # join the text nodes so the field holds a string, not a list
                    codeitem['code'] = ''.join(code.xpath('text()').extract())
                    yield codeitem
                    count = count + 1

        for reply in response.xpath(
                '//div[@class="detailed"]//table')[start_floor:]:
            bbsreplyitem = BBSReplyItem()
            bbsreplyitem['bbsreplyID'] = reply.xpath('@id').extract()[0].split(
                '-')[-1]
            bbsreplyitem['userID'] = reply.xpath(
                'descendant::dd[@class="username"]/a/text()').extract()[0]

            dbcheck = DB_mysql()
            if dbcheck.check(bbsreplyitem['userID']):
                authorItem = AuthorItem()
                authorItem['userID'] = bbsreplyitem['userID']
                authorItem[
                    'link'] = 'http://my.csdn.net/' + bbsreplyitem['userID']
                authorItem['blog_crawl'] = 0
                authorItem['user_crawl'] = 0
                yield authorItem

            bbsreplyitem['link'] = response.url + reply.xpath(
                'descendant::span[@class="fr"]/a/@href').extract()[0]

            if re.search('page', response.url):
                bbsreplyitem['replytoID'] = response.meta['replytoID']
            else:
                bbsreplyitem['replytoID'] = bbspostitem['userID']

            bbsreplyitem['score'] = int(
                re.sub(
                    r'\D', "",
                    reply.xpath(
                        'descendant::span[@class="fr"]/text()').extract()[1]))

            if reply.xpath(
                    'descendant::div[@class="fr"]/a[@class="red digg"]'):
                bbsreplyitem['number_ding'] = int(
                    re.sub(
                        r'\D', "",
                        reply.xpath(
                            'descendant::div[@class="fr"]/a[@class="red digg"]/text()'
                        ).extract()[0]))
            else:
                bbsreplyitem['number_ding'] = 0

            if reply.xpath('descendant::div[@class="fr"]/a[@class="bury"]'):
                bbsreplyitem['number_cai'] = int(
                    re.sub(
                        r'\D', "",
                        reply.xpath(
                            'descendant::div[@class="fr"]/a[@class="bury"]/text()'
                        ).extract()[0]))
            else:
                bbsreplyitem['number_cai'] = 0

            bbsreplyitem['time'] = re.findall(
                r'\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d',
                separator.sub(
                    ' ', ' '.join(
                        reply.xpath('descendant::span[@class="time"]/text()').
                        extract())))[0]
            bbsreplyitem['content'] = separator.sub(
                ' ', ''.join(
                    reply.xpath(
                        'descendant::div[@class="post_body"]/text()|descendant::div[@class="post_body"]/a/text()'
                    ).extract()))
            bbsreplyitem['content_xml'] = reply.xpath(
                'descendant::div[@class="post_body"]').extract()[0]

            yield bbsreplyitem

            if reply.xpath('descendant::pre').extract():
                count = 0
                for code in reply.xpath('descendant::pre'):
                    codeitem = CodeItem()
                    codeitem['codeID'] = bbsreplyitem['userID'] + bbsreplyitem[
                        'bbsreplyID'] + '_' + str(count)
                    codeitem['userID'] = bbsreplyitem['userID']
                    codeitem['link'] = bbsreplyitem['link']
                    if code.xpath('@class').extract():
                        codeitem['language'] = code.xpath(
                            '@class').extract()[0]
                    else:
                        codeitem['language'] = ''
                    codeitem['code'] = code.xpath('text()').extract()[0]
                    yield codeitem
                    count = count + 1

        if response.xpath('//a[@class="next"]/@href'):
            nextpage_url = 'http://bbs.csdn.net' + response.xpath(
                '//a[@class="next"]/@href').extract()[0]
            # bbspostitem is only built on the first page; on later pages the
            # topic author's ID arrives via meta, so pass it along again
            if re.search('page', response.url):
                replyto_id = response.meta['replytoID']
            else:
                replyto_id = bbspostitem['userID']
            yield Request(nextpage_url,
                          callback=self.parse_bbs,
                          meta={'replytoID': replyto_id})
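
The recurring int(re.sub(r'\D', "", ...)) calls strip everything but digits before converting, so a label such as '[12]' becomes 12; if the extracted text carries no digit at all, int('') raises ValueError. A small hedged helper that the repeated call sites could share (not part of the original code) is:

import re

def count_from_label(text, default=0):
    # keep only the digits, e.g. '[12]' -> '12'; fall back to a default when
    # the label contains no number at all
    digits = re.sub(r'\D', '', text)
    return int(digits) if digits else default
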
Example #8
    def parse_question(self, response):
        separator = re.compile('((\\n|\\t|\\r|( ))+( )*)+')
        askerID = ''
        answer_accepted = True

        if not re.search('page', response.url):
            qaquestionitem = QAquestionItem()
            tag_str = ' '

            qaquestionitem['qaquestionID'] = response.url.split('/')[-1]
            qaquestionitem['userID'] = response.meta['user']
            askerID = qaquestionitem['userID']

            dbcheck = DB_mysql()
            if dbcheck.check(qaquestionitem['userID']):
                authorItem = AuthorItem()
                authorItem['userID'] = qaquestionitem['userID']
                authorItem[
                    'link'] = 'http://my.csdn.net/' + qaquestionitem['userID']
                authorItem['blog_crawl'] = 0
                authorItem['user_crawl'] = 0
                yield authorItem

            qaquestionitem['link'] = response.url
            qaquestionitem['title'] = response.xpath(
                '//div[@class="questions_detail_con"]//dt/text()').extract()[0]

            if response.xpath('//div[@class="tags"]/a/text()'):
                for tag in response.xpath(
                        '//div[@class="tags"]/a/text()').extract():
                    tag_str = tag_str + tag + '/'
                qaquestionitem['tag'] = tag_str[:-1]
            else:
                qaquestionitem['tag'] = ''

            qaquestionitem['point'] = response.meta['point']
            qaquestionitem['number_answer'] = response.meta['number_answer']
            qaquestionitem['number_read'] = response.meta['number_read']
            qaquestionitem['number_collect'] = response.meta['number_collect']
            qaquestionitem['number_alsoask'] = response.meta['number_alsoask']

            if response.meta['answer_accepted'] == u'已有满意答案':
                qaquestionitem['answer_accepted'] = 'yes'
            elif response.meta['answer_accepted'] == u'暂无满意答案':
                qaquestionitem['answer_accepted'] = 'no'
                answer_accepted = False

            qaquestionitem['time'] = re.findall(
                r'\d\d\d\d.\d\d.\d\d \d\d:\d\d',
                separator.sub(
                    ' ', ' '.join(
                        response.xpath('//div[@class="q_operate"]//span/text()'
                                       ).extract())))[0]

            if response.xpath(
                    '//div[@class="questions_detail_con"]/dl/dd/p/text()'):
                qaquestionitem['content'] = response.xpath(
                    '//div[@class="questions_detail_con"]/dl/dd/p/text()'
                ).extract()[0]
            else:
                qaquestionitem['content'] = ''

            qaquestionitem['content_xml'] = response.xpath(
                '//div[@class="questions_detail_con"]/dl/dd').extract()[0]

            yield qaquestionitem

            if response.xpath('//div[@class="questions_detail_con"]//pre'):
                codeitem = CodeItem()
                codeitem['codeID'] = qaquestionitem['userID'] + qaquestionitem[
                    'qaquestionID']
                codeitem['userID'] = qaquestionitem['userID']
                codeitem['link'] = qaquestionitem['link']
                codeitem['language'] = ''
                codeitem['code'] = re.split(
                    r'</code>',
                    re.split(
                        r'<code>',
                        response.xpath(
                            '//div[@class="questions_detail_con"]//pre/code').
                        extract()[0])[1])[0]

                yield codeitem

            if answer_accepted:
                qaansweritem = QAanswerItem()
                qaansweritem['qaanswerID'] = response.xpath(
                    '//div[@class="\n      answer_accept\n      "]/@id'
                ).extract()[0].split('_')[-1]
                qaansweritem['userID'] = response.xpath(
                    '//div[@class="\n      answer_accept\n      "]//a[@class="user_name"]/text()'
                ).extract()[0]

                dbcheck = DB_mysql()
                if dbcheck.check(qaansweritem['userID']):
                    authorItem = AuthorItem()
                    authorItem['userID'] = qaansweritem['userID']
                    authorItem['link'] = 'http://my.csdn.net/' + qaansweritem[
                        'userID']
                    authorItem['blog_crawl'] = 0
                    authorItem['user_crawl'] = 0
                    yield authorItem

                qaansweritem['link'] = response.url
                qaansweritem['answertoID'] = askerID
                qaansweritem['number_ding'] = int(
                    response.xpath(
                        '//div[@class="\n      answer_accept\n      "]//a[@class="praise"]/label/text()'
                    ).extract()[0])
                qaansweritem['number_cai'] = int(
                    response.xpath(
                        '//div[@class="\n      answer_accept\n      "]//a[@class="stamp"]/label/text()'
                    ).extract()[0])
                qaansweritem['number_comment'] = int(
                    re.sub(
                        r'\D', "",
                        response.xpath(
                            '//div[@class="\n      answer_accept\n      "]//a[@class="collection"]/text()'
                        ).extract()[0]))
                qaansweritem['best_answer'] = 'yes'
                qaansweritem['time'] = \
                response.xpath('//div[@class="\n      answer_accept\n      "]//span[@class="adopt_time"]/text()').extract()[0]
                if response.xpath(
                        '//div[@class="\n      answer_accept\n      "]/div[1]/p'
                ):
                    qaansweritem['content'] = separator.sub(
                        ' ', ''.join(
                            response.xpath(
                                '//div[@class="\n      answer_accept\n      "]/div[1]/p/text()'
                            ).extract()))
                else:
                    qaansweritem['content'] = ''
                qaansweritem['content_xml'] = response.xpath(
                    '//div[@class="\n      answer_accept\n      "]/div[1]'
                ).extract()[0]

                yield qaansweritem

                if response.xpath(
                        '//div[@class="\n      answer_accept\n      "]//pre'):
                    codeitem = CodeItem()
                    codeitem['codeID'] = qaansweritem['userID'] + qaansweritem[
                        'qaanswerID']
                    codeitem['userID'] = qaansweritem['userID']
                    codeitem['link'] = qaansweritem['link']
                    codeitem['language'] = ''
                    codeitem['code'] = ''.join(
                        response.xpath(
                            '//div[@class="\n      answer_accept\n      "]//pre/code/text()'
                        ).extract()[0])

                    yield codeitem

        for answer in response.xpath(
                '//div[@class="\n      answer_detail_con\n      "]'):
            qaansweritem = QAanswerItem()
            qaansweritem['qaanswerID'] = answer.xpath(
                '@id').extract()[0].split('_')[-1]
            qaansweritem['userID'] = answer.xpath(
                'descendant::a[@class="user_name"]/text()').extract()[0]

            # dbcheck from the question branch does not exist on paginated
            # answer pages, so create one here as well
            dbcheck = DB_mysql()
            if dbcheck.check(qaansweritem['userID']):
                authorItem = AuthorItem()
                authorItem['userID'] = qaansweritem['userID']
                authorItem[
                    'link'] = 'http://my.csdn.net/' + qaansweritem['userID']
                authorItem['blog_crawl'] = 0
                authorItem['user_crawl'] = 0
                yield authorItem

            qaansweritem['link'] = response.url
            qaansweritem['answertoID'] = askerID
            qaansweritem['number_ding'] = int(
                answer.xpath('descendant::a[@class="praise"]/label/text()').
                extract()[0])
            qaansweritem['number_cai'] = int(
                answer.xpath(
                    'descendant::a[@class="stamp"]/label/text()').extract()[0])
            qaansweritem['number_comment'] = int(
                re.sub(
                    r'\D', "",
                    answer.xpath('descendant::a[@class="collection"]/text()').
                    extract()[0]))
            qaansweritem['best_answer'] = 'no'
            qaansweritem['time'] = answer.xpath(
                'descendant::span[@class="adopt_time"]/text()').extract()[0]
            if answer.xpath('div[1]/p'):
                qaansweritem['content'] = separator.sub(
                    ' ', ''.join(answer.xpath('div[1]/p/text()').extract()))
            else:
                qaansweritem['content'] = ''
            qaansweritem['content_xml'] = answer.xpath('div[1]').extract()[0]

            yield qaansweritem

            if answer.xpath('descendant::pre').extract():
                codeitem = CodeItem()
                codeitem['codeID'] = qaansweritem['userID'] + qaansweritem[
                    'qaanswerID']
                codeitem['userID'] = qaansweritem['userID']
                codeitem['link'] = qaansweritem['link']
                codeitem['language'] = ''
                codeitem['code'] = ''.join(
                    answer.xpath('descendant::pre/code/text()').extract())

                yield codeitem

        if response.xpath('//a[@rel="next"]'):
            nextpage_url = 'http://ask.csdn.net' + response.xpath(
                '//a[@rel="next"]/@href').extract()[0]
            print(nextpage_url)
            yield Request(nextpage_url, callback=self.parse_question)
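
parse_question pulls the question's code block by splitting the serialized HTML on '<code>' and '</code>'. An alternative that stays inside the selector API, similar to what the accepted-answer branch already does, is to join the text nodes of the first pre/code element directly; a hedged helper for that step (it drops any markup nested inside the code block, unlike the re.split version) would be:

def question_code(response):
    # join the text nodes of the first <pre><code> in the question body
    return ''.join(
        response.xpath(
            '(//div[@class="questions_detail_con"]//pre/code)[1]//text()'
        ).extract())
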