def parse_blogcomment(self, response):
    """Parse the JSON comment list of one blog article.

    Yields one BlogCommentItem per comment and, for commenters not yet
    present in the author table, an AuthorItem so their profile gets
    crawled later.
    """
    comment_list = json.loads(response.body)['list']
    # Hoisted out of the loop: one DB handle for the whole comment list
    # (the original constructed DB_mysql() per comment).
    dbcheck = DB_mysql()
    for comment in comment_list:
        # Fresh item per comment: the original mutated a single shared
        # item across yields, so any pipeline stage that buffers items
        # would see every slot overwritten with the last comment's values.
        blogcommentitem = BlogCommentItem()
        blogcommentitem['blogcommentID'] = comment['CommentId']
        blogcommentitem['userID'] = comment['UserName']
        blogcommentitem['link'] = response.meta['link']
        blogcommentitem['blogID'] = comment['ArticleId']
        blogcommentitem['authorID'] = response.meta['author']
        blogcommentitem['time'] = comment['PostTime']
        blogcommentitem['content'] = comment['Content']
        # A "[reply]user[/reply]" marker means this comment answers another
        # commenter; otherwise it addresses the article's author. Matching
        # the full pattern once also avoids the original IndexError when
        # "[reply]" appeared without a closing "[/reply]".
        reply_to = re.findall(r'\[reply\](.*)\[/reply\]', comment['Content'])
        if reply_to:
            blogcommentitem['commenttoID'] = reply_to[0]
        else:
            blogcommentitem['commenttoID'] = response.meta['author']
        yield blogcommentitem
        if dbcheck.check(blogcommentitem['userID']):
            authorItem = AuthorItem()
            authorItem['userID'] = blogcommentitem['userID']
            authorItem['link'] = 'http://my.csdn.net/' + blogcommentitem['userID']
            authorItem['blog_crawl'] = 0
            authorItem['user_crawl'] = 0
            yield authorItem
def parse(self, response):
    """Scrape one CSDN profile page (http://my.csdn.net/<userID>) into a
    UserItem and mark the user as crawled (flag 2) in the database."""
    # Collapses runs of newline/tab/CR/space characters.
    separator = re.compile('((\\n|\\t|\\r|( ))+( )*)+')
    useritem = UserItem()
    user_id = response.url.split('/')[-1]
    dbupdate = DB_mysql()
    dbupdate.update(user_id, 2)
    useritem['userID'] = user_id
    useritem['link'] = response.url
    useritem['nickname'] = response.xpath(
        '//dt[@class="person-nick-name"]/span/text()').extract()[0]
    # ','.join replaces the original quadratic string concatenation; the
    # last detail text node is deliberately dropped ([:-1]), as before.
    details = response.xpath(
        '//dd[@class="person-detail"]/text()').extract()[:-1]
    useritem['detail'] = ','.join(separator.sub('', d) for d in details)
    useritem['sign'] = response.xpath(
        '//dd[@class="person-sign"]/text()').extract()[0]
    useritem['number_focus'] = int(response.xpath(
        '//div[@class="focus"]/div[1]/span/text()').extract()[0])
    # Key keeps the original (misspelled) name for pipeline compatibility.
    useritem['number_befoucs'] = int(response.xpath(
        '//div[@class="focus beFocus"]/div[1]/span/text()').extract()[0])
    useritem['focus_userID'] = ','.join(response.xpath(
        '//div[@class="focus"]/div[2]//@href').extract())
    useritem['befocus_userID'] = ','.join(response.xpath(
        '//div[@class="focus beFocus"]/div[2]//@href').extract())
    yield useritem
class Userspider(scrapy.Spider):
    """Crawl CSDN user profile pages (my.csdn.net) for every author whose
    user_crawl flag is still 0 in the database."""
    name = 'user_spider'
    allowed_domains = ['my.csdn.net']
    start_urls = []
    # NOTE(review): this query runs as a class-body side effect at import
    # time, before the crawler starts.
    sql = 'select userID from author where user_crawl = 0'
    dbquery = DB_mysql()
    result = dbquery.query(sql)
    for userID in result:
        start_urls.append('http://my.csdn.net/' + userID[0])

    def parse(self, response):
        """Scrape one profile page into a UserItem and mark the user as
        crawled (flag 2) in the database."""
        # Collapses runs of newline/tab/CR/space characters.
        separator = re.compile('((\\n|\\t|\\r|( ))+( )*)+')
        useritem = UserItem()
        user_id = response.url.split('/')[-1]
        dbupdate = DB_mysql()
        dbupdate.update(user_id, 2)
        useritem['userID'] = user_id
        useritem['link'] = response.url
        useritem['nickname'] = response.xpath(
            '//dt[@class="person-nick-name"]/span/text()').extract()[0]
        # ','.join replaces the original quadratic string concatenation; the
        # last detail text node is deliberately dropped ([:-1]), as before.
        details = response.xpath(
            '//dd[@class="person-detail"]/text()').extract()[:-1]
        useritem['detail'] = ','.join(separator.sub('', d) for d in details)
        useritem['sign'] = response.xpath(
            '//dd[@class="person-sign"]/text()').extract()[0]
        useritem['number_focus'] = int(response.xpath(
            '//div[@class="focus"]/div[1]/span/text()').extract()[0])
        # Key keeps the original (misspelled) name for pipeline compatibility.
        useritem['number_befoucs'] = int(response.xpath(
            '//div[@class="focus beFocus"]/div[1]/span/text()').extract()[0])
        useritem['focus_userID'] = ','.join(response.xpath(
            '//div[@class="focus"]/div[2]//@href').extract())
        useritem['befocus_userID'] = ','.join(response.xpath(
            '//div[@class="focus beFocus"]/div[2]//@href').extract())
        yield useritem
def parse(self, response):
    """Mark this author's blog as in-progress (crawl flag 1), then hand the
    same page over to parse_bloglist via a new request."""
    dbupdate = DB_mysql()
    dbupdate.update(response.url.split('/')[-1], 1)
    # NOTE(review): this re-requests a URL we already hold the response
    # for — confirm the scheduler's duplicate filter does not drop it
    # (dont_filter semantics), and that the extra download is intended.
    yield Request(response.url, callback=self.parse_bloglist)
class Blogspider(scrapy.Spider):
    """Crawl CSDN blogs: per-author article list pages, each article, its
    comment JSON, and embedded <pre> code blocks."""
    name = 'blog_spider'
    allowed_domains = ['blog.csdn.net']
    start_urls = []
    # NOTE(review): this query runs as a class-body side effect at import
    # time; it seeds start_urls from authors whose blog_crawl flag is 0.
    sql = 'select userID from author where blog_crawl = 0'
    dbquery = DB_mysql()
    result = dbquery.query(sql)
    for userID in result:
        start_urls.append('http://blog.csdn.net/' + userID[0])

    def parse(self, response):
        """Mark the author's blog as in-progress (flag 1), then re-request
        the same URL so parse_bloglist handles the article list."""
        dbupdate = DB_mysql()
        dbupdate.update(response.url.split('/')[-1], 1)
        # NOTE(review): re-requests an already-downloaded URL — confirm the
        # duplicate filter does not drop it.
        yield Request(response.url, callback=self.parse_bloglist)

    def parse_bloglist(self, response):
        """Request every article linked on this list page, then follow the
        pager to the next list page if one exists."""
        for url in response.xpath(
                '//span[@class="link_title"]/descendant::a/@href').extract():
            blog_url = 'http://blog.csdn.net' + url
            #print blog_url
            yield Request(blog_url, callback=self.parse_blog)
        if response.xpath('//div[@class="pagelist"]/a/text()'):
            # The second-to-last pager link reads "下一页" ("next page") when
            # another page exists; its href sits at the same index.
            if response.xpath('//div[@class="pagelist"]/a/text()').extract(
            )[-2] == u'下一页':
                nextpage_url = 'http://blog.csdn.net' + response.xpath(
                    '//div[@class="pagelist"]/a/@href').extract()[-2]
                # print nextpage_url
                yield Request(nextpage_url, callback=self.parse)

    def parse_blog(self, response):
        """Extract one article into a BlogItem; if it has comments, request
        the comment-list JSON; yield a CodeItem per <pre> block."""
        blogitem = BlogItem()
        # Collapses runs of newline/tab/CR/space characters.
        separator = re.compile('((\\n|\\t|\\r|( ))+( )*)+')
        category_str = ' '
        content_str = ''
        count = 0
        # Presumably the URL shape is
        # http://blog.csdn.net/<userID>/article/details/<blogID> — the
        # split indices below depend on it (TODO confirm).
        blogitem['blogID'] = response.url.split('/')[-1]
        blogitem['userID'] = response.url.split('/')[-4]
        blogitem['link'] = response.url
        blogitem['title'] = separator.sub(
            ' ',
            response.xpath(
                '//div[@class="article_title"]//a/text()').extract()[-1])
        blogitem['time'] = response.xpath(
            '//span[@class="link_postdate"]/text()').extract()[0]
        # re.sub(r'\D', ...) strips everything but digits before int().
        blogitem['number_read'] = int(
            re.sub(
                r'\D', "",
                response.xpath(
                    '//span[@class="link_view"]/text()').extract()[0]))
        blogitem['number_comment'] = int(
            re.sub(
                r'\D', "",
                response.xpath(
                    '//span[@class="link_comments"]/text()').extract()[1]))
        blogitem['number_ding'] = int(
            re.sub(
                r'\D', "",
                response.xpath('//dl[@id="btnDigg"]/dd/text()').extract()[0]))
        blogitem['number_cai'] = int(
            re.sub(
                r'\D', "",
                response.xpath('//dl[@id="btnBury"]/dd/text()').extract()[0]))
        blogitem['article_type'] = response.xpath(
            '//div[@class="article_title"]/span/@class').extract()[0].split(
                '_')[-1]
        # Categories are joined with '/'; the leading ' ' in category_str is
        # kept as the original wrote it.
        if response.xpath('//span [@class="link_categories"]/a//text()'):
            for category in response.xpath(
                    '//span [@class="link_categories"]/a//text()').extract():
                category_str = category_str + category + '/'
        blogitem['category'] = category_str[:-1]
        content_list = response.xpath(
            '//div[@id="article_content"]//text()').extract()
        for content in content_list:
            content_str = content_str + content
        blogitem['content'] = content_str
        #print content_str.encode('GBK', 'ignore')
        blogitem['content_xml'] = response.xpath(
            '//div[@id="article_content"]').extract()[0]
        yield blogitem
        # Fetch the comment-list JSON only when the comment counter is
        # non-zero.
        if int(
                re.sub(
                    r'\D', "",
                    response.xpath('//span[@class="link_comments"]/text()').
                    extract()[1])) > 0:
            comment_url = 'http://blog.csdn.net/' + response.url.split(
                '/')[-4] + '/comment/list/' + response.url.split('/')[-1]
            #print comment_url
            yield Request(comment_url,
                          callback=self.parse_blogcomment,
                          meta={
                              'author': response.url.split('/')[-4],
                              'link': response.url
                          })
        # One CodeItem per <pre>; IDs are suffixed _1, _2, ...
        if response.xpath('//pre').extract():
            for code in response.xpath('//pre'):
                codeitem = CodeItem()
                count = count + 1
                codeitem['codeID'] = response.url.split('/')[
                    -4] + response.url.split('/')[-1] + '_' + str(count)
                codeitem['userID'] = response.url.split('/')[-4]
                codeitem['link'] = response.url
                # Prefer the combined <pre>/<code> class pair as the
                # language tag, falling back to <pre>'s class alone.
                if code.xpath('code/@class'):
                    codeitem['language'] = code.xpath('@class').extract(
                    )[0] + ' ' + code.xpath('code/@class').extract()[0]
                elif code.xpath('@class'):
                    codeitem['language'] = code.xpath('@class').extract()[0]
                else:
                    codeitem['language'] = ''
                codeitem['code'] = ''.join(
                    code.xpath('descendant::text()').extract())
                yield codeitem

    def parse_blogcomment(self, response):
        """Parse the article's comment-list JSON: one BlogCommentItem per
        comment, plus an AuthorItem for commenters unknown to the DB."""
        blogcommentitem = BlogCommentItem()
        comment_list = json.loads(response.body)['list']
        for comment in comment_list:
            # NOTE(review): one shared item instance is mutated and
            # re-yielded each iteration — pipelines that buffer items will
            # observe clobbered values; confirm intended.
            blogcommentitem['blogcommentID'] = comment['CommentId']
            blogcommentitem['userID'] = comment['UserName']
            blogcommentitem['link'] = response.meta['link']
            blogcommentitem['blogID'] = comment['ArticleId']
            blogcommentitem['authorID'] = response.meta['author']
            blogcommentitem['time'] = comment['PostTime']
            blogcommentitem['content'] = comment['Content']
            # "[reply]user[/reply]" marks a reply to another commenter;
            # otherwise the comment addresses the article's author.
            if re.findall(r'\[reply\]', comment['Content']):
                blogcommentitem['commenttoID'] = re.findall(
                    r'\[reply\](.*)\[/reply\]', comment['Content'])[0]
            else:
                blogcommentitem['commenttoID'] = response.meta['author']
            yield blogcommentitem
            dbcheck = DB_mysql()
            if dbcheck.check(blogcommentitem['userID']):
                authorItem = AuthorItem()
                authorItem['userID'] = blogcommentitem['userID']
                authorItem[
                    'link'] = 'http://my.csdn.net/' + blogcommentitem['userID']
                authorItem['blog_crawl'] = 0
                authorItem['user_crawl'] = 0
                yield authorItem
class BBSspider(scrapy.Spider):
    """Crawl CSDN forum (bbs.csdn.net) topic lists, thread posts, replies,
    and embedded <pre> code blocks."""
    name = 'bbs_spider'
    #download_delay = 1.75
    allowed_domains = ['bbs.csdn.net']
    start_urls = []
    # NOTE(review): runs at import time (class-body side effect); seeds the
    # spider with every stored topic-list link plus its "/closed" variant.
    sql = 'SELECT link FROM db_csdn.bbs_topic order by bbstopicID desc;'
    dbquery = DB_mysql()
    result = dbquery.query(sql)
    for bbs_link in result:
        start_urls.append(bbs_link[0])
        start_urls.append(bbs_link[0] + '/closed')

    def parse(self, response):
        """Walk one topic-list page: request every thread (carrying the
        list-page columns in request meta) and follow the "next" pager."""
        post_list = response.xpath('//tr')
        # First and last <tr> are table chrome, not threads.
        for post in post_list[1:-1]:
            title = post.xpath('td[1]/a/text()').extract()[0]
            point = int(post.xpath('td[2]/text()').extract()[0])
            number_reply = int(post.xpath('td[4]/text()').extract()[0])
            update_time = post.xpath('td[5]//span/text()').extract()[0]
            yield Request('http://bbs.csdn.net' +
                          post.xpath('td[1]/a/@href').extract()[0],
                          callback=self.parse_bbs,
                          meta={
                              'title': title,
                              'point': point,
                              'number_reply': number_reply,
                              'update_time': update_time
                          })
        if response.xpath('//a[@class="next"]/@href'):
            nextpage_url = 'http://bbs.csdn.net' + response.xpath(
                '//a[@class="next"]/@href').extract()[0]
            yield Request(nextpage_url, callback=self.parse)

    def parse_bbs(self, response):
        """Scrape one thread page.

        First page (URL without "page"): extract the opening post as a
        BBSPostItem. Every page: yield a BBSReplyItem per reply table and a
        CodeItem per <pre> block; pagination re-enters this method.
        """
        # Collapses runs of newline/tab/CR/space characters.
        separator = re.compile('((\\n|\\t|\\r|( ))+( )*)+')
        start_floor = 0
        if not re.search('page', response.url):
            # First page: the leading tables hold the opening post, so the
            # reply loop below starts at index 2.
            start_floor = 2
            bbspostitem = BBSPostItem()
            tag_str = ' '
            bbspostitem['bbspostID'] = response.url.split('/')[-1]
            bbspostitem['userID'] = response.xpath(
                '//div[@class="detailed"]/table[1]//dd[@class="username"]/a/text()'
            ).extract()[0]
            dbcheck = DB_mysql()
            # Queue unknown posters for the profile spider.
            if dbcheck.check(bbspostitem['userID']):
                authorItem = AuthorItem()
                authorItem['userID'] = bbspostitem['userID']
                authorItem[
                    'link'] = 'http://my.csdn.net/' + bbspostitem['userID']
                authorItem['blog_crawl'] = 0
                authorItem['user_crawl'] = 0
                yield authorItem
            bbspostitem['link'] = response.url
            bbspostitem['title'] = response.meta['title']
            if response.xpath(
                    '//div[@class="detailed"]/table[1]//div[@class="tag"]/span/a/text()'
            ):
                for tag in response.xpath(
                        '//div[@class="detailed"]/table[1]//div[@class="tag"]/span/a/text()'
                ).extract():
                    tag_str = tag_str + tag + '/'
            bbspostitem['tag'] = tag_str[:-1]
            bbspostitem['point'] = response.meta['point']
            bbspostitem['number_reply'] = response.meta['number_reply']
            # re.sub(r'\D', ...) strips everything but digits before int().
            bbspostitem['number_ding'] = int(
                re.sub(
                    r'\D', "",
                    response.xpath(
                        '//div[@class="detailed"]/table[1]//div[@class="fr"]/a[@class="red digg"]/text()'
                    ).extract()[0]))
            bbspostitem['number_cai'] = int(
                re.sub(
                    r'\D', "",
                    response.xpath(
                        '//div[@class="detailed"]/table[1]//div[@class="fr"]/a[@class="bury"]/text()'
                    ).extract()[0]))
            bbspostitem['time'] = re.findall(
                r'\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d',
                separator.sub(
                    ' ', ' '.join(
                        response.xpath(
                            '//div[@class="detailed"]/table[1]//span[@class="time"]/text()'
                        ).extract())))[0]
            bbspostitem['update_time'] = response.meta['update_time']
            bbspostitem['content'] = separator.sub(
                ' ', ''.join(
                    response.xpath(
                        '//div[@class="detailed"]/table[1]//div[@class="post_body"]/text()|//div[@class="detailed"]/table[1]//div[@class="post_body"]/a/text()'
                    ).extract()))
            bbspostitem['content_xml'] = response.xpath(
                '//div[@class="detailed"]/table[1]//div[@class="post_body"]'
            ).extract()[0]
            yield bbspostitem
            if response.xpath(
                    '//div[@class="detailed"]/table[1]//pre').extract():
                count = 0
                for code in response.xpath(
                        '//div[@class="detailed"]/table[1]//pre'):
                    codeitem = CodeItem()
                    codeitem['codeID'] = bbspostitem['userID'] + bbspostitem[
                        'bbspostID'] + '_' + str(count)
                    codeitem['userID'] = bbspostitem['userID']
                    codeitem['link'] = bbspostitem['link']
                    if code.xpath('@class').extract():
                        codeitem['language'] = code.xpath(
                            '@class').extract()[0]
                    else:
                        codeitem['language'] = ''
                    # NOTE(review): stored as a list here but as a string in
                    # the reply branch below — confirm which is intended.
                    codeitem['code'] = code.xpath('text()').extract()
                    yield codeitem
                    count = count + 1
        for reply in response.xpath(
                '//div[@class="detailed"]//table')[start_floor:]:
            bbsreplyitem = BBSReplyItem()
            bbsreplyitem['bbsreplyID'] = reply.xpath('@id').extract()[0].split(
                '-')[-1]
            bbsreplyitem['userID'] = reply.xpath(
                'descendant::dd[@class="username"]/a/text()').extract()[0]
            dbcheck = DB_mysql()
            if dbcheck.check(bbsreplyitem['userID']):
                authorItem = AuthorItem()
                authorItem['userID'] = bbsreplyitem['userID']
                authorItem[
                    'link'] = 'http://my.csdn.net/' + bbsreplyitem['userID']
                authorItem['blog_crawl'] = 0
                authorItem['user_crawl'] = 0
                yield authorItem
            bbsreplyitem['link'] = response.url + reply.xpath(
                'descendant::span[@class="fr"]/a/@href').extract()[0]
            # Pages >= 2 carry the topic poster's ID in request meta.
            if re.search('page', response.url):
                bbsreplyitem['replytoID'] = response.meta['replytoID']
            else:
                bbsreplyitem['replytoID'] = bbspostitem['userID']
            bbsreplyitem['score'] = int(
                re.sub(
                    r'\D', "",
                    reply.xpath(
                        'descendant::span[@class="fr"]/text()').extract()[1]))
            if reply.xpath(
                    'descendant::div[@class="fr"]/a[@class="red digg"]'):
                bbsreplyitem['number_ding'] = int(
                    re.sub(
                        r'\D', "",
                        reply.xpath(
                            'descendant::div[@class="fr"]/a[@class="red digg"]/text()'
                        ).extract()[0]))
            else:
                bbsreplyitem['number_ding'] = 0
            if reply.xpath('descendant::div[@class="fr"]/a[@class="bury"]'):
                bbsreplyitem['number_cai'] = int(
                    re.sub(
                        r'\D', "",
                        reply.xpath(
                            'descendant::div[@class="fr"]/a[@class="bury"]/text()'
                        ).extract()[0]))
            else:
                bbsreplyitem['number_cai'] = 0
            bbsreplyitem['time'] = re.findall(
                r'\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d',
                separator.sub(
                    ' ', ' '.join(
                        reply.xpath('descendant::span[@class="time"]/text()').
                        extract())))[0]
            bbsreplyitem['content'] = separator.sub(
                ' ', ''.join(
                    reply.xpath(
                        'descendant::div[@class="post_body"]/text()|descendant::div[@class="post_body"]/a/text()'
                    ).extract()))
            bbsreplyitem['content_xml'] = reply.xpath(
                'descendant::div[@class="post_body"]').extract()[0]
            yield bbsreplyitem
            if reply.xpath('descendant::pre').extract():
                count = 0
                for code in reply.xpath('descendant::pre'):
                    codeitem = CodeItem()
                    codeitem['codeID'] = bbsreplyitem['userID'] + bbsreplyitem[
                        'bbsreplyID'] + '_' + str(count)
                    codeitem['userID'] = bbsreplyitem['userID']
                    codeitem['link'] = bbsreplyitem['link']
                    if code.xpath('@class').extract():
                        codeitem['language'] = code.xpath(
                            '@class').extract()[0]
                    else:
                        codeitem['language'] = ''
                    codeitem['code'] = code.xpath('text()').extract()[0]
                    yield codeitem
                    count = count + 1
        if response.xpath('//a[@class="next"]/@href'):
            nextpage_url = 'http://bbs.csdn.net' + response.xpath(
                '//a[@class="next"]/@href').extract()[0]
            # NOTE(review): bbspostitem is only bound on first-page URLs;
            # reaching this from a "page" URL would raise NameError —
            # confirm whether pagination past page 2 actually occurs.
            yield Request(nextpage_url,
                          callback=self.parse_bbs,
                          meta={'replytoID': bbspostitem['userID']})
def parse_bbs(self, response):
    """Scrape one forum thread page.

    First page (URL without "page"): extract the opening post as a
    BBSPostItem. Every page: yield a BBSReplyItem per reply table and a
    CodeItem per <pre> block; pagination re-enters this method.
    """
    # Collapses runs of newline/tab/CR/space characters.
    separator = re.compile('((\\n|\\t|\\r|( ))+( )*)+')
    start_floor = 0
    if not re.search('page', response.url):
        # First page: the leading tables hold the opening post, so the
        # reply loop below starts at index 2.
        start_floor = 2
        bbspostitem = BBSPostItem()
        tag_str = ' '
        bbspostitem['bbspostID'] = response.url.split('/')[-1]
        bbspostitem['userID'] = response.xpath(
            '//div[@class="detailed"]/table[1]//dd[@class="username"]/a/text()'
        ).extract()[0]
        dbcheck = DB_mysql()
        # Queue unknown posters for the profile spider.
        if dbcheck.check(bbspostitem['userID']):
            authorItem = AuthorItem()
            authorItem['userID'] = bbspostitem['userID']
            authorItem[
                'link'] = 'http://my.csdn.net/' + bbspostitem['userID']
            authorItem['blog_crawl'] = 0
            authorItem['user_crawl'] = 0
            yield authorItem
        bbspostitem['link'] = response.url
        bbspostitem['title'] = response.meta['title']
        if response.xpath(
                '//div[@class="detailed"]/table[1]//div[@class="tag"]/span/a/text()'
        ):
            for tag in response.xpath(
                    '//div[@class="detailed"]/table[1]//div[@class="tag"]/span/a/text()'
            ).extract():
                tag_str = tag_str + tag + '/'
        bbspostitem['tag'] = tag_str[:-1]
        bbspostitem['point'] = response.meta['point']
        bbspostitem['number_reply'] = response.meta['number_reply']
        # re.sub(r'\D', ...) strips everything but digits before int().
        bbspostitem['number_ding'] = int(
            re.sub(
                r'\D', "",
                response.xpath(
                    '//div[@class="detailed"]/table[1]//div[@class="fr"]/a[@class="red digg"]/text()'
                ).extract()[0]))
        bbspostitem['number_cai'] = int(
            re.sub(
                r'\D', "",
                response.xpath(
                    '//div[@class="detailed"]/table[1]//div[@class="fr"]/a[@class="bury"]/text()'
                ).extract()[0]))
        bbspostitem['time'] = re.findall(
            r'\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d',
            separator.sub(
                ' ', ' '.join(
                    response.xpath(
                        '//div[@class="detailed"]/table[1]//span[@class="time"]/text()'
                    ).extract())))[0]
        bbspostitem['update_time'] = response.meta['update_time']
        bbspostitem['content'] = separator.sub(
            ' ', ''.join(
                response.xpath(
                    '//div[@class="detailed"]/table[1]//div[@class="post_body"]/text()|//div[@class="detailed"]/table[1]//div[@class="post_body"]/a/text()'
                ).extract()))
        bbspostitem['content_xml'] = response.xpath(
            '//div[@class="detailed"]/table[1]//div[@class="post_body"]'
        ).extract()[0]
        yield bbspostitem
        if response.xpath(
                '//div[@class="detailed"]/table[1]//pre').extract():
            count = 0
            for code in response.xpath(
                    '//div[@class="detailed"]/table[1]//pre'):
                codeitem = CodeItem()
                codeitem['codeID'] = bbspostitem['userID'] + bbspostitem[
                    'bbspostID'] + '_' + str(count)
                codeitem['userID'] = bbspostitem['userID']
                codeitem['link'] = bbspostitem['link']
                if code.xpath('@class').extract():
                    codeitem['language'] = code.xpath(
                        '@class').extract()[0]
                else:
                    codeitem['language'] = ''
                # NOTE(review): stored as a list here but as a string in
                # the reply branch below — confirm which is intended.
                codeitem['code'] = code.xpath('text()').extract()
                yield codeitem
                count = count + 1
    for reply in response.xpath(
            '//div[@class="detailed"]//table')[start_floor:]:
        bbsreplyitem = BBSReplyItem()
        bbsreplyitem['bbsreplyID'] = reply.xpath('@id').extract()[0].split(
            '-')[-1]
        bbsreplyitem['userID'] = reply.xpath(
            'descendant::dd[@class="username"]/a/text()').extract()[0]
        dbcheck = DB_mysql()
        if dbcheck.check(bbsreplyitem['userID']):
            authorItem = AuthorItem()
            authorItem['userID'] = bbsreplyitem['userID']
            authorItem[
                'link'] = 'http://my.csdn.net/' + bbsreplyitem['userID']
            authorItem['blog_crawl'] = 0
            authorItem['user_crawl'] = 0
            yield authorItem
        bbsreplyitem['link'] = response.url + reply.xpath(
            'descendant::span[@class="fr"]/a/@href').extract()[0]
        # Pages >= 2 carry the topic poster's ID in request meta.
        if re.search('page', response.url):
            bbsreplyitem['replytoID'] = response.meta['replytoID']
        else:
            bbsreplyitem['replytoID'] = bbspostitem['userID']
        bbsreplyitem['score'] = int(
            re.sub(
                r'\D', "",
                reply.xpath(
                    'descendant::span[@class="fr"]/text()').extract()[1]))
        if reply.xpath(
                'descendant::div[@class="fr"]/a[@class="red digg"]'):
            bbsreplyitem['number_ding'] = int(
                re.sub(
                    r'\D', "",
                    reply.xpath(
                        'descendant::div[@class="fr"]/a[@class="red digg"]/text()'
                    ).extract()[0]))
        else:
            bbsreplyitem['number_ding'] = 0
        if reply.xpath('descendant::div[@class="fr"]/a[@class="bury"]'):
            bbsreplyitem['number_cai'] = int(
                re.sub(
                    r'\D', "",
                    reply.xpath(
                        'descendant::div[@class="fr"]/a[@class="bury"]/text()'
                    ).extract()[0]))
        else:
            bbsreplyitem['number_cai'] = 0
        bbsreplyitem['time'] = re.findall(
            r'\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d',
            separator.sub(
                ' ', ' '.join(
                    reply.xpath('descendant::span[@class="time"]/text()').
                    extract())))[0]
        bbsreplyitem['content'] = separator.sub(
            ' ', ''.join(
                reply.xpath(
                    'descendant::div[@class="post_body"]/text()|descendant::div[@class="post_body"]/a/text()'
                ).extract()))
        bbsreplyitem['content_xml'] = reply.xpath(
            'descendant::div[@class="post_body"]').extract()[0]
        yield bbsreplyitem
        if reply.xpath('descendant::pre').extract():
            count = 0
            for code in reply.xpath('descendant::pre'):
                codeitem = CodeItem()
                codeitem['codeID'] = bbsreplyitem['userID'] + bbsreplyitem[
                    'bbsreplyID'] + '_' + str(count)
                codeitem['userID'] = bbsreplyitem['userID']
                codeitem['link'] = bbsreplyitem['link']
                if code.xpath('@class').extract():
                    codeitem['language'] = code.xpath(
                        '@class').extract()[0]
                else:
                    codeitem['language'] = ''
                codeitem['code'] = code.xpath('text()').extract()[0]
                yield codeitem
                count = count + 1
    if response.xpath('//a[@class="next"]/@href'):
        nextpage_url = 'http://bbs.csdn.net' + response.xpath(
            '//a[@class="next"]/@href').extract()[0]
        # NOTE(review): bbspostitem is only bound on first-page URLs;
        # reaching this from a "page" URL would raise NameError — confirm
        # whether pagination past page 2 actually occurs.
        yield Request(nextpage_url,
                      callback=self.parse_bbs,
                      meta={'replytoID': bbspostitem['userID']})
def parse_question(self, response):
    """Scrape one ask.csdn.net question page.

    First page (URL without "page"): yields a QAquestionItem, an optional
    CodeItem, and the accepted answer (if any) as a QAanswerItem. Every
    page: yields a QAanswerItem per ordinary answer, then follows the
    rel="next" pagination back into this method.
    """
    # Collapses runs of newline/tab/CR/space characters.
    separator = re.compile('((\\n|\\t|\\r|( ))+( )*)+')
    askerID = ''
    answer_accepted = True
    if not re.search('page', response.url):
        qaquestionitem = QAquestionItem()
        tag_str = ' '
        qaquestionitem['qaquestionID'] = response.url.split('/')[-1]
        qaquestionitem['userID'] = response.meta['user']
        askerID = qaquestionitem['userID']
        dbcheck = DB_mysql()
        # Queue the asker for the profile spider if unknown to the DB.
        if dbcheck.check(qaquestionitem['userID']):
            authorItem = AuthorItem()
            authorItem['userID'] = qaquestionitem['userID']
            authorItem[
                'link'] = 'http://my.csdn.net/' + qaquestionitem['userID']
            authorItem['blog_crawl'] = 0
            authorItem['user_crawl'] = 0
            yield authorItem
        qaquestionitem['link'] = response.url
        qaquestionitem['title'] = response.xpath(
            '//div[@class="questions_detail_con"]//dt/text()').extract()[0]
        if response.xpath('//div[@class="tags"]/a/text()'):
            for tag in response.xpath(
                    '//div[@class="tags"]/a/text()').extract():
                tag_str = tag_str + tag + '/'
            qaquestionitem['tag'] = tag_str[:-1]
        else:
            qaquestionitem['tag'] = ''
        qaquestionitem['point'] = response.meta['point']
        qaquestionitem['number_answer'] = response.meta['number_answer']
        qaquestionitem['number_read'] = response.meta['number_read']
        qaquestionitem['number_collect'] = response.meta['number_collect']
        qaquestionitem['number_alsoask'] = response.meta['number_alsoask']
        # Meta label: "已有满意答案" = "has an accepted answer",
        # "暂无满意答案" = "no accepted answer yet".
        if response.meta['answer_accepted'] == u'已有满意答案':
            qaquestionitem['answer_accepted'] = 'yes'
        elif response.meta['answer_accepted'] == u'暂无满意答案':
            qaquestionitem['answer_accepted'] = 'no'
            answer_accepted = False
        qaquestionitem['time'] = re.findall(
            r'\d\d\d\d.\d\d.\d\d \d\d:\d\d',
            separator.sub(
                ' ', ' '.join(
                    response.xpath('//div[@class="q_operate"]//span/text()'
                                   ).extract())))[0]
        if response.xpath(
                '//div[@class="questions_detail_con"]/dl/dd/p/text()'):
            qaquestionitem['content'] = response.xpath(
                '//div[@class="questions_detail_con"]/dl/dd/p/text()'
            ).extract()[0]
        else:
            qaquestionitem['content'] = ''
        qaquestionitem['content_xml'] = response.xpath(
            '//div[@class="questions_detail_con"]/dl/dd').extract()[0]
        yield qaquestionitem
        if response.xpath('//div[@class="questions_detail_con"]//pre'):
            codeitem = CodeItem()
            codeitem['codeID'] = qaquestionitem['userID'] + qaquestionitem[
                'qaquestionID']
            codeitem['userID'] = qaquestionitem['userID']
            codeitem['link'] = qaquestionitem['link']
            codeitem['language'] = ''
            # Slices the raw markup between <code> and </code>.
            codeitem['code'] = re.split(
                r'</code>',
                re.split(
                    r'<code>',
                    response.xpath(
                        '//div[@class="questions_detail_con"]//pre/code').
                    extract()[0])[1])[0]
            yield codeitem
        if answer_accepted:
            # NOTE(review): these class attributes embed literal newlines
            # and spaces; the exact whitespace may have been altered in
            # transit — verify against the live page markup.
            qaansweritem = QAanswerItem()
            qaansweritem['qaanswerID'] = response.xpath(
                '//div[@class="\n answer_accept\n "]/@id'
            ).extract()[0].split('_')[-1]
            qaansweritem['userID'] = response.xpath(
                '//div[@class="\n answer_accept\n "]//a[@class="user_name"]/text()'
            ).extract()[0]
            dbcheck = DB_mysql()
            if dbcheck.check(qaansweritem['userID']):
                authorItem = AuthorItem()
                authorItem['userID'] = qaansweritem['userID']
                authorItem['link'] = 'http://my.csdn.net/' + qaansweritem[
                    'userID']
                authorItem['blog_crawl'] = 0
                authorItem['user_crawl'] = 0
                yield authorItem
            qaansweritem['link'] = response.url
            qaansweritem['answertoID'] = askerID
            qaansweritem['number_ding'] = int(
                response.xpath(
                    '//div[@class="\n answer_accept\n "]//a[@class="praise"]/label/text()'
                ).extract()[0])
            qaansweritem['number_cai'] = int(
                response.xpath(
                    '//div[@class="\n answer_accept\n "]//a[@class="stamp"]/label/text()'
                ).extract()[0])
            qaansweritem['number_comment'] = int(
                re.sub(
                    r'\D', "",
                    response.xpath(
                        '//div[@class="\n answer_accept\n "]//a[@class="collection"]/text()'
                    ).extract()[0]))
            qaansweritem['best_answer'] = 'yes'
            qaansweritem['time'] = \
                response.xpath('//div[@class="\n answer_accept\n "]//span[@class="adopt_time"]/text()').extract()[0]
            if response.xpath(
                    '//div[@class="\n answer_accept\n "]/div[1]/p'
            ):
                qaansweritem['content'] = separator.sub(
                    ' ', ''.join(
                        response.xpath(
                            '//div[@class="\n answer_accept\n "]/div[1]/p/text()'
                        ).extract()))
            else:
                qaansweritem['content'] = ''
            qaansweritem['content_xml'] = response.xpath(
                '//div[@class="\n answer_accept\n "]/div[1]'
            ).extract()[0]
            yield qaansweritem
            if response.xpath(
                    '//div[@class="\n answer_accept\n "]//pre'):
                codeitem = CodeItem()
                codeitem['codeID'] = qaansweritem['userID'] + qaansweritem[
                    'qaanswerID']
                codeitem['userID'] = qaansweritem['userID']
                codeitem['link'] = qaansweritem['link']
                codeitem['language'] = ''
                codeitem['code'] = ''.join(
                    response.xpath(
                        '//div[@class="\n answer_accept\n "]//pre/code/text()'
                    ).extract()[0])
                yield codeitem
    # Ordinary (non-accepted) answers appear on every page.
    for answer in response.xpath(
            '//div[@class="\n answer_detail_con\n "]'):
        qaansweritem = QAanswerItem()
        qaansweritem['qaanswerID'] = answer.xpath(
            '@id').extract()[0].split('_')[-1]
        qaansweritem['userID'] = answer.xpath(
            'descendant::a[@class="user_name"]/text()').extract()[0]
        # NOTE(review): reuses dbcheck bound in the first-page branch above;
        # on "page" URLs the name is unbound here (NameError) — confirm.
        if dbcheck.check(qaansweritem['userID']):
            authorItem = AuthorItem()
            authorItem['userID'] = qaansweritem['userID']
            authorItem[
                'link'] = 'http://my.csdn.net/' + qaansweritem['userID']
            authorItem['blog_crawl'] = 0
            authorItem['user_crawl'] = 0
            yield authorItem
        qaansweritem['link'] = response.url
        # NOTE(review): askerID is '' on "page" URLs — confirm intended.
        qaansweritem['answertoID'] = askerID
        qaansweritem['number_ding'] = int(
            answer.xpath('descendant::a[@class="praise"]/label/text()').
            extract()[0])
        qaansweritem['number_cai'] = int(
            answer.xpath(
                'descendant::a[@class="stamp"]/label/text()').extract()[0])
        qaansweritem['number_comment'] = int(
            re.sub(
                r'\D', "",
                answer.xpath('descendant::a[@class="collection"]/text()').
                extract()[0]))
        qaansweritem['best_answer'] = 'no'
        qaansweritem['time'] = answer.xpath(
            'descendant::span[@class="adopt_time"]/text()').extract()[0]
        if answer.xpath('div[1]/p'):
            qaansweritem['content'] = separator.sub(
                ' ', ''.join(answer.xpath('div[1]/p/text()').extract()))
        else:
            qaansweritem['content'] = ''
        qaansweritem['content_xml'] = answer.xpath('div[1]').extract()[0]
        yield qaansweritem
        if answer.xpath('descendant::pre').extract():
            codeitem = CodeItem()
            codeitem['codeID'] = qaansweritem['userID'] + qaansweritem[
                'qaanswerID']
            codeitem['userID'] = qaansweritem['userID']
            codeitem['link'] = qaansweritem['link']
            codeitem['language'] = ''
            codeitem['code'] = ''.join(
                answer.xpath('descendant::pre/code/text()').extract())
            yield codeitem
    if response.xpath('//a[@rel="next"]'):
        nextpage_url = 'http://ask.csdn.net' + response.xpath(
            '//a[@rel="next"]/@href').extract()[0]
        print nextpage_url
        yield Request(nextpage_url, callback=self.parse_question)