def open_spider(self, spider):
    """Cache the newest publishTimeStamp from the DB on the pipeline class before the crawl starts.

    Both branches previously duplicated the connect/query/fetch/close sequence;
    the common logic now lives in _latest_timestamp.
    """
    if spider.name in ('keyweibocontent', 'weibocontent_userinfo', 'weibocontent_danger', 'weibocontent_userinfo_intime'):
        # WARNING: SQL built by interpolation — spider.uid should be a bind variable.
        sql = "select * from t_user_weibocontent where userID = '%s' order by publishTimeStamp desc" % str(spider.uid)
        # publishTimeStamp is column 6 of t_user_weibocontent.
        WeibospiderPipeline.weibocontent_timestamp = self._latest_timestamp(sql, 6)
    if spider.name == 'keyuser':
        sql = "select * from t_user_keyword where keyword = '%s' order by publishTimeStamp desc" % str(spider.keyword)
        # publishTimeStamp is column 4 of t_user_keyword.
        WeibospiderPipeline.keyword_timestamp = self._latest_timestamp(sql, 4)

def _latest_timestamp(self, sql, ts_index):
    """Run *sql*, return column *ts_index* of the first row, or None when no rows match."""
    db = OracleStore()
    conn = db.get_connection()
    cursor = db.select_operation(conn, sql)
    row = cursor.fetchone()
    db.close_connection(conn, cursor)
    return row[ts_index] if row else None
def get_userinfo(self, response):
    """For every user whose avatar is not fetched yet (imagestate = 0), request the
    user's main page; get_userurl later extracts the profile-image URL.

    Fix: the original leaked the connection and both cursors — they are now
    released once all requests have been yielded.
    """
    db = OracleStore()
    conn = db.get_connection()
    sql1 = "select * from t_user_info where imagestate = 0"
    cursor1 = db.select_operation(conn, sql1)
    sql2 = "select count(*) from t_user_info where imagestate = 0"
    cursor2 = db.select_operation(conn, sql2)
    count = cursor2.fetchone()
    for i in range(count[0]):
        for result in cursor1.fetchmany(1):
            if result[0]:
                mainpageurl = 'http://weibo.com/u/' + str(result[0]) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
                GetWeibopage.data['uid'] = result[0]
                getweibopage = GetWeibopage()
                GetWeibopage.data['page'] = 1
                firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
                yield Request(url=firstloadurl,
                              meta={'cookiejar': response.meta['cookiejar'], 'uid': result[0]},
                              callback=self.get_userurl)
    # Release DB resources (same varargs close used by the other spiders).
    db.close_connection(conn, cursor1, cursor2)
def closed(self, reason): db = OracleStore() conn = db.get_connection() sql = '''update t_spider_state set contentstate = 1''' db.insert_operation(conn, sql) #logger.info('------keyweibocontent_spider closed------') print '------keyweibocontent_spider closed------'
def start_getweiboinfo(self, response):
    """Schedule the three lazy-load requests of every weibo page for each user in t_user_info."""
    store = OracleStore()
    connection = store.get_connection()
    row_cursor = store.select_operation(connection, '''select * from t_user_info''')
    count_cursor = store.select_operation(connection, '''select count(*) from t_user_info''')
    total = count_cursor.fetchone()
    if total[0]:
        for _ in range(total[0]):
            for row in row_cursor.fetchmany(1):
                mainpageurl = 'http://weibo.com/u/' + str(row[0]) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
                GetWeibopage.data['uid'] = row[0]
                pagehelper = GetWeibopage()
                for page in range(WeiboSpider.page_num):
                    GetWeibopage.data['page'] = page + 1
                    # Each page is served in three AJAX chunks; request them in order.
                    for build_url in (pagehelper.get_firstloadurl,
                                      pagehelper.get_secondloadurl,
                                      pagehelper.get_thirdloadurl):
                        yield Request(url=mainpageurl + build_url(),
                                      meta={'cookiejar': response.meta['cookiejar'], 'uid': row[0]},
                                      callback=self.parse_load)
    else:
        yield None
    store.close_connection(connection, row_cursor, count_cursor)
def parse_follower(self, response): item = WeibospiderItem() analyzer = Analyzer() getweibopage = GetWeibopage() total_follower_pq = analyzer.get_followerhtml(response.body) item['uid'] = response.meta['uid'] item['follower_uid_list'] = analyzer.get_follower(total_follower_pq) item['follow_uid_list'] = [] yield item #获取二级(粉丝)用户的关注和粉丝 if self.uid == response.meta['uid'] and len(item['follower_uid_list']): db = OracleStore() conn = db.get_connection() for follower_uid in item['follower_uid_list']: #获取粉丝用户的关注用户 sql1 = """select count(*) from t_user_follow where userID=%s""" % str( follower_uid) cursor1 = db.select_operation(conn, sql1) count1 = cursor1.fetchone() follower_scraped = count1[0] cursor1.close() if not follower_scraped: #scraped为0,即该账户没有获取过 follow_url = 'http://weibo.com/%s/follow?page=1' % str( follower_uid) yield Request(url=follow_url, meta={ 'cookiejar': response.meta['cookiejar'], 'uid': follower_uid }, dont_filter=True, callback=self.parse_based_follownum) else: print 'follow_uid existed!', follower_uid yield None #获取粉丝用户的粉丝用户 sql2 = """select count(*) from t_user_follower where userID=%s""" % str( follower_uid) cursor2 = db.select_operation(conn, sql2) count2 = cursor2.fetchone() follower_scraped = count2[0] cursor2.close() if not follower_scraped: #scraped为0,即该账户没有获取过 follower_url = 'http://weibo.com/%s/fans?page=1' % str( follower_uid) yield Request(url=follower_url, meta={ 'cookiejar': response.meta['cookiejar'], 'uid': follower_uid }, dont_filter=True, callback=self.parse_based_followernum) else: print 'follower_uid existed!', follower_uid yield None conn.close()
def parse_load(self, response):
    """Parse one lazy-load chunk of a user's weibo page.

    On the first chunk of the first page, also fetch the user's profile page
    (unless already stored). Then extract weibo content, timestamps and @-user
    nicknames, and follow up each @-ed user's uid search page.

    Fixes: the paging regex is now a raw string (avoids invalid-escape issues
    with ``\\d``), and a non-matching URL no longer raises AttributeError.
    """
    user_info = userinfo.WeiboSpider()  # NOTE(review): appears unused — confirm before removing
    request_url = response.request.url
    # pre_page=0 & page=1 identifies the very first load of the first page.
    first_load_pat = re.compile(r'&pre_page=(\d).*&page=(\d)')
    match = first_load_pat.search(request_url)
    if match is None:
        # Original code crashed here on URLs without paging params; skip them instead.
        return
    if int(match.group(1)) == 0 and int(match.group(2)) == 1:
        # Fetch the user's profile once per uid.
        db = OracleStore()
        conn = db.get_connection()
        sql = "select count(*) from t_user_info where userID='%s'" % self.uid
        cursor = db.select_operation(conn, sql)
        count = cursor.fetchone()
        if not count[0]:
            analyzer = Analyzer()
            total_pq = analyzer.get_html(response.body, 'script:contains("PCD_person_info")')
            user_property = analyzer.get_userproperty(total_pq)
            if user_property == 'icon_verify_co_v':  # verified organization account
                public_userinfo_url = analyzer.get_public_userinfohref(total_pq)
                # Organization accounts are skipped for now — the DB schema needs a foreign key first.
            else:
                userinfo_url = analyzer.get_userinfohref(total_pq)
                yield Request(url=userinfo_url,
                              meta={'cookiejar': response.meta['cookiejar'],
                                    'uid': response.meta['uid'],
                                    'user_property': user_property},
                              callback=self.parse_userinfo)
        db.close_connection(conn, cursor)
    # Weibo content and @-user extraction.
    item = WeibospiderItem()
    analyzer = Analyzer()
    friendcircle = FriendCircle()
    total_pq = analyzer.get_mainhtml(response.body)
    item['uid'] = response.meta['uid']
    item['content'] = analyzer.get_content(total_pq)
    item['time'], item['timestamp'] = analyzer.get_time(total_pq)
    atuser_info, item['repost_user'] = analyzer.get_atuser_repostuser(total_pq)
    atuser_list = friendcircle.atuser_parser(atuser_info)
    item['atuser_nickname_list'] = atuser_list
    yield item
    for atuser_inlist in atuser_list:
        if atuser_inlist != []:
            for atuser in atuser_inlist:
                # Double-quoted for weibo's search URL encoding scheme.
                uid_url = "http://s.weibo.com/user/" + quote(quote(str(atuser))) + "&Refer=SUer_box"
                yield Request(url=uid_url,
                              meta={'cookiejar': response.meta['cookiejar'],
                                    'uid': self.uid,
                                    'atuser_nickname': atuser},
                              callback=self.parse_atuser_uid)
        else:
            continue
def closed(self,reason): db = OracleStore();conn = db.get_connection() cur = conn.cursor() for key in self.atuser_dict.keys(): #插入@用户uid信息 sql= """update t_user_weibocontent_atuser set atuserID = %s where userID = %s and atuser = '******'""" % (self.atuser_dict.get(key),self.uid,key) cur.execute(sql) conn.commit() sql = '''update t_spider_state set contentstate = 1''' db.insert_operation(conn,sql) print '------weibocontent_info_spider closed------'
def closed(self, reason): db = OracleStore() conn = db.get_connection() cur = conn.cursor() for key in self.atuser_dict.keys(): #插入@用户uid信息 sql = """update t_user_weibocontent_atuser set atuserID = %s where userID = %s and atuser = '******'""" % ( self.atuser_dict.get(key), self.uid, key) cur.execute(sql) conn.commit() sql = '''update t_spider_state set contentstate = 1''' db.insert_operation(conn, sql) print '------weibocontent_info_spider closed------'
def open_spider(self, spider):
    """Before crawling, read the newest publishTimeStamp from the DB so the
    spider can stop at weibos it has already stored."""
    content_spiders = ('keyweibocontent', 'weibocontent_userinfo',
                       'weibocontent_danger', 'weibocontent_userinfo_intime')
    if spider.name in content_spiders:
        store = OracleStore()
        connection = store.get_connection()
        query = "select * from t_user_weibocontent where userID = '%s' order by publishTimeStamp desc" % str(spider.uid)
        cur = store.select_operation(connection, query)
        newest = cur.fetchone()
        # publishTimeStamp is column 6; None means no stored weibos for this user yet.
        WeibospiderPipeline.weibocontent_timestamp = newest[6] if newest else None
        store.close_connection(connection, cur)
    if spider.name == 'keyuser':
        store = OracleStore()
        connection = store.get_connection()
        query = "select * from t_user_keyword where keyword = '%s' order by publishTimeStamp desc" % str(spider.keyword)
        cur = store.select_operation(connection, query)
        newest = cur.fetchone()
        # publishTimeStamp is column 4; None means no stored results for this keyword yet.
        WeibospiderPipeline.keyword_timestamp = newest[4] if newest else None
        store.close_connection(connection, cur)
def start_getweiboinfo(self, response):
    """Queue the three AJAX-load requests of every weibo page for each user in t_user_info."""
    db = OracleStore()
    conn = db.get_connection()
    users_cursor = db.select_operation(conn, '''select * from t_user_info''')
    count_cursor = db.select_operation(conn, '''select count(*) from t_user_info''')
    user_count = count_cursor.fetchone()
    if user_count[0]:
        for _ in range(user_count[0]):
            for record in users_cursor.fetchmany(1):
                base_url = 'http://weibo.com/u/' + str(record[0]) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
                GetWeibopage.data['uid'] = record[0]
                helper = GetWeibopage()
                for page_index in range(WeiboSpider.page_num):
                    GetWeibopage.data['page'] = page_index + 1
                    request_meta = {'cookiejar': response.meta['cookiejar'], 'uid': record[0]}
                    # A weibo page loads in three chunks; fetch first, second, third in order.
                    yield Request(url=base_url + helper.get_firstloadurl(),
                                  meta=request_meta, callback=self.parse_load)
                    yield Request(url=base_url + helper.get_secondloadurl(),
                                  meta=request_meta, callback=self.parse_load)
                    yield Request(url=base_url + helper.get_thirdloadurl(),
                                  meta=request_meta, callback=self.parse_load)
    else:
        yield None
    db.close_connection(conn, users_cursor, count_cursor)
def parse_follower(self,response): item = WeibospiderItem() analyzer = Analyzer() getweibopage = GetWeibopage() total_follower_pq = analyzer.get_followerhtml(response.body) item['uid'] = response.meta['uid'] item['follower_uid_list'] = analyzer.get_follower(total_follower_pq) item['follow_uid_list'] = [] yield item if self.uid == response.meta['uid'] and len(item['follower_uid_list']): db = OracleStore() conn = db.get_connection() for follower_uid in item['follower_uid_list']: #获取粉丝用户的关注用户 sql1 = """select count(*) from t_user_follow where userID=%s""" % str(follower_uid) cursor1 = db.select_operation(conn,sql1) count1 = cursor1.fetchone() follower_scraped = count1[0] cursor1.close() if not follower_scraped: #scraped为0,即该账户没有获取过 for page in range(WeiboSpider.follow_page_num,0,-1): GetWeibopage.relation_data['page'] = page follow_url = getinfo.get_follow_mainurl(follower_uid) + getweibopage.get_relation_paramurl() yield Request(url=follow_url,meta={'cookiejar':response.meta['cookiejar'],'uid':follower_uid},callback=self.parse_follow) else: print 'follow_uid existed!',follower_uid yield None #获取粉丝用户的粉丝用户 sql2 = """select count(*) from t_user_follower where userID=%s""" % str(follower_uid) cursor2 = db.select_operation(conn,sql2) count2 = cursor2.fetchone() follower_scraped = count2[0] cursor2.close() if not follower_scraped: #scraped为0,即该账户没有获取过 for page in range(WeiboSpider.follower_page_num,0,-1): GetWeibopage.relation_data['page'] = page follower_url = getinfo.get_follower_mainurl(follower_uid) + getweibopage.get_relation_paramurl() yield Request(url=follower_url,meta={'cookiejar':response.meta['cookiejar'],'uid':follower_uid},callback=self.parse_follower) else: print 'follower_uid existed!',follower_uid yield None conn.close()
def get_userinfo(self, response):
    """Request the main page of every user whose avatar has not been fetched
    yet (imagestate = 0); get_userurl extracts the image URL downstream.

    Fix: the original never released the connection or cursors — they are now
    closed after all requests are queued.
    """
    db = OracleStore()
    conn = db.get_connection()
    sql1 = "select * from t_user_info where imagestate = 0"
    cursor1 = db.select_operation(conn, sql1)
    sql2 = "select count(*) from t_user_info where imagestate = 0"
    cursor2 = db.select_operation(conn, sql2)
    count = cursor2.fetchone()
    for i in range(count[0]):
        for result in cursor1.fetchmany(1):
            if result[0]:
                mainpageurl = 'http://weibo.com/u/' + str(result[0]) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
                GetWeibopage.data['uid'] = result[0]
                getweibopage = GetWeibopage()
                GetWeibopage.data['page'] = 1
                firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
                yield Request(url=firstloadurl,
                              meta={'cookiejar': response.meta['cookiejar'], 'uid': result[0]},
                              callback=self.get_userurl)
    # Release DB resources (varargs close, same as the other spiders use).
    db.close_connection(conn, cursor1, cursor2)
def parse_follower(self,response): item = WeibospiderItem() analyzer = Analyzer() getweibopage = GetWeibopage() total_follower_pq = analyzer.get_followerhtml(response.body) item['uid'] = response.meta['uid'] item['follower_uid_list'] = analyzer.get_follower(total_follower_pq) item['follow_uid_list'] = [] yield item #获取二级(粉丝)用户的关注和粉丝 if self.uid == response.meta['uid'] and len(item['follower_uid_list']): db = OracleStore() conn = db.get_connection() for follower_uid in item['follower_uid_list']: #获取粉丝用户的关注用户 sql1 = """select count(*) from t_user_follow where userID=%s""" % str(follower_uid) cursor1 = db.select_operation(conn,sql1) count1 = cursor1.fetchone() follower_scraped = count1[0] cursor1.close() if not follower_scraped: #scraped为0,即该账户没有获取过 follow_url = 'http://weibo.com/%s/follow?page=1' % str(follower_uid) yield Request(url=follow_url,meta={'cookiejar':response.meta['cookiejar'],'uid':follower_uid},dont_filter=True,callback=self.parse_based_follownum) else: print 'follow_uid existed!',follower_uid yield None #获取粉丝用户的粉丝用户 sql2 = """select count(*) from t_user_follower where userID=%s""" % str(follower_uid) cursor2 = db.select_operation(conn,sql2) count2 = cursor2.fetchone() follower_scraped = count2[0] cursor2.close() if not follower_scraped: #scraped为0,即该账户没有获取过 follower_url = 'http://weibo.com/%s/fans?page=1' % str(follower_uid) yield Request(url=follower_url,meta={'cookiejar':response.meta['cookiejar'],'uid':follower_uid},dont_filter=True,callback=self.parse_based_followernum) else: print 'follower_uid existed!',follower_uid yield None conn.close()
def parse_load(self, response):
    """Parse one AJAX-loaded chunk of a user's weibo page.

    On the first chunk of page 1 (pre_page=0, page=1), also request the user's
    profile page when the uid is not yet in t_user_info. Then extract content,
    timestamps, repost and @-user data, and follow up each @-ed nickname's
    uid search page.

    Fixes: the regex is now a raw string (``\\d`` in a plain string is an
    invalid escape in newer Pythons), and a URL that does not match the paging
    pattern no longer raises AttributeError on ``match.group``.
    """
    user_info = userinfo.WeiboSpider()  # NOTE(review): appears unused — confirm before removing
    request_url = response.request.url
    # pre_page=0 & page=1 marks the first load of the first page.
    paging = re.compile(r'&pre_page=(\d).*&page=(\d)')
    match = paging.search(request_url)
    if match is None:
        # Previously crashed on URLs without paging params; skip them instead.
        return
    if int(match.group(1)) == 0 and int(match.group(2)) == 1:
        # Fetch the user's basic profile once per uid.
        db = OracleStore()
        conn = db.get_connection()
        sql = "select count(*) from t_user_info where userID='%s'" % self.uid
        cursor = db.select_operation(conn, sql)
        count = cursor.fetchone()
        if not count[0]:
            analyzer = Analyzer()
            total_pq = analyzer.get_html(response.body, 'script:contains("PCD_person_info")')
            user_property = analyzer.get_userproperty(total_pq)
            if user_property == 'icon_verify_co_v':  # verified organization account
                public_userinfo_url = analyzer.get_public_userinfohref(total_pq)
                # Organization accounts are skipped for now — the schema needs a foreign key first.
            else:
                userinfo_url = analyzer.get_userinfohref(total_pq)
                yield Request(url=userinfo_url,
                              meta={'cookiejar': response.meta['cookiejar'],
                                    'uid': response.meta['uid'],
                                    'user_property': user_property},
                              callback=self.parse_userinfo)
        db.close_connection(conn, cursor)
    # Extract weibo content and @-user information.
    item = WeibospiderItem()
    analyzer = Analyzer()
    friendcircle = FriendCircle()
    total_pq = analyzer.get_mainhtml(response.body)
    item['uid'] = response.meta['uid']
    item['content'] = analyzer.get_content(total_pq)
    item['time'], item['timestamp'] = analyzer.get_time(total_pq)
    atuser_info, item['repost_user'] = analyzer.get_atuser_repostuser(total_pq)
    atuser_list = friendcircle.atuser_parser(atuser_info)
    item['atuser_nickname_list'] = atuser_list
    yield item
    for atuser_inlist in atuser_list:
        if atuser_inlist != []:
            for atuser in atuser_inlist:
                # Double URL-quoting matches weibo's search URL scheme.
                uid_url = "http://s.weibo.com/user/" + quote(quote(str(atuser))) + "&Refer=SUer_box"
                yield Request(url=uid_url,
                              meta={'cookiejar': response.meta['cookiejar'],
                                    'uid': self.uid,
                                    'atuser_nickname': atuser},
                              callback=self.parse_atuser_uid)
        else:
            continue
def closed(self,reason): db = OracleStore();conn = db.get_connection() sql = 'update t_spider_state set userinfostate = 1' db.insert_operation(conn,sql) print '------userinfo_list_spider closed------'
def closed(self, reason): db = OracleStore() conn = db.get_connection() sql = 'update t_spider_state set searchstate=1' db.insert_operation(conn, sql) print '------keyuser_spider closed------'
def closed(self, reason): db = OracleStore() conn = db.get_connection() db.close_connection(conn) print '--------closed-------'
def closed(self, reason): db = OracleStore() conn = db.get_connection() db.close_connection(conn) print "--------closed-------"
def closed(self,reason): db = OracleStore();conn = db.get_connection() sql = '''update t_spider_state set contentstate = 1''' db.insert_operation(conn,sql) #logger.info('------keyweibocontent_spider closed------') print '------keyweibocontent_spider closed------'