def insert_proxies(proxies_list):
    """Bulk-upsert proxy (host, port) rows into pt_db.spide_proxies_ip.

    :param proxies_list: sequence of (proxy_host, proxy_port) rows;
        the function is a no-op when it is empty/None.
    """
    if not proxies_list:
        return
    con = MyPyMysql(**mysql_config)
    try:
        sql = """ replace into pt_db.spide_proxies_ip (proxy_host,proxy_port) values %s """
        con.insert_query(sql, proxies_list)
        mylog.info('insert :' + sql + str(proxies_list))
    finally:
        # close the connection even when the insert fails
        # (the original leaked it on any exception)
        con.close_connect()
@gen.coroutine
def getfollowlist(current_uk):
    """Incrementally fetch the follow list of `current_uk`.

    Upserts each discovered person into pt_db.spide_all_person, records the
    current follow count in pt_db.spide_all_person_log, and returns (via
    gen.Return) the uks of everyone in the person table with followers.
    Returns [] on failure; errors are logged, not raised.
    """

    @gen.coroutine
    def followquery(current_uk, start, limit):
        # Fetch one page of the follow API; retry (recursively) while the
        # API reports errno != 0.
        url = follow_url.format(start, limit, current_uk)
        mylog.info('follow: ' + url)
        response = yield get_spide(url)
        list_data = json.loads(response.body)
        if list_data['errno'] != 0:
            # Propagate the *retried* page's result — the original returned
            # the failed raw body here, breaking the caller's list_data[0].
            result = yield followquery(current_uk, start, limit)
            raise gen.Return(result)
        raise gen.Return([list_data['total_count'], list_data['follow_list']])

    start = 0
    limit = 24
    url = follow_url.format(start, limit, current_uk)
    con = None
    try:
        list_data = yield followquery(current_uk, start, limit)
        total_count = list_data[0]
        follow_list = []
        person_data = []
        con = MyPyMysql(**mysql_config)
        sql = """select follow_nums from pt_db.spide_all_person_log where uk = %s """
        query_data = con.query(sql, current_uk)
        query_follows = query_data[0]['follow_nums'] if query_data else 0
        # Incremental update: only fetch the pages that are new since last run.
        for_follows = total_count - query_follows
        if for_follows > 0:
            # ceil(for_follows / limit) pages
            for j in range((for_follows - 1) // limit + 1):
                start = j * limit
                url = follow_url.format(start, limit, current_uk)
                range_data = yield followquery(current_uk, start, limit)
                follow_list.extend(range_data[1])
        for i in follow_list:
            person_data.append([
                i['follow_uk'], i['follow_uname'].encode("utf-8"),
                i['fans_count'], i['follow_count'], i['pubshare_count']
            ])
        # Upsert everyone found into the person table.
        if person_data:
            sql = """ replace into pt_db.spide_all_person (uk,uk_name,fan_nums,follow_nums,share_nums) values %s """
            con.insert_query(sql, person_data)
        # Record this person's current follow count.
        sql = """ insert into pt_db.spide_all_person_log (uk,follow_nums) values (%s,%s) ON DUPLICATE KEY UPDATE follow_nums=%s , m_time = %s;"""
        con.query(sql, (current_uk, total_count, total_count, now_time))
        # Collect every known uk that has a non-empty follow list.
        sql = """ SELECT p.uk FROM pt_db.spide_all_person p where p.follow_nums !=0 """
        query_uk = con.query(sql)
        uks = [i['uk'] for i in query_uk]
    except Exception as e:
        mylog.error('followlist 失败: ' + str(url))
        mylog.error(e)
        raise gen.Return([])
    finally:
        # Release the connection on every path (the original leaked it on error).
        if con is not None:
            con.close_connect()
    raise gen.Return(uks)
@gen.coroutine
def getsharelist(current_uk, start, limit):
    """Fetch one page (`start`, `limit`) of `current_uk`'s share feed and
    persist new share records into MySQL.

    Retries the page recursively while the API reports errno != 0.
    Always returns [] via gen.Return; errors are logged, not raised.
    """
    # yield gen.sleep(consume_sleeptime)
    auth_type = 1
    url = share_url.format(auth_type, start, limit, current_uk)
    try:
        response = yield get_spide(url)
        list_data = json.loads(response.body)
        if list_data['errno'] != 0:
            # Transient API failure: retry the same page. The original raised
            # gen.Return(response.body) here inside this try — but gen.Return
            # subclasses Exception, so it was swallowed by the handler below
            # and logged as a spurious failure.
            yield getsharelist(current_uk, start, limit)
        else:
            records = list_data['records'] if 'records' in list_data else []
            insert_data = []
            for i in records:
                if i['feed_type'] == 'share':
                    for j in i['filelist']:
                        insert_data.append([
                            j['fs_id'],
                            j['category'],
                            # Short links use the /s/ base; legacy links
                            # embed uk and shareid in the URL.
                            'http://pan.baidu.com/s/'
                            if 'shorturl' in i.keys() else
                            'http://pan.baidu.com/share/link?uk={0}&shareid='.format(current_uk),
                            i['shorturl'].encode("utf-8")
                            if 'shorturl' in i.keys() else
                            i['shareid'].encode("utf-8"),
                            i['public'].encode("utf-8"),
                            j['server_filename'].encode("utf-8"),
                            i['uk'],
                            i['username'].encode("utf-8"),
                            j['size'],
                            # feed_time is in milliseconds — convert to seconds
                            timestamptotime(i['feed_time'] // 1000)
                        ])
            len_insert_data = len(records)
            con = MyPyMysql(**mysql_config)
            try:
                if insert_data:
                    sql = """ insert ignore into pt_db.spide_shares (fs_id,category,base_url,share_url,`public`,server_filename,uk,username,`size`,share_time) values %s """
                    con.insert_query(sql, insert_data)
                # Record how far this person's shares have been consumed.
                sql = """ insert into pt_db.spide_all_person_log (uk,share_nums) values (%s,%s) ON DUPLICATE KEY UPDATE share_nums=share_nums+%s , m_time = %s;"""
                con.query(sql, (current_uk, len_insert_data, len_insert_data, now_time))
            finally:
                # Close the connection even if a query fails
                # (the original leaked it on error).
                con.close_connect()
            mylog.info('sharelist 成功: ' + url)
    except Exception as e:
        mylog.error('sharelist 失败: ' + str(url))
        mylog.error(e)
    # gen.Return subclasses Exception — raise it OUTSIDE the try block.
    raise gen.Return([])
class SpProducer(object):
    """Consumer: pops uks from the redis 'share_list' queue and spiders each
    uk's share feed into MySQL through rotating proxies.

    NOTE(review): this chunk also contains a second class named SpProducer
    below; if both live in the same module the later definition shadows this
    one — confirm they belong in separate files.
    """

    def __init__(self):
        self.now_time = time.strftime('%Y-%m-%d %H:%M:%S')
        self.r = RedisPool(**redis_conf).redis()
        # Seed the queue with the starting uk.
        self.r.rpush("share_list", str(base_uk))
        self.con = MyPyMysql(**mysql_config)

    def add_most_person(self):
        """At most once per 24h, push the top-100 sharers back onto share_list."""
        ntime = time.time()
        last_time = self.r.get('if_add_time')
        if not last_time:
            # First run: just record the timestamp.
            self.r.set('if_add_time', str(ntime))
        elif ntime - float(last_time) > 86400:
            self.r.set('if_add_time', str(ntime))
            # Re-enqueue the 100 people with the most shares.
            sql = """SELECT uk FROM pt_db.spide_all_person p where p.share_nums !=0 order by share_nums desc limit 100; """
            result = self.con.query(sql)
            for i in result:
                self.r.rpush("share_list", str(i['uk']))
            mylog.info('向share_list添加前分享前100数据')

    @gen.coroutine
    def getsharelist(self, current_uk, start, limit):
        """Fetch one page of `current_uk`'s share feed and persist it.

        Retries the page recursively while the API reports errno != 0;
        errors are logged, not raised.
        """
        auth_type = 1
        url = share_url.format(auth_type, start, limit, current_uk)
        try:
            response = yield self.get_spide(url)
            list_data = json.loads(response.body)
            if list_data['errno'] != 0:
                # Transient failure: retry the same page.
                yield self.getsharelist(current_uk, start, limit)
            else:
                records = list_data['records'] if 'records' in list_data else []
                insert_data = []
                for i in records:
                    if i['feed_type'] == 'share':
                        for j in i['filelist']:
                            insert_data.append([
                                j['fs_id'],
                                j['category'],
                                # Short links use the /s/ base; legacy links
                                # embed uk and shareid in the URL.
                                'http://pan.baidu.com/s/'
                                if 'shorturl' in i.keys() else
                                'http://pan.baidu.com/share/link?uk={0}&shareid='.format(current_uk),
                                i['shorturl'].encode("utf-8")
                                if 'shorturl' in i.keys() else
                                i['shareid'].encode("utf-8"),
                                i['public'].encode("utf-8"),
                                j['server_filename'].encode("utf-8"),
                                i['uk'],
                                i['username'].encode("utf-8"),
                                j['size'],
                                # feed_time is in milliseconds
                                timestamptotime(i['feed_time'] // 1000)
                            ])
                len_insert_data = len(records)
                if insert_data:
                    sql = """ insert ignore into pt_db.spide_shares (fs_id,category,base_url,share_url,`public`,server_filename,uk,username,`size`,share_time) values %s """
                    self.con.insert_query(sql, insert_data)
                # Record how far this person's shares have been consumed.
                sql = """ insert into pt_db.spide_all_person_log (uk,share_nums) values (%s,%s) ON DUPLICATE KEY UPDATE share_nums=share_nums+%s , m_time = %s;"""
                self.con.query(sql, (current_uk, len_insert_data,
                                     len_insert_data,
                                     time.strftime('%Y-%m-%d %H:%M:%S')))
                mylog.info('sharelist 成功: ' + url)
        except Exception as e:
            mylog.error('sharelist 失败: ' + str(url))
            mylog.error(e)
        # gen.Return subclasses Exception — raise it OUTSIDE the try block.
        raise gen.Return([])

    @gen.coroutine
    def get_spide(self, url):
        """Fetch `url` through a proxy popped from the proxy_ip_list queue.

        Tries up to 10 proxies; raises on total failure so the caller's
        error handling fires (the original fell off the loop and implicitly
        returned None, crashing the caller on response.body).
        """
        for _ in range(10):
            rlist = self.r.lrange("proxy_ip_list", 0, -1)
            i = None  # referenced in the except handler below
            try:
                if self.r.llen('proxy_ip_list') == 0:
                    self.put_ip()
                    mylog.info('proxy_ip_list队列无值,等待添加中....')
                # timeout=0 blocks until a proxy is available
                i = json.loads(self.r.blpop("proxy_ip_list", timeout=0)[1])
                httpconfigs = get_http_config()
                httpconfigs['proxy_host'] = i['proxy_host']
                httpconfigs['proxy_port'] = i['proxy_port']
                response = yield Spide(url, **httpconfigs).async_proxy()
                # response = yield Spide(url, **httpconfigs).async()
            except Exception as e:
                mylog.error(str(e))
                # Guard i — the original raised NameError here when blpop
                # or json.loads was what failed.
                mylog.error('无法连接... ' + str(len(rlist)) + ' ' +
                            (str(i['proxy_host']) if i else 'unknown'))
            else:
                mylog.info('连接成功...' + str(len(rlist)) + ' ' +
                           str(i['proxy_host']))
                raise gen.Return(response)
        raise Exception('get_spide: no usable proxy for ' + url)

    def put_ip(self):
        """Refill proxy_ip_list from the proxy table, in random order."""
        sql = """SELECT proxy_host,proxy_port FROM pt_db.spide_proxies_ip order by rand();"""
        result = self.con.query(sql)
        for i in result:
            self.r.rpush("proxy_ip_list", json.dumps(i))
        mylog.info('向proxy_ip_list加数据')

    def put_share_list(self):
        """Refill share_list with every uk that has shares, most shares first."""
        sql = """SELECT uk FROM pt_db.spide_all_person p where p.share_nums !=0 order by share_nums desc; """
        result = self.con.query(sql)
        for i in result:
            self.r.rpush("share_list", str(i['uk']))
        mylog.info('向share_list加数据')
        if not result:
            mylog.info('share_list数据库无数据...')

    def get_all_person(self, uk):
        """Return how many shares `uk` has gained since the last recorded run.

        Returns 0 when the uk is unknown or has no shares.
        """
        sql = """SELECT ifnull(p.share_nums,0)-ifnull(l.share_nums,0) as share_nums FROM pt_db.spide_all_person p left join pt_db.spide_all_person_log l on p.uk = l.uk where p.share_nums !=0 and p.uk = %s """
        all_person = self.con.query(sql, uk)
        # The original left this assignment commented out and then returned
        # the undefined name `uk_data` (NameError on every call) — restored.
        uk_data = (all_person[0]['share_nums']) if all_person else 0
        return uk_data

    @gen.coroutine
    def worker(self):
        """Main consume loop: pop a uk and page through its new shares."""
        while True:
            try:
                self.add_most_person()
                if self.r.llen('share_list') == 0:
                    mylog.info('share_list队列无值,等待添加中....')
                    self.put_share_list()
                mylog.info('消费队列:share_list:{0}'.format(
                    self.r.llen('share_list')))
                popped = self.r.blpop("share_list", timeout=200)
                if popped is None:
                    # Timed out with nothing queued — the original crashed
                    # here on None[1].
                    continue
                current_uk = popped[1]
                query_share_nums = self.get_all_person(current_uk)
                limit = 60
                if query_share_nums > 0:
                    # ceil(query_share_nums / limit) pages
                    for j in range((query_share_nums - 1) // limit + 1):
                        starts = j * limit
                        yield self.getsharelist(current_uk, starts, limit)
            except Exception as e:
                # e.message is deprecated (and gone in py3) — use str(e).
                mylog.error(str(e))

    def runner(self):
        """Run the consume loop on the current IOLoop (blocks forever)."""
        ioloop.IOLoop.current().run_sync(self.worker)
class SpProducer(object):
    """Producer: walks the follow graph starting from base_uk, pushing newly
    discovered uks onto the redis 'follow_list' queue and recording each
    person in MySQL.

    NOTE(review): this chunk also contains another class named SpProducer
    above; if both live in the same module this one shadows it — confirm
    they belong in separate files.
    """

    def __init__(self):
        self.now_time = time.strftime('%Y-%m-%d %H:%M:%S')
        self.r = RedisPool(**redis_conf).redis()
        self.con = MyPyMysql(**mysql_config)
        # Make sure base_uk has a log row, then seed the queue with it.
        self.first_sql = """ insert into pt_db.spide_all_person_log (uk,follow_nums,fan_nums,share_nums) values (%s,%s,%s,%s) ON DUPLICATE KEY UPDATE follow_nums=follow_nums , m_time = %s;"""
        self.con.query(self.first_sql, [base_uk, 0, 0, 0, self.now_time])
        self.r.rpush("follow_list", str(base_uk))

    @gen.coroutine
    def getfollowlist(self, current_uk):
        """Incrementally fetch who `current_uk` follows.

        Upserts discovered people into MySQL and returns (via gen.Return)
        the list of followed uks; [] on failure.
        """

        @gen.coroutine
        def followquery(current_uk, start, limit):
            # Fetch one page of the follow API; retry while errno != 0.
            url = follow_url.format(start, limit, current_uk)
            mylog.info('follow: ' + url)
            response = yield self.get_spide(url)
            list_data = json.loads(response.body)
            if list_data['errno'] != 0:
                mylog.info(response.body)
                # Propagate the *retried* page's result — the original
                # discarded it (implicit None return), so the caller crashed
                # on list_data[0].
                result = yield followquery(current_uk, start, limit)
                raise gen.Return(result)
            raise gen.Return([list_data['total_count'],
                              list_data['follow_list']])

        start = 0
        limit = 24
        url = follow_url.format(start, limit, current_uk)
        try:
            list_data = yield followquery(current_uk, start, limit)
            total_count = list_data[0]
            follow_list = []
            person_data = []
            uk_lists = []
            sql = """select follow_nums from pt_db.spide_all_person_log where uk = %s """
            query_data = self.con.query(sql, current_uk)
            query_follows = query_data[0]['follow_nums'] if query_data else 0
            # Incremental update: only fetch pages new since the last run.
            for_follows = total_count - query_follows
            if for_follows > 0:
                # ceil(for_follows / limit) pages
                for j in range((for_follows - 1) // limit + 1):
                    start = j * limit
                    url = follow_url.format(start, limit, current_uk)
                    range_data = yield followquery(current_uk, start, limit)
                    follow_list.extend(range_data[1])
            for i in follow_list:
                uk_lists.append(i['follow_uk'])
                person_data.append([
                    i['follow_uk'], i['follow_uname'].encode("utf-8"),
                    i['fans_count'], i['follow_count'], i['pubshare_count']
                ])
            # Upsert everyone found into the person table.
            if person_data:
                sql = """ replace into pt_db.spide_all_person (uk,uk_name,fan_nums,follow_nums,share_nums) values %s """
                self.con.insert_query(sql, person_data)
            # Record this person's current follow count.
            sql = """ insert into pt_db.spide_all_person_log (uk,follow_nums) values (%s,%s) ON DUPLICATE KEY UPDATE follow_nums=%s , m_time = %s;"""
            self.con.query(sql, (current_uk, total_count, total_count,
                                 time.strftime('%Y-%m-%d %H:%M:%S')))
        except Exception as e:
            mylog.error('followlist 失败: ' + str(url))
            mylog.error(e)
            raise gen.Return([])
        raise gen.Return(uk_lists)

    @gen.coroutine
    def get_spide(self, url):
        """Fetch `url` through a proxy popped from the proxy_ip_list queue.

        Tries up to 10 proxies; raises on total failure so the caller's
        error handling fires (the original fell off the loop and implicitly
        returned None, crashing the caller on response.body).
        """
        for _ in range(10):
            rlist = self.r.lrange("proxy_ip_list", 0, -1)
            i = None  # referenced in the except handler below
            try:
                if self.r.llen('proxy_ip_list') == 0:
                    mylog.info('proxy_ip_list队列无值,等待添加中....')
                i = json.loads(self.r.blpop("proxy_ip_list", timeout=100)[1])
                httpconfigs = get_http_config()
                httpconfigs['proxy_host'] = i['proxy_host']
                httpconfigs['proxy_port'] = i['proxy_port']
                response = yield Spide(url, **httpconfigs).async_proxy()
                # response = yield Spide(url, **httpconfigs).async()
            except Exception as e:
                mylog.error(str(e))
                # Guard i — the original raised NameError here when blpop
                # timed out (returns None) or json.loads failed.
                mylog.error('无法连接... ' + str(len(rlist)) + ' ' +
                            (str(i['proxy_host']) if i else 'unknown'))
            else:
                mylog.info('连接成功...' + str(len(rlist)) + ' ' +
                           str(i['proxy_host']))
                raise gen.Return(response)
        raise Exception('get_spide: no usable proxy for ' + url)

    @gen.coroutine
    def put_ip(self):
        """Refill follow_list with uks that have followers, in random order.

        NOTE(review): despite the name, this refills the *follow_list* uk
        queue from MySQL, not proxy IPs — consider renaming.
        """
        sql = """SELECT uk FROM pt_db.spide_all_person p where follow_nums != 0 order by rand() desc ;"""
        result = self.con.query(sql)
        for i in result:
            self.r.rpush("follow_list", str(i['uk']))
        mylog.info('向follow_list加数据')
        if not result:
            mylog.info('spide_all_person无数据...')

    @gen.coroutine
    def fetch_url(self):
        """Pop one uk and enqueue everyone it follows (deduped by follow_set)."""
        try:
            if self.r.llen('follow_list') == 0:
                mylog.info('follow_list队列无值,等待添加中....')
                self.put_ip()
            # The original did this blpop OUTSIDE the try: on timeout it
            # returned None and None[1] killed the whole worker loop.
            popped = self.r.blpop("follow_list", timeout=100)
            if popped is None:
                return
            current_uk = popped[1]
            mylog.info('生产队列:follow_list:{0},followed_set:{1}'.format(
                self.r.llen('follow_list'), self.r.scard('followed_set')))
            follow_uks = yield self.getfollowlist(current_uk)
            # fank_uks = yield getfanlist(current_uk,proxies)
            # uks = list(set(follow_uks + fank_uks))
            for uk in follow_uks:
                # Only enqueue uks not already in follow_set.
                # NOTE(review): nothing visible ever sadd's into 'follow_set'
                # (and the log above reads 'followed_set') — the dedup may be
                # ineffective; verify where the set is populated.
                if not self.r.sismember('follow_set', uk):
                    self.r.rpush("follow_list", uk)
            yield gen.sleep(product_sleeptime)
        except Exception as e:
            # e.message is deprecated (and gone in py3) — use str(e).
            mylog.error(str(e))

    @gen.coroutine
    def worker(self):
        """Main produce loop: process one uk per iteration, forever."""
        while True:
            yield self.fetch_url()

    def runner(self):
        """Run the produce loop on the current IOLoop (blocks forever)."""
        ioloop.IOLoop.current().run_sync(self.worker)