def get_user_from_page_comment():
    """Collect ids of users who commented on queued pages.

    Runs forever. Pops a page id from the Redis set ``page``, walks the
    paginated comment endpoint for that id and, for every commenter whose id
    is not yet in the Redis set ``used_user``, records the id in ``used_user``
    and queues it in ``users``. When ``page`` is empty the worker sleeps for
    5-10 seconds and polls again.
    """
    r = redis.Redis('', '6379')
    url = '{}&page={}'
    while True:
        page_id = r.spop('page')
        if not page_id:
            print('in get_user_from_page_comment')
            time.sleep(random.randint(5, 10))
            continue

        topic = page_id.decode('utf-8')
        # Every page id now starts from the same page number. Previously the
        # counter began at 0 for the first id only and was reset to 1 for all
        # later ones.
        # NOTE(review): 1 matches the old steady-state value; confirm the
        # endpoint is 1-based.
        page = 1
        while True:
            response = get_response(url.format(topic, page))
            result = json.loads(response.content)
            comments = result.get('data') if result else None
            if not comments:
                # An empty payload or an empty comment list ends pagination;
                # without the second check an empty ``data`` list would make
                # this loop request ever-higher pages forever.
                break
            page += 1
            for user in comments:
                user_id = user['user']['id']
                # SADD returns the number of members actually added, so one
                # call both tests and records membership.
                if r.sadd('used_user', user_id):
                    r.sadd('users', user_id)
def get_comment_article():
    """Queue pages that a user has commented on.

    Runs forever. Pops a user id from the Redis set ``user_comment`` and
    follows the ``np`` cursor through that user's listing. Every topic with
    more than zero comments that is not yet in ``used_page`` is recorded in
    ``used_page`` and queued in ``page``. When ``user_comment`` is empty the
    worker sleeps for 4-8 seconds and polls again.
    """
    r = redis.Redis('', '6379')
    url = '{}/baisishequ-iphone-8.0/{}-20.json'
    while True:
        user_id = r.spop('user_comment')
        if not user_id:
            time.sleep(random.randint(4, 8))
            print('in get_comment_article')
            continue

        commenter = user_id.decode('utf-8')
        # The cursor is initialised per user instead of relying on the reset
        # that used to happen only on the normal break path.
        np = 0
        while True:
            response = get_response(url.format(commenter, np))
            result = json.loads(response.content)
            np = result['info']['np']
            if not np:
                # NOTE(review): the list of a response whose ``np`` is falsy
                # is not processed (unchanged from the original) - confirm
                # the final page really carries no items.
                break
            np = int(np)
            for page in result['list']:
                topic = page['topic']
                page_id = topic['id']
                # SADD returns the number of members actually added, so it
                # replaces the separate SISMEMBER check.
                if int(topic['comment']) > 0 and r.sadd('used_page', page_id):
                    r.sadd('page', page_id)
def get_user_follows():
    """Discover new users through the follow lists of known users.

    Runs forever. Pops a user id from the Redis set ``user_follow`` and pages
    through that user's follow list using the ``follow_id`` cursor returned
    by each response. Every listed user not yet in ``used_user`` is recorded
    in ``used_user`` and queued in ``users``. When ``user_follow`` is empty
    the worker sleeps for 4-8 seconds and polls again.
    """
    r = redis.Redis('', '6379')
    url = '{}&userid={}'
    while True:
        user_id = r.spop('user_follow')
        if not user_id:
            time.sleep(random.randint(4, 8))
            print('in get user follows')
            continue

        follower = user_id.decode('utf-8')
        # The cursor is initialised per user so no state leaks between users.
        follow_id = 0
        while True:
            response = get_response(url.format(follow_id, follower))
            result = json.loads(response.content.decode('utf-8'))
            follow_id = result['data']['info']['follow_id']
            # The original only stopped on the exact string '0'; an int 0 or
            # an empty cursor would have re-requested the same page forever.
            if not follow_id or str(follow_id) == '0':
                # NOTE(review): the list of the terminating response is not
                # processed (unchanged from the original) - confirm it is
                # empty.
                break
            for user in result['data']['list']:
                new_user_id = user['id']
                # SADD returns the number of members actually added, so one
                # call both tests and records membership.
                if r.sadd('used_user', new_user_id):
                    r.sadd('users', new_user_id)
def get_share_article():
    """Queue pages that a user has shared.

    Runs forever. Pops a user id from the Redis set ``user_share`` and
    follows the ``np`` cursor through that user's listing. Every page not yet
    in ``used_page`` is recorded in ``used_page`` and queued in ``page``.
    When ``user_share`` is empty the worker sleeps for 4-8 seconds and polls
    again.
    """
    r = redis.Redis('', '6379')
    url = '{}/baisishequ-iphone-8.0/{}-20.json'
    while True:
        user_id = r.spop('user_share')
        if not user_id:
            print('in get_share_article')
            # Back off like the sibling workers do; without this the loop
            # spins against Redis and floods stdout while the queue is empty.
            time.sleep(random.randint(4, 8))
            continue

        sharer = user_id.decode('utf-8')
        # The cursor is initialised per user instead of relying on the reset
        # that used to happen only on the normal break path.
        np = 0
        while True:
            response = get_response(url.format(sharer, np))
            result = json.loads(response.content)
            np = result['info']['np']
            if not np:
                # NOTE(review): the list of a response whose ``np`` is falsy
                # is not processed (unchanged from the original) - confirm
                # the final page really carries no items.
                break
            np = int(np)
            for page in result['list']:
                page_id = page['id']
                # SADD returns the number of members actually added, so it
                # replaces the separate SISMEMBER check.
                if r.sadd('used_page', page_id):
                    r.sadd('page', page_id)
def get_user_articles(redis_set='user_article',
                      not_climb_depth=False,
                      check_date=False,
                      redis_increament_set=None):
    """Store the articles of queued users in MongoDB.

    Runs forever. Pops a user id from the Redis set ``redis_set``, follows
    the ``np`` cursor through that user's article listing and inserts one
    document per qualifying article into ``budejie.budejie_article``. Every
    listed page whose comment count is >= 0 and that is not yet in
    ``used_page`` is also recorded there and queued in ``page``.

    An article is skipped when its up-vote count is <= 0, when its text
    contains the "content deleted" marker, or (with ``check_date``) when the
    date part of ``passtime`` is older than 30 days.

    Args:
        redis_set: Name of the Redis set to pop user ids from.
        not_climb_depth: When truthy, only the first listing page of each
            user is fetched.
        check_date: When truthy, articles older than 30 days are skipped and
            pagination for the user stops after the page on which the first
            such article was seen.
        redis_increament_set: Optional Redis set name; the user id is added
            to it each time a qualifying article is found.
    """
    r = redis.Redis('', '6379')
    mongodb_client = pymongo.MongoClient(host='', port=27017)
    mongodb_table = mongodb_client.budejie.budejie_article
    print(
        'redis_set is: {},not_climb_depth is: {},bool is: {},check_date is: {},bool is: {}'
        .format(redis_set, not_climb_depth, bool(not_climb_depth),
                check_date, bool(check_date)))
    list_url = '{}/1/desc/baisishequ-win-1.0/{}-20.json'

    while True:
        user_id = r.spop(redis_set)
        if not user_id:
            time.sleep(random.randint(4, 8))
            continue

        author = user_id.decode('utf-8')
        np = 0
        outdate_flag = False
        # ISO ``YYYY-MM-DD`` strings sort chronologically, so a plain string
        # comparison against the cutoff works. The original compared against
        # ``str(-timedelta(days=30))`` ('-30 days, 0:00:00'), which no date
        # string is ever smaller than, so the check never fired.
        # NOTE(review): assumes ``passtime`` begins with an ISO date.
        cutoff = str(datetime.date.today() - datetime.timedelta(days=30))

        while True:
            response = get_response(list_url.format(author, np))
            try:
                result = json.loads(response.content)
            except ValueError:
                # Unparseable payload: log it and give up on this user. The
                # original blocked on ``input()`` and then carried on with an
                # undefined or stale ``result``.
                print(response.content)
                break
            np = result['info']['np']
            if not np:
                # NOTE(review): the list of a response whose ``np`` is falsy
                # is not processed (unchanged from the original) - confirm
                # the final page really carries no items.
                break

            for page in result['list']:
                page_id = page['id']
                # SADD returns the number of members actually added, so it
                # replaces the separate SISMEMBER check.
                if int(page['comment']) >= 0 and r.sadd('used_page', page_id):
                    r.sadd('page', page_id)

                # Number of up-votes.
                up_count = page['up']
                article_text = page['text'].replace(' ', '').replace('\n', '')
                if int(up_count) <= 0 or '该内容已被删除' in article_text:
                    continue

                pass_date = page['passtime'].split(' ')[0]
                if check_date and pass_date < cutoff:
                    outdate_flag = True
                    continue

                if redis_increament_set:
                    r.sadd(redis_increament_set, user_id)
                print('up_count is: {}'.format(up_count))

                article_type = page['type']
                value = {
                    'user_id': author,
                    'page': page_id,
                    'up_count': up_count,
                    'article_type': article_type,
                    'article_text': article_text,
                }
                try:
                    value['hot_comment'] = [
                        comment['content'].replace(' ', '').replace('\n', '')
                        for comment in page['top_comments']
                        if comment['content'] != ''
                    ]
                except (KeyError, TypeError, AttributeError):
                    # Hot comments are optional; a missing or malformed
                    # ``top_comments`` leaves the field out of the document.
                    pass

                media_url = _article_media_url(page, article_type)
                if media_url is not None:
                    value['url'] = media_url
                value['is_download'] = 'false'

                try:
                    mongodb_table.insert_many([value])
                except Exception as e:
                    # Best effort: report the failure and move on.
                    print(e)

            if not_climb_depth or outdate_flag:
                break


def _article_media_url(page, article_type):
    """Return the first media URL of an image/gif/video item, else None."""
    media_fields = {
        'image': ('image', 'big'),
        'gif': ('gif', 'images'),
        'video': ('video', 'download'),
    }
    if article_type not in media_fields:
        return None
    container, field = media_fields[article_type]
    return page[container][field][0]
def get_user_detail():
    """Fetch user profiles and fan the ids out to the follow-up queues.

    Runs forever. Pops a user id from the Redis set ``users``, requests the
    profile and, depending on which counters are positive, adds the id to the
    Redis sets ``user_fans``, ``user_follow``, ``user_article`` and
    ``user_comment``. Users with a non-empty ``phone`` field are also
    inserted into the PostgreSQL table ``budejie_user``, one commit per
    insert. When ``users`` is empty the worker sleeps for 4-8 seconds and
    polls again.

    The cursor and connection are closed if the loop is left by an exception.
    """
    r = redis.Redis('', '6379')
    conn = psycopg2.connect(database='budejie',
                            user='******',
                            password='******',
                            host='',
                            port=5432)
    cur = conn.cursor()
    url = '{}'
    insert_sql = (
        "insert into budejie_user (user_id,user_sex,user_phone,user_name,user_fans,user_article,is_download) values (%s,%s,%s,%s,%s,%s,%s);"
    )
    try:
        while True:
            user_id = r.spop('users')
            if not user_id:
                print('in get_user_information')
                time.sleep(random.randint(4, 8))
                continue

            try:
                response = get_response(url.format(user_id.decode('utf-8')))
                user_information = json.loads(
                    response.content.decode('utf-8'))['data']
                user_article_num = user_information['tiezi_count']
                user_sex = user_information['sex']
                user_follow = user_information['follow_count']
                user_fans = user_information['fans_count']
                user_phone = user_information['phone']
                user_name = user_information['username'].strip()
                user_id = user_information['id']
                user_comment = user_information['comment_count']
                # Convert inside the guarded section so a malformed counter
                # skips this user instead of killing the worker.
                fans_total = int(user_fans)
                follow_total = int(user_follow)
                article_total = int(user_article_num)
                comment_total = int(user_comment)
            except Exception as e:
                # Worker boundary: report the failure instead of silently
                # dropping the user, then carry on with the next id.
                print('fetch error,user_id is: {},error is: {}'.format(
                    user_id, e))
                continue

            if fans_total > 0:
                r.sadd('user_fans', user_id)
            if follow_total > 0:
                r.sadd('user_follow', user_id)
            if article_total > 0:
                r.sadd('user_article', user_id)
            if comment_total > 0:
                r.sadd('user_comment', user_id)

            if user_phone == '':
                continue

            try:
                cur.execute(
                    insert_sql,
                    (user_id, '女' if user_sex == 'f' else '男', user_phone,
                     user_name, user_fans, user_article_num, '0'))
            except Exception:
                # A failed statement leaves the transaction aborted; without
                # a rollback every later INSERT on this connection fails too.
                conn.rollback()
                print('insert error,user_id is: {}'.format(user_id))
                continue

            print(
                'user_id is: {},user_name is: {},user_sex is: {},user_phone is: {},user_comment is: {}'
                .format(user_id, user_name, user_sex, user_phone,
                        user_comment))
            # The original guarded this with ``count % 1 == 0``, which is
            # always true, so every insert is committed.
            conn.commit()
    finally:
        # The original close calls sat after ``while True`` and could never
        # run.
        cur.close()
        conn.close()