Beispiel #1
0
class CheckQualificationByRankList():
    """Check tracked users through the rank-list endpoint and record live ones.

    For every user returned by the Redis client, the rank-list payload is
    fetched, users whose nickname contains a blocked keyword are deleted,
    and users that currently own a room are collected (user id and room id)
    so that run() can persist them batch by batch.
    """

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        # Per-batch accumulators: filled by get_rank_list(), flushed and
        # cleared by run() after every batch.
        self.live_user_list = []
        self.room_id_list = []
        self.stupid_key_words = STUPID_KEY_WORDS

    def get_users(self):
        """Return the users to check, as supplied by the Redis client."""
        # Original author's note: each call fetches the 10000 lowest-scored users.
        return self.redis_client.get_users()

    def is_live_user(self, sec_user_id):
        """Ask the Redis client whether this user is already recorded as live."""
        return self.redis_client.is_live_user(sec_user_id)

    def save_rooms(self):
        """Push every room id collected in the current batch to Redis."""
        for room_id in self.room_id_list:
            self.redis_client.add_rooms(room_id)

    def add_to_live_users(self):
        """Push every live user collected in the current batch to Redis."""
        for sec_user_id in self.live_user_list:
            self.redis_client.add_live_users(sec_user_id, 1)

    def increase_user_score(self, sec_user_id):
        """Delegate to the Redis client to bump the user's score."""
        self.redis_client.increase_user_score(sec_user_id)

    def is_qualified_user(self, user):
        """Return False when the user's nickname contains a blocked keyword.

        A missing or None nickname is treated as an empty string, so such a
        user is considered qualified instead of raising TypeError.
        """
        nickname = user.get('nickname') or ''
        return not any(word in nickname for word in self.stupid_key_words)

    def get_rank_list(self, sec_user_id):
        """Check one user and update the per-batch accumulators.

        Users already known as live only get their score increased. For the
        others the rank-list payload is fetched; unqualified users are
        deleted from Redis, qualified ones get their score increased and,
        when they own a room, are queued as live. Always returns None.
        """
        if self.is_live_user(sec_user_id):
            self.increase_user_score(sec_user_id)
            return None

        try:
            raw_data = self.get_raw_data.get_rank_list(sec_user_id)
        except Exception as e:
            # str(e) instead of e.args[0]: args may be empty or non-string,
            # which would make the error handler itself raise.
            logger.error('get_rank_list出错-' + str(e) + '-sec_user_id-' + str(sec_user_id))
            return None

        try:
            user = raw_data.get('data').get('anchor_info').get('user')
        except Exception as e:
            logger.error('parse_current_room出错' + str(e) + '-sec_user_id-' + str(sec_user_id))
            return None

        if not user:
            # The chained .get() calls can succeed and still yield no user.
            logger.error('parse_current_room出错' + 'missing user' + '-sec_user_id-' + str(sec_user_id))
            return None

        if not self.is_qualified_user(user):
            self.redis_client.delete_users(sec_user_id)
            logger.info('删除user-sec_user_id-' + str(sec_user_id))
            return None

        # Original author's note: own_room is present once a live stream has started.
        own_room = user.get('own_room')
        if own_room:
            room_ids = own_room.get('room_ids_str')
            # Only queue the user when a room id is actually available.
            if room_ids:
                self.live_user_list.append(sec_user_id)
                self.room_id_list.append(room_ids[0])
        self.increase_user_score(sec_user_id)
        return None

    def run(self):
        """Process all users in gevent batches, persisting results per batch."""
        users = self.get_users()

        # Original author's note: batches of 80 worked for this endpoint
        # (about 4000 users in total); larger batches were not tried.
        batch_size = 50
        for start in range(0, len(users), batch_size):
            stop = min(start + batch_size, len(users))
            logger.info('当前获取用户序号-' + str(start + 1) + '-' + str(stop))
            tasks = [gevent.spawn(self.get_rank_list, sec_user_id) for sec_user_id in users[start:stop]]
            gevent.joinall(tasks)

            self.save_rooms()
            self.add_to_live_users()
            logger.info('新增room_id-' + str(len(self.room_id_list)))
            self.room_id_list.clear()
            self.live_user_list.clear()
Beispiel #2
0
class SaveLiveUsers():
    """One-off helpers moving scraped live-stream data between dumps and stores.

    The run_* methods are independent entry points: they read JSON-lines
    dump files, insert rows into MySQL, build a sample table, export CSV, or
    annotate an Excel sheet. File names and table names are hard-coded.
    """

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.db = pymysql.connect(host='localhost', port=3306, user='******', password='******', db='bxmind', charset='utf8mb4')
        self.cursor = self.db.cursor()
        self.mysql_client = MysqlClient()
        self.redis_client = RedisClient()
        self.stupid_key_words = STUPID_KEY_WORDS
        # a_list: rows selected by select_users(); b_list: the subset of
        # a_list kept by get_cates().
        self.a_list = []
        self.b_list = []

    def into_mysql(self, data, table):
        """Insert one dict as a row of `table`, committing immediately.

        Values are passed as query parameters. Failures are printed rather
        than raised (best effort), and the transaction is rolled back so a
        failed statement does not linger on the connection.
        """
        # NOTE(review): table and column names are interpolated into the SQL
        # text; they must come from trusted sources.
        keys = ','.join(data.keys())
        values = ','.join(['%s'] * len(data))
        sql = 'insert into %s (%s) values (%s)' % (table, keys, values)
        try:
            self.cursor.execute(sql, tuple(data.values()))
            self.db.commit()
        except Exception as e:
            print(e.args)
            try:
                self.db.rollback()
            except Exception as rollback_error:
                # Keep the best-effort contract even if the connection is gone.
                print(rollback_error.args)

    def is_qualified(self, nickname):
        """Return False when the nickname contains a blocked keyword.

        None is treated as an empty nickname instead of raising TypeError.
        """
        nickname = nickname or ''
        return not any(word in nickname for word in self.stupid_key_words)

    def run_a(self):
        """Load status-4 records from the 2020-06-14 dump into dy_live_lives.

        Records with a blocked nickname are removed from Redis instead.
        """
        table = 'dy_live_lives'
        with open('lives_20200614.txt', 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    # A blank line would make json.loads raise.
                    continue
                data = json.loads(line)
                if data.get('status') != 4:
                    continue
                if self.is_qualified(data.get('nickname')):
                    self.into_mysql(data, table)
                else:
                    sec_user_id = data.get('sec_user_id')
                    self.redis_client.delete_users(sec_user_id)
                    self.redis_client.delete_live_users(sec_user_id)

    def replicate_table(self):
        """Create dy_sample with the same structure as dy_live_lives."""
        sql = 'CREATE TABLE dy_sample LIKE dy_live_lives'
        self.cursor.execute(sql)

    def select_users(self):
        """Fill a_list with dy_live_lives rows above the viewer/follower cut-offs.

        a_list is emptied first so repeated calls (run_b and run_c both call
        this) do not accumulate duplicate rows.
        """
        columns = ('room_id', 'sec_user_id', 'nickname', 'short_id',
                   'total_viewer', 'like_count', 'follower_count',
                   'signature', 'city')
        sql = ('SELECT ' + ', '.join(columns) +
               ' FROM dy_live_lives WHERE total_viewer > 50000 AND follower_count > 500000')
        self.cursor.execute(sql)
        self.a_list.clear()
        row = self.cursor.fetchone()
        while row:
            self.a_list.append(dict(zip(columns, row)))
            row = self.cursor.fetchone()

    def get_cates(self, data):
        """Append `data` to b_list when the user sells enough food/tea items.

        A user qualifies when one of the target categories has a count of
        at least 3. A missing category list is treated as empty.
        """
        sec_user_id = data['sec_user_id']
        try:
            cates_raw_data = self.get_raw_data.get_cates(sec_user_id)
        except Exception as e:
            # str(e) instead of e.args[0]: args may be empty or non-string.
            logger.error('get_cates出错-' + str(e) + '-sec_user_id-' + str(sec_user_id))
            return None
        target_cates = ('零食', '食品', '花茶', '果茶')
        for each in (cates_raw_data or {}).get('user_shop_categories') or []:
            if each.get('name') in target_cates and (each.get('count') or 0) >= 3:
                self.b_list.append(data)
                break
        return None

    def run_b(self):
        """Select users, keep those passing get_cates(), store them in dy_sample."""
        self.select_users()
        logger.info('a_list共有数据-' + str(len(self.a_list)))

        # Start from an empty result list so a second call does not re-insert
        # rows collected by a previous one.
        self.b_list.clear()
        batch_size = 100
        for start in range(0, len(self.a_list), batch_size):
            stop = min(start + batch_size, len(self.a_list))
            logger.info('当前爬取用户序号-' + str(start + 1) + '-' + str(stop))
            tasks = [gevent.spawn(self.get_cates, data) for data in self.a_list[start:stop]]
            gevent.joinall(tasks)

        logger.info('b_list共有数据-' + str(len(self.b_list)))
        for data in self.b_list:
            self.into_mysql(data, 'dy_sample')

    def run_c(self):
        """Copy every selected user straight into dy_sample."""
        self.select_users()
        for data in self.a_list:
            self.into_mysql(data, 'dy_sample')

    def select_rooms(self):
        """Return the room_id of every row in dy_sample."""
        sql = 'SELECT room_id FROM dy_sample'
        self.cursor.execute(sql)
        room_list = []
        row = self.cursor.fetchone()
        while row:
            room_list.append(row[0])
            row = self.cursor.fetchone()
        return room_list

    def get_txt(self):
        """Print the first line of the 2020-06-05 dump (debug helper)."""
        with open('lives_20200605.txt', 'r', encoding='utf-8') as f:
            first_line = f.readline()
            if first_line:
                print(first_line)

    def write_to_file(self, item_list):
        """Append one line to today's item_lists_sample file in FILE_DIRECTORY."""
        today = time.strftime('%Y%m%d', time.localtime())
        path = FILE_DIRECTORY + '/' + 'item_lists_sample' + '_' + today + '.txt'
        with open(path, 'a', encoding='utf-8') as file:
            file.write(item_list + '\n')

    def run_d(self):
        """Fetch one hard-coded Taobao item-detail URL and print the response."""
        url = 'https://detailskip.taobao.com/service/getData/1/p1/item/detail/sib.htm?itemId=606547898363&sellerId=2206709156233&modules=dynStock,qrcode,viewer,price,duty,xmpPromotion,delivery,activity,fqg,zjys,couponActivity,soldQuantity,page,originalPrice,tradeContract&callback=onSibRequestSuccess'

        headers = {
            'Referer': 'https://item.taobao.com/item.htm?id=606547898363',
            'Sec-Fetch-Mode': 'no-cors',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'
        }
        response = requests.get(url, headers=headers, allow_redirects=False)
        print(response.text)

    def _flush_batch(self, loop, data_batch, table):
        """Insert every queued record through the async client, then empty the queue."""
        if not data_batch:
            # asyncio.wait() rejects an empty collection.
            return
        # Wrap each awaitable in a future first: asyncio.wait() no longer
        # accepts bare coroutines on Python 3.11+.
        # NOTE(review): assumes mysql_client.into_mysql returns an awaitable,
        # as the original asyncio.wait() call implied.
        tasks = [
            asyncio.ensure_future(self.mysql_client.into_mysql(loop, record, table), loop=loop)
            for record in data_batch
        ]
        loop.run_until_complete(asyncio.wait(tasks))
        data_batch.clear()

    def run_e(self):
        """Load status-4 records from the 2020-06-09 dump via the async client.

        Records are inserted in batches of 200, including the final partial
        batch. Records with a blocked nickname are removed from Redis.
        """
        file = 'lives_20200609.txt'
        table = 'dy_live_lives'
        data_batch = []
        batch_size = 200

        loop = asyncio.get_event_loop()
        task = loop.create_task(self.mysql_client.connect_mysql(loop))
        loop.run_until_complete(task)

        with open(file, 'r', encoding='utf-8') as f:
            for raw_line in f:
                if not raw_line.strip():
                    continue
                record = json.loads(raw_line)
                if not self.is_qualified(record.get('nickname')):
                    self.redis_client.delete_users(record.get('sec_user_id'))
                    self.redis_client.delete_live_users(record.get('sec_user_id'))
                    print('删除user', record.get('nickname'))
                    continue
                # Original author's note: at first nothing was stored and the
                # program ended almost immediately because 4 had been written
                # as '4'. Debug prints then failed on kaomoji in the data
                # (which do not break the insert itself), costing a lot of time.
                if record.get('status') == 4:
                    # The mobile field is not stored; it may be absent.
                    record.pop('mobile', None)
                    data_batch.append(record)
                if len(data_batch) >= batch_size:
                    self._flush_batch(loop, data_batch, table)

        # Without this the last (< batch_size) records were never inserted.
        self._flush_batch(loop, data_batch, table)

    def run_f(self):
        """Append qualified status-4 records of the 2020-06-23 dump to a CSV file.

        Columns are taken from the first record (minus 'mobile'); no header
        row is written. An empty dump is a no-op.
        """
        file_a = 'lives_20200623.txt'
        file_b = '第四批_抖音主播_去重前.csv'
        with open(file_a, 'r', encoding='utf-8') as f:
            lines = [line for line in f if line.strip()]
        if not lines:
            return

        first_line = json.loads(lines[0])
        first_line.pop('mobile', None)
        keys = list(first_line.keys())
        with open(file_b, 'a', encoding='utf-8-sig', newline='') as g:
            writer = csv.DictWriter(g, fieldnames=keys)
            for line in lines:
                record = json.loads(line)
                if self.is_qualified(record.get('nickname')) and record.get('status') == 4:
                    record.pop('mobile', None)
                    writer.writerow(record)

    def run_g(self):
        """Extract an 11-digit number starting with 1 from column 15 of each row.

        The match is written into a new column appended to a copy of the
        workbook, which is saved under a new name.
        """
        # NOTE(review): `copy` is presumably xlutils.copy.copy - confirm
        # against the file's imports.
        read_workbook = xlrd.open_workbook('C:/Users/百芯科技/scraping/douyin7/第四批_抖音主播_去重后.xlsx')
        write_workbook = copy(read_workbook)
        read_sheet = read_workbook.sheet_by_name('Sheet1')
        write_sheet = write_workbook.get_sheet(0)

        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        # The greedy leading '.*' means the last candidate in the text wins.
        mobile_pattern = re.compile(r'.*(1\d{10}).*', re.S)
        nrows = read_sheet.nrows
        ncolumns = read_sheet.ncols
        for row in range(1, nrows):  # row 0 is skipped
            text = read_sheet.row(row)[15].value
            if not text:
                continue
            found = mobile_pattern.match(str(text))
            if found:
                write_sheet.write(row, ncolumns, found.group(1))
        write_workbook.save('C:/Users/百芯科技/scraping/douyin7/第四批_抖音主播_去重后_电话.xlsx')
Beispiel #3
0
class GetPromotions():
    """Fetch each tracked user's promoted products and dump them to text files.

    Parsed promotions are appended as JSON lines to sharded files, and the
    aweme (video) ids referenced by promotions are collected per batch and
    handed to the Redis client.
    """

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        # Per-batch accumulator: filled while parsing, flushed by run().
        self.aweme_id_list = []

    def get_users(self):
        """Return the users to process, as supplied by the Redis client."""
        return self.redis_client.get_users()

    def get_promotions(self, sec_user_id):
        """Fetch, parse and persist the promotions of one user.

        Fetch and parse errors are logged and swallowed. Nothing is written
        when the user has no promotions. Always returns None.
        """
        try:
            raw_data = self.get_raw_data.get_promotions(sec_user_id)
        except Exception as e:
            # str(e) instead of e.args[0]: args may be empty or non-string.
            logger.error('get_promotions出错-' + str(e) + '-sec_user_id-' +
                         str(sec_user_id))
            return None

        try:
            promotions = self.parse_promotions(raw_data, sec_user_id)
        except Exception as e:
            # The original concatenated the e.args tuple to a str here, which
            # raised TypeError inside the handler.
            logger.error('parse_promotions错误-' + str(e) + '-sec_user_id-' +
                         str(sec_user_id))
            return None

        # parse_promotions() returns None for users without promotions;
        # do not write a literal "null" line for them.
        if promotions:
            self.write_to_file(json.dumps(promotions, ensure_ascii=False))
        return None

    def parse_promotions(self, raw_data, sec_user_id):
        """Convert the raw promotions payload into a list of flat dicts.

        Returns None, after deleting the user from Redis, when the payload's
        promotion list is empty. A payload without a 'promotions' list raises
        (TypeError from iterating None), as do malformed entries; the caller
        logs such errors.
        """
        data = raw_data.get('promotions')
        if data == []:
            logger.info('该用户不带货,将删除,sec_user_id-' + str(sec_user_id))
            self.redis_client.delete_users(sec_user_id)
            return None
        return [self._parse_promotion(each, sec_user_id) for each in data]

    def _parse_promotion(self, each, sec_user_id):
        """Flatten one raw promotion entry; records its aweme id if present."""
        promotion = {
            'sec_user_id': sec_user_id,
            # price / market_price are divided by 100 (presumably cents).
            'price': each.get('price') / 100,
            'cover_url': each.get('images')[0].get('url_list')[0],
            'title': each.get('title'),
            'product_id': each.get('product_id'),
            'product_url': each.get('detail_url'),
            # Kept as a string, unlike price, to preserve the output format.
            'min_price': str(int(each.get('min_price')) / 100),
            'douyin_sales': each.get('sales'),
            'product_source': each.get('goods_source'),
            'create_time': str(int(time.time())),
        }

        market_price = each.get('market_price')
        if market_price:
            promotion['market_price'] = market_price / 100

        last_aweme_id = each.get('last_aweme_id')
        if last_aweme_id:
            promotion['promotion_type'] = 'video'
            promotion['aweme_id'] = int(last_aweme_id)
            self.aweme_id_list.append(promotion['aweme_id'])
        else:
            promotion['promotion_type'] = 'picture'

        coupon = (each.get('taobao') or {}).get('coupon')
        if coupon:
            promotion['coupon_amount'] = coupon.get('coupon_amount')
            promotion['price_after_coupon'] = promotion['price'] - float(
                promotion['coupon_amount'])
            promotion['coupon_url'] = coupon.get('coupon_web_url')

        return promotion

    def write_to_file(self, promotions):
        """Append one line to a randomly chosen promotions_<0-99>.txt shard."""
        shard = str(random.choice(range(100)))
        path = FILE_DIRECTORY + '/' + 'promotions' + '_' + shard + '.txt'
        with open(path, 'a', encoding='utf-8') as file:
            file.write(promotions + '\n')

    def save_awemes(self):
        """Hand every aweme id collected in the current batch to Redis."""
        for aweme_id in self.aweme_id_list:
            self.redis_client.add_awemes(aweme_id)

    def run(self):
        """Process all users in gevent batches, saving aweme ids per batch."""
        users = self.get_users()
        logger.info('共有用户数量:' + str(len(users)))
        # Original author's note: even though this is asynchronous it is
        # slow; at 200 per batch it is as slow as running synchronously,
        # possibly a quirk of Douyin.
        batch_size = 50
        for start in range(0, len(users), batch_size):
            stop = min(start + batch_size, len(users))
            logger.info('get_promotions爬取当前用户序号-' + str(start + 1) + '-' +
                        str(stop))
            tasks = [
                gevent.spawn(self.get_promotions, sec_user_id)
                for sec_user_id in users[start:stop]
            ]
            gevent.joinall(tasks)

            self.save_awemes()
            self.aweme_id_list.clear()
class GetUserProfile():
    """Fetch user profiles, keep qualifying ones and append them to a daily file.

    Users that fail the qualification rules are deleted through the Redis
    client.
    """

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        self.stupid_key_words = STUPID_KEY_WORDS

    def get_users(self):
        """Return the users to process, as supplied by redis_client.test_b()."""
        return self.redis_client.test_b()

    def get_user_profile(self, sec_user_id):
        """Fetch, parse and persist the profile of one user.

        Fetch and parse errors are logged and swallowed. When a profile is
        produced it is written to file and redis_client.test_c() is called
        for the user. Always returns None.
        """
        try:
            raw_data = self.get_raw_data.get_user_profile(sec_user_id)
        except Exception as e:
            # str(e) instead of e.args[0]: args may be empty or non-string.
            logger.error('get_user_profile出错-' + str(e) + '-sec_user_id-' +
                         str(sec_user_id))
            return None

        try:
            user_profile = self.parse_user_profile(raw_data)
        except Exception as e:
            logger.error('parse_user_profile出错-' + str(e) +
                         '-sec_user_id-' + str(sec_user_id))
            return None

        if user_profile:
            self.write_to_file(json.dumps(user_profile, ensure_ascii=False))
            self.redis_client.test_c(sec_user_id)
        return None

    def is_qualified_user(self, user):
        """Return True only when the user passes every qualification rule.

        Rejected: nickname containing a blocked keyword, is_gov_media_vip
        set, enterprise_verify_reason or custom_verify different from '',
        or any of with_fusion_shop_entry / live_commerce /
        with_commerce_entry being falsy. A missing nickname is treated as
        empty instead of raising.
        """
        nickname = user.get('nickname') or ''
        if any(word in nickname for word in self.stupid_key_words):
            return False
        if user.get('is_gov_media_vip'):
            return False
        # NOTE: a missing verification field (None) differs from '' and so
        # still rejects the user, as in the original rules.
        if user.get('enterprise_verify_reason') != '':
            return False
        if user.get('custom_verify') != '':
            return False
        required_flags = ('with_fusion_shop_entry', 'live_commerce',
                          'with_commerce_entry')
        return all(user.get(flag) for flag in required_flags)

    def parse_user_profile(self, raw_data):
        """Build the flat profile dict for a qualified user.

        Returns None for an unqualified user, after deleting it from Redis
        and logging the deletion. Missing avatar or commerce sub-objects
        yield None for the corresponding fields rather than raising.
        """
        data = raw_data.get('user')
        sec_uid = data.get('sec_uid')

        if not self.is_qualified_user(data):
            # Only delete when the payload actually identifies the user.
            if sec_uid:
                self.redis_client.delete_users(sec_uid)
            logger.info('删除user-sec_user_id-' + str(sec_uid))
            return None

        avatar_urls = (data.get('avatar_larger') or {}).get('url_list') or []
        commerce_info = data.get('commerce_user_info') or {}
        return {
            'sec_uid': sec_uid,
            'follower_count': data.get('follower_count'),
            'nickname': data.get('nickname'),
            'gender': data.get('gender'),
            'location': data.get('location'),
            'birthday': data.get('birthday'),
            'avatar_url': avatar_urls[0] if avatar_urls else None,
            'school_name': data.get('school_name'),
            'signature': data.get('signature'),
            'uid': data.get('uid'),
            'short_id': data.get('short_id'),
            'unique_id': data.get('unique_id'),
            'star_atlas': commerce_info.get('star_atlas'),
            'aweme_count': data.get('aweme_count'),
            'dongtai_count': data.get('dongtai_count'),
            'following_count': data.get('following_count'),
            'favoriting_count': data.get('favoriting_count'),
            'total_favorited': data.get('total_favorited'),
            'live_commerce': data.get('live_commerce'),
            'create_time': str(int(time.time())),
        }

    def write_to_file(self, user_profile):
        """Append one line to today's user_profiles file in FILE_DIRECTORY."""
        today = time.strftime('%Y%m%d', time.localtime())
        path = FILE_DIRECTORY + '/' + 'user_profiles' + '_' + today + '.txt'
        with open(path, 'a', encoding='utf-8') as file:
            file.write(user_profile + '\n')

    def run(self):
        """Process all users in gevent batches."""
        users = self.get_users()
        logger.info('共有users-' + str(len(users)))
        # Original author's note: with 50 per batch no data comes back.
        batch_size = 1
        for start in range(0, len(users), batch_size):
            stop = min(start + batch_size, len(users))
            logger.info('当前爬取用户序号-' + str(start + 1) + '-' + str(stop))
            tasks = [
                gevent.spawn(self.get_user_profile, sec_user_id)
                for sec_user_id in users[start:stop]
            ]
            gevent.joinall(tasks)
Beispiel #5
0
class CheckRooms():
    """Re-check stored rooms and flag those that are now live with products.

    Rooms whose owner has fewer than 10000 followers are deleted (together
    with the owner); rooms with status 2 and a non-empty promotion list are
    collected per batch and re-added through the Redis client.
    """

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        # Per-batch accumulator: filled by check_room(), flushed by run().
        self.lives_on_list = []

    def get_rooms(self):
        """Return the rooms to check, from redis_client.get_rooms(0, 0)."""
        # NOTE(review): run() logs these as rooms that were not live before;
        # the meaning of the (0, 0) arguments is defined by RedisClient.
        return self.redis_client.get_rooms(0, 0)

    def change_room_status(self):
        """Re-add every room found live in this batch with value 1."""
        for room_id in self.lives_on_list:
            self.redis_client.add_rooms(room_id, 1)

    def check_room(self, room_id):
        """Inspect one room and update Redis / the live list accordingly.

        Fetch and parse errors are logged and swallowed. Always returns None.
        """
        try:
            room_raw_data = self.get_raw_data.get_live(room_id)
        except Exception as e:
            # str() on both parts: e.args may be empty or non-string, and
            # room_id is not guaranteed to be a str.
            logger.error('get_live出错-' + str(e) + '-room_id-' + str(room_id))
            return None

        try:
            room = room_raw_data.get('data')
            owner = room.get('owner')
            follower_count = owner.get('follow_info').get('follower_count')
            sec_user_id = owner.get('sec_uid')
        except Exception as e:
            logger.error('解析room_raw_data出错-' + str(e) + '-room_id-' +
                         str(room_id))
            return None

        if follower_count is None:
            # Comparing None with an int would raise outside any handler.
            logger.error('解析room_raw_data出错-' + 'missing follower_count' +
                         '-room_id-' + str(room_id))
            return None

        if follower_count < 10000:
            self.redis_client.delete_users(sec_user_id)
            self.redis_client.delete_rooms(room_id)
            return None

        # NOTE(review): status 2 presumably means the stream is live - confirm.
        if room.get('status') != 2:
            return None

        # Check whether this live stream is selling products.
        try:
            item_list_raw_data = self.get_raw_data.get_item_list(
                sec_user_id, room_id)
        except Exception as e:
            logger.error('get_item_list出错' + str(e) +
                         '-sec_user_id和room_id-' + str(sec_user_id) + '-' +
                         str(room_id))
            return None

        # A missing or None promotion list counts as "no products".
        if (item_list_raw_data or {}).get('promotions'):
            self.lives_on_list.append(room_id)
        return None

    def run(self):
        """Check all rooms in gevent batches, updating statuses per batch."""
        all_room_ids = self.get_rooms()
        logger.info('此前未在直播的直播间数量:' + str(len(all_room_ids)))
        batch_size = 200
        for start in range(0, len(all_room_ids), batch_size):
            stop = min(start + batch_size, len(all_room_ids))
            logger.info('待查看的此前未在直播的直播间-' + str(start + 1) + '-' + str(stop))
            tasks = [
                gevent.spawn(self.check_room, room_id)
                for room_id in all_room_ids[start:stop]
            ]
            gevent.joinall(tasks)

            logger.info('新发现开始的直播数量-' + str(len(self.lives_on_list)))
            self.change_room_status()
            self.lives_on_list.clear()