コード例 #1
0
ファイル: get_pre_users.py プロジェクト: mrcdyddup/tiktok
 def __init__(self):
     """Create the raw-data and Redis helper objects and the crawl state."""
     self.get_raw_data = GetRawData()
     self.redis_client = RedisClient()
     self.batch_size = 10
     self.pre_user_list = []
     # Commerce-related Chinese words (company, shop, factory, shoes,
     # clothing, trade, brand, ...).
     # NOTE(review): presumably matched against nicknames to drop business
     # accounts; the code that uses this list is not visible here.
     self.stupid_key_words = [
         '公司', '店', '铺', '厂', '行', '鞋', '装', '市', '服', '饰', '商', '贸', '牌',
         '汇', '馆', '裤', '业', '专', '卖'
     ]
コード例 #2
0
ファイル: test.py プロジェクト: mrcdyddup/tiktok
	def __init__(self):
		"""Create the API, MySQL and Redis helpers and two empty work lists."""
		self.get_raw_data = GetRawData()
		# Disabled alternative connection (remote host), kept for reference:
		#self.db = pymysql.connect(host='47.114.166.130', port=13306, user='******', password='******', db='bxdb', charset='utf8mb4')
		#self.cursor = self.db.cursor()
		# Active connection: local MySQL, database 'bxmind'.
		self.db = pymysql.connect(host='localhost', port=3306, user='******', password='******', db='bxmind', charset='utf8mb4')
		self.cursor = self.db.cursor()
		self.mysql_client = MysqlClient()
		self.redis_client = RedisClient()
		self.stupid_key_words = STUPID_KEY_WORDS
		# Scratch lists; nothing in this fragment fills them.
		self.a_list = []
		self.b_list = []
コード例 #3
0
ファイル: get_rooms.py プロジェクト: mrcdyddup/tiktok
class GetRooms():
    """Collect live rooms, and the users who own them, from the channel feed.

    A room is kept only when its owner has at least 10000 followers and the
    room's item list contains promotions.
    """

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        # Filled by parse_channel(), written to Redis by get_channel() and
        # emptied again at the end of run().
        self.sec_user_id_list = []
        self.room_id_list = []

    def get_channel(self):
        """Fetch the channel feed, parse it and push the results to Redis.

        Failures are logged and end the call early; nothing is raised and
        the return value is always None.
        """
        try:
            channel_raw_data = self.get_raw_data.get_channel()
        except Exception as e:
            # str(e) rather than e.args[0]: args can be empty or hold a
            # non-string, and the concatenation would then raise from
            # inside this handler.
            logger.error('get_channel出错-' + str(e))
            return None

        try:
            self.parse_channel(channel_raw_data)
        except Exception as e:
            logger.error('parse_channel出错-' + str(e))
            return None

        for room_id in self.room_id_list:
            self.redis_client.add_rooms(room_id)
        for sec_user_id in self.sec_user_id_list:
            self.redis_client.add_users(sec_user_id, 1)

    def parse_channel(self, channel_raw_data):
        """Append qualifying rooms/owners from the feed to the instance lists.

        Args:
            channel_raw_data: decoded feed; ``channel_raw_data['data']`` is
                iterated and each entry's ``'data'`` dict is read.

        Returns:
            None. Parsing stops at the first failing item-list request.
        """
        for entry in channel_raw_data.get('data'):
            room = entry.get('data')
            room_id = room.get('id_str')
            owner = room.get('owner')
            sec_user_id = owner.get('sec_uid')
            follower = owner.get('follow_info').get('follower_count')

            if follower < 10000:
                continue

            try:
                item_list = self.get_raw_data.get_item_list(
                    sec_user_id, room_id)
            except Exception as e:
                logger.error('get_item_list出错-' + str(e))
                return None

            # Truthiness check: a missing/None 'promotions' value is treated
            # like an empty list instead of failing on len(None).
            if item_list.get('promotions'):
                self.room_id_list.append(room_id)
                self.sec_user_id_list.append(sec_user_id)

    def run(self):
        """Run one crawl pass in a greenlet, log the counts, reset the lists."""
        tasks = [gevent.spawn(self.get_channel)]
        gevent.joinall(tasks)
        logger.info('本批次共获得room_id和sec_user_id-' +
                    str(len(self.sec_user_id_list)) + '-' +
                    str(len(self.room_id_list)))
        self.sec_user_id_list.clear()
        self.room_id_list.clear()
コード例 #4
0
class GetUsers():
    """Crawl the category listings and store every video author in Redis."""

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()

    def get_users(self, category_id, page):
        """Fetch one listing page and save the authors found on it.

        Args:
            category_id: category identifier passed through to the fetcher.
            page: page number passed through to the fetcher.

        Returns:
            None. Fetch errors are logged, not raised.
        """
        try:
            raw_data = self.get_raw_data.get_users(category_id, page)
        except Exception as e:
            # The original concatenated the e.args tuple to a str, which
            # always raised TypeError inside this handler. str() is applied
            # to every part so int arguments are logged safely too.
            logger.error('get_users错误-' + str(e) + '-category_id-' +
                         str(category_id) + '-page-' + str(page))
            return None
        if raw_data:
            self.save_to_redis(self.parse_users(raw_data))

    def parse_users(self, raw_data):
        """Return the ``sec_uid`` of each author in ``raw_data['aweme_list']``.

        A missing or null ``aweme_list`` yields an empty list.
        """
        return [
            aweme.get('author').get('sec_uid')
            for aweme in raw_data.get('aweme_list') or []
        ]

    def save_to_redis(self, sec_user_id_list):
        """Add every id in ``sec_user_id_list`` through the Redis client."""
        for sec_user_id in sec_user_id_list:
            self.redis_client.add_users(sec_user_id)

    def run(self):
        """Crawl pages 0-99 of categories -1..14, one category at a time."""
        for cate in range(-1, 15):
            logger.info('get_users当前爬取cate-' + str(cate))
            tasks = [
                gevent.spawn(self.get_users, str(cate), str(page))
                for page in range(100)
            ]
            gevent.joinall(tasks)
コード例 #5
0
ファイル: get_current_room.py プロジェクト: mrcdyddup/tiktok
class GetCurrentRoom():
    """Check which stored users are live right now and collect their rooms."""

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        # Room ids found by get_current_room().
        # NOTE(review): run() never calls save_rooms(); presumably the caller
        # does so afterwards -- confirm.
        self.room_id_list = []

    def get_users(self):
        """Return the users held by the Redis client."""
        return self.redis_client.get_users()

    def save_rooms(self):
        """Add every collected room id through the Redis client (second arg 0)."""
        for room_id in self.room_id_list:
            self.redis_client.add_rooms(room_id, 0)

    def get_current_room(self, sec_user_id):
        """Look up one user and record the room id when the user is live.

        Args:
            sec_user_id: user identifier (a str; it is concatenated into the
                log messages).

        Returns:
            None. Fetch and parse failures are logged, not raised.
        """
        try:
            raw_data = self.get_raw_data.get_current_room(sec_user_id)
        except Exception as e:
            # str(e) rather than e.args[0]: args can be empty or hold a
            # non-string, which made the handler itself raise.
            logger.error('get_current_room出错-' + str(e) + '-sec_user_id-' +
                         sec_user_id)
            return None

        try:
            data = raw_data.get('data')
            # Touch a nested field first so an unexpected response shape
            # fails here and is logged (presumably a sanity probe).
            data.get('pay_grade').get('grade_describe')
            own_room = data.get('own_room')
            # own_room is only present while the user is broadcasting.
            room_id = own_room.get('room_ids_str')[0] if own_room else None
        except Exception as e:
            logger.error('parse_current_room出错' + str(e) + '-sec_user_id-' +
                         sec_user_id)
            return None

        if own_room:
            self.room_id_list.append(room_id)
            logger.info(sec_user_id + '-正在直播,room_id-' + room_id)
        else:
            logger.info(sec_user_id + '-未在直播')

    def run(self):
        """Check every user, 20 greenlets at a time."""
        users = self.get_users()
        logger.info('共有users-' + str(len(users)))
        batch_size = 20
        for start in range(0, len(users), batch_size):
            stop = min(start + batch_size, len(users))
            logger.info('当前爬取用户序号-' + str(start + 1) + '-' + str(stop))
            tasks = [
                gevent.spawn(self.get_current_room, sec_user_id)
                for sec_user_id in users[start:stop]
            ]
            gevent.joinall(tasks)
コード例 #6
0
class GetUserDongtai():
    """Detect live users from the first entry of their 'dongtai' feed."""

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        # Room ids (as str) collected by parse_user_dongtai().
        self.room_id_list = []

    def get_users(self):
        """Return the users held by the Redis client."""
        return self.redis_client.get_users()

    def save_rooms(self):
        """Add every collected room id through the Redis client (second arg 0)."""
        for room_id in self.room_id_list:
            self.redis_client.add_rooms(room_id, 0)

    def get_user_dongtai(self, sec_user_id):
        """Fetch and parse the feed of one user.

        Args:
            sec_user_id: user identifier (a str; it is concatenated into the
                log messages).

        Returns:
            None. Fetch and parse failures are logged, not raised.
        """
        try:
            raw_data = self.get_raw_data.get_user_dongtai(sec_user_id)
        except Exception as e:
            # str(e) rather than e.args[0]: args can be empty or hold a
            # non-string, which made the handler itself raise.
            logger.error('get_user_dongtai出错-' + str(e) + '-sec_user_id-' +
                         sec_user_id)
            return None

        try:
            self.parse_user_dongtai(raw_data)
        except Exception as e:
            logger.error('parse_user_dongtai出错-' + str(e) +
                         '-sec_user_id-' + sec_user_id)

    def parse_user_dongtai(self, raw_data):
        """Record the author's room id when the first feed entry has one.

        A ``room_id`` of 0 is treated as "not live".
        """
        first_entry = raw_data.get('dongtai_list')[0]
        room_id = first_entry.get('aweme').get('author').get('room_id')
        if room_id == 0:
            logger.info('该主播尚未开始直播')
            return
        self.room_id_list.append(str(room_id))
        logger.info('该主播已开始直播,room_id-' + str(room_id))

    def run(self):
        """Check every user, 20 greenlets at a time."""
        users = self.get_users()
        logger.info('共有users-' + str(len(users)))
        # Original author's note: even 20 per batch fails to return data.
        batch_size = 20
        for start in range(0, len(users), batch_size):
            stop = min(start + batch_size, len(users))
            logger.info('当前爬取用户序号-' + str(start + 1) + '-' + str(stop))
            tasks = [
                gevent.spawn(self.get_user_dongtai, sec_user_id)
                for sec_user_id in users[start:stop]
            ]
            gevent.joinall(tasks)
コード例 #7
0
class GetSecUserIds():
    """Resolve stored 'feigua' aweme ids to the sec_uid of their authors."""

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()

    def get_aweme_lists(self):
        """Return the stored aweme ids split into chunks of at most 20."""
        chunk_size = 20
        awemes = self.redis_client.get_feigua_awemes()
        return [
            awemes[i:i + chunk_size]
            for i in range(0, len(awemes), chunk_size)
        ]

    def get_clips(self, aweme_list):
        """Look up one chunk of aweme ids and move resolved ones to pre-users.

        For every returned detail with a non-empty author ``sec_uid`` the
        aweme id is deleted from the feigua set and the ``sec_uid`` is added
        as a pre-user with value -1.

        Args:
            aweme_list: aweme ids; each is converted with ``int()``.

        Returns:
            None. Fetch errors are logged, not raised.
        """
        aweme_int_list = [int(aweme) for aweme in aweme_list]
        try:
            raw_data = self.get_raw_data.get_clips(aweme_int_list)
        except Exception as e:
            # str(e) rather than e.args[0]: args can be empty or hold a
            # non-string, which made the handler itself raise.
            logger.error('get_clips出错-' + str(e))
            return None

        if raw_data.get('status_code') == 2053:
            logger.info('这组没有视频')
            return None

        resolved = []
        # 'or []' keeps a response without details from failing the loop.
        for detail in raw_data.get('aweme_details') or []:
            sec_user_id = detail.get('author').get('sec_uid')
            if sec_user_id:
                resolved.append((detail.get('aweme_id'), sec_user_id))

        # Same order as before: all deletions first, then all additions.
        for aweme_id, _ in resolved:
            self.redis_client.delete_feigua_awemes(aweme_id)
        for _, sec_user_id in resolved:
            self.redis_client.add_pre_users(sec_user_id, -1)

    def run(self):
        """Process every chunk, one greenlet per batch."""
        aweme_lists = self.get_aweme_lists()
        logger.info('共有feigua_aweme组数:' + str(len(aweme_lists)))
        batch_size = 1
        for start in range(0, len(aweme_lists), batch_size):
            stop = min(start + batch_size, len(aweme_lists))
            logger.info('get_clips爬取当前feigua_aweme组序号-' + str(start + 1) + '-' + str(stop))
            tasks = [
                gevent.spawn(self.get_clips, aweme_list)
                for aweme_list in aweme_lists[start:stop]
            ]
            gevent.joinall(tasks)
コード例 #8
0
class CheckQualificationByPromotion():
    """Promote pre-users with more than 10 promotions to real users."""

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        self.batch_size = 50

    def get_pre_users(self):
        """Return up to ``batch_size`` pre-users that are not yet users.

        Collection stops early when the Redis client hands back a falsy
        value, so an exhausted pool no longer pads the batch with ``None``.
        NOTE(review): assumes the client signals "nothing left" with a falsy
        value -- confirm against RedisClient.get_pre_users.
        """
        batch = []
        while len(batch) < self.batch_size:
            pre_user = self.redis_client.get_pre_users()
            if not pre_user:
                break
            # Keep only candidates that are not in the user table yet.
            if not self.is_user(pre_user):
                batch.append(pre_user)
        return batch

    def count_pre_users(self):
        """Return the pre-user count reported by the Redis client."""
        return self.redis_client.count_pre_users()

    def is_user(self, sec_user_id):
        """Return the Redis client's answer to "is this id already a user?"."""
        return self.redis_client.is_user(sec_user_id)

    def check_qualification_by_promotion(self, sec_user_id):
        """Add ``sec_user_id`` as a user when it has more than 10 promotions.

        Returns:
            None. Fetch and parse failures are logged, not raised.
        """
        try:
            raw_data = self.get_raw_data.get_promotions(sec_user_id)
        except Exception as e:
            # str(e) rather than e.args[0]: args can be empty or hold a
            # non-string, which made the handler itself raise.
            logger.error('get_promotions出错-' + str(e) + '-sec_user_id-' +
                         sec_user_id)
            return None

        try:
            # Reading the first column's name proves a real page came back;
            # it raises (and is logged below) otherwise.
            raw_data.get('columns')[0].get('name')
            promotions = raw_data.get('promotions')
            if len(promotions) > 10:
                self.redis_client.add_users(sec_user_id)
        except Exception as e:
            logger.error('解析promotions页面失败-sec_user_id-' + sec_user_id + '-' +
                         str(e))

    def run(self):
        """Check one batch, or exit the process when no pre-users remain."""
        if self.count_pre_users() > 0:
            batch = self.get_pre_users()
            tasks = [
                gevent.spawn(self.check_qualification_by_promotion,
                             sec_user_id) for sec_user_id in batch
            ]
            gevent.joinall(tasks)
        else:
            logger.info('pre_users列表空了,程序退出')
            sys.exit()
コード例 #9
0
class CheckQualificationByPromotion():
    """Confirm whether pending pre-users actually sell goods."""

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()

    def get_users(self):
        """Return the pre-users selected by ``get_pre_users(-1, -1)``."""
        return self.redis_client.get_pre_users(-1, -1)

    def check_commercial(self, sec_user_id):
        """Delete the pre-user without promotions, otherwise re-add with 0.

        Returns:
            None. Fetch and parse failures are logged, not raised.
        """
        try:
            raw_data = self.get_raw_data.get_promotions(sec_user_id)
        except Exception as e:
            # str(e) rather than e.args[0]: args can be empty or hold a
            # non-string, which made the handler itself raise.
            logger.error('get_promotions出错-' + str(e) + '-sec_user_id-' +
                         sec_user_id)
            return None

        try:
            # Reading the first column's name proves a real page came back;
            # it raises (and is logged below) otherwise.
            raw_data.get('columns')[0].get('name')
            promotions = raw_data.get('promotions')
            # len() on purpose: a missing 'promotions' key must raise here
            # rather than be mistaken for "sells nothing" and get deleted.
            if len(promotions) == 0:
                logger.info('该用户不带货,将删除,sec_user_id-' + sec_user_id)
                self.redis_client.delete_pre_users(sec_user_id)
            else:
                # Promotions exist: the user sells goods, set the value to 0.
                self.redis_client.add_pre_users(sec_user_id, 0)
        except Exception as e:
            logger.error('解析promotions页面失败-sec_user_id-' + sec_user_id + '-' +
                         str(e))

    def run(self):
        """Check every pending pre-user, 50 greenlets at a time."""
        users = self.get_users()
        logger.info('共有待确认是否带货用户数量:' + str(len(users)))
        # Original author's note: even with async requests this is slow;
        # 200 per batch is as slow as running synchronously.
        batch_size = 50
        for start in range(0, len(users), batch_size):
            stop = min(start + batch_size, len(users))
            logger.info('check_commercial爬取当前用户序号-' + str(start + 1) + '-' +
                        str(stop))
            tasks = [
                gevent.spawn(self.check_commercial, sec_user_id)
                for sec_user_id in users[start:stop]
            ]
            gevent.joinall(tasks)
コード例 #10
0
	def __init__(self):
		"""Create the raw-data and Redis helper objects used by this class."""
		self.get_raw_data = GetRawData()
		self.redis_client = RedisClient()
コード例 #11
0
class GetPromotions():
    """Download the promotions (products) of every user into text files."""

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        # aweme ids of video promotions; flushed to Redis after each batch.
        self.aweme_id_list = []

    def get_users(self):
        """Return the users held by the Redis client."""
        return self.redis_client.get_users()

    def get_promotions(self, sec_user_id):
        """Fetch, parse and persist the promotions of one user.

        Returns:
            None. Fetch and parse failures are logged, not raised.
        """
        try:
            raw_data = self.get_raw_data.get_promotions(sec_user_id)
        except Exception as e:
            # str(e) rather than e.args[0]: args can be empty or hold a
            # non-string, which made the handler itself raise.
            logger.error('get_promotions出错-' + str(e) + '-sec_user_id-' +
                         sec_user_id)
            return None

        try:
            promotions = self.parse_promotions(raw_data, sec_user_id)
        except Exception as e:
            # The original concatenated the e.args tuple to a str, which
            # always raised TypeError inside this handler.
            logger.error('parse_promotions错误-' + str(e) + '-sec_user_id-' +
                         sec_user_id)
            return None

        # parse_promotions() returns None for users without promotions;
        # skip the write so no "null" line ends up in the data file.
        if promotions:
            self.write_to_file(json.dumps(promotions, ensure_ascii=False))

    def parse_promotions(self, raw_data, sec_user_id):
        """Convert ``raw_data['promotions']`` into a list of flat dicts.

        Prices are divided by 100. Video promotions also append their
        aweme id to ``self.aweme_id_list``.

        Returns:
            The list of promotion dicts, or None after deleting the user
            when the promotions list is exactly ``[]``.
        """
        data = raw_data.get('promotions')
        # Compare against [] on purpose: a missing key (None) must raise in
        # the loop below instead of deleting the user.
        if data == []:
            logger.info('该用户不带货,将删除,sec_user_id-' + sec_user_id)
            self.redis_client.delete_users(sec_user_id)
            return None

        promotions = []
        for each in data:
            promotion = {
                'sec_user_id': sec_user_id,
                'price': each.get('price') / 100,
                'cover_url': each.get('images')[0].get('url_list')[0],
                'title': each.get('title'),
                'product_id': each.get('product_id'),
                'product_url': each.get('detail_url'),
                'min_price': str(int(each.get('min_price')) / 100),
                'douyin_sales': each.get('sales'),
                'product_source': each.get('goods_source'),
                'create_time': str(int(time.time())),
            }

            if each.get('market_price'):
                promotion['market_price'] = each.get('market_price') / 100

            if each.get('last_aweme_id'):
                promotion['promotion_type'] = 'video'
                promotion['aweme_id'] = int(each.get('last_aweme_id'))
                self.aweme_id_list.append(promotion['aweme_id'])
            else:
                promotion['promotion_type'] = 'picture'

            taobao = each.get('taobao')
            if taobao and taobao.get('coupon'):
                coupon = taobao.get('coupon')
                promotion['coupon_amount'] = coupon.get('coupon_amount')
                promotion['price_after_coupon'] = promotion['price'] - float(
                    promotion['coupon_amount'])
                promotion['coupon_url'] = coupon.get('coupon_web_url')

            promotions.append(promotion)
        return promotions

    def write_to_file(self, promotions):
        """Append one line to a randomly chosen ``promotions_<0-99>.txt``.

        Args:
            promotions: the already-serialised text to write.
        """
        shard = str(random.choice(range(100)))
        with open(FILE_DIRECTORY + '/' + 'promotions' + '_' + shard +
                  '.txt',
                  'a',
                  encoding='utf-8') as file:
            file.write(promotions + '\n')

    def save_awemes(self):
        """Add every collected aweme id through the Redis client."""
        for aweme_id in self.aweme_id_list:
            self.redis_client.add_awemes(aweme_id)

    def run(self):
        """Process every user, 50 greenlets at a time."""
        users = self.get_users()
        logger.info('共有用户数量:' + str(len(users)))
        # Original author's note: even with async requests this is slow;
        # 200 per batch is as slow as running synchronously.
        batch_size = 50
        for start in range(0, len(users), batch_size):
            stop = min(start + batch_size, len(users))
            logger.info('get_promotions爬取当前用户序号-' + str(start + 1) + '-' +
                        str(stop))
            tasks = [
                gevent.spawn(self.get_promotions, sec_user_id)
                for sec_user_id in users[start:stop]
            ]
            gevent.joinall(tasks)

            self.save_awemes()
            self.aweme_id_list.clear()
コード例 #12
0
 def __init__(self):
     """Create the raw-data and Redis helpers and an empty result list."""
     self.get_raw_data = GetRawData()
     self.redis_client = RedisClient()
     # Starts empty; the code that fills it is not part of this fragment.
     self.lives_off_list = []
コード例 #13
0
ファイル: test.py プロジェクト: mrcdyddup/tiktok
class SaveLiveUsers():
    """One-off maintenance jobs for the crawled live-user data.

    Every ``run_*`` method is an independent job: loading dump files into
    MySQL, building the ``dy_sample`` table, exporting CSV, or
    post-processing a spreadsheet.
    """

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.db = pymysql.connect(host='localhost', port=3306, user='******', password='******', db='bxmind', charset='utf8mb4')
        self.cursor = self.db.cursor()
        self.mysql_client = MysqlClient()
        self.redis_client = RedisClient()
        self.stupid_key_words = STUPID_KEY_WORDS
        # a_list: rows read by select_users(); b_list: rows kept by get_cates().
        self.a_list = []
        self.b_list = []

    def into_mysql(self, data, table):
        """Insert ``data`` (column -> value) as one row of ``table``.

        Table and column names are interpolated into the statement, so they
        must come from trusted code; the values are bound as parameters.
        Errors are printed, not raised.
        """
        keys = ','.join(data.keys())
        values = ','.join(['%s'] * len(data))
        sql = 'insert into %s (%s) values (%s)' % (table, keys, values)
        try:
            self.cursor.execute(sql, tuple(data.values()))
            self.db.commit()
        except Exception as e:
            print(e.args)

    def is_qualified(self, nickname):
        """Return False when ``nickname`` contains any blacklisted word."""
        return not any(word in nickname for word in self.stupid_key_words)

    def run_a(self):
        """Load status-4 rows of ``lives_20200614.txt`` into MySQL.

        Rows whose nickname is blacklisted are removed from Redis instead.
        """
        table = 'dy_live_lives'
        with open('lives_20200614.txt', 'r', encoding='utf-8') as f:
            for line in f:
                data = json.loads(line)
                if data.get('status') != 4:
                    continue
                sec_user_id = data.get('sec_user_id')
                if self.is_qualified(data.get('nickname')):
                    self.into_mysql(data, table)
                else:
                    self.redis_client.delete_users(sec_user_id)
                    self.redis_client.delete_live_users(sec_user_id)

    def replicate_table(self):
        """Create ``dy_sample`` with the structure of ``dy_live_lives``."""
        sql = 'CREATE TABLE dy_sample LIKE dy_live_lives'
        self.cursor.execute(sql)

    def select_users(self):
        """Append the large accounts from ``dy_live_lives`` to ``a_list``."""
        columns = ('room_id', 'sec_user_id', 'nickname', 'short_id',
                   'total_viewer', 'like_count', 'follower_count',
                   'signature', 'city')
        sql = 'SELECT room_id, sec_user_id, nickname, short_id, total_viewer, like_count, follower_count, signature, city FROM dy_live_lives WHERE total_viewer > 50000 AND follower_count > 500000'
        self.cursor.execute(sql)
        row = self.cursor.fetchone()
        while row:
            # Selected columns arrive in the order listed in `columns`.
            self.a_list.append(dict(zip(columns, row)))
            row = self.cursor.fetchone()

    def get_cates(self, data):
        """Keep ``data`` in ``b_list`` when the shop has >= 3 food/tea items.

        Returns:
            None. Fetch errors are logged, not raised.
        """
        sec_user_id = data['sec_user_id']
        try:
            cates_raw_data = self.get_raw_data.get_cates(sec_user_id)
        except Exception as e:
            # str(e) rather than e.args[0]: args can be empty or hold a
            # non-string, which made the handler itself raise.
            logger.error('get_cates出错-' + str(e) + '-sec_user_id-' + sec_user_id)
            return None
        # 'or []' keeps a response without categories from failing the loop.
        for each in cates_raw_data.get('user_shop_categories') or []:
            cate = each['name']
            number = each['count']
            if cate in ['零食', '食品', '花茶', '果茶'] and number >= 3:
                self.b_list.append(data)
                break

    def run_b(self):
        """Copy selected users that pass get_cates() into ``dy_sample``."""
        self.select_users()
        logger.info('a_list共有数据-' + str(len(self.a_list)))

        batch_size = 100
        for start in range(0, len(self.a_list), batch_size):
            stop = min(start + batch_size, len(self.a_list))
            logger.info('当前爬取用户序号-' + str(start + 1) + '-' + str(stop))
            tasks = [gevent.spawn(self.get_cates, data) for data in self.a_list[start:stop]]
            gevent.joinall(tasks)

        logger.info('b_list共有数据-' + str(len(self.b_list)))
        for data in self.b_list:
            self.into_mysql(data, 'dy_sample')

    def run_c(self):
        """Copy every selected user into ``dy_sample`` without filtering."""
        self.select_users()
        for data in self.a_list:
            self.into_mysql(data, 'dy_sample')

    def select_rooms(self):
        """Return every ``room_id`` stored in ``dy_sample``."""
        room_list = []
        sql = 'SELECT room_id FROM dy_sample'
        self.cursor.execute(sql)
        row = self.cursor.fetchone()
        while row:
            room_list.append(row[0])
            row = self.cursor.fetchone()
        return room_list

    def get_txt(self):
        """Print the first line of ``lives_20200605.txt``."""
        with open('lives_20200605.txt', 'r', encoding='utf-8') as f:
            for line in f:
                print(line)
                break

    def write_to_file(self, item_list):
        """Append ``item_list`` (text) to today's ``item_lists_sample`` file."""
        today = time.strftime('%Y%m%d', time.localtime())
        with open(FILE_DIRECTORY + '/' + 'item_lists_sample' + '_' + today + '.txt', 'a', encoding='utf-8') as file:
            file.write(item_list + '\n')

    def run_d(self):
        """Request one hard-coded Taobao detail URL and print the response."""
        url = 'https://detailskip.taobao.com/service/getData/1/p1/item/detail/sib.htm?itemId=606547898363&sellerId=2206709156233&modules=dynStock,qrcode,viewer,price,duty,xmpPromotion,delivery,activity,fqg,zjys,couponActivity,soldQuantity,page,originalPrice,tradeContract&callback=onSibRequestSuccess'

        headers = {
            'Referer': 'https://item.taobao.com/item.htm?id=606547898363',
            'Sec-Fetch-Mode': 'no-cors',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'
        }
        response = requests.get(url, headers=headers, allow_redirects=False)
        print(response.text)

    def run_e(self):
        """Bulk-load status-4 rows of ``lives_20200609.txt`` via the async client.

        Rows are inserted in batches of 200. The rows left over after the
        last full batch are flushed at the end; previously they were
        silently dropped.
        """
        file = 'lives_20200609.txt'
        table = 'dy_live_lives'
        data_batch = []
        batch_size = 200

        loop = asyncio.get_event_loop()
        task = loop.create_task(self.mysql_client.connect_mysql(loop))
        loop.run_until_complete(task)

        def flush():
            # asyncio.wait() rejects bare coroutines on Python 3.11+, so
            # each insert is wrapped in a future first.
            if not data_batch:
                return
            tasks = [
                asyncio.ensure_future(
                    self.mysql_client.into_mysql(loop, row, table), loop=loop)
                for row in data_batch
            ]
            loop.run_until_complete(asyncio.wait(tasks))
            data_batch.clear()

        with open(file, 'r', encoding='utf-8') as f:
            for line in f:
                record = json.loads(line)
                if self.is_qualified(record.get('nickname')):
                    # 'status' is an int: comparing with the string '4'
                    # matches nothing (the original author's pitfall note).
                    if record.get('status') == 4:
                        record.pop('mobile', None)
                        data_batch.append(record)
                    if len(data_batch) >= batch_size:
                        flush()
                else:
                    self.redis_client.delete_users(record.get('sec_user_id'))
                    self.redis_client.delete_live_users(record.get('sec_user_id'))
                    print('删除user', record.get('nickname'))

        flush()

    def run_f(self):
        """Export qualified status-4 rows of ``lives_20200623.txt`` to CSV.

        The column set comes from the first input line without 'mobile'.
        No header row is written and the output file is opened for append.
        """
        file_a = 'lives_20200623.txt'
        file_b = '第四批_抖音主播_去重前.csv'
        with open(file_a, 'r', encoding='utf-8') as f:
            with open(file_b, 'a', encoding='utf-8-sig', newline='') as g:
                lines = f.readlines()
                if not lines:
                    # Nothing to export; avoids IndexError on lines[0].
                    return
                first_line = json.loads(lines[0])
                first_line.pop('mobile', None)
                writer = csv.DictWriter(g, fieldnames=list(first_line.keys()))
                for line in lines:
                    record = json.loads(line)
                    if self.is_qualified(record.get('nickname')):
                        if record.get('status') == 4:
                            record.pop('mobile', None)
                            writer.writerow(record)

    def run_g(self):
        """Extract an 11-digit number starting with 1 from column 15.

        The number is written to a new column after the last existing one
        and the workbook is saved under a new file name.
        """
        read_workbook = xlrd.open_workbook('C:/Users/百芯科技/scraping/douyin7/第四批_抖音主播_去重后.xlsx')
        write_workbook = copy(read_workbook)
        read_sheet = read_workbook.sheet_by_name('Sheet1')
        write_sheet = write_workbook.get_sheet(0)

        nrows = read_sheet.nrows
        ncolumns = read_sheet.ncols
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        mobile_pattern = re.compile(r'.*(1\d{10}).*', re.S)
        for row in range(1, nrows):
            text = read_sheet.row(row)[15].value
            if text:
                data = mobile_pattern.match(str(text))
                if data:
                    mobile = data.group(1)
                    write_sheet.write(row, ncolumns, mobile)
        write_workbook.save('C:/Users/百芯科技/scraping/douyin7/第四批_抖音主播_去重后_电话.xlsx')
コード例 #14
0
class GetUserProfile():
    """Download user profiles and keep those of qualifying sellers."""

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        self.stupid_key_words = STUPID_KEY_WORDS

    def get_users(self):
        """Return the users supplied by ``redis_client.test_b()``."""
        return self.redis_client.test_b()

    def get_user_profile(self, sec_user_id):
        """Fetch, parse and persist the profile of one user.

        Returns:
            None. Fetch and parse failures are logged, not raised.
        """
        try:
            raw_data = self.get_raw_data.get_user_profile(sec_user_id)
        except Exception as e:
            # str(e) rather than e.args[0]: args can be empty or hold a
            # non-string, which made the handler itself raise.
            logger.error('get_user_profile出错-' + str(e) + '-sec_user_id-' +
                         sec_user_id)
            return None

        try:
            user_profile = self.parse_user_profile(raw_data)
        except Exception as e:
            logger.error('parse_user_profile出错-' + str(e) +
                         '-sec_user_id-' + sec_user_id)
            return None

        if user_profile:
            self.write_to_file(json.dumps(user_profile, ensure_ascii=False))
            self.redis_client.test_c(sec_user_id)

    def is_qualified_user(self, user):
        """Return True when ``user`` passes every seller filter.

        A user is rejected when the nickname contains a blacklisted word,
        when it is a government/media VIP, when either verification field
        is anything other than ``''``, or when any of the three commerce
        flags is falsy.
        """
        nickname = user.get('nickname')
        if any(word in nickname for word in self.stupid_key_words):
            return False
        if user.get('is_gov_media_vip'):
            return False
        if user.get('enterprise_verify_reason') != '':
            return False
        if user.get('custom_verify') != '':
            return False
        required_flags = ('with_fusion_shop_entry', 'live_commerce',
                          'with_commerce_entry')
        return all(user.get(flag) for flag in required_flags)

    def parse_user_profile(self, raw_data):
        """Build the flat profile dict from ``raw_data['user']``.

        Returns:
            The profile dict for a qualifying user; otherwise None, after
            deleting the user through the Redis client.
        """
        data = raw_data.get('user')
        sec_uid = data.get('sec_uid')
        if not self.is_qualified_user(data):
            self.redis_client.delete_users(sec_uid)
            logger.info('删除user-sec_user_id-' + sec_uid)
            return None

        # Key order is kept as before because it is the order of the fields
        # in the JSON line written to disk.
        return {
            'sec_uid': sec_uid,
            'follower_count': data.get('follower_count'),
            'nickname': data.get('nickname'),
            'gender': data.get('gender'),
            'location': data.get('location'),
            'birthday': data.get('birthday'),
            'avatar_url': data.get('avatar_larger').get('url_list')[0],
            'school_name': data.get('school_name'),
            'signature': data.get('signature'),
            'uid': data.get('uid'),
            'short_id': data.get('short_id'),
            'unique_id': data.get('unique_id'),
            'star_atlas': data.get('commerce_user_info').get('star_atlas'),
            'aweme_count': data.get('aweme_count'),
            'dongtai_count': data.get('dongtai_count'),
            'following_count': data.get('following_count'),
            'favoriting_count': data.get('favoriting_count'),
            'total_favorited': data.get('total_favorited'),
            'live_commerce': data.get('live_commerce'),
            'create_time': str(int(time.time())),
        }

    def write_to_file(self, user_profile):
        """Append ``user_profile`` (text) to today's ``user_profiles`` file."""
        today = time.strftime('%Y%m%d', time.localtime())
        with open(FILE_DIRECTORY + '/' + 'user_profiles' + '_' + today +
                  '.txt',
                  'a',
                  encoding='utf-8') as file:
            file.write(user_profile + '\n')

    def run(self):
        """Process every user, one greenlet per batch."""
        users = self.get_users()
        logger.info('共有users-' + str(len(users)))
        # Original author's note: 50 per batch returns no data.
        batch_size = 1
        for start in range(0, len(users), batch_size):
            stop = min(start + batch_size, len(users))
            logger.info('当前爬取用户序号-' + str(start + 1) + '-' + str(stop))
            tasks = [
                gevent.spawn(self.get_user_profile, sec_user_id)
                for sec_user_id in users[start:stop]
            ]
            gevent.joinall(tasks)
コード例 #15
0
class CheckQualificationByRankList():
    """Use the rank-list endpoint to find users that are live right now."""

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        # Both lists are filled per batch and flushed/cleared in run().
        self.live_user_list = []
        self.room_id_list = []
        self.stupid_key_words = STUPID_KEY_WORDS

    def get_users(self):
        """Return the users to check.

        Per the original author's note the Redis client returns the 10000
        lowest-scored users on each call.
        """
        return self.redis_client.get_users()

    def is_live_user(self, sec_user_id):
        """Return the Redis client's answer to "is this a known live user?"."""
        return self.redis_client.is_live_user(sec_user_id)

    def save_rooms(self):
        """Add every collected room id through the Redis client."""
        for room_id in self.room_id_list:
            self.redis_client.add_rooms(room_id)

    def add_to_live_users(self):
        """Add every collected user as a live user with value 1."""
        for sec_user_id in self.live_user_list:
            self.redis_client.add_live_users(sec_user_id, 1)

    def increase_user_score(self, sec_user_id):
        """Bump the score of ``sec_user_id`` through the Redis client."""
        self.redis_client.increase_user_score(sec_user_id)

    def is_qualified_user(self, user):
        """Return False when the user's nickname contains a blacklisted word."""
        nickname = user.get('nickname')
        return not any(word in nickname for word in self.stupid_key_words)

    def get_rank_list(self, sec_user_id):
        """Check one user and record the room when the user is broadcasting.

        Known live users only get their score increased. Unqualified users
        are deleted.

        Returns:
            None. Fetch and parse failures are logged, not raised.
        """
        if self.is_live_user(sec_user_id):
            self.increase_user_score(sec_user_id)
            return None

        try:
            raw_data = self.get_raw_data.get_rank_list(sec_user_id)
        except Exception as e:
            # str(e) rather than e.args[0]: args can be empty or hold a
            # non-string, which made the handler itself raise.
            logger.error('get_rank_list出错-' + str(e) + '-sec_user_id-' + sec_user_id)
            return None

        try:
            user = raw_data.get('data').get('anchor_info').get('user')
        except Exception as e:
            logger.error('parse_current_room出错' + str(e) + '-sec_user_id-' + sec_user_id)
            return None

        if not self.is_qualified_user(user):
            self.redis_client.delete_users(sec_user_id)
            logger.info('删除user-sec_user_id-' + sec_user_id)
            return None

        own_room = user.get('own_room')
        if own_room:
            # own_room is only present while the user is broadcasting.
            room_id = own_room.get('room_ids_str')[0]
            self.live_user_list.append(sec_user_id)
            self.room_id_list.append(room_id)
        self.increase_user_score(sec_user_id)

    def run(self):
        """Check every user, 50 greenlets at a time, flushing after each batch."""
        users = self.get_users()

        # Original author's note: 80 per batch worked on this endpoint
        # (about 4000 users in total); larger batches were not tried.
        batch_size = 50
        for start in range(0, len(users), batch_size):
            stop = min(start + batch_size, len(users))
            logger.info('当前获取用户序号-' + str(start + 1) + '-' + str(stop))
            tasks = [
                gevent.spawn(self.get_rank_list, sec_user_id)
                for sec_user_id in users[start:stop]
            ]
            gevent.joinall(tasks)

            self.save_rooms()
            self.add_to_live_users()
            logger.info('新增room_id-' + str(len(self.room_id_list)))
            self.room_id_list.clear()
            self.live_user_list.clear()
コード例 #16
0
ファイル: get_item_lists.py プロジェクト: mrcdyddup/tiktok
class GetItemLists():
    """Fetch and store the product lists of live rooms queued in Redis.

    Queue entries are ``'<room_id>_<sec_user_id>'`` strings. For each entry the
    product ("promotion") list is fetched, written as one JSON line to a dated
    text file, and the entry is then removed from the queue.
    """

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        # Queue entries fully handled in the current batch.
        self.item_lists_saved_list = []
        # '<promotion_id>_<room_id>_<sec_user_id>' keys found in the current batch.
        self.item_list = []

    def get_room_sec_ids(self):
        """Return the queued ``'<room_id>_<sec_user_id>'`` entries from Redis."""
        return self.redis_client.get_item_lists()

    def delete_item_lists(self):
        """Remove every entry handled in the current batch from the Redis queue."""
        for each in self.item_lists_saved_list:
            self.redis_client.delete_item_lists(each)

    def save_items(self):
        """Push the product keys collected in the current batch to Redis."""
        self.redis_client.add_items(self.item_list)

    def get_item_lists(self, room_sec_ids):
        """Fetch, parse and persist the product list for one queue entry.

        Args:
            room_sec_ids: ``'<room_id>_<sec_user_id>'``; only the first
                underscore separates the two parts.

        Returns:
            Always ``None``. On success the entry is appended to
            ``self.item_lists_saved_list``; on any failure it is left in the
            queue and the error is logged.
        """
        room_id, separator, sec_user_id = room_sec_ids.partition('_')
        if not separator:
            # A key without '_' used to raise IndexError and kill the greenlet.
            logger.error('get_item_list出错-invalid key-room_sec_ids-' +
                         room_sec_ids)
            return None

        try:
            item_list_raw_data = self.get_raw_data.get_item_list(
                sec_user_id, room_id)
        except Exception as e:
            # str(e) instead of e.args[0]: args may be empty or not a string.
            logger.error('get_item_list出错-' + str(e) + '-room_sec_ids-' +
                         room_sec_ids)
            return None

        try:
            item_list = self.parse_item_lists(item_list_raw_data, room_id,
                                              sec_user_id)
        except Exception as e:
            logger.error(room_sec_ids + '-parse_item_list失败-' + str(e))
            return None

        if item_list:  # this live stream listed products
            # Write first, mark for deletion afterwards.
            self.write_to_file(json.dumps(item_list, ensure_ascii=False))
        self.item_lists_saved_list.append(room_sec_ids)
        return None

    def parse_item_lists(self, item_list_raw_data, room_id, sec_user_id):
        """Turn the raw payload into a list of flat product dicts.

        Prices are divided by 100. The product keys are added to
        ``self.item_list`` only after the whole payload parsed successfully, so
        a failure halfway through leaves no partial state behind.

        Args:
            item_list_raw_data: mapping with a ``'promotions'`` list.
            room_id: room the products belong to.
            sec_user_id: owner of the room.

        Returns:
            A list with one dict per product.

        Raises:
            TypeError / AttributeError: if the payload lacks expected fields
                (the caller logs this and keeps the entry queued).
        """
        item_list = []
        item_keys = []
        for item in item_list_raw_data.get('promotions'):
            item_info = {
                'room_id': room_id,
                'sec_user_id': sec_user_id,
                'title': item.get('title'),
                'short_title': item.get('short_title'),
                'product_id': item.get('product_id'),
                'promotion_id': item.get('promotion_id'),
                'price': item.get('price') / 100,
                'min_price': item.get('min_price') / 100,
                'item_source': item.get('platform_label'),
                'shop_id': item.get('shop_id'),
                'item_type': item.get('item_type'),
                'cover': item.get('cover'),
                'index': item.get('index'),
            }

            coupon_info = item.get('coupons')
            if coupon_info:
                item_info['coupon_tag'] = coupon_info[0].get('tag')
                item_info['coupon_url'] = coupon_info[0].get('coupon_url')

            item_list.append(item_info)
            item_keys.append(item_info['promotion_id'] + '_' + room_id + '_' +
                             sec_user_id)

        self.item_list.extend(item_keys)
        return item_list

    def write_to_file(self, item_list):
        """Append ``item_list`` (an already serialised JSON string) as one line
        to today's ``item_lists_YYYYMMDD.txt`` under ``FILE_DIRECTORY``."""
        today = time.strftime('%Y%m%d', time.localtime())
        with open(FILE_DIRECTORY + '/' + 'item_lists' + '_' + today + '.txt',
                  'a',
                  encoding='utf-8') as file:
            file.write(item_list + '\n')

    def run(self):
        """Process all queued entries in batches of 200 concurrent greenlets."""
        all_room_sec_ids = self.get_room_sec_ids()
        logger.info('此前已结束直播并需要获取商品信息的直播间数量:' + str(len(all_room_sec_ids)))
        batch_size = 200
        for start in range(0, len(all_room_sec_ids), batch_size):
            stop = min(start + batch_size, len(all_room_sec_ids))
            logger.info('待获取的商品所对应的直播间-' + str(start + 1) + '-' + str(stop))
            tasks = [
                gevent.spawn(self.get_item_lists, room_sec_ids)
                for room_sec_ids in all_room_sec_ids[start:stop]
            ]
            gevent.joinall(tasks)

            logger.info('新获取商品列表/未挂商品的直播间数量-' +
                        str(len(self.item_lists_saved_list)))
            logger.info('新获取商品的数量-' + str(len(self.item_list)))
            # Original note: this call may be what slows the run down.
            self.save_items()
            self.delete_item_lists()
            self.item_lists_saved_list.clear()
            self.item_list.clear()
コード例 #17
0
class CheckRooms():
    """Re-check rooms that Redis lists with status 0 and promote the ones that
    are now live and selling products."""

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        # Room ids found to be live (with products) in the current batch.
        self.lives_on_list = []

    def get_rooms(self):
        """Return the room ids stored in Redis for the range ``(0, 0)``."""
        return self.redis_client.get_rooms(0, 0)

    def change_room_status(self):
        """Store every room found live in this batch in Redis with value 1."""
        for each in self.lives_on_list:
            self.redis_client.add_rooms(each, 1)

    def check_room(self, room_id):
        """Inspect one room and update Redis / ``self.lives_on_list``.

        Rooms whose owner has fewer than 10000 followers are deleted together
        with the owner. Otherwise, if the room status is 2 and its product
        list is non-empty, the room id is appended to ``self.lives_on_list``.

        Args:
            room_id: id of the room to check (a string; used in log messages).

        Returns:
            Always ``None``; failures are logged.
        """
        try:
            room_raw_data = self.get_raw_data.get_live(room_id)
        except Exception as e:
            # str(e) instead of e.args[0]: args may be empty or not a string.
            logger.error('get_live出错-' + str(e) + '-room_id-' + room_id)
            return None

        try:
            data = room_raw_data.get('data')
            owner = data.get('owner')
            follower_count = owner.get('follow_info').get('follower_count')
            sec_user_id = owner.get('sec_uid')
            # Compared inside the try so a missing (None) count is logged
            # instead of raising TypeError outside any handler.
            too_few_followers = follower_count < 10000
        except Exception as e:
            logger.error('解析room_raw_data出错-' + str(e) + '-room_id-' +
                         room_id)
            return None

        if too_few_followers:
            self.redis_client.delete_users(sec_user_id)
            self.redis_client.delete_rooms(room_id)
            return None

        if data.get('status') != 2:
            return None

        try:  # check whether this live stream sells products
            item_list_raw_data = self.get_raw_data.get_item_list(
                sec_user_id, room_id)
        except Exception as e:
            logger.error('get_item_list出错' + str(e) +
                         '-sec_user_id和room_id-' + sec_user_id + '-' +
                         room_id)
            return None

        # Truthiness also covers a missing 'promotions' key (None), which
        # previously crashed on len(None).
        if item_list_raw_data.get('promotions'):
            self.lives_on_list.append(room_id)
        return None

    def run(self):
        """Check all candidate rooms in batches of 200 concurrent greenlets."""
        all_room_ids = self.get_rooms()
        logger.info('此前未在直播的直播间数量:' + str(len(all_room_ids)))
        batch_size = 200
        for start in range(0, len(all_room_ids), batch_size):
            stop = min(start + batch_size, len(all_room_ids))
            logger.info('待查看的此前未在直播的直播间-' + str(start + 1) + '-' + str(stop))
            tasks = [
                gevent.spawn(self.check_room, room_id)
                for room_id in all_room_ids[start:stop]
            ]
            gevent.joinall(tasks)

            logger.info('新发现开始的直播数量-' + str(len(self.lives_on_list)))
            self.change_room_status()
            self.lives_on_list.clear()
コード例 #18
0
ファイル: get_pre_users.py プロジェクト: mrcdyddup/tiktok
class GetPreUsers():
    """Expand the candidate-user pool by walking the followings of queued users."""

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        # Number of users taken from the Redis "following" queue per run.
        self.batch_size = 10
        # sec_uid values of qualified followings found in the current run.
        self.pre_user_list = []
        # Nickname substrings that disqualify a user (company, shop, factory,
        # clothing ... style words).
        self.stupid_key_words = [
            '公司', '店', '铺', '厂', '行', '鞋', '装', '市', '服', '饰', '商', '贸', '牌',
            '汇', '馆', '裤', '业', '专', '卖'
        ]

    def get_following(self):
        """Take ``self.batch_size`` entries from the Redis following queue."""
        return [
            self.redis_client.get_following() for _ in range(self.batch_size)
        ]

    def add_following_and_pre_users(self):
        """Store every collected user in both the following queue and the
        pre-user set (the latter with value -1)."""
        for pre_user in self.pre_user_list:
            self.redis_client.add_following(pre_user)
            self.redis_client.add_pre_users(pre_user, -1)

    def is_qualified_user(self, user):
        """Return ``True`` when ``user`` passes all filters.

        A user is rejected when flagged ``is_gov_media_vip``, when it has a
        non-empty ``enterprise_verify_reason`` or ``custom_verify``, or when
        its nickname contains one of ``self.stupid_key_words``. Missing (None)
        fields count as empty; a missing nickname matches no keyword.

        Args:
            user: mapping describing one followed account.
        """
        if user.get('is_gov_media_vip'):
            return False
        # Truthiness instead of "!= ''": an absent field (None) used to be
        # treated as "verified" and rejected the user.
        if user.get('enterprise_verify_reason'):
            return False
        if user.get('custom_verify'):
            return False
        nickname = user.get('nickname') or ''
        return not any(word in nickname for word in self.stupid_key_words)

    def get_pre_users(self, sec_user_id):
        """Fetch the followings of ``sec_user_id`` and collect the qualified ones.

        Returns:
            Always ``None``; qualified ``sec_uid`` values are appended to
            ``self.pre_user_list``.
        """
        try:
            raw_data = self.get_raw_data.get_following(sec_user_id)
        except Exception as e:
            # str(e) instead of e.args[0]: args may be empty or not a string.
            logger.error('get_pre_user出错-' + str(e) + '-sec_user_id-' +
                         sec_user_id)
            return None

        if raw_data.get('status_code') == 2096:
            logger.info('关注不可见-sec_user_id-' + sec_user_id)
            return None

        # "or []" so a payload without 'followings' does not raise TypeError.
        for user in raw_data.get('followings') or []:
            if self.is_qualified_user(user):
                self.pre_user_list.append(user.get('sec_uid'))
        return None

    def run(self):
        """Process one batch of queued users concurrently and persist the result."""
        batch = self.get_following()
        tasks = [
            gevent.spawn(self.get_pre_users, sec_user_id)
            for sec_user_id in batch
        ]
        gevent.joinall(tasks)
        logger.info('获取到pre_user-' + str(len(self.pre_user_list)))
        self.add_following_and_pre_users()
        self.pre_user_list.clear()
コード例 #19
0
ファイル: get_rooms.py プロジェクト: mrcdyddup/tiktok
 def __init__(self):
     """Create the data fetcher, the Redis client and two empty accumulators."""
     self.get_raw_data = GetRawData()
     self.redis_client = RedisClient()
     # Both lists start empty; presumably they collect the sec_uid / room id
     # values found while parsing - confirm against the rest of the class.
     self.sec_user_id_list = []
     self.room_id_list = []
コード例 #20
0
ファイル: get_pre_users.py プロジェクト: mrcdyddup/tiktok
class GetPreUsers():
    """Expand the candidate-user pool by walking the followings of queued
    users, while the Redis following queue and pre-user set stay below
    their size caps."""

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        # Number of users taken from the Redis "following" queue per run.
        self.batch_size = 10
        # sec_uid values of qualified followings found in the current run.
        self.pre_user_list = []
        # Nickname substrings that disqualify a user.
        self.stupid_key_words = STUPID_KEY_WORDS

    def count_following(self):
        """Return the size of the Redis following queue."""
        return self.redis_client.count_following()

    def count_pre_users(self):
        """Return the size of the Redis pre-user set."""
        return self.redis_client.count_pre_users()

    def get_following(self):
        """Take ``self.batch_size`` entries from the Redis following queue."""
        return [
            self.redis_client.get_following() for _ in range(self.batch_size)
        ]

    def add_following_and_pre_users(self):
        """Store the collected users, respecting both size caps.

        The counts are re-read for every user on purpose: each insertion may
        push a collection over its cap (100000 followings, 500000 pre-users).
        """
        for pre_user in self.pre_user_list:
            if self.count_following() < 100000:
                self.redis_client.add_following(pre_user)
            if self.count_pre_users() < 500000:
                self.redis_client.add_pre_users(pre_user)

    def is_qualified_user(self, user):
        """Return ``True`` when ``user`` passes all filters.

        Rejected are: ``is_gov_media_vip`` accounts, accounts with an
        ``enterprise_verify_reason``, accounts whose ``custom_verify`` text
        mentions none of '自媒体' / '主播' / '视频', and accounts whose nickname
        contains one of ``self.stupid_key_words``. A missing nickname matches
        no keyword.

        Args:
            user: mapping describing one followed account.
        """
        if user.get('is_gov_media_vip'):
            return False
        if user.get('enterprise_verify_reason'):
            return False

        custom_verify = user.get('custom_verify')
        allowed_tags = ('自媒体', '主播', '视频')
        if custom_verify and not any(tag in custom_verify
                                     for tag in allowed_tags):
            return False

        # "or ''" keeps a None nickname from raising TypeError in the check.
        nickname = user.get('nickname') or ''
        return not any(word in nickname for word in self.stupid_key_words)

    def get_pre_users(self, sec_user_id):
        """Fetch the followings of ``sec_user_id`` and collect the qualified ones.

        Returns:
            Always ``None``; qualified ``sec_uid`` values are appended to
            ``self.pre_user_list``.
        """
        try:
            raw_data = self.get_raw_data.get_following(sec_user_id)
        except Exception as e:
            # str(e) instead of e.args[0]: args may be empty or not a string.
            logger.error('get_pre_user出错-' + str(e) + '-sec_user_id-' +
                         sec_user_id)
            return None

        if raw_data.get('status_code') == 2096:
            logger.info('关注不可见-sec_user_id-' + sec_user_id)
            return None

        # "or []" so a payload without 'followings' is handled like an empty
        # list instead of raising TypeError on len(None).
        following_list = raw_data.get('followings') or []
        if not following_list:
            # Only logged; exiting the process here is deliberately disabled.
            logger.error('获取不到数据了,程序退出')
        for user in following_list:
            if self.is_qualified_user(user):
                self.pre_user_list.append(user.get('sec_uid'))
        return None

    def run(self):
        """Process one batch of queued users unless both caps are reached."""
        if self.count_following() < 100000 or self.count_pre_users() < 500000:
            batch = self.get_following()
            tasks = [
                gevent.spawn(self.get_pre_users, sec_user_id)
                for sec_user_id in batch
            ]
            gevent.joinall(tasks)
            logger.info('获取到pre_user-' + str(len(self.pre_user_list)))
            self.add_following_and_pre_users()
            self.pre_user_list.clear()
        else:
            logger.info('已经有太多following或pre_users了')
コード例 #21
0
ファイル: get_rank_list.py プロジェクト: bzvs1992/tiktok
class GetRankList():
    """Poll users that are not known to be live and record newly started rooms."""

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        # Users found to be live in the current batch.
        self.user_list = []
        # Room ids discovered in the current batch.
        self.room_id_list = []

    def get_users(self):
        """Return the users stored in Redis for the range ``(0, 0)``."""
        return self.redis_client.get_live_users(0, 0)

    def save_rooms(self):
        """Add every room id found in this batch to Redis."""
        for each in self.room_id_list:
            self.redis_client.add_rooms(each)

    def change_user_status(self):
        """Store every user found live in this batch in Redis with value 1."""
        for each in self.user_list:
            self.redis_client.add_live_users(each, 1)

    def get_rank_list(self, sec_user_id):
        """Check one user and record the room if a live stream has started.

        Args:
            sec_user_id: identifier of the user (a string; used in log
                messages).

        Returns:
            Always ``None``; hits are appended to ``self.user_list`` and
            ``self.room_id_list``.
        """
        try:
            raw_data = self.get_raw_data.get_rank_list(sec_user_id)
        except Exception as e:
            # str(e) instead of e.args[0]: args may be empty or not a string.
            logger.error('get_rank_list出错-' + str(e) + '-sec_user_id-' +
                         sec_user_id)
            return None

        try:
            own_room = raw_data.get('data').get('anchor_info').get('user').get(
                'own_room')
        except Exception as e:
            logger.error('parse_current_room出错' + str(e) + '-sec_user_id-' +
                         sec_user_id)
            return None

        if not own_room:  # no own_room entry: the user is not live
            return None

        # Guard against a missing/empty id list, which used to raise outside
        # any handler and kill the greenlet.
        room_ids = own_room.get('room_ids_str')
        if room_ids:
            self.user_list.append(sec_user_id)
            self.room_id_list.append(room_ids[0])
        return None

    def run(self):
        """Check all users in batches of 50 concurrent greenlets."""
        users = self.get_users()
        logger.info('共有未在直播的users-' + str(len(users)))

        # Original note: 80 per batch is known to work for this endpoint
        # (close to 4000 users in total); larger batches were not tried.
        batch_size = 50
        for start in range(0, len(users), batch_size):
            stop = min(start + batch_size, len(users))
            logger.info('当前获取用户序号-' + str(start + 1) + '-' + str(stop))
            tasks = [
                gevent.spawn(self.get_rank_list, sec_user_id)
                for sec_user_id in users[start:stop]
            ]
            gevent.joinall(tasks)

            self.save_rooms()
            self.change_user_status()
            logger.info('新增room_id-' + str(len(self.room_id_list)))
            self.room_id_list.clear()
            self.user_list.clear()
コード例 #22
0
ファイル: get_pre_users.py プロジェクト: mrcdyddup/tiktok
 def __init__(self):
     """Create the data fetcher, the Redis client and the per-run state."""
     self.get_raw_data = GetRawData()
     self.redis_client = RedisClient()
     # Presumably the number of queue entries handled per run - confirm
     # against the methods that read it.
     self.batch_size = 10
     self.pre_user_list = []
     # Module-level keyword list defined elsewhere in the file.
     self.stupid_key_words = STUPID_KEY_WORDS
コード例 #23
0
ファイル: get_clips_h5.py プロジェクト: mrcdyddup/tiktok
class GetClipsH5():
    """Fetch clip details for the aweme ids queued in Redis and write them to
    a dated text file."""

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()

    def get_aweme_lists(self):
        """Return the queued aweme ids split into groups of at most 20."""
        awemes = self.redis_client.get_awemes()
        return [awemes[i:i + 20] for i in range(0, len(awemes), 20)]

    def get_clips_h5(self, aweme_list):
        """Fetch, parse and persist one group of aweme ids.

        The ids are removed from Redis only after their clips were written.

        Args:
            aweme_list: aweme ids convertible with ``int()``.

        Returns:
            Always ``None``; failures are logged and leave the ids queued.
        """
        try:
            # The int() conversion lives inside the try so a malformed id is
            # logged like any other fetch failure instead of killing the
            # greenlet.
            aweme_int_list = [int(aweme) for aweme in aweme_list]
            raw_data = self.get_raw_data.get_clips_h5(aweme_int_list)
        except Exception as e:
            # str(e) instead of e.args[0]: args may be empty or not a string.
            logger.error('get_clips_h5出错-' + str(e))
            return None

        if raw_data.get('status_code') == 2053:
            logger.error('status_code 2053,一整批都不是视频')
            return None

        try:
            clips = self.parse_clips_h5(raw_data)
        except Exception as e:
            logger.error('parse_clips_h5出错-' + str(e))
            return None

        # Original note: the author no longer remembered why empty results are
        # special-cased; possibly some awemes in a group are not videos.
        if not clips:
            logger.error('该组clips数量为0')
            return None

        self.write_to_file(json.dumps(clips, ensure_ascii=False))
        for each in aweme_list:
            self.redis_client.delete_awemes(each)
        return None

    def parse_clips_h5(self, raw_data):
        """Turn the ``'item_list'`` of ``raw_data`` into flat clip dicts.

        Author fields are only present when the item carries an ``author``
        entry; otherwise an error is logged and the clip is kept without them.

        Raises:
            TypeError / AttributeError / IndexError: if expected fields are
                missing (the caller logs this).
        """
        clips = []
        for each in raw_data.get('item_list'):
            clip = {
                'aweme_share_url': each.get('share_url'),
                'user_id': each.get('author_user_id'),
                'aweme_duration': each.get('duration'),
                'aweme_time': each.get('create_time'),
                'aweme_id': each.get('aweme_id'),
            }

            video = each.get('video')
            clip['aweme_cover'] = video.get('cover').get('url_list')[0]
            clip['aweme_url'] = video.get('play_addr').get('url_list')[0]

            author = each.get('author')
            if author:
                clip['user_nickname'] = author.get('nickname')
                clip['user_avatar'] = author.get('avatar_larger').get(
                    'url_list')[0]
                clip['user_short_id'] = author.get('short_id')
                clip['user_signature'] = author.get('signature')
                clip['user_unique_id'] = author.get('unique_id')
            else:
                logger.error('未获取到author')

            statistics = each.get('statistics')
            clip['comment_count'] = statistics.get('comment_count')
            clip['like_count'] = statistics.get('digg_count')

            clips.append(clip)

        return clips

    def write_to_file(self, clips):
        """Append ``clips`` (an already serialised JSON string) as one line to
        today's ``clips_h5_YYYYMMDD.txt`` under ``FILE_DIRECTORY``."""
        today = time.strftime('%Y%m%d', time.localtime())
        with open(FILE_DIRECTORY + '/' + 'clips_h5' + '_' + today + '.txt',
                  'a',
                  encoding='utf-8') as file:
            file.write(clips + '\n')

    def run(self):
        """Process all aweme groups, 200 concurrent greenlets at a time."""
        aweme_lists = self.get_aweme_lists()
        logger.info('共有aweme组数:' + str(len(aweme_lists)))
        batch_size = 200
        for start in range(0, len(aweme_lists), batch_size):
            stop = min(start + batch_size, len(aweme_lists))
            logger.info('get_clips_h5爬取当前aweme组序号-' + str(start + 1) + '-' +
                        str(stop))
            tasks = [
                gevent.spawn(self.get_clips_h5, aweme_list)
                for aweme_list in aweme_lists[start:stop]
            ]
            gevent.joinall(tasks)
コード例 #24
0
 def __init__(self):
     """Create the data fetcher and the Redis client and bind the keyword list."""
     self.get_raw_data = GetRawData()
     self.redis_client = RedisClient()
     # Module-level keyword list defined elsewhere in the file.
     self.stupid_key_words = STUPID_KEY_WORDS
コード例 #25
0
class GetLives():
    """Snapshot every live room stored in Redis and retire the finished ones."""

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        # '<room_id>_<sec_user_id>' keys of streams found finished in this batch.
        self.lives_off_list = []

    def get_rooms(self):
        """Return the room ids stored in Redis."""
        return self.redis_client.get_rooms()

    def delete_rooms(self):
        """Retire every finished stream collected in this batch.

        For each ``'<room_id>_<sec_user_id>'`` key the user is stored with
        value 0, the room is deleted and the key is queued for product-list
        retrieval.
        """
        for each in self.lives_off_list:
            room_id, sec_user_id = each.split('_', 1)
            self.redis_client.add_live_users(sec_user_id, 0)
            self.redis_client.delete_rooms(room_id)
            self.redis_client.add_item_lists(each)

    def get_lives(self, room_id):
        """Fetch one room, append its snapshot to the daily file and note it
        as finished when its status is 4.

        Args:
            room_id: id of the room (a string; used in log messages).

        Returns:
            Always ``None``; failures are logged.
        """
        try:
            live_raw_data = self.get_raw_data.get_live(room_id)
        except Exception as e:
            # str(e) instead of e.args[0]: args may be empty or not a string.
            logger.error('get_live出错-' + str(e) + '-room_id-' + room_id)
            return None
        try:
            live_info = self.parse_lives(live_raw_data, room_id)
        except Exception as e:
            logger.error('parse_lives出错-' + str(e) + '-room_id-' + room_id)
            return None

        self.write_to_file(live_info)
        # Only mark the room for deletion after the snapshot was written.
        if live_info['status'] == 4:
            self.lives_off_list.append(live_info['room_id'] + '_' +
                                       live_info['sec_user_id'])
        return None

    def parse_lives(self, live_raw_data, room_id):
        """Flatten the ``'data'`` part of ``live_raw_data`` into one dict.

        ``create_time`` is the time of recording (current epoch seconds as a
        string), while ``start_time`` is the stream's own creation time.

        Raises:
            TypeError / AttributeError / IndexError: if expected fields are
                missing (the caller logs this).
        """
        data = live_raw_data.get('data')
        live_info = {}
        live_info['room_id'] = room_id
        live_info['start_time'] = data.get('create_time')  # stream start time
        live_info['like_count'] = data.get('like_count')
        live_info['share_url'] = data.get('share_url')
        live_info['title'] = data.get('title')
        live_info['status'] = data.get('status')  # 2 = live, 4 = finished
        live_info['viewer_count'] = data.get('user_count')  # current viewers
        live_info['cover_url'] = data.get('cover').get('url_list')[0]

        owner = data.get('owner')
        live_info['avatar_url'] = owner.get('avatar_large').get('url_list')[0]
        live_info['city'] = owner.get('city')
        live_info['follower_count'] = owner.get('follow_info').get(
            'follower_count')
        live_info['gender'] = owner.get('gender')
        live_info['short_id'] = owner.get('short_id')  # streamer's short id
        live_info['id'] = owner.get('id_str')  # streamer's long id
        live_info['nickname'] = owner.get('nickname')
        live_info['signature'] = owner.get('signature')
        live_info['sec_user_id'] = owner.get('sec_uid')
        live_info['ticket_count'] = owner.get('ticket_count')  # streamer's total tickets
        live_info['create_time'] = str(int(time.time()))  # time of recording

        stats = data.get('stats')
        live_info['fan_ticket'] = stats.get('fan_ticket')  # tickets earned this stream
        live_info['follow_count'] = stats.get('follow_count')  # follows gained this stream
        live_info['gift_count'] = stats.get('gift_uv_count')  # gifts received this stream
        live_info['total_viewer'] = stats.get('total_user')  # total viewers

        return live_info

    def write_to_file(self, live_info):
        """Append ``live_info`` as one JSON line to today's
        ``lives_YYYYMMDD.txt`` under ``FILE_DIRECTORY``."""
        today = time.strftime('%Y%m%d', time.localtime())
        with open(FILE_DIRECTORY + '/' + 'lives' + '_' + today + '.txt',
                  'a',
                  encoding='utf-8') as file:
            file.write(json.dumps(live_info, ensure_ascii=False) + '\n')

    def run(self):
        """Snapshot all rooms in batches of 150 concurrent greenlets."""
        all_room_ids = self.get_rooms()
        logger.info('此前正在直播的直播间数量:' + str(len(all_room_ids)))
        # Original note: 200 works locally but not on the cloud phone.
        batch_size = 150
        for start in range(0, len(all_room_ids), batch_size):
            stop = min(start + batch_size, len(all_room_ids))
            logger.info('待查看并获取的此前正在直播的直播间-' + str(start + 1) + '-' +
                        str(stop))
            tasks = [
                gevent.spawn(self.get_lives, room_id)
                for room_id in all_room_ids[start:stop]
            ]
            gevent.joinall(tasks)

            logger.info('新发现已结束的直播数量-' + str(len(self.lives_off_list)))
            self.delete_rooms()
            self.lives_off_list.clear()
コード例 #26
0
ファイル: get_item_lists.py プロジェクト: mrcdyddup/tiktok
 def __init__(self):
     """Create the data fetcher, the Redis client and two empty accumulators."""
     self.get_raw_data = GetRawData()
     self.redis_client = RedisClient()
     # Both lists start empty and are filled by other methods of the class
     # (not visible in this fragment).
     self.item_lists_saved_list = []
     self.item_list = []
コード例 #27
0
	def __init__(self):
		"""Create the data fetcher and the Redis client and define the keyword list."""
		self.get_raw_data = GetRawData()
		self.redis_client = RedisClient()
		# Keyword list (company/shop/clothing words, fashion, toys, school
		# subjects); presumably matched against nicknames to filter accounts -
		# confirm against the methods that read it.
		self.stupid_key_words = ['公司', '店', '铺', '厂', '行', '鞋', '装', '市', '服', '饰', '商', '贸', '牌', '汇', '馆', '裤', '业', '专', '卖', '时尚','穿', '搭', '品', '玩具', '语文', '数学', '英语', '科学', '物理', '化学', '生物', '政治', '历史']
コード例 #28
0
	def __init__(self):
		"""Create the data fetcher, the Redis client and the per-batch state."""
		self.get_raw_data = GetRawData()
		self.redis_client = RedisClient()
		# Both lists start empty and are filled by other methods of the class
		# (not visible in this fragment).
		self.live_user_list = []
		self.room_id_list = []
		# Module-level keyword list defined elsewhere in the file.
		self.stupid_key_words = STUPID_KEY_WORDS
コード例 #29
0
ファイル: get_items.py プロジェクト: mrcdyddup/tiktok
class GetItems():
	"""Fetch the details of the products queued in Redis and write them to a
	dated text file."""

	def __init__(self):
		self.get_raw_data = GetRawData()
		self.redis_client = RedisClient()
		# Queue keys whose product details were written in the current batch.
		self.saved_items_list = []

	def get_item_ids(self):
		"""Return the queued '<promotion_id>_<room_id>_<sec_user_id>' keys."""
		return self.redis_client.get_items()

	def delete_items(self):
		"""Remove every key handled in the current batch from Redis."""
		for each in self.saved_items_list:
			self.redis_client.delete_items(each)

	def get_items(self, item):
		"""Fetch, parse and persist the details of one queued product.

		Args:
			item: '<promotion_id>_<room_id>_<sec_user_id>'; only the first two
				underscores are separators.

		Returns:
			Always ``None``. On success ``item`` is appended to
			``self.saved_items_list``; failures are logged and leave it queued.
		"""
		ids = item.split('_', 2)
		if len(ids) != 3:
			# A malformed key used to raise IndexError and kill the greenlet.
			logger.error('get_item出错-item-' + item)
			return None
		promotion_id, room_id, sec_user_id = ids

		try:
			item_raw_data = self.get_raw_data.get_item(promotion_id)
		except Exception:
			logger.error('get_item出错-item-' + item)
			return None

		try:
			item_info = self.parse_items(item_raw_data, room_id, sec_user_id)
		except Exception as e:
			# str(e) instead of e.args[0]: args may be empty or not a string.
			logger.error('parse_items出错-' + str(e) + '-item-'+ item)
			return None

		self.write_to_file(item_info)
		self.saved_items_list.append(item)
		return None

	def parse_items(self, item_raw_data, room_id, sec_user_id):
		"""Flatten the first entry of the JSON-encoded ``'promotion'`` field.

		Prices are divided by 100; ``market_price`` is ``''`` when the source
		value is missing or falsy.

		Raises:
			TypeError / ValueError / IndexError / AttributeError: if the
				payload is not shaped as expected (the caller logs this).
		"""
		promotion = json.loads(item_raw_data.get('promotion'))[0]
		item_info = {
			'room_id': room_id,
			'sec_user_id': sec_user_id,
			'promotion_id': promotion.get('promotion_id'),
			'product_id': promotion.get('product_id'),
			'title': promotion.get('title'),
			'sales': promotion.get('sales'),
			'detail_url': promotion.get('detail_url'),
			'image_url': promotion.get('images')[0].get('url_list')[0],
			'price': promotion.get('price')/100,
		}

		market_price = promotion.get('market_price')
		item_info['market_price'] = market_price/100 if market_price else ''

		return item_info

	def write_to_file(self, item_info):
		"""Append ``item_info`` as one JSON line to today's
		``items_YYYYMMDD.txt`` under ``FILE_DIRECTORY``."""
		today = time.strftime('%Y%m%d', time.localtime())
		with open (FILE_DIRECTORY + '/' + 'items' + '_' + today + '.txt', 'a', encoding='utf-8') as file:
			file.write(json.dumps(item_info, ensure_ascii=False) + '\n')

	def run(self):
		"""Process all queued products in batches of 200 concurrent greenlets."""
		all_items = self.get_item_ids()
		logger.info('共有item数量-' + str(len(all_items)))
		batch_size = 200
		for start in range(0, len(all_items), batch_size):
			stop = min(start+batch_size, len(all_items))
			logger.info('当前获取的item-' + str(start+1) + '-' + str(stop))
			tasks = [gevent.spawn(self.get_items, item) for item in all_items[start:stop]]
			gevent.joinall(tasks)
			logger.info('获取到items-' + str(len(self.saved_items_list)))
			self.delete_items()
			self.saved_items_list.clear()