async def scrape(mode, username: str=None, count: int=0, likes: int=0, views: int=0, shares: int=0, comments: int=0):
    '''
    General scrape method: fetch a list of TikTok videos, filter them by
    their stats and download the survivors concurrently.

    mode      -- a Scrape member. Only Scrape.TRENDING and Scrape.USER are
                 implemented; any other mode prints a message and returns.
    username  -- account to scrape in USER mode. In TRENDING mode it is
                 replaced by a fixed folder name.
    count     -- number of videos to request. A negative value means 30 in
                 TRENDING mode and the user's full video count in USER mode.
    likes, views, shares, comments
              -- minimum values for the matching stats counter; 0 disables
                 that filter.

    Always returns None. Errors while fetching or downloading are printed
    rather than raised.
    '''
    tt = TikTok()
    try:
        if mode == Scrape.TRENDING:
            # trending videos are all stored under one fixed folder name
            username = '******'
            if count < 0:
                count = 30
            results = tt.getTrending(count)
        elif mode == Scrape.USER:
            if not username:
                print('Exception: a username is required to scrape a user')
                return None
            details = tt.getUserDetails(username)
            user_info = details['userInfo']
            _id = user_info['user']['id']
            if count < 0:
                # negative count: fetch everything the user has posted
                count = user_info['stats']['videoCount']
            results = tt.getUserTikToks(_id, count)
        else:
            # MUSIC and HASHTAG are declared but not implemented yet
            print(f'Unsupported scrape mode: {mode}')
            return None
    except Exception as e:
        print('Exception:', e)
        return None
    finally:
        # explicitly delete TikTok object as we don't need to make any more
        # API calls
        del tt

    #####################
    # Results Filtering #
    #####################
    thresholds = (
        ('diggCount', likes),
        ('playCount', views),
        ('shareCount', shares),
        ('commentCount', comments),
    )
    for stat, minimum in thresholds:
        if minimum:
            results = [
                item for item in results
                if item['stats'][stat] >= minimum
            ]

    # creates username folder if not present
    path = f'{DOWNLOADS_BASE_DIR}/{username}'
    if not os.path.exists(path):
        print(f'Creating directory {path}')
        os.makedirs(path, exist_ok=True)

    # process results in a producer-consumer async loop
    try:
        # Unbounded on purpose: every item is enqueued before the first
        # worker starts, so a bounded queue would block forever in put()
        # as soon as there are more results than its maxsize.
        queue = asyncio.Queue()

        # enqueue items
        for item in results:
            video_id = item['id']
            download_url = item['video']['downloadAddr']
            print('Adding to queue:', video_id)
            await queue.put((username, video_id, download_url))

        headers = {
            'User-Agent': random.choice(getAllowedAgents()),
            'method': 'GET',
            'accept-encoding': 'gzip, deflate, br',
            'referrer': 'https://www.tiktok.com/trending',
            'upgrade-insecure-requests': '1',
        }

        # create http session
        async with aiohttp.ClientSession(headers=headers) as session:
            # spawn worker tasks
            tasks = [
                asyncio.create_task(download_worker(worker, queue, session))
                for worker in range(MAX_CONCURRENT)
            ]
            try:
                # wait until the queue is consumed
                # (download_worker is defined elsewhere; it is expected to
                # call queue.task_done() for every item it takes)
                print(f'\nWaiting for tasks in queue[{queue.qsize()}] to be processed...\n')
                await queue.join()
            finally:
                # dismiss workers once queue is finished, also when the
                # wait above is interrupted, so no task outlives the session
                print('\nFinishing tasks...\n')
                for task in tasks:
                    task.cancel()
                # wait until all workers are dismissed
                await asyncio.gather(*tasks, return_exceptions=True)
    except Exception as e:
        print('Exception', e)
class TikTok:
    '''
    TikTok object with Selenium.

    Drives a headless Chrome instance to sign TikTok API URLs in the
    browser and to read the JSON replies of the signed requests.
    '''

    # Get Allow: / from robots.txt
    USER_AGENTS = getAllowedAgents()

    def __init__(self, path: str = None, proxify: bool = False):
        '''
        Start the Chrome webdriver.

        path    -- chromedriver executable to use; a default location is
                   used when None.
        proxify -- when True, route the browser through the first proxy
                   returned by fetch_proxies().
        '''
        # select random UserAgent from robots.txt
        self.UserAgent = random.choice(TikTok.USER_AGENTS)
        print(f'User-Agent: {self.UserAgent}')

        # show current ip
        my_ip = get_my_ip()
        print(f'IP Address: {my_ip}')

        # configure proxy
        if proxify:
            new_proxy = fetch_proxies()[0]
            proxy_host = new_proxy['ip']
            proxy_port = int(new_proxy['port'])
            proxy = f'{proxy_host}:{proxy_port}'
            print(f'Using proxy: {proxy}')
            # NOTE: this edits the shared capabilities dict, so it also
            # applies to every Chrome driver created afterwards
            webdriver.DesiredCapabilities.CHROME['proxy'] = {
                'httpProxy': proxy,
                'ftpProxy': proxy,
                'sslProxy': proxy,
                'proxyType': 'MANUAL',
            }

        # define chromedriver executable
        executable = '/usr/bin/chromedriver'
        if os.name == 'nt':
            executable += '.exe'

        # set default webdriver path
        # NOTE: `executable` is absolute, so on POSIX os.path.join() drops
        # the module directory and the default is simply `executable`
        self.driver_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            executable) if path is None else path

        # set chrome options
        self.chrome_options = Options()
        self.chrome_options.page_load_strategy = 'none'
        self.chrome_options.add_argument('--ignore-certificate-errors')
        self.chrome_options.add_argument(
            '--ignore-certificate-errors-spki-list')
        self.chrome_options.add_argument("--ignore-ssl-errors")
        self.chrome_options.add_argument('--headless')
        self.chrome_options.add_argument('--disable-gpu')
        self.chrome_options.add_argument('--incognito')
        self.chrome_options.add_argument('--log-level=3')
        self.chrome_options.add_argument(f'user-agent={self.UserAgent}')

        # start webdriver
        self.driver = webdriver.Chrome(self.driver_path,
                                       options=self.chrome_options)

        # modify HTTP request headers
        # NOTE(review): the standard HTTP header is spelled 'referer';
        # confirm whether 'referrer' is intentional before changing it
        self.driver.header_overrides = {
            'method': 'GET',
            'accept-encoding': 'gzip, deflate, br',
            'referrer': 'https://www.tiktok.com/trending',
            'upgrade-insecure-requests': '1',
        }

        # set tiktok default variables
        self.language = 'en'
        self.region = 'NY'
        self.type = 1
        self.secUid = 0
        self.verifyFp = None
        self.maxCount = 99
        self.minCursor = 0
        self.maxCursor = 0
        self.sourceType = 8  # 12 for trending

    def __del__(self):
        '''Quit the browser, if one was started.'''
        # __init__ can fail before self.driver exists; __del__ still runs
        # on the half-built object, so look the attribute up defensively
        driver = getattr(self, 'driver', None)
        if driver is not None:
            driver.quit()

    def _signURL(self, url):
        '''Sign URL using duD4 function defined in webpackJsonp'''
        sign_js_url = 'https://www.tiktok.com/trending'
        self.driver.get(sign_js_url)

        # save cookie information if not present
        if not self.verifyFp:
            # get_cookie() returns None while the cookie is not set yet;
            # in that case leave verifyFp unset and retry on the next call
            cookie = self.driver.get_cookie('s_v_web_id')
            if cookie:
                self.verifyFp = cookie['value']

        # execute JS in browser to sign url; the url is passed as a script
        # argument so quotes or backslashes in it cannot break the script
        script = 'return window.byted_acrawler.sign({ url: arguments[0] });'
        return self.driver.execute_script(script, url)

    def getUserDetails(self, username):
        '''
        Fetch the user detail JSON for `username`, remember the user's
        secUid on this object and return the parsed reply.
        '''
        url = (
            'https://m.tiktok.com/api/user/detail/'
            f'?uniqueId={username}&language={self.language}'
            f'&verifyFp={self.verifyFp or ""}'
        )
        signature = self._signURL(url)
        url = f'{url}&_signature={signature}'
        self.driver.get(url)

        details = json.loads(
            self.driver.find_element_by_tag_name('pre').text)
        self.secUid = details['userInfo']['user']['secUid']
        return details

    def getTrending(self, count: int = 50):
        '''get list of trending tiktok videos'''
        self.sourceType = 12
        self.type = 5
        return self.__getTikToks(_id=1, item_count=count)

    def getUserTikToks(self, userid, count: int = 0):
        '''get list of user tiktok videos'''
        self.sourceType = 8
        self.type = 1
        return self.__getTikToks(_id=userid, item_count=count)

    def __getTikToks(self, _id, item_count: int = 0):
        '''
        general get tiktok method

        Queries the item_list API in batches until `item_count` items were
        collected or the API reports no more items, and returns at most
        `item_count` items. Raises Exception when a reply cannot be parsed.
        '''
        self.minCursor = 0
        self.maxCursor = 0
        tiktoks = []

        # query api in batches
        while len(tiktoks) < item_count:
            # limit maximum number of items per request, on every batch
            count = min(item_count - len(tiktoks), self.maxCount)

            # prepare request url
            url = (
                'https://m.tiktok.com/api/item_list/'
                f'?count={count}&id={_id}&type={self.type}'
                f'&secUid={self.secUid}'
                f'&maxCursor={self.maxCursor}&minCursor={self.minCursor}'
                f'&sourceType={self.sourceType}&appId=1233'
                f'&region={self.region}&language={self.language}'
                f'&verifyFp={self.verifyFp or ""}'
            )

            # get signature for request url
            signature = self._signURL(url)

            # affix signature to request url
            url = f'{url}&_signature={signature}'

            # send request
            self.driver.get(url)

            # JSON reply sample
            # {
            #     "statusCode": 0,
            #     "items": [],
            #     "hasMore": true,
            #     "maxCursor": 1235,
            #     "minCursor": 1234
            # }

            # parse response
            try:
                reply = json.loads(
                    self.driver.find_element_by_tag_name('pre').text)
                items = reply['items']
                tiktoks.extend(items)

                # this is last batch, no more tiktoks to expect; an empty
                # batch also ends the loop so it cannot spin forever
                if not items or not reply['hasMore']:
                    break

                self.maxCursor = reply['maxCursor']
            except Exception as err:
                raise Exception(
                    'No items returned, possibly bad User-Agent. Please try again.'
                ) from err

        # a batch may hold more items than were still needed
        return tiktoks[:item_count]
def test_uas():
    '''
    getAllowedAgents() must yield exactly the known agent names
    (order and duplicates are ignored).
    '''
    # valid as of 2020/05/25
    expected_agents = {
        'Googlebot',
        'Applebot',
        'Bingbot',
        'DuckDuckBot',
        'Naverbot',
        'Twitterbot',
        'Yandex',
    }
    allowed_agents = set(getAllowedAgents())
    assert allowed_agents == expected_agents