コード例 #1
0
ファイル: run.py プロジェクト: rushabhsooni/TikTok
async def scrape(mode, username: str=None, count: int=0, likes: int=0, views: int=0, shares: int=0, comments: int=0):
    '''Fetch TikTok items for `mode`, filter them and download them concurrently.

    Args:
        mode: a Scrape member. Only Scrape.TRENDING and Scrape.USER are
            implemented; any other mode is rejected before a browser is started.
        username: account to scrape in USER mode. In TRENDING mode it is
            overwritten with the placeholder '******', which then names the
            download folder.
        count: number of items to request. A negative value selects the
            default: 30 for trending, the user's reported video count for USER.
        likes, views, shares, comments: minimum stat values an item must
            reach to be kept; 0 disables the corresponding filter.

    Returns:
        Always None. Failures while fetching or downloading are printed
        rather than raised (errors from tt.getUserTikToks and from the
        user-details lookup keys still propagate, as before).
    '''
    # Scrape.MUSIC / Scrape.HASHTAG are not implemented yet. Bail out before
    # starting a browser instead of falling through with `results` unbound.
    if mode not in (Scrape.TRENDING, Scrape.USER):
        print(f'Scrape mode {mode} is not supported yet')
        return None

    tt = TikTok()

    if mode == Scrape.TRENDING:
        # change videos to number of videos you want to return
        username = '******'
        if count < 0:
            count = 30
        try:
            results = tt.getTrending(count)
        except Exception as e:
            print('Exception:', e)
            return None

    else:  # Scrape.USER
        try:
            details = tt.getUserDetails(username)
        except Exception as e:
            print('Exception:', e)
            return None

        userInfo = details['userInfo']
        _id = userInfo['user']['id']
        videos = userInfo['stats']['videoCount']

        if count < 0:
            count = videos

        results = tt.getUserTikToks(_id, count)

    #####################
    # Results Filtering #
    #####################

    # each pair is (key inside item['stats'], minimum required value)
    thresholds = (
        ('diggCount', likes),
        ('playCount', views),
        ('shareCount', shares),
        ('commentCount', comments),
    )
    for stat, minimum in thresholds:
        if minimum:
            results = [item for item in results if item['stats'][stat] >= minimum]

    # creates username folder if not present
    path = f'{DOWNLOADS_BASE_DIR}/{username}'
    if not os.path.exists(path):
        print(f'Creating directory {path}')
        os.makedirs(path)

    # explicitly delete TikTok object as we don't need to make any more API calls
    del tt

    # process results in a producer-consumer async loop
    try:
        # Unbounded on purpose: every item is enqueued before any worker
        # exists, so a bounded queue would block forever once it filled up.
        queue = asyncio.Queue()

        # enqueue items
        for item in results:
            video_id = item['id']
            download_url = item['video']['downloadAddr']
            print('Adding to queue:', video_id)
            queue.put_nowait((username, video_id, download_url))

        headers = {
            'User-Agent': random.choice(getAllowedAgents()),
            'method': 'GET',
            'accept-encoding': 'gzip, deflate, br',
            'referrer': 'https://www.tiktok.com/trending',
            'upgrade-insecure-requests': '1',
        }

        tasks = []
        try:
            # create http session
            async with aiohttp.ClientSession(headers=headers) as session:
                # spawn worker tasks
                for worker in range(MAX_CONCURRENT):
                    tasks.append(
                        asyncio.create_task(download_worker(worker, queue, session)))

                # wait until the queue is consumed
                print(f'\nWaiting for tasks in queue[{queue.qsize()}] to be processed...\n')
                await queue.join()
        finally:
            # dismiss workers whether or not the queue drained cleanly, so no
            # task is left pending if something above raised
            print('\nFinishing tasks...\n')
            for task in tasks:
                task.cancel()

            # wait until all workers are dismissed
            await asyncio.gather(*tasks, return_exceptions=True)

    except Exception as e:
        print('Exception', e)
コード例 #2
0
class TikTok:
    '''TikTok web API client that drives a headless Chrome through Selenium.

    The browser is used both to sign request URLs (via the page's own
    `window.byted_acrawler.sign`) and to load the signed API URLs, whose JSON
    body is read back from the rendered `<pre>` element.
    '''

    # Get Allow: / from robots.txt
    USER_AGENTS = getAllowedAgents()

    def __init__(self, path: str = None, proxify: bool = False):
        '''Start the Chrome webdriver and set the default request parameters.

        Args:
            path: chromedriver executable to use; when None a default
                location is computed (see below).
            proxify: when True, route the browser through the first proxy
                returned by fetch_proxies().
        '''
        # select random UserAgent from robots.txt
        self.UserAgent = random.choice(TikTok.USER_AGENTS)

        # self.UserAgent = 'Twitterbot'
        print(f'User-Agent: {self.UserAgent}')

        # show current ip
        my_ip = get_my_ip()
        print(f'IP Address: {my_ip}')

        # configure proxy
        if proxify:
            new_proxy = fetch_proxies()[0]
            proxy_host = new_proxy['ip']
            proxy_port = int(new_proxy['port'])
            proxy = f'{proxy_host}:{proxy_port}'
            print(f'Using proxy: {proxy}')

            # NOTE: this writes into the shared DesiredCapabilities.CHROME
            # dict, so it is not limited to this instance.
            webdriver.DesiredCapabilities.CHROME['proxy'] = {
                'httpProxy': proxy,
                'ftpProxy': proxy,
                'sslProxy': proxy,
                'proxyType': 'MANUAL',
            }

        # define chromedriver executable
        executable = '/usr/bin/chromedriver'
        if os.name == 'nt':
            executable += '.exe'

        # set default webdriver path
        # NOTE: `executable` is absolute, so on POSIX os.path.join discards the
        # module directory and the default is simply '/usr/bin/chromedriver'.
        self.driver_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            executable) if path is None else path

        # set chrome options
        self.chrome_options = Options()
        self.chrome_options.page_load_strategy = 'none'
        self.chrome_options.add_argument('--ignore-certificate-errors')
        self.chrome_options.add_argument(
            '--ignore-certificate-errors-spki-list')
        self.chrome_options.add_argument("--ignore-ssl-errors")
        self.chrome_options.add_argument('--headless')
        self.chrome_options.add_argument('--disable-gpu')
        self.chrome_options.add_argument('--incognito')
        self.chrome_options.add_argument('--log-level=3')
        self.chrome_options.add_argument(f'user-agent={self.UserAgent}')

        # start webdriver
        self.driver = webdriver.Chrome(self.driver_path,
                                       options=self.chrome_options)

        # modify HTTP request headers
        # NOTE(review): `header_overrides` is presumably the selenium-wire
        # hook; confirm which `webdriver` this file imports.
        self.driver.header_overrides = {
            'method': 'GET',
            'accept-encoding': 'gzip, deflate, br',
            'referrer': 'https://www.tiktok.com/trending',
            'upgrade-insecure-requests': '1',
        }

        # set tiktok default variables
        self.language = 'en'
        self.region = 'NY'
        self.type = 1
        self.secUid = 0
        self.verifyFp = None
        self.maxCount = 99  # upper bound for the `count` sent per request
        self.minCursor = 0
        self.maxCursor = 0
        self.sourceType = 8  # 12 for trending

    def __del__(self):
        '''Quit the browser when the object is collected.'''
        # __init__ can fail before `self.driver` exists (e.g. no user agents,
        # chromedriver missing); __del__ still runs on that half-built object.
        driver = getattr(self, 'driver', None)
        if driver is not None:
            driver.quit()

    def _signURL(self, url):
        '''Sign URL using duD4 function defined in webpackJsonp

        Loads the trending page, remembers the `s_v_web_id` cookie in
        `self.verifyFp` the first time it is available, and returns whatever
        `window.byted_acrawler.sign` produces for `url`.
        '''
        sign_js_url = 'https://www.tiktok.com/trending'
        self.driver.get(sign_js_url)

        # save cookie information if not present
        if not self.verifyFp:
            # get_cookie returns None when the cookie is not set; in that case
            # keep verifyFp unset (URL builders send an empty value) and retry
            # on the next call.
            cookie = self.driver.get_cookie('s_v_web_id')
            if cookie:
                self.verifyFp = cookie['value']

        # execute JS in browser sign url; the URL is handed over as a script
        # argument so quotes/backslashes in it cannot break the JS source
        script = 'return window.byted_acrawler.sign({ url: arguments[0] });'
        signature = self.driver.execute_script(script, url)

        return signature

    def getUserDetails(self, username):
        '''Fetch the user-detail JSON for `username` and remember its secUid.

        Returns the decoded reply; `self.secUid` is set from
        reply['userInfo']['user']['secUid'] for later item-list requests.
        '''
        url = f'https://m.tiktok.com/api/user/detail/?uniqueId={username}&language={self.language}&verifyFp={self.verifyFp if self.verifyFp else ""}'

        signature = self._signURL(url)
        url = f'{url}&_signature={signature}'

        self.driver.get(url)

        # NOTE(review): the page source itself is unused; the access is kept
        # because page_load_strategy is 'none' and this extra round-trip may
        # be what gives the page time to render — confirm before removing.
        _ = self.driver.page_source
        details = json.loads(self.driver.find_element_by_tag_name('pre').text)
        secUid = details['userInfo']['user']['secUid']
        self.secUid = secUid
        return details

    def getTrending(self, count: int = 50):
        '''get list of trending tiktok videos'''
        self.sourceType = 12
        self.type = 5
        return self.__getTikToks(_id=1, item_count=count)

    def getUserTikToks(self, userid, count: int = 0):
        '''get list of user tiktok videos'''
        self.sourceType = 8
        self.type = 1
        return self.__getTikToks(_id=userid, item_count=count)

    def __getTikToks(self, _id, item_count: int = 0):
        '''general get tiktok method

        Queries the item_list endpoint in batches until at least
        `item_count` items were collected or the reply reports no more
        items, and returns the collected items as a list.

        Raises:
            Exception: when a reply cannot be parsed or lacks the expected
                keys; the underlying error is attached as the cause.
        '''
        self.minCursor = 0
        self.maxCursor = 0

        tiktoks = []

        # limit maximum number of items per request
        count = min(item_count, self.maxCount)

        # query api in batches
        while len(tiktoks) < item_count:

            # prepare request url
            url = f'https://m.tiktok.com/api/item_list/?count={count}&id={_id}&type={self.type}&secUid={self.secUid}&maxCursor={self.maxCursor}&minCursor={self.minCursor}&sourceType={self.sourceType}&appId=1233&region={self.region}&language={self.language}&verifyFp={self.verifyFp if self.verifyFp else ""}'

            # get signature for request url
            signature = self._signURL(url)

            # affix signature to request url
            url = f'{url}&_signature={signature}'

            # send request
            self.driver.get(url)

            # JSON reply sample
            # {
            #     "statusCode": 0,
            #     "items": [],
            #     "hasMore": true,
            #     "maxCursor": 1235,
            #     "minCursor": 1234
            # }

            # parse response
            try:
                reply = json.loads(
                    self.driver.find_element_by_tag_name('pre').text)
                items = reply['items']
                tiktoks.extend(items)

                # this is last batch, no more tiktoks to expect
                if not reply['hasMore']:
                    break

                # ask only for what is still missing, but keep every batch
                # (not just the first one) within the per-request cap
                count = min(item_count - len(tiktoks), self.maxCount)
                self.maxCursor = reply['maxCursor']

            except Exception as err:
                # `Exception` rather than a bare except, so KeyboardInterrupt
                # and SystemExit are not converted; the cause is chained
                raise Exception(
                    'No items returned, possibly bad User-Agent. Please try again.'
                ) from err

        return tiktoks
コード例 #3
0
def test_uas():
    """Assert that getAllowedAgents() yields exactly the expected agent names.

    The comparison is done on sets, so order and duplicates are ignored.
    """
    # valid as of 2020/05/25
    expected_agents = {
        'Googlebot',
        'Applebot',
        'Bingbot',
        'DuckDuckBot',
        'Naverbot',
        'Twitterbot',
        'Yandex',
    }
    reported_agents = set(getAllowedAgents())
    assert reported_agents == expected_agents