Example #1
0
    def __get_yt_json(
        self,
        url: str,
        path: Optional[Union[List[Union[str, int]], Union[str, int]]] = None
    ) -> Optional[dict]:
        """Fetch ``url`` and return the ``ytInitialData`` JSON embedded in it.

        The text between ``var ytInitialData = {`` and ``</script>`` is taken
        from the response body, the trailing whitespace/semicolon is removed
        and the result is decoded as JSON.

        Args:
            url: Page to request through ``self._get``.
            path: Optional key/index, or list of keys/indices, followed one
                after another into the decoded JSON. A falsy value means
                "return the whole document".

        Returns:
            The decoded (sub-)document, or ``None`` if anything fails along
            the way (request, extraction, decoding or path lookup). The error
            is printed only when ``self.debug`` is truthy.
        """
        try:
            page_text = self._get(url).text
            json_tail = strings.between(
                page_text,
                'var ytInitialData = {',
                '</script>'
            )
            # The opening brace is part of the start marker, so put it back.
            node = json.loads('{' + json_tail.rstrip().rstrip(';'))

            if path:
                # Accept a single component as shorthand for a one-item path.
                components = [path] if isinstance(path, (str, int)) else path

                for component in components:
                    node = node[component]

            return node
        except Exception as e:
            if self.debug:
                print('ERROR - YoutubeScraper - __get_yt_json({})'.format(url), e)

            return None


# ---------------------------------------------------------------------------------------------------------------------------------------- #
Example #2
0
    def get_channel_video_ids(
        self,
        channel_id: Optional[str] = None,
        ignored_titles: Optional[List[str]] = None
    ) -> List[str]:
        """Collect the video ids linked from a channel's videos page.

        The page is scrolled repeatedly until the length of the page source
        stops changing, then every ``a#video-title`` grid link is inspected.

        Args:
            channel_id: Channel to inspect; falls back to
                ``self.current_user_id`` when falsy.
            ignored_titles: Titles to skip. Comparison ignores case and
                surrounding whitespace.

        Returns:
            Unique video ids in page order. On any error the exception is
            passed to ``self.print`` and whatever was collected so far is
            returned.
        """
        video_ids = []
        ignored_titles = ignored_titles or []
        channel_id = channel_id or self.current_user_id

        try:
            self.get(self.__channel_videos_url(channel_id))
            last_page_source = self.browser.driver.page_source
            page_grew = True

            while page_grew:
                self.browser.scroll(1500)
                page_grew = False

                # Give the page up to 100 polls, 0.1 s apart, to load more
                # content after the scroll before concluding it is complete.
                for _ in range(100):
                    time.sleep(0.1)

                    if len(last_page_source) != len(self.browser.driver.page_source):
                        last_page_source = self.browser.driver.page_source
                        page_grew = True

                        break

            soup = bs(self.browser.driver.page_source, 'lxml')
            anchors = soup.find_all('a', {'id':'video-title', 'class':'yt-simple-endpoint style-scope ytd-grid-video-renderer'})

            for anchor in anchors:
                if 'title' in anchor.attrs:
                    title = anchor['title'].strip().lower()

                    if any(ignored.strip().lower() == title for ignored in ignored_titles):
                        continue

                if 'href' not in anchor.attrs or '/watch?v=' not in anchor['href']:
                    continue

                vid_id = strings.between(anchor['href'], '?v=', '&')

                if vid_id is not None and vid_id not in video_ids:
                    video_ids.append(vid_id)
        except Exception as e:
            self.print(e)

        return video_ids
Example #3
0
    def get_current_channel_id(self, _click_avatar: bool = False, _get_home_url: bool = False) -> Optional[str]:
        """Return the channel id of the logged-in account, if it can be found.

        Looks through the compact-link anchors on the current page for the
        first ``href`` containing ``channel/`` and returns the text between
        ``channel/`` and ``?``. When nothing is found the method retries
        itself with progressively more intrusive steps, in this order:
        as-is, after clicking the avatar button, after loading ``YT_URL``,
        and finally after loading ``YT_URL`` and clicking the avatar button.

        Args:
            _click_avatar: Internal retry flag; click the avatar button before
                searching.
            _get_home_url: Internal retry flag; load ``YT_URL`` before
                searching.

        Returns:
            The extracted channel id, or ``None`` when not logged in or when
            every attempt failed.
        """
        if not self.is_logged_in:
            # The message used to name 'upload' (copied from another method),
            # which pointed whoever read the log at the wrong call.
            print('Error - \'get_current_channel_id\': Isn\'t logged in')

            return None

        if _get_home_url:
            self.get(YT_URL)

        try:
            if _click_avatar:
                avatar_button = self.browser.find_by('button', id_='avatar-btn', timeout=0.5)

                if avatar_button:
                    avatar_button.click()

            href_containers = self.browser.find_all_by('a', class_='yt-simple-endpoint style-scope ytd-compact-link-renderer', timeout=0.5)

            for href_container in href_containers or []:
                href = href_container.get_attribute('href')

                if href and 'channel/' in href:
                    return strings.between(href, 'channel/', '?')
        except Exception as e:
            self.print(e)

        # Escalate: first try again with the avatar menu opened, then from the
        # home page (which itself escalates to "home page + avatar menu").
        if not _click_avatar:
            return self.get_current_channel_id(_click_avatar=True, _get_home_url=_get_home_url)

        if not _get_home_url:
            return self.get_current_channel_id(_click_avatar=False, _get_home_url=True)

        return None
Example #4
0
    def get_current_channel_id(self) -> Optional[str]:
        """Read a value out of the ``ytInitialGuideData`` blob on ``YT_URL``.

        Loads ``YT_URL`` in the browser, decodes the JSON assigned to
        ``var ytInitialGuideData`` and returns
        ``responseContext.serviceTrackingParams[2].params[0].value``.

        NOTE(review): presumably that value is the current channel id; the
        fixed indices are taken on trust and not validated here.

        Returns:
            The extracted value, or ``None`` (after printing the error) when
            extraction, decoding or lookup fails.
        """
        self.browser.get(YT_URL)

        try:
            page_source = self.browser.driver.page_source
            # The '};' end marker consumes the closing brace, so restore it.
            guide_json = strings.between(
                page_source, 'var ytInitialGuideData = ', '};') + '}'
            guide_data = json.loads(guide_json)
            tracking_params = guide_data['responseContext']['serviceTrackingParams']

            return tracking_params[2]['params'][0]['value']
        except Exception as e:
            print('get_current_channel_id', e)

            return None
Example #5
0
    def parse_products_page(self, response):
        """Extract ASINs from a product listing page.

        Every ``<span class="a-declarative">`` is inspected and the text
        between ``asin=`` and ``&`` in its ``data-a-popover`` attribute is
        collected.

        Args:
            response: Object exposing the page bytes as ``content``.

        Returns:
            List of extracted ASIN strings in document order (duplicates are
            kept). Spans without a usable ``data-a-popover`` are skipped.
        """
        asin_ids = []
        soup = BeautifulSoup(response.content, 'lxml')

        for elem in soup.find_all('span', {'class': 'a-declarative'}):
            try:
                asin_id = strings.between(elem['data-a-popover'], 'asin=', '&')
            except Exception:
                # Best effort: a span without the attribute (KeyError) or with
                # unexpected content is skipped. Unlike a bare ``except`` this
                # still lets KeyboardInterrupt/SystemExit propagate.
                continue

            if asin_id is not None:
                asin_ids.append(asin_id)

        return asin_ids
Example #6
0
    def __init__(self, preview_dict: Dict):
        """Build the image description from a preview dictionary.

        Reads ``preview_dict['images'][0]['source']`` and stores a rebuilt
        ``https://i.redd.it/`` URL (query string dropped) together with the
        image ``width`` and ``height``.

        Args:
            preview_dict: Dictionary with an ``images`` list whose first item
                has a ``source`` mapping containing ``url``, ``width`` and
                ``height``.

        If any of these is missing or malformed, ``url``, ``width`` and
        ``height`` are all left as ``None``.
        """
        # Defaults first, so the instance is fully initialised on every path.
        self.url = None
        self.width = None
        self.height = None

        try:
            img_dict = preview_dict['images'][0]['source']
            url = 'https://i.redd.it/' + strings.between(
                img_dict['url'], 'redd.it/', '?')
            width = img_dict['width']
            height = img_dict['height']
        except Exception:
            # Malformed input is expected here; unlike a bare ``except`` this
            # does not swallow KeyboardInterrupt/SystemExit.
            return

        # All-or-nothing: only commit once every field was read successfully.
        self.url = url
        self.width = width
        self.height = height


# ---------------------------------------------------------------------------------------------------------------------------------------- #
Example #7
0
    def watch_video(
        self,
        video_id: str,
        percent_to_watch: float = -1,  # 0-100 # -1 means all
        like: bool = False
    ) -> Tuple[bool, bool]:  # watched, liked
        """Open a video, wait out any ad, stay on it for a while, optionally like it.

        Args:
            video_id: Id substituted into ``YT_WATCH_VIDEO_URL``.
            percent_to_watch: Share of the video length (0-100) to wait for;
                any negative value means the full length.
            like: Call ``self.like(video_id)`` afterwards, but only when
                ``self.is_logged_in`` is truthy.

        Returns:
            ``(watched, liked)``. ``watched`` becomes ``True`` once the ad
            wait is over (before the waiting period starts); ``liked`` is the
            result of ``self.like`` or ``False``. Any exception is passed to
            ``self.print`` and the flags gathered so far are returned.
        """
        watched = False
        liked = False

        def ad_is_showing() -> bool:
            # Re-query on every poll: the ad element can appear and disappear.
            ad = self.browser.find_by('div',
                                      class_='video-ads ytp-ad-module',
                                      timeout=0.5)

            return bool(ad and ad.is_displayed())

        try:
            self.get(YT_WATCH_VIDEO_URL.format(video_id))
            raw_length = strings.between(self.browser.driver.page_source,
                                         'detailpage\\\\u0026len=', '\\\\')
            length_s = float(raw_length)

            play_button = self.browser.find_by(
                'button',
                class_='ytp-large-play-button ytp-button',
                timeout=0.5)

            if play_button and play_button.is_displayed():
                play_button.click()
                time.sleep(1)

            while ad_is_showing():
                time.sleep(0.1)

            watched = True

            if percent_to_watch >= 0:
                seconds_to_watch = percent_to_watch / 100 * length_s
            else:
                seconds_to_watch = length_s

            if seconds_to_watch > 0:
                self.print('Goinng to watch', seconds_to_watch)
                time.sleep(seconds_to_watch)

            if like and self.is_logged_in:
                liked = self.like(video_id)
        except Exception as e:
            self.print(e)

        return watched, liked
Example #8
0
    def parse_product(self, response) -> Optional[Dict]:
        """Parse a product page into a plain dictionary.

        Args:
            response: Object exposing the page as ``content`` (bytes, fed to
                BeautifulSoup) and ``text`` (str, searched for embedded JSON).

        Returns:
            ``None`` when the ``jQuery.parseJSON('...')`` payload cannot be
            decoded (``self.__json_loads`` returned ``None``); otherwise a
            dict with the keys ``title``, ``price``, ``categories``,
            ``features``, ``product information``, ``images``, ``videos_url``
            and ``associated_asins``. Optional sections that are absent from
            the page are returned empty (``price`` as ``None``).

        Raises:
            KeyError: If the payload lacks ``title`` or ``mediaAsin``, or if a
                colour listed in ``colorToAsin`` has no ``asin`` entry or no
                ``colorImages`` entry.
        """
        soup = BeautifulSoup(response.content, 'lxml')
        parsed_json = self.__json_loads(strings.between(response.text, 'var obj = jQuery.parseJSON(\'', '\')'))

        if parsed_json is None:
            return None

        title = parsed_json['title']
        asin = parsed_json['mediaAsin']
        videos = parsed_json.get('videos') or []

        # --- feature bullets -------------------------------------------------
        features = []
        features_container = soup.find('div', {'class': 'a-section a-spacing-medium a-spacing-top-small'})

        if features_container is not None:
            features = [
                feature.get_text().strip()
                for feature in features_container.find_all('span', {'class': 'a-list-item'})
            ]

        # --- breadcrumb categories -------------------------------------------
        categories = []
        categories_container = soup.find('div', {'id': 'wayfinding-breadcrumbs_container'})

        if categories_container is not None:
            for category_a in categories_container.find_all('a', {'class': 'a-link-normal a-color-tertiary'}):
                try:
                    categories.append(BeautifulSoup(category_a.text, "lxml").text.replace('\\', '/').replace('<', ' ').replace('>', ' ').strip().lower())
                except Exception:
                    # Best effort: one unparsable breadcrumb must not drop the rest.
                    continue

        # --- price -----------------------------------------------------------
        price = None
        price_span = soup.find('span', {'id': 'priceblock_ourprice'})

        if price_span is not None:
            try:
                price = float(price_span.text.replace('$', '').strip())
            except ValueError:
                # Text that float() rejects (e.g. a range) yields no price.
                price = None

        # --- product information table ---------------------------------------
        product_information_dict = {}
        table_for_product_info = soup.find('table', {'id': 'productDetails_detailBullets_sections1', 'class': 'a-keyvalue prodDetTable'})

        if table_for_product_info is not None:
            for tr in table_for_product_info.find_all('tr'):
                th = tr.find('th')
                td = tr.find('td')

                # Rows without both cells used to raise AttributeError here.
                if th is None or td is None:
                    continue

                key = th.get_text().strip()

                if key not in ('Customer Reviews', 'Best Sellers Rank'):
                    product_information_dict[key] = td.get_text().strip()

        # --- images per colour variant ---------------------------------------
        image_details = {}
        colors = parsed_json.get('colorToAsin')

        if colors is not None:
            for color_name, color_dict in colors.items():
                _asin = color_dict['asin']
                image_details[_asin] = {
                    'name': color_name,
                    'image_urls': [
                        elem['hiRes']
                        for elem in parsed_json['colorImages'][color_name]
                        if 'hiRes' in elem
                    ]
                }

        # --- videos ----------------------------------------------------------
        # This loop used to sit inside the colour-variant branch above, so
        # products without 'colorToAsin' never got their videos reported.
        video_urls = [
            video['url']
            for video in videos
            if isinstance(video, dict) and 'url' in video
        ]

        # --- fallback images (no colour variants) ----------------------------
        if not image_details:
            try:
                images_json = self.__json_loads(strings.between(response.text, '\'colorImages\': { \'initial\': ', '}]},') + '}]')

                if images_json is not None:
                    image_details[asin] = {
                        'name': asin,
                        'image_urls': []
                    }

                    for image_json in images_json:
                        try:
                            image_details[asin]['image_urls'].append(image_json['large'])
                        except Exception as e:
                            print(e)
            except Exception:
                # The fallback block is optional; leave the images empty.
                pass

        # --- associated ASINs ------------------------------------------------
        associated_asins = []

        try:
            associated_asins_json = self.__json_loads(strings.between(response.text, 'dimensionToAsinMap :', '},').strip() + '}')

            if associated_asins_json is not None:
                associated_asins = list(associated_asins_json.values())
        except Exception:
            # Map not present on this page; report no associated ASINs.
            pass

        return {
            'title': title,
            'price': price,
            'categories': categories,
            'features': features,
            'product information': product_information_dict,
            'images': image_details,
            'videos_url': video_urls,
            'associated_asins': associated_asins
        }
Example #9
0
    def parse_product(cls,
                      response: Optional[Response],
                      debug: bool = False) -> Optional[Product]:
        """Parse a product page response into a ``Product``.

        Args:
            response: HTTP response of the product page. Falsy responses and
                status codes other than 200/201 are rejected.
            debug: When true, print exceptions raised while parsing optional
                sections (and each video URL as it is processed).

        Returns:
            ``None`` when the response is rejected or the
            ``jQuery.parseJSON('...')`` payload cannot be decoded
            (``cls.__json_loads`` returned ``None``); otherwise
            ``Product(title, asin, price, categories, features, details,
            image_details, videos)``.

        Raises:
            KeyError: If the payload lacks ``title`` or ``mediaAsin``, or if a
                colour listed in ``colorToAsin`` has no ``asin`` entry or no
                ``colorImages`` entry.
        """
        if not response or response.status_code not in [200, 201]:
            return None

        categories = []
        features = []
        videos = []
        # Bound up front so the Product(...) call can never hit an unbound name.
        details = {}

        soup = bs(response.content, 'lxml')
        parsed_json = cls.__json_loads(
            strings.between(response.text, 'var obj = jQuery.parseJSON(\'',
                            '\')'))

        if parsed_json is None:
            return None

        images = parsed_json
        title = parsed_json['title'].strip()
        asin = parsed_json['mediaAsin']
        # Keep the raw entries separate from the normalised ``videos`` output.
        raw_videos = parsed_json.get('videos') or []

        try:
            for feature in soup.find(
                    'div',
                {
                    'class': 'a-section a-spacing-medium a-spacing-top-small'
                }).find_all('span', {'class': 'a-list-item'}):
                try:
                    features.append(feature.get_text().strip())
                except Exception:
                    pass
        except Exception as e:
            if debug:
                print(e)

        try:
            for cat_a in soup.find('div', {
                    'id': 'wayfinding-breadcrumbs_container'
            }).find_all('a', class_='a-link-normal a-color-tertiary'):
                try:
                    categories.append(
                        bs(cat_a.text, "lxml").text.replace('\\', '/').replace(
                            '<', ' ').replace('>', ' ').strip().lower())
                except Exception:
                    pass
        except Exception as e:
            if debug:
                print(e)

        try:
            price_text = soup.find('span', {
                'id': 'priceblock_ourprice'
            }).text.replace('$', '').strip()
            price = float(price_text)
        except Exception:
            # No price element, or text that float() rejects.
            price = None

        try:
            table_for_product_info = soup.find(
                'table', {
                    'id': 'productDetails_detailBullets_sections1',
                    'class': 'a-keyvalue prodDetTable'
                })

            if table_for_product_info is not None:
                for tr in table_for_product_info.find_all('tr'):
                    key = tr.find('th').get_text().strip()

                    if key not in ('Customer Reviews', 'Best Sellers Rank'):
                        details[key] = tr.find('td').get_text().strip()
        except Exception as e:
            # A malformed row stops the table scan; rows read so far are kept.
            if debug:
                print(e)

        image_details = {}

        if 'colorToAsin' in images and images['colorToAsin'] is not None:
            colors = images['colorToAsin']

            for color_name, color_dict in colors.items():
                _asin = color_dict['asin']
                image_details[_asin] = {'name': color_name, 'image_urls': []}

                images_by_color = images['colorImages'][color_name]

                for elem in images_by_color:
                    if 'hiRes' in elem:
                        image_details[_asin]['image_urls'].append(
                            elem['hiRes'])

        added_video_urls = set()

        # Previously the normalised dicts were appended to the very list being
        # iterated, so the result mixed raw and normalised entries.
        for elem in raw_videos:
            try:
                vid_url = elem['url']

                if debug:
                    print(vid_url)

                if vid_url in added_video_urls:
                    continue

                video = {
                    'url': vid_url,
                    'title': elem['title'].strip(),
                    'height': int(elem['videoHeight'] if 'videoHeight' in
                                  elem else elem['height']),
                    'width': int(elem['videoWidth'] if 'videoWidth' in
                                 elem else elem['width']),
                }

                videos.append(video)
                added_video_urls.add(vid_url)
            except Exception as e:
                if debug:
                    print(e)

        if not image_details:
            try:
                images_json = cls.__json_loads(
                    strings.between(response.text,
                                    '\'colorImages\': { \'initial\': ', '}]},')
                    + '}]')

                if images_json is not None:
                    image_details[asin] = {'name': asin, 'image_urls': []}

                    for image_json in images_json:
                        try:
                            image_details[asin]['image_urls'].append(
                                image_json['large'])
                        except Exception as e:
                            if debug:
                                print(e)
            except Exception as e:
                if debug:
                    print(e)

        # NOTE(review): associated_asins is collected but not handed to
        # Product below — confirm whether Product should receive it.
        associated_asins = []

        try:
            associated_asins_json = cls.__json_loads(
                strings.between(response.text, 'dimensionToAsinMap :',
                                '},').strip() + '}')

            if associated_asins_json is not None:
                associated_asins = list(associated_asins_json.values())
        except Exception as e:
            if debug:
                print(e)

        return Product(title, asin, price, categories, features, details,
                       image_details, videos)