def check_args(self):
        """Validate the view-range and query-word request arguments.

        Reads ``max_views``, ``min_views`` and ``query_word`` from the
        current request's query string.

        Returns:
            tuple: ``(True, parsed_args)`` where ``parsed_args`` maps
            ``search_word``/``min_views``/``max_views`` to usable values,
            or ``(False, {'data': <error message>})`` on invalid input.
        """
        # Get request arguments (missing args default to '')
        max_views: str = request.args.get('max_views', '')
        min_views: str = request.args.get('min_views', '')
        search_word: str = request.args.get('query_word', '')
        # View bounds must be non-negative integers with min < max.
        # isdigit() on '' is False, so absent args fail here too.
        if not max_views.isdigit() or not min_views.isdigit() or (
                int(min_views) >= int(max_views)):
            logger.error(
                f'{request.url} error in either max or min views: they must be '
                f'integers, or min >= max: {min_views}, {max_views}'
            )

            return False, {
                'data':
                'error in either max or min views: they must be integers, or min >= max'
            }
        # Require a minimally useful query word (len('') is 0, so the
        # empty/absent case is covered by the length check alone).
        if len(search_word) < 3:
            logger.error(f'{request.url} query word too short: {search_word}')

            return False, {
                'data': 'query word too short (required min 3 symbols)'
            }

        return True, {
            'search_word': search_word,
            'min_views': int(min_views),
            'max_views': int(max_views)
        }
# Ejemplo n.º 2
# 0
    def run(self) -> Optional[List[dict]]:
        """Run the parser.

        Crawls youtube search-result pages for ``self.search_word`` starting
        at page 1, keeping every video whose view count lies within
        ``[self.lower_bound, self.upper_bound]``.  Crawling stops when an
        item drops below ``self.lower_bound`` or a page yields no items.
        Collected items are enriched with matching database rows and passed
        through ``self.get_artist_pool``.

        Returns:
            List of result dicts (possibly empty).
        """
        current_page: int = 1
        out_of_number_of_views: bool = False
        results: List[dict] = list()
        logger.info(f'search videos in youtube with name: {self.search_word}')

        while True:
            # Format the url for the current result page
            current_url: str = self.search_url.format(
                search_word=self.search_word, page_number=current_page)
            # Send the request
            response: requests.Response = requests.get(current_url,
                                                       headers=self.headers)
            # Cast response to selector
            selector: Selector = Selector(response=response)
            # Get listed items from search result page
            items: List[Selector] = selector.xpath(
                '//ol[@class="section-list"]/li[2]/ol[@class="item-section"]/li'
            )
            # FIX: an empty page previously looped forever; stop paginating.
            if not items:
                break

            for item in items:
                try:
                    # 'Playlist' in case of playlist so we must not take into account
                    duration: Optional[str] = item.xpath(
                        './/h3[@class="yt-lockup-title "]/span/text()'
                    ).extract_first()
                    # Get desired part from item duration
                    # It maybe either duration, 'Playlist', 'Channel', Live(None in case of Live)
                    if not duration:
                        continue
                    duration = duration.replace('Duration:', "").replace(
                        ' ', "").replace(".", "").replace("-", "")
                    # We are not interested in those cases
                    if duration in ('Playlist', 'Channel'):
                        continue
                    # Get item views (None/'' in case of playlist)
                    views: Optional[str] = item.xpath(
                        './/div[@class="yt-lockup-meta "]/ul/li[2]/text()'
                    ).extract_first()
                    # FIX: previously a missing view count reached the
                    # bound comparison as None and raised TypeError (only
                    # swallowed by the broad handler below); skip explicitly.
                    if not views:
                        continue
                    views: int = int(
                        views.replace(' ', "").replace(",", "").replace(
                            "views", ""))
                    # If item views more than the upper bound do not include in results
                    if views > self.upper_bound:
                        continue
                    # Stop crawling if the item views are below the lower bound
                    if views < self.lower_bound:
                        out_of_number_of_views = True
                        break
                    # Get item title
                    title: Optional[str] = item.xpath(
                        './/h3[@class="yt-lockup-title "]/a/text()'
                    ).extract_first()
                    # Get the url
                    url: Optional[str] = item.xpath(
                        './/h3[@class="yt-lockup-title "]/a/@href'
                    ).extract_first()
                    url = f'https://youtube.com{url}'
                    # Channel information
                    channel: Optional[str] = item.xpath(
                        './/div[@class="yt-lockup-byline "]/a/text()'
                    ).extract_first()
                    channel_url: Optional[str] = item.xpath(
                        './/div[@class="yt-lockup-byline "]/a/@href'
                    ).extract_first()
                    # watch url looks like /watch?v=<id>; take the id part
                    youtube_id = url.split('=')[-1]
                    # Make a dict result for further processing
                    result: dict = {
                        'title': title,
                        'youtube_id': youtube_id,
                        'url': url,
                        'views': views,
                        'duration': duration,
                        'channel': channel,
                        'search_word': self.search_word,
                        'channel_url': f'https://youtube.com{channel_url}'
                    }

                    results.append(result)
                except TypeError:
                    logger.error(f'TypeError on url {current_url} ')

            # If number of views is smaller than wanted exit from while loop
            if out_of_number_of_views:
                break

            current_page += 1
        # Get artist names already in the database
        youtube_ids: List[str] = [item['youtube_id'] for item in results]
        with application.app_context():
            artist_results: List[Result] = Result.query.filter(
                Result.youtube_id.in_(youtube_ids)).all()
            # FIX: annotation was Dict[str:str] (a slice); Dict[str, dict]
            # matches the serialized rows actually stored here.
            id_to_artist_mapper: Dict[str, dict] = {
                item.youtube_id: item.serialize
                for item in artist_results
            }

        # Update results list with the stored database data
        for item in results:
            youtube_id = item['youtube_id']

            item.update(id_to_artist_mapper.get(youtube_id, {}))

        results: List[dict] = [
            self.get_artist_pool(result) for result in results
        ]

        logger.info(
            f'run youtube search on word: {self.search_word} with {current_page} pages'
        )

        return results
# Ejemplo n.º 3
# 0
    def scrape_video_page(self, url) -> Dict[str, str]:
        """Parse a youtube video page and return its metadata.

        Args:
            url: Full url of the video page to fetch.

        Returns:
            Dict with ``scraped_youtube_page``/``existing`` status flags plus
            any of ``category``, ``song``, ``artist_name``, ``album``,
            ``license``, ``description``, ``is_family_friendly`` and
            ``date_published`` that could be extracted from the page.
        """
        # Base metadata for future acknowledging of scraper status in view
        video_metadata = dict(scraped_youtube_page=True, existing=False)
        # Get the video page
        response: requests.Response = requests.get(url, headers=self.headers)
        # If some problem with request
        if response.status_code != 200:
            video_metadata['scraped_youtube_page'] = False
            logger.error(f'error getting page: {url}')

            return video_metadata

        selector: Selector = Selector(response)

        def extras_selector(label: str, exact: bool = True):
            """<li> rows of the watch-extras section whose <h4> carries *label*."""
            predicate = (f'h4[normalize-space(text())="{label}"]' if exact else
                         f'h4[contains(normalize-space(text()), "{label}")]')
            return selector.xpath(
                '//div[@id="watch7-main"]//ul[@class="watch-extras-section"]'
                f'//li[{predicate}]')

        # Get category of the video (may list several entries)
        category_selector = extras_selector('Category')
        if category_selector:
            categories: list = category_selector.xpath(
                "./ul//li//text()").extract()
            video_metadata['category'] = ','.join(categories)
        # Get song if presented
        song_selector = extras_selector('Song')
        if song_selector:
            song_name: Optional[str] = song_selector.xpath(
                "./ul//li//text()").extract_first()
            video_metadata['song'] = song_name
        # Get artist if presented
        artist_selector = extras_selector('Artist')
        if artist_selector:
            artist_name: Optional[str] = artist_selector.xpath(
                "./ul//li//text()").extract_first()
            video_metadata['artist_name'] = artist_name
        # Get album if presented
        album_selector = extras_selector('Album')
        if album_selector:
            album: Optional[str] = album_selector.xpath(
                "./ul//li//text()").extract_first()
            video_metadata['album'] = album
        # Get licensor if presented (label text varies, so substring match)
        license_selector = extras_selector('Licensed to', exact=False)
        if license_selector:
            license_: Optional[str] = license_selector.xpath(
                "./ul//li//text()").extract_first()
            video_metadata['license'] = license_

        # Page-level <meta> tags carry description / rating / publish date
        meta_selector = selector.xpath(
            '//div[@id="watch7-main"]/div[@id="watch7-content"]')
        if meta_selector:
            # Video description
            description = meta_selector.xpath(
                './/meta[@itemprop="description"]/@content').extract_first()
            video_metadata['description'] = description
            # Is video family friendly (meta content is the string 'True'/'False')
            is_family_friendly = meta_selector.xpath(
                './/meta[@itemprop="isFamilyFriendly"]/@content'
            ).extract_first()
            video_metadata['is_family_friendly'] = is_family_friendly == 'True'
            # Video published date
            date_published = meta_selector.xpath(
                './/meta[@itemprop="datePublished"]/@content').extract_first()
            video_metadata['date_published'] = date_published

        return video_metadata
# Ejemplo n.º 4
# 0
def error_404(error: NotFound):
    """Log a NotFound error and answer with a JSON 404 payload."""
    payload = {'data': f'page not found: {request.url}'}
    logger.error(f'{request.url} {str(error)}')
    return jsonify(payload), 404
# Ejemplo n.º 5
# 0
def error_500(error):
    """Log an internal server error and answer with a JSON 500 payload."""
    payload = {'data': f'server error: {request.url}'}
    logger.error(f'{request.url} {str(error)}')
    return jsonify(payload), 500