    def __init__(self,
                 wait_time_between_calls_in_seconds=None,
                 cache_path=None):
        self.wait_time_between_calls_in_seconds = wait_time_between_calls_in_seconds
        self.cache_path = cache_path

        self.films = None
        self.tropes = None
        self.tropes_by_film = OrderedDict()
        self.latest_log_datetime = None
        self.parser = ObjectFactory().get_instance(PageParserInterface)
Example #2
    def _retrieve(self, url):
        if self._cache is None:
            if self.cache_path is None:
                self._cache = ObjectFactory().get_instance(CacheInterface)
            else:
                self._cache = ObjectFactory().get_instance(CacheInterface, self.cache_path)

        content = self._cache.get(url)
        if content is None:
            retriever = self._get_retriever()
            content = retriever.retrieve(url)
            self._cache.set(url, content)

        return content
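
For reference, the only methods _retrieve needs from the CacheInterface implementation are get and set (plus get_info, used later for the summary). A minimal in-memory sketch of that contract (the class name is hypothetical, not part of the project):

class InMemoryCache(object):
    """Stand-in for CacheInterface: get/set pages by URL plus a stats string."""

    def __init__(self):
        self._entries = {}

    def get(self, url):
        # Return None on a miss, which is exactly what _retrieve checks for.
        return self._entries.get(url)

    def set(self, url, content):
        self._entries[url] = content

    def get_info(self):
        return f'{len(self._entries)} cached pages'
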
Example #3
    @staticmethod
    def _get_info():
        file_cache = ObjectFactory().get_instance(CacheInterface)
        return file_cache.get_info()
Example #4
    def _export_to_json(self):
        store = ObjectFactory().get_instance(TropesStoreInterface,
                                             self._file_name,
                                             self.tropes_by_film)
        store.store()
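
TropesStoreInterface is constructed with a file name and the tropes_by_film mapping, then asked to store(). Judging from the JSON export in TVTropesScraper further below, a minimal JSON-backed sketch could look like this (the class name is hypothetical):

import json

class JsonTropesStore(object):
    """Stand-in for TropesStoreInterface: dump the mapping to a JSON file."""

    def __init__(self, file_name, tropes_by_film):
        self._file_name = file_name
        self._tropes_by_film = tropes_by_film

    def store(self):
        with open(self._file_name, 'w') as file:
            json.dump(self._tropes_by_film, file)
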
Example #5
    def _get_retriever(self):
        if self.wait_time_between_calls_in_seconds is None:
            return ObjectFactory().get_instance(WebPageRetrieverInterface)
        return ObjectFactory().get_instance(
            WebPageRetrieverInterface, self.wait_time_between_calls_in_seconds)
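
ObjectFactory acts as a small service locator throughout these examples: it resolves an interface class to a concrete implementation and forwards any extra positional arguments to its constructor. A minimal sketch of that pattern (the registry and its contents are assumptions, not the project's actual wiring):

class ObjectFactory(object):
    """Resolve an interface class to a registered implementation."""

    _registry = {}  # interface class -> concrete class, filled at startup

    @classmethod
    def register(cls, interface, implementation):
        cls._registry[interface] = implementation

    def get_instance(self, interface, *args):
        # Extra positional arguments (a cache path, a wait time, ...) are
        # passed straight to the implementation's constructor.
        return self._registry[interface](*args)

With that shape, get_instance(WebPageRetrieverInterface, wait_time) builds a throttled retriever, while the no-argument call falls back to the implementation's defaults.
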
Example #6
class ScrapeTropesUseCase(object):
    logger = logging.getLogger(__name__)

    def __init__(self,
                 file_name,
                 wait_time_between_calls_in_seconds=None,
                 cache_path=None):
        self._file_name = file_name
        self.wait_time_between_calls_in_seconds = wait_time_between_calls_in_seconds
        self.cache_path = cache_path
        self._initial_recursivity = 1000000

        self.films = None
        self._all_trope_urls = set()
        self.tropes = None
        self.tropes_by_film = OrderedDict()
        self.latest_log_datetime = None

        self.cache_interface = CacheInterface
        self.page_parser_interface = PageParserInterface
        self.page_retriever_interface = WebPageRetrieverInterface
        self.tropes_store_interface = TropesStoreInterface
        self.object_factory = ObjectFactory

        self.parser = TVTropesParser()
        self._cache = None

    def run(self):
        self._log_instructions()
        self._extract_film_ids()
        self._extract_tropes()
        self._extract_trope_ids_and_link_films()
        self._export_to_json()
        self._log_summary()

        return self.tropes_by_film

    def _log_instructions(self):
        self.logger.info(
            'Process started\n* Remember that you can stop and restart at any time.\n'
            '** Please remove the cache folder manually when you are done\n')

    def _extract_film_ids(self):
        self.logger.info('Scraping film ids...')

        self.films = set()
        starting_url = self.parser.get_films_starting_url()
        page = self._retrieve(starting_url)
        category_ids = self.parser.extract_categories(page)

        for category_id in category_ids:
            category_url = self.parser.get_category_url(category_id)
            page = self._retrieve(category_url)
            film_ids = self.parser.extract_films(page)
            self.films.update(film_ids)

        self.logger.info(f'Found {len(self.films)} films')

    def _extract_tropes(self):
        self.logger.info('Scraping tropes (this might take a while...)')

        self.tropes = set()
        self.tropes_by_film = OrderedDict()
        sorted_films = sorted(self.films)

        self.logger.debug(f'Found {len(sorted_films)} films')

        for counter, film in enumerate(sorted_films):
            url = self.parser.get_film_url(film)
            page = self._retrieve(url)
            trope_ids = self.parser.extract_tropes_from_film_page(page)

            self.logger.debug(
                f'Film {film} ({len(trope_ids)} tropes): {trope_ids}')

            self.tropes.update(trope_ids)
            self.tropes_by_film[film] = sorted(trope_ids)

            if self._should_log_status():
                self._log_status(counter, sorted_films)

    def _extract_trope_ids_and_link_films(self):
        self.logger.info('Scraping trope ids...')

        starting_url = self.parser.get_tropes_starting_url()
        page = self._retrieve(starting_url)

        self._all_trope_urls = set()
        self._all_trope_urls.add(starting_url)

        # The traversal below is deeply recursive, so raise Python's default
        # recursion limit to match the initial recursivity budget.
        sys.setrecursionlimit(10**6)
        self.extract_all_tropes_in_page_recursively(
            page, 'tropes', recursivity_level=self._initial_recursivity)
        for film, tropes in self.tropes_by_film.items():
            self.tropes_by_film[film] = sorted(set(tropes))

        self.logger.info(f'Found {len(self._all_trope_urls)} tropes')

    def _should_log_status(self):
        now = datetime.datetime.now()
        log_interval = datetime.timedelta(seconds=5)
        return (self.latest_log_datetime is None
                or now - self.latest_log_datetime > log_interval)

    def _log_status(self,
                    counter,
                    total_elements_list=None,
                    item_name_plural='films'):
        total = float('inf') if total_elements_list is None else len(
            total_elements_list)

        values = [len(tropes) for tropes in self.tropes_by_film.values()]
        average = sum(values) / len(values)
        self.logger.info(
            f'Status: {counter + 1}/{total} {item_name_plural} scraped. '
            f'Average tropes by film: {average}')
        self.latest_log_datetime = datetime.datetime.now()

    def _retrieve(self, url):
        if self._cache is None:
            if self.cache_path is None:
                self._cache = ObjectFactory().get_instance(CacheInterface)
            else:
                self._cache = ObjectFactory().get_instance(
                    CacheInterface, self.cache_path)

        content = self._cache.get(url)
        if content is None:
            retriever = self._get_retriever()
            content = retriever.retrieve(url)
            self._cache.set(url, content)

        return content

    def _get_retriever(self):
        if self.wait_time_between_calls_in_seconds is None:
            return ObjectFactory().get_instance(WebPageRetrieverInterface)
        return ObjectFactory().get_instance(
            WebPageRetrieverInterface, self.wait_time_between_calls_in_seconds)

    def _export_to_json(self):
        store = ObjectFactory().get_instance(TropesStoreInterface,
                                             self._file_name,
                                             self.tropes_by_film)
        store.store()

    def _log_summary(self):
        films_count = len(self.tropes_by_film)
        tropes_count = len(self.tropes)
        self.logger.info(
            f'Summary:\n- Films: {films_count}\n- Tropes: {tropes_count}\n- Cache: {self._get_info()}'
        )

    @staticmethod
    def _get_info():
        file_cache = ObjectFactory().get_instance(CacheInterface)
        return file_cache.get_info()

    def extract_all_tropes_in_page_recursively(self, page, trope_name,
                                               recursivity_level):
        recursivity_level -= 1

        links, films = self.parser.get_all_trope_links_and_paginations(
            page, trope_name)
        for film in films:
            if film in self.tropes_by_film:
                self.tropes_by_film[film].append(trope_name)

        if self._should_log_status():
            self._log_status(len(self._all_trope_urls),
                             item_name_plural='tropes')

        if recursivity_level <= 0:
            return

        filtered_links = []
        for link in links:
            full_link = self.parser.get_link_url(link)
            # Only follow links that were not visited in an earlier step.
            if full_link not in self._all_trope_urls:
                filtered_links.append(full_link)
                self._all_trope_urls.add(full_link)

        if self._should_log_status():
            self._log_status(len(self._all_trope_urls),
                             item_name_plural='tropes')

        for full_link in filtered_links:
            # Redirect loops would make the recursion spin forever; bail out
            # as soon as one is detected.
            if ('CircularRedirect' in full_link
                    or 'ThisPageRedirectsToItself' in full_link):
                return

            # Trope pages normally live under .../Main/<TropeName>; fall back
            # to the generic .../pmwiki.php/<Namespace>/... pattern otherwise.
            matches = re.match('^.*/Main/([^/]+)(/.*)?$', full_link)
            if not matches:
                matches = re.match('^.*/pmwiki.php/([^/]+)/.*$', full_link)
                if not matches:
                    self.logger.warning(
                        f'Could not get the trope name in url {full_link}')
                    return

            subtrope = matches.group(1)
            category = full_link.split('/')[-1]
            if (not self.parser.trope_name_is_media_type_to_ignore(subtrope)
                    and not self.parser.trope_name_is_media_type_to_ignore(
                        category)):

                if subtrope != category and category in self.tropes_by_film:
                    self.logger.info(
                        f'URL encoded trope-film relation ({subtrope}=>{category}): {full_link}'
                    )
                    self.tropes_by_film[category].append(subtrope)
                else:
                    page = self._retrieve(full_link)
                    if page is not None:
                        self.extract_all_tropes_in_page_recursively(
                            page, subtrope, recursivity_level)
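
Putting the use case together, a typical invocation looks like this (the file name and wait time are illustrative values, not project defaults):

use_case = ScrapeTropesUseCase('tvtropes.json',
                               wait_time_between_calls_in_seconds=0.5)
tropes_by_film = use_case.run()
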
Example #7
    def _get_cache(self):
        if self.cache_path is None:
            return ObjectFactory().get_instance(CacheInterface)
        return ObjectFactory().get_instance(CacheInterface, self.cache_path)
Example #8
class TVTropesScraper(object):
    logger = logging.getLogger(__name__)

    def __init__(self,
                 wait_time_between_calls_in_seconds=None,
                 cache_path=None):
        self.wait_time_between_calls_in_seconds = wait_time_between_calls_in_seconds
        self.cache_path = cache_path

        self.films = None
        self.tropes = None
        self.tropes_by_film = OrderedDict()
        self.latest_log_datetime = None
        self.parser = ObjectFactory().get_instance(PageParserInterface)

    def get_tropes(self):
        self.logger.info(
            'Process started\n* Remember that you can stop and restart at any time.\n'
            '** Please remove the cache folder manually when you are done\n')

        self._extract_film_ids()
        self._extract_tropes()

        return self.tropes_by_film

    def _extract_film_ids(self):
        self.logger.info('Scraping film ids...')

        self.films = set()
        starting_url = self.parser.get_starting_url()
        page = self._retrieve(starting_url)
        category_ids = self.parser.extract_categories(page)

        for category_id in category_ids:
            category_url = self.parser.get_category_url(category_id)
            page = self._retrieve(category_url)
            film_ids = self.parser.extract_films(page)
            self.films.update(film_ids)

        self.logger.info(f'Found {len(self.films)} films')

    def _extract_tropes(self):
        self.logger.info('Scraping tropes (this might take a while...)')

        self.tropes = set()
        self.tropes_by_film = OrderedDict()
        sorted_films = sorted(self.films)

        self.logger.debug(f'Found {len(sorted_films)} films')

        for counter, film in enumerate(sorted_films):
            url = self.parser.get_film_url(film)
            page = self._retrieve(url)
            trope_ids = self.parser.extract_tropes(page)

            self.logger.debug(
                f'Film {film} ({len(trope_ids)} tropes): {trope_ids}')

            self.tropes.update(trope_ids)
            self.tropes_by_film[film] = sorted(trope_ids)

            if self._should_log_status():
                self._log_status(counter, sorted_films)

    def _should_log_status(self):
        now = datetime.datetime.now()
        log_interval = datetime.timedelta(seconds=5)
        return (self.latest_log_datetime is None
                or now - self.latest_log_datetime > log_interval)

    def _log_status(self, counter, sorted_films):
        self.logger.info(
            f'Status: {counter + 1}/{len(sorted_films)} films scraped')
        self.latest_log_datetime = datetime.datetime.now()

    def _retrieve(self, url):
        cache = self._get_cache()
        content = cache.get(url)
        if content is None:
            retriever = self._get_retriever()
            content = retriever.retrieve(url)
            cache.set(url, content)

        return content

    def _get_cache(self):
        if self.cache_path is None:
            return ObjectFactory().get_instance(CacheInterface)
        return ObjectFactory().get_instance(CacheInterface, self.cache_path)

    def _get_retriever(self):
        if self.wait_time_between_calls_in_seconds is None:
            return ObjectFactory().get_instance(WebPageRetrieverInterface)
        return ObjectFactory().get_instance(
            WebPageRetrieverInterface, self.wait_time_between_calls_in_seconds)

    @staticmethod
    def get_info():
        file_cache = ObjectFactory().get_instance(CacheInterface)
        return file_cache.get_info()

    def export_to_json(self, filename):
        import json
        with open(filename, 'w') as file:
            json.dump(self.tropes_by_film, file)
        self.logger.info(
            f'Saved dictionary <film_name> -> [<trope_list>] as JSON file {filename}'
        )

    def log_summary(self):
        films_count = len(self.tropes_by_film)
        tropes_count = len(self.tropes)
        self.logger.info(
            f'Summary:\n- Films: {films_count}\n- Tropes: {tropes_count}\n- Cache: {self.get_info()}'
        )
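
The TVTropesScraper class exposes the same workflow as separate calls. A typical session (the wait time and file name are illustrative values):

scraper = TVTropesScraper(wait_time_between_calls_in_seconds=1)
tropes_by_film = scraper.get_tropes()
scraper.export_to_json('tvtropes.json')
scraper.log_summary()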