Example #1
    async def populate(cls, debug=False):
        aux = {}
        categories = await GeneralParser.get_categories_page(debug=debug)
        categorized_movies_urls, total_links = await GeneralParser.get_categorized_movie_urls(
            categories, aux=aux, debug=debug)
        # flatten the per-category URL lists and de-duplicate them
        movies_urls = {
            url
            for category_urls in categorized_movies_urls.values()
            for url in category_urls
        }
        print(f"Total unique links: {len(movies_urls)}")
        print(aux)

        async def _update_db_wrapper(metadata):
            # upsert: update the instance if this movie was already added, otherwise insert it
            try:
                instance = await AsyncMovieInstanceCollection.find_one_and_update(
                    {
                        "origin": Config.IDENTIFIER,
                        "movie_id": metadata["movie_id"]
                    }, {"$set": metadata},
                    upsert=True,
                    return_document=ReturnDocument.AFTER)

                # merge all instances of the same movie on different sites into one main instance
                # create the main movie instance if not exists
                matching_movie = await AsyncMovieInstanceCollection.mergeWithCorrespondingMovie(
                    instance=instance)
                movie_object_id = matching_movie["_id"]
                print(movie_object_id)
            except Exception as e:
                if debug:
                    print(e)
                # re-raise with the original traceback so the caller sees the failure
                raise

        async def _routine_wrapper(url, session):
            try:
                metadata = await MovieParser.get_movie_info(
                    url,
                    pre_metadata=aux.get(url),
                    debug=debug,
                    session=session)
            except Exception as e:
                if debug:
                    print(e)
                # skip this URL if its page could not be parsed
                return
            print(metadata)
            await _update_db_wrapper(metadata)

        # process 20 URLs at a time so the site does not start answering with HTTP 500
        for urls_range in chunk_iterator(movies_urls, 20):
            session = AsyncSession()
            try:
                await asyncio.gather(*(_routine_wrapper(url, session)
                                       for url in urls_range),
                                     return_exceptions=True)
            finally:
                await session.close()
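
The `chunk_iterator` helper used by the loop above is not shown in this example. A minimal sketch of what it might look like, assuming it only has to split an arbitrary iterable (here the set of movie URLs) into batches of at most `size` items:

import itertools

def chunk_iterator(iterable, size):
    # Hypothetical implementation, not taken from the example above:
    # yield successive lists of at most `size` items from any iterable.
    iterator = iter(iterable)
    while True:
        batch = list(itertools.islice(iterator, size))
        if not batch:
            return
        yield batch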
Example #2
async def wrapped(*args, session=None, **kwargs):
    # reuse the caller's session when one is passed in; otherwise create a
    # session here and guarantee it is closed again, even on an exception
    need_close = session is None
    if need_close:
        session = AsyncSession()
    try:
        return await func(*args, session=session, **kwargs)
    finally:
        if need_close:
            await session.close()
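
`wrapped` is the inner function of a decorator: it closes over `func` and injects an `AsyncSession` whenever the caller does not supply one. A sketch of how the surrounding decorator and a decorated coroutine might look; the names `with_session` and `fetch_page` are illustrative, not from the source, and `AsyncSession` is assumed to be importable from the project:

import functools

def with_session(func):
    # Hypothetical outer decorator whose inner coroutine is the `wrapped`
    # function shown in the example above.
    @functools.wraps(func)
    async def wrapped(*args, session=None, **kwargs):
        need_close = session is None
        if need_close:
            session = AsyncSession()
        try:
            return await func(*args, session=session, **kwargs)
        finally:
            if need_close:
                await session.close()
    return wrapped

@with_session
async def fetch_page(url, session=None):
    # `session` is always set here: either passed by the caller or injected by the decorator
    res = await session.get(url)
    return res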
Example #3
async def get_categories_page(cls, debug=False) -> List[str]:
    links = []
    try:
        async with AsyncSession() as session:
            res = await session.get(Config.BASE_URL)
            res.raise_for_status()
            html_parser = BeautifulSoup(await res.text(), "html.parser")
            # the second "menu-item" <li> of the navigation holds the <ul>
            # with the category links
            categories = html_parser.find_all(
                "li", class_="menu-item")[1].find("ul")
            for category in categories.find_all("li"):
                # resolve possibly relative hrefs against the base URL
                links.append(
                    urllib.parse.urljoin(Config.BASE_URL,
                                         category.find("a")["href"]))
    except Exception as e:
        if debug:
            print(f"get_categories_page() {repr(e)}")
    return links
Example #4
async def get_categories_page(cls, debug=False) -> List[str]:
    links = []
    try:
        async with AsyncSession() as session:
            res = await session.get(Config.BASE_URL)
            res.raise_for_status()
            html_parser = BeautifulSoup(await res.text(), "html.parser")
            # the collapsible navbar's first <li> contains a nested <ul>
            # with the category links
            categories = html_parser.find(
                "div", {"id": "bs-example-navbar-collapse-1"}
            ).find("ul").find("li").find("ul")
            for category in categories.find_all("li"):
                # this variant appends the hrefs as-is, without urljoin
                links.append(category.find("a")["href"])
    except Exception as e:
        if debug:
            print(f"get_categories_page() {repr(e)}")
    return links
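
Both variants are classmethods on the parser class, so calling them only requires an event loop. A small usage sketch, assuming the method is exposed as `GeneralParser.get_categories_page` (as in Example #1) and that `GeneralParser` is importable from the project:

import asyncio

async def main():
    # debug=True surfaces parsing errors instead of silently returning an empty list
    links = await GeneralParser.get_categories_page(debug=True)
    for link in links:
        print(link)

if __name__ == "__main__":
    asyncio.run(main())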