async def populate(cls, debug=False):
    aux = {}
    categories = await GeneralParser.get_categories_page(debug=debug)
    categorized_movies_urls, total_links = await GeneralParser.get_categorized_movie_urls(
        categories, aux=aux, debug=debug)
    # flatten the per-category URL lists into a deduplicated set
    movies_urls = {
        url
        for movies_urls in categorized_movies_urls.values()
        for url in movies_urls
    }
    print(f"Total links: {len(movies_urls)}")
    print(aux)

    async def _update_db_wrapper(metadata):
        # upsert this movie instance in case we have already added it
        try:
            instance = await AsyncMovieInstanceCollection.find_one_and_update(
                {
                    "origin": Config.IDENTIFIER,
                    "movie_id": metadata["movie_id"]
                },
                {"$set": metadata},
                upsert=True,
                return_document=ReturnDocument.AFTER)
            # merge all instances of the same movie on different sites into one
            # main instance; create the main movie instance if it does not exist
            matching_movie = await AsyncMovieInstanceCollection.mergeWithCorrespondingMovie(
                instance=instance)
            movie_object_id = matching_movie["_id"]
            print(movie_object_id)
        except Exception as e:
            if debug:
                print(e)
            raise e

    async def _routine_wrapper(url, session):
        metadata = None
        try:
            metadata = await MovieParser.get_movie_info(
                url, pre_metadata=aux.get(url), debug=debug, session=session)
        except Exception as e:
            if debug:
                print(e)
            return
        print(metadata)
        await _update_db_wrapper(metadata)

    # process 20 URLs at a time to avoid HTTP 500 errors from the origin
    for urls_range in chunk_iterator(movies_urls, 20):
        session = AsyncSession()
        await asyncio.gather(*(_routine_wrapper(url, session)
                               for url in urls_range),
                             return_exceptions=True)
        await session.close()
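# `chunk_iterator` is called above but not defined in this section. A minimal
# sketch of such a helper, assuming it only needs to yield fixed-size batches
# from any iterable; the name and signature mirror the call site, the body is
# an assumption, not the project's actual implementation:
from itertools import islice


def chunk_iterator(iterable, size):
    # yield successive tuples of at most `size` items until the iterable is drained
    it = iter(iterable)
    while True:
        chunk = tuple(islice(it, size))
        if not chunk:
            break
        yield chunk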
async def wrapped(*args, session=None, **kwargs):
    # reuse the caller's session when one is passed in; otherwise open a
    # temporary session and close it once the wrapped call finishes
    need_close = session is None
    if not session:
        session = AsyncSession()
    try:
        result = await func(*args, session=session, **kwargs)
        if need_close:
            await session.close()
        return result
    except Exception as e:
        if need_close:
            await session.close()
        raise e
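# `wrapped` closes over `func`, so it is presumably the inner function of a
# session-injecting decorator. A minimal sketch of how such a decorator could be
# assembled around it, with the duplicated close calls collapsed into try/finally;
# the decorator name `with_session` is hypothetical and not taken from the source:
import functools


def with_session(func):
    @functools.wraps(func)
    async def wrapped(*args, session=None, **kwargs):
        need_close = session is None
        if not session:
            session = AsyncSession()
        try:
            return await func(*args, session=session, **kwargs)
        finally:
            if need_close:
                await session.close()

    return wrapped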
async def get_categories_page(cls, debug=False) -> List[str]:
    links = []
    try:
        async with AsyncSession() as session:
            res = await session.get(Config.BASE_URL)
            res.raise_for_status()
            html_parser = BeautifulSoup(await res.text(), "html.parser")
            # the second "menu-item" entry holds the nested list of category links
            categories = list(html_parser.findAll(
                "li", class_="menu-item"))[1].find("ul")
            for category in categories.findAll("li"):
                links.append(
                    urllib.parse.urljoin(Config.BASE_URL,
                                         category.find("a")["href"]))
    except Exception as e:
        if debug:
            print(f"get_categories_page() {repr(e)}")
    return links
async def get_categories_page(cls, debug=False) -> List[str]:
    links = []
    try:
        async with AsyncSession() as session:
            res = await session.get(Config.BASE_URL)
            res.raise_for_status()
            html_parser = BeautifulSoup(await res.text(), "html.parser")
            # category links live in the nested <ul> of the navbar's first menu entry
            categories = html_parser.find(
                "div", {
                    "id": "bs-example-navbar-collapse-1"
                }).find("ul").find("li").find("ul")
            for category in categories.findAll("li"):
                links.append(category.find("a")["href"])
    except Exception as e:
        if debug:
            print(f"get_categories_page() {repr(e)}")
    return links
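# A minimal sketch of how one of these coroutine classmethods might be driven
# from a synchronous entry point; `SiteParser` stands in for whichever parser
# class the `get_categories_page` variants above belong to (a hypothetical name):
import asyncio


async def _main():
    links = await SiteParser.get_categories_page(debug=True)
    for link in links:
        print(link)


if __name__ == "__main__":
    asyncio.run(_main())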