def parse_movie_info(self, page, page_url):
    """Fetch one listing page and crawl every movie detail link found on it.

    :param page: page number/identifier — used only in the error log message.
    :param page_url: absolute URL of the listing page to fetch.

    Any failure is logged with a traceback and swallowed, matching the
    crawler's best-effort error style.
    """
    try:
        # ``.content`` is bytes; stay in bytes end-to-end so this works on
        # Python 3 regardless of the site's encoding (BeautifulSoup accepts
        # bytes markup and sniffs the encoding itself).
        content = self.random_delay_get(page_url).content
        movie_urls = []
        # Scan the payload in memory instead of round-tripping it through a
        # temp file: the old temp file was written in text mode (a bytes/str
        # crash on Python 3) and leaked whenever an exception fired before
        # the os.remove() call.
        for line in content.splitlines():
            line = line.strip()
            # Detail-page anchors carry class="ulink" on this site; skip
            # everything else cheaply before paying for a DOM parse.
            if b"ulink" not in line:
                continue
            dom = BeautifulSoup(line, "lxml")
            movie_urls.append(dom.body.a["href"])
        # The hrefs are site-relative; rebuild them against the URL prefix
        # preceding the "/html" path segment.
        url_pre = page_url.split("/html", 1)[0]
        log.info("url_pre: %s", url_pre)
        for movie_url in movie_urls:
            movie_url = "".join([url_pre, movie_url])
            self.parse_movie_detail_info(movie_url)
    except Exception:
        log.error("parse_movie_info failed! page: %s, page_url: %s",
                  page, page_url, exc_info=True)
def parse_movie_list(self, movie_type, url):
    """Walk the paginated listing for one movie category.

    :param movie_type: category label — used only in the error log message.
    :param url: listing URL; relative URLs are made absolute against
        ``self.base_url``.

    Each (page, page_url) pair reported by ``_parse_list`` is resolved
    against the listing's URL prefix and handed to ``parse_movie_info``.
    Failures are logged with a traceback and swallowed.
    """
    try:
        if not url.startswith("http"):
            url = "".join([self.base_url, url])
        # Everything up to the final path segment is the prefix the
        # per-page links are relative to.
        url_pre = url.rsplit("/", 1)[0]
        log.info("url_pre: %s", url_pre)
        response = self.random_delay_get(url)
        for page, page_url in self._parse_list(response.content):
            self.parse_movie_info(page, "/".join([url_pre, page_url]))
    except Exception:
        log.error("parse_movie_list failed! movie_type: %s, url: %s",
                  movie_type, url, exc_info=True)
def parse_movie_detail_info(self, movie_url):
    """Fetch one movie detail page and persist its record via the data layer.

    :param movie_url: absolute URL of the movie detail page.

    Records whose IMDb score falls outside [MIN_IMDB_SCORE, MAX_IMDB_SCORE]
    are dropped. Failures are logged with a traceback and swallowed.
    """
    try:
        content = self.random_delay_get(movie_url).content
        file_name = self._get_temp_filename()
        try:
            # ``.content`` is bytes: write in binary mode with bytes patterns
            # so this does not crash on Python 3 (the old text-mode "w" plus
            # str-pattern replace() did). The site uses CR line breaks in
            # places; normalize them to LF.
            with open(file_name, "wb") as fd:
                fd.write(content.replace(b"\r", b"\n"))
            # NOTE(review): the temp file is written but never read in this
            # chunk — the step that fills movie_info from it appears to be
            # missing here; confirm against the full source.
            movie_info = {"info_url": movie_url, "title": "", "translated_name": "", "name": "", "year": "", "country": "", "tags": [], "language": "", "subtitles": "", "IMDb": 0, "IMDb_total": 10, "IMDb_users_count": 0, "file_format": "", "size": "", "file_size": "", "film_length": "", "director": "", "actors": [], "download_urls": []
                          }
        finally:
            # Always clean up the temp file, even when an exception fires
            # above (the original leaked it on any error before remove()).
            if os.path.exists(file_name):
                os.remove(file_name)
        if not MIN_IMDB_SCORE <= movie_info["IMDb"] <= MAX_IMDB_SCORE:
            return
        self.datautil.save_movie_info(movie_info)
    except Exception:
        log.error("parse_movie_detail_info failed! movie_url: %s",
                  movie_url, exc_info=True)