Esempio n. 1
0
    def parse_movie_info(self, page, page_url):
        """Fetch one listing page and process every movie linked from it.

        The page body is written to a temporary file and read back line by
        line. Every line containing ``ulink`` is parsed with BeautifulSoup
        and the ``href`` of its first anchor is collected. Each collected
        link is appended to the part of ``page_url`` that precedes the first
        ``/html`` and passed to ``parse_movie_detail_info``.

        Any exception is logged with its traceback and swallowed, so a
        failing page never propagates to the caller.

        Args:
            page: Page identifier; only used in the error log message.
            page_url: URL of the listing page to fetch.
        """
        try:
            content = self.random_delay_get(page_url).content
            file_name = self._get_temp_filename()

            movie_urls = []
            try:
                with open(file_name, "w") as fd:
                    fd.write(content)

                with open(file_name, "r") as fd:
                    for line in fd:
                        line = line.strip()
                        if "ulink" not in line:
                            continue
                        dom = BeautifulSoup(line, "lxml")
                        movie_urls.append(dom.body.a["href"])
            finally:
                # Clean up even when writing or parsing fails; otherwise a
                # bad page would leave its temp file behind on disk.
                if os.path.exists(file_name):
                    os.remove(file_name)

            url_pre = page_url.split("/html", 1)[0]
            log.info("url_pre: %s", url_pre)
            for movie_url in movie_urls:
                movie_url = "".join([url_pre, movie_url])
                self.parse_movie_detail_info(movie_url)

        except Exception:
            log.error("parse_movie_info failed! page: %s, page_url: %s",
                      page, page_url, exc_info=True)
Esempio n. 2
0
 def parse_movie_list(self, movie_type, url):
     """Walk every page of a movie list and process each page.

     A ``url`` that does not start with ``http`` is prefixed with
     ``self.base_url``. The list page is fetched, ``self._parse_list``
     turns its content into ``(page, page_url)`` pairs, and each
     ``page_url`` is joined with ``/`` onto the list URL with its last
     path segment removed, then passed to ``parse_movie_info``.

     Any exception is logged with its traceback and swallowed.

     Args:
         movie_type: Only used in the error log message.
         url: URL of the list page.
     """
     try:
         is_absolute = url.startswith("http")
         if not is_absolute:
             # Rebind ``url`` itself so the error log reports the full URL.
             url = "".join((self.base_url, url))

         pieces = url.rsplit("/", 1)
         url_pre = pieces[0]
         log.info("url_pre: %s", url_pre)

         response = self.random_delay_get(url)
         for page_id, relative_url in self._parse_list(response.content):
             absolute_url = "/".join((url_pre, relative_url))
             self.parse_movie_info(page_id, absolute_url)

     except Exception:
         log.error("parse_movie_list failed! movie_type: %s, url: %s",
                   movie_type, url, exc_info=True)
Esempio n. 3
0
 def parse_movie_detail_info(self, movie_url):
     """Fetch a movie detail page and save its record if the score is in range.

     The page body is written to a temporary file with every ``\\r``
     replaced by ``\\n``, a ``movie_info`` record is built, and the temp
     file is removed. The record is passed to
     ``self.datautil.save_movie_info`` only when its ``IMDb`` value lies
     within ``[MIN_IMDB_SCORE, MAX_IMDB_SCORE]``.

     Any exception is logged with its traceback and swallowed.

     NOTE(review): in the code visible here nothing reads the temp file
     back or fills ``movie_info`` beyond its defaults, so ``IMDb`` is
     always 0 at the range check -- confirm whether parsing was removed.

     Args:
         movie_url: URL of the detail page; also stored as ``info_url``.
     """
     try:
         content = self.random_delay_get(movie_url).content
         file_name = self._get_temp_filename()
         try:
             with open(file_name, "w") as fd:
                 fd.write(content.replace("\r", "\n"))

             movie_info = {"info_url": movie_url,
                           "title": "",
                           "translated_name": "",
                           "name": "",
                           "year": "",
                           "country": "",
                           "tags": [],
                           "language": "",
                           "subtitles": "",
                           "IMDb": 0,
                           "IMDb_total": 10,
                           "IMDb_users_count": 0,
                           "file_format": "",
                           "size": "",
                           "file_size": "",
                           "film_length": "",
                           "director": "",
                           "actors": [],
                           "download_urls": []
                           }
         finally:
             # Clean up even when the write fails; otherwise the temp file
             # would be left behind on disk.
             if os.path.exists(file_name):
                 os.remove(file_name)

         if not MIN_IMDB_SCORE <= movie_info["IMDb"] <= MAX_IMDB_SCORE:
             return
         self.datautil.save_movie_info(movie_info)
     except Exception:
         log.error("parse_movie_detail_info failed! movie_url: %s",
                   movie_url, exc_info=True)