Example 1
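Pulls every movie detail link (anchors carrying the "ulink" class) out of a listing page and hands each absolute URL to parse_movie_detail_info. The helpers random_delay_get and _get_temp_filename come from the enclosing crawler class and are not part of this excerpt.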
    def parse_movie_info(self, page, page_url):
        try:
            content = self.random_delay_get(page_url).content
            file_name = self._get_temp_filename()
            # content is raw bytes, so dump it in binary mode; the temp file
            # lets the page be scanned line by line below.
            with open(file_name, "wb") as fd:
                fd.write(content)

            movie_urls = []
            with open(file_name, "r") as fd:
                for line in fd:
                    line = line.strip()
                    # Detail links are the anchors tagged with the "ulink" class.
                    if "ulink" not in line:
                        continue
                    dom = BeautifulSoup(line, "lxml")
                    movie_urls.append(dom.body.a["href"])
            os.remove(file_name)

            url_pre = page_url.split("/html", 1)[0]
            log.info("url_pre: %s", url_pre)
            for movie_url in movie_urls:
                movie_url = "".join([url_pre, movie_url])
                self.parse_movie_detail_info(movie_url)

        except Exception:
            log.error("parse_movie_info failed! page: %s, page_url: %s",
                      page, page_url, exc_info=True)
Example 2
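Normalizes a category URL to an absolute one, fetches the first listing page, and iterates over the pagination entries returned by _parse_list (Example 3), delegating each page to parse_movie_info (Example 1).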
    def parse_movie_list(self, movie_type, url):
        try:
            # Menu links may be relative; prepend the site root if needed.
            if not url.startswith("http"):
                url = "".join([self.base_url, url])
            url_pre = url.rsplit("/", 1)[0]
            log.info("url_pre: %s", url_pre)
            content = self.random_delay_get(url).content
            page_urls = self._parse_list(content)
            for page, page_url in page_urls:
                page_url = "/".join([url_pre, page_url])
                self.parse_movie_info(page, page_url)

        except Exception:
            log.error("parse_movie_list failed! movie_type: %s, url: %s",
                      movie_type, url, exc_info=True)
Example 3
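Scans a listing page line by line for the <option> tags of its page-selector widget and returns [page_number, url] pairs, one per pagination entry.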
    def _parse_list(self, content):
        page_urls = []
        file_name = self._get_temp_filename()
        # content is raw bytes, so dump it in binary mode; the temp file
        # lets the page be scanned line by line below.
        with open(file_name, "wb") as fd:
            fd.write(content)

        with open(file_name, "r") as fd:
            for line in fd:
                line = line.strip()
                # Pagination entries live in the <option> tags of a page selector.
                if "option" not in line:
                    continue

                try:
                    tag_option = BeautifulSoup(line, "lxml")
                    page = int(tag_option.text)
                    url = tag_option.body.option["value"]
                    log.info("page: %s, url: %s", page, url)
                    page_urls.append([page, url])
                except (ValueError, TypeError, AttributeError):
                    # Skip <option> lines that are not numbered page links.
                    pass
        os.remove(file_name)
        return page_urls
Example 4
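Entry point of the crawl: reads the category menu from the site index and starts parse_movie_list (Example 2) for every real category, skipping placeholder links.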
    def parse_index(self):
        index_content = self.sess.get(self.base_url).content
        dom = BeautifulSoup(index_content, "lxml")
        tag_div_menu = dom.find_all("div", {"id": "menu"})

        # Map each category name in the menu to its list-page URL.
        menu_urls = {}
        tag_a = tag_div_menu[0].find_all("a")
        for tag in tag_a:
            menu_urls[tag.text] = tag["href"]
            log.info("movie_type: %s, url: %s", tag.text, tag["href"])
        log.info(menu_urls)

        for movie_type, url in menu_urls.items():
            # Skip placeholder entries and the link back to the index itself.
            if url in ["#", "index.html"]:
                log.info("ignore movie_type: %s, url: %s", movie_type, url)
                continue
            self.parse_movie_list(movie_type, url)
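
All four methods assume an enclosing crawler class that provides self.sess, self.base_url, self.random_delay_get, and self._get_temp_filename, none of which appear in the excerpts above. A minimal sketch of that scaffolding, assuming requests for HTTP and the standard library for delays and temp files; the class name, delay range, and temp-file naming are illustrative guesses, not the original code:

import logging
import os
import random
import tempfile
import time

import requests
from bs4 import BeautifulSoup  # used by the parse_* methods above

log = logging.getLogger(__name__)


class MovieCrawler:
    """Hypothetical scaffolding for the methods shown in Examples 1-4."""

    def __init__(self, base_url):
        self.base_url = base_url
        self.sess = requests.Session()

    def random_delay_get(self, url):
        # Illustrative guess: sleep 1-3 s before each request to stay polite.
        time.sleep(random.uniform(1.0, 3.0))
        return self.sess.get(url, timeout=30)

    def _get_temp_filename(self):
        # Illustrative guess: create a unique temp file and return its path;
        # the callers above remove the file themselves when done.
        fd, path = tempfile.mkstemp(suffix=".html")
        os.close(fd)
        return path

With scaffolding like this in place, MovieCrawler("https://example.com").parse_index() would drive the whole crawl: index, then category lists, then listing pages, then detail pages.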