def parse_title(self, html, video_info):
    # The site wraps each field between Chinese start/end markers embedded in
    # the page HTML, e.g. '影片名称开始代码' ... '影片名称结束代码' (title markers).
    match = re.search(u'影片名称开始代码(.*)影片名称结束代码', html)
    if match:
        video_info.title = match.group(1)
        Log.write_stdout('title: %s', match.group(1))
    match = re.search(u'影片副标开始代码(.*)影片副标结束代码', html)
    if match:
        video_info.sub_title = match.group(1)
        Log.write_stdout('subtitle: %s', match.group(1))
def add(self, data):
    Log.write_stdout("cache add")
    self._cachedItems.append(data)
    # Flush to persistence once the buffer reaches its maximum size.
    # (>= instead of == so a missed boundary can never grow the buffer forever.)
    max_size = self._maxSize
    if len(self._cachedItems) >= max_size:
        Log.write_stdout("cache full, writing to persistence")
        self.writeToPersistence(self._cachedItems[:max_size])
        del self._cachedItems[:max_size]
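# A minimal usage sketch of the cache contract above (hypothetical caller;
# assumes _maxSize is set in VideoCache.__init__ and flush() persists leftovers):
#
#   cache = VideoCache()
#   for info in video_infos:
#       cache.add(info)   # persists automatically whenever the buffer fills
#   cache.flush()         # persist whatever is still buffered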
def parse(self, soup):
    if not soup:
        Log.write_stdout('ERROR: soup is None')
        return
    Log.write_stdout('#####################')
    video_info = Video_Info()
    self.video_parser.parse(soup, video_info)
    for qhash in video_info.qhash_list:
        print qhash
    # Only keep entries that actually carry hashes.
    if len(video_info.qhash_list) > 0:
        self.video_list.append(video_info)
def search(keyword):
    '''
    Search site content for keyword.
    @author: douzifly
    '''
    if not keyword:
        return
    url = "http://www.hakuzy.com/search.asp"
    keyword = Utils.to_unicode(keyword)
    params = {"searchword": keyword.encode("gbk")}  # the site expects GBK
    html = WebTool.request(url, params, "post")  # TODO: replace with another HTTP lib
    if not html:
        Log.write_stderr("ERROR: can't get html")
        return
    Log.write_stdout(html)
    # This html only contains the search results, no hashes yet.
    parser = HakuzyVideoParser()  # TODO: do not create a parser every time
    # Follow each video link found on the results page.
    cache = VideoCache()
    for url in parser.parse_search_page(html):
        html = WebTool.request(url)
        soup = BeautifulSoup(html)
        video_info = Video_Info()
        video_info.ref_url = url
        parser.parse(soup, video_info)
        Log.write_stdout("###################")
        Log.write_stdout(video_info)
        cache.add(video_info)
        time.sleep(Config.NETWORK_REQUST_INTERVAL)
    cache.flush()  # write remaining items to persistence
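# A minimal usage sketch for search() (hypothetical entry point; assumes the
# project's Config, Log and database have already been initialised):
if __name__ == '__main__':
    search(u'movie')  # any keyword; it is converted to GBK before the POST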
def writeToPersistence(self, datas):
    # TODO: move to a thread pool later.
    Log.write_stdout("writeToPersistence len:%d", len(datas))
    videoTb = VideoTb()
    ret = videoTb.open()
    if not ret:
        Log.write_stdout("can't open db")
        return
    for video in datas:
        videoTb.insert(video)
    videoTb.commit()
    videoTb.close()
    Log.write_stdout("write finished")
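# Hypothetical sketch for the "move to a thread pool later" note above: hand
# the write off to a daemon thread so the crawl loop is not blocked. Not part
# of the original class; assumes a single background writer is acceptable.
def writeToPersistenceAsync(self, datas):
    import threading
    worker = threading.Thread(target=self.writeToPersistence, args=(datas,))
    worker.daemon = True
    worker.start()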
def parse_language(self, html, video_info):
    match = re.search(u'影片语言开始代码(.*)影片语言结束代码', html)
    if match:
        video_info.language = match.group(1)
        Log.write_stdout('language: %s', match.group(1))
def parse_type(self, html, video_info):
    match = re.search(u'影片类型开始代码(.*)影片类型结束代码', html)
    if match:
        video_info.type = match.group(1)
        Log.write_stdout('type: %s', match.group(1))
def parse_director(self, html, video_info):
    match = re.search(u'影片导演开始代码(.*)影片导演结束代码', html)
    if match:
        video_info.director = match.group(1)
        Log.write_stdout('director: %s', match.group(1))
def parse_actors(self, html, video_info):
    match = re.search(u'影片演员开始代码(.*)影片演员结束代码', html)
    if match:
        video_info.actors = match.group(1)
        Log.write_stdout('actors: %s', match.group(1))
def __init__(self, webSiteURL, entryFilter=None, yieldFilter=None,
             identifier=None, enableStatusSupport=False):
    Koala.__init__(self, webSiteURL, entryFilter, yieldFilter,
                   identifier, enableStatusSupport)
    self.__total_size = 0
    self.video_list = list()
    self.video_parser = HakuzyVideoParser()
    Log.write_stdout('Hakuzy.__init__')
def parse_img(self, soup, video_info):
    # The cover image is the <img> tag carrying an onerror attribute
    # (onerror=True matches any tag that has the attribute at all).
    img = soup.find('img', onerror=True)
    if img:
        video_info.img = img.get('src')
        Log.write_stdout('img = %s', img.get('src'))
def parse_brief(self, html, video_info):
    match = re.search(u'影片介绍开始代码(.*)影片介绍结束代码', html)
    if match:
        video_info.brief = match.group(1)
        Log.write_stdout('synopsis: %s', match.group(1))
def parse_statues(self, html, video_info):
    # This field uses a single label rather than a start/end marker pair.
    match = re.search(u'影片状态:(.*)', html)
    if match:
        video_info.status = match.group(1)
        Log.write_stdout('status: %s', match.group(1))
def parse_update_time(self, html, video_info):
    match = re.search(u'影片更新时间开始代码(.*)影片更新时间结束代码', html)
    if match:
        video_info.update_time = match.group(1)
        Log.write_stdout('update time: %s', match.group(1))
def parse_public_time(self, html, video_info):
    match = re.search(u'上映日期开始代码(.*)上映日期结束代码', html)
    if match:
        video_info.public_time = match.group(1)
        Log.write_stdout('release date: %s', match.group(1))
def parse_area(self, html, video_info):
    match = re.search(u'影片地区开始代码(.*)影片地区结束代码', html)
    if match:
        video_info.area = match.group(1)
        Log.write_stdout('region: %s', match.group(1))
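# All of the marker-based parse_* methods above share one pattern: capture the
# text between a Chinese start/end marker pair embedded in the page HTML. A
# hypothetical helper that would collapse them into a single call (a sketch,
# not part of the original parser):
def _extract_between(self, html, start_marker, end_marker):
    """Return the text between start_marker and end_marker, or None."""
    match = re.search(u'%s(.*)%s' % (start_marker, end_marker), html)
    return match.group(1) if match else None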