def __init__(self):
    """Set up empty scan state and open the media database.

    Initialises ``keyword`` to an empty string and ``page_num`` to 0, then
    creates a ``QvodSql`` object and opens ``db_media.db`` through it.
    """
    # Scan state: nothing searched yet, so there are no result pages.
    self.keyword = ""
    self.page_num = 0
    # Database handle; assigned before opening so the attribute exists
    # even if open() fails.
    self.qvod_sql = QvodSql()
    self.qvod_sql.open("db_media.db")
class QvodScan(object):
    """Fetch media listing pages, extract QVOD entries and store them.

    The class downloads HTML through ``urllib2``, converts it to UTF-8
    encoded lines, pulls fields out with regular expressions keyed on the
    HTML comment markers in the pages, and writes the results to the
    ``medias`` table through ``QvodSql``.

    Attributes:
        page_num: number of result pages computed by the last page-count scan.
        keyword: keyword passed to the last ``scan()`` call.
        qvod_sql: ``QvodSql`` handle opened on ``db_media.db``.
    """

    def __init__(self):
        """Reset the scan state and open the media database."""
        self.page_num = 0
        self.keyword = ""
        self.qvod_sql = QvodSql()
        self.qvod_sql.open("db_media.db")

    def scan(self, scan_keyword):
        """Search for *scan_keyword* and compute the number of result pages.

        Returns:
            True when a page was downloaded (``page_num`` is then updated),
            False when nothing could be read (``page_num`` is reset to 0).
        """
        self.keyword = scan_keyword  # remembered for get_qvod_info()
        read_buffer = self.__open_url_addr(scan_keyword)
        if not read_buffer:
            self.page_num = 0
            return False
        string_list = self.__read_buffer_to_code(read_buffer)
        self.__get_scan_page_num(string_list)
        return True

    def get_main_index_info(self):
        """Crawl every index in ``scan_index_dict`` and save new entries.

        For each index template the first page is fetched to work out the
        page count, then every page is fetched (with a 3 second pause before
        each request), each entry is completed with ``get_qvod_addr`` and
        inserted into the ``medias`` table unless ``check_save_bool`` reports
        a duplicate. The database handle is closed when the crawl finishes.
        """
        import time  # hoisted: the original re-ran this import on every page

        for key in scan_index_dict:
            print("key: %s" % (key,))
            scan_index_html = scan_index_dict[key]
            # The first page is only used to compute self.page_num.
            read_buffer = self.__open_url_addr(scan_index_html % 1, None)
            string_list = self.__read_buffer_to_code(read_buffer)
            self.__get_scan_page_num(string_list)
            for index in range(1, self.page_num + 1):
                time.sleep(3)  # pause between page requests
                read_buffer = self.__open_url_addr(scan_index_html % index, None)
                string_list = self.__read_buffer_to_code(read_buffer)
                for info in self.__scan_get_qvod_info(string_list):
                    # Fills in qvod address, image, state, remark, actor
                    # and director on the entry.
                    self.get_qvod_addr(info.addr, info)
                    # Save to the "medias" table of the media database.
                    # NOTE(review): the flattened source does not show which
                    # "if" the failure message belonged to; it is attached to
                    # the duplicate check because the text reports duplicate
                    # data. Confirm against the original layout.
                    if self.check_save_bool(info):
                        if self.qvod_sql.insert_data("medias", info):
                            print("%s 保存到数据库中成功 共 %s 页 当前 %s 页"
                                  % (info.name, self.page_num, index))
                    else:
                        print("%s 保存失败,有相同的数据!!" % (info.name,))
        self.qvod_sql.close()

    def check_save_bool(self, info):
        """Return True when *info* is not yet present in the ``medias`` table.

        A stored row counts as a duplicate when its columns 1, 2 and 5 equal
        ``info.name``, ``info.area`` and ``info.type`` respectively.
        """
        if self.qvod_sql.select_data("medias"):
            for sql_info in self.qvod_sql.get_query_data():
                if (sql_info[1] == info.name
                        and sql_info[2] == info.area
                        and sql_info[5] == info.type):
                    # One duplicate is enough; no need to scan the rest.
                    return False
        return True

    def get_qvod_addr(self, go_addr, info):
        """Fetch the detail page *go_addr* and fill extra fields on *info*.

        Sets ``actor``, ``direct``, ``image``, ``other``, ``state`` and
        ``qvod_addr`` on *info* for every line of the page that matches the
        corresponding pattern. *info* is modified in place; nothing is
        returned.
        """
        qvod_addr_patter = r"<a>(.+)[\||</a>]"
        image_patter = 'src="http://(.+)" width'
        other_patter = '<!--影片介绍开始代码-->(.+)<!--影片介绍结束代码-->'
        state_patter = '<!--影片状态开始代码-->(.+)<!--影片状态结束代码-->'
        actor_patter = '<!--影片演员开始代码-->(.+)<!--影片演员结束代码-->'
        direct_patter = '<!--影片导演开始代码-->(.+)<!--影片导演结束代码-->'

        read_buffer = self.__open_url_addr(go_addr, None)
        string_list = self.__read_buffer_to_code(read_buffer)
        for line in string_list:
            # Actors.
            actor_result = self.__scan_findall(actor_patter, line)
            if actor_result:
                info.actor = actor_result[0]
            # Director.
            direct_result = self.__scan_findall(direct_patter, line)
            if direct_result:
                info.direct = direct_result[0]
            # Image address (the pattern drops the scheme, so add it back).
            image_result = self.__scan_findall(image_patter, line)
            if image_result:
                info.image = "http://" + image_result[0]
            # Remark text, without a wrapping <p>...</p> or trailing </br>.
            # NOTE(review): the </br> strip is treated as independent of the
            # <p> strip; the flattened source does not show the nesting.
            other_result = self.__scan_findall(other_patter, line)
            if other_result:
                other_text = other_result[0].strip()
                if other_text.startswith("<p>") and other_text.endswith("</p>"):
                    other_text = other_text[3:-4]
                if other_text.endswith("</br>"):
                    other_text = other_text[:-5]
                info.other = other_text
            # Playback state.
            state_result = self.__scan_findall(state_patter, line)
            if state_result:
                info.state = state_result[0]
            # All QVOD addresses, accumulated as a comma separated string.
            # NOTE(review): the nesting of the split loop relative to the
            # endswith("|") check is not recoverable from the flattened
            # source; it is kept as a sibling here. The "+=" relies on
            # info.qvod_addr already being a string - confirm QvodInfo
            # provides that default.
            qvod_addr_result = self.__scan_findall(qvod_addr_patter, line)
            if qvod_addr_result:
                if line.endswith("|"):
                    info.qvod_addr = qvod_addr_result[0] + "|"
                for result in qvod_addr_result[0].split("checked/> <a>"):
                    # When the marker is absent find() returns -1 and the
                    # slice drops the last character (original behaviour).
                    qvod_result_find = result[:result.find("</a><!--")]
                    info.qvod_addr += qvod_result_find + ","

    def get_qvod_info(self, index):
        """Return the entries found on result page *index* of the last keyword."""
        read_buffer = self.__open_url_addr(self.keyword, index)
        string_list = self.__read_buffer_to_code(read_buffer)
        return self.__scan_get_qvod_info(string_list)

    def __scan_get_qvod_info(self, string_list):
        """Parse listing lines into a list of ``QvodInfo`` objects.

        An entry is emitted once address, name, area, type and date have all
        been seen since the previous entry; each emitted object carries those
        five attributes.
        """
        qvod_info = QvodInfo()
        qvod_info_list = []
        # Patterns keyed on the HTML comment markers around each field.
        addr_patter = r'value="<!--影片链接开始代码-->(.+)<!--影片链接结束代码-->'
        name_patter = r'<!--影片名称开始代码-->(.+)<!--影片名称结束代码-->'
        last_name_patter = r'<!--影片副标开始代码-->(.+)<!--影片副标结束代码-->'
        area_patter = r'<!--影片地区开始代码-->(.+)<!--影片地区结束代码-->'
        type_patter = r'<!--影片类型开始代码-->(.+)<!--影片类型结束代码-->'
        date_patter = r'<!--上映日期开始代码-->(.+)<!--上映日期结束代码-->'
        line_index = 0  # number of field matches seen so far
        add_info_bool = [False, False, False, False, False]
        # Bound up front so a subtitle seen before any title cannot raise
        # an unbound-local error on "name += ...".
        name = ""

        for line in string_list:
            # Address.
            scan_addr_result = self.__scan_findall(addr_patter, line)
            if scan_addr_result:
                addr = scan_addr_result[0]
                line_index += 1
                add_info_bool[0] = True
            # Name.
            scan_name_result = self.__scan_findall(name_patter, line)
            if scan_name_result:
                name = scan_name_result[0]
                line_index += 1
                add_info_bool[1] = True
            # Subtitle is appended to the name.
            # NOTE(review): treated as independent of the name match; the
            # flattened source does not show whether it was nested.
            scan_last_name_result = self.__scan_findall(last_name_patter, line)
            if scan_last_name_result:
                name += scan_last_name_result[0]
            # Area.
            scan_area_result = self.__scan_findall(area_patter, line)
            if scan_area_result:
                area = scan_area_result[0]
                line_index += 1
                add_info_bool[2] = True
            # Type.
            scan_type_result = self.__scan_findall(type_patter, line)
            if scan_type_result:
                type_ = scan_type_result[0]
                line_index += 1
                add_info_bool[3] = True
            # Date.
            scan_date_result = self.__scan_findall(date_patter, line)
            if scan_date_result:
                date = scan_date_result[0]
                line_index += 1
                add_info_bool[4] = True
            # Emit the entry once a full group of five fields is complete.
            if line_index % 5 == 0 and self.__add_info_check(add_info_bool):
                qvod_info.addr = addr
                qvod_info.date = date
                qvod_info.type = type_
                qvod_info.area = area
                qvod_info.name = name
                qvod_info_list.append(qvod_info)
                # Reset the flags and start a fresh entry object.
                add_info_bool = [False] * 5
                qvod_info = QvodInfo()
        return qvod_info_list

    def __add_info_check(self, add_info_bool):
        """Return True only when every flag in *add_info_bool* is set.

        The previous loop overwrote its result on each iteration and so only
        reflected the last flag.
        """
        return all(add_info_bool)

    def __read_buffer_to_code(self, read_buffer):
        """Convert a downloaded page to a list of UTF-8 encoded lines.

        The buffer is decoded as gb2312 first; if that fails it is decoded as
        gbk with undecodable bytes ignored.
        """
        try:
            return self.__to_code_utf_8(read_buffer).split("\n")
        except UnicodeError:
            # Not valid gb2312: fall back to the lenient gbk decode.
            return read_buffer.decode('gbk', 'ignore').encode('utf-8').split("\n")

    def __open_url_addr(self, scan_keyword, index=1):
        """Download a page and return its raw content.

        With a truthy *index* the request is a keyword search for that result
        page (keyword converted to gb2312 and URL-quoted). With a falsy
        *index* (``None`` or 0) *scan_keyword* is appended to ``MAIN_HTML``
        as a path.

        Returns:
            The response body, or "" when ``is_network_connected()`` is false.
        """
        if index:
            keyword = urllib.quote(self.__to_code_gb2312(scan_keyword))
            scan_html = SCAN_HTML_PAGE % (index, keyword)
        else:
            scan_html = MAIN_HTML + scan_keyword
        if not is_network_connected():
            return ""
        url_open = urllib2.urlopen(scan_html)
        try:
            return url_open.read()
        finally:
            # Release the connection even when read() raises.
            url_open.close()

    def __scan_findall(self, patter, patter_string):
        """Return ``re.findall(patter, patter_string)``."""
        return re.findall(patter, patter_string)

    def __get_scan_page_num(self, string_list):
        """Work out ``page_num`` from the lines of a result page.

        Counts the lines carrying a link marker that appear before the line
        holding the total record count, treats that as the page size, and
        stops at the record-count line.
        """
        page_num_patter = r'>(.+)条记录'
        page_max_patter = r'value="<!--影片链接开始代码-->'
        scan_record_sum = 0
        page_max = 0

        for line in string_list:
            # Each link marker is one entry on this page.
            if self.__scan_findall(page_max_patter, line):
                page_max += 1
            # Total record count; nothing after it is needed.
            scan_result = self.__scan_findall(page_num_patter, line)
            if scan_result:
                scan_record_sum = int(scan_result[0].strip())
                break
        self.__scan_record_sum_to_page_num(scan_record_sum, page_max)

    def __scan_record_sum_to_page_num(self, scan_record_sum, page_max):
        """Set ``page_num`` to ceil(scan_record_sum / page_max).

        When *page_max* is not positive there is nothing to page through, so
        ``page_num`` is reset to 0 instead of keeping the value left over
        from an earlier scan.
        """
        if page_max <= 0:
            self.page_num = 0
            return
        full_pages, remainder = divmod(scan_record_sum, page_max)
        if remainder > 0:
            full_pages += 1  # a partially filled last page still counts
        self.page_num = full_pages

    def __to_code_gb2312(self, keyword):
        """Re-encode a UTF-8 byte string as gb2312."""
        return keyword.decode("utf-8").encode("gb2312")

    def __to_code_utf_8(self, string):
        """Re-encode a gb2312 byte string as UTF-8."""
        return string.decode('gb2312').encode("utf-8")

    def __get_movie_name(self, qvod_url):
        """Return the third ``|``-separated field of *qvod_url* (scheme removed)."""
        url = qvod_url.replace("qvod://", "")
        return url.split("|")[2]

    def __get_hash_str(self, qvod_url):
        """Return the second ``|``-separated field of *qvod_url* (scheme removed)."""
        url = qvod_url.replace("qvod://", "")
        return url.split("|")[1]