class MvImg(object):
    """Downloads movie cover images for rows of the `list` table and
    marks each row as fetched (img_status=1) once its image is saved."""

    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        # Best-effort: release the DB connection when the object dies.
        self.db.closeDB()

    def run(self):
        self.handle_data()

    def handle_data(self):
        """Fan the pending rows out to a 15-worker pool and block until
        every download has completed."""
        data = self.get_img_list()
        thread_pool = ThreadPoolExecutor(max_workers=15)
        task_list = list()
        for item in data:
            task = thread_pool.submit(self.__handle_data, item)
            task_list.append(task)
        for i in as_completed(task_list):
            result = i.result()

    def __handle_data(self, item):
        """Download one image and persist it as static/images/<id>.jpg.

        :param item: row dict with at least 'id' and 'img_src' keys
        :return: {"code": 0} on completion
        """
        url = "https://www.pelisplay.tv" + item['img_src']
        header = {
            "User-Agent": getUserAgent(),
            "Accept": "image/webp,image/apng,image/*,*/*;q=0.8"
        }
        data = curlData(url, header=header)
        with open("static/images/{id}.jpg".format(id=item['id']), "wb") as f:
            try:
                # curlData may return str; the file is binary so encode first.
                data = data.encode("utf-8")
            except Exception as e:
                # Already bytes: encode() fails and the payload is written as-is.
                debug(e)
            f.write(data)
            self.__update_data(item)
        # Fix: dropped the redundant explicit f.close() — the `with` block
        # already closes the file on exit.
        return {"code": 0}

    def __update_data(self, item):
        """Mark the row as downloaded (img_status=1); the shared DB handle
        is serialized with `lock`."""
        update_arr = {
            "table": "list",
            "set": {
                "img_status": 1
            },
            "condition": ['id={id}'.format(id=item['id'])]
        }
        lock.acquire()
        result = self.db.update(update_arr, is_close_db=False)
        lock.release()
        return result

    def get_img_list(self):
        """Return rows whose image has not been fetched yet (img_status=0)."""
        select_arr = {
            "table": "list",
            "columns": ["id", "img_src"],
            "condition": ['img_status=0']
        }
        data = self.db.select(select_arr, is_close_db=False)
        return data
class MvList(object):
    """Thin facade over the movie list: starts the list spider and can
    query every row of the `list` table."""

    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        # Release the DB connection when garbage-collected.
        self.db.closeDB()

    @classmethod
    def run(cls):
        """
        start get movie list
        :return:
        """
        MvListThread().run()

    def get_mv_list(self):
        """
        :return: every row of the `list` table
        """
        query = {"table": "list"}
        return self.db.select(query, is_close_db=False)
def __init__(self):
    # Spider used to fetch the pending recipe list rows.
    self.recipe_list = RecipeListSpider()
    # Explicit column schema so insert SQL generation can skip the
    # per-column type lookup round-trip.
    self.table_columns = (("id", "int"), ("img_url", "varchar"),
                          ("video_id", "varchar"), ("preparation", "longtext"),
                          ("ingredients", "text"), ("name", "varchar"),
                          ("list_id", "int"))
    # Count of items handled so far (incremented under a lock elsewhere).
    self.handle_num = 0
    self.db = DBConfig()
def __init__(self, flfgID, zlsxid, showDetailType, province, index):
    # Worker thread fetching one regulation document identified by
    # (flfgID, zlsxid) for a given province.
    threading.Thread.__init__(self)
    self.count = 0
    self.flfgID = flfgID
    self.zlsxid = zlsxid
    self.showDetailType = showDetailType
    self.province = province
    # Ordinal of this item in the batch, used for progress messages.
    self.index = index
    self.rv = 0
    self.db = DBConfig()
def __init__(self):
    self.db = DBConfig()
    # Explicit column schema so insert SQL generation can skip the
    # per-column type lookup round-trip.
    self.table_columns = (("id", "int"), ("img_src", "varchar"),
                          ("origin_src", "varchar"), ("url", "varchar"),
                          ("description", "text"),
                          ("description_poster", "text"), ("star", "varchar"),
                          ("title", "varchar"), ("page_views", "int"),
                          ("label", "text"), ("category_id", "int"),
                          ("director", "varchar"))
    # Pages processed so far (incremented under a lock elsewhere).
    self.handle_num = 0
class DbTest(object):
    """Small smoke test: build an INSERT statement and print it."""

    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        # Release the DB connection when garbage-collected.
        self.db.closeDB()

    def get_columns(self):
        """Generate an INSERT for a sample row and log the SQL text."""
        sample = {"name": "test"}
        statement = self.db.getInsertSql(sample, "type")
        debug(statement)
class RecipeContent(object):
    """Entry point that launches the recipe content crawler thread."""

    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        # Release the DB connection when garbage-collected.
        self.db.closeDB()

    @classmethod
    def run(cls):
        """Build the worker and start crawling recipe contents."""
        worker = RecipeContentThread()
        worker.run()
class MvTest(object):
    """Placeholder test harness for the movie spider; methods are stubs."""

    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        # Release the DB connection when garbage-collected.
        self.db.closeDB()

    def run(self):
        # Intentionally a no-op stub.
        pass

    def get_category(self):
        # Intentionally a no-op stub.
        pass
def __init__(self, page_list, category):
    """
    :param page_list: list of page numbers to crawl
    :param category: category row (dict with at least 'id' and 'keyword')
    """
    self.page_list = page_list
    self.category = category
    # Pages processed so far (incremented under a lock elsewhere).
    self.handle_num = 0
    # Explicit schema avoids repeated per-column type lookups when
    # building the insert SQL.
    self.table_columns = (("id", "int"), ("name", "varchar"),
                          ("url", "varchar"), ("img_url", "varchar"),
                          ("introduce", "text"), ("recipe_type_id", "int"),
                          ("status", "int"), ("page_views", "int"))
    self.db = DBConfig()
class MvContent(object):
    """Entry point that launches the movie content crawler thread."""

    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        # Release the DB connection when garbage-collected.
        self.db.closeDB()

    @classmethod
    def run(cls):
        """
        :return:
        """
        worker = MvContentThread()
        worker.run()
class ClearNullData(object):
    """Removes `content` rows whose video/url fields are empty, together
    with their parent rows in `list`."""

    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        # Release the DB connection when garbage-collected.
        self.db.closeDB()

    def run(self):
        self.handle()

    def handle(self):
        """Fetch one batch of empty rows, then purge list + content."""
        rows = self.get_data()
        debug(rows)
        self.del_list(rows)
        self.del_content(rows)

    def del_list(self, data):
        for row in data:
            self.__del_list(row)

    def __del_list(self, item):
        """Delete the parent `list` row of one empty content row."""
        query = {
            "table": "list",
            "condition": ["id={id}".format(id=item['parent_id'])]
        }
        return self.db.delete(query, is_close_db=False)

    def del_content(self, data):
        for row in data:
            self.__del_content(row)

    def __del_content(self, item):
        """Delete the empty `content` rows for one parent id."""
        query = {
            "table": "content",
            "condition": ["parent_id={id}".format(id=item['parent_id'])]
        }
        return self.db.delete(query, is_close_db=False)

    def get_data(self):
        """Return up to 10 content rows where video_src and url are empty."""
        query = {
            "table": "content",
            "limit": [0, 10],
            "condition": ["video_src=''", "and", "url=''"]
        }
        return self.db.select(query, is_close_db=False)
class HandleData(object):
    """Parses the cached category page and writes the category tree
    (parent nav entries plus their children) into the `type` table."""

    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        # Release the DB connection when garbage-collected.
        self.db.closeDB()

    def handle_category(self):
        """Read the cached HTML, then persist every category found."""
        category_li = self.__handle_category()
        self.__handle_category_data(category_li)

    @classmethod
    def __handle_category(cls):
        # Parse the locally cached category page rather than re-fetching it.
        with open("tmp/category_data.txt", "rb") as f:
            page_resource = f.read().decode("utf-8")
            f.close()
        bs_data = BeautifulSoup(page_resource, "html.parser")
        category_ul = bs_data.find_all("ul", attrs={"class": "sub-menu"})
        # only get the next level's li(tag), not include offspring(need to add 'recursive=False')
        return category_ul[0].find_all("li", recursive=False)

    def __handle_category_data(self, category_li, handle_type=1, parent_id=0):
        """Insert category rows; recurses one level for sub-menus.

        :param category_li: iterable of <li> tags
        :param handle_type: 1 for top level, 2 for children
        :param parent_id: id of the parent row when handle_type == 2
        """
        table_columns = (("id", "int"), ("name", "varchar"),
                         ("page_num", "longtext"), ("nav_type", "int"),
                         ("keyword", "varchar"), ("parent_id", "int"))
        for item in category_li:
            insert_arr = {"parent_id": 0, "nav_type": 2}
            try:
                href = item.find("a").attrs['href']
                try:
                    # e.g. "...?category=<keyword>"
                    insert_arr['keyword'] = re.findall('category=([\w\W]*.)', href)[0]
                except Exception as e:
                    debug(e)
                if handle_type == 2:
                    insert_arr['parent_id'] = parent_id
                if href == "#":
                    # "#" links are parent nav entries; store them as
                    # nav_type=1 and recurse into their children.
                    insert_arr['name'] = item.find("span").getText().strip()
                    insert_arr['nav_type'] = 1
                    sql = self.db.getInsertSql(insert_arr, "type", table_columns=table_columns)
                    lastest_id = self.db.insertLastId(sql, is_close_db=False)
                    if lastest_id == 0:
                        debug("get data error")
                        continue
                    self.__handle_category_data(item.find_all("li"), 2, lastest_id)
                else:
                    insert_arr['name'] = item.getText().strip()
                    sql = self.db.getInsertSql(insert_arr, "type", table_columns=table_columns)
                    self.db.insert(sql, is_close_db=False)
            except Exception as e:
                debug(e)
class MvListData(object):
    """Copies the `list` rows referenced by tmp_content into tmp_list."""

    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        # Release the DB connection when garbage-collected.
        self.db.closeDB()

    def run(self):
        self.handle()

    def handle(self):
        """For every tmp_content row, mirror its list row into tmp_list."""
        for row in self.get_data():
            self.__mv_data(row)

    def __mv_data(self, item):
        """Look up one `list` row and insert a copy into tmp_list.

        :param item: tmp_content row carrying 'list_id'
        :return: insert result, or None when the lookup fails
        """
        query = {
            "table": "list",
            "condition": ["id={id}".format(id=item['list_id'])]
        }
        try:
            row = self.db.select(query, is_close_db=False)[0]
        except Exception as e:
            debug(e)
            return
        sql = self.db.getInsertSql(row, "tmp_list")
        debug(sql)
        return self.db.insert(sql, is_close_db=False)

    def get_data(self):
        """All rows of tmp_content."""
        query = {
            "table": "tmp_content",
        }
        return self.db.select(query, is_close_db=False)
class RecipeGetPage(object):
    """Scrapes, per recipe category, the pagination numbers and stores
    them as JSON in the category's `page_num` column."""

    def __init__(self):
        self.db = DBConfig()
        recipe_type = RecipeType()
        # All known categories, fetched once up front.
        self.category = recipe_type.get()

    def __del__(self):
        # Release the DB connection when garbage-collected.
        self.db.closeDB()

    def get(self):
        """
        Fetch page lists for every known category.
        :return:
        """
        self.__get_recipe_page()

    def __get_recipe_page(self):
        """
        Iterate categories; failures are logged and do not stop the run.
        :return:
        """
        for item in self.category:
            url = CommonFunc().generate_url(category=item['keyword'])
            try:
                self.__get_recipe_page_data(url, item['id'])
            except Exception as e:
                debug("页面数量抓取出错,出错信息:{error}".format(error=e))

    def __get_recipe_page_data(self, url, recipe_category_id):
        """
        Scrape one category page and persist its page-number list.
        :param url: category listing URL
        :param recipe_category_id: id of the `type` row to update
        :return:
        """
        page_resource = curlData(url, open_virtual_ip=True)
        bs = BeautifulSoup(page_resource, "html.parser")
        page_ul = bs.find_all("ul", attrs={"class": "page-numbers"})
        # remove prev page and next page
        for k, v in enumerate(page_ul[0]('a', attrs={"class": "next"})):
            v.extract()
        page_a = page_ul[0].find_all("a")
        page_span = page_ul[0].find("span")
        # Build a comma-separated string of page numbers.
        page_list = ""
        for k, v in enumerate(page_a):
            if k == 0:
                page_list = page_list + str(v.get_text()).strip()
            else:
                page_list = page_list + "," + str(v.get_text()).strip()
        # The current page is rendered as a <span>; append it last.
        page_list = page_list + "," + page_span.get_text().strip()
        page_list = {"page_list": page_list}
        # update to mysql
        update_arr = {
            "table": "type",
            "set": {
                "page_num": json.dumps(page_list)
            },
            "condition": ['id={id}'.format(id=recipe_category_id)]
        }
        result = self.db.update(update_arr, is_close_db=False)
        if result == 1:
            debug("id为{id}的菜谱类型页面数据抓取成功".format(id=recipe_category_id))
        else:
            debug("id为{id}的菜谱类型页面数据抓取失败".format(id=recipe_category_id))
def __init__(self):
    self.db = DBConfig()
    # Cache all recipe categories up front.
    recipe_type = RecipeType()
    self.category = recipe_type.get()
class RecipeListSpider(object):
    """Walks every recipe category and spawns RecipeListThread workers to
    fetch each category's listing pages."""

    def __init__(self):
        self.db = DBConfig()
        self.recipe_type = RecipeType()

    def __del__(self):
        # Release the DB connection when garbage-collected.
        self.db.closeDB()

    def run(self):
        """
        start get recipe list
        :return:
        """
        self.get_recipe_list()

    def get_list(self, condition=None, limit=None):
        """Return the pending recipe rows (status=0).

        Fix: the defaults were mutable lists (``condition=[]``,
        ``limit=[]``) — the classic shared-mutable-default pitfall — and
        are now ``None``. Both parameters were and remain unused; they are
        kept only for call-site compatibility.

        :param condition: unused; reserved
        :param limit: unused; reserved
        :return: rows from `list` with status=0
        """
        select_arr = {"table": "list", "condition": ['status=0']}
        data = self.db.select(select_arr, is_close_db=False)
        return data

    def get_category(self):
        """
        get a category's all page num
        :return:
        """
        return self.recipe_type.get_category()

    def get_recipe_list(self):
        """
        :return:
        """
        self.__get_recipe_list()

    def __get_recipe_list(self):
        """
        Crawl every category in turn.
        :return:
        """
        info = self.get_category()
        for item in info:
            self.__get_recipe_list_child(item)

    def __set_status(self, category_id):
        """
        Mark a category as processed (status=1).
        :param category_id:
        :return:
        """
        update_arr = {
            "table": "type",
            "set": {
                "status": 1
            },
            "condition": ['id={category_id}'.format(category_id=category_id)]
        }
        result = self.db.update(update_arr, is_close_db=False)
        if result == 0:
            debug("更新状态出错, 出错原因:unknown")
        return

    def __get_recipe_list_child(self, info):
        """
        Crawl all pages of one category; categories without a page list
        or keyword are just marked processed.
        :param info: category row
        :return:
        """
        try:
            page_list = json.loads(info['page_num'])['page_list']
        except Exception as e:
            debug(e)
            self.__set_status(info['id'])
            return
        category = info['keyword']
        if category == "":
            self.__set_status(info['id'])
            return
        page_list = page_list.split(",")
        recipe_list_thread = RecipeListThread(page_list, info)
        recipe_list_thread.run()
        self.__set_status(info['id'])
def getConstitutionData(flfgID, zlsxid, province):
    """Fetch one regulation document and insert it into `constitutions`.

    :param flfgID: document id taken from the listing page's JS data
    :param zlsxid: document-type id from the same source
    :param province: province label stored alongside the record
    :return: DB insert result
    """
    # The detail page takes flfgID / zlsxid / keyword as GET parameters;
    # the first two are required and come from the listing page.
    flag = False  # becomes True when any part of the page fails to parse
    url = "http://law.npc.gov.cn:8081/FLFG/flfgByID.action"
    get = dict()
    get['flfgID'] = flfgID
    get['zlsxid'] = zlsxid
    get['keyword'] = ""
    get = urlencode(get)
    url = url + "?" + get
    data = curlData(url, get, url)
    try:
        data = data.decode("utf-8")
    except:
        # Already str (or undecodable); use as-is.
        pass
    handleDataAll = BeautifulSoup(data, "html.parser")
    handleData = handleDataAll.find_all("table")
    # DB column names and the matching Chinese labels in the header table.
    columns_list = [
        'type', "department_type", 'office', 'reference_num', 'issue_date',
        'execute_date', 'timeliness'
    ]
    columns_name_list = [
        '资料属性:', '部门分类:', '制定机关:', '颁布文号:', '颁布日期:', '施行日期:',
        '时 效 性:'
    ]
    # Header table: cells alternate label / value.
    try:
        table_data = handleData[0].find_all("td")
    except:
        table_data = "数据获取出错"
        flag = True
    type_data = dict()
    type_data['url'] = url
    for k, v in enumerate(table_data):
        try:
            if (k + 1) % 2 == 1:
                # Odd cells are labels; the following cell holds the value.
                type_data[columns_list[columns_name_list.index(
                    table_data[k].getText().strip())]] = table_data[
                        k + 1].getText().strip()
        except:
            type_data[columns_list[columns_name_list.index(
                table_data[k].getText().strip())]] = "数据获取出错"
    # Title
    try:
        type_data['title'] = handleDataAll.find_all(
            "div", attrs={"class": "bt"})[0].getText().strip()
    except:
        type_data['title'] = "标题获取出错"
        flag = True
    # Body content (stored as raw HTML)
    try:
        type_data['content'] = str(
            handleDataAll.find_all("div", attrs={"id": "content"})[0])
    except:
        flag = True
    type_data['province'] = province
    # Record whether any parse step failed.
    if flag:
        type_data['is_get_error'] = 1
    else:
        type_data['is_get_error'] = 0
    DB = DBConfig()
    sql = DB.getInsertSql(type_data, "constitutions")
    result = DB.insert(sql)
    return result
def __init__(self):
    self.db = DBConfig()
    # Helper that knows how to query recipe categories.
    self.recipe_type = RecipeType()
class MvListThread(object):
    """Crawls every movie-category listing (all of its pages) and stores
    each movie's metadata into the `list` table."""

    def __init__(self):
        self.db = DBConfig()
        # Explicit schema so insert SQL generation skips column-type lookups.
        self.table_columns = (("id", "int"), ("img_src", "varchar"),
                              ("origin_src", "varchar"), ("url", "varchar"),
                              ("description", "text"),
                              ("description_poster", "text"),
                              ("star", "varchar"), ("title", "varchar"),
                              ("page_views", "int"), ("label", "text"),
                              ("category_id", "int"), ("director", "varchar"))
        # Number of pages processed; guarded by `lock`.
        self.handle_num = 0

    def __del__(self):
        # Release the DB connection when garbage-collected.
        self.db.closeDB()

    def run(self):
        # One worker per category; each worker walks that category's pages.
        thread_pool = ThreadPoolExecutor(max_workers=10)
        task_list = list()
        category = self.get_category()
        for item in category:
            task_list.append(thread_pool.submit(self.handle_data, item))
        debug("本次线程数量:{length}".format(length=len(task_list)))
        for i in as_completed(task_list):
            result = i.result()
            if result['code'] == 0:
                debug("电影类型为 {category} 的数据抓取完毕".format(
                    category=result['category']))
        debug("处理了{length}个线程".format(length=self.handle_num))

    @classmethod
    def get_category(cls):
        """
        :return: all movie category rows
        """
        mv_category = MvCategory()
        return mv_category.get_category()

    @classmethod
    def get_data(cls, category):
        """
        Download one listing page.
        :param category: page URL (despite the name) fed to the URL builder
        :return: raw HTML
        """
        generate_url = GenerateUrl()
        # NOTE: 'domian' (sic) is the keyword expected by GenerateUrl.
        url = generate_url.generate_url(domian=category)
        page_resource = curlData(url, open_virtual_ip=True)
        return page_resource

    def handle_data(self, category):
        """
        Walk all pages of one category until there is no next page.
        :param category: category row; its 'url' is advanced page by page
        :return: {"code": 0, "category": <name>} when finished
        """
        result = self.handle_data_child(category)
        while result['code'] == 0:
            # code 0 means a next page exists; keep following it.
            category['url'] = result['url']
            result = self.handle_data_child(category)
        return {"code": 0, "category": category['name']}

    def handle_data_child(self, category):
        """
        Process a single listing page.
        :param category: category row with the current page in 'url'
        :return: {"code": 0|1, "url": next page url ("" when none)}
        """
        code = 0
        page_resource = self.get_data(category['url'])
        bs = BeautifulSoup(page_resource, "html.parser")
        mv_list = self.__get_li_list(bs)
        for item in mv_list:
            self.__handle_data(item, category)
        lock.acquire()
        self.handle_num = self.handle_num + 1
        lock.release()
        next_url = self.__get_next_url(bs)
        if next_url == "":
            code = 1
        return {"code": code, "url": next_url}

    def __handle_data(self, item, category):
        # Extract every field for one movie card and persist it.
        insert_arr = dict()
        insert_arr['img_src'] = self.__get_img_src(item)
        insert_arr['origin_src'] = self.__get_origin_src(item)
        insert_arr['url'] = self.__get_url(item)
        insert_arr['category_id'] = category['id']
        insert_arr['description'] = self.__get_description(item)
        insert_arr['description_poster'] = self.__get_description_poster(item)
        insert_arr['star'] = self.__get_star(item)
        insert_arr['title'] = self.__get_title(item)
        insert_arr['page_views'] = self.__get_page_views(item)
        insert_arr['director'] = self.__get_director(item)
        insert_arr['label'] = self.__get_label(item)
        result = self.__save_data(insert_arr)
        if result == 0:
            debug("数据存储出错")
        else:
            debug("电影 {name} --> 列表存储成功".format(name=insert_arr['title']))

    def __save_data(self, insert_arr):
        """
        Serialized insert on the shared DB handle.
        :param insert_arr: column -> value mapping
        :return: insert result (0 on failure)
        """
        lock.acquire()
        sql = self.db.getInsertSql(insert_arr, "list",
                                   table_columns=self.table_columns)
        result = self.db.insert(sql, is_close_db=False)
        lock.release()
        return result

    @classmethod
    def __get_li_list(cls, bs):
        """
        :param bs: parsed page
        :return: all movie cards (<figure> tags)
        """
        data = bs.find_all("figure")
        return data

    @classmethod
    def __get_img_src(cls, item):
        """
        :param item: movie card tag
        :return: cover image URL, "" on failure
        """
        img_src = item.find("img")
        try:
            img_src = img_src.attrs['src']
        except Exception as e:
            img_src = ""
            debug("电影封面图获取出错,出错信息:{error}".format(error=e))
        return img_src

    @classmethod
    def __get_origin_src(cls, item):
        """
        :param item: movie card tag
        :return: second image (small icon) URL, "" on failure
        """
        origin_src = item.find_all("img")
        try:
            origin_src = origin_src[1].attrs['src']
        except Exception as e:
            origin_src = ""
            debug("电影小图标获取出错,出错信息:{error}".format(error=e))
        return origin_src

    @classmethod
    def __get_url(cls, item):
        """
        :param item: movie card tag
        :return: detail-page URL, "" on failure
        """
        url = item.find("a")
        try:
            url = url.attrs['href']
        except Exception as e:
            url = ""
            debug("电影详情链接地址获取出错,出错信息:{error}".format(error=e))
        return url

    @classmethod
    def __get_description(cls, item):
        """
        :param item: movie card tag
        :return: long description text, "" on failure
        """
        description = item.find("div", attrs={"class": "Description"})
        try:
            description = description.find("div")
            description = description.get_text().strip()
        except Exception as e:
            description = ""
            debug("电影描述获取出错,出错信息:{error}".format(error=e))
        return description

    @classmethod
    def __get_description_poster(cls, item):
        """
        :param item: movie card tag
        :return: short description, "" on failure
        """
        description_poster = item.find("p",
                                       attrs={"class": "description_poster"})
        try:
            description_poster = description_poster.get_text().strip()
        except Exception as e:
            description_poster = ""
            debug("电影短述获取出错,出错信息:{error}".format(error=e))
        return description_poster

    @classmethod
    def __get_star(cls, item):
        """
        :param item: movie card tag
        :return: rating text, "" on failure
        """
        star = item.find("span", attrs={"class": "qualification"})
        try:
            star = star.get_text().strip()
        except Exception as e:
            star = ""
            debug("电影短述获取出错,出错信息:{error}".format(error=e))
        return star

    @classmethod
    def __get_title(cls, item):
        # Movie title, "" on failure.
        title = item.find("div", attrs={"class": "Title"})
        try:
            title = title.get_text().strip()
        except Exception as e:
            title = ""
            debug("电影标题获取出错,出错信息:{error}".format(error=e))
        return title

    @classmethod
    def __get_page_views(cls, item):
        """
        :param item: movie card tag
        :return: view count with separators/suffix stripped, 0 on failure
        """
        page_views = item.find("div", attrs={"class": "otros"})
        try:
            page_views = page_views.get_text().strip()
            page_views = page_views.replace(",", "")
            page_views = page_views.replace(" visitas", "")
        except Exception as e:
            page_views = 0
            debug("电影浏览量获取出错,出错信息:{error}".format(error=e))
        return page_views

    @classmethod
    def __get_director(cls, item):
        """
        :param item: movie card tag
        :return: director name, "" on failure
        """
        director = item.find_all("div", attrs={"class": "otros"})
        try:
            director = director[1]
            director = director.get_text().strip()
            director = director.replace("Director: ", "")
        except Exception as e:
            director = ""
            debug("电影导演获取出错,出错信息:{error}".format(error=e))
        return director

    @classmethod
    def __get_label(cls, item):
        """
        :param item: movie card tag
        :return: comma-joined tag labels, "" on failure
        """
        label = item.find("div", attrs={"class": "tipcategorias"})
        label = label.find_all("span")
        s = ""
        try:
            for k, v in enumerate(label):
                if k == 0:
                    s = s + v.get_text().strip()
                else:
                    s = s + "," + v.get_text().strip()
        except Exception as e:
            debug("电影标签获取出错,出错信息:{error}".format(error=e))
        return s

    @classmethod
    def __get_next_url(cls, bs):
        """
        :param bs: parsed page
        :return: next page URL, "" when on the last page
        """
        next_url = bs.find_all("ul", attrs={"class": "pager"})
        try:
            next_url = next_url[0].find("a", attrs={"rel": "next"})
            next_url = next_url.attrs['href']
        except Exception as e:
            next_url = ""
            debug("下一页url获取出错,出错信息:{error}".format(error=e))
        debug(next_url)
        return next_url
class ConstitutionThread(threading.Thread):
    """Worker thread: fetch one regulation document by (flfgID, zlsxid)
    and insert it into the `constitutions` table."""

    def __init__(self, flfgID, zlsxid, showDetailType, province, index):
        threading.Thread.__init__(self)
        self.count = 0
        self.flfgID = flfgID
        self.zlsxid = zlsxid
        self.showDetailType = showDetailType
        self.province = province
        # Ordinal of this item in the batch, used for progress messages.
        self.index = index
        self.rv = 0
        self.db = DBConfig()

    def __del__(self):
        # Release the DB connection when garbage-collected.
        self.db.closeDB()

    def run(self):
        debug("线程" + str(self.index) + "开始", True)
        global rv
        try:
            result = self.getConstitutionData(self.flfgID, self.zlsxid,
                                              self.showDetailType,
                                              self.province)
            if result != 1:
                tmp = "第" + str(self.index) + "条获取失败"
            else:
                tmp = "第" + str(self.index) + "条获取成功"
            # rv counts completed fetches across all threads.
            mylock.acquire()
            rv = rv + 1
            mylock.release()
        except:
            tmp = "第" + str(self.index) + "条获取失败"
        print(tmp)

    def getConstitutionData(self, flfgID, zlsxid, showDetailType, province):
        """Fetch, parse and persist one document; both the download and
        the insert retry forever until they succeed.

        :return: DB insert result
        """
        # The detail URL takes flfgID / zlsxid (from the listing page's
        # JS data), showDetailType and an empty keyword as GET parameters.
        flag = False  # True when any part of the page fails to parse
        url = "http://210.82.32.100:8081/FLFG/flfgByID.action"
        get = dict()
        get['flfgID'] = flfgID
        get['showDetailType'] = showDetailType
        get['zlsxid'] = zlsxid
        get['keyword'] = ""
        get = urlencode(get)
        url = url + "?" + get
        # Retry the download until it succeeds.
        while True:
            try:
                data = curlData(url, get, url)
                break
            except:
                pass
        try:
            data = data.decode("utf-8")
        except:
            # Already str (or undecodable); use as-is.
            pass
        handleDataAll = BeautifulSoup(data, "html.parser")
        handleData = handleDataAll.find_all("table")
        # DB column names and the matching Chinese labels in the header table.
        columns_list = [
            'type', "department_type", 'office', 'reference_num', 'issue_date',
            'execute_date', 'timeliness'
        ]
        columns_name_list = [
            '资料属性:', '部门分类:', '制定机关:', '颁布文号:', '颁布日期:', '施行日期:',
            '时 效 性:'
        ]
        # Header table: cells alternate label / value.
        try:
            table_data = handleData[0].find_all("td")
        except:
            table_data = "数据获取出错"
            flag = True
        type_data = dict()
        type_data['url'] = url
        for k, v in enumerate(table_data):
            try:
                if (k + 1) % 2 == 1:
                    # Odd cells are labels; the following cell is the value.
                    type_data[columns_list[columns_name_list.index(
                        table_data[k].getText().strip())]] = table_data[
                            k + 1].getText().strip()
            except:
                type_data[columns_list[columns_name_list.index(
                    table_data[k].getText().strip())]] = "数据获取出错"
        # Title
        try:
            type_data['title'] = handleDataAll.find_all(
                "div", attrs={"class": "bt"})[0].getText().strip()
        except:
            type_data['title'] = "标题获取出错"
            flag = True
        # Body content (stored as raw HTML)
        try:
            type_data['content'] = str(
                handleDataAll.find_all("div", attrs={"id": "content"})[0])
        except:
            flag = True
        type_data['province'] = province
        if flag:
            type_data['is_get_error'] = 1
        else:
            type_data['is_get_error'] = 0
        # Retry the insert until it succeeds; the DB handle is shared
        # across threads, so serialize with mylock.
        # NOTE(review): if the insert raises, mylock is not released before
        # the retry re-acquires it — potential deadlock; confirm mylock is
        # reentrant or fix the release path.
        while True:
            try:
                mylock.acquire()
                sql = self.db.getInsertSql(type_data, "constitutions")
                result = self.db.insert(sql, is_close_db=False)
                mylock.release()
                break
            except Exception as e:
                debug(e)
        return result

    def getRv(self):
        # Number of items processed so far (module-global counter).
        global rv
        return rv

    def reset(self):
        # Reset the module-global progress counter.
        global rv
        rv = 0
class RecipeListThread(object):
    """Fetches every page of one recipe category in parallel and stores
    each recipe found into the `list` table."""

    def __init__(self, page_list, category):
        """
        :param page_list: list of page numbers to crawl
        :param category: category row (dict with 'id' and 'keyword')
        """
        self.page_list = page_list
        self.category = category
        # Pages processed so far; guarded by `lock`.
        self.handle_num = 0
        # Explicit schema avoids repeated per-column type lookups when
        # building the insert SQL.
        self.table_columns = (("id", "int"), ("name", "varchar"),
                              ("url", "varchar"), ("img_url", "varchar"),
                              ("introduce", "text"), ("recipe_type_id", "int"),
                              ("status", "int"), ("page_views", "int"))
        self.db = DBConfig()

    def __del__(self):
        # Release the DB connection when garbage-collected.
        self.db.closeDB()

    def run(self):
        """
        start threading
        :return:
        """
        thread_pool = ThreadPoolExecutor(max_workers=10)
        task_list = list()
        # Submit one task per page.
        for p in self.page_list:
            task = thread_pool.submit(self.get_data, p)
            task_list.append(task)
        debug("本次线程数量:{length}".format(length=len(task_list)))
        # Block until every page is done.
        for i in as_completed(task_list):
            result = i.result()
            if result['code'] == 0:
                debug("{category} -- 第{page}页的数据获取完毕".format(
                    category=self.category['keyword'], page=result['page']))
        debug("处理了{length}个线程".format(length=self.handle_num))

    def get_data(self, page):
        """
        Download and process one listing page.
        :param page: page number
        :return: {"code": 0, "page": page}
        """
        global lock
        url = CommonFunc().generate_url(page, self.category['keyword'])
        page_resource = curlData(url, open_virtual_ip=True)
        self.handle_data(page_resource)
        lock.acquire()
        self.handle_num = self.handle_num + 1
        lock.release()
        return {"code": 0, "page": page}

    def handle_data(self, page_resource):
        """
        Parse one page and insert every recipe found.
        :param page_resource: raw HTML
        :return:
        """
        bs = BeautifulSoup(page_resource, "html.parser")
        li_list = self.__get_li_list(bs)
        # All workers share one DB handle, so the inserts are serialized.
        lock.acquire()
        for item in li_list:
            insert_arr = dict()
            insert_arr['recipe_type_id'] = self.category['id']
            insert_arr['status'] = 0
            insert_arr['img_url'] = self.__get_img_url(item)
            insert_arr['url'] = self.__get_url(item)
            insert_arr['introduce'] = self.__get_introduce(item)
            insert_arr['page_views'] = self.__get_page_views(item)
            insert_arr['name'] = self.__get_name(item)
            sql = self.db.getInsertSql(insert_arr, table="list",
                                       table_columns=self.table_columns)
            result = self.db.insert(sql, is_close_db=False)
            if result == 1:
                debug("插入成功")
            else:
                debug("插入失败")
        lock.release()

    @classmethod
    def __get_li_list(cls, bs):
        # Recipe cards live inside the category container div.
        container_list = bs.find_all("div",
                                     attrs={"class": "cs-recipes-category"})
        li_list = container_list[0].find_all("li", attrs={"class": "cs-recipe"})
        return li_list

    @classmethod
    def __get_img_url(cls, item):
        # Image URL of one recipe card, "" on failure.
        img_url = item.find("img")
        try:
            img_url = img_url.attrs['src']
        except Exception as e:
            img_url = ""
            debug("图片链接获取出错,错误信息:{error}".format(error=e))
        return img_url

    @classmethod
    def __get_url(cls, item):
        # Detail-page URL, "" on failure.
        url = item.find("a")
        try:
            url = url.attrs['href']
        except Exception as e:
            url = ""
            debug("详情页链接获取出错,错误信息:{error}".format(error=e))
        return url

    @classmethod
    def __get_introduce(cls, item):
        # Short introduction text (first <span>), "" on failure.
        introduce = item.find_all("span")
        try:
            introduce = introduce[0].get_text().strip()
        except Exception as e:
            introduce = ""
            debug("介绍获取出错,错误信息:{error}".format(error=e))
        return introduce

    @classmethod
    def __get_page_views(cls, item):
        # View count (second <span>), 0 on failure.
        page_views = item.find_all("span")
        try:
            page_views = page_views[1].get_text().strip()
            # Strip thousands separators and the " Plays" suffix.
            page_views = page_views.replace(",", "")
            page_views = page_views.replace(" Plays", "")
        except Exception as e:
            page_views = 0
            debug("浏览量获取出错,错误信息:{error}".format(error=e))
        return page_views

    @classmethod
    def __get_name(cls, item):
        # Recipe name (<h3>), "" on failure.
        name = item.find("h3")
        try:
            name = name.get_text().strip()
        except Exception as e:
            name = ""
            debug("菜谱名获取出错,错误信息:{error}".format(error=e))
        return name
class GetRecipeVideo(object):
    """Downloads recipe videos from YouTube for rows of `tmp_content`
    and provides helpers to migrate list/content rows into tmp tables."""

    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        # Release the DB connection when garbage-collected.
        self.db.closeDB()

    def run(self):
        self.download()

    def download(self):
        data = self.get_tmp_content()
        self.__download(data)

    def __download(self, data):
        # One sequential download per pending row; failures are logged and
        # the row keeps status=0 so it will be retried on the next run.
        for item in data:
            url = "https://www.youtube.com/watch?v=%s" % item['video_id']
            debug("开始抓取:--> {video_id}".format(video_id=item['video_id']))
            try:
                youtube = YouTube(url)
                youtube.streams.filter(subtype="mp4").first().download(
                    "/Users/cpx/code/py/recipe/data/recipe/",
                    filename=item['video_id'])
                self.__update_data(item['id'])
            except Exception as e:
                debug(e)

    def __update_data(self, list_id):
        """
        Mark a tmp_content row as downloaded (status=1).
        :param list_id: row id
        :return: update result
        """
        update_arr = {
            "table": "tmp_content",
            "set": {
                "status": 1
            },
            "condition": ['id={list_id}'.format(list_id=str(list_id))]
        }
        result = self.db.update(update_arr, is_close_db=False)
        return result

    def get_tmp_content(self):
        # Pending rows: status=0 means the video is not downloaded yet.
        data = self.db.select(
            {
                "table": "tmp_content",
                "columns": ['id', 'video_id'],
                "condition": ['status=0']
            },
            is_close_db=False)
        return data

    def handle_data(self):
        self.move_data()
        # data = self.get_data()

    def get_data(self):
        # All rows of recipe_content.
        select_arr = {"table": "recipe_content"}
        data = self.db.select(select_arr, is_close_db=False)
        return data

    def move_data(self):
        # Copy up to 20 recipes per category into tmp_content.
        category = self.get_category()
        for item in category:
            data = self.get_list_by_type_id(item['id'])
            self.__move_data(data)

    def __move_data(self, data):
        # For each list row, copy its first content row into tmp_content.
        for item in data:
            content = self.get_content_by_list_id(item['id'])
            try:
                content = content[0]
                content['status'] = 0
                self.__insert_data(content)
            except Exception as e:
                debug(e)

    def __insert_data(self, insert_arr):
        sql = self.db.getInsertSql(insert_arr, "tmp_content")
        result = self.db.insert(sql, is_close_db=False)
        return result

    def get_list_by_type_id(self, type_id):
        # First 20 list rows for one recipe category.
        data = self.db.select(
            {
                "table": "list",
                "condition":
                ['recipe_type_id={type_id}'.format(type_id=type_id)],
                "limit": [0, 20]
            },
            is_close_db=False)
        return data

    def get_content_by_list_id(self, list_id):
        # Content rows (video id only) belonging to one list row.
        data = self.db.select(
            {
                "table": "content",
                "columns": ['video_id', 'list_id'],
                "condition": ["list_id={list_id}".format(list_id=str(list_id))]
            },
            is_close_db=False)
        return data

    def get_category(self):
        # Categories that have a non-empty keyword.
        data = self.db.select({
            "table": "type",
            "condition": ['keyword<>""']
        }, is_close_db=False)
        return data
class GetImages(object):
    """Downloads recipe list images in parallel and marks each row as done
    (status=1) once its image is stored under static/images/."""

    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        # Best-effort: release the DB connection when the object dies.
        self.db.closeDB()

    def run(self):
        self.handle()

    def handle(self):
        data = self.get_data()
        self.get_images(data, "large", img_url="img_url_large")

    @classmethod
    def start_thread(cls, data, fun, path, img_url, prefix):
        """Run `fun(item, path, img_url, prefix)` for every item on a
        15-worker pool and collect the per-item results.

        :return: list of results
        """
        thread_pool = ThreadPoolExecutor(max_workers=15)
        task_list = list()
        result = list()
        for item in data:
            task = thread_pool.submit(fun, item, path, img_url, prefix)
            task_list.append(task)
        for i in as_completed(task_list):
            result.append(i.result())
        return result

    def get_images(self, data, path, img_url, prefix=""):
        """Download the image named by column `img_url` for every row.

        :param data: rows from `list`
        :param path: sub-directory under static/images/
        :param img_url: column holding the image URL
        :param prefix: optional URL prefix
        """
        self.start_thread(data, self.__get_images, path, img_url, prefix)

    def __get_images(self, item, path, img_url, prefix):
        """Fetch one image, save it, then flag the row as done."""
        page_resource = self.get_page_resource(prefix + item[img_url])
        with open(
                "static/images/{path}/{id}.jpg".format(path=path,
                                                       id=item['id']),
                "wb") as f:
            try:
                # Fetched payload may be str; the file is binary.
                page_resource = page_resource.encode("utf-8")
            except Exception as e:
                # Already bytes: encode() fails and it is written as-is.
                debug(e)
            f.write(page_resource)
        # Fix: dropped the redundant explicit f.close() — the `with` block
        # already closes the file on exit.
        update_data = {"status": 1}
        condition = ["id={id}".format(id=item['id'])]
        self.__update_data(update_data, "list", condition)

    @classmethod
    def get_page_resource(cls, url):
        # Raw HTTP payload for one image URL.
        data = curlData(url, open_virtual_ip=True)
        return data

    def __update_data(self, update_data, table, condition):
        """Serialized UPDATE on the shared DB handle."""
        update_arr = {
            "table": table,
            "set": update_data,
            "condition": condition
        }
        lock.acquire()
        self.db.update(update_arr, is_close_db=False)
        lock.release()

    def get_data(self):
        """Rows still waiting for their images (status=0)."""
        data = self.db.select(
            {
                "table": "list",
                "columns": ["id", "img_url", "img_url_large"],
                "condition": ["status=0"]
            },
            is_close_db=False)
        return data
class RecipeContentThread(object):
    """Fetches the detail page of every pending recipe, extracts the video
    id, image, preparation steps and ingredients, and stores them in the
    `content` table."""

    def __init__(self):
        self.recipe_list = RecipeListSpider()
        # Explicit schema so insert SQL generation skips column-type lookups.
        self.table_columns = (("id", "int"), ("img_url", "varchar"),
                              ("video_id", "varchar"),
                              ("preparation", "longtext"),
                              ("ingredients", "text"), ("name", "varchar"),
                              ("list_id", "int"))
        # Items processed so far; guarded by `lock`.
        self.handle_num = 0
        self.db = DBConfig()

    def __del__(self):
        # Release the DB connection when garbage-collected.
        self.db.closeDB()

    def run(self):
        """
        :return:
        """
        data = self.get_list()
        self.start(data)

    def start(self, data):
        """
        Fan items out to a 15-worker pool and block until completion.
        :param data:
        :return:
        """
        thread_pool = ThreadPoolExecutor(max_workers=15)
        task_list = list()
        for item in data:
            task = thread_pool.submit(self.handle_data, item)
            task_list.append(task)
        for i in as_completed(task_list):
            result = i.result()
        debug("本次处理成功 {num} 个线程".format(num=self.handle_num))

    def get_list(self):
        """
        :return: pending recipe list rows
        """
        data = self.recipe_list.get_list()
        return data

    def handle_data(self, item):
        """
        Process one recipe: fetch page, parse, store, mark status.
        :param item: recipe list row
        :return: {"code": 0}
        """
        page_resource = self.get_data(item)
        result = self.__handle_data(page_resource, item)
        if result['code'] == 0:
            debug("菜谱存储出错 --> {name}".format(name=item['name']))
        else:
            debug("菜谱存储成功 --> {name}".format(name=item['name']))
        # NOTE(review): result['code'] holds the insert result / last
        # insert id, not this row's id — __update_status probably should
        # receive item['id'] instead; confirm against the schema.
        self.__update_status(result['code'])
        lock.acquire()
        self.handle_num = self.handle_num + 1
        lock.release()
        return {"code": 0}

    @classmethod
    def get_data(cls, item):
        """
        :param item: recipe list row with relative 'url'
        :return: raw HTML of the detail page
        """
        url = CommonFunc().generate_content_url(item['url'])
        data = curlData(url, open_virtual_ip=True)
        return data

    def __handle_data(self, page_resource, item):
        """
        Extract all fields and insert one `content` row.
        :param page_resource: raw HTML
        :param item: source list row
        :return: {'code': insert result}
        """
        bs = BeautifulSoup(page_resource, "html.parser")
        insert_arr = dict()
        insert_arr['video_id'] = self.__get_video_id(bs)
        insert_arr['img_url'] = self.__get_img_url(page_resource)
        insert_arr['name'] = item['name']
        insert_arr['preparation'] = self.__get_preparation(bs)
        insert_arr['ingredients'] = self.__get_ingredients(bs)
        insert_arr['list_id'] = item['id']
        result = self.__save_data(insert_arr)
        return {'code': result}

    def __save_data(self, insert_arr):
        # Serialized insert on the shared DB handle.
        lock.acquire()
        sql = self.db.getInsertSql(insert_arr, table="content",
                                   table_columns=self.table_columns)
        result = self.db.insertLastId(sql, is_close_db=False)
        lock.release()
        return result

    def __update_status(self, recipe_list_id):
        # Mark the list row as processed (status=1).
        update_arr = {
            "table": "list",
            "set": {
                "status": "1"
            },
            "condition":
            ["id={recipe_list_id}".format(recipe_list_id=recipe_list_id)]
        }
        lock.acquire()
        self.db.update(update_arr, is_close_db=False)
        lock.release()

    @classmethod
    def __get_video_id(cls, bs):
        """
        :param bs: parsed page
        :return: YouTube embed id, "" on failure
        """
        video_id = bs.find("iframe")
        try:
            video_id = video_id.attrs['src']
            video_id = re.findall('https://www.youtube.com/embed/([\w\W]*?)\?',
                                  video_id)[0]
        except Exception as e:
            video_id = ""
            debug("视频播放id获取出错,错误信息:{error}".format(error=e))
        return video_id

    @classmethod
    def __get_img_url(cls, bs):
        # NOTE: works on the raw HTML string, not the parsed tree.
        try:
            img_url = re.findall('"image": "([\w\W]*?)"', str(bs))[0]
        except Exception as e:
            img_url = ""
            debug("菜谱图片链接获取出错,错误信息:{error}".format(error=e))
        return img_url

    @classmethod
    def __get_preparation(cls, bs):
        """
        :param bs: parsed page
        :return: inner HTML of the preparation <ul>, "" on failure
        """
        preparation = bs.find("div",
                              attrs={"class": "cs-recipe-single-preparation"})
        try:
            preparation = preparation.find("ul")
            preparation = str(preparation)
            preparation = re.findall("<ul>([\w\W]*?)<\/ul>", preparation)[0]
        except Exception as e:
            preparation = ""
            debug("菜谱做法获取出错,错误信息:{error}".format(error=e))
        return preparation

    @classmethod
    def __get_ingredients(cls, bs):
        # Ingredients are joined into one comma-separated string.
        ingredients = bs.find("div",
                              attrs={"class": "cs-ingredients-check-list"})
        ingredients_str = ""
        try:
            ingredients = ingredients.find("ul")
            ingredients = ingredients.find_all("li")
            for k, v in enumerate(ingredients):
                if k == 0:
                    ingredients_str = ingredients_str + v.get_text().strip()
                else:
                    ingredients_str = ingredients_str + "," + v.get_text(
                    ).strip()
        except Exception as e:
            ingredients_str = ""
            debug("配料获取出错,错误信息:{error}".format(error=e))
        return ingredients_str
def __init__(self): self.cookie = {} # self.get_cookie() self.db = DBConfig()
class MvCategory(object):
    """Scrape movie categories from the site index page and store them in `type`."""

    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        self.db.closeDB()

    def get_category(self):
        """Return all rows from the `type` table, or [] when empty."""
        select_arr = {"table": "type"}
        category = self.db.select(select_arr, is_close_db=False)
        if not category:
            return []
        return category

    def get(self):
        """Parse the cached index page and persist every category found."""
        # The page is read from a local cache; get_data() is the live fetch.
        # NOTE: the explicit f.close() from the original was redundant inside
        # the with-block and has been removed.
        with open("tmp/index_page.txt", "rb") as f:
            page_resource = f.read().decode("utf-8")
        bs = BeautifulSoup(page_resource, "html.parser")
        category_list = self.__get_category_list(bs)
        for item in category_list:
            self.handle_data(item)

    def handle_data(self, item):
        """Extract one category's fields and save them.

        :param item: a bs4 <li class="item"> node
        """
        insert_arr = dict()
        insert_arr['status'] = 0
        insert_arr['url'] = self.__get_category_url(item)
        insert_arr['img_src'] = self.__get_category_img_src(item)
        insert_arr['icon_img_src'] = self.__get_category_icon_img_src(item)
        insert_arr['name'] = self.__get_category_name(item)
        insert_arr['description'] = self.__get_category_description(item)
        if self.__save_data(insert_arr):
            debug("类型存储成功")
        else:
            debug("类型存储失败")

    def __save_data(self, insert_arr):
        """Insert one `type` row; True on success.

        (Renamed from the original ``__save_date`` typo; private, so no
        external callers are affected.)
        """
        table_columns = (("id", "int"), ("img_src", "varchar"),
                         ("icon_img_src", "varchar"), ("url", "varchar"),
                         ("name", "varchar"), ("description", "text"))
        sql = self.db.getInsertSql(insert_arr, table="type",
                                   table_columns=table_columns)
        result = self.db.insert(sql, is_close_db=False)
        if result == 0:
            return False
        return True

    @classmethod
    def get_data(cls):
        """Fetch the live index page HTML."""
        url = settings.DOMAIN
        data = curlData(url, open_virtual_ip=True)
        return data

    @classmethod
    def __get_category_list(cls, bs):
        """Return the <li class="item"> nodes of the category carousel, or []."""
        category_list = bs.find_all("ul", attrs={"class": "owl-carousel"})
        try:
            category_list = category_list[0].find_all("li", attrs={"class": "item"})
        except Exception as e:
            category_list = list()
            debug("类型列表获取失败,错误信息:{error}".format(error=e))
        return category_list

    @classmethod
    def __get_category_url(cls, item):
        """Return the category link href, or ""."""
        category_url = item.find("a")
        try:
            category_url = category_url.attrs['href']
        except Exception as e:
            category_url = ""
            debug("分类url链接获取失败,错误信息:{error}".format(error=e))
        return category_url

    @classmethod
    def __get_category_img_src(cls, item):
        """Return the category image src, or ""."""
        category_img_src = item.find("img")
        try:
            category_img_src = category_img_src.attrs['src']
        except Exception as e:
            category_img_src = ""
            debug("图片地址获取失败,错误信息:{error}".format(error=e))
        return category_img_src

    @classmethod
    def __get_category_icon_img_src(cls, item):
        """Return the icon src — same <img> node as the main image, kept as a
        separate accessor so the two columns can diverge later."""
        category_icon_img_src = item.find("img")
        try:
            category_icon_img_src = category_icon_img_src.attrs['src']
        except Exception as e:
            category_icon_img_src = ""
            debug("icon图片地址获取失败,错误信息:{error}".format(error=e))
        return category_icon_img_src

    @classmethod
    def __get_category_name(cls, item):
        """Return the stripped category name text, or ""."""
        category_name = item.find("div", attrs={"class": "category-name"})
        try:
            category_name = category_name.get_text().strip()
        except Exception as e:
            category_name = ""
            debug("类型名获取失败,错误信息:{error}".format(error=e))
        return category_name

    @classmethod
    def __get_category_description(cls, item):
        """Return the stripped category description text, or ""."""
        category_description = item.find(
            "div", attrs={"class": "category-description"})
        try:
            category_description = category_description.get_text().strip()
        except Exception as e:
            category_description = ""
            debug("类型描述获取失败,错误信息:{error}".format(error=e))
        return category_description
def __init__(self): self.db = DBConfig()
class RecipeType(object):
    """Load (and lazily scrape) the recipe category tree stored in `type`."""

    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        self.db.closeDB()

    def get(self):
        """Return leaf categories (nav_type=2), scraping first if the table is empty."""
        select_arr = {"table": "type", "condition": ['nav_type=2']}
        data = self.db.select(select_arr, is_close_db=False)
        # check whether we have any data; if not, scrape the whole category tree
        if not data:
            self.get_recipe_type()
            data = self.db.select(select_arr, is_close_db=False)
        return data

    def get_category(self):
        """Return all unprocessed categories (status=0)."""
        select_arr = {"table": "type", "condition": ['status=0']}
        return self.db.select(select_arr, is_close_db=False)

    def get_recipe_type(self):
        """Scrape the category menu from the site and store it."""
        category_li = self.__handle_category()
        self.__handle_category_data(category_li)

    @classmethod
    def __handle_category(cls):
        """Fetch the site menu and return its top-level <li> nodes."""
        url = CommonFunc().generate_url()
        page_resource = curlData(url, open_virtual_ip=True)
        bs_data = BeautifulSoup(page_resource, "html.parser")
        category_ul = bs_data.find_all("ul", attrs={"class": "sub-menu"})
        # only the direct <li> children, not all descendants (recursive=False)
        return category_ul[0].find_all("li", recursive=False)

    def __handle_category_data(self, category_li, handle_type=1, parent_id=0):
        """Insert categories; recurses into sub-menus for parent ("#") entries.

        :param category_li: iterable of <li> nodes
        :param handle_type: 1 = top level, 2 = children of a parent category
        :param parent_id: `type.id` of the parent row when handle_type == 2
        """
        table_columns = (("id", "int"), ("name", "varchar"),
                         ("page_num", "longtext"), ("nav_type", "int"),
                         ("keyword", "varchar"), ("parent_id", "int"))
        for item in category_li:
            insert_arr = {"parent_id": 0, "nav_type": 2}
            try:
                href = item.find("a").attrs['href']
                try:
                    # raw string: avoids invalid-escape warnings for \w \W
                    insert_arr['keyword'] = re.findall(r'category=([\w\W]*.)',
                                                       href)[0]
                except Exception as e:
                    debug(e)
                if handle_type == 2:
                    insert_arr['parent_id'] = parent_id
                if href == "#":
                    # "#" links are parent menus: store as nav_type=1 and recurse
                    insert_arr['name'] = item.find("span").getText().strip()
                    insert_arr['nav_type'] = 1
                    sql = self.db.getInsertSql(insert_arr, "type",
                                               table_columns=table_columns)
                    latest_id = self.db.insertLastId(sql, is_close_db=False)
                    if latest_id == 0:
                        debug("get data error")
                        continue
                    self.__handle_category_data(item.find_all("li"), 2, latest_id)
                else:
                    insert_arr['name'] = item.getText().strip()
                    sql = self.db.getInsertSql(insert_arr, "type",
                                               table_columns=table_columns)
                    self.db.insert(sql, is_close_db=False)
            except Exception as e:
                debug(e)
class GetImgUrlLarge(object): def __init__(self): self.db = DBConfig() def __del__(self): self.db.closeDB() def run(self): self.handle() def handle(self): data = self.get_data() for item in data: result = self.__handle(item) if result['result_1'] == 1 and result['result_2'] == 1 and result[ 'result_3'] == 1: debug(item['img_url']) else: break def __handle(self, item): img_url = item['img_url'] try: s = re.findall('squarethumbnails\/([\w\W]*.)', img_url)[0] except Exception as e: s = '' debug(e) if s == '': return s = 'http://www.laurainthekitchen.com/largethumbnails/' + s result = self.__update_data(s, item) return result def __update_data(self, s, item): update_arr_list = { "table": "list", "set": { "img_url_large": s, "status": 1 }, "condition": ["id={id}".format(id=item['id'])] } result_1 = self.db.update(update_arr_list, is_close_db=False) del update_arr_list['set']['status'] update_arr_list['table'] = "tmp_list" result_2 = self.db.update(update_arr_list, is_close_db=False) update_arr_list['table'] = "content" update_arr_list['condition'] = ["list_id={id}".format(id=item['id'])] result_3 = self.db.update(update_arr_list, is_close_db=False) return { "result_1": result_1, "result_2": result_2, "result_3": result_3 } def get_data(self): select_arr = { "table": "list", "columns": ["img_url", "id"], "condition": ["status=0"] } data = self.db.select(select_arr, is_close_db=False) return data
class GetVideoSrc(object):
    """Resolve and store the real video source URL for every pending `content` row."""

    def __init__(self):
        self.cookie = {}
        # self.get_cookie()
        self.db = DBConfig()

    def __del__(self):
        self.db.closeDB()

    def run(self):
        data = self.get_content_list()
        self.handle_data(data)

    def handle_data(self, data):
        """Fan pending rows out over a thread pool; rows without a URL are skipped.

        :param data: iterable of content rows with 'id' and 'url'
        """
        # `with` guarantees the pool is shut down even if a worker raises
        # (the original leaked the executor).
        with ThreadPoolExecutor(max_workers=15) as thread_pool:
            task_list = list()
            for item in data:
                if item['url'] == '':
                    continue
                task = thread_pool.submit(self.__handle_data, item)
                task_list.append(task)
            for i in as_completed(task_list):
                result = i.result()

    def __handle_data(self, item):
        """Fetch one row's video src and persist it with status=1."""
        update_data = dict()
        update_data['status'] = 1
        update_data['video_src'] = self.__get_video_src(item)
        debug(update_data['video_src'])
        self.__update_data(item['id'], update_data)
        return {"code": 0}

    def __get_video_src(self, item):
        """Scrape the file URL out of the player's JSON.parse('...') blob, or ""."""
        header = {
            "User-Agent": getUserAgent(),
        }
        data = curlData(url=item['url'], header=header, cookie=self.cookie)
        try:
            # raw string: avoids invalid-escape warnings for \. \( \) \w \W
            src = re.findall(r"JSON\.parse\('([\w\W]*?)'\)\);", data)[0]
            # the embedded blob is backslash-escaped; strip escapes before parsing
            src = src.replace("\\", "")
            src = json.loads(src)
            src = src[0]['file']
        except Exception as e:
            src = ""
            debug(e)
        return src

    def __update_data(self, content_id, update_data):
        """Apply *update_data* to one content row; serialized with the shared lock."""
        update_arr = {
            "table": "content",
            "set": update_data,
            "condition": ['id={content_id}'.format(content_id=content_id)]
        }
        lock.acquire()
        result = self.db.update(update_arr, is_close_db=False)
        lock.release()
        return result

    def get_content_list(self):
        """Return pending content rows (status=0) with id and url."""
        data = self.db.select({
            "table": "content",
            "columns": ['id', 'url'],
            "condition": ['status=0']
        }, is_close_db=False)
        return data

    def get_cookie(self):
        """Prime self.cookie from the site's home page (currently disabled in __init__)."""
        header = {
            "User-Agent": getUserAgent(),
        }
        url = "https://www.pelisplay.tv/"
        self.cookie = getCookie(url, header=header)
        debug(self.cookie)