def executeByInsOrUpdOrDel(self, connection, cursor, sql: str, values=None):
    """Execute a single INSERT/UPDATE/DELETE statement and commit it.

    :param connection: open pymysql connection; committed on success,
        rolled back on failure
    :param cursor: cursor obtained from ``connection``
    :param sql: SQL statement to execute
    :param values: optional bind values — a ``tuple`` for a single row
        (``cursor.execute``) or a ``list`` of row tuples for a batch
        (``cursor.executemany``); ``None`` runs the statement without
        parameters
    :return: ``cursor.fetchall()`` result on success, ``[]`` when a
        required argument is missing, ``None`` on error
    """
    try:
        res = []
        if not (connection and cursor and sql):
            logger.info("None")
            return res
        # Dispatch on the shape of the bind values. A statement with no
        # values is executed as-is (previously it was silently skipped).
        if isinstance(values, tuple):
            cursor.execute(sql, values)
        elif isinstance(values, list):
            cursor.executemany(sql, values)
        else:
            cursor.execute(sql)
        connection.commit()
        res = cursor.fetchall()
        logger.info("操作成功")
        logger.info("执行结果:{}".format(res))
        return res
    except Exception as e:
        logger.error("操作失败:{}".format(e))
        # BUG FIX: was ``connection.roback()`` — an AttributeError here
        # masked the original database error and skipped the rollback.
        connection.rollback()
        return None
def get(self):
    """Handle GET: run a ZeroCrawler search for ``?key=search&search=<terms>``.

    Returns a ``{"code", "msg", "data"}`` payload. Any unexpected error
    is logged and the method implicitly returns ``None``.
    """
    try:
        # Anything other than key=search is deliberately ignored.
        if request.args.get("key", None) != "search":
            response = {"code": 1000, "msg": "不做处理", "data": []}
            logger.info("请求成功:{}".format(response))
            return response
        search_terms = request.args.get("search", None)
        if not search_terms:
            response = {"code": 1002, "msg": "error", "data": []}
            logger.info("请求失败:{}".format(response))
            return response
        crawl_result = ZeroCrawler(search_terms).start()
        print(crawl_result)
        response = {"code": 200, "msg": "OK", "data": crawl_result}
        logger.info("请求成功:{}".format(response))
        return response
    except Exception as e:
        logger.error(e)
def download_page(self, item):
    """Download the page for ``item.url`` and forward the item downstream.

    The item's ``type`` field marks the crawl state:
    0: start page, 1: global list page, 2: extract download links.

    The URL is skipped when it is the literal string "None" or when the
    HTML is already cached in redis (dedup). On a fresh download the HTML
    is stored on ``item.content`` and cached with a TTL, then the item is
    pushed onto the ``message_b`` redis list for the extraction stage.
    Errors are logged, never raised.
    """
    item = MyDict(item)
    try:
        print(f" download_page info: {item.url}")
        logger.info("开始下载网页!{}。类型:{}".format(item.url, item.type))
        old_type = item.type
        # Skip placeholder URLs and URLs whose HTML is already cached.
        if item.url != "None" and not cache.get(item.url):
            html_obj = requests.get(item.url, headers=self.headers)
            # NOTE(review): assumes the site serves GBK-encoded pages —
            # confirm; a non-GBK page would raise here and only be logged.
            html_str = html_obj.content.decode("gbk")
            item.content = html_str
            print(len(html_str))
            # Cache the raw HTML in redis with an expiry so the same URL
            # is not re-downloaded within the timeout window.
            cache.set(item.url, html_str)
            cache.expire(item.url, self.redis_key_timeout)
            logger.info("下载前类型:{}, 下载后类型:{}".format(old_type, item.type))
            # Hand the downloaded item to the extraction queue.
            self.push_item_in_redis_list(self.message_b, item)
    except Exception as e:
        logger.error("download_page: {}".format(e))
def executeBySelect(self, connection, cursor, sql: str):
    """Run a SELECT statement and return every fetched row.

    :param connection: database connection (only checked for truthiness)
    :param cursor: cursor used to execute and fetch
    :param sql: SELECT statement to run
    :return: list of rows; ``[]`` when an argument is missing or on error
    """
    try:
        # Guard clause: bail out early when any required argument is absent.
        if not (connection and cursor and sql):
            logger.info("缺少关键参数")
            return []
        cursor.execute(sql)
        rows = cursor.fetchall()
        logger.info("操作成功")
        logger.info("执行结果:{}".format(rows))
        return rows
    except Exception as e:
        logger.error(e)
        return []
def get(self):
    """Trigger a crawler run via ``GET ?request_type=...``.

    Supported values: "download_dytt" (DyHeavenCrawler) and
    "download_proxy" (ProxyCrawler); any other value gets a
    code-2001 "not worker" response.
    :return: a ``{"code", "msg", "data"}`` dict, or an error string
        for non-GET requests
    """
    if request.method != "GET":
        logger.error("error, method not allowed!")
        return "error, method not allowed!"
    # Dispatch table instead of an if-chain; both branches share shape.
    crawler_by_key = {
        "download_dytt": DyHeavenCrawler,
        "download_proxy": ProxyCrawler,
    }
    key = request.args.get("request_type", None)
    crawler_cls = crawler_by_key.get(key)
    if crawler_cls is None:
        return {"code": 2001, "msg": "not worker", "data": []}
    number, result = self.processes(crawler_cls, key)
    return {"code": 200, "msg": "succeed", "data": result}
def _connect_and_get_cursor(self):
    """Create a pymysql connection plus cursor and cache them on ``self``.

    On success the pair is stored as ``self.connection`` / ``self.cursor``
    and also returned, matching ``connectAndGetCursor``'s contract.

    :return: ``(connection, cursor)`` on success; the legacy sentinel
        pair ``("None", "None")`` on failure (kept for backward
        compatibility — note these strings are truthy, so callers must
        compare against ``"None"`` explicitly)
    """
    try:
        connection = pymysql.connect(host=self.host,
                                     port=self.port,
                                     user=self.user,
                                     password=self.password,
                                     db=self.db,
                                     charset=self.charset)
        cursor = connection.cursor()
        self.connection, self.cursor = connection, cursor
        # BUG FIX: the success path previously fell off the end and
        # returned None while the failure path returned a 2-tuple, so
        # any caller unpacking the result crashed on *success*.
        return connection, cursor
    except Exception as e:
        logger.error(e)
        return "None", "None"
def connectAndGetCursor(self):
    """Open a fresh pymysql connection and hand back ``(connection, cursor)``.

    NOTE(review): on failure this returns the *strings* ``("None", "None")``,
    which are truthy — callers must compare against ``"None"`` explicitly.
    """
    try:
        print(self.host, self.port, self.user, self.password, self.db, self.charset)
        # Collect the connection settings first, then connect in one call.
        conn_kwargs = {
            "host": self.host,
            "port": self.port,
            "user": self.user,
            "password": self.password,
            "db": self.db,
            "charset": self.charset,
        }
        db_connection = pymysql.connect(**conn_kwargs)
        return db_connection, db_connection.cursor()
    except Exception as e:
        logger.error(e)
        return "None", "None"
def merger_result(self, item):
    """Collect one crawled record into ``self.result``.

    Copies the url/name/title/size/magnet fields off *item* into a plain
    dict and appends it; failures are logged, never raised.
    """
    item = MyDict(item)
    try:
        record = {
            field: getattr(item, field)
            for field in ("url", "name", "title", "size", "magnet")
        }
        self.result.append(record)
    except Exception as e:
        logger.error("merger_result: {}".format(e))
def primary(self, item):
    """Main extraction step: parse ``item.content`` and route by ``item.type``.

    type "0": start page  — pull per-section list URLs, push as type "1"
              onto the ``message_a`` queue.
    type "1": list page   — pull per-movie table rows (url/title/name/
              date/introduction), push as type "2" onto ``message_a``.
    type "2": detail page — pull (name, magnet-link) pairs into
              ``item.magnet`` and push the item onto ``message_c``.

    Each branch has its own try/except so one bad page section cannot
    abort the others; all errors are logged, never raised.
    """
    item = MyDict(item)
    try:
        logger.info("开始抽取:{}".format(item.url))
        xpath_obj = lxml.html.fromstring(item.content)
        if item.type == "1":
            try:
                # One <table> per movie entry inside the content list.
                tables = xpath_obj.xpath(
                    "//div[@class='bd3']//div[@class='co_content8']/ul//table"
                )
                print(len(tables))
                for table in tables:
                    # URL extraction: detail-page link lives in row 2.
                    url = table.xpath(".//tr[2]//a/@href")
                    if url:
                        new_url = url[0]
                        # Relative links need the site root prepended.
                        if not new_url.startswith("http"):
                            new_url = dyttUrl + new_url
                        tr_item = MyDict()
                        tr_item.url = new_url
                        tr_item.type = "2"
                        title = table.xpath(".//tr[2]//a/text()")
                        tr_item.title = title[0] if title else "None"
                        name = table.xpath(
                            ".//tr[2]//a/text()")[0] if table.xpath(
                                ".//tr[2]//a/text()") else "《None》"
                        # Keep only the 《...》-bracketed movie name.
                        tr_item.name = "《{}》".format("".join(
                            re.findall(r"《(.*?)》", name)))
                        date = table.xpath(".//tr[3]//font/text()")
                        tr_item.date = date[0] if date else "None"
                        introduction = table.xpath(".//tr[4]/td/text()")
                        tr_item.introduction = introduction[
                            0] if introduction else "None"
                        # Queue the detail page for the next crawl round.
                        self.push_item_in_redis_list(
                            self.message_a, tr_item)
                        # Log a truncated summary of what was extracted.
                        logger.info("{}".format({
                            "url": tr_item.url[:30],
                            "name": tr_item.name[:6],
                            "title": tr_item.title[:6],
                            "introduction": tr_item.introduction[:30],
                            "type": tr_item.type[:6]
                        }))
            except Exception as e:
                logger.error("列表页抽取: {}".format(e))
        elif item.type == "2":
            try:
                logger.info("开始抽取详情页信息:{}".format(item.url))
                down_lists = xpath_obj.xpath("//div[@id='downlist']/table")
                # Fallback container used by some page layouts.
                if not down_lists:
                    down_lists = xpath_obj.xpath("//div[@id='Zoom']/table")
                magnet_info = []
                for downloader in down_lists:
                    magnet = downloader.xpath(
                        ".//a/@href")[0] if downloader.xpath(
                            ".//a/@href") else ""
                    download_name = downloader.xpath(
                        ".//a/text()")[0] if downloader.xpath(
                            ".//a/text()") else "]"
                    # Last segment after '=', ']' or '/' is the file name.
                    magnet_name = re.split(r"[=\]/]", download_name)[-1]
                    magnet_info.append((magnet_name, magnet))
                item.magnet = magnet_info
                # Finished item goes to the result-merging queue.
                self.push_item_in_redis_list(self.message_c, item)
            except Exception as e:
                logger.error("信息页抽取: {}".format(e))
        elif item.type == "0":
            try:
                area2s = xpath_obj.xpath(
                    "//div[@class='bd2']/div[@class='index_list']/div[@class='co_area2']"
                )
                for area in area2s:
                    # Section header link points at that section's list page.
                    url = area.xpath(
                        ".//div[@class='title_all']/p/span/a/@href")
                    if url:
                        new_url = url[0]
                        if not new_url.startswith("http"):
                            new_url = dyttUrl + new_url
                        logger.info("首页抽取抽取到的网址:{}".format(new_url))
                        tr_item = MyDict()
                        tr_item.url = new_url
                        tr_item.type = "1"
                        tr_item.classify = area.xpath(
                            ".//div[@class='title_all']/p/span/a/text()"
                        )[0]
                        self.push_item_in_redis_list(
                            self.message_a, tr_item)
            except Exception as e:
                logger.error("首页抽取: {}".format(e))
    except Exception as e:
        logger.error("primary: {}".format(e))