def process_request(self, request, spider):
    """Load the request URL in the spider's browser and wrap the page source.

    Looks like a Scrapy downloader-middleware hook backed by a Selenium-style
    browser stored on ``spider.browser`` -- confirm against the enclosing class.

    :param request: request whose ``url`` is opened in the browser.
    :param spider: object exposing a ``browser`` with ``get`` and
        ``page_source``.
    :return: an ``HtmlResponse`` built from the browser's page source, or the
        original ``request`` when loading timed out or when neither of the two
        expected XPath text nodes was found in the page.
    """
    target_url = request.url

    try:
        spider.browser.get(target_url)
        # NOTE: an explicit WebDriverWait for the element with id "pageForm"
        # was previously tried at this point and is currently disabled.
    except TimeoutException:
        print("{}--------》》》超时了,记录下来".format(target_url))
        # Record the URL in the failure log before handing the request back.
        writeFile(url=target_url, fileName=self.fail_path)
        return request

    # Fixed pause before reading the DOM -- presumably to give the page time
    # to finish rendering (a disabled `window.stop()` script call used to sit
    # here as well).
    time.sleep(2)

    page_html = spider.browser.page_source
    page = Selector(text=page_html)

    heading_xpath = '/html/body/div/div/table[1]/tbody/tr[1]/td/div[1]/text()'
    page_no_xpath = '/html/body/table[4]/tbody/tr/td[1]/text()'
    heading = page.xpath(heading_xpath).extract_first()
    page_number = page.xpath(page_no_xpath).extract_first()

    # The page counts as rendered as soon as either marker has text.
    if not (heading or page_number):
        print("{}--------页面没有渲染成功".format(target_url))
        return request

    return HtmlResponse(
        url=target_url,
        body=page_html,
        encoding="utf-8",
        request=request,
    )
def process_exception(self, request, exception, spider):
    """Log a failed request, record its URL, and hand the request back.

    :param request: the request that raised; its ``url`` is written to the
        file named by ``self.fail_path``.
    :param exception: the raised exception (not inspected here).
    :param spider: the running spider (not used here).
    :return: the same ``request`` object -- presumably so the framework
        schedules it again; confirm against the middleware contract.
    """
    failed_url = request.url
    message = "{}--------》》请求出现异常--------".format(failed_url)
    print(message)
    writeFile(url=failed_url, fileName=self.fail_path)
    return request
def process_response(self, request, response, spider):
    """Record every response URL and bounce non-200 responses back.

    :param request: the request that produced ``response``.
    :param response: the response to check; its ``url`` is always appended to
        ``url.txt``.
    :param spider: the running spider (not used here).
    :return: ``response`` when its status is 200; otherwise the original
        ``request``, after the URL has also been written to the file named by
        ``self.fail_path``.
    """
    response_url = response.url
    writeFile(response_url, "url.txt")

    status_code = response.status
    if status_code == 200:
        return response

    # Anything other than 200 is treated as a failure and logged as such.
    print("{}-------->>>>>>的响应码为{},有问题,需要记录".format(
        response_url, status_code))
    writeFile(url=response_url, fileName=self.fail_path)
    return request
def process_country_item(self, info, url):
    """Insert one record into ``scrapy_dev.country_hickey`` and commit.

    :param info: parameter sequence for the 23 ``%s`` placeholders, in the
        column order listed in the statement below.
    :param url: source URL of the record; used in log output and written to
        the failure log when the insert fails.
    :return: ``None``. Ordinary errors are logged and the URL is recorded in
        the file configured as ``FAIL_LOG_PATH``; they are not re-raised.

    Only ``Exception`` subclasses are handled. Interpreter-level signals
    (``SystemExit``, ``KeyboardInterrupt``, ``GeneratorExit``) propagate to
    the caller instead of being reported as a failed insert.
    """
    sql = (
        "INSERT INTO `scrapy_dev`.`country_hickey` ( `registry_cert_no`, `registry_per_name`, "
        "`registry_per_addr`, `prod_addr`, `proxy_per_name`, `proxy_per_addr`, `prod_name`, `standard`,"
        " `structure`, `apply_limit`, `other_content`, `remark`, `allow_date`, `effective_date`, `enclosure`, "
        "`prod_standard`, `change_date`, `post_no`, `component`, `expect_use`, `prod_save_conditon`, `auth_dept`,"
        " `change_situation`)"
        " VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s,"
        " %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"
    )
    try:
        self.pool.insert(sql=sql, param=info)
        self.pool.end("commit")
        print("{}---------插入成功----".format(url))
    except Exception as e:
        # NOTE(review): nothing is rolled back here; whether the pool needs an
        # explicit rollback after a failed insert is not visible from this
        # method -- confirm against the pool implementation.
        print("{}-----插入失败,失败原因----{}".format(url, e))
        writeFile(url, get_project_settings().get("FAIL_LOG_PATH"))
def process_item(self, item, spider):
    """Pass items with data through; log and drop items whose data is empty.

    :param item: mapping with a ``"url"`` entry and an ``"info"`` entry.
    :param spider: the running spider (not used here).
    :return: ``item`` unchanged when ``item["info"]`` is truthy.
    :raises DropItem: when ``item["info"]`` is empty, after the URL has been
        written to the file configured as ``FAIL_LOG_PATH``.
    """
    # NOTE: per-item-type dispatch to the country / foreign handlers used to
    # live here and is currently disabled; items are only validated.
    source_url = item["url"]
    if item["info"]:
        return item

    print("{}未获取到数据,url需要记录---------------".format(source_url))
    fail_log = get_project_settings().get("FAIL_LOG_PATH")
    writeFile(url=source_url, fileName=fail_log)
    raise DropItem("{}-----获取得数据为空,丢弃-------".format(source_url))
def process_foreign_hickey(self, item):
    """Handle an imported-device item by writing it to the Excel export.

    :param item: mapping with an ``"info"`` entry (the row data handed to
        ``writeDataIntoExcel``) and a ``"url"`` entry (recorded on failure).
    :return: ``None``. Ordinary errors are logged and the item's URL is
        written to the file configured as ``FAIL_LOG_PATH``; they are not
        re-raised.

    Only ``Exception`` subclasses are handled. Interpreter-level signals
    (``SystemExit``, ``KeyboardInterrupt``, ``GeneratorExit``) propagate to
    the caller instead of being reported as a failed write.
    """
    print("-----------------------处理进口器械------------------")
    # Column headers passed to the Excel writer together with the row data.
    title = [
        "产品名称", "注册证编号", "注册人名称", "注册人住所", "生产地址", "代理人名称",
        "代理人住所", "型号、规格", "结构及组成", "适用范围", "生产国或地区(英文)", "附件",
        "其他内容", "备注", "批准日期", "有效期至", "生产厂商名称(中文)", "产品名称(中文)",
        "产品标准", "生产国或地区(中文)", "售后服务机构", "变更日期",
        "主要组成成分(体外诊断试剂)", "预期用途(体外诊断试剂)",
        "产品储存条件及有效期(体外诊断试剂)", "审批部门", "变更情况"
    ]
    try:
        # `tilte` (sic) is the keyword name this call has always used for the
        # helper; it is kept verbatim so the call keeps matching it.
        writeDataIntoExcel(
            data=item["info"],
            tilte=title,
            fileName=get_project_settings().get("FOREIGE_HICKEY_FILE"))
        print("------------进口器械写入文件成功--------")
    except Exception as e:
        print("进口器械写入文件异常-------{}".format(e))
        writeFile(url=item["url"],
                  fileName=get_project_settings().get("FAIL_LOG_PATH"))