Beispiel #1
0
 def process_request(self, request, spider):
     """Load ``request.url`` in ``spider.browser`` and hand back the rendered HTML.

     The page is considered rendered when at least one of two fixed XPath
     probes (a title cell and a page-number cell) yields text.

     :param request: the request being processed; only ``request.url`` is read.
     :param spider: object exposing a ``browser`` attribute with ``get()`` and
         ``page_source`` (presumably a Selenium WebDriver -- confirm).
     :return: an ``HtmlResponse`` wrapping the browser's page source on
         success; otherwise the original ``request`` object (on a
         ``TimeoutException`` or when neither probe finds text).
     """
     target = request.url
     try:
         spider.browser.get(target)
     except TimeoutException:
         # Record the timed-out URL in the failure file before giving up on
         # this attempt.
         print("{}--------》》》超时了,记录下来".format(target))
         writeFile(target, self.fail_path)
         return request

     # Fixed two-second pause before the DOM snapshot is taken.
     time.sleep(2)
     page_html = spider.browser.page_source
     document = Selector(text=page_html)

     title_xpath = '/html/body/div/div/table[1]/tbody/tr[1]/td/div[1]/text()'
     page_no_xpath = '/html/body/table[4]/tbody/tr/td[1]/text()'
     heading = document.xpath(title_xpath).extract_first()
     page_marker = document.xpath(page_no_xpath).extract_first()

     if heading or page_marker:
         return HtmlResponse(url=target,
                             body=page_html,
                             encoding="utf-8",
                             request=request)

     # Neither probe matched: treat the page as not rendered.
     print("{}--------页面没有渲染成功".format(target))
     return request
Beispiel #2
0
    def process_exception(self, request, exception, spider):
        """Log a failed request, record its URL, and return the request.

        :param request: the request that failed; only ``request.url`` is read.
        :param exception: the raised exception (not inspected here).
        :param spider: the running spider (unused).
        :return: the same ``request`` object that was passed in.
        """
        failed_url = request.url
        notice = "{}--------》》请求出现异常--------".format(failed_url)
        print(notice)
        # Keep a record of the URL in the failure file configured on this
        # instance.
        writeFile(url=failed_url, fileName=self.fail_path)
        return request
Beispiel #3
0
    def process_response(self, request, response, spider):
        """Record every response URL and return the request for non-200 replies.

        :param request: the request that produced ``response``.
        :param response: the received response; ``url`` and ``status`` are read.
        :param spider: the running spider (unused).
        :return: ``response`` when its status is 200, otherwise ``request``.
        """
        seen_url = response.url
        # Every response URL is passed to writeFile with "url.txt",
        # regardless of status.
        writeFile(seen_url, "url.txt")

        status_code = response.status
        if status_code == 200:
            return response

        # Any other status is treated as a failure: log it, record the URL in
        # the failure file, and give the request back to the caller.
        print("{}-------->>>>>>的响应码为{},有问题,需要记录".format(
            seen_url, status_code))
        writeFile(url=seen_url, fileName=self.fail_path)
        return request
Beispiel #4
0
 def process_country_item(self, info, url):
     """Insert one record into ``scrapy_dev.country_hickey`` and commit.

     :param info: the parameter sequence bound to the statement's 23 ``%s``
         placeholders, in the column order listed in the SQL below.
     :param url: source URL of the record; used for log output and passed to
         ``writeFile`` together with the ``FAIL_LOG_PATH`` setting when the
         insert or commit fails.
     :return: ``None``. Ordinary errors are logged and recorded rather than
         propagated; ``KeyboardInterrupt`` / ``SystemExit`` are allowed to
         propagate.
     """
     sql = (
         "INSERT INTO `scrapy_dev`.`country_hickey` ( `registry_cert_no`, `registry_per_name`, "
         "`registry_per_addr`, `prod_addr`, `proxy_per_name`, `proxy_per_addr`, `prod_name`, `standard`,"
         " `structure`, `apply_limit`, `other_content`, `remark`, `allow_date`, `effective_date`, `enclosure`, "
         "`prod_standard`, `change_date`, `post_no`, `component`, `expect_use`, `prod_save_conditon`, `auth_dept`,"
         " `change_situation`)"
         " VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s,"
         " %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"
     )
     try:
         self.pool.insert(sql=sql, param=info)
         self.pool.end("commit")
     except Exception as e:
         # Catch Exception rather than BaseException so that Ctrl-C and
         # interpreter shutdown are not mistaken for a failed insert.
         print("{}-----插入失败,失败原因----{}".format(url, e))
         writeFile(url, get_project_settings().get("FAIL_LOG_PATH"))
     else:
         print("{}---------插入成功----".format(url))
Beispiel #5
0
    def process_item(self, item, spider):
        """Pass through items that carry data; record and drop empty ones.

        :param item: mapping with at least the keys ``"url"`` and ``"info"``.
        :param spider: the running spider (unused).
        :return: ``item`` unchanged when ``item["info"]`` is truthy.
        :raises DropItem: when ``item["info"]`` is empty, after the URL has
            been passed to ``writeFile`` with the ``FAIL_LOG_PATH`` setting.
        """
        source_url = item["url"]

        # NOTE: type-based dispatch to the per-item-type handlers is currently
        # disabled, so items with data are returned as-is.
        if item["info"]:
            return item

        print("{}未获取到数据,url需要记录---------------".format(source_url))
        fail_log = get_project_settings().get("FAIL_LOG_PATH")
        writeFile(url=source_url, fileName=fail_log)
        raise DropItem("{}-----获取得数据为空,丢弃-------".format(source_url))
Beispiel #6
0
 def process_foreign_hickey(self, item):
     """Handle an imported-device item by writing its data to an Excel file.

     :param item: mapping with the keys ``"info"`` (the data handed to
         ``writeDataIntoExcel``) and ``"url"`` (recorded on failure).
     :return: ``None``. Ordinary errors are logged and the item's URL is
         passed to ``writeFile`` with the ``FAIL_LOG_PATH`` setting;
         ``KeyboardInterrupt`` / ``SystemExit`` are allowed to propagate.
     """
     print("-----------------------处理进口器械------------------")
     # Column headers passed to the Excel writer alongside the data.
     title = [
         "产品名称", "注册证编号", "注册人名称", "注册人住所", "生产地址", "代理人名称", "代理人住所",
         "型号、规格", "结构及组成", "适用范围", "生产国或地区(英文)", "附件", "其他内容", "备注", "批准日期",
         "有效期至", "生产厂商名称(中文)", "产品名称(中文)", "产品标准", "生产国或地区(中文)", "售后服务机构",
         "变更日期", "主要组成成分(体外诊断试剂)", "预期用途(体外诊断试剂)", "产品储存条件及有效期(体外诊断试剂)",
         "审批部门", "变更情况"
     ]
     try:
         # NOTE(review): the keyword is spelled ``tilte`` here; presumably
         # that matches the helper's parameter name -- confirm before
         # renaming.
         writeDataIntoExcel(
             data=item["info"],
             tilte=title,
             fileName=get_project_settings().get("FOREIGE_HICKEY_FILE"))
     except Exception as e:
         # Catch Exception rather than BaseException so that Ctrl-C and
         # interpreter shutdown are not swallowed as a write failure.
         print("进口器械写入文件异常-------{}".format(e))
         writeFile(url=item["url"],
                   fileName=get_project_settings().get("FAIL_LOG_PATH"))
     else:
         print("------------进口器械写入文件成功--------")