def check_chapter_id(cls, _id):
    """Check whether a chapter_id is already present in the database.

    :param _id: the chapter_id value to look up.
    :return: the first column of the ``SELECT EXISTS`` result row, or ``1``
        when running the query raised an exception (the error is logged).
    """
    query = 'SELECT EXISTS(SELECT 1 from `笔趣阁章节内容` WHERE `chapter_id`=%(id)s)'
    params = {'id': _id}
    try:
        cur.execute(query, params)
        row = cur.fetchone()
        found = row[0]
    except Exception as e:
        # Best effort: a failing query is logged and reported as 1.
        logger.warning('当前出现一个SQL Error: {}'.format(e))
        found = 1
    return found
def process_request(cls, request, spider):
    """Load ``request.url`` in a PhantomJS browser and return the rendered page.

    A new PhantomJS driver is started for every call. It is configured with
    custom request headers and, when ``spider.settings['USE_PROXY']`` is
    truthy, an HTTP proxy built from the ``ip`` and ``port`` entries returned
    by ``cls.get_proxy()``. After the page has loaded it is scrolled down and
    its source is wrapped in an ``HtmlResponse``. The driver is always shut
    down, even when loading the page fails.

    :param request: request whose ``url`` is opened in the browser.
    :param spider: spider whose ``settings`` supply ``USER_AGENT``,
        ``USE_PROXY`` and ``PHANTOMJS_PATH``.
    :return: ``HtmlResponse`` built from the browser's current URL and page
        source, encoded as UTF-8 and bound to ``request``.
    """
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.8',
        'Cache-Control': 'max-age=0',
        'User-Agent': spider.settings['USER_AGENT'],
        'Connection': 'keep-alive',
    }
    desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
    # ``items()`` works on both Python 2 and 3 (``iteritems()`` is Python 2
    # only). The User-Agent header is part of ``headers``, so it is covered
    # by this loop as well.
    for key, value in headers.items():
        capability = 'phantomjs.page.customHeaders.{}'.format(key)
        desired_capabilities[capability] = value

    if spider.settings['USE_PROXY']:
        service_args = [
            '--proxy={ip}:{port}'.format(**cls.get_proxy()),
            '--proxy-type=http',
        ]
    else:
        service_args = []

    driver = webdriver.PhantomJS(
        executable_path=spider.settings['PHANTOMJS_PATH'],
        desired_capabilities=desired_capabilities,
        service_args=service_args)
    try:
        # Implicit wait of 20 seconds; adjust as needed.
        driver.implicitly_wait(20)
        # Page-load timeout of 20 seconds, similar to the ``timeout`` option
        # of ``requests.get()``; ``driver.get()`` has no timeout argument.
        # Without it ``driver.get(url)`` has been seen to hang forever
        # without raising, which stalls the whole program.
        driver.set_page_load_timeout(20)
        # Script timeout of 10 seconds.
        driver.set_script_timeout(10)
        driver.get(request.url)

        # Poll once per second until the document reports it is fully
        # loaded, giving up once the counter reaches 20.
        i = 1
        while (driver.execute_script('return document.readyState')
               != 'complete' and i < 20):
            logger.warning('sleep 1')
            time.sleep(1)
            i += 1

        # Run JavaScript to imitate a user: scroll the page down.
        js = "var q=document.documentElement.scrollTop=10000"
        driver.execute_script(js)
        body = driver.page_source
        logger.warning(u"访问" + request.url)
        url = driver.current_url
    finally:
        # Always release the browser, including when the page load or a
        # script call above raised; otherwise the PhantomJS process leaks.
        driver.close()
        driver.service.process.send_signal(signal.SIGTERM)
        driver.quit()
    return HtmlResponse(url, body=body, encoding='utf-8', request=request)
def process_item(self, item, spider):
    """Store a scraped item through the pipeline's database cursor.

    The action depends on the item's type:

    * ``CommitItem``: commit the open transaction, nothing is inserted.
    * ``BriefItem``: execute ``self.brief_sql`` with
      (code, train_no, start, end, turn).
    * ``InfoItem``: execute ``self.info_sql`` with (train_no, no, station,
      start_time, arrive_time, stopover_time, turn).
    * anything else: execute ``self.turn_sql`` with (id, mark_time) and
      commit immediately.

    Any exception is logged and swallowed (best effort).

    :param item: the scraped item to persist.
    :param spider: the spider that produced the item (unused).
    :return: ``item`` unchanged, so a later pipeline stage still receives it
        (Scrapy expects ``process_item`` to return the item).
    """
    try:
        if isinstance(item, CommitItem):
            self.conn.commit()
        elif isinstance(item, BriefItem):
            self.cursor.execute(
                self.brief_sql,
                (item["code"], item["train_no"], item["start"],
                 item["end"], item["turn"]))
        elif isinstance(item, InfoItem):
            self.cursor.execute(
                self.info_sql,
                (item["train_no"], item["no"], item["station"],
                 item["start_time"], item["arrive_time"],
                 item["stopover_time"], item["turn"]))
        else:
            self.cursor.execute(self.turn_sql,
                                (item["id"], item["mark_time"]))
            self.conn.commit()
    except Exception as e:  # ``as`` form is valid on Python 2.6+ and 3
        logger.warning("excute sql fail.")
        logger.warning(str(e))
    return item
def process_item(self, item, spider):
    """Insert the item's crawl time and keyword, then commit.

    Executes ``self.sql`` with (crawl_time, key_word) and commits the
    connection. Any exception is logged together with the offending item
    and swallowed (best effort).

    :param item: mapping providing ``crawl_time`` and ``key_word``.
    :param spider: the spider that produced the item (unused).
    :return: ``item`` unchanged, so a later pipeline stage still receives it
        (Scrapy expects ``process_item`` to return the item).
    """
    try:
        self.cursor.execute(self.sql, (item['crawl_time'], item['key_word']))
        self.conn.commit()
    except Exception as e:  # ``as`` form is valid on Python 2.6+ and 3
        logger.warning("execute sql fail.")
        logger.warning(str(e))
        logger.warning(item)
    return item
def process_item(self, item, spider):
    """Insert a scraped record, or commit when a ``CommitItem`` arrives.

    For a ``CommitItem`` the connection is committed and nothing is
    inserted. For any other item ``self.sql`` is executed with (province,
    city, county, address, name, windows, start + '00', end + '00', turn);
    no commit is issued in that branch. Any exception is logged together
    with the offending item and swallowed (best effort).

    :param item: the scraped item, or a ``CommitItem`` marker.
    :param spider: the spider that produced the item (unused).
    :return: ``item`` unchanged, so a later pipeline stage still receives it
        (Scrapy expects ``process_item`` to return the item).
    """
    try:
        if isinstance(item, CommitItem):
            self.conn.commit()
        else:
            self.cursor.execute(
                self.sql,
                (item["province"], item["city"], item["county"],
                 item["address"], item["name"], item["windows"],
                 item["start"] + u'00', item["end"] + u'00',
                 item["turn"]))
    except Exception as e:  # ``as`` form is valid on Python 2.6+ and 3
        logger.warning("execute sql fail.")
        logger.warning(str(e))
        logger.warning(item)
    return item
def _upload_file(self, path, buf):
    """Log the target path, then store the buffer's contents in the bucket.

    :param path: key under which the data is stored via ``put_object``.
    :param buf: buffer object whose ``getvalue()`` result is uploaded.
    """
    message = 'now i will upload the image {}'.format(path)
    logger.warning(message)
    payload = buf.getvalue()
    self._bucket.put_object(key=path, data=payload)