def pharseContext(self, driver):
    """
    Extract the article paragraphs, falling back from the artibody
    layout to the .text layout when the first selector is missing.
    """
    try:
        articles_p = driver.find_element_by_css_selector(
            'div[id="artibody"]').find_elements_by_tag_name("p")
    except NoSuchElementException as e:
        LOGGER.debug(e)
        articles_p = driver.find_element_by_class_name(
            "text").find_elements_by_tag_name("p")
    return [p.text for p in articles_p]
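# The selector fallback above is hard-wired to two page layouts. A
# minimal, hypothetical generalization that walks a list of CSS
# selectors in order; the helper name and selector list are
# assumptions, not part of the original crawler:
from selenium.common.exceptions import NoSuchElementException

CONTENT_SELECTORS = ['div[id="artibody"]', "div.text"]  # assumed layouts

def extract_paragraphs(driver, selectors=CONTENT_SELECTORS):
    for selector in selectors:
        try:
            container = driver.find_element_by_css_selector(selector)
            return [p.text for p in container.find_elements_by_tag_name("p")]
        except NoSuchElementException:
            continue
    return []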
def _process(self, msg):
    '''
    Main message processing method; all of the processing logic for a
    queued article lives here.
    '''
    try:
        url, title, tag, sub_tag, abstract = self.pharseMsg(msg)
        crawler = self.crawlerMapper[tag][sub_tag]
        crawler.crawlArticle(msg)
    except Exception as e:
        LOGGER.error(traceback.format_exc())
        LOGGER.error(e)
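# pharseMsg is not shown in these fragments; judging from the
# unpacking above and the fields used later (create_time), each msg is
# a dict carrying at least url/title/tag/sub_tag/abstract keys. A
# sketch under that assumption:
def pharseMsg(self, msg):
    # Plain field extraction; any queue-level JSON decoding is assumed
    # to have happened before _process is called.
    return (msg["url"], msg["title"], msg["tag"],
            msg["sub_tag"], msg["abstract"])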
def insertOne(self, sql, value):
    """
    @summary: insert one record into the table
    @param sql: the INSERT statement template
    @param value: tuple/list holding the record values
    @return: insertId, the number of affected rows
    """
    LOGGER_CRAWLER.debug("start insert %s" % sql)
    try:
        # execute() returns the affected row count, which is what the
        # docstring promises to hand back.
        result = self._cursor.execute(sql, value)
    except Exception as e:
        result = False
        LOGGER_CRAWLER.debug("exception %s" % e)
    return result
def getOne(self, sql, param=None):
    """
    @summary: run a query and fetch the first row
    @param sql: the query; put placeholders in the statement and pass
    the condition values via [param]
    @param param: optional tuple/list of condition values
    @return: result list/boolean, the fetched row
    """
    LOGGER_CRAWLER.debug("start getOne %s" % sql)
    try:
        if param is None:
            count = self._cursor.execute(sql)
        else:
            count = self._cursor.execute(sql, param)
        if count > 0:
            result = self._cursor.fetchone()
        else:
            result = False
    except Exception as e:
        result = False
        LOGGER_CRAWLER.debug("exception %s" % e)
    LOGGER_CRAWLER.debug("end getOne %s" % sql)
    return result
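# A typical call site, mirroring the lookups used elsewhere in this
# section; getOne hands back a row tuple or False. The URL below is
# illustrative:
row = mysql_client.getOne("select * from successed_url where url=%s",
                          ("http://news.example.com/a/1.html", ))
if row is False:
    LOGGER_CRAWLER.debug("url not crawled yet")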
def insertFailed(self, msg):
    """
    insert into failed_url
    """
    try:
        self.mysql_client.begin()
        article = self.mysql_client.getOne(
            "select * from failed_url where url=%s", (msg["url"], ))
        if article is False:
            self.mysql_client.insertOne(
                "insert into failed_url(url, tag, sub_tag, version, create_time) values(%s, %s, %s, %s, %s)",
                (msg["url"], msg["tag"], msg["sub_tag"], VERSION, msg["create_time"]))
            LOGGER.debug("insert failed_url %s" % (msg["url"], ))
        else:
            self.mysql_client.update(
                "update failed_url set count = count+1 where url = %s",
                (msg["url"], ))
            LOGGER.debug("update failed_url %s" % (msg["url"], ))
        self.mysql_client.end("commit")
        LOGGER.debug("commit failed_url %s" % (msg["url"], ))
    except Exception:
        LOGGER.error(traceback.format_exc())
        self.mysql_client.end("rollback")
def end(self, option='commit'):
    """
    @summary: end the transaction (commit or roll back)
    """
    LOGGER_CRAWLER.debug("start %s" % option)
    try:
        if option == 'commit':
            self._conn.commit()
        else:
            self._conn.rollback()
    except Exception as e:
        LOGGER_CRAWLER.debug("exception %s" % e)
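# end() pairs with the begin() calls in insertFailed/insertSuccess,
# which are not shown in these fragments. A minimal sketch of the
# counterpart, assuming a MySQLdb-style connection where begin()
# simply turns autocommit off so end() decides the outcome:
def begin(self):
    """
    @summary: start a transaction
    """
    self._conn.autocommit(False)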
def getMany(self, sql, num, param=None):
    """
    @summary: run a query and fetch num rows
    @param sql: the query; put placeholders in the statement and pass
    the condition values via [param]
    @param num: number of rows to fetch
    @param param: optional tuple/list of condition values
    @return: result list/boolean, the fetched rows
    """
    LOGGER_CRAWLER.debug("start getMany %s" % sql)
    try:
        if param is None:
            count = self._cursor.execute(sql)
        else:
            count = self._cursor.execute(sql, param)
        # The original fragment is truncated here; this completion
        # follows the getOne pattern above, fetching up to num rows.
        if count > 0:
            result = self._cursor.fetchmany(num)
        else:
            result = False
    except Exception as e:
        result = False
        LOGGER_CRAWLER.debug("exception %s" % e)
    LOGGER_CRAWLER.debug("end getMany %s" % sql)
    return result
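# Example use of getMany, following the same parameter style as
# getOne; the query and limit are illustrative:
rows = mysql_client.getMany(
    "select url from failed_url where version=%s limit 10", 10, (VERSION, ))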
def insertSuccess(self, msg):
    """
    On a successful crawl of the article msg: drop any failed_url
    record, skip duplicates, save the article to MongoDB, and record
    the url in successed_url.
    """
    try:
        self.mysql_client.begin()
        article = self.mysql_client.getOne(
            "select * from failed_url where url=%s", (msg["url"], ))
        if article is not False:
            self.mysql_client.delete(
                "delete from failed_url where url=%s", (msg["url"], ))
            LOGGER.info("delete the article from failed_url: %s", msg["url"])
        article = self.mysql_client.getOne(
            "select * from successed_url where url=%s", (msg["url"], ))
        if article is not False:
            LOGGER.info("article already crawled, skip saving: %s", msg["url"])
            # close the transaction opened above before bailing out
            self.mysql_client.end("rollback")
            return
        self.mongo_client.save(msg)
        LOGGER.debug("insert into mongo: %s@%s" % (msg["title"], msg["url"]))
        self.mysql_client.insertOne(
            "insert into successed_url(url, tag, sub_tag, version, create_time) values(%s, %s, %s, %s, %s)",
            (msg["url"], msg["tag"], msg["sub_tag"], VERSION, msg["create_time"]))
        LOGGER.debug("insert successed_url %s" % (msg["url"], ))
        self.mysql_client.end("commit")
    except Exception:
        self.mysql_client.end("rollback")
        self.mysql_client.begin()
        self.insertFailed(msg)
        LOGGER.error("insert into mongo/successed_url error: %s" % (msg["url"], ))
        LOGGER.error(traceback.format_exc())
def crawlArticle(self, msg):
    """
    Crawl the article referred to by msg.
    """
    url = msg["url"]
    try:
        driver = webdriver.PhantomJS(PHANTOMJS_PATH)
        driver.set_page_load_timeout(10)
        article = self.mysql_client.getOne(
            "select * from successed_url where url=%s", (msg["url"], ))
        if article is not False:
            LOGGER.info("article already crawled, skip saving: %s", msg["url"])
            return
        LOGGER.debug("start extractor from %s" % (url, ))
        driver.get(url)
        try:
            # Scroll to the bottom and back to the top so the page
            # finishes loading its lazy content.
            driver.execute_script("var q=document.documentElement.scrollTop=10000")
            driver.execute_script("var q=document.documentElement.scrollTop=0")
            articles = self.pharseContext(driver)
            msg["text"] = articles
            self.insertSuccess(msg)
        except Exception:
            LOGGER.error(traceback.format_exc())
            LOGGER.error("url: %s" % (msg["url"], ))
            self.insertFailed(msg)
    except TimeoutException:
        # The page load timed out, but the DOM that did arrive may
        # still hold the article, so try the same extraction once more.
        try:
            driver.execute_script("var q=document.documentElement.scrollTop=10000")
            driver.execute_script("var q=document.documentElement.scrollTop=0")
            articles = self.pharseContext(driver)
            msg["text"] = articles
            self.insertSuccess(msg)
        except Exception:
            self.insertFailed(msg)
            LOGGER.error(traceback.format_exc())
            LOGGER.error("url: %s" % (msg["url"], ))
    except Exception:
        self.insertFailed(msg)
        LOGGER.error(traceback.format_exc())
        LOGGER.error("url: %s" % (msg["url"], ))
    finally:
        try:
            driver.quit()
        except Exception:
            pass
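# The scroll-down/scroll-up trick appears twice in crawlArticle; it
# could be factored into a small helper. The name is an assumption:
def _force_lazy_load(driver):
    # Jump to the bottom and back so lazily loaded parts of the page
    # get rendered before extraction.
    driver.execute_script("var q=document.documentElement.scrollTop=10000")
    driver.execute_script("var q=document.documentElement.scrollTop=0")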
js = "var q=document.documentElement.scrollTop=0" driver.execute_script(js) # title = driver.find_element_by_css_selector("h1[id=\"h1title\"]").text articles = self.pharseContext(driver) msg["text"] = articles self.insertSuccess(msg) except Exception, e: self.insertFailed(msg) LOGGER.error(traceback.format_exc()) LOGGER.error("url: %s" %(msg["url"], )) driver.quit() except Exception, e: self.insertFailed(msg) LOGGER.error(traceback.format_exc()) LOGGER.error("url: %s" %(msg["url"], )) finally: try: driver.quit() except Exception, e: pass def pharseTitle(self, driver): return "" def pharseAbstract(self, driver): return "" def pharseContext(self, driver): """
def __init__(self, pidfile, stdin=os.devnull, stdout=os.devnull,
             stderr=os.devnull):
    super(Crawler, self).__init__(pidfile, stdin, stdout, stderr)

def run(self):
    try:
        LOGGING = {'version': 1}
        QUEUE_NAME = "news_article"
        LOGGER.info("start the news crawler")
        threadCount = CRAWLER_THREAD_COUNT
        messageHandlerList = []
        workThreadList = []
        for _ in range(threadCount):
            messageHandler = CrawlerMessageHandler(crawlerMapper)
            messageHandler.set_inputmessage(QUEUE_NAME)
            messageHandlerList.append(messageHandler)
            # args must be a tuple; (LOGGING) without the trailing
            # comma would iterate the dict as positional arguments.
            workerThread = threading.Thread(target=messageHandler.start,
                                            args=(LOGGING, ))
            workerThread.start()
            workThreadList.append(workerThread)
        for worker in workThreadList:
            worker.join()
    except Exception:
        LOGGER.error(traceback.format_exc())
    finally:
        LOGGER.info("end the news crawler")

if __name__ == "__main__":
    # daemon_main(Crawler, 'c', sys.argv)
    crawler = Crawler("./")
    crawler.run()
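# crawlerMapper is referenced here and in _process but never defined
# in these fragments. It is indexed as crawlerMapper[tag][sub_tag], so
# a nested dict of crawler instances fits. The crawler class below is
# a stand-in, not the project's real one:
class SinaNewsCrawler(object):  # hypothetical stand-in
    def crawlArticle(self, msg):
        pass

crawlerMapper = {
    "news": {
        "sina": SinaNewsCrawler(),
    },
}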
js = "var q=document.documentElement.scrollTop=0" driver.execute_script(js) # title = driver.find_element_by_css_selector("h1[id=\"h1title\"]").text articles = self.pharseContext(driver) msg["text"] = articles self.insertSuccess(msg) except Exception, e: self.insertFailed(msg) LOGGER.error(traceback.format_exc()) LOGGER.error("url: %s" % (msg["url"], )) driver.quit() except Exception, e: self.insertFailed(msg) LOGGER.error(traceback.format_exc()) LOGGER.error("url: %s" % (msg["url"], )) finally: try: driver.quit() except Exception, e: pass def pharseTitle(self, driver): return "" def pharseAbstract(self, driver): return "" def pharseContext(self, driver): """