def pharseContext(self, driver):
     """
     Extract the article paragraphs from the loaded page, falling back to an
     alternative layout when the primary container is missing.
     """
     try:
         # primary layout: paragraphs inside div#artibody
         articles_p = driver.find_element_by_css_selector("div[id=\"artibody\"]").find_elements_by_tag_name("p")
     except NoSuchElementException as e:
         LOGGER.debug(e)
         # fallback layout: paragraphs inside an element with class "text"
         articles_p = driver.find_element_by_class_name("text").find_elements_by_tag_name("p")
     articles = map(lambda article: article.text, articles_p)
     return articles
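# Note (illustrative, not from the original project): Selenium 4 removed the
# find_element_by_* helpers used above; a port of this lookup would use the By
# locators, roughly as sketched here:
#
#     from selenium.webdriver.common.by import By
#     articles_p = driver.find_element(By.CSS_SELECTOR, "div#artibody") \
#                        .find_elements(By.TAG_NAME, "p")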
 def _process(self, msg):
     '''
     Main message processing method: dispatch the message to the crawler that
     matches its tag/sub_tag and let it crawl the article.
     '''
     try:
         url, title, tag, sub_tag, abstract = self.pharseMsg(msg)
         crawler = self.crawlerMapper[tag][sub_tag]
         crawler.crawlArticle(msg)
     except Exception as e:
         LOGGER.error(traceback.format_exc())
         LOGGER.error(e)
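 # Illustrative only: pharseMsg is defined elsewhere in the project, but judging from
 # the fields accessed in this file a queue message presumably looks roughly like the
 # dict below (key names taken from the code, values hypothetical).
 #
 #     msg = {
 #         "url": "http://news.example.com/article/1.html",
 #         "title": "example title",
 #         "abstract": "example abstract",
 #         "tag": "news",
 #         "sub_tag": "finance",
 #         "create_time": "2016-01-01 00:00:00",
 #     }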
 def insertOne(self, sql, value):
     """
     @summary: insert one record into the table
     @param sql: the parameterized INSERT statement
     @param value: tuple/list with the values of the record
     @return: number of affected rows, or False on failure
     """
     LOGGER_CRAWLER.debug("start insert %s" % sql)
     try:
         result = self._cursor.execute(sql, value)
     except Exception as e:
         result = False
         LOGGER_CRAWLER.debug("exception %s" % e)
     return result
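 # Minimal usage sketch (hypothetical call site; `client` stands for an instance of
 # this MySQL helper class, table columns and values are examples only):
 #
 #     client.begin()
 #     client.insertOne("insert into failed_url(url, tag) values(%s, %s)",
 #                      ("http://example.com/1.html", "news"))
 #     client.end("commit")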
 def getOne(self, sql, param=None):
     """
     @summary: run a query and fetch the first row
     @param sql: the SELECT statement; put only the condition columns in the WHERE
                 clause and pass the condition values through [param]
     @param param: optional tuple/list with the condition values
     @return: the result row, or False if nothing was found or the query failed
     """
     LOGGER_CRAWLER.debug("start getOne %s" % sql)
     try:
         if param is None:
             count = self._cursor.execute(sql)
         else:
             count = self._cursor.execute(sql, param)
         if count > 0:
             result = self._cursor.fetchone()
         else:
             result = False
     except Exception as e:
         result = False
         LOGGER_CRAWLER.debug("exception %s" % e)
     LOGGER_CRAWLER.debug("end getOne %s" % sql)
     return result
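 # Usage sketch (hypothetical call site). Assuming the underlying driver is
 # MySQLdb/pymysql with the default cursor class, the returned row is a tuple;
 # callers in this project only compare the result against False.
 #
 #     row = client.getOne("select * from successed_url where url=%s", (url,))
 #     if row is False:
 #         pass  # url has not been crawled yet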
 def insertFailed(self, msg):
     """
     Record a failed crawl: insert the url into failed_url, or increase its retry
     count if it is already there.
     """
     try:
         self.mysql_client.begin()
         article = self.mysql_client.getOne("select * from failed_url where url=%s", (msg["url"], ))
         if article is False:
             self.mysql_client.insertOne("insert into failed_url(url, tag, sub_tag, version, create_time) values(%s, %s, %s, %s, %s)",
                                     (msg["url"], msg["tag"], msg["sub_tag"], VERSION, msg["create_time"]))
             LOGGER.debug("insert failed_url %s" % (msg["url"], ))
         else:
             self.mysql_client.update("update failed_url set count = count+1 where url = %s", (msg["url"], ))
             LOGGER.debug("update failed_url %s" % (msg["url"], ))
         self.mysql_client.end("commit")
         LOGGER.debug("commit failed_url %s" % (msg["url"], ))

     except Exception:
         LOGGER.error(traceback.format_exc())
         self.mysql_client.end("rollback")
 def end(self, option='commit'):
     """
     @summary: finish the current transaction: commit by default, otherwise rollback
     """
     LOGGER_CRAWLER.debug("start %s" % option)
     try:
         LOGGER_CRAWLER.debug("acquire lock")
         if option == 'commit':
             self._conn.commit()
         else:
             self._conn.rollback()
     except Exception as e:
         LOGGER_CRAWLER.debug("exception %s" % e)
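 # begin() is not shown in these snippets; judging from the call sites in
 # insertFailed/insertSuccess it presumably opens a transaction, and every caller
 # pairs it with end("commit") on success or end("rollback") on error.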
    def insertSuccess(self, msg):
        """
        The article msg was crawled successfully: save it to MongoDB and record the
        url in successed_url, removing any matching failed_url entry first.
        """
        try:
            self.mysql_client.begin()
#             print article
#             print msg["url"]
            
            article = self.mysql_client.getOne("select * from failed_url where url=%s", (msg["url"], ))
            if article is not False:
                article = self.mysql_client.delete("delete from failed_url where url=%s", (msg["url"], ))
                LOGGER.info("delete the article from failed_url: %s", msg["url"])

            article = self.mysql_client.getOne("select * from successed_url where url=%s", (msg["url"], ))
            if article is not False:
                LOGGER.info("article already crawled, skip saving: %s", msg["url"])
                return
            
            self.mongo_client.save(msg)
            LOGGER.debug("insert into mongo: %s@%s" % (msg["title"], msg["url"]))
            
            self.mysql_client.insertOne("insert into successed_url(url, tag, sub_tag, version, create_time) values(%s, %s, %s, %s, %s)",
                                        (msg["url"], msg["tag"], msg["sub_tag"], VERSION, msg["create_time"]))
                                        
            LOGGER.debug("insert successed_url %s" %(msg["url"], ))
            self.mysql_client.end("commit")

        except Exception:
            self.mysql_client.end("rollback")

            # insertFailed manages its own transaction, so no extra begin() is needed here
            self.insertFailed(msg)
            LOGGER.error("insert into mongo/successed_url error: %s" % (msg["url"]))
            LOGGER.error(traceback.format_exc())
    def crawlArticle(self, msg):
        """
        Crawl the article referenced by msg.
        """
        
        url = msg["url"]
        try:
            
            driver = webdriver.PhantomJS(PHANTOMJS_PATH)
            driver.set_page_load_timeout(10)
            article = self.mysql_client.getOne("select * from successed_url where url=%s", (msg["url"], ))
            if article is not False:
                LOGGER.info("article already crawled, skip saving: %s", msg["url"])
                return

            LOGGER.debug("start extractor from %s" %(url, ))
            driver.get(url)
            try:
                # scroll to the bottom and back to the top so lazily loaded content renders
                js = "var q=document.documentElement.scrollTop=10000"
                driver.execute_script(js)
                js = "var q=document.documentElement.scrollTop=0"
                driver.execute_script(js)
                articles = self.pharseContext(driver)
                msg["text"] = articles
                self.insertSuccess(msg)

            except Exception:
                LOGGER.error(traceback.format_exc())
                LOGGER.error("url: %s" %(msg["url"],))
                self.insertFailed(msg)
  
        except TimeoutException:
            # the page load timed out; try to parse whatever has already rendered
            try:
                # scroll to the bottom and back to the top so lazily loaded content renders
                js = "var q=document.documentElement.scrollTop=10000"
                driver.execute_script(js)
                js = "var q=document.documentElement.scrollTop=0"
                driver.execute_script(js)
#                 title = driver.find_element_by_css_selector("h1[id=\"h1title\"]").text
                articles = self.pharseContext(driver)
                msg["text"] = articles
                self.insertSuccess(msg)

            except Exception:
                self.insertFailed(msg)
                LOGGER.error(traceback.format_exc())
                LOGGER.error("url: %s" % (msg["url"], ))
                driver.quit()

        except Exception:
            self.insertFailed(msg)
            LOGGER.error(traceback.format_exc())
            LOGGER.error("url: %s" %(msg["url"], ))
        finally:
            try:
                driver.quit()
            except Exception:
                pass
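    # Illustrative alternative (not from the original code): the scrollTop trick used
    # above can also be expressed with window.scrollTo, which serves the same purpose
    # of forcing lazily loaded content to render:
    #
    #     driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    #     driver.execute_script("window.scrollTo(0, 0);")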
    
    def pharseTitle(self, driver):
        # placeholder: presumably overridden by site-specific crawlers
        return ""
    
    def pharseAbstract(self, driver):
        # placeholder: presumably overridden by site-specific crawlers
        return ""
    
    def pharseContext(self, driver):
        # placeholder: presumably overridden by site-specific crawlers
        # (cf. the pharseContext implementation above)
        return ""
 
    def getMany(self, sql, num, param=None):
        """
        @summary: run a query and fetch num rows
        @param sql: the SELECT statement; put only the condition columns in the WHERE
                    clause and pass the condition values through [param]
        @param num: number of rows to fetch
        @param param: optional tuple/list with the condition values
        @return: the result list, or False if nothing was found or the query failed
        """
        try:
            if param is None:
                count = self._cursor.execute(sql)
            else:
                count = self._cursor.execute(sql, param)
            if count > 0:
                result = self._cursor.fetchmany(num)
            else:
                result = False
        except Exception as e:
            result = False
            LOGGER_CRAWLER.debug("exception %s" % e)
        return result
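    # Minimal usage sketch (hypothetical call site; `client` is an instance of this
    # MySQL helper class):
    #
    #     rows = client.getMany("select * from failed_url where tag=%s", 10, ("news",))
    #     if rows is not False:
    #         for row in rows:
    #             pass  # e.g. re-queue the failed urls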
    def __init__(self, pidfile, stdin=os.devnull, stdout=os.devnull, stderr=os.devnull ):
        super(Crawler, self).__init__(pidfile, stdin, stdout, stderr)

    def run(self):
        try:
            LOGGING = {'version': 1}
            QUEUE_NAME = "news_article"
            LOGGER.info("start the news crawler")
            threadCount = CRAWLER_THREAD_COUNT
            messageHandlerList = []
            workThreadList = []
            for _ in range(threadCount):
                messageHandler = CrawlerMessageHandler(crawlerMapper)
                messageHandler.set_inputmessage(QUEUE_NAME)
                messageHandlerList.append(messageHandler)
                workerThread = threading.Thread(target=messageHandler.start, args=(LOGGING,))
                workerThread.start()
                workThreadList.append(workerThread)
            
            for worker in workThreadList:
                worker.join()
                
        except Exception:
            LOGGER.error(traceback.format_exc())
        finally:
            LOGGER.info("end the news crawler")

if __name__ == "__main__":
#     daemon_main(Crawler, 'c', sys.argv)
    crawler = Crawler("./")
    crawler.run()