def __init__(self, config):
    '''
    Constructor.

    Reads extractor settings from *config* (a dict-like object) and
    creates the DB client and the queue publisher used later by run().

    config keys:
        url     -- start/seed URL (default "")
        tag     -- primary tag for extracted items (default "default tag")
        sub_tag -- optional secondary tag (default None)
    '''
    self.url = config.get("url", "")
    # Fixed typo in the fallback value: was "defaut tag".
    self.tag = config.get("tag", "default tag")
    self.sub_tag = config.get("sub_tag", None)
    self.mysql_client = MysqlClient()
    # Publishes extracted URLs onto the shared news-URL queue.
    self.news_publisher = NewsPublisher(NEWS_URL_QUEUE)
def run(self):
    """
    Re-extract failed URLs whose retry count is still below the threshold.

    Re-creates the MySQL client and queue publisher, then counts rows in
    `failed_url` with count < threshold.  A DB failure is logged and
    swallowed (best effort): failed_count stays 0 in that case.
    """
    self.mysql_client = MysqlClient()
    self.news_publisher = NewsPublisher(NEWS_URL_QUEUE)
    # Only URLs that failed fewer than this many times are retried.
    # NOTE(review): attribute name kept as "threhold" (sic) in case other
    # code in this project reads it.
    self.threhold = 5
    # Presumably a page size for batched fetching -- TODO confirm.
    self.page = 14
    LOGGER.info("start re extractor the failed url if count() < %s" % (self.threhold, ))
    failed_count = 0
    try:
        failed_count = self.mysql_client.getOne(
            "select count(*) as c from failed_url where count < %s",
            (self.threhold, ))
    # Was `except Exception, e` -- Python-2-only syntax, removed in Py3,
    # and `e` was never used; the traceback is logged explicitly instead.
    except Exception:
        LOGGER.error("failed to load the failed url count")
        LOGGER.error(traceback.format_exc())
def __init__(self):
    """Set up the MySQL client and a handle on the Mongo tdb.tcoll collection."""
    self.mysql_client = MysqlClient()
    # Keep only the collection handle; the client object itself is not stored.
    mongo = MongoClient()
    self.mongo_client = mongo.tdb.tcoll
@author: lml ''' import sys sys.path.append("../") sys.path.append("../../") sys.path.append("/home/lml/webcrawler/webcrawler-nlp/crawler/") from utils.dbmong import MongoClient from utils.dbmysql import MysqlClient from time import sleep if __name__ == '__main__': while True: print "**************************************************" mysql_client = MysqlClient() mongo_client = MongoClient() published_url_count = mysql_client.getOne( "select count(*) as count from published_url") print "published url count: %s" % published_url_count["count"] successed_url_count = mysql_client.getOne( "select count(*) as count from successed_url") print "successed url count: %s" % successed_url_count["count"] failed_url_count = mysql_client.getOne( "select count(*) as count from failed_url") print "failed url count: %s" % failed_url_count["count"] count = mongo_client.tdb.tcoll.count() print "mongo articles: %s" % count