import traceback

from utils.dbmong import MongoClient
from utils.dbmysql import MysqlClient

# LOGGER and VERSION are referenced below; they are assumed to be defined at
# module level elsewhere in the project.


class BasicArticleCrawler(object):

    def __init__(self):
        self.mysql_client = MysqlClient()
        self.mongo_client = MongoClient().tdb.tcoll

    def insertSuccess(self, msg):
        """Persist a successfully crawled article: drop any stale record
        from failed_url, skip URLs already in successed_url, then save the
        document to MongoDB and record the URL in MySQL."""
        try:
            self.mysql_client.begin()
            # The URL may have failed on an earlier attempt; now that the
            # crawl succeeded, remove it from failed_url.
            article = self.mysql_client.getOne(
                "select * from failed_url where url=%s", (msg["url"],))
            if article != False:
                self.mysql_client.delete(
                    "delete from failed_url where url=%s", (msg["url"],))
                LOGGER.info("delete the article from failed_url: %s", msg["url"])
            # Skip articles that were already crawled and stored.
            article = self.mysql_client.getOne(
                "select * from successed_url where url=%s", (msg["url"],))
            if article != False:
                LOGGER.info("repeated crawl of the article, give up saving: %s",
                            msg["url"])
                return
            self.mongo_client.save(msg)
            LOGGER.debug("insert into mongo: %s@%s", msg["title"], msg["url"])
            self.mysql_client.insertOne(
                "insert into successed_url(url, tag, sub_tag, version, create_time) "
                "values(%s, %s, %s, %s, %s)",
                (msg["url"], msg["tag"], msg["sub_tag"], VERSION, msg["create_time"]))
            LOGGER.debug("insert successed_url %s", msg["url"])
            self.mysql_client.end("commit")
        except Exception:
            # Roll back the MySQL transaction and record the URL as failed.
            self.mysql_client.end("rollback")
            self.mysql_client.begin()
            self.insertFailed(msg)
            LOGGER.error("insert into mongo/successed_url error: %s", msg["url"])
            LOGGER.error(traceback.format_exc())
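# The error path above calls self.insertFailed(msg), which is defined
# elsewhere in the project and not shown here. A minimal sketch of what it
# might look like, assuming failed_url has the same columns as successed_url
# (hypothetical, not the project's actual implementation):
#
#     def insertFailed(self, msg):
#         """Record a URL whose crawl or save failed, for later retry."""
#         try:
#             self.mysql_client.begin()
#             # Avoid duplicate rows for a URL that already failed before.
#             article = self.mysql_client.getOne(
#                 "select * from failed_url where url=%s", (msg["url"],))
#             if article != False:
#                 self.mysql_client.end("commit")
#                 return
#             self.mysql_client.insertOne(
#                 "insert into failed_url(url, tag, sub_tag, version, create_time) "
#                 "values(%s, %s, %s, %s, %s)",
#                 (msg["url"], msg["tag"], msg["sub_tag"], VERSION,
#                  msg["create_time"]))
#             self.mysql_client.end("commit")
#         except Exception:
#             self.mysql_client.end("rollback")
#             LOGGER.error(traceback.format_exc())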
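# Usage sketch for BasicArticleCrawler (the dict keys match those read by
# insertSuccess; the concrete values here are hypothetical):
#
#     crawler = BasicArticleCrawler()
#     msg = {
#         "url": "http://example.com/article/1",
#         "title": "Example article",
#         "tag": "news",
#         "sub_tag": "tech",
#         "create_time": "2015-01-01 00:00:00",
#     }
#     crawler.insertSuccess(msg)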
'''
@author: lml
'''
import sys
sys.path.append("../")
sys.path.append("../../")
sys.path.append("/home/lml/webcrawler/webcrawler-nlp/crawler/")

from utils.dbmong import MongoClient
from utils.dbmysql import MysqlClient
from time import sleep

if __name__ == '__main__':
    # Report crawl progress every 10 seconds: URL counts from the three
    # MySQL tables and the number of stored articles in MongoDB.
    while True:
        print "**************************************************"
        mysql_client = MysqlClient()
        mongo_client = MongoClient()
        published_url_count = mysql_client.getOne(
            "select count(*) as count from published_url")
        print "published url count: %s" % published_url_count["count"]
        successed_url_count = mysql_client.getOne(
            "select count(*) as count from successed_url")
        print "successed url count: %s" % successed_url_count["count"]
        failed_url_count = mysql_client.getOne(
            "select count(*) as count from failed_url")
        print "failed url count: %s" % failed_url_count["count"]
        count = mongo_client.tdb.tcoll.count()
        print "mongo articles: %s" % count
        sleep(10)