コード例 #1
0
class BasicArticleCrawler(object):
    def __init__(self):
        self.mysql_client = MysqlClient()
        self.mongo_client = MongoClient().tdb.tcoll

    def insertSuccess(self, msg):
        """
        success crawle the article msg, insert into the successed db, insert into mongodb
        """
        try:
            self.mysql_client.begin()
            #             print article
            #             print msg["url"]

            article = self.mysql_client.getOne(
                "select * from failed_url where url=%s", (msg["url"], ))
            if article != False:
                article = self.mysql_client.delete(
                    "delete from failed_url where url=%s", (msg["url"], ))
                LOGGER.info("delete the article from failed_url: %s",
                            msg["url"])

            article = self.mysql_client.getOne(
                "select * from successed_url where url=%s", (msg["url"], ))
            if article != False:
                LOGGER.info("repeat crawler the article give up save: %s",
                            msg["url"])
                return

            self.mongo_client.save(msg)
            LOGGER.debug("insert into mongo: %s@%s" %
                         (msg["title"], msg["url"]))

            self.mysql_client.insertOne("insert into successed_url(url, tag, sub_tag, version, create_time) values(%s, %s, %s, %s, %s)",  \
                                        (msg["url"], msg["tag"], msg["sub_tag"], VERSION, msg["create_time"]))

            LOGGER.debug("insert successed_url %s" % (msg["url"], ))
            self.mysql_client.end("commit")

        except Exception, e:
            self.mysql_client.end("rollback")

            self.mysql_client.begin()
            self.insertFailed(msg)
            LOGGER.error("insert into mongo/successed_url error: %s" %
                         (msg["url"]))
            LOGGER.error(traceback.format_exc())
コード例 #2
0
class  BasicArticleCrawler(object):
    
    def __init__(self):
        self.mysql_client = MysqlClient()
        self.mongo_client = MongoClient().tdb.tcoll
    
    def insertSuccess(self, msg):
        """
        success crawle the article msg, insert into the successed db, insert into mongodb
        """
        try:
            self.mysql_client.begin()
#             print article
#             print msg["url"]
            
            article = self.mysql_client.getOne("select * from failed_url where url=%s", (msg["url"], ))
            if article != False:
                article = self.mysql_client.delete("delete from failed_url where url=%s", (msg["url"], ))
                LOGGER.info("delete the article from failed_url: %s", msg["url"])

            article = self.mysql_client.getOne("select * from successed_url where url=%s", (msg["url"], ))
            if article != False:
                LOGGER.info("repeat crawler the article give up save: %s", msg["url"])
                return
            
            self.mongo_client.save(msg)
            LOGGER.debug("insert into mongo: %s@%s" %(msg["title"], msg["url"]))
            
            self.mysql_client.insertOne("insert into successed_url(url, tag, sub_tag, version, create_time) values(%s, %s, %s, %s, %s)",  \
                                        (msg["url"], msg["tag"], msg["sub_tag"], VERSION, msg["create_time"]));
                                        
            LOGGER.debug("insert successed_url %s" %(msg["url"], ))
            self.mysql_client.end("commit")

        except Exception, e:
            self.mysql_client.end("rollback")

            self.mysql_client.begin()
            self.insertFailed(msg)
            LOGGER.error("insert into mongo/successed_url error: %s"  %(msg["url"]))
            LOGGER.error(traceback.format_exc())
コード例 #3
0
 def __init__(self):
     self.mysql_client = MysqlClient()
     self.mongo_client = MongoClient().tdb.tcoll
コード例 #4
0
 def __init__(self):
     self.mysql_client = MysqlClient()
     self.mongo_client = MongoClient().tdb.tcoll
コード例 #5
0
@author: lml
'''

import sys
sys.path.append("../")
sys.path.append("../../")
sys.path.append("/home/lml/webcrawler/webcrawler-nlp/crawler/")
from utils.dbmong import MongoClient
from utils.dbmysql import MysqlClient
from time import sleep

if __name__ == '__main__':
    while True:
        print "**************************************************"
        mysql_client = MysqlClient()
        mongo_client = MongoClient()
        published_url_count = mysql_client.getOne(
            "select count(*) as count from published_url")
        print "published url count: %s" % published_url_count["count"]

        successed_url_count = mysql_client.getOne(
            "select count(*) as count from successed_url")
        print "successed url count: %s" % successed_url_count["count"]

        failed_url_count = mysql_client.getOne(
            "select count(*) as count from failed_url")
        print "failed url count: %s" % failed_url_count["count"]

        count = mongo_client.tdb.tcoll.count()
        print "mongo articles: %s" % count
        sleep(10)