import os
import traceback

# MysqlClient, NewsPublisher, Daemon, LOGGER and NEWS_URL_QUEUE come from the
# crawler's own modules (e.g. utils.dbmysql) and its settings.


class FailedExtractor(Daemon):
    '''
    Daemon that re-publishes URLs from failed_url whose retry count is still
    below the threshold.
    '''

    def __init__(self, pidfile, stdin=os.devnull, stdout=os.devnull, stderr=os.devnull):
        '''
        Constructor
        '''
        super(FailedExtractor, self).__init__(pidfile, stdin, stdout, stderr)

    def run(self):
        """
        Page through failed_url and push each entry back onto the news queue.
        """
        self.mysql_client = MysqlClient()
        self.news_publisher = NewsPublisher(NEWS_URL_QUEUE)
        self.threshold = 5   # maximum retry count before a URL is given up
        self.page = 14       # page size used when reading from MySQL
        LOGGER.info("start re-extracting failed urls with count < %s" % (self.threshold, ))
        failed_count = 0
        try:
            failed_count = self.mysql_client.getOne(
                "select count(*) as c from failed_url where count < %s",
                (self.threshold, ))
        except Exception:
            LOGGER.error("failed to load the failed url count")
            LOGGER.error(traceback.format_exc())
        failed_count = int(failed_count["c"]) if failed_count else 0
        count = 0
        while count < failed_count:
            try:
                print count
                urls = self.mysql_client.getAll(
                    "select * from failed_url where count < %s limit %s, %s",
                    (self.threshold, count, self.page))
                if not urls:
                    break
                count += len(urls)
                for url in urls:
                    LOGGER.info("re-extracting url: %s" % (url["url"], ))
                    msg = self.mysql_client.getOne(
                        "select abstract, title from published_url where url = %s",
                        (url["url"], ))
                    if msg:
                        url["title"] = msg["title"]
                        url["abstract"] = msg["abstract"]
                    self.news_publisher.process(url)
            except Exception:
                LOGGER.error("re-extracting urls error")
                LOGGER.error(traceback.format_exc())
class PublishedExtractor(Daemon):
    '''
    Daemon that re-publishes URLs from published_url whose retry count is
    still below the threshold.
    '''

    def __init__(self, pidfile, stdin=os.devnull, stdout=os.devnull, stderr=os.devnull):
        '''
        Constructor
        '''
        super(PublishedExtractor, self).__init__(pidfile, stdin, stdout, stderr)

    def run(self):
        """
        Page through published_url and push each entry back onto the news queue.
        """
        self.mysql_client = MysqlClient()
        self.news_publisher = NewsPublisher(NEWS_URL_QUEUE)
        self.threshold = 5   # maximum retry count before a URL is given up
        self.page = 14       # page size used when reading from MySQL
        LOGGER.info("start re-extracting published urls with count < %s" % (self.threshold, ))
        published_count = 0
        try:
            published_count = self.mysql_client.getOne(
                "select count(*) as c from published_url where count < %s",
                (self.threshold, ))
        except Exception:
            LOGGER.error("failed to load the published url count")
            LOGGER.error(traceback.format_exc())
        published_count = int(published_count["c"]) if published_count else 0
        count = 0
        while count < published_count:
            try:
                print count
                urls = self.mysql_client.getAll(
                    "select * from published_url where count < %s limit %s, %s",
                    (self.threshold, count, self.page))
                if not urls:
                    break
                count += len(urls)
                for url in urls:
                    LOGGER.info("re-extracting url: %s" % (url["url"], ))
                    # published_url rows already carry title and abstract,
                    # so no extra lookup is needed here
                    self.news_publisher.process(url)
            except Exception:
                LOGGER.error("re-extracting urls error")
                LOGGER.error(traceback.format_exc())
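# A minimal launch sketch, not part of the original source: it assumes the
# project's Daemon base class follows the common pidfile-daemon recipe with
# start()/stop()/restart() methods, and the pidfile path below is hypothetical.
# The same pattern would apply to FailedExtractor.
import sys

if __name__ == '__main__':
    daemon = PublishedExtractor("/tmp/published_extractor.pid")
    if len(sys.argv) == 2 and sys.argv[1] in ("start", "stop", "restart"):
        # e.g. "python published_extractor.py start"
        getattr(daemon, sys.argv[1])()
    else:
        print "usage: %s start|stop|restart" % sys.argv[0]
        sys.exit(2)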
class BaseExtractor(object):
    '''
    Common base for the link extractors: holds the source url/tag config and
    the MySQL / queue clients shared by all extractors.
    '''

    def __init__(self, config):
        '''
        Constructor
        '''
        self.url = config.get("url", "")
        self.tag = config.get("tag", "default tag")
        self.sub_tag = config.get("sub_tag", None)
        self.mysql_client = MysqlClient()
        self.news_publisher = NewsPublisher(NEWS_URL_QUEUE)

    def extract_links(self):
        """
        Extract links from self.url; implemented by the concrete extractors.
        """

    def formatMsg(self, url, tag, sub_tag, title, abstract, priority=0):
        """
        Build the message dict that is pushed onto the news queue.
        """
        msg = {}
        msg["url"] = url
        msg["tag"] = tag
        msg["sub_tag"] = sub_tag
        msg["title"] = title
        msg["abstract"] = abstract
        msg["__priority"] = priority
        msg["version"] = VERSION
        msg["create_time"] = int(time.time() * 1000)  # milliseconds
        return msg

    def isPublished(self, url):
        """
        Return True if the url is already in published_url; on a DB error,
        also return True so the url is not published twice.
        """
        try:
            url_is_exists = self.mysql_client.getOne(
                "select * from published_url where url=%s", (url, ))
            if url_is_exists == False:
                return False
            else:
                return True
        except Exception:
            return True
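# A hypothetical concrete extractor, for illustration only (ListPageExtractor
# is not part of the original source): it fetches the listing page with
# urllib2, pulls absolute links out with a regex, skips URLs that are already
# published, and pushes the rest onto the news queue. Title and abstract are
# left empty here because extracting them depends on the target site's markup.
import re
import urllib2


class ListPageExtractor(BaseExtractor):

    def extract_links(self):
        """
        Extract article links from self.url and publish the unseen ones.
        """
        html = urllib2.urlopen(self.url, timeout=10).read()
        for link in set(re.findall(r'href="(https?://[^"]+)"', html)):
            if self.isPublished(link):
                continue
            msg = self.formatMsg(link, self.tag, self.sub_tag, title="", abstract="")
            self.news_publisher.process(msg)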
class BasicArticleCrawler(object):

    def __init__(self):
        self.mysql_client = MysqlClient()
        self.mongo_client = MongoClient().tdb.tcoll

    def insertSuccess(self, msg):
        """
        Successfully crawled article: drop it from failed_url, save the full
        document to MongoDB and record the url in successed_url, all in one
        MySQL transaction.
        """
        try:
            self.mysql_client.begin()
            article = self.mysql_client.getOne(
                "select * from failed_url where url=%s", (msg["url"], ))
            if article != False:
                self.mysql_client.delete(
                    "delete from failed_url where url=%s", (msg["url"], ))
                LOGGER.info("delete the article from failed_url: %s", msg["url"])
            article = self.mysql_client.getOne(
                "select * from successed_url where url=%s", (msg["url"], ))
            if article != False:
                LOGGER.info("article already crawled, skip saving: %s", msg["url"])
                # close the open transaction before bailing out
                self.mysql_client.end("commit")
                return
            self.mongo_client.save(msg)
            LOGGER.debug("insert into mongo: %s@%s" % (msg["title"], msg["url"]))
            self.mysql_client.insertOne(
                "insert into successed_url(url, tag, sub_tag, version, create_time) values(%s, %s, %s, %s, %s)",
                (msg["url"], msg["tag"], msg["sub_tag"], VERSION, msg["create_time"]))
            LOGGER.debug("insert successed_url %s" % (msg["url"], ))
            self.mysql_client.end("commit")
        except Exception:
            self.mysql_client.end("rollback")
            self.mysql_client.begin()
            self.insertFailed(msg)
            LOGGER.error("insert into mongo/successed_url error: %s" % (msg["url"], ))
            LOGGER.error(traceback.format_exc())
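# insertFailed is called by insertSuccess above but is not shown in this
# snippet. A plausible sketch, under two assumptions: failed_url has at least
# the columns url, tag, sub_tag and the retry counter "count" that the
# re-extract daemons page on, and the caller has already opened a transaction
# with begin() (as insertSuccess does before calling it).
class ArticleCrawlerWithRetry(BasicArticleCrawler):

    def insertFailed(self, msg):
        """
        Record a failed crawl: bump the retry counter if the url is already
        known, otherwise insert it with count = 1.
        """
        try:
            row = self.mysql_client.getOne(
                "select * from failed_url where url=%s", (msg["url"], ))
            if row != False:
                # no update() helper is shown for MysqlClient, so re-insert the
                # row with an incremented retry counter instead
                self.mysql_client.delete(
                    "delete from failed_url where url=%s", (msg["url"], ))
                self.mysql_client.insertOne(
                    "insert into failed_url(url, tag, sub_tag, count) values(%s, %s, %s, %s)",
                    (msg["url"], msg["tag"], msg["sub_tag"], int(row["count"]) + 1))
            else:
                self.mysql_client.insertOne(
                    "insert into failed_url(url, tag, sub_tag, count) values(%s, %s, %s, 1)",
                    (msg["url"], msg["tag"], msg["sub_tag"]))
            self.mysql_client.end("commit")
        except Exception:
            self.mysql_client.end("rollback")
            LOGGER.error("insert into failed_url error: %s" % (msg["url"], ))
            LOGGER.error(traceback.format_exc())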
'''
Simple monitoring script: print the published / successed / failed URL counts
and the number of articles stored in MongoDB every 10 seconds.

@author: lml
'''
import sys
sys.path.append("../")
sys.path.append("../../")
sys.path.append("/home/lml/webcrawler/webcrawler-nlp/crawler/")

from utils.dbmong import MongoClient
from utils.dbmysql import MysqlClient
from time import sleep

if __name__ == '__main__':
    while True:
        print "**************************************************"
        mysql_client = MysqlClient()
        mongo_client = MongoClient()
        published_url_count = mysql_client.getOne("select count(*) as count from published_url")
        print "published url count: %s" % published_url_count["count"]
        successed_url_count = mysql_client.getOne("select count(*) as count from successed_url")
        print "successed url count: %s" % successed_url_count["count"]
        failed_url_count = mysql_client.getOne("select count(*) as count from failed_url")
        print "failed url count: %s" % failed_url_count["count"]
        count = mongo_client.tdb.tcoll.count()
        print "mongo articles: %s" % count
        sleep(10)
        print ""