class PaserControl(threading.Thread):
    """Worker thread that pulls batches of URL records and parses them.

    Each record is routed to a site-specific parser module, selected by the
    ``domain`` of the website document the record's ``website_id`` refers to.
    """

    def __init__(self):
        super(PaserControl, self).__init__()
        self._collector = Collector()
        # Both values come from the "html_parser" config section:
        # _urlCount is passed to Collector.getUrls, _interval to time.sleep.
        self._urlCount = int(tools.getConfValue("html_parser", "url_count"))
        self._interval = int(tools.getConfValue("html_parser", "sleep_time"))

    def run(self):
        """Fetch and parse URL batches until the collector reports it is finished.

        Any ``Exception`` raised while handling a batch is logged and the loop
        carries on with the next batch after sleeping for the configured
        interval.
        """
        while True:
            # Reset on every iteration so the error handler below never sees
            # an unbound name (first iteration) or a stale batch left over
            # from a previous iteration.
            urls = None
            try:
                urls = self._collector.getUrls(self._urlCount)
                print("取到的url大小 %d" % len(urls))

                # Check whether the collector has finished.
                if self._collector.isFinished():
                    log.debug("-------------- 结束 --------------")
                    break

                for url in urls:
                    self.parseUrl(url)

                time.sleep(self._interval)
            except Exception as e:
                log.debug(urls)
                log.debug(e)
                # Pause before retrying so that a persistent failure does not
                # turn this loop into a busy spin.
                time.sleep(self._interval)

    def parseUrl(self, urlInfo):
        """Dispatch one URL record to the parser matching its website's domain.

        ``urlInfo`` is indexed with the key ``'website_id'``; that id is used
        to look up the website document whose ``'domain'`` selects the parser.
        Records whose domain matches none of the known constants are ignored.
        Any ``Exception`` raised along the way is logged and swallowed, so a
        single bad record does not abort the rest of the batch.
        """
        try:
            # Read inside the try block: a record without 'website_id' should
            # be logged and skipped like any other per-record failure.
            website_id = urlInfo['website_id']

            websites = list(db.website.find({'_id': website_id}))
            if not websites:
                # Report the missing website explicitly rather than through an
                # opaque "list index out of range" message.
                log.debug("no website found for website_id %s" % website_id)
                return
            domain = websites[0]['domain']

            if domain == Constance.IFENG:
                ifeng.parseUrl(urlInfo)
            elif domain == Constance.SOHU:
                sohu.parseUrl(urlInfo)
            elif domain == Constance.TENCENT:
                tencent.parseUrl(urlInfo)
            elif domain == Constance.SINA:
                # Sina parsing is currently disabled; matching records are
                # deliberately skipped.
                pass
            elif domain == Constance.CCTV:
                cctv.parseUrl(urlInfo)
            elif domain == Constance.PEOPLE:
                people.parseUrl(urlInfo)
            elif domain == Constance.WANG_YI:
                wangyi.parseUrl(urlInfo)
            elif domain == Constance.XIN_HUA:
                xinhua.parseUrl(urlInfo)
        except Exception as e:
            log.debug(e)
class PaserControl(threading.Thread):
    """Thread that keeps fetching URL batches and hands each entry to a site parser.

    The parser module is picked from the ``domain`` of the website document
    that the entry's ``website_id`` points at.
    """

    def __init__(self):
        super(PaserControl, self).__init__()
        self._collector = Collector()

        # _urlCount is passed to Collector.getUrls; _interval to time.sleep.
        section = "html_parser"
        self._urlCount = int(tools.getConfValue(section, "url_count"))
        self._interval = int(tools.getConfValue(section, "sleep_time"))

    def run(self):
        """Fetch a batch, parse every entry, sleep, and repeat.

        The loop has no exit condition and no error handling: it only stops
        when an exception propagates out of it.
        """
        while True:
            batch = self._collector.getUrls(self._urlCount)
            batch_size = len(batch)
            print("取到的url大小 %d" % batch_size)

            for entry in batch:
                self.parseUrl(entry)

            time.sleep(self._interval)

    def parseUrl(self, urlInfo):
        """Route one URL record to the parser module for its website's domain.

        Records whose domain matches none of the known constants are ignored.
        Lookup failures (missing key, no matching website) propagate to the
        caller.
        """
        website_id = urlInfo['website_id']
        matches = list(db.website.find({'_id': website_id}))
        domain = matches[0]['domain']

        # Choose the parser module first, then make a single call below.
        if domain == Constance.YOUKU:
            site = youku
        elif domain == Constance.TENCENT:
            site = tencent
        elif domain == Constance.WANG_YI:
            site = wangyi
        elif domain == Constance.PPTV:
            site = pptv
        elif domain == Constance.KAN_KAN:
            site = kankan
        elif domain == Constance.CCTV:
            site = cctv
        elif domain == Constance.TUDOU:
            site = tudou
        elif domain == Constance.V1:
            site = v1
        elif domain == Constance.KU6:
            site = ku6
        else:
            return

        site.parseUrl(urlInfo)
class PaserControl(threading.Thread):
    """Thread that drains URL batches from a Collector and forwards them to ``parser``."""

    def __init__(self):
        super(PaserControl, self).__init__()
        self._collector = Collector()

        # Raw config values are converted to int before being stored.
        raw_count = tools.getConfValue("html_parser", "url_count")
        self._urlCount = int(raw_count)
        raw_interval = tools.getConfValue("html_parser", "sleep_time")
        self._interval = int(raw_interval)

    def run(self):
        """Process batches until the collector reports that it is finished.

        The finished check happens right after each fetch; when it is true the
        batch just fetched is not parsed and the method returns.
        """
        while True:
            fetched = self._collector.getUrls(self._urlCount)
            print("取到的url大小 %d" % len(fetched))

            if not self._collector.isFinished():
                for item in fetched:
                    self.parseUrl(item)
                time.sleep(self._interval)
                continue

            # The collector is finished: log the end marker and stop.
            log.debug("-------------- 结束 --------------")
            return

    def parseUrl(self, urlInfo):
        """Forward a single URL record to ``parser.parseUrl``."""
        parser.parseUrl(urlInfo)