import gzip
import logging

import lxml.html
import lxml.html.clean

# HttpReactor and to_unicode are project-local helpers imported elsewhere.


class MiningServer(object):
    def __init__(self):
        self.reactor = HttpReactor()
        # Strip <style>/<script> blocks but keep page structure and attributes.
        self.cleaner = lxml.html.clean.Cleaner(style=True, scripts=True,
                                               page_structure=False,
                                               safe_attrs_only=False)
        self.html_parser = lxml.html.HTMLParser(encoding="utf-8")
        self.logger = logging.getLogger("root")

    def process_body(self, body, url, obj_id):
        body = to_unicode(body)
        # str.replace returns a new string, so the result must be reassigned.
        body = body.replace('<?xml version="1.0" encoding="utf-8"?>', "")
        body = self.cleaner.clean_html(body)
        # Persist the cleaned page, gzip-compressed, keyed by its object id.
        with open("../data/mining_task/" + str(obj_id), "wb") as fout:
            g = gzip.GzipFile(mode="wb", fileobj=fout)
            try:
                g.write(body.encode("utf-8"))
            finally:
                g.close()
        print url
        # print body[:100].encode('utf-8')

    def process_error(self, failure, url, obj_id):
        print failure.getErrorMessage()
        self.logger.error("download error, url:%s, msg:%s"
                          % (url, failure.getTraceback()))

    def process_task(self, url, obj_id):
        url = url.encode("utf-8")
        # Callbacks are passed as (callable, extra_args, extra_kwargs) tuples.
        requestProcess = (lambda x: None, (), {})
        bodyProcess = (self.process_body, (url, obj_id), {})
        errorProcess = (self.process_error, (url, obj_id), {})
        # print "process_task:", url
        self.reactor.download_and_process(url, None, requestProcess,
                                          bodyProcess, errorProcess,
                                          redirect=True)

    def run(self):
        self.reactor.run()
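
# to_unicode is not defined in this snippet. A minimal sketch of what it is
# assumed to do -- decode raw response bytes to unicode, trying utf-8 first
# and falling back to gbk, a common encoding on Chinese news sites. The
# fallback order is an assumption, not the project's confirmed behavior.

def to_unicode(body):
    """Best-effort decode of a raw HTTP body to unicode (assumed helper)."""
    if isinstance(body, unicode):
        return body
    for enc in ("utf-8", "gbk"):
        try:
            return body.decode(enc)
        except UnicodeDecodeError:
            continue
    # Last resort: decode permissively so processing can continue.
    return body.decode("utf-8", "replace")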
import ConfigParser
import logging.config
import threading


def main():
    #signal.signal(signal.SIGINT, lambda : sys.exit(0))
    #signal.signal(signal.SIGTERM, lambda : sys.exit(0))
    logging.config.fileConfig("../conf/log_mining_crawler.conf")
    reactor = HttpReactor()
    threadpool = HttpThreadpool(40, 200)
    config = ConfigParser.ConfigParser()
    config.read('../conf/mining_crawler.conf')
    init_url = [
        'http://news.qq.com/',
        'http://news.163.com/',
        'http://news.sina.com.cn/',
        'http://news.ifeng.com/',
        'http://news.sohu.com/',
        'http://www.xinhuanet.com/',
    ]
    # Debug override: crawl a single seed while testing.
    init_url = ['http://news.qq.com/']
    # One MinerServer per seed, each driven from its own daemon thread.
    for url in init_url:
        miner_server = MinerServer(reactor, threadpool, [url], config, False)
        t = threading.Thread(target=miner_server.start, args=(False,))
        t.setDaemon(True)
        t.start()
    url = 'http://sports.163.com/'
    #first_task = miner_server.db_helper.init_mining_job(url)
    #miner_server.process_task(first_task)
    reactor.run()
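
# The commented-out signal wiring above would raise a TypeError if it ever
# fired: signal handlers are invoked as handler(signum, frame), so a
# zero-argument lambda does not match. A sketch of the intended
# clean-shutdown hook with the correct handler signature:

import signal
import sys


def install_shutdown_handlers():
    handler = lambda signum, frame: sys.exit(0)
    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)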
def main():
    #signal.signal(signal.SIGINT, lambda : sys.exit(0))
    #signal.signal(signal.SIGTERM, lambda : sys.exit(0))
    logging.config.fileConfig("../conf/seed_log.conf")
    conf = dict(address="localhost", port=10010, db_name="news_crawler")
    queue_service = BlockingQueueService(100)
    handler = SeedHandler(queue_service)
    scheduler = SeedScheduler('background', handler, conf)
    scheduler.start()
    reactor = HttpReactor()
    config = ConfigParser.ConfigParser()
    config.read('../conf/url_dedup.conf')
    # The hub server consumes seed tasks from the queue on its own thread.
    hubserver = HubServer(reactor, queue_service, config)
    t = threading.Thread(target=hubserver.start)
    t.daemon = True
    t.start()
    url = "http://roll.news.sina.com.cn/s/channel.php?ch=01#col=89&spec=&type=&ch=01&k=&offset_page=0&offset_num=0&num=60&asc=&page=1"
    # Debug override: use the 163 front page instead of the sina roll page.
    url = "http://www.163.com/"
    for _ in xrange(2):
        queue_service.put(SeedTask(url), 1)
    hubserver.process_task(url)
    reactor.run()
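
# BlockingQueueService is not shown in this snippet. A minimal sketch of the
# interface this main() relies on -- a bounded queue whose put() takes a task
# plus a second integer argument -- built on Python 2's Queue module. Treating
# that second argument as a priority is an assumption; it could equally be a
# block flag in the real implementation.

import Queue


class BlockingQueueService(object):
    def __init__(self, capacity):
        self._q = Queue.PriorityQueue(maxsize=capacity)

    def put(self, task, priority):
        # Blocks when the queue is full, giving back-pressure on seeding.
        self._q.put((priority, task))

    def get(self):
        priority, task = self._q.get()
        return task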
def main():
    reactor = HttpReactor()
    url = 'http://3g.163.com/news/16/0101/00/BC70TOEK00014AED.html'
    requestProcess = (process_request, (url,), {})
    bodyProcess = (process_body, (url,), {})
    errorProcess = (process_error, (url,), {})
    reactor.download_and_process(url, None, requestProcess,
                                 bodyProcess, errorProcess, redirect=True)
    reactor.run()
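
# process_request, process_body, and process_error are not defined in this
# snippet. Judging from MiningServer above, each callback receives the
# reactor's result first, followed by the extra args from its tuple; a
# minimal sketch under that assumption:

def process_request(request, url):
    print "requesting:", url


def process_body(body, url):
    print "downloaded %d bytes from %s" % (len(body), url)


def process_error(failure, url):
    # failure is assumed to be a Twisted-style Failure, as in MiningServer.
    print "error for %s: %s" % (url, failure.getErrorMessage())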