# -*- coding: utf-8 -*-
# Spider master/domain classes: the article spider (Articles, Domains,
# Catecorys) and the image crawler (Master, Domain), both built on the
# PQDict priority queue sketched below.
import re
import time
import json
import hashlib
from collections import OrderedDict

import redis
import pymongo

# Project helpers used below but defined elsewhere in the package (module
# paths not shown in this file): GeventWorker, MongoImage, the HTTP
# helpers get() and u(), html2doc/clean_doc, the @log decorator,
# unicode2hash, url2tpl, re_date and album_score.
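# Every queue below is a PQDict: a dict-like priority queue whose entries
# are addressed by key(entry) and ordered by score(entry). The real class
# lives elsewhere in the project; the sketch below is an assumption that
# covers only the behaviour the call sites here rely on, so the rest of
# the file can be read against a concrete contract.
import heapq
from contextlib import contextmanager


class _Entry(object):
    def __init__(self, value):
        self.value = value


class PQDict(object):
    """Minimal sketch: lowest score pops first; keys stay unique."""

    def __init__(self, key, score=None):
        self._key = key
        self._score = score or (lambda e: 0)
        self._items = {}                  # key -> _Entry
        self._heap = []                   # (score, key), pruned lazily

    def _rate(self, entry):
        try:
            return self._score(entry)
        except TypeError:                 # zero-arg scorers, e.g. time.time
            return self._score()

    def put(self, value):
        entry = _Entry(value)
        k = self._key(entry)
        self._items[k] = entry
        heapq.heappush(self._heap, (self._rate(entry), k))

    def get(self):
        while self._heap:                 # skip keys removed since push
            _, k = heapq.heappop(self._heap)
            if k in self._items:
                return self._items.pop(k).value
        raise KeyError('PQDict is empty')

    def pop(self, k):
        return self._items.pop(k).value

    def popitem(self):
        # Arbitrary item; the real class presumably pops by priority.
        k = next(iter(self._items))
        return k, self.pop(k)

    def extend(self, values):
        for value in values:
            self.put(value)

    def tail(self, n):
        # Drop and return the n worst-scored values (see Articles.back).
        entries = sorted(self._items.itervalues(),
                         key=self._rate, reverse=True)[:n]
        return [self.pop(self._key(e)) for e in entries]

    def itervalues(self):
        # Call sites iterate this and extend() the survivors back in,
        # which implies draining semantics; assumed here.
        while self._items:
            yield self.popitem()[1]

    def heapify(self):
        # Re-score everything after entries were mutated in place.
        self._heap = [(self._rate(e), k) for k, e in self._items.items()]
        heapq.heapify(self._heap)

    @contextmanager
    def get2do(self, k=None):
        # Borrow one value, hand it to the caller, re-insert it after.
        value = self.pop(k) if k is not None else self.get()
        try:
            yield value
        finally:
            self.put(value)

    def __contains__(self, k):
        return k in self._items

    def __len__(self):
        return len(self._items)

    def __nonzero__(self):
        return bool(self._items)

    def __iter__(self):
        # Plain `for x in pq` must not drain (Master iterates its domains
        # every sync round), so yield from a snapshot instead.
        return iter([e.value for e in self._items.values()])

    def __getitem__(self, k):
        return self._items[k].value

    def __setitem__(self, k, value):
        self.put(value)

    def __delitem__(self, k):
        del self._items[k]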
class Articles(object):
    """Per-domain article queue for the article spider."""

    def __init__(self, domain):
        self.domain = domain
        self.domains = domain.domains
        self.articles = domain.domains.master.articles
        self.article = domain.domains.article
        self.exc_article = domain.domains.exc_article
        self.log = domain.log
        # newest pubtime first: score is the negated publish time
        self.queue = PQDict(
            key=lambda x: x.value['_id'],
            score=lambda x: -x.value['pubtime'],
        )
        self.updates = set()
        self.full = 2000
        self.common = 1000
        self.limit = 100
        self.null = False
        self.next = 0
        self.last = 0
        self.xcount = 0
        self.xlast = 0

    def __contains__(self, key):
        return key in self.queue or self.domains.is_fetching(key)

    def __nonzero__(self):
        return len(self.queue) > 0

    def __len__(self):
        return len(self.queue)

    def put(self, article, update=False):
        if article['_id'] in self:
            return 0
        if update:
            self.updates.add(article['_id'])
        if self.next < article['pubtime']:
            self.next = article['pubtime']
        self.queue.put(article)
        return 1

    def save(self, article, update=False):
        if update or article['_id'] in self.updates:
            if article['_id'] in self.updates:
                self.updates.remove(article['_id'])
            if 'exc' in article:
                self.exc_article.save(article)
            else:
                self.article.save(article)
            return 1
        return 0

    def new(self, url, src_type, src, task, last=0):
        key = hashlib.md5(url.encode('utf-8')).hexdigest()
        xlong = unicode2hash(url)
        tpl = url2tpl(url)
        if tpl not in self.domain.tpls \
                or key in self \
                or self.domains.add_url(xlong, self.domain.id()) == 0:
            return 0
        article = {
            '_id': key,
            'id': '',
            'long': xlong,
            'url': url,
            'domain': self.domain.id(),
            'tpl': tpl,
            'src_type': src_type,
            'src': src,
            'html': '',
            'title': '',
            'pages': {},
            'imgs': {},
            'icons': {},
            'tags': [],
            'sim': False,
            'f': False,
            'version': 0,
            'v': self.articles.new_version(),
            'created': time.time(),
            'last': time.time(),
        }
        if src_type == 'cate':
            article['src_link'] = task['url']
            article['src_name'] = task['name']
            if last > 0:
                article['pubtime'] = last
            else:
                article['pubtime'] = time.time() - 86400 * 60
        else:
            article['src_link'] = self.domain.domain['link']
            article['src_name'] = self.domain.domain['name']
            article['pubtime'] = task['pubtime'] - 86400 * 15
        article['pubtime'] = self.get_pubtime(article)
        if self.next < article['pubtime']:
            self.next = article['pubtime']
        self.updates.add(article['_id'])
        self.queue.put(article)
        return 1

    def sync(self, exit):
        if exit:
            self.back_on_exit()
        else:
            if len(self.queue) >= self.full * 2:
                self.back()
            elif len(self.queue) <= self.limit and (
                    not self.null
                    or time.time() >= self.last + self.domains.sync_round):
                self.load()
                self.last = time.time()

    def load(self):
        doc = {
            'domain': self.domain.id(),
            'f': False,
        }
        articles = self.article.find(doc)
        articles.sort('pubtime', pymongo.DESCENDING).limit(self.common)
        num = 0
        for article in articles:
            num += self.put(article)
        self.null = num == 0
        self.log.info('load %d/%d articles from %s to fetch.'
                      % (num, min(articles.count(), self.common),
                         self.domain.id()))

    def back(self):
        # write the oldest overflow back to Mongo, keep the newest in memory
        count = len(self.queue)
        limit = count - self.common * 2
        for article in self.queue.tail(limit):
            self.save(article)
        self.null = False
        self.log.info('back %d/%d articles from %s.'
                      % (limit, count, self.domain.id()))

    def back_on_exit(self):
        count, num = len(self.queue), 0
        for article in self.queue.itervalues():
            num += self.save(article)
        self.log.info('back %d/%d articles from %s on exit.'
                      % (num, count, self.domain.id()))

    def get(self, now=0):
        if self.queue:
            if now > 0:
                today = now // 86400 * 86400
                if self.next == 0 or today < self.next <= now:
                    article = self.queue.get()
                    self.next = article['pubtime']
                    if today < self.next <= now:
                        return article
                    self.queue.put(article)
            else:
                # if self.articles.doing_len() > 5000:
                #     if self.xcount >= 50 and time.time() - self.xlast < 5:
                #         return None
                #     if time.time() - self.xlast >= 5:
                #         self.xcount = 0
                #         self.xlast = time.time()
                #     self.xcount += 1
                return self.queue.get()

    def cancel(self, article):
        self.queue.put(article)

    def fetch(self, article, res):
        article['last'] = time.time()
        if 'exc' not in res:
            article['html'] = res['path']
            article['f'] = True
            self.save(article, update=True)
            ext = {
                'html': res['html'],
                'selector': self.domain.tpls.selector(article['tpl']),
            }
            if self.articles._len < 30000 \
                    or time.time() - article['pubtime'] < 86400:
                self.articles.put(article, ext=ext)
            self.log.debug('fetch article %s.' % article['url'])
        else:
            article['exc'] = res['exc']
            self.save(article, update=True)
            self.log.warn('fetch article except(%s): %s.'
                          % (res['exc'], article['url']))

    def get_pubtime(self, article):
        now = time.time()
        match = re_date.search(article['url'])
        if match:
            year, month, day = match.group(1), match.group(2), match.group(3)
            if not day:
                day = '01'
            try:
                return min(now, time.mktime(time.strptime(
                    '%s-%s-%s' % (year, month, day), '%Y-%m-%d')) + 43200)
            except (ValueError, OverflowError):
                pass
        if article['pubtime']:
            return article['pubtime']
        return now - 86400 * 60
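# Reading Articles.get(now): `today` floors `now` to the start of the
# current (epoch-aligned) day, so an article is only released while its
# pubtime sits inside the window (today, now] -- older items wait in the
# queue and anything outside the window is put straight back. A toy check
# of just that window test, assuming plain epoch-second timestamps:
def _in_window(pubtime, now):
    today = now // 86400 * 86400
    return today < pubtime <= now

_now = 1410705419.0                            # any fixed timestamp works
assert _in_window(_now - 60, _now)             # published earlier today
assert not _in_window(_now - 86400 * 2, _now)  # two days old: held back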
class Master(GeventWorker):
    """Image-crawler master: owns the Mongo handles and the Domain pool."""

    def __init__(self, count, conf):
        super(Master, self).__init__(count)
        db = MongoImage(conf, 'img')
        self.domain = db.domain
        self.catecory = db.catecory
        self.album = db.album
        self.file = db.file
        self.sync_round = 90
        self.doing_round = (count + 1) / 2
        self.domains = PQDict(key=lambda x: x.value.domain['_id'])
        self.last = 0

    def run(self):
        try:
            self.sync(init=True)
            while not self.is_exit():
                print "I'm coming"
                self.doing()
                self.clean()
                self.sync()
                self.wait(0.1)
        except KeyboardInterrupt:
            print 'keyboardInterrupt'
            self.sync(exit=True)

    def sync(self, init=False, exit=False):
        if not init and not exit \
                and self.sync_round > time.time() - self.last:
            return
        last = time.time()
        domains, cates, albums = {}, {}, {}
        if not init:
            for domain in self.domains:
                domains.update(domain.back_domain(self.last, last))
                cates.update(domain.back_cate(self.last, last))
                albums.update(domain.back_album(self.last, last))
        if not exit:
            self.load_domains(init, self.last, last, domains)
            self.load_cates(init, self.last, last, cates)
            self.load_albums(init, self.last, last, albums)
        for domain in domains.itervalues():
            self.domain.save(domain)
        for cate in cates.itervalues():
            self.catecory.save(cate)
        for album in albums.itervalues():
            album = album.copy()
            album['pages'] = json.dumps(album['pages'])
            album['imgs'] = json.dumps(album['imgs'])
            self.album.save(album)
        self.last = last    # advance the sync window

    @log
    def doing(self):
        for x in xrange(self.doing_round):
            if self.is_exit():
                return
            domain = self.domains.get()
            task = domain.get()
            self.domains.put(domain)
            if not task:
                break
            self.do(task)

    @log
    def handle(self, index, task):
        try:
            # the worker hands over an (index, task) pair; unpack it
            index, task = task[0], task[1]
            url = task['_id']
            html = get(u(url))
            html = clean_doc(html2doc(html, url=url), return_html=True)
            if index == 'cate':
                self.domains[task['domain']].parse_cate(task, {'html': html})
            elif index == 'album':
                pass
            elif index == 'page':
                self.domains[task['domain']].parse_album(task, {'html': html})
        except KeyboardInterrupt:
            self.exit()

    @log
    def load_domains(self, init, start, end, domains):
        doc = {'status': 'valid'} if init \
            else {'last': {'$gt': start, '$lte': end}}
        for domain in self.domain.find(doc):
            self.add_domain(init, domain, domains)

    @log
    def load_cates(self, init, start, end, cates):
        doc = {'status': 'wait'} if init \
            else {'last': {'$gt': start, '$lte': end}}
        for cate in self.catecory.find(doc):
            if cate['domain'] in self.domains:
                self.domains[cate['domain']].add_cate(cate, cates)

    @log
    def load_albums(self, init, start, end, albums):
        doc = {'status': 'wait'} if init \
            else {'last': {'$gt': start, '$lte': end}}
        for album in self.album.find(doc):
            album['pages'] = json.loads(album['pages'])
            album['imgs'] = json.loads(album['imgs'])
            if album['domain'] in self.domains:
                self.domains[album['domain']].add_album(album, albums)

    @log
    def add_domain(self, init, domain, domains):
        _id = domain['_id']
        if domain['status'] == 'valid' and _id not in self.domains:
            domain = Domain(self, domain)
            if not init:
                domain.load()
            self.domains.put(domain)
        elif domain['status'] == 'invalid' and _id in self.domains:
            tmp = self.domains.pop(_id).domain
            if tmp['last'] < self.last:
                tmp['status'] = 'invalid'
                domains[_id] = tmp

    @log
    def on_exit(self):
        self.sync(exit=True)
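# A minimal way to stand Master up, assuming GeventWorker(count) starts a
# worker pool and `conf` carries whatever MongoImage expects (both are
# defined outside this file); the config loader named here is
# hypothetical:
if __name__ == '__main__':
    conf = load_conf('img.yaml')    # hypothetical helper
    master = Master(8, conf)        # 8 workers -> doing_round == 4
    master.run()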
class Domains(object):
    """Registry of article-spider domains; bridges the fetch/handle adapters."""

    def __init__(self, master):
        self.master = master
        self.log = master.log
        self.domain = master.mongo.domain
        self.catecory = master.mongo.catecory
        self.template = master.mongo.template
        self.article = master.mongo.spider_article
        self.exc_article = master.mongo.spider_exc
        self.html_file = master.mongo.html_file
        self.url_redis = redis.Redis(**master.conf.redis_url)
        self.domains = PQDict(key=lambda x: x.value.domain['_id'])
        self.fetching = dict()
        self.waiting = OrderedDict()
        self.doing = dict()
        self.sync_round = 1800
        self.last = -1
        master.fetch_adapter.register(
            ['cate', 'art', 'page', 'img'],
            self.fetch_get, self.fetch, self.fetch_cancel)
        master.handle_adapter.register(
            ['cate'],
            self.handle_get, self.handle, self.handle_cancel)

    @property
    def counter(self):
        res = {
            'fetch': len(self.fetching),
            'domain_wait': len(self.waiting),
            'domain_doing': len(self.doing),
            'fetch_cate': 0,
            'fetch_art': 0,
            'fetch_img': 0,
            'fetch_page': 0,
        }
        for cmd, _ in self.fetching.itervalues():
            res['fetch_' + cmd] += 1
        return res

    def is_fetching(self, key):
        return key in self.fetching

    def is_doing(self, key):
        return key in self.fetching \
            or key in self.waiting \
            or key in self.doing

    def sync(self, quit=False):
        self.sync_last(quit)
        self.sync_other(quit)

    def sync_last(self, quit):
        if not quit and self.sync_round > time.time() - self.last:
            return
        last = time.time()
        domains, tpls = {}, {}
        for domain in self.domains:
            domains.update(domain.back(self.last, last))
            tpls.update(domain.tpls.back(self.last, last))
        if not quit:
            self.load_domains(self.last, last)
            self.load_tpls(self.last, last)
        for domain in domains.itervalues():
            self.domain.save(domain)
        for tpl in tpls.itervalues():
            self.template.save(tpl)
        self.last = last

    def sync_other(self, quit):
        for domain in self.domains:
            domain.sync(quit)
        self.domains.heapify()

    def load_domains(self, start, end):
        doc = {'status': 'common', 'last': {'$gt': start, '$lte': end}}
        domains = self.domain.find(doc)
        for domain in domains:
            if domain['_id'] not in self.domains:
                xdomain = Domain(self, domain)
                self.domains.put(xdomain)
                if start > 0:
                    xdomain.load()
            else:
                self.domains[domain['_id']].update(domain)
        if domains.count() > 0:
            self.log.info('load %d domains.' % domains.count())

    def load_tpls(self, start, end):
        doc = {'status': 'common', 'last': {'$gt': start, '$lte': end}}
        tpls = self.template.find(doc)
        for tpl in tpls:
            with self.domains.get2do(tpl['domain']) as domain:
                domain.tpls.put(tpl)
        if tpls.count() > 0:
            self.log.info('load %d tpls.' % tpls.count())

    def add_url(self, key, domain):
        return self.url_redis.sadd(domain, key)

    def has_url(self, key, domain):
        return self.url_redis.sismember(domain, key)

    def selector(self, domain, tpl):
        if domain in self.domains:
            return self.domains[domain].tpls.selector(tpl)
        return {}

    def new_arts(self, article, urls):
        with self.domains.get2do(article['domain']) as domain:
            return domain.new_arts(urls, 'art', article)
        return 0

    def fetch_page(self, page):
        with self.domains.get2do(page['domain']) as domain:
            return domain.pages.put(page)
        return False

    def fetch_img(self, img):
        with self.domains.get2do(img['domain']) as domain:
            return domain.imgs.put(img)
        return False

    def new_handle(self, key, value):
        self.waiting[key] = value

    def quit(self):
        for cmd, task in self.fetching.itervalues():
            self.domains[task['domain']].cancel(cmd, task)
        self.fetching.clear()
        for cmd, task, _ in self.waiting.itervalues():
            self.domains[task['domain']].cancel(cmd, task)
        self.waiting.clear()
        for cmd, task in self.doing.itervalues():
            self.domains[task['domain']].cancel(cmd, task)
        self.doing.clear()

    def fetch_get(self, count):
        tasks = []
        if not self.domains:
            return tasks
        null = 0
        for _ in xrange(count):
            cmd, task = None, None
            with self.domains.get2do() as domain:
                cmd, task = domain.get()
            if not task:
                null += 1
                if null >= 5:
                    break
                self.domains.heapify()
                continue
            tasks.append({'key': task['_id'], 'cmd': cmd, 'info': task})
            self.fetching[task['_id']] = (cmd, task)
        return tasks

    def fetch_cancel(self, key, cmd):
        if key in self.fetching:
            cmd, task = self.fetching.pop(key)
            with self.domains.get2do(task['domain']) as domain:
                domain.cancel(cmd, task)

    def fetch(self, key, cmd, res):
        if key in self.fetching:
            cmd, task = self.fetching.pop(key)
            with self.domains.get2do(task['domain']) as domain:
                domain.fetch(cmd, task, res)

    def handle_get(self, count, **kwargs):
        tasks = []
        for _ in xrange(min(len(self.waiting), count)):
            cmd, task, ext = self.waiting.popitem()[1]
            tasks.append({'key': task['_id'], 'cmd': cmd,
                          'info': task, 'ext': ext})
            self.doing[task['_id']] = (cmd, task)
        return tasks

    def handle_cancel(self, key, cmd):
        if key in self.doing:
            cmd, task = self.doing.pop(key)
            with self.domains.get2do(task['domain']) as domain:
                domain.cancel(cmd, task)

    def handle(self, key, cmd, res):
        if key in self.doing:
            cmd, task = self.doing.pop(key)
            with self.domains.get2do(task['domain']) as domain:
                domain.handle(cmd, task, res)
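# The two register() calls in Domains.__init__ wire it into the master's
# fetch and handle adapters. The contract assumed from the callback
# signatures: the adapter polls *_get(count) for task dicts, reports
# results through fetch(key, cmd, res) / handle(key, cmd, res), and gives
# tasks back via *_cancel(key, cmd) on timeout. A toy single-threaded
# pass over that loop, with `download` standing in for the real fetcher
# (hypothetical):
def drive_once(domains, download, count=5):
    for t in domains.fetch_get(count):
        try:
            res = download(t['info']['url'])   # e.g. {'html': ..., 'path': ...}
        except Exception, e:
            res = {'exc': str(e)}
        domains.fetch(t['key'], t['cmd'], res)
    for t in domains.handle_get(count):
        # a parse result: article URLs found plus the next list page
        domains.handle(t['key'], t['cmd'], {'urls': [], 'next': ''})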
class Catecorys(object):
    """Per-domain category queue: schedules list-page polls for the spider."""

    def __init__(self, domain):
        self.domain = domain
        self.domains = domain.domains
        self.mongo = self.domains.catecory
        self.log = domain.log
        # ordered by the next scheduled poll time
        self.waiting = PQDict(
            key=lambda x: x.value['_id'],
            score=lambda x: x.value['next'],
        )
        self.doing = dict()
        self.updates = set()
        self.next = 0
        self.null = False
        self.wait = 0

    def __nonzero__(self):
        return len(self.waiting) > 0

    def __len__(self):
        return len(self.waiting)

    def put(self, cate, update=False):
        self.waiting.put(cate)
        if cate['next'] < self.wait:
            self.wait = cate['next']
        if update:
            self.updates.add(cate['_id'])

    def save(self, cate, update=False):
        if update or cate['_id'] in self.updates:
            if cate['_id'] in self.updates:
                self.updates.remove(cate['_id'])
            self.mongo.save(cate)
            return 1
        return 0

    def sync(self, exit):
        if exit:
            self.back_on_exit()
        else:
            if time.time() >= self.next or self.next == 0:
                next = time.time() + 3600
                self.back(next)
                self.load(next)
                self.next = next

    def load(self, next):
        doc = {
            'domain': self.domain.id(),
            'status': 'common',
            '$or': [
                {'next': {'$gte': self.next, '$lt': next}},
                {'last': 0},
            ],
        }
        cates = self.mongo.find(doc)
        for cate in cates:
            if cate['_id'] not in self.waiting \
                    and cate['_id'] not in self.doing:
                self.put(cate)
        if cates.count() > 0:
            self.log.info('load %d/%d cates from %s.'
                          % (cates.count(), len(self.waiting),
                             self.domain.id()))

    def back(self, next):
        # write back cates scheduled beyond the sync horizon, keep the rest
        num, others = 0, []
        for cate in self.waiting.itervalues():
            if cate['next'] >= next:
                num += self.save(cate)
            else:
                others.append(cate)
        self.waiting.extend(others)
        if num > 0:
            self.log.info('back %d/%d cate from %s.'
                          % (num, len(others), self.domain.id()))

    def back_on_exit(self):
        num, count = 0, len(self.waiting)
        for cate in self.waiting.itervalues():
            num += self.save(cate)
        self.log.info('back %d/%d cates from %s on exit.'
                      % (num, count, self.domain.id()))

    def get(self, now):
        if self.wait <= now and self.waiting:
            cate = self.waiting.get()
            self.wait = cate['next']
            if self.wait <= now:
                self.doing[cate['_id']] = cate
                if self.waiting:
                    # peek at the new head to know when to wake up next
                    with self.waiting.get2do() as tmp:
                        self.wait = tmp['next']
                else:
                    self.wait = 2000000000
                if not cate['page']:
                    return {'_id': cate['_id'], 'domain': cate['domain'],
                            'url': cate['url']}
                else:
                    return {'_id': cate['_id'], 'domain': cate['domain'],
                            'url': cate['page']}
            self.waiting.put(cate)

    def cancel(self, page):
        if page['_id'] not in self.doing:
            return
        cate = self.doing.pop(page['_id'])
        self.waiting.put(cate)
        if cate['next'] < self.wait:
            self.wait = cate['next']

    def fetch(self, page, res):
        if page['_id'] not in self.doing:
            return
        cate = self.doing[page['_id']]
        cate['fetch'] += 1
        cate['last'] = time.time()
        self.updates.add(cate['_id'])
        if 'exc' not in res:
            self.domains.new_handle(
                cate['_id'], ('cate', page, {'html': res['html']}))
            self.log.debug('fetch cate: %s.' % cate['url'])
        else:
            log = {'last': time.time(), 'url': page['url'],
                   'exc': res['exc'], 'arts': 0}
            self.make_log(cate, log)
            cate['next'] = self.get_next(cate)
            cate['error'] += 1
            cate = self.doing.pop(page['_id'])
            self.put(cate, update=True)
            self.log.warn('fetch cate except(%s): %s.'
                          % (res['exc'], page['url']))

    def handle(self, page, res):
        if page['_id'] not in self.doing:
            return
        log = {'last': time.time(), 'url': page['url'], 'exc': '', 'arts': 0}
        cate = self.doing.pop(page['_id'])
        if 'exc' not in res:
            count = self.domain.new_arts(res['urls'], 'cate', cate,
                                         last=cate['next'] - 900)
            if count == 0:
                cate['null'] += 1
            else:
                cate['arts'] += count
                log['arts'] = count
            if not cate['all'] or count > 0:
                if res['next']:
                    cate['page'] = res['next']
                else:
                    cate['page'] = ''
                    cate['all'] = True
            else:
                cate['page'] = ''
            self.log.debug('parse %d arts from cate: %s.'
                           % (count, page['url']))
        else:
            log['exc'] = res['exc']
            cate['error'] += 1
            if cate['error'] >= 20:
                cate['all'] = True
            self.log.warn('parse except %s from cate: %s.'
                          % (res['exc'], page['url']))
        self.make_log(cate, log)
        cate['next'] = self.get_next(cate)
        cate['last'] = time.time()
        self.put(cate, update=True)

    def make_log(self, cate, log):
        # 10-slot ring buffer of recent fetch results
        cate['index'] += 1
        if cate['index'] >= 10:
            cate['index'] = 0
        if len(cate['log']) < 10:
            cate['log'].append(log)
        else:
            cate['log'][cate['index']] = log

    def get_next(self, cate):
        if not cate['all']:
            return time.time() + 60
        if cate['log']:
            arts = sum([x['arts'] for x in cate['log']])
            error = sum([1 if x['exc'] != '' else 0 for x in cate['log']])
            null = sum([1 if x['arts'] == 0 and x['exc'] == '' else 0
                        for x in cate['log']])
            start = min(cate['log'], key=lambda x: x['last'])['last']
            end = max(cate['log'], key=lambda x: x['last'])['last']
            if error == 10:
                return time.time() + 300
            return time.time() + 60 + (end - start) / float(arts + 5) \
                + null * 10 + error * 60
        return time.time() + 120
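# get_next in numbers: once a cate has a full sweep behind it
# (cate['all']) and a 10-entry log spanning end-start seconds, the next
# poll lands 60 + (end-start)/(arts+5) + null*10 + error*60 seconds out.
# E.g. ten polls spread over an hour that found 15 articles, with 2 empty
# polls and 1 failure:
interval = 60 + 3600 / float(15 + 5) + 2 * 10 + 1 * 60
assert int(interval) == 320    # next fetch in ~5.3 minutes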
class Domain(object):
    """Per-site crawl state for the image crawler: category and album
    queues plus the site's extraction regexes."""

    def __init__(self, master, domain):
        super(Domain, self).__init__()
        self.master = master
        self.domain = domain
        self.re_cate = re.compile(self.domain['re_cate'])
        self.re_page = re.compile(self.domain['re_page'])
        self.re_album = re.compile(self.domain['re_album'])
        self.re_image = re.compile(self.domain['re_image'])
        self.re_title = re.compile(self.domain['re_title'])
        self.re_type = re.compile(self.domain['re_type'])
        self.cates = PQDict(key=lambda x: x.value['_id'], score=time.time)
        self.albums = PQDict(key=lambda x: x.value['_id'], score=album_score)
        self.doing = {}
        self.next = 0

    @log
    def load(self):
        doc = {'domain': self.domain['_id'], 'status': 'wait'}
        cates = self.master.catecory.find(doc)
        for cate in cates:
            self.add_cate(cate)
        albums = self.master.album.find(doc)
        for album in albums:
            album['pages'] = json.loads(album['pages'])
            album['imgs'] = json.loads(album['imgs'])
            self.add_album(album)

    @log
    def add_cate(self, cate, cates=None):
        _id = cate['_id']
        if cate['status'] == 'wait' \
                and _id not in self.cates \
                and _id not in self.doing:
            self.cates.put(cate)
        elif cate['status'] == 'invalid' and cates:
            tmp = None
            if _id in self.cates:
                tmp = self.cates.pop(_id)
            elif _id in self.doing:
                tmp = self.doing.pop(_id)
            if tmp and tmp['last'] > self.master.last:
                tmp['status'] = 'invalid'
                cates[_id] = tmp

    @log
    def add_album(self, album, albums=None):
        _id = album['_id']
        if album['status'] == 'wait' and _id not in self.albums:
            self.albums[_id] = album
        elif album['status'] == 'invalid' and _id in self.albums and albums:
            tmp = self.albums.pop(_id)
            if tmp and tmp['last'] > self.master.last:
                tmp['status'] = 'invalid'
                albums[_id] = tmp

    @log
    def back_domain(self, start, end):
        domains = {}
        if start < self.domain['last'] <= end:
            domains[self.domain['_id']] = self.domain
        return domains

    @log
    def back_cate(self, start, end):
        cates = {}
        tmp = []
        for cate in self.cates.itervalues():
            if start < cate['last'] <= end:
                cates[cate['_id']] = cate
            tmp.append(cate)
        self.cates.extend(tmp)
        if cates:
            print('back %d cates from domain(%s).'
                  % (len(cates), self.domain['_id']))
        for cate in self.doing.itervalues():
            if start < cate['last'] <= end and cate['_id'] in self.cates:
                cates[cate['_id']] = cate
        return cates

    @log
    def back_album(self, start, end):
        albums = {}
        tmp = []
        for album in self.albums.itervalues():
            if start < album['last'] <= end:
                albums[album['_id']] = album
            tmp.append(album)
        self.albums.extend(tmp)
        for album in self.doing.itervalues():
            if start < album['last'] <= end and album['_id'] in self.albums:
                albums[album['_id']] = album
        return albums

    @log
    def get(self):
        if not self.cates and not self.albums:
            return None
        while True:
            self.next += 1
            # every other tick, prefer a category while few albums queue up
            if self.next % 2 == 0 and len(self.albums) < 10:
                if self.cates:
                    cate = self.cates.get()
                    self.doing[cate['_id']] = cate
                    return 'cate', cate
            if self.albums:
                album = self.albums.popitem()[1]
                for page, status in album['pages'].iteritems():
                    if status == 'wait':
                        album['pages'][page] = 'doing'
                        self.doing[album['_id']] = album
                        self.albums.put(album)
                        return 'page', {'_id': page,
                                        'domain': album['domain'],
                                        'album': album['_id']}
            if not self.cates and not self.albums:
                return None    # nothing runnable left; avoid spinning

    @log
    def finish_cate(self, task):
        if task['_id'] in self.doing:
            task['status'] = 'done'
            self.master.catecory.save(task)
            del self.doing[task['_id']]
        if task['_id'] in self.cates:
            del self.cates[task['_id']]

    @log
    def finish_album(self, task):
        if task['album'] in self.albums:
            album = self.albums[task['album']]
            pages = album['pages']
            imgs = album['imgs']
            pages[task['_id']] = 'done'
            for value in pages.itervalues():
                if value in ['wait', 'doing']:
                    return
            album = album.copy()
            album['pages'] = json.dumps(pages)
            album['imgs'] = json.dumps(imgs)
            album['status'] = 'done'
            self.master.album.save(album)
            if task['album'] in self.albums:
                del self.albums[task['album']]

    @log
    def parse_cate(self, task, result):
        html = result['html']
        cates = self.match(self.re_cate, html)
        albums = self.match(self.re_album, html)
        _type = self.search(self.re_type, html)
        for cate in cates:
            if cate not in self.cates \
                    and not self.master.catecory.find({'_id': cate}):
                # query the database so known categories are not re-queued
                self.cates[cate] = {
                    '_id': cate,
                    '_type': _type,
                    'domain': task['domain'],
                    'state': 0,
                    'status': 'wait',
                    'last': time.time(),
                }
        for album in albums:
            if album not in self.albums:
                print album
                print '*' * 80
                self.albums[album] = {
                    '_id': album,
                    'pages': {album: 'wait'},
                    'imgs': {},
                    'domain': task['domain'],
                    'cate': task['_id'],
                    'status': 'wait',
                    'state': 'valid',
                    'title': '',
                    'last': time.time(),
                }
        self.finish_cate(task)

    @log
    def parse_album(self, task, result):
        album_imgs = self.albums[task['album']]['imgs']
        album_pages = self.albums[task['album']]['pages']
        html = result['html']
        imgs = self.match(self.re_image, html)
        pages = self.match(self.re_page, html)
        title = self.albums[task['album']]['title']
        if not len(title):
            self.albums[task['album']]['title'] = \
                self.search(self.re_title, html)
        if imgs:
            print imgs
            for img in imgs:
                if img not in album_imgs:
                    album_imgs[img] = 'wait', ''
                    content = get(img, allow_types='*/*', resp=True).content
                    path = self.master.file.put(task['_id'], content, 'jpg')
                    album_imgs[img] = 'done', path
        else:
            print 'imgs is None', imgs, task['_id']
        for page in pages:
            if page not in album_pages:
                album_pages[page] = 'wait'
        self.finish_album(task)

    @log
    def match(self, regx, html):
        # findall yields strings or tuples depending on group count;
        # keep group 1 when tuples come back
        return [x if type(x) in (str, unicode) else x[0]
                for x in regx.findall(html)]

    @log
    def search(self, regx, html, default=''):
        m = regx.search(html)
        return m.group(1) if m else default


"""
Sample domain record, as stored in Mongo and consumed by Domain.__init__:
{
    '_id': 'http://www.youzi4.com/',
    're_cate': 'href="(http:\/\/www\.youzi4\.com\/[^"]*?\/(list_.*?\.html)?)"',
    're_album': 'href="(http:\/\/www\.youzi4\.com\/.*?\/\d+\.html)"',
    're_page': 'href="(http:\/\/www\.youzi4\.com\/.*?\/\d+_\d+\.html)"',
    're_title': 'alt="(.*?)-.*?"',
    're_type': '<li><a class="active".*?>(.*?)<\/a><\/li>',
    're_image': 'data-original="(http:\/\/img.d843.com\/uploads\/.*?\/\d+-.*?\.jpg)"',
    'last': 1410705419.978838,
    'status': 'valid',
}
"""
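# Sanity-checking the sample record above, independent of Mongo: Domain
# compiles these patterns in __init__, and match() keeps group 1 when
# findall returns tuples.
import re

_re_album = re.compile(r'href="(http:\/\/www\.youzi4\.com\/.*?\/\d+\.html)"')
_html = '<a href="http://www.youzi4.com/mm/123.html">album</a>'
assert _re_album.findall(_html) == ['http://www.youzi4.com/mm/123.html']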