Example #1
0
 def __init__(self, domain):
     """Bind the owning domain and start with empty scheduling state.

     domain: owning object; its ``domains`` and ``log`` attributes are
         kept on the instance.
     """
     # Handles borrowed from the owning domain.
     self.domain = domain
     self.domains = domain.domains
     self.mongo = self.domains.catecory
     self.log = domain.log

     # PQDict is handed callables that read '_id' as the key and 'next'
     # as the score of an entry's value.
     self.waiting = PQDict(key=lambda entry: entry.value['_id'],
                           score=lambda entry: entry.value['next'])
     self.doing = {}
     self.updates = set()

     # Plain counters / flags, all starting out "unset".
     self.next = 0
     self.wait = 0
     self.null = False
Example #2
0
	def __init__(self, master, domain):
		"""Keep the master/domain handles, compile the domain's patterns
		and create empty work queues.

		master: stored unchanged on ``self.master``.
		domain: mapping that provides the ``re_*`` pattern strings.
		"""
		super(Domain, self).__init__()
		self.master = master
		self.domain = domain

		# One compiled regex attribute per configured pattern, named after
		# the key it is read from (re_cate, re_page, ...).
		for suffix in ('cate', 'page', 'album', 'image', 'title', 'type'):
			pattern_key = 're_' + suffix
			setattr(self, pattern_key, re.compile(self.domain[pattern_key]))

		def by_id(entry):
			return entry.value['_id']

		self.cates = PQDict(key=by_id, score=time.time)
		self.albums = PQDict(key=by_id, score=album_score)
		self.doing = dict()
		self.next = 0
Example #3
0
 def __init__(self, master, domain):
     """Keep the master/domain handles, compile the domain's patterns
     and create empty work queues.

     master: stored unchanged on ``self.master``.
     domain: mapping that provides the ``re_*`` pattern strings.
     """
     super(Domain, self).__init__()
     self.master = master
     self.domain = domain

     # One compiled regex attribute per configured pattern, named after
     # the key it is read from (re_cate, re_page, ...).
     for suffix in ('cate', 'page', 'album', 'image', 'title', 'type'):
         pattern_key = 're_' + suffix
         setattr(self, pattern_key, re.compile(self.domain[pattern_key]))

     def by_id(entry):
         return entry.value['_id']

     self.cates = PQDict(key=by_id, score=time.time)
     self.albums = PQDict(key=by_id, score=album_score)
     self.doing = dict()
     self.next = 0
Example #4
0
    def __init__(self, count, conf):
        """Open the Mongo collections and set up scheduling state.

        count: forwarded to the base-class constructor; also used to
            derive ``doing_round``.
        conf: passed to ``MongoImage`` together with the name 'img'.
        """
        super(Master, self).__init__(count)

        db = MongoImage(conf, 'img')
        self.domain = db.domain
        self.catecory = db.catecory
        self.album = db.album
        self.file = db.file
        self.sync_round = 90
        # Floor division: the value must stay an int even if true division
        # is enabled for the module.
        self.doing_round = (count + 1) // 2
        self.domains = PQDict(key=lambda x: x.value.domain['_id'])

        self.last = 0

        # NOTE(review): `o` and `c` are not defined in this method; unless
        # they exist at module scope these two calls raise NameError.
        self.domain.save(o)
        self.catecory.save(c)
Example #5
0
    def __init__(self, domain):
        """Bind the owning domain and create an empty article queue.

        domain: owning object; ``domain.domains`` and ``domain.log`` are
            read here.
        """
        domains = domain.domains

        # Shared handles.
        self.domain = domain
        self.domains = domains
        self.articles = domains.master.articles
        self.article = domains.article
        self.exc_article = domains.exc_article
        self.log = domain.log

        # PQDict is handed callables that read '_id' as the key and the
        # negated 'pubtime' as the score of an entry's value.
        self.queue = PQDict(key=lambda entry: entry.value['_id'],
                            score=lambda entry: -entry.value['pubtime'])
        self.updates = set()

        # Size thresholds.
        self.full, self.common, self.limit = 2000, 1000, 100

        # Bookkeeping flags and timestamps.
        self.null = False
        self.next = 0
        self.last = 0

        self.xcount = 0
        self.xlast = 0
Example #6
0
	def __init__(self, master):
		"""Collect the handles offered by ``master`` and register this
		object's callbacks with its fetch and handle adapters.

		master: owning object; ``log``, ``mongo``, ``conf``,
			``fetch_adapter`` and ``handle_adapter`` are read from it.
		"""
		mongo = master.mongo

		self.master = master
		self.log = master.log

		# Mongo collections.
		self.domain = mongo.domain
		self.catecory = mongo.catecory
		self.template = mongo.template
		self.article = mongo.spider_article
		self.exc_article = mongo.spider_exc
		self.html_file = mongo.html_file

		self.url_redis = redis.Redis(**master.conf.redis_url)

		# Work-tracking containers.
		self.domains = PQDict(key=lambda entry: entry.value.domain['_id'])
		self.fetching = {}
		self.waiting = OrderedDict()
		self.doing = {}

		self.sync_round = 1800
		self.last = -1

		fetch_cmds = ['cate','art','page','img']
		master.fetch_adapter.register(
			fetch_cmds, self.fetch_get, self.fetch, self.fetch_cancel)
		handle_cmds = ['cate']
		master.handle_adapter.register(
			handle_cmds, self.handle_get, self.handle, self.handle_cancel)
Example #7
0
	def __init__(self, count, conf):
		"""Open the Mongo collections and set up scheduling state.

		count: forwarded to the base-class constructor; also used to
			derive ``doing_round``.
		conf: passed to ``MongoImage`` together with the name 'img'.
		"""
		super(Master, self).__init__(count)

		db = MongoImage(conf, 'img')
		self.domain = db.domain
		self.catecory = db.catecory
		self.album = db.album
		self.file = db.file
		self.sync_round = 90
		# Floor division: the value must stay an int even if true division
		# is enabled for the module.
		self.doing_round = (count + 1) // 2
		self.domains = PQDict(key=lambda x: x.value.domain['_id'])

		self.last = 0

		# NOTE(review): `o` and `c` are not defined in this method; unless
		# they exist at module scope these two calls raise NameError.
		self.domain.save(o)
		self.catecory.save(c)
Example #8
0
	def __init__(self, domain):
		"""Bind the owning domain and create an empty article queue.

		domain: owning object; ``domain.domains`` and ``domain.log`` are
			read here.
		"""
		domains = domain.domains

		# Shared handles.
		self.domain = domain
		self.domains = domains
		self.articles = domains.master.articles
		self.article = domains.article
		self.exc_article = domains.exc_article
		self.log = domain.log

		# PQDict is handed callables that read '_id' as the key and the
		# negated 'pubtime' as the score of an entry's value.
		self.queue = PQDict(key=lambda entry: entry.value['_id'],
			score=lambda entry: -entry.value['pubtime'])
		self.updates = set()

		# Size thresholds.
		self.full, self.common, self.limit = 2000, 1000, 100

		# Bookkeeping flags and timestamps.
		self.null = False
		self.next = 0
		self.last = 0

		self.xcount = 0
		self.xlast = 0
Example #9
0
class Articles(object):
	"""Per-domain queue of article dicts that still have to be fetched.

	Articles are plain dicts identified by ``'_id'``.  ``self.queue`` is a
	``PQDict`` keyed by that id and scored by the negated ``'pubtime'``.
	``self.updates`` holds the ids whose records ``save`` persists even
	when it is not called with ``update=True``.
	"""

	def __init__(self, domain):
		"""Bind the owning domain and create an empty queue.

		domain: owning object; ``domain.domains`` and ``domain.log`` are
			read here.
		"""
		self.domain = domain
		self.domains = domain.domains
		self.articles = domain.domains.master.articles
		self.article = domain.domains.article
		self.exc_article = domain.domains.exc_article
		self.log = domain.log
		self.queue = PQDict(
			key=lambda x: x.value['_id'],
			score=lambda x: -x.value['pubtime'],
		)
		self.updates = set()
		# Queue-size thresholds: sync() calls back() at 2 * full entries
		# and load() at or below limit; common caps what load() reads.
		self.full = 2000
		self.common = 1000
		self.limit = 100
		# True when the most recent load() queued nothing.
		self.null = False
		# Largest 'pubtime' queued so far, or the 'pubtime' of the entry
		# most recently taken by get(now).
		self.next = 0
		# Time of the last load() triggered from sync().
		self.last = 0

		# Not read by any active code in this class.
		self.xcount = 0
		self.xlast = 0

	def __contains__(self, key):
		"""Return True if ``key`` is queued here or currently being fetched."""
		return key in self.queue or self.domains.is_fetching(key)

	def __nonzero__(self):
		"""Truth value (Python 2 protocol): True while the queue is non-empty."""
		return len(self.queue) > 0

	def __len__(self):
		"""Return the number of queued articles."""
		return len(self.queue)

	def put(self, article, update=False):
		"""Queue an existing article record.

		article: record dict with at least ``'_id'`` and ``'pubtime'``.
		update: when true, mark the record so a later ``save`` persists it.

		Returns 1 if the record was queued, 0 if its id is already known.
		"""
		if article['_id'] in self:
			return 0

		if update:
			self.updates.add(article['_id'])

		if self.next < article['pubtime']:
			self.next = article['pubtime']

		self.queue.put(article)

		return 1

	def save(self, article, update=False):
		"""Persist ``article`` if forced or if it is marked in ``updates``.

		Records carrying an ``'exc'`` key go to the exception collection,
		all others to the regular article collection.

		Returns 1 if the record was written, 0 otherwise.
		"""
		key = article['_id']
		if not update and key not in self.updates:
			return 0

		# The pending-update mark is consumed by the write.
		self.updates.discard(key)
		if 'exc' in article:
			self.exc_article.save(article)
		else:
			self.article.save(article)
		return 1

	def new(self, url, src_type, src, task, last=0):
		"""Create and queue a record for a newly discovered URL.

		url: article URL; its MD5 hex digest becomes the record ``'_id'``.
		src_type: ``'cate'`` reads the source link/name from ``task``; any
			other value uses the domain's own link/name.
		src: stored unchanged under ``'src'``.
		task: source task; ``'url'``/``'name'`` are read for ``'cate'``,
			``'pubtime'`` otherwise.
		last: initial ``'pubtime'`` for ``'cate'`` sources when > 0.

		Returns 1 when queued; 0 when the URL's template is not among the
		domain's templates, the id is already queued or being fetched, or
		``add_url`` returns 0.
		"""
		key = hashlib.md5(url.encode('utf-8')).hexdigest()
		xlong = unicode2hash(url)
		tpl = url2tpl(url)
		if tpl not in self.domain.tpls \
				or key in self \
				or self.domains.add_url(xlong, self.domain.id()) == 0:
			return 0

		article = {
			'_id': key,
			'id': '',
			'long': xlong,
			'url': url,
			'domain': self.domain.id(),
			'tpl': tpl,
			'src_type': src_type,
			'src': src,
			'html': '',
			'title': '',
			'pages': {},
			'imgs': {},
			'icons': {},
			'tags': [],
			'sim': False,
			'f': False,
			'version': 0,
			'v': self.articles.new_version(),
			'created': time.time(),
			'last': time.time(),
		}

		if src_type == 'cate':
			article['src_link'] = task['url']
			article['src_name'] = task['name']
			if last > 0:
				article['pubtime'] = last
			else:
				# No hint available: assume 60 days old.
				article['pubtime'] = time.time() - 86400 * 60
		else:
			article['src_link'] = self.domain.domain['link']
			article['src_name'] = self.domain.domain['name']
			# 15 days before the source task's own pubtime.
			article['pubtime'] = task['pubtime'] - 86400 * 15

		# A date embedded in the URL takes precedence over the guess above.
		article['pubtime'] = self.get_pubtime(article)

		if self.next < article['pubtime']:
			self.next = article['pubtime']

		self.updates.add(article['_id'])
		self.queue.put(article)
		return 1

	def sync(self, exit):
		"""Balance the in-memory queue against the database.

		exit: when true, only write queued records back.  Otherwise write
			back when the queue holds at least ``2 * full`` entries, or
			load more when it holds at most ``limit`` entries and either
			the last load found something or ``domains.sync_round``
			seconds have passed since ``self.last``.
		"""
		if exit:
			self.back_on_exit()
		else:
			if len(self.queue) >= self.full * 2:
				self.back()
			elif len(self.queue) <= self.limit and (not self.null
					or time.time() >= self.last + self.domains.sync_round):
				self.load()
				self.last = time.time()

	def load(self):
		"""Queue up to ``common`` unfetched records of this domain, newest
		``'pubtime'`` first, and remember in ``null`` whether none was new."""
		doc = {
			'domain': self.domain.id(),
			'f': False,
		}
		articles = self.article.find(doc)
		articles.sort('pubtime', pymongo.DESCENDING).limit(self.common)
		num = 0
		for article in articles:
			num += self.put(article)

		self.null = num == 0
		self.log.info('load %d/%d articles from %s to fetch.'
			% (num, min(articles.count(), self.common), self.domain.id()))

	def back(self):
		"""Save the records returned by ``queue.tail`` for everything beyond
		``2 * common`` entries and clear the ``null`` flag."""
		count = len(self.queue)
		limit = count - self.common * 2
		for article in self.queue.tail(limit):
			self.save(article)
		self.null = False
		self.log.info('back %d/%d articles from %s.'
			% (limit, count, self.domain.id()))

	def back_on_exit(self):
		"""Save every queued record that has pending updates."""
		count, num = len(self.queue), 0
		for article in self.queue.itervalues():
			num += self.save(article)
		self.log.info('back %d/%d articles from %s on exit.'
			% (num, count, self.domain.id()))

	def get(self, now=0):
		"""Take the next article off the queue.

		now: when > 0, only hand out an article whose ``'pubtime'`` lies in
			``(today, now]``, where ``today`` is ``now`` rounded down to a
			multiple of 86400; otherwise the article is put back.  When 0,
			hand out whatever ``queue.get()`` returns.

		Returns the article dict, or None when nothing qualifies.
		"""
		if self.queue:
			if now > 0:
				today = now // 86400 * 86400
				if self.next == 0 or today < self.next <= now:
					article = self.queue.get()
					self.next = article['pubtime']
					if today < self.next <= now:
						return article
					self.queue.put(article)
			else:
				# NOTE: a rate limit based on xcount/xlast used to sit here;
				# it was commented out and has been dropped.
				return self.queue.get()

	def cancel(self, article):
		"""Put an article that was handed out back into the queue."""
		self.queue.put(article)

	def fetch(self, article, res):
		"""Record the outcome of fetching ``article``.

		res: result dict; an ``'exc'`` key marks failure, otherwise
			``'path'`` and ``'html'`` are read.
		"""
		article['last'] = time.time()
		if 'exc' not in res:
			article['html'] = res['path']
			article['f'] = True
			self.save(article, update=True)

			ext = {
				'html': res['html'],
				'selector': self.domain.tpls.selector(article['tpl']),
			}
			# Forward only while the shared store is below 30000 entries,
			# unless the article is less than a day old.
			if self.articles._len < 30000 or time.time() - article['pubtime'] < 86400:
				self.articles.put(article, ext=ext)
			self.log.debug('fetch article %s.' % article['url'])
		else:
			article['exc'] = res['exc']
			self.save(article, update=True)
			self.log.warn('fetch article except(%s): %s.'
				% (res['exc'], article['url']))

	def get_pubtime(self, article):
		"""Return the best available publication time for ``article``.

		A date matched by ``re_date`` in the URL wins (noon of that day,
		capped at the current time; a missing day becomes the 1st).
		Otherwise the record's own truthy ``'pubtime'`` is used, and
		finally "60 days ago".
		"""
		now = time.time()
		match = re_date.search(article['url'])
		if match:
			year, month, day = match.group(1), match.group(2), match.group(3)

			if not day:
				day = '01'

			try:
				return min(now, time.mktime(time.strptime(
					'%s-%s-%s' % (year, month, day), "%Y-%m-%d")) + 43200)
			except (ValueError, OverflowError):
				# Not a real calendar date (or out of range for mktime):
				# fall through to the other sources.  Only these errors
				# are swallowed so that interrupts still propagate.
				pass

		if article['pubtime']:
			return article['pubtime']

		return now - (86400 * 60)
Example #10
0
class Articles(object):
    """Per-domain queue of article dicts that still have to be fetched.

    Articles are plain dicts identified by ``'_id'``.  ``self.queue`` is a
    ``PQDict`` keyed by that id and scored by the negated ``'pubtime'``.
    ``self.updates`` holds the ids whose records ``save`` persists even
    when it is not called with ``update=True``.
    """

    def __init__(self, domain):
        """Bind the owning domain and create an empty queue.

        domain: owning object; ``domain.domains`` and ``domain.log`` are
            read here.
        """
        self.domain = domain
        self.domains = domain.domains
        self.articles = domain.domains.master.articles
        self.article = domain.domains.article
        self.exc_article = domain.domains.exc_article
        self.log = domain.log
        self.queue = PQDict(
            key=lambda x: x.value['_id'],
            score=lambda x: -x.value['pubtime'],
        )
        self.updates = set()
        # Queue-size thresholds: sync() calls back() at 2 * full entries
        # and load() at or below limit; common caps what load() reads.
        self.full = 2000
        self.common = 1000
        self.limit = 100
        # True when the most recent load() queued nothing.
        self.null = False
        # Largest 'pubtime' queued so far, or the 'pubtime' of the entry
        # most recently taken by get(now).
        self.next = 0
        # Time of the last load() triggered from sync().
        self.last = 0

        # Not read by any active code in this class.
        self.xcount = 0
        self.xlast = 0

    def __contains__(self, key):
        """Return True if ``key`` is queued here or currently being fetched."""
        return key in self.queue or self.domains.is_fetching(key)

    def __nonzero__(self):
        """Truth value (Python 2 protocol): True while the queue is non-empty."""
        return len(self.queue) > 0

    def __len__(self):
        """Return the number of queued articles."""
        return len(self.queue)

    def put(self, article, update=False):
        """Queue an existing article record.

        article: record dict with at least ``'_id'`` and ``'pubtime'``.
        update: when true, mark the record so a later ``save`` persists it.

        Returns 1 if the record was queued, 0 if its id is already known.
        """
        if article['_id'] in self:
            return 0

        if update:
            self.updates.add(article['_id'])

        if self.next < article['pubtime']:
            self.next = article['pubtime']

        self.queue.put(article)

        return 1

    def save(self, article, update=False):
        """Persist ``article`` if forced or if it is marked in ``updates``.

        Records carrying an ``'exc'`` key go to the exception collection,
        all others to the regular article collection.

        Returns 1 if the record was written, 0 otherwise.
        """
        key = article['_id']
        if not update and key not in self.updates:
            return 0

        # The pending-update mark is consumed by the write.
        self.updates.discard(key)
        if 'exc' in article:
            self.exc_article.save(article)
        else:
            self.article.save(article)
        return 1

    def new(self, url, src_type, src, task, last=0):
        """Create and queue a record for a newly discovered URL.

        url: article URL; its MD5 hex digest becomes the record ``'_id'``.
        src_type: ``'cate'`` reads the source link/name from ``task``; any
            other value uses the domain's own link/name.
        src: stored unchanged under ``'src'``.
        task: source task; ``'url'``/``'name'`` are read for ``'cate'``,
            ``'pubtime'`` otherwise.
        last: initial ``'pubtime'`` for ``'cate'`` sources when > 0.

        Returns 1 when queued; 0 when the URL's template is not among the
        domain's templates, the id is already queued or being fetched, or
        ``add_url`` returns 0.
        """
        key = hashlib.md5(url.encode('utf-8')).hexdigest()
        xlong = unicode2hash(url)
        tpl = url2tpl(url)
        if tpl not in self.domain.tpls \
          or key in self \
          or self.domains.add_url(xlong, self.domain.id()) == 0:
            return 0

        article = {
            '_id': key,
            'id': '',
            'long': xlong,
            'url': url,
            'domain': self.domain.id(),
            'tpl': tpl,
            'src_type': src_type,
            'src': src,
            'html': '',
            'title': '',
            'pages': {},
            'imgs': {},
            'icons': {},
            'tags': [],
            'sim': False,
            'f': False,
            'version': 0,
            'v': self.articles.new_version(),
            'created': time.time(),
            'last': time.time(),
        }

        if src_type == 'cate':
            article['src_link'] = task['url']
            article['src_name'] = task['name']
            if last > 0:
                article['pubtime'] = last
            else:
                # No hint available: assume 60 days old.
                article['pubtime'] = time.time() - 86400 * 60
        else:
            article['src_link'] = self.domain.domain['link']
            article['src_name'] = self.domain.domain['name']
            # 15 days before the source task's own pubtime.
            article['pubtime'] = task['pubtime'] - 86400 * 15

        # A date embedded in the URL takes precedence over the guess above.
        article['pubtime'] = self.get_pubtime(article)

        if self.next < article['pubtime']:
            self.next = article['pubtime']

        self.updates.add(article['_id'])
        self.queue.put(article)
        return 1

    def sync(self, exit):
        """Balance the in-memory queue against the database.

        exit: when true, only write queued records back.  Otherwise write
            back when the queue holds at least ``2 * full`` entries, or
            load more when it holds at most ``limit`` entries and either
            the last load found something or ``domains.sync_round``
            seconds have passed since ``self.last``.
        """
        if exit:
            self.back_on_exit()
        else:
            if len(self.queue) >= self.full * 2:
                self.back()
            elif len(self.queue) <= self.limit and (
                    not self.null
                    or time.time() >= self.last + self.domains.sync_round):
                self.load()
                self.last = time.time()

    def load(self):
        """Queue up to ``common`` unfetched records of this domain, newest
        ``'pubtime'`` first, and remember in ``null`` whether none was new."""
        doc = {
            'domain': self.domain.id(),
            'f': False,
        }
        articles = self.article.find(doc)
        articles.sort('pubtime', pymongo.DESCENDING).limit(self.common)
        num = 0
        for article in articles:
            num += self.put(article)

        self.null = num == 0
        self.log.info(
            'load %d/%d articles from %s to fetch.' %
            (num, min(articles.count(), self.common), self.domain.id()))

    def back(self):
        """Save the records returned by ``queue.tail`` for everything beyond
        ``2 * common`` entries and clear the ``null`` flag."""
        count = len(self.queue)
        limit = count - self.common * 2
        for article in self.queue.tail(limit):
            self.save(article)
        self.null = False
        self.log.info('back %d/%d articles from %s.' %
                      (limit, count, self.domain.id()))

    def back_on_exit(self):
        """Save every queued record that has pending updates."""
        count, num = len(self.queue), 0
        for article in self.queue.itervalues():
            num += self.save(article)
        self.log.info('back %d/%d articles from %s on exit.' %
                      (num, count, self.domain.id()))

    def get(self, now=0):
        """Take the next article off the queue.

        now: when > 0, only hand out an article whose ``'pubtime'`` lies in
            ``(today, now]``, where ``today`` is ``now`` rounded down to a
            multiple of 86400; otherwise the article is put back.  When 0,
            hand out whatever ``queue.get()`` returns.

        Returns the article dict, or None when nothing qualifies.
        """
        if self.queue:
            if now > 0:
                today = now // 86400 * 86400
                if self.next == 0 or today < self.next <= now:
                    article = self.queue.get()
                    self.next = article['pubtime']
                    if today < self.next <= now:
                        return article
                    self.queue.put(article)
            else:
                # NOTE: a rate limit based on xcount/xlast used to sit here;
                # it was commented out and has been dropped.
                return self.queue.get()

    def cancel(self, article):
        """Put an article that was handed out back into the queue."""
        self.queue.put(article)

    def fetch(self, article, res):
        """Record the outcome of fetching ``article``.

        res: result dict; an ``'exc'`` key marks failure, otherwise
            ``'path'`` and ``'html'`` are read.
        """
        article['last'] = time.time()
        if 'exc' not in res:
            article['html'] = res['path']
            article['f'] = True
            self.save(article, update=True)

            ext = {
                'html': res['html'],
                'selector': self.domain.tpls.selector(article['tpl']),
            }
            # Forward only while the shared store is below 30000 entries,
            # unless the article is less than a day old.
            if self.articles._len < 30000 \
              or time.time() - article['pubtime'] < 86400:
                self.articles.put(article, ext=ext)
            self.log.debug('fetch article %s.' % article['url'])
        else:
            article['exc'] = res['exc']
            self.save(article, update=True)
            self.log.warn('fetch article except(%s): %s.' %
                          (res['exc'], article['url']))

    def get_pubtime(self, article):
        """Return the best available publication time for ``article``.

        A date matched by ``re_date`` in the URL wins (noon of that day,
        capped at the current time; a missing day becomes the 1st).
        Otherwise the record's own truthy ``'pubtime'`` is used, and
        finally "60 days ago".
        """
        now = time.time()
        match = re_date.search(article['url'])
        if match:
            year, month, day = match.group(1), match.group(2), match.group(3)

            if not day:
                day = '01'

            try:
                parsed = time.strptime('%s-%s-%s' % (year, month, day),
                                       "%Y-%m-%d")
                return min(now, time.mktime(parsed) + 43200)
            except (ValueError, OverflowError):
                # Not a real calendar date (or out of range for mktime):
                # fall through to the other sources.  Only these errors
                # are swallowed so that interrupts still propagate.
                pass

        if article['pubtime']:
            return article['pubtime']

        return now - (86400 * 60)
Example #11
0
class Master(GeventWorker):
    def __init__(self, count, conf):
        super(Master, self).__init__(count)

        db = MongoImage(conf, 'img')
        self.domain = db.domain
        self.catecory = db.catecory
        self.album = db.album
        self.file = db.file
        self.sync_round = 90
        self.doing_round = (count + 1) / 2
        self.domains = PQDict(key=lambda x: x.value.domain['_id'])

        self.last = 0

        self.domain.save(o)
        self.catecory.save(c)

    def run(self):
        try:
            self.sync(init=True)
            while not self.is_exit():
                print "I'am coming"
                self.doing()
                self.clean()
                self.sync()
                self.wait(0.1)
        except KeyboardInterrupt:
            print 'keyboardInterrupt'
            self.sync(exit=True)

    def sync(self, init=False, exit=False):
        if not init and not exit \
          and self.sync_round > time.time() - self.last:
            return

        last = time.time()

        domains, cates, albums = {}, {}, {}

        if not init:
            for domain in self.domains:
                domains.update(domain.back_domain(self.last, last))
                cates.update(domain.back_cate(self.last, last))
                albums.update(domain.back_album(self.last, last))

        if not exit:
            self.load_domains(init, self.last, last, domains)
            self.load_cates(init, self.last, last, cates)
            self.load_albums(init, self.last, last, albums)

        for domain in domains.itervalues():
            self.domain.save(domain)

        for cate in cates.itervalues():
            self.catecory.save(cate)

        for album in albums.itervalues():
            album = album.copy()
            album['pages'] = json.dumps(album['pages'])
            album['imgs'] = json.dumps(album['imgs'])
            self.album.save(album)

    @log
    def doing(self):
        for x in xrange(self.doing_round):
            if self.is_exit():
                return
            domain = self.domains.get()
            task = domain.get()
            self.domains.put(domain)
            if not task:
                break
            self.do(task)

    @log
    def handle(self, index, task):
        try:
            index, task = task[0], task[1]
            url = task['_id']
            html = get(u(url))
            html = clean_doc(html2doc(html, url=url), return_html=True)
            if index == 'cate':
                self.domains[task['domain']].parse_cate(task, {'html': html})
            elif index == 'album':
                pass
            elif index == 'page':
                self.domains[task['domain']].parse_album(task, {'html': html})
        except KeyboardInterrupt:
            self.exit()

    @log
    def load_domains(self, init, start, end, domains):
        doc = {
            'status': 'valid'
        } if init else {
            'last': {
                '$gt': start,
                '$lte': end
            }
        }
        for domain in self.domain.find(doc):
            self.add_domain(init, domain, domains)

    @log
    def load_cates(self, init, start, end, cates):
        doc = {
            'status': 'wait'
        } if init else {
            'last': {
                '$gt': start,
                '$lte': end
            }
        }
        for cate in self.catecory.find(doc):
            if cate['domain'] in self.domains:
                self.domains[cate['domain']].add_cate(cate, cates)

    @log
    def load_albums(self, init, start, end, albums):
        doc = {
            'status': 'wait'
        } if init else {
            'last': {
                '$gt': start,
                '$lte': end
            }
        }
        for album in self.album.find(doc):
            album['pages'] = json.loads(album['pages'])
            album['imgs'] = json.loads(album['imgs'])
            if album['domain'] in self.domains:
                self.domains[album['domain']].add_album(album, albums)

    @log
    def add_domain(self, init, domain, domains):
        _id = domain['_id']
        if domain['status'] == 'valid' and _id not in self.domains:
            domain = Domain(self, domain)
            if not init:
                domain.load()
            self.domains.put(domain)
        elif domain['status'] == 'invalid' and _id in self.domains:
            tmp = self.domains.pop(_id).domain
            if tmp['last'] < self.last:
                tmp['status'] = 'invalid'
                domains[_id] = tmp

    @log
    def on_exit(self):
        self.sync(exit=True)
Example #12
0
class Domains(object):
	"""Registry of the active domains plus the tasks currently in flight.

	``fetching`` and ``doing`` map a task ``'_id'`` to ``(cmd, task)``;
	``waiting`` maps it to ``(cmd, task, ext)``.
	"""

	def __init__(self, master):
		"""Collect the handles offered by ``master`` and register this
		object's callbacks with its fetch and handle adapters."""
		self.master = master
		self.log = master.log
		self.domain = master.mongo.domain
		self.catecory = master.mongo.catecory
		self.template = master.mongo.template
		self.article = master.mongo.spider_article
		self.exc_article = master.mongo.spider_exc
		self.html_file = master.mongo.html_file
		self.url_redis = redis.Redis(**master.conf.redis_url)
		self.domains = PQDict(key=lambda x: x.value.domain['_id'])
		self.fetching = dict()
		self.waiting = OrderedDict()
		self.doing = dict()
		# Minimum number of seconds between two sync_last() passes.
		self.sync_round = 1800
		# Timestamp of the previous sync_last() pass.
		self.last = -1

		master.fetch_adapter.register(['cate','art','page','img'], 
			self.fetch_get, self.fetch, self.fetch_cancel)
		master.handle_adapter.register(['cate'], 
			self.handle_get, self.handle, self.handle_cancel)

	@property
	def counter(self):
		"""Return a dict of in-flight counts, with fetches split by cmd."""
		res = {
			'fetch':len(self.fetching), 
			'domain_wait':len(self.waiting),
			'domain_doing':len(self.doing),
			'fetch_cate':0,
			'fetch_art':0,
			'fetch_img':0,
			'fetch_page':0,
		}
		for cmd, _ in self.fetching.itervalues():
			res['fetch_' + cmd] += 1
		return res

	def is_fetching(self, key):
		"""Return True if ``key`` is currently being fetched."""
		return key in self.fetching

	def is_doing(self, key):
		"""Return True if ``key`` is being fetched, waiting or handled."""
		return key in self.fetching \
			or key in self.waiting \
			or key in self.doing

	def sync(self, quit=False):
		"""Run both sync stages; ``quit`` marks the final pass."""
		self.sync_last(quit)
		self.sync_other(quit)

	def sync_last(self, quit):
		"""Exchange domain and template records with the database.

		Skipped (unless ``quit``) while fewer than ``sync_round`` seconds
		have passed since the previous pass.  On ``quit`` nothing is
		loaded, records are only written back.
		"""
		if not quit and self.sync_round > time.time() - self.last:
			return

		last = time.time()
		domains, tpls = {}, {}

		for domain in self.domains:
			domains.update(domain.back(self.last, last))
			tpls.update(domain.tpls.back(self.last, last))

		if not quit:
			self.load_domains(self.last, last)
			self.load_tpls(self.last, last)

		for domain in domains.itervalues():
			self.domain.save(domain)
		for tpl in tpls.itervalues():
			self.template.save(tpl)

		self.last = last

	def sync_other(self, quit):
		"""Let every domain sync itself, then re-heapify the domain queue."""
		for domain in self.domains:
			domain.sync(quit)
		self.domains.heapify()

	def load_domains(self, start, end):
		"""Load 'common' domain records changed in ``(start, end]``.

		Unknown ones are wrapped in ``Domain`` and queued (and loaded
		right away when ``start > 0``); known ones are updated in place.
		"""
		doc = {'status':'common', 'last':{'$gt':start, '$lte':end}}
		domains = self.domain.find(doc)
		for domain in domains:
			if domain['_id'] not in self.domains:
				xdomain = Domain(self, domain)
				self.domains.put(xdomain)
				if start > 0:
					xdomain.load()
			else:
				self.domains[domain['_id']].update(domain)
		if domains.count() > 0:
			self.log.info('load %d domains.' % domains.count())

	def load_tpls(self, start, end):
		"""Load 'common' template records changed in ``(start, end]`` into
		their domains."""
		doc = {'status':'common', 'last':{'$gt':start, '$lte':end}}
		tpls = self.template.find(doc)
		for tpl in tpls:
			with self.domains.get2do(tpl['domain']) as domain:
				domain.tpls.put(tpl)

		# count must be called: the bare method object was being compared
		# with 0, which logged on every pass, even with nothing loaded.
		if tpls.count() > 0:
			self.log.info('load %d tpls.' % tpls.count())

	def add_url(self, key, domain):
		"""Add ``key`` to the Redis set named ``domain``; returns the
		result of SADD."""
		return self.url_redis.sadd(domain, key)

	def has_url(self, key, domain):
		"""Return whether ``key`` is in the Redis set named ``domain``."""
		return self.url_redis.sismember(domain, key)

	def selector(self, domain, tpl):
		"""Return the selector of ``tpl`` in ``domain``, or {} when the
		domain is unknown."""
		if domain in self.domains:
			return self.domains[domain].tpls.selector(tpl)
		return {}

	def new_arts(self, article, urls):
		"""Offer ``urls`` found in ``article`` to its domain."""
		with self.domains.get2do(article['domain']) as domain:
			return domain.new_arts(urls, 'art', article)
		# Reached only if the with-block ends without returning --
		# presumably get2do suppresses the error for an unknown domain;
		# TODO confirm.
		return 0

	def fetch_page(self, page):
		"""Queue ``page`` with its domain."""
		with self.domains.get2do(page['domain']) as domain:
			return domain.pages.put(page)
		# See new_arts() for when this fallback is reached.
		return False

	def fetch_img(self, img):
		"""Queue ``img`` with its domain."""
		with self.domains.get2do(img['domain']) as domain:
			return domain.imgs.put(img)
		# See new_arts() for when this fallback is reached.
		return False

	def new_handle(self, key, value):
		"""Register ``value`` (a ``(cmd, task, ext)`` triple) as waiting
		to be handled under ``key``."""
		self.waiting[key] = value

	def quit(self):
		"""Cancel every in-flight task with its domain and clear all
		three tracking containers."""
		for cmd, task in self.fetching.itervalues():
			self.domains[task['domain']].cancel(cmd, task)
		self.fetching.clear()

		for cmd, task, _ in self.waiting.itervalues():
			self.domains[task['domain']].cancel(cmd, task)
		self.waiting.clear()

		for cmd, task in self.doing.itervalues():
			self.domains[task['domain']].cancel(cmd, task)
		self.doing.clear()

	def fetch_get(self, count):
		"""Collect up to ``count`` fetch tasks from the domains.

		Gives up after the fifth domain that returns no task.  Every
		task handed out is recorded in ``fetching``.
		"""
		tasks = []
		if not self.domains:
			return tasks

		null = 0
		for _ in xrange(count):
			cmd, task = None, None
			with self.domains.get2do() as domain:
				cmd, task = domain.get()

			if not task:
				null += 1
				if null >= 5:
					break
				self.domains.heapify()
				continue

			tasks.append({'key':task['_id'], 'cmd':cmd, 'info':task})
			self.fetching[task['_id']] = (cmd, task)

		return tasks

	def fetch_cancel(self, key, cmd):
		"""Hand a fetch task back to its domain; the stored cmd is used,
		not the argument."""
		if key in self.fetching:
			cmd, task = self.fetching.pop(key)
			with self.domains.get2do(task['domain']) as domain:
				domain.cancel(cmd, task)

	def fetch(self, key, cmd, res):
		"""Deliver fetch result ``res`` for ``key`` to its domain; the
		stored cmd is used, not the argument."""
		if key in self.fetching:
			cmd, task = self.fetching.pop(key)
			with self.domains.get2do(task['domain']) as domain:
				domain.fetch(cmd, task, res)

	def handle_get(self, count, **kwargs):
		"""Move up to ``count`` entries from ``waiting`` to ``doing`` and
		return them as task dicts.  Entries are taken with
		``OrderedDict.popitem()``, i.e. most recently added first."""
		tasks = []
		for _ in xrange(min(len(self.waiting), count)):
			cmd, task, ext = self.waiting.popitem()[1]
			tasks.append({'key':task['_id'], 'cmd':cmd, 'info':task, 'ext':ext})
			self.doing[task['_id']] = (cmd, task)
		return tasks

	def handle_cancel(self, key, cmd):
		"""Hand a handle task back to its domain; the stored cmd is used,
		not the argument."""
		if key in self.doing:
			cmd, task = self.doing.pop(key)
			with self.domains.get2do(task['domain']) as domain:
				domain.cancel(cmd, task)

	def handle(self, key, cmd, res):
		"""Deliver handle result ``res`` for ``key`` to its domain; the
		stored cmd is used, not the argument."""
		if key in self.doing:
			cmd, task = self.doing.pop(key)
			with self.domains.get2do(task['domain']) as domain:
				domain.handle(cmd, task, res)
Example #13
0
class Catecorys(object):
    """Schedules the category pages of one domain.

    Category records are dicts.  ``waiting`` is a ``PQDict`` keyed by
    ``'_id'`` and scored by ``'next'``; ``doing`` maps the ``'_id'`` of
    each category handed out by ``get`` to its record; ``updates`` holds
    the ids whose records ``save`` persists without being forced.
    """

    def __init__(self, domain):
        """Bind the owning domain and start with empty scheduling state."""
        self.domain = domain
        self.domains = domain.domains
        self.mongo = self.domains.catecory
        self.log = domain.log
        self.waiting = PQDict(
            key=lambda x: x.value['_id'],
            score=lambda x: x.value['next'],
        )
        self.doing = dict()
        self.updates = set()
        # End of the time window covered by the last sync() (0 = never).
        self.next = 0
        self.null = False
        # 'next' of the entry get() should look at first; get() does
        # nothing while this lies in the future.
        self.wait = 0

    def __nonzero__(self):
        """Truth value (Python 2 protocol): True while categories wait."""
        return len(self.waiting) > 0

    def __len__(self):
        """Return the number of waiting categories."""
        return len(self.waiting)

    def put(self, cate, update=False):
        """Queue ``cate``; with ``update`` also mark it for saving."""
        self.waiting.put(cate)
        if cate['next'] < self.wait:
            self.wait = cate['next']
        if update:
            self.updates.add(cate['_id'])

    def save(self, cate, update=False):
        """Persist ``cate`` if forced or if it is marked in ``updates``.

        Returns 1 if the record was written, 0 otherwise.
        """
        key = cate['_id']
        if not update and key not in self.updates:
            return 0

        # The pending-update mark is consumed by the write.
        self.updates.discard(key)
        self.mongo.save(cate)
        return 1

    def sync(self, exit):
        """Write back everything on ``exit``; otherwise, once the current
        window has elapsed, write back and load for the next hour."""
        if exit:
            self.back_on_exit()
        else:
            if time.time() >= self.next or self.next == 0:
                next = time.time() + 3600
                self.back(next)
                self.load(next)
                self.next = next

    def load(self, next):
        """Queue this domain's 'common' categories that are due in
        ``[self.next, next)`` or have never been fetched (``last == 0``),
        skipping those already waiting or handed out."""
        doc = {
            'domain': self.domain.id(),
            'status': 'common',
            '$or': [{
                'next': {
                    '$gte': self.next,
                    '$lt': next
                }
            }, {
                'last': 0
            }],
        }
        cates = self.mongo.find(doc)
        for cate in cates:
            if cate['_id'] not in self.waiting \
              and cate['_id'] not in self.doing:
                self.put(cate)

        if cates.count() > 0:
            self.log.info('load %d/%d cates from %s.' %
                          (cates.count(), len(self.waiting), self.domain.id()))

    def back(self, next):
        """Save waiting categories due at or after ``next`` and pass the
        remaining ones to ``waiting.extend``."""
        num, others = 0, []
        for cate in self.waiting.itervalues():
            if cate['next'] >= next:
                num += self.save(cate)
            else:
                others.append(cate)
        self.waiting.extend(others)
        if num > 0:
            self.log.info('back %d/%d cate from %s.' %
                          (num, len(others), self.domain.id()))

    def back_on_exit(self):
        """Save every waiting category that has pending updates."""
        num, count = 0, len(self.waiting)
        for cate in self.waiting.itervalues():
            num += self.save(cate)
        self.log.info('back %d/%d cates from %s on exit.' %
                      (num, count, self.domain.id()))

    def get(self, now):
        """Hand out the next category that is due at ``now``.

        Returns a task dict with ``'_id'``, ``'domain'`` and ``'url'``
        (the category's pending ``'page'`` when set, else its own URL),
        or None when nothing is due.
        """
        if self.wait <= now and self.waiting:
            cate = self.waiting.get()
            self.wait = cate['next']
            if self.wait <= now:
                self.doing[cate['_id']] = cate

                # Refresh the wake-up time from the new head of the queue.
                if self.waiting:
                    with self.waiting.get2do() as tmp:
                        self.wait = tmp['next']
                else:
                    self.wait = 2000000000

                if not cate['page']:
                    return {
                        '_id': cate['_id'],
                        'domain': cate['domain'],
                        'url': cate['url']
                    }
                else:
                    return {
                        '_id': cate['_id'],
                        'domain': cate['domain'],
                        'url': cate['page']
                    }
            # Not due yet after all: put it back.
            self.waiting.put(cate)

    def cancel(self, page):
        """Return a handed-out category to the waiting queue."""
        if page['_id'] not in self.doing:
            return
        # NOTE(review): the record stays in self.doing here, unlike the
        # failure path of fetch(), which pops it -- confirm this is wanted.
        cate = self.doing[page['_id']]
        self.waiting.put(cate)
        if cate['next'] < self.wait:
            self.wait = cate['next']

    def fetch(self, page, res):
        """Record the outcome of fetching a category page.

        On success the page is registered for handling via
        ``domains.new_handle``; on failure (``'exc'`` in ``res``) the
        attempt is logged and the category is rescheduled.
        """
        if page['_id'] not in self.doing:
            return

        cate = self.doing[page['_id']]
        cate['fetch'] += 1
        cate['last'] = time.time()
        self.updates.add(cate['_id'])
        if 'exc' not in res:
            self.domains.new_handle(cate['_id'], ('cate', page, {
                'html': res['html']
            }))
            self.log.debug('fetch cate: %s.' % cate['url'])
        else:
            log = {
                'last': time.time(),
                'url': page['url'],
                'exc': res['exc'],
                'arts': 0
            }
            self.make_log(cate, log)
            cate['next'] = self.get_next(cate)
            cate['error'] += 1
            cate = self.doing.pop(page['_id'])
            self.put(cate, update=True)
            self.log.warn('fetch cate except(%s): %s.' %
                          (res['exc'], page['url']))

    def handle(self, page, res):
        """Record the outcome of parsing a category page and reschedule.

        On success ``res['urls']`` is offered to the domain as new
        articles and ``res['next']`` decides which page is fetched next;
        on failure the error counter is raised.
        """
        if page['_id'] not in self.doing:
            return

        log = {'last': time.time(), 'url': page['url'], 'exc': '', 'arts': 0}
        cate = self.doing.pop(page['_id'])
        if 'exc' not in res:
            count = self.domain.new_arts(res['urls'],
                                         'cate',
                                         cate,
                                         last=cate['next'] - 900)
            if count == 0:
                cate['null'] += 1
            else:
                cate['arts'] += count
                log['arts'] = count

            # Keep paging until the listing has been walked once ('all');
            # afterwards only while pages still yield new articles.
            if not cate['all'] or count > 0:
                if res['next']:
                    cate['page'] = res['next']
                else:
                    cate['page'] = ''
                    cate['all'] = True
            else:
                cate['page'] = ''

            self.log.debug('parse %d arts from cate: %s.' %
                           (count, page['url']))
        else:
            log['exc'] = res['exc']
            cate['error'] += 1
            if cate['error'] >= 20:
                cate['all'] = True
            self.log.warn('parse except %s from cate: %s.' %
                          (res['exc'], page['url']))

        self.make_log(cate, log)
        cate['next'] = self.get_next(cate)
        cate['last'] = time.time()
        self.put(cate, update=True)

    def make_log(self, cate, log):
        """Store ``log`` in the category's 10-slot ring of recent entries.

        ``cate['index']`` is advanced modulo 10; the entry is appended
        while fewer than 10 exist, otherwise it replaces that slot.
        """
        cate['index'] += 1
        if cate['index'] >= 10:
            cate['index'] = 0

        if len(cate['log']) < 10:
            cate['log'].append(log)
        else:
            cate['log'][cate['index']] = log

    def get_next(self, cate):
        """Return the timestamp at which ``cate`` should be fetched next.

        Until ``cate['all']`` is set: 60 seconds from now.  Without any
        log entries: 120 seconds.  With ten failed entries: 300 seconds.
        Otherwise 60 seconds plus the time span of the log divided by
        (articles found + 5), plus 10 seconds per empty and 60 seconds
        per failed entry.
        """
        if not cate['all']:
            return time.time() + 60

        entries = cate['log']
        if not entries:
            return time.time() + 120

        arts = sum(x['arts'] for x in entries)
        error = sum(1 for x in entries if x['exc'] != '')
        # Entries that succeeded but produced no article.  A successful
        # entry carries exc == '' (see handle()), so '' is what must be
        # matched; comparing against 0 never matched and left the count
        # at zero.
        null = sum(1 for x in entries
                   if x['arts'] == 0 and x['exc'] == '')
        if error == 10:
            return time.time() + 300

        stamps = [x['last'] for x in entries]
        start, end = min(stamps), max(stamps)
        return time.time() + 60 + (end - start) / float(arts + 5) \
            + null * 10 + error * 60
Example #14
0
class Master(GeventWorker):
    """Worker that hands out crawl tasks per domain and syncs state to Mongo.

    The instance keeps the four collections exposed by ``MongoImage``
    (``domain``, ``catecory``, ``album``, ``file``) and a ``PQDict`` of
    ``Domain`` objects keyed by the ``_id`` of their domain document.
    """

    def __init__(self, count, conf):
        """Create the master.

        Args:
            count: forwarded to ``GeventWorker.__init__``; also determines how
                many tasks ``doing`` hands out per round.
            conf: configuration forwarded to ``MongoImage``.
        """
        super(Master, self).__init__(count)

        db = MongoImage(conf, 'img')
        self.domain = db.domain
        self.catecory = db.catecory
        self.album = db.album
        self.file = db.file
        # Minimum number of seconds between two periodic syncs.
        self.sync_round = 90
        # Floor division keeps this an int (it is passed to xrange).
        self.doing_round = (count + 1) // 2
        self.domains = PQDict(key=lambda x: x.value.domain['_id'])

        # Timestamp of the last completed sync; 0 means "never synced".
        self.last = 0
        # The former ``self.domain.save(o)`` / ``self.catecory.save(c)`` calls
        # were removed: ``o`` and ``c`` are not defined anywhere in this
        # class, so constructing a Master raised NameError.

    def run(self):
        """Main loop: initial sync, then schedule/clean/sync until exit.

        A ``KeyboardInterrupt`` triggers a final ``sync(exit=True)``.
        """
        try:
            self.sync(init=True)
            while not self.is_exit():
                print("I'am coming")
                self.doing()
                self.clean()
                self.sync()
                self.wait(0.1)
        except KeyboardInterrupt:
            print('keyboardInterrupt')
            self.sync(exit=True)

    def sync(self, init=False, exit=False):
        """Exchange state between the in-memory domains and the database.

        A periodic call (neither ``init`` nor ``exit``) is a no-op until
        ``sync_round`` seconds have elapsed since the previous sync.

        Args:
            init: when true, nothing is collected from the in-memory domains
                and the ``load_*`` helpers run in their initial mode.
            exit: when true, nothing is loaded from the database; in-memory
                changes are only written back.
        """
        if not init and not exit \
                and self.sync_round > time.time() - self.last:
            return

        last = time.time()

        domains, cates, albums = {}, {}, {}

        if not init:
            # Gather everything modified in the window (self.last, last].
            for domain in self.domains:
                domains.update(domain.back_domain(self.last, last))
                cates.update(domain.back_cate(self.last, last))
                albums.update(domain.back_album(self.last, last))

        if not exit:
            self.load_domains(init, self.last, last, domains)
            self.load_cates(init, self.last, last, cates)
            self.load_albums(init, self.last, last, albums)

        for domain in domains.itervalues():
            self.domain.save(domain)

        for cate in cates.itervalues():
            self.catecory.save(cate)

        for album in albums.itervalues():
            # Persist a copy so the live album keeps its dict-typed fields.
            album = album.copy()
            album['pages'] = json.dumps(album['pages'])
            album['imgs'] = json.dumps(album['imgs'])
            self.album.save(album)

        # Remember when this sync happened.  Without this assignment
        # ``self.last`` stayed at 0 forever, so the ``sync_round`` throttle
        # never applied and every sync re-scanned the whole history.
        self.last = last

    @log
    def doing(self):
        """Hand out up to ``doing_round`` tasks, one domain at a time.

        Stops early when the worker is exiting or when the domain taken from
        ``self.domains`` has no task to offer.
        """
        for x in xrange(self.doing_round):
            if self.is_exit():
                return
            domain = self.domains.get()
            task = domain.get()
            self.domains.put(domain)
            if not task:
                break
            self.do(task)

    @log
    def handle(self, index, task):
        """Fetch the page for ``task`` and dispatch it to its domain's parser.

        Args:
            index: ignored; it is immediately replaced by ``task[0]``.
            task: pair ``(kind, payload)`` where ``kind`` is ``'cate'``,
                ``'album'`` or ``'page'`` and ``payload`` carries at least
                ``'_id'`` (the URL) and ``'domain'``.
        """
        try:
            index, task = task[0], task[1]
            url = task['_id']
            html = get(u(url))
            html = clean_doc(html2doc(html, url=url), return_html=True)
            if index == 'cate':
                self.domains[task['domain']].parse_cate(task, {'html': html})
            elif index == 'album':
                # No handling for 'album' tasks.
                pass
            elif index == 'page':
                self.domains[task['domain']].parse_album(task, {'html': html})
        except KeyboardInterrupt:
            self.exit()

    @log
    def load_domains(self, init, start, end, domains):
        """Load domain documents and register them through ``add_domain``.

        On ``init`` every document with status ``'valid'`` is read; otherwise
        only documents whose ``'last'`` lies in ``(start, end]``.
        """
        if init:
            doc = {'status': 'valid'}
        else:
            doc = {'last': {'$gt': start, '$lte': end}}
        for domain in self.domain.find(doc):
            self.add_domain(init, domain, domains)

    @log
    def load_cates(self, init, start, end, cates):
        """Load category documents into the domains that are already known.

        On ``init`` every document with status ``'wait'`` is read; otherwise
        only documents whose ``'last'`` lies in ``(start, end]``.
        """
        if init:
            doc = {'status': 'wait'}
        else:
            doc = {'last': {'$gt': start, '$lte': end}}
        for cate in self.catecory.find(doc):
            if cate['domain'] in self.domains:
                self.domains[cate['domain']].add_cate(cate, cates)

    @log
    def load_albums(self, init, start, end, albums):
        """Load album documents into the domains that are already known.

        ``'pages'`` and ``'imgs'`` are stored as JSON strings and are decoded
        before the album is handed to its domain.
        """
        if init:
            doc = {'status': 'wait'}
        else:
            doc = {'last': {'$gt': start, '$lte': end}}
        for album in self.album.find(doc):
            album['pages'] = json.loads(album['pages'])
            album['imgs'] = json.loads(album['imgs'])
            if album['domain'] in self.domains:
                self.domains[album['domain']].add_album(album, albums)

    @log
    def add_domain(self, init, domain, domains):
        """Register a newly valid domain or retire one that became invalid.

        Args:
            init: when false, a newly created ``Domain`` loads its own
                pending work via ``Domain.load``.
            domain: domain document read from the database.
            domains: output dict; a retired domain's document is stored here
                (keyed by ``_id``) so that ``sync`` saves it.
        """
        _id = domain['_id']
        if domain['status'] == 'valid' and _id not in self.domains:
            domain = Domain(self, domain)
            if not init:
                domain.load()
            self.domains.put(domain)
        elif domain['status'] == 'invalid' and _id in self.domains:
            tmp = self.domains.pop(_id).domain
            # NOTE(review): this compares with '<' while Domain.add_cate and
            # Domain.add_album use '>' for the analogous check -- confirm
            # which direction is intended.
            if tmp['last'] < self.last:
                tmp['status'] = 'invalid'
                domains[_id] = tmp

    @log
    def on_exit(self):
        """Write pending in-memory changes back before the worker exits."""
        self.sync(exit=True)
Example #15
0
class Domain(object):
    """In-memory crawl state of one domain.

    Holds the compiled regular expressions taken from the domain document,
    the queues of pending categories and albums, and the items currently
    being processed (``doing``, keyed by ``_id``).
    """

    def __init__(self, master, domain):
        """Create the state for one domain.

        Args:
            master: owning ``Master``; used for database access and for its
                ``last`` sync timestamp.
            domain: domain document providing ``_id``, ``last`` and the
                ``re_*`` pattern strings compiled below.
        """
        super(Domain, self).__init__()
        self.master = master
        self.domain = domain
        self.re_cate = re.compile(self.domain['re_cate'])
        self.re_page = re.compile(self.domain['re_page'])
        self.re_album = re.compile(self.domain['re_album'])
        self.re_image = re.compile(self.domain['re_image'])
        self.re_title = re.compile(self.domain['re_title'])
        self.re_type = re.compile(self.domain['re_type'])
        self.cates = PQDict(key=lambda x: x.value['_id'], score=time.time)
        self.albums = PQDict(key=lambda x: x.value['_id'], score=album_score)
        self.doing = {}
        # Tick counter used by get() to alternate between cates and albums.
        self.next = 0

    @log
    def load(self):
        """Load this domain's waiting categories and albums from the database."""
        doc = {'domain': self.domain['_id'], 'status': 'wait'}
        # Master exposes the category collection as ``catecory``; the former
        # ``self.master.cate`` lookup raised AttributeError.
        cates = self.master.catecory.find(doc)
        for cate in cates:
            self.add_cate(cate)

        albums = self.master.album.find(doc)
        for album in albums:
            album['pages'] = json.loads(album['pages'])
            album['imgs'] = json.loads(album['imgs'])
            self.add_album(album)

    @log
    def add_cate(self, cate, cates=None):
        """Queue a waiting category or retire one that became invalid.

        Args:
            cate: category document read from the database.
            cates: optional output dict.  When given, a retired category that
                was modified after the master's last sync is stored in it
                (keyed by ``_id``) with status ``'invalid'``.
        """
        _id = cate['_id']
        if cate['status'] == 'wait' \
                and _id not in self.cates \
                and _id not in self.doing:
            self.cates.put(cate)
        elif cate['status'] == 'invalid' and cates is not None:
            # ``is not None`` rather than truthiness: the caller normally
            # passes an (initially empty) dict to be filled in.
            tmp = None
            if _id in self.cates:
                tmp = self.cates.pop(_id)
            elif _id in self.doing:
                tmp = self.doing.pop(_id)

            if tmp and tmp['last'] > self.master.last:
                tmp['status'] = 'invalid'
                cates[_id] = tmp

    @log
    def add_album(self, album, albums=None):
        """Queue a waiting album or retire one that became invalid.

        Args:
            album: album document with ``'pages'`` / ``'imgs'`` already
                decoded by the caller.
            albums: optional output dict, filled like ``cates`` in
                ``add_cate``.
        """
        _id = album['_id']
        if album['status'] == 'wait' and _id not in self.albums:
            self.albums[_id] = album

        elif album['status'] == 'invalid' and _id in self.albums \
                and albums is not None:
            tmp = self.albums.pop(_id)
            if tmp and tmp['last'] > self.master.last:
                tmp['status'] = 'invalid'
                albums[_id] = tmp

    @log
    def back_domain(self, start, end):
        """Return ``{_id: domain}`` if the domain changed in ``(start, end]``."""
        domains = {}
        if start < self.domain['last'] <= end:
            domains[self.domain['_id']] = self.domain
        return domains

    @log
    def back_cate(self, start, end):
        """Return the categories whose ``'last'`` lies in ``(start, end]``.

        Returns:
            dict: ``{_id: cate}``.
        """
        cates = {}
        tmp = []
        # Every visited category is handed back through extend() afterwards;
        # presumably PQDict.itervalues() drains the queue -- TODO confirm.
        for cate in self.cates.itervalues():
            if start < cate['last'] <= end:
                cates[cate['_id']] = cate
            tmp.append(cate)
        self.cates.extend(tmp)

        if cates:
            print('back %d cates from domain(%s).' % (len(cates), self.domain['_id']))

        # Items being processed count only if they are also queued in cates.
        for cate in self.doing.itervalues():
            if start < cate['last'] <= end and cate['_id'] in self.cates:
                cates[cate['_id']] = cate
        return cates

    @log
    def back_album(self, start, end):
        """Return the albums whose ``'last'`` lies in ``(start, end]``.

        Returns:
            dict: ``{_id: album}``.
        """
        albums = {}
        tmp = []
        # Same visit-then-extend pattern as back_cate().
        for album in self.albums.itervalues():
            if start < album['last'] <= end:
                albums[album['_id']] = album
            tmp.append(album)
        self.albums.extend(tmp)

        for album in self.doing.itervalues():
            if start < album['last'] <= end and album['_id'] in self.albums:
                albums[album['_id']] = album
        return albums

    @log
    def get(self):
        """Return the next task of this domain, or ``None`` if there is none.

        On every even tick a category is served, provided fewer than 10
        albums are queued; otherwise the next album page still in state
        ``'wait'`` is served and marked ``'doing'``.

        Returns:
            ``('cate', cate)``, ``('page', {'_id', 'domain', 'album'})`` or
            ``None`` when nothing can be handed out right now.
        """
        if not self.cates and not self.albums:
            return None

        # Albums popped during this call that had no page left to hand out.
        # They are put back on the way out: previously they were silently
        # dropped (breaking parse_album/finish_album for their in-flight
        # pages) and the loop could spin forever once both queues were empty.
        exhausted = []
        try:
            while self.cates or self.albums:
                self.next += 1
                if self.next % 2 == 0 and len(self.albums) < 10 \
                        and self.cates:
                    cate = self.cates.get()
                    self.doing[cate['_id']] = cate
                    return 'cate', cate

                if self.albums:
                    album = self.albums.popitem()[1]
                    for page, status in album['pages'].iteritems():
                        if status == 'wait':
                            album['pages'][page] = 'doing'
                            self.doing[album['_id']] = album
                            self.albums.put(album)
                            return 'page', {
                                '_id': page,
                                'domain': album['domain'],
                                'album': album['_id'],
                            }
                    exhausted.append(album)
            return None
        finally:
            for album in exhausted:
                self.albums.put(album)

    @log
    def finish_cate(self, task):
        """Mark a category in ``doing`` as done, save it and forget it."""
        if task['_id'] in self.doing:
            task['status'] = 'done'
            self.master.catecory.save(task)
            del self.doing[task['_id']]
            if task['_id'] in self.cates:
                del self.cates[task['_id']]

    @log
    def finish_album(self, task):
        """Mark one album page as done; save the album once all pages are done.

        Args:
            task: page task with ``'_id'`` (page URL) and ``'album'``.
        """
        if task['album'] in self.albums:
            album = self.albums[task['album']]
            pages = album['pages']
            imgs = album['imgs']
            pages[task['_id']] = 'done'
            for value in pages.itervalues():
                if value in ['wait', 'doing']:
                    # Pages are still outstanding; nothing to persist yet.
                    return
            album = album.copy()
            album['pages'] = json.dumps(pages)
            album['imgs'] = json.dumps(imgs)
            album['status'] = 'done'
            self.master.album.save(album)
            # Re-checked after the save before deleting.
            if task['album'] in self.albums:
                del self.albums[task['album']]

    @log
    def parse_cate(self, task, result):
        """Extract category and album links from a category page.

        Args:
            task: the category task; provides ``'_id'`` and ``'domain'``.
            result: dict whose ``'html'`` entry is the page markup.
        """
        html = result['html']
        cates = self.match(self.re_cate, html)
        albums = self.match(self.re_album, html)
        _type = self.search(self.re_type, html)
        for cate in cates:
            # Check the database as well as the in-memory queue.
            # NOTE(review): if ``find`` returns a cursor object (as pymongo's
            # does) it is always truthy and this branch never runs -- confirm
            # against MongoImage; ``find_one`` may be what is wanted.
            if cate not in self.cates and not self.master.catecory.find({'_id': cate}):
                self.cates[cate] = {
                    '_id': cate,
                    '_type': _type,
                    'domain': task['domain'],
                    'state': 0,
                    'status': 'wait',
                    'last': time.time(),
                }

        for album in albums:
            if album not in self.albums:
                print(album)
                print('*' * 80)
                self.albums[album] = {
                    '_id': album,
                    'pages': {album: 'wait'},
                    'imgs': {},
                    'domain': task['domain'],
                    'cate': task['_id'],
                    'status': 'wait',
                    'state': 'valid',
                    'title': '',
                    'last': time.time(),
                }

        self.finish_cate(task)

    @log
    def parse_album(self, task, result):
        """Download the images of an album page and queue further pages.

        Args:
            task: page task with ``'_id'`` (page URL) and ``'album'``.
            result: dict whose ``'html'`` entry is the page markup.

        Raises:
            KeyError: if the album is not present in ``self.albums``.
        """
        album_imgs = self.albums[task['album']]['imgs']
        album_pages = self.albums[task['album']]['pages']
        html = result['html']
        imgs = self.match(self.re_image, html)
        pages = self.match(self.re_page, html)

        title = self.albums[task['album']]['title']
        if not len(title):
            self.albums[task['album']]['title'] = self.search(self.re_title, html)

        if imgs:
            print(imgs)
            for img in imgs:
                if img not in album_imgs:
                    album_imgs[img] = 'wait', ''
                    content = get(img, allow_types='*/*', resp=True).content
                    path = self.master.file.put(task['_id'], content, 'jpg')
                    album_imgs[img] = 'done', path
        else:
            # Nothing matched.  The former ``album_imgs[img] = 'wait'`` here
            # referenced ``img`` before assignment and always raised.
            print('imgs is None %s %s' % (imgs, task['_id']))
        for page in pages:
            if page not in album_pages:
                album_pages[page] = 'wait'

        self.finish_album(task)

    @log
    def match(self, regx, html):
        """Return the first capture group of every match of ``regx`` in ``html``.

        ``findall`` yields plain strings for patterns with at most one group
        and tuples otherwise; for tuples the first element is used.
        """
        return [x if isinstance(x, basestring) else x[0]
                for x in regx.findall(html)]

    @log
    def search(self, regx, html, default=''):
        """Return group 1 of the first match of ``regx``, else ``default``."""
        m = regx.search(html)
        return m.group(1) if m else default

    # Example domain document with the keys this class reads:
    #
    #     {
    #         '_id':'http://www.youzi4.com/',
    #         're_cate':'href="(http:\/\/www\.youzi4\.com\/[^"]*?\/(list_.*?\.html)?)"',
    #         're_album':'href="(http:\/\/www\.youzi4\.com\/.*?\/\d+\.html)"',
    #         're_page':'href="(http:\/\/www\.youzi4\.com\/.*?\/\d+_\d+\.html)',
    #         're_title':'alt="(.*?)-.*?"',
    #         're_type':'<li><a class="active".*?>(.*?)<\/a><\/li>',
    #         're_image':'data-original="(http:\/\/img.d843.com\/uploads\/.*?\/\d+-.*?\.jpg)"',
    #         'last':1410705419.978838,
    #         'status':'valid'
    #     }

	"""
Example #16
0
class Domain(object):
    """In-memory crawl state of one domain.

    Holds the compiled regular expressions taken from the domain document,
    the queues of pending categories and albums, and the items currently
    being processed (``doing``, keyed by ``_id``).
    """

    def __init__(self, master, domain):
        """Create the state for one domain.

        Args:
            master: owning ``Master``; used for database access and for its
                ``last`` sync timestamp.
            domain: domain document providing ``_id``, ``last`` and the
                ``re_*`` pattern strings compiled below.
        """
        super(Domain, self).__init__()
        self.master = master
        self.domain = domain
        self.re_cate = re.compile(self.domain['re_cate'])
        self.re_page = re.compile(self.domain['re_page'])
        self.re_album = re.compile(self.domain['re_album'])
        self.re_image = re.compile(self.domain['re_image'])
        self.re_title = re.compile(self.domain['re_title'])
        self.re_type = re.compile(self.domain['re_type'])
        self.cates = PQDict(key=lambda x: x.value['_id'], score=time.time)
        self.albums = PQDict(key=lambda x: x.value['_id'], score=album_score)
        self.doing = {}
        # Tick counter used by get() to alternate between cates and albums.
        self.next = 0

    @log
    def load(self):
        """Load this domain's waiting categories and albums from the database."""
        doc = {'domain': self.domain['_id'], 'status': 'wait'}
        # Master exposes the category collection as ``catecory``; the former
        # ``self.master.cate`` lookup raised AttributeError.
        cates = self.master.catecory.find(doc)
        for cate in cates:
            self.add_cate(cate)

        albums = self.master.album.find(doc)
        for album in albums:
            album['pages'] = json.loads(album['pages'])
            album['imgs'] = json.loads(album['imgs'])
            self.add_album(album)

    @log
    def add_cate(self, cate, cates=None):
        """Queue a waiting category or retire one that became invalid.

        Args:
            cate: category document read from the database.
            cates: optional output dict.  When given, a retired category that
                was modified after the master's last sync is stored in it
                (keyed by ``_id``) with status ``'invalid'``.
        """
        _id = cate['_id']
        if cate['status'] == 'wait' \
                and _id not in self.cates \
                and _id not in self.doing:
            self.cates.put(cate)
        elif cate['status'] == 'invalid' and cates is not None:
            # ``is not None`` rather than truthiness: the caller normally
            # passes an (initially empty) dict to be filled in.
            tmp = None
            if _id in self.cates:
                tmp = self.cates.pop(_id)
            elif _id in self.doing:
                tmp = self.doing.pop(_id)

            if tmp and tmp['last'] > self.master.last:
                tmp['status'] = 'invalid'
                cates[_id] = tmp

    @log
    def add_album(self, album, albums=None):
        """Queue a waiting album or retire one that became invalid.

        Args:
            album: album document with ``'pages'`` / ``'imgs'`` already
                decoded by the caller.
            albums: optional output dict, filled like ``cates`` in
                ``add_cate``.
        """
        _id = album['_id']
        if album['status'] == 'wait' and _id not in self.albums:
            self.albums[_id] = album

        elif album['status'] == 'invalid' and _id in self.albums \
                and albums is not None:
            tmp = self.albums.pop(_id)
            if tmp and tmp['last'] > self.master.last:
                tmp['status'] = 'invalid'
                albums[_id] = tmp

    @log
    def back_domain(self, start, end):
        """Return ``{_id: domain}`` if the domain changed in ``(start, end]``."""
        domains = {}
        if start < self.domain['last'] <= end:
            domains[self.domain['_id']] = self.domain
        return domains

    @log
    def back_cate(self, start, end):
        """Return the categories whose ``'last'`` lies in ``(start, end]``.

        Returns:
            dict: ``{_id: cate}``.
        """
        cates = {}
        tmp = []
        # Every visited category is handed back through extend() afterwards;
        # presumably PQDict.itervalues() drains the queue -- TODO confirm.
        for cate in self.cates.itervalues():
            if start < cate['last'] <= end:
                cates[cate['_id']] = cate
            tmp.append(cate)
        self.cates.extend(tmp)

        if cates:
            print('back %d cates from domain(%s).' %
                  (len(cates), self.domain['_id']))

        # Items being processed count only if they are also queued in cates.
        for cate in self.doing.itervalues():
            if start < cate['last'] <= end and cate['_id'] in self.cates:
                cates[cate['_id']] = cate
        return cates

    @log
    def back_album(self, start, end):
        """Return the albums whose ``'last'`` lies in ``(start, end]``.

        Returns:
            dict: ``{_id: album}``.
        """
        albums = {}
        tmp = []
        # Same visit-then-extend pattern as back_cate().
        for album in self.albums.itervalues():
            if start < album['last'] <= end:
                albums[album['_id']] = album
            tmp.append(album)
        self.albums.extend(tmp)

        for album in self.doing.itervalues():
            if start < album['last'] <= end and album['_id'] in self.albums:
                albums[album['_id']] = album
        return albums

    @log
    def get(self):
        """Return the next task of this domain, or ``None`` if there is none.

        On every even tick a category is served, provided fewer than 10
        albums are queued; otherwise the next album page still in state
        ``'wait'`` is served and marked ``'doing'``.

        Returns:
            ``('cate', cate)``, ``('page', {'_id', 'domain', 'album'})`` or
            ``None`` when nothing can be handed out right now.
        """
        if not self.cates and not self.albums:
            return None

        # Albums popped during this call that had no page left to hand out.
        # They are put back on the way out: previously they were silently
        # dropped (breaking parse_album/finish_album for their in-flight
        # pages) and the loop could spin forever once both queues were empty.
        exhausted = []
        try:
            while self.cates or self.albums:
                self.next += 1
                if self.next % 2 == 0 and len(self.albums) < 10 \
                        and self.cates:
                    cate = self.cates.get()
                    self.doing[cate['_id']] = cate
                    return 'cate', cate

                if self.albums:
                    album = self.albums.popitem()[1]
                    for page, status in album['pages'].iteritems():
                        if status == 'wait':
                            album['pages'][page] = 'doing'
                            self.doing[album['_id']] = album
                            self.albums.put(album)
                            return 'page', {
                                '_id': page,
                                'domain': album['domain'],
                                'album': album['_id'],
                            }
                    exhausted.append(album)
            return None
        finally:
            for album in exhausted:
                self.albums.put(album)

    @log
    def finish_cate(self, task):
        """Mark a category in ``doing`` as done, save it and forget it."""
        if task['_id'] in self.doing:
            task['status'] = 'done'
            self.master.catecory.save(task)
            del self.doing[task['_id']]
            if task['_id'] in self.cates:
                del self.cates[task['_id']]

    @log
    def finish_album(self, task):
        """Mark one album page as done; save the album once all pages are done.

        Args:
            task: page task with ``'_id'`` (page URL) and ``'album'``.
        """
        if task['album'] in self.albums:
            album = self.albums[task['album']]
            pages = album['pages']
            imgs = album['imgs']
            pages[task['_id']] = 'done'
            for value in pages.itervalues():
                if value in ['wait', 'doing']:
                    # Pages are still outstanding; nothing to persist yet.
                    return
            album = album.copy()
            album['pages'] = json.dumps(pages)
            album['imgs'] = json.dumps(imgs)
            album['status'] = 'done'
            self.master.album.save(album)
            # Re-checked after the save before deleting.
            if task['album'] in self.albums:
                del self.albums[task['album']]

    @log
    def parse_cate(self, task, result):
        """Extract category and album links from a category page.

        Args:
            task: the category task; provides ``'_id'`` and ``'domain'``.
            result: dict whose ``'html'`` entry is the page markup.
        """
        html = result['html']
        cates = self.match(self.re_cate, html)
        albums = self.match(self.re_album, html)
        _type = self.search(self.re_type, html)
        for cate in cates:
            # Check the database as well as the in-memory queue.
            # NOTE(review): if ``find`` returns a cursor object (as pymongo's
            # does) it is always truthy and this branch never runs -- confirm
            # against MongoImage; ``find_one`` may be what is wanted.
            if cate not in self.cates and not self.master.catecory.find(
                    {'_id': cate}):
                self.cates[cate] = {
                    '_id': cate,
                    '_type': _type,
                    'domain': task['domain'],
                    'state': 0,
                    'status': 'wait',
                    'last': time.time(),
                }

        for album in albums:
            if album not in self.albums:
                print(album)
                print('*' * 80)
                self.albums[album] = {
                    '_id': album,
                    'pages': {
                        album: 'wait',
                    },
                    'imgs': {},
                    'domain': task['domain'],
                    'cate': task['_id'],
                    'status': 'wait',
                    'state': 'valid',
                    'title': '',
                    'last': time.time(),
                }

        self.finish_cate(task)

    @log
    def parse_album(self, task, result):
        """Download the images of an album page and queue further pages.

        Args:
            task: page task with ``'_id'`` (page URL) and ``'album'``.
            result: dict whose ``'html'`` entry is the page markup.

        Raises:
            KeyError: if the album is not present in ``self.albums``.
        """
        album_imgs = self.albums[task['album']]['imgs']
        album_pages = self.albums[task['album']]['pages']
        html = result['html']
        imgs = self.match(self.re_image, html)
        pages = self.match(self.re_page, html)

        title = self.albums[task['album']]['title']
        if not len(title):
            self.albums[task['album']]['title'] = self.search(
                self.re_title, html)

        if imgs:
            print(imgs)
            for img in imgs:
                if img not in album_imgs:
                    album_imgs[img] = 'wait', ''
                    content = get(img, allow_types='*/*', resp=True).content
                    path = self.master.file.put(task['_id'], content, 'jpg')
                    album_imgs[img] = 'done', path
        else:
            # Nothing matched.  The former ``album_imgs[img] = 'wait'`` here
            # referenced ``img`` before assignment and always raised.
            print('imgs is None %s %s' % (imgs, task['_id']))
        for page in pages:
            if page not in album_pages:
                album_pages[page] = 'wait'

        self.finish_album(task)

    @log
    def match(self, regx, html):
        """Return the first capture group of every match of ``regx`` in ``html``.

        ``findall`` yields plain strings for patterns with at most one group
        and tuples otherwise; for tuples the first element is used.
        """
        return [
            x if isinstance(x, basestring) else x[0]
            for x in regx.findall(html)
        ]

    @log
    def search(self, regx, html, default=''):
        """Return group 1 of the first match of ``regx``, else ``default``."""
        m = regx.search(html)
        return m.group(1) if m else default

    # Example domain document with the keys this class reads:
    #
    #     {
    #         '_id':'http://www.youzi4.com/',
    #         're_cate':'href="(http:\/\/www\.youzi4\.com\/[^"]*?\/(list_.*?\.html)?)"',
    #         're_album':'href="(http:\/\/www\.youzi4\.com\/.*?\/\d+\.html)"',
    #         're_page':'href="(http:\/\/www\.youzi4\.com\/.*?\/\d+_\d+\.html)',
    #         're_title':'alt="(.*?)-.*?"',
    #         're_type':'<li><a class="active".*?>(.*?)<\/a><\/li>',
    #         're_image':'data-original="(http:\/\/img.d843.com\/uploads\/.*?\/\d+-.*?\.jpg)"',
    #         'last':1410705419.978838,
    #         'status':'valid'
    #     }
    """