Example #1
0
def save_words(handler, article, words, ext):
	redis_word = handler.redis_word
	mongo = handler.mongo

	if 'word' in article and article['word']:
		row = mongo.word_file.get({article['word']})
		if row:
			row = json.loads(row)
			if row['sim'] == True:
				row['words'] = json.loads(row['words'])
				for word, cnt in row['words']['all'].iteritems():
					word = word.lower()
					hkey = unicode2hash(word)
					key = hkey % 500
					redis_word.hincrby(key, hkey, -1)
				redis_word.incr('total', -1)

	if article['sim'] == True:
		for word, cnt in words['all'].iteritems():
			word = word.lower()
			hkey = unicode2hash(word)
			key = hkey % 500
			redis_word.hincrby(key, hkey, 1)
		redis_word.incr('total', 1)

	if 'id' not in article or not article['id']:
		article['pubdate'], article['id'] = time2id(handler, article['pubtime'])
	else:
		article['pubdate'] = time.strftime('%Y%m%d', time.localtime(pubtime))
	article['words'] = article['_id']

	words = {'words':words, 'sim':article['sim']}
	
	words, article['tags'] = update_index(handler, article, words)
	mongo.word_file.put(article['_id'], json.dumps(words))

	web_article = {
		'_id': article['_id'],
		'id': article['id'],
		'long': article['long'],
		'title': article['title'],
		'domain': article['domain'],
		'src_name': article['src_name'],
		'src_link': article['src_link'],
		'tags': article['tags'],
		'icons': article['icons'],
		'url': article['url'],
		'sim': article['sim'],
		'icons': article['icons'],
		'pubtime': article['pubtime'],
		'last': article['last'],
	}
	content = web_content(handler, article, words, ext)
	web_article['content'] = mongo.text_file.put('web_%s' % article['_id'], content.encode('utf-8'), 'txt')
	while True:
		try:
			mongo.article.save(web_article)
			break
		except pymongo.errors.OperationFailure, e:
			time.sleep(1)
Example #2
0
	def word2num(self, word):
		"""Return the stored document frequency for *word*, floored at 1.0.

		The counters are sharded into 500 redis hashes keyed by the
		word hash modulo 500.
		"""
		hkey = unicode2hash(word.lower())
		raw = self.redis.hget(hkey % 500, hkey)
		count = 0 if raw is None else int(raw)
		return float(max(1, count))
Example #3
0
		def score(word):
			"""True when the word's stored document frequency is at least 5."""
			hkey = unicode2hash(word.lower())
			raw = self.redis.hget(hkey % 500, hkey)
			df = 0 if raw is None else int(raw)
			return df >= 5
Example #4
0
    def find(self, word, last=None, limit=20, fields=None):
        """Return (total_count, rows) for *word*, newest first.

        When *last* is given it is the article id of the previous page's
        final row; results strictly older than that row are returned.
        """
        whash = unicode2hash(word.lower())
        query = {'word': whash}
        order = [('pubtime', pymongo.DESCENDING)]
        count = self.keys.find(query).count()

        if last is None:
            cursor = self.keys.find(query, fields=fields).sort(order)
            return count, list(cursor.limit(limit))

        # Resolve the anchor row to find where the previous page ended.
        anchor = self.keys.find_one({'word': whash, 'article': last},
                                    {'pubtime': 1})
        if anchor is None:
            return count, []

        older = {'word': whash, 'pubtime': {'$lt': anchor['pubtime']}}
        cursor = self.keys.find(older, fields=fields).sort(order)
        return count, list(cursor.limit(limit))
Example #5
0
 def find_page(self, word, page, limit=20, fields=None):
     """Return (total_count, rows) for 1-based page *page* of the word index."""
     # page is 1-based: page 1 -> skip 0.
     skip = page * limit - limit
     word = word.lower()
     whash = unicode2hash(word)
     topic = self.keys.find({
         'word': whash
     }, fields=fields).sort([('pubtime', pymongo.DESCENDING)])
     return topic.count(), list(topic.skip(skip).limit(limit))
Example #6
0
	def new(self, url, src_type, src, task, last=0):
		"""Register a freshly discovered article URL.

		Returns 1 when the URL was queued for crawling, 0 when it was
		rejected (unknown template, duplicate key, or the URL hash was
		not claimable for this domain).
		"""
		key = hashlib.md5(url.encode('utf-8')).hexdigest()
		xlong = unicode2hash(url)
		tpl = url2tpl(url)

		# NOTE: add_url has a side effect (claims the URL hash), so the
		# short-circuit order of these checks matters.
		if (tpl not in self.domain.tpls
				or key in self
				or self.domains.add_url(xlong, self.domain.id()) == 0):
			return 0

		article = {
			'_id': key,
			'id': '',
			'long': xlong,
			'url': url,
			'domain': self.domain.id(),
			'tpl': tpl,
			'src_type': src_type,
			'src': src,
			'html': '',
			'title': '',
			'pages': {},
			'imgs': {},
			'icons': {},
			'tags': [],
			'sim': False,
			'f': False,
			'version': 0,
			'v': self.articles.new_version(),
			'created': time.time(),
			'last': time.time(),
		}

		if src_type == 'cate':
			article['src_link'] = task['url']
			article['src_name'] = task['name']
			# Fall back to a 60-day-old pubtime when no hint is given.
			article['pubtime'] = last if last > 0 else time.time() - 86400 * 60
		else:
			article['src_link'] = self.domain.domain['link']
			article['src_name'] = self.domain.domain['name']
			article['pubtime'] = task['pubtime'] - 86400 * 15

		article['pubtime'] = self.get_pubtime(article)

		# Track the newest pubtime seen so far.
		if self.next < article['pubtime']:
			self.next = article['pubtime']

		self.updates.add(article['_id'])
		self.queue.put(article)
		return 1
Example #7
0
    def new(self, url, src_type, src, task, last=0):
        """Register a freshly discovered article URL; 1 = queued, 0 = rejected."""
        key = hashlib.md5(url.encode('utf-8')).hexdigest()
        xlong = unicode2hash(url)
        tpl = url2tpl(url)
        # NOTE: add_url has a side effect (claims the URL hash), so the
        # short-circuit order of these checks matters.
        if tpl not in self.domain.tpls \
          or key in self \
          or self.domains.add_url(xlong, self.domain.id()) == 0:
            return 0

        article = {
            '_id': key,
            'id': '',
            'long': xlong,
            'url': url,
            'domain': self.domain.id(),
            'tpl': tpl,
            'src_type': src_type,
            'src': src,
            'html': '',
            'title': '',
            'pages': {},
            'imgs': {},
            'icons': {},
            'tags': [],
            'sim': False,
            'f': False,
            'version': 0,
            'v': self.articles.new_version(),
            'created': time.time(),
            'last': time.time(),
        }

        if src_type == 'cate':
            article['src_link'] = task['url']
            article['src_name'] = task['name']
            if last > 0:
                article['pubtime'] = last
            else:
                # No hint available: fall back to a 60-day-old pubtime.
                article['pubtime'] = time.time() - 86400 * 60
        else:
            article['src_link'] = self.domain.domain['link']
            article['src_name'] = self.domain.domain['name']
            article['pubtime'] = task['pubtime'] - 86400 * 15

        article['pubtime'] = self.get_pubtime(article)

        # Track the newest pubtime seen so far.
        if self.next < article['pubtime']:
            self.next = article['pubtime']

        self.updates.add(article['_id'])
        self.queue.put(article)
        return 1
Example #8
0
	def find(self, word, last=None, limit=20, fields=None):
		"""Return (total_count, rows) for *word*, newest first.

		*last*, when given, is the article id of the previous page's
		final row; only rows strictly older than it are returned.
		"""
		word = word.lower()
		whash = unicode2hash(word)
		count = self.keys.find({'word':whash}).count()
		if last is None:
			return count, list(self.keys.find({'word':whash}, fields=fields).sort([('pubtime',pymongo.DESCENDING)]).limit(limit))

		# Resolve the anchor row to find where the previous page ended.
		last = self.keys.find_one({'word':whash, 'article':last}, {'pubtime':1})
		if last is None:
			return count, []
		pubtime = last['pubtime']
		topic = self.keys.find({'word':whash, 'pubtime':{'$lt':pubtime}}, fields=fields).sort([('pubtime',pymongo.DESCENDING)])
		return count, list(topic.limit(limit))
Example #9
0
    def add(self, word, id, imgs, pubtime, icons):
        """Insert one (word, article) index row and update the word summary.

        Keeps at most ~1000 rows per word by pruning the lowest-ranked
        entries once the count exceeds 1500.
        """
        word = word.lower()
        whash = unicode2hash(word)
        res = self.index.find_one({'_id': word})

        row = {
            '_id': str2hash('%d-%s' % (whash, id)),
            'article': id,
            'word': whash,
            'imgs': imgs,
            'pubtime': pubtime,
            'rank': self.rank(imgs, pubtime)
        }

        # First sighting of this word: create an empty summary record.
        if res is None:
            res = {
                '_id': word,
                'word': whash,
                'rank': 0,
                'count': 0,
                'icon': '',
                'icon_time': 0,
                'auto': True,
            }

        # Prune: once 1500+ rows accumulate, find the rank of the
        # 1000th-best row and drop everything strictly below it.
        if res['count'] >= 1500:
            words = list(
                self.keys.find({
                    'word': whash
                }).sort([('rank', -1)]).skip(999).limit(1))
            if words:
                self.keys.remove({
                    'word': whash,
                    'rank': {
                        '$lt': words[0]['rank']
                    }
                })
                # New admission threshold is the cutoff rank.
                res['rank'] = words[0]['rank']
            res['count'] = 1000

        # Only rows beating the current threshold are stored.
        # NOTE(review): unlike the other add() variant in this file, the
        # save/count/index updates here happen only inside this branch —
        # confirm this asymmetry is intentional.
        if row['rank'] > res['rank']:
            # Auto-refresh the word's icon at most once every 3 days.
            if icons and res[
                    'auto'] == True and pubtime - res['icon_time'] > 3 * 86400:
                res['icon'] = icons.pop()
                res['icon_time'] = pubtime
            self.keys.save(row)
            res['count'] += 1
            self.index.save(res)
Example #10
0
def _upgrade_word(row):
	"""Re-save one word file in the new format and rebuild DF counters.

	Mirrors the counting rules of save_words: only articles with
	sim == True contribute to the sharded per-word counters and to the
	'total' counter.
	"""
	row['words'] = json.loads(row['words'])
	spider.word_file.put(row['_id'], json.dumps({
		'sim': row['sim'],
		'words': row['words'],
	}))

	if row['sim'] == True:
		words = row['words']
		for word, cnt in words['all'].iteritems():
			word = word.lower()
			hkey = unicode2hash(word)
			key = hkey % 500
			redis_word.hincrby(key, hkey, 1)
		# BUG FIX: 'total' was incremented unconditionally, but save_words
		# counts only sim == True articles; keep both paths consistent so
		# the rebuilt counters match what save_words would produce.
		redis_word.incr('total', 1)
Example #11
0
    def merger_pages(self, article, pages):
        """Reconcile article['pages'] with the freshly extracted page URLs.

        Known pages keep their existing state; new pages start in the
        'wait' state. Stored HTML for pages that disappeared is removed.
        """
        previous = article['pages']
        merged = {}
        article['pages'] = merged
        for page_url in pages:
            digest = hashlib.md5(page_url.encode('utf-8')).hexdigest()
            # Claim the page URL hash for this article's domain.
            self.domains.add_url(unicode2hash(page_url), article['domain'])
            if digest in previous:
                merged[digest] = previous[digest]
            else:
                merged[digest] = {
                    'url': page_url,
                    'path': '',
                    'status': 'wait',
                    'last': time.time(),
                }

        # Garbage-collect fetched HTML for pages no longer referenced.
        for digest, page in previous.iteritems():
            if digest not in merged and page['status'] == 'done':
                self.articles.html_file.remove(page['path'])
Example #12
0
	def merger_pages(self, article, pages):
		"""Reconcile article['pages'] with the freshly extracted page URLs.

		Known pages keep their existing state; new pages start in the
		'wait' state. Stored HTML for dropped pages is removed.
		"""
		tmp_pages = article['pages']
		article['pages'] = {}
		for page in pages:
			md5 = hashlib.md5(page.encode('utf-8')).hexdigest()
			xlong = unicode2hash(page)
			# Claim the page URL hash for this article's domain.
			self.domains.add_url(xlong, article['domain'])
			if md5 in tmp_pages:
				article['pages'][md5] = tmp_pages[md5]
			else:
				article['pages'][md5] = {
					'url': page,
					'path': '',
					'status': 'wait',
					'last': time.time(),
				}

		# Garbage-collect fetched HTML for pages no longer referenced.
		for md5, page in tmp_pages.iteritems():
			if md5 not in article['pages'] and page['status'] == 'done':
				self.articles.html_file.remove(page['path'])
Example #13
0
	def add(self, word, id, num, imgs, pubtime, icons):
		"""Insert one (word, article) index row and update the word summary.

		Keeps at most ~1000 rows per word by pruning the lowest-ranked
		entries once the count exceeds 1500.
		"""
		word = word.lower()
		whash = unicode2hash(word)
		res = self.index.find_one({'_id':word})

		row = {
			'_id': str2hash('%d-%s' % (whash, id)),
			'article': id,
			'word': whash,
			'num': num,
			'imgs': imgs,
			'pubtime': pubtime,
			'rank': self.rank(num, imgs, pubtime)
		}

		# First sighting of this word: create an empty summary record.
		if res is None:
			res = {
				'_id': word,
				'word': whash,
				'rank': 0,
				'count': 0,
				'icon': '',
				'icon_time': 0,
				'auto': True,
			}

		if res['count'] >= 2:
			# Prune: once 1500+ rows accumulate, find the rank of the
			# 1000th-best row and drop everything strictly below it.
			if res['count'] >= 1500:
				words = list(self.keys.find({'word':whash}).sort([('rank', -1)]).skip(999).limit(1))
				if words:
					self.keys.remove({'word':whash, 'rank':{'$lt':words[0]['rank']}})
					res['rank'] = words[0]['rank']

			if row['rank'] > res['rank']:
				# Auto-refresh the word's icon at most once every 3 days.
				if icons and res['auto'] == True and pubtime - res['icon_time'] > 3 * 86400:
					res['icon'] = icons.pop()
					res['icon_time'] = pubtime
			# NOTE(review): save() sits outside the rank check here, so every
			# row with count >= 2 is stored regardless of rank — the other
			# add() variant in this file saves only above-threshold rows.
			# Confirm which placement is intended.
			self.keys.save(row)

		res['count'] += 1
		self.index.save(res)
Example #14
0
	def find_page(self, word, page, limit=20, fields=None):
		"""Return (total_count, rows) for 1-based page *page* of the word index."""
		# page is 1-based: page 1 -> offset 0.
		offset = (page - 1) * limit
		whash = unicode2hash(word.lower())
		cursor = self.keys.find({'word':whash}, fields=fields).sort([('pubtime',pymongo.DESCENDING)])
		return cursor.count(), list(cursor.skip(offset).limit(limit))
Example #15
0
def _upgrade_article(row):
    """Migrate one spider article row to the new schema.

    Copies the whitelisted fields, re-registers the URL hash, stores the
    plain-text content, recomputes the sim flag, and re-saves failed rows
    into the exception collection.
    NOTE(review): relies on module globals (`keys`, `iweb`, `redis_url`,
    `sim`, `gevent`) defined elsewhere in this file — not visible here.
    """
    # Copy only the fields listed in the module-level `keys` whitelist.
    article = dict((x, row[x]) for x in keys)

    # Re-claim the URL hash under the article's domain.
    xlong = unicode2hash(article['url'])
    redis_url.sadd(article['domain'], xlong)
    article['long'] = xlong
    article['pubdate'], article['id'] = '', ''

    del article['v']['tag']

    article['icons'] = row['icons']

    if 'content' in row and row['content']:
        article['content'] = iweb.text_file.put('spider_%s' % article['_id'],
                                                row['content'].encode('utf-8'),
                                                'txt')
    else:
        article['content'] = ''

    # Recompute the similarity flag only for rows that had sim versioning.
    if article['v']['sim'] > 0:
        article['sim'] = sim(article, row['content'])
    else:
        article['sim'] = False

    if row['v']['seg'] > 0:
        # The word-index migration below was disabled; kept for reference.
        # article['pubdate'], article['id'] = time2id(article['pubtime'])
        # article['words'] = article['_id']
        # words = iweb.word_file.get(article['words'])
        # if words is not None:
        # 	words = json.loads(words)
        # 	words['sim'] = article['sim']
        # 	words, article['tags'] = update_index(article, words)
        # 	iweb.word_file.put(row['_id'], json.dumps(words))

        # 	web_article = {
        # 		'_id': article['_id'],
        # 		'id': article['id'],
        # 		'long': article['long'],
        # 		'title': article['title'],
        # 		'domain': article['domain'],
        # 		'src_name': article['src_name'],
        # 		'src_link': article['src_link'],
        # 		'tags': article['tags'],
        # 		'icons': article['icons'],
        # 		'url': article['url'],
        # 		'sim': article['sim'],
        # 		'icons': article['icons'],
        # 		'pubtime': article['pubtime'],
        # 		'last': article['last'],
        # 	}
        # 	content = web_content(row, article, words)
        # 	web_article['content'] = iweb.text_file.put('web_%s' % article['_id'], content.encode('utf-8'), 'txt')
        # 	while True:
        # 		try:
        # 			iweb.article.save(web_article)
        # 			break
        # 		except pymongo.errors.OperationFailure, e:
        # 			print str(e)
        # 			gevent.sleep(1)

        # 	if len(article['tags']) >= 3 and article['icons']:
        # 		topics.add(web_article)
        # else:
        # 	row['exc'] = 'ValueError'
        pass
    else:
        article['words'] = ''
        article['tag'] = []

    # Preserve the crawl exception, if any, in the exception collection.
    if row['exc']:
        article['exc'] = row['exc']

        # Retry on transient mongo failures until the save succeeds.
        while True:
            try:
                iweb.spider_exc.save(article)
                break
            except pymongo.errors.OperationFailure, e:
                print str(e)
                gevent.sleep(1)
Example #16
0
def save_words(handler, article, words, ext):
    redis_word = handler.redis_word
    mongo = handler.mongo

    if 'word' in article and article['word']:
        row = mongo.word_file.get({article['word']})
        if row:
            row = json.loads(row)
            if row['sim'] == True:
                row['words'] = json.loads(row['words'])
                for word, cnt in row['words']['all'].iteritems():
                    word = word.lower()
                    hkey = unicode2hash(word)
                    key = hkey % 500
                    redis_word.hincrby(key, hkey, -1)
                redis_word.incr('total', -1)

    if article['sim'] == True:
        for word, cnt in words['all'].iteritems():
            word = word.lower()
            hkey = unicode2hash(word)
            key = hkey % 500
            redis_word.hincrby(key, hkey, 1)
        redis_word.incr('total', 1)

    if 'id' not in article or not article['id']:
        article['pubdate'], article['id'] = time2id(handler,
                                                    article['pubtime'])
    else:
        article['pubdate'] = time.strftime('%Y%m%d', time.localtime(pubtime))
    article['words'] = article['_id']

    words = {'words': words, 'sim': article['sim']}

    words, article['tags'] = update_index(handler, article, words)
    mongo.word_file.put(article['_id'], json.dumps(words))

    web_article = {
        '_id': article['_id'],
        'id': article['id'],
        'long': article['long'],
        'title': article['title'],
        'domain': article['domain'],
        'src_name': article['src_name'],
        'src_link': article['src_link'],
        'tags': article['tags'],
        'icons': article['icons'],
        'url': article['url'],
        'sim': article['sim'],
        'icons': article['icons'],
        'pubtime': article['pubtime'],
        'last': article['last'],
    }
    content = web_content(handler, article, words, ext)
    web_article['content'] = mongo.text_file.put('web_%s' % article['_id'],
                                                 content.encode('utf-8'),
                                                 'txt')
    while True:
        try:
            mongo.article.save(web_article)
            break
        except pymongo.errors.OperationFailure, e:
            time.sleep(1)