def fetch_keywords(): """ 获取站点级别的关键字并插入数据库中. """ ss.headers['Referer'] = ym r = ss.get(ym + '/girls/all/') r.encoding = 'gbk' tree = html.fromstring(r.text) links = tree.xpath('//div[@class="listap"]/a') print 'Found %s keywords' % len(links) ret = [] for link in links: msg = 'Processing keyword %s' % link.get('title') text = link.get('title').strip() keyword = Keyword.find_one({'name': text}) name = unicode(text) if keyword: msg += ', skipped as existing' else: keyword = Keyword() keyword.name = name keyword.level = KeywordLevel.SITE keyword.refer = u'%s%s' % (ym, link.get('href')) keyword.save() print msg ret.append(name) return ret
def hearsay(keyword_id):
    """Edit the hearsay article attached to a keyword.

    GET renders the edit page for the keyword; POST validates the
    submitted title/body, stores them on the keyword, and — for a
    first-time article outside debug mode — notifies Baidu.

    Returns a rendered template for GET, or a JSON success/error
    payload for POST. Aborts with 404 when the keyword is unknown.
    """
    keyword = Keyword.find_one({'_id': keyword_id})
    if not keyword:
        abort(404)
    # Open page
    if request.method == 'GET':
        return render_template('seo/hearsay.html', keyword=keyword)
    # Handle post request
    else:
        current_app.logger.info('Try to save hearsay for keyword %s/%s' % (keyword._id, keyword.name))
        title = request.form.get('title', '')
        body = request.form.get('body', '')
        if not title:
            return jsonify(success=False, message='文章标题不能为空!')
        if not body:
            return jsonify(success=False, message='文章内容不能为空!')
        # Simplified from the redundant ``True if ... else False`` ternary;
        # a keyword with no hearsay yet means this is a brand-new article.
        is_new = not keyword.hearsay
        keyword.hearsay.title = title
        keyword.hearsay.body = body
        keyword.updateTime = datetime.now()
        if is_new:
            keyword.status = KeywordStatus.PROCESSED
        keyword.save()
        # Only ping Baidu for brand-new articles, and never while debugging.
        if not current_app.debug and is_new:
            notify_baidu(current_app._get_current_object(), keyword._id)
        return jsonify(success=True, message='成功保存了你的文章。')
def refresh(keyword_id):
    """Trigger a refresh of the long-tail keywords for one site keyword.

    Looks the keyword up by id (404 if missing), kicks off the analysis
    against the current app, and reports success as JSON.
    """
    kw = Keyword.find_one({'_id': keyword_id})
    if not kw:
        abort(404)
    analyze_keyword(current_app._get_current_object(), kw)
    return jsonify(success=True, message='成功触发了刷新请求,请稍候查看最新数据。')
def analyze_keyword(k): """ 分析站点级别的关键字, 获取其百度指数以及其相关的长尾关键字. """ keyword = Keyword.find_one({'name': k}) if not keyword: print 'Keyword %s does not exist' % k return if keyword.baiduIndex > 0 or keyword.baiduResult > 0: print 'Keyword %s is imported before' % k return print 'Try to analyze keyword %s/%s' % (keyword._id, k) ss.headers['Referer'] = 'http://www.5118.com/' t = ss.get('http://www.5118.com/seo/words/%s' % url_quote(k)).text tree = html.fromstring(t) dls = tree.xpath('//div[@class="Fn-ui-list dig-list"]/dl') total = len(dls) for dl in dls: if dl.get('class', '') == 'dl-word': continue name = unicode(dl.xpath('./dd[1]//a[1]/@title')[0].strip()) baidu_index = dl.xpath('./dd[2]/text()')[0].strip() baidu_result = dl.xpath('./dd[3]/text()')[0].strip() if not baidu_index.isdigit(): baidu_index = 0 if not baidu_result.isdigit(): baidu_result = 0 print 'Found keyword: %s/%s/%s' % (name, baidu_index, baidu_result) if name == k: keyword.baiduIndex = int(baidu_index) keyword.baiduResult = int(baidu_result) if total > 2: keyword.total = total - 2 keyword.save() else: if Keyword.count({'name': name}) > 0: print 'This keyword already exists' continue long_tail = Keyword() long_tail.name = name long_tail.level = KeywordLevel.LONG_TAIL long_tail.parentId = keyword._id long_tail.baiduIndex = int(baidu_index) long_tail.baiduResult = int(baidu_result) long_tail.save() time.sleep(random.randint(5, 15))
def analyze_keyword(app, keyword):
    """Analyze a site-level keyword: fetch its Baidu index and its related
    long-tail keywords. Currently scraped from 5118.

    :param app: Flask application object (used for logging).
    :param keyword: a ``Keyword`` document; updated and saved in place.
    """
    app.logger.info('Try to analyze keyword %s/%s' % (keyword._id, keyword.name))
    ss.headers['Referer'] = 'http://www.5118.com/'
    t = ss.get('http://www.5118.com/seo/words/%s' % url_quote(keyword.name)).text
    tree = html.fromstring(t)
    dls = tree.xpath('//div[@class="Fn-ui-list dig-list"]/dl')
    total = len(dls)
    for dl in dls:
        # Skip the header row of the result table.
        if dl.get('class', '') == 'dl-word':
            continue
        name = unicode(dl.xpath('./dd[1]//a[1]/@title')[0].strip())
        baidu_index = dl.xpath('./dd[2]/text()')[0].strip()
        baidu_result = dl.xpath('./dd[3]/text()')[0].strip()
        # Non-numeric cells are treated as zero.
        if not baidu_index.isdigit():
            baidu_index = 0
        if not baidu_result.isdigit():
            baidu_result = 0
        app.logger.info('Found keyword: %s/%s/%s' % (name, baidu_index, baidu_result))
        if name == keyword.name:
            # The row matching the keyword itself updates the parent record.
            keyword.baiduIndex = int(baidu_index)
            keyword.baiduResult = int(baidu_result)
            if total > 2:
                # presumably two of the rows are header/self — TODO confirm
                keyword.total = total - 2
            keyword.save()
        else:
            long_tail = Keyword.find_one({'name': name})
            if not long_tail:
                # Only create the long-tail record when it is new; existing
                # records are left untouched (matches the sibling variant's
                # skip-existing behavior — confirm intended scope).
                long_tail = Keyword()
                long_tail.name = name
                long_tail.level = KeywordLevel.LONG_TAIL
                long_tail.parentId = keyword._id
                long_tail.baiduIndex = int(baidu_index)
                long_tail.baiduResult = int(baidu_result)
                long_tail.save()
def longtail(keyword_id):
    """Render one page of long-tail keywords under a site keyword.

    Query-string parameters:
        status -- comma-separated status filter
                  (defaults to 'bare,processed,repeated')
        page   -- 1-based page number (defaults to 1)

    Aborts with 404 when the parent keyword does not exist.
    """
    keyword = Keyword.find_one({'_id': keyword_id})
    if not keyword:
        abort(404)
    s = request.args.get('status', u'bare,processed,repeated')
    p = int(request.args.get('page', '1'))
    start = (p - 1) * PAGE_COUNT
    condition = {'level': KeywordLevel.LONG_TAIL, 'parentId': keyword_id}
    status = s.split(u',')
    if status:
        condition['status'] = {'$in': status}
    count = Keyword.count(condition)
    # Sort by Baidu index descending and fetch only the requested page.
    cursor = Keyword.find(condition, skip=start, limit=PAGE_COUNT,
                          sort=[('baiduIndex', pymongo.DESCENDING)])
    # Materialize the page in one pass instead of a manual append loop.
    keywords = list(cursor)
    pagination = Pagination(p, PAGE_COUNT, count)
    return render_template('seo/longtail.html', keyword=keyword,
                           keywords=keywords, pagination=pagination)