def admin_save(request):
    """Create or update an article from the admin form, then redirect.

    When the POST data carries a non-empty ``article_id`` the existing
    article's category, content and tags are updated; otherwise a new
    article is saved under a freshly generated UUID.

    Persistence errors are printed rather than raised, so the redirect to
    ``admin_page`` happens whether or not the write succeeded.
    """
    form = request.POST
    # .get() so that a create form which omits the field entirely is treated
    # like one that sends it empty, instead of raising a key error.
    article_id = form.get('article_id')

    if article_id:
        # Update an existing article.
        uid = article_id
        article = ArticleModel(id=uid, title=form['title'])
        try:
            article.update(
                actions=[
                    ArticleModel.category.set(form['category']),
                    ArticleModel.content.set(form['content']),
                    ArticleModel.tags.set(form['tags'].split(',')),
                ],
                condition=(ArticleModel.id == uid))
        except Exception as e:
            print(str(e))
    else:
        # Save a new article.
        uid = str(uuid.uuid1())
        article = ArticleModel(id=uid,
                               title=form['title'],
                               category=form['category'],
                               content=form['content'],
                               create_time=str(datetime.now()),
                               editor=form['editor'],
                               tags=form['tags'].split(','))
        try:
            article.save()
        except Exception as e:
            print(str(e))

    return redirect('admin_page', id=uid)
def PUT(self, id):
    """Create or update an article from the JSON request body.

    An ``id`` of ``'0'`` means "create": the id is replaced by ``None`` and a
    new row is added. Any other truthy id updates the matching row in place.

    Missing fields fall back to defaults: ``code`` -> ``'code'``,
    ``title`` -> ``'title'``, ``remark`` -> ``''``. A missing or unparsable
    ``posttime`` (expected as ``YYYY-MM-DD``) falls back to the current time.

    Raises:
        Whatever the session raises while writing or committing; the session
        is rolled back before the exception is re-raised.
    """
    item = json.loads(web.data())
    if id == '0':
        id = None

    code = item.get('code', 'code')
    title = item.get('title', 'title')
    try:
        posttime = datetime.datetime.strptime(item['posttime'], '%Y-%m-%d')
    except (KeyError, ValueError, TypeError):
        # TypeError covers a non-string posttime such as JSON null.
        posttime = datetime.datetime.now()
    remark = item.get('remark', '')

    try:
        if id:
            Session.query(ArticleModel).filter(
                ArticleModel.id == id).update({
                    'code': code,
                    'title': title,
                    'posttime': posttime,
                    'remark': remark,
                })
        else:
            # Only the insert path needs a model instance.
            Session.add(ArticleModel(id, code, title, posttime, remark))
        Session.commit()
    except Exception:
        # Roll back so the failed transaction is not left open on the session.
        Session.rollback()
        raise
def listCategoryWithCount(limit=BaseService.LIMIT):
    """Return the category listing with an article count on each entry.

    Categories come from ``BaseService.list`` for ``'CategoryModel'``. For
    every entry in the result's ``'data'`` list, articles whose ``category``
    equals the entry's ``'category_name'`` are scanned and the scan's
    ``total_count`` is stored under ``'cnt'``.
    """
    categories = BaseService.list(model_name='CategoryModel', limit=limit)
    for entry in categories['data']:
        matches = ArticleModel.scan(
            ArticleModel.category == entry['category_name'])
        # Consume the whole scan before reading total_count.
        for _ in matches:
            pass
        entry['cnt'] = matches.total_count
    return categories
def searchArticleByCat(cat, limit=None, last_evaluated_key=None):
    """Scan for articles whose category equals ``cat``.

    Returns a dict with ``'data'`` (each article's ``to_dict()`` result) and
    ``'lek'`` (the scan's ``last_evaluated_key`` as read when the last
    article was yielded, or ``None`` if nothing matched).
    """
    results = ArticleModel.scan(
        ArticleModel.category == cat,
        limit=limit,
        last_evaluated_key=last_evaluated_key,
    )
    rows = []
    cursor = None
    for article in results:
        # Read the key on every step so the final value belongs to the last row.
        cursor = results.last_evaluated_key
        rows.append(article.to_dict())
    return {'data': rows, 'lek': cursor}
def searchArticleByKeyword(keyword, limit=None, last_evaluated_key=None):
    """Scan for articles that match ``keyword``.

    An article matches when its category starts with the keyword, or its
    title contains it, or its content contains it.

    Returns a dict with ``'data'`` (each article's ``to_dict()`` result) and
    ``'lek'`` (the scan's ``last_evaluated_key`` as read when the last
    article was yielded, or ``None`` if nothing matched).
    """
    matches_keyword = (
        ArticleModel.category.startswith(keyword)
        | ArticleModel.title.contains(keyword)
        | ArticleModel.content.contains(keyword)
    )
    results = ArticleModel.scan(
        matches_keyword,
        limit=limit,
        last_evaluated_key=last_evaluated_key,
    )
    rows = []
    cursor = None
    for article in results:
        cursor = results.last_evaluated_key
        rows.append(article.to_dict())
    return {'data': rows, 'lek': cursor}
def searchArticleByTag(tag, limit=None, last_evaluated_key=None):
    """Scan for articles whose tags contain ``tag``.

    Returns a dict with ``'data'`` (each article's ``to_dict()`` result) and
    ``'lek'`` (the scan's ``last_evaluated_key`` as read when the last
    article was yielded, or ``None`` if nothing matched).
    """
    has_tag = ArticleModel.tags.contains(tag)
    results = ArticleModel.scan(
        has_tag,
        limit=limit,
        last_evaluated_key=last_evaluated_key,
    )
    rows = []
    cursor = None
    for article in results:
        cursor = results.last_evaluated_key
        rows.append(article.to_dict())
    return {'data': rows, 'lek': cursor}
def crawl(indexUrl='', baseDir=BASE_DIR, listElem='a', itemElem=''):
    """Crawl all pages, save detail items to the database, then export HTML.

    Phase 1: every URL produced by ``buildURLs(indexUrl, listElem)`` is
    fetched as a ``Page`` and each of its items is saved to the database.

    Phase 2: every ``ArticleModel`` row tagged ``'Python'`` is written as a
    numbered ``.html`` file under ``baseDir``.

    NOTE: this function is Python 2 code (``str.decode`` is called on the
    title and the content).
    """
    print('start crawl url: %s' % indexUrl)
    urls = buildURLs(indexUrl, listElem)
    for url in urls:
        if url:
            p = Page(url, itemElem)
            try:
                print('fetching ' + p.url)
                p.fetch()
            except requests.ConnectionError:
                print("Network Error")
            print('{} items found'.format(len(p.items)))
            for item in p.items:
                print(item)
                item.save_to_db()

    # Named 'articles' so the builtin 'list' is not shadowed.
    articles = ArticleModel.select().where(ArticleModel.tags == 'Python')
    for i, item in enumerate(articles, 1):
        content = Utils.replace_charentity(item.summary)
        content = content.replace(r'src="', 'src="' + BASE_URL)
        # Decode the title first, otherwise it breaks once it is put into the
        # path; "/" would be read as a directory separator, so replace it.
        title = item.title.decode('utf-8').replace("/", " ")
        fileName = "%d " % i + title + '.html'
        tool.FileUtils().mkdir(baseDir)
        # os.path.join keeps the file inside baseDir even when baseDir has no
        # trailing separator.
        filePath = os.path.join(baseDir, fileName)
        if os.path.exists(filePath):
            # NOTE(review): an existing file is removed and then *skipped*, so
            # it is only rewritten on the next run -- confirm this is intended.
            os.remove(filePath)
            print("removing {}".format(fileName))
            continue
        try:
            with open(filePath, 'wb') as f:
                f.write('<!DOCTYPE html>')
                f.write('<head>')
                # Declare the charset to avoid wrong decoding in the browser.
                f.write('<meta content="text/html; charset=UTF-8" http-equiv="Content-Type" />')
                f.write('</head>')
                f.write('<body>')
                # NOTE(review): this writes a decoded (unicode) string to a
                # binary file; presumably fine for ASCII content only -- verify.
                f.write(content.decode('utf-8'))
                f.write('</body>')
                f.write('</html>')
        except (IOError, OSError):
            # File I/O cannot raise the requests exceptions that were caught
            # here before; report the write failure and move on.
            print("Error")
def major(self):
    """Crawl every news category and persist what is found.

    For each category the article briefs are fetched; for each brief the
    author, the article and the author's behaviour records are stored. The
    article's comments are then fetched, and for each comment the commenting
    user, the comment, its behaviour records and a round of simulated
    browsing are stored. Replies to each comment are handled the same way.

    Every stage is best-effort: a failure is printed and logged, and
    processing moves on to the next item at that level. Only ``Exception``
    subclasses are absorbed, so ``KeyboardInterrupt`` and ``SystemExit``
    still stop the crawl.
    """
    categories = ['news_society', 'news_entertainment', 'news_tech', 'news_military',
                  'news_sports', 'news_car', 'news_finance', 'news_world',
                  'news_fashion', 'news_travel', 'news_discovery', 'news_baby',
                  'news_regimen', 'news_story', 'news_essay', 'news_game',
                  'news_history', 'news_food']
    for category in categories:
        print("当前类别: %s" % category)
        logging.info("当前类别: %s" % category)

        # ---- articles of this category ----
        try:
            arts_brief_json = self.__art_pro.get_arts_brief_json_by_category(category)
            logging.info('%s arts_brief_json 获取 成功' % category)
        except Exception:
            print('%s arts_brief_json 获取 失败' % category)
            logging.exception('%s arts_brief_json 获取 失败' % category)
            continue

        for art_i, art_brief_json in enumerate(arts_brief_json):
            print("当前新闻: %d/%d %s" % (art_i, len(arts_brief_json), category))
            logging.info("当前新闻: %d/%d %s" % (art_i, len(arts_brief_json), category))

            # ---- article author ----
            art_cus_mod = CusMod.CustomerModel()
            try:
                self.__cus_pro.set_art_cus(art_brief_json, art_cus_mod)
                self.__cus_dao.insert_then_get_cus(art_cus_mod)
                self.__cus_dao.update_cus_feature(category, art_cus_mod.cus_id, flag=True)
                logging.info("%s-%d art_cus 处理 成功" % (category, art_i))
            except Exception:
                print("%s-%d art_cus 处理 失败" % (category, art_i))
                logging.exception("%s-%d art_cus 处理 失败" % (category, art_i))
                continue

            # ---- article ----
            art_mod = ArtMod.ArticleModel()
            try:
                self.__art_pro.set_art(art_brief_json, category, art_cus_mod.cus_id, art_mod)
                if not self.__art_dao.is_art_exist(art_mod.art_spider):
                    # The article is not stored yet.
                    self.__art_dao.insert_art(art_mod)
                else:
                    print("art 已存在")
                    continue
                art_mod.art_id = self.__art_dao.search_art_id_by_spider(art_mod.art_spider)
                logging.info("%s-%d art 操作 成功" % (category, art_i))
            except Exception:
                print("%s-%d art 操作 失败" % (category, art_i))
                logging.exception("%s-%d art 操作 失败" % (category, art_i))
                continue

            # ---- article / author behaviour ----
            try:
                if self.__art_dao.check_art_cus_relationship(art_mod.art_id, art_cus_mod.cus_id):
                    self.__cus_dao.insert_cus_behavior(
                        art_cus_mod.cus_id, art_cus_mod.cus_id, 1, art_mod.art_id, 1, art_mod.art_id,
                        cbr_time=art_mod.art_time
                    )
                    self.__cus_dao.insert_cus_behavior(
                        art_cus_mod.cus_id, art_cus_mod.cus_id, 2, art_mod.art_id, 1, art_mod.art_id
                    )
                    self.__cus_dao.update_cus_feature(category, art_cus_mod.cus_id)
                    self.__art_dao.update_art_feature(1, art_mod.art_id, art_mod.art_time)
                logging.info("%s-%d rt-cus 行为 1 数据库操作 成功" % (category, art_i))
            except Exception:
                print("%s-%d rt-cus 行为 1 数据库操作 失败" % (category, art_i))
                logging.exception("%s-%d rt-cus 行为 1 数据库操作 失败" % (category, art_i))
                continue

            # ---- comments of this article ----
            try:
                coms_json = self.__com_pro.get_coms_json(art_brief_json)
                if coms_json is None:
                    continue
                logging.info("%s-%d coms_json 获取 成功" % (category, art_i))
            except Exception:
                print("\t%s-%d coms_json 获取 失败" % (category, art_i))
                logging.exception("%s-%d coms_json 获取 失败" % (category, art_i))
                continue

            for com_i, com_json in enumerate(coms_json):
                print("\t当前评论: %d/%d" % (com_i, len(coms_json)))
                logging.info("当前评论: %d/%d" % (com_i, len(coms_json)))

                # ---- commenting user ----
                com_cus_mod = CusMod.CustomerModel()
                try:
                    self.__cus_pro.set_com_cus(com_json, com_cus_mod)
                    self.__cus_dao.insert_then_get_cus(com_cus_mod)
                    self.__cus_dao.update_cus_feature(category, com_cus_mod.cus_id, flag=True)
                    # Success path: this used to log the failure wording.
                    logging.info("%s-%d-%d com_cus 处理 成功" % (category, art_i, com_i))
                except Exception:
                    print("\t%s-%d-%d com_cus 处理 错误" % (category, art_i, com_i))
                    logging.exception("%s-%d-%d com_cus 处理 错误" % (category, art_i, com_i))
                    continue

                # ---- comment ----
                com_mod = ComMod.CommentModel()
                try:
                    self.__com_pro.set_com(com_json, art_mod.art_id, com_cus_mod.cus_id, com_mod)
                    if not self.__com_dao.is_com_exist(com_mod.com_spider):
                        # The comment is not stored yet.
                        self.__com_dao.insert_com(com_mod)
                    else:
                        print("com 已存在")
                        continue
                    com_mod.com_id = self.__com_dao.search_com_id_by_spider(com_mod.com_spider)
                    # Success path: this used to log the failure wording.
                    logging.info("%s-%d-%d com 处理 成功" % (category, art_i, com_i))
                except Exception:
                    print("\t%s-%d-%d com 处理 失败" % (category, art_i, com_i))
                    logging.exception("%s-%d-%d com 处理 失败" % (category, art_i, com_i))
                    continue

                # ---- comment / user behaviour ----
                try:
                    if self.__com_dao.check_com_cus_relationship(art_mod.art_id, com_mod.com_id,
                                                                 com_cus_mod.cus_id):
                        self.__cus_dao.insert_cus_behavior(
                            com_cus_mod.cus_id, art_cus_mod.cus_id, 5, art_mod.art_id, 2, com_mod.com_id,
                            cbr_time=com_mod.com_time
                        )
                        self.__cus_dao.insert_cus_behavior(
                            com_cus_mod.cus_id, art_cus_mod.cus_id, 2, art_mod.art_id, 1, art_mod.art_id
                        )
                        self.__cus_dao.update_cus_feature(category, com_cus_mod.cus_id)
                        self.__art_dao.update_art_feature(4, art_mod.art_id, art_mod.art_time)
                    logging.info("%s-%d-%d art-cus 行为 4 数据库操作 成功" % (category, art_i, com_i))
                except Exception:
                    print("\t%s-%d-%d art-cus 行为 4 数据库操作 失败" % (category, art_i, com_i))
                    logging.exception("%s-%d-%d art-cus 行为 4 数据库操作 失败" % (category, art_i, com_i))
                    continue

                # ---- simulated browsing by the commenting user ----
                try:
                    result_list = None
                    rand_category_num = random.randint(1, 2)
                    rand_cates = random.sample(categories, rand_category_num)
                    for rand_cate in rand_cates:
                        result_list = self.__art_dao.get_same_category_art(art_mod.art_id, rand_cate)
                        if result_list is not None:
                            for back_art in result_list:
                                try:
                                    self.__cus_dao.insert_cus_behavior(
                                        com_cus_mod.cus_id, back_art[1], 2, back_art[0], 1, back_art[0]
                                    )
                                    self.__cus_dao.update_cus_feature(rand_cate, com_cus_mod.cus_id,
                                                                      update_num=1)
                                    self.__art_dao.update_art_feature(6, back_art[0], art_mod.art_time)
                                except Exception:
                                    # Best-effort: one failed simulated view is skipped.
                                    continue
                    # The count reflects the last sampled category; a None
                    # result counts as 0 instead of raising in len().
                    browsed = len(result_list) if result_list is not None else 0
                    print("\t%d 用户模拟浏览操作 数量 %d 完成" % (com_cus_mod.cus_id, browsed))
                    logging.info("%d 模拟浏览操作 数量 %d 完成" % (com_cus_mod.cus_id, browsed))
                except Exception:
                    print("\t%d 用户模拟浏览操作 失败" % com_cus_mod.cus_id)
                    logging.exception("%d 用户模拟浏览操作 失败" % com_cus_mod.cus_id)

                # ---- replies to this comment ----
                try:
                    reps_json = self.__rep_pro.get_reps_json(com_json)
                    if reps_json is None:
                        continue
                    logging.info("%s-%d-%d reps_json 获取 成功" % (category, art_i, com_i))
                except Exception:
                    print("\t\t%s-%d-%d reps_json 获取 失败" % (category, art_i, com_i))
                    logging.exception("%s-%d-%d reps_json 获取 失败" % (category, art_i, com_i))
                    continue

                for rep_i, rep_json in enumerate(reps_json):
                    # ---- replying user ----
                    rep_cus_mod = CusMod.CustomerModel()
                    try:
                        self.__cus_pro.set_rep_cus(rep_json, rep_cus_mod)
                        self.__cus_dao.insert_then_get_cus(rep_cus_mod)
                        self.__cus_dao.update_cus_feature(category, rep_cus_mod.cus_id, flag=True)
                        logging.info("%s-%d-%d-%d rep_cus 处理 成功" % (category, art_i, com_i, rep_i))
                    except Exception:
                        print("\t\t%s-%d-%d-%d rep_cus 处理 失败" % (category, art_i, com_i, rep_i))
                        logging.exception("%s-%d-%d-%d rep_cus 处理 失败" % (category, art_i, com_i, rep_i))
                        continue

                    # ---- reply ----
                    rep_mod = RepMod.ReplyModel()
                    try:
                        self.__rep_pro.set_rep(rep_json, art_mod.art_id, com_mod.com_id,
                                               rep_cus_mod.cus_id, rep_mod)
                        if not self.__rep_dao.is_rep_exist(rep_mod.rep_spider):
                            self.__rep_dao.search_rep_rep_by_spyder(rep_json, rep_mod)
                            self.__rep_dao.insert_rep(rep_mod)
                        else:
                            print("rep 已存在")
                            continue
                        rep_mod.rep_id = self.__rep_dao.search_rep_id_by_spider(rep_mod.rep_spider)
                        logging.info("%s-%d-%d-%d rep 处理 成功" % (category, art_i, com_i, rep_i))
                    except Exception:
                        print("\t\t%s-%d-%d-%d rep 处理 失败" % (category, art_i, com_i, rep_i))
                        logging.exception("%s-%d-%d-%d rep 处理 失败" % (category, art_i, com_i, rep_i))
                        continue

                    # ---- reply / user behaviour ----
                    try:
                        if self.__rep_dao.check_rep_cus_relationship(art_mod.art_id, rep_mod.rep_id,
                                                                     rep_cus_mod.cus_id):
                            self.__cus_dao.insert_cus_behavior(
                                rep_cus_mod.cus_id, art_cus_mod.cus_id, 8, art_mod.art_id, 3, rep_mod.rep_id,
                                cbr_time=rep_mod.rep_time
                            )
                            self.__cus_dao.insert_cus_behavior(
                                rep_cus_mod.cus_id, art_cus_mod.cus_id, 2, art_mod.art_id, 1, art_mod.art_id
                            )
                            self.__cus_dao.update_cus_feature(category, rep_cus_mod.cus_id)
                            self.__art_dao.update_art_feature(5, art_mod.art_id, art_mod.art_time)
                        logging.info("%s-%d-%d-%d art-cus 行为 5 数据库操作 成功"
                                     % (category, art_i, com_i, rep_i))
                    except Exception:
                        print("\t\t%s-%d-%d-%d art-cus 行为 5 数据库操作 失败"
                              % (category, art_i, com_i, rep_i))
                        logging.exception("%s-%d-%d-%d art-cus 行为 5 数据库操作 失败"
                                          % (category, art_i, com_i, rep_i))
                        continue

                    # ---- simulated browsing by the replying user ----
                    try:
                        result_list = None
                        rand_category_num = random.randint(1, 2)
                        rand_cates = random.sample(categories, rand_category_num)
                        for rand_cate in rand_cates:
                            result_list = self.__art_dao.get_same_category_art(art_mod.art_id, rand_cate)
                            if result_list is not None:
                                for back_art in result_list:
                                    try:
                                        self.__cus_dao.insert_cus_behavior(
                                            rep_cus_mod.cus_id, back_art[1], 2, back_art[0], 1, back_art[0]
                                        )
                                        self.__cus_dao.update_cus_feature(rand_cate, rep_cus_mod.cus_id,
                                                                          update_num=1)
                                        self.__art_dao.update_art_feature(6, back_art[0], art_mod.art_time)
                                    except Exception:
                                        # Best-effort: one failed simulated view is skipped.
                                        continue
                        # A None result counts as 0 instead of raising in len().
                        browsed = len(result_list) if result_list is not None else 0
                        print("\t\t%d 用户模拟浏览操作 数量 %d 完成" % (rep_cus_mod.cus_id, browsed))
                        logging.info("%d 用户模拟浏览操作 数量 %d 完成" % (rep_cus_mod.cus_id, browsed))
                    except Exception:
                        print("\t\t%d 用户模拟浏览操作 失败" % rep_cus_mod.cus_id)
                        logging.exception("%d 用户模拟浏览操作 失败" % rep_cus_mod.cus_id)
def major(self):
    """Crawl every news category and persist articles, comments and replies.

    For each category the article briefs are fetched; for each brief the
    author, the article and the author's behaviour record are stored. The
    article's comments are then fetched and each comment's user, comment and
    behaviour record are stored; replies to each comment are handled the
    same way.

    Every stage is best-effort: a failure is printed and processing moves on
    to the next item at that level. Only ``Exception`` subclasses are
    absorbed, so ``KeyboardInterrupt`` and ``SystemExit`` still stop the
    crawl.
    """
    categories = [
        'news_society', 'news_entertainment', 'news_tech', 'news_military',
        'news_sports', 'news_car', 'news_finance', 'news_world',
        'news_fashion', 'news_travel', 'news_discovery', 'news_baby',
        'news_regimen', 'news_story', 'news_essay', 'news_game',
        'news_history', 'news_food'
    ]
    for category in categories:
        print("\n当前类别: %s" % category)

        # ---- articles of this category ----
        try:
            arts_brief_json = self.__art_pro.get_arts_brief_json_by_category(
                category)
            # len() is kept inside the try on purpose: a result without a
            # length (e.g. None) is reported as a fetch failure.
            if len(arts_brief_json) != 0:
                print("新闻总长度: %d" % len(arts_brief_json))
        except Exception:
            print('arts_brief_json 获取 失败')
            continue

        for art_brief_json in arts_brief_json:
            # ---- article author ----
            art_cus_mod = CusMod.CustomerModel()
            try:
                self.__cus_pro.set_art_cus(art_brief_json, art_cus_mod)
                self.__cus_dao.group_check_insert_cus_then_search_id(
                    art_cus_mod)
            except Exception:
                print("art_cus 处理 失败")
                continue

            # ---- article ----
            art_mod = ArtMod.ArticleModel()
            try:
                self.__art_pro.set_art(art_brief_json, category,
                                       art_cus_mod.cus_id, art_mod)
                if not self.__art_dao.is_art_exist(art_mod.art_spider):
                    # The article is not stored yet.
                    self.__art_dao.insert_art(art_mod)
                else:
                    print("art 已存在")
                    continue
                art_mod.art_id = self.__art_dao.search_art_id_by_spider(
                    art_mod.art_spider)
            except Exception:
                print("art 操作 失败")
                continue

            # ---- article / author behaviour ----
            try:
                if self.__art_dao.check_art_cus_relationship(
                        art_mod.art_id, art_cus_mod.cus_id):
                    self.__cus_dao.insert_cus_behavior(
                        1, art_mod.art_id, art_cus_mod.cus_id,
                        art_mod.art_time)
            except Exception:
                print("art-cus 行为 1 数据库操作 失败")
                continue

            # ---- comments of this article ----
            try:
                coms_json = self.__com_pro.get_coms_json(art_brief_json)
                if len(coms_json) != 0:
                    # Label corrected: this counts comments, not replies.
                    print("评论总长 %d" % len(coms_json))
            except Exception:
                print("coms_json 获取 失败")
                continue

            for com_json in coms_json:
                # ---- commenting user ----
                com_cus_mod = CusMod.CustomerModel()
                try:
                    self.__cus_pro.set_com_cus(com_json, com_cus_mod)
                    self.__cus_dao.group_check_insert_cus_then_search_id(
                        com_cus_mod)
                except Exception:
                    print("com_cus 处理 错误")
                    continue

                # ---- comment ----
                com_mod = ComMod.CommentModel()
                try:
                    self.__com_pro.set_com(com_json, art_mod.art_id,
                                           com_cus_mod.cus_id, com_mod)
                    if not self.__com_dao.is_com_exist(com_mod.com_spider):
                        # The comment is not stored yet.
                        self.__com_dao.insert_com(com_mod)
                    else:
                        print("com 已存在")
                        continue
                    com_mod.com_id = self.__com_dao.search_com_id_by_spider(
                        com_mod.com_spider)
                    self.__art_dao.update_art_com_number(art_mod.art_id)
                except Exception:
                    print("com 处理 失败")
                    continue

                # ---- comment / user behaviour ----
                try:
                    if self.__com_dao.check_com_cus_relationship(
                            art_mod.art_id, com_mod.com_id,
                            com_cus_mod.cus_id):
                        self.__cus_dao.insert_cus_behavior(
                            4, art_mod.art_id, com_cus_mod.cus_id,
                            com_mod.com_time)
                except Exception:
                    print("art-cus 行为 4 数据库操作 失败")
                    continue

                # ---- replies to this comment ----
                try:
                    reps_json = self.__rep_pro.get_reps_json(com_json)
                    if len(reps_json) != 0:
                        print("回复总长 %d" % len(reps_json))
                except Exception:
                    print("reps_json 获取 失败")
                    continue

                for rep_json in reps_json:
                    # ---- replying user ----
                    rep_cus_mod = CusMod.CustomerModel()
                    try:
                        self.__cus_pro.set_rep_cus(rep_json, rep_cus_mod)
                        self.__cus_dao.group_check_insert_cus_then_search_id(
                            rep_cus_mod)
                    except Exception:
                        print("rep_cus 处理 失败")
                        continue

                    # ---- reply ----
                    rep_mod = RepMod.ReplyModel()
                    try:
                        self.__rep_pro.set_rep(rep_json, art_mod.art_id,
                                               com_mod.com_id,
                                               rep_cus_mod.cus_id, rep_mod)
                        if not self.__rep_dao.is_rep_exist(
                                rep_mod.rep_spider):
                            self.__rep_dao.search_rep_rep_by_spyder(
                                rep_json, rep_mod)
                            self.__rep_dao.insert_rep(rep_mod)
                        else:
                            print("rep 已存在")
                            continue
                        rep_mod.rep_id = self.__rep_dao.search_rep_id_by_spider(
                            rep_mod.rep_spider)
                    except Exception:
                        print("rep 处理 失败")
                        continue

                    # ---- reply / user behaviour ----
                    try:
                        if self.__rep_dao.check_rep_cus_relationship(
                                art_mod.art_id, rep_mod.rep_id,
                                rep_cus_mod.cus_id):
                            self.__cus_dao.insert_cus_behavior(
                                5, art_mod.art_id, rep_cus_mod.cus_id,
                                rep_mod.rep_time)
                    except Exception:
                        print("art-cus 行为 5 数据库操作 失败")
                        continue