def test(request):
    """Render one page of news from a hard-coded news class.

    The page number is read from the ``p`` query parameter.  Every news
    document matching the hard-coded ``class_name`` filter is loaded, and the
    slice selected by ``Pagination`` is rendered with ``test.html``.

    Args:
        request: the incoming Django request.

    Returns:
        The rendered ``test.html`` response, or the result of
        ``index(request)`` if loading or paginating the news fails.
    """
    print("class_1 ......")
    try:
        current_page = request.GET.get('p')
        rec_db = MongoOperator('localhost', 27017, 'AgriRecSys', 'news')
        ans_list = [
            {
                "news": news,
                "news_id": str(news["_id"]),
            }
            for news in rec_db.find({"class_name": "病虫害"})
        ]
        page_obj = Pagination(len(ans_list), current_page)
        data_list = ans_list[page_obj.start():page_obj.end()]
    except Exception:
        # Best-effort view: any data/pagination failure falls back to the
        # index page.  ``Exception`` (not a bare except) so that SystemExit
        # and KeyboardInterrupt still propagate.
        return index(request)
    return render(request,
                  ROOT_URL + "/recommend_templates/templates/test.html", {
                      'data': data_list,
                      'page_obj': page_obj
                  })
def tfidf2Txt():
    """Compute TF-IDF nearest neighbours for all news and write them to CSV.

    Loads every document from the ``news`` collection, vectorises the
    ``jieba_cut_content`` field into a TF-IDF matrix, and queries the 10
    nearest neighbours of each document.  The result is written to
    ``./recSys/data/key_bucket.csv`` with the columns ``_id``, ``title``,
    ``class_name`` and ``k_nbrs``, where ``k_nbrs`` is a space-separated list
    of row indices into the fitted matrix.
    """
    rec_db = MongoOperator('localhost', 27017, 'AgriRecSys', 'news')
    documents = list(rec_db.find())

    key_bucket = [[doc["_id"], doc["title"], doc["class_name"]]
                  for doc in documents]
    content = [doc["jieba_cut_content"] for doc in documents]
    key_bucket_df = pd.DataFrame(key_bucket,
                                 columns=["_id", "title", "class_name"])

    # Term-count matrix: element [i][j] is the count of word j in document i.
    vectorizer = CountVectorizer()
    # Re-weights the counts into tf-idf values.
    transformer = TfidfTransformer()
    train_set_vector = vectorizer.fit_transform(content)
    tfidf = transformer.fit_transform(train_set_vector)
    # The vocabulary list (formerly fetched via vectorizer.get_feature_names())
    # was never used; the call is dropped because that method no longer
    # exists in recent scikit-learn releases.

    # Dense copy of the tf-idf matrix: weight[i][j] is the tf-idf weight of
    # word j in document i.
    weight = tfidf.toarray()

    nbrs = NearestNeighbors(n_neighbors=10, algorithm="ball_tree").fit(weight)
    # kneighbors returns (distances, indices); only the indices are stored.
    _, indices = nbrs.kneighbors(weight)

    k_nrbs_list = [" ".join("%d" % x for x in row) for row in indices]
    key_bucket_df["k_nbrs"] = pd.Series(k_nrbs_list)
    key_bucket_df.to_csv("./recSys/data/key_bucket.csv",
                         index=False,
                         index_label=False)
def count_click_times(request):
    """Record that a user has opened a news item.

    ``news_id`` and ``user_name`` are read from the POST data when the request
    carries any, otherwise from the query string.  The id is then added to the
    ``looked_list`` field of the matching document in the ``user`` collection
    (duplicates are removed by going through a set, so the stored order is
    not preserved).

    Args:
        request: the incoming Django request.

    Returns:
        ``None`` on success; the result of ``index(request)`` if the lookup
        or the update fails (for example when no such user exists).
    """
    # NOTE(review): nothing is returned on the success path; if this function
    # is routed directly as a Django view it needs to return a response —
    # confirm how it is called.
    params = request.POST if request.POST else request.GET
    news_id = params.get('news_id')
    user_name = params.get('user_name')
    try:
        rec_db = MongoOperator('localhost', 27017, 'AgriRecSys', 'user')
        user_doc = rec_db.find({"user_name": user_name})[0]
        previous = user_doc.get("looked_list")
        if previous is None:
            looked_list = {news_id}
        else:
            looked_list = set(previous) | {news_id}
        rec_db.update(
            {'user_name': user_name},
            {'$set': {
                "looked_list": list(looked_list)
            }},
        )
    except Exception:
        # Best-effort: fall back to the index page on any data error, but let
        # SystemExit / KeyboardInterrupt propagate.
        return index(request)
def userRegist(user_name, user_passwd):
    """Insert a new user document into the ``users`` collection.

    The stored document holds ``user_name``, ``user_passwd`` (exactly as
    given) and an empty ``user_read_id`` string.

    Args:
        user_name: name to register.
        user_passwd: password to store with the name.
    """
    # NOTE(review): this writes to the 'users' collection, while the view
    # functions in this file query a collection named 'user' — confirm which
    # one is intended.
    users_store = MongoOperator('localhost', 27017, 'AgriRecSys', 'users')
    users_store.insert(
        {
            "user_name": user_name,
            "user_passwd": user_passwd,
            "user_read_id": "",
        },
        "users",
    )
def userLogin(user_name, user_passwd):
    """Check a user name / password pair against the ``users`` collection.

    Args:
        user_name: name to look up.
        user_passwd: password that must match the stored ``user_passwd``.

    Returns:
        ``"success"`` when at least one stored document matches both fields,
        ``None`` otherwise.
    """
    rec_db = MongoOperator('localhost', 27017, 'AgriRecSys', 'users')
    matches = rec_db.find({
        "user_name": user_name,
        "user_passwd": user_passwd,
    })
    # The previous check (`matches != None`) only tested that a query result
    # object was returned, not that it contained a document, and its failure
    # branch subscripted None.  Other callers in this file index and count
    # the result of find(), so it is treated here as an iterable of documents
    # — TODO confirm against MongoOperator.find.
    if matches is None:
        return None
    if next(iter(matches), None) is None:
        return None
    return "success"
def updateDB():
    """Run the news parsers and insert everything they return into ``news``.

    All enabled parsers are called first; afterwards every value of every
    non-empty result mapping is inserted into the ``news`` collection.
    """
    # Uses the AgriRecSys database; the default collection is "news".
    news_store = MongoOperator('localhost', 27017, 'AgriRecSys', 'news')
    parsed_batches = [
        # paserClass1() is currently disabled.
        paserClass2(),
        paserClass3(),
        paserClass4(),
        paserClass5(),
    ]
    for batch in parsed_batches:
        if len(batch) != 0:
            for record in batch.values():
                # Insert into the "news" collection.
                news_store.insert(record, "news")
def history(request):
    """Render the news items the logged-in user has already opened.

    The user name is taken from the session; the ids in that user's
    ``looked_list`` are resolved against the ``news`` collection and ids with
    no matching document are skipped.

    Args:
        request: the incoming Django request.

    Returns:
        The rendered ``history.html`` response, or the result of
        ``index(request)`` if anything fails while loading the data (this
        includes a user without a ``looked_list``).
    """
    data = {}
    user_name = request.session.get('user_name')
    try:
        user_db = MongoOperator('localhost', 27017, 'AgriRecSys', 'user')
        user_doc = user_db.find({"user_name": user_name})[0]
        new_id_list = user_doc.get("looked_list")
        # One handle for the news collection, reused for every lookup instead
        # of constructing a new MongoOperator per id.
        news_db = MongoOperator('localhost', 27017, 'AgriRecSys', 'news')
        ans_list = []
        for i, news_id in enumerate(new_id_list):
            matches = news_db.find({'_id': ObjectId(news_id)})
            if matches.count() == 0:
                continue
            news = matches[0]
            ans_list.append({
                "news": news,
                "news_id": str(news["_id"]),
                "href": "#href_id%d" % (i),
                "content_id": "href_id%d" % (i),
                "click_id": "ajax_id_%d" % (i),
                "ajax_id": "#ajax_id_%d" % (i),
            })
        data["user_name"] = user_name
        data["news_list"] = ans_list
    except Exception:
        # Best-effort view: fall back to the index page on any data error,
        # but let SystemExit / KeyboardInterrupt propagate.
        return index(request)
    # Make sure the template path is correct.
    return render(request,
                  ROOT_URL + "/recommend_templates/templates/history.html",
                  data)
def myRecommend(request, data=None):
    """Render news recommended from the logged-in user's reading history.

    The user's ``looked_list`` is passed to ``get_K_nearst_love`` (with
    ``8`` as its first argument) and the returned ids are resolved against
    the ``news`` collection; ids with no matching document are skipped.

    Args:
        request: the incoming Django request.
        data: optional template context to extend.  When given, it is updated
            in place with ``user_name`` and ``news_list``; when omitted, a
            fresh dict is created for this call.

    Returns:
        The rendered ``myRecommend.html`` response, or the result of
        ``index(request)`` if anything fails while building the list.
    """
    # A fresh dict per call: a shared ``{}`` default would be mutated below
    # and carry one request's data into the next.
    if data is None:
        data = {}
    user_name = request.session.get('user_name')
    try:
        user_db = MongoOperator('localhost', 27017, 'AgriRecSys', 'user')
        user_doc = user_db.find({"user_name": user_name})[0]
        # Cold start: this is None for a user with no history and is passed
        # on unchanged.
        new_id_list = user_doc.get("looked_list")
        rec_new_id_list = get_K_nearst_love(8, new_id_list)
        # One handle for the news collection, reused for every lookup instead
        # of constructing a new MongoOperator per id.
        news_db = MongoOperator('localhost', 27017, 'AgriRecSys', 'news')
        ans_list = []
        for i, news_id in enumerate(rec_new_id_list):
            matches = news_db.find({"_id": ObjectId(news_id)})
            if matches.count() == 0:
                continue
            news = matches[0]
            ans_list.append({
                "news": news,
                "news_id": str(news["_id"]),
                "href": "#href_id%d" % (i),
                "content_id": "href_id%d" % (i),
                "click_id": "ajax_id_%d" % (i),
                "ajax_id": "#ajax_id_%d" % (i),
            })
        data["user_name"] = user_name
        data["news_list"] = ans_list
    except Exception:
        # Best-effort view: fall back to the index page on any data error,
        # but let SystemExit / KeyboardInterrupt propagate.
        return index(request)
    # Make sure the template path is correct.
    return render(request,
                  ROOT_URL + "/recommend_templates/templates/myRecommend.html",
                  data)
def class_4(request, data=None):
    """Render every news item of one hard-coded news class.

    Args:
        request: the incoming Django request.
        data: accepted for signature compatibility only; the template context
            is always rebuilt from scratch and any value passed in is ignored.

    Returns:
        The rendered ``class_4.html`` response, or the result of
        ``index(request)`` if loading the news fails.
    """
    try:
        user_name = request.session.get('user_name')
        rec_db = MongoOperator('localhost', 27017, 'AgriRecSys', 'news')
        ans_list = [
            {
                "news": news,
                "news_id": str(news["_id"]),
                "href": "#href_id%d" % (i),
                "content_id": "href_id%d" % (i),
                "click_id": "ajax_id_%d" % (i),
                "ajax_id": "#ajax_id_%d" % (i),
            }
            for i, news in enumerate(rec_db.find({"class_name": "市场价格"}))
        ]
        data = {
            "news_list": ans_list,
            "user_name": user_name,
        }
    except Exception:
        # Best-effort view: fall back to the index page on any data error,
        # but let SystemExit / KeyboardInterrupt propagate.
        return index(request)
    # Make sure the template path is correct.
    return render(request,
                  ROOT_URL + "/recommend_templates/templates/class_4.html",
                  data)
weight = tfidf.toarray() # 将tf-idf矩阵抽取出来,元素weight[i][j]表示j词在i类文本中的tf-idf权重 #print "整个样本集合中(样本个数*词库大小) = ",weight.shape nbrs = NearestNeighbors(n_neighbors=10, algorithm="ball_tree").fit(weight) #返回距离每个点k个最近的点和距离指数,indices可以理解为表示点的下标,distances为距离 distances, indices = nbrs.kneighbors(weight) k_nrbs_list = [] for i in range(len(indices)): k_nrbs_list.append(" ".join(["%d" % x for x in indices[i]])) key_bucket_df["k_nbrs"] = pd.Series(k_nrbs_list) key_bucket_df.to_csv("./recSys/data/key_bucket.csv", index=False, index_label=False) if __name__ == '__main__': rec_db = MongoOperator('localhost', 27017, 'AgriRecSys', 'news') rec_db.remove("news") #清空数据库 # rec_db.remove("user") #清空数据库 # updateDB() # 爬虫模块入口 from recommend_templates.Main.paserManager.util import CorrectIp ci = CorrectIp() ci.getCorrectIp() #代理ip gzb = GZB(ROOT_PATH) # .........耕种帮......... gzb.get_url_from_each_page() # zgny = ZGNYKJ(ROOT_PATH) # .........中国农业科技......... # zgny.main() # tfidf2Txt() # tf-idf和knn算法入口