def save_newslist_to_db():
    """Crawl Baidu news for each university and store it in ``NewsPOA.newslist``.

    A university is skipped when ``newslist`` already holds at least one
    document whose ``Uname`` equals its ``zh_name``. For the others,
    ``request_baidu_news`` is called with pages ``1`` to ``MAX_PAGE_NUMBERS``
    and the returned documents are inserted as one batch.

    The client connection is closed when the function exits, including when
    a crawl or insert raises.
    """
    # Fetch the university list and the database configuration.
    university_list = get_university_list()
    db_config = get_database_dict_info()
    # Open the database connection.
    conn = MongoClient(db_config["host"], db_config["port"])
    try:
        newslist = conn.NewsPOA["newslist"]
        for uni in university_list:
            # Already stored for this university: nothing to do.
            if newslist.find({"Uname": uni["zh_name"]}).count() != 0:
                continue
            news_documents_list = request_baidu_news(
                uni["zh_name"], 1, MAX_PAGE_NUMBERS, uni["en_name"])
            if not news_documents_list:
                # PyMongo rejects an empty batch insert; with nothing stored
                # the university is simply retried on the next run.
                continue
            newslist.insert(news_documents_list)
            print(uni["zh_name"], "的新闻列表保存成功")
    finally:
        conn.close()
    print("新闻全部爬取完毕")
def db_to_dataset_folder():
    """Export matching news from ``NewsPOA.newslist`` as segmented text files.

    Selects every document whose ``body`` matches the regex ``.*学术论坛.*``,
    strips a fixed set of punctuation characters from its ``title`` and
    ``body``, segments both with ``jieba.cut`` and writes the tokens
    (space-terminated, newline tokens dropped) to
    ``../text_classification/dataset/news_category_dataset/<title>.txt``.

    Files are written as UTF-8 so the output does not depend on the
    platform's default encoding. The client connection is closed on exit.
    """
    db_config = get_database_dict_info()
    conn = MongoClient(db_config["host"], db_config["port"])
    # Characters removed from both title and body (the cleaned title is also
    # used as the file name). Backslashes are escaped explicitly; the
    # resulting character set is unchanged.
    to_filt = "/,.<》;:‘\"[]\\{\\}-_=+!~`@#$%^&*()"
    strip_table = str.maketrans("", "", to_filt)
    try:
        dataset = conn.NewsPOA["newslist"].find(
            {
                "$or": [
                    {"body": {"$regex": ".*学术论坛.*"}}
                ],
            }
        )
        for data in dataset:
            # One translate pass replaces the per-character replace loop.
            data_title = data["title"].translate(strip_table)
            data_body = data["body"].translate(strip_table)
            out_path = (
                "../text_classification/dataset/news_category_dataset/"
                + data_title + ".txt"
            )
            with open(out_path, "w", encoding="utf-8") as f:
                data_to_save = []
                data_to_save.extend(jieba.cut(data_title))
                data_to_save.extend(jieba.cut(data_body))
                # Each kept token is followed by a single space.
                f.write("".join(
                    item + " " for item in data_to_save if item != "\n"))
    finally:
        conn.close()
def save_result_poa_list_to_db():
    """Rebuild ``NewsPOA.news`` from the per-university JSON result files.

    Drops the ``news`` collection, then for every university loads
    ``../news_result/<zh_name>.json``, runs
    ``predict_poa_result_from_documnet_dict`` on each entry and inserts the
    results as one batch. A university with no entries is reported with a
    count of 0 and nothing is inserted for it.

    The client connection is closed on exit, including on error.
    """
    # Fetch the university list and the database configuration.
    university_list = get_university_list()
    db_config = get_database_dict_info()
    # Open the database connection.
    conn = MongoClient(db_config["host"], db_config["port"])
    try:
        news = conn.NewsPOA["news"]
        news.drop()
        for uni in university_list:
            print("开始", uni["zh_name"])
            json_path = "../news_result/" + uni["zh_name"] + ".json"
            current_uni_news_list = load_json_file(json_path)
            result_uni_news_list = [
                predict_poa_result_from_documnet_dict(document)
                for document in current_uni_news_list
            ]
            # PyMongo rejects an empty batch insert.
            if result_uni_news_list:
                news.insert(result_uni_news_list)
            print(uni["zh_name"], "的新闻分析完毕,共有",
                  str(len(result_uni_news_list)), "条")
    finally:
        conn.close()
def get_connected_database():
    """Return a handle to the configured database.

    Connects with the ``host``/``port`` from ``get_database_dict_info()`` and
    selects the database named by its ``database`` entry. When the host is
    neither ``127.0.0.1`` nor ``localhost`` the handle is authenticated with
    the configured ``user`` and ``password`` before being returned.
    """
    settings = get_database_dict_info()
    client = MongoClient(settings['host'], settings['port'])
    db = client[settings['database']]
    # Local hosts are used without authentication.
    if settings['host'] in ('127.0.0.1', 'localhost'):
        return db
    db.authenticate(settings['user'], settings['password'])
    return db
def insert_university_list():
    """Insert the university list into ``NewsPOA.universitylist``.

    Reads the list from ``get_university_list()``, connects with the
    host/port from ``get_database_dict_info()``, inserts the whole list in
    one call and prints a confirmation message.
    """
    # Load the data to store and the connection settings.
    universities = get_university_list()
    settings = get_database_dict_info()
    # Connect and write the list into the target collection.
    client = MongoClient(settings["host"], settings["port"])
    target = client.NewsPOA["universitylist"]
    target.insert(universities)
    print("学校列表表创建成功")
def add_negative_news_from_old_db():
    """Copy negative news missing from ``NewsPOA.newslist`` out of ``ResultPOA.news``.

    Loads every document with ``sentiment == "-1"`` from ``ResultPOA.news``
    and from ``NewsPOA.newslist``. Each old document whose ``url`` is not
    reported by ``judge_url_in_list`` as present in the new list gets
    ``media = "unkown"`` and ``ranking = "300"`` and is inserted into
    ``NewsPOA.newslist``. Afterwards ``create_news_numbers_info()`` is run.

    Both client connections are closed on exit, including on error.
    """
    # NOTE(review): both clients point at the same hard-coded server instead
    # of the configured host/port - confirm this is intended.
    new_conn = MongoClient("121.42.236.250", 27034)
    old_conn = MongoClient("121.42.236.250", 27034)
    try:
        old_neg_news_list = list(
            old_conn.ResultPOA["news"].find({"sentiment": "-1"}))
        new_neg_news_list = list(
            new_conn.NewsPOA["newslist"].find({"sentiment": "-1"}))

        add_neg_list = []
        for current_news in old_neg_news_list:
            # Explicit comparison kept: the helper's return type is not
            # visible here, so `not ...` might not be equivalent.
            if judge_url_in_list(new_neg_news_list, current_news["url"]) == False:
                # Old records carry no media/ranking, so placeholders are set.
                current_news["media"] = "unkown"
                current_news["ranking"] = "300"
                add_neg_list.append(current_news)

        # PyMongo rejects an empty batch insert.
        if add_neg_list:
            new_conn.NewsPOA["newslist"].insert(add_neg_list)
        create_news_numbers_info()
    finally:
        new_conn.close()
        old_conn.close()
def compute_score():
    """Recompute the per-media influence scores in ``NewsPOA.influence``.

    Drops the ``influence`` collection, then for every university sums, per
    ``media`` value, the weight ``1 / (ranking / 100 + 1)`` over all of its
    documents in ``NewsPOA.newslist``. One
    ``{"Uname", "media", "score"}`` document per media is inserted. A
    university without news contributes no documents.

    The client connection is closed on exit, including on error.
    """
    # Fetch the university list.
    university_list = get_university_list()
    # Open the database connection.
    # NOTE(review): the server address is hard-coded rather than read from
    # the database configuration - confirm this is intended.
    conn = MongoClient("121.42.236.250", 27034)
    try:
        NewsPOA = conn.NewsPOA
        NewsPOA["influence"].drop()
        for uni in university_list:
            uni_name = uni['zh_name']
            uni_news_list = NewsPOA["newslist"].find({"Uname": uni_name})
            print("开始计算 ", uni_name, "的数据...")
            score = {}
            for news in uni_news_list:
                media = news["media"]
                weight = 1 / (float(news["ranking"]) / 100 + 1)
                score[media] = score.get(media, 0.0) + weight
            score_list = [
                {
                    "Uname": uni_name,
                    "media": key,
                    "score": value
                }
                for key, value in score.items()
            ]
            # PyMongo rejects an empty batch insert.
            if score_list:
                NewsPOA["influence"].insert(score_list)
            print(uni_name, "的数据保存完毕")
    finally:
        conn.close()
def get_newslist():
    """Return the ``newslist`` collection of the ``NewsPOA`` database.

    The connection uses the host/port from ``get_database_dict_info()``.
    """
    settings = get_database_dict_info()
    client = MongoClient(settings["host"], settings["port"])
    return client.NewsPOA["newslist"]
def create_news_numbers_info():
    """Rebuild ``NewsPOA.newsNumber`` with per-university news counts.

    For every university and each classification (``study``, ``activity``,
    ``entrance``, ``social``) counts the documents in ``NewsPOA.news`` with
    sentiment ``"-1"``, ``"0"`` and ``"1"``. Each ``<classification>Number``
    field holds ``[negative, neutral, positive, total]``. The ``newsNumber``
    collection is dropped and refilled with one document per university.

    The client connection is closed on exit, including on error.
    """
    # Fetch the university list.
    university_list = get_university_list()
    # Open the database connection.
    # NOTE(review): the server address is hard-coded rather than read from
    # the database configuration - confirm this is intended.
    conn = MongoClient("121.42.236.250", 27034)
    classifications = ("study", "activity", "entrance", "social")
    sentiments = ("-1", "0", "1")
    try:
        NewsPOA = conn.NewsPOA
        news = NewsPOA["news"]
        news_number_list = []
        for uni in university_list:
            record = {
                "Uname": uni["zh_name"],
                "abbr": uni["en_name"],
            }
            for classification in classifications:
                counts = [
                    news.find({
                        "Uname": uni["zh_name"],
                        "classification": classification,
                        "sentiment": sentiment
                    }).count()
                    for sentiment in sentiments
                ]
                # The fourth entry is the total over the three sentiments.
                counts.append(sum(counts))
                record[classification + "Number"] = counts
            news_number_list.append(record)

        NewsPOA["newsNumber"].drop()
        # PyMongo rejects an empty batch insert.
        if news_number_list:
            NewsPOA["newsNumber"].insert(news_number_list)
        print("新闻数量表保存成功")
    finally:
        conn.close()
def reset_sentiment_category(): db_config = get_database_dict_info() conn = MongoClient(db_config["host"], db_config["port"]) NewsPOA = conn.NewsPOA