def copy_Data_from_Raw_to_Result():
    """Copy every document of ``RawPOA.news`` into ``ResultPOA.news``.

    Before being stored, each copied document gets two placeholder fields:
    ``classification`` is set to ``"undefined"`` and ``sentiment`` to
    ``"99"``.  The MongoDB host and port are read from the ``database``
    section of the configuration through ``mod_config``.
    """
    # Read the connection parameters from the configuration file.
    MongoDB_Host = mod_config.getConfig("database", "db_Host")
    MongoDB_Port = mod_config.getConfig("database", "db_Port")

    # Open the database connection.
    conn = MongoClient(MongoDB_Host, int(MongoDB_Port))
    RawPOA = conn.RawPOA
    ResultPOA = conn.ResultPOA

    CopyList = []
    for news_doc in RawPOA["news"].find():
        news_doc["classification"] = "undefined"
        news_doc["sentiment"] = "99"
        CopyList.append(news_doc)
        print(news_doc)

    # pymongo refuses an empty bulk insert, so only write when the source
    # collection actually contained documents.
    if CopyList:
        ResultPOA["news"].insert(CopyList)
    print("复制成功")
def spider():
    """Crawl the Baidu news search results for every configured university.

    For each university returned by ``mod_config.get_University_list()`` the
    search page is downloaded, every result entry is parsed for its link and
    date, the linked article is downloaded and passed to
    ``process_BodyText``, and the resulting document is kept when the
    project-level ``filter`` function returns ``"true"`` for it.

    Returns:
        dict: maps each university's ``zh_name`` to a list of documents with
        the keys ``Uname``, ``abbr``, ``title``, ``url``, ``date`` and
        ``body``.  Universities whose crawl failed map to an empty (or
        partial) list.
    """
    UniversityList = mod_config.get_University_list()

    # Initialise the result structure: one (initially empty) list per school.
    resultList = {}
    for uni in UniversityList:
        resultList[uni["zh_name"]] = []

    # Request URL template.
    base_url = "http://news.baidu.com/ns?word={school}&pn=0&rn=20&cl=2"
    # User agent sent with every request.
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    headers = {'User-Agent': user_agent}

    # Patterns are loop-invariant, so they are defined once up front.
    re_result = r'<h3 class="c-title">(.*?)<span class="c-info">'
    re_href = r'<a href="(.*?)"'
    re_date = r'<p class="c-author">.*? (.*?)</p>'
    flags = re.S | re.M

    # Crawl the recent news of every school.
    for uni in UniversityList:
        url = base_url.format(school=uni["zh_name"])
        print("开始爬取" + uni["zh_name"] + "...")
        time.sleep(2)

        try:
            request = urllib2.Request(url, headers=headers)
            search_html = urllib2.urlopen(request).read()
        except Exception as e:
            # A failing search page must not stop the other universities.
            print(e)
            continue

        for detail in re.findall(re_result, search_html, flags):
            hrefs = re.findall(re_href, detail, flags)
            if not hrefs:
                # A result entry without a link cannot be downloaded; skip
                # it instead of aborting the remaining entries.
                continue
            href = hrefs[0]

            try:
                date = re.findall(re_date, detail, flags)[0]
            except IndexError as e:
                date = "暂无日期"
                print(e)

            time.sleep(1)
            try:
                request = urllib2.Request(href, headers=headers)
                article_html = urllib2.urlopen(request, timeout=5).read()
                parseHTMLResult = process_BodyText(article_html)
                document = {
                    "Uname": uni["zh_name"],
                    "abbr": uni["en_name"],
                    "title": parseHTMLResult["title"],
                    "url": href,
                    "date": date,
                    "body": parseHTMLResult["body"]
                }
                # NOTE: ``filter`` here is called with a single document and
                # compared to the string "true", so it is presumably a
                # project function shadowing the builtin - confirm.
                if filter(document) == "true":
                    resultList[uni["zh_name"]].append(document)
            except Exception as e:
                # Best effort: one broken article is reported and skipped.
                print(e)

    return resultList
def insert_university_list():
    """Write the configured university list to ``ResultPOA.universitylist``.

    The MongoDB host and port come from the ``database`` section of the
    configuration; the list itself comes from
    ``mod_config.get_University_list()``.
    """
    # Configuration values.
    host = mod_config.getConfig("database", "db_Host")
    port = mod_config.getConfig("database", "db_Port")
    universities = mod_config.get_University_list()

    # Connect and store the whole list in one call.
    client = MongoClient(host, int(port))
    result_db = client.ResultPOA
    result_db["universitylist"].insert(universities)

    print("插入成功")
def main():
    """Classify one hard-coded sample headline and print both predictions.

    The headline is segmented with ``seg_ChineseText``, turned into TF-IDF
    features once per model type with ``get_tfidf`` and classified with
    ``get_predictedResult``; the model descriptions come from
    ``mod_config.get_ModelType_Info()`` (keys ``"sentiment"`` and
    ``"classification"``).

    A batch pipeline that classified every ``RawPOA.news`` document per
    university and stored the results in ``ResultPOA.news`` used to be kept
    here as commented-out code.  It was removed together with the MongoDB
    connection it needed, which this demo opened but never used or closed.
    """
    Dict = mod_config.get_ModelType_Info()

    # Named ``sample_text`` rather than ``str`` so the builtin is not shadowed.
    sample_text = "上海海事大学2017年拟录取硕士研究生名单公示"
    text_seg = seg_ChineseText(sample_text)

    tfidf_sentiment = get_tfidf(text_seg, Dict["sentiment"])
    tfidf_classification = get_tfidf(text_seg, Dict["classification"])

    sentimentResult = get_predictedResult(tfidf_sentiment, Dict["sentiment"])
    classificationResult = get_predictedResult(tfidf_classification,
                                               Dict["classification"])

    print("待分类预测文本:'{0}' 的分类结果为:\n 情感分类:{1}\n 类别分类:{2}\n".format(
        sample_text, sentimentResult, classificationResult))
def _ordinal(number):
    """Return *number* followed by its English ordinal suffix (1st, 2nd, 11th)."""
    if 10 <= number % 100 <= 20:
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(number % 10, "th")
    return str(number) + suffix


def main():
    """Download and store the news of every configured university.

    For each university, ``request_NewsInfo`` fetches the news collection and
    ``save_DataToDB`` stores it in the ``RawPOA`` database.  Progress
    messages and timestamps are printed before the run, after every
    university and at the end.
    """
    # Read the connection parameters from the configuration file.
    MongoDB_Host = mod_config.getConfig("database", "db_Host")
    MongoDB_Port = mod_config.getConfig("database", "db_Port")
    UniversityList = mod_config.get_University_list()

    # Open the database connection.
    conn = MongoClient(MongoDB_Host, int(MongoDB_Port))
    RawPOA = conn.RawPOA

    ISOTIMEFORMAT = '%Y-%m-%d %X'
    total = len(UniversityList)
    print(time.strftime(ISOTIMEFORMAT, time.localtime()))

    # ``count`` is the 1-based position of the university being processed.
    for count, uni in enumerate(UniversityList, 1):
        # The suffix used to be hard-coded to "st" ("2st", "3st", ...).
        print("The system begins to download the " + _ordinal(count) +
              " university. There are " + str(total - count) +
              " universities left.")

        newsCollection = request_NewsInfo(uni)
        save_DataToDB(newsCollection, RawPOA)

        print("The system has finished downloading the news of " +
              uni["en_name"] + ". There are total " +
              str(len(newsCollection)) + " news.\n")
        print(time.strftime(ISOTIMEFORMAT, time.localtime()))

    print("Work finished!\n")
    print(time.strftime(ISOTIMEFORMAT, time.localtime()))
def create_TrainData():
    """Export news bodies from ``RawPOA.news`` as UTF-8 text files.

    For every configured university a directory ``./rawdata/<chn_name>/`` is
    created if missing, and up to 50 of its news documents are written
    there, one file per document named ``<title>.txt`` and containing the
    document's ``body``.  A failed write is reported and skipped.
    """
    # Read the connection parameters from the configuration file.
    MongoDB_Host = mod_config.getConfig("database", "db_Host")
    MongoDB_Port = mod_config.getConfig("database", "db_Port")
    UniversityList = mod_config.get_University_list()

    # Open the database connection.
    conn = MongoClient(MongoDB_Host, int(MongoDB_Port))
    RawPOA = conn.RawPOA
    newsCollection = RawPOA["news"]

    for uni in UniversityList:
        path = './rawdata/' + uni["chn_name"] + "/"
        if not os.path.exists(path):
            os.makedirs(path)

        # Take the first 50 news items of each school as training data.
        for iterm in newsCollection.find({"Uname": uni["chn_name"]}).limit(50):
            filePath = path + str(
                iterm["title"].encode("utf8")).strip() + ".txt"
            fileBody = iterm["body"]
            try:
                # ``with`` closes the file on success and on failure, and
                # never touches a handle that failed to open.
                with codecs.open(filePath, 'w', "utf-8") as f:
                    f.write(fileBody)
                print(filePath + " 写入成功")
            except Exception as e:
                # Best effort: report the failure and go on to the next file.
                print(e)
                print(filePath + " 写入失败")
def create_NewsNumbersInfo():
    """Rebuild ``ResultPOA.newsNumber`` with per-university news counts.

    For every configured university and each classification (``study``,
    ``activity``, ``entrance``, ``social``), the documents in
    ``ResultPOA.news`` are counted per sentiment value ``"-1"``, ``"0"`` and
    ``"1"``.  Each stored list is ``[count(-1), count(0), count(1), total]``
    and is saved under ``<classification>Number`` next to the university's
    ``Uname`` and ``abbr``.
    """
    # Read the connection parameters from the configuration file.
    MongoDB_Host = mod_config.getConfig("database", "db_Host")
    MongoDB_Port = mod_config.getConfig("database", "db_Port")
    UniversityList = mod_config.get_University_list()

    # Open the database connection.
    conn = MongoClient(MongoDB_Host, int(MongoDB_Port))
    ResultPOA = conn.ResultPOA
    news = ResultPOA["news"]

    # Query order matches the stored list layout: -1, 0, 1, then the total.
    classifications = ("study", "activity", "entrance", "social")
    sentiments = ("-1", "0", "1")

    ResultList = []
    for uni in UniversityList:
        summary = {"Uname": uni["zh_name"], "abbr": uni["en_name"]}
        for classification in classifications:
            numbers = [
                news.find({
                    "Uname": uni["zh_name"],
                    "classification": classification,
                    "sentiment": sentiment
                }).count()
                for sentiment in sentiments
            ]
            numbers.append(sum(numbers))
            summary[classification + "Number"] = numbers
        ResultList.append(summary)

    if not ResultList:
        # Nothing to store: keep the existing statistics instead of dropping
        # them and then failing on an empty bulk insert.
        return

    ResultPOA["newsNumber"].drop()
    ResultPOA["newsNumber"].insert(ResultList)
    print("保存成功")
def mainTask():
    """Crawl fresh news, classify the unseen items and refresh the caches.

    Steps, in order:

    1. Load up to 20 cached documents per university (sorted by ``date``
       descending, entries dated ``"暂无日期"`` skipped) from the
       ``tempNews`` collections of ``RawPOA`` and ``ResultPOA``.
    2. Run ``spider()`` and keep only the items whose URL is in neither
       cache (``findUrlInList`` returning ``-1``).
    3. Classify each new item with ``predict`` and append the predicted
       documents to ``ResultPOA.news``.
    4. Rewrite both ``tempNews`` collections from the updated caches and
       call ``update_NewsNumbersInfo()``.
    """
    # Read the connection parameters from the configuration file.
    MongoDB_Host = mod_config.getConfig("database", "db_Host")
    MongoDB_Port = mod_config.getConfig("database", "db_Port")
    UniversityList = mod_config.get_University_list()

    # Open the database connection.
    conn = MongoClient(MongoDB_Host, int(MongoDB_Port))
    RawPOA = conn.RawPOA
    ResultPOA = conn.ResultPOA

    # Fields copied for a raw document and the extra ones of a result document.
    raw_fields = ("body", "title", "url", "Uname", "abbr", "date")
    result_fields = raw_fields + ("sentiment", "classification")

    raw_tempList = {}
    result_tempList = {}
    for uni in UniversityList:
        raw_tempList[uni["zh_name"]] = []
        result_tempList[uni["zh_name"]] = []

    print("从缓存新闻表中取出数据...")
    for uni in UniversityList:
        uname = uni["zh_name"]
        for database, fields, cache in ((RawPOA, raw_fields, raw_tempList),
                                        (ResultPOA, result_fields,
                                         result_tempList)):
            cursor = database["tempNews"].find({
                "Uname": uname
            }).sort("date", pym.DESCENDING).limit(20)
            for iterm in cursor:
                if iterm["date"].encode("utf-8") != "暂无日期":
                    doc = dict((key, iterm[key]) for key in fields)
                    cache[uname].insert(0, doc)
    print("从缓存新闻表中取出数据成功\n\n")

    spiderList = spider()

    print("开始查重去重操作...")
    for uni in UniversityList:
        uname = uni["zh_name"]
        raw_cache = raw_tempList[uname]
        result_cache = result_tempList[uname]
        count = 0
        predictedList = []

        for iterm in spiderList[uname]:
            # An item is new only when BOTH lookups report "not found".  The
            # second lookup used to be tested for truthiness, which is also
            # true for -1 and for any index other than 0.
            if (findUrlInList(iterm["url"], raw_cache) == -1
                    and findUrlInList(iterm["url"], result_cache) == -1):
                count = count + 1
                raw_cache.insert(0, iterm)

                predictResult = predict(iterm["body"])
                tempIterm = dict((key, iterm[key]) for key in raw_fields)
                tempIterm["sentiment"] = predictResult["sentiment"]
                tempIterm["classification"] = predictResult["classification"]
                result_cache.insert(0, tempIterm)
                predictedList.append(tempIterm)

        # Drop one entry from the tail for every entry added at the head so
        # the cache length stays what it was after loading.
        # NOTE(review): a cache that starts with fewer than 20 entries never
        # grows under this scheme - confirm that this is intended.
        for _ in range(0, count):
            raw_cache.pop()
            result_cache.pop()

        # pymongo refuses an empty bulk insert, which is the normal case
        # when a university has no unseen news.
        if predictedList:
            ResultPOA["news"].insert(predictedList)
            print("结果数据表的数据保存成功")
    print("去重操作完成\n\n")

    RawPOA["tempNews"].drop()
    ResultPOA["tempNews"].drop()
    for uni in UniversityList:
        uname = uni["zh_name"]
        # Same empty-insert guard as above.
        if raw_tempList[uname]:
            RawPOA["tempNews"].insert(raw_tempList[uname])
        if result_tempList[uname]:
            ResultPOA["tempNews"].insert(result_tempList[uname])
    print("数据库存入数据成功\n\n")

    update_NewsNumbersInfo()